Text-to-Speech
ONNX
English

Model produces Chinese-sounding gibberish on MacBook M4 Max

#42
by timscarfemlst - opened

Bunch of examples here:
https://www.dropbox.com/scl/fo/iyjr7a5zfezizgd46kek4/ALrcFBxrcFDo1lhwQ_ZFFog?rlkey=b8kof7vyzxx7fs5p44ij2w7ic&st=4axhuov5&dl=0

Generated this from my Cursor session:

Kokoro TTS Issue: English Text Generating Chinese-Like Speech

Issue Description

When using Kokoro TTS (v0.19), the generated speech sounds like Chinese regardless of the input text or voice used. This occurs despite:

  • Correct English phoneme generation
  • Proper model loading
  • All voice files being present

Investigation Tools

I've created several test scripts to diagnose the issue:

1. test_phonemes.py

Tests the phoneme generation pipeline:

  • Verifies espeak-ng installation
  • Tests direct phoneme generation
  • Validates phonemizer configuration

Example output showing phonemes are correct:
Input: "Hello world."
Phonemes: həlˈoʊ wˈɜːld.
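
The cleanup step applied to espeak's output can be exercised in isolation; this stdlib-only sketch mirrors the symbol replacements in the attached test_tts.py:

```python
def clean_phonemes(phones: str) -> str:
    """Map a few espeak-ng symbols onto the subset Kokoro expects,
    mirroring the cleanup step in the attached test_tts.py."""
    return (phones.replace('ʲ', 'j')
                  .replace('r', 'ɹ')
                  .replace('x', 'k')
                  .replace('ɬ', 'l'))

print(clean_phonemes('həlˈoʊ wˈɜːld.'))  # already clean, passes through unchanged
```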

2. check_model.py

Analyzes model and voice file contents:

  • Verifies model file integrity
  • Checks voice file dimensions
  • Validates tensor ranges

Key findings:

  • Voice files: [511, 1, 256] tensors
  • Values in normal range (-1 to 1)
  • No obvious corruption
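
The shape/range validation can be sketched with numpy on a dummy tensor of the same [511, 1, 256] layout (the real script runs the equivalent checks on the torch tensors):

```python
import numpy as np

# Dummy stand-in for a Kokoro voicepack tensor with the observed
# [511, 1, 256] layout; the real data comes from torch.load.
voice = np.random.uniform(-1.0, 1.0, size=(511, 1, 256)).astype(np.float32)

def summarize(t):
    """Shape plus value range, so corruption or scaling bugs stand out."""
    return t.shape, float(t.min()), float(t.max())

shape, lo, hi = summarize(voice)
assert shape == (511, 1, 256)
assert -1.0 <= lo <= hi <= 1.0
```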

3. test_tts.py

Comprehensive testing script that:

  • Tests multiple voices
  • Saves audio outputs
  • Records phonemes and metadata
  • Includes quality ratings

Test Results

  1. Model loads successfully (312MB, correct hash)
  2. Voice files load correctly
  3. Phonemes generate properly
  4. Audio output consistently sounds Chinese-like
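
The "correct hash" check in result 1 can be reproduced with stdlib hashlib; a minimal sketch (the reference digest to compare against comes from wherever the model was published, not from this snippet):

```python
import hashlib

def sha256_of(path, chunk=1 << 20):
    """Stream a file through SHA-256 without loading it all into memory."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# Compare sha256_of('kokoro-v0_19.pth') against the published checksum.
```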

Technical Details

  • Model: kokoro-v0_19.pth (312MB)
  • Voices tested: af_bella, af_sarah, af (default mix)
  • Device: CPU
  • espeak-ng version: 1.52.0
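
On the voices: the default `af` is described upstream as a Bella/Sarah mix, and a mix like that is just an element-wise average of the voicepack tensors (numpy stand-ins below, since the real packs are torch tensors):

```python
import numpy as np

# Stand-ins for two loaded voicepacks (real ones come from torch.load).
bella = np.random.randn(511, 1, 256).astype(np.float32)
sarah = np.random.randn(511, 1, 256).astype(np.float32)

# 50-50 blend: element-wise mean of the two packs.
af_mix = (bella + sarah) / 2
assert af_mix.shape == (511, 1, 256)
```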

Files and Outputs

All test files and outputs are included for reference:

  1. test_phonemes.py - Phoneme generation testing
  2. check_model.py - Model and voice file analysis
  3. test_tts.py - Generation testing with logging
  4. Generated audio samples in output/

Questions

  1. Has anyone else experienced this issue?
  2. Could there be a mismatch between model weights and voice files?
  3. Are there known issues with style encoding?

The test files are designed to help diagnose similar issues and provide a framework for testing TTS output quality. I've attached all test files and sample outputs for reference.

Note: The test files include proper error handling, detailed logging, and audio file saving for better debugging. They can be used to:

  • Verify TTS setup
  • Test voice quality
  • Generate samples with metadata
  • Track issues across different voices

Let me know if you need any clarification or would like to see specific parts of the test output.

test_tts.py

from models import build_model
import torch
import sounddevice as sd
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.wrapper import EspeakWrapper
import re
from pathlib import Path
import scipy.io.wavfile
import datetime
import json

# Set espeak library path explicitly
espeak_path = '/opt/homebrew/Cellar/espeak-ng/1.52.0/lib/libespeak-ng.1.dylib'
EspeakWrapper.set_library(espeak_path)

# Configure phonemizer backend explicitly
backend = EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
    punctuation_marks=';:,.!?¡¿—…"«»“”',
    language_switch='remove-flags'
)

def custom_phonemize(text):
    """Phonemize with explicit settings"""
    phones = backend.phonemize([text])[0]
    # Clean up phonemes according to Kokoro's rules
    phones = phones.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
    phones = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', phones)
    phones = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', phones)
    return phones

def save_output(audio, text, voice_name, phones, out_ps, quality, timestamp):
    """Save audio file and metadata"""
    # Create output directory
    output_dir = Path('output') / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Create base filename
    base_name = f"{voice_name}_{text[:30].replace(' ', '_')}"
    
    # Save audio file
    audio_path = output_dir / f"{base_name}.wav"
    scipy.io.wavfile.write(audio_path, 24000, audio)
    
    # Save metadata
    metadata = {
        'text': text,
        'voice': voice_name,
        'timestamp': timestamp,
        'quality_rating': quality,
        'input_phonemes': phones,
        'output_phonemes': out_ps,
        'sample_rate': 24000,
        'audio_file': audio_path.name
    }
    
    metadata_path = output_dir / f"{base_name}.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    
    # Save human-readable description
    desc_path = output_dir / f"{base_name}.txt"
    with open(desc_path, 'w') as f:
        f.write(f"Text: {text}\n")
        f.write(f"Voice: {voice_name}\n")
        f.write(f"Quality Rating: {quality}\n")
        f.write(f"Input Phonemes: {phones}\n")
        f.write(f"Output Phonemes: {out_ps}\n")
        f.write(f"Sample Rate: 24000 Hz\n")
        f.write(f"Generated: {timestamp}\n")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nUsing device: {device}")

# Load model
print("\nLoading model...")
MODEL = build_model('kokoro-v0_19.pth', device)

# Test phrases
test_phrases = [
    "Hello world.",
    "One two three four five.",
    "The quick brown fox jumps over the lazy dog."
]

# Create timestamp for this run
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Try different voices
voices = ['af_bella', 'af_sarah', 'af']  # Include default mixed voice
for voice_name in voices:
    print(f"\n=== Testing voice: {voice_name} ===")
    try:
        # Load voice
        print(f"Loading voice file: voices/{voice_name}.pt")
        voice = torch.load(f'voices/{voice_name}.pt', weights_only=True)
        
        # Verify voice dimensions
        print(f"Voice tensor shape: {voice.shape}")
        
        # Split content and style
        content = voice[:, :, :128]
        style = voice[:, :, 128:]
        print(f"Content shape: {content.shape}")
        print(f"Style shape: {style.shape}")
        
        # Re-concatenate content and style (note: this split/concat
        # round-trip is a no-op; the result equals the original tensor)
        VOICEPACK = torch.cat([content, style], dim=2).to(device)
        
        for text in test_phrases:
            print(f"\n--- Testing phrase: {text} ---")
            
            # Get phonemes
            phones = custom_phonemize(text)
            print(f"Phonemes: {phones}")
            
            # Generate with explicit settings
            from kokoro import generate
            audio, out_ps = generate(MODEL, text, VOICEPACK, lang='a', ps=phones)
            
            print(f"Output phonemes: {out_ps}")
            print(f"Audio shape: {audio.shape}")
            
            # Normalize audio
            max_val = abs(audio).max()
            if max_val > 1.0:
                audio = audio / max_val * 0.95
            
            print("\nPlaying audio...")
            sd.play(audio, 24000)
            sd.wait()
            
            response = input("\nHow did that sound? (g)ood/(b)ad/(n)ext: ").lower()
            quality = "good" if response == 'g' else "bad" if response == 'b' else "skipped"
            
            # Save the output
            save_output(audio, text, voice_name, phones, out_ps, quality, timestamp)
            
            if response == 'b':
                print("Marking as bad output...")
            elif response == 'g':
                print("Marking as good output...")
            elif response != 'n':
                break  # any other key skips the remaining phrases for this voice
            
    except Exception as e:
        print(f"Error with voice {voice_name}: {e}")
        import traceback
        traceback.print_exc() 

check_model.py

import torch
from pathlib import Path
import numpy as np

def check_voice_file(path):
    """Check the contents of a voice file"""
    print(f"\nChecking voice file: {path}")
    voice = torch.load(path, weights_only=True)
    
    if isinstance(voice, dict):
        print("Voice file is a dictionary with keys:", voice.keys())
        for k, v in voice.items():
            if torch.is_tensor(v):
                print(f"  {k}: shape={v.shape}, range={v.min():.3f} to {v.max():.3f}")
    elif torch.is_tensor(voice):
        print(f"Voice file is a tensor with shape: {voice.shape}")
        print(f"Value range: {voice.min():.3f} to {voice.max():.3f}")
        
        # Check if values look reasonable
        print("Statistics:")
        print(f"  Mean: {voice.float().mean():.3f}")
        print(f"  Std: {voice.float().std():.3f}")
        print(f"  Zeros: {(voice == 0).float().mean()*100:.1f}%")
        
        # Sample some values
        flat = voice.flatten()
        print("Sample values:", flat[:10].tolist())
    else:
        print(f"Unexpected type: {type(voice)}")

def check_model_file(path):
    """Check the contents of the model file"""
    print(f"\nChecking model file: {path}")
    try:
        # Note: pickle_protocol is a torch.save argument; torch.load only
        # needs map_location here.
        checkpoint = torch.load(path, map_location='cpu')
        print("Model file loaded successfully")
        if 'net' in checkpoint:
            net = checkpoint['net']
            print("\nModel contains these components:")
            for k, v in net.items():
                if isinstance(v, dict):
                    print(f"{k}:")
                    for sk, sv in v.items():
                        if torch.is_tensor(sv):
                            print(f"  {sk}: shape={sv.shape}, range={sv.min():.3f} to {sv.max():.3f}")
                elif torch.is_tensor(v):
                    print(f"{k}: shape={v.shape}, range={v.min():.3f} to {v.max():.3f}")
    except Exception as e:
        print(f"Failed to load model: {str(e)}")

def check_config():
    """Check config.json"""
    config_path = Path('config.json')
    if config_path.exists():
        import json
        with open(config_path) as f:
            config = json.load(f)
        print("\nConfig contents:")
        print(json.dumps(config, indent=2))

print("=== Checking Kokoro Files ===")
check_config()

# Check all voice files
voice_dir = Path('voices')
for voice_file in voice_dir.glob('*.pt'):
    check_voice_file(voice_file)

# Check model file
print("\nChecking model file...")
check_model_file('kokoro-v0_19.pth') 

test_phonemes.py

import subprocess
import sys
from pathlib import Path

def test_espeak():
    """Test espeak-ng installation and phoneme generation"""
    print("\n=== Testing espeak-ng installation ===")
    
    # Check espeak-ng installation
    try:
        result = subprocess.run(['espeak-ng', '--version'], capture_output=True, text=True)
        print("espeak-ng version:", result.stdout.strip())
    except FileNotFoundError:
        print("espeak-ng not found! Please install it:")
        print("brew install espeak-ng")
        sys.exit(1)

    # Test direct phoneme generation
    print("\n=== Testing direct phoneme generation ===")
    test_text = "Hello, this is a test."
    try:
        result = subprocess.run(['espeak-ng', '-q', '--ipa', '-v', 'en-us', test_text], 
                              capture_output=True, text=True)
        print(f"Direct espeak output for '{test_text}':")
        print(result.stdout.strip())
    except Exception as e:
        print(f"Error testing espeak: {e}")

def test_phonemizer():
    """Test phonemizer library"""
    print("\n=== Testing phonemizer library ===")
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        from phonemizer.backend import EspeakBackend
        
        # Set espeak library path explicitly
        espeak_path = '/opt/homebrew/Cellar/espeak-ng/1.52.0/lib/libespeak-ng.1.dylib'
        EspeakWrapper.set_library(espeak_path)
        
        print("Phonemizer imported successfully")
        print(f"Using espeak library: {espeak_path}")
        
        # Test American English
        backend = EspeakBackend('en-us', preserve_punctuation=True, with_stress=True)
        text = "Hello, this is a test."
        phonemes = backend.phonemize([text])
        print("\nAmerican English phonemes:")
        print(f"Input: {text}")
        print(f"Output: {phonemes[0]}")
        
    except ImportError:
        print("Phonemizer not installed. Install with:")
        print("pip install phonemizer")
    except Exception as e:
        print(f"Error testing phonemizer: {e}")

def test_kokoro():
    """Test Kokoro's phoneme generation"""
    print("\n=== Testing Kokoro's phoneme generation ===")
    try:
        from kokoro import phonemize
        text = "Hello, this is a test."
        
        print("\nTesting American English (lang='a'):")
        ps_us = phonemize(text, 'a')
        print(f"Input: {text}")
        print(f"Output: {ps_us}")
        
        print("\nTesting British English (lang='b'):")
        ps_gb = phonemize(text, 'b')
        print(f"Input: {text}")
        print(f"Output: {ps_gb}")
        
    except Exception as e:
        print(f"Error testing Kokoro phonemizer: {e}")

if __name__ == "__main__":
    test_espeak()
    test_phonemizer()
    test_kokoro() 

I don't have time to read all this AI generated code, but I'm assuming it's got hallucinations because Kokoro has been open source for ~3 weeks, which is not enough time to make it into pretraining and/or post-training regimes of any LLM you're using.

Instead of hallucinating code please refer to #45 and get it working via simple Usage on Colab, then massively simplify your script to just mirror that Usage. There are also Kokoro-based projects on GitHub that minimize or remove your need to write code.

Closing, feel free to comment or open a new Discussion in line with #45

hexgrad changed discussion status to closed
