import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline

# Load the 82M Kokoro model onto the GPU once; the per-language pipelines below are
# created with model=False, so they only handle G2P and this shared model is called directly.
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
# Pin the pronunciation of "kokoro" in both the American ("a") and British ("b") lexicons.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001

CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
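# Pre-load every voice pack; the first character of each voice id ("a" or "b") picks its pipeline.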
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)


@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Synthesizes speech from English text using the Kokoro TTS model.

    Note:
        This model supports only English input texts.

    Voice Selection:
        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
        `<language/accent><gender>_<voice_name>`
        - `<language/accent>`:
            - 'a' for American English
            - 'b' for British English
        - `<gender>`:
            - 'f' for female
            - 'm' for male
        - Example: 'af_heart' indicates an American English female voice named Heart.

    Available voices:
        - af_heart
        - af_bella
        - af_nicole
        - af_aoede
        - af_kore
        - af_sarah
        - af_nova
        - af_sky
        - af_alloy
        - af_jessica
        - af_river
        - am_michael
        - am_fenrir
        - am_puck
        - am_echo
        - am_eric
        - am_liam
        - am_onyx
        - am_santa
        - am_adam
        - bf_emma
        - bf_isabella
        - bf_alice
        - bf_lily
        - bm_george
        - bm_fable
        - bm_lewis
        - bm_daniel

    Args:
        text: Input text to be synthesized. Only English text is supported. Non-English input may result in errors or mispronunciations.
        voice: Identifier for the voice to be used in synthesis. Defaults to "af_heart".
        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.

    Returns:
        A tuple containing the audio as a (sample_rate, waveform) pair and the phoneme tokens used to generate it.
    """
    # Route to the pipeline matching the voice's language prefix ("a" = American, "b" = British).
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the voice's style reference for this phoneme-sequence length.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro outputs 24 kHz audio.
    return (24000, audio.numpy()), ps
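
# A minimal sketch of calling `generate` directly, assuming the model and voice packs
# loaded above (the example text and settings are arbitrary, not from a measured run):
#   (sample_rate, waveform), phonemes = generate("Hello from Kokoro!", voice="af_bella", speed=1.2)
#   # sample_rate == 24000, waveform is a NumPy array, phonemes is the token string used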


with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text",
                info="Up to ~500 characters.",
            )
            voice = gr.Dropdown(
                label="Voice",
                choices=list(CHOICES.items()),
                value="af_heart",
                info="Quality and availability vary by language",
            )
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                label="Output Tokens",
                info="Tokens used to generate the audio, up to 510 context length.",
            )

    gr.Examples(
        examples=[
            [
                "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
                "af_heart",
                1.0,
            ],
            [
                "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
                "af_heart",
                1.0,
            ],
        ],
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

    generate_btn.click(
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

if __name__ == "__main__":
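    # mcp_server=True also exposes `generate` as an MCP tool via Gradio's MCP support
    # (this typically requires the gradio[mcp] extra to be installed).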
    demo.launch(mcp_server=True)