import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline

# Run the 82M Kokoro model on the GPU; keep one lightweight, model-free G2P pipeline
# per language code ("a" = American English, "b" = British English).
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
# Pin the pronunciation of "kokoro" in both lexicons.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001

# Display label -> voice id. The first character of the id ("a" or "b") selects the pipeline.
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# Preload every voice pack on the pipeline that matches its language prefix.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)


@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Generate audio from text using the Kokoro TTS model.

    Available voices:
    - af_heart
    - af_bella
    - af_nicole
    - af_aoede
    - af_kore
    - af_sarah
    - af_nova
    - af_sky
    - af_alloy
    - af_jessica
    - af_river
    - am_michael
    - am_fenrir
    - am_puck
    - am_echo
    - am_eric
    - am_liam
    - am_onyx
    - am_santa
    - am_adam
    - bf_emma
    - bf_isabella
    - bf_alice
    - bf_lily
    - bm_george
    - bm_fable
    - bm_lewis
    - bm_daniel

    Args:
        text: The text to generate audio from.
        voice: The voice to use. Defaults to "af_heart".
        speed: The speed of the audio. Defaults to 1.0.

    Returns:
        A tuple containing the audio and the tokens used to generate the audio.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the reference style vector that corresponds to the token length.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro produces 24 kHz mono audio.
    return (24000, audio.numpy()), ps


with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text",
                info="Up to ~500 characters.",
            )
            voice = gr.Dropdown(
                label="Voice",
                choices=list(CHOICES.items()),
                value="af_heart",
                info="Quality and availability vary by language.",
            )
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                label="Output Tokens",
                info="Tokens used to generate the audio, up to 510 context length.",
            )

    gr.Examples(
        examples=[
            [
                "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
                "af_heart",
                1.0,
            ],
            [
                "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
                "af_heart",
                1.0,
            ],
        ],
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

    generate_btn.click(
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

if __name__ == "__main__":
    demo.launch(mcp_server=True)
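# Example client call (a sketch, not part of the original app): because the click handler
# wraps `generate`, the Space also exposes a regular Gradio API endpoint that can be
# reached with `gradio_client`. The endpoint name "/generate" and the Space id below are
# assumptions for illustration only.
#
#   from gradio_client import Client
#
#   client = Client("hexgrad/Kokoro-TTS")  # hypothetical Space id
#   audio_path, tokens = client.predict(
#       text="Hello from Kokoro.",
#       voice="af_heart",
#       speed=1.0,
#       api_name="/generate",
#   )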