# Spaces: Running on Zero
import gradio as gr | |
import numpy as np | |
import spaces | |
from kokoro import KModel, KPipeline | |
# Single shared synthesis model, moved to the GPU once at import time.
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")

# One pipeline per language code, each created with ``model=False``; synthesis
# goes through the shared ``model`` above instead. Judging by the voice table in
# this file, "a" is American English and "b" is British English.
pipelines = {code: KPipeline(lang_code=code, model=False) for code in ("a", "b")}

# Pin the phonemes used for the word "kokoro" in each language's lexicon.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001
# Display label -> Kokoro voice id. The first character of each voice id is the
# language code used to pick a pipeline; insertion order is the dropdown order.
CHOICES: dict[str, str] = {
    # "af_" voices
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    # "am_" voices
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    # "bf_" voices
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    # "bm_" voices
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
# Load every listed voice at import time, using the pipeline selected by the
# voice id's leading language code (presumably so the first request does not
# pay the voice-loading cost — confirm against kokoro's ``load_voice``).
for voice_id in CHOICES.values():
    lang_pipeline = pipelines[voice_id[0]]
    lang_pipeline.load_voice(voice_id)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Generate audio from text using Kokoro TTS model.

    Only the first batch of tokens produced for the text is synthesized;
    any text beyond that batch is ignored.

    Available voices:
    - af_heart
    - af_bella
    - af_nicole
    - af_aoede
    - af_kore
    - af_sarah
    - af_nova
    - af_sky
    - af_alloy
    - af_jessica
    - af_river
    - am_michael
    - am_fenrir
    - am_puck
    - am_echo
    - am_eric
    - am_liam
    - am_onyx
    - am_santa
    - am_adam
    - bf_emma
    - bf_isabella
    - bf_alice
    - bf_lily
    - bm_george
    - bm_fable
    - bm_lewis
    - bm_daniel

    Args:
        text: The text to generate audio from.
        voice: The voice to use. Defaults to "af_heart".
        speed: The speed of the audio. Defaults to 1.0.

    Returns:
        A tuple containing the audio, as a (sample rate, samples) pair at 24000 Hz, and the tokens used to generate the audio.

    Raises:
        gr.Error: If the voice is not one of the available voices, or if no tokens could be produced from the text (for example, the text is empty).
    """
    # Reject anything outside the advertised voice list up front: an empty or
    # unknown value would otherwise fail with IndexError/KeyError on the
    # pipeline lookup, or be handed unchecked to the voice loader.
    if voice not in CHOICES.values():
        raise gr.Error(f"Unknown voice {voice!r}. Please pick one of the available voices.")

    # The first character of the voice id is the language code.
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)

    # Only use the first batch of tokens. The ``None`` default covers the case
    # where the pipeline yields nothing (expected for empty text), which would
    # otherwise escape as a bare StopIteration.
    first_batch = next(pipeline(text, voice, speed), None)
    if first_batch is None:
        raise gr.Error("No speech could be generated from the given text. Please enter some text.")
    _, ps, _ = first_batch

    # The voice pack is indexed by token count (zero-based); presumably it
    # holds one reference style per length — confirm against kokoro.
    ref_s = pack[len(ps) - 1]

    # NOTE(review): ``model`` lives on CUDA and ``spaces`` is imported, but no
    # ``@spaces.GPU`` decorator is visible on this function. If this app runs on
    # ZeroGPU hardware, the decorator is presumably required — confirm.
    audio = model(ps, ref_s, speed)
    return (24000, audio.numpy()), ps
with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")

    with gr.Row():
        # Left column: everything the user supplies, plus the trigger button.
        with gr.Column():
            text = gr.Textbox(label="Input Text", info="Up to ~500 characters.")
            voice = gr.Dropdown(
                label="Voice",
                # (display label, voice id) pairs; the voice id is the value
                # passed on to ``generate``.
                choices=list(CHOICES.items()),
                value="af_heart",
                info="Quality and availability vary by language",
            )
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
            generate_btn = gr.Button("Generate", variant="primary")

        # Right column: the synthesized audio and the tokens it was made from.
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                label="Output Tokens",
                info="Tokens used to generate the audio, up to 510 context length.",
            )

    # The examples and the button run the same function with the same wiring.
    generate_inputs = [text, voice, speed]
    generate_outputs = [out_audio, out_ps]

    example_rows = [
        [
            "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
            "af_heart",
            1.0,
        ],
        [
            "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
            "af_heart",
            1.0,
        ],
    ]
    gr.Examples(
        examples=example_rows,
        fn=generate,
        inputs=generate_inputs,
        outputs=generate_outputs,
    )

    generate_btn.click(
        fn=generate,
        inputs=generate_inputs,
        outputs=generate_outputs,
    )
# Start the app only when run as a script; ``mcp_server=True`` asks Gradio to
# launch with its MCP server enabled.
if __name__ == "__main__":
    demo.launch(mcp_server=True)