import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline
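
# Load the Kokoro-82M model and move it to the GPU.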
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
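# G2P-only pipelines (model=False): "a" for American English, "b" for British English; the shared model above does the synthesis.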
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
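# Pin the pronunciation of "kokoro" in each pipeline's lexicon.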
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO" # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ" # noqa: RUF001
CHOICES = {
"🇺🇸 🚺 Heart ❤️": "af_heart",
"🇺🇸 🚺 Bella 🔥": "af_bella",
"🇺🇸 🚺 Nicole 🎧": "af_nicole",
"🇺🇸 🚺 Aoede": "af_aoede",
"🇺🇸 🚺 Kore": "af_kore",
"🇺🇸 🚺 Sarah": "af_sarah",
"🇺🇸 🚺 Nova": "af_nova",
"🇺🇸 🚺 Sky": "af_sky",
"🇺🇸 🚺 Alloy": "af_alloy",
"🇺🇸 🚺 Jessica": "af_jessica",
"🇺🇸 🚺 River": "af_river",
"🇺🇸 🚹 Michael": "am_michael",
"🇺🇸 🚹 Fenrir": "am_fenrir",
"🇺🇸 🚹 Puck": "am_puck",
"🇺🇸 🚹 Echo": "am_echo",
"🇺🇸 🚹 Eric": "am_eric",
"🇺🇸 🚹 Liam": "am_liam",
"🇺🇸 🚹 Onyx": "am_onyx",
"🇺🇸 🚹 Santa": "am_santa",
"🇺🇸 🚹 Adam": "am_adam",
"🇬🇧 🚺 Emma": "bf_emma",
"🇬🇧 🚺 Isabella": "bf_isabella",
"🇬🇧 🚺 Alice": "bf_alice",
"🇬🇧 🚺 Lily": "bf_lily",
"🇬🇧 🚹 George": "bm_george",
"🇬🇧 🚹 Fable": "bm_fable",
"🇬🇧 🚹 Lewis": "bm_lewis",
"🇬🇧 🚹 Daniel": "bm_daniel",
}
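
# Pre-load every voice; the first letter of the voice id ("a"/"b") selects its pipeline.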
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
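

# ZeroGPU: a GPU is attached only while this function runs, for at most ~30 seconds per call.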
@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
"""Generate audio from text using Kokoro TTS model.
Available voices:
- af_heart
- af_bella
- af_nicole
- af_aoede
- af_kore
- af_sarah
- af_nova
- af_sky
- af_alloy
- af_jessica
- af_river
- am_michael
- am_fenrir
- am_puck
- am_echo
- am_eric
- am_liam
- am_onyx
- am_santa
- am_adam
- bf_emma
- bf_isabella
- bf_alice
- bf_lily
- bm_george
- bm_fable
- bm_lewis
- bm_daniel
Args:
text: The text to generate audio from.
voice: The voice to use. Defaults to "af_heart".
speed: The speed of the audio. Defaults to 1.0.
Returns:
A tuple containing the audio and the tokens used to generate the audio.
"""
    # Select the pipeline from the voice prefix ("a" = American, "b" = British English).
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the reference style vector that matches the token count.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro synthesizes 24 kHz audio.
    return (24000, audio.numpy()), ps
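

# Gradio UI: text input, voice and speed controls, and audio + token outputs.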
with gr.Blocks(css_paths="style.css") as demo:
gr.Markdown("# Kokoro TTS")
with gr.Row():
with gr.Column():
text = gr.Textbox(
label="Input Text",
info="Up to ~500 characters.",
)
voice = gr.Dropdown(
label="Voice",
choices=list(CHOICES.items()),
value="af_heart",
info="Quality and availability vary by language",
)
speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
generate_btn = gr.Button("Generate", variant="primary")
with gr.Column():
out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
out_ps = gr.Textbox(
label="Output Tokens",
info="Tokens used to generate the audio, up to 510 context length.",
)
gr.Examples(
examples=[
[
"She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
"af_heart",
1.0,
],
[
"Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
"af_heart",
1.0,
],
],
fn=generate,
inputs=[text, voice, speed],
outputs=[out_audio, out_ps],
)
generate_btn.click(
fn=generate,
inputs=[text, voice, speed],
outputs=[out_audio, out_ps],
)


if __name__ == "__main__":
    demo.launch(mcp_server=True)