Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,457 Bytes
2f4c162 9314cdc 2f4c162 41b5e7a 9314cdc 2f4c162 c1057fc 2f4c162 b7cfba0 41b5e7a b7cfba0 41b5e7a 9314cdc 8a49e4a 9314cdc 8a49e4a 9314cdc 8a49e4a 9314cdc 7a398c4 a6d6b01 9314cdc 7c600ad 2f4c162 9314cdc 2f4c162 9314cdc 7c600ad 9314cdc 2f4c162 9314cdc 2f4c162 41b5e7a 2f4c162 9314cdc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline
# Single shared Kokoro model instance, moved to the GPU once at import time.
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
# One pipeline per language code: "a" (American English) and "b" (British
# English). They are created with model=False; generate() feeds the tokens
# they produce to the shared `model` itself.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
# Pin the pronunciation of the word "kokoro" in each pipeline's lexicon.
# The phoneme strings mix Latin letters with look-alike phonetic characters,
# which is presumably why the ambiguous-character lint rule is silenced.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001
# Dropdown label -> Kokoro voice id.
# Voice ids follow `<language/accent><gender>_<voice_name>`; their first
# character ("a" or "b") is also the key used to pick the matching pipeline.
CHOICES = {
    # American English, female
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    # American English, male
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    # British English, female
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    # British English, male
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
# Load every listed voice up front, through the pipeline that matches the
# voice id's language prefix (its first character).
# NOTE(review): presumably this warms a per-pipeline voice cache so the first
# request does not pay the load cost -- confirm in KPipeline.load_voice.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Synthesizes speech from English text using the Kokoro TTS model.

    Note:
        This model supports only English input texts.

    Voice Selection:
        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
          `<language/accent><gender>_<voice_name>`
        - `<language/accent>`:
            - 'a' for American English
            - 'b' for British English
        - `<gender>`:
            - 'f' for female
            - 'm' for male
        - Example: 'af_heart' indicates an American English female voice named Heart.

    Available voices:
        - af_heart
        - af_bella
        - af_nicole
        - af_aoede
        - af_kore
        - af_sarah
        - af_nova
        - af_sky
        - af_alloy
        - af_jessica
        - af_river
        - am_michael
        - am_fenrir
        - am_puck
        - am_echo
        - am_eric
        - am_liam
        - am_onyx
        - am_santa
        - am_adam
        - bf_emma
        - bf_isabella
        - bf_alice
        - bf_lily
        - bm_george
        - bm_fable
        - bm_lewis
        - bm_daniel

    Args:
        text: Input text to be synthesized. Only English text is supported. Non-English input may result in errors or mispronunciations.
        voice: Identifier for the voice to be used in synthesis. Defaults to "af_heart".
        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.

    Returns:
        A tuple containing the audio and the tokens used to generate the audio.

    Raises:
        gr.Error: If `voice` does not start with a supported language code, or if no tokens could be produced from `text` (for example, when it is empty).
    """
    # Slice instead of indexing so an empty voice id is reported as unsupported
    # rather than surfacing as a bare IndexError.
    pipeline = pipelines.get(voice[:1])
    if pipeline is None:
        supported = ", ".join(repr(code) for code in sorted(pipelines))
        msg = f"Unsupported voice {voice!r}: expected an id starting with one of {supported}."
        raise gr.Error(msg)

    pack = pipeline.load_voice(voice)

    # Only use the first batch of tokens. The default keeps an exhausted
    # generator (nothing to synthesize) from leaking a bare StopIteration.
    first_batch = next(pipeline(text, voice, speed), None)
    if first_batch is None:
        msg = "No speech could be generated: the input text produced no tokens."
        raise gr.Error(msg)
    _, ps, _ = first_batch
    if not ps:
        # An empty token string would make the lookup below read pack[-1].
        msg = "No speech could be generated: the input text produced no tokens."
        raise gr.Error(msg)

    # The voice pack is indexed by token count to pick the reference style.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # 24000 is the sample rate paired with the waveform for the audio output.
    return (24000, audio.numpy()), ps
# UI definition: synthesis controls in the left column, results in the right
# column, with clickable examples and the button wiring below the row.
with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(info="Up to ~500 characters.", label="Input Text")
            voice = gr.Dropdown(
                choices=list(CHOICES.items()),
                value="af_heart",
                label="Voice",
                info="Quality and availability vary by language",
            )
            speed = gr.Slider(minimum=0.5, maximum=2, step=0.1, value=1, label="Speed")
            generate_btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                info="Tokens used to generate the audio, up to 510 context length.",
                label="Output Tokens",
            )

    # Tongue twisters offered as examples; each one is paired with the default
    # voice and normal speed.
    example_texts = (
        "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
        "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
    )

    # The examples and the button drive the same function with the same
    # components, so the wiring is declared once.
    synth_inputs = [text, voice, speed]
    synth_outputs = [out_audio, out_ps]

    gr.Examples(
        examples=[[sample, "af_heart", 1.0] for sample in example_texts],
        fn=generate,
        inputs=synth_inputs,
        outputs=synth_outputs,
    )
    generate_btn.click(fn=generate, inputs=synth_inputs, outputs=synth_outputs)
# Start the app only when this file is run as a script. mcp_server=True asks
# Gradio to also expose the app as an MCP server (see Gradio's launch() docs).
if __name__ == "__main__":
    demo.launch(mcp_server=True)
|