import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline
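
# Load the Kokoro-82M model and move it to the GPU.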
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
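# G2P-only pipelines (model=False): "a" for American English, "b" for British English; the shared model above does the synthesis.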
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
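# Pin the pronunciation of "kokoro" in each pipeline's lexicon.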
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO" # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ" # noqa: RUF001
CHOICES = {
"🇺🇸 🚺 Heart ❤️": "af_heart",
"🇺🇸 🚺 Bella 🔥": "af_bella",
"🇺🇸 🚺 Nicole 🎧": "af_nicole",
"🇺🇸 🚺 Aoede": "af_aoede",
"🇺🇸 🚺 Kore": "af_kore",
"🇺🇸 🚺 Sarah": "af_sarah",
"🇺🇸 🚺 Nova": "af_nova",
"🇺🇸 🚺 Sky": "af_sky",
"🇺🇸 🚺 Alloy": "af_alloy",
"🇺🇸 🚺 Jessica": "af_jessica",
"🇺🇸 🚺 River": "af_river",
"🇺🇸 🚹 Michael": "am_michael",
"🇺🇸 🚹 Fenrir": "am_fenrir",
"🇺🇸 🚹 Puck": "am_puck",
"🇺🇸 🚹 Echo": "am_echo",
"🇺🇸 🚹 Eric": "am_eric",
"🇺🇸 🚹 Liam": "am_liam",
"🇺🇸 🚹 Onyx": "am_onyx",
"🇺🇸 🚹 Santa": "am_santa",
"🇺🇸 🚹 Adam": "am_adam",
"🇬🇧 🚺 Emma": "bf_emma",
"🇬🇧 🚺 Isabella": "bf_isabella",
"🇬🇧 🚺 Alice": "bf_alice",
"🇬🇧 🚺 Lily": "bf_lily",
"🇬🇧 🚹 George": "bm_george",
"🇬🇧 🚹 Fable": "bm_fable",
"🇬🇧 🚹 Lewis": "bm_lewis",
"🇬🇧 🚹 Daniel": "bm_daniel",
}
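
# Pre-load every voice; the first letter of the voice id ("a"/"b") selects its pipeline.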
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
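

# ZeroGPU: a GPU is attached only while this function runs, for at most ~30 seconds per call.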
@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
"""Generate audio from text using Kokoro TTS model.
Available voices:
- af_heart
- af_bella
- af_nicole
- af_aoede
- af_kore
- af_sarah
- af_nova
- af_sky
- af_alloy
- af_jessica
- af_river
- am_michael
- am_fenrir
- am_puck
- am_echo
- am_eric
- am_liam
- am_onyx
- am_santa
- am_adam
- bf_emma
- bf_isabella
- bf_alice
- bf_lily
- bm_george
- bm_fable
- bm_lewis
- bm_daniel
Args:
text: The text to generate audio from.
voice: The voice to use. Defaults to "af_heart".
speed: The speed of the audio. Defaults to 1.0.
Returns:
A tuple containing the audio and the tokens used to generate the audio.
"""
    # Select the pipeline from the voice prefix ("a" = American, "b" = British English).
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the reference style vector that matches the token count.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro synthesizes 24 kHz audio.
    return (24000, audio.numpy()), ps
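

# Gradio UI: text input, voice and speed controls, and audio + token outputs.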
with gr.Blocks(css_paths="style.css") as demo:
gr.Markdown("# Kokoro TTS")
with gr.Row():
with gr.Column():
text = gr.Textbox(
label="Input Text",
info="Up to ~500 characters.",
)
voice = gr.Dropdown(
label="Voice",
choices=list(CHOICES.items()),
value="af_heart",
info="Quality and availability vary by language",
)
speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
generate_btn = gr.Button("Generate", variant="primary")
with gr.Column():
out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
out_ps = gr.Textbox(
label="Output Tokens",
info="Tokens used to generate the audio, up to 510 context length.",
)
gr.Examples(
examples=[
[
"She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
"af_heart",
1.0,
],
[
"Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
"af_heart",
1.0,
],
],
fn=generate,
inputs=[text, voice, speed],
outputs=[out_audio, out_ps],
)
generate_btn.click(
fn=generate,
inputs=[text, voice, speed],
outputs=[out_audio, out_ps],
)


if __name__ == "__main__":
    demo.launch(mcp_server=True)