import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline

# Run the 82M Kokoro model on the GPU; keep one lightweight, model-free G2P pipeline
# per language code ("a" = American English, "b" = British English).
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
# Pin the pronunciation of "kokoro" in both lexicons.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001

# Display label -> voice id. The first character of the id ("a" or "b") selects the pipeline.
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# Preload every voice pack on the pipeline that matches its language prefix.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)


@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Generate audio from text using the Kokoro TTS model.

    Available voices:
    - af_heart
    - af_bella
    - af_nicole
    - af_aoede
    - af_kore
    - af_sarah
    - af_nova
    - af_sky
    - af_alloy
    - af_jessica
    - af_river
    - am_michael
    - am_fenrir
    - am_puck
    - am_echo
    - am_eric
    - am_liam
    - am_onyx
    - am_santa
    - am_adam
    - bf_emma
    - bf_isabella
    - bf_alice
    - bf_lily
    - bm_george
    - bm_fable
    - bm_lewis
    - bm_daniel

    Args:
        text: The text to generate audio from.
        voice: The voice to use. Defaults to "af_heart".
        speed: The speed of the audio. Defaults to 1.0.

    Returns:
        A tuple containing the audio and the tokens used to generate the audio.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the reference style vector that corresponds to the token length.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro produces 24 kHz mono audio.
    return (24000, audio.numpy()), ps


with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text",
                info="Up to ~500 characters.",
            )
            voice = gr.Dropdown(
                label="Voice",
                choices=list(CHOICES.items()),
                value="af_heart",
                info="Quality and availability vary by language.",
            )
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                label="Output Tokens",
                info="Tokens used to generate the audio, up to 510 context length.",
            )

    gr.Examples(
        examples=[
            [
                "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
                "af_heart",
                1.0,
            ],
            [
                "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
                "af_heart",
                1.0,
            ],
        ],
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

    generate_btn.click(
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

if __name__ == "__main__":
    demo.launch(mcp_server=True)
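# Example client call (a sketch, not part of the original app): because the click handler
# wraps `generate`, the Space also exposes a regular Gradio API endpoint that can be
# reached with `gradio_client`. The endpoint name "/generate" and the Space id below are
# assumptions for illustration only.
#
#   from gradio_client import Client
#
#   client = Client("hexgrad/Kokoro-TTS")  # hypothetical Space id
#   audio_path, tokens = client.predict(
#       text="Hello from Kokoro.",
#       voice="af_heart",
#       speed=1.0,
#       api_name="/generate",
#   )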