import gradio as gr
import numpy as np
import spaces
from kokoro import KModel, KPipeline

# Load the 82M Kokoro model onto the GPU once; the per-language pipelines below are
# created with model=False, so they only handle G2P and this shared model is called directly.
model = KModel(repo_id="hexgrad/Kokoro-82M").to("cuda")
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
# Pin the pronunciation of "kokoro" in both the American ("a") and British ("b") lexicons.
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"  # noqa: RUF001
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"  # noqa: RUF001

CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
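# Pre-load every voice pack; the first character of each voice id ("a" or "b") picks its pipeline.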
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)


@spaces.GPU(duration=30)
def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Synthesizes speech from English text using the Kokoro TTS model.

    Note:
        This model supports only English input texts.

    Voice Selection:
        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
        `<language/accent><gender>_<voice_name>`
        - `<language/accent>`:
            - 'a' for American English
            - 'b' for British English
        - `<gender>`:
            - 'f' for female
            - 'm' for male
        - Example: 'af_heart' indicates an American English female voice named Heart.

    Available voices:
        - af_heart
        - af_bella
        - af_nicole
        - af_aoede
        - af_kore
        - af_sarah
        - af_nova
        - af_sky
        - af_alloy
        - af_jessica
        - af_river
        - am_michael
        - am_fenrir
        - am_puck
        - am_echo
        - am_eric
        - am_liam
        - am_onyx
        - am_santa
        - am_adam
        - bf_emma
        - bf_isabella
        - bf_alice
        - bf_lily
        - bm_george
        - bm_fable
        - bm_lewis
        - bm_daniel

    Args:
        text: Input text to be synthesized. Only English text is supported. Non-English input may result in errors or mispronunciations.
        voice: Identifier for the voice to be used in synthesis. Defaults to "af_heart".
        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.

    Returns:
        A tuple containing the audio as a (sample_rate, waveform) pair and the phoneme tokens used to generate it.
    """
    # Route to the pipeline matching the voice's language prefix ("a" = American, "b" = British).
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    generator = pipeline(text, voice, speed)
    # Only use the first batch of tokens
    _, ps, _ = next(generator)
    # Pick the voice's style reference for this phoneme-sequence length.
    ref_s = pack[len(ps) - 1]
    audio = model(ps, ref_s, speed)
    # Kokoro outputs 24 kHz audio.
    return (24000, audio.numpy()), ps
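
# A minimal sketch of calling `generate` directly, assuming the model and voice packs
# loaded above (the example text and settings are arbitrary, not from a measured run):
#   (sample_rate, waveform), phonemes = generate("Hello from Kokoro!", voice="af_bella", speed=1.2)
#   # sample_rate == 24000, waveform is a NumPy array, phonemes is the token string used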


with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro TTS")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text",
                info="Up to ~500 characters.",
            )
            voice = gr.Dropdown(
                label="Voice",
                choices=list(CHOICES.items()),
                value="af_heart",
                info="Quality and availability vary by language",
            )
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1)
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
            out_ps = gr.Textbox(
                label="Output Tokens",
                info="Tokens used to generate the audio, up to 510 context length.",
            )

    gr.Examples(
        examples=[
            [
                "She sells seashells by the seashore. The shells she sells are surely seashells. So if she sells shells on the seashore, I'm sure she sells seashore shells.",
                "af_heart",
                1.0,
            ],
            [
                "Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?",
                "af_heart",
                1.0,
            ],
        ],
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

    generate_btn.click(
        fn=generate,
        inputs=[text, voice, speed],
        outputs=[out_audio, out_ps],
    )

if __name__ == "__main__":
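    # mcp_server=True also exposes `generate` as an MCP tool via Gradio's MCP support
    # (this typically requires the gradio[mcp] extra to be installed).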
    demo.launch(mcp_server=True)