hexgrad committed on
Commit
5e9cc13
·
verified ·
1 Parent(s): 6459fb3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -26
app.py CHANGED
@@ -324,12 +324,10 @@ with gr.Blocks() as basic_tts:
324
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
325
 
326
  @torch.no_grad()
327
- def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
328
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
329
  outs = []
330
  for tokens in token_lists:
331
- if stop_event.is_set():
332
- break
333
  ref_s = voicepack[len(tokens)]
334
  s = ref_s[:, 128:]
335
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
@@ -355,8 +353,8 @@ def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
355
  return outs
356
 
357
  @spaces.GPU
358
- def lf_forward_gpu(token_lists, voices, speed, stop_event):
359
- return lf_forward(token_lists, voices, speed, stop_event, device='cuda')
360
 
361
  def resplit_strings(arr):
362
  # Handle edge cases
@@ -410,9 +408,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
410
  segments = [row for t in texts for row in recursive_split(t, voice)]
411
  return [(i, *row) for i, row in enumerate(segments)]
412
 
413
- def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, audio_stream=None):
414
- if audio_stream is not None and len(audio_stream) == 3:
415
- audio_stream[-1].set()
416
  token_lists = list(map(tokenize, segments['Tokens']))
417
  voices = resolve_voices(voice)
418
  speed = clamp_speed(speed)
@@ -420,19 +416,18 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
420
  pad_between = int(pad_between / speed)
421
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
422
  i = 0
423
- stop_event = threading.Event()
424
  while i < len(token_lists):
425
  bs = batch_sizes.pop() if batch_sizes else 100
426
  try:
427
  if use_gpu:
428
- outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed, stop_event)
429
  else:
430
- outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
431
  except gr.exceptions.Error as e:
432
  if use_gpu:
433
  gr.Warning(str(e))
434
  gr.Info('Switching to CPU')
435
- outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
436
  use_gpu = False
437
  else:
438
  raise gr.Error(e)
@@ -442,14 +437,10 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
442
  continue
443
  out = out[trim:-trim]
444
  if i > 0 and pad_between > 0:
445
- yield SAMPLE_RATE, np.zeros(pad_between), stop_event
446
- yield SAMPLE_RATE, out, stop_event
447
  i += bs
448
 
449
- def lf_stop(audio_stream):
450
- if audio_stream is not None and len(audio_stream) == 3:
451
- audio_stream[-1].set()
452
-
453
  def did_change_segments(segments):
454
  x = len(segments) if segments['Length'].any() else 0
455
  return [
@@ -494,18 +485,26 @@ with gr.Blocks() as lf_tts:
494
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
495
  trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
496
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
497
- with gr.Row():
 
 
 
 
 
 
 
 
498
  generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
499
- stop_btn = gr.Button('Stop', variant='stop')
500
  with gr.Row():
501
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
502
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
503
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
504
- generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, audio_stream], outputs=[audio_stream])
505
- stop_btn.click(lf_stop, inputs=[audio_stream], outputs=[audio_stream])
506
 
507
  with gr.Blocks() as about:
508
- gr.Markdown("""
509
  Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
510
 
511
  ### FAQ
@@ -551,10 +550,10 @@ Inference code: MIT<br/>
551
  [eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
552
  Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
553
  Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
554
- """)
555
 
556
  with gr.Blocks() as changelog:
557
- gr.Markdown("""
558
  **25 Nov 2024**<br/>
559
  🎨 Voice Mixer added
560
 
@@ -577,7 +576,7 @@ with gr.Blocks() as changelog:
577
  **12 Nov 2024**<br/>
578
  🚀 Model v0.14<br/>
579
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
580
- """)
581
 
582
  with gr.Blocks() as app:
583
  gr.TabbedInterface(
 
324
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
325
 
326
  @torch.no_grad()
327
+ def lf_forward(token_lists, voices, speed, device='cpu'):
328
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
329
  outs = []
330
  for tokens in token_lists:
 
 
331
  ref_s = voicepack[len(tokens)]
332
  s = ref_s[:, 128:]
333
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
 
353
  return outs
354
 
355
  @spaces.GPU
356
+ def lf_forward_gpu(token_lists, voices, speed):
357
+ return lf_forward(token_lists, voices, speed, device='cuda')
358
 
359
  def resplit_strings(arr):
360
  # Handle edge cases
 
408
  segments = [row for t in texts for row in recursive_split(t, voice)]
409
  return [(i, *row) for i, row in enumerate(segments)]
410
 
411
+ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
 
 
412
  token_lists = list(map(tokenize, segments['Tokens']))
413
  voices = resolve_voices(voice)
414
  speed = clamp_speed(speed)
 
416
  pad_between = int(pad_between / speed)
417
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
418
  i = 0
 
419
  while i < len(token_lists):
420
  bs = batch_sizes.pop() if batch_sizes else 100
421
  try:
422
  if use_gpu:
423
+ outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed)
424
  else:
425
+ outs = lf_forward(token_lists[i:i+bs], voices, speed)
426
  except gr.exceptions.Error as e:
427
  if use_gpu:
428
  gr.Warning(str(e))
429
  gr.Info('Switching to CPU')
430
+ outs = lf_forward(token_lists[i:i+bs], voices, speed)
431
  use_gpu = False
432
  else:
433
  raise gr.Error(e)
 
437
  continue
438
  out = out[trim:-trim]
439
  if i > 0 and pad_between > 0:
440
+ yield (SAMPLE_RATE, np.zeros(pad_between))
441
+ yield (SAMPLE_RATE, out)
442
  i += bs
443
 
 
 
 
 
444
  def did_change_segments(segments):
445
  x = len(segments) if segments['Length'].any() else 0
446
  return [
 
485
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
486
  trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
487
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
488
+ with gr.Row(css='''
489
+ .square-stop-btn {
490
+ aspect-ratio: 1/1;
491
+ display: flex;
492
+ align-items: center;
493
+ justify-content: center;
494
+ padding: 0;
495
+ }
496
+ '''):
497
  generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
498
+ stop_btn = gr.Button('', variant='stop', elem_classes=['square-stop-btn'])
499
  with gr.Row():
500
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
501
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
502
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
503
+ generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
504
+ stop_btn.click(lambda: None, outputs=[audio_stream])
505
 
506
  with gr.Blocks() as about:
507
+ gr.Markdown('''
508
  Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
509
 
510
  ### FAQ
 
550
  [eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
551
  Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
552
  Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
553
+ ''')
554
 
555
  with gr.Blocks() as changelog:
556
+ gr.Markdown('''
557
  **25 Nov 2024**<br/>
558
  🎨 Voice Mixer added
559
 
 
576
  **12 Nov 2024**<br/>
577
  🚀 Model v0.14<br/>
578
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
579
+ ''')
580
 
581
  with gr.Blocks() as app:
582
  gr.TabbedInterface(