Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
CHANGED
@@ -324,12 +324,10 @@ with gr.Blocks() as basic_tts:
|
|
324 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
325 |
|
326 |
@torch.no_grad()
|
327 |
-
def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
|
328 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
329 |
outs = []
|
330 |
for tokens in token_lists:
|
331 |
-
if stop_event.is_set():
|
332 |
-
break
|
333 |
ref_s = voicepack[len(tokens)]
|
334 |
s = ref_s[:, 128:]
|
335 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
@@ -355,8 +353,8 @@ def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
|
|
355 |
return outs
|
356 |
|
357 |
@spaces.GPU
|
358 |
-
def lf_forward_gpu(token_lists, voices, speed, stop_event):
|
359 |
-
return lf_forward(token_lists, voices, speed, stop_event, device='cuda')
|
360 |
|
361 |
def resplit_strings(arr):
|
362 |
# Handle edge cases
|
@@ -410,9 +408,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
410 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
411 |
return [(i, *row) for i, row in enumerate(segments)]
|
412 |
|
413 |
-
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, audio_stream=None):
|
414 |
-
if audio_stream is not None and len(audio_stream) == 3:
|
415 |
-
audio_stream[-1].set()
|
416 |
token_lists = list(map(tokenize, segments['Tokens']))
|
417 |
voices = resolve_voices(voice)
|
418 |
speed = clamp_speed(speed)
|
@@ -420,19 +416,18 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
|
|
420 |
pad_between = int(pad_between / speed)
|
421 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
422 |
i = 0
|
423 |
-
stop_event = threading.Event()
|
424 |
while i < len(token_lists):
|
425 |
bs = batch_sizes.pop() if batch_sizes else 100
|
426 |
try:
|
427 |
if use_gpu:
|
428 |
-
outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed, stop_event)
|
429 |
else:
|
430 |
-
outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
|
431 |
except gr.exceptions.Error as e:
|
432 |
if use_gpu:
|
433 |
gr.Warning(str(e))
|
434 |
gr.Info('Switching to CPU')
|
435 |
-
outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
|
436 |
use_gpu = False
|
437 |
else:
|
438 |
raise gr.Error(e)
|
@@ -442,14 +437,10 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
|
|
442 |
continue
|
443 |
out = out[trim:-trim]
|
444 |
if i > 0 and pad_between > 0:
|
445 |
-
yield SAMPLE_RATE, np.zeros(pad_between)
|
446 |
-
yield SAMPLE_RATE, out
|
447 |
i += bs
|
448 |
|
449 |
-
def lf_stop(audio_stream):
|
450 |
-
if audio_stream is not None and len(audio_stream) == 3:
|
451 |
-
audio_stream[-1].set()
|
452 |
-
|
453 |
def did_change_segments(segments):
|
454 |
x = len(segments) if segments['Length'].any() else 0
|
455 |
return [
|
@@ -494,18 +485,26 @@ with gr.Blocks() as lf_tts:
|
|
494 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
495 |
trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
|
496 |
pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
|
497 |
-
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
|
499 |
-
stop_btn = gr.Button('
|
500 |
with gr.Row():
|
501 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
502 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
503 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
504 |
-
generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu
|
505 |
-
stop_btn.click(
|
506 |
|
507 |
with gr.Blocks() as about:
|
508 |
-
gr.Markdown(
|
509 |
Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
|
510 |
|
511 |
### FAQ
|
@@ -551,10 +550,10 @@ Inference code: MIT<br/>
|
|
551 |
[eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
|
552 |
Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
|
553 |
Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
|
554 |
-
|
555 |
|
556 |
with gr.Blocks() as changelog:
|
557 |
-
gr.Markdown(
|
558 |
**25 Nov 2024**<br/>
|
559 |
🎨 Voice Mixer added
|
560 |
|
@@ -577,7 +576,7 @@ with gr.Blocks() as changelog:
|
|
577 |
**12 Nov 2024**<br/>
|
578 |
🚀 Model v0.14<br/>
|
579 |
🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
|
580 |
-
|
581 |
|
582 |
with gr.Blocks() as app:
|
583 |
gr.TabbedInterface(
|
|
|
324 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
325 |
|
326 |
@torch.no_grad()
|
327 |
+
def lf_forward(token_lists, voices, speed, device='cpu'):
|
328 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
329 |
outs = []
|
330 |
for tokens in token_lists:
|
|
|
|
|
331 |
ref_s = voicepack[len(tokens)]
|
332 |
s = ref_s[:, 128:]
|
333 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
|
|
353 |
return outs
|
354 |
|
355 |
@spaces.GPU
|
356 |
+
def lf_forward_gpu(token_lists, voices, speed):
|
357 |
+
return lf_forward(token_lists, voices, speed, device='cuda')
|
358 |
|
359 |
def resplit_strings(arr):
|
360 |
# Handle edge cases
|
|
|
408 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
409 |
return [(i, *row) for i, row in enumerate(segments)]
|
410 |
|
411 |
+
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
|
|
|
|
412 |
token_lists = list(map(tokenize, segments['Tokens']))
|
413 |
voices = resolve_voices(voice)
|
414 |
speed = clamp_speed(speed)
|
|
|
416 |
pad_between = int(pad_between / speed)
|
417 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
418 |
i = 0
|
|
|
419 |
while i < len(token_lists):
|
420 |
bs = batch_sizes.pop() if batch_sizes else 100
|
421 |
try:
|
422 |
if use_gpu:
|
423 |
+
outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed)
|
424 |
else:
|
425 |
+
outs = lf_forward(token_lists[i:i+bs], voices, speed)
|
426 |
except gr.exceptions.Error as e:
|
427 |
if use_gpu:
|
428 |
gr.Warning(str(e))
|
429 |
gr.Info('Switching to CPU')
|
430 |
+
outs = lf_forward(token_lists[i:i+bs], voices, speed)
|
431 |
use_gpu = False
|
432 |
else:
|
433 |
raise gr.Error(e)
|
|
|
437 |
continue
|
438 |
out = out[trim:-trim]
|
439 |
if i > 0 and pad_between > 0:
|
440 |
+
yield (SAMPLE_RATE, np.zeros(pad_between))
|
441 |
+
yield (SAMPLE_RATE, out)
|
442 |
i += bs
|
443 |
|
|
|
|
|
|
|
|
|
444 |
def did_change_segments(segments):
|
445 |
x = len(segments) if segments['Length'].any() else 0
|
446 |
return [
|
|
|
485 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
486 |
trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
|
487 |
pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
|
488 |
+
with gr.Row(css='''
|
489 |
+
.square-stop-btn {
|
490 |
+
aspect-ratio: 1/1;
|
491 |
+
display: flex;
|
492 |
+
align-items: center;
|
493 |
+
justify-content: center;
|
494 |
+
padding: 0;
|
495 |
+
}
|
496 |
+
'''):
|
497 |
generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
|
498 |
+
stop_btn = gr.Button('■', variant='stop', elem_classes=['square-stop-btn'])
|
499 |
with gr.Row():
|
500 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
501 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
502 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
503 |
+
generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
|
504 |
+
stop_btn.click(lambda: None, outputs=[audio_stream])
|
505 |
|
506 |
with gr.Blocks() as about:
|
507 |
+
gr.Markdown('''
|
508 |
Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
|
509 |
|
510 |
### FAQ
|
|
|
550 |
[eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
|
551 |
Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
|
552 |
Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
|
553 |
+
''')
|
554 |
|
555 |
with gr.Blocks() as changelog:
|
556 |
+
gr.Markdown('''
|
557 |
**25 Nov 2024**<br/>
|
558 |
🎨 Voice Mixer added
|
559 |
|
|
|
576 |
**12 Nov 2024**<br/>
|
577 |
🚀 Model v0.14<br/>
|
578 |
🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
|
579 |
+
''')
|
580 |
|
581 |
with gr.Blocks() as app:
|
582 |
gr.TabbedInterface(
|