hexgrad committed on
Commit
5e9cc13
·
verified ·
1 Parent(s): 6459fb3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -26
app.py CHANGED
@@ -324,12 +324,10 @@ with gr.Blocks() as basic_tts:
324
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
325
 
326
  @torch.no_grad()
327
- def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
328
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
329
  outs = []
330
  for tokens in token_lists:
331
- if stop_event.is_set():
332
- break
333
  ref_s = voicepack[len(tokens)]
334
  s = ref_s[:, 128:]
335
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
@@ -355,8 +353,8 @@ def lf_forward(token_lists, voices, speed, stop_event, device='cpu'):
355
  return outs
356
 
357
  @spaces.GPU
358
- def lf_forward_gpu(token_lists, voices, speed, stop_event):
359
- return lf_forward(token_lists, voices, speed, stop_event, device='cuda')
360
 
361
  def resplit_strings(arr):
362
  # Handle edge cases
@@ -410,9 +408,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
410
  segments = [row for t in texts for row in recursive_split(t, voice)]
411
  return [(i, *row) for i, row in enumerate(segments)]
412
 
413
- def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, audio_stream=None):
414
- if audio_stream is not None and len(audio_stream) == 3:
415
- audio_stream[-1].set()
416
  token_lists = list(map(tokenize, segments['Tokens']))
417
  voices = resolve_voices(voice)
418
  speed = clamp_speed(speed)
@@ -420,19 +416,18 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
420
  pad_between = int(pad_between / speed)
421
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
422
  i = 0
423
- stop_event = threading.Event()
424
  while i < len(token_lists):
425
  bs = batch_sizes.pop() if batch_sizes else 100
426
  try:
427
  if use_gpu:
428
- outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed, stop_event)
429
  else:
430
- outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
431
  except gr.exceptions.Error as e:
432
  if use_gpu:
433
  gr.Warning(str(e))
434
  gr.Info('Switching to CPU')
435
- outs = lf_forward(token_lists[i:i+bs], voices, speed, stop_event)
436
  use_gpu = False
437
  else:
438
  raise gr.Error(e)
@@ -442,14 +437,10 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, a
442
  continue
443
  out = out[trim:-trim]
444
  if i > 0 and pad_between > 0:
445
- yield SAMPLE_RATE, np.zeros(pad_between), stop_event
446
- yield SAMPLE_RATE, out, stop_event
447
  i += bs
448
 
449
- def lf_stop(audio_stream):
450
- if audio_stream is not None and len(audio_stream) == 3:
451
- audio_stream[-1].set()
452
-
453
  def did_change_segments(segments):
454
  x = len(segments) if segments['Length'].any() else 0
455
  return [
@@ -494,18 +485,26 @@ with gr.Blocks() as lf_tts:
494
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
495
  trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
496
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
497
- with gr.Row():
 
 
 
 
 
 
 
 
498
  generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
499
- stop_btn = gr.Button('Stop', variant='stop')
500
  with gr.Row():
501
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
502
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
503
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
504
- generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, audio_stream], outputs=[audio_stream])
505
- stop_btn.click(lf_stop, inputs=[audio_stream], outputs=[audio_stream])
506
 
507
  with gr.Blocks() as about:
508
- gr.Markdown("""
509
  Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
510
 
511
  ### FAQ
@@ -551,10 +550,10 @@ Inference code: MIT<br/>
551
  [eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
552
  Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
553
  Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
554
- """)
555
 
556
  with gr.Blocks() as changelog:
557
- gr.Markdown("""
558
  **25 Nov 2024**<br/>
559
  🎨 Voice Mixer added
560
 
@@ -577,7 +576,7 @@ with gr.Blocks() as changelog:
577
  **12 Nov 2024**<br/>
578
  🚀 Model v0.14<br/>
579
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
580
- """)
581
 
582
  with gr.Blocks() as app:
583
  gr.TabbedInterface(
 
324
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
325
 
326
  @torch.no_grad()
327
+ def lf_forward(token_lists, voices, speed, device='cpu'):
328
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
329
  outs = []
330
  for tokens in token_lists:
 
 
331
  ref_s = voicepack[len(tokens)]
332
  s = ref_s[:, 128:]
333
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
 
353
  return outs
354
 
355
  @spaces.GPU
356
+ def lf_forward_gpu(token_lists, voices, speed):
357
+ return lf_forward(token_lists, voices, speed, device='cuda')
358
 
359
  def resplit_strings(arr):
360
  # Handle edge cases
 
408
  segments = [row for t in texts for row in recursive_split(t, voice)]
409
  return [(i, *row) for i, row in enumerate(segments)]
410
 
411
+ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
 
 
412
  token_lists = list(map(tokenize, segments['Tokens']))
413
  voices = resolve_voices(voice)
414
  speed = clamp_speed(speed)
 
416
  pad_between = int(pad_between / speed)
417
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
418
  i = 0
 
419
  while i < len(token_lists):
420
  bs = batch_sizes.pop() if batch_sizes else 100
421
  try:
422
  if use_gpu:
423
+ outs = lf_forward_gpu(token_lists[i:i+bs], voices, speed)
424
  else:
425
+ outs = lf_forward(token_lists[i:i+bs], voices, speed)
426
  except gr.exceptions.Error as e:
427
  if use_gpu:
428
  gr.Warning(str(e))
429
  gr.Info('Switching to CPU')
430
+ outs = lf_forward(token_lists[i:i+bs], voices, speed)
431
  use_gpu = False
432
  else:
433
  raise gr.Error(e)
 
437
  continue
438
  out = out[trim:-trim]
439
  if i > 0 and pad_between > 0:
440
+ yield (SAMPLE_RATE, np.zeros(pad_between))
441
+ yield (SAMPLE_RATE, out)
442
  i += bs
443
 
 
 
 
 
444
  def did_change_segments(segments):
445
  x = len(segments) if segments['Length'].any() else 0
446
  return [
 
485
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
486
  trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
487
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
488
+ with gr.Row(css='''
489
+ .square-stop-btn {
490
+ aspect-ratio: 1/1;
491
+ display: flex;
492
+ align-items: center;
493
+ justify-content: center;
494
+ padding: 0;
495
+ }
496
+ '''):
497
  generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
498
+ stop_btn = gr.Button('', variant='stop', elem_classes=['square-stop-btn'])
499
  with gr.Row():
500
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
501
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
502
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
503
+ generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
504
+ stop_btn.click(lambda: None, outputs=[audio_stream])
505
 
506
  with gr.Blocks() as about:
507
+ gr.Markdown('''
508
  Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L32) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
509
 
510
  ### FAQ
 
550
  [eSpeak NG](https://github.com/espeak-ng/espeak-ng): GPL-3.0<br/>
551
  Random English texts: Unknown from [Quotable Data](https://github.com/quotable-io/data/blob/master/data/quotes.json)<br/>
552
  Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/common-voice/common-voice/tree/main/server/data/ja)
553
+ ''')
554
 
555
  with gr.Blocks() as changelog:
556
+ gr.Markdown('''
557
  **25 Nov 2024**<br/>
558
  🎨 Voice Mixer added
559
 
 
576
  **12 Nov 2024**<br/>
577
  🚀 Model v0.14<br/>
578
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
579
+ ''')
580
 
581
  with gr.Blocks() as app:
582
  gr.TabbedInterface(