hexgrad committed
Commit e273309 · verified · 1 Parent(s): 67503fa

Upload 2 files

Files changed (2)
  1. app.py +26 -21
  2. requirements.txt +1 -0
app.py CHANGED
@@ -3,6 +3,7 @@ from huggingface_hub import snapshot_download
 from katsu import Katsu
 from models import build_model
 import gradio as gr
+import librosa
 import numpy as np
 import os
 import phonemizer
@@ -269,8 +270,17 @@ def clamp_speed(speed):
         return 2
     return speed
 
+def clamp_top_db(top_db):
+    if not isinstance(top_db, float) and not isinstance(top_db, int):
+        return 60
+    elif top_db < 30:
+        return None
+    elif top_db > 90:
+        return 90
+    return top_db
+
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=None):
+def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=None):
     ps = ps or phonemize(text, voice)
     if not sk and (text in sents or ps.strip('"') in harvsents):
         sk = os.environ['SK']
@@ -278,7 +288,7 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
         return (None, '')
     voices = resolve_voices(voice, warn=ps)
     speed = clamp_speed(speed)
-    trim = trim if isinstance(trim, int) else 3000
+    top_db = clamp_top_db(top_db)
     use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
     tokens = tokenize(ps)
     if not tokens:
@@ -302,11 +312,8 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
             raise gr.Error(e)
         print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
         return (None, '')
-    trim = int(trim / speed)
-    if trim > 0:
-        if trim * 2 >= len(out):
-            return (None, '')
-        out = out[trim:-trim]
+    if top_db:
+        out, _ = librosa.effects.trim(out, top_db=top_db)
     print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
     return ((SAMPLE_RATE, out), ps)
 
@@ -352,7 +359,7 @@ with gr.Blocks() as basic_tts:
             autoplay = gr.Checkbox(value=True, label='Autoplay')
             autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
-            trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
+            top_db = gr.Slider(minimum=0, maximum=90, value=60, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
             with gr.Accordion('Voice Mixer', open=False):
@@ -367,8 +374,8 @@ with gr.Blocks() as basic_tts:
     with gr.Row():
         sk = gr.Textbox(visible=False)
     text.change(lambda: os.environ['SK'], outputs=[sk])
-    text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
-    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
+    text.submit(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
+    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voices, speed, sk, device='cpu'):
@@ -457,14 +464,14 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
+def lf_generate(segments, voice, speed=1, top_db=0, pad_between=0, use_gpu=True, sk=None):
     if sk != os.environ['SK']:
         return
     token_lists = list(map(tokenize, segments['Tokens']))
     voices = resolve_voices(voice)
     speed = clamp_speed(speed)
-    trim = int(trim / speed)
-    pad_between = int(pad_between / speed)
+    top_db = clamp_top_db(top_db)
+    pad_between = int(pad_between)
     use_gpu = True
     batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
     i = 0
@@ -489,10 +496,8 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, s
             else:
                 raise gr.Error(e)
         for out in outs:
-            if trim > 0:
-                if trim * 2 >= len(out):
-                    continue
-                out = out[trim:-trim]
+            if top_db:
+                out, _ = librosa.effects.trim(out, top_db=top_db)
             if i > 0 and pad_between > 0:
                 yield (SAMPLE_RATE, np.zeros(pad_between))
             yield (SAMPLE_RATE, out)
@@ -537,8 +542,8 @@ with gr.Blocks() as lf_tts:
             audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
             with gr.Accordion('Audio Settings', open=True):
                 speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
-                trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends of each segment')
-                pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
+                top_db = gr.Slider(minimum=0, maximum=90, value=0, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
+                pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How many silent samples to insert between segments')
             with gr.Row():
                 segment_btn = gr.Button('Tokenize', variant='primary')
                 generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
@@ -550,12 +555,12 @@ with gr.Blocks() as lf_tts:
         sk = gr.Textbox(visible=False)
     segments.change(lambda: os.environ['SK'], outputs=[sk])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
+    generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, top_db, pad_between, use_gpu, sk], outputs=[audio_stream])
     stop_btn.click(fn=None, cancels=generate_event)
 
 with gr.Blocks() as about:
     gr.Markdown('''
-Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L33) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
+Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L34) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
 
 ### FAQ
 **Will this be open sourced?**<br/>
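The new `clamp_top_db` helper keeps caller input inside the range the trimming code expects. A minimal sketch of how it behaves, copying the function body exactly as added above (the example inputs are hypothetical):

```python
# clamp_top_db as added in this commit (copied from the diff above).
def clamp_top_db(top_db):
    if not isinstance(top_db, float) and not isinstance(top_db, int):
        return 60
    elif top_db < 30:
        return None
    elif top_db > 90:
        return 90
    return top_db

# The sliders step by 30 over 0-90, so the reachable UI values are 0, 30, 60, 90.
for value in (0, 30, 60, 90, 3000, 'oops'):
    print(repr(value), '->', clamp_top_db(value))
# 0      -> None  (falsy, so `if top_db:` skips trimming; matches the old trim=0 default in lf_generate)
# 30     -> 30
# 60     -> 60
# 90     -> 90
# 3000   -> 90    (a legacy caller still passing the old trim=3000 positionally gets clamped)
# 'oops' -> 60    (non-numeric input falls back to the default)
```

Returning None instead of 0 keeps the `if top_db:` checks in `generate` and `lf_generate` as plain truthiness tests.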
requirements.txt CHANGED
@@ -1,5 +1,6 @@
 fugashi
 gradio
+librosa
 mojimoji
 munch
 phonemizer
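`librosa` is pulled in for `librosa.effects.trim`, which removes leading and trailing audio that sits more than `top_db` decibels below the signal's peak, replacing the old fixed sample-count slicing. A rough sketch of the difference on a synthetic signal (the 24000 sample rate is an assumption based on the slider ranges, not taken from the commit, which uses the app's `SAMPLE_RATE` constant):

```python
import librosa
import numpy as np

sr = 24000  # assumed sample rate for illustration only

# One second of tone padded with 0.25 s of silence on each side.
tone = (0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)).astype(np.float32)
pad = np.zeros(sr // 4, dtype=np.float32)
out = np.concatenate([pad, tone, pad])

# Old behavior: slice a fixed number of samples off both ends, regardless of content.
old = out[3000:-3000]

# New behavior: trim whatever is quieter than top_db below the peak, however long it is.
new, index = librosa.effects.trim(out, top_db=60)

print(len(out), len(old), len(new), index)  # the trimmed span roughly matches the tone
```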