hexgrad committed on
Commit
d01e985
·
verified ·
1 Parent(s): e273309

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -21
app.py CHANGED
@@ -270,17 +270,17 @@ def clamp_speed(speed):
270
  return 2
271
  return speed
272
 
273
def clamp_top_db(top_db):
    """Normalize a top_db threshold for librosa trimming.

    Non-numeric input falls back to the 60 dB default; values below 30
    disable trimming (None); values above 90 are capped at 90.
    """
    if not isinstance(top_db, (float, int)):
        return 60
    if top_db < 30:
        # Too aggressive a threshold — signal "no trim" instead.
        return None
    return 90 if top_db > 90 else top_db
281
 
282
  # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
283
- def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=None):
284
  ps = ps or phonemize(text, voice)
285
  if not sk and (text in sents or ps.strip('"') in harvsents):
286
  sk = os.environ['SK']
@@ -288,7 +288,7 @@ def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=N
288
  return (None, '')
289
  voices = resolve_voices(voice, warn=ps)
290
  speed = clamp_speed(speed)
291
- top_db = clamp_top_db(top_db)
292
  use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
293
  tokens = tokenize(ps)
294
  if not tokens:
@@ -312,8 +312,11 @@ def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=N
312
  raise gr.Error(e)
313
  print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
314
  return (None, '')
315
- if top_db:
316
- out, _ = librosa.effects.trim(out, top_db=top_db)
 
 
 
317
  print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
318
  return ((SAMPLE_RATE, out), ps)
319
 
@@ -359,7 +362,7 @@ with gr.Blocks() as basic_tts:
359
  autoplay = gr.Checkbox(value=True, label='Autoplay')
360
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
361
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
362
- top_db = gr.Slider(minimum=0, maximum=90, value=60, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
363
  with gr.Accordion('Output Tokens', open=True):
364
  out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
365
  with gr.Accordion('Voice Mixer', open=False):
@@ -374,8 +377,8 @@ with gr.Blocks() as basic_tts:
374
  with gr.Row():
375
  sk = gr.Textbox(visible=False)
376
  text.change(lambda: os.environ['SK'], outputs=[sk])
377
- text.submit(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
378
- generate_btn.click(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
379
 
380
  @torch.no_grad()
381
  def lf_forward(token_lists, voices, speed, sk, device='cpu'):
@@ -464,13 +467,13 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
464
  segments = [row for t in texts for row in recursive_split(t, voice)]
465
  return [(i, *row) for i, row in enumerate(segments)]
466
 
467
- def lf_generate(segments, voice, speed=1, top_db=0, pad_between=0, use_gpu=True, sk=None):
468
  if sk != os.environ['SK']:
469
  return
470
  token_lists = list(map(tokenize, segments['Tokens']))
471
  voices = resolve_voices(voice)
472
  speed = clamp_speed(speed)
473
- top_db = clamp_top_db(top_db)
474
  pad_between = int(pad_between)
475
  use_gpu = True
476
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
@@ -496,8 +499,11 @@ def lf_generate(segments, voice, speed=1, top_db=0, pad_between=0, use_gpu=True,
496
  else:
497
  raise gr.Error(e)
498
  for out in outs:
499
- if top_db:
500
- out, _ = librosa.effects.trim(out, top_db=top_db)
 
 
 
501
  if i > 0 and pad_between > 0:
502
  yield (SAMPLE_RATE, np.zeros(pad_between))
503
  yield (SAMPLE_RATE, out)
@@ -542,7 +548,7 @@ with gr.Blocks() as lf_tts:
542
  audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
543
  with gr.Accordion('Audio Settings', open=True):
544
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
545
- top_db = gr.Slider(minimum=0, maximum=90, value=0, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
546
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How many silent samples to insert between segments')
547
  with gr.Row():
548
  segment_btn = gr.Button('Tokenize', variant='primary')
@@ -555,7 +561,7 @@ with gr.Blocks() as lf_tts:
555
  sk = gr.Textbox(visible=False)
556
  segments.change(lambda: os.environ['SK'], outputs=[sk])
557
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
558
- generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, top_db, pad_between, use_gpu, sk], outputs=[audio_stream])
559
  stop_btn.click(fn=None, cancels=generate_event)
560
 
561
  with gr.Blocks() as about:
 
270
  return 2
271
  return speed
272
 
273
def clamp_trim(trim):
    """Normalize a trim fraction to a sane value.

    Non-numeric input falls back to the 0.5 default; negative values are
    floored at 0. Values above 1 also return the 0.5 default (treated as
    invalid rather than clamped to 1 — NOTE(review): intentional per the
    original, though it differs from the low-end clamp).
    """
    if not isinstance(trim, (float, int)):
        return 0.5
    if trim < 0:
        return 0
    return 0.5 if trim > 1 else trim
281
 
282
  # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
283
+ def generate(text, voice='af', ps=None, speed=1, trim=0.5, use_gpu='auto', sk=None):
284
  ps = ps or phonemize(text, voice)
285
  if not sk and (text in sents or ps.strip('"') in harvsents):
286
  sk = os.environ['SK']
 
288
  return (None, '')
289
  voices = resolve_voices(voice, warn=ps)
290
  speed = clamp_speed(speed)
291
+ trim = clamp_trim(trim)
292
  use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
293
  tokens = tokenize(ps)
294
  if not tokens:
 
312
  raise gr.Error(e)
313
  print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
314
  return (None, '')
315
+ if trim:
316
+ a, b = librosa.effects.trim(out, top_db=30)[1]
317
+ a = int(a*trim)
318
+ b = int(len(out)-(len(out)-b)*trim)
319
+ out = out[a:b]
320
  print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
321
  return ((SAMPLE_RATE, out), ps)
322
 
 
362
  autoplay = gr.Checkbox(value=True, label='Autoplay')
363
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
364
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
365
+ trim = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label='✂️ Trim', info='How much to cut from both ends')
366
  with gr.Accordion('Output Tokens', open=True):
367
  out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
368
  with gr.Accordion('Voice Mixer', open=False):
 
377
  with gr.Row():
378
  sk = gr.Textbox(visible=False)
379
  text.change(lambda: os.environ['SK'], outputs=[sk])
380
+ text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
381
+ generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
382
 
383
  @torch.no_grad()
384
  def lf_forward(token_lists, voices, speed, sk, device='cpu'):
 
467
  segments = [row for t in texts for row in recursive_split(t, voice)]
468
  return [(i, *row) for i, row in enumerate(segments)]
469
 
470
+ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
471
  if sk != os.environ['SK']:
472
  return
473
  token_lists = list(map(tokenize, segments['Tokens']))
474
  voices = resolve_voices(voice)
475
  speed = clamp_speed(speed)
476
+ trim = clamp_trim(trim)
477
  pad_between = int(pad_between)
478
  use_gpu = True
479
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
 
499
  else:
500
  raise gr.Error(e)
501
  for out in outs:
502
+ if trim:
503
+ a, b = librosa.effects.trim(out, top_db=30)[1]
504
+ a = int(a*trim)
505
+ b = int(len(out)-(len(out)-b)*trim)
506
+ out = out[a:b]
507
  if i > 0 and pad_between > 0:
508
  yield (SAMPLE_RATE, np.zeros(pad_between))
509
  yield (SAMPLE_RATE, out)
 
548
  audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
549
  with gr.Accordion('Audio Settings', open=True):
550
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
551
+ trim = gr.Slider(minimum=0, maximum=1, value=0, step=0.1, label='✂️ Trim', info='How much to cut from both ends')
552
  pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How many silent samples to insert between segments')
553
  with gr.Row():
554
  segment_btn = gr.Button('Tokenize', variant='primary')
 
561
  sk = gr.Textbox(visible=False)
562
  segments.change(lambda: os.environ['SK'], outputs=[sk])
563
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
564
+ generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
565
  stop_btn.click(fn=None, cancels=generate_event)
566
 
567
  with gr.Blocks() as about: