hexgrad committed on
Commit 46c7c53 · verified · 1 Parent(s): 9ea675a

Upload app.py

Files changed (1):
  app.py +21 -22
app.py CHANGED
@@ -220,7 +220,7 @@ def clamp_speed(speed):
     return speed
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice='af', ps=None, speed=1, _reduce_noise=0.5, trim=3000, _closing_cut=2000, _ease_in=3000, _ease_out=1000, _pad_before=5000, _pad_after=5000, use_gpu='auto'):
+def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
     voice = voice if voice in VOICES['cpu'] else 'af'
     ps = ps or phonemize(text, voice)
     speed = clamp_speed(speed)
@@ -232,14 +232,20 @@ def generate(text, voice='af', ps=None, speed=1, _reduce_noise=0.5, trim=3000, _
     elif len(tokens) > 510:
         tokens = tokens[:510]
         ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
+    use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
     try:
-        if not use_gpu or (use_gpu == 'auto' and len(ps) < 100):
-            out = forward(tokens, voice, speed)
-        else:
+        if use_gpu:
             out = forward_gpu(tokens, voice, speed)
+        else:
+            out = forward(tokens, voice, speed)
     except gr.exceptions.Error as e:
-        raise gr.Error(e)
-        return (None, '')
+        if use_gpu:
+            gr.Warning(str(e))
+            gr.Info('GPU failover to CPU')
+            out = forward(tokens, voice, speed)
+        else:
+            raise gr.Error(e)
+            return (None, '')
     trim = int(trim / speed)
     if trim > 0:
         if trim * 2 >= len(out):
@@ -292,15 +298,8 @@ with gr.Blocks() as basic_tts:
         trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
         with gr.Accordion('Output Tokens', open=True):
             out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
-    with gr.Row():
-        _reduce_noise = gr.Slider(value=0.5, interactive=False, visible=False)
-        _closing_cut = gr.Slider(value=2000, interactive=False, visible=False)
-        _ease_in = gr.Slider(value=3000, interactive=False, visible=False)
-        _ease_out = gr.Slider(value=1000, interactive=False, visible=False)
-        _pad_before = gr.Slider(value=5000, interactive=False, visible=False)
-        _pad_after = gr.Slider(value=5000, interactive=False, visible=False)
-    text.submit(generate, inputs=[text, voice, in_ps, speed, _reduce_noise, trim, _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu], outputs=[audio, out_ps])
-    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, _reduce_noise, trim, _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu], outputs=[audio, out_ps])
+    text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
+    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed, device='cpu'):
@@ -470,19 +469,19 @@ with gr.Blocks() as about:
 Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L31) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
 
 ### FAQ
-#### Will this be open sourced?
+**Will this be open sourced?**
 There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.
 
-#### What is the difference between stable and unstable voices?
+**What is the difference between stable and unstable voices?**
 Unstable voices are more likely to stumble or produce unnatural artifacts, especially on short or strange texts. Stable voices are more likely to deliver natural speech on a wider range of inputs. The first two audio clips in this [blog post](https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade) are examples of unstable and stable speech. Note that even unstable voices can sound fine on medium to long texts.
 
-#### How can CPU be faster than ZeroGPU?
+**How can CPU be faster than ZeroGPU?**
 The CPU is a dedicated resource for this Space, while the ZeroGPU pool is shared and dynamically allocated across all of HF. The ZeroGPU queue/allocator system inevitably adds latency to each request.<br/>
 For Basic TTS under ~100 tokens or characters, only a few seconds of audio need to be generated, so the actual compute is not that heavy. In these short bursts, the dedicated CPU can often compute the result faster than the total time it takes to: enter the ZeroGPU queue, wait to get allocated, and have a GPU compute and deliver the result.<br/>
 ZeroGPU catches up beyond 100 tokens and especially closer to the ~500 token context window. Long-Form mode processes batches of 100 segments at a time, so the GPU should outspeed the CPU by 1-2 orders of magnitude.
 
 ### Compute
-The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup><br/>
+The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<br/>
 Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
 The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
 
@@ -522,16 +521,16 @@ with gr.Blocks() as changelog:
 
 ### 22 Nov 2024
 🚀 Model v0.19<br/>
-🧪 Validation losses: 0.261 mel / 0.627 dur / 1.897 f0<br/>
+🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
 📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
 
 ### 15 Nov 2024
 🚀 Model v0.16<br/>
-🧪 Validation losses: 0.263 mel / 0.646 dur / 1.934 f0
+🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
 
 ### 12 Nov 2024
 🚀 Model v0.14<br/>
-🧪 Validation losses: 0.262 mel / 0.642 dur / 1.889 f0
+🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
 """)
 
 with gr.Blocks() as app:
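The substantive change in this commit is that `generate` now resolves `use_gpu` up front ('auto' selects the GPU only for inputs of 100 or more phoneme characters) and, if the GPU path raises, fails over to CPU instead of surfacing an error. Below is a minimal, self-contained sketch of that dispatch-and-failover pattern; `run_cpu` and `run_gpu` are hypothetical stand-ins for the Space's `forward` and `forward_gpu`, and `RuntimeError` stands in for `gr.exceptions.Error`.

# Hypothetical stand-ins for forward / forward_gpu; not the Space's real functions.
def run_cpu(ps):
    return f'cpu audio for {len(ps)} phoneme chars'

def run_gpu(ps):
    return f'gpu audio for {len(ps)} phoneme chars'

def synthesize(ps, use_gpu='auto'):
    # 'auto' routes short inputs (under 100 phoneme characters) to the dedicated CPU.
    use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
    try:
        return run_gpu(ps) if use_gpu else run_cpu(ps)
    except RuntimeError as e:
        if not use_gpu:
            raise
        # GPU path failed (e.g. ZeroGPU quota or allocation error): fall back to CPU.
        print(f'GPU failover to CPU: {e}')
        return run_cpu(ps)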
 
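Both event handlers now pass the same six components, and Gradio forwards component values positionally, so the order of the `inputs` list must match the new `generate` signature. A rough sketch of that wiring under those assumptions, with a toy stand-in for `generate` and an illustrative layout (not the Space's actual one):

import gradio as gr

# Toy stand-in with the same positional parameter order as the slimmed-down inputs list.
def fake_generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
    return None, ps or text  # (audio, out_ps)

with gr.Blocks() as demo:
    text = gr.Textbox(label='Input Text')
    voice = gr.Dropdown(['af'], value='af', label='Voice')
    in_ps = gr.Textbox(label='Input Tokens')
    speed = gr.Slider(minimum=0.5, maximum=2, value=1, label='Speed')
    trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Trim')
    use_gpu = gr.Dropdown([('Auto', 'auto'), ('CPU', False), ('ZeroGPU', True)], value='auto', label='Hardware')
    generate_btn = gr.Button('Generate')
    audio = gr.Audio(label='Output Audio')
    out_ps = gr.Textbox(label='Output Tokens')
    # Enter in the textbox and the button click run the same function with the same inputs.
    text.submit(fake_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
    generate_btn.click(fake_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])

# demo.launch()  # uncomment to serve locally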
 
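The FAQ's CPU-versus-ZeroGPU argument is essentially about fixed overhead: every ZeroGPU request pays a queue/allocation cost before any compute, while the dedicated CPU pays only compute. A back-of-envelope model with invented constants (placeholders, not measurements from this Space) reproduces the qualitative crossover near the ~100-token mark:

# Illustrative latency model; the per-token costs and queue overhead are invented numbers.
def cpu_latency(n_tokens, sec_per_token=0.03):
    return n_tokens * sec_per_token                        # dedicated CPU: compute only

def zerogpu_latency(n_tokens, sec_per_token=0.002, queue_overhead=4.0):
    return queue_overhead + n_tokens * sec_per_token       # shared GPU: queue/alloc + compute

for n in (50, 100, 250, 500):
    print(f'{n:>3} tokens  cpu={cpu_latency(n):5.2f}s  zerogpu={zerogpu_latency(n):5.2f}s')
# With these placeholder constants the CPU wins below roughly 140 tokens and the GPU wins
# beyond that, mirroring the qualitative crossover described in the FAQ.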
 
 
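Using only the numbers quoted in the changelog above, a quick comparison shows v0.19 improving mel and duration losses over v0.16, with f0 landing between v0.14 and v0.16:

# Validation losses quoted in the changelog: (mel, dur, f0) per release.
losses = {
    'v0.14': (0.262, 0.642, 1.889),
    'v0.16': (0.263, 0.646, 1.934),
    'v0.19': (0.261, 0.627, 1.897),
}
baseline = losses['v0.16']
for name, vals in losses.items():
    deltas = ', '.join(f'{v - b:+.3f}' for v, b in zip(vals, baseline))
    print(f'{name}: mel/dur/f0 = {vals}  (vs v0.16: {deltas})')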