Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -308,6 +308,97 @@ with gr.Blocks() as app_podcast:
|
|
308 |
],
|
309 |
outputs=podcast_output
|
310 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
# ========== REST OF ORIGINAL CODE (UNCHANGED BELOW) ========== #
|
313 |
# [Main app configuration, other tabs (TTS, Multistyle, Chat, Credits)]
|
|
|
308 |
],
|
309 |
outputs=podcast_output
|
310 |
)
|
311 |
+
# Credits tab: a single static Markdown panel listing contributors.
with gr.Blocks() as app_credits:
    gr.Markdown(
        """
# Credits

* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
"""
    )
|
319 |
+
|
320 |
+
# "Batched TTS" tab: widgets for reference audio / target text, a set of
# advanced synthesis controls, and the click handler that calls `infer`.
# NOTE(review): nesting below was reconstructed from a whitespace-stripped
# view of the file -- confirm which widgets belong inside the accordion.
with gr.Blocks() as app_tts:
    gr.Markdown("# Batched TTS")

    # --- primary inputs -------------------------------------------------
    ref_audio_input = gr.Audio(type="filepath", label="Reference Audio")
    gen_text_input = gr.Textbox(lines=10, label="Text to Generate")
    generate_btn = gr.Button("Synthesize", variant="primary")

    # --- optional tuning controls ---------------------------------------
    with gr.Accordion("Advanced Settings", open=False):
        ref_text_input = gr.Textbox(
            lines=2,
            label="Reference Text",
            info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
        )
        remove_silence = gr.Checkbox(
            value=False,
            label="Remove Silences",
            info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
        )
        speed_slider = gr.Slider(
            minimum=0.3,
            maximum=2.0,
            step=0.1,
            value=1.0,
            label="Speed",
            info="Adjust the speed of the audio.",
        )
        nfe_slider = gr.Slider(
            minimum=4,
            maximum=64,
            step=2,
            value=32,
            label="NFE Steps",
            info="Set the number of denoising steps.",
        )
        cross_fade_duration_slider = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            step=0.01,
            value=0.15,
            label="Cross-Fade Duration (s)",
            info="Set the duration of the cross-fade between audio clips.",
        )

    # --- outputs --------------------------------------------------------
    audio_output = gr.Audio(label="Synthesized Audio")
    spectrogram_output = gr.Image(label="Spectrogram")

    @gpu_decorator
    def basic_tts(
        ref_audio_input,
        ref_text_input,
        gen_text_input,
        remove_silence,
        cross_fade_duration_slider,
        nfe_slider,
        speed_slider,
    ):
        """Call ``infer`` with the submitted widget values and relay its results.

        The positional parameters follow the order of the ``inputs`` list
        passed to ``generate_btn.click`` below. The model argument is taken
        from ``tts_model_choice``, a name defined outside this block.

        Returns:
            The three values unpacked from ``infer``'s result, in order; they
            are routed to ``audio_output``, ``spectrogram_output`` and
            ``ref_text_input`` by the ``outputs`` list below.
        """
        synthesized, spectrogram, transcript = infer(
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            tts_model_choice,
            remove_silence,
            cross_fade_duration=cross_fade_duration_slider,
            nfe_step=nfe_slider,
            speed=speed_slider,
        )
        return synthesized, spectrogram, transcript

    # The third output writes back into the reference-text box.
    generate_btn.click(
        basic_tts,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            remove_silence,
            cross_fade_duration_slider,
            nfe_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output, ref_text_input],
    )
|
399 |
+
# Multi-style tab. Only the heading is defined here.
# NOTE(review): the comment inside the block is a placeholder -- the actual
# multistyle widgets and handlers are not present in this view; confirm they
# were not dropped from the file.
with gr.Blocks() as app_multistyle:
    gr.Markdown(value="# Multiple Speech-Type Generation")
    # ... [Keep original multistyle interface unchanged] ...
|
402 |
|
403 |
# ========== REST OF ORIGINAL CODE (UNCHANGED BELOW) ========== #
|
404 |
# [Main app configuration, other tabs (TTS, Multistyle, Chat, Credits)]
|