eBlessings committed on
Commit
e10864f
·
verified ·
1 Parent(s): a3013f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -2
app.py CHANGED
@@ -13,6 +13,8 @@ import numpy as np
13
  import soundfile as sf
14
  import torchaudio
15
  from cached_path import cached_path
 
 
16
  from transformers import AutoModelForCausalLM, AutoTokenizer
17
 
18
  try:
@@ -534,7 +536,68 @@ with gr.Blocks() as app_multistyle:
534
  inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
535
  outputs=generate_multistyle_btn,
536
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  with gr.Blocks() as app_chat:
540
  gr.Markdown(
@@ -852,8 +915,8 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
852
  )
853
 
854
  gr.TabbedInterface(
855
- [app_tts, app_multistyle, app_chat, app_credits],
856
- ["Basic-TTS", "Multi-Speech", "Voice-Chat", "Credits"],
857
  )
858
 
859
 
 
13
  import soundfile as sf
14
  import torchaudio
15
  from cached_path import cached_path
16
+ from pydub import AudioSegment, silence
17
+ import re
18
  from transformers import AutoModelForCausalLM, AutoTokenizer
19
 
20
  try:
 
536
  inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
537
  outputs=generate_multistyle_btn,
538
  )
539
with gr.Blocks() as app_podcast:
    # Two-speaker podcast tab: each speaker supplies a reference voice clip
    # plus its transcript; a script tagged with "<Name>:" markers is then
    # synthesized turn by turn and concatenated into one audio track.
    gr.Markdown("# Podcast Generation")
    speaker1_name = gr.Textbox(label="Speaker 1 Name")
    ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
    ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)

    speaker2_name = gr.Textbox(label="Speaker 2 Name")
    ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
    ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)

    script_input = gr.Textbox(label="Podcast Script", lines=10,
                              placeholder="Enter script with speaker names...")

    podcast_model_choice = gr.Radio(
        choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
    )
    podcast_remove_silence = gr.Checkbox(label="Remove Silences", value=True)
    generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
    podcast_output = gr.Audio(label="Generated Podcast")

    def generate_podcast(script, speaker1, ref_audio1, ref_text1,
                         speaker2, ref_audio2, ref_text2, model, remove_silence):
        """Synthesize a two-speaker podcast from a "Name: line" script.

        Splits *script* on "<speaker1>:" / "<speaker2>:" markers, runs each
        turn through ``infer`` with the matching reference voice, and returns
        ``(sample_rate, audio_ndarray)`` for the ``gr.Audio`` output.
        """
        # BUG FIX: the original pattern f"({...}:|({...}:)" had an unbalanced
        # "(" and made re.split raise re.error ("missing ), unterminated
        # subpattern") on every call.  A single capturing group keeps the
        # matched "Name:" tags in the split result.
        pattern = f"({re.escape(speaker1)}:|{re.escape(speaker2)}:)"
        # [1:] drops any preamble text before the first speaker tag.
        speaker_blocks = re.split(pattern, script)[1:]
        generated_audio_segments = []

        # Tokens come back as [tag, text, tag, text, ...]; stop at len-1 so a
        # trailing tag with no following text cannot raise IndexError.
        for i in range(0, len(speaker_blocks) - 1, 2):
            speaker = speaker_blocks[i].strip(":").strip()
            text = speaker_blocks[i + 1].strip()
            if not text:
                continue  # skip empty turns rather than synthesizing nothing

            # Any tag that is not speaker1 falls back to speaker2's voice.
            ref_audio = ref_audio1 if speaker == speaker1 else ref_audio2
            ref_text = ref_text1 if speaker == speaker1 else ref_text2

            audio_result, _, _ = infer(
                ref_audio,
                ref_text,
                text,
                model,
                remove_silence,
                cross_fade_duration=0.15,
                nfe_step=32,
                speed=1.0,
            )
            sr, audio_data = audio_result
            generated_audio_segments.append(audio_data)

        if not generated_audio_segments:
            # np.concatenate([]) raises an opaque ValueError; fail clearly.
            raise gr.Error("No speaker turns found in the script.")

        final_audio = np.concatenate(generated_audio_segments)
        # NOTE(review): assumes every infer() segment is already at the
        # module-level target_sample_rate — confirm infer resamples its output.
        return (target_sample_rate, final_audio)

    generate_podcast_btn.click(
        generate_podcast,
        inputs=[
            script_input,
            speaker1_name,
            ref_audio_input1,
            ref_text_input1,
            speaker2_name,
            ref_audio_input2,
            ref_text_input2,
            podcast_model_choice,
            podcast_remove_silence,
        ],
        outputs=podcast_output,
    )
601
 
602
  with gr.Blocks() as app_chat:
603
  gr.Markdown(
 
915
  )
916
 
917
# Top-level tab layout; the Podcast tab sits between Multi-Speech and Voice-Chat.
gr.TabbedInterface(
    [app_tts, app_multistyle, app_podcast, app_chat, app_credits],
    ["Basic-TTS", "Multi-Speech", "Podcast", "Voice-Chat", "Credits"],
)
921
 
922