Spaces:

mrfakename
/

E2-F5-TTS

Running on Zero

App Files Files Community

mrfakename committed on Jul 26

Commit

1cbd297

verified ·

1 Parent(s): b7a138c

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -206

app.py CHANGED Viewed

@@ -1,209 +1,6 @@
-# ruff: noqa: E402
-import os
-import json
-import tempfile
-from functools import lru_cache
-from importlib.resources import files
 import gradio as gr
-import numpy as np
-import soundfile as sf
-import torch
-import torchaudio
-from cached_path import cached_path
-import spaces
-from f5_tts.infer.utils_infer import (
-    infer_process,
-    load_model,
-    load_vocoder,
-    preprocess_ref_audio_text,
-    remove_silence_for_generated_wav,
-    save_spectrogram,
-    tempfile_kwargs,
-)
-from f5_tts.model import DiT, UNetT
-DEFAULT_TTS_MODEL = "F5-TTS_v1"
-DEFAULT_TTS_MODEL_CFG = [
-    "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
-    "hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt",
-    json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
-]
-# Load vocoder and models on module load
-vocoder = load_vocoder()
-model_cache = {}
-model_cache[DEFAULT_TTS_MODEL] = load_model(
-    DiT,
-    json.loads(DEFAULT_TTS_MODEL_CFG[2]),
-    str(cached_path(DEFAULT_TTS_MODEL_CFG[0]))
-)
-model_cache["E2-TTS"] = load_model(
-    UNetT,
-    dict(dim=1024, depth=24, heads=16, ff_mult=4, text_mask_padding=False, pe_attn_head=1),
-    str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
-)
-custom_ema_model, pre_custom_path = None, ""
-tts_model_choice = DEFAULT_TTS_MODEL
-def gpu_decorator(fn):
-    return spaces.GPU(fn)
-with gr.Blocks() as app:
-    gr.Markdown("# ZeroGPU TTS - F5/E2 Demo")
-    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
-    gen_text_input = gr.Textbox(label="Text to Generate", lines=4)
-    gen_text_file = gr.File(label="Upload Text File", file_types=[".txt"])
-    ref_text_input = gr.Textbox(label="Reference Text (optional)", lines=2)
-    ref_text_file = gr.File(label="Upload Reference Text", file_types=[".txt"])
-    remove_silence = gr.Checkbox(label="Remove Silences", value=False)
-    randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
-    seed_input = gr.Number(value=0, precision=0, label="Seed")
-    cross_fade_duration_slider = gr.Slider(label="Cross-Fade Duration", minimum=0.0, maximum=1.0, value=0.15)
-    nfe_slider = gr.Slider(label="NFE Steps", minimum=4, maximum=64, value=32, step=2)
-    speed_slider = gr.Slider(label="Speed", minimum=0.3, maximum=2.0, value=1.0, step=0.1)
-    generate_btn = gr.Button("Generate")
-    audio_output = gr.Audio(label="Synthesized Audio")
-    spectrogram_output = gr.Image(label="Spectrogram")
-    @gpu_decorator
-    def infer(
-        ref_audio_orig,
-        ref_text,
-        gen_text,
-        model,
-        remove_silence,
-        seed,
-        cross_fade_duration=0.15,
-        nfe_step=32,
-        speed=1,
-        show_info=gr.Info,
-    ):
-        if not ref_audio_orig:
-            gr.Warning("Please provide reference audio.")
-            return gr.update(), gr.update(), ref_text
-        if seed < 0 or seed > 2**31 - 1:
-            gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
-            seed = np.random.randint(0, 2**31 - 1)
-        torch.manual_seed(seed)
-        used_seed = seed
-        if not gen_text.strip():
-            gr.Warning("Please enter text to generate or upload a text file.")
-            return gr.update(), gr.update(), ref_text
-        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
-        if isinstance(model, tuple) and model[0] == "Custom":
-            global custom_ema_model, pre_custom_path
-            if pre_custom_path != model[1]:
-                show_info("Loading Custom TTS model...")
-                custom_ema_model = load_model(
-                    DiT,
-                    json.loads(model[3]),
-                    str(cached_path(model[1])),
-                    vocab_file=str(cached_path(model[2]))
-                )
-                pre_custom_path = model[1]
-            ema_model = custom_ema_model
-        else:
-            ema_model = model_cache.get(model, model_cache[DEFAULT_TTS_MODEL])
-        final_wave, final_sample_rate, combined_spectrogram = infer_process(
-            ref_audio,
-            ref_text,
-            gen_text,
-            ema_model,
-            vocoder,
-            cross_fade_duration=cross_fade_duration,
-            nfe_step=nfe_step,
-            speed=speed,
-            show_info=show_info,
-            progress=gr.Progress(),
-        )
-        if remove_silence:
-            with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
-                temp_path = f.name
-            try:
-                sf.write(temp_path, final_wave, final_sample_rate)
-                remove_silence_for_generated_wav(f.name)
-                final_wave, _ = torchaudio.load(f.name)
-            finally:
-                os.unlink(temp_path)
-            final_wave = final_wave.squeeze().cpu().numpy()
-        with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
-            spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(combined_spectrogram, spectrogram_path)
-        return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
-    @gpu_decorator
-    def load_text_from_file(file):
-        if file:
-            with open(file, "r", encoding="utf-8") as f:
-                text = f.read().strip()
-        else:
-            text = ""
-        return gr.update(value=text)
-    @gpu_decorator
-    def basic_tts(
-        ref_audio_input,
-        ref_text_input,
-        gen_text_input,
-        remove_silence,
-        randomize_seed,
-        seed_input,
-        cross_fade_duration_slider,
-        nfe_slider,
-        speed_slider,
-    ):
-        if randomize_seed:
-            seed_input = np.random.randint(0, 2**31 - 1)
-        audio_out, spectrogram_path, ref_text_out, used_seed = infer(
-            ref_audio_input,
-            ref_text_input,
-            gen_text_input,
-            tts_model_choice,
-            remove_silence,
-            seed=seed_input,
-            cross_fade_duration=cross_fade_duration_slider,
-            nfe_step=nfe_slider,
-            speed=speed_slider,
-        )
-        return audio_out, spectrogram_path, ref_text_out, used_seed
-    gen_text_file.upload(load_text_from_file, inputs=[gen_text_file], outputs=[gen_text_input])
-    ref_text_file.upload(load_text_from_file, inputs=[ref_text_file], outputs=[ref_text_input])
-    ref_audio_input.clear(lambda: [None, None], None, [ref_text_input, ref_text_file])
-    generate_btn.click(
-        basic_tts,
-        inputs=[
-            ref_audio_input,
-            ref_text_input,
-            gen_text_input,
-            remove_silence,
-            randomize_seed,
-            seed_input,
-            cross_fade_duration_slider,
-            nfe_slider,
-            speed_slider,
-        ],
-        outputs=[audio_output, spectrogram_output, ref_text_input, seed_input],
-    )
-if __name__ == "__main__":
-    app.queue().launch()

 import gradio as gr
+with gr.Blocks() as demo:
+    gr.Markdown("Hi everyone, due to breaking changes with ZeroGPU/Xet-storage spaces, this space is temporarily down. I hope to find a solution to this soon, so please stay tuned. Sorry for the inconvenience. In the mean time, please check out: https://huggingface.co/spaces/mrfakename/MegaTTS3-Voice-Cloning https://huggingface.co/spaces/styletts2/styletts2 if you need TTS spaces.")
+demo.launch()