Steveeeeeeen's picture
Steveeeeeeen HF Staff
Update app.py
52a7bf3 verified
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import warnings
import spaces
warnings.filterwarnings("ignore")
# The TaDiCodec inference code is treated as optional: if it cannot be imported
# the app still starts, and MODEL_AVAILABLE tells the rest of the module to run
# in demo mode instead.
try:
    from models.tts.llm_tts.inference_llm_tts import TTSInferencePipeline
except ImportError as import_error:
    # Keep the name bound so code that references it never hits a NameError.
    TTSInferencePipeline = None
    MODEL_AVAILABLE = False
    print("Warning: TaDiCodec models not found. Running in demo mode.")
    # An ImportError can also come from a missing dependency of the module
    # itself, so show the real cause rather than only the generic warning.
    print(f"Import error: {import_error}")
else:
    MODEL_AVAILABLE = True
class TaDiCodecTTSDemo:
    """Holds the TaDiCodec TTS pipeline and turns text into a WAV file.

    The pipeline is loaded once when the object is constructed. If loading is
    not possible, ``pipeline`` stays ``None`` and synthesis requests return an
    explanatory message instead of audio.
    """

    # Sample rate handed to soundfile when writing the generated audio.
    SAMPLE_RATE = 24000

    def __init__(self):
        """Pick a torch device (CUDA when available, else CPU) and load the model."""
        self.pipeline = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Try to build the inference pipeline; never raises.

        On any failure (or when the model code is not importable) the error is
        printed and ``self.pipeline`` is left as ``None``.
        """
        try:
            if MODEL_AVAILABLE:
                print("Loading TaDiCodec-TTS-AR-Qwen2.5-0.5B model...")
                self.pipeline = TTSInferencePipeline.from_pretrained(
                    tadicodec_path="amphion/TaDiCodec",
                    llm_path="amphion/TaDiCodec-TTS-AR-Qwen2.5-0.5B",
                    device=self.device,
                )
                print("Model loaded successfully!")
            else:
                print("Running in demo mode - model files not available")
                self.pipeline = None
        except Exception as e:
            # Deliberate best-effort: the UI must still start without a model.
            print(f"Error loading model: {e}")
            self.pipeline = None

    def synthesize_speech(self, text, reference_audio=None, reference_text=""):
        """Synthesize speech from text using TaDiCodec TTS.

        Args:
            text: Text to speak. ``None``, empty, or whitespace-only input is
                rejected with a message.
            reference_audio: Optional path to a reference recording for voice
                cloning. Only used together with a non-blank ``reference_text``.
            reference_text: Transcript of ``reference_audio``. ``None`` is
                treated like an empty string.

        Returns:
            A ``(audio_path, message)`` tuple. ``audio_path`` is the path of a
            newly written ``.wav`` file, or ``None`` when nothing was generated.
            Errors are reported through ``message``; this method does not raise.
        """
        if text is None or not text.strip():
            return None, "Please enter some text to synthesize."

        if self.pipeline is None:
            # Fallback demo mode - no audio can be generated without the model.
            return None, "Demo mode - TaDiCodec model not available. Please install the required models."

        prompt_text = reference_text or ""
        tmp_path = None
        try:
            if reference_audio and prompt_text.strip():
                audio = self.pipeline(
                    text=text,
                    prompt_text=prompt_text,
                    prompt_speech_path=reference_audio,
                )
            else:
                audio = self.pipeline(text=text)

            # Reserve a unique path, then close the handle before soundfile
            # reopens the file by name (reopening an open temp file fails on
            # Windows).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                tmp_path = tmp_file.name
            sf.write(tmp_path, audio, self.SAMPLE_RATE)
            return tmp_path, "Speech synthesized successfully!"
        except Exception as e:
            # Do not leave an empty or partial temp file behind on failure.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            return None, f"Error during synthesis: {e}"
# Initialize the demo. Constructing the instance calls load_model() right away,
# so the model load is attempted as soon as this module is imported.
demo_instance = TaDiCodecTTSDemo()
def tts_interface(text, reference_audio, reference_text):
    """Gradio callback: forward the form values to the shared demo instance.

    Returns the ``(audio_path, message)`` pair produced by
    ``demo_instance.synthesize_speech``.
    """
    # NOTE(review): `spaces` is imported at the top of the file but no
    # @spaces.GPU decorator is applied here - confirm whether this Space
    # targets ZeroGPU hardware.
    result = demo_instance.synthesize_speech(
        text,
        reference_audio,
        reference_text,
    )
    generated_path, status = result
    return generated_path, status
# Create Gradio interface: inputs on the left, generated audio and status on
# the right, clickable examples underneath.
with gr.Blocks(title="TaDiCodec-TTS Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# TaDiCodec-TTS-AR-Qwen2.5-0.5B Demo
This is a demo of the TaDiCodec Text-to-Speech model with Qwen2.5-0.5B backbone.
**Features:**
- Voice cloning with reference audio
- Code-switching support (e.g., mixing English and Chinese)
- Extremely low bitrate (0.0875 kbps)
- High-quality speech generation
""")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="但是 to those who 知道 her well, it was a 标志 of her unwavering 决心 and spirit."
            )
            reference_text = gr.Textbox(
                label="Reference Text",
                placeholder="Text corresponding to the reference audio for voice cloning...",
                lines=2,
                value="In short, we embarked on a mission to make America great again, for all Americans."
            )
            reference_audio = gr.Audio(
                label="Reference Audio",
                type="filepath",
            )
            synthesize_btn = gr.Button("Synthesize Speech", variant="primary")
        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )

    # One shared component order for every caller of tts_interface, matching
    # its signature (text, reference_audio, reference_text). Previously the
    # examples listed the reference text before the audio, so running or
    # caching an example passed the two arguments swapped.
    tts_inputs = [text_input, reference_audio, reference_text]
    tts_outputs = [output_audio, status_message]

    # Example inputs
    gr.Markdown("### Example Inputs")
    # Each row follows the order of tts_inputs: text, reference audio path,
    # reference text.
    examples = [
        [
            "Yes, usually people choose to face life with more positive emotions, after all, happy times are always yearning. However, sometimes slowing down and experiencing the details of life can bring deeper joy and satisfaction. What do you think?",
            "sample/tongueTwisters_en_018.wav",
            "Jittery Jack's jam jars jiggled jauntily, jolting Jack's jumbled jelly-filled jars joyously.",
        ],
        [
            "You think you can just waltz in here and cause chaos? Well, I've got news for you. This time, there's no escaping the consequences. So, one by one, step forward, and let's see who's bold enough to face the music. It's time for a little dose of reality—prepare to be dealt with!",
            "sample/en_013.wav",
            "Get in line trouble makers, and I will take care of you.",
        ],
    ]
    gr.Examples(
        examples=examples,
        inputs=tts_inputs,
        outputs=tts_outputs,
        fn=tts_interface
    )

    # Connect the interface
    synthesize_btn.click(
        fn=tts_interface,
        inputs=tts_inputs,
        outputs=tts_outputs
    )
if __name__ == "__main__":
    # Start the web server only when run as a script: bind to 0.0.0.0 and
    # request a share link.
    demo.launch(
        server_name="0.0.0.0",
        share=True,
    )