# Spaces: Running on T4
import os
import tempfile
import warnings

import gradio as gr
import soundfile as sf
import spaces
import torch
# Silence all warnings for the lifetime of the process.
warnings.filterwarnings("ignore")

# The TTS pipeline lives in a project-local package; when it cannot be
# imported the app still starts and reports "demo mode" instead of crashing.
try:
    from models.tts.llm_tts.inference_llm_tts import TTSInferencePipeline
except ImportError:
    print("Warning: TaDiCodec models not found. Running in demo mode.")
    MODEL_AVAILABLE = False
else:
    MODEL_AVAILABLE = True
class TaDiCodecTTSDemo:
    """Hold the TaDiCodec TTS pipeline and expose a synthesis entry point.

    The pipeline is loaded once at construction time. When loading is not
    possible (package missing or loading raised), ``pipeline`` stays ``None``
    and ``synthesize_speech`` reports demo mode instead of producing audio.
    """

    # Sample rate handed to ``sf.write`` when saving the generated audio.
    SAMPLE_RATE = 24000

    def __init__(self):
        self.pipeline = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Load the pretrained pipeline into ``self.pipeline``.

        Never raises: any failure is printed and leaves ``self.pipeline`` as
        ``None`` so the UI can still start.
        """
        if not MODEL_AVAILABLE:
            print("Running in demo mode - model files not available")
            self.pipeline = None
            return

        try:
            print("Loading TaDiCodec-TTS-AR-Qwen2.5-0.5B model...")
            self.pipeline = TTSInferencePipeline.from_pretrained(
                tadicodec_path="amphion/TaDiCodec",
                llm_path="amphion/TaDiCodec-TTS-AR-Qwen2.5-0.5B",
                device=self.device,
            )
            print("Model loaded successfully!")
        except Exception as e:
            # Deliberate best-effort: a failed load must not stop the app.
            print(f"Error loading model: {e}")
            self.pipeline = None

    def synthesize_speech(self, text, reference_audio=None, reference_text=""):
        """Synthesize speech for ``text`` and save it to a temporary WAV file.

        Args:
            text: Text to speak. ``None``, empty, or whitespace-only input is
                rejected with a status message.
            reference_audio: Optional path to a reference recording. It is
                only used when ``reference_text`` is also non-blank.
            reference_text: Optional text matching ``reference_audio``.
                ``None`` is treated like an empty string.

        Returns:
            A ``(path, message)`` tuple. ``path`` is the generated WAV file,
            or ``None`` when nothing was produced; ``message`` is a
            human-readable status. Errors are reported through ``message``
            rather than raised.
        """
        if not text or not text.strip():
            return None, "Please enter some text to synthesize."

        if self.pipeline is None:
            # Fallback demo mode - no audio can be generated.
            return None, "Demo mode - TaDiCodec model not available. Please install the required models."

        reference_text = reference_text or ""
        output_path = None
        try:
            if reference_audio and reference_text.strip():
                audio = self.pipeline(
                    text=text,
                    prompt_text=reference_text,
                    prompt_speech_path=reference_audio,
                )
            else:
                audio = self.pipeline(text=text)

            # Reserve a unique file name, then close the handle before
            # soundfile reopens the path (an open handle blocks that on
            # some platforms).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                output_path = tmp_file.name
            sf.write(output_path, audio, self.SAMPLE_RATE)
        except Exception as e:
            # delete=False means nobody else removes a half-written file.
            if output_path is not None and os.path.exists(output_path):
                os.remove(output_path)
            return None, f"Error during synthesis: {str(e)}"

        return output_path, "Speech synthesized successfully!"
# Initialize the demo once at import time; the constructor attempts to load
# the model, so every request shares the same pipeline.
demo_instance = TaDiCodecTTSDemo()
def tts_interface(text, reference_audio, reference_text):
    """Gradio callback: forward the inputs to the shared demo instance.

    Returns the ``(audio_path, message)`` pair produced by
    ``demo_instance.synthesize_speech``; the argument order here is the order
    Gradio input lists must follow.
    """
    return demo_instance.synthesize_speech(text, reference_audio, reference_text)
# Create Gradio interface
# NOTE(review): the Row/Column nesting below was reconstructed from a
# flattened copy of this file - confirm it against the intended layout.
with gr.Blocks(title="TaDiCodec-TTS Demo", theme=gr.themes.Soft()) as demo:
    # The literal is kept flush-left so no leading whitespace ends up in the
    # markdown text.
    gr.Markdown("""
# TaDiCodec-TTS-AR-Qwen2.5-0.5B Demo
This is a demo of the TaDiCodec Text-to-Speech model with Qwen2.5-0.5B backbone.
**Features:**
- Voice cloning with reference audio
- Code-switching support (e.g., mixing English and Chinese)
- Extremely low bitrate (0.0875 kbps)
- High-quality speech generation
""")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="但是 to those who 知道 her well, it was a 标志 of her unwavering 决心 and spirit.",
            )
            reference_text = gr.Textbox(
                label="Reference Text",
                placeholder="Text corresponding to the reference audio for voice cloning...",
                lines=2,
                value="In short, we embarked on a mission to make America great again, for all Americans.",
            )
            reference_audio = gr.Audio(
                label="Reference Audio",
                type="filepath",
            )
            synthesize_btn = gr.Button("Synthesize Speech", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )
            status_message = gr.Textbox(
                label="Status",
                interactive=False,
            )

    # Example inputs
    gr.Markdown("### Example Inputs")

    # Each row follows tts_interface's parameter order:
    # (text, reference_audio, reference_text).
    examples = [
        [
            "Yes, usually people choose to face life with more positive emotions, after all, happy times are always yearning. However, sometimes slowing down and experiencing the details of life can bring deeper joy and satisfaction. What do you think?",
            "sample/tongueTwisters_en_018.wav",
            "Jittery Jack's jam jars jiggled jauntily, jolting Jack's jumbled jelly-filled jars joyously.",
        ],
        [
            "You think you can just waltz in here and cause chaos? Well, I've got news for you. This time, there's no escaping the consequences. So, one by one, step forward, and let's see who's bold enough to face the music. It's time for a little dose of reality—prepare to be dealt with!",
            "sample/en_013.wav",
            "Get in line trouble makers, and I will take care of you.",
        ],
    ]

    # Gradio passes example values to `fn` positionally in `inputs` order, so
    # this list must match tts_interface's signature; otherwise the reference
    # text and the audio path are swapped whenever the examples are executed
    # (e.g. when example caching is enabled).
    gr.Examples(
        examples=examples,
        inputs=[text_input, reference_audio, reference_text],
        outputs=[output_audio, status_message],
        fn=tts_interface,
    )

    # Connect the interface
    synthesize_btn.click(
        fn=tts_interface,
        inputs=[text_input, reference_audio, reference_text],
        outputs=[output_audio, status_message],
    )
if __name__ == "__main__":
    # Bind to all interfaces and request a public share link.
    demo.launch(share=True, server_name="0.0.0.0")