Spaces:
Running
on
T4
Running
on
T4
File size: 5,871 Bytes
ad798d2 996c06e ad798d2 52a7bf3 ad798d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import warnings
import spaces
# NOTE: this silences every warning category for the rest of the process.
warnings.filterwarnings("ignore")

# The TaDiCodec inference code is optional: when it cannot be imported the app
# still starts, and MODEL_AVAILABLE tells the rest of the module to run in
# demo mode instead of loading a pipeline.
try:
    from models.tts.llm_tts.inference_llm_tts import TTSInferencePipeline
except ImportError as import_error:
    print("Warning: TaDiCodec models not found. Running in demo mode.")
    # A missing transitive dependency raises ImportError as well; print the
    # underlying message so that case can be told apart from a missing checkout.
    print(f"Import error detail: {import_error}")
    MODEL_AVAILABLE = False
else:
    MODEL_AVAILABLE = True
class TaDiCodecTTSDemo:
    """Wrap the TaDiCodec TTS pipeline for use from UI callbacks.

    The pipeline is loaded once, at construction time. When it cannot be
    loaded, ``pipeline`` stays ``None`` and ``synthesize_speech`` returns a
    "demo mode" status message instead of audio.
    """

    # Sample rate (Hz) handed to soundfile when writing the generated audio.
    # NOTE(review): assumed to match the pipeline's output rate - confirm.
    SAMPLE_RATE = 24000

    def __init__(self):
        self.pipeline = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Load the pretrained pipeline, leaving ``pipeline`` as None on failure.

        Never raises: any loading error is printed and the instance falls
        back to demo mode.
        """
        self.pipeline = None
        try:
            if not MODEL_AVAILABLE:
                print("Running in demo mode - model files not available")
                return
            print("Loading TaDiCodec-TTS-AR-Qwen2.5-0.5B model...")
            self.pipeline = TTSInferencePipeline.from_pretrained(
                tadicodec_path="amphion/TaDiCodec",
                llm_path="amphion/TaDiCodec-TTS-AR-Qwen2.5-0.5B",
                device=self.device,
            )
            print("Model loaded successfully!")
        except Exception as e:
            # Top-level boundary: the app must still start without a model.
            print(f"Error loading model: {e}")
            self.pipeline = None

    def _write_wav(self, audio):
        """Write ``audio`` to a new temporary ``.wav`` file and return its path."""
        # delete=False because the file has to outlive this call: the caller
        # receives the path, not the data.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        try:
            sf.write(output_path, audio, self.SAMPLE_RATE)
        except Exception:
            # Do not leave an empty or partial file behind when writing fails.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise
        return output_path

    def synthesize_speech(self, text, reference_audio=None, reference_text=""):
        """Synthesize speech from text using TaDiCodec TTS.

        Args:
            text: Text to speak. ``None``, empty, or whitespace-only text is
                rejected with a status message.
            reference_audio: Optional path to a reference recording.
            reference_text: Optional transcript of ``reference_audio``. The
                reference is used only when both the audio and a non-blank
                transcript are supplied; otherwise plain synthesis is run.

        Returns:
            A ``(wav_path, status_message)`` tuple. ``wav_path`` is ``None``
            whenever no audio was produced (invalid input, demo mode, error).
        """
        # None is treated like empty text so validation itself never raises.
        if not text or not text.strip():
            return None, "Please enter some text to synthesize."

        try:
            if self.pipeline is None:
                # Fallback demo mode - no audio is generated.
                return None, "Demo mode - TaDiCodec model not available. Please install the required models."

            has_reference = bool(reference_audio) and bool((reference_text or "").strip())
            if has_reference:
                audio = self.pipeline(
                    text=text,
                    prompt_text=reference_text,
                    prompt_speech_path=reference_audio,
                )
            else:
                audio = self.pipeline(text=text)

            return self._write_wav(audio), "Speech synthesized successfully!"
        except Exception as e:
            # Report the failure as a status message rather than propagating it.
            return None, f"Error during synthesis: {str(e)}"
# Initialize the demo: one module-level instance used by the Gradio callback.
# Constructing it calls load_model(), so model loading happens at import time.
demo_instance = TaDiCodecTTSDemo()
def tts_interface(text, reference_audio, reference_text):
    """Gradio callback that forwards the UI values to the shared demo instance.

    Returns the ``(audio_path, message)`` pair produced by
    ``demo_instance.synthesize_speech``.
    """
    synthesis_result = demo_instance.synthesize_speech(
        text, reference_audio, reference_text
    )
    generated_path, status = synthesis_result
    return generated_path, status
# Create Gradio interface
with gr.Blocks(title="TaDiCodec-TTS Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # TaDiCodec-TTS-AR-Qwen2.5-0.5B Demo

    This is a demo of the TaDiCodec Text-to-Speech model with Qwen2.5-0.5B backbone.

    **Features:**
    - Voice cloning with reference audio
    - Code-switching support (e.g., mixing English and Chinese)
    - Extremely low bitrate (0.0875 kbps)
    - High-quality speech generation
    """)

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="但是 to those who 知道 her well, it was a 标志 of her unwavering 决心 and spirit."
            )
            reference_text = gr.Textbox(
                label="Reference Text",
                placeholder="Text corresponding to the reference audio for voice cloning...",
                lines=2,
                value="In short, we embarked on a mission to make America great again, for all Americans."
            )
            reference_audio = gr.Audio(
                label="Reference Audio",
                type="filepath",
            )
            synthesize_btn = gr.Button("Synthesize Speech", variant="primary")

        # Right column: the generated audio and a status line.
        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Example inputs
    gr.Markdown("### Example Inputs")

    # Each row is [text, reference audio path, reference text]. The column
    # order deliberately mirrors tts_interface(text, reference_audio,
    # reference_text): gr.Examples passes `inputs` to `fn` positionally when
    # examples are run or cached, so a different order would hand the audio
    # path to the reference-text parameter and vice versa.
    examples = [
        ["Yes, usually people choose to face life with more positive emotions, after all, happy times are always yearning. However, sometimes slowing down and experiencing the details of life can bring deeper joy and satisfaction. What do you think?", "sample/tongueTwisters_en_018.wav", "Jittery Jack's jam jars jiggled jauntily, jolting Jack's jumbled jelly-filled jars joyously."],
        ["You think you can just waltz in here and cause chaos? Well, I've got news for you. This time, there's no escaping the consequences. So, one by one, step forward, and let's see who's bold enough to face the music. It's time for a little dose of reality—prepare to be dealt with!", "sample/en_013.wav", "Get in line trouble makers, and I will take care of you."],
    ]
    gr.Examples(
        examples=examples,
        inputs=[text_input, reference_audio, reference_text],
        outputs=[output_audio, status_message],
        fn=tts_interface
    )

    # Connect the interface
    synthesize_btn.click(
        fn=tts_interface,
        inputs=[text_input, reference_audio, reference_text],
        outputs=[output_audio, status_message]
    )
if __name__ == "__main__":
    # server_name="0.0.0.0" listens on all network interfaces; share=True
    # additionally requests a public Gradio share link.
    demo.launch(share=True, server_name="0.0.0.0")