Spaces:

Steveeeeeeen
/

TaDiCodec-TTS-AR-Qwen2.5-0.5B

Running on T4

App Files Files Community

TaDiCodec-TTS-AR-Qwen2.5-0.5B / app.py

Steveeeeeeen HF Staff

Update app.py

52a7bf3 verified 12 days ago

raw

history blame contribute delete

5.87 kB

	import gradio as gr
	import torch
	import soundfile as sf
	import tempfile
	import os
	import warnings
	import spaces
	warnings.filterwarnings("ignore")

	# Optional dependency: the TaDiCodec inference code may be absent from the
	# environment. Assume it is present and flip the flag if the import fails,
	# so the rest of the module can fall back to demo mode.
	MODEL_AVAILABLE = True
	try:
	    from models.tts.llm_tts.inference_llm_tts import TTSInferencePipeline
	except ImportError:
	    MODEL_AVAILABLE = False
	    print("Warning: TaDiCodec models not found. Running in demo mode.")

	class TaDiCodecTTSDemo:
	    """Thin wrapper around the TaDiCodec TTS pipeline used by the demo UI.

	    The pipeline is loaded once at construction time. If the model code is
	    not importable or loading fails, ``self.pipeline`` stays ``None`` and
	    ``synthesize_speech`` answers with a demo-mode message instead of audio.
	    """

	    # Sample rate (Hz) passed to soundfile when writing generated audio.
	    OUTPUT_SAMPLE_RATE = 24000

	    def __init__(self):
	        self.pipeline = None
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.load_model()

	    def load_model(self):
	        """Load the TTS pipeline into ``self.pipeline``.

	        Never raises: any failure is printed and leaves ``self.pipeline``
	        set to ``None`` so the UI can still start.
	        """
	        try:
	            if MODEL_AVAILABLE:
	                print("Loading TaDiCodec-TTS-AR-Qwen2.5-0.5B model...")
	                self.pipeline = TTSInferencePipeline.from_pretrained(
	                    tadicodec_path="amphion/TaDiCodec",
	                    llm_path="amphion/TaDiCodec-TTS-AR-Qwen2.5-0.5B",
	                    device=self.device,
	                )
	                print("Model loaded successfully!")
	            else:
	                print("Running in demo mode - model files not available")
	                self.pipeline = None
	        except Exception as e:
	            # Top-level boundary: keep the app alive even if loading fails.
	            print(f"Error loading model: {e}")
	            self.pipeline = None

	    def synthesize_speech(self, text, reference_audio=None, reference_text=""):
	        """Synthesize speech from text using TaDiCodec TTS.

	        Args:
	            text: Text to speak. ``None``, empty, or whitespace-only text is
	                rejected with a status message.
	            reference_audio: Optional path to a reference recording. It is
	                only used when ``reference_text`` is also non-blank.
	            reference_text: Transcript of ``reference_audio``. ``None`` is
	                treated as an empty string.

	        Returns:
	            A ``(audio_path, message)`` tuple. ``audio_path`` is the path of
	            a newly written WAV file on success and ``None`` otherwise;
	            ``message`` is a human-readable status string.
	        """
	        # Callers may pass None for cleared fields; normalise so .strip()
	        # below cannot raise AttributeError.
	        text = text or ""
	        reference_text = reference_text or ""

	        if not text.strip():
	            return None, "Please enter some text to synthesize."

	        if self.pipeline is None:
	            # Fallback demo mode - no audio can be generated.
	            return None, "Demo mode - TaDiCodec model not available. Please install the required models."

	        try:
	            if reference_audio and reference_text.strip():
	                audio = self.pipeline(
	                    text=text,
	                    prompt_text=reference_text,
	                    prompt_speech_path=reference_audio,
	                )
	            else:
	                audio = self.pipeline(text=text)

	            # Reserve a unique file name, then close the handle before
	            # soundfile reopens the path: reopening a still-open
	            # NamedTemporaryFile by name is not portable across platforms.
	            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	                output_path = tmp_file.name
	            try:
	                sf.write(output_path, audio, self.OUTPUT_SAMPLE_RATE)
	            except Exception:
	                # delete=False means nobody else cleans up; do not leave an
	                # empty or partial file behind when writing fails.
	                if os.path.exists(output_path):
	                    os.remove(output_path)
	                raise
	            return output_path, "Speech synthesized successfully!"
	        except Exception as e:
	            # UI boundary: report the failure as a status message.
	            return None, f"Error during synthesis: {e}"

	# Initialize the demo: one module-level instance shared by every request.
	# The constructor calls load_model(), so model loading is attempted here,
	# at import time, before the UI is built.
	demo_instance = TaDiCodecTTSDemo()

	def tts_interface(text, reference_audio, reference_text):
	    """Gradio callback that forwards the form values to the shared demo.

	    Returns the ``(audio_path, message)`` pair produced by
	    ``demo_instance.synthesize_speech``.
	    """
	    generated_path, status = demo_instance.synthesize_speech(
	        text,
	        reference_audio=reference_audio,
	        reference_text=reference_text,
	    )
	    return generated_path, status

	# Create Gradio interface: inputs in the left column, results in the right
	# column, then clickable examples below.
	with gr.Blocks(title="TaDiCodec-TTS Demo", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("""
	    # TaDiCodec-TTS-AR-Qwen2.5-0.5B Demo

	    This is a demo of the TaDiCodec Text-to-Speech model with Qwen2.5-0.5B backbone.

	    Features:
	    - Voice cloning with reference audio
	    - Code-switching support (e.g., mixing English and Chinese)
	    - Extremely low bitrate (0.0875 kbps)
	    - High-quality speech generation
	    """)

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Text to Synthesize",
	                placeholder="Enter the text you want to convert to speech...",
	                lines=3,
	                value="但是 to those who 知道 her well, it was a 标志 of her unwavering 决心 and spirit."
	            )

	            reference_text = gr.Textbox(
	                label="Reference Text",
	                placeholder="Text corresponding to the reference audio for voice cloning...",
	                lines=2,
	                value="In short, we embarked on a mission to make America great again, for all Americans."
	            )

	            reference_audio = gr.Audio(
	                label="Reference Audio",
	                type="filepath",
	            )

	            synthesize_btn = gr.Button("Synthesize Speech", variant="primary")

	        with gr.Column():
	            output_audio = gr.Audio(
	                label="Generated Speech",
	                type="filepath"
	            )

	            status_message = gr.Textbox(
	                label="Status",
	                interactive=False
	            )

	    # Example inputs
	    gr.Markdown("### Example Inputs")
	    # Each row is (text, reference audio path, reference text). The column
	    # order must match tts_interface's positional parameters, because
	    # Gradio calls `fn` with the example values in `inputs` order whenever
	    # examples are executed or cached.
	    examples = [
	        ["Yes, usually people choose to face life with more positive emotions, after all, happy times are always yearning. However, sometimes slowing down and experiencing the details of life can bring deeper joy and satisfaction. What do you think?", "sample/tongueTwisters_en_018.wav", "Jittery Jack's jam jars jiggled jauntily, jolting Jack's jumbled jelly-filled jars joyously."],
	        ["You think you can just waltz in here and cause chaos? Well, I've got news for you. This time, there's no escaping the consequences. So, one by one, step forward, and let's see who's bold enough to face the music. It's time for a little dose of reality—prepare to be dealt with!", "sample/en_013.wav", "Get in line trouble makers, and I will take care of you."],
	    ]

	    gr.Examples(
	        examples=examples,
	        inputs=[text_input, reference_audio, reference_text],
	        outputs=[output_audio, status_message],
	        fn=tts_interface
	    )

	    # Connect the interface: same input order as the examples above.
	    synthesize_btn.click(
	        fn=tts_interface,
	        inputs=[text_input, reference_audio, reference_text],
	        outputs=[output_audio, status_message]
	    )

	# Script entry point: start the Gradio server only when this file is run
	# directly, not when it is imported.
	if __name__ == "__main__":
	    # Per Gradio's launch() documentation, share=True requests a public
	    # share link and server_name="0.0.0.0" makes the server reachable from
	    # other machines rather than only from localhost.
	    # NOTE(review): confirm share=True is still wanted on the hosting
	    # platform, which may ignore or warn about it.
	    demo.launch(share=True, server_name="0.0.0.0")