Spaces:

Steveeeeeeen
/

TaDiCodec-TTS-AR-Qwen2.5-0.5B

Running on T4

File size: 5,871 Bytes

import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
import warnings
import spaces
warnings.filterwarnings("ignore")

# The TaDiCodec inference code is treated as an optional dependency: if the
# import fails, MODEL_AVAILABLE is set to False so the rest of the module can
# still be loaded and run without a model.
try:
    from models.tts.llm_tts.inference_llm_tts import TTSInferencePipeline
    MODEL_AVAILABLE = True
except ImportError:
    print("Warning: TaDiCodec models not found. Running in demo mode.")
    MODEL_AVAILABLE = False

class TaDiCodecTTSDemo:
    """Wrapper around the TaDiCodec TTS inference pipeline used by the demo UI.

    The pipeline is loaded once when the object is constructed. When the model
    code is not importable, or loading raises, ``self.pipeline`` stays ``None``
    and ``synthesize_speech`` returns a demo-mode message instead of audio.
    """

    # Sample rate (Hz) passed to soundfile when writing the generated waveform.
    SAMPLE_RATE = 24000

    def __init__(self):
        self.pipeline = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Load the pretrained pipeline into ``self.pipeline``.

        Never raises: any failure is printed and leaves ``self.pipeline`` as
        ``None`` so the app can still start in demo mode.
        """
        try:
            if MODEL_AVAILABLE:
                print("Loading TaDiCodec-TTS-AR-Qwen2.5-0.5B model...")
                self.pipeline = TTSInferencePipeline.from_pretrained(
                    tadicodec_path="amphion/TaDiCodec",
                    llm_path="amphion/TaDiCodec-TTS-AR-Qwen2.5-0.5B",
                    device=self.device,
                )
                print("Model loaded successfully!")
            else:
                print("Running in demo mode - model files not available")
                self.pipeline = None
        except Exception as e:
            # Deliberate best-effort: a broken model must not prevent the UI
            # from starting; synthesize_speech reports the demo-mode state.
            print(f"Error loading model: {e}")
            self.pipeline = None

    def synthesize_speech(self, text, reference_audio=None, reference_text=""):
        """Synthesize speech for ``text`` and write it to a temporary WAV file.

        Args:
            text: Text to synthesize. ``None``, empty, or whitespace-only text
                is rejected with a status message.
            reference_audio: Optional path to a reference recording used as the
                speech prompt.
            reference_text: Optional transcript of ``reference_audio``. The
                reference is only used when both it and the audio are provided.

        Returns:
            A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
            of the generated WAV file, or ``None`` when nothing was generated.
        """
        # UI/API callers may pass None for an empty field, so guard before
        # calling .strip().
        if text is None or not text.strip():
            return None, "Please enter some text to synthesize."

        if self.pipeline is None:
            # Fallback demo mode - no audio can be generated without a model.
            return None, "Demo mode - TaDiCodec model not available. Please install the required models."

        use_reference = bool(reference_audio) and bool(
            reference_text and reference_text.strip()
        )

        wav_path = None
        try:
            if use_reference:
                audio = self.pipeline(
                    text=text,
                    prompt_text=reference_text,
                    prompt_speech_path=reference_audio,
                )
            else:
                audio = self.pipeline(text=text)

            # Create the file and close our handle *before* writing by name:
            # writing to a path that is still held open fails on Windows.
            fd, wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            sf.write(wav_path, audio, self.SAMPLE_RATE)
            return wav_path, "Speech synthesized successfully!"
        except Exception as e:
            # Do not leave a partial temp file behind when writing failed.
            if wav_path is not None:
                try:
                    os.remove(wav_path)
                except OSError:
                    pass
            return None, f"Error during synthesis: {str(e)}"

# Create the single module-level demo object. Its constructor calls
# load_model(), so model loading is attempted here, at import time.
demo_instance = TaDiCodecTTSDemo()

def tts_interface(text, reference_audio, reference_text):
    """Gradio callback: forward the UI values to the shared demo object.

    Returns the ``(audio_path, status_message)`` pair produced by
    ``demo_instance.synthesize_speech``.
    """
    result_path, status = demo_instance.synthesize_speech(
        text,
        reference_audio=reference_audio,
        reference_text=reference_text,
    )
    return result_path, status

# Build the Gradio UI: inputs on the left, generated audio and status on the
# right, followed by clickable examples.
with gr.Blocks(title="TaDiCodec-TTS Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # TaDiCodec-TTS-AR-Qwen2.5-0.5B Demo
    
    This is a demo of the TaDiCodec Text-to-Speech model with Qwen2.5-0.5B backbone.
    
    **Features:**
    - Voice cloning with reference audio
    - Code-switching support (e.g., mixing English and Chinese)
    - Extremely low bitrate (0.0875 kbps)
    - High-quality speech generation
    """)

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="但是 to those who 知道 her well, it was a 标志 of her unwavering 决心 and spirit.",
            )

            reference_text = gr.Textbox(
                label="Reference Text",
                placeholder="Text corresponding to the reference audio for voice cloning...",
                lines=2,
                value="In short, we embarked on a mission to make America great again, for all Americans.",
            )

            reference_audio = gr.Audio(
                label="Reference Audio",
                type="filepath",
            )

            synthesize_btn = gr.Button("Synthesize Speech", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )

            status_message = gr.Textbox(
                label="Status",
                interactive=False,
            )

    # One shared component order for every place that calls tts_interface.
    # Components are passed to the callback positionally, so this order must
    # match the signature tts_interface(text, reference_audio, reference_text).
    tts_inputs = [text_input, reference_audio, reference_text]
    tts_outputs = [output_audio, status_message]

    # Example inputs
    gr.Markdown("### Example Inputs")
    # Each row follows tts_inputs: (text, reference audio path, reference text).
    # Previously the rows and the Examples inputs were ordered
    # (text, reference text, audio), which swapped the last two arguments
    # whenever Gradio ran tts_interface for an example (e.g. example caching).
    examples = [
        [
            "Yes, usually people choose to face life with more positive emotions, after all, happy times are always yearning. However, sometimes slowing down and experiencing the details of life can bring deeper joy and satisfaction. What do you think?",
            "sample/tongueTwisters_en_018.wav",
            "Jittery Jack's jam jars jiggled jauntily, jolting Jack's jumbled jelly-filled jars joyously.",
        ],
        [
            "You think you can just waltz in here and cause chaos? Well, I've got news for you. This time, there's no escaping the consequences. So, one by one, step forward, and let's see who's bold enough to face the music. It's time for a little dose of reality—prepare to be dealt with!",
            "sample/en_013.wav",
            "Get in line trouble makers, and I will take care of you.",
        ],
    ]

    gr.Examples(
        examples=examples,
        inputs=tts_inputs,
        outputs=tts_outputs,
        fn=tts_interface,
    )

    # Connect the interface
    synthesize_btn.click(
        fn=tts_interface,
        inputs=tts_inputs,
        outputs=tts_outputs,
    )

if __name__ == "__main__":
    # Listen on all network interfaces and request a public Gradio share link.
    # NOTE(review): share=True is presumably unnecessary when hosted on
    # Hugging Face Spaces — confirm before removing.
    demo.launch(share=True, server_name="0.0.0.0")