import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS

# Load the XTTS v2 multilingual model once, at import time, so that every
# request served by this module reuses the same instance.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name=model_name, gpu=False)  # CPU inference; use gpu=True if CUDA is available

def clone_voice(reference_audio, text):
    """Generate speech for ``text`` in the voice heard in ``reference_audio``.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, the format produced
            by ``gr.Audio(type="numpy")``; ``None`` when nothing was uploaded.
        text: The text to synthesize.

    Returns:
        A ``(sample_rate, samples)`` tuple holding the generated speech, the
        same layout a ``gr.Audio`` output component accepts.

    Raises:
        ValueError: If no reference audio was supplied or ``text`` is blank.
    """
    # Validate up front so the user gets a clear message instead of an opaque
    # failure from deep inside soundfile or the TTS model.
    if reference_audio is None:
        raise ValueError("Please upload a reference audio clip.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to speak.")

    # The incoming tuple is (sample_rate, data), whereas soundfile's signature
    # is write(file, data, samplerate) -- unpack by name to avoid swapping them.
    sample_rate, samples = reference_audio

    # Use a per-call scratch directory: fixed file names in the working
    # directory would be clobbered by concurrent requests and never removed.
    with tempfile.TemporaryDirectory() as workdir:
        ref_audio_path = os.path.join(workdir, "reference.wav")
        output_path = os.path.join(workdir, "cloned_voice.wav")

        sf.write(ref_audio_path, samples, sample_rate)
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language="en",
            file_path=output_path,
        )

        # Read the result back before the temporary directory is deleted.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)

# Gradio Interface: one audio upload plus a text box in, one audio clip out.
# NOTE(review): `source="upload"` is the Gradio 3.x keyword; Gradio 4+ appears
# to have renamed it to `sources=[...]` -- confirm against the installed version.
description = (
    "Upload an audio file of the voice you want to clone "
    "and provide text to generate speech in the same voice."
)
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="Real-Time Voice Cloning with XTTS",
    description=description,
)

# Start the web UI only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()