"""Gradio demo that clones a voice from a reference clip using Coqui XTTS v2.

The user uploads a reference recording and some text; the app synthesizes the
text with the reference voice and returns the generated audio.
"""

import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS

# Initialize the TTS model once at import time so every request reuses it.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)  # Set gpu=True if CUDA is available


def clone_voice(reference_audio, text):
    """Generate cloned speech from a reference voice and text.

    Args:
        reference_audio: The value produced by ``gr.Audio(type="numpy")``,
            i.e. a ``(sample_rate, samples)`` tuple, or ``None`` when the
            user has not uploaded anything.
        text: The text to synthesize (spoken as English).

    Returns:
        A ``(sample_rate, samples)`` tuple read back from the generated WAV
        file, suitable for a ``gr.Audio`` output component.

    Raises:
        gr.Error: If no reference audio was provided or the text is blank.
    """
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # Gradio hands over (sample_rate, samples); soundfile.write expects
    # (file, data, samplerate), so the order must be flipped when writing.
    sample_rate, samples = reference_audio

    # A per-call temporary directory keeps concurrent requests from
    # overwriting each other's files and cleans up automatically.
    with tempfile.TemporaryDirectory() as work_dir:
        ref_audio_path = str(Path(work_dir) / "reference.wav")
        output_path = str(Path(work_dir) / "cloned_voice.wav")

        # Save the reference audio so the TTS API can read it from disk.
        sf.write(ref_audio_path, samples, sample_rate)

        # Generate speech in the reference speaker's voice.
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language="en",
            file_path=output_path,
        )

        # Load the result into memory before the directory is removed.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)


# Gradio Interface
description = (
    "Upload an audio file of the voice you want to clone and provide text "
    "to generate speech in the same voice."
)

# NOTE(review): `source="upload"` is the older Gradio keyword; newer releases
# appear to use `sources=["upload"]` instead — confirm against the installed
# Gradio version before upgrading.
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="Real-Time Voice Cloning with XTTS",
    description=description,
)

if __name__ == "__main__":
    iface.launch()