"""Streamlit demo: clone a voice with Coqui XTTS v2 and speak a chosen text.

The user picks a reference voice (a bundled demo file or an uploaded WAV),
picks or types a sentence, and the app synthesizes that sentence in the
reference voice.
"""

import streamlit as st
import torch
import tempfile
import os
import glob
from TTS.api import TTS
import numba

# Numba tweaks kept from the original script for compatibility.
# NOTE(review): numba normally reads its environment variables when it is
# first imported, so setting NUMBA_DISABLE_CACHE after the imports above may
# have no effect -- confirm whether this line is still needed.
os.environ["NUMBA_DISABLE_CACHE"] = "1"
numba.config.THREADING_LAYER = "workqueue"
numba.config.DISABLE_JIT = True

# Radio-button labels, named so the comparisons below cannot drift from them.
DEMO_VOICE_OPTION = "Use pre-recorded demo voice"
UPLOAD_VOICE_OPTION = "Upload your own voice"

# File the synthesized audio is written to (relative to the working directory).
OUTPUT_PATH = "xtts_output.wav"


@st.cache_resource
def load_xtts_model():
    """Load the XTTS v2 model once per server process.

    The model runs on the GPU when CUDA is available and on the CPU otherwise.
    """
    return TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        gpu=torch.cuda.is_available(),
    )


def detect_language(text: str) -> str:
    """Return the XTTS language code to use for *text*.

    Text containing any Devanagari character (U+0900-U+097F) is treated as
    Hindi ("hi"); everything else falls back to English ("en"), which was the
    script's previous fixed setting.
    """
    if any("\u0900" <= char <= "\u097f" for char in text):
        return "hi"
    return "en"


# Load model
tts = load_xtts_model()

# UI
st.title("XTTS Voice Cloning Demo")
st.markdown("1. Select a demo voice OR upload your own\n2. Choose or write text\n3. Hear your cloned voice!")

# Load pre-recorded demo voices. Only regular files are offered (a
# sub-directory cannot be played), sorted so the list order is stable.
demo_voice_dir = "./demo_voices"
demo_files = sorted(
    path for path in glob.glob(f"{demo_voice_dir}/*") if os.path.isfile(path)
)
demo_names = [os.path.basename(f) for f in demo_files]

# Voice input selection
voice_source = st.radio(
    "Choose voice input method:",
    [DEMO_VOICE_OPTION, UPLOAD_VOICE_OPTION],
)

# Exactly one of these is set once the user has provided a reference voice:
# a path on disk for a demo voice, or the raw bytes of an uploaded sample.
speaker_wav_path = None
uploaded_voice = None

if voice_source == DEMO_VOICE_OPTION:
    if demo_files:
        selected_demo = st.selectbox("Choose a demo voice:", demo_names)
        speaker_wav_path = os.path.join(demo_voice_dir, selected_demo)
        st.audio(speaker_wav_path, format="audio/wav")
    else:
        st.warning("No demo voices found in 'demo_voices/' folder.")
elif voice_source == UPLOAD_VOICE_OPTION:
    uploaded_file = st.file_uploader("Upload your voice sample (WAV, mono, 16k–48kHz):", type=["wav"])
    if uploaded_file is not None:
        # Preview straight from memory. Streamlit re-runs this script on every
        # widget interaction, so writing a temp file here would leave a new
        # orphaned file behind on each rerun.
        uploaded_voice = uploaded_file.getvalue()
        st.audio(uploaded_voice, format="audio/wav")

# Predefined texts (Hindi and English); the last entry switches to free text.
predefined_texts = {
    "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।": "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।",
    "Hello Everyone, This is my voice cloned using previously recorded voice sample": "Hello Everyone, This is my voice cloned using previously recorded voice sample",
    "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।": "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।",
    "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?": "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?",
    "This is not my real voice, but can you tell the difference": "This is not my real voice, but can you tell the difference",
    "जीवन एक सुंदर यात्रा है, हर पल को जीओ।": "जीवन एक सुंदर यात्रा है, हर पल को जीओ।",
    "Use custom text": "custom",
}

# Text selection for synthesis
selected_text = st.selectbox("Choose or write text to synthesize:", list(predefined_texts.keys()))

if predefined_texts[selected_text] == "custom":
    input_text = st.text_area("Enter custom text:", "Hello, how are you?")
else:
    input_text = predefined_texts[selected_text]

# Clone & Synthesize functionality
has_voice = speaker_wav_path is not None or uploaded_voice is not None
if has_voice and input_text.strip():
    if st.button("🎧 Clone & Synthesize"):
        temp_speaker_path = None
        try:
            if uploaded_voice is not None:
                # XTTS needs the reference voice as a file on disk, so the
                # upload is materialized only now that it is actually used.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                    temp_audio.write(uploaded_voice)
                temp_speaker_path = temp_audio.name
                speaker_wav_path = temp_speaker_path

            with st.spinner("Cloning voice..."):
                # Clone and synthesize the voice using the XTTS model. The
                # language follows the script of the text so the Hindi
                # sentences are not spoken with the English front end.
                tts.tts_to_file(
                    text=input_text,
                    speaker_wav=speaker_wav_path,
                    language=detect_language(input_text),
                    file_path=OUTPUT_PATH,
                )

            # Display the cloned audio
            st.success("Done! Here's your cloned voice:")
            st.audio(OUTPUT_PATH, format="audio/wav")
        finally:
            # Remove the temporary copy of the upload even if synthesis failed.
            if temp_speaker_path is not None:
                os.remove(temp_speaker_path)