Upload 2 files
Browse files- app.py +92 -0
- requirements.txt +57 -0
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import tempfile
|
4 |
+
import os
|
5 |
+
import glob
|
6 |
+
from TTS.api import TTS
|
7 |
+
import numba
|
8 |
+
|
9 |
+
# Numba compatibility settings: export NUMBA_DISABLE_CACHE=1, turn the JIT
# off, and select the "workqueue" threading layer.
# NOTE(review): these statements run after `numba` and `TTS` have already been
# imported above -- confirm numba still honours them when set this late.
os.environ.update({"NUMBA_DISABLE_CACHE": "1"})
numba.config.DISABLE_JIT = True
numba.config.THREADING_LAYER = "workqueue"
|
13 |
+
|
14 |
+
# Load XTTS model (GPU supported if available)
@st.cache_resource
def load_xtts_model():
    """Build the XTTS v2 model and place it on the best available device.

    Wrapped in ``st.cache_resource`` so the model object is reused instead of
    being rebuilt each time the script body runs.

    Returns:
        The ``TTS`` instance for
        ``tts_models/multilingual/multi-dataset/xtts_v2``, moved to ``"cuda"``
        when ``torch.cuda.is_available()`` is true and to ``"cpu"`` otherwise.
    """
    # Pick the device explicitly and move the model with ``.to(device)``
    # rather than passing the constructor's ``gpu=`` flag.
    # NOTE(review): the first model download may ask for licence acceptance on
    # stdin -- confirm the deployment environment handles that.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
    return model.to(device)
|
19 |
+
|
20 |
+
# Load model
tts = load_xtts_model()

# UI
st.title("XTTS Voice Cloning Demo")
st.markdown("1. Select a demo voice OR upload your own\n2. Choose or write text\n3. Hear your cloned voice!")

# Load pre-recorded demo voices.
# glob returns matches in arbitrary order and also matches sub-directories,
# so keep regular files only and sort them to give the selectbox a stable order.
demo_voice_dir = "./demo_voices"
demo_files = sorted(
    path for path in glob.glob(f"{demo_voice_dir}/*") if os.path.isfile(path)
)
demo_names = [os.path.basename(f) for f in demo_files]
|
31 |
+
|
32 |
+
def _detect_language(text):
    """Return the XTTS language code to use for ``text``.

    Returns ``"hi"`` when ``text`` contains at least one character from the
    Devanagari Unicode block (U+0900-U+097F), otherwise ``"en"``.
    """
    if any("\u0900" <= ch <= "\u097f" for ch in text):
        return "hi"
    return "en"


def _remove_quietly(path):
    """Delete ``path``, ignoring the case where it no longer exists."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


# Voice input selection
voice_source = st.radio(
    "Choose voice input method:",
    ["Use pre-recorded demo voice", "Upload your own voice"],
)

# Path of the reference clip on disk (set for demo voices; for uploads it is
# only filled in at synthesis time, see below).
speaker_wav_path = None
# Raw bytes of an uploaded reference clip, or None when nothing was uploaded.
uploaded_voice_bytes = None

if voice_source == "Use pre-recorded demo voice":
    if demo_files:
        selected_demo = st.selectbox("Choose a demo voice:", demo_names)
        speaker_wav_path = os.path.join(demo_voice_dir, selected_demo)
        st.audio(speaker_wav_path, format="audio/wav")
    else:
        st.warning("No demo voices found in 'demo_voices/' folder.")

elif voice_source == "Upload your own voice":
    uploaded_file = st.file_uploader(
        "Upload your voice sample (WAV, mono, 16k–48kHz):", type=["wav"]
    )
    if uploaded_file:
        # Keep the upload in memory. This script body runs again on every
        # widget interaction, so writing a delete=False temp file here would
        # leave one file behind per run that does not end in a synthesis.
        uploaded_voice_bytes = uploaded_file.getvalue()
        if uploaded_voice_bytes:
            st.audio(uploaded_voice_bytes, format="audio/wav")
        else:
            st.warning("The uploaded file is empty.")
            uploaded_voice_bytes = None

# Predefined sample texts (Hindi and English); the last entry switches to
# free-form input.
predefined_texts = {
    "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।": "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।",
    "Hello Everyone, This is my voice cloned using previously recorded voice sample": "Hello Everyone, This is my voice cloned using previously recorded voice sample",
    "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।": "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।",
    "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?": "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?",
    "This is not my real voice, but can you tell the difference": "This is not my real voice, but can you tell the difference",
    "जीवन एक सुंदर यात्रा है, हर पल को जीओ।": "जीवन एक सुंदर यात्रा है, हर पल को जीओ।",
    "Use custom text": "custom",
}

# Text selection for synthesis
selected_text = st.selectbox("Choose or write text to synthesize:", list(predefined_texts.keys()))
if predefined_texts[selected_text] == "custom":
    input_text = st.text_area("Enter custom text:", "Hello, how are you?")
else:
    input_text = predefined_texts[selected_text]

# Clone & Synthesize functionality
has_voice = speaker_wav_path is not None or uploaded_voice_bytes is not None
if has_voice and input_text.strip():
    if st.button("🎧 Clone & Synthesize"):
        temp_paths = []  # every temp file created below, removed in `finally`
        with st.spinner("Cloning voice..."):
            try:
                if uploaded_voice_bytes is not None:
                    # The model takes a file path, so materialise the upload
                    # only for the duration of this synthesis.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                        temp_paths.append(temp_audio.name)
                        temp_audio.write(uploaded_voice_bytes)
                    speaker_wav_path = temp_audio.name

                # A unique output file per run, so concurrent sessions do not
                # overwrite each other's result.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_output:
                    output_path = temp_output.name
                temp_paths.append(output_path)

                # Clone and synthesize the voice using XTTS model. The language
                # follows the script of the text so the Hindi samples are not
                # synthesized as English.
                tts.tts_to_file(
                    text=input_text,
                    speaker_wav=speaker_wav_path,
                    language=_detect_language(input_text),
                    file_path=output_path,
                )

                # Read the result into memory so the file can be deleted
                # before it is handed to the audio player.
                with open(output_path, "rb") as audio_file:
                    cloned_audio = audio_file.read()
            finally:
                # Clean up temp files even when synthesis raises.
                for temp_path in temp_paths:
                    _remove_quietly(temp_path)

        # Display the cloned audio
        st.success("Done! Here's your cloned voice:")
        st.audio(cloned_audio, format="audio/wav")
|
requirements.txt
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core deps
|
2 |
+
numpy==1.22.0;python_version<="3.10"
|
3 |
+
numpy>=1.24.3;python_version>"3.10"
|
4 |
+
cython>=0.29.30
|
5 |
+
scipy>=1.11.2
|
6 |
+
torch>=2.1
|
7 |
+
torchaudio
|
8 |
+
soundfile>=0.12.0
|
9 |
+
librosa>=0.10.0
|
10 |
+
scikit-learn>=1.3.0
|
11 |
+
numba==0.55.1;python_version<"3.9"
|
12 |
+
numba>=0.57.0;python_version>="3.9"
|
13 |
+
inflect>=5.6.0
|
14 |
+
tqdm>=4.64.1
|
15 |
+
anyascii>=0.3.0
|
16 |
+
pyyaml>=6.0
|
17 |
+
fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
|
18 |
+
aiohttp>=3.8.1
|
19 |
+
packaging>=23.1
|
20 |
+
mutagen==1.47.0
|
21 |
+
# deps for examples
|
22 |
+
flask>=2.0.1
|
23 |
+
# deps for inference
|
24 |
+
pysbd>=0.3.4
|
25 |
+
# deps for notebooks
|
26 |
+
umap-learn>=0.5.1
|
27 |
+
pandas>=1.4,<2.0
|
28 |
+
# deps for training
|
29 |
+
matplotlib>=3.7.0
|
30 |
+
# coqui stack
|
31 |
+
trainer>=0.0.36
|
32 |
+
# config management
|
33 |
+
coqpit>=0.0.16
|
34 |
+
# chinese g2p deps
|
35 |
+
jieba
|
36 |
+
pypinyin
|
37 |
+
# korean
|
38 |
+
hangul_romanize
|
39 |
+
# gruut+supported langs
|
40 |
+
gruut[de,es,fr]==2.2.3
|
41 |
+
# deps for korean
|
42 |
+
jamo
|
43 |
+
nltk
|
44 |
+
g2pkk>=0.1.1
|
45 |
+
# deps for bangla
|
46 |
+
bangla
|
47 |
+
bnnumerizer
|
48 |
+
bnunicodenormalizer
|
49 |
+
# deps for tortoise
|
50 |
+
einops>=0.6.0
|
51 |
+
transformers>=4.33.0
|
52 |
+
# deps for bark
|
53 |
+
encodec>=0.1.1
|
54 |
+
# deps for XTTS
|
55 |
+
unidecode>=1.3.2
|
56 |
+
num2words
|
57 |
+
spacy[ja]>=3
|