Spaces:
Running
Running
Upload 8 files
Browse files- .gitattributes +4 -0
- Dockerfile +27 -0
- app.py +83 -0
- demo_voices/English_male_long.mp3 +3 -0
- demo_voices/demo_eng_female.mp3 +3 -0
- demo_voices/demo_eng_male.mp3 +3 -0
- demo_voices/demo_female_hindi.wav +3 -0
- pre-requirements.txt +1 -0
- requirements.txt +17 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
demo_voices/demo_eng_female.mp3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
demo_voices/demo_eng_male.mp3 filter=lfs diff=lfs merge=lfs -text
|
38 |
+
demo_voices/demo_female_hindi.wav filter=lfs diff=lfs merge=lfs -text
|
39 |
+
demo_voices/English_male_long.mp3 filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Container image for the XTTS voice-cloning Streamlit demo (app.py).
FROM python:3.10-slim

# Avoid interactive prompts from apt during the build
ENV DEBIAN_FRONTEND=noninteractive

# Install required system packages.
# --no-install-recommends keeps optional "recommended" packages out of the image;
# the apt lists are removed in the same layer so they never reach the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install numpy before other libraries to avoid conflicts
COPY pre-requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r pre-requirements.txt

# Now install the rest of the dependencies.
# The requirements files are copied before the app code so that code-only
# changes do not invalidate the (slow) dependency layers.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY . .

# NOTE(review): Coqui TTS asks for licence (CPML) confirmation on the first
# download of the XTTS model; a container has no interactive stdin, so the
# operator presumably needs to accept the licence and set COQUI_TOS_AGREED=1
# in the Space settings - confirm against the Coqui TTS documentation.

# Start Streamlit app on the port Hugging Face Spaces expects by default (7860)
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
app.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit demo: clone a voice with Coqui XTTS v2 and synthesize text with it.

The user picks a reference voice (a bundled demo clip or an uploaded WAV),
picks or types the text, and the app plays back the synthesized speech.
"""

import glob
import os
import tempfile

import soundfile as sf
import streamlit as st
import torch
from TTS.api import TTS

# Labels of the voice-source radio options; reused for the branch conditions
# so the widget and the checks cannot drift apart.
DEMO_VOICE_OPTION = "Use pre-recorded demo voice"
UPLOAD_VOICE_OPTION = "Upload your own voice"

# Value in ``predefined_texts`` that marks the "type your own text" entry.
CUSTOM_TEXT_SENTINEL = "custom"

# MIME types for the audio previews, keyed by lower-case file extension.
AUDIO_MIME_TYPES = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
}


def audio_mime_type(path):
    """Return the MIME type for *path* based on its extension.

    Unknown extensions fall back to ``"audio/wav"``, the value the app used
    for every preview before.
    """
    extension = os.path.splitext(path)[1].lower()
    return AUDIO_MIME_TYPES.get(extension, "audio/wav")


def detect_language(text):
    """Return the XTTS language code to use for *text*.

    The demo offers Hindi and English sentences: text containing any
    Devanagari character (U+0900-U+097F) is synthesized as Hindi (``"hi"``),
    everything else as English (``"en"``).
    """
    if any("\u0900" <= char <= "\u097f" for char in text):
        return "hi"
    return "en"


# Load XTTS model
@st.cache_resource
def load_xtts_model():
    """Load the XTTS v2 model once per server process (cached by Streamlit)."""
    return TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        gpu=torch.cuda.is_available(),
    )


tts = load_xtts_model()

# UI
st.title("XTTS Voice Cloning Demo")
st.markdown("1. Select a demo voice OR upload your own\n2. Choose or write text\n3. Hear your cloned voice!")

# Load pre-recorded demo voices (sorted so the dropdown order is stable;
# sub-directories are skipped because they cannot be played or cloned).
demo_voice_dir = "./demo_voices"
demo_files = sorted(
    path for path in glob.glob(f"{demo_voice_dir}/*") if os.path.isfile(path)
)
demo_names = [os.path.basename(path) for path in demo_files]

voice_source = st.radio(
    "Choose voice input method:",
    [DEMO_VOICE_OPTION, UPLOAD_VOICE_OPTION],
)

# Exactly one of these is set once the user has provided a reference voice.
speaker_wav_path = None
uploaded_file = None

if voice_source == DEMO_VOICE_OPTION:
    if demo_files:
        selected_demo = st.selectbox("Choose a demo voice:", demo_names)
        speaker_wav_path = os.path.join(demo_voice_dir, selected_demo)
        # Demo clips are a mix of MP3 and WAV, so derive the MIME type per file.
        st.audio(speaker_wav_path, format=audio_mime_type(speaker_wav_path))
    else:
        st.warning("No demo voices found in 'demo_voices/' folder.")

elif voice_source == UPLOAD_VOICE_OPTION:
    uploaded_file = st.file_uploader("Upload your voice sample (WAV, mono, 16k–48kHz):", type=["wav"])
    if uploaded_file is not None:
        # Preview straight from memory. The script reruns on every widget
        # interaction, so writing a temp file here would leave one behind per
        # rerun; the sample is only written to disk when synthesis runs.
        st.audio(uploaded_file.getvalue(), format="audio/wav")

# Predefined texts (Hindi and English); the last entry switches to free text.
predefined_texts = {
    "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।": "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।",
    "Hello Everyone, This is my voice cloned using previously recorded voice sample": "Hello Everyone, This is my voice cloned using previously recorded voice sample",
    "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।": "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।",
    "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?": "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?",
    "This is not my real voice, but can you tell the difference": "This is not my real voice, but can you tell the difference",
    "जीवन एक सुंदर यात्रा है, हर पल को जीओ।": "जीवन एक सुंदर यात्रा है, हर पल को जीओ।",
    "Use custom text": CUSTOM_TEXT_SENTINEL,
}

selected_text = st.selectbox("Choose or write text to synthesize:", list(predefined_texts.keys()))
if predefined_texts[selected_text] == CUSTOM_TEXT_SENTINEL:
    input_text = st.text_area("Enter custom text:", "Hello, how are you?")
else:
    input_text = predefined_texts[selected_text]

# Clone & Synthesize
has_voice = speaker_wav_path is not None or uploaded_file is not None
if has_voice and input_text.strip():
    if st.button("🎧 Clone & Synthesize"):
        with st.spinner("Cloning voice..."):
            # A private scratch directory per request: concurrent sessions do
            # not overwrite each other's output, nothing is written into the
            # app directory, and every temporary file is removed even when
            # synthesis raises.
            with tempfile.TemporaryDirectory() as work_dir:
                if uploaded_file is not None:
                    speaker_wav_path = os.path.join(work_dir, "speaker.wav")
                    with open(speaker_wav_path, "wb") as speaker_file:
                        speaker_file.write(uploaded_file.getvalue())

                output_path = os.path.join(work_dir, "xtts_output.wav")
                tts.tts_to_file(
                    text=input_text,
                    speaker_wav=speaker_wav_path,
                    # Hindi sentences must be synthesized as Hindi, not English.
                    language=detect_language(input_text),
                    file_path=output_path,
                )

                # Read the result before the scratch directory is deleted.
                with open(output_path, "rb") as output_file:
                    output_audio = output_file.read()

        st.success("Done! Here's your cloned voice:")
        st.audio(output_audio, format="audio/wav")
|
demo_voices/English_male_long.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e25f7504473dc28cf288828a46bd1cd2974a8bbaf6fc81b4b261e7fae6429e3
|
3 |
+
size 1828181
|
demo_voices/demo_eng_female.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0f3710c54962749d652d06ee880430bbde4a1ced3af3210016b1d2ad04b9ce9
|
3 |
+
size 578228
|
demo_voices/demo_eng_male.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b693f7f888f358cf17ddd660af2f0835924482b7e8614153e763921282da1587
|
3 |
+
size 248810
|
demo_voices/demo_female_hindi.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a90e0875347854b9771b83a12c974166af9a0a844c683c318484985e28c02381
|
3 |
+
size 2986062
|
pre-requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
numpy==1.22.0
|
requirements.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Streamlit frontend
|
2 |
+
streamlit==1.33.0
|
3 |
+
|
4 |
+
# Voice Cloning - Coqui TTS
|
5 |
+
TTS==0.22.0
|
6 |
+
|
7 |
+
# Audio handling
|
8 |
+
soundfile
|
9 |
+
numpy==1.22.0
|
10 |
+
scipy==1.11.4
|
11 |
+
|
12 |
+
# PyTorch (CPU version is okay for Spaces unless GPU is enabled)
|
13 |
+
torch==2.0.1
|
14 |
+
torchaudio==2.0.2
|
15 |
+
|
16 |
+
# File handling
|
17 |
+
ffmpeg-python
|