Itanutiwari527 committed on
Commit
c53363c
·
verified ·
1 Parent(s): c8faf09

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +92 -0
  2. requirements.txt +57 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import tempfile
4
+ import os
5
+ import glob
6
+ from TTS.api import TTS
7
+ import numba
8
+
9
# Numba tuning applied at start-up, before the XTTS model is loaded.
#
# NOTE(review): `numba` and `TTS.api` are already imported above when these
# lines run, so settings that numba only reads at import/decoration time may
# have no effect here -- confirm, and move them above the imports if needed.
# NOTE(review): confirm that "NUMBA_DISABLE_CACHE" is a variable numba
# actually recognises.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

# Select numba's "workqueue" threading layer.
numba.config.THREADING_LAYER = "workqueue"

# Ask numba not to JIT-compile decorated functions.
numba.config.DISABLE_JIT = True

@st.cache_resource
def load_xtts_model():
    """Build the XTTS v2 text-to-speech model.

    The function is wrapped in ``st.cache_resource``. The model is asked to
    use the GPU only when ``torch.cuda.is_available()`` reports one;
    otherwise ``gpu=False`` is passed.

    Returns:
        The ``TTS`` instance for
        ``tts_models/multilingual/multi-dataset/xtts_v2``.
    """
    use_gpu = torch.cuda.is_available()
    return TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        gpu=use_gpu,
    )

# Directory scanned for ready-made reference voices.
DEMO_VOICE_DIR = "./demo_voices"

# File the synthesized audio is written to.
# NOTE(review): this path is shared by every session of the app -- confirm
# that concurrent users overwriting each other's output is acceptable.
OUTPUT_PATH = "xtts_output.wav"

# Ready-made sentences offered in the text selector (Hindi and English).
PRESET_TEXTS = (
    "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।",
    "Hello Everyone, This is my voice cloned using previously recorded voice sample",
    "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।",
    "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?",
    "This is not my real voice, but can you tell the difference",
    "जीवन एक सुंदर यात्रा है, हर पल को जीओ।",
)

# Selector entry that switches the UI to a free-text box.
CUSTOM_TEXT_OPTION = "Use custom text"


def detect_xtts_language(text, default="en"):
    """Pick the language code to pass to the model for *text*.

    Args:
        text: The sentence that is about to be synthesized.
        default: Code returned when no Devanagari character is found.

    Returns:
        ``"hi"`` if *text* contains at least one character from the
        Devanagari Unicode block (U+0900-U+097F), otherwise *default*.
    """
    if any("\u0900" <= char <= "\u097f" for char in text):
        return "hi"
    return default


def main():
    """Render the voice-cloning page and synthesize speech on request.

    The user picks a reference voice (a file from ``DEMO_VOICE_DIR`` or an
    uploaded WAV), picks or types the text, and presses the button; the
    result is written to ``OUTPUT_PATH`` and played back.
    """
    tts = load_xtts_model()

    st.title("XTTS Voice Cloning Demo")
    st.markdown("1. Select a demo voice OR upload your own\n2. Choose or write text\n3. Hear your cloned voice!")

    # Sorted so the selector order is stable; directories are skipped because
    # only files can serve as a reference sample.
    demo_files = sorted(
        path for path in glob.glob(f"{DEMO_VOICE_DIR}/*") if os.path.isfile(path)
    )
    demo_names = [os.path.basename(path) for path in demo_files]

    voice_source = st.radio(
        "Choose voice input method:",
        ["Use pre-recorded demo voice", "Upload your own voice"],
    )

    speaker_wav_path = None
    # Set only when an upload was copied to disk during this run, so the
    # `finally` below knows what to delete.
    uploaded_tmp_path = None

    try:
        if voice_source == "Use pre-recorded demo voice":
            if demo_files:
                selected_demo = st.selectbox("Choose a demo voice:", demo_names)
                speaker_wav_path = os.path.join(DEMO_VOICE_DIR, selected_demo)
                st.audio(speaker_wav_path, format="audio/wav")
            else:
                st.warning("No demo voices found in 'demo_voices/' folder.")

        elif voice_source == "Upload your own voice":
            uploaded_file = st.file_uploader(
                "Upload your voice sample (WAV, mono, 16k–48kHz):", type=["wav"]
            )
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                    # Record the name first so cleanup still happens if the
                    # write fails.
                    uploaded_tmp_path = temp_audio.name
                    temp_audio.write(uploaded_file.getvalue())
                # The file is closed (and flushed) before anything reads it.
                speaker_wav_path = uploaded_tmp_path
                st.audio(speaker_wav_path, format="audio/wav")

        selected_text = st.selectbox(
            "Choose or write text to synthesize:",
            [*PRESET_TEXTS, CUSTOM_TEXT_OPTION],
        )
        if selected_text == CUSTOM_TEXT_OPTION:
            input_text = st.text_area("Enter custom text:", "Hello, how are you?")
        else:
            input_text = selected_text

        # The button is only offered once both a voice and some text exist.
        if speaker_wav_path and input_text.strip():
            if st.button("🎧 Clone & Synthesize"):
                with st.spinner("Cloning voice..."):
                    tts.tts_to_file(
                        text=input_text,
                        speaker_wav=speaker_wav_path,
                        # Previously fixed to "en", which was also applied to
                        # the Hindi sentences.
                        language=detect_xtts_language(input_text),
                        file_path=OUTPUT_PATH,
                    )

                    st.success("Done! Here's your cloned voice:")
                    st.audio(OUTPUT_PATH, format="audio/wav")
    finally:
        # A fresh temp copy is written on every run that has an upload, so it
        # is removed on every run too -- not only after a successful
        # synthesis -- otherwise the copies pile up.
        if uploaded_tmp_path is not None and os.path.exists(uploaded_tmp_path):
            os.remove(uploaded_tmp_path)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core deps
2
+ numpy==1.22.0;python_version<="3.10"
3
+ numpy>=1.24.3;python_version>"3.10"
4
+ cython>=0.29.30
5
+ scipy>=1.11.2
6
+ torch>=2.1
7
+ torchaudio
8
+ soundfile>=0.12.0
9
+ librosa>=0.10.0
10
+ scikit-learn>=1.3.0
11
+ numba==0.55.1;python_version<"3.9"
12
+ numba>=0.57.0;python_version>="3.9"
13
+ inflect>=5.6.0
14
+ tqdm>=4.64.1
15
+ anyascii>=0.3.0
16
+ pyyaml>=6.0
17
+ fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
18
+ aiohttp>=3.8.1
19
+ packaging>=23.1
20
+ mutagen==1.47.0
21
+ # deps for examples
22
+ flask>=2.0.1
23
+ # deps for inference
24
+ pysbd>=0.3.4
25
+ # deps for notebooks
26
+ umap-learn>=0.5.1
27
+ pandas>=1.4,<2.0
28
+ # deps for training
29
+ matplotlib>=3.7.0
30
+ # coqui stack
31
+ trainer>=0.0.36
32
+ # config management
33
+ coqpit>=0.0.16
34
+ # chinese g2p deps
35
+ jieba
36
+ pypinyin
37
+ # korean
38
+ hangul_romanize
39
+ # gruut+supported langs
40
+ gruut[de,es,fr]==2.2.3
41
+ # deps for korean
42
+ jamo
43
+ nltk
44
+ g2pkk>=0.1.1
45
+ # deps for bangla
46
+ bangla
47
+ bnnumerizer
48
+ bnunicodenormalizer
49
+ # deps for tortoise
50
+ einops>=0.6.0
51
+ transformers>=4.33.0
52
+ # deps for bark
53
+ encodec>=0.1.1
54
+ # deps for XTTS
55
+ unidecode>=1.3.2
56
+ num2words
57
+ spacy[ja]>=3