Update app.py
app.py (CHANGED)
@@ -15,13 +15,11 @@ from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 
-# download for mecab
 os.system("python -m unidic download")
 
-HF_TOKEN =
+HF_TOKEN = None
 api = HfApi(token=HF_TOKEN)
 
-# This will trigger downloading model
 print("Downloading if not downloaded viXTTS")
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
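Note on the hunk above: with HF_TOKEN hardcoded to None, HfApi runs unauthenticated, which is enough for pulling the public capleaf/viXTTS checkpoint. A minimal alternative sketch (an assumption, not part of this commit) reads the token from an environment variable, for example a Space secret named HF_TOKEN:

import os
from huggingface_hub import HfApi

# Sketch only: take the token from the environment (e.g. a Space secret named
# HF_TOKEN). os.getenv returns None when the variable is unset, so the client
# simply falls back to anonymous access.
HF_TOKEN = os.getenv("HF_TOKEN")
api = HfApi(token=HF_TOKEN)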
@@ -56,7 +54,8 @@ if torch.cuda.is_available():
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
-
+if not "es-AR" in supported_languages:
+    supported_languages.append("es-AR")
 
 def normalize_vietnamese_text(text):
     text = (
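Note on the hunk above: appending "es-AR" to config.languages only satisfies the supported_languages check inside predict; whether the XTTS tokenizer itself accepts a regional code like "es-AR" is a separate question. One hedged option (an assumption, not something this commit does) is to collapse regional tags to a base code right before calling MODEL.inference:

# Hypothetical helper, not part of this commit: map regional variants to the
# base language code the XTTS checkpoint was trained with.
LANGUAGE_ALIASES = {"es-AR": "es"}

def resolve_language(code: str) -> str:
    return LANGUAGE_ALIASES.get(code, code)

# e.g. MODEL.inference(prompt, resolve_language(language), ...)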
@@ -75,7 +74,6 @@ def normalize_vietnamese_text(text):
 
 
 def calculate_keep_len(text, lang):
-    """Simple hack for short sentences"""
     if lang in ["ja", "zh-cn"]:
         return -1
 
@@ -98,23 +96,19 @@ def predict(
 ):
     if language not in supported_languages:
         metrics_text = gr.Warning(
-            f"
+            f"El idioma seleccionado ({language}) no está disponible. Por favor, elige uno de la lista."
         )
-
        return (None, metrics_text)
 
     speaker_wav = audio_file_pth
 
     if len(prompt) < 2:
-        metrics_text = gr.Warning("
+        metrics_text = gr.Warning("Por favor, introduce un texto más largo.")
        return (None, metrics_text)
 
     if len(prompt) > 250000000:
         metrics_text = gr.Warning(
-
-            + " characters.\n"
-            + "Your prompt is too long, please keep it under 250000000 characters\n"
-            + "Văn bản quá dài, vui lòng giữ dưới 250000000 ký tự."
+            f"El texto tiene {len(prompt)} caracteres. Es demasiado largo, por favor, mantenlo por debajo de 250000000 caracteres."
         )
         return (None, metrics_text)
 
@@ -136,7 +130,7 @@ def predict(
         except Exception as e:
             print("Speaker encoding error", str(e))
             metrics_text = gr.Warning(
-                "
+                "¿Has activado el micrófono? Parece que hay un problema con la referencia de audio."
             )
             return (None, metrics_text)
 
@@ -145,7 +139,7 @@ def predict(
         if normalize_text and language == "vi":
             prompt = normalize_vietnamese_text(prompt)
 
-        print("
+        print("Generando nuevo audio...")
         t0 = time.time()
         out = MODEL.inference(
             prompt,
@@ -157,15 +151,14 @@ def predict(
             enable_text_splitting=True,
         )
         inference_time = time.time() - t0
-        print(f"
+        print(f"Tiempo de generación de audio: {round(inference_time*1000)} milisegundos")
         metrics_text += (
-            f"
+            f"Tiempo de generación de audio: {round(inference_time*1000)} milisegundos\n"
         )
         real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
-        print(f"
-        metrics_text += f"
+        print(f"Factor de tiempo real (RTF): {real_time_factor}")
+        metrics_text += f"Factor de tiempo real (RTF): {real_time_factor:.2f}\n"
 
-        # Temporary hack for short sentences
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
 
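The real-time factor computed in this hunk is wall-clock generation time divided by the duration of the generated audio, with duration taken as the number of output samples over the 24 kHz sample rate. A standalone restatement of the same arithmetic (function name is illustrative only):

# Same formula as above: elapsed / (samples / 24000).
def real_time_factor(elapsed_s: float, num_samples: int, sample_rate: int = 24000) -> float:
    audio_seconds = num_samples / sample_rate
    return elapsed_s / audio_seconds

# Example: 3.0 s of compute for 120_000 samples (5.0 s of audio) gives RTF 0.6,
# i.e. generation runs faster than real time.
print(real_time_factor(3.0, 120_000))  # 0.6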
@@ -173,12 +166,11 @@ def predict(
 
     except RuntimeError as e:
         if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need tor estart
             print(
                 f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
                 flush=True,
             )
-            gr.Warning("
+            gr.Warning("Se ha producido un error. Por favor, inténtalo de nuevo en un minuto.")
             print("Cuda device-assert Runtime encountered need restart")
 
             error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
@@ -205,7 +197,6 @@ def predict(
                 repo_type="dataset",
             )
 
-            # speaker_wav
             print("Writing error reference audio")
             speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
             error_api = HfApi()
@@ -216,7 +207,6 @@ def predict(
                 repo_type="dataset",
             )
 
-            # HF Space specific.. This error is unrecoverable need to restart space
             space = api.get_space_runtime(repo_id=repo_id)
             if space.stage != "BUILDING":
                 api.restart_space(repo_id=repo_id)
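Related to the HF_TOKEN change earlier in this diff: restarting a Space through api.restart_space requires a token with write access, so with HF_TOKEN = None this error path will likely fail at that call. A hedged sketch of one way to guard it (an assumption, not something this commit does; names mirror the file above):

from huggingface_hub import HfApi

# Sketch only: skip the restart logic when no token is configured.
def maybe_restart_space(api: HfApi, repo_id: str, token: str | None) -> None:
    if token is None:
        return
    space = api.get_space_runtime(repo_id=repo_id)
    if space.stage != "BUILDING":
        api.restart_space(repo_id=repo_id)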
@@ -227,12 +217,12 @@ def predict(
             if "Failed to decode" in str(e):
                 print("Speaker encoding error", str(e))
                 metrics_text = gr.Warning(
-                    metrics_text="
+                    metrics_text="Parece que hay un problema con la referencia de audio. ¿Has activado el micrófono?"
                 )
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
                 metrics_text = gr.Warning(
-                    "
+                    "Se ha producido un error inesperado. Por favor, inténtalo de nuevo."
                 )
             return (None, metrics_text)
     return ("output.wav", metrics_text)
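One detail carried over from the previous version: gr.Warning takes its message as the first positional argument and does not document a metrics_text keyword, so the metrics_text="..." call in the hunk above would likely raise a TypeError if that branch is reached. A small sketch of the plain form (an assumption about the intent, not part of this commit):

import gradio as gr

# Sketch only: show the toast and reuse the same string as the metrics text
# that predict returns to the UI.
msg = "Parece que hay un problema con la referencia de audio. ¿Has activado el micrófono?"
gr.Warning(msg)
metrics_text = msg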
@@ -244,24 +234,22 @@ with gr.Blocks(analytics_enabled=False) as demo:
             gr.Markdown(
                 """
                 # viXTTS Demo ✨
-                - Github: https://github.com/thinhlpg/vixtts-demo/
-                - viVoice: https://github.com/thinhlpg/viVoice
                 """
             )
         with gr.Column():
-            # placeholder to align the image
             pass
 
     with gr.Row():
         with gr.Column():
             input_text_gr = gr.Textbox(
-                label="
-                info="
-                value="
+                label="Texto a convertir a voz",
+                info="Cada frase debe tener al menos 10 palabras. Máximo 250 caracteres (alrededor de 2-3 frases).",
+                value="Hola, soy un modelo de texto a voz.",
             )
             language_gr = gr.Dropdown(
-                label="
+                label="Idioma",
                 choices=[
+                    "es-AR",
                     "vi",
                     "en",
                     "es",
@@ -282,28 +270,28 @@ with gr.Blocks(analytics_enabled=False) as demo:
                     "hi",
                 ],
                 max_choices=1,
-                value="
+                value="es-AR",
             )
             normalize_text = gr.Checkbox(
-                label="
-                info="
+                label="Normalizar texto en vietnamita",
+                info="Solo aplicable al idioma vietnamita",
                 value=True,
             )
             ref_gr = gr.Audio(
-                label="
+                label="Audio de referencia (opcional)",
                 type="filepath",
                 value="model/samples/nu-luu-loat.wav",
             )
             tts_button = gr.Button(
-                "
+                "Generar voz 🗣️🔥",
                 elem_id="send-btn",
                 visible=True,
                 variant="primary",
             )
 
         with gr.Column():
-            audio_gr = gr.Audio(label="
-            out_text_gr = gr.Text(label="
+            audio_gr = gr.Audio(label="Audio generado", autoplay=True)
+            out_text_gr = gr.Text(label="Métricas")
 
         tts_button.click(
             predict,
@@ -318,4 +306,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
         )
 
 demo.queue()
-demo.launch(debug=True, show_api=True, share=True)
+demo.launch(debug=True, show_api=True, share=True)