Uhhy committed on
Commit
d270640
·
verified ·
1 Parent(s): 1c71ddb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -40
app.py CHANGED
@@ -15,13 +15,11 @@ from TTS.tts.configs.xtts_config import XttsConfig
15
  from TTS.tts.models.xtts import Xtts
16
  from vinorm import TTSnorm
17
 
18
- # download for mecab
19
  os.system("python -m unidic download")
20
 
21
- HF_TOKEN = os.environ.get("HF_TOKEN")
22
  api = HfApi(token=HF_TOKEN)
23
 
24
- # This will trigger downloading model
25
  print("Downloading if not downloaded viXTTS")
26
  checkpoint_dir = "model/"
27
  repo_id = "capleaf/viXTTS"
@@ -56,7 +54,8 @@ if torch.cuda.is_available():
56
  supported_languages = config.languages
57
  if not "vi" in supported_languages:
58
  supported_languages.append("vi")
59
-
 
60
 
61
  def normalize_vietnamese_text(text):
62
  text = (
@@ -75,7 +74,6 @@ def normalize_vietnamese_text(text):
75
 
76
 
77
  def calculate_keep_len(text, lang):
78
- """Simple hack for short sentences"""
79
  if lang in ["ja", "zh-cn"]:
80
  return -1
81
 
@@ -98,23 +96,19 @@ def predict(
98
  ):
99
  if language not in supported_languages:
100
  metrics_text = gr.Warning(
101
- f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
102
  )
103
-
104
  return (None, metrics_text)
105
 
106
  speaker_wav = audio_file_pth
107
 
108
  if len(prompt) < 2:
109
- metrics_text = gr.Warning("Please give a longer prompt text")
110
  return (None, metrics_text)
111
 
112
  if len(prompt) > 250000000:
113
  metrics_text = gr.Warning(
114
- str(len(prompt))
115
- + " characters.\n"
116
- + "Your prompt is too long, please keep it under 250000000 characters\n"
117
- + "Văn bản quá dài, vui lòng giữ dưới 250000000 ký tự."
118
  )
119
  return (None, metrics_text)
120
 
@@ -136,7 +130,7 @@ def predict(
136
  except Exception as e:
137
  print("Speaker encoding error", str(e))
138
  metrics_text = gr.Warning(
139
- "It appears something wrong with reference, did you unmute your microphone?"
140
  )
141
  return (None, metrics_text)
142
 
@@ -145,7 +139,7 @@ def predict(
145
  if normalize_text and language == "vi":
146
  prompt = normalize_vietnamese_text(prompt)
147
 
148
- print("I: Generating new audio...")
149
  t0 = time.time()
150
  out = MODEL.inference(
151
  prompt,
@@ -157,15 +151,14 @@ def predict(
157
  enable_text_splitting=True,
158
  )
159
  inference_time = time.time() - t0
160
- print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
161
  metrics_text += (
162
- f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
163
  )
164
  real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
165
- print(f"Real-time factor (RTF): {real_time_factor}")
166
- metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
167
 
168
- # Temporary hack for short sentences
169
  keep_len = calculate_keep_len(prompt, language)
170
  out["wav"] = out["wav"][:keep_len]
171
 
@@ -173,12 +166,11 @@ def predict(
173
 
174
  except RuntimeError as e:
175
  if "device-side assert" in str(e):
176
- # cannot do anything on cuda device side error, need tor estart
177
  print(
178
  f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
179
  flush=True,
180
  )
181
- gr.Warning("Unhandled Exception encounter, please retry in a minute")
182
  print("Cuda device-assert Runtime encountered need restart")
183
 
184
  error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
@@ -205,7 +197,6 @@ def predict(
205
  repo_type="dataset",
206
  )
207
 
208
- # speaker_wav
209
  print("Writing error reference audio")
210
  speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
211
  error_api = HfApi()
@@ -216,7 +207,6 @@ def predict(
216
  repo_type="dataset",
217
  )
218
 
219
- # HF Space specific.. This error is unrecoverable need to restart space
220
  space = api.get_space_runtime(repo_id=repo_id)
221
  if space.stage != "BUILDING":
222
  api.restart_space(repo_id=repo_id)
@@ -227,12 +217,12 @@ def predict(
227
  if "Failed to decode" in str(e):
228
  print("Speaker encoding error", str(e))
229
  metrics_text = gr.Warning(
230
- metrics_text="It appears something wrong with reference, did you unmute your microphone?"
231
  )
232
  else:
233
  print("RuntimeError: non device-side assert error:", str(e))
234
  metrics_text = gr.Warning(
235
- "Something unexpected happened please retry again."
236
  )
237
  return (None, metrics_text)
238
  return ("output.wav", metrics_text)
@@ -244,24 +234,22 @@ with gr.Blocks(analytics_enabled=False) as demo:
244
  gr.Markdown(
245
  """
246
  # viXTTS Demo ✨
247
- - Github: https://github.com/thinhlpg/vixtts-demo/
248
- - viVoice: https://github.com/thinhlpg/viVoice
249
  """
250
  )
251
  with gr.Column():
252
- # placeholder to align the image
253
  pass
254
 
255
  with gr.Row():
256
  with gr.Column():
257
  input_text_gr = gr.Textbox(
258
- label="Text Prompt (Văn bản cần đọc)",
259
- info="Mỗi câu nên từ 10 từ trở lên. Tối đa 250 tự (khoảng 2 - 3 câu).",
260
- value="Xin chào, tôi một hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
261
  )
262
  language_gr = gr.Dropdown(
263
- label="Language (Ngôn ngữ)",
264
  choices=[
 
265
  "vi",
266
  "en",
267
  "es",
@@ -282,28 +270,28 @@ with gr.Blocks(analytics_enabled=False) as demo:
282
  "hi",
283
  ],
284
  max_choices=1,
285
- value="vi",
286
  )
287
  normalize_text = gr.Checkbox(
288
- label="Chuẩn hóa văn bản tiếng Việt",
289
- info="Normalize Vietnamese text",
290
  value=True,
291
  )
292
  ref_gr = gr.Audio(
293
- label="Reference Audio (Giọng mẫu)",
294
  type="filepath",
295
  value="model/samples/nu-luu-loat.wav",
296
  )
297
  tts_button = gr.Button(
298
- "Đọc 🗣️🔥",
299
  elem_id="send-btn",
300
  visible=True,
301
  variant="primary",
302
  )
303
 
304
  with gr.Column():
305
- audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
306
- out_text_gr = gr.Text(label="Metrics")
307
 
308
  tts_button.click(
309
  predict,
@@ -318,4 +306,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
318
  )
319
 
320
  demo.queue()
321
- demo.launch(debug=True, show_api=True, share=True)
 
15
  from TTS.tts.models.xtts import Xtts
16
  from vinorm import TTSnorm
17
 
 
18
  os.system("python -m unidic download")
19
 
20
+ HF_TOKEN = None
21
  api = HfApi(token=HF_TOKEN)
22
 
 
23
  print("Downloading if not downloaded viXTTS")
24
  checkpoint_dir = "model/"
25
  repo_id = "capleaf/viXTTS"
 
54
  supported_languages = config.languages
55
  if not "vi" in supported_languages:
56
  supported_languages.append("vi")
57
+ if not "es-AR" in supported_languages:
58
+ supported_languages.append("es-AR")
59
 
60
  def normalize_vietnamese_text(text):
61
  text = (
 
74
 
75
 
76
  def calculate_keep_len(text, lang):
 
77
  if lang in ["ja", "zh-cn"]:
78
  return -1
79
 
 
96
  ):
97
  if language not in supported_languages:
98
  metrics_text = gr.Warning(
99
+ f"El idioma seleccionado ({language}) no está disponible. Por favor, elige uno de la lista."
100
  )
 
101
  return (None, metrics_text)
102
 
103
  speaker_wav = audio_file_pth
104
 
105
  if len(prompt) < 2:
106
+ metrics_text = gr.Warning("Por favor, introduce un texto más largo.")
107
  return (None, metrics_text)
108
 
109
  if len(prompt) > 250000000:
110
  metrics_text = gr.Warning(
111
+ f"El texto tiene {len(prompt)} caracteres. Es demasiado largo, por favor, mantenlo por debajo de 250000000 caracteres."
 
 
 
112
  )
113
  return (None, metrics_text)
114
 
 
130
  except Exception as e:
131
  print("Speaker encoding error", str(e))
132
  metrics_text = gr.Warning(
133
+ "¿Has activado el micrófono? Parece que hay un problema con la referencia de audio."
134
  )
135
  return (None, metrics_text)
136
 
 
139
  if normalize_text and language == "vi":
140
  prompt = normalize_vietnamese_text(prompt)
141
 
142
+ print("Generando nuevo audio...")
143
  t0 = time.time()
144
  out = MODEL.inference(
145
  prompt,
 
151
  enable_text_splitting=True,
152
  )
153
  inference_time = time.time() - t0
154
+ print(f"Tiempo de generación de audio: {round(inference_time*1000)} milisegundos")
155
  metrics_text += (
156
+ f"Tiempo de generación de audio: {round(inference_time*1000)} milisegundos\n"
157
  )
158
  real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
159
+ print(f"Factor de tiempo real (RTF): {real_time_factor}")
160
+ metrics_text += f"Factor de tiempo real (RTF): {real_time_factor:.2f}\n"
161
 
 
162
  keep_len = calculate_keep_len(prompt, language)
163
  out["wav"] = out["wav"][:keep_len]
164
 
 
166
 
167
  except RuntimeError as e:
168
  if "device-side assert" in str(e):
 
169
  print(
170
  f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
171
  flush=True,
172
  )
173
+ gr.Warning("Se ha producido un error. Por favor, inténtalo de nuevo en un minuto.")
174
  print("Cuda device-assert Runtime encountered need restart")
175
 
176
  error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
 
197
  repo_type="dataset",
198
  )
199
 
 
200
  print("Writing error reference audio")
201
  speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
202
  error_api = HfApi()
 
207
  repo_type="dataset",
208
  )
209
 
 
210
  space = api.get_space_runtime(repo_id=repo_id)
211
  if space.stage != "BUILDING":
212
  api.restart_space(repo_id=repo_id)
 
217
  if "Failed to decode" in str(e):
218
  print("Speaker encoding error", str(e))
219
  metrics_text = gr.Warning(
220
+ metrics_text="Parece que hay un problema con la referencia de audio. ¿Has activado el micrófono?"
221
  )
222
  else:
223
  print("RuntimeError: non device-side assert error:", str(e))
224
  metrics_text = gr.Warning(
225
+ "Se ha producido un error inesperado. Por favor, inténtalo de nuevo."
226
  )
227
  return (None, metrics_text)
228
  return ("output.wav", metrics_text)
 
234
  gr.Markdown(
235
  """
236
  # viXTTS Demo ✨
 
 
237
  """
238
  )
239
  with gr.Column():
 
240
  pass
241
 
242
  with gr.Row():
243
  with gr.Column():
244
  input_text_gr = gr.Textbox(
245
+ label="Texto a convertir a voz",
246
+ info="Cada frase debe tener al menos 10 palabras. Máximo 250 caracteres (alrededor de 2-3 frases).",
247
+ value="Hola, soy un modelo de texto a voz.",
248
  )
249
  language_gr = gr.Dropdown(
250
+ label="Idioma",
251
  choices=[
252
+ "es-AR",
253
  "vi",
254
  "en",
255
  "es",
 
270
  "hi",
271
  ],
272
  max_choices=1,
273
+ value="es-AR",
274
  )
275
  normalize_text = gr.Checkbox(
276
+ label="Normalizar texto en vietnamita",
277
+ info="Solo aplicable al idioma vietnamita",
278
  value=True,
279
  )
280
  ref_gr = gr.Audio(
281
+ label="Audio de referencia (opcional)",
282
  type="filepath",
283
  value="model/samples/nu-luu-loat.wav",
284
  )
285
  tts_button = gr.Button(
286
+ "Generar voz 🗣️🔥",
287
  elem_id="send-btn",
288
  visible=True,
289
  variant="primary",
290
  )
291
 
292
  with gr.Column():
293
+ audio_gr = gr.Audio(label="Audio generado", autoplay=True)
294
+ out_text_gr = gr.Text(label="Métricas")
295
 
296
  tts_button.click(
297
  predict,
 
306
  )
307
 
308
  demo.queue()
309
+ demo.launch(debug=True, show_api=True, share=True)