tuankg1028 committed on
Commit
a7389cf
·
1 Parent(s): 1cd339f

Revert "Refactors TTS inference to use RunPod API"

Browse files

This reverts commit d7589f0a37ad05d2b0268078ec0eabce6b93c92d.

Files changed (2) hide show
  1. app-hf.py +0 -453
  2. app.py +44 -79
app-hf.py DELETED
@@ -1,453 +0,0 @@
1
- import spaces
2
- import os
3
- from huggingface_hub import login
4
- import gradio as gr
5
- from cached_path import cached_path
6
- import tempfile
7
- from vinorm import TTSnorm
8
-
9
- from f5_tts.model import DiT
10
- from f5_tts.infer.utils_infer import (
11
- preprocess_ref_audio_text,
12
- load_vocoder,
13
- load_model,
14
- infer_process,
15
- save_spectrogram,
16
- )
17
-
18
- # Authentication credentials (in production, use environment variables or secure storage)
19
- VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
20
- VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
21
-
22
- # Retrieve token from secrets
23
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
24
-
25
- # Log in to Hugging Face
26
- if hf_token:
27
- login(token=hf_token)
28
-
29
- def authenticate(username, password):
30
- """Authenticate user credentials"""
31
- if username == VALID_USERNAME and password == VALID_PASSWORD:
32
- return True, gr.update(visible=False), gr.update(visible=True)
33
- else:
34
- return False, gr.update(visible=True), gr.update(visible=False)
35
-
36
- def logout():
37
- """Logout user and return to login page"""
38
- return gr.update(visible=True), gr.update(visible=False)
39
-
40
- def post_process(text):
41
- text = " " + text + " "
42
- text = text.replace(" . . ", " . ")
43
- text = " " + text + " "
44
- text = text.replace(" .. ", " . ")
45
- text = " " + text + " "
46
- text = text.replace(" , , ", " , ")
47
- text = " " + text + " "
48
- text = text.replace(" ,, ", " , ")
49
- text = " " + text + " "
50
- text = text.replace('"', "")
51
- return " ".join(text.split())
52
-
53
- # Load models
54
- vocoder = load_vocoder()
55
- model = load_model(
56
- DiT,
57
- dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
58
- ckpt_path=str(cached_path("hf://tuankg1028/vietvoices/model_420000.pt")),
59
- vocab_file=str(cached_path("hf://tuankg1028/vietvoices/vocab.txt")),
60
- )
61
-
62
- @spaces.GPU
63
- def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
64
-
65
- if not ref_audio_orig:
66
- raise gr.Error("Please upload a sample audio file.")
67
- if not gen_text.strip():
68
- raise gr.Error("Please enter the text content to generate voice.")
69
- if len(gen_text.split()) > 1000:
70
- raise gr.Error("Please enter text content with less than 1000 words.")
71
-
72
- try:
73
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
74
- final_wave, final_sample_rate, spectrogram = infer_process(
75
- ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
76
- )
77
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
78
- spectrogram_path = tmp_spectrogram.name
79
- save_spectrogram(spectrogram, spectrogram_path)
80
-
81
- return (final_sample_rate, final_wave), spectrogram_path
82
- except Exception as e:
83
- raise gr.Error(f"Error generating voice: {e}")
84
-
85
- # Gradio UI
86
- with gr.Blocks(
87
- theme=gr.themes.Base(
88
- primary_hue="red",
89
- secondary_hue="pink",
90
- neutral_hue="slate",
91
- ).set(
92
- body_background_fill="*neutral_950",
93
- body_text_color="*neutral_100",
94
- background_fill_primary="*neutral_900",
95
- background_fill_secondary="*neutral_800",
96
- ),
97
- css="""
98
- .gradio-container {
99
- background: #0f0f23 !important;
100
- color: #ffffff !important;
101
- }
102
- .login-container {
103
- max-width: 400px;
104
- margin: 50px auto;
105
- padding: 40px;
106
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
107
- border-radius: 20px;
108
- border: 1px solid rgba(255, 71, 87, 0.3);
109
- box-shadow: 0 10px 30px rgba(0, 0, 0, 0.5);
110
- }
111
- .login-header {
112
- text-align: center;
113
- margin-bottom: 30px;
114
- }
115
- .login-title {
116
- background: linear-gradient(45deg, #FFD700, #FFA500, #FF8C00);
117
- -webkit-background-clip: text;
118
- -webkit-text-fill-color: transparent;
119
- font-size: 2.5em !important;
120
- font-weight: bold;
121
- margin-bottom: 10px;
122
- }
123
- .login-subtitle {
124
- color: #a0a0a0;
125
- font-size: 1.1em;
126
- }
127
- .login-btn {
128
- background: linear-gradient(45deg, #ff4757 0%, #ff3838 50%, #ff6b7a 100%) !important;
129
- border: none !important;
130
- color: white !important;
131
- font-weight: bold !important;
132
- font-size: 1.1em !important;
133
- padding: 15px 30px !important;
134
- border-radius: 25px !important;
135
- box-shadow: 0 4px 15px rgba(255, 71, 87, 0.4) !important;
136
- transition: all 0.3s ease !important;
137
- width: 100% !important;
138
- }
139
- .login-btn:hover {
140
- transform: translateY(-2px) !important;
141
- box-shadow: 0 6px 20px rgba(255, 71, 87, 0.6) !important;
142
- }
143
- .logout-btn {
144
- background: linear-gradient(45deg, #666 0%, #555 50%, #444 100%) !important;
145
- border: none !important;
146
- color: white !important;
147
- font-weight: bold !important;
148
- padding: 10px 20px !important;
149
- border-radius: 20px !important;
150
- margin-bottom: 20px !important;
151
- }
152
- .main-header {
153
- display: flex;
154
- align-items: center;
155
- justify-content: center;
156
- padding: 20px;
157
- border-bottom: 1px solid rgba(255, 71, 87, 0.2);
158
- margin-bottom: 2em;
159
- }
160
- .logo-svg {
161
- width: 40px;
162
- height: 40px;
163
- color: #FFD700;
164
- animation: pulse 2s infinite;
165
- margin-right: 15px;
166
- }
167
- @keyframes pulse {
168
- 0%, 100% { opacity: 1; }
169
- 50% { opacity: 0.7; }
170
- }
171
- .logo-text {
172
- background: linear-gradient(45deg, #FFD700, #FFA500, #FF8C00);
173
- -webkit-background-clip: text;
174
- -webkit-text-fill-color: transparent;
175
- font-size: 2.8em !important;
176
- font-weight: bold;
177
- margin: 0;
178
- }
179
- .subtitle {
180
- text-align: center;
181
- color: #a0a0a0;
182
- font-size: 1.2em;
183
- margin-bottom: 2em;
184
- background: rgba(255, 71, 87, 0.1);
185
- padding: 15px;
186
- border-radius: 10px;
187
- border: 1px solid rgba(255, 71, 87, 0.2);
188
- }
189
- .feature-box {
190
- border: 1px solid #333;
191
- border-radius: 15px;
192
- padding: 20px;
193
- margin: 10px 0;
194
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
195
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
196
- }
197
- .generate-btn {
198
- background: linear-gradient(45deg, #ff4757 0%, #ff3838 50%, #ff6b7a 100%) !important;
199
- border: none !important;
200
- color: white !important;
201
- font-weight: bold !important;
202
- font-size: 1.2em !important;
203
- padding: 18px 40px !important;
204
- border-radius: 30px !important;
205
- box-shadow: 0 6px 20px rgba(255, 71, 87, 0.4) !important;
206
- transition: all 0.3s ease !important;
207
- text-transform: uppercase !important;
208
- letter-spacing: 1px !important;
209
- }
210
- .generate-btn:hover {
211
- transform: translateY(-3px) !important;
212
- box-shadow: 0 8px 25px rgba(255, 71, 87, 0.6) !important;
213
- background: linear-gradient(45deg, #ff6b7a 0%, #ff4757 50%, #ff3838 100%) !important;
214
- }
215
- .dark-card {
216
- background: rgba(255, 71, 87, 0.05) !important;
217
- border: 1px solid rgba(255, 71, 87, 0.2) !important;
218
- border-radius: 15px !important;
219
- padding: 20px !important;
220
- margin: 10px 0 !important;
221
- }
222
- .accent-text {
223
- color: #ff4757 !important;
224
- font-weight: bold !important;
225
- }
226
- """,
227
- title="VietVoices - Vietnamese AI Voice",
228
- ) as demo:
229
-
230
- # Authentication State
231
- auth_state = gr.State(False)
232
-
233
- # Login Interface
234
- with gr.Column(visible=True) as login_interface:
235
- gr.HTML("""
236
- <div class="login-container">
237
- <div class="login-header">
238
- <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
239
- <svg xmlns="http://www.w3.org/2000/svg" width="60" height="60" viewBox="0 0 24 24" fill="#FFD700" stroke="#FFD700" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="margin-right: 15px;">
240
- <path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path>
241
- <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
242
- <line x1="12" x2="12" y1="19" y2="22"></line>
243
- </svg>
244
- <h1 class="login-title">VietVoices</h1>
245
- </div>
246
- <p class="login-subtitle">🔐 Đăng nhập để sử dụng AI Voice</p>
247
- </div>
248
- """)
249
-
250
- with gr.Row():
251
- with gr.Column(scale=1):
252
- pass
253
- with gr.Column(scale=2):
254
- username_input = gr.Textbox(
255
- label="👤 Tên đăng nhập",
256
- placeholder="Nhập tên đăng nhập...",
257
- type="text"
258
- )
259
- password_input = gr.Textbox(
260
- label="🔑 Mật khẩu",
261
- placeholder="Nhập mật khẩu...",
262
- type="password"
263
- )
264
- login_btn = gr.Button("🚀 Đăng nhập", elem_classes="login-btn")
265
- login_status = gr.HTML("")
266
- with gr.Column(scale=1):
267
- pass
268
-
269
- gr.HTML("</div>")
270
-
271
- # Main Interface (initially hidden)
272
- with gr.Column(visible=False) as main_interface:
273
- # Logout button
274
- logout_btn = gr.Button("🚪 Đăng xuất", elem_classes="logout-btn")
275
-
276
- # Header Section with SVG Logo
277
- gr.HTML("""
278
- <div class="main-header">
279
- <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="#FFD700" stroke="#FFD700" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="logo-svg">
280
- <path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path>
281
- <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
282
- <line x1="12" x2="12" y1="19" y2="22"></line>
283
- </svg>
284
- <h1 class="logo-text">VietVoices</h1>
285
- </div>
286
- <div class="subtitle">
287
- 🇻🇳 Chuyển đổi văn bản thành giọng nói tự nhiên bằng AI 🇻🇳<br>
288
- <span style="color: #ff4757;">🚀 Công nghệ AI tiên tiến • 🎯 500K bước huấn luyện • ⚡ 150 giờ dữ liệu</span>
289
- </div>
290
- """)
291
-
292
- # Instructions Section
293
- with gr.Accordion("📋 Hướng dẫn sử dụng", open=False):
294
- gr.Markdown("""
295
- <div class="dark-card">
296
-
297
- ### 🎯 Hướng dẫn nhanh:
298
- 1. **🎵 Tải lên giọng mẫu**: Chọn file âm thanh rõ ràng (WAV/MP3) làm giọng tham chiếu
299
- 2. **✍️ Nhập văn bản**: Gõ văn bản tiếng Việt bạn muốn chuyển đổi
300
- 3. **⚡ Điều chỉnh tốc độ**: Tinh chỉnh tốc độ nói (0.3x đến 2.0x)
301
- 4. **🔥 Tạo giọng nói**: Nhấn nút và chờ AI tạo giọng nói!
302
-
303
- 💡 **Mẹo hay**: Sử dụng âm thanh tham chiếu rõ ràng, phát âm chuẩn để có kết quả tốt nhất!
304
-
305
- </div>
306
- """)
307
-
308
- # Main Input Section
309
- with gr.Row():
310
- with gr.Column(scale=1):
311
- gr.HTML('<div class="dark-card">')
312
- gr.Markdown("### 🎙️ <span class='accent-text'>Giọng nói mẫu</span>")
313
- ref_audio = gr.Audio(
314
- label="🔊 Tải lên giọng mẫu",
315
- type="filepath",
316
- sources=["upload", "microphone"],
317
- show_download_button=True
318
- )
319
- gr.HTML('</div>')
320
-
321
- with gr.Column(scale=2):
322
- gr.HTML('<div class="dark-card">')
323
- gr.Markdown("### 📝 <span class='accent-text'>Văn bản đầu vào</span>")
324
- gen_text = gr.Textbox(
325
- label="✏️ Nhập văn bản tiếng Việt",
326
- placeholder="Nhập văn bản tiếng Việt bạn muốn chuyển đổi thành giọng nói...",
327
- lines=4,
328
- max_lines=8
329
- )
330
-
331
- with gr.Row():
332
- speed = gr.Slider(
333
- 0.3, 2.0,
334
- value=1.0,
335
- step=0.1,
336
- label="⚡ Tốc độ giọng nói",
337
- info="Điều chỉnh tốc độ nói nhanh hay chậm"
338
- )
339
- word_count = gr.HTML("<span class='accent-text'>📊 Số từ: 0/1000</span>")
340
-
341
- gr.HTML('</div>')
342
-
343
- # Generate Button
344
- with gr.Row():
345
- with gr.Column():
346
- btn_synthesize = gr.Button(
347
- "🔥 Tạo giọng nói AI! ✨",
348
- elem_classes="generate-btn",
349
- size="lg"
350
- )
351
-
352
- # Output Section
353
- gr.Markdown("## 🎧 <span class='accent-text'>Kết quả tạo ra</span>")
354
- with gr.Row():
355
- with gr.Column():
356
- gr.HTML('<div class="dark-card">')
357
- gr.Markdown("### 🎵 Âm thanh đầu ra")
358
- output_audio = gr.Audio(
359
- label="🎧 Giọng nói được tạo ra",
360
- type="numpy",
361
- show_download_button=True,
362
- show_share_button=True
363
- )
364
- gr.HTML('</div>')
365
-
366
- with gr.Column():
367
- gr.HTML('<div class="dark-card">')
368
- gr.Markdown("### 📊 Phân tích âm thanh")
369
- output_spectrogram = gr.Image(
370
- label="📈 Phổ âm thanh",
371
- show_download_button=True
372
- )
373
- gr.HTML('</div>')
374
-
375
- # Model Information
376
- with gr.Accordion("🔬 Thông tin mô hình & Hạn chế", open=False):
377
- gr.HTML('<div class="dark-card">')
378
- gr.Markdown("""
379
- ### 🤖 <span class='accent-text'>Về mô hình VietVoices:</span>
380
- - 🏋️ **Huấn luyện**: 500,000 bước trên GPU RTX 3090
381
- - 📚 **Dữ liệu**: 150 giờ âm thanh tiếng Việt
382
- - 🎯 **Kiến trúc**: Mô hình DiT (Diffusion Transformer) tiên tiến
383
- - 🌟 **Chuyên biệt**: Tối ưu hóa cho tiếng Việt
384
- """)
385
-
386
- model_limitations = gr.Textbox(
387
- value="""🚨 Hạn chế hiện tại:
388
-
389
- 1. 🔢 Số & Ngày tháng: Có thể gặp khó khăn với nội dung số, ngày tháng và ký tự đặc biệt
390
- 2. 🎵 Nhịp điệu: Một số đầu ra có thể có nhịp điệu không nhất quán - sử dụng âm thanh tham chiếu rõ ràng
391
- 3. 🎤 Nhận dạng: Sử dụng Whisper-large-v3-turbo có thể nhận dạng sai văn bản tiếng Việt
392
- 4. 🎭 Nhân bản giọng: Độ chính xác nhân bản giọng nói không phải người Việt có thể thay đổi
393
- 5. 📄 Văn bản dài: Đoạn văn rất dài (>1000 từ) có thể tạo ra kết quả không tối ưu
394
-
395
- 💡 Để có kết quả tốt nhất: Sử dụng câu ngắn, rõ ràng với âm thanh tham chiếu tốt!""",
396
- label="⚠️ Lưu ý quan trọng",
397
- lines=8,
398
- interactive=False
399
- )
400
- gr.HTML('</div>')
401
-
402
- # Footer
403
- gr.HTML("""
404
- <div style="text-align: center; margin-top: 2em; padding: 20px; background: linear-gradient(45deg, rgba(255, 71, 87, 0.2) 0%, rgba(255, 107, 122, 0.2) 100%); border-radius: 15px; border: 1px solid rgba(255, 71, 87, 0.3);">
405
- <h3 style="color: #ff4757;">🌟 Tận hưởng giọng nói AI tiếng Việt! 🌟</h3>
406
- <p style="color: #a0a0a0;">Được tạo ra với ❤️ cho cộng đồng Việt Nam</p>
407
- </div>
408
- """)
409
-
410
- # JavaScript for word count
411
- gen_text.change(
412
- fn=lambda text: f"<span class='accent-text'>📊 Số từ: {len(text.split())}/1000</span>",
413
- inputs=[gen_text],
414
- outputs=[word_count]
415
- )
416
-
417
- btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
418
-
419
- # Authentication event handlers
420
- def handle_login(username, password):
421
- if username == VALID_USERNAME and password == VALID_PASSWORD:
422
- return (
423
- gr.update(visible=False), # Hide login interface
424
- gr.update(visible=True), # Show main interface
425
- "<p style='color: #4CAF50; text-align: center;'>✅ Đăng nhập thành công!</p>"
426
- )
427
- else:
428
- return (
429
- gr.update(visible=True), # Keep login interface visible
430
- gr.update(visible=False), # Keep main interface hidden
431
- "<p style='color: #ff4757; text-align: center;'>❌ Sai tên đăng nhập hoặc mật khẩu!</p>"
432
- )
433
-
434
- def handle_logout():
435
- return (
436
- gr.update(visible=True), # Show login interface
437
- gr.update(visible=False) # Hide main interface
438
- )
439
-
440
- # Bind events
441
- login_btn.click(
442
- handle_login,
443
- inputs=[username_input, password_input],
444
- outputs=[login_interface, main_interface, login_status]
445
- )
446
-
447
- logout_btn.click(
448
- handle_logout,
449
- outputs=[login_interface, main_interface]
450
- )
451
-
452
- # Run Gradio with share=True to get a gradio.live link
453
- demo.queue().launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,18 +1,24 @@
 
1
  import os
2
  from huggingface_hub import login
3
  import gradio as gr
 
4
  import tempfile
5
- import requests
6
- import base64
 
 
 
 
 
 
 
 
7
 
8
  # Authentication credentials (in production, use environment variables or secure storage)
9
  VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
10
  VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
11
 
12
- # RunPod API configuration
13
- RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY")
14
- RUNPOD_ENDPOINT_ID = os.getenv("RUNPOD_ENDPOINT_ID")
15
-
16
  # Retrieve token from secrets
17
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
 
@@ -31,10 +37,31 @@ def logout():
31
  """Logout user and return to login page"""
32
  return gr.update(visible=True), gr.update(visible=False)
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
35
- if not RUNPOD_API_KEY or not RUNPOD_ENDPOINT_ID:
36
- raise gr.Error("RunPod API configuration missing. Please check environment variables.")
37
-
38
  if not ref_audio_orig:
39
  raise gr.Error("Please upload a sample audio file.")
40
  if not gen_text.strip():
@@ -43,79 +70,17 @@ def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: g
43
  raise gr.Error("Please enter text content with less than 1000 words.")
44
 
45
  try:
46
- # Read and encode reference audio to base64
47
- with open(ref_audio_orig, "rb") as f:
48
- audio_data = f.read()
49
- audio_b64 = base64.b64encode(audio_data).decode('utf-8')
50
-
51
- # Prepare request payload
52
- payload = {
53
- "input": {
54
- "ref_audio": audio_b64,
55
- "gen_text": gen_text,
56
- "speed": speed
57
- }
58
- }
59
-
60
- # Make request to RunPod endpoint
61
- response = requests.post(
62
- f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync",
63
- headers={
64
- "Authorization": f"Bearer {RUNPOD_API_KEY}",
65
- "Content-Type": "application/json"
66
- },
67
- json=payload,
68
- timeout=300 # 5 minute timeout
69
  )
70
-
71
- if response.status_code != 200:
72
- raise gr.Error(f"RunPod API error: {response.status_code} - {response.text}")
73
-
74
- result = response.json()
75
-
76
- if "error" in result:
77
- raise gr.Error(f"RunPod processing error: {result['error']}")
78
-
79
- if 'output' not in result or 'audio_base64' not in result['output']:
80
- raise gr.Error("No audio data received from RunPod API")
81
-
82
- # Decode audio from base64
83
- audio_data = base64.b64decode(result['output']['audio_base64'])
84
- sample_rate = result['output'].get('sample_rate', 24000)
85
-
86
- # Save audio to temporary file and return as numpy array
87
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
88
- tmp_audio.write(audio_data)
89
- tmp_audio_path = tmp_audio.name
90
-
91
- # Load audio as numpy array for Gradio
92
- import soundfile as sf
93
- audio_array, sr = sf.read(tmp_audio_path)
94
-
95
- # Create a simple spectrogram placeholder since we don't have the actual spectrogram
96
- import numpy as np
97
- import matplotlib.pyplot as plt
98
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
99
- plt.figure(figsize=(10, 4))
100
- plt.specgram(audio_array, Fs=sr)
101
- plt.title("Audio Spectrogram")
102
- plt.xlabel("Time")
103
- plt.ylabel("Frequency")
104
- plt.savefig(tmp_spectrogram.name)
105
- plt.close()
106
  spectrogram_path = tmp_spectrogram.name
107
-
108
- # Clean up temporary audio file
109
- os.unlink(tmp_audio_path)
110
-
111
- return (sr, audio_array), spectrogram_path
112
-
113
- except requests.exceptions.Timeout:
114
- raise gr.Error("Request timeout. The audio generation took too long.")
115
- except requests.exceptions.RequestException as e:
116
- raise gr.Error(f"Network error: {str(e)}")
117
  except Exception as e:
118
- raise gr.Error(f"Error generating voice: {str(e)}")
119
 
120
  # Gradio UI
121
  with gr.Blocks(
@@ -485,4 +450,4 @@ with gr.Blocks(
485
  )
486
 
487
  # Run Gradio with share=True to get a gradio.live link
488
- demo.queue().launch(share=False)
 
1
+ import spaces
2
  import os
3
  from huggingface_hub import login
4
  import gradio as gr
5
+ from cached_path import cached_path
6
  import tempfile
7
+ from vinorm import TTSnorm
8
+
9
+ from f5_tts.model import DiT
10
+ from f5_tts.infer.utils_infer import (
11
+ preprocess_ref_audio_text,
12
+ load_vocoder,
13
+ load_model,
14
+ infer_process,
15
+ save_spectrogram,
16
+ )
17
 
18
  # Authentication credentials (in production, use environment variables or secure storage)
19
  VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
20
  VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
21
 
 
 
 
 
22
  # Retrieve token from secrets
23
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
24
 
 
37
  """Logout user and return to login page"""
38
  return gr.update(visible=True), gr.update(visible=False)
39
 
40
+ def post_process(text):
41
+ text = " " + text + " "
42
+ text = text.replace(" . . ", " . ")
43
+ text = " " + text + " "
44
+ text = text.replace(" .. ", " . ")
45
+ text = " " + text + " "
46
+ text = text.replace(" , , ", " , ")
47
+ text = " " + text + " "
48
+ text = text.replace(" ,, ", " , ")
49
+ text = " " + text + " "
50
+ text = text.replace('"', "")
51
+ return " ".join(text.split())
52
+
53
+ # Load models
54
+ vocoder = load_vocoder()
55
+ model = load_model(
56
+ DiT,
57
+ dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
58
+ ckpt_path=str(cached_path("hf://tuankg1028/vietvoices/model_420000.pt")),
59
+ vocab_file=str(cached_path("hf://tuankg1028/vietvoices/vocab.txt")),
60
+ )
61
+
62
+ @spaces.GPU
63
  def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
64
+
 
 
65
  if not ref_audio_orig:
66
  raise gr.Error("Please upload a sample audio file.")
67
  if not gen_text.strip():
 
70
  raise gr.Error("Please enter text content with less than 1000 words.")
71
 
72
  try:
73
+ ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
74
+ final_wave, final_sample_rate, spectrogram = infer_process(
75
+ ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
 
 
 
 
 
 
 
78
  spectrogram_path = tmp_spectrogram.name
79
+ save_spectrogram(spectrogram, spectrogram_path)
80
+
81
+ return (final_sample_rate, final_wave), spectrogram_path
 
 
 
 
 
 
 
82
  except Exception as e:
83
+ raise gr.Error(f"Error generating voice: {e}")
84
 
85
  # Gradio UI
86
  with gr.Blocks(
 
450
  )
451
 
452
  # Run Gradio with share=True to get a gradio.live link
453
+ demo.queue().launch(share=True)