Spaces:

tuankg1028
/

vietvoices

Sleeping

App Files Files Community

tuankg1028 committed on Jun 18

Commit

a7389cf

1 Parent(s): 1cd339f

Revert "Refactors TTS inference to use RunPod API"

Browse files

This reverts commit d7589f0a37ad05d2b0268078ec0eabce6b93c92d.

Files changed (2) hide show

app-hf.py +0 -453
app.py +44 -79

app-hf.py DELETED Viewed

@@ -1,453 +0,0 @@
-import spaces
-import os
-from huggingface_hub import login
-import gradio as gr
-from cached_path import cached_path
-import tempfile
-from vinorm import TTSnorm
-from f5_tts.model import DiT
-from f5_tts.infer.utils_infer import (
-    preprocess_ref_audio_text,
-    load_vocoder,
-    load_model,
-    infer_process,
-    save_spectrogram,
-)
-# Authentication credentials (in production, use environment variables or secure storage)
-VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
-VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
-# Retrieve token from secrets
-hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
-# Log in to Hugging Face
-if hf_token:
-    login(token=hf_token)
-def authenticate(username, password):
-    """Authenticate user credentials"""
-    if username == VALID_USERNAME and password == VALID_PASSWORD:
-        return True, gr.update(visible=False), gr.update(visible=True)
-    else:
-        return False, gr.update(visible=True), gr.update(visible=False)
-def logout():
-    """Logout user and return to login page"""
-    return gr.update(visible=True), gr.update(visible=False)
-def post_process(text):
-    text = " " + text + " "
-    text = text.replace(" . . ", " . ")
-    text = " " + text + " "
-    text = text.replace(" .. ", " . ")
-    text = " " + text + " "
-    text = text.replace(" , , ", " , ")
-    text = " " + text + " "
-    text = text.replace(" ,, ", " , ")
-    text = " " + text + " "
-    text = text.replace('"', "")
-    return " ".join(text.split())
-# Load models
-vocoder = load_vocoder()
-model = load_model(
-    DiT,
-    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
-    ckpt_path=str(cached_path("hf://tuankg1028/vietvoices/model_420000.pt")),
-    vocab_file=str(cached_path("hf://tuankg1028/vietvoices/vocab.txt")),
-)
-@spaces.GPU
-def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
-    if not ref_audio_orig:
-        raise gr.Error("Please upload a sample audio file.")
-    if not gen_text.strip():
-        raise gr.Error("Please enter the text content to generate voice.")
-    if len(gen_text.split()) > 1000:
-        raise gr.Error("Please enter text content with less than 1000 words.")
-    try:
-        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
-        final_wave, final_sample_rate, spectrogram = infer_process(
-            ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
-        )
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-            spectrogram_path = tmp_spectrogram.name
-            save_spectrogram(spectrogram, spectrogram_path)
-        return (final_sample_rate, final_wave), spectrogram_path
-    except Exception as e:
-        raise gr.Error(f"Error generating voice: {e}")
-# Gradio UI
-with gr.Blocks(
-    theme=gr.themes.Base(
-        primary_hue="red",
-        secondary_hue="pink",
-        neutral_hue="slate",
-    ).set(
-        body_background_fill="*neutral_950",
-        body_text_color="*neutral_100",
-        background_fill_primary="*neutral_900",
-        background_fill_secondary="*neutral_800",
-    ),
-    css="""
-    .gradio-container {
-        background: #0f0f23 !important;
-        color: #ffffff !important;
-    }
-    .login-container {
-        max-width: 400px;
-        margin: 50px auto;
-        padding: 40px;
-        background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
-        border-radius: 20px;
-        border: 1px solid rgba(255, 71, 87, 0.3);
-        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.5);
-    }
-    .login-header {
-        text-align: center;
-        margin-bottom: 30px;
-    }
-    .login-title {
-        background: linear-gradient(45deg, #FFD700, #FFA500, #FF8C00);
-        -webkit-background-clip: text;
-        -webkit-text-fill-color: transparent;
-        font-size: 2.5em !important;
-        font-weight: bold;
-        margin-bottom: 10px;
-    }
-    .login-subtitle {
-        color: #a0a0a0;
-        font-size: 1.1em;
-    }
-    .login-btn {
-        background: linear-gradient(45deg, #ff4757 0%, #ff3838 50%, #ff6b7a 100%) !important;
-        border: none !important;
-        color: white !important;
-        font-weight: bold !important;
-        font-size: 1.1em !important;
-        padding: 15px 30px !important;
-        border-radius: 25px !important;
-        box-shadow: 0 4px 15px rgba(255, 71, 87, 0.4) !important;
-        transition: all 0.3s ease !important;
-        width: 100% !important;
-    }
-    .login-btn:hover {
-        transform: translateY(-2px) !important;
-        box-shadow: 0 6px 20px rgba(255, 71, 87, 0.6) !important;
-    }
-    .logout-btn {
-        background: linear-gradient(45deg, #666 0%, #555 50%, #444 100%) !important;
-        border: none !important;
-        color: white !important;
-        font-weight: bold !important;
-        padding: 10px 20px !important;
-        border-radius: 20px !important;
-        margin-bottom: 20px !important;
-    }
-    .main-header {
-        display: flex;
-        align-items: center;
-        justify-content: center;
-        padding: 20px;
-        border-bottom: 1px solid rgba(255, 71, 87, 0.2);
-        margin-bottom: 2em;
-    }
-    .logo-svg {
-        width: 40px;
-        height: 40px;
-        color: #FFD700;
-        animation: pulse 2s infinite;
-        margin-right: 15px;
-    }
-    @keyframes pulse {
-        0%, 100% { opacity: 1; }
-        50% { opacity: 0.7; }
-    }
-    .logo-text {
-        background: linear-gradient(45deg, #FFD700, #FFA500, #FF8C00);
-        -webkit-background-clip: text;
-        -webkit-text-fill-color: transparent;
-        font-size: 2.8em !important;
-        font-weight: bold;
-        margin: 0;
-    }
-    .subtitle {
-        text-align: center;
-        color: #a0a0a0;
-        font-size: 1.2em;
-        margin-bottom: 2em;
-        background: rgba(255, 71, 87, 0.1);
-        padding: 15px;
-        border-radius: 10px;
-        border: 1px solid rgba(255, 71, 87, 0.2);
-    }
-    .feature-box {
-        border: 1px solid #333;
-        border-radius: 15px;
-        padding: 20px;
-        margin: 10px 0;
-        background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
-        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
-    }
-    .generate-btn {
-        background: linear-gradient(45deg, #ff4757 0%, #ff3838 50%, #ff6b7a 100%) !important;
-        border: none !important;
-        color: white !important;
-        font-weight: bold !important;
-        font-size: 1.2em !important;
-        padding: 18px 40px !important;
-        border-radius: 30px !important;
-        box-shadow: 0 6px 20px rgba(255, 71, 87, 0.4) !important;
-        transition: all 0.3s ease !important;
-        text-transform: uppercase !important;
-        letter-spacing: 1px !important;
-    }
-    .generate-btn:hover {
-        transform: translateY(-3px) !important;
-        box-shadow: 0 8px 25px rgba(255, 71, 87, 0.6) !important;
-        background: linear-gradient(45deg, #ff6b7a 0%, #ff4757 50%, #ff3838 100%) !important;
-    }
-    .dark-card {
-        background: rgba(255, 71, 87, 0.05) !important;
-        border: 1px solid rgba(255, 71, 87, 0.2) !important;
-        border-radius: 15px !important;
-        padding: 20px !important;
-        margin: 10px 0 !important;
-    }
-    .accent-text {
-        color: #ff4757 !important;
-        font-weight: bold !important;
-    }
-    """,
-    title="VietVoices - Vietnamese AI Voice",
-) as demo:
-    # Authentication State
-    auth_state = gr.State(False)
-    # Login Interface
-    with gr.Column(visible=True) as login_interface:
-        gr.HTML("""
-        <div class="login-container">
-            <div class="login-header">
-                <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
-                    <svg xmlns="http://www.w3.org/2000/svg" width="60" height="60" viewBox="0 0 24 24" fill="#FFD700" stroke="#FFD700" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="margin-right: 15px;">
-                        <path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path>
-                        <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
-                        <line x1="12" x2="12" y1="19" y2="22"></line>
-                    </svg>
-                    <h1 class="login-title">VietVoices</h1>
-                </div>
-                <p class="login-subtitle">🔐 Đăng nhập để sử dụng AI Voice</p>
-            </div>
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                pass
-            with gr.Column(scale=2):
-                username_input = gr.Textbox(
-                    label="👤 Tên đăng nhập",
-                    placeholder="Nhập tên đăng nhập...",
-                    type="text"
-                )
-                password_input = gr.Textbox(
-                    label="🔑 Mật khẩu",
-                    placeholder="Nhập mật khẩu...",
-                    type="password"
-                )
-                login_btn = gr.Button("🚀 Đăng nhập", elem_classes="login-btn")
-                login_status = gr.HTML("")
-            with gr.Column(scale=1):
-                pass
-        gr.HTML("</div>")
-    # Main Interface (initially hidden)
-    with gr.Column(visible=False) as main_interface:
-        # Logout button
-        logout_btn = gr.Button("🚪 Đăng xuất", elem_classes="logout-btn")
-        # Header Section with SVG Logo
-        gr.HTML("""
-        <div class="main-header">
-            <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="#FFD700" stroke="#FFD700" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="logo-svg">
-                <path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path>
-                <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
-                <line x1="12" x2="12" y1="19" y2="22"></line>
-            </svg>
-            <h1 class="logo-text">VietVoices</h1>
-        </div>
-        <div class="subtitle">
-            🇻🇳 Chuyển đổi văn bản thành giọng nói tự nhiên bằng AI 🇻🇳<br>
-            <span style="color: #ff4757;">🚀 Công nghệ AI tiên tiến • 🎯 500K bước huấn luyện • ⚡ 150 giờ dữ liệu</span>
-        </div>
-        """)
-        # Instructions Section
-        with gr.Accordion("📋 Hướng dẫn sử dụng", open=False):
-            gr.Markdown("""
-            <div class="dark-card">
-            ### 🎯 Hướng dẫn nhanh:
-            1. **🎵 Tải lên giọng mẫu**: Chọn file âm thanh rõ ràng (WAV/MP3) làm giọng tham chiếu
-            2. **✍️ Nhập văn bản**: Gõ văn bản tiếng Việt bạn muốn chuyển đổi
-            3. **⚡ Điều chỉnh tốc độ**: Tinh chỉnh tốc độ nói (0.3x đến 2.0x)
-            4. **🔥 Tạo giọng nói**: Nhấn nút và chờ AI tạo giọng nói!
-            💡 **Mẹo hay**: Sử dụng âm thanh tham chiếu rõ ràng, phát âm chuẩn để có kết quả tốt nhất!
-            </div>
-            """)
-        # Main Input Section
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.HTML('<div class="dark-card">')
-                gr.Markdown("### 🎙️ <span class='accent-text'>Giọng nói mẫu</span>")
-                ref_audio = gr.Audio(
-                    label="🔊 Tải lên giọng mẫu",
-                    type="filepath",
-                    sources=["upload", "microphone"],
-                    show_download_button=True
-                )
-                gr.HTML('</div>')
-            with gr.Column(scale=2):
-                gr.HTML('<div class="dark-card">')
-                gr.Markdown("### 📝 <span class='accent-text'>Văn bản đầu vào</span>")
-                gen_text = gr.Textbox(
-                    label="✏️ Nhập văn bản tiếng Việt",
-                    placeholder="Nhập văn bản tiếng Việt bạn muốn chuyển đổi thành giọng nói...",
-                    lines=4,
-                    max_lines=8
-                )
-                with gr.Row():
-                    speed = gr.Slider(
-                        0.3, 2.0,
-                        value=1.0,
-                        step=0.1,
-                        label="⚡ Tốc độ giọng nói",
-                        info="Điều chỉnh tốc độ nói nhanh hay chậm"
-                    )
-                    word_count = gr.HTML("<span class='accent-text'>📊 Số từ: 0/1000</span>")
-                gr.HTML('</div>')
-        # Generate Button
-        with gr.Row():
-            with gr.Column():
-                btn_synthesize = gr.Button(
-                    "🔥 Tạo giọng nói AI! ✨",
-                    elem_classes="generate-btn",
-                    size="lg"
-                )
-        # Output Section
-        gr.Markdown("## 🎧 <span class='accent-text'>Kết quả tạo ra</span>")
-        with gr.Row():
-            with gr.Column():
-                gr.HTML('<div class="dark-card">')
-                gr.Markdown("### 🎵 Âm thanh đầu ra")
-                output_audio = gr.Audio(
-                    label="🎧 Giọng nói được tạo ra",
-                    type="numpy",
-                    show_download_button=True,
-                    show_share_button=True
-                )
-                gr.HTML('</div>')
-            with gr.Column():
-                gr.HTML('<div class="dark-card">')
-                gr.Markdown("### 📊 Phân tích âm thanh")
-                output_spectrogram = gr.Image(
-                    label="📈 Phổ âm thanh",
-                    show_download_button=True
-                )
-                gr.HTML('</div>')
-        # Model Information
-        with gr.Accordion("🔬 Thông tin mô hình & Hạn chế", open=False):
-            gr.HTML('<div class="dark-card">')
-            gr.Markdown("""
-            ### 🤖 <span class='accent-text'>Về mô hình VietVoices:</span>
-            - 🏋️ **Huấn luyện**: 500,000 bước trên GPU RTX 3090
-            - 📚 **Dữ liệu**: 150 giờ âm thanh tiếng Việt
-            - 🎯 **Kiến trúc**: Mô hình DiT (Diffusion Transformer) tiên tiến
-            - 🌟 **Chuyên biệt**: Tối ưu hóa cho tiếng Việt
-            """)
-            model_limitations = gr.Textbox(
-                value="""🚨 Hạn chế hiện tại:
-    1. 🔢 Số & Ngày tháng: Có thể gặp khó khăn với nội dung số, ngày tháng và ký tự đặc biệt
-    2. 🎵 Nhịp điệu: Một số đầu ra có thể có nhịp điệu không nhất quán - sử dụng âm thanh tham chiếu rõ ràng
-    3. 🎤 Nhận dạng: Sử dụng Whisper-large-v3-turbo có thể nhận dạng sai văn bản tiếng Việt
-    4. 🎭 Nhân bản giọng: Độ chính xác nhân bản giọng nói không phải người Việt có thể thay đổi
-    5. 📄 Văn bản dài: Đoạn văn rất dài (>1000 từ) có thể tạo ra kết quả không tối ưu
-    💡 Để có kết quả tốt nhất: Sử dụng câu ngắn, rõ ràng với âm thanh tham chiếu tốt!""",
-                label="⚠️ Lưu ý quan trọng",
-                lines=8,
-                interactive=False
-            )
-            gr.HTML('</div>')
-        # Footer
-        gr.HTML("""
-        <div style="text-align: center; margin-top: 2em; padding: 20px; background: linear-gradient(45deg, rgba(255, 71, 87, 0.2) 0%, rgba(255, 107, 122, 0.2) 100%); border-radius: 15px; border: 1px solid rgba(255, 71, 87, 0.3);">
-            <h3 style="color: #ff4757;">🌟 Tận hưởng giọng nói AI tiếng Việt! 🌟</h3>
-            <p style="color: #a0a0a0;">Được tạo ra với ❤️ cho cộng đồng Việt Nam</p>
-        </div>
-        """)
-        # Update the word counter whenever the input text changes
-        gen_text.change(
-            fn=lambda text: f"<span class='accent-text'>📊 Số từ: {len(text.split())}/1000</span>",
-            inputs=[gen_text],
-            outputs=[word_count]
-        )
-        btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
-    # Authentication event handlers
-    def handle_login(username, password):
-        if username == VALID_USERNAME and password == VALID_PASSWORD:
-            return (
-                gr.update(visible=False),  # Hide login interface
-                gr.update(visible=True),   # Show main interface
-                "<p style='color: #4CAF50; text-align: center;'>✅ Đăng nhập thành công!</p>"
-            )
-        else:
-            return (
-                gr.update(visible=True),   # Keep login interface visible
-                gr.update(visible=False),  # Keep main interface hidden
-                "<p style='color: #ff4757; text-align: center;'>❌ Sai tên đăng nhập hoặc mật khẩu!</p>"
-            )
-    def handle_logout():
-        return (
-            gr.update(visible=True),   # Show login interface
-            gr.update(visible=False)   # Hide main interface
-        )
-    # Bind events
-    login_btn.click(
-        handle_login,
-        inputs=[username_input, password_input],
-        outputs=[login_interface, main_interface, login_status]
-    )
-    logout_btn.click(
-        handle_logout,
-        outputs=[login_interface, main_interface]
-    )
-# Run Gradio with share=True to get a gradio.live link
-demo.queue().launch(share=True)

app.py CHANGED Viewed

@@ -1,18 +1,24 @@
 import os
 from huggingface_hub import login
 import gradio as gr
 import tempfile
-import requests
-import base64
 # Authentication credentials (in production, use environment variables or secure storage)
 VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
 VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
-# RunPod API configuration
-RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY")
-RUNPOD_ENDPOINT_ID = os.getenv("RUNPOD_ENDPOINT_ID")
 # Retrieve token from secrets
 hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
@@ -31,10 +37,31 @@ def logout():
     """Logout user and return to login page"""
     return gr.update(visible=True), gr.update(visible=False)
 def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
-    if not RUNPOD_API_KEY or not RUNPOD_ENDPOINT_ID:
-        raise gr.Error("RunPod API configuration missing. Please check environment variables.")
     if not ref_audio_orig:
         raise gr.Error("Please upload a sample audio file.")
     if not gen_text.strip():
@@ -43,79 +70,17 @@ def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: g
         raise gr.Error("Please enter text content with less than 1000 words.")
     try:
-        # Read and encode reference audio to base64
-        with open(ref_audio_orig, "rb") as f:
-            audio_data = f.read()
-            audio_b64 = base64.b64encode(audio_data).decode('utf-8')
-        # Prepare request payload
-        payload = {
-            "input": {
-                "ref_audio": audio_b64,
-                "gen_text": gen_text,
-                "speed": speed
-            }
-        }
-        # Make request to RunPod endpoint
-        response = requests.post(
-            f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync",
-            headers={
-                "Authorization": f"Bearer {RUNPOD_API_KEY}",
-                "Content-Type": "application/json"
-            },
-            json=payload,
-            timeout=300  # 5 minute timeout
         )
-        if response.status_code != 200:
-            raise gr.Error(f"RunPod API error: {response.status_code} - {response.text}")
-        result = response.json()
-        if "error" in result:
-            raise gr.Error(f"RunPod processing error: {result['error']}")
-        if 'output' not in result or 'audio_base64' not in result['output']:
-            raise gr.Error("No audio data received from RunPod API")
-        # Decode audio from base64
-        audio_data = base64.b64decode(result['output']['audio_base64'])
-        sample_rate = result['output'].get('sample_rate', 24000)
-        # Save audio to temporary file and return as numpy array
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
-            tmp_audio.write(audio_data)
-            tmp_audio_path = tmp_audio.name
-        # Load audio as numpy array for Gradio
-        import soundfile as sf
-        audio_array, sr = sf.read(tmp_audio_path)
-        # Create a simple spectrogram placeholder since we don't have the actual spectrogram
-        import numpy as np
-        import matplotlib.pyplot as plt
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-            plt.figure(figsize=(10, 4))
-            plt.specgram(audio_array, Fs=sr)
-            plt.title("Audio Spectrogram")
-            plt.xlabel("Time")
-            plt.ylabel("Frequency")
-            plt.savefig(tmp_spectrogram.name)
-            plt.close()
             spectrogram_path = tmp_spectrogram.name
-        # Clean up temporary audio file
-        os.unlink(tmp_audio_path)
-        return (sr, audio_array), spectrogram_path
-    except requests.exceptions.Timeout:
-        raise gr.Error("Request timeout. The audio generation took too long.")
-    except requests.exceptions.RequestException as e:
-        raise gr.Error(f"Network error: {str(e)}")
     except Exception as e:
-        raise gr.Error(f"Error generating voice: {str(e)}")
 # Gradio UI
 with gr.Blocks(
@@ -485,4 +450,4 @@ with gr.Blocks(
     )
 # Run Gradio with share=True to get a gradio.live link
-demo.queue().launch(share=False)

+import spaces
 import os
 from huggingface_hub import login
 import gradio as gr
+from cached_path import cached_path
 import tempfile
+from vinorm import TTSnorm
+from f5_tts.model import DiT
+from f5_tts.infer.utils_infer import (
+    preprocess_ref_audio_text,
+    load_vocoder,
+    load_model,
+    infer_process,
+    save_spectrogram,
+)
 # Authentication credentials (in production, use environment variables or secure storage)
 VALID_USERNAME = os.getenv("AUTH_USERNAME", "admin")
 VALID_PASSWORD = os.getenv("AUTH_PASSWORD", "password123")
 # Retrieve token from secrets
 hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
     """Logout user and return to login page"""
     return gr.update(visible=True), gr.update(visible=False)
+def post_process(text):
+    text = " " + text + " "
+    text = text.replace(" . . ", " . ")
+    text = " " + text + " "
+    text = text.replace(" .. ", " . ")
+    text = " " + text + " "
+    text = text.replace(" , , ", " , ")
+    text = " " + text + " "
+    text = text.replace(" ,, ", " , ")
+    text = " " + text + " "
+    text = text.replace('"', "")
+    return " ".join(text.split())
+# Load models
+vocoder = load_vocoder()
+model = load_model(
+    DiT,
+    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
+    ckpt_path=str(cached_path("hf://tuankg1028/vietvoices/model_420000.pt")),
+    vocab_file=str(cached_path("hf://tuankg1028/vietvoices/vocab.txt")),
+)
+@spaces.GPU
 def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
     if not ref_audio_orig:
         raise gr.Error("Please upload a sample audio file.")
     if not gen_text.strip():
         raise gr.Error("Please enter text content with less than 1000 words.")
     try:
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
+        final_wave, final_sample_rate, spectrogram = infer_process(
+            ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
         )
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
             spectrogram_path = tmp_spectrogram.name
+            save_spectrogram(spectrogram, spectrogram_path)
+        return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
+        raise gr.Error(f"Error generating voice: {e}")
 # Gradio UI
 with gr.Blocks(
     )
 # Run Gradio with share=True to get a gradio.live link
+demo.queue().launch(share=True)