Spaces:

rbgo
/

Open-Source-TTS-Gallary

Running

App Files Community

rbgo committed 9 days ago

Commit

a8a8f61

verified ·

1 Parent(s): 30c775e

Update app.py

Browse files

Files changed (1) hide show

app.py +367 -131

app.py CHANGED Viewed

@@ -1,152 +1,388 @@
 # ---------------------------------------------------------------
-# app.py – “TTS Showcase” (static-audio-only Streamlit demo)
 # ---------------------------------------------------------------
 import os
-import streamlit as st
-# ---------- 1. Page-wide settings ----------
-st.set_page_config(
-    page_title="🔊 TTS Showcase",
-    page_icon="🎧",
-    layout="wide"
-)
-# ---------- 2. Demo metadata ----------
 MODELS = {
-    "nari-labs/Dia-1.6B"              : "Dia-1.6B",
-    "hexgrad/Kokoro-82M"              : "Kokoro 82M",
-    "sesame/csm-1b"                   : "CSM 1B",
-    "SparkAudio/Spark-TTS-0.5B"       : "Spark-TTS 0.5B",
-    "canopylabs/orpheus-3b-0.1-ft"    : "Orpheus3b-0.1-ft",
-    "SWivid/F5-TTS"                   : "F5-TTS",
-    "Zyphra/Zonos-v0.1-transformer"   : "Zonos v0.1",
-    "coqui/XTTS-v2"                   : "XTTS-v2",
-    "HKUSTAudio/Llasa-3B"             : "Llasa 3B",
-    "amphion/MaskGCT"                 : "MaskGCT",
-    "OuteAI/Llama-OuteTTS-1.0-1B"     : "Llama-OuteTTS-1.0-1B",
-    "ByteDance/MegaTTS3"              : "MegaTTS 3"
 }
 # Folder that contains subfolders with the audio clips
-SAMPLES_DIR = "samples"                      # <- change if yours is different
-CLIP_NAME   = "generated-audio.wav"          # <- your agreed filename
-# ---------- 3. Light CSS glow-up ----------
-st.markdown(
-    """
-    <style>
-    /* Wider central column & soft grey background */
-    .block-container { padding-top: 2rem; }
-    body              { background: #f5f7fa; }
-    /* Simple card look */
-    .tts-card {
-        background: #ffffff;
-        border-radius: 12px;
-        padding: 1.2rem 1rem;
-        box-shadow: 0 2px 8px rgba(0,0,0,.04);
-        margin-bottom: 1.5rem;
-    }
-    .tts-title {
-        font-weight: 600;
-        font-size: 1.05rem;
-        margin-bottom: .5rem;
-    }
-    audio { width: 100%; }   /* Full-width players */
-    </style>
-    """,
-    unsafe_allow_html=True
-)
-st.markdown(
-    """
-    <style>
-    /* (-- existing styles here --) */
-    /* ---------- Inferless banner ---------- */
-    #inferless-banner{
-        display:flex;
-        align-items:center;
-        gap:.5rem;
-        margin-top:2rem;
-        font-size:.85rem;
-        color:#555;
-        opacity:.8;
-    }
-    #inferless-banner img{
-        height:24px;            /* 👈 nice & small */
-        width:24px;
-        object-fit:contain;
-        border-radius:4px;      /* optional: soft corners */
-    }
-    .inferless-text{
-        letter-spacing:.2px;
-        font-weight:500;
-    }
-    </style>
-    """,
-    unsafe_allow_html=True
-)
-st.markdown(
-    """
-    <div id="inferless-banner">
-        <img src="https://i.tracxn.com/logo/company/1678863153264_9e6a9a4d-b955-42b3-895e-b94ade13c997.jpeg?format=webp&height=120&width=120" alt="Inferless Logo">
-        <div class="inferless-text">Powered by Inferless</div>
     </div>
-    """,
-    unsafe_allow_html=True
-)
-# ---------- 4. Header & optional quick-filter ----------
-st.title("🎙️ Open-Source Text to Speech Model Gallery")
-with st.expander("ℹ️ About this demo", expanded=True):
-    st.write(
-        """
-        * 12 popular TTS checkpoints, each with a single **_pre-synthesised_** sample
-        """
-    )
-filter_text = st.text_input(
-    "Filter models… (e.g. “coqui” or “3B”)",
-    placeholder="Search Model",
-    label_visibility="collapsed"
-).lower().strip()
-# ---------- 5. Render cards in a responsive 3-column grid ----------
-COLS_PER_ROW = 3
-cols = st.columns(COLS_PER_ROW)
-def repo_to_slug(repo: str) -> str:
-    """huggingface/xxx -> huggingface_xxx (for folder naming)."""
-    return repo.replace("/", "_")
-visible_models = [
-    (repo, nice_name)
-    for repo, nice_name in MODELS.items()
-    if filter_text in repo.lower() or filter_text in nice_name.lower()
-]
-if not visible_models:
-    st.warning("No models match that filter.")
-else:
-    for idx, (repo, display_name) in enumerate(visible_models):
-        with cols[idx % COLS_PER_ROW]:
-            with st.container():
-                st.markdown("<div class='tts-card'>", unsafe_allow_html=True)
-                st.markdown(f"<div class='tts-title'>🎧 {display_name}</div>", unsafe_allow_html=True)
-                # Resolved path: samples/<repo-as-slug>/generated-audio.wav
-                audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
-                if os.path.isfile(audio_path):
-                    st.audio(audio_path)
-                else:
-                    st.error("Sample clip not found 🤷‍♂️")
-                st.markdown("</div>", unsafe_allow_html=True)

 # ---------------------------------------------------------------
+# app.py – "TTS Showcase" (Gradio Implementation)
 # ---------------------------------------------------------------
 import os
+import gradio as gr
+# ---------- 1. Demo metadata ----------
+# Maps each model's repo id ("owner/name") to the display name shown in the UI.
+# The same repo ids key MODEL_RATINGS and MODEL_DESCRIPTIONS, and
+# repo_to_slug() turns them into the per-model sample folder name.
 MODELS = {
+    "nari-labs/Dia-1.6B": "Dia-1.6B",
+    "hexgrad/Kokoro-82M": "Kokoro-82M",
+    "sesame/csm-1b": "csm-1b",
+    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
+    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
+    "SWivid/F5-TTS": "F5-TTS",
+    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
+    "coqui/XTTS-v2": "XTTS-v2",
+    "HKUSTAudio/Llasa-3B": "Llasa-3B",
+    "amphion/MaskGCT": "MaskGCT",
+    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
+    "ByteDance/MegaTTS3": "MegaTTS3"
+}
+# Performance ratings for each model, keyed by the same repo ids as MODELS.
+# Every entry has the keys "naturalness", "intelligibility" and
+# "controllability"; the values used here are "Excellent", "Good" or "Moderate".
+MODEL_RATINGS = {
+    "nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
+    "hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
+    "sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
+    "SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
+    "canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
+    "SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
+    "Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
+    "coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
+    "HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
+    "amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
+    "OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
+    "ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"}
+}
+# One-line model descriptions, keyed by the same repo ids as MODELS.
+# NOTE(review): in the visible code the card HTML built by create_model_card()
+# does not include this text - confirm whether it is meant to be displayed.
+MODEL_DESCRIPTIONS = {
+    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
+    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
+    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
+    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
+    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
+    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
+    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
+    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
+    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
+    "amphion/MaskGCT": "Masked generative modeling approach",
+    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
+    "ByteDance/MegaTTS3": "Industrial-grade TTS solution"
 }
 # Folder that contains subfolders with the audio clips
+# (get_audio_path() looks in SAMPLES_DIR/<repo_to_slug(repo)>/CLIP_NAME)
+SAMPLES_DIR = "samples"
+# File name of the clip expected inside each model's subfolder
+CLIP_NAME = "generated-audio.wav"
+# Test prompt used for evaluation
+# NOTE(review): in the visible code this constant is referenced only from a
+# commented-out gr.HTML block; the methodology accordion repeats the same
+# sentence as a literal, so the two can drift apart.
+TEST_PROMPT = "Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!"
+def repo_to_slug(repo: str) -> str:
+    """Return the folder-name form of *repo*: every "/" becomes "_"."""
+    # Splitting on "/" and re-joining with "_" is equivalent to str.replace.
+    return "_".join(repo.split("/"))
+def get_rating_emoji(rating: str) -> str:
+    """Map a rating label to its coloured-circle emoji.
+
+    "Excellent" gives 🟢 and "Good" gives 🟡; any other value gives 🟠.
+    """
+    for label, emoji in (("Excellent", "🟢"), ("Good", "🟡")):
+        if rating == label:
+            return emoji
+    # Fallback for "Moderate" and for any unrecognised label.
+    return "🟠"
+def get_audio_path(repo: str) -> "str | None":
+    """Return the path of the sample clip for *repo*, or None when it is absent.
+
+    The candidate path is SAMPLES_DIR/<repo_to_slug(repo)>/CLIP_NAME; it is
+    returned only when os.path.isfile() reports a file at that location.
+    """
+    # The return annotation is quoted so it is not evaluated at definition
+    # time and stays valid on interpreters without the "X | Y" syntax.
+    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
+    return audio_path if os.path.isfile(audio_path) else None
+def filter_models(search_term: str):
+    """Return the repo ids whose id or display name contains *search_term*.
+
+    Matching is case-insensitive and ignores surrounding whitespace. An empty,
+    whitespace-only or None search term selects every model in MODELS.
+    """
+    # Treat None like an empty search instead of raising AttributeError on
+    # .strip(), and normalise the term once.
+    needle = (search_term or "").strip().lower()
+    if not needle:
+        return list(MODELS)
+    return [
+        repo for repo, name in MODELS.items()
+        if needle in repo.lower() or needle in name.lower()
+    ]
+def create_model_card(repo: str) -> str:
+    """Build the HTML card for *repo*: display name, three ratings and repo id.
+
+    A rating missing from MODEL_RATINGS falls back to "Moderate". Raises
+    KeyError when *repo* is not a key of MODELS.
+    """
+    display_name = MODELS[repo]
+    ratings = MODEL_RATINGS.get(repo, {})
+    # Resolve each rating once; the template needs both the label and its emoji.
+    naturalness = ratings.get('naturalness', 'Moderate')
+    intelligibility = ratings.get('intelligibility', 'Moderate')
+    controllability = ratings.get('controllability', 'Moderate')
+    # NOTE(review): the card does not render MODEL_DESCRIPTIONS[repo]; add it to
+    # the template below if descriptions are meant to be shown.
+    card_html = f"""
+    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
+        <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
+        <div style="display: flex; gap: 15px; margin: 15px 0;">
+            <span style="color: #888;"><strong style="color: #888;">Naturalness:</strong> {get_rating_emoji(naturalness)} {naturalness}</span>
+            <span style="color: #888;"><strong style="color: #888;">Intelligibility:</strong> {get_rating_emoji(intelligibility)} {intelligibility}</span>
+            <span style="color: #888;"><strong style="color: #888;">Controllability:</strong> {get_rating_emoji(controllability)} {controllability}</span>
+        </div>
+        <p style="font-size: 0.9em; color: #888; margin: 5px 0;">Repository: <code style="color: #888;">{repo}</code></p>
     </div>
+    """
+    return card_html
+# ---------- 2. Custom CSS ----------
+# Stylesheet handed to gr.Blocks(css=...). Where the selectors are used in the
+# visible code:
+#   #title, #intro-section  -> ids inside the header / intro gr.HTML blocks
+#   .model-card             -> class on the <div> built by create_model_card()
+#   #test-prompt, #footer   -> ids that occur only in commented-out gr.HTML blocks
+#   .model-grid, .mission-text -> NOTE(review): no element in the visible code
+#                              carries these classes - confirm they are still needed
+custom_css = """
+#title {
+    text-align: center;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 2rem;
+    border-radius: 15px;
+    margin-bottom: 2rem;
+}
+#intro-section {
+    background: #f8f9fa;
+    color: #2c3e50;
+    padding: 1.5rem;
+    border-radius: 10px;
+    margin: 1rem 0;
+    border-left: 4px solid #667eea;
+}
+#intro-section h2,
+#intro-section h3 {
+    color: #2c3e50;
+}
+#intro-section p {
+    color: #34495e;
+}
+#intro-section ul li {
+    color: #34495e;
+}
+#intro-section .mission-text {
+    color: #667eea !important;
+    font-weight: bold;
+    text-align: center;
+}
+#intro-section strong {
+    color: #2c3e50 !important;
+}
+#intro-section em {
+    color: #2c3e50 !important;
+}
+#intro-section .mission-text strong {
+    color: #667eea !important;
+}
+#test-prompt {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 1.5rem;
+    border-radius: 10px;
+    text-align: center;
+    margin: 1rem 0;
+}
+.model-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
+    gap: 1rem;
+    margin: 1rem 0;
+}
+#footer {
+    text-align: center;
+    padding: 2rem;
+    color: #666;
+    border-top: 1px solid #eee;
+    margin-top: 2rem;
+}
+/* make all the text in our white‐background cards dark */
+.model-grid .gr-html * {
+  color: #2c3e50 !important;
+}
+.model-card {
+  background: white;
+  color: #2c3e50 !important;
+  border: 1px solid #ddd;
+  border-radius: 12px;
+  padding: 20px;
+  margin: 10px 0;
+}
+"""
+# ---------- 3. Main Gradio Interface ----------
+def create_interface():
+    """Build and return the gr.Blocks app for the TTS gallery.
+
+    Layout, top to bottom: title banner, intro text, a row of three
+    evaluation-criteria tiles, a search box with a Clear button, one
+    card + audio player per entry in MODELS, and a collapsed methodology
+    accordion. The returned object is not launched here.
+    """
+    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:
+        # Header Section
+        gr.HTML("""
+        <div id="title">
+            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
+        </div>
+        """)
+        # Introduction Section
+        gr.HTML("""
+        <div id="intro-section">
+            <h3>🔬 Our Exciting Quest</h3>
+            <p>We're on a thrilling journey to help developers discover the perfect TTS models for their innovative audio projects!
+            We've put these 12 cutting-edge models through their paces using a scientifically designed universal test prompt.</p>
+            <p><strong>Featured TTS Engines:</strong></p>
+            <ul>
+                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
+                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
+                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
+                <li>🎵 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
+                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
+                <li>🎤 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
+                <li><em>...and 6 more incredible models!</em></li>
+            </ul>
+        </div>
+        """)
+        # Test Prompt Section
+        # gr.HTML(f"""
+        # <div id="test-prompt">
+        #     <h3>🎯 Universal Test Prompt</h3>
+        #     <p style="font-style: italic; font-size: 1.1em;">"{TEST_PROMPT}"</p>
+        #     <p style="font-size: 0.9em; opacity: 0.9;">
+        #         Carefully crafted to test naturalness, intelligibility, and technical pronunciation across all models
+        #     </p>
+        # </div>
+        # """)
+        # Evaluation Criteria
+        with gr.Row():
+            with gr.Column():
+                gr.HTML("""
+                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
+                    <div style="font-size: 2rem;">🎭</div>
+                    <strong>Naturalness</strong><br>
+                    <small>Human-like quality & emotional expression</small>
+                </div>
+                """)
+            with gr.Column():
+                gr.HTML("""
+                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
+                    <div style="font-size: 2rem;">🗣️</div>
+                    <strong>Intelligibility</strong><br>
+                    <small>Clarity & pronunciation accuracy</small>
+                </div>
+                """)
+            with gr.Column():
+                gr.HTML("""
+                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
+                    <div style="font-size: 2rem;">🎛️</div>
+                    <strong>Controllability</strong><br>
+                    <small>Tone, pace & parameter flexibility</small>
+                </div>
+                """)
+        gr.Markdown("---")
+        # Search and Filter Section
+        with gr.Row():
+            search_box = gr.Textbox(
+                label="🔍 Search Models",
+                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
+                value="",
+                scale=3
+            )
+            clear_btn = gr.Button("Clear", scale=1)
+        # Model Gallery Section
+        gr.Markdown("## 🎧 Model Gallery")
+        # Create model cards and audio players
+        # Each entry is (repo, card component, player component); the search
+        # handler below walks this list to toggle visibility.
+        model_components = []
+        for repo, display_name in MODELS.items():
+            with gr.Group():
+                # Model information card
+                model_info = gr.HTML(create_model_card(repo))
+                # Audio player
+                audio_path = get_audio_path(repo)
+                if audio_path:
+                    audio_player = gr.Audio(
+                        value=audio_path,
+                        label=f"🎵 {display_name} Audio Sample",
+                        interactive=False
+                    )
+                else:
+                    # No clip on disk: show a red notice in place of the player.
+                    audio_player = gr.HTML(f"<p style='color: red;'>🤷‍♂️ Audio sample not found for {display_name}</p>")
+                model_components.append((repo, model_info, audio_player))
+        # Search functionality
+        # Returns two gr.update objects per model (card, then player), in the
+        # same order as the flattened `outputs` list passed to search_box.change.
+        # NOTE(review): only the card and player are toggled; the enclosing
+        # gr.Group is not among the outputs - confirm that filtered-out models
+        # do not leave empty group containers on the page.
+        def update_visibility(search_term):
+            filtered_repos = filter_models(search_term)
+            updates = []
+            for repo, model_info, audio_player in model_components:
+                visible = repo in filtered_repos
+                updates.extend([
+                    gr.update(visible=visible),  # model_info
+                    gr.update(visible=visible)   # audio_player
+                ])
+            return updates
+        # Connect search functionality
+        search_box.change(
+            fn=update_visibility,
+            inputs=[search_box],
+            outputs=[comp for repo, model_info, audio_player in model_components for comp in [model_info, audio_player]]
+        )
+        # Clear resets the search text to "".
+        # NOTE(review): presumably this programmatic update also fires
+        # search_box.change so every model is shown again - confirm.
+        clear_btn.click(
+            fn=lambda: "",
+            outputs=[search_box]
+        )
+        # Methodology Section
+        with gr.Accordion("📋 Detailed Evaluation Methodology", open=False):
+            gr.Markdown("""
+            ### Test Prompt
+            `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`
+            ### Model Evaluation Criteria:
+            🎭 **Naturalness (Human-like Quality)**
+            - Prosody and rhythm patterns
+            - Emotional expression capability
+            - Voice texture and warmth
+            - Natural breathing and pauses
+            🗣️ **Intelligibility (Clarity & Accuracy)**
+            - Word pronunciation precision
+            - Consonant and vowel clarity
+            - Sentence comprehensibility
+            - Technical term handling
+            🎛️ **Controllability (Flexibility)**
+            - Parameter responsiveness
+            - Tone modification capability
+            - Speed and pitch control
+            - Customization potential
+            ### Key Insights:
+            - Smaller models (82M-500M) can excel in specific scenarios
+            - Larger models (1B-3B+) offer more versatility but require more resources
+            - Architecture matters as much as parameter count
+            - Training data quality significantly impacts output quality
+            """)
+        # Footer
+        # gr.HTML("""
+        # <div id="footer">
+        #     <p><strong>🚀 Ready to deploy your own TTS model?</strong></p>
+        #     <p>This demo showcases the power of open-source TTS technology. Each model offers unique strengths for different applications.</p>
+        #     <p><em>Built with ❤️ using Gradio • All models are open-source and available on Hugging Face</em></p>
+        #     <p>⚡ Powered by Inferless</p>
+        # </div>
+        # """)
+    return demo
+# ---------- 4. Launch the application ----------
+if __name__ == "__main__":
+    # Keyword arguments forwarded unchanged to demo.launch().
+    launch_options = {"share": True, "inbrowser": True, "show_error": True}
+    demo = create_interface()
+    demo.launch(**launch_options)