Spaces:

rbgo
/

Open-Source-TTS-Gallary

Running

File size: 18,517 Bytes

# ---------------------------------------------------------------
# app.py – "TTS Showcase" (Gradio Implementation)
# ---------------------------------------------------------------
import os
from typing import Optional

import gradio as gr

# ---------- 1. Demo metadata ----------
# Maps each Hugging Face repo id ("owner/model") to the short name shown in the UI.
# Insertion order matters: the gallery renders one entry per item, in this order.
MODELS = {
    "nari-labs/Dia-1.6B": "Dia-1.6B",
    "hexgrad/Kokoro-82M": "Kokoro-82M", 
    "sesame/csm-1b": "csm-1b",
    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
    "SWivid/F5-TTS": "F5-TTS",
    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
    "coqui/XTTS-v2": "XTTS-v2",
    "HKUSTAudio/Llasa-3B": "Llasa-3B",
    "amphion/MaskGCT": "MaskGCT",
    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
    "ByteDance/MegaTTS3": "MegaTTS3"
}

# Performance ratings for each model
# Keyed by the same repo ids as MODELS. Every entry rates three axes
# ("naturalness", "intelligibility", "controllability") using one of the
# labels "Excellent", "Good" or "Moderate".
MODEL_RATINGS = {
    "nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
    "hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
    "coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
    "amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
    "ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"}
}

# Model descriptions for better understanding
# One short, human-readable blurb per repo id (same keys as MODELS).
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution"
}

# Folder that contains subfolders with the audio clips
# This is a relative path, so it is resolved against the process's working
# directory. Each model's clip is looked up at
# SAMPLES_DIR/<repo slug>/CLIP_NAME (see get_audio_path; the slug is the repo
# id with "/" replaced by "_").
SAMPLES_DIR = "samples"
CLIP_NAME = "generated-audio.wav"

# Test prompt used for evaluation
# NOTE(review): no active (uncommented) code visible in this file reads
# TEST_PROMPT — confirm whether it is still needed.
TEST_PROMPT = "Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!"

def repo_to_slug(repo: str) -> str:
    """Turn a repo id such as ``owner/model`` into the folder name ``owner_model``.

    Every "/" in *repo* becomes "_"; all other characters are kept as-is.
    """
    path_parts = repo.split("/")
    return "_".join(path_parts)

def get_rating_emoji(rating: str) -> str:
    """Return the coloured-circle emoji that represents *rating*.

    "Excellent" maps to green and "Good" to yellow; every other value
    (including "Moderate" and unknown labels) maps to orange.
    """
    known_labels = (("Excellent", "🟢"), ("Good", "🟡"))
    for label, emoji in known_labels:
        if rating == label:
            return emoji
    # Fallback for anything that is not one of the two labels above.
    return "🟠"

def get_audio_path(repo: str) -> Optional[str]:
    """Locate the sample clip for *repo*.

    The clip is expected at ``SAMPLES_DIR/<slug>/CLIP_NAME`` where ``<slug>``
    is ``repo_to_slug(repo)``.

    Args:
        repo: Hugging Face repo id, e.g. ``"owner/model"``.

    Returns:
        The path to the clip if a regular file exists there, otherwise
        ``None`` (callers use this to show a "not found" notice instead of
        an audio player).
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    if os.path.isfile(audio_path):
        return audio_path
    return None

def filter_models(search_term: Optional[str]) -> list:
    """Return the repo ids in ``MODELS`` that match *search_term*.

    Matching is a case-insensitive substring test against both the repo id
    and the display name. Surrounding whitespace in the term is ignored.

    Args:
        search_term: Text typed by the user. ``None``, an empty string or a
            whitespace-only string all mean "no filter".

    Returns:
        A new list of matching repo ids, in ``MODELS`` order; every repo id
        when no filter is active.
    """
    # `or ""` guards against None, which would otherwise raise on .strip().
    needle = (search_term or "").strip().lower()
    if not needle:
        return list(MODELS)

    return [
        repo for repo, name in MODELS.items()
        if needle in repo.lower() or needle in name.lower()
    ]

def create_model_card(repo: str) -> str:
    """Build the HTML card for one model.

    The card shows only the model's display name inside a styled
    ``div.model-card``; ratings and descriptions are not rendered.

    Args:
        repo: Repo id; must be a key of ``MODELS``.

    Returns:
        An HTML fragment as a string.

    Raises:
        KeyError: If *repo* is not in ``MODELS``.
    """
    display_name = MODELS[repo]

    return f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
        
    </div>
    """

# ---------- 2. Custom CSS ----------
# Stylesheet handed to gr.Blocks(css=...) in create_interface().
# Selectors with matching active markup in this file: #title and
# #intro-section (header / intro HTML) and .model-card (the div emitted by
# create_model_card).
# NOTE(review): #test-prompt, #footer and .model-grid have no matching active
# markup visible in this file — confirm whether they are still needed.
custom_css = """
#title {
    text-align: center;
    background: rgb(203, 255, 77);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}

#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid rgb(0, 72, 10);
}

#intro-section h2,
#intro-section h3 {
    color: #2c3e50;
}

#intro-section p {
    color: #34495e;
}

#intro-section ul li {
    color: #34495e;
}

#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}

#intro-section strong {
    color: #2c3e50 !important;
}

#intro-section em {
    color: #2c3e50 !important;
}

#intro-section .mission-text strong {
    color: #667eea !important;
}

#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}

.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}

#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}

/* make all the text in our white‐background cards dark */
.model-grid .gr-html * {
  color: #2c3e50 !important;
}

.model-card {
  background: white;
  color: #2c3e50 !important;
  border: 1px solid #ddd;
  border-radius: 12px;
  padding: 20px;
  margin: 10px 0;
}

"""

# ---------- 3. Main Gradio Interface ----------
def create_interface():
    """Assemble the gallery page and return it without launching.

    Layout, top to bottom: title banner, introduction / key-findings HTML,
    a search row (textbox plus "Clear" button), and one ``gr.Group`` per
    entry in ``MODELS`` holding the model card and either its audio sample
    or a "not found" notice when ``get_audio_path`` returns nothing.

    Typing in the search box shows only the groups whose repo id is returned
    by ``filter_models``; "Clear" empties the box.

    Returns:
        The constructed ``gr.Blocks`` app. The caller is responsible for
        calling ``launch()`` on it.
    """
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:

        # Header Section
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # Introduction Section
        gr.HTML("""
        <div id="intro-section">
            <h3>🔬 Our Exciting Quest</h3>
            <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find 12 state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>
            
            <p><strong>Featured TTS Models:</strong></p>
            <ul>
                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>🎵 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>🎤 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>
            
            <h3>🔑 Key Findings</h3>
            <ol>
                <li><strong>Outstanding Speech Quality</strong><br>
                    Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>, 
                    <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong> delivered exceptionally 
                    natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong> 
                    stood out as the most well-rounded model as they combined good synthesized speech with solid controllability.
                </li>
                <li><strong>Superior Controllability</strong><br>
                    <strong>Zonos-v0.1-transformer</strong> emerged as the best in fine-grained control: it offers detailed 
                    adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise 
                    voice modulation.
                </li>
                <li><strong>Performance vs. Footprint Trade-off</strong><br>
                    Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical. 
                    Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual 
                    synthesis, zero-shot voice cloning, and multi-speaker generation but require heavier compute resources.
                </li>
                <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
                    <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice 
                    cloning, making them strong candidates for projects that need multi-language support or short-clip cloning. 
                    <strong>Llama-OuteTTS-1.0-1B</strong> and <strong>MegaTTS3</strong> also offer multilingual input handling, 
                    though they may require careful sampling parameter tuning to achieve optimal results.
                </li>
            </ol>
            
        </div>
        """)

        # Search and Filter Section
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3
            )
            clear_btn = gr.Button("Clear", scale=1)

        # Model Gallery Section
        gr.Markdown("## 🎧 Model Gallery")

        # One (repo, group) pair per model, in MODELS order. The group is the
        # container that search shows or hides, so a filtered-out model leaves
        # no empty shell behind.
        model_groups = []

        for repo, display_name in MODELS.items():
            with gr.Group() as group:
                # Model information card
                gr.HTML(create_model_card(repo))

                # Audio player, or a notice when the clip is missing on disk
                audio_path = get_audio_path(repo)
                if audio_path:
                    gr.Audio(
                        value=audio_path,
                        label=f"🎵 {display_name} Audio Sample",
                        interactive=False
                    )
                else:
                    gr.HTML(f"<p style='color: red;'>🤷‍♂️ Audio sample not found for {display_name}</p>")

            model_groups.append((repo, group))

        # Search functionality
        def update_visibility(search_term):
            """Show only the groups whose repo id matches *search_term*."""
            matching_repos = set(filter_models(search_term))
            # Returned as a {component: update} dict so the mapping stays
            # unambiguous regardless of how many groups there are.
            return {
                group: gr.update(visible=repo in matching_repos)
                for repo, group in model_groups
            }

        # Connect search functionality
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[group for _, group in model_groups]
        )

        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box]
        )

    return demo

# ---------- 4. Launch the application ----------
if __name__ == "__main__":
    # Build the app first, then launch it with the options gathered below.
    # NOTE(review): share=True asks Gradio for a public share link — confirm
    # that this is intended wherever the script is run.
    launch_options = {
        "share": True,
        "inbrowser": True,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)