akera committed
Commit 7c90731 · verified · 1 Parent(s): 3dcbb9d

Update app.py

Files changed (1):
  1. app.py +264 -964
app.py CHANGED
@@ -3,11 +3,13 @@ import subprocess
3
  import sys
4
  import os
5
  from pathlib import Path
6
 
7
  def setup_salt():
8
  """Clone and setup SALT library like in Colab."""
9
  try:
10
- # Check if salt is already available
11
  import salt.dataset
12
  print("βœ… SALT library already available")
13
  return True
@@ -17,7 +19,6 @@ def setup_salt():
17
  print("πŸ“₯ Setting up SALT library...")
18
 
19
  try:
20
- # Clone SALT repo if not exists
21
  salt_dir = Path("salt")
22
  if not salt_dir.exists():
23
  print("πŸ”„ Cloning SALT repository...")
@@ -27,7 +28,6 @@ def setup_salt():
27
  else:
28
  print("πŸ“ SALT repository already exists")
29
 
30
- # Install SALT requirements
31
  salt_requirements = salt_dir / "requirements.txt"
32
  if salt_requirements.exists():
33
  print("πŸ“¦ Installing SALT requirements...")
@@ -35,13 +35,11 @@ def setup_salt():
35
  sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
36
  ])
37
 
38
- # Add SALT directory to Python path
39
  salt_path = str(salt_dir.absolute())
40
  if salt_path not in sys.path:
41
  sys.path.insert(0, salt_path)
42
  print(f"πŸ”— Added {salt_path} to Python path")
43
 
44
- # Test import
45
  import salt.dataset
46
  print("βœ… SALT library setup completed successfully")
47
  return True
@@ -51,186 +49,119 @@ def setup_salt():
51
  return False
52
 
53
  # Setup SALT on startup
54
- print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
55
  if not setup_salt():
56
  print("❌ Cannot continue without SALT library")
57
- print("πŸ’‘ Please check that git is available and GitHub is accessible")
58
  sys.exit(1)
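The startup hunks above are hard to follow through the diff gutter, so here is a consolidated sketch of the same clone–install–import pattern. The repository URL is elided in this diff, so the constant below is a placeholder, and this is an illustration rather than the file's exact code.

```python
import subprocess
import sys
from pathlib import Path

SALT_REPO_URL = "https://github.com/..."  # placeholder; the real URL is not shown in this diff


def setup_salt_sketch() -> bool:
    """Clone SALT once, install its requirements, and make it importable."""
    salt_dir = Path("salt")
    if not salt_dir.exists():
        subprocess.run(["git", "clone", SALT_REPO_URL, str(salt_dir)], check=True)

    requirements = salt_dir / "requirements.txt"
    if requirements.exists():
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements)]
        )

    # Make `import salt.dataset` resolve against the fresh checkout.
    salt_path = str(salt_dir.absolute())
    if salt_path not in sys.path:
        sys.path.insert(0, salt_path)

    try:
        import salt.dataset  # noqa: F401
        return True
    except ImportError:
        return False
```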
59
 
60
  import gradio as gr
61
  import pandas as pd
62
  import json
63
- import traceback
64
- from datetime import datetime
65
- from typing import Optional, Dict, Tuple, List
66
 
67
- # Import our enhanced modules
68
  from src.test_set import (
69
- get_public_test_set_scientific,
70
- get_complete_test_set_scientific,
71
- create_test_set_download_scientific,
72
- validate_test_set_integrity_scientific,
73
- get_track_test_set
74
- )
75
- from src.validation import validate_submission_scientific
76
- from src.evaluation import (
77
- evaluate_predictions_scientific,
78
- generate_scientific_report,
79
- compare_models_statistically
80
  )
 
 
81
  from src.leaderboard import (
82
- load_scientific_leaderboard,
83
- add_model_to_scientific_leaderboard,
84
- get_scientific_leaderboard_stats,
85
  get_track_leaderboard,
86
- prepare_track_leaderboard_display,
87
- perform_fair_comparison,
88
- export_scientific_leaderboard
89
  )
90
  from src.plotting import (
91
- create_scientific_leaderboard_plot,
92
- create_language_pair_heatmap_scientific,
93
- create_statistical_comparison_plot,
94
- create_category_comparison_plot,
95
- create_adequacy_analysis_plot,
96
- create_cross_track_analysis_plot,
97
- create_scientific_model_detail_plot
98
- )
99
- from src.utils import (
100
- sanitize_model_name,
101
- get_all_language_pairs,
102
- get_google_comparable_pairs,
103
- get_track_language_pairs,
104
- format_metric_value
105
  )
 
106
  from config import *
107
 
108
  # Global variables for caching
109
  current_leaderboard = None
110
  public_test_set = None
111
  complete_test_set = None
112
- test_set_stats = None
113
 
114
- def initialize_scientific_data():
115
- """Initialize scientific test sets and leaderboard data."""
116
- global public_test_set, complete_test_set, current_leaderboard, test_set_stats
117
 
118
  try:
119
- print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
 
 
120
 
121
- # Load scientific test sets
122
- print("πŸ“₯ Loading scientific test sets...")
123
- public_test_set = get_public_test_set_scientific()
124
- complete_test_set = get_complete_test_set_scientific()
125
 
126
- # Load scientific leaderboard
127
- print("πŸ† Loading scientific leaderboard...")
128
- current_leaderboard = load_scientific_leaderboard()
129
-
130
- # Validate test set integrity
131
- print("πŸ” Validating test set integrity...")
132
- test_set_stats = validate_test_set_integrity_scientific()
133
-
134
- print(f"βœ… Scientific initialization complete!")
135
  print(f" - Test set: {len(public_test_set):,} samples")
136
- print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
137
- print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
138
  print(f" - Current models: {len(current_leaderboard)}")
139
 
140
  return True
141
 
142
  except Exception as e:
143
- print(f"❌ Scientific initialization failed: {e}")
144
  traceback.print_exc()
145
  return False
146
 
147
- def download_scientific_test_set() -> Tuple[str, str]:
148
- """Create downloadable scientific test set and return file path and info."""
149
-
150
  try:
151
  global public_test_set
152
  if public_test_set is None:
153
- public_test_set = get_public_test_set_scientific()
154
-
155
- # Create download file
156
- download_path, stats = create_test_set_download_scientific()
157
 
158
- # Create comprehensive info message
159
- adequacy = stats.get('adequacy_assessment', 'unknown')
160
- adequacy_emoji = {
161
- 'excellent': '🟒',
162
- 'good': '🟑',
163
- 'fair': '🟠',
164
- 'insufficient': 'πŸ”΄',
165
- 'unknown': 'βšͺ'
166
- }.get(adequacy, 'βšͺ')
167
 
168
  info_msg = f"""
169
- ## πŸ“₯ SALT Scientific Test Set Downloaded Successfully!
170
-
171
- ### πŸ”¬ Scientific Edition Features:
172
- - **Stratified Sampling**: Ensures representative coverage across domains
173
- - **Statistical Weighting**: Samples weighted by track importance
174
- - **Track Balancing**: Optimized for fair cross-track comparison
175
- - **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
176
 
177
  ### πŸ“Š Dataset Statistics:
178
  - **Total Samples**: {stats['total_samples']:,}
179
  - **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
180
- - **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
181
- - **Domains**: {', '.join(stats.get('domains', ['general']))}
182
 
183
  ### 🏁 Track Breakdown:
184
  """
185
 
186
  track_breakdown = stats.get('track_breakdown', {})
187
  for track_name, track_info in track_breakdown.items():
188
- status_emoji = 'βœ…' if track_info.get('statistical_adequacy', False) else '⚠️'
189
  info_msg += f"""
190
- **{status_emoji} {track_info.get('name', track_name)}**:
191
  - Samples: {track_info.get('total_samples', 0):,}
192
  - Language Pairs: {track_info.get('language_pairs', 0)}
193
- - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
194
- - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
195
  """
196
 
197
  info_msg += f"""
198
 
199
- ### πŸ“‹ Enhanced File Format:
200
  - `sample_id`: Unique identifier for each sample
201
  - `source_text`: Text to be translated
202
  - `source_language`: Source language code
203
  - `target_language`: Target language code
204
  - `domain`: Content domain (if available)
205
  - `google_comparable`: Whether this pair can be compared with Google Translate
206
- - `tracks_included`: Comma-separated list of tracks that include this sample
207
- - `statistical_weight`: Statistical importance weight (1.0-5.0)
208
 
209
- ### πŸ”¬ Next Steps for Scientific Evaluation:
210
  1. **Run your model** on the source texts to generate translations
211
  2. **Create a predictions file** with columns: `sample_id`, `prediction`
212
- 3. **Optional**: Add `category` column to help with model classification
213
- 4. **Submit** your predictions using the appropriate track tab
214
- 5. **Analyze** results with statistical confidence intervals
215
-
216
- ### πŸ’‘ Tips for Best Results:
217
- - Ensure coverage of all language pairs for chosen track
218
- - Include confidence scores if available
219
- - Provide detailed model description for proper categorization
220
- - Consider submitting to multiple tracks for comprehensive evaluation
221
  """
222
 
223
  return download_path, info_msg
224
 
225
  except Exception as e:
226
- error_msg = f"❌ Error creating scientific test set download: {str(e)}"
227
  return None, error_msg
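The "Enhanced File Format" list in the download message above documents the columns shipped in the test-set CSV. As a rough illustration (not part of this commit), loading that file and carving out per-track subsets might look like the following; the filename is simply whatever the download tab produced.

```python
import pandas as pd

# "salt_test_set.csv" is a stand-in name; use whatever file the download tab gave you.
test_set = pd.read_csv("salt_test_set.csv")

# Columns documented above: sample_id, source_text, source_language, target_language,
# domain, google_comparable, tracks_included, statistical_weight
is_google = test_set["google_comparable"].astype(str).str.lower() == "true"
google_subset = test_set[is_google]

# tracks_included is a comma-separated list such as "google_comparable,ug40_complete"
ug40_subset = test_set[test_set["tracks_included"].str.contains("ug40_complete", na=False)]

print(f"{len(google_subset):,} Google-comparable rows, {len(ug40_subset):,} UG40 rows")
```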
228
 
229
- def validate_scientific_submission(
230
- file, model_name: str, author: str, description: str
231
- ) -> Tuple[str, Optional[pd.DataFrame], str]:
232
- """Validate uploaded prediction file with scientific rigor."""
233
-
234
  try:
235
  if file is None:
236
  return "❌ Please upload a predictions file", None, "community"
@@ -252,70 +183,50 @@ def validate_scientific_submission(
252
  else:
253
  return "❌ Could not read uploaded file", None, "community"
254
 
255
- # Determine filename
256
- filename = (
257
- getattr(file, "name", None)
258
- or getattr(file, "filename", None)
259
- or "predictions.csv"
260
- )
261
 
262
- # Load test set if needed
263
  global complete_test_set
264
  if complete_test_set is None:
265
- complete_test_set = get_complete_test_set_scientific()
266
 
267
- # Run enhanced scientific validation
268
- validation_result = validate_submission_scientific(
269
  file_content, filename, complete_test_set, model_name, author, description
270
  )
271
 
272
  detected_category = validation_result.get("category", "community")
273
 
274
- # Return predictions if evaluation is possible (even with limitations)
275
  if validation_result.get("can_evaluate", False):
276
  return validation_result["report"], validation_result["predictions"], detected_category
277
  else:
278
  return validation_result["report"], None, detected_category
279
 
280
  except Exception as e:
281
- return (
282
- f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
283
- None,
284
- "community"
285
- )
286
 
287
- def evaluate_scientific_submission(
288
  predictions_df: pd.DataFrame,
289
  model_name: str,
290
  author: str,
291
  description: str,
292
  detected_category: str,
293
- validation_info: Dict,
294
  ) -> Tuple[str, pd.DataFrame, object, object]:
295
- """Evaluate validated predictions using scientific methodology."""
296
-
297
  try:
298
  if predictions_df is None:
299
  return "❌ No valid predictions to evaluate", None, None, None
300
 
301
- # Get complete test set with targets
302
  global complete_test_set, current_leaderboard
303
  if complete_test_set is None:
304
- complete_test_set = get_complete_test_set_scientific()
305
 
306
- # Run scientific evaluation across all tracks
307
- print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
308
- evaluation_results = evaluate_predictions_scientific(
309
- predictions_df, complete_test_set, detected_category
310
- )
311
 
312
- if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
313
- errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
314
- return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
315
 
316
- # Add to scientific leaderboard
317
- print("πŸ† Adding to scientific leaderboard...")
318
- updated_leaderboard = add_model_to_scientific_leaderboard(
319
  model_name=sanitize_model_name(model_name),
320
  author=author or "Anonymous",
321
  evaluation_results=evaluation_results,
@@ -323,526 +234,221 @@ def evaluate_scientific_submission(
323
  description=description or ""
324
  )
325
 
326
- # Update global leaderboard
327
  current_leaderboard = updated_leaderboard
328
 
329
- # Generate scientific report
330
- report = generate_scientific_report(evaluation_results, model_name)
331
 
332
  # Create visualizations
333
- summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
334
- cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
335
-
336
- # Prepare display leaderboard (Google-comparable track by default)
337
  google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
338
- display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
339
 
340
- # Format success message with track-specific results
341
  success_msg = f"""
342
- ## πŸŽ‰ Scientific Evaluation Complete!
343
 
344
  ### πŸ“Š Model Information:
345
  - **Model**: {model_name}
346
  - **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
347
  - **Author**: {author or 'Anonymous'}
348
 
349
- ### πŸ† Track Performance Summary:
350
- """
351
-
352
- tracks = evaluation_results.get('tracks', {})
353
- for track_name, track_data in tracks.items():
354
- if not track_data.get('error'):
355
- track_config = EVALUATION_TRACKS[track_name]
356
- track_averages = track_data.get('track_averages', {})
357
- summary = track_data.get('summary', {})
358
-
359
- # Get rank in this track
360
- track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
361
- if not track_leaderboard.empty:
362
- model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
363
- rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
364
- total_models = len(track_leaderboard)
365
- else:
366
- rank = "N/A"
367
- total_models = 0
368
-
369
- quality_score = track_averages.get('quality_score', 0)
370
- bleu_score = track_averages.get('bleu', 0)
371
- samples = summary.get('total_samples', 0)
372
-
373
- success_msg += f"""
374
- **🏁 {track_config['name']}**:
375
- - Rank: #{rank} out of {total_models} models
376
- - Quality Score: {quality_score:.4f}
377
- - BLEU: {bleu_score:.2f}
378
- - Samples: {samples:,}
379
- """
380
-
381
- success_msg += f"""
382
-
383
- ### πŸ”¬ Scientific Adequacy:
384
- - **Cross-Track Consistency**: Available in detailed analysis
385
- - **Statistical Confidence**: 95% confidence intervals computed
386
- - **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
387
-
388
  {report}
389
  """
390
 
391
- return success_msg, display_leaderboard, summary_plot, cross_track_plot
392
 
393
  except Exception as e:
394
- error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
395
  return error_msg, None, None, None
396
 
397
- def refresh_track_leaderboard(
398
- track: str,
399
- search_query: str = "",
400
- category_filter: str = "all",
401
- min_adequacy: float = 0.0,
402
- show_ci: bool = True
403
- ) -> Tuple[pd.DataFrame, object, object, str]:
404
  """Refresh leaderboard for a specific track with filters."""
405
-
406
  try:
407
  global current_leaderboard
408
  if current_leaderboard is None:
409
- current_leaderboard = load_scientific_leaderboard()
410
 
411
- # Get track-specific leaderboard with better error handling
412
- try:
413
- track_leaderboard = get_track_leaderboard(
414
- current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
415
- )
416
- except Exception as e:
417
- print(f"Error getting track leaderboard for {track}: {e}")
418
- track_leaderboard = pd.DataFrame()
419
 
420
  # Apply search filter
421
  if search_query and not track_leaderboard.empty:
422
- try:
423
- query_lower = search_query.lower()
424
- mask = (
425
- track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
426
- track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
427
- )
428
- track_leaderboard = track_leaderboard[mask]
429
- except Exception as e:
430
- print(f"Error applying search filter: {e}")
431
-
432
- # Prepare for display
433
- try:
434
- display_df = prepare_track_leaderboard_display(track_leaderboard, track)
435
- except Exception as e:
436
- print(f"Error preparing display: {e}")
437
- display_df = pd.DataFrame()
438
-
439
- # Create plots with error handling
440
- try:
441
- ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
442
- except Exception as e:
443
- print(f"Error creating ranking plot: {e}")
444
- ranking_plot = None
445
 
446
- try:
447
- comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
448
- except Exception as e:
449
- print(f"Error creating comparison plot: {e}")
450
- comparison_plot = None
451
 
452
- # Get track statistics
453
- try:
454
- track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
455
- track_config = EVALUATION_TRACKS[track]
456
-
457
- stats_text = f"""
458
  ### πŸ“Š {track_config['name']} Statistics
459
 
460
- - **Total Models**: {track_stats.get('total_models', 0)}
461
- - **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
462
- - **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
463
 
464
- **Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
465
- **Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
466
-
467
- ### πŸ”¬ Scientific Notes:
468
- - All metrics include 95% confidence intervals
469
- - Statistical adequacy verified for reliable comparisons
470
- - {track_config['description']}
471
- """
472
- except Exception as e:
473
- print(f"Error generating stats: {e}")
474
- stats_text = f"Error loading {track} statistics: {str(e)}"
475
 
476
  return display_df, ranking_plot, comparison_plot, stats_text
477
 
478
  except Exception as e:
479
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
480
  print(error_msg)
481
- empty_df = pd.DataFrame()
482
- return empty_df, None, None, error_msg
483
 
484
- def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
485
- """Get detailed scientific analysis for a specific model."""
486
-
487
  try:
488
  global current_leaderboard
489
  if current_leaderboard is None:
490
- return "Leaderboard not loaded", None, None
491
-
492
- # Find model
493
- model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
494
 
495
- if model_row.empty:
496
- return f"Model '{model_name}' not found in leaderboard", None, None
497
 
498
- model_info = model_row.iloc[0]
499
-
500
- # Parse detailed metrics for the requested track
501
- detailed_results = {}
502
- detailed_col = f'detailed_{track}'
503
-
504
- if detailed_col in model_info and pd.notna(model_info[detailed_col]):
505
- try:
506
- detailed_results = json.loads(model_info[detailed_col])
507
- print(f"Successfully loaded detailed results for {model_name} in {track}")
508
- except json.JSONDecodeError as e:
509
- print(f"Error parsing detailed metrics for {model_name}: {e}")
510
- detailed_results = {}
511
- else:
512
- print(f"No detailed metrics found for {model_name} in column {detailed_col}")
513
- # Create a fallback structure
514
- detailed_results = {
515
- 'tracks': {
516
- track: {
517
- 'pair_metrics': {},
518
- 'track_averages': {
519
- 'quality_score': model_info.get(f'{track}_quality', 0),
520
- 'bleu': model_info.get(f'{track}_bleu', 0),
521
- 'chrf': model_info.get(f'{track}_chrf', 0)
522
- }
523
- }
524
- }
525
- }
526
 
527
- # Create detailed plots
528
- try:
529
- detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
530
- except Exception as e:
531
- print(f"Error creating detail plot: {e}")
532
- detail_plot = None
533
 
534
- try:
535
- heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
536
- except Exception as e:
537
- print(f"Error creating heatmap plot: {e}")
538
- heatmap_plot = None
539
-
540
- # Format model details with scientific information
541
- track_config = EVALUATION_TRACKS[track]
542
- category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
543
-
544
- # Extract track-specific metrics
545
- quality_col = f"{track}_quality"
546
- bleu_col = f"{track}_bleu"
547
- chrf_col = f"{track}_chrf"
548
- ci_lower_col = f"{track}_ci_lower"
549
- ci_upper_col = f"{track}_ci_upper"
550
- samples_col = f"{track}_samples"
551
- pairs_col = f"{track}_pairs"
552
- adequate_col = f"{track}_adequate"
553
-
554
- details_text = f"""
555
- ## πŸ”¬ Scientific Model Analysis: {model_name}
556
-
557
- ### πŸ“‹ Basic Information:
558
- - **Author**: {model_info['author']}
559
- - **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
560
- - **Submission Date**: {model_info['submission_date'][:10]}
561
- - **Description**: {model_info['description'] or 'No description provided'}
562
-
563
- ### 🏁 {track_config['name']} Performance:
564
- - **Quality Score**: {model_info.get(quality_col, 0):.4f}
565
- - **BLEU**: {model_info.get(bleu_col, 0):.2f}
566
- - **ChrF**: {model_info.get(chrf_col, 0):.4f}
567
- - **95% CI**: [{model_info.get(ci_lower_col, 0):.4f}, {model_info.get(ci_upper_col, 0):.4f}]
568
-
569
- ### πŸ“Š Coverage Information:
570
- - **Total Samples**: {model_info.get(samples_col, 0):,}
571
- - **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
572
- - **Statistical Adequacy**: {'βœ… Yes' if model_info.get(adequate_col, False) else '❌ No'}
573
-
574
- ### πŸ”¬ Statistical Metadata:
575
- - **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
576
- - **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
577
- - **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
578
-
579
- ### πŸ“ˆ Cross-Track Performance:
580
- """
581
-
582
- # Add other track performances for comparison
583
- for other_track in EVALUATION_TRACKS.keys():
584
- if other_track != track:
585
- other_quality_col = f"{other_track}_quality"
586
- other_adequate_col = f"{other_track}_adequate"
587
 
588
- if model_info.get(other_adequate_col, False):
589
- other_quality = model_info.get(other_quality_col, 0)
590
- details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
591
- else:
592
- details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
593
-
594
- # Add language pair performance if available
595
- if detailed_results and 'tracks' in detailed_results and track in detailed_results['tracks']:
596
- track_data = detailed_results['tracks'][track]
597
- pair_metrics = track_data.get('pair_metrics', {})
598
-
599
- if pair_metrics:
600
- details_text += f"""
601
-
602
- ### πŸ—ΊοΈ Language Pair Performance:
603
- Top performing pairs:
604
- """
605
- # Sort pairs by quality score
606
- pairs_sorted = []
607
- for pair_key, metrics in pair_metrics.items():
608
- if 'quality_score' in metrics and 'mean' in metrics['quality_score']:
609
- pairs_sorted.append((pair_key, metrics['quality_score']['mean']))
610
-
611
- pairs_sorted.sort(key=lambda x: x[1], reverse=True)
612
 
613
- for pair_key, score in pairs_sorted[:5]: # Top 5
614
- src, tgt = pair_key.split('_to_')
615
- src_name = LANGUAGE_NAMES.get(src, src)
616
- tgt_name = LANGUAGE_NAMES.get(tgt, tgt)
617
- details_text += f"- **{src_name} β†’ {tgt_name}**: {score:.4f}\n"
618
-
619
- details_text += f"""
620
-
621
- ### πŸ’‘ Scientific Interpretation:
622
- - Performance metrics include 95% confidence intervals for reliability
623
- - Statistical adequacy ensures meaningful comparisons with other models
624
- - Cross-track analysis reveals model strengths across different language sets
625
- - Category classification helps contextualize performance expectations
626
- """
627
-
628
- return details_text, detail_plot, heatmap_plot
629
 
630
  except Exception as e:
631
- error_msg = f"Error getting model details: {str(e)}\n{traceback.format_exc()}"
632
- print(error_msg)
633
- return error_msg, None, None
634
-
635
- def perform_model_comparison(
636
- model_names: List[str], track: str, comparison_type: str = "statistical"
637
- ) -> Tuple[str, object]:
638
- """Perform scientific comparison between selected models."""
639
-
640
- try:
641
- global current_leaderboard
642
- if current_leaderboard is None:
643
- return "Leaderboard not loaded", None
644
-
645
- if len(model_names) < 2:
646
- return "Please select at least 2 models for comparison", None
647
-
648
- # Get models
649
- models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
650
-
651
- if len(models) < 2:
652
- return "Selected models not found in leaderboard", None
653
-
654
- # Perform fair comparison
655
- comparison_result = perform_fair_comparison(current_leaderboard, model_names)
656
-
657
- if comparison_result.get('error'):
658
- return f"Comparison error: {comparison_result['error']}", None
659
-
660
- # Create comparison visualization
661
- if comparison_type == "statistical":
662
- comparison_plot = create_statistical_comparison_plot(models, track)
663
- else:
664
- comparison_plot = create_category_comparison_plot(models, track)
665
-
666
- # Format comparison report
667
- track_config = EVALUATION_TRACKS[track]
668
- comparison_text = f"""
669
- ## πŸ”¬ Scientific Model Comparison - {track_config['name']}
670
-
671
- ### πŸ“Š Models Compared:
672
- """
673
-
674
- quality_col = f"{track}_quality"
675
- ci_lower_col = f"{track}_ci_lower"
676
- ci_upper_col = f"{track}_ci_upper"
677
-
678
- # Sort models by performance
679
- models_sorted = models.sort_values(quality_col, ascending=False)
680
-
681
- for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
682
- category_info = MODEL_CATEGORIES.get(model['model_category'], {})
683
-
684
- comparison_text += f"""
685
- **#{i}. {model['model_name']}**
686
- - Category: {category_info.get('name', 'Unknown')}
687
- - Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
688
- - Author: {model['author']}
689
- """
690
-
691
- # Add statistical analysis
692
- track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
693
- if track_comparison:
694
- comparison_text += f"""
695
-
696
- ### πŸ”¬ Statistical Analysis:
697
- - **Models with adequate data**: {track_comparison.get('participating_models', 0)}
698
- - **Confidence intervals available**: Yes (95% level)
699
- - **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
700
- """
701
-
702
- # Check for statistical significance (simplified)
703
- quality_scores = list(track_comparison.get('quality_scores', {}).values())
704
- if len(quality_scores) >= 2:
705
- score_range = max(quality_scores) - min(quality_scores)
706
- if score_range > 0.05: # 5% difference threshold
707
- comparison_text += "- **Performance differences**: Potentially significant\n"
708
- else:
709
- comparison_text += "- **Performance differences**: Minimal\n"
710
-
711
- # Add recommendations
712
- recommendations = comparison_result.get('recommendations', [])
713
- if recommendations:
714
- comparison_text += "\n### πŸ’‘ Recommendations:\n"
715
- for rec in recommendations:
716
- comparison_text += f"- {rec}\n"
717
-
718
- return comparison_text, comparison_plot
719
-
720
- except Exception as e:
721
- error_msg = f"Error performing comparison: {str(e)}"
722
- return error_msg, None
723
 
724
  # Initialize data on startup
725
- print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
726
- initialization_success = initialize_scientific_data()
727
 
728
- # Create Gradio interface with scientific design
729
  with gr.Blocks(
730
- title=UI_CONFIG["title"],
731
  theme=gr.themes.Soft(),
732
  css="""
733
  .gradio-container {
734
  max-width: 1600px !important;
735
  margin: 0 auto;
736
  }
737
- .scientific-header {
738
- text-align: center;
739
- margin-bottom: 2rem;
740
- padding: 2rem;
741
- background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
742
- color: white !important;
743
- border-radius: 10px;
744
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
745
- }
746
- .scientific-header h1, .scientific-header p {
747
- color: white !important;
748
- }
749
 
750
- /* Simple fix for text visibility - force dark text on light background */
751
- .markdown, .gr-markdown {
752
- background: #ffffff !important;
753
- color: #1f2937 !important;
754
- padding: 1rem;
755
- border-radius: 8px;
756
- margin: 0.5rem 0;
757
  }
758
- .markdown h1, .markdown h2, .markdown h3, .markdown h4, .markdown h5, .markdown h6 {
759
- color: #1f2937 !important;
 
 
760
  }
761
- .markdown p, .markdown li, .markdown strong, .markdown em {
762
- color: #1f2937 !important;
 
 
763
  }
764
- .markdown code {
765
- background: #f3f4f6 !important;
766
- color: #1f2937 !important;
767
- padding: 0.2em 0.4em;
768
- border-radius: 4px;
769
  }
770
- .markdown pre {
771
- background: #f3f4f6 !important;
772
- color: #1f2937 !important;
773
- padding: 1rem;
774
- border-radius: 8px;
775
  }
776
 
777
- /* Track tab styling */
778
- .track-tab {
779
- background: #ffffff !important;
780
- color: #1f2937 !important;
781
- border-radius: 8px;
782
- margin: 0.5rem;
783
- padding: 1rem;
784
- border: 2px solid #e5e7eb;
785
  }
786
  """
787
  ) as demo:
788
 
789
- # Scientific Header
790
- gr.HTML(f"""
791
- <div class="scientific-header">
792
- <h1>πŸ† SALT Translation Leaderboard - Scientific Edition</h1>
793
- <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
794
- <p>Three-tier evaluation tracks β€’ 95% Confidence intervals β€’ Research-grade analysis</p>
795
- <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
796
  </div>
797
  """)
798
 
799
  # Status indicator
800
  if initialization_success:
801
- status_msg = "βœ… Scientific system initialized successfully"
802
- adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
803
- status_msg += f" | Test set adequacy: {adequacy_info.title()}"
804
  else:
805
  status_msg = "❌ System initialization failed - some features may not work"
806
 
807
  gr.Markdown(f"**System Status**: {status_msg}")
808
 
809
- # Add scientific overview
810
- gr.Markdown("""
811
- ## πŸ”¬ Scientific Evaluation Framework
812
-
813
- This leaderboard implements rigorous scientific methodology for translation model evaluation:
814
-
815
- - **Three Evaluation Tracks**: Fair comparison across different model capabilities
816
- - **Statistical Significance**: 95% confidence intervals and effect size analysis
817
- - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
818
- - **Cross-Track Consistency**: Validate model performance across language sets
819
- """)
820
-
821
  with gr.Tabs():
822
 
823
  # Tab 1: Download Test Set
824
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
825
  gr.Markdown("""
826
- ## πŸ“‹ Get the SALT Scientific Test Set
827
 
828
- Download our scientifically designed test set with stratified sampling and statistical weighting.
829
  """)
830
 
831
- with gr.Row():
832
- download_btn = gr.Button("πŸ“₯ Download Scientific Test Set", variant="primary", size="lg")
833
 
834
  with gr.Row():
835
  with gr.Column():
836
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
837
  with gr.Column():
838
- download_info = gr.Markdown(label="ℹ️ Test Set Information")
839
 
840
  # Tab 2: Submit Predictions
841
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
842
  gr.Markdown("""
843
- ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
844
 
845
- Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
846
  """)
847
 
848
  with gr.Row():
@@ -864,51 +470,38 @@ with gr.Blocks(
864
  description_input = gr.Textbox(
865
  label="πŸ“„ Model Description",
866
  placeholder="Architecture, training data, special features...",
867
- lines=4,
868
- info="Detailed description helps with proper categorization"
869
  )
870
 
871
- gr.Markdown("### πŸ“€ Upload Predictions")
872
  predictions_file = gr.File(
873
  label="πŸ“‚ Predictions File",
874
  file_types=[".csv", ".tsv", ".json"]
875
  )
876
 
877
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
878
- submit_btn = gr.Button("πŸš€ Submit for Scientific Evaluation", variant="primary", interactive=False)
879
 
880
  with gr.Column(scale=1):
881
- gr.Markdown("### πŸ“Š Validation Results")
882
  validation_output = gr.Markdown()
883
 
884
- # Results section
885
- gr.Markdown("### πŸ† Scientific Evaluation Results")
886
-
887
- with gr.Row():
888
- evaluation_output = gr.Markdown()
889
 
890
  with gr.Row():
891
  with gr.Column():
892
- submission_plot = gr.Plot(label="πŸ“ˆ Submission Analysis")
893
  with gr.Column():
894
- cross_track_plot = gr.Plot(label="πŸ”„ Cross-Track Analysis")
895
-
896
- with gr.Row():
897
- results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard (Google-Comparable Track)", interactive=False)
898
 
899
  # Tab 3: Google-Comparable Track
900
- with gr.Tab("πŸ€– Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
901
  gr.Markdown(f"""
902
- ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
903
 
904
- **Fair comparison with commercial translation systems**
905
 
906
- This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
907
  enabling direct comparison with commercial baselines.
908
-
909
- - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
910
- - **Purpose**: Commercial system comparison and baseline establishment
911
- - **Statistical Power**: High (optimized sample sizes)
912
  """)
913
 
914
  with gr.Row():
@@ -920,39 +513,28 @@ with gr.Blocks(
920
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
921
  value="all"
922
  )
923
- with gr.Column(scale=1):
924
- google_adequacy = gr.Slider(
925
- label="πŸ“Š Min Adequacy",
926
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
927
- )
928
  with gr.Column(scale=1):
929
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
930
 
931
- with gr.Row():
932
- google_stats = gr.Markdown()
933
 
934
  with gr.Row():
935
  with gr.Column():
936
- google_ranking_plot = gr.Plot(label="πŸ† Google-Comparable Rankings")
937
  with gr.Column():
938
- google_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
939
 
940
- with gr.Row():
941
- google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
942
 
943
  # Tab 4: UG40-Complete Track
944
- with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
945
  gr.Markdown(f"""
946
- ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
947
 
948
- **Comprehensive evaluation across all Ugandan languages**
949
 
950
- This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
951
- providing the most comprehensive assessment of Ugandan language translation capabilities.
952
-
953
- - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
954
- - **Purpose**: Comprehensive Ugandan language capability assessment
955
- - **Coverage**: Complete linguistic landscape of Uganda
956
  """)
957
 
958
  with gr.Row():
@@ -964,479 +546,197 @@ with gr.Blocks(
964
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
965
  value="all"
966
  )
967
- with gr.Column(scale=1):
968
- ug40_adequacy = gr.Slider(
969
- label="πŸ“Š Min Adequacy",
970
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
971
- )
972
  with gr.Column(scale=1):
973
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
974
 
975
- with gr.Row():
976
- ug40_stats = gr.Markdown()
977
 
978
  with gr.Row():
979
  with gr.Column():
980
- ug40_ranking_plot = gr.Plot(label="πŸ† UG40-Complete Rankings")
981
  with gr.Column():
982
- ug40_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
983
 
984
- with gr.Row():
985
- ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
986
 
987
- # Tab 5: Language-Pair Matrix
988
- with gr.Tab("πŸ“Š Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
989
- gr.Markdown(f"""
990
- ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
991
-
992
- **Detailed language pair analysis with statistical significance**
993
-
994
- This view provides granular analysis of model performance across individual language pairs
995
- with statistical significance testing and effect size analysis.
996
-
997
- - **Resolution**: Individual language pair performance
998
- - **Purpose**: Detailed linguistic analysis and model diagnostics
999
- - **Statistics**: Pairwise significance testing available
1000
- """)
1001
-
1002
- with gr.Row():
1003
- with gr.Column(scale=2):
1004
- matrix_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
1005
- with gr.Column(scale=1):
1006
- matrix_category = gr.Dropdown(
1007
- label="🏷️ Category Filter",
1008
- choices=["all"] + list(MODEL_CATEGORIES.keys()),
1009
- value="all"
1010
- )
1011
- with gr.Column(scale=1):
1012
- matrix_adequacy = gr.Slider(
1013
- label="πŸ“Š Min Adequacy",
1014
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
1015
- )
1016
- with gr.Column(scale=1):
1017
- matrix_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
1018
-
1019
- with gr.Row():
1020
- matrix_stats = gr.Markdown()
1021
-
1022
- with gr.Row():
1023
- with gr.Column():
1024
- matrix_ranking_plot = gr.Plot(label="πŸ† Language-Pair Matrix Rankings")
1025
- with gr.Column():
1026
- matrix_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
1027
-
1028
- with gr.Row():
1029
- matrix_leaderboard = gr.Dataframe(label="πŸ“ˆ Language-Pair Matrix Leaderboard", interactive=False)
1030
-
1031
- # Tab 6: Model Analysis
1032
- with gr.Tab("πŸ” Scientific Model Analysis", id="analysis"):
1033
- gr.Markdown("""
1034
- ## πŸ”¬ Detailed Scientific Model Analysis
1035
-
1036
- Comprehensive analysis of individual models with statistical confidence intervals,
1037
- cross-track performance, and detailed language pair breakdowns.
1038
- """)
1039
-
1040
- with gr.Row():
1041
- with gr.Column(scale=2):
1042
- model_select = gr.Dropdown(
1043
- label="πŸ€– Select Model",
1044
- choices=[],
1045
- value=None,
1046
- info="Choose a model for detailed scientific analysis"
1047
- )
1048
- with gr.Column(scale=1):
1049
- track_select = gr.Dropdown(
1050
- label="🏁 Analysis Track",
1051
- choices=list(EVALUATION_TRACKS.keys()),
1052
- value="google_comparable",
1053
- info="Track for detailed analysis"
1054
- )
1055
- with gr.Column(scale=1):
1056
- analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
1057
-
1058
- with gr.Row():
1059
- model_details = gr.Markdown()
1060
-
1061
- with gr.Row():
1062
- with gr.Column():
1063
- model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
1064
- with gr.Column():
1065
- model_heatmap_plot = gr.Plot(label="πŸ—ΊοΈ Language Pair Heatmap")
1066
-
1067
- # Tab 7: Model Comparison
1068
- with gr.Tab("βš–οΈ Scientific Model Comparison", id="comparison"):
1069
  gr.Markdown("""
1070
- ## πŸ”¬ Scientific Model Comparison
1071
 
1072
- Compare multiple models with statistical significance testing and fair comparison analysis.
1073
- Only models evaluated on the same language pairs are compared for scientific validity.
1074
  """)
1075
 
1076
  with gr.Row():
1077
- with gr.Column(scale=2):
1078
- comparison_models = gr.CheckboxGroup(
1079
- label="πŸ€– Select Models to Compare",
1080
- choices=[],
1081
- value=[],
1082
- info="Select 2-6 models for comparison"
1083
- )
1084
  with gr.Column(scale=1):
1085
- comparison_track = gr.Dropdown(
1086
- label="🏁 Comparison Track",
1087
  choices=list(EVALUATION_TRACKS.keys()),
1088
  value="google_comparable"
1089
  )
1090
- comparison_type = gr.Radio(
1091
- label="πŸ“Š Comparison Type",
1092
- choices=["statistical", "category"],
1093
- value="statistical"
1094
- )
1095
- compare_btn = gr.Button("βš–οΈ Compare Models", variant="primary")
1096
-
1097
- with gr.Row():
1098
- comparison_output = gr.Markdown()
1099
 
1100
- with gr.Row():
1101
- comparison_plot = gr.Plot(label="πŸ“Š Model Comparison Analysis")
1102
 
1103
- # Tab 8: Documentation
1104
- with gr.Tab("πŸ“š Scientific Documentation", id="docs"):
1105
  gr.Markdown(f"""
1106
- # πŸ“– SALT Translation Leaderboard - Scientific Edition Documentation
1107
 
1108
  ## 🎯 Overview
1109
 
1110
- The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
1111
- for translation models on Ugandan languages, designed for research publication and scientific analysis.
1112
 
1113
- ## πŸ”¬ Scientific Methodology
1114
-
1115
- ### Three-Tier Evaluation System
1116
 
1117
  **1. πŸ€– Google-Comparable Track**
1118
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
1119
- - **Pairs**: {len(get_google_comparable_pairs())} language pairs
1120
  - **Purpose**: Fair comparison with commercial translation systems
1121
- - **Statistical Power**: High (β‰₯200 samples per pair recommended)
1122
 
1123
  **2. 🌍 UG40-Complete Track**
1124
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
1125
- - **Pairs**: {len(get_all_language_pairs())} language pairs
1126
  - **Purpose**: Comprehensive Ugandan language capability assessment
1127
- - **Statistical Power**: Moderate (β‰₯100 samples per pair recommended)
1128
-
1129
- **3. πŸ“Š Language-Pair Matrix**
1130
- - **Resolution**: Individual language pair analysis
1131
- - **Purpose**: Detailed linguistic analysis and model diagnostics
1132
- - **Statistics**: Pairwise significance testing with multiple comparison correction
1133
-
1134
- ### Statistical Rigor
1135
-
1136
- - **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
1137
- - **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
1138
- - **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
1139
- - **Statistical Power**: Estimated based on sample sizes and effect sizes
1140
-
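The statistical-rigor bullets above describe percentile bootstrap confidence intervals over per-sample scores. The actual implementation in `src/evaluation.py` is not shown in this diff, so the following is a minimal sketch of that procedure with assumed defaults.

```python
import numpy as np


def bootstrap_ci(scores, n_resamples=1000, confidence=0.95, seed=0):
    """Percentile bootstrap CI for the mean of per-sample scores."""
    rng = np.random.default_rng(seed)
    scores = np.asarray(scores, dtype=float)
    means = np.array([
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_resamples)
    ])
    alpha = (1.0 - confidence) / 2.0
    lower, upper = np.quantile(means, [alpha, 1.0 - alpha])
    return scores.mean(), lower, upper


# Dummy per-sentence quality scores for one language pair.
mean, lo, hi = bootstrap_ci([0.41, 0.38, 0.44, 0.40, 0.39])
print(f"quality = {mean:.3f}, 95% CI [{lo:.3f}, {hi:.3f}]")
```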
1141
- ### Model Categories
1142
-
1143
- Models are automatically categorized for fair comparison:
1144
-
1145
- - **🏒 Commercial**: Production translation systems (Google Translate, Azure, etc.)
1146
- - **πŸ”¬ Research**: Academic and research institution models (NLLB, M2M-100, etc.)
1147
- - **πŸ“Š Baseline**: Simple baseline and reference models
1148
- - **πŸ‘₯ Community**: User-submitted models and fine-tuned variants
1149
 
1150
  ## πŸ“Š Evaluation Metrics
1151
 
1152
  ### Primary Metrics
1153
- - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
1154
  - **BLEU**: Bilingual Evaluation Understudy (0-100)
1155
  - **ChrF**: Character-level F-score (0-1)
1156
 
1157
- ### Secondary Metrics
1158
- - **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
1159
- - **CER/WER**: Character/Word Error Rate (lower is better)
1160
- - **Length Ratio**: Prediction/reference length ratio
1161
 
1162
- All metrics include 95% confidence intervals.
1163
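The metric names listed above map onto standard scoring libraries. The diff does not show which backend `src/evaluation.py` actually uses, so treat the following as an illustration with `sacrebleu` and `jiwer` rather than the leaderboard's own scoring code; the strings are dummies purely to make the snippet runnable.

```python
import sacrebleu
import jiwer

predictions = ["this is a test", "another short sentence"]
references = ["this is a test", "another brief sentence"]

bleu = sacrebleu.corpus_bleu(predictions, [references]).score   # 0-100
chrf = sacrebleu.corpus_chrf(predictions, [references]).score   # sacrebleu reports ChrF on a 0-100 scale
cer = jiwer.cer(references, predictions)                        # lower is better
wer = jiwer.wer(references, predictions)                        # lower is better
length_ratio = sum(len(p.split()) for p in predictions) / sum(len(r.split()) for r in references)

print(f"BLEU {bleu:.2f} | ChrF {chrf:.2f} | CER {cer:.3f} | WER {wer:.3f} | len ratio {length_ratio:.2f}")
```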
 
1164
  ## πŸ”„ Submission Process
1165
 
1166
- ### Step 1: Download Scientific Test Set
1167
- 1. Click "Download Scientific Test Set" in the first tab
1168
- 2. Review test set adequacy and track breakdown
1169
- 3. Save the enhanced test set with statistical weights
1170
 
1171
  ### Step 2: Generate Predictions
1172
  1. Load the test set in your evaluation pipeline
1173
  2. For each row, translate `source_text` from `source_language` to `target_language`
1174
  3. Save results as CSV with columns: `sample_id`, `prediction`
1175
- 4. Optional: Add `category` column for automatic classification
1176
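Step 2 in code form, as a rough sketch: `translate()` below is a placeholder for whatever model you are evaluating and is not part of this repository.

```python
import pandas as pd


def translate(text: str, src: str, tgt: str) -> str:
    """Placeholder: echo the source; replace with your model call."""
    return text


test_set = pd.read_csv("salt_test_set.csv")  # file from the download tab

rows = []
for row in test_set.itertuples(index=False):
    rows.append({
        "sample_id": row.sample_id,
        "prediction": translate(row.source_text, row.source_language, row.target_language),
    })

pd.DataFrame(rows).to_csv("predictions.csv", index=False)
```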
 
1177
  ### Step 3: Submit & Evaluate
1178
- 1. Fill in detailed model information (improves categorization)
1179
  2. Upload your predictions file
1180
- 3. Review validation report with track-specific adequacy assessment
1181
- 4. Submit for scientific evaluation across all tracks
1182
 
1183
- ## πŸ“‹ Enhanced File Formats
1184
 
1185
- ### Scientific Test Set Format
1186
  ```csv
1187
- sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
1188
- salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
1189
- salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
1190
- salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
1191
  ```
1192
 
1193
  ### Predictions Format
1194
  ```csv
1195
- sample_id,prediction,category
1196
- salt_000001,"Amakuru ensi","community"
1197
- salt_000002,"Ibino nining?","community"
1198
- salt_000003,"Ejok nanu","community"
1199
  ```
1200
 
1201
- ## πŸ† Scientific Leaderboard Features
1202
-
1203
- ### Fair Comparison
1204
- - Models only compared within the same category and track
1205
- - Statistical significance testing prevents misleading rankings
1206
- - Confidence intervals show measurement uncertainty
1207
-
1208
- ### Cross-Track Analysis
1209
- - Consistency analysis across evaluation tracks
1210
- - Identification of model strengths and weaknesses
1211
- - Language-specific performance patterns
1212
-
1213
- ### Publication Quality
1214
- - All visualizations include error bars and statistical annotations
1215
- - Comprehensive methodology documentation
1216
- - Reproducible evaluation pipeline
1217
-
1218
- ## πŸ”¬ Statistical Interpretation Guide
1219
-
1220
- ### Confidence Intervals
1221
- - **Non-overlapping CIs**: Likely significant difference
1222
- - **Overlapping CIs**: May or may not be significant (requires formal testing)
1223
- - **Wide CIs**: High uncertainty (need more data)
1224
-
1225
- ### Effect Sizes
1226
- - **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
1227
- - **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
1228
- - **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
1229
- - **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
1230
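The effect-size thresholds above refer to Cohen's d; the exact computation in this codebase is not shown in the diff, so here is a minimal sketch using the standard pooled-standard-deviation form, with dummy scores.

```python
import numpy as np


def cohens_d(scores_a, scores_b):
    """Cohen's d with a pooled standard deviation."""
    a, b = np.asarray(scores_a, float), np.asarray(scores_b, float)
    pooled_sd = np.sqrt(
        ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (len(a) + len(b) - 2)
    )
    return (a.mean() - b.mean()) / pooled_sd


# Per-sentence quality scores of two models on the same samples (dummy values).
d = cohens_d([0.42, 0.39, 0.45, 0.41], [0.36, 0.38, 0.40, 0.37])
print(f"Cohen's d = {d:.2f}")  # compare against the thresholds listed above
```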
-
1231
- ### Statistical Adequacy
1232
- - **Excellent**: High statistical power (>0.8) for all comparisons
1233
- - **Good**: Adequate power for most comparisons
1234
- - **Fair**: Limited power, interpret with caution
1235
- - **Insufficient**: Results not reliable for scientific conclusions
1236
-
1237
- ## 🀝 Contributing to Science
1238
 
1239
  This leaderboard is designed for the research community. When using results:
1240
 
1241
- 1. **Always report confidence intervals** along with point estimates
1242
- 2. **Acknowledge statistical adequacy** when interpreting results
1243
- 3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
1244
- 4. **Consider effect sizes** not just statistical significance
1245
-
1246
- ## πŸ“„ Citation
1247
-
1248
- If you use this leaderboard in your research, please cite:
1249
-
1250
- ```bibtex
1251
- @misc{{salt_leaderboard_scientific_2024,
1252
- title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
1253
- author={{Sunbird AI}},
1254
- year={{2024}},
1255
- url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
1256
- note={{Three-tier evaluation system with statistical significance testing}}
1257
- }}
1258
- ```
1259
-
1260
- ## πŸ”— Related Resources
1261
-
1262
- - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
1263
- - **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
1264
- - **Statistical Methodology**: See our technical paper on rigorous MT evaluation
1265
- - **Open Source Code**: Available on GitHub for reproducibility
1266
 
1267
  ---
1268
 
1269
- *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
1270
  """)
1271
 
1272
- # Event handlers with enhanced scientific functionality
1273
  predictions_validated = gr.State(value=None)
1274
- validation_info_state = gr.State(value=None)
1275
  detected_category_state = gr.State(value="community")
1276
 
1277
  # Download test set
1278
  download_btn.click(
1279
- fn=download_scientific_test_set,
1280
  outputs=[download_file, download_info]
1281
  )
1282
 
1283
  # Validate predictions
1284
- def handle_scientific_validation(file, model_name, author, description):
1285
- report, predictions, category = validate_scientific_submission(file, model_name, author, description)
1286
-
1287
- # Enable button if predictions are available (allows evaluation with limitations)
1288
  can_evaluate = predictions is not None
1289
 
1290
- # Add user-friendly button status message to report
1291
  if can_evaluate:
1292
- if "πŸŽ‰ **Final Verdict**: Ready for scientific evaluation!" in report:
1293
- button_status = "\n\nβœ… **Button Status**: Ready to submit for evaluation!"
1294
- elif "⚠️ **Final Verdict**: Can be evaluated with limitations" in report:
1295
- button_status = "\n\n⚠️ **Button Status**: Can submit for evaluation (results will include limitations note)"
1296
- else:
1297
- button_status = "\n\nβœ… **Button Status**: Evaluation possible"
1298
  else:
1299
- button_status = "\n\n❌ **Button Status**: Please fix issues above before evaluation"
1300
 
1301
  enhanced_report = report + button_status
1302
 
1303
  return (
1304
  enhanced_report,
1305
  predictions,
1306
- {"category": category, "validation_passed": can_evaluate},
1307
  category,
1308
  gr.update(interactive=can_evaluate)
1309
  )
1310
 
1311
  validate_btn.click(
1312
- fn=handle_scientific_validation,
1313
  inputs=[predictions_file, model_name_input, author_input, description_input],
1314
- outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
1315
  )
1316
 
1317
  # Submit for evaluation
1318
- def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
1319
- if predictions is None:
1320
- return "❌ Please validate your submission first", None, None, None
1321
-
1322
- result = evaluate_scientific_submission(
1323
- predictions, model_name, author, description, category, validation_info
1324
- )
1325
-
1326
- # After successful evaluation, update dropdown choices
1327
- global current_leaderboard
1328
- if current_leaderboard is not None and not current_leaderboard.empty:
1329
- model_choices = current_leaderboard['model_name'].unique().tolist()
1330
- else:
1331
- model_choices = []
1332
-
1333
- # Return the evaluation results plus updated dropdown choices
1334
- return result + (
1335
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None),
1336
- gr.CheckboxGroup(choices=model_choices, value=[])
1337
- )
1338
-
1339
  submit_btn.click(
1340
- fn=handle_scientific_submission,
1341
- inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
1342
- outputs=[evaluation_output, results_table, submission_plot, cross_track_plot, model_select, comparison_models]
1343
  )
1344
 
1345
  # Track leaderboard refresh functions
1346
- def refresh_google_track(*args):
1347
- result = refresh_track_leaderboard("google_comparable", *args)
1348
- # Update dropdowns too
1349
- if current_leaderboard is not None and not current_leaderboard.empty:
1350
- model_choices = current_leaderboard['model_name'].unique().tolist()
1351
- else:
1352
- model_choices = []
1353
- return result + (
1354
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None),
1355
- gr.CheckboxGroup(choices=model_choices, value=[])
1356
- )
1357
-
1358
- def refresh_ug40_track(*args):
1359
- return refresh_track_leaderboard("ug40_complete", *args)
1360
-
1361
- def refresh_matrix_track(*args):
1362
- return refresh_track_leaderboard("language_pair_matrix", *args)
1363
-
1364
- # Google-Comparable Track
1365
  google_refresh.click(
1366
- fn=refresh_google_track,
1367
- inputs=[google_search, google_category, google_adequacy],
1368
- outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats, model_select, comparison_models]
1369
  )
1370
 
1371
- # UG40-Complete Track
1372
  ug40_refresh.click(
1373
- fn=refresh_ug40_track,
1374
- inputs=[ug40_search, ug40_category, ug40_adequacy],
1375
  outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
1376
  )
1377
 
1378
- # Language-Pair Matrix Track
1379
- matrix_refresh.click(
1380
- fn=refresh_matrix_track,
1381
- inputs=[matrix_search, matrix_category, matrix_adequacy],
1382
- outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
1383
  )
1384
 
1385
- # Model analysis
1386
- def handle_model_analysis(model_name, track):
1387
- if not model_name:
1388
- return "Please select a model for analysis", None, None
1389
-
1390
- print(f"Analyzing model: {model_name} for track: {track}")
1391
-
1392
- global current_leaderboard
1393
- if current_leaderboard is not None:
1394
- print(f"Available models: {current_leaderboard['model_name'].tolist()}")
1395
-
1396
- return get_scientific_model_details(model_name, track)
1397
-
1398
- analyze_btn.click(
1399
- fn=handle_model_analysis,
1400
- inputs=[model_select, track_select],
1401
- outputs=[model_details, model_analysis_plot, model_heatmap_plot]
1402
- )
1403
-
1404
- # Model comparison
1405
- compare_btn.click(
1406
- fn=perform_model_comparison,
1407
- inputs=[comparison_models, comparison_track, comparison_type],
1408
- outputs=[comparison_output, comparison_plot]
1409
- )
1410
-
1411
- # Load initial data and update dropdowns
1412
  def load_initial_data():
1413
- # Load initial Google track data
1414
- google_data = refresh_google_track("", "all", 0.0)
1415
-
1416
- # Update dropdown choices
1417
- if current_leaderboard is not None and not current_leaderboard.empty:
1418
- model_choices = current_leaderboard['model_name'].unique().tolist()
1419
- else:
1420
- model_choices = []
1421
-
1422
- return (
1423
- google_data[0], # google_leaderboard
1424
- google_data[1], # google_ranking_plot
1425
- google_data[2], # google_comparison_plot
1426
- google_data[3], # google_stats
1427
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None), # model_select
1428
- gr.CheckboxGroup(choices=model_choices, value=[]) # comparison_models
1429
- )
1430
 
1431
  demo.load(
1432
  fn=load_initial_data,
1433
- outputs=[
1434
- google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
1435
- model_select, comparison_models
1436
- ]
1437
  )
1438
 
1439
- # Launch the scientific application
1440
  if __name__ == "__main__":
1441
  demo.launch(
1442
  server_name="0.0.0.0",
 
3
  import sys
4
  import os
5
  from pathlib import Path
6
+ import traceback
7
+ from datetime import datetime
8
+ from typing import Optional, Dict, Tuple, List
9
 
10
  def setup_salt():
11
  """Clone and setup SALT library like in Colab."""
12
  try:
 
13
  import salt.dataset
14
  print("βœ… SALT library already available")
15
  return True
 
19
  print("πŸ“₯ Setting up SALT library...")
20
 
21
  try:
 
22
  salt_dir = Path("salt")
23
  if not salt_dir.exists():
24
  print("πŸ”„ Cloning SALT repository...")
 
28
  else:
29
  print("πŸ“ SALT repository already exists")
30
 
 
31
  salt_requirements = salt_dir / "requirements.txt"
32
  if salt_requirements.exists():
33
  print("πŸ“¦ Installing SALT requirements...")
 
35
  sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
36
  ])
37
 
 
38
  salt_path = str(salt_dir.absolute())
39
  if salt_path not in sys.path:
40
  sys.path.insert(0, salt_path)
41
  print(f"πŸ”— Added {salt_path} to Python path")
42
 
 
43
  import salt.dataset
44
  print("βœ… SALT library setup completed successfully")
45
  return True
 
49
  return False
50
 
51
  # Setup SALT on startup
52
+ print("πŸš€ Starting SALT Translation Leaderboard...")
53
  if not setup_salt():
54
  print("❌ Cannot continue without SALT library")
 
55
  sys.exit(1)
56
 
57
  import gradio as gr
58
  import pandas as pd
59
  import json
 
 
 
60
 
61
+ # Import our modules
62
  from src.test_set import (
63
+ get_public_test_set,
64
+ get_complete_test_set,
65
+ create_test_set_download
 
 
 
 
 
 
 
 
66
  )
67
+ from src.validation import validate_submission
68
+ from src.evaluation import evaluate_predictions, generate_evaluation_report
69
  from src.leaderboard import (
70
+ load_leaderboard,
71
+ add_model_to_leaderboard,
 
72
  get_track_leaderboard,
73
+ prepare_leaderboard_display
 
 
74
  )
75
  from src.plotting import (
76
+ create_leaderboard_plot,
77
+ create_language_pair_heatmap,
78
+ create_performance_comparison_plot,
79
+ create_language_pair_comparison_plot
 
 
 
 
 
 
 
 
 
 
80
  )
81
+ from src.utils import sanitize_model_name, get_all_language_pairs
82
  from config import *
83
 
84
  # Global variables for caching
85
  current_leaderboard = None
86
  public_test_set = None
87
  complete_test_set = None
 
88
 
89
+ def initialize_data():
90
+ """Initialize test sets and leaderboard data."""
91
+ global public_test_set, complete_test_set, current_leaderboard
92
 
93
  try:
94
+ print("πŸ“₯ Loading test sets...")
95
+ public_test_set = get_public_test_set()
96
+ complete_test_set = get_complete_test_set()
97
 
98
+ print("πŸ† Loading leaderboard...")
99
+ current_leaderboard = load_leaderboard()
 
 
100
 
101
+ print(f"βœ… Initialization complete!")
 
 
102
  print(f" - Test set: {len(public_test_set):,} samples")
 
 
103
  print(f" - Current models: {len(current_leaderboard)}")
104
 
105
  return True
106
 
107
  except Exception as e:
108
+ print(f"❌ Initialization failed: {e}")
109
  traceback.print_exc()
110
  return False
111
 
112
+ def download_test_set() -> Tuple[str, str]:
113
+ """Create downloadable test set and return file path and info."""
 
114
  try:
115
  global public_test_set
116
  if public_test_set is None:
117
+ public_test_set = get_public_test_set()
 
 
 
118
 
119
+ download_path, stats = create_test_set_download()
 
 
120
 
121
  info_msg = f"""
122
+ ## πŸ“₯ SALT Test Set Downloaded Successfully!
 
 
123
 
124
  ### πŸ“Š Dataset Statistics:
125
  - **Total Samples**: {stats['total_samples']:,}
126
  - **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
127
+ - **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
128
+ - **Language Pairs**: {stats.get('language_pairs', 0)}
129
 
130
  ### 🏁 Track Breakdown:
131
  """
132
 
133
  track_breakdown = stats.get('track_breakdown', {})
134
  for track_name, track_info in track_breakdown.items():
 
135
  info_msg += f"""
136
+ **{EVALUATION_TRACKS[track_name]['name']}**:
137
  - Samples: {track_info.get('total_samples', 0):,}
138
  - Language Pairs: {track_info.get('language_pairs', 0)}
 
 
139
  """
140
 
141
  info_msg += f"""
142
 
143
+ ### πŸ“‹ File Format:
144
  - `sample_id`: Unique identifier for each sample
145
  - `source_text`: Text to be translated
146
  - `source_language`: Source language code
147
  - `target_language`: Target language code
148
  - `domain`: Content domain (if available)
149
  - `google_comparable`: Whether this pair can be compared with Google Translate
 
 
150
 
151
+ ### πŸ”¬ Next Steps:
152
  1. **Run your model** on the source texts to generate translations
153
  2. **Create a predictions file** with columns: `sample_id`, `prediction`
154
+ 3. **Submit** your predictions using the submission tab
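 
 For example, assuming the download was saved as `salt_test_set.csv` (adjust the filename to whatever you actually saved), the file can be loaded and inspected with pandas:
 
 ```python
 import pandas as pd
 
 test_set = pd.read_csv("salt_test_set.csv")   # placeholder filename for your download
 print(test_set.head())
 print(test_set.columns.tolist())
 ```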
 
 
155
  """
156
 
157
  return download_path, info_msg
158
 
159
  except Exception as e:
160
+ error_msg = f"❌ Error creating test set download: {str(e)}"
161
  return None, error_msg
162
 
163
+ def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
164
+ """Validate uploaded prediction file."""
 
 
 
165
  try:
166
  if file is None:
167
  return "❌ Please upload a predictions file", None, "community"
 
183
  else:
184
  return "❌ Could not read uploaded file", None, "community"
185
 
186
+ filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"
 
 
 
 
 
187
 
 
188
  global complete_test_set
189
  if complete_test_set is None:
190
+ complete_test_set = get_complete_test_set()
191
 
192
+ validation_result = validate_submission(
 
193
  file_content, filename, complete_test_set, model_name, author, description
194
  )
195
 
196
  detected_category = validation_result.get("category", "community")
197
 
 
198
  if validation_result.get("can_evaluate", False):
199
  return validation_result["report"], validation_result["predictions"], detected_category
200
  else:
201
  return validation_result["report"], None, detected_category
202
 
203
  except Exception as e:
204
+ return f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"
 
 
 
 
205
 
206
+ def evaluate_submission(
207
  predictions_df: pd.DataFrame,
208
  model_name: str,
209
  author: str,
210
  description: str,
211
  detected_category: str,
 
212
  ) -> Tuple[str, pd.DataFrame, object, object]:
213
+ """Evaluate validated predictions."""
 
214
  try:
215
  if predictions_df is None:
216
  return "❌ No valid predictions to evaluate", None, None, None
217
 
 
218
  global complete_test_set, current_leaderboard
219
  if complete_test_set is None:
220
+ complete_test_set = get_complete_test_set()
221
 
222
+ print(f"πŸ”¬ Starting evaluation for {model_name}...")
223
+ evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)
 
 
 
224
 
225
+ if evaluation_results.get('error'):
226
+ return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None
 
227
 
228
+ print("πŸ† Adding to leaderboard...")
229
+ updated_leaderboard = add_model_to_leaderboard(
 
230
  model_name=sanitize_model_name(model_name),
231
  author=author or "Anonymous",
232
  evaluation_results=evaluation_results,
 
234
  description=description or ""
235
  )
236
 
 
237
  current_leaderboard = updated_leaderboard
238
 
239
+ report = generate_evaluation_report(evaluation_results, model_name)
 
240
 
241
  # Create visualizations
242
+ summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
 
 
 
243
  google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
244
+ display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")
245
 
 
246
  success_msg = f"""
247
+ ## πŸŽ‰ Evaluation Complete!
248
 
249
  ### πŸ“Š Model Information:
250
  - **Model**: {model_name}
251
  - **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
252
  - **Author**: {author or 'Anonymous'}
253
 
 
 
 
 
254
  {report}
255
  """
256
 
257
+ return success_msg, display_leaderboard, summary_plot, None
258
 
259
  except Exception as e:
260
+ error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
261
  return error_msg, None, None, None
262
 
263
+ def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
 
 
264
  """Refresh leaderboard for a specific track with filters."""
 
265
  try:
266
  global current_leaderboard
267
  if current_leaderboard is None:
268
+ current_leaderboard = load_leaderboard()
269
 
270
+ track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
 
 
271
 
272
  # Apply search filter
273
  if search_query and not track_leaderboard.empty:
274
+ query_lower = search_query.lower()
275
+ mask = (
276
+ track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
277
+ track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
278
+ )
279
+ track_leaderboard = track_leaderboard[mask]
 
 
280
 
281
+ display_df = prepare_leaderboard_display(track_leaderboard, track)
282
+ ranking_plot = create_leaderboard_plot(track_leaderboard, track)
283
+ comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
 
 
284
 
285
+ track_config = EVALUATION_TRACKS[track]
286
+ stats_text = f"""
 
 
 
 
287
  ### πŸ“Š {track_config['name']} Statistics
288
 
289
+ - **Total Models**: {len(track_leaderboard)}
290
+ - **Best Model**: {track_leaderboard.iloc[0]['model_name'] if not track_leaderboard.empty else 'None'}
291
+ - **Best Score**: {(track_leaderboard.iloc[0][f'{track}_quality'] if not track_leaderboard.empty else 0.0):.4f}
292
 
293
+ ### πŸ”¬ Track Information:
294
+ {track_config['description']}
295
+ """
 
 
296
 
297
  return display_df, ranking_plot, comparison_plot, stats_text
298
 
299
  except Exception as e:
300
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
301
  print(error_msg)
302
+ return pd.DataFrame(), None, None, error_msg
 
303
 
304
+ def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
305
+ """Get language pair comparison data and visualization."""
 
306
  try:
307
  global current_leaderboard
308
  if current_leaderboard is None:
309
+ return pd.DataFrame(), None
 
 
 
310
 
311
+ track_leaderboard = get_track_leaderboard(current_leaderboard, track)
 
312
 
313
+ if track_leaderboard.empty:
314
+ return pd.DataFrame(), None
 
 
 
315
 
316
+ # Create language pair comparison table
317
+ pairs_data = []
318
+ track_languages = EVALUATION_TRACKS[track]["languages"]
 
 
 
319
 
320
+ for src in track_languages:
321
+ for tgt in track_languages:
322
+ if src == tgt:
323
+ continue
 
 
 
324
 
325
+ pair_key = f"{src}_to_{tgt}"
326
+ pair_display = f"{LANGUAGE_NAMES.get(src, src)} β†’ {LANGUAGE_NAMES.get(tgt, tgt)}"
 
 
 
327
 
328
+ for _, model in track_leaderboard.iterrows():
329
+ # Extract detailed results if available
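        # Per-model pair metrics are assumed to be stored as a JSON string in the 'detailed_<track>'
        # column, keyed as pair_metrics -> '<src>_to_<tgt>' -> metric -> 'mean'.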
330
+ detailed_col = f'detailed_{track}'
331
+ if detailed_col in model and pd.notna(model[detailed_col]):
332
+ try:
333
+ detailed_results = json.loads(model[detailed_col])
334
+ pair_metrics = detailed_results.get('pair_metrics', {})
335
+
336
+ if pair_key in pair_metrics:
337
+ metrics = pair_metrics[pair_key]
338
+ pairs_data.append({
339
+ 'Language Pair': pair_display,
340
+ 'Model': model['model_name'],
341
+ 'Category': model['model_category'],
342
+ 'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
343
+ 'BLEU': metrics.get('bleu', {}).get('mean', 0),
344
+ 'ChrF': metrics.get('chrf', {}).get('mean', 0),
345
+ 'Samples': metrics.get('sample_count', 0)
346
+ })
347
+ except (json.JSONDecodeError, KeyError):
348
+ continue
349
+
350
+ pairs_df = pd.DataFrame(pairs_data)
351
+
352
+ if pairs_df.empty:
353
+ return pd.DataFrame(), None
354
+
355
+ # Create visualization
356
+ comparison_plot = create_language_pair_comparison_plot(pairs_df, track)
357
+
358
+ return pairs_df, comparison_plot
359
 
360
  except Exception as e:
361
+ print(f"Error in language pair comparison: {e}")
362
+ return pd.DataFrame(), None
 
 
 
363
 
364
  # Initialize data on startup
365
+ initialization_success = initialize_data()
 
366
 
367
+ # Create Gradio interface
368
  with gr.Blocks(
369
+ title="πŸ† SALT Translation Leaderboard",
370
  theme=gr.themes.Soft(),
371
  css="""
372
  .gradio-container {
373
  max-width: 1600px !important;
374
  margin: 0 auto;
375
  }
 
 
376
 
377
+ /* Force readable text in all themes */
378
+ .markdown, .gr-markdown, .gr-html {
379
+ color: var(--body-text-color) !important;
380
+ background: var(--background-fill-primary) !important;
 
 
 
381
  }
382
+
383
+ .markdown h1, .markdown h2, .markdown h3,
384
+ .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
385
+ color: var(--body-text-color) !important;
386
  }
387
+
388
+ .markdown p, .markdown li, .markdown strong,
389
+ .gr-markdown p, .gr-markdown li, .gr-markdown strong {
390
+ color: var(--body-text-color) !important;
391
  }
392
+
393
+ /* Table styling */
394
+ .dataframe, .gr-dataframe {
395
+ color: var(--body-text-color) !important;
396
+ background: var(--background-fill-primary) !important;
397
  }
398
+
399
+ /* Button and input styling */
400
+ .gr-button, .gr-textbox, .gr-dropdown {
401
+ color: var(--body-text-color) !important;
 
402
  }
403
 
404
+ /* Ensure plot backgrounds work in both themes */
405
+ .plot-container {
406
+ background: var(--background-fill-primary) !important;
 
 
 
 
 
407
  }
408
  """
409
  ) as demo:
410
 
411
+ # Header
412
+ gr.HTML("""
413
+ <div style="text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); color: white !important; border-radius: 10px;">
414
+ <h1 style="color: white !important;">πŸ† SALT Translation Leaderboard</h1>
415
+ <p style="color: white !important;"><strong>Rigorous Evaluation of Translation Models on Ugandan Languages</strong></p>
416
+ <p style="color: white !important;">Three-tier evaluation β€’ Statistical confidence intervals β€’ Research-grade analysis</p>
 
417
  </div>
418
  """)
419
 
420
  # Status indicator
421
  if initialization_success:
422
+ status_msg = "βœ… System initialized successfully"
 
 
423
  else:
424
  status_msg = "❌ System initialization failed - some features may not work"
425
 
426
  gr.Markdown(f"**System Status**: {status_msg}")
427
 
 
428
  with gr.Tabs():
429
 
430
  # Tab 1: Download Test Set
431
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
432
  gr.Markdown("""
433
+ ## πŸ“‹ Get the SALT Test Set
434
 
435
+ Download our test set for translation model evaluation.
436
  """)
437
 
438
+ download_btn = gr.Button("πŸ“₯ Download Test Set", variant="primary", size="lg")
 
439
 
440
  with gr.Row():
441
  with gr.Column():
442
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
443
  with gr.Column():
444
+ download_info = gr.Markdown()
445
 
446
  # Tab 2: Submit Predictions
447
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
448
  gr.Markdown("""
449
+ ## 🎯 Submit Your Model's Predictions
450
 
451
+ Upload predictions for evaluation across all tracks.
452
  """)
453
 
454
  with gr.Row():
 
470
  description_input = gr.Textbox(
471
  label="πŸ“„ Model Description",
472
  placeholder="Architecture, training data, special features...",
473
+ lines=4
 
474
  )
475
 
 
476
  predictions_file = gr.File(
477
  label="πŸ“‚ Predictions File",
478
  file_types=[".csv", ".tsv", ".json"]
479
  )
480
 
481
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
482
+ submit_btn = gr.Button("πŸš€ Submit for Evaluation", variant="primary", interactive=False)
483
 
484
  with gr.Column(scale=1):
 
485
  validation_output = gr.Markdown()
486
 
487
+ gr.Markdown("### πŸ† Evaluation Results")
488
+ evaluation_output = gr.Markdown()
 
 
 
489
 
490
  with gr.Row():
491
  with gr.Column():
492
+ submission_plot = gr.Plot(label="πŸ“ˆ Performance Analysis")
493
  with gr.Column():
494
+ results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard", interactive=False)
 
 
 
495
 
496
  # Tab 3: Google-Comparable Track
497
+ with gr.Tab("πŸ€– Google-Comparable Track", id="google_track"):
498
  gr.Markdown(f"""
499
+ ## {EVALUATION_TRACKS['google_comparable']['name']}
500
 
501
+ **{EVALUATION_TRACKS['google_comparable']['description']}**
502
 
503
+ This track evaluates models on language pairs supported by Google Translate,
504
  enabling direct comparison with commercial baselines.
 
 
 
 
505
  """)
506
 
507
  with gr.Row():
 
513
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
514
  value="all"
515
  )
 
 
 
 
 
516
  with gr.Column(scale=1):
517
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
518
 
519
+ google_stats = gr.Markdown()
 
520
 
521
  with gr.Row():
522
  with gr.Column():
523
+ google_ranking_plot = gr.Plot(label="πŸ† Rankings")
524
  with gr.Column():
525
+ google_comparison_plot = gr.Plot(label="πŸ“Š Performance Comparison")
526
 
527
+ google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
 
528
 
529
  # Tab 4: UG40-Complete Track
530
+ with gr.Tab("🌍 UG40-Complete Track", id="ug40_track"):
531
  gr.Markdown(f"""
532
+ ## {EVALUATION_TRACKS['ug40_complete']['name']}
533
 
534
+ **{EVALUATION_TRACKS['ug40_complete']['description']}**
535
 
536
+ This track evaluates models on all UG40 language pairs,
537
+ providing comprehensive assessment of Ugandan language translation capabilities.
 
 
 
 
538
  """)
539
 
540
  with gr.Row():
 
546
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
547
  value="all"
548
  )
 
 
 
 
 
549
  with gr.Column(scale=1):
550
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
551
 
552
+ ug40_stats = gr.Markdown()
 
553
 
554
  with gr.Row():
555
  with gr.Column():
556
+ ug40_ranking_plot = gr.Plot(label="πŸ† Rankings")
557
  with gr.Column():
558
+ ug40_comparison_plot = gr.Plot(label="πŸ“Š Performance Comparison")
559
 
560
+ ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
 
561
 
562
+ # Tab 5: Language Pair Analysis
563
+ with gr.Tab("πŸ“Š Language Pair Analysis", id="pairs_analysis"):
 
 
 
564
  gr.Markdown("""
565
+ ## πŸ“Š Language Pair Performance Analysis
566
 
567
+ Compare model performance across individual language pairs with detailed breakdowns.
 
568
  """)
569
 
570
  with gr.Row():
 
 
 
 
 
 
 
571
  with gr.Column(scale=1):
572
+ pairs_track_select = gr.Dropdown(
573
+ label="🏁 Select Track",
574
  choices=list(EVALUATION_TRACKS.keys()),
575
  value="google_comparable"
576
  )
577
+ with gr.Column(scale=1):
578
+ pairs_refresh = gr.Button("πŸ”„ Analyze Language Pairs", variant="primary")
 
 
 
 
 
 
 
579
 
580
+ pairs_comparison_plot = gr.Plot(label="πŸ“Š Language Pair Comparison")
581
+ pairs_table = gr.Dataframe(label="πŸ“ˆ Language Pair Performance", interactive=False)
582
 
583
+ # Tab 6: Documentation
584
+ with gr.Tab("πŸ“š Documentation", id="docs"):
585
  gr.Markdown(f"""
586
+ # πŸ“– SALT Translation Leaderboard Documentation
587
 
588
  ## 🎯 Overview
589
 
590
+ The SALT Translation Leaderboard provides rigorous evaluation of translation models
591
+ on Ugandan languages using separate evaluation tracks for fair comparison.
592
 
593
+ ## 🏁 Evaluation Tracks
 
 
594
 
595
  **1. πŸ€– Google-Comparable Track**
596
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
 
597
  - **Purpose**: Fair comparison with commercial translation systems
598
+ - **Language Pairs**: {len([1 for src in GOOGLE_SUPPORTED_LANGUAGES for tgt in GOOGLE_SUPPORTED_LANGUAGES if src != tgt])}
599
 
600
  **2. 🌍 UG40-Complete Track**
601
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
 
602
  - **Purpose**: Comprehensive Ugandan language capability assessment
603
+ - **Language Pairs**: {len([1 for src in ALL_UG40_LANGUAGES for tgt in ALL_UG40_LANGUAGES if src != tgt])}
 
 
604
 
605
  ## πŸ“Š Evaluation Metrics
606
 
607
  ### Primary Metrics
608
+ - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates (an illustrative combination is sketched below)
609
  - **BLEU**: Bilingual Evaluation Understudy (0-100)
610
  - **ChrF**: Character-level F-score (0-1)
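 
 For intuition, a composite of this kind can be built by rescaling BLEU to the 0-1 range and blending it with ChrF and an error-rate penalty. The weights below are purely illustrative and not necessarily the exact formula used by this leaderboard's evaluator:
 
 ```python
 # Illustrative only - the evaluator may weight or normalise these differently.
 quality_score = 0.4 * (bleu / 100.0) + 0.4 * chrf + 0.2 * (1.0 - error_rate)
 ```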
611
 
612
+ ### Model Categories
 
 
 
613
 
614
+ Models are automatically categorized for fair comparison:
615
+
616
+ - **🏒 Commercial**: Production translation systems
617
+ - **πŸ”¬ Research**: Academic and research institution models
618
+ - **πŸ“Š Baseline**: Simple baseline and reference models
619
+ - **πŸ‘₯ Community**: User-submitted models
620
 
621
  ## πŸ”„ Submission Process
622
 
623
+ ### Step 1: Download Test Set
624
+ 1. Click "Download Test Set" in the first tab
625
+ 2. Save the test set file
 
626
 
627
  ### Step 2: Generate Predictions
628
  1. Load the test set in your evaluation pipeline
629
  2. For each row, translate `source_text` from `source_language` to `target_language`
630
  3. Save results as CSV with columns: `sample_id`, `prediction`
 
631
 
632
  ### Step 3: Submit & Evaluate
633
+ 1. Fill in model information
634
  2. Upload your predictions file
635
+ 3. Review validation report
636
+ 4. Submit for evaluation
637
 
638
+ ## πŸ“‹ File Formats
639
 
640
+ ### Test Set Format
641
  ```csv
642
+ sample_id,source_text,source_language,target_language,domain,google_comparable
643
+ salt_000001,"Hello world",eng,lug,general,true
644
+ salt_000002,"How are you?",eng,ach,conversation,true
 
645
  ```
646
 
647
  ### Predictions Format
648
  ```csv
649
+ sample_id,prediction
650
+ salt_000001,"Amakuru ensi"
651
+ salt_000002,"Ibino nining?"
 
652
  ```
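 
 A minimal script for producing a predictions file in this format might look like the following; the test set filename and the `translate` call are placeholders for your own files and model:
 
 ```python
 import pandas as pd
 
 test_set = pd.read_csv("salt_test_set.csv")   # placeholder filename for the downloaded test set
 rows = []
 for row in test_set.itertuples():
     # translate() stands in for your model's inference call
     rows.append((row.sample_id, translate(row.source_text, row.source_language, row.target_language)))
 
 pd.DataFrame(rows, columns=["sample_id", "prediction"]).to_csv("predictions.csv", index=False)
 ```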
653
 
654
+ ## 🀝 Contributing
 
 
 
655
 
656
  This leaderboard is designed for the research community. When using results:
657
 
658
+ 1. Consider the appropriate track for your comparison
659
+ 2. Report confidence intervals when available
660
+ 3. Acknowledge the model category in comparisons
 
 
661
 
662
  ---
663
 
664
+ *For questions, contact the team at [email protected]*
665
  """)
666
 
667
+ # Event handlers
668
  predictions_validated = gr.State(value=None)
 
669
  detected_category_state = gr.State(value="community")
670
 
671
  # Download test set
672
  download_btn.click(
673
+ fn=download_test_set,
674
  outputs=[download_file, download_info]
675
  )
676
 
677
  # Validate predictions
678
+ def handle_validation(file, model_name, author, description):
679
+ report, predictions, category = validate_submission_file(file, model_name, author, description)
 
 
680
  can_evaluate = predictions is not None
681
 
 
682
  if can_evaluate:
683
+ button_status = "\n\nβœ… **Ready to submit for evaluation!**"
 
 
 
 
 
684
  else:
685
+ button_status = "\n\n❌ **Please fix issues above before evaluation**"
686
 
687
  enhanced_report = report + button_status
688
 
689
  return (
690
  enhanced_report,
691
  predictions,
 
692
  category,
693
  gr.update(interactive=can_evaluate)
694
  )
695
 
696
  validate_btn.click(
697
+ fn=handle_validation,
698
  inputs=[predictions_file, model_name_input, author_input, description_input],
699
+ outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
700
  )
701
 
702
  # Submit for evaluation
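    # The trailing hidden gr.Plot simply absorbs the unused fourth value returned by evaluate_submission.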
 
 
703
  submit_btn.click(
704
+ fn=evaluate_submission,
705
+ inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
706
+ outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
707
  )
708
 
709
  # Track leaderboard refresh functions
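    # The lambda pins the track argument so the same refresh_track_leaderboard function serves both track tabs.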
 
 
710
  google_refresh.click(
711
+ fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
712
+ inputs=[google_search, google_category],
713
+ outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
714
  )
715
 
 
716
  ug40_refresh.click(
717
+ fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
718
+ inputs=[ug40_search, ug40_category],
719
  outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
720
  )
721
 
722
+ # Language pair analysis
723
+ pairs_refresh.click(
724
+ fn=get_language_pair_comparison,
725
+ inputs=[pairs_track_select],
726
+ outputs=[pairs_table, pairs_comparison_plot]
727
  )
728
 
729
+ # Load initial data
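    # demo.load runs once per page load and pre-populates the Google-comparable tab so the landing view is not empty.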
 
 
730
  def load_initial_data():
731
+ google_data = refresh_track_leaderboard("google_comparable", "", "all")
732
+ return google_data
 
 
733
 
734
  demo.load(
735
  fn=load_initial_data,
736
+ outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
 
 
 
737
  )
738
 
739
+ # Launch the application
740
  if __name__ == "__main__":
741
  demo.launch(
742
  server_name="0.0.0.0",