Upload 14 files

- README.md +60 -13
- app.py +1006 -0
- config.py +48 -0
- data.yaml +422 -0
- data_loader.py +133 -0
- download_external_models.py +124 -0
- example_usage.py +134 -0
- external_models.csv +31 -0
- extract_portuguese_leaderboard.py +195 -0
- manage_data.py +226 -0
- portuguese_leaderboard.csv +0 -0
- requirements.txt +8 -0
- run_app.py +73 -0
- validate_data.py +106 -0
README.md
CHANGED
@@ -1,13 +1,60 @@
# Napolab Leaderboard - Gradio App

A comprehensive Gradio web application for exploring and benchmarking Portuguese language models using the Napolab dataset collection.

## Features

- **📊 Benchmark Results**: Single comprehensive table with one column per dataset and clickable model links
- **📈 Model Analysis**: Radar chart showing model performance across all datasets

## Installation

1. Navigate to the leaderboard directory:
```bash
cd dev/napolab/leaderboard
```

2. Install the required dependencies:
```bash
pip install -r requirements.txt
```

3. Extract data from external sources (optional but recommended):
```bash
# Extract data from the Portuguese LLM Leaderboard
python extract_portuguese_leaderboard.py

# Download external models data
python download_external_models.py
```

4. Run the Gradio app:
```bash
python app.py
```

The app will be available at `http://localhost:7860`.

## Data Management

The app uses a YAML configuration file (`data.yaml`) for adding new data, making it easy to edit and maintain.
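For illustration, here is a minimal sketch of what a new `data.yaml` entry could look like, mirroring the `datasets` and `benchmark_results` structure already used in this repository. The dataset key, model name, and score below are hypothetical, and the snippet assumes PyYAML is available (the app's `data_loader` already parses `data.yaml`).

```python
# Hypothetical data.yaml fragment (dataset key, model, and score are examples only),
# parsed with PyYAML to show the structure the loader expects.
import yaml

example_entry = """
datasets:
  my_new_dataset:
    name: "My New Dataset"
    description: "Short description of the task"
    tasks: ["Classification"]
    url: "https://huggingface.co/datasets/your-org/my-new-dataset"

benchmark_results:
  my_new_dataset:
    bertimbau-base:
      accuracy: 0.850
"""

parsed = yaml.safe_load(example_entry)
print(parsed["datasets"]["my_new_dataset"]["name"])  # My New Dataset
```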

### Data Extraction Scripts

The leaderboard includes scripts to automatically extract and update data from external sources:

#### `extract_portuguese_leaderboard.py`
This script extracts benchmark results from the Open Portuguese LLM Leaderboard (a sketch of the CSV columns the app reads follows this list):
- Fetches data from the Hugging Face Spaces leaderboard
- Updates the `portuguese_leaderboard.csv` file
- Includes both open-source and proprietary models
- Automatically handles data formatting and validation
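As a rough illustration (not part of the script itself), `app.py` expects `portuguese_leaderboard.csv` to expose at least the columns checked below; the sketch assumes the script has already been run so the file exists.

```python
# Minimal check of the columns app.py reads from portuguese_leaderboard.csv.
import pandas as pd

df = pd.read_csv("portuguese_leaderboard.csv")
expected = ["model_name", "assin2_rte", "assin2_sts", "faquad_nli", "hatebr_offensive"]
print([col for col in expected if col not in df.columns])  # [] when the file is up to date
```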

#### `download_external_models.py`
This script downloads additional model data (see the sketch after this list for how the `link` column is used):
- Fetches model metadata from various sources
- Updates the `external_models.csv` file
- Includes model links and performance metrics
- Ensures data consistency with the main leaderboard
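A minimal sketch of how a row of `external_models.csv` can be turned into the clickable model name shown in the leaderboard table, mirroring what `app.py` does with the `model` and `link` columns; it assumes the CSV has already been downloaded and is non-empty.

```python
# Render the first external model as a markdown link, as the benchmark table does.
import pandas as pd

df = pd.read_csv("external_models.csv")
row = df.iloc[0]
model_display = f"[{row['model']}]({row['link']})" if pd.notna(row["link"]) else row["model"]
print(model_display)
```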

**Note**: These scripts require an internet connection and may take a few minutes to complete. Run them periodically to keep the leaderboard data up to date.
app.py
ADDED
@@ -0,0 +1,1006 @@
import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
from typing import Dict, List, Optional, Tuple

# Import data loader
from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata

# Load data from YAML file
NAPOLAB_DATASETS = get_napolab_datasets()
SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
MODEL_METADATA = get_model_metadata()

def load_portuguese_leaderboard_data() -> pd.DataFrame:
    """Load data from the Portuguese leaderboard CSV file."""
    try:
        csv_path = "portuguese_leaderboard.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'portuguese_leaderboard'

            print(f"Loaded {len(df)} models from Portuguese leaderboard")
            return df
        else:
            print(f"Portuguese leaderboard CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading Portuguese leaderboard data: {e}")
        return pd.DataFrame()

def load_external_models_data() -> pd.DataFrame:
    """Load data from the external models CSV file."""
    try:
        csv_path = "external_models.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'model': 'model_name',
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'external_models'

            print(f"Loaded {len(df)} external models")
            return df
        else:
            print(f"External models CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading external models data: {e}")
        return pd.DataFrame()

# Load Portuguese leaderboard data
PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()

# Load external models data
EXTERNAL_MODELS_DATA = load_external_models_data()

def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
    """Create a simplified benchmark table with one column per dataset."""
    # Get all dataset names
    dataset_names = sorted(NAPOLAB_DATASETS.keys())
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = dataset_names

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        for model_name, metrics in models.items():
            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'existing'
                }

            # Calculate average performance for this dataset
            avg_performance = np.mean(list(metrics.values()))
            model_data[model_name]['dataset_scores'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': row.get('link', ''),
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Create table data
    table_data = []

    for model_name, data in model_data.items():
        # Apply source filtering
        source = data['source']

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        # Create clickable link for model name
        if data['url']:
            model_display = f"[{model_name}]({data['url']})"
        elif source == 'portuguese_leaderboard' and '/' in model_name:
            # Create Hugging Face link for Portuguese leaderboard models with slashes
            huggingface_url = f"https://huggingface.co/{model_name}"
            model_display = f"[{model_name}]({huggingface_url})"
        else:
            model_display = model_name

        # Create row with dataset scores
        row_data = {'Model': model_display}

        # Calculate average only over selected datasets
        selected_scores = []
        for dataset_name in selected_datasets:
            score = data['dataset_scores'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores in average
                selected_scores.append(score)

        overall_avg = np.mean(selected_scores) if selected_scores else 0
        row_data['Average'] = round(overall_avg, 4)

        # Add scores for each dataset (only selected ones)
        for dataset_name in dataset_names:
            score = data['dataset_scores'].get(dataset_name, 0)
            display_name = dataset_display_names[dataset_names.index(dataset_name)]
            # Only add columns for selected datasets
            if dataset_name in selected_datasets:
                row_data[display_name] = round(score, 4)

        table_data.append(row_data)

    df = pd.DataFrame(table_data)

    # Filter to show only models that have scores for at least one selected dataset
    if selected_datasets and not df.empty:
        # Get display names for selected datasets
        selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]

        # Filter models based on selection criteria
        models_to_keep = []
        for _, row in df.iterrows():
            has_score = False
            has_all_scores = True

            # Only check the datasets that are actually selected for display
            for dataset_name in selected_datasets:
                display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                if display_name in df.columns:
                    score = row[display_name]
                    if score > 0:
                        has_score = True
                    else:
                        has_all_scores = False

            # Keep model if it has at least one score
            if has_score:
                # If hide_incomplete_models is True, only keep models with all scores in selected datasets
                if not hide_incomplete_models or has_all_scores:
                    models_to_keep.append(row['Model'])

        # Filter dataframe to only include selected models
        if models_to_keep:
            df = df[df['Model'].isin(models_to_keep)]
        else:
            # If no models to keep, create empty DataFrame with proper structure
            # Create columns list first
            columns = ['Model']
            for dataset_name in dataset_names:
                display_name = dataset_display_names[dataset_names.index(dataset_name)]
                if dataset_name in selected_datasets:
                    columns.append(display_name)
            columns.append('Average')

            # Create empty DataFrame with correct columns
            df = pd.DataFrame(columns=columns)

    # Filter by minimum average performance
    if min_average_performance > 0 and not df.empty:
        df = df[df['Average'] >= min_average_performance]

    # Filter by search query
    if search_query and not df.empty:
        # Extract model names from markdown links for searching
        df_filtered = df.copy()
        df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
        df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
        df = df_filtered.drop('model_name_clean', axis=1)

    # Sort by Average (descending)
    if not df.empty:
        df = df.sort_values('Average', ascending=False)

    # Add rank column with medal emojis for top 3 and color-coded emojis for others
    if not df.empty:
        df = df.reset_index(drop=True)
        df.index = df.index + 1  # Start ranking from 1

        # Create rank column with medal emojis and color-coded emojis
        rank_column = []
        total_models = len(df)

        for rank in df.index:
            if rank == 1:
                rank_column.append("🥇 1")
            elif rank == 2:
                rank_column.append("🥈 2")
            elif rank == 3:
                rank_column.append("🥉 3")
            else:
                # Color-code based on position relative to total
                position_ratio = rank / total_models
                if position_ratio <= 0.33:  # Top third
                    rank_column.append("🟢 " + str(rank))
                elif position_ratio <= 0.67:  # Middle third
                    rank_column.append("🟡 " + str(rank))
                else:  # Bottom third
                    rank_column.append("🔴 " + str(rank))

        df.insert(0, 'Rank', rank_column)

    return df


# Global variable to track the current CSV file
current_csv_file = None

def export_csv(df: pd.DataFrame):
    """Export the benchmark table to CSV."""
    global current_csv_file

    print(f"Export function called with dataframe shape: {df.shape}")

    if df.empty:
        print("Dataframe is empty, returning None")
        return None

    # Clean up previous file if it exists
    if current_csv_file:
        try:
            import os
            if os.path.exists(current_csv_file):
                os.remove(current_csv_file)
                print(f"Deleted previous CSV file: {current_csv_file}")
        except Exception as e:
            print(f"Error deleting previous file {current_csv_file}: {e}")

    # Clean the dataframe for CSV export
    df_clean = df.copy()

    # Remove markdown formatting from model names for cleaner CSV
    df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)

    # Create filename with timestamp
    from datetime import datetime
    import tempfile
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"napolab_benchmark_results_{timestamp}.csv"

    # Create file in current directory (simpler approach)
    file_path = filename

    print(f"Creating CSV file at: {file_path}")

    # Save to CSV file
    df_clean.to_csv(file_path, index=False)

    print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")

    # Update current file tracking
    current_csv_file = file_path

    return file_path

def cleanup_current_csv():
    """Clean up the current CSV file after download."""
    global current_csv_file
    import os

    if current_csv_file and os.path.exists(current_csv_file):
        try:
            os.remove(current_csv_file)
            print(f"Deleted CSV file after download: {current_csv_file}")
            current_csv_file = None
        except Exception as e:
            print(f"Error deleting file {current_csv_file}: {e}")


def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
    """Create a radar chart showing model performance across all datasets."""
    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = list(NAPOLAB_DATASETS.keys())

    # Get dataset names for the radar axes (only selected ones)
    dataset_names = selected_datasets
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        if dataset_name in selected_datasets:
            for model_name, metrics in models.items():
                if model_name not in model_data:
                    model_data[model_name] = {
                        'performances': {},
                        'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
                        'source': 'existing'
                    }

                # Calculate average performance for this dataset
                avg_performance = np.mean(list(metrics.values()))
                model_data[model_name]['performances'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Apply source filtering
    filtered_model_data = {}
    for model_name, data in model_data.items():
        source = data.get('source', 'existing')

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        filtered_model_data[model_name] = data

    # Apply incomplete model filtering
    if hide_incomplete_models and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            has_all_scores = True
            for dataset_name in selected_datasets:
                if data['performances'].get(dataset_name, 0) == 0:
                    has_all_scores = False
                    break
            if has_all_scores:
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply minimum average performance filtering
    if min_average_performance > 0 and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            # Calculate average performance for selected datasets
            scores = []
            for dataset_name in selected_datasets:
                score = data['performances'].get(dataset_name, 0)
                if score > 0:  # Only include non-zero scores
                    scores.append(score)

            if scores:
                avg_performance = np.mean(scores)
                if avg_performance >= min_average_performance:
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply search query filtering
    if search_query:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            if search_query.lower() in model_name.lower():
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Sort models by average performance (descending)
    model_performances = []
    for model_name, data in filtered_model_data.items():
        # Calculate average performance for selected datasets
        scores = []
        for dataset_name in selected_datasets:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                scores.append(score)

        avg_performance = np.mean(scores) if scores else 0
        model_performances.append((model_name, data, avg_performance))

    # Sort by average performance (descending)
    model_performances.sort(key=lambda x: x[2], reverse=True)

    # Create radar chart
    fig = go.Figure()

    # Generate a dynamic color palette based on the number of models
    num_models = len(model_performances)
    if num_models <= 10:
        # Use a qualitative color palette for small numbers
        colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel1 + px.colors.qualitative.Dark2
    else:
        # Use a continuous color palette for larger numbers
        colors = px.colors.sequential.Viridis + px.colors.sequential.Plasma + px.colors.sequential.Inferno

    # Ensure we have enough colors
    while len(colors) < num_models:
        colors.extend(colors)

    for i, (model_name, data, avg_performance) in enumerate(model_performances):
        # Get performance values for all datasets (fill with 0 if missing)
        performance_values = []
        for dataset_name in dataset_names:
            performance_values.append(data['performances'].get(dataset_name, 0))

        # Assign color based on model index for better differentiation
        color = colors[i % len(colors)]

        # Show first two models by default, hide the rest
        visible = True if i < 2 else 'legendonly'

        fig.add_trace(go.Scatterpolar(
            r=performance_values,
            theta=dataset_display_names,
            fill='toself',
            name=model_name,
            line_color=color,
            opacity=0.6,
            visible=visible,
            hovertemplate=(
                "<b>%{fullData.name}</b><br>" +
                "Dataset: %{theta}<br>" +
                "Performance: %{r:.3f}<br>" +
                "Architecture: " + data['architecture'] + "<br>" +
                "<extra></extra>"
            )
        ))

    # Update layout
    fig.update_layout(
        title="Model Performance Radar Chart - All Datasets",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0.6, 1],
                ticktext=['0.6', '0.7', '0.8', '0.9', '1.0'],
                tickvals=[0.6, 0.7, 0.8, 0.9, 1.0]
            ),
            angularaxis=dict(
                tickmode='array',
                tickvals=list(range(len(dataset_display_names))),
                ticktext=dataset_display_names
            )
        ),
        height=700,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.2)',
            borderwidth=1,
            orientation="h"
        ),
        margin=dict(l=50, r=50, t=100, b=100)
    )

    return fig

# Gradio Interface
with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🏆 Napolab Leaderboard

    Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks.
    [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
    """)

    with gr.Tabs():

        # Benchmark Results Tab
        with gr.Tab("📊 Benchmark Results"):
            gr.Markdown("### Model Performance Benchmarks")

            with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True  # Default to selected
                        )
                        dataset_checkboxes.append((dataset_name, checkbox))

            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )
                    show_teenytinyllama = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )
                    show_portuguese_leaderboard = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models
            search_query = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            benchmark_table = gr.DataFrame(
                label="Model Performance Benchmarks",
                wrap=[True, False, False, False, False, False, False, False, False, False],
                interactive=False,
                datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
                column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
            )

            gr.Markdown("*🥇🥈🥉 = Top 3 | 🟢 = Top 33% | 🟡 = Middle 33% | 🔴 = Bottom 33%*")

            # Export to CSV button and file component
            export_button = gr.Button("📥 Export to CSV", variant="secondary")
            csv_file = gr.File(label="Download CSV", interactive=False, visible=True)

        # Model Analysis Tab
        with gr.Tab("📈 Model Analysis"):
            gr.Markdown("### Model Performance Radar Chart")

            # Dataset Selection Controls
            with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    analysis_dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True
                        )
                        analysis_dataset_checkboxes.append((dataset_name, checkbox))

            # Filter Controls
            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models_analysis = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance_analysis = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis_analysis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )

                    show_teenytinyllama_analysis = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )

                    show_portuguese_leaderboard_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models in radar chart
            search_query_analysis = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")

            gr.Markdown("""
            **How to interact with the chart:**
            - **Click on legend items** to show/hide specific models.
            - **Double-click on a legend item** to isolate that model (hide all others).
            - **Double-click again** to show all models.

            Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
            """)

        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About Napolab

            **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.

            For more information, please visit the [GitHub repository](https://github.com/ruanchaves/napolab) and the [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab).

            ### Data Sources:
            The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:

            **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)

            **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).

            **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by Corrêa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).

            ### Thesis Citation:
            ```bibtex
            @mastersthesis{chaves2023lessons,
              title={Lessons learned from the evaluation of Portuguese language models},
              author={Chaves Rodrigues, Ruan},
              year={2023},
              school={University of Malta},
              url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
            }
            ```

            ### Napolab Citation:
            ```bibtex
            @software{Chaves_Rodrigues_napolab_2023,
              author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
              doi = {10.5281/zenodo.7781848},
              month = {3},
              title = {{Natural Portuguese Language Benchmark (Napolab)}},
              url = {https://github.com/ruanchaves/napolab},
              version = {1.0.0},
              year = {2023}
            }
            ```
            """)

    # Event handlers
    def update_radar_chart(*args):
        # Extract arguments for radar chart
        dataset_values = args[:len(analysis_dataset_checkboxes)]
        hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
        min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
        show_external_models = args[len(analysis_dataset_checkboxes) + 5]
        search_query = args[len(analysis_dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

    def update_benchmark_table(*args):
        # Extract arguments
        dataset_values = args[:len(dataset_checkboxes)]
        hide_incomplete_models = args[len(dataset_checkboxes)]
        min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
        show_external_models = args[len(dataset_checkboxes) + 5]
        search_query = args[len(dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

        return df

    # Connect events
    # Load model analysis chart on app start
    app.load(lambda: update_radar_chart(*([True] * len(analysis_dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=model_analysis_chart)

    # Load benchmark table on app start
    app.load(lambda: update_benchmark_table(*([True] * len(dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=benchmark_table)

    # Connect dataset checkboxes to update table
    for dataset_name, checkbox in dataset_checkboxes:
        checkbox.change(
            update_benchmark_table,
            inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
            outputs=benchmark_table
        )

    hide_incomplete_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    min_average_performance.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_napolab_thesis.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_teenytinyllama.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_portuguese_leaderboard.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_external_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect search query to update table
    search_query.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect export button
    export_button.click(
        export_csv,
        inputs=benchmark_table,
        outputs=csv_file
    )

    # Connect file download to cleanup
    csv_file.change(
        cleanup_current_csv,
        inputs=None,
        outputs=None
    )

    # Connect analysis chart events
    # Connect dataset checkboxes to update radar chart
    for dataset_name, checkbox in analysis_dataset_checkboxes:
        checkbox.change(
            update_radar_chart,
            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
            outputs=model_analysis_chart
        )

    hide_incomplete_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    min_average_performance_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_napolab_thesis_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_teenytinyllama_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_portuguese_leaderboard_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_external_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    # Connect search query to update radar chart
    search_query_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

if __name__ == "__main__":
    app.launch(share=True, server_name="0.0.0.0", server_port=7860)
config.py
ADDED
@@ -0,0 +1,48 @@
"""
Configuration file for the Napolab Leaderboard Gradio App
"""

# App Configuration
APP_TITLE = "Napolab Leaderboard"
APP_DESCRIPTION = "Natural Portuguese Language Benchmark Leaderboard"
APP_THEME = "soft"
APP_PORT = 7860
APP_HOST = "0.0.0.0"
APP_SHARE = True

# Dataset Configuration
DEFAULT_DATASET = "assin"
DEFAULT_SPLIT = "test"
DEFAULT_SAMPLES = 5
MAX_SAMPLES = 20

# Chart Configuration
CHART_HEIGHT = 400
OVERVIEW_CHART_HEIGHT = 600
CHART_COLORS = {
    "primary": "#1f77b4",
    "secondary": "#ff7f0e",
    "success": "#2ca02c",
    "warning": "#d62728"
}

# Model Configuration
DEFAULT_MODELS_TO_COMPARE = 2

# Cache Configuration
CACHE_DURATION = 3600  # 1 hour in seconds

# Error Messages
ERROR_MESSAGES = {
    "dataset_load": "Error loading dataset. Please check your internet connection.",
    "no_benchmark": "No benchmark data available for this dataset.",
    "no_models": "No models found for comparison.",
    "invalid_selection": "Invalid selection. Please try again."
}

# Links
LINKS = {
    "github": "https://github.com/ruanchaves/napolab",
    "huggingface_dataset": "https://huggingface.co/datasets/ruanchaves/napolab",
    "open_pt_llm_leaderboard": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
}
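The constants in `config.py` are not imported by `app.py` in this upload (the launch call at the bottom of `app.py` hard-codes the same host, port, and share settings), so the snippet below is only a sketch of one way they could be wired in, assuming the names above are kept.

```python
# Hypothetical use of config.py; app.py currently passes these values directly.
from config import APP_HOST, APP_PORT, APP_SHARE

def launch(app):
    # Equivalent to the hard-coded call at the bottom of app.py.
    app.launch(share=APP_SHARE, server_name=APP_HOST, server_port=APP_PORT)
```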
data.yaml
ADDED
@@ -0,0 +1,422 @@
# Napolab Leaderboard Data Configuration
# This file contains all datasets and benchmark results for the Gradio app
#
# Data Source: "Lessons learned from the evaluation of Portuguese language models"
# by Ruan Chaves Rodrigues (2023) - Master's dissertation, University of Malta
# Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557

# Data Sources
sources:
  napolab_thesis:
    name: "Napolab Thesis"
    description: "Lessons learned from the evaluation of Portuguese language models"
    author: "Ruan Chaves Rodrigues"
    year: 2023
    url: "https://www.um.edu.mt/library/oar/handle/123456789/120557"
    institution: "University of Malta"

  open_pt_llm_leaderboard:
    name: "Open PT LLM Leaderboard"
    description: "Large Language Models on Portuguese Benchmarks"
    author: "Eduardo Garcia"
    year: 2025
    url: "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
    platform: "Hugging Face Spaces"

  teenytinyllama_paper:
    name: "TeenyTinyLlama Paper"
    description: "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"
    authors: ["Corrêa, Nicholas Kluge", "Falk, Sophia", "Fatimah, Shiza", "Sen, Aniket", "De Oliveira, Nythamar"]
    year: 2024
    journal: "Machine Learning with Applications"
    doi: "10.1016/j.mlwa.2024.100558"

# Dataset Information
datasets:
  assin_rte:
    name: "ASSIN RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin"

  assin_sts:
    name: "ASSIN STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin"

  assin2_rte:
    name: "ASSIN 2 RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin2"

  assin2_sts:
    name: "ASSIN 2 STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin2"

  faquad-nli:
    name: "FaQUaD-NLI"
    description: "Factual Question Answering and Natural Language Inference"
    tasks: ["NLI"]
    url: "https://huggingface.co/datasets/ruanchaves/faquad-nli"

  hatebr:
    name: "HateBR"
    description: "Hate Speech Detection in Brazilian Portuguese"
    tasks: ["Classification"]
    url: "https://huggingface.co/datasets/ruanchaves/hatebr"

  porsimplessent:
    name: "PorSimplesSent"
    description: "Portuguese Simple Sentences Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/porsimplessent"

  reli-sa:
    name: "Reli-SA"
    description: "Religious Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/reli-sa"

# Benchmark Results
benchmark_results:
  assin_rte:
    albertina-pt-pt:
      accuracy: 0.887
    albertina-pt-br:
      accuracy: 0.844
    deberta-v2-large:
      accuracy: 0.864
    xlm-roberta-large:
      accuracy: 0.874
    mdeberta-v3-base:
      accuracy: 0.863
    bertimbau-large:
      accuracy: 0.838
    bert-large:
      accuracy: 0.802
    bertimbau-base:
      accuracy: 0.828
    bert-multilingual-base:
      accuracy: 0.815
    xlm-roberta-base:
      accuracy: 0.822
    bertinho:
      accuracy: 0.786
    ixaes:
      accuracy: 0.782

  assin_sts:
    albertina-pt-pt:
      accuracy: 0.874
    albertina-pt-br:
      accuracy: 0.883
    deberta-v2-large:
      accuracy: 0.861
    xlm-roberta-large:
      accuracy: 0.863
    mdeberta-v3-base:
      accuracy: 0.855
    bertimbau-large:
      accuracy: 0.826
    bert-large:
      accuracy: 0.822
    bertimbau-base:
      accuracy: 0.844
    bert-multilingual-base:
      accuracy: 0.820
    xlm-roberta-base:
      accuracy: 0.812
    bertinho:
      accuracy: 0.791
    ixaes:
      accuracy: 0.817

  assin2_rte:
    albertina-pt-pt:
      accuracy: 0.910
    albertina-pt-br:
      accuracy: 0.916
    deberta-v2-large:
      accuracy: 0.911
    xlm-roberta-large:
      accuracy: 0.910
    mdeberta-v3-base:
      accuracy: 0.904
    bertimbau-large:
      accuracy: 0.897
    bert-large:
      accuracy: 0.892
    bertimbau-base:
      accuracy: 0.884
    bert-multilingual-base:
      accuracy: 0.877
    xlm-roberta-base:
      accuracy: 0.875
    bertinho:
      accuracy: 0.855
    ixaes:
      accuracy: 0.879
    ttl-460m:
      accuracy: 0.8643
    ttl-160m:
      accuracy: 0.8578

  assin2_sts:
    deberta-v2-large:
      accuracy: 0.724
    mdeberta-v3-base:
      accuracy: 0.847
    bertimbau-large:
      accuracy: 0.855
    bert-large:
      accuracy: 0.792
    bertimbau-base:
      accuracy: 0.840
    bert-multilingual-base:
      accuracy: 0.827
    xlm-roberta-base:
      accuracy: 0.847
    bertinho:
      accuracy: 0.802
    ixaes:
      accuracy: 0.822

  faquad-nli:
    mdeberta-v3-base:
      accuracy: 0.889
    bertimbau-large:
      accuracy: 0.900
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.897
    bert-multilingual-base:
      accuracy: 0.865
    xlm-roberta-base:
      accuracy: 0.898
    bertinho:
      accuracy: 0.866
    ixaes:
      accuracy: 0.860
    ttl-460m:
      accuracy: 0.9118
    ttl-160m:
      accuracy: 0.9000

  hatebr:
    mdeberta-v3-base:
      accuracy: 0.911
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.871
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.879
    ixaes:
      accuracy: 0.872
    ttl-460m:
      accuracy: 0.9228
    ttl-160m:
      accuracy: 0.9071

  porsimplessent:
    mdeberta-v3-base:
      accuracy: 0.953
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.907
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.933
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.900
    ixaes:
      accuracy: 0.899

  reli-sa:
    mdeberta-v3-base:
      accuracy: 0.719
    bertimbau-large:
      accuracy: 0.745
    bert-large:
      accuracy: 0.629
    bertimbau-base:
      accuracy: 0.713
    bert-multilingual-base:
      accuracy: 0.642
    xlm-roberta-base:
      accuracy: 0.680
    bertinho:
      accuracy: 0.681
    ixaes:
      accuracy: 0.637

# Model Metadata
model_metadata:
  albertina-pt-pt:
    parameters: 125000000
    architecture: "Albertina PT:PT"
    base_model: "PORTULAN/albertina-ptpt"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    source: "napolab_thesis"

  albertina-pt-br:
    parameters: 125000000
    architecture: "Albertina PT:BR"
    base_model: "PORTULAN/albertina-ptbr"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"
    source: "napolab_thesis"

  deberta-v2-large:
    parameters: 900000000
    architecture: "DeBERTa v2 (large)"
    base_model: "microsoft/deberta-v2-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    source: "napolab_thesis"

  xlm-roberta-large:
    parameters: 550000000
    architecture: "XLM-RoBERTa (large)"
    base_model: "xlm-roberta-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-large"
    source: "napolab_thesis"

  mdeberta-v3-base:
    parameters: 86000000
    architecture: "mDeBERTa v3 (base)"
    base_model: "microsoft/mdeberta-v3-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"
    source: "napolab_thesis"

  bertimbau-large:
    parameters: 355000000
    architecture: "BERTimbau (large)"
    base_model: "neuralmind/bert-large-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    source: "napolab_thesis"

  bert-large:
    parameters: 355000000
    architecture: "BERT (large)"
    base_model: "bert-large-uncased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-large-uncased"
    source: "napolab_thesis"

  bertimbau-base:
    parameters: 110000000
    architecture: "BERTimbau (base)"
    base_model: "neuralmind/bert-base-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    source: "napolab_thesis"

  bert-multilingual-base:
    parameters: 110000000
    architecture: "BERT multilingual (base)"
    base_model: "bert-base-multilingual-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"
    source: "napolab_thesis"

  xlm-roberta-base:
    parameters: 270000000
    architecture: "XLM-RoBERTa (base)"
    base_model: "xlm-roberta-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-base"
    source: "napolab_thesis"

  bertinho:
    parameters: 110000000
    architecture: "Bertinho"
    base_model: "ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    source: "napolab_thesis"

  ixaes:
    parameters: 110000000
    architecture: "IXAes"
    base_model: "ixa-ehu/ixambert-base-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"
    source: "napolab_thesis"

  ttl-460m:
    parameters: 460000000
    architecture: "TeenyTinyLlama (460M)"
    base_model: "nicholasKluge/TeenyTinyLlama-460m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    source: "teenytinyllama_paper"

  ttl-160m:
    parameters: 160000000
    architecture: "TeenyTinyLlama (160M)"
    base_model: "nicholasKluge/TeenyTinyLlama-160m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
    source: "teenytinyllama_paper"

# Additional Models (for Model Hub tab)
additional_models:
  albertina_models:
    albertina-pt-pt:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    albertina-pt-br:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"

  deberta_models:
    deberta-v2-large:
      huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    mdeberta-v3-base:
      huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"

  roberta_models:
    xlm-roberta-large:
      huggingface_url: "https://huggingface.co/xlm-roberta-large"
    xlm-roberta-base:
      huggingface_url: "https://huggingface.co/xlm-roberta-base"

  bert_models:
    bertimbau-large:
      huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    bertimbau-base:
      huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    bert-large:
      huggingface_url: "https://huggingface.co/bert-large-uncased"
    bert-multilingual-base:
      huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"

  specialized_models:
    bertinho:
      huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    ixaes:
      huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"

  teenytinyllama_models:
    ttl-460m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    ttl-160m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
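For quick inspection outside the app, the file can also be read directly with PyYAML. A minimal sketch, assuming it is run from the leaderboard directory next to `data.yaml`:

```python
import yaml

with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Rank models on ASSIN 2 RTE by the accuracy values recorded above.
results = config["benchmark_results"]["assin2_rte"]
top = sorted(results.items(), key=lambda kv: kv[1]["accuracy"], reverse=True)[:3]
for model, metrics in top:
    print(f"{model}: {metrics['accuracy']:.3f}")
```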
data_loader.py
ADDED
@@ -0,0 +1,133 @@
"""
Data loader for Napolab Leaderboard
Loads datasets, benchmark results, and model metadata from YAML configuration files.
"""

import yaml
import os
from pathlib import Path
from typing import Dict, Any, Optional

class NapolabDataLoader:
    """Loads and manages Napolab data from YAML configuration files."""

    def __init__(self, data_file: str = "data.yaml"):
        """
        Initialize the data loader.

        Args:
            data_file: Path to the YAML data file
        """
        self.data_file = data_file
        self.data = None
        self.load_data()

    def load_data(self) -> None:
        """Load data from the YAML file."""
        try:
            # Get the directory where this script is located
            script_dir = Path(__file__).parent
            data_path = script_dir / self.data_file

            if not data_path.exists():
                raise FileNotFoundError(f"Data file not found: {data_path}")

            with open(data_path, 'r', encoding='utf-8') as file:
                self.data = yaml.safe_load(file)

        except Exception as e:
            print(f"Error loading data from {self.data_file}: {e}")
            # Fallback to empty data structure
            self.data = {
                'datasets': {},
                'benchmark_results': {},
                'model_metadata': {},
                'additional_models': {}
            }

    def get_datasets(self) -> Dict[str, Any]:
        """Get all datasets information."""
        return self.data.get('datasets', {})

    def get_benchmark_results(self) -> Dict[str, Any]:
        """Get all benchmark results."""
        return self.data.get('benchmark_results', {})

    def get_model_metadata(self) -> Dict[str, Any]:
        """Get all model metadata."""
        return self.data.get('model_metadata', {})

    def get_additional_models(self) -> Dict[str, Any]:
        """Get additional models for the Model Hub."""
        return self.data.get('additional_models', {})

    def get_dataset_info(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get information for a specific dataset."""
        return self.data.get('datasets', {}).get(dataset_name)

    def get_benchmark_for_dataset(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get benchmark results for a specific dataset."""
        return self.data.get('benchmark_results', {}).get(dataset_name)

    def get_model_info(self, model_name: str) -> Optional[Dict[str, Any]]:
        """Get metadata for a specific model."""
        return self.data.get('model_metadata', {}).get(model_name)

    def get_available_datasets(self) -> list:
        """Get list of available dataset names."""
        return list(self.data.get('datasets', {}).keys())

    def get_available_models_for_dataset(self, dataset_name: str) -> list:
        """Get list of available models for a specific dataset."""
        benchmark = self.get_benchmark_for_dataset(dataset_name)
        if benchmark:
            return list(benchmark.keys())
        return []

    def get_all_models(self) -> list:
        """Get list of all available models."""
        return list(self.data.get('model_metadata', {}).keys())

    def validate_data(self) -> bool:
        """Validate the loaded data structure."""
        required_keys = ['datasets', 'benchmark_results', 'model_metadata']

        for key in required_keys:
            if key not in self.data:
                print(f"Missing required key: {key}")
                return False

        return True

    def reload_data(self) -> None:
        """Reload data from the YAML file."""
        self.load_data()

    def export_data(self, output_file: str = "exported_data.yaml") -> None:
        """Export the current data to a YAML file."""
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                yaml.dump(self.data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"Data exported to {output_file}")
        except Exception as e:
            print(f"Error exporting data: {e}")

# Global data loader instance
data_loader = NapolabDataLoader()

# Convenience functions for backward compatibility
def get_napolab_datasets() -> Dict[str, Any]:
    """Get Napolab datasets (for backward compatibility)."""
    return data_loader.get_datasets()

def get_sample_benchmark_results() -> Dict[str, Any]:
    """Get benchmark results (for backward compatibility)."""
    return data_loader.get_benchmark_results()

def get_model_metadata() -> Dict[str, Any]:
    """Get model metadata (for backward compatibility)."""
    return data_loader.get_model_metadata()

def get_additional_models() -> Dict[str, Any]:
    """Get additional models (for backward compatibility)."""
    return data_loader.get_additional_models()
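A short sketch of how this loader might be used from another script; the method names are the ones defined above, and the file layout is assumed to match this repository:

```python
from data_loader import NapolabDataLoader

loader = NapolabDataLoader()  # reads data.yaml from the same directory
if loader.validate_data():
    for dataset in loader.get_available_datasets():
        models = loader.get_available_models_for_dataset(dataset)
        print(f"{dataset}: {len(models)} models with results")
```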
download_external_models.py
ADDED
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Script to download external models data from the Open Portuguese LLM Leaderboard
and convert it to CSV format for import into the benchmark.
"""

import requests
import pandas as pd
import json
import sys

def download_external_models():
    """Download external models data and convert to CSV."""

    url = "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard/raw/main/external_models_results.json"

    print("Downloading external models data...")

    try:
        # Download the JSON file
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse JSON
        data = response.json()

        if not isinstance(data, list):
            print("Error: Expected JSON array, got:", type(data))
            return

        print(f"Downloaded {len(data)} external models")

        # Extract data for each model
        extracted_data = []

        for item in data:
            if not isinstance(item, dict):
                print(f"Warning: Skipping non-dict item: {type(item)}")
                continue

            # Extract required fields
            model = item.get('model', '')
            link = item.get('link', '')
            result_metrics = item.get('result_metrics', {})

            if not isinstance(result_metrics, dict):
                print(f"Warning: Skipping model '{model}' - result_metrics is not a dict")
                continue

            # Extract metrics
            assin2_sts = result_metrics.get('assin2_sts', 0.0)
            assin2_rte = result_metrics.get('assin2_rte', 0.0)
            faquad_nli = result_metrics.get('faquad_nli', 0.0)
            hatebr_offensive = result_metrics.get('hatebr_offensive', 0.0)

            # Create row data
            row_data = {
                'model': model,
                'link': link,
                'assin2_sts': assin2_sts,
                'assin2_rte': assin2_rte,
                'faquad_nli': faquad_nli,
                'hatebr_offensive': hatebr_offensive
            }

            extracted_data.append(row_data)

        # Create DataFrame
        df = pd.DataFrame(extracted_data)

        # Save to CSV
        output_file = 'external_models.csv'
        df.to_csv(output_file, index=False)

        print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

        # Show first few entries as preview
        print("\nFirst 5 entries:")
        print(df.head().to_string(index=False))

        # Show some statistics
        if not df.empty:
            print(f"\nStatistics:")
            print(f"Total models: {len(df)}")

            # Count models with non-zero scores for each metric
            print(f"\nModels with scores:")
            print(f"ASSIN2 STS: {(df['assin2_sts'] > 0).sum()}")
            print(f"ASSIN2 RTE: {(df['assin2_rte'] > 0).sum()}")
            print(f"FaQuAD-NLI: {(df['faquad_nli'] > 0).sum()}")
            print(f"HateBR: {(df['hatebr_offensive'] > 0).sum()}")

            # Average scores
            print(f"\nAverage scores:")
            print(df[['assin2_sts', 'assin2_rte', 'faquad_nli', 'hatebr_offensive']].mean().round(3))

            # Show data types and info
            print(f"\nDataFrame info:")
            print(df.info())

    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)

def main():
    """Main function to run the download."""
    print("External Models Data Downloader")
    print("=" * 40)

    try:
        download_external_models()
        print("\nDownload completed successfully!")
    except Exception as e:
        print(f"Error during download: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
example_usage.py
ADDED
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Example Usage of Napolab Leaderboard Data Management

This script demonstrates how to use the YAML-based data management system.
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure, add_dataset, add_benchmark_result, add_model_metadata, export_data
import yaml

def example_usage():
    """Demonstrate the data management functionality."""

    print("Napolab Leaderboard Data Management Example")
    print("=" * 50)

    # 1. Load existing data
    print("\n1. Loading existing data...")
    data_loader = NapolabDataLoader()
    data = data_loader.data

    print(f"Loaded {len(data['datasets'])} datasets")
    print(f"Loaded {len(data['model_metadata'])} models")

    # 2. Validate the data structure
    print("\n2. Validating data structure...")
    if validate_yaml_structure(data):
        print("Data structure is valid!")
    else:
        print("Data structure has issues!")
        return

    # 3. Add a new dataset
    print("\n3. Adding a new dataset...")
    data = add_dataset(
        data=data,
        dataset_name="example_dataset",
        name="Example Dataset",
        description="An example dataset for demonstration",
        tasks=["Classification", "Sentiment Analysis"],
        url="https://huggingface.co/datasets/example"
    )

    # 4. Add a new model
    print("\n4. Adding a new model...")
    data = add_model_metadata(
        data=data,
        model_name="example-model",
        parameters=125000000,
        architecture="BERT Large",
        base_model="bert-large-uncased",
        task="Classification",
        huggingface_url="https://huggingface.co/example/model"
    )

    # 5. Add benchmark results
    print("\n5. Adding benchmark results...")
    data = add_benchmark_result(
        data=data,
        dataset_name="example_dataset",
        model_name="example-model",
        metrics={
            "accuracy": 0.89,
            "f1": 0.88,
            "precision": 0.90,
            "recall": 0.87
        }
    )

    # 6. Export the updated data
    print("\n6. Exporting updated data...")
    export_data(data, "example_updated_data.yaml")

    # 7. Demonstrate data access
    print("\n7. Demonstrating data access...")

    # Get dataset info
    dataset_info = data_loader.get_dataset_info("assin")
    if dataset_info:
        print(f"ASSIN dataset: {dataset_info['name']}")
        print(f"  Tasks: {', '.join(dataset_info['tasks'])}")

    # Get available models for a dataset
    models = data_loader.get_available_models_for_dataset("assin")
    print(f"Available models for ASSIN: {len(models)} models")

    # Get model info
    model_info = data_loader.get_model_info("mdeberta-v3-base-assin-similarity")
    if model_info:
        print(f"Model parameters: {model_info['parameters']:,}")
        print(f"  Architecture: {model_info['architecture']}")

    print("\nExample completed successfully!")
    print("Check 'example_updated_data.yaml' for the updated data")

def demonstrate_yaml_structure():
    """Show the YAML structure."""
    print("\nYAML Data Structure Example:")
    print("-" * 30)

    example_data = {
        'datasets': {
            'my_dataset': {
                'name': 'My Dataset',
                'description': 'A custom dataset',
                'tasks': ['Classification'],
                'url': 'https://huggingface.co/datasets/my_dataset'
            }
        },
        'benchmark_results': {
            'my_dataset': {
                'my-model': {
                    'accuracy': 0.92,
                    'f1': 0.91
                }
            }
        },
        'model_metadata': {
            'my-model': {
                'parameters': 110000000,
                'architecture': 'BERT Base',
                'base_model': 'bert-base-uncased',
                'task': 'Classification',
                'huggingface_url': 'https://huggingface.co/my-model'
            }
        }
    }

    print(yaml.dump(example_data, default_flow_style=False, allow_unicode=True))

if __name__ == "__main__":
    example_usage()
    demonstrate_yaml_structure()
external_models.csv
ADDED
@@ -0,0 +1,31 @@
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
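A minimal sketch of ranking these external models by an unweighted average score with pandas; the column names are taken from the CSV header above, and the averaging scheme is illustrative only:

```python
import pandas as pd

df = pd.read_csv("external_models.csv")
score_cols = ["assin2_sts", "assin2_rte", "faquad_nli", "hatebr_offensive"]
df["average"] = df[score_cols].mean(axis=1)  # simple unweighted mean
print(df.sort_values("average", ascending=False)[["model", "average"]].head())
```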
extract_portuguese_leaderboard.py
ADDED
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Script to extract data from JSON files in a repository folder
and save it as a CSV file for import into the benchmark.
"""

import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path

def is_valid_json_file(file_path):
    """
    Check if a file is a valid JSON file containing a dict.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        bool: True if valid JSON dict, False otherwise
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return isinstance(data, dict)
    except (json.JSONDecodeError, FileNotFoundError, UnicodeDecodeError):
        return False

def find_json_files(repo_path):
    """
    Recursively find all JSON files in the repository folder.

    Args:
        repo_path (str): Path to the repository folder

    Returns:
        list: List of paths to valid JSON files
    """
    json_files = []
    repo_path = Path(repo_path)

    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []

    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")

    for file_path in repo_path.rglob("*.json"):
        if is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")

    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files

def extract_data_from_json(json_file_path):
    """
    Extract data from a single JSON file.

    Args:
        json_file_path (Path): Path to the JSON file

    Returns:
        dict or None: Extracted data or None if extraction failed
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if required fields exist
        if 'config_general' not in data or 'results' not in data:
            return None

        config_general = data['config_general']
        results = data['results']

        # Extract model information
        model_name = config_general.get('model_name', '')
        model_private = config_general.get('model_private', False)

        # Extract results
        all_grouped = results.get('all_grouped', {})

        # Extract metrics
        assin2_rte = all_grouped.get('assin2_rte', 0.0)
        assin2_sts = all_grouped.get('assin2_sts', 0.0)
        faquad_nli = all_grouped.get('faquad_nli', 0.0)
        hatebr_offensive = all_grouped.get('hatebr_offensive', 0.0)

        # Create row data
        row_data = {
            'json_file': str(json_file_path),
            'model_name': model_name,
            'model_private': model_private,
            'assin2_rte': assin2_rte,
            'assin2_sts': assin2_sts,
            'faquad_nli': faquad_nli,
            'hatebr_offensive': hatebr_offensive
        }

        return row_data

    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None

def extract_portuguese_leaderboard(repo_path):
    """
    Extract data from JSON files in the repository folder and save as CSV.

    Args:
        repo_path (str): Path to the repository folder
    """

    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)

    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Prepare data for DataFrame
    data = []

    # Process each JSON file
    for i, json_file in enumerate(json_files):
        print(f"Processing file {i+1}/{len(json_files)}: {json_file.name}")

        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)

        # Print progress every 10 files
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    # Create DataFrame
    df = pd.DataFrame(data)

    # Write to CSV
    output_file = 'portuguese_leaderboard.csv'
    df.to_csv(output_file, index=False)

    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics
    if not df.empty:
        print(f"\nStatistics:")
        print(f"Total models: {len(df)}")
        print(f"Private models: {df['model_private'].sum()}")
        print(f"Public models: {(~df['model_private']).sum()}")

        # Average scores
        print(f"\nAverage scores:")
        print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

        # Show data types and info
        print(f"\nDataFrame info:")
        print(df.info())

def main():
    """Main function to run the extraction."""
    parser = argparse.ArgumentParser(description='Extract Portuguese LLM Leaderboard data from JSON files')
    parser.add_argument('repo_path', help='Path to the repository folder containing JSON files')

    args = parser.parse_args()

    print("Portuguese LLM Leaderboard Data Extractor")
    print("=" * 50)

    try:
        extract_portuguese_leaderboard(args.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as e:
        print(f"Error during extraction: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
manage_data.py
ADDED
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""
Data Management Utility for Napolab Leaderboard

This script provides utilities to manage, validate, and update the YAML data file.
"""

import yaml
import argparse
from pathlib import Path
from data_loader import NapolabDataLoader
from typing import Dict, Any

def validate_yaml_structure(data: Dict[str, Any]) -> bool:
    """Validate the YAML data structure."""
    print("Validating YAML structure...")

    required_sections = ['datasets', 'benchmark_results', 'model_metadata']

    for section in required_sections:
        if section not in data:
            print(f"Missing required section: {section}")
            return False
        print(f"Found section: {section}")

    # Validate datasets
    print("\nValidating datasets...")
    for dataset_name, dataset_info in data['datasets'].items():
        required_fields = ['name', 'description', 'tasks', 'url']
        for field in required_fields:
            if field not in dataset_info:
                print(f"Dataset '{dataset_name}' missing field: {field}")
                return False
        print(f"Dataset '{dataset_name}' is valid")

    # Validate benchmark results
    print("\nValidating benchmark results...")
    for dataset_name, models in data['benchmark_results'].items():
        if dataset_name not in data['datasets']:
            print(f"Warning: Benchmark for '{dataset_name}' but no dataset info found")

        for model_name, metrics in models.items():
            if not isinstance(metrics, dict):
                print(f"Invalid metrics format for model '{model_name}'")
                return False
            print(f"Model '{model_name}' has {len(metrics)} metrics")

    # Validate model metadata
    print("\nValidating model metadata...")
    for model_name, metadata in data['model_metadata'].items():
        required_fields = ['parameters', 'architecture', 'base_model', 'task']
        for field in required_fields:
            if field not in metadata:
                print(f"Model '{model_name}' missing field: {field}")
                return False
        print(f"Model '{model_name}' is valid")

    print("\nAll validations passed!")
    return True

def create_sample_data() -> Dict[str, Any]:
    """Create a sample data structure."""
    return {
        'datasets': {
            'sample_dataset': {
                'name': 'Sample Dataset',
                'description': 'A sample dataset for testing',
                'tasks': ['Classification'],
                'url': 'https://huggingface.co/datasets/sample'
            }
        },
        'benchmark_results': {
            'sample_dataset': {
                'sample-model': {
                    'accuracy': 0.85,
                    'f1': 0.84
                }
            }
        },
        'model_metadata': {
            'sample-model': {
                'parameters': 100000000,
                'architecture': 'BERT Base',
                'base_model': 'bert-base-uncased',
                'task': 'Classification',
                'huggingface_url': 'https://huggingface.co/sample/model'
            }
        },
        'additional_models': {}
    }

def add_dataset(data: Dict[str, Any], dataset_name: str, name: str, description: str,
                tasks: list, url: str) -> Dict[str, Any]:
    """Add a new dataset to the data structure."""
    data['datasets'][dataset_name] = {
        'name': name,
        'description': description,
        'tasks': tasks,
        'url': url
    }
    print(f"Added dataset: {dataset_name}")
    return data

def add_benchmark_result(data: Dict[str, Any], dataset_name: str, model_name: str,
                         metrics: Dict[str, float]) -> Dict[str, Any]:
    """Add benchmark results for a model on a dataset."""
    if dataset_name not in data['benchmark_results']:
        data['benchmark_results'][dataset_name] = {}

    data['benchmark_results'][dataset_name][model_name] = metrics
    print(f"Added benchmark result for {model_name} on {dataset_name}")
    return data

def add_model_metadata(data: Dict[str, Any], model_name: str, parameters: int,
                       architecture: str, base_model: str, task: str,
                       huggingface_url: str = None) -> Dict[str, Any]:
    """Add model metadata."""
    data['model_metadata'][model_name] = {
        'parameters': parameters,
        'architecture': architecture,
        'base_model': base_model,
        'task': task
    }

    if huggingface_url:
        data['model_metadata'][model_name]['huggingface_url'] = huggingface_url

    print(f"Added model metadata: {model_name}")
    return data

def export_data(data: Dict[str, Any], output_file: str) -> None:
    """Export data to a YAML file."""
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            yaml.dump(data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"Data exported to {output_file}")
    except Exception as e:
        print(f"Error exporting data: {e}")

def main():
    """Main function for command-line interface."""
    parser = argparse.ArgumentParser(description='Manage Napolab Leaderboard Data')
    parser.add_argument('action', choices=['validate', 'create-sample', 'add-dataset', 'add-benchmark', 'add-model'],
                        help='Action to perform')
    parser.add_argument('--data-file', default='data.yaml', help='Path to data file')
    parser.add_argument('--output', help='Output file for export')

    # Dataset arguments
    parser.add_argument('--dataset-name', help='Dataset name')
    parser.add_argument('--dataset-display-name', help='Dataset display name')
    parser.add_argument('--dataset-description', help='Dataset description')
    parser.add_argument('--dataset-tasks', nargs='+', help='Dataset tasks')
    parser.add_argument('--dataset-url', help='Dataset URL')

    # Benchmark arguments
    parser.add_argument('--model-name', help='Model name')
    parser.add_argument('--metrics', nargs='+', help='Metrics as key=value pairs')

    # Model metadata arguments
    parser.add_argument('--parameters', type=int, help='Number of parameters')
    parser.add_argument('--architecture', help='Model architecture')
    parser.add_argument('--base-model', help='Base model name')
    parser.add_argument('--task', help='Task type')
    parser.add_argument('--huggingface-url', help='Hugging Face URL')

    args = parser.parse_args()

    # Load existing data or create new
    data_loader = NapolabDataLoader(args.data_file)
    data = data_loader.data

    if args.action == 'validate':
        if validate_yaml_structure(data):
            print("Data validation successful!")
        else:
            print("Data validation failed!")
            return 1

    elif args.action == 'create-sample':
        data = create_sample_data()
        export_data(data, args.output or 'sample_data.yaml')

    elif args.action == 'add-dataset':
        if not all([args.dataset_name, args.dataset_display_name, args.dataset_description,
                    args.dataset_tasks, args.dataset_url]):
            print("All dataset arguments are required")
            return 1

        data = add_dataset(data, args.dataset_name, args.dataset_display_name,
                           args.dataset_description, args.dataset_tasks, args.dataset_url)
        export_data(data, args.data_file)

    elif args.action == 'add-benchmark':
        if not all([args.dataset_name, args.model_name, args.metrics]):
            print("All benchmark arguments are required")
            return 1

        # Parse metrics
        metrics = {}
        for metric in args.metrics:
            if '=' in metric:
                key, value = metric.split('=', 1)
                try:
                    metrics[key] = float(value)
                except ValueError:
                    print(f"Invalid metric value: {metric}")
                    return 1

        data = add_benchmark_result(data, args.dataset_name, args.model_name, metrics)
        export_data(data, args.data_file)

    elif args.action == 'add-model':
        if not all([args.model_name, args.parameters, args.architecture,
                    args.base_model, args.task]):
            print("All model metadata arguments are required")
            return 1

        data = add_model_metadata(data, args.model_name, args.parameters,
                                  args.architecture, args.base_model, args.task,
                                  args.huggingface_url)
        export_data(data, args.data_file)

    return 0

if __name__ == "__main__":
    exit(main())
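For scripted updates, the same helpers can also be called from Python instead of the CLI. A small sketch under the assumption that `data.yaml` sits next to the scripts; the model name and score are hypothetical placeholders:

```python
from data_loader import NapolabDataLoader
from manage_data import add_benchmark_result, export_data

loader = NapolabDataLoader()  # loads data.yaml
# "my-new-model" and 0.905 are placeholders for a real entry.
data = add_benchmark_result(loader.data, "assin2_rte", "my-new-model", {"accuracy": 0.905})
export_data(data, "data.yaml")  # write the updated file back
```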
portuguese_leaderboard.csv
ADDED
The diff for this file is too large to render.
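The CSV itself is not rendered here, but a hedged sketch of loading it, assuming the columns written by `extract_portuguese_leaderboard.py`:

```python
import pandas as pd

df = pd.read_csv("portuguese_leaderboard.csv")
public = df[~df["model_private"]]  # keep only public models
print(f"{len(public)} public models")
print(public[["model_name", "assin2_rte", "assin2_sts"]].head())
```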
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio>=4.0.0
pandas>=1.5.0
numpy>=1.21.0
plotly>=5.0.0
transformers>=4.20.0
torch>=1.12.0
huggingface-hub>=0.10.0
PyYAML>=6.0
run_app.py
ADDED
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Napolab Leaderboard Launcher Script

This script checks dependencies and launches the Gradio app for the Napolab leaderboard.
"""

import sys
import subprocess
import importlib.util
from pathlib import Path

def check_dependency(package_name):
    """Check if a package is installed."""
    spec = importlib.util.find_spec(package_name)
    return spec is not None

def install_dependencies():
    """Install required dependencies."""
    print("Installing required dependencies...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
        print("Dependencies installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to install dependencies: {e}")
        return False

def main():
    """Main launcher function."""
    print("Napolab Leaderboard Launcher")
    print("=" * 40)

    # Check if we're in the right directory
    if not Path("app.py").exists():
        print("Error: app.py not found. Please run this script from the leaderboard directory.")
        sys.exit(1)

    # Check required dependencies
    required_packages = ["gradio", "pandas", "numpy", "datasets", "plotly"]
    missing_packages = []

    for package in required_packages:
        if not check_dependency(package):
            missing_packages.append(package)

    if missing_packages:
        print(f"Missing dependencies: {', '.join(missing_packages)}")
        print("Installing dependencies...")
        if not install_dependencies():
            print("Failed to install dependencies. Please install them manually:")
            print("pip install -r requirements.txt")
            sys.exit(1)
    else:
        print("All dependencies are installed!")

    # Launch the app
    print("\nLaunching Napolab Leaderboard...")
    print("The app will be available at: http://localhost:7860")
    print("Press Ctrl+C to stop the server")
    print("-" * 40)

    try:
        import app
        # The app will be launched by the import
    except KeyboardInterrupt:
        print("\nServer stopped by user")
    except Exception as e:
        print(f"Error launching app: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
validate_data.py
ADDED
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Validation script for the updated Napolab data structure
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure
import pandas as pd

def main():
    """Validate the updated data structure."""
    print("Validating Updated Napolab Data Structure")
    print("=" * 50)
    print("Data Source: Master's thesis 'Lessons learned from the evaluation of Portuguese language models'")
    print("  by Ruan Chaves Rodrigues (2023) - University of Malta")
    print("  Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557")
    print("=" * 50)

    # Load data
    data_loader = NapolabDataLoader()
    data = data_loader.data

    # Validate structure
    print("\n1. Validating YAML structure...")
    if validate_yaml_structure(data):
        print("YAML structure is valid!")
    else:
        print("YAML structure has issues!")
        return

    # Check datasets
    print("\n2. Checking datasets...")
    datasets = data_loader.get_datasets()
    print(f"Found {len(datasets)} datasets:")
    for name, info in datasets.items():
        print(f"  - {name}: {info['name']} ({', '.join(info['tasks'])})")

    # Check benchmark results
    print("\n3. Checking benchmark results...")
    benchmark_results = data_loader.get_benchmark_results()
    print(f"Found {len(benchmark_results)} benchmark datasets:")
    for dataset_name, models in benchmark_results.items():
        print(f"  - {dataset_name}: {len(models)} models")

    # Check model metadata
    print("\n4. Checking model metadata...")
    model_metadata = data_loader.get_model_metadata()
    print(f"Found {len(model_metadata)} models:")

    # Group models by architecture
    architectures = {}
    for model_name, metadata in model_metadata.items():
        arch = metadata['architecture']
        if arch not in architectures:
            architectures[arch] = []
        architectures[arch].append(model_name)

    for arch, models in architectures.items():
        print(f"  - {arch}: {len(models)} models")
        for model in models[:3]:  # Show first 3 models
            print(f"    * {model}")
        if len(models) > 3:
            print(f"    ... and {len(models) - 3} more")

    # Test data access functions
    print("\n5. Testing data access functions...")

    # Test getting available models for a dataset
    test_dataset = list(benchmark_results.keys())[0]
    models = data_loader.get_available_models_for_dataset(test_dataset)
    print(f"  Available models for {test_dataset}: {len(models)} models")

    # Test getting model info
    if models:
        test_model = models[0]
        model_info = data_loader.get_model_info(test_model)
        if model_info:
            print(f"  Model {test_model}: {model_info['parameters']:,} parameters")

    # Create a summary table
    print("\n6. Creating summary table...")
    summary_data = []

    for dataset_name, models in benchmark_results.items():
        for model_name, metrics in models.items():
            if model_name in model_metadata:
                summary_data.append({
                    'Dataset': dataset_name,
                    'Model': model_name,
                    'Architecture': model_metadata[model_name]['architecture'],
                    'Parameters': model_metadata[model_name]['parameters'],
                    'Performance': metrics.get('accuracy', 0)
                })

    if summary_data:
        df = pd.DataFrame(summary_data)
        print(f"Summary: {len(df)} model-dataset combinations")
        print(f"  Average performance: {df['Performance'].mean():.3f}")
        print(f"  Best performance: {df['Performance'].max():.3f}")
        print(f"  Models with >0.9 performance: {(df['Performance'] > 0.9).sum()}")

    print("\nValidation completed successfully!")
    print("The updated data structure is ready to use!")

if __name__ == "__main__":
    main()