diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,1504 +1,1506 @@
-import gradio as gr
-import pandas as pd
-import numpy as np
-import plotly.express as px
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import json
-import os
-import re
-from typing import Dict, List, Optional, Tuple
-
-# Import data loader
-from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata
-
-# Load data from YAML file
-NAPOLAB_DATASETS = get_napolab_datasets()
-SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
-MODEL_METADATA = get_model_metadata()
-
-def load_portuguese_leaderboard_data() -> pd.DataFrame:
- """Load data from the Portuguese leaderboard CSV file."""
- try:
- csv_path = "portuguese_leaderboard.csv"
- if os.path.exists(csv_path):
- df = pd.read_csv(csv_path)
- # Select only the relevant columns
- relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
- df = df[relevant_columns].copy()
-
- # Rename columns to match the existing format
- df = df.rename(columns={
- 'assin2_rte': 'ASSIN2 RTE',
- 'assin2_sts': 'ASSIN2 STS',
- 'faquad_nli': 'FaQUaD-NLI',
- 'hatebr_offensive': 'HateBR'
- })
-
- # Add source information
- df['source'] = 'portuguese_leaderboard'
-
- print(f"Loaded {len(df)} models from Portuguese leaderboard")
- return df
- else:
- print(f"Portuguese leaderboard CSV not found: {csv_path}")
- return pd.DataFrame()
- except Exception as e:
- print(f"Error loading Portuguese leaderboard data: {e}")
- return pd.DataFrame()
-
-def load_external_models_data() -> pd.DataFrame:
- """Load data from the external models CSV file."""
- try:
- csv_path = "external_models.csv"
- if os.path.exists(csv_path):
- df = pd.read_csv(csv_path)
- # Select only the relevant columns
- relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
- df = df[relevant_columns].copy()
-
- # Rename columns to match the existing format
- df = df.rename(columns={
- 'model': 'model_name',
- 'assin2_rte': 'ASSIN2 RTE',
- 'assin2_sts': 'ASSIN2 STS',
- 'faquad_nli': 'FaQUaD-NLI',
- 'hatebr_offensive': 'HateBR'
- })
-
- # Add source information
- df['source'] = 'external_models'
-
- # Add model_num_parameters column with 0 for external models
- df['model_num_parameters'] = 0
-
- print(f"Loaded {len(df)} external models")
- return df
- else:
- print(f"External models CSV not found: {csv_path}")
- return pd.DataFrame()
- except Exception as e:
- print(f"Error loading external models data: {e}")
- return pd.DataFrame()
-
-# Load Portuguese leaderboard data
-PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
-
-# Load external models data
-EXTERNAL_MODELS_DATA = load_external_models_data()
-
-def create_simplified_benchmark_table(selected_datasets: Optional[List[str]] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame:
- """Create a simplified benchmark table with one column per dataset."""
- # Get all dataset names
- dataset_names = sorted(NAPOLAB_DATASETS.keys())
- dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
-
- # Use selected datasets if provided, otherwise use all datasets
- if selected_datasets is None:
- selected_datasets = dataset_names
-
- # Collect data for each model
- model_data = {}
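-    # Convention used throughout this app: a score of exactly 0 is treated as
-    # "missing", so zero scores are excluded from averages and completeness checks.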
-
- # Process existing benchmark results
- for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
- for model_name, metrics in models.items():
- if model_name not in model_data:
- model_data[model_name] = {
- 'dataset_scores': {},
- 'url': None,
- 'source': 'existing'
- }
-
- # Calculate average performance for this dataset
- avg_performance = np.mean(list(metrics.values()))
- model_data[model_name]['dataset_scores'][dataset_name] = avg_performance
-
- # Process Portuguese leaderboard data
- if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
- for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'dataset_scores': {},
- 'url': None,
- 'source': 'portuguese_leaderboard',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map Portuguese leaderboard columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in NAPOLAB_DATASETS:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['dataset_scores'][dataset_name] = score
-
- # Process external models data
- if show_external_models and not EXTERNAL_MODELS_DATA.empty:
- for _, row in EXTERNAL_MODELS_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'dataset_scores': {},
- 'url': row.get('link', ''),
- 'source': 'external_models',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map external models columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in NAPOLAB_DATASETS:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['dataset_scores'][dataset_name] = score
-
- # Get model URLs and source information for existing models
- additional_models = data_loader.get_additional_models()
- for model_name in model_data.keys():
- if model_data[model_name]['source'] == 'existing':
- # Get URL
- for arch_models in additional_models.values():
- if model_name in arch_models:
- model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
- break
-
- # Get source information
- model_metadata = MODEL_METADATA.get(model_name, {})
- source = model_metadata.get('source', 'unknown')
- model_data[model_name]['source'] = source
-
- # Add num_parameters for existing models (set to 0 as they don't have this info)
- model_data[model_name]['num_parameters'] = 0
-
- # Create table data
- table_data = []
-
- for model_name, data in model_data.items():
- # Apply source filtering
- source = data['source']
-
- # Apply show filters - only show models from sources that are checked
- if source == 'napolab_thesis' and not show_napolab_thesis:
- continue
- if source == 'teenytinyllama_paper' and not show_teenytinyllama:
- continue
- if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
- continue
- if source == 'external_models' and not show_external_models:
- continue
- # Hide models with unknown source (should not happen with proper data)
- if source == 'unknown':
- continue
-
- # Apply parameter filtering (only for Portuguese leaderboard models)
- if max_num_parameters > 0 and source == 'portuguese_leaderboard':
- num_parameters = data.get('num_parameters', 0)
- if num_parameters > max_num_parameters:
- continue
-
- # Create clickable link for model name
- if data['url']:
- model_display = f"[{model_name}]({data['url']})"
- elif source == 'portuguese_leaderboard' and '/' in model_name:
- # Create Hugging Face link for Portuguese leaderboard models with slashes
- huggingface_url = f"https://huggingface.co/{model_name}"
- model_display = f"[{model_name}]({huggingface_url})"
- else:
- model_display = model_name
-
- # Create row with dataset scores
- row_data = {'Model': model_display}
-
- # Calculate average only over selected datasets
- selected_scores = []
- for dataset_name in selected_datasets:
- score = data['dataset_scores'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores in average
- selected_scores.append(score)
-
- overall_avg = np.mean(selected_scores) if selected_scores else 0
- row_data['Average'] = round(overall_avg, 4)
-
- # Add scores for each dataset (only selected ones)
- for dataset_name in dataset_names:
- score = data['dataset_scores'].get(dataset_name, 0)
- display_name = dataset_display_names[dataset_names.index(dataset_name)]
- # Only add columns for selected datasets
- if dataset_name in selected_datasets:
- row_data[display_name] = round(score, 4)
-
- table_data.append(row_data)
-
- df = pd.DataFrame(table_data)
-
- # Filter to show only models that have scores for at least one selected dataset
- if selected_datasets and not df.empty:
- # Get display names for selected datasets
- selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]
-
- # Filter models based on selection criteria
- models_to_keep = []
- for _, row in df.iterrows():
- has_score = False
- has_all_scores = True
-
- # Only check the datasets that are actually selected for display
- for dataset_name in selected_datasets:
- display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
- if display_name in df.columns:
- score = row[display_name]
- if score > 0:
- has_score = True
- else:
- has_all_scores = False
-
- # Keep model if it has at least one score
- if has_score:
- # If hide_incomplete_models is True, only keep models with all scores in selected datasets
- if not hide_incomplete_models or has_all_scores:
- models_to_keep.append(row['Model'])
-
- # Filter dataframe to only include selected models
- if models_to_keep:
- df = df[df['Model'].isin(models_to_keep)]
- else:
- # If no models to keep, create empty DataFrame with proper structure
- # Create columns list first
- columns = ['Model']
- for dataset_name in dataset_names:
- display_name = dataset_display_names[dataset_names.index(dataset_name)]
- if dataset_name in selected_datasets:
- columns.append(display_name)
- columns.append('Average')
-
- # Create empty DataFrame with correct columns
- df = pd.DataFrame(columns=columns)
-
- # Filter by minimum average performance
- if min_average_performance > 0 and not df.empty:
- df = df[df['Average'] >= min_average_performance]
-
- # Filter by search query
- if search_query and not df.empty:
- # Extract model names from markdown links for searching
- df_filtered = df.copy()
- df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
- try:
- # Use regex pattern matching
- df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False, regex=True)]
- except re.error:
- # Fallback to simple string matching if regex is invalid
- df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
- df = df_filtered.drop('model_name_clean', axis=1)
-
- # Sort by Average (descending)
- if not df.empty:
- df = df.sort_values('Average', ascending=False)
-
- # Add rank column with medal emojis for top 3 and color-coded emojis for others
- if not df.empty:
- df = df.reset_index(drop=True)
- df.index = df.index + 1 # Start ranking from 1
-
- # Create rank column with medal emojis and color-coded emojis
- rank_column = []
- total_models = len(df)
-
- for rank in df.index:
-            if rank == 1:
-                rank_column.append("🥇 1")
-            elif rank == 2:
-                rank_column.append("🥈 2")
-            elif rank == 3:
-                rank_column.append("🥉 3")
-            else:
-                # Color-code based on position relative to total
-                position_ratio = rank / total_models
-                if position_ratio <= 0.33: # Top third
-                    rank_column.append("🟢 " + str(rank))
-                elif position_ratio <= 0.67: # Middle third
-                    rank_column.append("🟡 " + str(rank))
-                else: # Bottom third
-                    rank_column.append("🔴 " + str(rank))
-
- df.insert(0, 'Rank', rank_column)
-
- return df
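-
-# Example usage (hypothetical arguments; assumes 'assin2_sts' is a dataset key in the YAML):
-#   df = create_simplified_benchmark_table(selected_datasets=['assin2_sts'], min_average_performance=0.8)
-# The result is sorted by 'Average' and prefixed with a color-coded 'Rank' column.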
-
-
-# Global variable to track the current CSV file
-current_csv_file = None
-
-def export_csv(df: pd.DataFrame):
- """Export the benchmark table to CSV."""
- global current_csv_file
-
- print(f"Export function called with dataframe shape: {df.shape}")
-
- if df.empty:
- print("Dataframe is empty, returning None")
- return None
-
- # Clean up previous file if it exists
- if current_csv_file:
- try:
- import os
- if os.path.exists(current_csv_file):
- os.remove(current_csv_file)
- print(f"Deleted previous CSV file: {current_csv_file}")
- except Exception as e:
- print(f"Error deleting previous file {current_csv_file}: {e}")
-
- # Clean the dataframe for CSV export
- df_clean = df.copy()
-
- # Remove markdown formatting from model names for cleaner CSV
- df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
-
- # Create filename with timestamp
- from datetime import datetime
- import tempfile
- import os
-
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- filename = f"napolab_benchmark_results_{timestamp}.csv"
-
- # Create file in current directory (simpler approach)
- file_path = filename
-
- print(f"Creating CSV file at: {file_path}")
-
- # Save to CSV file
- df_clean.to_csv(file_path, index=False)
-
- print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")
-
- # Update current file tracking
- current_csv_file = file_path
-
- return file_path
-
-def cleanup_current_csv():
- """Clean up the current CSV file after download."""
- global current_csv_file
- import os
-
- if current_csv_file and os.path.exists(current_csv_file):
- try:
- os.remove(current_csv_file)
- print(f"Deleted CSV file after download: {current_csv_file}")
- current_csv_file = None
- except Exception as e:
- print(f"Error deleting file {current_csv_file}: {e}")
-
-
-def create_model_performance_radar(selected_datasets: Optional[List[str]] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
- """Create a radar chart showing model performance across all datasets."""
- # Use selected datasets if provided, otherwise use all datasets
- if selected_datasets is None:
- selected_datasets = list(NAPOLAB_DATASETS.keys())
-
- # Get dataset names for the radar axes (only selected ones)
- dataset_names = selected_datasets
- dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
-
- # Collect data for each model
- model_data = {}
-
- # Process existing benchmark results
- for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
- if dataset_name in selected_datasets:
- for model_name, metrics in models.items():
- if model_name not in model_data:
- model_data[model_name] = {
- 'performances': {},
- 'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
- 'source': 'existing'
- }
-
- # Calculate average performance for this dataset
- avg_performance = np.mean(list(metrics.values()))
- model_data[model_name]['performances'][dataset_name] = avg_performance
-
- # Process Portuguese leaderboard data
- if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
- for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'performances': {},
- 'architecture': 'Unknown',
- 'source': 'portuguese_leaderboard',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map Portuguese leaderboard columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in selected_datasets:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['performances'][dataset_name] = score
-
- # Process external models data
- if show_external_models and not EXTERNAL_MODELS_DATA.empty:
- for _, row in EXTERNAL_MODELS_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'performances': {},
- 'architecture': 'Unknown',
- 'source': 'external_models',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map external models columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in selected_datasets:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['performances'][dataset_name] = score
-
- # Get model URLs and source information for existing models
- additional_models = data_loader.get_additional_models()
- for model_name in model_data.keys():
- if model_data[model_name]['source'] == 'existing':
- # Get URL
- for arch_models in additional_models.values():
- if model_name in arch_models:
- model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
- break
-
- # Get source information
- model_metadata = MODEL_METADATA.get(model_name, {})
- source = model_metadata.get('source', 'unknown')
- model_data[model_name]['source'] = source
-
- # Add num_parameters for existing models (set to 0 as they don't have this info)
- model_data[model_name]['num_parameters'] = 0
-
- # Apply source filtering
- filtered_model_data = {}
- for model_name, data in model_data.items():
- source = data.get('source', 'existing')
-
- # Apply show filters - only show models from sources that are checked
- if source == 'napolab_thesis' and not show_napolab_thesis:
- continue
- if source == 'teenytinyllama_paper' and not show_teenytinyllama:
- continue
- if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
- continue
- if source == 'external_models' and not show_external_models:
- continue
- # Hide models with unknown source (should not happen with proper data)
- if source == 'unknown':
- continue
-
- # Apply parameter filtering (only for Portuguese leaderboard models)
- if max_num_parameters > 0 and source == 'portuguese_leaderboard':
- num_parameters = data.get('num_parameters', 0)
- if num_parameters > max_num_parameters:
- continue
-
- filtered_model_data[model_name] = data
-
- # Apply incomplete model filtering
- if hide_incomplete_models and selected_datasets:
- final_filtered_data = {}
- for model_name, data in filtered_model_data.items():
- has_all_scores = True
- for dataset_name in selected_datasets:
- if data['performances'].get(dataset_name, 0) == 0:
- has_all_scores = False
- break
- if has_all_scores:
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Apply minimum average performance filtering
- if min_average_performance > 0 and selected_datasets:
- final_filtered_data = {}
- for model_name, data in filtered_model_data.items():
- # Calculate average performance for selected datasets
- scores = []
- for dataset_name in selected_datasets:
- score = data['performances'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores
- scores.append(score)
-
- if scores:
- avg_performance = np.mean(scores)
- if avg_performance >= min_average_performance:
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Apply search query filtering
- if search_query:
- final_filtered_data = {}
- try:
- # Use regex pattern matching
- import re
- pattern = re.compile(search_query, re.IGNORECASE)
- for model_name, data in filtered_model_data.items():
- if pattern.search(model_name):
- final_filtered_data[model_name] = data
- except re.error:
- # Fallback to simple string matching if regex is invalid
- for model_name, data in filtered_model_data.items():
- if search_query.lower() in model_name.lower():
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Sort models by average performance (descending)
- model_performances = []
- for model_name, data in filtered_model_data.items():
- # Calculate average performance for selected datasets
- scores = []
- for dataset_name in selected_datasets:
- score = data['performances'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores
- scores.append(score)
-
- avg_performance = np.mean(scores) if scores else 0
- model_performances.append((model_name, data, avg_performance))
-
- # Sort by average performance (descending)
- model_performances.sort(key=lambda x: x[2], reverse=True)
-
- # Calculate dynamic range based on actual data
- all_performance_values = []
- for model_name, data, avg_performance in model_performances:
- for dataset_name in dataset_names:
- score = data['performances'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores
- all_performance_values.append(score)
-
- # Set dynamic range with some padding
- if all_performance_values:
- min_score = min(all_performance_values)
- max_score = max(all_performance_values)
- # Add 5% padding below minimum and ensure minimum is not below 0.5
- range_min = max(0.5, min_score - (max_score - min_score) * 0.05)
- range_max = 1.0
- else:
- # Fallback to default range if no data
- range_min = 0.6
- range_max = 1.0
-
- # Create radar chart
- fig = go.Figure()
-
- # Generate a more distinguishable color palette
- num_models = len(model_performances)
-
- # Create a list of line styles for better differentiation
- line_styles = ['solid', 'dash', 'dot', 'dashdot', 'longdash', 'longdashdot']
-
- # Use highly contrasting colors for better differentiation
- base_colors = [
- '#1f77b4', # Blue
- '#ff7f0e', # Orange
- '#2ca02c', # Green
- '#d62728', # Red
- '#9467bd', # Purple
- '#8c564b', # Brown
- '#e377c2', # Pink
- '#7f7f7f', # Gray
- '#bcbd22', # Olive
- '#17becf', # Cyan
- '#ff9896', # Light Red
- '#98df8a', # Light Green
- '#ffbb78', # Light Orange
- '#aec7e8', # Light Blue
- '#c5b0d5', # Light Purple
- ]
-
- # Ensure we have enough colors
- while len(base_colors) < num_models:
- base_colors.extend(base_colors)
-
- colors = base_colors[:num_models]
-
- for i, (model_name, data, avg_performance) in enumerate(model_performances):
- # Get performance values for all datasets (fill with 0 if missing)
- performance_values = []
- for dataset_name in dataset_names:
- performance_values.append(data['performances'].get(dataset_name, 0))
-
- # Close the polygon by adding the first value at the end
- if performance_values:
- performance_values.append(performance_values[0])
-
- # Assign color and line style based on model index for better differentiation
- color = colors[i % len(colors)]
- line_style = line_styles[i % len(line_styles)]
-
- # Show first two models by default, hide the rest
- visible = True if i < 2 else 'legendonly'
-
- # Create theta values that close the polygon
- theta_values = dataset_display_names + [dataset_display_names[0]] if dataset_display_names else []
-
- fig.add_trace(go.Scatterpolar(
- r=performance_values,
- theta=theta_values,
- fill=None,
- name=model_name,
- line_color=color,
- line_dash=line_style,
- line_width=3,
- opacity=0.8,
- visible=visible,
-                hovertemplate=(
-                    "%{fullData.name}<br>" +
-                    "Dataset: %{theta}<br>" +
-                    "Performance: %{r:.3f}<br>" +
-                    "Architecture: " + data['architecture'] + "<br>" +
-                    "<extra></extra>"
-                )
- ))
-
- # Update layout
- fig.update_layout(
- title="Model Performance Radar Chart",
- polar=dict(
- radialaxis=dict(
- visible=True,
- range=[range_min, range_max],
- gridcolor='rgba(0, 0, 0, 0.2)',
- linecolor='rgba(0, 0, 0, 0.5)',
- tickcolor='rgba(0, 0, 0, 0.7)',
- tickfont=dict(color='rgba(0, 0, 0, 0.8)')
- ),
- angularaxis=dict(
- tickmode='array',
- tickvals=list(range(len(dataset_display_names))),
- ticktext=dataset_display_names,
- gridcolor='rgba(0, 0, 0, 0.2)',
- linecolor='rgba(0, 0, 0, 0.5)',
- tickcolor='rgba(0, 0, 0, 0.7)',
- tickfont=dict(color='rgba(0, 0, 0, 0.8)')
- ),
- bgcolor='rgba(255, 255, 255, 0)'
- ),
- height=700,
- showlegend=True,
- plot_bgcolor='rgba(255, 255, 255, 0)',
- paper_bgcolor='rgba(255, 255, 255, 0)',
- legend=dict(
- yanchor="top",
- y=-0.15,
- xanchor="center",
- x=0.5,
- bgcolor='rgba(255, 255, 255, 0.95)',
- bordercolor='rgba(0, 0, 0, 0.2)',
- borderwidth=1,
- orientation="h",
- font=dict(color='rgba(0, 0, 0, 0.8)')
- ),
- margin=dict(l=50, r=50, t=100, b=100),
- font=dict(color='rgba(0, 0, 0, 0.8)')
- )
-
- return fig
-
-# Gradio Interface
-with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
- gr.Markdown("""
-    # 🏆 Napolab Leaderboard
-
- Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks.
-
-    [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
- """)
-
- with gr.Tabs():
-
- # Benchmark Results Tab
- with gr.Tab("π Benchmark Results"):
- gr.Markdown("### Model Performance Benchmarks")
-
- with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
- with gr.Row():
- # Create checkboxes for each dataset
- dataset_checkboxes = []
- for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
- display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
- # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
- default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
- checkbox = gr.Checkbox(
- label=display_name,
- value=default_value
- )
- dataset_checkboxes.append((dataset_name, checkbox))
-
- with gr.Accordion("Filter by Score: (Click to expand)", open=False):
- with gr.Row():
- hide_incomplete_models = gr.Checkbox(
- label="Hide models with zero scores in selected datasets",
- value=True
- )
-
- min_average_performance = gr.Slider(
- minimum=0,
- maximum=100,
- value=80,
- step=1,
- label="Minimum Average Performance (%)"
- )
-
- with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
- with gr.Row():
- show_napolab_thesis = gr.Checkbox(
- label="Napolab Thesis models",
- value=True
- )
- show_teenytinyllama = gr.Checkbox(
- label="TeenyTinyLlama models",
- value=True
- )
- show_portuguese_leaderboard = gr.Checkbox(
- label="Open Portuguese LLM Leaderboard models (open-source)",
- value=True
- )
-
- show_external_models = gr.Checkbox(
- label="Open Portuguese LLM Leaderboard models (proprietary)",
- value=True
- )
-
- # Calculate max parameters for slider
- max_params = 0
- if not PORTUGUESE_LEADERBOARD_DATA.empty:
- max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
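-            # max_params is reused as the upper bound of the model-size sliders
-            # in both the Benchmark Results and Model Analysis tabs.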
-
- with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
- with gr.Row():
- max_num_parameters = gr.Slider(
- minimum=0,
- maximum=max_params,
- value=0,
- step=1,
- label="Maximum Number of Parameters",
- info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
- )
-
- # Search bar for filtering models
- search_query = gr.Textbox(
- label="Search models by name (supports regex)",
- placeholder="Enter model name or regex pattern to filter...",
- value="",
- info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
- )
-
- benchmark_table = gr.DataFrame(
- label="Model Performance Benchmarks",
- wrap=[True, False, False, False, False, False, False, False, False, False],
- interactive=False,
- datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
- column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
- )
-
- gr.Markdown("*π₯π₯π₯ = Top 3 | π’ = Top 33% | π‘ = Middle 33% | π΄ = Bottom 33%*")
-
- # Export to CSV button and file component
- export_button = gr.Button("π₯ Export to CSV", variant="secondary")
- csv_file = gr.File(label="Download CSV", interactive=False, visible=True)
-
- # Model Analysis Tab
- with gr.Tab("π Model Analysis"):
- gr.Markdown("### Model Performance Radar Chart")
-
- # Dataset Selection Controls
- with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
- with gr.Row():
- # Create checkboxes for each dataset
- analysis_dataset_checkboxes = []
- for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
- display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
- # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
- default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
- checkbox = gr.Checkbox(
- label=display_name,
- value=default_value
- )
- analysis_dataset_checkboxes.append((dataset_name, checkbox))
-
- # Filter Controls
- with gr.Accordion("Filter by Score: (Click to expand)", open=False):
- with gr.Row():
- hide_incomplete_models_analysis = gr.Checkbox(
- label="Hide models with zero scores in selected datasets",
- value=True
- )
-
- min_average_performance_analysis = gr.Slider(
- minimum=0,
- maximum=100,
- value=80,
- step=1,
- label="Minimum Average Performance (%)"
- )
-
- with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
- with gr.Row():
- show_napolab_thesis_analysis = gr.Checkbox(
- label="Napolab Thesis models",
- value=True
- )
-
- show_teenytinyllama_analysis = gr.Checkbox(
- label="TeenyTinyLlama models",
- value=True
- )
-
- show_portuguese_leaderboard_analysis = gr.Checkbox(
- label="Open Portuguese LLM Leaderboard models (open-source)",
- value=True
- )
-
- show_external_models_analysis = gr.Checkbox(
- label="Open Portuguese LLM Leaderboard models (proprietary)",
- value=True
- )
-
- # Parameter slider for Model Analysis tab
- with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
- with gr.Row():
- max_num_parameters_analysis = gr.Slider(
- minimum=0,
- maximum=max_params,
- value=0,
- step=1,
- label="Maximum Number of Parameters",
- info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
- )
-
- # Search bar for filtering models in radar chart
- search_query_analysis = gr.Textbox(
- label="Search models by name (supports regex)",
- placeholder="Enter model name or regex pattern to filter...",
- value="",
- info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
- )
-
- model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
-
- # Add scatter plot below radar chart
- model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
-
- gr.Markdown("""
- **How to interact with the chart:**
- - **Click on legend items** to show/hide specific models.
- - **Double-click on a legend item** to isolate that model (hide all others).
- - **Double-click again** to show all models.
-
- Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
- """)
-
-
-
- # About Tab
- with gr.Tab("βΉοΈ About"):
- gr.Markdown("""
- ## About Napolab
-
- **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.
-
- For more information, please visit the [GitHub repository](https://github.com/ruanchaves/napolab) and the [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab).
-
- ### Data Sources:
- The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:
-
- **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)
-
- **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
-
- **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by CorrΓͺa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).
-
- ### Thesis Citation:
- ```bibtex
- @mastersthesis{chaves2023lessons,
- title={Lessons learned from the evaluation of Portuguese language models},
- author={Chaves Rodrigues, Ruan},
- year={2023},
- school={University of Malta},
- url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
- }
- ```
-
- ### Napolab Citation:
- ```bibtex
- @software{Chaves_Rodrigues_napolab_2023,
- author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
- doi = {10.5281/zenodo.7781848},
- month = {3},
- title = {{Natural Portuguese Language Benchmark (Napolab)}},
- url = {https://github.com/ruanchaves/napolab},
- version = {1.0.0},
- year = {2023}
- }
- ```
-
- """)
-
-    def create_model_performance_scatter(selected_datasets: Optional[List[str]] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
- """Create a scatter plot showing model performance vs number of parameters."""
- # Use selected datasets if provided, otherwise use all datasets
- if selected_datasets is None:
- selected_datasets = list(NAPOLAB_DATASETS.keys())
-
- # Collect data for each model
- model_data = {}
-
- # Process existing benchmark results
- for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
- if dataset_name in selected_datasets:
- for model_name, metrics in models.items():
- if model_name not in model_data:
- # Get actual source from MODEL_METADATA
- model_metadata = MODEL_METADATA.get(model_name, {})
- actual_source = model_metadata.get('source', 'unknown')
-
- model_data[model_name] = {
- 'performances': {},
- 'architecture': model_metadata.get('architecture', 'Unknown'),
- 'source': actual_source,
- 'num_parameters': 0
- }
-
- # Calculate average performance for this dataset
- avg_performance = np.mean(list(metrics.values()))
- model_data[model_name]['performances'][dataset_name] = avg_performance
-
- # Process Portuguese leaderboard data
- if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
- for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'performances': {},
- 'architecture': 'Unknown',
- 'source': 'portuguese_leaderboard',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map Portuguese leaderboard columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in selected_datasets:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['performances'][dataset_name] = score
-
- # Process external models data
- if show_external_models and not EXTERNAL_MODELS_DATA.empty:
- for _, row in EXTERNAL_MODELS_DATA.iterrows():
- model_name = row['model_name']
-
- if model_name not in model_data:
- model_data[model_name] = {
- 'performances': {},
- 'architecture': 'Unknown',
- 'source': 'external_models',
- 'num_parameters': row.get('model_num_parameters', 0)
- }
-
- # Map external models columns to dataset names
- column_mapping = {
- 'ASSIN2 RTE': 'assin2_rte',
- 'ASSIN2 STS': 'assin2_sts',
- 'FaQUaD-NLI': 'faquad-nli',
- 'HateBR': 'hatebr'
- }
-
- for display_name, dataset_name in column_mapping.items():
- if dataset_name in selected_datasets:
- score = row[display_name]
- if pd.notna(score) and score > 0:
- model_data[model_name]['performances'][dataset_name] = score
-
- # Apply source filtering
- filtered_model_data = {}
- for model_name, data in model_data.items():
- source = data.get('source', 'existing')
-
- # Apply show filters - only show models from sources that are checked
- if source == 'napolab_thesis' and not show_napolab_thesis:
- continue
- if source == 'teenytinyllama_paper' and not show_teenytinyllama:
- continue
- if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
- continue
- if source == 'external_models' and not show_external_models:
- continue
- # Hide models with unknown source (should not happen with proper data)
- if source == 'unknown':
- continue
-
- # Apply parameter filtering (only for Portuguese leaderboard models)
- if max_num_parameters > 0 and source == 'portuguese_leaderboard':
- num_parameters = data.get('num_parameters', 0)
- if num_parameters > max_num_parameters:
- continue
-
- filtered_model_data[model_name] = data
-
- # Apply incomplete model filtering
- if hide_incomplete_models and selected_datasets:
- final_filtered_data = {}
- for model_name, data in filtered_model_data.items():
- has_all_scores = True
- for dataset_name in selected_datasets:
- if data['performances'].get(dataset_name, 0) == 0:
- has_all_scores = False
- break
- if has_all_scores:
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Apply minimum average performance filtering
- if min_average_performance > 0 and selected_datasets:
- final_filtered_data = {}
- for model_name, data in filtered_model_data.items():
- # Calculate average performance for selected datasets
- scores = []
- for dataset_name in selected_datasets:
- score = data['performances'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores
- scores.append(score)
-
- if scores:
- avg_performance = np.mean(scores)
- if avg_performance >= min_average_performance:
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Apply search query filtering
- if search_query:
- final_filtered_data = {}
- try:
- # Use regex pattern matching
- import re
- pattern = re.compile(search_query, re.IGNORECASE)
- for model_name, data in filtered_model_data.items():
- if pattern.search(model_name):
- final_filtered_data[model_name] = data
- except re.error:
- # Fallback to simple string matching if regex is invalid
- for model_name, data in filtered_model_data.items():
- if search_query.lower() in model_name.lower():
- final_filtered_data[model_name] = data
- filtered_model_data = final_filtered_data
-
- # Prepare data for scatter plot
- scatter_data = []
- for model_name, data in filtered_model_data.items():
- # Calculate average performance for selected datasets
- scores = []
- for dataset_name in selected_datasets:
- score = data['performances'].get(dataset_name, 0)
- if score > 0: # Only include non-zero scores
- scores.append(score)
-
- if scores:
- avg_performance = np.mean(scores)
- num_parameters = data.get('num_parameters', 0)
- source = data.get('source', 'unknown')
-
- scatter_data.append({
- 'model_name': model_name,
- 'avg_performance': avg_performance,
- 'num_parameters': num_parameters,
- 'source': source
- })
-
- if not scatter_data:
- # Create empty figure if no data
- fig = go.Figure()
- fig.add_annotation(
- text="No data available for the selected filters",
- xref="paper", yref="paper",
- x=0.5, y=0.5, showarrow=False,
- font=dict(size=16)
- )
- fig.update_layout(
- title="Model Performance vs Number of Parameters",
- xaxis_title="Number of Parameters",
- yaxis_title="Average Performance Score",
- height=500
- )
- return fig
-
- # Create scatter plot
- df_scatter = pd.DataFrame(scatter_data)
-
- # Create color mapping for sources
- color_map = {
- 'portuguese_leaderboard': '#1f77b4',
- 'external_models': '#ff7f0e',
- 'napolab_thesis': '#2ca02c',
- 'teenytinyllama_paper': '#d62728',
- 'unknown': '#9467bd'
- }
-
- # Create display name mapping for sources
- display_name_map = {
- 'portuguese_leaderboard': 'Open PT LLM Leaderboard',
- 'external_models': 'Proprietary Models',
- 'napolab_thesis': 'Napolab Thesis',
- 'teenytinyllama_paper': 'TeenyTinyLlama Paper',
- 'unknown': 'Unknown Source'
- }
-
- fig = go.Figure()
-
- for source in df_scatter['source'].unique():
- source_data = df_scatter[df_scatter['source'] == source]
- color = color_map.get(source, '#7f7f7f')
- display_name = display_name_map.get(source, source.replace('_', ' ').title())
-
- fig.add_trace(go.Scatter(
- x=source_data['num_parameters'],
- y=source_data['avg_performance'],
- mode='markers',
- name=display_name,
- marker=dict(
- color=color,
- size=8,
- opacity=0.7
- ),
- text=source_data['model_name'],
-            hovertemplate=(
-                "%{text}<br>" +
-                "Average Performance: %{y:.3f}<br>" +
-                "Number of Parameters: %{x:,}<br>" +
-                "Source: " + display_name + "<br>" +
-                "<extra></extra>"
-            )
- ))
-
- fig.update_layout(
- title="Model Performance vs Number of Parameters",
- xaxis_title="Number of Parameters",
- yaxis_title="Average Performance Score",
- height=500,
- showlegend=True,
- plot_bgcolor='rgba(255, 255, 255, 0)',
- paper_bgcolor='rgba(255, 255, 255, 0)',
- legend=dict(
- yanchor="top",
- y=-0.15,
- xanchor="center",
- x=0.5,
- bgcolor='rgba(255, 255, 255, 0.95)',
- bordercolor='rgba(0, 0, 0, 0.2)',
- borderwidth=1,
- orientation="h"
- ),
- margin=dict(l=50, r=50, t=100, b=100)
- )
-
- return fig
-
- # Event handlers
- def update_radar_chart(*args):
- # Extract arguments for radar chart
- dataset_values = args[:len(analysis_dataset_checkboxes)]
- hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
- min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
- show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
- show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
- show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
- show_external_models = args[len(analysis_dataset_checkboxes) + 5]
- search_query = args[len(analysis_dataset_checkboxes) + 6]
- max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
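-        # NOTE: the positional unpacking above must match the order of
-        # `analysis_inputs` as wired to the .change() handlers below.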
-
- # Convert dataset selections to list of selected dataset names
- selected_datasets = []
- for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
- if dataset_values[i]:
- selected_datasets.append(dataset_name)
-
- return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
-
- def update_benchmark_table(*args):
- # Extract arguments
- dataset_values = args[:len(dataset_checkboxes)]
- hide_incomplete_models = args[len(dataset_checkboxes)]
- min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
- show_napolab_thesis = args[len(dataset_checkboxes) + 2]
- show_teenytinyllama = args[len(dataset_checkboxes) + 3]
- show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
- show_external_models = args[len(dataset_checkboxes) + 5]
- search_query = args[len(dataset_checkboxes) + 6]
- max_num_parameters = args[len(dataset_checkboxes) + 7]
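-        # NOTE: same positional contract as update_radar_chart above, but
-        # against `benchmark_inputs`.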
-
- # Convert dataset selections to list of selected dataset names
- selected_datasets = []
- for i, (dataset_name, _) in enumerate(dataset_checkboxes):
- if dataset_values[i]:
- selected_datasets.append(dataset_name)
-
- df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
-
- return df
-
- def update_scatter_plot(*args):
- # Extract arguments for scatter plot
- dataset_values = args[:len(analysis_dataset_checkboxes)]
- hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
- min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
- show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
- show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
- show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
- show_external_models = args[len(analysis_dataset_checkboxes) + 5]
- search_query = args[len(analysis_dataset_checkboxes) + 6]
- max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
-
- # Convert dataset selections to list of selected dataset names
- selected_datasets = []
- for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
- if dataset_values[i]:
- selected_datasets.append(dataset_name)
-
- return create_model_performance_scatter(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
-
-    # Every control on the Benchmark Results tab re-renders the table with the
-    # same inputs, so wire them all in one loop.
-    benchmark_inputs = [cb for _, cb in dataset_checkboxes] + [
-        hide_incomplete_models, min_average_performance,
-        show_napolab_thesis, show_teenytinyllama,
-        show_portuguese_leaderboard, show_external_models,
-        search_query, max_num_parameters
-    ]
-    for control in benchmark_inputs:
-        control.change(
-            update_benchmark_table,
-            inputs=benchmark_inputs,
-            outputs=benchmark_table
-        )
-
- # Connect export button
- export_button.click(
- export_csv,
- inputs=benchmark_table,
- outputs=csv_file
- )
-
- # Connect file download to cleanup
- csv_file.change(
- cleanup_current_csv,
- inputs=None,
- outputs=None
- )
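-    # NOTE: this assumes the change event fires only after the exported file has
-    # been served to the client; if gr.File.change also fires when export_csv
-    # sets the value, the CSV could be deleted before the user downloads it.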
-
-    # Every control on the Model Analysis tab drives both the radar chart and
-    # the scatter plot with the same inputs, so wire each control to both.
-    analysis_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
-        hide_incomplete_models_analysis, min_average_performance_analysis,
-        show_napolab_thesis_analysis, show_teenytinyllama_analysis,
-        show_portuguese_leaderboard_analysis, show_external_models_analysis,
-        search_query_analysis, max_num_parameters_analysis
-    ]
-    for control in analysis_inputs:
-        control.change(update_radar_chart, inputs=analysis_inputs, outputs=model_analysis_chart)
-        control.change(update_scatter_plot, inputs=analysis_inputs, outputs=model_scatter_plot)
-
-    # Load all three views on app start with the same defaults as the UI controls:
-    # only ASSIN 2 STS, FaQUaD-NLI and HateBR selected, incomplete models hidden,
-    # 80% minimum average performance, all sources shown, empty search, no size cap.
-    default_dataset_values = [
-        NAPOLAB_DATASETS[name].get('name', name) in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
-        for name in sorted(NAPOLAB_DATASETS.keys())
-    ]
-    default_args = default_dataset_values + [True, 80, True, True, True, True, "", 0]
-
-    app.load(lambda: update_radar_chart(*default_args), outputs=model_analysis_chart)
-    app.load(lambda: update_scatter_plot(*default_args), outputs=model_scatter_plot)
-    app.load(lambda: update_benchmark_table(*default_args), outputs=benchmark_table)
-
-if __name__ == "__main__":
+import gradio as gr
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import json
+import os
+import re
+from typing import Dict, List, Optional, Tuple
+
+# Import data loader
+from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata
+
+# Load data from YAML file
+NAPOLAB_DATASETS = get_napolab_datasets()
+SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
+MODEL_METADATA = get_model_metadata()
+
+def load_portuguese_leaderboard_data() -> pd.DataFrame:
+ """Load data from the Portuguese leaderboard CSV file."""
+ try:
+ csv_path = "portuguese_leaderboard.csv"
+ if os.path.exists(csv_path):
+ df = pd.read_csv(csv_path)
+ # Select only the relevant columns
+ relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
+ df = df[relevant_columns].copy()
+
+ # Rename columns to match the existing format
+ df = df.rename(columns={
+ 'assin2_rte': 'ASSIN2 RTE',
+ 'assin2_sts': 'ASSIN2 STS',
+ 'faquad_nli': 'FaQUaD-NLI',
+ 'hatebr_offensive': 'HateBR'
+ })
+
+ # Add source information
+ df['source'] = 'portuguese_leaderboard'
+
+ print(f"Loaded {len(df)} models from Portuguese leaderboard")
+ return df
+ else:
+ print(f"Portuguese leaderboard CSV not found: {csv_path}")
+ return pd.DataFrame()
+ except Exception as e:
+ print(f"Error loading Portuguese leaderboard data: {e}")
+ return pd.DataFrame()
+
+def load_external_models_data() -> pd.DataFrame:
+ """Load data from the external models CSV file."""
+ try:
+ csv_path = "external_models.csv"
+ if os.path.exists(csv_path):
+ df = pd.read_csv(csv_path)
+ # Select only the relevant columns
+ relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
+ df = df[relevant_columns].copy()
+
+ # Rename columns to match the existing format
+ df = df.rename(columns={
+ 'model': 'model_name',
+ 'assin2_rte': 'ASSIN2 RTE',
+ 'assin2_sts': 'ASSIN2 STS',
+ 'faquad_nli': 'FaQUaD-NLI',
+ 'hatebr_offensive': 'HateBR'
+ })
+
+ # Add source information
+ df['source'] = 'external_models'
+
+ # Add model_num_parameters column with 0 for external models
+ df['model_num_parameters'] = 0
+
+ print(f"Loaded {len(df)} external models")
+ return df
+ else:
+ print(f"External models CSV not found: {csv_path}")
+ return pd.DataFrame()
+ except Exception as e:
+ print(f"Error loading external models data: {e}")
+ return pd.DataFrame()
+
+# Load Portuguese leaderboard data
+PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
+
+# Load external models data
+EXTERNAL_MODELS_DATA = load_external_models_data()
+
+def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame:
+ """Create a simplified benchmark table with one column per dataset."""
+ # Get all dataset names
+ dataset_names = sorted(NAPOLAB_DATASETS.keys())
+ dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
+
+ # Use selected datasets if provided, otherwise use all datasets
+ if selected_datasets is None:
+ selected_datasets = dataset_names
+
+ # Collect data for each model
+ model_data = {}
+
+ # Process existing benchmark results
+ for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
+ for model_name, metrics in models.items():
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'dataset_scores': {},
+ 'url': None,
+ 'source': 'existing'
+ }
+
+ # Calculate average performance for this dataset
+ avg_performance = np.mean(list(metrics.values()))
+ model_data[model_name]['dataset_scores'][dataset_name] = avg_performance
+
+ # Process Portuguese leaderboard data
+ if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
+ for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'dataset_scores': {},
+ 'url': None,
+ 'source': 'portuguese_leaderboard',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map Portuguese leaderboard columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in NAPOLAB_DATASETS:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['dataset_scores'][dataset_name] = score
+
+ # Process external models data
+ if show_external_models and not EXTERNAL_MODELS_DATA.empty:
+ for _, row in EXTERNAL_MODELS_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'dataset_scores': {},
+ 'url': row.get('link', ''),
+ 'source': 'external_models',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map external models columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in NAPOLAB_DATASETS:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['dataset_scores'][dataset_name] = score
+
+ # Get model URLs and source information for existing models
+ additional_models = data_loader.get_additional_models()
+ for model_name in model_data.keys():
+ if model_data[model_name]['source'] == 'existing':
+ # Get URL
+ for arch_models in additional_models.values():
+ if model_name in arch_models:
+ model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
+ break
+
+ # Get source information
+ model_metadata = MODEL_METADATA.get(model_name, {})
+ source = model_metadata.get('source', 'unknown')
+ model_data[model_name]['source'] = source
+
+ # Add num_parameters for existing models (set to 0 as they don't have this info)
+ model_data[model_name]['num_parameters'] = 0
+
+ # Create table data
+ table_data = []
+
+ for model_name, data in model_data.items():
+ # Apply source filtering
+ source = data['source']
+
+ # Apply show filters - only show models from sources that are checked
+ if source == 'napolab_thesis' and not show_napolab_thesis:
+ continue
+ if source == 'teenytinyllama_paper' and not show_teenytinyllama:
+ continue
+ if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
+ continue
+ if source == 'external_models' and not show_external_models:
+ continue
+ # Hide models with unknown source (should not happen with proper data)
+ if source == 'unknown':
+ continue
+
+ # Apply parameter filtering (only for Portuguese leaderboard models)
+ if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+ num_parameters = data.get('num_parameters', 0)
+ if num_parameters > max_num_parameters:
+ continue
+
+ # Create clickable link for model name
+ if data['url']:
+ model_display = f"[{model_name}]({data['url']})"
+ elif source == 'portuguese_leaderboard' and '/' in model_name:
+ # Create Hugging Face link for Portuguese leaderboard models with slashes
+ huggingface_url = f"https://huggingface.co/{model_name}"
+ model_display = f"[{model_name}]({huggingface_url})"
+ else:
+ model_display = model_name
+
+ # Create row with dataset scores
+ row_data = {'Model': model_display}
+
+ # Calculate average only over selected datasets
+ selected_scores = []
+ for dataset_name in selected_datasets:
+ score = data['dataset_scores'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores in average
+ selected_scores.append(score)
+
+ overall_avg = np.mean(selected_scores) if selected_scores else 0
+ row_data['Average'] = round(overall_avg, 4)
+
+ # Add scores for each dataset (only selected ones)
+ for dataset_name in dataset_names:
+ score = data['dataset_scores'].get(dataset_name, 0)
+ display_name = dataset_display_names[dataset_names.index(dataset_name)]
+ # Only add columns for selected datasets
+ if dataset_name in selected_datasets:
+ row_data[display_name] = round(score, 4)
+
+ table_data.append(row_data)
+
+ df = pd.DataFrame(table_data)
+
+ # Filter to show only models that have scores for at least one selected dataset
+ if selected_datasets and not df.empty:
+ # Get display names for selected datasets
+ selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]
+
+ # Filter models based on selection criteria
+ models_to_keep = []
+ for _, row in df.iterrows():
+ has_score = False
+ has_all_scores = True
+
+ # Only check the datasets that are actually selected for display
+ for dataset_name in selected_datasets:
+ display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
+ if display_name in df.columns:
+ score = row[display_name]
+ if score > 0:
+ has_score = True
+ else:
+ has_all_scores = False
+
+ # Keep model if it has at least one score
+ if has_score:
+ # If hide_incomplete_models is True, only keep models with all scores in selected datasets
+ if not hide_incomplete_models or has_all_scores:
+ models_to_keep.append(row['Model'])
+
+ # Filter dataframe to only include selected models
+ if models_to_keep:
+ df = df[df['Model'].isin(models_to_keep)]
+ else:
+ # If no models to keep, create empty DataFrame with proper structure
+            # Create columns list first (same order as the populated table:
+            # Model, Average, then one column per selected dataset)
+            columns = ['Model', 'Average']
+            for dataset_name in dataset_names:
+                display_name = dataset_display_names[dataset_names.index(dataset_name)]
+                if dataset_name in selected_datasets:
+                    columns.append(display_name)
+
+ # Create empty DataFrame with correct columns
+ df = pd.DataFrame(columns=columns)
+
+ # Filter by minimum average performance
+ if min_average_performance > 0 and not df.empty:
+ df = df[df['Average'] >= min_average_performance]
+
+ # Filter by search query
+ if search_query and not df.empty:
+ # Extract model names from markdown links for searching
+ df_filtered = df.copy()
+ df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
+ try:
+ # Use regex pattern matching
+ df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False, regex=True)]
+ except re.error:
+ # Fallback to simple string matching if regex is invalid
+ df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
+ df = df_filtered.drop('model_name_clean', axis=1)
+
+ # Sort by Average (descending)
+ if not df.empty:
+ df = df.sort_values('Average', ascending=False)
+
+ # Add rank column with medal emojis for top 3 and color-coded emojis for others
+ if not df.empty:
+ df = df.reset_index(drop=True)
+ df.index = df.index + 1 # Start ranking from 1
+
+ # Create rank column with medal emojis and color-coded emojis
+ rank_column = []
+ total_models = len(df)
+
+ for rank in df.index:
+            if rank == 1:
+                rank_column.append("🥇 1")
+            elif rank == 2:
+                rank_column.append("🥈 2")
+            elif rank == 3:
+                rank_column.append("🥉 3")
+            else:
+                # Color-code based on position relative to total
+                position_ratio = rank / total_models
+                if position_ratio <= 0.33:  # Top third
+                    rank_column.append("🟢 " + str(rank))
+                elif position_ratio <= 0.67:  # Middle third
+                    rank_column.append("🟡 " + str(rank))
+                else:  # Bottom third
+                    rank_column.append("🔴 " + str(rank))
+
+ df.insert(0, 'Rank', rank_column)
+
+ return df
+
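+# Illustrative usage sketch (assumes the data files loaded above are present
+# and that 'assin2_rte' is a key in NAPOLAB_DATASETS):
+#   df = create_simplified_benchmark_table(selected_datasets=['assin2_rte'])
+#   print(df.head())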
+
+# Global variable to track the current CSV file
+current_csv_file = None
+
+def export_csv(df: pd.DataFrame):
+ """Export the benchmark table to CSV."""
+ global current_csv_file
+
+ print(f"Export function called with dataframe shape: {df.shape}")
+
+ if df.empty:
+ print("Dataframe is empty, returning None")
+ return None
+
+ # Clean up previous file if it exists
+ if current_csv_file:
+ try:
+ import os
+ if os.path.exists(current_csv_file):
+ os.remove(current_csv_file)
+ print(f"Deleted previous CSV file: {current_csv_file}")
+ except Exception as e:
+ print(f"Error deleting previous file {current_csv_file}: {e}")
+
+ # Clean the dataframe for CSV export
+ df_clean = df.copy()
+
+ # Remove markdown formatting from model names for cleaner CSV
+ df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
+
+ # Create filename with timestamp
+ from datetime import datetime
+ import tempfile
+ import os
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"napolab_benchmark_results_{timestamp}.csv"
+
+ # Create file in current directory (simpler approach)
+ file_path = filename
+
+ print(f"Creating CSV file at: {file_path}")
+
+ # Save to CSV file
+ df_clean.to_csv(file_path, index=False)
+
+ print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")
+
+ # Update current file tracking
+ current_csv_file = file_path
+
+ return file_path
+
+def cleanup_current_csv():
+ """Clean up the current CSV file after download."""
+ global current_csv_file
+ import os
+
+ if current_csv_file and os.path.exists(current_csv_file):
+ try:
+ os.remove(current_csv_file)
+ print(f"Deleted CSV file after download: {current_csv_file}")
+ current_csv_file = None
+ except Exception as e:
+ print(f"Error deleting file {current_csv_file}: {e}")
+
+
+def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
+ """Create a radar chart showing model performance across all datasets."""
+ # Use selected datasets if provided, otherwise use all datasets
+ if selected_datasets is None:
+ selected_datasets = list(NAPOLAB_DATASETS.keys())
+
+ # Get dataset names for the radar axes (only selected ones)
+ dataset_names = selected_datasets
+ dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
+
+ # Collect data for each model
+ model_data = {}
+
+ # Process existing benchmark results
+ for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
+ if dataset_name in selected_datasets:
+ for model_name, metrics in models.items():
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
+ 'source': 'existing'
+ }
+
+ # Calculate average performance for this dataset
+ avg_performance = np.mean(list(metrics.values()))
+ model_data[model_name]['performances'][dataset_name] = avg_performance
+
+ # Process Portuguese leaderboard data
+ if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
+ for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': 'Unknown',
+ 'source': 'portuguese_leaderboard',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map Portuguese leaderboard columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in selected_datasets:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['performances'][dataset_name] = score
+
+ # Process external models data
+ if show_external_models and not EXTERNAL_MODELS_DATA.empty:
+ for _, row in EXTERNAL_MODELS_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': 'Unknown',
+ 'source': 'external_models',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map external models columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in selected_datasets:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['performances'][dataset_name] = score
+
+ # Get model URLs and source information for existing models
+ additional_models = data_loader.get_additional_models()
+ for model_name in model_data.keys():
+ if model_data[model_name]['source'] == 'existing':
+ # Get URL
+ for arch_models in additional_models.values():
+ if model_name in arch_models:
+ model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
+ break
+
+ # Get source information
+ model_metadata = MODEL_METADATA.get(model_name, {})
+ source = model_metadata.get('source', 'unknown')
+ model_data[model_name]['source'] = source
+
+ # Add num_parameters for existing models (set to 0 as they don't have this info)
+ model_data[model_name]['num_parameters'] = 0
+
+ # Apply source filtering
+ filtered_model_data = {}
+ for model_name, data in model_data.items():
+ source = data.get('source', 'existing')
+
+ # Apply show filters - only show models from sources that are checked
+ if source == 'napolab_thesis' and not show_napolab_thesis:
+ continue
+ if source == 'teenytinyllama_paper' and not show_teenytinyllama:
+ continue
+ if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
+ continue
+ if source == 'external_models' and not show_external_models:
+ continue
+ # Hide models with unknown source (should not happen with proper data)
+ if source == 'unknown':
+ continue
+
+ # Apply parameter filtering (only for Portuguese leaderboard models)
+ if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+ num_parameters = data.get('num_parameters', 0)
+ if num_parameters > max_num_parameters:
+ continue
+
+ filtered_model_data[model_name] = data
+
+ # Apply incomplete model filtering
+ if hide_incomplete_models and selected_datasets:
+ final_filtered_data = {}
+ for model_name, data in filtered_model_data.items():
+ has_all_scores = True
+ for dataset_name in selected_datasets:
+ if data['performances'].get(dataset_name, 0) == 0:
+ has_all_scores = False
+ break
+ if has_all_scores:
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Apply minimum average performance filtering
+ if min_average_performance > 0 and selected_datasets:
+ final_filtered_data = {}
+ for model_name, data in filtered_model_data.items():
+ # Calculate average performance for selected datasets
+ scores = []
+ for dataset_name in selected_datasets:
+ score = data['performances'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores
+ scores.append(score)
+
+ if scores:
+ avg_performance = np.mean(scores)
+ if avg_performance >= min_average_performance:
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Apply search query filtering
+ if search_query:
+ final_filtered_data = {}
+ try:
+ # Use regex pattern matching
+ import re
+ pattern = re.compile(search_query, re.IGNORECASE)
+ for model_name, data in filtered_model_data.items():
+ if pattern.search(model_name):
+ final_filtered_data[model_name] = data
+ except re.error:
+ # Fallback to simple string matching if regex is invalid
+ for model_name, data in filtered_model_data.items():
+ if search_query.lower() in model_name.lower():
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Sort models by average performance (descending)
+ model_performances = []
+ for model_name, data in filtered_model_data.items():
+ # Calculate average performance for selected datasets
+ scores = []
+ for dataset_name in selected_datasets:
+ score = data['performances'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores
+ scores.append(score)
+
+ avg_performance = np.mean(scores) if scores else 0
+ model_performances.append((model_name, data, avg_performance))
+
+ # Sort by average performance (descending)
+ model_performances.sort(key=lambda x: x[2], reverse=True)
+
+ # Calculate dynamic range based on actual data
+ all_performance_values = []
+ for model_name, data, avg_performance in model_performances:
+ for dataset_name in dataset_names:
+ score = data['performances'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores
+ all_performance_values.append(score)
+
+ # Set dynamic range with some padding
+ if all_performance_values:
+ min_score = min(all_performance_values)
+ max_score = max(all_performance_values)
+ # Add 5% padding below minimum and ensure minimum is not below 0.5
+ range_min = max(0.5, min_score - (max_score - min_score) * 0.05)
+ range_max = 1.0
+ else:
+ # Fallback to default range if no data
+ range_min = 0.6
+ range_max = 1.0
+
+ # Create radar chart
+ fig = go.Figure()
+
+ # Generate a more distinguishable color palette
+ num_models = len(model_performances)
+
+ # Create a list of line styles for better differentiation
+ line_styles = ['solid', 'dash', 'dot', 'dashdot', 'longdash', 'longdashdot']
+
+ # Use highly contrasting colors for better differentiation
+ base_colors = [
+ '#1f77b4', # Blue
+ '#ff7f0e', # Orange
+ '#2ca02c', # Green
+ '#d62728', # Red
+ '#9467bd', # Purple
+ '#8c564b', # Brown
+ '#e377c2', # Pink
+ '#7f7f7f', # Gray
+ '#bcbd22', # Olive
+ '#17becf', # Cyan
+ '#ff9896', # Light Red
+ '#98df8a', # Light Green
+ '#ffbb78', # Light Orange
+ '#aec7e8', # Light Blue
+ '#c5b0d5', # Light Purple
+ ]
+
+ # Ensure we have enough colors
+ while len(base_colors) < num_models:
+ base_colors.extend(base_colors)
+
+ colors = base_colors[:num_models]
+
+ for i, (model_name, data, avg_performance) in enumerate(model_performances):
+ # Get performance values for all datasets (fill with 0 if missing)
+ performance_values = []
+ for dataset_name in dataset_names:
+ performance_values.append(data['performances'].get(dataset_name, 0))
+
+ # Close the polygon by adding the first value at the end
+ if performance_values:
+ performance_values.append(performance_values[0])
+
+ # Assign color and line style based on model index for better differentiation
+ color = colors[i % len(colors)]
+ line_style = line_styles[i % len(line_styles)]
+
+ # Show first two models by default, hide the rest
+ visible = True if i < 2 else 'legendonly'
+
+ # Create theta values that close the polygon
+ theta_values = dataset_display_names + [dataset_display_names[0]] if dataset_display_names else []
+
+ fig.add_trace(go.Scatterpolar(
+ r=performance_values,
+ theta=theta_values,
+ fill=None,
+ name=model_name,
+ line_color=color,
+ line_dash=line_style,
+ line_width=3,
+ opacity=0.8,
+ visible=visible,
+            hovertemplate=(
+                "<b>%{fullData.name}</b><br>" +
+                "Dataset: %{theta}<br>" +
+                "Performance: %{r:.3f}<br>" +
+                "Architecture: " + data['architecture'] + "<br>" +
+                "<extra></extra>"
+            )
+ ))
+
+ # Update layout
+ fig.update_layout(
+ title="Model Performance Radar Chart",
+ polar=dict(
+ radialaxis=dict(
+ visible=True,
+ range=[range_min, range_max],
+ gridcolor='rgba(0, 0, 0, 0.2)',
+ linecolor='rgba(0, 0, 0, 0.5)',
+ tickcolor='rgba(0, 0, 0, 0.7)',
+ tickfont=dict(color='rgba(0, 0, 0, 0.8)')
+ ),
+ angularaxis=dict(
+ tickmode='array',
+ tickvals=list(range(len(dataset_display_names))),
+ ticktext=dataset_display_names,
+ gridcolor='rgba(0, 0, 0, 0.2)',
+ linecolor='rgba(0, 0, 0, 0.5)',
+ tickcolor='rgba(0, 0, 0, 0.7)',
+ tickfont=dict(color='rgba(0, 0, 0, 0.8)')
+ ),
+ bgcolor='rgba(255, 255, 255, 0)'
+ ),
+ height=700,
+ showlegend=True,
+ plot_bgcolor='rgba(255, 255, 255, 0)',
+ paper_bgcolor='rgba(255, 255, 255, 0)',
+ legend=dict(
+ yanchor="top",
+ y=-0.15,
+ xanchor="center",
+ x=0.5,
+ bgcolor='rgba(255, 255, 255, 0.95)',
+ bordercolor='rgba(0, 0, 0, 0.2)',
+ borderwidth=1,
+ orientation="h",
+ font=dict(color='rgba(0, 0, 0, 0.8)')
+ ),
+ margin=dict(l=50, r=50, t=100, b=100),
+ font=dict(color='rgba(0, 0, 0, 0.8)')
+ )
+
+ return fig
+
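+# Illustrative usage sketch (assumes the data files loaded above are present;
+# min_average_performance is a decimal here, not a percentage):
+#   fig = create_model_performance_radar(min_average_performance=0.8)
+#   fig.show()
+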
+# Gradio Interface
+with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
+ gr.Markdown("""
+    # 🏆 Napolab Leaderboard
+
+ Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks.
+
+    [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
+ """)
+
+ with gr.Tabs():
+
+ # Benchmark Results Tab
+        with gr.Tab("📊 Benchmark Results"):
+ gr.Markdown("### Model Performance Benchmarks")
+
+ with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
+ with gr.Row():
+ # Create checkboxes for each dataset
+ dataset_checkboxes = []
+ for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
+ display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
+ # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
+ default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
+ checkbox = gr.Checkbox(
+ label=display_name,
+ value=default_value
+ )
+ dataset_checkboxes.append((dataset_name, checkbox))
+
+ with gr.Accordion("Filter by Score: (Click to expand)", open=False):
+ with gr.Row():
+ hide_incomplete_models = gr.Checkbox(
+ label="Hide models with zero scores in selected datasets",
+ value=True
+ )
+
+ min_average_performance = gr.Slider(
+ minimum=0,
+ maximum=100,
+ value=80,
+ step=1,
+ label="Minimum Average Performance (%)"
+ )
+
+ with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
+ with gr.Row():
+ show_napolab_thesis = gr.Checkbox(
+ label="Napolab Thesis models",
+ value=True
+ )
+ show_teenytinyllama = gr.Checkbox(
+ label="TeenyTinyLlama models",
+ value=True
+ )
+ show_portuguese_leaderboard = gr.Checkbox(
+ label="Open Portuguese LLM Leaderboard models (open-source)",
+ value=True
+ )
+
+ show_external_models = gr.Checkbox(
+ label="Open Portuguese LLM Leaderboard models (proprietary)",
+ value=True
+ )
+
+ # Calculate max parameters for slider
+ max_params = 0
+ if not PORTUGUESE_LEADERBOARD_DATA.empty:
+ max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
+
+ with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
+ with gr.Row():
+ max_num_parameters = gr.Slider(
+ minimum=0,
+ maximum=max_params,
+ value=0,
+ step=1,
+ label="Maximum Number of Parameters",
+                        info="This slider applies only to Open PT LLM Leaderboard models; it has no effect on other models."
+ )
+
+ # Search bar for filtering models
+ search_query = gr.Textbox(
+ label="Search models by name (supports regex)",
+ placeholder="Enter model name or regex pattern to filter...",
+ value="",
+ info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
+ )
+
+ benchmark_table = gr.DataFrame(
+ label="Model Performance Benchmarks",
+ wrap=[True, False, False, False, False, False, False, False, False, False],
+ interactive=False,
+ datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
+ column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
+ )
+
+            gr.Markdown("*🥇🥈🥉 = Top 3 | 🟢 = Top 33% | 🟡 = Middle 33% | 🔴 = Bottom 33%*")
+
+ # Export to CSV button and file component
+            export_button = gr.Button("📥 Export to CSV", variant="secondary")
+ csv_file = gr.File(label="Download CSV", interactive=False, visible=True)
+
+ # Model Analysis Tab
+        with gr.Tab("📈 Model Analysis"):
+ gr.Markdown("### Model Performance Radar Chart")
+
+ # Dataset Selection Controls
+ with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
+ with gr.Row():
+ # Create checkboxes for each dataset
+ analysis_dataset_checkboxes = []
+ for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
+ display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
+ # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
+ default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
+ checkbox = gr.Checkbox(
+ label=display_name,
+ value=default_value
+ )
+ analysis_dataset_checkboxes.append((dataset_name, checkbox))
+
+ # Filter Controls
+ with gr.Accordion("Filter by Score: (Click to expand)", open=False):
+ with gr.Row():
+ hide_incomplete_models_analysis = gr.Checkbox(
+ label="Hide models with zero scores in selected datasets",
+ value=True
+ )
+
+ min_average_performance_analysis = gr.Slider(
+ minimum=0,
+ maximum=100,
+ value=80,
+ step=1,
+ label="Minimum Average Performance (%)"
+ )
+
+ with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
+ with gr.Row():
+ show_napolab_thesis_analysis = gr.Checkbox(
+ label="Napolab Thesis models",
+ value=True
+ )
+
+ show_teenytinyllama_analysis = gr.Checkbox(
+ label="TeenyTinyLlama models",
+ value=True
+ )
+
+ show_portuguese_leaderboard_analysis = gr.Checkbox(
+ label="Open Portuguese LLM Leaderboard models (open-source)",
+ value=True
+ )
+
+ show_external_models_analysis = gr.Checkbox(
+ label="Open Portuguese LLM Leaderboard models (proprietary)",
+ value=True
+ )
+
+ # Parameter slider for Model Analysis tab
+ with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
+ with gr.Row():
+ max_num_parameters_analysis = gr.Slider(
+ minimum=0,
+ maximum=max_params,
+ value=0,
+ step=1,
+ label="Maximum Number of Parameters",
+                        info="This slider applies only to Open PT LLM Leaderboard models; it has no effect on other models."
+ )
+
+ # Search bar for filtering models in radar chart
+ search_query_analysis = gr.Textbox(
+ label="Search models by name (supports regex)",
+ placeholder="Enter model name or regex pattern to filter...",
+ value="",
+ info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
+ )
+
+ model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
+
+ # Add scatter plot below radar chart
+ model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
+
+ gr.Markdown("""
+ **How to interact with the chart:**
+ - **Click on legend items** to show/hide specific models.
+ - **Double-click on a legend item** to isolate that model (hide all others).
+ - **Double-click again** to show all models.
+
+ Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
+ """)
+
+ # About Tab
+        with gr.Tab("ℹ️ About"):
+ gr.Markdown("""
+ ## About Napolab
+
+ **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.
+
+ - [GitHub repository](https://github.com/ruanchaves/napolab)
+ - [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab)
+ - Article: ["The Hidden Truth About LLM Performance: Why Your Benchmark Results Might Be Misleading"](https://ruanchaves.medium.com/the-hidden-truth-about-llm-performance-why-your-benchmark-results-might-be-misleading-afd24f40a46c)
+
+ ### Data Sources:
+ The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:
+
+ **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)
+
+ **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+
+    **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by Corrêa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).
+
+ ### Thesis Citation:
+ ```bibtex
+ @mastersthesis{chaves2023lessons,
+ title={Lessons learned from the evaluation of Portuguese language models},
+ author={Chaves Rodrigues, Ruan},
+ year={2023},
+ school={University of Malta},
+ url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
+ }
+ ```
+
+ ### Napolab Citation:
+ ```bibtex
+ @software{Chaves_Rodrigues_napolab_2023,
+ author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
+ doi = {10.5281/zenodo.7781848},
+ month = {3},
+ title = {{Natural Portuguese Language Benchmark (Napolab)}},
+ url = {https://github.com/ruanchaves/napolab},
+ version = {1.0.0},
+ year = {2023}
+ }
+ ```
+
+ """)
+
+ def create_model_performance_scatter(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
+ """Create a scatter plot showing model performance vs number of parameters."""
+ # Use selected datasets if provided, otherwise use all datasets
+ if selected_datasets is None:
+ selected_datasets = list(NAPOLAB_DATASETS.keys())
+
+ # Collect data for each model
+ model_data = {}
+
+ # Process existing benchmark results
+ for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
+ if dataset_name in selected_datasets:
+ for model_name, metrics in models.items():
+ if model_name not in model_data:
+ # Get actual source from MODEL_METADATA
+ model_metadata = MODEL_METADATA.get(model_name, {})
+ actual_source = model_metadata.get('source', 'unknown')
+
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': model_metadata.get('architecture', 'Unknown'),
+ 'source': actual_source,
+ 'num_parameters': 0
+ }
+
+ # Calculate average performance for this dataset
+ avg_performance = np.mean(list(metrics.values()))
+ model_data[model_name]['performances'][dataset_name] = avg_performance
+
+ # Process Portuguese leaderboard data
+ if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
+ for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': 'Unknown',
+ 'source': 'portuguese_leaderboard',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map Portuguese leaderboard columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in selected_datasets:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['performances'][dataset_name] = score
+
+ # Process external models data
+ if show_external_models and not EXTERNAL_MODELS_DATA.empty:
+ for _, row in EXTERNAL_MODELS_DATA.iterrows():
+ model_name = row['model_name']
+
+ if model_name not in model_data:
+ model_data[model_name] = {
+ 'performances': {},
+ 'architecture': 'Unknown',
+ 'source': 'external_models',
+ 'num_parameters': row.get('model_num_parameters', 0)
+ }
+
+ # Map external models columns to dataset names
+ column_mapping = {
+ 'ASSIN2 RTE': 'assin2_rte',
+ 'ASSIN2 STS': 'assin2_sts',
+ 'FaQUaD-NLI': 'faquad-nli',
+ 'HateBR': 'hatebr'
+ }
+
+ for display_name, dataset_name in column_mapping.items():
+ if dataset_name in selected_datasets:
+ score = row[display_name]
+ if pd.notna(score) and score > 0:
+ model_data[model_name]['performances'][dataset_name] = score
+
+ # Apply source filtering
+ filtered_model_data = {}
+ for model_name, data in model_data.items():
+ source = data.get('source', 'existing')
+
+ # Apply show filters - only show models from sources that are checked
+ if source == 'napolab_thesis' and not show_napolab_thesis:
+ continue
+ if source == 'teenytinyllama_paper' and not show_teenytinyllama:
+ continue
+ if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
+ continue
+ if source == 'external_models' and not show_external_models:
+ continue
+ # Hide models with unknown source (should not happen with proper data)
+ if source == 'unknown':
+ continue
+
+ # Apply parameter filtering (only for Portuguese leaderboard models)
+ if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+ num_parameters = data.get('num_parameters', 0)
+ if num_parameters > max_num_parameters:
+ continue
+
+ filtered_model_data[model_name] = data
+
+ # Apply incomplete model filtering
+ if hide_incomplete_models and selected_datasets:
+ final_filtered_data = {}
+ for model_name, data in filtered_model_data.items():
+ has_all_scores = True
+ for dataset_name in selected_datasets:
+ if data['performances'].get(dataset_name, 0) == 0:
+ has_all_scores = False
+ break
+ if has_all_scores:
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Apply minimum average performance filtering
+ if min_average_performance > 0 and selected_datasets:
+ final_filtered_data = {}
+ for model_name, data in filtered_model_data.items():
+ # Calculate average performance for selected datasets
+ scores = []
+ for dataset_name in selected_datasets:
+ score = data['performances'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores
+ scores.append(score)
+
+ if scores:
+ avg_performance = np.mean(scores)
+ if avg_performance >= min_average_performance:
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Apply search query filtering
+ if search_query:
+ final_filtered_data = {}
+ try:
+ # Use regex pattern matching
+ import re
+ pattern = re.compile(search_query, re.IGNORECASE)
+ for model_name, data in filtered_model_data.items():
+ if pattern.search(model_name):
+ final_filtered_data[model_name] = data
+ except re.error:
+ # Fallback to simple string matching if regex is invalid
+ for model_name, data in filtered_model_data.items():
+ if search_query.lower() in model_name.lower():
+ final_filtered_data[model_name] = data
+ filtered_model_data = final_filtered_data
+
+ # Prepare data for scatter plot
+ scatter_data = []
+ for model_name, data in filtered_model_data.items():
+ # Calculate average performance for selected datasets
+ scores = []
+ for dataset_name in selected_datasets:
+ score = data['performances'].get(dataset_name, 0)
+ if score > 0: # Only include non-zero scores
+ scores.append(score)
+
+ if scores:
+ avg_performance = np.mean(scores)
+ num_parameters = data.get('num_parameters', 0)
+ source = data.get('source', 'unknown')
+
+ scatter_data.append({
+ 'model_name': model_name,
+ 'avg_performance': avg_performance,
+ 'num_parameters': num_parameters,
+ 'source': source
+ })
+
+ if not scatter_data:
+ # Create empty figure if no data
+ fig = go.Figure()
+ fig.add_annotation(
+ text="No data available for the selected filters",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5, showarrow=False,
+ font=dict(size=16)
+ )
+ fig.update_layout(
+ title="Model Performance vs Number of Parameters",
+ xaxis_title="Number of Parameters",
+ yaxis_title="Average Performance Score",
+ height=500
+ )
+ return fig
+
+ # Create scatter plot
+ df_scatter = pd.DataFrame(scatter_data)
+
+ # Create color mapping for sources
+ color_map = {
+ 'portuguese_leaderboard': '#1f77b4',
+ 'external_models': '#ff7f0e',
+ 'napolab_thesis': '#2ca02c',
+ 'teenytinyllama_paper': '#d62728',
+ 'unknown': '#9467bd'
+ }
+
+ # Create display name mapping for sources
+ display_name_map = {
+ 'portuguese_leaderboard': 'Open PT LLM Leaderboard',
+ 'external_models': 'Proprietary Models',
+ 'napolab_thesis': 'Napolab Thesis',
+ 'teenytinyllama_paper': 'TeenyTinyLlama Paper',
+ 'unknown': 'Unknown Source'
+ }
+
+ fig = go.Figure()
+
+ for source in df_scatter['source'].unique():
+ source_data = df_scatter[df_scatter['source'] == source]
+ color = color_map.get(source, '#7f7f7f')
+ display_name = display_name_map.get(source, source.replace('_', ' ').title())
+
+ fig.add_trace(go.Scatter(
+ x=source_data['num_parameters'],
+ y=source_data['avg_performance'],
+ mode='markers',
+ name=display_name,
+ marker=dict(
+ color=color,
+ size=8,
+ opacity=0.7
+ ),
+ text=source_data['model_name'],
+                hovertemplate=(
+                    "<b>%{text}</b><br>" +
+                    "Average Performance: %{y:.3f}<br>" +
+                    "Number of Parameters: %{x:,}<br>" +
+                    "Source: " + display_name + "<br>" +
+                    "<extra></extra>"
+                )
+ ))
+
+ fig.update_layout(
+ title="Model Performance vs Number of Parameters",
+ xaxis_title="Number of Parameters",
+ yaxis_title="Average Performance Score",
+ height=500,
+ showlegend=True,
+ plot_bgcolor='rgba(255, 255, 255, 0)',
+ paper_bgcolor='rgba(255, 255, 255, 0)',
+ legend=dict(
+ yanchor="top",
+ y=-0.15,
+ xanchor="center",
+ x=0.5,
+ bgcolor='rgba(255, 255, 255, 0.95)',
+ bordercolor='rgba(0, 0, 0, 0.2)',
+ borderwidth=1,
+ orientation="h"
+ ),
+ margin=dict(l=50, r=50, t=100, b=100)
+ )
+
+ return fig
+
+ # Event handlers
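+    # NOTE: the update_* handlers below unpack their arguments positionally,
+    # mirroring the `inputs` lists wired further down: one boolean per dataset
+    # checkbox, then hide_incomplete_models, min_average_performance (a
+    # percentage, converted here to a decimal), the four source checkboxes,
+    # search_query and max_num_parameters.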
+ def update_radar_chart(*args):
+ # Extract arguments for radar chart
+ dataset_values = args[:len(analysis_dataset_checkboxes)]
+ hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
+ min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
+ show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
+ show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
+ show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
+ show_external_models = args[len(analysis_dataset_checkboxes) + 5]
+ search_query = args[len(analysis_dataset_checkboxes) + 6]
+ max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
+
+ # Convert dataset selections to list of selected dataset names
+ selected_datasets = []
+ for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
+ if dataset_values[i]:
+ selected_datasets.append(dataset_name)
+
+ return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
+
+ def update_benchmark_table(*args):
+ # Extract arguments
+ dataset_values = args[:len(dataset_checkboxes)]
+ hide_incomplete_models = args[len(dataset_checkboxes)]
+ min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
+ show_napolab_thesis = args[len(dataset_checkboxes) + 2]
+ show_teenytinyllama = args[len(dataset_checkboxes) + 3]
+ show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
+ show_external_models = args[len(dataset_checkboxes) + 5]
+ search_query = args[len(dataset_checkboxes) + 6]
+ max_num_parameters = args[len(dataset_checkboxes) + 7]
+
+ # Convert dataset selections to list of selected dataset names
+ selected_datasets = []
+ for i, (dataset_name, _) in enumerate(dataset_checkboxes):
+ if dataset_values[i]:
+ selected_datasets.append(dataset_name)
+
+ df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
+
+ return df
+
+ def update_scatter_plot(*args):
+ # Extract arguments for scatter plot
+ dataset_values = args[:len(analysis_dataset_checkboxes)]
+ hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
+ min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
+ show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
+ show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
+ show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
+ show_external_models = args[len(analysis_dataset_checkboxes) + 5]
+ search_query = args[len(analysis_dataset_checkboxes) + 6]
+ max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
+
+ # Convert dataset selections to list of selected dataset names
+ selected_datasets = []
+ for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
+ if dataset_values[i]:
+ selected_datasets.append(dataset_name)
+
+ return create_model_performance_scatter(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
+
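+    # Every control shares the same `inputs` list, so each change event
+    # re-renders its output with the complete current filter state rather
+    # than only the value that changed.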
+ # Connect dataset checkboxes to update table
+ for dataset_name, checkbox in dataset_checkboxes:
+ checkbox.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ hide_incomplete_models.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ min_average_performance.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ show_napolab_thesis.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ show_teenytinyllama.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ show_portuguese_leaderboard.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ show_external_models.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ # Connect search query to update table
+ search_query.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ # Connect max_num_parameters to update table
+ max_num_parameters.change(
+ update_benchmark_table,
+ inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+ outputs=benchmark_table
+ )
+
+ # Connect export button
+ export_button.click(
+ export_csv,
+ inputs=benchmark_table,
+ outputs=csv_file
+ )
+
+ # Connect file download to cleanup
+ csv_file.change(
+ cleanup_current_csv,
+ inputs=None,
+ outputs=None
+ )
+
+ # Connect analysis chart events
+ # Connect dataset checkboxes to update radar chart
+ for dataset_name, checkbox in analysis_dataset_checkboxes:
+ checkbox.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ hide_incomplete_models_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ min_average_performance_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ show_napolab_thesis_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ show_teenytinyllama_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ show_portuguese_leaderboard_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ show_external_models_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ # Connect search query to update radar chart
+ search_query_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ # Connect max_num_parameters_analysis to update radar chart
+ max_num_parameters_analysis.change(
+ update_radar_chart,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_analysis_chart
+ )
+
+ # Connect all analysis controls to update scatter plot
+ for dataset_name, checkbox in analysis_dataset_checkboxes:
+ checkbox.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ hide_incomplete_models_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ min_average_performance_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ show_napolab_thesis_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ show_teenytinyllama_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ show_portuguese_leaderboard_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ show_external_models_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ search_query_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ max_num_parameters_analysis.change(
+ update_scatter_plot,
+ inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+ outputs=model_scatter_plot
+ )
+
+ # Connect events
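+    # The startup lambdas below rebuild the default UI state: one boolean per
+    # dataset checkbox (True only for ASSIN 2 STS, FaQUaD-NLI and HateBR),
+    # followed by the remaining control defaults (hide incomplete = True,
+    # minimum average = 80%, all four sources shown, empty search query,
+    # no parameter cap).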
+ # Load model analysis chart on app start
+ app.load(lambda: update_radar_chart(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_analysis_chart)
+
+ # Load scatter plot on app start
+ app.load(lambda: update_scatter_plot(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_scatter_plot)
+
+ # Load benchmark table on app start
+ app.load(lambda: update_benchmark_table(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=benchmark_table)
+
+if __name__ == "__main__":
app.launch(server_name="0.0.0.0", server_port=7860)
\ No newline at end of file