Upload 14 files

- README.md +60 -13
- app.py +1006 -0
- config.py +48 -0
- data.yaml +422 -0
- data_loader.py +133 -0
- download_external_models.py +124 -0
- example_usage.py +134 -0
- external_models.csv +31 -0
- extract_portuguese_leaderboard.py +195 -0
- manage_data.py +226 -0
- portuguese_leaderboard.csv +0 -0
- requirements.txt +8 -0
- run_app.py +73 -0
- validate_data.py +106 -0
README.md
CHANGED
@@ -1,13 +1,60 @@
# Napolab Leaderboard - Gradio App

A comprehensive Gradio web application for exploring and benchmarking Portuguese language models using the Napolab dataset collection.

## Features

- **📊 Benchmark Results**: Single comprehensive table with one column per dataset and clickable model links
- **📈 Model Analysis**: Radar chart showing model performance across all datasets

## Installation

1. Navigate to the leaderboard directory:
```bash
cd dev/napolab/leaderboard
```

2. Install the required dependencies:
```bash
pip install -r requirements.txt
```

3. Extract data from external sources (optional but recommended):
```bash
# Extract data from the Portuguese LLM Leaderboard
python extract_portuguese_leaderboard.py

# Download external models data
python download_external_models.py
```

4. Run the Gradio app:
```bash
python app.py
```

The app will be available at `http://localhost:7860`.

## Data Management

The app uses a YAML configuration file (`data.yaml`) for adding new data, making it easy to edit and maintain.
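For illustration, here is a minimal sketch of what a new `data.yaml` entry could look like, mirroring the `datasets` and `benchmark_results` structure already used in this repository. The dataset key, model name, and score below are hypothetical, and the snippet assumes PyYAML is available (the app's `data_loader` already parses `data.yaml`).

```python
# Hypothetical data.yaml fragment (dataset key, model, and score are examples only),
# parsed with PyYAML to show the structure the loader expects.
import yaml

example_entry = """
datasets:
  my_new_dataset:
    name: "My New Dataset"
    description: "Short description of the task"
    tasks: ["Classification"]
    url: "https://huggingface.co/datasets/your-org/my-new-dataset"

benchmark_results:
  my_new_dataset:
    bertimbau-base:
      accuracy: 0.850
"""

parsed = yaml.safe_load(example_entry)
print(parsed["datasets"]["my_new_dataset"]["name"])  # My New Dataset
```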

### Data Extraction Scripts

The leaderboard includes scripts to automatically extract and update data from external sources:

#### `extract_portuguese_leaderboard.py`
This script extracts benchmark results from the Open Portuguese LLM Leaderboard (a sketch of the CSV columns the app reads follows this list):
- Fetches data from the Hugging Face Spaces leaderboard
- Updates the `portuguese_leaderboard.csv` file
- Includes both open-source and proprietary models
- Automatically handles data formatting and validation
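As a rough illustration (not part of the script itself), `app.py` expects `portuguese_leaderboard.csv` to expose at least the columns checked below; the sketch assumes the script has already been run so the file exists.

```python
# Minimal check of the columns app.py reads from portuguese_leaderboard.csv.
import pandas as pd

df = pd.read_csv("portuguese_leaderboard.csv")
expected = ["model_name", "assin2_rte", "assin2_sts", "faquad_nli", "hatebr_offensive"]
print([col for col in expected if col not in df.columns])  # [] when the file is up to date
```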

#### `download_external_models.py`
This script downloads additional model data (see the sketch after this list for how the `link` column is used):
- Fetches model metadata from various sources
- Updates the `external_models.csv` file
- Includes model links and performance metrics
- Ensures data consistency with the main leaderboard
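A minimal sketch of how a row of `external_models.csv` can be turned into the clickable model name shown in the leaderboard table, mirroring what `app.py` does with the `model` and `link` columns; it assumes the CSV has already been downloaded and is non-empty.

```python
# Render the first external model as a markdown link, as the benchmark table does.
import pandas as pd

df = pd.read_csv("external_models.csv")
row = df.iloc[0]
model_display = f"[{row['model']}]({row['link']})" if pd.notna(row["link"]) else row["model"]
print(model_display)
```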

**Note**: These scripts require an internet connection and may take a few minutes to complete. Run them periodically to keep the leaderboard data up to date.
app.py
ADDED
@@ -0,0 +1,1006 @@
import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
from typing import Dict, List, Optional, Tuple

# Import data loader
from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata

# Load data from YAML file
NAPOLAB_DATASETS = get_napolab_datasets()
SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
MODEL_METADATA = get_model_metadata()

def load_portuguese_leaderboard_data() -> pd.DataFrame:
    """Load data from the Portuguese leaderboard CSV file."""
    try:
        csv_path = "portuguese_leaderboard.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'portuguese_leaderboard'

            print(f"Loaded {len(df)} models from Portuguese leaderboard")
            return df
        else:
            print(f"Portuguese leaderboard CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading Portuguese leaderboard data: {e}")
        return pd.DataFrame()

def load_external_models_data() -> pd.DataFrame:
    """Load data from the external models CSV file."""
    try:
        csv_path = "external_models.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'model': 'model_name',
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'external_models'

            print(f"Loaded {len(df)} external models")
            return df
        else:
            print(f"External models CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading external models data: {e}")
        return pd.DataFrame()

# Load Portuguese leaderboard data
PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()

# Load external models data
EXTERNAL_MODELS_DATA = load_external_models_data()

def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
    """Create a simplified benchmark table with one column per dataset."""
    # Get all dataset names
    dataset_names = sorted(NAPOLAB_DATASETS.keys())
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = dataset_names

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        for model_name, metrics in models.items():
            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'existing'
                }

            # Calculate average performance for this dataset
            avg_performance = np.mean(list(metrics.values()))
            model_data[model_name]['dataset_scores'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': row.get('link', ''),
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Create table data
    table_data = []

    for model_name, data in model_data.items():
        # Apply source filtering
        source = data['source']

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        # Create clickable link for model name
        if data['url']:
            model_display = f"[{model_name}]({data['url']})"
        elif source == 'portuguese_leaderboard' and '/' in model_name:
            # Create Hugging Face link for Portuguese leaderboard models with slashes
            huggingface_url = f"https://huggingface.co/{model_name}"
            model_display = f"[{model_name}]({huggingface_url})"
        else:
            model_display = model_name

        # Create row with dataset scores
        row_data = {'Model': model_display}

        # Calculate average only over selected datasets
        selected_scores = []
        for dataset_name in selected_datasets:
            score = data['dataset_scores'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores in average
                selected_scores.append(score)

        overall_avg = np.mean(selected_scores) if selected_scores else 0
        row_data['Average'] = round(overall_avg, 4)

        # Add scores for each dataset (only selected ones)
        for dataset_name in dataset_names:
            score = data['dataset_scores'].get(dataset_name, 0)
            display_name = dataset_display_names[dataset_names.index(dataset_name)]
            # Only add columns for selected datasets
            if dataset_name in selected_datasets:
                row_data[display_name] = round(score, 4)

        table_data.append(row_data)

    df = pd.DataFrame(table_data)

    # Filter to show only models that have scores for at least one selected dataset
    if selected_datasets and not df.empty:
        # Get display names for selected datasets
        selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]

        # Filter models based on selection criteria
        models_to_keep = []
        for _, row in df.iterrows():
            has_score = False
            has_all_scores = True

            # Only check the datasets that are actually selected for display
            for dataset_name in selected_datasets:
                display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                if display_name in df.columns:
                    score = row[display_name]
                    if score > 0:
                        has_score = True
                    else:
                        has_all_scores = False

            # Keep model if it has at least one score
            if has_score:
                # If hide_incomplete_models is True, only keep models with all scores in selected datasets
                if not hide_incomplete_models or has_all_scores:
                    models_to_keep.append(row['Model'])

        # Filter dataframe to only include selected models
        if models_to_keep:
            df = df[df['Model'].isin(models_to_keep)]
        else:
            # If no models to keep, create empty DataFrame with proper structure
            # Create columns list first
            columns = ['Model']
            for dataset_name in dataset_names:
                display_name = dataset_display_names[dataset_names.index(dataset_name)]
                if dataset_name in selected_datasets:
                    columns.append(display_name)
            columns.append('Average')

            # Create empty DataFrame with correct columns
            df = pd.DataFrame(columns=columns)

    # Filter by minimum average performance
    if min_average_performance > 0 and not df.empty:
        df = df[df['Average'] >= min_average_performance]

    # Filter by search query
    if search_query and not df.empty:
        # Extract model names from markdown links for searching
        df_filtered = df.copy()
        df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
        df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
        df = df_filtered.drop('model_name_clean', axis=1)

    # Sort by Average (descending)
    if not df.empty:
        df = df.sort_values('Average', ascending=False)

    # Add rank column with medal emojis for top 3 and color-coded emojis for others
    if not df.empty:
        df = df.reset_index(drop=True)
        df.index = df.index + 1  # Start ranking from 1

        # Create rank column with medal emojis and color-coded emojis
        rank_column = []
        total_models = len(df)

        for rank in df.index:
            if rank == 1:
                rank_column.append("🥇 1")
            elif rank == 2:
                rank_column.append("🥈 2")
            elif rank == 3:
                rank_column.append("🥉 3")
            else:
                # Color-code based on position relative to total
                position_ratio = rank / total_models
                if position_ratio <= 0.33:  # Top third
                    rank_column.append("🟢 " + str(rank))
                elif position_ratio <= 0.67:  # Middle third
                    rank_column.append("🟡 " + str(rank))
                else:  # Bottom third
                    rank_column.append("🔴 " + str(rank))

        df.insert(0, 'Rank', rank_column)

    return df


# Global variable to track the current CSV file
current_csv_file = None

def export_csv(df: pd.DataFrame):
    """Export the benchmark table to CSV."""
    global current_csv_file

    print(f"Export function called with dataframe shape: {df.shape}")

    if df.empty:
        print("Dataframe is empty, returning None")
        return None

    # Clean up previous file if it exists
    if current_csv_file:
        try:
            import os
            if os.path.exists(current_csv_file):
                os.remove(current_csv_file)
                print(f"Deleted previous CSV file: {current_csv_file}")
        except Exception as e:
            print(f"Error deleting previous file {current_csv_file}: {e}")

    # Clean the dataframe for CSV export
    df_clean = df.copy()

    # Remove markdown formatting from model names for cleaner CSV
    df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)

    # Create filename with timestamp
    from datetime import datetime
    import tempfile
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"napolab_benchmark_results_{timestamp}.csv"

    # Create file in current directory (simpler approach)
    file_path = filename

    print(f"Creating CSV file at: {file_path}")

    # Save to CSV file
    df_clean.to_csv(file_path, index=False)

    print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")

    # Update current file tracking
    current_csv_file = file_path

    return file_path

def cleanup_current_csv():
    """Clean up the current CSV file after download."""
    global current_csv_file
    import os

    if current_csv_file and os.path.exists(current_csv_file):
        try:
            os.remove(current_csv_file)
            print(f"Deleted CSV file after download: {current_csv_file}")
            current_csv_file = None
        except Exception as e:
            print(f"Error deleting file {current_csv_file}: {e}")


def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
    """Create a radar chart showing model performance across all datasets."""
    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = list(NAPOLAB_DATASETS.keys())

    # Get dataset names for the radar axes (only selected ones)
    dataset_names = selected_datasets
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        if dataset_name in selected_datasets:
            for model_name, metrics in models.items():
                if model_name not in model_data:
                    model_data[model_name] = {
                        'performances': {},
                        'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
                        'source': 'existing'
                    }

                # Calculate average performance for this dataset
                avg_performance = np.mean(list(metrics.values()))
                model_data[model_name]['performances'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Apply source filtering
    filtered_model_data = {}
    for model_name, data in model_data.items():
        source = data.get('source', 'existing')

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        filtered_model_data[model_name] = data

    # Apply incomplete model filtering
    if hide_incomplete_models and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            has_all_scores = True
            for dataset_name in selected_datasets:
                if data['performances'].get(dataset_name, 0) == 0:
                    has_all_scores = False
                    break
            if has_all_scores:
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply minimum average performance filtering
    if min_average_performance > 0 and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            # Calculate average performance for selected datasets
            scores = []
            for dataset_name in selected_datasets:
                score = data['performances'].get(dataset_name, 0)
                if score > 0:  # Only include non-zero scores
                    scores.append(score)

            if scores:
                avg_performance = np.mean(scores)
                if avg_performance >= min_average_performance:
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply search query filtering
    if search_query:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            if search_query.lower() in model_name.lower():
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Sort models by average performance (descending)
    model_performances = []
    for model_name, data in filtered_model_data.items():
        # Calculate average performance for selected datasets
        scores = []
        for dataset_name in selected_datasets:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                scores.append(score)

        avg_performance = np.mean(scores) if scores else 0
        model_performances.append((model_name, data, avg_performance))

    # Sort by average performance (descending)
    model_performances.sort(key=lambda x: x[2], reverse=True)

    # Create radar chart
    fig = go.Figure()

    # Generate a dynamic color palette based on the number of models
    num_models = len(model_performances)
    if num_models <= 10:
        # Use a qualitative color palette for small numbers
        colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel1 + px.colors.qualitative.Dark2
    else:
        # Use a continuous color palette for larger numbers
        colors = px.colors.sequential.Viridis + px.colors.sequential.Plasma + px.colors.sequential.Inferno

    # Ensure we have enough colors
    while len(colors) < num_models:
        colors.extend(colors)

    for i, (model_name, data, avg_performance) in enumerate(model_performances):
        # Get performance values for all datasets (fill with 0 if missing)
        performance_values = []
        for dataset_name in dataset_names:
            performance_values.append(data['performances'].get(dataset_name, 0))

        # Assign color based on model index for better differentiation
        color = colors[i % len(colors)]

        # Show first two models by default, hide the rest
        visible = True if i < 2 else 'legendonly'

        fig.add_trace(go.Scatterpolar(
            r=performance_values,
            theta=dataset_display_names,
            fill='toself',
            name=model_name,
            line_color=color,
            opacity=0.6,
            visible=visible,
            hovertemplate=(
                "<b>%{fullData.name}</b><br>" +
                "Dataset: %{theta}<br>" +
                "Performance: %{r:.3f}<br>" +
                "Architecture: " + data['architecture'] + "<br>" +
                "<extra></extra>"
            )
        ))

    # Update layout
    fig.update_layout(
        title="Model Performance Radar Chart - All Datasets",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0.6, 1],
                ticktext=['0.6', '0.7', '0.8', '0.9', '1.0'],
                tickvals=[0.6, 0.7, 0.8, 0.9, 1.0]
            ),
            angularaxis=dict(
                tickmode='array',
                tickvals=list(range(len(dataset_display_names))),
                ticktext=dataset_display_names
            )
        ),
        height=700,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.2)',
            borderwidth=1,
            orientation="h"
        ),
        margin=dict(l=50, r=50, t=100, b=100)
    )

    return fig

# Gradio Interface
with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🏆 Napolab Leaderboard

    Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks.
    [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
    """)

    with gr.Tabs():

        # Benchmark Results Tab
        with gr.Tab("📊 Benchmark Results"):
            gr.Markdown("### Model Performance Benchmarks")

            with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True  # Default to selected
                        )
                        dataset_checkboxes.append((dataset_name, checkbox))

            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )
                    show_teenytinyllama = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )
                    show_portuguese_leaderboard = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models
            search_query = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            benchmark_table = gr.DataFrame(
                label="Model Performance Benchmarks",
                wrap=[True, False, False, False, False, False, False, False, False, False],
                interactive=False,
                datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
                column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
            )

            gr.Markdown("*🥇🥈🥉 = Top 3 | 🟢 = Top 33% | 🟡 = Middle 33% | 🔴 = Bottom 33%*")

            # Export to CSV button and file component
            export_button = gr.Button("📥 Export to CSV", variant="secondary")
            csv_file = gr.File(label="Download CSV", interactive=False, visible=True)

        # Model Analysis Tab
        with gr.Tab("📈 Model Analysis"):
            gr.Markdown("### Model Performance Radar Chart")

            # Dataset Selection Controls
            with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    analysis_dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True
                        )
                        analysis_dataset_checkboxes.append((dataset_name, checkbox))

            # Filter Controls
            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models_analysis = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance_analysis = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis_analysis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )

                    show_teenytinyllama_analysis = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )

                    show_portuguese_leaderboard_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models in radar chart
            search_query_analysis = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")

            gr.Markdown("""
            **How to interact with the chart:**
            - **Click on legend items** to show/hide specific models.
            - **Double-click on a legend item** to isolate that model (hide all others).
            - **Double-click again** to show all models.

            Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
            """)

        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About Napolab

            **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.

            For more information, please visit the [GitHub repository](https://github.com/ruanchaves/napolab) and the [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab).

            ### Data Sources:
            The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:

            **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)

            **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).

            **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by Corrêa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).

            ### Thesis Citation:
            ```bibtex
            @mastersthesis{chaves2023lessons,
              title={Lessons learned from the evaluation of Portuguese language models},
              author={Chaves Rodrigues, Ruan},
              year={2023},
              school={University of Malta},
              url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
            }
            ```

            ### Napolab Citation:
            ```bibtex
            @software{Chaves_Rodrigues_napolab_2023,
              author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
              doi = {10.5281/zenodo.7781848},
              month = {3},
              title = {{Natural Portuguese Language Benchmark (Napolab)}},
              url = {https://github.com/ruanchaves/napolab},
              version = {1.0.0},
              year = {2023}
            }
            ```
            """)

    # Event handlers
    def update_radar_chart(*args):
        # Extract arguments for radar chart
        dataset_values = args[:len(analysis_dataset_checkboxes)]
        hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
        min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
        show_external_models = args[len(analysis_dataset_checkboxes) + 5]
        search_query = args[len(analysis_dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

    def update_benchmark_table(*args):
        # Extract arguments
        dataset_values = args[:len(dataset_checkboxes)]
        hide_incomplete_models = args[len(dataset_checkboxes)]
        min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
        show_external_models = args[len(dataset_checkboxes) + 5]
        search_query = args[len(dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

        return df

    # Connect events
    # Load model analysis chart on app start
    app.load(lambda: update_radar_chart(*([True] * len(analysis_dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=model_analysis_chart)

    # Load benchmark table on app start
    app.load(lambda: update_benchmark_table(*([True] * len(dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=benchmark_table)

    # Connect dataset checkboxes to update table
    for dataset_name, checkbox in dataset_checkboxes:
        checkbox.change(
            update_benchmark_table,
            inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
            outputs=benchmark_table
        )

    hide_incomplete_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    min_average_performance.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_napolab_thesis.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_teenytinyllama.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_portuguese_leaderboard.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_external_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect search query to update table
    search_query.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect export button
    export_button.click(
        export_csv,
        inputs=benchmark_table,
        outputs=csv_file
    )

    # Connect file download to cleanup
    csv_file.change(
        cleanup_current_csv,
        inputs=None,
        outputs=None
    )

    # Connect analysis chart events
    # Connect dataset checkboxes to update radar chart
    for dataset_name, checkbox in analysis_dataset_checkboxes:
        checkbox.change(
            update_radar_chart,
            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
            outputs=model_analysis_chart
        )

    hide_incomplete_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    min_average_performance_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_napolab_thesis_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_teenytinyllama_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_portuguese_leaderboard_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_external_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    # Connect search query to update radar chart
    search_query_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

if __name__ == "__main__":
    app.launch(share=True, server_name="0.0.0.0", server_port=7860)
config.py
ADDED
@@ -0,0 +1,48 @@
"""
Configuration file for the Napolab Leaderboard Gradio App
"""

# App Configuration
APP_TITLE = "Napolab Leaderboard"
APP_DESCRIPTION = "Natural Portuguese Language Benchmark Leaderboard"
APP_THEME = "soft"
APP_PORT = 7860
APP_HOST = "0.0.0.0"
APP_SHARE = True

# Dataset Configuration
DEFAULT_DATASET = "assin"
DEFAULT_SPLIT = "test"
DEFAULT_SAMPLES = 5
MAX_SAMPLES = 20

# Chart Configuration
CHART_HEIGHT = 400
OVERVIEW_CHART_HEIGHT = 600
CHART_COLORS = {
    "primary": "#1f77b4",
    "secondary": "#ff7f0e",
    "success": "#2ca02c",
    "warning": "#d62728"
}

# Model Configuration
DEFAULT_MODELS_TO_COMPARE = 2

# Cache Configuration
CACHE_DURATION = 3600  # 1 hour in seconds

# Error Messages
ERROR_MESSAGES = {
    "dataset_load": "Error loading dataset. Please check your internet connection.",
    "no_benchmark": "No benchmark data available for this dataset.",
    "no_models": "No models found for comparison.",
    "invalid_selection": "Invalid selection. Please try again."
}

# Links
LINKS = {
    "github": "https://github.com/ruanchaves/napolab",
    "huggingface_dataset": "https://huggingface.co/datasets/ruanchaves/napolab",
    "open_pt_llm_leaderboard": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
}
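The constants in `config.py` are not imported by `app.py` in this upload (the launch call at the bottom of `app.py` hard-codes the same host, port, and share settings), so the snippet below is only a sketch of one way they could be wired in, assuming the names above are kept.

```python
# Hypothetical use of config.py; app.py currently passes these values directly.
from config import APP_HOST, APP_PORT, APP_SHARE

def launch(app):
    # Equivalent to the hard-coded call at the bottom of app.py.
    app.launch(share=APP_SHARE, server_name=APP_HOST, server_port=APP_PORT)
```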
data.yaml
ADDED
@@ -0,0 +1,422 @@
# Napolab Leaderboard Data Configuration
# This file contains all datasets and benchmark results for the Gradio app
#
# Data Source: "Lessons learned from the evaluation of Portuguese language models"
# by Ruan Chaves Rodrigues (2023) - Master's dissertation, University of Malta
# Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557

# Data Sources
sources:
  napolab_thesis:
    name: "Napolab Thesis"
    description: "Lessons learned from the evaluation of Portuguese language models"
    author: "Ruan Chaves Rodrigues"
    year: 2023
    url: "https://www.um.edu.mt/library/oar/handle/123456789/120557"
    institution: "University of Malta"

  open_pt_llm_leaderboard:
    name: "Open PT LLM Leaderboard"
    description: "Large Language Models on Portuguese Benchmarks"
    author: "Eduardo Garcia"
    year: 2025
    url: "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
    platform: "Hugging Face Spaces"

  teenytinyllama_paper:
    name: "TeenyTinyLlama Paper"
    description: "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"
    authors: ["Corrêa, Nicholas Kluge", "Falk, Sophia", "Fatimah, Shiza", "Sen, Aniket", "De Oliveira, Nythamar"]
    year: 2024
    journal: "Machine Learning with Applications"
    doi: "10.1016/j.mlwa.2024.100558"

# Dataset Information
datasets:
  assin_rte:
    name: "ASSIN RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin"

  assin_sts:
    name: "ASSIN STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin"

  assin2_rte:
    name: "ASSIN 2 RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin2"

  assin2_sts:
    name: "ASSIN 2 STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin2"

  faquad-nli:
    name: "FaQUaD-NLI"
    description: "Factual Question Answering and Natural Language Inference"
    tasks: ["NLI"]
    url: "https://huggingface.co/datasets/ruanchaves/faquad-nli"

  hatebr:
    name: "HateBR"
    description: "Hate Speech Detection in Brazilian Portuguese"
    tasks: ["Classification"]
    url: "https://huggingface.co/datasets/ruanchaves/hatebr"

  porsimplessent:
    name: "PorSimplesSent"
    description: "Portuguese Simple Sentences Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/porsimplessent"

  reli-sa:
    name: "Reli-SA"
    description: "Religious Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/reli-sa"

# Benchmark Results
benchmark_results:
  assin_rte:
    albertina-pt-pt:
      accuracy: 0.887
    albertina-pt-br:
      accuracy: 0.844
    deberta-v2-large:
      accuracy: 0.864
    xlm-roberta-large:
      accuracy: 0.874
    mdeberta-v3-base:
      accuracy: 0.863
    bertimbau-large:
      accuracy: 0.838
    bert-large:
      accuracy: 0.802
    bertimbau-base:
      accuracy: 0.828
    bert-multilingual-base:
      accuracy: 0.815
    xlm-roberta-base:
      accuracy: 0.822
    bertinho:
      accuracy: 0.786
    ixaes:
      accuracy: 0.782

  assin_sts:
    albertina-pt-pt:
      accuracy: 0.874
    albertina-pt-br:
      accuracy: 0.883
    deberta-v2-large:
      accuracy: 0.861
    xlm-roberta-large:
      accuracy: 0.863
    mdeberta-v3-base:
      accuracy: 0.855
    bertimbau-large:
      accuracy: 0.826
    bert-large:
      accuracy: 0.822
    bertimbau-base:
      accuracy: 0.844
    bert-multilingual-base:
      accuracy: 0.820
    xlm-roberta-base:
      accuracy: 0.812
    bertinho:
      accuracy: 0.791
    ixaes:
      accuracy: 0.817

  assin2_rte:
    albertina-pt-pt:
      accuracy: 0.910
    albertina-pt-br:
      accuracy: 0.916
    deberta-v2-large:
      accuracy: 0.911
    xlm-roberta-large:
      accuracy: 0.910
    mdeberta-v3-base:
      accuracy: 0.904
    bertimbau-large:
      accuracy: 0.897
    bert-large:
      accuracy: 0.892
    bertimbau-base:
      accuracy: 0.884
    bert-multilingual-base:
      accuracy: 0.877
    xlm-roberta-base:
      accuracy: 0.875
    bertinho:
      accuracy: 0.855
    ixaes:
      accuracy: 0.879
    ttl-460m:
      accuracy: 0.8643
    ttl-160m:
      accuracy: 0.8578

  assin2_sts:
    deberta-v2-large:
      accuracy: 0.724
    mdeberta-v3-base:
      accuracy: 0.847
    bertimbau-large:
      accuracy: 0.855
    bert-large:
      accuracy: 0.792
    bertimbau-base:
      accuracy: 0.840
    bert-multilingual-base:
      accuracy: 0.827
    xlm-roberta-base:
      accuracy: 0.847
    bertinho:
      accuracy: 0.802
    ixaes:
      accuracy: 0.822

  faquad-nli:
    mdeberta-v3-base:
      accuracy: 0.889
    bertimbau-large:
      accuracy: 0.900
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.897
    bert-multilingual-base:
      accuracy: 0.865
    xlm-roberta-base:
      accuracy: 0.898
    bertinho:
      accuracy: 0.866
    ixaes:
      accuracy: 0.860
    ttl-460m:
      accuracy: 0.9118
    ttl-160m:
      accuracy: 0.9000

  hatebr:
    mdeberta-v3-base:
      accuracy: 0.911
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.871
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.879
    ixaes:
      accuracy: 0.872
    ttl-460m:
      accuracy: 0.9228
    ttl-160m:
      accuracy: 0.9071

  porsimplessent:
    mdeberta-v3-base:
      accuracy: 0.953
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.907
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.933
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.900
    ixaes:
      accuracy: 0.899

  reli-sa:
    mdeberta-v3-base:
      accuracy: 0.719
    bertimbau-large:
      accuracy: 0.745
    bert-large:
      accuracy: 0.629
    bertimbau-base:
      accuracy: 0.713
    bert-multilingual-base:
      accuracy: 0.642
    xlm-roberta-base:
      accuracy: 0.680
    bertinho:
      accuracy: 0.681
    ixaes:
      accuracy: 0.637

# Model Metadata
model_metadata:
  albertina-pt-pt:
    parameters: 125000000
    architecture: "Albertina PT:PT"
    base_model: "PORTULAN/albertina-ptpt"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    source: "napolab_thesis"

  albertina-pt-br:
    parameters: 125000000
    architecture: "Albertina PT:BR"
    base_model: "PORTULAN/albertina-ptbr"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"
    source: "napolab_thesis"

  deberta-v2-large:
    parameters: 900000000
    architecture: "DeBERTa v2 (large)"
    base_model: "microsoft/deberta-v2-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    source: "napolab_thesis"

  xlm-roberta-large:
    parameters: 550000000
    architecture: "XLM-RoBERTa (large)"
    base_model: "xlm-roberta-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-large"
    source: "napolab_thesis"

  mdeberta-v3-base:
    parameters: 86000000
    architecture: "mDeBERTa v3 (base)"
    base_model: "microsoft/mdeberta-v3-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"
    source: "napolab_thesis"

  bertimbau-large:
    parameters: 355000000
    architecture: "BERTimbau (large)"
    base_model: "neuralmind/bert-large-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    source: "napolab_thesis"

  bert-large:
    parameters: 355000000
    architecture: "BERT (large)"
    base_model: "bert-large-uncased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-large-uncased"
    source: "napolab_thesis"

  bertimbau-base:
    parameters: 110000000
    architecture: "BERTimbau (base)"
    base_model: "neuralmind/bert-base-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    source: "napolab_thesis"

  bert-multilingual-base:
    parameters: 110000000
    architecture: "BERT multilingual (base)"
    base_model: "bert-base-multilingual-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"
    source: "napolab_thesis"

  xlm-roberta-base:
    parameters: 270000000
    architecture: "XLM-RoBERTa (base)"
    base_model: "xlm-roberta-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-base"
    source: "napolab_thesis"

  bertinho:
    parameters: 110000000
    architecture: "Bertinho"
    base_model: "ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    source: "napolab_thesis"

  ixaes:
    parameters: 110000000
    architecture: "IXAes"
    base_model: "ixa-ehu/ixambert-base-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"
    source: "napolab_thesis"

  ttl-460m:
    parameters: 460000000
    architecture: "TeenyTinyLlama (460M)"
    base_model: "nicholasKluge/TeenyTinyLlama-460m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    source: "teenytinyllama_paper"

  ttl-160m:
    parameters: 160000000
    architecture: "TeenyTinyLlama (160M)"
    base_model: "nicholasKluge/TeenyTinyLlama-160m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
    source: "teenytinyllama_paper"

# Additional Models (for Model Hub tab)
additional_models:
  albertina_models:
    albertina-pt-pt:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    albertina-pt-br:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"

  deberta_models:
    deberta-v2-large:
      huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    mdeberta-v3-base:
      huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"

  roberta_models:
    xlm-roberta-large:
      huggingface_url: "https://huggingface.co/xlm-roberta-large"
    xlm-roberta-base:
      huggingface_url: "https://huggingface.co/xlm-roberta-base"

  bert_models:
    bertimbau-large:
      huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    bertimbau-base:
      huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    bert-large:
      huggingface_url: "https://huggingface.co/bert-large-uncased"
    bert-multilingual-base:
      huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"

  specialized_models:
    bertinho:
      huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    ixaes:
      huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"

  teenytinyllama_models:
    ttl-460m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    ttl-160m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
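For quick inspection outside the app, the file can also be read directly with PyYAML. A minimal sketch, assuming it is run from the leaderboard directory next to `data.yaml`:

```python
import yaml

with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Rank models on ASSIN 2 RTE by the accuracy values recorded above.
results = config["benchmark_results"]["assin2_rte"]
top = sorted(results.items(), key=lambda kv: kv[1]["accuracy"], reverse=True)[:3]
for model, metrics in top:
    print(f"{model}: {metrics['accuracy']:.3f}")
```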
data_loader.py
ADDED
@@ -0,0 +1,133 @@
"""
Data loader for Napolab Leaderboard
Loads datasets, benchmark results, and model metadata from YAML configuration files.
"""

import yaml
import os
from pathlib import Path
from typing import Dict, Any, Optional

class NapolabDataLoader:
    """Loads and manages Napolab data from YAML configuration files."""

    def __init__(self, data_file: str = "data.yaml"):
        """
        Initialize the data loader.

        Args:
            data_file: Path to the YAML data file
        """
        self.data_file = data_file
        self.data = None
        self.load_data()

    def load_data(self) -> None:
        """Load data from the YAML file."""
        try:
            # Get the directory where this script is located
            script_dir = Path(__file__).parent
            data_path = script_dir / self.data_file

            if not data_path.exists():
                raise FileNotFoundError(f"Data file not found: {data_path}")

            with open(data_path, 'r', encoding='utf-8') as file:
                self.data = yaml.safe_load(file)

        except Exception as e:
            print(f"Error loading data from {self.data_file}: {e}")
            # Fallback to empty data structure
            self.data = {
                'datasets': {},
                'benchmark_results': {},
                'model_metadata': {},
                'additional_models': {}
            }

    def get_datasets(self) -> Dict[str, Any]:
        """Get all datasets information."""
        return self.data.get('datasets', {})

    def get_benchmark_results(self) -> Dict[str, Any]:
        """Get all benchmark results."""
        return self.data.get('benchmark_results', {})

    def get_model_metadata(self) -> Dict[str, Any]:
        """Get all model metadata."""
        return self.data.get('model_metadata', {})

    def get_additional_models(self) -> Dict[str, Any]:
        """Get additional models for the Model Hub."""
        return self.data.get('additional_models', {})

    def get_dataset_info(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get information for a specific dataset."""
        return self.data.get('datasets', {}).get(dataset_name)

    def get_benchmark_for_dataset(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get benchmark results for a specific dataset."""
        return self.data.get('benchmark_results', {}).get(dataset_name)

    def get_model_info(self, model_name: str) -> Optional[Dict[str, Any]]:
        """Get metadata for a specific model."""
        return self.data.get('model_metadata', {}).get(model_name)

    def get_available_datasets(self) -> list:
        """Get list of available dataset names."""
        return list(self.data.get('datasets', {}).keys())

    def get_available_models_for_dataset(self, dataset_name: str) -> list:
        """Get list of available models for a specific dataset."""
        benchmark = self.get_benchmark_for_dataset(dataset_name)
        if benchmark:
            return list(benchmark.keys())
        return []

    def get_all_models(self) -> list:
        """Get list of all available models."""
        return list(self.data.get('model_metadata', {}).keys())

    def validate_data(self) -> bool:
        """Validate the loaded data structure."""
        required_keys = ['datasets', 'benchmark_results', 'model_metadata']

        for key in required_keys:
            if key not in self.data:
                print(f"Missing required key: {key}")
                return False

        return True

    def reload_data(self) -> None:
        """Reload data from the YAML file."""
        self.load_data()

    def export_data(self, output_file: str = "exported_data.yaml") -> None:
        """Export the current data to a YAML file."""
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                yaml.dump(self.data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"Data exported to {output_file}")
        except Exception as e:
            print(f"Error exporting data: {e}")

# Global data loader instance
data_loader = NapolabDataLoader()

# Convenience functions for backward compatibility
def get_napolab_datasets() -> Dict[str, Any]:
    """Get Napolab datasets (for backward compatibility)."""
    return data_loader.get_datasets()

def get_sample_benchmark_results() -> Dict[str, Any]:
    """Get benchmark results (for backward compatibility)."""
    return data_loader.get_benchmark_results()

def get_model_metadata() -> Dict[str, Any]:
    """Get model metadata (for backward compatibility)."""
    return data_loader.get_model_metadata()

def get_additional_models() -> Dict[str, Any]:
    """Get additional models (for backward compatibility)."""
    return data_loader.get_additional_models()
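A short sketch of how this loader might be used from another script; the method names are the ones defined above, and the file layout is assumed to match this repository:

```python
from data_loader import NapolabDataLoader

loader = NapolabDataLoader()  # reads data.yaml from the same directory
if loader.validate_data():
    for dataset in loader.get_available_datasets():
        models = loader.get_available_models_for_dataset(dataset)
        print(f"{dataset}: {len(models)} models with results")
```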
download_external_models.py
ADDED
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Script to download external models data from the Open Portuguese LLM Leaderboard
and convert it to CSV format for import into the benchmark.
"""

import requests
import pandas as pd
import json
import sys

def download_external_models():
    """Download external models data and convert to CSV."""

    url = "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard/raw/main/external_models_results.json"

    print("Downloading external models data...")

    try:
        # Download the JSON file
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse JSON
        data = response.json()

        if not isinstance(data, list):
            print("Error: Expected JSON array, got:", type(data))
            return

        print(f"Downloaded {len(data)} external models")

        # Extract data for each model
        extracted_data = []

        for item in data:
            if not isinstance(item, dict):
                print(f"Warning: Skipping non-dict item: {type(item)}")
                continue

            # Extract required fields
            model = item.get('model', '')
            link = item.get('link', '')
            result_metrics = item.get('result_metrics', {})

            if not isinstance(result_metrics, dict):
                print(f"Warning: Skipping model '{model}' - result_metrics is not a dict")
                continue

            # Extract metrics
            assin2_sts = result_metrics.get('assin2_sts', 0.0)
            assin2_rte = result_metrics.get('assin2_rte', 0.0)
            faquad_nli = result_metrics.get('faquad_nli', 0.0)
            hatebr_offensive = result_metrics.get('hatebr_offensive', 0.0)

            # Create row data
            row_data = {
                'model': model,
                'link': link,
                'assin2_sts': assin2_sts,
                'assin2_rte': assin2_rte,
                'faquad_nli': faquad_nli,
                'hatebr_offensive': hatebr_offensive
            }

            extracted_data.append(row_data)

        # Create DataFrame
        df = pd.DataFrame(extracted_data)

        # Save to CSV
        output_file = 'external_models.csv'
        df.to_csv(output_file, index=False)

        print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

        # Show first few entries as preview
        print("\nFirst 5 entries:")
        print(df.head().to_string(index=False))

        # Show some statistics
        if not df.empty:
            print(f"\nStatistics:")
            print(f"Total models: {len(df)}")

            # Count models with non-zero scores for each metric
            print(f"\nModels with scores:")
            print(f"ASSIN2 STS: {(df['assin2_sts'] > 0).sum()}")
            print(f"ASSIN2 RTE: {(df['assin2_rte'] > 0).sum()}")
            print(f"FaQuAD-NLI: {(df['faquad_nli'] > 0).sum()}")
            print(f"HateBR: {(df['hatebr_offensive'] > 0).sum()}")

            # Average scores
            print(f"\nAverage scores:")
            print(df[['assin2_sts', 'assin2_rte', 'faquad_nli', 'hatebr_offensive']].mean().round(3))

            # Show data types and info
            print(f"\nDataFrame info:")
            print(df.info())

    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)

def main():
    """Main function to run the download."""
    print("External Models Data Downloader")
    print("=" * 40)

    try:
        download_external_models()
        print("\nDownload completed successfully!")
    except Exception as e:
        print(f"Error during download: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
example_usage.py
ADDED
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Example Usage of Napolab Leaderboard Data Management

This script demonstrates how to use the YAML-based data management system.
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure, add_dataset, add_benchmark_result, add_model_metadata, export_data
import yaml

def example_usage():
    """Demonstrate the data management functionality."""

    print("Napolab Leaderboard Data Management Example")
    print("=" * 50)

    # 1. Load existing data
    print("\n1. Loading existing data...")
    data_loader = NapolabDataLoader()
    data = data_loader.data

    print(f"Loaded {len(data['datasets'])} datasets")
    print(f"Loaded {len(data['model_metadata'])} models")

    # 2. Validate the data structure
    print("\n2. Validating data structure...")
    if validate_yaml_structure(data):
        print("Data structure is valid!")
    else:
        print("Data structure has issues!")
        return

    # 3. Add a new dataset
    print("\n3. Adding a new dataset...")
    data = add_dataset(
        data=data,
        dataset_name="example_dataset",
        name="Example Dataset",
        description="An example dataset for demonstration",
        tasks=["Classification", "Sentiment Analysis"],
        url="https://huggingface.co/datasets/example"
    )

    # 4. Add a new model
    print("\n4. Adding a new model...")
    data = add_model_metadata(
        data=data,
        model_name="example-model",
        parameters=125000000,
        architecture="BERT Large",
        base_model="bert-large-uncased",
        task="Classification",
        huggingface_url="https://huggingface.co/example/model"
    )

    # 5. Add benchmark results
    print("\n5. Adding benchmark results...")
    data = add_benchmark_result(
        data=data,
        dataset_name="example_dataset",
        model_name="example-model",
        metrics={
            "accuracy": 0.89,
            "f1": 0.88,
            "precision": 0.90,
            "recall": 0.87
        }
    )

    # 6. Export the updated data
    print("\n6. Exporting updated data...")
    export_data(data, "example_updated_data.yaml")

    # 7. Demonstrate data access
    print("\n7. Demonstrating data access...")

    # Get dataset info
    dataset_info = data_loader.get_dataset_info("assin")
    if dataset_info:
        print(f"ASSIN dataset: {dataset_info['name']}")
        print(f"  Tasks: {', '.join(dataset_info['tasks'])}")

    # Get available models for a dataset
    models = data_loader.get_available_models_for_dataset("assin")
    print(f"Available models for ASSIN: {len(models)} models")

    # Get model info
    model_info = data_loader.get_model_info("mdeberta-v3-base-assin-similarity")
    if model_info:
        print(f"Model parameters: {model_info['parameters']:,}")
        print(f"  Architecture: {model_info['architecture']}")

    print("\nExample completed successfully!")
    print("Check 'example_updated_data.yaml' for the updated data")

def demonstrate_yaml_structure():
    """Show the YAML structure."""
    print("\nYAML Data Structure Example:")
    print("-" * 30)

    example_data = {
        'datasets': {
            'my_dataset': {
                'name': 'My Dataset',
                'description': 'A custom dataset',
                'tasks': ['Classification'],
                'url': 'https://huggingface.co/datasets/my_dataset'
            }
        },
        'benchmark_results': {
            'my_dataset': {
                'my-model': {
                    'accuracy': 0.92,
                    'f1': 0.91
                }
            }
        },
        'model_metadata': {
            'my-model': {
                'parameters': 110000000,
                'architecture': 'BERT Base',
                'base_model': 'bert-base-uncased',
                'task': 'Classification',
                'huggingface_url': 'https://huggingface.co/my-model'
            }
        }
    }

    print(yaml.dump(example_data, default_flow_style=False, allow_unicode=True))

if __name__ == "__main__":
    example_usage()
    demonstrate_yaml_structure()
external_models.csv
ADDED
@@ -0,0 +1,31 @@
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
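A minimal sketch of ranking these external models by an unweighted average score with pandas; the column names are taken from the CSV header above, and the averaging scheme is illustrative only:

```python
import pandas as pd

df = pd.read_csv("external_models.csv")
score_cols = ["assin2_sts", "assin2_rte", "faquad_nli", "hatebr_offensive"]
df["average"] = df[score_cols].mean(axis=1)  # simple unweighted mean
print(df.sort_values("average", ascending=False)[["model", "average"]].head())
```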
extract_portuguese_leaderboard.py
ADDED
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Script to extract data from JSON files in a repository folder
and save it as a CSV file for import into the benchmark.
"""

import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path

def is_valid_json_file(file_path):
    """
    Check if a file is a valid JSON file containing a dict.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        bool: True if valid JSON dict, False otherwise
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return isinstance(data, dict)
    except (json.JSONDecodeError, FileNotFoundError, UnicodeDecodeError):
        return False

def find_json_files(repo_path):
    """
    Recursively find all JSON files in the repository folder.

    Args:
        repo_path (str): Path to the repository folder

    Returns:
        list: List of paths to valid JSON files
    """
    json_files = []
    repo_path = Path(repo_path)

    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []

    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")

    for file_path in repo_path.rglob("*.json"):
        if is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")

    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files

def extract_data_from_json(json_file_path):
    """
    Extract data from a single JSON file.

    Args:
        json_file_path (Path): Path to the JSON file

    Returns:
        dict or None: Extracted data or None if extraction failed
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if required fields exist
        if 'config_general' not in data or 'results' not in data:
            return None

        config_general = data['config_general']
        results = data['results']

        # Extract model information
        model_name = config_general.get('model_name', '')
        model_private = config_general.get('model_private', False)

        # Extract results
        all_grouped = results.get('all_grouped', {})

        # Extract metrics
        assin2_rte = all_grouped.get('assin2_rte', 0.0)
        assin2_sts = all_grouped.get('assin2_sts', 0.0)
        faquad_nli = all_grouped.get('faquad_nli', 0.0)
        hatebr_offensive = all_grouped.get('hatebr_offensive', 0.0)

        # Create row data
        row_data = {
            'json_file': str(json_file_path),
            'model_name': model_name,
            'model_private': model_private,
            'assin2_rte': assin2_rte,
            'assin2_sts': assin2_sts,
            'faquad_nli': faquad_nli,
            'hatebr_offensive': hatebr_offensive
        }

        return row_data

    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None

def extract_portuguese_leaderboard(repo_path):
    """
    Extract data from JSON files in the repository folder and save as CSV.

    Args:
        repo_path (str): Path to the repository folder
    """

    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)

    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Prepare data for DataFrame
    data = []

    # Process each JSON file
    for i, json_file in enumerate(json_files):
        print(f"Processing file {i+1}/{len(json_files)}: {json_file.name}")

        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)

        # Print progress every 10 files
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    # Create DataFrame
    df = pd.DataFrame(data)

    # Write to CSV
    output_file = 'portuguese_leaderboard.csv'
    df.to_csv(output_file, index=False)

    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics
    if not df.empty:
        print(f"\nStatistics:")
        print(f"Total models: {len(df)}")
        print(f"Private models: {df['model_private'].sum()}")
        print(f"Public models: {(~df['model_private']).sum()}")

        # Average scores
        print(f"\nAverage scores:")
        print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

        # Show data types and info
        print(f"\nDataFrame info:")
        print(df.info())

def main():
    """Main function to run the extraction."""
    parser = argparse.ArgumentParser(description='Extract Portuguese LLM Leaderboard data from JSON files')
    parser.add_argument('repo_path', help='Path to the repository folder containing JSON files')

    args = parser.parse_args()

    print("Portuguese LLM Leaderboard Data Extractor")
    print("=" * 50)

    try:
        extract_portuguese_leaderboard(args.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as e:
        print(f"Error during extraction: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
manage_data.py
ADDED
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""
Data Management Utility for Napolab Leaderboard

This script provides utilities to manage, validate, and update the YAML data file.
"""

import yaml
import argparse
from pathlib import Path
from data_loader import NapolabDataLoader
from typing import Dict, Any

def validate_yaml_structure(data: Dict[str, Any]) -> bool:
    """Validate the YAML data structure."""
    print("Validating YAML structure...")

    required_sections = ['datasets', 'benchmark_results', 'model_metadata']

    for section in required_sections:
        if section not in data:
            print(f"Missing required section: {section}")
            return False
        print(f"Found section: {section}")

    # Validate datasets
    print("\nValidating datasets...")
    for dataset_name, dataset_info in data['datasets'].items():
        required_fields = ['name', 'description', 'tasks', 'url']
        for field in required_fields:
            if field not in dataset_info:
                print(f"Dataset '{dataset_name}' missing field: {field}")
                return False
        print(f"Dataset '{dataset_name}' is valid")

    # Validate benchmark results
    print("\nValidating benchmark results...")
    for dataset_name, models in data['benchmark_results'].items():
        if dataset_name not in data['datasets']:
            print(f"Warning: Benchmark for '{dataset_name}' but no dataset info found")

        for model_name, metrics in models.items():
            if not isinstance(metrics, dict):
                print(f"Invalid metrics format for model '{model_name}'")
                return False
            print(f"Model '{model_name}' has {len(metrics)} metrics")

    # Validate model metadata
    print("\nValidating model metadata...")
    for model_name, metadata in data['model_metadata'].items():
        required_fields = ['parameters', 'architecture', 'base_model', 'task']
        for field in required_fields:
            if field not in metadata:
                print(f"Model '{model_name}' missing field: {field}")
                return False
        print(f"Model '{model_name}' is valid")

    print("\nAll validations passed!")
    return True

def create_sample_data() -> Dict[str, Any]:
    """Create a sample data structure."""
    return {
        'datasets': {
            'sample_dataset': {
                'name': 'Sample Dataset',
                'description': 'A sample dataset for testing',
                'tasks': ['Classification'],
                'url': 'https://huggingface.co/datasets/sample'
            }
        },
        'benchmark_results': {
            'sample_dataset': {
                'sample-model': {
                    'accuracy': 0.85,
                    'f1': 0.84
                }
            }
        },
        'model_metadata': {
            'sample-model': {
                'parameters': 100000000,
                'architecture': 'BERT Base',
                'base_model': 'bert-base-uncased',
                'task': 'Classification',
                'huggingface_url': 'https://huggingface.co/sample/model'
            }
        },
        'additional_models': {}
    }

def add_dataset(data: Dict[str, Any], dataset_name: str, name: str, description: str,
                tasks: list, url: str) -> Dict[str, Any]:
    """Add a new dataset to the data structure."""
    data['datasets'][dataset_name] = {
        'name': name,
        'description': description,
        'tasks': tasks,
        'url': url
    }
    print(f"Added dataset: {dataset_name}")
    return data

def add_benchmark_result(data: Dict[str, Any], dataset_name: str, model_name: str,
                         metrics: Dict[str, float]) -> Dict[str, Any]:
    """Add benchmark results for a model on a dataset."""
    if dataset_name not in data['benchmark_results']:
        data['benchmark_results'][dataset_name] = {}

    data['benchmark_results'][dataset_name][model_name] = metrics
    print(f"Added benchmark result for {model_name} on {dataset_name}")
    return data

def add_model_metadata(data: Dict[str, Any], model_name: str, parameters: int,
                       architecture: str, base_model: str, task: str,
                       huggingface_url: str = None) -> Dict[str, Any]:
    """Add model metadata."""
    data['model_metadata'][model_name] = {
        'parameters': parameters,
        'architecture': architecture,
        'base_model': base_model,
        'task': task
    }

    if huggingface_url:
        data['model_metadata'][model_name]['huggingface_url'] = huggingface_url

    print(f"Added model metadata: {model_name}")
    return data

def export_data(data: Dict[str, Any], output_file: str) -> None:
    """Export data to a YAML file."""
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            yaml.dump(data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"Data exported to {output_file}")
    except Exception as e:
        print(f"Error exporting data: {e}")

def main():
    """Main function for command-line interface."""
    parser = argparse.ArgumentParser(description='Manage Napolab Leaderboard Data')
    parser.add_argument('action', choices=['validate', 'create-sample', 'add-dataset', 'add-benchmark', 'add-model'],
                        help='Action to perform')
    parser.add_argument('--data-file', default='data.yaml', help='Path to data file')
    parser.add_argument('--output', help='Output file for export')

    # Dataset arguments
    parser.add_argument('--dataset-name', help='Dataset name')
    parser.add_argument('--dataset-display-name', help='Dataset display name')
    parser.add_argument('--dataset-description', help='Dataset description')
    parser.add_argument('--dataset-tasks', nargs='+', help='Dataset tasks')
    parser.add_argument('--dataset-url', help='Dataset URL')

    # Benchmark arguments
    parser.add_argument('--model-name', help='Model name')
    parser.add_argument('--metrics', nargs='+', help='Metrics as key=value pairs')

    # Model metadata arguments
    parser.add_argument('--parameters', type=int, help='Number of parameters')
    parser.add_argument('--architecture', help='Model architecture')
    parser.add_argument('--base-model', help='Base model name')
    parser.add_argument('--task', help='Task type')
    parser.add_argument('--huggingface-url', help='Hugging Face URL')

    args = parser.parse_args()

    # Load existing data or create new
    data_loader = NapolabDataLoader(args.data_file)
    data = data_loader.data

    if args.action == 'validate':
        if validate_yaml_structure(data):
            print("Data validation successful!")
        else:
            print("Data validation failed!")
            return 1

    elif args.action == 'create-sample':
        data = create_sample_data()
        export_data(data, args.output or 'sample_data.yaml')

    elif args.action == 'add-dataset':
        if not all([args.dataset_name, args.dataset_display_name, args.dataset_description,
                    args.dataset_tasks, args.dataset_url]):
            print("All dataset arguments are required")
            return 1

        data = add_dataset(data, args.dataset_name, args.dataset_display_name,
                           args.dataset_description, args.dataset_tasks, args.dataset_url)
        export_data(data, args.data_file)

    elif args.action == 'add-benchmark':
        if not all([args.dataset_name, args.model_name, args.metrics]):
            print("All benchmark arguments are required")
            return 1

        # Parse metrics
        metrics = {}
        for metric in args.metrics:
            if '=' in metric:
                key, value = metric.split('=', 1)
                try:
                    metrics[key] = float(value)
                except ValueError:
                    print(f"Invalid metric value: {metric}")
                    return 1

        data = add_benchmark_result(data, args.dataset_name, args.model_name, metrics)
        export_data(data, args.data_file)

    elif args.action == 'add-model':
        if not all([args.model_name, args.parameters, args.architecture,
                    args.base_model, args.task]):
            print("All model metadata arguments are required")
            return 1

        data = add_model_metadata(data, args.model_name, args.parameters,
                                  args.architecture, args.base_model, args.task,
                                  args.huggingface_url)
        export_data(data, args.data_file)

    return 0

if __name__ == "__main__":
    exit(main())
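For scripted updates, the same helpers can also be called from Python instead of the CLI. A small sketch under the assumption that `data.yaml` sits next to the scripts; the model name and score are hypothetical placeholders:

```python
from data_loader import NapolabDataLoader
from manage_data import add_benchmark_result, export_data

loader = NapolabDataLoader()  # loads data.yaml
# "my-new-model" and 0.905 are placeholders for a real entry.
data = add_benchmark_result(loader.data, "assin2_rte", "my-new-model", {"accuracy": 0.905})
export_data(data, "data.yaml")  # write the updated file back
```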
portuguese_leaderboard.csv
ADDED
The diff for this file is too large to render.
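The CSV itself is not rendered here, but a hedged sketch of loading it, assuming the columns written by `extract_portuguese_leaderboard.py`:

```python
import pandas as pd

df = pd.read_csv("portuguese_leaderboard.csv")
public = df[~df["model_private"]]  # keep only public models
print(f"{len(public)} public models")
print(public[["model_name", "assin2_rte", "assin2_sts"]].head())
```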
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio>=4.0.0
pandas>=1.5.0
numpy>=1.21.0
plotly>=5.0.0
transformers>=4.20.0
torch>=1.12.0
huggingface-hub>=0.10.0
PyYAML>=6.0
run_app.py
ADDED
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Napolab Leaderboard Launcher Script

This script checks dependencies and launches the Gradio app for the Napolab leaderboard.
"""

import sys
import subprocess
import importlib.util
from pathlib import Path

def check_dependency(package_name):
    """Check if a package is installed."""
    spec = importlib.util.find_spec(package_name)
    return spec is not None

def install_dependencies():
    """Install required dependencies."""
    print("Installing required dependencies...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
        print("Dependencies installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to install dependencies: {e}")
        return False

def main():
    """Main launcher function."""
    print("Napolab Leaderboard Launcher")
    print("=" * 40)

    # Check if we're in the right directory
    if not Path("app.py").exists():
        print("Error: app.py not found. Please run this script from the leaderboard directory.")
        sys.exit(1)

    # Check required dependencies
    required_packages = ["gradio", "pandas", "numpy", "datasets", "plotly"]
    missing_packages = []

    for package in required_packages:
        if not check_dependency(package):
            missing_packages.append(package)

    if missing_packages:
        print(f"Missing dependencies: {', '.join(missing_packages)}")
        print("Installing dependencies...")
        if not install_dependencies():
            print("Failed to install dependencies. Please install them manually:")
            print("pip install -r requirements.txt")
            sys.exit(1)
    else:
        print("All dependencies are installed!")

    # Launch the app
    print("\nLaunching Napolab Leaderboard...")
    print("The app will be available at: http://localhost:7860")
    print("Press Ctrl+C to stop the server")
    print("-" * 40)

    try:
        import app
        # The app will be launched by the import
    except KeyboardInterrupt:
        print("\nServer stopped by user")
    except Exception as e:
        print(f"Error launching app: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
validate_data.py
ADDED
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Validation script for the updated Napolab data structure
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure
import pandas as pd

def main():
    """Validate the updated data structure."""
    print("Validating Updated Napolab Data Structure")
    print("=" * 50)
    print("Data Source: Master's thesis 'Lessons learned from the evaluation of Portuguese language models'")
    print("  by Ruan Chaves Rodrigues (2023) - University of Malta")
    print("  Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557")
    print("=" * 50)

    # Load data
    data_loader = NapolabDataLoader()
    data = data_loader.data

    # Validate structure
    print("\n1. Validating YAML structure...")
    if validate_yaml_structure(data):
        print("YAML structure is valid!")
    else:
        print("YAML structure has issues!")
        return

    # Check datasets
    print("\n2. Checking datasets...")
    datasets = data_loader.get_datasets()
    print(f"Found {len(datasets)} datasets:")
    for name, info in datasets.items():
        print(f"  - {name}: {info['name']} ({', '.join(info['tasks'])})")

    # Check benchmark results
    print("\n3. Checking benchmark results...")
    benchmark_results = data_loader.get_benchmark_results()
    print(f"Found {len(benchmark_results)} benchmark datasets:")
    for dataset_name, models in benchmark_results.items():
        print(f"  - {dataset_name}: {len(models)} models")

    # Check model metadata
    print("\n4. Checking model metadata...")
    model_metadata = data_loader.get_model_metadata()
    print(f"Found {len(model_metadata)} models:")

    # Group models by architecture
    architectures = {}
    for model_name, metadata in model_metadata.items():
        arch = metadata['architecture']
        if arch not in architectures:
            architectures[arch] = []
        architectures[arch].append(model_name)

    for arch, models in architectures.items():
        print(f"  - {arch}: {len(models)} models")
        for model in models[:3]:  # Show first 3 models
            print(f"    * {model}")
        if len(models) > 3:
            print(f"    ... and {len(models) - 3} more")

    # Test data access functions
    print("\n5. Testing data access functions...")

    # Test getting available models for a dataset
    test_dataset = list(benchmark_results.keys())[0]
    models = data_loader.get_available_models_for_dataset(test_dataset)
    print(f"  Available models for {test_dataset}: {len(models)} models")

    # Test getting model info
    if models:
        test_model = models[0]
        model_info = data_loader.get_model_info(test_model)
        if model_info:
            print(f"  Model {test_model}: {model_info['parameters']:,} parameters")

    # Create a summary table
    print("\n6. Creating summary table...")
    summary_data = []

    for dataset_name, models in benchmark_results.items():
        for model_name, metrics in models.items():
            if model_name in model_metadata:
                summary_data.append({
                    'Dataset': dataset_name,
                    'Model': model_name,
                    'Architecture': model_metadata[model_name]['architecture'],
                    'Parameters': model_metadata[model_name]['parameters'],
                    'Performance': metrics.get('accuracy', 0)
                })

    if summary_data:
        df = pd.DataFrame(summary_data)
        print(f"Summary: {len(df)} model-dataset combinations")
        print(f"  Average performance: {df['Performance'].mean():.3f}")
        print(f"  Best performance: {df['Performance'].max():.3f}")
        print(f"  Models with >0.9 performance: {(df['Performance'] > 0.9).sum()}")

    print("\nValidation completed successfully!")
    print("The updated data structure is ready to use!")

if __name__ == "__main__":
    main()