rzanoli committed on
Commit ad05cd8 · 1 Parent(s): 89d23fd

Add performance metrics labels with average, std dev, and best model info.

Files changed (2)
  1. app.py +94 -44
  2. app_22_09_2025.py +625 -0
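
The summary labels described in the commit message are plain pandas aggregations over the leaderboard DataFrame. A minimal sketch of the same calculations, using the column names that appear in the diff below and an invented toy DataFrame purely for illustration:

import pandas as pd

# Toy leaderboard: column names match the diff; the rows are made up for illustration.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Avg. Comb. Perf. ⬆️": [61.2, 54.8, 67.5],
})

models_tested = len(df)                                # "Models tested"
avg_perf = df["Avg. Comb. Perf. ⬆️"].mean()            # "Avg combined perf."
std_perf = df["Avg. Comb. Perf. ⬆️"].std()             # "Std. Dev."
best_idx = df["Avg. Comb. Perf. ⬆️"].idxmax()          # row of the best model
best_model = df.loc[best_idx, "Model"]                 # "Best model"
best_perf = df.loc[best_idx, "Avg. Comb. Perf. ⬆️"]    # "Best model accuracy"

print(f"{models_tested} models, avg {avg_perf:.2f}, std {std_perf:.2f}, "
      f"best {best_model} at {best_perf:.2f}")
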
app.py CHANGED
@@ -490,69 +490,117 @@ if LEADERBOARD_DF is None:
     theoretical_max_combined_perf = 0.0


+# Main Gradio interface
 def create_gradio_interface():
     """The main Gradio interface."""
     demo = gr.Blocks(css=custom_css)

     with demo:
-        # Title
+        # Title
         gr.HTML(create_title_html())
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-        # Charts section
-        with gr.Row():
-            if LEADERBOARD_DF is not None:
-                # Note: You'd need to implement these chart functions properly
-                gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
-                gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
-
-        # Tabs
+        # Main tabs
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
-            # Main leaderboard tab
+            # 🏅 Benchmark
             with gr.TabItem("🏅 Benchmark"):
                 if LEADERBOARD_DF is not None:
-                    leaderboard = init_leaderboard(
-                        LEADERBOARD_DF,
-                        default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT",
-                                           "WIC", "FAQ", "LS", "SU", "NER", "REL"],
-                        hidden_columns=[col for col in LEADERBOARD_DF.columns if
-                                        col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA",
-                                                    "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
-                    )
+                    # Field labels side by side

-                    gr.HTML(
-                        f"""
+                    with gr.Row():
+                        # Field labels side by side
+                        gr.HTML(f"""
                         <div style="
-                            border: 2px solid #1f77b4;
-                            border-radius: 10px;
-                            padding: 10px;
-                            background-color: #f0f8ff;
-                            font-weight: bold;
-                            font-size: 14px;
-                            display: inline-block;
+                            display: flex; justify-content: flex-start; width: 100%; gap: 5px;
                         ">
-                        Theoretical performance of a model that scores the highest on every individual task:
-                        <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Total number of configurations (zero-shot and 5-shot) of the models evaluated in the leaderboard.">
+                                Models tested: {len(LEADERBOARD_DF)}
+                            </div>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Average accuracy of the evaluated models.">
+                                Avg combined perf.: {LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].mean():.2f}
+                            </div>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Standard deviation of the evaluated models' performance.">
+                                Std. Dev.: {LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].std():.2f}
+                            </div>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Best evaluated model.">
+                                Best model: {LEADERBOARD_DF.loc[LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].idxmax(), 'Model']}
+                            </div>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Accuracy of the best evaluated model.">
+                                Best model accuracy: {LEADERBOARD_DF.loc[LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].idxmax(), 'Avg. Comb. Perf. ⬆️']:.2f}
+                            </div>
+                            <div style="
+                                border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px;
+                                background-color: #f0f8ff; font-weight: bold; font-size: 12px;
+                                text-align: center;
+                                display: flex; align-items: center; justify-content: center;
+                                min-height: 28px; white-space: nowrap;
+                            " title="Maximum achievable accuracy based on the highest performance for each task by any model in the leaderboard.">
+                                Theoretical max: {theoretical_max_combined_perf:.2f}
+                            </div>
                         </div>
-                        """
+                        """)
+
+                    # Charts side by side
+                    with gr.Row():
+                        gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
+                        gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
+
+                    # Leaderboard
+                    leaderboard = init_leaderboard(
+                        LEADERBOARD_DF,
+                        default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️",
+                                           "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
+                        hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                        col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️",
+                                                    "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
                     )

-            # About tab
+            # 📝 About
             with gr.TabItem("📝 About"):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+            # Separators
             with gr.TabItem("║", interactive=False):
                 gr.Markdown("", elem_classes="markdown-text")

-            # Task-specific tabs
+            # Task-specific tabs (Multiple Choice)
             if LEADERBOARD_DF is not None:
-                # Multiple choice tasks
                 for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
                     with gr.TabItem(f"{metadata['icon']}{task}"):
                         task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                         gr.Markdown(task_description, elem_classes="markdown-text")

-                        leaderboard = update_task_leaderboard(
+                        leaderboard_task = update_task_leaderboard(
                             LEADERBOARD_DF.rename(columns={
                                 f"{task} Prompt Average": "Prompt Average",
                                 f"{task} Prompt Std": "Prompt Std",
@@ -560,24 +608,26 @@ def create_gradio_interface():
                                 f"{task} Best Prompt Id": "Best Prompt Id",
                                 task: "Comb. Perf. ⬆️"
                             }),
-                            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
-                                               'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
+                            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
+                                               'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
                             hidden_columns=[col for col in LEADERBOARD_DF.columns if
                                             col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
                                                         'Prompt Average', 'Prompt Std', 'Best Prompt',
                                                         'Best Prompt Id']]
                         )

-                with gr.TabItem("│", interactive=False):
-                    gr.Markdown("", elem_classes="markdown-text")
+                # Separators
+                with gr.TabItem("", interactive=False):
+                    gr.Markdown("", elem_classes="markdown-text")

-                # Generative tasks
+                # Task-specific tabs (Generative)
+                if LEADERBOARD_DF is not None:
                 for task, metadata in TASK_METADATA_GENERATIVE.items():
                     with gr.TabItem(f"{metadata['icon']}{task}"):
                         task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                         gr.Markdown(task_description, elem_classes="markdown-text")

-                        leaderboard = update_task_leaderboard(
+                        leaderboard_task = update_task_leaderboard(
                             LEADERBOARD_DF.rename(columns={
                                 f"{task} Prompt Average": "Prompt Average",
                                 f"{task} Prompt Std": "Prompt Std",
@@ -585,15 +635,15 @@ def create_gradio_interface():
                                 f"{task} Best Prompt Id": "Best Prompt Id",
                                 task: "Comb. Perf. ⬆️"
                             }),
-                            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
-                                               'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
+                            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
+                                               'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
                             hidden_columns=[col for col in LEADERBOARD_DF.columns if
                                             col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
                                                         'Prompt Average', 'Prompt Std', 'Best Prompt',
                                                         'Best Prompt Id']]
                         )

-        # Citation and Credits sections
+        # Citation and Credits
         with gr.Accordion("📙 Citation", open=False):
             gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
@@ -622,4 +672,4 @@ if __name__ == "__main__":
     demo.queue(default_concurrency_limit=40).launch(
         debug=True,
         show_error=True
-    )
+    )
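
The six summary boxes added above repeat the same inline style block. A purely illustrative refactor (not part of this commit; the helper name metrics_row_html is hypothetical) would build the row from (text, tooltip) pairs instead:

BOX_STYLE = (
    "border: 2px solid #1f77b4; border-radius: 8px; padding: 4px 8px; "
    "background-color: #f0f8ff; font-weight: bold; font-size: 12px; "
    "text-align: center; display: flex; align-items: center; "
    "justify-content: center; min-height: 28px; white-space: nowrap;"
)

def metrics_row_html(items):
    """Render (text, tooltip) pairs as one flex row of identically styled boxes."""
    boxes = "".join(
        f'<div style="{BOX_STYLE}" title="{tooltip}">{text}</div>'
        for text, tooltip in items
    )
    return (
        '<div style="display: flex; justify-content: flex-start; '
        f'width: 100%; gap: 5px;">{boxes}</div>'
    )

# Example usage with two of the labels from the diff:
html = metrics_row_html([
    ("Models tested: 42", "Total number of configurations evaluated."),
    ("Avg combined perf.: 55.10", "Average accuracy of the evaluated models."),
])
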
app_22_09_2025.py ADDED
@@ -0,0 +1,625 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from functools import lru_cache
7
+ import logging
8
+
9
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, \
10
+ LLM_BENCHMARKS_TEXT, TITLE
11
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
12
+ from src.display.css_html_js import custom_css
13
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, \
14
+ WeightType, Precision
15
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
16
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
17
+ from src.submission.submit import add_new_eval
18
+ import matplotlib.pyplot as plt
19
+ import re
20
+ import plotly.express as px
21
+ import plotly.graph_objects as go
22
+ import numpy as np
23
+
24
+ # Configure logging
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
+
28
+ # EVALITA results
29
+ BASELINES = {
30
+ "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
31
+ "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99
32
+ }
33
+
34
+ # GPT-4o results
35
+ REFERENCES = {
36
+ "NER": 79.11, "REL": 63.32, "LS": 59.25, "SU": 33.04
37
+ }
38
+
39
+ TASK_METADATA_MULTIPLECHOICE = {
40
+ "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
41
+ "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
42
+ "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
43
+ "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
44
+ "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
45
+ "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
46
+ }
47
+
48
+ TASK_METADATA_GENERATIVE = {
49
+ "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
50
+ "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
51
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
52
+ "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
53
+ }
54
+
55
+ def highlight_best_per_task(df):
56
+ """Add 🟡 symbol next to the maximum value in each task column"""
57
+
58
+ task_columns = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
59
+
60
+ df = df.copy()
61
+ for col in task_columns:
62
+ if col in df.columns:
63
+ max_val = df[col].max()
64
+ df[col] = df[col].apply(
65
+ lambda x: f"{x:.1f}🔺" if x == max_val else f"{x:.1f}"
66
+ )
67
+ return df
68
+
69
+ def theoretical_performance(df_hash):
70
+ """
71
+ Theoretical performance of a model that scores the highest on every individual task
72
+ """
73
+ # This is a placeholder - you'd need to pass the actual dataframe
74
+ # In practice, you'd compute this once and store it
75
+ #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
76
+ return 75.0 # Placeholder value
77
+
78
+
79
+ def scale_sizes(values, min_size=8, max_size=30):
80
+ """Normalize sizes for scatter plot markers """
81
+ if not values:
82
+ return []
83
+ vmin, vmax = min(values), max(values)
84
+ if vmax == vmin:
85
+ return [(min_size + max_size) / 2] * len(values)
86
+ return [
87
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size)
88
+ for val in values
89
+ ]
90
+
91
+
92
+ def extract_model_name(model_string):
93
+ """Extract model name from HTML string."""
94
+ match = re.search(r'>([^<]+)<', model_string)
95
+ return match.group(1) if match else model_string
96
+
97
+
98
+ def create_line_chart(dataframe):
99
+ """Create left chart."""
100
+
101
+ def scale_sizes(values, min_size=8, max_size=30):
102
+ vmin, vmax = min(values), max(values)
103
+ return [
104
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin
105
+ else (min_size + max_size) / 2
106
+ for val in values
107
+ ]
108
+
109
+ fig = go.Figure()
110
+
111
+ # Loop over 5-shot and 0-shot
112
+ for shot, color in [(True, "blue"), (False, "red")]:
113
+ df = dataframe[dataframe["IS_FS"] == shot]
114
+
115
+ x = df["#Params (B)"].tolist()
116
+ y = df["Avg. Comb. Perf. ⬆️"].tolist()
117
+ labels = [
118
+ re.search(r'>([^<]+)<', m).group(1) if isinstance(m, str) and re.search(r'>([^<]+)<', m) else str(m)
119
+ for m in df["Model"].tolist()
120
+ ]
121
+
122
+ fig.add_trace(go.Scatter(
123
+ x=x,
124
+ y=y,
125
+ mode="markers",
126
+ name="5-Shot" if shot else "0-Shot",
127
+ marker=dict(color=color, size=scale_sizes(x)),
128
+ hovertemplate="<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>",
129
+ customdata=labels,
130
+ ))
131
+
132
+ # Show the best model
133
+ all_y = dataframe["Avg. Comb. Perf. ⬆️"].tolist()
134
+ if all_y:
135
+ max_idx = all_y.index(max(all_y))
136
+ max_x = dataframe["#Params (B)"].iloc[max_idx]
137
+ max_y = all_y[max_idx]
138
+ max_label = re.search(r'>([^<]+)<', dataframe["Model"].iloc[max_idx]).group(1)
139
+
140
+ fig.add_annotation(
141
+ x=max_x,
142
+ y=max_y,
143
+ text=max_label,
144
+ showarrow=True,
145
+ arrowhead=2,
146
+ arrowsize=1,
147
+ arrowwidth=2,
148
+ arrowcolor="black",
149
+ font=dict(size=11, color="black"),
150
+ xshift=10, yshift=10,
151
+ ax=-30, ay=-20,
152
+ xanchor="right"
153
+ )
154
+
155
+ # Layout
156
+ fig.update_layout(
157
+ title="Average Combined Performance vs #Params",
158
+ xaxis_title="#Params (B)", yaxis_title="Average Combined Performance",
159
+ template="plotly_white", hovermode="closest",
160
+ font=dict(family="Arial", size=10), dragmode=False,
161
+ xaxis=dict(tickvals=[0, 25, 50, 75, 100, 125], ticktext=["0", "25", "50", "75", "100"]),
162
+ yaxis=dict(tickvals=[0, 20, 40, 60, 80, 100], range=[0, 100])
163
+ )
164
+
165
+ # Caption
166
+ fig.add_annotation(
167
+ text="Accuracy generally rises with #Params, but smaller models <br>"
168
+ "with 5-shot can outperform larger zero-shot models.",
169
+ xref="paper", yref="paper", x=0.5, y=-0.3,
170
+ showarrow=False, font=dict(size=11, color="gray"),
171
+ align="center", xanchor="center"
172
+ )
173
+
174
+ fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
175
+ fig.update_yaxes(fixedrange=True)
176
+
177
+ return fig
178
+
179
+
180
+ def create_boxplot_task(dataframe=None, baselines=None, references=None):
181
+ """Create right chart"""
182
+
183
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
184
+
185
+ # Default data if none is provided
186
+ if dataframe is None:
187
+ np.random.seed(42)
188
+ dataframe = pd.DataFrame({task: np.random.uniform(0.4, 0.9, 20) * 100 for task in tasks})
189
+
190
+ if baselines is None:
191
+ baselines = {task: np.random.randint(50, 70) for task in tasks}
192
+
193
+ if references is None:
194
+ references = {}
195
+
196
+ colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
197
+ "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
198
+
199
+ fig = go.Figure()
200
+
201
+ for i, task in enumerate(tasks):
202
+ if task not in dataframe.columns:
203
+ continue
204
+
205
+ y_data = dataframe[task].dropna().tolist()
206
+
207
+ # Boxplot
208
+ fig.add_trace(go.Box(
209
+ y=y_data,
210
+ name=task,
211
+ marker=dict(color=colors[i]),
212
+ line=dict(color="black", width=2),
213
+ fillcolor=colors[i],
214
+ opacity=0.7,
215
+ hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
216
+ hoverlabel=dict(bgcolor=colors[i], font_color="white"),
217
+ width=0.6,
218
+ whiskerwidth=0.2,
219
+ quartilemethod="linear"
220
+ ))
221
+
222
+ # Baseline line
223
+ baseline_value = baselines.get(task)
224
+ if baseline_value is not None:
225
+ fig.add_shape(
226
+ type="line",
227
+ x0=i - 0.3, x1=i + 0.3,
228
+ y0=baseline_value, y1=baseline_value,
229
+ line=dict(color="black", width=2, dash="dot"),
230
+ xref="x", yref="y"
231
+ )
232
+
233
+ # GPT-4o reference line
234
+ reference_value = references.get(task)
235
+ if reference_value is not None:
236
+ fig.add_shape(
237
+ type="line",
238
+ x0=i - 0.3, x1=i + 0.3,
239
+ y0=reference_value, y1=reference_value,
240
+ line=dict(color="red", width=2, dash="dashdot"),
241
+ xref="x", yref="y"
242
+ )
243
+
244
+ # Layout
245
+ fig.update_layout(
246
+ title="Distribution of Model Accuracy by Task",
247
+ xaxis_title="Task",
248
+ yaxis_title="Combined Performance",
249
+ template="plotly_white",
250
+ boxmode="group",
251
+ dragmode=False,
252
+ font=dict(family="Arial", size=10),
253
+ margin=dict(b=80),
254
+ )
255
+
256
+ # Caption
257
+ fig.add_annotation(
258
+ text=(
259
+ "In tasks like TE and SA, models approach the accuracy of supervised <br>"
260
+ "models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
261
+ "Dashed red lines show GPT-4o reference results for generative tasks."
262
+ ),
263
+ xref="paper", yref="paper",
264
+ x=0.5, y=-0.30,
265
+ showarrow=False,
266
+ font=dict(size=11, color="gray"),
267
+ align="center"
268
+ )
269
+
270
+ fig.update_yaxes(range=[0, 100], fixedrange=True)
271
+ fig.update_xaxes(fixedrange=True)
272
+
273
+ return fig
274
+
275
+
276
+ def create_medal_assignments(sorted_df):
277
+ """Function for medal assignment logic"""
278
+ medals = {
279
+ 'large_fs': False, 'medium_fs': False, 'small_fs': False,
280
+ 'large_0shot': False, 'medium_0shot': False, 'small_0shot': False
281
+ }
282
+
283
+ new_model_column = []
284
+
285
+ for _, row in sorted_df.iterrows():
286
+ model_name = row['Model']
287
+ size = row["Size"]
288
+ is_fs = row['IS_FS']
289
+
290
+ if is_fs: # 5-Few-Shot
291
+ if size == "🔵🔵🔵" and not medals['large_fs']:
292
+ model_name = f"{model_name} 🔵🔵🔵🏆"
293
+ medals['large_fs'] = True
294
+ elif size == "🔵🔵" and not medals['medium_fs']:
295
+ model_name = f"{model_name} 🔵🔵🏆"
296
+ medals['medium_fs'] = True
297
+ elif size == "🔵" and not medals['small_fs']:
298
+ model_name = f"{model_name} 🔵🏆"
299
+ medals['small_fs'] = True
300
+ else: # 0-Shot
301
+ if size == "🔵🔵🔵" and not medals['large_0shot']:
302
+ model_name = f"{model_name} 🔵🔵🔵🎖️"
303
+ medals['large_0shot'] = True
304
+ elif size == "🔵🔵" and not medals['medium_0shot']:
305
+ model_name = f"{model_name} 🔵🔵🎖️"
306
+ medals['medium_0shot'] = True
307
+ elif size == "🔵" and not medals['small_0shot']:
308
+ model_name = f"{model_name} 🔵🎖️"
309
+ medals['small_0shot'] = True
310
+
311
+ new_model_column.append(model_name)
312
+
313
+ return new_model_column
314
+
315
+
316
+ def create_leaderboard_base(sorted_dataframe, field_list, hidden_columns):
317
+ """Base leaderboard creation with common parameters. """
318
+
319
+ return Leaderboard(
320
+ value=sorted_dataframe,
321
+ datatype=[c.type for c in field_list],
322
+ search_columns=[AutoEvalColumn.model.name],
323
+ hide_columns=hidden_columns,
324
+ filter_columns=[
325
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
326
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
327
+ label="Select the number of parameters (B)"),
328
+ ],
329
+ bool_checkboxgroup_label="Evaluation Mode",
330
+ interactive=False,
331
+ )
332
+
333
+
334
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
335
+ """Leaderboard initialization """
336
+ if dataframe is None or dataframe.empty:
337
+ raise ValueError("Leaderboard DataFrame is empty or None.")
338
+
339
+ # Sort and reset index
340
+ sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
341
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
342
+
343
+ # Apply medal assignments
344
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
345
+
346
+ # Show the best values for tasks
347
+ sorted_dataframe = highlight_best_per_task(sorted_dataframe)
348
+
349
+ field_list = fields(AutoEvalColumn)
350
+
351
+ return create_leaderboard_base(sorted_dataframe, field_list, hidden_columns)
352
+
353
+
354
+ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
355
+
356
+ """ Task-specific leaderboard update."""
357
+ if dataframe is None or dataframe.empty:
358
+ raise ValueError("Leaderboard DataFrame is empty or None.")
359
+
360
+ # Sort and reset index
361
+ sorted_dataframe = dataframe.sort_values(by="Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
362
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
363
+
364
+ # Apply medal assignments
365
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
366
+
367
+ field_list = fields(AutoEvalColumn)
368
+
369
+ return Leaderboard(
370
+ value=sorted_dataframe,
371
+ datatype=[c.type for c in field_list] + [int],
372
+ search_columns=[AutoEvalColumn.model.name],
373
+ hide_columns=hidden_columns,
374
+ filter_columns=[
375
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
376
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
377
+ label="Select the number of parameters (B)"),
378
+ ],
379
+ bool_checkboxgroup_label="Evaluation Mode",
380
+ interactive=False
381
+ )
382
+
383
+
384
+ def download_snapshot(repo, local_dir, max_retries=3):
385
+ """Snapshot download with retry logic."""
386
+ for attempt in range(max_retries):
387
+ try:
388
+ logger.info(f"Downloading from {repo} to {local_dir} (attempt {attempt + 1}/{max_retries})")
389
+ snapshot_download(
390
+ repo_id=repo,
391
+ local_dir=local_dir,
392
+ repo_type="dataset",
393
+ tqdm_class=None,
394
+ etag_timeout=30,
395
+ token=TOKEN
396
+ )
397
+ return True
398
+ except Exception as e:
399
+ logger.error(f"Error downloading {repo} (attempt {attempt + 1}): {e}")
400
+ if attempt == max_retries - 1:
401
+ logger.error(f"Failed to download {repo} after {max_retries} attempts")
402
+ return False
403
+ return False
404
+
405
+
406
+ def restart_space():
407
+ """Restart the Hugging Face space."""
408
+ try:
409
+ logger.info("Restarting space... ")
410
+ API.restart_space(repo_id=REPO_ID)
411
+ except Exception as e:
412
+ logger.error(f"Error restarting space: {e}")
413
+
414
+
415
+ def create_title_html():
416
+ """Function for title HTML."""
417
+ return """
418
+ <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
419
+ <h1 style="
420
+ margin: 0 auto;
421
+ font-weight: 900;
422
+ font-size: 2.5em;
423
+ letter-spacing: 2px;
424
+ text-transform: uppercase;
425
+ background: linear-gradient(90deg, #1f77b4, #00c6ff);
426
+ -webkit-background-clip: text;
427
+ -webkit-text-fill-color: transparent;
428
+ text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
429
+ ">
430
+ EVALITA-LLM Leaderboard
431
+ </h1>
432
+ <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank"
433
+ style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">
434
+ <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">
435
+ <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
436
+ <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
437
+ </svg>
438
+ Open Italian LLM Leaderboard
439
+ </a>
440
+ </div>
441
+ """
442
+
443
+
444
+ def create_credits_markdown():
445
+ """Credits section."""
446
+ return """
447
+ **This project has benefited from the following support:**
448
+
449
+ - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
450
+
451
+ - 💶 **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
452
+
453
+ - 🖥️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
454
+ """
455
+
456
+
457
+ # Main initialization
458
+ def initialize_app():
459
+ """Initialize the application ."""
460
+ try:
461
+ # Download snapshots
462
+ queue_success = download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
463
+ results_success = download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
464
+
465
+ if not (queue_success and results_success):
466
+ logger.error("Failed to download required data")
467
+ return None, None, None, None, None
468
+
469
+ # Load leaderboard data
470
+ leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
471
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
472
+ EVAL_REQUESTS_PATH, EVAL_COLS)
473
+
474
+ # Calculate theoretical max performance
475
+ theoretical_max = theoretical_performance(hash(str(leaderboard_df.values.tobytes())))
476
+
477
+ return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max
478
+
479
+ except Exception as e:
480
+ logger.error(f"Error initializing app: {e}")
481
+ return None, None, None, None, None
482
+
483
+
484
+ # Initialize data
485
+ LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()
486
+
487
+ if LEADERBOARD_DF is None:
488
+ # Fallback behavior
489
+ logger.error("Failed to initialize app data")
490
+ theoretical_max_combined_perf = 0.0
491
+
492
+
493
+ def create_gradio_interface():
494
+ """The main Gradio interface."""
495
+ demo = gr.Blocks(css=custom_css)
496
+
497
+ with demo:
498
+ # Title
499
+ gr.HTML(create_title_html())
500
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
501
+
502
+ # Charts section
503
+ with gr.Row():
504
+ if LEADERBOARD_DF is not None:
505
+ # Note: You'd need to implement these chart functions properly
506
+ gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
507
+ gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
508
+
509
+ # Tabs
510
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
511
+ # Main leaderboard tab
512
+ with gr.TabItem("🏅 Benchmark"):
513
+ if LEADERBOARD_DF is not None:
514
+ leaderboard = init_leaderboard(
515
+ LEADERBOARD_DF,
516
+ default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT",
517
+ "WIC", "FAQ", "LS", "SU", "NER", "REL"],
518
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
519
+ col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA",
520
+ "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
521
+ )
522
+
523
+ gr.HTML(
524
+ f"""
525
+ <div style="
526
+ border: 2px solid #1f77b4;
527
+ border-radius: 10px;
528
+ padding: 10px;
529
+ background-color: #f0f8ff;
530
+ font-weight: bold;
531
+ font-size: 14px;
532
+ display: inline-block;
533
+ ">
534
+ Theoretical performance of a model that scores the highest on every individual task:
535
+ <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
536
+ </div>
537
+ """
538
+ )
539
+
540
+ # About tab
541
+ with gr.TabItem("📝 About"):
542
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
543
+
544
+ with gr.TabItem("║", interactive=False):
545
+ gr.Markdown("", elem_classes="markdown-text")
546
+
547
+ # Task-specific tabs
548
+ if LEADERBOARD_DF is not None:
549
+ # Multiple choice tasks
550
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
551
+ with gr.TabItem(f"{metadata['icon']}{task}"):
552
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
553
+ gr.Markdown(task_description, elem_classes="markdown-text")
554
+
555
+ leaderboard = update_task_leaderboard(
556
+ LEADERBOARD_DF.rename(columns={
557
+ f"{task} Prompt Average": "Prompt Average",
558
+ f"{task} Prompt Std": "Prompt Std",
559
+ f"{task} Best Prompt": "Best Prompt",
560
+ f"{task} Best Prompt Id": "Best Prompt Id",
561
+ task: "Comb. Perf. ⬆️"
562
+ }),
563
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
564
+ 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
565
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
566
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
567
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
568
+ 'Best Prompt Id']]
569
+ )
570
+
571
+ with gr.TabItem("│", interactive=False):
572
+ gr.Markdown("", elem_classes="markdown-text")
573
+
574
+ # Generative tasks
575
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
576
+ with gr.TabItem(f"{metadata['icon']}{task}"):
577
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
578
+ gr.Markdown(task_description, elem_classes="markdown-text")
579
+
580
+ leaderboard = update_task_leaderboard(
581
+ LEADERBOARD_DF.rename(columns={
582
+ f"{task} Prompt Average": "Prompt Average",
583
+ f"{task} Prompt Std": "Prompt Std",
584
+ f"{task} Best Prompt": "Best Prompt",
585
+ f"{task} Best Prompt Id": "Best Prompt Id",
586
+ task: "Comb. Perf. ⬆️"
587
+ }),
588
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
589
+ 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
590
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
591
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
592
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
593
+ 'Best Prompt Id']]
594
+ )
595
+
596
+ # Citation and Credits sections
597
+ with gr.Accordion("📙 Citation", open=False):
598
+ gr.Textbox(
599
+ value=CITATION_BUTTON_TEXT,
600
+ label=CITATION_BUTTON_LABEL,
601
+ lines=20,
602
+ elem_id="citation-button",
603
+ show_copy_button=True
604
+ )
605
+
606
+ with gr.Accordion("📙 Credits", open=False):
607
+ gr.Markdown(create_credits_markdown())
608
+
609
+ return demo
610
+
611
+
612
+ # Create and configure the demo
613
+ demo = create_gradio_interface()
614
+
615
+ # Background scheduler for space restart
616
+ scheduler = BackgroundScheduler()
617
+ scheduler.add_job(restart_space, "interval", seconds=1800)
618
+ scheduler.start()
619
+
620
+ # Launch configuration
621
+ if __name__ == "__main__":
622
+ demo.queue(default_concurrency_limit=40).launch(
623
+ debug=True,
624
+ show_error=True
625
+ )