fix(leaderboard):
1. Fixed the issue where, with markdown as the datatype of the model_name column, long text overflowed into the next column.
2. Removed the dependency where the model type was determined based on a symbol.
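In Gradio terms the table fix boils down to the following (a minimal sketch, assuming Gradio 4.x; the variable name is illustrative):

    import gradio as gr

    # Before: markdown cells combined with fixed column widths and wrap=False
    # let long model names spill into the neighbouring column.
    # table = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)

    # After: declare per-column datatypes (including "number" for score columns)
    # and let Gradio size the columns itself.
    table = gr.Dataframe(datatype=["str", "markdown", "number"])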
app.py CHANGED

@@ -1,8 +1,8 @@
 import gradio as gr
+from gradio_rangeslider import RangeSlider
 
 import core as core
-from style import CSS, LANG_SYMBOLS,
-from gradio_rangeslider import RangeSlider
+from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE
 
 demo = gr.Blocks(css=CSS)
 with demo:
@@ -100,9 +100,8 @@ with demo:
             inputs=[],
             outputs=shown_tasks,
         )
-
-
-        leaderboard_table = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
+
+        leaderboard_table = gr.Dataframe(datatype=["str", "markdown", "number"])
 
         with gr.TabItem(
             "π LLM accuracy benchmark (Zero-Shot)",
@@ -188,8 +187,7 @@ with demo:
             inputs=[],
             outputs=shown_tasks_zero_shot,
         )
-        leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"],
-                                                   wrap=False)
+        leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown", "number"])
 
         with gr.TabItem(
             "π LLM translation benchmark",
@@ -276,7 +274,7 @@ with demo:
             outputs=shown_tasks_misc,
         )
 
-        leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
+        leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown", "number"])
 
         with gr.TabItem(
             "π LLM MT-Bench benchmark",
@@ -319,8 +317,7 @@ with demo:
             outputs=langs_bar_mtbench,
         )
 
-        leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"],
-                                                 wrap=False)
+        leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown", "number"])
 
     for comp, fn in [
         (search_bar, "submit"),
@@ -331,7 +328,7 @@ with demo:
    ]:
        getattr(comp, fn)(
            core.update_df,
-           [shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True)],
+           [gr.State(value=0), shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True), model_types],
            # [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
            leaderboard_table,
        )
@@ -345,8 +342,8 @@ with demo:
    ]:
        getattr(comp, fn)(
            core.update_df,
-           [shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
-            model_sizes_zero_shot, gr.State(value=False)],
+           [gr.State(value=1), shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
+            model_sizes_zero_shot, gr.State(value=False), model_types_zero_shot],
            leaderboard_table_zero_shot,
        )
@@ -359,8 +356,8 @@ with demo:
    ]:
        getattr(comp, fn)(
            core.update_df,
-           [shown_tasks_misc, search_bar_misc, langs_bar_misc,
-            gr.State(value=False)],
+           [gr.State(value=2), shown_tasks_misc, search_bar_misc, langs_bar_misc, model_sizes_misc,
+            gr.State(value=False), model_types_misc],
            leaderboard_table_misc,
        )
@@ -370,41 +367,42 @@ with demo:
    ]:
        getattr(comp, fn)(
            core.update_df,
-           [gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
-            search_bar_mtbench, langs_bar_mtbench,
-            gr.State(value=False)],
+           [gr.State(value=3),
+            gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
+            search_bar_mtbench, langs_bar_mtbench, gr.State(value=False)],
            leaderboard_table_mtbench,
        )
 
    gr.Blocks.load(
        block=demo,
        fn=core.update_df,
-       inputs=[shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True)],
-       # inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
+       inputs=[gr.State(value=0), shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True), model_types],
        outputs=leaderboard_table,
    )
 
    gr.Blocks.load(
        block=demo,
        fn=core.update_df,
-       inputs=[shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
-               model_sizes_zero_shot, gr.State(value=False)],
+       inputs=[gr.State(value=1), shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
+               model_sizes_zero_shot, gr.State(value=False), model_types_zero_shot],
        outputs=leaderboard_table_zero_shot,
    )
 
    gr.Blocks.load(
        block=demo,
        fn=core.update_df,
-       inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc,
-               gr.State(value=False)],
+       inputs=[gr.State(value=2), shown_tasks_misc, search_bar_misc, langs_bar_misc, model_sizes_misc,
+               gr.State(value=False), model_types_misc],
        outputs=leaderboard_table_misc,
    )
 
+   # We do not have a checkbox for model_type in mt_bench, hence there is no model_types variable
    gr.Blocks.load(
        block=demo,
        fn=core.update_df,
-       inputs=[gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
-               search_bar_mtbench, langs_bar_mtbench, gr.State(value=False)],
+       inputs=[gr.State(value=3),
+               gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
+               search_bar_mtbench, langs_bar_mtbench, gr.State(value=False)],
        outputs=leaderboard_table_mtbench,
    )
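The pattern repeated above threads a constant tab id into core.update_df through gr.State, so a single callback can serve all four leaderboards. A minimal, self-contained sketch of that wiring (the stub body and component names are illustrative, not the app's real logic):

    import gradio as gr

    def update_df(current_selected_tab: int, model_query: str):
        # The real update_df filters the hidden results dataframe;
        # this stub only echoes which tab requested the refresh.
        return [[f"tab {current_selected_tab}", model_query, 0.0]]

    with gr.Blocks() as demo:
        search_bar = gr.Textbox(label="Search models")
        table = gr.Dataframe(datatype=["str", "markdown", "number"])
        # gr.State(value=0) pins the tab id for this handler; each tab
        # wires in its own constant (0-3) the same way.
        search_bar.submit(update_df, [gr.State(value=0), search_bar], table)

    demo.launch()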
core.py CHANGED

@@ -1,19 +1,21 @@
 import itertools
 import os
 
+import gradio as gr
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
-from utils import add_model_hyperlink
 
 import style
+from style import T_SYMBOLS, MT_BENCH_LANG_SYMBOLS, LANG_SYMBOLS
+from utils import add_model_hyperlink
 
 ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 
 
 def init():
-    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict, mt_bench_language_list, model_link_dict, model_size_dict
+    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_df, model_type_dict, mt_bench_language_list, model_link_dict, model_size_dict
 
     repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
     config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
@@ -114,18 +116,23 @@ def select_shots(df: pd.DataFrame, fewshot: bool = False):
 
 
 def update_df(
+    current_selected_tab: int,
     tasks: list[str],
     model_query: str,
     langs: list[str],
-    model_types: list[str],
     model_sizes: list[str],
     fewshot: bool = False,
+    model_types: list[str] = None,
     format: bool = True,
 ) -> pd.DataFrame:
     """Return a filtered dataframe according to selected models, tasks and
     languages. The format flag controls whether the output dataframe should
     be formatted to two significant figures.
     """
+    if current_selected_tab == 3:
+        model_types = [T_SYMBOLS["chat"]]
+
     # keep only selected shots
     df = select_shots(hidden_df, fewshot)
 
@@ -147,6 +154,66 @@ def update_df(
     return sort_cols(df, fewshot)
 
 
+def update_task_groups_and_fewshot(current_selected_tab: int, model_types, langs_bar,
+                                   is_fewshot_current: bool = False):
+    selected_task_type = get_selected_task_type(current_selected_tab)
+    available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
+    new_selected_tasks = available_tasks.copy()
+
+    tasks_checkbox_group_update = gr.CheckboxGroup(
+        choices=available_tasks,
+        value=new_selected_tasks,
+    )
+
+    if current_selected_tab == 0:
+        is_fewshot_new = is_fewshot_current
+        fewshot_available = True
+    elif current_selected_tab == 1:
+        is_fewshot_new = False
+        fewshot_available = False
+    elif current_selected_tab == 2:
+        is_fewshot_new = False
+        fewshot_available = False
+    else:
+        raise ValueError(f"Unknown tab id {current_selected_tab}")
+
+    fewshot_radio_update = gr.Radio(
+        value=is_fewshot_new,
+        interactive=fewshot_available,
+    )
+
+    if current_selected_tab == 2:
+        model_types = gr.CheckboxGroup(
+            value=[T_SYMBOLS["chat"]],
+            interactive=False,
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(MT_BENCH_LANG_SYMBOLS.get(l, l), l) for l in mt_bench_language_list],
+            value=mt_bench_language_list,
+            interactive=True,
+        )
+    else:
+        model_types = gr.CheckboxGroup(
+            label="Select model type",
+            choices=[
+                (
+                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                    T_SYMBOLS["pretrained"],
+                ),
+                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+            ],
+            value=list(T_SYMBOLS.values()),
+            interactive=True,
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(LANG_SYMBOLS.get(l, l), l) for l in languages_list],
+            value=languages_list,
+            interactive=True,
+        )
+
+    return [tasks_checkbox_group_update, current_selected_tab, model_types, langs_bar]
+
+
 def get_selected_task_type(task_type_id):
     task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score", 3: "accuracy"}
     selected_task_type = task_types[task_type_id]
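The new update_task_groups_and_fewshot leans on Gradio's update pattern: a callback returns freshly constructed components, and Gradio applies their properties to the components already on screen. A self-contained sketch of that mechanism (the language lists and the tab id here are placeholder assumptions):

    import gradio as gr

    LANGS = ["DE", "EN", "FR"]
    MT_BENCH_LANGS = ["DE", "EN"]

    def on_tab_selected(tab_id: int):
        # Mirrors the core.py logic: the MT-Bench tab (id 2) restricts the
        # language checkboxes, any other tab restores the full list.
        langs = MT_BENCH_LANGS if tab_id == 2 else LANGS
        return gr.CheckboxGroup(choices=langs, value=langs, interactive=True)

    with gr.Blocks() as demo:
        langs_bar = gr.CheckboxGroup(choices=LANGS, value=LANGS, label="Languages")
        btn = gr.Button("Simulate selecting the MT-Bench tab")
        btn.click(on_tab_selected, inputs=[gr.State(value=2)], outputs=[langs_bar])

    demo.launch()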