Alex Jude KlaudiaTH committed on
Commit
da6c970
·
unverified ·
1 Parent(s): 6351c6b

New leaderboard design (#19)

Browse files

* MT-BENCH: Model type is now fixed at "chat" for MT-BENCH. Pretrained models are neither shown nor selectable.
* MT-BENCH: Language selection in MT-BENCH tab is limited to EN, DE, ES, FR, IT
* MT-BENCH: Don't select all 22 languages when the "Select all languages" button is pressed in the MT-Bench tab.
* New Leaderboard Design: New design skeleton
* New Leaderboard Design: Removed unnecessary updates
* New Leaderboard Design: Introduced Zero-Shot tab instead of radio buttons

---------

Co-authored-by: KlaudiaTH <[email protected]>

Files changed (3) hide show
  1. app.py +280 -83
  2. core.py +3 -65
  3. style.py +96 -11
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
 
3
  import core as core
4
- from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE, MT_BENCH_LANG_SYMBOLS
5
 
6
  demo = gr.Blocks(css=CSS)
7
  with demo:
@@ -14,8 +14,12 @@ with demo:
14
 
15
  selected_tab = gr.State(value=0)
16
 
17
- with gr.Column():
18
- with gr.Row():
 
 
 
 
19
  with gr.Column():
20
  with gr.Row():
21
  search_bar = gr.Textbox(
@@ -24,7 +28,6 @@ with demo:
24
  show_label=True,
25
  elem_id="search-bar",
26
  )
27
-
28
  model_types = gr.CheckboxGroup(
29
  label="Select model type",
30
  choices=[
@@ -36,6 +39,7 @@ with demo:
36
  ],
37
  value=list(T_SYMBOLS.values()),
38
  )
 
39
  with gr.Row():
40
  langs_bar = gr.CheckboxGroup(
41
  choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
@@ -52,125 +56,318 @@ with demo:
52
  size="sm",
53
  scale=1,
54
  )
55
- select = gr.Button(value="Select all languages", size="sm", scale=1)
56
-
57
- def update_bar(selected_tab):
58
- if selected_tab == 2:
59
- choices = [(MT_BENCH_LANG_SYMBOLS.get(l, l), l) for l in core.mt_bench_language_list]
60
- value = core.mt_bench_language_list
61
- else:
62
- choices = [(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list]
63
- value = core.languages_list
64
- langs_bar = gr.CheckboxGroup(
65
- choices=choices,
66
- value=value,
67
- label="Select languages to average over",
68
- elem_id="column-select",
69
- interactive=True,
70
- )
71
- return langs_bar
72
-
73
- select.click(update_bar, inputs=[selected_tab], outputs=langs_bar)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  with gr.Row():
76
- shown_tasks = gr.CheckboxGroup(
77
- choices=[],
78
- value=[],
79
- label="Select tasks to show",
80
  elem_id="column-select",
81
  interactive=True,
82
- scale=50,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  )
84
- fewshot = gr.Radio(
85
- choices=[("0-Shot", False), ("Few-shot", True)],
86
- value=True,
87
- label="Select evaluation type",
88
- scale=29,
89
  )
90
- clear = gr.ClearButton(shown_tasks, value="Deselect all tasks", size="sm", scale=21)
91
-
92
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
93
- with gr.TabItem("๐Ÿ… LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0) as acc:
94
- leaderboard_table = gr.Dataframe()
95
- with gr.TabItem(
96
- "๐ŸŒ LLM translation benchmark",
97
- elem_id="llm-benchmark-tab-table-misc",
98
- id=1,
99
- ) as misc:
100
- leaderboard_table_misc = gr.Dataframe()
101
- with gr.TabItem(
102
- "๐ŸŒ LLM MT-Bench benchmark",
103
- elem_id="llm-benchmark-tab-table-mtbench",
104
- id=2,
105
- ) as mtbench:
106
- leaderboard_table_mtbench = gr.Dataframe()
107
-
108
- demo.load(
109
- core.update_task_groups_and_fewshot,
110
- [gr.State(value=0), model_types, langs_bar,fewshot],
111
- [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
112
- )
113
- fewshot.change(
114
- core.update_task_groups_and_fewshot,
115
- [selected_tab, model_types, langs_bar, fewshot],
116
- [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
117
- )
118
- acc.select(
119
- core.update_task_groups_and_fewshot,
120
- inputs=[gr.State(value=0), model_types, langs_bar, fewshot],
121
- outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
122
- )
123
- misc.select(
124
- core.update_task_groups_and_fewshot,
125
- inputs=[gr.State(value=1), model_types, langs_bar, fewshot],
126
- outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
127
- )
128
- mtbench.select(
129
- core.update_task_groups_and_fewshot,
130
- inputs=[gr.State(value=2), model_types, langs_bar, fewshot],
131
- outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
132
- )
133
  for comp, fn in [
134
  (search_bar, "submit"),
135
  (langs_bar, "change"),
136
  (shown_tasks, "change"),
137
- (fewshot, "change"),
138
  (model_types, "change"),
139
  ]:
140
  getattr(comp, fn)(
141
  core.update_df,
142
- [shown_tasks, search_bar, langs_bar, model_types, fewshot],
143
  leaderboard_table,
144
  )
 
 
 
 
 
 
 
145
  getattr(comp, fn)(
146
  core.update_df,
147
- [shown_tasks, search_bar, langs_bar, model_types, fewshot],
 
 
 
 
 
 
 
 
 
 
 
 
148
  leaderboard_table_misc,
149
  )
 
 
 
 
 
150
  getattr(comp, fn)(
151
  core.update_df,
152
- [shown_tasks, search_bar, langs_bar, model_types, fewshot],
153
  leaderboard_table_mtbench,
154
  )
155
 
156
  gr.Blocks.load(
157
  block=demo,
158
  fn=core.update_df,
159
- inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
160
  outputs=leaderboard_table,
161
  )
162
 
163
  gr.Blocks.load(
164
  block=demo,
165
  fn=core.update_df,
166
- inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
 
 
 
 
 
 
 
167
  outputs=leaderboard_table_misc,
168
  )
169
 
170
  gr.Blocks.load(
171
  block=demo,
172
  fn=core.update_df,
173
- inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
174
  outputs=leaderboard_table_mtbench,
175
  )
176
 
 
1
  import gradio as gr
2
 
3
  import core as core
4
+ from style import CSS, LANG_SYMBOLS, MT_BENCH_LANG_SYMBOLS, T_SYMBOLS, TITLE
5
 
6
  demo = gr.Blocks(css=CSS)
7
  with demo:
 
14
 
15
  selected_tab = gr.State(value=0)
16
 
17
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
18
+ with gr.TabItem(
19
+ "๐Ÿ… LLM accuracy benchmark",
20
+ elem_id="llm-benchmark-tab-table-acc",
21
+ id=0,
22
+ ) as acc:
23
  with gr.Column():
24
  with gr.Row():
25
  search_bar = gr.Textbox(
 
28
  show_label=True,
29
  elem_id="search-bar",
30
  )
 
31
  model_types = gr.CheckboxGroup(
32
  label="Select model type",
33
  choices=[
 
39
  ],
40
  value=list(T_SYMBOLS.values()),
41
  )
42
+
43
  with gr.Row():
44
  langs_bar = gr.CheckboxGroup(
45
  choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
 
56
  size="sm",
57
  scale=1,
58
  )
59
+ select = gr.Button(
60
+ value="Select all languages",
61
+ size="sm",
62
+ scale=1,
63
+ )
64
+ select.click(
65
+ lambda: gr.CheckboxGroup(value=core.languages_list),
66
+ inputs=[],
67
+ outputs=langs_bar,
68
+ )
69
+
70
+ with gr.Row():
71
+ shown_tasks = gr.CheckboxGroup(
72
+ choices=core.get_available_task_groups(core.get_selected_task_type(0), True),
73
+ value=core.get_available_task_groups(core.get_selected_task_type(0), True),
74
+ label="Select tasks to show",
75
+ elem_id="column-select",
76
+ interactive=True,
77
+ scale=50,
78
+ )
79
+ clear = gr.ClearButton(
80
+ shown_tasks,
81
+ value="Deselect all tasks",
82
+ size="sm",
83
+ scale=1,
84
+ )
85
+ select = gr.Button(
86
+ value="Select all tasks",
87
+ size="sm",
88
+ scale=1,
89
+ )
90
+ select.click(
91
+ lambda: gr.CheckboxGroup(value=core.get_available_task_groups(core.get_selected_task_type(0), True)),
92
+ inputs=[],
93
+ outputs=shown_tasks,
94
+ )
95
+ leaderboard_table = gr.Dataframe()
96
+
97
+ with gr.TabItem(
98
+ "๐Ÿ… LLM accuracy benchmark (Zero-Shot)",
99
+ elem_id="llm-benchmark-tab-table-acc-zeroshot",
100
+ id=3,
101
+ ) as acc_zero_shot:
102
+ with gr.Column():
103
+ with gr.Row():
104
+ search_bar_zero_shot = gr.Textbox(
105
+ label="Search models",
106
+ placeholder=" ๐Ÿ” Separate multiple queries with ';' and press ENTER...",
107
+ show_label=True,
108
+ elem_id="search-bar",
109
+ )
110
+ model_types_zero_shot = gr.CheckboxGroup(
111
+ label="Select model type",
112
+ choices=[
113
+ (
114
+ f"Pretrained {T_SYMBOLS['pretrained']}",
115
+ T_SYMBOLS["pretrained"],
116
+ ),
117
+ (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
118
+ ],
119
+ value=list(T_SYMBOLS.values()),
120
+ )
121
+
122
+ with gr.Row():
123
+ langs_bar_zero_shot = gr.CheckboxGroup(
124
+ choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
125
+ value=core.languages_list,
126
+ label="Select languages to average over",
127
+ elem_id="column-select",
128
+ interactive=True,
129
+ scale=6,
130
+ )
131
+ with gr.Column(scale=1):
132
+ clear_zero_shot = gr.ClearButton(
133
+ langs_bar_zero_shot,
134
+ value="Deselect all languages",
135
+ size="sm",
136
+ scale=1,
137
+ )
138
+ select_zero_shot = gr.Button(
139
+ value="Select all languages",
140
+ size="sm",
141
+ scale=1,
142
+ )
143
+ select_zero_shot.click(
144
+ lambda: gr.CheckboxGroup(value=core.languages_list),
145
+ inputs=[],
146
+ outputs=langs_bar_zero_shot,
147
+ )
148
+
149
+ with gr.Row():
150
+ shown_tasks_zero_shot = gr.CheckboxGroup(
151
+ choices=core.get_available_task_groups(core.get_selected_task_type(3), False),
152
+ value=core.get_available_task_groups(core.get_selected_task_type(3), False),
153
+ label="Select tasks to show",
154
+ elem_id="column-select",
155
+ interactive=True,
156
+ scale=50,
157
+ )
158
+ clear_zero_shot = gr.ClearButton(
159
+ shown_tasks_zero_shot,
160
+ value="Deselect all tasks",
161
+ size="sm",
162
+ scale=1,
163
+ )
164
+ select_zero_shot = gr.Button(
165
+ value="Select all tasks",
166
+ size="sm",
167
+ scale=1,
168
+ )
169
+ select_zero_shot.click(
170
+ lambda: gr.CheckboxGroup(value=core.get_available_task_groups(core.get_selected_task_type(3), False)),
171
+ inputs=[],
172
+ outputs=shown_tasks_zero_shot,
173
+ )
174
+ leaderboard_table_zero_shot = gr.Dataframe()
175
+
176
+ with gr.TabItem(
177
+ "๐ŸŒ LLM translation benchmark",
178
+ elem_id="llm-benchmark-tab-table-misc",
179
+ id=1,
180
+ ) as misc:
181
+ with gr.Column():
182
+ with gr.Row():
183
+ search_bar_misc = gr.Textbox(
184
+ label="Search models",
185
+ placeholder=" ๐Ÿ” Separate multiple queries with ';' and press ENTER...",
186
+ show_label=True,
187
+ elem_id="search-bar",
188
+ )
189
+
190
+ model_types_misc = gr.CheckboxGroup(
191
+ label="Select model type",
192
+ choices=[
193
+ (
194
+ f"Pretrained {T_SYMBOLS['pretrained']}",
195
+ T_SYMBOLS["pretrained"],
196
+ ),
197
+ (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
198
+ ],
199
+ value=list(T_SYMBOLS.values()),
200
+ )
201
 
202
  with gr.Row():
203
+ langs_bar_misc = gr.CheckboxGroup(
204
+ choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
205
+ value=core.languages_list,
206
+ label="Select languages to average over",
207
  elem_id="column-select",
208
  interactive=True,
209
+ scale=6,
210
+ )
211
+ with gr.Column(scale=1):
212
+ clear_misc = gr.ClearButton(
213
+ langs_bar_misc,
214
+ value="Deselect all languages",
215
+ size="sm",
216
+ scale=1,
217
+ )
218
+ select_misc = gr.Button(
219
+ value="Select all languages",
220
+ size="sm",
221
+ scale=1,
222
+ )
223
+ select_misc.click(
224
+ lambda: gr.CheckboxGroup(value=core.languages_list),
225
+ inputs=[],
226
+ outputs=langs_bar_misc,
227
+ )
228
+
229
+ with gr.Row():
230
+ shown_tasks_misc = gr.CheckboxGroup(
231
+ choices=core.get_available_task_groups(core.get_selected_task_type(1), False),
232
+ value=core.get_available_task_groups(core.get_selected_task_type(1), False),
233
+ label="Select tasks to show",
234
+ elem_id="column-select",
235
+ interactive=True,
236
+ scale=50,
237
+ )
238
+ clear_tasks_misc = gr.ClearButton(
239
+ shown_tasks_misc,
240
+ value="Deselect all tasks",
241
+ size="sm",
242
+ scale=1,
243
+ )
244
+ select_all_tasks_misc = gr.Button(
245
+ value="Select all tasks",
246
+ size="sm",
247
+ scale=1,
248
+ )
249
+ select_all_tasks_misc.click(
250
+ lambda: gr.CheckboxGroup(value=core.get_available_task_groups(core.get_selected_task_type(1), False)),
251
+ inputs=[],
252
+ outputs=shown_tasks_misc,
253
+ )
254
+
255
+ leaderboard_table_misc = gr.Dataframe()
256
+
257
+ with gr.TabItem(
258
+ "๐ŸŒ LLM MT-Bench benchmark",
259
+ elem_id="llm-benchmark-tab-table-mtbench",
260
+ id=2,
261
+ ) as mtbench:
262
+ with gr.Column():
263
+ with gr.Row():
264
+ search_bar_mtbench = gr.Textbox(
265
+ label="Search models",
266
+ placeholder=" ๐Ÿ” Separate multiple queries with ';' and press ENTER...",
267
+ show_label=True,
268
+ elem_id="search-bar",
269
+ )
270
+
271
+ with gr.Row():
272
+ langs_bar_mtbench = gr.CheckboxGroup(
273
+ choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.mt_bench_language_list],
274
+ value=core.mt_bench_language_list,
275
+ label="Select languages to average over",
276
+ elem_id="column-select",
277
+ interactive=True,
278
+ scale=6,
279
+ )
280
+ with gr.Column(scale=1):
281
+ clear_mtbench = gr.ClearButton(
282
+ langs_bar_mtbench,
283
+ value="Deselect all languages",
284
+ size="sm",
285
+ scale=1,
286
  )
287
+ select_mtbench = gr.Button(
288
+ value="Select all languages",
289
+ size="sm",
290
+ scale=1,
 
291
  )
292
+ select_mtbench.click(
293
+ lambda: gr.CheckboxGroup(value=core.mt_bench_language_list),
294
+ inputs=[],
295
+ outputs=langs_bar_mtbench,
296
+ )
297
+
298
+ leaderboard_table_mtbench = gr.Dataframe(scale=5)
299
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  for comp, fn in [
301
  (search_bar, "submit"),
302
  (langs_bar, "change"),
303
  (shown_tasks, "change"),
 
304
  (model_types, "change"),
305
  ]:
306
  getattr(comp, fn)(
307
  core.update_df,
308
+ [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
309
  leaderboard_table,
310
  )
311
+
312
+ for comp, fn in [
313
+ (search_bar_zero_shot, "submit"),
314
+ (model_types_zero_shot, "change"),
315
+ (langs_bar_zero_shot, "change"),
316
+ (shown_tasks_zero_shot, "change"),
317
+ ]:
318
  getattr(comp, fn)(
319
  core.update_df,
320
+ [shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, gr.State(value=False)],
321
+ leaderboard_table_zero_shot,
322
+ )
323
+
324
+ for comp, fn in [
325
+ (search_bar_misc, "submit"),
326
+ (langs_bar_misc, "change"),
327
+ (shown_tasks_misc, "change"),
328
+ (model_types_misc, "change"),
329
+ ]:
330
+ getattr(comp, fn)(
331
+ core.update_df,
332
+ [shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
333
  leaderboard_table_misc,
334
  )
335
+
336
+ for comp, fn in [
337
+ (search_bar_mtbench, "submit"),
338
+ (langs_bar_mtbench, "change"),
339
+ ]:
340
  getattr(comp, fn)(
341
  core.update_df,
342
+ [gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)), search_bar_mtbench, langs_bar_mtbench, gr.State(value=[T_SYMBOLS["chat"]]), gr.State(value=False)], # TODO
343
  leaderboard_table_mtbench,
344
  )
345
 
346
  gr.Blocks.load(
347
  block=demo,
348
  fn=core.update_df,
349
+ inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
350
  outputs=leaderboard_table,
351
  )
352
 
353
  gr.Blocks.load(
354
  block=demo,
355
  fn=core.update_df,
356
+ inputs=[shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, gr.State(value=False)],
357
+ outputs=leaderboard_table_zero_shot,
358
+ )
359
+
360
+ gr.Blocks.load(
361
+ block=demo,
362
+ fn=core.update_df,
363
+ inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
364
  outputs=leaderboard_table_misc,
365
  )
366
 
367
  gr.Blocks.load(
368
  block=demo,
369
  fn=core.update_df,
370
+ inputs=[gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)), search_bar_mtbench, langs_bar_mtbench, gr.State(value=[T_SYMBOLS["chat"]]), gr.State(value=False)],
371
  outputs=leaderboard_table_mtbench,
372
  )
373
 
core.py CHANGED
@@ -1,13 +1,11 @@
1
  import itertools
2
  import os
3
 
4
- import gradio as gr
5
  import numpy as np
6
  import pandas as pd
7
  from datasets import load_dataset
8
 
9
  import style
10
- from style import T_SYMBOLS, MT_BENCH_LANG_SYMBOLS, LANG_SYMBOLS
11
 
12
  ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
13
  FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
@@ -29,7 +27,7 @@ def init():
29
  task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
30
  task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
31
  languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
32
- mt_bench_language_list = hidden_df[hidden_df['Task_Group'] == "MTBENCH"]["Language"].drop_duplicates().str.upper().tolist()
33
  model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
34
  model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
35
 
@@ -115,8 +113,7 @@ def update_df(
115
 
116
  # aggregate results over languages per task
117
  df = aggregate_langs(df, tasks, langs)
118
-
119
- df = df.sort_values(by='Average', ascending=False)
120
 
121
  # filter models by search bar and model type
122
  df = search_model(df, model_query)
@@ -128,67 +125,8 @@ def update_df(
128
  return sort_cols(df, fewshot)
129
 
130
 
131
- def update_task_groups_and_fewshot(current_selected_tab: int, model_types, langs_bar, is_fewshot_current: bool = False, ):
132
- selected_task_type = get_selected_task_type(current_selected_tab)
133
- available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
134
- new_selected_tasks = available_tasks.copy()
135
-
136
- tasks_checkbox_group_update = gr.CheckboxGroup(
137
- choices=available_tasks,
138
- value=new_selected_tasks,
139
- )
140
-
141
- if current_selected_tab == 0:
142
- is_fewshot_new = is_fewshot_current
143
- fewshot_available = True
144
- elif current_selected_tab == 1:
145
- is_fewshot_new = False
146
- fewshot_available = False
147
- elif current_selected_tab == 2:
148
- is_fewshot_new = False
149
- fewshot_available = False
150
- else:
151
- raise ValueError(f"Unknown tab id {current_selected_tab}")
152
-
153
- fewshot_radio_update = gr.Radio(
154
- value=is_fewshot_new,
155
- interactive=fewshot_available,
156
- )
157
-
158
- if current_selected_tab == 2:
159
- model_types = gr.CheckboxGroup(
160
- value=[T_SYMBOLS['chat']],
161
- interactive=False
162
- )
163
- langs_bar = gr.CheckboxGroup(
164
- choices=[(MT_BENCH_LANG_SYMBOLS.get(l, l), l) for l in mt_bench_language_list],
165
- value=mt_bench_language_list,
166
- interactive=True,
167
- )
168
- else:
169
- model_types = gr.CheckboxGroup(
170
- label="Select model type",
171
- choices=[
172
- (
173
- f"Pretrained {T_SYMBOLS['pretrained']}",
174
- T_SYMBOLS["pretrained"],
175
- ),
176
- (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
177
- ],
178
- value=list(T_SYMBOLS.values()),
179
- interactive=True
180
- )
181
- langs_bar = gr.CheckboxGroup(
182
- choices=[(LANG_SYMBOLS.get(l, l), l) for l in languages_list],
183
- value=languages_list,
184
- interactive=True,
185
- )
186
-
187
- return [tasks_checkbox_group_update, fewshot_radio_update, current_selected_tab, model_types, langs_bar]
188
-
189
-
190
  def get_selected_task_type(task_type_id):
191
- task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score"}
192
  selected_task_type = task_types[task_type_id]
193
  return selected_task_type
194
 
 
1
  import itertools
2
  import os
3
 
 
4
  import numpy as np
5
  import pandas as pd
6
  from datasets import load_dataset
7
 
8
  import style
 
9
 
10
  ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
11
  FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 
27
  task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
28
  task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
29
  languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
30
+ mt_bench_language_list = hidden_df[hidden_df["Task_Group"] == "MTBENCH"]["Language"].drop_duplicates().str.upper().tolist()
31
  model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
32
  model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
33
 
 
113
 
114
  # aggregate results over languages per task
115
  df = aggregate_langs(df, tasks, langs)
116
+ df = df.sort_values(by="Average", ascending=False)
 
117
 
118
  # filter models by search bar and model type
119
  df = search_model(df, model_query)
 
125
  return sort_cols(df, fewshot)
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def get_selected_task_type(task_type_id):
129
+ task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score", 3: "accuracy"}
130
  selected_task_type = task_types[task_type_id]
131
  return selected_task_type
132
 
style.py CHANGED
@@ -11,10 +11,101 @@ CSS = """
11
  }
12
  """
13
 
14
- T_SYMBOLS = {
15
- "pretrained": "๐ŸŸข",
16
- "chat": "๐Ÿ’ฌ"
 
 
 
 
17
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  LANG_SYMBOLS = {
20
  "BG": "๐Ÿ‡ง๐Ÿ‡ฌ BG",
@@ -37,13 +128,7 @@ LANG_SYMBOLS = {
37
  "RO": "๐Ÿ‡ท๐Ÿ‡ด RO",
38
  "SK": "๐Ÿ‡ธ๐Ÿ‡ฐ SK",
39
  "SL": "๐Ÿ‡ธ๐Ÿ‡ฎ SL",
40
- "SV": "๐Ÿ‡ธ๐Ÿ‡ช SV"
41
  }
42
 
43
- MT_BENCH_LANG_SYMBOLS = {
44
- "ES": "๐Ÿ‡ช๐Ÿ‡ธ ES",
45
- "EN": "๐Ÿ‡ฌ๐Ÿ‡ง EN",
46
- "DE": "๐Ÿ‡ฉ๐Ÿ‡ช DE",
47
- "FR": "๐Ÿ‡ซ๐Ÿ‡ท FR",
48
- "IT": "๐Ÿ‡ฎ๐Ÿ‡น IT"
49
- }
 
11
  }
12
  """
13
 
14
+ OPEN_LLM_LEADERBOARD_CSS = """
15
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
16
+ table td:first-child,
17
+ table th:first-child {
18
+ max-width: 400px;
19
+ overflow: auto;
20
+ white-space: nowrap;
21
  }
22
+ /* Full width space */
23
+ .gradio-container {
24
+ max-width: 95% !important;
25
+ }
26
+ /* Text style and margins */
27
+ .markdown-text {
28
+ font-size: 16px !important;
29
+ }
30
+ #models-to-add-text {
31
+ font-size: 18px !important;
32
+ }
33
+ #citation-button span {
34
+ font-size: 16px !important;
35
+ }
36
+ #citation-button textarea {
37
+ font-size: 16px !important;
38
+ }
39
+ #citation-button > label > button {
40
+ margin: 6px;
41
+ transform: scale(1.3);
42
+ }
43
+ #search-bar-table-box > div:first-child {
44
+ background: none;
45
+ border: none;
46
+ }
47
+ #search-bar {
48
+ padding: 0px;
49
+ }
50
+ .tab-buttons button {
51
+ font-size: 20px;
52
+ }
53
+ /* Filters style */
54
+ #filter_type {
55
+ border: 0;
56
+ padding-left: 0;
57
+ padding-top: 0;
58
+ }
59
+ #filter_type label {
60
+ display: flex;
61
+ }
62
+ #filter_type label > span {
63
+ margin-top: var(--spacing-lg);
64
+ margin-right: 0.5em;
65
+ }
66
+ #filter_type label > .wrap {
67
+ width: 103px;
68
+ }
69
+ #filter_type label > .wrap .wrap-inner {
70
+ padding: 2px;
71
+ }
72
+ #filter_type label > .wrap .wrap-inner input {
73
+ width: 1px;
74
+ }
75
+ #filter-columns-type {
76
+ border: 0;
77
+ padding: 0.5;
78
+ }
79
+ #filter-columns-size {
80
+ border: 0;
81
+ padding: 0.5;
82
+ }
83
+ #box-filter > .form {
84
+ border: 0;
85
+ }
86
+ /* Header styles */
87
+ #header-title {
88
+ text-align: left;
89
+ display: inline-block;
90
+ }
91
+ #header-row {
92
+ display: flex;
93
+ justify-content: space-between;
94
+ align-items: center;
95
+ }
96
+ #header-row .gradio-html {
97
+ flex-grow: 1;
98
+ }
99
+ #oauth-button {
100
+ height: auto;
101
+ min-width: max-content;
102
+ white-space: nowrap;
103
+ padding: 10px 20px;
104
+ border-radius: 4px;
105
+ }
106
+ """
107
+
108
+ T_SYMBOLS = {"pretrained": "๐ŸŸข", "chat": "๐Ÿ’ฌ"}
109
 
110
  LANG_SYMBOLS = {
111
  "BG": "๐Ÿ‡ง๐Ÿ‡ฌ BG",
 
128
  "RO": "๐Ÿ‡ท๐Ÿ‡ด RO",
129
  "SK": "๐Ÿ‡ธ๐Ÿ‡ฐ SK",
130
  "SL": "๐Ÿ‡ธ๐Ÿ‡ฎ SL",
131
+ "SV": "๐Ÿ‡ธ๐Ÿ‡ช SV",
132
  }
133
 
134
+ MT_BENCH_LANG_SYMBOLS = {"ES": "๐Ÿ‡ช๐Ÿ‡ธ ES", "EN": "๐Ÿ‡ฌ๐Ÿ‡ง EN", "DE": "๐Ÿ‡ฉ๐Ÿ‡ช DE", "FR": "๐Ÿ‡ซ๐Ÿ‡ท FR", "IT": "๐Ÿ‡ฎ๐Ÿ‡น IT"}