ajude committed
Commit 1c5b4ad · Parent: 07a2d86

fix(leaderboard):

1. Fixed the issue where, with markdown as the data type of the model_name column, the text overflows into the next column.
2. Removed the dependency on a symbol for determining the model type.
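
A note on item 1: the fix drops the column_widths/wrap workaround and declares an explicit per-column datatype instead, so Gradio sizes the columns itself while still rendering model_name as a hyperlink. A minimal self-contained sketch of the pattern (headers and values here are invented for illustration; only datatype=["str", "markdown", "number"] is taken from this commit):

import gradio as gr

with gr.Blocks() as demo:
    # "markdown" makes the model_name cell render as a link instead of raw text;
    # leaving column widths to Gradio avoids the overflow seen with column_widths/wrap.
    gr.Dataframe(
        headers=["id", "model_name", "score"],
        value=[["model-a", "[model-a](https://huggingface.co/org/model-a)", 0.91]],
        datatype=["str", "markdown", "number"],
    )

demo.launch()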

Files changed (2)
  1. app.py +24 -26
  2. core.py +70 -3
app.py CHANGED
@@ -1,8 +1,8 @@
 import gradio as gr
+from gradio_rangeslider import RangeSlider
 
 import core as core
-from style import CSS, LANG_SYMBOLS, MT_BENCH_LANG_SYMBOLS, T_SYMBOLS, TITLE
-from gradio_rangeslider import RangeSlider
+from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE
 
 demo = gr.Blocks(css=CSS)
 with demo:
@@ -100,9 +100,8 @@ with demo:
             inputs=[],
             outputs=shown_tasks,
         )
-        # TODO When adding markdown as the data type of the model_name column, the text is getting overflown into the next column.
-        # leaderboard_table = gr.Dataframe(datatype=['str', 'markdown'])
-        leaderboard_table = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
+
+        leaderboard_table = gr.Dataframe(datatype=["str", "markdown", "number"])
 
     with gr.TabItem(
         "🏅 LLM accuracy benchmark (Zero-Shot)",
@@ -188,8 +187,7 @@ with demo:
             inputs=[],
             outputs=shown_tasks_zero_shot,
         )
-        leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"],
-                                                   wrap=False)
+        leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown", "number"])
 
     with gr.TabItem(
         "🌐 LLM translation benchmark",
@@ -276,7 +274,7 @@ with demo:
             outputs=shown_tasks_misc,
         )
 
-        leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
+        leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown", "number"])
 
     with gr.TabItem(
         "🌐 LLM MT-Bench benchmark",
@@ -319,8 +317,7 @@ with demo:
             outputs=langs_bar_mtbench,
         )
 
-        leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "60%"],
-                                                 wrap=False)
+        leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown", "number"])
 
     for comp, fn in [
         (search_bar, "submit"),
@@ -331,7 +328,7 @@ with demo:
     ]:
         getattr(comp, fn)(
             core.update_df,
-            [shown_tasks, search_bar, langs_bar, model_types, model_sizes, gr.State(value=True)],
+            [gr.State(value=0), shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True), model_types],
             # [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
             leaderboard_table,
         )
@@ -345,8 +342,8 @@ with demo:
     ]:
         getattr(comp, fn)(
             core.update_df,
-            [shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot,
-             model_sizes_zero_shot, gr.State(value=False)],
+            [gr.State(value=1), shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
+             model_sizes_zero_shot, gr.State(value=False), model_types_zero_shot],
             leaderboard_table_zero_shot,
         )
 
@@ -359,8 +356,8 @@ with demo:
     ]:
         getattr(comp, fn)(
             core.update_df,
-            [shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, model_sizes_misc,
-             gr.State(value=False)],
+            [gr.State(value=2), shown_tasks_misc, search_bar_misc, langs_bar_misc, model_sizes_misc,
+             gr.State(value=False), model_types_misc],
             leaderboard_table_misc,
         )
 
@@ -370,41 +367,42 @@ with demo:
     ]:
         getattr(comp, fn)(
            core.update_df,
-            [gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
-             search_bar_mtbench, langs_bar_mtbench, gr.State(value=[T_SYMBOLS["chat"]]), gr.State(value=False)],
-            # TODO
+            [gr.State(value=3),
+             gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
+             search_bar_mtbench, langs_bar_mtbench, gr.State(value=False)],
             leaderboard_table_mtbench,
         )
 
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks, search_bar, langs_bar, model_types, model_sizes, gr.State(value=True)],
-        # inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
+        inputs=[gr.State(value=0), shown_tasks, search_bar, langs_bar, model_sizes, gr.State(value=True), model_types],
         outputs=leaderboard_table,
     )
 
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot,
-                model_sizes_zero_shot, gr.State(value=False)],
+        inputs=[gr.State(value=1), shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot,
+                model_sizes_zero_shot, gr.State(value=False), model_types_zero_shot],
         outputs=leaderboard_table_zero_shot,
     )
 
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, model_sizes_misc,
-                gr.State(value=False)],
+        inputs=[gr.State(value=2), shown_tasks_misc, search_bar_misc, langs_bar_misc, model_sizes_misc,
+                gr.State(value=False), model_types_misc],
         outputs=leaderboard_table_misc,
     )
 
+    # We do not have a checkbox for model_type in mt_bench, hence there is no model_types variable
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
-                search_bar_mtbench, langs_bar_mtbench, gr.State(value=[T_SYMBOLS["chat"]]), gr.State(value=False)],
+        inputs=[gr.State(value=3),
+                gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
+                search_bar_mtbench, langs_bar_mtbench, gr.State(value=False)],
         outputs=leaderboard_table_mtbench,
     )
 
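Throughout these wirings, gr.State(value=N) feeds a constant, first positional tab id into core.update_df next to the live widgets, replacing the old symbol-based model-type argument. A toy sketch of that constant-input pattern (component names are illustrative, not from app.py):

import gradio as gr

def report(tab_id: int, query: str) -> str:
    # tab_id arrives as a constant supplied via gr.State; query is a live input
    return f"tab={tab_id}, query={query!r}"

with gr.Blocks() as demo:
    search = gr.Textbox(label="Model search")
    result = gr.Textbox(label="Result")
    # Same shape as the leaderboard wirings, each tab using a different constant id
    search.submit(report, [gr.State(value=0), search], result)

demo.launch()
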
core.py CHANGED
@@ -1,19 +1,21 @@
 import itertools
 import os
 
+import gradio as gr
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
-from utils import add_model_hyperlink
 
 import style
+from style import T_SYMBOLS, MT_BENCH_LANG_SYMBOLS, LANG_SYMBOLS
+from utils import add_model_hyperlink
 
 ZERO_SHOT_ONLY = ["BELEBELE", "MT-Bench"]
 FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]
 
 
 def init():
-    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict, mt_bench_language_list, model_link_dict, model_size_dict
+    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_df, model_type_dict, mt_bench_language_list, model_link_dict, model_size_dict
 
     repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
     config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
@@ -114,18 +116,23 @@ def select_shots(df: pd.DataFrame, fewshot: bool = False):
 
 
 def update_df(
+    current_selected_tab: int,
     tasks: list[str],
     model_query: str,
     langs: list[str],
-    model_types: list[str],
     model_sizes: list[str],
     fewshot: bool = False,
+    model_types: list[str] = None,
     format: bool = True,
+
 ) -> pd.DataFrame:
     """Return a filtered dataframe according to selected models, tasks and
     languages. The format flag controls whether the output dataframe should
     be formatted to two significant figures.
     """
+    if current_selected_tab == 3:
+        model_types = [T_SYMBOLS["chat"]]
+
     # keep only selected shots
     df = select_shots(hidden_df, fewshot)
 
@@ -147,6 +154,66 @@ def update_df(
     return sort_cols(df, fewshot)
 
 
+def update_task_groups_and_fewshot(current_selected_tab: int, model_types, langs_bar,
+                                   is_fewshot_current: bool = False):
+    selected_task_type = get_selected_task_type(current_selected_tab)
+    available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
+    new_selected_tasks = available_tasks.copy()
+
+    tasks_checkbox_group_update = gr.CheckboxGroup(
+        choices=available_tasks,
+        value=new_selected_tasks,
+    )
+
+    if current_selected_tab == 0:
+        is_fewshot_new = is_fewshot_current
+        fewshot_available = True
+    elif current_selected_tab == 1:
+        is_fewshot_new = False
+        fewshot_available = False
+    elif current_selected_tab == 2:
+        is_fewshot_new = False
+        fewshot_available = False
+    else:
+        raise ValueError(f"Unknown tab id {current_selected_tab}")
+
+    fewshot_radio_update = gr.Radio(
+        value=is_fewshot_new,
+        interactive=fewshot_available,
+    )
+
+    if current_selected_tab == 2:
+        model_types = gr.CheckboxGroup(
+            value=[T_SYMBOLS['chat']],
+            interactive=False
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(MT_BENCH_LANG_SYMBOLS.get(l, l), l) for l in mt_bench_language_list],
+            value=mt_bench_language_list,
+            interactive=True,
+        )
+    else:
+        model_types = gr.CheckboxGroup(
+            label="Select model type",
+            choices=[
+                (
+                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                    T_SYMBOLS["pretrained"],
+                ),
+                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+            ],
+            value=list(T_SYMBOLS.values()),
+            interactive=True
+        )
+        langs_bar = gr.CheckboxGroup(
+            choices=[(LANG_SYMBOLS.get(l, l), l) for l in languages_list],
+            value=languages_list,
+            interactive=True,
+        )
+
+    return [tasks_checkbox_group_update, current_selected_tab, model_types, langs_bar]
+
+
 def get_selected_task_type(task_type_id):
     task_types = {0: "accuracy", 1: "misc", 2: "mtbench_score", 3: "accuracy"}
     selected_task_type = task_types[task_type_id]
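
The new update_task_groups_and_fewshot leans on Gradio 4's convention that an event handler may return freshly constructed components, which are applied as in-place updates to the bound outputs. A self-contained sketch of just that mechanism (the choices, labels, and tab ids below are placeholders, not the values from style.py):

import gradio as gr

def restrict_model_types(tab_id: int):
    # Returning a component instance updates only the props set in its constructor
    if tab_id == 2:  # MT-Bench-style tab: chat models only, checkbox locked
        return gr.CheckboxGroup(value=["chat"], interactive=False)
    return gr.CheckboxGroup(value=["pretrained", "chat"], interactive=True)

with gr.Blocks() as demo:
    tab_id = gr.Number(value=0, precision=0, label="Tab id")
    model_types = gr.CheckboxGroup(
        choices=["pretrained", "chat"], value=["pretrained", "chat"]
    )
    tab_id.change(restrict_model_types, tab_id, model_types)

demo.launch()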