kaizuberbuehler committed
Commit 064c980 · Parent: b605a32

Add EMMA benchmark

Files changed (3)
  1. app.py +29 -5
  2. emma_mini.jsonl +4 -0
  3. models.jsonl +1 -0
app.py CHANGED
@@ -190,6 +190,7 @@ with gr.Blocks() as demo:
     | Humanity's Last Exam | 🔴 7% |
     | BigCodeBench | 🟠 36% |
     | Simple Bench | 🟠 42% |
+    | EMMA-Mini | 🟠 48% |
     | PlanBench | 🟠 53% |
     | GAIA | 🟡 65% |
     | LiveBench Language | 🟡 65% |
@@ -233,6 +234,11 @@ with gr.Blocks() as demo:
         simple_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
         )
+    with gr.Tab("🟠 EMMA-Mini") as emma_tab:
+        emma_plot: gr.Plot = gr.Plot()
+        emma_markdown: gr.Markdown = gr.Markdown(
+            value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
+        )
     with gr.Tab("🟠 PlanBench") as planbench_tab:
         planbench_plot: gr.Plot = gr.Plot()
         planbench_markdown: gr.Markdown = gr.Markdown(
@@ -331,6 +337,11 @@ with gr.Blocks() as demo:
         swe_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
         )
+    with gr.Tab("SWE-bench Multimodal", visible=False):
+        swe_bench_multimodal_plot: gr.Plot = gr.Plot()
+        swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown(
+            value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)"""
+        )
     with gr.Tab("WebArena", visible=False):
         webarena_plot: gr.Plot = gr.Plot()
         webarena_markdown: gr.Markdown = gr.Markdown(
@@ -341,11 +352,6 @@ with gr.Blocks() as demo:
         osworld_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
         )
-    with gr.Tab("EMMA-Mini", visible=False):
-        emma_plot: gr.Plot = gr.Plot()
-        emma_markdown: gr.Markdown = gr.Markdown(
-            value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
-        )
     with gr.Tab("MathVista", visible=False):
         mathvista_plot: gr.Plot = gr.Plot()
         mathvista_markdown: gr.Markdown = gr.Markdown(
@@ -437,6 +443,16 @@ with gr.Blocks() as demo:
         quality_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
         )
+    with gr.Tab("MMVU", visible=False):
+        mmvu_plot: gr.Plot = gr.Plot()
+        mmvu_markdown: gr.Markdown = gr.Markdown(
+            value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)"""
+        )
+    with gr.Tab("PhysBench", visible=False):
+        physbench_plot: gr.Plot = gr.Plot()
+        physbench_markdown: gr.Markdown = gr.Markdown(
+            value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)"""
+        )
     with gr.Tab("Finance") as finance_tab:
         with gr.Tab("Big Tech Capex") as big_five_capex_tab:
             big_five_capex_plot: gr.Plot = gr.Plot()
@@ -615,6 +631,14 @@ with gr.Blocks() as demo:
                             "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
                     gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
             outputs=livecodebench_plot)
+    emma_tab.select(fn=create_simple_plot,
+                    inputs=[gr.State("emma_mini.jsonl"),
+                            gr.State("EMMA-Mini (Enhanced MultiModal ReAsoning) Score"),
+                            gr.State("\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)"),
+                            gr.State(date(2024, 9, 17)), gr.State(date(2025, 2, 1)),
+                            gr.State(22.75), gr.State(100),
+                            gr.State({"Human experts": 77.75})],
+                    outputs=emma_plot)
 
 if __name__ == "__main__":
     demo.launch()
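The new `emma_tab.select` handler reuses the app's existing `create_simple_plot` helper, whose body is not part of this diff. The sketch below is only a guess at a function consistent with that call site: the argument order (scores file, plot title, description, date range, y-axis bounds, named baseline lines) and the join against `models.jsonl` release dates are assumptions inferred from the `gr.State` inputs, not the actual implementation in app.py.

```python
# Hypothetical sketch of create_simple_plot, inferred from the call site above.
# The real helper already exists in app.py and may differ in its details.
from datetime import date

import pandas as pd
import plotly.express as px


def create_simple_plot(filename: str, title: str, description: str,
                       start_date: date, end_date: date,
                       y_min: float = 0.0, y_max: float = 100.0,
                       baselines: dict | None = None):
    # Per-model scores, e.g. emma_mini.jsonl: {"model": ..., "score": ...}
    scores = pd.read_json(filename, lines=True)
    # Release dates and metadata come from models.jsonl, keyed by "Name".
    models = pd.read_json("models.jsonl", lines=True)
    df = scores.merge(models, left_on="model", right_on="Name", how="left")
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    fig = px.scatter(
        df, x="Release Date", y="score", text="model",
        title=f"{title}<br><sup>{description}</sup>",
    )
    fig.update_xaxes(range=[pd.Timestamp(start_date), pd.Timestamp(end_date)])
    fig.update_yaxes(range=[y_min, y_max])
    # Horizontal reference lines, e.g. {"Human experts": 77.75} from the call above.
    for name, value in (baselines or {}).items():
        fig.add_hline(y=value, line_dash="dash", annotation_text=name)
    return fig
```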
emma_mini.jsonl ADDED
@@ -0,0 +1,4 @@
+{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 48.00}
+{"model": "o1-2024-12-17", "score": 45.75}
+{"model": "gemini-2.0-flash-thinking-exp-1219", "score": 43.50}
+{"model": "qwen2-vl-72b-instruct", "score": 37.25}
models.jsonl CHANGED
@@ -1,3 +1,4 @@
+{"Name": "qwen2-vl-72b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}