Commit
·
064c980
1
Parent(s):
b605a32
Add EMMA benchmark
Browse files- app.py +29 -5
- emma_mini.jsonl +4 -0
- models.jsonl +1 -0
app.py
CHANGED
@@ -190,6 +190,7 @@ with gr.Blocks() as demo:
|
|
190 |
| Humanity's Last Exam | 🔴 7% |
|
191 |
| BigCodeBench | 🟠 36% |
|
192 |
| Simple Bench | 🟠 42% |
|
|
|
193 |
| PlanBench | 🟠 53% |
|
194 |
| GAIA | 🟡 65% |
|
195 |
| LiveBench Language | 🟡 65% |
|
@@ -233,6 +234,11 @@ with gr.Blocks() as demo:
|
|
233 |
simple_bench_markdown: gr.Markdown = gr.Markdown(
|
234 |
value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
|
235 |
)
|
|
|
|
|
|
|
|
|
|
|
236 |
with gr.Tab("🟠 PlanBench") as planbench_tab:
|
237 |
planbench_plot: gr.Plot = gr.Plot()
|
238 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
@@ -331,6 +337,11 @@ with gr.Blocks() as demo:
|
|
331 |
swe_bench_markdown: gr.Markdown = gr.Markdown(
|
332 |
value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
|
333 |
)
|
|
|
|
|
|
|
|
|
|
|
334 |
with gr.Tab("WebArena", visible=False):
|
335 |
webarena_plot: gr.Plot = gr.Plot()
|
336 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
@@ -341,11 +352,6 @@ with gr.Blocks() as demo:
|
|
341 |
osworld_markdown: gr.Markdown = gr.Markdown(
|
342 |
value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
|
343 |
)
|
344 |
-
with gr.Tab("EMMA-Mini", visible=False):
|
345 |
-
emma_plot: gr.Plot = gr.Plot()
|
346 |
-
emma_markdown: gr.Markdown = gr.Markdown(
|
347 |
-
value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
|
348 |
-
)
|
349 |
with gr.Tab("MathVista", visible=False):
|
350 |
mathvista_plot: gr.Plot = gr.Plot()
|
351 |
mathvista_markdown: gr.Markdown = gr.Markdown(
|
@@ -437,6 +443,16 @@ with gr.Blocks() as demo:
|
|
437 |
quality_markdown: gr.Markdown = gr.Markdown(
|
438 |
value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
|
439 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
with gr.Tab("Finance") as finance_tab:
|
441 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
442 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
@@ -615,6 +631,14 @@ with gr.Blocks() as demo:
|
|
615 |
"\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
|
616 |
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
|
617 |
outputs=livecodebench_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
618 |
|
619 |
if __name__ == "__main__":
|
620 |
demo.launch()
|
|
|
190 |
| Humanity's Last Exam | 🔴 7% |
|
191 |
| BigCodeBench | 🟠 36% |
|
192 |
| Simple Bench | 🟠 42% |
|
193 |
+
| EMMA-Mini | 🟠 48% |
|
194 |
| PlanBench | 🟠 53% |
|
195 |
| GAIA | 🟡 65% |
|
196 |
| LiveBench Language | 🟡 65% |
|
|
|
234 |
simple_bench_markdown: gr.Markdown = gr.Markdown(
|
235 |
value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
|
236 |
)
|
237 |
+
with gr.Tab("🟠 EMMA-Mini") as emma_tab:
|
238 |
+
emma_plot: gr.Plot = gr.Plot()
|
239 |
+
emma_markdown: gr.Markdown = gr.Markdown(
|
240 |
+
value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
|
241 |
+
)
|
242 |
with gr.Tab("🟠 PlanBench") as planbench_tab:
|
243 |
planbench_plot: gr.Plot = gr.Plot()
|
244 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
|
|
337 |
swe_bench_markdown: gr.Markdown = gr.Markdown(
|
338 |
value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
|
339 |
)
|
340 |
+
with gr.Tab("SWE-bench Multimodal", visible=False):
|
341 |
+
swe_bench_multimodal_plot: gr.Plot = gr.Plot()
|
342 |
+
swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown(
|
343 |
+
value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)"""
|
344 |
+
)
|
345 |
with gr.Tab("WebArena", visible=False):
|
346 |
webarena_plot: gr.Plot = gr.Plot()
|
347 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
|
|
352 |
osworld_markdown: gr.Markdown = gr.Markdown(
|
353 |
value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
|
354 |
)
|
|
|
|
|
|
|
|
|
|
|
355 |
with gr.Tab("MathVista", visible=False):
|
356 |
mathvista_plot: gr.Plot = gr.Plot()
|
357 |
mathvista_markdown: gr.Markdown = gr.Markdown(
|
|
|
443 |
quality_markdown: gr.Markdown = gr.Markdown(
|
444 |
value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
|
445 |
)
|
446 |
+
with gr.Tab("MMVU", visible=False):
|
447 |
+
mmvu_plot: gr.Plot = gr.Plot()
|
448 |
+
mmvu_markdown: gr.Markdown = gr.Markdown(
|
449 |
+
value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)"""
|
450 |
+
)
|
451 |
+
with gr.Tab("PhysBench", visible=False):
|
452 |
+
physbench_plot: gr.Plot = gr.Plot()
|
453 |
+
physbench_markdown: gr.Markdown = gr.Markdown(
|
454 |
+
value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)"""
|
455 |
+
)
|
456 |
with gr.Tab("Finance") as finance_tab:
|
457 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
458 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
|
|
631 |
"\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
|
632 |
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
|
633 |
outputs=livecodebench_plot)
|
634 |
+
emma_tab.select(fn=create_simple_plot,
|
635 |
+
inputs=[gr.State("emma_mini.jsonl"),
|
636 |
+
gr.State("EMMA-Mini (Enhanced MultiModal ReAsoning) Score"),
|
637 |
+
gr.State("\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)"),
|
638 |
+
gr.State(date(2024, 9, 17)), gr.State(date(2025, 2, 1)),
|
639 |
+
gr.State(22.75), gr.State(100),
|
640 |
+
gr.State({"Human experts": 77.75})],
|
641 |
+
outputs=emma_plot)
|
642 |
|
643 |
if __name__ == "__main__":
|
644 |
demo.launch()
|
emma_mini.jsonl
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 48.00}
|
2 |
+
{"model": "o1-2024-12-17", "score": 45.75}
|
3 |
+
{"model": "gemini-2.0-flash-thinking-exp-1219", "score": 43.50}
|
4 |
+
{"model": "qwen2-vl-72b-instruct", "score": 37.25}
|
models.jsonl
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
{"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
{"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
1 |
+
{"Name": "qwen2-vl-72b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
{"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
4 |
{"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|