Commit
·
b605a32
1
Parent(s):
a2d5ea0
Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
Browse files- app.py +255 -30
- codeforces_leaderboard.jsonl +0 -6
- humanitys_last_exam.jsonl +5 -0
- livebench.jsonl +56 -0
- livebench_coding.jsonl +56 -0
- livebench_data_analysis.jsonl +56 -0
- livebench_if.jsonl +56 -0
- livebench_language.jsonl +56 -0
- livebench_mathematics.jsonl +56 -0
- livebench_reasoning.jsonl +56 -0
- livecodebench.jsonl +26 -0
- models.jsonl +56 -0
- simple_bench_leaderboard.jsonl +5 -1
app.py
CHANGED
@@ -187,16 +187,25 @@ with gr.Blocks() as demo:
|
|
187 |
|
188 |
| Benchmark | Top Score |
|
189 |
|-----------|-----------|
|
|
|
190 |
| BigCodeBench | 🟠 36% |
|
191 |
| Simple Bench | 🟠 42% |
|
192 |
| PlanBench | 🟠 53% |
|
193 |
| GAIA | 🟡 65% |
|
|
|
|
|
|
|
194 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
|
|
195 |
| GPQA | 🟡 76% |
|
|
|
196 |
| ZebraLogic | 🟡 81% |
|
|
|
197 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
|
|
198 |
| ZeroEval | 🟡 86% |
|
199 |
| MATH-L5 | 🟡 89% |
|
|
|
200 |
| MMLU-Redux | 🟢 93% |
|
201 |
| CRUX | 🟢 96% |
|
202 |
|
@@ -209,6 +218,11 @@ with gr.Blocks() as demo:
|
|
209 |
| 🟡 Yellow | 60% to 90% |
|
210 |
| 🟢 Green | Above 90% |"""
|
211 |
)
|
|
|
|
|
|
|
|
|
|
|
212 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
213 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
214 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
@@ -229,6 +243,21 @@ with gr.Blocks() as demo:
|
|
229 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
230 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
231 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
233 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
234 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
@@ -237,16 +266,36 @@ with gr.Blocks() as demo:
|
|
237 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
238 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
239 |
)
|
|
|
|
|
|
|
|
|
|
|
240 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
241 |
gpqa_plot: gr.Plot = gr.Plot()
|
242 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
243 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
244 |
)
|
|
|
|
|
|
|
|
|
|
|
245 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
246 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
247 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
248 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
249 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
251 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
252 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
@@ -257,6 +306,11 @@ with gr.Blocks() as demo:
|
|
257 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
258 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
259 |
)
|
|
|
|
|
|
|
|
|
|
|
260 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
261 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
262 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
@@ -267,8 +321,6 @@ with gr.Blocks() as demo:
|
|
267 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
268 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
269 |
)
|
270 |
-
with gr.Tab("Codeforces") as codeforces_tab:
|
271 |
-
codeforces_plot: gr.Plot = gr.Plot()
|
272 |
with gr.Tab("OpenCompass", visible=False):
|
273 |
opencompass_plot: gr.Plot = gr.Plot()
|
274 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
@@ -284,6 +336,107 @@ with gr.Blocks() as demo:
|
|
284 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
285 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
286 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
with gr.Tab("Finance") as finance_tab:
|
288 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
289 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
@@ -292,24 +445,30 @@ with gr.Blocks() as demo:
|
|
292 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
293 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
294 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
295 |
-
gr.State(
|
296 |
-
|
|
|
|
|
297 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
298 |
gr.State(0), gr.State(100),
|
299 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
300 |
outputs=arc_agi_public_eval_plot)
|
301 |
arc_agi_tab.select(fn=create_simple_plot,
|
302 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
303 |
-
gr.State(
|
304 |
-
|
|
|
|
|
305 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
306 |
gr.State(0), gr.State(100),
|
307 |
gr.State({"MTurkers": 77})],
|
308 |
outputs=arc_agi_semi_private_eval_plot)
|
309 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
310 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
311 |
-
gr.State(
|
312 |
-
|
|
|
|
|
313 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
314 |
gr.State(0), gr.State(100),
|
315 |
gr.State({"MTurkers": 77})],
|
@@ -318,35 +477,31 @@ with gr.Blocks() as demo:
|
|
318 |
simple_bench_tab.select(fn=create_simple_plot,
|
319 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
320 |
gr.State("Simple Bench Score"),
|
321 |
-
gr.State(
|
322 |
-
|
|
|
323 |
gr.State(0), gr.State(100),
|
324 |
gr.State({"Humans": 83.7})],
|
325 |
outputs=simple_bench_plot)
|
326 |
-
codeforces_tab.select(fn=create_simple_plot,
|
327 |
-
inputs=[gr.State("codeforces_leaderboard.jsonl"),
|
328 |
-
gr.State("Codeforces Rating"),
|
329 |
-
gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
|
330 |
-
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
331 |
-
gr.State(0), gr.State(4000),
|
332 |
-
gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
|
333 |
-
outputs=codeforces_plot)
|
334 |
planbench_tab.select(fn=create_simple_plot,
|
335 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
336 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
337 |
-
gr.State(
|
|
|
338 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
339 |
outputs=planbench_plot)
|
340 |
bigcodebench_tab.select(fn=create_simple_plot,
|
341 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
342 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
343 |
-
gr.State(
|
|
|
344 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
345 |
outputs=bigcodebench_plot)
|
346 |
gaia_tab.select(fn=create_simple_plot,
|
347 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
348 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
349 |
-
gr.State(
|
|
|
350 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
351 |
gr.State(0), gr.State(100),
|
352 |
gr.State({"Humans": 92})],
|
@@ -354,7 +509,8 @@ with gr.Blocks() as demo:
|
|
354 |
gpqa_tab.select(fn=create_simple_plot,
|
355 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
356 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
357 |
-
gr.State(
|
|
|
358 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
359 |
gr.State(25), gr.State(100),
|
360 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
@@ -362,34 +518,103 @@ with gr.Blocks() as demo:
|
|
362 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
363 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
364 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
365 |
-
gr.State(
|
|
|
366 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
367 |
outputs=zeroeval_average_plot)
|
368 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
369 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
370 |
-
gr.State(
|
371 |
-
|
|
|
|
|
372 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
373 |
outputs=zeroeval_mmlu_redux_plot)
|
374 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
375 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
376 |
gr.State("ZeroEval ZebraLogic Score"),
|
377 |
-
gr.State(
|
|
|
378 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
379 |
outputs=zeroeval_zebralogic_plot)
|
380 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
381 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
382 |
-
gr.State(
|
383 |
-
|
|
|
|
|
384 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
385 |
outputs=zeroeval_crux_plot)
|
386 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
387 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
388 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
389 |
-
gr.State(
|
|
|
390 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
391 |
outputs=zeroeval_math_l5_plot)
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
|
394 |
if __name__ == "__main__":
|
395 |
demo.launch()
|
|
|
187 |
|
188 |
| Benchmark | Top Score |
|
189 |
|-----------|-----------|
|
190 |
+
| Humanity's Last Exam | 🔴 7% |
|
191 |
| BigCodeBench | 🟠 36% |
|
192 |
| Simple Bench | 🟠 42% |
|
193 |
| PlanBench | 🟠 53% |
|
194 |
| GAIA | 🟡 65% |
|
195 |
+
| LiveBench Language | 🟡 65% |
|
196 |
+
| LiveBench Data Analysis | 🟡 71% |
|
197 |
+
| LiveCodeBench | 🟡 73% |
|
198 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
199 |
+
| LiveBench | 🟡 76% |
|
200 |
| GPQA | 🟡 76% |
|
201 |
+
| LiveBench Mathematics | 🟡 81% |
|
202 |
| ZebraLogic | 🟡 81% |
|
203 |
+
| LiveBench Coding | 🟡 83% |
|
204 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
205 |
+
| LiveBench IF | 🟡 86% |
|
206 |
| ZeroEval | 🟡 86% |
|
207 |
| MATH-L5 | 🟡 89% |
|
208 |
+
| LiveBench Reasoning | 🟢 92% |
|
209 |
| MMLU-Redux | 🟢 93% |
|
210 |
| CRUX | 🟢 96% |
|
211 |
|
|
|
218 |
| 🟡 Yellow | 60% to 90% |
|
219 |
| 🟢 Green | Above 90% |"""
|
220 |
)
|
221 |
+
with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab:
|
222 |
+
humanitys_last_exam_plot: gr.Plot = gr.Plot()
|
223 |
+
humanitys_last_exam_markdown: gr.Markdown = gr.Markdown(
|
224 |
+
value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)"""
|
225 |
+
)
|
226 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
227 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
228 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
|
|
243 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
244 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
245 |
)
|
246 |
+
with gr.Tab("🟡 LiveBench Language") as livebench_language_tab:
|
247 |
+
livebench_language_plot: gr.Plot = gr.Plot()
|
248 |
+
livebench_language_markdown: gr.Markdown = gr.Markdown(
|
249 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
250 |
+
)
|
251 |
+
with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab:
|
252 |
+
livebench_data_analysis_plot: gr.Plot = gr.Plot()
|
253 |
+
livebench_data_analysis_markdown: gr.Markdown = gr.Markdown(
|
254 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
255 |
+
)
|
256 |
+
with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab:
|
257 |
+
livecodebench_plot: gr.Plot = gr.Plot()
|
258 |
+
livecodebench_markdown: gr.Markdown = gr.Markdown(
|
259 |
+
value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)"""
|
260 |
+
)
|
261 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
262 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
263 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
|
|
266 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
267 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
268 |
)
|
269 |
+
with gr.Tab("🟡 LiveBench") as livebench_tab:
|
270 |
+
livebench_plot: gr.Plot = gr.Plot()
|
271 |
+
livebench_markdown: gr.Markdown = gr.Markdown(
|
272 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
273 |
+
)
|
274 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
275 |
gpqa_plot: gr.Plot = gr.Plot()
|
276 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
277 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
278 |
)
|
279 |
+
with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab:
|
280 |
+
livebench_mathematics_plot: gr.Plot = gr.Plot()
|
281 |
+
livebench_mathematics_markdown: gr.Markdown = gr.Markdown(
|
282 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
283 |
+
)
|
284 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
285 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
286 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
287 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
288 |
)
|
289 |
+
with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab:
|
290 |
+
livebench_coding_plot: gr.Plot = gr.Plot()
|
291 |
+
livebench_coding_markdown: gr.Markdown = gr.Markdown(
|
292 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
293 |
+
)
|
294 |
+
with gr.Tab("🟡 LiveBench IF") as livebench_if_tab:
|
295 |
+
livebench_if_plot: gr.Plot = gr.Plot()
|
296 |
+
livebench_if_markdown: gr.Markdown = gr.Markdown(
|
297 |
+
value="""Source: [LiveBench IF](https://livebench.ai/)"""
|
298 |
+
)
|
299 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
300 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
301 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
|
|
306 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
307 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
308 |
)
|
309 |
+
with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab:
|
310 |
+
livebench_reasoning_plot: gr.Plot = gr.Plot()
|
311 |
+
livebench_reasoning_markdown: gr.Markdown = gr.Markdown(
|
312 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
313 |
+
)
|
314 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
315 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
316 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
|
|
321 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
322 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
323 |
)
|
|
|
|
|
324 |
with gr.Tab("OpenCompass", visible=False):
|
325 |
opencompass_plot: gr.Plot = gr.Plot()
|
326 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
|
|
336 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
337 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
338 |
)
|
339 |
+
with gr.Tab("OSWorld", visible=False):
|
340 |
+
osworld_plot: gr.Plot = gr.Plot()
|
341 |
+
osworld_markdown: gr.Markdown = gr.Markdown(
|
342 |
+
value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
|
343 |
+
)
|
344 |
+
with gr.Tab("EMMA-Mini", visible=False):
|
345 |
+
emma_plot: gr.Plot = gr.Plot()
|
346 |
+
emma_markdown: gr.Markdown = gr.Markdown(
|
347 |
+
value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
|
348 |
+
)
|
349 |
+
with gr.Tab("MathVista", visible=False):
|
350 |
+
mathvista_plot: gr.Plot = gr.Plot()
|
351 |
+
mathvista_markdown: gr.Markdown = gr.Markdown(
|
352 |
+
value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)"""
|
353 |
+
)
|
354 |
+
with gr.Tab("DABStep", visible=False):
|
355 |
+
dabstep_plot: gr.Plot = gr.Plot()
|
356 |
+
dabstep_markdown: gr.Markdown = gr.Markdown(
|
357 |
+
value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)"""
|
358 |
+
)
|
359 |
+
with gr.Tab("lineage-bench", visible=False):
|
360 |
+
lineage_bench_plot: gr.Plot = gr.Plot()
|
361 |
+
lineage_bench_markdown: gr.Markdown = gr.Markdown(
|
362 |
+
value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)"""
|
363 |
+
)
|
364 |
+
with gr.Tab("Step-Game", visible=False):
|
365 |
+
step_game_plot: gr.Plot = gr.Plot()
|
366 |
+
step_game_markdown: gr.Markdown = gr.Markdown(
|
367 |
+
value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)"""
|
368 |
+
)
|
369 |
+
with gr.Tab("HHEM", visible=False):
|
370 |
+
hhem_plot: gr.Plot = gr.Plot()
|
371 |
+
hhem_markdown: gr.Markdown = gr.Markdown(
|
372 |
+
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
373 |
+
)
|
374 |
+
with gr.Tab("NYT Connections", visible=False):
|
375 |
+
nyt_connections_exam_plot: gr.Plot = gr.Plot()
|
376 |
+
nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
|
377 |
+
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
378 |
+
)
|
379 |
+
with gr.Tab("USACO", visible=False):
|
380 |
+
usaco_plot: gr.Plot = gr.Plot()
|
381 |
+
usaco_markdown: gr.Markdown = gr.Markdown(
|
382 |
+
value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)"""
|
383 |
+
)
|
384 |
+
with gr.Tab("AppWorld", visible=False):
|
385 |
+
appworld_plot: gr.Plot = gr.Plot()
|
386 |
+
appworld_markdown: gr.Markdown = gr.Markdown(
|
387 |
+
value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)"""
|
388 |
+
)
|
389 |
+
with gr.Tab("CORE-Bench", visible=False):
|
390 |
+
core_bench_plot: gr.Plot = gr.Plot()
|
391 |
+
core_bench_markdown: gr.Markdown = gr.Markdown(
|
392 |
+
value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)"""
|
393 |
+
)
|
394 |
+
with gr.Tab("Cybench", visible=False):
|
395 |
+
cybench_plot: gr.Plot = gr.Plot()
|
396 |
+
cybench_markdown: gr.Markdown = gr.Markdown(
|
397 |
+
value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)"""
|
398 |
+
)
|
399 |
+
with gr.Tab("MultiChallenge", visible=False):
|
400 |
+
multichallenge_plot: gr.Plot = gr.Plot()
|
401 |
+
multichallenge_markdown: gr.Markdown = gr.Markdown(
|
402 |
+
value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)"""
|
403 |
+
)
|
404 |
+
with gr.Tab("VISTA", visible=False):
|
405 |
+
vista_plot: gr.Plot = gr.Plot()
|
406 |
+
vista_markdown: gr.Markdown = gr.Markdown(
|
407 |
+
value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)"""
|
408 |
+
)
|
409 |
+
with gr.Tab("ToolComp", visible=False):
|
410 |
+
with gr.Tab("Enterprise"):
|
411 |
+
toolcomp_enterprise_plot: gr.Plot = gr.Plot()
|
412 |
+
toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown(
|
413 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)"""
|
414 |
+
)
|
415 |
+
with gr.Tab("Chat"):
|
416 |
+
toolcomp_chat_plot: gr.Plot = gr.Plot()
|
417 |
+
toolcomp_chat_markdown: gr.Markdown = gr.Markdown(
|
418 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)"""
|
419 |
+
)
|
420 |
+
with gr.Tab("BFCL", visible=False):
|
421 |
+
bfcl_plot: gr.Plot = gr.Plot()
|
422 |
+
bfcl_markdown: gr.Markdown = gr.Markdown(
|
423 |
+
value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)"""
|
424 |
+
)
|
425 |
+
with gr.Tab("EvalPlus", visible=False):
|
426 |
+
evalplus_plot: gr.Plot = gr.Plot()
|
427 |
+
evalplus_markdown: gr.Markdown = gr.Markdown(
|
428 |
+
value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)"""
|
429 |
+
)
|
430 |
+
with gr.Tab("Aider Polyglot", visible=False):
|
431 |
+
aider_plot: gr.Plot = gr.Plot()
|
432 |
+
aider_markdown: gr.Markdown = gr.Markdown(
|
433 |
+
value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)"""
|
434 |
+
)
|
435 |
+
with gr.Tab("QuALITY", visible=False):
|
436 |
+
quality_plot: gr.Plot = gr.Plot()
|
437 |
+
quality_markdown: gr.Markdown = gr.Markdown(
|
438 |
+
value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
|
439 |
+
)
|
440 |
with gr.Tab("Finance") as finance_tab:
|
441 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
442 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
|
|
445 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
446 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
447 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
448 |
+
gr.State(
|
449 |
+
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
450 |
+
gr.State(
|
451 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
452 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
453 |
gr.State(0), gr.State(100),
|
454 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
455 |
outputs=arc_agi_public_eval_plot)
|
456 |
arc_agi_tab.select(fn=create_simple_plot,
|
457 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
458 |
+
gr.State(
|
459 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
460 |
+
gr.State(
|
461 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
462 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
463 |
gr.State(0), gr.State(100),
|
464 |
gr.State({"MTurkers": 77})],
|
465 |
outputs=arc_agi_semi_private_eval_plot)
|
466 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
467 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
468 |
+
gr.State(
|
469 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
470 |
+
gr.State(
|
471 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
472 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
473 |
gr.State(0), gr.State(100),
|
474 |
gr.State({"MTurkers": 77})],
|
|
|
477 |
simple_bench_tab.select(fn=create_simple_plot,
|
478 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
479 |
gr.State("Simple Bench Score"),
|
480 |
+
gr.State(
|
481 |
+
"\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
|
482 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
|
483 |
gr.State(0), gr.State(100),
|
484 |
gr.State({"Humans": 83.7})],
|
485 |
outputs=simple_bench_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
486 |
planbench_tab.select(fn=create_simple_plot,
|
487 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
488 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
489 |
+
gr.State(
|
490 |
+
"\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
|
491 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
492 |
outputs=planbench_plot)
|
493 |
bigcodebench_tab.select(fn=create_simple_plot,
|
494 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
495 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
496 |
+
gr.State(
|
497 |
+
"\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
|
498 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
499 |
outputs=bigcodebench_plot)
|
500 |
gaia_tab.select(fn=create_simple_plot,
|
501 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
502 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
503 |
+
gr.State(
|
504 |
+
"\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
|
505 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
506 |
gr.State(0), gr.State(100),
|
507 |
gr.State({"Humans": 92})],
|
|
|
509 |
gpqa_tab.select(fn=create_simple_plot,
|
510 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
511 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
512 |
+
gr.State(
|
513 |
+
"\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
|
514 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
515 |
gr.State(25), gr.State(100),
|
516 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
|
|
518 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
519 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
520 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
521 |
+
gr.State(
|
522 |
+
"\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
|
523 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
524 |
outputs=zeroeval_average_plot)
|
525 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
526 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
527 |
+
gr.State(
|
528 |
+
"ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
|
529 |
+
gr.State(
|
530 |
+
"\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
|
531 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
532 |
outputs=zeroeval_mmlu_redux_plot)
|
533 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
534 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
535 |
gr.State("ZeroEval ZebraLogic Score"),
|
536 |
+
gr.State(
|
537 |
+
"\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
|
538 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
539 |
outputs=zeroeval_zebralogic_plot)
|
540 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
541 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
542 |
+
gr.State(
|
543 |
+
"ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
|
544 |
+
gr.State(
|
545 |
+
"\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
|
546 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
547 |
outputs=zeroeval_crux_plot)
|
548 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
549 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
550 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
551 |
+
gr.State(
|
552 |
+
"\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
|
553 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
554 |
outputs=zeroeval_math_l5_plot)
|
555 |
+
livebench_tab.select(fn=create_simple_plot,
|
556 |
+
inputs=[gr.State("livebench.jsonl"),
|
557 |
+
gr.State("LiveBench-2024-11-25: Global Average Score"),
|
558 |
+
gr.State(
|
559 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
560 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
561 |
+
outputs=livebench_plot)
|
562 |
+
livebench_reasoning_tab.select(fn=create_simple_plot,
|
563 |
+
inputs=[gr.State("livebench_reasoning.jsonl"),
|
564 |
+
gr.State("LiveBench-2024-11-25: Reasoning Average Score"),
|
565 |
+
gr.State(
|
566 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
567 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
568 |
+
outputs=livebench_reasoning_plot)
|
569 |
+
livebench_coding_tab.select(fn=create_simple_plot,
|
570 |
+
inputs=[gr.State("livebench_coding.jsonl"),
|
571 |
+
gr.State("LiveBench-2024-11-25: Coding Average Score"),
|
572 |
+
gr.State(
|
573 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
574 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
575 |
+
outputs=livebench_coding_plot)
|
576 |
+
livebench_mathematics_tab.select(fn=create_simple_plot,
|
577 |
+
inputs=[gr.State("livebench_mathematics.jsonl"),
|
578 |
+
gr.State("LiveBench-2024-11-25: Mathematics Average Score"),
|
579 |
+
gr.State(
|
580 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
581 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
582 |
+
outputs=livebench_mathematics_plot)
|
583 |
+
livebench_data_analysis_tab.select(fn=create_simple_plot,
|
584 |
+
inputs=[gr.State("livebench_data_analysis.jsonl"),
|
585 |
+
gr.State("LiveBench-2024-11-25: Data Analysis Average Score"),
|
586 |
+
gr.State(
|
587 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
588 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
589 |
+
outputs=livebench_data_analysis_plot)
|
590 |
+
livebench_language_tab.select(fn=create_simple_plot,
|
591 |
+
inputs=[gr.State("livebench_language.jsonl"),
|
592 |
+
gr.State("LiveBench-2024-11-25: Language Average Score"),
|
593 |
+
gr.State(
|
594 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
595 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
596 |
+
outputs=livebench_language_plot)
|
597 |
+
livebench_if_tab.select(fn=create_simple_plot,
|
598 |
+
inputs=[gr.State("livebench_if.jsonl"),
|
599 |
+
gr.State("LiveBench-2024-11-25: IF Average Score"),
|
600 |
+
gr.State(
|
601 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
602 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
603 |
+
outputs=livebench_if_plot)
|
604 |
+
humanitys_last_exam_tab.select(fn=create_simple_plot,
|
605 |
+
inputs=[gr.State("humanitys_last_exam.jsonl"),
|
606 |
+
gr.State("Humanity's Last Exam (Multi-Modal Models Only) Score"),
|
607 |
+
gr.State(
|
608 |
+
"\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)"),
|
609 |
+
gr.State(date(2024, 5, 13)), gr.State(date(2025, 2, 11))],
|
610 |
+
outputs=humanitys_last_exam_plot)
|
611 |
+
livecodebench_tab.select(fn=create_simple_plot,
|
612 |
+
inputs=[gr.State("livecodebench.jsonl"),
|
613 |
+
gr.State("LiveCodeBench (7/1/2024 to 2/1/2025) Score"),
|
614 |
+
gr.State(
|
615 |
+
"\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
|
616 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
|
617 |
+
outputs=livecodebench_plot)
|
618 |
|
619 |
if __name__ == "__main__":
|
620 |
demo.launch()
|
codeforces_leaderboard.jsonl
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{"model": "o3", "score": 2400}
|
2 |
-
{"model": "o3-mini", "score": 2073}
|
3 |
-
{"model": "o1", "score": 1673}
|
4 |
-
{"model": "o1-mini", "score": 1650}
|
5 |
-
{"model": "o1-preview", "score": 1258}
|
6 |
-
{"model": "gpt-4o", "score": 808}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humanitys_last_exam.jsonl
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "gpt-4o", "score": 3.1}
|
2 |
+
{"model": "grok-2", "score": 3.9}
|
3 |
+
{"model": "claude-3-5-sonnet", "score": 4.8}
|
4 |
+
{"model": "gemini-2.0-flash-thinking", "score": 7.2}
|
5 |
+
{"model": "o1", "score": 7.2}
|
livebench.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 75.88}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 75.67}
|
3 |
+
{"model": "deepseek-r1", "score": 71.57}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 70.01}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 66.92}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 65.13}
|
7 |
+
{"model": "gemini-exp-1206", "score": 64.09}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.45}
|
9 |
+
{"model": "qwen2.5-max", "score": 62.29}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 61.47}
|
11 |
+
{"model": "deepseek-v3", "score": 60.45}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.26}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 59.03}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.79}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.76}
|
16 |
+
{"model": "step-2-16k-202411", "score": 56.02}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 55.33}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.33}
|
19 |
+
{"model": "grok-2-1212", "score": 54.30}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 53.24}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 52.64}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 52.36}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 52.19}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 52.19}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 51.66}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.44}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.40}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.16}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 49.66}
|
30 |
+
{"model": "grok-beta", "score": 49.18}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 49.16}
|
32 |
+
{"model": "mistral-large-2411", "score": 48.43}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.23}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 46.21}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 44.89}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 43.53}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 43.45}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 42.93}
|
39 |
+
{"model": "mistral-small-2501", "score": 42.55}
|
40 |
+
{"model": "phi-4", "score": 41.61}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 41.26}
|
42 |
+
{"model": "qwq-32b-preview", "score": 40.25}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 38.18}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.35}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 34.90}
|
46 |
+
{"model": "mistral-small-2409", "score": 33.42}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 31.76}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 29.59}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 28.66}
|
50 |
+
{"model": "command-r-08-2024", "score": 27.48}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 27.11}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 25.97}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 24.03}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 22.36}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 22.12}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 22.08}
|
livebench_coding.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 82.74}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 69.69}
|
3 |
+
{"model": "deepseek-r1", "score": 66.74}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 65.38}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 53.49}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 63.49}
|
7 |
+
{"model": "gemini-exp-1206", "score": 63.41}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 61.46}
|
9 |
+
{"model": "qwen2.5-max", "score": 64.41}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 53.92}
|
11 |
+
{"model": "deepseek-v3", "score": 61.77}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 54.36}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 67.13}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 60.56}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 48.05}
|
16 |
+
{"model": "step-2-16k-202411", "score": 47.19}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 51.44}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 48.80}
|
19 |
+
{"model": "grok-2-1212", "score": 46.44}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 43.80}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 58.92}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 42.65}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 46.08}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 46.87}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 47.44}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 57.64}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 49.00}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 36.59}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 50.97}
|
30 |
+
{"model": "grok-beta", "score": 45.15}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 38.59}
|
32 |
+
{"model": "mistral-large-2411", "score": 47.08}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 56.85}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 36.31}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 33.49}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.15}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 51.36}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 32.85}
|
39 |
+
{"model": "mistral-small-2501", "score": 35.31}
|
40 |
+
{"model": "phi-4", "score": 30.67}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 43.15}
|
42 |
+
{"model": "qwq-32b-preview", "score": 37.20}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 35.95}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 27.46}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 38.37}
|
46 |
+
{"model": "mistral-small-2409", "score": 25.74}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 19.14}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 20.18}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 22.46}
|
50 |
+
{"model": "command-r-08-2024", "score": 17.90}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 19.46}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.74}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 20.26}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.04}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 10.41}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 15.54}
|
livebench_data_analysis.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 70.64}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 65.47}
|
3 |
+
{"model": "deepseek-r1", "score": 69.78}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 66.56}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 69.37}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 68.02}
|
7 |
+
{"model": "gemini-exp-1206", "score": 63.16}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.04}
|
9 |
+
{"model": "qwen2.5-max", "score": 67.93}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 67.55}
|
11 |
+
{"model": "deepseek-v3", "score": 60.94}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 61.67}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 55.03}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 66.00}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.92}
|
16 |
+
{"model": "step-2-16k-202411", "score": 63.72}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 60.91}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.97}
|
19 |
+
{"model": "grok-2-1212", "score": 54.45}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 57.47}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 55.51}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 55.85}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 56.15}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 54.97}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 57.93}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.91}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 54.36}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 49.49}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 55.93}
|
30 |
+
{"model": "grok-beta", "score": 54.27}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 57.89}
|
32 |
+
{"model": "mistral-large-2411", "score": 50.15}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 49.87}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 53.98}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 53.75}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 48.31}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 48.45}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 45.41}
|
39 |
+
{"model": "mistral-small-2501", "score": 53.69}
|
40 |
+
{"model": "phi-4", "score": 45.17}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 49.96}
|
42 |
+
{"model": "qwq-32b-preview", "score": 31.62}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 47.87}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 37.23}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 35.22}
|
46 |
+
{"model": "mistral-small-2409", "score": 42.73}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 38.06}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 33.95}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 36.39}
|
50 |
+
{"model": "command-r-08-2024", "score": 33.34}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 25.48}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 32.82}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 30.29}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 34.69}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 20.60}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 30.21}
|
livebench_if.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 84.36}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 81.55}
|
3 |
+
{"model": "deepseek-r1", "score": 80.51}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 83.16}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 82.47}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 83.38}
|
7 |
+
{"model": "gemini-exp-1206", "score": 77.34}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 80.06}
|
9 |
+
{"model": "qwen2.5-max", "score": 75.35}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 85.79}
|
11 |
+
{"model": "deepseek-v3", "score": 75.25}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 81.86}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 69.30}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 65.07}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 65.40}
|
16 |
+
{"model": "step-2-16k-202411", "score": 79.88}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 68.58}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 70.78}
|
19 |
+
{"model": "grok-2-1212", "score": 69.63}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 78.28}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 65.22}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 75.90}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 64.94}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 68.16}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 66.37}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 64.39}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 60.85}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 82.67}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 41.55}
|
30 |
+
{"model": "grok-beta", "score": 69.62}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 63.89}
|
32 |
+
{"model": "mistral-large-2411", "score": 67.93}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 58.69}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 63.24}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 68.98}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 67.13}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 61.88}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 40.92}
|
39 |
+
{"model": "mistral-small-2501", "score": 59.54}
|
40 |
+
{"model": "phi-4", "score": 58.38}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 56.80}
|
42 |
+
{"model": "qwq-32b-preview", "score": 35.59}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 58.10}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 54.13}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 52.11}
|
46 |
+
{"model": "mistral-small-2409", "score": 53.23}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 57.61}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 48.04}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 52.62}
|
50 |
+
{"model": "command-r-08-2024", "score": 55.62}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 59.47}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 54.90}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 47.20}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 39.08}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 60.56}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 36.36}
|
livebench_language.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 50.68}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 65.39}
|
3 |
+
{"model": "deepseek-r1", "score": 48.53}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 46.26}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 42.18}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 44.85}
|
7 |
+
{"model": "gemini-exp-1206", "score": 51.29}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 38.25}
|
9 |
+
{"model": "qwen2.5-max", "score": 56.28}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 40.69}
|
11 |
+
{"model": "deepseek-v3", "score": 47.48}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 38.22}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 53.76}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 49.14}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 40.89}
|
16 |
+
{"model": "step-2-16k-202411", "score": 44.39}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 47.59}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 43.29}
|
19 |
+
{"model": "grok-2-1212", "score": 45.58}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 34.28}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 34.12}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 45.46}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 47.37}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 41.98}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 45.30}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 34.99}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 44.26}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 39.20}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 23.81}
|
30 |
+
{"model": "grok-beta", "score": 43.16}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 50.39}
|
32 |
+
{"model": "mistral-large-2411", "score": 39.39}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 23.25}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 38.78}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 35.42}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 36.96}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.37}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 26.82}
|
39 |
+
{"model": "mistral-small-2501", "score": 30.46}
|
40 |
+
{"model": "phi-4", "score": 25.61}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 28.61}
|
42 |
+
{"model": "qwq-32b-preview", "score": 21.09}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 32.62}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 25.93}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 15.80}
|
46 |
+
{"model": "mistral-small-2409", "score": 24.49}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 29.73}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 15.78}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 25.53}
|
50 |
+
{"model": "command-r-08-2024", "score": 16.72}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 19.70}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 17.71}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 12.94}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 9.15}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 11.16}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 8.56}
|
livebench_mathematics.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 77.29}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 80.32}
|
3 |
+
{"model": "deepseek-r1", "score": 80.71}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 72.37}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 75.85}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 70.97}
|
7 |
+
{"model": "gemini-exp-1206", "score": 72.36}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 63.06}
|
9 |
+
{"model": "qwen2.5-max", "score": 58.35}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 65.62}
|
11 |
+
{"model": "deepseek-v3", "score": 60.54}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 60.39}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 52.28}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 48.02}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 61.99}
|
16 |
+
{"model": "step-2-16k-202411", "score": 48.77}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 49.54}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 59.07}
|
19 |
+
{"model": "grok-2-1212", "score": 54.88}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 55.54}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 54.66}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 41.07}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 42.87}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 57.75}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 42.45}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 54.29}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 43.02}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 42.24}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 58.11}
|
30 |
+
{"model": "grok-beta", "score": 45.84}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 43.62}
|
32 |
+
{"model": "mistral-large-2411", "score": 42.55}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.61}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 40.30}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 34.72}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.04}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.54}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 59.36}
|
39 |
+
{"model": "mistral-small-2501", "score": 39.89}
|
40 |
+
{"model": "phi-4", "score": 41.98}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 36.31}
|
42 |
+
{"model": "qwq-32b-preview", "score": 58.26}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 26.46}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.70}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 39.51}
|
46 |
+
{"model": "mistral-small-2409", "score": 24.42}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 21.27}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 34.49}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 19.80}
|
50 |
+
{"model": "command-r-08-2024", "score": 19.39}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 17.99}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.31}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 17.58}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.72}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 13.64}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 14.96}
|
livebench_reasoning.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 89.58}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 91.58}
|
3 |
+
{"model": "deepseek-r1", "score": 83.17}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 86.33}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 78.17}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 60.08}
|
7 |
+
{"model": "gemini-exp-1206", "score": 57.00}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 69.83}
|
9 |
+
{"model": "qwen2.5-max", "score": 51.42}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 55.25}
|
11 |
+
{"model": "deepseek-v3", "score": 56.75}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.08}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 56.67}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.92}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 72.33}
|
16 |
+
{"model": "step-2-16k-202411", "score": 52.17}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 53.92}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 49.08}
|
19 |
+
{"model": "grok-2-1212", "score": 54.83}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 50.08}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 47.38}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 53.25}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 55.75}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 43.42}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 50.50}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 45.42}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.92}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.75}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 67.58}
|
30 |
+
{"model": "grok-beta", "score": 37.00}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 40.58}
|
32 |
+
{"model": "mistral-large-2411", "score": 43.50}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 42.08}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 44.67}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 43.00}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 32.58}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 28.08}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 52.25}
|
39 |
+
{"model": "mistral-small-2501", "score": 36.42}
|
40 |
+
{"model": "phi-4", "score": 47.83}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 32.75}
|
42 |
+
{"model": "qwq-32b-preview", "score": 57.71}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 28.08}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.67}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 28.42}
|
46 |
+
{"model": "mistral-small-2409", "score": 29.92}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 24.75}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 25.08}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 15.17}
|
50 |
+
{"model": "command-r-08-2024", "score": 21.92}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 20.58}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 13.33}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 15.92}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 20.50}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 16.33}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 26.83}
|
livecodebench.jsonl
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17 (high)", "score": 73.1}
|
2 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 71.6}
|
3 |
+
{"model": "o3-mini-2025-01-31 (medium)", "score": 68.8}
|
4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 65.4}
|
5 |
+
{"model": "deepseek-r1-preview", "score": 64.3}
|
6 |
+
{"model": "o1-2024-12-17 (low)", "score": 62.7}
|
7 |
+
{"model": "o3-mini-2025-01-31 (low)", "score": 62.7}
|
8 |
+
{"model": "o1-mini-2024-09-12", "score": 54.1}
|
9 |
+
{"model": "deepseek-r1-lite-preview", "score": 50.4}
|
10 |
+
{"model": "gemini-flash-2.0-thinking-01-21", "score": 45}
|
11 |
+
{"model": "qwq-32b-preview", "score": 44}
|
12 |
+
{"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4}
|
13 |
+
{"model": "o1-preview-2024-09-12", "score": 42.5}
|
14 |
+
{"model": "claude-3.5-sonnet-20241022", "score": 37.1}
|
15 |
+
{"model": "deepseek-v3", "score": 36.3}
|
16 |
+
{"model": "gpt-4o-2024-05-13", "score": 33}
|
17 |
+
{"model": "claude-3.5-sonnet-20240620", "score": 32}
|
18 |
+
{"model": "gemini-flash-2.0-exp", "score": 32}
|
19 |
+
{"model": "gemini-pro-1.5-002", "score": 30.9}
|
20 |
+
{"model": "gpt-4o-2024-08-06", "score": 30.5}
|
21 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 29.6}
|
22 |
+
{"model": "gemini-flash-1.5-002", "score": 28.4}
|
23 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 27.7}
|
24 |
+
{"model": "mistral-large", "score": 27.6}
|
25 |
+
{"model": "codestral-latest", "score": 23.8}
|
26 |
+
{"model": "claude-3-haiku", "score": 17.1}
|
models.jsonl
CHANGED
@@ -1,3 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -12,6 +58,7 @@
|
|
12 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
15 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
16 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
17 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -22,6 +69,7 @@
|
|
22 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
23 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
25 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
27 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -30,6 +78,7 @@
|
|
30 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
33 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
34 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
35 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -37,6 +86,7 @@
|
|
37 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
38 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
39 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
40 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
41 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
42 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -83,6 +133,7 @@
|
|
83 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
84 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
85 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
86 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
87 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
88 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
@@ -90,6 +141,7 @@
|
|
90 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
91 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
92 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
93 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
94 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
95 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
@@ -187,6 +239,8 @@
|
|
187 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
188 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
189 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
190 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
191 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
192 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -202,6 +256,8 @@
|
|
202 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
203 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
204 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
205 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
206 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
207 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
1 |
+
{"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
+
{"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
+
{"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
4 |
+
{"Name": "o3-mini-2025-01-31 (low)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
5 |
+
{"Name": "o1-2024-12-17-high", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
6 |
+
{"Name": "o1-2024-12-17 (high)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
7 |
+
{"Name": "o1-2024-12-17 (medium)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
8 |
+
{"Name": "o1-2024-12-17 (low)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
9 |
+
{"Name": "deepseek-r1", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
10 |
+
{"Name": "deepseek-r1-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
11 |
+
{"Name": "deepseek-r1-lite-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
12 |
+
{"Name": "o3-mini-2025-01-31-medium", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
+
{"Name": "gemini-2.0-flash-thinking-exp-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
+
{"Name": "gemini-flash-2.0-thinking-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
15 |
+
{"Name": "gemini-2.0-pro-exp-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
16 |
+
{"Name": "o3-mini-2025-01-31-low", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
17 |
+
{"Name": "qwen2.5-max", "Release Date": "2025-01-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
18 |
+
{"Name": "gemini-2.0-flash", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
19 |
+
{"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
20 |
+
{"Name": "gemini-flash-2.0-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
21 |
+
{"Name": "deepseek-v3", "Release Date": "2024-12-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
22 |
+
{"Name": "chatgpt-4o-latest-2025-01-29", "Release Date": "2025-01-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
23 |
+
{"Name": "step-2-16k-202411", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
+
{"Name": "gemini-2.0-flash-lite-preview-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
25 |
+
{"Name": "dracarys2-72b-instruct", "Release Date": "2024-09-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
+
{"Name": "meta-llama-3.1-405b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
27 |
+
{"Name": "learnlm-1.5-pro-experimental", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
28 |
+
{"Name": "chatgpt-4o-latest-0903", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
29 |
+
{"Name": "qwen2.5-72b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
30 |
+
{"Name": "llama-3.3-70b-instruct-turbo", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
+
{"Name": "deepseek-r1-distill-llama-70b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
+
{"Name": "mistral-large-2411", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
33 |
+
{"Name": "dracarys2-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
34 |
+
{"Name": "meta-llama-3.1-70b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
35 |
+
{"Name": "amazon.nova-pro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
36 |
+
{"Name": "deepseek-r1-distill-qwen-32b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
37 |
+
{"Name": "mistral-small-2501", "Release Date": "2025-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
38 |
+
{"Name": "phi-4", "Release Date": "2024-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
39 |
+
{"Name": "qwq-32b-preview", "Release Date": "2024-11-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
40 |
+
{"Name": "amazon.nova-lite-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
41 |
+
{"Name": "qwen2.5-7b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
42 |
+
{"Name": "mistral-small-2409", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
43 |
+
{"Name": "amazon.nova-micro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
44 |
+
{"Name": "command-r-plus-04-2024", "Release Date": "2024-04-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
45 |
+
{"Name": "meta-llama-3.1-8b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
46 |
+
{"Name": "olmo-2-1124-13b-instruct", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
47 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
48 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
49 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
58 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
59 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
60 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
61 |
+
{"Name": "gemini-pro-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
62 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
63 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
64 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
69 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
70 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
71 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
72 |
+
{"Name": "gemini-flash-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
73 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
74 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
75 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
78 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
79 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
80 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
81 |
+
{"Name": "grok-2", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
82 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
83 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
84 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
86 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
87 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
88 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
89 |
+
{"Name": "claude-3-5-sonnet", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
90 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
91 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
92 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
133 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
134 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
135 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
136 |
+
{"Name": "claude-3-haiku", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
137 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
138 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
139 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
141 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
142 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
143 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
144 |
+
{"Name": "mistral-large", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
145 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
146 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
147 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
239 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
240 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
241 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
242 |
+
{"Name": "gemini-flash-2.0-thinking-12-19", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
243 |
+
{"Name": "gemini-2.0-flash-thinking", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
244 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
245 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
246 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
256 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
257 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
258 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
259 |
+
{"Name": "codestral-2501", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
260 |
+
{"Name": "codestral-latest", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
261 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
262 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
263 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
simple_bench_leaderboard.jsonl
CHANGED
@@ -1,15 +1,19 @@
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
3 |
-
{"model": "o1-2024-12-17", "score":
|
|
|
4 |
{"model": "gemini-exp-1206", "score": 31.1}
|
|
|
5 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
6 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
7 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
8 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
9 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
|
|
10 |
{"model": "grok-beta", "score": 22.7}
|
11 |
{"model": "mistral-large-2407", "score": 22.5}
|
12 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
|
|
13 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
14 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
15 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
3 |
+
{"model": "o1-2024-12-17 (high)", "score": 40.1}
|
4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 36.7}
|
5 |
{"model": "gemini-exp-1206", "score": 31.1}
|
6 |
+
{"model": "deepseek-r1", "score": 30.9}
|
7 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
8 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
9 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
10 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
11 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
12 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 22.8}
|
13 |
{"model": "grok-beta", "score": 22.7}
|
14 |
{"model": "mistral-large-2407", "score": 22.5}
|
15 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
16 |
+
{"model": "deepseek-v3", "score": 18.9}
|
17 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
18 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
19 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|