kaizuberbuehler committed on
Commit
b605a32
·
1 Parent(s): a2d5ea0

Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench

Browse files
app.py CHANGED
@@ -187,16 +187,25 @@ with gr.Blocks() as demo:
187
 
188
  | Benchmark | Top Score |
189
  |-----------|-----------|
 
190
  | BigCodeBench | 🟠 36% |
191
  | Simple Bench | 🟠 42% |
192
  | PlanBench | 🟠 53% |
193
  | GAIA | 🟡 65% |
 
 
 
194
  | ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
 
195
  | GPQA | 🟡 76% |
 
196
  | ZebraLogic | 🟡 81% |
 
197
  | ARC-AGI-Pub (Public Eval) | 🟡 83% |
 
198
  | ZeroEval | 🟡 86% |
199
  | MATH-L5 | 🟡 89% |
 
200
  | MMLU-Redux | 🟢 93% |
201
  | CRUX | 🟢 96% |
202
 
@@ -209,6 +218,11 @@ with gr.Blocks() as demo:
209
  | 🟡 Yellow | 60% to 90% |
210
  | 🟢 Green | Above 90% |"""
211
  )
 
 
 
 
 
212
  with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
213
  bigcodebench_plot: gr.Plot = gr.Plot()
214
  bigcodebench_markdown: gr.Markdown = gr.Markdown(
@@ -229,6 +243,21 @@ with gr.Blocks() as demo:
229
  gaia_markdown: gr.Markdown = gr.Markdown(
230
  value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
231
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
233
  with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
234
  arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
@@ -237,16 +266,36 @@ with gr.Blocks() as demo:
237
  arc_agi_markdown: gr.Markdown = gr.Markdown(
238
  value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
239
  )
 
 
 
 
 
240
  with gr.Tab("🟡 GPQA") as gpqa_tab:
241
  gpqa_plot: gr.Plot = gr.Plot()
242
  gpqa_markdown: gr.Markdown = gr.Markdown(
243
  value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
244
  )
 
 
 
 
 
245
  with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
246
  zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
247
  zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
248
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
249
  )
 
 
 
 
 
 
 
 
 
 
250
  with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
251
  zeroeval_average_plot: gr.Plot = gr.Plot()
252
  zeroeval_average_markdown: gr.Markdown = gr.Markdown(
@@ -257,6 +306,11 @@ with gr.Blocks() as demo:
257
  zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
258
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
259
  )
 
 
 
 
 
260
  with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
261
  zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
262
  zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
@@ -267,8 +321,6 @@ with gr.Blocks() as demo:
267
  zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
268
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
269
  )
270
- with gr.Tab("Codeforces") as codeforces_tab:
271
- codeforces_plot: gr.Plot = gr.Plot()
272
  with gr.Tab("OpenCompass", visible=False):
273
  opencompass_plot: gr.Plot = gr.Plot()
274
  opencompass_markdown: gr.Markdown = gr.Markdown(
@@ -284,6 +336,107 @@ with gr.Blocks() as demo:
284
  webarena_markdown: gr.Markdown = gr.Markdown(
285
  value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
286
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  with gr.Tab("Finance") as finance_tab:
288
  with gr.Tab("Big Tech Capex") as big_five_capex_tab:
289
  big_five_capex_plot: gr.Plot = gr.Plot()
@@ -292,24 +445,30 @@ with gr.Blocks() as demo:
292
  big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
293
  arc_agi_public_eval_tab.select(fn=create_simple_plot,
294
  inputs=[gr.State("arc_agi_leaderboard.jsonl"),
295
- gr.State("ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
296
- gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
 
 
297
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
298
  gr.State(0), gr.State(100),
299
  gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
300
  outputs=arc_agi_public_eval_plot)
301
  arc_agi_tab.select(fn=create_simple_plot,
302
  inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
303
- gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
304
- gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
 
 
305
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
306
  gr.State(0), gr.State(100),
307
  gr.State({"MTurkers": 77})],
308
  outputs=arc_agi_semi_private_eval_plot)
309
  arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
310
  inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
311
- gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
312
- gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
 
 
313
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
314
  gr.State(0), gr.State(100),
315
  gr.State({"MTurkers": 77})],
@@ -318,35 +477,31 @@ with gr.Blocks() as demo:
318
  simple_bench_tab.select(fn=create_simple_plot,
319
  inputs=[gr.State("simple_bench_leaderboard.jsonl"),
320
  gr.State("Simple Bench Score"),
321
- gr.State("\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
322
- gr.State(date(2024, 4, 1)), gr.State(date(2025, 1, 1)),
 
323
  gr.State(0), gr.State(100),
324
  gr.State({"Humans": 83.7})],
325
  outputs=simple_bench_plot)
326
- codeforces_tab.select(fn=create_simple_plot,
327
- inputs=[gr.State("codeforces_leaderboard.jsonl"),
328
- gr.State("Codeforces Rating"),
329
- gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
330
- gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
331
- gr.State(0), gr.State(4000),
332
- gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
333
- outputs=codeforces_plot)
334
  planbench_tab.select(fn=create_simple_plot,
335
  inputs=[gr.State("planbench_leaderboard.jsonl"),
336
  gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
337
- gr.State("\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
 
338
  gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
339
  outputs=planbench_plot)
340
  bigcodebench_tab.select(fn=create_simple_plot,
341
  inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
342
  gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
343
- gr.State("\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
 
344
  gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
345
  outputs=bigcodebench_plot)
346
  gaia_tab.select(fn=create_simple_plot,
347
  inputs=[gr.State("gaia_leaderboard.jsonl"),
348
  gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
349
- gr.State("\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
 
350
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
351
  gr.State(0), gr.State(100),
352
  gr.State({"Humans": 92})],
@@ -354,7 +509,8 @@ with gr.Blocks() as demo:
354
  gpqa_tab.select(fn=create_simple_plot,
355
  inputs=[gr.State("gpqa_leaderboard.jsonl"),
356
  gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
357
- gr.State("\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
 
358
  gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
359
  gr.State(25), gr.State(100),
360
  gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
@@ -362,34 +518,103 @@ with gr.Blocks() as demo:
362
  zeroeval_average_tab.select(fn=create_simple_plot,
363
  inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
364
  gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
365
- gr.State("\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
 
366
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
367
  outputs=zeroeval_average_plot)
368
  zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
369
  inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
370
- gr.State("ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
371
- gr.State("\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
 
 
372
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
373
  outputs=zeroeval_mmlu_redux_plot)
374
  zeroeval_zebralogic_tab.select(fn=create_simple_plot,
375
  inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
376
  gr.State("ZeroEval ZebraLogic Score"),
377
- gr.State("\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
 
378
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
379
  outputs=zeroeval_zebralogic_plot)
380
  zeroeval_crux_tab.select(fn=create_simple_plot,
381
  inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
382
- gr.State("ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
383
- gr.State("\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
 
 
384
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
385
  outputs=zeroeval_crux_plot)
386
  zeroeval_math_l5_tab.select(fn=create_simple_plot,
387
  inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
388
  gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
389
- gr.State("\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
 
390
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
391
  outputs=zeroeval_math_l5_plot)
392
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  if __name__ == "__main__":
395
  demo.launch()
 
187
 
188
  | Benchmark | Top Score |
189
  |-----------|-----------|
190
+ | Humanity's Last Exam | 🔴 7% |
191
  | BigCodeBench | 🟠 36% |
192
  | Simple Bench | 🟠 42% |
193
  | PlanBench | 🟠 53% |
194
  | GAIA | 🟡 65% |
195
+ | LiveBench Language | 🟡 65% |
196
+ | LiveBench Data Analysis | 🟡 71% |
197
+ | LiveCodeBench | 🟡 73% |
198
  | ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
199
+ | LiveBench | 🟡 76% |
200
  | GPQA | 🟡 76% |
201
+ | LiveBench Mathematics | 🟡 81% |
202
  | ZebraLogic | 🟡 81% |
203
+ | LiveBench Coding | 🟡 83% |
204
  | ARC-AGI-Pub (Public Eval) | 🟡 83% |
205
+ | LiveBench IF | 🟡 86% |
206
  | ZeroEval | 🟡 86% |
207
  | MATH-L5 | 🟡 89% |
208
+ | LiveBench Reasoning | 🟢 92% |
209
  | MMLU-Redux | 🟢 93% |
210
  | CRUX | 🟢 96% |
211
 
 
218
  | 🟡 Yellow | 60% to 90% |
219
  | 🟢 Green | Above 90% |"""
220
  )
221
+ with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab:
222
+ humanitys_last_exam_plot: gr.Plot = gr.Plot()
223
+ humanitys_last_exam_markdown: gr.Markdown = gr.Markdown(
224
+ value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)"""
225
+ )
226
  with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
227
  bigcodebench_plot: gr.Plot = gr.Plot()
228
  bigcodebench_markdown: gr.Markdown = gr.Markdown(
 
243
  gaia_markdown: gr.Markdown = gr.Markdown(
244
  value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
245
  )
246
+ with gr.Tab("🟡 LiveBench Language") as livebench_language_tab:
247
+ livebench_language_plot: gr.Plot = gr.Plot()
248
+ livebench_language_markdown: gr.Markdown = gr.Markdown(
249
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
250
+ )
251
+ with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab:
252
+ livebench_data_analysis_plot: gr.Plot = gr.Plot()
253
+ livebench_data_analysis_markdown: gr.Markdown = gr.Markdown(
254
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
255
+ )
256
+ with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab:
257
+ livecodebench_plot: gr.Plot = gr.Plot()
258
+ livecodebench_markdown: gr.Markdown = gr.Markdown(
259
+ value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)"""
260
+ )
261
  with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
262
  with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
263
  arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
 
266
  arc_agi_markdown: gr.Markdown = gr.Markdown(
267
  value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
268
  )
269
+ with gr.Tab("🟡 LiveBench") as livebench_tab:
270
+ livebench_plot: gr.Plot = gr.Plot()
271
+ livebench_markdown: gr.Markdown = gr.Markdown(
272
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
273
+ )
274
  with gr.Tab("🟡 GPQA") as gpqa_tab:
275
  gpqa_plot: gr.Plot = gr.Plot()
276
  gpqa_markdown: gr.Markdown = gr.Markdown(
277
  value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
278
  )
279
+ with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab:
280
+ livebench_mathematics_plot: gr.Plot = gr.Plot()
281
+ livebench_mathematics_markdown: gr.Markdown = gr.Markdown(
282
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
283
+ )
284
  with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
285
  zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
286
  zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
287
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
288
  )
289
+ with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab:
290
+ livebench_coding_plot: gr.Plot = gr.Plot()
291
+ livebench_coding_markdown: gr.Markdown = gr.Markdown(
292
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
293
+ )
294
+ with gr.Tab("🟡 LiveBench IF") as livebench_if_tab:
295
+ livebench_if_plot: gr.Plot = gr.Plot()
296
+ livebench_if_markdown: gr.Markdown = gr.Markdown(
297
+ value="""Source: [LiveBench IF](https://livebench.ai/)"""
298
+ )
299
  with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
300
  zeroeval_average_plot: gr.Plot = gr.Plot()
301
  zeroeval_average_markdown: gr.Markdown = gr.Markdown(
 
306
  zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
307
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
308
  )
309
+ with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab:
310
+ livebench_reasoning_plot: gr.Plot = gr.Plot()
311
+ livebench_reasoning_markdown: gr.Markdown = gr.Markdown(
312
+ value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
313
+ )
314
  with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
315
  zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
316
  zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
 
321
  zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
322
  value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
323
  )
 
 
324
  with gr.Tab("OpenCompass", visible=False):
325
  opencompass_plot: gr.Plot = gr.Plot()
326
  opencompass_markdown: gr.Markdown = gr.Markdown(
 
336
  webarena_markdown: gr.Markdown = gr.Markdown(
337
  value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
338
  )
339
+ with gr.Tab("OSWorld", visible=False):
340
+ osworld_plot: gr.Plot = gr.Plot()
341
+ osworld_markdown: gr.Markdown = gr.Markdown(
342
+ value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
343
+ )
344
+ with gr.Tab("EMMA-Mini", visible=False):
345
+ emma_plot: gr.Plot = gr.Plot()
346
+ emma_markdown: gr.Markdown = gr.Markdown(
347
+ value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
348
+ )
349
+ with gr.Tab("MathVista", visible=False):
350
+ mathvista_plot: gr.Plot = gr.Plot()
351
+ mathvista_markdown: gr.Markdown = gr.Markdown(
352
+ value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)"""
353
+ )
354
+ with gr.Tab("DABStep", visible=False):
355
+ dabstep_plot: gr.Plot = gr.Plot()
356
+ dabstep_markdown: gr.Markdown = gr.Markdown(
357
+ value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)"""
358
+ )
359
+ with gr.Tab("lineage-bench", visible=False):
360
+ lineage_bench_plot: gr.Plot = gr.Plot()
361
+ lineage_bench_markdown: gr.Markdown = gr.Markdown(
362
+ value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)"""
363
+ )
364
+ with gr.Tab("Step-Game", visible=False):
365
+ step_game_plot: gr.Plot = gr.Plot()
366
+ step_game_markdown: gr.Markdown = gr.Markdown(
367
+ value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)"""
368
+ )
369
+ with gr.Tab("HHEM", visible=False):
370
+ hhem_plot: gr.Plot = gr.Plot()
371
+ hhem_markdown: gr.Markdown = gr.Markdown(
372
+ value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
373
+ )
374
+ with gr.Tab("NYT Connections", visible=False):
375
+ nyt_connections_exam_plot: gr.Plot = gr.Plot()
376
+ nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
377
+ value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
378
+ )
379
+ with gr.Tab("USACO", visible=False):
380
+ usaco_plot: gr.Plot = gr.Plot()
381
+ usaco_markdown: gr.Markdown = gr.Markdown(
382
+ value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)"""
383
+ )
384
+ with gr.Tab("AppWorld", visible=False):
385
+ appworld_plot: gr.Plot = gr.Plot()
386
+ appworld_markdown: gr.Markdown = gr.Markdown(
387
+ value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)"""
388
+ )
389
+ with gr.Tab("CORE-Bench", visible=False):
390
+ core_bench_plot: gr.Plot = gr.Plot()
391
+ core_bench_markdown: gr.Markdown = gr.Markdown(
392
+ value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)"""
393
+ )
394
+ with gr.Tab("Cybench", visible=False):
395
+ cybench_plot: gr.Plot = gr.Plot()
396
+ cybench_markdown: gr.Markdown = gr.Markdown(
397
+ value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)"""
398
+ )
399
+ with gr.Tab("MultiChallenge", visible=False):
400
+ multichallenge_plot: gr.Plot = gr.Plot()
401
+ multichallenge_markdown: gr.Markdown = gr.Markdown(
402
+ value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)"""
403
+ )
404
+ with gr.Tab("VISTA", visible=False):
405
+ vista_plot: gr.Plot = gr.Plot()
406
+ vista_markdown: gr.Markdown = gr.Markdown(
407
+ value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)"""
408
+ )
409
+ with gr.Tab("ToolComp", visible=False):
410
+ with gr.Tab("Enterprise"):
411
+ toolcomp_enterprise_plot: gr.Plot = gr.Plot()
412
+ toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown(
413
+ value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)"""
414
+ )
415
+ with gr.Tab("Chat"):
416
+ toolcomp_chat_plot: gr.Plot = gr.Plot()
417
+ toolcomp_chat_markdown: gr.Markdown = gr.Markdown(
418
+ value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)"""
419
+ )
420
+ with gr.Tab("BFCL", visible=False):
421
+ bfcl_plot: gr.Plot = gr.Plot()
422
+ bfcl_markdown: gr.Markdown = gr.Markdown(
423
+ value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)"""
424
+ )
425
+ with gr.Tab("EvalPlus", visible=False):
426
+ evalplus_plot: gr.Plot = gr.Plot()
427
+ evalplus_markdown: gr.Markdown = gr.Markdown(
428
+ value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)"""
429
+ )
430
+ with gr.Tab("Aider Polyglot", visible=False):
431
+ aider_plot: gr.Plot = gr.Plot()
432
+ aider_markdown: gr.Markdown = gr.Markdown(
433
+ value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)"""
434
+ )
435
+ with gr.Tab("QuALITY", visible=False):
436
+ quality_plot: gr.Plot = gr.Plot()
437
+ quality_markdown: gr.Markdown = gr.Markdown(
438
+ value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
439
+ )
440
  with gr.Tab("Finance") as finance_tab:
441
  with gr.Tab("Big Tech Capex") as big_five_capex_tab:
442
  big_five_capex_plot: gr.Plot = gr.Plot()
 
445
  big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
446
  arc_agi_public_eval_tab.select(fn=create_simple_plot,
447
  inputs=[gr.State("arc_agi_leaderboard.jsonl"),
448
+ gr.State(
449
+ "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
450
+ gr.State(
451
+ "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
452
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
453
  gr.State(0), gr.State(100),
454
  gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
455
  outputs=arc_agi_public_eval_plot)
456
  arc_agi_tab.select(fn=create_simple_plot,
457
  inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
458
+ gr.State(
459
+ "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
460
+ gr.State(
461
+ "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
462
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
463
  gr.State(0), gr.State(100),
464
  gr.State({"MTurkers": 77})],
465
  outputs=arc_agi_semi_private_eval_plot)
466
  arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
467
  inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
468
+ gr.State(
469
+ "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
470
+ gr.State(
471
+ "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
472
  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
473
  gr.State(0), gr.State(100),
474
  gr.State({"MTurkers": 77})],
 
477
  simple_bench_tab.select(fn=create_simple_plot,
478
  inputs=[gr.State("simple_bench_leaderboard.jsonl"),
479
  gr.State("Simple Bench Score"),
480
+ gr.State(
481
+ "\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
482
+ gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
483
  gr.State(0), gr.State(100),
484
  gr.State({"Humans": 83.7})],
485
  outputs=simple_bench_plot)
 
 
 
 
 
 
 
 
486
  planbench_tab.select(fn=create_simple_plot,
487
  inputs=[gr.State("planbench_leaderboard.jsonl"),
488
  gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
489
+ gr.State(
490
+ "\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
491
  gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
492
  outputs=planbench_plot)
493
  bigcodebench_tab.select(fn=create_simple_plot,
494
  inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
495
  gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
496
+ gr.State(
497
+ "\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
498
  gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
499
  outputs=bigcodebench_plot)
500
  gaia_tab.select(fn=create_simple_plot,
501
  inputs=[gr.State("gaia_leaderboard.jsonl"),
502
  gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
503
+ gr.State(
504
+ "\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
505
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
506
  gr.State(0), gr.State(100),
507
  gr.State({"Humans": 92})],
 
509
  gpqa_tab.select(fn=create_simple_plot,
510
  inputs=[gr.State("gpqa_leaderboard.jsonl"),
511
  gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
512
+ gr.State(
513
+ "\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
514
  gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
515
  gr.State(25), gr.State(100),
516
  gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
 
518
  zeroeval_average_tab.select(fn=create_simple_plot,
519
  inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
520
  gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
521
+ gr.State(
522
+ "\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
523
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
524
  outputs=zeroeval_average_plot)
525
  zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
526
  inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
527
+ gr.State(
528
+ "ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
529
+ gr.State(
530
+ "\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
531
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
532
  outputs=zeroeval_mmlu_redux_plot)
533
  zeroeval_zebralogic_tab.select(fn=create_simple_plot,
534
  inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
535
  gr.State("ZeroEval ZebraLogic Score"),
536
+ gr.State(
537
+ "\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
538
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
539
  outputs=zeroeval_zebralogic_plot)
540
  zeroeval_crux_tab.select(fn=create_simple_plot,
541
  inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
542
+ gr.State(
543
+ "ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
544
+ gr.State(
545
+ "\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
546
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
547
  outputs=zeroeval_crux_plot)
548
  zeroeval_math_l5_tab.select(fn=create_simple_plot,
549
  inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
550
  gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
551
+ gr.State(
552
+ "\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
553
  gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
554
  outputs=zeroeval_math_l5_plot)
555
+ livebench_tab.select(fn=create_simple_plot,
556
+ inputs=[gr.State("livebench.jsonl"),
557
+ gr.State("LiveBench-2024-11-25: Global Average Score"),
558
+ gr.State(
559
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
560
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
561
+ outputs=livebench_plot)
562
+ livebench_reasoning_tab.select(fn=create_simple_plot,
563
+ inputs=[gr.State("livebench_reasoning.jsonl"),
564
+ gr.State("LiveBench-2024-11-25: Reasoning Average Score"),
565
+ gr.State(
566
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
567
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
568
+ outputs=livebench_reasoning_plot)
569
+ livebench_coding_tab.select(fn=create_simple_plot,
570
+ inputs=[gr.State("livebench_coding.jsonl"),
571
+ gr.State("LiveBench-2024-11-25: Coding Average Score"),
572
+ gr.State(
573
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
574
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
575
+ outputs=livebench_coding_plot)
576
+ livebench_mathematics_tab.select(fn=create_simple_plot,
577
+ inputs=[gr.State("livebench_mathematics.jsonl"),
578
+ gr.State("LiveBench-2024-11-25: Mathematics Average Score"),
579
+ gr.State(
580
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
581
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
582
+ outputs=livebench_mathematics_plot)
583
+ livebench_data_analysis_tab.select(fn=create_simple_plot,
584
+ inputs=[gr.State("livebench_data_analysis.jsonl"),
585
+ gr.State("LiveBench-2024-11-25: Data Analysis Average Score"),
586
+ gr.State(
587
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
588
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
589
+ outputs=livebench_data_analysis_plot)
590
+ livebench_language_tab.select(fn=create_simple_plot,
591
+ inputs=[gr.State("livebench_language.jsonl"),
592
+ gr.State("LiveBench-2024-11-25: Language Average Score"),
593
+ gr.State(
594
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
595
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
596
+ outputs=livebench_language_plot)
597
+ livebench_if_tab.select(fn=create_simple_plot,
598
+ inputs=[gr.State("livebench_if.jsonl"),
599
+ gr.State("LiveBench-2024-11-25: IF Average Score"),
600
+ gr.State(
601
+ "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
602
+ gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
603
+ outputs=livebench_if_plot)
604
+ humanitys_last_exam_tab.select(fn=create_simple_plot,
605
+ inputs=[gr.State("humanitys_last_exam.jsonl"),
606
+ gr.State("Humanity's Last Exam (Multi-Modal Models Only) Score"),
607
+ gr.State(
608
+ "\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)"),
609
+ gr.State(date(2024, 5, 13)), gr.State(date(2025, 2, 11))],
610
+ outputs=humanitys_last_exam_plot)
611
+ livecodebench_tab.select(fn=create_simple_plot,
612
+ inputs=[gr.State("livecodebench.jsonl"),
613
+ gr.State("LiveCodeBench (7/1/2024 to 2/1/2025) Score"),
614
+ gr.State(
615
+ "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
616
+ gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
617
+ outputs=livecodebench_plot)
618
 
619
  if __name__ == "__main__":
620
  demo.launch()
codeforces_leaderboard.jsonl DELETED
@@ -1,6 +0,0 @@
1
- {"model": "o3", "score": 2400}
2
- {"model": "o3-mini", "score": 2073}
3
- {"model": "o1", "score": 1673}
4
- {"model": "o1-mini", "score": 1650}
5
- {"model": "o1-preview", "score": 1258}
6
- {"model": "gpt-4o", "score": 808}
 
 
 
 
 
 
 
humanitys_last_exam.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"model": "gpt-4o", "score": 3.1}
2
+ {"model": "grok-2", "score": 3.9}
3
+ {"model": "claude-3-5-sonnet", "score": 4.8}
4
+ {"model": "gemini-2.0-flash-thinking", "score": 7.2}
5
+ {"model": "o1", "score": 7.2}
livebench.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 75.88}
2
+ {"model": "o1-2024-12-17-high", "score": 75.67}
3
+ {"model": "deepseek-r1", "score": 71.57}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 70.01}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 66.92}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 65.13}
7
+ {"model": "gemini-exp-1206", "score": 64.09}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 62.45}
9
+ {"model": "qwen2.5-max", "score": 62.29}
10
+ {"model": "gemini-2.0-flash", "score": 61.47}
11
+ {"model": "deepseek-v3", "score": 60.45}
12
+ {"model": "gemini-2.0-flash-exp", "score": 59.26}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 59.03}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 57.79}
15
+ {"model": "o1-mini-2024-09-12", "score": 57.76}
16
+ {"model": "step-2-16k-202411", "score": 56.02}
17
+ {"model": "gpt-4o-2024-08-06", "score": 55.33}
18
+ {"model": "gemini-1.5-pro-002", "score": 54.33}
19
+ {"model": "grok-2-1212", "score": 54.30}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 53.24}
21
+ {"model": "dracarys2-72b-instruct", "score": 52.64}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 52.36}
23
+ {"model": "gpt-4o-2024-11-20", "score": 52.19}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 52.19}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 51.66}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 51.44}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 50.40}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 50.16}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 49.66}
30
+ {"model": "grok-beta", "score": 49.18}
31
+ {"model": "claude-3-opus-20240229", "score": 49.16}
32
+ {"model": "mistral-large-2411", "score": 48.43}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 46.23}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 46.21}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 44.89}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 43.53}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 43.45}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 42.93}
39
+ {"model": "mistral-small-2501", "score": 42.55}
40
+ {"model": "phi-4", "score": 41.61}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 41.26}
42
+ {"model": "qwq-32b-preview", "score": 40.25}
43
+ {"model": "gemma-2-27b-it", "score": 38.18}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 36.35}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 34.90}
46
+ {"model": "mistral-small-2409", "score": 33.42}
47
+ {"model": "command-r-plus-08-2024", "score": 31.76}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 29.59}
49
+ {"model": "gemma-2-9b-it", "score": 28.66}
50
+ {"model": "command-r-08-2024", "score": 27.48}
51
+ {"model": "command-r-plus-04-2024", "score": 27.11}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 25.97}
53
+ {"model": "phi-3-small-8k-instruct", "score": 24.03}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 22.36}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 22.12}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 22.08}
livebench_coding.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 82.74}
2
+ {"model": "o1-2024-12-17-high", "score": 69.69}
3
+ {"model": "deepseek-r1", "score": 66.74}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 65.38}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 53.49}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 63.49}
7
+ {"model": "gemini-exp-1206", "score": 63.41}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 61.46}
9
+ {"model": "qwen2.5-max", "score": 64.41}
10
+ {"model": "gemini-2.0-flash", "score": 53.92}
11
+ {"model": "deepseek-v3", "score": 61.77}
12
+ {"model": "gemini-2.0-flash-exp", "score": 54.36}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 67.13}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 60.56}
15
+ {"model": "o1-mini-2024-09-12", "score": 48.05}
16
+ {"model": "step-2-16k-202411", "score": 47.19}
17
+ {"model": "gpt-4o-2024-08-06", "score": 51.44}
18
+ {"model": "gemini-1.5-pro-002", "score": 48.80}
19
+ {"model": "grok-2-1212", "score": 46.44}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 43.80}
21
+ {"model": "dracarys2-72b-instruct", "score": 58.92}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 42.65}
23
+ {"model": "gpt-4o-2024-11-20", "score": 46.08}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 46.87}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 47.44}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 57.64}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 49.00}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 36.59}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 50.97}
30
+ {"model": "grok-beta", "score": 45.15}
31
+ {"model": "claude-3-opus-20240229", "score": 38.59}
32
+ {"model": "mistral-large-2411", "score": 47.08}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 56.85}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 36.31}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 33.49}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 38.15}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 51.36}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 32.85}
39
+ {"model": "mistral-small-2501", "score": 35.31}
40
+ {"model": "phi-4", "score": 30.67}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 43.15}
42
+ {"model": "qwq-32b-preview", "score": 37.20}
43
+ {"model": "gemma-2-27b-it", "score": 35.95}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 27.46}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 38.37}
46
+ {"model": "mistral-small-2409", "score": 25.74}
47
+ {"model": "command-r-plus-08-2024", "score": 19.14}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 20.18}
49
+ {"model": "gemma-2-9b-it", "score": 22.46}
50
+ {"model": "command-r-08-2024", "score": 17.90}
51
+ {"model": "command-r-plus-04-2024", "score": 19.46}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.74}
53
+ {"model": "phi-3-small-8k-instruct", "score": 20.26}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 15.04}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 10.41}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 15.54}
livebench_data_analysis.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 70.64}
2
+ {"model": "o1-2024-12-17-high", "score": 65.47}
3
+ {"model": "deepseek-r1", "score": 69.78}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 66.56}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 69.37}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 68.02}
7
+ {"model": "gemini-exp-1206", "score": 63.16}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 62.04}
9
+ {"model": "qwen2.5-max", "score": 67.93}
10
+ {"model": "gemini-2.0-flash", "score": 67.55}
11
+ {"model": "deepseek-v3", "score": 60.94}
12
+ {"model": "gemini-2.0-flash-exp", "score": 61.67}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 55.03}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 66.00}
15
+ {"model": "o1-mini-2024-09-12", "score": 57.92}
16
+ {"model": "step-2-16k-202411", "score": 63.72}
17
+ {"model": "gpt-4o-2024-08-06", "score": 60.91}
18
+ {"model": "gemini-1.5-pro-002", "score": 54.97}
19
+ {"model": "grok-2-1212", "score": 54.45}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 57.47}
21
+ {"model": "dracarys2-72b-instruct", "score": 55.51}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 55.85}
23
+ {"model": "gpt-4o-2024-11-20", "score": 56.15}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 54.97}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 57.93}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 51.91}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 54.36}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 49.49}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 55.93}
30
+ {"model": "grok-beta", "score": 54.27}
31
+ {"model": "claude-3-opus-20240229", "score": 57.89}
32
+ {"model": "mistral-large-2411", "score": 50.15}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 49.87}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 53.98}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 53.75}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 48.31}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 48.45}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 45.41}
39
+ {"model": "mistral-small-2501", "score": 53.69}
40
+ {"model": "phi-4", "score": 45.17}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 49.96}
42
+ {"model": "qwq-32b-preview", "score": 31.62}
43
+ {"model": "gemma-2-27b-it", "score": 47.87}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 37.23}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 35.22}
46
+ {"model": "mistral-small-2409", "score": 42.73}
47
+ {"model": "command-r-plus-08-2024", "score": 38.06}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 33.95}
49
+ {"model": "gemma-2-9b-it", "score": 36.39}
50
+ {"model": "command-r-08-2024", "score": 33.34}
51
+ {"model": "command-r-plus-04-2024", "score": 25.48}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 32.82}
53
+ {"model": "phi-3-small-8k-instruct", "score": 30.29}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 34.69}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 20.60}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 30.21}
livebench_if.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 84.36}
2
+ {"model": "o1-2024-12-17-high", "score": 81.55}
3
+ {"model": "deepseek-r1", "score": 80.51}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 83.16}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 82.47}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 83.38}
7
+ {"model": "gemini-exp-1206", "score": 77.34}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 80.06}
9
+ {"model": "qwen2.5-max", "score": 75.35}
10
+ {"model": "gemini-2.0-flash", "score": 85.79}
11
+ {"model": "deepseek-v3", "score": 75.25}
12
+ {"model": "gemini-2.0-flash-exp", "score": 81.86}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 69.30}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 65.07}
15
+ {"model": "o1-mini-2024-09-12", "score": 65.40}
16
+ {"model": "step-2-16k-202411", "score": 79.88}
17
+ {"model": "gpt-4o-2024-08-06", "score": 68.58}
18
+ {"model": "gemini-1.5-pro-002", "score": 70.78}
19
+ {"model": "grok-2-1212", "score": 69.63}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 78.28}
21
+ {"model": "dracarys2-72b-instruct", "score": 65.22}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 75.90}
23
+ {"model": "gpt-4o-2024-11-20", "score": 64.94}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 68.16}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 66.37}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 64.39}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 60.85}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 82.67}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 41.55}
30
+ {"model": "grok-beta", "score": 69.62}
31
+ {"model": "claude-3-opus-20240229", "score": 63.89}
32
+ {"model": "mistral-large-2411", "score": 67.93}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 58.69}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 63.24}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 68.98}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 67.13}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 61.88}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 40.92}
39
+ {"model": "mistral-small-2501", "score": 59.54}
40
+ {"model": "phi-4", "score": 58.38}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 56.80}
42
+ {"model": "qwq-32b-preview", "score": 35.59}
43
+ {"model": "gemma-2-27b-it", "score": 58.10}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 54.13}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 52.11}
46
+ {"model": "mistral-small-2409", "score": 53.23}
47
+ {"model": "command-r-plus-08-2024", "score": 57.61}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 48.04}
49
+ {"model": "gemma-2-9b-it", "score": 52.62}
50
+ {"model": "command-r-08-2024", "score": 55.62}
51
+ {"model": "command-r-plus-04-2024", "score": 59.47}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 54.90}
53
+ {"model": "phi-3-small-8k-instruct", "score": 47.20}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 39.08}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 60.56}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 36.36}
livebench_language.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 50.68}
2
+ {"model": "o1-2024-12-17-high", "score": 65.39}
3
+ {"model": "deepseek-r1", "score": 48.53}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 46.26}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 42.18}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 44.85}
7
+ {"model": "gemini-exp-1206", "score": 51.29}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 38.25}
9
+ {"model": "qwen2.5-max", "score": 56.28}
10
+ {"model": "gemini-2.0-flash", "score": 40.69}
11
+ {"model": "deepseek-v3", "score": 47.48}
12
+ {"model": "gemini-2.0-flash-exp", "score": 38.22}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 53.76}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 49.14}
15
+ {"model": "o1-mini-2024-09-12", "score": 40.89}
16
+ {"model": "step-2-16k-202411", "score": 44.39}
17
+ {"model": "gpt-4o-2024-08-06", "score": 47.59}
18
+ {"model": "gemini-1.5-pro-002", "score": 43.29}
19
+ {"model": "grok-2-1212", "score": 45.58}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 34.28}
21
+ {"model": "dracarys2-72b-instruct", "score": 34.12}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 45.46}
23
+ {"model": "gpt-4o-2024-11-20", "score": 47.37}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 41.98}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 45.30}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 34.99}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 44.26}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 39.20}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 23.81}
30
+ {"model": "grok-beta", "score": 43.16}
31
+ {"model": "claude-3-opus-20240229", "score": 50.39}
32
+ {"model": "mistral-large-2411", "score": 39.39}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 23.25}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 38.78}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 35.42}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 36.96}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 35.37}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 26.82}
39
+ {"model": "mistral-small-2501", "score": 30.46}
40
+ {"model": "phi-4", "score": 25.61}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 28.61}
42
+ {"model": "qwq-32b-preview", "score": 21.09}
43
+ {"model": "gemma-2-27b-it", "score": 32.62}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 25.93}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 15.80}
46
+ {"model": "mistral-small-2409", "score": 24.49}
47
+ {"model": "command-r-plus-08-2024", "score": 29.73}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 15.78}
49
+ {"model": "gemma-2-9b-it", "score": 25.53}
50
+ {"model": "command-r-08-2024", "score": 16.72}
51
+ {"model": "command-r-plus-04-2024", "score": 19.70}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 17.71}
53
+ {"model": "phi-3-small-8k-instruct", "score": 12.94}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 9.15}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 11.16}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 8.56}
livebench_mathematics.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 77.29}
2
+ {"model": "o1-2024-12-17-high", "score": 80.32}
3
+ {"model": "deepseek-r1", "score": 80.71}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 72.37}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 75.85}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 70.97}
7
+ {"model": "gemini-exp-1206", "score": 72.36}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 63.06}
9
+ {"model": "qwen2.5-max", "score": 58.35}
10
+ {"model": "gemini-2.0-flash", "score": 65.62}
11
+ {"model": "deepseek-v3", "score": 60.54}
12
+ {"model": "gemini-2.0-flash-exp", "score": 60.39}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 52.28}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 48.02}
15
+ {"model": "o1-mini-2024-09-12", "score": 61.99}
16
+ {"model": "step-2-16k-202411", "score": 48.77}
17
+ {"model": "gpt-4o-2024-08-06", "score": 49.54}
18
+ {"model": "gemini-1.5-pro-002", "score": 59.07}
19
+ {"model": "grok-2-1212", "score": 54.88}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 55.54}
21
+ {"model": "dracarys2-72b-instruct", "score": 54.66}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 41.07}
23
+ {"model": "gpt-4o-2024-11-20", "score": 42.87}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 57.75}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 42.45}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 54.29}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 43.02}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 42.24}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 58.11}
30
+ {"model": "grok-beta", "score": 45.84}
31
+ {"model": "claude-3-opus-20240229", "score": 43.62}
32
+ {"model": "mistral-large-2411", "score": 42.55}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 46.61}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 40.30}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 34.72}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 38.04}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 35.54}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 59.36}
39
+ {"model": "mistral-small-2501", "score": 39.89}
40
+ {"model": "phi-4", "score": 41.98}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 36.31}
42
+ {"model": "qwq-32b-preview", "score": 58.26}
43
+ {"model": "gemma-2-27b-it", "score": 26.46}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 36.70}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 39.51}
46
+ {"model": "mistral-small-2409", "score": 24.42}
47
+ {"model": "command-r-plus-08-2024", "score": 21.27}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 34.49}
49
+ {"model": "gemma-2-9b-it", "score": 19.80}
50
+ {"model": "command-r-08-2024", "score": 19.39}
51
+ {"model": "command-r-plus-04-2024", "score": 17.99}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.31}
53
+ {"model": "phi-3-small-8k-instruct", "score": 17.58}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 15.72}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 13.64}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 14.96}
livebench_reasoning.jsonl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o3-mini-2025-01-31-high", "score": 89.58}
2
+ {"model": "o1-2024-12-17-high", "score": 91.58}
3
+ {"model": "deepseek-r1", "score": 83.17}
4
+ {"model": "o3-mini-2025-01-31-medium", "score": 86.33}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 78.17}
6
+ {"model": "gemini-2.0-pro-exp-02-05", "score": 60.08}
7
+ {"model": "gemini-exp-1206", "score": 57.00}
8
+ {"model": "o3-mini-2025-01-31-low", "score": 69.83}
9
+ {"model": "qwen2.5-max", "score": 51.42}
10
+ {"model": "gemini-2.0-flash", "score": 55.25}
11
+ {"model": "deepseek-v3", "score": 56.75}
12
+ {"model": "gemini-2.0-flash-exp", "score": 59.08}
13
+ {"model": "claude-3-5-sonnet-20241022", "score": 56.67}
14
+ {"model": "chatgpt-4o-latest-2025-01-29", "score": 57.92}
15
+ {"model": "o1-mini-2024-09-12", "score": 72.33}
16
+ {"model": "step-2-16k-202411", "score": 52.17}
17
+ {"model": "gpt-4o-2024-08-06", "score": 53.92}
18
+ {"model": "gemini-1.5-pro-002", "score": 49.08}
19
+ {"model": "grok-2-1212", "score": 54.83}
20
+ {"model": "gemini-2.0-flash-lite-preview-02-05", "score": 50.08}
21
+ {"model": "dracarys2-72b-instruct", "score": 47.38}
22
+ {"model": "meta-llama-3.1-405b-instruct-turbo", "score": 53.25}
23
+ {"model": "gpt-4o-2024-11-20", "score": 55.75}
24
+ {"model": "learnlm-1.5-pro-experimental", "score": 43.42}
25
+ {"model": "chatgpt-4o-latest-0903", "score": 50.50}
26
+ {"model": "qwen2.5-72b-instruct-turbo", "score": 45.42}
27
+ {"model": "gpt-4-turbo-2024-04-09", "score": 50.92}
28
+ {"model": "llama-3.3-70b-instruct-turbo", "score": 50.75}
29
+ {"model": "deepseek-r1-distill-llama-70b", "score": 67.58}
30
+ {"model": "grok-beta", "score": 37.00}
31
+ {"model": "claude-3-opus-20240229", "score": 40.58}
32
+ {"model": "mistral-large-2411", "score": 43.50}
33
+ {"model": "qwen2.5-coder-32b-instruct", "score": 42.08}
34
+ {"model": "dracarys2-llama-3.1-70b-instruct", "score": 44.67}
35
+ {"model": "meta-llama-3.1-70b-instruct-turbo", "score": 43.00}
36
+ {"model": "amazon.nova-pro-v1:0", "score": 32.58}
37
+ {"model": "claude-3-5-haiku-20241022", "score": 28.08}
38
+ {"model": "deepseek-r1-distill-qwen-32b", "score": 52.25}
39
+ {"model": "mistral-small-2501", "score": 36.42}
40
+ {"model": "phi-4", "score": 47.83}
41
+ {"model": "gpt-4o-mini-2024-07-18", "score": 32.75}
42
+ {"model": "qwq-32b-preview", "score": 57.71}
43
+ {"model": "gemma-2-27b-it", "score": 28.08}
44
+ {"model": "amazon.nova-lite-v1:0", "score": 36.67}
45
+ {"model": "qwen2.5-7b-instruct-turbo", "score": 28.42}
46
+ {"model": "mistral-small-2409", "score": 29.92}
47
+ {"model": "command-r-plus-08-2024", "score": 24.75}
48
+ {"model": "amazon.nova-micro-v1:0", "score": 25.08}
49
+ {"model": "gemma-2-9b-it", "score": 15.17}
50
+ {"model": "command-r-08-2024", "score": 21.92}
51
+ {"model": "command-r-plus-04-2024", "score": 20.58}
52
+ {"model": "meta-llama-3.1-8b-instruct-turbo", "score": 13.33}
53
+ {"model": "phi-3-small-8k-instruct", "score": 15.92}
54
+ {"model": "phi-3-mini-128k-instruct", "score": 20.50}
55
+ {"model": "olmo-2-1124-13b-instruct", "score": 16.33}
56
+ {"model": "phi-3-mini-4k-instruct", "score": 26.83}
livecodebench.jsonl ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-2024-12-17 (high)", "score": 73.1}
2
+ {"model": "o3-mini-2025-01-31 (high)", "score": 71.6}
3
+ {"model": "o3-mini-2025-01-31 (medium)", "score": 68.8}
4
+ {"model": "o1-2024-12-17 (medium)", "score": 65.4}
5
+ {"model": "deepseek-r1-preview", "score": 64.3}
6
+ {"model": "o1-2024-12-17 (low)", "score": 62.7}
7
+ {"model": "o3-mini-2025-01-31 (low)", "score": 62.7}
8
+ {"model": "o1-mini-2024-09-12", "score": 54.1}
9
+ {"model": "deepseek-r1-lite-preview", "score": 50.4}
10
+ {"model": "gemini-flash-2.0-thinking-01-21", "score": 45}
11
+ {"model": "qwq-32b-preview", "score": 44}
12
+ {"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4}
13
+ {"model": "o1-preview-2024-09-12", "score": 42.5}
14
+ {"model": "claude-3.5-sonnet-20241022", "score": 37.1}
15
+ {"model": "deepseek-v3", "score": 36.3}
16
+ {"model": "gpt-4o-2024-05-13", "score": 33}
17
+ {"model": "claude-3.5-sonnet-20240620", "score": 32}
18
+ {"model": "gemini-flash-2.0-exp", "score": 32}
19
+ {"model": "gemini-pro-1.5-002", "score": 30.9}
20
+ {"model": "gpt-4o-2024-08-06", "score": 30.5}
21
+ {"model": "gpt-4-turbo-2024-04-09", "score": 29.6}
22
+ {"model": "gemini-flash-1.5-002", "score": 28.4}
23
+ {"model": "gpt-4o-mini-2024-07-18", "score": 27.7}
24
+ {"model": "mistral-large", "score": 27.6}
25
+ {"model": "codestral-latest", "score": 23.8}
26
+ {"model": "claude-3-haiku", "score": 17.1}
models.jsonl CHANGED
@@ -1,3 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
  {"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
  {"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -12,6 +58,7 @@
12
  {"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
13
  {"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
14
  {"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
15
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
16
  {"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
17
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -22,6 +69,7 @@
22
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
23
  {"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
24
  {"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
25
  {"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
26
  {"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
27
  {"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -30,6 +78,7 @@
30
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
31
  {"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
32
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
33
  {"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
34
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
35
  {"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -37,6 +86,7 @@
37
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
38
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
39
  {"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
40
  {"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
41
  {"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
42
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -83,6 +133,7 @@
83
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
84
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
85
  {"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
86
  {"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
87
  {"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
88
  {"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
@@ -90,6 +141,7 @@
90
  {"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
91
  {"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
92
  {"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
93
  {"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
94
  {"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
95
  {"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
@@ -187,6 +239,8 @@
187
  {"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
188
  {"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
189
  {"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
190
  {"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
191
  {"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
192
  {"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -202,6 +256,8 @@
202
  {"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
203
  {"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
204
  {"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
205
  {"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
206
  {"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
207
  {"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
1
+ {"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
+ {"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
+ {"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
4
+ {"Name": "o3-mini-2025-01-31 (low)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
+ {"Name": "o1-2024-12-17-high", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
6
+ {"Name": "o1-2024-12-17 (high)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
7
+ {"Name": "o1-2024-12-17 (medium)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
8
+ {"Name": "o1-2024-12-17 (low)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
9
+ {"Name": "deepseek-r1", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
10
+ {"Name": "deepseek-r1-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
11
+ {"Name": "deepseek-r1-lite-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
12
+ {"Name": "o3-mini-2025-01-31-medium", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
13
+ {"Name": "gemini-2.0-flash-thinking-exp-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
14
+ {"Name": "gemini-flash-2.0-thinking-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
15
+ {"Name": "gemini-2.0-pro-exp-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
16
+ {"Name": "o3-mini-2025-01-31-low", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
17
+ {"Name": "qwen2.5-max", "Release Date": "2025-01-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
18
+ {"Name": "gemini-2.0-flash", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
19
+ {"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
20
+ {"Name": "gemini-flash-2.0-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
21
+ {"Name": "deepseek-v3", "Release Date": "2024-12-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
22
+ {"Name": "chatgpt-4o-latest-2025-01-29", "Release Date": "2025-01-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
23
+ {"Name": "step-2-16k-202411", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
24
+ {"Name": "gemini-2.0-flash-lite-preview-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
25
+ {"Name": "dracarys2-72b-instruct", "Release Date": "2024-09-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
26
+ {"Name": "meta-llama-3.1-405b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
27
+ {"Name": "learnlm-1.5-pro-experimental", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
28
+ {"Name": "chatgpt-4o-latest-0903", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
29
+ {"Name": "qwen2.5-72b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
30
+ {"Name": "llama-3.3-70b-instruct-turbo", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
31
+ {"Name": "deepseek-r1-distill-llama-70b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
32
+ {"Name": "mistral-large-2411", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
33
+ {"Name": "dracarys2-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
34
+ {"Name": "meta-llama-3.1-70b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
35
+ {"Name": "amazon.nova-pro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
36
+ {"Name": "deepseek-r1-distill-qwen-32b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
37
+ {"Name": "mistral-small-2501", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
38
+ {"Name": "phi-4", "Release Date": "2024-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
39
+ {"Name": "qwq-32b-preview", "Release Date": "2024-11-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
40
+ {"Name": "amazon.nova-lite-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
41
+ {"Name": "qwen2.5-7b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
42
+ {"Name": "mistral-small-2409", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
43
+ {"Name": "amazon.nova-micro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
44
+ {"Name": "command-r-plus-04-2024", "Release Date": "2024-04-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
45
+ {"Name": "meta-llama-3.1-8b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
46
+ {"Name": "olmo-2-1124-13b-instruct", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
47
  {"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
48
  {"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
49
  {"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
58
  {"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
59
  {"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
60
  {"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
61
+ {"Name": "gemini-pro-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
62
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
63
  {"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
64
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
69
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
70
  {"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
71
  {"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
72
+ {"Name": "gemini-flash-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
73
  {"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
74
  {"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
75
  {"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
78
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
79
  {"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
80
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
81
+ {"Name": "grok-2", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
82
  {"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
83
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
84
  {"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
86
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
87
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
88
  {"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
89
+ {"Name": "claude-3-5-sonnet", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
90
  {"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
91
  {"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
92
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
133
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
134
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
135
  {"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
136
+ {"Name": "claude-3-haiku", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
137
  {"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
138
  {"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
139
  {"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
 
141
  {"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
142
  {"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
143
  {"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
144
+ {"Name": "mistral-large", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
145
  {"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
146
  {"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
147
  {"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
 
239
  {"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
240
  {"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
241
  {"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
242
+ {"Name": "gemini-flash-2.0-thinking-12-19", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
243
+ {"Name": "gemini-2.0-flash-thinking", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
244
  {"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
245
  {"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
246
  {"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
256
  {"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
257
  {"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
258
  {"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
259
+ {"Name": "codestral-2501", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
260
+ {"Name": "codestral-latest", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
261
  {"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
262
  {"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
263
  {"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
simple_bench_leaderboard.jsonl CHANGED
@@ -1,15 +1,19 @@
1
  {"model": "o1-preview-2024-09-12", "score": 41.7}
2
  {"model": "claude-3-5-sonnet-20241022", "score": 41.4}
3
- {"model": "o1-2024-12-17", "score": 36.7}
 
4
  {"model": "gemini-exp-1206", "score": 31.1}
 
5
  {"model": "claude-3-5-sonnet-20240620", "score": 27.5}
6
  {"model": "gemini-1.5-pro-002", "score": 27.1}
7
  {"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
8
  {"model": "claude-3-opus-20240229", "score": 23.5}
9
  {"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
 
10
  {"model": "grok-beta", "score": 22.7}
11
  {"model": "mistral-large-2407", "score": 22.5}
12
  {"model": "llama-3.3-70b-instruct", "score": 19.9}
 
13
  {"model": "gemini-2.0-flash-exp", "score": 18.9}
14
  {"model": "o1-mini-2024-09-12", "score": 18.1}
15
  {"model": "gpt-4o-2024-08-06", "score": 17.8}
 
1
  {"model": "o1-preview-2024-09-12", "score": 41.7}
2
  {"model": "claude-3-5-sonnet-20241022", "score": 41.4}
3
+ {"model": "o1-2024-12-17 (high)", "score": 40.1}
4
+ {"model": "o1-2024-12-17 (medium)", "score": 36.7}
5
  {"model": "gemini-exp-1206", "score": 31.1}
6
+ {"model": "deepseek-r1", "score": 30.9}
7
  {"model": "claude-3-5-sonnet-20240620", "score": 27.5}
8
  {"model": "gemini-1.5-pro-002", "score": 27.1}
9
  {"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
10
  {"model": "claude-3-opus-20240229", "score": 23.5}
11
  {"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
12
+ {"model": "o3-mini-2025-01-31 (high)", "score": 22.8}
13
  {"model": "grok-beta", "score": 22.7}
14
  {"model": "mistral-large-2407", "score": 22.5}
15
  {"model": "llama-3.3-70b-instruct", "score": 19.9}
16
+ {"model": "deepseek-v3", "score": 18.9}
17
  {"model": "gemini-2.0-flash-exp", "score": 18.9}
18
  {"model": "o1-mini-2024-09-12", "score": 18.1}
19
  {"model": "gpt-4o-2024-08-06", "score": 17.8}