Commit
·
b605a32
1
Parent(s):
a2d5ea0
Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
Browse files- app.py +255 -30
- codeforces_leaderboard.jsonl +0 -6
- humanitys_last_exam.jsonl +5 -0
- livebench.jsonl +56 -0
- livebench_coding.jsonl +56 -0
- livebench_data_analysis.jsonl +56 -0
- livebench_if.jsonl +56 -0
- livebench_language.jsonl +56 -0
- livebench_mathematics.jsonl +56 -0
- livebench_reasoning.jsonl +56 -0
- livecodebench.jsonl +26 -0
- models.jsonl +56 -0
- simple_bench_leaderboard.jsonl +5 -1
app.py
CHANGED
@@ -187,16 +187,25 @@ with gr.Blocks() as demo:
|
|
187 |
|
188 |
| Benchmark | Top Score |
|
189 |
|-----------|-----------|
|
|
|
190 |
| BigCodeBench | 🟠 36% |
|
191 |
| Simple Bench | 🟠 42% |
|
192 |
| PlanBench | 🟠 53% |
|
193 |
| GAIA | 🟡 65% |
|
|
|
|
|
|
|
194 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
|
|
195 |
| GPQA | 🟡 76% |
|
|
|
196 |
| ZebraLogic | 🟡 81% |
|
|
|
197 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
|
|
198 |
| ZeroEval | 🟡 86% |
|
199 |
| MATH-L5 | 🟡 89% |
|
|
|
200 |
| MMLU-Redux | 🟢 93% |
|
201 |
| CRUX | 🟢 96% |
|
202 |
|
@@ -209,6 +218,11 @@ with gr.Blocks() as demo:
|
|
209 |
| 🟡 Yellow | 60% to 90% |
|
210 |
| 🟢 Green | Above 90% |"""
|
211 |
)
|
|
|
|
|
|
|
|
|
|
|
212 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
213 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
214 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
@@ -229,6 +243,21 @@ with gr.Blocks() as demo:
|
|
229 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
230 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
231 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
233 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
234 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
@@ -237,16 +266,36 @@ with gr.Blocks() as demo:
|
|
237 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
238 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
239 |
)
|
|
|
|
|
|
|
|
|
|
|
240 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
241 |
gpqa_plot: gr.Plot = gr.Plot()
|
242 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
243 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
244 |
)
|
|
|
|
|
|
|
|
|
|
|
245 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
246 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
247 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
248 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
249 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
251 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
252 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
@@ -257,6 +306,11 @@ with gr.Blocks() as demo:
|
|
257 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
258 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
259 |
)
|
|
|
|
|
|
|
|
|
|
|
260 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
261 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
262 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
@@ -267,8 +321,6 @@ with gr.Blocks() as demo:
|
|
267 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
268 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
269 |
)
|
270 |
-
with gr.Tab("Codeforces") as codeforces_tab:
|
271 |
-
codeforces_plot: gr.Plot = gr.Plot()
|
272 |
with gr.Tab("OpenCompass", visible=False):
|
273 |
opencompass_plot: gr.Plot = gr.Plot()
|
274 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
@@ -284,6 +336,107 @@ with gr.Blocks() as demo:
|
|
284 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
285 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
286 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
with gr.Tab("Finance") as finance_tab:
|
288 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
289 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
@@ -292,24 +445,30 @@ with gr.Blocks() as demo:
|
|
292 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
293 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
294 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
295 |
-
gr.State(
|
296 |
-
|
|
|
|
|
297 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
298 |
gr.State(0), gr.State(100),
|
299 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
300 |
outputs=arc_agi_public_eval_plot)
|
301 |
arc_agi_tab.select(fn=create_simple_plot,
|
302 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
303 |
-
gr.State(
|
304 |
-
|
|
|
|
|
305 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
306 |
gr.State(0), gr.State(100),
|
307 |
gr.State({"MTurkers": 77})],
|
308 |
outputs=arc_agi_semi_private_eval_plot)
|
309 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
310 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
311 |
-
gr.State(
|
312 |
-
|
|
|
|
|
313 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
314 |
gr.State(0), gr.State(100),
|
315 |
gr.State({"MTurkers": 77})],
|
@@ -318,35 +477,31 @@ with gr.Blocks() as demo:
|
|
318 |
simple_bench_tab.select(fn=create_simple_plot,
|
319 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
320 |
gr.State("Simple Bench Score"),
|
321 |
-
gr.State(
|
322 |
-
|
|
|
323 |
gr.State(0), gr.State(100),
|
324 |
gr.State({"Humans": 83.7})],
|
325 |
outputs=simple_bench_plot)
|
326 |
-
codeforces_tab.select(fn=create_simple_plot,
|
327 |
-
inputs=[gr.State("codeforces_leaderboard.jsonl"),
|
328 |
-
gr.State("Codeforces Rating"),
|
329 |
-
gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
|
330 |
-
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
331 |
-
gr.State(0), gr.State(4000),
|
332 |
-
gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
|
333 |
-
outputs=codeforces_plot)
|
334 |
planbench_tab.select(fn=create_simple_plot,
|
335 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
336 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
337 |
-
gr.State(
|
|
|
338 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
339 |
outputs=planbench_plot)
|
340 |
bigcodebench_tab.select(fn=create_simple_plot,
|
341 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
342 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
343 |
-
gr.State(
|
|
|
344 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
345 |
outputs=bigcodebench_plot)
|
346 |
gaia_tab.select(fn=create_simple_plot,
|
347 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
348 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
349 |
-
gr.State(
|
|
|
350 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
351 |
gr.State(0), gr.State(100),
|
352 |
gr.State({"Humans": 92})],
|
@@ -354,7 +509,8 @@ with gr.Blocks() as demo:
|
|
354 |
gpqa_tab.select(fn=create_simple_plot,
|
355 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
356 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
357 |
-
gr.State(
|
|
|
358 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
359 |
gr.State(25), gr.State(100),
|
360 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
@@ -362,34 +518,103 @@ with gr.Blocks() as demo:
|
|
362 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
363 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
364 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
365 |
-
gr.State(
|
|
|
366 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
367 |
outputs=zeroeval_average_plot)
|
368 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
369 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
370 |
-
gr.State(
|
371 |
-
|
|
|
|
|
372 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
373 |
outputs=zeroeval_mmlu_redux_plot)
|
374 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
375 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
376 |
gr.State("ZeroEval ZebraLogic Score"),
|
377 |
-
gr.State(
|
|
|
378 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
379 |
outputs=zeroeval_zebralogic_plot)
|
380 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
381 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
382 |
-
gr.State(
|
383 |
-
|
|
|
|
|
384 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
385 |
outputs=zeroeval_crux_plot)
|
386 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
387 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
388 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
389 |
-
gr.State(
|
|
|
390 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
391 |
outputs=zeroeval_math_l5_plot)
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
|
394 |
if __name__ == "__main__":
|
395 |
demo.launch()
|
|
|
187 |
|
188 |
| Benchmark | Top Score |
|
189 |
|-----------|-----------|
|
190 |
+
| Humanity's Last Exam | 🔴 7% |
|
191 |
| BigCodeBench | 🟠 36% |
|
192 |
| Simple Bench | 🟠 42% |
|
193 |
| PlanBench | 🟠 53% |
|
194 |
| GAIA | 🟡 65% |
|
195 |
+
| LiveBench Language | 🟡 65% |
|
196 |
+
| LiveBench Data Analysis | 🟡 71% |
|
197 |
+
| LiveCodeBench | 🟡 73% |
|
198 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
199 |
+
| LiveBench | 🟡 76% |
|
200 |
| GPQA | 🟡 76% |
|
201 |
+
| LiveBench Mathematics | 🟡 81% |
|
202 |
| ZebraLogic | 🟡 81% |
|
203 |
+
| LiveBench Coding | 🟡 83% |
|
204 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
205 |
+
| LiveBench IF | 🟡 86% |
|
206 |
| ZeroEval | 🟡 86% |
|
207 |
| MATH-L5 | 🟡 89% |
|
208 |
+
| LiveBench Reasoning | 🟢 92% |
|
209 |
| MMLU-Redux | 🟢 93% |
|
210 |
| CRUX | 🟢 96% |
|
211 |
|
|
|
218 |
| 🟡 Yellow | 60% to 90% |
|
219 |
| 🟢 Green | Above 90% |"""
|
220 |
)
|
221 |
+
with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab:
|
222 |
+
humanitys_last_exam_plot: gr.Plot = gr.Plot()
|
223 |
+
humanitys_last_exam_markdown: gr.Markdown = gr.Markdown(
|
224 |
+
value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)"""
|
225 |
+
)
|
226 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
227 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
228 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
|
|
243 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
244 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
245 |
)
|
246 |
+
with gr.Tab("🟡 LiveBench Language") as livebench_language_tab:
|
247 |
+
livebench_language_plot: gr.Plot = gr.Plot()
|
248 |
+
livebench_language_markdown: gr.Markdown = gr.Markdown(
|
249 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
250 |
+
)
|
251 |
+
with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab:
|
252 |
+
livebench_data_analysis_plot: gr.Plot = gr.Plot()
|
253 |
+
livebench_data_analysis_markdown: gr.Markdown = gr.Markdown(
|
254 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
255 |
+
)
|
256 |
+
with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab:
|
257 |
+
livecodebench_plot: gr.Plot = gr.Plot()
|
258 |
+
livecodebench_markdown: gr.Markdown = gr.Markdown(
|
259 |
+
value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)"""
|
260 |
+
)
|
261 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
262 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
263 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
|
|
266 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
267 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
268 |
)
|
269 |
+
with gr.Tab("🟡 LiveBench") as livebench_tab:
|
270 |
+
livebench_plot: gr.Plot = gr.Plot()
|
271 |
+
livebench_markdown: gr.Markdown = gr.Markdown(
|
272 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
273 |
+
)
|
274 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
275 |
gpqa_plot: gr.Plot = gr.Plot()
|
276 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
277 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
278 |
)
|
279 |
+
with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab:
|
280 |
+
livebench_mathematics_plot: gr.Plot = gr.Plot()
|
281 |
+
livebench_mathematics_markdown: gr.Markdown = gr.Markdown(
|
282 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
283 |
+
)
|
284 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
285 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
286 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
287 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
288 |
)
|
289 |
+
with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab:
|
290 |
+
livebench_coding_plot: gr.Plot = gr.Plot()
|
291 |
+
livebench_coding_markdown: gr.Markdown = gr.Markdown(
|
292 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
293 |
+
)
|
294 |
+
with gr.Tab("🟡 LiveBench IF") as livebench_if_tab:
|
295 |
+
livebench_if_plot: gr.Plot = gr.Plot()
|
296 |
+
livebench_if_markdown: gr.Markdown = gr.Markdown(
|
297 |
+
value="""Source: [LiveBench IF](https://livebench.ai/)"""
|
298 |
+
)
|
299 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
300 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
301 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
|
|
306 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
307 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
308 |
)
|
309 |
+
with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab:
|
310 |
+
livebench_reasoning_plot: gr.Plot = gr.Plot()
|
311 |
+
livebench_reasoning_markdown: gr.Markdown = gr.Markdown(
|
312 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
313 |
+
)
|
314 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
315 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
316 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
|
|
321 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
322 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
323 |
)
|
|
|
|
|
324 |
with gr.Tab("OpenCompass", visible=False):
|
325 |
opencompass_plot: gr.Plot = gr.Plot()
|
326 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
|
|
336 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
337 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
338 |
)
|
339 |
+
with gr.Tab("OSWorld", visible=False):
|
340 |
+
osworld_plot: gr.Plot = gr.Plot()
|
341 |
+
osworld_markdown: gr.Markdown = gr.Markdown(
|
342 |
+
value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
|
343 |
+
)
|
344 |
+
with gr.Tab("EMMA-Mini", visible=False):
|
345 |
+
emma_plot: gr.Plot = gr.Plot()
|
346 |
+
emma_markdown: gr.Markdown = gr.Markdown(
|
347 |
+
value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
|
348 |
+
)
|
349 |
+
with gr.Tab("MathVista", visible=False):
|
350 |
+
mathvista_plot: gr.Plot = gr.Plot()
|
351 |
+
mathvista_markdown: gr.Markdown = gr.Markdown(
|
352 |
+
value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)"""
|
353 |
+
)
|
354 |
+
with gr.Tab("DABStep", visible=False):
|
355 |
+
dabstep_plot: gr.Plot = gr.Plot()
|
356 |
+
dabstep_markdown: gr.Markdown = gr.Markdown(
|
357 |
+
value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)"""
|
358 |
+
)
|
359 |
+
with gr.Tab("lineage-bench", visible=False):
|
360 |
+
lineage_bench_plot: gr.Plot = gr.Plot()
|
361 |
+
lineage_bench_markdown: gr.Markdown = gr.Markdown(
|
362 |
+
value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)"""
|
363 |
+
)
|
364 |
+
with gr.Tab("Step-Game", visible=False):
|
365 |
+
step_game_plot: gr.Plot = gr.Plot()
|
366 |
+
step_game_markdown: gr.Markdown = gr.Markdown(
|
367 |
+
value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)"""
|
368 |
+
)
|
369 |
+
with gr.Tab("HHEM", visible=False):
|
370 |
+
hhem_plot: gr.Plot = gr.Plot()
|
371 |
+
hhem_markdown: gr.Markdown = gr.Markdown(
|
372 |
+
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
373 |
+
)
|
374 |
+
with gr.Tab("NYT Connections", visible=False):
|
375 |
+
nyt_connections_exam_plot: gr.Plot = gr.Plot()
|
376 |
+
nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
|
377 |
+
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
378 |
+
)
|
379 |
+
with gr.Tab("USACO", visible=False):
|
380 |
+
usaco_plot: gr.Plot = gr.Plot()
|
381 |
+
usaco_markdown: gr.Markdown = gr.Markdown(
|
382 |
+
value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)"""
|
383 |
+
)
|
384 |
+
with gr.Tab("AppWorld", visible=False):
|
385 |
+
appworld_plot: gr.Plot = gr.Plot()
|
386 |
+
appworld_markdown: gr.Markdown = gr.Markdown(
|
387 |
+
value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)"""
|
388 |
+
)
|
389 |
+
with gr.Tab("CORE-Bench", visible=False):
|
390 |
+
core_bench_plot: gr.Plot = gr.Plot()
|
391 |
+
core_bench_markdown: gr.Markdown = gr.Markdown(
|
392 |
+
value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)"""
|
393 |
+
)
|
394 |
+
with gr.Tab("Cybench", visible=False):
|
395 |
+
cybench_plot: gr.Plot = gr.Plot()
|
396 |
+
cybench_markdown: gr.Markdown = gr.Markdown(
|
397 |
+
value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)"""
|
398 |
+
)
|
399 |
+
with gr.Tab("MultiChallenge", visible=False):
|
400 |
+
multichallenge_plot: gr.Plot = gr.Plot()
|
401 |
+
multichallenge_markdown: gr.Markdown = gr.Markdown(
|
402 |
+
value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)"""
|
403 |
+
)
|
404 |
+
with gr.Tab("VISTA", visible=False):
|
405 |
+
vista_plot: gr.Plot = gr.Plot()
|
406 |
+
vista_markdown: gr.Markdown = gr.Markdown(
|
407 |
+
value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)"""
|
408 |
+
)
|
409 |
+
with gr.Tab("ToolComp", visible=False):
|
410 |
+
with gr.Tab("Enterprise"):
|
411 |
+
toolcomp_enterprise_plot: gr.Plot = gr.Plot()
|
412 |
+
toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown(
|
413 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)"""
|
414 |
+
)
|
415 |
+
with gr.Tab("Chat"):
|
416 |
+
toolcomp_chat_plot: gr.Plot = gr.Plot()
|
417 |
+
toolcomp_chat_markdown: gr.Markdown = gr.Markdown(
|
418 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)"""
|
419 |
+
)
|
420 |
+
with gr.Tab("BFCL", visible=False):
|
421 |
+
bfcl_plot: gr.Plot = gr.Plot()
|
422 |
+
bfcl_markdown: gr.Markdown = gr.Markdown(
|
423 |
+
value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)"""
|
424 |
+
)
|
425 |
+
with gr.Tab("EvalPlus", visible=False):
|
426 |
+
evalplus_plot: gr.Plot = gr.Plot()
|
427 |
+
evalplus_markdown: gr.Markdown = gr.Markdown(
|
428 |
+
value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)"""
|
429 |
+
)
|
430 |
+
with gr.Tab("Aider Polyglot", visible=False):
|
431 |
+
aider_plot: gr.Plot = gr.Plot()
|
432 |
+
aider_markdown: gr.Markdown = gr.Markdown(
|
433 |
+
value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)"""
|
434 |
+
)
|
435 |
+
with gr.Tab("QuALITY", visible=False):
|
436 |
+
quality_plot: gr.Plot = gr.Plot()
|
437 |
+
quality_markdown: gr.Markdown = gr.Markdown(
|
438 |
+
value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
|
439 |
+
)
|
440 |
with gr.Tab("Finance") as finance_tab:
|
441 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
442 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
|
|
445 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
446 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
447 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
448 |
+
gr.State(
|
449 |
+
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
450 |
+
gr.State(
|
451 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
452 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
453 |
gr.State(0), gr.State(100),
|
454 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
455 |
outputs=arc_agi_public_eval_plot)
|
456 |
arc_agi_tab.select(fn=create_simple_plot,
|
457 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
458 |
+
gr.State(
|
459 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
460 |
+
gr.State(
|
461 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
462 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
463 |
gr.State(0), gr.State(100),
|
464 |
gr.State({"MTurkers": 77})],
|
465 |
outputs=arc_agi_semi_private_eval_plot)
|
466 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
467 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
468 |
+
gr.State(
|
469 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
470 |
+
gr.State(
|
471 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
472 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
473 |
gr.State(0), gr.State(100),
|
474 |
gr.State({"MTurkers": 77})],
|
|
|
477 |
simple_bench_tab.select(fn=create_simple_plot,
|
478 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
479 |
gr.State("Simple Bench Score"),
|
480 |
+
gr.State(
|
481 |
+
"\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
|
482 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
|
483 |
gr.State(0), gr.State(100),
|
484 |
gr.State({"Humans": 83.7})],
|
485 |
outputs=simple_bench_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
486 |
planbench_tab.select(fn=create_simple_plot,
|
487 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
488 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
489 |
+
gr.State(
|
490 |
+
"\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
|
491 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
492 |
outputs=planbench_plot)
|
493 |
bigcodebench_tab.select(fn=create_simple_plot,
|
494 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
495 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
496 |
+
gr.State(
|
497 |
+
"\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
|
498 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
499 |
outputs=bigcodebench_plot)
|
500 |
gaia_tab.select(fn=create_simple_plot,
|
501 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
502 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
503 |
+
gr.State(
|
504 |
+
"\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
|
505 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
506 |
gr.State(0), gr.State(100),
|
507 |
gr.State({"Humans": 92})],
|
|
|
509 |
gpqa_tab.select(fn=create_simple_plot,
|
510 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
511 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
512 |
+
gr.State(
|
513 |
+
"\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
|
514 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
515 |
gr.State(25), gr.State(100),
|
516 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
|
|
518 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
519 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
520 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
521 |
+
gr.State(
|
522 |
+
"\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
|
523 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
524 |
outputs=zeroeval_average_plot)
|
525 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
526 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
527 |
+
gr.State(
|
528 |
+
"ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
|
529 |
+
gr.State(
|
530 |
+
"\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
|
531 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
532 |
outputs=zeroeval_mmlu_redux_plot)
|
533 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
534 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
535 |
gr.State("ZeroEval ZebraLogic Score"),
|
536 |
+
gr.State(
|
537 |
+
"\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
|
538 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
539 |
outputs=zeroeval_zebralogic_plot)
|
540 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
541 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
542 |
+
gr.State(
|
543 |
+
"ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
|
544 |
+
gr.State(
|
545 |
+
"\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
|
546 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
547 |
outputs=zeroeval_crux_plot)
|
548 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
549 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
550 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
551 |
+
gr.State(
|
552 |
+
"\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
|
553 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
554 |
outputs=zeroeval_math_l5_plot)
|
555 |
+
livebench_tab.select(fn=create_simple_plot,
|
556 |
+
inputs=[gr.State("livebench.jsonl"),
|
557 |
+
gr.State("LiveBench-2024-11-25: Global Average Score"),
|
558 |
+
gr.State(
|
559 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
560 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
561 |
+
outputs=livebench_plot)
|
562 |
+
livebench_reasoning_tab.select(fn=create_simple_plot,
|
563 |
+
inputs=[gr.State("livebench_reasoning.jsonl"),
|
564 |
+
gr.State("LiveBench-2024-11-25: Reasoning Average Score"),
|
565 |
+
gr.State(
|
566 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
567 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
568 |
+
outputs=livebench_reasoning_plot)
|
569 |
+
livebench_coding_tab.select(fn=create_simple_plot,
|
570 |
+
inputs=[gr.State("livebench_coding.jsonl"),
|
571 |
+
gr.State("LiveBench-2024-11-25: Coding Average Score"),
|
572 |
+
gr.State(
|
573 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
574 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
575 |
+
outputs=livebench_coding_plot)
|
576 |
+
livebench_mathematics_tab.select(fn=create_simple_plot,
|
577 |
+
inputs=[gr.State("livebench_mathematics.jsonl"),
|
578 |
+
gr.State("LiveBench-2024-11-25: Mathematics Average Score"),
|
579 |
+
gr.State(
|
580 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
581 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
582 |
+
outputs=livebench_mathematics_plot)
|
583 |
+
livebench_data_analysis_tab.select(fn=create_simple_plot,
|
584 |
+
inputs=[gr.State("livebench_data_analysis.jsonl"),
|
585 |
+
gr.State("LiveBench-2024-11-25: Data Analysis Average Score"),
|
586 |
+
gr.State(
|
587 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
588 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
589 |
+
outputs=livebench_data_analysis_plot)
|
590 |
+
livebench_language_tab.select(fn=create_simple_plot,
|
591 |
+
inputs=[gr.State("livebench_language.jsonl"),
|
592 |
+
gr.State("LiveBench-2024-11-25: Language Average Score"),
|
593 |
+
gr.State(
|
594 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
595 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
596 |
+
outputs=livebench_language_plot)
|
597 |
+
livebench_if_tab.select(fn=create_simple_plot,
|
598 |
+
inputs=[gr.State("livebench_if.jsonl"),
|
599 |
+
gr.State("LiveBench-2024-11-25: IF Average Score"),
|
600 |
+
gr.State(
|
601 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
602 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
603 |
+
outputs=livebench_if_plot)
|
604 |
+
humanitys_last_exam_tab.select(fn=create_simple_plot,
|
605 |
+
inputs=[gr.State("humanitys_last_exam.jsonl"),
|
606 |
+
gr.State("Humanity's Last Exam (Multi-Modal Models Only) Score"),
|
607 |
+
gr.State(
|
608 |
+
"\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)"),
|
609 |
+
gr.State(date(2024, 5, 13)), gr.State(date(2025, 2, 11))],
|
610 |
+
outputs=humanitys_last_exam_plot)
|
611 |
+
livecodebench_tab.select(fn=create_simple_plot,
|
612 |
+
inputs=[gr.State("livecodebench.jsonl"),
|
613 |
+
gr.State("LiveCodeBench (7/1/2024 to 2/1/2025) Score"),
|
614 |
+
gr.State(
|
615 |
+
"\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
|
616 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
|
617 |
+
outputs=livecodebench_plot)
|
618 |
|
619 |
if __name__ == "__main__":
|
620 |
demo.launch()
|
codeforces_leaderboard.jsonl
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{"model": "o3", "score": 2400}
|
2 |
-
{"model": "o3-mini", "score": 2073}
|
3 |
-
{"model": "o1", "score": 1673}
|
4 |
-
{"model": "o1-mini", "score": 1650}
|
5 |
-
{"model": "o1-preview", "score": 1258}
|
6 |
-
{"model": "gpt-4o", "score": 808}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humanitys_last_exam.jsonl
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "gpt-4o", "score": 3.1}
|
2 |
+
{"model": "grok-2", "score": 3.9}
|
3 |
+
{"model": "claude-3-5-sonnet", "score": 4.8}
|
4 |
+
{"model": "gemini-2.0-flash-thinking", "score": 7.2}
|
5 |
+
{"model": "o1", "score": 7.2}
|
livebench.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 75.88}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 75.67}
|
3 |
+
{"model": "deepseek-r1", "score": 71.57}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 70.01}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 66.92}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 65.13}
|
7 |
+
{"model": "gemini-exp-1206", "score": 64.09}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.45}
|
9 |
+
{"model": "qwen2.5-max", "score": 62.29}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 61.47}
|
11 |
+
{"model": "deepseek-v3", "score": 60.45}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.26}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 59.03}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.79}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.76}
|
16 |
+
{"model": "step-2-16k-202411", "score": 56.02}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 55.33}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.33}
|
19 |
+
{"model": "grok-2-1212", "score": 54.30}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 53.24}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 52.64}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 52.36}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 52.19}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 52.19}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 51.66}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.44}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.40}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.16}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 49.66}
|
30 |
+
{"model": "grok-beta", "score": 49.18}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 49.16}
|
32 |
+
{"model": "mistral-large-2411", "score": 48.43}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.23}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 46.21}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 44.89}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 43.53}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 43.45}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 42.93}
|
39 |
+
{"model": "mistral-small-2501", "score": 42.55}
|
40 |
+
{"model": "phi-4", "score": 41.61}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 41.26}
|
42 |
+
{"model": "qwq-32b-preview", "score": 40.25}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 38.18}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.35}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 34.90}
|
46 |
+
{"model": "mistral-small-2409", "score": 33.42}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 31.76}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 29.59}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 28.66}
|
50 |
+
{"model": "command-r-08-2024", "score": 27.48}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 27.11}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 25.97}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 24.03}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 22.36}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 22.12}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 22.08}
|
livebench_coding.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 82.74}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 69.69}
|
3 |
+
{"model": "deepseek-r1", "score": 66.74}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 65.38}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 53.49}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 63.49}
|
7 |
+
{"model": "gemini-exp-1206", "score": 63.41}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 61.46}
|
9 |
+
{"model": "qwen2.5-max", "score": 64.41}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 53.92}
|
11 |
+
{"model": "deepseek-v3", "score": 61.77}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 54.36}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 67.13}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 60.56}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 48.05}
|
16 |
+
{"model": "step-2-16k-202411", "score": 47.19}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 51.44}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 48.80}
|
19 |
+
{"model": "grok-2-1212", "score": 46.44}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 43.80}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 58.92}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 42.65}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 46.08}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 46.87}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 47.44}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 57.64}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 49.00}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 36.59}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 50.97}
|
30 |
+
{"model": "grok-beta", "score": 45.15}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 38.59}
|
32 |
+
{"model": "mistral-large-2411", "score": 47.08}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 56.85}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 36.31}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 33.49}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.15}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 51.36}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 32.85}
|
39 |
+
{"model": "mistral-small-2501", "score": 35.31}
|
40 |
+
{"model": "phi-4", "score": 30.67}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 43.15}
|
42 |
+
{"model": "qwq-32b-preview", "score": 37.20}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 35.95}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 27.46}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 38.37}
|
46 |
+
{"model": "mistral-small-2409", "score": 25.74}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 19.14}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 20.18}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 22.46}
|
50 |
+
{"model": "command-r-08-2024", "score": 17.90}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 19.46}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.74}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 20.26}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.04}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 10.41}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 15.54}
|
livebench_data_analysis.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 70.64}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 65.47}
|
3 |
+
{"model": "deepseek-r1", "score": 69.78}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 66.56}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 69.37}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 68.02}
|
7 |
+
{"model": "gemini-exp-1206", "score": 63.16}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.04}
|
9 |
+
{"model": "qwen2.5-max", "score": 67.93}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 67.55}
|
11 |
+
{"model": "deepseek-v3", "score": 60.94}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 61.67}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 55.03}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 66.00}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.92}
|
16 |
+
{"model": "step-2-16k-202411", "score": 63.72}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 60.91}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.97}
|
19 |
+
{"model": "grok-2-1212", "score": 54.45}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 57.47}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 55.51}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 55.85}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 56.15}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 54.97}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 57.93}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.91}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 54.36}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 49.49}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 55.93}
|
30 |
+
{"model": "grok-beta", "score": 54.27}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 57.89}
|
32 |
+
{"model": "mistral-large-2411", "score": 50.15}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 49.87}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 53.98}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 53.75}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 48.31}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 48.45}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 45.41}
|
39 |
+
{"model": "mistral-small-2501", "score": 53.69}
|
40 |
+
{"model": "phi-4", "score": 45.17}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 49.96}
|
42 |
+
{"model": "qwq-32b-preview", "score": 31.62}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 47.87}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 37.23}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 35.22}
|
46 |
+
{"model": "mistral-small-2409", "score": 42.73}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 38.06}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 33.95}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 36.39}
|
50 |
+
{"model": "command-r-08-2024", "score": 33.34}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 25.48}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 32.82}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 30.29}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 34.69}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 20.60}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 30.21}
|
livebench_if.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 84.36}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 81.55}
|
3 |
+
{"model": "deepseek-r1", "score": 80.51}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 83.16}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 82.47}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 83.38}
|
7 |
+
{"model": "gemini-exp-1206", "score": 77.34}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 80.06}
|
9 |
+
{"model": "qwen2.5-max", "score": 75.35}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 85.79}
|
11 |
+
{"model": "deepseek-v3", "score": 75.25}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 81.86}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 69.30}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 65.07}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 65.40}
|
16 |
+
{"model": "step-2-16k-202411", "score": 79.88}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 68.58}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 70.78}
|
19 |
+
{"model": "grok-2-1212", "score": 69.63}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 78.28}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 65.22}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 75.90}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 64.94}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 68.16}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 66.37}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 64.39}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 60.85}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 82.67}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 41.55}
|
30 |
+
{"model": "grok-beta", "score": 69.62}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 63.89}
|
32 |
+
{"model": "mistral-large-2411", "score": 67.93}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 58.69}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 63.24}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 68.98}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 67.13}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 61.88}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 40.92}
|
39 |
+
{"model": "mistral-small-2501", "score": 59.54}
|
40 |
+
{"model": "phi-4", "score": 58.38}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 56.80}
|
42 |
+
{"model": "qwq-32b-preview", "score": 35.59}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 58.10}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 54.13}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 52.11}
|
46 |
+
{"model": "mistral-small-2409", "score": 53.23}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 57.61}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 48.04}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 52.62}
|
50 |
+
{"model": "command-r-08-2024", "score": 55.62}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 59.47}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 54.90}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 47.20}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 39.08}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 60.56}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 36.36}
|
livebench_language.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 50.68}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 65.39}
|
3 |
+
{"model": "deepseek-r1", "score": 48.53}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 46.26}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 42.18}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 44.85}
|
7 |
+
{"model": "gemini-exp-1206", "score": 51.29}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 38.25}
|
9 |
+
{"model": "qwen2.5-max", "score": 56.28}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 40.69}
|
11 |
+
{"model": "deepseek-v3", "score": 47.48}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 38.22}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 53.76}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 49.14}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 40.89}
|
16 |
+
{"model": "step-2-16k-202411", "score": 44.39}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 47.59}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 43.29}
|
19 |
+
{"model": "grok-2-1212", "score": 45.58}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 34.28}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 34.12}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 45.46}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 47.37}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 41.98}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 45.30}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 34.99}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 44.26}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 39.20}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 23.81}
|
30 |
+
{"model": "grok-beta", "score": 43.16}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 50.39}
|
32 |
+
{"model": "mistral-large-2411", "score": 39.39}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 23.25}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 38.78}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 35.42}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 36.96}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.37}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 26.82}
|
39 |
+
{"model": "mistral-small-2501", "score": 30.46}
|
40 |
+
{"model": "phi-4", "score": 25.61}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 28.61}
|
42 |
+
{"model": "qwq-32b-preview", "score": 21.09}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 32.62}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 25.93}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 15.80}
|
46 |
+
{"model": "mistral-small-2409", "score": 24.49}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 29.73}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 15.78}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 25.53}
|
50 |
+
{"model": "command-r-08-2024", "score": 16.72}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 19.70}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 17.71}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 12.94}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 9.15}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 11.16}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 8.56}
|
livebench_mathematics.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 77.29}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 80.32}
|
3 |
+
{"model": "deepseek-r1", "score": 80.71}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 72.37}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 75.85}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 70.97}
|
7 |
+
{"model": "gemini-exp-1206", "score": 72.36}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 63.06}
|
9 |
+
{"model": "qwen2.5-max", "score": 58.35}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 65.62}
|
11 |
+
{"model": "deepseek-v3", "score": 60.54}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 60.39}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 52.28}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 48.02}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 61.99}
|
16 |
+
{"model": "step-2-16k-202411", "score": 48.77}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 49.54}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 59.07}
|
19 |
+
{"model": "grok-2-1212", "score": 54.88}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 55.54}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 54.66}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 41.07}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 42.87}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 57.75}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 42.45}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 54.29}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 43.02}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 42.24}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 58.11}
|
30 |
+
{"model": "grok-beta", "score": 45.84}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 43.62}
|
32 |
+
{"model": "mistral-large-2411", "score": 42.55}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.61}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 40.30}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 34.72}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.04}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.54}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 59.36}
|
39 |
+
{"model": "mistral-small-2501", "score": 39.89}
|
40 |
+
{"model": "phi-4", "score": 41.98}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 36.31}
|
42 |
+
{"model": "qwq-32b-preview", "score": 58.26}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 26.46}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.70}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 39.51}
|
46 |
+
{"model": "mistral-small-2409", "score": 24.42}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 21.27}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 34.49}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 19.80}
|
50 |
+
{"model": "command-r-08-2024", "score": 19.39}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 17.99}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.31}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 17.58}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.72}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 13.64}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 14.96}
|
livebench_reasoning.jsonl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 89.58}
|
2 |
+
{"model": "o1-2024-12-17-high", "score": 91.58}
|
3 |
+
{"model": "deepseek-r1", "score": 83.17}
|
4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 86.33}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 78.17}
|
6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 60.08}
|
7 |
+
{"model": "gemini-exp-1206", "score": 57.00}
|
8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 69.83}
|
9 |
+
{"model": "qwen2.5-max", "score": 51.42}
|
10 |
+
{"model": "gemini-2.0-flash", "score": 55.25}
|
11 |
+
{"model": "deepseek-v3", "score": 56.75}
|
12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.08}
|
13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 56.67}
|
14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.92}
|
15 |
+
{"model": "o1-mini-2024-09-12", "score": 72.33}
|
16 |
+
{"model": "step-2-16k-202411", "score": 52.17}
|
17 |
+
{"model": "gpt-4o-2024-08-06", "score": 53.92}
|
18 |
+
{"model": "gemini-1.5-pro-002", "score": 49.08}
|
19 |
+
{"model": "grok-2-1212", "score": 54.83}
|
20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 50.08}
|
21 |
+
{"model": "dracarys2-72b-instruct", "score": 47.38}
|
22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 53.25}
|
23 |
+
{"model": "gpt-4o-2024-11-20", "score": 55.75}
|
24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 43.42}
|
25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 50.50}
|
26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 45.42}
|
27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.92}
|
28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.75}
|
29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 67.58}
|
30 |
+
{"model": "grok-beta", "score": 37.00}
|
31 |
+
{"model": "claude-3-opus-20240229", "score": 40.58}
|
32 |
+
{"model": "mistral-large-2411", "score": 43.50}
|
33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 42.08}
|
34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 44.67}
|
35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 43.00}
|
36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 32.58}
|
37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 28.08}
|
38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 52.25}
|
39 |
+
{"model": "mistral-small-2501", "score": 36.42}
|
40 |
+
{"model": "phi-4", "score": 47.83}
|
41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 32.75}
|
42 |
+
{"model": "qwq-32b-preview", "score": 57.71}
|
43 |
+
{"model": "gemma-2-27b-it", "score": 28.08}
|
44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.67}
|
45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 28.42}
|
46 |
+
{"model": "mistral-small-2409", "score": 29.92}
|
47 |
+
{"model": "command-r-plus-08-2024", "score": 24.75}
|
48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 25.08}
|
49 |
+
{"model": "gemma-2-9b-it", "score": 15.17}
|
50 |
+
{"model": "command-r-08-2024", "score": 21.92}
|
51 |
+
{"model": "command-r-plus-04-2024", "score": 20.58}
|
52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 13.33}
|
53 |
+
{"model": "phi-3-small-8k-instruct", "score": 15.92}
|
54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 20.50}
|
55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 16.33}
|
56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 26.83}
|
livecodebench.jsonl
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17 (high)", "score": 73.1}
|
2 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 71.6}
|
3 |
+
{"model": "o3-mini-2025-01-31 (medium)", "score": 68.8}
|
4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 65.4}
|
5 |
+
{"model": "deepseek-r1-preview", "score": 64.3}
|
6 |
+
{"model": "o1-2024-12-17 (low)", "score": 62.7}
|
7 |
+
{"model": "o3-mini-2025-01-31 (low)", "score": 62.7}
|
8 |
+
{"model": "o1-mini-2024-09-12", "score": 54.1}
|
9 |
+
{"model": "deepseek-r1-lite-preview", "score": 50.4}
|
10 |
+
{"model": "gemini-flash-2.0-thinking-01-21", "score": 45}
|
11 |
+
{"model": "qwq-32b-preview", "score": 44}
|
12 |
+
{"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4}
|
13 |
+
{"model": "o1-preview-2024-09-12", "score": 42.5}
|
14 |
+
{"model": "claude-3.5-sonnet-20241022", "score": 37.1}
|
15 |
+
{"model": "deepseek-v3", "score": 36.3}
|
16 |
+
{"model": "gpt-4o-2024-05-13", "score": 33}
|
17 |
+
{"model": "claude-3.5-sonnet-20240620", "score": 32}
|
18 |
+
{"model": "gemini-flash-2.0-exp", "score": 32}
|
19 |
+
{"model": "gemini-pro-1.5-002", "score": 30.9}
|
20 |
+
{"model": "gpt-4o-2024-08-06", "score": 30.5}
|
21 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 29.6}
|
22 |
+
{"model": "gemini-flash-1.5-002", "score": 28.4}
|
23 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 27.7}
|
24 |
+
{"model": "mistral-large", "score": 27.6}
|
25 |
+
{"model": "codestral-latest", "score": 23.8}
|
26 |
+
{"model": "claude-3-haiku", "score": 17.1}
|
models.jsonl
CHANGED
@@ -1,3 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -12,6 +58,7 @@
|
|
12 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
15 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
16 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
17 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -22,6 +69,7 @@
|
|
22 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
23 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
25 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
27 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -30,6 +78,7 @@
|
|
30 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
33 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
34 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
35 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -37,6 +86,7 @@
|
|
37 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
38 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
39 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
40 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
41 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
42 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -83,6 +133,7 @@
|
|
83 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
84 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
85 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
86 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
87 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
88 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
@@ -90,6 +141,7 @@
|
|
90 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
91 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
92 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
93 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
94 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
95 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
@@ -187,6 +239,8 @@
|
|
187 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
188 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
189 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
190 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
191 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
192 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -202,6 +256,8 @@
|
|
202 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
203 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
204 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
205 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
206 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
207 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
1 |
+
{"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
+
{"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
+
{"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
4 |
+
{"Name": "o3-mini-2025-01-31 (low)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
5 |
+
{"Name": "o1-2024-12-17-high", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
6 |
+
{"Name": "o1-2024-12-17 (high)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
7 |
+
{"Name": "o1-2024-12-17 (medium)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
8 |
+
{"Name": "o1-2024-12-17 (low)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
9 |
+
{"Name": "deepseek-r1", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
10 |
+
{"Name": "deepseek-r1-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
11 |
+
{"Name": "deepseek-r1-lite-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
12 |
+
{"Name": "o3-mini-2025-01-31-medium", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
+
{"Name": "gemini-2.0-flash-thinking-exp-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
+
{"Name": "gemini-flash-2.0-thinking-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
15 |
+
{"Name": "gemini-2.0-pro-exp-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
16 |
+
{"Name": "o3-mini-2025-01-31-low", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
17 |
+
{"Name": "qwen2.5-max", "Release Date": "2025-01-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
18 |
+
{"Name": "gemini-2.0-flash", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
19 |
+
{"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
20 |
+
{"Name": "gemini-flash-2.0-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
21 |
+
{"Name": "deepseek-v3", "Release Date": "2024-12-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
22 |
+
{"Name": "chatgpt-4o-latest-2025-01-29", "Release Date": "2025-01-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
23 |
+
{"Name": "step-2-16k-202411", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
+
{"Name": "gemini-2.0-flash-lite-preview-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
25 |
+
{"Name": "dracarys2-72b-instruct", "Release Date": "2024-09-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
+
{"Name": "meta-llama-3.1-405b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
27 |
+
{"Name": "learnlm-1.5-pro-experimental", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
28 |
+
{"Name": "chatgpt-4o-latest-0903", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
29 |
+
{"Name": "qwen2.5-72b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
30 |
+
{"Name": "llama-3.3-70b-instruct-turbo", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
+
{"Name": "deepseek-r1-distill-llama-70b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
+
{"Name": "mistral-large-2411", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
33 |
+
{"Name": "dracarys2-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
34 |
+
{"Name": "meta-llama-3.1-70b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
35 |
+
{"Name": "amazon.nova-pro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
36 |
+
{"Name": "deepseek-r1-distill-qwen-32b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
37 |
+
{"Name": "mistral-small-2501", "Release Date": "2025-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
38 |
+
{"Name": "phi-4", "Release Date": "2024-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
39 |
+
{"Name": "qwq-32b-preview", "Release Date": "2024-11-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
40 |
+
{"Name": "amazon.nova-lite-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
41 |
+
{"Name": "qwen2.5-7b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
42 |
+
{"Name": "mistral-small-2409", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
43 |
+
{"Name": "amazon.nova-micro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
44 |
+
{"Name": "command-r-plus-04-2024", "Release Date": "2024-04-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
45 |
+
{"Name": "meta-llama-3.1-8b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
46 |
+
{"Name": "olmo-2-1124-13b-instruct", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
47 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
48 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
49 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
58 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
59 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
60 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
61 |
+
{"Name": "gemini-pro-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
62 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
63 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
64 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
69 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
70 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
71 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
72 |
+
{"Name": "gemini-flash-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
73 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
74 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
75 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
78 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
79 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
80 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
81 |
+
{"Name": "grok-2", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
82 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
83 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
84 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
86 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
87 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
88 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
89 |
+
{"Name": "claude-3-5-sonnet", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
90 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
91 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
92 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
133 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
134 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
135 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
136 |
+
{"Name": "claude-3-haiku", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
137 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
138 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
139 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
141 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
142 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
143 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
144 |
+
{"Name": "mistral-large", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
145 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
146 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
147 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
239 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
240 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
241 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
242 |
+
{"Name": "gemini-flash-2.0-thinking-12-19", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
243 |
+
{"Name": "gemini-2.0-flash-thinking", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
244 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
245 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
246 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
256 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
257 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
258 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
259 |
+
{"Name": "codestral-2501", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
260 |
+
{"Name": "codestral-latest", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
261 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
262 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
263 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
simple_bench_leaderboard.jsonl
CHANGED
@@ -1,15 +1,19 @@
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
3 |
-
{"model": "o1-2024-12-17", "score":
|
|
|
4 |
{"model": "gemini-exp-1206", "score": 31.1}
|
|
|
5 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
6 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
7 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
8 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
9 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
|
|
10 |
{"model": "grok-beta", "score": 22.7}
|
11 |
{"model": "mistral-large-2407", "score": 22.5}
|
12 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
|
|
13 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
14 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
15 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
3 |
+
{"model": "o1-2024-12-17 (high)", "score": 40.1}
|
4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 36.7}
|
5 |
{"model": "gemini-exp-1206", "score": 31.1}
|
6 |
+
{"model": "deepseek-r1", "score": 30.9}
|
7 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
8 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
9 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
10 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
11 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
12 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 22.8}
|
13 |
{"model": "grok-beta", "score": 22.7}
|
14 |
{"model": "mistral-large-2407", "score": 22.5}
|
15 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
16 |
+
{"model": "deepseek-v3", "score": 18.9}
|
17 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
18 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
19 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|