ai-progress-charts / livecodebench.jsonl
kaizuberbuehler's picture
Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
b605a32
{"model": "o1-2024-12-17 (high)", "score": 73.1}
{"model": "o3-mini-2025-01-31 (high)", "score": 71.6}
{"model": "o3-mini-2025-01-31 (medium)", "score": 68.8}
{"model": "o1-2024-12-17 (medium)", "score": 65.4}
{"model": "deepseek-r1-preview", "score": 64.3}
{"model": "o1-2024-12-17 (low)", "score": 62.7}
{"model": "o3-mini-2025-01-31 (low)", "score": 62.7}
{"model": "o1-mini-2024-09-12", "score": 54.1}
{"model": "deepseek-r1-lite-preview", "score": 50.4}
{"model": "gemini-flash-2.0-thinking-01-21", "score": 45}
{"model": "qwq-32b-preview", "score": 44}
{"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4}
{"model": "o1-preview-2024-09-12", "score": 42.5}
{"model": "claude-3.5-sonnet-20241022", "score": 37.1}
{"model": "deepseek-v3", "score": 36.3}
{"model": "gpt-4o-2024-05-13", "score": 33}
{"model": "claude-3.5-sonnet-20240620", "score": 32}
{"model": "gemini-flash-2.0-exp", "score": 32}
{"model": "gemini-pro-1.5-002", "score": 30.9}
{"model": "gpt-4o-2024-08-06", "score": 30.5}
{"model": "gpt-4-turbo-2024-04-09", "score": 29.6}
{"model": "gemini-flash-1.5-002", "score": 28.4}
{"model": "gpt-4o-mini-2024-07-18", "score": 27.7}
{"model": "mistral-large", "score": 27.6}
{"model": "codestral-latest", "score": 23.8}
{"model": "claude-3-haiku", "score": 17.1}