Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
b605a32
{"model": "o1-2024-12-17 (high)", "score": 73.1} | |
{"model": "o3-mini-2025-01-31 (high)", "score": 71.6} | |
{"model": "o3-mini-2025-01-31 (medium)", "score": 68.8} | |
{"model": "o1-2024-12-17 (medium)", "score": 65.4} | |
{"model": "deepseek-r1-preview", "score": 64.3} | |
{"model": "o1-2024-12-17 (low)", "score": 62.7} | |
{"model": "o3-mini-2025-01-31 (low)", "score": 62.7} | |
{"model": "o1-mini-2024-09-12", "score": 54.1} | |
{"model": "deepseek-r1-lite-preview", "score": 50.4} | |
{"model": "gemini-flash-2.0-thinking-01-21", "score": 45} | |
{"model": "qwq-32b-preview", "score": 44} | |
{"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4} | |
{"model": "o1-preview-2024-09-12", "score": 42.5} | |
{"model": "claude-3.5-sonnet-20241022", "score": 37.1} | |
{"model": "deepseek-v3", "score": 36.3} | |
{"model": "gpt-4o-2024-05-13", "score": 33} | |
{"model": "claude-3.5-sonnet-20240620", "score": 32} | |
{"model": "gemini-flash-2.0-exp", "score": 32} | |
{"model": "gemini-pro-1.5-002", "score": 30.9} | |
{"model": "gpt-4o-2024-08-06", "score": 30.5} | |
{"model": "gpt-4-turbo-2024-04-09", "score": 29.6} | |
{"model": "gemini-flash-1.5-002", "score": 28.4} | |
{"model": "gpt-4o-mini-2024-07-18", "score": 27.7} | |
{"model": "mistral-large", "score": 27.6} | |
{"model": "codestral-latest", "score": 23.8} | |
{"model": "claude-3-haiku", "score": 17.1} |