Spaces:

kaizuberbuehler
/

ai-progress-charts

Running

App Files Files Community

ai-progress-charts / livebench_data_analysis.jsonl

kaizuberbuehler

Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench

b605a32 about 2 months ago

raw

history blame contribute delete

2.88 kB

	{"model": "o3-mini-2025-01-31-high", "score": 70.64}
	{"model": "o1-2024-12-17-high", "score": 65.47}
	{"model": "deepseek-r1", "score": 69.78}
	{"model": "o3-mini-2025-01-31-medium", "score": 66.56}
	{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 69.37}
	{"model": "gemini-2.0-pro-exp-02-05", "score": 68.02}
	{"model": "gemini-exp-1206", "score": 63.16}
	{"model": "o3-mini-2025-01-31-low", "score": 62.04}
	{"model": "qwen2.5-max", "score": 67.93}
	{"model": "gemini-2.0-flash", "score": 67.55}
	{"model": "deepseek-v3", "score": 60.94}
	{"model": "gemini-2.0-flash-exp", "score": 61.67}
	{"model": "claude-3-5-sonnet-20241022", "score": 55.03}
	{"model": "chatgpt-4o-latest-2025-01-29", "score": 66.00}
	{"model": "o1-mini-2024-09-12", "score": 57.92}
	{"model": "step-2-16k-202411", "score": 63.72}
	{"model": "gpt-4o-2024-08-06", "score": 60.91}
	{"model": "gemini-1.5-pro-002", "score": 54.97}
	{"model": "grok-2-1212", "score": 54.45}
	{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 57.47}
	{"model": "dracarys2-72b-instruct", "score": 55.51}
	{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 55.85}
	{"model": "gpt-4o-2024-11-20", "score": 56.15}
	{"model": "learnlm-1.5-pro-experimental", "score": 54.97}
	{"model": "chatgpt-4o-latest-0903", "score": 57.93}
	{"model": "qwen2.5-72b-instruct-turbo", "score": 51.91}
	{"model": "gpt-4-turbo-2024-04-09", "score": 54.36}
	{"model": "llama-3.3-70b-instruct-turbo", "score": 49.49}
	{"model": "deepseek-r1-distill-llama-70b", "score": 55.93}
	{"model": "grok-beta", "score": 54.27}
	{"model": "claude-3-opus-20240229", "score": 57.89}
	{"model": "mistral-large-2411", "score": 50.15}
	{"model": "qwen2.5-coder-32b-instruct", "score": 49.87}
	{"model": "dracarys2-llama-3.1-70b-instruct", "score": 53.98}
	{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 53.75}
	{"model": "amazon.nova-pro-v1:0", "score": 48.31}
	{"model": "claude-3-5-haiku-20241022", "score": 48.45}
	{"model": "deepseek-r1-distill-qwen-32b", "score": 45.41}
	{"model": "mistral-small-2501", "score": 53.69}
	{"model": "phi-4", "score": 45.17}
	{"model": "gpt-4o-mini-2024-07-18", "score": 49.96}
	{"model": "qwq-32b-preview", "score": 31.62}
	{"model": "gemma-2-27b-it", "score": 47.87}
	{"model": "amazon.nova-lite-v1:0", "score": 37.23}
	{"model": "qwen2.5-7b-instruct-turbo", "score": 35.22}
	{"model": "mistral-small-2409", "score": 42.73}
	{"model": "command-r-plus-08-2024", "score": 38.06}
	{"model": "amazon.nova-micro-v1:0", "score": 33.95}
	{"model": "gemma-2-9b-it", "score": 36.39}
	{"model": "command-r-08-2024", "score": 33.34}
	{"model": "command-r-plus-04-2024", "score": 25.48}
	{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 32.82}
	{"model": "phi-3-small-8k-instruct", "score": 30.29}
	{"model": "phi-3-mini-128k-instruct", "score": 34.69}
	{"model": "olmo-2-1124-13b-instruct", "score": 20.60}
	{"model": "phi-3-mini-4k-instruct", "score": 30.21}