Create gaia_leaderboard_integration.py
gaia_leaderboard_integration.py  ADDED  (+589 -0)
@@ -0,0 +1,589 @@
#!/usr/bin/env python3
"""
GAIA Leaderboard Integration & Continuous Benchmarking
=====================================================

Enhanced GAIA agent with official leaderboard submission capabilities,
automated benchmarking, and comprehensive evaluation features.
"""

import json
import logging
import time
import re
import hashlib
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import pandas as pd

# Core ML libraries
from datasets import load_dataset
from huggingface_hub import HfApi

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ================================
# ENHANCED DATA STRUCTURES
# ================================

@dataclass
class GAIAQuestion:
    """Enhanced structure for GAIA benchmark questions"""
    task_id: str
    question: str
    level: int
    final_answer: Optional[str] = None
    file_name: Optional[str] = None
    file_path: Optional[str] = None
    annotator_metadata: Optional[Dict] = None

    @classmethod
    def from_dict(cls, data: dict):
        return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})

@dataclass
class GAIASubmission:
    """Structure for leaderboard submissions"""
    task_id: str
    model_answer: str
    reasoning_trace: str
    final_answer: str
    processing_time: float = 0.0
    model_name: str = ""
    timestamp: str = ""

    def to_leaderboard_format(self) -> Dict[str, str]:
        """Convert to official GAIA leaderboard format"""
        return {
            "task_id": self.task_id,
            "model_answer": self.model_answer,
            "reasoning_trace": self.reasoning_trace
        }

@dataclass
class BenchmarkResult:
    """Comprehensive benchmark results"""
    model_name: str
    total_questions: int
    completed_questions: int
    error_rate: float
    avg_processing_time: float
    total_time: float
    level_breakdown: Dict[int, Dict[str, int]]
    timestamp: str
    submission_hash: str

# ================================
# GAIA PROMPT MANAGEMENT
# ================================

class GAIAPromptManager:
    """Manages GAIA-specific prompting and formatting"""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:

FINAL ANSWER: [YOUR FINAL ANSWER]

YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    @staticmethod
    def create_gaia_prompt(question: str) -> str:
        """Create properly formatted GAIA prompt"""
        return f"{GAIAPromptManager.GAIA_SYSTEM_PROMPT}\n\nQuestion: {question}\n\nLet me think step by step:"

    @staticmethod
    def extract_final_answer(response: str) -> Tuple[str, str]:
        """Extract final answer and reasoning from model response"""
        final_answer_pattern = r"FINAL ANSWER:\s*(.+?)(?:\n|$)"
        match = re.search(final_answer_pattern, response, re.IGNORECASE | re.DOTALL)

        if match:
            final_answer = match.group(1).strip()
            reasoning_end = match.start()
            reasoning = response[:reasoning_end].strip()
        else:
            lines = response.strip().split('\n')
            final_answer = lines[-1].strip() if lines else ""
            reasoning = '\n'.join(lines[:-1]) if len(lines) > 1 else response

        return final_answer, reasoning

# ================================
# GAIA LEADERBOARD MANAGER
# ================================

class GAIALeaderboardManager:
    """Manages interactions with the official GAIA leaderboard"""

    LEADERBOARD_URL = "https://huggingface.co/spaces/gaia-benchmark/leaderboard"
    DATASET_NAME = "gaia-benchmark/GAIA"

    def __init__(self):
        self.api = HfApi()

    def load_test_questions(self, max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
        """Load official GAIA test questions (300 total)"""
        try:
            logger.info("Loading official GAIA test dataset...")

            # Try to load test split
            dataset = load_dataset(self.DATASET_NAME, split="test", trust_remote_code=True)

            questions = []
            # Slicing a Dataset returns a dict of columns, not rows; use select() for a row subset
            items = dataset.select(range(min(max_questions, len(dataset)))) if max_questions else dataset

            for i, item in enumerate(items):
                question = GAIAQuestion(
                    task_id=item.get('task_id', f'gaia_test_{i:03d}'),
                    question=item['Question'],
                    level=item['Level'],
                    final_answer=None,  # Not provided in test set
                    file_name=item.get('file_name', None),
                    file_path=item.get('file_path', None),
                    annotator_metadata=item.get('Annotator Metadata', None)
                )
                questions.append(question)

            status = f"✅ Loaded {len(questions)} official GAIA test questions"
            logger.info(status)
            return questions, status

        except Exception as e:
            error_msg = f"❌ Error loading GAIA test dataset: {str(e)}"
            logger.error(error_msg)
            # Fallback to validation set or samples
            return self._load_validation_fallback()

    def _load_validation_fallback(self) -> Tuple[List[GAIAQuestion], str]:
        """Fallback to validation set if test set unavailable"""
        try:
            dataset = load_dataset(self.DATASET_NAME, split="validation", trust_remote_code=True)
            questions = []

            for i, item in enumerate(dataset):
                question = GAIAQuestion(
                    task_id=item.get('task_id', f'gaia_val_{i:03d}'),
                    question=item['Question'],
                    level=item['Level'],
                    final_answer=item.get('Final answer', None),
                    file_name=item.get('file_name', None),
                    annotator_metadata=item.get('Annotator Metadata', None)
                )
                questions.append(question)

            return questions, f"⚠️ Using validation set ({len(questions)} questions) - test set unavailable"

        except Exception as e:
            # Ultimate fallback to sample questions
            return self._create_representative_samples(), "⚠️ Using sample questions - datasets unavailable"

    def _create_representative_samples(self) -> List[GAIAQuestion]:
        """Create representative sample questions covering all difficulty levels"""
        samples = [
            # Level 1 questions (basic reasoning)
            {
                "task_id": "sample_l1_001",
                "question": "What is the capital city of the country that has the largest land area in South America?",
                "level": 1,
                "final_answer": "Brasília"
            },
            {
                "task_id": "sample_l1_002",
                "question": "If a book costs $12.50 and I have a 20% discount coupon, how much will I pay?",
                "level": 1,
                "final_answer": "10"
            },
            {
                "task_id": "sample_l1_003",
                "question": "What is the next number in the sequence: 2, 4, 8, 16, ?",
                "level": 1,
                "final_answer": "32"
            },

            # Level 2 questions (intermediate reasoning)
            {
                "task_id": "sample_l2_001",
                "question": "A train travels 60 km in the first hour, 80 km in the second hour, and 100 km in the third hour. If this pattern continues, how far will it travel in the 5th hour?",
                "level": 2,
                "final_answer": "140"
            },
            {
                "task_id": "sample_l2_002",
                "question": "If today is Wednesday and it was Tuesday 8 days ago, what day of the week will it be 15 days from now?",
                "level": 2,
                "final_answer": "Thursday"
            },

            # Level 3 questions (advanced reasoning)
            {
                "task_id": "sample_l3_001",
                "question": "A company's revenue increased by 25% in the first quarter, decreased by 10% in the second quarter, and increased by 15% in the third quarter. If the original revenue was $100,000, what is the revenue at the end of the third quarter?",
                "level": 3,
                "final_answer": "129375"
            },
            {
                "task_id": "sample_l3_002",
                "question": "In a group of 100 people, 60 like coffee, 40 like tea, and 20 like both. How many people like neither coffee nor tea?",
                "level": 3,
                "final_answer": "20"
            }
        ]

        return [GAIAQuestion.from_dict(data) for data in samples]

    def create_submission_file(self, submissions: List[GAIASubmission], model_name: str) -> Tuple[str, str]:
        """Create official GAIA leaderboard submission file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"gaia_submission_{model_name}_{timestamp}.jsonl"

        # Create submission in official format
        submission_data = []
        for sub in submissions:
            submission_data.append(sub.to_leaderboard_format())

        # Write JSONL file
        with open(filename, 'w', encoding='utf-8') as f:
            for entry in submission_data:
                f.write(json.dumps(entry) + '\n')

        # Create submission hash for verification
        with open(filename, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()

        # Create metadata file
        metadata = {
            "model_name": model_name,
            "submission_time": timestamp,
            "total_questions": len(submissions),
            "file_hash": file_hash,
            "format_version": "1.0"
        }

        metadata_filename = f"gaia_metadata_{model_name}_{timestamp}.json"
        with open(metadata_filename, 'w') as f:
            json.dump(metadata, f, indent=2)

        return filename, metadata_filename

    def validate_submission(self, filename: str) -> Tuple[bool, str]:
        """Validate submission file format"""
        try:
            with open(filename, 'r') as f:
                lines = f.readlines()

            required_fields = {"task_id", "model_answer", "reasoning_trace"}

            for i, line in enumerate(lines):
                try:
                    entry = json.loads(line.strip())
                    if not all(field in entry for field in required_fields):
                        return False, f"Line {i+1}: Missing required fields. Required: {required_fields}"

                    if not isinstance(entry["task_id"], str) or not entry["task_id"]:
                        return False, f"Line {i+1}: Invalid task_id"

                except json.JSONDecodeError:
                    return False, f"Line {i+1}: Invalid JSON format"

            return True, f"✅ Submission file is valid ({len(lines)} entries)"

        except Exception as e:
            return False, f"❌ Error validating file: {str(e)}"

# ================================
# CONTINUOUS BENCHMARKING SYSTEM
# ================================

class ContinuousBenchmarkingSystem:
    """System for automated continuous benchmarking and tracking"""

    def __init__(self):
        self.benchmark_history: List[BenchmarkResult] = []
        self.leaderboard_manager = GAIALeaderboardManager()

    def run_full_benchmark(self, agent, model_name: str, progress_callback=None) -> Tuple[BenchmarkResult, List[GAIASubmission], str, str]:
        """Run complete benchmark on all 300 test questions"""
        start_time = time.time()

        # Load official test questions
        questions, status = self.leaderboard_manager.load_test_questions()

        if progress_callback:
            progress_callback(0.1, f"Loaded {len(questions)} questions")

        # Run evaluation
        submissions = []
        level_stats = {1: {"total": 0, "completed": 0},
                       2: {"total": 0, "completed": 0},
                       3: {"total": 0, "completed": 0}}

        total_questions = len(questions)

        for i, question in enumerate(questions):
            if progress_callback:
                progress_callback((i + 1) / total_questions,
                                  f"Processing question {i+1}/{total_questions}")

            # Track by level
            level_stats[question.level]["total"] += 1

            try:
                # Process question
                start_q_time = time.time()
                prompt = agent.prompt_manager.create_gaia_prompt(question.question)
                raw_response = agent.model_manager.generate_response(prompt)
                final_answer, reasoning = agent.prompt_manager.extract_final_answer(raw_response)
                processing_time = time.time() - start_q_time

                # Create submission
                submission = GAIASubmission(
                    task_id=question.task_id,
                    model_answer=raw_response,
                    reasoning_trace=reasoning,
                    final_answer=final_answer,
                    processing_time=processing_time,
                    model_name=model_name,
                    timestamp=datetime.now().isoformat()
                )

                submissions.append(submission)
                level_stats[question.level]["completed"] += 1

            except Exception as e:
                logger.error(f"Error processing {question.task_id}: {e}")
                # Add error submission
                error_submission = GAIASubmission(
                    task_id=question.task_id,
                    model_answer=f"Error: {str(e)}",
                    reasoning_trace="Processing failed",
                    final_answer="ERROR",
                    processing_time=0.0,
                    model_name=model_name,
                    timestamp=datetime.now().isoformat()
                )
                submissions.append(error_submission)

        total_time = time.time() - start_time
        completed = sum(level_stats[level]["completed"] for level in level_stats)
        error_rate = (total_questions - completed) / total_questions
        avg_time = sum(s.processing_time for s in submissions) / len(submissions)

        # Create submission files
        submission_file, metadata_file = self.leaderboard_manager.create_submission_file(
            submissions, model_name
        )

        # Create submission hash
        with open(submission_file, 'rb') as f:
            submission_hash = hashlib.md5(f.read()).hexdigest()[:8]

        # Create benchmark result
        result = BenchmarkResult(
            model_name=model_name,
            total_questions=total_questions,
            completed_questions=completed,
            error_rate=error_rate,
            avg_processing_time=avg_time,
            total_time=total_time,
            level_breakdown=level_stats,
            timestamp=datetime.now().isoformat(),
            submission_hash=submission_hash
        )

        self.benchmark_history.append(result)

        return result, submissions, submission_file, metadata_file

    def generate_benchmark_report(self, result: BenchmarkResult) -> str:
        """Generate comprehensive benchmark report"""
        report = f"""
# 🏆 GAIA Benchmark Report

## Model Information
- **Model Name**: {result.model_name}
- **Benchmark Date**: {result.timestamp}
- **Submission Hash**: {result.submission_hash}

## Overall Performance
- **Total Questions**: {result.total_questions}
- **Successfully Processed**: {result.completed_questions}
- **Success Rate**: {((result.completed_questions / result.total_questions) * 100):.1f}%
- **Error Rate**: {(result.error_rate * 100):.1f}%

## Performance Metrics
- **Average Processing Time**: {result.avg_processing_time:.2f}s per question
- **Total Benchmark Time**: {(result.total_time / 60):.1f} minutes
- **Throughput**: {(result.total_questions / (result.total_time / 60)):.1f} questions/minute

## Performance by Difficulty Level

| Level | Total Questions | Completed | Success Rate |
|-------|----------------|-----------|--------------|
"""

        for level in [1, 2, 3]:
            stats = result.level_breakdown[level]
            success_rate = (stats["completed"] / stats["total"] * 100) if stats["total"] > 0 else 0
            report += f"| Level {level} | {stats['total']} | {stats['completed']} | {success_rate:.1f}% |\n"

        report += f"""

## Leaderboard Submission
- ✅ Submission file generated in official GAIA format
- ✅ Ready for upload to [GAIA Leaderboard]({GAIALeaderboardManager.LEADERBOARD_URL})
- 📁 Download the JSONL file below for submission

## Next Steps
1. Download the submission file
2. Visit the [GAIA Leaderboard]({GAIALeaderboardManager.LEADERBOARD_URL})
3. Upload your results
4. Compare with other models on the public leaderboard

---
*Report generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

        return report

# ================================
# ENHANCED GAIA AGENT WITH LEADERBOARD INTEGRATION
# ================================

class EnhancedGAIAAgent:
    """Enhanced GAIA agent with leaderboard integration"""

    def __init__(self):
        self.model_manager = None
        self.prompt_manager = GAIAPromptManager()
        self.leaderboard_manager = GAIALeaderboardManager()
        self.benchmark_system = ContinuousBenchmarkingSystem()
        self.current_model = None

    def initialize_model(self, model_choice: str, progress=None) -> str:
        """Initialize model with progress tracking"""
        try:
            if progress:
                progress(0, desc="Initializing model...")

            # Import model manager from main app
            import importlib
            app_module = importlib.import_module('app')
            HFSpaceModelManager = app_module.HFSpaceModelManager

            self.model_manager = HFSpaceModelManager(model_choice)
            self.current_model = model_choice

            def progress_callback(value, desc):
                if progress:
                    progress(value, desc=desc)

            result = self.model_manager.load_model(progress_callback)
            return result

        except Exception as e:
            return f"❌ Failed to initialize model: {str(e)}"

    def run_leaderboard_benchmark(self, progress=None) -> Tuple[str, str, str, str]:
        """Run full benchmark for leaderboard submission"""
        if self.model_manager is None:
            return "❌ No model loaded", "", "", ""

        model_name = self.current_model.replace(" ", "_").replace("&", "and")

        try:
            # Run benchmark
            result, submissions, submission_file, metadata_file = self.benchmark_system.run_full_benchmark(
                self, model_name, progress
            )

            # Generate report
            report = self.benchmark_system.generate_benchmark_report(result)

            # Validate submission
            is_valid, validation_msg = self.leaderboard_manager.validate_submission(submission_file)

            if is_valid:
                status = f"✅ Benchmark completed successfully!\n{validation_msg}"
            else:
                status = f"⚠️ Benchmark completed but validation failed:\n{validation_msg}"

            return status, report, submission_file, metadata_file

        except Exception as e:
            return f"❌ Benchmark failed: {str(e)}", "", "", ""

# ================================
# GLOBAL INSTANCES AND INTERFACE FUNCTIONS
# ================================

# Global enhanced agent
enhanced_gaia_agent = EnhancedGAIAAgent()

def run_leaderboard_benchmark_interface(progress=None):
    """Interface for running leaderboard benchmark"""
    return enhanced_gaia_agent.run_leaderboard_benchmark(progress)

def load_test_questions_interface():
    """Interface for loading test questions info"""
    questions, status = enhanced_gaia_agent.leaderboard_manager.load_test_questions(max_questions=10)

    preview = f"""
{status}

## Sample Questions Preview:

"""

    for i, q in enumerate(questions[:5], 1):
        preview += f"**Question {i} (Level {q.level})**: {q.question}\n\n"

    if len(questions) > 5:
        preview += f"... and {len(questions) - 5} more questions"

    return preview

def get_leaderboard_info():
    """Get information about the GAIA leaderboard"""
    return f"""
# 🏆 GAIA Public Leaderboard

## Overview
The GAIA benchmark provides a **public leaderboard** hosted on Hugging Face where you can:
- Submit results from **300 official test questions**
- Compare your model against state-of-the-art systems
- Track progress in AI reasoning capabilities
- Contribute to the research community

## Leaderboard Details
- **Official URL**: [GAIA Leaderboard]({GAIALeaderboardManager.LEADERBOARD_URL})
- **Test Questions**: 300 questions across 3 difficulty levels
- **Submission Format**: JSONL files with specific schema
- **Evaluation**: Automated scoring and ranking
- **Public Rankings**: Open comparison of all submissions

## How to Submit
1. **Run Benchmark**: Use the "Full Benchmark" tab to evaluate your model
2. **Download Results**: Get the generated JSONL submission file
3. **Visit Leaderboard**: Go to the official GAIA leaderboard
4. **Upload File**: Submit your JSONL file for evaluation
5. **View Results**: Check your model's ranking and performance

## Benefits of Continuous Benchmarking
- 📊 **Track Progress**: Monitor improvements over time
- 🔍 **Identify Weaknesses**: See which question types need work
- 🏆 **Compare Models**: Benchmark against other approaches
- 📈 **Drive Innovation**: Contribute to advancing AI reasoning
- 🌟 **Gain Recognition**: Showcase your model's capabilities

## Current Benchmark Standards
Top models on the leaderboard typically achieve:
- **Level 1**: 80-95% accuracy (basic reasoning)
- **Level 2**: 60-80% accuracy (intermediate reasoning)
- **Level 3**: 30-60% accuracy (advanced reasoning)
- **Overall**: 60-75% accuracy across all levels

Ready to benchmark your model? Start with the "Full Benchmark" tab! 🚀
"""
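A minimal usage sketch follows (illustrative only, not part of the committed file). It assumes the Space's `app.py` defines `HFSpaceModelManager`, as imported in `EnhancedGAIAAgent.initialize_model`, and that the model label passed in is one of that manager's supported choices (the label below is hypothetical).

# usage_sketch.py - illustrative only, not part of gaia_leaderboard_integration.py
# Assumes app.py in the same Space provides HFSpaceModelManager, and that
# "Mistral-7B" is a hypothetical label accepted by that manager.
from gaia_leaderboard_integration import (
    enhanced_gaia_agent,
    run_leaderboard_benchmark_interface,
)

print(enhanced_gaia_agent.initialize_model("Mistral-7B"))

# Runs the full benchmark, writes the JSONL and metadata files, and returns the report.
status, report, submission_file, metadata_file = run_leaderboard_benchmark_interface()
print(status)
print(f"Upload this file to the GAIA leaderboard: {submission_file}")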