Gatsby767 committed
Commit 47a4065 · verified · 1 Parent(s): d252649

Upload 7 files

Files changed (7)
  1. datasets_loader.py +329 -0
  2. eval_bbeh.py +185 -0
  3. eval_mmlupro.py +145 -0
  4. eval_supergpqa.py +116 -0
  5. evaluate.bash +79 -0
  6. generate.py +51 -0
  7. results_recheck.py +74 -0
datasets_loader.py ADDED
@@ -0,0 +1,329 @@
from abc import ABC, abstractmethod
import re
import random

import pandas
from datasets import load_dataset
from math_verify import parse, verify

ANSWER_PATTERN_MULTICHOICE = r"(?:\$\$\s*)?\\boxed\{[^}]*?([A-Z])[^}]*\}(?:\s*\$\$)?|(?:\*{0,2}\s*)?(?:Final|Correct)\s*Answer:\s*([A-Z])\."
ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)"
ANSWER_PATTERN_BOXED = r"(?i)\\boxed\s*{([^\n]+)}"


class DatasetHandler(ABC):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED, num_examples: int = None):
        self.answer_pattern = answer_pattern
        self.num_examples = num_examples if num_examples is not None else 1

    @abstractmethod
    def load_data(self):
        """
        Load the dataset and return a tuple (questions, answers), where
        questions is a list of prompt strings and answers is the list of
        gold answers aligned with them.
        """
        pass

    def extract_answer(self, response: str) -> str:
        match = re.search(self.answer_pattern, response)
        if match is None:
            return None
        # Return the first capturing group that actually matched
        # (the multiple-choice pattern has two alternatives).
        for group in match.groups():
            if group is not None:
                return group
        return None

    def compare_answer(self, response: str, answer: str) -> bool:
        response_answer = self.extract_answer(response)
        if response_answer is None:
            return False
        answer = str(answer)
        response_answer = str(response_answer)
        if self.answer_pattern == ANSWER_PATTERN_MULTICHOICE:
            return response_answer == answer
        return verify(parse(answer), parse(response_answer))

    def get_score(self, responses: list, answers: list):
        scores = []
        for response, answer in zip(responses, answers):
            scores.append(1 if self.compare_answer(response, answer) else 0)
        return scores, sum(scores) / len(scores)

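A quick illustrative check of the two answer patterns above (not part of the uploaded file; `_Probe` is a hypothetical throwaway subclass used only to instantiate the ABC):

# Illustrative sketch of the extraction logic; strings below are made up.
class _Probe(DatasetHandler):
    def load_data(self):
        return [], []

boxed = _Probe(ANSWER_PATTERN_BOXED)
print(boxed.extract_answer("so the result is \\boxed{42}."))              # -> "42"

multi = _Probe(ANSWER_PATTERN_MULTICHOICE)
print(multi.extract_answer("**Final Answer: B.** It follows that ..."))   # -> "B"
print(multi.compare_answer("**Final Answer: B.** It follows that ...", "B"))  # -> True
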
class MathDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        df = pandas.read_csv(
            "https://openaipublic.blob.core.windows.net/simple-evals/math_500_test.csv"
        )
        examples = [row.to_dict() for _, row in df.iterrows()]
        questions = [example['Question'] for example in examples]
        answers = [example['Answer'] for example in examples]

        return questions, answers


class Gsm8kDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("openai/gsm8k", 'main', split='test')
        examples = [row for row in dataset]
        questions = [example['question'] for example in examples]
        answers = [example["answer"].split('#### ')[-1] for example in examples]
        return questions, answers


class AmcDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("zwhe99/amc23", split='test')
        examples = [row for row in dataset]
        # Repeat each problem 32 times so accuracy is averaged over 32 samples.
        questions = [example['question'] for example in examples] * 32
        answers = [example['answer'] for example in examples] * 32

        return questions, answers


class MinervaDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("zwhe99/simplerl-minerva-math", split='test')
        examples = [row for row in dataset]
        questions = [example['problem'] for example in examples]
        answers = [example['answer'] for example in examples]

        return questions, answers


class OlympiadDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("zwhe99/simplerl-OlympiadBench", split='test')
        examples = [row for row in dataset]
        questions = [example['question'] for example in examples]
        answers = [example['final_answer'][0] for example in examples]

        return questions, answers


class Aime2024DatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("HuggingFaceH4/aime_2024", split='train')
        examples = [row for row in dataset]
        questions = [example['problem'] for example in examples] * 32
        answers = [example['answer'] for example in examples] * 32

        return questions, answers


class Aime2025DatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("yentinglin/aime_2025", 'default')['train']
        examples = [row for row in dataset]
        questions = [example['problem'] for example in examples] * 32
        answers = [example['answer'] for example in examples] * 32

        return questions, answers


class MmluProDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset('TIGER-Lab/MMLU-Pro', split='test')
        examples = []
        for row in dataset:
            example = {
                'question': row['question'],
                'options': row['options'],
                'answer': row['answer'],
                'answer_index': row['answer_index'],
                'category': row['category'],
                'cot_content': row['cot_content'],
                'src': row['src']
            }
            examples.append(example)
        # Subsample 1000 questions to keep evaluation affordable.
        random.shuffle(examples)
        examples = examples[:1000]
        questions = []
        answers = []
        for example in examples:
            # Format question with lettered options.
            question = example['question'] + "\n\nOptions:\n"
            for i, opt in enumerate(example['options']):
                question += f"{chr(65 + i)}. {opt}\n"

            questions.append(question)
            answers.append(example['answer'])

        return questions, answers

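For reference, the option-formatting loop above produces prompts of the following shape (a small sketch; the question and options are invented placeholders):

# Sketch of the prompt produced by the option-formatting loop above.
example = {'question': "Which gas is most abundant in Earth's atmosphere?",
           'options': ['Oxygen', 'Nitrogen', 'Carbon dioxide', 'Argon'],
           'answer': 'B'}
question = example['question'] + "\n\nOptions:\n"
for i, opt in enumerate(example['options']):
    question += f"{chr(65 + i)}. {opt}\n"
# question is now:
# Which gas is most abundant in Earth's atmosphere?
#
# Options:
# A. Oxygen
# B. Nitrogen
# C. Carbon dioxide
# D. Argon
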
class bbehDatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("MrLight/bbeh-eval", split='train')
        examples = [row for row in dataset]
        random.shuffle(examples)
        examples = examples[:1000]
        questions = [example['question'] for example in examples]
        answers = [example['answer'] for example in examples]

        return questions, answers


class SuperGPQADatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset('m-a-p/SuperGPQA')
        examples = []
        for row in dataset['train']:
            example = {
                'question': row['question'],
                'options': row['options'],
                'answer': row['answer_letter']
            }
            examples.append(example)
        random.shuffle(examples)
        examples = examples[:1000]

        questions = []
        answers = []
        for example in examples:
            # Format question with lettered options.
            question = example['question'] + "\n\nOptions:\n"
            for i, opt in enumerate(example['options']):
                question += f"{chr(65 + i)}. {opt}\n"

            questions.append(question)
            answers.append(example['answer'])

        return questions, answers


class GPQA_DatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE):
        super().__init__(answer_pattern)

    def load_data(self):
        dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split='train')
        examples = []

        for row in dataset:
            # Get the question and the four answer options.
            question = row['Question']
            options = [
                row['Correct Answer'],
                row['Incorrect Answer 1'],
                row['Incorrect Answer 2'],
                row['Incorrect Answer 3']
            ]
            # Shuffle options to randomize the correct answer's position.
            random.shuffle(options)
            # Find the index of the correct answer after shuffling.
            correct_index = options.index(row['Correct Answer'])
            correct_option = chr(65 + correct_index)

            example = {
                'question': question,
                'options': options,
                'answer': correct_option
            }
            examples.append(example)

        # Shuffle and limit to 1000 examples, like the other handlers.
        random.shuffle(examples)
        examples = examples[:1000]

        questions = []
        answers = []
        for example in examples:
            # Format question with lettered options.
            question = example['question'] + "\n\nOptions:\n"
            for i, opt in enumerate(example['options']):
                question += f"{chr(65 + i)}. {opt}\n"

            questions.append(question)
            answers.append(example['answer'])

        return questions, answers


class Mydataset_DatasetHandler(DatasetHandler):
    def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED, name: str = "qwen3_frequent_solver_v1"):
        super().__init__(answer_pattern)
        self.name = name

    def load_data(self):
        dataset = load_dataset(self.name)['train']
        examples = []

        for row in dataset:
            example = {
                'question': row['problem'],
                'answer': row['answer']
            }
            examples.append(example)

        # Shuffle; unlike the other handlers, keep all examples.
        random.shuffle(examples)
        # examples = examples[:1000]

        questions = []
        answers = []
        for example in examples:
            questions.append(example['question'])
            answers.append(example['answer'])

        return questions, answers

def get_dataset_handler(dataset_name: str, name: str = None) -> DatasetHandler:
    if dataset_name == "math":
        return MathDatasetHandler()
    elif dataset_name == "gsm8k":
        return Gsm8kDatasetHandler()
    elif dataset_name == "amc":
        return AmcDatasetHandler()
    elif dataset_name == "minerva":
        return MinervaDatasetHandler()
    elif dataset_name == "olympiad":
        return OlympiadDatasetHandler()
    elif dataset_name == "aime2024":
        return Aime2024DatasetHandler()
    elif dataset_name == "aime2025":
        return Aime2025DatasetHandler()
    elif dataset_name == "mmlu_pro":
        return MmluProDatasetHandler()
    elif dataset_name == "bbeh":
        return bbehDatasetHandler()
    elif dataset_name == "super_gpqa":
        return SuperGPQADatasetHandler()
    elif dataset_name == "gpqa":
        return GPQA_DatasetHandler()
    elif dataset_name == "mydataset":
        return Mydataset_DatasetHandler(name=name)
    else:
        raise ValueError(f"Dataset {dataset_name} not found")


if __name__ == "__main__":
    for dataset_name in ["gpqa"]:
        print(f"Loading {dataset_name} dataset")
        handler = get_dataset_handler(dataset_name)
        questions, answers = handler.load_data()
        print(questions[0])
        print('-' * 100)
        print(answers[0])
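Taken together, a handler is meant to be driven as in this minimal sketch (illustrative only; the fake responses below simply echo the gold answers, so the score should come out as 1.0 assuming math_verify accepts identical answer strings):

# Minimal scoring sketch; the "responses" stand in for real model output.
handler = get_dataset_handler("gsm8k")
questions, answers = handler.load_data()
responses = ["The answer is \\boxed{%s}." % a for a in answers]
scores, average = handler.get_score(responses, answers)
print(average)  # expected 1.0 for these fake perfect answers
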
eval_bbeh.py ADDED
@@ -0,0 +1,185 @@
import datasets
import json
import re
import random
import argparse
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

def extract_last_boxed(text):
    pattern = r'\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}'
    matches = list(re.finditer(pattern, text))
    if matches:
        return matches[-1].group(1)
    return None

def extract_last_final_answer(text):
    pattern1 = r'Final Answer:((?:[^<]|<[^<])*?)\n'
    pattern2 = r'The answer is:((?:[^<]|<[^<])*?)\n'
    matches1 = list(re.finditer(pattern1, text))
    matches2 = list(re.finditer(pattern2, text))
    if matches1:
        return matches1[-1].group(1)
    elif matches2:
        return matches2[-1].group(1)
    return None

def extract_solution(solution_str):
    if '<|im_start|>user' in solution_str:
        model_output = re.sub(r'^.*?<\|im_start\|>assistant', '<|im_start|>assistant', solution_str, flags=re.DOTALL, count=1)
    elif 'Assistant:' in solution_str:
        model_output = solution_str.split('Assistant:')[-1].strip()
    else:
        model_output = solution_str

    stop_words = ["</s>", "<|im_end|>", "<|endoftext|>"]
    for stop_word in stop_words:
        if stop_word in model_output:
            model_output = model_output.split(stop_word)[0].strip()

    extract_boxed_answer = extract_last_boxed(model_output)
    if extract_boxed_answer:
        return extract_boxed_answer
    else:
        return extract_last_final_answer(model_output)

def strip_latex(response: str) -> str:
    if response.startswith("$") and response.endswith("$"):
        response = response[1:-1]
    if "boxed{" in response and response.endswith("}"):
        response = response[0:-1].split("boxed{")[1]
    if "text{" in response and response.endswith("}"):
        response = response[0:-1].split("text{")[1]
    if "texttt{" in response and response.endswith("}"):
        response = response[0:-1].split("texttt{")[1]
    return response


def extract_answer(sample: str) -> str:
    """Extracts the final answer from the sample."""
    if sample is None:
        sample = ""
    answer_prefixes = [
        "The answer is:",
        "The final answer is ",
        "The final answer is: ",
        "The answer is "
    ]
    answer = sample
    for answer_prefix in answer_prefixes:
        if answer_prefix in answer:
            answer = answer.split(answer_prefix)[-1].strip()
    if answer.endswith("."):
        answer = answer[:-1]
    return strip_latex(answer)


def fuzzy_match(prediction: str, reference: str) -> bool:
    """Fuzzy match function for BigBench Extra Hard."""
    if prediction == reference:
        return True

    # (a) vs a
    if len(prediction) == 3 and prediction[0] == "(" and prediction[-1] == ")":
        return prediction[1] == reference
    if len(reference) == 3 and reference[0] == "(" and reference[-1] == ")":
        return reference[1] == prediction

    # Numbers
    try:
        if float(prediction) == float(reference):
            return True
    except ValueError:
        pass

    # Quote issues
    if prediction.replace("'", "") == reference.replace("'", ""):
        return True

    # Bracket issues
    if f"[{reference}]" == prediction or f"[{prediction}]" == reference:
        return True

    # Question mark issues
    if prediction.endswith("?") and prediction[:-1] == reference:
        return True

    return False

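For orientation, the fuzzy matcher above tolerates a few common formatting mismatches; its expected behaviour on some made-up pairs:

# Illustrative behaviour of fuzzy_match (examples are invented):
# fuzzy_match("(a)", "a")       -> True   # parenthesised option letter
# fuzzy_match("3.0", "3")       -> True   # numeric equality
# fuzzy_match("dont", "don't")  -> True   # apostrophes stripped on both sides
# fuzzy_match("[x,y]", "x,y")   -> True   # bracket wrapping
# fuzzy_match("why?", "why")    -> True   # trailing question mark
# fuzzy_match("cat", "dog")     -> False
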
def preprocess_sample(sample: str) -> str:
    if sample is None:
        sample = ""
    prediction = extract_answer(sample.strip()).lower()
    prediction = prediction.replace(", ", ",").replace("**", "")
    prediction = prediction.split("\n")[0]
    prediction = prediction[0:-1] if prediction.endswith(".") else prediction
    return prediction


def preprocess_reference(reference: str) -> str:
    reference = reference.strip().lower()
    reference = reference.replace(", ", ",")
    return reference


def evaluate_correctness(sample: str, reference: str) -> bool:
    prediction = preprocess_sample(sample)
    reference = preprocess_reference(reference)
    return fuzzy_match(prediction, reference)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
    parser.add_argument("--output_file", type=str, default="outputs.json", help="File to save results")
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    llm = LLM(model=args.model_path, tensor_parallel_size=4, gpu_memory_utilization=0.85)
    dataset = datasets.load_dataset('MrLight/bbeh-eval')
    categories = sorted(list(set(dataset['train']['task'])))
    print("Categories:", categories)
    # For each category store [correct_count, incorrect_count].
    per_category_accuracy = {c: [0, 0] for c in categories}
    success, fail = 0, 0
    answers = []

    print('----------------- Start Answering -------------------')

    for category in categories:
        category_entries = [entry for entry in dataset['train'] if entry['task'] == category]
        prompts = []
        for entry in category_entries:
            query = entry['question'] + '\n'
            messages = [{
                "role": "user",
                "content": query + '\nPlease reason step by step, and put your final answer option within \\boxed{}.'
            }]
            if tokenizer.chat_template:
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            else:
                prompt = "user: " + query + '\nPlease reason step by step, and put your final answer option within \\boxed{}. Only put the letter in the box, e.g. \\boxed{A}. There is only one correct answer.'
            prompts.append(prompt)

        sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=8192)
        outputs = llm.generate(prompts, sampling_params)

        for entry, output in zip(category_entries, outputs):
            answer = output.outputs[0].text
            entry['solution'] = answer
            answers.append(entry)
            answer = extract_solution(answer)
            if evaluate_correctness(answer, entry['answer']):
                success += 1
                per_category_accuracy[category][0] += 1
            else:
                fail += 1
                per_category_accuracy[category][1] += 1

        print(f"{category}: {per_category_accuracy[category][0] / (per_category_accuracy[category][0] + per_category_accuracy[category][1]):.4f}")

    with open(args.output_file, 'w') as f:
        json.dump(answers, f, indent=2)
    # Append a single-line record so final_results.jsonl stays valid JSONL.
    with open('final_results.jsonl', 'a') as f:
        json.dump({"dataset": "bbeh", "model": args.model_path, "accuracy": round(success / (success + fail) * 100, 2)}, f)
        f.write('\n')
    print("Overall Accuracy:", success / (success + fail))
eval_mmlupro.py ADDED
@@ -0,0 +1,145 @@
import datasets
import json
import re
import random
import argparse
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

def extract_last_boxed(text):
    pattern = r'\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}'
    matches = list(re.finditer(pattern, text))
    if matches:
        return matches[-1].group(1)
    return None

def extract_last_final_answer(text):
    pattern1 = r'Final Answer:((?:[^<]|<[^<])*?)\n'
    pattern2 = r'The answer is:((?:[^<]|<[^<])*?)\n'
    matches1 = list(re.finditer(pattern1, text))
    matches2 = list(re.finditer(pattern2, text))
    if matches1:
        return matches1[-1].group(1)
    elif matches2:
        return matches2[-1].group(1)
    return None

def extract_solution(solution_str):
    if '<|im_start|>user' in solution_str:
        model_output = re.sub(r'^.*?<\|im_start\|>assistant', '<|im_start|>assistant', solution_str, flags=re.DOTALL, count=1)
    elif 'Assistant:' in solution_str:
        model_output = solution_str.split('Assistant:')[-1].strip()
    else:
        model_output = solution_str

    stop_words = ["</s>", "<|im_end|>", "<|endoftext|>"]
    for stop_word in stop_words:
        if stop_word in model_output:
            model_output = model_output.split(stop_word)[0].strip()

    extract_boxed_answer = extract_last_boxed(model_output)
    if extract_boxed_answer:
        return extract_boxed_answer
    else:
        return extract_last_final_answer(model_output)

def form_options(options: list):
    option_str = 'Options are:\n'
    opts = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    for opt, o in zip(options, opts):
        option_str += f'({o}): {opt}\n'
    return option_str

def get_prediction(output):
    solution = extract_solution(output)
    if solution is None:
        return random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])
    for option in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
        if option in solution:
            return option
    return random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])

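get_prediction returns the first option letter found in the extracted answer and only falls back to a random letter when nothing can be parsed; some illustrative cases (made-up strings):

# Illustrative behaviour of get_prediction (examples are invented):
# get_prediction("... The final answer is \\boxed{C}.")  -> "C"
# get_prediction("Final Answer: (B) because ...\n")       -> "B"
# get_prediction("no parsable answer here")               -> random letter from A-J
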
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
    parser.add_argument("--output_file", type=str, default="outputs.json", help="File to save results")
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    llm = LLM(model=args.model_path, tensor_parallel_size=4, gpu_memory_utilization=0.85)
    dataset = datasets.load_dataset('TIGER-Lab/MMLU-Pro')

    categories = ['computer science', 'math', 'chemistry', 'engineering', 'law', 'biology',
                  'health', 'physics', 'business', 'philosophy', 'economics', 'other',
                  'psychology', 'history']
    # For each category store [correct_count, incorrect_count]
    per_category_accuracy = {c: [0, 0] for c in categories}
    success, fail = 0, 0
    answers = []

    print('----------------- Start Answering -------------------')

    for category in categories:
        category_entries = [entry for entry in dataset['test'] if entry['category'] == category]
        prompts = []
        for entry in category_entries:
            query = entry['question'] + '\n' + form_options(entry['options']) + '\n'
            messages = [{
                "role": "user",
                "content": query + '\nPlease reason step by step, and put your final answer option within \\boxed{}. Only put the option letter in the box, e.g. \\boxed{A}. There is only one correct answer.'
            }]
            if tokenizer.chat_template:
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            else:
                prompt = "user: " + query + '\nPlease reason step by step, and put your final answer option within \\boxed{}. Only put the letter in the box, e.g. \\boxed{A}. There is only one correct answer.'
            prompts.append(prompt)

        sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=8192)
        outputs = llm.generate(prompts, sampling_params)

        for entry, output in zip(category_entries, outputs):
            answer = output.outputs[0].text
            entry['solution'] = answer
            answers.append(entry)

            prediction = get_prediction(answer)
            if entry["answer"] == prediction:
                success += 1
                per_category_accuracy[category][0] += 1
            else:
                fail += 1
                per_category_accuracy[category][1] += 1

        # Print category accuracy as soon as it's computed
        total_cat = per_category_accuracy[category][0] + per_category_accuracy[category][1]
        cat_accuracy = per_category_accuracy[category][0] / total_cat if total_cat > 0 else 0.0
        print(f"{category}: {cat_accuracy:.4f}")

    # Save all the answers in a JSON file
    with open(args.output_file, 'w') as f:
        json.dump(answers, f, indent=2)

    # Calculate per-category report, micro average, and macro average
    print("\n----- Accuracy Report -----")
    category_accuracy_report = {}
    for category in categories:
        correct, incorrect = per_category_accuracy[category]
        total = correct + incorrect
        if total > 0:
            accuracy = correct / total
        else:
            accuracy = 0.0
        category_accuracy_report[category] = accuracy
        print(f"{category}: {correct}/{total} -> {accuracy*100:.2f}% accuracy")

    total_predictions = success + fail
    micro_avg = success / total_predictions if total_predictions > 0 else 0.0
    print(f"\nMicro Average Accuracy: {micro_avg*100:.2f}%")
    # Append a single-line record so final_results.jsonl stays valid JSONL.
    with open('final_results.jsonl', 'a') as f:
        json.dump({"dataset": "mmlupro", "model": args.model_path, "accuracy": round(micro_avg*100, 2)}, f)
        f.write('\n')
    valid_categories = [cat for cat in categories if (per_category_accuracy[cat][0] + per_category_accuracy[cat][1] > 0)]
    if valid_categories:
        macro_avg = sum(category_accuracy_report[cat] for cat in valid_categories) / len(valid_categories)
    else:
        macro_avg = 0.0
    print(f"Macro Average Accuracy: {macro_avg*100:.2f}%")
eval_supergpqa.py ADDED
@@ -0,0 +1,116 @@
import datasets
import json
import re
import random
import argparse
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

def extract_last_boxed(text):
    pattern = r'\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}'
    matches = list(re.finditer(pattern, text))
    if matches:
        return matches[-1].group(1)
    return None

def extract_last_final_answer(text):
    pattern1 = r'Final Answer:((?:[^<]|<[^<])*?)\n'
    pattern2 = r'The answer is:((?:[^<]|<[^<])*?)\n'
    matches1 = list(re.finditer(pattern1, text))
    matches2 = list(re.finditer(pattern2, text))
    if matches1:
        return matches1[-1].group(1)
    elif matches2:
        return matches2[-1].group(1)
    return None

def extract_solution(solution_str):
    if '<|im_start|>user' in solution_str:
        model_output = re.sub(r'^.*?<\|im_start\|>assistant', '<|im_start|>assistant', solution_str, flags=re.DOTALL, count=1)
    elif 'Assistant:' in solution_str:
        model_output = solution_str.split('Assistant:')[-1].strip()
    else:
        model_output = solution_str

    stop_words = ["</s>", "<|im_end|>", "<|endoftext|>"]
    for stop_word in stop_words:
        if stop_word in model_output:
            model_output = model_output.split(stop_word)[0].strip()

    extract_boxed_answer = extract_last_boxed(model_output)
    if extract_boxed_answer:
        return extract_boxed_answer
    else:
        return extract_last_final_answer(model_output)

def form_options(options: list):
    option_str = 'Options are:\n'
    opts = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    for opt, o in zip(options, opts):
        option_str += f'({o}): {opt}\n'
    return option_str

def get_prediction(output):
    solution = extract_solution(output)
    if solution is None:
        return random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])
    for option in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
        if option in solution:
            return option
    return random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
    parser.add_argument("--output_file", type=str, default="outputs.json", help="File to save results")
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    llm = LLM(model=args.model_path, tensor_parallel_size=4, gpu_memory_utilization=0.85)
    print('start loading dataset')
    dataset = datasets.load_dataset('m-a-p/SuperGPQA')
    categories = ['Engineering', 'Medicine', 'Science', 'Philosophy', 'Military Science', 'Economics', 'Management', 'Sociology', 'Literature and Arts', 'History', 'Agronomy', 'Law', 'Education']
    # For each category store [correct_count, incorrect_count]
    per_category_accuracy = {c: [0, 0] for c in categories}
    success, fail = 0, 0
    answers = []

    print('----------------- Start Answering -------------------')

    for category in categories:
        category_entries = [entry for entry in dataset['train'] if entry['discipline'] == category]
        prompts = []
        for entry in category_entries:
            query = entry['question'] + '\n' + form_options(entry['options']) + '\n'
            messages = [{
                "role": "user",
                "content": query + '\nPlease reason step by step, and put your final answer option within \\boxed{}. Only put the letter in the box, e.g. \\boxed{A}. There is only one correct answer.'
            }]
            if tokenizer.chat_template:
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            else:
                prompt = "user: " + query + '\nPlease reason step by step, and put your final answer option within \\boxed{}. Only put the letter in the box, e.g. \\boxed{A}. There is only one correct answer.'
            prompts.append(prompt)

        sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=8192)
        outputs = llm.generate(prompts, sampling_params)

        for entry, output in zip(category_entries, outputs):
            answer = output.outputs[0].text
            entry['solution'] = answer
            answers.append(entry)

            prediction = get_prediction(answer)
            if entry["answer_letter"] == prediction:
                success += 1
                per_category_accuracy[category][0] += 1
            else:
                fail += 1
                per_category_accuracy[category][1] += 1

        print(f"{category}: {per_category_accuracy[category][0] / (per_category_accuracy[category][0] + per_category_accuracy[category][1]):.4f}")

    with open(args.output_file, 'w') as f:
        json.dump(answers, f, indent=2)
    # Append a single-line record so final_results.jsonl stays valid JSONL.
    with open('final_results.jsonl', 'a') as f:
        json.dump({"dataset": "supergpqa", "model": args.model_path, "accuracy": round(success / (success + fail) * 100, 2)}, f)
        f.write('\n')
    print("Overall Accuracy:", success / (success + fail))
evaluate.bash ADDED
@@ -0,0 +1,79 @@
#!/bin/bash
# Dispatch the math-style generation tasks across all visible GPUs (one task
# per GPU), then run the multiple-choice/BBEH evaluations on the full node.
export VLLM_DISABLE_COMPILE_CACHE=1
model_name=$1

MODEL_NAMES=(
    $model_name
)

TASKS=(
    "math"
    "gsm8k"
    "amc"
    "minerva"
    "olympiad"
    "aime2024"
    "aime2025"
)

GPU_QUEUE=($(nvidia-smi --query-gpu=index --format=csv,noheader))
echo "Available GPUs: ${GPU_QUEUE[@]}"

declare -A pids

start_job() {
    local gpu_id="$1"
    local model="$2"
    local task="$3"

    echo "==> [$(date '+%Y-%m-%d %H:%M:%S')] Start task [${task}] with model [${model}] on GPU [${gpu_id}] ..."

    CUDA_VISIBLE_DEVICES="${gpu_id}" \
        python evaluation/generate.py --model "${model}" --dataset "${task}" &

    pids["${gpu_id}"]=$!
}

for MODEL_NAME in "${MODEL_NAMES[@]}"; do
    echo "==> Processing model: ${MODEL_NAME}"
    TASK_INDEX=0
    NUM_TASKS=${#TASKS[@]}

    while :; do
        # Hand out pending tasks to every idle GPU.
        while [ ${#GPU_QUEUE[@]} -gt 0 ] && [ ${TASK_INDEX} -lt ${NUM_TASKS} ]; do
            gpu_id="${GPU_QUEUE[0]}"
            GPU_QUEUE=("${GPU_QUEUE[@]:1}")

            task="${TASKS[${TASK_INDEX}]}"
            ((TASK_INDEX++))

            start_job "$gpu_id" "$MODEL_NAME" "$task"
        done

        if [ ${TASK_INDEX} -ge ${NUM_TASKS} ] && [ ${#pids[@]} -eq 0 ]; then
            break
        fi

        # Reclaim GPUs whose jobs have finished.
        for gpu_id in "${!pids[@]}"; do
            pid="${pids[$gpu_id]}"
            if ! kill -0 "$pid" 2>/dev/null; then
                echo "==> [$(date '+%Y-%m-%d %H:%M:%S')] GPU [${gpu_id}] job finished with PID [${pid}]."
                unset pids["$gpu_id"]
                GPU_QUEUE+=("$gpu_id")
            fi
        done

        sleep 1
    done
done

python evaluation/results_recheck.py --model_name $model_name &

python evaluation/eval_supergpqa.py --model_path $model_name
python evaluation/eval_bbeh.py --model_path $model_name
python evaluation/eval_mmlupro.py --model_path $model_name

python evaluation/test.py --model_name $model_name

# Wait for the background recheck job before declaring completion.
wait

echo "==> All tasks have finished!"
generate.py ADDED
@@ -0,0 +1,51 @@
import vllm
import argparse
import evaluation.datasets_loader as datasets_loader
from transformers import AutoTokenizer
import json
import os

STORAGE_PATH = os.getenv("STORAGE_PATH")

def main(args):
    print("STORAGE_PATH")
    print(STORAGE_PATH)
    with open('tokens.json', 'r') as f:
        tokens = json.load(f)  # currently only loaded; not used below
    print(args.model, args.dataset)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = vllm.LLM(
        model=args.model,
        tokenizer=args.model,
        gpu_memory_utilization=0.85
    )
    sample_params = vllm.SamplingParams(
        max_tokens=4096,
        temperature=0.0,
        stop_token_ids=[tokenizer.eos_token_id],
    )
    handler = datasets_loader.get_dataset_handler(args.dataset, args.name)
    questions, answers = handler.load_data()
    chats = [[{"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
              {"role": "user", "content": question}] for question in questions]
    if tokenizer.chat_template:
        prompts = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, add_special_tokens=True, enable_thinking=False) for chat in chats]
    else:
        prompts = ["system: " + chat[0]["content"] + '\n' + "user: " + chat[1]["content"] + '\nPlease reason step by step, and put your final answer within \\boxed{}.' for chat in chats]
    responses = model.generate(prompts, sampling_params=sample_params, use_tqdm=True)
    responses = [response.outputs[0].text for response in responses]
    scores, average_score = handler.get_score(responses, answers)
    results = [{"question": question, "answer": answer, "response": response, "score": score}
               for question, answer, response, score in zip(questions, answers, responses, scores)]
    print(f"Average score: {average_score}")
    # The last element holds the aggregate score; results_recheck.py relies on this.
    results.append({"average_score": average_score})
    os.makedirs(f"{STORAGE_PATH}/evaluation/{args.model.replace('/', '_')}", exist_ok=True)
    with open(f"{STORAGE_PATH}/evaluation/{args.model.replace('/', '_')}/results_{args.dataset}.json", "w") as f:
        json.dump(results, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B")
    parser.add_argument("--dataset", type=str, default="math")
    parser.add_argument("--name", type=str, default=None)
    args = parser.parse_args()
    main(args)
results_recheck.py ADDED
@@ -0,0 +1,74 @@
import json
from mathruler.grader import extract_boxed_content, grade_answer
import openai
import requests
from tqdm import tqdm
import random
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
args = parser.parse_args()

STORAGE_PATH = os.getenv("STORAGE_PATH")
# Fill in the judge endpoints/keys before running; with empty lists every
# recheck request fails and the original score is kept.
api_urls = []
api_keys = []


def process_example(answer, response):
    """Ask the judge model whether the model response matches the gold answer."""
    try:
        example = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a math answer checker."},
                {"role": "user", "content": f"Here is a model response: {response}\n\nand the ground truth answer is: {answer}\n\nPlease check whether the response is correct or not, and return **only** Yes or No."}
            ],
            "temperature": 0.1
        }
        api_index = random.randint(0, len(api_urls) - 1)
        api_url = api_urls[api_index]
        api_key = api_keys[api_index]
        result = requests.post(api_url, headers={"api-key": api_key, "Content-Type": "application/json"}, json=example, timeout=20)
        return result.json()['choices'][0]['message']['content']
    except Exception as e:
        print(e)
        return "No"


new_results = []
for model_name in [args.model_name]:
    for dataset in [
        "math",
        "gsm8k",
        "amc",
        "minerva",
        "olympiad",
        "aime2024",
        "aime2025",
    ]:
        with open(f'{STORAGE_PATH}/evaluation/{model_name.replace("/", "_")}/results_{dataset}.json', 'r') as f:
            results = json.load(f)

        # Skip the trailing aggregate entry appended by generate.py.
        for i in tqdm(range(len(results) - 1)):
            if results[i]['score'] < 0.5:
                gpt_check = process_example(results[i]['answer'], results[i]['response'])
                if "yes" in gpt_check.lower():
                    results[i]['score'] = 1
        new_results.append({
            'model': model_name,
            'dataset': dataset,
            'score': round(sum([result['score'] for result in results[:-1]]) / len(results[:-1]) * 100, 2)
        })
        print(new_results)
        with open('final_results.jsonl', 'a') as f:
            json.dump({
                'model': model_name,
                'dataset': dataset,
                'score': round(sum([result['score'] for result in results[:-1]]) / len(results[:-1]) * 100, 2)
            }, f)
            f.write('\n')