Spaces:
Running
Running
from abc import ABC, abstractmethod | |
import re | |
from math_verify import parse, verify | |
import pandas | |
from datasets import load_dataset | |
import random | |
ANSWER_PATTERN_MULTICHOICE = r"(?:\$\$\s*)?\\boxed\{[^}]*?([A-Z])[^}]*\}(?:\s*\$\$)?|(?:\*{0,2}\s*)?(?:Final|Correct)\s*Answer:\s*([A-Z])\." | |
ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)" | |
ANSWER_PATTERN_BOXED = r"(?i)\\boxed\s*{([^\n]+)}" | |
class DatasetHandler(ABC): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED, num_examples: int = None): | |
self.answer_pattern = answer_pattern | |
self.num_examples = num_examples if num_examples is not None else 1 | |
def load_data(self): | |
""" | |
Load the dataset and return a tuple: (splits_dict, answer_type). | |
splits_dict: A dictionary where each key is a split name (e.g., 'train', 'test') | |
and the value is the corresponding dataset or data structure. | |
answer_type: A string describing the type of the answer, e.g.: | |
'number', 'text', 'option letter', etc. | |
""" | |
pass | |
def extract_answer(self, response: str) -> str: | |
try: | |
return re.search(self.answer_pattern, response).group(1) | |
except: | |
return None | |
def compare_answer(self, response: str, answer: str) -> bool: | |
response_answer = self.extract_answer(response) | |
answer = str(answer) | |
response_answer = str(response_answer) | |
if response_answer is None: | |
return False | |
if self.answer_pattern == ANSWER_PATTERN_MULTICHOICE: | |
return response_answer == answer | |
return verify(parse(answer), parse(response_answer)) | |
def get_score(self, responses: str, answers: str) -> float: | |
scores = [] | |
for r,a in zip(responses, answers): | |
if self.compare_answer(r,a): | |
scores.append(1) | |
else: | |
scores.append(0) | |
return scores, sum(scores)/len(scores) | |
class MathDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
df = pandas.read_csv( | |
f"https://openaipublic.blob.core.windows.net/simple-evals/math_500_test.csv" | |
) | |
examples = [row.to_dict() for _, row in df.iterrows()] | |
questions = [example['Question'] for example in examples] | |
answers = [example['Answer'] for example in examples] | |
return questions, answers | |
class Gsm8kDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("openai/gsm8k", 'main', split='test') | |
examples = [row for row in dataset] | |
questions = [example['question'] for example in examples] | |
answers = [example["answer"].split('#### ')[-1] for example in examples] | |
return questions, answers | |
class AmcDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("zwhe99/amc23", split='test') | |
examples = [row for row in dataset] | |
questions = [example['question'] for example in examples] *32 | |
answers = [example['answer'] for example in examples] *32 | |
return questions, answers | |
class MinervaDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("zwhe99/simplerl-minerva-math", split='test') | |
examples = [row for row in dataset] | |
questions = [example['problem'] for example in examples] | |
answers = [example['answer'] for example in examples] | |
return questions, answers | |
class OlympiadDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("zwhe99/simplerl-OlympiadBench", split='test') | |
examples = [row for row in dataset] | |
questions = [example['question'] for example in examples] | |
answers = [example['final_answer'][0] for example in examples] | |
return questions, answers | |
class Aime2024DatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("HuggingFaceH4/aime_2024", split='train') | |
examples = [row for row in dataset] | |
questions = [example['problem'] for example in examples]*32 | |
answers = [example['answer'] for example in examples]*32 | |
return questions, answers | |
class Aime2025DatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("yentinglin/aime_2025", 'default')['train'] | |
examples = [row for row in dataset] | |
questions = [example['problem'] for example in examples]*32 | |
answers = [example['answer'] for example in examples]*32 | |
return questions, answers | |
class MmluProDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset('TIGER-Lab/MMLU-Pro', split='test') | |
examples = [] | |
for row in dataset: | |
example = { | |
'question': row['question'], | |
'options': row['options'], | |
'answer': row['answer'], | |
'answer_index': row['answer_index'], | |
'category': row['category'], | |
'cot_content': row['cot_content'], | |
'src': row['src'] | |
} | |
examples.append(example) | |
random.shuffle(examples) | |
examples = examples[:1000] | |
questions = [] | |
answers = [] | |
for example in examples: | |
# Format question with options | |
question = example['question'] + "\n\nOptions:\n" | |
for i, opt in enumerate(example['options']): | |
question += f"{chr(65+i)}. {opt}\n" | |
questions.append(question) | |
answers.append(example['answer']) | |
return questions, answers | |
class bbehDatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("MrLight/bbeh-eval", split='train') | |
examples = [row for row in dataset] | |
random.shuffle(examples) | |
examples = examples[:1000] | |
questions = [example['question'] for example in examples] | |
answers = [example['answer'] for example in examples] | |
return questions, answers | |
class SuperGPQADatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset('m-a-p/SuperGPQA') | |
examples = [] | |
for row in dataset['train']: | |
example = { | |
'question': row['question'], | |
'options': row['options'], | |
'answer': row['answer_letter'] | |
} | |
examples.append(example) | |
random.shuffle(examples) | |
examples = examples[:1000] | |
questions = [] | |
answers = [] | |
for example in examples: | |
# Format question with options | |
question = example['question'] + "\n\nOptions:\n" | |
for i, opt in enumerate(example['options']): | |
question += f"{chr(65+i)}. {opt}\n" | |
questions.append(question) | |
answers.append(example['answer']) | |
return questions, answers | |
class GPQA_DatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_MULTICHOICE): | |
super().__init__(answer_pattern) | |
def load_data(self): | |
dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond",'train') | |
examples = [] | |
for row in dataset: | |
# Get the question and answers | |
question = row['Question'] | |
options = [ | |
row['Correct Answer'], | |
row['Incorrect Answer 1'], | |
row['Incorrect Answer 2'], | |
row['Incorrect Answer 3'] | |
] | |
# Shuffle options to randomize correct answer position | |
random.shuffle(options) | |
# Find the index of correct answer after shuffling | |
correct_index = options.index(row['Correct Answer']) | |
correct_option = chr(65 + correct_index) | |
example = { | |
'question': question, | |
'options': options, | |
'answer': correct_option | |
} | |
examples.append(example) | |
# Shuffle and limit to 1000 examples like other handlers | |
random.shuffle(examples) | |
examples = examples[:1000] | |
questions = [] | |
answers = [] | |
for example in examples: | |
# Format question with options | |
question = example['question'] + "\n\nOptions:\n" | |
for i, opt in enumerate(example['options']): | |
question += f"{chr(65+i)}. {opt}\n" | |
questions.append(question) | |
answers.append(example['answer']) | |
return questions, answers | |
class Mydataset_DatasetHandler(DatasetHandler): | |
def __init__(self, answer_pattern: str = ANSWER_PATTERN_BOXED, name: str = "qwen3_frequent_solver_v1"): | |
super().__init__(answer_pattern) | |
self.name = name | |
def load_data(self): | |
dataset = load_dataset(self.name)['train'] | |
examples = [] | |
for row in dataset: | |
example = { | |
'question': row['problem'], | |
'answer': row['answer'] | |
} | |
examples.append(example) | |
# Shuffle and limit to 1000 examples like other handlers | |
random.shuffle(examples) | |
# examples = examples[:1000] | |
questions = [] | |
answers = [] | |
for example in examples: | |
questions.append(example['question']) | |
answers.append(example['answer']) | |
return questions, answers | |
def get_dataset_handler(dataset_name: str,name: str = None) -> DatasetHandler: | |
if dataset_name == "math": | |
return MathDatasetHandler() | |
elif dataset_name == "gsm8k": | |
return Gsm8kDatasetHandler() | |
elif dataset_name == "amc": | |
return AmcDatasetHandler() | |
elif dataset_name == "minerva": | |
return MinervaDatasetHandler() | |
elif dataset_name == "olympiad": | |
return OlympiadDatasetHandler() | |
elif dataset_name == "aime2024": | |
return Aime2024DatasetHandler() | |
elif dataset_name == "aime2025": | |
return Aime2025DatasetHandler() | |
elif dataset_name == "mmlu_pro": | |
return MmluProDatasetHandler() | |
elif dataset_name == "bbeh": | |
return bbehDatasetHandler() | |
elif dataset_name == "super_gpqa": | |
return SuperGPQADatasetHandler() | |
elif dataset_name == "gpqa": | |
return GPQA_DatasetHandler() | |
elif dataset_name == "mydataset": | |
return Mydataset_DatasetHandler(name=name) | |
else: | |
raise ValueError(f"Dataset {dataset_name} not found") | |
if __name__ == "__main__": | |
print("mmlu_pro") | |
for dataset_name in ["gpqa"]: | |
print(f"Loading {dataset_name} dataset") | |
handler = get_dataset_handler(dataset_name) | |
questions, answers = handler.load_data() | |
print(questions[0]) | |
print('-'*100) | |
print(answers[0]) | |