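# Re-scores evaluation results with a GPT-4o judge: for each dataset, answers that the
# rule-based grader scored below 0.5 are re-checked through the endpoints configured in
# `api_urls`/`api_keys`, and per-dataset accuracy is appended to final_results.jsonl.
# STORAGE_PATH must point at the directory containing evaluation/<model>/results_<dataset>.json.
#
# Example invocation (the file name here is only an assumption, not fixed by this script):
#   python rescore_with_llm_judge.py --model_name Qwen/Qwen2.5-7B-Instruct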
import json
from mathruler.grader import extract_boxed_content, grade_answer
import openai
import requests
from tqdm import tqdm
import random
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
args = parser.parse_args()

STORAGE_PATH = os.getenv("STORAGE_PATH")

# Populate these with the judge API endpoints and their matching keys before running;
# each request picks one endpoint/key pair at random.
api_urls = []
api_keys = []
def process_example(answer, response):
    """Ask the GPT-4o judge whether `answer` matches the ground-truth `response`."""
    try:
        example = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a math answer checker."},
                {"role": "user", "content": f"Hi, there is an answer: {answer}\n\n, and the ground truth answer is: {response}\n\n, please check whether the answer is correct or not, and return **only** Yes or No."},
            ],
            "temperature": 0.1,
        }
        # Spread requests randomly across the configured endpoints.
        api_index = random.randint(0, len(api_urls) - 1)
        api_url = api_urls[api_index]
        api_key = api_keys[api_index]
        api_response = requests.post(
            api_url,
            headers={"api-key": api_key, "Content-Type": "application/json"},
            json=example,
            timeout=20,
        )
        return api_response.json()['choices'][0]['message']['content']
    except Exception as e:
        # Any failure (no endpoints configured, timeout, malformed response) counts as "No".
        print(e)
        return "No"
new_results = []
for model_name in [args.model_name]:
    for dataset in [
        "math",
        "gsm8k",
        "amc",
        "minerva",
        "olympiad",
        "aime2024",
        "aime2025",
    ]:
        with open(f'{STORAGE_PATH}/evaluation/{model_name.replace("/", "_")}/results_{dataset}.json', 'r') as f:
            results = json.load(f)
        # Re-judge examples the rule-based grader marked as incorrect; the last entry is
        # excluded from both the re-grading loop and the accuracy average.
        for i in tqdm(range(len(results) - 1)):
            if results[i]['score'] < 0.5:
                gpt_check = process_example(results[i]['answer'], results[i]['response'])
                if "yes" in gpt_check.lower():
                    results[i]['score'] = 1
        entry = {
            'model': model_name,
            'dataset': dataset,
            'score': round(sum(result['score'] for result in results[:-1]) / len(results[:-1]) * 100, 2),
        }
        new_results.append(entry)
        print(new_results)
        with open('final_results.jsonl', 'a') as f:
            json.dump(entry, f)
            f.write('\n')
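
# Each results_<dataset>.json is assumed to be a list of records carrying at least
# "answer", "response", and "score" fields, e.g. (purely illustrative values):
#   {"answer": "...", "response": "...", "score": 0.0}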