# AbrahamicSolver / results_recheck.py
# Gatsby767's picture
# Upload 7 files
# 47a4065 verified
import json
from mathruler.grader import extract_boxed_content, grade_answer
import openai
import requests
from tqdm import tqdm
import random
import argparse
import os
# Command-line interface: --model_name selects which model's evaluation
# results are re-checked by this script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_name",
    type=str,
    default="Qwen/Qwen2.5-7B-Instruct",
)
args = parser.parse_args()

# Root directory under which the evaluation result files are looked up.
# This is None when the STORAGE_PATH environment variable is not set.
STORAGE_PATH = os.environ.get("STORAGE_PATH")

# Parallel lists: the key at api_keys[i] is sent to the endpoint at
# api_urls[i]. Both are empty here and must be populated for the judge
# request to be made.
api_urls, api_keys = [], []
def process_example(answer, response):
    """Ask the GPT-4o checker whether an answer is correct and return its reply.

    The prompt presents ``answer`` as the answer under test and ``response``
    as the ground truth. One endpoint/key pair is chosen at random (same
    index in both lists) from the module-level ``api_urls`` / ``api_keys``.

    NOTE(review): the call site in this file passes ``results[i]['answer']``
    as ``answer`` and ``results[i]['response']`` as ``response``; confirm that
    labelling the latter "ground truth" in the prompt is intended.

    Returns:
        The message content of the first choice in the JSON reply, or the
        string ``"No"`` if anything goes wrong (the exception is printed).
    """
    try:
        user_prompt = f"Hi, there is a answer: {answer}\n\n, and the ground truth answer is: {response}\n\n, please check whether the answer is correct or not, and return the **only** Yes or No."
        payload = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a math answer checker."},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.1,
        }

        # Pick one configured endpoint and the key stored at the same index.
        slot = random.randint(0, len(api_urls) - 1)
        endpoint, key = api_urls[slot], api_keys[slot]

        request_headers = {"api-key": key, "Content-Type": "application/json"}
        reply = requests.post(endpoint, headers=request_headers, json=payload, timeout=20)
        return reply.json()['choices'][0]['message']['content']
    except Exception as err:
        # Best-effort: any failure (no endpoints, network error, unexpected
        # reply shape) is reported and treated as a negative verdict.
        print(err)
        return "No"
# Re-check every evaluation entry whose score is below 0.5 by asking the GPT
# judge (process_example); a reply containing "yes" upgrades the score to 1.
# The resulting per-dataset accuracy (in percent, two decimals) is collected
# in new_results and appended as one JSON line to final_results.jsonl.
new_results = []
for model_name in [args.model_name]:
    for dataset in [
        "math",
        "gsm8k",
        "amc",
        "minerva",
        "olympiad",
        "aime2024",
        "aime2025",
    ]:
        results_path = f'{STORAGE_PATH}/evaluation/{model_name.replace("/","_")}/results_{dataset}.json'
        with open(results_path, 'r') as f:
            results = json.load(f)

        # The last element of the file is excluded from both re-checking and
        # averaging (presumably a trailing summary record -- TODO confirm
        # against the code that writes these files). The slice is a shallow
        # copy, so updating an entry here also updates it inside `results`.
        scored = results[:-1]
        if not scored:
            # Nothing to average: report it instead of dividing by zero and
            # aborting the remaining datasets.
            print(f"No scored entries in {results_path}; skipping {dataset}.")
            continue

        for result in tqdm(scored):
            if result['score'] < 0.5:
                gpt_check = process_example(result['answer'], result['response'])
                # The judge's message content may not be a string (e.g. null);
                # treat anything that is not text as "not confirmed".
                if isinstance(gpt_check, str) and "yes" in gpt_check.lower():
                    result['score'] = 1

        # Build the summary once and reuse it for the in-memory list and the
        # JSONL line so the two can never disagree.
        summary = {
            'model': model_name,
            'dataset': dataset,
            'score': round(sum(result['score'] for result in scored) / len(scored) * 100, 2),
        }
        new_results.append(summary)
        print(new_results)
        with open('final_results.jsonl', 'a') as f:
            json.dump(summary, f)
            f.write('\n')