# AbrahamicSolver / evaluate.py
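# Re-scores model-generated math questions by sampling multiple solutions per
# question and measuring answer agreement (self-consistency / majority vote).
#
# Usage sketch (assumed invocation; the exact pipeline wiring lives elsewhere
# in the repo, and $STORAGE_PATH must point at the shared data directory):
#   STORAGE_PATH=/data python evaluate.py \
#       --model Qwen/Qwen3-4B-Base --num_samples 10 --suffix 77 --save_name run1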
import argparse
import json
import os
import re

import vllm
from transformers import AutoTokenizer

from mathruler.grader import extract_boxed_content, grade_answer
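# Command-line arguments. --suffix identifies this shard's input file and
# doubles as the sampling seed; --save_name is the shared file-name prefix.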
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Base")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--suffix", type=str, default="77")
parser.add_argument("--save_name", type=str, default="")
args = parser.parse_args()
STORAGE_PATH = os.getenv("STORAGE_PATH")
if not STORAGE_PATH:
    raise EnvironmentError("STORAGE_PATH environment variable must be set")

input_path = f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}.json"
print(f"Loading generated questions from {input_path}")
with open(input_path, "r") as f:
    data = json.load(f)
# The input file is consumed: it is deleted after loading, so a rerun
# requires regenerating the questions first.
os.remove(input_path)
# Expected item schema, e.g.:
#   {
#       "question": "Solve the equation \\(|x^2 - 3x + 2| = |x - 1.5|\\). How many real solutions does this equation have?",
#       "answer": "4",
#       "score": 0
#   }
def extract_answer(response):
    # Fallback extractor (unused in the main loop); None-safe, unlike calling
    # .group(1) directly on a possibly-None re.search result.
    match = re.search(r"\\boxed{(.*?)}", response)
    return match.group(1) if match else None
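# Load the tokenizer and vLLM engine; seeding with --suffix keeps parallel
# shards decorrelated while staying reproducible.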
tokenizer = AutoTokenizer.from_pretrained(args.model)
model = vllm.LLM(
model=args.model,
tokenizer=args.model,
gpu_memory_utilization=0.85,
seed=int(args.suffix),
)
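# Sample n completions per question; the size of the majority answer cluster,
# divided by n, becomes the question's consistency score.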
sample_params = vllm.SamplingParams(
max_tokens=4096,
temperature=1.0,
top_p=1.0,
top_k=40,
stop_token_ids=[tokenizer.eos_token_id],
n=args.num_samples,
)
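# Score semantics (inferred from usage): -1 marks items rejected upstream,
# 0 marks items still pending evaluation; only the latter are re-scored here.
# wrong_data is currently kept only for inspection and is otherwise unused.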
wrong_data = [item for item in data if item['score'] == -1]
correct_data = [item for item in data if item['score'] == 0]
questions = [item["question"] for item in correct_data]
answers = [item["answer"] for item in correct_data]
chats = [
    [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": question},
    ]
    for question in questions
]
if tokenizer.chat_template:
    prompts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        for chat in chats
    ]
else:
    # Base models without a chat template get a plain "role: content" transcript.
    prompts = ["system: " + chat[0]["content"] + "\n" + "user: " + chat[1]["content"] for chat in chats]
responses = model.generate(prompts, sampling_params=sample_params, use_tqdm=True)
print(f"Loaded {len(data)} items; re-evaluated {len(correct_data)} of them")
results_all = []
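# For each question, cluster the sampled answers by equivalence (grade_answer
# in both directions, exact string match, or both answers containing "no "),
# then take the largest cluster as the majority answer and its share as the score.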
for response, answer, question in zip(responses, answers, questions):
    try:
        error_flag = False
        results = [extract_boxed_content(output.text) for output in response.outputs]
        answer_counts = {}
        for result in results:
            found_match = False
            # Check whether this result falls into an existing answer group.
            try:
                for existing_answer in answer_counts:
                    equivalent = (
                        grade_answer(result, existing_answer)
                        or grade_answer(existing_answer, result)
                        or result == existing_answer
                        or ("no " in result.lower() and "no " in existing_answer.lower())
                    )
                    if equivalent:
                        answer_counts[existing_answer] += 1
                        found_match = True
                        break
            except Exception:
                # grade_answer can choke on malformed extractions; skip the question.
                error_flag = True
                break
            # No match found: start a new answer group.
            if not found_match:
                answer_counts[result] = 1
        if error_flag:
            continue
        if answer_counts:
            # The largest answer group is the majority answer; its share is the score.
            majority_answer, max_count = max(answer_counts.items(), key=lambda x: x[1])
            score = max_count / len(results)
            # Skip proof questions ("证明" is Chinese for "prove"), questions that
            # leak the \boxed format, and non-numeric majority answers.
            if "证明" in question or "box" in question.lower() or "text" in majority_answer.lower():
                continue
            results_all.append({"question": question, "answer": majority_answer, "score": score, "results": results})
    except Exception as e:
        print("Error:", e)
        continue
print(f"Kept {len(results_all)} questions after majority-vote filtering")
with open(f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}_results.json", "w") as f:
    json.dump(results_all, f, indent=4)