# AbrahamicSolver / evaluate.py
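# Re-scores model-generated math questions by sampling multiple solutions per
# question and measuring answer agreement (self-consistency / majority vote).
#
# Usage sketch (assumed invocation; the exact pipeline wiring lives elsewhere
# in the repo, and $STORAGE_PATH must point at the shared data directory):
#   STORAGE_PATH=/data python evaluate.py \
#       --model Qwen/Qwen3-4B-Base --num_samples 10 --suffix 77 --save_name run1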
import argparse
import json
import os
import re

import vllm
from transformers import AutoTokenizer

from mathruler.grader import extract_boxed_content, grade_answer
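# Command-line arguments. --suffix identifies this shard's input file and
# doubles as the sampling seed; --save_name is the shared file-name prefix.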
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Base")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--suffix", type=str, default="77")
parser.add_argument("--save_name", type=str, default="")
args = parser.parse_args()
STORAGE_PATH = os.getenv("STORAGE_PATH")
if not STORAGE_PATH:
    raise EnvironmentError("STORAGE_PATH environment variable must be set")

input_path = f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}.json"
print(f"Loading generated questions from {input_path}")
with open(input_path, "r") as f:
    data = json.load(f)
# The input file is consumed: it is deleted after loading, so a rerun
# requires regenerating the questions first.
os.remove(input_path)
# Expected item schema, e.g.:
#   {
#       "question": "Solve the equation \\(|x^2 - 3x + 2| = |x - 1.5|\\). How many real solutions does this equation have?",
#       "answer": "4",
#       "score": 0
#   }
def extract_answer(response):
    # Fallback extractor (unused in the main loop); None-safe, unlike calling
    # .group(1) directly on a possibly-None re.search result.
    match = re.search(r"\\boxed{(.*?)}", response)
    return match.group(1) if match else None
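# Load the tokenizer and vLLM engine; seeding with --suffix keeps parallel
# shards decorrelated while staying reproducible.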
tokenizer = AutoTokenizer.from_pretrained(args.model)
model = vllm.LLM(
model=args.model,
tokenizer=args.model,
gpu_memory_utilization=0.85,
seed=int(args.suffix),
)
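# Sample n completions per question; the size of the majority answer cluster,
# divided by n, becomes the question's consistency score.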
sample_params = vllm.SamplingParams(
max_tokens=4096,
temperature=1.0,
top_p=1.0,
top_k=40,
stop_token_ids=[tokenizer.eos_token_id],
n=args.num_samples,
)
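# Score semantics (inferred from usage): -1 marks items rejected upstream,
# 0 marks items still pending evaluation; only the latter are re-scored here.
# wrong_data is currently kept only for inspection and is otherwise unused.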
wrong_data = [item for item in data if item['score'] == -1]
correct_data = [item for item in data if item['score'] == 0]
questions = [item["question"] for item in correct_data]
answers = [item["answer"] for item in correct_data]
chats = [
    [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": question},
    ]
    for question in questions
]
if tokenizer.chat_template:
    prompts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        for chat in chats
    ]
else:
    # Base models without a chat template get a plain "role: content" transcript.
    prompts = ["system: " + chat[0]["content"] + "\n" + "user: " + chat[1]["content"] for chat in chats]
responses = model.generate(prompts, sampling_params=sample_params, use_tqdm=True)
print(f"Loaded {len(data)} items; re-evaluated {len(correct_data)} of them")
results_all = []
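# For each question, cluster the sampled answers by equivalence (grade_answer
# in both directions, exact string match, or both answers containing "no "),
# then take the largest cluster as the majority answer and its share as the score.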
for response, answer, question in zip(responses, answers, questions):
    try:
        error_flag = False
        results = [extract_boxed_content(output.text) for output in response.outputs]
        answer_counts = {}
        for result in results:
            found_match = False
            # Check whether this result falls into an existing answer group.
            try:
                for existing_answer in answer_counts:
                    equivalent = (
                        grade_answer(result, existing_answer)
                        or grade_answer(existing_answer, result)
                        or result == existing_answer
                        or ("no " in result.lower() and "no " in existing_answer.lower())
                    )
                    if equivalent:
                        answer_counts[existing_answer] += 1
                        found_match = True
                        break
            except Exception:
                # grade_answer can choke on malformed extractions; skip the question.
                error_flag = True
                break
            # No match found: start a new answer group.
            if not found_match:
                answer_counts[result] = 1
        if error_flag:
            continue
        if answer_counts:
            # The largest answer group is the majority answer; its share is the score.
            majority_answer, max_count = max(answer_counts.items(), key=lambda x: x[1])
            score = max_count / len(results)
            # Skip proof questions ("证明" is Chinese for "prove"), questions that
            # leak the \boxed format, and non-numeric majority answers.
            if "证明" in question or "box" in question.lower() or "text" in majority_answer.lower():
                continue
            results_all.append({"question": question, "answer": majority_answer, "score": score, "results": results})
    except Exception as e:
        print("Error:", e)
        continue
print(f"Kept {len(results_all)} questions after majority-vote filtering")
with open(f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}_results.json", "w") as f:
    json.dump(results_all, f, indent=4)