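r"""Generate new competition-math questions with a vLLM-served model.

The script prompts the model to author novel problems, parses the
<question>...</question> block and the \boxed{...} answer out of each
completion, and writes the results as JSON under $STORAGE_PATH.
"""
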
import vllm
import torch
from transformers import AutoTokenizer
import argparse
from typing import List
from vllm.outputs import RequestOutput
from evaluation.datasets_loader import get_dataset_handler
import json
import regex as re
import os

STORAGE_PATH = os.getenv("STORAGE_PATH")
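
# Extract the contents of every \boxed{...} span in `text`, using brace
# matching so that nested braces inside an answer are preserved.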
def extract_boxed(text):
    results, i = [], 0
    prefix = r'\boxed{'
    plen = len(prefix)
    while True:
        start = text.find(prefix, i)
        if start == -1:
            break  # no more \boxed{…}
        j = start + plen
        depth = 1
        while j < len(text) and depth:
            if text[j] == '{':
                depth += 1
            elif text[j] == '}':
                depth -= 1
            j += 1
        results.append(text[start + plen : j - 1])
        i = j
    return results
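
# Build a 0/1 mask over a batch of response token ids: entries are 1 up to the
# first eos_token_id in each row and 0 from that position onward. Defined here
# but not called in main() below.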
def get_response_mask(response_ids, eos_token_id, dtype):
    batch_size, seq_len = response_ids.shape
    mask = torch.ones((batch_size, seq_len), dtype=dtype)
    for i in range(batch_size):
        for j in range(seq_len):
            if response_ids[i][j] == eos_token_id:
                mask[i][j:] = 0
                break
    return mask
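
# Full pipeline: load the tokenizer and vLLM engine, build the problem-authoring
# prompt, sample args.num_samples completions, parse them, and save to JSON.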
def main(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
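
    # Initialize the vLLM engine; the run suffix doubles as the sampling seed
    # so that parallel runs produce different questions.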
    model = vllm.LLM(
        model=args.model,
        tokenizer=args.model,
        # gpu_memory_utilization=0.8,
        seed=int(args.suffix),
    )
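
    # Load the reference math dataset; the first question/answer pair is kept
    # as a placeholder but is not referenced again in this script.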
    dataset_handler = get_dataset_handler("math")
    questions, answers = dataset_handler.load_data()
    question = questions[0]
    answer = answers[0]
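
    # Prompt the model to author one new competition-level problem and emit it
    # in a fixed <question>...</question> + \boxed{...} format.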
    chat = [
        {
            "role": "system",
            "content": (
                "You are an expert competition-math problem setter.\n"
                "FIRST, in your private scratch-pad, think step-by-step to design a brand-new, non-trivial problem. "
                "The problem could come from any field of mathematics, including but not limited to algebra, geometry, number theory, combinatorics, prealgebra, probability, statistics, and calculus. "
                "Aim for a difficulty such that fewer than 30% of advanced high-school students could solve it. "
                "Avoid re-using textbook clichés or famous contest problems.\n"
                "THEN, without revealing any of your private thoughts, output **exactly** the following two blocks:\n\n"
                "<question>\n"
                "{The full problem statement on one or more lines}\n"
                "</question>\n\n"
                r"\boxed{final_answer}"
                "\n\n"
                "Do NOT output anything else—no explanations, no extra markup."
            )
        },
        {
            "role": "user",
            "content": (
                "Generate one new, challenging reasoning question now. "
                "Remember to format the output exactly as instructed."
            )
        }
    ]
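
    # Render the chat with the model's own chat template when one is defined;
    # otherwise fall back to a plain role-prefixed string.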
    if tokenizer.chat_template:
        prompt = tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True,
            add_special_tokens=True
        )
    else:
        prompt = "system: " + chat[0]["content"] + '\n' + "user: " + chat[1]["content"]
    sample_params = vllm.SamplingParams(
        max_tokens=4096,
        temperature=1.0,
        top_p=0.95,
        n=1,
        stop_token_ids=[tokenizer.eos_token_id],
    )
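
    # Issue args.num_samples copies of the same prompt as independent requests.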
    completions: List[RequestOutput] = model.generate([prompt] * args.num_samples, sampling_params=sample_params)
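
    # Parse each completion: well-formed outputs are stored with score 0;
    # anything that fails to parse is stored verbatim with score -1.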
    results = []
    for completion in completions:
        response = completion.outputs[0].text
        try:
            questions = re.findall(r"<question>(.*?)</question>", response, re.DOTALL)
            answers = extract_boxed(response)
            if questions and answers:
                question = questions[-1].strip()
                answer = answers[-1].strip()
                results.append({"question": question, "answer": answer, "score": 0})
            else:
                results.append({"question": response, "answer": "", "score": -1})
        except Exception:
            results.append({"question": response, "answer": "", "score": -1})
with open(f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}.json", "w") as f: | |
json.dump(results, f, indent=4) | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B") | |
parser.add_argument("--num_samples", type=int, default=1250, help="Number of samples to generate") | |
parser.add_argument("--suffix", type=str, default="", help="Suffix to add to the output file") | |
parser.add_argument("--save_name", type=str, default="", help="") | |
args = parser.parse_args() | |
main(args) | |