import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the model and processor
model_id = "microsoft/Phi-3.5-vision-instruct"
model_revision = "f2a2b357af3e062d60ca6e73a13f9f97a7fd3524"  # Pin to a specific revision

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    _attn_implementation="eager",  # Explicitly disable Flash Attention 2 (per the model card)
)
processor = AutoProcessor.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    num_crops=16,  # Number of image crops used by the Phi-3.5-vision processor
)

@spaces.GPU(duration=120)  # Adjust the duration as needed
def solve_math_problem(image):
    # Move the model to the GPU for this call
    model.to('cuda')

    # Build the chat prompt; <|image_1|> marks where the image is inserted
    messages = [
        {"role": "user", "content": "<|image_1|>\nSolve this math problem step by step. Explain your reasoning clearly."},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the input (the processor expects a list of images)
    inputs = processor(prompt, [image], return_tensors="pt").to("cuda")

    # Generate the response
    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args
    )

    # Decode only the newly generated tokens, skipping the prompt
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Move the model back to the CPU to free GPU memory
    model.to('cpu')
    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=solve_math_problem,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Visual Math Problem Solver",
    description="Upload an image of a math problem, and I'll try to solve it step by step!",
    examples=[
        ["example_math_problem1.jpg"],
        ["example_math_problem2.jpg"]
    ]
)

# Launch the app
iface.launch()