import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the model and processor
model_id = "microsoft/Phi-3.5-vision-instruct"
model_revision = "f2a2b357af3e062d60ca6e73a13f9f97a7fd3524"  # Pin to a specific revision

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    _attn_implementation="eager",  # Explicitly disable Flash Attention 2 (per the model card)
)
processor = AutoProcessor.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    num_crops=16,  # Number of image crops used by the Phi-3.5-vision processor
)

@spaces.GPU(duration=120)  # Adjust the duration as needed
def solve_math_problem(image):
    # Move the model to the GPU for this call
    model.to('cuda')

    # Build the chat prompt; <|image_1|> marks where the image is inserted
    messages = [
        {"role": "user", "content": "<|image_1|>\nSolve this math problem step by step. Explain your reasoning clearly."},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the input (the processor expects a list of images)
    inputs = processor(prompt, [image], return_tensors="pt").to("cuda")

    # Generate the response
    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args
    )

    # Decode only the newly generated tokens, skipping the prompt
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Move the model back to the CPU to free GPU memory
    model.to('cpu')
    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=solve_math_problem,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Visual Math Problem Solver",
    description="Upload an image of a math problem, and I'll try to solve it step by step!",
    examples=[
        ["example_math_problem1.jpg"],
        ["example_math_problem2.jpg"]
    ]
)

# Launch the app
iface.launch()