|
import gradio as gr |
|
import spaces |
|
from PIL import Image |
|
import requests |
|
from transformers import AutoModelForCausalLM, AutoProcessor |
|
import torch |
|
|
|
|
|
# Checkpoint to load. The revision is pinned so that the remote modelling code
# executed via ``trust_remote_code=True`` cannot change underneath the app.
model_id = "microsoft/Phi-3.5-vision-instruct"
model_revision = "f2a2b357af3e062d60ca6e73a13f9f97a7fd3524"

# Load the weights once at import time in half precision.
# ``_attn_implementation="eager"`` is the switch the model card documents for
# running without flash-attention; the deprecated ``use_flash_attention_2=False``
# flag only declines to *request* flash-attention and is not the documented way
# to turn it off for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    _attn_implementation="eager",
)

# The processor is loaded from the same pinned revision as the model.
# ``num_crops=16`` is forwarded to the checkpoint's processor
# (NOTE(review): the model card suggests 16 for single-image input - confirm).
processor = AutoProcessor.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    num_crops=16,
)
|
|
|
@spaces.GPU(duration=120)
def solve_math_problem(image):
    """Ask the vision-language model to solve the math problem in ``image``.

    Args:
        image: The picture of the problem, passed straight to the processor
            together with the chat prompt. ``None`` means no picture was
            supplied.

    Returns:
        The generated answer as a string, with the prompt tokens sliced off
        and special tokens removed.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    # Fail early with a readable message instead of letting the processor
    # crash on a missing image.
    if image is None:
        raise gr.Error("Please upload an image of a math problem first.")

    # "<|image_1|>" is the placeholder the prompt uses for the attached image.
    messages = [
        {"role": "user", "content": "<|image_1|>\nSolve this math problem step by step. Explain your reasoning clearly."},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }

    # The model lives on the CPU between calls and is moved to the GPU only
    # for the duration of one request.
    model.to("cuda")
    try:
        inputs = processor(prompt, image, return_tensors="pt").to("cuda")

        generate_ids = model.generate(
            **inputs,
            eos_token_id=processor.tokenizer.eos_token_id,
            **generation_args,
        )

        # generate() returns prompt + completion; keep only the new tokens.
        prompt_length = inputs["input_ids"].shape[1]
        generate_ids = generate_ids[:, prompt_length:]

        response = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Always move the weights back, even when preprocessing or generation
        # fails, so a failed request does not leave the model on the GPU.
        model.to("cpu")

    return response
|
|
|
|
|
# Web UI: a single image upload wired to ``solve_math_problem``, with the
# model's answer shown as plain text. The example entries are file paths
# offered as one-click sample inputs.
iface = gr.Interface(
    fn=solve_math_problem,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Visual Math Problem Solver",
    description="Upload an image of a math problem, and I'll try to solve it step by step!",
    examples=[["example_math_problem1.jpg"], ["example_math_problem2.jpg"]],
)

# Start serving the interface when the script is executed.
iface.launch()