# sparrow_demo / app.py
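"""Gradio backend for Sparrow (https://github.com/katanaml/sparrow).

Serves Qwen2-VL-2B-Instruct: each uploaded document image is paired with the
text query, run through the model, and the responses are returned.
"""
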
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
from datetime import datetime
import os
import torch
import gc
# Configure memory settings
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
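# max_split_size_mb (set above) stops the CUDA caching allocator from splitting
# blocks larger than 64 MB, which reduces fragmentation-related OOMs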
DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"
def process_image(image_filepath, max_width=640, max_height=800):
    if image_filepath is None:
        raise ValueError("No image provided")

    img = Image.open(image_filepath)
    width, height = img.size

    # Resize with aspect ratio preservation
    aspect_ratio = width / height
    if aspect_ratio > (max_width / max_height):
        new_width = max_width
        new_height = int(max_width / aspect_ratio)
    else:
        new_height = max_height
        new_width = int(max_height * aspect_ratio)
    img = img.resize((new_width, new_height), Image.LANCZOS)

    # JPEG cannot store an alpha channel, so convert e.g. RGBA PNGs first
    if img.mode != "RGB":
        img = img.convert("RGB")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/image_{timestamp}.jpg"
    img.save(filename, format='JPEG', quality=75, optimize=True)
    return os.path.abspath(filename), new_width, new_height

# Model initialization with memory optimizations
model = None
processor = None

def load_model():
    global model, processor
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
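
# Note: load_model() is only called from the GPU-decorated handler below, so the
# weights are loaded lazily, once the first request arrives.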
@spaces.GPU
def run_inference(input_imgs, text_input):
    global model, processor
    if model is None:
        load_model()

    results = []
    for image in input_imgs:
        torch.cuda.empty_cache()
        gc.collect()

        image_path, width, height = process_image(image)
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": text_input}
                ]
            }]
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # Tokenize the prompt and preprocess the image (prompt capped at 512 tokens)
            inputs = processor(
                text=[text],
                images=[Image.open(image_path)],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to("cuda")
            # Memory-efficient greedy generation
            with torch.inference_mode():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    num_beams=1,
                )
            # Decode only the newly generated tokens, dropping the prompt portion
            output = processor.batch_decode(
                generated_ids[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )[0]
            results.append(output)

            # Force memory cleanup before the next image
            del inputs, generated_ids
            torch.cuda.empty_cache()
            gc.collect()
        finally:
            # Always remove the temporary resized copy
            if os.path.exists(image_path):
                os.remove(image_path)

    # Join the per-image outputs so they display cleanly in a single Textbox
    return "\n\n".join(results)

# Streamlined interface
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        input_imgs = gr.Files(file_types=["image"], label="Upload Images")
        text_input = gr.Textbox(label="Query")

    submit_btn = gr.Button("Submit", variant="primary")
    output_text = gr.Textbox(label="Response", elem_id="output")

    submit_btn.click(run_inference, [input_imgs, text_input], output_text)

demo.queue(max_size=1)
demo.launch()
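
# Rough client-side usage sketch (not executed by this app). Assumes gradio_client
# is installed and the Space is reachable; the Space id, file name, and query
# below are placeholders, and the endpoint name may differ.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("katanaml/sparrow-qwen2-vl-2b")      # placeholder Space id
#   answer = client.predict(
#       [handle_file("invoice.jpg")],                    # input_imgs
#       "retrieve the invoice number and total",         # text_input
#       api_name="/run_inference",
#   )
#   print(answer)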