# sparrow_demo / app.py
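"""Gradio backend for Sparrow (https://github.com/katanaml/sparrow).

Serves Qwen2-VL-2B-Instruct: each uploaded document image is paired with the
text query, run through the model, and the responses are returned.
"""
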
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
from datetime import datetime
import os
import torch
import gc
# Configure memory settings
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
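# max_split_size_mb (set above) stops the CUDA caching allocator from splitting
# blocks larger than 64 MB, which reduces fragmentation-related OOMs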
DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"
def process_image(image_filepath, max_width=640, max_height=800):
    if image_filepath is None:
        raise ValueError("No image provided")

    img = Image.open(image_filepath)
    width, height = img.size

    # Resize with aspect ratio preservation
    aspect_ratio = width / height
    if aspect_ratio > (max_width / max_height):
        new_width = max_width
        new_height = int(max_width / aspect_ratio)
    else:
        new_height = max_height
        new_width = int(max_height * aspect_ratio)
    img = img.resize((new_width, new_height), Image.LANCZOS)

    # JPEG cannot store an alpha channel, so convert e.g. RGBA PNGs first
    if img.mode != "RGB":
        img = img.convert("RGB")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/image_{timestamp}.jpg"
    img.save(filename, format='JPEG', quality=75, optimize=True)
    return os.path.abspath(filename), new_width, new_height

# Model initialization with memory optimizations
model = None
processor = None

def load_model():
    global model, processor
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
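
# Note: load_model() is only called from the GPU-decorated handler below, so the
# weights are loaded lazily, once the first request arrives.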
@spaces.GPU
def run_inference(input_imgs, text_input):
    global model, processor
    if model is None:
        load_model()

    results = []
    for image in input_imgs:
        torch.cuda.empty_cache()
        gc.collect()

        image_path, width, height = process_image(image)
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": text_input}
                ]
            }]
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # Tokenize the prompt and preprocess the image (prompt capped at 512 tokens)
            inputs = processor(
                text=[text],
                images=[Image.open(image_path)],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to("cuda")
            # Memory-efficient greedy generation
            with torch.inference_mode():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    num_beams=1,
                )
            # Decode only the newly generated tokens, dropping the prompt portion
            output = processor.batch_decode(
                generated_ids[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )[0]
            results.append(output)

            # Force memory cleanup before the next image
            del inputs, generated_ids
            torch.cuda.empty_cache()
            gc.collect()
        finally:
            # Always remove the temporary resized copy
            if os.path.exists(image_path):
                os.remove(image_path)

    # Join the per-image outputs so they display cleanly in a single Textbox
    return "\n\n".join(results)

# Streamlined interface
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        input_imgs = gr.Files(file_types=["image"], label="Upload Images")
        text_input = gr.Textbox(label="Query")

    submit_btn = gr.Button("Submit", variant="primary")
    output_text = gr.Textbox(label="Response", elem_id="output")

    submit_btn.click(run_inference, [input_imgs, text_input], output_text)

demo.queue(max_size=1)
demo.launch()
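
# Rough client-side usage sketch (not executed by this app). Assumes gradio_client
# is installed and the Space is reachable; the Space id, file name, and query
# below are placeholders, and the endpoint name may differ.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("katanaml/sparrow-qwen2-vl-2b")      # placeholder Space id
#   answer = client.predict(
#       [handle_file("invoice.jpg")],                    # input_imgs
#       "retrieve the invoice number and total",         # text_input
#       api_name="/run_inference",
#   )
#   print(answer)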