import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from datetime import datetime
import os
import torch
import gc

# Configure CUDA allocator to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"


def process_image(image_filepath, max_width=640, max_height=800):
    """Resize an uploaded image to fit within max_width x max_height and save it as a temporary JPEG."""
    if image_filepath is None:
        raise ValueError("No image provided")

    img = Image.open(image_filepath)
    width, height = img.size

    # Resize while preserving the aspect ratio
    aspect_ratio = width / height
    if aspect_ratio > (max_width / max_height):
        new_width = max_width
        new_height = int(max_width / aspect_ratio)
    else:
        new_height = max_height
        new_width = int(max_height * aspect_ratio)

    img = img.resize((new_width, new_height), Image.LANCZOS)

    # JPEG cannot store alpha or palette data, so normalize the mode first
    if img.mode != "RGB":
        img = img.convert("RGB")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/image_{timestamp}.jpg"
    img.save(filename, format="JPEG", quality=75, optimize=True)

    return os.path.abspath(filename), new_width, new_height


# Model is loaded lazily on first request to keep startup memory low
model = None
processor = None


def load_model():
    global model, processor
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


@spaces.GPU
def run_inference(input_imgs, text_input):
    global model, processor
    if model is None:
        load_model()

    results = []
    for image in input_imgs:
        torch.cuda.empty_cache()
        gc.collect()

        image_path, width, height = process_image(image)
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": text_input},
                ],
            }]

            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            inputs = processor(
                text=[text],
                images=[Image.open(image_path)],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to("cuda")

            # Memory-efficient greedy generation
            with torch.inference_mode():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    num_beams=1,
                )

            # Strip the prompt tokens and decode only the generated continuation
            output = processor.batch_decode(
                generated_ids[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True,
            )[0]
            results.append(output)

            # Force memory cleanup between images
            del inputs, generated_ids
            torch.cuda.empty_cache()
            gc.collect()
        finally:
            if os.path.exists(image_path):
                os.remove(image_path)

    return "\n\n".join(results)


# Streamlined interface
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        input_imgs = gr.Files(file_types=["image"], label="Upload Images")
        text_input = gr.Textbox(label="Query")
    submit_btn = gr.Button("Submit", variant="primary")
    output_text = gr.Textbox(label="Response", elem_id="output")
    submit_btn.click(run_inference, [input_imgs, text_input], output_text)

demo.queue(max_size=1)
demo.launch()