yakilee committed on
Commit 9351df3 · verified · 1 Parent(s): dba9150

Update app.py

Files changed (1)
  1. app.py +57 -117
app.py CHANGED
@@ -8,183 +8,123 @@ import os
  import torch
  import gc
 
- # Set PyTorch memory allocation configuration
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
-
  DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"
 
- def process_image(image_filepath, max_width=800, max_height=1000):
      if image_filepath is None:
-         raise ValueError("No image provided. Please upload an image before submitting.")
 
      img = Image.open(image_filepath)
      width, height = img.size
 
-     # Calculate new dimensions while maintaining aspect ratio
-     if width > max_width or height > max_height:
-         aspect_ratio = width / height
-         if width > max_width:
-             new_width = max_width
-             new_height = int(new_width / aspect_ratio)
-         if new_height > max_height:
-             new_height = max_height
-             new_width = int(new_height * aspect_ratio)
      else:
-         new_width, new_height = width, height
-
-     # Resize the image if needed
-     if new_width != width or new_height != height:
-         img = img.resize((new_width, new_height), Image.LANCZOS)
 
-     # Generate temporary filename - use /tmp folder for better space management
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     filename = f"/tmp/image_{timestamp}.jpg"  # Use jpg for smaller file size
-
-     # Save with optimized compression
-     img.save(filename, format='JPEG', quality=85, optimize=True)
 
      return os.path.abspath(filename), new_width, new_height
 
- # Initialize model with memory optimizations but without 4-bit quantization
  model = None
  processor = None
 
  def load_model():
-     # Load model with memory optimizations
      model = Qwen2VLForConditionalGeneration.from_pretrained(
          "Qwen/Qwen2-VL-2B-Instruct",
-         torch_dtype=torch.float16,  # Use fp16 for memory efficiency
          device_map="auto",
-         attn_implementation="flash_attention_2"  # Use FlashAttention if available
      )
-
      processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-     return model, processor
 
  @spaces.GPU
  def run_inference(input_imgs, text_input):
      global model, processor
-
-     # Lazy load model
-     if model is None or processor is None:
-         model, processor = load_model()
 
      results = []
 
-     # Process images one at a time to avoid OOM issues
      for image in input_imgs:
-         # Clear cache before processing each image
          torch.cuda.empty_cache()
          gc.collect()
 
-         # Process image with reduced dimensions
          image_path, width, height = process_image(image)
 
          try:
-             # Create messages with optimized image
-             messages = [
-                 {
-                     "role": "user",
-                     "content": [
-                         {
-                             "type": "image",
-                             "image": image_path,
-                             "resized_height": height,
-                             "resized_width": width
-                         },
-                         {
-                             "type": "text",
-                             "text": text_input
-                         }
-                     ]
-                 }
-             ]
 
-             # Prepare inputs with memory optimization
              text = processor.apply_chat_template(
                  messages, tokenize=False, add_generation_prompt=True
             )
 
-             image_inputs, video_inputs = process_vision_info(messages)
-
-             # Clear unused memory
-             del messages
-             torch.cuda.empty_cache()
-
-             # Process inputs with truncation to control memory usage
             inputs = processor(
                  text=[text],
-                 images=image_inputs,
-                 videos=video_inputs,
                  padding=True,
-                 truncation=True,  # Add truncation
-                 max_length=768,  # Limit context length
                  return_tensors="pt",
-             )
 
-             # Move to GPU efficiently
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-             # Clean up variables to free memory
-             del text, image_inputs, video_inputs
-             torch.cuda.empty_cache()
-
-             # Generate with optimized parameters
-             with torch.inference_mode():  # More efficient than no_grad
                  generated_ids = model.generate(
-                     **inputs,
-                     max_new_tokens=1024,  # Reduced from 4096
-                     do_sample=False,  # Deterministic generation uses less memory
-                     use_cache=True,  # Use KV cache
-                     num_beams=1  # Disable beam search to save memory
                  )
-
-             # Process output efficiently
-             generated_ids_trimmed = [
-                 out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
-             ]
 
-             raw_output = processor.batch_decode(
-                 generated_ids_trimmed, skip_special_tokens=True
-             )
 
-             results.append(raw_output[0])
-             print(f"Processed: {image_path}")
 
-             # Clear tensors from GPU memory
-             del inputs, generated_ids, generated_ids_trimmed
              torch.cuda.empty_cache()
              gc.collect()
 
          finally:
-             # Clean up temporary files
              if os.path.exists(image_path):
                  os.remove(image_path)
 
      return results
 
- # Gradio interface
- css = """
- #output {
-     height: 500px;
-     overflow: auto;
-     border: 1px solid #ccc;
- }
- """
-
- with gr.Blocks(css=css) as demo:
      gr.Markdown(DESCRIPTION)
-     with gr.Tab(label="Qwen2-VL-2B Input"):
-         with gr.Row():
-             with gr.Column():
-                 input_imgs = gr.Files(file_types=["image"], label="Upload Document Images")
-                 text_input = gr.Textbox(label="Query")
-                 submit_btn = gr.Button(value="Submit", variant="primary")
-             with gr.Column():
-                 output_text = gr.Textbox(label="Response")
 
-     submit_btn.click(run_inference, [input_imgs, text_input], [output_text])
 
- # Use smaller queue size to manage memory
- demo.queue(api_open=True, max_size=3)
- demo.launch(debug=True)
  import torch
  import gc
 
+ # Configure memory settings
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
 
  DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"
 
+ def process_image(image_filepath, max_width=640, max_height=800):
      if image_filepath is None:
+         raise ValueError("No image provided")
 
      img = Image.open(image_filepath)
      width, height = img.size
 
+     # Enhanced resizing with aspect ratio preservation
+     aspect_ratio = width / height
+     if aspect_ratio > (max_width / max_height):
+         new_width = max_width
+         new_height = int(max_width / aspect_ratio)
      else:
+         new_height = max_height
+         new_width = int(max_height * aspect_ratio)
 
+     img = img.resize((new_width, new_height), Image.LANCZOS)
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"/tmp/image_{timestamp}.jpg"
+     img.save(filename, format='JPEG', quality=75, optimize=True)
 
      return os.path.abspath(filename), new_width, new_height
 
+ # Model initialization with memory optimizations
  model = None
  processor = None
 
  def load_model():
+     global model, processor
      model = Qwen2VLForConditionalGeneration.from_pretrained(
          "Qwen/Qwen2-VL-2B-Instruct",
+         torch_dtype=torch.float16,
          device_map="auto",
+         low_cpu_mem_usage=True
      )
      processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
 
  @spaces.GPU
  def run_inference(input_imgs, text_input):
      global model, processor
+     if model is None:
+         load_model()
 
      results = []
 
      for image in input_imgs:
          torch.cuda.empty_cache()
          gc.collect()
 
          image_path, width, height = process_image(image)
 
          try:
+             messages = [{
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": image_path},
+                     {"type": "text", "text": text_input}
+                 ]
+             }]
 
              text = processor.apply_chat_template(
                  messages, tokenize=False, add_generation_prompt=True
              )
 
+             # Process inputs in chunks
              inputs = processor(
                  text=[text],
+                 images=[Image.open(image_path)],
                  padding=True,
+                 truncation=True,
+                 max_length=512,
                  return_tensors="pt",
+             ).to("cuda")
 
+             # Memory-efficient generation
+             with torch.inference_mode():
                  generated_ids = model.generate(
+                     **inputs,
+                     max_new_tokens=512,
+                     do_sample=False,
+                     num_beams=1,
+                     early_stopping=True
                  )
 
+             # Clean output processing
+             output = processor.batch_decode(
+                 generated_ids[:, inputs.input_ids.shape[1]:],
+                 skip_special_tokens=True
+             )[0]
 
+             results.append(output)
 
+             # Force memory cleanup
+             del inputs, generated_ids
              torch.cuda.empty_cache()
              gc.collect()
 
          finally:
              if os.path.exists(image_path):
                  os.remove(image_path)
 
      return results
 
+ # Streamlined interface
+ with gr.Blocks() as demo:
      gr.Markdown(DESCRIPTION)
+     with gr.Row():
+         input_imgs = gr.Files(file_types=["image"], label="Upload Images")
+         text_input = gr.Textbox(label="Query")
+         submit_btn = gr.Button("Submit", variant="primary")
+         output_text = gr.Textbox(label="Response", elem_id="output")
 
+     submit_btn.click(run_inference, [input_imgs, text_input], output_text)
 
+ demo.queue(max_size=1)
+ demo.launch()
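
For reference, a minimal sketch of the resize arithmetic introduced in the updated `process_image`. The 1600×1200 input below is a made-up example value; the new defaults (`max_width=640`, `max_height=800`) and the formula are taken from the diff above. Note that the new code always resizes to fit within the bounds while preserving aspect ratio.

```python
# Hypothetical input dimensions; formula mirrors the updated process_image.
width, height = 1600, 1200
max_width, max_height = 640, 800

aspect_ratio = width / height                    # 1.333...
if aspect_ratio > (max_width / max_height):      # 1.333 > 0.8: width is the binding constraint
    new_width = max_width                        # 640
    new_height = int(max_width / aspect_ratio)   # 480
else:                                            # taller-than-bound images are height-limited
    new_height = max_height
    new_width = int(max_height * aspect_ratio)

print(new_width, new_height)  # 640 480
```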