import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)
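
# SCRIPT is a JS snippet provided by the local kofi helper module; it is passed
# to gr.Blocks(js=...) below, presumably to render the Ko-fi widget in the
# <div id="kofi"> placeholder at the bottom of the page.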
from kofi import SCRIPT

# Handle spaces.GPU decorator for GPU allocation in Spaces
if "SPACES_ZERO_GPU" in os.environ.keys():
    import spaces
else:

    class spaces:
        def GPU(func=None, duration=300):
            def decorator(f):
                def wrapper(*args, **kwargs):
                    return f(*args, **kwargs)

                return wrapper

            if func is None:
                return decorator
            return decorator(func)

# Define constants
HEADLINE = "# Qwen-VL Object-Detection"
SUBLINE = "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
EXAMPLES_DIR = Path(__file__).parent / "examples"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_IDS = [
    "Qwen/Qwen2-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
    "Qwen/Qwen2-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-3B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
    "Qwen/Qwen2.5-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct
    "Qwen/Qwen2.5-VL-72B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct
    "Qwen/Qwen3-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct
    "Qwen/Qwen3-VL-4B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct
    "Qwen/Qwen3-VL-8B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct
    "Qwen/Qwen3-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct
]
DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
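
# Each example row supplies the same inputs as the Run button:
# (image, model ID, system prompt, user prompt, max new tokens, resize flag, target size).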
EXAMPLES = [
    [
        EXAMPLES_DIR / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect sailboat, rowboat, person",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
        1024,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect basketball, player with white jersey, player with black jersey",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect app to find great places, app to take beautiful photos, app to listen music",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect person, bicycle, netherlands flag",
        1920,
        "Yes",
        1920,
    ],
]

# Global variables to track loaded model
current_model = None
current_processor = None
current_model_id = None
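
# Thin dispatcher that picks the matching *ForConditionalGeneration class
# for the selected Qwen-VL generation.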
class AutoModel:
    @staticmethod
    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
        if model_id.startswith("Qwen/Qwen2-VL"):
            model_loader = Qwen2VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen2.5-VL"):
            model_loader = Qwen2_5_VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen3-VL"):
            model_loader = Qwen3VLForConditionalGeneration
        else:
            raise ValueError(f"Unsupported model ID: {model_id}")
        return model_loader.from_pretrained(
            model_id, dtype=dtype, device_map=device_map
        )
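
# Downscale an image so its longest side is at most target_size, preserving the
# aspect ratio; images that are already small enough are returned unchanged.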
def resize_image(image, target_size=1000):
    width, height = image.size
    if max(width, height) <= target_size:
        return image
    if width >= height:
        new_width = target_size
        new_height = int((target_size / width) * height)
    else:
        new_height = target_size
        new_width = int((target_size / height) * width)
    return image.resize((new_width, new_height))
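
# Encode a PIL image as a base64 PNG string so it can be embedded in the chat
# message as a data URI.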
def image_to_base64(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str

with gr.Blocks(js=SCRIPT) as demo:
    gr.Markdown(HEADLINE)
    gr.Markdown(SUBLINE)
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )
            gr.Markdown("## Settings")
            input_model_id = gr.Dropdown(
                choices=MODEL_IDS,
                label="✨ Select Model ID",
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=DEFAULT_SYSTEM_PROMPT,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )
            image_resize = gr.Radio(
                label="Resize Image",
                choices=["Yes", "No"],
                value="Yes",
                interactive=True,
                scale=2,
            )
            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
                scale=2,
            )
        with gr.Column():
            gr.Markdown("## Outputs")
            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )
            gr.Markdown("## Detections")
            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )
    with gr.Row():
        run_button = gr.Button("Run")
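
    # Load the requested checkpoint and processor on CPU, reusing the cached
    # instances when the same model is selected again.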
    def load_model(
        model_id: str,
    ):
        global current_model, current_processor, current_model_id
        # Only load model if it's different from the currently loaded one
        if current_model_id != model_id or current_model is None:
            # Clear previous model from memory
            if current_model is not None:
                del current_model
                current_model = None
            if current_processor is not None:
                del current_processor
                current_processor = None
            # Force garbage collection and clear CUDA cache
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            gr.Info(
                f"Downloading and loading <strong>{model_id.removeprefix('Qwen/')}</strong> model files ...",
                duration=10,
            )
            current_model = AutoModel.from_pretrained(
                model_id, dtype="auto", device_map="cpu"
            )
            current_processor = AutoProcessor.from_pretrained(model_id)
            current_model_id = model_id
        return current_model, current_processor
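
    # Run object detection with the selected model. On ZeroGPU Spaces the
    # spaces.GPU decorator requests a GPU for the duration of this call.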
    @spaces.GPU
    def generate(
        model,
        processor,
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int,
        image_resize: str,
        image_target_size: int | None,
    ):
        # Move model to CUDA if available (inside @spaces.GPU decorated function)
        model = model.to(DEVICE)
        model.eval()
        base64_image = image_to_base64(
            resize_image(image, image_target_size)
            if image_resize == "Yes" and image_target_size
            else image
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        output_text = str(output_text[0])
        output_text = repair_json(output_text)
        output_json = json.loads(output_text)
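        # Qwen2-VL and Qwen3-VL report bbox_2d coordinates on a 0-1000 normalized
        # grid, so they are rescaled to the image dimensions here; Qwen2.5-VL
        # already returns absolute pixel coordinates.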
        scale = not model_id.startswith("Qwen/Qwen2.5-VL")
        x_scale = float(image.width / 1000) if scale else 1.0
        y_scale = float(image.height / 1000) if scale else 1.0
        bboxes = []
        for detection in output_json:
            if "bbox_2d" not in detection:
                continue
            if len(detection["bbox_2d"]) != 4:
                continue
            if "label" not in detection:
                continue
            xmin, ymin, xmax, ymax = detection["bbox_2d"]
            label = detection.get("label", "")
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, label))
        return [(image, bboxes), str(json.dumps(output_json))]
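
    # Gradio entry point shared by the Run button and the cached examples.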
    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
        # Load the model and processor (on CPU)
        model, processor = load_model(model_id)
        # Run inference (on GPU, if available)
        return generate(
            model,
            processor,
            image,
            model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        )
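
    # Examples are cached eagerly at startup, so clicking one shows the
    # pre-computed detections instead of re-running the model.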
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")
            gr.Examples(
                fn=run,
                cache_examples=True,
                cache_mode="eager",
                run_on_click=False,
                examples=EXAMPLES,
                inputs=[
                    image_input,
                    input_model_id,
                    system_prompt,
                    user_prompt,
                    max_new_tokens,
                    image_resize,
                    image_target_size,
                ],
                outputs=[
                    output_annotated_image,
                    output_text,
                ],
            )
    with gr.Row():
        with gr.Column():
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 For optimal performance, run this application on a machine with a CUDA-compatible GPU: clone this Space and run it locally, or duplicate it with a CUDA-enabled runtime."
                )
            gr.HTML('<div id="kofi" style="text-align: center;"></div>')

    # Connect the button to the detection function
    run_button.click(
        fn=run,
        inputs=[
            image_input,
            input_model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )

if __name__ == "__main__":
    demo.launch(
        share=False,
    )