Spaces: Running on Zero
Darius Morawiec committed
Commit · b896165 · 1 Parent(s): 7b4b54b
Add examples
Browse files
- .gitignore +0 -2
- app.py +90 -18
- examples/elevate-nYgy58eb9aw-unsplash.jpg +3 -0
- examples/elevate-nYgy58eb9aw-unsplash.link +1 -0
- examples/markus-spiske-oPDQGXW7i40-unsplash.jpg +3 -0
- examples/markus-spiske-oPDQGXW7i40-unsplash.link +1 -0
- examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg +3 -0
- examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.link +1 -0
- examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg +3 -0
- examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.link +1 -0
- examples/william-hook-9e9PD9blAto-unsplash.jpg +3 -0
- examples/william-hook-9e9PD9blAto-unsplash.link +1 -0
.gitignore
CHANGED
@@ -1,6 +1,4 @@
-.gradio
 .vscode
-output
 
 # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
 # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
app.py
CHANGED
@@ -3,6 +3,7 @@ import gc
 import json
 import os
 from io import BytesIO
+from pathlib import Path
 
 import gradio as gr
 import torch
@@ -17,6 +18,7 @@ from transformers import (
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
+EXAMPLES_DIR = Path(__file__).parent / "examples"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 model_ids = [
@@ -61,11 +63,6 @@ with gr.Blocks() as demo:
         "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
     )
 
-    if DEVICE != "cuda":
-        gr.Markdown(
-            "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
-        )
-
     with gr.Row():
         with gr.Column():
             gr.Markdown("## Inputs")
@@ -79,22 +76,22 @@ with gr.Blocks() as demo:
 
             input_model_id = gr.Dropdown(
                 choices=model_ids,
-                label="Select Model ID",
+                label="✨ Select Model ID",
             )
             default_system_prompt = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
             system_prompt = gr.Textbox(
-                label="System Prompt
+                label="System Prompt",
                 lines=3,
                 value=default_system_prompt,
             )
             default_user_prompt = "detect object"
             user_prompt = gr.Textbox(
-                label="User Prompt
+                label="User Prompt",
                 lines=3,
                 value=default_user_prompt,
             )
             max_new_tokens = gr.Slider(
-                label="Max New Tokens
+                label="Max New Tokens",
                 minimum=32,
                 maximum=4096,
                 value=256,
@@ -102,9 +99,9 @@ with gr.Blocks() as demo:
                 interactive=True,
             )
             image_target_size = gr.Slider(
-                label="Image Target Size
+                label="Image Target Size",
                 minimum=256,
-                maximum=
+                maximum=4096,
                 value=1024,
                 step=1,
                 interactive=True,
@@ -123,7 +120,7 @@ with gr.Blocks() as demo:
 
             output_text = gr.Textbox(
                 label="Output Text",
-                lines=
+                lines=10,
                 key="output_text",
             )
 
@@ -137,11 +134,11 @@ with gr.Blocks() as demo:
 
     def run(
         image,
-
+        model_id: str,
         system_prompt: str,
         user_prompt: str,
-        model_id: str,
         max_new_tokens: int = 1024,
+        image_target_size: int | None = None,
     ):
         global current_model, current_processor, current_model_id
         scale = False if model_id.startswith("Qwen/Qwen2.5-VL") else True
@@ -182,13 +179,17 @@ with gr.Blocks() as demo:
         model = current_model
         processor = current_processor
 
+        base64_image = image_to_base64(
+            scale_image(image, image_target_size) if image_target_size else image
+        )
+
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image",
-                        "image": f"data:image;base64,{
+                        "image": f"data:image;base64,{base64_image}",
                     },
                     {"type": "text", "text": system_prompt},
                     {"type": "text", "text": user_prompt},
@@ -247,18 +248,89 @@ with gr.Blocks() as demo:
             ]
             bboxes.append((bbox, label))
 
-        return [(image, bboxes), str(
+        return [(image, bboxes), str(json.dumps(output_json))]
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## Examples")
+
+            gr.Examples(
+                fn=run,
+                cache_examples=True,
+                cache_mode="eager",
+                run_on_click=True,
+                examples=[
+                    [
+                        EXAMPLES_DIR
+                        / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
+                        "Qwen/Qwen3-VL-4B-Instruct",
+                        default_system_prompt,
+                        "detect sailboat, rowboat, person",
+                        512,
+                        1920,
+                    ],
+                    [
+                        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
+                        "Qwen/Qwen3-VL-4B-Instruct",
+                        default_system_prompt,
+                        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
+                        1024,
+                        1920,
+                    ],
+                    [
+                        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
+                        "Qwen/Qwen3-VL-4B-Instruct",
+                        default_system_prompt,
+                        "detect basketball, player with white jersey, player with black jersey",
+                        512,
+                        1920,
+                    ],
+                    [
+                        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
+                        "Qwen/Qwen3-VL-4B-Instruct",
+                        default_system_prompt,
+                        "detect app to find great places, app to take beautiful photos, app to listen music",
+                        512,
+                        1920,
+                    ],
+                    [
+                        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
+                        "Qwen/Qwen3-VL-4B-Instruct",
+                        default_system_prompt,
+                        "detect person, bicycle, netherlands flag",
+                        1920,
+                        1920,
+                    ],
+                ],
+                inputs=[
+                    image_input,
+                    input_model_id,
+                    system_prompt,
+                    user_prompt,
+                    max_new_tokens,
+                    image_target_size,
+                ],
+                outputs=[
+                    output_annotated_image,
+                    output_text,
+                ],
+            )
+
+    if DEVICE != "cuda":
+        gr.Markdown(
+            "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
+        )
 
     # Connect the button to the detection function
     run_button.click(
         fn=run,
         inputs=[
             image_input,
-
+            input_model_id,
             system_prompt,
             user_prompt,
-            input_model_id,
             max_new_tokens,
+            image_target_size,
         ],
         outputs=[
             output_annotated_image,
examples/elevate-nYgy58eb9aw-unsplash.jpg
ADDED
Git LFS Details

examples/elevate-nYgy58eb9aw-unsplash.link
ADDED
@@ -0,0 +1 @@
+https://unsplash.com/photos/four-women-holding-drinks-while-laughing-together-during-daytime-nYgy58eb9aw

examples/markus-spiske-oPDQGXW7i40-unsplash.jpg
ADDED
Git LFS Details

examples/markus-spiske-oPDQGXW7i40-unsplash.link
ADDED
@@ -0,0 +1 @@
+https://unsplash.com/photos/group-of-people-playing-basketball-oPDQGXW7i40

examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg
ADDED
Git LFS Details

examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.link
ADDED
@@ -0,0 +1 @@
+https://unsplash.com/photos/a-group-of-sailboats-in-a-body-of-water-with-a-city-in-the-background-fDYRfHoRC4k

examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg
ADDED
Git LFS Details

examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.link
ADDED
@@ -0,0 +1 @@
+https://unsplash.com/photos/a-group-of-sailboats-in-a-body-of-water-with-a-city-in-the-background-fDYRfHoRC4k

examples/william-hook-9e9PD9blAto-unsplash.jpg
ADDED
Git LFS Details

examples/william-hook-9e9PD9blAto-unsplash.link
ADDED
@@ -0,0 +1 @@
+https://unsplash.com/photos/space-gray-iphone-x-9e9PD9blAto