# Hugging Face Space: dmorawiec / Qwen-VL-Object-Detection
# Status: Running on Zero
# File size: 13,636 Bytes

import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)

from kofi import SCRIPT

# Handle spaces.GPU decorator for GPU allocation in Spaces
if "SPACES_ZERO_GPU" in os.environ:
    import spaces
else:
    import functools

    class spaces:
        """Local stand-in used when the ``SPACES_ZERO_GPU`` variable is absent.

        It exposes the same ``spaces.GPU`` decorator name so the rest of the
        file can be decorated identically in both environments.
        """

        @staticmethod
        def GPU(func=None, duration=300):
            """Pass-through replacement for the ``spaces.GPU`` decorator.

            Supports both usages: bare (``@spaces.GPU``) and configured
            (``@spaces.GPU(duration=...)``).

            Args:
                func: The function to decorate when used bare; ``None`` when
                    the decorator is called with arguments.
                duration: Accepted for signature compatibility; ignored here.

            Returns:
                The wrapped function (bare usage) or a decorator (configured
                usage). The wrapper simply forwards all arguments.
            """

            def decorator(f):
                # Keep the wrapped function's name/docstring so tracebacks and
                # introspection still point at the real function.
                @functools.wraps(f)
                def wrapper(*args, **kwargs):
                    return f(*args, **kwargs)

                return wrapper

            if func is None:
                return decorator
            return decorator(func)


# Define constants

# Markdown rendered at the top of the page.
HEADLINE = "# Qwen-VL Object-Detection"
SUBLINE = "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."

# Example images are looked up in an "examples" folder next to this file.
EXAMPLES_DIR = Path(__file__).parent / "examples"
# Device used for inference: CUDA when torch reports it, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Checkpoints offered in the model dropdown. The ID prefix decides which
# transformers model class is used to load a checkpoint (see AutoModel).
MODEL_IDS = [
    "Qwen/Qwen2-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
    "Qwen/Qwen2-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-3B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
    "Qwen/Qwen2.5-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct
    "Qwen/Qwen2.5-VL-72B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct
    "Qwen/Qwen3-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct
    "Qwen/Qwen3-VL-4B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct
    "Qwen/Qwen3-VL-8B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct
    "Qwen/Qwen3-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct
]
# Default instructions asking the model for a JSON list of
# {"bbox_2d": [xmin, ymin, xmax, ymax], "label": ...} objects; the detection
# parsing code further down relies on exactly these two keys.
DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
# Per-example data that actually varies: image file name, user prompt and
# max-new-tokens budget. Everything else is shared by all examples.
_EXAMPLE_ROWS = (
    (
        "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
        "detect sailboat, rowboat, person",
        512,
    ),
    (
        "elevate-nYgy58eb9aw-unsplash.jpg",
        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
        1024,
    ),
    (
        "markus-spiske-oPDQGXW7i40-unsplash.jpg",
        "detect basketball, player with white jersey, player with black jersey",
        512,
    ),
    (
        "william-hook-9e9PD9blAto-unsplash.jpg",
        "detect app to find great places, app to take beautiful photos, app to listen music",
        512,
    ),
    (
        "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
        "detect person, bicycle, netherlands flag",
        1920,
    ),
)

# Each entry follows the order of the Gradio inputs it is fed into:
# image path, model ID, system prompt, user prompt, max new tokens,
# resize choice, resize target size.
EXAMPLES = [
    [
        EXAMPLES_DIR / example_file,
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        example_prompt,
        example_max_tokens,
        "Yes",
        1920,
    ]
    for example_file, example_prompt, example_max_tokens in _EXAMPLE_ROWS
]

# Module-level cache of the most recently loaded model, its processor and the
# ID they were loaded from; all start out empty.
current_model = current_processor = current_model_id = None


class AutoModel:
    """Load a Qwen-VL checkpoint with the model class matching its ID prefix."""

    @staticmethod
    def _loader_for(model_id):
        """Return the transformers class responsible for ``model_id``.

        Raises:
            ValueError: If the ID belongs to none of the supported families.
        """
        # The three prefixes are mutually exclusive, so check order is free.
        if model_id.startswith("Qwen/Qwen3-VL"):
            return Qwen3VLForConditionalGeneration
        if model_id.startswith("Qwen/Qwen2.5-VL"):
            return Qwen2_5_VLForConditionalGeneration
        if model_id.startswith("Qwen/Qwen2-VL"):
            return Qwen2VLForConditionalGeneration
        raise ValueError(f"Unsupported model ID: {model_id}")

    @staticmethod
    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
        """Instantiate the checkpoint ``model_id``.

        Args:
            model_id: Checkpoint ID starting with ``Qwen/Qwen2-VL``,
                ``Qwen/Qwen2.5-VL`` or ``Qwen/Qwen3-VL``.
            dtype: Forwarded unchanged to the model class' ``from_pretrained``.
            device_map: Forwarded unchanged to the model class'
                ``from_pretrained``.

        Returns:
            Whatever the selected model class' ``from_pretrained`` returns.

        Raises:
            ValueError: If ``model_id`` matches no supported family.
        """
        loader_cls = AutoModel._loader_for(model_id)
        return loader_cls.from_pretrained(
            model_id,
            dtype=dtype,
            device_map=device_map,
        )


def resize_image(image, target_size=1000):
    """Shrink ``image`` so its longer side equals ``target_size``.

    The aspect ratio is preserved. Images whose longer side is already at most
    ``target_size`` are returned unchanged (the same object, not a copy).

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (a PIL image in this app).
        target_size: Length in pixels of the longer side after resizing.

    Returns:
        The original image if no shrinking is needed, otherwise the result of
        ``image.resize``.
    """
    width, height = image.size
    if max(width, height) <= target_size:
        return image

    # Clamp the shorter side to 1 px: for extreme aspect ratios the truncated
    # value would otherwise be 0, which is not a valid resize target.
    if width >= height:
        new_width = target_size
        new_height = max(1, int((target_size / width) * height))
    else:
        new_height = target_size
        new_width = max(1, int((target_size / height) * width))

    return image.resize((new_width, new_height))


def image_to_base64(image):
    """Serialize ``image`` as PNG and return the bytes as a base64 string.

    Args:
        image: Object with a ``save(file_obj, format=...)`` method (a PIL
            image in this app).

    Returns:
        The base64 text (UTF-8 decoded) of the PNG-encoded image.
    """
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_bytes = base64.b64encode(png_buffer.getvalue())
    return encoded_bytes.decode("utf-8")


# Build the Gradio UI. Components are rendered in creation order, so the
# statement order below defines the page layout.
with gr.Blocks(js=SCRIPT) as demo:
    gr.Markdown(HEADLINE)
    gr.Markdown(SUBLINE)

    with gr.Row():
        # Left column: image input and generation settings.
        with gr.Column():
            gr.Markdown("## Inputs")

            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )

            gr.Markdown("## Settings")

            # Model choice; no explicit default value is set here.
            input_model_id = gr.Dropdown(
                choices=MODEL_IDS,
                label="✨ Select Model ID",
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=DEFAULT_SYSTEM_PROMPT,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            # Upper bound for the number of generated tokens.
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )

            # "Yes" makes `generate` downscale the image to the target size
            # below before it is sent to the model.
            image_resize = gr.Radio(
                label="Resize Image",
                choices=["Yes", "No"],
                value="Yes",
                interactive=True,
                scale=2,
            )

            # Longer-side length in pixels used when resizing is enabled.
            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
                scale=2,
            )

        # Right column: annotated image and the model's JSON output.
        with gr.Column():
            gr.Markdown("## Outputs")

            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )

            gr.Markdown("## Detections")

            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )

    with gr.Row():
        # Wired to `run` at the end of this Blocks context.
        run_button = gr.Button("Run")

    def load_model(
        model_id: str,
    ):
        """Return the model and processor for ``model_id``, loading on demand.

        The most recently loaded pair is cached in the module-level
        ``current_model`` / ``current_processor`` / ``current_model_id``
        variables and reused when the same ID is requested again. The model is
        loaded with ``device_map="cpu"``.

        Args:
            model_id: Checkpoint ID to load.

        Returns:
            Tuple ``(model, processor)``.

        Raises:
            ValueError: If ``AutoModel`` does not support ``model_id``.
        """
        global current_model, current_processor, current_model_id

        # Reuse the cache only when it is complete and matches the request.
        if (
            current_model_id == model_id
            and current_model is not None
            and current_processor is not None
        ):
            return current_model, current_processor

        # Drop every reference to the previous model (including the ID) before
        # loading, so a failed load can never leave a stale ID paired with a
        # different or missing model/processor.
        current_model = None
        current_processor = None
        current_model_id = None

        # Force garbage collection and clear CUDA cache
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        gr.Info(
            f"Downloading and loading <strong>{model_id.removeprefix('Qwen/')}</strong> model files ...",
            duration=10,
        )

        # Load into locals first and publish all three globals together only
        # after both loads succeeded.
        model = AutoModel.from_pretrained(
            model_id, dtype="auto", device_map="cpu"
        )
        processor = AutoProcessor.from_pretrained(model_id)

        current_model = model
        current_processor = processor
        current_model_id = model_id

        return current_model, current_processor

    def _bboxes_from_detections(detections, x_scale, y_scale):
        """Turn parsed model output into ``(bbox, label)`` annotation tuples.

        Entries that are not dicts, lack ``"bbox_2d"`` or ``"label"``, or
        whose box is not four numbers are skipped.

        Args:
            detections: Parsed JSON; expected to be a list of dicts, a single
                dict is accepted as a one-element list.
            x_scale: Factor applied to the x coordinates.
            y_scale: Factor applied to the y coordinates.

        Returns:
            List of ``([xmin, ymin, xmax, ymax], label)`` with integer pixel
            coordinates.
        """
        if isinstance(detections, dict):
            detections = [detections]
        if not isinstance(detections, list):
            return []

        bboxes = []
        for detection in detections:
            if not isinstance(detection, dict) or "label" not in detection:
                continue
            coords = detection.get("bbox_2d")
            if not isinstance(coords, (list, tuple)) or len(coords) != 4:
                continue
            # bool is a subclass of int; reject it explicitly.
            if not all(
                isinstance(value, (int, float)) and not isinstance(value, bool)
                for value in coords
            ):
                continue

            xmin, ymin, xmax, ymax = coords
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, str(detection["label"])))
        return bboxes

    @spaces.GPU(duration=300)
    def generate(
        model,
        processor,
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int,
        image_resize: str,
        image_target_size: int | None,
    ):
        """Run detection on ``image`` and return the annotated image and JSON.

        Args:
            model: Loaded model; it is moved to ``DEVICE`` here.
            processor: Processor belonging to ``model``.
            image: Input image (PIL image from the Gradio component).
            model_id: Checkpoint ID; selects the coordinate convention.
            system_prompt: Instruction text sent ahead of the user prompt.
            user_prompt: Description of what to detect.
            max_new_tokens: Generation length limit.
            image_resize: ``"Yes"`` to downscale the image before inference.
            image_target_size: Longer-side size used when resizing; ignored
                when falsy.

        Returns:
            ``[(image, bboxes), text]`` where ``bboxes`` is a list of
            ``([xmin, ymin, xmax, ymax], label)`` in pixels of the original
            ``image`` and ``text`` is the model output as a JSON string (or
            the raw model text if it could not be parsed as JSON).
        """
        # Move model to CUDA if available (inside @spaces.GPU decorated function)
        model = model.to(DEVICE)
        model.eval()

        if image_resize == "Yes" and image_target_size:
            model_image = resize_image(image, image_target_size)
        else:
            model_image = image
        base64_image = image_to_base64(model_image)

        # Image, system prompt and user prompt are sent as one user turn.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        decoded = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        raw_text = str(decoded[0])
        try:
            output_json = json.loads(repair_json(raw_text))
        except json.JSONDecodeError:
            # Nothing JSON-like could be recovered (e.g. the repaired text is
            # empty): show the raw answer without boxes instead of crashing.
            return [(image, []), raw_text]

        if model_id.startswith("Qwen/Qwen2.5-VL"):
            # Qwen2.5-VL reports absolute pixel coordinates of the image as
            # the model saw it (after our optional resize and the processor's
            # own resizing), per the Qwen2.5-VL cookbook. Recover that size
            # from the patch grid and map boxes back onto the original image.
            patch_size = getattr(processor.image_processor, "patch_size", 14)
            _, grid_h, grid_w = inputs["image_grid_thw"][0].tolist()
            x_scale = float(image.width / (grid_w * patch_size))
            y_scale = float(image.height / (grid_h * patch_size))
        else:
            # The other families are treated as using a 0-1000 normalized
            # grid, independent of the input resolution.
            x_scale = float(image.width / 1000)
            y_scale = float(image.height / 1000)

        bboxes = _bboxes_from_detections(output_json, x_scale, y_scale)

        return [(image, bboxes), json.dumps(output_json)]

    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
        """Entry point for the UI: load the model, then run detection.

        Args:
            image: Input image from the image component; ``None`` when the
                user has not provided one.
            model_id: Checkpoint ID chosen in the dropdown.
            system_prompt: Instruction text for the model.
            user_prompt: Description of what to detect.
            max_new_tokens: Generation length limit.
            image_resize: ``"Yes"`` to downscale the image before inference.
            image_target_size: Longer-side size used when resizing.

        Returns:
            The ``[(image, bboxes), text]`` pair produced by ``generate``.

        Raises:
            gr.Error: If no image or no model ID was provided.
        """
        # Fail fast with a readable message instead of an AttributeError deep
        # inside the pipeline (and before any model is downloaded or loaded).
        if image is None:
            raise gr.Error("Please provide an input image.")
        if not model_id:
            raise gr.Error("Please select a model ID.")

        # Load the model and processor (on CPU)
        model, processor = load_model(model_id)

        # Run inference (on GPU *if available)
        return generate(
            model,
            processor,
            image,
            model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        )

    # The example gallery and the Run button call `run` with the same
    # components, so the component lists are spelled out once. Each call site
    # receives its own list copy.
    run_inputs = (
        image_input,
        input_model_id,
        system_prompt,
        user_prompt,
        max_new_tokens,
        image_resize,
        image_target_size,
    )
    run_outputs = (
        output_annotated_image,
        output_text,
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")

            # Example outputs are cached eagerly by running `run` on them.
            gr.Examples(
                examples=EXAMPLES,
                fn=run,
                inputs=list(run_inputs),
                outputs=list(run_outputs),
                cache_examples=True,
                cache_mode="eager",
                run_on_click=False,
            )

    with gr.Row():
        with gr.Column():
            # Hint shown only when no CUDA device was detected.
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
                )
            gr.HTML('<div id="kofi" style="text-align: center;"></div>')

    # Connect the button to the detection function
    run_button.click(
        fn=run,
        inputs=list(run_inputs),
        outputs=list(run_outputs),
    )

if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    demo.launch(share=False)