Darius Morawiec committed on
Commit
73b837f
·
1 Parent(s): 251f917

Add image resizing option and refactor GPU duration handling

Browse files
Files changed (1) hide show
  1. app.py +25 -7
app.py CHANGED
@@ -33,6 +33,7 @@ else:
33
 
34
 
35
  # Define constants
 
36
  EXAMPLES_DIR = Path(__file__).parent / "examples"
37
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
38
  MODEL_IDS = [
@@ -49,7 +50,7 @@ MODEL_IDS = [
49
  ]
50
 
51
 
52
- def scale_image(image, target_size=1000):
53
  width, height = image.size
54
  if max(width, height) <= target_size:
55
  return image
@@ -112,6 +113,15 @@ with gr.Blocks() as demo:
112
  step=32,
113
  interactive=True,
114
  )
 
 
 
 
 
 
 
 
 
115
  image_target_size = gr.Slider(
116
  label="Image Target Size",
117
  minimum=256,
@@ -119,6 +129,7 @@ with gr.Blocks() as demo:
119
  value=1024,
120
  step=1,
121
  interactive=True,
 
122
  )
123
 
124
  with gr.Column():
@@ -192,12 +203,15 @@ with gr.Blocks() as demo:
192
  system_prompt: str,
193
  user_prompt: str,
194
  max_new_tokens: int = 1024,
 
195
  image_target_size: int | None = None,
196
  ):
197
  model, processor = load_model(model_id)
198
 
199
  base64_image = image_to_base64(
200
- scale_image(image, image_target_size) if image_target_size else image
 
 
201
  )
202
  messages = [
203
  {
@@ -228,11 +242,8 @@ with gr.Blocks() as demo:
228
  )
229
  inputs = inputs.to(DEVICE)
230
 
231
- @spaces.GPU(duration=300)
232
- def _generate(**kwargs):
233
- return model.generate(**kwargs)
234
-
235
- generated_ids = _generate(**inputs, max_new_tokens=max_new_tokens)
236
  generated_ids_trimmed = [
237
  out_ids[len(in_ids) :]
238
  for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -288,6 +299,7 @@ with gr.Blocks() as demo:
288
  default_system_prompt,
289
  "detect sailboat, rowboat, person",
290
  512,
 
291
  1920,
292
  ],
293
  [
@@ -296,6 +308,7 @@ with gr.Blocks() as demo:
296
  default_system_prompt,
297
  "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
298
  1024,
 
299
  1920,
300
  ],
301
  [
@@ -304,6 +317,7 @@ with gr.Blocks() as demo:
304
  default_system_prompt,
305
  "detect basketball, player with white jersey, player with black jersey",
306
  512,
 
307
  1920,
308
  ],
309
  [
@@ -312,6 +326,7 @@ with gr.Blocks() as demo:
312
  default_system_prompt,
313
  "detect app to find great places, app to take beautiful photos, app to listen music",
314
  512,
 
315
  1920,
316
  ],
317
  [
@@ -320,6 +335,7 @@ with gr.Blocks() as demo:
320
  default_system_prompt,
321
  "detect person, bicycle, netherlands flag",
322
  1920,
 
323
  1920,
324
  ],
325
  ],
@@ -329,6 +345,7 @@ with gr.Blocks() as demo:
329
  system_prompt,
330
  user_prompt,
331
  max_new_tokens,
 
332
  image_target_size,
333
  ],
334
  outputs=[
@@ -351,6 +368,7 @@ with gr.Blocks() as demo:
351
  system_prompt,
352
  user_prompt,
353
  max_new_tokens,
 
354
  image_target_size,
355
  ],
356
  outputs=[
 
33
 
34
 
35
  # Define constants
36
+ GPU_DURATION = 300
37
  EXAMPLES_DIR = Path(__file__).parent / "examples"
38
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
39
  MODEL_IDS = [
 
50
  ]
51
 
52
 
53
+ def resize_image(image, target_size=1000):
54
  width, height = image.size
55
  if max(width, height) <= target_size:
56
  return image
 
113
  step=32,
114
  interactive=True,
115
  )
116
+
117
+ image_resize = gr.Radio(
118
+ label="Resize Image",
119
+ choices=["Yes", "No"],
120
+ value="Yes",
121
+ interactive=True,
122
+ scale=2,
123
+ )
124
+
125
  image_target_size = gr.Slider(
126
  label="Image Target Size",
127
  minimum=256,
 
129
  value=1024,
130
  step=1,
131
  interactive=True,
132
+ scale=2,
133
  )
134
 
135
  with gr.Column():
 
203
  system_prompt: str,
204
  user_prompt: str,
205
  max_new_tokens: int = 1024,
206
+ image_resize: str = "Yes",
207
  image_target_size: int | None = None,
208
  ):
209
  model, processor = load_model(model_id)
210
 
211
  base64_image = image_to_base64(
212
+ resize_image(image, image_target_size)
213
+ if image_resize == "Yes" and image_target_size
214
+ else image
215
  )
216
  messages = [
217
  {
 
242
  )
243
  inputs = inputs.to(DEVICE)
244
 
245
+ generate = spaces.GPU(model.generate, duration=GPU_DURATION)
246
+ generated_ids = generate(**inputs, max_new_tokens=max_new_tokens)
 
 
 
247
  generated_ids_trimmed = [
248
  out_ids[len(in_ids) :]
249
  for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
299
  default_system_prompt,
300
  "detect sailboat, rowboat, person",
301
  512,
302
+ "Yes",
303
  1920,
304
  ],
305
  [
 
308
  default_system_prompt,
309
  "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
310
  1024,
311
+ "Yes",
312
  1920,
313
  ],
314
  [
 
317
  default_system_prompt,
318
  "detect basketball, player with white jersey, player with black jersey",
319
  512,
320
+ "Yes",
321
  1920,
322
  ],
323
  [
 
326
  default_system_prompt,
327
  "detect app to find great places, app to take beautiful photos, app to listen music",
328
  512,
329
+ "Yes",
330
  1920,
331
  ],
332
  [
 
335
  default_system_prompt,
336
  "detect person, bicycle, netherlands flag",
337
  1920,
338
+ "Yes",
339
  1920,
340
  ],
341
  ],
 
345
  system_prompt,
346
  user_prompt,
347
  max_new_tokens,
348
+ image_resize,
349
  image_target_size,
350
  ],
351
  outputs=[
 
368
  system_prompt,
369
  user_prompt,
370
  max_new_tokens,
371
+ image_resize,
372
  image_target_size,
373
  ],
374
  outputs=[