Darius Morawiec commited on
Commit
b896165
·
1 Parent(s): 7b4b54b

Add examples

Browse files
.gitignore CHANGED
@@ -1,6 +1,4 @@
1
- .gradio
2
  .vscode
3
- output
4
 
5
  # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
6
  # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
 
 
1
  .vscode
 
2
 
3
  # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
4
  # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,dotenv,python,windows,intellij,visualstudio,visualstudiocode
app.py CHANGED
@@ -3,6 +3,7 @@ import gc
3
  import json
4
  import os
5
  from io import BytesIO
 
6
 
7
  import gradio as gr
8
  import torch
@@ -17,6 +18,7 @@ from transformers import (
17
 
18
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
19
 
 
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
 
22
  model_ids = [
@@ -61,11 +63,6 @@ with gr.Blocks() as demo:
61
  "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
62
  )
63
 
64
- if DEVICE != "cuda":
65
- gr.Markdown(
66
- "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
67
- )
68
-
69
  with gr.Row():
70
  with gr.Column():
71
  gr.Markdown("## Inputs")
@@ -79,22 +76,22 @@ with gr.Blocks() as demo:
79
 
80
  input_model_id = gr.Dropdown(
81
  choices=model_ids,
82
- label="Select Model ID",
83
  )
84
  default_system_prompt = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
85
  system_prompt = gr.Textbox(
86
- label="System Prompt:",
87
  lines=3,
88
  value=default_system_prompt,
89
  )
90
  default_user_prompt = "detect object"
91
  user_prompt = gr.Textbox(
92
- label="User Prompt:",
93
  lines=3,
94
  value=default_user_prompt,
95
  )
96
  max_new_tokens = gr.Slider(
97
- label="Max New Tokens:",
98
  minimum=32,
99
  maximum=4096,
100
  value=256,
@@ -102,9 +99,9 @@ with gr.Blocks() as demo:
102
  interactive=True,
103
  )
104
  image_target_size = gr.Slider(
105
- label="Image Target Size (longest side)",
106
  minimum=256,
107
- maximum=3072,
108
  value=1024,
109
  step=1,
110
  interactive=True,
@@ -123,7 +120,7 @@ with gr.Blocks() as demo:
123
 
124
  output_text = gr.Textbox(
125
  label="Output Text",
126
- lines=3,
127
  key="output_text",
128
  )
129
 
@@ -137,11 +134,11 @@ with gr.Blocks() as demo:
137
 
138
  def run(
139
  image,
140
- image_target_size: int,
141
  system_prompt: str,
142
  user_prompt: str,
143
- model_id: str,
144
  max_new_tokens: int = 1024,
 
145
  ):
146
  global current_model, current_processor, current_model_id
147
  scale = False if model_id.startswith("Qwen/Qwen2.5-VL") else True
@@ -182,13 +179,17 @@ with gr.Blocks() as demo:
182
  model = current_model
183
  processor = current_processor
184
 
 
 
 
 
185
  messages = [
186
  {
187
  "role": "user",
188
  "content": [
189
  {
190
  "type": "image",
191
- "image": f"data:image;base64,{image_to_base64(scale_image(image, image_target_size))}",
192
  },
193
  {"type": "text", "text": system_prompt},
194
  {"type": "text", "text": user_prompt},
@@ -247,18 +248,89 @@ with gr.Blocks() as demo:
247
  ]
248
  bboxes.append((bbox, label))
249
 
250
- return [(image, bboxes), str(output_text)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  # Connect the button to the detection function
253
  run_button.click(
254
  fn=run,
255
  inputs=[
256
  image_input,
257
- image_target_size,
258
  system_prompt,
259
  user_prompt,
260
- input_model_id,
261
  max_new_tokens,
 
262
  ],
263
  outputs=[
264
  output_annotated_image,
 
3
  import json
4
  import os
5
  from io import BytesIO
6
+ from pathlib import Path
7
 
8
  import gradio as gr
9
  import torch
 
18
 
19
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
20
 
21
+ EXAMPLES_DIR = Path(__file__).parent / "examples"
22
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
23
 
24
  model_ids = [
 
63
  "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
64
  )
65
 
 
 
 
 
 
66
  with gr.Row():
67
  with gr.Column():
68
  gr.Markdown("## Inputs")
 
76
 
77
  input_model_id = gr.Dropdown(
78
  choices=model_ids,
79
+ label="Select Model ID",
80
  )
81
  default_system_prompt = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
82
  system_prompt = gr.Textbox(
83
+ label="System Prompt",
84
  lines=3,
85
  value=default_system_prompt,
86
  )
87
  default_user_prompt = "detect object"
88
  user_prompt = gr.Textbox(
89
+ label="User Prompt",
90
  lines=3,
91
  value=default_user_prompt,
92
  )
93
  max_new_tokens = gr.Slider(
94
+ label="Max New Tokens",
95
  minimum=32,
96
  maximum=4096,
97
  value=256,
 
99
  interactive=True,
100
  )
101
  image_target_size = gr.Slider(
102
+ label="Image Target Size",
103
  minimum=256,
104
+ maximum=4096,
105
  value=1024,
106
  step=1,
107
  interactive=True,
 
120
 
121
  output_text = gr.Textbox(
122
  label="Output Text",
123
+ lines=10,
124
  key="output_text",
125
  )
126
 
 
134
 
135
  def run(
136
  image,
137
+ model_id: str,
138
  system_prompt: str,
139
  user_prompt: str,
 
140
  max_new_tokens: int = 1024,
141
+ image_target_size: int | None = None,
142
  ):
143
  global current_model, current_processor, current_model_id
144
  scale = False if model_id.startswith("Qwen/Qwen2.5-VL") else True
 
179
  model = current_model
180
  processor = current_processor
181
 
182
+ base64_image = image_to_base64(
183
+ scale_image(image, image_target_size) if image_target_size else image
184
+ )
185
+
186
  messages = [
187
  {
188
  "role": "user",
189
  "content": [
190
  {
191
  "type": "image",
192
+ "image": f"data:image;base64,{base64_image}",
193
  },
194
  {"type": "text", "text": system_prompt},
195
  {"type": "text", "text": user_prompt},
 
248
  ]
249
  bboxes.append((bbox, label))
250
 
251
+ return [(image, bboxes), str(json.dumps(output_json))]
252
+
253
+ with gr.Row():
254
+ with gr.Column():
255
+ gr.Markdown("## Examples")
256
+
257
+ gr.Examples(
258
+ fn=run,
259
+ cache_examples=True,
260
+ cache_mode="eager",
261
+ run_on_click=True,
262
+ examples=[
263
+ [
264
+ EXAMPLES_DIR
265
+ / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
266
+ "Qwen/Qwen3-VL-4B-Instruct",
267
+ default_system_prompt,
268
+ "detect sailboat, rowboat, person",
269
+ 512,
270
+ 1920,
271
+ ],
272
+ [
273
+ EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
274
+ "Qwen/Qwen3-VL-4B-Instruct",
275
+ default_system_prompt,
276
+ "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
277
+ 1024,
278
+ 1920,
279
+ ],
280
+ [
281
+ EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
282
+ "Qwen/Qwen3-VL-4B-Instruct",
283
+ default_system_prompt,
284
+ "detect basketball, player with white jersey, player with black jersey",
285
+ 512,
286
+ 1920,
287
+ ],
288
+ [
289
+ EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
290
+ "Qwen/Qwen3-VL-4B-Instruct",
291
+ default_system_prompt,
292
+ "detect app to find great places, app to take beautiful photos, app to listen music",
293
+ 512,
294
+ 1920,
295
+ ],
296
+ [
297
+ EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
298
+ "Qwen/Qwen3-VL-4B-Instruct",
299
+ default_system_prompt,
300
+ "detect person, bicycle, netherlands flag",
301
+ 1920,
302
+ 1920,
303
+ ],
304
+ ],
305
+ inputs=[
306
+ image_input,
307
+ input_model_id,
308
+ system_prompt,
309
+ user_prompt,
310
+ max_new_tokens,
311
+ image_target_size,
312
+ ],
313
+ outputs=[
314
+ output_annotated_image,
315
+ output_text,
316
+ ],
317
+ )
318
+
319
+ if DEVICE != "cuda":
320
+ gr.Markdown(
321
+ "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
322
+ )
323
 
324
  # Connect the button to the detection function
325
  run_button.click(
326
  fn=run,
327
  inputs=[
328
  image_input,
329
+ input_model_id,
330
  system_prompt,
331
  user_prompt,
 
332
  max_new_tokens,
333
+ image_target_size,
334
  ],
335
  outputs=[
336
  output_annotated_image,
examples/elevate-nYgy58eb9aw-unsplash.jpg ADDED

Git LFS Details

  • SHA256: 73085797788434cc3dfe8d0bfa60000f8b62500be133d47ee8c00925a42aacd4
  • Pointer size: 131 Bytes
  • Size of remote file: 596 kB
examples/elevate-nYgy58eb9aw-unsplash.link ADDED
@@ -0,0 +1 @@
 
 
1
+ https://unsplash.com/photos/four-women-holding-drinks-while-laughing-together-during-daytime-nYgy58eb9aw
examples/markus-spiske-oPDQGXW7i40-unsplash.jpg ADDED

Git LFS Details

  • SHA256: 010442751d65a444fe6c7bce8fdea3c8368a836c27a472a813fa10be60ad965d
  • Pointer size: 131 Bytes
  • Size of remote file: 601 kB
examples/markus-spiske-oPDQGXW7i40-unsplash.link ADDED
@@ -0,0 +1 @@
 
 
1
+ https://unsplash.com/photos/group-of-people-playing-basketball-oPDQGXW7i40
examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg ADDED

Git LFS Details

  • SHA256: 6baf40ee66763848b97f2757c51fe809712847b4b12b831561447b653b3d1219
  • Pointer size: 131 Bytes
  • Size of remote file: 548 kB
examples/niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.link ADDED
@@ -0,0 +1 @@
 
 
1
+ https://unsplash.com/photos/a-group-of-sailboats-in-a-body-of-water-with-a-city-in-the-background-fDYRfHoRC4k
examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg ADDED

Git LFS Details

  • SHA256: 37f9440ffbca5db9c08c3e2a1d489d0ef84ad111baf2059f3763adcb028c2888
  • Pointer size: 131 Bytes
  • Size of remote file: 513 kB
examples/tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.link ADDED
@@ -0,0 +1 @@
 
 
1
+ https://unsplash.com/photos/dw7Y4W6Rhmk
examples/william-hook-9e9PD9blAto-unsplash.jpg ADDED

Git LFS Details

  • SHA256: b6e1489f35df4edd6bb8da0568f81a5fad23cd54760cdbae33c769148fc2d167
  • Pointer size: 131 Bytes
  • Size of remote file: 297 kB
examples/william-hook-9e9PD9blAto-unsplash.link ADDED
@@ -0,0 +1 @@
 
 
1
+ https://unsplash.com/photos/space-gray-iphone-x-9e9PD9blAto