qq-hzlh committed on
Commit da0ad7a · 1 Parent(s): 53b7de4

change label format

Files changed (1): app.py +8 -6
app.py CHANGED
@@ -57,9 +57,11 @@ import spaces
 @spaces.GPU
 def process_image_and_text(image, text):
     """Process image and text input, return thinking process and bbox"""
-    question = f"Please carefully check the image and detect the following objects: [{text}]. "
+    labels = text.split(",")
+    question = f"First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: {labels}. "
 
-    question = question + "First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: [\"equestrian rider's helmet\"]. Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
+    question = question + "Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
+    print("question: ", question)
 
     messages = [
         {
@@ -116,7 +118,7 @@ def process_image_and_text(image, text):
 if __name__ == "__main__":
     import gradio as gr
 
-    model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
+    model_path = "omlab/VLM-R1-Qwen2.5VL-3B-OVD-0321"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
     device = "cuda"
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
@@ -131,7 +133,7 @@ if __name__ == "__main__":
         fn=gradio_interface,
         inputs=[
             gr.Image(type="pil", label="Input Image"),
-            gr.Textbox(label="Description Text")
+            gr.Textbox(label="Objects to detect (separated by ,)")
         ],
         outputs=[
             gr.Textbox(label="Thinking Process"),
@@ -142,8 +144,8 @@ if __name__ == "__main__":
         description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
         examples=[
             ["examples/image1.jpg", "person"],
-            ["examples/image2.jpg", "drink, fruit"],
-            ["examples/image3.png", "keyboard, white cup, laptop"],
+            ["examples/image2.jpg", "drink,fruit"],
+            ["examples/image3.png", "keyboard,white cup,laptop"],
         ],
         cache_examples=False,
         examples_per_page=10
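
For reference, a minimal sketch of the label handling this commit introduces (`format_labels` is a hypothetical helper; in app.py the split and the f-string sit inline in `process_image_and_text`). It also shows that `str.split(",")` keeps surrounding whitespace, which is presumably why the example inputs now omit the space after each comma:

```python
# Minimal sketch (not part of app.py) of how the comma-separated textbox input is
# turned into the label list interpolated into the prompt after this commit.
def format_labels(text: str) -> str:
    labels = text.split(",")  # str.split does NOT strip surrounding whitespace
    return f"Please carefully check the image and detect the following objects: {labels}. "

print(format_labels("drink,fruit"))
# Please carefully check the image and detect the following objects: ['drink', 'fruit'].
print(format_labels("drink, fruit"))
# Please carefully check the image and detect the following objects: ['drink', ' fruit'].
```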
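The prompt asks the model to wrap its detections in `<answer></answer>` as a JSON list of `{"bbox_2d": [x1, y1, x2, y2], "label": ...}` objects, optionally inside a Markdown ```json fence. A minimal sketch of how such an answer could be decoded (`parse_answer` is a hypothetical helper, not part of this commit; app.py's own post-processing is outside this diff):

````python
import json
import re

# Hypothetical helper: pull the JSON bbox list out of an <answer>...</answer> block
# formatted the way the prompt above requests.
def parse_answer(output_text: str):
    match = re.search(r"<answer>(.*?)</answer>", output_text, re.DOTALL)
    if not match:
        return []
    answer = match.group(1).strip()
    if answer == "None":          # the prompt allows a bare "None" when nothing is found
        return []
    # Strip an optional ```json ... ``` fence before decoding.
    answer = re.sub(r"^```json\s*|\s*```$", "", answer)
    return json.loads(answer)

demo = '<think>...</think><answer>```json\n[{"bbox_2d": [10, 20, 110, 220], "label": "person"}]\n```</answer>'
print(parse_answer(demo))  # [{'bbox_2d': [10, 20, 110, 220], 'label': 'person'}]
````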