Spaces: Running on Zero
change label format
app.py CHANGED
@@ -57,9 +57,11 @@ import spaces
 @spaces.GPU
 def process_image_and_text(image, text):
     """Process image and text input, return thinking process and bbox"""
-
+    labels = text.split(",")
+    question = f"First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: {labels}. "
 
-    question = question +
+    question = question + "Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
+    print("question: ", question)
 
     messages = [
         {
@@ -116,7 +118,7 @@ def process_image_and_text(image, text):
 if __name__ == "__main__":
     import gradio as gr
 
-    model_path = "
+    model_path = "omlab/VLM-R1-Qwen2.5VL-3B-OVD-0321"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
     device = "cuda"
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
@@ -131,7 +133,7 @@ if __name__ == "__main__":
         fn=gradio_interface,
         inputs=[
             gr.Image(type="pil", label="Input Image"),
-            gr.Textbox(label="
+            gr.Textbox(label="Objects to detect (separated by ,)")
         ],
         outputs=[
             gr.Textbox(label="Thinking Process"),
@@ -142,8 +144,8 @@ if __name__ == "__main__":
         description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
         examples=[
             ["examples/image1.jpg", "person"],
-            ["examples/image2.jpg", "drink,
-            ["examples/image3.png", "keyboard,
+            ["examples/image2.jpg", "drink,fruit"],
+            ["examples/image3.png", "keyboard,white cup,laptop"],
         ],
         cache_examples=False,
         examples_per_page=10
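For reference, a minimal standalone sketch of the prompt construction introduced by this change, reusing the same comma-separated label format and template strings as the added lines above; the build_question helper and the print call at the end are illustrative, not part of app.py. Note that because the f-string interpolates the labels list directly, the final prompt contains the Python list representation, e.g. ['keyboard', 'white cup', 'laptop'].

# Minimal sketch of the new prompt construction (build_question is a hypothetical helper, not in app.py).
def build_question(text: str) -> str:
    labels = text.split(",")  # "keyboard,white cup,laptop" -> ['keyboard', 'white cup', 'laptop']
    question = (
        "First thinks about the reasoning process in the mind and then provides the user with the answer. "
        "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, "
        "respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. "
        f"Please carefully check the image and detect the following objects: {labels}. "
    )
    question = question + (
        "Output the bbox coordinates of detected objects in <answer></answer>. "
        "The bbox coordinates in Markdown format should be: \n```json\n"
        "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n "
        "If no targets are detected in the image, simply respond with \"None\"."
    )
    return question

# Example: the text typed into the "Objects to detect (separated by ,)" textbox.
print(build_question("keyboard,white cup,laptop"))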