Spaces: Running on Zero
change label format
app.py CHANGED
@@ -57,9 +57,11 @@ import spaces
 @spaces.GPU
 def process_image_and_text(image, text):
     """Process image and text input, return thinking process and bbox"""
-
+    labels = text.split(",")
+    question = f"First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: {labels}. "
 
-    question = question +
+    question = question + "Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
+    print("question: ", question)
 
     messages = [
         {
@@ -116,7 +118,7 @@ def process_image_and_text(image, text):
 if __name__ == "__main__":
     import gradio as gr
 
-    model_path = "
+    model_path = "omlab/VLM-R1-Qwen2.5VL-3B-OVD-0321"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
     device = "cuda"
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
@@ -131,7 +133,7 @@ if __name__ == "__main__":
         fn=gradio_interface,
         inputs=[
             gr.Image(type="pil", label="Input Image"),
-            gr.Textbox(label="
+            gr.Textbox(label="Objects to detect (separated by ,)")
         ],
         outputs=[
             gr.Textbox(label="Thinking Process"),
@@ -142,8 +144,8 @@ if __name__ == "__main__":
         description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
         examples=[
             ["examples/image1.jpg", "person"],
-            ["examples/image2.jpg", "drink,
-            ["examples/image3.png", "keyboard,
+            ["examples/image2.jpg", "drink,fruit"],
+            ["examples/image3.png", "keyboard,white cup,laptop"],
         ],
         cache_examples=False,
         examples_per_page=10
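For reference, a minimal standalone sketch of the prompt construction introduced by this change, reusing the same comma-separated label format and template strings as the added lines above; the build_question helper and the print call at the end are illustrative, not part of app.py. Note that because the f-string interpolates the labels list directly, the final prompt contains the Python list representation, e.g. ['keyboard', 'white cup', 'laptop'].

# Minimal sketch of the new prompt construction (build_question is a hypothetical helper, not in app.py).
def build_question(text: str) -> str:
    labels = text.split(",")  # "keyboard,white cup,laptop" -> ['keyboard', 'white cup', 'laptop']
    question = (
        "First thinks about the reasoning process in the mind and then provides the user with the answer. "
        "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, "
        "respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. "
        f"Please carefully check the image and detect the following objects: {labels}. "
    )
    question = question + (
        "Output the bbox coordinates of detected objects in <answer></answer>. "
        "The bbox coordinates in Markdown format should be: \n```json\n"
        "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n "
        "If no targets are detected in the image, simply respond with \"None\"."
    )
    return question

# Example: the text typed into the "Objects to detect (separated by ,)" textbox.
print(build_question("keyboard,white cup,laptop"))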