tianbaoxiexxx and nielsr (HF Staff) committed
Commit e522ad8 · verified · 1 Parent(s): 18ed0f7

Add library name, Github repo, project page and example usage (#1)


- Add library name, Github repo, project page and example usage (9ff3286f416e6022204824e99e0ece56a72e7c06)


Co-authored-by: Niels Rogge <[email protected]>

Files changed (1)
  1. README.md +184 -3
README.md CHANGED
@@ -1,12 +1,193 @@
  ---
- license: apache-2.0
- language:
- - en
  base_model:
  - Qwen/Qwen2.5-VL-3B-Instruct
+ language:
+ - en
+ license: apache-2.0
  pipeline_tag: image-text-to-text
+ library_name: transformers
  ---

+ This repository contains the Qwen2.5-VL-3B-based model presented in [Scaling Computer-Use Grounding via User Interface Decomposition and Synthesis](https://arxiv.org/abs/2505.13227).
+
+ Project page: https://osworld-grounding.github.io
+
+ For code and sample usage, see https://github.com/xlang-ai/OSWorld-G.
+
+ To use our model, we recommend `vllm`. Follow the computer-use agent template from Qwen2.5-VL closely, and pay careful attention to the image size to get the best performance. A small example is shown below (you can also run [`demo.py`](demo.py) to see the demo):
+ ```python
+ import json
+ import re
+ from PIL import Image, ImageDraw
+ from transformers import AutoTokenizer
+ from vllm import LLM, SamplingParams
+ from agent_function_call import ComputerUse
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
+ from transformers import Qwen2_5_VLProcessor
+ from huggingface_hub import hf_hub_download
+
+ model_path = "xlangai/Jedi-3B-1080p"
+ # model_path = "xlangai/Jedi-7B-1080p"
+
+ FN_CALL_TEMPLATE = """You are a helpful assistant.
+
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {tool_descs}
+ </tools>
+
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+ <tool_call>
+ {{"name": <function-name>, "arguments": <args-json-object>}}
+ </tool_call>"""
+
+
+ def visualize_click_position(image, coords, circle_radius=9, point_radius=3):
+     """Draw a small circle and dot at the predicted click position."""
+     draw = ImageDraw.Draw(image)
+
+     x, y = coords
+
+     draw.ellipse(
+         [x - circle_radius, y - circle_radius, x + circle_radius, y + circle_radius],
+         outline="lightgreen",
+         width=2,
+     )
+
+     draw.ellipse(
+         [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
+         fill="lightgreen",
+     )
+
+     return image
+
+
+ def parse_coordinates(response):
+     """Extract the predicted (x, y) coordinate from the model's <tool_call> output."""
+     match = re.search(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
+     action = None
+     if not match:
+         raise ValueError("No <tool_call> block found in response.")
+
+     try:
+         action = json.loads(match.group(1))
+     except json.JSONDecodeError as e:
+         raise ValueError(f"Failed to parse tool_call JSON: {e}")
+     action_name = action["name"]
+     action_type = action["arguments"]["action"]
+     action_args = action["arguments"]["coordinate"]
+
+     if (
+         action_name != "computer_use"
+         or action_type
+         not in ("mouse_move", "left_click", "right_click", "double_click")
+         or action_args is None
+     ):
+         print(f"Error parsing coordinates: {response}")
+         return None
+
+     return action_args
+
+
+ def main():
+     processor = Qwen2_5_VLProcessor.from_pretrained(model_path)
+
+     input_image = Image.open("demo_image.png")
+     instruction = "Open the filter function for search settings."
+
+     # Resize to the resolution the Qwen2.5-VL processor feeds the model, so the
+     # tool's coordinate space matches the image the model actually sees.
+     resized_height, resized_width = smart_resize(
+         input_image.height,
+         input_image.width,
+         factor=processor.image_processor.patch_size
+         * processor.image_processor.merge_size,
+         min_pixels=processor.image_processor.min_pixels,
+         max_pixels=processor.image_processor.max_pixels,
+     )
+
+     # ComputerUse exposes the computer_use tool schema injected into the system prompt.
+     computer_use = ComputerUse(
+         cfg={
+             "display_width_px": resized_width,
+             "display_height_px": resized_height,
+         }
+     )
+     tools = [computer_use.function]
+     tool_descs = [{"type": "function", "function": f} for f in tools]
+     # One JSON tool description per line inside the <tools> block.
+     tool_descs = "\n".join([json.dumps(f, ensure_ascii=False) for f in tool_descs])
+
+     llm = LLM(
+         model=model_path,
+         tokenizer_mode="slow",
+         dtype="bfloat16",
+         trust_remote_code=True,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path, trust_remote_code=True, use_fast=False
+     )
+
+     # Use the chat template shipped with the model repository.
+     chat_template_path = hf_hub_download(
+         repo_id=model_path, filename="chat_template.json"
+     )
+     with open(chat_template_path, "r") as f:
+         tokenizer.chat_template = json.load(f)["chat_template"]
+
+     messages = [
+         {
+             "role": "system",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": FN_CALL_TEMPLATE.format(tool_descs=tool_descs),
+                 }
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "image",
+                 },
+                 {
+                     "type": "text",
+                     "text": instruction,
+                 },
+             ],
+         },
+     ]
+     sampling_params = SamplingParams(
+         temperature=0.01,
+         max_tokens=1024,
+         top_k=1,
+     )
+     message = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+
+     outputs = llm.generate(
+         {
+             "prompt_token_ids": message,
+             "multi_modal_data": {
+                 "image": input_image,
+             },
+         },
+         sampling_params=sampling_params,
+     )
+     generated_tokens = outputs[0].outputs[0].token_ids
+     response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+     predicted_coords = parse_coordinates(response)
+     print("predicted_coords: ", predicted_coords)
+
+     if predicted_coords:
+         viz_image = visualize_click_position(input_image, predicted_coords)
+         viz_image.save("click_visualization.png")
+
+     return predicted_coords
+
+
+ if __name__ == "__main__":
+     main()
+ ```
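For reference, `parse_coordinates` in the example above expects the model to answer with a `<tool_call>` block like the one below. The coordinate values are hypothetical, and the snippet assumes `parse_coordinates` from the example is in scope:

```python
# Hypothetical model output; the actual response depends on the screenshot and instruction.
example_response = (
    "<tool_call>\n"
    '{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [512, 384]}}\n'
    "</tool_call>"
)

print(parse_coordinates(example_response))  # -> [512, 384]
```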
+
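Because `ComputerUse` is configured with the resized width and height, the predicted coordinates live in the `smart_resize` coordinate space. If your screenshot's original resolution differs from that resized resolution, you may want to map the prediction back before clicking or visualizing; the helper below is a minimal sketch of that mapping (an illustration, not part of the upstream demo):

```python
def rescale_to_original(coords, original_size, resized_size):
    """Map an (x, y) prediction from the smart_resize space back to the raw screenshot.

    original_size and resized_size are (width, height) tuples; this helper is
    illustrative and not part of the original demo.
    """
    x, y = coords
    orig_w, orig_h = original_size
    res_w, res_h = resized_size
    return int(round(x * orig_w / res_w)), int(round(y * orig_h / res_h))


# Hypothetical usage: a prediction made in a 1288x728 resized space mapped back
# onto a 1920x1080 screenshot.
print(rescale_to_original((512, 384), (1920, 1080), (1288, 728)))
```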
  ## 📄 Citation
  If you find this work useful, please consider citing our paper: