RCaz committed on
Commit
8a93790
·
verified ·
1 Parent(s): ace5932

Create vision_agent.py

Browse files
Files changed (1) hide show
  1. vision_agent.py +60 -0
vision_agent.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # nth_result is 1-based; without this guard 0 or a negative value would
    # silently pick an element from the end of the list via negative indexing.
    if nth_result < 1:
        raise ValueError(f"nth_result must be >= 1 (got {nth_result})")

    # XPath 1.0 string literals have no escape syntax, so choose a quoting
    # style that survives quotes embedded in the search text.
    if "'" not in text:
        literal = f"'{text}'"
    elif '"' not in text:
        literal = f'"{text}"'
    else:
        # Both quote kinds are present: split on apostrophes and glue the
        # pieces back together with concat(), quoting each apostrophe as "'".
        apostrophe_joiner = ", \"'\", "
        pieces = (f"'{part}'" for part in text.split("'"))
        literal = "concat(" + apostrophe_joiner.join(pieces) + ")"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {literal})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the page so the selected match is brought into view.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result
17
+
18
+
19
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # NOTE(review): docstring left verbatim; the @tool decorator presumably
    # consumes it as the tool description — confirm before rewording.
    driver.back()
    return None
23
+
24
+
25
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Emit a single ESCAPE key press through a Selenium action chain.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a str; previously the function fell through and
    # returned None. Report what was done so the caller gets a real observation.
    return "Sent ESCAPE key to dismiss pop-ups."
31
+
32
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to ``step_log``.

    Screenshots stored on steps that are two or more steps older than the
    current one are dropped so that only recent images are kept in the logs.

    Args:
        step_log: The step that has just finished; it receives the screenshot
            in ``observations_images`` and the URL line in ``observations``.
        agent: The agent whose ``logs`` are scanned for older steps to prune.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser session: nothing to capture and no URL to report.
        return

    current_step = step_log.step_number
    # Remove previous screenshots from logs for lean processing. The check must
    # be made on each *older* entry, not on the current step.
    for previous_step in agent.logs:
        if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
            previous_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    step_log.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL; append when the current step
    # already carries observations instead of overwriting them.
    url_info = f"Current url: {driver.current_url}"
    step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info
49
+
50
+ from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool
51
# Model handed to the agent below.
model = OpenAIServerModel(model_id="gpt-4o")

# Code agent wired with a web-search tool plus the browser helpers defined
# in this file; save_screenshot is registered as a step callback.
agent = CodeAgent(
    model=model,
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    step_callbacks=[save_screenshot],
    additional_authorized_imports=["helium"],
    verbosity_level=2,
    max_steps=20,
)