Spaces:
Sleeping
Sleeping
Create vision_agent.py
Browse files- vision_agent.py +60 -0
vision_agent.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # Occurrences are 1-based. Without this guard, 0 or a negative value would
    # silently index from the end of the match list.
    if nth_result < 1:
        raise ValueError(f"nth_result must be >= 1, got {nth_result}")

    # XPath 1.0 string literals have no escape sequences, so choose a quoting
    # form that cannot be broken by quotes inside the search text.
    if "'" not in text:
        literal = f"'{text}'"
    elif '"' not in text:
        literal = f'"{text}"'
    else:
        # Both quote kinds present: stitch single-quoted pieces back together
        # with a double-quoted apostrophe between them via concat().
        pieces = [f"'{piece}'" for piece in text.split("'")]
        literal = "concat(" + ', "\'", '.join(pieces) + ")"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {literal})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")

    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    # Separate the two sentences with a space so the report reads cleanly.
    return (
        f"Found {len(elements)} matches for '{text}'. "
        f"Focused on element {nth_result} of {len(elements)}"
    )
|
17 |
+
|
18 |
+
|
19 |
+
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Delegate to the browser session's own history navigation.
    driver.back()
    return None
|
23 |
+
|
24 |
+
|
25 |
+
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Send a single Escape key press to the page through an action chain.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a string; previously the function fell off the
    # end and returned None.
    return "Pressed Escape to dismiss any open pop-up."
|
31 |
+
|
32 |
+
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to ``step_log``.

    Args:
        step_log: The step being recorded; its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The agent whose ``logs`` are scanned so that screenshots held
            by steps two or more steps older than ``step_log`` are dropped.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser session: there is nothing to capture and no URL to report.
        return

    current_step = step_log.step_number
    # Remove previous screenshots from logs for lean processing. The check must
    # look at each *previous* step, not at the current ``step_log``.
    for previous_step in agent.logs:
        if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
            previous_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    step_log.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL, appending to whatever the current
    # step already holds.
    url_info = f"Current url: {driver.current_url}"
    if step_log.observations is None:
        step_log.observations = url_info
    else:
        step_log.observations = step_log.observations + "\n" + url_info
|
49 |
+
|
50 |
+
from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool
|
51 |
+
# Model wrapper configured with the "gpt-4o" model id.
model = OpenAIServerModel(model_id="gpt-4o")

# Web search plus the three browser helper tools defined earlier in this file.
_agent_tools = [DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f]

# save_screenshot is registered as a step callback; generated code is
# additionally allowed to import helium.
agent = CodeAgent(
    tools=_agent_tools,
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)
|