# BuyVerse / Solution / src / tools / visit_webpage.py
# Author: abdo-Mansour — commit b8ee1a5 ("done")
import re
import time
import requests
import markdownify
from typing import Any, Optional
from llama_index.core.tools import FunctionTool
from bs4 import BeautifulSoup
from bs4 import Comment
def visit_webpage(url: str) -> str:
    """
    Visit a webpage at the given URL and return its cleaned text content.

    Fetches the page with browser-like headers (some sites block the default
    requests User-Agent), strips script/style tags and HTML comments,
    collapses all whitespace runs to single spaces, and truncates the result
    to a reasonable size for an LLM context window.

    Args:
        url (str): The URL of the webpage to visit.

    Returns:
        str: The cleaned page text, truncated to at most 10,000 characters
        (with a "... (content truncated)" marker when cut), or a
        human-readable error message if the request fails.
    """
    try:
        # Sleep for 3 seconds to avoid overwhelming the server.
        time.sleep(3)
        # Browser-like headers: several sites return 403 to the default
        # python-requests User-Agent.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
        }
        # Make the HTTP GET request with a 20-second timeout.
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()  # Raise an exception for bad status codes
        # Parse the HTML content.
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove non-content elements (scripts and stylesheets).
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Extract visible text and collapse whitespace runs.
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r"\s+", " ", text)
        # Truncate to a reasonable size so the result fits an LLM context.
        # (Fixes a bug where only the first 10 characters were returned.)
        max_length = 10000
        if len(clean_text) > max_length:
            clean_text = clean_text[:max_length] + "... (content truncated)"
        return clean_text
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."
    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
# Create a LlamaIndex tool wrapping visit_webpage so an agent can invoke it.
# The description is what the LLM sees when deciding whether to call the tool.
visit_webpage_tool = FunctionTool.from_defaults(
    name="visit_webpage",
    fn=visit_webpage,
    description="Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
)