import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging

# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)


def scrape_docs(base_url, save_dir, delay=1):
    """Recursively scrape every page under base_url and save it to save_dir."""
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    visited = set()  # normalized URLs that have already been fetched

    def normalize_url(url):
        # Remove the fragment and query string and strip the trailing slash,
        # so that equivalent URLs map to a single visited-set entry
        url, _ = urldefrag(url)  # Remove the fragment
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url
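
    # Example of what normalize_url produces (illustrative URLs): both forms
    # below collapse to the same key, so the page is fetched only once.
    #   "https://docs.llamaindex.ai/en/stable/api_reference/?q=1#anchor"
    #   "https://docs.llamaindex.ai/en/stable/api_reference/"
    #   both normalize to "https://docs.llamaindex.ai/en/stable/api_reference"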

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)
        try:
            # Time out rather than hang the whole crawl on one slow page
            response = requests.get(normalized_url, timeout=10)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')

            # Save the page content under a path that mirrors the URL structure
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)
            # Determine the file name: normalize_url has already stripped any
            # trailing slash, so only a URL with an empty path lacks a
            # basename; give it 'index.html', and append '.html' otherwise
            if not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'
            # Ensure the directory exists, now that the file name is final,
            # so the correct parent directory gets created
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")

            # Find all links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(normalized_url, href)
                # Only follow links within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)
                    # Respect server rate limits
                    time.sleep(delay)
        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)
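

# A minimal iterative sketch (hypothetical helper, not used by scrape_docs
# above): the recursive scrape_page can hit Python's default recursion limit
# (about 1000 frames) on a large, densely linked site, since each followed
# link adds a stack frame. An explicit queue visits the same pages
# breadth-first without growing the call stack; the file-saving logic is
# omitted here to keep the traversal idea visible.
def crawl_breadth_first(base_url, delay=1):
    from collections import deque

    queue = deque([base_url])
    visited = set()
    while queue:
        # Drop the fragment and trailing slash, mirroring normalize_url
        url, _ = urldefrag(queue.popleft())
        url = url.rstrip('/')
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as e:
            logging.error(f"Failed to fetch {url}: {e}")
            continue
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href'])
            if full_url.startswith(base_url):
                queue.append(full_url)
        time.sleep(delay)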


if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")