import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging

# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)


def scrape_docs(base_url, save_dir, delay=1):
    """Recursively scrape every page under base_url and save it to save_dir."""
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    visited = set()  # normalized URLs that have already been fetched

    def normalize_url(url):
        # Remove the fragment and query string and strip the trailing slash,
        # so that equivalent URLs map to a single visited-set entry
        url, _ = urldefrag(url)  # Remove the fragment
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url
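
    # Example of what normalize_url produces (illustrative URLs): both forms
    # below collapse to the same key, so the page is fetched only once.
    #   "https://docs.llamaindex.ai/en/stable/api_reference/?q=1#anchor"
    #   "https://docs.llamaindex.ai/en/stable/api_reference/"
    #   both normalize to "https://docs.llamaindex.ai/en/stable/api_reference"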

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)
        try:
            # Time out rather than hang the whole crawl on one slow page
            response = requests.get(normalized_url, timeout=10)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')

            # Save the page content under a path that mirrors the URL structure
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)
            # Determine the file name: normalize_url has already stripped any
            # trailing slash, so only a URL with an empty path lacks a
            # basename; give it 'index.html', and append '.html' otherwise
            if not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'
            # Ensure the directory exists, now that the file name is final,
            # so the correct parent directory gets created
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")

            # Find all links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(normalized_url, href)
                # Only follow links within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)
                    # Respect server rate limits
                    time.sleep(delay)
        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)
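

# A minimal iterative sketch (hypothetical helper, not used by scrape_docs
# above): the recursive scrape_page can hit Python's default recursion limit
# (about 1000 frames) on a large, densely linked site, since each followed
# link adds a stack frame. An explicit queue visits the same pages
# breadth-first without growing the call stack; the file-saving logic is
# omitted here to keep the traversal idea visible.
def crawl_breadth_first(base_url, delay=1):
    from collections import deque

    queue = deque([base_url])
    visited = set()
    while queue:
        # Drop the fragment and trailing slash, mirroring normalize_url
        url, _ = urldefrag(queue.popleft())
        url = url.rstrip('/')
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as e:
            logging.error(f"Failed to fetch {url}: {e}")
            continue
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href'])
            if full_url.startswith(base_url):
                queue.append(full_url)
        time.sleep(delay)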


if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")