import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging

# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)


def scrape_docs(base_url, save_dir, delay=1):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    visited = set()

    def normalize_url(url):
        # Remove fragments and query parameters, and normalize trailing slashes
        url, _ = urldefrag(url)  # Remove the fragment
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)

        try:
            response = requests.get(normalized_url)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return

            soup = BeautifulSoup(response.text, 'html.parser')

            # Save the page content under a path mirroring the URL structure
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)

            # Ensure the directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Determine the file name: append 'index.html' if it's a directory
            if parsed_url.path.endswith('/') or not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")

            # Find all links on the page and recurse into them
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(normalized_url, href)
                # Only follow links within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)

            # Respect server rate limits
            time.sleep(delay)

        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)


if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")