import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging

# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)
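
# Optional politeness check (a sketch, not used by the original script below): besides
# rate limiting, a crawler can consult the site's robots.txt before fetching pages.
# This helper uses only the standard library's urllib.robotparser; the robots.txt
# location is simply assumed to live at the root of the target host.
from urllib.robotparser import RobotFileParser

def allowed_by_robots(url, user_agent="*"):
    # Fetch and parse robots.txt at the site root, then ask whether `url` may be crawled
    parsed = urlparse(url)
    robots = RobotFileParser()
    robots.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    robots.read()
    return robots.can_fetch(user_agent, url)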

def scrape_docs(base_url, save_dir, delay=1):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    visited = set()

    def normalize_url(url):
        # Remove fragments and query parameters, and normalize slashes
        url, _ = urldefrag(url)  # Remove the fragment
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)
        try:
            # Respect server rate limits: pause before every request
            time.sleep(delay)
            response = requests.get(normalized_url, timeout=10)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')

            # Save the page content
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)

            # Ensure the directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Determine the file name: append 'index.html' if it's a directory
            if parsed_url.path.endswith('/') or not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")

            # Find all links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Resolve relative links against the final URL (after any redirects),
                # not the slash-stripped normalized URL, so they point at the right level
                full_url = urljoin(response.url, href)
                # Only follow links within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)
        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)
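
# Alternative sketch (not part of the original script): scrape_page above recurses once
# per page, so a very deep documentation tree could exceed Python's recursion limit.
# The same crawl can be driven iteratively with a deque-based frontier; this minimal
# variant only collects in-scope URLs and omits the file-saving logic shown above.
from collections import deque

def crawl_iteratively(base_url, delay=1):
    frontier = deque([base_url])
    seen = set()
    while frontier:
        url = frontier.popleft()
        if url in seen:
            continue
        seen.add(url)
        time.sleep(delay)  # same politeness delay as scrape_docs
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as e:
            logging.error(f"Failed to fetch {url}: {e}")
            continue
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Drop fragments so "#section" variants are not re-fetched
            full_url, _ = urldefrag(urljoin(response.url, link['href']))
            if full_url.startswith(base_url) and full_url not in seen:
                frontier.append(full_url)
    return seen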

if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")