import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging
# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)
def scrape_docs(base_url, save_dir, delay=1):
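    """Recursively crawl base_url and mirror every HTML page under it into save_dir.

    Directory-style URLs are saved as index.html, other pages get an .html
    extension, and a delay (in seconds) is applied between requests.
    """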
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    visited = set()

    def normalize_url(url):
        # Strip the fragment and query string and drop any trailing slash so
        # equivalent URLs are only visited once
        url, _ = urldefrag(url)
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)
        try:
            # Time out so a single unresponsive page cannot hang the crawl
            response = requests.get(normalized_url, timeout=30)
            # Respect server rate limits: pause between consecutive requests
            time.sleep(delay)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')
            # Map the URL path to a file path inside save_dir
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)
            # Determine the file name: use 'index.html' for directory-style URLs
            if parsed_url.path.endswith('/') or not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'
            # Ensure the target directory exists before writing the page
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # Save the page content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")
            # Find all links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(normalized_url, href)
                # Only follow links that stay within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)
        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)
if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")