import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse, urldefrag
import time
import logging
# Set up logging for error handling
logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)
def scrape_docs(base_url, save_dir, delay=1):
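    """Recursively crawl base_url and mirror every HTML page under it into save_dir.

    Directory-style URLs are saved as index.html, other pages get an .html
    extension, and a delay (in seconds) is applied between requests.
    """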
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    visited = set()

    def normalize_url(url):
        # Strip the fragment and query string and drop any trailing slash so
        # equivalent URLs are only visited once
        url, _ = urldefrag(url)
        parsed_url = urlparse(url)
        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
        return normalized_url

    def scrape_page(url):
        normalized_url = normalize_url(url)
        if normalized_url in visited:
            return
        visited.add(normalized_url)
        try:
            # Time out so a single unresponsive page cannot hang the crawl
            response = requests.get(normalized_url, timeout=30)
            # Respect server rate limits: pause between consecutive requests
            time.sleep(delay)
            if response.status_code != 200:
                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')
            # Map the URL path to a file path inside save_dir
            parsed_url = urlparse(normalized_url)
            relative_path = parsed_url.path.lstrip('/')
            file_path = os.path.join(save_dir, relative_path)
            # Determine the file name: use 'index.html' for directory-style URLs
            if parsed_url.path.endswith('/') or not os.path.basename(file_path):
                file_path = os.path.join(file_path, 'index.html')
            elif not file_path.endswith('.html'):
                file_path += '.html'
            # Ensure the target directory exists before writing the page
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # Save the page content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Scraped: {normalized_url}")
            # Find all links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(normalized_url, href)
                # Only follow links that stay within the base URL
                if full_url.startswith(base_url):
                    scrape_page(full_url)
        except Exception as e:
            logging.error(f"Failed to scrape {normalized_url}: {e}")
            print(f"Failed to scrape {normalized_url}, see log for details.")

    scrape_page(base_url)
if __name__ == "__main__":
    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
    save_dir = "llamaindex_docs"
    scrape_docs(base_url, save_dir, delay=1)
    print("Scraping completed.")