Spaces:

towardsai-tutors
/

buster

Running

buster / data /scrapper_to_csv.py

Louis-François Bouchard

Advanced rag course update (#44)

0f06abd unverified about 1 year ago

2.88 kB

	import pandas as pd
	import re


	def parse_markdown_file(file_path):
	    """Parse a scraped markdown dump into a DataFrame of content chunks.

	    A line whose stripped text is ``--`` starts a new page. The four lines
	    that follow it are consumed as the page header: the URL line and the
	    title line (for each, everything after the first space is kept), then
	    two lines that are skipped (description and keywords). Every later
	    line up to the next separator is page content. Lines that appear
	    before the first separator are ignored.

	    Each finished page is handed to ``process_content``, which appends
	    rows to the shared ``entries`` list.

	    A header that is cut short by end-of-file, or a URL/title line that
	    contains no space, yields an empty string for that field instead of
	    raising ``StopIteration`` / ``IndexError``.

	    Args:
	        file_path: Path of the markdown file to read (opened as UTF-8).

	    Returns:
	        A ``pandas.DataFrame`` built from the collected ``entries``.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    entries = []
	    with open(file_path, "r", encoding="utf-8") as file:
	        current_url, current_title = "", ""
	        # Collect lines and join once per page rather than growing a
	        # string with repeated ``+=``.
	        content_lines = []
	        inside_page = False

	        for line in file:
	            if line.strip() == "--":  # Page separator
	                if inside_page:
	                    # Flush the page that just ended.
	                    process_content(
	                        entries,
	                        current_url,
	                        current_title,
	                        "".join(content_lines),
	                    )
	                    content_lines = []

	                inside_page = True
	                # Header lines are pulled from the same iterator the loop
	                # uses, so the loop resumes after them. The defaults keep
	                # a truncated header from raising at end-of-file.
	                current_url = next(file, "").strip().partition(" ")[2]
	                current_title = next(file, "").strip().partition(" ")[2]
	                # Skip the next two lines (description and keywords).
	                next(file, None)
	                next(file, None)
	            elif inside_page:
	                content_lines.append(line)

	    # Flush the last page, which has no trailing separator.
	    if inside_page:
	        process_content(
	            entries, current_url, current_title, "".join(content_lines)
	        )

	    return pd.DataFrame(entries)


	def process_content(entries, url, title, content):
	    """Split one page's markdown on ``## `` headers and record each section.

	    Text before the first ``## `` header is recorded under the section
	    title ``"Main"`` when it is not blank. Every ``## `` header then
	    becomes a section whose text is the content up to the next ``## ``
	    header, stripped and with newlines replaced by spaces.

	    Only lines starting with exactly ``## `` (two hashes and a space) act
	    as section boundaries. Other lines, including deeper headers such as
	    ``### ...``, stay inside the text of the section that contains them,
	    also when they are the first line of the page.

	    Args:
	        entries: List that ``add_content_section`` appends rows to.
	        url: URL of the page, forwarded with every section.
	        title: Title of the page, forwarded with every section.
	        content: Markdown body of the page.
	    """
	    # Regular expression to match level-2 markdown headers.
	    header_pattern = re.compile(r"^## (.+)$", re.MULTILINE)

	    # Because the pattern has one capturing group, the split result is
	    # [preamble, header_1, body_1, header_2, body_2, ...]: the first item
	    # is always the text before the first header and is never a header.
	    preamble, *parts = header_pattern.split(content)

	    if preamble.strip():
	        add_content_section(entries, title, url, "Main", preamble)

	    # Headers sit at even offsets of ``parts`` and their bodies at odd
	    # offsets.
	    for header, body in zip(parts[0::2], parts[1::2]):
	        section_text = body.strip().replace("\n", " ")
	        add_content_section(
	            entries, title, url, header.strip(), section_text
	        )


	def add_content_section(
	    entries,
	    title,
	    url,
	    section_title,
	    section_text,
	    *,
	    source="langchain",
	    chunk_size=6000,
	):
	    """Append one section to ``entries``, split into fixed-size chunks.

	    The section is rendered as ``"<section_title>: <section_text>"`` with
	    every newline replaced by a space, then cut into consecutive pieces of
	    at most ``chunk_size`` characters. One dict per piece is appended to
	    ``entries`` with the keys ``title``, ``url``, ``source`` and
	    ``content``.

	    Args:
	        entries: List to append the chunk dicts to (mutated in place).
	        title: Page title stored in every chunk.
	        url: Page URL stored in every chunk.
	        section_title: Section heading, used as the prefix of the text.
	        section_text: Section body.
	        source: Value stored under the ``"source"`` key.
	        chunk_size: Maximum number of characters per chunk.

	    Raises:
	        ValueError: If ``chunk_size`` is smaller than 1.
	    """
	    if chunk_size < 1:
	        # A zero step makes range() raise and a negative step would yield
	        # no chunks at all, so reject both explicitly.
	        raise ValueError("chunk_size must be a positive integer")

	    full_section = f"{section_title}: {section_text}".replace("\n", " ")
	    entries.extend(
	        {
	            "title": title,
	            "url": url,
	            "source": source,
	            "content": full_section[start : start + chunk_size],
	        }
	        for start in range(0, len(full_section), chunk_size)
	    )


	# Convert the scraped markdown dump into a CSV of content chunks.
	markdown_file_path = "data/langchain_scrape.md"
	df = parse_markdown_file(markdown_file_path)
	# Show the first few rows as a quick check before writing the CSV.
	print("Final DataFrame:", df.head(), sep="\n")
	df.to_csv(path_or_buf="data/langchain.csv", index=False)