Spaces:
Running
Running
import pandas as pd | |
import re | |
def parse_markdown_file(file_path):
    """Parse a scraped markdown dump into a DataFrame of content entries.

    The file is read as a sequence of pages. A line whose stripped text is
    exactly ``--`` starts a new page. The four lines after the separator are
    consumed as the page header: the first supplies the URL, the second the
    title, and the next two are discarded (the original author's note calls
    them description and keywords). For the URL and title lines, everything
    after the first space is kept. Every other line up to the next separator
    belongs to the page body, which is handed to ``process_content``. Lines
    that appear before the first separator are ignored.

    A header that is cut short by the end of the file, or a header line that
    has no value after its first space, yields an empty string instead of
    raising, so one incomplete page does not abort the whole parse.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read.

    Returns:
        A ``pandas.DataFrame`` built from the entries that
        ``process_content`` appended while the pages were processed.
    """
    entries = []

    def header_value(raw_line):
        # Keep what follows the first space; "" when there is nothing there.
        return raw_line.strip().partition(" ")[2]

    with open(file_path, "r", encoding="utf-8") as file:
        current_url, current_title = "", ""
        # Body lines are collected in a list and joined once per page, which
        # avoids rebuilding an ever-growing string on every line.
        content_lines = []
        inside_page = False

        for line in file:
            if line.strip() == "--":  # Page separator
                if inside_page:
                    # Flush the page that just ended.
                    process_content(
                        entries,
                        current_url,
                        current_title,
                        "".join(content_lines),
                    )
                content_lines = []
                inside_page = True

                # The header is read from the same iterator the loop uses, so
                # these lines never reach the loop body. The defaults keep a
                # truncated header at end of file from raising StopIteration.
                current_url = header_value(next(file, ""))
                current_title = header_value(next(file, ""))
                # Skip the next two header lines (description and keywords).
                next(file, None)
                next(file, None)
            elif inside_page:
                content_lines.append(line)

        # Flush the final page, which has no separator after it.
        if inside_page:
            process_content(
                entries, current_url, current_title, "".join(content_lines)
            )

    return pd.DataFrame(entries)
# Matches a level-two markdown header line ("## Title") and captures its text.
_HEADER_PATTERN = re.compile(r"^## (.+)$", re.MULTILINE)


def process_content(entries, url, title, content):
    """Split one page body into sections and record each of them.

    The body is split on level-two markdown headers (lines of the form
    ``## Title``). Text before the first such header is recorded under the
    section name ``"Main"`` when it is not blank. Each header and the text
    that follows it is recorded as its own section, with the header text as
    the section name.

    Args:
        entries: List that receives the generated entries; it is extended in
            place through ``add_content_section``.
        url: URL of the page the content belongs to.
        title: Title of the page the content belongs to.
        content: Raw markdown body of the page.

    Returns:
        None. Results are appended to ``entries``.
    """
    # Because the pattern has one capturing group, re.split returns
    # [preamble, header_1, body_1, header_2, body_2, ...].
    sections = _HEADER_PATTERN.split(content)

    # A leading "## " header is already moved out of the preamble by the
    # split, so no extra prefix check is needed. Such a check would only
    # discard preambles that open with a deeper header such as "### Intro".
    preamble = sections[0]
    if preamble.strip():
        add_content_section(entries, title, url, "Main", preamble)

    # Pair every captured header with the body text that follows it.
    for header, body in zip(sections[1::2], sections[2::2]):
        section_header = header.strip()
        section_text = body.strip().replace("\n", " ")  # Flatten to one line
        add_content_section(entries, title, url, section_header, section_text)
def add_content_section(
    entries,
    title,
    url,
    section_title,
    section_text,
    *,
    chunk_size=6000,
    source="langchain",
):
    """Append one section to ``entries``, split into fixed-size chunks.

    The section is rendered as ``"<section_title>: <section_text>"`` with
    every newline replaced by a space. The resulting string is cut into
    consecutive slices of at most ``chunk_size`` characters, and one entry is
    appended per slice.

    Args:
        entries: List to extend in place with the generated dictionaries.
        title: Page title stored under the ``"title"`` key.
        url: Page URL stored under the ``"url"`` key.
        section_title: Name of the section, used as the content prefix.
        section_text: Text of the section.
        chunk_size: Maximum number of characters per entry. Defaults to 6000.
        source: Value stored under the ``"source"`` key. Defaults to
            ``"langchain"``.

    Returns:
        None. Results are appended to ``entries``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step would silently produce no entries at all.
        raise ValueError("chunk_size must be a positive integer")

    full_section = f"{section_title}: {section_text}".replace("\n", " ")

    entries.extend(
        {
            "title": title,
            "url": url,
            "source": source,
            "content": full_section[start : start + chunk_size],
        }
        for start in range(0, len(full_section), chunk_size)
    )
# Script section: parse the scraped markdown dump and export it as CSV.
markdown_file_path = "data/langchain_scrape.md"
df = parse_markdown_file(markdown_file_path)

# Show the first few rows as a quick check before writing the file.
print("Final DataFrame:", df.head(), sep="\n")

# index=False leaves the DataFrame index out of the CSV.
df.to_csv("data/langchain.csv", index=False)