# pdf_extract.py
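# Pipeline: extract text from the local Bentham PDFs, dump the raw text to
# bentham_text.txt, strip Online Library of Liberty boilerplate, split the
# cleaned text into ~100-word chunks, and save them as a Hugging Face dataset.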
import glob
import re

from pdfminer.high_level import extract_text

bentham_texts = []


def extract_text_from_pdf(pdf_path):
    # pdfminer accepts either a file path or a binary file object here
    text = extract_text(pdf_path)
    return text
bentham_pdfs = glob.glob('./Bentham*.pdf')
for pdf in bentham_pdfs:
    print(pdf)
    # extract_text opens the file itself, so the path can be passed directly
    text = extract_text_from_pdf(pdf)
    bentham_texts.append(text)
bentham_text_string = ' '.join(bentham_texts)
with open('bentham_text.txt', 'w') as f:
    f.write(bentham_text_string)
## Reload the raw text dump from disk and clean it
with open('bentham_text.txt', 'r') as f:
    bentham_text_strings = f.readlines()
bentham_text_string = ''.join(bentham_text_strings)
def clean_text(text):
    # Step 1: Remove section markers such as "§ 12."
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)
    # Step 2: Remove unwanted boilerplate
    # Patterns like "PLL v6.0 (generated September, 2011)", URLs,
    # and "Online Library of Liberty:" header lines
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
    # "Online Library of Liberty" lines that might not fit the exact previous pattern
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned_text)  # Removes non-printable characters
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # Removes literal escape sequences such as \n, \t, \r
    # Step 3: Drop empty lines, standalone page numbers, and navigation placeholders
    patterns_to_remove = [
        r'^\s*$',                          # Empty lines
        r'^\s*\d+\s*$',                    # Standalone numeric lines
        r'\[Back to Table of Contents\]',  # Specific placeholders
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
    return cleaned_text
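
# Illustrative sanity check (assumed sample line, not part of the original script):
# clean_text should strip section markers, the PLL footer, and placeholders,
# leaving only the prose.
sample = '[Back to Table of Contents]\n§ 14. Of the principle of utility.\nPLL v6.0 (generated September, 2011)\n'
print(repr(clean_text(sample)))  # -> ' Of the principle of utility.'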
cleaned_lines = []
for line in bentham_text_strings:
    cleaned_line = clean_text(line)
    if cleaned_line != '':
        cleaned_lines.append(cleaned_line)
def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of approximately `chunk_size` words.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    # Split the text into words
    words = text.split()
    # Create chunks of approximately `chunk_size` words
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
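
# Illustrative check (assumed example, not in the original script): a 250-word
# input should split into chunks of 100, 100, and 50 words.
assert [len(c.split()) for c in split_into_chunks('word ' * 250)] == [100, 100, 50]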
chunks = split_into_chunks(' '.join(cleaned_lines), chunk_size=100)
from datasets import Dataset
# Wrap the chunk list in a Hugging Face Dataset and persist it to disk
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
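
# Downstream usage sketch (assumed follow-on step, not part of the original script):
# reload the saved dataset and peek at the first chunk.
reloaded = Dataset.load_from_disk('./bentham_chunked')
print(reloaded.num_rows, reloaded[0]['text'][:80])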