import glob
import re

from pdfminer.high_level import extract_text


def extract_text_from_pdf(pdf_path):
    """Extract the full text of one PDF; pdfminer accepts a path or an open binary file."""
    return extract_text(pdf_path)


# Collect every Bentham PDF in the working directory and extract its text
bentham_texts = []
bentham_pdfs = glob.glob('./Bentham*.pdf')
for pdf in bentham_pdfs:
    print(pdf)
    bentham_texts.append(extract_text_from_pdf(pdf))
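# If some PDFs turn out to be unreadable, a guarded variant of the loop
# (a sketch, not enabled here) could skip them instead of aborting:
#
#     for pdf in bentham_pdfs:
#         try:
#             bentham_texts.append(extract_text_from_pdf(pdf))
#         except Exception as exc:
#             print(f'skipping {pdf}: {exc}')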


# Join all extracted texts and cache them on disk so the cleaning step can
# restart from here without re-parsing the PDFs
bentham_text_string = ' '.join(bentham_texts)
with open('bentham_text.txt', 'w', encoding='utf-8') as f:
    f.write(bentham_text_string)
##
# Reload the cached text for the cleaning pass below
with open('bentham_text.txt', 'r', encoding='utf-8') as f:
    bentham_text_string = f.read()

def clean_text(text):
    # Remove section markers like '§ 12.'
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)

    # Remove Online Library of Liberty boilerplate: "PLL v6.0 (generated
    # September, 2011)" footer blocks and bare URLs
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)

    # Catch "Online Library of Liberty" lines and footer blocks that did not
    # fit the exact patterns above
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)

    # Strip non-printable characters, but keep '\n' (0x0A) so the line-based
    # patterns below and splitlines() downstream still work
    cleaned_text = re.sub(r'[\x00-\x09\x0B-\x1F\x7F-\x9F]', '', cleaned_text)
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # literal '\n', '\t', '\r' sequences left over from extraction

    patterns_to_remove = [
        r'^\s*$',  # empty lines
        r'^\s*\d+\s*$',  # standalone numeric (page-number) lines
        r'\[Back to Table of Contents\]',  # navigation placeholder
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
    return cleaned_text
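# Illustrative behaviour on made-up input lines (hypothetical, not taken
# from the corpus):
#   clean_text('PLL v6.0 (generated September, 2011)\n')      -> ''
#   clean_text('[Back to Table of Contents] § 3. Of Laws\n')  -> '  Of Laws\n'
# Leftover whitespace is collapsed later by the chunking step's .split().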
# Clean the whole text in one pass so that the multi-line (DOTALL) patterns
# above can match footer blocks spanning several lines, then drop the lines
# that end up empty
cleaned_text = clean_text(bentham_text_string)
cleaned_lines = [line for line in cleaned_text.splitlines() if line.strip()]

def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of `chunk_size` words; only the final chunk
    may be shorter.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    # Split the text into words on whitespace
    words = text.split()

    # Group every `chunk_size` consecutive words into one chunk
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    return chunks
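# Illustration: split_into_chunks('a b c d e', chunk_size=2) -> ['a b', 'c d', 'e']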

chunks = split_into_chunks(' '.join(cleaned_lines), chunk_size=100)
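# Every chunk except possibly the last therefore holds exactly 100 words,
# so len(chunks) == ceil(total_word_count / 100).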

from datasets import Dataset

# Build a Hugging Face Dataset from the text chunks created above
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
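
# Optional sanity check (a sketch): reload the saved dataset and peek at the
# first chunk. Dataset.load_from_disk is the counterpart of save_to_disk.
reloaded = Dataset.load_from_disk('./bentham_chunked')
print(len(reloaded), 'chunks; first starts:', reloaded[0]['text'][:80])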