from pdfminer.high_level import extract_text
from datasets import Dataset
import glob
import re

bentham_texts = []


def extract_text_from_pdf(pdf_file):
    # pdfminer's extract_text accepts either a file path or a file-like object.
    text = extract_text(pdf_file)
    return text


bentham_pdfs = glob.glob('./Bentham*.pdf')
for pdf in bentham_pdfs:
    print(pdf)
    with open(pdf, 'rb') as f:
        text = extract_text_from_pdf(f)
    bentham_texts.append(text)

bentham_text_string = ' '.join(bentham_texts)
with open('bentham_text.txt', 'w') as f:
    f.write(bentham_text_string)

# Reload the concatenated text from disk so the later steps can start from the saved file.
with open('bentham_text.txt', 'r') as f:
    bentham_text_strings = f.readlines()
bentham_text_string = ''.join(bentham_text_strings)


def clean_text(text):
    # Step 1: remove section markers like "§ 1."
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)
    # Step 2: remove unwanted boilerplate patterns,
    # e.g. "PLL v6.0 (generated September, 2011)" blocks and URLs
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
    # Remove "Online Library of Liberty" lines that may not fit the exact pattern above
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned_text)  # remove non-printable characters
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # remove literal escape sequences such as \n, \t, \r
    patterns_to_remove = [
        r'^\s*$',                          # empty lines
        r'^\s*\d+\s*$',                    # standalone numeric lines
        r'\[Back to Table of Contents\]',  # specific placeholders
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
    return cleaned_text


cleaned_lines = []
for line in bentham_text_strings:
    cleaned_line = clean_text(line)
    if cleaned_line != '':
        cleaned_lines.append(cleaned_line)


def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of approximately `chunk_size` words.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    # Split the text into words
    words = text.split()
    # Group the words into chunks of approximately `chunk_size` words
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks


chunks = split_into_chunks(' '.join(cleaned_lines), 100)

# Build a Hugging Face dataset from the list of text chunks and save it to disk.
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
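
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# reload the saved dataset with datasets.load_from_disk and inspect one chunk
# to confirm the cleaning and ~100-word chunking produced what was expected.
from datasets import load_from_disk

reloaded = load_from_disk('./bentham_chunked')
print(len(reloaded))               # number of chunks in the dataset
print(reloaded[0]['text'][:200])   # preview the start of the first chunk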