import glob
import re

from pdfminer.high_level import extract_text


def extract_text_from_pdf(pdf_path):
    """Extract the full text of a single PDF with pdfminer.six."""
    text = extract_text(pdf_path)
    return text


bentham_texts = []
bentham_pdfs = glob.glob('./Bentham*.pdf')

for pdf in bentham_pdfs:
    print(pdf)
    # extract_text accepts an open binary file object as well, but
    # passing the path lets pdfminer manage the file handle itself.
    text = extract_text_from_pdf(pdf)
    bentham_texts.append(text)
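
# A defensive variant of the loop (a sketch, not part of the pipeline
# above): some scanned PDFs can raise parsing errors, which this would
# skip instead of aborting the run.
#
#     for pdf in bentham_pdfs:
#         try:
#             bentham_texts.append(extract_text_from_pdf(pdf))
#         except Exception as exc:
#             print(f'skipping {pdf}: {exc}')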

# Concatenate all volumes and keep a plain-text copy of the corpus.
bentham_text_string = ' '.join(bentham_texts)

with open('bentham_text.txt', 'w', encoding='utf-8') as f:
    f.write(bentham_text_string)

# Reload the saved corpus, so the cleaning steps below can be rerun
# without re-extracting the PDFs.
with open('bentham_text.txt', 'r', encoding='utf-8') as f:
    bentham_text_strings = f.readlines()

bentham_text_string = ''.join(bentham_text_strings)


def clean_text(text):
    """Strip Online Library of Liberty boilerplate and layout noise."""
    # Section markers such as '§ 1.'
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)

    # Per-page PLL footer blocks: 'PLL v7.0 (generated ...)' followed by
    # a URL and the 'Online Library of Liberty:' running header.
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)

    # Bare URLs.
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)

    # Any running headers or footer blocks the patterns above missed.
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)

    # Literal escape sequences that occasionally survive extraction.
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)

    # Line-anchored noise.
    patterns_to_remove = [
        r'^\s*$',                          # blank lines
        r'^\s*\d+\s*$',                    # bare page numbers
        r'\[Back to Table of Contents\]',  # navigation links
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)

    # Strip control characters last, replacing them with spaces rather
    # than deleting them: the line-anchored patterns above need to see
    # the newlines, and words on adjacent lines must not fuse together.
    cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', ' ', cleaned_text)

    return cleaned_text
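
# Quick illustrative check on a fabricated footer block (an assumption;
# the exact OLL layout varies between volumes):
_sample = 'Online Library of Liberty: The Works of Jeremy Bentham\n§ 1. Of the principle of utility.\n42\n'
print(repr(clean_text(_sample)))  # boilerplate line and page number are gone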

# Clean the full text in one pass: the multi-line PLL footer patterns
# only match when they can see the newlines between the header, URL,
# and generator lines, so the text is not cleaned line by line.
cleaned_text = clean_text(bentham_text_string)


def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of approximately `chunk_size` words.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks


chunks = split_into_chunks(cleaned_text, 100)
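
# Quick size check on the result.
print(f'{len(chunks)} chunks of roughly 100 words each')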

from datasets import Dataset

# Wrap the chunks in a Hugging Face Dataset and save it to disk.
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
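
# The saved dataset can be reloaded later, e.g. from a training script:
from datasets import load_from_disk

reloaded = load_from_disk('./bentham_chunked')
print(reloaded)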