"""Clean a plain-text book and export it as fixed-size word chunks.

Run as a script, this reads ``./finn_wake.txt``, drops lines that contain
only digits (treated as page numbers), splits every remaining non-blank
line into chunks of at most 100 words, and writes the chunks to
``finn_wake.csv`` and to a Hugging Face dataset directory
``finn_wake_dataset``.
"""

import re

import pandas as pd


def is_page_number(line: str) -> bool:
    """Return True if *line* consists only of digits once stripped.

    Blank lines return False because ``"".isdigit()`` is False.
    """
    return line.strip().isdigit()


def split_paragraph_into_smaller_parts(paragraph: str, max_length: int = 100):
    """Yield consecutive chunks of *paragraph* of at most *max_length* words.

    Words are whitespace-separated tokens (``str.split()``); each chunk is
    re-joined with single spaces, so original spacing is not preserved.
    An empty or whitespace-only paragraph yields nothing.

    Raises:
        ValueError: if *max_length* is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would either crash inside range() with an opaque
    # message (0) or silently yield nothing and drop the text (negative).
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    words = paragraph.split()
    for start in range(0, len(words), max_length):
        yield ' '.join(words[start:start + max_length])


def remove_page_numbers(lines):
    """Return the items of *lines* that are not page-number lines."""
    return [line for line in lines if not is_page_number(line)]


def chunk_text(text: str, max_length: int = 100):
    """Split *text* on newlines and chunk every non-blank line.

    Returns a flat list of chunk strings, in document order, each holding
    at most *max_length* words.
    """
    chunks = []
    for paragraph in text.split('\n'):
        if paragraph.strip():
            chunks.extend(
                split_paragraph_into_smaller_parts(paragraph, max_length=max_length)
            )
    return chunks


def main(
    input_path: str = "./finn_wake.txt",
    csv_path: str = 'finn_wake.csv',
    dataset_path: str = 'finn_wake_dataset',
    max_length: int = 100,
):
    """Run the full pipeline and return ``(df, dataset)``.

    Reads *input_path* as UTF-8, removes page-number lines, chunks the
    remaining text, then writes a one-column (``text``) CSV to *csv_path*
    and saves the equivalent ``datasets.Dataset`` to *dataset_path*.
    """
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines that do not have the `datasets` package installed.
    from datasets import Dataset

    with open(input_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    text = ''.join(remove_page_numbers(lines))
    split_paragraphs = chunk_text(text, max_length=max_length)

    df = pd.DataFrame(split_paragraphs, columns=['text'])
    dataset = Dataset.from_pandas(df)

    df.to_csv(csv_path, index=False)
    dataset.save_to_disk(dataset_path)
    return df, dataset


if __name__ == "__main__":
    # Bind the results at module level so an interactive session that runs
    # this file still sees `df` and `dataset`.
    df, dataset = main()