|
import re |
|
def is_page_number(line: str) -> bool:
    """Return True if *line* holds nothing but digits once stripped.

    Leading and trailing whitespace (including the trailing newline kept
    by ``readlines``) is ignored before the check.  An empty or
    whitespace-only line returns False, because ``str.isdigit`` is False
    for the empty string.

    Args:
        line: A single line of text.

    Returns:
        True when the stripped line is non-empty and every character is
        a digit according to ``str.isdigit``; False otherwise.
    """
    return line.strip().isdigit()
|
# Read the raw text line by line; line endings are kept so the filtered
# lines can be concatenated back together unchanged.
with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Drop every line that is_page_number() identifies as a bare page number.
filtered_lines = [line for line in lines if not is_page_number(line)]

# Reassemble the remaining lines into a single string.
text = ''.join(filtered_lines)
|
from datasets import Dataset |
|
import pandas as pd |
|
def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Split a paragraph into smaller parts with a maximum length in words.

    The paragraph is tokenised with ``str.split()`` (any run of
    whitespace is a separator) and the words of each part are re-joined
    with single spaces, so the original spacing is not preserved.

    Args:
        paragraph: The text to split.
        max_length: Maximum number of words per part; must be at least 1.

    Yields:
        Consecutive parts of the paragraph, each containing at most
        ``max_length`` words.  Nothing is yielded for a paragraph that
        contains no words.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.  Because this is
            a generator, the error surfaces when iteration starts.
    """
    # A zero step makes range() fail with an unrelated-looking message and
    # a negative step would silently yield nothing, losing the text.
    if max_length < 1:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )

    words = paragraph.split()
    for start in range(0, len(words), max_length):
        yield ' '.join(words[start:start + max_length])
|
# Treat every newline-delimited line of the cleaned text as a paragraph.
paragraphs = text.split('\n')

# Collect the chunks of every non-blank paragraph, at most 100 words each.
split_paragraphs = []
for paragraph in paragraphs:
    # Skip empty and whitespace-only lines.
    if paragraph.strip():
        split_paragraphs.extend(
            split_paragraph_into_smaller_parts(paragraph, max_length=100)
        )

# One row per chunk, in a single 'text' column.
df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)

# Persist the result both as a CSV file and as an on-disk dataset.
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')
|
|