File size: 957 Bytes
d3c92ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import re
def is_page_number(line):
    """Tell whether *line* is made up solely of digit characters.

    Leading and trailing whitespace (including the newline) is ignored.
    A line that is empty after stripping is not a page number.
    """
    trimmed = line.strip()
    return trimmed.isdigit()
# Load the raw text, discard every line that is_page_number() flags,
# and stitch the surviving lines back into a single string.
with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = list(file)
filtered_lines = list(filter(lambda raw: not is_page_number(raw), lines))
text = ''.join(filtered_lines)
from datasets import Dataset
import pandas as pd
def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Split a paragraph into chunks of at most ``max_length`` words.

    The paragraph is tokenised with ``str.split()``, so any run of
    whitespace counts as one separator and each chunk is re-joined with
    single spaces.

    Args:
        paragraph: Text to split.
        max_length: Maximum number of words per chunk; must be at least 1.

    Yields:
        Consecutive chunks of the paragraph, in order. A paragraph that
        contains no words yields nothing.

    Raises:
        ValueError: If ``max_length`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop the whole
    # paragraph; a zero step would fail with an unrelated range() message.
    if max_length < 1:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )
    words = paragraph.split()
    for start in range(0, len(words), max_length):
        yield ' '.join(words[start:start + max_length])
# Treat every newline-delimited line as a paragraph and break it into
# chunks of at most 100 words; whitespace-only lines are skipped.
paragraphs = text.split('\n')
split_paragraphs = []
for paragraph in paragraphs:
    if not paragraph.strip():
        continue
    chunks = split_paragraph_into_smaller_parts(paragraph, max_length=100)
    split_paragraphs += chunks

# One row per chunk, held in a single 'text' column.
df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)

# Persist both forms: a CSV without the index column and an on-disk dataset.
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')