# train.py
from datasets import load_dataset
from transformers import AutoTokenizer

# Load FineWeb. Note: percent slicing ("train[:1%]") still downloads the full
# split, and FineWeb is multi-TB; the "sample-10BT" config is a smaller subset
# better suited to testing.
dataset = load_dataset("HuggingFaceFW/fineweb", split="train[:1%]")  # start small for testing

# Load tokenizer. Mistral's tokenizer ships without a pad token, so padding
# to max_length would raise an error; reuse the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize. With batched=True the mapped function receives a batch of
# examples, so batch["text"] is a list of strings.
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
tokenized.save_to_disk("tokenized_dataset")
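
# To reload the tokenized dataset later (a minimal sketch; assumes the same
# "tokenized_dataset" path used in the save above):
#
#   from datasets import load_from_disk
#   tokenized = load_from_disk("tokenized_dataset")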