# train.py
from datasets import load_dataset
from transformers import AutoTokenizer
# Load FineWeb
dataset = load_dataset("HuggingFaceFW/fineweb", split="train[:1%]") # start small for testing
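# Note: even a 1% split slice may download far more of FineWeb than it uses.
# An alternative sketch for quick smoke tests (not part of the original script)
# is streaming mode, which avoids materializing the full split:
#   stream = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)
#   sample = list(stream.take(1000))  # pull just 1000 examples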
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token  # Mistral's tokenizer ships without a pad token; reuse EOS so padding works
# Tokenize: truncate/pad every example to a fixed 512-token length
def tokenize(batch):
    # With batched=True, map passes a dict of lists, so batch["text"] is a list of strings
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)  # drop raw columns, keep only model inputs
tokenized.save_to_disk("tokenized_dataset")
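
# Optional sanity check (a minimal sketch; the column names below assume the
# default fast-tokenizer output): reload the saved dataset and inspect one row.
from datasets import load_from_disk

reloaded = load_from_disk("tokenized_dataset")
print(reloaded)                       # num_rows plus the tokenized columns
print(reloaded[0]["input_ids"][:10])  # first 10 token ids of the first example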