# train.py
from datasets import load_dataset
from transformers import AutoTokenizer
# Load FineWeb
dataset = load_dataset("HuggingFaceFW/fineweb", split="train[:1%]") # start small for testing
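# Note (assumption, not in the original script): the full FineWeb corpus is very
# large (multi-terabyte), so beyond quick tests, streaming avoids downloading
# the whole dataset up front:
#   dataset = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)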
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# Mistral's tokenizer has no pad token by default; reuse EOS so padding works
tokenizer.pad_token = tokenizer.eos_token
# Tokenize each document into a fixed 512-token window
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
tokenized.save_to_disk("tokenized_dataset")
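
# --- Optional next step: a minimal fine-tuning sketch, not part of the original
# --- script. It reloads the tokenized data and runs a causal-LM fine-tune with
# --- the Hugging Face Trainer. The model name matches the tokenizer above; the
# --- output_dir, batch size, and epoch count are illustrative assumptions.
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

tokenized = load_from_disk("tokenized_dataset")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# mlm=False makes the collator copy input_ids into labels for causal LM training
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="mistral-fineweb",    # assumed output path
    per_device_train_batch_size=1,   # a 7B model needs a small per-device batch
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
)
trainer.train()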