|
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments |
|
from datasets import load_dataset, load_from_disk |
|
|
|
dataset = load_from_disk('finn_wake_dataset') |
|
|
|
tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0") |
|
|
|
tokenizer.save_pretrained(".results/checkpoint-12000/") |
|
|
|
model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0") |
|
|
|
if tokenizer.pad_token is None: |
|
print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.") |
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
def tokenize_function(examples): |
|
|
|
tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128) |
|
tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy() |
|
|
|
return tokenized_inputs |
|
|
|
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"]) |
|
train_test_split = tokenized_dataset.train_test_split(test_size=0.1) |
|
|
|
train_dataset = train_test_split['train'] |
|
eval_dataset = train_test_split['test'] |
|
training_args = TrainingArguments( |
|
output_dir="./results", |
|
num_train_epochs=3, |
|
per_device_train_batch_size=1, |
|
warmup_steps=500, |
|
weight_decay=0.01, |
|
logging_dir="./logs", |
|
logging_steps=10, |
|
save_strategy="steps", |
|
save_steps=500, |
|
save_total_limit=2, |
|
use_cpu=True) |
|
trainer = Trainer( |
|
model=model, |
|
args=training_args, |
|
train_dataset=train_dataset, |
|
eval_dataset=eval_dataset, |
|
) |
|
|
|
trainer.train(resume_from_checkpoint="./results/checkpoint-10000") |
|
|