# tangled-alpha-0.9-core / scripts / cpt_core_model_4.py
import os
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer
os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'
run_name = 'cpt-core-4'
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False
model_name = '../out/cpt-core-pre-4'
output_dir = '../out/cpt-core-4'
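# Continued pretraining (CPT) of the previous-stage checkpoint with LoRA adapters
# in full bf16 precision; load_in_4bit=False means this is plain LoRA, not QLoRA.
# The dataset directory name appears to encode the block sizes used when the
# corpus was pre-tokenized with litdata, matching dataset_block_size above.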
#
# model
#
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# print(f'{model=}')
# print('Ignore the tokenizer loaded by FastLanguageModel.from_pretrained and use AutoTokenizer.from_pretrained instead')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
# print(f'{tokenizer=}')
model = FastLanguageModel.get_peft_model(
model,
r=256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
# r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules=[
'q_proj', 'k_proj', 'v_proj', 'o_proj',
'gate_proj',
'up_proj', 'down_proj',
'embed_tokens', 'lm_head',
],
lora_alpha=32,
# lora_alpha=16,
lora_dropout=0, # Supports any, but = 0 is optimized
bias='none', # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
# use_gradient_checkpointing='unsloth', # True or "unsloth" for very long context
use_gradient_checkpointing=False,
random_state=23,
use_rslora=True, # We support rank stabilized LoRA
loftq_config=None, # And LoftQ
)
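# Optional sanity check: if the returned object is a standard PEFT model (an
# assumption; unsloth wraps peft under the hood), this prints the trainable vs.
# total parameter counts for the adapter setup above.
# model.print_trainable_parameters()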
# print(f'{model=}')
#
# dataset
#
from datasets import Dataset
from litdata import TokensLoader, StreamingDataset
litgpt_streaming_dataset = StreamingDataset(
input_dir=dataset_input_dir,
item_loader=TokensLoader(block_size=dataset_block_size),
)
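# Each item streamed from the dataset is a 1-D tensor of dataset_block_size
# token ids; the generator below wraps it into the {'input_ids': ...} records
# that the trainer expects.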
def unsloth_generator():
    global litgpt_streaming_dataset
    for batch in litgpt_streaming_dataset:
        yield {'input_ids': batch}
train_dataset = Dataset.from_generator(unsloth_generator)
dataset = train_dataset.train_test_split(test_size=0.01)
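# Optional: check the split sizes before training (99% train / 1% eval).
# print(f"{len(dataset['train'])=} {len(dataset['test'])=}")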
#
# trainer
#
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
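# Notes on the configuration below (descriptive only): max_steps is set to the
# number of blocks in the pre-split streaming dataset, so with per-device batch
# size 1 and no gradient accumulation this is roughly one pass over the data.
# fp16_full_eval=True casts the model to fp16 for evaluation even though
# training runs in bf16, presumably to reduce eval memory.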
trainer = UnslothTrainer(
model=model,
tokenizer=tokenizer,
# train_dataset=train_dataset,
train_dataset=dataset['train'],
eval_dataset=dataset['test'],
dataset_num_proc=32,
max_seq_length=max_seq_length,
max_steps=len(litgpt_streaming_dataset),
packing=False, # packing=True can make training ~5x faster for short sequences; left off here.
args=UnslothTrainingArguments(
per_device_train_batch_size=1,
# gradient_accumulation_steps=8,
warmup_ratio=0,
num_train_epochs=1,
learning_rate=5e-5,
embedding_learning_rate=5e-5 / 10.0,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
# optim='adamw_8bit',
optim='adamw_torch_fused',
weight_decay=0.01,
lr_scheduler_type='cosine',
seed=23,
output_dir=output_dir,
report_to='wandb',
run_name=run_name,
save_steps=1000,
do_eval=True,
fp16_full_eval=True,
per_device_eval_batch_size=1,
eval_accumulation_steps=4,
eval_strategy='steps',
eval_steps=1000,
),
)
trainer_stats = trainer.train()
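# Minimal follow-up sketch (not part of the original run): persist the final
# LoRA adapters and tokenizer alongside the trainer checkpoints. save_pretrained
# on a PEFT model stores only the adapter weights; merging them into the base
# model would be a separate step.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)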