import os

from unsloth import FastLanguageModel
import torch

os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'
run_name = 'cpt-core-4'

# Pre-tokenized litdata dataset; the block size must match how the data was
# prepared (the directory name encodes it).
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False  # full-precision CPT; set True for 4-bit (QLoRA-style) loading
model_name = '../out/cpt-core-pre-4'
output_dir = '../out/cpt-core-4'

# Load the checkpoint produced by the previous CPT stage.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
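
# Optional sanity check (not part of the original script); assumes a
# llama-style config that exposes max_position_embeddings.
print(f'context window: {model.config.max_position_embeddings}')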

# Attach LoRA adapters. embed_tokens and lm_head are included so the
# embeddings can adapt during continued pretraining; they get a separate,
# lower learning rate via embedding_learning_rate below.
model = FastLanguageModel.get_peft_model(
    model,
    r=256,
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
        'embed_tokens', 'lm_head',
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias='none',
    # 'unsloth' would enable offloaded gradient checkpointing if memory is tight.
    use_gradient_checkpointing=False,
    random_state=23,
    use_rslora=True,  # rank-stabilized LoRA scaling (alpha / sqrt(r))
    loftq_config=None,
)
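
# Optional sanity check (not part of the original script): with r=256 across
# all projections plus the embeddings, the adapter parameter count is large;
# print_trainable_parameters() (from PEFT) reports it.
model.print_trainable_parameters()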

from datasets import Dataset
from litdata import TokensLoader, StreamingDataset

# Stream fixed-size token blocks from the pre-tokenized dataset.
litgpt_streaming_dataset = StreamingDataset(
    input_dir=dataset_input_dir,
    item_loader=TokensLoader(block_size=dataset_block_size),
)
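
# Optional sanity check (not part of the original script): each item is one
# block of dataset_block_size token ids.
print(f'streaming dataset items: {len(litgpt_streaming_dataset)}')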

def unsloth_generator():
    # Wrap each token block in the column name the trainer expects.
    for batch in litgpt_streaming_dataset:
        yield {'input_ids': batch}


# Materialize the stream into a datasets.Dataset and hold out 1% for eval.
train_dataset = Dataset.from_generator(unsloth_generator)
dataset = train_dataset.train_test_split(test_size=0.01)
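
# Optional sanity check (not part of the original script): confirm the
# 99/1 train/eval split sizes before training.
print(f"train={len(dataset['train'])} eval={len(dataset['test'])}")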

from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_num_proc=32,
    max_seq_length=max_seq_length,
    packing=False,
    args=UnslothTrainingArguments(
        per_device_train_batch_size=1,
        warmup_ratio=0,
        num_train_epochs=1,
        # max_steps belongs in the training arguments, not the trainer
        # constructor; cap it at the train split so it matches one epoch.
        max_steps=len(dataset['train']),
        learning_rate=5e-5,
        # Unsloth-specific: lower learning rate for embed_tokens / lm_head.
        embedding_learning_rate=5e-5 / 10.0,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim='adamw_torch_fused',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
        output_dir=output_dir,
        report_to='wandb',
        run_name=run_name,
        save_steps=1000,
        do_eval=True,
        # Evaluate in the same precision as training.
        bf16_full_eval=is_bfloat16_supported(),
        fp16_full_eval=not is_bfloat16_supported(),
        per_device_eval_batch_size=1,
        eval_accumulation_steps=4,
        eval_strategy='steps',
        eval_steps=1000,
    ),
)

trainer_stats = trainer.train()
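
# Optional follow-up (not part of the original script): persist the LoRA
# adapters and tokenizer so the run can be merged or resumed later.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)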