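# Continued-pretraining run 'cpt-core-4': LoRA training with Unsloth over a
# pre-tokenised litdata stream at 16K context length.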
import os
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer
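# Set the W&B project before the trainer initialises wandb (report_to='wandb' below).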
os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'
run_name = 'cpt-core-4'
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
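# 16385 = 16384 + 1: each block presumably carries one extra token so inputs and
# labels can be shifted by one position for next-token prediction.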
dataset_block_size = 16385
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False
model_name = '../out/cpt-core-pre-4'
output_dir = '../out/cpt-core-4'
#
# model
#
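# FastLanguageModel.from_pretrained returns the model together with its tokenizer;
# load_in_4bit=False keeps full-precision bf16 weights for continued pretraining.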
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# print(f'{model=}')
# print('Ignoring the tokenizer loaded by FastLanguageModel.from_pretrained; using AutoTokenizer.from_pretrained instead')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
# print(f'{tokenizer=}')
model = FastLanguageModel.get_peft_model(
model,
r=256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
# r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules=[
'q_proj', 'k_proj', 'v_proj', 'o_proj',
'gate_proj',
'up_proj', 'down_proj',
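        # embed_tokens and lm_head are trained as well, the usual Unsloth setup for
        # continued pretraining; they receive the reduced embedding_learning_rate set below.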
'embed_tokens', 'lm_head',
],
lora_alpha=32,
# lora_alpha=16,
lora_dropout=0, # Supports any, but = 0 is optimized
bias='none', # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
# use_gradient_checkpointing='unsloth', # True or "unsloth" for very long context
use_gradient_checkpointing=False,
random_state=23,
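    # With rank-stabilised LoRA the adapter scaling is lora_alpha / sqrt(r) = 32 / 16 = 2,
    # rather than the standard lora_alpha / r = 0.125, which keeps a high rank like r=256 stable.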
use_rslora=True, # We support rank stabilized LoRA
loftq_config=None, # And LoftQ
)
# print(f'{model=}')
#
# dataset
#
from datasets import Dataset
from litdata import TokensLoader, StreamingDataset
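# litdata streams pre-tokenised shards from disk; with TokensLoader each item is a
# flat sequence of dataset_block_size token ids rather than a dict of fields.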
litgpt_streaming_dataset = StreamingDataset(
input_dir=dataset_input_dir,
item_loader=TokensLoader(block_size=dataset_block_size),
)
def unsloth_generator():
    # yield each pre-tokenised block as a bare `input_ids` example
    for batch in litgpt_streaming_dataset:
        yield {'input_ids': batch}
train_dataset = Dataset.from_generator(unsloth_generator)
dataset = train_dataset.train_test_split(test_size=0.01)
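# Dataset.from_generator materialises the stream into an on-disk Arrow cache;
# train_test_split then holds out 1% of the blocks for the periodic evaluation below.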
#
# trainer
#
from unsloth import is_bfloat16_supported, UnslothTrainer, UnslothTrainingArguments
trainer = UnslothTrainer(
model=model,
tokenizer=tokenizer,
# train_dataset=train_dataset,
train_dataset=dataset['train'],
eval_dataset=dataset['test'],
dataset_num_proc=32,
max_seq_length=max_seq_length,
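    # NOTE: max_steps counts every block in the full pre-split stream; at batch size 1
    # with no gradient accumulation, one step consumes one block, so training runs
    # roughly 1% past a single epoch of the 99% train split and overrides num_train_epochs.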
max_steps=len(litgpt_streaming_dataset),
    packing=False,  # packing=True can make short-sequence training up to 5x faster, but these blocks already fill the context window
args=UnslothTrainingArguments(
per_device_train_batch_size=1,
# gradient_accumulation_steps=8,
warmup_ratio=0,
num_train_epochs=1,
learning_rate=5e-5,
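        # Unsloth-specific argument: embeddings and lm_head train at one tenth of the
        # base learning rate, a common continued-pretraining heuristic.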
embedding_learning_rate=5e-5 / 10.0,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
# optim='adamw_8bit',
optim='adamw_torch_fused',
weight_decay=0.01,
lr_scheduler_type='cosine',
seed=23,
output_dir=output_dir,
report_to='wandb',
run_name=run_name,
save_steps=1000,
do_eval=True,
        bf16_full_eval=is_bfloat16_supported(),  # evaluate in the training dtype; full-fp16 eval of a bf16 model risks overflow
        fp16_full_eval=not is_bfloat16_supported(),
per_device_eval_batch_size=1,
eval_accumulation_steps=4,
eval_strategy='steps',
eval_steps=1000,
),
)
trainer_stats = trainer.train()
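# trainer.train() returns a TrainOutput with global_step, training_loss, and a metrics dict.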