import os

from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer

os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'

run_name = 'cpt-core-4'
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False
model_name = '../out/cpt-core-pre-4'
output_dir = '../out/cpt-core-4'

#
# model
#
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# print(f'{model=}')

# print('Ignore loaded tokenizer by FastLanguageModel.from_pretrained and using AutoTokenizer.from_pretrained')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
# print(f'{tokenizer=}')

model = FastLanguageModel.get_peft_model(
    model,
    r=256,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    # r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
        'embed_tokens', 'lm_head',
    ],
    lora_alpha=32,
    # lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias='none',     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # use_gradient_checkpointing='unsloth',  # True or "unsloth" for very long context
    use_gradient_checkpointing=False,
    random_state=23,
    use_rslora=True,     # We support rank stabilized LoRA
    loftq_config=None,   # And LoftQ
)
# print(f'{model=}')

#
# dataset
#
from datasets import Dataset
from litdata import TokensLoader, StreamingDataset


litgpt_streaming_dataset = StreamingDataset(
    input_dir=dataset_input_dir,
    item_loader=TokensLoader(block_size=dataset_block_size),
)


def unsloth_generator():
    # Yield pre-tokenized blocks from the litdata StreamingDataset
    # in the {'input_ids': ...} format expected by the trainer.
    global litgpt_streaming_dataset

    for batch in litgpt_streaming_dataset:
        yield {'input_ids': batch}


train_dataset = Dataset.from_generator(unsloth_generator)
dataset = train_dataset.train_test_split(test_size=0.01)

#
# trainer
#
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments


trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    # train_dataset=train_dataset,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_num_proc=32,
    max_seq_length=max_seq_length,
    # One optimizer step per block (batch size 1, no gradient accumulation),
    # so this covers roughly one pass over the streaming dataset.
    max_steps=len(litgpt_streaming_dataset),
    packing=False,  # Can make training 5x faster for short sequences.

    args=UnslothTrainingArguments(
        per_device_train_batch_size=1,
        # gradient_accumulation_steps=8,
        warmup_ratio=0,
        num_train_epochs=1,
        learning_rate=5e-5,
        embedding_learning_rate=5e-5 / 10.0,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        # optim='adamw_8bit',
        optim='adamw_torch_fused',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
        output_dir=output_dir,
        report_to='wandb',
        run_name=run_name,

        save_steps=1000,

        do_eval=True,
        fp16_full_eval=True,
        per_device_eval_batch_size=1,
        eval_accumulation_steps=4,
        eval_strategy='steps',
        eval_steps=1000,
    ),
)

trainer_stats = trainer.train()
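
#
# persist the adapter (illustrative sketch, not part of the original run)
#
# A minimal example of saving the trained LoRA adapter and tokenizer so the
# run can be resumed or evaluated later. The 'lora' sub-directory name is an
# assumption; adjust it to whatever layout the surrounding pipeline expects.
lora_dir = os.path.join(output_dir, 'lora')
model.save_pretrained(lora_dir)      # writes the PEFT adapter weights/config
tokenizer.save_pretrained(lora_dir)  # keeps the matching tokenizer alongside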