import os
import torch
import yaml
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
from torch.utils.data import DataLoader, IterableDataset
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    LearningRateMonitor,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.nn.utils.rnn import pad_sequence
from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBarTheme
from pytorch_lightning.callbacks import TQDMProgressBar
# Set environment variable for memory management
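# "expandable_segments:True" lets the CUDA caching allocator grow existing memory
# segments instead of reserving new ones, which can reduce fragmentation when
# sequence lengths (and thus batch shapes) vary between steps.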
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Function to log GPU memory usage
def log_memory_usage(step):
    if torch.cuda.is_available():
        print(
            f"Step {step}: "
            f"Allocated = {torch.cuda.memory_allocated() / 1e9:.2f} GB, "
            f"Reserved = {torch.cuda.memory_reserved() / 1e9:.2f} GB"
        )
# Custom Collate Function
def collate_fn(batch):
    # `tokenizer` is the module-level object created in the main block below; the
    # fork-based DataLoader workers used on Linux inherit it, so it is visible here.
    input_ids = [item["input_ids"] for item in batch]
    labels = [item["labels"] for item in batch]
    input_ids = pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    # Pad labels with -100 so padded positions are ignored by the cross-entropy
    # loss computed inside the Hugging Face causal-LM head.
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"input_ids": input_ids, "labels": labels}
# Streaming Dataset
class StreamingDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length=2048):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
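    # Tokenization happens lazily as the stream is consumed; documents longer than
    # max_length are split into several chunks via return_overflowing_tokens, and
    # each chunk is yielded as its own training example.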
    def __iter__(self):
        for example in iter(self.dataset):
            tokenized = self.tokenizer(
                example["text"],
                truncation=True,
                max_length=self.max_length,
                return_overflowing_tokens=True,
            )
            # Keep the chunks as plain lists (no return_tensors="pt") so that a
            # final overflow chunk shorter than max_length cannot break tensor
            # creation; convert each chunk to a tensor individually instead.
            for chunk in tokenized["input_ids"]:
                ids = torch.tensor(chunk, dtype=torch.long)
                yield {"input_ids": ids, "labels": ids.clone()}
# Lightning Module
class SmolLMModule(LightningModule):
    def __init__(self, config, learning_rate=1e-4):
        super().__init__()
        self.config = config
        self.learning_rate = learning_rate
        self.save_hyperparameters()

        model_config = LlamaConfig(
            vocab_size=49152,
            hidden_size=config["model"]["model_config"]["hidden_size"],
            intermediate_size=config["model"]["model_config"]["intermediate_size"],
            num_hidden_layers=config["model"]["model_config"]["num_hidden_layers"],
            num_attention_heads=config["model"]["model_config"]["num_attention_heads"],
            num_key_value_heads=config["model"]["model_config"]["num_key_value_heads"],
            hidden_act=config["model"]["model_config"]["hidden_act"],
            max_position_embeddings=config["model"]["model_config"][
                "max_position_embeddings"
            ],
            initializer_range=config["model"]["model_config"]["initializer_range"],
            rms_norm_eps=1e-5,
            use_cache=True,
            pad_token_id=config["model"]["model_config"]["pad_token_id"],
            bos_token_id=config["model"]["model_config"]["bos_token_id"],
            eos_token_id=config["model"]["model_config"]["eos_token_id"],
        )
        self.model = AutoModelForCausalLM.from_config(model_config)
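    # Standard causal-LM step: the Hugging Face model shifts the labels internally,
    # so passing labels == input_ids yields the next-token prediction loss.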
    def training_step(self, batch, batch_idx):
        outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
        loss = outputs.loss
        self.log(
            "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
        )  # Log loss

        # # Log memory usage
        # if batch_idx % 10 == 0:
        #     log_memory_usage(batch_idx)

        # Release intermediate tensors
        del outputs
        torch.cuda.empty_cache()
        return loss
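    # Note: only a constant learning rate is used; no LR scheduler is returned here,
    # even though the YAML config carries a learning_rate_scheduler section.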
    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
            weight_decay=self.config["optimizer"]["weight_decay"],
        )
# Main Script
if __name__ == "__main__":
    # Load config
    with open("/kaggle/input/yaml-file/config_smollm2_135.yaml", "r") as file:
        config = yaml.safe_load(file)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
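    # Reuse EOS as the padding token (needed when the tokenizer defines no pad token)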
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    dataset = load_dataset(
        "HuggingFaceTB/smollm-corpus", "cosmopedia-v2", streaming=True
    )
    train_dataset = dataset["train"]

    # Create DataLoader
    streaming_dataset = StreamingDataset(train_dataset, tokenizer, max_length=2048)
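    # NOTE: with an IterableDataset and num_workers > 0, each worker iterates the
    # stream independently, so examples may be repeated unless the stream is
    # sharded per worker.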
    train_loader = DataLoader(
        streaming_dataset,
        batch_size=1,  # Reduced batch size
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True,
    )
    # Create model
    model = SmolLMModule(
        config,
        learning_rate=config["optimizer"]["learning_rate_scheduler"]["learning_rate"],
    )

    # Initialize TensorBoard logger
    logger = TensorBoardLogger("logs", name="smollm2")

    # Checkpoint callback configuration
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints",
        filename="model-{epoch:02d}-{step}-{train_loss:.2f}",  # Include training loss in filename
        monitor="train_loss",  # Monitor training loss
        mode="min",  # Lower loss is better
        save_top_k=3,  # Save the best 3 models
        save_last=True,  # Additionally save the last model
        every_n_train_steps=5000,  # Save every 5000 steps
        save_weights_only=False,  # Save the full model state
        auto_insert_metric_name=False,  # Don't insert metric names in the filename
    )
    # Progress bar
    # progress_bar = RichProgressBar(
    #     refresh_rate=1,
    #     leave=False,
    #     theme=RichProgressBarTheme(
    #         description="",
    #         progress_bar="#6206E0",
    #         progress_bar_finished="#6206E0",
    #         progress_bar_pulse="#6206E0",
    #         batch_progress="",
    #         time="dim",
    #         processing_speed="dim underline",
    #         metrics="italic",
    #         metrics_text_delimiter=" ",
    #         metrics_format=".3f",
    #     ),
    #     console_kwargs=None,
    # )
    progress_bar = TQDMProgressBar(refresh_rate=10)
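    # "ddp_notebook" starts the DDP workers by forking the interactive process,
    # which is the strategy intended for notebook environments such as Kaggle.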
    # Create trainer
    trainer = Trainer(
        logger=logger,
        strategy="ddp_notebook",
        accelerator="gpu",
        devices=2,
        precision="16-mixed",
        max_steps=500000,
        accumulate_grad_batches=1,
        enable_checkpointing=True,
        callbacks=[
            LearningRateMonitor(logging_interval="step"),
            progress_bar,
            checkpoint_callback,
        ],
        enable_progress_bar=True,
        enable_model_summary=True,
        log_every_n_steps=10,
    )
    # Resume from the latest checkpoint if one exists
    if os.path.exists("checkpoints/last.ckpt"):
        resume_from_checkpoint = "checkpoints/last.ckpt"
        print(f"Resuming from checkpoint: {resume_from_checkpoint}")
    else:
        resume_from_checkpoint = None
        print("Starting training from scratch")
    # Train with automatic checkpoint resumption
    trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)

    # Debug: inspect the optimizer state after training
    optimizers = trainer.optimizers
    if optimizers:
        optimizer = optimizers[0]
        print("optimizer state:", optimizer.state_dict())
    # After training, print the best model path and score
    print(f"Best model path: {checkpoint_callback.best_model_path}")
    # print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
    # Save final model
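    # Only the rank-0 process writes the final artifacts, avoiding duplicate writes
    # when running DDP across multiple GPUs.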
    if trainer.is_global_zero:
        output_dir = "final_model"
        os.makedirs(output_dir, exist_ok=True)
        model.model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)