final model
model.py CHANGED
```diff
@@ -10,14 +10,17 @@ from pytorch_lightning.callbacks import (
     LearningRateMonitor,
     RichProgressBar,
 )
+
 from pytorch_lightning.loggers import TensorBoardLogger
 from torch.nn.utils.rnn import pad_sequence
 from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme
+from pytorch_lightning.callbacks import TQDMProgressBar
 
 # Set environment variable for memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
 
+
 # Function to log GPU memory usage
 def log_memory_usage(step):
     if torch.cuda.is_available():
```
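Note: the body of `log_memory_usage` is truncated in the hunk above. A minimal sketch of such a helper, assuming the standard `torch.cuda` memory counters (everything past the visible `if` line is an assumption, not this file's code):

```python
import torch

def log_memory_usage(step):
    # Only meaningful when a CUDA device is present (the one line visible in the diff).
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3  # GiB currently allocated to tensors
        reserved = torch.cuda.memory_reserved() / 1024**3    # GiB held by the caching allocator
        print(f"step {step}: allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")
```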
```diff
@@ -99,9 +102,9 @@ class SmolLMModule(LightningModule):
             "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
         )  # Log loss
 
-        # Log memory usage
-        if batch_idx % 10 == 0:
-            log_memory_usage(batch_idx)
+        # # Log memory usage
+        # if batch_idx % 10 == 0:
+        #     log_memory_usage(batch_idx)
 
         # Release intermediate tensors
         del outputs
```
```diff
@@ -162,39 +165,41 @@ if __name__ == "__main__":
         mode="min",  # Lower loss is better
         save_top_k=3,  # Save the best 3 models
         save_last=True,  # Additionally save the last model
-        every_n_train_steps=
+        every_n_train_steps=5000,  # Save every 5000 steps
         save_weights_only=False,  # Save the full model state
         auto_insert_metric_name=False,  # Don't insert metric name in filename
     )
 
     # Progress bar
-    progress_bar = RichProgressBar(
-        refresh_rate=1,
-        leave=False,
-        theme=RichProgressBarTheme(
-            description="",
-            progress_bar="#6206E0",
-            progress_bar_finished="#6206E0",
-            progress_bar_pulse="#6206E0",
-            batch_progress="",
-            time="dim",
-            processing_speed="dim underline",
-            metrics="italic",
-            metrics_text_delimiter=" ",
-            metrics_format=".3f",
-        ),
-        console_kwargs=None,
-    )
+    # progress_bar = RichProgressBar(
+    #     refresh_rate=1,
+    #     leave=False,
+    #     theme=RichProgressBarTheme(
+    #         description="",
+    #         progress_bar="#6206E0",
+    #         progress_bar_finished="#6206E0",
+    #         progress_bar_pulse="#6206E0",
+    #         batch_progress="",
+    #         time="dim",
+    #         processing_speed="dim underline",
+    #         metrics="italic",
+    #         metrics_text_delimiter=" ",
+    #         metrics_format=".3f",
+    #     ),
+    #     console_kwargs=None,
+    # )
+    progress_bar = TQDMProgressBar(refresh_rate=10)
 
     # Create trainer
     trainer = Trainer(
         logger=logger,
-        strategy="
+        strategy="ddp_notebook",
         accelerator="gpu",
         devices=2,
         precision="16-mixed",
-        max_steps=
+        max_steps=500000,
         accumulate_grad_batches=1,
+        enable_checkpointing=True,
         callbacks=[
             LearningRateMonitor(logging_interval="step"),
             progress_bar,
```
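The substantive changes in this hunk are a fixed checkpoint cadence and the swap from RichProgressBar to TQDMProgressBar. The same configuration in isolation, as a runnable sketch (`dirpath` and `monitor` are assumptions here, since those ModelCheckpoint arguments sit outside the hunk):

```python
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",     # assumed; set outside the lines shown above
    monitor="train_loss",      # assumed; matches the metric logged in training_step
    mode="min",
    save_top_k=3,
    save_last=True,            # also writes last.ckpt, used for resumption
    every_n_train_steps=5000,  # checkpoint every 5000 training steps
    save_weights_only=False,
    auto_insert_metric_name=False,
)

# TQDM redraws every 10 batches rather than every batch (the RichProgressBar
# above used refresh_rate=1), keeping console output light over a 500k-step run.
progress_bar = TQDMProgressBar(refresh_rate=10)
```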
```diff
@@ -215,10 +220,14 @@ if __name__ == "__main__":
 
     # Train with automatic checkpoint resumption
    trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)
+    optimizers = trainer.optimizers
+    if optimizers:
+        optimizer = optimizers[0]
+        print("optimizer state:", optimizer.state_dict())
 
     # After training, print the best model path and score
     print(f"Best model path: {checkpoint_callback.best_model_path}")
-    print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
+    # print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
 
     # Save final model
     if trainer.is_global_zero:
```
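`resume_from_checkpoint` is defined outside the hunks shown here. A pattern consistent with `save_last=True` above would be to resume from the checkpoint directory's last.ckpt when one exists (a hypothetical reconstruction, not this file's code):

```python
import os

# Hypothetical: save_last=True makes ModelCheckpoint write/update "last.ckpt"
# in its dirpath, so a restarted run can pick up from there automatically.
ckpt_dir = "checkpoints"  # assumed checkpoint dirpath
last_ckpt = os.path.join(ckpt_dir, "last.ckpt")
resume_from_checkpoint = last_ckpt if os.path.exists(last_ckpt) else None
# trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)
```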