Vibi007 committed on
Commit
788974d
·
1 Parent(s): bdcd6d0

final model

Browse files
Files changed (1) hide show
  1. model.py +33 -24
model.py CHANGED
@@ -10,14 +10,17 @@ from pytorch_lightning.callbacks import (
10
  LearningRateMonitor,
11
  RichProgressBar,
12
  )
 
13
  from pytorch_lightning.loggers import TensorBoardLogger
14
  from torch.nn.utils.rnn import pad_sequence
15
  from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme
 
16
 
17
  # Set environment variable for memory management
18
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
19
 
20
 
 
21
  # Function to log GPU memory usage
22
  def log_memory_usage(step):
23
  if torch.cuda.is_available():
@@ -99,9 +102,9 @@ class SmolLMModule(LightningModule):
99
  "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
100
  ) # Log loss
101
 
102
- # Log memory usage
103
- if batch_idx % 10 == 0:
104
- log_memory_usage(batch_idx)
105
 
106
  # Release intermediate tensors
107
  del outputs
@@ -162,39 +165,41 @@ if __name__ == "__main__":
162
  mode="min", # Lower loss is better
163
  save_top_k=3, # Save the best 3 models
164
  save_last=True, # Additionally save the last model
165
- every_n_train_steps=500, # Save every 500 steps
166
  save_weights_only=False, # Save the full model state
167
  auto_insert_metric_name=False, # Don't insert metric name in filename
168
  )
169
 
170
  # Progress bar
171
- progress_bar = RichProgressBar(
172
- refresh_rate=1,
173
- leave=False,
174
- theme=RichProgressBarTheme(
175
- description="",
176
- progress_bar="#6206E0",
177
- progress_bar_finished="#6206E0",
178
- progress_bar_pulse="#6206E0",
179
- batch_progress="",
180
- time="dim",
181
- processing_speed="dim underline",
182
- metrics="italic",
183
- metrics_text_delimiter=" ",
184
- metrics_format=".3f",
185
- ),
186
- console_kwargs=None,
187
- )
 
188
 
189
  # Create trainer
190
  trainer = Trainer(
191
  logger=logger,
192
- strategy="ddp",
193
  accelerator="gpu",
194
  devices=2,
195
  precision="16-mixed",
196
- max_steps=5000,
197
  accumulate_grad_batches=1,
 
198
  callbacks=[
199
  LearningRateMonitor(logging_interval="step"),
200
  progress_bar,
@@ -215,10 +220,14 @@ if __name__ == "__main__":
215
 
216
  # Train with automatic checkpoint resumption
217
  trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)
 
 
 
 
218
 
219
  # After training, print the best model path and score
220
  print(f"Best model path: {checkpoint_callback.best_model_path}")
221
- print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
222
 
223
  # Save final model
224
  if trainer.is_global_zero:
 
10
  LearningRateMonitor,
11
  RichProgressBar,
12
  )
13
+
14
  from pytorch_lightning.loggers import TensorBoardLogger
15
  from torch.nn.utils.rnn import pad_sequence
16
  from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme
17
+ from pytorch_lightning.callbacks import TQDMProgressBar
18
 
19
  # Set environment variable for memory management
20
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
21
 
22
 
23
+
24
  # Function to log GPU memory usage
25
  def log_memory_usage(step):
26
  if torch.cuda.is_available():
 
102
  "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
103
  ) # Log loss
104
 
105
+ # # Log memory usage
106
+ # if batch_idx % 10 == 0:
107
+ # log_memory_usage(batch_idx)
108
 
109
  # Release intermediate tensors
110
  del outputs
 
165
  mode="min", # Lower loss is better
166
  save_top_k=3, # Save the best 3 models
167
  save_last=True, # Additionally save the last model
168
+ every_n_train_steps=5000, # Save every 500 steps
169
  save_weights_only=False, # Save the full model state
170
  auto_insert_metric_name=False, # Don't insert metric name in filename
171
  )
172
 
173
  # Progress bar
174
+ # progress_bar = RichProgressBar(
175
+ # refresh_rate=1,
176
+ # leave=False,
177
+ # theme=RichProgressBarTheme(
178
+ # description="",
179
+ # progress_bar="#6206E0",
180
+ # progress_bar_finished="#6206E0",
181
+ # progress_bar_pulse="#6206E0",
182
+ # batch_progress="",
183
+ # time="dim",
184
+ # processing_speed="dim underline",
185
+ # metrics="italic",
186
+ # metrics_text_delimiter=" ",
187
+ # metrics_format=".3f",
188
+ # ),
189
+ # console_kwargs=None,
190
+ # )
191
+ progress_bar = TQDMProgressBar(refresh_rate=10)
192
 
193
  # Create trainer
194
  trainer = Trainer(
195
  logger=logger,
196
+ strategy="ddp_notebook",
197
  accelerator="gpu",
198
  devices=2,
199
  precision="16-mixed",
200
+ max_steps=500000,
201
  accumulate_grad_batches=1,
202
+ enable_checkpointing = True,
203
  callbacks=[
204
  LearningRateMonitor(logging_interval="step"),
205
  progress_bar,
 
220
 
221
  # Train with automatic checkpoint resumption
222
  trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)
223
+ optimizers = trainer.optimizers
224
+ if optimizers:
225
+ optimizer = optimizers[0]
226
+ print("optimizer state:",optimizer.state_dict())
227
 
228
  # After training, print the best model path and score
229
  print(f"Best model path: {checkpoint_callback.best_model_path}")
230
+ # print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
231
 
232
  # Save final model
233
  if trainer.is_global_zero: