Model save

Browse files

Files changed (5) hide show

README.md +1 -1
all_results.json +5 -5
generation_config.json +10 -5
train_results.json +5 -5
trainer_state.json +588 -112

README.md CHANGED Viewed

@@ -26,7 +26,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/648848014/huggingface/runs/7eblkip7)
 This model was trained with SFT.

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/648848014/huggingface/runs/bx5kbmeh)
 This model was trained with SFT.

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "total_flos": 2.926914460515809e+17,
-    "train_loss": 0.0,
-    "train_runtime": 1.5655,
     "train_samples": 3472,
-    "train_samples_per_second": 1232.8,
-    "train_steps_per_second": 83.038
 }

 {
+    "total_flos": 2.4184401017438208e+17,
+    "train_loss": 0.6382126904548483,
+    "train_runtime": 2326.1115,
     "train_samples": 3472,
+    "train_samples_per_second": 3.229,
+    "train_steps_per_second": 0.202
 }

generation_config.json CHANGED Viewed

@@ -1,9 +1,14 @@
 {
-  "_from_model_config": true,
-  "bos_token_id": 151646,
   "do_sample": true,
-  "eos_token_id": 151643,
-  "temperature": 0.6,
-  "top_p": 0.95,
   "transformers_version": "4.50.0.dev0"
 }

 {
+  "bos_token_id": 151643,
   "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
   "transformers_version": "4.50.0.dev0"
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "total_flos": 2.926914460515809e+17,
-    "train_loss": 0.0,
-    "train_runtime": 1.5655,
     "train_samples": 3472,
-    "train_samples_per_second": 1232.8,
-    "train_steps_per_second": 83.038
 }

 {
+    "total_flos": 2.4184401017438208e+17,
+    "train_loss": 0.6382126904548483,
+    "train_runtime": 2326.1115,
     "train_samples": 3472,
+    "train_samples_per_second": 3.229,
+    "train_steps_per_second": 0.202
 }

trainer_state.json CHANGED Viewed

@@ -3,205 +3,681 @@
   "best_model_checkpoint": null,
   "epoch": 10.0,
   "eval_steps": 500,
-  "global_step": 130,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.38461538461538464,
-      "grad_norm": 0.9453125,
-      "learning_rate": 3.875e-05,
-      "loss": 1.7199,
       "step": 5
     },
     {
-      "epoch": 0.7692307692307693,
-      "grad_norm": 0.5234375,
-      "learning_rate": 1.1590097423302684e-05,
-      "loss": 1.4892,
       "step": 10
     },
     {
-      "epoch": 1.1538461538461537,
-      "grad_norm": 0.478515625,
-      "learning_rate": 4.953193036870676e-05,
-      "loss": 1.4358,
       "step": 15
     },
     {
-      "epoch": 1.5384615384615383,
-      "grad_norm": 0.4921875,
-      "learning_rate": 4.877104772313846e-05,
-      "loss": 1.4053,
       "step": 20
     },
     {
-      "epoch": 1.9230769230769231,
-      "grad_norm": 0.40234375,
-      "learning_rate": 4.766372503162375e-05,
-      "loss": 1.3156,
       "step": 25
     },
     {
-      "epoch": 2.3076923076923075,
-      "grad_norm": 0.380859375,
-      "learning_rate": 4.622799718031961e-05,
-      "loss": 1.2542,
       "step": 30
     },
     {
-      "epoch": 2.6923076923076925,
-      "grad_norm": 0.341796875,
-      "learning_rate": 4.448724776693342e-05,
-      "loss": 1.2351,
       "step": 35
     },
     {
-      "epoch": 3.076923076923077,
-      "grad_norm": 0.318359375,
-      "learning_rate": 4.246982825372522e-05,
-      "loss": 1.2441,
       "step": 40
     },
     {
-      "epoch": 3.4615384615384617,
-      "grad_norm": 0.31640625,
-      "learning_rate": 4.020859620925235e-05,
-      "loss": 1.1722,
       "step": 45
     },
     {
-      "epoch": 3.8461538461538463,
-      "grad_norm": 0.302734375,
-      "learning_rate": 3.77403801594802e-05,
-      "loss": 1.1937,
       "step": 50
     },
     {
-      "epoch": 4.230769230769231,
-      "grad_norm": 0.298828125,
-      "learning_rate": 3.510537976419131e-05,
-      "loss": 1.1898,
       "step": 55
     },
     {
-      "epoch": 4.615384615384615,
-      "grad_norm": 0.306640625,
-      "learning_rate": 3.234651108797708e-05,
-      "loss": 1.1526,
       "step": 60
     },
     {
-      "epoch": 5.0,
-      "grad_norm": 1.0703125,
-      "learning_rate": 2.9508707629336874e-05,
-      "loss": 1.1345,
       "step": 65
     },
     {
-      "epoch": 5.384615384615385,
-      "grad_norm": 0.2890625,
-      "learning_rate": 2.6638188491974207e-05,
-      "loss": 1.1493,
       "step": 70
     },
     {
-      "epoch": 5.769230769230769,
-      "grad_norm": 0.28515625,
-      "learning_rate": 2.378170561753209e-05,
-      "loss": 1.121,
       "step": 75
     },
     {
-      "epoch": 6.153846153846154,
-      "grad_norm": 0.2734375,
-      "learning_rate": 2.098578234003466e-05,
-      "loss": 1.1091,
       "step": 80
     },
     {
-      "epoch": 6.538461538461538,
-      "grad_norm": 0.2734375,
-      "learning_rate": 1.8295955663644855e-05,
-      "loss": 1.1314,
       "step": 85
     },
     {
-      "epoch": 6.923076923076923,
-      "grad_norm": 0.271484375,
-      "learning_rate": 1.575603460470665e-05,
-      "loss": 1.1291,
       "step": 90
     },
     {
-      "epoch": 7.3076923076923075,
-      "grad_norm": 0.271484375,
-      "learning_rate": 1.3407386677402312e-05,
-      "loss": 1.123,
       "step": 95
     },
     {
-      "epoch": 7.6923076923076925,
-      "grad_norm": 0.28125,
-      "learning_rate": 1.1288264143982296e-05,
-      "loss": 1.1186,
       "step": 100
     },
     {
-      "epoch": 8.076923076923077,
-      "grad_norm": 0.259765625,
-      "learning_rate": 9.433181002882383e-06,
-      "loss": 1.096,
       "step": 105
     },
     {
-      "epoch": 8.461538461538462,
-      "grad_norm": 0.275390625,
-      "learning_rate": 7.872350861678565e-06,
-      "loss": 1.1196,
       "step": 110
     },
     {
-      "epoch": 8.846153846153847,
-      "grad_norm": 0.255859375,
-      "learning_rate": 6.631194850202872e-06,
-      "loss": 1.1171,
       "step": 115
     },
     {
-      "epoch": 9.23076923076923,
-      "grad_norm": 0.283203125,
-      "learning_rate": 5.729927588404395e-06,
-      "loss": 1.1002,
       "step": 120
     },
     {
-      "epoch": 9.615384615384615,
-      "grad_norm": 0.259765625,
-      "learning_rate": 5.1832279522675925e-06,
-      "loss": 1.0983,
       "step": 125
     },
     {
-      "epoch": 10.0,
       "grad_norm": 0.96484375,
       "learning_rate": 5e-06,
-      "loss": 1.095,
-      "step": 130
     },
     {
       "epoch": 10.0,
-      "step": 130,
-      "total_flos": 2.926914460515809e+17,
-      "train_loss": 0.0,
-      "train_runtime": 1.5655,
-      "train_samples_per_second": 1232.8,
-      "train_steps_per_second": 83.038
     }
   ],
   "logging_steps": 5,
-  "max_steps": 130,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 100,
@@ -217,7 +693,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.926914460515809e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 470,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.10638297872340426,
+      "grad_norm": 3.921875,
+      "learning_rate": 1.0416666666666668e-05,
+      "loss": 1.6159,
       "step": 5
     },
     {
+      "epoch": 0.2127659574468085,
+      "grad_norm": 1.796875,
+      "learning_rate": 2.0833333333333336e-05,
+      "loss": 1.4989,
       "step": 10
     },
     {
+      "epoch": 0.3191489361702128,
+      "grad_norm": 1.4609375,
+      "learning_rate": 3.125e-05,
+      "loss": 1.3278,
       "step": 15
     },
     {
+      "epoch": 0.425531914893617,
+      "grad_norm": 1.2421875,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.2225,
       "step": 20
     },
     {
+      "epoch": 0.5319148936170213,
+      "grad_norm": 1.2109375,
+      "learning_rate": 4.999944181166713e-05,
+      "loss": 1.1515,
       "step": 25
     },
     {
+      "epoch": 0.6382978723404256,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.997990812788907e-05,
+      "loss": 1.1002,
       "step": 30
     },
     {
+      "epoch": 0.7446808510638298,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.993249271677745e-05,
+      "loss": 1.0933,
       "step": 35
     },
     {
+      "epoch": 0.851063829787234,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.985725438745002e-05,
+      "loss": 1.0604,
       "step": 40
     },
     {
+      "epoch": 0.9574468085106383,
+      "grad_norm": 1.0546875,
+      "learning_rate": 4.975428645766288e-05,
+      "loss": 1.0401,
       "step": 45
     },
     {
+      "epoch": 1.0638297872340425,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.962371663806883e-05,
+      "loss": 0.9942,
       "step": 50
     },
     {
+      "epoch": 1.1702127659574468,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.9465706873818514e-05,
+      "loss": 0.9376,
       "step": 55
     },
     {
+      "epoch": 1.2765957446808511,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.928045314370053e-05,
+      "loss": 0.9253,
       "step": 60
     },
     {
+      "epoch": 1.3829787234042552,
+      "grad_norm": 1.0234375,
+      "learning_rate": 4.906818521706987e-05,
+      "loss": 0.9234,
       "step": 65
     },
     {
+      "epoch": 1.4893617021276595,
+      "grad_norm": 0.9375,
+      "learning_rate": 4.8829166368866006e-05,
+      "loss": 0.925,
       "step": 70
     },
     {
+      "epoch": 1.5957446808510638,
+      "grad_norm": 0.96484375,
+      "learning_rate": 4.8563693053074235e-05,
+      "loss": 0.9129,
       "step": 75
     },
     {
+      "epoch": 1.702127659574468,
+      "grad_norm": 0.97265625,
+      "learning_rate": 4.8272094535035095e-05,
+      "loss": 0.8839,
       "step": 80
     },
     {
+      "epoch": 1.8085106382978724,
+      "grad_norm": 0.9140625,
+      "learning_rate": 4.7954732483058103e-05,
+      "loss": 0.8724,
       "step": 85
     },
     {
+      "epoch": 1.9148936170212765,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.761200051984621e-05,
+      "loss": 0.8852,
       "step": 90
     },
     {
+      "epoch": 2.021276595744681,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.724432373428734e-05,
+      "loss": 0.8763,
       "step": 95
     },
     {
+      "epoch": 2.127659574468085,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.6852158154218594e-05,
+      "loss": 0.7757,
       "step": 100
     },
     {
+      "epoch": 2.2340425531914896,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.643599018081702e-05,
+      "loss": 0.7828,
       "step": 105
     },
     {
+      "epoch": 2.3404255319148937,
+      "grad_norm": 1.109375,
+      "learning_rate": 4.5996335985318416e-05,
+      "loss": 0.794,
       "step": 110
     },
     {
+      "epoch": 2.4468085106382977,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.553374086881255e-05,
+      "loss": 0.7854,
       "step": 115
     },
     {
+      "epoch": 2.5531914893617023,
+      "grad_norm": 1.140625,
+      "learning_rate": 4.5048778585908696e-05,
+      "loss": 0.7727,
       "step": 120
     },
     {
+      "epoch": 2.6595744680851063,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.454205063311039e-05,
+      "loss": 0.7748,
       "step": 125
     },
     {
+      "epoch": 2.7659574468085104,
+      "grad_norm": 0.94921875,
+      "learning_rate": 4.401418550278211e-05,
+      "loss": 0.7644,
+      "step": 130
+    },
+    {
+      "epoch": 2.872340425531915,
+      "grad_norm": 1.140625,
+      "learning_rate": 4.346583790363301e-05,
+      "loss": 0.7659,
+      "step": 135
+    },
+    {
+      "epoch": 2.978723404255319,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.289768794868476e-05,
+      "loss": 0.7569,
+      "step": 140
+    },
+    {
+      "epoch": 3.0851063829787235,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.231044031173043e-05,
+      "loss": 0.6841,
+      "step": 145
+    },
+    {
+      "epoch": 3.1914893617021276,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.170482335333083e-05,
+      "loss": 0.6804,
+      "step": 150
+    },
+    {
+      "epoch": 3.297872340425532,
+      "grad_norm": 1.09375,
+      "learning_rate": 4.10815882174322e-05,
+      "loss": 0.6608,
+      "step": 155
+    },
+    {
+      "epoch": 3.404255319148936,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.0441507899725865e-05,
+      "loss": 0.6677,
+      "step": 160
+    },
+    {
+      "epoch": 3.5106382978723403,
+      "grad_norm": 0.96484375,
+      "learning_rate": 3.978537628890514e-05,
+      "loss": 0.6568,
+      "step": 165
+    },
+    {
+      "epoch": 3.617021276595745,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.9114007182008886e-05,
+      "loss": 0.6551,
+      "step": 170
+    },
+    {
+      "epoch": 3.723404255319149,
+      "grad_norm": 1.078125,
+      "learning_rate": 3.842823327507267e-05,
+      "loss": 0.6403,
+      "step": 175
+    },
+    {
+      "epoch": 3.829787234042553,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.772890513033978e-05,
+      "loss": 0.673,
+      "step": 180
+    },
+    {
+      "epoch": 3.9361702127659575,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.701689012131268e-05,
+      "loss": 0.6611,
+      "step": 185
+    },
+    {
+      "epoch": 4.042553191489362,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.629307135695365e-05,
+      "loss": 0.6185,
+      "step": 190
+    },
+    {
+      "epoch": 4.148936170212766,
+      "grad_norm": 1.0078125,
+      "learning_rate": 3.555834658636878e-05,
+      "loss": 0.5792,
+      "step": 195
+    },
+    {
+      "epoch": 4.25531914893617,
+      "grad_norm": 1.140625,
+      "learning_rate": 3.4813627085333855e-05,
+      "loss": 0.556,
+      "step": 200
+    },
+    {
+      "epoch": 4.361702127659575,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.405983652604315e-05,
+      "loss": 0.5776,
+      "step": 205
+    },
+    {
+      "epoch": 4.468085106382979,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.3297909831483065e-05,
+      "loss": 0.573,
+      "step": 210
+    },
+    {
+      "epoch": 4.574468085106383,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.252879201585139e-05,
+      "loss": 0.5789,
+      "step": 215
+    },
+    {
+      "epoch": 4.680851063829787,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.175343701246051e-05,
+      "loss": 0.574,
+      "step": 220
+    },
+    {
+      "epoch": 4.787234042553192,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.097280649057833e-05,
+      "loss": 0.5644,
+      "step": 225
+    },
+    {
+      "epoch": 4.8936170212765955,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.0187868662674168e-05,
+      "loss": 0.5646,
+      "step": 230
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.98828125,
+      "learning_rate": 2.9399597083549386e-05,
+      "loss": 0.566,
+      "step": 235
+    },
+    {
+      "epoch": 5.1063829787234045,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.8608969442841698e-05,
+      "loss": 0.5118,
+      "step": 240
+    },
+    {
+      "epoch": 5.212765957446808,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.781696635240128e-05,
+      "loss": 0.5009,
+      "step": 245
+    },
+    {
+      "epoch": 5.319148936170213,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.702457013004228e-05,
+      "loss": 0.5057,
+      "step": 250
+    },
+    {
+      "epoch": 5.425531914893617,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.623276358117861e-05,
+      "loss": 0.5049,
+      "step": 255
+    },
+    {
+      "epoch": 5.531914893617021,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.5442528779854824e-05,
+      "loss": 0.511,
+      "step": 260
+    },
+    {
+      "epoch": 5.638297872340425,
+      "grad_norm": 1.0,
+      "learning_rate": 2.4654845850684177e-05,
+      "loss": 0.4898,
+      "step": 265
+    },
+    {
+      "epoch": 5.74468085106383,
+      "grad_norm": 0.9921875,
+      "learning_rate": 2.3870691753204553e-05,
+      "loss": 0.5094,
+      "step": 270
+    },
+    {
+      "epoch": 5.851063829787234,
+      "grad_norm": 0.9921875,
+      "learning_rate": 2.3091039070160044e-05,
+      "loss": 0.5061,
+      "step": 275
+    },
+    {
+      "epoch": 5.957446808510638,
+      "grad_norm": 1.0,
+      "learning_rate": 2.2316854801210935e-05,
+      "loss": 0.4936,
+      "step": 280
+    },
+    {
+      "epoch": 6.0638297872340425,
+      "grad_norm": 0.97265625,
+      "learning_rate": 2.1549099163568527e-05,
+      "loss": 0.4771,
+      "step": 285
+    },
+    {
+      "epoch": 6.170212765957447,
+      "grad_norm": 0.98046875,
+      "learning_rate": 2.0788724401042096e-05,
+      "loss": 0.4644,
+      "step": 290
+    },
+    {
+      "epoch": 6.276595744680851,
+      "grad_norm": 1.0,
+      "learning_rate": 2.0036673602975203e-05,
+      "loss": 0.4519,
+      "step": 295
+    },
+    {
+      "epoch": 6.382978723404255,
       "grad_norm": 0.96484375,
+      "learning_rate": 1.9293879534536358e-05,
+      "loss": 0.4626,
+      "step": 300
+    },
+    {
+      "epoch": 6.48936170212766,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.8561263479814583e-05,
+      "loss": 0.4594,
+      "step": 305
+    },
+    {
+      "epoch": 6.595744680851064,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.783973409915503e-05,
+      "loss": 0.4575,
+      "step": 310
+    },
+    {
+      "epoch": 6.702127659574468,
+      "grad_norm": 0.97265625,
+      "learning_rate": 1.713018630215162e-05,
+      "loss": 0.4595,
+      "step": 315
+    },
+    {
+      "epoch": 6.808510638297872,
+      "grad_norm": 0.984375,
+      "learning_rate": 1.6433500137694846e-05,
+      "loss": 0.461,
+      "step": 320
+    },
+    {
+      "epoch": 6.914893617021277,
+      "grad_norm": 0.98828125,
+      "learning_rate": 1.5750539702451116e-05,
+      "loss": 0.4578,
+      "step": 325
+    },
+    {
+      "epoch": 7.0212765957446805,
+      "grad_norm": 0.9921875,
+      "learning_rate": 1.5082152069127619e-05,
+      "loss": 0.4584,
+      "step": 330
+    },
+    {
+      "epoch": 7.127659574468085,
+      "grad_norm": 0.94921875,
+      "learning_rate": 1.442916623585189e-05,
+      "loss": 0.4262,
+      "step": 335
+    },
+    {
+      "epoch": 7.23404255319149,
+      "grad_norm": 0.95703125,
+      "learning_rate": 1.3792392097969259e-05,
+      "loss": 0.4352,
+      "step": 340
+    },
+    {
+      "epoch": 7.340425531914893,
+      "grad_norm": 0.9375,
+      "learning_rate": 1.3172619443533333e-05,
+      "loss": 0.4332,
+      "step": 345
+    },
+    {
+      "epoch": 7.446808510638298,
+      "grad_norm": 0.94140625,
+      "learning_rate": 1.2570616973735466e-05,
+      "loss": 0.4407,
+      "step": 350
+    },
+    {
+      "epoch": 7.553191489361702,
+      "grad_norm": 0.97265625,
+      "learning_rate": 1.1987131349488227e-05,
+      "loss": 0.4204,
+      "step": 355
+    },
+    {
+      "epoch": 7.659574468085106,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.1422886265345257e-05,
+      "loss": 0.4319,
+      "step": 360
+    },
+    {
+      "epoch": 7.76595744680851,
+      "grad_norm": 0.98828125,
+      "learning_rate": 1.0878581551906195e-05,
+      "loss": 0.4308,
+      "step": 365
+    },
+    {
+      "epoch": 7.872340425531915,
+      "grad_norm": 0.921875,
+      "learning_rate": 1.0354892307820033e-05,
+      "loss": 0.4462,
+      "step": 370
+    },
+    {
+      "epoch": 7.9787234042553195,
+      "grad_norm": 0.94921875,
+      "learning_rate": 9.852468062463322e-06,
+      "loss": 0.4336,
+      "step": 375
+    },
+    {
+      "epoch": 8.085106382978724,
+      "grad_norm": 0.96875,
+      "learning_rate": 9.371931970331851e-06,
+      "loss": 0.4244,
+      "step": 380
+    },
+    {
+      "epoch": 8.191489361702128,
+      "grad_norm": 0.953125,
+      "learning_rate": 8.913880038144973e-06,
+      "loss": 0.4271,
+      "step": 385
+    },
+    {
+      "epoch": 8.297872340425531,
+      "grad_norm": 0.96484375,
+      "learning_rate": 8.478880385621198e-06,
+      "loss": 0.4232,
+      "step": 390
+    },
+    {
+      "epoch": 8.404255319148936,
+      "grad_norm": 0.921875,
+      "learning_rate": 8.067472540841918e-06,
+      "loss": 0.4183,
+      "step": 395
+    },
+    {
+      "epoch": 8.51063829787234,
+      "grad_norm": 0.95703125,
+      "learning_rate": 7.680166771077155e-06,
+      "loss": 0.4184,
+      "step": 400
+    },
+    {
+      "epoch": 8.617021276595745,
+      "grad_norm": 0.9296875,
+      "learning_rate": 7.317443449903435e-06,
+      "loss": 0.4267,
+      "step": 405
+    },
+    {
+      "epoch": 8.72340425531915,
+      "grad_norm": 0.95703125,
+      "learning_rate": 6.9797524613986255e-06,
+      "loss": 0.4276,
+      "step": 410
+    },
+    {
+      "epoch": 8.829787234042554,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.667512642152743e-06,
+      "loss": 0.4178,
+      "step": 415
+    },
+    {
+      "epoch": 8.936170212765958,
+      "grad_norm": 0.95703125,
+      "learning_rate": 6.381111261786892e-06,
+      "loss": 0.4223,
+      "step": 420
+    },
+    {
+      "epoch": 9.042553191489361,
+      "grad_norm": 0.890625,
+      "learning_rate": 6.120903542624479e-06,
+      "loss": 0.4238,
+      "step": 425
+    },
+    {
+      "epoch": 9.148936170212766,
+      "grad_norm": 0.9296875,
+      "learning_rate": 5.887212219110616e-06,
+      "loss": 0.4208,
+      "step": 430
+    },
+    {
+      "epoch": 9.25531914893617,
+      "grad_norm": 0.921875,
+      "learning_rate": 5.6803271375260525e-06,
+      "loss": 0.4079,
+      "step": 435
+    },
+    {
+      "epoch": 9.361702127659575,
+      "grad_norm": 0.93359375,
+      "learning_rate": 5.500504896492159e-06,
+      "loss": 0.4287,
+      "step": 440
+    },
+    {
+      "epoch": 9.46808510638298,
+      "grad_norm": 0.9453125,
+      "learning_rate": 5.347968528712824e-06,
+      "loss": 0.426,
+      "step": 445
+    },
+    {
+      "epoch": 9.574468085106384,
+      "grad_norm": 0.97265625,
+      "learning_rate": 5.222907224348022e-06,
+      "loss": 0.4077,
+      "step": 450
+    },
+    {
+      "epoch": 9.680851063829786,
+      "grad_norm": 0.8984375,
+      "learning_rate": 5.125476096362112e-06,
+      "loss": 0.4208,
+      "step": 455
+    },
+    {
+      "epoch": 9.787234042553191,
+      "grad_norm": 0.94140625,
+      "learning_rate": 5.055795988137946e-06,
+      "loss": 0.4104,
+      "step": 460
+    },
+    {
+      "epoch": 9.893617021276595,
+      "grad_norm": 0.93359375,
+      "learning_rate": 5.013953323595361e-06,
+      "loss": 0.4249,
+      "step": 465
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.94921875,
       "learning_rate": 5e-06,
+      "loss": 0.4235,
+      "step": 470
     },
     {
       "epoch": 10.0,
+      "step": 470,
+      "total_flos": 2.4184401017438208e+17,
+      "train_loss": 0.6382126904548483,
+      "train_runtime": 2326.1115,
+      "train_samples_per_second": 3.229,
+      "train_steps_per_second": 0.202
     }
   ],
   "logging_steps": 5,
+  "max_steps": 470,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 2.4184401017438208e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null