chansung committed · verified
Commit 3da6413 · Parent(s): 0072320

Model save
README.md ADDED
@@ -0,0 +1,78 @@
---
library_name: peft
license: llama3.2
base_model: meta-llama/Llama-3.2-3B
tags:
- trl
- sft
- generated_from_trainer
datasets:
- generator
model-index:
- name: llama3.1-3b-coding-gpt4o-100k2
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# llama3.1-3b-coding-gpt4o-100k2

This model is a fine-tuned version of [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) on the generator dataset.
It achieves the following results on the evaluation set:
- Loss: 1.6301

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.002
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- gradient_accumulation_steps: 2
- total_train_batch_size: 256
- total_eval_batch_size: 128
- optimizer: AdamW (torch) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 10
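
The effective batch size and learning-rate schedule implied by these hyperparameters can be cross-checked against the training log in `trainer_state.json`. A minimal sketch, assuming the standard linear-warmup + cosine schedule and `warmup_steps = ceil(0.1 * 670) = 67` (an assumption, but consistent with the logged learning rate at step 1, ≈2.985e-05 = 0.002/67):

```python
import math

# Values from the hyperparameter list above.
PEAK_LR = 0.002
TRAIN_BS, GRAD_ACCUM, NUM_DEVICES = 16, 2, 8
MAX_STEPS, WARMUP_RATIO = 670, 0.1

# Effective (total) train batch size: per-device batch * accumulation steps * devices.
total_train_batch_size = TRAIN_BS * GRAD_ACCUM * NUM_DEVICES  # 256

# Warmup steps, assuming the usual ceil(ratio * max_steps) rounding.
warmup_steps = math.ceil(WARMUP_RATIO * MAX_STEPS)  # 67

def lr_at(step: int) -> float:
    """Linear warmup to PEAK_LR, then cosine decay to 0. A sketch of the
    `cosine` schedule, not the library implementation itself."""
    if step < warmup_steps:
        return PEAK_LR * step / warmup_steps
    progress = (step - warmup_steps) / (MAX_STEPS - warmup_steps)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

print(total_train_batch_size)  # 256, matching total_train_batch_size above
print(lr_at(1))                # ~2.985e-05, matching the first logged learning_rate
print(lr_at(670))              # 0.0, matching the final logged learning_rate
```

This also explains why the logged learning rate peaks just under 0.002 around step 70 and reaches exactly 0.0 at step 670.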

### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 1.0031        | 1.0    | 68   | 1.5510          |
| 0.9546        | 2.0    | 136  | 1.5149          |
| 0.936         | 3.0    | 204  | 1.5085          |
| 0.9186        | 4.0    | 272  | 1.5175          |
| 0.8948        | 5.0    | 340  | 1.5302          |
| 0.8742        | 6.0    | 408  | 1.5502          |
| 0.8556        | 7.0    | 476  | 1.5617          |
| 0.8428        | 8.0    | 544  | 1.5965          |
| 0.8168        | 9.0    | 612  | 1.6217          |
| 0.8191        | 9.8593 | 670  | 1.6301          |


### Framework versions

- PEFT 0.15.1
- Transformers 4.50.3
- Pytorch 2.6.0+cu124
- Datasets 3.5.0
- Tokenizers 0.21.1
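
Since this commit ships a PEFT (LoRA) adapter rather than full model weights (`library_name: peft`), it would typically be loaded on top of the base model. A hedged usage sketch: the repo id `chansung/llama3.1-3b-coding-gpt4o-100k2` is assumed from the committer and model name, and access to the gated base model is required.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "meta-llama/Llama-3.2-3B"
ADAPTER = "chansung/llama3.1-3b-coding-gpt4o-100k2"  # assumed repo id

# Load the base model, then attach the adapter weights from this commit.
base_model = AutoModelForCausalLM.from_pretrained(BASE)
model = PeftModel.from_pretrained(base_model, ADAPTER)
tokenizer = AutoTokenizer.from_pretrained(BASE)
```

`PeftModel.from_pretrained` keeps the adapter separate; `model.merge_and_unload()` would fold it into the base weights if a standalone model is preferred.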
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:37b5a064c89f2c05e500778b82c18fa3b5b228097593fbba33ba3e96358a102a
+ oid sha256:68e6c34ff5b83d4da958c29a94d06b20fa3f6c199f168e1e0b501445b8a3d7a3
  size 1612749744
all_results.json ADDED
@@ -0,0 +1,9 @@
{
    "epoch": 9.85925925925926,
    "total_flos": 2.9601022627828204e+18,
    "train_loss": 0.9062140895359552,
    "train_runtime": 3484.2972,
    "train_samples": 116368,
    "train_samples_per_second": 49.516,
    "train_steps_per_second": 0.192
}
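
The throughput figures above are internally consistent: steps-per-second times runtime recovers the total step count, and samples-per-second divided by steps-per-second roughly recovers the effective batch size of 256. A quick sanity-check sketch over the reported values:

```python
# Values copied from all_results.json above.
results = {
    "train_runtime": 3484.2972,           # seconds
    "train_samples_per_second": 49.516,
    "train_steps_per_second": 0.192,
}

# Total optimizer steps implied by the reported throughput.
implied_steps = results["train_steps_per_second"] * results["train_runtime"]

# Samples consumed per optimizer step, i.e. the effective batch size.
implied_batch = (results["train_samples_per_second"]
                 / results["train_steps_per_second"])

print(round(implied_steps))  # ~669, close to the 670 steps in trainer_state.json
print(round(implied_batch))  # ~258, close to total_train_batch_size = 256
```

The small discrepancies come from rounding in the reported rates.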
runs/Apr01_02-59-08_green-face-echoes-fin-01/events.out.tfevents.1743476556.green-face-echoes-fin-01.64054.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:86ed4e1cd866847063d76493e667ab41b12a09fc8ada3299d9420de083c2d0a5
- size 33856
+ oid sha256:32e81dadfde0db11eeefe569842f22d0c579cff25c34b4af794fd35d7ed548fb
+ size 37706
train_results.json ADDED
@@ -0,0 +1,9 @@
{
    "epoch": 9.85925925925926,
    "total_flos": 2.9601022627828204e+18,
    "train_loss": 0.9062140895359552,
    "train_runtime": 3484.2972,
    "train_samples": 116368,
    "train_samples_per_second": 49.516,
    "train_steps_per_second": 0.192
}
trainer_state.json ADDED
@@ -0,0 +1,1068 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.85925925925926,
  "eval_steps": 500,
  "global_step": 670,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.014814814814814815, "grad_norm": 0.4682641327381134, "learning_rate": 2.9850746268656717e-05, "loss": 1.4595, "step": 1},
    {"epoch": 0.07407407407407407, "grad_norm": 0.30114030838012695, "learning_rate": 0.00014925373134328358, "loss": 1.4529, "step": 5},
    {"epoch": 0.14814814814814814, "grad_norm": 0.2646062970161438, "learning_rate": 0.00029850746268656717, "loss": 1.3781, "step": 10},
    {"epoch": 0.2222222222222222, "grad_norm": 0.2039109170436859, "learning_rate": 0.00044776119402985075, "loss": 1.2598, "step": 15},
    {"epoch": 0.2962962962962963, "grad_norm": 0.12383515387773514, "learning_rate": 0.0005970149253731343, "loss": 1.1834, "step": 20},
    {"epoch": 0.37037037037037035, "grad_norm": 0.1035536378622055, "learning_rate": 0.0007462686567164179, "loss": 1.1305, "step": 25},
    {"epoch": 0.4444444444444444, "grad_norm": 0.09090688824653625, "learning_rate": 0.0008955223880597015, "loss": 1.0904, "step": 30},
    {"epoch": 0.5185185185185185, "grad_norm": 0.1432366669178009, "learning_rate": 0.001044776119402985, "loss": 1.0762, "step": 35},
    {"epoch": 0.5925925925925926, "grad_norm": 0.07171270251274109, "learning_rate": 0.0011940298507462687, "loss": 1.0618, "step": 40},
    {"epoch": 0.6666666666666666, "grad_norm": 0.07491806894540787, "learning_rate": 0.0013432835820895524, "loss": 1.0421, "step": 45},
    {"epoch": 0.7407407407407407, "grad_norm": 0.06790623813867569, "learning_rate": 0.0014925373134328358, "loss": 1.0302, "step": 50},
    {"epoch": 0.8148148148148148, "grad_norm": 0.08844709396362305, "learning_rate": 0.0016417910447761195, "loss": 1.018, "step": 55},
    {"epoch": 0.8888888888888888, "grad_norm": 0.08857131749391556, "learning_rate": 0.001791044776119403, "loss": 1.0156, "step": 60},
    {"epoch": 0.9629629629629629, "grad_norm": 0.09674689918756485, "learning_rate": 0.0019402985074626867, "loss": 1.0031, "step": 65},
    {"epoch": 1.0, "eval_loss": 1.551000714302063, "eval_runtime": 0.869, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 68},
    {"epoch": 1.0296296296296297, "grad_norm": 0.09539825469255447, "learning_rate": 0.001999877856940653, "loss": 0.9937, "step": 70},
    {"epoch": 1.1037037037037036, "grad_norm": 0.09851890057325363, "learning_rate": 0.0019991315351855746, "loss": 0.9895, "step": 75},
    {"epoch": 1.1777777777777778, "grad_norm": 0.06911145895719528, "learning_rate": 0.0019977072547317748, "loss": 0.9817, "step": 80},
    {"epoch": 1.2518518518518518, "grad_norm": 0.06769894808530807, "learning_rate": 0.001995605982021898, "loss": 0.9762, "step": 85},
    {"epoch": 1.325925925925926, "grad_norm": 0.06828448921442032, "learning_rate": 0.001992829142870326, "loss": 0.9743, "step": 90},
    {"epoch": 1.4, "grad_norm": 0.06951478868722916, "learning_rate": 0.0019893786214956943, "loss": 0.9743, "step": 95},
    {"epoch": 1.474074074074074, "grad_norm": 0.06752126663923264, "learning_rate": 0.001985256759242359, "loss": 0.9718, "step": 100},
    {"epoch": 1.5481481481481483, "grad_norm": 0.06669533252716064, "learning_rate": 0.0019804663529916825, "loss": 0.9743, "step": 105},
    {"epoch": 1.6222222222222222, "grad_norm": 0.06977611780166626, "learning_rate": 0.001975010653264216, "loss": 0.9678, "step": 110},
    {"epoch": 1.6962962962962962, "grad_norm": 0.07217196375131607, "learning_rate": 0.0019688933620140635, "loss": 0.9694, "step": 115},
    {"epoch": 1.7703703703703704, "grad_norm": 0.06247986480593681, "learning_rate": 0.0019621186301169314, "loss": 0.9625, "step": 120},
    {"epoch": 1.8444444444444446, "grad_norm": 0.07415565848350525, "learning_rate": 0.001954691054553556, "loss": 0.9697, "step": 125},
    {"epoch": 1.9185185185185185, "grad_norm": 0.07004866003990173, "learning_rate": 0.0019466156752904343, "loss": 0.957, "step": 130},
    {"epoch": 1.9925925925925925, "grad_norm": 0.06320279091596603, "learning_rate": 0.0019378979718599645, "loss": 0.9546, "step": 135},
    {"epoch": 2.0, "eval_loss": 1.5149173736572266, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 136},
    {"epoch": 2.0592592592592593, "grad_norm": 0.07447217404842377, "learning_rate": 0.0019285438596423204, "loss": 0.9443, "step": 140},
    {"epoch": 2.1333333333333333, "grad_norm": 0.06741169095039368, "learning_rate": 0.0019185596858515798, "loss": 0.9371, "step": 145},
    {"epoch": 2.2074074074074073, "grad_norm": 0.06852757930755615, "learning_rate": 0.0019079522252288387, "loss": 0.9395, "step": 150},
    {"epoch": 2.2814814814814817, "grad_norm": 0.06586603075265884, "learning_rate": 0.0018967286754452213, "loss": 0.937, "step": 155},
    {"epoch": 2.3555555555555556, "grad_norm": 0.0683656558394432, "learning_rate": 0.0018848966522179167, "loss": 0.9336, "step": 160},
    {"epoch": 2.4296296296296296, "grad_norm": 0.07259602099657059, "learning_rate": 0.001872464184142548, "loss": 0.935, "step": 165},
    {"epoch": 2.5037037037037035, "grad_norm": 0.06436455249786377, "learning_rate": 0.0018594397072453856, "loss": 0.9316, "step": 170},
    {"epoch": 2.5777777777777775, "grad_norm": 0.08042966574430466, "learning_rate": 0.0018458320592590974, "loss": 0.938, "step": 175},
    {"epoch": 2.651851851851852, "grad_norm": 0.0699801966547966, "learning_rate": 0.0018316504736259254, "loss": 0.9422, "step": 180},
    {"epoch": 2.725925925925926, "grad_norm": 0.06373833864927292, "learning_rate": 0.0018169045732323492, "loss": 0.9348, "step": 185},
    {"epoch": 2.8, "grad_norm": 0.07165364176034927, "learning_rate": 0.0018016043638794975, "loss": 0.9354, "step": 190},
    {"epoch": 2.8740740740740742, "grad_norm": 0.06121128425002098, "learning_rate": 0.0017857602274937308, "loss": 0.9386, "step": 195},
    {"epoch": 2.948148148148148, "grad_norm": 0.06334740668535233, "learning_rate": 0.0017693829150820068, "loss": 0.936, "step": 200},
    {"epoch": 3.0, "eval_loss": 1.508521318435669, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 204},
    {"epoch": 3.0148148148148146, "grad_norm": 0.07033156603574753, "learning_rate": 0.0017524835394368066, "loss": 0.9317, "step": 205},
    {"epoch": 3.088888888888889, "grad_norm": 0.06662800908088684, "learning_rate": 0.0017350735675955695, "loss": 0.9145, "step": 210},
    {"epoch": 3.162962962962963, "grad_norm": 0.06688813865184784, "learning_rate": 0.001717164813059761, "loss": 0.9094, "step": 215},
    {"epoch": 3.237037037037037, "grad_norm": 0.07399953156709671, "learning_rate": 0.0016987694277788418, "loss": 0.9147, "step": 220},
    {"epoch": 3.311111111111111, "grad_norm": 0.06779713183641434, "learning_rate": 0.0016798998939045893, "loss": 0.9123, "step": 225},
    {"epoch": 3.3851851851851853, "grad_norm": 0.06676509976387024, "learning_rate": 0.001660569015321357, "loss": 0.9099, "step": 230},
    {"epoch": 3.4592592592592593, "grad_norm": 0.06683938950300217, "learning_rate": 0.001640789908958026, "loss": 0.9112, "step": 235},
    {"epoch": 3.533333333333333, "grad_norm": 0.06712319701910019, "learning_rate": 0.001620575995887538, "loss": 0.914, "step": 240},
    {"epoch": 3.6074074074074076, "grad_norm": 0.06718605011701584, "learning_rate": 0.001599940992220053, "loss": 0.9156, "step": 245},
    {"epoch": 3.6814814814814816, "grad_norm": 0.06765800714492798, "learning_rate": 0.0015788988997959114, "loss": 0.9168, "step": 250},
    {"epoch": 3.7555555555555555, "grad_norm": 0.06374535709619522, "learning_rate": 0.0015574639966847127, "loss": 0.9114, "step": 255},
    {"epoch": 3.8296296296296295, "grad_norm": 0.06388971954584122, "learning_rate": 0.0015356508274969594, "loss": 0.9139, "step": 260},
    {"epoch": 3.9037037037037035, "grad_norm": 0.0656428337097168, "learning_rate": 0.0015134741935148419, "loss": 0.916, "step": 265},
    {"epoch": 3.977777777777778, "grad_norm": 0.06783714145421982, "learning_rate": 0.0014909491426488577, "loss": 0.9186, "step": 270},
    {"epoch": 4.0, "eval_loss": 1.517486810684204, "eval_runtime": 0.8755, "eval_samples_per_second": 4.569, "eval_steps_per_second": 1.142, "step": 272},
    {"epoch": 4.044444444444444, "grad_norm": 0.06940994411706924, "learning_rate": 0.001468090959227082, "loss": 0.9011, "step": 275},
    {"epoch": 4.118518518518519, "grad_norm": 0.06819378584623337, "learning_rate": 0.0014449151536240167, "loss": 0.8866, "step": 280},
    {"epoch": 4.192592592592592, "grad_norm": 0.0655524805188179, "learning_rate": 0.0014214374517360576, "loss": 0.8916, "step": 285},
    {"epoch": 4.266666666666667, "grad_norm": 0.06668845564126968, "learning_rate": 0.0013976737843107202, "loss": 0.8871, "step": 290},
    {"epoch": 4.340740740740741, "grad_norm": 0.06470604240894318, "learning_rate": 0.0013736402761368597, "loss": 0.8928, "step": 295},
    {"epoch": 4.4148148148148145, "grad_norm": 0.06732232868671417, "learning_rate": 0.0013493532351032318, "loss": 0.8985, "step": 300},
    {"epoch": 4.488888888888889, "grad_norm": 0.0662841871380806, "learning_rate": 0.0013248291411328047, "loss": 0.8869, "step": 305},
    {"epoch": 4.562962962962963, "grad_norm": 0.06613945215940475, "learning_rate": 0.001300084635000341, "loss": 0.8963, "step": 310},
    {"epoch": 4.637037037037037, "grad_norm": 0.06735741347074509, "learning_rate": 0.0012751365070408334, "loss": 0.9035, "step": 315},
    {"epoch": 4.711111111111111, "grad_norm": 0.06463445723056793, "learning_rate": 0.0012500016857564585, "loss": 0.8966, "step": 320},
    {"epoch": 4.785185185185185, "grad_norm": 0.06602155417203903, "learning_rate": 0.0012246972263297718, "loss": 0.895, "step": 325},
    {"epoch": 4.859259259259259, "grad_norm": 0.06352429836988449, "learning_rate": 0.0011992402990509514, "loss": 0.894, "step": 330},
    {"epoch": 4.933333333333334, "grad_norm": 0.06808946281671524, "learning_rate": 0.0011736481776669307, "loss": 0.8969, "step": 335},
    {"epoch": 5.0, "grad_norm": 0.08401331305503845, "learning_rate": 0.0011479382276603299, "loss": 0.8948, "step": 340},
    {"epoch": 5.0, "eval_loss": 1.5301542282104492, "eval_runtime": 0.8691, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.151, "step": 340},
    {"epoch": 5.074074074074074, "grad_norm": 0.06839559227228165, "learning_rate": 0.0011221278944661473, "loss": 0.8678, "step": 345},
    {"epoch": 5.148148148148148, "grad_norm": 0.06838098913431168, "learning_rate": 0.0010962346916341904, "loss": 0.8666, "step": 350},
    {"epoch": 5.222222222222222, "grad_norm": 0.06836072355508804, "learning_rate": 0.001070276188945293, "loss": 0.8731, "step": 355},
    {"epoch": 5.296296296296296, "grad_norm": 0.06789132207632065, "learning_rate": 0.0010442700004893765, "loss": 0.8724, "step": 360},
    {"epoch": 5.37037037037037, "grad_norm": 0.06825467944145203, "learning_rate": 0.001018233772713443, "loss": 0.8757, "step": 365},
    {"epoch": 5.444444444444445, "grad_norm": 0.06852041184902191, "learning_rate": 0.000992185172447616, "loss": 0.8762, "step": 370},
    {"epoch": 5.518518518518518, "grad_norm": 0.06898131966590881, "learning_rate": 0.0009661418749173466, "loss": 0.8731, "step": 375},
    {"epoch": 5.592592592592593, "grad_norm": 0.06875770539045334, "learning_rate": 0.0009401215517499251, "loss": 0.8746, "step": 380},
    {"epoch": 5.666666666666667, "grad_norm": 0.06649214774370193, "learning_rate": 0.0009141418589834339, "loss": 0.8748, "step": 385},
    {"epoch": 5.7407407407407405, "grad_norm": 0.06804858148097992, "learning_rate": 0.0008882204250862795, "loss": 0.8783, "step": 390},
    {"epoch": 5.814814814814815, "grad_norm": 0.06907966732978821, "learning_rate": 0.0008623748389954282, "loss": 0.8822, "step": 395},
    {"epoch": 5.888888888888889, "grad_norm": 0.0679902508854866, "learning_rate": 0.0008366226381814697, "loss": 0.8777, "step": 400},
    {"epoch": 5.962962962962963, "grad_norm": 0.06677145510911942, "learning_rate": 0.0008109812967486025, "loss": 0.8742, "step": 405},
    {"epoch": 6.0, "eval_loss": 1.5502283573150635, "eval_runtime": 0.8693, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.15, "step": 408},
    {"epoch": 6.029629629629629, "grad_norm": 0.06917522847652435, "learning_rate": 0.0007854682135776132, "loss": 0.8605, "step": 410},
    {"epoch": 6.103703703703704, "grad_norm": 0.07051009684801102, "learning_rate": 0.0007601007005199021, "loss": 0.8501, "step": 415},
    {"epoch": 6.177777777777778, "grad_norm": 0.07272496819496155, "learning_rate": 0.0007348959706505627, "loss": 0.8553, "step": 420},
    {"epoch": 6.2518518518518515, "grad_norm": 0.07074420154094696, "learning_rate": 0.000709871126588481, "loss": 0.8496, "step": 425},
    {"epoch": 6.325925925925926, "grad_norm": 0.07021532952785492, "learning_rate": 0.0006850431488913895, "loss": 0.8547, "step": 430},
    {"epoch": 6.4, "grad_norm": 0.07260267436504364, "learning_rate": 0.0006604288845337453, "loss": 0.8568, "step": 435},
    {"epoch": 6.474074074074074, "grad_norm": 0.06939396262168884, "learning_rate": 0.0006360450354752458, "loss": 0.8561, "step": 440},
    {"epoch": 6.548148148148148, "grad_norm": 0.06964612007141113, "learning_rate": 0.0006119081473277501, "loss": 0.8577, "step": 445},
    {"epoch": 6.622222222222222, "grad_norm": 0.06987880170345306, "learning_rate": 0.0005880345981282876, "loss": 0.858, "step": 450},
    {"epoch": 6.696296296296296, "grad_norm": 0.06909282505512238, "learning_rate": 0.0005644405872257716, "loss": 0.8559, "step": 455},
    {"epoch": 6.770370370370371, "grad_norm": 0.0683453232049942, "learning_rate": 0.0005411421242889642, "loss": 0.8561, "step": 460},
    {"epoch": 6.844444444444444, "grad_norm": 0.0680374875664711, "learning_rate": 0.000518155018443151, "loss": 0.859, "step": 465},
    {"epoch": 6.9185185185185185, "grad_norm": 0.067069411277771, "learning_rate": 0.0004954948675428853, "loss": 0.8489, "step": 470},
    {"epoch": 6.992592592592593, "grad_norm": 0.06691515445709229, "learning_rate": 0.00047317704758809945, "loss": 0.8556, "step": 475},
    {"epoch": 7.0, "eval_loss": 1.5617406368255615, "eval_runtime": 0.8711, "eval_samples_per_second": 4.592, "eval_steps_per_second": 1.148, "step": 476},
    {"epoch": 7.059259259259259, "grad_norm": 0.08424794673919678, "learning_rate": 0.0004512167022907494, "loss": 0.8413, "step": 480},
    {"epoch": 7.133333333333334, "grad_norm": 0.07284523546695709, "learning_rate": 0.00042962873279907965, "loss": 0.8329, "step": 485},
    {"epoch": 7.207407407407407, "grad_norm": 0.06989779323339462, "learning_rate": 0.0004084277875864776, "loss": 0.8368, "step": 490},
    {"epoch": 7.281481481481482, "grad_norm": 0.0744442567229271, "learning_rate": 0.0003876282525117847, "loss": 0.831, "step": 495},
    {"epoch": 7.355555555555555, "grad_norm": 0.07233459502458572, "learning_rate": 0.0003672442410577965, "loss": 0.8344, "step": 500},
    {"epoch": 7.42962962962963, "grad_norm": 0.07147523015737534, "learning_rate": 0.0003472895847545905, "loss": 0.837, "step": 505},
    {"epoch": 7.503703703703704, "grad_norm": 0.0732484832406044, "learning_rate": 0.000327777823794168, "loss": 0.8427, "step": 510},
    {"epoch": 7.5777777777777775, "grad_norm": 0.0711125060915947, "learning_rate": 0.00030872219784278354, "loss": 0.8394, "step": 515},
    {"epoch": 7.651851851851852, "grad_norm": 0.07285265624523163, "learning_rate": 0.0002901356370571967, "loss": 0.8336, "step": 520},
    {"epoch": 7.725925925925926, "grad_norm": 0.07154905050992966, "learning_rate": 0.0002720307533109402, "loss": 0.8403, "step": 525},
    {"epoch": 7.8, "grad_norm": 0.07089488953351974, "learning_rate": 0.000254419831636557, "loss": 0.839, "step": 530},
    {"epoch": 7.874074074074074, "grad_norm": 0.0709661915898323, "learning_rate": 0.00023731482188961818, "loss": 0.8353, "step": 535},
    {"epoch": 7.948148148148148, "grad_norm": 0.07034063339233398, "learning_rate": 0.00022072733064017102, "loss": 0.8428, "step": 540},
    {"epoch": 8.0, "eval_loss": 1.596451997756958, "eval_runtime": 0.8703, "eval_samples_per_second": 4.596, "eval_steps_per_second": 1.149, "step": 544},
    {"epoch": 8.014814814814814, "grad_norm": 0.07084991037845612, "learning_rate": 0.00020466861329712473, "loss": 0.8359, "step": 545},
    {"epoch": 8.088888888888889, "grad_norm": 0.07405474036931992, "learning_rate": 0.00018914956647091496, "loss": 0.8195, "step": 550},
    {"epoch": 8.162962962962963, "grad_norm": 0.07152204215526581, "learning_rate": 0.0001741807205796314, "loss": 0.8289, "step": 555},
    {"epoch": 8.237037037037037, "grad_norm": 0.0712200403213501, "learning_rate": 0.00015977223270362194, "loss": 0.8271, "step": 560},
    {"epoch": 8.311111111111112, "grad_norm": 0.07045566290616989, "learning_rate": 0.0001459338796934293, "loss": 0.829, "step": 565},
    {"epoch": 8.385185185185184, "grad_norm": 0.0720411017537117, "learning_rate": 0.000132675051535725, "loss": 0.8265, "step": 570},
    {"epoch": 8.459259259259259, "grad_norm": 0.07052139192819595, "learning_rate": 0.00012000474498175551, "loss": 0.8226, "step": 575},
    {"epoch": 8.533333333333333, "grad_norm": 0.07078087329864502, "learning_rate": 0.00010793155744261352, "loss": 0.8241, "step": 580},
    {"epoch": 8.607407407407408, "grad_norm": 0.07028964906930923, "learning_rate": 9.646368115548232e-05, "loss": 0.8212, "step": 585},
    {"epoch": 8.681481481481482, "grad_norm": 0.0702112540602684, "learning_rate": 8.56088976248095e-05, "loss": 0.8232, "step": 590},
    {"epoch": 8.755555555555556, "grad_norm": 0.07041744887828827, "learning_rate": 7.53745723421827e-05, "loss": 0.8193, "step": 595},
    {"epoch": 8.829629629629629, "grad_norm": 0.06979186832904816, "learning_rate": 6.576764978849003e-05, "loss": 0.8186, "step": 600},
    {"epoch": 8.903703703703703, "grad_norm": 0.07058751583099365, "learning_rate": 5.679464872175666e-05, "loss": 0.8279, "step": 605},
    {"epoch": 8.977777777777778, "grad_norm": 0.07009345293045044, "learning_rate": 4.846165775385458e-05, "loss": 0.8168, "step": 610},
    {"epoch": 9.0, "eval_loss": 1.6216613054275513, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 612},
    {"epoch": 9.044444444444444, "grad_norm": 0.0695219412446022, "learning_rate": 4.077433121908747e-05, "loss": 0.8172, "step": 615},
    {"epoch": 9.118518518518519, "grad_norm": 0.07094599306583405, "learning_rate": 3.373788533745281e-05, "loss": 0.8207, "step": 620},
    {"epoch": 9.192592592592593, "grad_norm": 0.07142723351716995, "learning_rate": 2.7357094675186987e-05, "loss": 0.8101, "step": 625},
    {"epoch": 9.266666666666667, "grad_norm": 0.07132075726985931, "learning_rate": 2.1636288904992585e-05, "loss": 0.8137, "step": 630},
    {"epoch": 9.34074074074074, "grad_norm": 0.07085540145635605, "learning_rate": 1.6579349868147686e-05, "loss": 0.8103, "step": 635},
    {"epoch": 9.414814814814815, "grad_norm": 0.0697101578116417, "learning_rate": 1.218970894049065e-05, "loss": 0.8094, "step": 640},
    {"epoch": 9.488888888888889, "grad_norm": 0.07001277059316635, "learning_rate": 8.470344704066047e-06, "loss": 0.8233, "step": 645},
    {"epoch": 9.562962962962963, "grad_norm": 0.07051407545804977, "learning_rate": 5.42378092601481e-06, "loss": 0.8181, "step": 650},
    {"epoch": 9.637037037037038, "grad_norm": 0.07038593292236328, "learning_rate": 3.0520848460765526e-06, "loss": 0.8198, "step": 655},
    {"epoch": 9.71111111111111, "grad_norm": 0.06979399174451828, "learning_rate": 1.3568657738678436e-06, "loss": 0.8138, "step": 660},
    {"epoch": 9.785185185185185, "grad_norm": 0.07017084956169128, "learning_rate": 3.3927399688948866e-07, "loss": 0.8138, "step": 665},
    {"epoch": 9.85925925925926, "grad_norm": 0.07032209634780884, "learning_rate": 0.0, "loss": 0.8191, "step": 670},
    {"epoch": 9.85925925925926, "eval_loss": 1.630096197128296, "eval_runtime": 0.8812, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 670},
    {"epoch": 9.85925925925926, "step": 670, "total_flos": 2.9601022627828204e+18, "train_loss": 0.9062140895359552, "train_runtime": 3484.2972, "train_samples_per_second": 49.516, "train_steps_per_second": 0.192}
  ],
  "logging_steps": 5,
  "max_steps": 670,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.9601022627828204e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}