legis-llama3-1-8b-valid-arandu / trainer_state.json
felipeoes's picture
Model save
d20113e verified
{
"best_metric": 0.439,
"best_model_checkpoint": "runs/legis-llama3-1-8b-valid-arandu/checkpoint-1120",
"epoch": 0.9995600527936648,
"eval_steps": 5,
"global_step": 1136,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004399472063352398,
"grad_norm": 25.937191009521484,
"learning_rate": 8.771929824561403e-06,
"loss": 1.0992,
"step": 5
},
{
"epoch": 0.004399472063352398,
"eval_loss": 1.1428982019424438,
"eval_runtime": 29.8805,
"eval_samples_per_second": 0.569,
"eval_steps_per_second": 0.301,
"step": 5
},
{
"epoch": 0.008798944126704795,
"grad_norm": 32.52676773071289,
"learning_rate": 1.7543859649122806e-05,
"loss": 1.067,
"step": 10
},
{
"epoch": 0.008798944126704795,
"eval_loss": 1.0669578313827515,
"eval_runtime": 28.5282,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 10
},
{
"epoch": 0.013198416190057193,
"grad_norm": 78.51001739501953,
"learning_rate": 2.6315789473684212e-05,
"loss": 1.0057,
"step": 15
},
{
"epoch": 0.013198416190057193,
"eval_loss": 1.0462743043899536,
"eval_runtime": 28.5697,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 15
},
{
"epoch": 0.01759788825340959,
"grad_norm": 21.255964279174805,
"learning_rate": 3.508771929824561e-05,
"loss": 0.9236,
"step": 20
},
{
"epoch": 0.01759788825340959,
"eval_loss": 0.9604344367980957,
"eval_runtime": 28.6152,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 20
},
{
"epoch": 0.02199736031676199,
"grad_norm": 1.3699233531951904,
"learning_rate": 4.3859649122807014e-05,
"loss": 0.8823,
"step": 25
},
{
"epoch": 0.02199736031676199,
"eval_loss": 0.9002779126167297,
"eval_runtime": 28.579,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 25
},
{
"epoch": 0.026396832380114386,
"grad_norm": 2.50810170173645,
"learning_rate": 5.2631578947368424e-05,
"loss": 0.8144,
"step": 30
},
{
"epoch": 0.026396832380114386,
"eval_loss": 0.8441588878631592,
"eval_runtime": 28.4936,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 30
},
{
"epoch": 0.030796304443466784,
"grad_norm": 1.6816316843032837,
"learning_rate": 6.140350877192983e-05,
"loss": 0.7829,
"step": 35
},
{
"epoch": 0.030796304443466784,
"eval_loss": 0.7928382754325867,
"eval_runtime": 28.5908,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 35
},
{
"epoch": 0.03519577650681918,
"grad_norm": 0.5125584006309509,
"learning_rate": 7.017543859649122e-05,
"loss": 0.7075,
"step": 40
},
{
"epoch": 0.03519577650681918,
"eval_loss": 0.7538504600524902,
"eval_runtime": 28.5816,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 40
},
{
"epoch": 0.039595248570171576,
"grad_norm": 0.36081045866012573,
"learning_rate": 7.894736842105263e-05,
"loss": 0.6776,
"step": 45
},
{
"epoch": 0.039595248570171576,
"eval_loss": 0.7313268184661865,
"eval_runtime": 28.6141,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 45
},
{
"epoch": 0.04399472063352398,
"grad_norm": 0.32318177819252014,
"learning_rate": 8.771929824561403e-05,
"loss": 0.6499,
"step": 50
},
{
"epoch": 0.04399472063352398,
"eval_loss": 0.71351158618927,
"eval_runtime": 28.5766,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 50
},
{
"epoch": 0.04839419269687637,
"grad_norm": 0.34377261996269226,
"learning_rate": 9.649122807017544e-05,
"loss": 0.6487,
"step": 55
},
{
"epoch": 0.04839419269687637,
"eval_loss": 0.7006722092628479,
"eval_runtime": 28.6048,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 55
},
{
"epoch": 0.05279366476022877,
"grad_norm": 0.4360629618167877,
"learning_rate": 0.00010526315789473685,
"loss": 0.6405,
"step": 60
},
{
"epoch": 0.05279366476022877,
"eval_loss": 0.6905343532562256,
"eval_runtime": 28.5257,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 60
},
{
"epoch": 0.05719313682358117,
"grad_norm": 0.28764936327934265,
"learning_rate": 0.00011403508771929824,
"loss": 0.6352,
"step": 65
},
{
"epoch": 0.05719313682358117,
"eval_loss": 0.68143630027771,
"eval_runtime": 28.6362,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 65
},
{
"epoch": 0.06159260888693357,
"grad_norm": 0.34088754653930664,
"learning_rate": 0.00012280701754385965,
"loss": 0.6064,
"step": 70
},
{
"epoch": 0.06159260888693357,
"eval_loss": 0.6742813587188721,
"eval_runtime": 28.5667,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 70
},
{
"epoch": 0.06599208095028597,
"grad_norm": 0.31284183263778687,
"learning_rate": 0.00013157894736842108,
"loss": 0.5924,
"step": 75
},
{
"epoch": 0.06599208095028597,
"eval_loss": 0.6679767966270447,
"eval_runtime": 28.461,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 75
},
{
"epoch": 0.07039155301363836,
"grad_norm": 0.30470508337020874,
"learning_rate": 0.00014035087719298245,
"loss": 0.5992,
"step": 80
},
{
"epoch": 0.07039155301363836,
"eval_loss": 0.6631008386611938,
"eval_runtime": 28.6891,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 80
},
{
"epoch": 0.07479102507699076,
"grad_norm": 0.3255262076854706,
"learning_rate": 0.00014912280701754387,
"loss": 0.5704,
"step": 85
},
{
"epoch": 0.07479102507699076,
"eval_loss": 0.658618688583374,
"eval_runtime": 28.6094,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 85
},
{
"epoch": 0.07919049714034315,
"grad_norm": 0.31922295689582825,
"learning_rate": 0.00015789473684210527,
"loss": 0.6048,
"step": 90
},
{
"epoch": 0.07919049714034315,
"eval_loss": 0.6537344455718994,
"eval_runtime": 28.532,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 90
},
{
"epoch": 0.08358996920369556,
"grad_norm": 0.45636337995529175,
"learning_rate": 0.0001666666666666667,
"loss": 0.613,
"step": 95
},
{
"epoch": 0.08358996920369556,
"eval_loss": 0.6501972079277039,
"eval_runtime": 28.6568,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 95
},
{
"epoch": 0.08798944126704795,
"grad_norm": 0.29334941506385803,
"learning_rate": 0.00017543859649122806,
"loss": 0.5799,
"step": 100
},
{
"epoch": 0.08798944126704795,
"eval_loss": 0.6471393704414368,
"eval_runtime": 28.5997,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 100
},
{
"epoch": 0.09238891333040035,
"grad_norm": 0.31318825483322144,
"learning_rate": 0.00018421052631578948,
"loss": 0.5887,
"step": 105
},
{
"epoch": 0.09238891333040035,
"eval_loss": 0.6440868377685547,
"eval_runtime": 28.6275,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 105
},
{
"epoch": 0.09678838539375274,
"grad_norm": 0.27908894419670105,
"learning_rate": 0.00019298245614035088,
"loss": 0.5905,
"step": 110
},
{
"epoch": 0.09678838539375274,
"eval_loss": 0.6423875689506531,
"eval_runtime": 28.5491,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 110
},
{
"epoch": 0.10118785745710515,
"grad_norm": 0.2715133726596832,
"learning_rate": 0.00019999952753720356,
"loss": 0.5902,
"step": 115
},
{
"epoch": 0.10118785745710515,
"eval_loss": 0.6415910720825195,
"eval_runtime": 28.5086,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 115
},
{
"epoch": 0.10558732952045755,
"grad_norm": 0.3028790056705475,
"learning_rate": 0.000199982991808088,
"loss": 0.5773,
"step": 120
},
{
"epoch": 0.10558732952045755,
"eval_loss": 0.6377425789833069,
"eval_runtime": 28.6438,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 120
},
{
"epoch": 0.10998680158380994,
"grad_norm": 0.3071883022785187,
"learning_rate": 0.00019994283740338306,
"loss": 0.5598,
"step": 125
},
{
"epoch": 0.10998680158380994,
"eval_loss": 0.6367806196212769,
"eval_runtime": 28.4852,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 125
},
{
"epoch": 0.11438627364716233,
"grad_norm": 0.34842655062675476,
"learning_rate": 0.00019987907380864062,
"loss": 0.596,
"step": 130
},
{
"epoch": 0.11438627364716233,
"eval_loss": 0.6347749829292297,
"eval_runtime": 28.5908,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 130
},
{
"epoch": 0.11878574571051474,
"grad_norm": 0.2854275107383728,
"learning_rate": 0.00019979171608653924,
"loss": 0.5733,
"step": 135
},
{
"epoch": 0.11878574571051474,
"eval_loss": 0.6301032900810242,
"eval_runtime": 28.5482,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 135
},
{
"epoch": 0.12318521777386714,
"grad_norm": 0.27615901827812195,
"learning_rate": 0.00019968078487332566,
"loss": 0.5875,
"step": 140
},
{
"epoch": 0.12318521777386714,
"eval_loss": 0.6269793510437012,
"eval_runtime": 28.4974,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 140
},
{
"epoch": 0.12758468983721954,
"grad_norm": 0.2709368169307709,
"learning_rate": 0.00019954630637394029,
"loss": 0.5711,
"step": 145
},
{
"epoch": 0.12758468983721954,
"eval_loss": 0.6240233182907104,
"eval_runtime": 28.5264,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 145
},
{
"epoch": 0.13198416190057194,
"grad_norm": 0.2877412736415863,
"learning_rate": 0.00019938831235582672,
"loss": 0.5885,
"step": 150
},
{
"epoch": 0.13198416190057194,
"eval_loss": 0.6206945776939392,
"eval_runtime": 28.5668,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 150
},
{
"epoch": 0.13638363396392433,
"grad_norm": 0.2922605574131012,
"learning_rate": 0.00019920684014142738,
"loss": 0.5485,
"step": 155
},
{
"epoch": 0.13638363396392433,
"eval_loss": 0.6200662851333618,
"eval_runtime": 28.5452,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 155
},
{
"epoch": 0.14078310602727673,
"grad_norm": 0.28340834379196167,
"learning_rate": 0.00019900193259936704,
"loss": 0.5754,
"step": 160
},
{
"epoch": 0.14078310602727673,
"eval_loss": 0.6187402606010437,
"eval_runtime": 28.5939,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 160
},
{
"epoch": 0.14518257809062912,
"grad_norm": 0.2796618938446045,
"learning_rate": 0.0001987736381343261,
"loss": 0.5535,
"step": 165
},
{
"epoch": 0.14518257809062912,
"eval_loss": 0.6156266331672668,
"eval_runtime": 28.5378,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 165
},
{
"epoch": 0.14958205015398152,
"grad_norm": 0.25343528389930725,
"learning_rate": 0.00019852201067560606,
"loss": 0.5697,
"step": 170
},
{
"epoch": 0.14958205015398152,
"eval_loss": 0.6125033497810364,
"eval_runtime": 28.5565,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 170
},
{
"epoch": 0.1539815222173339,
"grad_norm": 0.23438464105129242,
"learning_rate": 0.00019824710966438996,
"loss": 0.5335,
"step": 175
},
{
"epoch": 0.1539815222173339,
"eval_loss": 0.6096713542938232,
"eval_runtime": 28.6017,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 175
},
{
"epoch": 0.1583809942806863,
"grad_norm": 0.24729043245315552,
"learning_rate": 0.00019794900003970077,
"loss": 0.5702,
"step": 180
},
{
"epoch": 0.1583809942806863,
"eval_loss": 0.6071114540100098,
"eval_runtime": 28.5677,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 180
},
{
"epoch": 0.16278046634403873,
"grad_norm": 0.257964551448822,
"learning_rate": 0.00019762775222306107,
"loss": 0.5494,
"step": 185
},
{
"epoch": 0.16278046634403873,
"eval_loss": 0.6062531471252441,
"eval_runtime": 28.5933,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 185
},
{
"epoch": 0.16717993840739112,
"grad_norm": 0.2648680806159973,
"learning_rate": 0.0001972834421018576,
"loss": 0.5379,
"step": 190
},
{
"epoch": 0.16717993840739112,
"eval_loss": 0.6054437756538391,
"eval_runtime": 28.5575,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 190
},
{
"epoch": 0.17157941047074352,
"grad_norm": 0.2540712356567383,
"learning_rate": 0.00019691615101141455,
"loss": 0.5415,
"step": 195
},
{
"epoch": 0.17157941047074352,
"eval_loss": 0.6023730039596558,
"eval_runtime": 28.5419,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 195
},
{
"epoch": 0.1759788825340959,
"grad_norm": 0.2424851357936859,
"learning_rate": 0.00019652596571578004,
"loss": 0.5504,
"step": 200
},
{
"epoch": 0.1759788825340959,
"eval_loss": 0.5997632145881653,
"eval_runtime": 28.6422,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 200
},
{
"epoch": 0.1803783545974483,
"grad_norm": 0.2573873698711395,
"learning_rate": 0.0001961129783872301,
"loss": 0.5418,
"step": 205
},
{
"epoch": 0.1803783545974483,
"eval_loss": 0.5976300239562988,
"eval_runtime": 28.5752,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 205
},
{
"epoch": 0.1847778266608007,
"grad_norm": 0.22338183224201202,
"learning_rate": 0.00019567728658449504,
"loss": 0.54,
"step": 210
},
{
"epoch": 0.1847778266608007,
"eval_loss": 0.5960862040519714,
"eval_runtime": 28.4685,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 210
},
{
"epoch": 0.1891772987241531,
"grad_norm": 0.2706097960472107,
"learning_rate": 0.00019521899322971352,
"loss": 0.5522,
"step": 215
},
{
"epoch": 0.1891772987241531,
"eval_loss": 0.5958646535873413,
"eval_runtime": 28.5678,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 215
},
{
"epoch": 0.1935767707875055,
"grad_norm": 0.23476411402225494,
"learning_rate": 0.00019473820658411957,
"loss": 0.5262,
"step": 220
},
{
"epoch": 0.1935767707875055,
"eval_loss": 0.5945417284965515,
"eval_runtime": 28.5611,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 220
},
{
"epoch": 0.1979762428508579,
"grad_norm": 0.23705659806728363,
"learning_rate": 0.00019423504022246825,
"loss": 0.5439,
"step": 225
},
{
"epoch": 0.1979762428508579,
"eval_loss": 0.5934200286865234,
"eval_runtime": 28.5955,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 225
},
{
"epoch": 0.2023757149142103,
"grad_norm": 0.22662319242954254,
"learning_rate": 0.00019370961300620637,
"loss": 0.5262,
"step": 230
},
{
"epoch": 0.2023757149142103,
"eval_loss": 0.5928044319152832,
"eval_runtime": 28.514,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 230
},
{
"epoch": 0.2067751869775627,
"grad_norm": 0.24046145379543304,
"learning_rate": 0.00019316204905539425,
"loss": 0.5462,
"step": 235
},
{
"epoch": 0.2067751869775627,
"eval_loss": 0.5904839038848877,
"eval_runtime": 28.5557,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 235
},
{
"epoch": 0.2111746590409151,
"grad_norm": 0.23923470079898834,
"learning_rate": 0.000192592477719385,
"loss": 0.5345,
"step": 240
},
{
"epoch": 0.2111746590409151,
"eval_loss": 0.590508759021759,
"eval_runtime": 28.5204,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 240
},
{
"epoch": 0.21557413110426749,
"grad_norm": 0.24345721304416656,
"learning_rate": 0.00019200103354626892,
"loss": 0.5478,
"step": 245
},
{
"epoch": 0.21557413110426749,
"eval_loss": 0.5882726907730103,
"eval_runtime": 28.5722,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 245
},
{
"epoch": 0.21997360316761988,
"grad_norm": 0.27501732110977173,
"learning_rate": 0.00019138785625108957,
"loss": 0.5607,
"step": 250
},
{
"epoch": 0.21997360316761988,
"eval_loss": 0.5860432982444763,
"eval_runtime": 28.503,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 250
},
{
"epoch": 0.22437307523097227,
"grad_norm": 0.3151032328605652,
"learning_rate": 0.0001907530906828393,
"loss": 0.5479,
"step": 255
},
{
"epoch": 0.22437307523097227,
"eval_loss": 0.5846895575523376,
"eval_runtime": 28.6081,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 255
},
{
"epoch": 0.22877254729432467,
"grad_norm": 0.2758755385875702,
"learning_rate": 0.0001900968867902419,
"loss": 0.5767,
"step": 260
},
{
"epoch": 0.22877254729432467,
"eval_loss": 0.5815722942352295,
"eval_runtime": 28.5574,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 260
},
{
"epoch": 0.2331720193576771,
"grad_norm": 0.25241315364837646,
"learning_rate": 0.000189419399586331,
"loss": 0.5568,
"step": 265
},
{
"epoch": 0.2331720193576771,
"eval_loss": 0.5822274684906006,
"eval_runtime": 28.573,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 265
},
{
"epoch": 0.23757149142102948,
"grad_norm": 0.316436767578125,
"learning_rate": 0.00018872078911183146,
"loss": 0.5385,
"step": 270
},
{
"epoch": 0.23757149142102948,
"eval_loss": 0.5809066891670227,
"eval_runtime": 28.5598,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 270
},
{
"epoch": 0.24197096348438188,
"grad_norm": 0.27813801169395447,
"learning_rate": 0.00018800122039735358,
"loss": 0.5348,
"step": 275
},
{
"epoch": 0.24197096348438188,
"eval_loss": 0.5786107778549194,
"eval_runtime": 28.546,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 275
},
{
"epoch": 0.24637043554773427,
"grad_norm": 0.2552705407142639,
"learning_rate": 0.00018726086342440846,
"loss": 0.5207,
"step": 280
},
{
"epoch": 0.24637043554773427,
"eval_loss": 0.5768923759460449,
"eval_runtime": 28.5995,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 280
},
{
"epoch": 0.2507699076110867,
"grad_norm": 0.21993091702461243,
"learning_rate": 0.00018649989308525372,
"loss": 0.5292,
"step": 285
},
{
"epoch": 0.2507699076110867,
"eval_loss": 0.5762263536453247,
"eval_runtime": 28.4816,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 285
},
{
"epoch": 0.2551693796744391,
"grad_norm": 0.27086153626441956,
"learning_rate": 0.0001857184891415794,
"loss": 0.5312,
"step": 290
},
{
"epoch": 0.2551693796744391,
"eval_loss": 0.5758266448974609,
"eval_runtime": 28.5295,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 290
},
{
"epoch": 0.2595688517377915,
"grad_norm": 0.21816319227218628,
"learning_rate": 0.0001849168361820431,
"loss": 0.5223,
"step": 295
},
{
"epoch": 0.2595688517377915,
"eval_loss": 0.574447751045227,
"eval_runtime": 28.5859,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 295
},
{
"epoch": 0.2639683238011439,
"grad_norm": 0.24796700477600098,
"learning_rate": 0.00018409512357866548,
"loss": 0.5485,
"step": 300
},
{
"epoch": 0.2639683238011439,
"eval_loss": 0.573371410369873,
"eval_runtime": 28.6178,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 300
},
{
"epoch": 0.2683677958644963,
"grad_norm": 0.2425287663936615,
"learning_rate": 0.00018325354544209535,
"loss": 0.5217,
"step": 305
},
{
"epoch": 0.2683677958644963,
"eval_loss": 0.5723298788070679,
"eval_runtime": 28.5916,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 305
},
{
"epoch": 0.27276726792784867,
"grad_norm": 0.21630050241947174,
"learning_rate": 0.00018239230057575542,
"loss": 0.5074,
"step": 310
},
{
"epoch": 0.27276726792784867,
"eval_loss": 0.5725327134132385,
"eval_runtime": 28.536,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 310
},
{
"epoch": 0.27716673999120106,
"grad_norm": 0.21529468894004822,
"learning_rate": 0.0001815115924288798,
"loss": 0.5487,
"step": 315
},
{
"epoch": 0.27716673999120106,
"eval_loss": 0.5721793174743652,
"eval_runtime": 28.6852,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 315
},
{
"epoch": 0.28156621205455346,
"grad_norm": 0.21623414754867554,
"learning_rate": 0.00018061162904845358,
"loss": 0.5106,
"step": 320
},
{
"epoch": 0.28156621205455346,
"eval_loss": 0.5709577202796936,
"eval_runtime": 28.4592,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 320
},
{
"epoch": 0.28596568411790585,
"grad_norm": 0.2219308316707611,
"learning_rate": 0.0001796926230300667,
"loss": 0.5218,
"step": 325
},
{
"epoch": 0.28596568411790585,
"eval_loss": 0.5698617100715637,
"eval_runtime": 28.5588,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 325
},
{
"epoch": 0.29036515618125824,
"grad_norm": 0.2264701873064041,
"learning_rate": 0.00017875479146769305,
"loss": 0.5162,
"step": 330
},
{
"epoch": 0.29036515618125824,
"eval_loss": 0.5689781308174133,
"eval_runtime": 28.6221,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 330
},
{
"epoch": 0.29476462824461064,
"grad_norm": 0.24004362523555756,
"learning_rate": 0.000177798355902407,
"loss": 0.539,
"step": 335
},
{
"epoch": 0.29476462824461064,
"eval_loss": 0.5678241848945618,
"eval_runtime": 28.5677,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 335
},
{
"epoch": 0.29916410030796303,
"grad_norm": 0.22996000945568085,
"learning_rate": 0.00017682354227004963,
"loss": 0.5002,
"step": 340
},
{
"epoch": 0.29916410030796303,
"eval_loss": 0.5670127272605896,
"eval_runtime": 28.6425,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 340
},
{
"epoch": 0.3035635723713154,
"grad_norm": 0.23163671791553497,
"learning_rate": 0.00017583058084785625,
"loss": 0.5175,
"step": 345
},
{
"epoch": 0.3035635723713154,
"eval_loss": 0.5650352239608765,
"eval_runtime": 28.5994,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 345
},
{
"epoch": 0.3079630444346678,
"grad_norm": 0.20120489597320557,
"learning_rate": 0.00017481970620005912,
"loss": 0.5269,
"step": 350
},
{
"epoch": 0.3079630444346678,
"eval_loss": 0.5640237927436829,
"eval_runtime": 28.5009,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 350
},
{
"epoch": 0.3123625164980202,
"grad_norm": 0.22231583297252655,
"learning_rate": 0.00017379115712247675,
"loss": 0.5444,
"step": 355
},
{
"epoch": 0.3123625164980202,
"eval_loss": 0.5634257197380066,
"eval_runtime": 28.5722,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 355
},
{
"epoch": 0.3167619885613726,
"grad_norm": 0.216331347823143,
"learning_rate": 0.00017274517658610398,
"loss": 0.5074,
"step": 360
},
{
"epoch": 0.3167619885613726,
"eval_loss": 0.5618783831596375,
"eval_runtime": 28.6759,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 360
},
{
"epoch": 0.32116146062472506,
"grad_norm": 0.21976010501384735,
"learning_rate": 0.0001716820116797158,
"loss": 0.5259,
"step": 365
},
{
"epoch": 0.32116146062472506,
"eval_loss": 0.5602042078971863,
"eval_runtime": 28.6019,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 365
},
{
"epoch": 0.32556093268807745,
"grad_norm": 0.22740119695663452,
"learning_rate": 0.0001706019135514982,
"loss": 0.5158,
"step": 370
},
{
"epoch": 0.32556093268807745,
"eval_loss": 0.5599080920219421,
"eval_runtime": 28.5177,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 370
},
{
"epoch": 0.32996040475142985,
"grad_norm": 0.21888501942157745,
"learning_rate": 0.0001695051373497202,
"loss": 0.527,
"step": 375
},
{
"epoch": 0.32996040475142985,
"eval_loss": 0.558814525604248,
"eval_runtime": 28.661,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 375
},
{
"epoch": 0.33435987681478224,
"grad_norm": 0.20402850210666656,
"learning_rate": 0.00016839194216246108,
"loss": 0.5027,
"step": 380
},
{
"epoch": 0.33435987681478224,
"eval_loss": 0.5578404664993286,
"eval_runtime": 28.5421,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 380
},
{
"epoch": 0.33875934887813464,
"grad_norm": 0.20368748903274536,
"learning_rate": 0.00016726259095640664,
"loss": 0.505,
"step": 385
},
{
"epoch": 0.33875934887813464,
"eval_loss": 0.5567160844802856,
"eval_runtime": 28.6126,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 385
},
{
"epoch": 0.34315882094148703,
"grad_norm": 0.2069130390882492,
"learning_rate": 0.0001661173505147295,
"loss": 0.5086,
"step": 390
},
{
"epoch": 0.34315882094148703,
"eval_loss": 0.55617755651474,
"eval_runtime": 28.4879,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 390
},
{
"epoch": 0.3475582930048394,
"grad_norm": 0.23644201457500458,
"learning_rate": 0.00016495649137406772,
"loss": 0.5412,
"step": 395
},
{
"epoch": 0.3475582930048394,
"eval_loss": 0.5556927919387817,
"eval_runtime": 28.6713,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 395
},
{
"epoch": 0.3519577650681918,
"grad_norm": 0.21997737884521484,
"learning_rate": 0.00016378028776061667,
"loss": 0.4908,
"step": 400
},
{
"epoch": 0.3519577650681918,
"eval_loss": 0.5555915832519531,
"eval_runtime": 28.596,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 400
},
{
"epoch": 0.3563572371315442,
"grad_norm": 0.22075805068016052,
"learning_rate": 0.00016258901752534948,
"loss": 0.5155,
"step": 405
},
{
"epoch": 0.3563572371315442,
"eval_loss": 0.5552019476890564,
"eval_runtime": 28.595,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 405
},
{
"epoch": 0.3607567091948966,
"grad_norm": 0.5917304158210754,
"learning_rate": 0.00016138296207838127,
"loss": 0.4991,
"step": 410
},
{
"epoch": 0.3607567091948966,
"eval_loss": 0.5550567507743835,
"eval_runtime": 28.6222,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 410
},
{
"epoch": 0.365156181258249,
"grad_norm": 0.21421152353286743,
"learning_rate": 0.00016016240632249224,
"loss": 0.4769,
"step": 415
},
{
"epoch": 0.365156181258249,
"eval_loss": 0.5548796653747559,
"eval_runtime": 28.5933,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 415
},
{
"epoch": 0.3695556533216014,
"grad_norm": 0.201774463057518,
"learning_rate": 0.0001589276385858262,
"loss": 0.4914,
"step": 420
},
{
"epoch": 0.3695556533216014,
"eval_loss": 0.5546624064445496,
"eval_runtime": 28.5213,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 420
},
{
"epoch": 0.3739551253849538,
"grad_norm": 0.22172759473323822,
"learning_rate": 0.0001576789505537795,
"loss": 0.4726,
"step": 425
},
{
"epoch": 0.3739551253849538,
"eval_loss": 0.5535080432891846,
"eval_runtime": 28.6645,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 425
},
{
"epoch": 0.3783545974483062,
"grad_norm": 0.23269815742969513,
"learning_rate": 0.00015641663720009733,
"loss": 0.5076,
"step": 430
},
{
"epoch": 0.3783545974483062,
"eval_loss": 0.5522862076759338,
"eval_runtime": 28.5697,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 430
},
{
"epoch": 0.3827540695116586,
"grad_norm": 0.23303498327732086,
"learning_rate": 0.00015514099671719268,
"loss": 0.5064,
"step": 435
},
{
"epoch": 0.3827540695116586,
"eval_loss": 0.5502522587776184,
"eval_runtime": 28.5369,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 435
},
{
"epoch": 0.387153541575011,
"grad_norm": 0.24087387323379517,
"learning_rate": 0.00015385233044570555,
"loss": 0.5361,
"step": 440
},
{
"epoch": 0.387153541575011,
"eval_loss": 0.5471201539039612,
"eval_runtime": 28.5791,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 440
},
{
"epoch": 0.3915530136383634,
"grad_norm": 0.20800553262233734,
"learning_rate": 0.00015255094280331797,
"loss": 0.5169,
"step": 445
},
{
"epoch": 0.3915530136383634,
"eval_loss": 0.5466722846031189,
"eval_runtime": 28.6339,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 445
},
{
"epoch": 0.3959524857017158,
"grad_norm": 0.37092360854148865,
"learning_rate": 0.0001512371412128424,
"loss": 0.5362,
"step": 450
},
{
"epoch": 0.3959524857017158,
"eval_loss": 0.5455148220062256,
"eval_runtime": 28.637,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 450
},
{
"epoch": 0.4003519577650682,
"grad_norm": 0.20706337690353394,
"learning_rate": 0.00014991123602960018,
"loss": 0.4994,
"step": 455
},
{
"epoch": 0.4003519577650682,
"eval_loss": 0.5440109968185425,
"eval_runtime": 28.5672,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 455
},
{
"epoch": 0.4047514298284206,
"grad_norm": 0.2135256677865982,
"learning_rate": 0.00014857354046810732,
"loss": 0.5005,
"step": 460
},
{
"epoch": 0.4047514298284206,
"eval_loss": 0.5431147813796997,
"eval_runtime": 28.4835,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 460
},
{
"epoch": 0.409150901891773,
"grad_norm": 0.5737074613571167,
"learning_rate": 0.00014722437052808472,
"loss": 0.5208,
"step": 465
},
{
"epoch": 0.409150901891773,
"eval_loss": 0.541969358921051,
"eval_runtime": 28.6004,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 465
},
{
"epoch": 0.4135503739551254,
"grad_norm": 0.24099959433078766,
"learning_rate": 0.00014586404491981052,
"loss": 0.5074,
"step": 470
},
{
"epoch": 0.4135503739551254,
"eval_loss": 0.5449388027191162,
"eval_runtime": 28.658,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 470
},
{
"epoch": 0.4179498460184778,
"grad_norm": 0.2046642154455185,
"learning_rate": 0.0001444928849888321,
"loss": 0.5052,
"step": 475
},
{
"epoch": 0.4179498460184778,
"eval_loss": 0.5407991409301758,
"eval_runtime": 28.5688,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 475
},
{
"epoch": 0.4223493180818302,
"grad_norm": 0.2824171185493469,
"learning_rate": 0.00014311121464005583,
"loss": 0.5179,
"step": 480
},
{
"epoch": 0.4223493180818302,
"eval_loss": 0.54000324010849,
"eval_runtime": 28.7144,
"eval_samples_per_second": 0.592,
"eval_steps_per_second": 0.313,
"step": 480
},
{
"epoch": 0.4267487901451826,
"grad_norm": 0.2045980840921402,
"learning_rate": 0.00014171936026123168,
"loss": 0.4634,
"step": 485
},
{
"epoch": 0.4267487901451826,
"eval_loss": 0.5398800373077393,
"eval_runtime": 28.5209,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 485
},
{
"epoch": 0.43114826220853497,
"grad_norm": 0.2092169225215912,
"learning_rate": 0.00014031765064585197,
"loss": 0.4802,
"step": 490
},
{
"epoch": 0.43114826220853497,
"eval_loss": 0.5395181179046631,
"eval_runtime": 28.5086,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 490
},
{
"epoch": 0.43554773427188737,
"grad_norm": 0.20700140297412872,
"learning_rate": 0.00013890641691548114,
"loss": 0.4962,
"step": 495
},
{
"epoch": 0.43554773427188737,
"eval_loss": 0.5390854477882385,
"eval_runtime": 28.5682,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 495
},
{
"epoch": 0.43994720633523976,
"grad_norm": 0.19903522729873657,
"learning_rate": 0.00013748599244153633,
"loss": 0.4841,
"step": 500
},
{
"epoch": 0.43994720633523976,
"eval_loss": 0.5381758213043213,
"eval_runtime": 29.4274,
"eval_samples_per_second": 0.578,
"eval_steps_per_second": 0.306,
"step": 500
},
{
"epoch": 0.44434667839859215,
"grad_norm": 0.4766729474067688,
"learning_rate": 0.00013605671276653567,
"loss": 0.5252,
"step": 505
},
{
"epoch": 0.44434667839859215,
"eval_loss": 0.5368968844413757,
"eval_runtime": 28.6474,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 505
},
{
"epoch": 0.44874615046194455,
"grad_norm": 0.21688155829906464,
"learning_rate": 0.00013461891552483444,
"loss": 0.515,
"step": 510
},
{
"epoch": 0.44874615046194455,
"eval_loss": 0.5366407036781311,
"eval_runtime": 28.5352,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 510
},
{
"epoch": 0.45314562252529694,
"grad_norm": 0.20375116169452667,
"learning_rate": 0.00013317294036286644,
"loss": 0.4887,
"step": 515
},
{
"epoch": 0.45314562252529694,
"eval_loss": 0.5360764861106873,
"eval_runtime": 28.6533,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 515
},
{
"epoch": 0.45754509458864934,
"grad_norm": 0.1958196461200714,
"learning_rate": 0.00013171912885891063,
"loss": 0.4868,
"step": 520
},
{
"epoch": 0.45754509458864934,
"eval_loss": 0.5356424450874329,
"eval_runtime": 28.5027,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 520
},
{
"epoch": 0.4619445666520018,
"grad_norm": 0.22040507197380066,
"learning_rate": 0.00013025782444240087,
"loss": 0.5086,
"step": 525
},
{
"epoch": 0.4619445666520018,
"eval_loss": 0.5351347327232361,
"eval_runtime": 28.6428,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 525
},
{
"epoch": 0.4663440387153542,
"grad_norm": 0.19495758414268494,
"learning_rate": 0.00012878937231279892,
"loss": 0.5113,
"step": 530
},
{
"epoch": 0.4663440387153542,
"eval_loss": 0.5347647070884705,
"eval_runtime": 28.6252,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 530
},
{
"epoch": 0.4707435107787066,
"grad_norm": 0.21149738132953644,
"learning_rate": 0.0001273141193580488,
"loss": 0.483,
"step": 535
},
{
"epoch": 0.4707435107787066,
"eval_loss": 0.5339221954345703,
"eval_runtime": 28.6055,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 535
},
{
"epoch": 0.47514298284205897,
"grad_norm": 0.20391018688678741,
"learning_rate": 0.0001258324140726326,
"loss": 0.4728,
"step": 540
},
{
"epoch": 0.47514298284205897,
"eval_loss": 0.5337977409362793,
"eval_runtime": 28.5842,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 540
},
{
"epoch": 0.47954245490541136,
"grad_norm": 0.20913545787334442,
"learning_rate": 0.00012434460647524676,
"loss": 0.5016,
"step": 545
},
{
"epoch": 0.47954245490541136,
"eval_loss": 0.532899022102356,
"eval_runtime": 28.4759,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 545
},
{
"epoch": 0.48394192696876376,
"grad_norm": 0.19410260021686554,
"learning_rate": 0.00012285104802611812,
"loss": 0.5103,
"step": 550
},
{
"epoch": 0.48394192696876376,
"eval_loss": 0.5321294665336609,
"eval_runtime": 28.5662,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 550
},
{
"epoch": 0.48834139903211615,
"grad_norm": 0.2097245752811432,
"learning_rate": 0.00012135209154397962,
"loss": 0.4954,
"step": 555
},
{
"epoch": 0.48834139903211615,
"eval_loss": 0.532034695148468,
"eval_runtime": 28.652,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 555
},
{
"epoch": 0.49274087109546855,
"grad_norm": 0.21518121659755707,
"learning_rate": 0.00011984809112272495,
"loss": 0.4999,
"step": 560
},
{
"epoch": 0.49274087109546855,
"eval_loss": 0.5313233733177185,
"eval_runtime": 28.5662,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 560
},
{
"epoch": 0.49714034315882094,
"grad_norm": 0.19571034610271454,
"learning_rate": 0.00011833940204776209,
"loss": 0.4931,
"step": 565
},
{
"epoch": 0.49714034315882094,
"eval_loss": 0.5311394333839417,
"eval_runtime": 28.5352,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 565
},
{
"epoch": 0.5015398152221734,
"grad_norm": 0.20554794371128082,
"learning_rate": 0.00011682638071208533,
"loss": 0.4833,
"step": 570
},
{
"epoch": 0.5015398152221734,
"eval_loss": 0.5300410389900208,
"eval_runtime": 28.5679,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 570
},
{
"epoch": 0.5059392872855257,
"grad_norm": 0.20373423397541046,
"learning_rate": 0.00011530938453208559,
"loss": 0.5057,
"step": 575
},
{
"epoch": 0.5059392872855257,
"eval_loss": 0.5300309658050537,
"eval_runtime": 28.5821,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 575
},
{
"epoch": 0.5103387593488782,
"grad_norm": 0.1982477903366089,
"learning_rate": 0.00011378877186311912,
"loss": 0.4754,
"step": 580
},
{
"epoch": 0.5103387593488782,
"eval_loss": 0.5292160511016846,
"eval_runtime": 28.5256,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 580
},
{
"epoch": 0.5147382314122305,
"grad_norm": 0.20576219260692596,
"learning_rate": 0.00011226490191485421,
"loss": 0.4991,
"step": 585
},
{
"epoch": 0.5147382314122305,
"eval_loss": 0.5280917882919312,
"eval_runtime": 28.6835,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 585
},
{
"epoch": 0.519137703475583,
"grad_norm": 0.2154638022184372,
"learning_rate": 0.00011073813466641632,
"loss": 0.4811,
"step": 590
},
{
"epoch": 0.519137703475583,
"eval_loss": 0.5274674296379089,
"eval_runtime": 28.4766,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 590
},
{
"epoch": 0.5235371755389353,
"grad_norm": 0.2037007063627243,
"learning_rate": 0.00010920883078135117,
"loss": 0.4717,
"step": 595
},
{
"epoch": 0.5235371755389353,
"eval_loss": 0.5270927548408508,
"eval_runtime": 28.5377,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 595
},
{
"epoch": 0.5279366476022878,
"grad_norm": 0.21386198699474335,
"learning_rate": 0.00010767735152242649,
"loss": 0.4776,
"step": 600
},
{
"epoch": 0.5279366476022878,
"eval_loss": 0.526791512966156,
"eval_runtime": 28.596,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 600
},
{
"epoch": 0.5323361196656401,
"grad_norm": 0.1984720528125763,
"learning_rate": 0.0001061440586662917,
"loss": 0.4708,
"step": 605
},
{
"epoch": 0.5323361196656401,
"eval_loss": 0.5266034007072449,
"eval_runtime": 28.6491,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 605
},
{
"epoch": 0.5367355917289925,
"grad_norm": 0.19453096389770508,
"learning_rate": 0.000104609314418017,
"loss": 0.4659,
"step": 610
},
{
"epoch": 0.5367355917289925,
"eval_loss": 0.5267328023910522,
"eval_runtime": 28.6358,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 610
},
{
"epoch": 0.5411350637923449,
"grad_norm": 0.2048104703426361,
"learning_rate": 0.00010307348132553025,
"loss": 0.5138,
"step": 615
},
{
"epoch": 0.5411350637923449,
"eval_loss": 0.5270944833755493,
"eval_runtime": 28.5902,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 615
},
{
"epoch": 0.5455345358556973,
"grad_norm": 0.1899915337562561,
"learning_rate": 0.00010153692219397387,
"loss": 0.4797,
"step": 620
},
{
"epoch": 0.5455345358556973,
"eval_loss": 0.5260502099990845,
"eval_runtime": 28.5533,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 620
},
{
"epoch": 0.5499340079190497,
"grad_norm": 0.18520919978618622,
"learning_rate": 0.0001,
"loss": 0.5068,
"step": 625
},
{
"epoch": 0.5499340079190497,
"eval_loss": 0.5251287817955017,
"eval_runtime": 28.4846,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 625
},
{
"epoch": 0.5543334799824021,
"grad_norm": 0.21325986087322235,
"learning_rate": 9.84630778060262e-05,
"loss": 0.4799,
"step": 630
},
{
"epoch": 0.5543334799824021,
"eval_loss": 0.524385929107666,
"eval_runtime": 28.5917,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 630
},
{
"epoch": 0.5587329520457545,
"grad_norm": 0.20572926104068756,
"learning_rate": 9.692651867446973e-05,
"loss": 0.49,
"step": 635
},
{
"epoch": 0.5587329520457545,
"eval_loss": 0.523975133895874,
"eval_runtime": 28.6052,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 635
},
{
"epoch": 0.5631324241091069,
"grad_norm": 0.20347937941551208,
"learning_rate": 9.539068558198304e-05,
"loss": 0.4702,
"step": 640
},
{
"epoch": 0.5631324241091069,
"eval_loss": 0.5229539275169373,
"eval_runtime": 28.6223,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 640
},
{
"epoch": 0.5675318961724594,
"grad_norm": 0.21256154775619507,
"learning_rate": 9.38559413337083e-05,
"loss": 0.4736,
"step": 645
},
{
"epoch": 0.5675318961724594,
"eval_loss": 0.5221072435379028,
"eval_runtime": 28.6189,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 645
},
{
"epoch": 0.5719313682358117,
"grad_norm": 0.2260565459728241,
"learning_rate": 9.232264847757357e-05,
"loss": 0.5065,
"step": 650
},
{
"epoch": 0.5719313682358117,
"eval_loss": 0.5213314890861511,
"eval_runtime": 28.6771,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 650
},
{
"epoch": 0.5763308402991641,
"grad_norm": 0.21002529561519623,
"learning_rate": 9.079116921864884e-05,
"loss": 0.4796,
"step": 655
},
{
"epoch": 0.5763308402991641,
"eval_loss": 0.5214037299156189,
"eval_runtime": 28.6202,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 655
},
{
"epoch": 0.5807303123625165,
"grad_norm": 0.19340470433235168,
"learning_rate": 8.92618653335837e-05,
"loss": 0.4788,
"step": 660
},
{
"epoch": 0.5807303123625165,
"eval_loss": 0.5211138725280762,
"eval_runtime": 28.6313,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 660
},
{
"epoch": 0.5851297844258689,
"grad_norm": 0.19035720825195312,
"learning_rate": 8.773509808514581e-05,
"loss": 0.468,
"step": 665
},
{
"epoch": 0.5851297844258689,
"eval_loss": 0.5191999077796936,
"eval_runtime": 28.0607,
"eval_samples_per_second": 0.606,
"eval_steps_per_second": 0.321,
"step": 665
},
{
"epoch": 0.5895292564892213,
"grad_norm": 0.19168096780776978,
"learning_rate": 8.62112281368809e-05,
"loss": 0.5066,
"step": 670
},
{
"epoch": 0.5895292564892213,
"eval_loss": 0.5176913142204285,
"eval_runtime": 28.5375,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 670
},
{
"epoch": 0.5939287285525737,
"grad_norm": 0.19758321344852448,
"learning_rate": 8.469061546791442e-05,
"loss": 0.51,
"step": 675
},
{
"epoch": 0.5939287285525737,
"eval_loss": 0.517296314239502,
"eval_runtime": 28.5712,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 675
},
{
"epoch": 0.5983282006159261,
"grad_norm": 0.19562241435050964,
"learning_rate": 8.317361928791469e-05,
"loss": 0.4932,
"step": 680
},
{
"epoch": 0.5983282006159261,
"eval_loss": 0.5170657634735107,
"eval_runtime": 28.4877,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 680
},
{
"epoch": 0.6027276726792785,
"grad_norm": 0.18590031564235687,
"learning_rate": 8.166059795223794e-05,
"loss": 0.5055,
"step": 685
},
{
"epoch": 0.6027276726792785,
"eval_loss": 0.5166193842887878,
"eval_runtime": 28.625,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 685
},
{
"epoch": 0.6071271447426309,
"grad_norm": 0.2049984484910965,
"learning_rate": 8.015190887727509e-05,
"loss": 0.4846,
"step": 690
},
{
"epoch": 0.6071271447426309,
"eval_loss": 0.5160765647888184,
"eval_runtime": 28.5582,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 690
},
{
"epoch": 0.6115266168059833,
"grad_norm": 0.19373777508735657,
"learning_rate": 7.864790845602039e-05,
"loss": 0.4862,
"step": 695
},
{
"epoch": 0.6115266168059833,
"eval_loss": 0.5157306790351868,
"eval_runtime": 28.6078,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 695
},
{
"epoch": 0.6159260888693356,
"grad_norm": 0.20326727628707886,
"learning_rate": 7.714895197388189e-05,
"loss": 0.5064,
"step": 700
},
{
"epoch": 0.6159260888693356,
"eval_loss": 0.5153770446777344,
"eval_runtime": 28.6597,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 700
},
{
"epoch": 0.6203255609326881,
"grad_norm": 0.19425565004348755,
"learning_rate": 7.565539352475326e-05,
"loss": 0.5018,
"step": 705
},
{
"epoch": 0.6203255609326881,
"eval_loss": 0.5147074460983276,
"eval_runtime": 28.5261,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 705
},
{
"epoch": 0.6247250329960404,
"grad_norm": 0.19491039216518402,
"learning_rate": 7.416758592736744e-05,
"loss": 0.482,
"step": 710
},
{
"epoch": 0.6247250329960404,
"eval_loss": 0.5144516229629517,
"eval_runtime": 28.533,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 710
},
{
"epoch": 0.6291245050593929,
"grad_norm": 0.1957363337278366,
"learning_rate": 7.268588064195122e-05,
"loss": 0.4883,
"step": 715
},
{
"epoch": 0.6291245050593929,
"eval_loss": 0.5139791965484619,
"eval_runtime": 28.5313,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 715
},
{
"epoch": 0.6335239771227452,
"grad_norm": 0.21253836154937744,
"learning_rate": 7.12106276872011e-05,
"loss": 0.4768,
"step": 720
},
{
"epoch": 0.6335239771227452,
"eval_loss": 0.5137556195259094,
"eval_runtime": 28.6307,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 720
},
{
"epoch": 0.6379234491860977,
"grad_norm": 0.1721029132604599,
"learning_rate": 6.974217555759915e-05,
"loss": 0.4816,
"step": 725
},
{
"epoch": 0.6379234491860977,
"eval_loss": 0.5133811831474304,
"eval_runtime": 28.5925,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 725
},
{
"epoch": 0.6423229212494501,
"grad_norm": 0.19211679697036743,
"learning_rate": 6.82808711410894e-05,
"loss": 0.5035,
"step": 730
},
{
"epoch": 0.6423229212494501,
"eval_loss": 0.5132091641426086,
"eval_runtime": 28.5078,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 730
},
{
"epoch": 0.6467223933128025,
"grad_norm": 0.19252945482730865,
"learning_rate": 6.682705963713356e-05,
"loss": 0.4822,
"step": 735
},
{
"epoch": 0.6467223933128025,
"eval_loss": 0.5131357908248901,
"eval_runtime": 28.6326,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 735
},
{
"epoch": 0.6511218653761549,
"grad_norm": 0.1986207813024521,
"learning_rate": 6.538108447516558e-05,
"loss": 0.4612,
"step": 740
},
{
"epoch": 0.6511218653761549,
"eval_loss": 0.5128303170204163,
"eval_runtime": 28.6066,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 740
},
{
"epoch": 0.6555213374395072,
"grad_norm": 0.19202682375907898,
"learning_rate": 6.394328723346434e-05,
"loss": 0.4578,
"step": 745
},
{
"epoch": 0.6555213374395072,
"eval_loss": 0.5124692916870117,
"eval_runtime": 28.6064,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 745
},
{
"epoch": 0.6599208095028597,
"grad_norm": 0.198526531457901,
"learning_rate": 6.251400755846372e-05,
"loss": 0.5176,
"step": 750
},
{
"epoch": 0.6599208095028597,
"eval_loss": 0.5121349096298218,
"eval_runtime": 28.5313,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 750
},
{
"epoch": 0.664320281566212,
"grad_norm": 0.19058994948863983,
"learning_rate": 6.109358308451885e-05,
"loss": 0.4877,
"step": 755
},
{
"epoch": 0.664320281566212,
"eval_loss": 0.5118634700775146,
"eval_runtime": 28.5287,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 755
},
{
"epoch": 0.6687197536295645,
"grad_norm": 0.1798192411661148,
"learning_rate": 5.968234935414807e-05,
"loss": 0.4805,
"step": 760
},
{
"epoch": 0.6687197536295645,
"eval_loss": 0.5116167664527893,
"eval_runtime": 28.5918,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 760
},
{
"epoch": 0.6731192256929168,
"grad_norm": 0.18448549509048462,
"learning_rate": 5.828063973876834e-05,
"loss": 0.4993,
"step": 765
},
{
"epoch": 0.6731192256929168,
"eval_loss": 0.5111361742019653,
"eval_runtime": 28.5586,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 765
},
{
"epoch": 0.6775186977562693,
"grad_norm": 0.18624383211135864,
"learning_rate": 5.688878535994421e-05,
"loss": 0.4844,
"step": 770
},
{
"epoch": 0.6775186977562693,
"eval_loss": 0.5107051134109497,
"eval_runtime": 28.5748,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 770
},
{
"epoch": 0.6819181698196216,
"grad_norm": 0.18364666402339935,
"learning_rate": 5.550711501116789e-05,
"loss": 0.4674,
"step": 775
},
{
"epoch": 0.6819181698196216,
"eval_loss": 0.5101103186607361,
"eval_runtime": 28.5159,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 775
},
{
"epoch": 0.6863176418829741,
"grad_norm": 0.23952247202396393,
"learning_rate": 5.413595508018952e-05,
"loss": 0.4943,
"step": 780
},
{
"epoch": 0.6863176418829741,
"eval_loss": 0.5096238255500793,
"eval_runtime": 28.516,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 780
},
{
"epoch": 0.6907171139463264,
"grad_norm": 0.20105206966400146,
"learning_rate": 5.27756294719153e-05,
"loss": 0.4924,
"step": 785
},
{
"epoch": 0.6907171139463264,
"eval_loss": 0.5093135237693787,
"eval_runtime": 28.5941,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 785
},
{
"epoch": 0.6951165860096788,
"grad_norm": 0.19826586544513702,
"learning_rate": 5.1426459531892714e-05,
"loss": 0.4986,
"step": 790
},
{
"epoch": 0.6951165860096788,
"eval_loss": 0.5086015462875366,
"eval_runtime": 28.6207,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 790
},
{
"epoch": 0.6995160580730312,
"grad_norm": 0.17991924285888672,
"learning_rate": 5.008876397039983e-05,
"loss": 0.4698,
"step": 795
},
{
"epoch": 0.6995160580730312,
"eval_loss": 0.5082879662513733,
"eval_runtime": 28.6587,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 795
},
{
"epoch": 0.7039155301363836,
"grad_norm": 0.19232523441314697,
"learning_rate": 4.876285878715764e-05,
"loss": 0.4981,
"step": 800
},
{
"epoch": 0.7039155301363836,
"eval_loss": 0.5078893899917603,
"eval_runtime": 28.5038,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 800
},
{
"epoch": 0.708315002199736,
"grad_norm": 0.19006720185279846,
"learning_rate": 4.744905719668207e-05,
"loss": 0.4758,
"step": 805
},
{
"epoch": 0.708315002199736,
"eval_loss": 0.5076141357421875,
"eval_runtime": 28.6324,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 805
},
{
"epoch": 0.7127144742630884,
"grad_norm": 0.19002890586853027,
"learning_rate": 4.614766955429447e-05,
"loss": 0.4642,
"step": 810
},
{
"epoch": 0.7127144742630884,
"eval_loss": 0.507789671421051,
"eval_runtime": 28.6356,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 810
},
{
"epoch": 0.7171139463264409,
"grad_norm": 0.2051495909690857,
"learning_rate": 4.485900328280731e-05,
"loss": 0.4669,
"step": 815
},
{
"epoch": 0.7171139463264409,
"eval_loss": 0.5073484182357788,
"eval_runtime": 28.5748,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 815
},
{
"epoch": 0.7215134183897932,
"grad_norm": 0.6378114223480225,
"learning_rate": 4.358336279990268e-05,
"loss": 0.4711,
"step": 820
},
{
"epoch": 0.7215134183897932,
"eval_loss": 0.5070581436157227,
"eval_runtime": 28.6233,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 820
},
{
"epoch": 0.7259128904531457,
"grad_norm": 0.181978240609169,
"learning_rate": 4.2321049446220505e-05,
"loss": 0.4704,
"step": 825
},
{
"epoch": 0.7259128904531457,
"eval_loss": 0.5068845748901367,
"eval_runtime": 28.5225,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 825
},
{
"epoch": 0.730312362516498,
"grad_norm": 0.1777966171503067,
"learning_rate": 4.107236141417382e-05,
"loss": 0.4752,
"step": 830
},
{
"epoch": 0.730312362516498,
"eval_loss": 0.5066249966621399,
"eval_runtime": 28.5423,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 830
},
{
"epoch": 0.7347118345798505,
"grad_norm": 0.18686190247535706,
"learning_rate": 3.9837593677507726e-05,
"loss": 0.4621,
"step": 835
},
{
"epoch": 0.7347118345798505,
"eval_loss": 0.5066962242126465,
"eval_runtime": 28.428,
"eval_samples_per_second": 0.598,
"eval_steps_per_second": 0.317,
"step": 835
},
{
"epoch": 0.7391113066432028,
"grad_norm": 0.18854567408561707,
"learning_rate": 3.8617037921618705e-05,
"loss": 0.4748,
"step": 840
},
{
"epoch": 0.7391113066432028,
"eval_loss": 0.50632643699646,
"eval_runtime": 28.5075,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 840
},
{
"epoch": 0.7435107787065552,
"grad_norm": 0.19204109907150269,
"learning_rate": 3.741098247465049e-05,
"loss": 0.4948,
"step": 845
},
{
"epoch": 0.7435107787065552,
"eval_loss": 0.5060507655143738,
"eval_runtime": 28.5753,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 845
},
{
"epoch": 0.7479102507699076,
"grad_norm": 0.19182614982128143,
"learning_rate": 3.621971223938334e-05,
"loss": 0.4832,
"step": 850
},
{
"epoch": 0.7479102507699076,
"eval_loss": 0.5058286190032959,
"eval_runtime": 28.5184,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 850
},
{
"epoch": 0.75230972283326,
"grad_norm": 0.18205444514751434,
"learning_rate": 3.504350862593231e-05,
"loss": 0.4642,
"step": 855
},
{
"epoch": 0.75230972283326,
"eval_loss": 0.505698025226593,
"eval_runtime": 28.6382,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 855
},
{
"epoch": 0.7567091948966124,
"grad_norm": 0.20196740329265594,
"learning_rate": 3.388264948527052e-05,
"loss": 0.4877,
"step": 860
},
{
"epoch": 0.7567091948966124,
"eval_loss": 0.5052359700202942,
"eval_runtime": 28.5347,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 860
},
{
"epoch": 0.7611086669599648,
"grad_norm": 0.18125030398368835,
"learning_rate": 3.2737409043593405e-05,
"loss": 0.4727,
"step": 865
},
{
"epoch": 0.7611086669599648,
"eval_loss": 0.504954993724823,
"eval_runtime": 28.5976,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 865
},
{
"epoch": 0.7655081390233172,
"grad_norm": 0.18927669525146484,
"learning_rate": 3.160805783753897e-05,
"loss": 0.4691,
"step": 870
},
{
"epoch": 0.7655081390233172,
"eval_loss": 0.5047942399978638,
"eval_runtime": 28.5051,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 870
},
{
"epoch": 0.7699076110866696,
"grad_norm": 0.18508534133434296,
"learning_rate": 3.0494862650279822e-05,
"loss": 0.5292,
"step": 875
},
{
"epoch": 0.7699076110866696,
"eval_loss": 0.5046341419219971,
"eval_runtime": 28.5445,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 875
},
{
"epoch": 0.774307083150022,
"grad_norm": 0.18230414390563965,
"learning_rate": 2.939808644850184e-05,
"loss": 0.4708,
"step": 880
},
{
"epoch": 0.774307083150022,
"eval_loss": 0.5046290755271912,
"eval_runtime": 28.6138,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 880
},
{
"epoch": 0.7787065552133744,
"grad_norm": 0.17352643609046936,
"learning_rate": 2.8317988320284228e-05,
"loss": 0.4863,
"step": 885
},
{
"epoch": 0.7787065552133744,
"eval_loss": 0.5044691562652588,
"eval_runtime": 28.6321,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 885
},
{
"epoch": 0.7831060272767268,
"grad_norm": 0.1845002919435501,
"learning_rate": 2.7254823413896058e-05,
"loss": 0.5006,
"step": 890
},
{
"epoch": 0.7831060272767268,
"eval_loss": 0.5042091012001038,
"eval_runtime": 28.6132,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 890
},
{
"epoch": 0.7875054993400792,
"grad_norm": 0.17883773148059845,
"learning_rate": 2.6208842877523278e-05,
"loss": 0.4887,
"step": 895
},
{
"epoch": 0.7875054993400792,
"eval_loss": 0.5039156675338745,
"eval_runtime": 28.5693,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 895
},
{
"epoch": 0.7919049714034316,
"grad_norm": 0.19202597439289093,
"learning_rate": 2.518029379994089e-05,
"loss": 0.4867,
"step": 900
},
{
"epoch": 0.7919049714034316,
"eval_loss": 0.5037320852279663,
"eval_runtime": 28.549,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 900
},
{
"epoch": 0.796304443466784,
"grad_norm": 0.18246056139469147,
"learning_rate": 2.4169419152143768e-05,
"loss": 0.4662,
"step": 905
},
{
"epoch": 0.796304443466784,
"eval_loss": 0.5035374164581299,
"eval_runtime": 28.6042,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 905
},
{
"epoch": 0.8007039155301364,
"grad_norm": 0.18989378213882446,
"learning_rate": 2.317645772995042e-05,
"loss": 0.4744,
"step": 910
},
{
"epoch": 0.8007039155301364,
"eval_loss": 0.5033923387527466,
"eval_runtime": 28.4795,
"eval_samples_per_second": 0.597,
"eval_steps_per_second": 0.316,
"step": 910
},
{
"epoch": 0.8051033875934888,
"grad_norm": 0.19525018334388733,
"learning_rate": 2.220164409759299e-05,
"loss": 0.5159,
"step": 915
},
{
"epoch": 0.8051033875934888,
"eval_loss": 0.503151535987854,
"eval_runtime": 28.6198,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 915
},
{
"epoch": 0.8095028596568412,
"grad_norm": 0.18840977549552917,
"learning_rate": 2.124520853230697e-05,
"loss": 0.4848,
"step": 920
},
{
"epoch": 0.8095028596568412,
"eval_loss": 0.5029481649398804,
"eval_runtime": 28.614,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 920
},
{
"epoch": 0.8139023317201936,
"grad_norm": 0.18055056035518646,
"learning_rate": 2.03073769699333e-05,
"loss": 0.4648,
"step": 925
},
{
"epoch": 0.8139023317201936,
"eval_loss": 0.5028063654899597,
"eval_runtime": 28.5662,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 925
},
{
"epoch": 0.818301803783546,
"grad_norm": 0.18352611362934113,
"learning_rate": 1.9388370951546432e-05,
"loss": 0.4733,
"step": 930
},
{
"epoch": 0.818301803783546,
"eval_loss": 0.5027296543121338,
"eval_runtime": 28.5532,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 930
},
{
"epoch": 0.8227012758468983,
"grad_norm": 0.18161964416503906,
"learning_rate": 1.848840757112019e-05,
"loss": 0.4556,
"step": 935
},
{
"epoch": 0.8227012758468983,
"eval_loss": 0.5025849342346191,
"eval_runtime": 28.6672,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 935
},
{
"epoch": 0.8271007479102508,
"grad_norm": 0.19485127925872803,
"learning_rate": 1.7607699424244585e-05,
"loss": 0.4973,
"step": 940
},
{
"epoch": 0.8271007479102508,
"eval_loss": 0.5023777484893799,
"eval_runtime": 28.5856,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 940
},
{
"epoch": 0.8315002199736031,
"grad_norm": 0.19218072295188904,
"learning_rate": 1.674645455790468e-05,
"loss": 0.4708,
"step": 945
},
{
"epoch": 0.8315002199736031,
"eval_loss": 0.5024308562278748,
"eval_runtime": 28.6001,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 945
},
{
"epoch": 0.8358996920369556,
"grad_norm": 0.18270643055438995,
"learning_rate": 1.5904876421334536e-05,
"loss": 0.4547,
"step": 950
},
{
"epoch": 0.8358996920369556,
"eval_loss": 0.5024178624153137,
"eval_runtime": 28.5464,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 950
},
{
"epoch": 0.8402991641003079,
"grad_norm": 0.18350371718406677,
"learning_rate": 1.5083163817956914e-05,
"loss": 0.4663,
"step": 955
},
{
"epoch": 0.8402991641003079,
"eval_loss": 0.5021481513977051,
"eval_runtime": 28.5783,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 955
},
{
"epoch": 0.8446986361636604,
"grad_norm": 0.18115630745887756,
"learning_rate": 1.4281510858420632e-05,
"loss": 0.4857,
"step": 960
},
{
"epoch": 0.8446986361636604,
"eval_loss": 0.5019457340240479,
"eval_runtime": 28.5976,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 960
},
{
"epoch": 0.8490981082270127,
"grad_norm": 0.1744571477174759,
"learning_rate": 1.350010691474629e-05,
"loss": 0.4633,
"step": 965
},
{
"epoch": 0.8490981082270127,
"eval_loss": 0.5019629597663879,
"eval_runtime": 28.5207,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 965
},
{
"epoch": 0.8534975802903652,
"grad_norm": 0.18827442824840546,
"learning_rate": 1.2739136575591581e-05,
"loss": 0.4723,
"step": 970
},
{
"epoch": 0.8534975802903652,
"eval_loss": 0.5018792748451233,
"eval_runtime": 28.4515,
"eval_samples_per_second": 0.598,
"eval_steps_per_second": 0.316,
"step": 970
},
{
"epoch": 0.8578970523537176,
"grad_norm": 0.18166576325893402,
"learning_rate": 1.1998779602646437e-05,
"loss": 0.4691,
"step": 975
},
{
"epoch": 0.8578970523537176,
"eval_loss": 0.5017500519752502,
"eval_runtime": 28.5978,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 975
},
{
"epoch": 0.8622965244170699,
"grad_norm": 0.18091408908367157,
"learning_rate": 1.1279210888168546e-05,
"loss": 0.4874,
"step": 980
},
{
"epoch": 0.8622965244170699,
"eval_loss": 0.5017052888870239,
"eval_runtime": 28.7541,
"eval_samples_per_second": 0.591,
"eval_steps_per_second": 0.313,
"step": 980
},
{
"epoch": 0.8666959964804224,
"grad_norm": 0.182442307472229,
"learning_rate": 1.0580600413668984e-05,
"loss": 0.4773,
"step": 985
},
{
"epoch": 0.8666959964804224,
"eval_loss": 0.5016083121299744,
"eval_runtime": 28.5972,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 985
},
{
"epoch": 0.8710954685437747,
"grad_norm": 0.18171900510787964,
"learning_rate": 9.903113209758096e-06,
"loss": 0.4806,
"step": 990
},
{
"epoch": 0.8710954685437747,
"eval_loss": 0.5015130043029785,
"eval_runtime": 28.5707,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 990
},
{
"epoch": 0.8754949406071272,
"grad_norm": 0.1896371841430664,
"learning_rate": 9.246909317160746e-06,
"loss": 0.4512,
"step": 995
},
{
"epoch": 0.8754949406071272,
"eval_loss": 0.5013110637664795,
"eval_runtime": 28.6509,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.314,
"step": 995
},
{
"epoch": 0.8798944126704795,
"grad_norm": 0.1779976189136505,
"learning_rate": 8.612143748910451e-06,
"loss": 0.4561,
"step": 1000
},
{
"epoch": 0.8798944126704795,
"eval_loss": 0.5013135075569153,
"eval_runtime": 28.8047,
"eval_samples_per_second": 0.59,
"eval_steps_per_second": 0.312,
"step": 1000
},
{
"epoch": 0.884293884733832,
"grad_norm": 0.17416957020759583,
"learning_rate": 7.998966453731094e-06,
"loss": 0.4637,
"step": 1005
},
{
"epoch": 0.884293884733832,
"eval_loss": 0.5013565421104431,
"eval_runtime": 28.5911,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1005
},
{
"epoch": 0.8886933567971843,
"grad_norm": 0.1769402176141739,
"learning_rate": 7.40752228061502e-06,
"loss": 0.4527,
"step": 1010
},
{
"epoch": 0.8886933567971843,
"eval_loss": 0.5010828375816345,
"eval_runtime": 28.5203,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 1010
},
{
"epoch": 0.8930928288605368,
"grad_norm": 0.17784808576107025,
"learning_rate": 6.8379509446057644e-06,
"loss": 0.4903,
"step": 1015
},
{
"epoch": 0.8930928288605368,
"eval_loss": 0.5012202262878418,
"eval_runtime": 27.8441,
"eval_samples_per_second": 0.611,
"eval_steps_per_second": 0.323,
"step": 1015
},
{
"epoch": 0.8974923009238891,
"grad_norm": 0.18067394196987152,
"learning_rate": 6.290386993793618e-06,
"loss": 0.4689,
"step": 1020
},
{
"epoch": 0.8974923009238891,
"eval_loss": 0.5012267231941223,
"eval_runtime": 28.517,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 1020
},
{
"epoch": 0.9018917729872415,
"grad_norm": 0.17478391528129578,
"learning_rate": 5.764959777531776e-06,
"loss": 0.4589,
"step": 1025
},
{
"epoch": 0.9018917729872415,
"eval_loss": 0.5011836290359497,
"eval_runtime": 28.6023,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1025
},
{
"epoch": 0.9062912450505939,
"grad_norm": 0.185857892036438,
"learning_rate": 5.261793415880456e-06,
"loss": 0.4528,
"step": 1030
},
{
"epoch": 0.9062912450505939,
"eval_loss": 0.501183807849884,
"eval_runtime": 28.5159,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.316,
"step": 1030
},
{
"epoch": 0.9106907171139463,
"grad_norm": 0.17951223254203796,
"learning_rate": 4.781006770286478e-06,
"loss": 0.4845,
"step": 1035
},
{
"epoch": 0.9106907171139463,
"eval_loss": 0.5011433959007263,
"eval_runtime": 28.6072,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1035
},
{
"epoch": 0.9150901891772987,
"grad_norm": 0.18096089363098145,
"learning_rate": 4.322713415504975e-06,
"loss": 0.4578,
"step": 1040
},
{
"epoch": 0.9150901891772987,
"eval_loss": 0.5011703968048096,
"eval_runtime": 28.6287,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 1040
},
{
"epoch": 0.9194896612406511,
"grad_norm": 0.2069099247455597,
"learning_rate": 3.887021612769936e-06,
"loss": 0.5027,
"step": 1045
},
{
"epoch": 0.9194896612406511,
"eval_loss": 0.5011240839958191,
"eval_runtime": 29.0514,
"eval_samples_per_second": 0.585,
"eval_steps_per_second": 0.31,
"step": 1045
},
{
"epoch": 0.9238891333040036,
"grad_norm": 0.18762987852096558,
"learning_rate": 3.4740342842199956e-06,
"loss": 0.4695,
"step": 1050
},
{
"epoch": 0.9238891333040036,
"eval_loss": 0.5010772347450256,
"eval_runtime": 28.5655,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1050
},
{
"epoch": 0.9282886053673559,
"grad_norm": 0.178373321890831,
"learning_rate": 3.0838489885854805e-06,
"loss": 0.484,
"step": 1055
},
{
"epoch": 0.9282886053673559,
"eval_loss": 0.5010451674461365,
"eval_runtime": 28.6083,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1055
},
{
"epoch": 0.9326880774307084,
"grad_norm": 0.1794215440750122,
"learning_rate": 2.7165578981424357e-06,
"loss": 0.4784,
"step": 1060
},
{
"epoch": 0.9326880774307084,
"eval_loss": 0.5010905265808105,
"eval_runtime": 28.5675,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1060
},
{
"epoch": 0.9370875494940607,
"grad_norm": 0.17699354887008667,
"learning_rate": 2.3722477769389517e-06,
"loss": 0.4698,
"step": 1065
},
{
"epoch": 0.9370875494940607,
"eval_loss": 0.5010352730751038,
"eval_runtime": 28.6041,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1065
},
{
"epoch": 0.9414870215574132,
"grad_norm": 0.17208220064640045,
"learning_rate": 2.0509999602992493e-06,
"loss": 0.4517,
"step": 1070
},
{
"epoch": 0.9414870215574132,
"eval_loss": 0.5010344982147217,
"eval_runtime": 28.5865,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1070
},
{
"epoch": 0.9458864936207655,
"grad_norm": 0.1774464249610901,
"learning_rate": 1.7528903356100469e-06,
"loss": 0.4846,
"step": 1075
},
{
"epoch": 0.9458864936207655,
"eval_loss": 0.5010223388671875,
"eval_runtime": 28.5634,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1075
},
{
"epoch": 0.9502859656841179,
"grad_norm": 0.1773741990327835,
"learning_rate": 1.4779893243939359e-06,
"loss": 0.4402,
"step": 1080
},
{
"epoch": 0.9502859656841179,
"eval_loss": 0.5009992718696594,
"eval_runtime": 28.5952,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1080
},
{
"epoch": 0.9546854377474703,
"grad_norm": 0.18979211151599884,
"learning_rate": 1.2263618656739084e-06,
"loss": 0.5013,
"step": 1085
},
{
"epoch": 0.9546854377474703,
"eval_loss": 0.501004159450531,
"eval_runtime": 28.614,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1085
},
{
"epoch": 0.9590849098108227,
"grad_norm": 0.1895236372947693,
"learning_rate": 9.98067400632985e-07,
"loss": 0.4588,
"step": 1090
},
{
"epoch": 0.9590849098108227,
"eval_loss": 0.5009981393814087,
"eval_runtime": 28.5601,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1090
},
{
"epoch": 0.9634843818741751,
"grad_norm": 0.17328618466854095,
"learning_rate": 7.931598585726563e-07,
"loss": 0.4712,
"step": 1095
},
{
"epoch": 0.9634843818741751,
"eval_loss": 0.500961184501648,
"eval_runtime": 28.574,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1095
},
{
"epoch": 0.9678838539375275,
"grad_norm": 0.18122579157352448,
"learning_rate": 6.116876441733088e-07,
"loss": 0.4534,
"step": 1100
},
{
"epoch": 0.9678838539375275,
"eval_loss": 0.5009814500808716,
"eval_runtime": 28.5934,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1100
},
{
"epoch": 0.9722833260008799,
"grad_norm": 0.18148748576641083,
"learning_rate": 4.536936260597258e-07,
"loss": 0.4587,
"step": 1105
},
{
"epoch": 0.9722833260008799,
"eval_loss": 0.5009997487068176,
"eval_runtime": 28.5275,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 1105
},
{
"epoch": 0.9766827980642323,
"grad_norm": 0.18024764955043793,
"learning_rate": 3.192151266743548e-07,
"loss": 0.4783,
"step": 1110
},
{
"epoch": 0.9766827980642323,
"eval_loss": 0.5009670853614807,
"eval_runtime": 28.5688,
"eval_samples_per_second": 0.595,
"eval_steps_per_second": 0.315,
"step": 1110
},
{
"epoch": 0.9810822701275846,
"grad_norm": 0.18152055144309998,
"learning_rate": 2.082839134607828e-07,
"loss": 0.4623,
"step": 1115
},
{
"epoch": 0.9810822701275846,
"eval_loss": 0.5009202361106873,
"eval_runtime": 28.6066,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1115
},
{
"epoch": 0.9854817421909371,
"grad_norm": 0.17324087023735046,
"learning_rate": 1.2092619135937177e-07,
"loss": 0.439,
"step": 1120
},
{
"epoch": 0.9854817421909371,
"eval_loss": 0.5010377168655396,
"eval_runtime": 28.5308,
"eval_samples_per_second": 0.596,
"eval_steps_per_second": 0.315,
"step": 1120
},
{
"epoch": 0.9898812142542894,
"grad_norm": 0.17685554921627045,
"learning_rate": 5.716259661695533e-08,
"loss": 0.4629,
"step": 1125
},
{
"epoch": 0.9898812142542894,
"eval_loss": 0.5009082555770874,
"eval_runtime": 28.6259,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 1125
},
{
"epoch": 0.9942806863176419,
"grad_norm": 0.17675389349460602,
"learning_rate": 1.7008191912004646e-08,
"loss": 0.4716,
"step": 1130
},
{
"epoch": 0.9942806863176419,
"eval_loss": 0.5009535551071167,
"eval_runtime": 28.626,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.314,
"step": 1130
},
{
"epoch": 0.9986801583809943,
"grad_norm": 0.18398317694664001,
"learning_rate": 4.724627964303175e-10,
"loss": 0.4832,
"step": 1135
},
{
"epoch": 0.9986801583809943,
"eval_loss": 0.5010104179382324,
"eval_runtime": 28.6106,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.315,
"step": 1135
},
{
"epoch": 0.9995600527936648,
"step": 1136,
"total_flos": 7.211600370336793e+18,
"train_loss": 0.039691918463984004,
"train_runtime": 9596.3839,
"train_samples_per_second": 1.895,
"train_steps_per_second": 0.118
}
],
"logging_steps": 5,
"max_steps": 1136,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.211600370336793e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}