|
{ |
|
"best_metric": 0.439, |
|
"best_model_checkpoint": "runs/legis-llama3-1-8b-valid-arandu/checkpoint-1120", |
|
"epoch": 0.9995600527936648, |
|
"eval_steps": 5, |
|
"global_step": 1136, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004399472063352398, |
|
"grad_norm": 25.937191009521484, |
|
"learning_rate": 8.771929824561403e-06, |
|
"loss": 1.0992, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004399472063352398, |
|
"eval_loss": 1.1428982019424438, |
|
"eval_runtime": 29.8805, |
|
"eval_samples_per_second": 0.569, |
|
"eval_steps_per_second": 0.301, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008798944126704795, |
|
"grad_norm": 32.52676773071289, |
|
"learning_rate": 1.7543859649122806e-05, |
|
"loss": 1.067, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008798944126704795, |
|
"eval_loss": 1.0669578313827515, |
|
"eval_runtime": 28.5282, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013198416190057193, |
|
"grad_norm": 78.51001739501953, |
|
"learning_rate": 2.6315789473684212e-05, |
|
"loss": 1.0057, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013198416190057193, |
|
"eval_loss": 1.0462743043899536, |
|
"eval_runtime": 28.5697, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01759788825340959, |
|
"grad_norm": 21.255964279174805, |
|
"learning_rate": 3.508771929824561e-05, |
|
"loss": 0.9236, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01759788825340959, |
|
"eval_loss": 0.9604344367980957, |
|
"eval_runtime": 28.6152, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02199736031676199, |
|
"grad_norm": 1.3699233531951904, |
|
"learning_rate": 4.3859649122807014e-05, |
|
"loss": 0.8823, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02199736031676199, |
|
"eval_loss": 0.9002779126167297, |
|
"eval_runtime": 28.579, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.026396832380114386, |
|
"grad_norm": 2.50810170173645, |
|
"learning_rate": 5.2631578947368424e-05, |
|
"loss": 0.8144, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.026396832380114386, |
|
"eval_loss": 0.8441588878631592, |
|
"eval_runtime": 28.4936, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.030796304443466784, |
|
"grad_norm": 1.6816316843032837, |
|
"learning_rate": 6.140350877192983e-05, |
|
"loss": 0.7829, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.030796304443466784, |
|
"eval_loss": 0.7928382754325867, |
|
"eval_runtime": 28.5908, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03519577650681918, |
|
"grad_norm": 0.5125584006309509, |
|
"learning_rate": 7.017543859649122e-05, |
|
"loss": 0.7075, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03519577650681918, |
|
"eval_loss": 0.7538504600524902, |
|
"eval_runtime": 28.5816, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.039595248570171576, |
|
"grad_norm": 0.36081045866012573, |
|
"learning_rate": 7.894736842105263e-05, |
|
"loss": 0.6776, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.039595248570171576, |
|
"eval_loss": 0.7313268184661865, |
|
"eval_runtime": 28.6141, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04399472063352398, |
|
"grad_norm": 0.32318177819252014, |
|
"learning_rate": 8.771929824561403e-05, |
|
"loss": 0.6499, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04399472063352398, |
|
"eval_loss": 0.71351158618927, |
|
"eval_runtime": 28.5766, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04839419269687637, |
|
"grad_norm": 0.34377261996269226, |
|
"learning_rate": 9.649122807017544e-05, |
|
"loss": 0.6487, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04839419269687637, |
|
"eval_loss": 0.7006722092628479, |
|
"eval_runtime": 28.6048, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.05279366476022877, |
|
"grad_norm": 0.4360629618167877, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 0.6405, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05279366476022877, |
|
"eval_loss": 0.6905343532562256, |
|
"eval_runtime": 28.5257, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05719313682358117, |
|
"grad_norm": 0.28764936327934265, |
|
"learning_rate": 0.00011403508771929824, |
|
"loss": 0.6352, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05719313682358117, |
|
"eval_loss": 0.68143630027771, |
|
"eval_runtime": 28.6362, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.06159260888693357, |
|
"grad_norm": 0.34088754653930664, |
|
"learning_rate": 0.00012280701754385965, |
|
"loss": 0.6064, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06159260888693357, |
|
"eval_loss": 0.6742813587188721, |
|
"eval_runtime": 28.5667, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06599208095028597, |
|
"grad_norm": 0.31284183263778687, |
|
"learning_rate": 0.00013157894736842108, |
|
"loss": 0.5924, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06599208095028597, |
|
"eval_loss": 0.6679767966270447, |
|
"eval_runtime": 28.461, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07039155301363836, |
|
"grad_norm": 0.30470508337020874, |
|
"learning_rate": 0.00014035087719298245, |
|
"loss": 0.5992, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07039155301363836, |
|
"eval_loss": 0.6631008386611938, |
|
"eval_runtime": 28.6891, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07479102507699076, |
|
"grad_norm": 0.3255262076854706, |
|
"learning_rate": 0.00014912280701754387, |
|
"loss": 0.5704, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07479102507699076, |
|
"eval_loss": 0.658618688583374, |
|
"eval_runtime": 28.6094, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07919049714034315, |
|
"grad_norm": 0.31922295689582825, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 0.6048, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07919049714034315, |
|
"eval_loss": 0.6537344455718994, |
|
"eval_runtime": 28.532, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08358996920369556, |
|
"grad_norm": 0.45636337995529175, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.613, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08358996920369556, |
|
"eval_loss": 0.6501972079277039, |
|
"eval_runtime": 28.6568, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08798944126704795, |
|
"grad_norm": 0.29334941506385803, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 0.5799, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08798944126704795, |
|
"eval_loss": 0.6471393704414368, |
|
"eval_runtime": 28.5997, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09238891333040035, |
|
"grad_norm": 0.31318825483322144, |
|
"learning_rate": 0.00018421052631578948, |
|
"loss": 0.5887, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09238891333040035, |
|
"eval_loss": 0.6440868377685547, |
|
"eval_runtime": 28.6275, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09678838539375274, |
|
"grad_norm": 0.27908894419670105, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 0.5905, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09678838539375274, |
|
"eval_loss": 0.6423875689506531, |
|
"eval_runtime": 28.5491, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10118785745710515, |
|
"grad_norm": 0.2715133726596832, |
|
"learning_rate": 0.00019999952753720356, |
|
"loss": 0.5902, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.10118785745710515, |
|
"eval_loss": 0.6415910720825195, |
|
"eval_runtime": 28.5086, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.10558732952045755, |
|
"grad_norm": 0.3028790056705475, |
|
"learning_rate": 0.000199982991808088, |
|
"loss": 0.5773, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10558732952045755, |
|
"eval_loss": 0.6377425789833069, |
|
"eval_runtime": 28.6438, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10998680158380994, |
|
"grad_norm": 0.3071883022785187, |
|
"learning_rate": 0.00019994283740338306, |
|
"loss": 0.5598, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.10998680158380994, |
|
"eval_loss": 0.6367806196212769, |
|
"eval_runtime": 28.4852, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.11438627364716233, |
|
"grad_norm": 0.34842655062675476, |
|
"learning_rate": 0.00019987907380864062, |
|
"loss": 0.596, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11438627364716233, |
|
"eval_loss": 0.6347749829292297, |
|
"eval_runtime": 28.5908, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11878574571051474, |
|
"grad_norm": 0.2854275107383728, |
|
"learning_rate": 0.00019979171608653924, |
|
"loss": 0.5733, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11878574571051474, |
|
"eval_loss": 0.6301032900810242, |
|
"eval_runtime": 28.5482, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.12318521777386714, |
|
"grad_norm": 0.27615901827812195, |
|
"learning_rate": 0.00019968078487332566, |
|
"loss": 0.5875, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12318521777386714, |
|
"eval_loss": 0.6269793510437012, |
|
"eval_runtime": 28.4974, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12758468983721954, |
|
"grad_norm": 0.2709368169307709, |
|
"learning_rate": 0.00019954630637394029, |
|
"loss": 0.5711, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12758468983721954, |
|
"eval_loss": 0.6240233182907104, |
|
"eval_runtime": 28.5264, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.13198416190057194, |
|
"grad_norm": 0.2877412736415863, |
|
"learning_rate": 0.00019938831235582672, |
|
"loss": 0.5885, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13198416190057194, |
|
"eval_loss": 0.6206945776939392, |
|
"eval_runtime": 28.5668, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13638363396392433, |
|
"grad_norm": 0.2922605574131012, |
|
"learning_rate": 0.00019920684014142738, |
|
"loss": 0.5485, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.13638363396392433, |
|
"eval_loss": 0.6200662851333618, |
|
"eval_runtime": 28.5452, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.14078310602727673, |
|
"grad_norm": 0.28340834379196167, |
|
"learning_rate": 0.00019900193259936704, |
|
"loss": 0.5754, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14078310602727673, |
|
"eval_loss": 0.6187402606010437, |
|
"eval_runtime": 28.5939, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14518257809062912, |
|
"grad_norm": 0.2796618938446045, |
|
"learning_rate": 0.0001987736381343261, |
|
"loss": 0.5535, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14518257809062912, |
|
"eval_loss": 0.6156266331672668, |
|
"eval_runtime": 28.5378, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14958205015398152, |
|
"grad_norm": 0.25343528389930725, |
|
"learning_rate": 0.00019852201067560606, |
|
"loss": 0.5697, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14958205015398152, |
|
"eval_loss": 0.6125033497810364, |
|
"eval_runtime": 28.5565, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1539815222173339, |
|
"grad_norm": 0.23438464105129242, |
|
"learning_rate": 0.00019824710966438996, |
|
"loss": 0.5335, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.1539815222173339, |
|
"eval_loss": 0.6096713542938232, |
|
"eval_runtime": 28.6017, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.1583809942806863, |
|
"grad_norm": 0.24729043245315552, |
|
"learning_rate": 0.00019794900003970077, |
|
"loss": 0.5702, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1583809942806863, |
|
"eval_loss": 0.6071114540100098, |
|
"eval_runtime": 28.5677, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.16278046634403873, |
|
"grad_norm": 0.257964551448822, |
|
"learning_rate": 0.00019762775222306107, |
|
"loss": 0.5494, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.16278046634403873, |
|
"eval_loss": 0.6062531471252441, |
|
"eval_runtime": 28.5933, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.16717993840739112, |
|
"grad_norm": 0.2648680806159973, |
|
"learning_rate": 0.0001972834421018576, |
|
"loss": 0.5379, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16717993840739112, |
|
"eval_loss": 0.6054437756538391, |
|
"eval_runtime": 28.5575, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17157941047074352, |
|
"grad_norm": 0.2540712356567383, |
|
"learning_rate": 0.00019691615101141455, |
|
"loss": 0.5415, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.17157941047074352, |
|
"eval_loss": 0.6023730039596558, |
|
"eval_runtime": 28.5419, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1759788825340959, |
|
"grad_norm": 0.2424851357936859, |
|
"learning_rate": 0.00019652596571578004, |
|
"loss": 0.5504, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1759788825340959, |
|
"eval_loss": 0.5997632145881653, |
|
"eval_runtime": 28.6422, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1803783545974483, |
|
"grad_norm": 0.2573873698711395, |
|
"learning_rate": 0.0001961129783872301, |
|
"loss": 0.5418, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1803783545974483, |
|
"eval_loss": 0.5976300239562988, |
|
"eval_runtime": 28.5752, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1847778266608007, |
|
"grad_norm": 0.22338183224201202, |
|
"learning_rate": 0.00019567728658449504, |
|
"loss": 0.54, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1847778266608007, |
|
"eval_loss": 0.5960862040519714, |
|
"eval_runtime": 28.4685, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1891772987241531, |
|
"grad_norm": 0.2706097960472107, |
|
"learning_rate": 0.00019521899322971352, |
|
"loss": 0.5522, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1891772987241531, |
|
"eval_loss": 0.5958646535873413, |
|
"eval_runtime": 28.5678, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1935767707875055, |
|
"grad_norm": 0.23476411402225494, |
|
"learning_rate": 0.00019473820658411957, |
|
"loss": 0.5262, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1935767707875055, |
|
"eval_loss": 0.5945417284965515, |
|
"eval_runtime": 28.5611, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1979762428508579, |
|
"grad_norm": 0.23705659806728363, |
|
"learning_rate": 0.00019423504022246825, |
|
"loss": 0.5439, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1979762428508579, |
|
"eval_loss": 0.5934200286865234, |
|
"eval_runtime": 28.5955, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2023757149142103, |
|
"grad_norm": 0.22662319242954254, |
|
"learning_rate": 0.00019370961300620637, |
|
"loss": 0.5262, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2023757149142103, |
|
"eval_loss": 0.5928044319152832, |
|
"eval_runtime": 28.514, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2067751869775627, |
|
"grad_norm": 0.24046145379543304, |
|
"learning_rate": 0.00019316204905539425, |
|
"loss": 0.5462, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2067751869775627, |
|
"eval_loss": 0.5904839038848877, |
|
"eval_runtime": 28.5557, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2111746590409151, |
|
"grad_norm": 0.23923470079898834, |
|
"learning_rate": 0.000192592477719385, |
|
"loss": 0.5345, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2111746590409151, |
|
"eval_loss": 0.590508759021759, |
|
"eval_runtime": 28.5204, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.21557413110426749, |
|
"grad_norm": 0.24345721304416656, |
|
"learning_rate": 0.00019200103354626892, |
|
"loss": 0.5478, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21557413110426749, |
|
"eval_loss": 0.5882726907730103, |
|
"eval_runtime": 28.5722, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21997360316761988, |
|
"grad_norm": 0.27501732110977173, |
|
"learning_rate": 0.00019138785625108957, |
|
"loss": 0.5607, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21997360316761988, |
|
"eval_loss": 0.5860432982444763, |
|
"eval_runtime": 28.503, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.22437307523097227, |
|
"grad_norm": 0.3151032328605652, |
|
"learning_rate": 0.0001907530906828393, |
|
"loss": 0.5479, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.22437307523097227, |
|
"eval_loss": 0.5846895575523376, |
|
"eval_runtime": 28.6081, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.22877254729432467, |
|
"grad_norm": 0.2758755385875702, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.5767, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22877254729432467, |
|
"eval_loss": 0.5815722942352295, |
|
"eval_runtime": 28.5574, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2331720193576771, |
|
"grad_norm": 0.25241315364837646, |
|
"learning_rate": 0.000189419399586331, |
|
"loss": 0.5568, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.2331720193576771, |
|
"eval_loss": 0.5822274684906006, |
|
"eval_runtime": 28.573, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.23757149142102948, |
|
"grad_norm": 0.316436767578125, |
|
"learning_rate": 0.00018872078911183146, |
|
"loss": 0.5385, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23757149142102948, |
|
"eval_loss": 0.5809066891670227, |
|
"eval_runtime": 28.5598, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24197096348438188, |
|
"grad_norm": 0.27813801169395447, |
|
"learning_rate": 0.00018800122039735358, |
|
"loss": 0.5348, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.24197096348438188, |
|
"eval_loss": 0.5786107778549194, |
|
"eval_runtime": 28.546, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.24637043554773427, |
|
"grad_norm": 0.2552705407142639, |
|
"learning_rate": 0.00018726086342440846, |
|
"loss": 0.5207, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.24637043554773427, |
|
"eval_loss": 0.5768923759460449, |
|
"eval_runtime": 28.5995, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2507699076110867, |
|
"grad_norm": 0.21993091702461243, |
|
"learning_rate": 0.00018649989308525372, |
|
"loss": 0.5292, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.2507699076110867, |
|
"eval_loss": 0.5762263536453247, |
|
"eval_runtime": 28.4816, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.2551693796744391, |
|
"grad_norm": 0.27086153626441956, |
|
"learning_rate": 0.0001857184891415794, |
|
"loss": 0.5312, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2551693796744391, |
|
"eval_loss": 0.5758266448974609, |
|
"eval_runtime": 28.5295, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2595688517377915, |
|
"grad_norm": 0.21816319227218628, |
|
"learning_rate": 0.0001849168361820431, |
|
"loss": 0.5223, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2595688517377915, |
|
"eval_loss": 0.574447751045227, |
|
"eval_runtime": 28.5859, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2639683238011439, |
|
"grad_norm": 0.24796700477600098, |
|
"learning_rate": 0.00018409512357866548, |
|
"loss": 0.5485, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2639683238011439, |
|
"eval_loss": 0.573371410369873, |
|
"eval_runtime": 28.6178, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2683677958644963, |
|
"grad_norm": 0.2425287663936615, |
|
"learning_rate": 0.00018325354544209535, |
|
"loss": 0.5217, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2683677958644963, |
|
"eval_loss": 0.5723298788070679, |
|
"eval_runtime": 28.5916, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.27276726792784867, |
|
"grad_norm": 0.21630050241947174, |
|
"learning_rate": 0.00018239230057575542, |
|
"loss": 0.5074, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.27276726792784867, |
|
"eval_loss": 0.5725327134132385, |
|
"eval_runtime": 28.536, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.27716673999120106, |
|
"grad_norm": 0.21529468894004822, |
|
"learning_rate": 0.0001815115924288798, |
|
"loss": 0.5487, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.27716673999120106, |
|
"eval_loss": 0.5721793174743652, |
|
"eval_runtime": 28.6852, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.28156621205455346, |
|
"grad_norm": 0.21623414754867554, |
|
"learning_rate": 0.00018061162904845358, |
|
"loss": 0.5106, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.28156621205455346, |
|
"eval_loss": 0.5709577202796936, |
|
"eval_runtime": 28.4592, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.28596568411790585, |
|
"grad_norm": 0.2219308316707611, |
|
"learning_rate": 0.0001796926230300667, |
|
"loss": 0.5218, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.28596568411790585, |
|
"eval_loss": 0.5698617100715637, |
|
"eval_runtime": 28.5588, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.29036515618125824, |
|
"grad_norm": 0.2264701873064041, |
|
"learning_rate": 0.00017875479146769305, |
|
"loss": 0.5162, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29036515618125824, |
|
"eval_loss": 0.5689781308174133, |
|
"eval_runtime": 28.6221, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29476462824461064, |
|
"grad_norm": 0.24004362523555756, |
|
"learning_rate": 0.000177798355902407, |
|
"loss": 0.539, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.29476462824461064, |
|
"eval_loss": 0.5678241848945618, |
|
"eval_runtime": 28.5677, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.29916410030796303, |
|
"grad_norm": 0.22996000945568085, |
|
"learning_rate": 0.00017682354227004963, |
|
"loss": 0.5002, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.29916410030796303, |
|
"eval_loss": 0.5670127272605896, |
|
"eval_runtime": 28.6425, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3035635723713154, |
|
"grad_norm": 0.23163671791553497, |
|
"learning_rate": 0.00017583058084785625, |
|
"loss": 0.5175, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3035635723713154, |
|
"eval_loss": 0.5650352239608765, |
|
"eval_runtime": 28.5994, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3079630444346678, |
|
"grad_norm": 0.20120489597320557, |
|
"learning_rate": 0.00017481970620005912, |
|
"loss": 0.5269, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3079630444346678, |
|
"eval_loss": 0.5640237927436829, |
|
"eval_runtime": 28.5009, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3123625164980202, |
|
"grad_norm": 0.22231583297252655, |
|
"learning_rate": 0.00017379115712247675, |
|
"loss": 0.5444, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.3123625164980202, |
|
"eval_loss": 0.5634257197380066, |
|
"eval_runtime": 28.5722, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.3167619885613726, |
|
"grad_norm": 0.216331347823143, |
|
"learning_rate": 0.00017274517658610398, |
|
"loss": 0.5074, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3167619885613726, |
|
"eval_loss": 0.5618783831596375, |
|
"eval_runtime": 28.6759, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.32116146062472506, |
|
"grad_norm": 0.21976010501384735, |
|
"learning_rate": 0.0001716820116797158, |
|
"loss": 0.5259, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.32116146062472506, |
|
"eval_loss": 0.5602042078971863, |
|
"eval_runtime": 28.6019, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.32556093268807745, |
|
"grad_norm": 0.22740119695663452, |
|
"learning_rate": 0.0001706019135514982, |
|
"loss": 0.5158, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32556093268807745, |
|
"eval_loss": 0.5599080920219421, |
|
"eval_runtime": 28.5177, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32996040475142985, |
|
"grad_norm": 0.21888501942157745, |
|
"learning_rate": 0.0001695051373497202, |
|
"loss": 0.527, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.32996040475142985, |
|
"eval_loss": 0.558814525604248, |
|
"eval_runtime": 28.661, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.33435987681478224, |
|
"grad_norm": 0.20402850210666656, |
|
"learning_rate": 0.00016839194216246108, |
|
"loss": 0.5027, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33435987681478224, |
|
"eval_loss": 0.5578404664993286, |
|
"eval_runtime": 28.5421, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33875934887813464, |
|
"grad_norm": 0.20368748903274536, |
|
"learning_rate": 0.00016726259095640664, |
|
"loss": 0.505, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.33875934887813464, |
|
"eval_loss": 0.5567160844802856, |
|
"eval_runtime": 28.6126, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.34315882094148703, |
|
"grad_norm": 0.2069130390882492, |
|
"learning_rate": 0.0001661173505147295, |
|
"loss": 0.5086, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.34315882094148703, |
|
"eval_loss": 0.55617755651474, |
|
"eval_runtime": 28.4879, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3475582930048394, |
|
"grad_norm": 0.23644201457500458, |
|
"learning_rate": 0.00016495649137406772, |
|
"loss": 0.5412, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3475582930048394, |
|
"eval_loss": 0.5556927919387817, |
|
"eval_runtime": 28.6713, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3519577650681918, |
|
"grad_norm": 0.21997737884521484, |
|
"learning_rate": 0.00016378028776061667, |
|
"loss": 0.4908, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3519577650681918, |
|
"eval_loss": 0.5555915832519531, |
|
"eval_runtime": 28.596, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3563572371315442, |
|
"grad_norm": 0.22075805068016052, |
|
"learning_rate": 0.00016258901752534948, |
|
"loss": 0.5155, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.3563572371315442, |
|
"eval_loss": 0.5552019476890564, |
|
"eval_runtime": 28.595, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.3607567091948966, |
|
"grad_norm": 0.5917304158210754, |
|
"learning_rate": 0.00016138296207838127, |
|
"loss": 0.4991, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3607567091948966, |
|
"eval_loss": 0.5550567507743835, |
|
"eval_runtime": 28.6222, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.365156181258249, |
|
"grad_norm": 0.21421152353286743, |
|
"learning_rate": 0.00016016240632249224, |
|
"loss": 0.4769, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.365156181258249, |
|
"eval_loss": 0.5548796653747559, |
|
"eval_runtime": 28.5933, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3695556533216014, |
|
"grad_norm": 0.201774463057518, |
|
"learning_rate": 0.0001589276385858262, |
|
"loss": 0.4914, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3695556533216014, |
|
"eval_loss": 0.5546624064445496, |
|
"eval_runtime": 28.5213, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3739551253849538, |
|
"grad_norm": 0.22172759473323822, |
|
"learning_rate": 0.0001576789505537795, |
|
"loss": 0.4726, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3739551253849538, |
|
"eval_loss": 0.5535080432891846, |
|
"eval_runtime": 28.6645, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3783545974483062, |
|
"grad_norm": 0.23269815742969513, |
|
"learning_rate": 0.00015641663720009733, |
|
"loss": 0.5076, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3783545974483062, |
|
"eval_loss": 0.5522862076759338, |
|
"eval_runtime": 28.5697, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3827540695116586, |
|
"grad_norm": 0.23303498327732086, |
|
"learning_rate": 0.00015514099671719268, |
|
"loss": 0.5064, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3827540695116586, |
|
"eval_loss": 0.5502522587776184, |
|
"eval_runtime": 28.5369, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.387153541575011, |
|
"grad_norm": 0.24087387323379517, |
|
"learning_rate": 0.00015385233044570555, |
|
"loss": 0.5361, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.387153541575011, |
|
"eval_loss": 0.5471201539039612, |
|
"eval_runtime": 28.5791, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3915530136383634, |
|
"grad_norm": 0.20800553262233734, |
|
"learning_rate": 0.00015255094280331797, |
|
"loss": 0.5169, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3915530136383634, |
|
"eval_loss": 0.5466722846031189, |
|
"eval_runtime": 28.6339, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3959524857017158, |
|
"grad_norm": 0.37092360854148865, |
|
"learning_rate": 0.0001512371412128424, |
|
"loss": 0.5362, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3959524857017158, |
|
"eval_loss": 0.5455148220062256, |
|
"eval_runtime": 28.637, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4003519577650682, |
|
"grad_norm": 0.20706337690353394, |
|
"learning_rate": 0.00014991123602960018, |
|
"loss": 0.4994, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4003519577650682, |
|
"eval_loss": 0.5440109968185425, |
|
"eval_runtime": 28.5672, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4047514298284206, |
|
"grad_norm": 0.2135256677865982, |
|
"learning_rate": 0.00014857354046810732, |
|
"loss": 0.5005, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4047514298284206, |
|
"eval_loss": 0.5431147813796997, |
|
"eval_runtime": 28.4835, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.409150901891773, |
|
"grad_norm": 0.5737074613571167, |
|
"learning_rate": 0.00014722437052808472, |
|
"loss": 0.5208, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.409150901891773, |
|
"eval_loss": 0.541969358921051, |
|
"eval_runtime": 28.6004, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4135503739551254, |
|
"grad_norm": 0.24099959433078766, |
|
"learning_rate": 0.00014586404491981052, |
|
"loss": 0.5074, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4135503739551254, |
|
"eval_loss": 0.5449388027191162, |
|
"eval_runtime": 28.658, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4179498460184778, |
|
"grad_norm": 0.2046642154455185, |
|
"learning_rate": 0.0001444928849888321, |
|
"loss": 0.5052, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4179498460184778, |
|
"eval_loss": 0.5407991409301758, |
|
"eval_runtime": 28.5688, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4223493180818302, |
|
"grad_norm": 0.2824171185493469, |
|
"learning_rate": 0.00014311121464005583, |
|
"loss": 0.5179, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4223493180818302, |
|
"eval_loss": 0.54000324010849, |
|
"eval_runtime": 28.7144, |
|
"eval_samples_per_second": 0.592, |
|
"eval_steps_per_second": 0.313, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4267487901451826, |
|
"grad_norm": 0.2045980840921402, |
|
"learning_rate": 0.00014171936026123168, |
|
"loss": 0.4634, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.4267487901451826, |
|
"eval_loss": 0.5398800373077393, |
|
"eval_runtime": 28.5209, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.43114826220853497, |
|
"grad_norm": 0.2092169225215912, |
|
"learning_rate": 0.00014031765064585197, |
|
"loss": 0.4802, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.43114826220853497, |
|
"eval_loss": 0.5395181179046631, |
|
"eval_runtime": 28.5086, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.43554773427188737, |
|
"grad_norm": 0.20700140297412872, |
|
"learning_rate": 0.00013890641691548114, |
|
"loss": 0.4962, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.43554773427188737, |
|
"eval_loss": 0.5390854477882385, |
|
"eval_runtime": 28.5682, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.43994720633523976, |
|
"grad_norm": 0.19903522729873657, |
|
"learning_rate": 0.00013748599244153633, |
|
"loss": 0.4841, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43994720633523976, |
|
"eval_loss": 0.5381758213043213, |
|
"eval_runtime": 29.4274, |
|
"eval_samples_per_second": 0.578, |
|
"eval_steps_per_second": 0.306, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.44434667839859215, |
|
"grad_norm": 0.4766729474067688, |
|
"learning_rate": 0.00013605671276653567, |
|
"loss": 0.5252, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.44434667839859215, |
|
"eval_loss": 0.5368968844413757, |
|
"eval_runtime": 28.6474, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.44874615046194455, |
|
"grad_norm": 0.21688155829906464, |
|
"learning_rate": 0.00013461891552483444, |
|
"loss": 0.515, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.44874615046194455, |
|
"eval_loss": 0.5366407036781311, |
|
"eval_runtime": 28.5352, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.45314562252529694, |
|
"grad_norm": 0.20375116169452667, |
|
"learning_rate": 0.00013317294036286644, |
|
"loss": 0.4887, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.45314562252529694, |
|
"eval_loss": 0.5360764861106873, |
|
"eval_runtime": 28.6533, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.45754509458864934, |
|
"grad_norm": 0.1958196461200714, |
|
"learning_rate": 0.00013171912885891063, |
|
"loss": 0.4868, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.45754509458864934, |
|
"eval_loss": 0.5356424450874329, |
|
"eval_runtime": 28.5027, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4619445666520018, |
|
"grad_norm": 0.22040507197380066, |
|
"learning_rate": 0.00013025782444240087, |
|
"loss": 0.5086, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.4619445666520018, |
|
"eval_loss": 0.5351347327232361, |
|
"eval_runtime": 28.6428, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.4663440387153542, |
|
"grad_norm": 0.19495758414268494, |
|
"learning_rate": 0.00012878937231279892, |
|
"loss": 0.5113, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4663440387153542, |
|
"eval_loss": 0.5347647070884705, |
|
"eval_runtime": 28.6252, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4707435107787066, |
|
"grad_norm": 0.21149738132953644, |
|
"learning_rate": 0.0001273141193580488, |
|
"loss": 0.483, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.4707435107787066, |
|
"eval_loss": 0.5339221954345703, |
|
"eval_runtime": 28.6055, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.47514298284205897, |
|
"grad_norm": 0.20391018688678741, |
|
"learning_rate": 0.0001258324140726326, |
|
"loss": 0.4728, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.47514298284205897, |
|
"eval_loss": 0.5337977409362793, |
|
"eval_runtime": 28.5842, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.47954245490541136, |
|
"grad_norm": 0.20913545787334442, |
|
"learning_rate": 0.00012434460647524676, |
|
"loss": 0.5016, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.47954245490541136, |
|
"eval_loss": 0.532899022102356, |
|
"eval_runtime": 28.4759, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.48394192696876376, |
|
"grad_norm": 0.19410260021686554, |
|
"learning_rate": 0.00012285104802611812, |
|
"loss": 0.5103, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.48394192696876376, |
|
"eval_loss": 0.5321294665336609, |
|
"eval_runtime": 28.5662, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.48834139903211615, |
|
"grad_norm": 0.2097245752811432, |
|
"learning_rate": 0.00012135209154397962, |
|
"loss": 0.4954, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.48834139903211615, |
|
"eval_loss": 0.532034695148468, |
|
"eval_runtime": 28.652, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.49274087109546855, |
|
"grad_norm": 0.21518121659755707, |
|
"learning_rate": 0.00011984809112272495, |
|
"loss": 0.4999, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.49274087109546855, |
|
"eval_loss": 0.5313233733177185, |
|
"eval_runtime": 28.5662, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.49714034315882094, |
|
"grad_norm": 0.19571034610271454, |
|
"learning_rate": 0.00011833940204776209, |
|
"loss": 0.4931, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.49714034315882094, |
|
"eval_loss": 0.5311394333839417, |
|
"eval_runtime": 28.5352, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5015398152221734, |
|
"grad_norm": 0.20554794371128082, |
|
"learning_rate": 0.00011682638071208533, |
|
"loss": 0.4833, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5015398152221734, |
|
"eval_loss": 0.5300410389900208, |
|
"eval_runtime": 28.5679, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5059392872855257, |
|
"grad_norm": 0.20373423397541046, |
|
"learning_rate": 0.00011530938453208559, |
|
"loss": 0.5057, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5059392872855257, |
|
"eval_loss": 0.5300309658050537, |
|
"eval_runtime": 28.5821, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5103387593488782, |
|
"grad_norm": 0.1982477903366089, |
|
"learning_rate": 0.00011378877186311912, |
|
"loss": 0.4754, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5103387593488782, |
|
"eval_loss": 0.5292160511016846, |
|
"eval_runtime": 28.5256, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5147382314122305, |
|
"grad_norm": 0.20576219260692596, |
|
"learning_rate": 0.00011226490191485421, |
|
"loss": 0.4991, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.5147382314122305, |
|
"eval_loss": 0.5280917882919312, |
|
"eval_runtime": 28.6835, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.519137703475583, |
|
"grad_norm": 0.2154638022184372, |
|
"learning_rate": 0.00011073813466641632, |
|
"loss": 0.4811, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.519137703475583, |
|
"eval_loss": 0.5274674296379089, |
|
"eval_runtime": 28.4766, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5235371755389353, |
|
"grad_norm": 0.2037007063627243, |
|
"learning_rate": 0.00010920883078135117, |
|
"loss": 0.4717, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5235371755389353, |
|
"eval_loss": 0.5270927548408508, |
|
"eval_runtime": 28.5377, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5279366476022878, |
|
"grad_norm": 0.21386198699474335, |
|
"learning_rate": 0.00010767735152242649, |
|
"loss": 0.4776, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5279366476022878, |
|
"eval_loss": 0.526791512966156, |
|
"eval_runtime": 28.596, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5323361196656401, |
|
"grad_norm": 0.1984720528125763, |
|
"learning_rate": 0.0001061440586662917, |
|
"loss": 0.4708, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5323361196656401, |
|
"eval_loss": 0.5266034007072449, |
|
"eval_runtime": 28.6491, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5367355917289925, |
|
"grad_norm": 0.19453096389770508, |
|
"learning_rate": 0.000104609314418017, |
|
"loss": 0.4659, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5367355917289925, |
|
"eval_loss": 0.5267328023910522, |
|
"eval_runtime": 28.6358, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5411350637923449, |
|
"grad_norm": 0.2048104703426361, |
|
"learning_rate": 0.00010307348132553025, |
|
"loss": 0.5138, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5411350637923449, |
|
"eval_loss": 0.5270944833755493, |
|
"eval_runtime": 28.5902, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5455345358556973, |
|
"grad_norm": 0.1899915337562561, |
|
"learning_rate": 0.00010153692219397387, |
|
"loss": 0.4797, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5455345358556973, |
|
"eval_loss": 0.5260502099990845, |
|
"eval_runtime": 28.5533, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5499340079190497, |
|
"grad_norm": 0.18520919978618622, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5068, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5499340079190497, |
|
"eval_loss": 0.5251287817955017, |
|
"eval_runtime": 28.4846, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5543334799824021, |
|
"grad_norm": 0.21325986087322235, |
|
"learning_rate": 9.84630778060262e-05, |
|
"loss": 0.4799, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5543334799824021, |
|
"eval_loss": 0.524385929107666, |
|
"eval_runtime": 28.5917, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5587329520457545, |
|
"grad_norm": 0.20572926104068756, |
|
"learning_rate": 9.692651867446973e-05, |
|
"loss": 0.49, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5587329520457545, |
|
"eval_loss": 0.523975133895874, |
|
"eval_runtime": 28.6052, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5631324241091069, |
|
"grad_norm": 0.20347937941551208, |
|
"learning_rate": 9.539068558198304e-05, |
|
"loss": 0.4702, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5631324241091069, |
|
"eval_loss": 0.5229539275169373, |
|
"eval_runtime": 28.6223, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5675318961724594, |
|
"grad_norm": 0.21256154775619507, |
|
"learning_rate": 9.38559413337083e-05, |
|
"loss": 0.4736, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5675318961724594, |
|
"eval_loss": 0.5221072435379028, |
|
"eval_runtime": 28.6189, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5719313682358117, |
|
"grad_norm": 0.2260565459728241, |
|
"learning_rate": 9.232264847757357e-05, |
|
"loss": 0.5065, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5719313682358117, |
|
"eval_loss": 0.5213314890861511, |
|
"eval_runtime": 28.6771, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5763308402991641, |
|
"grad_norm": 0.21002529561519623, |
|
"learning_rate": 9.079116921864884e-05, |
|
"loss": 0.4796, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5763308402991641, |
|
"eval_loss": 0.5214037299156189, |
|
"eval_runtime": 28.6202, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5807303123625165, |
|
"grad_norm": 0.19340470433235168, |
|
"learning_rate": 8.92618653335837e-05, |
|
"loss": 0.4788, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5807303123625165, |
|
"eval_loss": 0.5211138725280762, |
|
"eval_runtime": 28.6313, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5851297844258689, |
|
"grad_norm": 0.19035720825195312, |
|
"learning_rate": 8.773509808514581e-05, |
|
"loss": 0.468, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5851297844258689, |
|
"eval_loss": 0.5191999077796936, |
|
"eval_runtime": 28.0607, |
|
"eval_samples_per_second": 0.606, |
|
"eval_steps_per_second": 0.321, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5895292564892213, |
|
"grad_norm": 0.19168096780776978, |
|
"learning_rate": 8.62112281368809e-05, |
|
"loss": 0.5066, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5895292564892213, |
|
"eval_loss": 0.5176913142204285, |
|
"eval_runtime": 28.5375, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5939287285525737, |
|
"grad_norm": 0.19758321344852448, |
|
"learning_rate": 8.469061546791442e-05, |
|
"loss": 0.51, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5939287285525737, |
|
"eval_loss": 0.517296314239502, |
|
"eval_runtime": 28.5712, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5983282006159261, |
|
"grad_norm": 0.19562241435050964, |
|
"learning_rate": 8.317361928791469e-05, |
|
"loss": 0.4932, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5983282006159261, |
|
"eval_loss": 0.5170657634735107, |
|
"eval_runtime": 28.4877, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6027276726792785, |
|
"grad_norm": 0.18590031564235687, |
|
"learning_rate": 8.166059795223794e-05, |
|
"loss": 0.5055, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6027276726792785, |
|
"eval_loss": 0.5166193842887878, |
|
"eval_runtime": 28.625, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6071271447426309, |
|
"grad_norm": 0.2049984484910965, |
|
"learning_rate": 8.015190887727509e-05, |
|
"loss": 0.4846, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6071271447426309, |
|
"eval_loss": 0.5160765647888184, |
|
"eval_runtime": 28.5582, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6115266168059833, |
|
"grad_norm": 0.19373777508735657, |
|
"learning_rate": 7.864790845602039e-05, |
|
"loss": 0.4862, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6115266168059833, |
|
"eval_loss": 0.5157306790351868, |
|
"eval_runtime": 28.6078, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6159260888693356, |
|
"grad_norm": 0.20326727628707886, |
|
"learning_rate": 7.714895197388189e-05, |
|
"loss": 0.5064, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6159260888693356, |
|
"eval_loss": 0.5153770446777344, |
|
"eval_runtime": 28.6597, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6203255609326881, |
|
"grad_norm": 0.19425565004348755, |
|
"learning_rate": 7.565539352475326e-05, |
|
"loss": 0.5018, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6203255609326881, |
|
"eval_loss": 0.5147074460983276, |
|
"eval_runtime": 28.5261, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6247250329960404, |
|
"grad_norm": 0.19491039216518402, |
|
"learning_rate": 7.416758592736744e-05, |
|
"loss": 0.482, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6247250329960404, |
|
"eval_loss": 0.5144516229629517, |
|
"eval_runtime": 28.533, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6291245050593929, |
|
"grad_norm": 0.1957363337278366, |
|
"learning_rate": 7.268588064195122e-05, |
|
"loss": 0.4883, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6291245050593929, |
|
"eval_loss": 0.5139791965484619, |
|
"eval_runtime": 28.5313, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6335239771227452, |
|
"grad_norm": 0.21253836154937744, |
|
"learning_rate": 7.12106276872011e-05, |
|
"loss": 0.4768, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6335239771227452, |
|
"eval_loss": 0.5137556195259094, |
|
"eval_runtime": 28.6307, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6379234491860977, |
|
"grad_norm": 0.1721029132604599, |
|
"learning_rate": 6.974217555759915e-05, |
|
"loss": 0.4816, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6379234491860977, |
|
"eval_loss": 0.5133811831474304, |
|
"eval_runtime": 28.5925, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6423229212494501, |
|
"grad_norm": 0.19211679697036743, |
|
"learning_rate": 6.82808711410894e-05, |
|
"loss": 0.5035, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6423229212494501, |
|
"eval_loss": 0.5132091641426086, |
|
"eval_runtime": 28.5078, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6467223933128025, |
|
"grad_norm": 0.19252945482730865, |
|
"learning_rate": 6.682705963713356e-05, |
|
"loss": 0.4822, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6467223933128025, |
|
"eval_loss": 0.5131357908248901, |
|
"eval_runtime": 28.6326, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6511218653761549, |
|
"grad_norm": 0.1986207813024521, |
|
"learning_rate": 6.538108447516558e-05, |
|
"loss": 0.4612, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6511218653761549, |
|
"eval_loss": 0.5128303170204163, |
|
"eval_runtime": 28.6066, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6555213374395072, |
|
"grad_norm": 0.19202682375907898, |
|
"learning_rate": 6.394328723346434e-05, |
|
"loss": 0.4578, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6555213374395072, |
|
"eval_loss": 0.5124692916870117, |
|
"eval_runtime": 28.6064, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6599208095028597, |
|
"grad_norm": 0.198526531457901, |
|
"learning_rate": 6.251400755846372e-05, |
|
"loss": 0.5176, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6599208095028597, |
|
"eval_loss": 0.5121349096298218, |
|
"eval_runtime": 28.5313, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.664320281566212, |
|
"grad_norm": 0.19058994948863983, |
|
"learning_rate": 6.109358308451885e-05, |
|
"loss": 0.4877, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.664320281566212, |
|
"eval_loss": 0.5118634700775146, |
|
"eval_runtime": 28.5287, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6687197536295645, |
|
"grad_norm": 0.1798192411661148, |
|
"learning_rate": 5.968234935414807e-05, |
|
"loss": 0.4805, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6687197536295645, |
|
"eval_loss": 0.5116167664527893, |
|
"eval_runtime": 28.5918, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6731192256929168, |
|
"grad_norm": 0.18448549509048462, |
|
"learning_rate": 5.828063973876834e-05, |
|
"loss": 0.4993, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6731192256929168, |
|
"eval_loss": 0.5111361742019653, |
|
"eval_runtime": 28.5586, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6775186977562693, |
|
"grad_norm": 0.18624383211135864, |
|
"learning_rate": 5.688878535994421e-05, |
|
"loss": 0.4844, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6775186977562693, |
|
"eval_loss": 0.5107051134109497, |
|
"eval_runtime": 28.5748, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6819181698196216, |
|
"grad_norm": 0.18364666402339935, |
|
"learning_rate": 5.550711501116789e-05, |
|
"loss": 0.4674, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6819181698196216, |
|
"eval_loss": 0.5101103186607361, |
|
"eval_runtime": 28.5159, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6863176418829741, |
|
"grad_norm": 0.23952247202396393, |
|
"learning_rate": 5.413595508018952e-05, |
|
"loss": 0.4943, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6863176418829741, |
|
"eval_loss": 0.5096238255500793, |
|
"eval_runtime": 28.516, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6907171139463264, |
|
"grad_norm": 0.20105206966400146, |
|
"learning_rate": 5.27756294719153e-05, |
|
"loss": 0.4924, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6907171139463264, |
|
"eval_loss": 0.5093135237693787, |
|
"eval_runtime": 28.5941, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6951165860096788, |
|
"grad_norm": 0.19826586544513702, |
|
"learning_rate": 5.1426459531892714e-05, |
|
"loss": 0.4986, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6951165860096788, |
|
"eval_loss": 0.5086015462875366, |
|
"eval_runtime": 28.6207, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6995160580730312, |
|
"grad_norm": 0.17991924285888672, |
|
"learning_rate": 5.008876397039983e-05, |
|
"loss": 0.4698, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6995160580730312, |
|
"eval_loss": 0.5082879662513733, |
|
"eval_runtime": 28.6587, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.7039155301363836, |
|
"grad_norm": 0.19232523441314697, |
|
"learning_rate": 4.876285878715764e-05, |
|
"loss": 0.4981, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7039155301363836, |
|
"eval_loss": 0.5078893899917603, |
|
"eval_runtime": 28.5038, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.708315002199736, |
|
"grad_norm": 0.19006720185279846, |
|
"learning_rate": 4.744905719668207e-05, |
|
"loss": 0.4758, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.708315002199736, |
|
"eval_loss": 0.5076141357421875, |
|
"eval_runtime": 28.6324, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7127144742630884, |
|
"grad_norm": 0.19002890586853027, |
|
"learning_rate": 4.614766955429447e-05, |
|
"loss": 0.4642, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7127144742630884, |
|
"eval_loss": 0.507789671421051, |
|
"eval_runtime": 28.6356, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7171139463264409, |
|
"grad_norm": 0.2051495909690857, |
|
"learning_rate": 4.485900328280731e-05, |
|
"loss": 0.4669, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7171139463264409, |
|
"eval_loss": 0.5073484182357788, |
|
"eval_runtime": 28.5748, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7215134183897932, |
|
"grad_norm": 0.6378114223480225, |
|
"learning_rate": 4.358336279990268e-05, |
|
"loss": 0.4711, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7215134183897932, |
|
"eval_loss": 0.5070581436157227, |
|
"eval_runtime": 28.6233, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7259128904531457, |
|
"grad_norm": 0.181978240609169, |
|
"learning_rate": 4.2321049446220505e-05, |
|
"loss": 0.4704, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7259128904531457, |
|
"eval_loss": 0.5068845748901367, |
|
"eval_runtime": 28.5225, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.730312362516498, |
|
"grad_norm": 0.1777966171503067, |
|
"learning_rate": 4.107236141417382e-05, |
|
"loss": 0.4752, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.730312362516498, |
|
"eval_loss": 0.5066249966621399, |
|
"eval_runtime": 28.5423, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7347118345798505, |
|
"grad_norm": 0.18686190247535706, |
|
"learning_rate": 3.9837593677507726e-05, |
|
"loss": 0.4621, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7347118345798505, |
|
"eval_loss": 0.5066962242126465, |
|
"eval_runtime": 28.428, |
|
"eval_samples_per_second": 0.598, |
|
"eval_steps_per_second": 0.317, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7391113066432028, |
|
"grad_norm": 0.18854567408561707, |
|
"learning_rate": 3.8617037921618705e-05, |
|
"loss": 0.4748, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7391113066432028, |
|
"eval_loss": 0.50632643699646, |
|
"eval_runtime": 28.5075, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7435107787065552, |
|
"grad_norm": 0.19204109907150269, |
|
"learning_rate": 3.741098247465049e-05, |
|
"loss": 0.4948, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7435107787065552, |
|
"eval_loss": 0.5060507655143738, |
|
"eval_runtime": 28.5753, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7479102507699076, |
|
"grad_norm": 0.19182614982128143, |
|
"learning_rate": 3.621971223938334e-05, |
|
"loss": 0.4832, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7479102507699076, |
|
"eval_loss": 0.5058286190032959, |
|
"eval_runtime": 28.5184, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.75230972283326, |
|
"grad_norm": 0.18205444514751434, |
|
"learning_rate": 3.504350862593231e-05, |
|
"loss": 0.4642, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.75230972283326, |
|
"eval_loss": 0.505698025226593, |
|
"eval_runtime": 28.6382, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.7567091948966124, |
|
"grad_norm": 0.20196740329265594, |
|
"learning_rate": 3.388264948527052e-05, |
|
"loss": 0.4877, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7567091948966124, |
|
"eval_loss": 0.5052359700202942, |
|
"eval_runtime": 28.5347, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7611086669599648, |
|
"grad_norm": 0.18125030398368835, |
|
"learning_rate": 3.2737409043593405e-05, |
|
"loss": 0.4727, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7611086669599648, |
|
"eval_loss": 0.504954993724823, |
|
"eval_runtime": 28.5976, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7655081390233172, |
|
"grad_norm": 0.18927669525146484, |
|
"learning_rate": 3.160805783753897e-05, |
|
"loss": 0.4691, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7655081390233172, |
|
"eval_loss": 0.5047942399978638, |
|
"eval_runtime": 28.5051, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7699076110866696, |
|
"grad_norm": 0.18508534133434296, |
|
"learning_rate": 3.0494862650279822e-05, |
|
"loss": 0.5292, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7699076110866696, |
|
"eval_loss": 0.5046341419219971, |
|
"eval_runtime": 28.5445, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.774307083150022, |
|
"grad_norm": 0.18230414390563965, |
|
"learning_rate": 2.939808644850184e-05, |
|
"loss": 0.4708, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.774307083150022, |
|
"eval_loss": 0.5046290755271912, |
|
"eval_runtime": 28.6138, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7787065552133744, |
|
"grad_norm": 0.17352643609046936, |
|
"learning_rate": 2.8317988320284228e-05, |
|
"loss": 0.4863, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7787065552133744, |
|
"eval_loss": 0.5044691562652588, |
|
"eval_runtime": 28.6321, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7831060272767268, |
|
"grad_norm": 0.1845002919435501, |
|
"learning_rate": 2.7254823413896058e-05, |
|
"loss": 0.5006, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7831060272767268, |
|
"eval_loss": 0.5042091012001038, |
|
"eval_runtime": 28.6132, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7875054993400792, |
|
"grad_norm": 0.17883773148059845, |
|
"learning_rate": 2.6208842877523278e-05, |
|
"loss": 0.4887, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7875054993400792, |
|
"eval_loss": 0.5039156675338745, |
|
"eval_runtime": 28.5693, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7919049714034316, |
|
"grad_norm": 0.19202597439289093, |
|
"learning_rate": 2.518029379994089e-05, |
|
"loss": 0.4867, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7919049714034316, |
|
"eval_loss": 0.5037320852279663, |
|
"eval_runtime": 28.549, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.796304443466784, |
|
"grad_norm": 0.18246056139469147, |
|
"learning_rate": 2.4169419152143768e-05, |
|
"loss": 0.4662, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.796304443466784, |
|
"eval_loss": 0.5035374164581299, |
|
"eval_runtime": 28.6042, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.8007039155301364, |
|
"grad_norm": 0.18989378213882446, |
|
"learning_rate": 2.317645772995042e-05, |
|
"loss": 0.4744, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8007039155301364, |
|
"eval_loss": 0.5033923387527466, |
|
"eval_runtime": 28.4795, |
|
"eval_samples_per_second": 0.597, |
|
"eval_steps_per_second": 0.316, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8051033875934888, |
|
"grad_norm": 0.19525018334388733, |
|
"learning_rate": 2.220164409759299e-05, |
|
"loss": 0.5159, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8051033875934888, |
|
"eval_loss": 0.503151535987854, |
|
"eval_runtime": 28.6198, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8095028596568412, |
|
"grad_norm": 0.18840977549552917, |
|
"learning_rate": 2.124520853230697e-05, |
|
"loss": 0.4848, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8095028596568412, |
|
"eval_loss": 0.5029481649398804, |
|
"eval_runtime": 28.614, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8139023317201936, |
|
"grad_norm": 0.18055056035518646, |
|
"learning_rate": 2.03073769699333e-05, |
|
"loss": 0.4648, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.8139023317201936, |
|
"eval_loss": 0.5028063654899597, |
|
"eval_runtime": 28.5662, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.818301803783546, |
|
"grad_norm": 0.18352611362934113, |
|
"learning_rate": 1.9388370951546432e-05, |
|
"loss": 0.4733, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.818301803783546, |
|
"eval_loss": 0.5027296543121338, |
|
"eval_runtime": 28.5532, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8227012758468983, |
|
"grad_norm": 0.18161964416503906, |
|
"learning_rate": 1.848840757112019e-05, |
|
"loss": 0.4556, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8227012758468983, |
|
"eval_loss": 0.5025849342346191, |
|
"eval_runtime": 28.6672, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8271007479102508, |
|
"grad_norm": 0.19485127925872803, |
|
"learning_rate": 1.7607699424244585e-05, |
|
"loss": 0.4973, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8271007479102508, |
|
"eval_loss": 0.5023777484893799, |
|
"eval_runtime": 28.5856, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8315002199736031, |
|
"grad_norm": 0.19218072295188904, |
|
"learning_rate": 1.674645455790468e-05, |
|
"loss": 0.4708, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8315002199736031, |
|
"eval_loss": 0.5024308562278748, |
|
"eval_runtime": 28.6001, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8358996920369556, |
|
"grad_norm": 0.18270643055438995, |
|
"learning_rate": 1.5904876421334536e-05, |
|
"loss": 0.4547, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8358996920369556, |
|
"eval_loss": 0.5024178624153137, |
|
"eval_runtime": 28.5464, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8402991641003079, |
|
"grad_norm": 0.18350371718406677, |
|
"learning_rate": 1.5083163817956914e-05, |
|
"loss": 0.4663, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8402991641003079, |
|
"eval_loss": 0.5021481513977051, |
|
"eval_runtime": 28.5783, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8446986361636604, |
|
"grad_norm": 0.18115630745887756, |
|
"learning_rate": 1.4281510858420632e-05, |
|
"loss": 0.4857, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8446986361636604, |
|
"eval_loss": 0.5019457340240479, |
|
"eval_runtime": 28.5976, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8490981082270127, |
|
"grad_norm": 0.1744571477174759, |
|
"learning_rate": 1.350010691474629e-05, |
|
"loss": 0.4633, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8490981082270127, |
|
"eval_loss": 0.5019629597663879, |
|
"eval_runtime": 28.5207, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8534975802903652, |
|
"grad_norm": 0.18827442824840546, |
|
"learning_rate": 1.2739136575591581e-05, |
|
"loss": 0.4723, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8534975802903652, |
|
"eval_loss": 0.5018792748451233, |
|
"eval_runtime": 28.4515, |
|
"eval_samples_per_second": 0.598, |
|
"eval_steps_per_second": 0.316, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8578970523537176, |
|
"grad_norm": 0.18166576325893402, |
|
"learning_rate": 1.1998779602646437e-05, |
|
"loss": 0.4691, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8578970523537176, |
|
"eval_loss": 0.5017500519752502, |
|
"eval_runtime": 28.5978, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8622965244170699, |
|
"grad_norm": 0.18091408908367157, |
|
"learning_rate": 1.1279210888168546e-05, |
|
"loss": 0.4874, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8622965244170699, |
|
"eval_loss": 0.5017052888870239, |
|
"eval_runtime": 28.7541, |
|
"eval_samples_per_second": 0.591, |
|
"eval_steps_per_second": 0.313, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8666959964804224, |
|
"grad_norm": 0.182442307472229, |
|
"learning_rate": 1.0580600413668984e-05, |
|
"loss": 0.4773, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8666959964804224, |
|
"eval_loss": 0.5016083121299744, |
|
"eval_runtime": 28.5972, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8710954685437747, |
|
"grad_norm": 0.18171900510787964, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.4806, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8710954685437747, |
|
"eval_loss": 0.5015130043029785, |
|
"eval_runtime": 28.5707, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8754949406071272, |
|
"grad_norm": 0.1896371841430664, |
|
"learning_rate": 9.246909317160746e-06, |
|
"loss": 0.4512, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8754949406071272, |
|
"eval_loss": 0.5013110637664795, |
|
"eval_runtime": 28.6509, |
|
"eval_samples_per_second": 0.593, |
|
"eval_steps_per_second": 0.314, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8798944126704795, |
|
"grad_norm": 0.1779976189136505, |
|
"learning_rate": 8.612143748910451e-06, |
|
"loss": 0.4561, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8798944126704795, |
|
"eval_loss": 0.5013135075569153, |
|
"eval_runtime": 28.8047, |
|
"eval_samples_per_second": 0.59, |
|
"eval_steps_per_second": 0.312, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.884293884733832, |
|
"grad_norm": 0.17416957020759583, |
|
"learning_rate": 7.998966453731094e-06, |
|
"loss": 0.4637, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.884293884733832, |
|
"eval_loss": 0.5013565421104431, |
|
"eval_runtime": 28.5911, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8886933567971843, |
|
"grad_norm": 0.1769402176141739, |
|
"learning_rate": 7.40752228061502e-06, |
|
"loss": 0.4527, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8886933567971843, |
|
"eval_loss": 0.5010828375816345, |
|
"eval_runtime": 28.5203, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8930928288605368, |
|
"grad_norm": 0.17784808576107025, |
|
"learning_rate": 6.8379509446057644e-06, |
|
"loss": 0.4903, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8930928288605368, |
|
"eval_loss": 0.5012202262878418, |
|
"eval_runtime": 27.8441, |
|
"eval_samples_per_second": 0.611, |
|
"eval_steps_per_second": 0.323, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8974923009238891, |
|
"grad_norm": 0.18067394196987152, |
|
"learning_rate": 6.290386993793618e-06, |
|
"loss": 0.4689, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8974923009238891, |
|
"eval_loss": 0.5012267231941223, |
|
"eval_runtime": 28.517, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9018917729872415, |
|
"grad_norm": 0.17478391528129578, |
|
"learning_rate": 5.764959777531776e-06, |
|
"loss": 0.4589, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.9018917729872415, |
|
"eval_loss": 0.5011836290359497, |
|
"eval_runtime": 28.6023, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.9062912450505939, |
|
"grad_norm": 0.185857892036438, |
|
"learning_rate": 5.261793415880456e-06, |
|
"loss": 0.4528, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9062912450505939, |
|
"eval_loss": 0.501183807849884, |
|
"eval_runtime": 28.5159, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.316, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9106907171139463, |
|
"grad_norm": 0.17951223254203796, |
|
"learning_rate": 4.781006770286478e-06, |
|
"loss": 0.4845, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.9106907171139463, |
|
"eval_loss": 0.5011433959007263, |
|
"eval_runtime": 28.6072, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.9150901891772987, |
|
"grad_norm": 0.18096089363098145, |
|
"learning_rate": 4.322713415504975e-06, |
|
"loss": 0.4578, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9150901891772987, |
|
"eval_loss": 0.5011703968048096, |
|
"eval_runtime": 28.6287, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9194896612406511, |
|
"grad_norm": 0.2069099247455597, |
|
"learning_rate": 3.887021612769936e-06, |
|
"loss": 0.5027, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.9194896612406511, |
|
"eval_loss": 0.5011240839958191, |
|
"eval_runtime": 29.0514, |
|
"eval_samples_per_second": 0.585, |
|
"eval_steps_per_second": 0.31, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.9238891333040036, |
|
"grad_norm": 0.18762987852096558, |
|
"learning_rate": 3.4740342842199956e-06, |
|
"loss": 0.4695, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9238891333040036, |
|
"eval_loss": 0.5010772347450256, |
|
"eval_runtime": 28.5655, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9282886053673559, |
|
"grad_norm": 0.178373321890831, |
|
"learning_rate": 3.0838489885854805e-06, |
|
"loss": 0.484, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.9282886053673559, |
|
"eval_loss": 0.5010451674461365, |
|
"eval_runtime": 28.6083, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.9326880774307084, |
|
"grad_norm": 0.1794215440750122, |
|
"learning_rate": 2.7165578981424357e-06, |
|
"loss": 0.4784, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9326880774307084, |
|
"eval_loss": 0.5010905265808105, |
|
"eval_runtime": 28.5675, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9370875494940607, |
|
"grad_norm": 0.17699354887008667, |
|
"learning_rate": 2.3722477769389517e-06, |
|
"loss": 0.4698, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.9370875494940607, |
|
"eval_loss": 0.5010352730751038, |
|
"eval_runtime": 28.6041, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.9414870215574132, |
|
"grad_norm": 0.17208220064640045, |
|
"learning_rate": 2.0509999602992493e-06, |
|
"loss": 0.4517, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9414870215574132, |
|
"eval_loss": 0.5010344982147217, |
|
"eval_runtime": 28.5865, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9458864936207655, |
|
"grad_norm": 0.1774464249610901, |
|
"learning_rate": 1.7528903356100469e-06, |
|
"loss": 0.4846, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.9458864936207655, |
|
"eval_loss": 0.5010223388671875, |
|
"eval_runtime": 28.5634, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.9502859656841179, |
|
"grad_norm": 0.1773741990327835, |
|
"learning_rate": 1.4779893243939359e-06, |
|
"loss": 0.4402, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9502859656841179, |
|
"eval_loss": 0.5009992718696594, |
|
"eval_runtime": 28.5952, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9546854377474703, |
|
"grad_norm": 0.18979211151599884, |
|
"learning_rate": 1.2263618656739084e-06, |
|
"loss": 0.5013, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.9546854377474703, |
|
"eval_loss": 0.501004159450531, |
|
"eval_runtime": 28.614, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.9590849098108227, |
|
"grad_norm": 0.1895236372947693, |
|
"learning_rate": 9.98067400632985e-07, |
|
"loss": 0.4588, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9590849098108227, |
|
"eval_loss": 0.5009981393814087, |
|
"eval_runtime": 28.5601, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9634843818741751, |
|
"grad_norm": 0.17328618466854095, |
|
"learning_rate": 7.931598585726563e-07, |
|
"loss": 0.4712, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.9634843818741751, |
|
"eval_loss": 0.500961184501648, |
|
"eval_runtime": 28.574, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.9678838539375275, |
|
"grad_norm": 0.18122579157352448, |
|
"learning_rate": 6.116876441733088e-07, |
|
"loss": 0.4534, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9678838539375275, |
|
"eval_loss": 0.5009814500808716, |
|
"eval_runtime": 28.5934, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9722833260008799, |
|
"grad_norm": 0.18148748576641083, |
|
"learning_rate": 4.536936260597258e-07, |
|
"loss": 0.4587, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.9722833260008799, |
|
"eval_loss": 0.5009997487068176, |
|
"eval_runtime": 28.5275, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.9766827980642323, |
|
"grad_norm": 0.18024764955043793, |
|
"learning_rate": 3.192151266743548e-07, |
|
"loss": 0.4783, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9766827980642323, |
|
"eval_loss": 0.5009670853614807, |
|
"eval_runtime": 28.5688, |
|
"eval_samples_per_second": 0.595, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9810822701275846, |
|
"grad_norm": 0.18152055144309998, |
|
"learning_rate": 2.082839134607828e-07, |
|
"loss": 0.4623, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.9810822701275846, |
|
"eval_loss": 0.5009202361106873, |
|
"eval_runtime": 28.6066, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.9854817421909371, |
|
"grad_norm": 0.17324087023735046, |
|
"learning_rate": 1.2092619135937177e-07, |
|
"loss": 0.439, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9854817421909371, |
|
"eval_loss": 0.5010377168655396, |
|
"eval_runtime": 28.5308, |
|
"eval_samples_per_second": 0.596, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9898812142542894, |
|
"grad_norm": 0.17685554921627045, |
|
"learning_rate": 5.716259661695533e-08, |
|
"loss": 0.4629, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.9898812142542894, |
|
"eval_loss": 0.5009082555770874, |
|
"eval_runtime": 28.6259, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.9942806863176419, |
|
"grad_norm": 0.17675389349460602, |
|
"learning_rate": 1.7008191912004646e-08, |
|
"loss": 0.4716, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9942806863176419, |
|
"eval_loss": 0.5009535551071167, |
|
"eval_runtime": 28.626, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.314, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9986801583809943, |
|
"grad_norm": 0.18398317694664001, |
|
"learning_rate": 4.724627964303175e-10, |
|
"loss": 0.4832, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.9986801583809943, |
|
"eval_loss": 0.5010104179382324, |
|
"eval_runtime": 28.6106, |
|
"eval_samples_per_second": 0.594, |
|
"eval_steps_per_second": 0.315, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.9995600527936648, |
|
"step": 1136, |
|
"total_flos": 7.211600370336793e+18, |
|
"train_loss": 0.039691918463984004, |
|
"train_runtime": 9596.3839, |
|
"train_samples_per_second": 1.895, |
|
"train_steps_per_second": 0.118 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1136, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.211600370336793e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|