{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.149377593360996,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04149377593360996,
      "grad_norm": 0.7534617185592651,
      "learning_rate": 1e-05,
      "loss": 1.3441,
      "step": 10
    },
    {
      "epoch": 0.08298755186721991,
      "grad_norm": 0.17292505502700806,
      "learning_rate": 2e-05,
      "loss": 1.2553,
      "step": 20
    },
    {
      "epoch": 0.12448132780082988,
      "grad_norm": 0.16635559499263763,
      "learning_rate": 3e-05,
      "loss": 1.2958,
      "step": 30
    },
    {
      "epoch": 0.16597510373443983,
      "grad_norm": 0.1318502277135849,
      "learning_rate": 4e-05,
      "loss": 1.1205,
      "step": 40
    },
    {
      "epoch": 0.2074688796680498,
      "grad_norm": 0.21048150956630707,
      "learning_rate": 5e-05,
      "loss": 1.0102,
      "step": 50
    },
    {
      "epoch": 0.24896265560165975,
      "grad_norm": 0.239803746342659,
      "learning_rate": 6e-05,
      "loss": 1.0016,
      "step": 60
    },
    {
      "epoch": 0.29045643153526973,
      "grad_norm": 0.16366511583328247,
      "learning_rate": 7e-05,
      "loss": 0.9466,
      "step": 70
    },
    {
      "epoch": 0.33195020746887965,
      "grad_norm": 0.24975861608982086,
      "learning_rate": 8e-05,
      "loss": 0.7838,
      "step": 80
    },
    {
      "epoch": 0.37344398340248963,
      "grad_norm": 0.282753586769104,
      "learning_rate": 9e-05,
      "loss": 0.7927,
      "step": 90
    },
    {
      "epoch": 0.4149377593360996,
      "grad_norm": 0.19942770898342133,
      "learning_rate": 0.0001,
      "loss": 0.7928,
      "step": 100
    },
    {
      "epoch": 0.45643153526970953,
      "grad_norm": 0.21815817058086395,
      "learning_rate": 9.888888888888889e-05,
      "loss": 0.58,
      "step": 110
    },
    {
      "epoch": 0.4979253112033195,
      "grad_norm": 0.19703078269958496,
      "learning_rate": 9.777777777777778e-05,
      "loss": 0.639,
      "step": 120
    },
    {
      "epoch": 0.5394190871369294,
      "grad_norm": 0.270720511674881,
      "learning_rate": 9.666666666666667e-05,
      "loss": 0.5981,
      "step": 130
    },
    {
      "epoch": 0.5809128630705395,
      "grad_norm": 0.2211589813232422,
      "learning_rate": 9.555555555555557e-05,
      "loss": 0.6611,
      "step": 140
    },
    {
      "epoch": 0.6224066390041494,
      "grad_norm": 0.22035779058933258,
      "learning_rate": 9.444444444444444e-05,
      "loss": 0.6056,
      "step": 150
    },
    {
      "epoch": 0.6639004149377593,
      "grad_norm": 0.3599834740161896,
      "learning_rate": 9.333333333333334e-05,
      "loss": 0.7573,
      "step": 160
    },
    {
      "epoch": 0.7053941908713693,
      "grad_norm": 0.2543361485004425,
      "learning_rate": 9.222222222222223e-05,
      "loss": 0.6592,
      "step": 170
    },
    {
      "epoch": 0.7468879668049793,
      "grad_norm": 0.26748475432395935,
      "learning_rate": 9.111111111111112e-05,
      "loss": 0.5471,
      "step": 180
    },
    {
      "epoch": 0.7883817427385892,
      "grad_norm": 0.3198457956314087,
      "learning_rate": 9e-05,
      "loss": 0.5802,
      "step": 190
    },
    {
      "epoch": 0.8298755186721992,
      "grad_norm": 0.26507750153541565,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.5088,
      "step": 200
    },
    {
      "epoch": 0.8713692946058091,
      "grad_norm": 0.32707032561302185,
      "learning_rate": 8.777777777777778e-05,
      "loss": 0.4706,
      "step": 210
    },
    {
      "epoch": 0.9128630705394191,
      "grad_norm": 0.20190617442131042,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.3719,
      "step": 220
    },
    {
      "epoch": 0.9543568464730291,
      "grad_norm": 0.270940363407135,
      "learning_rate": 8.555555555555556e-05,
      "loss": 0.5147,
      "step": 230
    },
    {
      "epoch": 0.995850622406639,
      "grad_norm": 0.34486109018325806,
      "learning_rate": 8.444444444444444e-05,
      "loss": 0.4241,
      "step": 240
    },
    {
      "epoch": 1.037344398340249,
      "grad_norm": 0.42535537481307983,
      "learning_rate": 8.333333333333334e-05,
      "loss": 0.5259,
      "step": 250
    },
    {
      "epoch": 1.0788381742738589,
      "grad_norm": 0.4141991138458252,
      "learning_rate": 8.222222222222222e-05,
      "loss": 0.3362,
      "step": 260
    },
    {
      "epoch": 1.120331950207469,
      "grad_norm": 0.38707345724105835,
      "learning_rate": 8.111111111111112e-05,
      "loss": 0.1921,
      "step": 270
    },
    {
      "epoch": 1.161825726141079,
      "grad_norm": 0.22714611887931824,
      "learning_rate": 8e-05,
      "loss": 0.5203,
      "step": 280
    },
    {
      "epoch": 1.2033195020746887,
      "grad_norm": 0.23145520687103271,
      "learning_rate": 7.88888888888889e-05,
      "loss": 0.3289,
      "step": 290
    },
    {
      "epoch": 1.2448132780082988,
      "grad_norm": 0.24606278538703918,
      "learning_rate": 7.777777777777778e-05,
      "loss": 0.4789,
      "step": 300
    },
    {
      "epoch": 1.2863070539419086,
      "grad_norm": 0.3598890006542206,
      "learning_rate": 7.666666666666667e-05,
      "loss": 0.3212,
      "step": 310
    },
    {
      "epoch": 1.3278008298755186,
      "grad_norm": 0.4746328592300415,
      "learning_rate": 7.555555555555556e-05,
      "loss": 0.4568,
      "step": 320
    },
    {
      "epoch": 1.3692946058091287,
      "grad_norm": 0.28307732939720154,
      "learning_rate": 7.444444444444444e-05,
      "loss": 0.3563,
      "step": 330
    },
    {
      "epoch": 1.4107883817427385,
      "grad_norm": 0.3323902189731598,
      "learning_rate": 7.333333333333333e-05,
      "loss": 0.5342,
      "step": 340
    },
    {
      "epoch": 1.4522821576763485,
      "grad_norm": 0.29897817969322205,
      "learning_rate": 7.222222222222222e-05,
      "loss": 0.3693,
      "step": 350
    },
    {
      "epoch": 1.4937759336099585,
      "grad_norm": 0.5410626530647278,
      "learning_rate": 7.111111111111112e-05,
      "loss": 0.4809,
      "step": 360
    },
    {
      "epoch": 1.5352697095435683,
      "grad_norm": 0.28330478072166443,
      "learning_rate": 7e-05,
      "loss": 0.3591,
      "step": 370
    },
    {
      "epoch": 1.5767634854771784,
      "grad_norm": 0.47028374671936035,
      "learning_rate": 6.88888888888889e-05,
      "loss": 0.3717,
      "step": 380
    },
    {
      "epoch": 1.6182572614107884,
      "grad_norm": 0.4893304109573364,
      "learning_rate": 6.777777777777778e-05,
      "loss": 0.3227,
      "step": 390
    },
    {
      "epoch": 1.6597510373443982,
      "grad_norm": 0.5082384347915649,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.3572,
      "step": 400
    },
    {
      "epoch": 1.7012448132780082,
      "grad_norm": 0.34136882424354553,
      "learning_rate": 6.555555555555556e-05,
      "loss": 0.4784,
      "step": 410
    },
    {
      "epoch": 1.7427385892116183,
      "grad_norm": 0.39626437425613403,
      "learning_rate": 6.444444444444446e-05,
      "loss": 0.4348,
      "step": 420
    },
    {
      "epoch": 1.784232365145228,
      "grad_norm": 0.5795237421989441,
      "learning_rate": 6.333333333333333e-05,
      "loss": 0.4211,
      "step": 430
    },
    {
      "epoch": 1.8257261410788381,
      "grad_norm": 0.6919090151786804,
      "learning_rate": 6.222222222222222e-05,
      "loss": 0.3886,
      "step": 440
    },
    {
      "epoch": 1.8672199170124482,
      "grad_norm": 0.3347764015197754,
      "learning_rate": 6.111111111111112e-05,
      "loss": 0.3753,
      "step": 450
    },
    {
      "epoch": 1.908713692946058,
      "grad_norm": 0.8023832440376282,
      "learning_rate": 6e-05,
      "loss": 0.3052,
      "step": 460
    },
    {
      "epoch": 1.950207468879668,
      "grad_norm": 0.23686052858829498,
      "learning_rate": 5.8888888888888896e-05,
      "loss": 0.4345,
      "step": 470
    },
    {
      "epoch": 1.991701244813278,
      "grad_norm": 0.38190993666648865,
      "learning_rate": 5.7777777777777776e-05,
      "loss": 0.3561,
      "step": 480
    },
    {
      "epoch": 2.033195020746888,
      "grad_norm": 0.38062775135040283,
      "learning_rate": 5.666666666666667e-05,
      "loss": 0.4208,
      "step": 490
    },
    {
      "epoch": 2.074688796680498,
      "grad_norm": 0.42297548055648804,
      "learning_rate": 5.555555555555556e-05,
      "loss": 0.2817,
      "step": 500
    },
    {
      "epoch": 2.116182572614108,
      "grad_norm": 0.5791902542114258,
      "learning_rate": 5.4444444444444446e-05,
      "loss": 0.2804,
      "step": 510
    },
    {
      "epoch": 2.1576763485477177,
      "grad_norm": 0.39327019453048706,
      "learning_rate": 5.333333333333333e-05,
      "loss": 0.2416,
      "step": 520
    },
    {
      "epoch": 2.199170124481328,
      "grad_norm": 0.49688029289245605,
      "learning_rate": 5.222222222222223e-05,
      "loss": 0.3241,
      "step": 530
    },
    {
      "epoch": 2.240663900414938,
      "grad_norm": 0.6079151630401611,
      "learning_rate": 5.111111111111111e-05,
      "loss": 0.383,
      "step": 540
    },
    {
      "epoch": 2.2821576763485476,
      "grad_norm": 0.2557106614112854,
      "learning_rate": 5e-05,
      "loss": 0.2163,
      "step": 550
    },
    {
      "epoch": 2.323651452282158,
      "grad_norm": 0.41979703307151794,
      "learning_rate": 4.888888888888889e-05,
      "loss": 0.3263,
      "step": 560
    },
    {
      "epoch": 2.3651452282157677,
      "grad_norm": 0.298300564289093,
      "learning_rate": 4.7777777777777784e-05,
      "loss": 0.178,
      "step": 570
    },
    {
      "epoch": 2.4066390041493775,
      "grad_norm": 0.33259597420692444,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.2444,
      "step": 580
    },
    {
      "epoch": 2.4481327800829877,
      "grad_norm": 0.3294433355331421,
      "learning_rate": 4.555555555555556e-05,
      "loss": 0.1401,
      "step": 590
    },
    {
      "epoch": 2.4896265560165975,
      "grad_norm": 0.4625357389450073,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.3397,
      "step": 600
    },
    {
      "epoch": 2.5311203319502074,
      "grad_norm": 0.5396658778190613,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.3147,
      "step": 610
    },
    {
      "epoch": 2.572614107883817,
      "grad_norm": 0.4528850317001343,
      "learning_rate": 4.222222222222222e-05,
      "loss": 0.2427,
      "step": 620
    },
    {
      "epoch": 2.6141078838174274,
      "grad_norm": 0.47281742095947266,
      "learning_rate": 4.111111111111111e-05,
      "loss": 0.2276,
      "step": 630
    },
    {
      "epoch": 2.6556016597510372,
      "grad_norm": 0.2738474905490875,
      "learning_rate": 4e-05,
      "loss": 0.2599,
      "step": 640
    },
    {
      "epoch": 2.6970954356846475,
      "grad_norm": 0.31404346227645874,
      "learning_rate": 3.888888888888889e-05,
      "loss": 0.2423,
      "step": 650
    },
    {
      "epoch": 2.7385892116182573,
      "grad_norm": 0.53602135181427,
      "learning_rate": 3.777777777777778e-05,
      "loss": 0.2915,
      "step": 660
    },
    {
      "epoch": 2.780082987551867,
      "grad_norm": 0.312480092048645,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.3175,
      "step": 670
    },
    {
      "epoch": 2.821576763485477,
      "grad_norm": 0.2691058814525604,
      "learning_rate": 3.555555555555556e-05,
      "loss": 0.2499,
      "step": 680
    },
    {
      "epoch": 2.863070539419087,
      "grad_norm": 0.5619029998779297,
      "learning_rate": 3.444444444444445e-05,
      "loss": 0.3429,
      "step": 690
    },
    {
      "epoch": 2.904564315352697,
      "grad_norm": 0.625348687171936,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.1967,
      "step": 700
    },
    {
      "epoch": 2.9460580912863072,
      "grad_norm": 0.20030611753463745,
      "learning_rate": 3.222222222222223e-05,
      "loss": 0.2182,
      "step": 710
    },
    {
      "epoch": 2.987551867219917,
      "grad_norm": 0.2181701362133026,
      "learning_rate": 3.111111111111111e-05,
      "loss": 0.3135,
      "step": 720
    },
    {
      "epoch": 3.029045643153527,
      "grad_norm": 0.5375049710273743,
      "learning_rate": 3e-05,
      "loss": 0.2967,
      "step": 730
    },
    {
      "epoch": 3.070539419087137,
      "grad_norm": 0.40118399262428284,
      "learning_rate": 2.8888888888888888e-05,
      "loss": 0.1545,
      "step": 740
    },
    {
      "epoch": 3.112033195020747,
      "grad_norm": 0.7035902738571167,
      "learning_rate": 2.777777777777778e-05,
      "loss": 0.2678,
      "step": 750
    },
    {
      "epoch": 3.1535269709543567,
      "grad_norm": 0.7388056516647339,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.2665,
      "step": 760
    },
    {
      "epoch": 3.195020746887967,
      "grad_norm": 0.7555689811706543,
      "learning_rate": 2.5555555555555554e-05,
      "loss": 0.2337,
      "step": 770
    },
    {
      "epoch": 3.236514522821577,
      "grad_norm": 0.8938640356063843,
      "learning_rate": 2.4444444444444445e-05,
      "loss": 0.1993,
      "step": 780
    },
    {
      "epoch": 3.2780082987551866,
      "grad_norm": 0.7519901990890503,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 0.2179,
      "step": 790
    },
    {
      "epoch": 3.3195020746887964,
      "grad_norm": 0.3737233281135559,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.1721,
      "step": 800
    },
    {
      "epoch": 3.3609958506224067,
      "grad_norm": 0.6671114563941956,
      "learning_rate": 2.111111111111111e-05,
      "loss": 0.2366,
      "step": 810
    },
    {
      "epoch": 3.4024896265560165,
      "grad_norm": 0.5140301585197449,
      "learning_rate": 2e-05,
      "loss": 0.2073,
      "step": 820
    },
    {
      "epoch": 3.4439834024896268,
      "grad_norm": 0.2032405585050583,
      "learning_rate": 1.888888888888889e-05,
      "loss": 0.1978,
      "step": 830
    },
    {
      "epoch": 3.4854771784232366,
      "grad_norm": 0.5465924739837646,
      "learning_rate": 1.777777777777778e-05,
      "loss": 0.2103,
      "step": 840
    },
    {
      "epoch": 3.5269709543568464,
      "grad_norm": 0.49002525210380554,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.2528,
      "step": 850
    },
    {
      "epoch": 3.568464730290456,
      "grad_norm": 0.37268325686454773,
      "learning_rate": 1.5555555555555555e-05,
      "loss": 0.2067,
      "step": 860
    },
    {
      "epoch": 3.6099585062240664,
      "grad_norm": 0.8844376802444458,
      "learning_rate": 1.4444444444444444e-05,
      "loss": 0.1711,
      "step": 870
    },
    {
      "epoch": 3.6514522821576763,
      "grad_norm": 0.4846467077732086,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.1465,
      "step": 880
    },
    {
      "epoch": 3.6929460580912865,
      "grad_norm": 0.645414412021637,
      "learning_rate": 1.2222222222222222e-05,
      "loss": 0.2711,
      "step": 890
    },
    {
      "epoch": 3.7344398340248963,
      "grad_norm": 0.4424195885658264,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 0.1949,
      "step": 900
    },
    {
      "epoch": 3.775933609958506,
      "grad_norm": 0.21110430359840393,
      "learning_rate": 1e-05,
      "loss": 0.1738,
      "step": 910
    },
    {
      "epoch": 3.817427385892116,
      "grad_norm": 0.3548027276992798,
      "learning_rate": 8.88888888888889e-06,
      "loss": 0.1363,
      "step": 920
    },
    {
      "epoch": 3.858921161825726,
      "grad_norm": 0.43183088302612305,
      "learning_rate": 7.777777777777777e-06,
      "loss": 0.205,
      "step": 930
    },
    {
      "epoch": 3.900414937759336,
      "grad_norm": 0.21933656930923462,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.2037,
      "step": 940
    },
    {
      "epoch": 3.9419087136929463,
      "grad_norm": 0.6654905080795288,
      "learning_rate": 5.555555555555556e-06,
      "loss": 0.209,
      "step": 950
    },
    {
      "epoch": 3.983402489626556,
      "grad_norm": 0.2863740026950836,
      "learning_rate": 4.444444444444445e-06,
      "loss": 0.2686,
      "step": 960
    },
    {
      "epoch": 4.024896265560166,
      "grad_norm": 0.23517794907093048,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.1827,
      "step": 970
    },
    {
      "epoch": 4.066390041493776,
      "grad_norm": 0.7014562487602234,
      "learning_rate": 2.2222222222222225e-06,
      "loss": 0.2283,
      "step": 980
    },
    {
      "epoch": 4.1078838174273855,
      "grad_norm": 0.4174599051475525,
      "learning_rate": 1.1111111111111112e-06,
      "loss": 0.1267,
      "step": 990
    },
    {
      "epoch": 4.149377593360996,
      "grad_norm": 0.1902274340391159,
      "learning_rate": 0.0,
      "loss": 0.1947,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7702734288592896.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}