{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2260,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04424778761061947,
      "grad_norm": NaN,
      "learning_rate": 0.0009973451327433627,
      "loss": 14.6523,
      "step": 10
    },
    {
      "epoch": 0.08849557522123894,
      "grad_norm": 0.9945968985557556,
      "learning_rate": 0.000992920353982301,
      "loss": 4.8948,
      "step": 20
    },
    {
      "epoch": 0.13274336283185842,
      "grad_norm": 0.3495340645313263,
      "learning_rate": 0.000988495575221239,
      "loss": 0.6469,
      "step": 30
    },
    {
      "epoch": 0.17699115044247787,
      "grad_norm": 0.22641977667808533,
      "learning_rate": 0.000984070796460177,
      "loss": 0.5221,
      "step": 40
    },
    {
      "epoch": 0.22123893805309736,
      "grad_norm": 0.25233855843544006,
      "learning_rate": 0.000979646017699115,
      "loss": 0.4094,
      "step": 50
    },
    {
      "epoch": 0.26548672566371684,
      "grad_norm": 0.37399861216545105,
      "learning_rate": 0.0009752212389380531,
      "loss": 0.3958,
      "step": 60
    },
    {
      "epoch": 0.30973451327433627,
      "grad_norm": 0.18545609712600708,
      "learning_rate": 0.0009707964601769911,
      "loss": 0.3405,
      "step": 70
    },
    {
      "epoch": 0.35398230088495575,
      "grad_norm": 0.2712928354740143,
      "learning_rate": 0.0009663716814159293,
      "loss": 0.3242,
      "step": 80
    },
    {
      "epoch": 0.39823008849557523,
      "grad_norm": 0.2340475469827652,
      "learning_rate": 0.0009619469026548673,
      "loss": 0.3007,
      "step": 90
    },
    {
      "epoch": 0.4424778761061947,
      "grad_norm": 0.18099136650562286,
      "learning_rate": 0.0009575221238938053,
      "loss": 0.2567,
      "step": 100
    },
    {
      "epoch": 0.48672566371681414,
      "grad_norm": 0.23833367228507996,
      "learning_rate": 0.0009530973451327434,
      "loss": 0.2734,
      "step": 110
    },
    {
      "epoch": 0.5309734513274337,
      "grad_norm": 0.20163732767105103,
      "learning_rate": 0.0009486725663716814,
      "loss": 0.2326,
      "step": 120
    },
    {
      "epoch": 0.5752212389380531,
      "grad_norm": 0.1758851557970047,
      "learning_rate": 0.0009442477876106195,
      "loss": 0.2914,
      "step": 130
    },
    {
      "epoch": 0.6194690265486725,
      "grad_norm": 0.211241215467453,
      "learning_rate": 0.0009398230088495575,
      "loss": 0.2667,
      "step": 140
    },
    {
      "epoch": 0.6637168141592921,
      "grad_norm": 0.22571340203285217,
      "learning_rate": 0.0009353982300884956,
      "loss": 0.2268,
      "step": 150
    },
    {
      "epoch": 0.7079646017699115,
      "grad_norm": 0.20469224452972412,
      "learning_rate": 0.0009309734513274336,
      "loss": 0.2386,
      "step": 160
    },
    {
      "epoch": 0.7522123893805309,
      "grad_norm": 0.21183688938617706,
      "learning_rate": 0.0009265486725663716,
      "loss": 0.282,
      "step": 170
    },
    {
      "epoch": 0.7964601769911505,
      "grad_norm": 0.17585916817188263,
      "learning_rate": 0.0009221238938053097,
      "loss": 0.3046,
      "step": 180
    },
    {
      "epoch": 0.8407079646017699,
      "grad_norm": 0.17937427759170532,
      "learning_rate": 0.0009176991150442479,
      "loss": 0.2693,
      "step": 190
    },
    {
      "epoch": 0.8849557522123894,
      "grad_norm": 0.19432350993156433,
      "learning_rate": 0.0009132743362831859,
      "loss": 0.252,
      "step": 200
    },
    {
      "epoch": 0.9292035398230089,
      "grad_norm": 0.18185169994831085,
      "learning_rate": 0.0009088495575221239,
      "loss": 0.2793,
      "step": 210
    },
    {
      "epoch": 0.9734513274336283,
      "grad_norm": 0.18515343964099884,
      "learning_rate": 0.000904424778761062,
      "loss": 0.2644,
      "step": 220
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.22543948888778687,
      "eval_runtime": 3.0243,
      "eval_samples_per_second": 33.066,
      "eval_steps_per_second": 8.266,
      "step": 226
    },
    {
      "epoch": 1.0176991150442478,
      "grad_norm": 0.2031005322933197,
      "learning_rate": 0.0009000000000000001,
      "loss": 0.2704,
      "step": 230
    },
    {
      "epoch": 1.0619469026548674,
      "grad_norm": 0.26087555289268494,
      "learning_rate": 0.0008955752212389381,
      "loss": 0.2526,
      "step": 240
    },
    {
      "epoch": 1.1061946902654867,
      "grad_norm": 0.1796620637178421,
      "learning_rate": 0.0008911504424778761,
      "loss": 0.2605,
      "step": 250
    },
    {
      "epoch": 1.1504424778761062,
      "grad_norm": 0.22667303681373596,
      "learning_rate": 0.0008867256637168141,
      "loss": 0.261,
      "step": 260
    },
    {
      "epoch": 1.1946902654867257,
      "grad_norm": 0.22089733183383942,
      "learning_rate": 0.0008823008849557523,
      "loss": 0.2762,
      "step": 270
    },
    {
      "epoch": 1.238938053097345,
      "grad_norm": 0.19162122905254364,
      "learning_rate": 0.0008778761061946903,
      "loss": 0.2325,
      "step": 280
    },
    {
      "epoch": 1.2831858407079646,
      "grad_norm": 0.1732087880373001,
      "learning_rate": 0.0008734513274336283,
      "loss": 0.2455,
      "step": 290
    },
    {
      "epoch": 1.3274336283185841,
      "grad_norm": 0.15953731536865234,
      "learning_rate": 0.0008690265486725663,
      "loss": 0.2155,
      "step": 300
    },
    {
      "epoch": 1.3716814159292037,
      "grad_norm": 0.229411318898201,
      "learning_rate": 0.0008646017699115044,
      "loss": 0.2289,
      "step": 310
    },
    {
      "epoch": 1.415929203539823,
      "grad_norm": 0.20390523970127106,
      "learning_rate": 0.0008601769911504425,
      "loss": 0.2429,
      "step": 320
    },
    {
      "epoch": 1.4601769911504425,
      "grad_norm": 0.23142680525779724,
      "learning_rate": 0.0008557522123893805,
      "loss": 0.2291,
      "step": 330
    },
    {
      "epoch": 1.504424778761062,
      "grad_norm": 0.22689059376716614,
      "learning_rate": 0.0008513274336283185,
      "loss": 0.2369,
      "step": 340
    },
    {
      "epoch": 1.5486725663716814,
      "grad_norm": 0.18759772181510925,
      "learning_rate": 0.0008469026548672567,
      "loss": 0.1887,
      "step": 350
    },
    {
      "epoch": 1.592920353982301,
      "grad_norm": 0.17289893329143524,
      "learning_rate": 0.0008424778761061948,
      "loss": 0.2547,
      "step": 360
    },
    {
      "epoch": 1.6371681415929205,
      "grad_norm": 0.20804202556610107,
      "learning_rate": 0.0008380530973451328,
      "loss": 0.2446,
      "step": 370
    },
    {
      "epoch": 1.6814159292035398,
      "grad_norm": 0.2161918580532074,
      "learning_rate": 0.0008336283185840708,
      "loss": 0.2262,
      "step": 380
    },
    {
      "epoch": 1.7256637168141593,
      "grad_norm": 0.27487823367118835,
      "learning_rate": 0.0008292035398230089,
      "loss": 0.2673,
      "step": 390
    },
    {
      "epoch": 1.7699115044247788,
      "grad_norm": 0.20181554555892944,
      "learning_rate": 0.0008247787610619469,
      "loss": 0.252,
      "step": 400
    },
    {
      "epoch": 1.8141592920353982,
      "grad_norm": 0.21222522854804993,
      "learning_rate": 0.000820353982300885,
      "loss": 0.23,
      "step": 410
    },
    {
      "epoch": 1.8584070796460177,
      "grad_norm": 0.21409285068511963,
      "learning_rate": 0.000815929203539823,
      "loss": 0.235,
      "step": 420
    },
    {
      "epoch": 1.9026548672566372,
      "grad_norm": 0.2830056846141815,
      "learning_rate": 0.0008115044247787611,
      "loss": 0.2335,
      "step": 430
    },
    {
      "epoch": 1.9469026548672566,
      "grad_norm": 0.22915257513523102,
      "learning_rate": 0.0008070796460176991,
      "loss": 0.2303,
      "step": 440
    },
    {
      "epoch": 1.991150442477876,
      "grad_norm": 0.19883762300014496,
      "learning_rate": 0.0008026548672566371,
      "loss": 0.2222,
      "step": 450
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.21643634140491486,
      "eval_runtime": 2.7454,
      "eval_samples_per_second": 36.424,
      "eval_steps_per_second": 9.106,
      "step": 452
    },
    {
      "epoch": 2.0353982300884956,
      "grad_norm": 0.2121458351612091,
      "learning_rate": 0.0007982300884955752,
      "loss": 0.2403,
      "step": 460
    },
    {
      "epoch": 2.079646017699115,
      "grad_norm": 0.17018261551856995,
      "learning_rate": 0.0007938053097345133,
      "loss": 0.213,
      "step": 470
    },
    {
      "epoch": 2.1238938053097347,
      "grad_norm": 0.22500459849834442,
      "learning_rate": 0.0007893805309734513,
      "loss": 0.2239,
      "step": 480
    },
    {
      "epoch": 2.168141592920354,
      "grad_norm": 0.19334179162979126,
      "learning_rate": 0.0007849557522123893,
      "loss": 0.2106,
      "step": 490
    },
    {
      "epoch": 2.2123893805309733,
      "grad_norm": 0.1906515508890152,
      "learning_rate": 0.0007805309734513274,
      "loss": 0.2037,
      "step": 500
    },
    {
      "epoch": 2.256637168141593,
      "grad_norm": 0.2478450983762741,
      "learning_rate": 0.0007761061946902656,
      "loss": 0.2164,
      "step": 510
    },
    {
      "epoch": 2.3008849557522124,
      "grad_norm": 0.2270224243402481,
      "learning_rate": 0.0007716814159292036,
      "loss": 0.2253,
      "step": 520
    },
    {
      "epoch": 2.3451327433628317,
      "grad_norm": 0.2539624273777008,
      "learning_rate": 0.0007672566371681416,
      "loss": 0.2016,
      "step": 530
    },
    {
      "epoch": 2.3893805309734515,
      "grad_norm": 0.33118170499801636,
      "learning_rate": 0.0007628318584070797,
      "loss": 0.2239,
      "step": 540
    },
    {
      "epoch": 2.433628318584071,
      "grad_norm": 0.24022382497787476,
      "learning_rate": 0.0007584070796460178,
      "loss": 0.2339,
      "step": 550
    },
    {
      "epoch": 2.47787610619469,
      "grad_norm": 0.22129379212856293,
      "learning_rate": 0.0007539823008849558,
      "loss": 0.2079,
      "step": 560
    },
    {
      "epoch": 2.52212389380531,
      "grad_norm": 0.20302246510982513,
      "learning_rate": 0.0007495575221238938,
      "loss": 0.2012,
      "step": 570
    },
    {
      "epoch": 2.566371681415929,
      "grad_norm": 0.28677117824554443,
      "learning_rate": 0.0007451327433628319,
      "loss": 0.2281,
      "step": 580
    },
    {
      "epoch": 2.6106194690265485,
      "grad_norm": 0.2567579746246338,
      "learning_rate": 0.0007407079646017699,
      "loss": 0.2374,
      "step": 590
    },
    {
      "epoch": 2.6548672566371683,
      "grad_norm": 0.2306365817785263,
      "learning_rate": 0.000736283185840708,
      "loss": 0.2144,
      "step": 600
    },
    {
      "epoch": 2.6991150442477876,
      "grad_norm": 0.23293821513652802,
      "learning_rate": 0.000731858407079646,
      "loss": 0.2381,
      "step": 610
    },
    {
      "epoch": 2.7433628318584073,
      "grad_norm": 0.2173946499824524,
      "learning_rate": 0.0007274336283185841,
      "loss": 0.2155,
      "step": 620
    },
    {
      "epoch": 2.7876106194690267,
      "grad_norm": 0.30976563692092896,
      "learning_rate": 0.0007230088495575221,
      "loss": 0.2262,
      "step": 630
    },
    {
      "epoch": 2.831858407079646,
      "grad_norm": 0.19489358365535736,
      "learning_rate": 0.0007185840707964601,
      "loss": 0.2194,
      "step": 640
    },
    {
      "epoch": 2.8761061946902657,
      "grad_norm": 0.21821223199367523,
      "learning_rate": 0.0007141592920353982,
      "loss": 0.1967,
      "step": 650
    },
    {
      "epoch": 2.920353982300885,
      "grad_norm": 0.23535631597042084,
      "learning_rate": 0.0007097345132743363,
      "loss": 0.2353,
      "step": 660
    },
    {
      "epoch": 2.9646017699115044,
      "grad_norm": 0.20547734200954437,
      "learning_rate": 0.0007053097345132744,
      "loss": 0.2119,
      "step": 670
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.21383462846279144,
      "eval_runtime": 2.6363,
      "eval_samples_per_second": 37.932,
      "eval_steps_per_second": 9.483,
      "step": 678
    },
    {
      "epoch": 3.0088495575221237,
      "grad_norm": 0.21669970452785492,
      "learning_rate": 0.0007008849557522124,
      "loss": 0.2198,
      "step": 680
    },
    {
      "epoch": 3.0530973451327434,
      "grad_norm": 0.20589256286621094,
      "learning_rate": 0.0006964601769911505,
      "loss": 0.2002,
      "step": 690
    },
    {
      "epoch": 3.0973451327433628,
      "grad_norm": 0.23902471363544464,
      "learning_rate": 0.0006920353982300886,
      "loss": 0.1804,
      "step": 700
    },
    {
      "epoch": 3.1415929203539825,
      "grad_norm": 0.2881176173686981,
      "learning_rate": 0.0006876106194690266,
      "loss": 0.2162,
      "step": 710
    },
    {
      "epoch": 3.185840707964602,
      "grad_norm": 0.22364391386508942,
      "learning_rate": 0.0006831858407079646,
      "loss": 0.2185,
      "step": 720
    },
    {
      "epoch": 3.230088495575221,
      "grad_norm": 0.23607216775417328,
      "learning_rate": 0.0006787610619469026,
      "loss": 0.2124,
      "step": 730
    },
    {
      "epoch": 3.274336283185841,
      "grad_norm": 0.18838390707969666,
      "learning_rate": 0.0006743362831858408,
      "loss": 0.179,
      "step": 740
    },
    {
      "epoch": 3.3185840707964602,
      "grad_norm": 0.3451661765575409,
      "learning_rate": 0.0006699115044247788,
      "loss": 0.2135,
      "step": 750
    },
    {
      "epoch": 3.3628318584070795,
      "grad_norm": 0.2281007319688797,
      "learning_rate": 0.0006654867256637168,
      "loss": 0.2071,
      "step": 760
    },
    {
      "epoch": 3.4070796460176993,
      "grad_norm": 0.20740865170955658,
      "learning_rate": 0.0006610619469026548,
      "loss": 0.2081,
      "step": 770
    },
    {
      "epoch": 3.4513274336283186,
      "grad_norm": 0.27458012104034424,
      "learning_rate": 0.0006566371681415929,
      "loss": 0.2026,
      "step": 780
    },
    {
      "epoch": 3.495575221238938,
      "grad_norm": 0.19083356857299805,
      "learning_rate": 0.000652212389380531,
      "loss": 0.1946,
      "step": 790
    },
    {
      "epoch": 3.5398230088495577,
      "grad_norm": 0.2667248845100403,
      "learning_rate": 0.000647787610619469,
      "loss": 0.2141,
      "step": 800
    },
    {
      "epoch": 3.584070796460177,
      "grad_norm": 0.22773493826389313,
      "learning_rate": 0.000643362831858407,
      "loss": 0.2294,
      "step": 810
    },
    {
      "epoch": 3.6283185840707963,
      "grad_norm": 0.24344410002231598,
      "learning_rate": 0.0006389380530973451,
      "loss": 0.1799,
      "step": 820
    },
    {
      "epoch": 3.672566371681416,
      "grad_norm": 0.3232133984565735,
      "learning_rate": 0.0006345132743362833,
      "loss": 0.1807,
      "step": 830
    },
    {
      "epoch": 3.7168141592920354,
      "grad_norm": 0.22465798258781433,
      "learning_rate": 0.0006300884955752213,
      "loss": 0.2005,
      "step": 840
    },
    {
      "epoch": 3.7610619469026547,
      "grad_norm": 0.24152274429798126,
      "learning_rate": 0.0006256637168141594,
      "loss": 0.2001,
      "step": 850
    },
    {
      "epoch": 3.8053097345132745,
      "grad_norm": 0.2764975130558014,
      "learning_rate": 0.0006212389380530974,
      "loss": 0.1691,
      "step": 860
    },
    {
      "epoch": 3.849557522123894,
      "grad_norm": 0.23789626359939575,
      "learning_rate": 0.0006168141592920354,
      "loss": 0.2318,
      "step": 870
    },
    {
      "epoch": 3.893805309734513,
      "grad_norm": 0.21235798299312592,
      "learning_rate": 0.0006123893805309735,
      "loss": 0.1867,
      "step": 880
    },
    {
      "epoch": 3.938053097345133,
      "grad_norm": 0.23083995282649994,
      "learning_rate": 0.0006079646017699116,
      "loss": 0.2135,
      "step": 890
    },
    {
      "epoch": 3.982300884955752,
      "grad_norm": 0.22863389551639557,
      "learning_rate": 0.0006035398230088496,
      "loss": 0.2188,
      "step": 900
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.20991046726703644,
      "eval_runtime": 2.9553,
      "eval_samples_per_second": 33.837,
      "eval_steps_per_second": 8.459,
      "step": 904
    },
    {
      "epoch": 4.0265486725663715,
      "grad_norm": 0.22170217335224152,
      "learning_rate": 0.0005991150442477876,
      "loss": 0.2186,
      "step": 910
    },
    {
      "epoch": 4.070796460176991,
      "grad_norm": 0.2190970778465271,
      "learning_rate": 0.0005946902654867256,
      "loss": 0.1978,
      "step": 920
    },
    {
      "epoch": 4.115044247787611,
      "grad_norm": 0.1924510896205902,
      "learning_rate": 0.0005902654867256638,
      "loss": 0.1787,
      "step": 930
    },
    {
      "epoch": 4.15929203539823,
      "grad_norm": 0.2868868112564087,
      "learning_rate": 0.0005858407079646018,
      "loss": 0.172,
      "step": 940
    },
    {
      "epoch": 4.20353982300885,
      "grad_norm": 0.18888860940933228,
      "learning_rate": 0.0005814159292035398,
      "loss": 0.1761,
      "step": 950
    },
    {
      "epoch": 4.247787610619469,
      "grad_norm": 0.21858586370944977,
      "learning_rate": 0.0005769911504424778,
      "loss": 0.1871,
      "step": 960
    },
    {
      "epoch": 4.292035398230088,
      "grad_norm": 0.305698961019516,
      "learning_rate": 0.0005725663716814159,
      "loss": 0.1886,
      "step": 970
    },
    {
      "epoch": 4.336283185840708,
      "grad_norm": 0.23597249388694763,
      "learning_rate": 0.000568141592920354,
      "loss": 0.1865,
      "step": 980
    },
    {
      "epoch": 4.380530973451328,
      "grad_norm": 0.271823912858963,
      "learning_rate": 0.0005637168141592921,
      "loss": 0.1709,
      "step": 990
    },
    {
      "epoch": 4.424778761061947,
      "grad_norm": 0.19630669057369232,
      "learning_rate": 0.0005592920353982301,
      "loss": 0.2429,
      "step": 1000
    },
    {
      "epoch": 4.469026548672566,
      "grad_norm": 0.29825878143310547,
      "learning_rate": 0.0005548672566371682,
      "loss": 0.1879,
      "step": 1010
    },
    {
      "epoch": 4.513274336283186,
      "grad_norm": 0.21552462875843048,
      "learning_rate": 0.0005504424778761063,
      "loss": 0.1905,
      "step": 1020
    },
    {
      "epoch": 4.557522123893805,
      "grad_norm": 0.28668805956840515,
      "learning_rate": 0.0005460176991150443,
      "loss": 0.1951,
      "step": 1030
    },
    {
      "epoch": 4.601769911504425,
      "grad_norm": 0.27180853486061096,
      "learning_rate": 0.0005415929203539823,
      "loss": 0.1758,
      "step": 1040
    },
    {
      "epoch": 4.646017699115045,
      "grad_norm": 0.3072490394115448,
      "learning_rate": 0.0005371681415929204,
      "loss": 0.1852,
      "step": 1050
    },
    {
      "epoch": 4.6902654867256635,
      "grad_norm": 0.2913398742675781,
      "learning_rate": 0.0005327433628318584,
      "loss": 0.201,
      "step": 1060
    },
    {
      "epoch": 4.734513274336283,
      "grad_norm": 0.29055866599082947,
      "learning_rate": 0.0005283185840707965,
      "loss": 0.1932,
      "step": 1070
    },
    {
      "epoch": 4.778761061946903,
      "grad_norm": 0.2742849290370941,
      "learning_rate": 0.0005238938053097345,
      "loss": 0.183,
      "step": 1080
    },
    {
      "epoch": 4.823008849557522,
      "grad_norm": 0.2370535433292389,
      "learning_rate": 0.0005194690265486726,
      "loss": 0.1849,
      "step": 1090
    },
    {
      "epoch": 4.867256637168142,
      "grad_norm": 0.31343671679496765,
      "learning_rate": 0.0005150442477876106,
      "loss": 0.2195,
      "step": 1100
    },
    {
      "epoch": 4.911504424778761,
      "grad_norm": 0.3136596381664276,
      "learning_rate": 0.0005106194690265486,
      "loss": 0.1907,
      "step": 1110
    },
    {
      "epoch": 4.95575221238938,
      "grad_norm": 0.2071835845708847,
      "learning_rate": 0.0005061946902654867,
      "loss": 0.1969,
      "step": 1120
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.25057336688041687,
      "learning_rate": 0.0005017699115044248,
      "loss": 0.1916,
      "step": 1130
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.21029528975486755,
      "eval_runtime": 2.628,
      "eval_samples_per_second": 38.052,
      "eval_steps_per_second": 9.513,
      "step": 1130
    },
    {
      "epoch": 5.04424778761062,
      "grad_norm": 0.21927224099636078,
      "learning_rate": 0.0004973451327433628,
      "loss": 0.155,
      "step": 1140
    },
    {
      "epoch": 5.088495575221239,
      "grad_norm": 0.3175056576728821,
      "learning_rate": 0.0004929203539823009,
      "loss": 0.189,
      "step": 1150
    },
    {
      "epoch": 5.132743362831858,
      "grad_norm": 0.2786344587802887,
      "learning_rate": 0.0004884955752212389,
      "loss": 0.1679,
      "step": 1160
    },
    {
      "epoch": 5.176991150442478,
      "grad_norm": 0.2475520521402359,
      "learning_rate": 0.00048407079646017696,
      "loss": 0.1855,
      "step": 1170
    },
    {
      "epoch": 5.221238938053097,
      "grad_norm": 0.24603202939033508,
      "learning_rate": 0.00047964601769911504,
      "loss": 0.1755,
      "step": 1180
    },
    {
      "epoch": 5.265486725663717,
      "grad_norm": 0.26339662075042725,
      "learning_rate": 0.00047522123893805305,
      "loss": 0.1644,
      "step": 1190
    },
    {
      "epoch": 5.3097345132743365,
      "grad_norm": 0.20065292716026306,
      "learning_rate": 0.0004707964601769912,
      "loss": 0.1555,
      "step": 1200
    },
    {
      "epoch": 5.353982300884955,
      "grad_norm": 0.34847521781921387,
      "learning_rate": 0.00046637168141592925,
      "loss": 0.1644,
      "step": 1210
    },
    {
      "epoch": 5.398230088495575,
      "grad_norm": 0.41893231868743896,
      "learning_rate": 0.00046194690265486727,
      "loss": 0.1661,
      "step": 1220
    },
    {
      "epoch": 5.442477876106195,
      "grad_norm": 0.2889445424079895,
      "learning_rate": 0.00045752212389380535,
      "loss": 0.1924,
      "step": 1230
    },
    {
      "epoch": 5.486725663716814,
      "grad_norm": 0.24809350073337555,
      "learning_rate": 0.00045309734513274336,
      "loss": 0.1941,
      "step": 1240
    },
    {
      "epoch": 5.530973451327434,
      "grad_norm": 0.27125945687294006,
      "learning_rate": 0.00044867256637168144,
      "loss": 0.1731,
      "step": 1250
    },
    {
      "epoch": 5.575221238938053,
      "grad_norm": 0.3384355902671814,
      "learning_rate": 0.00044424778761061946,
      "loss": 0.164,
      "step": 1260
    },
    {
      "epoch": 5.619469026548672,
      "grad_norm": 0.3089454174041748,
      "learning_rate": 0.00043982300884955753,
      "loss": 0.1823,
      "step": 1270
    },
    {
      "epoch": 5.663716814159292,
      "grad_norm": 0.26540765166282654,
      "learning_rate": 0.0004353982300884956,
      "loss": 0.1762,
      "step": 1280
    },
    {
      "epoch": 5.707964601769912,
      "grad_norm": 0.22383682429790497,
      "learning_rate": 0.0004309734513274337,
      "loss": 0.2063,
      "step": 1290
    },
    {
      "epoch": 5.752212389380531,
      "grad_norm": 0.24541282653808594,
      "learning_rate": 0.0004265486725663717,
      "loss": 0.1799,
      "step": 1300
    },
    {
      "epoch": 5.79646017699115,
      "grad_norm": 0.33302921056747437,
      "learning_rate": 0.00042212389380530976,
      "loss": 0.1749,
      "step": 1310
    },
    {
      "epoch": 5.84070796460177,
      "grad_norm": 0.274087131023407,
      "learning_rate": 0.0004176991150442478,
      "loss": 0.1982,
      "step": 1320
    },
    {
      "epoch": 5.88495575221239,
      "grad_norm": 0.3344975411891937,
      "learning_rate": 0.00041327433628318586,
      "loss": 0.1962,
      "step": 1330
    },
    {
      "epoch": 5.929203539823009,
      "grad_norm": 0.28589603304862976,
      "learning_rate": 0.0004088495575221239,
      "loss": 0.2078,
      "step": 1340
    },
    {
      "epoch": 5.9734513274336285,
      "grad_norm": 0.18417391180992126,
      "learning_rate": 0.00040442477876106195,
      "loss": 0.1806,
      "step": 1350
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.20804466307163239,
      "eval_runtime": 2.6659,
      "eval_samples_per_second": 37.511,
      "eval_steps_per_second": 9.378,
      "step": 1356
    },
    {
      "epoch": 6.017699115044247,
      "grad_norm": 0.24382148683071136,
      "learning_rate": 0.0004,
      "loss": 0.1675,
      "step": 1360
    },
    {
      "epoch": 6.061946902654867,
      "grad_norm": 0.2718934714794159,
      "learning_rate": 0.0003955752212389381,
      "loss": 0.1546,
      "step": 1370
    },
    {
      "epoch": 6.106194690265487,
      "grad_norm": 0.321180135011673,
      "learning_rate": 0.0003911504424778761,
      "loss": 0.1828,
      "step": 1380
    },
    {
      "epoch": 6.150442477876107,
      "grad_norm": 0.31438615918159485,
      "learning_rate": 0.0003867256637168142,
      "loss": 0.1793,
      "step": 1390
    },
    {
      "epoch": 6.1946902654867255,
      "grad_norm": 0.24199295043945312,
      "learning_rate": 0.0003823008849557522,
      "loss": 0.1627,
      "step": 1400
    },
    {
      "epoch": 6.238938053097345,
      "grad_norm": 0.3219399154186249,
      "learning_rate": 0.0003778761061946903,
      "loss": 0.1557,
      "step": 1410
    },
    {
      "epoch": 6.283185840707965,
      "grad_norm": 0.20730754733085632,
      "learning_rate": 0.0003734513274336283,
      "loss": 0.1728,
      "step": 1420
    },
    {
      "epoch": 6.327433628318584,
      "grad_norm": 0.30667644739151,
      "learning_rate": 0.00036902654867256637,
      "loss": 0.1601,
      "step": 1430
    },
    {
      "epoch": 6.371681415929204,
      "grad_norm": 0.364202082157135,
      "learning_rate": 0.00036460176991150444,
      "loss": 0.166,
      "step": 1440
    },
    {
      "epoch": 6.415929203539823,
      "grad_norm": 0.2910124659538269,
      "learning_rate": 0.0003601769911504425,
      "loss": 0.18,
      "step": 1450
    },
    {
      "epoch": 6.460176991150442,
      "grad_norm": 0.3251543939113617,
      "learning_rate": 0.00035575221238938053,
      "loss": 0.1666,
      "step": 1460
    },
    {
      "epoch": 6.504424778761062,
      "grad_norm": 0.31853803992271423,
      "learning_rate": 0.0003513274336283186,
      "loss": 0.1683,
      "step": 1470
    },
    {
      "epoch": 6.548672566371682,
      "grad_norm": 0.3730286657810211,
      "learning_rate": 0.0003469026548672566,
      "loss": 0.163,
      "step": 1480
    },
    {
      "epoch": 6.592920353982301,
      "grad_norm": 0.3070693910121918,
      "learning_rate": 0.0003424778761061947,
      "loss": 0.1492,
      "step": 1490
    },
    {
      "epoch": 6.6371681415929205,
      "grad_norm": 0.25525256991386414,
      "learning_rate": 0.0003380530973451327,
      "loss": 0.1587,
      "step": 1500
    },
    {
      "epoch": 6.68141592920354,
      "grad_norm": 0.34361934661865234,
      "learning_rate": 0.0003336283185840708,
      "loss": 0.161,
      "step": 1510
    },
    {
      "epoch": 6.725663716814159,
      "grad_norm": 0.2400776594877243,
      "learning_rate": 0.00032920353982300886,
      "loss": 0.1534,
      "step": 1520
    },
    {
      "epoch": 6.769911504424779,
      "grad_norm": 0.3599693477153778,
      "learning_rate": 0.00032477876106194693,
      "loss": 0.1699,
      "step": 1530
    },
    {
      "epoch": 6.814159292035399,
      "grad_norm": 0.26774442195892334,
      "learning_rate": 0.00032035398230088495,
      "loss": 0.1567,
      "step": 1540
    },
    {
      "epoch": 6.8584070796460175,
      "grad_norm": 0.32396429777145386,
      "learning_rate": 0.000315929203539823,
      "loss": 0.1929,
      "step": 1550
    },
    {
      "epoch": 6.902654867256637,
      "grad_norm": 0.3491114377975464,
      "learning_rate": 0.00031150442477876104,
      "loss": 0.1784,
      "step": 1560
    },
    {
      "epoch": 6.946902654867257,
      "grad_norm": 0.372086763381958,
      "learning_rate": 0.0003070796460176991,
      "loss": 0.193,
      "step": 1570
    },
    {
      "epoch": 6.991150442477876,
      "grad_norm": 0.2936050593852997,
      "learning_rate": 0.00030265486725663713,
      "loss": 0.1899,
      "step": 1580
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.20992980897426605,
      "eval_runtime": 3.174,
      "eval_samples_per_second": 31.506,
      "eval_steps_per_second": 7.877,
      "step": 1582
    },
    {
      "epoch": 7.035398230088496,
      "grad_norm": 0.3688855767250061,
      "learning_rate": 0.0002982300884955752,
      "loss": 0.1813,
      "step": 1590
    },
    {
      "epoch": 7.079646017699115,
      "grad_norm": 0.32831940054893494,
      "learning_rate": 0.00029380530973451333,
      "loss": 0.1472,
      "step": 1600
    },
    {
      "epoch": 7.123893805309734,
      "grad_norm": 0.32714003324508667,
      "learning_rate": 0.00028938053097345135,
      "loss": 0.1704,
      "step": 1610
    },
    {
      "epoch": 7.168141592920354,
      "grad_norm": 0.49076274037361145,
      "learning_rate": 0.0002849557522123894,
      "loss": 0.1559,
      "step": 1620
    },
    {
      "epoch": 7.212389380530974,
      "grad_norm": 0.2076297253370285,
      "learning_rate": 0.00028053097345132744,
      "loss": 0.1571,
      "step": 1630
    },
    {
      "epoch": 7.256637168141593,
      "grad_norm": 0.30924052000045776,
      "learning_rate": 0.0002761061946902655,
      "loss": 0.1497,
      "step": 1640
    },
    {
      "epoch": 7.300884955752212,
      "grad_norm": 0.29587677121162415,
      "learning_rate": 0.00027168141592920353,
      "loss": 0.1506,
      "step": 1650
    },
    {
      "epoch": 7.345132743362832,
      "grad_norm": 0.339077889919281,
      "learning_rate": 0.0002672566371681416,
      "loss": 0.152,
      "step": 1660
    },
    {
      "epoch": 7.389380530973451,
      "grad_norm": 0.2390238344669342,
      "learning_rate": 0.0002628318584070796,
      "loss": 0.1634,
      "step": 1670
    },
    {
      "epoch": 7.433628318584071,
      "grad_norm": 0.3401966392993927,
      "learning_rate": 0.00025840707964601775,
      "loss": 0.1437,
      "step": 1680
    },
    {
      "epoch": 7.477876106194691,
      "grad_norm": 0.3273468017578125,
      "learning_rate": 0.00025398230088495577,
      "loss": 0.1421,
      "step": 1690
    },
    {
      "epoch": 7.522123893805309,
      "grad_norm": 0.2576355040073395,
      "learning_rate": 0.00024955752212389384,
      "loss": 0.1606,
      "step": 1700
    },
    {
      "epoch": 7.566371681415929,
      "grad_norm": 0.3079942464828491,
      "learning_rate": 0.00024513274336283186,
      "loss": 0.1662,
      "step": 1710
    },
    {
      "epoch": 7.610619469026549,
      "grad_norm": 0.35095077753067017,
      "learning_rate": 0.0002407079646017699,
      "loss": 0.1449,
      "step": 1720
    },
    {
      "epoch": 7.654867256637168,
      "grad_norm": 0.2713673412799835,
      "learning_rate": 0.00023628318584070795,
      "loss": 0.1666,
      "step": 1730
    },
    {
      "epoch": 7.699115044247788,
      "grad_norm": 0.3343076705932617,
      "learning_rate": 0.00023185840707964602,
      "loss": 0.1657,
      "step": 1740
    },
    {
      "epoch": 7.743362831858407,
      "grad_norm": 0.27280741930007935,
      "learning_rate": 0.00022743362831858407,
      "loss": 0.1584,
      "step": 1750
    },
    {
      "epoch": 7.787610619469026,
      "grad_norm": 0.3658842146396637,
      "learning_rate": 0.0002230088495575221,
      "loss": 0.178,
      "step": 1760
    },
    {
      "epoch": 7.831858407079646,
      "grad_norm": 0.2327466607093811,
      "learning_rate": 0.00021858407079646016,
      "loss": 0.1394,
      "step": 1770
    },
    {
      "epoch": 7.876106194690266,
      "grad_norm": 0.2981870174407959,
      "learning_rate": 0.00021415929203539826,
      "loss": 0.1555,
      "step": 1780
    },
    {
      "epoch": 7.920353982300885,
      "grad_norm": 0.32251453399658203,
      "learning_rate": 0.0002097345132743363,
      "loss": 0.1817,
      "step": 1790
    },
    {
      "epoch": 7.964601769911504,
      "grad_norm": 0.34020307660102844,
      "learning_rate": 0.00020530973451327435,
      "loss": 0.1667,
      "step": 1800
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.2127797156572342,
      "eval_runtime": 2.6346,
      "eval_samples_per_second": 37.957,
      "eval_steps_per_second": 9.489,
      "step": 1808
    },
    {
      "epoch": 8.008849557522124,
      "grad_norm": 0.2688687741756439,
      "learning_rate": 0.0002008849557522124,
      "loss": 0.1726,
      "step": 1810
    },
    {
      "epoch": 8.053097345132743,
      "grad_norm": 0.26508933305740356,
      "learning_rate": 0.00019646017699115047,
      "loss": 0.1573,
      "step": 1820
    },
    {
      "epoch": 8.097345132743364,
      "grad_norm": 0.38828426599502563,
      "learning_rate": 0.0001920353982300885,
      "loss": 0.1593,
      "step": 1830
    },
    {
      "epoch": 8.141592920353983,
      "grad_norm": 0.28579315543174744,
      "learning_rate": 0.00018761061946902656,
      "loss": 0.139,
      "step": 1840
    },
    {
      "epoch": 8.185840707964601,
      "grad_norm": 0.29282671213150024,
      "learning_rate": 0.0001831858407079646,
      "loss": 0.1576,
      "step": 1850
    },
    {
      "epoch": 8.230088495575222,
      "grad_norm": 0.39632460474967957,
      "learning_rate": 0.00017876106194690268,
      "loss": 0.1599,
      "step": 1860
    },
    {
      "epoch": 8.274336283185841,
      "grad_norm": 0.8853453993797302,
      "learning_rate": 0.00017433628318584072,
      "loss": 0.1415,
      "step": 1870
    },
    {
      "epoch": 8.31858407079646,
      "grad_norm": 0.28350165486335754,
      "learning_rate": 0.00016991150442477877,
      "loss": 0.1601,
      "step": 1880
    },
    {
      "epoch": 8.36283185840708,
      "grad_norm": 0.32908403873443604,
      "learning_rate": 0.00016548672566371681,
      "loss": 0.1502,
      "step": 1890
    },
    {
      "epoch": 8.4070796460177,
      "grad_norm": 0.26707422733306885,
      "learning_rate": 0.0001610619469026549,
      "loss": 0.144,
      "step": 1900
    },
    {
      "epoch": 8.451327433628318,
      "grad_norm": 0.2607186436653137,
      "learning_rate": 0.00015663716814159293,
      "loss": 0.1497,
      "step": 1910
    },
    {
      "epoch": 8.495575221238939,
      "grad_norm": 0.3008362650871277,
      "learning_rate": 0.00015221238938053098,
      "loss": 0.1519,
      "step": 1920
    },
    {
      "epoch": 8.539823008849558,
      "grad_norm": 0.3770766854286194,
      "learning_rate": 0.00014778761061946902,
      "loss": 0.1486,
      "step": 1930
    },
    {
      "epoch": 8.584070796460177,
      "grad_norm": 0.24154478311538696,
      "learning_rate": 0.0001433628318584071,
      "loss": 0.1504,
      "step": 1940
    },
    {
      "epoch": 8.628318584070797,
      "grad_norm": 0.28921449184417725,
      "learning_rate": 0.00013893805309734514,
      "loss": 0.1636,
      "step": 1950
    },
    {
      "epoch": 8.672566371681416,
      "grad_norm": 0.32194775342941284,
      "learning_rate": 0.0001345132743362832,
      "loss": 0.1746,
      "step": 1960
    },
    {
      "epoch": 8.716814159292035,
      "grad_norm": 0.2882642149925232,
      "learning_rate": 0.00013008849557522123,
      "loss": 0.1305,
      "step": 1970
    },
    {
      "epoch": 8.761061946902656,
      "grad_norm": 0.30995509028434753,
      "learning_rate": 0.0001256637168141593,
      "loss": 0.1484,
      "step": 1980
    },
    {
      "epoch": 8.805309734513274,
      "grad_norm": 0.32381975650787354,
      "learning_rate": 0.00012123893805309735,
      "loss": 0.1657,
      "step": 1990
    },
    {
      "epoch": 8.849557522123893,
      "grad_norm": 0.22391530871391296,
      "learning_rate": 0.0001168141592920354,
      "loss": 0.1247,
      "step": 2000
    },
    {
      "epoch": 8.893805309734514,
      "grad_norm": 0.23185725510120392,
      "learning_rate": 0.00011238938053097346,
      "loss": 0.153,
      "step": 2010
    },
    {
      "epoch": 8.938053097345133,
      "grad_norm": 0.27952226996421814,
      "learning_rate": 0.0001079646017699115,
      "loss": 0.1621,
      "step": 2020
    },
    {
      "epoch": 8.982300884955752,
      "grad_norm": 0.2538679540157318,
      "learning_rate": 0.00010353982300884956,
      "loss": 0.1392,
      "step": 2030
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.21310940384864807,
      "eval_runtime": 2.6327,
      "eval_samples_per_second": 37.984,
      "eval_steps_per_second": 9.496,
      "step": 2034
    },
    {
      "epoch": 9.026548672566372,
      "grad_norm": 0.2921323776245117,
      "learning_rate": 9.91150442477876e-05,
      "loss": 0.1549,
      "step": 2040
    },
    {
      "epoch": 9.070796460176991,
      "grad_norm": 0.2572889029979706,
      "learning_rate": 9.469026548672566e-05,
      "loss": 0.1734,
      "step": 2050
    },
    {
      "epoch": 9.11504424778761,
      "grad_norm": 0.2991015613079071,
      "learning_rate": 9.026548672566372e-05,
      "loss": 0.1582,
      "step": 2060
    },
    {
      "epoch": 9.15929203539823,
      "grad_norm": 0.33754679560661316,
      "learning_rate": 8.584070796460178e-05,
      "loss": 0.1343,
      "step": 2070
    },
    {
      "epoch": 9.20353982300885,
      "grad_norm": 0.2426099181175232,
      "learning_rate": 8.141592920353983e-05,
      "loss": 0.1462,
      "step": 2080
    },
    {
      "epoch": 9.247787610619469,
      "grad_norm": 0.3596532344818115,
      "learning_rate": 7.699115044247789e-05,
      "loss": 0.1522,
      "step": 2090
    },
    {
      "epoch": 9.29203539823009,
      "grad_norm": 0.22559010982513428,
      "learning_rate": 7.256637168141593e-05,
      "loss": 0.1292,
      "step": 2100
    },
    {
      "epoch": 9.336283185840708,
      "grad_norm": 0.3877250850200653,
      "learning_rate": 6.814159292035399e-05,
      "loss": 0.1257,
      "step": 2110
    },
    {
      "epoch": 9.380530973451327,
      "grad_norm": 0.3135465383529663,
      "learning_rate": 6.371681415929204e-05,
      "loss": 0.1508,
      "step": 2120
    },
    {
      "epoch": 9.424778761061948,
      "grad_norm": 0.3448950946331024,
      "learning_rate": 5.929203539823009e-05,
      "loss": 0.1386,
      "step": 2130
    },
    {
      "epoch": 9.469026548672566,
      "grad_norm": 0.2957702577114105,
      "learning_rate": 5.486725663716814e-05,
      "loss": 0.1456,
      "step": 2140
    },
    {
      "epoch": 9.513274336283185,
      "grad_norm": 0.2347142994403839,
      "learning_rate": 5.0442477876106195e-05,
      "loss": 0.1476,
      "step": 2150
    },
    {
      "epoch": 9.557522123893806,
      "grad_norm": 0.3887890577316284,
      "learning_rate": 4.601769911504425e-05,
      "loss": 0.158,
      "step": 2160
    },
    {
      "epoch": 9.601769911504425,
      "grad_norm": 0.2899017632007599,
      "learning_rate": 4.15929203539823e-05,
      "loss": 0.1323,
      "step": 2170
    },
    {
      "epoch": 9.646017699115044,
      "grad_norm": 0.37858498096466064,
      "learning_rate": 3.716814159292035e-05,
      "loss": 0.1488,
      "step": 2180
    },
    {
      "epoch": 9.690265486725664,
      "grad_norm": 0.30040085315704346,
      "learning_rate": 3.2743362831858405e-05,
      "loss": 0.1453,
      "step": 2190
    },
    {
      "epoch": 9.734513274336283,
      "grad_norm": 0.34911859035491943,
      "learning_rate": 2.831858407079646e-05,
      "loss": 0.1578,
      "step": 2200
    },
    {
      "epoch": 9.778761061946902,
      "grad_norm": 0.3793705999851227,
      "learning_rate": 2.3893805309734513e-05,
      "loss": 0.1551,
      "step": 2210
    },
    {
      "epoch": 9.823008849557523,
      "grad_norm": 0.3259049654006958,
      "learning_rate": 1.9469026548672565e-05,
      "loss": 0.1782,
      "step": 2220
    },
    {
      "epoch": 9.867256637168142,
      "grad_norm": 0.2592504620552063,
      "learning_rate": 1.5044247787610619e-05,
      "loss": 0.1488,
      "step": 2230
    },
    {
      "epoch": 9.91150442477876,
      "grad_norm": 0.26316604018211365,
      "learning_rate": 1.0619469026548673e-05,
      "loss": 0.1328,
      "step": 2240
    },
    {
      "epoch": 9.955752212389381,
      "grad_norm": 0.34197258949279785,
      "learning_rate": 6.194690265486725e-06,
      "loss": 0.1658,
      "step": 2250
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.281561017036438,
      "learning_rate": 1.7699115044247788e-06,
      "loss": 0.1256,
      "step": 2260
    }
  ],
  "logging_steps": 10,
  "max_steps": 2260,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5529549227950080.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}