{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04424778761061947, "grad_norm": null, "learning_rate": 0.0009973451327433627, "loss": 14.6523, "step": 10 }, { "epoch": 0.08849557522123894, "grad_norm": 0.9945968985557556, "learning_rate": 0.000992920353982301, "loss": 4.8948, "step": 20 }, { "epoch": 0.13274336283185842, "grad_norm": 0.3495340645313263, "learning_rate": 0.000988495575221239, "loss": 0.6469, "step": 30 }, { "epoch": 0.17699115044247787, "grad_norm": 0.22641977667808533, "learning_rate": 0.000984070796460177, "loss": 0.5221, "step": 40 }, { "epoch": 0.22123893805309736, "grad_norm": 0.25233855843544006, "learning_rate": 0.000979646017699115, "loss": 0.4094, "step": 50 }, { "epoch": 0.26548672566371684, "grad_norm": 0.37399861216545105, "learning_rate": 0.0009752212389380531, "loss": 0.3958, "step": 60 }, { "epoch": 0.30973451327433627, "grad_norm": 0.18545609712600708, "learning_rate": 0.0009707964601769911, "loss": 0.3405, "step": 70 }, { "epoch": 0.35398230088495575, "grad_norm": 0.2712928354740143, "learning_rate": 0.0009663716814159293, "loss": 0.3242, "step": 80 }, { "epoch": 0.39823008849557523, "grad_norm": 0.2340475469827652, "learning_rate": 0.0009619469026548673, "loss": 0.3007, "step": 90 }, { "epoch": 0.4424778761061947, "grad_norm": 0.18099136650562286, "learning_rate": 0.0009575221238938053, "loss": 0.2567, "step": 100 }, { "epoch": 0.48672566371681414, "grad_norm": 0.23833367228507996, "learning_rate": 0.0009530973451327434, "loss": 0.2734, "step": 110 }, { "epoch": 0.5309734513274337, "grad_norm": 0.20163732767105103, "learning_rate": 0.0009486725663716814, "loss": 0.2326, "step": 120 }, { "epoch": 0.5752212389380531, "grad_norm": 0.1758851557970047, "learning_rate": 0.0009442477876106195, "loss": 0.2914, "step": 130 }, { 
"epoch": 0.6194690265486725, "grad_norm": 0.211241215467453, "learning_rate": 0.0009398230088495575, "loss": 0.2667, "step": 140 }, { "epoch": 0.6637168141592921, "grad_norm": 0.22571340203285217, "learning_rate": 0.0009353982300884956, "loss": 0.2268, "step": 150 }, { "epoch": 0.7079646017699115, "grad_norm": 0.20469224452972412, "learning_rate": 0.0009309734513274336, "loss": 0.2386, "step": 160 }, { "epoch": 0.7522123893805309, "grad_norm": 0.21183688938617706, "learning_rate": 0.0009265486725663716, "loss": 0.282, "step": 170 }, { "epoch": 0.7964601769911505, "grad_norm": 0.17585916817188263, "learning_rate": 0.0009221238938053097, "loss": 0.3046, "step": 180 }, { "epoch": 0.8407079646017699, "grad_norm": 0.17937427759170532, "learning_rate": 0.0009176991150442479, "loss": 0.2693, "step": 190 }, { "epoch": 0.8849557522123894, "grad_norm": 0.19432350993156433, "learning_rate": 0.0009132743362831859, "loss": 0.252, "step": 200 }, { "epoch": 0.9292035398230089, "grad_norm": 0.18185169994831085, "learning_rate": 0.0009088495575221239, "loss": 0.2793, "step": 210 }, { "epoch": 0.9734513274336283, "grad_norm": 0.18515343964099884, "learning_rate": 0.000904424778761062, "loss": 0.2644, "step": 220 }, { "epoch": 1.0, "eval_loss": 0.22543948888778687, "eval_runtime": 3.0243, "eval_samples_per_second": 33.066, "eval_steps_per_second": 8.266, "step": 226 }, { "epoch": 1.0176991150442478, "grad_norm": 0.2031005322933197, "learning_rate": 0.0009000000000000001, "loss": 0.2704, "step": 230 }, { "epoch": 1.0619469026548674, "grad_norm": 0.26087555289268494, "learning_rate": 0.0008955752212389381, "loss": 0.2526, "step": 240 }, { "epoch": 1.1061946902654867, "grad_norm": 0.1796620637178421, "learning_rate": 0.0008911504424778761, "loss": 0.2605, "step": 250 }, { "epoch": 1.1504424778761062, "grad_norm": 0.22667303681373596, "learning_rate": 0.0008867256637168141, "loss": 0.261, "step": 260 }, { "epoch": 1.1946902654867257, "grad_norm": 0.22089733183383942, "learning_rate": 
0.0008823008849557523, "loss": 0.2762, "step": 270 }, { "epoch": 1.238938053097345, "grad_norm": 0.19162122905254364, "learning_rate": 0.0008778761061946903, "loss": 0.2325, "step": 280 }, { "epoch": 1.2831858407079646, "grad_norm": 0.1732087880373001, "learning_rate": 0.0008734513274336283, "loss": 0.2455, "step": 290 }, { "epoch": 1.3274336283185841, "grad_norm": 0.15953731536865234, "learning_rate": 0.0008690265486725663, "loss": 0.2155, "step": 300 }, { "epoch": 1.3716814159292037, "grad_norm": 0.229411318898201, "learning_rate": 0.0008646017699115044, "loss": 0.2289, "step": 310 }, { "epoch": 1.415929203539823, "grad_norm": 0.20390523970127106, "learning_rate": 0.0008601769911504425, "loss": 0.2429, "step": 320 }, { "epoch": 1.4601769911504425, "grad_norm": 0.23142680525779724, "learning_rate": 0.0008557522123893805, "loss": 0.2291, "step": 330 }, { "epoch": 1.504424778761062, "grad_norm": 0.22689059376716614, "learning_rate": 0.0008513274336283185, "loss": 0.2369, "step": 340 }, { "epoch": 1.5486725663716814, "grad_norm": 0.18759772181510925, "learning_rate": 0.0008469026548672567, "loss": 0.1887, "step": 350 }, { "epoch": 1.592920353982301, "grad_norm": 0.17289893329143524, "learning_rate": 0.0008424778761061948, "loss": 0.2547, "step": 360 }, { "epoch": 1.6371681415929205, "grad_norm": 0.20804202556610107, "learning_rate": 0.0008380530973451328, "loss": 0.2446, "step": 370 }, { "epoch": 1.6814159292035398, "grad_norm": 0.2161918580532074, "learning_rate": 0.0008336283185840708, "loss": 0.2262, "step": 380 }, { "epoch": 1.7256637168141593, "grad_norm": 0.27487823367118835, "learning_rate": 0.0008292035398230089, "loss": 0.2673, "step": 390 }, { "epoch": 1.7699115044247788, "grad_norm": 0.20181554555892944, "learning_rate": 0.0008247787610619469, "loss": 0.252, "step": 400 }, { "epoch": 1.8141592920353982, "grad_norm": 0.21222522854804993, "learning_rate": 0.000820353982300885, "loss": 0.23, "step": 410 }, { "epoch": 1.8584070796460177, "grad_norm": 
0.21409285068511963, "learning_rate": 0.000815929203539823, "loss": 0.235, "step": 420 }, { "epoch": 1.9026548672566372, "grad_norm": 0.2830056846141815, "learning_rate": 0.0008115044247787611, "loss": 0.2335, "step": 430 }, { "epoch": 1.9469026548672566, "grad_norm": 0.22915257513523102, "learning_rate": 0.0008070796460176991, "loss": 0.2303, "step": 440 }, { "epoch": 1.991150442477876, "grad_norm": 0.19883762300014496, "learning_rate": 0.0008026548672566371, "loss": 0.2222, "step": 450 }, { "epoch": 2.0, "eval_loss": 0.21643634140491486, "eval_runtime": 2.7454, "eval_samples_per_second": 36.424, "eval_steps_per_second": 9.106, "step": 452 }, { "epoch": 2.0353982300884956, "grad_norm": 0.2121458351612091, "learning_rate": 0.0007982300884955752, "loss": 0.2403, "step": 460 }, { "epoch": 2.079646017699115, "grad_norm": 0.17018261551856995, "learning_rate": 0.0007938053097345133, "loss": 0.213, "step": 470 }, { "epoch": 2.1238938053097347, "grad_norm": 0.22500459849834442, "learning_rate": 0.0007893805309734513, "loss": 0.2239, "step": 480 }, { "epoch": 2.168141592920354, "grad_norm": 0.19334179162979126, "learning_rate": 0.0007849557522123893, "loss": 0.2106, "step": 490 }, { "epoch": 2.2123893805309733, "grad_norm": 0.1906515508890152, "learning_rate": 0.0007805309734513274, "loss": 0.2037, "step": 500 }, { "epoch": 2.256637168141593, "grad_norm": 0.2478450983762741, "learning_rate": 0.0007761061946902656, "loss": 0.2164, "step": 510 }, { "epoch": 2.3008849557522124, "grad_norm": 0.2270224243402481, "learning_rate": 0.0007716814159292036, "loss": 0.2253, "step": 520 }, { "epoch": 2.3451327433628317, "grad_norm": 0.2539624273777008, "learning_rate": 0.0007672566371681416, "loss": 0.2016, "step": 530 }, { "epoch": 2.3893805309734515, "grad_norm": 0.33118170499801636, "learning_rate": 0.0007628318584070797, "loss": 0.2239, "step": 540 }, { "epoch": 2.433628318584071, "grad_norm": 0.24022382497787476, "learning_rate": 0.0007584070796460178, "loss": 0.2339, "step": 550 
}, { "epoch": 2.47787610619469, "grad_norm": 0.22129379212856293, "learning_rate": 0.0007539823008849558, "loss": 0.2079, "step": 560 }, { "epoch": 2.52212389380531, "grad_norm": 0.20302246510982513, "learning_rate": 0.0007495575221238938, "loss": 0.2012, "step": 570 }, { "epoch": 2.566371681415929, "grad_norm": 0.28677117824554443, "learning_rate": 0.0007451327433628319, "loss": 0.2281, "step": 580 }, { "epoch": 2.6106194690265485, "grad_norm": 0.2567579746246338, "learning_rate": 0.0007407079646017699, "loss": 0.2374, "step": 590 }, { "epoch": 2.6548672566371683, "grad_norm": 0.2306365817785263, "learning_rate": 0.000736283185840708, "loss": 0.2144, "step": 600 }, { "epoch": 2.6991150442477876, "grad_norm": 0.23293821513652802, "learning_rate": 0.000731858407079646, "loss": 0.2381, "step": 610 }, { "epoch": 2.7433628318584073, "grad_norm": 0.2173946499824524, "learning_rate": 0.0007274336283185841, "loss": 0.2155, "step": 620 }, { "epoch": 2.7876106194690267, "grad_norm": 0.30976563692092896, "learning_rate": 0.0007230088495575221, "loss": 0.2262, "step": 630 }, { "epoch": 2.831858407079646, "grad_norm": 0.19489358365535736, "learning_rate": 0.0007185840707964601, "loss": 0.2194, "step": 640 }, { "epoch": 2.8761061946902657, "grad_norm": 0.21821223199367523, "learning_rate": 0.0007141592920353982, "loss": 0.1967, "step": 650 }, { "epoch": 2.920353982300885, "grad_norm": 0.23535631597042084, "learning_rate": 0.0007097345132743363, "loss": 0.2353, "step": 660 }, { "epoch": 2.9646017699115044, "grad_norm": 0.20547734200954437, "learning_rate": 0.0007053097345132744, "loss": 0.2119, "step": 670 }, { "epoch": 3.0, "eval_loss": 0.21383462846279144, "eval_runtime": 2.6363, "eval_samples_per_second": 37.932, "eval_steps_per_second": 9.483, "step": 678 }, { "epoch": 3.0088495575221237, "grad_norm": 0.21669970452785492, "learning_rate": 0.0007008849557522124, "loss": 0.2198, "step": 680 }, { "epoch": 3.0530973451327434, "grad_norm": 0.20589256286621094, "learning_rate": 
0.0006964601769911505, "loss": 0.2002, "step": 690 }, { "epoch": 3.0973451327433628, "grad_norm": 0.23902471363544464, "learning_rate": 0.0006920353982300886, "loss": 0.1804, "step": 700 }, { "epoch": 3.1415929203539825, "grad_norm": 0.2881176173686981, "learning_rate": 0.0006876106194690266, "loss": 0.2162, "step": 710 }, { "epoch": 3.185840707964602, "grad_norm": 0.22364391386508942, "learning_rate": 0.0006831858407079646, "loss": 0.2185, "step": 720 }, { "epoch": 3.230088495575221, "grad_norm": 0.23607216775417328, "learning_rate": 0.0006787610619469026, "loss": 0.2124, "step": 730 }, { "epoch": 3.274336283185841, "grad_norm": 0.18838390707969666, "learning_rate": 0.0006743362831858408, "loss": 0.179, "step": 740 }, { "epoch": 3.3185840707964602, "grad_norm": 0.3451661765575409, "learning_rate": 0.0006699115044247788, "loss": 0.2135, "step": 750 }, { "epoch": 3.3628318584070795, "grad_norm": 0.2281007319688797, "learning_rate": 0.0006654867256637168, "loss": 0.2071, "step": 760 }, { "epoch": 3.4070796460176993, "grad_norm": 0.20740865170955658, "learning_rate": 0.0006610619469026548, "loss": 0.2081, "step": 770 }, { "epoch": 3.4513274336283186, "grad_norm": 0.27458012104034424, "learning_rate": 0.0006566371681415929, "loss": 0.2026, "step": 780 }, { "epoch": 3.495575221238938, "grad_norm": 0.19083356857299805, "learning_rate": 0.000652212389380531, "loss": 0.1946, "step": 790 }, { "epoch": 3.5398230088495577, "grad_norm": 0.2667248845100403, "learning_rate": 0.000647787610619469, "loss": 0.2141, "step": 800 }, { "epoch": 3.584070796460177, "grad_norm": 0.22773493826389313, "learning_rate": 0.000643362831858407, "loss": 0.2294, "step": 810 }, { "epoch": 3.6283185840707963, "grad_norm": 0.24344410002231598, "learning_rate": 0.0006389380530973451, "loss": 0.1799, "step": 820 }, { "epoch": 3.672566371681416, "grad_norm": 0.3232133984565735, "learning_rate": 0.0006345132743362833, "loss": 0.1807, "step": 830 }, { "epoch": 3.7168141592920354, "grad_norm": 
0.22465798258781433, "learning_rate": 0.0006300884955752213, "loss": 0.2005, "step": 840 }, { "epoch": 3.7610619469026547, "grad_norm": 0.24152274429798126, "learning_rate": 0.0006256637168141594, "loss": 0.2001, "step": 850 }, { "epoch": 3.8053097345132745, "grad_norm": 0.2764975130558014, "learning_rate": 0.0006212389380530974, "loss": 0.1691, "step": 860 }, { "epoch": 3.849557522123894, "grad_norm": 0.23789626359939575, "learning_rate": 0.0006168141592920354, "loss": 0.2318, "step": 870 }, { "epoch": 3.893805309734513, "grad_norm": 0.21235798299312592, "learning_rate": 0.0006123893805309735, "loss": 0.1867, "step": 880 }, { "epoch": 3.938053097345133, "grad_norm": 0.23083995282649994, "learning_rate": 0.0006079646017699116, "loss": 0.2135, "step": 890 }, { "epoch": 3.982300884955752, "grad_norm": 0.22863389551639557, "learning_rate": 0.0006035398230088496, "loss": 0.2188, "step": 900 }, { "epoch": 4.0, "eval_loss": 0.20991046726703644, "eval_runtime": 2.9553, "eval_samples_per_second": 33.837, "eval_steps_per_second": 8.459, "step": 904 }, { "epoch": 4.0265486725663715, "grad_norm": 0.22170217335224152, "learning_rate": 0.0005991150442477876, "loss": 0.2186, "step": 910 }, { "epoch": 4.070796460176991, "grad_norm": 0.2190970778465271, "learning_rate": 0.0005946902654867256, "loss": 0.1978, "step": 920 }, { "epoch": 4.115044247787611, "grad_norm": 0.1924510896205902, "learning_rate": 0.0005902654867256638, "loss": 0.1787, "step": 930 }, { "epoch": 4.15929203539823, "grad_norm": 0.2868868112564087, "learning_rate": 0.0005858407079646018, "loss": 0.172, "step": 940 }, { "epoch": 4.20353982300885, "grad_norm": 0.18888860940933228, "learning_rate": 0.0005814159292035398, "loss": 0.1761, "step": 950 }, { "epoch": 4.247787610619469, "grad_norm": 0.21858586370944977, "learning_rate": 0.0005769911504424778, "loss": 0.1871, "step": 960 }, { "epoch": 4.292035398230088, "grad_norm": 0.305698961019516, "learning_rate": 0.0005725663716814159, "loss": 0.1886, "step": 970 }, { 
"epoch": 4.336283185840708, "grad_norm": 0.23597249388694763, "learning_rate": 0.000568141592920354, "loss": 0.1865, "step": 980 }, { "epoch": 4.380530973451328, "grad_norm": 0.271823912858963, "learning_rate": 0.0005637168141592921, "loss": 0.1709, "step": 990 }, { "epoch": 4.424778761061947, "grad_norm": 0.19630669057369232, "learning_rate": 0.0005592920353982301, "loss": 0.2429, "step": 1000 }, { "epoch": 4.469026548672566, "grad_norm": 0.29825878143310547, "learning_rate": 0.0005548672566371682, "loss": 0.1879, "step": 1010 }, { "epoch": 4.513274336283186, "grad_norm": 0.21552462875843048, "learning_rate": 0.0005504424778761063, "loss": 0.1905, "step": 1020 }, { "epoch": 4.557522123893805, "grad_norm": 0.28668805956840515, "learning_rate": 0.0005460176991150443, "loss": 0.1951, "step": 1030 }, { "epoch": 4.601769911504425, "grad_norm": 0.27180853486061096, "learning_rate": 0.0005415929203539823, "loss": 0.1758, "step": 1040 }, { "epoch": 4.646017699115045, "grad_norm": 0.3072490394115448, "learning_rate": 0.0005371681415929204, "loss": 0.1852, "step": 1050 }, { "epoch": 4.6902654867256635, "grad_norm": 0.2913398742675781, "learning_rate": 0.0005327433628318584, "loss": 0.201, "step": 1060 }, { "epoch": 4.734513274336283, "grad_norm": 0.29055866599082947, "learning_rate": 0.0005283185840707965, "loss": 0.1932, "step": 1070 }, { "epoch": 4.778761061946903, "grad_norm": 0.2742849290370941, "learning_rate": 0.0005238938053097345, "loss": 0.183, "step": 1080 }, { "epoch": 4.823008849557522, "grad_norm": 0.2370535433292389, "learning_rate": 0.0005194690265486726, "loss": 0.1849, "step": 1090 }, { "epoch": 4.867256637168142, "grad_norm": 0.31343671679496765, "learning_rate": 0.0005150442477876106, "loss": 0.2195, "step": 1100 }, { "epoch": 4.911504424778761, "grad_norm": 0.3136596381664276, "learning_rate": 0.0005106194690265486, "loss": 0.1907, "step": 1110 }, { "epoch": 4.95575221238938, "grad_norm": 0.2071835845708847, "learning_rate": 0.0005061946902654867, 
"loss": 0.1969, "step": 1120 }, { "epoch": 5.0, "grad_norm": 0.25057336688041687, "learning_rate": 0.0005017699115044248, "loss": 0.1916, "step": 1130 }, { "epoch": 5.0, "eval_loss": 0.21029528975486755, "eval_runtime": 2.628, "eval_samples_per_second": 38.052, "eval_steps_per_second": 9.513, "step": 1130 }, { "epoch": 5.04424778761062, "grad_norm": 0.21927224099636078, "learning_rate": 0.0004973451327433628, "loss": 0.155, "step": 1140 }, { "epoch": 5.088495575221239, "grad_norm": 0.3175056576728821, "learning_rate": 0.0004929203539823009, "loss": 0.189, "step": 1150 }, { "epoch": 5.132743362831858, "grad_norm": 0.2786344587802887, "learning_rate": 0.0004884955752212389, "loss": 0.1679, "step": 1160 }, { "epoch": 5.176991150442478, "grad_norm": 0.2475520521402359, "learning_rate": 0.00048407079646017696, "loss": 0.1855, "step": 1170 }, { "epoch": 5.221238938053097, "grad_norm": 0.24603202939033508, "learning_rate": 0.00047964601769911504, "loss": 0.1755, "step": 1180 }, { "epoch": 5.265486725663717, "grad_norm": 0.26339662075042725, "learning_rate": 0.00047522123893805305, "loss": 0.1644, "step": 1190 }, { "epoch": 5.3097345132743365, "grad_norm": 0.20065292716026306, "learning_rate": 0.0004707964601769912, "loss": 0.1555, "step": 1200 }, { "epoch": 5.353982300884955, "grad_norm": 0.34847521781921387, "learning_rate": 0.00046637168141592925, "loss": 0.1644, "step": 1210 }, { "epoch": 5.398230088495575, "grad_norm": 0.41893231868743896, "learning_rate": 0.00046194690265486727, "loss": 0.1661, "step": 1220 }, { "epoch": 5.442477876106195, "grad_norm": 0.2889445424079895, "learning_rate": 0.00045752212389380535, "loss": 0.1924, "step": 1230 }, { "epoch": 5.486725663716814, "grad_norm": 0.24809350073337555, "learning_rate": 0.00045309734513274336, "loss": 0.1941, "step": 1240 }, { "epoch": 5.530973451327434, "grad_norm": 0.27125945687294006, "learning_rate": 0.00044867256637168144, "loss": 0.1731, "step": 1250 }, { "epoch": 5.575221238938053, "grad_norm": 
0.3384355902671814, "learning_rate": 0.00044424778761061946, "loss": 0.164, "step": 1260 }, { "epoch": 5.619469026548672, "grad_norm": 0.3089454174041748, "learning_rate": 0.00043982300884955753, "loss": 0.1823, "step": 1270 }, { "epoch": 5.663716814159292, "grad_norm": 0.26540765166282654, "learning_rate": 0.0004353982300884956, "loss": 0.1762, "step": 1280 }, { "epoch": 5.707964601769912, "grad_norm": 0.22383682429790497, "learning_rate": 0.0004309734513274337, "loss": 0.2063, "step": 1290 }, { "epoch": 5.752212389380531, "grad_norm": 0.24541282653808594, "learning_rate": 0.0004265486725663717, "loss": 0.1799, "step": 1300 }, { "epoch": 5.79646017699115, "grad_norm": 0.33302921056747437, "learning_rate": 0.00042212389380530976, "loss": 0.1749, "step": 1310 }, { "epoch": 5.84070796460177, "grad_norm": 0.274087131023407, "learning_rate": 0.0004176991150442478, "loss": 0.1982, "step": 1320 }, { "epoch": 5.88495575221239, "grad_norm": 0.3344975411891937, "learning_rate": 0.00041327433628318586, "loss": 0.1962, "step": 1330 }, { "epoch": 5.929203539823009, "grad_norm": 0.28589603304862976, "learning_rate": 0.0004088495575221239, "loss": 0.2078, "step": 1340 }, { "epoch": 5.9734513274336285, "grad_norm": 0.18417391180992126, "learning_rate": 0.00040442477876106195, "loss": 0.1806, "step": 1350 }, { "epoch": 6.0, "eval_loss": 0.20804466307163239, "eval_runtime": 2.6659, "eval_samples_per_second": 37.511, "eval_steps_per_second": 9.378, "step": 1356 }, { "epoch": 6.017699115044247, "grad_norm": 0.24382148683071136, "learning_rate": 0.0004, "loss": 0.1675, "step": 1360 }, { "epoch": 6.061946902654867, "grad_norm": 0.2718934714794159, "learning_rate": 0.0003955752212389381, "loss": 0.1546, "step": 1370 }, { "epoch": 6.106194690265487, "grad_norm": 0.321180135011673, "learning_rate": 0.0003911504424778761, "loss": 0.1828, "step": 1380 }, { "epoch": 6.150442477876107, "grad_norm": 0.31438615918159485, "learning_rate": 0.0003867256637168142, "loss": 0.1793, "step": 1390 }, { 
"epoch": 6.1946902654867255, "grad_norm": 0.24199295043945312, "learning_rate": 0.0003823008849557522, "loss": 0.1627, "step": 1400 }, { "epoch": 6.238938053097345, "grad_norm": 0.3219399154186249, "learning_rate": 0.0003778761061946903, "loss": 0.1557, "step": 1410 }, { "epoch": 6.283185840707965, "grad_norm": 0.20730754733085632, "learning_rate": 0.0003734513274336283, "loss": 0.1728, "step": 1420 }, { "epoch": 6.327433628318584, "grad_norm": 0.30667644739151, "learning_rate": 0.00036902654867256637, "loss": 0.1601, "step": 1430 }, { "epoch": 6.371681415929204, "grad_norm": 0.364202082157135, "learning_rate": 0.00036460176991150444, "loss": 0.166, "step": 1440 }, { "epoch": 6.415929203539823, "grad_norm": 0.2910124659538269, "learning_rate": 0.0003601769911504425, "loss": 0.18, "step": 1450 }, { "epoch": 6.460176991150442, "grad_norm": 0.3251543939113617, "learning_rate": 0.00035575221238938053, "loss": 0.1666, "step": 1460 }, { "epoch": 6.504424778761062, "grad_norm": 0.31853803992271423, "learning_rate": 0.0003513274336283186, "loss": 0.1683, "step": 1470 }, { "epoch": 6.548672566371682, "grad_norm": 0.3730286657810211, "learning_rate": 0.0003469026548672566, "loss": 0.163, "step": 1480 }, { "epoch": 6.592920353982301, "grad_norm": 0.3070693910121918, "learning_rate": 0.0003424778761061947, "loss": 0.1492, "step": 1490 }, { "epoch": 6.6371681415929205, "grad_norm": 0.25525256991386414, "learning_rate": 0.0003380530973451327, "loss": 0.1587, "step": 1500 }, { "epoch": 6.68141592920354, "grad_norm": 0.34361934661865234, "learning_rate": 0.0003336283185840708, "loss": 0.161, "step": 1510 }, { "epoch": 6.725663716814159, "grad_norm": 0.2400776594877243, "learning_rate": 0.00032920353982300886, "loss": 0.1534, "step": 1520 }, { "epoch": 6.769911504424779, "grad_norm": 0.3599693477153778, "learning_rate": 0.00032477876106194693, "loss": 0.1699, "step": 1530 }, { "epoch": 6.814159292035399, "grad_norm": 0.26774442195892334, "learning_rate": 0.00032035398230088495, 
"loss": 0.1567, "step": 1540 }, { "epoch": 6.8584070796460175, "grad_norm": 0.32396429777145386, "learning_rate": 0.000315929203539823, "loss": 0.1929, "step": 1550 }, { "epoch": 6.902654867256637, "grad_norm": 0.3491114377975464, "learning_rate": 0.00031150442477876104, "loss": 0.1784, "step": 1560 }, { "epoch": 6.946902654867257, "grad_norm": 0.372086763381958, "learning_rate": 0.0003070796460176991, "loss": 0.193, "step": 1570 }, { "epoch": 6.991150442477876, "grad_norm": 0.2936050593852997, "learning_rate": 0.00030265486725663713, "loss": 0.1899, "step": 1580 }, { "epoch": 7.0, "eval_loss": 0.20992980897426605, "eval_runtime": 3.174, "eval_samples_per_second": 31.506, "eval_steps_per_second": 7.877, "step": 1582 }, { "epoch": 7.035398230088496, "grad_norm": 0.3688855767250061, "learning_rate": 0.0002982300884955752, "loss": 0.1813, "step": 1590 }, { "epoch": 7.079646017699115, "grad_norm": 0.32831940054893494, "learning_rate": 0.00029380530973451333, "loss": 0.1472, "step": 1600 }, { "epoch": 7.123893805309734, "grad_norm": 0.32714003324508667, "learning_rate": 0.00028938053097345135, "loss": 0.1704, "step": 1610 }, { "epoch": 7.168141592920354, "grad_norm": 0.49076274037361145, "learning_rate": 0.0002849557522123894, "loss": 0.1559, "step": 1620 }, { "epoch": 7.212389380530974, "grad_norm": 0.2076297253370285, "learning_rate": 0.00028053097345132744, "loss": 0.1571, "step": 1630 }, { "epoch": 7.256637168141593, "grad_norm": 0.30924052000045776, "learning_rate": 0.0002761061946902655, "loss": 0.1497, "step": 1640 }, { "epoch": 7.300884955752212, "grad_norm": 0.29587677121162415, "learning_rate": 0.00027168141592920353, "loss": 0.1506, "step": 1650 }, { "epoch": 7.345132743362832, "grad_norm": 0.339077889919281, "learning_rate": 0.0002672566371681416, "loss": 0.152, "step": 1660 }, { "epoch": 7.389380530973451, "grad_norm": 0.2390238344669342, "learning_rate": 0.0002628318584070796, "loss": 0.1634, "step": 1670 }, { "epoch": 7.433628318584071, "grad_norm": 
0.3401966392993927, "learning_rate": 0.00025840707964601775, "loss": 0.1437, "step": 1680 }, { "epoch": 7.477876106194691, "grad_norm": 0.3273468017578125, "learning_rate": 0.00025398230088495577, "loss": 0.1421, "step": 1690 }, { "epoch": 7.522123893805309, "grad_norm": 0.2576355040073395, "learning_rate": 0.00024955752212389384, "loss": 0.1606, "step": 1700 }, { "epoch": 7.566371681415929, "grad_norm": 0.3079942464828491, "learning_rate": 0.00024513274336283186, "loss": 0.1662, "step": 1710 }, { "epoch": 7.610619469026549, "grad_norm": 0.35095077753067017, "learning_rate": 0.0002407079646017699, "loss": 0.1449, "step": 1720 }, { "epoch": 7.654867256637168, "grad_norm": 0.2713673412799835, "learning_rate": 0.00023628318584070795, "loss": 0.1666, "step": 1730 }, { "epoch": 7.699115044247788, "grad_norm": 0.3343076705932617, "learning_rate": 0.00023185840707964602, "loss": 0.1657, "step": 1740 }, { "epoch": 7.743362831858407, "grad_norm": 0.27280741930007935, "learning_rate": 0.00022743362831858407, "loss": 0.1584, "step": 1750 }, { "epoch": 7.787610619469026, "grad_norm": 0.3658842146396637, "learning_rate": 0.0002230088495575221, "loss": 0.178, "step": 1760 }, { "epoch": 7.831858407079646, "grad_norm": 0.2327466607093811, "learning_rate": 0.00021858407079646016, "loss": 0.1394, "step": 1770 }, { "epoch": 7.876106194690266, "grad_norm": 0.2981870174407959, "learning_rate": 0.00021415929203539826, "loss": 0.1555, "step": 1780 }, { "epoch": 7.920353982300885, "grad_norm": 0.32251453399658203, "learning_rate": 0.0002097345132743363, "loss": 0.1817, "step": 1790 }, { "epoch": 7.964601769911504, "grad_norm": 0.34020307660102844, "learning_rate": 0.00020530973451327435, "loss": 0.1667, "step": 1800 }, { "epoch": 8.0, "eval_loss": 0.2127797156572342, "eval_runtime": 2.6346, "eval_samples_per_second": 37.957, "eval_steps_per_second": 9.489, "step": 1808 }, { "epoch": 8.008849557522124, "grad_norm": 0.2688687741756439, "learning_rate": 0.0002008849557522124, "loss": 0.1726, 
"step": 1810 }, { "epoch": 8.053097345132743, "grad_norm": 0.26508933305740356, "learning_rate": 0.00019646017699115047, "loss": 0.1573, "step": 1820 }, { "epoch": 8.097345132743364, "grad_norm": 0.38828426599502563, "learning_rate": 0.0001920353982300885, "loss": 0.1593, "step": 1830 }, { "epoch": 8.141592920353983, "grad_norm": 0.28579315543174744, "learning_rate": 0.00018761061946902656, "loss": 0.139, "step": 1840 }, { "epoch": 8.185840707964601, "grad_norm": 0.29282671213150024, "learning_rate": 0.0001831858407079646, "loss": 0.1576, "step": 1850 }, { "epoch": 8.230088495575222, "grad_norm": 0.39632460474967957, "learning_rate": 0.00017876106194690268, "loss": 0.1599, "step": 1860 }, { "epoch": 8.274336283185841, "grad_norm": 0.8853453993797302, "learning_rate": 0.00017433628318584072, "loss": 0.1415, "step": 1870 }, { "epoch": 8.31858407079646, "grad_norm": 0.28350165486335754, "learning_rate": 0.00016991150442477877, "loss": 0.1601, "step": 1880 }, { "epoch": 8.36283185840708, "grad_norm": 0.32908403873443604, "learning_rate": 0.00016548672566371681, "loss": 0.1502, "step": 1890 }, { "epoch": 8.4070796460177, "grad_norm": 0.26707422733306885, "learning_rate": 0.0001610619469026549, "loss": 0.144, "step": 1900 }, { "epoch": 8.451327433628318, "grad_norm": 0.2607186436653137, "learning_rate": 0.00015663716814159293, "loss": 0.1497, "step": 1910 }, { "epoch": 8.495575221238939, "grad_norm": 0.3008362650871277, "learning_rate": 0.00015221238938053098, "loss": 0.1519, "step": 1920 }, { "epoch": 8.539823008849558, "grad_norm": 0.3770766854286194, "learning_rate": 0.00014778761061946902, "loss": 0.1486, "step": 1930 }, { "epoch": 8.584070796460177, "grad_norm": 0.24154478311538696, "learning_rate": 0.0001433628318584071, "loss": 0.1504, "step": 1940 }, { "epoch": 8.628318584070797, "grad_norm": 0.28921449184417725, "learning_rate": 0.00013893805309734514, "loss": 0.1636, "step": 1950 }, { "epoch": 8.672566371681416, "grad_norm": 0.32194775342941284, 
"learning_rate": 0.0001345132743362832, "loss": 0.1746, "step": 1960 }, { "epoch": 8.716814159292035, "grad_norm": 0.2882642149925232, "learning_rate": 0.00013008849557522123, "loss": 0.1305, "step": 1970 }, { "epoch": 8.761061946902656, "grad_norm": 0.30995509028434753, "learning_rate": 0.0001256637168141593, "loss": 0.1484, "step": 1980 }, { "epoch": 8.805309734513274, "grad_norm": 0.32381975650787354, "learning_rate": 0.00012123893805309735, "loss": 0.1657, "step": 1990 }, { "epoch": 8.849557522123893, "grad_norm": 0.22391530871391296, "learning_rate": 0.0001168141592920354, "loss": 0.1247, "step": 2000 }, { "epoch": 8.893805309734514, "grad_norm": 0.23185725510120392, "learning_rate": 0.00011238938053097346, "loss": 0.153, "step": 2010 }, { "epoch": 8.938053097345133, "grad_norm": 0.27952226996421814, "learning_rate": 0.0001079646017699115, "loss": 0.1621, "step": 2020 }, { "epoch": 8.982300884955752, "grad_norm": 0.2538679540157318, "learning_rate": 0.00010353982300884956, "loss": 0.1392, "step": 2030 }, { "epoch": 9.0, "eval_loss": 0.21310940384864807, "eval_runtime": 2.6327, "eval_samples_per_second": 37.984, "eval_steps_per_second": 9.496, "step": 2034 }, { "epoch": 9.026548672566372, "grad_norm": 0.2921323776245117, "learning_rate": 9.91150442477876e-05, "loss": 0.1549, "step": 2040 }, { "epoch": 9.070796460176991, "grad_norm": 0.2572889029979706, "learning_rate": 9.469026548672566e-05, "loss": 0.1734, "step": 2050 }, { "epoch": 9.11504424778761, "grad_norm": 0.2991015613079071, "learning_rate": 9.026548672566372e-05, "loss": 0.1582, "step": 2060 }, { "epoch": 9.15929203539823, "grad_norm": 0.33754679560661316, "learning_rate": 8.584070796460178e-05, "loss": 0.1343, "step": 2070 }, { "epoch": 9.20353982300885, "grad_norm": 0.2426099181175232, "learning_rate": 8.141592920353983e-05, "loss": 0.1462, "step": 2080 }, { "epoch": 9.247787610619469, "grad_norm": 0.3596532344818115, "learning_rate": 7.699115044247789e-05, "loss": 0.1522, "step": 2090 }, { "epoch": 
9.29203539823009, "grad_norm": 0.22559010982513428, "learning_rate": 7.256637168141593e-05, "loss": 0.1292, "step": 2100 }, { "epoch": 9.336283185840708, "grad_norm": 0.3877250850200653, "learning_rate": 6.814159292035399e-05, "loss": 0.1257, "step": 2110 }, { "epoch": 9.380530973451327, "grad_norm": 0.3135465383529663, "learning_rate": 6.371681415929204e-05, "loss": 0.1508, "step": 2120 }, { "epoch": 9.424778761061948, "grad_norm": 0.3448950946331024, "learning_rate": 5.929203539823009e-05, "loss": 0.1386, "step": 2130 }, { "epoch": 9.469026548672566, "grad_norm": 0.2957702577114105, "learning_rate": 5.486725663716814e-05, "loss": 0.1456, "step": 2140 }, { "epoch": 9.513274336283185, "grad_norm": 0.2347142994403839, "learning_rate": 5.0442477876106195e-05, "loss": 0.1476, "step": 2150 }, { "epoch": 9.557522123893806, "grad_norm": 0.3887890577316284, "learning_rate": 4.601769911504425e-05, "loss": 0.158, "step": 2160 }, { "epoch": 9.601769911504425, "grad_norm": 0.2899017632007599, "learning_rate": 4.15929203539823e-05, "loss": 0.1323, "step": 2170 }, { "epoch": 9.646017699115044, "grad_norm": 0.37858498096466064, "learning_rate": 3.716814159292035e-05, "loss": 0.1488, "step": 2180 }, { "epoch": 9.690265486725664, "grad_norm": 0.30040085315704346, "learning_rate": 3.2743362831858405e-05, "loss": 0.1453, "step": 2190 }, { "epoch": 9.734513274336283, "grad_norm": 0.34911859035491943, "learning_rate": 2.831858407079646e-05, "loss": 0.1578, "step": 2200 }, { "epoch": 9.778761061946902, "grad_norm": 0.3793705999851227, "learning_rate": 2.3893805309734513e-05, "loss": 0.1551, "step": 2210 }, { "epoch": 9.823008849557523, "grad_norm": 0.3259049654006958, "learning_rate": 1.9469026548672565e-05, "loss": 0.1782, "step": 2220 }, { "epoch": 9.867256637168142, "grad_norm": 0.2592504620552063, "learning_rate": 1.5044247787610619e-05, "loss": 0.1488, "step": 2230 }, { "epoch": 9.91150442477876, "grad_norm": 0.26316604018211365, "learning_rate": 1.0619469026548673e-05, "loss": 
0.1328, "step": 2240 }, { "epoch": 9.955752212389381, "grad_norm": 0.34197258949279785, "learning_rate": 6.194690265486725e-06, "loss": 0.1658, "step": 2250 }, { "epoch": 10.0, "grad_norm": 0.281561017036438, "learning_rate": 1.7699115044247788e-06, "loss": 0.1256, "step": 2260 } ], "logging_steps": 10, "max_steps": 2260, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5529549227950080.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }