diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 114.50381679389314, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03816793893129771, + "grad_norm": 11.100768089294434, + "learning_rate": 6.000000000000001e-08, + "loss": 1.2659, + "step": 10 + }, + { + "epoch": 0.07633587786259542, + "grad_norm": 4.894315242767334, + "learning_rate": 1.2666666666666666e-07, + "loss": 1.1033, + "step": 20 + }, + { + "epoch": 0.11450381679389313, + "grad_norm": 8.250714302062988, + "learning_rate": 1.9333333333333337e-07, + "loss": 1.0279, + "step": 30 + }, + { + "epoch": 0.15267175572519084, + "grad_norm": 4.837605953216553, + "learning_rate": 2.6e-07, + "loss": 0.9319, + "step": 40 + }, + { + "epoch": 0.19083969465648856, + "grad_norm": 6.9533257484436035, + "learning_rate": 3.266666666666667e-07, + "loss": 1.3479, + "step": 50 + }, + { + "epoch": 0.22900763358778625, + "grad_norm": 7.103541851043701, + "learning_rate": 3.9333333333333336e-07, + "loss": 1.0561, + "step": 60 + }, + { + "epoch": 0.26717557251908397, + "grad_norm": 5.55342435836792, + "learning_rate": 4.6000000000000004e-07, + "loss": 1.0501, + "step": 70 + }, + { + "epoch": 0.3053435114503817, + "grad_norm": 6.5388264656066895, + "learning_rate": 5.266666666666667e-07, + "loss": 1.2378, + "step": 80 + }, + { + "epoch": 0.3435114503816794, + "grad_norm": 7.388347148895264, + "learning_rate": 5.933333333333334e-07, + "loss": 0.8909, + "step": 90 + }, + { + "epoch": 0.3816793893129771, + "grad_norm": 3.4540562629699707, + "learning_rate": 6.6e-07, + "loss": 1.1626, + "step": 100 + }, + { + "epoch": 0.4198473282442748, + "grad_norm": 5.60060977935791, + "learning_rate": 7.266666666666668e-07, + "loss": 1.0488, + "step": 110 + }, + { + "epoch": 0.4580152671755725, + "grad_norm": 4.210075378417969, + "learning_rate": 7.933333333333335e-07, + "loss": 1.0841, + "step": 120 + }, + { + "epoch": 0.4961832061068702, + "grad_norm": 10.231063842773438, + "learning_rate": 8.6e-07, + "loss": 1.0938, + "step": 130 + }, + { + "epoch": 0.5343511450381679, + "grad_norm": 5.16438102722168, + "learning_rate": 9.266666666666667e-07, + "loss": 0.875, + "step": 140 + }, + { + "epoch": 0.5725190839694656, + "grad_norm": 1.9672963619232178, + "learning_rate": 9.933333333333333e-07, + "loss": 0.7903, + "step": 150 + }, + { + "epoch": 0.6106870229007634, + "grad_norm": 2.4632840156555176, + "learning_rate": 1.06e-06, + "loss": 0.7718, + "step": 160 + }, + { + "epoch": 0.648854961832061, + "grad_norm": 2.09865140914917, + "learning_rate": 1.1266666666666667e-06, + "loss": 0.8049, + "step": 170 + }, + { + "epoch": 0.6870229007633588, + "grad_norm": 2.6679656505584717, + "learning_rate": 1.1933333333333335e-06, + "loss": 0.6023, + "step": 180 + }, + { + "epoch": 0.7251908396946565, + "grad_norm": 4.250058650970459, + "learning_rate": 1.26e-06, + "loss": 0.6673, + "step": 190 + }, + { + "epoch": 0.7633587786259542, + "grad_norm": 2.9157729148864746, + "learning_rate": 1.3266666666666667e-06, + "loss": 0.6071, + "step": 200 + }, + { + "epoch": 0.8015267175572519, + "grad_norm": 2.76187801361084, + "learning_rate": 1.3933333333333335e-06, + "loss": 0.6026, + "step": 210 + }, + { + "epoch": 0.8396946564885496, + "grad_norm": 1.7407684326171875, + "learning_rate": 1.46e-06, + "loss": 0.5992, + "step": 220 + }, + { + "epoch": 0.8778625954198473, + "grad_norm": 1.8290092945098877, + "learning_rate": 1.526666666666667e-06, + "loss": 0.4348, + "step": 230 + }, + { + "epoch": 0.916030534351145, + "grad_norm": 1.939551591873169, + "learning_rate": 1.5933333333333335e-06, + "loss": 0.5998, + "step": 240 + }, + { + "epoch": 0.9541984732824428, + "grad_norm": 1.6572309732437134, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.3822, + "step": 250 + }, + { + "epoch": 0.9923664122137404, + "grad_norm": 2.4682672023773193, + "learning_rate": 1.7266666666666667e-06, + "loss": 0.4126, + "step": 260 + }, + { + "epoch": 1.0305343511450382, + "grad_norm": 2.106403112411499, + "learning_rate": 1.7933333333333337e-06, + "loss": 0.3898, + "step": 270 + }, + { + "epoch": 1.0687022900763359, + "grad_norm": 1.331116795539856, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.3547, + "step": 280 + }, + { + "epoch": 1.1068702290076335, + "grad_norm": 1.4754210710525513, + "learning_rate": 1.926666666666667e-06, + "loss": 0.3567, + "step": 290 + }, + { + "epoch": 1.1450381679389312, + "grad_norm": 1.8555392026901245, + "learning_rate": 1.9933333333333334e-06, + "loss": 0.3293, + "step": 300 + }, + { + "epoch": 1.183206106870229, + "grad_norm": 1.6697174310684204, + "learning_rate": 2.06e-06, + "loss": 0.3181, + "step": 310 + }, + { + "epoch": 1.2213740458015268, + "grad_norm": 1.0834065675735474, + "learning_rate": 2.126666666666667e-06, + "loss": 0.2892, + "step": 320 + }, + { + "epoch": 1.2595419847328244, + "grad_norm": 1.046123743057251, + "learning_rate": 2.1933333333333332e-06, + "loss": 0.3114, + "step": 330 + }, + { + "epoch": 1.297709923664122, + "grad_norm": 3.9311656951904297, + "learning_rate": 2.2600000000000004e-06, + "loss": 0.2735, + "step": 340 + }, + { + "epoch": 1.33587786259542, + "grad_norm": 2.4286820888519287, + "learning_rate": 2.3266666666666667e-06, + "loss": 0.2951, + "step": 350 + }, + { + "epoch": 1.3740458015267176, + "grad_norm": 1.0940420627593994, + "learning_rate": 2.3933333333333334e-06, + "loss": 0.2591, + "step": 360 + }, + { + "epoch": 1.4122137404580153, + "grad_norm": 1.823198914527893, + "learning_rate": 2.46e-06, + "loss": 0.2793, + "step": 370 + }, + { + "epoch": 1.450381679389313, + "grad_norm": 2.960927724838257, + "learning_rate": 2.526666666666667e-06, + "loss": 0.2855, + "step": 380 + }, + { + "epoch": 1.4885496183206106, + "grad_norm": 1.0646835565567017, + "learning_rate": 2.5933333333333336e-06, + "loss": 0.2421, + "step": 390 + }, + { + "epoch": 1.5267175572519083, + "grad_norm": 1.0445008277893066, + "learning_rate": 2.6600000000000004e-06, + "loss": 0.2547, + "step": 400 + }, + { + "epoch": 1.5648854961832062, + "grad_norm": 1.456775188446045, + "learning_rate": 2.726666666666667e-06, + "loss": 0.2318, + "step": 410 + }, + { + "epoch": 1.6030534351145038, + "grad_norm": 1.3221451044082642, + "learning_rate": 2.7933333333333334e-06, + "loss": 0.2203, + "step": 420 + }, + { + "epoch": 1.6412213740458015, + "grad_norm": 0.8609516024589539, + "learning_rate": 2.86e-06, + "loss": 0.2129, + "step": 430 + }, + { + "epoch": 1.6793893129770994, + "grad_norm": 1.8033744096755981, + "learning_rate": 2.9266666666666673e-06, + "loss": 0.2442, + "step": 440 + }, + { + "epoch": 1.717557251908397, + "grad_norm": 1.3024567365646362, + "learning_rate": 2.9933333333333336e-06, + "loss": 0.2504, + "step": 450 + }, + { + "epoch": 1.7557251908396947, + "grad_norm": 1.1774224042892456, + "learning_rate": 3.0600000000000003e-06, + "loss": 0.2251, + "step": 460 + }, + { + "epoch": 1.7938931297709924, + "grad_norm": 0.9202947020530701, + "learning_rate": 3.1266666666666667e-06, + "loss": 0.1936, + "step": 470 + }, + { + "epoch": 1.83206106870229, + "grad_norm": 1.2168657779693604, + "learning_rate": 3.193333333333334e-06, + "loss": 0.2638, + "step": 480 + }, + { + "epoch": 1.8702290076335877, + "grad_norm": 1.1469149589538574, + "learning_rate": 3.2600000000000006e-06, + "loss": 0.1834, + "step": 490 + }, + { + "epoch": 1.9083969465648853, + "grad_norm": 1.923591136932373, + "learning_rate": 3.326666666666667e-06, + "loss": 0.1972, + "step": 500 + }, + { + "epoch": 1.9465648854961832, + "grad_norm": 0.7359837889671326, + "learning_rate": 3.3933333333333336e-06, + "loss": 0.2008, + "step": 510 + }, + { + "epoch": 1.984732824427481, + "grad_norm": 1.34367036819458, + "learning_rate": 3.46e-06, + "loss": 0.1836, + "step": 520 + }, + { + "epoch": 2.0229007633587788, + "grad_norm": 0.9186052680015564, + "learning_rate": 3.526666666666667e-06, + "loss": 0.19, + "step": 530 + }, + { + "epoch": 2.0610687022900764, + "grad_norm": 1.0746238231658936, + "learning_rate": 3.593333333333334e-06, + "loss": 0.1821, + "step": 540 + }, + { + "epoch": 2.099236641221374, + "grad_norm": 0.6840347051620483, + "learning_rate": 3.66e-06, + "loss": 0.1709, + "step": 550 + }, + { + "epoch": 2.1374045801526718, + "grad_norm": 1.209904670715332, + "learning_rate": 3.726666666666667e-06, + "loss": 0.1677, + "step": 560 + }, + { + "epoch": 2.1755725190839694, + "grad_norm": 1.1672343015670776, + "learning_rate": 3.793333333333334e-06, + "loss": 0.1811, + "step": 570 + }, + { + "epoch": 2.213740458015267, + "grad_norm": 1.155468225479126, + "learning_rate": 3.86e-06, + "loss": 0.2217, + "step": 580 + }, + { + "epoch": 2.2519083969465647, + "grad_norm": 1.3697214126586914, + "learning_rate": 3.926666666666667e-06, + "loss": 0.2069, + "step": 590 + }, + { + "epoch": 2.2900763358778624, + "grad_norm": 1.2069604396820068, + "learning_rate": 3.993333333333334e-06, + "loss": 0.1824, + "step": 600 + }, + { + "epoch": 2.32824427480916, + "grad_norm": 1.0572060346603394, + "learning_rate": 4.060000000000001e-06, + "loss": 0.2115, + "step": 610 + }, + { + "epoch": 2.366412213740458, + "grad_norm": 3.517552137374878, + "learning_rate": 4.126666666666667e-06, + "loss": 0.1802, + "step": 620 + }, + { + "epoch": 2.404580152671756, + "grad_norm": 0.6924391388893127, + "learning_rate": 4.1933333333333336e-06, + "loss": 0.1771, + "step": 630 + }, + { + "epoch": 2.4427480916030535, + "grad_norm": 1.402863621711731, + "learning_rate": 4.26e-06, + "loss": 0.1774, + "step": 640 + }, + { + "epoch": 2.480916030534351, + "grad_norm": 3.4374442100524902, + "learning_rate": 4.326666666666667e-06, + "loss": 0.1678, + "step": 650 + }, + { + "epoch": 2.519083969465649, + "grad_norm": 1.5495679378509521, + "learning_rate": 4.393333333333334e-06, + "loss": 0.1867, + "step": 660 + }, + { + "epoch": 2.5572519083969465, + "grad_norm": 0.8096297383308411, + "learning_rate": 4.4600000000000005e-06, + "loss": 0.1647, + "step": 670 + }, + { + "epoch": 2.595419847328244, + "grad_norm": 1.6879674196243286, + "learning_rate": 4.526666666666667e-06, + "loss": 0.1838, + "step": 680 + }, + { + "epoch": 2.633587786259542, + "grad_norm": 1.4142144918441772, + "learning_rate": 4.593333333333333e-06, + "loss": 0.15, + "step": 690 + }, + { + "epoch": 2.67175572519084, + "grad_norm": 1.1931599378585815, + "learning_rate": 4.66e-06, + "loss": 0.1637, + "step": 700 + }, + { + "epoch": 2.7099236641221376, + "grad_norm": 0.7196221947669983, + "learning_rate": 4.7266666666666674e-06, + "loss": 0.1746, + "step": 710 + }, + { + "epoch": 2.7480916030534353, + "grad_norm": 1.426375150680542, + "learning_rate": 4.793333333333334e-06, + "loss": 0.1618, + "step": 720 + }, + { + "epoch": 2.786259541984733, + "grad_norm": 1.2002971172332764, + "learning_rate": 4.86e-06, + "loss": 0.1647, + "step": 730 + }, + { + "epoch": 2.8244274809160306, + "grad_norm": 1.275342345237732, + "learning_rate": 4.926666666666667e-06, + "loss": 0.1531, + "step": 740 + }, + { + "epoch": 2.8625954198473282, + "grad_norm": 0.6153695583343506, + "learning_rate": 4.9933333333333335e-06, + "loss": 0.1747, + "step": 750 + }, + { + "epoch": 2.900763358778626, + "grad_norm": 0.6474863290786743, + "learning_rate": 5.060000000000001e-06, + "loss": 0.1724, + "step": 760 + }, + { + "epoch": 2.9389312977099236, + "grad_norm": 0.8682901263237, + "learning_rate": 5.126666666666668e-06, + "loss": 0.1724, + "step": 770 + }, + { + "epoch": 2.9770992366412212, + "grad_norm": 0.9209064841270447, + "learning_rate": 5.193333333333333e-06, + "loss": 0.1791, + "step": 780 + }, + { + "epoch": 3.015267175572519, + "grad_norm": 1.5052716732025146, + "learning_rate": 5.2600000000000005e-06, + "loss": 0.1536, + "step": 790 + }, + { + "epoch": 3.053435114503817, + "grad_norm": 0.6915569305419922, + "learning_rate": 5.326666666666667e-06, + "loss": 0.1644, + "step": 800 + }, + { + "epoch": 3.0916030534351147, + "grad_norm": 1.194627046585083, + "learning_rate": 5.393333333333334e-06, + "loss": 0.1673, + "step": 810 + }, + { + "epoch": 3.1297709923664123, + "grad_norm": 1.9529887437820435, + "learning_rate": 5.460000000000001e-06, + "loss": 0.1541, + "step": 820 + }, + { + "epoch": 3.16793893129771, + "grad_norm": 0.9561631679534912, + "learning_rate": 5.5266666666666666e-06, + "loss": 0.154, + "step": 830 + }, + { + "epoch": 3.2061068702290076, + "grad_norm": 0.7585867643356323, + "learning_rate": 5.593333333333334e-06, + "loss": 0.1443, + "step": 840 + }, + { + "epoch": 3.2442748091603053, + "grad_norm": 0.9479978084564209, + "learning_rate": 5.66e-06, + "loss": 0.1705, + "step": 850 + }, + { + "epoch": 3.282442748091603, + "grad_norm": 1.1037328243255615, + "learning_rate": 5.726666666666667e-06, + "loss": 0.1538, + "step": 860 + }, + { + "epoch": 3.3206106870229006, + "grad_norm": 1.2902960777282715, + "learning_rate": 5.793333333333334e-06, + "loss": 0.1739, + "step": 870 + }, + { + "epoch": 3.3587786259541983, + "grad_norm": 0.8667603135108948, + "learning_rate": 5.86e-06, + "loss": 0.1451, + "step": 880 + }, + { + "epoch": 3.3969465648854964, + "grad_norm": 0.6339547038078308, + "learning_rate": 5.926666666666667e-06, + "loss": 0.1514, + "step": 890 + }, + { + "epoch": 3.435114503816794, + "grad_norm": 0.6177077889442444, + "learning_rate": 5.993333333333334e-06, + "loss": 0.1513, + "step": 900 + }, + { + "epoch": 3.4732824427480917, + "grad_norm": 1.273289680480957, + "learning_rate": 6.0600000000000004e-06, + "loss": 0.1457, + "step": 910 + }, + { + "epoch": 3.5114503816793894, + "grad_norm": 0.7146188020706177, + "learning_rate": 6.126666666666668e-06, + "loss": 0.174, + "step": 920 + }, + { + "epoch": 3.549618320610687, + "grad_norm": 0.6311309337615967, + "learning_rate": 6.193333333333333e-06, + "loss": 0.1493, + "step": 930 + }, + { + "epoch": 3.5877862595419847, + "grad_norm": 0.916165292263031, + "learning_rate": 6.26e-06, + "loss": 0.1552, + "step": 940 + }, + { + "epoch": 3.6259541984732824, + "grad_norm": 1.2083804607391357, + "learning_rate": 6.326666666666667e-06, + "loss": 0.1533, + "step": 950 + }, + { + "epoch": 3.66412213740458, + "grad_norm": 0.7804637551307678, + "learning_rate": 6.393333333333334e-06, + "loss": 0.1527, + "step": 960 + }, + { + "epoch": 3.7022900763358777, + "grad_norm": 2.1257057189941406, + "learning_rate": 6.460000000000001e-06, + "loss": 0.1838, + "step": 970 + }, + { + "epoch": 3.7404580152671754, + "grad_norm": 0.7072081565856934, + "learning_rate": 6.526666666666666e-06, + "loss": 0.1574, + "step": 980 + }, + { + "epoch": 3.778625954198473, + "grad_norm": 1.387868046760559, + "learning_rate": 6.5933333333333335e-06, + "loss": 0.1409, + "step": 990 + }, + { + "epoch": 3.816793893129771, + "grad_norm": 2.3242297172546387, + "learning_rate": 6.660000000000001e-06, + "loss": 0.1384, + "step": 1000 + }, + { + "epoch": 3.854961832061069, + "grad_norm": 0.578836977481842, + "learning_rate": 6.726666666666667e-06, + "loss": 0.151, + "step": 1010 + }, + { + "epoch": 3.8931297709923665, + "grad_norm": 0.8613945841789246, + "learning_rate": 6.793333333333334e-06, + "loss": 0.1561, + "step": 1020 + }, + { + "epoch": 3.931297709923664, + "grad_norm": 0.8455677628517151, + "learning_rate": 6.860000000000001e-06, + "loss": 0.1561, + "step": 1030 + }, + { + "epoch": 3.969465648854962, + "grad_norm": 2.132455825805664, + "learning_rate": 6.926666666666667e-06, + "loss": 0.1589, + "step": 1040 + }, + { + "epoch": 4.00763358778626, + "grad_norm": 2.1734399795532227, + "learning_rate": 6.993333333333334e-06, + "loss": 0.1643, + "step": 1050 + }, + { + "epoch": 4.0458015267175576, + "grad_norm": 0.7833858132362366, + "learning_rate": 7.06e-06, + "loss": 0.1425, + "step": 1060 + }, + { + "epoch": 4.083969465648855, + "grad_norm": 1.160144567489624, + "learning_rate": 7.126666666666667e-06, + "loss": 0.1446, + "step": 1070 + }, + { + "epoch": 4.122137404580153, + "grad_norm": 0.5361282825469971, + "learning_rate": 7.1933333333333345e-06, + "loss": 0.13, + "step": 1080 + }, + { + "epoch": 4.1603053435114505, + "grad_norm": 1.3808670043945312, + "learning_rate": 7.260000000000001e-06, + "loss": 0.1692, + "step": 1090 + }, + { + "epoch": 4.198473282442748, + "grad_norm": 1.0317463874816895, + "learning_rate": 7.326666666666667e-06, + "loss": 0.1346, + "step": 1100 + }, + { + "epoch": 4.236641221374046, + "grad_norm": 0.7219449877738953, + "learning_rate": 7.393333333333333e-06, + "loss": 0.1508, + "step": 1110 + }, + { + "epoch": 4.2748091603053435, + "grad_norm": 4.125285625457764, + "learning_rate": 7.4600000000000006e-06, + "loss": 0.1658, + "step": 1120 + }, + { + "epoch": 4.312977099236641, + "grad_norm": 1.0689539909362793, + "learning_rate": 7.526666666666668e-06, + "loss": 0.1459, + "step": 1130 + }, + { + "epoch": 4.351145038167939, + "grad_norm": 0.8281168937683105, + "learning_rate": 7.593333333333334e-06, + "loss": 0.1327, + "step": 1140 + }, + { + "epoch": 4.3893129770992365, + "grad_norm": 0.4322452247142792, + "learning_rate": 7.660000000000001e-06, + "loss": 0.1469, + "step": 1150 + }, + { + "epoch": 4.427480916030534, + "grad_norm": 0.7191686630249023, + "learning_rate": 7.726666666666667e-06, + "loss": 0.1352, + "step": 1160 + }, + { + "epoch": 4.465648854961832, + "grad_norm": 0.676465630531311, + "learning_rate": 7.793333333333334e-06, + "loss": 0.1336, + "step": 1170 + }, + { + "epoch": 4.5038167938931295, + "grad_norm": 0.7372109293937683, + "learning_rate": 7.860000000000001e-06, + "loss": 0.1483, + "step": 1180 + }, + { + "epoch": 4.541984732824427, + "grad_norm": 1.089241862297058, + "learning_rate": 7.926666666666666e-06, + "loss": 0.142, + "step": 1190 + }, + { + "epoch": 4.580152671755725, + "grad_norm": 0.5060117840766907, + "learning_rate": 7.993333333333334e-06, + "loss": 0.1542, + "step": 1200 + }, + { + "epoch": 4.618320610687023, + "grad_norm": 0.5482001900672913, + "learning_rate": 8.06e-06, + "loss": 0.1326, + "step": 1210 + }, + { + "epoch": 4.65648854961832, + "grad_norm": 0.677850604057312, + "learning_rate": 8.126666666666668e-06, + "loss": 0.1377, + "step": 1220 + }, + { + "epoch": 4.694656488549619, + "grad_norm": 0.4879799783229828, + "learning_rate": 8.193333333333335e-06, + "loss": 0.1323, + "step": 1230 + }, + { + "epoch": 4.732824427480916, + "grad_norm": 0.8837642073631287, + "learning_rate": 8.26e-06, + "loss": 0.134, + "step": 1240 + }, + { + "epoch": 4.770992366412214, + "grad_norm": 0.6340999007225037, + "learning_rate": 8.326666666666668e-06, + "loss": 0.1267, + "step": 1250 + }, + { + "epoch": 4.809160305343512, + "grad_norm": 1.3912076950073242, + "learning_rate": 8.393333333333335e-06, + "loss": 0.1483, + "step": 1260 + }, + { + "epoch": 4.847328244274809, + "grad_norm": 0.44081875681877136, + "learning_rate": 8.46e-06, + "loss": 0.1514, + "step": 1270 + }, + { + "epoch": 4.885496183206107, + "grad_norm": 1.8900103569030762, + "learning_rate": 8.526666666666667e-06, + "loss": 0.1492, + "step": 1280 + }, + { + "epoch": 4.923664122137405, + "grad_norm": 1.0593299865722656, + "learning_rate": 8.593333333333333e-06, + "loss": 0.1339, + "step": 1290 + }, + { + "epoch": 4.961832061068702, + "grad_norm": 0.8677412271499634, + "learning_rate": 8.66e-06, + "loss": 0.1383, + "step": 1300 + }, + { + "epoch": 5.0, + "grad_norm": 0.49994489550590515, + "learning_rate": 8.726666666666667e-06, + "loss": 0.1562, + "step": 1310 + }, + { + "epoch": 5.038167938931298, + "grad_norm": 0.9444197416305542, + "learning_rate": 8.793333333333334e-06, + "loss": 0.1584, + "step": 1320 + }, + { + "epoch": 5.076335877862595, + "grad_norm": 0.6566011309623718, + "learning_rate": 8.860000000000002e-06, + "loss": 0.1259, + "step": 1330 + }, + { + "epoch": 5.114503816793893, + "grad_norm": 0.574004590511322, + "learning_rate": 8.926666666666669e-06, + "loss": 0.1242, + "step": 1340 + }, + { + "epoch": 5.152671755725191, + "grad_norm": 0.4502525329589844, + "learning_rate": 8.993333333333334e-06, + "loss": 0.126, + "step": 1350 + }, + { + "epoch": 5.190839694656488, + "grad_norm": 0.6264625787734985, + "learning_rate": 9.060000000000001e-06, + "loss": 0.1458, + "step": 1360 + }, + { + "epoch": 5.229007633587786, + "grad_norm": 0.6364710927009583, + "learning_rate": 9.126666666666667e-06, + "loss": 0.1417, + "step": 1370 + }, + { + "epoch": 5.267175572519084, + "grad_norm": 0.8996373414993286, + "learning_rate": 9.193333333333334e-06, + "loss": 0.151, + "step": 1380 + }, + { + "epoch": 5.305343511450381, + "grad_norm": 1.3520936965942383, + "learning_rate": 9.260000000000001e-06, + "loss": 0.1447, + "step": 1390 + }, + { + "epoch": 5.34351145038168, + "grad_norm": 0.7287914752960205, + "learning_rate": 9.326666666666667e-06, + "loss": 0.1417, + "step": 1400 + }, + { + "epoch": 5.3816793893129775, + "grad_norm": 1.365167498588562, + "learning_rate": 9.393333333333334e-06, + "loss": 0.137, + "step": 1410 + }, + { + "epoch": 5.419847328244275, + "grad_norm": 0.889162540435791, + "learning_rate": 9.460000000000001e-06, + "loss": 0.1686, + "step": 1420 + }, + { + "epoch": 5.458015267175573, + "grad_norm": 1.228797197341919, + "learning_rate": 9.526666666666668e-06, + "loss": 0.1359, + "step": 1430 + }, + { + "epoch": 5.4961832061068705, + "grad_norm": 0.6598213911056519, + "learning_rate": 9.593333333333335e-06, + "loss": 0.135, + "step": 1440 + }, + { + "epoch": 5.534351145038168, + "grad_norm": 0.6431540846824646, + "learning_rate": 9.66e-06, + "loss": 0.1381, + "step": 1450 + }, + { + "epoch": 5.572519083969466, + "grad_norm": 0.731825053691864, + "learning_rate": 9.726666666666668e-06, + "loss": 0.1493, + "step": 1460 + }, + { + "epoch": 5.6106870229007635, + "grad_norm": 0.5904266238212585, + "learning_rate": 9.793333333333333e-06, + "loss": 0.1279, + "step": 1470 + }, + { + "epoch": 5.648854961832061, + "grad_norm": 1.9022412300109863, + "learning_rate": 9.86e-06, + "loss": 0.1275, + "step": 1480 + }, + { + "epoch": 5.687022900763359, + "grad_norm": 0.5651864409446716, + "learning_rate": 9.926666666666668e-06, + "loss": 0.1326, + "step": 1490 + }, + { + "epoch": 5.7251908396946565, + "grad_norm": 0.8466259241104126, + "learning_rate": 9.993333333333333e-06, + "loss": 0.141, + "step": 1500 + }, + { + "epoch": 5.763358778625954, + "grad_norm": 0.9748515486717224, + "learning_rate": 9.999997539434007e-06, + "loss": 0.1293, + "step": 1510 + }, + { + "epoch": 5.801526717557252, + "grad_norm": 0.6662544012069702, + "learning_rate": 9.999989033776898e-06, + "loss": 0.1321, + "step": 1520 + }, + { + "epoch": 5.8396946564885495, + "grad_norm": 0.4294588267803192, + "learning_rate": 9.999974452661642e-06, + "loss": 0.1387, + "step": 1530 + }, + { + "epoch": 5.877862595419847, + "grad_norm": 1.3427824974060059, + "learning_rate": 9.99995379610596e-06, + "loss": 0.144, + "step": 1540 + }, + { + "epoch": 5.916030534351145, + "grad_norm": 0.6164063215255737, + "learning_rate": 9.99992706413495e-06, + "loss": 0.1639, + "step": 1550 + }, + { + "epoch": 5.9541984732824424, + "grad_norm": 0.511205792427063, + "learning_rate": 9.999894256781095e-06, + "loss": 0.1401, + "step": 1560 + }, + { + "epoch": 5.99236641221374, + "grad_norm": 0.5297313928604126, + "learning_rate": 9.99985537408426e-06, + "loss": 0.1284, + "step": 1570 + }, + { + "epoch": 6.030534351145038, + "grad_norm": 0.4822237491607666, + "learning_rate": 9.999810416091689e-06, + "loss": 0.1251, + "step": 1580 + }, + { + "epoch": 6.068702290076336, + "grad_norm": 0.49136868119239807, + "learning_rate": 9.99975938285801e-06, + "loss": 0.135, + "step": 1590 + }, + { + "epoch": 6.106870229007634, + "grad_norm": 0.5754089951515198, + "learning_rate": 9.999702274445235e-06, + "loss": 0.1154, + "step": 1600 + }, + { + "epoch": 6.145038167938932, + "grad_norm": 0.5563395619392395, + "learning_rate": 9.999639090922758e-06, + "loss": 0.1229, + "step": 1610 + }, + { + "epoch": 6.183206106870229, + "grad_norm": 0.6620687246322632, + "learning_rate": 9.999569832367346e-06, + "loss": 0.1239, + "step": 1620 + }, + { + "epoch": 6.221374045801527, + "grad_norm": 0.7389596700668335, + "learning_rate": 9.999494498863163e-06, + "loss": 0.1308, + "step": 1630 + }, + { + "epoch": 6.259541984732825, + "grad_norm": 1.2152165174484253, + "learning_rate": 9.99941309050174e-06, + "loss": 0.1382, + "step": 1640 + }, + { + "epoch": 6.297709923664122, + "grad_norm": 0.7054020762443542, + "learning_rate": 9.999325607382e-06, + "loss": 0.1657, + "step": 1650 + }, + { + "epoch": 6.33587786259542, + "grad_norm": 0.48537522554397583, + "learning_rate": 9.999232049610239e-06, + "loss": 0.1295, + "step": 1660 + }, + { + "epoch": 6.374045801526718, + "grad_norm": 0.9919182062149048, + "learning_rate": 9.999132417300141e-06, + "loss": 0.1454, + "step": 1670 + }, + { + "epoch": 6.412213740458015, + "grad_norm": 0.5933136343955994, + "learning_rate": 9.99902671057277e-06, + "loss": 0.1393, + "step": 1680 + }, + { + "epoch": 6.450381679389313, + "grad_norm": 1.4632704257965088, + "learning_rate": 9.998914929556569e-06, + "loss": 0.1269, + "step": 1690 + }, + { + "epoch": 6.488549618320611, + "grad_norm": 0.8186988234519958, + "learning_rate": 9.99879707438736e-06, + "loss": 0.1328, + "step": 1700 + }, + { + "epoch": 6.526717557251908, + "grad_norm": 0.7318149209022522, + "learning_rate": 9.998673145208351e-06, + "loss": 0.1278, + "step": 1710 + }, + { + "epoch": 6.564885496183206, + "grad_norm": 0.6014979481697083, + "learning_rate": 9.998543142170127e-06, + "loss": 0.1331, + "step": 1720 + }, + { + "epoch": 6.603053435114504, + "grad_norm": 0.5737615823745728, + "learning_rate": 9.99840706543065e-06, + "loss": 0.1369, + "step": 1730 + }, + { + "epoch": 6.641221374045801, + "grad_norm": 1.0225228071212769, + "learning_rate": 9.998264915155274e-06, + "loss": 0.1429, + "step": 1740 + }, + { + "epoch": 6.679389312977099, + "grad_norm": 0.7193993330001831, + "learning_rate": 9.998116691516718e-06, + "loss": 0.1235, + "step": 1750 + }, + { + "epoch": 6.717557251908397, + "grad_norm": 2.232433795928955, + "learning_rate": 9.997962394695091e-06, + "loss": 0.1266, + "step": 1760 + }, + { + "epoch": 6.755725190839694, + "grad_norm": 0.7240338325500488, + "learning_rate": 9.997802024877876e-06, + "loss": 0.1358, + "step": 1770 + }, + { + "epoch": 6.793893129770993, + "grad_norm": 0.48704954981803894, + "learning_rate": 9.997635582259941e-06, + "loss": 0.1292, + "step": 1780 + }, + { + "epoch": 6.8320610687022905, + "grad_norm": 0.4361136257648468, + "learning_rate": 9.997463067043526e-06, + "loss": 0.1282, + "step": 1790 + }, + { + "epoch": 6.870229007633588, + "grad_norm": 0.8777408003807068, + "learning_rate": 9.997284479438254e-06, + "loss": 0.1289, + "step": 1800 + }, + { + "epoch": 6.908396946564886, + "grad_norm": 0.586004912853241, + "learning_rate": 9.997099819661127e-06, + "loss": 0.1308, + "step": 1810 + }, + { + "epoch": 6.9465648854961835, + "grad_norm": 0.34389933943748474, + "learning_rate": 9.996909087936523e-06, + "loss": 0.1323, + "step": 1820 + }, + { + "epoch": 6.984732824427481, + "grad_norm": 0.6134178638458252, + "learning_rate": 9.996712284496202e-06, + "loss": 0.1215, + "step": 1830 + }, + { + "epoch": 7.022900763358779, + "grad_norm": 0.5625924468040466, + "learning_rate": 9.996509409579293e-06, + "loss": 0.1316, + "step": 1840 + }, + { + "epoch": 7.061068702290076, + "grad_norm": 0.6711544394493103, + "learning_rate": 9.996300463432313e-06, + "loss": 0.1188, + "step": 1850 + }, + { + "epoch": 7.099236641221374, + "grad_norm": 0.46400946378707886, + "learning_rate": 9.996085446309149e-06, + "loss": 0.1356, + "step": 1860 + }, + { + "epoch": 7.137404580152672, + "grad_norm": 0.6120234727859497, + "learning_rate": 9.995864358471067e-06, + "loss": 0.1551, + "step": 1870 + }, + { + "epoch": 7.175572519083969, + "grad_norm": 0.8439425230026245, + "learning_rate": 9.99563720018671e-06, + "loss": 0.1382, + "step": 1880 + }, + { + "epoch": 7.213740458015267, + "grad_norm": 0.7507050037384033, + "learning_rate": 9.995403971732098e-06, + "loss": 0.1292, + "step": 1890 + }, + { + "epoch": 7.251908396946565, + "grad_norm": 1.3689290285110474, + "learning_rate": 9.995164673390624e-06, + "loss": 0.1244, + "step": 1900 + }, + { + "epoch": 7.290076335877862, + "grad_norm": 0.8372470140457153, + "learning_rate": 9.994919305453059e-06, + "loss": 0.1318, + "step": 1910 + }, + { + "epoch": 7.32824427480916, + "grad_norm": 0.5124345421791077, + "learning_rate": 9.994667868217549e-06, + "loss": 0.1198, + "step": 1920 + }, + { + "epoch": 7.366412213740458, + "grad_norm": 0.48633715510368347, + "learning_rate": 9.99441036198961e-06, + "loss": 0.1364, + "step": 1930 + }, + { + "epoch": 7.404580152671755, + "grad_norm": 0.800594687461853, + "learning_rate": 9.994146787082141e-06, + "loss": 0.1246, + "step": 1940 + }, + { + "epoch": 7.442748091603053, + "grad_norm": 0.5229592323303223, + "learning_rate": 9.993877143815406e-06, + "loss": 0.1236, + "step": 1950 + }, + { + "epoch": 7.480916030534351, + "grad_norm": 0.49684378504753113, + "learning_rate": 9.993601432517052e-06, + "loss": 0.1209, + "step": 1960 + }, + { + "epoch": 7.519083969465649, + "grad_norm": 0.4344727396965027, + "learning_rate": 9.99331965352209e-06, + "loss": 0.119, + "step": 1970 + }, + { + "epoch": 7.557251908396947, + "grad_norm": 1.0743569135665894, + "learning_rate": 9.993031807172912e-06, + "loss": 0.1304, + "step": 1980 + }, + { + "epoch": 7.595419847328245, + "grad_norm": 0.44631901383399963, + "learning_rate": 9.992737893819274e-06, + "loss": 0.1309, + "step": 1990 + }, + { + "epoch": 7.633587786259542, + "grad_norm": 0.8176268935203552, + "learning_rate": 9.992437913818311e-06, + "loss": 0.1245, + "step": 2000 + }, + { + "epoch": 7.67175572519084, + "grad_norm": 0.6087300181388855, + "learning_rate": 9.992131867534526e-06, + "loss": 0.1262, + "step": 2010 + }, + { + "epoch": 7.709923664122138, + "grad_norm": 0.5834507346153259, + "learning_rate": 9.991819755339796e-06, + "loss": 0.1215, + "step": 2020 + }, + { + "epoch": 7.748091603053435, + "grad_norm": 0.4410012364387512, + "learning_rate": 9.991501577613365e-06, + "loss": 0.1271, + "step": 2030 + }, + { + "epoch": 7.786259541984733, + "grad_norm": 0.8793311715126038, + "learning_rate": 9.991177334741851e-06, + "loss": 0.1199, + "step": 2040 + }, + { + "epoch": 7.824427480916031, + "grad_norm": 0.4199819564819336, + "learning_rate": 9.990847027119235e-06, + "loss": 0.1229, + "step": 2050 + }, + { + "epoch": 7.862595419847328, + "grad_norm": 0.5443435311317444, + "learning_rate": 9.990510655146877e-06, + "loss": 0.1155, + "step": 2060 + }, + { + "epoch": 7.900763358778626, + "grad_norm": 1.0193560123443604, + "learning_rate": 9.990168219233496e-06, + "loss": 0.1295, + "step": 2070 + }, + { + "epoch": 7.938931297709924, + "grad_norm": 0.5311276912689209, + "learning_rate": 9.989819719795189e-06, + "loss": 0.1274, + "step": 2080 + }, + { + "epoch": 7.977099236641221, + "grad_norm": 0.5494500994682312, + "learning_rate": 9.989465157255413e-06, + "loss": 0.1356, + "step": 2090 + }, + { + "epoch": 8.01526717557252, + "grad_norm": 0.4246234595775604, + "learning_rate": 9.989104532044994e-06, + "loss": 0.1213, + "step": 2100 + }, + { + "epoch": 8.053435114503817, + "grad_norm": 0.5394090414047241, + "learning_rate": 9.988737844602128e-06, + "loss": 0.1284, + "step": 2110 + }, + { + "epoch": 8.091603053435115, + "grad_norm": 0.6479999423027039, + "learning_rate": 9.988365095372373e-06, + "loss": 0.1278, + "step": 2120 + }, + { + "epoch": 8.129770992366412, + "grad_norm": 0.7949787378311157, + "learning_rate": 9.987986284808654e-06, + "loss": 0.1285, + "step": 2130 + }, + { + "epoch": 8.16793893129771, + "grad_norm": 0.37061432003974915, + "learning_rate": 9.987601413371264e-06, + "loss": 0.1218, + "step": 2140 + }, + { + "epoch": 8.206106870229007, + "grad_norm": 0.6125175952911377, + "learning_rate": 9.987210481527857e-06, + "loss": 0.1284, + "step": 2150 + }, + { + "epoch": 8.244274809160306, + "grad_norm": 1.3589234352111816, + "learning_rate": 9.98681348975345e-06, + "loss": 0.1281, + "step": 2160 + }, + { + "epoch": 8.282442748091603, + "grad_norm": 0.6536274552345276, + "learning_rate": 9.986410438530428e-06, + "loss": 0.1237, + "step": 2170 + }, + { + "epoch": 8.320610687022901, + "grad_norm": 0.5439227223396301, + "learning_rate": 9.986001328348534e-06, + "loss": 0.1292, + "step": 2180 + }, + { + "epoch": 8.358778625954198, + "grad_norm": 0.6106489896774292, + "learning_rate": 9.985586159704879e-06, + "loss": 0.1237, + "step": 2190 + }, + { + "epoch": 8.396946564885496, + "grad_norm": 0.422396719455719, + "learning_rate": 9.98516493310393e-06, + "loss": 0.1247, + "step": 2200 + }, + { + "epoch": 8.435114503816793, + "grad_norm": 0.4791705012321472, + "learning_rate": 9.984737649057514e-06, + "loss": 0.1291, + "step": 2210 + }, + { + "epoch": 8.473282442748092, + "grad_norm": 1.0834544897079468, + "learning_rate": 9.984304308084827e-06, + "loss": 0.1383, + "step": 2220 + }, + { + "epoch": 8.511450381679388, + "grad_norm": 0.5249638557434082, + "learning_rate": 9.983864910712416e-06, + "loss": 0.1249, + "step": 2230 + }, + { + "epoch": 8.549618320610687, + "grad_norm": 0.6899205446243286, + "learning_rate": 9.98341945747419e-06, + "loss": 0.1242, + "step": 2240 + }, + { + "epoch": 8.587786259541986, + "grad_norm": 0.6143678426742554, + "learning_rate": 9.98296794891142e-06, + "loss": 0.1394, + "step": 2250 + }, + { + "epoch": 8.625954198473282, + "grad_norm": 0.4065885543823242, + "learning_rate": 9.982510385572725e-06, + "loss": 0.1343, + "step": 2260 + }, + { + "epoch": 8.664122137404581, + "grad_norm": 1.2763944864273071, + "learning_rate": 9.982046768014093e-06, + "loss": 0.1448, + "step": 2270 + }, + { + "epoch": 8.702290076335878, + "grad_norm": 0.924741268157959, + "learning_rate": 9.981577096798864e-06, + "loss": 0.1442, + "step": 2280 + }, + { + "epoch": 8.740458015267176, + "grad_norm": 0.8587446212768555, + "learning_rate": 9.981101372497729e-06, + "loss": 0.13, + "step": 2290 + }, + { + "epoch": 8.778625954198473, + "grad_norm": 0.8710682392120361, + "learning_rate": 9.980619595688737e-06, + "loss": 0.1263, + "step": 2300 + }, + { + "epoch": 8.816793893129772, + "grad_norm": 0.7033365964889526, + "learning_rate": 9.980131766957296e-06, + "loss": 0.1207, + "step": 2310 + }, + { + "epoch": 8.854961832061068, + "grad_norm": 0.4609795808792114, + "learning_rate": 9.979637886896162e-06, + "loss": 0.1244, + "step": 2320 + }, + { + "epoch": 8.893129770992367, + "grad_norm": 0.7439802289009094, + "learning_rate": 9.979137956105446e-06, + "loss": 0.1202, + "step": 2330 + }, + { + "epoch": 8.931297709923664, + "grad_norm": 1.673240065574646, + "learning_rate": 9.978631975192613e-06, + "loss": 0.1258, + "step": 2340 + }, + { + "epoch": 8.969465648854962, + "grad_norm": 1.4078855514526367, + "learning_rate": 9.978119944772476e-06, + "loss": 0.1305, + "step": 2350 + }, + { + "epoch": 9.007633587786259, + "grad_norm": 0.703015923500061, + "learning_rate": 9.977601865467197e-06, + "loss": 0.1186, + "step": 2360 + }, + { + "epoch": 9.045801526717558, + "grad_norm": 0.973673939704895, + "learning_rate": 9.977077737906296e-06, + "loss": 0.129, + "step": 2370 + }, + { + "epoch": 9.083969465648854, + "grad_norm": 0.6487618684768677, + "learning_rate": 9.976547562726637e-06, + "loss": 0.1108, + "step": 2380 + }, + { + "epoch": 9.122137404580153, + "grad_norm": 0.4547242820262909, + "learning_rate": 9.97601134057243e-06, + "loss": 0.1211, + "step": 2390 + }, + { + "epoch": 9.16030534351145, + "grad_norm": 0.7747370600700378, + "learning_rate": 9.975469072095236e-06, + "loss": 0.122, + "step": 2400 + }, + { + "epoch": 9.198473282442748, + "grad_norm": 0.6227743029594421, + "learning_rate": 9.974920757953965e-06, + "loss": 0.1337, + "step": 2410 + }, + { + "epoch": 9.236641221374045, + "grad_norm": 1.6230658292770386, + "learning_rate": 9.97436639881487e-06, + "loss": 0.1333, + "step": 2420 + }, + { + "epoch": 9.274809160305344, + "grad_norm": 0.4734322130680084, + "learning_rate": 9.973805995351545e-06, + "loss": 0.1312, + "step": 2430 + }, + { + "epoch": 9.312977099236642, + "grad_norm": 1.1201488971710205, + "learning_rate": 9.973239548244938e-06, + "loss": 0.1332, + "step": 2440 + }, + { + "epoch": 9.351145038167939, + "grad_norm": 0.6548280119895935, + "learning_rate": 9.972667058183333e-06, + "loss": 0.1282, + "step": 2450 + }, + { + "epoch": 9.389312977099237, + "grad_norm": 0.9208749532699585, + "learning_rate": 9.972088525862363e-06, + "loss": 0.1397, + "step": 2460 + }, + { + "epoch": 9.427480916030534, + "grad_norm": 0.8364688158035278, + "learning_rate": 9.971503951984996e-06, + "loss": 0.1346, + "step": 2470 + }, + { + "epoch": 9.465648854961833, + "grad_norm": 0.5072993636131287, + "learning_rate": 9.970913337261544e-06, + "loss": 0.1216, + "step": 2480 + }, + { + "epoch": 9.50381679389313, + "grad_norm": 0.5047227144241333, + "learning_rate": 9.97031668240966e-06, + "loss": 0.1158, + "step": 2490 + }, + { + "epoch": 9.541984732824428, + "grad_norm": 0.6214285492897034, + "learning_rate": 9.96971398815434e-06, + "loss": 0.1275, + "step": 2500 + }, + { + "epoch": 9.580152671755725, + "grad_norm": 0.2843784987926483, + "learning_rate": 9.969105255227906e-06, + "loss": 0.1202, + "step": 2510 + }, + { + "epoch": 9.618320610687023, + "grad_norm": 0.9346826672554016, + "learning_rate": 9.968490484370035e-06, + "loss": 0.1257, + "step": 2520 + }, + { + "epoch": 9.65648854961832, + "grad_norm": 0.5003984570503235, + "learning_rate": 9.967869676327726e-06, + "loss": 0.1189, + "step": 2530 + }, + { + "epoch": 9.694656488549619, + "grad_norm": 0.535346508026123, + "learning_rate": 9.967242831855321e-06, + "loss": 0.1166, + "step": 2540 + }, + { + "epoch": 9.732824427480915, + "grad_norm": 2.279946804046631, + "learning_rate": 9.966609951714495e-06, + "loss": 0.1317, + "step": 2550 + }, + { + "epoch": 9.770992366412214, + "grad_norm": 0.4801623523235321, + "learning_rate": 9.965971036674256e-06, + "loss": 0.1233, + "step": 2560 + }, + { + "epoch": 9.80916030534351, + "grad_norm": 0.861571192741394, + "learning_rate": 9.965326087510947e-06, + "loss": 0.1238, + "step": 2570 + }, + { + "epoch": 9.84732824427481, + "grad_norm": 0.4469713568687439, + "learning_rate": 9.964675105008243e-06, + "loss": 0.1162, + "step": 2580 + }, + { + "epoch": 9.885496183206106, + "grad_norm": 0.7882781028747559, + "learning_rate": 9.964018089957148e-06, + "loss": 0.1225, + "step": 2590 + }, + { + "epoch": 9.923664122137405, + "grad_norm": 0.38498684763908386, + "learning_rate": 9.963355043155997e-06, + "loss": 0.1202, + "step": 2600 + }, + { + "epoch": 9.961832061068701, + "grad_norm": 0.5153051614761353, + "learning_rate": 9.962685965410455e-06, + "loss": 0.1202, + "step": 2610 + }, + { + "epoch": 10.0, + "grad_norm": 0.4856826663017273, + "learning_rate": 9.962010857533514e-06, + "loss": 0.1152, + "step": 2620 + }, + { + "epoch": 10.038167938931299, + "grad_norm": 0.44617682695388794, + "learning_rate": 9.961329720345494e-06, + "loss": 0.1225, + "step": 2630 + }, + { + "epoch": 10.076335877862595, + "grad_norm": 0.5542685389518738, + "learning_rate": 9.96064255467404e-06, + "loss": 0.1271, + "step": 2640 + }, + { + "epoch": 10.114503816793894, + "grad_norm": 0.46713921427726746, + "learning_rate": 9.959949361354127e-06, + "loss": 0.1034, + "step": 2650 + }, + { + "epoch": 10.15267175572519, + "grad_norm": 0.3956896662712097, + "learning_rate": 9.959250141228046e-06, + "loss": 0.1086, + "step": 2660 + }, + { + "epoch": 10.19083969465649, + "grad_norm": 0.34693989157676697, + "learning_rate": 9.958544895145415e-06, + "loss": 0.1185, + "step": 2670 + }, + { + "epoch": 10.229007633587786, + "grad_norm": 0.5464341044425964, + "learning_rate": 9.957833623963178e-06, + "loss": 0.1213, + "step": 2680 + }, + { + "epoch": 10.267175572519085, + "grad_norm": 1.7212783098220825, + "learning_rate": 9.957116328545593e-06, + "loss": 0.1231, + "step": 2690 + }, + { + "epoch": 10.305343511450381, + "grad_norm": 0.5747115015983582, + "learning_rate": 9.956393009764244e-06, + "loss": 0.1436, + "step": 2700 + }, + { + "epoch": 10.34351145038168, + "grad_norm": 0.7528557181358337, + "learning_rate": 9.955663668498032e-06, + "loss": 0.1238, + "step": 2710 + }, + { + "epoch": 10.381679389312977, + "grad_norm": 0.7713605761528015, + "learning_rate": 9.954928305633174e-06, + "loss": 0.1309, + "step": 2720 + }, + { + "epoch": 10.419847328244275, + "grad_norm": 0.42558568716049194, + "learning_rate": 9.954186922063204e-06, + "loss": 0.1258, + "step": 2730 + }, + { + "epoch": 10.458015267175572, + "grad_norm": 0.44242027401924133, + "learning_rate": 9.953439518688974e-06, + "loss": 0.1097, + "step": 2740 + }, + { + "epoch": 10.49618320610687, + "grad_norm": 0.4029052257537842, + "learning_rate": 9.952686096418652e-06, + "loss": 0.1273, + "step": 2750 + }, + { + "epoch": 10.534351145038167, + "grad_norm": 0.38173234462738037, + "learning_rate": 9.951926656167715e-06, + "loss": 0.1163, + "step": 2760 + }, + { + "epoch": 10.572519083969466, + "grad_norm": 0.6817983984947205, + "learning_rate": 9.951161198858952e-06, + "loss": 0.1242, + "step": 2770 + }, + { + "epoch": 10.610687022900763, + "grad_norm": 0.47834619879722595, + "learning_rate": 9.95038972542247e-06, + "loss": 0.1173, + "step": 2780 + }, + { + "epoch": 10.648854961832061, + "grad_norm": 1.0968841314315796, + "learning_rate": 9.949612236795682e-06, + "loss": 0.1172, + "step": 2790 + }, + { + "epoch": 10.68702290076336, + "grad_norm": 0.4130131006240845, + "learning_rate": 9.948828733923305e-06, + "loss": 0.1202, + "step": 2800 + }, + { + "epoch": 10.725190839694656, + "grad_norm": 0.5207512974739075, + "learning_rate": 9.948039217757375e-06, + "loss": 0.1318, + "step": 2810 + }, + { + "epoch": 10.763358778625955, + "grad_norm": 0.5142215490341187, + "learning_rate": 9.947243689257226e-06, + "loss": 0.1175, + "step": 2820 + }, + { + "epoch": 10.801526717557252, + "grad_norm": 0.46614205837249756, + "learning_rate": 9.946442149389498e-06, + "loss": 0.1248, + "step": 2830 + }, + { + "epoch": 10.83969465648855, + "grad_norm": 0.6606496572494507, + "learning_rate": 9.94563459912814e-06, + "loss": 0.132, + "step": 2840 + }, + { + "epoch": 10.877862595419847, + "grad_norm": 0.7258016467094421, + "learning_rate": 9.944821039454403e-06, + "loss": 0.1153, + "step": 2850 + }, + { + "epoch": 10.916030534351146, + "grad_norm": 0.5228508114814758, + "learning_rate": 9.944001471356835e-06, + "loss": 0.134, + "step": 2860 + }, + { + "epoch": 10.954198473282442, + "grad_norm": 0.7532591223716736, + "learning_rate": 9.94317589583129e-06, + "loss": 0.1203, + "step": 2870 + }, + { + "epoch": 10.992366412213741, + "grad_norm": 0.8692820072174072, + "learning_rate": 9.942344313880922e-06, + "loss": 0.1177, + "step": 2880 + }, + { + "epoch": 11.030534351145038, + "grad_norm": 0.40786102414131165, + "learning_rate": 9.941506726516179e-06, + "loss": 0.1336, + "step": 2890 + }, + { + "epoch": 11.068702290076336, + "grad_norm": 0.4568503201007843, + "learning_rate": 9.94066313475481e-06, + "loss": 0.1187, + "step": 2900 + }, + { + "epoch": 11.106870229007633, + "grad_norm": 0.4302501678466797, + "learning_rate": 9.939813539621858e-06, + "loss": 0.1209, + "step": 2910 + }, + { + "epoch": 11.145038167938932, + "grad_norm": 0.32359543442726135, + "learning_rate": 9.93895794214966e-06, + "loss": 0.1077, + "step": 2920 + }, + { + "epoch": 11.183206106870228, + "grad_norm": 0.5436263680458069, + "learning_rate": 9.938096343377853e-06, + "loss": 0.1312, + "step": 2930 + }, + { + "epoch": 11.221374045801527, + "grad_norm": 0.3711869418621063, + "learning_rate": 9.937228744353354e-06, + "loss": 0.1173, + "step": 2940 + }, + { + "epoch": 11.259541984732824, + "grad_norm": 0.35187089443206787, + "learning_rate": 9.93635514613038e-06, + "loss": 0.1153, + "step": 2950 + }, + { + "epoch": 11.297709923664122, + "grad_norm": 0.5851736664772034, + "learning_rate": 9.935475549770436e-06, + "loss": 0.1169, + "step": 2960 + }, + { + "epoch": 11.335877862595419, + "grad_norm": 0.3298185467720032, + "learning_rate": 9.934589956342315e-06, + "loss": 0.1163, + "step": 2970 + }, + { + "epoch": 11.374045801526718, + "grad_norm": 0.6017898917198181, + "learning_rate": 9.933698366922093e-06, + "loss": 0.1216, + "step": 2980 + }, + { + "epoch": 11.412213740458014, + "grad_norm": 0.6507192254066467, + "learning_rate": 9.932800782593141e-06, + "loss": 0.1226, + "step": 2990 + }, + { + "epoch": 11.450381679389313, + "grad_norm": 0.3257594406604767, + "learning_rate": 9.931897204446104e-06, + "loss": 0.1308, + "step": 3000 + }, + { + "epoch": 11.488549618320612, + "grad_norm": 4.35601806640625, + "learning_rate": 9.930987633578916e-06, + "loss": 0.1232, + "step": 3010 + }, + { + "epoch": 11.526717557251908, + "grad_norm": 0.3781754970550537, + "learning_rate": 9.930072071096791e-06, + "loss": 0.1277, + "step": 3020 + }, + { + "epoch": 11.564885496183207, + "grad_norm": 0.8004752397537231, + "learning_rate": 9.929150518112225e-06, + "loss": 0.1216, + "step": 3030 + }, + { + "epoch": 11.603053435114504, + "grad_norm": 0.6701073050498962, + "learning_rate": 9.928222975744992e-06, + "loss": 0.1208, + "step": 3040 + }, + { + "epoch": 11.641221374045802, + "grad_norm": 0.39749905467033386, + "learning_rate": 9.92728944512214e-06, + "loss": 0.1079, + "step": 3050 + }, + { + "epoch": 11.679389312977099, + "grad_norm": 0.6213489174842834, + "learning_rate": 9.926349927378e-06, + "loss": 0.1297, + "step": 3060 + }, + { + "epoch": 11.717557251908397, + "grad_norm": 0.4827481508255005, + "learning_rate": 9.925404423654175e-06, + "loss": 0.1244, + "step": 3070 + }, + { + "epoch": 11.755725190839694, + "grad_norm": 0.7463303804397583, + "learning_rate": 9.924452935099537e-06, + "loss": 0.1128, + "step": 3080 + }, + { + "epoch": 11.793893129770993, + "grad_norm": 1.3172951936721802, + "learning_rate": 9.923495462870241e-06, + "loss": 0.1119, + "step": 3090 + }, + { + "epoch": 11.83206106870229, + "grad_norm": 0.8011961579322815, + "learning_rate": 9.9225320081297e-06, + "loss": 0.1115, + "step": 3100 + }, + { + "epoch": 11.870229007633588, + "grad_norm": 1.2579299211502075, + "learning_rate": 9.921562572048606e-06, + "loss": 0.1416, + "step": 3110 + }, + { + "epoch": 11.908396946564885, + "grad_norm": 0.6220617294311523, + "learning_rate": 9.920587155804913e-06, + "loss": 0.121, + "step": 3120 + }, + { + "epoch": 11.946564885496183, + "grad_norm": 0.4087788760662079, + "learning_rate": 9.919605760583846e-06, + "loss": 0.123, + "step": 3130 + }, + { + "epoch": 11.98473282442748, + "grad_norm": 0.3896748125553131, + "learning_rate": 9.91861838757789e-06, + "loss": 0.1182, + "step": 3140 + }, + { + "epoch": 12.022900763358779, + "grad_norm": 0.467168927192688, + "learning_rate": 9.917625037986798e-06, + "loss": 0.1145, + "step": 3150 + }, + { + "epoch": 12.061068702290076, + "grad_norm": 0.6059496998786926, + "learning_rate": 9.916625713017582e-06, + "loss": 0.1218, + "step": 3160 + }, + { + "epoch": 12.099236641221374, + "grad_norm": 0.579326868057251, + "learning_rate": 9.91562041388452e-06, + "loss": 0.114, + "step": 3170 + }, + { + "epoch": 12.137404580152673, + "grad_norm": 1.0949403047561646, + "learning_rate": 9.91460914180914e-06, + "loss": 0.1306, + "step": 3180 + }, + { + "epoch": 12.17557251908397, + "grad_norm": 0.4158419370651245, + "learning_rate": 9.913591898020234e-06, + "loss": 0.1169, + "step": 3190 + }, + { + "epoch": 12.213740458015268, + "grad_norm": 1.5080878734588623, + "learning_rate": 9.912568683753853e-06, + "loss": 0.1215, + "step": 3200 + }, + { + "epoch": 12.251908396946565, + "grad_norm": 0.5173312425613403, + "learning_rate": 9.911539500253295e-06, + "loss": 0.1037, + "step": 3210 + }, + { + "epoch": 12.290076335877863, + "grad_norm": 0.33992186188697815, + "learning_rate": 9.910504348769118e-06, + "loss": 0.1145, + "step": 3220 + }, + { + "epoch": 12.32824427480916, + "grad_norm": 0.881161093711853, + "learning_rate": 9.909463230559127e-06, + "loss": 0.1205, + "step": 3230 + }, + { + "epoch": 12.366412213740459, + "grad_norm": 0.376974493265152, + "learning_rate": 9.908416146888376e-06, + "loss": 0.1234, + "step": 3240 + }, + { + "epoch": 12.404580152671755, + "grad_norm": 0.4849688410758972, + "learning_rate": 9.907363099029175e-06, + "loss": 0.1221, + "step": 3250 + }, + { + "epoch": 12.442748091603054, + "grad_norm": 0.6830031871795654, + "learning_rate": 9.906304088261073e-06, + "loss": 0.1522, + "step": 3260 + }, + { + "epoch": 12.48091603053435, + "grad_norm": 0.450184166431427, + "learning_rate": 9.905239115870873e-06, + "loss": 0.1109, + "step": 3270 + }, + { + "epoch": 12.51908396946565, + "grad_norm": 0.32189562916755676, + "learning_rate": 9.904168183152611e-06, + "loss": 0.1344, + "step": 3280 + }, + { + "epoch": 12.557251908396946, + "grad_norm": 0.6055088043212891, + "learning_rate": 9.903091291407574e-06, + "loss": 0.1377, + "step": 3290 + }, + { + "epoch": 12.595419847328245, + "grad_norm": 0.833159863948822, + "learning_rate": 9.902008441944287e-06, + "loss": 0.1304, + "step": 3300 + }, + { + "epoch": 12.633587786259541, + "grad_norm": 0.6403981447219849, + "learning_rate": 9.900919636078511e-06, + "loss": 0.111, + "step": 3310 + }, + { + "epoch": 12.67175572519084, + "grad_norm": 1.0145337581634521, + "learning_rate": 9.899824875133255e-06, + "loss": 0.123, + "step": 3320 + }, + { + "epoch": 12.709923664122137, + "grad_norm": 0.8037505149841309, + "learning_rate": 9.89872416043875e-06, + "loss": 0.1188, + "step": 3330 + }, + { + "epoch": 12.748091603053435, + "grad_norm": 0.5243023633956909, + "learning_rate": 9.89761749333247e-06, + "loss": 0.1184, + "step": 3340 + }, + { + "epoch": 12.786259541984732, + "grad_norm": 0.3315357565879822, + "learning_rate": 9.896504875159122e-06, + "loss": 0.1168, + "step": 3350 + }, + { + "epoch": 12.82442748091603, + "grad_norm": 0.7898366451263428, + "learning_rate": 9.89538630727064e-06, + "loss": 0.1138, + "step": 3360 + }, + { + "epoch": 12.862595419847327, + "grad_norm": 0.3733002841472626, + "learning_rate": 9.89426179102619e-06, + "loss": 0.1168, + "step": 3370 + }, + { + "epoch": 12.900763358778626, + "grad_norm": 0.5741856098175049, + "learning_rate": 9.893131327792166e-06, + "loss": 0.1189, + "step": 3380 + }, + { + "epoch": 12.938931297709924, + "grad_norm": 0.5322036743164062, + "learning_rate": 9.891994918942183e-06, + "loss": 0.1182, + "step": 3390 + }, + { + "epoch": 12.977099236641221, + "grad_norm": 0.3387764096260071, + "learning_rate": 9.890852565857092e-06, + "loss": 0.1201, + "step": 3400 + }, + { + "epoch": 13.01526717557252, + "grad_norm": 0.6571619510650635, + "learning_rate": 9.889704269924955e-06, + "loss": 0.1217, + "step": 3410 + }, + { + "epoch": 13.053435114503817, + "grad_norm": 0.5754358172416687, + "learning_rate": 9.88855003254106e-06, + "loss": 0.1446, + "step": 3420 + }, + { + "epoch": 13.091603053435115, + "grad_norm": 0.5554888844490051, + "learning_rate": 9.887389855107917e-06, + "loss": 0.1107, + "step": 3430 + }, + { + "epoch": 13.129770992366412, + "grad_norm": 0.5900585055351257, + "learning_rate": 9.886223739035249e-06, + "loss": 0.1211, + "step": 3440 + }, + { + "epoch": 13.16793893129771, + "grad_norm": 0.42058074474334717, + "learning_rate": 9.885051685739997e-06, + "loss": 0.1208, + "step": 3450 + }, + { + "epoch": 13.206106870229007, + "grad_norm": 0.959195077419281, + "learning_rate": 9.883873696646316e-06, + "loss": 0.1127, + "step": 3460 + }, + { + "epoch": 13.244274809160306, + "grad_norm": 0.45505401492118835, + "learning_rate": 9.882689773185575e-06, + "loss": 0.1251, + "step": 3470 + }, + { + "epoch": 13.282442748091603, + "grad_norm": 0.7457808256149292, + "learning_rate": 9.881499916796354e-06, + "loss": 0.1189, + "step": 3480 + }, + { + "epoch": 13.320610687022901, + "grad_norm": 0.8850777745246887, + "learning_rate": 9.880304128924435e-06, + "loss": 0.1307, + "step": 3490 + }, + { + "epoch": 13.358778625954198, + "grad_norm": 0.7178351879119873, + "learning_rate": 9.879102411022818e-06, + "loss": 0.1172, + "step": 3500 + }, + { + "epoch": 13.396946564885496, + "grad_norm": 0.8748807311058044, + "learning_rate": 9.877894764551704e-06, + "loss": 0.1167, + "step": 3510 + }, + { + "epoch": 13.435114503816793, + "grad_norm": 1.205231785774231, + "learning_rate": 9.876681190978494e-06, + "loss": 0.1294, + "step": 3520 + }, + { + "epoch": 13.473282442748092, + "grad_norm": 0.4213230013847351, + "learning_rate": 9.875461691777797e-06, + "loss": 0.1131, + "step": 3530 + }, + { + "epoch": 13.511450381679388, + "grad_norm": 0.5431898832321167, + "learning_rate": 9.874236268431417e-06, + "loss": 0.1157, + "step": 3540 + }, + { + "epoch": 13.549618320610687, + "grad_norm": 0.8572608828544617, + "learning_rate": 9.873004922428362e-06, + "loss": 0.1259, + "step": 3550 + }, + { + "epoch": 13.587786259541986, + "grad_norm": 0.44893670082092285, + "learning_rate": 9.87176765526483e-06, + "loss": 0.129, + "step": 3560 + }, + { + "epoch": 13.625954198473282, + "grad_norm": 0.5084060430526733, + "learning_rate": 9.87052446844422e-06, + "loss": 0.1134, + "step": 3570 + }, + { + "epoch": 13.664122137404581, + "grad_norm": 1.1047592163085938, + "learning_rate": 9.869275363477122e-06, + "loss": 0.1176, + "step": 3580 + }, + { + "epoch": 13.702290076335878, + "grad_norm": 0.836169421672821, + "learning_rate": 9.868020341881313e-06, + "loss": 0.1186, + "step": 3590 + }, + { + "epoch": 13.740458015267176, + "grad_norm": 0.7019125819206238, + "learning_rate": 9.866759405181766e-06, + "loss": 0.1199, + "step": 3600 + }, + { + "epoch": 13.778625954198473, + "grad_norm": 0.5258054733276367, + "learning_rate": 9.865492554910634e-06, + "loss": 0.1227, + "step": 3610 + }, + { + "epoch": 13.816793893129772, + "grad_norm": 0.670691967010498, + "learning_rate": 9.864219792607262e-06, + "loss": 0.115, + "step": 3620 + }, + { + "epoch": 13.854961832061068, + "grad_norm": 0.5219152569770813, + "learning_rate": 9.862941119818176e-06, + "loss": 0.1249, + "step": 3630 + }, + { + "epoch": 13.893129770992367, + "grad_norm": 0.9201920032501221, + "learning_rate": 9.861656538097086e-06, + "loss": 0.1211, + "step": 3640 + }, + { + "epoch": 13.931297709923664, + "grad_norm": 0.4263167679309845, + "learning_rate": 9.860366049004878e-06, + "loss": 0.1115, + "step": 3650 + }, + { + "epoch": 13.969465648854962, + "grad_norm": 0.8185678124427795, + "learning_rate": 9.859069654109615e-06, + "loss": 0.133, + "step": 3660 + }, + { + "epoch": 14.007633587786259, + "grad_norm": 0.4073340594768524, + "learning_rate": 9.857767354986545e-06, + "loss": 0.1306, + "step": 3670 + }, + { + "epoch": 14.045801526717558, + "grad_norm": 0.6191373467445374, + "learning_rate": 9.856459153218078e-06, + "loss": 0.114, + "step": 3680 + }, + { + "epoch": 14.083969465648854, + "grad_norm": 0.9513982534408569, + "learning_rate": 9.855145050393808e-06, + "loss": 0.1276, + "step": 3690 + }, + { + "epoch": 14.122137404580153, + "grad_norm": 0.3555004894733429, + "learning_rate": 9.853825048110491e-06, + "loss": 0.1184, + "step": 3700 + }, + { + "epoch": 14.16030534351145, + "grad_norm": 0.4830268919467926, + "learning_rate": 9.852499147972055e-06, + "loss": 0.1063, + "step": 3710 + }, + { + "epoch": 14.198473282442748, + "grad_norm": 0.5373522043228149, + "learning_rate": 9.851167351589593e-06, + "loss": 0.1422, + "step": 3720 + }, + { + "epoch": 14.236641221374045, + "grad_norm": 0.36047184467315674, + "learning_rate": 9.849829660581364e-06, + "loss": 0.1151, + "step": 3730 + }, + { + "epoch": 14.274809160305344, + "grad_norm": 0.3627484142780304, + "learning_rate": 9.848486076572787e-06, + "loss": 0.1273, + "step": 3740 + }, + { + "epoch": 14.312977099236642, + "grad_norm": 0.37799179553985596, + "learning_rate": 9.847136601196445e-06, + "loss": 0.1098, + "step": 3750 + }, + { + "epoch": 14.351145038167939, + "grad_norm": 0.7704446315765381, + "learning_rate": 9.845781236092078e-06, + "loss": 0.1134, + "step": 3760 + }, + { + "epoch": 14.389312977099237, + "grad_norm": 0.36698824167251587, + "learning_rate": 9.844419982906584e-06, + "loss": 0.1395, + "step": 3770 + }, + { + "epoch": 14.427480916030534, + "grad_norm": 0.6548821926116943, + "learning_rate": 9.84305284329401e-06, + "loss": 0.1282, + "step": 3780 + }, + { + "epoch": 14.465648854961833, + "grad_norm": 0.3647122085094452, + "learning_rate": 9.84167981891556e-06, + "loss": 0.1082, + "step": 3790 + }, + { + "epoch": 14.50381679389313, + "grad_norm": 0.334097683429718, + "learning_rate": 9.84030091143959e-06, + "loss": 0.1065, + "step": 3800 + }, + { + "epoch": 14.541984732824428, + "grad_norm": 0.37364843487739563, + "learning_rate": 9.838916122541604e-06, + "loss": 0.1208, + "step": 3810 + }, + { + "epoch": 14.580152671755725, + "grad_norm": 0.40124377608299255, + "learning_rate": 9.837525453904247e-06, + "loss": 0.1315, + "step": 3820 + }, + { + "epoch": 14.618320610687023, + "grad_norm": 0.8429109454154968, + "learning_rate": 9.836128907217315e-06, + "loss": 0.1095, + "step": 3830 + }, + { + "epoch": 14.65648854961832, + "grad_norm": 0.441574364900589, + "learning_rate": 9.834726484177743e-06, + "loss": 0.1197, + "step": 3840 + }, + { + "epoch": 14.694656488549619, + "grad_norm": 0.8704144358634949, + "learning_rate": 9.833318186489608e-06, + "loss": 0.112, + "step": 3850 + }, + { + "epoch": 14.732824427480915, + "grad_norm": 0.5962323546409607, + "learning_rate": 9.831904015864127e-06, + "loss": 0.1105, + "step": 3860 + }, + { + "epoch": 14.770992366412214, + "grad_norm": 0.38795116543769836, + "learning_rate": 9.830483974019644e-06, + "loss": 0.1107, + "step": 3870 + }, + { + "epoch": 14.80916030534351, + "grad_norm": 0.847028911113739, + "learning_rate": 9.829058062681652e-06, + "loss": 0.1209, + "step": 3880 + }, + { + "epoch": 14.84732824427481, + "grad_norm": 0.3294160068035126, + "learning_rate": 9.82762628358276e-06, + "loss": 0.1082, + "step": 3890 + }, + { + "epoch": 14.885496183206106, + "grad_norm": 1.4115614891052246, + "learning_rate": 9.826188638462719e-06, + "loss": 0.1318, + "step": 3900 + }, + { + "epoch": 14.923664122137405, + "grad_norm": 0.47591233253479004, + "learning_rate": 9.824745129068403e-06, + "loss": 0.1409, + "step": 3910 + }, + { + "epoch": 14.961832061068701, + "grad_norm": 0.369427353143692, + "learning_rate": 9.82329575715381e-06, + "loss": 0.1184, + "step": 3920 + }, + { + "epoch": 15.0, + "grad_norm": 0.4399472177028656, + "learning_rate": 9.821840524480066e-06, + "loss": 0.1095, + "step": 3930 + }, + { + "epoch": 15.038167938931299, + "grad_norm": 0.6284782886505127, + "learning_rate": 9.820379432815414e-06, + "loss": 0.1161, + "step": 3940 + }, + { + "epoch": 15.076335877862595, + "grad_norm": 0.4220380485057831, + "learning_rate": 9.81891248393522e-06, + "loss": 0.1151, + "step": 3950 + }, + { + "epoch": 15.114503816793894, + "grad_norm": 0.5931194424629211, + "learning_rate": 9.817439679621964e-06, + "loss": 0.1244, + "step": 3960 + }, + { + "epoch": 15.15267175572519, + "grad_norm": 0.31137576699256897, + "learning_rate": 9.815961021665243e-06, + "loss": 0.1194, + "step": 3970 + }, + { + "epoch": 15.19083969465649, + "grad_norm": 0.506864070892334, + "learning_rate": 9.814476511861764e-06, + "loss": 0.1112, + "step": 3980 + }, + { + "epoch": 15.229007633587786, + "grad_norm": 0.7712796330451965, + "learning_rate": 9.812986152015349e-06, + "loss": 0.1197, + "step": 3990 + }, + { + "epoch": 15.267175572519085, + "grad_norm": 0.49577248096466064, + "learning_rate": 9.811489943936922e-06, + "loss": 0.1148, + "step": 4000 + }, + { + "epoch": 15.305343511450381, + "grad_norm": 0.404431015253067, + "learning_rate": 9.809987889444523e-06, + "loss": 0.1186, + "step": 4010 + }, + { + "epoch": 15.34351145038168, + "grad_norm": 0.44168826937675476, + "learning_rate": 9.808479990363282e-06, + "loss": 0.1277, + "step": 4020 + }, + { + "epoch": 15.381679389312977, + "grad_norm": 0.2900712490081787, + "learning_rate": 9.806966248525447e-06, + "loss": 0.1116, + "step": 4030 + }, + { + "epoch": 15.419847328244275, + "grad_norm": 0.474483460187912, + "learning_rate": 9.805446665770348e-06, + "loss": 0.1249, + "step": 4040 + }, + { + "epoch": 15.458015267175572, + "grad_norm": 0.4446212947368622, + "learning_rate": 9.80392124394443e-06, + "loss": 0.1082, + "step": 4050 + }, + { + "epoch": 15.49618320610687, + "grad_norm": 0.3462320566177368, + "learning_rate": 9.802389984901218e-06, + "loss": 0.1263, + "step": 4060 + }, + { + "epoch": 15.534351145038167, + "grad_norm": 0.481645792722702, + "learning_rate": 9.80085289050134e-06, + "loss": 0.1164, + "step": 4070 + }, + { + "epoch": 15.572519083969466, + "grad_norm": 0.5727097988128662, + "learning_rate": 9.799309962612508e-06, + "loss": 0.1354, + "step": 4080 + }, + { + "epoch": 15.610687022900763, + "grad_norm": 0.9570332765579224, + "learning_rate": 9.797761203109527e-06, + "loss": 0.1192, + "step": 4090 + }, + { + "epoch": 15.648854961832061, + "grad_norm": 0.6497268080711365, + "learning_rate": 9.796206613874283e-06, + "loss": 0.1097, + "step": 4100 + }, + { + "epoch": 15.68702290076336, + "grad_norm": 0.4102814197540283, + "learning_rate": 9.794646196795754e-06, + "loss": 0.1041, + "step": 4110 + }, + { + "epoch": 15.725190839694656, + "grad_norm": 0.41581079363822937, + "learning_rate": 9.793079953769988e-06, + "loss": 0.1218, + "step": 4120 + }, + { + "epoch": 15.763358778625955, + "grad_norm": 0.5571273565292358, + "learning_rate": 9.79150788670012e-06, + "loss": 0.1083, + "step": 4130 + }, + { + "epoch": 15.801526717557252, + "grad_norm": 0.4825478792190552, + "learning_rate": 9.789929997496362e-06, + "loss": 0.1112, + "step": 4140 + }, + { + "epoch": 15.83969465648855, + "grad_norm": 0.35019156336784363, + "learning_rate": 9.788346288075993e-06, + "loss": 0.1187, + "step": 4150 + }, + { + "epoch": 15.877862595419847, + "grad_norm": 0.4018893539905548, + "learning_rate": 9.786756760363374e-06, + "loss": 0.1151, + "step": 4160 + }, + { + "epoch": 15.916030534351146, + "grad_norm": 0.6337506175041199, + "learning_rate": 9.78516141628993e-06, + "loss": 0.1114, + "step": 4170 + }, + { + "epoch": 15.954198473282442, + "grad_norm": 0.3153977394104004, + "learning_rate": 9.783560257794153e-06, + "loss": 0.1217, + "step": 4180 + }, + { + "epoch": 15.992366412213741, + "grad_norm": 0.8760756254196167, + "learning_rate": 9.781953286821604e-06, + "loss": 0.1148, + "step": 4190 + }, + { + "epoch": 16.03053435114504, + "grad_norm": 0.572750985622406, + "learning_rate": 9.7803405053249e-06, + "loss": 0.1161, + "step": 4200 + }, + { + "epoch": 16.068702290076335, + "grad_norm": 0.6838567852973938, + "learning_rate": 9.778721915263729e-06, + "loss": 0.1203, + "step": 4210 + }, + { + "epoch": 16.106870229007633, + "grad_norm": 0.4191221594810486, + "learning_rate": 9.777097518604824e-06, + "loss": 0.1159, + "step": 4220 + }, + { + "epoch": 16.14503816793893, + "grad_norm": 0.8830999135971069, + "learning_rate": 9.775467317321986e-06, + "loss": 0.1169, + "step": 4230 + }, + { + "epoch": 16.18320610687023, + "grad_norm": 0.4778836965560913, + "learning_rate": 9.773831313396056e-06, + "loss": 0.1149, + "step": 4240 + }, + { + "epoch": 16.221374045801525, + "grad_norm": 0.3479765057563782, + "learning_rate": 9.77218950881494e-06, + "loss": 0.1195, + "step": 4250 + }, + { + "epoch": 16.259541984732824, + "grad_norm": 0.39662837982177734, + "learning_rate": 9.770541905573583e-06, + "loss": 0.1275, + "step": 4260 + }, + { + "epoch": 16.297709923664122, + "grad_norm": 0.6865242719650269, + "learning_rate": 9.768888505673976e-06, + "loss": 0.1238, + "step": 4270 + }, + { + "epoch": 16.33587786259542, + "grad_norm": 0.3782191574573517, + "learning_rate": 9.767229311125162e-06, + "loss": 0.1133, + "step": 4280 + }, + { + "epoch": 16.374045801526716, + "grad_norm": 0.588668942451477, + "learning_rate": 9.76556432394321e-06, + "loss": 0.1362, + "step": 4290 + }, + { + "epoch": 16.412213740458014, + "grad_norm": 1.0646703243255615, + "learning_rate": 9.763893546151244e-06, + "loss": 0.1185, + "step": 4300 + }, + { + "epoch": 16.450381679389313, + "grad_norm": 0.4589475989341736, + "learning_rate": 9.762216979779412e-06, + "loss": 0.1173, + "step": 4310 + }, + { + "epoch": 16.48854961832061, + "grad_norm": 1.4795030355453491, + "learning_rate": 9.760534626864902e-06, + "loss": 0.125, + "step": 4320 + }, + { + "epoch": 16.52671755725191, + "grad_norm": 0.6526045799255371, + "learning_rate": 9.758846489451932e-06, + "loss": 0.126, + "step": 4330 + }, + { + "epoch": 16.564885496183205, + "grad_norm": 0.35587215423583984, + "learning_rate": 9.757152569591748e-06, + "loss": 0.1027, + "step": 4340 + }, + { + "epoch": 16.603053435114504, + "grad_norm": 0.8031662106513977, + "learning_rate": 9.755452869342621e-06, + "loss": 0.1163, + "step": 4350 + }, + { + "epoch": 16.641221374045802, + "grad_norm": 0.41244399547576904, + "learning_rate": 9.753747390769848e-06, + "loss": 0.1222, + "step": 4360 + }, + { + "epoch": 16.6793893129771, + "grad_norm": 0.4697263836860657, + "learning_rate": 9.752036135945743e-06, + "loss": 0.1078, + "step": 4370 + }, + { + "epoch": 16.717557251908396, + "grad_norm": 0.6083208322525024, + "learning_rate": 9.75031910694965e-06, + "loss": 0.1151, + "step": 4380 + }, + { + "epoch": 16.755725190839694, + "grad_norm": 0.4035137891769409, + "learning_rate": 9.748596305867913e-06, + "loss": 0.1085, + "step": 4390 + }, + { + "epoch": 16.793893129770993, + "grad_norm": 0.7176216244697571, + "learning_rate": 9.746867734793904e-06, + "loss": 0.1116, + "step": 4400 + }, + { + "epoch": 16.83206106870229, + "grad_norm": 0.4055471122264862, + "learning_rate": 9.745133395827994e-06, + "loss": 0.1248, + "step": 4410 + }, + { + "epoch": 16.870229007633586, + "grad_norm": 0.6174433827400208, + "learning_rate": 9.743393291077571e-06, + "loss": 0.1184, + "step": 4420 + }, + { + "epoch": 16.908396946564885, + "grad_norm": 0.5224699378013611, + "learning_rate": 9.741647422657029e-06, + "loss": 0.1114, + "step": 4430 + }, + { + "epoch": 16.946564885496183, + "grad_norm": 0.4283505976200104, + "learning_rate": 9.739895792687758e-06, + "loss": 0.1203, + "step": 4440 + }, + { + "epoch": 16.984732824427482, + "grad_norm": 1.0380663871765137, + "learning_rate": 9.738138403298158e-06, + "loss": 0.1119, + "step": 4450 + }, + { + "epoch": 17.022900763358777, + "grad_norm": 0.5753864645957947, + "learning_rate": 9.73637525662362e-06, + "loss": 0.1073, + "step": 4460 + }, + { + "epoch": 17.061068702290076, + "grad_norm": 0.8454107046127319, + "learning_rate": 9.734606354806533e-06, + "loss": 0.1148, + "step": 4470 + }, + { + "epoch": 17.099236641221374, + "grad_norm": 0.45036813616752625, + "learning_rate": 9.732831699996282e-06, + "loss": 0.1091, + "step": 4480 + }, + { + "epoch": 17.137404580152673, + "grad_norm": 0.3069184720516205, + "learning_rate": 9.731051294349238e-06, + "loss": 0.1298, + "step": 4490 + }, + { + "epoch": 17.17557251908397, + "grad_norm": 0.5899654626846313, + "learning_rate": 9.729265140028762e-06, + "loss": 0.1212, + "step": 4500 + }, + { + "epoch": 17.213740458015266, + "grad_norm": 0.5663384199142456, + "learning_rate": 9.727473239205202e-06, + "loss": 0.1163, + "step": 4510 + }, + { + "epoch": 17.251908396946565, + "grad_norm": 0.47053390741348267, + "learning_rate": 9.725675594055884e-06, + "loss": 0.1104, + "step": 4520 + }, + { + "epoch": 17.290076335877863, + "grad_norm": 0.5472087264060974, + "learning_rate": 9.723872206765115e-06, + "loss": 0.1186, + "step": 4530 + }, + { + "epoch": 17.328244274809162, + "grad_norm": 0.43163037300109863, + "learning_rate": 9.722063079524185e-06, + "loss": 0.118, + "step": 4540 + }, + { + "epoch": 17.366412213740457, + "grad_norm": 0.3856120705604553, + "learning_rate": 9.720248214531352e-06, + "loss": 0.1052, + "step": 4550 + }, + { + "epoch": 17.404580152671755, + "grad_norm": 0.47452834248542786, + "learning_rate": 9.718427613991848e-06, + "loss": 0.1026, + "step": 4560 + }, + { + "epoch": 17.442748091603054, + "grad_norm": 0.5274608731269836, + "learning_rate": 9.716601280117874e-06, + "loss": 0.118, + "step": 4570 + }, + { + "epoch": 17.480916030534353, + "grad_norm": 1.0746313333511353, + "learning_rate": 9.714769215128597e-06, + "loss": 0.1103, + "step": 4580 + }, + { + "epoch": 17.519083969465647, + "grad_norm": 0.3799738585948944, + "learning_rate": 9.712931421250152e-06, + "loss": 0.1026, + "step": 4590 + }, + { + "epoch": 17.557251908396946, + "grad_norm": 0.36720654368400574, + "learning_rate": 9.711087900715627e-06, + "loss": 0.1121, + "step": 4600 + }, + { + "epoch": 17.595419847328245, + "grad_norm": 0.4583773612976074, + "learning_rate": 9.709238655765079e-06, + "loss": 0.1129, + "step": 4610 + }, + { + "epoch": 17.633587786259543, + "grad_norm": 0.3917942941188812, + "learning_rate": 9.70738368864551e-06, + "loss": 0.1113, + "step": 4620 + }, + { + "epoch": 17.671755725190838, + "grad_norm": 0.4856705367565155, + "learning_rate": 9.705523001610884e-06, + "loss": 0.1059, + "step": 4630 + }, + { + "epoch": 17.709923664122137, + "grad_norm": 0.38836991786956787, + "learning_rate": 9.703656596922107e-06, + "loss": 0.1062, + "step": 4640 + }, + { + "epoch": 17.748091603053435, + "grad_norm": 0.77637779712677, + "learning_rate": 9.70178447684704e-06, + "loss": 0.1125, + "step": 4650 + }, + { + "epoch": 17.786259541984734, + "grad_norm": 1.447717547416687, + "learning_rate": 9.699906643660484e-06, + "loss": 0.1234, + "step": 4660 + }, + { + "epoch": 17.82442748091603, + "grad_norm": 0.4381782114505768, + "learning_rate": 9.698023099644186e-06, + "loss": 0.1179, + "step": 4670 + }, + { + "epoch": 17.862595419847327, + "grad_norm": 0.7676070928573608, + "learning_rate": 9.696133847086824e-06, + "loss": 0.1047, + "step": 4680 + }, + { + "epoch": 17.900763358778626, + "grad_norm": 0.36729001998901367, + "learning_rate": 9.694238888284023e-06, + "loss": 0.1246, + "step": 4690 + }, + { + "epoch": 17.938931297709924, + "grad_norm": 0.7202317714691162, + "learning_rate": 9.692338225538334e-06, + "loss": 0.1234, + "step": 4700 + }, + { + "epoch": 17.977099236641223, + "grad_norm": 0.8629083037376404, + "learning_rate": 9.690431861159242e-06, + "loss": 0.1153, + "step": 4710 + }, + { + "epoch": 18.015267175572518, + "grad_norm": 0.7108657956123352, + "learning_rate": 9.68851979746316e-06, + "loss": 0.1157, + "step": 4720 + }, + { + "epoch": 18.053435114503817, + "grad_norm": 0.3120957612991333, + "learning_rate": 9.686602036773427e-06, + "loss": 0.1134, + "step": 4730 + }, + { + "epoch": 18.091603053435115, + "grad_norm": 0.6133302450180054, + "learning_rate": 9.684678581420302e-06, + "loss": 0.1085, + "step": 4740 + }, + { + "epoch": 18.129770992366414, + "grad_norm": 0.8384220004081726, + "learning_rate": 9.682749433740963e-06, + "loss": 0.1116, + "step": 4750 + }, + { + "epoch": 18.16793893129771, + "grad_norm": 0.4204612672328949, + "learning_rate": 9.680814596079508e-06, + "loss": 0.1129, + "step": 4760 + }, + { + "epoch": 18.206106870229007, + "grad_norm": 0.8686299324035645, + "learning_rate": 9.678874070786945e-06, + "loss": 0.1226, + "step": 4770 + }, + { + "epoch": 18.244274809160306, + "grad_norm": 0.450674444437027, + "learning_rate": 9.676927860221199e-06, + "loss": 0.1189, + "step": 4780 + }, + { + "epoch": 18.282442748091604, + "grad_norm": 1.338226079940796, + "learning_rate": 9.674975966747098e-06, + "loss": 0.1275, + "step": 4790 + }, + { + "epoch": 18.3206106870229, + "grad_norm": 0.4029911160469055, + "learning_rate": 9.673018392736373e-06, + "loss": 0.1108, + "step": 4800 + }, + { + "epoch": 18.358778625954198, + "grad_norm": 0.6675863862037659, + "learning_rate": 9.671055140567667e-06, + "loss": 0.1127, + "step": 4810 + }, + { + "epoch": 18.396946564885496, + "grad_norm": 0.44830095767974854, + "learning_rate": 9.669086212626512e-06, + "loss": 0.1193, + "step": 4820 + }, + { + "epoch": 18.435114503816795, + "grad_norm": 0.5231329202651978, + "learning_rate": 9.667111611305343e-06, + "loss": 0.1326, + "step": 4830 + }, + { + "epoch": 18.47328244274809, + "grad_norm": 0.447273850440979, + "learning_rate": 9.665131339003487e-06, + "loss": 0.1144, + "step": 4840 + }, + { + "epoch": 18.51145038167939, + "grad_norm": 1.005131721496582, + "learning_rate": 9.663145398127158e-06, + "loss": 0.1227, + "step": 4850 + }, + { + "epoch": 18.549618320610687, + "grad_norm": 0.6508563160896301, + "learning_rate": 9.661153791089468e-06, + "loss": 0.117, + "step": 4860 + }, + { + "epoch": 18.587786259541986, + "grad_norm": 0.9407038688659668, + "learning_rate": 9.659156520310402e-06, + "loss": 0.1238, + "step": 4870 + }, + { + "epoch": 18.625954198473284, + "grad_norm": 0.356827050447464, + "learning_rate": 9.657153588216834e-06, + "loss": 0.1107, + "step": 4880 + }, + { + "epoch": 18.66412213740458, + "grad_norm": 0.4456828534603119, + "learning_rate": 9.655144997242516e-06, + "loss": 0.1157, + "step": 4890 + }, + { + "epoch": 18.702290076335878, + "grad_norm": 0.6558853387832642, + "learning_rate": 9.653130749828074e-06, + "loss": 0.1165, + "step": 4900 + }, + { + "epoch": 18.740458015267176, + "grad_norm": 0.5114887952804565, + "learning_rate": 9.651110848421011e-06, + "loss": 0.1238, + "step": 4910 + }, + { + "epoch": 18.778625954198475, + "grad_norm": 0.7068324089050293, + "learning_rate": 9.649085295475695e-06, + "loss": 0.1141, + "step": 4920 + }, + { + "epoch": 18.81679389312977, + "grad_norm": 0.47622790932655334, + "learning_rate": 9.647054093453366e-06, + "loss": 0.1117, + "step": 4930 + }, + { + "epoch": 18.85496183206107, + "grad_norm": 0.320960134267807, + "learning_rate": 9.645017244822124e-06, + "loss": 0.1023, + "step": 4940 + }, + { + "epoch": 18.893129770992367, + "grad_norm": 0.3788800835609436, + "learning_rate": 9.642974752056931e-06, + "loss": 0.119, + "step": 4950 + }, + { + "epoch": 18.931297709923665, + "grad_norm": 1.6596555709838867, + "learning_rate": 9.640926617639614e-06, + "loss": 0.1351, + "step": 4960 + }, + { + "epoch": 18.96946564885496, + "grad_norm": 0.5164493918418884, + "learning_rate": 9.638872844058844e-06, + "loss": 0.1051, + "step": 4970 + }, + { + "epoch": 19.00763358778626, + "grad_norm": 0.41121843457221985, + "learning_rate": 9.636813433810151e-06, + "loss": 0.1129, + "step": 4980 + }, + { + "epoch": 19.045801526717558, + "grad_norm": 0.3111073672771454, + "learning_rate": 9.634748389395914e-06, + "loss": 0.1084, + "step": 4990 + }, + { + "epoch": 19.083969465648856, + "grad_norm": 0.2994742691516876, + "learning_rate": 9.632677713325353e-06, + "loss": 0.1048, + "step": 5000 + }, + { + "epoch": 19.12213740458015, + "grad_norm": 0.32396626472473145, + "learning_rate": 9.63060140811454e-06, + "loss": 0.1049, + "step": 5010 + }, + { + "epoch": 19.16030534351145, + "grad_norm": 0.42831170558929443, + "learning_rate": 9.628519476286379e-06, + "loss": 0.1001, + "step": 5020 + }, + { + "epoch": 19.198473282442748, + "grad_norm": 0.46095895767211914, + "learning_rate": 9.626431920370613e-06, + "loss": 0.1203, + "step": 5030 + }, + { + "epoch": 19.236641221374047, + "grad_norm": 0.40226057171821594, + "learning_rate": 9.624338742903819e-06, + "loss": 0.1088, + "step": 5040 + }, + { + "epoch": 19.274809160305345, + "grad_norm": 0.5237085819244385, + "learning_rate": 9.622239946429407e-06, + "loss": 0.1111, + "step": 5050 + }, + { + "epoch": 19.31297709923664, + "grad_norm": 0.4355655014514923, + "learning_rate": 9.62013553349761e-06, + "loss": 0.1147, + "step": 5060 + }, + { + "epoch": 19.35114503816794, + "grad_norm": 0.7808547616004944, + "learning_rate": 9.61802550666549e-06, + "loss": 0.1117, + "step": 5070 + }, + { + "epoch": 19.389312977099237, + "grad_norm": 0.3424168527126312, + "learning_rate": 9.615909868496928e-06, + "loss": 0.1131, + "step": 5080 + }, + { + "epoch": 19.427480916030536, + "grad_norm": 0.38718533515930176, + "learning_rate": 9.613788621562622e-06, + "loss": 0.113, + "step": 5090 + }, + { + "epoch": 19.46564885496183, + "grad_norm": 0.3584721088409424, + "learning_rate": 9.611661768440092e-06, + "loss": 0.1029, + "step": 5100 + }, + { + "epoch": 19.50381679389313, + "grad_norm": 0.5123441815376282, + "learning_rate": 9.609529311713662e-06, + "loss": 0.1108, + "step": 5110 + }, + { + "epoch": 19.541984732824428, + "grad_norm": 0.37226274609565735, + "learning_rate": 9.607391253974467e-06, + "loss": 0.1103, + "step": 5120 + }, + { + "epoch": 19.580152671755727, + "grad_norm": 0.542453408241272, + "learning_rate": 9.605247597820448e-06, + "loss": 0.1119, + "step": 5130 + }, + { + "epoch": 19.61832061068702, + "grad_norm": 0.3321150839328766, + "learning_rate": 9.603098345856354e-06, + "loss": 0.1103, + "step": 5140 + }, + { + "epoch": 19.65648854961832, + "grad_norm": 0.4373333752155304, + "learning_rate": 9.600943500693724e-06, + "loss": 0.1147, + "step": 5150 + }, + { + "epoch": 19.69465648854962, + "grad_norm": 0.393868625164032, + "learning_rate": 9.598783064950902e-06, + "loss": 0.1079, + "step": 5160 + }, + { + "epoch": 19.732824427480917, + "grad_norm": 0.5854748487472534, + "learning_rate": 9.596617041253017e-06, + "loss": 0.116, + "step": 5170 + }, + { + "epoch": 19.770992366412212, + "grad_norm": 0.33527272939682007, + "learning_rate": 9.594445432231996e-06, + "loss": 0.1071, + "step": 5180 + }, + { + "epoch": 19.80916030534351, + "grad_norm": 0.38696524500846863, + "learning_rate": 9.592268240526546e-06, + "loss": 0.1237, + "step": 5190 + }, + { + "epoch": 19.84732824427481, + "grad_norm": 0.5272545218467712, + "learning_rate": 9.590085468782162e-06, + "loss": 0.122, + "step": 5200 + }, + { + "epoch": 19.885496183206108, + "grad_norm": 0.3966885209083557, + "learning_rate": 9.587897119651115e-06, + "loss": 0.1165, + "step": 5210 + }, + { + "epoch": 19.923664122137403, + "grad_norm": 0.6632593870162964, + "learning_rate": 9.585703195792459e-06, + "loss": 0.1279, + "step": 5220 + }, + { + "epoch": 19.9618320610687, + "grad_norm": 0.41774246096611023, + "learning_rate": 9.583503699872017e-06, + "loss": 0.1043, + "step": 5230 + }, + { + "epoch": 20.0, + "grad_norm": 0.38030463457107544, + "learning_rate": 9.581298634562382e-06, + "loss": 0.1124, + "step": 5240 + }, + { + "epoch": 20.0381679389313, + "grad_norm": 0.314238965511322, + "learning_rate": 9.579088002542918e-06, + "loss": 0.1062, + "step": 5250 + }, + { + "epoch": 20.076335877862597, + "grad_norm": 0.3432968556880951, + "learning_rate": 9.57687180649975e-06, + "loss": 0.1182, + "step": 5260 + }, + { + "epoch": 20.114503816793892, + "grad_norm": 0.7172240614891052, + "learning_rate": 9.57465004912577e-06, + "loss": 0.1114, + "step": 5270 + }, + { + "epoch": 20.15267175572519, + "grad_norm": 0.3679429590702057, + "learning_rate": 9.572422733120614e-06, + "loss": 0.1146, + "step": 5280 + }, + { + "epoch": 20.19083969465649, + "grad_norm": 0.8204743266105652, + "learning_rate": 9.57018986119069e-06, + "loss": 0.1215, + "step": 5290 + }, + { + "epoch": 20.229007633587788, + "grad_norm": 0.5028970241546631, + "learning_rate": 9.56795143604914e-06, + "loss": 0.1157, + "step": 5300 + }, + { + "epoch": 20.267175572519083, + "grad_norm": 0.5000373125076294, + "learning_rate": 9.56570746041587e-06, + "loss": 0.116, + "step": 5310 + }, + { + "epoch": 20.30534351145038, + "grad_norm": 0.36823147535324097, + "learning_rate": 9.563457937017514e-06, + "loss": 0.1211, + "step": 5320 + }, + { + "epoch": 20.34351145038168, + "grad_norm": 0.39545708894729614, + "learning_rate": 9.56120286858746e-06, + "loss": 0.1164, + "step": 5330 + }, + { + "epoch": 20.38167938931298, + "grad_norm": 1.3910619020462036, + "learning_rate": 9.558942257865829e-06, + "loss": 0.124, + "step": 5340 + }, + { + "epoch": 20.419847328244273, + "grad_norm": 0.7138753533363342, + "learning_rate": 9.556676107599472e-06, + "loss": 0.1313, + "step": 5350 + }, + { + "epoch": 20.458015267175572, + "grad_norm": 0.36400383710861206, + "learning_rate": 9.554404420541979e-06, + "loss": 0.1193, + "step": 5360 + }, + { + "epoch": 20.49618320610687, + "grad_norm": 0.42554593086242676, + "learning_rate": 9.552127199453662e-06, + "loss": 0.1122, + "step": 5370 + }, + { + "epoch": 20.53435114503817, + "grad_norm": 0.3719650208950043, + "learning_rate": 9.549844447101559e-06, + "loss": 0.1112, + "step": 5380 + }, + { + "epoch": 20.572519083969464, + "grad_norm": 0.37084949016571045, + "learning_rate": 9.547556166259433e-06, + "loss": 0.1121, + "step": 5390 + }, + { + "epoch": 20.610687022900763, + "grad_norm": 0.4586028456687927, + "learning_rate": 9.545262359707756e-06, + "loss": 0.1044, + "step": 5400 + }, + { + "epoch": 20.64885496183206, + "grad_norm": 0.5926239490509033, + "learning_rate": 9.542963030233725e-06, + "loss": 0.1071, + "step": 5410 + }, + { + "epoch": 20.68702290076336, + "grad_norm": 0.4982014000415802, + "learning_rate": 9.540658180631236e-06, + "loss": 0.1117, + "step": 5420 + }, + { + "epoch": 20.725190839694655, + "grad_norm": 0.6193251013755798, + "learning_rate": 9.538347813700903e-06, + "loss": 0.1083, + "step": 5430 + }, + { + "epoch": 20.763358778625953, + "grad_norm": 0.3325391411781311, + "learning_rate": 9.536031932250037e-06, + "loss": 0.135, + "step": 5440 + }, + { + "epoch": 20.801526717557252, + "grad_norm": 0.2922006845474243, + "learning_rate": 9.533710539092653e-06, + "loss": 0.1128, + "step": 5450 + }, + { + "epoch": 20.83969465648855, + "grad_norm": 0.6511474251747131, + "learning_rate": 9.531383637049465e-06, + "loss": 0.119, + "step": 5460 + }, + { + "epoch": 20.87786259541985, + "grad_norm": 0.7577188014984131, + "learning_rate": 9.529051228947875e-06, + "loss": 0.1144, + "step": 5470 + }, + { + "epoch": 20.916030534351144, + "grad_norm": 0.8727062940597534, + "learning_rate": 9.52671331762198e-06, + "loss": 0.1243, + "step": 5480 + }, + { + "epoch": 20.954198473282442, + "grad_norm": 0.4881748557090759, + "learning_rate": 9.524369905912566e-06, + "loss": 0.1183, + "step": 5490 + }, + { + "epoch": 20.99236641221374, + "grad_norm": 1.3852527141571045, + "learning_rate": 9.522020996667092e-06, + "loss": 0.109, + "step": 5500 + }, + { + "epoch": 21.03053435114504, + "grad_norm": 0.3250916302204132, + "learning_rate": 9.51966659273971e-06, + "loss": 0.1099, + "step": 5510 + }, + { + "epoch": 21.068702290076335, + "grad_norm": 0.3848394453525543, + "learning_rate": 9.517306696991241e-06, + "loss": 0.1153, + "step": 5520 + }, + { + "epoch": 21.106870229007633, + "grad_norm": 0.2975100576877594, + "learning_rate": 9.51494131228918e-06, + "loss": 0.1031, + "step": 5530 + }, + { + "epoch": 21.14503816793893, + "grad_norm": 0.5803893804550171, + "learning_rate": 9.512570441507696e-06, + "loss": 0.1136, + "step": 5540 + }, + { + "epoch": 21.18320610687023, + "grad_norm": 0.5576812028884888, + "learning_rate": 9.510194087527615e-06, + "loss": 0.1202, + "step": 5550 + }, + { + "epoch": 21.221374045801525, + "grad_norm": 0.5307789444923401, + "learning_rate": 9.507812253236436e-06, + "loss": 0.1223, + "step": 5560 + }, + { + "epoch": 21.259541984732824, + "grad_norm": 0.6918527483940125, + "learning_rate": 9.50542494152831e-06, + "loss": 0.1177, + "step": 5570 + }, + { + "epoch": 21.297709923664122, + "grad_norm": 0.489572674036026, + "learning_rate": 9.503032155304046e-06, + "loss": 0.1104, + "step": 5580 + }, + { + "epoch": 21.33587786259542, + "grad_norm": 0.32068297266960144, + "learning_rate": 9.500633897471105e-06, + "loss": 0.109, + "step": 5590 + }, + { + "epoch": 21.374045801526716, + "grad_norm": 0.5839059948921204, + "learning_rate": 9.498230170943597e-06, + "loss": 0.1197, + "step": 5600 + }, + { + "epoch": 21.412213740458014, + "grad_norm": 0.8069677352905273, + "learning_rate": 9.495820978642275e-06, + "loss": 0.1146, + "step": 5610 + }, + { + "epoch": 21.450381679389313, + "grad_norm": 0.5091967582702637, + "learning_rate": 9.493406323494537e-06, + "loss": 0.1342, + "step": 5620 + }, + { + "epoch": 21.48854961832061, + "grad_norm": 0.44150835275650024, + "learning_rate": 9.490986208434415e-06, + "loss": 0.1161, + "step": 5630 + }, + { + "epoch": 21.52671755725191, + "grad_norm": 0.9089648127555847, + "learning_rate": 9.488560636402577e-06, + "loss": 0.1134, + "step": 5640 + }, + { + "epoch": 21.564885496183205, + "grad_norm": 0.6889301538467407, + "learning_rate": 9.486129610346322e-06, + "loss": 0.1129, + "step": 5650 + }, + { + "epoch": 21.603053435114504, + "grad_norm": 0.3800375461578369, + "learning_rate": 9.483693133219576e-06, + "loss": 0.1042, + "step": 5660 + }, + { + "epoch": 21.641221374045802, + "grad_norm": 0.4531780481338501, + "learning_rate": 9.481251207982888e-06, + "loss": 0.1137, + "step": 5670 + }, + { + "epoch": 21.6793893129771, + "grad_norm": 0.4850050210952759, + "learning_rate": 9.47880383760343e-06, + "loss": 0.115, + "step": 5680 + }, + { + "epoch": 21.717557251908396, + "grad_norm": 0.41983506083488464, + "learning_rate": 9.476351025054984e-06, + "loss": 0.1146, + "step": 5690 + }, + { + "epoch": 21.755725190839694, + "grad_norm": 0.47073015570640564, + "learning_rate": 9.473892773317952e-06, + "loss": 0.1142, + "step": 5700 + }, + { + "epoch": 21.793893129770993, + "grad_norm": 0.7770823836326599, + "learning_rate": 9.471429085379339e-06, + "loss": 0.1043, + "step": 5710 + }, + { + "epoch": 21.83206106870229, + "grad_norm": 0.7332136631011963, + "learning_rate": 9.468959964232757e-06, + "loss": 0.1048, + "step": 5720 + }, + { + "epoch": 21.870229007633586, + "grad_norm": 0.4616805911064148, + "learning_rate": 9.466485412878425e-06, + "loss": 0.1118, + "step": 5730 + }, + { + "epoch": 21.908396946564885, + "grad_norm": 0.3106837868690491, + "learning_rate": 9.464005434323154e-06, + "loss": 0.1116, + "step": 5740 + }, + { + "epoch": 21.946564885496183, + "grad_norm": 0.606867790222168, + "learning_rate": 9.461520031580352e-06, + "loss": 0.1116, + "step": 5750 + }, + { + "epoch": 21.984732824427482, + "grad_norm": 0.3522845506668091, + "learning_rate": 9.459029207670018e-06, + "loss": 0.1007, + "step": 5760 + }, + { + "epoch": 22.022900763358777, + "grad_norm": 0.5821218490600586, + "learning_rate": 9.456532965618738e-06, + "loss": 0.1169, + "step": 5770 + }, + { + "epoch": 22.061068702290076, + "grad_norm": 0.6198912262916565, + "learning_rate": 9.454031308459681e-06, + "loss": 0.1144, + "step": 5780 + }, + { + "epoch": 22.099236641221374, + "grad_norm": 0.396658331155777, + "learning_rate": 9.451524239232596e-06, + "loss": 0.1117, + "step": 5790 + }, + { + "epoch": 22.137404580152673, + "grad_norm": 0.4272678792476654, + "learning_rate": 9.44901176098381e-06, + "loss": 0.13, + "step": 5800 + }, + { + "epoch": 22.17557251908397, + "grad_norm": 0.5688971877098083, + "learning_rate": 9.446493876766219e-06, + "loss": 0.1147, + "step": 5810 + }, + { + "epoch": 22.213740458015266, + "grad_norm": 0.8139527440071106, + "learning_rate": 9.44397058963929e-06, + "loss": 0.1215, + "step": 5820 + }, + { + "epoch": 22.251908396946565, + "grad_norm": 0.9012984037399292, + "learning_rate": 9.441441902669057e-06, + "loss": 0.1144, + "step": 5830 + }, + { + "epoch": 22.290076335877863, + "grad_norm": 0.38339322805404663, + "learning_rate": 9.43890781892811e-06, + "loss": 0.1202, + "step": 5840 + }, + { + "epoch": 22.328244274809162, + "grad_norm": 0.34039992094039917, + "learning_rate": 9.436368341495603e-06, + "loss": 0.118, + "step": 5850 + }, + { + "epoch": 22.366412213740457, + "grad_norm": 1.0768190622329712, + "learning_rate": 9.43382347345724e-06, + "loss": 0.1057, + "step": 5860 + }, + { + "epoch": 22.404580152671755, + "grad_norm": 0.682651937007904, + "learning_rate": 9.431273217905272e-06, + "loss": 0.1111, + "step": 5870 + }, + { + "epoch": 22.442748091603054, + "grad_norm": 0.7133651375770569, + "learning_rate": 9.428717577938505e-06, + "loss": 0.105, + "step": 5880 + }, + { + "epoch": 22.480916030534353, + "grad_norm": 0.4730619192123413, + "learning_rate": 9.426156556662276e-06, + "loss": 0.1056, + "step": 5890 + }, + { + "epoch": 22.519083969465647, + "grad_norm": 0.37307435274124146, + "learning_rate": 9.423590157188475e-06, + "loss": 0.1122, + "step": 5900 + }, + { + "epoch": 22.557251908396946, + "grad_norm": 0.3784120976924896, + "learning_rate": 9.421018382635514e-06, + "loss": 0.1327, + "step": 5910 + }, + { + "epoch": 22.595419847328245, + "grad_norm": 0.6031510233879089, + "learning_rate": 9.418441236128344e-06, + "loss": 0.1074, + "step": 5920 + }, + { + "epoch": 22.633587786259543, + "grad_norm": 0.6121441721916199, + "learning_rate": 9.41585872079844e-06, + "loss": 0.1079, + "step": 5930 + }, + { + "epoch": 22.671755725190838, + "grad_norm": 0.5662669539451599, + "learning_rate": 9.413270839783802e-06, + "loss": 0.1096, + "step": 5940 + }, + { + "epoch": 22.709923664122137, + "grad_norm": 0.4588741064071655, + "learning_rate": 9.41067759622895e-06, + "loss": 0.1125, + "step": 5950 + }, + { + "epoch": 22.748091603053435, + "grad_norm": 0.5805822014808655, + "learning_rate": 9.408078993284917e-06, + "loss": 0.1143, + "step": 5960 + }, + { + "epoch": 22.786259541984734, + "grad_norm": 0.5425497889518738, + "learning_rate": 9.405475034109254e-06, + "loss": 0.1006, + "step": 5970 + }, + { + "epoch": 22.82442748091603, + "grad_norm": 0.43856534361839294, + "learning_rate": 9.402865721866017e-06, + "loss": 0.1156, + "step": 5980 + }, + { + "epoch": 22.862595419847327, + "grad_norm": 0.6085226535797119, + "learning_rate": 9.400251059725762e-06, + "loss": 0.1173, + "step": 5990 + }, + { + "epoch": 22.900763358778626, + "grad_norm": 0.41723042726516724, + "learning_rate": 9.397631050865554e-06, + "loss": 0.115, + "step": 6000 + }, + { + "epoch": 22.938931297709924, + "grad_norm": 0.47364330291748047, + "learning_rate": 9.395005698468948e-06, + "loss": 0.1096, + "step": 6010 + }, + { + "epoch": 22.977099236641223, + "grad_norm": 0.5531240701675415, + "learning_rate": 9.392375005726e-06, + "loss": 0.1115, + "step": 6020 + }, + { + "epoch": 23.015267175572518, + "grad_norm": 1.9053860902786255, + "learning_rate": 9.389738975833243e-06, + "loss": 0.1107, + "step": 6030 + }, + { + "epoch": 23.053435114503817, + "grad_norm": 0.4159781336784363, + "learning_rate": 9.387097611993707e-06, + "loss": 0.1061, + "step": 6040 + }, + { + "epoch": 23.091603053435115, + "grad_norm": 0.37060031294822693, + "learning_rate": 9.384450917416894e-06, + "loss": 0.1087, + "step": 6050 + }, + { + "epoch": 23.129770992366414, + "grad_norm": 0.3280550241470337, + "learning_rate": 9.381798895318792e-06, + "loss": 0.1082, + "step": 6060 + }, + { + "epoch": 23.16793893129771, + "grad_norm": 0.6817305088043213, + "learning_rate": 9.379141548921855e-06, + "loss": 0.1084, + "step": 6070 + }, + { + "epoch": 23.206106870229007, + "grad_norm": 1.089829683303833, + "learning_rate": 9.376478881455008e-06, + "loss": 0.1191, + "step": 6080 + }, + { + "epoch": 23.244274809160306, + "grad_norm": 0.514206051826477, + "learning_rate": 9.373810896153647e-06, + "loss": 0.108, + "step": 6090 + }, + { + "epoch": 23.282442748091604, + "grad_norm": 0.3831291198730469, + "learning_rate": 9.371137596259623e-06, + "loss": 0.1141, + "step": 6100 + }, + { + "epoch": 23.3206106870229, + "grad_norm": 0.4334690570831299, + "learning_rate": 9.368458985021249e-06, + "loss": 0.11, + "step": 6110 + }, + { + "epoch": 23.358778625954198, + "grad_norm": 0.43179851770401, + "learning_rate": 9.365775065693288e-06, + "loss": 0.1128, + "step": 6120 + }, + { + "epoch": 23.396946564885496, + "grad_norm": 0.43169301748275757, + "learning_rate": 9.363085841536958e-06, + "loss": 0.1113, + "step": 6130 + }, + { + "epoch": 23.435114503816795, + "grad_norm": 0.6322646737098694, + "learning_rate": 9.360391315819917e-06, + "loss": 0.1113, + "step": 6140 + }, + { + "epoch": 23.47328244274809, + "grad_norm": 0.37557345628738403, + "learning_rate": 9.35769149181627e-06, + "loss": 0.1107, + "step": 6150 + }, + { + "epoch": 23.51145038167939, + "grad_norm": 0.48005855083465576, + "learning_rate": 9.354986372806557e-06, + "loss": 0.1067, + "step": 6160 + }, + { + "epoch": 23.549618320610687, + "grad_norm": 0.5975325107574463, + "learning_rate": 9.352275962077752e-06, + "loss": 0.1199, + "step": 6170 + }, + { + "epoch": 23.587786259541986, + "grad_norm": 0.49410223960876465, + "learning_rate": 9.349560262923262e-06, + "loss": 0.1296, + "step": 6180 + }, + { + "epoch": 23.625954198473284, + "grad_norm": 0.3413020670413971, + "learning_rate": 9.346839278642915e-06, + "loss": 0.1102, + "step": 6190 + }, + { + "epoch": 23.66412213740458, + "grad_norm": 0.43629974126815796, + "learning_rate": 9.344113012542964e-06, + "loss": 0.1163, + "step": 6200 + }, + { + "epoch": 23.702290076335878, + "grad_norm": 0.4831625521183014, + "learning_rate": 9.341381467936079e-06, + "loss": 0.1056, + "step": 6210 + }, + { + "epoch": 23.740458015267176, + "grad_norm": 0.4807736575603485, + "learning_rate": 9.338644648141347e-06, + "loss": 0.1002, + "step": 6220 + }, + { + "epoch": 23.778625954198475, + "grad_norm": 0.6690664887428284, + "learning_rate": 9.335902556484258e-06, + "loss": 0.112, + "step": 6230 + }, + { + "epoch": 23.81679389312977, + "grad_norm": 0.7592681646347046, + "learning_rate": 9.333155196296712e-06, + "loss": 0.1051, + "step": 6240 + }, + { + "epoch": 23.85496183206107, + "grad_norm": 0.3155533969402313, + "learning_rate": 9.330402570917017e-06, + "loss": 0.1104, + "step": 6250 + }, + { + "epoch": 23.893129770992367, + "grad_norm": 0.43112990260124207, + "learning_rate": 9.327644683689866e-06, + "loss": 0.1098, + "step": 6260 + }, + { + "epoch": 23.931297709923665, + "grad_norm": 0.28097161650657654, + "learning_rate": 9.324881537966355e-06, + "loss": 0.1086, + "step": 6270 + }, + { + "epoch": 23.96946564885496, + "grad_norm": 0.48976218700408936, + "learning_rate": 9.322113137103964e-06, + "loss": 0.1106, + "step": 6280 + }, + { + "epoch": 24.00763358778626, + "grad_norm": 0.5290048718452454, + "learning_rate": 9.319339484466565e-06, + "loss": 0.1065, + "step": 6290 + }, + { + "epoch": 24.045801526717558, + "grad_norm": 0.48621800541877747, + "learning_rate": 9.316560583424404e-06, + "loss": 0.106, + "step": 6300 + }, + { + "epoch": 24.083969465648856, + "grad_norm": 0.3502595126628876, + "learning_rate": 9.313776437354109e-06, + "loss": 0.105, + "step": 6310 + }, + { + "epoch": 24.12213740458015, + "grad_norm": 0.3239123225212097, + "learning_rate": 9.310987049638681e-06, + "loss": 0.1121, + "step": 6320 + }, + { + "epoch": 24.16030534351145, + "grad_norm": 0.38720473647117615, + "learning_rate": 9.308192423667486e-06, + "loss": 0.1035, + "step": 6330 + }, + { + "epoch": 24.198473282442748, + "grad_norm": 0.382134348154068, + "learning_rate": 9.305392562836262e-06, + "loss": 0.1073, + "step": 6340 + }, + { + "epoch": 24.236641221374047, + "grad_norm": 0.37410208582878113, + "learning_rate": 9.302587470547101e-06, + "loss": 0.1247, + "step": 6350 + }, + { + "epoch": 24.274809160305345, + "grad_norm": 0.3606686592102051, + "learning_rate": 9.299777150208456e-06, + "loss": 0.1206, + "step": 6360 + }, + { + "epoch": 24.31297709923664, + "grad_norm": 0.5555996894836426, + "learning_rate": 9.296961605235133e-06, + "loss": 0.1219, + "step": 6370 + }, + { + "epoch": 24.35114503816794, + "grad_norm": 0.40895533561706543, + "learning_rate": 9.29414083904828e-06, + "loss": 0.1151, + "step": 6380 + }, + { + "epoch": 24.389312977099237, + "grad_norm": 0.3560793101787567, + "learning_rate": 9.291314855075397e-06, + "loss": 0.1171, + "step": 6390 + }, + { + "epoch": 24.427480916030536, + "grad_norm": 0.583595871925354, + "learning_rate": 9.288483656750322e-06, + "loss": 0.1203, + "step": 6400 + }, + { + "epoch": 24.46564885496183, + "grad_norm": 0.3709441125392914, + "learning_rate": 9.285647247513225e-06, + "loss": 0.1087, + "step": 6410 + }, + { + "epoch": 24.50381679389313, + "grad_norm": 0.5347302556037903, + "learning_rate": 9.282805630810614e-06, + "loss": 0.1089, + "step": 6420 + }, + { + "epoch": 24.541984732824428, + "grad_norm": 0.9013820290565491, + "learning_rate": 9.279958810095317e-06, + "loss": 0.1124, + "step": 6430 + }, + { + "epoch": 24.580152671755727, + "grad_norm": 0.32639890909194946, + "learning_rate": 9.277106788826494e-06, + "loss": 0.1224, + "step": 6440 + }, + { + "epoch": 24.61832061068702, + "grad_norm": 0.462785542011261, + "learning_rate": 9.274249570469618e-06, + "loss": 0.1157, + "step": 6450 + }, + { + "epoch": 24.65648854961832, + "grad_norm": 0.6265701651573181, + "learning_rate": 9.271387158496477e-06, + "loss": 0.1041, + "step": 6460 + }, + { + "epoch": 24.69465648854962, + "grad_norm": 0.4119669198989868, + "learning_rate": 9.268519556385173e-06, + "loss": 0.1091, + "step": 6470 + }, + { + "epoch": 24.732824427480917, + "grad_norm": 0.2758201062679291, + "learning_rate": 9.265646767620113e-06, + "loss": 0.1023, + "step": 6480 + }, + { + "epoch": 24.770992366412212, + "grad_norm": 0.4721677601337433, + "learning_rate": 9.262768795692006e-06, + "loss": 0.1004, + "step": 6490 + }, + { + "epoch": 24.80916030534351, + "grad_norm": 0.4377496838569641, + "learning_rate": 9.259885644097861e-06, + "loss": 0.1251, + "step": 6500 + }, + { + "epoch": 24.84732824427481, + "grad_norm": 0.3082864284515381, + "learning_rate": 9.256997316340976e-06, + "loss": 0.1116, + "step": 6510 + }, + { + "epoch": 24.885496183206108, + "grad_norm": 0.36191001534461975, + "learning_rate": 9.254103815930944e-06, + "loss": 0.1164, + "step": 6520 + }, + { + "epoch": 24.923664122137403, + "grad_norm": 0.34867507219314575, + "learning_rate": 9.251205146383637e-06, + "loss": 0.1155, + "step": 6530 + }, + { + "epoch": 24.9618320610687, + "grad_norm": 0.665786862373352, + "learning_rate": 9.248301311221216e-06, + "loss": 0.1075, + "step": 6540 + }, + { + "epoch": 25.0, + "grad_norm": 0.5270240306854248, + "learning_rate": 9.245392313972116e-06, + "loss": 0.112, + "step": 6550 + }, + { + "epoch": 25.0381679389313, + "grad_norm": 0.5124590396881104, + "learning_rate": 9.242478158171038e-06, + "loss": 0.1091, + "step": 6560 + }, + { + "epoch": 25.076335877862597, + "grad_norm": 0.6183871626853943, + "learning_rate": 9.239558847358959e-06, + "loss": 0.1148, + "step": 6570 + }, + { + "epoch": 25.114503816793892, + "grad_norm": 0.4855692982673645, + "learning_rate": 9.236634385083115e-06, + "loss": 0.1086, + "step": 6580 + }, + { + "epoch": 25.15267175572519, + "grad_norm": 0.37100329995155334, + "learning_rate": 9.233704774897006e-06, + "loss": 0.1123, + "step": 6590 + }, + { + "epoch": 25.19083969465649, + "grad_norm": 0.5242245197296143, + "learning_rate": 9.230770020360383e-06, + "loss": 0.1155, + "step": 6600 + }, + { + "epoch": 25.229007633587788, + "grad_norm": 0.36837512254714966, + "learning_rate": 9.227830125039248e-06, + "loss": 0.1312, + "step": 6610 + }, + { + "epoch": 25.267175572519083, + "grad_norm": 0.4537639915943146, + "learning_rate": 9.224885092505853e-06, + "loss": 0.1111, + "step": 6620 + }, + { + "epoch": 25.30534351145038, + "grad_norm": 0.42237746715545654, + "learning_rate": 9.22193492633869e-06, + "loss": 0.1185, + "step": 6630 + }, + { + "epoch": 25.34351145038168, + "grad_norm": 0.6770774722099304, + "learning_rate": 9.21897963012249e-06, + "loss": 0.1094, + "step": 6640 + }, + { + "epoch": 25.38167938931298, + "grad_norm": 0.5275667905807495, + "learning_rate": 9.216019207448216e-06, + "loss": 0.1112, + "step": 6650 + }, + { + "epoch": 25.419847328244273, + "grad_norm": 0.5454118251800537, + "learning_rate": 9.213053661913061e-06, + "loss": 0.108, + "step": 6660 + }, + { + "epoch": 25.458015267175572, + "grad_norm": 0.4298250079154968, + "learning_rate": 9.210082997120439e-06, + "loss": 0.108, + "step": 6670 + }, + { + "epoch": 25.49618320610687, + "grad_norm": 0.5686354637145996, + "learning_rate": 9.207107216679994e-06, + "loss": 0.121, + "step": 6680 + }, + { + "epoch": 25.53435114503817, + "grad_norm": 0.7463754415512085, + "learning_rate": 9.204126324207575e-06, + "loss": 0.1155, + "step": 6690 + }, + { + "epoch": 25.572519083969464, + "grad_norm": 0.9898240566253662, + "learning_rate": 9.201140323325248e-06, + "loss": 0.1226, + "step": 6700 + }, + { + "epoch": 25.610687022900763, + "grad_norm": 0.40233856439590454, + "learning_rate": 9.198149217661287e-06, + "loss": 0.1073, + "step": 6710 + }, + { + "epoch": 25.64885496183206, + "grad_norm": 0.44068315625190735, + "learning_rate": 9.195153010850166e-06, + "loss": 0.1161, + "step": 6720 + }, + { + "epoch": 25.68702290076336, + "grad_norm": 0.5321930050849915, + "learning_rate": 9.192151706532562e-06, + "loss": 0.1041, + "step": 6730 + }, + { + "epoch": 25.725190839694655, + "grad_norm": 0.34386226534843445, + "learning_rate": 9.189145308355339e-06, + "loss": 0.111, + "step": 6740 + }, + { + "epoch": 25.763358778625953, + "grad_norm": 0.9037399291992188, + "learning_rate": 9.186133819971556e-06, + "loss": 0.1146, + "step": 6750 + }, + { + "epoch": 25.801526717557252, + "grad_norm": 0.2989799976348877, + "learning_rate": 9.183117245040455e-06, + "loss": 0.1291, + "step": 6760 + }, + { + "epoch": 25.83969465648855, + "grad_norm": 0.5754449367523193, + "learning_rate": 9.18009558722746e-06, + "loss": 0.1061, + "step": 6770 + }, + { + "epoch": 25.87786259541985, + "grad_norm": 0.3005794882774353, + "learning_rate": 9.177068850204167e-06, + "loss": 0.1062, + "step": 6780 + }, + { + "epoch": 25.916030534351144, + "grad_norm": 0.466094046831131, + "learning_rate": 9.174037037648351e-06, + "loss": 0.119, + "step": 6790 + }, + { + "epoch": 25.954198473282442, + "grad_norm": 0.44860929250717163, + "learning_rate": 9.171000153243948e-06, + "loss": 0.1121, + "step": 6800 + }, + { + "epoch": 25.99236641221374, + "grad_norm": 0.6850860714912415, + "learning_rate": 9.167958200681058e-06, + "loss": 0.1133, + "step": 6810 + }, + { + "epoch": 26.03053435114504, + "grad_norm": 0.5172090530395508, + "learning_rate": 9.164911183655943e-06, + "loss": 0.1096, + "step": 6820 + }, + { + "epoch": 26.068702290076335, + "grad_norm": 0.28496426343917847, + "learning_rate": 9.161859105871013e-06, + "loss": 0.1001, + "step": 6830 + }, + { + "epoch": 26.106870229007633, + "grad_norm": 0.4888004958629608, + "learning_rate": 9.158801971034832e-06, + "loss": 0.0996, + "step": 6840 + }, + { + "epoch": 26.14503816793893, + "grad_norm": 0.4661968946456909, + "learning_rate": 9.155739782862107e-06, + "loss": 0.1062, + "step": 6850 + }, + { + "epoch": 26.18320610687023, + "grad_norm": 0.6031951308250427, + "learning_rate": 9.152672545073687e-06, + "loss": 0.1138, + "step": 6860 + }, + { + "epoch": 26.221374045801525, + "grad_norm": 0.35957977175712585, + "learning_rate": 9.149600261396552e-06, + "loss": 0.1061, + "step": 6870 + }, + { + "epoch": 26.259541984732824, + "grad_norm": 0.3762602210044861, + "learning_rate": 9.146522935563816e-06, + "loss": 0.1143, + "step": 6880 + }, + { + "epoch": 26.297709923664122, + "grad_norm": 0.763342559337616, + "learning_rate": 9.143440571314723e-06, + "loss": 0.1143, + "step": 6890 + }, + { + "epoch": 26.33587786259542, + "grad_norm": 0.331662654876709, + "learning_rate": 9.140353172394637e-06, + "loss": 0.1009, + "step": 6900 + }, + { + "epoch": 26.374045801526716, + "grad_norm": 0.6234320402145386, + "learning_rate": 9.137260742555033e-06, + "loss": 0.1046, + "step": 6910 + }, + { + "epoch": 26.412213740458014, + "grad_norm": 0.2962917685508728, + "learning_rate": 9.134163285553511e-06, + "loss": 0.1058, + "step": 6920 + }, + { + "epoch": 26.450381679389313, + "grad_norm": 0.43455183506011963, + "learning_rate": 9.13106080515377e-06, + "loss": 0.116, + "step": 6930 + }, + { + "epoch": 26.48854961832061, + "grad_norm": 0.4187309145927429, + "learning_rate": 9.127953305125618e-06, + "loss": 0.1063, + "step": 6940 + }, + { + "epoch": 26.52671755725191, + "grad_norm": 0.5334097146987915, + "learning_rate": 9.124840789244958e-06, + "loss": 0.1191, + "step": 6950 + }, + { + "epoch": 26.564885496183205, + "grad_norm": 0.5690393447875977, + "learning_rate": 9.121723261293793e-06, + "loss": 0.1119, + "step": 6960 + }, + { + "epoch": 26.603053435114504, + "grad_norm": 0.41400864720344543, + "learning_rate": 9.118600725060214e-06, + "loss": 0.1023, + "step": 6970 + }, + { + "epoch": 26.641221374045802, + "grad_norm": 0.4753328561782837, + "learning_rate": 9.115473184338393e-06, + "loss": 0.1104, + "step": 6980 + }, + { + "epoch": 26.6793893129771, + "grad_norm": 0.29748255014419556, + "learning_rate": 9.11234064292859e-06, + "loss": 0.1003, + "step": 6990 + }, + { + "epoch": 26.717557251908396, + "grad_norm": 0.34052833914756775, + "learning_rate": 9.109203104637138e-06, + "loss": 0.1167, + "step": 7000 + }, + { + "epoch": 26.755725190839694, + "grad_norm": 0.39550068974494934, + "learning_rate": 9.10606057327644e-06, + "loss": 0.1128, + "step": 7010 + }, + { + "epoch": 26.793893129770993, + "grad_norm": 0.3241926431655884, + "learning_rate": 9.102913052664971e-06, + "loss": 0.1078, + "step": 7020 + }, + { + "epoch": 26.83206106870229, + "grad_norm": 0.3458397686481476, + "learning_rate": 9.099760546627262e-06, + "loss": 0.1104, + "step": 7030 + }, + { + "epoch": 26.870229007633586, + "grad_norm": 0.6425669193267822, + "learning_rate": 9.096603058993907e-06, + "loss": 0.1119, + "step": 7040 + }, + { + "epoch": 26.908396946564885, + "grad_norm": 0.39472290873527527, + "learning_rate": 9.093440593601553e-06, + "loss": 0.1109, + "step": 7050 + }, + { + "epoch": 26.946564885496183, + "grad_norm": 0.3740868866443634, + "learning_rate": 9.090273154292889e-06, + "loss": 0.1181, + "step": 7060 + }, + { + "epoch": 26.984732824427482, + "grad_norm": 0.38200682401657104, + "learning_rate": 9.087100744916656e-06, + "loss": 0.1065, + "step": 7070 + }, + { + "epoch": 27.022900763358777, + "grad_norm": 0.4096829295158386, + "learning_rate": 9.08392336932763e-06, + "loss": 0.1099, + "step": 7080 + }, + { + "epoch": 27.061068702290076, + "grad_norm": 0.6823604702949524, + "learning_rate": 9.08074103138662e-06, + "loss": 0.1167, + "step": 7090 + }, + { + "epoch": 27.099236641221374, + "grad_norm": 0.5257452726364136, + "learning_rate": 9.077553734960469e-06, + "loss": 0.1047, + "step": 7100 + }, + { + "epoch": 27.137404580152673, + "grad_norm": 0.4600130617618561, + "learning_rate": 9.074361483922041e-06, + "loss": 0.1006, + "step": 7110 + }, + { + "epoch": 27.17557251908397, + "grad_norm": 0.6489649415016174, + "learning_rate": 9.071164282150224e-06, + "loss": 0.1035, + "step": 7120 + }, + { + "epoch": 27.213740458015266, + "grad_norm": 1.3038793802261353, + "learning_rate": 9.067962133529919e-06, + "loss": 0.1039, + "step": 7130 + }, + { + "epoch": 27.251908396946565, + "grad_norm": 0.542229950428009, + "learning_rate": 9.064755041952036e-06, + "loss": 0.1162, + "step": 7140 + }, + { + "epoch": 27.290076335877863, + "grad_norm": 0.3620493412017822, + "learning_rate": 9.061543011313498e-06, + "loss": 0.1153, + "step": 7150 + }, + { + "epoch": 27.328244274809162, + "grad_norm": 0.44058114290237427, + "learning_rate": 9.05832604551722e-06, + "loss": 0.1148, + "step": 7160 + }, + { + "epoch": 27.366412213740457, + "grad_norm": 0.53098064661026, + "learning_rate": 9.055104148472123e-06, + "loss": 0.1158, + "step": 7170 + }, + { + "epoch": 27.404580152671755, + "grad_norm": 0.7122390270233154, + "learning_rate": 9.051877324093114e-06, + "loss": 0.1186, + "step": 7180 + }, + { + "epoch": 27.442748091603054, + "grad_norm": 0.6435628533363342, + "learning_rate": 9.04864557630109e-06, + "loss": 0.1176, + "step": 7190 + }, + { + "epoch": 27.480916030534353, + "grad_norm": 0.2909921705722809, + "learning_rate": 9.045408909022928e-06, + "loss": 0.1115, + "step": 7200 + }, + { + "epoch": 27.519083969465647, + "grad_norm": 0.39591115713119507, + "learning_rate": 9.042167326191484e-06, + "loss": 0.1077, + "step": 7210 + }, + { + "epoch": 27.557251908396946, + "grad_norm": 0.7656340003013611, + "learning_rate": 9.038920831745587e-06, + "loss": 0.1072, + "step": 7220 + }, + { + "epoch": 27.595419847328245, + "grad_norm": 0.5420284271240234, + "learning_rate": 9.035669429630036e-06, + "loss": 0.1113, + "step": 7230 + }, + { + "epoch": 27.633587786259543, + "grad_norm": 0.3711391091346741, + "learning_rate": 9.032413123795589e-06, + "loss": 0.1094, + "step": 7240 + }, + { + "epoch": 27.671755725190838, + "grad_norm": 0.6970059871673584, + "learning_rate": 9.029151918198962e-06, + "loss": 0.1083, + "step": 7250 + }, + { + "epoch": 27.709923664122137, + "grad_norm": 0.5760360956192017, + "learning_rate": 9.025885816802833e-06, + "loss": 0.1195, + "step": 7260 + }, + { + "epoch": 27.748091603053435, + "grad_norm": 0.3566513955593109, + "learning_rate": 9.022614823575819e-06, + "loss": 0.1091, + "step": 7270 + }, + { + "epoch": 27.786259541984734, + "grad_norm": 0.3161095380783081, + "learning_rate": 9.019338942492485e-06, + "loss": 0.1095, + "step": 7280 + }, + { + "epoch": 27.82442748091603, + "grad_norm": 0.3499198853969574, + "learning_rate": 9.01605817753334e-06, + "loss": 0.1113, + "step": 7290 + }, + { + "epoch": 27.862595419847327, + "grad_norm": 0.3654879927635193, + "learning_rate": 9.012772532684819e-06, + "loss": 0.111, + "step": 7300 + }, + { + "epoch": 27.900763358778626, + "grad_norm": 0.5612740516662598, + "learning_rate": 9.00948201193929e-06, + "loss": 0.107, + "step": 7310 + }, + { + "epoch": 27.938931297709924, + "grad_norm": 0.6282134652137756, + "learning_rate": 9.006186619295048e-06, + "loss": 0.117, + "step": 7320 + }, + { + "epoch": 27.977099236641223, + "grad_norm": 0.9098697900772095, + "learning_rate": 9.002886358756304e-06, + "loss": 0.115, + "step": 7330 + }, + { + "epoch": 28.015267175572518, + "grad_norm": 0.5928300619125366, + "learning_rate": 8.999581234333189e-06, + "loss": 0.1075, + "step": 7340 + }, + { + "epoch": 28.053435114503817, + "grad_norm": 0.39797747135162354, + "learning_rate": 8.996271250041735e-06, + "loss": 0.1185, + "step": 7350 + }, + { + "epoch": 28.091603053435115, + "grad_norm": 0.5730596780776978, + "learning_rate": 8.99295640990389e-06, + "loss": 0.1113, + "step": 7360 + }, + { + "epoch": 28.129770992366414, + "grad_norm": 0.40439486503601074, + "learning_rate": 8.989636717947496e-06, + "loss": 0.1126, + "step": 7370 + }, + { + "epoch": 28.16793893129771, + "grad_norm": 0.978628933429718, + "learning_rate": 8.986312178206291e-06, + "loss": 0.1018, + "step": 7380 + }, + { + "epoch": 28.206106870229007, + "grad_norm": 0.45754435658454895, + "learning_rate": 8.982982794719904e-06, + "loss": 0.1116, + "step": 7390 + }, + { + "epoch": 28.244274809160306, + "grad_norm": 0.3176497519016266, + "learning_rate": 8.979648571533852e-06, + "loss": 0.1055, + "step": 7400 + }, + { + "epoch": 28.282442748091604, + "grad_norm": 0.3644157946109772, + "learning_rate": 8.97630951269953e-06, + "loss": 0.1086, + "step": 7410 + }, + { + "epoch": 28.3206106870229, + "grad_norm": 0.4856671988964081, + "learning_rate": 8.972965622274206e-06, + "loss": 0.1148, + "step": 7420 + }, + { + "epoch": 28.358778625954198, + "grad_norm": 0.3878462314605713, + "learning_rate": 8.969616904321026e-06, + "loss": 0.1041, + "step": 7430 + }, + { + "epoch": 28.396946564885496, + "grad_norm": 0.4682347774505615, + "learning_rate": 8.966263362908998e-06, + "loss": 0.1167, + "step": 7440 + }, + { + "epoch": 28.435114503816795, + "grad_norm": 0.41615232825279236, + "learning_rate": 8.962905002112989e-06, + "loss": 0.1112, + "step": 7450 + }, + { + "epoch": 28.47328244274809, + "grad_norm": 0.39141789078712463, + "learning_rate": 8.959541826013725e-06, + "loss": 0.1099, + "step": 7460 + }, + { + "epoch": 28.51145038167939, + "grad_norm": 0.4517311453819275, + "learning_rate": 8.95617383869778e-06, + "loss": 0.1093, + "step": 7470 + }, + { + "epoch": 28.549618320610687, + "grad_norm": 0.3166011571884155, + "learning_rate": 8.952801044257581e-06, + "loss": 0.1008, + "step": 7480 + }, + { + "epoch": 28.587786259541986, + "grad_norm": 0.5795003771781921, + "learning_rate": 8.949423446791388e-06, + "loss": 0.1185, + "step": 7490 + }, + { + "epoch": 28.625954198473284, + "grad_norm": 0.7326614856719971, + "learning_rate": 8.9460410504033e-06, + "loss": 0.1177, + "step": 7500 + }, + { + "epoch": 28.66412213740458, + "grad_norm": 0.4169262647628784, + "learning_rate": 8.942653859203248e-06, + "loss": 0.1113, + "step": 7510 + }, + { + "epoch": 28.702290076335878, + "grad_norm": 0.43298423290252686, + "learning_rate": 8.93926187730699e-06, + "loss": 0.1072, + "step": 7520 + }, + { + "epoch": 28.740458015267176, + "grad_norm": 0.3784909248352051, + "learning_rate": 8.935865108836103e-06, + "loss": 0.1097, + "step": 7530 + }, + { + "epoch": 28.778625954198475, + "grad_norm": 0.7653591632843018, + "learning_rate": 8.932463557917982e-06, + "loss": 0.1127, + "step": 7540 + }, + { + "epoch": 28.81679389312977, + "grad_norm": 0.47900792956352234, + "learning_rate": 8.929057228685829e-06, + "loss": 0.1121, + "step": 7550 + }, + { + "epoch": 28.85496183206107, + "grad_norm": 0.6226459741592407, + "learning_rate": 8.925646125278657e-06, + "loss": 0.1082, + "step": 7560 + }, + { + "epoch": 28.893129770992367, + "grad_norm": 0.4821157455444336, + "learning_rate": 8.92223025184128e-06, + "loss": 0.1081, + "step": 7570 + }, + { + "epoch": 28.931297709923665, + "grad_norm": 0.3318312466144562, + "learning_rate": 8.918809612524305e-06, + "loss": 0.113, + "step": 7580 + }, + { + "epoch": 28.96946564885496, + "grad_norm": 0.4457385241985321, + "learning_rate": 8.91538421148413e-06, + "loss": 0.1162, + "step": 7590 + }, + { + "epoch": 29.00763358778626, + "grad_norm": 0.40070948004722595, + "learning_rate": 8.911954052882941e-06, + "loss": 0.1127, + "step": 7600 + }, + { + "epoch": 29.045801526717558, + "grad_norm": 0.4417935609817505, + "learning_rate": 8.908519140888704e-06, + "loss": 0.1035, + "step": 7610 + }, + { + "epoch": 29.083969465648856, + "grad_norm": 0.3581926226615906, + "learning_rate": 8.905079479675164e-06, + "loss": 0.1033, + "step": 7620 + }, + { + "epoch": 29.12213740458015, + "grad_norm": 0.4203307330608368, + "learning_rate": 8.901635073421831e-06, + "loss": 0.1021, + "step": 7630 + }, + { + "epoch": 29.16030534351145, + "grad_norm": 0.4549827575683594, + "learning_rate": 8.898185926313982e-06, + "loss": 0.1093, + "step": 7640 + }, + { + "epoch": 29.198473282442748, + "grad_norm": 0.3619544208049774, + "learning_rate": 8.894732042542659e-06, + "loss": 0.1126, + "step": 7650 + }, + { + "epoch": 29.236641221374047, + "grad_norm": 0.32612374424934387, + "learning_rate": 8.891273426304656e-06, + "loss": 0.1082, + "step": 7660 + }, + { + "epoch": 29.274809160305345, + "grad_norm": 0.44945961236953735, + "learning_rate": 8.887810081802514e-06, + "loss": 0.11, + "step": 7670 + }, + { + "epoch": 29.31297709923664, + "grad_norm": 0.5542881488800049, + "learning_rate": 8.88434201324453e-06, + "loss": 0.1066, + "step": 7680 + }, + { + "epoch": 29.35114503816794, + "grad_norm": 0.7978936433792114, + "learning_rate": 8.880869224844727e-06, + "loss": 0.1143, + "step": 7690 + }, + { + "epoch": 29.389312977099237, + "grad_norm": 0.6490185260772705, + "learning_rate": 8.877391720822874e-06, + "loss": 0.1155, + "step": 7700 + }, + { + "epoch": 29.427480916030536, + "grad_norm": 0.5318716168403625, + "learning_rate": 8.873909505404467e-06, + "loss": 0.1078, + "step": 7710 + }, + { + "epoch": 29.46564885496183, + "grad_norm": 0.40619373321533203, + "learning_rate": 8.870422582820726e-06, + "loss": 0.1147, + "step": 7720 + }, + { + "epoch": 29.50381679389313, + "grad_norm": 0.3707321286201477, + "learning_rate": 8.866930957308589e-06, + "loss": 0.1077, + "step": 7730 + }, + { + "epoch": 29.541984732824428, + "grad_norm": 0.5502585172653198, + "learning_rate": 8.863434633110711e-06, + "loss": 0.1046, + "step": 7740 + }, + { + "epoch": 29.580152671755727, + "grad_norm": 0.34364914894104004, + "learning_rate": 8.859933614475454e-06, + "loss": 0.1071, + "step": 7750 + }, + { + "epoch": 29.61832061068702, + "grad_norm": 0.302059531211853, + "learning_rate": 8.85642790565689e-06, + "loss": 0.1041, + "step": 7760 + }, + { + "epoch": 29.65648854961832, + "grad_norm": 0.5553315281867981, + "learning_rate": 8.852917510914783e-06, + "loss": 0.111, + "step": 7770 + }, + { + "epoch": 29.69465648854962, + "grad_norm": 0.3059275448322296, + "learning_rate": 8.84940243451459e-06, + "loss": 0.1057, + "step": 7780 + }, + { + "epoch": 29.732824427480917, + "grad_norm": 0.3817649483680725, + "learning_rate": 8.84588268072747e-06, + "loss": 0.1044, + "step": 7790 + }, + { + "epoch": 29.770992366412212, + "grad_norm": 0.3629932403564453, + "learning_rate": 8.842358253830245e-06, + "loss": 0.1092, + "step": 7800 + }, + { + "epoch": 29.80916030534351, + "grad_norm": 0.39645668864250183, + "learning_rate": 8.838829158105434e-06, + "loss": 0.1092, + "step": 7810 + }, + { + "epoch": 29.84732824427481, + "grad_norm": 0.3265850245952606, + "learning_rate": 8.835295397841217e-06, + "loss": 0.1039, + "step": 7820 + }, + { + "epoch": 29.885496183206108, + "grad_norm": 0.9192505478858948, + "learning_rate": 8.831756977331447e-06, + "loss": 0.1192, + "step": 7830 + }, + { + "epoch": 29.923664122137403, + "grad_norm": 1.0596504211425781, + "learning_rate": 8.828213900875639e-06, + "loss": 0.1154, + "step": 7840 + }, + { + "epoch": 29.9618320610687, + "grad_norm": 0.6525912880897522, + "learning_rate": 8.824666172778964e-06, + "loss": 0.113, + "step": 7850 + }, + { + "epoch": 30.0, + "grad_norm": 0.4801918864250183, + "learning_rate": 8.821113797352246e-06, + "loss": 0.106, + "step": 7860 + }, + { + "epoch": 30.0381679389313, + "grad_norm": 0.8165585398674011, + "learning_rate": 8.817556778911957e-06, + "loss": 0.1145, + "step": 7870 + }, + { + "epoch": 30.076335877862597, + "grad_norm": 0.31382817029953003, + "learning_rate": 8.81399512178021e-06, + "loss": 0.1096, + "step": 7880 + }, + { + "epoch": 30.114503816793892, + "grad_norm": 0.38235539197921753, + "learning_rate": 8.810428830284752e-06, + "loss": 0.1064, + "step": 7890 + }, + { + "epoch": 30.15267175572519, + "grad_norm": 0.9553492069244385, + "learning_rate": 8.806857908758968e-06, + "loss": 0.11, + "step": 7900 + }, + { + "epoch": 30.19083969465649, + "grad_norm": 0.33214470744132996, + "learning_rate": 8.80328236154186e-06, + "loss": 0.1346, + "step": 7910 + }, + { + "epoch": 30.229007633587788, + "grad_norm": 0.5911892056465149, + "learning_rate": 8.799702192978056e-06, + "loss": 0.1101, + "step": 7920 + }, + { + "epoch": 30.267175572519083, + "grad_norm": 0.6221767067909241, + "learning_rate": 8.7961174074178e-06, + "loss": 0.1067, + "step": 7930 + }, + { + "epoch": 30.30534351145038, + "grad_norm": 0.9988340735435486, + "learning_rate": 8.792528009216942e-06, + "loss": 0.1043, + "step": 7940 + }, + { + "epoch": 30.34351145038168, + "grad_norm": 0.3819482624530792, + "learning_rate": 8.788934002736944e-06, + "loss": 0.1047, + "step": 7950 + }, + { + "epoch": 30.38167938931298, + "grad_norm": 0.36361387372016907, + "learning_rate": 8.785335392344858e-06, + "loss": 0.1092, + "step": 7960 + }, + { + "epoch": 30.419847328244273, + "grad_norm": 0.4566929340362549, + "learning_rate": 8.781732182413336e-06, + "loss": 0.1103, + "step": 7970 + }, + { + "epoch": 30.458015267175572, + "grad_norm": 0.6546979546546936, + "learning_rate": 8.778124377320619e-06, + "loss": 0.1107, + "step": 7980 + }, + { + "epoch": 30.49618320610687, + "grad_norm": 0.38045239448547363, + "learning_rate": 8.774511981450529e-06, + "loss": 0.1074, + "step": 7990 + }, + { + "epoch": 30.53435114503817, + "grad_norm": 0.6376705169677734, + "learning_rate": 8.770894999192468e-06, + "loss": 0.1162, + "step": 8000 + }, + { + "epoch": 30.572519083969464, + "grad_norm": 0.6514620184898376, + "learning_rate": 8.767273434941413e-06, + "loss": 0.1094, + "step": 8010 + }, + { + "epoch": 30.610687022900763, + "grad_norm": 0.49210497736930847, + "learning_rate": 8.763647293097902e-06, + "loss": 0.1143, + "step": 8020 + }, + { + "epoch": 30.64885496183206, + "grad_norm": 0.9991191029548645, + "learning_rate": 8.76001657806804e-06, + "loss": 0.108, + "step": 8030 + }, + { + "epoch": 30.68702290076336, + "grad_norm": 0.3525305688381195, + "learning_rate": 8.75638129426349e-06, + "loss": 0.1043, + "step": 8040 + }, + { + "epoch": 30.725190839694655, + "grad_norm": 0.3630237281322479, + "learning_rate": 8.752741446101464e-06, + "loss": 0.1033, + "step": 8050 + }, + { + "epoch": 30.763358778625953, + "grad_norm": 0.544648289680481, + "learning_rate": 8.749097038004722e-06, + "loss": 0.1059, + "step": 8060 + }, + { + "epoch": 30.801526717557252, + "grad_norm": 0.31798774003982544, + "learning_rate": 8.745448074401562e-06, + "loss": 0.1073, + "step": 8070 + }, + { + "epoch": 30.83969465648855, + "grad_norm": 0.35739681124687195, + "learning_rate": 8.741794559725818e-06, + "loss": 0.1084, + "step": 8080 + }, + { + "epoch": 30.87786259541985, + "grad_norm": 0.38239383697509766, + "learning_rate": 8.738136498416857e-06, + "loss": 0.11, + "step": 8090 + }, + { + "epoch": 30.916030534351144, + "grad_norm": 1.0929396152496338, + "learning_rate": 8.734473894919564e-06, + "loss": 0.1153, + "step": 8100 + }, + { + "epoch": 30.954198473282442, + "grad_norm": 0.4055207669734955, + "learning_rate": 8.730806753684354e-06, + "loss": 0.1021, + "step": 8110 + }, + { + "epoch": 30.99236641221374, + "grad_norm": 0.4326501190662384, + "learning_rate": 8.727135079167144e-06, + "loss": 0.1144, + "step": 8120 + }, + { + "epoch": 31.03053435114504, + "grad_norm": 0.39526522159576416, + "learning_rate": 8.723458875829368e-06, + "loss": 0.1018, + "step": 8130 + }, + { + "epoch": 31.068702290076335, + "grad_norm": 0.35406747460365295, + "learning_rate": 8.719778148137959e-06, + "loss": 0.1056, + "step": 8140 + }, + { + "epoch": 31.106870229007633, + "grad_norm": 0.380480021238327, + "learning_rate": 8.716092900565347e-06, + "loss": 0.1079, + "step": 8150 + }, + { + "epoch": 31.14503816793893, + "grad_norm": 0.3873949944972992, + "learning_rate": 8.712403137589455e-06, + "loss": 0.1065, + "step": 8160 + }, + { + "epoch": 31.18320610687023, + "grad_norm": 0.3808857202529907, + "learning_rate": 8.708708863693696e-06, + "loss": 0.1104, + "step": 8170 + }, + { + "epoch": 31.221374045801525, + "grad_norm": 0.4861178994178772, + "learning_rate": 8.705010083366961e-06, + "loss": 0.1064, + "step": 8180 + }, + { + "epoch": 31.259541984732824, + "grad_norm": 0.8248606324195862, + "learning_rate": 8.701306801103611e-06, + "loss": 0.1039, + "step": 8190 + }, + { + "epoch": 31.297709923664122, + "grad_norm": 0.5083164572715759, + "learning_rate": 8.69759902140349e-06, + "loss": 0.1093, + "step": 8200 + }, + { + "epoch": 31.33587786259542, + "grad_norm": 0.4453307092189789, + "learning_rate": 8.693886748771896e-06, + "loss": 0.1081, + "step": 8210 + }, + { + "epoch": 31.374045801526716, + "grad_norm": 0.4088967442512512, + "learning_rate": 8.690169987719593e-06, + "loss": 0.1193, + "step": 8220 + }, + { + "epoch": 31.412213740458014, + "grad_norm": 0.42236557602882385, + "learning_rate": 8.686448742762792e-06, + "loss": 0.1143, + "step": 8230 + }, + { + "epoch": 31.450381679389313, + "grad_norm": 0.9636140465736389, + "learning_rate": 8.68272301842316e-06, + "loss": 0.1119, + "step": 8240 + }, + { + "epoch": 31.48854961832061, + "grad_norm": 1.1265870332717896, + "learning_rate": 8.678992819227804e-06, + "loss": 0.1045, + "step": 8250 + }, + { + "epoch": 31.52671755725191, + "grad_norm": 0.7294782400131226, + "learning_rate": 8.675258149709265e-06, + "loss": 0.1139, + "step": 8260 + }, + { + "epoch": 31.564885496183205, + "grad_norm": 0.9136938452720642, + "learning_rate": 8.67151901440552e-06, + "loss": 0.1111, + "step": 8270 + }, + { + "epoch": 31.603053435114504, + "grad_norm": 0.4900221824645996, + "learning_rate": 8.667775417859971e-06, + "loss": 0.1077, + "step": 8280 + }, + { + "epoch": 31.641221374045802, + "grad_norm": 0.2998524010181427, + "learning_rate": 8.664027364621442e-06, + "loss": 0.1085, + "step": 8290 + }, + { + "epoch": 31.6793893129771, + "grad_norm": 0.5325132012367249, + "learning_rate": 8.660274859244167e-06, + "loss": 0.1093, + "step": 8300 + }, + { + "epoch": 31.717557251908396, + "grad_norm": 0.6447851657867432, + "learning_rate": 8.656517906287798e-06, + "loss": 0.111, + "step": 8310 + }, + { + "epoch": 31.755725190839694, + "grad_norm": 0.4099143147468567, + "learning_rate": 8.652756510317387e-06, + "loss": 0.1085, + "step": 8320 + }, + { + "epoch": 31.793893129770993, + "grad_norm": 0.30737578868865967, + "learning_rate": 8.648990675903382e-06, + "loss": 0.1077, + "step": 8330 + }, + { + "epoch": 31.83206106870229, + "grad_norm": 0.43916499614715576, + "learning_rate": 8.645220407621629e-06, + "loss": 0.1042, + "step": 8340 + }, + { + "epoch": 31.870229007633586, + "grad_norm": 0.3269640803337097, + "learning_rate": 8.64144571005336e-06, + "loss": 0.1038, + "step": 8350 + }, + { + "epoch": 31.908396946564885, + "grad_norm": 0.38933065533638, + "learning_rate": 8.637666587785185e-06, + "loss": 0.101, + "step": 8360 + }, + { + "epoch": 31.946564885496183, + "grad_norm": 0.42047321796417236, + "learning_rate": 8.633883045409096e-06, + "loss": 0.1107, + "step": 8370 + }, + { + "epoch": 31.984732824427482, + "grad_norm": 0.36545827984809875, + "learning_rate": 8.630095087522458e-06, + "loss": 0.1053, + "step": 8380 + }, + { + "epoch": 32.02290076335878, + "grad_norm": 0.5923290848731995, + "learning_rate": 8.62630271872799e-06, + "loss": 0.1096, + "step": 8390 + }, + { + "epoch": 32.06106870229008, + "grad_norm": 0.4061322510242462, + "learning_rate": 8.622505943633781e-06, + "loss": 0.1131, + "step": 8400 + }, + { + "epoch": 32.099236641221374, + "grad_norm": 0.3834056854248047, + "learning_rate": 8.618704766853271e-06, + "loss": 0.1003, + "step": 8410 + }, + { + "epoch": 32.13740458015267, + "grad_norm": 0.29086577892303467, + "learning_rate": 8.614899193005248e-06, + "loss": 0.1085, + "step": 8420 + }, + { + "epoch": 32.17557251908397, + "grad_norm": 0.48934558033943176, + "learning_rate": 8.611089226713843e-06, + "loss": 0.1089, + "step": 8430 + }, + { + "epoch": 32.213740458015266, + "grad_norm": 0.5814259648323059, + "learning_rate": 8.607274872608521e-06, + "loss": 0.1229, + "step": 8440 + }, + { + "epoch": 32.25190839694657, + "grad_norm": 0.36074042320251465, + "learning_rate": 8.603456135324089e-06, + "loss": 0.1099, + "step": 8450 + }, + { + "epoch": 32.29007633587786, + "grad_norm": 0.40568429231643677, + "learning_rate": 8.599633019500665e-06, + "loss": 0.1234, + "step": 8460 + }, + { + "epoch": 32.32824427480916, + "grad_norm": 0.4491671621799469, + "learning_rate": 8.595805529783703e-06, + "loss": 0.1156, + "step": 8470 + }, + { + "epoch": 32.36641221374046, + "grad_norm": 0.4605843126773834, + "learning_rate": 8.59197367082396e-06, + "loss": 0.1075, + "step": 8480 + }, + { + "epoch": 32.404580152671755, + "grad_norm": 0.5685547590255737, + "learning_rate": 8.588137447277502e-06, + "loss": 0.1205, + "step": 8490 + }, + { + "epoch": 32.44274809160305, + "grad_norm": 0.32008451223373413, + "learning_rate": 8.584296863805708e-06, + "loss": 0.1065, + "step": 8500 + }, + { + "epoch": 32.48091603053435, + "grad_norm": 0.3442102074623108, + "learning_rate": 8.580451925075249e-06, + "loss": 0.1123, + "step": 8510 + }, + { + "epoch": 32.51908396946565, + "grad_norm": 0.5090252161026001, + "learning_rate": 8.576602635758086e-06, + "loss": 0.1108, + "step": 8520 + }, + { + "epoch": 32.55725190839695, + "grad_norm": 0.7189807891845703, + "learning_rate": 8.572749000531468e-06, + "loss": 0.1109, + "step": 8530 + }, + { + "epoch": 32.595419847328245, + "grad_norm": 0.6325716972351074, + "learning_rate": 8.568891024077925e-06, + "loss": 0.1113, + "step": 8540 + }, + { + "epoch": 32.63358778625954, + "grad_norm": 0.28530317544937134, + "learning_rate": 8.565028711085266e-06, + "loss": 0.1014, + "step": 8550 + }, + { + "epoch": 32.67175572519084, + "grad_norm": 0.5091580152511597, + "learning_rate": 8.561162066246562e-06, + "loss": 0.1085, + "step": 8560 + }, + { + "epoch": 32.70992366412214, + "grad_norm": 0.39473584294319153, + "learning_rate": 8.557291094260151e-06, + "loss": 0.1041, + "step": 8570 + }, + { + "epoch": 32.74809160305343, + "grad_norm": 0.5815795660018921, + "learning_rate": 8.55341579982963e-06, + "loss": 0.1011, + "step": 8580 + }, + { + "epoch": 32.786259541984734, + "grad_norm": 0.46441009640693665, + "learning_rate": 8.549536187663848e-06, + "loss": 0.1027, + "step": 8590 + }, + { + "epoch": 32.82442748091603, + "grad_norm": 0.4190653860569, + "learning_rate": 8.545652262476898e-06, + "loss": 0.1075, + "step": 8600 + }, + { + "epoch": 32.86259541984733, + "grad_norm": 0.3595786392688751, + "learning_rate": 8.541764028988115e-06, + "loss": 0.097, + "step": 8610 + }, + { + "epoch": 32.900763358778626, + "grad_norm": 0.6225172281265259, + "learning_rate": 8.537871491922072e-06, + "loss": 0.1145, + "step": 8620 + }, + { + "epoch": 32.93893129770992, + "grad_norm": 1.1140257120132446, + "learning_rate": 8.533974656008566e-06, + "loss": 0.129, + "step": 8630 + }, + { + "epoch": 32.97709923664122, + "grad_norm": 1.0088690519332886, + "learning_rate": 8.530073525982621e-06, + "loss": 0.1171, + "step": 8640 + }, + { + "epoch": 33.01526717557252, + "grad_norm": 0.2912192940711975, + "learning_rate": 8.526168106584476e-06, + "loss": 0.1004, + "step": 8650 + }, + { + "epoch": 33.05343511450382, + "grad_norm": 0.337960422039032, + "learning_rate": 8.522258402559587e-06, + "loss": 0.1179, + "step": 8660 + }, + { + "epoch": 33.091603053435115, + "grad_norm": 0.30016374588012695, + "learning_rate": 8.518344418658612e-06, + "loss": 0.0991, + "step": 8670 + }, + { + "epoch": 33.12977099236641, + "grad_norm": 0.36123228073120117, + "learning_rate": 8.51442615963741e-06, + "loss": 0.1, + "step": 8680 + }, + { + "epoch": 33.16793893129771, + "grad_norm": 0.3525253236293793, + "learning_rate": 8.510503630257034e-06, + "loss": 0.0969, + "step": 8690 + }, + { + "epoch": 33.20610687022901, + "grad_norm": 0.3419656753540039, + "learning_rate": 8.506576835283731e-06, + "loss": 0.1014, + "step": 8700 + }, + { + "epoch": 33.2442748091603, + "grad_norm": 0.3661300837993622, + "learning_rate": 8.502645779488923e-06, + "loss": 0.1014, + "step": 8710 + }, + { + "epoch": 33.282442748091604, + "grad_norm": 0.6394238471984863, + "learning_rate": 8.498710467649214e-06, + "loss": 0.1046, + "step": 8720 + }, + { + "epoch": 33.3206106870229, + "grad_norm": 0.3386576473712921, + "learning_rate": 8.494770904546381e-06, + "loss": 0.1125, + "step": 8730 + }, + { + "epoch": 33.3587786259542, + "grad_norm": 0.5160825848579407, + "learning_rate": 8.490827094967364e-06, + "loss": 0.1072, + "step": 8740 + }, + { + "epoch": 33.396946564885496, + "grad_norm": 0.37867069244384766, + "learning_rate": 8.486879043704263e-06, + "loss": 0.1043, + "step": 8750 + }, + { + "epoch": 33.43511450381679, + "grad_norm": 0.30260443687438965, + "learning_rate": 8.482926755554333e-06, + "loss": 0.1074, + "step": 8760 + }, + { + "epoch": 33.47328244274809, + "grad_norm": 0.589114248752594, + "learning_rate": 8.478970235319975e-06, + "loss": 0.1072, + "step": 8770 + }, + { + "epoch": 33.51145038167939, + "grad_norm": 0.3591925799846649, + "learning_rate": 8.475009487808738e-06, + "loss": 0.1009, + "step": 8780 + }, + { + "epoch": 33.54961832061069, + "grad_norm": 0.6600471138954163, + "learning_rate": 8.471044517833299e-06, + "loss": 0.122, + "step": 8790 + }, + { + "epoch": 33.587786259541986, + "grad_norm": 0.36351725459098816, + "learning_rate": 8.467075330211474e-06, + "loss": 0.1099, + "step": 8800 + }, + { + "epoch": 33.62595419847328, + "grad_norm": 0.31405356526374817, + "learning_rate": 8.463101929766197e-06, + "loss": 0.1024, + "step": 8810 + }, + { + "epoch": 33.66412213740458, + "grad_norm": 0.34903696179389954, + "learning_rate": 8.459124321325529e-06, + "loss": 0.1087, + "step": 8820 + }, + { + "epoch": 33.70229007633588, + "grad_norm": 0.43546533584594727, + "learning_rate": 8.455142509722635e-06, + "loss": 0.1072, + "step": 8830 + }, + { + "epoch": 33.74045801526717, + "grad_norm": 0.43064677715301514, + "learning_rate": 8.451156499795791e-06, + "loss": 0.1159, + "step": 8840 + }, + { + "epoch": 33.778625954198475, + "grad_norm": 0.3457275331020355, + "learning_rate": 8.44716629638838e-06, + "loss": 0.1137, + "step": 8850 + }, + { + "epoch": 33.81679389312977, + "grad_norm": 0.4882639944553375, + "learning_rate": 8.443171904348873e-06, + "loss": 0.1035, + "step": 8860 + }, + { + "epoch": 33.85496183206107, + "grad_norm": 0.3734376132488251, + "learning_rate": 8.439173328530829e-06, + "loss": 0.1045, + "step": 8870 + }, + { + "epoch": 33.89312977099237, + "grad_norm": 0.3466688096523285, + "learning_rate": 8.435170573792902e-06, + "loss": 0.1041, + "step": 8880 + }, + { + "epoch": 33.93129770992366, + "grad_norm": 0.27618467807769775, + "learning_rate": 8.431163644998808e-06, + "loss": 0.1138, + "step": 8890 + }, + { + "epoch": 33.969465648854964, + "grad_norm": 0.3809553384780884, + "learning_rate": 8.42715254701735e-06, + "loss": 0.1111, + "step": 8900 + }, + { + "epoch": 34.00763358778626, + "grad_norm": 0.3887907564640045, + "learning_rate": 8.423137284722389e-06, + "loss": 0.1069, + "step": 8910 + }, + { + "epoch": 34.045801526717554, + "grad_norm": 0.3740697205066681, + "learning_rate": 8.419117862992846e-06, + "loss": 0.1063, + "step": 8920 + }, + { + "epoch": 34.083969465648856, + "grad_norm": 0.38636720180511475, + "learning_rate": 8.415094286712694e-06, + "loss": 0.1108, + "step": 8930 + }, + { + "epoch": 34.12213740458015, + "grad_norm": 0.2964230477809906, + "learning_rate": 8.411066560770965e-06, + "loss": 0.1062, + "step": 8940 + }, + { + "epoch": 34.16030534351145, + "grad_norm": 0.4712314009666443, + "learning_rate": 8.407034690061722e-06, + "loss": 0.0999, + "step": 8950 + }, + { + "epoch": 34.19847328244275, + "grad_norm": 0.7659854292869568, + "learning_rate": 8.402998679484067e-06, + "loss": 0.1045, + "step": 8960 + }, + { + "epoch": 34.23664122137404, + "grad_norm": 0.4793913960456848, + "learning_rate": 8.398958533942135e-06, + "loss": 0.1036, + "step": 8970 + }, + { + "epoch": 34.274809160305345, + "grad_norm": 0.37287986278533936, + "learning_rate": 8.394914258345084e-06, + "loss": 0.1144, + "step": 8980 + }, + { + "epoch": 34.31297709923664, + "grad_norm": 0.541438639163971, + "learning_rate": 8.390865857607089e-06, + "loss": 0.1055, + "step": 8990 + }, + { + "epoch": 34.35114503816794, + "grad_norm": 0.46863681077957153, + "learning_rate": 8.386813336647339e-06, + "loss": 0.1055, + "step": 9000 + }, + { + "epoch": 34.38931297709924, + "grad_norm": 0.7329716086387634, + "learning_rate": 8.38275670039003e-06, + "loss": 0.1179, + "step": 9010 + }, + { + "epoch": 34.42748091603053, + "grad_norm": 0.4788097143173218, + "learning_rate": 8.378695953764357e-06, + "loss": 0.1261, + "step": 9020 + }, + { + "epoch": 34.465648854961835, + "grad_norm": 0.4310746192932129, + "learning_rate": 8.374631101704509e-06, + "loss": 0.1175, + "step": 9030 + }, + { + "epoch": 34.50381679389313, + "grad_norm": 0.4793173372745514, + "learning_rate": 8.370562149149666e-06, + "loss": 0.1089, + "step": 9040 + }, + { + "epoch": 34.541984732824424, + "grad_norm": 0.5185418725013733, + "learning_rate": 8.366489101043989e-06, + "loss": 0.1043, + "step": 9050 + }, + { + "epoch": 34.58015267175573, + "grad_norm": 0.37807098031044006, + "learning_rate": 8.362411962336613e-06, + "loss": 0.1102, + "step": 9060 + }, + { + "epoch": 34.61832061068702, + "grad_norm": 0.5667879581451416, + "learning_rate": 8.358330737981651e-06, + "loss": 0.1137, + "step": 9070 + }, + { + "epoch": 34.656488549618324, + "grad_norm": 1.1132043600082397, + "learning_rate": 8.35424543293817e-06, + "loss": 0.1117, + "step": 9080 + }, + { + "epoch": 34.69465648854962, + "grad_norm": 0.3736783564090729, + "learning_rate": 8.350156052170206e-06, + "loss": 0.1099, + "step": 9090 + }, + { + "epoch": 34.732824427480914, + "grad_norm": 0.3822537362575531, + "learning_rate": 8.346062600646739e-06, + "loss": 0.0993, + "step": 9100 + }, + { + "epoch": 34.770992366412216, + "grad_norm": 0.32378950715065, + "learning_rate": 8.341965083341696e-06, + "loss": 0.1012, + "step": 9110 + }, + { + "epoch": 34.80916030534351, + "grad_norm": 0.3589276969432831, + "learning_rate": 8.337863505233954e-06, + "loss": 0.1064, + "step": 9120 + }, + { + "epoch": 34.847328244274806, + "grad_norm": 0.3442489802837372, + "learning_rate": 8.333757871307311e-06, + "loss": 0.1066, + "step": 9130 + }, + { + "epoch": 34.88549618320611, + "grad_norm": 0.43421897292137146, + "learning_rate": 8.329648186550501e-06, + "loss": 0.1107, + "step": 9140 + }, + { + "epoch": 34.9236641221374, + "grad_norm": 0.33839526772499084, + "learning_rate": 8.32553445595718e-06, + "loss": 0.1105, + "step": 9150 + }, + { + "epoch": 34.961832061068705, + "grad_norm": 0.3755687475204468, + "learning_rate": 8.321416684525917e-06, + "loss": 0.1102, + "step": 9160 + }, + { + "epoch": 35.0, + "grad_norm": 0.3840588629245758, + "learning_rate": 8.317294877260193e-06, + "loss": 0.1087, + "step": 9170 + }, + { + "epoch": 35.038167938931295, + "grad_norm": 0.5461086630821228, + "learning_rate": 8.313169039168395e-06, + "loss": 0.1093, + "step": 9180 + }, + { + "epoch": 35.0763358778626, + "grad_norm": 0.42317765951156616, + "learning_rate": 8.3090391752638e-06, + "loss": 0.1133, + "step": 9190 + }, + { + "epoch": 35.11450381679389, + "grad_norm": 0.5004700422286987, + "learning_rate": 8.304905290564586e-06, + "loss": 0.0985, + "step": 9200 + }, + { + "epoch": 35.152671755725194, + "grad_norm": 0.42691975831985474, + "learning_rate": 8.300767390093814e-06, + "loss": 0.1029, + "step": 9210 + }, + { + "epoch": 35.19083969465649, + "grad_norm": 0.6003366708755493, + "learning_rate": 8.296625478879417e-06, + "loss": 0.1216, + "step": 9220 + }, + { + "epoch": 35.229007633587784, + "grad_norm": 0.31051206588745117, + "learning_rate": 8.292479561954214e-06, + "loss": 0.1022, + "step": 9230 + }, + { + "epoch": 35.267175572519086, + "grad_norm": 0.47282811999320984, + "learning_rate": 8.288329644355884e-06, + "loss": 0.1074, + "step": 9240 + }, + { + "epoch": 35.30534351145038, + "grad_norm": 0.7961530089378357, + "learning_rate": 8.284175731126964e-06, + "loss": 0.1041, + "step": 9250 + }, + { + "epoch": 35.343511450381676, + "grad_norm": 0.536555826663971, + "learning_rate": 8.280017827314854e-06, + "loss": 0.1102, + "step": 9260 + }, + { + "epoch": 35.38167938931298, + "grad_norm": 0.6327351331710815, + "learning_rate": 8.275855937971799e-06, + "loss": 0.1135, + "step": 9270 + }, + { + "epoch": 35.41984732824427, + "grad_norm": 0.39085444808006287, + "learning_rate": 8.271690068154887e-06, + "loss": 0.1086, + "step": 9280 + }, + { + "epoch": 35.458015267175576, + "grad_norm": 0.3909195065498352, + "learning_rate": 8.26752022292604e-06, + "loss": 0.1047, + "step": 9290 + }, + { + "epoch": 35.49618320610687, + "grad_norm": 0.44586417078971863, + "learning_rate": 8.263346407352017e-06, + "loss": 0.1016, + "step": 9300 + }, + { + "epoch": 35.534351145038165, + "grad_norm": 0.9170414805412292, + "learning_rate": 8.259168626504395e-06, + "loss": 0.1119, + "step": 9310 + }, + { + "epoch": 35.57251908396947, + "grad_norm": 0.5176953673362732, + "learning_rate": 8.25498688545957e-06, + "loss": 0.109, + "step": 9320 + }, + { + "epoch": 35.61068702290076, + "grad_norm": 0.3105170428752899, + "learning_rate": 8.250801189298759e-06, + "loss": 0.1089, + "step": 9330 + }, + { + "epoch": 35.64885496183206, + "grad_norm": 0.9050326943397522, + "learning_rate": 8.246611543107968e-06, + "loss": 0.1105, + "step": 9340 + }, + { + "epoch": 35.68702290076336, + "grad_norm": 0.36389780044555664, + "learning_rate": 8.24241795197802e-06, + "loss": 0.1075, + "step": 9350 + }, + { + "epoch": 35.725190839694655, + "grad_norm": 0.2961239814758301, + "learning_rate": 8.238220421004518e-06, + "loss": 0.1073, + "step": 9360 + }, + { + "epoch": 35.76335877862596, + "grad_norm": 0.4220345914363861, + "learning_rate": 8.23401895528786e-06, + "loss": 0.1004, + "step": 9370 + }, + { + "epoch": 35.80152671755725, + "grad_norm": 0.3124537765979767, + "learning_rate": 8.229813559933225e-06, + "loss": 0.1113, + "step": 9380 + }, + { + "epoch": 35.83969465648855, + "grad_norm": 0.4424472153186798, + "learning_rate": 8.22560424005056e-06, + "loss": 0.1074, + "step": 9390 + }, + { + "epoch": 35.87786259541985, + "grad_norm": 0.46610021591186523, + "learning_rate": 8.22139100075459e-06, + "loss": 0.1089, + "step": 9400 + }, + { + "epoch": 35.916030534351144, + "grad_norm": 0.5336711406707764, + "learning_rate": 8.217173847164799e-06, + "loss": 0.1031, + "step": 9410 + }, + { + "epoch": 35.954198473282446, + "grad_norm": 0.4767981171607971, + "learning_rate": 8.212952784405423e-06, + "loss": 0.1033, + "step": 9420 + }, + { + "epoch": 35.99236641221374, + "grad_norm": 0.3861401677131653, + "learning_rate": 8.208727817605453e-06, + "loss": 0.1031, + "step": 9430 + }, + { + "epoch": 36.030534351145036, + "grad_norm": 0.5217192769050598, + "learning_rate": 8.204498951898618e-06, + "loss": 0.1086, + "step": 9440 + }, + { + "epoch": 36.06870229007634, + "grad_norm": 0.41782206296920776, + "learning_rate": 8.200266192423396e-06, + "loss": 0.1091, + "step": 9450 + }, + { + "epoch": 36.10687022900763, + "grad_norm": 0.29525044560432434, + "learning_rate": 8.196029544322983e-06, + "loss": 0.1116, + "step": 9460 + }, + { + "epoch": 36.14503816793893, + "grad_norm": 0.4232509732246399, + "learning_rate": 8.191789012745306e-06, + "loss": 0.1101, + "step": 9470 + }, + { + "epoch": 36.18320610687023, + "grad_norm": 0.9247789978981018, + "learning_rate": 8.187544602843014e-06, + "loss": 0.0989, + "step": 9480 + }, + { + "epoch": 36.221374045801525, + "grad_norm": 0.3943084180355072, + "learning_rate": 8.183296319773466e-06, + "loss": 0.1031, + "step": 9490 + }, + { + "epoch": 36.25954198473283, + "grad_norm": 0.4537927806377411, + "learning_rate": 8.179044168698722e-06, + "loss": 0.1021, + "step": 9500 + }, + { + "epoch": 36.29770992366412, + "grad_norm": 0.5286827683448792, + "learning_rate": 8.174788154785548e-06, + "loss": 0.1085, + "step": 9510 + }, + { + "epoch": 36.33587786259542, + "grad_norm": 0.40371379256248474, + "learning_rate": 8.170528283205404e-06, + "loss": 0.1035, + "step": 9520 + }, + { + "epoch": 36.37404580152672, + "grad_norm": 0.4854196608066559, + "learning_rate": 8.166264559134434e-06, + "loss": 0.1115, + "step": 9530 + }, + { + "epoch": 36.412213740458014, + "grad_norm": 0.673309862613678, + "learning_rate": 8.161996987753466e-06, + "loss": 0.1044, + "step": 9540 + }, + { + "epoch": 36.45038167938931, + "grad_norm": 0.37787386775016785, + "learning_rate": 8.157725574248e-06, + "loss": 0.1136, + "step": 9550 + }, + { + "epoch": 36.48854961832061, + "grad_norm": 1.057741641998291, + "learning_rate": 8.15345032380821e-06, + "loss": 0.1129, + "step": 9560 + }, + { + "epoch": 36.52671755725191, + "grad_norm": 0.5287107229232788, + "learning_rate": 8.149171241628924e-06, + "loss": 0.0985, + "step": 9570 + }, + { + "epoch": 36.56488549618321, + "grad_norm": 0.4657130539417267, + "learning_rate": 8.144888332909631e-06, + "loss": 0.0994, + "step": 9580 + }, + { + "epoch": 36.603053435114504, + "grad_norm": 0.2887883484363556, + "learning_rate": 8.140601602854471e-06, + "loss": 0.1009, + "step": 9590 + }, + { + "epoch": 36.6412213740458, + "grad_norm": 0.5169790983200073, + "learning_rate": 8.136311056672224e-06, + "loss": 0.1026, + "step": 9600 + }, + { + "epoch": 36.6793893129771, + "grad_norm": 0.42216652631759644, + "learning_rate": 8.132016699576308e-06, + "loss": 0.1009, + "step": 9610 + }, + { + "epoch": 36.717557251908396, + "grad_norm": 0.39106106758117676, + "learning_rate": 8.127718536784771e-06, + "loss": 0.1056, + "step": 9620 + }, + { + "epoch": 36.7557251908397, + "grad_norm": 0.5721985697746277, + "learning_rate": 8.123416573520289e-06, + "loss": 0.1017, + "step": 9630 + }, + { + "epoch": 36.79389312977099, + "grad_norm": 0.5179575085639954, + "learning_rate": 8.119110815010152e-06, + "loss": 0.1127, + "step": 9640 + }, + { + "epoch": 36.83206106870229, + "grad_norm": 0.5956861972808838, + "learning_rate": 8.11480126648626e-06, + "loss": 0.1059, + "step": 9650 + }, + { + "epoch": 36.87022900763359, + "grad_norm": 0.37840545177459717, + "learning_rate": 8.110487933185123e-06, + "loss": 0.1052, + "step": 9660 + }, + { + "epoch": 36.908396946564885, + "grad_norm": 0.42260074615478516, + "learning_rate": 8.106170820347849e-06, + "loss": 0.1003, + "step": 9670 + }, + { + "epoch": 36.94656488549618, + "grad_norm": 0.7760491371154785, + "learning_rate": 8.101849933220134e-06, + "loss": 0.1081, + "step": 9680 + }, + { + "epoch": 36.98473282442748, + "grad_norm": 0.3848549723625183, + "learning_rate": 8.097525277052265e-06, + "loss": 0.1011, + "step": 9690 + }, + { + "epoch": 37.02290076335878, + "grad_norm": 0.72954261302948, + "learning_rate": 8.093196857099105e-06, + "loss": 0.1005, + "step": 9700 + }, + { + "epoch": 37.06106870229008, + "grad_norm": 0.515247642993927, + "learning_rate": 8.088864678620096e-06, + "loss": 0.1046, + "step": 9710 + }, + { + "epoch": 37.099236641221374, + "grad_norm": 0.6722749471664429, + "learning_rate": 8.084528746879243e-06, + "loss": 0.095, + "step": 9720 + }, + { + "epoch": 37.13740458015267, + "grad_norm": 0.3528861999511719, + "learning_rate": 8.080189067145107e-06, + "loss": 0.1046, + "step": 9730 + }, + { + "epoch": 37.17557251908397, + "grad_norm": 0.42519357800483704, + "learning_rate": 8.075845644690814e-06, + "loss": 0.0938, + "step": 9740 + }, + { + "epoch": 37.213740458015266, + "grad_norm": 0.5400734543800354, + "learning_rate": 8.07149848479403e-06, + "loss": 0.103, + "step": 9750 + }, + { + "epoch": 37.25190839694657, + "grad_norm": 0.6142025589942932, + "learning_rate": 8.067147592736963e-06, + "loss": 0.1027, + "step": 9760 + }, + { + "epoch": 37.29007633587786, + "grad_norm": 0.4489392936229706, + "learning_rate": 8.062792973806358e-06, + "loss": 0.1222, + "step": 9770 + }, + { + "epoch": 37.32824427480916, + "grad_norm": 0.576805830001831, + "learning_rate": 8.058434633293485e-06, + "loss": 0.1025, + "step": 9780 + }, + { + "epoch": 37.36641221374046, + "grad_norm": 0.38401392102241516, + "learning_rate": 8.054072576494142e-06, + "loss": 0.1, + "step": 9790 + }, + { + "epoch": 37.404580152671755, + "grad_norm": 0.4029317796230316, + "learning_rate": 8.04970680870864e-06, + "loss": 0.1047, + "step": 9800 + }, + { + "epoch": 37.44274809160305, + "grad_norm": 0.7393083572387695, + "learning_rate": 8.045337335241793e-06, + "loss": 0.0922, + "step": 9810 + }, + { + "epoch": 37.48091603053435, + "grad_norm": 0.4550725519657135, + "learning_rate": 8.040964161402932e-06, + "loss": 0.0988, + "step": 9820 + }, + { + "epoch": 37.51908396946565, + "grad_norm": 0.5443292260169983, + "learning_rate": 8.036587292505869e-06, + "loss": 0.0993, + "step": 9830 + }, + { + "epoch": 37.55725190839695, + "grad_norm": 0.6734360456466675, + "learning_rate": 8.032206733868912e-06, + "loss": 0.0907, + "step": 9840 + }, + { + "epoch": 37.595419847328245, + "grad_norm": 0.4426909387111664, + "learning_rate": 8.027822490814859e-06, + "loss": 0.0963, + "step": 9850 + }, + { + "epoch": 37.63358778625954, + "grad_norm": 0.7196950316429138, + "learning_rate": 8.023434568670971e-06, + "loss": 0.0974, + "step": 9860 + }, + { + "epoch": 37.67175572519084, + "grad_norm": 0.8929295539855957, + "learning_rate": 8.019042972768992e-06, + "loss": 0.0923, + "step": 9870 + }, + { + "epoch": 37.70992366412214, + "grad_norm": 0.4260941445827484, + "learning_rate": 8.014647708445124e-06, + "loss": 0.0862, + "step": 9880 + }, + { + "epoch": 37.74809160305343, + "grad_norm": 0.581256628036499, + "learning_rate": 8.010248781040027e-06, + "loss": 0.0962, + "step": 9890 + }, + { + "epoch": 37.786259541984734, + "grad_norm": 0.5044671893119812, + "learning_rate": 8.005846195898815e-06, + "loss": 0.0849, + "step": 9900 + }, + { + "epoch": 37.82442748091603, + "grad_norm": 0.5715168714523315, + "learning_rate": 8.00143995837104e-06, + "loss": 0.0864, + "step": 9910 + }, + { + "epoch": 37.86259541984733, + "grad_norm": 0.4902816116809845, + "learning_rate": 7.997030073810699e-06, + "loss": 0.09, + "step": 9920 + }, + { + "epoch": 37.900763358778626, + "grad_norm": 0.8679199814796448, + "learning_rate": 7.992616547576218e-06, + "loss": 0.1111, + "step": 9930 + }, + { + "epoch": 37.93893129770992, + "grad_norm": 0.5654365420341492, + "learning_rate": 7.988199385030446e-06, + "loss": 0.0981, + "step": 9940 + }, + { + "epoch": 37.97709923664122, + "grad_norm": 1.2245254516601562, + "learning_rate": 7.98377859154065e-06, + "loss": 0.0936, + "step": 9950 + }, + { + "epoch": 38.01526717557252, + "grad_norm": 0.9778199195861816, + "learning_rate": 7.979354172478516e-06, + "loss": 0.0862, + "step": 9960 + }, + { + "epoch": 38.05343511450382, + "grad_norm": 0.4366597831249237, + "learning_rate": 7.974926133220127e-06, + "loss": 0.0897, + "step": 9970 + }, + { + "epoch": 38.091603053435115, + "grad_norm": 0.5576930642127991, + "learning_rate": 7.970494479145968e-06, + "loss": 0.1004, + "step": 9980 + }, + { + "epoch": 38.12977099236641, + "grad_norm": 0.5354223251342773, + "learning_rate": 7.966059215640918e-06, + "loss": 0.0855, + "step": 9990 + }, + { + "epoch": 38.16793893129771, + "grad_norm": 0.4817463457584381, + "learning_rate": 7.96162034809424e-06, + "loss": 0.103, + "step": 10000 + }, + { + "epoch": 38.20610687022901, + "grad_norm": 1.0092859268188477, + "learning_rate": 7.957177881899579e-06, + "loss": 0.0936, + "step": 10010 + }, + { + "epoch": 38.2442748091603, + "grad_norm": 0.3715302050113678, + "learning_rate": 7.952731822454944e-06, + "loss": 0.0907, + "step": 10020 + }, + { + "epoch": 38.282442748091604, + "grad_norm": 0.7082487940788269, + "learning_rate": 7.948282175162723e-06, + "loss": 0.0889, + "step": 10030 + }, + { + "epoch": 38.3206106870229, + "grad_norm": 0.5549430251121521, + "learning_rate": 7.943828945429653e-06, + "loss": 0.1023, + "step": 10040 + }, + { + "epoch": 38.3587786259542, + "grad_norm": 0.41316139698028564, + "learning_rate": 7.939372138666828e-06, + "loss": 0.0843, + "step": 10050 + }, + { + "epoch": 38.396946564885496, + "grad_norm": 0.4269944429397583, + "learning_rate": 7.934911760289692e-06, + "loss": 0.0876, + "step": 10060 + }, + { + "epoch": 38.43511450381679, + "grad_norm": 0.35790082812309265, + "learning_rate": 7.930447815718022e-06, + "loss": 0.0835, + "step": 10070 + }, + { + "epoch": 38.47328244274809, + "grad_norm": 0.6934918165206909, + "learning_rate": 7.925980310375933e-06, + "loss": 0.0874, + "step": 10080 + }, + { + "epoch": 38.51145038167939, + "grad_norm": 0.36909425258636475, + "learning_rate": 7.921509249691865e-06, + "loss": 0.0937, + "step": 10090 + }, + { + "epoch": 38.54961832061069, + "grad_norm": 0.8461699485778809, + "learning_rate": 7.917034639098578e-06, + "loss": 0.0862, + "step": 10100 + }, + { + "epoch": 38.587786259541986, + "grad_norm": 0.45856791734695435, + "learning_rate": 7.912556484033146e-06, + "loss": 0.0915, + "step": 10110 + }, + { + "epoch": 38.62595419847328, + "grad_norm": 0.49433228373527527, + "learning_rate": 7.908074789936952e-06, + "loss": 0.0774, + "step": 10120 + }, + { + "epoch": 38.66412213740458, + "grad_norm": 0.3476361334323883, + "learning_rate": 7.903589562255673e-06, + "loss": 0.0933, + "step": 10130 + }, + { + "epoch": 38.70229007633588, + "grad_norm": 0.3297344744205475, + "learning_rate": 7.899100806439287e-06, + "loss": 0.0846, + "step": 10140 + }, + { + "epoch": 38.74045801526717, + "grad_norm": 0.36996668577194214, + "learning_rate": 7.894608527942049e-06, + "loss": 0.0948, + "step": 10150 + }, + { + "epoch": 38.778625954198475, + "grad_norm": 0.7153366804122925, + "learning_rate": 7.89011273222251e-06, + "loss": 0.0868, + "step": 10160 + }, + { + "epoch": 38.81679389312977, + "grad_norm": 0.571403443813324, + "learning_rate": 7.88561342474348e-06, + "loss": 0.0801, + "step": 10170 + }, + { + "epoch": 38.85496183206107, + "grad_norm": 0.5913143157958984, + "learning_rate": 7.881110610972045e-06, + "loss": 0.085, + "step": 10180 + }, + { + "epoch": 38.89312977099237, + "grad_norm": 0.36453530192375183, + "learning_rate": 7.876604296379545e-06, + "loss": 0.0777, + "step": 10190 + }, + { + "epoch": 38.93129770992366, + "grad_norm": 0.5440592169761658, + "learning_rate": 7.87209448644158e-06, + "loss": 0.0882, + "step": 10200 + }, + { + "epoch": 38.969465648854964, + "grad_norm": 0.5457788705825806, + "learning_rate": 7.867581186637991e-06, + "loss": 0.0779, + "step": 10210 + }, + { + "epoch": 39.00763358778626, + "grad_norm": 0.6085828542709351, + "learning_rate": 7.863064402452867e-06, + "loss": 0.088, + "step": 10220 + }, + { + "epoch": 39.045801526717554, + "grad_norm": 1.3797415494918823, + "learning_rate": 7.858544139374524e-06, + "loss": 0.0929, + "step": 10230 + }, + { + "epoch": 39.083969465648856, + "grad_norm": 0.5593680739402771, + "learning_rate": 7.854020402895508e-06, + "loss": 0.0891, + "step": 10240 + }, + { + "epoch": 39.12213740458015, + "grad_norm": 0.7378935813903809, + "learning_rate": 7.849493198512587e-06, + "loss": 0.085, + "step": 10250 + }, + { + "epoch": 39.16030534351145, + "grad_norm": 0.5513617992401123, + "learning_rate": 7.844962531726742e-06, + "loss": 0.0924, + "step": 10260 + }, + { + "epoch": 39.19847328244275, + "grad_norm": 0.47100234031677246, + "learning_rate": 7.840428408043156e-06, + "loss": 0.087, + "step": 10270 + }, + { + "epoch": 39.23664122137404, + "grad_norm": 0.7264495491981506, + "learning_rate": 7.835890832971218e-06, + "loss": 0.0797, + "step": 10280 + }, + { + "epoch": 39.274809160305345, + "grad_norm": 0.6747552752494812, + "learning_rate": 7.831349812024513e-06, + "loss": 0.0816, + "step": 10290 + }, + { + "epoch": 39.31297709923664, + "grad_norm": 0.568621814250946, + "learning_rate": 7.826805350720807e-06, + "loss": 0.0829, + "step": 10300 + }, + { + "epoch": 39.35114503816794, + "grad_norm": 0.8591163158416748, + "learning_rate": 7.82225745458205e-06, + "loss": 0.0945, + "step": 10310 + }, + { + "epoch": 39.38931297709924, + "grad_norm": 0.6126816272735596, + "learning_rate": 7.817706129134363e-06, + "loss": 0.09, + "step": 10320 + }, + { + "epoch": 39.42748091603053, + "grad_norm": 0.5068755745887756, + "learning_rate": 7.813151379908037e-06, + "loss": 0.085, + "step": 10330 + }, + { + "epoch": 39.465648854961835, + "grad_norm": 0.5643684267997742, + "learning_rate": 7.808593212437523e-06, + "loss": 0.0795, + "step": 10340 + }, + { + "epoch": 39.50381679389313, + "grad_norm": 0.2955627143383026, + "learning_rate": 7.804031632261421e-06, + "loss": 0.0744, + "step": 10350 + }, + { + "epoch": 39.541984732824424, + "grad_norm": 0.4418695569038391, + "learning_rate": 7.799466644922484e-06, + "loss": 0.0872, + "step": 10360 + }, + { + "epoch": 39.58015267175573, + "grad_norm": 0.5301413536071777, + "learning_rate": 7.794898255967602e-06, + "loss": 0.0899, + "step": 10370 + }, + { + "epoch": 39.61832061068702, + "grad_norm": 0.5224511623382568, + "learning_rate": 7.790326470947796e-06, + "loss": 0.0891, + "step": 10380 + }, + { + "epoch": 39.656488549618324, + "grad_norm": 0.447360634803772, + "learning_rate": 7.785751295418218e-06, + "loss": 0.0831, + "step": 10390 + }, + { + "epoch": 39.69465648854962, + "grad_norm": 0.6668679118156433, + "learning_rate": 7.781172734938136e-06, + "loss": 0.0857, + "step": 10400 + }, + { + "epoch": 39.732824427480914, + "grad_norm": 0.5265706777572632, + "learning_rate": 7.776590795070932e-06, + "loss": 0.0826, + "step": 10410 + }, + { + "epoch": 39.770992366412216, + "grad_norm": 0.44735923409461975, + "learning_rate": 7.772005481384099e-06, + "loss": 0.0889, + "step": 10420 + }, + { + "epoch": 39.80916030534351, + "grad_norm": 0.4243389070034027, + "learning_rate": 7.767416799449223e-06, + "loss": 0.0784, + "step": 10430 + }, + { + "epoch": 39.847328244274806, + "grad_norm": 0.5234137177467346, + "learning_rate": 7.762824754841985e-06, + "loss": 0.0805, + "step": 10440 + }, + { + "epoch": 39.88549618320611, + "grad_norm": 0.6243206858634949, + "learning_rate": 7.758229353142153e-06, + "loss": 0.08, + "step": 10450 + }, + { + "epoch": 39.9236641221374, + "grad_norm": 0.368437796831131, + "learning_rate": 7.753630599933572e-06, + "loss": 0.0857, + "step": 10460 + }, + { + "epoch": 39.961832061068705, + "grad_norm": 0.6152259707450867, + "learning_rate": 7.74902850080416e-06, + "loss": 0.0829, + "step": 10470 + }, + { + "epoch": 40.0, + "grad_norm": 0.43827810883522034, + "learning_rate": 7.744423061345907e-06, + "loss": 0.0865, + "step": 10480 + }, + { + "epoch": 40.038167938931295, + "grad_norm": 0.39631178975105286, + "learning_rate": 7.73981428715485e-06, + "loss": 0.0917, + "step": 10490 + }, + { + "epoch": 40.0763358778626, + "grad_norm": 0.45038852095603943, + "learning_rate": 7.735202183831085e-06, + "loss": 0.0864, + "step": 10500 + }, + { + "epoch": 40.11450381679389, + "grad_norm": 0.451773077249527, + "learning_rate": 7.730586756978758e-06, + "loss": 0.0758, + "step": 10510 + }, + { + "epoch": 40.152671755725194, + "grad_norm": 0.48386844992637634, + "learning_rate": 7.72596801220604e-06, + "loss": 0.0782, + "step": 10520 + }, + { + "epoch": 40.19083969465649, + "grad_norm": 0.8366134166717529, + "learning_rate": 7.721345955125147e-06, + "loss": 0.0861, + "step": 10530 + }, + { + "epoch": 40.229007633587784, + "grad_norm": 0.4540402889251709, + "learning_rate": 7.716720591352311e-06, + "loss": 0.0814, + "step": 10540 + }, + { + "epoch": 40.267175572519086, + "grad_norm": 0.7006372213363647, + "learning_rate": 7.712091926507788e-06, + "loss": 0.0939, + "step": 10550 + }, + { + "epoch": 40.30534351145038, + "grad_norm": 0.4313132166862488, + "learning_rate": 7.70745996621584e-06, + "loss": 0.0869, + "step": 10560 + }, + { + "epoch": 40.343511450381676, + "grad_norm": 0.33567196130752563, + "learning_rate": 7.702824716104736e-06, + "loss": 0.0924, + "step": 10570 + }, + { + "epoch": 40.38167938931298, + "grad_norm": 0.5061486959457397, + "learning_rate": 7.698186181806744e-06, + "loss": 0.0839, + "step": 10580 + }, + { + "epoch": 40.41984732824427, + "grad_norm": 0.4450092017650604, + "learning_rate": 7.693544368958116e-06, + "loss": 0.0804, + "step": 10590 + }, + { + "epoch": 40.458015267175576, + "grad_norm": 0.8481007814407349, + "learning_rate": 7.688899283199097e-06, + "loss": 0.0778, + "step": 10600 + }, + { + "epoch": 40.49618320610687, + "grad_norm": 0.34229525923728943, + "learning_rate": 7.684250930173902e-06, + "loss": 0.0805, + "step": 10610 + }, + { + "epoch": 40.534351145038165, + "grad_norm": 0.39725250005722046, + "learning_rate": 7.679599315530717e-06, + "loss": 0.0836, + "step": 10620 + }, + { + "epoch": 40.57251908396947, + "grad_norm": 0.4148560166358948, + "learning_rate": 7.674944444921696e-06, + "loss": 0.0881, + "step": 10630 + }, + { + "epoch": 40.61068702290076, + "grad_norm": 0.3573581874370575, + "learning_rate": 7.670286324002943e-06, + "loss": 0.0754, + "step": 10640 + }, + { + "epoch": 40.64885496183206, + "grad_norm": 0.4562970995903015, + "learning_rate": 7.665624958434514e-06, + "loss": 0.0943, + "step": 10650 + }, + { + "epoch": 40.68702290076336, + "grad_norm": 0.7594497799873352, + "learning_rate": 7.66096035388041e-06, + "loss": 0.0772, + "step": 10660 + }, + { + "epoch": 40.725190839694655, + "grad_norm": 0.5188819169998169, + "learning_rate": 7.656292516008563e-06, + "loss": 0.079, + "step": 10670 + }, + { + "epoch": 40.76335877862596, + "grad_norm": 0.48357266187667847, + "learning_rate": 7.651621450490836e-06, + "loss": 0.0822, + "step": 10680 + }, + { + "epoch": 40.80152671755725, + "grad_norm": 0.46848195791244507, + "learning_rate": 7.646947163003017e-06, + "loss": 0.0769, + "step": 10690 + }, + { + "epoch": 40.83969465648855, + "grad_norm": 0.448741614818573, + "learning_rate": 7.642269659224804e-06, + "loss": 0.0817, + "step": 10700 + }, + { + "epoch": 40.87786259541985, + "grad_norm": 0.2872468829154968, + "learning_rate": 7.637588944839803e-06, + "loss": 0.0789, + "step": 10710 + }, + { + "epoch": 40.916030534351144, + "grad_norm": 0.641671359539032, + "learning_rate": 7.632905025535529e-06, + "loss": 0.0874, + "step": 10720 + }, + { + "epoch": 40.954198473282446, + "grad_norm": 0.39625340700149536, + "learning_rate": 7.628217907003379e-06, + "loss": 0.0761, + "step": 10730 + }, + { + "epoch": 40.99236641221374, + "grad_norm": 0.47728121280670166, + "learning_rate": 7.623527594938649e-06, + "loss": 0.0771, + "step": 10740 + }, + { + "epoch": 41.030534351145036, + "grad_norm": 0.42139771580696106, + "learning_rate": 7.618834095040508e-06, + "loss": 0.0797, + "step": 10750 + }, + { + "epoch": 41.06870229007634, + "grad_norm": 0.5375596284866333, + "learning_rate": 7.614137413012001e-06, + "loss": 0.0822, + "step": 10760 + }, + { + "epoch": 41.10687022900763, + "grad_norm": 0.26976653933525085, + "learning_rate": 7.609437554560042e-06, + "loss": 0.0758, + "step": 10770 + }, + { + "epoch": 41.14503816793893, + "grad_norm": 0.293875515460968, + "learning_rate": 7.604734525395398e-06, + "loss": 0.0801, + "step": 10780 + }, + { + "epoch": 41.18320610687023, + "grad_norm": 1.289528727531433, + "learning_rate": 7.600028331232698e-06, + "loss": 0.0854, + "step": 10790 + }, + { + "epoch": 41.221374045801525, + "grad_norm": 0.6084293723106384, + "learning_rate": 7.595318977790408e-06, + "loss": 0.0744, + "step": 10800 + }, + { + "epoch": 41.25954198473283, + "grad_norm": 0.660408079624176, + "learning_rate": 7.5906064707908355e-06, + "loss": 0.0777, + "step": 10810 + }, + { + "epoch": 41.29770992366412, + "grad_norm": 3.1526737213134766, + "learning_rate": 7.585890815960125e-06, + "loss": 0.0843, + "step": 10820 + }, + { + "epoch": 41.33587786259542, + "grad_norm": 0.5699805617332458, + "learning_rate": 7.581172019028238e-06, + "loss": 0.0812, + "step": 10830 + }, + { + "epoch": 41.37404580152672, + "grad_norm": 0.9830774068832397, + "learning_rate": 7.576450085728959e-06, + "loss": 0.0824, + "step": 10840 + }, + { + "epoch": 41.412213740458014, + "grad_norm": 0.4558395445346832, + "learning_rate": 7.571725021799885e-06, + "loss": 0.075, + "step": 10850 + }, + { + "epoch": 41.45038167938931, + "grad_norm": 0.4857986569404602, + "learning_rate": 7.566996832982409e-06, + "loss": 0.0835, + "step": 10860 + }, + { + "epoch": 41.48854961832061, + "grad_norm": 0.8208315968513489, + "learning_rate": 7.56226552502173e-06, + "loss": 0.0939, + "step": 10870 + }, + { + "epoch": 41.52671755725191, + "grad_norm": 0.6192477941513062, + "learning_rate": 7.557531103666833e-06, + "loss": 0.0813, + "step": 10880 + }, + { + "epoch": 41.56488549618321, + "grad_norm": 0.3482249081134796, + "learning_rate": 7.552793574670485e-06, + "loss": 0.0743, + "step": 10890 + }, + { + "epoch": 41.603053435114504, + "grad_norm": 0.35872194170951843, + "learning_rate": 7.5480529437892304e-06, + "loss": 0.0756, + "step": 10900 + }, + { + "epoch": 41.6412213740458, + "grad_norm": 0.38801950216293335, + "learning_rate": 7.543309216783384e-06, + "loss": 0.0712, + "step": 10910 + }, + { + "epoch": 41.6793893129771, + "grad_norm": 0.5348705053329468, + "learning_rate": 7.538562399417021e-06, + "loss": 0.077, + "step": 10920 + }, + { + "epoch": 41.717557251908396, + "grad_norm": 0.5075365304946899, + "learning_rate": 7.533812497457972e-06, + "loss": 0.0791, + "step": 10930 + }, + { + "epoch": 41.7557251908397, + "grad_norm": 0.32560238242149353, + "learning_rate": 7.529059516677815e-06, + "loss": 0.0706, + "step": 10940 + }, + { + "epoch": 41.79389312977099, + "grad_norm": 0.5091714859008789, + "learning_rate": 7.524303462851872e-06, + "loss": 0.0773, + "step": 10950 + }, + { + "epoch": 41.83206106870229, + "grad_norm": 0.5478523373603821, + "learning_rate": 7.519544341759193e-06, + "loss": 0.0739, + "step": 10960 + }, + { + "epoch": 41.87022900763359, + "grad_norm": 0.891545832157135, + "learning_rate": 7.514782159182562e-06, + "loss": 0.0806, + "step": 10970 + }, + { + "epoch": 41.908396946564885, + "grad_norm": 0.6090933680534363, + "learning_rate": 7.510016920908481e-06, + "loss": 0.0795, + "step": 10980 + }, + { + "epoch": 41.94656488549618, + "grad_norm": 0.9301139116287231, + "learning_rate": 7.505248632727159e-06, + "loss": 0.0815, + "step": 10990 + }, + { + "epoch": 41.98473282442748, + "grad_norm": 0.6181934475898743, + "learning_rate": 7.50047730043252e-06, + "loss": 0.0806, + "step": 11000 + }, + { + "epoch": 42.02290076335878, + "grad_norm": 0.5225881934165955, + "learning_rate": 7.495702929822183e-06, + "loss": 0.0746, + "step": 11010 + }, + { + "epoch": 42.06106870229008, + "grad_norm": 0.7685074806213379, + "learning_rate": 7.490925526697455e-06, + "loss": 0.0796, + "step": 11020 + }, + { + "epoch": 42.099236641221374, + "grad_norm": 0.475824773311615, + "learning_rate": 7.486145096863334e-06, + "loss": 0.0735, + "step": 11030 + }, + { + "epoch": 42.13740458015267, + "grad_norm": 0.3295309245586395, + "learning_rate": 7.481361646128491e-06, + "loss": 0.0705, + "step": 11040 + }, + { + "epoch": 42.17557251908397, + "grad_norm": 0.2535337805747986, + "learning_rate": 7.476575180305271e-06, + "loss": 0.0808, + "step": 11050 + }, + { + "epoch": 42.213740458015266, + "grad_norm": 0.4357345402240753, + "learning_rate": 7.471785705209682e-06, + "loss": 0.075, + "step": 11060 + }, + { + "epoch": 42.25190839694657, + "grad_norm": 0.9749108552932739, + "learning_rate": 7.4669932266613875e-06, + "loss": 0.0795, + "step": 11070 + }, + { + "epoch": 42.29007633587786, + "grad_norm": 0.428632527589798, + "learning_rate": 7.4621977504837e-06, + "loss": 0.0723, + "step": 11080 + }, + { + "epoch": 42.32824427480916, + "grad_norm": 0.26020991802215576, + "learning_rate": 7.457399282503574e-06, + "loss": 0.0853, + "step": 11090 + }, + { + "epoch": 42.36641221374046, + "grad_norm": 0.3331349492073059, + "learning_rate": 7.4525978285516046e-06, + "loss": 0.0757, + "step": 11100 + }, + { + "epoch": 42.404580152671755, + "grad_norm": 0.8639835119247437, + "learning_rate": 7.447793394462006e-06, + "loss": 0.078, + "step": 11110 + }, + { + "epoch": 42.44274809160305, + "grad_norm": 2.107271909713745, + "learning_rate": 7.442985986072624e-06, + "loss": 0.0769, + "step": 11120 + }, + { + "epoch": 42.48091603053435, + "grad_norm": 0.25591397285461426, + "learning_rate": 7.438175609224908e-06, + "loss": 0.0764, + "step": 11130 + }, + { + "epoch": 42.51908396946565, + "grad_norm": 0.346733957529068, + "learning_rate": 7.433362269763924e-06, + "loss": 0.0811, + "step": 11140 + }, + { + "epoch": 42.55725190839695, + "grad_norm": 0.29196739196777344, + "learning_rate": 7.428545973538329e-06, + "loss": 0.0797, + "step": 11150 + }, + { + "epoch": 42.595419847328245, + "grad_norm": 0.6709998846054077, + "learning_rate": 7.423726726400381e-06, + "loss": 0.084, + "step": 11160 + }, + { + "epoch": 42.63358778625954, + "grad_norm": 0.46684983372688293, + "learning_rate": 7.418904534205917e-06, + "loss": 0.0788, + "step": 11170 + }, + { + "epoch": 42.67175572519084, + "grad_norm": 0.39609742164611816, + "learning_rate": 7.414079402814356e-06, + "loss": 0.0734, + "step": 11180 + }, + { + "epoch": 42.70992366412214, + "grad_norm": 0.38470694422721863, + "learning_rate": 7.4092513380886876e-06, + "loss": 0.0797, + "step": 11190 + }, + { + "epoch": 42.74809160305343, + "grad_norm": 0.26760122179985046, + "learning_rate": 7.4044203458954665e-06, + "loss": 0.084, + "step": 11200 + }, + { + "epoch": 42.786259541984734, + "grad_norm": 0.8822160363197327, + "learning_rate": 7.3995864321048036e-06, + "loss": 0.0837, + "step": 11210 + }, + { + "epoch": 42.82442748091603, + "grad_norm": 0.42078760266304016, + "learning_rate": 7.394749602590359e-06, + "loss": 0.0775, + "step": 11220 + }, + { + "epoch": 42.86259541984733, + "grad_norm": 0.2612259089946747, + "learning_rate": 7.389909863229337e-06, + "loss": 0.0713, + "step": 11230 + }, + { + "epoch": 42.900763358778626, + "grad_norm": 0.6772575974464417, + "learning_rate": 7.385067219902478e-06, + "loss": 0.0813, + "step": 11240 + }, + { + "epoch": 42.93893129770992, + "grad_norm": 0.37130260467529297, + "learning_rate": 7.380221678494048e-06, + "loss": 0.081, + "step": 11250 + }, + { + "epoch": 42.97709923664122, + "grad_norm": 0.30204957723617554, + "learning_rate": 7.375373244891839e-06, + "loss": 0.0723, + "step": 11260 + }, + { + "epoch": 43.01526717557252, + "grad_norm": 0.515661358833313, + "learning_rate": 7.370521924987155e-06, + "loss": 0.078, + "step": 11270 + }, + { + "epoch": 43.05343511450382, + "grad_norm": 0.40739643573760986, + "learning_rate": 7.3656677246748064e-06, + "loss": 0.0858, + "step": 11280 + }, + { + "epoch": 43.091603053435115, + "grad_norm": 0.37059029936790466, + "learning_rate": 7.360810649853105e-06, + "loss": 0.073, + "step": 11290 + }, + { + "epoch": 43.12977099236641, + "grad_norm": 0.36438652873039246, + "learning_rate": 7.355950706423854e-06, + "loss": 0.1083, + "step": 11300 + }, + { + "epoch": 43.16793893129771, + "grad_norm": 0.4451878070831299, + "learning_rate": 7.351087900292342e-06, + "loss": 0.0794, + "step": 11310 + }, + { + "epoch": 43.20610687022901, + "grad_norm": 0.5628499984741211, + "learning_rate": 7.346222237367339e-06, + "loss": 0.0822, + "step": 11320 + }, + { + "epoch": 43.2442748091603, + "grad_norm": 0.374586820602417, + "learning_rate": 7.341353723561082e-06, + "loss": 0.0789, + "step": 11330 + }, + { + "epoch": 43.282442748091604, + "grad_norm": 0.8239962458610535, + "learning_rate": 7.336482364789277e-06, + "loss": 0.0809, + "step": 11340 + }, + { + "epoch": 43.3206106870229, + "grad_norm": 0.36449357867240906, + "learning_rate": 7.331608166971082e-06, + "loss": 0.0776, + "step": 11350 + }, + { + "epoch": 43.3587786259542, + "grad_norm": 0.5167990326881409, + "learning_rate": 7.326731136029108e-06, + "loss": 0.089, + "step": 11360 + }, + { + "epoch": 43.396946564885496, + "grad_norm": 0.6136403679847717, + "learning_rate": 7.321851277889408e-06, + "loss": 0.0766, + "step": 11370 + }, + { + "epoch": 43.43511450381679, + "grad_norm": 0.35888543725013733, + "learning_rate": 7.31696859848147e-06, + "loss": 0.0896, + "step": 11380 + }, + { + "epoch": 43.47328244274809, + "grad_norm": 0.4521687626838684, + "learning_rate": 7.312083103738207e-06, + "loss": 0.0749, + "step": 11390 + }, + { + "epoch": 43.51145038167939, + "grad_norm": 0.4348390996456146, + "learning_rate": 7.307194799595958e-06, + "loss": 0.0852, + "step": 11400 + }, + { + "epoch": 43.54961832061069, + "grad_norm": 0.7922037839889526, + "learning_rate": 7.302303691994474e-06, + "loss": 0.0793, + "step": 11410 + }, + { + "epoch": 43.587786259541986, + "grad_norm": 0.3498508334159851, + "learning_rate": 7.29740978687691e-06, + "loss": 0.0883, + "step": 11420 + }, + { + "epoch": 43.62595419847328, + "grad_norm": 0.3470660448074341, + "learning_rate": 7.292513090189824e-06, + "loss": 0.0693, + "step": 11430 + }, + { + "epoch": 43.66412213740458, + "grad_norm": 0.2837408483028412, + "learning_rate": 7.287613607883164e-06, + "loss": 0.0732, + "step": 11440 + }, + { + "epoch": 43.70229007633588, + "grad_norm": 0.5988171696662903, + "learning_rate": 7.282711345910263e-06, + "loss": 0.0734, + "step": 11450 + }, + { + "epoch": 43.74045801526717, + "grad_norm": 0.5002927780151367, + "learning_rate": 7.277806310227831e-06, + "loss": 0.0744, + "step": 11460 + }, + { + "epoch": 43.778625954198475, + "grad_norm": 0.4815424382686615, + "learning_rate": 7.272898506795948e-06, + "loss": 0.0866, + "step": 11470 + }, + { + "epoch": 43.81679389312977, + "grad_norm": 0.4680640697479248, + "learning_rate": 7.267987941578058e-06, + "loss": 0.0883, + "step": 11480 + }, + { + "epoch": 43.85496183206107, + "grad_norm": 0.3622831404209137, + "learning_rate": 7.263074620540963e-06, + "loss": 0.0812, + "step": 11490 + }, + { + "epoch": 43.89312977099237, + "grad_norm": 0.289620965719223, + "learning_rate": 7.25815854965481e-06, + "loss": 0.0788, + "step": 11500 + }, + { + "epoch": 43.93129770992366, + "grad_norm": 0.4161958396434784, + "learning_rate": 7.253239734893089e-06, + "loss": 0.0877, + "step": 11510 + }, + { + "epoch": 43.969465648854964, + "grad_norm": 0.27894461154937744, + "learning_rate": 7.248318182232623e-06, + "loss": 0.0706, + "step": 11520 + }, + { + "epoch": 44.00763358778626, + "grad_norm": 0.4374925494194031, + "learning_rate": 7.243393897653565e-06, + "loss": 0.0757, + "step": 11530 + }, + { + "epoch": 44.045801526717554, + "grad_norm": 0.4744294285774231, + "learning_rate": 7.23846688713938e-06, + "loss": 0.0805, + "step": 11540 + }, + { + "epoch": 44.083969465648856, + "grad_norm": 0.7247238755226135, + "learning_rate": 7.233537156676854e-06, + "loss": 0.0801, + "step": 11550 + }, + { + "epoch": 44.12213740458015, + "grad_norm": 0.5223896503448486, + "learning_rate": 7.228604712256076e-06, + "loss": 0.0763, + "step": 11560 + }, + { + "epoch": 44.16030534351145, + "grad_norm": 0.32578030228614807, + "learning_rate": 7.2236695598704265e-06, + "loss": 0.0667, + "step": 11570 + }, + { + "epoch": 44.19847328244275, + "grad_norm": 0.809870183467865, + "learning_rate": 7.218731705516585e-06, + "loss": 0.072, + "step": 11580 + }, + { + "epoch": 44.23664122137404, + "grad_norm": 0.722334623336792, + "learning_rate": 7.21379115519451e-06, + "loss": 0.093, + "step": 11590 + }, + { + "epoch": 44.274809160305345, + "grad_norm": 0.8618139028549194, + "learning_rate": 7.208847914907431e-06, + "loss": 0.0869, + "step": 11600 + }, + { + "epoch": 44.31297709923664, + "grad_norm": 0.40315625071525574, + "learning_rate": 7.203901990661857e-06, + "loss": 0.0805, + "step": 11610 + }, + { + "epoch": 44.35114503816794, + "grad_norm": 0.5582493543624878, + "learning_rate": 7.1989533884675486e-06, + "loss": 0.0782, + "step": 11620 + }, + { + "epoch": 44.38931297709924, + "grad_norm": 0.5148290395736694, + "learning_rate": 7.1940021143375264e-06, + "loss": 0.0817, + "step": 11630 + }, + { + "epoch": 44.42748091603053, + "grad_norm": 0.685417652130127, + "learning_rate": 7.189048174288054e-06, + "loss": 0.0824, + "step": 11640 + }, + { + "epoch": 44.465648854961835, + "grad_norm": 0.4359172284603119, + "learning_rate": 7.184091574338637e-06, + "loss": 0.0813, + "step": 11650 + }, + { + "epoch": 44.50381679389313, + "grad_norm": 0.37148523330688477, + "learning_rate": 7.179132320512009e-06, + "loss": 0.0835, + "step": 11660 + }, + { + "epoch": 44.541984732824424, + "grad_norm": 0.714996337890625, + "learning_rate": 7.174170418834134e-06, + "loss": 0.0835, + "step": 11670 + }, + { + "epoch": 44.58015267175573, + "grad_norm": 0.30582910776138306, + "learning_rate": 7.1692058753341885e-06, + "loss": 0.071, + "step": 11680 + }, + { + "epoch": 44.61832061068702, + "grad_norm": 0.4738406836986542, + "learning_rate": 7.164238696044562e-06, + "loss": 0.0722, + "step": 11690 + }, + { + "epoch": 44.656488549618324, + "grad_norm": 0.39357179403305054, + "learning_rate": 7.159268887000845e-06, + "loss": 0.075, + "step": 11700 + }, + { + "epoch": 44.69465648854962, + "grad_norm": 0.4723822772502899, + "learning_rate": 7.1542964542418265e-06, + "loss": 0.0876, + "step": 11710 + }, + { + "epoch": 44.732824427480914, + "grad_norm": 0.43717026710510254, + "learning_rate": 7.149321403809478e-06, + "loss": 0.0779, + "step": 11720 + }, + { + "epoch": 44.770992366412216, + "grad_norm": 0.40371444821357727, + "learning_rate": 7.1443437417489555e-06, + "loss": 0.073, + "step": 11730 + }, + { + "epoch": 44.80916030534351, + "grad_norm": 0.5036951899528503, + "learning_rate": 7.13936347410859e-06, + "loss": 0.0821, + "step": 11740 + }, + { + "epoch": 44.847328244274806, + "grad_norm": 0.4972344934940338, + "learning_rate": 7.1343806069398745e-06, + "loss": 0.0886, + "step": 11750 + }, + { + "epoch": 44.88549618320611, + "grad_norm": 0.4003848135471344, + "learning_rate": 7.12939514629746e-06, + "loss": 0.0782, + "step": 11760 + }, + { + "epoch": 44.9236641221374, + "grad_norm": 0.36399978399276733, + "learning_rate": 7.1244070982391556e-06, + "loss": 0.0918, + "step": 11770 + }, + { + "epoch": 44.961832061068705, + "grad_norm": 0.2910782992839813, + "learning_rate": 7.119416468825908e-06, + "loss": 0.0756, + "step": 11780 + }, + { + "epoch": 45.0, + "grad_norm": 0.429004043340683, + "learning_rate": 7.114423264121804e-06, + "loss": 0.0846, + "step": 11790 + }, + { + "epoch": 45.038167938931295, + "grad_norm": 0.3626013994216919, + "learning_rate": 7.1094274901940566e-06, + "loss": 0.1058, + "step": 11800 + }, + { + "epoch": 45.0763358778626, + "grad_norm": 0.631897509098053, + "learning_rate": 7.104429153113001e-06, + "loss": 0.0802, + "step": 11810 + }, + { + "epoch": 45.11450381679389, + "grad_norm": 0.4777592122554779, + "learning_rate": 7.099428258952092e-06, + "loss": 0.0744, + "step": 11820 + }, + { + "epoch": 45.152671755725194, + "grad_norm": 0.4223073422908783, + "learning_rate": 7.094424813787883e-06, + "loss": 0.0749, + "step": 11830 + }, + { + "epoch": 45.19083969465649, + "grad_norm": 0.3444991707801819, + "learning_rate": 7.089418823700035e-06, + "loss": 0.0805, + "step": 11840 + }, + { + "epoch": 45.229007633587784, + "grad_norm": 0.41768884658813477, + "learning_rate": 7.084410294771298e-06, + "loss": 0.0725, + "step": 11850 + }, + { + "epoch": 45.267175572519086, + "grad_norm": 0.25872498750686646, + "learning_rate": 7.079399233087504e-06, + "loss": 0.076, + "step": 11860 + }, + { + "epoch": 45.30534351145038, + "grad_norm": 0.35184618830680847, + "learning_rate": 7.074385644737568e-06, + "loss": 0.0768, + "step": 11870 + }, + { + "epoch": 45.343511450381676, + "grad_norm": 0.6684393286705017, + "learning_rate": 7.069369535813473e-06, + "loss": 0.0831, + "step": 11880 + }, + { + "epoch": 45.38167938931298, + "grad_norm": 0.6411822438240051, + "learning_rate": 7.064350912410261e-06, + "loss": 0.0759, + "step": 11890 + }, + { + "epoch": 45.41984732824427, + "grad_norm": 0.363640695810318, + "learning_rate": 7.059329780626036e-06, + "loss": 0.0761, + "step": 11900 + }, + { + "epoch": 45.458015267175576, + "grad_norm": 0.5293934345245361, + "learning_rate": 7.054306146561944e-06, + "loss": 0.0751, + "step": 11910 + }, + { + "epoch": 45.49618320610687, + "grad_norm": 0.2698386609554291, + "learning_rate": 7.049280016322177e-06, + "loss": 0.0762, + "step": 11920 + }, + { + "epoch": 45.534351145038165, + "grad_norm": 0.39354628324508667, + "learning_rate": 7.044251396013957e-06, + "loss": 0.0678, + "step": 11930 + }, + { + "epoch": 45.57251908396947, + "grad_norm": 0.8730633854866028, + "learning_rate": 7.039220291747528e-06, + "loss": 0.0888, + "step": 11940 + }, + { + "epoch": 45.61068702290076, + "grad_norm": 0.4490543603897095, + "learning_rate": 7.034186709636159e-06, + "loss": 0.0761, + "step": 11950 + }, + { + "epoch": 45.64885496183206, + "grad_norm": 0.39466390013694763, + "learning_rate": 7.029150655796129e-06, + "loss": 0.0722, + "step": 11960 + }, + { + "epoch": 45.68702290076336, + "grad_norm": 0.3895810842514038, + "learning_rate": 7.024112136346713e-06, + "loss": 0.0774, + "step": 11970 + }, + { + "epoch": 45.725190839694655, + "grad_norm": 0.816741406917572, + "learning_rate": 7.019071157410191e-06, + "loss": 0.0879, + "step": 11980 + }, + { + "epoch": 45.76335877862596, + "grad_norm": 0.5164619088172913, + "learning_rate": 7.014027725111826e-06, + "loss": 0.0811, + "step": 11990 + }, + { + "epoch": 45.80152671755725, + "grad_norm": 0.5740540623664856, + "learning_rate": 7.0089818455798655e-06, + "loss": 0.1007, + "step": 12000 + }, + { + "epoch": 45.83969465648855, + "grad_norm": 0.5206035375595093, + "learning_rate": 7.0039335249455285e-06, + "loss": 0.0829, + "step": 12010 + }, + { + "epoch": 45.87786259541985, + "grad_norm": 0.7271233797073364, + "learning_rate": 6.998882769342998e-06, + "loss": 0.0747, + "step": 12020 + }, + { + "epoch": 45.916030534351144, + "grad_norm": 0.59157794713974, + "learning_rate": 6.993829584909423e-06, + "loss": 0.0856, + "step": 12030 + }, + { + "epoch": 45.954198473282446, + "grad_norm": 0.6484169363975525, + "learning_rate": 6.988773977784895e-06, + "loss": 0.0833, + "step": 12040 + }, + { + "epoch": 45.99236641221374, + "grad_norm": 0.494017094373703, + "learning_rate": 6.9837159541124544e-06, + "loss": 0.085, + "step": 12050 + }, + { + "epoch": 46.030534351145036, + "grad_norm": 0.5254559516906738, + "learning_rate": 6.978655520038079e-06, + "loss": 0.079, + "step": 12060 + }, + { + "epoch": 46.06870229007634, + "grad_norm": 0.5512828230857849, + "learning_rate": 6.9735926817106704e-06, + "loss": 0.0768, + "step": 12070 + }, + { + "epoch": 46.10687022900763, + "grad_norm": 0.48154160380363464, + "learning_rate": 6.968527445282056e-06, + "loss": 0.0768, + "step": 12080 + }, + { + "epoch": 46.14503816793893, + "grad_norm": 0.4283272624015808, + "learning_rate": 6.963459816906977e-06, + "loss": 0.0804, + "step": 12090 + }, + { + "epoch": 46.18320610687023, + "grad_norm": 1.275875449180603, + "learning_rate": 6.958389802743078e-06, + "loss": 0.0798, + "step": 12100 + }, + { + "epoch": 46.221374045801525, + "grad_norm": 0.565065324306488, + "learning_rate": 6.953317408950903e-06, + "loss": 0.0735, + "step": 12110 + }, + { + "epoch": 46.25954198473283, + "grad_norm": 0.7162991762161255, + "learning_rate": 6.948242641693891e-06, + "loss": 0.0741, + "step": 12120 + }, + { + "epoch": 46.29770992366412, + "grad_norm": 0.41520068049430847, + "learning_rate": 6.9431655071383605e-06, + "loss": 0.0735, + "step": 12130 + }, + { + "epoch": 46.33587786259542, + "grad_norm": 0.3980543613433838, + "learning_rate": 6.938086011453513e-06, + "loss": 0.0822, + "step": 12140 + }, + { + "epoch": 46.37404580152672, + "grad_norm": 0.28883951902389526, + "learning_rate": 6.93300416081141e-06, + "loss": 0.0836, + "step": 12150 + }, + { + "epoch": 46.412213740458014, + "grad_norm": 0.5937278270721436, + "learning_rate": 6.927919961386984e-06, + "loss": 0.0764, + "step": 12160 + }, + { + "epoch": 46.45038167938931, + "grad_norm": 0.40771183371543884, + "learning_rate": 6.922833419358012e-06, + "loss": 0.0831, + "step": 12170 + }, + { + "epoch": 46.48854961832061, + "grad_norm": 0.33484435081481934, + "learning_rate": 6.917744540905125e-06, + "loss": 0.0801, + "step": 12180 + }, + { + "epoch": 46.52671755725191, + "grad_norm": 0.5971487164497375, + "learning_rate": 6.9126533322117875e-06, + "loss": 0.0843, + "step": 12190 + }, + { + "epoch": 46.56488549618321, + "grad_norm": 0.612615704536438, + "learning_rate": 6.9075597994643e-06, + "loss": 0.0836, + "step": 12200 + }, + { + "epoch": 46.603053435114504, + "grad_norm": 0.2975360155105591, + "learning_rate": 6.902463948851786e-06, + "loss": 0.0734, + "step": 12210 + }, + { + "epoch": 46.6412213740458, + "grad_norm": 0.4995099604129791, + "learning_rate": 6.897365786566184e-06, + "loss": 0.0795, + "step": 12220 + }, + { + "epoch": 46.6793893129771, + "grad_norm": 0.3913675844669342, + "learning_rate": 6.892265318802242e-06, + "loss": 0.0782, + "step": 12230 + }, + { + "epoch": 46.717557251908396, + "grad_norm": 0.4445989429950714, + "learning_rate": 6.887162551757507e-06, + "loss": 0.0844, + "step": 12240 + }, + { + "epoch": 46.7557251908397, + "grad_norm": 0.3842538297176361, + "learning_rate": 6.882057491632326e-06, + "loss": 0.0721, + "step": 12250 + }, + { + "epoch": 46.79389312977099, + "grad_norm": 0.3080384135246277, + "learning_rate": 6.876950144629824e-06, + "loss": 0.0702, + "step": 12260 + }, + { + "epoch": 46.83206106870229, + "grad_norm": 0.5869445204734802, + "learning_rate": 6.8718405169559114e-06, + "loss": 0.0687, + "step": 12270 + }, + { + "epoch": 46.87022900763359, + "grad_norm": 0.4181062579154968, + "learning_rate": 6.866728614819268e-06, + "loss": 0.0786, + "step": 12280 + }, + { + "epoch": 46.908396946564885, + "grad_norm": 0.749423086643219, + "learning_rate": 6.861614444431337e-06, + "loss": 0.0862, + "step": 12290 + }, + { + "epoch": 46.94656488549618, + "grad_norm": 0.5488717555999756, + "learning_rate": 6.856498012006318e-06, + "loss": 0.0881, + "step": 12300 + }, + { + "epoch": 46.98473282442748, + "grad_norm": 0.8571105003356934, + "learning_rate": 6.851379323761157e-06, + "loss": 0.0773, + "step": 12310 + }, + { + "epoch": 47.02290076335878, + "grad_norm": 0.5261535048484802, + "learning_rate": 6.846258385915545e-06, + "loss": 0.0912, + "step": 12320 + }, + { + "epoch": 47.06106870229008, + "grad_norm": 0.895200252532959, + "learning_rate": 6.841135204691902e-06, + "loss": 0.0858, + "step": 12330 + }, + { + "epoch": 47.099236641221374, + "grad_norm": 0.49354881048202515, + "learning_rate": 6.8360097863153775e-06, + "loss": 0.0762, + "step": 12340 + }, + { + "epoch": 47.13740458015267, + "grad_norm": 0.42916223406791687, + "learning_rate": 6.830882137013839e-06, + "loss": 0.0749, + "step": 12350 + }, + { + "epoch": 47.17557251908397, + "grad_norm": 0.6174752116203308, + "learning_rate": 6.825752263017863e-06, + "loss": 0.0797, + "step": 12360 + }, + { + "epoch": 47.213740458015266, + "grad_norm": 0.4598800241947174, + "learning_rate": 6.820620170560731e-06, + "loss": 0.0828, + "step": 12370 + }, + { + "epoch": 47.25190839694657, + "grad_norm": 0.3219451904296875, + "learning_rate": 6.815485865878418e-06, + "loss": 0.0845, + "step": 12380 + }, + { + "epoch": 47.29007633587786, + "grad_norm": 0.4897594451904297, + "learning_rate": 6.8103493552095875e-06, + "loss": 0.0874, + "step": 12390 + }, + { + "epoch": 47.32824427480916, + "grad_norm": 0.35891905426979065, + "learning_rate": 6.805210644795588e-06, + "loss": 0.0715, + "step": 12400 + }, + { + "epoch": 47.36641221374046, + "grad_norm": 0.6998705267906189, + "learning_rate": 6.8000697408804326e-06, + "loss": 0.0799, + "step": 12410 + }, + { + "epoch": 47.404580152671755, + "grad_norm": 0.7352116703987122, + "learning_rate": 6.794926649710807e-06, + "loss": 0.0873, + "step": 12420 + }, + { + "epoch": 47.44274809160305, + "grad_norm": 0.34309205412864685, + "learning_rate": 6.7897813775360536e-06, + "loss": 0.0753, + "step": 12430 + }, + { + "epoch": 47.48091603053435, + "grad_norm": 0.6588733792304993, + "learning_rate": 6.784633930608158e-06, + "loss": 0.0777, + "step": 12440 + }, + { + "epoch": 47.51908396946565, + "grad_norm": 0.43710416555404663, + "learning_rate": 6.779484315181759e-06, + "loss": 0.0841, + "step": 12450 + }, + { + "epoch": 47.55725190839695, + "grad_norm": 0.4144386649131775, + "learning_rate": 6.774332537514122e-06, + "loss": 0.0727, + "step": 12460 + }, + { + "epoch": 47.595419847328245, + "grad_norm": 0.3395448625087738, + "learning_rate": 6.769178603865143e-06, + "loss": 0.08, + "step": 12470 + }, + { + "epoch": 47.63358778625954, + "grad_norm": 0.38854730129241943, + "learning_rate": 6.764022520497337e-06, + "loss": 0.0768, + "step": 12480 + }, + { + "epoch": 47.67175572519084, + "grad_norm": 0.43010884523391724, + "learning_rate": 6.758864293675833e-06, + "loss": 0.0757, + "step": 12490 + }, + { + "epoch": 47.70992366412214, + "grad_norm": 0.31416982412338257, + "learning_rate": 6.753703929668363e-06, + "loss": 0.0828, + "step": 12500 + }, + { + "epoch": 47.74809160305343, + "grad_norm": 0.3258754312992096, + "learning_rate": 6.7485414347452535e-06, + "loss": 0.0707, + "step": 12510 + }, + { + "epoch": 47.786259541984734, + "grad_norm": 0.4140434265136719, + "learning_rate": 6.743376815179424e-06, + "loss": 0.0717, + "step": 12520 + }, + { + "epoch": 47.82442748091603, + "grad_norm": 0.3196008801460266, + "learning_rate": 6.738210077246377e-06, + "loss": 0.0771, + "step": 12530 + }, + { + "epoch": 47.86259541984733, + "grad_norm": 0.36054739356040955, + "learning_rate": 6.733041227224182e-06, + "loss": 0.0739, + "step": 12540 + }, + { + "epoch": 47.900763358778626, + "grad_norm": 0.32305601239204407, + "learning_rate": 6.72787027139348e-06, + "loss": 0.0793, + "step": 12550 + }, + { + "epoch": 47.93893129770992, + "grad_norm": 0.41172558069229126, + "learning_rate": 6.72269721603747e-06, + "loss": 0.0746, + "step": 12560 + }, + { + "epoch": 47.97709923664122, + "grad_norm": 0.5158224701881409, + "learning_rate": 6.717522067441904e-06, + "loss": 0.0709, + "step": 12570 + }, + { + "epoch": 48.01526717557252, + "grad_norm": 0.33051443099975586, + "learning_rate": 6.712344831895075e-06, + "loss": 0.0799, + "step": 12580 + }, + { + "epoch": 48.05343511450382, + "grad_norm": 0.2499360889196396, + "learning_rate": 6.707165515687811e-06, + "loss": 0.0933, + "step": 12590 + }, + { + "epoch": 48.091603053435115, + "grad_norm": 0.3040105700492859, + "learning_rate": 6.70198412511347e-06, + "loss": 0.0684, + "step": 12600 + }, + { + "epoch": 48.12977099236641, + "grad_norm": 0.5028464794158936, + "learning_rate": 6.696800666467931e-06, + "loss": 0.0763, + "step": 12610 + }, + { + "epoch": 48.16793893129771, + "grad_norm": 0.6218233704566956, + "learning_rate": 6.691615146049584e-06, + "loss": 0.0713, + "step": 12620 + }, + { + "epoch": 48.20610687022901, + "grad_norm": 1.142249345779419, + "learning_rate": 6.686427570159324e-06, + "loss": 0.0881, + "step": 12630 + }, + { + "epoch": 48.2442748091603, + "grad_norm": 0.40421727299690247, + "learning_rate": 6.681237945100549e-06, + "loss": 0.0699, + "step": 12640 + }, + { + "epoch": 48.282442748091604, + "grad_norm": 0.4574754238128662, + "learning_rate": 6.676046277179139e-06, + "loss": 0.0794, + "step": 12650 + }, + { + "epoch": 48.3206106870229, + "grad_norm": 0.33654555678367615, + "learning_rate": 6.670852572703462e-06, + "loss": 0.0722, + "step": 12660 + }, + { + "epoch": 48.3587786259542, + "grad_norm": 0.2677624523639679, + "learning_rate": 6.665656837984359e-06, + "loss": 0.0855, + "step": 12670 + }, + { + "epoch": 48.396946564885496, + "grad_norm": 0.7722212672233582, + "learning_rate": 6.660459079335136e-06, + "loss": 0.0804, + "step": 12680 + }, + { + "epoch": 48.43511450381679, + "grad_norm": 0.32007500529289246, + "learning_rate": 6.655259303071558e-06, + "loss": 0.0786, + "step": 12690 + }, + { + "epoch": 48.47328244274809, + "grad_norm": 0.28395479917526245, + "learning_rate": 6.650057515511849e-06, + "loss": 0.0741, + "step": 12700 + }, + { + "epoch": 48.51145038167939, + "grad_norm": 0.5436563491821289, + "learning_rate": 6.644853722976667e-06, + "loss": 0.0879, + "step": 12710 + }, + { + "epoch": 48.54961832061069, + "grad_norm": 0.5980028510093689, + "learning_rate": 6.639647931789114e-06, + "loss": 0.0855, + "step": 12720 + }, + { + "epoch": 48.587786259541986, + "grad_norm": 0.345226913690567, + "learning_rate": 6.634440148274712e-06, + "loss": 0.0693, + "step": 12730 + }, + { + "epoch": 48.62595419847328, + "grad_norm": 0.559097409248352, + "learning_rate": 6.6292303787614156e-06, + "loss": 0.0704, + "step": 12740 + }, + { + "epoch": 48.66412213740458, + "grad_norm": 0.2557688355445862, + "learning_rate": 6.624018629579582e-06, + "loss": 0.0746, + "step": 12750 + }, + { + "epoch": 48.70229007633588, + "grad_norm": 0.38881349563598633, + "learning_rate": 6.618804907061977e-06, + "loss": 0.0727, + "step": 12760 + }, + { + "epoch": 48.74045801526717, + "grad_norm": 0.34416061639785767, + "learning_rate": 6.613589217543765e-06, + "loss": 0.0781, + "step": 12770 + }, + { + "epoch": 48.778625954198475, + "grad_norm": 0.6163748502731323, + "learning_rate": 6.608371567362505e-06, + "loss": 0.0785, + "step": 12780 + }, + { + "epoch": 48.81679389312977, + "grad_norm": 0.2579529583454132, + "learning_rate": 6.60315196285813e-06, + "loss": 0.064, + "step": 12790 + }, + { + "epoch": 48.85496183206107, + "grad_norm": 0.32075974345207214, + "learning_rate": 6.5979304103729545e-06, + "loss": 0.0831, + "step": 12800 + }, + { + "epoch": 48.89312977099237, + "grad_norm": 0.28072690963745117, + "learning_rate": 6.592706916251653e-06, + "loss": 0.068, + "step": 12810 + }, + { + "epoch": 48.93129770992366, + "grad_norm": 0.3512701690196991, + "learning_rate": 6.587481486841267e-06, + "loss": 0.0832, + "step": 12820 + }, + { + "epoch": 48.969465648854964, + "grad_norm": 0.3708653151988983, + "learning_rate": 6.582254128491184e-06, + "loss": 0.08, + "step": 12830 + }, + { + "epoch": 49.00763358778626, + "grad_norm": 0.5999414324760437, + "learning_rate": 6.577024847553139e-06, + "loss": 0.0707, + "step": 12840 + }, + { + "epoch": 49.045801526717554, + "grad_norm": 0.5251232385635376, + "learning_rate": 6.5717936503812e-06, + "loss": 0.078, + "step": 12850 + }, + { + "epoch": 49.083969465648856, + "grad_norm": 0.30655843019485474, + "learning_rate": 6.5665605433317655e-06, + "loss": 0.0761, + "step": 12860 + }, + { + "epoch": 49.12213740458015, + "grad_norm": 0.3946235775947571, + "learning_rate": 6.561325532763554e-06, + "loss": 0.0757, + "step": 12870 + }, + { + "epoch": 49.16030534351145, + "grad_norm": 0.27524781227111816, + "learning_rate": 6.556088625037598e-06, + "loss": 0.071, + "step": 12880 + }, + { + "epoch": 49.19847328244275, + "grad_norm": 0.27096670866012573, + "learning_rate": 6.550849826517231e-06, + "loss": 0.0781, + "step": 12890 + }, + { + "epoch": 49.23664122137404, + "grad_norm": 0.4801909029483795, + "learning_rate": 6.54560914356809e-06, + "loss": 0.0745, + "step": 12900 + }, + { + "epoch": 49.274809160305345, + "grad_norm": 0.4032978117465973, + "learning_rate": 6.5403665825580975e-06, + "loss": 0.0764, + "step": 12910 + }, + { + "epoch": 49.31297709923664, + "grad_norm": 0.46122097969055176, + "learning_rate": 6.53512214985746e-06, + "loss": 0.0733, + "step": 12920 + }, + { + "epoch": 49.35114503816794, + "grad_norm": 0.6307662129402161, + "learning_rate": 6.529875851838659e-06, + "loss": 0.0768, + "step": 12930 + }, + { + "epoch": 49.38931297709924, + "grad_norm": 0.28868502378463745, + "learning_rate": 6.5246276948764394e-06, + "loss": 0.0859, + "step": 12940 + }, + { + "epoch": 49.42748091603053, + "grad_norm": 0.463450163602829, + "learning_rate": 6.519377685347808e-06, + "loss": 0.0768, + "step": 12950 + }, + { + "epoch": 49.465648854961835, + "grad_norm": 0.4888218641281128, + "learning_rate": 6.514125829632021e-06, + "loss": 0.0741, + "step": 12960 + }, + { + "epoch": 49.50381679389313, + "grad_norm": 0.615373432636261, + "learning_rate": 6.508872134110578e-06, + "loss": 0.0736, + "step": 12970 + }, + { + "epoch": 49.541984732824424, + "grad_norm": 0.2985082268714905, + "learning_rate": 6.5036166051672135e-06, + "loss": 0.0885, + "step": 12980 + }, + { + "epoch": 49.58015267175573, + "grad_norm": 0.461047887802124, + "learning_rate": 6.498359249187893e-06, + "loss": 0.0756, + "step": 12990 + }, + { + "epoch": 49.61832061068702, + "grad_norm": 0.46043241024017334, + "learning_rate": 6.4931000725607985e-06, + "loss": 0.0753, + "step": 13000 + }, + { + "epoch": 49.656488549618324, + "grad_norm": 0.4072940945625305, + "learning_rate": 6.487839081676327e-06, + "loss": 0.0678, + "step": 13010 + }, + { + "epoch": 49.69465648854962, + "grad_norm": 0.6342893242835999, + "learning_rate": 6.482576282927076e-06, + "loss": 0.0719, + "step": 13020 + }, + { + "epoch": 49.732824427480914, + "grad_norm": 0.4662512242794037, + "learning_rate": 6.477311682707844e-06, + "loss": 0.0745, + "step": 13030 + }, + { + "epoch": 49.770992366412216, + "grad_norm": 0.2835651934146881, + "learning_rate": 6.472045287415616e-06, + "loss": 0.0689, + "step": 13040 + }, + { + "epoch": 49.80916030534351, + "grad_norm": 0.3895277678966522, + "learning_rate": 6.466777103449559e-06, + "loss": 0.0747, + "step": 13050 + }, + { + "epoch": 49.847328244274806, + "grad_norm": 0.32042866945266724, + "learning_rate": 6.461507137211012e-06, + "loss": 0.0688, + "step": 13060 + }, + { + "epoch": 49.88549618320611, + "grad_norm": 0.5186818838119507, + "learning_rate": 6.456235395103483e-06, + "loss": 0.0784, + "step": 13070 + }, + { + "epoch": 49.9236641221374, + "grad_norm": 0.28963205218315125, + "learning_rate": 6.450961883532635e-06, + "loss": 0.0769, + "step": 13080 + }, + { + "epoch": 49.961832061068705, + "grad_norm": 0.374496728181839, + "learning_rate": 6.445686608906283e-06, + "loss": 0.0674, + "step": 13090 + }, + { + "epoch": 50.0, + "grad_norm": 0.35718169808387756, + "learning_rate": 6.44040957763438e-06, + "loss": 0.0773, + "step": 13100 + }, + { + "epoch": 50.038167938931295, + "grad_norm": 0.5534579753875732, + "learning_rate": 6.435130796129019e-06, + "loss": 0.0677, + "step": 13110 + }, + { + "epoch": 50.0763358778626, + "grad_norm": 0.5444230437278748, + "learning_rate": 6.4298502708044165e-06, + "loss": 0.08, + "step": 13120 + }, + { + "epoch": 50.11450381679389, + "grad_norm": 0.5170355439186096, + "learning_rate": 6.424568008076909e-06, + "loss": 0.07, + "step": 13130 + }, + { + "epoch": 50.152671755725194, + "grad_norm": 0.514159083366394, + "learning_rate": 6.419284014364944e-06, + "loss": 0.0907, + "step": 13140 + }, + { + "epoch": 50.19083969465649, + "grad_norm": 0.31660693883895874, + "learning_rate": 6.413998296089071e-06, + "loss": 0.0744, + "step": 13150 + }, + { + "epoch": 50.229007633587784, + "grad_norm": 0.2514859139919281, + "learning_rate": 6.408710859671938e-06, + "loss": 0.0733, + "step": 13160 + }, + { + "epoch": 50.267175572519086, + "grad_norm": 0.6438994407653809, + "learning_rate": 6.403421711538278e-06, + "loss": 0.0816, + "step": 13170 + }, + { + "epoch": 50.30534351145038, + "grad_norm": 0.5075148940086365, + "learning_rate": 6.398130858114904e-06, + "loss": 0.0808, + "step": 13180 + }, + { + "epoch": 50.343511450381676, + "grad_norm": 1.2176218032836914, + "learning_rate": 6.392838305830702e-06, + "loss": 0.0821, + "step": 13190 + }, + { + "epoch": 50.38167938931298, + "grad_norm": 0.32374969124794006, + "learning_rate": 6.387544061116622e-06, + "loss": 0.0789, + "step": 13200 + }, + { + "epoch": 50.41984732824427, + "grad_norm": 0.539487898349762, + "learning_rate": 6.382248130405671e-06, + "loss": 0.0717, + "step": 13210 + }, + { + "epoch": 50.458015267175576, + "grad_norm": 0.29437029361724854, + "learning_rate": 6.376950520132906e-06, + "loss": 0.0715, + "step": 13220 + }, + { + "epoch": 50.49618320610687, + "grad_norm": 0.2797829806804657, + "learning_rate": 6.371651236735418e-06, + "loss": 0.0691, + "step": 13230 + }, + { + "epoch": 50.534351145038165, + "grad_norm": 1.2593516111373901, + "learning_rate": 6.366350286652341e-06, + "loss": 0.0775, + "step": 13240 + }, + { + "epoch": 50.57251908396947, + "grad_norm": 0.5753695964813232, + "learning_rate": 6.361047676324827e-06, + "loss": 0.0787, + "step": 13250 + }, + { + "epoch": 50.61068702290076, + "grad_norm": 0.6826583743095398, + "learning_rate": 6.355743412196047e-06, + "loss": 0.071, + "step": 13260 + }, + { + "epoch": 50.64885496183206, + "grad_norm": 0.655083417892456, + "learning_rate": 6.350437500711184e-06, + "loss": 0.07, + "step": 13270 + }, + { + "epoch": 50.68702290076336, + "grad_norm": 0.3876498341560364, + "learning_rate": 6.345129948317419e-06, + "loss": 0.0641, + "step": 13280 + }, + { + "epoch": 50.725190839694655, + "grad_norm": 0.38112854957580566, + "learning_rate": 6.339820761463929e-06, + "loss": 0.0754, + "step": 13290 + }, + { + "epoch": 50.76335877862596, + "grad_norm": 0.402295857667923, + "learning_rate": 6.334509946601879e-06, + "loss": 0.0878, + "step": 13300 + }, + { + "epoch": 50.80152671755725, + "grad_norm": 0.3208377957344055, + "learning_rate": 6.329197510184406e-06, + "loss": 0.0746, + "step": 13310 + }, + { + "epoch": 50.83969465648855, + "grad_norm": 0.3299608528614044, + "learning_rate": 6.323883458666624e-06, + "loss": 0.0748, + "step": 13320 + }, + { + "epoch": 50.87786259541985, + "grad_norm": 0.3016846477985382, + "learning_rate": 6.318567798505605e-06, + "loss": 0.0708, + "step": 13330 + }, + { + "epoch": 50.916030534351144, + "grad_norm": 0.6639785766601562, + "learning_rate": 6.313250536160378e-06, + "loss": 0.0789, + "step": 13340 + }, + { + "epoch": 50.954198473282446, + "grad_norm": 0.3203372061252594, + "learning_rate": 6.307931678091918e-06, + "loss": 0.0794, + "step": 13350 + }, + { + "epoch": 50.99236641221374, + "grad_norm": 0.44946426153182983, + "learning_rate": 6.3026112307631385e-06, + "loss": 0.0792, + "step": 13360 + }, + { + "epoch": 51.030534351145036, + "grad_norm": 0.32271233201026917, + "learning_rate": 6.297289200638888e-06, + "loss": 0.0767, + "step": 13370 + }, + { + "epoch": 51.06870229007634, + "grad_norm": 0.32984182238578796, + "learning_rate": 6.29196559418593e-06, + "loss": 0.077, + "step": 13380 + }, + { + "epoch": 51.10687022900763, + "grad_norm": 0.27354347705841064, + "learning_rate": 6.286640417872951e-06, + "loss": 0.0725, + "step": 13390 + }, + { + "epoch": 51.14503816793893, + "grad_norm": 0.4007064700126648, + "learning_rate": 6.281313678170543e-06, + "loss": 0.0697, + "step": 13400 + }, + { + "epoch": 51.18320610687023, + "grad_norm": 0.2881835401058197, + "learning_rate": 6.275985381551195e-06, + "loss": 0.0816, + "step": 13410 + }, + { + "epoch": 51.221374045801525, + "grad_norm": 0.7041976451873779, + "learning_rate": 6.270655534489292e-06, + "loss": 0.0757, + "step": 13420 + }, + { + "epoch": 51.25954198473283, + "grad_norm": 0.28111717104911804, + "learning_rate": 6.265324143461098e-06, + "loss": 0.0808, + "step": 13430 + }, + { + "epoch": 51.29770992366412, + "grad_norm": 0.38881856203079224, + "learning_rate": 6.259991214944758e-06, + "loss": 0.0752, + "step": 13440 + }, + { + "epoch": 51.33587786259542, + "grad_norm": 0.25334322452545166, + "learning_rate": 6.254656755420283e-06, + "loss": 0.0712, + "step": 13450 + }, + { + "epoch": 51.37404580152672, + "grad_norm": 0.37281477451324463, + "learning_rate": 6.249320771369545e-06, + "loss": 0.0813, + "step": 13460 + }, + { + "epoch": 51.412213740458014, + "grad_norm": 1.6410506963729858, + "learning_rate": 6.243983269276263e-06, + "loss": 0.0907, + "step": 13470 + }, + { + "epoch": 51.45038167938931, + "grad_norm": 0.2578519582748413, + "learning_rate": 6.238644255626013e-06, + "loss": 0.0719, + "step": 13480 + }, + { + "epoch": 51.48854961832061, + "grad_norm": 0.912161648273468, + "learning_rate": 6.233303736906193e-06, + "loss": 0.0714, + "step": 13490 + }, + { + "epoch": 51.52671755725191, + "grad_norm": 0.5383570790290833, + "learning_rate": 6.2279617196060394e-06, + "loss": 0.0743, + "step": 13500 + }, + { + "epoch": 51.56488549618321, + "grad_norm": 0.6001830697059631, + "learning_rate": 6.2226182102166085e-06, + "loss": 0.0739, + "step": 13510 + }, + { + "epoch": 51.603053435114504, + "grad_norm": 0.40439608693122864, + "learning_rate": 6.217273215230767e-06, + "loss": 0.0749, + "step": 13520 + }, + { + "epoch": 51.6412213740458, + "grad_norm": 0.298961877822876, + "learning_rate": 6.2119267411431885e-06, + "loss": 0.0668, + "step": 13530 + }, + { + "epoch": 51.6793893129771, + "grad_norm": 0.4611186385154724, + "learning_rate": 6.206578794450339e-06, + "loss": 0.0789, + "step": 13540 + }, + { + "epoch": 51.717557251908396, + "grad_norm": 0.2702903747558594, + "learning_rate": 6.2012293816504855e-06, + "loss": 0.077, + "step": 13550 + }, + { + "epoch": 51.7557251908397, + "grad_norm": 0.38614171743392944, + "learning_rate": 6.195878509243661e-06, + "loss": 0.0801, + "step": 13560 + }, + { + "epoch": 51.79389312977099, + "grad_norm": 0.3832460343837738, + "learning_rate": 6.190526183731686e-06, + "loss": 0.0675, + "step": 13570 + }, + { + "epoch": 51.83206106870229, + "grad_norm": 0.7933033108711243, + "learning_rate": 6.185172411618138e-06, + "loss": 0.0703, + "step": 13580 + }, + { + "epoch": 51.87022900763359, + "grad_norm": 0.3744434714317322, + "learning_rate": 6.179817199408355e-06, + "loss": 0.0737, + "step": 13590 + }, + { + "epoch": 51.908396946564885, + "grad_norm": 0.49073082208633423, + "learning_rate": 6.174460553609426e-06, + "loss": 0.0843, + "step": 13600 + }, + { + "epoch": 51.94656488549618, + "grad_norm": 0.438060462474823, + "learning_rate": 6.16910248073018e-06, + "loss": 0.0729, + "step": 13610 + }, + { + "epoch": 51.98473282442748, + "grad_norm": 0.3065158724784851, + "learning_rate": 6.16374298728118e-06, + "loss": 0.075, + "step": 13620 + }, + { + "epoch": 52.02290076335878, + "grad_norm": 0.3208974301815033, + "learning_rate": 6.158382079774716e-06, + "loss": 0.0757, + "step": 13630 + }, + { + "epoch": 52.06106870229008, + "grad_norm": 0.36276963353157043, + "learning_rate": 6.153019764724799e-06, + "loss": 0.079, + "step": 13640 + }, + { + "epoch": 52.099236641221374, + "grad_norm": 0.36144378781318665, + "learning_rate": 6.147656048647144e-06, + "loss": 0.0665, + "step": 13650 + }, + { + "epoch": 52.13740458015267, + "grad_norm": 0.42839837074279785, + "learning_rate": 6.1422909380591724e-06, + "loss": 0.0732, + "step": 13660 + }, + { + "epoch": 52.17557251908397, + "grad_norm": 0.589056134223938, + "learning_rate": 6.136924439480001e-06, + "loss": 0.0813, + "step": 13670 + }, + { + "epoch": 52.213740458015266, + "grad_norm": 0.7054497599601746, + "learning_rate": 6.13155655943043e-06, + "loss": 0.0792, + "step": 13680 + }, + { + "epoch": 52.25190839694657, + "grad_norm": 0.5286399126052856, + "learning_rate": 6.126187304432941e-06, + "loss": 0.0732, + "step": 13690 + }, + { + "epoch": 52.29007633587786, + "grad_norm": 0.4254150986671448, + "learning_rate": 6.1208166810116846e-06, + "loss": 0.0774, + "step": 13700 + }, + { + "epoch": 52.32824427480916, + "grad_norm": 0.6747491955757141, + "learning_rate": 6.115444695692474e-06, + "loss": 0.0777, + "step": 13710 + }, + { + "epoch": 52.36641221374046, + "grad_norm": 0.3658086657524109, + "learning_rate": 6.110071355002779e-06, + "loss": 0.0655, + "step": 13720 + }, + { + "epoch": 52.404580152671755, + "grad_norm": 0.4212816655635834, + "learning_rate": 6.104696665471715e-06, + "loss": 0.0842, + "step": 13730 + }, + { + "epoch": 52.44274809160305, + "grad_norm": 0.6053048372268677, + "learning_rate": 6.099320633630036e-06, + "loss": 0.0804, + "step": 13740 + }, + { + "epoch": 52.48091603053435, + "grad_norm": 0.3349289894104004, + "learning_rate": 6.093943266010128e-06, + "loss": 0.0729, + "step": 13750 + }, + { + "epoch": 52.51908396946565, + "grad_norm": 0.3947334289550781, + "learning_rate": 6.088564569146001e-06, + "loss": 0.088, + "step": 13760 + }, + { + "epoch": 52.55725190839695, + "grad_norm": 0.4069903790950775, + "learning_rate": 6.083184549573275e-06, + "loss": 0.0761, + "step": 13770 + }, + { + "epoch": 52.595419847328245, + "grad_norm": 0.526199221611023, + "learning_rate": 6.0778032138291845e-06, + "loss": 0.0743, + "step": 13780 + }, + { + "epoch": 52.63358778625954, + "grad_norm": 0.37848567962646484, + "learning_rate": 6.072420568452559e-06, + "loss": 0.0722, + "step": 13790 + }, + { + "epoch": 52.67175572519084, + "grad_norm": 0.479997843503952, + "learning_rate": 6.067036619983822e-06, + "loss": 0.0813, + "step": 13800 + }, + { + "epoch": 52.70992366412214, + "grad_norm": 0.37879565358161926, + "learning_rate": 6.061651374964974e-06, + "loss": 0.0713, + "step": 13810 + }, + { + "epoch": 52.74809160305343, + "grad_norm": 0.2613222897052765, + "learning_rate": 6.056264839939601e-06, + "loss": 0.0733, + "step": 13820 + }, + { + "epoch": 52.786259541984734, + "grad_norm": 0.4881426990032196, + "learning_rate": 6.050877021452845e-06, + "loss": 0.0693, + "step": 13830 + }, + { + "epoch": 52.82442748091603, + "grad_norm": 0.8297280073165894, + "learning_rate": 6.0454879260514196e-06, + "loss": 0.0751, + "step": 13840 + }, + { + "epoch": 52.86259541984733, + "grad_norm": 0.8701691627502441, + "learning_rate": 6.0400975602835795e-06, + "loss": 0.0745, + "step": 13850 + }, + { + "epoch": 52.900763358778626, + "grad_norm": 0.40549200773239136, + "learning_rate": 6.034705930699129e-06, + "loss": 0.077, + "step": 13860 + }, + { + "epoch": 52.93893129770992, + "grad_norm": 0.5269019603729248, + "learning_rate": 6.029313043849407e-06, + "loss": 0.0732, + "step": 13870 + }, + { + "epoch": 52.97709923664122, + "grad_norm": 0.6514595746994019, + "learning_rate": 6.0239189062872795e-06, + "loss": 0.0683, + "step": 13880 + }, + { + "epoch": 53.01526717557252, + "grad_norm": 0.270759254693985, + "learning_rate": 6.01852352456713e-06, + "loss": 0.0774, + "step": 13890 + }, + { + "epoch": 53.05343511450382, + "grad_norm": 0.37720581889152527, + "learning_rate": 6.013126905244858e-06, + "loss": 0.0703, + "step": 13900 + }, + { + "epoch": 53.091603053435115, + "grad_norm": 0.45019468665122986, + "learning_rate": 6.007729054877865e-06, + "loss": 0.0709, + "step": 13910 + }, + { + "epoch": 53.12977099236641, + "grad_norm": 0.4115181565284729, + "learning_rate": 6.002329980025047e-06, + "loss": 0.0723, + "step": 13920 + }, + { + "epoch": 53.16793893129771, + "grad_norm": 0.5874790549278259, + "learning_rate": 5.99692968724679e-06, + "loss": 0.0708, + "step": 13930 + }, + { + "epoch": 53.20610687022901, + "grad_norm": 0.31998172402381897, + "learning_rate": 5.991528183104959e-06, + "loss": 0.0739, + "step": 13940 + }, + { + "epoch": 53.2442748091603, + "grad_norm": 0.2828161120414734, + "learning_rate": 5.98612547416289e-06, + "loss": 0.0739, + "step": 13950 + }, + { + "epoch": 53.282442748091604, + "grad_norm": 0.2661030888557434, + "learning_rate": 5.9807215669853855e-06, + "loss": 0.1, + "step": 13960 + }, + { + "epoch": 53.3206106870229, + "grad_norm": 1.1698861122131348, + "learning_rate": 5.9753164681387e-06, + "loss": 0.0806, + "step": 13970 + }, + { + "epoch": 53.3587786259542, + "grad_norm": 0.413404256105423, + "learning_rate": 5.969910184190539e-06, + "loss": 0.0695, + "step": 13980 + }, + { + "epoch": 53.396946564885496, + "grad_norm": 0.5563871264457703, + "learning_rate": 5.9645027217100485e-06, + "loss": 0.0831, + "step": 13990 + }, + { + "epoch": 53.43511450381679, + "grad_norm": 0.5594649314880371, + "learning_rate": 5.9590940872678035e-06, + "loss": 0.0806, + "step": 14000 + }, + { + "epoch": 53.47328244274809, + "grad_norm": 0.2644912898540497, + "learning_rate": 5.953684287435807e-06, + "loss": 0.0744, + "step": 14010 + }, + { + "epoch": 53.51145038167939, + "grad_norm": 0.31306540966033936, + "learning_rate": 5.948273328787474e-06, + "loss": 0.0806, + "step": 14020 + }, + { + "epoch": 53.54961832061069, + "grad_norm": 0.298922598361969, + "learning_rate": 5.942861217897631e-06, + "loss": 0.0722, + "step": 14030 + }, + { + "epoch": 53.587786259541986, + "grad_norm": 0.38911426067352295, + "learning_rate": 5.937447961342501e-06, + "loss": 0.0811, + "step": 14040 + }, + { + "epoch": 53.62595419847328, + "grad_norm": 0.44509273767471313, + "learning_rate": 5.932033565699704e-06, + "loss": 0.0824, + "step": 14050 + }, + { + "epoch": 53.66412213740458, + "grad_norm": 0.6681466102600098, + "learning_rate": 5.926618037548237e-06, + "loss": 0.0815, + "step": 14060 + }, + { + "epoch": 53.70229007633588, + "grad_norm": 0.5035837888717651, + "learning_rate": 5.921201383468483e-06, + "loss": 0.0693, + "step": 14070 + }, + { + "epoch": 53.74045801526717, + "grad_norm": 0.4998972713947296, + "learning_rate": 5.915783610042183e-06, + "loss": 0.0758, + "step": 14080 + }, + { + "epoch": 53.778625954198475, + "grad_norm": 0.36603638529777527, + "learning_rate": 5.910364723852444e-06, + "loss": 0.0701, + "step": 14090 + }, + { + "epoch": 53.81679389312977, + "grad_norm": 0.4405144453048706, + "learning_rate": 5.904944731483724e-06, + "loss": 0.073, + "step": 14100 + }, + { + "epoch": 53.85496183206107, + "grad_norm": 0.4353579878807068, + "learning_rate": 5.899523639521825e-06, + "loss": 0.0771, + "step": 14110 + }, + { + "epoch": 53.89312977099237, + "grad_norm": 0.2582431435585022, + "learning_rate": 5.894101454553883e-06, + "loss": 0.0665, + "step": 14120 + }, + { + "epoch": 53.93129770992366, + "grad_norm": 0.2779502868652344, + "learning_rate": 5.888678183168368e-06, + "loss": 0.0805, + "step": 14130 + }, + { + "epoch": 53.969465648854964, + "grad_norm": 0.35140860080718994, + "learning_rate": 5.883253831955061e-06, + "loss": 0.0758, + "step": 14140 + }, + { + "epoch": 54.00763358778626, + "grad_norm": 0.7238326668739319, + "learning_rate": 5.877828407505063e-06, + "loss": 0.0818, + "step": 14150 + }, + { + "epoch": 54.045801526717554, + "grad_norm": 0.43056339025497437, + "learning_rate": 5.872401916410777e-06, + "loss": 0.0942, + "step": 14160 + }, + { + "epoch": 54.083969465648856, + "grad_norm": 0.5613008141517639, + "learning_rate": 5.866974365265901e-06, + "loss": 0.0642, + "step": 14170 + }, + { + "epoch": 54.12213740458015, + "grad_norm": 0.3026123046875, + "learning_rate": 5.86154576066542e-06, + "loss": 0.0776, + "step": 14180 + }, + { + "epoch": 54.16030534351145, + "grad_norm": 0.3152710199356079, + "learning_rate": 5.856116109205602e-06, + "loss": 0.0674, + "step": 14190 + }, + { + "epoch": 54.19847328244275, + "grad_norm": 0.8080897331237793, + "learning_rate": 5.850685417483983e-06, + "loss": 0.0859, + "step": 14200 + }, + { + "epoch": 54.23664122137404, + "grad_norm": 0.3381716310977936, + "learning_rate": 5.845253692099369e-06, + "loss": 0.0822, + "step": 14210 + }, + { + "epoch": 54.274809160305345, + "grad_norm": 0.25009527802467346, + "learning_rate": 5.839820939651817e-06, + "loss": 0.0705, + "step": 14220 + }, + { + "epoch": 54.31297709923664, + "grad_norm": 0.24691715836524963, + "learning_rate": 5.8343871667426326e-06, + "loss": 0.0736, + "step": 14230 + }, + { + "epoch": 54.35114503816794, + "grad_norm": 0.32508522272109985, + "learning_rate": 5.828952379974364e-06, + "loss": 0.0692, + "step": 14240 + }, + { + "epoch": 54.38931297709924, + "grad_norm": 0.3096218407154083, + "learning_rate": 5.823516585950787e-06, + "loss": 0.0691, + "step": 14250 + }, + { + "epoch": 54.42748091603053, + "grad_norm": 0.3918653726577759, + "learning_rate": 5.818079791276907e-06, + "loss": 0.0761, + "step": 14260 + }, + { + "epoch": 54.465648854961835, + "grad_norm": 0.4393802881240845, + "learning_rate": 5.8126420025589415e-06, + "loss": 0.0816, + "step": 14270 + }, + { + "epoch": 54.50381679389313, + "grad_norm": 0.3444501459598541, + "learning_rate": 5.807203226404313e-06, + "loss": 0.0983, + "step": 14280 + }, + { + "epoch": 54.541984732824424, + "grad_norm": 0.6188817024230957, + "learning_rate": 5.801763469421652e-06, + "loss": 0.0816, + "step": 14290 + }, + { + "epoch": 54.58015267175573, + "grad_norm": 0.39169570803642273, + "learning_rate": 5.796322738220774e-06, + "loss": 0.0833, + "step": 14300 + }, + { + "epoch": 54.61832061068702, + "grad_norm": 0.5085362792015076, + "learning_rate": 5.79088103941268e-06, + "loss": 0.0685, + "step": 14310 + }, + { + "epoch": 54.656488549618324, + "grad_norm": 1.19497549533844, + "learning_rate": 5.78543837960955e-06, + "loss": 0.086, + "step": 14320 + }, + { + "epoch": 54.69465648854962, + "grad_norm": 0.8479430079460144, + "learning_rate": 5.7799947654247244e-06, + "loss": 0.0815, + "step": 14330 + }, + { + "epoch": 54.732824427480914, + "grad_norm": 0.4414825439453125, + "learning_rate": 5.774550203472712e-06, + "loss": 0.076, + "step": 14340 + }, + { + "epoch": 54.770992366412216, + "grad_norm": 0.3391081690788269, + "learning_rate": 5.769104700369165e-06, + "loss": 0.0733, + "step": 14350 + }, + { + "epoch": 54.80916030534351, + "grad_norm": 0.6143276691436768, + "learning_rate": 5.763658262730886e-06, + "loss": 0.0741, + "step": 14360 + }, + { + "epoch": 54.847328244274806, + "grad_norm": 0.42588314414024353, + "learning_rate": 5.7582108971758095e-06, + "loss": 0.0688, + "step": 14370 + }, + { + "epoch": 54.88549618320611, + "grad_norm": 0.7601977586746216, + "learning_rate": 5.752762610323e-06, + "loss": 0.0862, + "step": 14380 + }, + { + "epoch": 54.9236641221374, + "grad_norm": 0.30980366468429565, + "learning_rate": 5.747313408792636e-06, + "loss": 0.072, + "step": 14390 + }, + { + "epoch": 54.961832061068705, + "grad_norm": 0.8562023639678955, + "learning_rate": 5.7418632992060145e-06, + "loss": 0.0743, + "step": 14400 + }, + { + "epoch": 55.0, + "grad_norm": 0.4307748079299927, + "learning_rate": 5.73641228818553e-06, + "loss": 0.0679, + "step": 14410 + }, + { + "epoch": 55.038167938931295, + "grad_norm": 0.2995448708534241, + "learning_rate": 5.730960382354677e-06, + "loss": 0.0749, + "step": 14420 + }, + { + "epoch": 55.0763358778626, + "grad_norm": 0.5709684491157532, + "learning_rate": 5.725507588338035e-06, + "loss": 0.0749, + "step": 14430 + }, + { + "epoch": 55.11450381679389, + "grad_norm": 0.5325103402137756, + "learning_rate": 5.720053912761261e-06, + "loss": 0.0696, + "step": 14440 + }, + { + "epoch": 55.152671755725194, + "grad_norm": 0.40759608149528503, + "learning_rate": 5.714599362251088e-06, + "loss": 0.0725, + "step": 14450 + }, + { + "epoch": 55.19083969465649, + "grad_norm": 0.3020817041397095, + "learning_rate": 5.709143943435307e-06, + "loss": 0.0776, + "step": 14460 + }, + { + "epoch": 55.229007633587784, + "grad_norm": 0.4237860441207886, + "learning_rate": 5.703687662942765e-06, + "loss": 0.0763, + "step": 14470 + }, + { + "epoch": 55.267175572519086, + "grad_norm": 0.8698391914367676, + "learning_rate": 5.6982305274033616e-06, + "loss": 0.0735, + "step": 14480 + }, + { + "epoch": 55.30534351145038, + "grad_norm": 0.5781353116035461, + "learning_rate": 5.692772543448027e-06, + "loss": 0.0779, + "step": 14490 + }, + { + "epoch": 55.343511450381676, + "grad_norm": 0.27626991271972656, + "learning_rate": 5.687313717708728e-06, + "loss": 0.0703, + "step": 14500 + }, + { + "epoch": 55.38167938931298, + "grad_norm": 0.28378692269325256, + "learning_rate": 5.681854056818453e-06, + "loss": 0.0795, + "step": 14510 + }, + { + "epoch": 55.41984732824427, + "grad_norm": 0.5798959732055664, + "learning_rate": 5.6763935674112045e-06, + "loss": 0.0706, + "step": 14520 + }, + { + "epoch": 55.458015267175576, + "grad_norm": 0.288756400346756, + "learning_rate": 5.670932256121992e-06, + "loss": 0.0691, + "step": 14530 + }, + { + "epoch": 55.49618320610687, + "grad_norm": 0.25545430183410645, + "learning_rate": 5.665470129586822e-06, + "loss": 0.074, + "step": 14540 + }, + { + "epoch": 55.534351145038165, + "grad_norm": 0.3490813374519348, + "learning_rate": 5.660007194442697e-06, + "loss": 0.078, + "step": 14550 + }, + { + "epoch": 55.57251908396947, + "grad_norm": 0.31008848547935486, + "learning_rate": 5.6545434573275945e-06, + "loss": 0.0772, + "step": 14560 + }, + { + "epoch": 55.61068702290076, + "grad_norm": 0.3187563717365265, + "learning_rate": 5.649078924880472e-06, + "loss": 0.0724, + "step": 14570 + }, + { + "epoch": 55.64885496183206, + "grad_norm": 0.4331674575805664, + "learning_rate": 5.643613603741253e-06, + "loss": 0.0677, + "step": 14580 + }, + { + "epoch": 55.68702290076336, + "grad_norm": 0.354324609041214, + "learning_rate": 5.6381475005508156e-06, + "loss": 0.0759, + "step": 14590 + }, + { + "epoch": 55.725190839694655, + "grad_norm": 0.4229590594768524, + "learning_rate": 5.632680621950992e-06, + "loss": 0.0783, + "step": 14600 + }, + { + "epoch": 55.76335877862596, + "grad_norm": 0.5442618727684021, + "learning_rate": 5.627212974584555e-06, + "loss": 0.0709, + "step": 14610 + }, + { + "epoch": 55.80152671755725, + "grad_norm": 0.31692686676979065, + "learning_rate": 5.62174456509521e-06, + "loss": 0.0718, + "step": 14620 + }, + { + "epoch": 55.83969465648855, + "grad_norm": 0.3284524977207184, + "learning_rate": 5.616275400127594e-06, + "loss": 0.0688, + "step": 14630 + }, + { + "epoch": 55.87786259541985, + "grad_norm": 0.3089140057563782, + "learning_rate": 5.610805486327254e-06, + "loss": 0.0686, + "step": 14640 + }, + { + "epoch": 55.916030534351144, + "grad_norm": 0.4001990556716919, + "learning_rate": 5.6053348303406545e-06, + "loss": 0.0766, + "step": 14650 + }, + { + "epoch": 55.954198473282446, + "grad_norm": 0.5436351299285889, + "learning_rate": 5.599863438815157e-06, + "loss": 0.0682, + "step": 14660 + }, + { + "epoch": 55.99236641221374, + "grad_norm": 0.47523510456085205, + "learning_rate": 5.594391318399017e-06, + "loss": 0.0724, + "step": 14670 + }, + { + "epoch": 56.030534351145036, + "grad_norm": 0.2652002274990082, + "learning_rate": 5.588918475741378e-06, + "loss": 0.0714, + "step": 14680 + }, + { + "epoch": 56.06870229007634, + "grad_norm": 0.2382209599018097, + "learning_rate": 5.583444917492259e-06, + "loss": 0.0711, + "step": 14690 + }, + { + "epoch": 56.10687022900763, + "grad_norm": 0.4424055516719818, + "learning_rate": 5.57797065030255e-06, + "loss": 0.0698, + "step": 14700 + }, + { + "epoch": 56.14503816793893, + "grad_norm": 0.25786203145980835, + "learning_rate": 5.572495680824001e-06, + "loss": 0.0745, + "step": 14710 + }, + { + "epoch": 56.18320610687023, + "grad_norm": 0.7077968716621399, + "learning_rate": 5.5670200157092195e-06, + "loss": 0.0747, + "step": 14720 + }, + { + "epoch": 56.221374045801525, + "grad_norm": 0.32011500000953674, + "learning_rate": 5.561543661611649e-06, + "loss": 0.0728, + "step": 14730 + }, + { + "epoch": 56.25954198473283, + "grad_norm": 0.6672192811965942, + "learning_rate": 5.556066625185584e-06, + "loss": 0.0744, + "step": 14740 + }, + { + "epoch": 56.29770992366412, + "grad_norm": 1.5504484176635742, + "learning_rate": 5.550588913086131e-06, + "loss": 0.0781, + "step": 14750 + }, + { + "epoch": 56.33587786259542, + "grad_norm": 0.32770517468452454, + "learning_rate": 5.545110531969234e-06, + "loss": 0.0737, + "step": 14760 + }, + { + "epoch": 56.37404580152672, + "grad_norm": 0.42965736985206604, + "learning_rate": 5.539631488491641e-06, + "loss": 0.0736, + "step": 14770 + }, + { + "epoch": 56.412213740458014, + "grad_norm": 0.277537077665329, + "learning_rate": 5.534151789310904e-06, + "loss": 0.0774, + "step": 14780 + }, + { + "epoch": 56.45038167938931, + "grad_norm": 0.5046352744102478, + "learning_rate": 5.528671441085376e-06, + "loss": 0.0713, + "step": 14790 + }, + { + "epoch": 56.48854961832061, + "grad_norm": 1.0368688106536865, + "learning_rate": 5.523190450474198e-06, + "loss": 0.0766, + "step": 14800 + }, + { + "epoch": 56.52671755725191, + "grad_norm": 0.36909282207489014, + "learning_rate": 5.517708824137287e-06, + "loss": 0.0755, + "step": 14810 + }, + { + "epoch": 56.56488549618321, + "grad_norm": 0.5959531664848328, + "learning_rate": 5.512226568735338e-06, + "loss": 0.0817, + "step": 14820 + }, + { + "epoch": 56.603053435114504, + "grad_norm": 0.2804238796234131, + "learning_rate": 5.506743690929809e-06, + "loss": 0.0779, + "step": 14830 + }, + { + "epoch": 56.6412213740458, + "grad_norm": 0.6354565620422363, + "learning_rate": 5.501260197382913e-06, + "loss": 0.0833, + "step": 14840 + }, + { + "epoch": 56.6793893129771, + "grad_norm": 0.6478805541992188, + "learning_rate": 5.49577609475761e-06, + "loss": 0.0808, + "step": 14850 + }, + { + "epoch": 56.717557251908396, + "grad_norm": 0.39894139766693115, + "learning_rate": 5.4902913897176035e-06, + "loss": 0.0729, + "step": 14860 + }, + { + "epoch": 56.7557251908397, + "grad_norm": 0.346055805683136, + "learning_rate": 5.484806088927329e-06, + "loss": 0.0772, + "step": 14870 + }, + { + "epoch": 56.79389312977099, + "grad_norm": 0.3596509099006653, + "learning_rate": 5.479320199051942e-06, + "loss": 0.0701, + "step": 14880 + }, + { + "epoch": 56.83206106870229, + "grad_norm": 0.46233484148979187, + "learning_rate": 5.473833726757314e-06, + "loss": 0.0734, + "step": 14890 + }, + { + "epoch": 56.87022900763359, + "grad_norm": 0.7055529952049255, + "learning_rate": 5.46834667871003e-06, + "loss": 0.0718, + "step": 14900 + }, + { + "epoch": 56.908396946564885, + "grad_norm": 0.9955065846443176, + "learning_rate": 5.462859061577369e-06, + "loss": 0.0794, + "step": 14910 + }, + { + "epoch": 56.94656488549618, + "grad_norm": 0.34203898906707764, + "learning_rate": 5.457370882027303e-06, + "loss": 0.0739, + "step": 14920 + }, + { + "epoch": 56.98473282442748, + "grad_norm": 0.21919408440589905, + "learning_rate": 5.451882146728489e-06, + "loss": 0.077, + "step": 14930 + }, + { + "epoch": 57.02290076335878, + "grad_norm": 0.4146571457386017, + "learning_rate": 5.446392862350255e-06, + "loss": 0.0791, + "step": 14940 + }, + { + "epoch": 57.06106870229008, + "grad_norm": 0.4011072516441345, + "learning_rate": 5.4409030355626035e-06, + "loss": 0.0835, + "step": 14950 + }, + { + "epoch": 57.099236641221374, + "grad_norm": 0.2836225628852844, + "learning_rate": 5.435412673036188e-06, + "loss": 0.0835, + "step": 14960 + }, + { + "epoch": 57.13740458015267, + "grad_norm": 0.41876521706581116, + "learning_rate": 5.429921781442318e-06, + "loss": 0.0754, + "step": 14970 + }, + { + "epoch": 57.17557251908397, + "grad_norm": 0.24527506530284882, + "learning_rate": 5.424430367452946e-06, + "loss": 0.0717, + "step": 14980 + }, + { + "epoch": 57.213740458015266, + "grad_norm": 0.5479999780654907, + "learning_rate": 5.418938437740655e-06, + "loss": 0.073, + "step": 14990 + }, + { + "epoch": 57.25190839694657, + "grad_norm": 1.0041180849075317, + "learning_rate": 5.413445998978658e-06, + "loss": 0.0765, + "step": 15000 + }, + { + "epoch": 57.29007633587786, + "grad_norm": 0.34665971994400024, + "learning_rate": 5.4079530578407895e-06, + "loss": 0.0738, + "step": 15010 + }, + { + "epoch": 57.32824427480916, + "grad_norm": 0.3036256730556488, + "learning_rate": 5.402459621001486e-06, + "loss": 0.0675, + "step": 15020 + }, + { + "epoch": 57.36641221374046, + "grad_norm": 0.36312854290008545, + "learning_rate": 5.396965695135794e-06, + "loss": 0.08, + "step": 15030 + }, + { + "epoch": 57.404580152671755, + "grad_norm": 0.2859637439250946, + "learning_rate": 5.391471286919351e-06, + "loss": 0.0661, + "step": 15040 + }, + { + "epoch": 57.44274809160305, + "grad_norm": 0.4589112401008606, + "learning_rate": 5.385976403028381e-06, + "loss": 0.0697, + "step": 15050 + }, + { + "epoch": 57.48091603053435, + "grad_norm": 0.3610994517803192, + "learning_rate": 5.380481050139688e-06, + "loss": 0.0742, + "step": 15060 + }, + { + "epoch": 57.51908396946565, + "grad_norm": 0.37091097235679626, + "learning_rate": 5.37498523493064e-06, + "loss": 0.0716, + "step": 15070 + }, + { + "epoch": 57.55725190839695, + "grad_norm": 0.30698439478874207, + "learning_rate": 5.369488964079172e-06, + "loss": 0.0838, + "step": 15080 + }, + { + "epoch": 57.595419847328245, + "grad_norm": 0.31963449716567993, + "learning_rate": 5.363992244263774e-06, + "loss": 0.0736, + "step": 15090 + }, + { + "epoch": 57.63358778625954, + "grad_norm": 0.3956395089626312, + "learning_rate": 5.358495082163476e-06, + "loss": 0.0678, + "step": 15100 + }, + { + "epoch": 57.67175572519084, + "grad_norm": 0.4276552200317383, + "learning_rate": 5.35299748445785e-06, + "loss": 0.0768, + "step": 15110 + }, + { + "epoch": 57.70992366412214, + "grad_norm": 0.3108503818511963, + "learning_rate": 5.347499457826995e-06, + "loss": 0.0803, + "step": 15120 + }, + { + "epoch": 57.74809160305343, + "grad_norm": 0.35856226086616516, + "learning_rate": 5.342001008951531e-06, + "loss": 0.0794, + "step": 15130 + }, + { + "epoch": 57.786259541984734, + "grad_norm": 0.4197726547718048, + "learning_rate": 5.336502144512592e-06, + "loss": 0.0731, + "step": 15140 + }, + { + "epoch": 57.82442748091603, + "grad_norm": 0.3682864308357239, + "learning_rate": 5.331002871191817e-06, + "loss": 0.0723, + "step": 15150 + }, + { + "epoch": 57.86259541984733, + "grad_norm": 0.7807731628417969, + "learning_rate": 5.325503195671345e-06, + "loss": 0.0747, + "step": 15160 + }, + { + "epoch": 57.900763358778626, + "grad_norm": 0.3009580373764038, + "learning_rate": 5.320003124633795e-06, + "loss": 0.0732, + "step": 15170 + }, + { + "epoch": 57.93893129770992, + "grad_norm": 0.5685849189758301, + "learning_rate": 5.314502664762275e-06, + "loss": 0.0753, + "step": 15180 + }, + { + "epoch": 57.97709923664122, + "grad_norm": 0.32221153378486633, + "learning_rate": 5.3090018227403605e-06, + "loss": 0.0841, + "step": 15190 + }, + { + "epoch": 58.01526717557252, + "grad_norm": 0.4299134314060211, + "learning_rate": 5.303500605252095e-06, + "loss": 0.076, + "step": 15200 + }, + { + "epoch": 58.05343511450382, + "grad_norm": 0.604964017868042, + "learning_rate": 5.297999018981977e-06, + "loss": 0.079, + "step": 15210 + }, + { + "epoch": 58.091603053435115, + "grad_norm": 0.3654126226902008, + "learning_rate": 5.2924970706149505e-06, + "loss": 0.071, + "step": 15220 + }, + { + "epoch": 58.12977099236641, + "grad_norm": 0.4091419279575348, + "learning_rate": 5.286994766836402e-06, + "loss": 0.0785, + "step": 15230 + }, + { + "epoch": 58.16793893129771, + "grad_norm": 0.45684048533439636, + "learning_rate": 5.2814921143321506e-06, + "loss": 0.0678, + "step": 15240 + }, + { + "epoch": 58.20610687022901, + "grad_norm": 0.32718759775161743, + "learning_rate": 5.275989119788436e-06, + "loss": 0.0739, + "step": 15250 + }, + { + "epoch": 58.2442748091603, + "grad_norm": 0.36803168058395386, + "learning_rate": 5.2704857898919195e-06, + "loss": 0.0699, + "step": 15260 + }, + { + "epoch": 58.282442748091604, + "grad_norm": 0.38092291355133057, + "learning_rate": 5.264982131329661e-06, + "loss": 0.0712, + "step": 15270 + }, + { + "epoch": 58.3206106870229, + "grad_norm": 0.3395163118839264, + "learning_rate": 5.259478150789128e-06, + "loss": 0.0717, + "step": 15280 + }, + { + "epoch": 58.3587786259542, + "grad_norm": 0.27850109338760376, + "learning_rate": 5.253973854958173e-06, + "loss": 0.0668, + "step": 15290 + }, + { + "epoch": 58.396946564885496, + "grad_norm": 0.28005483746528625, + "learning_rate": 5.2484692505250375e-06, + "loss": 0.0762, + "step": 15300 + }, + { + "epoch": 58.43511450381679, + "grad_norm": 0.3931730091571808, + "learning_rate": 5.2429643441783325e-06, + "loss": 0.0768, + "step": 15310 + }, + { + "epoch": 58.47328244274809, + "grad_norm": 0.33013564348220825, + "learning_rate": 5.237459142607041e-06, + "loss": 0.0764, + "step": 15320 + }, + { + "epoch": 58.51145038167939, + "grad_norm": 0.3280738592147827, + "learning_rate": 5.2319536525004974e-06, + "loss": 0.0749, + "step": 15330 + }, + { + "epoch": 58.54961832061069, + "grad_norm": 0.3545534610748291, + "learning_rate": 5.226447880548398e-06, + "loss": 0.0801, + "step": 15340 + }, + { + "epoch": 58.587786259541986, + "grad_norm": 0.4157971739768982, + "learning_rate": 5.220941833440768e-06, + "loss": 0.0791, + "step": 15350 + }, + { + "epoch": 58.62595419847328, + "grad_norm": 0.4566729962825775, + "learning_rate": 5.215435517867978e-06, + "loss": 0.0721, + "step": 15360 + }, + { + "epoch": 58.66412213740458, + "grad_norm": 0.5043272972106934, + "learning_rate": 5.209928940520719e-06, + "loss": 0.0723, + "step": 15370 + }, + { + "epoch": 58.70229007633588, + "grad_norm": 0.5831508636474609, + "learning_rate": 5.204422108090004e-06, + "loss": 0.0728, + "step": 15380 + }, + { + "epoch": 58.74045801526717, + "grad_norm": 0.30468279123306274, + "learning_rate": 5.19891502726715e-06, + "loss": 0.0735, + "step": 15390 + }, + { + "epoch": 58.778625954198475, + "grad_norm": 0.6569890975952148, + "learning_rate": 5.193407704743782e-06, + "loss": 0.0751, + "step": 15400 + }, + { + "epoch": 58.81679389312977, + "grad_norm": 0.4217868745326996, + "learning_rate": 5.1879001472118155e-06, + "loss": 0.0742, + "step": 15410 + }, + { + "epoch": 58.85496183206107, + "grad_norm": 0.3305950164794922, + "learning_rate": 5.182392361363453e-06, + "loss": 0.08, + "step": 15420 + }, + { + "epoch": 58.89312977099237, + "grad_norm": 0.2556851804256439, + "learning_rate": 5.176884353891172e-06, + "loss": 0.0835, + "step": 15430 + }, + { + "epoch": 58.93129770992366, + "grad_norm": 0.5155206322669983, + "learning_rate": 5.171376131487722e-06, + "loss": 0.0758, + "step": 15440 + }, + { + "epoch": 58.969465648854964, + "grad_norm": 0.3321559429168701, + "learning_rate": 5.165867700846113e-06, + "loss": 0.071, + "step": 15450 + }, + { + "epoch": 59.00763358778626, + "grad_norm": 0.49137699604034424, + "learning_rate": 5.1603590686596065e-06, + "loss": 0.08, + "step": 15460 + }, + { + "epoch": 59.045801526717554, + "grad_norm": 0.4781742990016937, + "learning_rate": 5.154850241621712e-06, + "loss": 0.0768, + "step": 15470 + }, + { + "epoch": 59.083969465648856, + "grad_norm": 0.5240231156349182, + "learning_rate": 5.149341226426172e-06, + "loss": 0.0742, + "step": 15480 + }, + { + "epoch": 59.12213740458015, + "grad_norm": 0.5776994824409485, + "learning_rate": 5.143832029766959e-06, + "loss": 0.0685, + "step": 15490 + }, + { + "epoch": 59.16030534351145, + "grad_norm": 0.23645877838134766, + "learning_rate": 5.138322658338269e-06, + "loss": 0.0668, + "step": 15500 + }, + { + "epoch": 59.19847328244275, + "grad_norm": 0.3662410378456116, + "learning_rate": 5.132813118834504e-06, + "loss": 0.0765, + "step": 15510 + }, + { + "epoch": 59.23664122137404, + "grad_norm": 0.8370821475982666, + "learning_rate": 5.127303417950278e-06, + "loss": 0.0758, + "step": 15520 + }, + { + "epoch": 59.274809160305345, + "grad_norm": 0.29751116037368774, + "learning_rate": 5.121793562380395e-06, + "loss": 0.0799, + "step": 15530 + }, + { + "epoch": 59.31297709923664, + "grad_norm": 0.3275250792503357, + "learning_rate": 5.116283558819848e-06, + "loss": 0.0758, + "step": 15540 + }, + { + "epoch": 59.35114503816794, + "grad_norm": 0.36758387088775635, + "learning_rate": 5.110773413963813e-06, + "loss": 0.071, + "step": 15550 + }, + { + "epoch": 59.38931297709924, + "grad_norm": 0.4558661878108978, + "learning_rate": 5.1052631345076365e-06, + "loss": 0.0754, + "step": 15560 + }, + { + "epoch": 59.42748091603053, + "grad_norm": 0.7922174334526062, + "learning_rate": 5.099752727146824e-06, + "loss": 0.0802, + "step": 15570 + }, + { + "epoch": 59.465648854961835, + "grad_norm": 0.5909420847892761, + "learning_rate": 5.0942421985770415e-06, + "loss": 0.0734, + "step": 15580 + }, + { + "epoch": 59.50381679389313, + "grad_norm": 0.3770803213119507, + "learning_rate": 5.088731555494102e-06, + "loss": 0.0726, + "step": 15590 + }, + { + "epoch": 59.541984732824424, + "grad_norm": 0.4959609806537628, + "learning_rate": 5.083220804593956e-06, + "loss": 0.0751, + "step": 15600 + }, + { + "epoch": 59.58015267175573, + "grad_norm": 0.8414520621299744, + "learning_rate": 5.077709952572685e-06, + "loss": 0.0779, + "step": 15610 + }, + { + "epoch": 59.61832061068702, + "grad_norm": 0.43522128462791443, + "learning_rate": 5.072199006126494e-06, + "loss": 0.072, + "step": 15620 + }, + { + "epoch": 59.656488549618324, + "grad_norm": 0.3183342218399048, + "learning_rate": 5.066687971951702e-06, + "loss": 0.0743, + "step": 15630 + }, + { + "epoch": 59.69465648854962, + "grad_norm": 0.25728127360343933, + "learning_rate": 5.0611768567447375e-06, + "loss": 0.071, + "step": 15640 + }, + { + "epoch": 59.732824427480914, + "grad_norm": 0.24269990622997284, + "learning_rate": 5.055665667202121e-06, + "loss": 0.0682, + "step": 15650 + }, + { + "epoch": 59.770992366412216, + "grad_norm": 0.3147321343421936, + "learning_rate": 5.050154410020473e-06, + "loss": 0.072, + "step": 15660 + }, + { + "epoch": 59.80916030534351, + "grad_norm": 0.259830504655838, + "learning_rate": 5.044643091896485e-06, + "loss": 0.0728, + "step": 15670 + }, + { + "epoch": 59.847328244274806, + "grad_norm": 0.31697461009025574, + "learning_rate": 5.039131719526932e-06, + "loss": 0.0709, + "step": 15680 + }, + { + "epoch": 59.88549618320611, + "grad_norm": 0.6029514670372009, + "learning_rate": 5.03362029960865e-06, + "loss": 0.0728, + "step": 15690 + }, + { + "epoch": 59.9236641221374, + "grad_norm": 0.33722543716430664, + "learning_rate": 5.028108838838533e-06, + "loss": 0.0742, + "step": 15700 + }, + { + "epoch": 59.961832061068705, + "grad_norm": 0.6031141877174377, + "learning_rate": 5.022597343913528e-06, + "loss": 0.073, + "step": 15710 + }, + { + "epoch": 60.0, + "grad_norm": 0.7514012455940247, + "learning_rate": 5.017085821530617e-06, + "loss": 0.0768, + "step": 15720 + }, + { + "epoch": 60.038167938931295, + "grad_norm": 0.3948346972465515, + "learning_rate": 5.011574278386823e-06, + "loss": 0.0766, + "step": 15730 + }, + { + "epoch": 60.0763358778626, + "grad_norm": 0.3414020538330078, + "learning_rate": 5.006062721179189e-06, + "loss": 0.0766, + "step": 15740 + }, + { + "epoch": 60.11450381679389, + "grad_norm": 0.2455693781375885, + "learning_rate": 5.000551156604777e-06, + "loss": 0.0718, + "step": 15750 + }, + { + "epoch": 60.152671755725194, + "grad_norm": 0.7095368504524231, + "learning_rate": 4.99503959136066e-06, + "loss": 0.07, + "step": 15760 + }, + { + "epoch": 60.19083969465649, + "grad_norm": 0.37839195132255554, + "learning_rate": 4.9895280321439036e-06, + "loss": 0.0733, + "step": 15770 + }, + { + "epoch": 60.229007633587784, + "grad_norm": 0.3145567774772644, + "learning_rate": 4.984016485651578e-06, + "loss": 0.0784, + "step": 15780 + }, + { + "epoch": 60.267175572519086, + "grad_norm": 0.24902135133743286, + "learning_rate": 4.978504958580728e-06, + "loss": 0.0767, + "step": 15790 + }, + { + "epoch": 60.30534351145038, + "grad_norm": 0.5785359144210815, + "learning_rate": 4.9729934576283815e-06, + "loss": 0.0916, + "step": 15800 + }, + { + "epoch": 60.343511450381676, + "grad_norm": 0.6354800462722778, + "learning_rate": 4.967481989491531e-06, + "loss": 0.0777, + "step": 15810 + }, + { + "epoch": 60.38167938931298, + "grad_norm": 0.6582098603248596, + "learning_rate": 4.961970560867126e-06, + "loss": 0.0743, + "step": 15820 + }, + { + "epoch": 60.41984732824427, + "grad_norm": 0.5716459155082703, + "learning_rate": 4.956459178452079e-06, + "loss": 0.0869, + "step": 15830 + }, + { + "epoch": 60.458015267175576, + "grad_norm": 0.3027888536453247, + "learning_rate": 4.950947848943235e-06, + "loss": 0.0897, + "step": 15840 + }, + { + "epoch": 60.49618320610687, + "grad_norm": 0.4717438519001007, + "learning_rate": 4.94543657903738e-06, + "loss": 0.0734, + "step": 15850 + }, + { + "epoch": 60.534351145038165, + "grad_norm": 0.3485773503780365, + "learning_rate": 4.939925375431226e-06, + "loss": 0.0764, + "step": 15860 + }, + { + "epoch": 60.57251908396947, + "grad_norm": 0.46527671813964844, + "learning_rate": 4.934414244821405e-06, + "loss": 0.0734, + "step": 15870 + }, + { + "epoch": 60.61068702290076, + "grad_norm": 0.2664990723133087, + "learning_rate": 4.928903193904461e-06, + "loss": 0.0738, + "step": 15880 + }, + { + "epoch": 60.64885496183206, + "grad_norm": 0.23172251880168915, + "learning_rate": 4.923392229376841e-06, + "loss": 0.0695, + "step": 15890 + }, + { + "epoch": 60.68702290076336, + "grad_norm": 0.6194725632667542, + "learning_rate": 4.917881357934886e-06, + "loss": 0.0733, + "step": 15900 + }, + { + "epoch": 60.725190839694655, + "grad_norm": 0.5128942131996155, + "learning_rate": 4.912370586274825e-06, + "loss": 0.0675, + "step": 15910 + }, + { + "epoch": 60.76335877862596, + "grad_norm": 0.3195113241672516, + "learning_rate": 4.906859921092763e-06, + "loss": 0.0694, + "step": 15920 + }, + { + "epoch": 60.80152671755725, + "grad_norm": 0.4648715555667877, + "learning_rate": 4.901349369084681e-06, + "loss": 0.0752, + "step": 15930 + }, + { + "epoch": 60.83969465648855, + "grad_norm": 0.37270602583885193, + "learning_rate": 4.895838936946416e-06, + "loss": 0.0813, + "step": 15940 + }, + { + "epoch": 60.87786259541985, + "grad_norm": 0.3094925284385681, + "learning_rate": 4.890328631373666e-06, + "loss": 0.0787, + "step": 15950 + }, + { + "epoch": 60.916030534351144, + "grad_norm": 0.8085688352584839, + "learning_rate": 4.88481845906197e-06, + "loss": 0.077, + "step": 15960 + }, + { + "epoch": 60.954198473282446, + "grad_norm": 0.45573312044143677, + "learning_rate": 4.879308426706707e-06, + "loss": 0.0752, + "step": 15970 + }, + { + "epoch": 60.99236641221374, + "grad_norm": 0.29330453276634216, + "learning_rate": 4.873798541003084e-06, + "loss": 0.0805, + "step": 15980 + }, + { + "epoch": 61.030534351145036, + "grad_norm": 0.26652470231056213, + "learning_rate": 4.868288808646136e-06, + "loss": 0.074, + "step": 15990 + }, + { + "epoch": 61.06870229007634, + "grad_norm": 0.2818049490451813, + "learning_rate": 4.862779236330705e-06, + "loss": 0.0706, + "step": 16000 + }, + { + "epoch": 61.10687022900763, + "grad_norm": 0.32146692276000977, + "learning_rate": 4.8572698307514395e-06, + "loss": 0.0704, + "step": 16010 + }, + { + "epoch": 61.14503816793893, + "grad_norm": 0.3116312325000763, + "learning_rate": 4.8517605986027904e-06, + "loss": 0.0671, + "step": 16020 + }, + { + "epoch": 61.18320610687023, + "grad_norm": 0.3970802128314972, + "learning_rate": 4.846251546578989e-06, + "loss": 0.0745, + "step": 16030 + }, + { + "epoch": 61.221374045801525, + "grad_norm": 0.35653743147850037, + "learning_rate": 4.8407426813740584e-06, + "loss": 0.0737, + "step": 16040 + }, + { + "epoch": 61.25954198473283, + "grad_norm": 0.3067050576210022, + "learning_rate": 4.835234009681787e-06, + "loss": 0.0702, + "step": 16050 + }, + { + "epoch": 61.29770992366412, + "grad_norm": 0.5067670941352844, + "learning_rate": 4.82972553819573e-06, + "loss": 0.0713, + "step": 16060 + }, + { + "epoch": 61.33587786259542, + "grad_norm": 0.31573083996772766, + "learning_rate": 4.824217273609199e-06, + "loss": 0.071, + "step": 16070 + }, + { + "epoch": 61.37404580152672, + "grad_norm": 0.29241570830345154, + "learning_rate": 4.818709222615255e-06, + "loss": 0.0712, + "step": 16080 + }, + { + "epoch": 61.412213740458014, + "grad_norm": 0.29491475224494934, + "learning_rate": 4.813201391906702e-06, + "loss": 0.0783, + "step": 16090 + }, + { + "epoch": 61.45038167938931, + "grad_norm": 0.5220329761505127, + "learning_rate": 4.807693788176071e-06, + "loss": 0.072, + "step": 16100 + }, + { + "epoch": 61.48854961832061, + "grad_norm": 0.27246901392936707, + "learning_rate": 4.802186418115622e-06, + "loss": 0.0756, + "step": 16110 + }, + { + "epoch": 61.52671755725191, + "grad_norm": 0.7175832390785217, + "learning_rate": 4.796679288417326e-06, + "loss": 0.0762, + "step": 16120 + }, + { + "epoch": 61.56488549618321, + "grad_norm": 0.4071796238422394, + "learning_rate": 4.791172405772866e-06, + "loss": 0.0787, + "step": 16130 + }, + { + "epoch": 61.603053435114504, + "grad_norm": 0.2860810458660126, + "learning_rate": 4.785665776873626e-06, + "loss": 0.0705, + "step": 16140 + }, + { + "epoch": 61.6412213740458, + "grad_norm": 0.34713274240493774, + "learning_rate": 4.780159408410677e-06, + "loss": 0.0759, + "step": 16150 + }, + { + "epoch": 61.6793893129771, + "grad_norm": 0.2859640419483185, + "learning_rate": 4.774653307074775e-06, + "loss": 0.0855, + "step": 16160 + }, + { + "epoch": 61.717557251908396, + "grad_norm": 0.30147480964660645, + "learning_rate": 4.7691474795563556e-06, + "loss": 0.0885, + "step": 16170 + }, + { + "epoch": 61.7557251908397, + "grad_norm": 0.516560435295105, + "learning_rate": 4.763641932545516e-06, + "loss": 0.0729, + "step": 16180 + }, + { + "epoch": 61.79389312977099, + "grad_norm": 0.29214778542518616, + "learning_rate": 4.758136672732013e-06, + "loss": 0.0762, + "step": 16190 + }, + { + "epoch": 61.83206106870229, + "grad_norm": 0.3169700503349304, + "learning_rate": 4.752631706805261e-06, + "loss": 0.069, + "step": 16200 + }, + { + "epoch": 61.87022900763359, + "grad_norm": 0.4579392969608307, + "learning_rate": 4.747127041454311e-06, + "loss": 0.0845, + "step": 16210 + }, + { + "epoch": 61.908396946564885, + "grad_norm": 0.4441590905189514, + "learning_rate": 4.741622683367849e-06, + "loss": 0.0681, + "step": 16220 + }, + { + "epoch": 61.94656488549618, + "grad_norm": 0.3080080449581146, + "learning_rate": 4.736118639234191e-06, + "loss": 0.0759, + "step": 16230 + }, + { + "epoch": 61.98473282442748, + "grad_norm": 0.23667652904987335, + "learning_rate": 4.7306149157412666e-06, + "loss": 0.0729, + "step": 16240 + }, + { + "epoch": 62.02290076335878, + "grad_norm": 0.3574734926223755, + "learning_rate": 4.7251115195766234e-06, + "loss": 0.0759, + "step": 16250 + }, + { + "epoch": 62.06106870229008, + "grad_norm": 1.1783217191696167, + "learning_rate": 4.719608457427404e-06, + "loss": 0.0943, + "step": 16260 + }, + { + "epoch": 62.099236641221374, + "grad_norm": 0.46935534477233887, + "learning_rate": 4.714105735980348e-06, + "loss": 0.0772, + "step": 16270 + }, + { + "epoch": 62.13740458015267, + "grad_norm": 0.32659921050071716, + "learning_rate": 4.708603361921779e-06, + "loss": 0.0761, + "step": 16280 + }, + { + "epoch": 62.17557251908397, + "grad_norm": 0.37154969573020935, + "learning_rate": 4.703101341937604e-06, + "loss": 0.0764, + "step": 16290 + }, + { + "epoch": 62.213740458015266, + "grad_norm": 0.5954548716545105, + "learning_rate": 4.697599682713292e-06, + "loss": 0.0725, + "step": 16300 + }, + { + "epoch": 62.25190839694657, + "grad_norm": 0.43994778394699097, + "learning_rate": 4.692098390933883e-06, + "loss": 0.0798, + "step": 16310 + }, + { + "epoch": 62.29007633587786, + "grad_norm": 0.42407238483428955, + "learning_rate": 4.686597473283962e-06, + "loss": 0.0674, + "step": 16320 + }, + { + "epoch": 62.32824427480916, + "grad_norm": 0.31318119168281555, + "learning_rate": 4.681096936447662e-06, + "loss": 0.0801, + "step": 16330 + }, + { + "epoch": 62.36641221374046, + "grad_norm": 0.4887228310108185, + "learning_rate": 4.675596787108652e-06, + "loss": 0.0717, + "step": 16340 + }, + { + "epoch": 62.404580152671755, + "grad_norm": 0.2779092788696289, + "learning_rate": 4.670097031950138e-06, + "loss": 0.0712, + "step": 16350 + }, + { + "epoch": 62.44274809160305, + "grad_norm": 0.839206874370575, + "learning_rate": 4.664597677654839e-06, + "loss": 0.0699, + "step": 16360 + }, + { + "epoch": 62.48091603053435, + "grad_norm": 0.25964847207069397, + "learning_rate": 4.6590987309049855e-06, + "loss": 0.0636, + "step": 16370 + }, + { + "epoch": 62.51908396946565, + "grad_norm": 0.34289413690567017, + "learning_rate": 4.65360019838232e-06, + "loss": 0.0779, + "step": 16380 + }, + { + "epoch": 62.55725190839695, + "grad_norm": 0.33554473519325256, + "learning_rate": 4.648102086768077e-06, + "loss": 0.0791, + "step": 16390 + }, + { + "epoch": 62.595419847328245, + "grad_norm": 0.4923129677772522, + "learning_rate": 4.642604402742979e-06, + "loss": 0.073, + "step": 16400 + }, + { + "epoch": 62.63358778625954, + "grad_norm": 0.44560110569000244, + "learning_rate": 4.6371071529872336e-06, + "loss": 0.0735, + "step": 16410 + }, + { + "epoch": 62.67175572519084, + "grad_norm": 0.47609812021255493, + "learning_rate": 4.6316103441805155e-06, + "loss": 0.071, + "step": 16420 + }, + { + "epoch": 62.70992366412214, + "grad_norm": 0.6052244901657104, + "learning_rate": 4.626113983001965e-06, + "loss": 0.0764, + "step": 16430 + }, + { + "epoch": 62.74809160305343, + "grad_norm": 0.3735920786857605, + "learning_rate": 4.620618076130182e-06, + "loss": 0.0671, + "step": 16440 + }, + { + "epoch": 62.786259541984734, + "grad_norm": 0.279205858707428, + "learning_rate": 4.615122630243207e-06, + "loss": 0.0767, + "step": 16450 + }, + { + "epoch": 62.82442748091603, + "grad_norm": 0.7630006670951843, + "learning_rate": 4.60962765201853e-06, + "loss": 0.0739, + "step": 16460 + }, + { + "epoch": 62.86259541984733, + "grad_norm": 0.22555769979953766, + "learning_rate": 4.604133148133066e-06, + "loss": 0.071, + "step": 16470 + }, + { + "epoch": 62.900763358778626, + "grad_norm": 0.42524755001068115, + "learning_rate": 4.598639125263155e-06, + "loss": 0.0794, + "step": 16480 + }, + { + "epoch": 62.93893129770992, + "grad_norm": 0.38487425446510315, + "learning_rate": 4.593145590084553e-06, + "loss": 0.0764, + "step": 16490 + }, + { + "epoch": 62.97709923664122, + "grad_norm": 0.3085957467556, + "learning_rate": 4.58765254927242e-06, + "loss": 0.0691, + "step": 16500 + }, + { + "epoch": 63.01526717557252, + "grad_norm": 0.4019497334957123, + "learning_rate": 4.582160009501323e-06, + "loss": 0.0692, + "step": 16510 + }, + { + "epoch": 63.05343511450382, + "grad_norm": 0.2319970428943634, + "learning_rate": 4.576667977445214e-06, + "loss": 0.0677, + "step": 16520 + }, + { + "epoch": 63.091603053435115, + "grad_norm": 0.4725869596004486, + "learning_rate": 4.571176459777431e-06, + "loss": 0.0739, + "step": 16530 + }, + { + "epoch": 63.12977099236641, + "grad_norm": 0.3336828351020813, + "learning_rate": 4.565685463170685e-06, + "loss": 0.0676, + "step": 16540 + }, + { + "epoch": 63.16793893129771, + "grad_norm": 0.6508419513702393, + "learning_rate": 4.560194994297054e-06, + "loss": 0.0765, + "step": 16550 + }, + { + "epoch": 63.20610687022901, + "grad_norm": 1.2349802255630493, + "learning_rate": 4.554705059827974e-06, + "loss": 0.0974, + "step": 16560 + }, + { + "epoch": 63.2442748091603, + "grad_norm": 0.23870617151260376, + "learning_rate": 4.549215666434237e-06, + "loss": 0.0632, + "step": 16570 + }, + { + "epoch": 63.282442748091604, + "grad_norm": 0.45109471678733826, + "learning_rate": 4.54372682078597e-06, + "loss": 0.0704, + "step": 16580 + }, + { + "epoch": 63.3206106870229, + "grad_norm": 0.9649439454078674, + "learning_rate": 4.538238529552641e-06, + "loss": 0.0808, + "step": 16590 + }, + { + "epoch": 63.3587786259542, + "grad_norm": 0.35490062832832336, + "learning_rate": 4.532750799403039e-06, + "loss": 0.0762, + "step": 16600 + }, + { + "epoch": 63.396946564885496, + "grad_norm": 0.453528493642807, + "learning_rate": 4.527263637005274e-06, + "loss": 0.0764, + "step": 16610 + }, + { + "epoch": 63.43511450381679, + "grad_norm": 0.4732442796230316, + "learning_rate": 4.521777049026767e-06, + "loss": 0.0772, + "step": 16620 + }, + { + "epoch": 63.47328244274809, + "grad_norm": 0.4358898401260376, + "learning_rate": 4.516291042134238e-06, + "loss": 0.0697, + "step": 16630 + }, + { + "epoch": 63.51145038167939, + "grad_norm": 0.3423472046852112, + "learning_rate": 4.5108056229937055e-06, + "loss": 0.0741, + "step": 16640 + }, + { + "epoch": 63.54961832061069, + "grad_norm": 0.5359134078025818, + "learning_rate": 4.505320798270467e-06, + "loss": 0.072, + "step": 16650 + }, + { + "epoch": 63.587786259541986, + "grad_norm": 0.2593790590763092, + "learning_rate": 4.4998365746291045e-06, + "loss": 0.0761, + "step": 16660 + }, + { + "epoch": 63.62595419847328, + "grad_norm": 0.24456137418746948, + "learning_rate": 4.494352958733466e-06, + "loss": 0.0652, + "step": 16670 + }, + { + "epoch": 63.66412213740458, + "grad_norm": 1.0902378559112549, + "learning_rate": 4.4888699572466624e-06, + "loss": 0.0741, + "step": 16680 + }, + { + "epoch": 63.70229007633588, + "grad_norm": 0.3199808895587921, + "learning_rate": 4.483387576831058e-06, + "loss": 0.0774, + "step": 16690 + }, + { + "epoch": 63.74045801526717, + "grad_norm": 1.09122896194458, + "learning_rate": 4.47790582414826e-06, + "loss": 0.0737, + "step": 16700 + }, + { + "epoch": 63.778625954198475, + "grad_norm": 0.3936939537525177, + "learning_rate": 4.472424705859115e-06, + "loss": 0.0705, + "step": 16710 + }, + { + "epoch": 63.81679389312977, + "grad_norm": 0.3976505398750305, + "learning_rate": 4.466944228623701e-06, + "loss": 0.073, + "step": 16720 + }, + { + "epoch": 63.85496183206107, + "grad_norm": 0.2906537652015686, + "learning_rate": 4.4614643991013125e-06, + "loss": 0.0748, + "step": 16730 + }, + { + "epoch": 63.89312977099237, + "grad_norm": 0.31357595324516296, + "learning_rate": 4.45598522395046e-06, + "loss": 0.0729, + "step": 16740 + }, + { + "epoch": 63.93129770992366, + "grad_norm": 0.30560722947120667, + "learning_rate": 4.450506709828858e-06, + "loss": 0.0751, + "step": 16750 + }, + { + "epoch": 63.969465648854964, + "grad_norm": 0.3903256356716156, + "learning_rate": 4.445028863393417e-06, + "loss": 0.0737, + "step": 16760 + }, + { + "epoch": 64.00763358778626, + "grad_norm": 0.5252776741981506, + "learning_rate": 4.439551691300236e-06, + "loss": 0.0748, + "step": 16770 + }, + { + "epoch": 64.04580152671755, + "grad_norm": 0.389498233795166, + "learning_rate": 4.4340752002045985e-06, + "loss": 0.0749, + "step": 16780 + }, + { + "epoch": 64.08396946564885, + "grad_norm": 0.3666820228099823, + "learning_rate": 4.428599396760957e-06, + "loss": 0.0691, + "step": 16790 + }, + { + "epoch": 64.12213740458016, + "grad_norm": 0.30693432688713074, + "learning_rate": 4.4231242876229256e-06, + "loss": 0.0736, + "step": 16800 + }, + { + "epoch": 64.16030534351145, + "grad_norm": 0.44508853554725647, + "learning_rate": 4.417649879443282e-06, + "loss": 0.0683, + "step": 16810 + }, + { + "epoch": 64.19847328244275, + "grad_norm": 0.3270077109336853, + "learning_rate": 4.4121761788739445e-06, + "loss": 0.0672, + "step": 16820 + }, + { + "epoch": 64.23664122137404, + "grad_norm": 0.2393168956041336, + "learning_rate": 4.406703192565981e-06, + "loss": 0.0708, + "step": 16830 + }, + { + "epoch": 64.27480916030534, + "grad_norm": 0.4205631613731384, + "learning_rate": 4.401230927169582e-06, + "loss": 0.0735, + "step": 16840 + }, + { + "epoch": 64.31297709923665, + "grad_norm": 0.7220420241355896, + "learning_rate": 4.395759389334067e-06, + "loss": 0.0704, + "step": 16850 + }, + { + "epoch": 64.35114503816794, + "grad_norm": 0.4618149697780609, + "learning_rate": 4.3902885857078685e-06, + "loss": 0.0716, + "step": 16860 + }, + { + "epoch": 64.38931297709924, + "grad_norm": 0.304708868265152, + "learning_rate": 4.384818522938531e-06, + "loss": 0.074, + "step": 16870 + }, + { + "epoch": 64.42748091603053, + "grad_norm": 0.6205267310142517, + "learning_rate": 4.379349207672696e-06, + "loss": 0.0716, + "step": 16880 + }, + { + "epoch": 64.46564885496183, + "grad_norm": 0.4849410057067871, + "learning_rate": 4.3738806465560975e-06, + "loss": 0.0744, + "step": 16890 + }, + { + "epoch": 64.50381679389314, + "grad_norm": 0.2881622612476349, + "learning_rate": 4.368412846233554e-06, + "loss": 0.0694, + "step": 16900 + }, + { + "epoch": 64.54198473282443, + "grad_norm": 0.5529125928878784, + "learning_rate": 4.362945813348956e-06, + "loss": 0.0724, + "step": 16910 + }, + { + "epoch": 64.58015267175573, + "grad_norm": 0.41142237186431885, + "learning_rate": 4.357479554545263e-06, + "loss": 0.0715, + "step": 16920 + }, + { + "epoch": 64.61832061068702, + "grad_norm": 0.6459675431251526, + "learning_rate": 4.352014076464499e-06, + "loss": 0.0723, + "step": 16930 + }, + { + "epoch": 64.65648854961832, + "grad_norm": 0.5050190091133118, + "learning_rate": 4.346549385747733e-06, + "loss": 0.0754, + "step": 16940 + }, + { + "epoch": 64.69465648854961, + "grad_norm": 0.3293865919113159, + "learning_rate": 4.34108548903508e-06, + "loss": 0.0696, + "step": 16950 + }, + { + "epoch": 64.73282442748092, + "grad_norm": 0.43672609329223633, + "learning_rate": 4.335622392965689e-06, + "loss": 0.0745, + "step": 16960 + }, + { + "epoch": 64.77099236641222, + "grad_norm": 0.43421950936317444, + "learning_rate": 4.330160104177738e-06, + "loss": 0.0691, + "step": 16970 + }, + { + "epoch": 64.80916030534351, + "grad_norm": 1.2267439365386963, + "learning_rate": 4.324698629308419e-06, + "loss": 0.0748, + "step": 16980 + }, + { + "epoch": 64.8473282442748, + "grad_norm": 0.27527856826782227, + "learning_rate": 4.3192379749939466e-06, + "loss": 0.0736, + "step": 16990 + }, + { + "epoch": 64.8854961832061, + "grad_norm": 0.84715735912323, + "learning_rate": 4.313778147869524e-06, + "loss": 0.0794, + "step": 17000 + }, + { + "epoch": 64.92366412213741, + "grad_norm": 0.49666497111320496, + "learning_rate": 4.308319154569358e-06, + "loss": 0.0752, + "step": 17010 + }, + { + "epoch": 64.9618320610687, + "grad_norm": 0.6374955773353577, + "learning_rate": 4.302861001726642e-06, + "loss": 0.0796, + "step": 17020 + }, + { + "epoch": 65.0, + "grad_norm": 0.31495794653892517, + "learning_rate": 4.297403695973542e-06, + "loss": 0.0733, + "step": 17030 + }, + { + "epoch": 65.0381679389313, + "grad_norm": 0.4066680073738098, + "learning_rate": 4.291947243941203e-06, + "loss": 0.0755, + "step": 17040 + }, + { + "epoch": 65.07633587786259, + "grad_norm": 0.33870530128479004, + "learning_rate": 4.286491652259729e-06, + "loss": 0.0676, + "step": 17050 + }, + { + "epoch": 65.1145038167939, + "grad_norm": 0.2708606421947479, + "learning_rate": 4.281036927558178e-06, + "loss": 0.0708, + "step": 17060 + }, + { + "epoch": 65.1526717557252, + "grad_norm": 0.2700878083705902, + "learning_rate": 4.275583076464552e-06, + "loss": 0.071, + "step": 17070 + }, + { + "epoch": 65.19083969465649, + "grad_norm": 0.3311876952648163, + "learning_rate": 4.270130105605794e-06, + "loss": 0.0732, + "step": 17080 + }, + { + "epoch": 65.22900763358778, + "grad_norm": 0.2695978581905365, + "learning_rate": 4.264678021607782e-06, + "loss": 0.0736, + "step": 17090 + }, + { + "epoch": 65.26717557251908, + "grad_norm": 0.3300071954727173, + "learning_rate": 4.259226831095311e-06, + "loss": 0.0724, + "step": 17100 + }, + { + "epoch": 65.30534351145039, + "grad_norm": 0.2677368223667145, + "learning_rate": 4.25377654069209e-06, + "loss": 0.0703, + "step": 17110 + }, + { + "epoch": 65.34351145038168, + "grad_norm": 0.29270052909851074, + "learning_rate": 4.248327157020737e-06, + "loss": 0.0724, + "step": 17120 + }, + { + "epoch": 65.38167938931298, + "grad_norm": 0.6969024538993835, + "learning_rate": 4.242878686702763e-06, + "loss": 0.0909, + "step": 17130 + }, + { + "epoch": 65.41984732824427, + "grad_norm": 0.3158918023109436, + "learning_rate": 4.237431136358579e-06, + "loss": 0.0732, + "step": 17140 + }, + { + "epoch": 65.45801526717557, + "grad_norm": 0.4324190318584442, + "learning_rate": 4.231984512607471e-06, + "loss": 0.0729, + "step": 17150 + }, + { + "epoch": 65.49618320610686, + "grad_norm": 0.522308886051178, + "learning_rate": 4.226538822067598e-06, + "loss": 0.0781, + "step": 17160 + }, + { + "epoch": 65.53435114503817, + "grad_norm": 0.7167192697525024, + "learning_rate": 4.22109407135599e-06, + "loss": 0.0677, + "step": 17170 + }, + { + "epoch": 65.57251908396947, + "grad_norm": 0.4006265103816986, + "learning_rate": 4.2156502670885304e-06, + "loss": 0.0681, + "step": 17180 + }, + { + "epoch": 65.61068702290076, + "grad_norm": 0.246806800365448, + "learning_rate": 4.210207415879953e-06, + "loss": 0.0702, + "step": 17190 + }, + { + "epoch": 65.64885496183206, + "grad_norm": 0.3185785710811615, + "learning_rate": 4.204765524343841e-06, + "loss": 0.0648, + "step": 17200 + }, + { + "epoch": 65.68702290076335, + "grad_norm": 0.2700112760066986, + "learning_rate": 4.199324599092603e-06, + "loss": 0.0742, + "step": 17210 + }, + { + "epoch": 65.72519083969466, + "grad_norm": 0.7986395955085754, + "learning_rate": 4.1938846467374745e-06, + "loss": 0.0727, + "step": 17220 + }, + { + "epoch": 65.76335877862596, + "grad_norm": 0.34733664989471436, + "learning_rate": 4.1884456738885125e-06, + "loss": 0.0647, + "step": 17230 + }, + { + "epoch": 65.80152671755725, + "grad_norm": 0.35277488827705383, + "learning_rate": 4.18300768715458e-06, + "loss": 0.0784, + "step": 17240 + }, + { + "epoch": 65.83969465648855, + "grad_norm": 0.31571856141090393, + "learning_rate": 4.177570693143347e-06, + "loss": 0.0736, + "step": 17250 + }, + { + "epoch": 65.87786259541984, + "grad_norm": 0.47372332215309143, + "learning_rate": 4.172134698461271e-06, + "loss": 0.0749, + "step": 17260 + }, + { + "epoch": 65.91603053435115, + "grad_norm": 0.3981868624687195, + "learning_rate": 4.166699709713599e-06, + "loss": 0.0677, + "step": 17270 + }, + { + "epoch": 65.95419847328245, + "grad_norm": 0.4929441511631012, + "learning_rate": 4.161265733504355e-06, + "loss": 0.0733, + "step": 17280 + }, + { + "epoch": 65.99236641221374, + "grad_norm": 0.24368400871753693, + "learning_rate": 4.155832776436331e-06, + "loss": 0.0687, + "step": 17290 + }, + { + "epoch": 66.03053435114504, + "grad_norm": 0.3198246359825134, + "learning_rate": 4.150400845111085e-06, + "loss": 0.0813, + "step": 17300 + }, + { + "epoch": 66.06870229007633, + "grad_norm": 0.288518488407135, + "learning_rate": 4.144969946128923e-06, + "loss": 0.072, + "step": 17310 + }, + { + "epoch": 66.10687022900764, + "grad_norm": 0.2651676833629608, + "learning_rate": 4.139540086088901e-06, + "loss": 0.0696, + "step": 17320 + }, + { + "epoch": 66.14503816793894, + "grad_norm": 0.21474707126617432, + "learning_rate": 4.1341112715888106e-06, + "loss": 0.0711, + "step": 17330 + }, + { + "epoch": 66.18320610687023, + "grad_norm": 0.3536648452281952, + "learning_rate": 4.128683509225172e-06, + "loss": 0.0783, + "step": 17340 + }, + { + "epoch": 66.22137404580153, + "grad_norm": 0.3147779703140259, + "learning_rate": 4.123256805593231e-06, + "loss": 0.0691, + "step": 17350 + }, + { + "epoch": 66.25954198473282, + "grad_norm": 0.45088210701942444, + "learning_rate": 4.117831167286943e-06, + "loss": 0.0716, + "step": 17360 + }, + { + "epoch": 66.29770992366412, + "grad_norm": 0.4173077940940857, + "learning_rate": 4.112406600898968e-06, + "loss": 0.0756, + "step": 17370 + }, + { + "epoch": 66.33587786259542, + "grad_norm": 0.2509874403476715, + "learning_rate": 4.106983113020669e-06, + "loss": 0.0699, + "step": 17380 + }, + { + "epoch": 66.37404580152672, + "grad_norm": 0.5545008778572083, + "learning_rate": 4.101560710242094e-06, + "loss": 0.0757, + "step": 17390 + }, + { + "epoch": 66.41221374045801, + "grad_norm": 0.3343713879585266, + "learning_rate": 4.096139399151971e-06, + "loss": 0.0713, + "step": 17400 + }, + { + "epoch": 66.45038167938931, + "grad_norm": 0.40432122349739075, + "learning_rate": 4.090719186337709e-06, + "loss": 0.0685, + "step": 17410 + }, + { + "epoch": 66.4885496183206, + "grad_norm": 0.8056888580322266, + "learning_rate": 4.085300078385375e-06, + "loss": 0.0753, + "step": 17420 + }, + { + "epoch": 66.52671755725191, + "grad_norm": 0.43857723474502563, + "learning_rate": 4.079882081879696e-06, + "loss": 0.0714, + "step": 17430 + }, + { + "epoch": 66.56488549618321, + "grad_norm": 0.8525163531303406, + "learning_rate": 4.074465203404048e-06, + "loss": 0.0833, + "step": 17440 + }, + { + "epoch": 66.6030534351145, + "grad_norm": 0.7485587000846863, + "learning_rate": 4.06904944954045e-06, + "loss": 0.07, + "step": 17450 + }, + { + "epoch": 66.6412213740458, + "grad_norm": 0.22170186042785645, + "learning_rate": 4.063634826869553e-06, + "loss": 0.0746, + "step": 17460 + }, + { + "epoch": 66.6793893129771, + "grad_norm": 0.40687039494514465, + "learning_rate": 4.058221341970638e-06, + "loss": 0.0709, + "step": 17470 + }, + { + "epoch": 66.7175572519084, + "grad_norm": 0.3101550340652466, + "learning_rate": 4.052809001421595e-06, + "loss": 0.0653, + "step": 17480 + }, + { + "epoch": 66.7557251908397, + "grad_norm": 0.4546465277671814, + "learning_rate": 4.047397811798929e-06, + "loss": 0.0698, + "step": 17490 + }, + { + "epoch": 66.79389312977099, + "grad_norm": 0.35028156638145447, + "learning_rate": 4.041987779677745e-06, + "loss": 0.0774, + "step": 17500 + }, + { + "epoch": 66.83206106870229, + "grad_norm": 0.47379425168037415, + "learning_rate": 4.036578911631746e-06, + "loss": 0.0726, + "step": 17510 + }, + { + "epoch": 66.87022900763358, + "grad_norm": 0.6644511222839355, + "learning_rate": 4.0311712142332115e-06, + "loss": 0.069, + "step": 17520 + }, + { + "epoch": 66.90839694656489, + "grad_norm": 0.6375014185905457, + "learning_rate": 4.025764694053008e-06, + "loss": 0.0676, + "step": 17530 + }, + { + "epoch": 66.94656488549619, + "grad_norm": 0.39866891503334045, + "learning_rate": 4.020359357660566e-06, + "loss": 0.0773, + "step": 17540 + }, + { + "epoch": 66.98473282442748, + "grad_norm": 0.31311362981796265, + "learning_rate": 4.014955211623875e-06, + "loss": 0.0661, + "step": 17550 + }, + { + "epoch": 67.02290076335878, + "grad_norm": 0.42248404026031494, + "learning_rate": 4.00955226250949e-06, + "loss": 0.0787, + "step": 17560 + }, + { + "epoch": 67.06106870229007, + "grad_norm": 0.7026179432868958, + "learning_rate": 4.0041505168824976e-06, + "loss": 0.0755, + "step": 17570 + }, + { + "epoch": 67.09923664122137, + "grad_norm": 0.32166993618011475, + "learning_rate": 3.99874998130653e-06, + "loss": 0.0659, + "step": 17580 + }, + { + "epoch": 67.13740458015268, + "grad_norm": 0.2971125543117523, + "learning_rate": 3.993350662343746e-06, + "loss": 0.0675, + "step": 17590 + }, + { + "epoch": 67.17557251908397, + "grad_norm": 0.3091699182987213, + "learning_rate": 3.987952566554828e-06, + "loss": 0.0719, + "step": 17600 + }, + { + "epoch": 67.21374045801527, + "grad_norm": 0.24681086838245392, + "learning_rate": 3.982555700498971e-06, + "loss": 0.0732, + "step": 17610 + }, + { + "epoch": 67.25190839694656, + "grad_norm": 0.4795871078968048, + "learning_rate": 3.977160070733878e-06, + "loss": 0.0691, + "step": 17620 + }, + { + "epoch": 67.29007633587786, + "grad_norm": 0.6124231815338135, + "learning_rate": 3.971765683815746e-06, + "loss": 0.0702, + "step": 17630 + }, + { + "epoch": 67.32824427480917, + "grad_norm": 0.659125030040741, + "learning_rate": 3.966372546299264e-06, + "loss": 0.0744, + "step": 17640 + }, + { + "epoch": 67.36641221374046, + "grad_norm": 0.6344574689865112, + "learning_rate": 3.960980664737604e-06, + "loss": 0.0723, + "step": 17650 + }, + { + "epoch": 67.40458015267176, + "grad_norm": 0.3194618225097656, + "learning_rate": 3.955590045682408e-06, + "loss": 0.0714, + "step": 17660 + }, + { + "epoch": 67.44274809160305, + "grad_norm": 0.44845250248908997, + "learning_rate": 3.950200695683788e-06, + "loss": 0.0723, + "step": 17670 + }, + { + "epoch": 67.48091603053435, + "grad_norm": 1.0037466287612915, + "learning_rate": 3.944812621290314e-06, + "loss": 0.0691, + "step": 17680 + }, + { + "epoch": 67.51908396946565, + "grad_norm": 0.2766275107860565, + "learning_rate": 3.939425829049002e-06, + "loss": 0.0717, + "step": 17690 + }, + { + "epoch": 67.55725190839695, + "grad_norm": 0.654982328414917, + "learning_rate": 3.934040325505313e-06, + "loss": 0.0672, + "step": 17700 + }, + { + "epoch": 67.59541984732824, + "grad_norm": 0.36373141407966614, + "learning_rate": 3.928656117203141e-06, + "loss": 0.0713, + "step": 17710 + }, + { + "epoch": 67.63358778625954, + "grad_norm": 0.34860584139823914, + "learning_rate": 3.923273210684809e-06, + "loss": 0.0757, + "step": 17720 + }, + { + "epoch": 67.67175572519083, + "grad_norm": 0.5683857202529907, + "learning_rate": 3.917891612491055e-06, + "loss": 0.0675, + "step": 17730 + }, + { + "epoch": 67.70992366412214, + "grad_norm": 0.5951647162437439, + "learning_rate": 3.912511329161027e-06, + "loss": 0.0702, + "step": 17740 + }, + { + "epoch": 67.74809160305344, + "grad_norm": 0.27941733598709106, + "learning_rate": 3.907132367232279e-06, + "loss": 0.0696, + "step": 17750 + }, + { + "epoch": 67.78625954198473, + "grad_norm": 0.27847856283187866, + "learning_rate": 3.901754733240753e-06, + "loss": 0.0727, + "step": 17760 + }, + { + "epoch": 67.82442748091603, + "grad_norm": 0.5478169322013855, + "learning_rate": 3.896378433720786e-06, + "loss": 0.0719, + "step": 17770 + }, + { + "epoch": 67.86259541984732, + "grad_norm": 0.5233021378517151, + "learning_rate": 3.891003475205086e-06, + "loss": 0.065, + "step": 17780 + }, + { + "epoch": 67.90076335877862, + "grad_norm": 0.2591749131679535, + "learning_rate": 3.885629864224736e-06, + "loss": 0.0731, + "step": 17790 + }, + { + "epoch": 67.93893129770993, + "grad_norm": 0.39864397048950195, + "learning_rate": 3.880257607309178e-06, + "loss": 0.0697, + "step": 17800 + }, + { + "epoch": 67.97709923664122, + "grad_norm": 0.46604353189468384, + "learning_rate": 3.874886710986213e-06, + "loss": 0.0708, + "step": 17810 + }, + { + "epoch": 68.01526717557252, + "grad_norm": 0.35567206144332886, + "learning_rate": 3.869517181781983e-06, + "loss": 0.0679, + "step": 17820 + }, + { + "epoch": 68.05343511450381, + "grad_norm": 0.8951524496078491, + "learning_rate": 3.864149026220977e-06, + "loss": 0.076, + "step": 17830 + }, + { + "epoch": 68.09160305343511, + "grad_norm": 0.43323707580566406, + "learning_rate": 3.858782250826009e-06, + "loss": 0.0781, + "step": 17840 + }, + { + "epoch": 68.12977099236642, + "grad_norm": 0.3977680206298828, + "learning_rate": 3.853416862118214e-06, + "loss": 0.0766, + "step": 17850 + }, + { + "epoch": 68.16793893129771, + "grad_norm": 0.7726107239723206, + "learning_rate": 3.8480528666170495e-06, + "loss": 0.075, + "step": 17860 + }, + { + "epoch": 68.20610687022901, + "grad_norm": 0.40981167554855347, + "learning_rate": 3.8426902708402695e-06, + "loss": 0.0697, + "step": 17870 + }, + { + "epoch": 68.2442748091603, + "grad_norm": 0.23430369794368744, + "learning_rate": 3.8373290813039404e-06, + "loss": 0.0744, + "step": 17880 + }, + { + "epoch": 68.2824427480916, + "grad_norm": 0.33602553606033325, + "learning_rate": 3.83196930452241e-06, + "loss": 0.0729, + "step": 17890 + }, + { + "epoch": 68.3206106870229, + "grad_norm": 0.49401628971099854, + "learning_rate": 3.826610947008313e-06, + "loss": 0.0703, + "step": 17900 + }, + { + "epoch": 68.3587786259542, + "grad_norm": 0.4614551067352295, + "learning_rate": 3.821254015272558e-06, + "loss": 0.0699, + "step": 17910 + }, + { + "epoch": 68.3969465648855, + "grad_norm": 0.39021456241607666, + "learning_rate": 3.8158985158243214e-06, + "loss": 0.0722, + "step": 17920 + }, + { + "epoch": 68.43511450381679, + "grad_norm": 0.2698560059070587, + "learning_rate": 3.810544455171044e-06, + "loss": 0.0725, + "step": 17930 + }, + { + "epoch": 68.47328244274809, + "grad_norm": 0.339725136756897, + "learning_rate": 3.805191839818412e-06, + "loss": 0.0667, + "step": 17940 + }, + { + "epoch": 68.5114503816794, + "grad_norm": 0.36522993445396423, + "learning_rate": 3.7998406762703566e-06, + "loss": 0.0783, + "step": 17950 + }, + { + "epoch": 68.54961832061069, + "grad_norm": 0.4435858726501465, + "learning_rate": 3.794490971029048e-06, + "loss": 0.0769, + "step": 17960 + }, + { + "epoch": 68.58778625954199, + "grad_norm": 0.3447798490524292, + "learning_rate": 3.7891427305948815e-06, + "loss": 0.0898, + "step": 17970 + }, + { + "epoch": 68.62595419847328, + "grad_norm": 0.2673998177051544, + "learning_rate": 3.7837959614664714e-06, + "loss": 0.0806, + "step": 17980 + }, + { + "epoch": 68.66412213740458, + "grad_norm": 0.3040778636932373, + "learning_rate": 3.778450670140651e-06, + "loss": 0.0745, + "step": 17990 + }, + { + "epoch": 68.70229007633588, + "grad_norm": 0.3017938733100891, + "learning_rate": 3.773106863112451e-06, + "loss": 0.0692, + "step": 18000 + }, + { + "epoch": 68.74045801526718, + "grad_norm": 0.28814712166786194, + "learning_rate": 3.7677645468751e-06, + "loss": 0.0656, + "step": 18010 + }, + { + "epoch": 68.77862595419847, + "grad_norm": 0.5016555190086365, + "learning_rate": 3.7624237279200175e-06, + "loss": 0.0721, + "step": 18020 + }, + { + "epoch": 68.81679389312977, + "grad_norm": 0.23837906122207642, + "learning_rate": 3.7570844127367994e-06, + "loss": 0.0642, + "step": 18030 + }, + { + "epoch": 68.85496183206106, + "grad_norm": 0.5186642408370972, + "learning_rate": 3.7517466078132213e-06, + "loss": 0.0689, + "step": 18040 + }, + { + "epoch": 68.89312977099236, + "grad_norm": 0.4285561442375183, + "learning_rate": 3.7464103196352176e-06, + "loss": 0.0816, + "step": 18050 + }, + { + "epoch": 68.93129770992367, + "grad_norm": 0.3345412015914917, + "learning_rate": 3.7410755546868803e-06, + "loss": 0.0794, + "step": 18060 + }, + { + "epoch": 68.96946564885496, + "grad_norm": 0.3829120695590973, + "learning_rate": 3.7357423194504538e-06, + "loss": 0.0698, + "step": 18070 + }, + { + "epoch": 69.00763358778626, + "grad_norm": 0.327454537153244, + "learning_rate": 3.7304106204063186e-06, + "loss": 0.0763, + "step": 18080 + }, + { + "epoch": 69.04580152671755, + "grad_norm": 0.4839470088481903, + "learning_rate": 3.725080464032996e-06, + "loss": 0.0722, + "step": 18090 + }, + { + "epoch": 69.08396946564885, + "grad_norm": 0.2581571340560913, + "learning_rate": 3.7197518568071256e-06, + "loss": 0.0678, + "step": 18100 + }, + { + "epoch": 69.12213740458016, + "grad_norm": 0.41950273513793945, + "learning_rate": 3.7144248052034696e-06, + "loss": 0.0769, + "step": 18110 + }, + { + "epoch": 69.16030534351145, + "grad_norm": 0.35857993364334106, + "learning_rate": 3.7090993156948973e-06, + "loss": 0.069, + "step": 18120 + }, + { + "epoch": 69.19847328244275, + "grad_norm": 0.5615084171295166, + "learning_rate": 3.7037753947523786e-06, + "loss": 0.0648, + "step": 18130 + }, + { + "epoch": 69.23664122137404, + "grad_norm": 0.32720375061035156, + "learning_rate": 3.6984530488449833e-06, + "loss": 0.0669, + "step": 18140 + }, + { + "epoch": 69.27480916030534, + "grad_norm": 0.38296082615852356, + "learning_rate": 3.693132284439861e-06, + "loss": 0.0698, + "step": 18150 + }, + { + "epoch": 69.31297709923665, + "grad_norm": 0.39868733286857605, + "learning_rate": 3.687813108002242e-06, + "loss": 0.0693, + "step": 18160 + }, + { + "epoch": 69.35114503816794, + "grad_norm": 0.2485743910074234, + "learning_rate": 3.6824955259954285e-06, + "loss": 0.0765, + "step": 18170 + }, + { + "epoch": 69.38931297709924, + "grad_norm": 0.3294006884098053, + "learning_rate": 3.6771795448807847e-06, + "loss": 0.0684, + "step": 18180 + }, + { + "epoch": 69.42748091603053, + "grad_norm": 0.584002673625946, + "learning_rate": 3.6718651711177244e-06, + "loss": 0.0775, + "step": 18190 + }, + { + "epoch": 69.46564885496183, + "grad_norm": 0.3454589545726776, + "learning_rate": 3.6665524111637184e-06, + "loss": 0.0703, + "step": 18200 + }, + { + "epoch": 69.50381679389314, + "grad_norm": 0.4234253466129303, + "learning_rate": 3.6612412714742695e-06, + "loss": 0.07, + "step": 18210 + }, + { + "epoch": 69.54198473282443, + "grad_norm": 0.8012011051177979, + "learning_rate": 3.655931758502912e-06, + "loss": 0.0753, + "step": 18220 + }, + { + "epoch": 69.58015267175573, + "grad_norm": 0.25509560108184814, + "learning_rate": 3.6506238787012038e-06, + "loss": 0.0704, + "step": 18230 + }, + { + "epoch": 69.61832061068702, + "grad_norm": 0.4144178628921509, + "learning_rate": 3.645317638518721e-06, + "loss": 0.0743, + "step": 18240 + }, + { + "epoch": 69.65648854961832, + "grad_norm": 0.2962321937084198, + "learning_rate": 3.6400130444030456e-06, + "loss": 0.0815, + "step": 18250 + }, + { + "epoch": 69.69465648854961, + "grad_norm": 0.6895132660865784, + "learning_rate": 3.634710102799761e-06, + "loss": 0.0696, + "step": 18260 + }, + { + "epoch": 69.73282442748092, + "grad_norm": 0.2801784873008728, + "learning_rate": 3.62940882015244e-06, + "loss": 0.0733, + "step": 18270 + }, + { + "epoch": 69.77099236641222, + "grad_norm": 0.24803809821605682, + "learning_rate": 3.6241092029026405e-06, + "loss": 0.0778, + "step": 18280 + }, + { + "epoch": 69.80916030534351, + "grad_norm": 0.7623275518417358, + "learning_rate": 3.6188112574898955e-06, + "loss": 0.0768, + "step": 18290 + }, + { + "epoch": 69.8473282442748, + "grad_norm": 0.2937699556350708, + "learning_rate": 3.613514990351712e-06, + "loss": 0.0761, + "step": 18300 + }, + { + "epoch": 69.8854961832061, + "grad_norm": 0.3946186900138855, + "learning_rate": 3.608220407923552e-06, + "loss": 0.0707, + "step": 18310 + }, + { + "epoch": 69.92366412213741, + "grad_norm": 0.28563305735588074, + "learning_rate": 3.602927516638833e-06, + "loss": 0.0698, + "step": 18320 + }, + { + "epoch": 69.9618320610687, + "grad_norm": 0.25289323925971985, + "learning_rate": 3.597636322928917e-06, + "loss": 0.0726, + "step": 18330 + }, + { + "epoch": 70.0, + "grad_norm": 0.21828551590442657, + "learning_rate": 3.5923468332231003e-06, + "loss": 0.0706, + "step": 18340 + }, + { + "epoch": 70.0381679389313, + "grad_norm": 0.2835104763507843, + "learning_rate": 3.5870590539486163e-06, + "loss": 0.0672, + "step": 18350 + }, + { + "epoch": 70.07633587786259, + "grad_norm": 0.2114870399236679, + "learning_rate": 3.5817729915306138e-06, + "loss": 0.0672, + "step": 18360 + }, + { + "epoch": 70.1145038167939, + "grad_norm": 0.33038604259490967, + "learning_rate": 3.5764886523921567e-06, + "loss": 0.062, + "step": 18370 + }, + { + "epoch": 70.1526717557252, + "grad_norm": 0.25041481852531433, + "learning_rate": 3.571206042954214e-06, + "loss": 0.0711, + "step": 18380 + }, + { + "epoch": 70.19083969465649, + "grad_norm": 0.30329805612564087, + "learning_rate": 3.565925169635657e-06, + "loss": 0.0814, + "step": 18390 + }, + { + "epoch": 70.22900763358778, + "grad_norm": 0.6908921599388123, + "learning_rate": 3.5606460388532406e-06, + "loss": 0.0786, + "step": 18400 + }, + { + "epoch": 70.26717557251908, + "grad_norm": 0.8706730008125305, + "learning_rate": 3.5553686570216116e-06, + "loss": 0.0758, + "step": 18410 + }, + { + "epoch": 70.30534351145039, + "grad_norm": 0.6713736057281494, + "learning_rate": 3.5500930305532845e-06, + "loss": 0.0827, + "step": 18420 + }, + { + "epoch": 70.34351145038168, + "grad_norm": 0.4993293583393097, + "learning_rate": 3.5448191658586423e-06, + "loss": 0.0777, + "step": 18430 + }, + { + "epoch": 70.38167938931298, + "grad_norm": 0.7732614874839783, + "learning_rate": 3.5395470693459267e-06, + "loss": 0.0762, + "step": 18440 + }, + { + "epoch": 70.41984732824427, + "grad_norm": 0.37474754452705383, + "learning_rate": 3.5342767474212344e-06, + "loss": 0.0741, + "step": 18450 + }, + { + "epoch": 70.45801526717557, + "grad_norm": 0.6375201344490051, + "learning_rate": 3.5290082064885025e-06, + "loss": 0.0704, + "step": 18460 + }, + { + "epoch": 70.49618320610686, + "grad_norm": 0.3896820545196533, + "learning_rate": 3.5237414529495056e-06, + "loss": 0.0739, + "step": 18470 + }, + { + "epoch": 70.53435114503817, + "grad_norm": 0.26550376415252686, + "learning_rate": 3.5184764932038457e-06, + "loss": 0.0941, + "step": 18480 + }, + { + "epoch": 70.57251908396947, + "grad_norm": 0.41228294372558594, + "learning_rate": 3.513213333648945e-06, + "loss": 0.0693, + "step": 18490 + }, + { + "epoch": 70.61068702290076, + "grad_norm": 0.3219086825847626, + "learning_rate": 3.507951980680037e-06, + "loss": 0.0706, + "step": 18500 + }, + { + "epoch": 70.64885496183206, + "grad_norm": 0.2977215051651001, + "learning_rate": 3.502692440690165e-06, + "loss": 0.0728, + "step": 18510 + }, + { + "epoch": 70.68702290076335, + "grad_norm": 0.45944830775260925, + "learning_rate": 3.497434720070165e-06, + "loss": 0.0658, + "step": 18520 + }, + { + "epoch": 70.72519083969466, + "grad_norm": 0.45942363142967224, + "learning_rate": 3.492178825208662e-06, + "loss": 0.0708, + "step": 18530 + }, + { + "epoch": 70.76335877862596, + "grad_norm": 0.401991069316864, + "learning_rate": 3.486924762492065e-06, + "loss": 0.0772, + "step": 18540 + }, + { + "epoch": 70.80152671755725, + "grad_norm": 0.390428751707077, + "learning_rate": 3.4816725383045534e-06, + "loss": 0.0712, + "step": 18550 + }, + { + "epoch": 70.83969465648855, + "grad_norm": 0.36165851354599, + "learning_rate": 3.476422159028079e-06, + "loss": 0.0714, + "step": 18560 + }, + { + "epoch": 70.87786259541984, + "grad_norm": 0.3357897996902466, + "learning_rate": 3.471173631042345e-06, + "loss": 0.0746, + "step": 18570 + }, + { + "epoch": 70.91603053435115, + "grad_norm": 0.9048196077346802, + "learning_rate": 3.465926960724808e-06, + "loss": 0.0731, + "step": 18580 + }, + { + "epoch": 70.95419847328245, + "grad_norm": 0.34917864203453064, + "learning_rate": 3.4606821544506664e-06, + "loss": 0.0722, + "step": 18590 + }, + { + "epoch": 70.99236641221374, + "grad_norm": 0.2705357074737549, + "learning_rate": 3.4554392185928563e-06, + "loss": 0.0867, + "step": 18600 + }, + { + "epoch": 71.03053435114504, + "grad_norm": 0.27343451976776123, + "learning_rate": 3.450198159522037e-06, + "loss": 0.0635, + "step": 18610 + }, + { + "epoch": 71.06870229007633, + "grad_norm": 0.48020657896995544, + "learning_rate": 3.444958983606592e-06, + "loss": 0.0731, + "step": 18620 + }, + { + "epoch": 71.10687022900764, + "grad_norm": 0.25576311349868774, + "learning_rate": 3.4397216972126126e-06, + "loss": 0.0738, + "step": 18630 + }, + { + "epoch": 71.14503816793894, + "grad_norm": 0.3990323543548584, + "learning_rate": 3.434486306703896e-06, + "loss": 0.0861, + "step": 18640 + }, + { + "epoch": 71.18320610687023, + "grad_norm": 0.25410884618759155, + "learning_rate": 3.429252818441935e-06, + "loss": 0.0759, + "step": 18650 + }, + { + "epoch": 71.22137404580153, + "grad_norm": 0.2940288782119751, + "learning_rate": 3.4240212387859097e-06, + "loss": 0.0745, + "step": 18660 + }, + { + "epoch": 71.25954198473282, + "grad_norm": 0.5581989288330078, + "learning_rate": 3.4187915740926856e-06, + "loss": 0.0738, + "step": 18670 + }, + { + "epoch": 71.29770992366412, + "grad_norm": 0.3819045424461365, + "learning_rate": 3.4135638307167962e-06, + "loss": 0.0729, + "step": 18680 + }, + { + "epoch": 71.33587786259542, + "grad_norm": 0.34871965646743774, + "learning_rate": 3.408338015010445e-06, + "loss": 0.0729, + "step": 18690 + }, + { + "epoch": 71.37404580152672, + "grad_norm": 0.3207714557647705, + "learning_rate": 3.4031141333234895e-06, + "loss": 0.0706, + "step": 18700 + }, + { + "epoch": 71.41221374045801, + "grad_norm": 0.27491647005081177, + "learning_rate": 3.397892192003437e-06, + "loss": 0.0704, + "step": 18710 + }, + { + "epoch": 71.45038167938931, + "grad_norm": 0.3418438732624054, + "learning_rate": 3.392672197395441e-06, + "loss": 0.0685, + "step": 18720 + }, + { + "epoch": 71.4885496183206, + "grad_norm": 0.8138360381126404, + "learning_rate": 3.3874541558422874e-06, + "loss": 0.0704, + "step": 18730 + }, + { + "epoch": 71.52671755725191, + "grad_norm": 0.2727760672569275, + "learning_rate": 3.3822380736843865e-06, + "loss": 0.087, + "step": 18740 + }, + { + "epoch": 71.56488549618321, + "grad_norm": 0.5229569673538208, + "learning_rate": 3.3770239572597715e-06, + "loss": 0.0794, + "step": 18750 + }, + { + "epoch": 71.6030534351145, + "grad_norm": 0.3278786540031433, + "learning_rate": 3.3718118129040833e-06, + "loss": 0.0712, + "step": 18760 + }, + { + "epoch": 71.6412213740458, + "grad_norm": 0.2960107624530792, + "learning_rate": 3.3666016469505725e-06, + "loss": 0.0671, + "step": 18770 + }, + { + "epoch": 71.6793893129771, + "grad_norm": 0.5897856950759888, + "learning_rate": 3.3613934657300793e-06, + "loss": 0.0704, + "step": 18780 + }, + { + "epoch": 71.7175572519084, + "grad_norm": 0.30159324407577515, + "learning_rate": 3.356187275571037e-06, + "loss": 0.0683, + "step": 18790 + }, + { + "epoch": 71.7557251908397, + "grad_norm": 0.32948631048202515, + "learning_rate": 3.350983082799456e-06, + "loss": 0.0779, + "step": 18800 + }, + { + "epoch": 71.79389312977099, + "grad_norm": 0.3158397078514099, + "learning_rate": 3.34578089373892e-06, + "loss": 0.0723, + "step": 18810 + }, + { + "epoch": 71.83206106870229, + "grad_norm": 0.32471826672554016, + "learning_rate": 3.3405807147105814e-06, + "loss": 0.0667, + "step": 18820 + }, + { + "epoch": 71.87022900763358, + "grad_norm": 0.5005679130554199, + "learning_rate": 3.3353825520331466e-06, + "loss": 0.0684, + "step": 18830 + }, + { + "epoch": 71.90839694656489, + "grad_norm": 0.9290713667869568, + "learning_rate": 3.330186412022876e-06, + "loss": 0.0772, + "step": 18840 + }, + { + "epoch": 71.94656488549619, + "grad_norm": 0.9230539202690125, + "learning_rate": 3.324992300993568e-06, + "loss": 0.0702, + "step": 18850 + }, + { + "epoch": 71.98473282442748, + "grad_norm": 0.2002493143081665, + "learning_rate": 3.3198002252565564e-06, + "loss": 0.0724, + "step": 18860 + }, + { + "epoch": 72.02290076335878, + "grad_norm": 0.7362732887268066, + "learning_rate": 3.3146101911207024e-06, + "loss": 0.0693, + "step": 18870 + }, + { + "epoch": 72.06106870229007, + "grad_norm": 0.6342445611953735, + "learning_rate": 3.3094222048923895e-06, + "loss": 0.0692, + "step": 18880 + }, + { + "epoch": 72.09923664122137, + "grad_norm": 0.997917652130127, + "learning_rate": 3.3042362728755084e-06, + "loss": 0.0727, + "step": 18890 + }, + { + "epoch": 72.13740458015268, + "grad_norm": 0.3136437237262726, + "learning_rate": 3.2990524013714565e-06, + "loss": 0.0626, + "step": 18900 + }, + { + "epoch": 72.17557251908397, + "grad_norm": 0.5527306795120239, + "learning_rate": 3.293870596679125e-06, + "loss": 0.067, + "step": 18910 + }, + { + "epoch": 72.21374045801527, + "grad_norm": 0.3831974267959595, + "learning_rate": 3.288690865094895e-06, + "loss": 0.0655, + "step": 18920 + }, + { + "epoch": 72.25190839694656, + "grad_norm": 0.297125905752182, + "learning_rate": 3.283513212912632e-06, + "loss": 0.0747, + "step": 18930 + }, + { + "epoch": 72.29007633587786, + "grad_norm": 0.30272427201271057, + "learning_rate": 3.278337646423669e-06, + "loss": 0.0821, + "step": 18940 + }, + { + "epoch": 72.32824427480917, + "grad_norm": 0.4911503493785858, + "learning_rate": 3.273164171916806e-06, + "loss": 0.0715, + "step": 18950 + }, + { + "epoch": 72.36641221374046, + "grad_norm": 0.8124505281448364, + "learning_rate": 3.267992795678306e-06, + "loss": 0.0676, + "step": 18960 + }, + { + "epoch": 72.40458015267176, + "grad_norm": 0.2938520610332489, + "learning_rate": 3.262823523991875e-06, + "loss": 0.0754, + "step": 18970 + }, + { + "epoch": 72.44274809160305, + "grad_norm": 0.26553019881248474, + "learning_rate": 3.2576563631386694e-06, + "loss": 0.0733, + "step": 18980 + }, + { + "epoch": 72.48091603053435, + "grad_norm": 0.3607490062713623, + "learning_rate": 3.2524913193972747e-06, + "loss": 0.074, + "step": 18990 + }, + { + "epoch": 72.51908396946565, + "grad_norm": 0.5048254132270813, + "learning_rate": 3.247328399043706e-06, + "loss": 0.0707, + "step": 19000 + }, + { + "epoch": 72.55725190839695, + "grad_norm": 0.3066476583480835, + "learning_rate": 3.2421676083513987e-06, + "loss": 0.0728, + "step": 19010 + }, + { + "epoch": 72.59541984732824, + "grad_norm": 0.2978527843952179, + "learning_rate": 3.2370089535911988e-06, + "loss": 0.0721, + "step": 19020 + }, + { + "epoch": 72.63358778625954, + "grad_norm": 0.2315007746219635, + "learning_rate": 3.2318524410313602e-06, + "loss": 0.0722, + "step": 19030 + }, + { + "epoch": 72.67175572519083, + "grad_norm": 0.5974384546279907, + "learning_rate": 3.22669807693753e-06, + "loss": 0.0716, + "step": 19040 + }, + { + "epoch": 72.70992366412214, + "grad_norm": 0.2158258557319641, + "learning_rate": 3.2215458675727497e-06, + "loss": 0.0687, + "step": 19050 + }, + { + "epoch": 72.74809160305344, + "grad_norm": 0.23273561894893646, + "learning_rate": 3.2163958191974375e-06, + "loss": 0.0676, + "step": 19060 + }, + { + "epoch": 72.78625954198473, + "grad_norm": 0.9219455122947693, + "learning_rate": 3.211247938069387e-06, + "loss": 0.0708, + "step": 19070 + }, + { + "epoch": 72.82442748091603, + "grad_norm": 0.949112057685852, + "learning_rate": 3.2061022304437596e-06, + "loss": 0.0779, + "step": 19080 + }, + { + "epoch": 72.86259541984732, + "grad_norm": 0.3178614675998688, + "learning_rate": 3.2009587025730765e-06, + "loss": 0.0762, + "step": 19090 + }, + { + "epoch": 72.90076335877862, + "grad_norm": 0.3184674084186554, + "learning_rate": 3.1958173607072075e-06, + "loss": 0.071, + "step": 19100 + }, + { + "epoch": 72.93893129770993, + "grad_norm": 0.6181210875511169, + "learning_rate": 3.1906782110933698e-06, + "loss": 0.0842, + "step": 19110 + }, + { + "epoch": 72.97709923664122, + "grad_norm": 0.4641569256782532, + "learning_rate": 3.1855412599761137e-06, + "loss": 0.0761, + "step": 19120 + }, + { + "epoch": 73.01526717557252, + "grad_norm": 0.8154666423797607, + "learning_rate": 3.1804065135973165e-06, + "loss": 0.0765, + "step": 19130 + }, + { + "epoch": 73.05343511450381, + "grad_norm": 0.4169461131095886, + "learning_rate": 3.175273978196184e-06, + "loss": 0.0796, + "step": 19140 + }, + { + "epoch": 73.09160305343511, + "grad_norm": 0.3701060712337494, + "learning_rate": 3.1701436600092283e-06, + "loss": 0.0755, + "step": 19150 + }, + { + "epoch": 73.12977099236642, + "grad_norm": 0.49110186100006104, + "learning_rate": 3.16501556527027e-06, + "loss": 0.0661, + "step": 19160 + }, + { + "epoch": 73.16793893129771, + "grad_norm": 0.2956679165363312, + "learning_rate": 3.1598897002104266e-06, + "loss": 0.0703, + "step": 19170 + }, + { + "epoch": 73.20610687022901, + "grad_norm": 0.35779961943626404, + "learning_rate": 3.1547660710581087e-06, + "loss": 0.074, + "step": 19180 + }, + { + "epoch": 73.2442748091603, + "grad_norm": 0.3238672614097595, + "learning_rate": 3.149644684039008e-06, + "loss": 0.0714, + "step": 19190 + }, + { + "epoch": 73.2824427480916, + "grad_norm": 0.3601135015487671, + "learning_rate": 3.144525545376095e-06, + "loss": 0.0733, + "step": 19200 + }, + { + "epoch": 73.3206106870229, + "grad_norm": 0.3360394537448883, + "learning_rate": 3.1394086612896035e-06, + "loss": 0.0708, + "step": 19210 + }, + { + "epoch": 73.3587786259542, + "grad_norm": 0.6372358202934265, + "learning_rate": 3.1342940379970315e-06, + "loss": 0.0712, + "step": 19220 + }, + { + "epoch": 73.3969465648855, + "grad_norm": 0.44446098804473877, + "learning_rate": 3.129181681713127e-06, + "loss": 0.0729, + "step": 19230 + }, + { + "epoch": 73.43511450381679, + "grad_norm": 0.2544264793395996, + "learning_rate": 3.1240715986498856e-06, + "loss": 0.0775, + "step": 19240 + }, + { + "epoch": 73.47328244274809, + "grad_norm": 0.30455854535102844, + "learning_rate": 3.1189637950165398e-06, + "loss": 0.0663, + "step": 19250 + }, + { + "epoch": 73.5114503816794, + "grad_norm": 0.28932657837867737, + "learning_rate": 3.1138582770195547e-06, + "loss": 0.0689, + "step": 19260 + }, + { + "epoch": 73.54961832061069, + "grad_norm": 0.40639564394950867, + "learning_rate": 3.1087550508626145e-06, + "loss": 0.0699, + "step": 19270 + }, + { + "epoch": 73.58778625954199, + "grad_norm": 0.3635731041431427, + "learning_rate": 3.1036541227466204e-06, + "loss": 0.0744, + "step": 19280 + }, + { + "epoch": 73.62595419847328, + "grad_norm": 0.25109219551086426, + "learning_rate": 3.098555498869679e-06, + "loss": 0.0765, + "step": 19290 + }, + { + "epoch": 73.66412213740458, + "grad_norm": 0.3934338390827179, + "learning_rate": 3.093459185427102e-06, + "loss": 0.0739, + "step": 19300 + }, + { + "epoch": 73.70229007633588, + "grad_norm": 0.2998042106628418, + "learning_rate": 3.088365188611391e-06, + "loss": 0.0724, + "step": 19310 + }, + { + "epoch": 73.74045801526718, + "grad_norm": 0.36657044291496277, + "learning_rate": 3.0832735146122295e-06, + "loss": 0.0676, + "step": 19320 + }, + { + "epoch": 73.77862595419847, + "grad_norm": 0.6263729929924011, + "learning_rate": 3.078184169616485e-06, + "loss": 0.0705, + "step": 19330 + }, + { + "epoch": 73.81679389312977, + "grad_norm": 0.6651374101638794, + "learning_rate": 3.073097159808187e-06, + "loss": 0.0761, + "step": 19340 + }, + { + "epoch": 73.85496183206106, + "grad_norm": 0.33448323607444763, + "learning_rate": 3.068012491368537e-06, + "loss": 0.0698, + "step": 19350 + }, + { + "epoch": 73.89312977099236, + "grad_norm": 0.2528843879699707, + "learning_rate": 3.0629301704758846e-06, + "loss": 0.0692, + "step": 19360 + }, + { + "epoch": 73.93129770992367, + "grad_norm": 0.36234578490257263, + "learning_rate": 3.0578502033057288e-06, + "loss": 0.0729, + "step": 19370 + }, + { + "epoch": 73.96946564885496, + "grad_norm": 0.2573143243789673, + "learning_rate": 3.0527725960307083e-06, + "loss": 0.0715, + "step": 19380 + }, + { + "epoch": 74.00763358778626, + "grad_norm": 0.3500208556652069, + "learning_rate": 3.0476973548205945e-06, + "loss": 0.0688, + "step": 19390 + }, + { + "epoch": 74.04580152671755, + "grad_norm": 0.3336540460586548, + "learning_rate": 3.042624485842285e-06, + "loss": 0.0645, + "step": 19400 + }, + { + "epoch": 74.08396946564885, + "grad_norm": 0.2523120641708374, + "learning_rate": 3.0375539952597943e-06, + "loss": 0.0718, + "step": 19410 + }, + { + "epoch": 74.12213740458016, + "grad_norm": 0.38191351294517517, + "learning_rate": 3.0324858892342467e-06, + "loss": 0.0698, + "step": 19420 + }, + { + "epoch": 74.16030534351145, + "grad_norm": 0.2716592252254486, + "learning_rate": 3.027420173923867e-06, + "loss": 0.0686, + "step": 19430 + }, + { + "epoch": 74.19847328244275, + "grad_norm": 0.4997287094593048, + "learning_rate": 3.022356855483979e-06, + "loss": 0.0691, + "step": 19440 + }, + { + "epoch": 74.23664122137404, + "grad_norm": 0.2641426622867584, + "learning_rate": 3.017295940066989e-06, + "loss": 0.0725, + "step": 19450 + }, + { + "epoch": 74.27480916030534, + "grad_norm": 0.23110733926296234, + "learning_rate": 3.0122374338223905e-06, + "loss": 0.0732, + "step": 19460 + }, + { + "epoch": 74.31297709923665, + "grad_norm": 0.31251829862594604, + "learning_rate": 3.007181342896743e-06, + "loss": 0.068, + "step": 19470 + }, + { + "epoch": 74.35114503816794, + "grad_norm": 0.39099785685539246, + "learning_rate": 3.0021276734336744e-06, + "loss": 0.0712, + "step": 19480 + }, + { + "epoch": 74.38931297709924, + "grad_norm": 0.26635631918907166, + "learning_rate": 2.997076431573871e-06, + "loss": 0.0722, + "step": 19490 + }, + { + "epoch": 74.42748091603053, + "grad_norm": 0.3487379848957062, + "learning_rate": 2.9920276234550636e-06, + "loss": 0.0711, + "step": 19500 + }, + { + "epoch": 74.46564885496183, + "grad_norm": 0.33060070872306824, + "learning_rate": 2.9869812552120355e-06, + "loss": 0.0738, + "step": 19510 + }, + { + "epoch": 74.50381679389314, + "grad_norm": 0.3587673604488373, + "learning_rate": 2.9819373329765977e-06, + "loss": 0.0679, + "step": 19520 + }, + { + "epoch": 74.54198473282443, + "grad_norm": 0.33571016788482666, + "learning_rate": 2.97689586287759e-06, + "loss": 0.0735, + "step": 19530 + }, + { + "epoch": 74.58015267175573, + "grad_norm": 0.5554803609848022, + "learning_rate": 2.9718568510408763e-06, + "loss": 0.0672, + "step": 19540 + }, + { + "epoch": 74.61832061068702, + "grad_norm": 0.35926058888435364, + "learning_rate": 2.966820303589327e-06, + "loss": 0.0747, + "step": 19550 + }, + { + "epoch": 74.65648854961832, + "grad_norm": 0.2982501685619354, + "learning_rate": 2.961786226642829e-06, + "loss": 0.0716, + "step": 19560 + }, + { + "epoch": 74.69465648854961, + "grad_norm": 0.35031527280807495, + "learning_rate": 2.9567546263182554e-06, + "loss": 0.0742, + "step": 19570 + }, + { + "epoch": 74.73282442748092, + "grad_norm": 0.6128705143928528, + "learning_rate": 2.951725508729476e-06, + "loss": 0.0731, + "step": 19580 + }, + { + "epoch": 74.77099236641222, + "grad_norm": 0.2246423214673996, + "learning_rate": 2.9466988799873443e-06, + "loss": 0.0675, + "step": 19590 + }, + { + "epoch": 74.80916030534351, + "grad_norm": 0.2989789843559265, + "learning_rate": 2.9416747461996853e-06, + "loss": 0.0762, + "step": 19600 + }, + { + "epoch": 74.8473282442748, + "grad_norm": 0.34754475951194763, + "learning_rate": 2.9366531134712974e-06, + "loss": 0.0699, + "step": 19610 + }, + { + "epoch": 74.8854961832061, + "grad_norm": 0.37204280495643616, + "learning_rate": 2.931633987903937e-06, + "loss": 0.0711, + "step": 19620 + }, + { + "epoch": 74.92366412213741, + "grad_norm": 0.5289718508720398, + "learning_rate": 2.926617375596317e-06, + "loss": 0.0818, + "step": 19630 + }, + { + "epoch": 74.9618320610687, + "grad_norm": 0.3462826907634735, + "learning_rate": 2.9216032826440927e-06, + "loss": 0.0702, + "step": 19640 + }, + { + "epoch": 75.0, + "grad_norm": 0.29999420046806335, + "learning_rate": 2.9165917151398594e-06, + "loss": 0.072, + "step": 19650 + }, + { + "epoch": 75.0381679389313, + "grad_norm": 0.28326013684272766, + "learning_rate": 2.9115826791731426e-06, + "loss": 0.0695, + "step": 19660 + }, + { + "epoch": 75.07633587786259, + "grad_norm": 0.7487019300460815, + "learning_rate": 2.9065761808303983e-06, + "loss": 0.0744, + "step": 19670 + }, + { + "epoch": 75.1145038167939, + "grad_norm": 0.24250933527946472, + "learning_rate": 2.9015722261949918e-06, + "loss": 0.0716, + "step": 19680 + }, + { + "epoch": 75.1526717557252, + "grad_norm": 0.272839218378067, + "learning_rate": 2.8965708213471987e-06, + "loss": 0.0738, + "step": 19690 + }, + { + "epoch": 75.19083969465649, + "grad_norm": 0.26355302333831787, + "learning_rate": 2.891571972364198e-06, + "loss": 0.0717, + "step": 19700 + }, + { + "epoch": 75.22900763358778, + "grad_norm": 0.28697460889816284, + "learning_rate": 2.8865756853200605e-06, + "loss": 0.0708, + "step": 19710 + }, + { + "epoch": 75.26717557251908, + "grad_norm": 0.5366164445877075, + "learning_rate": 2.8815819662857505e-06, + "loss": 0.0696, + "step": 19720 + }, + { + "epoch": 75.30534351145039, + "grad_norm": 0.5985289812088013, + "learning_rate": 2.876590821329105e-06, + "loss": 0.0767, + "step": 19730 + }, + { + "epoch": 75.34351145038168, + "grad_norm": 1.137639045715332, + "learning_rate": 2.8716022565148362e-06, + "loss": 0.0733, + "step": 19740 + }, + { + "epoch": 75.38167938931298, + "grad_norm": 0.27952316403388977, + "learning_rate": 2.8666162779045205e-06, + "loss": 0.0733, + "step": 19750 + }, + { + "epoch": 75.41984732824427, + "grad_norm": 1.1592620611190796, + "learning_rate": 2.8616328915565907e-06, + "loss": 0.0744, + "step": 19760 + }, + { + "epoch": 75.45801526717557, + "grad_norm": 0.5299224257469177, + "learning_rate": 2.856652103526334e-06, + "loss": 0.0706, + "step": 19770 + }, + { + "epoch": 75.49618320610686, + "grad_norm": 0.2791510224342346, + "learning_rate": 2.8516739198658753e-06, + "loss": 0.0666, + "step": 19780 + }, + { + "epoch": 75.53435114503817, + "grad_norm": 0.32924363017082214, + "learning_rate": 2.8466983466241772e-06, + "loss": 0.0777, + "step": 19790 + }, + { + "epoch": 75.57251908396947, + "grad_norm": 0.4635617434978485, + "learning_rate": 2.841725389847032e-06, + "loss": 0.0685, + "step": 19800 + }, + { + "epoch": 75.61068702290076, + "grad_norm": 0.42956969141960144, + "learning_rate": 2.8367550555770507e-06, + "loss": 0.0747, + "step": 19810 + }, + { + "epoch": 75.64885496183206, + "grad_norm": 0.3257053792476654, + "learning_rate": 2.8317873498536554e-06, + "loss": 0.0705, + "step": 19820 + }, + { + "epoch": 75.68702290076335, + "grad_norm": 0.3546193540096283, + "learning_rate": 2.8268222787130805e-06, + "loss": 0.0747, + "step": 19830 + }, + { + "epoch": 75.72519083969466, + "grad_norm": 0.23434428870677948, + "learning_rate": 2.8218598481883552e-06, + "loss": 0.0728, + "step": 19840 + }, + { + "epoch": 75.76335877862596, + "grad_norm": 0.3509843647480011, + "learning_rate": 2.816900064309299e-06, + "loss": 0.0685, + "step": 19850 + }, + { + "epoch": 75.80152671755725, + "grad_norm": 0.350881963968277, + "learning_rate": 2.811942933102517e-06, + "loss": 0.0668, + "step": 19860 + }, + { + "epoch": 75.83969465648855, + "grad_norm": 0.2302703559398651, + "learning_rate": 2.8069884605913912e-06, + "loss": 0.0679, + "step": 19870 + }, + { + "epoch": 75.87786259541984, + "grad_norm": 0.39022761583328247, + "learning_rate": 2.802036652796074e-06, + "loss": 0.0718, + "step": 19880 + }, + { + "epoch": 75.91603053435115, + "grad_norm": 0.536963939666748, + "learning_rate": 2.797087515733478e-06, + "loss": 0.0688, + "step": 19890 + }, + { + "epoch": 75.95419847328245, + "grad_norm": 0.33239343762397766, + "learning_rate": 2.7921410554172724e-06, + "loss": 0.0714, + "step": 19900 + }, + { + "epoch": 75.99236641221374, + "grad_norm": 0.2767488658428192, + "learning_rate": 2.787197277857871e-06, + "loss": 0.0702, + "step": 19910 + }, + { + "epoch": 76.03053435114504, + "grad_norm": 0.30666476488113403, + "learning_rate": 2.7822561890624287e-06, + "loss": 0.0674, + "step": 19920 + }, + { + "epoch": 76.06870229007633, + "grad_norm": 0.2637414038181305, + "learning_rate": 2.777317795034839e-06, + "loss": 0.0684, + "step": 19930 + }, + { + "epoch": 76.10687022900764, + "grad_norm": 0.23923492431640625, + "learning_rate": 2.772382101775711e-06, + "loss": 0.0719, + "step": 19940 + }, + { + "epoch": 76.14503816793894, + "grad_norm": 0.8826484084129333, + "learning_rate": 2.7674491152823825e-06, + "loss": 0.0727, + "step": 19950 + }, + { + "epoch": 76.18320610687023, + "grad_norm": 0.6208845973014832, + "learning_rate": 2.7625188415488946e-06, + "loss": 0.0715, + "step": 19960 + }, + { + "epoch": 76.22137404580153, + "grad_norm": 0.3055213987827301, + "learning_rate": 2.7575912865659925e-06, + "loss": 0.0719, + "step": 19970 + }, + { + "epoch": 76.25954198473282, + "grad_norm": 0.3592512011528015, + "learning_rate": 2.752666456321125e-06, + "loss": 0.0659, + "step": 19980 + }, + { + "epoch": 76.29770992366412, + "grad_norm": 0.38851675391197205, + "learning_rate": 2.7477443567984225e-06, + "loss": 0.0701, + "step": 19990 + }, + { + "epoch": 76.33587786259542, + "grad_norm": 0.6826211810112, + "learning_rate": 2.7428249939787e-06, + "loss": 0.0745, + "step": 20000 + }, + { + "epoch": 76.37404580152672, + "grad_norm": 0.26521873474121094, + "learning_rate": 2.7379083738394485e-06, + "loss": 0.0772, + "step": 20010 + }, + { + "epoch": 76.41221374045801, + "grad_norm": 0.5465177893638611, + "learning_rate": 2.732994502354823e-06, + "loss": 0.0699, + "step": 20020 + }, + { + "epoch": 76.45038167938931, + "grad_norm": 0.22223325073719025, + "learning_rate": 2.72808338549564e-06, + "loss": 0.0703, + "step": 20030 + }, + { + "epoch": 76.4885496183206, + "grad_norm": 0.3529285788536072, + "learning_rate": 2.723175029229374e-06, + "loss": 0.0709, + "step": 20040 + }, + { + "epoch": 76.52671755725191, + "grad_norm": 0.686523973941803, + "learning_rate": 2.718269439520138e-06, + "loss": 0.0706, + "step": 20050 + }, + { + "epoch": 76.56488549618321, + "grad_norm": 0.31421926617622375, + "learning_rate": 2.713366622328686e-06, + "loss": 0.0698, + "step": 20060 + }, + { + "epoch": 76.6030534351145, + "grad_norm": 0.5857685804367065, + "learning_rate": 2.7084665836124006e-06, + "loss": 0.0771, + "step": 20070 + }, + { + "epoch": 76.6412213740458, + "grad_norm": 0.3716685473918915, + "learning_rate": 2.703569329325296e-06, + "loss": 0.0697, + "step": 20080 + }, + { + "epoch": 76.6793893129771, + "grad_norm": 0.3061921298503876, + "learning_rate": 2.698674865417994e-06, + "loss": 0.0723, + "step": 20090 + }, + { + "epoch": 76.7175572519084, + "grad_norm": 0.23903951048851013, + "learning_rate": 2.693783197837733e-06, + "loss": 0.0714, + "step": 20100 + }, + { + "epoch": 76.7557251908397, + "grad_norm": 0.22559994459152222, + "learning_rate": 2.6888943325283482e-06, + "loss": 0.0774, + "step": 20110 + }, + { + "epoch": 76.79389312977099, + "grad_norm": 0.48428580164909363, + "learning_rate": 2.6840082754302734e-06, + "loss": 0.0686, + "step": 20120 + }, + { + "epoch": 76.83206106870229, + "grad_norm": 0.3806080222129822, + "learning_rate": 2.6791250324805252e-06, + "loss": 0.0689, + "step": 20130 + }, + { + "epoch": 76.87022900763358, + "grad_norm": 0.49289458990097046, + "learning_rate": 2.6742446096127086e-06, + "loss": 0.068, + "step": 20140 + }, + { + "epoch": 76.90839694656489, + "grad_norm": 0.5932338833808899, + "learning_rate": 2.669367012756996e-06, + "loss": 0.068, + "step": 20150 + }, + { + "epoch": 76.94656488549619, + "grad_norm": 0.2798921465873718, + "learning_rate": 2.664492247840127e-06, + "loss": 0.0731, + "step": 20160 + }, + { + "epoch": 76.98473282442748, + "grad_norm": 0.25367864966392517, + "learning_rate": 2.6596203207854006e-06, + "loss": 0.0678, + "step": 20170 + }, + { + "epoch": 77.02290076335878, + "grad_norm": 0.28277409076690674, + "learning_rate": 2.654751237512666e-06, + "loss": 0.0713, + "step": 20180 + }, + { + "epoch": 77.06106870229007, + "grad_norm": 0.3937804102897644, + "learning_rate": 2.649885003938323e-06, + "loss": 0.0698, + "step": 20190 + }, + { + "epoch": 77.09923664122137, + "grad_norm": 0.35799962282180786, + "learning_rate": 2.6450216259753005e-06, + "loss": 0.0761, + "step": 20200 + }, + { + "epoch": 77.13740458015268, + "grad_norm": 0.26287925243377686, + "learning_rate": 2.6401611095330632e-06, + "loss": 0.0654, + "step": 20210 + }, + { + "epoch": 77.17557251908397, + "grad_norm": 0.46736228466033936, + "learning_rate": 2.6353034605175937e-06, + "loss": 0.0715, + "step": 20220 + }, + { + "epoch": 77.21374045801527, + "grad_norm": 0.46314504742622375, + "learning_rate": 2.6304486848313982e-06, + "loss": 0.0633, + "step": 20230 + }, + { + "epoch": 77.25190839694656, + "grad_norm": 0.5143646597862244, + "learning_rate": 2.6255967883734823e-06, + "loss": 0.0706, + "step": 20240 + }, + { + "epoch": 77.29007633587786, + "grad_norm": 0.2608904242515564, + "learning_rate": 2.620747777039363e-06, + "loss": 0.068, + "step": 20250 + }, + { + "epoch": 77.32824427480917, + "grad_norm": 0.8567964434623718, + "learning_rate": 2.6159016567210426e-06, + "loss": 0.086, + "step": 20260 + }, + { + "epoch": 77.36641221374046, + "grad_norm": 0.3883376717567444, + "learning_rate": 2.6110584333070153e-06, + "loss": 0.075, + "step": 20270 + }, + { + "epoch": 77.40458015267176, + "grad_norm": 0.27981120347976685, + "learning_rate": 2.606218112682254e-06, + "loss": 0.0666, + "step": 20280 + }, + { + "epoch": 77.44274809160305, + "grad_norm": 0.2423625886440277, + "learning_rate": 2.601380700728203e-06, + "loss": 0.0745, + "step": 20290 + }, + { + "epoch": 77.48091603053435, + "grad_norm": 0.23799222707748413, + "learning_rate": 2.596546203322777e-06, + "loss": 0.0709, + "step": 20300 + }, + { + "epoch": 77.51908396946565, + "grad_norm": 0.2332644760608673, + "learning_rate": 2.591714626340346e-06, + "loss": 0.0746, + "step": 20310 + }, + { + "epoch": 77.55725190839695, + "grad_norm": 0.29361745715141296, + "learning_rate": 2.5868859756517294e-06, + "loss": 0.0765, + "step": 20320 + }, + { + "epoch": 77.59541984732824, + "grad_norm": 0.28440865874290466, + "learning_rate": 2.582060257124195e-06, + "loss": 0.068, + "step": 20330 + }, + { + "epoch": 77.63358778625954, + "grad_norm": 0.3904019296169281, + "learning_rate": 2.577237476621442e-06, + "loss": 0.072, + "step": 20340 + }, + { + "epoch": 77.67175572519083, + "grad_norm": 0.833919882774353, + "learning_rate": 2.5724176400036094e-06, + "loss": 0.0713, + "step": 20350 + }, + { + "epoch": 77.70992366412214, + "grad_norm": 0.2217206060886383, + "learning_rate": 2.5676007531272475e-06, + "loss": 0.0812, + "step": 20360 + }, + { + "epoch": 77.74809160305344, + "grad_norm": 0.307720810174942, + "learning_rate": 2.562786821845333e-06, + "loss": 0.072, + "step": 20370 + }, + { + "epoch": 77.78625954198473, + "grad_norm": 0.28931522369384766, + "learning_rate": 2.5579758520072446e-06, + "loss": 0.0757, + "step": 20380 + }, + { + "epoch": 77.82442748091603, + "grad_norm": 0.39654994010925293, + "learning_rate": 2.5531678494587612e-06, + "loss": 0.0693, + "step": 20390 + }, + { + "epoch": 77.86259541984732, + "grad_norm": 0.55362868309021, + "learning_rate": 2.5483628200420648e-06, + "loss": 0.07, + "step": 20400 + }, + { + "epoch": 77.90076335877862, + "grad_norm": 0.5507999658584595, + "learning_rate": 2.5435607695957153e-06, + "loss": 0.0727, + "step": 20410 + }, + { + "epoch": 77.93893129770993, + "grad_norm": 0.36570632457733154, + "learning_rate": 2.5387617039546585e-06, + "loss": 0.0741, + "step": 20420 + }, + { + "epoch": 77.97709923664122, + "grad_norm": 0.589455246925354, + "learning_rate": 2.5339656289502105e-06, + "loss": 0.0753, + "step": 20430 + }, + { + "epoch": 78.01526717557252, + "grad_norm": 0.40846219658851624, + "learning_rate": 2.5291725504100563e-06, + "loss": 0.0695, + "step": 20440 + }, + { + "epoch": 78.05343511450381, + "grad_norm": 0.28900301456451416, + "learning_rate": 2.524382474158234e-06, + "loss": 0.0691, + "step": 20450 + }, + { + "epoch": 78.09160305343511, + "grad_norm": 0.33728376030921936, + "learning_rate": 2.5195954060151433e-06, + "loss": 0.0755, + "step": 20460 + }, + { + "epoch": 78.12977099236642, + "grad_norm": 0.25033625960350037, + "learning_rate": 2.5148113517975216e-06, + "loss": 0.0724, + "step": 20470 + }, + { + "epoch": 78.16793893129771, + "grad_norm": 0.8060714602470398, + "learning_rate": 2.510030317318445e-06, + "loss": 0.0751, + "step": 20480 + }, + { + "epoch": 78.20610687022901, + "grad_norm": 0.5355228781700134, + "learning_rate": 2.50525230838732e-06, + "loss": 0.0735, + "step": 20490 + }, + { + "epoch": 78.2442748091603, + "grad_norm": 0.28783127665519714, + "learning_rate": 2.5004773308098814e-06, + "loss": 0.0664, + "step": 20500 + }, + { + "epoch": 78.2824427480916, + "grad_norm": 0.32949912548065186, + "learning_rate": 2.4957053903881736e-06, + "loss": 0.0654, + "step": 20510 + }, + { + "epoch": 78.3206106870229, + "grad_norm": 0.40475648641586304, + "learning_rate": 2.4909364929205575e-06, + "loss": 0.0726, + "step": 20520 + }, + { + "epoch": 78.3587786259542, + "grad_norm": 0.34688708186149597, + "learning_rate": 2.4861706442016923e-06, + "loss": 0.0671, + "step": 20530 + }, + { + "epoch": 78.3969465648855, + "grad_norm": 0.38922348618507385, + "learning_rate": 2.481407850022533e-06, + "loss": 0.076, + "step": 20540 + }, + { + "epoch": 78.43511450381679, + "grad_norm": 0.25327450037002563, + "learning_rate": 2.4766481161703216e-06, + "loss": 0.0729, + "step": 20550 + }, + { + "epoch": 78.47328244274809, + "grad_norm": 0.36593905091285706, + "learning_rate": 2.4718914484285876e-06, + "loss": 0.0795, + "step": 20560 + }, + { + "epoch": 78.5114503816794, + "grad_norm": 0.34967660903930664, + "learning_rate": 2.467137852577129e-06, + "loss": 0.0787, + "step": 20570 + }, + { + "epoch": 78.54961832061069, + "grad_norm": 0.272087424993515, + "learning_rate": 2.4623873343920123e-06, + "loss": 0.0722, + "step": 20580 + }, + { + "epoch": 78.58778625954199, + "grad_norm": 0.2819080352783203, + "learning_rate": 2.4576398996455657e-06, + "loss": 0.0631, + "step": 20590 + }, + { + "epoch": 78.62595419847328, + "grad_norm": 0.8356397151947021, + "learning_rate": 2.4528955541063683e-06, + "loss": 0.0724, + "step": 20600 + }, + { + "epoch": 78.66412213740458, + "grad_norm": 0.8203865885734558, + "learning_rate": 2.448154303539251e-06, + "loss": 0.0673, + "step": 20610 + }, + { + "epoch": 78.70229007633588, + "grad_norm": 1.2636505365371704, + "learning_rate": 2.4434161537052776e-06, + "loss": 0.0779, + "step": 20620 + }, + { + "epoch": 78.74045801526718, + "grad_norm": 0.42922544479370117, + "learning_rate": 2.4386811103617474e-06, + "loss": 0.0714, + "step": 20630 + }, + { + "epoch": 78.77862595419847, + "grad_norm": 0.4266103208065033, + "learning_rate": 2.4339491792621833e-06, + "loss": 0.0698, + "step": 20640 + }, + { + "epoch": 78.81679389312977, + "grad_norm": 0.2615494132041931, + "learning_rate": 2.4292203661563313e-06, + "loss": 0.0747, + "step": 20650 + }, + { + "epoch": 78.85496183206106, + "grad_norm": 0.2261299043893814, + "learning_rate": 2.424494676790141e-06, + "loss": 0.0723, + "step": 20660 + }, + { + "epoch": 78.89312977099236, + "grad_norm": 0.2378481775522232, + "learning_rate": 2.419772116905775e-06, + "loss": 0.0674, + "step": 20670 + }, + { + "epoch": 78.93129770992367, + "grad_norm": 0.27929824590682983, + "learning_rate": 2.4150526922415855e-06, + "loss": 0.0772, + "step": 20680 + }, + { + "epoch": 78.96946564885496, + "grad_norm": 0.3526342511177063, + "learning_rate": 2.41033640853212e-06, + "loss": 0.0782, + "step": 20690 + }, + { + "epoch": 79.00763358778626, + "grad_norm": 0.3187890350818634, + "learning_rate": 2.405623271508108e-06, + "loss": 0.0697, + "step": 20700 + }, + { + "epoch": 79.04580152671755, + "grad_norm": 0.38696810603141785, + "learning_rate": 2.4009132868964525e-06, + "loss": 0.0689, + "step": 20710 + }, + { + "epoch": 79.08396946564885, + "grad_norm": 0.2635570466518402, + "learning_rate": 2.3962064604202327e-06, + "loss": 0.0675, + "step": 20720 + }, + { + "epoch": 79.12213740458016, + "grad_norm": 0.36084693670272827, + "learning_rate": 2.391502797798686e-06, + "loss": 0.0697, + "step": 20730 + }, + { + "epoch": 79.16030534351145, + "grad_norm": 0.38881757855415344, + "learning_rate": 2.386802304747205e-06, + "loss": 0.0666, + "step": 20740 + }, + { + "epoch": 79.19847328244275, + "grad_norm": 0.4907824397087097, + "learning_rate": 2.382104986977332e-06, + "loss": 0.0668, + "step": 20750 + }, + { + "epoch": 79.23664122137404, + "grad_norm": 0.2782943844795227, + "learning_rate": 2.3774108501967493e-06, + "loss": 0.0716, + "step": 20760 + }, + { + "epoch": 79.27480916030534, + "grad_norm": 0.5779755711555481, + "learning_rate": 2.37271990010928e-06, + "loss": 0.073, + "step": 20770 + }, + { + "epoch": 79.31297709923665, + "grad_norm": 0.3922247886657715, + "learning_rate": 2.3680321424148678e-06, + "loss": 0.07, + "step": 20780 + }, + { + "epoch": 79.35114503816794, + "grad_norm": 0.3388228416442871, + "learning_rate": 2.363347582809579e-06, + "loss": 0.0672, + "step": 20790 + }, + { + "epoch": 79.38931297709924, + "grad_norm": 0.20889022946357727, + "learning_rate": 2.358666226985599e-06, + "loss": 0.0699, + "step": 20800 + }, + { + "epoch": 79.42748091603053, + "grad_norm": 0.5814685225486755, + "learning_rate": 2.3539880806312134e-06, + "loss": 0.0714, + "step": 20810 + }, + { + "epoch": 79.46564885496183, + "grad_norm": 0.2144891619682312, + "learning_rate": 2.349313149430814e-06, + "loss": 0.0667, + "step": 20820 + }, + { + "epoch": 79.50381679389314, + "grad_norm": 0.6816794276237488, + "learning_rate": 2.3446414390648815e-06, + "loss": 0.0745, + "step": 20830 + }, + { + "epoch": 79.54198473282443, + "grad_norm": 0.32124826312065125, + "learning_rate": 2.3399729552099844e-06, + "loss": 0.0716, + "step": 20840 + }, + { + "epoch": 79.58015267175573, + "grad_norm": 0.22228087484836578, + "learning_rate": 2.335307703538771e-06, + "loss": 0.0693, + "step": 20850 + }, + { + "epoch": 79.61832061068702, + "grad_norm": 0.29664310812950134, + "learning_rate": 2.330645689719962e-06, + "loss": 0.0702, + "step": 20860 + }, + { + "epoch": 79.65648854961832, + "grad_norm": 0.35011011362075806, + "learning_rate": 2.3259869194183415e-06, + "loss": 0.0662, + "step": 20870 + }, + { + "epoch": 79.69465648854961, + "grad_norm": 0.5317111015319824, + "learning_rate": 2.321331398294759e-06, + "loss": 0.0675, + "step": 20880 + }, + { + "epoch": 79.73282442748092, + "grad_norm": 0.7479346990585327, + "learning_rate": 2.3166791320061095e-06, + "loss": 0.0699, + "step": 20890 + }, + { + "epoch": 79.77099236641222, + "grad_norm": 0.25332704186439514, + "learning_rate": 2.312030126205335e-06, + "loss": 0.0709, + "step": 20900 + }, + { + "epoch": 79.80916030534351, + "grad_norm": 0.5418967604637146, + "learning_rate": 2.3073843865414163e-06, + "loss": 0.0714, + "step": 20910 + }, + { + "epoch": 79.8473282442748, + "grad_norm": 0.5316504240036011, + "learning_rate": 2.302741918659363e-06, + "loss": 0.0735, + "step": 20920 + }, + { + "epoch": 79.8854961832061, + "grad_norm": 0.31350257992744446, + "learning_rate": 2.2981027282002155e-06, + "loss": 0.0714, + "step": 20930 + }, + { + "epoch": 79.92366412213741, + "grad_norm": 0.47436007857322693, + "learning_rate": 2.2934668208010235e-06, + "loss": 0.071, + "step": 20940 + }, + { + "epoch": 79.9618320610687, + "grad_norm": 0.28028592467308044, + "learning_rate": 2.2888342020948556e-06, + "loss": 0.0701, + "step": 20950 + }, + { + "epoch": 80.0, + "grad_norm": 0.25115305185317993, + "learning_rate": 2.2842048777107783e-06, + "loss": 0.0673, + "step": 20960 + }, + { + "epoch": 80.0381679389313, + "grad_norm": 0.22422641515731812, + "learning_rate": 2.2795788532738555e-06, + "loss": 0.0737, + "step": 20970 + }, + { + "epoch": 80.07633587786259, + "grad_norm": 0.31794339418411255, + "learning_rate": 2.274956134405147e-06, + "loss": 0.0674, + "step": 20980 + }, + { + "epoch": 80.1145038167939, + "grad_norm": 0.38259294629096985, + "learning_rate": 2.2703367267216896e-06, + "loss": 0.0634, + "step": 20990 + }, + { + "epoch": 80.1526717557252, + "grad_norm": 0.22496770322322845, + "learning_rate": 2.2657206358365e-06, + "loss": 0.0683, + "step": 21000 + }, + { + "epoch": 80.19083969465649, + "grad_norm": 0.24763353168964386, + "learning_rate": 2.261107867358563e-06, + "loss": 0.0732, + "step": 21010 + }, + { + "epoch": 80.22900763358778, + "grad_norm": 0.31791627407073975, + "learning_rate": 2.2564984268928264e-06, + "loss": 0.0807, + "step": 21020 + }, + { + "epoch": 80.26717557251908, + "grad_norm": 0.2604450583457947, + "learning_rate": 2.251892320040198e-06, + "loss": 0.069, + "step": 21030 + }, + { + "epoch": 80.30534351145039, + "grad_norm": 0.21588589251041412, + "learning_rate": 2.2472895523975315e-06, + "loss": 0.0731, + "step": 21040 + }, + { + "epoch": 80.34351145038168, + "grad_norm": 0.8700875043869019, + "learning_rate": 2.2426901295576215e-06, + "loss": 0.0801, + "step": 21050 + }, + { + "epoch": 80.38167938931298, + "grad_norm": 0.2903733551502228, + "learning_rate": 2.2380940571092013e-06, + "loss": 0.0656, + "step": 21060 + }, + { + "epoch": 80.41984732824427, + "grad_norm": 0.41912519931793213, + "learning_rate": 2.2335013406369303e-06, + "loss": 0.0693, + "step": 21070 + }, + { + "epoch": 80.45801526717557, + "grad_norm": 0.5005356669425964, + "learning_rate": 2.228911985721397e-06, + "loss": 0.0676, + "step": 21080 + }, + { + "epoch": 80.49618320610686, + "grad_norm": 0.39985841512680054, + "learning_rate": 2.224325997939095e-06, + "loss": 0.0703, + "step": 21090 + }, + { + "epoch": 80.53435114503817, + "grad_norm": 0.3365218937397003, + "learning_rate": 2.2197433828624372e-06, + "loss": 0.0669, + "step": 21100 + }, + { + "epoch": 80.57251908396947, + "grad_norm": 0.2456941455602646, + "learning_rate": 2.2151641460597295e-06, + "loss": 0.0694, + "step": 21110 + }, + { + "epoch": 80.61068702290076, + "grad_norm": 0.3186664879322052, + "learning_rate": 2.210588293095177e-06, + "loss": 0.0626, + "step": 21120 + }, + { + "epoch": 80.64885496183206, + "grad_norm": 0.3170889616012573, + "learning_rate": 2.2060158295288716e-06, + "loss": 0.0628, + "step": 21130 + }, + { + "epoch": 80.68702290076335, + "grad_norm": 0.49212417006492615, + "learning_rate": 2.2014467609167904e-06, + "loss": 0.07, + "step": 21140 + }, + { + "epoch": 80.72519083969466, + "grad_norm": 0.35683688521385193, + "learning_rate": 2.196881092810781e-06, + "loss": 0.0707, + "step": 21150 + }, + { + "epoch": 80.76335877862596, + "grad_norm": 0.20278224349021912, + "learning_rate": 2.192318830758561e-06, + "loss": 0.0693, + "step": 21160 + }, + { + "epoch": 80.80152671755725, + "grad_norm": 0.27485206723213196, + "learning_rate": 2.187759980303708e-06, + "loss": 0.0648, + "step": 21170 + }, + { + "epoch": 80.83969465648855, + "grad_norm": 0.3982328474521637, + "learning_rate": 2.1832045469856544e-06, + "loss": 0.0679, + "step": 21180 + }, + { + "epoch": 80.87786259541984, + "grad_norm": 0.7986206412315369, + "learning_rate": 2.178652536339684e-06, + "loss": 0.073, + "step": 21190 + }, + { + "epoch": 80.91603053435115, + "grad_norm": 0.45346391201019287, + "learning_rate": 2.1741039538969184e-06, + "loss": 0.0801, + "step": 21200 + }, + { + "epoch": 80.95419847328245, + "grad_norm": 0.33075547218322754, + "learning_rate": 2.169558805184313e-06, + "loss": 0.0913, + "step": 21210 + }, + { + "epoch": 80.99236641221374, + "grad_norm": 0.20161128044128418, + "learning_rate": 2.165017095724651e-06, + "loss": 0.0724, + "step": 21220 + }, + { + "epoch": 81.03053435114504, + "grad_norm": 0.4609990417957306, + "learning_rate": 2.1604788310365404e-06, + "loss": 0.0798, + "step": 21230 + }, + { + "epoch": 81.06870229007633, + "grad_norm": 0.9437888860702515, + "learning_rate": 2.155944016634398e-06, + "loss": 0.0825, + "step": 21240 + }, + { + "epoch": 81.10687022900764, + "grad_norm": 0.3145671486854553, + "learning_rate": 2.1514126580284538e-06, + "loss": 0.0653, + "step": 21250 + }, + { + "epoch": 81.14503816793894, + "grad_norm": 0.25964006781578064, + "learning_rate": 2.1468847607247344e-06, + "loss": 0.0665, + "step": 21260 + }, + { + "epoch": 81.18320610687023, + "grad_norm": 0.41988322138786316, + "learning_rate": 2.1423603302250625e-06, + "loss": 0.0732, + "step": 21270 + }, + { + "epoch": 81.22137404580153, + "grad_norm": 0.2630063593387604, + "learning_rate": 2.137839372027047e-06, + "loss": 0.073, + "step": 21280 + }, + { + "epoch": 81.25954198473282, + "grad_norm": 0.42665886878967285, + "learning_rate": 2.133321891624076e-06, + "loss": 0.0777, + "step": 21290 + }, + { + "epoch": 81.29770992366412, + "grad_norm": 0.6088788509368896, + "learning_rate": 2.1288078945053194e-06, + "loss": 0.0715, + "step": 21300 + }, + { + "epoch": 81.33587786259542, + "grad_norm": 0.7648810744285583, + "learning_rate": 2.1242973861557064e-06, + "loss": 0.0714, + "step": 21310 + }, + { + "epoch": 81.37404580152672, + "grad_norm": 0.4655439555644989, + "learning_rate": 2.1197903720559303e-06, + "loss": 0.0734, + "step": 21320 + }, + { + "epoch": 81.41221374045801, + "grad_norm": 0.8302370309829712, + "learning_rate": 2.1152868576824383e-06, + "loss": 0.0769, + "step": 21330 + }, + { + "epoch": 81.45038167938931, + "grad_norm": 0.7731935977935791, + "learning_rate": 2.110786848507423e-06, + "loss": 0.0725, + "step": 21340 + }, + { + "epoch": 81.4885496183206, + "grad_norm": 0.8815107345581055, + "learning_rate": 2.1062903499988235e-06, + "loss": 0.0682, + "step": 21350 + }, + { + "epoch": 81.52671755725191, + "grad_norm": 0.4499039649963379, + "learning_rate": 2.101797367620308e-06, + "loss": 0.0738, + "step": 21360 + }, + { + "epoch": 81.56488549618321, + "grad_norm": 0.3854046165943146, + "learning_rate": 2.0973079068312713e-06, + "loss": 0.072, + "step": 21370 + }, + { + "epoch": 81.6030534351145, + "grad_norm": 0.33728718757629395, + "learning_rate": 2.0928219730868358e-06, + "loss": 0.0666, + "step": 21380 + }, + { + "epoch": 81.6412213740458, + "grad_norm": 0.24927888810634613, + "learning_rate": 2.0883395718378304e-06, + "loss": 0.0624, + "step": 21390 + }, + { + "epoch": 81.6793893129771, + "grad_norm": 0.45682278275489807, + "learning_rate": 2.083860708530798e-06, + "loss": 0.0992, + "step": 21400 + }, + { + "epoch": 81.7175572519084, + "grad_norm": 0.29621076583862305, + "learning_rate": 2.0793853886079794e-06, + "loss": 0.0766, + "step": 21410 + }, + { + "epoch": 81.7557251908397, + "grad_norm": 0.34035375714302063, + "learning_rate": 2.074913617507309e-06, + "loss": 0.0694, + "step": 21420 + }, + { + "epoch": 81.79389312977099, + "grad_norm": 0.23349541425704956, + "learning_rate": 2.0704454006624116e-06, + "loss": 0.0709, + "step": 21430 + }, + { + "epoch": 81.83206106870229, + "grad_norm": 0.2720065414905548, + "learning_rate": 2.0659807435025907e-06, + "loss": 0.0791, + "step": 21440 + }, + { + "epoch": 81.87022900763358, + "grad_norm": 0.24106143414974213, + "learning_rate": 2.061519651452825e-06, + "loss": 0.0716, + "step": 21450 + }, + { + "epoch": 81.90839694656489, + "grad_norm": 0.5342702865600586, + "learning_rate": 2.0570621299337656e-06, + "loss": 0.073, + "step": 21460 + }, + { + "epoch": 81.94656488549619, + "grad_norm": 0.3628579080104828, + "learning_rate": 2.0526081843617183e-06, + "loss": 0.0784, + "step": 21470 + }, + { + "epoch": 81.98473282442748, + "grad_norm": 0.35211580991744995, + "learning_rate": 2.0481578201486484e-06, + "loss": 0.0673, + "step": 21480 + }, + { + "epoch": 82.02290076335878, + "grad_norm": 0.44048014283180237, + "learning_rate": 2.043711042702168e-06, + "loss": 0.0757, + "step": 21490 + }, + { + "epoch": 82.06106870229007, + "grad_norm": 0.5018337965011597, + "learning_rate": 2.039267857425528e-06, + "loss": 0.0699, + "step": 21500 + }, + { + "epoch": 82.09923664122137, + "grad_norm": 0.4347798824310303, + "learning_rate": 2.034828269717622e-06, + "loss": 0.0703, + "step": 21510 + }, + { + "epoch": 82.13740458015268, + "grad_norm": 0.2908450663089752, + "learning_rate": 2.030392284972964e-06, + "loss": 0.071, + "step": 21520 + }, + { + "epoch": 82.17557251908397, + "grad_norm": 0.39640676975250244, + "learning_rate": 2.0259599085816973e-06, + "loss": 0.0735, + "step": 21530 + }, + { + "epoch": 82.21374045801527, + "grad_norm": 0.8347475528717041, + "learning_rate": 2.0215311459295757e-06, + "loss": 0.0798, + "step": 21540 + }, + { + "epoch": 82.25190839694656, + "grad_norm": 0.3542044758796692, + "learning_rate": 2.0171060023979603e-06, + "loss": 0.0669, + "step": 21550 + }, + { + "epoch": 82.29007633587786, + "grad_norm": 0.5587815046310425, + "learning_rate": 2.012684483363823e-06, + "loss": 0.0728, + "step": 21560 + }, + { + "epoch": 82.32824427480917, + "grad_norm": 0.2797441780567169, + "learning_rate": 2.0082665941997236e-06, + "loss": 0.0699, + "step": 21570 + }, + { + "epoch": 82.36641221374046, + "grad_norm": 0.5870032906532288, + "learning_rate": 2.0038523402738147e-06, + "loss": 0.0684, + "step": 21580 + }, + { + "epoch": 82.40458015267176, + "grad_norm": 0.608632504940033, + "learning_rate": 1.99944172694983e-06, + "loss": 0.0767, + "step": 21590 + }, + { + "epoch": 82.44274809160305, + "grad_norm": 0.28920474648475647, + "learning_rate": 1.99503475958708e-06, + "loss": 0.0731, + "step": 21600 + }, + { + "epoch": 82.48091603053435, + "grad_norm": 0.22390879690647125, + "learning_rate": 1.9906314435404484e-06, + "loss": 0.0676, + "step": 21610 + }, + { + "epoch": 82.51908396946565, + "grad_norm": 0.3375746011734009, + "learning_rate": 1.986231784160378e-06, + "loss": 0.0724, + "step": 21620 + }, + { + "epoch": 82.55725190839695, + "grad_norm": 0.37056073546409607, + "learning_rate": 1.9818357867928697e-06, + "loss": 0.0741, + "step": 21630 + }, + { + "epoch": 82.59541984732824, + "grad_norm": 0.25039762258529663, + "learning_rate": 1.9774434567794744e-06, + "loss": 0.0709, + "step": 21640 + }, + { + "epoch": 82.63358778625954, + "grad_norm": 0.33184704184532166, + "learning_rate": 1.973054799457286e-06, + "loss": 0.0685, + "step": 21650 + }, + { + "epoch": 82.67175572519083, + "grad_norm": 0.4607788920402527, + "learning_rate": 1.9686698201589395e-06, + "loss": 0.0739, + "step": 21660 + }, + { + "epoch": 82.70992366412214, + "grad_norm": 0.7115771770477295, + "learning_rate": 1.9642885242125962e-06, + "loss": 0.0819, + "step": 21670 + }, + { + "epoch": 82.74809160305344, + "grad_norm": 0.3059196472167969, + "learning_rate": 1.9599109169419467e-06, + "loss": 0.065, + "step": 21680 + }, + { + "epoch": 82.78625954198473, + "grad_norm": 0.2690870761871338, + "learning_rate": 1.9555370036661946e-06, + "loss": 0.069, + "step": 21690 + }, + { + "epoch": 82.82442748091603, + "grad_norm": 0.8284600973129272, + "learning_rate": 1.9511667897000577e-06, + "loss": 0.0672, + "step": 21700 + }, + { + "epoch": 82.86259541984732, + "grad_norm": 0.5146779417991638, + "learning_rate": 1.946800280353755e-06, + "loss": 0.063, + "step": 21710 + }, + { + "epoch": 82.90076335877862, + "grad_norm": 0.27559417486190796, + "learning_rate": 1.9424374809330117e-06, + "loss": 0.0717, + "step": 21720 + }, + { + "epoch": 82.93893129770993, + "grad_norm": 0.3968946933746338, + "learning_rate": 1.938078396739038e-06, + "loss": 0.0662, + "step": 21730 + }, + { + "epoch": 82.97709923664122, + "grad_norm": 0.3083120286464691, + "learning_rate": 1.9337230330685332e-06, + "loss": 0.0686, + "step": 21740 + }, + { + "epoch": 83.01526717557252, + "grad_norm": 0.46116316318511963, + "learning_rate": 1.929371395213674e-06, + "loss": 0.0809, + "step": 21750 + }, + { + "epoch": 83.05343511450381, + "grad_norm": 0.3158531188964844, + "learning_rate": 1.9250234884621093e-06, + "loss": 0.0725, + "step": 21760 + }, + { + "epoch": 83.09160305343511, + "grad_norm": 0.5462889671325684, + "learning_rate": 1.9206793180969593e-06, + "loss": 0.0671, + "step": 21770 + }, + { + "epoch": 83.12977099236642, + "grad_norm": 0.4078521430492401, + "learning_rate": 1.916338889396798e-06, + "loss": 0.0725, + "step": 21780 + }, + { + "epoch": 83.16793893129771, + "grad_norm": 0.46893805265426636, + "learning_rate": 1.9120022076356577e-06, + "loss": 0.0741, + "step": 21790 + }, + { + "epoch": 83.20610687022901, + "grad_norm": 0.8775691986083984, + "learning_rate": 1.9076692780830115e-06, + "loss": 0.0706, + "step": 21800 + }, + { + "epoch": 83.2442748091603, + "grad_norm": 0.24274039268493652, + "learning_rate": 1.903340106003782e-06, + "loss": 0.0711, + "step": 21810 + }, + { + "epoch": 83.2824427480916, + "grad_norm": 0.5412287712097168, + "learning_rate": 1.8990146966583183e-06, + "loss": 0.0716, + "step": 21820 + }, + { + "epoch": 83.3206106870229, + "grad_norm": 0.26575616002082825, + "learning_rate": 1.8946930553024034e-06, + "loss": 0.0693, + "step": 21830 + }, + { + "epoch": 83.3587786259542, + "grad_norm": 0.39014455676078796, + "learning_rate": 1.8903751871872377e-06, + "loss": 0.0669, + "step": 21840 + }, + { + "epoch": 83.3969465648855, + "grad_norm": 0.2963654398918152, + "learning_rate": 1.8860610975594384e-06, + "loss": 0.0783, + "step": 21850 + }, + { + "epoch": 83.43511450381679, + "grad_norm": 0.2706157863140106, + "learning_rate": 1.8817507916610307e-06, + "loss": 0.0688, + "step": 21860 + }, + { + "epoch": 83.47328244274809, + "grad_norm": 0.369393914937973, + "learning_rate": 1.8774442747294407e-06, + "loss": 0.0666, + "step": 21870 + }, + { + "epoch": 83.5114503816794, + "grad_norm": 0.44239068031311035, + "learning_rate": 1.8731415519974967e-06, + "loss": 0.0754, + "step": 21880 + }, + { + "epoch": 83.54961832061069, + "grad_norm": 0.29093948006629944, + "learning_rate": 1.8688426286934102e-06, + "loss": 0.0718, + "step": 21890 + }, + { + "epoch": 83.58778625954199, + "grad_norm": 0.45363718271255493, + "learning_rate": 1.864547510040779e-06, + "loss": 0.0676, + "step": 21900 + }, + { + "epoch": 83.62595419847328, + "grad_norm": 0.427654892206192, + "learning_rate": 1.8602562012585768e-06, + "loss": 0.0667, + "step": 21910 + }, + { + "epoch": 83.66412213740458, + "grad_norm": 0.2690734267234802, + "learning_rate": 1.8559687075611466e-06, + "loss": 0.0648, + "step": 21920 + }, + { + "epoch": 83.70229007633588, + "grad_norm": 0.26419100165367126, + "learning_rate": 1.8516850341582015e-06, + "loss": 0.0647, + "step": 21930 + }, + { + "epoch": 83.74045801526718, + "grad_norm": 0.4640803337097168, + "learning_rate": 1.847405186254807e-06, + "loss": 0.0662, + "step": 21940 + }, + { + "epoch": 83.77862595419847, + "grad_norm": 0.3314109742641449, + "learning_rate": 1.8431291690513791e-06, + "loss": 0.0684, + "step": 21950 + }, + { + "epoch": 83.81679389312977, + "grad_norm": 1.6556986570358276, + "learning_rate": 1.8388569877436863e-06, + "loss": 0.0736, + "step": 21960 + }, + { + "epoch": 83.85496183206106, + "grad_norm": 0.2407653033733368, + "learning_rate": 1.834588647522828e-06, + "loss": 0.0648, + "step": 21970 + }, + { + "epoch": 83.89312977099236, + "grad_norm": 0.2633666694164276, + "learning_rate": 1.8303241535752437e-06, + "loss": 0.0793, + "step": 21980 + }, + { + "epoch": 83.93129770992367, + "grad_norm": 0.3340645730495453, + "learning_rate": 1.8260635110826936e-06, + "loss": 0.0684, + "step": 21990 + }, + { + "epoch": 83.96946564885496, + "grad_norm": 0.5088317394256592, + "learning_rate": 1.82180672522226e-06, + "loss": 0.0668, + "step": 22000 + }, + { + "epoch": 84.00763358778626, + "grad_norm": 0.4002598822116852, + "learning_rate": 1.817553801166339e-06, + "loss": 0.0733, + "step": 22010 + }, + { + "epoch": 84.04580152671755, + "grad_norm": 0.34704163670539856, + "learning_rate": 1.8133047440826335e-06, + "loss": 0.0655, + "step": 22020 + }, + { + "epoch": 84.08396946564885, + "grad_norm": 0.27369430661201477, + "learning_rate": 1.8090595591341509e-06, + "loss": 0.0687, + "step": 22030 + }, + { + "epoch": 84.12213740458016, + "grad_norm": 0.6098417043685913, + "learning_rate": 1.8048182514791901e-06, + "loss": 0.0667, + "step": 22040 + }, + { + "epoch": 84.16030534351145, + "grad_norm": 0.3603808581829071, + "learning_rate": 1.8005808262713399e-06, + "loss": 0.0636, + "step": 22050 + }, + { + "epoch": 84.19847328244275, + "grad_norm": 0.5261178016662598, + "learning_rate": 1.7963472886594713e-06, + "loss": 0.0674, + "step": 22060 + }, + { + "epoch": 84.23664122137404, + "grad_norm": 0.27495747804641724, + "learning_rate": 1.7921176437877302e-06, + "loss": 0.0663, + "step": 22070 + }, + { + "epoch": 84.27480916030534, + "grad_norm": 0.5118312835693359, + "learning_rate": 1.7878918967955366e-06, + "loss": 0.0722, + "step": 22080 + }, + { + "epoch": 84.31297709923665, + "grad_norm": 0.2527947723865509, + "learning_rate": 1.7836700528175693e-06, + "loss": 0.0681, + "step": 22090 + }, + { + "epoch": 84.35114503816794, + "grad_norm": 0.25550681352615356, + "learning_rate": 1.7794521169837693e-06, + "loss": 0.0694, + "step": 22100 + }, + { + "epoch": 84.38931297709924, + "grad_norm": 0.6813188195228577, + "learning_rate": 1.7752380944193248e-06, + "loss": 0.0719, + "step": 22110 + }, + { + "epoch": 84.42748091603053, + "grad_norm": 0.6162485480308533, + "learning_rate": 1.771027990244671e-06, + "loss": 0.0768, + "step": 22120 + }, + { + "epoch": 84.46564885496183, + "grad_norm": 0.2738633453845978, + "learning_rate": 1.7668218095754797e-06, + "loss": 0.0703, + "step": 22130 + }, + { + "epoch": 84.50381679389314, + "grad_norm": 0.9221097230911255, + "learning_rate": 1.7626195575226595e-06, + "loss": 0.0679, + "step": 22140 + }, + { + "epoch": 84.54198473282443, + "grad_norm": 0.2497481107711792, + "learning_rate": 1.7584212391923428e-06, + "loss": 0.0721, + "step": 22150 + }, + { + "epoch": 84.58015267175573, + "grad_norm": 0.36537232995033264, + "learning_rate": 1.7542268596858813e-06, + "loss": 0.0772, + "step": 22160 + }, + { + "epoch": 84.61832061068702, + "grad_norm": 0.2652266025543213, + "learning_rate": 1.7500364240998412e-06, + "loss": 0.0693, + "step": 22170 + }, + { + "epoch": 84.65648854961832, + "grad_norm": 0.3559548258781433, + "learning_rate": 1.7458499375259957e-06, + "loss": 0.0703, + "step": 22180 + }, + { + "epoch": 84.69465648854961, + "grad_norm": 0.40493687987327576, + "learning_rate": 1.7416674050513243e-06, + "loss": 0.0613, + "step": 22190 + }, + { + "epoch": 84.73282442748092, + "grad_norm": 0.38015782833099365, + "learning_rate": 1.7374888317579968e-06, + "loss": 0.0736, + "step": 22200 + }, + { + "epoch": 84.77099236641222, + "grad_norm": 0.19811968505382538, + "learning_rate": 1.7333142227233728e-06, + "loss": 0.0659, + "step": 22210 + }, + { + "epoch": 84.80916030534351, + "grad_norm": 0.5039312839508057, + "learning_rate": 1.7291435830199954e-06, + "loss": 0.0726, + "step": 22220 + }, + { + "epoch": 84.8473282442748, + "grad_norm": 0.36674124002456665, + "learning_rate": 1.7249769177155879e-06, + "loss": 0.0657, + "step": 22230 + }, + { + "epoch": 84.8854961832061, + "grad_norm": 0.7190355062484741, + "learning_rate": 1.720814231873038e-06, + "loss": 0.0783, + "step": 22240 + }, + { + "epoch": 84.92366412213741, + "grad_norm": 0.3150536119937897, + "learning_rate": 1.716655530550405e-06, + "loss": 0.0755, + "step": 22250 + }, + { + "epoch": 84.9618320610687, + "grad_norm": 0.29341939091682434, + "learning_rate": 1.7125008188009018e-06, + "loss": 0.0667, + "step": 22260 + }, + { + "epoch": 85.0, + "grad_norm": 0.3069048225879669, + "learning_rate": 1.7083501016728944e-06, + "loss": 0.0674, + "step": 22270 + }, + { + "epoch": 85.0381679389313, + "grad_norm": 0.3248658776283264, + "learning_rate": 1.704203384209896e-06, + "loss": 0.0649, + "step": 22280 + }, + { + "epoch": 85.07633587786259, + "grad_norm": 0.2633165121078491, + "learning_rate": 1.700060671450557e-06, + "loss": 0.0705, + "step": 22290 + }, + { + "epoch": 85.1145038167939, + "grad_norm": 0.2645620107650757, + "learning_rate": 1.6959219684286681e-06, + "loss": 0.062, + "step": 22300 + }, + { + "epoch": 85.1526717557252, + "grad_norm": 0.25467944145202637, + "learning_rate": 1.6917872801731417e-06, + "loss": 0.0726, + "step": 22310 + }, + { + "epoch": 85.19083969465649, + "grad_norm": 0.4649989902973175, + "learning_rate": 1.6876566117080139e-06, + "loss": 0.0737, + "step": 22320 + }, + { + "epoch": 85.22900763358778, + "grad_norm": 0.4027472138404846, + "learning_rate": 1.683529968052437e-06, + "loss": 0.069, + "step": 22330 + }, + { + "epoch": 85.26717557251908, + "grad_norm": 0.35890597105026245, + "learning_rate": 1.679407354220669e-06, + "loss": 0.0709, + "step": 22340 + }, + { + "epoch": 85.30534351145039, + "grad_norm": 0.24897009134292603, + "learning_rate": 1.6752887752220792e-06, + "loss": 0.0682, + "step": 22350 + }, + { + "epoch": 85.34351145038168, + "grad_norm": 0.31344956159591675, + "learning_rate": 1.6711742360611277e-06, + "loss": 0.0736, + "step": 22360 + }, + { + "epoch": 85.38167938931298, + "grad_norm": 0.633032500743866, + "learning_rate": 1.6670637417373652e-06, + "loss": 0.0769, + "step": 22370 + }, + { + "epoch": 85.41984732824427, + "grad_norm": 0.41895341873168945, + "learning_rate": 1.6629572972454333e-06, + "loss": 0.0666, + "step": 22380 + }, + { + "epoch": 85.45801526717557, + "grad_norm": 0.25024113059043884, + "learning_rate": 1.6588549075750466e-06, + "loss": 0.075, + "step": 22390 + }, + { + "epoch": 85.49618320610686, + "grad_norm": 0.3272351622581482, + "learning_rate": 1.6547565777109975e-06, + "loss": 0.0718, + "step": 22400 + }, + { + "epoch": 85.53435114503817, + "grad_norm": 0.6027217507362366, + "learning_rate": 1.6506623126331427e-06, + "loss": 0.0696, + "step": 22410 + }, + { + "epoch": 85.57251908396947, + "grad_norm": 0.4890408515930176, + "learning_rate": 1.6465721173164e-06, + "loss": 0.0708, + "step": 22420 + }, + { + "epoch": 85.61068702290076, + "grad_norm": 0.39123833179473877, + "learning_rate": 1.6424859967307427e-06, + "loss": 0.0687, + "step": 22430 + }, + { + "epoch": 85.64885496183206, + "grad_norm": 0.26333490014076233, + "learning_rate": 1.6384039558411902e-06, + "loss": 0.0725, + "step": 22440 + }, + { + "epoch": 85.68702290076335, + "grad_norm": 0.32792478799819946, + "learning_rate": 1.6343259996078109e-06, + "loss": 0.0816, + "step": 22450 + }, + { + "epoch": 85.72519083969466, + "grad_norm": 0.32284873723983765, + "learning_rate": 1.6302521329857046e-06, + "loss": 0.0686, + "step": 22460 + }, + { + "epoch": 85.76335877862596, + "grad_norm": 0.35827475786209106, + "learning_rate": 1.6261823609250027e-06, + "loss": 0.0702, + "step": 22470 + }, + { + "epoch": 85.80152671755725, + "grad_norm": 0.28327593207359314, + "learning_rate": 1.6221166883708629e-06, + "loss": 0.0657, + "step": 22480 + }, + { + "epoch": 85.83969465648855, + "grad_norm": 0.2737085223197937, + "learning_rate": 1.6180551202634603e-06, + "loss": 0.0697, + "step": 22490 + }, + { + "epoch": 85.87786259541984, + "grad_norm": 0.29437166452407837, + "learning_rate": 1.613997661537981e-06, + "loss": 0.0692, + "step": 22500 + }, + { + "epoch": 85.91603053435115, + "grad_norm": 0.6566985249519348, + "learning_rate": 1.6099443171246243e-06, + "loss": 0.0664, + "step": 22510 + }, + { + "epoch": 85.95419847328245, + "grad_norm": 0.3802522122859955, + "learning_rate": 1.6058950919485823e-06, + "loss": 0.0635, + "step": 22520 + }, + { + "epoch": 85.99236641221374, + "grad_norm": 0.44286301732063293, + "learning_rate": 1.6018499909300478e-06, + "loss": 0.0697, + "step": 22530 + }, + { + "epoch": 86.03053435114504, + "grad_norm": 0.2453600913286209, + "learning_rate": 1.5978090189841988e-06, + "loss": 0.0671, + "step": 22540 + }, + { + "epoch": 86.06870229007633, + "grad_norm": 0.2875606417655945, + "learning_rate": 1.5937721810211958e-06, + "loss": 0.0676, + "step": 22550 + }, + { + "epoch": 86.10687022900764, + "grad_norm": 0.7595479488372803, + "learning_rate": 1.5897394819461815e-06, + "loss": 0.078, + "step": 22560 + }, + { + "epoch": 86.14503816793894, + "grad_norm": 0.2808679938316345, + "learning_rate": 1.5857109266592624e-06, + "loss": 0.0699, + "step": 22570 + }, + { + "epoch": 86.18320610687023, + "grad_norm": 0.27129414677619934, + "learning_rate": 1.5816865200555142e-06, + "loss": 0.0662, + "step": 22580 + }, + { + "epoch": 86.22137404580153, + "grad_norm": 0.27305206656455994, + "learning_rate": 1.5776662670249704e-06, + "loss": 0.0666, + "step": 22590 + }, + { + "epoch": 86.25954198473282, + "grad_norm": 0.4132048487663269, + "learning_rate": 1.573650172452615e-06, + "loss": 0.0711, + "step": 22600 + }, + { + "epoch": 86.29770992366412, + "grad_norm": 0.39085114002227783, + "learning_rate": 1.5696382412183853e-06, + "loss": 0.0721, + "step": 22610 + }, + { + "epoch": 86.33587786259542, + "grad_norm": 0.570540726184845, + "learning_rate": 1.5656304781971549e-06, + "loss": 0.0726, + "step": 22620 + }, + { + "epoch": 86.37404580152672, + "grad_norm": 0.452827513217926, + "learning_rate": 1.5616268882587331e-06, + "loss": 0.0698, + "step": 22630 + }, + { + "epoch": 86.41221374045801, + "grad_norm": 0.2821395993232727, + "learning_rate": 1.5576274762678594e-06, + "loss": 0.0672, + "step": 22640 + }, + { + "epoch": 86.45038167938931, + "grad_norm": 0.30585944652557373, + "learning_rate": 1.5536322470841953e-06, + "loss": 0.0714, + "step": 22650 + }, + { + "epoch": 86.4885496183206, + "grad_norm": 1.8794187307357788, + "learning_rate": 1.5496412055623238e-06, + "loss": 0.0727, + "step": 22660 + }, + { + "epoch": 86.52671755725191, + "grad_norm": 0.3682367205619812, + "learning_rate": 1.5456543565517334e-06, + "loss": 0.0684, + "step": 22670 + }, + { + "epoch": 86.56488549618321, + "grad_norm": 0.6367046236991882, + "learning_rate": 1.541671704896825e-06, + "loss": 0.0734, + "step": 22680 + }, + { + "epoch": 86.6030534351145, + "grad_norm": 0.2984490692615509, + "learning_rate": 1.537693255436894e-06, + "loss": 0.0801, + "step": 22690 + }, + { + "epoch": 86.6412213740458, + "grad_norm": 0.26280051469802856, + "learning_rate": 1.533719013006132e-06, + "loss": 0.0734, + "step": 22700 + }, + { + "epoch": 86.6793893129771, + "grad_norm": 0.6878067255020142, + "learning_rate": 1.5297489824336165e-06, + "loss": 0.0751, + "step": 22710 + }, + { + "epoch": 86.7175572519084, + "grad_norm": 0.42184221744537354, + "learning_rate": 1.5257831685433121e-06, + "loss": 0.0679, + "step": 22720 + }, + { + "epoch": 86.7557251908397, + "grad_norm": 0.33254826068878174, + "learning_rate": 1.521821576154055e-06, + "loss": 0.0673, + "step": 22730 + }, + { + "epoch": 86.79389312977099, + "grad_norm": 0.20484282076358795, + "learning_rate": 1.5178642100795543e-06, + "loss": 0.0659, + "step": 22740 + }, + { + "epoch": 86.83206106870229, + "grad_norm": 0.32377350330352783, + "learning_rate": 1.5139110751283819e-06, + "loss": 0.0704, + "step": 22750 + }, + { + "epoch": 86.87022900763358, + "grad_norm": 0.3272833228111267, + "learning_rate": 1.5099621761039684e-06, + "loss": 0.0702, + "step": 22760 + }, + { + "epoch": 86.90839694656489, + "grad_norm": 0.5103157162666321, + "learning_rate": 1.5060175178046017e-06, + "loss": 0.0779, + "step": 22770 + }, + { + "epoch": 86.94656488549619, + "grad_norm": 0.2767445743083954, + "learning_rate": 1.5020771050234118e-06, + "loss": 0.065, + "step": 22780 + }, + { + "epoch": 86.98473282442748, + "grad_norm": 0.6275362968444824, + "learning_rate": 1.4981409425483716e-06, + "loss": 0.065, + "step": 22790 + }, + { + "epoch": 87.02290076335878, + "grad_norm": 0.2717098593711853, + "learning_rate": 1.4942090351622884e-06, + "loss": 0.0772, + "step": 22800 + }, + { + "epoch": 87.06106870229007, + "grad_norm": 0.5612622499465942, + "learning_rate": 1.490281387642804e-06, + "loss": 0.0681, + "step": 22810 + }, + { + "epoch": 87.09923664122137, + "grad_norm": 0.27303266525268555, + "learning_rate": 1.4863580047623767e-06, + "loss": 0.068, + "step": 22820 + }, + { + "epoch": 87.13740458015268, + "grad_norm": 0.3088218867778778, + "learning_rate": 1.4824388912882897e-06, + "loss": 0.076, + "step": 22830 + }, + { + "epoch": 87.17557251908397, + "grad_norm": 0.22109341621398926, + "learning_rate": 1.4785240519826345e-06, + "loss": 0.065, + "step": 22840 + }, + { + "epoch": 87.21374045801527, + "grad_norm": 0.3760296404361725, + "learning_rate": 1.4746134916023096e-06, + "loss": 0.0653, + "step": 22850 + }, + { + "epoch": 87.25190839694656, + "grad_norm": 0.4311428368091583, + "learning_rate": 1.4707072148990142e-06, + "loss": 0.0673, + "step": 22860 + }, + { + "epoch": 87.29007633587786, + "grad_norm": 0.9990677237510681, + "learning_rate": 1.4668052266192423e-06, + "loss": 0.0692, + "step": 22870 + }, + { + "epoch": 87.32824427480917, + "grad_norm": 0.2663988769054413, + "learning_rate": 1.4629075315042795e-06, + "loss": 0.0654, + "step": 22880 + }, + { + "epoch": 87.36641221374046, + "grad_norm": 0.2906090021133423, + "learning_rate": 1.4590141342901926e-06, + "loss": 0.0667, + "step": 22890 + }, + { + "epoch": 87.40458015267176, + "grad_norm": 0.5465324521064758, + "learning_rate": 1.4551250397078253e-06, + "loss": 0.0716, + "step": 22900 + }, + { + "epoch": 87.44274809160305, + "grad_norm": 0.29929447174072266, + "learning_rate": 1.4512402524827945e-06, + "loss": 0.0755, + "step": 22910 + }, + { + "epoch": 87.48091603053435, + "grad_norm": 0.3241725564002991, + "learning_rate": 1.447359777335482e-06, + "loss": 0.0635, + "step": 22920 + }, + { + "epoch": 87.51908396946565, + "grad_norm": 0.28157156705856323, + "learning_rate": 1.4434836189810337e-06, + "loss": 0.0651, + "step": 22930 + }, + { + "epoch": 87.55725190839695, + "grad_norm": 0.32432863116264343, + "learning_rate": 1.4396117821293454e-06, + "loss": 0.0682, + "step": 22940 + }, + { + "epoch": 87.59541984732824, + "grad_norm": 0.39860352873802185, + "learning_rate": 1.4357442714850634e-06, + "loss": 0.0703, + "step": 22950 + }, + { + "epoch": 87.63358778625954, + "grad_norm": 0.4769950211048126, + "learning_rate": 1.43188109174758e-06, + "loss": 0.0657, + "step": 22960 + }, + { + "epoch": 87.67175572519083, + "grad_norm": 0.42243120074272156, + "learning_rate": 1.4280222476110206e-06, + "loss": 0.0674, + "step": 22970 + }, + { + "epoch": 87.70992366412214, + "grad_norm": 0.6192082166671753, + "learning_rate": 1.424167743764247e-06, + "loss": 0.0724, + "step": 22980 + }, + { + "epoch": 87.74809160305344, + "grad_norm": 0.6476142406463623, + "learning_rate": 1.420317584890844e-06, + "loss": 0.0671, + "step": 22990 + }, + { + "epoch": 87.78625954198473, + "grad_norm": 0.6006608605384827, + "learning_rate": 1.4164717756691176e-06, + "loss": 0.0727, + "step": 23000 + }, + { + "epoch": 87.82442748091603, + "grad_norm": 0.5018774271011353, + "learning_rate": 1.4126303207720882e-06, + "loss": 0.0644, + "step": 23010 + }, + { + "epoch": 87.86259541984732, + "grad_norm": 0.19781306385993958, + "learning_rate": 1.408793224867484e-06, + "loss": 0.0683, + "step": 23020 + }, + { + "epoch": 87.90076335877862, + "grad_norm": 0.3770202100276947, + "learning_rate": 1.4049604926177423e-06, + "loss": 0.0748, + "step": 23030 + }, + { + "epoch": 87.93893129770993, + "grad_norm": 0.23254378139972687, + "learning_rate": 1.4011321286799918e-06, + "loss": 0.071, + "step": 23040 + }, + { + "epoch": 87.97709923664122, + "grad_norm": 0.2871849238872528, + "learning_rate": 1.3973081377060565e-06, + "loss": 0.0685, + "step": 23050 + }, + { + "epoch": 88.01526717557252, + "grad_norm": 0.34911373257637024, + "learning_rate": 1.393488524342445e-06, + "loss": 0.0669, + "step": 23060 + }, + { + "epoch": 88.05343511450381, + "grad_norm": 0.23986873030662537, + "learning_rate": 1.3896732932303485e-06, + "loss": 0.0698, + "step": 23070 + }, + { + "epoch": 88.09160305343511, + "grad_norm": 0.26337021589279175, + "learning_rate": 1.3858624490056304e-06, + "loss": 0.067, + "step": 23080 + }, + { + "epoch": 88.12977099236642, + "grad_norm": 0.4079955220222473, + "learning_rate": 1.38205599629883e-06, + "loss": 0.0705, + "step": 23090 + }, + { + "epoch": 88.16793893129771, + "grad_norm": 0.5138763785362244, + "learning_rate": 1.378253939735142e-06, + "loss": 0.0691, + "step": 23100 + }, + { + "epoch": 88.20610687022901, + "grad_norm": 0.4184453785419464, + "learning_rate": 1.3744562839344267e-06, + "loss": 0.0688, + "step": 23110 + }, + { + "epoch": 88.2442748091603, + "grad_norm": 0.24194218218326569, + "learning_rate": 1.3706630335111932e-06, + "loss": 0.0689, + "step": 23120 + }, + { + "epoch": 88.2824427480916, + "grad_norm": 0.48007258772850037, + "learning_rate": 1.3668741930745966e-06, + "loss": 0.0766, + "step": 23130 + }, + { + "epoch": 88.3206106870229, + "grad_norm": 0.2578524351119995, + "learning_rate": 1.3630897672284382e-06, + "loss": 0.0634, + "step": 23140 + }, + { + "epoch": 88.3587786259542, + "grad_norm": 0.40297532081604004, + "learning_rate": 1.3593097605711508e-06, + "loss": 0.0656, + "step": 23150 + }, + { + "epoch": 88.3969465648855, + "grad_norm": 0.31493544578552246, + "learning_rate": 1.3555341776957992e-06, + "loss": 0.0729, + "step": 23160 + }, + { + "epoch": 88.43511450381679, + "grad_norm": 0.2946300208568573, + "learning_rate": 1.3517630231900724e-06, + "loss": 0.0722, + "step": 23170 + }, + { + "epoch": 88.47328244274809, + "grad_norm": 0.23656029999256134, + "learning_rate": 1.3479963016362768e-06, + "loss": 0.0721, + "step": 23180 + }, + { + "epoch": 88.5114503816794, + "grad_norm": 0.3117351233959198, + "learning_rate": 1.3442340176113378e-06, + "loss": 0.0699, + "step": 23190 + }, + { + "epoch": 88.54961832061069, + "grad_norm": 0.4488414227962494, + "learning_rate": 1.340476175686784e-06, + "loss": 0.0683, + "step": 23200 + }, + { + "epoch": 88.58778625954199, + "grad_norm": 0.3813345730304718, + "learning_rate": 1.336722780428747e-06, + "loss": 0.0699, + "step": 23210 + }, + { + "epoch": 88.62595419847328, + "grad_norm": 0.2527889311313629, + "learning_rate": 1.3329738363979561e-06, + "loss": 0.0717, + "step": 23220 + }, + { + "epoch": 88.66412213740458, + "grad_norm": 0.5992899537086487, + "learning_rate": 1.329229348149731e-06, + "loss": 0.07, + "step": 23230 + }, + { + "epoch": 88.70229007633588, + "grad_norm": 1.4383306503295898, + "learning_rate": 1.3254893202339798e-06, + "loss": 0.0725, + "step": 23240 + }, + { + "epoch": 88.74045801526718, + "grad_norm": 0.48335134983062744, + "learning_rate": 1.3217537571951872e-06, + "loss": 0.0736, + "step": 23250 + }, + { + "epoch": 88.77862595419847, + "grad_norm": 0.23111307621002197, + "learning_rate": 1.3180226635724169e-06, + "loss": 0.0851, + "step": 23260 + }, + { + "epoch": 88.81679389312977, + "grad_norm": 2.051677942276001, + "learning_rate": 1.314296043899298e-06, + "loss": 0.0734, + "step": 23270 + }, + { + "epoch": 88.85496183206106, + "grad_norm": 0.24527736008167267, + "learning_rate": 1.3105739027040248e-06, + "loss": 0.0708, + "step": 23280 + }, + { + "epoch": 88.89312977099236, + "grad_norm": 0.3417549729347229, + "learning_rate": 1.306856244509349e-06, + "loss": 0.0741, + "step": 23290 + }, + { + "epoch": 88.93129770992367, + "grad_norm": 0.42347151041030884, + "learning_rate": 1.3031430738325778e-06, + "loss": 0.0738, + "step": 23300 + }, + { + "epoch": 88.96946564885496, + "grad_norm": 0.3796163499355316, + "learning_rate": 1.299434395185563e-06, + "loss": 0.0675, + "step": 23310 + }, + { + "epoch": 89.00763358778626, + "grad_norm": 0.5870466828346252, + "learning_rate": 1.295730213074699e-06, + "loss": 0.0689, + "step": 23320 + }, + { + "epoch": 89.04580152671755, + "grad_norm": 0.3642314076423645, + "learning_rate": 1.2920305320009153e-06, + "loss": 0.064, + "step": 23330 + }, + { + "epoch": 89.08396946564885, + "grad_norm": 0.23387549817562103, + "learning_rate": 1.2883353564596729e-06, + "loss": 0.0675, + "step": 23340 + }, + { + "epoch": 89.12213740458016, + "grad_norm": 0.29250991344451904, + "learning_rate": 1.2846446909409605e-06, + "loss": 0.0627, + "step": 23350 + }, + { + "epoch": 89.16030534351145, + "grad_norm": 0.3102540075778961, + "learning_rate": 1.280958539929284e-06, + "loss": 0.0674, + "step": 23360 + }, + { + "epoch": 89.19847328244275, + "grad_norm": 0.2419544756412506, + "learning_rate": 1.2772769079036639e-06, + "loss": 0.0644, + "step": 23370 + }, + { + "epoch": 89.23664122137404, + "grad_norm": 0.26334354281425476, + "learning_rate": 1.2735997993376299e-06, + "loss": 0.0717, + "step": 23380 + }, + { + "epoch": 89.27480916030534, + "grad_norm": 0.30459722876548767, + "learning_rate": 1.2699272186992168e-06, + "loss": 0.0671, + "step": 23390 + }, + { + "epoch": 89.31297709923665, + "grad_norm": 0.49876540899276733, + "learning_rate": 1.2662591704509548e-06, + "loss": 0.068, + "step": 23400 + }, + { + "epoch": 89.35114503816794, + "grad_norm": 0.8938238620758057, + "learning_rate": 1.2625956590498712e-06, + "loss": 0.0736, + "step": 23410 + }, + { + "epoch": 89.38931297709924, + "grad_norm": 0.3588089942932129, + "learning_rate": 1.2589366889474758e-06, + "loss": 0.0731, + "step": 23420 + }, + { + "epoch": 89.42748091603053, + "grad_norm": 0.24369336664676666, + "learning_rate": 1.2552822645897623e-06, + "loss": 0.0688, + "step": 23430 + }, + { + "epoch": 89.46564885496183, + "grad_norm": 0.9177067279815674, + "learning_rate": 1.2516323904172001e-06, + "loss": 0.0654, + "step": 23440 + }, + { + "epoch": 89.50381679389314, + "grad_norm": 0.2567913830280304, + "learning_rate": 1.2479870708647324e-06, + "loss": 0.0669, + "step": 23450 + }, + { + "epoch": 89.54198473282443, + "grad_norm": 1.152028203010559, + "learning_rate": 1.2443463103617658e-06, + "loss": 0.066, + "step": 23460 + }, + { + "epoch": 89.58015267175573, + "grad_norm": 0.28860822319984436, + "learning_rate": 1.240710113332167e-06, + "loss": 0.068, + "step": 23470 + }, + { + "epoch": 89.61832061068702, + "grad_norm": 0.17750433087348938, + "learning_rate": 1.2370784841942584e-06, + "loss": 0.0676, + "step": 23480 + }, + { + "epoch": 89.65648854961832, + "grad_norm": 0.28479427099227905, + "learning_rate": 1.2334514273608117e-06, + "loss": 0.0668, + "step": 23490 + }, + { + "epoch": 89.69465648854961, + "grad_norm": 0.2883140444755554, + "learning_rate": 1.2298289472390417e-06, + "loss": 0.0837, + "step": 23500 + }, + { + "epoch": 89.73282442748092, + "grad_norm": 0.22831976413726807, + "learning_rate": 1.226211048230606e-06, + "loss": 0.073, + "step": 23510 + }, + { + "epoch": 89.77099236641222, + "grad_norm": 0.20338012278079987, + "learning_rate": 1.222597734731592e-06, + "loss": 0.0617, + "step": 23520 + }, + { + "epoch": 89.80916030534351, + "grad_norm": 0.4108172655105591, + "learning_rate": 1.2189890111325149e-06, + "loss": 0.0682, + "step": 23530 + }, + { + "epoch": 89.8473282442748, + "grad_norm": 0.3931494951248169, + "learning_rate": 1.2153848818183161e-06, + "loss": 0.0656, + "step": 23540 + }, + { + "epoch": 89.8854961832061, + "grad_norm": 0.5028300285339355, + "learning_rate": 1.2117853511683509e-06, + "loss": 0.0717, + "step": 23550 + }, + { + "epoch": 89.92366412213741, + "grad_norm": 0.5321047902107239, + "learning_rate": 1.2081904235563908e-06, + "loss": 0.0713, + "step": 23560 + }, + { + "epoch": 89.9618320610687, + "grad_norm": 0.40350303053855896, + "learning_rate": 1.20460010335061e-06, + "loss": 0.0666, + "step": 23570 + }, + { + "epoch": 90.0, + "grad_norm": 0.23981277644634247, + "learning_rate": 1.2010143949135866e-06, + "loss": 0.0726, + "step": 23580 + }, + { + "epoch": 90.0381679389313, + "grad_norm": 0.3623555600643158, + "learning_rate": 1.1974333026022939e-06, + "loss": 0.0692, + "step": 23590 + }, + { + "epoch": 90.07633587786259, + "grad_norm": 0.23631367087364197, + "learning_rate": 1.1938568307680965e-06, + "loss": 0.0698, + "step": 23600 + }, + { + "epoch": 90.1145038167939, + "grad_norm": 0.4967484176158905, + "learning_rate": 1.1902849837567466e-06, + "loss": 0.0695, + "step": 23610 + }, + { + "epoch": 90.1526717557252, + "grad_norm": 0.24320781230926514, + "learning_rate": 1.186717765908374e-06, + "loss": 0.0705, + "step": 23620 + }, + { + "epoch": 90.19083969465649, + "grad_norm": 0.7703472375869751, + "learning_rate": 1.1831551815574848e-06, + "loss": 0.0642, + "step": 23630 + }, + { + "epoch": 90.22900763358778, + "grad_norm": 0.24756668508052826, + "learning_rate": 1.1795972350329554e-06, + "loss": 0.0712, + "step": 23640 + }, + { + "epoch": 90.26717557251908, + "grad_norm": 0.45252394676208496, + "learning_rate": 1.176043930658024e-06, + "loss": 0.0674, + "step": 23650 + }, + { + "epoch": 90.30534351145039, + "grad_norm": 0.47805851697921753, + "learning_rate": 1.1724952727502942e-06, + "loss": 0.0754, + "step": 23660 + }, + { + "epoch": 90.34351145038168, + "grad_norm": 0.6508886814117432, + "learning_rate": 1.1689512656217178e-06, + "loss": 0.0732, + "step": 23670 + }, + { + "epoch": 90.38167938931298, + "grad_norm": 0.37030789256095886, + "learning_rate": 1.1654119135785964e-06, + "loss": 0.072, + "step": 23680 + }, + { + "epoch": 90.41984732824427, + "grad_norm": 0.43612009286880493, + "learning_rate": 1.1618772209215795e-06, + "loss": 0.0714, + "step": 23690 + }, + { + "epoch": 90.45801526717557, + "grad_norm": 0.3412275016307831, + "learning_rate": 1.1583471919456506e-06, + "loss": 0.0713, + "step": 23700 + }, + { + "epoch": 90.49618320610686, + "grad_norm": 0.24611587822437286, + "learning_rate": 1.1548218309401267e-06, + "loss": 0.075, + "step": 23710 + }, + { + "epoch": 90.53435114503817, + "grad_norm": 0.4314908981323242, + "learning_rate": 1.1513011421886554e-06, + "loss": 0.0646, + "step": 23720 + }, + { + "epoch": 90.57251908396947, + "grad_norm": 0.38820695877075195, + "learning_rate": 1.1477851299692056e-06, + "loss": 0.0712, + "step": 23730 + }, + { + "epoch": 90.61068702290076, + "grad_norm": 0.28595876693725586, + "learning_rate": 1.1442737985540631e-06, + "loss": 0.0646, + "step": 23740 + }, + { + "epoch": 90.64885496183206, + "grad_norm": 0.6347960233688354, + "learning_rate": 1.1407671522098262e-06, + "loss": 0.0855, + "step": 23750 + }, + { + "epoch": 90.68702290076335, + "grad_norm": 0.19028426706790924, + "learning_rate": 1.1372651951974001e-06, + "loss": 0.0702, + "step": 23760 + }, + { + "epoch": 90.72519083969466, + "grad_norm": 0.49181249737739563, + "learning_rate": 1.1337679317719952e-06, + "loss": 0.0758, + "step": 23770 + }, + { + "epoch": 90.76335877862596, + "grad_norm": 0.3415592312812805, + "learning_rate": 1.130275366183115e-06, + "loss": 0.0721, + "step": 23780 + }, + { + "epoch": 90.80152671755725, + "grad_norm": 0.3656644821166992, + "learning_rate": 1.1267875026745562e-06, + "loss": 0.0716, + "step": 23790 + }, + { + "epoch": 90.83969465648855, + "grad_norm": 0.2646436393260956, + "learning_rate": 1.1233043454844017e-06, + "loss": 0.067, + "step": 23800 + }, + { + "epoch": 90.87786259541984, + "grad_norm": 0.7572306394577026, + "learning_rate": 1.1198258988450145e-06, + "loss": 0.0698, + "step": 23810 + }, + { + "epoch": 90.91603053435115, + "grad_norm": 0.42010462284088135, + "learning_rate": 1.116352166983037e-06, + "loss": 0.0675, + "step": 23820 + }, + { + "epoch": 90.95419847328245, + "grad_norm": 0.26916852593421936, + "learning_rate": 1.112883154119382e-06, + "loss": 0.0651, + "step": 23830 + }, + { + "epoch": 90.99236641221374, + "grad_norm": 0.30210235714912415, + "learning_rate": 1.1094188644692255e-06, + "loss": 0.0734, + "step": 23840 + }, + { + "epoch": 91.03053435114504, + "grad_norm": 0.41560983657836914, + "learning_rate": 1.1059593022420067e-06, + "loss": 0.063, + "step": 23850 + }, + { + "epoch": 91.06870229007633, + "grad_norm": 0.37110573053359985, + "learning_rate": 1.1025044716414185e-06, + "loss": 0.0748, + "step": 23860 + }, + { + "epoch": 91.10687022900764, + "grad_norm": 0.5967647433280945, + "learning_rate": 1.0990543768654084e-06, + "loss": 0.0675, + "step": 23870 + }, + { + "epoch": 91.14503816793894, + "grad_norm": 0.2824907600879669, + "learning_rate": 1.0956090221061655e-06, + "loss": 0.0612, + "step": 23880 + }, + { + "epoch": 91.18320610687023, + "grad_norm": 0.536435604095459, + "learning_rate": 1.0921684115501208e-06, + "loss": 0.06, + "step": 23890 + }, + { + "epoch": 91.22137404580153, + "grad_norm": 0.25722911953926086, + "learning_rate": 1.0887325493779405e-06, + "loss": 0.0671, + "step": 23900 + }, + { + "epoch": 91.25954198473282, + "grad_norm": 0.28526943922042847, + "learning_rate": 1.0853014397645205e-06, + "loss": 0.0684, + "step": 23910 + }, + { + "epoch": 91.29770992366412, + "grad_norm": 0.28562769293785095, + "learning_rate": 1.0818750868789828e-06, + "loss": 0.0637, + "step": 23920 + }, + { + "epoch": 91.33587786259542, + "grad_norm": 0.3431946039199829, + "learning_rate": 1.0784534948846704e-06, + "loss": 0.0651, + "step": 23930 + }, + { + "epoch": 91.37404580152672, + "grad_norm": 1.2695426940917969, + "learning_rate": 1.0750366679391393e-06, + "loss": 0.0718, + "step": 23940 + }, + { + "epoch": 91.41221374045801, + "grad_norm": 0.27434664964675903, + "learning_rate": 1.0716246101941558e-06, + "loss": 0.0692, + "step": 23950 + }, + { + "epoch": 91.45038167938931, + "grad_norm": 0.30581653118133545, + "learning_rate": 1.0682173257956935e-06, + "loss": 0.0685, + "step": 23960 + }, + { + "epoch": 91.4885496183206, + "grad_norm": 0.2575470209121704, + "learning_rate": 1.064814818883922e-06, + "loss": 0.0675, + "step": 23970 + }, + { + "epoch": 91.52671755725191, + "grad_norm": 0.31820449233055115, + "learning_rate": 1.06141709359321e-06, + "loss": 0.0752, + "step": 23980 + }, + { + "epoch": 91.56488549618321, + "grad_norm": 0.5742216110229492, + "learning_rate": 1.0580241540521142e-06, + "loss": 0.0847, + "step": 23990 + }, + { + "epoch": 91.6030534351145, + "grad_norm": 0.545124351978302, + "learning_rate": 1.0546360043833747e-06, + "loss": 0.0692, + "step": 24000 + }, + { + "epoch": 91.6412213740458, + "grad_norm": 0.27798354625701904, + "learning_rate": 1.0512526487039138e-06, + "loss": 0.071, + "step": 24010 + }, + { + "epoch": 91.6793893129771, + "grad_norm": 0.6156341433525085, + "learning_rate": 1.0478740911248259e-06, + "loss": 0.0694, + "step": 24020 + }, + { + "epoch": 91.7175572519084, + "grad_norm": 0.318088173866272, + "learning_rate": 1.0445003357513794e-06, + "loss": 0.0695, + "step": 24030 + }, + { + "epoch": 91.7557251908397, + "grad_norm": 0.28530198335647583, + "learning_rate": 1.0411313866830042e-06, + "loss": 0.0656, + "step": 24040 + }, + { + "epoch": 91.79389312977099, + "grad_norm": 0.717268168926239, + "learning_rate": 1.0377672480132917e-06, + "loss": 0.0689, + "step": 24050 + }, + { + "epoch": 91.83206106870229, + "grad_norm": 0.2844495475292206, + "learning_rate": 1.0344079238299865e-06, + "loss": 0.0672, + "step": 24060 + }, + { + "epoch": 91.87022900763358, + "grad_norm": 0.8445849418640137, + "learning_rate": 1.0310534182149835e-06, + "loss": 0.074, + "step": 24070 + }, + { + "epoch": 91.90839694656489, + "grad_norm": 0.6028487682342529, + "learning_rate": 1.0277037352443258e-06, + "loss": 0.0714, + "step": 24080 + }, + { + "epoch": 91.94656488549619, + "grad_norm": 0.29483121633529663, + "learning_rate": 1.0243588789881931e-06, + "loss": 0.0693, + "step": 24090 + }, + { + "epoch": 91.98473282442748, + "grad_norm": 0.2845158576965332, + "learning_rate": 1.0210188535108995e-06, + "loss": 0.0692, + "step": 24100 + }, + { + "epoch": 92.02290076335878, + "grad_norm": 0.28095564246177673, + "learning_rate": 1.0176836628708937e-06, + "loss": 0.0705, + "step": 24110 + }, + { + "epoch": 92.06106870229007, + "grad_norm": 0.6553061008453369, + "learning_rate": 1.0143533111207455e-06, + "loss": 0.0724, + "step": 24120 + }, + { + "epoch": 92.09923664122137, + "grad_norm": 0.22242268919944763, + "learning_rate": 1.0110278023071445e-06, + "loss": 0.0688, + "step": 24130 + }, + { + "epoch": 92.13740458015268, + "grad_norm": 0.4113728106021881, + "learning_rate": 1.007707140470901e-06, + "loss": 0.0726, + "step": 24140 + }, + { + "epoch": 92.17557251908397, + "grad_norm": 0.22345533967018127, + "learning_rate": 1.00439132964693e-06, + "loss": 0.0668, + "step": 24150 + }, + { + "epoch": 92.21374045801527, + "grad_norm": 0.7509258389472961, + "learning_rate": 1.001080373864255e-06, + "loss": 0.0783, + "step": 24160 + }, + { + "epoch": 92.25190839694656, + "grad_norm": 0.3895869255065918, + "learning_rate": 9.977742771459992e-07, + "loss": 0.0698, + "step": 24170 + }, + { + "epoch": 92.29007633587786, + "grad_norm": 0.360806941986084, + "learning_rate": 9.944730435093803e-07, + "loss": 0.0658, + "step": 24180 + }, + { + "epoch": 92.32824427480917, + "grad_norm": 0.6112274527549744, + "learning_rate": 9.911766769657116e-07, + "loss": 0.0724, + "step": 24190 + }, + { + "epoch": 92.36641221374046, + "grad_norm": 0.5387519598007202, + "learning_rate": 9.878851815203883e-07, + "loss": 0.0705, + "step": 24200 + }, + { + "epoch": 92.40458015267176, + "grad_norm": 0.9864296913146973, + "learning_rate": 9.845985611728864e-07, + "loss": 0.0681, + "step": 24210 + }, + { + "epoch": 92.44274809160305, + "grad_norm": 0.337456613779068, + "learning_rate": 9.813168199167604e-07, + "loss": 0.0694, + "step": 24220 + }, + { + "epoch": 92.48091603053435, + "grad_norm": 0.3321627974510193, + "learning_rate": 9.78039961739634e-07, + "loss": 0.0657, + "step": 24230 + }, + { + "epoch": 92.51908396946565, + "grad_norm": 0.5132209062576294, + "learning_rate": 9.747679906232016e-07, + "loss": 0.0679, + "step": 24240 + }, + { + "epoch": 92.55725190839695, + "grad_norm": 0.503035843372345, + "learning_rate": 9.71500910543214e-07, + "loss": 0.0673, + "step": 24250 + }, + { + "epoch": 92.59541984732824, + "grad_norm": 0.21332845091819763, + "learning_rate": 9.682387254694835e-07, + "loss": 0.073, + "step": 24260 + }, + { + "epoch": 92.63358778625954, + "grad_norm": 0.32724636793136597, + "learning_rate": 9.649814393658725e-07, + "loss": 0.0662, + "step": 24270 + }, + { + "epoch": 92.67175572519083, + "grad_norm": 0.607822597026825, + "learning_rate": 9.61729056190288e-07, + "loss": 0.0681, + "step": 24280 + }, + { + "epoch": 92.70992366412214, + "grad_norm": 0.3636825978755951, + "learning_rate": 9.584815798946862e-07, + "loss": 0.072, + "step": 24290 + }, + { + "epoch": 92.74809160305344, + "grad_norm": 0.22937163710594177, + "learning_rate": 9.552390144250552e-07, + "loss": 0.0665, + "step": 24300 + }, + { + "epoch": 92.78625954198473, + "grad_norm": 0.2677508294582367, + "learning_rate": 9.520013637214176e-07, + "loss": 0.0675, + "step": 24310 + }, + { + "epoch": 92.82442748091603, + "grad_norm": 0.2615829408168793, + "learning_rate": 9.487686317178241e-07, + "loss": 0.0653, + "step": 24320 + }, + { + "epoch": 92.86259541984732, + "grad_norm": 0.2584886848926544, + "learning_rate": 9.455408223423496e-07, + "loss": 0.0643, + "step": 24330 + }, + { + "epoch": 92.90076335877862, + "grad_norm": 0.24199534952640533, + "learning_rate": 9.423179395170845e-07, + "loss": 0.0702, + "step": 24340 + }, + { + "epoch": 92.93893129770993, + "grad_norm": 0.3466721475124359, + "learning_rate": 9.390999871581391e-07, + "loss": 0.0767, + "step": 24350 + }, + { + "epoch": 92.97709923664122, + "grad_norm": 0.5451226830482483, + "learning_rate": 9.358869691756273e-07, + "loss": 0.0705, + "step": 24360 + }, + { + "epoch": 93.01526717557252, + "grad_norm": 0.5285539627075195, + "learning_rate": 9.326788894736688e-07, + "loss": 0.0692, + "step": 24370 + }, + { + "epoch": 93.05343511450381, + "grad_norm": 0.5585796236991882, + "learning_rate": 9.294757519503811e-07, + "loss": 0.0677, + "step": 24380 + }, + { + "epoch": 93.09160305343511, + "grad_norm": 0.31735959649086, + "learning_rate": 9.262775604978819e-07, + "loss": 0.073, + "step": 24390 + }, + { + "epoch": 93.12977099236642, + "grad_norm": 0.2296765148639679, + "learning_rate": 9.230843190022726e-07, + "loss": 0.0698, + "step": 24400 + }, + { + "epoch": 93.16793893129771, + "grad_norm": 0.5008897185325623, + "learning_rate": 9.198960313436444e-07, + "loss": 0.0874, + "step": 24410 + }, + { + "epoch": 93.20610687022901, + "grad_norm": 0.4472108483314514, + "learning_rate": 9.16712701396067e-07, + "loss": 0.0697, + "step": 24420 + }, + { + "epoch": 93.2442748091603, + "grad_norm": 0.2684642970561981, + "learning_rate": 9.135343330275864e-07, + "loss": 0.0654, + "step": 24430 + }, + { + "epoch": 93.2824427480916, + "grad_norm": 0.4187399744987488, + "learning_rate": 9.103609301002181e-07, + "loss": 0.0692, + "step": 24440 + }, + { + "epoch": 93.3206106870229, + "grad_norm": 0.6167700886726379, + "learning_rate": 9.071924964699491e-07, + "loss": 0.0724, + "step": 24450 + }, + { + "epoch": 93.3587786259542, + "grad_norm": 0.34823620319366455, + "learning_rate": 9.040290359867232e-07, + "loss": 0.0674, + "step": 24460 + }, + { + "epoch": 93.3969465648855, + "grad_norm": 0.27436694502830505, + "learning_rate": 9.008705524944439e-07, + "loss": 0.0692, + "step": 24470 + }, + { + "epoch": 93.43511450381679, + "grad_norm": 0.2943238914012909, + "learning_rate": 8.977170498309651e-07, + "loss": 0.0635, + "step": 24480 + }, + { + "epoch": 93.47328244274809, + "grad_norm": 0.6278515458106995, + "learning_rate": 8.945685318280917e-07, + "loss": 0.079, + "step": 24490 + }, + { + "epoch": 93.5114503816794, + "grad_norm": 0.23879125714302063, + "learning_rate": 8.914250023115672e-07, + "loss": 0.0696, + "step": 24500 + }, + { + "epoch": 93.54961832061069, + "grad_norm": 0.35707348585128784, + "learning_rate": 8.882864651010798e-07, + "loss": 0.0733, + "step": 24510 + }, + { + "epoch": 93.58778625954199, + "grad_norm": 0.3247864246368408, + "learning_rate": 8.851529240102464e-07, + "loss": 0.0663, + "step": 24520 + }, + { + "epoch": 93.62595419847328, + "grad_norm": 0.4654683768749237, + "learning_rate": 8.820243828466135e-07, + "loss": 0.072, + "step": 24530 + }, + { + "epoch": 93.66412213740458, + "grad_norm": 0.504030704498291, + "learning_rate": 8.789008454116566e-07, + "loss": 0.0712, + "step": 24540 + }, + { + "epoch": 93.70229007633588, + "grad_norm": 0.6368444561958313, + "learning_rate": 8.757823155007655e-07, + "loss": 0.0685, + "step": 24550 + }, + { + "epoch": 93.74045801526718, + "grad_norm": 0.28386011719703674, + "learning_rate": 8.72668796903251e-07, + "loss": 0.0714, + "step": 24560 + }, + { + "epoch": 93.77862595419847, + "grad_norm": 0.3366830348968506, + "learning_rate": 8.6956029340233e-07, + "loss": 0.0704, + "step": 24570 + }, + { + "epoch": 93.81679389312977, + "grad_norm": 0.391422837972641, + "learning_rate": 8.664568087751274e-07, + "loss": 0.069, + "step": 24580 + }, + { + "epoch": 93.85496183206106, + "grad_norm": 0.41178223490715027, + "learning_rate": 8.633583467926698e-07, + "loss": 0.0687, + "step": 24590 + }, + { + "epoch": 93.89312977099236, + "grad_norm": 0.23154325783252716, + "learning_rate": 8.602649112198796e-07, + "loss": 0.0677, + "step": 24600 + }, + { + "epoch": 93.93129770992367, + "grad_norm": 0.3986658751964569, + "learning_rate": 8.571765058155745e-07, + "loss": 0.0698, + "step": 24610 + }, + { + "epoch": 93.96946564885496, + "grad_norm": 0.3332964777946472, + "learning_rate": 8.540931343324582e-07, + "loss": 0.0714, + "step": 24620 + }, + { + "epoch": 94.00763358778626, + "grad_norm": 0.2473592758178711, + "learning_rate": 8.510148005171171e-07, + "loss": 0.0684, + "step": 24630 + }, + { + "epoch": 94.04580152671755, + "grad_norm": 0.5109586715698242, + "learning_rate": 8.479415081100167e-07, + "loss": 0.0721, + "step": 24640 + }, + { + "epoch": 94.08396946564885, + "grad_norm": 0.2871462404727936, + "learning_rate": 8.448732608454968e-07, + "loss": 0.065, + "step": 24650 + }, + { + "epoch": 94.12213740458016, + "grad_norm": 0.27044838666915894, + "learning_rate": 8.418100624517688e-07, + "loss": 0.0678, + "step": 24660 + }, + { + "epoch": 94.16030534351145, + "grad_norm": 0.2179548740386963, + "learning_rate": 8.387519166509062e-07, + "loss": 0.0654, + "step": 24670 + }, + { + "epoch": 94.19847328244275, + "grad_norm": 0.2881402373313904, + "learning_rate": 8.356988271588445e-07, + "loss": 0.0738, + "step": 24680 + }, + { + "epoch": 94.23664122137404, + "grad_norm": 0.23480482399463654, + "learning_rate": 8.326507976853765e-07, + "loss": 0.067, + "step": 24690 + }, + { + "epoch": 94.27480916030534, + "grad_norm": 0.5004714131355286, + "learning_rate": 8.296078319341444e-07, + "loss": 0.0707, + "step": 24700 + }, + { + "epoch": 94.31297709923665, + "grad_norm": 0.41305282711982727, + "learning_rate": 8.265699336026384e-07, + "loss": 0.0795, + "step": 24710 + }, + { + "epoch": 94.35114503816794, + "grad_norm": 0.7068077325820923, + "learning_rate": 8.235371063821923e-07, + "loss": 0.0639, + "step": 24720 + }, + { + "epoch": 94.38931297709924, + "grad_norm": 0.3450511693954468, + "learning_rate": 8.205093539579768e-07, + "loss": 0.0691, + "step": 24730 + }, + { + "epoch": 94.42748091603053, + "grad_norm": 0.3504796624183655, + "learning_rate": 8.174866800089964e-07, + "loss": 0.0697, + "step": 24740 + }, + { + "epoch": 94.46564885496183, + "grad_norm": 0.9486528635025024, + "learning_rate": 8.144690882080853e-07, + "loss": 0.0706, + "step": 24750 + }, + { + "epoch": 94.50381679389314, + "grad_norm": 0.4473707973957062, + "learning_rate": 8.114565822219006e-07, + "loss": 0.0683, + "step": 24760 + }, + { + "epoch": 94.54198473282443, + "grad_norm": 0.2520901560783386, + "learning_rate": 8.084491657109233e-07, + "loss": 0.0697, + "step": 24770 + }, + { + "epoch": 94.58015267175573, + "grad_norm": 0.31963804364204407, + "learning_rate": 8.054468423294476e-07, + "loss": 0.072, + "step": 24780 + }, + { + "epoch": 94.61832061068702, + "grad_norm": 0.6297641396522522, + "learning_rate": 8.024496157255784e-07, + "loss": 0.0716, + "step": 24790 + }, + { + "epoch": 94.65648854961832, + "grad_norm": 0.4397490322589874, + "learning_rate": 7.994574895412294e-07, + "loss": 0.0687, + "step": 24800 + }, + { + "epoch": 94.69465648854961, + "grad_norm": 0.3805553615093231, + "learning_rate": 7.964704674121149e-07, + "loss": 0.0654, + "step": 24810 + }, + { + "epoch": 94.73282442748092, + "grad_norm": 0.3672832250595093, + "learning_rate": 7.934885529677505e-07, + "loss": 0.068, + "step": 24820 + }, + { + "epoch": 94.77099236641222, + "grad_norm": 0.2700408399105072, + "learning_rate": 7.905117498314413e-07, + "loss": 0.0652, + "step": 24830 + }, + { + "epoch": 94.80916030534351, + "grad_norm": 0.20384560525417328, + "learning_rate": 7.875400616202861e-07, + "loss": 0.074, + "step": 24840 + }, + { + "epoch": 94.8473282442748, + "grad_norm": 0.27636826038360596, + "learning_rate": 7.845734919451647e-07, + "loss": 0.0737, + "step": 24850 + }, + { + "epoch": 94.8854961832061, + "grad_norm": 0.9613207578659058, + "learning_rate": 7.816120444107384e-07, + "loss": 0.0823, + "step": 24860 + }, + { + "epoch": 94.92366412213741, + "grad_norm": 0.3635960817337036, + "learning_rate": 7.786557226154473e-07, + "loss": 0.0747, + "step": 24870 + }, + { + "epoch": 94.9618320610687, + "grad_norm": 0.276128888130188, + "learning_rate": 7.757045301514998e-07, + "loss": 0.0715, + "step": 24880 + }, + { + "epoch": 95.0, + "grad_norm": 0.4147992432117462, + "learning_rate": 7.727584706048735e-07, + "loss": 0.0695, + "step": 24890 + }, + { + "epoch": 95.0381679389313, + "grad_norm": 0.3812028467655182, + "learning_rate": 7.698175475553076e-07, + "loss": 0.0689, + "step": 24900 + }, + { + "epoch": 95.07633587786259, + "grad_norm": 0.3648328483104706, + "learning_rate": 7.668817645763021e-07, + "loss": 0.073, + "step": 24910 + }, + { + "epoch": 95.1145038167939, + "grad_norm": 0.4475592076778412, + "learning_rate": 7.63951125235109e-07, + "loss": 0.0769, + "step": 24920 + }, + { + "epoch": 95.1526717557252, + "grad_norm": 0.3380667269229889, + "learning_rate": 7.610256330927323e-07, + "loss": 0.0676, + "step": 24930 + }, + { + "epoch": 95.19083969465649, + "grad_norm": 0.8378775119781494, + "learning_rate": 7.581052917039211e-07, + "loss": 0.0733, + "step": 24940 + }, + { + "epoch": 95.22900763358778, + "grad_norm": 0.21355819702148438, + "learning_rate": 7.551901046171645e-07, + "loss": 0.0696, + "step": 24950 + }, + { + "epoch": 95.26717557251908, + "grad_norm": 0.41850680112838745, + "learning_rate": 7.522800753746895e-07, + "loss": 0.0662, + "step": 24960 + }, + { + "epoch": 95.30534351145039, + "grad_norm": 0.3715193271636963, + "learning_rate": 7.493752075124577e-07, + "loss": 0.0705, + "step": 24970 + }, + { + "epoch": 95.34351145038168, + "grad_norm": 0.6367314457893372, + "learning_rate": 7.464755045601557e-07, + "loss": 0.0645, + "step": 24980 + }, + { + "epoch": 95.38167938931298, + "grad_norm": 0.2814936935901642, + "learning_rate": 7.435809700411972e-07, + "loss": 0.0774, + "step": 24990 + }, + { + "epoch": 95.41984732824427, + "grad_norm": 0.22556699812412262, + "learning_rate": 7.40691607472715e-07, + "loss": 0.067, + "step": 25000 + }, + { + "epoch": 95.45801526717557, + "grad_norm": 0.8220465183258057, + "learning_rate": 7.378074203655561e-07, + "loss": 0.0731, + "step": 25010 + }, + { + "epoch": 95.49618320610686, + "grad_norm": 0.2574046552181244, + "learning_rate": 7.349284122242783e-07, + "loss": 0.0736, + "step": 25020 + }, + { + "epoch": 95.53435114503817, + "grad_norm": 0.5456118583679199, + "learning_rate": 7.320545865471513e-07, + "loss": 0.0725, + "step": 25030 + }, + { + "epoch": 95.57251908396947, + "grad_norm": 0.6138007044792175, + "learning_rate": 7.291859468261426e-07, + "loss": 0.0662, + "step": 25040 + }, + { + "epoch": 95.61068702290076, + "grad_norm": 0.7047519683837891, + "learning_rate": 7.263224965469195e-07, + "loss": 0.0729, + "step": 25050 + }, + { + "epoch": 95.64885496183206, + "grad_norm": 0.20071052014827728, + "learning_rate": 7.234642391888446e-07, + "loss": 0.0697, + "step": 25060 + }, + { + "epoch": 95.68702290076335, + "grad_norm": 0.43756791949272156, + "learning_rate": 7.206111782249698e-07, + "loss": 0.0681, + "step": 25070 + }, + { + "epoch": 95.72519083969466, + "grad_norm": 0.4008914530277252, + "learning_rate": 7.177633171220339e-07, + "loss": 0.0742, + "step": 25080 + }, + { + "epoch": 95.76335877862596, + "grad_norm": 0.9968006014823914, + "learning_rate": 7.149206593404562e-07, + "loss": 0.0692, + "step": 25090 + }, + { + "epoch": 95.80152671755725, + "grad_norm": 0.2601996958255768, + "learning_rate": 7.120832083343337e-07, + "loss": 0.0672, + "step": 25100 + }, + { + "epoch": 95.83969465648855, + "grad_norm": 0.2922241687774658, + "learning_rate": 7.092509675514369e-07, + "loss": 0.0655, + "step": 25110 + }, + { + "epoch": 95.87786259541984, + "grad_norm": 0.40154197812080383, + "learning_rate": 7.064239404332063e-07, + "loss": 0.0678, + "step": 25120 + }, + { + "epoch": 95.91603053435115, + "grad_norm": 0.8301547169685364, + "learning_rate": 7.03602130414745e-07, + "loss": 0.0677, + "step": 25130 + }, + { + "epoch": 95.95419847328245, + "grad_norm": 0.28282907605171204, + "learning_rate": 7.007855409248199e-07, + "loss": 0.066, + "step": 25140 + }, + { + "epoch": 95.99236641221374, + "grad_norm": 0.278046578168869, + "learning_rate": 6.979741753858521e-07, + "loss": 0.072, + "step": 25150 + }, + { + "epoch": 96.03053435114504, + "grad_norm": 0.22601468861103058, + "learning_rate": 6.951680372139158e-07, + "loss": 0.0701, + "step": 25160 + }, + { + "epoch": 96.06870229007633, + "grad_norm": 0.3387024700641632, + "learning_rate": 6.923671298187335e-07, + "loss": 0.0757, + "step": 25170 + }, + { + "epoch": 96.10687022900764, + "grad_norm": 0.26798492670059204, + "learning_rate": 6.895714566036705e-07, + "loss": 0.0624, + "step": 25180 + }, + { + "epoch": 96.14503816793894, + "grad_norm": 0.2048361450433731, + "learning_rate": 6.86781020965736e-07, + "loss": 0.0675, + "step": 25190 + }, + { + "epoch": 96.18320610687023, + "grad_norm": 0.4887497127056122, + "learning_rate": 6.839958262955709e-07, + "loss": 0.0733, + "step": 25200 + }, + { + "epoch": 96.22137404580153, + "grad_norm": 0.37875017523765564, + "learning_rate": 6.812158759774489e-07, + "loss": 0.065, + "step": 25210 + }, + { + "epoch": 96.25954198473282, + "grad_norm": 0.3595539629459381, + "learning_rate": 6.784411733892732e-07, + "loss": 0.0682, + "step": 25220 + }, + { + "epoch": 96.29770992366412, + "grad_norm": 0.44329530000686646, + "learning_rate": 6.756717219025666e-07, + "loss": 0.0657, + "step": 25230 + }, + { + "epoch": 96.33587786259542, + "grad_norm": 0.3791472613811493, + "learning_rate": 6.729075248824762e-07, + "loss": 0.0641, + "step": 25240 + }, + { + "epoch": 96.37404580152672, + "grad_norm": 0.5714471936225891, + "learning_rate": 6.701485856877615e-07, + "loss": 0.0714, + "step": 25250 + }, + { + "epoch": 96.41221374045801, + "grad_norm": 0.5484432578086853, + "learning_rate": 6.673949076707925e-07, + "loss": 0.0691, + "step": 25260 + }, + { + "epoch": 96.45038167938931, + "grad_norm": 0.32033249735832214, + "learning_rate": 6.646464941775499e-07, + "loss": 0.0655, + "step": 25270 + }, + { + "epoch": 96.4885496183206, + "grad_norm": 0.4129583537578583, + "learning_rate": 6.619033485476129e-07, + "loss": 0.0723, + "step": 25280 + }, + { + "epoch": 96.52671755725191, + "grad_norm": 0.216600239276886, + "learning_rate": 6.591654741141639e-07, + "loss": 0.0672, + "step": 25290 + }, + { + "epoch": 96.56488549618321, + "grad_norm": 0.29276999831199646, + "learning_rate": 6.564328742039782e-07, + "loss": 0.0637, + "step": 25300 + }, + { + "epoch": 96.6030534351145, + "grad_norm": 0.20918267965316772, + "learning_rate": 6.537055521374219e-07, + "loss": 0.0667, + "step": 25310 + }, + { + "epoch": 96.6412213740458, + "grad_norm": 0.5599342584609985, + "learning_rate": 6.509835112284485e-07, + "loss": 0.0699, + "step": 25320 + }, + { + "epoch": 96.6793893129771, + "grad_norm": 0.253470242023468, + "learning_rate": 6.482667547845944e-07, + "loss": 0.069, + "step": 25330 + }, + { + "epoch": 96.7175572519084, + "grad_norm": 0.2381214201450348, + "learning_rate": 6.455552861069736e-07, + "loss": 0.0646, + "step": 25340 + }, + { + "epoch": 96.7557251908397, + "grad_norm": 0.3462165296077728, + "learning_rate": 6.428491084902788e-07, + "loss": 0.0656, + "step": 25350 + }, + { + "epoch": 96.79389312977099, + "grad_norm": 0.2496246099472046, + "learning_rate": 6.401482252227697e-07, + "loss": 0.0689, + "step": 25360 + }, + { + "epoch": 96.83206106870229, + "grad_norm": 0.4224330484867096, + "learning_rate": 6.37452639586274e-07, + "loss": 0.0723, + "step": 25370 + }, + { + "epoch": 96.87022900763358, + "grad_norm": 0.22836147248744965, + "learning_rate": 6.347623548561827e-07, + "loss": 0.0675, + "step": 25380 + }, + { + "epoch": 96.90839694656489, + "grad_norm": 0.33980873227119446, + "learning_rate": 6.320773743014441e-07, + "loss": 0.0696, + "step": 25390 + }, + { + "epoch": 96.94656488549619, + "grad_norm": 0.5132980942726135, + "learning_rate": 6.293977011845648e-07, + "loss": 0.0704, + "step": 25400 + }, + { + "epoch": 96.98473282442748, + "grad_norm": 0.2276553362607956, + "learning_rate": 6.267233387615984e-07, + "loss": 0.068, + "step": 25410 + }, + { + "epoch": 97.02290076335878, + "grad_norm": 0.3203546106815338, + "learning_rate": 6.2405429028215e-07, + "loss": 0.077, + "step": 25420 + }, + { + "epoch": 97.06106870229007, + "grad_norm": 0.5133867859840393, + "learning_rate": 6.213905589893632e-07, + "loss": 0.0696, + "step": 25430 + }, + { + "epoch": 97.09923664122137, + "grad_norm": 0.810599684715271, + "learning_rate": 6.187321481199221e-07, + "loss": 0.07, + "step": 25440 + }, + { + "epoch": 97.13740458015268, + "grad_norm": 0.36244019865989685, + "learning_rate": 6.16079060904049e-07, + "loss": 0.0705, + "step": 25450 + }, + { + "epoch": 97.17557251908397, + "grad_norm": 0.383052259683609, + "learning_rate": 6.134313005654929e-07, + "loss": 0.0666, + "step": 25460 + }, + { + "epoch": 97.21374045801527, + "grad_norm": 0.5795229077339172, + "learning_rate": 6.107888703215337e-07, + "loss": 0.0699, + "step": 25470 + }, + { + "epoch": 97.25190839694656, + "grad_norm": 0.37245771288871765, + "learning_rate": 6.081517733829723e-07, + "loss": 0.0668, + "step": 25480 + }, + { + "epoch": 97.29007633587786, + "grad_norm": 0.47016429901123047, + "learning_rate": 6.055200129541294e-07, + "loss": 0.0689, + "step": 25490 + }, + { + "epoch": 97.32824427480917, + "grad_norm": 0.21799707412719727, + "learning_rate": 6.028935922328444e-07, + "loss": 0.0626, + "step": 25500 + }, + { + "epoch": 97.36641221374046, + "grad_norm": 0.2931235432624817, + "learning_rate": 6.002725144104649e-07, + "loss": 0.0733, + "step": 25510 + }, + { + "epoch": 97.40458015267176, + "grad_norm": 0.5701701641082764, + "learning_rate": 5.976567826718476e-07, + "loss": 0.0735, + "step": 25520 + }, + { + "epoch": 97.44274809160305, + "grad_norm": 0.5489487648010254, + "learning_rate": 5.950464001953532e-07, + "loss": 0.0786, + "step": 25530 + }, + { + "epoch": 97.48091603053435, + "grad_norm": 0.4438866376876831, + "learning_rate": 5.924413701528415e-07, + "loss": 0.0679, + "step": 25540 + }, + { + "epoch": 97.51908396946565, + "grad_norm": 0.6267343163490295, + "learning_rate": 5.898416957096704e-07, + "loss": 0.0724, + "step": 25550 + }, + { + "epoch": 97.55725190839695, + "grad_norm": 0.3363583981990814, + "learning_rate": 5.872473800246914e-07, + "loss": 0.0681, + "step": 25560 + }, + { + "epoch": 97.59541984732824, + "grad_norm": 0.5211305022239685, + "learning_rate": 5.846584262502403e-07, + "loss": 0.0794, + "step": 25570 + }, + { + "epoch": 97.63358778625954, + "grad_norm": 0.32496604323387146, + "learning_rate": 5.820748375321411e-07, + "loss": 0.0661, + "step": 25580 + }, + { + "epoch": 97.67175572519083, + "grad_norm": 0.29008033871650696, + "learning_rate": 5.794966170096977e-07, + "loss": 0.0665, + "step": 25590 + }, + { + "epoch": 97.70992366412214, + "grad_norm": 0.4152953624725342, + "learning_rate": 5.769237678156897e-07, + "loss": 0.0687, + "step": 25600 + }, + { + "epoch": 97.74809160305344, + "grad_norm": 0.33877304196357727, + "learning_rate": 5.743562930763735e-07, + "loss": 0.067, + "step": 25610 + }, + { + "epoch": 97.78625954198473, + "grad_norm": 0.2797228693962097, + "learning_rate": 5.717941959114726e-07, + "loss": 0.076, + "step": 25620 + }, + { + "epoch": 97.82442748091603, + "grad_norm": 0.3312159776687622, + "learning_rate": 5.69237479434176e-07, + "loss": 0.0684, + "step": 25630 + }, + { + "epoch": 97.86259541984732, + "grad_norm": 0.36744144558906555, + "learning_rate": 5.666861467511353e-07, + "loss": 0.0686, + "step": 25640 + }, + { + "epoch": 97.90076335877862, + "grad_norm": 0.40221524238586426, + "learning_rate": 5.641402009624591e-07, + "loss": 0.0697, + "step": 25650 + }, + { + "epoch": 97.93893129770993, + "grad_norm": 0.8071361184120178, + "learning_rate": 5.615996451617145e-07, + "loss": 0.0737, + "step": 25660 + }, + { + "epoch": 97.97709923664122, + "grad_norm": 0.2889866232872009, + "learning_rate": 5.590644824359148e-07, + "loss": 0.0682, + "step": 25670 + }, + { + "epoch": 98.01526717557252, + "grad_norm": 0.27220702171325684, + "learning_rate": 5.56534715865521e-07, + "loss": 0.066, + "step": 25680 + }, + { + "epoch": 98.05343511450381, + "grad_norm": 0.2493731528520584, + "learning_rate": 5.540103485244397e-07, + "loss": 0.062, + "step": 25690 + }, + { + "epoch": 98.09160305343511, + "grad_norm": 0.33816832304000854, + "learning_rate": 5.514913834800134e-07, + "loss": 0.0676, + "step": 25700 + }, + { + "epoch": 98.12977099236642, + "grad_norm": 0.7667673230171204, + "learning_rate": 5.489778237930238e-07, + "loss": 0.0755, + "step": 25710 + }, + { + "epoch": 98.16793893129771, + "grad_norm": 0.35118091106414795, + "learning_rate": 5.464696725176827e-07, + "loss": 0.0683, + "step": 25720 + }, + { + "epoch": 98.20610687022901, + "grad_norm": 0.28815001249313354, + "learning_rate": 5.439669327016295e-07, + "loss": 0.0702, + "step": 25730 + }, + { + "epoch": 98.2442748091603, + "grad_norm": 0.5837283134460449, + "learning_rate": 5.414696073859299e-07, + "loss": 0.0691, + "step": 25740 + }, + { + "epoch": 98.2824427480916, + "grad_norm": 0.37461745738983154, + "learning_rate": 5.389776996050695e-07, + "loss": 0.0708, + "step": 25750 + }, + { + "epoch": 98.3206106870229, + "grad_norm": 0.23216131329536438, + "learning_rate": 5.364912123869493e-07, + "loss": 0.0671, + "step": 25760 + }, + { + "epoch": 98.3587786259542, + "grad_norm": 0.3143027424812317, + "learning_rate": 5.340101487528887e-07, + "loss": 0.0693, + "step": 25770 + }, + { + "epoch": 98.3969465648855, + "grad_norm": 0.35901618003845215, + "learning_rate": 5.315345117176129e-07, + "loss": 0.0713, + "step": 25780 + }, + { + "epoch": 98.43511450381679, + "grad_norm": 0.22512131929397583, + "learning_rate": 5.290643042892541e-07, + "loss": 0.0714, + "step": 25790 + }, + { + "epoch": 98.47328244274809, + "grad_norm": 0.2000698745250702, + "learning_rate": 5.265995294693472e-07, + "loss": 0.0681, + "step": 25800 + }, + { + "epoch": 98.5114503816794, + "grad_norm": 1.0218034982681274, + "learning_rate": 5.241401902528253e-07, + "loss": 0.0682, + "step": 25810 + }, + { + "epoch": 98.54961832061069, + "grad_norm": 0.3218908905982971, + "learning_rate": 5.21686289628019e-07, + "loss": 0.066, + "step": 25820 + }, + { + "epoch": 98.58778625954199, + "grad_norm": 0.4360622465610504, + "learning_rate": 5.192378305766472e-07, + "loss": 0.0684, + "step": 25830 + }, + { + "epoch": 98.62595419847328, + "grad_norm": 0.2918654680252075, + "learning_rate": 5.167948160738206e-07, + "loss": 0.0634, + "step": 25840 + }, + { + "epoch": 98.66412213740458, + "grad_norm": 0.5152965188026428, + "learning_rate": 5.143572490880311e-07, + "loss": 0.0678, + "step": 25850 + }, + { + "epoch": 98.70229007633588, + "grad_norm": 1.0082851648330688, + "learning_rate": 5.119251325811513e-07, + "loss": 0.066, + "step": 25860 + }, + { + "epoch": 98.74045801526718, + "grad_norm": 0.28817683458328247, + "learning_rate": 5.094984695084348e-07, + "loss": 0.0629, + "step": 25870 + }, + { + "epoch": 98.77862595419847, + "grad_norm": 0.5672718286514282, + "learning_rate": 5.070772628185039e-07, + "loss": 0.0688, + "step": 25880 + }, + { + "epoch": 98.81679389312977, + "grad_norm": 0.7285252213478088, + "learning_rate": 5.046615154533535e-07, + "loss": 0.0681, + "step": 25890 + }, + { + "epoch": 98.85496183206106, + "grad_norm": 0.4292791783809662, + "learning_rate": 5.022512303483451e-07, + "loss": 0.0658, + "step": 25900 + }, + { + "epoch": 98.89312977099236, + "grad_norm": 0.20751267671585083, + "learning_rate": 4.998464104322015e-07, + "loss": 0.0703, + "step": 25910 + }, + { + "epoch": 98.93129770992367, + "grad_norm": 0.2615070939064026, + "learning_rate": 4.974470586270047e-07, + "loss": 0.0651, + "step": 25920 + }, + { + "epoch": 98.96946564885496, + "grad_norm": 0.2248445302248001, + "learning_rate": 4.950531778481965e-07, + "loss": 0.0672, + "step": 25930 + }, + { + "epoch": 99.00763358778626, + "grad_norm": 0.4531436264514923, + "learning_rate": 4.926647710045652e-07, + "loss": 0.0625, + "step": 25940 + }, + { + "epoch": 99.04580152671755, + "grad_norm": 0.4915529787540436, + "learning_rate": 4.902818409982513e-07, + "loss": 0.066, + "step": 25950 + }, + { + "epoch": 99.08396946564885, + "grad_norm": 0.28831353783607483, + "learning_rate": 4.879043907247383e-07, + "loss": 0.0672, + "step": 25960 + }, + { + "epoch": 99.12213740458016, + "grad_norm": 0.2686160206794739, + "learning_rate": 4.855324230728542e-07, + "loss": 0.0696, + "step": 25970 + }, + { + "epoch": 99.16030534351145, + "grad_norm": 0.3270827829837799, + "learning_rate": 4.831659409247619e-07, + "loss": 0.0645, + "step": 25980 + }, + { + "epoch": 99.19847328244275, + "grad_norm": 0.25841906666755676, + "learning_rate": 4.808049471559617e-07, + "loss": 0.0696, + "step": 25990 + }, + { + "epoch": 99.23664122137404, + "grad_norm": 0.4139740467071533, + "learning_rate": 4.784494446352833e-07, + "loss": 0.0721, + "step": 26000 + }, + { + "epoch": 99.27480916030534, + "grad_norm": 0.6838223338127136, + "learning_rate": 4.7609943622488333e-07, + "loss": 0.0711, + "step": 26010 + }, + { + "epoch": 99.31297709923665, + "grad_norm": 0.5432331562042236, + "learning_rate": 4.737549247802442e-07, + "loss": 0.0803, + "step": 26020 + }, + { + "epoch": 99.35114503816794, + "grad_norm": 0.7913231253623962, + "learning_rate": 4.714159131501689e-07, + "loss": 0.0696, + "step": 26030 + }, + { + "epoch": 99.38931297709924, + "grad_norm": 0.5764281749725342, + "learning_rate": 4.690824041767766e-07, + "loss": 0.0753, + "step": 26040 + }, + { + "epoch": 99.42748091603053, + "grad_norm": 0.793904185295105, + "learning_rate": 4.66754400695501e-07, + "loss": 0.0798, + "step": 26050 + }, + { + "epoch": 99.46564885496183, + "grad_norm": 0.38731345534324646, + "learning_rate": 4.6443190553508597e-07, + "loss": 0.0695, + "step": 26060 + }, + { + "epoch": 99.50381679389314, + "grad_norm": 0.3625410199165344, + "learning_rate": 4.6211492151758076e-07, + "loss": 0.0681, + "step": 26070 + }, + { + "epoch": 99.54198473282443, + "grad_norm": 0.47990864515304565, + "learning_rate": 4.598034514583416e-07, + "loss": 0.0696, + "step": 26080 + }, + { + "epoch": 99.58015267175573, + "grad_norm": 0.2346879243850708, + "learning_rate": 4.574974981660213e-07, + "loss": 0.0675, + "step": 26090 + }, + { + "epoch": 99.61832061068702, + "grad_norm": 0.29999473690986633, + "learning_rate": 4.5519706444257073e-07, + "loss": 0.0679, + "step": 26100 + }, + { + "epoch": 99.65648854961832, + "grad_norm": 0.516541063785553, + "learning_rate": 4.529021530832328e-07, + "loss": 0.0648, + "step": 26110 + }, + { + "epoch": 99.69465648854961, + "grad_norm": 0.32835254073143005, + "learning_rate": 4.506127668765431e-07, + "loss": 0.0666, + "step": 26120 + }, + { + "epoch": 99.73282442748092, + "grad_norm": 0.3316701054573059, + "learning_rate": 4.483289086043197e-07, + "loss": 0.0686, + "step": 26130 + }, + { + "epoch": 99.77099236641222, + "grad_norm": 0.27133581042289734, + "learning_rate": 4.460505810416682e-07, + "loss": 0.0713, + "step": 26140 + }, + { + "epoch": 99.80916030534351, + "grad_norm": 0.6455039978027344, + "learning_rate": 4.437777869569698e-07, + "loss": 0.0716, + "step": 26150 + }, + { + "epoch": 99.8473282442748, + "grad_norm": 0.3414127826690674, + "learning_rate": 4.415105291118843e-07, + "loss": 0.0654, + "step": 26160 + }, + { + "epoch": 99.8854961832061, + "grad_norm": 0.28300490975379944, + "learning_rate": 4.392488102613435e-07, + "loss": 0.0684, + "step": 26170 + }, + { + "epoch": 99.92366412213741, + "grad_norm": 0.3191199004650116, + "learning_rate": 4.3699263315354735e-07, + "loss": 0.0706, + "step": 26180 + }, + { + "epoch": 99.9618320610687, + "grad_norm": 0.25766992568969727, + "learning_rate": 4.3474200052996685e-07, + "loss": 0.0659, + "step": 26190 + }, + { + "epoch": 100.0, + "grad_norm": 0.3972707986831665, + "learning_rate": 4.324969151253317e-07, + "loss": 0.0705, + "step": 26200 + }, + { + "epoch": 100.0381679389313, + "grad_norm": 0.21596112847328186, + "learning_rate": 4.302573796676313e-07, + "loss": 0.0675, + "step": 26210 + }, + { + "epoch": 100.07633587786259, + "grad_norm": 0.3397051692008972, + "learning_rate": 4.280233968781139e-07, + "loss": 0.0658, + "step": 26220 + }, + { + "epoch": 100.1145038167939, + "grad_norm": 0.32072919607162476, + "learning_rate": 4.257949694712771e-07, + "loss": 0.0702, + "step": 26230 + }, + { + "epoch": 100.1526717557252, + "grad_norm": 0.2912977337837219, + "learning_rate": 4.235721001548726e-07, + "loss": 0.0697, + "step": 26240 + }, + { + "epoch": 100.19083969465649, + "grad_norm": 0.3459882140159607, + "learning_rate": 4.21354791629896e-07, + "loss": 0.069, + "step": 26250 + }, + { + "epoch": 100.22900763358778, + "grad_norm": 0.19958975911140442, + "learning_rate": 4.1914304659058437e-07, + "loss": 0.0626, + "step": 26260 + }, + { + "epoch": 100.26717557251908, + "grad_norm": 0.35708945989608765, + "learning_rate": 4.1693686772441846e-07, + "loss": 0.0687, + "step": 26270 + }, + { + "epoch": 100.30534351145039, + "grad_norm": 0.7189772129058838, + "learning_rate": 4.1473625771211224e-07, + "loss": 0.0748, + "step": 26280 + }, + { + "epoch": 100.34351145038168, + "grad_norm": 0.21925044059753418, + "learning_rate": 4.125412192276157e-07, + "loss": 0.0681, + "step": 26290 + }, + { + "epoch": 100.38167938931298, + "grad_norm": 0.3142671585083008, + "learning_rate": 4.1035175493810696e-07, + "loss": 0.0725, + "step": 26300 + }, + { + "epoch": 100.41984732824427, + "grad_norm": 0.5476031303405762, + "learning_rate": 4.081678675039913e-07, + "loss": 0.0667, + "step": 26310 + }, + { + "epoch": 100.45801526717557, + "grad_norm": 0.2075609415769577, + "learning_rate": 4.059895595788987e-07, + "loss": 0.0669, + "step": 26320 + }, + { + "epoch": 100.49618320610686, + "grad_norm": 0.3402668237686157, + "learning_rate": 4.038168338096776e-07, + "loss": 0.0715, + "step": 26330 + }, + { + "epoch": 100.53435114503817, + "grad_norm": 0.2339000403881073, + "learning_rate": 4.016496928363944e-07, + "loss": 0.0724, + "step": 26340 + }, + { + "epoch": 100.57251908396947, + "grad_norm": 0.435660183429718, + "learning_rate": 3.994881392923317e-07, + "loss": 0.072, + "step": 26350 + }, + { + "epoch": 100.61068702290076, + "grad_norm": 0.24794331192970276, + "learning_rate": 3.973321758039794e-07, + "loss": 0.0633, + "step": 26360 + }, + { + "epoch": 100.64885496183206, + "grad_norm": 0.2800830602645874, + "learning_rate": 3.9518180499103665e-07, + "loss": 0.0644, + "step": 26370 + }, + { + "epoch": 100.68702290076335, + "grad_norm": 0.25040721893310547, + "learning_rate": 3.930370294664071e-07, + "loss": 0.0698, + "step": 26380 + }, + { + "epoch": 100.72519083969466, + "grad_norm": 0.2757255733013153, + "learning_rate": 3.9089785183619386e-07, + "loss": 0.0644, + "step": 26390 + }, + { + "epoch": 100.76335877862596, + "grad_norm": 0.6161481738090515, + "learning_rate": 3.8876427469970167e-07, + "loss": 0.0693, + "step": 26400 + }, + { + "epoch": 100.80152671755725, + "grad_norm": 0.24093779921531677, + "learning_rate": 3.866363006494256e-07, + "loss": 0.0653, + "step": 26410 + }, + { + "epoch": 100.83969465648855, + "grad_norm": 0.27081120014190674, + "learning_rate": 3.845139322710573e-07, + "loss": 0.0662, + "step": 26420 + }, + { + "epoch": 100.87786259541984, + "grad_norm": 0.2640981376171112, + "learning_rate": 3.823971721434727e-07, + "loss": 0.0657, + "step": 26430 + }, + { + "epoch": 100.91603053435115, + "grad_norm": 1.1142594814300537, + "learning_rate": 3.8028602283873504e-07, + "loss": 0.0662, + "step": 26440 + }, + { + "epoch": 100.95419847328245, + "grad_norm": 0.3841104209423065, + "learning_rate": 3.781804869220912e-07, + "loss": 0.07, + "step": 26450 + }, + { + "epoch": 100.99236641221374, + "grad_norm": 0.23676466941833496, + "learning_rate": 3.760805669519646e-07, + "loss": 0.0676, + "step": 26460 + }, + { + "epoch": 101.03053435114504, + "grad_norm": 0.7228853702545166, + "learning_rate": 3.7398626547995585e-07, + "loss": 0.0881, + "step": 26470 + }, + { + "epoch": 101.06870229007633, + "grad_norm": 0.665624737739563, + "learning_rate": 3.7189758505083994e-07, + "loss": 0.0639, + "step": 26480 + }, + { + "epoch": 101.10687022900764, + "grad_norm": 0.30101853609085083, + "learning_rate": 3.6981452820255836e-07, + "loss": 0.0621, + "step": 26490 + }, + { + "epoch": 101.14503816793894, + "grad_norm": 0.36952438950538635, + "learning_rate": 3.6773709746622365e-07, + "loss": 0.0672, + "step": 26500 + }, + { + "epoch": 101.18320610687023, + "grad_norm": 0.528901219367981, + "learning_rate": 3.6566529536611e-07, + "loss": 0.07, + "step": 26510 + }, + { + "epoch": 101.22137404580153, + "grad_norm": 0.3763343393802643, + "learning_rate": 3.635991244196513e-07, + "loss": 0.0788, + "step": 26520 + }, + { + "epoch": 101.25954198473282, + "grad_norm": 0.33685335516929626, + "learning_rate": 3.615385871374405e-07, + "loss": 0.0696, + "step": 26530 + }, + { + "epoch": 101.29770992366412, + "grad_norm": 0.2437128871679306, + "learning_rate": 3.594836860232237e-07, + "loss": 0.0682, + "step": 26540 + }, + { + "epoch": 101.33587786259542, + "grad_norm": 0.2727994918823242, + "learning_rate": 3.574344235739019e-07, + "loss": 0.0698, + "step": 26550 + }, + { + "epoch": 101.37404580152672, + "grad_norm": 0.6007474064826965, + "learning_rate": 3.553908022795194e-07, + "loss": 0.0701, + "step": 26560 + }, + { + "epoch": 101.41221374045801, + "grad_norm": 0.23269033432006836, + "learning_rate": 3.5335282462327093e-07, + "loss": 0.0656, + "step": 26570 + }, + { + "epoch": 101.45038167938931, + "grad_norm": 0.3645854592323303, + "learning_rate": 3.513204930814912e-07, + "loss": 0.068, + "step": 26580 + }, + { + "epoch": 101.4885496183206, + "grad_norm": 0.19710974395275116, + "learning_rate": 3.4929381012365424e-07, + "loss": 0.0682, + "step": 26590 + }, + { + "epoch": 101.52671755725191, + "grad_norm": 0.5920630097389221, + "learning_rate": 3.472727782123697e-07, + "loss": 0.0692, + "step": 26600 + }, + { + "epoch": 101.56488549618321, + "grad_norm": 0.7211366891860962, + "learning_rate": 3.452573998033842e-07, + "loss": 0.0689, + "step": 26610 + }, + { + "epoch": 101.6030534351145, + "grad_norm": 0.2851048409938812, + "learning_rate": 3.432476773455712e-07, + "loss": 0.0753, + "step": 26620 + }, + { + "epoch": 101.6412213740458, + "grad_norm": 0.38267195224761963, + "learning_rate": 3.412436132809338e-07, + "loss": 0.068, + "step": 26630 + }, + { + "epoch": 101.6793893129771, + "grad_norm": 0.5825901031494141, + "learning_rate": 3.392452100445975e-07, + "loss": 0.0682, + "step": 26640 + }, + { + "epoch": 101.7175572519084, + "grad_norm": 0.37534067034721375, + "learning_rate": 3.3725247006481167e-07, + "loss": 0.0746, + "step": 26650 + }, + { + "epoch": 101.7557251908397, + "grad_norm": 0.3646638095378876, + "learning_rate": 3.3526539576294315e-07, + "loss": 0.0671, + "step": 26660 + }, + { + "epoch": 101.79389312977099, + "grad_norm": 0.24642281234264374, + "learning_rate": 3.332839895534745e-07, + "loss": 0.0706, + "step": 26670 + }, + { + "epoch": 101.83206106870229, + "grad_norm": 0.45950332283973694, + "learning_rate": 3.3130825384400156e-07, + "loss": 0.0662, + "step": 26680 + }, + { + "epoch": 101.87022900763358, + "grad_norm": 0.3571898639202118, + "learning_rate": 3.293381910352278e-07, + "loss": 0.0646, + "step": 26690 + }, + { + "epoch": 101.90839694656489, + "grad_norm": 0.2520214021205902, + "learning_rate": 3.273738035209678e-07, + "loss": 0.0577, + "step": 26700 + }, + { + "epoch": 101.94656488549619, + "grad_norm": 0.28303033113479614, + "learning_rate": 3.254150936881356e-07, + "loss": 0.0689, + "step": 26710 + }, + { + "epoch": 101.98473282442748, + "grad_norm": 0.3684123754501343, + "learning_rate": 3.234620639167496e-07, + "loss": 0.0655, + "step": 26720 + }, + { + "epoch": 102.02290076335878, + "grad_norm": 0.5102269053459167, + "learning_rate": 3.2151471657992485e-07, + "loss": 0.0746, + "step": 26730 + }, + { + "epoch": 102.06106870229007, + "grad_norm": 0.24563086032867432, + "learning_rate": 3.1957305404387187e-07, + "loss": 0.0726, + "step": 26740 + }, + { + "epoch": 102.09923664122137, + "grad_norm": 0.37572887539863586, + "learning_rate": 3.176370786678934e-07, + "loss": 0.0636, + "step": 26750 + }, + { + "epoch": 102.13740458015268, + "grad_norm": 0.2450253665447235, + "learning_rate": 3.1570679280438163e-07, + "loss": 0.0726, + "step": 26760 + }, + { + "epoch": 102.17557251908397, + "grad_norm": 0.48279571533203125, + "learning_rate": 3.1378219879881764e-07, + "loss": 0.0705, + "step": 26770 + }, + { + "epoch": 102.21374045801527, + "grad_norm": 0.39897432923316956, + "learning_rate": 3.1186329898976296e-07, + "loss": 0.0731, + "step": 26780 + }, + { + "epoch": 102.25190839694656, + "grad_norm": 0.23803192377090454, + "learning_rate": 3.0995009570886305e-07, + "loss": 0.0779, + "step": 26790 + }, + { + "epoch": 102.29007633587786, + "grad_norm": 0.6093907356262207, + "learning_rate": 3.0804259128083936e-07, + "loss": 0.0667, + "step": 26800 + }, + { + "epoch": 102.32824427480917, + "grad_norm": 0.275802880525589, + "learning_rate": 3.0614078802348903e-07, + "loss": 0.0642, + "step": 26810 + }, + { + "epoch": 102.36641221374046, + "grad_norm": 0.2756710648536682, + "learning_rate": 3.042446882476846e-07, + "loss": 0.0668, + "step": 26820 + }, + { + "epoch": 102.40458015267176, + "grad_norm": 0.2288149744272232, + "learning_rate": 3.023542942573643e-07, + "loss": 0.0716, + "step": 26830 + }, + { + "epoch": 102.44274809160305, + "grad_norm": 0.2968932092189789, + "learning_rate": 3.004696083495351e-07, + "loss": 0.0671, + "step": 26840 + }, + { + "epoch": 102.48091603053435, + "grad_norm": 0.27723342180252075, + "learning_rate": 2.98590632814269e-07, + "loss": 0.0673, + "step": 26850 + }, + { + "epoch": 102.51908396946565, + "grad_norm": 0.36572501063346863, + "learning_rate": 2.96717369934697e-07, + "loss": 0.0679, + "step": 26860 + }, + { + "epoch": 102.55725190839695, + "grad_norm": 0.19056031107902527, + "learning_rate": 2.948498219870122e-07, + "loss": 0.067, + "step": 26870 + }, + { + "epoch": 102.59541984732824, + "grad_norm": 0.220974862575531, + "learning_rate": 2.929879912404604e-07, + "loss": 0.067, + "step": 26880 + }, + { + "epoch": 102.63358778625954, + "grad_norm": 0.2869727909564972, + "learning_rate": 2.911318799573415e-07, + "loss": 0.0699, + "step": 26890 + }, + { + "epoch": 102.67175572519083, + "grad_norm": 1.0001291036605835, + "learning_rate": 2.8928149039300525e-07, + "loss": 0.0713, + "step": 26900 + }, + { + "epoch": 102.70992366412214, + "grad_norm": 0.3044949173927307, + "learning_rate": 2.8743682479584977e-07, + "loss": 0.0816, + "step": 26910 + }, + { + "epoch": 102.74809160305344, + "grad_norm": 0.2789320945739746, + "learning_rate": 2.8559788540731826e-07, + "loss": 0.0637, + "step": 26920 + }, + { + "epoch": 102.78625954198473, + "grad_norm": 0.7257505655288696, + "learning_rate": 2.837646744618949e-07, + "loss": 0.0823, + "step": 26930 + }, + { + "epoch": 102.82442748091603, + "grad_norm": 0.3718293309211731, + "learning_rate": 2.8193719418710405e-07, + "loss": 0.0703, + "step": 26940 + }, + { + "epoch": 102.86259541984732, + "grad_norm": 0.32060879468917847, + "learning_rate": 2.8011544680350667e-07, + "loss": 0.0709, + "step": 26950 + }, + { + "epoch": 102.90076335877862, + "grad_norm": 0.6443219780921936, + "learning_rate": 2.7829943452469753e-07, + "loss": 0.0755, + "step": 26960 + }, + { + "epoch": 102.93893129770993, + "grad_norm": 0.5253495573997498, + "learning_rate": 2.764891595573022e-07, + "loss": 0.0703, + "step": 26970 + }, + { + "epoch": 102.97709923664122, + "grad_norm": 0.3999308943748474, + "learning_rate": 2.746846241009765e-07, + "loss": 0.0673, + "step": 26980 + }, + { + "epoch": 103.01526717557252, + "grad_norm": 0.2527059018611908, + "learning_rate": 2.7288583034839944e-07, + "loss": 0.0687, + "step": 26990 + }, + { + "epoch": 103.05343511450381, + "grad_norm": 0.296678751707077, + "learning_rate": 2.7109278048527756e-07, + "loss": 0.0654, + "step": 27000 + }, + { + "epoch": 103.09160305343511, + "grad_norm": 0.7945778965950012, + "learning_rate": 2.6930547669033415e-07, + "loss": 0.0737, + "step": 27010 + }, + { + "epoch": 103.12977099236642, + "grad_norm": 0.32029277086257935, + "learning_rate": 2.675239211353109e-07, + "loss": 0.0693, + "step": 27020 + }, + { + "epoch": 103.16793893129771, + "grad_norm": 0.2556557357311249, + "learning_rate": 2.6574811598496787e-07, + "loss": 0.0755, + "step": 27030 + }, + { + "epoch": 103.20610687022901, + "grad_norm": 0.23679769039154053, + "learning_rate": 2.6397806339707457e-07, + "loss": 0.0756, + "step": 27040 + }, + { + "epoch": 103.2442748091603, + "grad_norm": 0.4383762776851654, + "learning_rate": 2.622137655224122e-07, + "loss": 0.0758, + "step": 27050 + }, + { + "epoch": 103.2824427480916, + "grad_norm": 0.5516987442970276, + "learning_rate": 2.6045522450476814e-07, + "loss": 0.0728, + "step": 27060 + }, + { + "epoch": 103.3206106870229, + "grad_norm": 0.4582926630973816, + "learning_rate": 2.587024424809359e-07, + "loss": 0.0779, + "step": 27070 + }, + { + "epoch": 103.3587786259542, + "grad_norm": 0.5267468094825745, + "learning_rate": 2.5695542158071187e-07, + "loss": 0.0677, + "step": 27080 + }, + { + "epoch": 103.3969465648855, + "grad_norm": 0.3101442754268646, + "learning_rate": 2.5521416392689066e-07, + "loss": 0.0666, + "step": 27090 + }, + { + "epoch": 103.43511450381679, + "grad_norm": 0.22319281101226807, + "learning_rate": 2.5347867163526387e-07, + "loss": 0.0672, + "step": 27100 + }, + { + "epoch": 103.47328244274809, + "grad_norm": 0.2605898678302765, + "learning_rate": 2.517489468146189e-07, + "loss": 0.0673, + "step": 27110 + }, + { + "epoch": 103.5114503816794, + "grad_norm": 0.2470434010028839, + "learning_rate": 2.500249915667341e-07, + "loss": 0.0683, + "step": 27120 + }, + { + "epoch": 103.54961832061069, + "grad_norm": 0.18632711470127106, + "learning_rate": 2.4830680798637817e-07, + "loss": 0.0679, + "step": 27130 + }, + { + "epoch": 103.58778625954199, + "grad_norm": 0.2968946695327759, + "learning_rate": 2.4659439816130557e-07, + "loss": 0.0667, + "step": 27140 + }, + { + "epoch": 103.62595419847328, + "grad_norm": 0.3109892010688782, + "learning_rate": 2.448877641722569e-07, + "loss": 0.0656, + "step": 27150 + }, + { + "epoch": 103.66412213740458, + "grad_norm": 0.6580647826194763, + "learning_rate": 2.431869080929522e-07, + "loss": 0.0691, + "step": 27160 + }, + { + "epoch": 103.70229007633588, + "grad_norm": 0.44333863258361816, + "learning_rate": 2.414918319900922e-07, + "loss": 0.0796, + "step": 27170 + }, + { + "epoch": 103.74045801526718, + "grad_norm": 0.201785147190094, + "learning_rate": 2.398025379233543e-07, + "loss": 0.0642, + "step": 27180 + }, + { + "epoch": 103.77862595419847, + "grad_norm": 0.3678027391433716, + "learning_rate": 2.381190279453899e-07, + "loss": 0.0824, + "step": 27190 + }, + { + "epoch": 103.81679389312977, + "grad_norm": 0.3141831159591675, + "learning_rate": 2.364413041018232e-07, + "loss": 0.0694, + "step": 27200 + }, + { + "epoch": 103.85496183206106, + "grad_norm": 0.35555902123451233, + "learning_rate": 2.3476936843124633e-07, + "loss": 0.0665, + "step": 27210 + }, + { + "epoch": 103.89312977099236, + "grad_norm": 0.2217165231704712, + "learning_rate": 2.3310322296521859e-07, + "loss": 0.0652, + "step": 27220 + }, + { + "epoch": 103.93129770992367, + "grad_norm": 0.36410123109817505, + "learning_rate": 2.314428697282628e-07, + "loss": 0.0719, + "step": 27230 + }, + { + "epoch": 103.96946564885496, + "grad_norm": 0.7411630749702454, + "learning_rate": 2.2978831073786735e-07, + "loss": 0.0803, + "step": 27240 + }, + { + "epoch": 104.00763358778626, + "grad_norm": 0.2194436937570572, + "learning_rate": 2.2813954800447512e-07, + "loss": 0.0631, + "step": 27250 + }, + { + "epoch": 104.04580152671755, + "grad_norm": 0.3712310194969177, + "learning_rate": 2.2649658353148974e-07, + "loss": 0.0655, + "step": 27260 + }, + { + "epoch": 104.08396946564885, + "grad_norm": 0.2021886706352234, + "learning_rate": 2.2485941931526646e-07, + "loss": 0.0663, + "step": 27270 + }, + { + "epoch": 104.12213740458016, + "grad_norm": 0.5154406428337097, + "learning_rate": 2.232280573451151e-07, + "loss": 0.0712, + "step": 27280 + }, + { + "epoch": 104.16030534351145, + "grad_norm": 0.28259921073913574, + "learning_rate": 2.216024996032945e-07, + "loss": 0.0654, + "step": 27290 + }, + { + "epoch": 104.19847328244275, + "grad_norm": 0.23785048723220825, + "learning_rate": 2.1998274806501074e-07, + "loss": 0.0725, + "step": 27300 + }, + { + "epoch": 104.23664122137404, + "grad_norm": 0.3378708064556122, + "learning_rate": 2.1836880469841391e-07, + "loss": 0.071, + "step": 27310 + }, + { + "epoch": 104.27480916030534, + "grad_norm": 0.4880586266517639, + "learning_rate": 2.1676067146459812e-07, + "loss": 0.072, + "step": 27320 + }, + { + "epoch": 104.31297709923665, + "grad_norm": 0.4210817217826843, + "learning_rate": 2.151583503175958e-07, + "loss": 0.0669, + "step": 27330 + }, + { + "epoch": 104.35114503816794, + "grad_norm": 0.5876713991165161, + "learning_rate": 2.1356184320437955e-07, + "loss": 0.0688, + "step": 27340 + }, + { + "epoch": 104.38931297709924, + "grad_norm": 0.3950154483318329, + "learning_rate": 2.1197115206485542e-07, + "loss": 0.0646, + "step": 27350 + }, + { + "epoch": 104.42748091603053, + "grad_norm": 0.5265299081802368, + "learning_rate": 2.103862788318628e-07, + "loss": 0.0676, + "step": 27360 + }, + { + "epoch": 104.46564885496183, + "grad_norm": 0.4005798399448395, + "learning_rate": 2.0880722543117293e-07, + "loss": 0.0629, + "step": 27370 + }, + { + "epoch": 104.50381679389314, + "grad_norm": 0.8654969930648804, + "learning_rate": 2.0723399378148434e-07, + "loss": 0.0687, + "step": 27380 + }, + { + "epoch": 104.54198473282443, + "grad_norm": 0.2732725739479065, + "learning_rate": 2.0566658579442067e-07, + "loss": 0.0669, + "step": 27390 + }, + { + "epoch": 104.58015267175573, + "grad_norm": 0.5799111723899841, + "learning_rate": 2.0410500337453176e-07, + "loss": 0.0705, + "step": 27400 + }, + { + "epoch": 104.61832061068702, + "grad_norm": 0.28128835558891296, + "learning_rate": 2.0254924841928647e-07, + "loss": 0.0654, + "step": 27410 + }, + { + "epoch": 104.65648854961832, + "grad_norm": 0.41618332266807556, + "learning_rate": 2.0099932281907542e-07, + "loss": 0.0699, + "step": 27420 + }, + { + "epoch": 104.69465648854961, + "grad_norm": 0.5150026679039001, + "learning_rate": 1.9945522845720323e-07, + "loss": 0.0704, + "step": 27430 + }, + { + "epoch": 104.73282442748092, + "grad_norm": 0.2852458953857422, + "learning_rate": 1.9791696720988963e-07, + "loss": 0.0631, + "step": 27440 + }, + { + "epoch": 104.77099236641222, + "grad_norm": 0.2500865161418915, + "learning_rate": 1.9638454094626836e-07, + "loss": 0.0695, + "step": 27450 + }, + { + "epoch": 104.80916030534351, + "grad_norm": 0.3002980947494507, + "learning_rate": 1.9485795152838105e-07, + "loss": 0.0703, + "step": 27460 + }, + { + "epoch": 104.8473282442748, + "grad_norm": 0.22461900115013123, + "learning_rate": 1.933372008111778e-07, + "loss": 0.0646, + "step": 27470 + }, + { + "epoch": 104.8854961832061, + "grad_norm": 0.2733502686023712, + "learning_rate": 1.9182229064251433e-07, + "loss": 0.0649, + "step": 27480 + }, + { + "epoch": 104.92366412213741, + "grad_norm": 0.35162800550460815, + "learning_rate": 1.9031322286314878e-07, + "loss": 0.0745, + "step": 27490 + }, + { + "epoch": 104.9618320610687, + "grad_norm": 0.22003084421157837, + "learning_rate": 1.8880999930674216e-07, + "loss": 0.0635, + "step": 27500 + }, + { + "epoch": 105.0, + "grad_norm": 0.39051511883735657, + "learning_rate": 1.8731262179985166e-07, + "loss": 0.0752, + "step": 27510 + }, + { + "epoch": 105.0381679389313, + "grad_norm": 0.4942927658557892, + "learning_rate": 1.8582109216193245e-07, + "loss": 0.0925, + "step": 27520 + }, + { + "epoch": 105.07633587786259, + "grad_norm": 0.26188600063323975, + "learning_rate": 1.8433541220533368e-07, + "loss": 0.0646, + "step": 27530 + }, + { + "epoch": 105.1145038167939, + "grad_norm": 0.28856149315834045, + "learning_rate": 1.8285558373529578e-07, + "loss": 0.0676, + "step": 27540 + }, + { + "epoch": 105.1526717557252, + "grad_norm": 0.24751274287700653, + "learning_rate": 1.8138160854995145e-07, + "loss": 0.0711, + "step": 27550 + }, + { + "epoch": 105.19083969465649, + "grad_norm": 0.3480677902698517, + "learning_rate": 1.7991348844031864e-07, + "loss": 0.0659, + "step": 27560 + }, + { + "epoch": 105.22900763358778, + "grad_norm": 0.42214107513427734, + "learning_rate": 1.78451225190302e-07, + "loss": 0.0706, + "step": 27570 + }, + { + "epoch": 105.26717557251908, + "grad_norm": 0.47172361612319946, + "learning_rate": 1.7699482057668916e-07, + "loss": 0.0669, + "step": 27580 + }, + { + "epoch": 105.30534351145039, + "grad_norm": 0.6258100867271423, + "learning_rate": 1.7554427636914894e-07, + "loss": 0.0689, + "step": 27590 + }, + { + "epoch": 105.34351145038168, + "grad_norm": 0.21334940195083618, + "learning_rate": 1.7409959433022873e-07, + "loss": 0.0653, + "step": 27600 + }, + { + "epoch": 105.38167938931298, + "grad_norm": 0.39631593227386475, + "learning_rate": 1.726607762153548e-07, + "loss": 0.0843, + "step": 27610 + }, + { + "epoch": 105.41984732824427, + "grad_norm": 0.49448487162590027, + "learning_rate": 1.7122782377282598e-07, + "loss": 0.0667, + "step": 27620 + }, + { + "epoch": 105.45801526717557, + "grad_norm": 0.26243850588798523, + "learning_rate": 1.6980073874381497e-07, + "loss": 0.0646, + "step": 27630 + }, + { + "epoch": 105.49618320610686, + "grad_norm": 0.3036261200904846, + "learning_rate": 1.6837952286236415e-07, + "loss": 0.0757, + "step": 27640 + }, + { + "epoch": 105.53435114503817, + "grad_norm": 0.37669137120246887, + "learning_rate": 1.6696417785538487e-07, + "loss": 0.0729, + "step": 27650 + }, + { + "epoch": 105.57251908396947, + "grad_norm": 0.2965763807296753, + "learning_rate": 1.6555470544265539e-07, + "loss": 0.0724, + "step": 27660 + }, + { + "epoch": 105.61068702290076, + "grad_norm": 0.4320843815803528, + "learning_rate": 1.6415110733681737e-07, + "loss": 0.0712, + "step": 27670 + }, + { + "epoch": 105.64885496183206, + "grad_norm": 0.2053402066230774, + "learning_rate": 1.6275338524337437e-07, + "loss": 0.0683, + "step": 27680 + }, + { + "epoch": 105.68702290076335, + "grad_norm": 0.27618706226348877, + "learning_rate": 1.6136154086069054e-07, + "loss": 0.064, + "step": 27690 + }, + { + "epoch": 105.72519083969466, + "grad_norm": 0.5700408220291138, + "learning_rate": 1.599755758799887e-07, + "loss": 0.0694, + "step": 27700 + }, + { + "epoch": 105.76335877862596, + "grad_norm": 0.31304672360420227, + "learning_rate": 1.5859549198534607e-07, + "loss": 0.0695, + "step": 27710 + }, + { + "epoch": 105.80152671755725, + "grad_norm": 0.23126287758350372, + "learning_rate": 1.572212908536963e-07, + "loss": 0.0689, + "step": 27720 + }, + { + "epoch": 105.83969465648855, + "grad_norm": 0.3003278374671936, + "learning_rate": 1.5585297415482203e-07, + "loss": 0.0724, + "step": 27730 + }, + { + "epoch": 105.87786259541984, + "grad_norm": 0.4534234404563904, + "learning_rate": 1.5449054355135718e-07, + "loss": 0.0669, + "step": 27740 + }, + { + "epoch": 105.91603053435115, + "grad_norm": 0.5015247464179993, + "learning_rate": 1.5313400069878416e-07, + "loss": 0.0691, + "step": 27750 + }, + { + "epoch": 105.95419847328245, + "grad_norm": 0.2227436900138855, + "learning_rate": 1.5178334724542887e-07, + "loss": 0.0653, + "step": 27760 + }, + { + "epoch": 105.99236641221374, + "grad_norm": 0.9543538093566895, + "learning_rate": 1.504385848324641e-07, + "loss": 0.0665, + "step": 27770 + }, + { + "epoch": 106.03053435114504, + "grad_norm": 0.7465227246284485, + "learning_rate": 1.4909971509390332e-07, + "loss": 0.072, + "step": 27780 + }, + { + "epoch": 106.06870229007633, + "grad_norm": 0.23705200850963593, + "learning_rate": 1.4776673965659793e-07, + "loss": 0.0608, + "step": 27790 + }, + { + "epoch": 106.10687022900764, + "grad_norm": 0.29993176460266113, + "learning_rate": 1.4643966014023958e-07, + "loss": 0.0686, + "step": 27800 + }, + { + "epoch": 106.14503816793894, + "grad_norm": 0.24530579149723053, + "learning_rate": 1.4511847815735503e-07, + "loss": 0.0666, + "step": 27810 + }, + { + "epoch": 106.18320610687023, + "grad_norm": 0.24449963867664337, + "learning_rate": 1.438031953133051e-07, + "loss": 0.0694, + "step": 27820 + }, + { + "epoch": 106.22137404580153, + "grad_norm": 0.3024006485939026, + "learning_rate": 1.4249381320628197e-07, + "loss": 0.0774, + "step": 27830 + }, + { + "epoch": 106.25954198473282, + "grad_norm": 0.3465965986251831, + "learning_rate": 1.4119033342730904e-07, + "loss": 0.0644, + "step": 27840 + }, + { + "epoch": 106.29770992366412, + "grad_norm": 0.2610500156879425, + "learning_rate": 1.3989275756023658e-07, + "loss": 0.0673, + "step": 27850 + }, + { + "epoch": 106.33587786259542, + "grad_norm": 0.34050482511520386, + "learning_rate": 1.386010871817417e-07, + "loss": 0.0695, + "step": 27860 + }, + { + "epoch": 106.37404580152672, + "grad_norm": 0.3278212249279022, + "learning_rate": 1.3731532386132617e-07, + "loss": 0.0736, + "step": 27870 + }, + { + "epoch": 106.41221374045801, + "grad_norm": 0.27031412720680237, + "learning_rate": 1.360354691613136e-07, + "loss": 0.0695, + "step": 27880 + }, + { + "epoch": 106.45038167938931, + "grad_norm": 0.2940131723880768, + "learning_rate": 1.3476152463684778e-07, + "loss": 0.0666, + "step": 27890 + }, + { + "epoch": 106.4885496183206, + "grad_norm": 0.25805696845054626, + "learning_rate": 1.3349349183589155e-07, + "loss": 0.0729, + "step": 27900 + }, + { + "epoch": 106.52671755725191, + "grad_norm": 1.4068719148635864, + "learning_rate": 1.3223137229922356e-07, + "loss": 0.0731, + "step": 27910 + }, + { + "epoch": 106.56488549618321, + "grad_norm": 0.6141703724861145, + "learning_rate": 1.3097516756043982e-07, + "loss": 0.0695, + "step": 27920 + }, + { + "epoch": 106.6030534351145, + "grad_norm": 0.2119428515434265, + "learning_rate": 1.29724879145946e-07, + "loss": 0.0727, + "step": 27930 + }, + { + "epoch": 106.6412213740458, + "grad_norm": 0.24076199531555176, + "learning_rate": 1.284805085749613e-07, + "loss": 0.0788, + "step": 27940 + }, + { + "epoch": 106.6793893129771, + "grad_norm": 0.4202733337879181, + "learning_rate": 1.2724205735951288e-07, + "loss": 0.0714, + "step": 27950 + }, + { + "epoch": 106.7175572519084, + "grad_norm": 0.2482680380344391, + "learning_rate": 1.2600952700443592e-07, + "loss": 0.0708, + "step": 27960 + }, + { + "epoch": 106.7557251908397, + "grad_norm": 0.65274977684021, + "learning_rate": 1.247829190073707e-07, + "loss": 0.0727, + "step": 27970 + }, + { + "epoch": 106.79389312977099, + "grad_norm": 0.23994283378124237, + "learning_rate": 1.2356223485876174e-07, + "loss": 0.072, + "step": 27980 + }, + { + "epoch": 106.83206106870229, + "grad_norm": 0.2923242747783661, + "learning_rate": 1.2234747604185527e-07, + "loss": 0.0726, + "step": 27990 + }, + { + "epoch": 106.87022900763358, + "grad_norm": 0.2507231533527374, + "learning_rate": 1.2113864403269837e-07, + "loss": 0.0723, + "step": 28000 + }, + { + "epoch": 106.90839694656489, + "grad_norm": 0.6731197237968445, + "learning_rate": 1.1993574030013554e-07, + "loss": 0.0674, + "step": 28010 + }, + { + "epoch": 106.94656488549619, + "grad_norm": 0.17780010402202606, + "learning_rate": 1.1873876630580816e-07, + "loss": 0.0758, + "step": 28020 + }, + { + "epoch": 106.98473282442748, + "grad_norm": 0.7125159502029419, + "learning_rate": 1.1754772350415278e-07, + "loss": 0.0748, + "step": 28030 + }, + { + "epoch": 107.02290076335878, + "grad_norm": 0.355917364358902, + "learning_rate": 1.163626133423984e-07, + "loss": 0.0743, + "step": 28040 + }, + { + "epoch": 107.06106870229007, + "grad_norm": 0.4013403058052063, + "learning_rate": 1.151834372605659e-07, + "loss": 0.0645, + "step": 28050 + }, + { + "epoch": 107.09923664122137, + "grad_norm": 0.6267454624176025, + "learning_rate": 1.1401019669146474e-07, + "loss": 0.0681, + "step": 28060 + }, + { + "epoch": 107.13740458015268, + "grad_norm": 0.3011610209941864, + "learning_rate": 1.128428930606934e-07, + "loss": 0.0671, + "step": 28070 + }, + { + "epoch": 107.17557251908397, + "grad_norm": 0.5167744159698486, + "learning_rate": 1.1168152778663621e-07, + "loss": 0.0678, + "step": 28080 + }, + { + "epoch": 107.21374045801527, + "grad_norm": 0.2572309970855713, + "learning_rate": 1.10526102280461e-07, + "loss": 0.0666, + "step": 28090 + }, + { + "epoch": 107.25190839694656, + "grad_norm": 0.2860301434993744, + "learning_rate": 1.0937661794611865e-07, + "loss": 0.0628, + "step": 28100 + }, + { + "epoch": 107.29007633587786, + "grad_norm": 0.7779892086982727, + "learning_rate": 1.0823307618034129e-07, + "loss": 0.0683, + "step": 28110 + }, + { + "epoch": 107.32824427480917, + "grad_norm": 0.26767176389694214, + "learning_rate": 1.0709547837263967e-07, + "loss": 0.062, + "step": 28120 + }, + { + "epoch": 107.36641221374046, + "grad_norm": 0.38694190979003906, + "learning_rate": 1.0596382590530252e-07, + "loss": 0.0651, + "step": 28130 + }, + { + "epoch": 107.40458015267176, + "grad_norm": 0.9800488948822021, + "learning_rate": 1.0483812015339435e-07, + "loss": 0.0708, + "step": 28140 + }, + { + "epoch": 107.44274809160305, + "grad_norm": 0.5889298319816589, + "learning_rate": 1.0371836248475431e-07, + "loss": 0.0698, + "step": 28150 + }, + { + "epoch": 107.48091603053435, + "grad_norm": 0.2541733384132385, + "learning_rate": 1.0260455425999239e-07, + "loss": 0.0704, + "step": 28160 + }, + { + "epoch": 107.51908396946565, + "grad_norm": 0.26119011640548706, + "learning_rate": 1.0149669683249209e-07, + "loss": 0.0698, + "step": 28170 + }, + { + "epoch": 107.55725190839695, + "grad_norm": 0.42216214537620544, + "learning_rate": 1.0039479154840271e-07, + "loss": 0.0663, + "step": 28180 + }, + { + "epoch": 107.59541984732824, + "grad_norm": 0.7032060623168945, + "learning_rate": 9.92988397466449e-08, + "loss": 0.0772, + "step": 28190 + }, + { + "epoch": 107.63358778625954, + "grad_norm": 0.26121023297309875, + "learning_rate": 9.820884275890286e-08, + "loss": 0.0682, + "step": 28200 + }, + { + "epoch": 107.67175572519083, + "grad_norm": 0.7753889560699463, + "learning_rate": 9.712480190962548e-08, + "loss": 0.0703, + "step": 28210 + }, + { + "epoch": 107.70992366412214, + "grad_norm": 0.48209503293037415, + "learning_rate": 9.604671851602466e-08, + "loss": 0.0697, + "step": 28220 + }, + { + "epoch": 107.74809160305344, + "grad_norm": 0.3045661449432373, + "learning_rate": 9.497459388807306e-08, + "loss": 0.0659, + "step": 28230 + }, + { + "epoch": 107.78625954198473, + "grad_norm": 0.4404143691062927, + "learning_rate": 9.390842932850364e-08, + "loss": 0.0724, + "step": 28240 + }, + { + "epoch": 107.82442748091603, + "grad_norm": 0.34857988357543945, + "learning_rate": 9.284822613280731e-08, + "loss": 0.0706, + "step": 28250 + }, + { + "epoch": 107.86259541984732, + "grad_norm": 0.654318630695343, + "learning_rate": 9.179398558923024e-08, + "loss": 0.0696, + "step": 28260 + }, + { + "epoch": 107.90076335877862, + "grad_norm": 0.6860091686248779, + "learning_rate": 9.074570897877388e-08, + "loss": 0.0774, + "step": 28270 + }, + { + "epoch": 107.93893129770993, + "grad_norm": 0.36173543334007263, + "learning_rate": 8.970339757519375e-08, + "loss": 0.0646, + "step": 28280 + }, + { + "epoch": 107.97709923664122, + "grad_norm": 0.37401559948921204, + "learning_rate": 8.86670526449962e-08, + "loss": 0.0652, + "step": 28290 + }, + { + "epoch": 108.01526717557252, + "grad_norm": 0.4267309904098511, + "learning_rate": 8.763667544743837e-08, + "loss": 0.0682, + "step": 28300 + }, + { + "epoch": 108.05343511450381, + "grad_norm": 0.9610293507575989, + "learning_rate": 8.661226723452542e-08, + "loss": 0.0796, + "step": 28310 + }, + { + "epoch": 108.09160305343511, + "grad_norm": 0.8470723032951355, + "learning_rate": 8.559382925101e-08, + "loss": 0.0682, + "step": 28320 + }, + { + "epoch": 108.12977099236642, + "grad_norm": 0.5021095871925354, + "learning_rate": 8.458136273438943e-08, + "loss": 0.0686, + "step": 28330 + }, + { + "epoch": 108.16793893129771, + "grad_norm": 0.4912349581718445, + "learning_rate": 8.357486891490796e-08, + "loss": 0.064, + "step": 28340 + }, + { + "epoch": 108.20610687022901, + "grad_norm": 0.2633496820926666, + "learning_rate": 8.257434901554895e-08, + "loss": 0.0684, + "step": 28350 + }, + { + "epoch": 108.2442748091603, + "grad_norm": 0.25884994864463806, + "learning_rate": 8.157980425203938e-08, + "loss": 0.0704, + "step": 28360 + }, + { + "epoch": 108.2824427480916, + "grad_norm": 0.5608441233634949, + "learning_rate": 8.059123583284368e-08, + "loss": 0.0731, + "step": 28370 + }, + { + "epoch": 108.3206106870229, + "grad_norm": 0.32129549980163574, + "learning_rate": 7.960864495916653e-08, + "loss": 0.0691, + "step": 28380 + }, + { + "epoch": 108.3587786259542, + "grad_norm": 0.367439866065979, + "learning_rate": 7.863203282494846e-08, + "loss": 0.0764, + "step": 28390 + }, + { + "epoch": 108.3969465648855, + "grad_norm": 0.5697895884513855, + "learning_rate": 7.766140061686522e-08, + "loss": 0.0698, + "step": 28400 + }, + { + "epoch": 108.43511450381679, + "grad_norm": 0.5481321215629578, + "learning_rate": 7.669674951432615e-08, + "loss": 0.067, + "step": 28410 + }, + { + "epoch": 108.47328244274809, + "grad_norm": 0.2596859931945801, + "learning_rate": 7.573808068947363e-08, + "loss": 0.0645, + "step": 28420 + }, + { + "epoch": 108.5114503816794, + "grad_norm": 0.3332163989543915, + "learning_rate": 7.478539530718087e-08, + "loss": 0.0775, + "step": 28430 + }, + { + "epoch": 108.54961832061069, + "grad_norm": 0.3313477635383606, + "learning_rate": 7.383869452504966e-08, + "loss": 0.0666, + "step": 28440 + }, + { + "epoch": 108.58778625954199, + "grad_norm": 0.28801289200782776, + "learning_rate": 7.289797949341204e-08, + "loss": 0.0711, + "step": 28450 + }, + { + "epoch": 108.62595419847328, + "grad_norm": 0.4016149640083313, + "learning_rate": 7.196325135532422e-08, + "loss": 0.0649, + "step": 28460 + }, + { + "epoch": 108.66412213740458, + "grad_norm": 0.27881985902786255, + "learning_rate": 7.10345112465699e-08, + "loss": 0.0619, + "step": 28470 + }, + { + "epoch": 108.70229007633588, + "grad_norm": 0.5351571440696716, + "learning_rate": 7.011176029565525e-08, + "loss": 0.0687, + "step": 28480 + }, + { + "epoch": 108.74045801526718, + "grad_norm": 0.34915560483932495, + "learning_rate": 6.919499962381004e-08, + "loss": 0.0717, + "step": 28490 + }, + { + "epoch": 108.77862595419847, + "grad_norm": 0.3323579430580139, + "learning_rate": 6.828423034498488e-08, + "loss": 0.0642, + "step": 28500 + }, + { + "epoch": 108.81679389312977, + "grad_norm": 0.23722094297409058, + "learning_rate": 6.737945356585008e-08, + "loss": 0.067, + "step": 28510 + }, + { + "epoch": 108.85496183206106, + "grad_norm": 0.4655064344406128, + "learning_rate": 6.648067038579508e-08, + "loss": 0.0644, + "step": 28520 + }, + { + "epoch": 108.89312977099236, + "grad_norm": 0.27879801392555237, + "learning_rate": 6.558788189692578e-08, + "loss": 0.0705, + "step": 28530 + }, + { + "epoch": 108.93129770992367, + "grad_norm": 0.3153517246246338, + "learning_rate": 6.470108918406493e-08, + "loss": 0.0733, + "step": 28540 + }, + { + "epoch": 108.96946564885496, + "grad_norm": 0.42960867285728455, + "learning_rate": 6.382029332474893e-08, + "loss": 0.0667, + "step": 28550 + }, + { + "epoch": 109.00763358778626, + "grad_norm": 0.28105250000953674, + "learning_rate": 6.294549538922778e-08, + "loss": 0.0622, + "step": 28560 + }, + { + "epoch": 109.04580152671755, + "grad_norm": 0.597682535648346, + "learning_rate": 6.207669644046344e-08, + "loss": 0.0681, + "step": 28570 + }, + { + "epoch": 109.08396946564885, + "grad_norm": 0.31500354409217834, + "learning_rate": 6.121389753412866e-08, + "loss": 0.0698, + "step": 28580 + }, + { + "epoch": 109.12213740458016, + "grad_norm": 0.2600068151950836, + "learning_rate": 6.035709971860592e-08, + "loss": 0.0747, + "step": 28590 + }, + { + "epoch": 109.16030534351145, + "grad_norm": 0.2531625032424927, + "learning_rate": 5.95063040349847e-08, + "loss": 0.0688, + "step": 28600 + }, + { + "epoch": 109.19847328244275, + "grad_norm": 0.42508846521377563, + "learning_rate": 5.86615115170619e-08, + "loss": 0.0695, + "step": 28610 + }, + { + "epoch": 109.23664122137404, + "grad_norm": 0.38278618454933167, + "learning_rate": 5.782272319134086e-08, + "loss": 0.0705, + "step": 28620 + }, + { + "epoch": 109.27480916030534, + "grad_norm": 0.2914545238018036, + "learning_rate": 5.698994007702796e-08, + "loss": 0.0983, + "step": 28630 + }, + { + "epoch": 109.31297709923665, + "grad_norm": 0.2625536620616913, + "learning_rate": 5.616316318603321e-08, + "loss": 0.0712, + "step": 28640 + }, + { + "epoch": 109.35114503816794, + "grad_norm": 0.21834562718868256, + "learning_rate": 5.5342393522968e-08, + "loss": 0.0681, + "step": 28650 + }, + { + "epoch": 109.38931297709924, + "grad_norm": 0.305205374956131, + "learning_rate": 5.452763208514622e-08, + "loss": 0.0677, + "step": 28660 + }, + { + "epoch": 109.42748091603053, + "grad_norm": 0.4056416153907776, + "learning_rate": 5.371887986257873e-08, + "loss": 0.0676, + "step": 28670 + }, + { + "epoch": 109.46564885496183, + "grad_norm": 0.2562243640422821, + "learning_rate": 5.291613783797611e-08, + "loss": 0.067, + "step": 28680 + }, + { + "epoch": 109.50381679389314, + "grad_norm": 0.4858242869377136, + "learning_rate": 5.2119406986745336e-08, + "loss": 0.0686, + "step": 28690 + }, + { + "epoch": 109.54198473282443, + "grad_norm": 0.8462430238723755, + "learning_rate": 5.132868827698978e-08, + "loss": 0.0723, + "step": 28700 + }, + { + "epoch": 109.58015267175573, + "grad_norm": 0.46360403299331665, + "learning_rate": 5.0543982669507554e-08, + "loss": 0.0666, + "step": 28710 + }, + { + "epoch": 109.61832061068702, + "grad_norm": 0.2263144552707672, + "learning_rate": 4.976529111778872e-08, + "loss": 0.0672, + "step": 28720 + }, + { + "epoch": 109.65648854961832, + "grad_norm": 0.5241805911064148, + "learning_rate": 4.8992614568018624e-08, + "loss": 0.0671, + "step": 28730 + }, + { + "epoch": 109.69465648854961, + "grad_norm": 0.3242310881614685, + "learning_rate": 4.822595395907126e-08, + "loss": 0.0679, + "step": 28740 + }, + { + "epoch": 109.73282442748092, + "grad_norm": 0.300382137298584, + "learning_rate": 4.7465310222510885e-08, + "loss": 0.0718, + "step": 28750 + }, + { + "epoch": 109.77099236641222, + "grad_norm": 0.25149857997894287, + "learning_rate": 4.671068428259318e-08, + "loss": 0.0707, + "step": 28760 + }, + { + "epoch": 109.80916030534351, + "grad_norm": 0.49300718307495117, + "learning_rate": 4.596207705625799e-08, + "loss": 0.0614, + "step": 28770 + }, + { + "epoch": 109.8473282442748, + "grad_norm": 0.2667870819568634, + "learning_rate": 4.521948945313492e-08, + "loss": 0.0733, + "step": 28780 + }, + { + "epoch": 109.8854961832061, + "grad_norm": 0.32633841037750244, + "learning_rate": 4.4482922375537196e-08, + "loss": 0.073, + "step": 28790 + }, + { + "epoch": 109.92366412213741, + "grad_norm": 0.23223261535167694, + "learning_rate": 4.375237671846333e-08, + "loss": 0.0684, + "step": 28800 + }, + { + "epoch": 109.9618320610687, + "grad_norm": 0.2435031235218048, + "learning_rate": 4.302785336959547e-08, + "loss": 0.0648, + "step": 28810 + }, + { + "epoch": 110.0, + "grad_norm": 0.273689866065979, + "learning_rate": 4.230935320929774e-08, + "loss": 0.0732, + "step": 28820 + }, + { + "epoch": 110.0381679389313, + "grad_norm": 0.290106862783432, + "learning_rate": 4.159687711061566e-08, + "loss": 0.0728, + "step": 28830 + }, + { + "epoch": 110.07633587786259, + "grad_norm": 0.3028109669685364, + "learning_rate": 4.089042593927506e-08, + "loss": 0.0642, + "step": 28840 + }, + { + "epoch": 110.1145038167939, + "grad_norm": 0.2728022634983063, + "learning_rate": 4.019000055367983e-08, + "loss": 0.0614, + "step": 28850 + }, + { + "epoch": 110.1526717557252, + "grad_norm": 0.3827283978462219, + "learning_rate": 3.949560180491363e-08, + "loss": 0.0665, + "step": 28860 + }, + { + "epoch": 110.19083969465649, + "grad_norm": 0.29793211817741394, + "learning_rate": 3.880723053673652e-08, + "loss": 0.0652, + "step": 28870 + }, + { + "epoch": 110.22900763358778, + "grad_norm": 0.3043680191040039, + "learning_rate": 3.812488758558386e-08, + "loss": 0.065, + "step": 28880 + }, + { + "epoch": 110.26717557251908, + "grad_norm": 0.47717395424842834, + "learning_rate": 3.744857378056743e-08, + "loss": 0.0696, + "step": 28890 + }, + { + "epoch": 110.30534351145039, + "grad_norm": 0.47816434502601624, + "learning_rate": 3.677828994347155e-08, + "loss": 0.0655, + "step": 28900 + }, + { + "epoch": 110.34351145038168, + "grad_norm": 0.4901565611362457, + "learning_rate": 3.61140368887547e-08, + "loss": 0.074, + "step": 28910 + }, + { + "epoch": 110.38167938931298, + "grad_norm": 0.22826752066612244, + "learning_rate": 3.5455815423546815e-08, + "loss": 0.0712, + "step": 28920 + }, + { + "epoch": 110.41984732824427, + "grad_norm": 0.44958510994911194, + "learning_rate": 3.480362634764922e-08, + "loss": 0.0687, + "step": 28930 + }, + { + "epoch": 110.45801526717557, + "grad_norm": 0.21928606927394867, + "learning_rate": 3.4157470453533015e-08, + "loss": 0.0683, + "step": 28940 + }, + { + "epoch": 110.49618320610686, + "grad_norm": 0.28822770714759827, + "learning_rate": 3.3517348526339034e-08, + "loss": 0.0678, + "step": 28950 + }, + { + "epoch": 110.53435114503817, + "grad_norm": 0.44351062178611755, + "learning_rate": 3.288326134387454e-08, + "loss": 0.0647, + "step": 28960 + }, + { + "epoch": 110.57251908396947, + "grad_norm": 0.7302173972129822, + "learning_rate": 3.225520967661655e-08, + "loss": 0.0683, + "step": 28970 + }, + { + "epoch": 110.61068702290076, + "grad_norm": 0.4752961993217468, + "learning_rate": 3.163319428770628e-08, + "loss": 0.0713, + "step": 28980 + }, + { + "epoch": 110.64885496183206, + "grad_norm": 0.35356375575065613, + "learning_rate": 3.1017215932951375e-08, + "loss": 0.0694, + "step": 28990 + }, + { + "epoch": 110.68702290076335, + "grad_norm": 0.2346401959657669, + "learning_rate": 3.040727536082366e-08, + "loss": 0.0752, + "step": 29000 + }, + { + "epoch": 110.72519083969466, + "grad_norm": 0.3200841546058655, + "learning_rate": 2.980337331245864e-08, + "loss": 0.0686, + "step": 29010 + }, + { + "epoch": 110.76335877862596, + "grad_norm": 0.26796045899391174, + "learning_rate": 2.9205510521653214e-08, + "loss": 0.0625, + "step": 29020 + }, + { + "epoch": 110.80152671755725, + "grad_norm": 0.297757089138031, + "learning_rate": 2.8613687714868497e-08, + "loss": 0.0635, + "step": 29030 + }, + { + "epoch": 110.83969465648855, + "grad_norm": 0.256091833114624, + "learning_rate": 2.8027905611223704e-08, + "loss": 0.0645, + "step": 29040 + }, + { + "epoch": 110.87786259541984, + "grad_norm": 0.27508917450904846, + "learning_rate": 2.7448164922500577e-08, + "loss": 0.0867, + "step": 29050 + }, + { + "epoch": 110.91603053435115, + "grad_norm": 0.4821559488773346, + "learning_rate": 2.687446635313784e-08, + "loss": 0.0717, + "step": 29060 + }, + { + "epoch": 110.95419847328245, + "grad_norm": 0.26554882526397705, + "learning_rate": 2.6306810600233435e-08, + "loss": 0.0655, + "step": 29070 + }, + { + "epoch": 110.99236641221374, + "grad_norm": 0.19737420976161957, + "learning_rate": 2.5745198353542834e-08, + "loss": 0.0663, + "step": 29080 + }, + { + "epoch": 111.03053435114504, + "grad_norm": 0.258440762758255, + "learning_rate": 2.518963029547794e-08, + "loss": 0.0601, + "step": 29090 + }, + { + "epoch": 111.06870229007633, + "grad_norm": 0.25001877546310425, + "learning_rate": 2.464010710110598e-08, + "loss": 0.0745, + "step": 29100 + }, + { + "epoch": 111.10687022900764, + "grad_norm": 0.2975136935710907, + "learning_rate": 2.4096629438150055e-08, + "loss": 0.0669, + "step": 29110 + }, + { + "epoch": 111.14503816793894, + "grad_norm": 0.2732112407684326, + "learning_rate": 2.3559197966985802e-08, + "loss": 0.0692, + "step": 29120 + }, + { + "epoch": 111.18320610687023, + "grad_norm": 0.3679260313510895, + "learning_rate": 2.302781334064419e-08, + "loss": 0.063, + "step": 29130 + }, + { + "epoch": 111.22137404580153, + "grad_norm": 0.2609327435493469, + "learning_rate": 2.2502476204807056e-08, + "loss": 0.0694, + "step": 29140 + }, + { + "epoch": 111.25954198473282, + "grad_norm": 0.436776727437973, + "learning_rate": 2.1983187197808788e-08, + "loss": 0.0758, + "step": 29150 + }, + { + "epoch": 111.29770992366412, + "grad_norm": 0.6425755023956299, + "learning_rate": 2.1469946950634645e-08, + "loss": 0.0716, + "step": 29160 + }, + { + "epoch": 111.33587786259542, + "grad_norm": 0.22093476355075836, + "learning_rate": 2.0962756086919112e-08, + "loss": 0.0765, + "step": 29170 + }, + { + "epoch": 111.37404580152672, + "grad_norm": 0.5801058411598206, + "learning_rate": 2.046161522294754e-08, + "loss": 0.0782, + "step": 29180 + }, + { + "epoch": 111.41221374045801, + "grad_norm": 0.5053842663764954, + "learning_rate": 1.9966524967653943e-08, + "loss": 0.0657, + "step": 29190 + }, + { + "epoch": 111.45038167938931, + "grad_norm": 0.31466546654701233, + "learning_rate": 1.9477485922618222e-08, + "loss": 0.0752, + "step": 29200 + }, + { + "epoch": 111.4885496183206, + "grad_norm": 0.3375924825668335, + "learning_rate": 1.899449868207004e-08, + "loss": 0.0683, + "step": 29210 + }, + { + "epoch": 111.52671755725191, + "grad_norm": 0.49920454621315, + "learning_rate": 1.8517563832884387e-08, + "loss": 0.0734, + "step": 29220 + }, + { + "epoch": 111.56488549618321, + "grad_norm": 0.6840377449989319, + "learning_rate": 1.8046681954581035e-08, + "loss": 0.0684, + "step": 29230 + }, + { + "epoch": 111.6030534351145, + "grad_norm": 0.37621980905532837, + "learning_rate": 1.7581853619327295e-08, + "loss": 0.0656, + "step": 29240 + }, + { + "epoch": 111.6412213740458, + "grad_norm": 0.23118160665035248, + "learning_rate": 1.7123079391932474e-08, + "loss": 0.0704, + "step": 29250 + }, + { + "epoch": 111.6793893129771, + "grad_norm": 0.3853679299354553, + "learning_rate": 1.6670359829850657e-08, + "loss": 0.07, + "step": 29260 + }, + { + "epoch": 111.7175572519084, + "grad_norm": 0.20712091028690338, + "learning_rate": 1.6223695483179035e-08, + "loss": 0.0667, + "step": 29270 + }, + { + "epoch": 111.7557251908397, + "grad_norm": 0.45825672149658203, + "learning_rate": 1.5783086894656796e-08, + "loss": 0.0658, + "step": 29280 + }, + { + "epoch": 111.79389312977099, + "grad_norm": 0.25062382221221924, + "learning_rate": 1.5348534599665122e-08, + "loss": 0.0656, + "step": 29290 + }, + { + "epoch": 111.83206106870229, + "grad_norm": 0.7445496320724487, + "learning_rate": 1.4920039126225528e-08, + "loss": 0.0756, + "step": 29300 + }, + { + "epoch": 111.87022900763358, + "grad_norm": 0.4825686812400818, + "learning_rate": 1.449760099500097e-08, + "loss": 0.0684, + "step": 29310 + }, + { + "epoch": 111.90839694656489, + "grad_norm": 0.2946581244468689, + "learning_rate": 1.4081220719293076e-08, + "loss": 0.0647, + "step": 29320 + }, + { + "epoch": 111.94656488549619, + "grad_norm": 0.2547703981399536, + "learning_rate": 1.3670898805043798e-08, + "loss": 0.0762, + "step": 29330 + }, + { + "epoch": 111.98473282442748, + "grad_norm": 0.44975385069847107, + "learning_rate": 1.326663575083209e-08, + "loss": 0.0698, + "step": 29340 + }, + { + "epoch": 112.02290076335878, + "grad_norm": 0.2827472686767578, + "learning_rate": 1.286843204787669e-08, + "loss": 0.0679, + "step": 29350 + }, + { + "epoch": 112.06106870229007, + "grad_norm": 0.4250585734844208, + "learning_rate": 1.2476288180032214e-08, + "loss": 0.0638, + "step": 29360 + }, + { + "epoch": 112.09923664122137, + "grad_norm": 0.3234623670578003, + "learning_rate": 1.2090204623790292e-08, + "loss": 0.0611, + "step": 29370 + }, + { + "epoch": 112.13740458015268, + "grad_norm": 0.20758773386478424, + "learning_rate": 1.1710181848278435e-08, + "loss": 0.0717, + "step": 29380 + }, + { + "epoch": 112.17557251908397, + "grad_norm": 0.476921945810318, + "learning_rate": 1.133622031526116e-08, + "loss": 0.0667, + "step": 29390 + }, + { + "epoch": 112.21374045801527, + "grad_norm": 0.809885561466217, + "learning_rate": 1.0968320479136097e-08, + "loss": 0.0722, + "step": 29400 + }, + { + "epoch": 112.25190839694656, + "grad_norm": 0.218302920460701, + "learning_rate": 1.0606482786936768e-08, + "loss": 0.0659, + "step": 29410 + }, + { + "epoch": 112.29007633587786, + "grad_norm": 0.9261871576309204, + "learning_rate": 1.0250707678329808e-08, + "loss": 0.0777, + "step": 29420 + }, + { + "epoch": 112.32824427480917, + "grad_norm": 0.3251490890979767, + "learning_rate": 9.900995585615525e-09, + "loss": 0.0663, + "step": 29430 + }, + { + "epoch": 112.36641221374046, + "grad_norm": 0.4006260931491852, + "learning_rate": 9.55734693372734e-09, + "loss": 0.067, + "step": 29440 + }, + { + "epoch": 112.40458015267176, + "grad_norm": 0.3395634889602661, + "learning_rate": 9.219762140231237e-09, + "loss": 0.0695, + "step": 29450 + }, + { + "epoch": 112.44274809160305, + "grad_norm": 0.5408310294151306, + "learning_rate": 8.888241615322979e-09, + "loss": 0.0658, + "step": 29460 + }, + { + "epoch": 112.48091603053435, + "grad_norm": 0.22297032177448273, + "learning_rate": 8.562785761833114e-09, + "loss": 0.0761, + "step": 29470 + }, + { + "epoch": 112.51908396946565, + "grad_norm": 0.1803329437971115, + "learning_rate": 8.243394975219753e-09, + "loss": 0.0647, + "step": 29480 + }, + { + "epoch": 112.55725190839695, + "grad_norm": 0.4298841059207916, + "learning_rate": 7.930069643573568e-09, + "loss": 0.07, + "step": 29490 + }, + { + "epoch": 112.59541984732824, + "grad_norm": 0.27020174264907837, + "learning_rate": 7.622810147614457e-09, + "loss": 0.0712, + "step": 29500 + }, + { + "epoch": 112.63358778625954, + "grad_norm": 0.2004048377275467, + "learning_rate": 7.321616860690995e-09, + "loss": 0.0665, + "step": 29510 + }, + { + "epoch": 112.67175572519083, + "grad_norm": 0.25727108120918274, + "learning_rate": 7.026490148782095e-09, + "loss": 0.0726, + "step": 29520 + }, + { + "epoch": 112.70992366412214, + "grad_norm": 0.3128455877304077, + "learning_rate": 6.737430370494236e-09, + "loss": 0.0684, + "step": 29530 + }, + { + "epoch": 112.74809160305344, + "grad_norm": 0.27916234731674194, + "learning_rate": 6.4544378770625695e-09, + "loss": 0.0669, + "step": 29540 + }, + { + "epoch": 112.78625954198473, + "grad_norm": 0.26003262400627136, + "learning_rate": 6.177513012349257e-09, + "loss": 0.0651, + "step": 29550 + }, + { + "epoch": 112.82442748091603, + "grad_norm": 0.6421084403991699, + "learning_rate": 5.906656112844578e-09, + "loss": 0.0683, + "step": 29560 + }, + { + "epoch": 112.86259541984732, + "grad_norm": 0.2419942170381546, + "learning_rate": 5.641867507664156e-09, + "loss": 0.066, + "step": 29570 + }, + { + "epoch": 112.90076335877862, + "grad_norm": 0.4693322479724884, + "learning_rate": 5.383147518552845e-09, + "loss": 0.0672, + "step": 29580 + }, + { + "epoch": 112.93893129770993, + "grad_norm": 0.29389244318008423, + "learning_rate": 5.13049645987862e-09, + "loss": 0.0692, + "step": 29590 + }, + { + "epoch": 112.97709923664122, + "grad_norm": 0.5387037396430969, + "learning_rate": 4.883914638636467e-09, + "loss": 0.074, + "step": 29600 + }, + { + "epoch": 113.01526717557252, + "grad_norm": 0.2849062383174896, + "learning_rate": 4.6434023544467135e-09, + "loss": 0.0698, + "step": 29610 + }, + { + "epoch": 113.05343511450381, + "grad_norm": 0.2868102192878723, + "learning_rate": 4.408959899554477e-09, + "loss": 0.0694, + "step": 29620 + }, + { + "epoch": 113.09160305343511, + "grad_norm": 0.21665853261947632, + "learning_rate": 4.180587558829663e-09, + "loss": 0.0681, + "step": 29630 + }, + { + "epoch": 113.12977099236642, + "grad_norm": 0.2538401484489441, + "learning_rate": 3.9582856097658554e-09, + "loss": 0.0667, + "step": 29640 + }, + { + "epoch": 113.16793893129771, + "grad_norm": 0.4532374143600464, + "learning_rate": 3.74205432248087e-09, + "loss": 0.0852, + "step": 29650 + }, + { + "epoch": 113.20610687022901, + "grad_norm": 0.3058018386363983, + "learning_rate": 3.531893959716204e-09, + "loss": 0.0707, + "step": 29660 + }, + { + "epoch": 113.2442748091603, + "grad_norm": 0.21909840404987335, + "learning_rate": 3.327804776837029e-09, + "loss": 0.0932, + "step": 29670 + }, + { + "epoch": 113.2824427480916, + "grad_norm": 0.49850842356681824, + "learning_rate": 3.1297870218299776e-09, + "loss": 0.0721, + "step": 29680 + }, + { + "epoch": 113.3206106870229, + "grad_norm": 0.4520060420036316, + "learning_rate": 2.9378409353059133e-09, + "loss": 0.0695, + "step": 29690 + }, + { + "epoch": 113.3587786259542, + "grad_norm": 0.24441687762737274, + "learning_rate": 2.7519667504971593e-09, + "loss": 0.078, + "step": 29700 + }, + { + "epoch": 113.3969465648855, + "grad_norm": 0.4024795889854431, + "learning_rate": 2.572164693258605e-09, + "loss": 0.0649, + "step": 29710 + }, + { + "epoch": 113.43511450381679, + "grad_norm": 0.2969071567058563, + "learning_rate": 2.3984349820665997e-09, + "loss": 0.0697, + "step": 29720 + }, + { + "epoch": 113.47328244274809, + "grad_norm": 0.24398191273212433, + "learning_rate": 2.2307778280189484e-09, + "loss": 0.0691, + "step": 29730 + }, + { + "epoch": 113.5114503816794, + "grad_norm": 0.5313243269920349, + "learning_rate": 2.06919343483547e-09, + "loss": 0.0686, + "step": 29740 + }, + { + "epoch": 113.54961832061069, + "grad_norm": 0.33933746814727783, + "learning_rate": 1.9136819988557765e-09, + "loss": 0.0683, + "step": 29750 + }, + { + "epoch": 113.58778625954199, + "grad_norm": 0.39412805438041687, + "learning_rate": 1.764243709041491e-09, + "loss": 0.0755, + "step": 29760 + }, + { + "epoch": 113.62595419847328, + "grad_norm": 0.3232634961605072, + "learning_rate": 1.6208787469734755e-09, + "loss": 0.0709, + "step": 29770 + }, + { + "epoch": 113.66412213740458, + "grad_norm": 0.34813275933265686, + "learning_rate": 1.4835872868546041e-09, + "loss": 0.0706, + "step": 29780 + }, + { + "epoch": 113.70229007633588, + "grad_norm": 0.5944547057151794, + "learning_rate": 1.3523694955064338e-09, + "loss": 0.0737, + "step": 29790 + }, + { + "epoch": 113.74045801526718, + "grad_norm": 0.5624353289604187, + "learning_rate": 1.2272255323708682e-09, + "loss": 0.0642, + "step": 29800 + }, + { + "epoch": 113.77862595419847, + "grad_norm": 0.7441560626029968, + "learning_rate": 1.1081555495096042e-09, + "loss": 0.0699, + "step": 29810 + }, + { + "epoch": 113.81679389312977, + "grad_norm": 0.8023766875267029, + "learning_rate": 9.951596916041307e-10, + "loss": 0.0705, + "step": 29820 + }, + { + "epoch": 113.85496183206106, + "grad_norm": 0.3070337176322937, + "learning_rate": 8.88238095955174e-10, + "loss": 0.0678, + "step": 29830 + }, + { + "epoch": 113.89312977099236, + "grad_norm": 0.3391276001930237, + "learning_rate": 7.873908924821427e-10, + "loss": 0.0659, + "step": 29840 + }, + { + "epoch": 113.93129770992367, + "grad_norm": 0.2922404110431671, + "learning_rate": 6.926182037242379e-10, + "loss": 0.0701, + "step": 29850 + }, + { + "epoch": 113.96946564885496, + "grad_norm": 0.34531670808792114, + "learning_rate": 6.039201448387877e-10, + "loss": 0.0731, + "step": 29860 + }, + { + "epoch": 114.00763358778626, + "grad_norm": 0.4919748902320862, + "learning_rate": 5.212968236029126e-10, + "loss": 0.0744, + "step": 29870 + }, + { + "epoch": 114.04580152671755, + "grad_norm": 0.35671672224998474, + "learning_rate": 4.447483404118602e-10, + "loss": 0.07, + "step": 29880 + }, + { + "epoch": 114.08396946564885, + "grad_norm": 0.25368714332580566, + "learning_rate": 3.7427478827845033e-10, + "loss": 0.0624, + "step": 29890 + }, + { + "epoch": 114.12213740458016, + "grad_norm": 1.127943992614746, + "learning_rate": 3.0987625283473986e-10, + "loss": 0.09, + "step": 29900 + }, + { + "epoch": 114.16030534351145, + "grad_norm": 0.26309409737586975, + "learning_rate": 2.5155281233202324e-10, + "loss": 0.066, + "step": 29910 + }, + { + "epoch": 114.19847328244275, + "grad_norm": 0.26694461703300476, + "learning_rate": 1.9930453763750135e-10, + "loss": 0.0771, + "step": 29920 + }, + { + "epoch": 114.23664122137404, + "grad_norm": 0.3160324692726135, + "learning_rate": 1.531314922387228e-10, + "loss": 0.0669, + "step": 29930 + }, + { + "epoch": 114.27480916030534, + "grad_norm": 0.24074247479438782, + "learning_rate": 1.1303373224025305e-10, + "loss": 0.0718, + "step": 29940 + }, + { + "epoch": 114.31297709923665, + "grad_norm": 0.47037428617477417, + "learning_rate": 7.901130636367437e-11, + "loss": 0.0722, + "step": 29950 + }, + { + "epoch": 114.35114503816794, + "grad_norm": 0.5677265524864197, + "learning_rate": 5.1064255950361574e-11, + "loss": 0.0716, + "step": 29960 + }, + { + "epoch": 114.38931297709924, + "grad_norm": 0.3310537338256836, + "learning_rate": 2.9192614958706286e-11, + "loss": 0.0715, + "step": 29970 + }, + { + "epoch": 114.42748091603053, + "grad_norm": 0.5864256024360657, + "learning_rate": 1.3396409964117062e-11, + "loss": 0.072, + "step": 29980 + }, + { + "epoch": 114.46564885496183, + "grad_norm": 0.6437695026397705, + "learning_rate": 3.6756601606846574e-12, + "loss": 0.0669, + "step": 29990 + }, + { + "epoch": 114.50381679389314, + "grad_norm": 0.734368085861206, + "learning_rate": 3.037736062694308e-14, + "loss": 0.0751, + "step": 30000 + }, + { + "epoch": 114.50381679389314, + "step": 30000, + "total_flos": 0.0, + "train_loss": 0.0959479827662309, + "train_runtime": 13839.331, + "train_samples_per_second": 17.342, + "train_steps_per_second": 2.168 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 115, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}