{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9996597771540356, "eval_steps": 500, "global_step": 11756, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003402228459641065, "grad_norm": 7.6875, "learning_rate": 3.809091090277921e-07, "loss": 4.24, "step": 10 }, { "epoch": 0.00680445691928213, "grad_norm": 6.46875, "learning_rate": 7.618182180555842e-07, "loss": 4.4323, "step": 20 }, { "epoch": 0.010206685378923195, "grad_norm": 8.4375, "learning_rate": 1.1427273270833762e-06, "loss": 4.2758, "step": 30 }, { "epoch": 0.01360891383856426, "grad_norm": 7.53125, "learning_rate": 1.5236364361111684e-06, "loss": 4.1231, "step": 40 }, { "epoch": 0.017011142298205325, "grad_norm": 5.90625, "learning_rate": 1.9045455451389605e-06, "loss": 4.097, "step": 50 }, { "epoch": 0.02041337075784639, "grad_norm": 5.15625, "learning_rate": 2.2854546541667524e-06, "loss": 4.0712, "step": 60 }, { "epoch": 0.023815599217487455, "grad_norm": 4.5625, "learning_rate": 2.6663637631945448e-06, "loss": 3.8851, "step": 70 }, { "epoch": 0.02721782767712852, "grad_norm": 6.78125, "learning_rate": 3.0472728722223367e-06, "loss": 3.6937, "step": 80 }, { "epoch": 0.030620056136769585, "grad_norm": 8.25, "learning_rate": 3.4281819812501286e-06, "loss": 3.6468, "step": 90 }, { "epoch": 0.03402228459641065, "grad_norm": 9.625, "learning_rate": 3.809091090277921e-06, "loss": 3.4787, "step": 100 }, { "epoch": 0.03742451305605171, "grad_norm": 7.53125, "learning_rate": 4.190000199305713e-06, "loss": 3.3235, "step": 110 }, { "epoch": 0.04082674151569278, "grad_norm": 9.4375, "learning_rate": 4.570909308333505e-06, "loss": 3.2806, "step": 120 }, { "epoch": 0.04422896997533384, "grad_norm": 10.3125, "learning_rate": 4.951818417361297e-06, "loss": 3.0432, "step": 130 }, { "epoch": 0.04763119843497491, "grad_norm": 5.84375, "learning_rate": 5.3327275263890896e-06, "loss": 2.8991, "step": 140 }, { "epoch": 0.05103342689461597, "grad_norm": 4.1875, "learning_rate": 5.7136366354168815e-06, "loss": 2.8202, "step": 150 }, { "epoch": 0.05443565535425704, "grad_norm": 1.828125, "learning_rate": 6.094545744444673e-06, "loss": 2.6361, "step": 160 }, { "epoch": 0.0578378838138981, "grad_norm": 1.8359375, "learning_rate": 6.475454853472465e-06, "loss": 2.5525, "step": 170 }, { "epoch": 0.06124011227353917, "grad_norm": 1.765625, "learning_rate": 6.856363962500257e-06, "loss": 2.5685, "step": 180 }, { "epoch": 0.06464234073318023, "grad_norm": 2.125, "learning_rate": 7.237273071528049e-06, "loss": 2.5133, "step": 190 }, { "epoch": 0.0680445691928213, "grad_norm": 1.71875, "learning_rate": 7.618182180555842e-06, "loss": 2.4096, "step": 200 }, { "epoch": 0.07144679765246237, "grad_norm": 1.9140625, "learning_rate": 7.999091289583632e-06, "loss": 2.4864, "step": 210 }, { "epoch": 0.07484902611210342, "grad_norm": 1.9765625, "learning_rate": 8.380000398611426e-06, "loss": 2.4321, "step": 220 }, { "epoch": 0.07825125457174449, "grad_norm": 2.3125, "learning_rate": 8.760909507639218e-06, "loss": 2.3582, "step": 230 }, { "epoch": 0.08165348303138556, "grad_norm": 2.3125, "learning_rate": 9.14181861666701e-06, "loss": 2.3401, "step": 240 }, { "epoch": 0.08505571149102663, "grad_norm": 2.625, "learning_rate": 9.522727725694802e-06, "loss": 2.3312, "step": 250 }, { "epoch": 0.08845793995066768, "grad_norm": 1.9609375, "learning_rate": 9.903636834722594e-06, "loss": 2.3672, "step": 260 }, { "epoch": 0.09186016841030875, "grad_norm": 1.453125, "learning_rate": 1.0284545943750385e-05, "loss": 2.3025, "step": 270 }, { "epoch": 0.09526239686994982, "grad_norm": 1.46875, "learning_rate": 1.0665455052778179e-05, "loss": 2.3273, "step": 280 }, { "epoch": 0.09866462532959089, "grad_norm": 2.25, "learning_rate": 1.104636416180597e-05, "loss": 2.2746, "step": 290 }, { "epoch": 0.10206685378923194, "grad_norm": 1.5859375, "learning_rate": 1.1427273270833763e-05, "loss": 2.3196, "step": 300 }, { "epoch": 0.10546908224887301, "grad_norm": 1.5078125, "learning_rate": 1.1808182379861553e-05, "loss": 2.2645, "step": 310 }, { "epoch": 0.10887131070851408, "grad_norm": 1.6640625, "learning_rate": 1.2189091488889347e-05, "loss": 2.2902, "step": 320 }, { "epoch": 0.11227353916815515, "grad_norm": 1.5859375, "learning_rate": 1.2570000597917139e-05, "loss": 2.2503, "step": 330 }, { "epoch": 0.1156757676277962, "grad_norm": 1.5, "learning_rate": 1.295090970694493e-05, "loss": 2.1882, "step": 340 }, { "epoch": 0.11907799608743727, "grad_norm": 1.359375, "learning_rate": 1.3331818815972723e-05, "loss": 2.2266, "step": 350 }, { "epoch": 0.12248022454707834, "grad_norm": 1.8125, "learning_rate": 1.344607904627746e-05, "loss": 2.2011, "step": 360 }, { "epoch": 0.1258824530067194, "grad_norm": 1.4765625, "learning_rate": 1.3446017810126854e-05, "loss": 2.1828, "step": 370 }, { "epoch": 0.12928468146636046, "grad_norm": 1.5234375, "learning_rate": 1.3445905544333626e-05, "loss": 2.2727, "step": 380 }, { "epoch": 0.13268690992600152, "grad_norm": 1.6328125, "learning_rate": 1.344574224974991e-05, "loss": 2.2222, "step": 390 }, { "epoch": 0.1360891383856426, "grad_norm": 1.59375, "learning_rate": 1.3445527927615165e-05, "loss": 2.2107, "step": 400 }, { "epoch": 0.13949136684528365, "grad_norm": 1.515625, "learning_rate": 1.3445262579556173e-05, "loss": 2.1671, "step": 410 }, { "epoch": 0.14289359530492474, "grad_norm": 1.3671875, "learning_rate": 1.3444946207587011e-05, "loss": 2.1878, "step": 420 }, { "epoch": 0.1462958237645658, "grad_norm": 1.4453125, "learning_rate": 1.3444578814109056e-05, "loss": 2.1358, "step": 430 }, { "epoch": 0.14969805222420685, "grad_norm": 1.734375, "learning_rate": 1.3444160401910943e-05, "loss": 2.1564, "step": 440 }, { "epoch": 0.15310028068384793, "grad_norm": 1.4765625, "learning_rate": 1.3443690974168565e-05, "loss": 2.1756, "step": 450 }, { "epoch": 0.15650250914348898, "grad_norm": 1.546875, "learning_rate": 1.344317053444504e-05, "loss": 2.1606, "step": 460 }, { "epoch": 0.15990473760313004, "grad_norm": 1.78125, "learning_rate": 1.344259908669068e-05, "loss": 2.2352, "step": 470 }, { "epoch": 0.16330696606277112, "grad_norm": 1.5078125, "learning_rate": 1.3441976635242969e-05, "loss": 2.1258, "step": 480 }, { "epoch": 0.16670919452241217, "grad_norm": 1.6484375, "learning_rate": 1.3441303184826526e-05, "loss": 2.1533, "step": 490 }, { "epoch": 0.17011142298205326, "grad_norm": 1.78125, "learning_rate": 1.3440578740553065e-05, "loss": 2.1179, "step": 500 }, { "epoch": 0.1735136514416943, "grad_norm": 1.484375, "learning_rate": 1.3439803307921367e-05, "loss": 2.1868, "step": 510 }, { "epoch": 0.17691587990133537, "grad_norm": 1.671875, "learning_rate": 1.343897689281723e-05, "loss": 2.1144, "step": 520 }, { "epoch": 0.18031810836097645, "grad_norm": 1.5078125, "learning_rate": 1.343809950151342e-05, "loss": 2.1722, "step": 530 }, { "epoch": 0.1837203368206175, "grad_norm": 1.6171875, "learning_rate": 1.3437171140669643e-05, "loss": 2.1725, "step": 540 }, { "epoch": 0.18712256528025856, "grad_norm": 1.5234375, "learning_rate": 1.3436191817332471e-05, "loss": 2.1871, "step": 550 }, { "epoch": 0.19052479373989964, "grad_norm": 1.7890625, "learning_rate": 1.3435161538935297e-05, "loss": 2.2134, "step": 560 }, { "epoch": 0.1939270221995407, "grad_norm": 1.78125, "learning_rate": 1.3434080313298288e-05, "loss": 2.1545, "step": 570 }, { "epoch": 0.19732925065918178, "grad_norm": 1.6328125, "learning_rate": 1.3432948148628312e-05, "loss": 2.1173, "step": 580 }, { "epoch": 0.20073147911882283, "grad_norm": 1.640625, "learning_rate": 1.3431765053518884e-05, "loss": 2.1703, "step": 590 }, { "epoch": 0.20413370757846389, "grad_norm": 1.6796875, "learning_rate": 1.3430531036950099e-05, "loss": 2.1662, "step": 600 }, { "epoch": 0.20753593603810497, "grad_norm": 1.6171875, "learning_rate": 1.3429246108288562e-05, "loss": 2.153, "step": 610 }, { "epoch": 0.21093816449774602, "grad_norm": 1.6328125, "learning_rate": 1.3427910277287318e-05, "loss": 2.1421, "step": 620 }, { "epoch": 0.21434039295738708, "grad_norm": 1.4453125, "learning_rate": 1.3426523554085776e-05, "loss": 2.1315, "step": 630 }, { "epoch": 0.21774262141702816, "grad_norm": 1.5703125, "learning_rate": 1.342508594920964e-05, "loss": 2.1187, "step": 640 }, { "epoch": 0.22114484987666921, "grad_norm": 1.7578125, "learning_rate": 1.342359747357082e-05, "loss": 2.1447, "step": 650 }, { "epoch": 0.2245470783363103, "grad_norm": 1.671875, "learning_rate": 1.3422058138467349e-05, "loss": 2.1614, "step": 660 }, { "epoch": 0.22794930679595135, "grad_norm": 1.5390625, "learning_rate": 1.3420467955583304e-05, "loss": 2.1521, "step": 670 }, { "epoch": 0.2313515352555924, "grad_norm": 1.6953125, "learning_rate": 1.3418826936988714e-05, "loss": 2.1474, "step": 680 }, { "epoch": 0.2347537637152335, "grad_norm": 1.6484375, "learning_rate": 1.3417135095139467e-05, "loss": 2.1887, "step": 690 }, { "epoch": 0.23815599217487454, "grad_norm": 1.71875, "learning_rate": 1.341539244287722e-05, "loss": 2.1432, "step": 700 }, { "epoch": 0.2415582206345156, "grad_norm": 1.8046875, "learning_rate": 1.3413598993429295e-05, "loss": 2.1202, "step": 710 }, { "epoch": 0.24496044909415668, "grad_norm": 1.7578125, "learning_rate": 1.3411754760408584e-05, "loss": 2.201, "step": 720 }, { "epoch": 0.24836267755379773, "grad_norm": 1.5390625, "learning_rate": 1.3409859757813437e-05, "loss": 2.104, "step": 730 }, { "epoch": 0.2517649060134388, "grad_norm": 1.703125, "learning_rate": 1.3407914000027573e-05, "loss": 2.1118, "step": 740 }, { "epoch": 0.25516713447307987, "grad_norm": 1.5546875, "learning_rate": 1.3405917501819956e-05, "loss": 2.1533, "step": 750 }, { "epoch": 0.2585693629327209, "grad_norm": 1.3828125, "learning_rate": 1.340387027834468e-05, "loss": 2.0738, "step": 760 }, { "epoch": 0.261971591392362, "grad_norm": 1.625, "learning_rate": 1.3401772345140874e-05, "loss": 2.1696, "step": 770 }, { "epoch": 0.26537381985200303, "grad_norm": 1.921875, "learning_rate": 1.3399623718132557e-05, "loss": 2.0847, "step": 780 }, { "epoch": 0.26877604831164414, "grad_norm": 1.5390625, "learning_rate": 1.3397424413628542e-05, "loss": 2.1644, "step": 790 }, { "epoch": 0.2721782767712852, "grad_norm": 1.640625, "learning_rate": 1.3395174448322298e-05, "loss": 2.0891, "step": 800 }, { "epoch": 0.27558050523092625, "grad_norm": 1.9453125, "learning_rate": 1.3392873839291825e-05, "loss": 2.1638, "step": 810 }, { "epoch": 0.2789827336905673, "grad_norm": 1.625, "learning_rate": 1.339052260399953e-05, "loss": 2.078, "step": 820 }, { "epoch": 0.28238496215020836, "grad_norm": 1.7890625, "learning_rate": 1.3388120760292085e-05, "loss": 2.1191, "step": 830 }, { "epoch": 0.2857871906098495, "grad_norm": 1.765625, "learning_rate": 1.33856683264003e-05, "loss": 2.0554, "step": 840 }, { "epoch": 0.2891894190694905, "grad_norm": 1.8203125, "learning_rate": 1.3383165320938983e-05, "loss": 2.0385, "step": 850 }, { "epoch": 0.2925916475291316, "grad_norm": 1.7109375, "learning_rate": 1.3380611762906796e-05, "loss": 2.1071, "step": 860 }, { "epoch": 0.29599387598877264, "grad_norm": 1.6640625, "learning_rate": 1.3378007671686113e-05, "loss": 2.1171, "step": 870 }, { "epoch": 0.2993961044484137, "grad_norm": 1.4609375, "learning_rate": 1.337535306704287e-05, "loss": 2.1264, "step": 880 }, { "epoch": 0.3027983329080548, "grad_norm": 1.75, "learning_rate": 1.337264796912642e-05, "loss": 2.0562, "step": 890 }, { "epoch": 0.30620056136769586, "grad_norm": 1.78125, "learning_rate": 1.3369892398469373e-05, "loss": 2.1343, "step": 900 }, { "epoch": 0.3096027898273369, "grad_norm": 1.53125, "learning_rate": 1.3367086375987447e-05, "loss": 2.0563, "step": 910 }, { "epoch": 0.31300501828697797, "grad_norm": 1.7578125, "learning_rate": 1.3364229922979311e-05, "loss": 2.1302, "step": 920 }, { "epoch": 0.316407246746619, "grad_norm": 1.609375, "learning_rate": 1.3361323061126409e-05, "loss": 2.0733, "step": 930 }, { "epoch": 0.3198094752062601, "grad_norm": 1.921875, "learning_rate": 1.3358365812492812e-05, "loss": 2.1027, "step": 940 }, { "epoch": 0.3232117036659012, "grad_norm": 1.7265625, "learning_rate": 1.3355358199525042e-05, "loss": 2.0455, "step": 950 }, { "epoch": 0.32661393212554224, "grad_norm": 1.6953125, "learning_rate": 1.3352300245051904e-05, "loss": 2.0785, "step": 960 }, { "epoch": 0.3300161605851833, "grad_norm": 1.671875, "learning_rate": 1.3349191972284314e-05, "loss": 2.1594, "step": 970 }, { "epoch": 0.33341838904482435, "grad_norm": 1.78125, "learning_rate": 1.3346033404815114e-05, "loss": 2.066, "step": 980 }, { "epoch": 0.3368206175044654, "grad_norm": 1.59375, "learning_rate": 1.3342824566618907e-05, "loss": 2.1451, "step": 990 }, { "epoch": 0.3402228459641065, "grad_norm": 1.6953125, "learning_rate": 1.3339565482051866e-05, "loss": 2.152, "step": 1000 }, { "epoch": 0.34362507442374757, "grad_norm": 1.7109375, "learning_rate": 1.3336256175851549e-05, "loss": 2.1232, "step": 1010 }, { "epoch": 0.3470273028833886, "grad_norm": 1.8828125, "learning_rate": 1.3332896673136717e-05, "loss": 2.1158, "step": 1020 }, { "epoch": 0.3504295313430297, "grad_norm": 1.7421875, "learning_rate": 1.3329486999407136e-05, "loss": 2.102, "step": 1030 }, { "epoch": 0.35383175980267073, "grad_norm": 1.8125, "learning_rate": 1.3326027180543387e-05, "loss": 2.1266, "step": 1040 }, { "epoch": 0.35723398826231184, "grad_norm": 1.421875, "learning_rate": 1.3322517242806673e-05, "loss": 2.0884, "step": 1050 }, { "epoch": 0.3606362167219529, "grad_norm": 1.5546875, "learning_rate": 1.3318957212838615e-05, "loss": 2.0793, "step": 1060 }, { "epoch": 0.36403844518159395, "grad_norm": 1.78125, "learning_rate": 1.3315347117661048e-05, "loss": 2.0574, "step": 1070 }, { "epoch": 0.367440673641235, "grad_norm": 1.6171875, "learning_rate": 1.3311686984675822e-05, "loss": 2.0716, "step": 1080 }, { "epoch": 0.37084290210087606, "grad_norm": 1.8671875, "learning_rate": 1.3307976841664591e-05, "loss": 2.0523, "step": 1090 }, { "epoch": 0.3742451305605171, "grad_norm": 1.703125, "learning_rate": 1.33042167167886e-05, "loss": 2.0203, "step": 1100 }, { "epoch": 0.3776473590201582, "grad_norm": 1.546875, "learning_rate": 1.330040663858848e-05, "loss": 2.0823, "step": 1110 }, { "epoch": 0.3810495874797993, "grad_norm": 1.796875, "learning_rate": 1.3296546635984012e-05, "loss": 2.0758, "step": 1120 }, { "epoch": 0.38445181593944033, "grad_norm": 1.7421875, "learning_rate": 1.3292636738273931e-05, "loss": 2.1138, "step": 1130 }, { "epoch": 0.3878540443990814, "grad_norm": 1.5, "learning_rate": 1.3288676975135689e-05, "loss": 2.0277, "step": 1140 }, { "epoch": 0.39125627285872244, "grad_norm": 1.5703125, "learning_rate": 1.3284667376625236e-05, "loss": 2.042, "step": 1150 }, { "epoch": 0.39465850131836355, "grad_norm": 1.8515625, "learning_rate": 1.3280607973176785e-05, "loss": 2.114, "step": 1160 }, { "epoch": 0.3980607297780046, "grad_norm": 1.796875, "learning_rate": 1.327649879560259e-05, "loss": 2.0477, "step": 1170 }, { "epoch": 0.40146295823764566, "grad_norm": 1.8046875, "learning_rate": 1.3272339875092701e-05, "loss": 2.0101, "step": 1180 }, { "epoch": 0.4048651866972867, "grad_norm": 1.984375, "learning_rate": 1.3268131243214744e-05, "loss": 2.1261, "step": 1190 }, { "epoch": 0.40826741515692777, "grad_norm": 1.9375, "learning_rate": 1.326387293191366e-05, "loss": 2.0788, "step": 1200 }, { "epoch": 0.4116696436165688, "grad_norm": 1.78125, "learning_rate": 1.325956497351148e-05, "loss": 2.0694, "step": 1210 }, { "epoch": 0.41507187207620994, "grad_norm": 1.9296875, "learning_rate": 1.3255207400707076e-05, "loss": 2.11, "step": 1220 }, { "epoch": 0.418474100535851, "grad_norm": 1.796875, "learning_rate": 1.3250800246575906e-05, "loss": 2.0621, "step": 1230 }, { "epoch": 0.42187632899549204, "grad_norm": 1.6875, "learning_rate": 1.3246343544569764e-05, "loss": 2.0923, "step": 1240 }, { "epoch": 0.4252785574551331, "grad_norm": 1.6640625, "learning_rate": 1.3241837328516535e-05, "loss": 2.1005, "step": 1250 }, { "epoch": 0.42868078591477415, "grad_norm": 1.953125, "learning_rate": 1.323728163261993e-05, "loss": 2.0634, "step": 1260 }, { "epoch": 0.43208301437441526, "grad_norm": 1.859375, "learning_rate": 1.323267649145923e-05, "loss": 2.0635, "step": 1270 }, { "epoch": 0.4354852428340563, "grad_norm": 1.640625, "learning_rate": 1.3228021939989018e-05, "loss": 2.131, "step": 1280 }, { "epoch": 0.4388874712936974, "grad_norm": 1.7421875, "learning_rate": 1.3223318013538927e-05, "loss": 2.1021, "step": 1290 }, { "epoch": 0.44228969975333843, "grad_norm": 1.734375, "learning_rate": 1.3218564747813355e-05, "loss": 2.0758, "step": 1300 }, { "epoch": 0.4456919282129795, "grad_norm": 1.6953125, "learning_rate": 1.3213762178891202e-05, "loss": 2.0198, "step": 1310 }, { "epoch": 0.4490941566726206, "grad_norm": 1.8515625, "learning_rate": 1.3208910343225603e-05, "loss": 2.1226, "step": 1320 }, { "epoch": 0.45249638513226165, "grad_norm": 1.703125, "learning_rate": 1.3204009277643636e-05, "loss": 2.077, "step": 1330 }, { "epoch": 0.4558986135919027, "grad_norm": 1.6953125, "learning_rate": 1.3199059019346055e-05, "loss": 2.1154, "step": 1340 }, { "epoch": 0.45930084205154376, "grad_norm": 1.8984375, "learning_rate": 1.3194059605907003e-05, "loss": 2.1109, "step": 1350 }, { "epoch": 0.4627030705111848, "grad_norm": 1.8203125, "learning_rate": 1.318901107527373e-05, "loss": 2.1108, "step": 1360 }, { "epoch": 0.46610529897082587, "grad_norm": 2.09375, "learning_rate": 1.3183913465766294e-05, "loss": 2.1203, "step": 1370 }, { "epoch": 0.469507527430467, "grad_norm": 1.8671875, "learning_rate": 1.3178766816077288e-05, "loss": 2.0667, "step": 1380 }, { "epoch": 0.47290975589010803, "grad_norm": 1.8671875, "learning_rate": 1.317357116527153e-05, "loss": 2.0428, "step": 1390 }, { "epoch": 0.4763119843497491, "grad_norm": 1.703125, "learning_rate": 1.3168326552785775e-05, "loss": 2.0836, "step": 1400 }, { "epoch": 0.47971421280939014, "grad_norm": 1.6015625, "learning_rate": 1.3163033018428418e-05, "loss": 2.0031, "step": 1410 }, { "epoch": 0.4831164412690312, "grad_norm": 2.0625, "learning_rate": 1.315769060237918e-05, "loss": 2.096, "step": 1420 }, { "epoch": 0.4865186697286723, "grad_norm": 1.828125, "learning_rate": 1.3152299345188815e-05, "loss": 2.0325, "step": 1430 }, { "epoch": 0.48992089818831336, "grad_norm": 1.65625, "learning_rate": 1.3146859287778799e-05, "loss": 2.0444, "step": 1440 }, { "epoch": 0.4933231266479544, "grad_norm": 2.140625, "learning_rate": 1.3141370471441016e-05, "loss": 2.0971, "step": 1450 }, { "epoch": 0.49672535510759547, "grad_norm": 2.0, "learning_rate": 1.3135832937837444e-05, "loss": 2.0014, "step": 1460 }, { "epoch": 0.5001275835672365, "grad_norm": 1.6796875, "learning_rate": 1.3130246728999852e-05, "loss": 2.0086, "step": 1470 }, { "epoch": 0.5035298120268776, "grad_norm": 1.78125, "learning_rate": 1.3124611887329459e-05, "loss": 2.0079, "step": 1480 }, { "epoch": 0.5069320404865186, "grad_norm": 1.9296875, "learning_rate": 1.3118928455596627e-05, "loss": 2.0654, "step": 1490 }, { "epoch": 0.5103342689461597, "grad_norm": 1.875, "learning_rate": 1.3113196476940538e-05, "loss": 2.0195, "step": 1500 }, { "epoch": 0.5137364974058009, "grad_norm": 1.8203125, "learning_rate": 1.3107415994868855e-05, "loss": 2.0196, "step": 1510 }, { "epoch": 0.5171387258654419, "grad_norm": 2.125, "learning_rate": 1.3101587053257404e-05, "loss": 2.0552, "step": 1520 }, { "epoch": 0.520540954325083, "grad_norm": 1.734375, "learning_rate": 1.3095709696349833e-05, "loss": 2.0833, "step": 1530 }, { "epoch": 0.523943182784724, "grad_norm": 1.765625, "learning_rate": 1.3089783968757277e-05, "loss": 2.1067, "step": 1540 }, { "epoch": 0.5273454112443651, "grad_norm": 1.9921875, "learning_rate": 1.308380991545802e-05, "loss": 2.0313, "step": 1550 }, { "epoch": 0.5307476397040061, "grad_norm": 1.9296875, "learning_rate": 1.3077787581797163e-05, "loss": 2.0918, "step": 1560 }, { "epoch": 0.5341498681636472, "grad_norm": 1.609375, "learning_rate": 1.3071717013486259e-05, "loss": 2.0505, "step": 1570 }, { "epoch": 0.5375520966232883, "grad_norm": 1.421875, "learning_rate": 1.3065598256602989e-05, "loss": 2.1166, "step": 1580 }, { "epoch": 0.5409543250829293, "grad_norm": 1.6015625, "learning_rate": 1.3059431357590797e-05, "loss": 2.1196, "step": 1590 }, { "epoch": 0.5443565535425704, "grad_norm": 1.765625, "learning_rate": 1.3053216363258537e-05, "loss": 2.0623, "step": 1600 }, { "epoch": 0.5477587820022114, "grad_norm": 1.671875, "learning_rate": 1.3046953320780136e-05, "loss": 2.051, "step": 1610 }, { "epoch": 0.5511610104618525, "grad_norm": 1.734375, "learning_rate": 1.304064227769421e-05, "loss": 2.0341, "step": 1620 }, { "epoch": 0.5545632389214936, "grad_norm": 1.8671875, "learning_rate": 1.3034283281903722e-05, "loss": 2.001, "step": 1630 }, { "epoch": 0.5579654673811346, "grad_norm": 2.125, "learning_rate": 1.3027876381675611e-05, "loss": 1.9871, "step": 1640 }, { "epoch": 0.5613676958407757, "grad_norm": 1.8359375, "learning_rate": 1.3021421625640427e-05, "loss": 2.0712, "step": 1650 }, { "epoch": 0.5647699243004167, "grad_norm": 1.8671875, "learning_rate": 1.3014919062791965e-05, "loss": 2.0444, "step": 1660 }, { "epoch": 0.5681721527600578, "grad_norm": 1.9609375, "learning_rate": 1.3008368742486882e-05, "loss": 2.0598, "step": 1670 }, { "epoch": 0.571574381219699, "grad_norm": 1.8828125, "learning_rate": 1.300177071444434e-05, "loss": 2.0744, "step": 1680 }, { "epoch": 0.57497660967934, "grad_norm": 2.109375, "learning_rate": 1.299512502874561e-05, "loss": 1.9854, "step": 1690 }, { "epoch": 0.578378838138981, "grad_norm": 2.0, "learning_rate": 1.2988431735833709e-05, "loss": 2.0348, "step": 1700 }, { "epoch": 0.581781066598622, "grad_norm": 1.84375, "learning_rate": 1.2981690886513001e-05, "loss": 2.0189, "step": 1710 }, { "epoch": 0.5851832950582632, "grad_norm": 1.875, "learning_rate": 1.2974902531948826e-05, "loss": 1.9997, "step": 1720 }, { "epoch": 0.5885855235179043, "grad_norm": 1.6640625, "learning_rate": 1.2968066723667104e-05, "loss": 1.9861, "step": 1730 }, { "epoch": 0.5919877519775453, "grad_norm": 1.796875, "learning_rate": 1.2961183513553937e-05, "loss": 2.0284, "step": 1740 }, { "epoch": 0.5953899804371864, "grad_norm": 1.734375, "learning_rate": 1.2954252953855236e-05, "loss": 2.0376, "step": 1750 }, { "epoch": 0.5987922088968274, "grad_norm": 1.7734375, "learning_rate": 1.2947275097176301e-05, "loss": 2.0059, "step": 1760 }, { "epoch": 0.6021944373564685, "grad_norm": 2.09375, "learning_rate": 1.2940249996481436e-05, "loss": 2.0906, "step": 1770 }, { "epoch": 0.6055966658161096, "grad_norm": 1.8359375, "learning_rate": 1.2933177705093541e-05, "loss": 2.0076, "step": 1780 }, { "epoch": 0.6089988942757506, "grad_norm": 1.7265625, "learning_rate": 1.2926058276693715e-05, "loss": 2.0247, "step": 1790 }, { "epoch": 0.6124011227353917, "grad_norm": 1.8359375, "learning_rate": 1.2918891765320837e-05, "loss": 2.113, "step": 1800 }, { "epoch": 0.6158033511950327, "grad_norm": 1.671875, "learning_rate": 1.2911678225371164e-05, "loss": 2.0201, "step": 1810 }, { "epoch": 0.6192055796546738, "grad_norm": 1.8828125, "learning_rate": 1.2904417711597916e-05, "loss": 2.0172, "step": 1820 }, { "epoch": 0.6226078081143149, "grad_norm": 1.9609375, "learning_rate": 1.289711027911086e-05, "loss": 2.1396, "step": 1830 }, { "epoch": 0.6260100365739559, "grad_norm": 1.75, "learning_rate": 1.2889755983375892e-05, "loss": 2.045, "step": 1840 }, { "epoch": 0.629412265033597, "grad_norm": 1.9375, "learning_rate": 1.2882354880214616e-05, "loss": 2.012, "step": 1850 }, { "epoch": 0.632814493493238, "grad_norm": 1.8671875, "learning_rate": 1.2874907025803922e-05, "loss": 2.058, "step": 1860 }, { "epoch": 0.6362167219528791, "grad_norm": 1.8359375, "learning_rate": 1.2867412476675554e-05, "loss": 2.0796, "step": 1870 }, { "epoch": 0.6396189504125201, "grad_norm": 1.8671875, "learning_rate": 1.2859871289715688e-05, "loss": 2.0956, "step": 1880 }, { "epoch": 0.6430211788721613, "grad_norm": 1.7421875, "learning_rate": 1.2852283522164496e-05, "loss": 1.983, "step": 1890 }, { "epoch": 0.6464234073318024, "grad_norm": 1.921875, "learning_rate": 1.2844649231615713e-05, "loss": 1.9861, "step": 1900 }, { "epoch": 0.6498256357914434, "grad_norm": 1.890625, "learning_rate": 1.2836968476016196e-05, "loss": 2.0683, "step": 1910 }, { "epoch": 0.6532278642510845, "grad_norm": 1.6875, "learning_rate": 1.2829241313665494e-05, "loss": 2.0916, "step": 1920 }, { "epoch": 0.6566300927107255, "grad_norm": 1.609375, "learning_rate": 1.2821467803215395e-05, "loss": 2.0254, "step": 1930 }, { "epoch": 0.6600323211703666, "grad_norm": 1.9765625, "learning_rate": 1.2813648003669482e-05, "loss": 2.0332, "step": 1940 }, { "epoch": 0.6634345496300077, "grad_norm": 1.9140625, "learning_rate": 1.2805781974382694e-05, "loss": 2.0225, "step": 1950 }, { "epoch": 0.6668367780896487, "grad_norm": 1.859375, "learning_rate": 1.2797869775060866e-05, "loss": 2.0563, "step": 1960 }, { "epoch": 0.6702390065492898, "grad_norm": 1.6953125, "learning_rate": 1.2789911465760281e-05, "loss": 2.0027, "step": 1970 }, { "epoch": 0.6736412350089308, "grad_norm": 1.890625, "learning_rate": 1.2781907106887209e-05, "loss": 1.9895, "step": 1980 }, { "epoch": 0.6770434634685719, "grad_norm": 2.015625, "learning_rate": 1.2773856759197455e-05, "loss": 2.0175, "step": 1990 }, { "epoch": 0.680445691928213, "grad_norm": 1.7890625, "learning_rate": 1.2765760483795895e-05, "loss": 2.0702, "step": 2000 }, { "epoch": 0.683847920387854, "grad_norm": 1.796875, "learning_rate": 1.275761834213601e-05, "loss": 2.023, "step": 2010 }, { "epoch": 0.6872501488474951, "grad_norm": 1.9140625, "learning_rate": 1.2749430396019423e-05, "loss": 2.0051, "step": 2020 }, { "epoch": 0.6906523773071361, "grad_norm": 1.9765625, "learning_rate": 1.2741196707595429e-05, "loss": 2.017, "step": 2030 }, { "epoch": 0.6940546057667772, "grad_norm": 1.9296875, "learning_rate": 1.273291733936052e-05, "loss": 2.0481, "step": 2040 }, { "epoch": 0.6974568342264184, "grad_norm": 1.7265625, "learning_rate": 1.2724592354157912e-05, "loss": 2.0281, "step": 2050 }, { "epoch": 0.7008590626860594, "grad_norm": 1.8984375, "learning_rate": 1.2716221815177076e-05, "loss": 2.0459, "step": 2060 }, { "epoch": 0.7042612911457005, "grad_norm": 2.21875, "learning_rate": 1.2707805785953245e-05, "loss": 2.0705, "step": 2070 }, { "epoch": 0.7076635196053415, "grad_norm": 2.109375, "learning_rate": 1.2699344330366942e-05, "loss": 2.0759, "step": 2080 }, { "epoch": 0.7110657480649826, "grad_norm": 1.765625, "learning_rate": 1.2690837512643495e-05, "loss": 2.0324, "step": 2090 }, { "epoch": 0.7144679765246237, "grad_norm": 1.75, "learning_rate": 1.2682285397352535e-05, "loss": 1.9784, "step": 2100 }, { "epoch": 0.7178702049842647, "grad_norm": 1.9140625, "learning_rate": 1.2673688049407526e-05, "loss": 1.9902, "step": 2110 }, { "epoch": 0.7212724334439058, "grad_norm": 1.890625, "learning_rate": 1.266504553406526e-05, "loss": 2.0631, "step": 2120 }, { "epoch": 0.7246746619035468, "grad_norm": 2.015625, "learning_rate": 1.2656357916925368e-05, "loss": 2.0039, "step": 2130 }, { "epoch": 0.7280768903631879, "grad_norm": 2.15625, "learning_rate": 1.2647625263929817e-05, "loss": 1.9975, "step": 2140 }, { "epoch": 0.7314791188228289, "grad_norm": 1.71875, "learning_rate": 1.2638847641362408e-05, "loss": 2.0368, "step": 2150 }, { "epoch": 0.73488134728247, "grad_norm": 1.9296875, "learning_rate": 1.2630025115848282e-05, "loss": 2.0954, "step": 2160 }, { "epoch": 0.7382835757421111, "grad_norm": 1.6484375, "learning_rate": 1.2621157754353404e-05, "loss": 2.0297, "step": 2170 }, { "epoch": 0.7416858042017521, "grad_norm": 1.65625, "learning_rate": 1.2612245624184062e-05, "loss": 2.0445, "step": 2180 }, { "epoch": 0.7450880326613932, "grad_norm": 1.7578125, "learning_rate": 1.2603288792986354e-05, "loss": 2.0587, "step": 2190 }, { "epoch": 0.7484902611210342, "grad_norm": 1.8203125, "learning_rate": 1.2594287328745672e-05, "loss": 2.0126, "step": 2200 }, { "epoch": 0.7518924895806753, "grad_norm": 1.7890625, "learning_rate": 1.258524129978619e-05, "loss": 2.0213, "step": 2210 }, { "epoch": 0.7552947180403164, "grad_norm": 1.953125, "learning_rate": 1.257615077477034e-05, "loss": 1.9826, "step": 2220 }, { "epoch": 0.7586969464999574, "grad_norm": 1.8515625, "learning_rate": 1.25670158226983e-05, "loss": 2.0467, "step": 2230 }, { "epoch": 0.7620991749595986, "grad_norm": 1.9765625, "learning_rate": 1.2557836512907456e-05, "loss": 1.9924, "step": 2240 }, { "epoch": 0.7655014034192396, "grad_norm": 2.140625, "learning_rate": 1.2548612915071894e-05, "loss": 1.9864, "step": 2250 }, { "epoch": 0.7689036318788807, "grad_norm": 1.921875, "learning_rate": 1.2539345099201851e-05, "loss": 1.9966, "step": 2260 }, { "epoch": 0.7723058603385218, "grad_norm": 1.875, "learning_rate": 1.2530033135643203e-05, "loss": 2.0092, "step": 2270 }, { "epoch": 0.7757080887981628, "grad_norm": 2.1875, "learning_rate": 1.2520677095076918e-05, "loss": 1.97, "step": 2280 }, { "epoch": 0.7791103172578039, "grad_norm": 1.96875, "learning_rate": 1.2511277048518522e-05, "loss": 1.9781, "step": 2290 }, { "epoch": 0.7825125457174449, "grad_norm": 1.953125, "learning_rate": 1.2501833067317562e-05, "loss": 2.0167, "step": 2300 }, { "epoch": 0.785914774177086, "grad_norm": 2.0, "learning_rate": 1.2492345223157068e-05, "loss": 2.0108, "step": 2310 }, { "epoch": 0.7893170026367271, "grad_norm": 1.6328125, "learning_rate": 1.2482813588053004e-05, "loss": 2.0094, "step": 2320 }, { "epoch": 0.7927192310963681, "grad_norm": 1.3671875, "learning_rate": 1.2473238234353713e-05, "loss": 1.9266, "step": 2330 }, { "epoch": 0.7961214595560092, "grad_norm": 1.765625, "learning_rate": 1.2463619234739388e-05, "loss": 1.9982, "step": 2340 }, { "epoch": 0.7995236880156502, "grad_norm": 1.875, "learning_rate": 1.2453956662221504e-05, "loss": 2.0688, "step": 2350 }, { "epoch": 0.8029259164752913, "grad_norm": 1.890625, "learning_rate": 1.2444250590142271e-05, "loss": 1.9658, "step": 2360 }, { "epoch": 0.8063281449349324, "grad_norm": 1.953125, "learning_rate": 1.2434501092174075e-05, "loss": 1.9954, "step": 2370 }, { "epoch": 0.8097303733945734, "grad_norm": 1.7421875, "learning_rate": 1.242470824231892e-05, "loss": 2.0507, "step": 2380 }, { "epoch": 0.8131326018542145, "grad_norm": 1.7109375, "learning_rate": 1.241487211490786e-05, "loss": 2.0469, "step": 2390 }, { "epoch": 0.8165348303138555, "grad_norm": 1.8203125, "learning_rate": 1.2404992784600451e-05, "loss": 2.0436, "step": 2400 }, { "epoch": 0.8199370587734967, "grad_norm": 1.78125, "learning_rate": 1.2395070326384164e-05, "loss": 2.0195, "step": 2410 }, { "epoch": 0.8233392872331377, "grad_norm": 2.21875, "learning_rate": 1.238510481557383e-05, "loss": 1.9674, "step": 2420 }, { "epoch": 0.8267415156927788, "grad_norm": 1.9609375, "learning_rate": 1.2375096327811061e-05, "loss": 1.9918, "step": 2430 }, { "epoch": 0.8301437441524199, "grad_norm": 2.078125, "learning_rate": 1.2365044939063687e-05, "loss": 2.0161, "step": 2440 }, { "epoch": 0.8335459726120609, "grad_norm": 1.9140625, "learning_rate": 1.2354950725625158e-05, "loss": 2.0303, "step": 2450 }, { "epoch": 0.836948201071702, "grad_norm": 2.109375, "learning_rate": 1.2344813764113985e-05, "loss": 1.973, "step": 2460 }, { "epoch": 0.840350429531343, "grad_norm": 1.9296875, "learning_rate": 1.2334634131473154e-05, "loss": 2.0389, "step": 2470 }, { "epoch": 0.8437526579909841, "grad_norm": 1.78125, "learning_rate": 1.2324411904969535e-05, "loss": 2.0597, "step": 2480 }, { "epoch": 0.8471548864506252, "grad_norm": 1.7734375, "learning_rate": 1.2314147162193302e-05, "loss": 2.029, "step": 2490 }, { "epoch": 0.8505571149102662, "grad_norm": 1.921875, "learning_rate": 1.2303839981057342e-05, "loss": 2.0216, "step": 2500 }, { "epoch": 0.8539593433699073, "grad_norm": 1.96875, "learning_rate": 1.2293490439796658e-05, "loss": 1.9839, "step": 2510 }, { "epoch": 0.8573615718295483, "grad_norm": 1.78125, "learning_rate": 1.2283098616967793e-05, "loss": 2.0373, "step": 2520 }, { "epoch": 0.8607638002891894, "grad_norm": 1.75, "learning_rate": 1.2272664591448208e-05, "loss": 2.075, "step": 2530 }, { "epoch": 0.8641660287488305, "grad_norm": 1.890625, "learning_rate": 1.2262188442435706e-05, "loss": 2.071, "step": 2540 }, { "epoch": 0.8675682572084715, "grad_norm": 1.7734375, "learning_rate": 1.2251670249447816e-05, "loss": 2.0474, "step": 2550 }, { "epoch": 0.8709704856681126, "grad_norm": 1.7578125, "learning_rate": 1.22411100923212e-05, "loss": 1.9866, "step": 2560 }, { "epoch": 0.8743727141277536, "grad_norm": 1.859375, "learning_rate": 1.2230508051211039e-05, "loss": 2.0365, "step": 2570 }, { "epoch": 0.8777749425873947, "grad_norm": 2.03125, "learning_rate": 1.2219864206590427e-05, "loss": 2.0041, "step": 2580 }, { "epoch": 0.8811771710470359, "grad_norm": 1.9921875, "learning_rate": 1.2209178639249763e-05, "loss": 2.0164, "step": 2590 }, { "epoch": 0.8845793995066769, "grad_norm": 1.7578125, "learning_rate": 1.2198451430296135e-05, "loss": 2.0469, "step": 2600 }, { "epoch": 0.887981627966318, "grad_norm": 1.921875, "learning_rate": 1.2187682661152705e-05, "loss": 1.9873, "step": 2610 }, { "epoch": 0.891383856425959, "grad_norm": 1.5078125, "learning_rate": 1.2176872413558087e-05, "loss": 2.0442, "step": 2620 }, { "epoch": 0.8947860848856001, "grad_norm": 1.6640625, "learning_rate": 1.2166020769565741e-05, "loss": 2.0356, "step": 2630 }, { "epoch": 0.8981883133452412, "grad_norm": 1.9453125, "learning_rate": 1.2155127811543326e-05, "loss": 2.0253, "step": 2640 }, { "epoch": 0.9015905418048822, "grad_norm": 1.8671875, "learning_rate": 1.2144193622172099e-05, "loss": 1.974, "step": 2650 }, { "epoch": 0.9049927702645233, "grad_norm": 1.8203125, "learning_rate": 1.2133218284446276e-05, "loss": 2.0084, "step": 2660 }, { "epoch": 0.9083949987241643, "grad_norm": 1.9609375, "learning_rate": 1.2122201881672392e-05, "loss": 2.1215, "step": 2670 }, { "epoch": 0.9117972271838054, "grad_norm": 1.9140625, "learning_rate": 1.2111144497468698e-05, "loss": 1.9749, "step": 2680 }, { "epoch": 0.9151994556434464, "grad_norm": 1.75, "learning_rate": 1.2100046215764493e-05, "loss": 1.9601, "step": 2690 }, { "epoch": 0.9186016841030875, "grad_norm": 2.03125, "learning_rate": 1.2088907120799507e-05, "loss": 1.9761, "step": 2700 }, { "epoch": 0.9220039125627286, "grad_norm": 1.90625, "learning_rate": 1.2077727297123258e-05, "loss": 2.0309, "step": 2710 }, { "epoch": 0.9254061410223696, "grad_norm": 1.6953125, "learning_rate": 1.2066506829594404e-05, "loss": 2.0306, "step": 2720 }, { "epoch": 0.9288083694820107, "grad_norm": 1.765625, "learning_rate": 1.2055245803380112e-05, "loss": 2.0073, "step": 2730 }, { "epoch": 0.9322105979416517, "grad_norm": 2.046875, "learning_rate": 1.2043944303955393e-05, "loss": 1.9904, "step": 2740 }, { "epoch": 0.9356128264012928, "grad_norm": 1.8984375, "learning_rate": 1.2032602417102472e-05, "loss": 2.0916, "step": 2750 }, { "epoch": 0.939015054860934, "grad_norm": 1.8828125, "learning_rate": 1.2021220228910125e-05, "loss": 1.9665, "step": 2760 }, { "epoch": 0.942417283320575, "grad_norm": 1.984375, "learning_rate": 1.2009797825773027e-05, "loss": 1.9822, "step": 2770 }, { "epoch": 0.9458195117802161, "grad_norm": 2.109375, "learning_rate": 1.1998335294391099e-05, "loss": 1.9947, "step": 2780 }, { "epoch": 0.9492217402398571, "grad_norm": 1.7578125, "learning_rate": 1.1986832721768856e-05, "loss": 1.9626, "step": 2790 }, { "epoch": 0.9526239686994982, "grad_norm": 1.8515625, "learning_rate": 1.1975290195214724e-05, "loss": 1.9772, "step": 2800 }, { "epoch": 0.9560261971591393, "grad_norm": 1.921875, "learning_rate": 1.1963707802340409e-05, "loss": 2.0471, "step": 2810 }, { "epoch": 0.9594284256187803, "grad_norm": 1.8984375, "learning_rate": 1.1952085631060207e-05, "loss": 1.9514, "step": 2820 }, { "epoch": 0.9628306540784214, "grad_norm": 1.9453125, "learning_rate": 1.1940423769590349e-05, "loss": 1.9974, "step": 2830 }, { "epoch": 0.9662328825380624, "grad_norm": 1.7578125, "learning_rate": 1.1928722306448326e-05, "loss": 2.0036, "step": 2840 }, { "epoch": 0.9696351109977035, "grad_norm": 1.453125, "learning_rate": 1.1916981330452221e-05, "loss": 1.9803, "step": 2850 }, { "epoch": 0.9730373394573446, "grad_norm": 1.8515625, "learning_rate": 1.1905200930720032e-05, "loss": 2.0608, "step": 2860 }, { "epoch": 0.9764395679169856, "grad_norm": 1.8984375, "learning_rate": 1.1893381196668997e-05, "loss": 1.9857, "step": 2870 }, { "epoch": 0.9798417963766267, "grad_norm": 1.6171875, "learning_rate": 1.1881522218014912e-05, "loss": 2.0197, "step": 2880 }, { "epoch": 0.9832440248362677, "grad_norm": 1.8984375, "learning_rate": 1.1869624084771457e-05, "loss": 1.9883, "step": 2890 }, { "epoch": 0.9866462532959088, "grad_norm": 1.8203125, "learning_rate": 1.185768688724951e-05, "loss": 2.0941, "step": 2900 }, { "epoch": 0.9900484817555499, "grad_norm": 1.7109375, "learning_rate": 1.184571071605645e-05, "loss": 1.9953, "step": 2910 }, { "epoch": 0.9934507102151909, "grad_norm": 1.7265625, "learning_rate": 1.1833695662095493e-05, "loss": 1.9833, "step": 2920 }, { "epoch": 0.996852938674832, "grad_norm": 1.9765625, "learning_rate": 1.1821641816564982e-05, "loss": 2.0431, "step": 2930 }, { "epoch": 1.000255167134473, "grad_norm": 1.71875, "learning_rate": 1.1809549270957697e-05, "loss": 1.886, "step": 2940 }, { "epoch": 1.0036573955941142, "grad_norm": 2.078125, "learning_rate": 1.1797418117060173e-05, "loss": 1.9804, "step": 2950 }, { "epoch": 1.0070596240537553, "grad_norm": 1.875, "learning_rate": 1.1785248446951988e-05, "loss": 2.0657, "step": 2960 }, { "epoch": 1.0104618525133964, "grad_norm": 1.9296875, "learning_rate": 1.1773040353005074e-05, "loss": 2.0112, "step": 2970 }, { "epoch": 1.0138640809730373, "grad_norm": 2.015625, "learning_rate": 1.1760793927883016e-05, "loss": 2.0262, "step": 2980 }, { "epoch": 1.0172663094326784, "grad_norm": 2.109375, "learning_rate": 1.174850926454034e-05, "loss": 2.0007, "step": 2990 }, { "epoch": 1.0206685378923195, "grad_norm": 2.03125, "learning_rate": 1.1736186456221816e-05, "loss": 1.9723, "step": 3000 }, { "epoch": 1.0240707663519606, "grad_norm": 2.0625, "learning_rate": 1.1723825596461751e-05, "loss": 1.9384, "step": 3010 }, { "epoch": 1.0274729948116017, "grad_norm": 1.96875, "learning_rate": 1.1711426779083267e-05, "loss": 1.9556, "step": 3020 }, { "epoch": 1.0308752232712426, "grad_norm": 1.828125, "learning_rate": 1.1698990098197604e-05, "loss": 1.9963, "step": 3030 }, { "epoch": 1.0342774517308837, "grad_norm": 2.09375, "learning_rate": 1.1686515648203396e-05, "loss": 1.9429, "step": 3040 }, { "epoch": 1.0376796801905248, "grad_norm": 2.203125, "learning_rate": 1.1674003523785957e-05, "loss": 1.8885, "step": 3050 }, { "epoch": 1.041081908650166, "grad_norm": 1.9765625, "learning_rate": 1.1661453819916565e-05, "loss": 1.9456, "step": 3060 }, { "epoch": 1.0444841371098068, "grad_norm": 2.015625, "learning_rate": 1.1648866631851738e-05, "loss": 1.9386, "step": 3070 }, { "epoch": 1.047886365569448, "grad_norm": 2.09375, "learning_rate": 1.1636242055132511e-05, "loss": 1.9569, "step": 3080 }, { "epoch": 1.051288594029089, "grad_norm": 1.8671875, "learning_rate": 1.1623580185583711e-05, "loss": 1.9159, "step": 3090 }, { "epoch": 1.0546908224887301, "grad_norm": 1.9296875, "learning_rate": 1.1610881119313231e-05, "loss": 1.9094, "step": 3100 }, { "epoch": 1.0580930509483712, "grad_norm": 2.078125, "learning_rate": 1.1598144952711302e-05, "loss": 2.0189, "step": 3110 }, { "epoch": 1.0614952794080121, "grad_norm": 1.8515625, "learning_rate": 1.1585371782449755e-05, "loss": 2.0053, "step": 3120 }, { "epoch": 1.0648975078676532, "grad_norm": 2.15625, "learning_rate": 1.1572561705481294e-05, "loss": 1.9826, "step": 3130 }, { "epoch": 1.0682997363272944, "grad_norm": 2.015625, "learning_rate": 1.1559714819038756e-05, "loss": 1.9597, "step": 3140 }, { "epoch": 1.0717019647869355, "grad_norm": 1.734375, "learning_rate": 1.1546831220634377e-05, "loss": 1.9255, "step": 3150 }, { "epoch": 1.0751041932465766, "grad_norm": 2.109375, "learning_rate": 1.1533911008059046e-05, "loss": 1.9859, "step": 3160 }, { "epoch": 1.0785064217062175, "grad_norm": 1.7578125, "learning_rate": 1.1520954279381567e-05, "loss": 1.9651, "step": 3170 }, { "epoch": 1.0819086501658586, "grad_norm": 1.9296875, "learning_rate": 1.1507961132947917e-05, "loss": 1.9321, "step": 3180 }, { "epoch": 1.0853108786254997, "grad_norm": 1.8046875, "learning_rate": 1.1494931667380492e-05, "loss": 1.9215, "step": 3190 }, { "epoch": 1.0887131070851408, "grad_norm": 1.9453125, "learning_rate": 1.1481865981577362e-05, "loss": 1.982, "step": 3200 }, { "epoch": 1.092115335544782, "grad_norm": 2.125, "learning_rate": 1.1468764174711526e-05, "loss": 1.9728, "step": 3210 }, { "epoch": 1.0955175640044228, "grad_norm": 2.046875, "learning_rate": 1.1455626346230147e-05, "loss": 2.0267, "step": 3220 }, { "epoch": 1.098919792464064, "grad_norm": 2.359375, "learning_rate": 1.1442452595853809e-05, "loss": 1.9484, "step": 3230 }, { "epoch": 1.102322020923705, "grad_norm": 2.0, "learning_rate": 1.1429243023575758e-05, "loss": 1.9867, "step": 3240 }, { "epoch": 1.1057242493833461, "grad_norm": 1.8046875, "learning_rate": 1.1415997729661134e-05, "loss": 1.9269, "step": 3250 }, { "epoch": 1.1091264778429872, "grad_norm": 1.953125, "learning_rate": 1.140271681464622e-05, "loss": 1.9095, "step": 3260 }, { "epoch": 1.1125287063026281, "grad_norm": 1.8515625, "learning_rate": 1.1389400379337676e-05, "loss": 2.0021, "step": 3270 }, { "epoch": 1.1159309347622692, "grad_norm": 2.046875, "learning_rate": 1.137604852481177e-05, "loss": 2.0117, "step": 3280 }, { "epoch": 1.1193331632219103, "grad_norm": 1.5546875, "learning_rate": 1.1362661352413616e-05, "loss": 1.9835, "step": 3290 }, { "epoch": 1.1227353916815515, "grad_norm": 2.1875, "learning_rate": 1.1349238963756402e-05, "loss": 1.9492, "step": 3300 }, { "epoch": 1.1261376201411926, "grad_norm": 2.0, "learning_rate": 1.1335781460720621e-05, "loss": 1.9394, "step": 3310 }, { "epoch": 1.1295398486008335, "grad_norm": 1.703125, "learning_rate": 1.1322288945453292e-05, "loss": 1.9442, "step": 3320 }, { "epoch": 1.1329420770604746, "grad_norm": 1.84375, "learning_rate": 1.1308761520367196e-05, "loss": 1.9256, "step": 3330 }, { "epoch": 1.1363443055201157, "grad_norm": 1.96875, "learning_rate": 1.1295199288140082e-05, "loss": 1.9861, "step": 3340 }, { "epoch": 1.1397465339797568, "grad_norm": 2.265625, "learning_rate": 1.1281602351713905e-05, "loss": 1.9598, "step": 3350 }, { "epoch": 1.143148762439398, "grad_norm": 2.09375, "learning_rate": 1.1267970814294032e-05, "loss": 1.9839, "step": 3360 }, { "epoch": 1.1465509908990388, "grad_norm": 2.125, "learning_rate": 1.1254304779348466e-05, "loss": 1.9654, "step": 3370 }, { "epoch": 1.14995321935868, "grad_norm": 1.9296875, "learning_rate": 1.1240604350607055e-05, "loss": 1.9536, "step": 3380 }, { "epoch": 1.153355447818321, "grad_norm": 1.9296875, "learning_rate": 1.122686963206071e-05, "loss": 1.9331, "step": 3390 }, { "epoch": 1.156757676277962, "grad_norm": 1.921875, "learning_rate": 1.1213100727960614e-05, "loss": 1.9218, "step": 3400 }, { "epoch": 1.1601599047376032, "grad_norm": 1.9765625, "learning_rate": 1.1199297742817428e-05, "loss": 1.9979, "step": 3410 }, { "epoch": 1.163562133197244, "grad_norm": 2.25, "learning_rate": 1.11854607814005e-05, "loss": 2.02, "step": 3420 }, { "epoch": 1.1669643616568852, "grad_norm": 2.09375, "learning_rate": 1.117158994873707e-05, "loss": 2.0195, "step": 3430 }, { "epoch": 1.1703665901165263, "grad_norm": 1.984375, "learning_rate": 1.1157685350111472e-05, "loss": 2.0053, "step": 3440 }, { "epoch": 1.1737688185761674, "grad_norm": 1.84375, "learning_rate": 1.1143747091064334e-05, "loss": 2.014, "step": 3450 }, { "epoch": 1.1771710470358085, "grad_norm": 2.0625, "learning_rate": 1.1129775277391782e-05, "loss": 1.9057, "step": 3460 }, { "epoch": 1.1805732754954494, "grad_norm": 2.140625, "learning_rate": 1.1115770015144628e-05, "loss": 1.9496, "step": 3470 }, { "epoch": 1.1839755039550905, "grad_norm": 1.828125, "learning_rate": 1.1101731410627574e-05, "loss": 1.9163, "step": 3480 }, { "epoch": 1.1873777324147317, "grad_norm": 1.890625, "learning_rate": 1.1087659570398397e-05, "loss": 1.9717, "step": 3490 }, { "epoch": 1.1907799608743728, "grad_norm": 2.078125, "learning_rate": 1.1073554601267147e-05, "loss": 2.0302, "step": 3500 }, { "epoch": 1.1941821893340139, "grad_norm": 1.796875, "learning_rate": 1.1059416610295336e-05, "loss": 1.9523, "step": 3510 }, { "epoch": 1.1975844177936548, "grad_norm": 2.015625, "learning_rate": 1.104524570479512e-05, "loss": 1.9842, "step": 3520 }, { "epoch": 1.2009866462532959, "grad_norm": 1.875, "learning_rate": 1.1031041992328483e-05, "loss": 2.0036, "step": 3530 }, { "epoch": 1.204388874712937, "grad_norm": 2.03125, "learning_rate": 1.1016805580706439e-05, "loss": 2.048, "step": 3540 }, { "epoch": 1.207791103172578, "grad_norm": 2.0625, "learning_rate": 1.1002536577988182e-05, "loss": 1.9545, "step": 3550 }, { "epoch": 1.2111933316322192, "grad_norm": 1.9921875, "learning_rate": 1.0988235092480297e-05, "loss": 1.9575, "step": 3560 }, { "epoch": 1.21459556009186, "grad_norm": 2.015625, "learning_rate": 1.0973901232735917e-05, "loss": 1.9759, "step": 3570 }, { "epoch": 1.2179977885515012, "grad_norm": 2.078125, "learning_rate": 1.0959535107553909e-05, "loss": 1.9737, "step": 3580 }, { "epoch": 1.2214000170111423, "grad_norm": 1.890625, "learning_rate": 1.0945136825978049e-05, "loss": 2.0414, "step": 3590 }, { "epoch": 1.2248022454707834, "grad_norm": 2.0625, "learning_rate": 1.0930706497296186e-05, "loss": 1.9566, "step": 3600 }, { "epoch": 1.2282044739304245, "grad_norm": 1.8125, "learning_rate": 1.0916244231039415e-05, "loss": 1.9614, "step": 3610 }, { "epoch": 1.2316067023900654, "grad_norm": 2.09375, "learning_rate": 1.0901750136981258e-05, "loss": 2.0045, "step": 3620 }, { "epoch": 1.2350089308497065, "grad_norm": 1.578125, "learning_rate": 1.0887224325136807e-05, "loss": 1.9703, "step": 3630 }, { "epoch": 1.2384111593093476, "grad_norm": 2.265625, "learning_rate": 1.0872666905761921e-05, "loss": 1.9609, "step": 3640 }, { "epoch": 1.2418133877689888, "grad_norm": 1.9296875, "learning_rate": 1.0858077989352354e-05, "loss": 1.9865, "step": 3650 }, { "epoch": 1.2452156162286299, "grad_norm": 1.84375, "learning_rate": 1.084345768664294e-05, "loss": 1.9276, "step": 3660 }, { "epoch": 1.2486178446882708, "grad_norm": 2.25, "learning_rate": 1.0828806108606748e-05, "loss": 1.9673, "step": 3670 }, { "epoch": 1.2520200731479119, "grad_norm": 2.15625, "learning_rate": 1.081412336645423e-05, "loss": 1.9522, "step": 3680 }, { "epoch": 1.255422301607553, "grad_norm": 1.953125, "learning_rate": 1.0799409571632395e-05, "loss": 1.8882, "step": 3690 }, { "epoch": 1.258824530067194, "grad_norm": 1.9765625, "learning_rate": 1.0784664835823945e-05, "loss": 1.9378, "step": 3700 }, { "epoch": 1.2622267585268352, "grad_norm": 1.7421875, "learning_rate": 1.076988927094643e-05, "loss": 2.0231, "step": 3710 }, { "epoch": 1.265628986986476, "grad_norm": 2.03125, "learning_rate": 1.0755082989151417e-05, "loss": 1.925, "step": 3720 }, { "epoch": 1.2690312154461172, "grad_norm": 2.15625, "learning_rate": 1.0740246102823613e-05, "loss": 1.8958, "step": 3730 }, { "epoch": 1.2724334439057583, "grad_norm": 2.015625, "learning_rate": 1.0725378724580027e-05, "loss": 1.9536, "step": 3740 }, { "epoch": 1.2758356723653994, "grad_norm": 1.953125, "learning_rate": 1.0710480967269115e-05, "loss": 1.9541, "step": 3750 }, { "epoch": 1.2792379008250405, "grad_norm": 1.734375, "learning_rate": 1.0695552943969919e-05, "loss": 1.9327, "step": 3760 }, { "epoch": 1.2826401292846814, "grad_norm": 1.9375, "learning_rate": 1.0680594767991203e-05, "loss": 1.9935, "step": 3770 }, { "epoch": 1.2860423577443225, "grad_norm": 2.078125, "learning_rate": 1.0665606552870612e-05, "loss": 1.9933, "step": 3780 }, { "epoch": 1.2894445862039636, "grad_norm": 2.125, "learning_rate": 1.0650588412373792e-05, "loss": 1.9314, "step": 3790 }, { "epoch": 1.2928468146636047, "grad_norm": 1.609375, "learning_rate": 1.0635540460493534e-05, "loss": 1.9136, "step": 3800 }, { "epoch": 1.2962490431232458, "grad_norm": 1.796875, "learning_rate": 1.0620462811448904e-05, "loss": 1.9175, "step": 3810 }, { "epoch": 1.2996512715828867, "grad_norm": 2.125, "learning_rate": 1.0605355579684382e-05, "loss": 1.9929, "step": 3820 }, { "epoch": 1.3030535000425278, "grad_norm": 2.109375, "learning_rate": 1.0590218879868998e-05, "loss": 1.9072, "step": 3830 }, { "epoch": 1.306455728502169, "grad_norm": 2.296875, "learning_rate": 1.0575052826895442e-05, "loss": 1.9315, "step": 3840 }, { "epoch": 1.30985795696181, "grad_norm": 1.78125, "learning_rate": 1.0559857535879212e-05, "loss": 2.045, "step": 3850 }, { "epoch": 1.3132601854214512, "grad_norm": 2.15625, "learning_rate": 1.0544633122157734e-05, "loss": 1.9443, "step": 3860 }, { "epoch": 1.316662413881092, "grad_norm": 1.890625, "learning_rate": 1.0529379701289476e-05, "loss": 1.9742, "step": 3870 }, { "epoch": 1.3200646423407332, "grad_norm": 1.7890625, "learning_rate": 1.051409738905309e-05, "loss": 1.9852, "step": 3880 }, { "epoch": 1.3234668708003743, "grad_norm": 2.1875, "learning_rate": 1.0498786301446519e-05, "loss": 1.997, "step": 3890 }, { "epoch": 1.3268690992600152, "grad_norm": 2.0, "learning_rate": 1.0483446554686125e-05, "loss": 1.9083, "step": 3900 }, { "epoch": 1.3302713277196565, "grad_norm": 1.8046875, "learning_rate": 1.0468078265205796e-05, "loss": 1.974, "step": 3910 }, { "epoch": 1.3336735561792974, "grad_norm": 1.875, "learning_rate": 1.0452681549656073e-05, "loss": 1.9885, "step": 3920 }, { "epoch": 1.3370757846389385, "grad_norm": 1.9609375, "learning_rate": 1.0437256524903258e-05, "loss": 1.9872, "step": 3930 }, { "epoch": 1.3404780130985796, "grad_norm": 2.046875, "learning_rate": 1.0421803308028533e-05, "loss": 1.9477, "step": 3940 }, { "epoch": 1.3438802415582205, "grad_norm": 1.9296875, "learning_rate": 1.0406322016327067e-05, "loss": 2.0032, "step": 3950 }, { "epoch": 1.3472824700178618, "grad_norm": 2.015625, "learning_rate": 1.0390812767307123e-05, "loss": 1.9942, "step": 3960 }, { "epoch": 1.3506846984775027, "grad_norm": 1.8984375, "learning_rate": 1.0375275678689174e-05, "loss": 2.0242, "step": 3970 }, { "epoch": 1.3540869269371438, "grad_norm": 1.90625, "learning_rate": 1.0359710868405e-05, "loss": 2.0306, "step": 3980 }, { "epoch": 1.357489155396785, "grad_norm": 2.140625, "learning_rate": 1.0344118454596807e-05, "loss": 1.9709, "step": 3990 }, { "epoch": 1.3608913838564258, "grad_norm": 1.9765625, "learning_rate": 1.032849855561631e-05, "loss": 1.9812, "step": 4000 }, { "epoch": 1.3642936123160672, "grad_norm": 2.09375, "learning_rate": 1.0312851290023851e-05, "loss": 2.0006, "step": 4010 }, { "epoch": 1.367695840775708, "grad_norm": 2.078125, "learning_rate": 1.0297176776587497e-05, "loss": 1.9679, "step": 4020 }, { "epoch": 1.3710980692353492, "grad_norm": 2.375, "learning_rate": 1.028147513428213e-05, "loss": 1.934, "step": 4030 }, { "epoch": 1.3745002976949903, "grad_norm": 2.046875, "learning_rate": 1.026574648228855e-05, "loss": 1.9867, "step": 4040 }, { "epoch": 1.3779025261546312, "grad_norm": 2.359375, "learning_rate": 1.0249990939992573e-05, "loss": 1.899, "step": 4050 }, { "epoch": 1.3813047546142723, "grad_norm": 2.15625, "learning_rate": 1.023420862698412e-05, "loss": 1.9799, "step": 4060 }, { "epoch": 1.3847069830739134, "grad_norm": 1.9609375, "learning_rate": 1.021839966305631e-05, "loss": 2.0251, "step": 4070 }, { "epoch": 1.3881092115335545, "grad_norm": 2.0625, "learning_rate": 1.0202564168204549e-05, "loss": 1.9332, "step": 4080 }, { "epoch": 1.3915114399931956, "grad_norm": 2.1875, "learning_rate": 1.0186702262625632e-05, "loss": 1.971, "step": 4090 }, { "epoch": 1.3949136684528365, "grad_norm": 2.0625, "learning_rate": 1.0170814066716807e-05, "loss": 1.9266, "step": 4100 }, { "epoch": 1.3983158969124776, "grad_norm": 1.984375, "learning_rate": 1.0154899701074883e-05, "loss": 1.9282, "step": 4110 }, { "epoch": 1.4017181253721187, "grad_norm": 2.046875, "learning_rate": 1.0138959286495303e-05, "loss": 2.0014, "step": 4120 }, { "epoch": 1.4051203538317598, "grad_norm": 2.125, "learning_rate": 1.0122992943971232e-05, "loss": 1.9463, "step": 4130 }, { "epoch": 1.408522582291401, "grad_norm": 1.875, "learning_rate": 1.0107000794692637e-05, "loss": 2.003, "step": 4140 }, { "epoch": 1.4119248107510418, "grad_norm": 2.234375, "learning_rate": 1.0090982960045363e-05, "loss": 2.0, "step": 4150 }, { "epoch": 1.415327039210683, "grad_norm": 2.203125, "learning_rate": 1.0074939561610221e-05, "loss": 1.9832, "step": 4160 }, { "epoch": 1.418729267670324, "grad_norm": 2.078125, "learning_rate": 1.005887072116206e-05, "loss": 1.8977, "step": 4170 }, { "epoch": 1.4221314961299651, "grad_norm": 1.65625, "learning_rate": 1.0042776560668832e-05, "loss": 1.9778, "step": 4180 }, { "epoch": 1.4255337245896063, "grad_norm": 1.9921875, "learning_rate": 1.0026657202290696e-05, "loss": 1.9389, "step": 4190 }, { "epoch": 1.4289359530492471, "grad_norm": 2.21875, "learning_rate": 1.0010512768379053e-05, "loss": 1.909, "step": 4200 }, { "epoch": 1.4323381815088883, "grad_norm": 2.109375, "learning_rate": 9.994343381475644e-06, "loss": 1.9563, "step": 4210 }, { "epoch": 1.4357404099685294, "grad_norm": 2.09375, "learning_rate": 9.978149164311613e-06, "loss": 1.9725, "step": 4220 }, { "epoch": 1.4391426384281705, "grad_norm": 1.71875, "learning_rate": 9.961930239806571e-06, "loss": 2.0237, "step": 4230 }, { "epoch": 1.4425448668878116, "grad_norm": 1.953125, "learning_rate": 9.945686731067668e-06, "loss": 1.9415, "step": 4240 }, { "epoch": 1.4459470953474525, "grad_norm": 2.0625, "learning_rate": 9.929418761388654e-06, "loss": 1.9221, "step": 4250 }, { "epoch": 1.4493493238070936, "grad_norm": 2.046875, "learning_rate": 9.91312645424895e-06, "loss": 1.9062, "step": 4260 }, { "epoch": 1.4527515522667347, "grad_norm": 2.40625, "learning_rate": 9.896809933312702e-06, "loss": 1.9621, "step": 4270 }, { "epoch": 1.4561537807263758, "grad_norm": 2.265625, "learning_rate": 9.88046932242785e-06, "loss": 1.9721, "step": 4280 }, { "epoch": 1.459556009186017, "grad_norm": 1.9765625, "learning_rate": 9.864104745625186e-06, "loss": 2.0143, "step": 4290 }, { "epoch": 1.4629582376456578, "grad_norm": 2.359375, "learning_rate": 9.847716327117408e-06, "loss": 1.9356, "step": 4300 }, { "epoch": 1.466360466105299, "grad_norm": 2.140625, "learning_rate": 9.831304191298181e-06, "loss": 1.9466, "step": 4310 }, { "epoch": 1.46976269456494, "grad_norm": 1.890625, "learning_rate": 9.814868462741196e-06, "loss": 1.9112, "step": 4320 }, { "epoch": 1.4731649230245811, "grad_norm": 1.953125, "learning_rate": 9.798409266199217e-06, "loss": 1.9464, "step": 4330 }, { "epoch": 1.4765671514842222, "grad_norm": 2.046875, "learning_rate": 9.781926726603141e-06, "loss": 1.9421, "step": 4340 }, { "epoch": 1.4799693799438631, "grad_norm": 2.09375, "learning_rate": 9.765420969061045e-06, "loss": 2.0682, "step": 4350 }, { "epoch": 1.4833716084035042, "grad_norm": 1.7734375, "learning_rate": 9.748892118857236e-06, "loss": 1.9912, "step": 4360 }, { "epoch": 1.4867738368631453, "grad_norm": 1.921875, "learning_rate": 9.73234030145131e-06, "loss": 1.9594, "step": 4370 }, { "epoch": 1.4901760653227865, "grad_norm": 2.34375, "learning_rate": 9.71576564247718e-06, "loss": 1.9444, "step": 4380 }, { "epoch": 1.4935782937824276, "grad_norm": 2.09375, "learning_rate": 9.699168267742144e-06, "loss": 1.9882, "step": 4390 }, { "epoch": 1.4969805222420685, "grad_norm": 1.84375, "learning_rate": 9.682548303225915e-06, "loss": 1.9076, "step": 4400 }, { "epoch": 1.5003827507017096, "grad_norm": 2.015625, "learning_rate": 9.665905875079679e-06, "loss": 1.9594, "step": 4410 }, { "epoch": 1.5037849791613507, "grad_norm": 2.03125, "learning_rate": 9.649241109625111e-06, "loss": 2.0808, "step": 4420 }, { "epoch": 1.5071872076209918, "grad_norm": 1.9375, "learning_rate": 9.632554133353453e-06, "loss": 1.9688, "step": 4430 }, { "epoch": 1.510589436080633, "grad_norm": 1.953125, "learning_rate": 9.615845072924522e-06, "loss": 1.971, "step": 4440 }, { "epoch": 1.5139916645402738, "grad_norm": 1.9609375, "learning_rate": 9.59911405516577e-06, "loss": 1.9759, "step": 4450 }, { "epoch": 1.517393892999915, "grad_norm": 2.125, "learning_rate": 9.582361207071299e-06, "loss": 1.975, "step": 4460 }, { "epoch": 1.520796121459556, "grad_norm": 1.90625, "learning_rate": 9.565586655800928e-06, "loss": 1.9975, "step": 4470 }, { "epoch": 1.5241983499191971, "grad_norm": 1.9453125, "learning_rate": 9.5487905286792e-06, "loss": 1.966, "step": 4480 }, { "epoch": 1.5276005783788382, "grad_norm": 2.078125, "learning_rate": 9.531972953194425e-06, "loss": 1.9374, "step": 4490 }, { "epoch": 1.5310028068384791, "grad_norm": 2.0625, "learning_rate": 9.51513405699772e-06, "loss": 1.9567, "step": 4500 }, { "epoch": 1.5344050352981202, "grad_norm": 2.359375, "learning_rate": 9.498273967902033e-06, "loss": 1.9704, "step": 4510 }, { "epoch": 1.5378072637577613, "grad_norm": 2.078125, "learning_rate": 9.481392813881164e-06, "loss": 1.9064, "step": 4520 }, { "epoch": 1.5412094922174024, "grad_norm": 2.21875, "learning_rate": 9.464490723068811e-06, "loss": 1.9553, "step": 4530 }, { "epoch": 1.5446117206770436, "grad_norm": 2.171875, "learning_rate": 9.447567823757589e-06, "loss": 1.9416, "step": 4540 }, { "epoch": 1.5480139491366844, "grad_norm": 1.859375, "learning_rate": 9.430624244398053e-06, "loss": 2.0401, "step": 4550 }, { "epoch": 1.5514161775963256, "grad_norm": 2.125, "learning_rate": 9.413660113597731e-06, "loss": 1.9495, "step": 4560 }, { "epoch": 1.5548184060559667, "grad_norm": 2.296875, "learning_rate": 9.396675560120143e-06, "loss": 2.0093, "step": 4570 }, { "epoch": 1.5582206345156078, "grad_norm": 2.203125, "learning_rate": 9.379670712883817e-06, "loss": 1.974, "step": 4580 }, { "epoch": 1.5616228629752489, "grad_norm": 1.96875, "learning_rate": 9.362645700961327e-06, "loss": 1.935, "step": 4590 }, { "epoch": 1.5650250914348898, "grad_norm": 2.171875, "learning_rate": 9.345600653578297e-06, "loss": 1.9727, "step": 4600 }, { "epoch": 1.5684273198945309, "grad_norm": 2.34375, "learning_rate": 9.328535700112433e-06, "loss": 1.9115, "step": 4610 }, { "epoch": 1.571829548354172, "grad_norm": 2.109375, "learning_rate": 9.311450970092529e-06, "loss": 1.9329, "step": 4620 }, { "epoch": 1.575231776813813, "grad_norm": 1.9609375, "learning_rate": 9.294346593197489e-06, "loss": 1.9534, "step": 4630 }, { "epoch": 1.5786340052734542, "grad_norm": 1.9609375, "learning_rate": 9.277222699255353e-06, "loss": 1.9047, "step": 4640 }, { "epoch": 1.582036233733095, "grad_norm": 1.9765625, "learning_rate": 9.260079418242293e-06, "loss": 1.9975, "step": 4650 }, { "epoch": 1.5854384621927362, "grad_norm": 2.359375, "learning_rate": 9.242916880281638e-06, "loss": 1.9347, "step": 4660 }, { "epoch": 1.5888406906523773, "grad_norm": 2.1875, "learning_rate": 9.225735215642885e-06, "loss": 1.9552, "step": 4670 }, { "epoch": 1.5922429191120182, "grad_norm": 2.109375, "learning_rate": 9.208534554740706e-06, "loss": 1.9052, "step": 4680 }, { "epoch": 1.5956451475716595, "grad_norm": 2.1875, "learning_rate": 9.191315028133966e-06, "loss": 1.9881, "step": 4690 }, { "epoch": 1.5990473760313004, "grad_norm": 2.0625, "learning_rate": 9.17407676652472e-06, "loss": 1.9671, "step": 4700 }, { "epoch": 1.6024496044909415, "grad_norm": 2.203125, "learning_rate": 9.156819900757237e-06, "loss": 1.9753, "step": 4710 }, { "epoch": 1.6058518329505826, "grad_norm": 1.9140625, "learning_rate": 9.139544561816991e-06, "loss": 1.9516, "step": 4720 }, { "epoch": 1.6092540614102235, "grad_norm": 2.234375, "learning_rate": 9.122250880829674e-06, "loss": 1.9615, "step": 4730 }, { "epoch": 1.6126562898698649, "grad_norm": 2.15625, "learning_rate": 9.104938989060205e-06, "loss": 1.9325, "step": 4740 }, { "epoch": 1.6160585183295058, "grad_norm": 1.984375, "learning_rate": 9.087609017911725e-06, "loss": 1.9227, "step": 4750 }, { "epoch": 1.6194607467891469, "grad_norm": 2.109375, "learning_rate": 9.070261098924604e-06, "loss": 1.9796, "step": 4760 }, { "epoch": 1.622862975248788, "grad_norm": 2.1875, "learning_rate": 9.052895363775442e-06, "loss": 1.977, "step": 4770 }, { "epoch": 1.6262652037084289, "grad_norm": 2.046875, "learning_rate": 9.035511944276075e-06, "loss": 1.8778, "step": 4780 }, { "epoch": 1.6296674321680702, "grad_norm": 2.546875, "learning_rate": 9.018110972372563e-06, "loss": 1.924, "step": 4790 }, { "epoch": 1.633069660627711, "grad_norm": 1.9140625, "learning_rate": 9.000692580144194e-06, "loss": 1.9173, "step": 4800 }, { "epoch": 1.6364718890873522, "grad_norm": 2.40625, "learning_rate": 8.983256899802485e-06, "loss": 1.9433, "step": 4810 }, { "epoch": 1.6398741175469933, "grad_norm": 2.09375, "learning_rate": 8.96580406369018e-06, "loss": 1.9947, "step": 4820 }, { "epoch": 1.6432763460066342, "grad_norm": 1.9921875, "learning_rate": 8.948334204280234e-06, "loss": 1.9073, "step": 4830 }, { "epoch": 1.6466785744662755, "grad_norm": 2.3125, "learning_rate": 8.930847454174817e-06, "loss": 1.9565, "step": 4840 }, { "epoch": 1.6500808029259164, "grad_norm": 2.15625, "learning_rate": 8.913343946104305e-06, "loss": 1.8945, "step": 4850 }, { "epoch": 1.6534830313855575, "grad_norm": 2.296875, "learning_rate": 8.895823812926273e-06, "loss": 1.9491, "step": 4860 }, { "epoch": 1.6568852598451986, "grad_norm": 2.203125, "learning_rate": 8.878287187624486e-06, "loss": 1.8916, "step": 4870 }, { "epoch": 1.6602874883048395, "grad_norm": 1.9296875, "learning_rate": 8.860734203307893e-06, "loss": 1.9758, "step": 4880 }, { "epoch": 1.6636897167644809, "grad_norm": 1.9453125, "learning_rate": 8.84316499320961e-06, "loss": 1.9791, "step": 4890 }, { "epoch": 1.6670919452241217, "grad_norm": 2.0, "learning_rate": 8.825579690685907e-06, "loss": 2.0407, "step": 4900 }, { "epoch": 1.6704941736837629, "grad_norm": 1.953125, "learning_rate": 8.807978429215212e-06, "loss": 2.0039, "step": 4910 }, { "epoch": 1.673896402143404, "grad_norm": 2.203125, "learning_rate": 8.79036134239708e-06, "loss": 2.0093, "step": 4920 }, { "epoch": 1.6772986306030448, "grad_norm": 2.265625, "learning_rate": 8.772728563951189e-06, "loss": 1.8997, "step": 4930 }, { "epoch": 1.6807008590626862, "grad_norm": 2.140625, "learning_rate": 8.755080227716316e-06, "loss": 1.908, "step": 4940 }, { "epoch": 1.684103087522327, "grad_norm": 1.8515625, "learning_rate": 8.737416467649337e-06, "loss": 1.9478, "step": 4950 }, { "epoch": 1.6875053159819682, "grad_norm": 2.203125, "learning_rate": 8.71973741782419e-06, "loss": 1.9497, "step": 4960 }, { "epoch": 1.6909075444416093, "grad_norm": 1.8125, "learning_rate": 8.70204321243087e-06, "loss": 1.9035, "step": 4970 }, { "epoch": 1.6943097729012502, "grad_norm": 2.171875, "learning_rate": 8.684333985774413e-06, "loss": 1.9666, "step": 4980 }, { "epoch": 1.6977120013608915, "grad_norm": 2.484375, "learning_rate": 8.666609872273867e-06, "loss": 1.9943, "step": 4990 }, { "epoch": 1.7011142298205324, "grad_norm": 2.234375, "learning_rate": 8.648871006461278e-06, "loss": 1.9293, "step": 5000 }, { "epoch": 1.7045164582801735, "grad_norm": 2.140625, "learning_rate": 8.631117522980663e-06, "loss": 1.9369, "step": 5010 }, { "epoch": 1.7079186867398146, "grad_norm": 2.046875, "learning_rate": 8.613349556587001e-06, "loss": 1.9117, "step": 5020 }, { "epoch": 1.7113209151994555, "grad_norm": 2.078125, "learning_rate": 8.59556724214519e-06, "loss": 1.9757, "step": 5030 }, { "epoch": 1.7147231436590968, "grad_norm": 2.328125, "learning_rate": 8.577770714629042e-06, "loss": 1.9838, "step": 5040 }, { "epoch": 1.7181253721187377, "grad_norm": 2.328125, "learning_rate": 8.559960109120251e-06, "loss": 1.9571, "step": 5050 }, { "epoch": 1.7215276005783788, "grad_norm": 2.140625, "learning_rate": 8.542135560807365e-06, "loss": 1.9588, "step": 5060 }, { "epoch": 1.72492982903802, "grad_norm": 2.15625, "learning_rate": 8.524297204984759e-06, "loss": 1.9542, "step": 5070 }, { "epoch": 1.7283320574976608, "grad_norm": 1.9765625, "learning_rate": 8.506445177051624e-06, "loss": 1.9691, "step": 5080 }, { "epoch": 1.7317342859573022, "grad_norm": 1.953125, "learning_rate": 8.488579612510915e-06, "loss": 1.9141, "step": 5090 }, { "epoch": 1.735136514416943, "grad_norm": 2.0, "learning_rate": 8.470700646968339e-06, "loss": 2.0129, "step": 5100 }, { "epoch": 1.7385387428765842, "grad_norm": 2.171875, "learning_rate": 8.452808416131319e-06, "loss": 1.9424, "step": 5110 }, { "epoch": 1.7419409713362253, "grad_norm": 1.8984375, "learning_rate": 8.434903055807971e-06, "loss": 1.9041, "step": 5120 }, { "epoch": 1.7453431997958662, "grad_norm": 1.859375, "learning_rate": 8.416984701906065e-06, "loss": 1.9514, "step": 5130 }, { "epoch": 1.7487454282555075, "grad_norm": 1.7421875, "learning_rate": 8.399053490431994e-06, "loss": 1.9846, "step": 5140 }, { "epoch": 1.7521476567151484, "grad_norm": 2.03125, "learning_rate": 8.38110955748975e-06, "loss": 1.9242, "step": 5150 }, { "epoch": 1.7555498851747895, "grad_norm": 2.015625, "learning_rate": 8.363153039279882e-06, "loss": 1.9853, "step": 5160 }, { "epoch": 1.7589521136344306, "grad_norm": 2.15625, "learning_rate": 8.345184072098464e-06, "loss": 2.0005, "step": 5170 }, { "epoch": 1.7623543420940715, "grad_norm": 2.171875, "learning_rate": 8.327202792336068e-06, "loss": 2.0181, "step": 5180 }, { "epoch": 1.7657565705537128, "grad_norm": 2.234375, "learning_rate": 8.309209336476713e-06, "loss": 1.9119, "step": 5190 }, { "epoch": 1.7691587990133537, "grad_norm": 2.328125, "learning_rate": 8.29120384109685e-06, "loss": 1.9061, "step": 5200 }, { "epoch": 1.7725610274729948, "grad_norm": 2.046875, "learning_rate": 8.273186442864303e-06, "loss": 1.9584, "step": 5210 }, { "epoch": 1.775963255932636, "grad_norm": 2.1875, "learning_rate": 8.25515727853725e-06, "loss": 1.9456, "step": 5220 }, { "epoch": 1.7793654843922768, "grad_norm": 2.109375, "learning_rate": 8.23711648496318e-06, "loss": 1.9162, "step": 5230 }, { "epoch": 1.7827677128519182, "grad_norm": 2.3125, "learning_rate": 8.219064199077837e-06, "loss": 1.9735, "step": 5240 }, { "epoch": 1.786169941311559, "grad_norm": 2.296875, "learning_rate": 8.201000557904217e-06, "loss": 1.9512, "step": 5250 }, { "epoch": 1.7895721697712001, "grad_norm": 2.046875, "learning_rate": 8.182925698551491e-06, "loss": 1.9886, "step": 5260 }, { "epoch": 1.7929743982308413, "grad_norm": 2.390625, "learning_rate": 8.164839758213986e-06, "loss": 1.9956, "step": 5270 }, { "epoch": 1.7963766266904821, "grad_norm": 2.28125, "learning_rate": 8.14674287417013e-06, "loss": 1.9076, "step": 5280 }, { "epoch": 1.7997788551501235, "grad_norm": 1.84375, "learning_rate": 8.128635183781433e-06, "loss": 1.912, "step": 5290 }, { "epoch": 1.8031810836097644, "grad_norm": 2.21875, "learning_rate": 8.11051682449141e-06, "loss": 1.9582, "step": 5300 }, { "epoch": 1.8065833120694055, "grad_norm": 2.296875, "learning_rate": 8.092387933824571e-06, "loss": 1.979, "step": 5310 }, { "epoch": 1.8099855405290466, "grad_norm": 2.46875, "learning_rate": 8.074248649385357e-06, "loss": 1.9679, "step": 5320 }, { "epoch": 1.8133877689886875, "grad_norm": 2.21875, "learning_rate": 8.056099108857101e-06, "loss": 1.9288, "step": 5330 }, { "epoch": 1.8167899974483288, "grad_norm": 2.296875, "learning_rate": 8.037939450000985e-06, "loss": 1.922, "step": 5340 }, { "epoch": 1.8201922259079697, "grad_norm": 2.1875, "learning_rate": 8.019769810654989e-06, "loss": 1.9022, "step": 5350 }, { "epoch": 1.8235944543676108, "grad_norm": 2.0, "learning_rate": 8.00159032873285e-06, "loss": 1.9698, "step": 5360 }, { "epoch": 1.826996682827252, "grad_norm": 2.171875, "learning_rate": 7.98340114222302e-06, "loss": 1.9087, "step": 5370 }, { "epoch": 1.8303989112868928, "grad_norm": 2.140625, "learning_rate": 7.9652023891876e-06, "loss": 1.9785, "step": 5380 }, { "epoch": 1.8338011397465341, "grad_norm": 2.015625, "learning_rate": 7.946994207761316e-06, "loss": 1.9983, "step": 5390 }, { "epoch": 1.837203368206175, "grad_norm": 2.328125, "learning_rate": 7.928776736150451e-06, "loss": 2.0148, "step": 5400 }, { "epoch": 1.8406055966658161, "grad_norm": 2.109375, "learning_rate": 7.910550112631802e-06, "loss": 1.9808, "step": 5410 }, { "epoch": 1.8440078251254572, "grad_norm": 2.15625, "learning_rate": 7.892314475551641e-06, "loss": 1.9153, "step": 5420 }, { "epoch": 1.8474100535850981, "grad_norm": 2.109375, "learning_rate": 7.87406996332465e-06, "loss": 1.9285, "step": 5430 }, { "epoch": 1.8508122820447395, "grad_norm": 2.34375, "learning_rate": 7.855816714432878e-06, "loss": 1.952, "step": 5440 }, { "epoch": 1.8542145105043804, "grad_norm": 2.203125, "learning_rate": 7.837554867424685e-06, "loss": 1.9335, "step": 5450 }, { "epoch": 1.8576167389640215, "grad_norm": 2.34375, "learning_rate": 7.8192845609137e-06, "loss": 1.943, "step": 5460 }, { "epoch": 1.8610189674236626, "grad_norm": 2.203125, "learning_rate": 7.801005933577753e-06, "loss": 2.0204, "step": 5470 }, { "epoch": 1.8644211958833035, "grad_norm": 2.1875, "learning_rate": 7.782719124157842e-06, "loss": 1.915, "step": 5480 }, { "epoch": 1.8678234243429448, "grad_norm": 2.21875, "learning_rate": 7.764424271457067e-06, "loss": 1.9207, "step": 5490 }, { "epoch": 1.8712256528025857, "grad_norm": 2.015625, "learning_rate": 7.746121514339576e-06, "loss": 1.9593, "step": 5500 }, { "epoch": 1.8746278812622268, "grad_norm": 1.828125, "learning_rate": 7.727810991729512e-06, "loss": 1.904, "step": 5510 }, { "epoch": 1.878030109721868, "grad_norm": 1.9140625, "learning_rate": 7.709492842609971e-06, "loss": 1.9757, "step": 5520 }, { "epoch": 1.8814323381815088, "grad_norm": 1.9140625, "learning_rate": 7.691167206021928e-06, "loss": 1.938, "step": 5530 }, { "epoch": 1.88483456664115, "grad_norm": 2.484375, "learning_rate": 7.67283422106319e-06, "loss": 1.956, "step": 5540 }, { "epoch": 1.888236795100791, "grad_norm": 1.7578125, "learning_rate": 7.654494026887346e-06, "loss": 1.9298, "step": 5550 }, { "epoch": 1.8916390235604321, "grad_norm": 1.890625, "learning_rate": 7.636146762702703e-06, "loss": 1.8893, "step": 5560 }, { "epoch": 1.8950412520200732, "grad_norm": 2.15625, "learning_rate": 7.617792567771233e-06, "loss": 1.9309, "step": 5570 }, { "epoch": 1.8984434804797141, "grad_norm": 2.578125, "learning_rate": 7.59943158140751e-06, "loss": 1.9064, "step": 5580 }, { "epoch": 1.9018457089393552, "grad_norm": 2.203125, "learning_rate": 7.581063942977662e-06, "loss": 1.9647, "step": 5590 }, { "epoch": 1.9052479373989963, "grad_norm": 2.1875, "learning_rate": 7.56268979189831e-06, "loss": 1.9417, "step": 5600 }, { "epoch": 1.9086501658586374, "grad_norm": 2.421875, "learning_rate": 7.544309267635502e-06, "loss": 1.96, "step": 5610 }, { "epoch": 1.9120523943182786, "grad_norm": 2.25, "learning_rate": 7.525922509703665e-06, "loss": 1.9672, "step": 5620 }, { "epoch": 1.9154546227779194, "grad_norm": 2.1875, "learning_rate": 7.507529657664538e-06, "loss": 1.9975, "step": 5630 }, { "epoch": 1.9188568512375606, "grad_norm": 2.078125, "learning_rate": 7.489130851126123e-06, "loss": 1.9763, "step": 5640 }, { "epoch": 1.9222590796972017, "grad_norm": 2.171875, "learning_rate": 7.470726229741613e-06, "loss": 1.9206, "step": 5650 }, { "epoch": 1.9256613081568428, "grad_norm": 2.484375, "learning_rate": 7.45231593320834e-06, "loss": 2.0314, "step": 5660 }, { "epoch": 1.9290635366164839, "grad_norm": 2.109375, "learning_rate": 7.433900101266712e-06, "loss": 1.9449, "step": 5670 }, { "epoch": 1.9324657650761248, "grad_norm": 2.0, "learning_rate": 7.415478873699151e-06, "loss": 1.9294, "step": 5680 }, { "epoch": 1.9358679935357659, "grad_norm": 1.8828125, "learning_rate": 7.3970523903290335e-06, "loss": 1.8888, "step": 5690 }, { "epoch": 1.939270221995407, "grad_norm": 2.25, "learning_rate": 7.378620791019634e-06, "loss": 1.9365, "step": 5700 }, { "epoch": 1.942672450455048, "grad_norm": 1.8828125, "learning_rate": 7.360184215673055e-06, "loss": 1.9441, "step": 5710 }, { "epoch": 1.9460746789146892, "grad_norm": 2.28125, "learning_rate": 7.341742804229166e-06, "loss": 1.9156, "step": 5720 }, { "epoch": 1.94947690737433, "grad_norm": 2.375, "learning_rate": 7.32329669666455e-06, "loss": 1.9051, "step": 5730 }, { "epoch": 1.9528791358339712, "grad_norm": 2.109375, "learning_rate": 7.304846032991432e-06, "loss": 2.0019, "step": 5740 }, { "epoch": 1.9562813642936123, "grad_norm": 1.875, "learning_rate": 7.2863909532566196e-06, "loss": 1.8679, "step": 5750 }, { "epoch": 1.9596835927532534, "grad_norm": 2.234375, "learning_rate": 7.2679315975404405e-06, "loss": 1.9605, "step": 5760 }, { "epoch": 1.9630858212128945, "grad_norm": 1.9375, "learning_rate": 7.249468105955679e-06, "loss": 1.9355, "step": 5770 }, { "epoch": 1.9664880496725354, "grad_norm": 2.0, "learning_rate": 7.231000618646511e-06, "loss": 1.8908, "step": 5780 }, { "epoch": 1.9698902781321765, "grad_norm": 2.203125, "learning_rate": 7.212529275787436e-06, "loss": 1.9578, "step": 5790 }, { "epoch": 1.9732925065918177, "grad_norm": 2.265625, "learning_rate": 7.194054217582234e-06, "loss": 1.9287, "step": 5800 }, { "epoch": 1.9766947350514585, "grad_norm": 2.375, "learning_rate": 7.17557558426287e-06, "loss": 1.9672, "step": 5810 }, { "epoch": 1.9800969635110999, "grad_norm": 2.0, "learning_rate": 7.157093516088451e-06, "loss": 1.9581, "step": 5820 }, { "epoch": 1.9834991919707408, "grad_norm": 2.015625, "learning_rate": 7.138608153344156e-06, "loss": 1.9872, "step": 5830 }, { "epoch": 1.9869014204303819, "grad_norm": 1.921875, "learning_rate": 7.120119636340172e-06, "loss": 1.9525, "step": 5840 }, { "epoch": 1.990303648890023, "grad_norm": 1.890625, "learning_rate": 7.101628105410625e-06, "loss": 1.9093, "step": 5850 }, { "epoch": 1.9937058773496639, "grad_norm": 2.234375, "learning_rate": 7.0831337009125195e-06, "loss": 1.9706, "step": 5860 }, { "epoch": 1.9971081058093052, "grad_norm": 2.3125, "learning_rate": 7.064636563224674e-06, "loss": 1.9331, "step": 5870 }, { "epoch": 2.000510334268946, "grad_norm": 2.203125, "learning_rate": 7.046136832746647e-06, "loss": 1.9434, "step": 5880 }, { "epoch": 2.0039125627285874, "grad_norm": 2.265625, "learning_rate": 7.027634649897679e-06, "loss": 1.8678, "step": 5890 }, { "epoch": 2.0073147911882283, "grad_norm": 2.421875, "learning_rate": 7.009130155115627e-06, "loss": 1.9193, "step": 5900 }, { "epoch": 2.010717019647869, "grad_norm": 2.125, "learning_rate": 6.990623488855899e-06, "loss": 1.9459, "step": 5910 }, { "epoch": 2.0141192481075105, "grad_norm": 2.46875, "learning_rate": 6.972114791590378e-06, "loss": 1.9229, "step": 5920 }, { "epoch": 2.0175214765671514, "grad_norm": 2.03125, "learning_rate": 6.953604203806366e-06, "loss": 1.9008, "step": 5930 }, { "epoch": 2.0209237050267927, "grad_norm": 2.5625, "learning_rate": 6.935091866005518e-06, "loss": 1.9513, "step": 5940 }, { "epoch": 2.0243259334864336, "grad_norm": 2.125, "learning_rate": 6.9165779187027685e-06, "loss": 1.9013, "step": 5950 }, { "epoch": 2.0277281619460745, "grad_norm": 2.25, "learning_rate": 6.898062502425267e-06, "loss": 1.914, "step": 5960 }, { "epoch": 2.031130390405716, "grad_norm": 2.015625, "learning_rate": 6.87954575771132e-06, "loss": 1.8773, "step": 5970 }, { "epoch": 2.0345326188653567, "grad_norm": 2.234375, "learning_rate": 6.861027825109312e-06, "loss": 1.9337, "step": 5980 }, { "epoch": 2.037934847324998, "grad_norm": 2.234375, "learning_rate": 6.842508845176642e-06, "loss": 1.9866, "step": 5990 }, { "epoch": 2.041337075784639, "grad_norm": 1.9921875, "learning_rate": 6.8239889584786644e-06, "loss": 1.9557, "step": 6000 }, { "epoch": 2.04473930424428, "grad_norm": 2.0, "learning_rate": 6.805468305587612e-06, "loss": 1.9082, "step": 6010 }, { "epoch": 2.048141532703921, "grad_norm": 2.234375, "learning_rate": 6.786947027081537e-06, "loss": 1.8822, "step": 6020 }, { "epoch": 2.051543761163562, "grad_norm": 2.296875, "learning_rate": 6.768425263543234e-06, "loss": 1.9611, "step": 6030 }, { "epoch": 2.0549459896232034, "grad_norm": 2.171875, "learning_rate": 6.7499031555591875e-06, "loss": 1.9623, "step": 6040 }, { "epoch": 2.0583482180828443, "grad_norm": 2.328125, "learning_rate": 6.7313808437184895e-06, "loss": 1.9902, "step": 6050 }, { "epoch": 2.061750446542485, "grad_norm": 2.21875, "learning_rate": 6.71285846861178e-06, "loss": 1.9358, "step": 6060 }, { "epoch": 2.0651526750021265, "grad_norm": 2.40625, "learning_rate": 6.694336170830184e-06, "loss": 1.8377, "step": 6070 }, { "epoch": 2.0685549034617674, "grad_norm": 2.359375, "learning_rate": 6.675814090964238e-06, "loss": 1.9771, "step": 6080 }, { "epoch": 2.0719571319214087, "grad_norm": 2.0625, "learning_rate": 6.6572923696028185e-06, "loss": 1.8634, "step": 6090 }, { "epoch": 2.0753593603810496, "grad_norm": 2.609375, "learning_rate": 6.638771147332086e-06, "loss": 1.9388, "step": 6100 }, { "epoch": 2.0787615888406905, "grad_norm": 2.203125, "learning_rate": 6.62025056473442e-06, "loss": 1.918, "step": 6110 }, { "epoch": 2.082163817300332, "grad_norm": 2.234375, "learning_rate": 6.601730762387327e-06, "loss": 1.9617, "step": 6120 }, { "epoch": 2.0855660457599727, "grad_norm": 2.234375, "learning_rate": 6.583211880862406e-06, "loss": 1.9056, "step": 6130 }, { "epoch": 2.0889682742196136, "grad_norm": 2.15625, "learning_rate": 6.56469406072426e-06, "loss": 1.9458, "step": 6140 }, { "epoch": 2.092370502679255, "grad_norm": 2.109375, "learning_rate": 6.546177442529437e-06, "loss": 1.9393, "step": 6150 }, { "epoch": 2.095772731138896, "grad_norm": 2.140625, "learning_rate": 6.5276621668253645e-06, "loss": 1.9038, "step": 6160 }, { "epoch": 2.099174959598537, "grad_norm": 2.265625, "learning_rate": 6.509148374149276e-06, "loss": 1.9621, "step": 6170 }, { "epoch": 2.102577188058178, "grad_norm": 2.015625, "learning_rate": 6.490636205027152e-06, "loss": 1.9206, "step": 6180 }, { "epoch": 2.105979416517819, "grad_norm": 2.515625, "learning_rate": 6.472125799972643e-06, "loss": 1.9409, "step": 6190 }, { "epoch": 2.1093816449774603, "grad_norm": 2.53125, "learning_rate": 6.453617299486017e-06, "loss": 1.9348, "step": 6200 }, { "epoch": 2.112783873437101, "grad_norm": 2.109375, "learning_rate": 6.435110844053086e-06, "loss": 1.9364, "step": 6210 }, { "epoch": 2.1161861018967425, "grad_norm": 2.46875, "learning_rate": 6.416606574144131e-06, "loss": 1.9042, "step": 6220 }, { "epoch": 2.1195883303563834, "grad_norm": 2.34375, "learning_rate": 6.398104630212853e-06, "loss": 1.9547, "step": 6230 }, { "epoch": 2.1229905588160243, "grad_norm": 2.4375, "learning_rate": 6.379605152695294e-06, "loss": 1.9768, "step": 6240 }, { "epoch": 2.1263927872756656, "grad_norm": 2.125, "learning_rate": 6.361108282008776e-06, "loss": 1.9522, "step": 6250 }, { "epoch": 2.1297950157353065, "grad_norm": 1.8359375, "learning_rate": 6.342614158550832e-06, "loss": 1.9168, "step": 6260 }, { "epoch": 2.133197244194948, "grad_norm": 2.3125, "learning_rate": 6.324122922698143e-06, "loss": 1.9871, "step": 6270 }, { "epoch": 2.1365994726545887, "grad_norm": 2.28125, "learning_rate": 6.305634714805481e-06, "loss": 1.9398, "step": 6280 }, { "epoch": 2.1400017011142296, "grad_norm": 1.921875, "learning_rate": 6.287149675204619e-06, "loss": 1.9629, "step": 6290 }, { "epoch": 2.143403929573871, "grad_norm": 2.421875, "learning_rate": 6.268667944203294e-06, "loss": 1.9102, "step": 6300 }, { "epoch": 2.146806158033512, "grad_norm": 2.28125, "learning_rate": 6.2501896620841255e-06, "loss": 1.8596, "step": 6310 }, { "epoch": 2.150208386493153, "grad_norm": 2.265625, "learning_rate": 6.231714969103553e-06, "loss": 1.7886, "step": 6320 }, { "epoch": 2.153610614952794, "grad_norm": 2.3125, "learning_rate": 6.213244005490776e-06, "loss": 1.9695, "step": 6330 }, { "epoch": 2.157012843412435, "grad_norm": 2.09375, "learning_rate": 6.194776911446687e-06, "loss": 1.971, "step": 6340 }, { "epoch": 2.1604150718720763, "grad_norm": 2.375, "learning_rate": 6.176313827142807e-06, "loss": 1.9136, "step": 6350 }, { "epoch": 2.163817300331717, "grad_norm": 2.25, "learning_rate": 6.157854892720216e-06, "loss": 1.9184, "step": 6360 }, { "epoch": 2.1672195287913585, "grad_norm": 2.09375, "learning_rate": 6.139400248288503e-06, "loss": 1.9933, "step": 6370 }, { "epoch": 2.1706217572509994, "grad_norm": 1.8984375, "learning_rate": 6.120950033924691e-06, "loss": 1.9114, "step": 6380 }, { "epoch": 2.1740239857106403, "grad_norm": 2.078125, "learning_rate": 6.102504389672177e-06, "loss": 1.9974, "step": 6390 }, { "epoch": 2.1774262141702816, "grad_norm": 1.9140625, "learning_rate": 6.084063455539671e-06, "loss": 1.8925, "step": 6400 }, { "epoch": 2.1808284426299225, "grad_norm": 2.40625, "learning_rate": 6.065627371500128e-06, "loss": 1.9208, "step": 6410 }, { "epoch": 2.184230671089564, "grad_norm": 2.609375, "learning_rate": 6.0471962774896946e-06, "loss": 1.8757, "step": 6420 }, { "epoch": 2.1876328995492047, "grad_norm": 1.8203125, "learning_rate": 6.0287703134066385e-06, "loss": 1.905, "step": 6430 }, { "epoch": 2.1910351280088456, "grad_norm": 2.46875, "learning_rate": 6.010349619110283e-06, "loss": 1.8878, "step": 6440 }, { "epoch": 2.194437356468487, "grad_norm": 2.15625, "learning_rate": 5.991934334419968e-06, "loss": 1.9549, "step": 6450 }, { "epoch": 2.197839584928128, "grad_norm": 2.125, "learning_rate": 5.973524599113954e-06, "loss": 1.9137, "step": 6460 }, { "epoch": 2.201241813387769, "grad_norm": 2.453125, "learning_rate": 5.9551205529283955e-06, "loss": 1.9856, "step": 6470 }, { "epoch": 2.20464404184741, "grad_norm": 2.09375, "learning_rate": 5.936722335556252e-06, "loss": 1.9262, "step": 6480 }, { "epoch": 2.208046270307051, "grad_norm": 1.9609375, "learning_rate": 5.91833008664625e-06, "loss": 1.9596, "step": 6490 }, { "epoch": 2.2114484987666922, "grad_norm": 2.28125, "learning_rate": 5.89994394580181e-06, "loss": 1.907, "step": 6500 }, { "epoch": 2.214850727226333, "grad_norm": 2.125, "learning_rate": 5.881564052579987e-06, "loss": 1.938, "step": 6510 }, { "epoch": 2.2182529556859745, "grad_norm": 2.1875, "learning_rate": 5.863190546490422e-06, "loss": 1.9615, "step": 6520 }, { "epoch": 2.2216551841456154, "grad_norm": 2.078125, "learning_rate": 5.844823566994264e-06, "loss": 1.9353, "step": 6530 }, { "epoch": 2.2250574126052562, "grad_norm": 2.75, "learning_rate": 5.826463253503132e-06, "loss": 1.98, "step": 6540 }, { "epoch": 2.2284596410648976, "grad_norm": 2.25, "learning_rate": 5.808109745378048e-06, "loss": 1.8649, "step": 6550 }, { "epoch": 2.2318618695245385, "grad_norm": 2.265625, "learning_rate": 5.789763181928373e-06, "loss": 1.9079, "step": 6560 }, { "epoch": 2.23526409798418, "grad_norm": 2.421875, "learning_rate": 5.771423702410762e-06, "loss": 1.9156, "step": 6570 }, { "epoch": 2.2386663264438207, "grad_norm": 2.0, "learning_rate": 5.753091446028094e-06, "loss": 1.9416, "step": 6580 }, { "epoch": 2.2420685549034616, "grad_norm": 2.265625, "learning_rate": 5.734766551928427e-06, "loss": 1.8595, "step": 6590 }, { "epoch": 2.245470783363103, "grad_norm": 2.3125, "learning_rate": 5.716449159203939e-06, "loss": 1.9292, "step": 6600 }, { "epoch": 2.248873011822744, "grad_norm": 2.15625, "learning_rate": 5.698139406889855e-06, "loss": 1.9578, "step": 6610 }, { "epoch": 2.252275240282385, "grad_norm": 2.203125, "learning_rate": 5.679837433963432e-06, "loss": 1.9706, "step": 6620 }, { "epoch": 2.255677468742026, "grad_norm": 2.359375, "learning_rate": 5.661543379342855e-06, "loss": 1.9641, "step": 6630 }, { "epoch": 2.259079697201667, "grad_norm": 2.328125, "learning_rate": 5.643257381886218e-06, "loss": 1.9505, "step": 6640 }, { "epoch": 2.2624819256613082, "grad_norm": 2.046875, "learning_rate": 5.624979580390459e-06, "loss": 1.9631, "step": 6650 }, { "epoch": 2.265884154120949, "grad_norm": 2.375, "learning_rate": 5.6067101135902996e-06, "loss": 1.9767, "step": 6660 }, { "epoch": 2.2692863825805905, "grad_norm": 1.8515625, "learning_rate": 5.588449120157205e-06, "loss": 1.9077, "step": 6670 }, { "epoch": 2.2726886110402313, "grad_norm": 2.3125, "learning_rate": 5.57019673869832e-06, "loss": 1.9133, "step": 6680 }, { "epoch": 2.2760908394998722, "grad_norm": 2.265625, "learning_rate": 5.5519531077554244e-06, "loss": 1.8405, "step": 6690 }, { "epoch": 2.2794930679595136, "grad_norm": 2.375, "learning_rate": 5.533718365803875e-06, "loss": 1.8948, "step": 6700 }, { "epoch": 2.2828952964191545, "grad_norm": 2.265625, "learning_rate": 5.51549265125156e-06, "loss": 1.9344, "step": 6710 }, { "epoch": 2.286297524878796, "grad_norm": 2.015625, "learning_rate": 5.4972761024378514e-06, "loss": 1.842, "step": 6720 }, { "epoch": 2.2896997533384367, "grad_norm": 2.28125, "learning_rate": 5.479068857632542e-06, "loss": 1.9172, "step": 6730 }, { "epoch": 2.2931019817980776, "grad_norm": 2.171875, "learning_rate": 5.46087105503481e-06, "loss": 1.9252, "step": 6740 }, { "epoch": 2.296504210257719, "grad_norm": 2.21875, "learning_rate": 5.4426828327721594e-06, "loss": 1.9356, "step": 6750 }, { "epoch": 2.29990643871736, "grad_norm": 2.3125, "learning_rate": 5.4245043288993795e-06, "loss": 1.9462, "step": 6760 }, { "epoch": 2.303308667177001, "grad_norm": 2.375, "learning_rate": 5.406335681397498e-06, "loss": 1.9788, "step": 6770 }, { "epoch": 2.306710895636642, "grad_norm": 2.578125, "learning_rate": 5.388177028172714e-06, "loss": 1.9221, "step": 6780 }, { "epoch": 2.310113124096283, "grad_norm": 1.9609375, "learning_rate": 5.370028507055387e-06, "loss": 1.9344, "step": 6790 }, { "epoch": 2.313515352555924, "grad_norm": 2.140625, "learning_rate": 5.351890255798953e-06, "loss": 1.871, "step": 6800 }, { "epoch": 2.316917581015565, "grad_norm": 1.984375, "learning_rate": 5.333762412078907e-06, "loss": 1.975, "step": 6810 }, { "epoch": 2.3203198094752064, "grad_norm": 2.21875, "learning_rate": 5.315645113491743e-06, "loss": 1.9103, "step": 6820 }, { "epoch": 2.3237220379348473, "grad_norm": 2.203125, "learning_rate": 5.2975384975539145e-06, "loss": 1.9036, "step": 6830 }, { "epoch": 2.327124266394488, "grad_norm": 2.140625, "learning_rate": 5.279442701700792e-06, "loss": 1.9292, "step": 6840 }, { "epoch": 2.3305264948541295, "grad_norm": 2.34375, "learning_rate": 5.261357863285613e-06, "loss": 1.9181, "step": 6850 }, { "epoch": 2.3339287233137704, "grad_norm": 2.359375, "learning_rate": 5.243284119578448e-06, "loss": 1.8917, "step": 6860 }, { "epoch": 2.3373309517734118, "grad_norm": 2.484375, "learning_rate": 5.225221607765159e-06, "loss": 1.9389, "step": 6870 }, { "epoch": 2.3407331802330527, "grad_norm": 2.6875, "learning_rate": 5.207170464946342e-06, "loss": 1.9298, "step": 6880 }, { "epoch": 2.3441354086926935, "grad_norm": 2.078125, "learning_rate": 5.189130828136312e-06, "loss": 1.9011, "step": 6890 }, { "epoch": 2.347537637152335, "grad_norm": 2.40625, "learning_rate": 5.1711028342620375e-06, "loss": 1.908, "step": 6900 }, { "epoch": 2.3509398656119758, "grad_norm": 2.65625, "learning_rate": 5.153086620162123e-06, "loss": 1.8829, "step": 6910 }, { "epoch": 2.354342094071617, "grad_norm": 2.25, "learning_rate": 5.135082322585758e-06, "loss": 1.9441, "step": 6920 }, { "epoch": 2.357744322531258, "grad_norm": 2.4375, "learning_rate": 5.117090078191676e-06, "loss": 1.9403, "step": 6930 }, { "epoch": 2.361146550990899, "grad_norm": 2.46875, "learning_rate": 5.09911002354713e-06, "loss": 1.9478, "step": 6940 }, { "epoch": 2.36454877945054, "grad_norm": 2.0625, "learning_rate": 5.081142295126842e-06, "loss": 1.8916, "step": 6950 }, { "epoch": 2.367951007910181, "grad_norm": 2.4375, "learning_rate": 5.063187029311983e-06, "loss": 1.9323, "step": 6960 }, { "epoch": 2.3713532363698224, "grad_norm": 1.9375, "learning_rate": 5.045244362389115e-06, "loss": 1.9571, "step": 6970 }, { "epoch": 2.3747554648294633, "grad_norm": 1.8359375, "learning_rate": 5.027314430549185e-06, "loss": 1.9486, "step": 6980 }, { "epoch": 2.378157693289104, "grad_norm": 2.4375, "learning_rate": 5.009397369886466e-06, "loss": 1.944, "step": 6990 }, { "epoch": 2.3815599217487455, "grad_norm": 2.390625, "learning_rate": 4.991493316397536e-06, "loss": 1.9539, "step": 7000 }, { "epoch": 2.3849621502083864, "grad_norm": 2.21875, "learning_rate": 4.973602405980251e-06, "loss": 1.8877, "step": 7010 }, { "epoch": 2.3883643786680278, "grad_norm": 2.1875, "learning_rate": 4.955724774432697e-06, "loss": 1.9579, "step": 7020 }, { "epoch": 2.3917666071276686, "grad_norm": 2.4375, "learning_rate": 4.937860557452174e-06, "loss": 1.9066, "step": 7030 }, { "epoch": 2.3951688355873095, "grad_norm": 2.328125, "learning_rate": 4.920009890634164e-06, "loss": 1.9488, "step": 7040 }, { "epoch": 2.398571064046951, "grad_norm": 2.203125, "learning_rate": 4.902172909471289e-06, "loss": 1.9939, "step": 7050 }, { "epoch": 2.4019732925065918, "grad_norm": 2.390625, "learning_rate": 4.884349749352304e-06, "loss": 1.9718, "step": 7060 }, { "epoch": 2.405375520966233, "grad_norm": 2.53125, "learning_rate": 4.866540545561045e-06, "loss": 1.9198, "step": 7070 }, { "epoch": 2.408777749425874, "grad_norm": 2.421875, "learning_rate": 4.848745433275427e-06, "loss": 1.8993, "step": 7080 }, { "epoch": 2.412179977885515, "grad_norm": 2.65625, "learning_rate": 4.830964547566399e-06, "loss": 1.9977, "step": 7090 }, { "epoch": 2.415582206345156, "grad_norm": 2.265625, "learning_rate": 4.813198023396925e-06, "loss": 1.911, "step": 7100 }, { "epoch": 2.418984434804797, "grad_norm": 2.25, "learning_rate": 4.795445995620965e-06, "loss": 1.977, "step": 7110 }, { "epoch": 2.4223866632644384, "grad_norm": 2.203125, "learning_rate": 4.777708598982436e-06, "loss": 1.9065, "step": 7120 }, { "epoch": 2.4257888917240793, "grad_norm": 2.28125, "learning_rate": 4.759985968114213e-06, "loss": 1.9569, "step": 7130 }, { "epoch": 2.42919112018372, "grad_norm": 2.59375, "learning_rate": 4.742278237537088e-06, "loss": 1.9151, "step": 7140 }, { "epoch": 2.4325933486433615, "grad_norm": 1.90625, "learning_rate": 4.72458554165875e-06, "loss": 1.984, "step": 7150 }, { "epoch": 2.4359955771030024, "grad_norm": 1.9453125, "learning_rate": 4.706908014772776e-06, "loss": 1.9921, "step": 7160 }, { "epoch": 2.4393978055626437, "grad_norm": 2.515625, "learning_rate": 4.689245791057602e-06, "loss": 1.9753, "step": 7170 }, { "epoch": 2.4428000340222846, "grad_norm": 1.9765625, "learning_rate": 4.671599004575511e-06, "loss": 1.9305, "step": 7180 }, { "epoch": 2.4462022624819255, "grad_norm": 2.34375, "learning_rate": 4.653967789271607e-06, "loss": 1.8709, "step": 7190 }, { "epoch": 2.449604490941567, "grad_norm": 2.359375, "learning_rate": 4.636352278972806e-06, "loss": 1.9123, "step": 7200 }, { "epoch": 2.4530067194012077, "grad_norm": 2.046875, "learning_rate": 4.618752607386824e-06, "loss": 1.8976, "step": 7210 }, { "epoch": 2.456408947860849, "grad_norm": 2.375, "learning_rate": 4.601168908101142e-06, "loss": 2.0117, "step": 7220 }, { "epoch": 2.45981117632049, "grad_norm": 2.25, "learning_rate": 4.5836013145820175e-06, "loss": 1.8844, "step": 7230 }, { "epoch": 2.463213404780131, "grad_norm": 2.40625, "learning_rate": 4.5660499601734545e-06, "loss": 1.9541, "step": 7240 }, { "epoch": 2.466615633239772, "grad_norm": 2.375, "learning_rate": 4.548514978096198e-06, "loss": 1.9029, "step": 7250 }, { "epoch": 2.470017861699413, "grad_norm": 2.34375, "learning_rate": 4.5309965014467246e-06, "loss": 1.9122, "step": 7260 }, { "epoch": 2.4734200901590544, "grad_norm": 2.125, "learning_rate": 4.513494663196221e-06, "loss": 1.8935, "step": 7270 }, { "epoch": 2.4768223186186953, "grad_norm": 2.546875, "learning_rate": 4.496009596189593e-06, "loss": 1.9198, "step": 7280 }, { "epoch": 2.480224547078336, "grad_norm": 2.71875, "learning_rate": 4.478541433144435e-06, "loss": 1.8702, "step": 7290 }, { "epoch": 2.4836267755379775, "grad_norm": 2.171875, "learning_rate": 4.461090306650046e-06, "loss": 1.9336, "step": 7300 }, { "epoch": 2.4870290039976184, "grad_norm": 2.40625, "learning_rate": 4.443656349166409e-06, "loss": 1.9156, "step": 7310 }, { "epoch": 2.4904312324572597, "grad_norm": 2.078125, "learning_rate": 4.426239693023181e-06, "loss": 1.949, "step": 7320 }, { "epoch": 2.4938334609169006, "grad_norm": 2.34375, "learning_rate": 4.408840470418706e-06, "loss": 1.9331, "step": 7330 }, { "epoch": 2.4972356893765415, "grad_norm": 2.046875, "learning_rate": 4.391458813418992e-06, "loss": 1.9376, "step": 7340 }, { "epoch": 2.500637917836183, "grad_norm": 2.171875, "learning_rate": 4.374094853956726e-06, "loss": 1.8894, "step": 7350 }, { "epoch": 2.5040401462958237, "grad_norm": 2.40625, "learning_rate": 4.3567487238302625e-06, "loss": 2.0008, "step": 7360 }, { "epoch": 2.507442374755465, "grad_norm": 2.5, "learning_rate": 4.3394205547026224e-06, "loss": 1.8901, "step": 7370 }, { "epoch": 2.510844603215106, "grad_norm": 2.25, "learning_rate": 4.322110478100502e-06, "loss": 1.9533, "step": 7380 }, { "epoch": 2.514246831674747, "grad_norm": 2.171875, "learning_rate": 4.3048186254132606e-06, "loss": 1.9216, "step": 7390 }, { "epoch": 2.517649060134388, "grad_norm": 2.453125, "learning_rate": 4.287545127891939e-06, "loss": 1.9397, "step": 7400 }, { "epoch": 2.521051288594029, "grad_norm": 2.1875, "learning_rate": 4.270290116648254e-06, "loss": 1.9161, "step": 7410 }, { "epoch": 2.5244535170536704, "grad_norm": 2.484375, "learning_rate": 4.2530537226536075e-06, "loss": 1.8427, "step": 7420 }, { "epoch": 2.5278557455133113, "grad_norm": 2.84375, "learning_rate": 4.235836076738085e-06, "loss": 1.917, "step": 7430 }, { "epoch": 2.531257973972952, "grad_norm": 2.453125, "learning_rate": 4.218637309589471e-06, "loss": 1.8681, "step": 7440 }, { "epoch": 2.5346602024325935, "grad_norm": 2.171875, "learning_rate": 4.201457551752256e-06, "loss": 1.9049, "step": 7450 }, { "epoch": 2.5380624308922344, "grad_norm": 2.1875, "learning_rate": 4.184296933626636e-06, "loss": 1.9001, "step": 7460 }, { "epoch": 2.5414646593518757, "grad_norm": 2.46875, "learning_rate": 4.167155585467538e-06, "loss": 1.895, "step": 7470 }, { "epoch": 2.5448668878115166, "grad_norm": 1.890625, "learning_rate": 4.150033637383623e-06, "loss": 1.9132, "step": 7480 }, { "epoch": 2.5482691162711575, "grad_norm": 2.296875, "learning_rate": 4.132931219336289e-06, "loss": 1.9031, "step": 7490 }, { "epoch": 2.551671344730799, "grad_norm": 2.15625, "learning_rate": 4.115848461138707e-06, "loss": 1.8727, "step": 7500 }, { "epoch": 2.5550735731904397, "grad_norm": 2.5, "learning_rate": 4.0987854924548134e-06, "loss": 1.8808, "step": 7510 }, { "epoch": 2.558475801650081, "grad_norm": 2.5, "learning_rate": 4.081742442798342e-06, "loss": 1.9265, "step": 7520 }, { "epoch": 2.561878030109722, "grad_norm": 2.390625, "learning_rate": 4.064719441531834e-06, "loss": 1.9463, "step": 7530 }, { "epoch": 2.565280258569363, "grad_norm": 2.6875, "learning_rate": 4.04771661786565e-06, "loss": 1.9341, "step": 7540 }, { "epoch": 2.568682487029004, "grad_norm": 1.9296875, "learning_rate": 4.030734100857004e-06, "loss": 1.9036, "step": 7550 }, { "epoch": 2.572084715488645, "grad_norm": 2.21875, "learning_rate": 4.013772019408969e-06, "loss": 1.9604, "step": 7560 }, { "epoch": 2.5754869439482864, "grad_norm": 2.171875, "learning_rate": 3.9968305022695076e-06, "loss": 1.8938, "step": 7570 }, { "epoch": 2.5788891724079273, "grad_norm": 2.0625, "learning_rate": 3.979909678030498e-06, "loss": 1.976, "step": 7580 }, { "epoch": 2.582291400867568, "grad_norm": 2.609375, "learning_rate": 3.9630096751267395e-06, "loss": 1.9534, "step": 7590 }, { "epoch": 2.5856936293272095, "grad_norm": 2.1875, "learning_rate": 3.946130621835003e-06, "loss": 1.9374, "step": 7600 }, { "epoch": 2.5890958577868504, "grad_norm": 2.359375, "learning_rate": 3.929272646273037e-06, "loss": 1.9044, "step": 7610 }, { "epoch": 2.5924980862464917, "grad_norm": 2.265625, "learning_rate": 3.9124358763986045e-06, "loss": 1.9723, "step": 7620 }, { "epoch": 2.5959003147061326, "grad_norm": 2.578125, "learning_rate": 3.895620440008517e-06, "loss": 1.8593, "step": 7630 }, { "epoch": 2.5993025431657735, "grad_norm": 2.5, "learning_rate": 3.878826464737643e-06, "loss": 1.9203, "step": 7640 }, { "epoch": 2.602704771625415, "grad_norm": 2.5625, "learning_rate": 3.862054078057968e-06, "loss": 1.9127, "step": 7650 }, { "epoch": 2.6061070000850557, "grad_norm": 2.421875, "learning_rate": 3.845303407277605e-06, "loss": 1.8969, "step": 7660 }, { "epoch": 2.609509228544697, "grad_norm": 2.078125, "learning_rate": 3.828574579539842e-06, "loss": 1.957, "step": 7670 }, { "epoch": 2.612911457004338, "grad_norm": 2.046875, "learning_rate": 3.811867721822161e-06, "loss": 1.9497, "step": 7680 }, { "epoch": 2.616313685463979, "grad_norm": 2.484375, "learning_rate": 3.7951829609352926e-06, "loss": 1.9144, "step": 7690 }, { "epoch": 2.61971591392362, "grad_norm": 2.640625, "learning_rate": 3.778520423522247e-06, "loss": 1.9252, "step": 7700 }, { "epoch": 2.623118142383261, "grad_norm": 2.390625, "learning_rate": 3.7618802360573384e-06, "loss": 1.9192, "step": 7710 }, { "epoch": 2.6265203708429024, "grad_norm": 2.0, "learning_rate": 3.7452625248452478e-06, "loss": 1.887, "step": 7720 }, { "epoch": 2.6299225993025432, "grad_norm": 2.390625, "learning_rate": 3.728667416020052e-06, "loss": 1.9326, "step": 7730 }, { "epoch": 2.633324827762184, "grad_norm": 2.484375, "learning_rate": 3.7120950355442677e-06, "loss": 1.9739, "step": 7740 }, { "epoch": 2.6367270562218255, "grad_norm": 2.1875, "learning_rate": 3.6955455092078956e-06, "loss": 1.9417, "step": 7750 }, { "epoch": 2.6401292846814663, "grad_norm": 2.078125, "learning_rate": 3.679018962627461e-06, "loss": 1.9288, "step": 7760 }, { "epoch": 2.6435315131411077, "grad_norm": 2.0625, "learning_rate": 3.6625155212450754e-06, "loss": 1.9062, "step": 7770 }, { "epoch": 2.6469337416007486, "grad_norm": 2.625, "learning_rate": 3.6460353103274615e-06, "loss": 1.9304, "step": 7780 }, { "epoch": 2.6503359700603895, "grad_norm": 2.109375, "learning_rate": 3.6295784549650233e-06, "loss": 1.9378, "step": 7790 }, { "epoch": 2.6537381985200303, "grad_norm": 2.234375, "learning_rate": 3.613145080070886e-06, "loss": 1.9244, "step": 7800 }, { "epoch": 2.6571404269796717, "grad_norm": 2.328125, "learning_rate": 3.59673531037995e-06, "loss": 1.8997, "step": 7810 }, { "epoch": 2.660542655439313, "grad_norm": 2.203125, "learning_rate": 3.5803492704479488e-06, "loss": 1.9715, "step": 7820 }, { "epoch": 2.663944883898954, "grad_norm": 2.0625, "learning_rate": 3.5639870846504873e-06, "loss": 1.917, "step": 7830 }, { "epoch": 2.667347112358595, "grad_norm": 2.4375, "learning_rate": 3.54764887718212e-06, "loss": 1.9122, "step": 7840 }, { "epoch": 2.6707493408182357, "grad_norm": 2.265625, "learning_rate": 3.5313347720553963e-06, "loss": 1.9234, "step": 7850 }, { "epoch": 2.674151569277877, "grad_norm": 2.359375, "learning_rate": 3.5150448930999113e-06, "loss": 1.9519, "step": 7860 }, { "epoch": 2.6775537977375183, "grad_norm": 2.25, "learning_rate": 3.4987793639613926e-06, "loss": 1.9065, "step": 7870 }, { "epoch": 2.6809560261971592, "grad_norm": 2.171875, "learning_rate": 3.482538308100727e-06, "loss": 1.8604, "step": 7880 }, { "epoch": 2.6843582546568, "grad_norm": 2.328125, "learning_rate": 3.4663218487930547e-06, "loss": 1.8554, "step": 7890 }, { "epoch": 2.687760483116441, "grad_norm": 2.4375, "learning_rate": 3.4501301091268043e-06, "loss": 1.936, "step": 7900 }, { "epoch": 2.6911627115760823, "grad_norm": 2.328125, "learning_rate": 3.433963212002789e-06, "loss": 1.8966, "step": 7910 }, { "epoch": 2.6945649400357237, "grad_norm": 2.15625, "learning_rate": 3.41782128013325e-06, "loss": 1.9634, "step": 7920 }, { "epoch": 2.6979671684953646, "grad_norm": 2.546875, "learning_rate": 3.4017044360409375e-06, "loss": 1.922, "step": 7930 }, { "epoch": 2.7013693969550054, "grad_norm": 2.4375, "learning_rate": 3.3856128020581783e-06, "loss": 1.9411, "step": 7940 }, { "epoch": 2.7047716254146463, "grad_norm": 2.265625, "learning_rate": 3.3695465003259376e-06, "loss": 1.8679, "step": 7950 }, { "epoch": 2.7081738538742877, "grad_norm": 1.953125, "learning_rate": 3.353505652792909e-06, "loss": 1.906, "step": 7960 }, { "epoch": 2.711576082333929, "grad_norm": 2.421875, "learning_rate": 3.3374903812145784e-06, "loss": 1.8951, "step": 7970 }, { "epoch": 2.71497831079357, "grad_norm": 2.546875, "learning_rate": 3.3215008071522965e-06, "loss": 1.9556, "step": 7980 }, { "epoch": 2.7183805392532108, "grad_norm": 2.21875, "learning_rate": 3.3055370519723652e-06, "loss": 1.9427, "step": 7990 }, { "epoch": 2.7217827677128517, "grad_norm": 2.71875, "learning_rate": 3.289599236845113e-06, "loss": 1.9533, "step": 8000 }, { "epoch": 2.725184996172493, "grad_norm": 2.609375, "learning_rate": 3.273687482743974e-06, "loss": 1.9608, "step": 8010 }, { "epoch": 2.7285872246321343, "grad_norm": 1.9609375, "learning_rate": 3.2578019104445702e-06, "loss": 1.9894, "step": 8020 }, { "epoch": 2.731989453091775, "grad_norm": 2.46875, "learning_rate": 3.241942640523791e-06, "loss": 1.864, "step": 8030 }, { "epoch": 2.735391681551416, "grad_norm": 2.40625, "learning_rate": 3.2261097933588893e-06, "loss": 1.9567, "step": 8040 }, { "epoch": 2.738793910011057, "grad_norm": 2.65625, "learning_rate": 3.210303489126551e-06, "loss": 1.9093, "step": 8050 }, { "epoch": 2.7421961384706983, "grad_norm": 2.4375, "learning_rate": 3.1945238478020003e-06, "loss": 1.9673, "step": 8060 }, { "epoch": 2.745598366930339, "grad_norm": 2.265625, "learning_rate": 3.1787709891580763e-06, "loss": 1.9712, "step": 8070 }, { "epoch": 2.7490005953899805, "grad_norm": 2.265625, "learning_rate": 3.1630450327643315e-06, "loss": 1.9127, "step": 8080 }, { "epoch": 2.7524028238496214, "grad_norm": 2.234375, "learning_rate": 3.147346097986121e-06, "loss": 1.9763, "step": 8090 }, { "epoch": 2.7558050523092623, "grad_norm": 1.9453125, "learning_rate": 3.1316743039836908e-06, "loss": 1.8313, "step": 8100 }, { "epoch": 2.7592072807689036, "grad_norm": 2.0625, "learning_rate": 3.1160297697112855e-06, "loss": 1.9062, "step": 8110 }, { "epoch": 2.7626095092285445, "grad_norm": 2.25, "learning_rate": 3.10041261391624e-06, "loss": 1.9072, "step": 8120 }, { "epoch": 2.766011737688186, "grad_norm": 2.546875, "learning_rate": 3.0848229551380702e-06, "loss": 1.932, "step": 8130 }, { "epoch": 2.7694139661478268, "grad_norm": 2.375, "learning_rate": 3.069260911707586e-06, "loss": 1.9311, "step": 8140 }, { "epoch": 2.7728161946074676, "grad_norm": 2.6875, "learning_rate": 3.0537266017459856e-06, "loss": 1.9067, "step": 8150 }, { "epoch": 2.776218423067109, "grad_norm": 2.203125, "learning_rate": 3.0382201431639656e-06, "loss": 1.978, "step": 8160 }, { "epoch": 2.77962065152675, "grad_norm": 2.375, "learning_rate": 3.0227416536608095e-06, "loss": 1.9084, "step": 8170 }, { "epoch": 2.783022879986391, "grad_norm": 2.203125, "learning_rate": 3.0072912507235167e-06, "loss": 1.8865, "step": 8180 }, { "epoch": 2.786425108446032, "grad_norm": 2.015625, "learning_rate": 2.991869051625898e-06, "loss": 1.9293, "step": 8190 }, { "epoch": 2.789827336905673, "grad_norm": 2.59375, "learning_rate": 2.9764751734276803e-06, "loss": 1.9127, "step": 8200 }, { "epoch": 2.7932295653653143, "grad_norm": 2.453125, "learning_rate": 2.9611097329736394e-06, "loss": 1.9198, "step": 8210 }, { "epoch": 2.796631793824955, "grad_norm": 2.3125, "learning_rate": 2.9457728468926836e-06, "loss": 1.9261, "step": 8220 }, { "epoch": 2.8000340222845965, "grad_norm": 2.59375, "learning_rate": 2.930464631596993e-06, "loss": 1.9068, "step": 8230 }, { "epoch": 2.8034362507442374, "grad_norm": 2.40625, "learning_rate": 2.915185203281126e-06, "loss": 1.947, "step": 8240 }, { "epoch": 2.8068384792038783, "grad_norm": 2.34375, "learning_rate": 2.899934677921133e-06, "loss": 1.9014, "step": 8250 }, { "epoch": 2.8102407076635196, "grad_norm": 2.25, "learning_rate": 2.884713171273686e-06, "loss": 1.9012, "step": 8260 }, { "epoch": 2.8136429361231605, "grad_norm": 2.3125, "learning_rate": 2.869520798875194e-06, "loss": 1.9299, "step": 8270 }, { "epoch": 2.817045164582802, "grad_norm": 2.046875, "learning_rate": 2.8543576760409264e-06, "loss": 1.9472, "step": 8280 }, { "epoch": 2.8204473930424427, "grad_norm": 2.140625, "learning_rate": 2.839223917864142e-06, "loss": 1.9323, "step": 8290 }, { "epoch": 2.8238496215020836, "grad_norm": 2.203125, "learning_rate": 2.824119639215203e-06, "loss": 1.9394, "step": 8300 }, { "epoch": 2.827251849961725, "grad_norm": 2.515625, "learning_rate": 2.809044954740723e-06, "loss": 1.9369, "step": 8310 }, { "epoch": 2.830654078421366, "grad_norm": 2.46875, "learning_rate": 2.7939999788626755e-06, "loss": 1.9025, "step": 8320 }, { "epoch": 2.834056306881007, "grad_norm": 2.390625, "learning_rate": 2.778984825777543e-06, "loss": 1.908, "step": 8330 }, { "epoch": 2.837458535340648, "grad_norm": 2.5, "learning_rate": 2.763999609455441e-06, "loss": 1.9814, "step": 8340 }, { "epoch": 2.840860763800289, "grad_norm": 2.421875, "learning_rate": 2.7490444436392535e-06, "loss": 1.9804, "step": 8350 }, { "epoch": 2.8442629922599303, "grad_norm": 2.359375, "learning_rate": 2.7341194418437747e-06, "loss": 1.9187, "step": 8360 }, { "epoch": 2.847665220719571, "grad_norm": 2.25, "learning_rate": 2.7192247173548356e-06, "loss": 1.8885, "step": 8370 }, { "epoch": 2.8510674491792125, "grad_norm": 2.515625, "learning_rate": 2.7043603832284616e-06, "loss": 1.9056, "step": 8380 }, { "epoch": 2.8544696776388534, "grad_norm": 2.5625, "learning_rate": 2.689526552289997e-06, "loss": 1.9068, "step": 8390 }, { "epoch": 2.8578719060984943, "grad_norm": 1.9375, "learning_rate": 2.6747233371332606e-06, "loss": 2.0559, "step": 8400 }, { "epoch": 2.8612741345581356, "grad_norm": 2.140625, "learning_rate": 2.6599508501196876e-06, "loss": 1.9102, "step": 8410 }, { "epoch": 2.8646763630177765, "grad_norm": 2.3125, "learning_rate": 2.6452092033774744e-06, "loss": 1.878, "step": 8420 }, { "epoch": 2.868078591477418, "grad_norm": 2.21875, "learning_rate": 2.630498508800734e-06, "loss": 1.9412, "step": 8430 }, { "epoch": 2.8714808199370587, "grad_norm": 2.59375, "learning_rate": 2.6158188780486312e-06, "loss": 1.8957, "step": 8440 }, { "epoch": 2.8748830483966996, "grad_norm": 2.65625, "learning_rate": 2.6011704225445548e-06, "loss": 1.8656, "step": 8450 }, { "epoch": 2.878285276856341, "grad_norm": 2.5, "learning_rate": 2.586553253475264e-06, "loss": 1.9598, "step": 8460 }, { "epoch": 2.881687505315982, "grad_norm": 2.25, "learning_rate": 2.5719674817900346e-06, "loss": 1.957, "step": 8470 }, { "epoch": 2.885089733775623, "grad_norm": 2.296875, "learning_rate": 2.5574132181998334e-06, "loss": 1.9725, "step": 8480 }, { "epoch": 2.888491962235264, "grad_norm": 1.9765625, "learning_rate": 2.5428905731764664e-06, "loss": 1.9228, "step": 8490 }, { "epoch": 2.891894190694905, "grad_norm": 2.40625, "learning_rate": 2.5283996569517464e-06, "loss": 1.938, "step": 8500 }, { "epoch": 2.8952964191545463, "grad_norm": 2.21875, "learning_rate": 2.5139405795166538e-06, "loss": 1.9243, "step": 8510 }, { "epoch": 2.898698647614187, "grad_norm": 2.3125, "learning_rate": 2.4995134506204964e-06, "loss": 1.9328, "step": 8520 }, { "epoch": 2.9021008760738285, "grad_norm": 2.15625, "learning_rate": 2.48511837977009e-06, "loss": 1.9199, "step": 8530 }, { "epoch": 2.9055031045334694, "grad_norm": 2.625, "learning_rate": 2.4707554762289077e-06, "loss": 1.9613, "step": 8540 }, { "epoch": 2.9089053329931103, "grad_norm": 2.046875, "learning_rate": 2.4564248490162763e-06, "loss": 1.9547, "step": 8550 }, { "epoch": 2.9123075614527516, "grad_norm": 2.328125, "learning_rate": 2.442126606906526e-06, "loss": 2.0251, "step": 8560 }, { "epoch": 2.9157097899123925, "grad_norm": 2.40625, "learning_rate": 2.4278608584281694e-06, "loss": 1.9231, "step": 8570 }, { "epoch": 2.919112018372034, "grad_norm": 2.625, "learning_rate": 2.413627711863091e-06, "loss": 1.9295, "step": 8580 }, { "epoch": 2.9225142468316747, "grad_norm": 2.5, "learning_rate": 2.399427275245705e-06, "loss": 1.9444, "step": 8590 }, { "epoch": 2.9259164752913156, "grad_norm": 2.328125, "learning_rate": 2.3852596563621536e-06, "loss": 1.9794, "step": 8600 }, { "epoch": 2.929318703750957, "grad_norm": 2.1875, "learning_rate": 2.3711249627494803e-06, "loss": 1.9096, "step": 8610 }, { "epoch": 2.932720932210598, "grad_norm": 2.578125, "learning_rate": 2.3570233016948133e-06, "loss": 1.9062, "step": 8620 }, { "epoch": 2.936123160670239, "grad_norm": 2.34375, "learning_rate": 2.3429547802345537e-06, "loss": 1.8779, "step": 8630 }, { "epoch": 2.93952538912988, "grad_norm": 2.265625, "learning_rate": 2.3289195051535584e-06, "loss": 1.8901, "step": 8640 }, { "epoch": 2.942927617589521, "grad_norm": 2.203125, "learning_rate": 2.3149175829843367e-06, "loss": 1.9073, "step": 8650 }, { "epoch": 2.9463298460491623, "grad_norm": 2.46875, "learning_rate": 2.3009491200062343e-06, "loss": 1.9434, "step": 8660 }, { "epoch": 2.949732074508803, "grad_norm": 2.1875, "learning_rate": 2.287014222244634e-06, "loss": 1.88, "step": 8670 }, { "epoch": 2.9531343029684445, "grad_norm": 2.109375, "learning_rate": 2.273112995470147e-06, "loss": 1.968, "step": 8680 }, { "epoch": 2.9565365314280854, "grad_norm": 2.03125, "learning_rate": 2.259245545197807e-06, "loss": 1.9048, "step": 8690 }, { "epoch": 2.9599387598877263, "grad_norm": 2.46875, "learning_rate": 2.245411976686278e-06, "loss": 1.9502, "step": 8700 }, { "epoch": 2.9633409883473676, "grad_norm": 2.546875, "learning_rate": 2.231612394937042e-06, "loss": 1.87, "step": 8710 }, { "epoch": 2.9667432168070085, "grad_norm": 2.234375, "learning_rate": 2.217846904693616e-06, "loss": 1.9337, "step": 8720 }, { "epoch": 2.97014544526665, "grad_norm": 2.609375, "learning_rate": 2.2041156104407518e-06, "loss": 1.9095, "step": 8730 }, { "epoch": 2.9735476737262907, "grad_norm": 2.4375, "learning_rate": 2.1904186164036358e-06, "loss": 1.9346, "step": 8740 }, { "epoch": 2.9769499021859316, "grad_norm": 2.09375, "learning_rate": 2.1767560265471087e-06, "loss": 1.9296, "step": 8750 }, { "epoch": 2.980352130645573, "grad_norm": 2.484375, "learning_rate": 2.163127944574872e-06, "loss": 1.9386, "step": 8760 }, { "epoch": 2.983754359105214, "grad_norm": 2.40625, "learning_rate": 2.149534473928699e-06, "loss": 1.9189, "step": 8770 }, { "epoch": 2.987156587564855, "grad_norm": 2.46875, "learning_rate": 2.135975717787654e-06, "loss": 1.8996, "step": 8780 }, { "epoch": 2.990558816024496, "grad_norm": 2.1875, "learning_rate": 2.1224517790673003e-06, "loss": 1.937, "step": 8790 }, { "epoch": 2.993961044484137, "grad_norm": 2.234375, "learning_rate": 2.108962760418933e-06, "loss": 1.9724, "step": 8800 }, { "epoch": 2.9973632729437782, "grad_norm": 2.5, "learning_rate": 2.0955087642287833e-06, "loss": 1.9497, "step": 8810 }, { "epoch": 3.000765501403419, "grad_norm": 2.5, "learning_rate": 2.0820898926172546e-06, "loss": 1.9683, "step": 8820 }, { "epoch": 3.0041677298630605, "grad_norm": 2.375, "learning_rate": 2.0687062474381516e-06, "loss": 1.9146, "step": 8830 }, { "epoch": 3.0075699583227014, "grad_norm": 2.515625, "learning_rate": 2.05535793027788e-06, "loss": 1.9749, "step": 8840 }, { "epoch": 3.0109721867823422, "grad_norm": 2.46875, "learning_rate": 2.042045042454711e-06, "loss": 1.9554, "step": 8850 }, { "epoch": 3.0143744152419836, "grad_norm": 2.53125, "learning_rate": 2.028767685017981e-06, "loss": 1.8963, "step": 8860 }, { "epoch": 3.0177766437016245, "grad_norm": 2.671875, "learning_rate": 2.015525958747352e-06, "loss": 1.938, "step": 8870 }, { "epoch": 3.021178872161266, "grad_norm": 2.625, "learning_rate": 2.0023199641520177e-06, "loss": 1.9223, "step": 8880 }, { "epoch": 3.0245811006209067, "grad_norm": 2.625, "learning_rate": 1.989149801469974e-06, "loss": 1.8825, "step": 8890 }, { "epoch": 3.0279833290805476, "grad_norm": 2.703125, "learning_rate": 1.97601557066723e-06, "loss": 1.9489, "step": 8900 }, { "epoch": 3.031385557540189, "grad_norm": 2.109375, "learning_rate": 1.9629173714370583e-06, "loss": 1.9236, "step": 8910 }, { "epoch": 3.03478778599983, "grad_norm": 2.078125, "learning_rate": 1.949855303199246e-06, "loss": 1.9561, "step": 8920 }, { "epoch": 3.038190014459471, "grad_norm": 2.484375, "learning_rate": 1.9368294650993263e-06, "loss": 1.8969, "step": 8930 }, { "epoch": 3.041592242919112, "grad_norm": 2.125, "learning_rate": 1.92383995600784e-06, "loss": 1.9331, "step": 8940 }, { "epoch": 3.044994471378753, "grad_norm": 2.40625, "learning_rate": 1.910886874519575e-06, "loss": 1.9734, "step": 8950 }, { "epoch": 3.0483966998383942, "grad_norm": 2.0625, "learning_rate": 1.8979703189528225e-06, "loss": 1.918, "step": 8960 }, { "epoch": 3.051798928298035, "grad_norm": 2.40625, "learning_rate": 1.885090387348631e-06, "loss": 1.9162, "step": 8970 }, { "epoch": 3.0552011567576765, "grad_norm": 2.421875, "learning_rate": 1.8722471774700541e-06, "loss": 1.9047, "step": 8980 }, { "epoch": 3.0586033852173173, "grad_norm": 2.40625, "learning_rate": 1.8594407868014222e-06, "loss": 1.9391, "step": 8990 }, { "epoch": 3.0620056136769582, "grad_norm": 2.53125, "learning_rate": 1.8466713125475953e-06, "loss": 1.9597, "step": 9000 }, { "epoch": 3.0654078421365996, "grad_norm": 2.125, "learning_rate": 1.8339388516332183e-06, "loss": 1.9123, "step": 9010 }, { "epoch": 3.0688100705962404, "grad_norm": 2.265625, "learning_rate": 1.8212435007019987e-06, "loss": 1.9063, "step": 9020 }, { "epoch": 3.072212299055882, "grad_norm": 2.0625, "learning_rate": 1.8085853561159651e-06, "loss": 1.8604, "step": 9030 }, { "epoch": 3.0756145275155227, "grad_norm": 2.203125, "learning_rate": 1.7959645139547367e-06, "loss": 1.9165, "step": 9040 }, { "epoch": 3.0790167559751636, "grad_norm": 2.8125, "learning_rate": 1.7833810700147973e-06, "loss": 1.9096, "step": 9050 }, { "epoch": 3.082418984434805, "grad_norm": 2.203125, "learning_rate": 1.770835119808758e-06, "loss": 1.9433, "step": 9060 }, { "epoch": 3.0858212128944458, "grad_norm": 2.46875, "learning_rate": 1.7583267585646496e-06, "loss": 1.972, "step": 9070 }, { "epoch": 3.089223441354087, "grad_norm": 2.40625, "learning_rate": 1.7458560812251807e-06, "loss": 1.9191, "step": 9080 }, { "epoch": 3.092625669813728, "grad_norm": 2.046875, "learning_rate": 1.7334231824470327e-06, "loss": 1.882, "step": 9090 }, { "epoch": 3.096027898273369, "grad_norm": 2.40625, "learning_rate": 1.7210281566001321e-06, "loss": 1.9086, "step": 9100 }, { "epoch": 3.09943012673301, "grad_norm": 2.09375, "learning_rate": 1.7086710977669391e-06, "loss": 1.9225, "step": 9110 }, { "epoch": 3.102832355192651, "grad_norm": 2.515625, "learning_rate": 1.6963520997417304e-06, "loss": 1.9364, "step": 9120 }, { "epoch": 3.1062345836522924, "grad_norm": 2.40625, "learning_rate": 1.684071256029885e-06, "loss": 1.962, "step": 9130 }, { "epoch": 3.1096368121119333, "grad_norm": 2.25, "learning_rate": 1.6718286598471834e-06, "loss": 1.9557, "step": 9140 }, { "epoch": 3.113039040571574, "grad_norm": 2.234375, "learning_rate": 1.6596244041190884e-06, "loss": 1.963, "step": 9150 }, { "epoch": 3.1164412690312155, "grad_norm": 2.453125, "learning_rate": 1.6474585814800486e-06, "loss": 1.8665, "step": 9160 }, { "epoch": 3.1198434974908564, "grad_norm": 2.234375, "learning_rate": 1.6353312842727971e-06, "loss": 1.9364, "step": 9170 }, { "epoch": 3.1232457259504978, "grad_norm": 1.9921875, "learning_rate": 1.6232426045476368e-06, "loss": 1.9379, "step": 9180 }, { "epoch": 3.1266479544101387, "grad_norm": 2.484375, "learning_rate": 1.6111926340617594e-06, "loss": 1.8696, "step": 9190 }, { "epoch": 3.1300501828697795, "grad_norm": 2.546875, "learning_rate": 1.599181464278531e-06, "loss": 1.9511, "step": 9200 }, { "epoch": 3.133452411329421, "grad_norm": 2.125, "learning_rate": 1.587209186366815e-06, "loss": 1.9289, "step": 9210 }, { "epoch": 3.1368546397890618, "grad_norm": 2.296875, "learning_rate": 1.5752758912002694e-06, "loss": 1.8937, "step": 9220 }, { "epoch": 3.140256868248703, "grad_norm": 2.265625, "learning_rate": 1.5633816693566608e-06, "loss": 1.8763, "step": 9230 }, { "epoch": 3.143659096708344, "grad_norm": 2.3125, "learning_rate": 1.5515266111171768e-06, "loss": 1.9913, "step": 9240 }, { "epoch": 3.147061325167985, "grad_norm": 2.5, "learning_rate": 1.5397108064657348e-06, "loss": 1.8861, "step": 9250 }, { "epoch": 3.150463553627626, "grad_norm": 2.109375, "learning_rate": 1.5279343450883104e-06, "loss": 1.9029, "step": 9260 }, { "epoch": 3.153865782087267, "grad_norm": 2.4375, "learning_rate": 1.5161973163722477e-06, "loss": 1.9382, "step": 9270 }, { "epoch": 3.1572680105469084, "grad_norm": 2.421875, "learning_rate": 1.5044998094055818e-06, "loss": 1.8859, "step": 9280 }, { "epoch": 3.1606702390065493, "grad_norm": 2.375, "learning_rate": 1.4928419129763672e-06, "loss": 1.8785, "step": 9290 }, { "epoch": 3.16407246746619, "grad_norm": 2.6875, "learning_rate": 1.4812237155720006e-06, "loss": 1.8864, "step": 9300 }, { "epoch": 3.1674746959258315, "grad_norm": 2.53125, "learning_rate": 1.4696453053785496e-06, "loss": 1.8698, "step": 9310 }, { "epoch": 3.1708769243854724, "grad_norm": 2.296875, "learning_rate": 1.4581067702800793e-06, "loss": 1.9852, "step": 9320 }, { "epoch": 3.1742791528451137, "grad_norm": 2.3125, "learning_rate": 1.4466081978579942e-06, "loss": 1.98, "step": 9330 }, { "epoch": 3.1776813813047546, "grad_norm": 2.34375, "learning_rate": 1.4351496753903699e-06, "loss": 1.925, "step": 9340 }, { "epoch": 3.1810836097643955, "grad_norm": 2.5, "learning_rate": 1.4237312898512816e-06, "loss": 1.9355, "step": 9350 }, { "epoch": 3.184485838224037, "grad_norm": 2.703125, "learning_rate": 1.4123531279101576e-06, "loss": 1.9966, "step": 9360 }, { "epoch": 3.1878880666836777, "grad_norm": 2.578125, "learning_rate": 1.4010152759311148e-06, "loss": 1.8377, "step": 9370 }, { "epoch": 3.191290295143319, "grad_norm": 2.296875, "learning_rate": 1.3897178199723027e-06, "loss": 1.9501, "step": 9380 }, { "epoch": 3.19469252360296, "grad_norm": 2.390625, "learning_rate": 1.3784608457852537e-06, "loss": 1.9103, "step": 9390 }, { "epoch": 3.198094752062601, "grad_norm": 2.578125, "learning_rate": 1.3672444388142238e-06, "loss": 1.9575, "step": 9400 }, { "epoch": 3.201496980522242, "grad_norm": 2.328125, "learning_rate": 1.3560686841955576e-06, "loss": 1.929, "step": 9410 }, { "epoch": 3.204899208981883, "grad_norm": 2.375, "learning_rate": 1.3449336667570272e-06, "loss": 1.9606, "step": 9420 }, { "epoch": 3.2083014374415244, "grad_norm": 2.3125, "learning_rate": 1.3338394710172017e-06, "loss": 1.9379, "step": 9430 }, { "epoch": 3.2117036659011653, "grad_norm": 2.640625, "learning_rate": 1.3227861811847961e-06, "loss": 1.8995, "step": 9440 }, { "epoch": 3.215105894360806, "grad_norm": 2.203125, "learning_rate": 1.3117738811580378e-06, "loss": 1.9038, "step": 9450 }, { "epoch": 3.2185081228204475, "grad_norm": 2.234375, "learning_rate": 1.3008026545240273e-06, "loss": 1.9499, "step": 9460 }, { "epoch": 3.2219103512800884, "grad_norm": 2.234375, "learning_rate": 1.2898725845581015e-06, "loss": 1.9625, "step": 9470 }, { "epoch": 3.2253125797397297, "grad_norm": 2.234375, "learning_rate": 1.2789837542232062e-06, "loss": 2.0014, "step": 9480 }, { "epoch": 3.2287148081993706, "grad_norm": 2.375, "learning_rate": 1.2681362461692674e-06, "loss": 1.9227, "step": 9490 }, { "epoch": 3.2321170366590115, "grad_norm": 1.90625, "learning_rate": 1.2573301427325523e-06, "loss": 1.9411, "step": 9500 }, { "epoch": 3.235519265118653, "grad_norm": 1.9375, "learning_rate": 1.246565525935065e-06, "loss": 1.8898, "step": 9510 }, { "epoch": 3.2389214935782937, "grad_norm": 2.25, "learning_rate": 1.2358424774839005e-06, "loss": 1.8962, "step": 9520 }, { "epoch": 3.242323722037935, "grad_norm": 2.5, "learning_rate": 1.2251610787706435e-06, "loss": 1.9404, "step": 9530 }, { "epoch": 3.245725950497576, "grad_norm": 2.265625, "learning_rate": 1.2145214108707407e-06, "loss": 1.8978, "step": 9540 }, { "epoch": 3.249128178957217, "grad_norm": 2.140625, "learning_rate": 1.2039235545428843e-06, "loss": 1.9312, "step": 9550 }, { "epoch": 3.252530407416858, "grad_norm": 2.140625, "learning_rate": 1.1933675902284088e-06, "loss": 1.8721, "step": 9560 }, { "epoch": 3.255932635876499, "grad_norm": 2.171875, "learning_rate": 1.182853598050669e-06, "loss": 1.9304, "step": 9570 }, { "epoch": 3.2593348643361404, "grad_norm": 2.34375, "learning_rate": 1.1723816578144417e-06, "loss": 1.8912, "step": 9580 }, { "epoch": 3.2627370927957813, "grad_norm": 2.375, "learning_rate": 1.1619518490053083e-06, "loss": 1.8852, "step": 9590 }, { "epoch": 3.266139321255422, "grad_norm": 2.359375, "learning_rate": 1.1515642507890646e-06, "loss": 1.9256, "step": 9600 }, { "epoch": 3.2695415497150635, "grad_norm": 2.375, "learning_rate": 1.141218942011112e-06, "loss": 1.8988, "step": 9610 }, { "epoch": 3.2729437781747044, "grad_norm": 2.4375, "learning_rate": 1.1309160011958583e-06, "loss": 1.9262, "step": 9620 }, { "epoch": 3.2763460066343457, "grad_norm": 2.078125, "learning_rate": 1.1206555065461265e-06, "loss": 1.9177, "step": 9630 }, { "epoch": 3.2797482350939866, "grad_norm": 2.28125, "learning_rate": 1.1104375359425585e-06, "loss": 1.9117, "step": 9640 }, { "epoch": 3.2831504635536275, "grad_norm": 2.703125, "learning_rate": 1.100262166943023e-06, "loss": 1.9711, "step": 9650 }, { "epoch": 3.286552692013269, "grad_norm": 2.296875, "learning_rate": 1.0901294767820318e-06, "loss": 1.9243, "step": 9660 }, { "epoch": 3.2899549204729097, "grad_norm": 2.4375, "learning_rate": 1.0800395423701436e-06, "loss": 1.9023, "step": 9670 }, { "epoch": 3.293357148932551, "grad_norm": 2.140625, "learning_rate": 1.0699924402933917e-06, "loss": 1.938, "step": 9680 }, { "epoch": 3.296759377392192, "grad_norm": 2.359375, "learning_rate": 1.0599882468126933e-06, "loss": 1.9328, "step": 9690 }, { "epoch": 3.300161605851833, "grad_norm": 2.109375, "learning_rate": 1.0500270378632782e-06, "loss": 1.9429, "step": 9700 }, { "epoch": 3.303563834311474, "grad_norm": 2.171875, "learning_rate": 1.0401088890541082e-06, "loss": 1.9068, "step": 9710 }, { "epoch": 3.306966062771115, "grad_norm": 2.28125, "learning_rate": 1.0302338756673032e-06, "loss": 1.9121, "step": 9720 }, { "epoch": 3.3103682912307564, "grad_norm": 2.28125, "learning_rate": 1.0204020726575725e-06, "loss": 1.9197, "step": 9730 }, { "epoch": 3.3137705196903973, "grad_norm": 2.09375, "learning_rate": 1.0106135546516385e-06, "loss": 1.9347, "step": 9740 }, { "epoch": 3.317172748150038, "grad_norm": 1.9375, "learning_rate": 1.0008683959476827e-06, "loss": 1.929, "step": 9750 }, { "epoch": 3.3205749766096795, "grad_norm": 2.203125, "learning_rate": 9.911666705147721e-07, "loss": 1.8878, "step": 9760 }, { "epoch": 3.3239772050693204, "grad_norm": 2.359375, "learning_rate": 9.815084519922975e-07, "loss": 1.8525, "step": 9770 }, { "epoch": 3.3273794335289617, "grad_norm": 2.03125, "learning_rate": 9.718938136894211e-07, "loss": 1.8368, "step": 9780 }, { "epoch": 3.3307816619886026, "grad_norm": 2.0, "learning_rate": 9.623228285845155e-07, "loss": 1.8964, "step": 9790 }, { "epoch": 3.3341838904482435, "grad_norm": 2.796875, "learning_rate": 9.527955693246117e-07, "loss": 1.9062, "step": 9800 }, { "epoch": 3.337586118907885, "grad_norm": 2.125, "learning_rate": 9.433121082248422e-07, "loss": 1.87, "step": 9810 }, { "epoch": 3.3409883473675257, "grad_norm": 2.5, "learning_rate": 9.33872517267902e-07, "loss": 1.9351, "step": 9820 }, { "epoch": 3.344390575827167, "grad_norm": 2.21875, "learning_rate": 9.244768681034954e-07, "loss": 1.9826, "step": 9830 }, { "epoch": 3.347792804286808, "grad_norm": 2.5625, "learning_rate": 9.151252320477888e-07, "loss": 1.9788, "step": 9840 }, { "epoch": 3.351195032746449, "grad_norm": 1.9765625, "learning_rate": 9.058176800828842e-07, "loss": 1.9306, "step": 9850 }, { "epoch": 3.35459726120609, "grad_norm": 2.375, "learning_rate": 8.965542828562589e-07, "loss": 1.9304, "step": 9860 }, { "epoch": 3.357999489665731, "grad_norm": 2.546875, "learning_rate": 8.873351106802486e-07, "loss": 1.9565, "step": 9870 }, { "epoch": 3.3614017181253724, "grad_norm": 2.28125, "learning_rate": 8.781602335315041e-07, "loss": 1.9325, "step": 9880 }, { "epoch": 3.3648039465850133, "grad_norm": 2.25, "learning_rate": 8.690297210504589e-07, "loss": 1.9074, "step": 9890 }, { "epoch": 3.368206175044654, "grad_norm": 2.65625, "learning_rate": 8.599436425408064e-07, "loss": 1.9338, "step": 9900 }, { "epoch": 3.3716084035042955, "grad_norm": 2.625, "learning_rate": 8.509020669689717e-07, "loss": 1.9236, "step": 9910 }, { "epoch": 3.3750106319639364, "grad_norm": 2.5625, "learning_rate": 8.419050629635849e-07, "loss": 1.9387, "step": 9920 }, { "epoch": 3.3784128604235777, "grad_norm": 2.4375, "learning_rate": 8.329526988149661e-07, "loss": 1.9503, "step": 9930 }, { "epoch": 3.3818150888832186, "grad_norm": 2.1875, "learning_rate": 8.240450424745993e-07, "loss": 1.9232, "step": 9940 }, { "epoch": 3.3852173173428595, "grad_norm": 2.546875, "learning_rate": 8.151821615546263e-07, "loss": 1.9435, "step": 9950 }, { "epoch": 3.388619545802501, "grad_norm": 2.203125, "learning_rate": 8.063641233273221e-07, "loss": 1.9005, "step": 9960 }, { "epoch": 3.3920217742621417, "grad_norm": 2.609375, "learning_rate": 7.975909947245956e-07, "loss": 1.864, "step": 9970 }, { "epoch": 3.3954240027217826, "grad_norm": 2.15625, "learning_rate": 7.888628423374738e-07, "loss": 1.9707, "step": 9980 }, { "epoch": 3.398826231181424, "grad_norm": 2.53125, "learning_rate": 7.801797324156009e-07, "loss": 1.9314, "step": 9990 }, { "epoch": 3.402228459641065, "grad_norm": 2.546875, "learning_rate": 7.715417308667326e-07, "loss": 1.9229, "step": 10000 }, { "epoch": 3.405630688100706, "grad_norm": 2.5625, "learning_rate": 7.629489032562336e-07, "loss": 1.86, "step": 10010 }, { "epoch": 3.409032916560347, "grad_norm": 2.4375, "learning_rate": 7.544013148065898e-07, "loss": 1.9123, "step": 10020 }, { "epoch": 3.412435145019988, "grad_norm": 1.8515625, "learning_rate": 7.45899030396898e-07, "loss": 1.8735, "step": 10030 }, { "epoch": 3.4158373734796292, "grad_norm": 2.375, "learning_rate": 7.374421145623891e-07, "loss": 1.9386, "step": 10040 }, { "epoch": 3.41923960193927, "grad_norm": 2.5625, "learning_rate": 7.290306314939283e-07, "loss": 1.8794, "step": 10050 }, { "epoch": 3.4226418303989115, "grad_norm": 2.296875, "learning_rate": 7.206646450375306e-07, "loss": 1.9236, "step": 10060 }, { "epoch": 3.4260440588585523, "grad_norm": 2.25, "learning_rate": 7.123442186938769e-07, "loss": 1.9224, "step": 10070 }, { "epoch": 3.4294462873181932, "grad_norm": 2.28125, "learning_rate": 7.040694156178301e-07, "loss": 1.9089, "step": 10080 }, { "epoch": 3.4328485157778346, "grad_norm": 2.125, "learning_rate": 6.958402986179579e-07, "loss": 1.9395, "step": 10090 }, { "epoch": 3.4362507442374755, "grad_norm": 2.703125, "learning_rate": 6.87656930156057e-07, "loss": 1.9217, "step": 10100 }, { "epoch": 3.439652972697117, "grad_norm": 2.203125, "learning_rate": 6.795193723466726e-07, "loss": 1.9458, "step": 10110 }, { "epoch": 3.4430552011567577, "grad_norm": 1.828125, "learning_rate": 6.714276869566347e-07, "loss": 1.9698, "step": 10120 }, { "epoch": 3.4464574296163986, "grad_norm": 2.3125, "learning_rate": 6.633819354045855e-07, "loss": 1.9773, "step": 10130 }, { "epoch": 3.44985965807604, "grad_norm": 2.34375, "learning_rate": 6.553821787605149e-07, "loss": 1.8458, "step": 10140 }, { "epoch": 3.453261886535681, "grad_norm": 2.265625, "learning_rate": 6.474284777452948e-07, "loss": 1.9633, "step": 10150 }, { "epoch": 3.456664114995322, "grad_norm": 2.234375, "learning_rate": 6.395208927302167e-07, "loss": 1.9253, "step": 10160 }, { "epoch": 3.460066343454963, "grad_norm": 1.984375, "learning_rate": 6.31659483736541e-07, "loss": 1.8867, "step": 10170 }, { "epoch": 3.463468571914604, "grad_norm": 2.46875, "learning_rate": 6.238443104350302e-07, "loss": 1.9415, "step": 10180 }, { "epoch": 3.466870800374245, "grad_norm": 2.4375, "learning_rate": 6.160754321455092e-07, "loss": 1.8688, "step": 10190 }, { "epoch": 3.470273028833886, "grad_norm": 2.359375, "learning_rate": 6.083529078364046e-07, "loss": 1.8777, "step": 10200 }, { "epoch": 3.4736752572935274, "grad_norm": 2.046875, "learning_rate": 6.006767961242978e-07, "loss": 1.8808, "step": 10210 }, { "epoch": 3.4770774857531683, "grad_norm": 2.140625, "learning_rate": 5.930471552734888e-07, "loss": 1.9203, "step": 10220 }, { "epoch": 3.480479714212809, "grad_norm": 2.21875, "learning_rate": 5.854640431955407e-07, "loss": 1.9427, "step": 10230 }, { "epoch": 3.4838819426724505, "grad_norm": 2.609375, "learning_rate": 5.779275174488542e-07, "loss": 1.9229, "step": 10240 }, { "epoch": 3.4872841711320914, "grad_norm": 2.328125, "learning_rate": 5.704376352382198e-07, "loss": 1.8909, "step": 10250 }, { "epoch": 3.4906863995917328, "grad_norm": 2.25, "learning_rate": 5.629944534143905e-07, "loss": 1.9481, "step": 10260 }, { "epoch": 3.4940886280513737, "grad_norm": 2.390625, "learning_rate": 5.555980284736454e-07, "loss": 1.9152, "step": 10270 }, { "epoch": 3.4974908565110145, "grad_norm": 2.03125, "learning_rate": 5.482484165573627e-07, "loss": 1.9002, "step": 10280 }, { "epoch": 3.500893084970656, "grad_norm": 2.34375, "learning_rate": 5.409456734515961e-07, "loss": 1.9427, "step": 10290 }, { "epoch": 3.5042953134302968, "grad_norm": 2.390625, "learning_rate": 5.336898545866455e-07, "loss": 1.9312, "step": 10300 }, { "epoch": 3.5076975418899377, "grad_norm": 2.3125, "learning_rate": 5.264810150366431e-07, "loss": 1.9146, "step": 10310 }, { "epoch": 3.511099770349579, "grad_norm": 2.625, "learning_rate": 5.193192095191315e-07, "loss": 1.932, "step": 10320 }, { "epoch": 3.51450199880922, "grad_norm": 2.21875, "learning_rate": 5.122044923946488e-07, "loss": 1.9544, "step": 10330 }, { "epoch": 3.517904227268861, "grad_norm": 2.21875, "learning_rate": 5.051369176663161e-07, "loss": 1.9132, "step": 10340 }, { "epoch": 3.521306455728502, "grad_norm": 2.09375, "learning_rate": 4.981165389794265e-07, "loss": 1.9379, "step": 10350 }, { "epoch": 3.524708684188143, "grad_norm": 2.359375, "learning_rate": 4.911434096210408e-07, "loss": 1.8495, "step": 10360 }, { "epoch": 3.5281109126477843, "grad_norm": 2.53125, "learning_rate": 4.842175825195817e-07, "loss": 1.964, "step": 10370 }, { "epoch": 3.531513141107425, "grad_norm": 2.09375, "learning_rate": 4.773391102444278e-07, "loss": 1.8755, "step": 10380 }, { "epoch": 3.5349153695670665, "grad_norm": 2.8125, "learning_rate": 4.705080450055242e-07, "loss": 1.902, "step": 10390 }, { "epoch": 3.5383175980267074, "grad_norm": 3.03125, "learning_rate": 4.63724438652977e-07, "loss": 1.9428, "step": 10400 }, { "epoch": 3.5417198264863483, "grad_norm": 2.125, "learning_rate": 4.5698834267666295e-07, "loss": 1.8812, "step": 10410 }, { "epoch": 3.5451220549459896, "grad_norm": 2.265625, "learning_rate": 4.502998082058419e-07, "loss": 1.9378, "step": 10420 }, { "epoch": 3.5485242834056305, "grad_norm": 2.546875, "learning_rate": 4.4365888600876105e-07, "loss": 1.8586, "step": 10430 }, { "epoch": 3.551926511865272, "grad_norm": 2.5, "learning_rate": 4.3706562649227966e-07, "loss": 1.9303, "step": 10440 }, { "epoch": 3.5553287403249128, "grad_norm": 2.28125, "learning_rate": 4.305200797014755e-07, "loss": 1.8785, "step": 10450 }, { "epoch": 3.5587309687845536, "grad_norm": 2.296875, "learning_rate": 4.2402229531927284e-07, "loss": 1.8698, "step": 10460 }, { "epoch": 3.562133197244195, "grad_norm": 2.203125, "learning_rate": 4.1757232266606775e-07, "loss": 1.9134, "step": 10470 }, { "epoch": 3.565535425703836, "grad_norm": 2.0, "learning_rate": 4.1117021069934086e-07, "loss": 1.9092, "step": 10480 }, { "epoch": 3.568937654163477, "grad_norm": 2.578125, "learning_rate": 4.048160080133004e-07, "loss": 1.8521, "step": 10490 }, { "epoch": 3.572339882623118, "grad_norm": 2.046875, "learning_rate": 3.985097628385017e-07, "loss": 1.9322, "step": 10500 }, { "epoch": 3.575742111082759, "grad_norm": 2.265625, "learning_rate": 3.9225152304149186e-07, "loss": 1.95, "step": 10510 }, { "epoch": 3.5791443395424003, "grad_norm": 2.40625, "learning_rate": 3.8604133612443344e-07, "loss": 1.8966, "step": 10520 }, { "epoch": 3.582546568002041, "grad_norm": 2.28125, "learning_rate": 3.798792492247598e-07, "loss": 1.8615, "step": 10530 }, { "epoch": 3.5859487964616825, "grad_norm": 2.203125, "learning_rate": 3.737653091148046e-07, "loss": 1.9687, "step": 10540 }, { "epoch": 3.5893510249213234, "grad_norm": 2.109375, "learning_rate": 3.6769956220144835e-07, "loss": 1.9133, "step": 10550 }, { "epoch": 3.5927532533809643, "grad_norm": 2.203125, "learning_rate": 3.61682054525775e-07, "loss": 1.9313, "step": 10560 }, { "epoch": 3.5961554818406056, "grad_norm": 2.359375, "learning_rate": 3.5571283176270955e-07, "loss": 2.0094, "step": 10570 }, { "epoch": 3.5995577103002465, "grad_norm": 2.328125, "learning_rate": 3.4979193922068417e-07, "loss": 1.9955, "step": 10580 }, { "epoch": 3.602959938759888, "grad_norm": 2.359375, "learning_rate": 3.439194218412834e-07, "loss": 1.9294, "step": 10590 }, { "epoch": 3.6063621672195287, "grad_norm": 2.390625, "learning_rate": 3.380953241989119e-07, "loss": 1.8658, "step": 10600 }, { "epoch": 3.6097643956791696, "grad_norm": 2.859375, "learning_rate": 3.3231969050044987e-07, "loss": 1.9264, "step": 10610 }, { "epoch": 3.613166624138811, "grad_norm": 2.15625, "learning_rate": 3.2659256458491855e-07, "loss": 1.9539, "step": 10620 }, { "epoch": 3.616568852598452, "grad_norm": 2.609375, "learning_rate": 3.209139899231508e-07, "loss": 1.9833, "step": 10630 }, { "epoch": 3.619971081058093, "grad_norm": 2.328125, "learning_rate": 3.1528400961745953e-07, "loss": 1.9088, "step": 10640 }, { "epoch": 3.623373309517734, "grad_norm": 2.359375, "learning_rate": 3.0970266640130633e-07, "loss": 1.9261, "step": 10650 }, { "epoch": 3.626775537977375, "grad_norm": 2.1875, "learning_rate": 3.0417000263898494e-07, "loss": 1.8439, "step": 10660 }, { "epoch": 3.6301777664370163, "grad_norm": 2.421875, "learning_rate": 2.9868606032529224e-07, "loss": 1.9474, "step": 10670 }, { "epoch": 3.633579994896657, "grad_norm": 2.296875, "learning_rate": 2.932508810852159e-07, "loss": 1.9432, "step": 10680 }, { "epoch": 3.6369822233562985, "grad_norm": 2.84375, "learning_rate": 2.8786450617361245e-07, "loss": 1.8769, "step": 10690 }, { "epoch": 3.6403844518159394, "grad_norm": 2.40625, "learning_rate": 2.825269764748977e-07, "loss": 1.9754, "step": 10700 }, { "epoch": 3.6437866802755803, "grad_norm": 2.109375, "learning_rate": 2.772383325027377e-07, "loss": 1.9327, "step": 10710 }, { "epoch": 3.6471889087352216, "grad_norm": 2.421875, "learning_rate": 2.719986143997357e-07, "loss": 1.916, "step": 10720 }, { "epoch": 3.6505911371948625, "grad_norm": 2.328125, "learning_rate": 2.668078619371333e-07, "loss": 1.8941, "step": 10730 }, { "epoch": 3.653993365654504, "grad_norm": 2.4375, "learning_rate": 2.616661145145063e-07, "loss": 1.9525, "step": 10740 }, { "epoch": 3.6573955941141447, "grad_norm": 2.546875, "learning_rate": 2.5657341115946487e-07, "loss": 1.8995, "step": 10750 }, { "epoch": 3.6607978225737856, "grad_norm": 2.65625, "learning_rate": 2.5152979052736e-07, "loss": 1.9815, "step": 10760 }, { "epoch": 3.664200051033427, "grad_norm": 2.765625, "learning_rate": 2.46535290900983e-07, "loss": 1.8823, "step": 10770 }, { "epoch": 3.667602279493068, "grad_norm": 2.171875, "learning_rate": 2.4158995019028676e-07, "loss": 1.9158, "step": 10780 }, { "epoch": 3.671004507952709, "grad_norm": 2.671875, "learning_rate": 2.3669380593208516e-07, "loss": 1.8857, "step": 10790 }, { "epoch": 3.67440673641235, "grad_norm": 2.40625, "learning_rate": 2.3184689528977832e-07, "loss": 1.8922, "step": 10800 }, { "epoch": 3.677808964871991, "grad_norm": 2.3125, "learning_rate": 2.270492550530667e-07, "loss": 1.9044, "step": 10810 }, { "epoch": 3.6812111933316323, "grad_norm": 2.1875, "learning_rate": 2.2230092163766907e-07, "loss": 1.9365, "step": 10820 }, { "epoch": 3.684613421791273, "grad_norm": 2.15625, "learning_rate": 2.1760193108504913e-07, "loss": 1.894, "step": 10830 }, { "epoch": 3.6880156502509145, "grad_norm": 2.265625, "learning_rate": 2.1295231906214332e-07, "loss": 1.9366, "step": 10840 }, { "epoch": 3.6914178787105554, "grad_norm": 1.921875, "learning_rate": 2.0835212086108594e-07, "loss": 1.9098, "step": 10850 }, { "epoch": 3.6948201071701963, "grad_norm": 2.390625, "learning_rate": 2.038013713989457e-07, "loss": 1.9487, "step": 10860 }, { "epoch": 3.6982223356298376, "grad_norm": 2.328125, "learning_rate": 1.9930010521745713e-07, "loss": 1.8716, "step": 10870 }, { "epoch": 3.7016245640894785, "grad_norm": 2.21875, "learning_rate": 1.9484835648276147e-07, "loss": 1.8958, "step": 10880 }, { "epoch": 3.70502679254912, "grad_norm": 2.390625, "learning_rate": 1.904461589851424e-07, "loss": 1.8943, "step": 10890 }, { "epoch": 3.7084290210087607, "grad_norm": 1.9296875, "learning_rate": 1.8609354613877697e-07, "loss": 1.8747, "step": 10900 }, { "epoch": 3.7118312494684016, "grad_norm": 2.296875, "learning_rate": 1.817905509814755e-07, "loss": 1.9229, "step": 10910 }, { "epoch": 3.715233477928043, "grad_norm": 2.25, "learning_rate": 1.7753720617443335e-07, "loss": 1.9303, "step": 10920 }, { "epoch": 3.718635706387684, "grad_norm": 2.328125, "learning_rate": 1.7333354400198364e-07, "loss": 1.9388, "step": 10930 }, { "epoch": 3.722037934847325, "grad_norm": 2.015625, "learning_rate": 1.691795963713496e-07, "loss": 1.892, "step": 10940 }, { "epoch": 3.725440163306966, "grad_norm": 2.3125, "learning_rate": 1.6507539481240707e-07, "loss": 1.9215, "step": 10950 }, { "epoch": 3.728842391766607, "grad_norm": 2.28125, "learning_rate": 1.6102097047744054e-07, "loss": 1.9803, "step": 10960 }, { "epoch": 3.7322446202262483, "grad_norm": 2.046875, "learning_rate": 1.5701635414090798e-07, "loss": 1.9324, "step": 10970 }, { "epoch": 3.735646848685889, "grad_norm": 2.515625, "learning_rate": 1.530615761992094e-07, "loss": 1.8066, "step": 10980 }, { "epoch": 3.7390490771455305, "grad_norm": 2.171875, "learning_rate": 1.4915666667045188e-07, "loss": 1.8818, "step": 10990 }, { "epoch": 3.7424513056051714, "grad_norm": 2.390625, "learning_rate": 1.4530165519422625e-07, "loss": 1.9121, "step": 11000 }, { "epoch": 3.7458535340648123, "grad_norm": 2.359375, "learning_rate": 1.4149657103138097e-07, "loss": 1.9224, "step": 11010 }, { "epoch": 3.7492557625244536, "grad_norm": 2.5, "learning_rate": 1.377414430637975e-07, "loss": 1.9537, "step": 11020 }, { "epoch": 3.7526579909840945, "grad_norm": 2.5, "learning_rate": 1.3403629979417308e-07, "loss": 1.9439, "step": 11030 }, { "epoch": 3.756060219443736, "grad_norm": 2.375, "learning_rate": 1.303811693458042e-07, "loss": 1.9555, "step": 11040 }, { "epoch": 3.7594624479033767, "grad_norm": 2.171875, "learning_rate": 1.2677607946237328e-07, "loss": 1.9296, "step": 11050 }, { "epoch": 3.7628646763630176, "grad_norm": 2.46875, "learning_rate": 1.2322105750773803e-07, "loss": 1.9048, "step": 11060 }, { "epoch": 3.766266904822659, "grad_norm": 2.609375, "learning_rate": 1.1971613046572323e-07, "loss": 1.9255, "step": 11070 }, { "epoch": 3.7696691332823, "grad_norm": 2.34375, "learning_rate": 1.1626132493991633e-07, "loss": 1.9011, "step": 11080 }, { "epoch": 3.773071361741941, "grad_norm": 2.28125, "learning_rate": 1.1285666715346502e-07, "loss": 1.8918, "step": 11090 }, { "epoch": 3.776473590201582, "grad_norm": 2.484375, "learning_rate": 1.0950218294888028e-07, "loss": 1.84, "step": 11100 }, { "epoch": 3.779875818661223, "grad_norm": 2.65625, "learning_rate": 1.0619789778783557e-07, "loss": 1.979, "step": 11110 }, { "epoch": 3.7832780471208642, "grad_norm": 2.4375, "learning_rate": 1.0294383675097872e-07, "loss": 1.9141, "step": 11120 }, { "epoch": 3.786680275580505, "grad_norm": 2.09375, "learning_rate": 9.974002453774011e-08, "loss": 1.98, "step": 11130 }, { "epoch": 3.7900825040401465, "grad_norm": 2.484375, "learning_rate": 9.658648546614084e-08, "loss": 1.9723, "step": 11140 }, { "epoch": 3.7934847324997873, "grad_norm": 2.421875, "learning_rate": 9.348324347261734e-08, "loss": 1.8887, "step": 11150 }, { "epoch": 3.7968869609594282, "grad_norm": 2.546875, "learning_rate": 9.04303221118288e-08, "loss": 1.8763, "step": 11160 }, { "epoch": 3.8002891894190696, "grad_norm": 2.46875, "learning_rate": 8.742774455648695e-08, "loss": 1.9326, "step": 11170 }, { "epoch": 3.8036914178787105, "grad_norm": 1.9765625, "learning_rate": 8.447553359717545e-08, "loss": 1.8815, "step": 11180 }, { "epoch": 3.807093646338352, "grad_norm": 2.296875, "learning_rate": 8.157371164217902e-08, "loss": 1.971, "step": 11190 }, { "epoch": 3.8104958747979927, "grad_norm": 2.375, "learning_rate": 7.872230071731239e-08, "loss": 1.9483, "step": 11200 }, { "epoch": 3.8138981032576336, "grad_norm": 2.609375, "learning_rate": 7.592132246575323e-08, "loss": 1.9457, "step": 11210 }, { "epoch": 3.817300331717275, "grad_norm": 2.28125, "learning_rate": 7.317079814787934e-08, "loss": 1.9193, "step": 11220 }, { "epoch": 3.820702560176916, "grad_norm": 2.203125, "learning_rate": 7.047074864110375e-08, "loss": 1.9131, "step": 11230 }, { "epoch": 3.824104788636557, "grad_norm": 2.21875, "learning_rate": 6.782119443972094e-08, "loss": 1.9334, "step": 11240 }, { "epoch": 3.827507017096198, "grad_norm": 2.625, "learning_rate": 6.522215565474712e-08, "loss": 1.958, "step": 11250 }, { "epoch": 3.830909245555839, "grad_norm": 2.421875, "learning_rate": 6.267365201377092e-08, "loss": 1.9266, "step": 11260 }, { "epoch": 3.8343114740154802, "grad_norm": 2.53125, "learning_rate": 6.017570286079965e-08, "loss": 1.9022, "step": 11270 }, { "epoch": 3.837713702475121, "grad_norm": 2.34375, "learning_rate": 5.77283271561175e-08, "loss": 1.8612, "step": 11280 }, { "epoch": 3.8411159309347624, "grad_norm": 2.453125, "learning_rate": 5.5331543476137706e-08, "loss": 1.9326, "step": 11290 }, { "epoch": 3.8445181593944033, "grad_norm": 2.296875, "learning_rate": 5.298537001326303e-08, "loss": 1.8951, "step": 11300 }, { "epoch": 3.847920387854044, "grad_norm": 2.40625, "learning_rate": 5.068982457574685e-08, "loss": 1.9788, "step": 11310 }, { "epoch": 3.8513226163136856, "grad_norm": 2.609375, "learning_rate": 4.8444924587559654e-08, "loss": 1.9643, "step": 11320 }, { "epoch": 3.8547248447733264, "grad_norm": 2.5625, "learning_rate": 4.625068708825534e-08, "loss": 1.9245, "step": 11330 }, { "epoch": 3.8581270732329678, "grad_norm": 2.34375, "learning_rate": 4.4107128732841385e-08, "loss": 1.8401, "step": 11340 }, { "epoch": 3.8615293016926087, "grad_norm": 2.09375, "learning_rate": 4.20142657916557e-08, "loss": 1.9087, "step": 11350 }, { "epoch": 3.8649315301522496, "grad_norm": 2.140625, "learning_rate": 3.99721141502382e-08, "loss": 1.9401, "step": 11360 }, { "epoch": 3.868333758611891, "grad_norm": 2.328125, "learning_rate": 3.798068930921441e-08, "loss": 1.9699, "step": 11370 }, { "epoch": 3.8717359870715318, "grad_norm": 2.0625, "learning_rate": 3.6040006384174545e-08, "loss": 1.954, "step": 11380 }, { "epoch": 3.875138215531173, "grad_norm": 2.40625, "learning_rate": 3.4150080105563755e-08, "loss": 1.8693, "step": 11390 }, { "epoch": 3.878540443990814, "grad_norm": 2.078125, "learning_rate": 3.231092481856271e-08, "loss": 1.9307, "step": 11400 }, { "epoch": 3.881942672450455, "grad_norm": 2.328125, "learning_rate": 3.052255448298612e-08, "loss": 1.956, "step": 11410 }, { "epoch": 3.885344900910096, "grad_norm": 2.234375, "learning_rate": 2.878498267317298e-08, "loss": 1.9185, "step": 11420 }, { "epoch": 3.888747129369737, "grad_norm": 2.5, "learning_rate": 2.7098222577882825e-08, "loss": 1.8685, "step": 11430 }, { "epoch": 3.8921493578293784, "grad_norm": 2.328125, "learning_rate": 2.5462287000197963e-08, "loss": 1.9734, "step": 11440 }, { "epoch": 3.8955515862890193, "grad_norm": 2.09375, "learning_rate": 2.3877188357427174e-08, "loss": 1.8995, "step": 11450 }, { "epoch": 3.89895381474866, "grad_norm": 2.25, "learning_rate": 2.2342938681005695e-08, "loss": 1.8764, "step": 11460 }, { "epoch": 3.9023560432083015, "grad_norm": 2.265625, "learning_rate": 2.085954961641164e-08, "loss": 1.8865, "step": 11470 }, { "epoch": 3.9057582716679424, "grad_norm": 2.359375, "learning_rate": 1.9427032423071165e-08, "loss": 1.8932, "step": 11480 }, { "epoch": 3.9091605001275838, "grad_norm": 2.25, "learning_rate": 1.8045397974277166e-08, "loss": 1.9042, "step": 11490 }, { "epoch": 3.9125627285872246, "grad_norm": 2.1875, "learning_rate": 1.6714656757104883e-08, "loss": 1.94, "step": 11500 }, { "epoch": 3.9159649570468655, "grad_norm": 2.28125, "learning_rate": 1.5434818872331314e-08, "loss": 1.8879, "step": 11510 }, { "epoch": 3.919367185506507, "grad_norm": 2.046875, "learning_rate": 1.4205894034362065e-08, "loss": 1.9147, "step": 11520 }, { "epoch": 3.9227694139661478, "grad_norm": 2.484375, "learning_rate": 1.3027891571153722e-08, "loss": 1.8714, "step": 11530 }, { "epoch": 3.926171642425789, "grad_norm": 2.03125, "learning_rate": 1.1900820424145176e-08, "loss": 1.9371, "step": 11540 }, { "epoch": 3.92957387088543, "grad_norm": 2.0, "learning_rate": 1.0824689148190455e-08, "loss": 1.9505, "step": 11550 }, { "epoch": 3.932976099345071, "grad_norm": 2.453125, "learning_rate": 9.799505911490794e-09, "loss": 1.8738, "step": 11560 }, { "epoch": 3.936378327804712, "grad_norm": 2.328125, "learning_rate": 8.825278495535672e-09, "loss": 1.8447, "step": 11570 }, { "epoch": 3.939780556264353, "grad_norm": 2.28125, "learning_rate": 7.902014295042352e-09, "loss": 1.8987, "step": 11580 }, { "epoch": 3.9431827847239944, "grad_norm": 2.46875, "learning_rate": 7.029720317899902e-09, "loss": 1.9864, "step": 11590 }, { "epoch": 3.9465850131836353, "grad_norm": 2.796875, "learning_rate": 6.20840318511545e-09, "loss": 1.9454, "step": 11600 }, { "epoch": 3.949987241643276, "grad_norm": 2.59375, "learning_rate": 5.438069130766418e-09, "loss": 1.9871, "step": 11610 }, { "epoch": 3.9533894701029175, "grad_norm": 2.40625, "learning_rate": 4.718724001949017e-09, "loss": 1.8746, "step": 11620 }, { "epoch": 3.9567916985625584, "grad_norm": 2.46875, "learning_rate": 4.050373258737196e-09, "loss": 1.9578, "step": 11630 }, { "epoch": 3.9601939270221997, "grad_norm": 2.171875, "learning_rate": 3.4330219741408427e-09, "loss": 1.9242, "step": 11640 }, { "epoch": 3.9635961554818406, "grad_norm": 2.703125, "learning_rate": 2.8666748340662245e-09, "loss": 1.9133, "step": 11650 }, { "epoch": 3.9669983839414815, "grad_norm": 2.0625, "learning_rate": 2.351336137279413e-09, "loss": 1.9196, "step": 11660 }, { "epoch": 3.970400612401123, "grad_norm": 1.78125, "learning_rate": 1.887009795377922e-09, "loss": 1.9906, "step": 11670 }, { "epoch": 3.9738028408607637, "grad_norm": 2.296875, "learning_rate": 1.473699332754879e-09, "loss": 1.8989, "step": 11680 }, { "epoch": 3.977205069320405, "grad_norm": 2.609375, "learning_rate": 1.1114078865781264e-09, "loss": 1.8962, "step": 11690 }, { "epoch": 3.980607297780046, "grad_norm": 2.34375, "learning_rate": 8.001382067626036e-10, "loss": 1.944, "step": 11700 }, { "epoch": 3.984009526239687, "grad_norm": 2.265625, "learning_rate": 5.398926559516878e-10, "loss": 1.8959, "step": 11710 }, { "epoch": 3.987411754699328, "grad_norm": 2.328125, "learning_rate": 3.306732094962939e-10, "loss": 1.9388, "step": 11720 }, { "epoch": 3.990813983158969, "grad_norm": 2.359375, "learning_rate": 1.7248145544367861e-10, "loss": 1.9133, "step": 11730 }, { "epoch": 3.9942162116186104, "grad_norm": 1.96875, "learning_rate": 6.531859452325864e-11, "loss": 1.957, "step": 11740 }, { "epoch": 3.9976184400782513, "grad_norm": 2.3125, "learning_rate": 9.185440136907336e-12, "loss": 1.9494, "step": 11750 } ], "logging_steps": 10, "max_steps": 11756, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0768921731962634e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }