{ "best_metric": 1.6915712356567383, "best_model_checkpoint": "./health_analysis_results/checkpoint-8752", "epoch": 5.0, "eval_steps": 500, "global_step": 10940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004570383912248629, "grad_norm": 86932.7734375, "learning_rate": 1.0000000000000002e-06, "loss": 2.6599, "step": 10 }, { "epoch": 0.009140767824497258, "grad_norm": 41141.640625, "learning_rate": 2.0000000000000003e-06, "loss": 2.6196, "step": 20 }, { "epoch": 0.013711151736745886, "grad_norm": 64399.953125, "learning_rate": 3e-06, "loss": 2.6801, "step": 30 }, { "epoch": 0.018281535648994516, "grad_norm": 47369.9140625, "learning_rate": 4.000000000000001e-06, "loss": 2.5896, "step": 40 }, { "epoch": 0.022851919561243144, "grad_norm": 42862.3359375, "learning_rate": 5e-06, "loss": 2.6417, "step": 50 }, { "epoch": 0.027422303473491772, "grad_norm": 43046.7734375, "learning_rate": 6e-06, "loss": 2.5624, "step": 60 }, { "epoch": 0.031992687385740404, "grad_norm": 57193.46875, "learning_rate": 7.000000000000001e-06, "loss": 2.7588, "step": 70 }, { "epoch": 0.03656307129798903, "grad_norm": 47019.44921875, "learning_rate": 8.000000000000001e-06, "loss": 2.7033, "step": 80 }, { "epoch": 0.04113345521023766, "grad_norm": 96947.703125, "learning_rate": 9e-06, "loss": 2.9225, "step": 90 }, { "epoch": 0.04570383912248629, "grad_norm": 53870.5234375, "learning_rate": 1e-05, "loss": 2.65, "step": 100 }, { "epoch": 0.050274223034734916, "grad_norm": 57196.71875, "learning_rate": 1.1000000000000001e-05, "loss": 2.726, "step": 110 }, { "epoch": 0.054844606946983544, "grad_norm": 102628.6015625, "learning_rate": 1.2e-05, "loss": 2.5591, "step": 120 }, { "epoch": 0.05941499085923217, "grad_norm": 59579.734375, "learning_rate": 1.3000000000000001e-05, "loss": 2.7329, "step": 130 }, { "epoch": 0.06398537477148081, "grad_norm": 63045.7890625, "learning_rate": 1.4000000000000001e-05, "loss": 2.7527, "step": 140 }, { "epoch": 0.06855575868372943, "grad_norm": 55244.3828125, "learning_rate": 1.5e-05, "loss": 2.8104, "step": 150 }, { "epoch": 0.07312614259597806, "grad_norm": 101346.0078125, "learning_rate": 1.6000000000000003e-05, "loss": 2.6889, "step": 160 }, { "epoch": 0.07769652650822668, "grad_norm": 47467.60546875, "learning_rate": 1.7000000000000003e-05, "loss": 2.646, "step": 170 }, { "epoch": 0.08226691042047532, "grad_norm": 92061.4765625, "learning_rate": 1.8e-05, "loss": 2.6756, "step": 180 }, { "epoch": 0.08683729433272395, "grad_norm": 52667.0078125, "learning_rate": 1.9e-05, "loss": 2.5845, "step": 190 }, { "epoch": 0.09140767824497258, "grad_norm": 74090.5859375, "learning_rate": 2e-05, "loss": 2.6605, "step": 200 }, { "epoch": 0.09597806215722121, "grad_norm": 92731.8984375, "learning_rate": 2.1e-05, "loss": 2.7177, "step": 210 }, { "epoch": 0.10054844606946983, "grad_norm": 132096.203125, "learning_rate": 2.2000000000000003e-05, "loss": 2.6394, "step": 220 }, { "epoch": 0.10511882998171847, "grad_norm": 64807.15625, "learning_rate": 2.3000000000000003e-05, "loss": 2.5417, "step": 230 }, { "epoch": 0.10968921389396709, "grad_norm": 124301.5390625, "learning_rate": 2.4e-05, "loss": 2.5855, "step": 240 }, { "epoch": 0.11425959780621572, "grad_norm": 106672.2109375, "learning_rate": 2.5e-05, "loss": 2.5796, "step": 250 }, { "epoch": 0.11882998171846434, "grad_norm": 103185.78125, "learning_rate": 2.6000000000000002e-05, "loss": 2.558, "step": 260 }, { "epoch": 0.12340036563071298, "grad_norm": 539943.625, "learning_rate": 2.7000000000000002e-05, "loss": 2.533, "step": 270 }, { "epoch": 0.12797074954296161, "grad_norm": 136090.453125, "learning_rate": 2.8000000000000003e-05, "loss": 2.3531, "step": 280 }, { "epoch": 0.13254113345521024, "grad_norm": 190959.8125, "learning_rate": 2.9e-05, "loss": 2.4295, "step": 290 }, { "epoch": 0.13711151736745886, "grad_norm": 321799.75, "learning_rate": 3e-05, "loss": 2.3202, "step": 300 }, { "epoch": 0.1416819012797075, "grad_norm": 341939.46875, "learning_rate": 3.1e-05, "loss": 2.3956, "step": 310 }, { "epoch": 0.14625228519195613, "grad_norm": 118726.578125, "learning_rate": 3.2000000000000005e-05, "loss": 2.2453, "step": 320 }, { "epoch": 0.15082266910420475, "grad_norm": 199988.390625, "learning_rate": 3.3e-05, "loss": 2.2402, "step": 330 }, { "epoch": 0.15539305301645337, "grad_norm": 200156.25, "learning_rate": 3.4000000000000007e-05, "loss": 2.1299, "step": 340 }, { "epoch": 0.15996343692870202, "grad_norm": 235010.1875, "learning_rate": 3.5e-05, "loss": 2.0552, "step": 350 }, { "epoch": 0.16453382084095064, "grad_norm": 168717.734375, "learning_rate": 3.6e-05, "loss": 2.0332, "step": 360 }, { "epoch": 0.16910420475319926, "grad_norm": 164222.140625, "learning_rate": 3.7e-05, "loss": 2.1069, "step": 370 }, { "epoch": 0.1736745886654479, "grad_norm": 216883.390625, "learning_rate": 3.8e-05, "loss": 2.1525, "step": 380 }, { "epoch": 0.17824497257769653, "grad_norm": 238809.9375, "learning_rate": 3.9000000000000006e-05, "loss": 2.0176, "step": 390 }, { "epoch": 0.18281535648994515, "grad_norm": 148073.28125, "learning_rate": 4e-05, "loss": 1.9141, "step": 400 }, { "epoch": 0.18738574040219377, "grad_norm": 211357.671875, "learning_rate": 4.1e-05, "loss": 1.9354, "step": 410 }, { "epoch": 0.19195612431444242, "grad_norm": 289800.125, "learning_rate": 4.2e-05, "loss": 2.0263, "step": 420 }, { "epoch": 0.19652650822669104, "grad_norm": 354819.71875, "learning_rate": 4.3e-05, "loss": 1.9236, "step": 430 }, { "epoch": 0.20109689213893966, "grad_norm": 187694.3125, "learning_rate": 4.4000000000000006e-05, "loss": 1.8974, "step": 440 }, { "epoch": 0.2056672760511883, "grad_norm": 235554.203125, "learning_rate": 4.5e-05, "loss": 1.8884, "step": 450 }, { "epoch": 0.21023765996343693, "grad_norm": 194993.0625, "learning_rate": 4.600000000000001e-05, "loss": 1.8713, "step": 460 }, { "epoch": 0.21480804387568556, "grad_norm": 190949.453125, "learning_rate": 4.7e-05, "loss": 1.9149, "step": 470 }, { "epoch": 0.21937842778793418, "grad_norm": 209000.484375, "learning_rate": 4.8e-05, "loss": 1.9436, "step": 480 }, { "epoch": 0.22394881170018283, "grad_norm": 151603.03125, "learning_rate": 4.9e-05, "loss": 1.868, "step": 490 }, { "epoch": 0.22851919561243145, "grad_norm": 297051.9375, "learning_rate": 5e-05, "loss": 1.8631, "step": 500 }, { "epoch": 0.23308957952468007, "grad_norm": 193912.53125, "learning_rate": 4.995210727969349e-05, "loss": 1.866, "step": 510 }, { "epoch": 0.2376599634369287, "grad_norm": 185368.859375, "learning_rate": 4.9904214559386976e-05, "loss": 1.8017, "step": 520 }, { "epoch": 0.24223034734917734, "grad_norm": 482963.65625, "learning_rate": 4.985632183908046e-05, "loss": 1.958, "step": 530 }, { "epoch": 0.24680073126142596, "grad_norm": 216741.5625, "learning_rate": 4.980842911877395e-05, "loss": 1.8688, "step": 540 }, { "epoch": 0.2513711151736746, "grad_norm": 248310.640625, "learning_rate": 4.976053639846743e-05, "loss": 1.8206, "step": 550 }, { "epoch": 0.25594149908592323, "grad_norm": 174778.59375, "learning_rate": 4.971264367816092e-05, "loss": 1.7981, "step": 560 }, { "epoch": 0.26051188299817185, "grad_norm": 168291.53125, "learning_rate": 4.966475095785441e-05, "loss": 1.8444, "step": 570 }, { "epoch": 0.26508226691042047, "grad_norm": 119908.9609375, "learning_rate": 4.96168582375479e-05, "loss": 1.8102, "step": 580 }, { "epoch": 0.2696526508226691, "grad_norm": 130359.1640625, "learning_rate": 4.9568965517241384e-05, "loss": 1.819, "step": 590 }, { "epoch": 0.2742230347349177, "grad_norm": 122891.2734375, "learning_rate": 4.952107279693487e-05, "loss": 1.8189, "step": 600 }, { "epoch": 0.27879341864716634, "grad_norm": 133346.09375, "learning_rate": 4.947318007662836e-05, "loss": 1.8422, "step": 610 }, { "epoch": 0.283363802559415, "grad_norm": 179827.6875, "learning_rate": 4.9425287356321845e-05, "loss": 1.8674, "step": 620 }, { "epoch": 0.28793418647166363, "grad_norm": 158770.609375, "learning_rate": 4.9377394636015325e-05, "loss": 1.8341, "step": 630 }, { "epoch": 0.29250457038391225, "grad_norm": 199898.640625, "learning_rate": 4.932950191570881e-05, "loss": 1.8177, "step": 640 }, { "epoch": 0.2970749542961609, "grad_norm": 136966.828125, "learning_rate": 4.92816091954023e-05, "loss": 1.6618, "step": 650 }, { "epoch": 0.3016453382084095, "grad_norm": 131470.1875, "learning_rate": 4.9233716475095786e-05, "loss": 1.9034, "step": 660 }, { "epoch": 0.3062157221206581, "grad_norm": 137772.28125, "learning_rate": 4.918582375478927e-05, "loss": 1.7766, "step": 670 }, { "epoch": 0.31078610603290674, "grad_norm": 225044.28125, "learning_rate": 4.913793103448276e-05, "loss": 1.8404, "step": 680 }, { "epoch": 0.3153564899451554, "grad_norm": 117430.6015625, "learning_rate": 4.9090038314176246e-05, "loss": 1.7514, "step": 690 }, { "epoch": 0.31992687385740404, "grad_norm": 143649.03125, "learning_rate": 4.904214559386973e-05, "loss": 1.7891, "step": 700 }, { "epoch": 0.32449725776965266, "grad_norm": 132381.140625, "learning_rate": 4.899425287356322e-05, "loss": 1.728, "step": 710 }, { "epoch": 0.3290676416819013, "grad_norm": 180736.796875, "learning_rate": 4.894636015325671e-05, "loss": 1.7465, "step": 720 }, { "epoch": 0.3336380255941499, "grad_norm": 102364.2734375, "learning_rate": 4.8898467432950194e-05, "loss": 1.7522, "step": 730 }, { "epoch": 0.3382084095063985, "grad_norm": 124118.0234375, "learning_rate": 4.885057471264368e-05, "loss": 1.7294, "step": 740 }, { "epoch": 0.34277879341864714, "grad_norm": 102199.5234375, "learning_rate": 4.880268199233717e-05, "loss": 1.866, "step": 750 }, { "epoch": 0.3473491773308958, "grad_norm": 128679.890625, "learning_rate": 4.8754789272030654e-05, "loss": 1.8846, "step": 760 }, { "epoch": 0.35191956124314444, "grad_norm": 100680.3671875, "learning_rate": 4.870689655172414e-05, "loss": 1.7317, "step": 770 }, { "epoch": 0.35648994515539306, "grad_norm": 125967.859375, "learning_rate": 4.865900383141763e-05, "loss": 1.8308, "step": 780 }, { "epoch": 0.3610603290676417, "grad_norm": 124871.5859375, "learning_rate": 4.8611111111111115e-05, "loss": 1.7843, "step": 790 }, { "epoch": 0.3656307129798903, "grad_norm": 79497.8515625, "learning_rate": 4.85632183908046e-05, "loss": 1.7379, "step": 800 }, { "epoch": 0.3702010968921389, "grad_norm": 130445.0546875, "learning_rate": 4.851532567049808e-05, "loss": 1.8268, "step": 810 }, { "epoch": 0.37477148080438755, "grad_norm": 104012.1484375, "learning_rate": 4.846743295019157e-05, "loss": 1.7416, "step": 820 }, { "epoch": 0.3793418647166362, "grad_norm": 146501.59375, "learning_rate": 4.8419540229885056e-05, "loss": 1.7859, "step": 830 }, { "epoch": 0.38391224862888484, "grad_norm": 142532.4375, "learning_rate": 4.837164750957854e-05, "loss": 1.7631, "step": 840 }, { "epoch": 0.38848263254113347, "grad_norm": 130007.6640625, "learning_rate": 4.8323754789272036e-05, "loss": 1.7579, "step": 850 }, { "epoch": 0.3930530164533821, "grad_norm": 130011.5078125, "learning_rate": 4.827586206896552e-05, "loss": 1.7896, "step": 860 }, { "epoch": 0.3976234003656307, "grad_norm": 111523.03125, "learning_rate": 4.822796934865901e-05, "loss": 1.7826, "step": 870 }, { "epoch": 0.40219378427787933, "grad_norm": 155793.453125, "learning_rate": 4.81800766283525e-05, "loss": 1.8351, "step": 880 }, { "epoch": 0.40676416819012795, "grad_norm": 129142.6875, "learning_rate": 4.813218390804598e-05, "loss": 1.8079, "step": 890 }, { "epoch": 0.4113345521023766, "grad_norm": 132226.46875, "learning_rate": 4.8084291187739464e-05, "loss": 1.7534, "step": 900 }, { "epoch": 0.41590493601462525, "grad_norm": 126845.109375, "learning_rate": 4.803639846743295e-05, "loss": 1.9015, "step": 910 }, { "epoch": 0.42047531992687387, "grad_norm": 136638.390625, "learning_rate": 4.798850574712644e-05, "loss": 1.7273, "step": 920 }, { "epoch": 0.4250457038391225, "grad_norm": 142308.25, "learning_rate": 4.7940613026819925e-05, "loss": 1.7398, "step": 930 }, { "epoch": 0.4296160877513711, "grad_norm": 115603.34375, "learning_rate": 4.789272030651341e-05, "loss": 1.8353, "step": 940 }, { "epoch": 0.43418647166361973, "grad_norm": 131319.734375, "learning_rate": 4.78448275862069e-05, "loss": 1.6825, "step": 950 }, { "epoch": 0.43875685557586835, "grad_norm": 112166.5703125, "learning_rate": 4.7796934865900385e-05, "loss": 1.7169, "step": 960 }, { "epoch": 0.443327239488117, "grad_norm": 131488.796875, "learning_rate": 4.774904214559387e-05, "loss": 1.7215, "step": 970 }, { "epoch": 0.44789762340036565, "grad_norm": 149545.140625, "learning_rate": 4.770114942528736e-05, "loss": 1.6931, "step": 980 }, { "epoch": 0.4524680073126143, "grad_norm": 163557.375, "learning_rate": 4.7653256704980846e-05, "loss": 1.7066, "step": 990 }, { "epoch": 0.4570383912248629, "grad_norm": 145251.359375, "learning_rate": 4.760536398467433e-05, "loss": 1.797, "step": 1000 }, { "epoch": 0.4616087751371115, "grad_norm": 100632.7734375, "learning_rate": 4.755747126436782e-05, "loss": 1.6975, "step": 1010 }, { "epoch": 0.46617915904936014, "grad_norm": 153170.90625, "learning_rate": 4.7509578544061307e-05, "loss": 1.6741, "step": 1020 }, { "epoch": 0.47074954296160876, "grad_norm": 78909.109375, "learning_rate": 4.7461685823754793e-05, "loss": 1.6999, "step": 1030 }, { "epoch": 0.4753199268738574, "grad_norm": 110082.7421875, "learning_rate": 4.741379310344828e-05, "loss": 1.749, "step": 1040 }, { "epoch": 0.47989031078610606, "grad_norm": 100923.5390625, "learning_rate": 4.736590038314177e-05, "loss": 1.7683, "step": 1050 }, { "epoch": 0.4844606946983547, "grad_norm": 98683.3203125, "learning_rate": 4.7318007662835254e-05, "loss": 1.7535, "step": 1060 }, { "epoch": 0.4890310786106033, "grad_norm": 116789.328125, "learning_rate": 4.7270114942528734e-05, "loss": 1.8248, "step": 1070 }, { "epoch": 0.4936014625228519, "grad_norm": 114926.984375, "learning_rate": 4.722222222222222e-05, "loss": 1.7018, "step": 1080 }, { "epoch": 0.49817184643510054, "grad_norm": 91417.0859375, "learning_rate": 4.717432950191571e-05, "loss": 1.8116, "step": 1090 }, { "epoch": 0.5027422303473492, "grad_norm": 128865.3125, "learning_rate": 4.7126436781609195e-05, "loss": 1.6932, "step": 1100 }, { "epoch": 0.5073126142595978, "grad_norm": 122375.984375, "learning_rate": 4.707854406130268e-05, "loss": 1.7383, "step": 1110 }, { "epoch": 0.5118829981718465, "grad_norm": 96160.1328125, "learning_rate": 4.7030651340996175e-05, "loss": 1.6952, "step": 1120 }, { "epoch": 0.5164533820840951, "grad_norm": 124809.203125, "learning_rate": 4.698275862068966e-05, "loss": 1.7973, "step": 1130 }, { "epoch": 0.5210237659963437, "grad_norm": 112799.5625, "learning_rate": 4.693486590038315e-05, "loss": 1.7527, "step": 1140 }, { "epoch": 0.5255941499085923, "grad_norm": 82923.640625, "learning_rate": 4.688697318007663e-05, "loss": 1.7041, "step": 1150 }, { "epoch": 0.5301645338208409, "grad_norm": 130412.3671875, "learning_rate": 4.6839080459770116e-05, "loss": 1.7021, "step": 1160 }, { "epoch": 0.5347349177330896, "grad_norm": 104821.015625, "learning_rate": 4.67911877394636e-05, "loss": 1.7333, "step": 1170 }, { "epoch": 0.5393053016453382, "grad_norm": 100490.953125, "learning_rate": 4.674329501915709e-05, "loss": 1.6688, "step": 1180 }, { "epoch": 0.5438756855575868, "grad_norm": 86425.4453125, "learning_rate": 4.669540229885058e-05, "loss": 1.7381, "step": 1190 }, { "epoch": 0.5484460694698354, "grad_norm": 83740.2734375, "learning_rate": 4.6647509578544064e-05, "loss": 1.8439, "step": 1200 }, { "epoch": 0.553016453382084, "grad_norm": 140177.421875, "learning_rate": 4.659961685823755e-05, "loss": 1.7227, "step": 1210 }, { "epoch": 0.5575868372943327, "grad_norm": 144323.71875, "learning_rate": 4.655172413793104e-05, "loss": 1.7395, "step": 1220 }, { "epoch": 0.5621572212065814, "grad_norm": 97354.59375, "learning_rate": 4.6503831417624524e-05, "loss": 1.7719, "step": 1230 }, { "epoch": 0.56672760511883, "grad_norm": 72904.7578125, "learning_rate": 4.6455938697318004e-05, "loss": 1.6712, "step": 1240 }, { "epoch": 0.5712979890310786, "grad_norm": 162248.328125, "learning_rate": 4.640804597701149e-05, "loss": 1.7491, "step": 1250 }, { "epoch": 0.5758683729433273, "grad_norm": 105222.6328125, "learning_rate": 4.6360153256704985e-05, "loss": 1.7503, "step": 1260 }, { "epoch": 0.5804387568555759, "grad_norm": 113333.96875, "learning_rate": 4.631226053639847e-05, "loss": 1.751, "step": 1270 }, { "epoch": 0.5850091407678245, "grad_norm": 126183.7734375, "learning_rate": 4.626436781609196e-05, "loss": 1.8747, "step": 1280 }, { "epoch": 0.5895795246800731, "grad_norm": 118274.03125, "learning_rate": 4.6216475095785446e-05, "loss": 1.6831, "step": 1290 }, { "epoch": 0.5941499085923218, "grad_norm": 108177.03125, "learning_rate": 4.616858237547893e-05, "loss": 1.6359, "step": 1300 }, { "epoch": 0.5987202925045704, "grad_norm": 81988.9140625, "learning_rate": 4.612068965517242e-05, "loss": 1.7058, "step": 1310 }, { "epoch": 0.603290676416819, "grad_norm": 79780.96875, "learning_rate": 4.60727969348659e-05, "loss": 1.7388, "step": 1320 }, { "epoch": 0.6078610603290676, "grad_norm": 166808.515625, "learning_rate": 4.6024904214559386e-05, "loss": 1.7623, "step": 1330 }, { "epoch": 0.6124314442413162, "grad_norm": 111601.921875, "learning_rate": 4.597701149425287e-05, "loss": 1.761, "step": 1340 }, { "epoch": 0.6170018281535649, "grad_norm": 106101.40625, "learning_rate": 4.592911877394636e-05, "loss": 1.685, "step": 1350 }, { "epoch": 0.6215722120658135, "grad_norm": 125174.578125, "learning_rate": 4.588122605363985e-05, "loss": 1.7784, "step": 1360 }, { "epoch": 0.6261425959780622, "grad_norm": 107639.9375, "learning_rate": 4.5833333333333334e-05, "loss": 1.7636, "step": 1370 }, { "epoch": 0.6307129798903108, "grad_norm": 144034.265625, "learning_rate": 4.578544061302682e-05, "loss": 1.7237, "step": 1380 }, { "epoch": 0.6352833638025595, "grad_norm": 135349.234375, "learning_rate": 4.573754789272031e-05, "loss": 1.6869, "step": 1390 }, { "epoch": 0.6398537477148081, "grad_norm": 92048.3359375, "learning_rate": 4.5689655172413794e-05, "loss": 1.6916, "step": 1400 }, { "epoch": 0.6444241316270567, "grad_norm": 105181.109375, "learning_rate": 4.564176245210728e-05, "loss": 1.7276, "step": 1410 }, { "epoch": 0.6489945155393053, "grad_norm": 111967.078125, "learning_rate": 4.559386973180077e-05, "loss": 1.7594, "step": 1420 }, { "epoch": 0.6535648994515539, "grad_norm": 95790.1171875, "learning_rate": 4.5545977011494255e-05, "loss": 1.6922, "step": 1430 }, { "epoch": 0.6581352833638026, "grad_norm": 133646.671875, "learning_rate": 4.549808429118774e-05, "loss": 1.7427, "step": 1440 }, { "epoch": 0.6627056672760512, "grad_norm": 85705.421875, "learning_rate": 4.545019157088123e-05, "loss": 1.6791, "step": 1450 }, { "epoch": 0.6672760511882998, "grad_norm": 131938.828125, "learning_rate": 4.5402298850574716e-05, "loss": 1.7781, "step": 1460 }, { "epoch": 0.6718464351005484, "grad_norm": 138080.515625, "learning_rate": 4.53544061302682e-05, "loss": 1.7237, "step": 1470 }, { "epoch": 0.676416819012797, "grad_norm": 81319.125, "learning_rate": 4.530651340996169e-05, "loss": 1.7527, "step": 1480 }, { "epoch": 0.6809872029250457, "grad_norm": 114187.8203125, "learning_rate": 4.5258620689655176e-05, "loss": 1.7414, "step": 1490 }, { "epoch": 0.6855575868372943, "grad_norm": 88061.1640625, "learning_rate": 4.5210727969348656e-05, "loss": 1.7703, "step": 1500 }, { "epoch": 0.6901279707495429, "grad_norm": 70474.1015625, "learning_rate": 4.516283524904214e-05, "loss": 1.6569, "step": 1510 }, { "epoch": 0.6946983546617916, "grad_norm": 93419.6171875, "learning_rate": 4.511494252873563e-05, "loss": 1.7559, "step": 1520 }, { "epoch": 0.6992687385740403, "grad_norm": 109502.859375, "learning_rate": 4.506704980842912e-05, "loss": 1.6971, "step": 1530 }, { "epoch": 0.7038391224862889, "grad_norm": 172617.59375, "learning_rate": 4.501915708812261e-05, "loss": 1.8061, "step": 1540 }, { "epoch": 0.7084095063985375, "grad_norm": 103427.5078125, "learning_rate": 4.49712643678161e-05, "loss": 1.7566, "step": 1550 }, { "epoch": 0.7129798903107861, "grad_norm": 77810.8125, "learning_rate": 4.4923371647509585e-05, "loss": 1.6869, "step": 1560 }, { "epoch": 0.7175502742230347, "grad_norm": 112475.9453125, "learning_rate": 4.487547892720307e-05, "loss": 1.6338, "step": 1570 }, { "epoch": 0.7221206581352834, "grad_norm": 99816.34375, "learning_rate": 4.482758620689655e-05, "loss": 1.7478, "step": 1580 }, { "epoch": 0.726691042047532, "grad_norm": 73693.90625, "learning_rate": 4.477969348659004e-05, "loss": 1.7733, "step": 1590 }, { "epoch": 0.7312614259597806, "grad_norm": 125517.2734375, "learning_rate": 4.4731800766283525e-05, "loss": 1.7643, "step": 1600 }, { "epoch": 0.7358318098720292, "grad_norm": 125624.765625, "learning_rate": 4.468390804597701e-05, "loss": 1.789, "step": 1610 }, { "epoch": 0.7404021937842779, "grad_norm": 96340.703125, "learning_rate": 4.46360153256705e-05, "loss": 1.8048, "step": 1620 }, { "epoch": 0.7449725776965265, "grad_norm": 77806.5, "learning_rate": 4.4588122605363986e-05, "loss": 1.6585, "step": 1630 }, { "epoch": 0.7495429616087751, "grad_norm": 115383.6171875, "learning_rate": 4.454022988505747e-05, "loss": 1.7022, "step": 1640 }, { "epoch": 0.7541133455210237, "grad_norm": 128746.96875, "learning_rate": 4.449233716475096e-05, "loss": 1.7007, "step": 1650 }, { "epoch": 0.7586837294332724, "grad_norm": 92168.8515625, "learning_rate": 4.4444444444444447e-05, "loss": 1.78, "step": 1660 }, { "epoch": 0.7632541133455211, "grad_norm": 79167.046875, "learning_rate": 4.4396551724137933e-05, "loss": 1.8273, "step": 1670 }, { "epoch": 0.7678244972577697, "grad_norm": 113320.921875, "learning_rate": 4.434865900383142e-05, "loss": 1.6285, "step": 1680 }, { "epoch": 0.7723948811700183, "grad_norm": 80329.4140625, "learning_rate": 4.430076628352491e-05, "loss": 1.6705, "step": 1690 }, { "epoch": 0.7769652650822669, "grad_norm": 112864.1875, "learning_rate": 4.4252873563218394e-05, "loss": 1.6774, "step": 1700 }, { "epoch": 0.7815356489945156, "grad_norm": 83703.5703125, "learning_rate": 4.420498084291188e-05, "loss": 1.7612, "step": 1710 }, { "epoch": 0.7861060329067642, "grad_norm": 89239.75, "learning_rate": 4.415708812260537e-05, "loss": 1.6861, "step": 1720 }, { "epoch": 0.7906764168190128, "grad_norm": 89113.1484375, "learning_rate": 4.4109195402298855e-05, "loss": 1.751, "step": 1730 }, { "epoch": 0.7952468007312614, "grad_norm": 74121.84375, "learning_rate": 4.406130268199234e-05, "loss": 1.7383, "step": 1740 }, { "epoch": 0.79981718464351, "grad_norm": 98291.3125, "learning_rate": 4.401340996168583e-05, "loss": 1.6507, "step": 1750 }, { "epoch": 0.8043875685557587, "grad_norm": 101062.6484375, "learning_rate": 4.396551724137931e-05, "loss": 1.6809, "step": 1760 }, { "epoch": 0.8089579524680073, "grad_norm": 82451.6015625, "learning_rate": 4.3917624521072795e-05, "loss": 1.6369, "step": 1770 }, { "epoch": 0.8135283363802559, "grad_norm": 93937.7109375, "learning_rate": 4.386973180076628e-05, "loss": 1.8125, "step": 1780 }, { "epoch": 0.8180987202925045, "grad_norm": 86336.0546875, "learning_rate": 4.382183908045977e-05, "loss": 1.7483, "step": 1790 }, { "epoch": 0.8226691042047533, "grad_norm": 94180.6953125, "learning_rate": 4.3773946360153256e-05, "loss": 1.7549, "step": 1800 }, { "epoch": 0.8272394881170019, "grad_norm": 89638.5703125, "learning_rate": 4.372605363984675e-05, "loss": 1.6895, "step": 1810 }, { "epoch": 0.8318098720292505, "grad_norm": 129005.0, "learning_rate": 4.367816091954024e-05, "loss": 1.7419, "step": 1820 }, { "epoch": 0.8363802559414991, "grad_norm": 115378.0703125, "learning_rate": 4.3630268199233724e-05, "loss": 1.6716, "step": 1830 }, { "epoch": 0.8409506398537477, "grad_norm": 84872.3046875, "learning_rate": 4.3582375478927204e-05, "loss": 1.6927, "step": 1840 }, { "epoch": 0.8455210237659964, "grad_norm": 82049.890625, "learning_rate": 4.353448275862069e-05, "loss": 1.7318, "step": 1850 }, { "epoch": 0.850091407678245, "grad_norm": 81004.65625, "learning_rate": 4.348659003831418e-05, "loss": 1.6633, "step": 1860 }, { "epoch": 0.8546617915904936, "grad_norm": 186330.078125, "learning_rate": 4.3438697318007664e-05, "loss": 1.8362, "step": 1870 }, { "epoch": 0.8592321755027422, "grad_norm": 95703.015625, "learning_rate": 4.339080459770115e-05, "loss": 1.8003, "step": 1880 }, { "epoch": 0.8638025594149908, "grad_norm": 75972.78125, "learning_rate": 4.334291187739464e-05, "loss": 1.7282, "step": 1890 }, { "epoch": 0.8683729433272395, "grad_norm": 105656.9765625, "learning_rate": 4.3295019157088125e-05, "loss": 1.7041, "step": 1900 }, { "epoch": 0.8729433272394881, "grad_norm": 98981.046875, "learning_rate": 4.324712643678161e-05, "loss": 1.6535, "step": 1910 }, { "epoch": 0.8775137111517367, "grad_norm": 131719.453125, "learning_rate": 4.31992337164751e-05, "loss": 1.7521, "step": 1920 }, { "epoch": 0.8820840950639853, "grad_norm": 62522.44140625, "learning_rate": 4.3151340996168586e-05, "loss": 1.6794, "step": 1930 }, { "epoch": 0.886654478976234, "grad_norm": 104086.2265625, "learning_rate": 4.3103448275862066e-05, "loss": 1.7029, "step": 1940 }, { "epoch": 0.8912248628884827, "grad_norm": 120783.3359375, "learning_rate": 4.305555555555556e-05, "loss": 1.6313, "step": 1950 }, { "epoch": 0.8957952468007313, "grad_norm": 99394.8203125, "learning_rate": 4.3007662835249046e-05, "loss": 1.7735, "step": 1960 }, { "epoch": 0.9003656307129799, "grad_norm": 152363.28125, "learning_rate": 4.295977011494253e-05, "loss": 1.6732, "step": 1970 }, { "epoch": 0.9049360146252285, "grad_norm": 156369.4375, "learning_rate": 4.291187739463602e-05, "loss": 1.765, "step": 1980 }, { "epoch": 0.9095063985374772, "grad_norm": 63156.875, "learning_rate": 4.286398467432951e-05, "loss": 1.6294, "step": 1990 }, { "epoch": 0.9140767824497258, "grad_norm": 106310.6015625, "learning_rate": 4.2816091954022994e-05, "loss": 1.7492, "step": 2000 }, { "epoch": 0.9186471663619744, "grad_norm": 95234.3515625, "learning_rate": 4.2768199233716474e-05, "loss": 1.6726, "step": 2010 }, { "epoch": 0.923217550274223, "grad_norm": 105046.6015625, "learning_rate": 4.272030651340996e-05, "loss": 1.6848, "step": 2020 }, { "epoch": 0.9277879341864717, "grad_norm": 91991.890625, "learning_rate": 4.267241379310345e-05, "loss": 1.7447, "step": 2030 }, { "epoch": 0.9323583180987203, "grad_norm": 95458.09375, "learning_rate": 4.2624521072796934e-05, "loss": 1.7068, "step": 2040 }, { "epoch": 0.9369287020109689, "grad_norm": 108366.8828125, "learning_rate": 4.257662835249042e-05, "loss": 1.6272, "step": 2050 }, { "epoch": 0.9414990859232175, "grad_norm": 79585.796875, "learning_rate": 4.252873563218391e-05, "loss": 1.7444, "step": 2060 }, { "epoch": 0.9460694698354661, "grad_norm": 108535.515625, "learning_rate": 4.2480842911877395e-05, "loss": 1.6754, "step": 2070 }, { "epoch": 0.9506398537477148, "grad_norm": 101708.1640625, "learning_rate": 4.243295019157089e-05, "loss": 1.8225, "step": 2080 }, { "epoch": 0.9552102376599635, "grad_norm": 149960.90625, "learning_rate": 4.238505747126437e-05, "loss": 1.7228, "step": 2090 }, { "epoch": 0.9597806215722121, "grad_norm": 116507.1328125, "learning_rate": 4.2337164750957856e-05, "loss": 1.7649, "step": 2100 }, { "epoch": 0.9643510054844607, "grad_norm": 95376.0859375, "learning_rate": 4.228927203065134e-05, "loss": 1.7423, "step": 2110 }, { "epoch": 0.9689213893967094, "grad_norm": 77924.8828125, "learning_rate": 4.224137931034483e-05, "loss": 1.7371, "step": 2120 }, { "epoch": 0.973491773308958, "grad_norm": 108620.1484375, "learning_rate": 4.2193486590038316e-05, "loss": 1.6786, "step": 2130 }, { "epoch": 0.9780621572212066, "grad_norm": 96284.8984375, "learning_rate": 4.21455938697318e-05, "loss": 1.641, "step": 2140 }, { "epoch": 0.9826325411334552, "grad_norm": 78326.7890625, "learning_rate": 4.209770114942529e-05, "loss": 1.6984, "step": 2150 }, { "epoch": 0.9872029250457038, "grad_norm": 86470.40625, "learning_rate": 4.204980842911878e-05, "loss": 1.7534, "step": 2160 }, { "epoch": 0.9917733089579525, "grad_norm": 104938.75, "learning_rate": 4.2001915708812264e-05, "loss": 1.7039, "step": 2170 }, { "epoch": 0.9963436928702011, "grad_norm": 100711.6875, "learning_rate": 4.195402298850575e-05, "loss": 1.7534, "step": 2180 }, { "epoch": 1.0, "eval_loss": 1.7005630731582642, "eval_runtime": 345.6818, "eval_samples_per_second": 43.393, "eval_steps_per_second": 1.357, "step": 2188 }, { "epoch": 1.0009140767824498, "grad_norm": 91781.6796875, "learning_rate": 4.190613026819923e-05, "loss": 1.6595, "step": 2190 }, { "epoch": 1.0054844606946984, "grad_norm": 90080.9375, "learning_rate": 4.185823754789272e-05, "loss": 1.6538, "step": 2200 }, { "epoch": 1.010054844606947, "grad_norm": 88122.7890625, "learning_rate": 4.1810344827586205e-05, "loss": 1.6596, "step": 2210 }, { "epoch": 1.0146252285191957, "grad_norm": 111770.046875, "learning_rate": 4.17624521072797e-05, "loss": 1.6971, "step": 2220 }, { "epoch": 1.0191956124314443, "grad_norm": 104428.828125, "learning_rate": 4.1714559386973185e-05, "loss": 1.7929, "step": 2230 }, { "epoch": 1.023765996343693, "grad_norm": 90344.828125, "learning_rate": 4.166666666666667e-05, "loss": 1.8338, "step": 2240 }, { "epoch": 1.0283363802559415, "grad_norm": 87549.5546875, "learning_rate": 4.161877394636016e-05, "loss": 1.6911, "step": 2250 }, { "epoch": 1.0329067641681902, "grad_norm": 82813.453125, "learning_rate": 4.1570881226053646e-05, "loss": 1.7573, "step": 2260 }, { "epoch": 1.0374771480804388, "grad_norm": 87447.1328125, "learning_rate": 4.1522988505747126e-05, "loss": 1.7275, "step": 2270 }, { "epoch": 1.0420475319926874, "grad_norm": 63916.98046875, "learning_rate": 4.147509578544061e-05, "loss": 1.7326, "step": 2280 }, { "epoch": 1.046617915904936, "grad_norm": 89433.3828125, "learning_rate": 4.14272030651341e-05, "loss": 1.7586, "step": 2290 }, { "epoch": 1.0511882998171846, "grad_norm": 171660.09375, "learning_rate": 4.1379310344827587e-05, "loss": 1.789, "step": 2300 }, { "epoch": 1.0557586837294333, "grad_norm": 69103.7734375, "learning_rate": 4.1331417624521073e-05, "loss": 1.7837, "step": 2310 }, { "epoch": 1.0603290676416819, "grad_norm": 81962.609375, "learning_rate": 4.128352490421456e-05, "loss": 1.7424, "step": 2320 }, { "epoch": 1.0648994515539305, "grad_norm": 83097.5703125, "learning_rate": 4.123563218390805e-05, "loss": 1.7879, "step": 2330 }, { "epoch": 1.0694698354661791, "grad_norm": 112565.484375, "learning_rate": 4.1187739463601534e-05, "loss": 1.7266, "step": 2340 }, { "epoch": 1.0740402193784278, "grad_norm": 120119.7265625, "learning_rate": 4.113984674329502e-05, "loss": 1.8418, "step": 2350 }, { "epoch": 1.0786106032906764, "grad_norm": 105760.8359375, "learning_rate": 4.109195402298851e-05, "loss": 1.7203, "step": 2360 }, { "epoch": 1.083180987202925, "grad_norm": 88435.3125, "learning_rate": 4.1044061302681995e-05, "loss": 1.6665, "step": 2370 }, { "epoch": 1.0877513711151736, "grad_norm": 74858.359375, "learning_rate": 4.099616858237548e-05, "loss": 1.6907, "step": 2380 }, { "epoch": 1.0923217550274222, "grad_norm": 80752.5546875, "learning_rate": 4.094827586206897e-05, "loss": 1.7489, "step": 2390 }, { "epoch": 1.0968921389396709, "grad_norm": 85903.7578125, "learning_rate": 4.0900383141762455e-05, "loss": 1.6526, "step": 2400 }, { "epoch": 1.1014625228519195, "grad_norm": 87469.15625, "learning_rate": 4.085249042145594e-05, "loss": 1.7612, "step": 2410 }, { "epoch": 1.106032906764168, "grad_norm": 115305.1484375, "learning_rate": 4.080459770114943e-05, "loss": 1.7148, "step": 2420 }, { "epoch": 1.1106032906764167, "grad_norm": 76171.40625, "learning_rate": 4.0756704980842916e-05, "loss": 1.7383, "step": 2430 }, { "epoch": 1.1151736745886653, "grad_norm": 110853.4140625, "learning_rate": 4.07088122605364e-05, "loss": 1.7353, "step": 2440 }, { "epoch": 1.1197440585009142, "grad_norm": 83511.421875, "learning_rate": 4.066091954022988e-05, "loss": 1.7261, "step": 2450 }, { "epoch": 1.1243144424131628, "grad_norm": 69653.203125, "learning_rate": 4.061302681992337e-05, "loss": 1.7403, "step": 2460 }, { "epoch": 1.1288848263254114, "grad_norm": 141315.3125, "learning_rate": 4.056513409961686e-05, "loss": 1.7125, "step": 2470 }, { "epoch": 1.13345521023766, "grad_norm": 139185.4375, "learning_rate": 4.0517241379310344e-05, "loss": 1.66, "step": 2480 }, { "epoch": 1.1380255941499087, "grad_norm": 107879.578125, "learning_rate": 4.046934865900383e-05, "loss": 1.8147, "step": 2490 }, { "epoch": 1.1425959780621573, "grad_norm": 93484.1953125, "learning_rate": 4.0421455938697324e-05, "loss": 1.6882, "step": 2500 }, { "epoch": 1.147166361974406, "grad_norm": 96417.0390625, "learning_rate": 4.037356321839081e-05, "loss": 1.7444, "step": 2510 }, { "epoch": 1.1517367458866545, "grad_norm": 105896.125, "learning_rate": 4.03256704980843e-05, "loss": 1.7218, "step": 2520 }, { "epoch": 1.1563071297989032, "grad_norm": 70536.5078125, "learning_rate": 4.027777777777778e-05, "loss": 1.643, "step": 2530 }, { "epoch": 1.1608775137111518, "grad_norm": 188483.515625, "learning_rate": 4.0229885057471265e-05, "loss": 1.7145, "step": 2540 }, { "epoch": 1.1654478976234004, "grad_norm": 91915.40625, "learning_rate": 4.018199233716475e-05, "loss": 1.6617, "step": 2550 }, { "epoch": 1.170018281535649, "grad_norm": 112711.40625, "learning_rate": 4.013409961685824e-05, "loss": 1.7117, "step": 2560 }, { "epoch": 1.1745886654478976, "grad_norm": 81478.6953125, "learning_rate": 4.0086206896551726e-05, "loss": 1.8141, "step": 2570 }, { "epoch": 1.1791590493601463, "grad_norm": 90161.96875, "learning_rate": 4.003831417624521e-05, "loss": 1.6604, "step": 2580 }, { "epoch": 1.1837294332723949, "grad_norm": 67411.1953125, "learning_rate": 3.99904214559387e-05, "loss": 1.7204, "step": 2590 }, { "epoch": 1.1882998171846435, "grad_norm": 104490.5234375, "learning_rate": 3.9942528735632186e-05, "loss": 1.7942, "step": 2600 }, { "epoch": 1.1928702010968921, "grad_norm": 81397.3515625, "learning_rate": 3.989463601532567e-05, "loss": 1.6425, "step": 2610 }, { "epoch": 1.1974405850091407, "grad_norm": 117060.71875, "learning_rate": 3.984674329501916e-05, "loss": 1.7771, "step": 2620 }, { "epoch": 1.2020109689213894, "grad_norm": 80522.625, "learning_rate": 3.979885057471265e-05, "loss": 1.7167, "step": 2630 }, { "epoch": 1.206581352833638, "grad_norm": 92349.40625, "learning_rate": 3.9750957854406134e-05, "loss": 1.6713, "step": 2640 }, { "epoch": 1.2111517367458866, "grad_norm": 100128.2578125, "learning_rate": 3.970306513409962e-05, "loss": 1.7673, "step": 2650 }, { "epoch": 1.2157221206581352, "grad_norm": 89113.0625, "learning_rate": 3.965517241379311e-05, "loss": 1.6187, "step": 2660 }, { "epoch": 1.2202925045703839, "grad_norm": 124528.21875, "learning_rate": 3.9607279693486594e-05, "loss": 1.7173, "step": 2670 }, { "epoch": 1.2248628884826325, "grad_norm": 105671.9765625, "learning_rate": 3.955938697318008e-05, "loss": 1.7232, "step": 2680 }, { "epoch": 1.229433272394881, "grad_norm": 72920.734375, "learning_rate": 3.951149425287357e-05, "loss": 1.7218, "step": 2690 }, { "epoch": 1.2340036563071297, "grad_norm": 89138.640625, "learning_rate": 3.9463601532567055e-05, "loss": 1.681, "step": 2700 }, { "epoch": 1.2385740402193783, "grad_norm": 76915.9765625, "learning_rate": 3.9415708812260535e-05, "loss": 1.7417, "step": 2710 }, { "epoch": 1.2431444241316272, "grad_norm": 76970.3359375, "learning_rate": 3.936781609195402e-05, "loss": 1.7513, "step": 2720 }, { "epoch": 1.2477148080438756, "grad_norm": 102189.65625, "learning_rate": 3.931992337164751e-05, "loss": 1.6338, "step": 2730 }, { "epoch": 1.2522851919561244, "grad_norm": 102939.9375, "learning_rate": 3.9272030651340996e-05, "loss": 1.7095, "step": 2740 }, { "epoch": 1.2568555758683728, "grad_norm": 75103.9375, "learning_rate": 3.922413793103448e-05, "loss": 1.7029, "step": 2750 }, { "epoch": 1.2614259597806217, "grad_norm": 118085.3125, "learning_rate": 3.917624521072797e-05, "loss": 1.741, "step": 2760 }, { "epoch": 1.26599634369287, "grad_norm": 78790.3984375, "learning_rate": 3.912835249042146e-05, "loss": 1.739, "step": 2770 }, { "epoch": 1.270566727605119, "grad_norm": 105775.8125, "learning_rate": 3.908045977011495e-05, "loss": 1.6434, "step": 2780 }, { "epoch": 1.2751371115173675, "grad_norm": 78574.390625, "learning_rate": 3.903256704980843e-05, "loss": 1.7371, "step": 2790 }, { "epoch": 1.2797074954296161, "grad_norm": 141283.359375, "learning_rate": 3.898467432950192e-05, "loss": 1.7781, "step": 2800 }, { "epoch": 1.2842778793418648, "grad_norm": 111235.7265625, "learning_rate": 3.8936781609195404e-05, "loss": 1.7383, "step": 2810 }, { "epoch": 1.2888482632541134, "grad_norm": 76845.9296875, "learning_rate": 3.888888888888889e-05, "loss": 1.6674, "step": 2820 }, { "epoch": 1.293418647166362, "grad_norm": 95679.5, "learning_rate": 3.884099616858238e-05, "loss": 1.7516, "step": 2830 }, { "epoch": 1.2979890310786106, "grad_norm": 69231.671875, "learning_rate": 3.8793103448275865e-05, "loss": 1.7267, "step": 2840 }, { "epoch": 1.3025594149908593, "grad_norm": 90358.46875, "learning_rate": 3.874521072796935e-05, "loss": 1.7008, "step": 2850 }, { "epoch": 1.3071297989031079, "grad_norm": 130772.9921875, "learning_rate": 3.869731800766284e-05, "loss": 1.7041, "step": 2860 }, { "epoch": 1.3117001828153565, "grad_norm": 86227.6015625, "learning_rate": 3.8649425287356325e-05, "loss": 1.6709, "step": 2870 }, { "epoch": 1.3162705667276051, "grad_norm": 108528.421875, "learning_rate": 3.8601532567049805e-05, "loss": 1.6854, "step": 2880 }, { "epoch": 1.3208409506398537, "grad_norm": 96323.234375, "learning_rate": 3.855363984674329e-05, "loss": 1.7317, "step": 2890 }, { "epoch": 1.3254113345521024, "grad_norm": 160365.8125, "learning_rate": 3.850574712643678e-05, "loss": 1.7585, "step": 2900 }, { "epoch": 1.329981718464351, "grad_norm": 80294.25, "learning_rate": 3.845785440613027e-05, "loss": 1.7612, "step": 2910 }, { "epoch": 1.3345521023765996, "grad_norm": 106545.8515625, "learning_rate": 3.840996168582376e-05, "loss": 1.7292, "step": 2920 }, { "epoch": 1.3391224862888482, "grad_norm": 95112.4375, "learning_rate": 3.8362068965517246e-05, "loss": 1.7349, "step": 2930 }, { "epoch": 1.3436928702010968, "grad_norm": 65303.0390625, "learning_rate": 3.831417624521073e-05, "loss": 1.7047, "step": 2940 }, { "epoch": 1.3482632541133455, "grad_norm": 66884.5078125, "learning_rate": 3.826628352490422e-05, "loss": 1.7516, "step": 2950 }, { "epoch": 1.352833638025594, "grad_norm": 87596.6796875, "learning_rate": 3.82183908045977e-05, "loss": 1.7047, "step": 2960 }, { "epoch": 1.3574040219378427, "grad_norm": 87859.3671875, "learning_rate": 3.817049808429119e-05, "loss": 1.848, "step": 2970 }, { "epoch": 1.3619744058500913, "grad_norm": 78479.203125, "learning_rate": 3.8122605363984674e-05, "loss": 1.7104, "step": 2980 }, { "epoch": 1.3665447897623402, "grad_norm": 67868.75, "learning_rate": 3.807471264367816e-05, "loss": 1.7818, "step": 2990 }, { "epoch": 1.3711151736745886, "grad_norm": 108708.4921875, "learning_rate": 3.802681992337165e-05, "loss": 1.6592, "step": 3000 }, { "epoch": 1.3756855575868374, "grad_norm": 49919.5, "learning_rate": 3.7978927203065135e-05, "loss": 1.8281, "step": 3010 }, { "epoch": 1.3802559414990858, "grad_norm": 92234.84375, "learning_rate": 3.793103448275862e-05, "loss": 1.6741, "step": 3020 }, { "epoch": 1.3848263254113347, "grad_norm": 110063.28125, "learning_rate": 3.788314176245211e-05, "loss": 1.6882, "step": 3030 }, { "epoch": 1.389396709323583, "grad_norm": 99851.5, "learning_rate": 3.7835249042145595e-05, "loss": 1.7069, "step": 3040 }, { "epoch": 1.393967093235832, "grad_norm": 91330.5, "learning_rate": 3.778735632183908e-05, "loss": 1.7738, "step": 3050 }, { "epoch": 1.3985374771480805, "grad_norm": 97720.6953125, "learning_rate": 3.773946360153257e-05, "loss": 1.699, "step": 3060 }, { "epoch": 1.4031078610603291, "grad_norm": 60284.0859375, "learning_rate": 3.7691570881226056e-05, "loss": 1.6353, "step": 3070 }, { "epoch": 1.4076782449725778, "grad_norm": 110682.34375, "learning_rate": 3.764367816091954e-05, "loss": 1.6963, "step": 3080 }, { "epoch": 1.4122486288848264, "grad_norm": 144162.453125, "learning_rate": 3.759578544061303e-05, "loss": 1.7995, "step": 3090 }, { "epoch": 1.416819012797075, "grad_norm": 205120.046875, "learning_rate": 3.7547892720306517e-05, "loss": 1.796, "step": 3100 }, { "epoch": 1.4213893967093236, "grad_norm": 144576.359375, "learning_rate": 3.7500000000000003e-05, "loss": 1.663, "step": 3110 }, { "epoch": 1.4259597806215722, "grad_norm": 103083.203125, "learning_rate": 3.745210727969349e-05, "loss": 1.6979, "step": 3120 }, { "epoch": 1.4305301645338209, "grad_norm": 100059.3046875, "learning_rate": 3.740421455938698e-05, "loss": 1.7823, "step": 3130 }, { "epoch": 1.4351005484460695, "grad_norm": 128589.3046875, "learning_rate": 3.735632183908046e-05, "loss": 1.7182, "step": 3140 }, { "epoch": 1.4396709323583181, "grad_norm": 70812.890625, "learning_rate": 3.7308429118773944e-05, "loss": 1.7785, "step": 3150 }, { "epoch": 1.4442413162705667, "grad_norm": 63189.20703125, "learning_rate": 3.726053639846743e-05, "loss": 1.7257, "step": 3160 }, { "epoch": 1.4488117001828154, "grad_norm": 108848.34375, "learning_rate": 3.721264367816092e-05, "loss": 1.7409, "step": 3170 }, { "epoch": 1.453382084095064, "grad_norm": 97378.515625, "learning_rate": 3.716475095785441e-05, "loss": 1.7078, "step": 3180 }, { "epoch": 1.4579524680073126, "grad_norm": 101953.046875, "learning_rate": 3.71168582375479e-05, "loss": 1.8016, "step": 3190 }, { "epoch": 1.4625228519195612, "grad_norm": 118171.328125, "learning_rate": 3.7068965517241385e-05, "loss": 1.719, "step": 3200 }, { "epoch": 1.4670932358318098, "grad_norm": 108193.546875, "learning_rate": 3.702107279693487e-05, "loss": 1.7423, "step": 3210 }, { "epoch": 1.4716636197440585, "grad_norm": 83211.53125, "learning_rate": 3.697318007662835e-05, "loss": 1.6915, "step": 3220 }, { "epoch": 1.476234003656307, "grad_norm": 118025.546875, "learning_rate": 3.692528735632184e-05, "loss": 1.7542, "step": 3230 }, { "epoch": 1.4808043875685557, "grad_norm": 79268.8828125, "learning_rate": 3.6877394636015326e-05, "loss": 1.7558, "step": 3240 }, { "epoch": 1.4853747714808043, "grad_norm": 91756.1015625, "learning_rate": 3.682950191570881e-05, "loss": 1.6653, "step": 3250 }, { "epoch": 1.489945155393053, "grad_norm": 114188.828125, "learning_rate": 3.67816091954023e-05, "loss": 1.7899, "step": 3260 }, { "epoch": 1.4945155393053016, "grad_norm": 122504.4921875, "learning_rate": 3.673371647509579e-05, "loss": 1.8254, "step": 3270 }, { "epoch": 1.4990859232175504, "grad_norm": 68376.1640625, "learning_rate": 3.6685823754789274e-05, "loss": 1.7274, "step": 3280 }, { "epoch": 1.5036563071297988, "grad_norm": 118541.71875, "learning_rate": 3.663793103448276e-05, "loss": 1.7456, "step": 3290 }, { "epoch": 1.5082266910420477, "grad_norm": 62813.5078125, "learning_rate": 3.659003831417625e-05, "loss": 1.7049, "step": 3300 }, { "epoch": 1.512797074954296, "grad_norm": 107603.8515625, "learning_rate": 3.6542145593869734e-05, "loss": 1.6411, "step": 3310 }, { "epoch": 1.517367458866545, "grad_norm": 66405.75, "learning_rate": 3.649425287356322e-05, "loss": 1.6928, "step": 3320 }, { "epoch": 1.5219378427787933, "grad_norm": 77770.8125, "learning_rate": 3.644636015325671e-05, "loss": 1.6668, "step": 3330 }, { "epoch": 1.5265082266910421, "grad_norm": 96171.3359375, "learning_rate": 3.6398467432950195e-05, "loss": 1.7354, "step": 3340 }, { "epoch": 1.5310786106032905, "grad_norm": 114372.53125, "learning_rate": 3.635057471264368e-05, "loss": 1.6588, "step": 3350 }, { "epoch": 1.5356489945155394, "grad_norm": 126977.671875, "learning_rate": 3.630268199233717e-05, "loss": 1.7437, "step": 3360 }, { "epoch": 1.5402193784277878, "grad_norm": 124899.46875, "learning_rate": 3.6254789272030656e-05, "loss": 1.7381, "step": 3370 }, { "epoch": 1.5447897623400366, "grad_norm": 87250.2109375, "learning_rate": 3.620689655172414e-05, "loss": 1.7489, "step": 3380 }, { "epoch": 1.5493601462522852, "grad_norm": 99225.671875, "learning_rate": 3.615900383141763e-05, "loss": 1.7005, "step": 3390 }, { "epoch": 1.5539305301645339, "grad_norm": 110436.3515625, "learning_rate": 3.611111111111111e-05, "loss": 1.7526, "step": 3400 }, { "epoch": 1.5585009140767825, "grad_norm": 73272.2421875, "learning_rate": 3.6063218390804596e-05, "loss": 1.6412, "step": 3410 }, { "epoch": 1.563071297989031, "grad_norm": 83162.109375, "learning_rate": 3.601532567049808e-05, "loss": 1.7777, "step": 3420 }, { "epoch": 1.5676416819012797, "grad_norm": 65596.4609375, "learning_rate": 3.596743295019157e-05, "loss": 1.7165, "step": 3430 }, { "epoch": 1.5722120658135283, "grad_norm": 87716.9375, "learning_rate": 3.591954022988506e-05, "loss": 1.7015, "step": 3440 }, { "epoch": 1.576782449725777, "grad_norm": 108148.75, "learning_rate": 3.5871647509578544e-05, "loss": 1.6684, "step": 3450 }, { "epoch": 1.5813528336380256, "grad_norm": 79430.3203125, "learning_rate": 3.582375478927204e-05, "loss": 1.6243, "step": 3460 }, { "epoch": 1.5859232175502742, "grad_norm": 102633.3046875, "learning_rate": 3.5775862068965524e-05, "loss": 1.7009, "step": 3470 }, { "epoch": 1.5904936014625228, "grad_norm": 116884.0546875, "learning_rate": 3.5727969348659004e-05, "loss": 1.6794, "step": 3480 }, { "epoch": 1.5950639853747715, "grad_norm": 123101.0078125, "learning_rate": 3.568007662835249e-05, "loss": 1.7216, "step": 3490 }, { "epoch": 1.59963436928702, "grad_norm": 81439.2734375, "learning_rate": 3.563218390804598e-05, "loss": 1.6748, "step": 3500 }, { "epoch": 1.6042047531992687, "grad_norm": 101444.484375, "learning_rate": 3.5584291187739465e-05, "loss": 1.8176, "step": 3510 }, { "epoch": 1.6087751371115173, "grad_norm": 76834.7578125, "learning_rate": 3.553639846743295e-05, "loss": 1.6932, "step": 3520 }, { "epoch": 1.6133455210237662, "grad_norm": 102102.7265625, "learning_rate": 3.548850574712644e-05, "loss": 1.7848, "step": 3530 }, { "epoch": 1.6179159049360146, "grad_norm": 65518.3984375, "learning_rate": 3.5440613026819926e-05, "loss": 1.6966, "step": 3540 }, { "epoch": 1.6224862888482634, "grad_norm": 72790.3359375, "learning_rate": 3.539272030651341e-05, "loss": 1.6584, "step": 3550 }, { "epoch": 1.6270566727605118, "grad_norm": 73727.9140625, "learning_rate": 3.53448275862069e-05, "loss": 1.7437, "step": 3560 }, { "epoch": 1.6316270566727606, "grad_norm": 91062.0859375, "learning_rate": 3.529693486590038e-05, "loss": 1.6678, "step": 3570 }, { "epoch": 1.636197440585009, "grad_norm": 172251.484375, "learning_rate": 3.5249042145593867e-05, "loss": 1.7394, "step": 3580 }, { "epoch": 1.6407678244972579, "grad_norm": 85787.4140625, "learning_rate": 3.5201149425287353e-05, "loss": 1.6567, "step": 3590 }, { "epoch": 1.6453382084095063, "grad_norm": 96389.1171875, "learning_rate": 3.515325670498085e-05, "loss": 1.7427, "step": 3600 }, { "epoch": 1.6499085923217551, "grad_norm": 69201.625, "learning_rate": 3.5105363984674334e-05, "loss": 1.7775, "step": 3610 }, { "epoch": 1.6544789762340035, "grad_norm": 100617.40625, "learning_rate": 3.505747126436782e-05, "loss": 1.7371, "step": 3620 }, { "epoch": 1.6590493601462524, "grad_norm": 96952.21875, "learning_rate": 3.500957854406131e-05, "loss": 1.7269, "step": 3630 }, { "epoch": 1.6636197440585008, "grad_norm": 74618.84375, "learning_rate": 3.4961685823754795e-05, "loss": 1.6758, "step": 3640 }, { "epoch": 1.6681901279707496, "grad_norm": 57622.64453125, "learning_rate": 3.4913793103448275e-05, "loss": 1.6987, "step": 3650 }, { "epoch": 1.672760511882998, "grad_norm": 102405.7421875, "learning_rate": 3.486590038314176e-05, "loss": 1.755, "step": 3660 }, { "epoch": 1.6773308957952469, "grad_norm": 99707.1953125, "learning_rate": 3.481800766283525e-05, "loss": 1.6318, "step": 3670 }, { "epoch": 1.6819012797074955, "grad_norm": 84351.671875, "learning_rate": 3.4770114942528735e-05, "loss": 1.7005, "step": 3680 }, { "epoch": 1.686471663619744, "grad_norm": 92737.875, "learning_rate": 3.472222222222222e-05, "loss": 1.6547, "step": 3690 }, { "epoch": 1.6910420475319927, "grad_norm": 117513.71875, "learning_rate": 3.467432950191571e-05, "loss": 1.6817, "step": 3700 }, { "epoch": 1.6956124314442413, "grad_norm": 78983.3515625, "learning_rate": 3.4626436781609196e-05, "loss": 1.7011, "step": 3710 }, { "epoch": 1.70018281535649, "grad_norm": 160552.734375, "learning_rate": 3.457854406130268e-05, "loss": 1.7277, "step": 3720 }, { "epoch": 1.7047531992687386, "grad_norm": 69111.1328125, "learning_rate": 3.453065134099617e-05, "loss": 1.7326, "step": 3730 }, { "epoch": 1.7093235831809872, "grad_norm": 106880.96875, "learning_rate": 3.4482758620689657e-05, "loss": 1.7453, "step": 3740 }, { "epoch": 1.7138939670932358, "grad_norm": 117654.46875, "learning_rate": 3.4434865900383143e-05, "loss": 1.8011, "step": 3750 }, { "epoch": 1.7184643510054844, "grad_norm": 85585.7265625, "learning_rate": 3.438697318007663e-05, "loss": 1.7664, "step": 3760 }, { "epoch": 1.723034734917733, "grad_norm": 77661.5078125, "learning_rate": 3.433908045977012e-05, "loss": 1.6859, "step": 3770 }, { "epoch": 1.7276051188299817, "grad_norm": 106292.359375, "learning_rate": 3.4291187739463604e-05, "loss": 1.7482, "step": 3780 }, { "epoch": 1.7321755027422303, "grad_norm": 98747.6171875, "learning_rate": 3.424329501915709e-05, "loss": 1.7234, "step": 3790 }, { "epoch": 1.736745886654479, "grad_norm": 155160.953125, "learning_rate": 3.419540229885058e-05, "loss": 1.8456, "step": 3800 }, { "epoch": 1.7413162705667276, "grad_norm": 94582.5546875, "learning_rate": 3.4147509578544065e-05, "loss": 1.8332, "step": 3810 }, { "epoch": 1.7458866544789764, "grad_norm": 96502.8359375, "learning_rate": 3.409961685823755e-05, "loss": 1.8145, "step": 3820 }, { "epoch": 1.7504570383912248, "grad_norm": 87164.46875, "learning_rate": 3.405172413793103e-05, "loss": 1.6959, "step": 3830 }, { "epoch": 1.7550274223034736, "grad_norm": 94143.84375, "learning_rate": 3.400383141762452e-05, "loss": 1.7662, "step": 3840 }, { "epoch": 1.759597806215722, "grad_norm": 69526.3671875, "learning_rate": 3.3955938697318005e-05, "loss": 1.6818, "step": 3850 }, { "epoch": 1.7641681901279709, "grad_norm": 91595.3359375, "learning_rate": 3.390804597701149e-05, "loss": 1.6884, "step": 3860 }, { "epoch": 1.7687385740402193, "grad_norm": 97468.0234375, "learning_rate": 3.3860153256704986e-05, "loss": 1.7608, "step": 3870 }, { "epoch": 1.7733089579524681, "grad_norm": 138455.578125, "learning_rate": 3.381226053639847e-05, "loss": 1.6528, "step": 3880 }, { "epoch": 1.7778793418647165, "grad_norm": 107855.234375, "learning_rate": 3.376436781609196e-05, "loss": 1.7178, "step": 3890 }, { "epoch": 1.7824497257769654, "grad_norm": 105021.2421875, "learning_rate": 3.371647509578545e-05, "loss": 1.6807, "step": 3900 }, { "epoch": 1.7870201096892138, "grad_norm": 252806.71875, "learning_rate": 3.366858237547893e-05, "loss": 1.6723, "step": 3910 }, { "epoch": 1.7915904936014626, "grad_norm": 116308.34375, "learning_rate": 3.3620689655172414e-05, "loss": 1.7991, "step": 3920 }, { "epoch": 1.796160877513711, "grad_norm": 86454.4375, "learning_rate": 3.35727969348659e-05, "loss": 1.7825, "step": 3930 }, { "epoch": 1.8007312614259599, "grad_norm": 71364.28125, "learning_rate": 3.352490421455939e-05, "loss": 1.769, "step": 3940 }, { "epoch": 1.8053016453382082, "grad_norm": 70699.7265625, "learning_rate": 3.3477011494252874e-05, "loss": 1.7765, "step": 3950 }, { "epoch": 1.809872029250457, "grad_norm": 58119.38671875, "learning_rate": 3.342911877394636e-05, "loss": 1.6735, "step": 3960 }, { "epoch": 1.8144424131627057, "grad_norm": 89724.2578125, "learning_rate": 3.338122605363985e-05, "loss": 1.7261, "step": 3970 }, { "epoch": 1.8190127970749543, "grad_norm": 103765.2421875, "learning_rate": 3.3333333333333335e-05, "loss": 1.7276, "step": 3980 }, { "epoch": 1.823583180987203, "grad_norm": 108591.3046875, "learning_rate": 3.328544061302682e-05, "loss": 1.6897, "step": 3990 }, { "epoch": 1.8281535648994516, "grad_norm": 70272.8046875, "learning_rate": 3.323754789272031e-05, "loss": 1.6812, "step": 4000 }, { "epoch": 1.8327239488117002, "grad_norm": 71892.390625, "learning_rate": 3.3189655172413796e-05, "loss": 1.7387, "step": 4010 }, { "epoch": 1.8372943327239488, "grad_norm": 73947.421875, "learning_rate": 3.314176245210728e-05, "loss": 1.6488, "step": 4020 }, { "epoch": 1.8418647166361974, "grad_norm": 90722.765625, "learning_rate": 3.309386973180077e-05, "loss": 1.7028, "step": 4030 }, { "epoch": 1.846435100548446, "grad_norm": 106649.1484375, "learning_rate": 3.3045977011494256e-05, "loss": 1.7118, "step": 4040 }, { "epoch": 1.8510054844606947, "grad_norm": 79884.34375, "learning_rate": 3.299808429118774e-05, "loss": 1.6838, "step": 4050 }, { "epoch": 1.8555758683729433, "grad_norm": 133383.34375, "learning_rate": 3.295019157088123e-05, "loss": 1.7038, "step": 4060 }, { "epoch": 1.860146252285192, "grad_norm": 93182.28125, "learning_rate": 3.290229885057472e-05, "loss": 1.6335, "step": 4070 }, { "epoch": 1.8647166361974405, "grad_norm": 57547.94921875, "learning_rate": 3.2854406130268204e-05, "loss": 1.7422, "step": 4080 }, { "epoch": 1.8692870201096892, "grad_norm": 95853.7734375, "learning_rate": 3.2806513409961684e-05, "loss": 1.775, "step": 4090 }, { "epoch": 1.8738574040219378, "grad_norm": 144561.4375, "learning_rate": 3.275862068965517e-05, "loss": 1.8058, "step": 4100 }, { "epoch": 1.8784277879341866, "grad_norm": 108504.7734375, "learning_rate": 3.271072796934866e-05, "loss": 1.7476, "step": 4110 }, { "epoch": 1.882998171846435, "grad_norm": 94651.2109375, "learning_rate": 3.2662835249042144e-05, "loss": 1.7199, "step": 4120 }, { "epoch": 1.8875685557586839, "grad_norm": 70529.734375, "learning_rate": 3.261494252873563e-05, "loss": 1.7404, "step": 4130 }, { "epoch": 1.8921389396709323, "grad_norm": 61164.6796875, "learning_rate": 3.256704980842912e-05, "loss": 1.6635, "step": 4140 }, { "epoch": 1.8967093235831811, "grad_norm": 114792.84375, "learning_rate": 3.251915708812261e-05, "loss": 1.7986, "step": 4150 }, { "epoch": 1.9012797074954295, "grad_norm": 72610.09375, "learning_rate": 3.24712643678161e-05, "loss": 1.6614, "step": 4160 }, { "epoch": 1.9058500914076784, "grad_norm": 85551.8828125, "learning_rate": 3.242337164750958e-05, "loss": 1.6255, "step": 4170 }, { "epoch": 1.9104204753199268, "grad_norm": 109005.5546875, "learning_rate": 3.2375478927203066e-05, "loss": 1.6897, "step": 4180 }, { "epoch": 1.9149908592321756, "grad_norm": 107019.8046875, "learning_rate": 3.232758620689655e-05, "loss": 1.732, "step": 4190 }, { "epoch": 1.919561243144424, "grad_norm": 78904.2109375, "learning_rate": 3.227969348659004e-05, "loss": 1.7201, "step": 4200 }, { "epoch": 1.9241316270566728, "grad_norm": 64701.79296875, "learning_rate": 3.2231800766283526e-05, "loss": 1.6952, "step": 4210 }, { "epoch": 1.9287020109689212, "grad_norm": 64434.64453125, "learning_rate": 3.218390804597701e-05, "loss": 1.6946, "step": 4220 }, { "epoch": 1.93327239488117, "grad_norm": 115172.6171875, "learning_rate": 3.21360153256705e-05, "loss": 1.6573, "step": 4230 }, { "epoch": 1.9378427787934185, "grad_norm": 82583.25, "learning_rate": 3.208812260536399e-05, "loss": 1.7499, "step": 4240 }, { "epoch": 1.9424131627056673, "grad_norm": 93962.40625, "learning_rate": 3.2040229885057474e-05, "loss": 1.7036, "step": 4250 }, { "epoch": 1.946983546617916, "grad_norm": 81828.8203125, "learning_rate": 3.1992337164750954e-05, "loss": 1.6486, "step": 4260 }, { "epoch": 1.9515539305301646, "grad_norm": 109928.0546875, "learning_rate": 3.194444444444444e-05, "loss": 1.7805, "step": 4270 }, { "epoch": 1.9561243144424132, "grad_norm": 100452.4765625, "learning_rate": 3.1896551724137935e-05, "loss": 1.6897, "step": 4280 }, { "epoch": 1.9606946983546618, "grad_norm": 53400.49609375, "learning_rate": 3.184865900383142e-05, "loss": 1.681, "step": 4290 }, { "epoch": 1.9652650822669104, "grad_norm": 104313.296875, "learning_rate": 3.180076628352491e-05, "loss": 1.737, "step": 4300 }, { "epoch": 1.969835466179159, "grad_norm": 92321.3515625, "learning_rate": 3.1752873563218395e-05, "loss": 1.6844, "step": 4310 }, { "epoch": 1.9744058500914077, "grad_norm": 87737.4140625, "learning_rate": 3.170498084291188e-05, "loss": 1.7712, "step": 4320 }, { "epoch": 1.9789762340036563, "grad_norm": 108424.9453125, "learning_rate": 3.165708812260537e-05, "loss": 1.7173, "step": 4330 }, { "epoch": 1.983546617915905, "grad_norm": 79022.0546875, "learning_rate": 3.160919540229885e-05, "loss": 1.668, "step": 4340 }, { "epoch": 1.9881170018281535, "grad_norm": 67820.46875, "learning_rate": 3.1561302681992336e-05, "loss": 1.7499, "step": 4350 }, { "epoch": 1.9926873857404022, "grad_norm": 56711.33984375, "learning_rate": 3.151340996168582e-05, "loss": 1.6331, "step": 4360 }, { "epoch": 1.9972577696526508, "grad_norm": 137476.015625, "learning_rate": 3.146551724137931e-05, "loss": 1.6912, "step": 4370 }, { "epoch": 2.0, "eval_loss": 1.693359375, "eval_runtime": 345.1475, "eval_samples_per_second": 43.46, "eval_steps_per_second": 1.359, "step": 4376 }, { "epoch": 2.0018281535648996, "grad_norm": 95937.5703125, "learning_rate": 3.1417624521072797e-05, "loss": 1.7171, "step": 4380 }, { "epoch": 2.006398537477148, "grad_norm": 69800.3515625, "learning_rate": 3.1369731800766283e-05, "loss": 1.698, "step": 4390 }, { "epoch": 2.010968921389397, "grad_norm": 74739.84375, "learning_rate": 3.132183908045977e-05, "loss": 1.6507, "step": 4400 }, { "epoch": 2.0155393053016453, "grad_norm": 82305.1484375, "learning_rate": 3.127394636015326e-05, "loss": 1.7155, "step": 4410 }, { "epoch": 2.020109689213894, "grad_norm": 62019.15625, "learning_rate": 3.1226053639846744e-05, "loss": 1.7122, "step": 4420 }, { "epoch": 2.0246800731261425, "grad_norm": 63863.31640625, "learning_rate": 3.117816091954023e-05, "loss": 1.7085, "step": 4430 }, { "epoch": 2.0292504570383914, "grad_norm": 51309.7109375, "learning_rate": 3.113026819923372e-05, "loss": 1.7576, "step": 4440 }, { "epoch": 2.0338208409506398, "grad_norm": 55412.6796875, "learning_rate": 3.1082375478927205e-05, "loss": 1.709, "step": 4450 }, { "epoch": 2.0383912248628886, "grad_norm": 107907.9609375, "learning_rate": 3.103448275862069e-05, "loss": 1.771, "step": 4460 }, { "epoch": 2.042961608775137, "grad_norm": 73671.1796875, "learning_rate": 3.098659003831418e-05, "loss": 1.7703, "step": 4470 }, { "epoch": 2.047531992687386, "grad_norm": 59534.9765625, "learning_rate": 3.0938697318007665e-05, "loss": 1.7293, "step": 4480 }, { "epoch": 2.0521023765996342, "grad_norm": 133561.09375, "learning_rate": 3.089080459770115e-05, "loss": 1.7752, "step": 4490 }, { "epoch": 2.056672760511883, "grad_norm": 133462.234375, "learning_rate": 3.084291187739464e-05, "loss": 1.7729, "step": 4500 }, { "epoch": 2.0612431444241315, "grad_norm": 70087.765625, "learning_rate": 3.0795019157088126e-05, "loss": 1.7831, "step": 4510 }, { "epoch": 2.0658135283363803, "grad_norm": 141446.296875, "learning_rate": 3.0747126436781606e-05, "loss": 1.7614, "step": 4520 }, { "epoch": 2.0703839122486287, "grad_norm": 83022.75, "learning_rate": 3.069923371647509e-05, "loss": 1.5773, "step": 4530 }, { "epoch": 2.0749542961608776, "grad_norm": 81699.2578125, "learning_rate": 3.065134099616858e-05, "loss": 1.7204, "step": 4540 }, { "epoch": 2.079524680073126, "grad_norm": 73930.953125, "learning_rate": 3.060344827586207e-05, "loss": 1.781, "step": 4550 }, { "epoch": 2.084095063985375, "grad_norm": 99254.2890625, "learning_rate": 3.055555555555556e-05, "loss": 1.6642, "step": 4560 }, { "epoch": 2.088665447897623, "grad_norm": 84721.40625, "learning_rate": 3.0507662835249047e-05, "loss": 1.6594, "step": 4570 }, { "epoch": 2.093235831809872, "grad_norm": 78780.6015625, "learning_rate": 3.045977011494253e-05, "loss": 1.7436, "step": 4580 }, { "epoch": 2.0978062157221204, "grad_norm": 74259.4921875, "learning_rate": 3.0411877394636018e-05, "loss": 1.7986, "step": 4590 }, { "epoch": 2.1023765996343693, "grad_norm": 116283.8671875, "learning_rate": 3.0363984674329505e-05, "loss": 1.7248, "step": 4600 }, { "epoch": 2.106946983546618, "grad_norm": 96815.84375, "learning_rate": 3.031609195402299e-05, "loss": 1.763, "step": 4610 }, { "epoch": 2.1115173674588665, "grad_norm": 61044.22265625, "learning_rate": 3.0268199233716475e-05, "loss": 1.7655, "step": 4620 }, { "epoch": 2.1160877513711154, "grad_norm": 75176.796875, "learning_rate": 3.0220306513409962e-05, "loss": 1.7197, "step": 4630 }, { "epoch": 2.1206581352833638, "grad_norm": 100533.109375, "learning_rate": 3.017241379310345e-05, "loss": 1.6475, "step": 4640 }, { "epoch": 2.1252285191956126, "grad_norm": 79551.5234375, "learning_rate": 3.0124521072796936e-05, "loss": 1.6468, "step": 4650 }, { "epoch": 2.129798903107861, "grad_norm": 74119.8515625, "learning_rate": 3.0076628352490422e-05, "loss": 1.7115, "step": 4660 }, { "epoch": 2.13436928702011, "grad_norm": 62293.6953125, "learning_rate": 3.0028735632183906e-05, "loss": 1.7507, "step": 4670 }, { "epoch": 2.1389396709323583, "grad_norm": 66430.1953125, "learning_rate": 2.9980842911877393e-05, "loss": 1.6552, "step": 4680 }, { "epoch": 2.143510054844607, "grad_norm": 70040.5390625, "learning_rate": 2.9932950191570886e-05, "loss": 1.7662, "step": 4690 }, { "epoch": 2.1480804387568555, "grad_norm": 94635.6796875, "learning_rate": 2.988505747126437e-05, "loss": 1.7736, "step": 4700 }, { "epoch": 2.1526508226691043, "grad_norm": 79885.9453125, "learning_rate": 2.9837164750957857e-05, "loss": 1.7164, "step": 4710 }, { "epoch": 2.1572212065813527, "grad_norm": 81728.3046875, "learning_rate": 2.9789272030651344e-05, "loss": 1.7839, "step": 4720 }, { "epoch": 2.1617915904936016, "grad_norm": 95488.8984375, "learning_rate": 2.974137931034483e-05, "loss": 1.7674, "step": 4730 }, { "epoch": 2.16636197440585, "grad_norm": 109054.171875, "learning_rate": 2.9693486590038317e-05, "loss": 1.6966, "step": 4740 }, { "epoch": 2.170932358318099, "grad_norm": 79046.40625, "learning_rate": 2.96455938697318e-05, "loss": 1.6621, "step": 4750 }, { "epoch": 2.1755027422303472, "grad_norm": 102101.4765625, "learning_rate": 2.9597701149425288e-05, "loss": 1.7391, "step": 4760 }, { "epoch": 2.180073126142596, "grad_norm": 83587.875, "learning_rate": 2.9549808429118775e-05, "loss": 1.8232, "step": 4770 }, { "epoch": 2.1846435100548445, "grad_norm": 104632.4921875, "learning_rate": 2.950191570881226e-05, "loss": 1.6575, "step": 4780 }, { "epoch": 2.1892138939670933, "grad_norm": 72859.53125, "learning_rate": 2.945402298850575e-05, "loss": 1.7205, "step": 4790 }, { "epoch": 2.1937842778793417, "grad_norm": 92640.796875, "learning_rate": 2.9406130268199232e-05, "loss": 1.725, "step": 4800 }, { "epoch": 2.1983546617915906, "grad_norm": 246298.65625, "learning_rate": 2.935823754789272e-05, "loss": 1.7163, "step": 4810 }, { "epoch": 2.202925045703839, "grad_norm": 97907.3828125, "learning_rate": 2.9310344827586206e-05, "loss": 1.6475, "step": 4820 }, { "epoch": 2.207495429616088, "grad_norm": 85840.625, "learning_rate": 2.9262452107279696e-05, "loss": 1.7121, "step": 4830 }, { "epoch": 2.212065813528336, "grad_norm": 72967.8984375, "learning_rate": 2.9214559386973183e-05, "loss": 1.7438, "step": 4840 }, { "epoch": 2.216636197440585, "grad_norm": 75571.765625, "learning_rate": 2.916666666666667e-05, "loss": 1.6586, "step": 4850 }, { "epoch": 2.2212065813528334, "grad_norm": 65606.75, "learning_rate": 2.9118773946360157e-05, "loss": 1.7405, "step": 4860 }, { "epoch": 2.2257769652650823, "grad_norm": 81884.3828125, "learning_rate": 2.9070881226053644e-05, "loss": 1.7318, "step": 4870 }, { "epoch": 2.2303473491773307, "grad_norm": 66072.0703125, "learning_rate": 2.9022988505747127e-05, "loss": 1.7428, "step": 4880 }, { "epoch": 2.2349177330895795, "grad_norm": 154896.03125, "learning_rate": 2.8975095785440614e-05, "loss": 1.7548, "step": 4890 }, { "epoch": 2.2394881170018284, "grad_norm": 75736.953125, "learning_rate": 2.89272030651341e-05, "loss": 1.7215, "step": 4900 }, { "epoch": 2.2440585009140768, "grad_norm": 67909.71875, "learning_rate": 2.8879310344827588e-05, "loss": 1.6336, "step": 4910 }, { "epoch": 2.2486288848263256, "grad_norm": 89797.1953125, "learning_rate": 2.8831417624521075e-05, "loss": 1.7273, "step": 4920 }, { "epoch": 2.253199268738574, "grad_norm": 85214.125, "learning_rate": 2.8783524904214558e-05, "loss": 1.7144, "step": 4930 }, { "epoch": 2.257769652650823, "grad_norm": 76317.734375, "learning_rate": 2.8735632183908045e-05, "loss": 1.7784, "step": 4940 }, { "epoch": 2.2623400365630713, "grad_norm": 49163.828125, "learning_rate": 2.8687739463601532e-05, "loss": 1.7026, "step": 4950 }, { "epoch": 2.26691042047532, "grad_norm": 107379.171875, "learning_rate": 2.863984674329502e-05, "loss": 1.7803, "step": 4960 }, { "epoch": 2.2714808043875685, "grad_norm": 69726.2734375, "learning_rate": 2.859195402298851e-05, "loss": 1.7006, "step": 4970 }, { "epoch": 2.2760511882998173, "grad_norm": 68964.7265625, "learning_rate": 2.8544061302681996e-05, "loss": 1.7053, "step": 4980 }, { "epoch": 2.2806215722120657, "grad_norm": 67173.03125, "learning_rate": 2.8496168582375483e-05, "loss": 1.6936, "step": 4990 }, { "epoch": 2.2851919561243146, "grad_norm": 78151.328125, "learning_rate": 2.844827586206897e-05, "loss": 1.7376, "step": 5000 }, { "epoch": 2.289762340036563, "grad_norm": 147513.4375, "learning_rate": 2.8400383141762453e-05, "loss": 1.7627, "step": 5010 }, { "epoch": 2.294332723948812, "grad_norm": 104835.921875, "learning_rate": 2.835249042145594e-05, "loss": 1.6425, "step": 5020 }, { "epoch": 2.2989031078610602, "grad_norm": 120597.7109375, "learning_rate": 2.8304597701149427e-05, "loss": 1.7738, "step": 5030 }, { "epoch": 2.303473491773309, "grad_norm": 65482.203125, "learning_rate": 2.8256704980842914e-05, "loss": 1.6205, "step": 5040 }, { "epoch": 2.3080438756855575, "grad_norm": 71476.0078125, "learning_rate": 2.82088122605364e-05, "loss": 1.6699, "step": 5050 }, { "epoch": 2.3126142595978063, "grad_norm": 85771.3515625, "learning_rate": 2.8160919540229884e-05, "loss": 1.697, "step": 5060 }, { "epoch": 2.3171846435100547, "grad_norm": 63680.30859375, "learning_rate": 2.811302681992337e-05, "loss": 1.6754, "step": 5070 }, { "epoch": 2.3217550274223036, "grad_norm": 152300.34375, "learning_rate": 2.8065134099616858e-05, "loss": 1.6722, "step": 5080 }, { "epoch": 2.326325411334552, "grad_norm": 88300.703125, "learning_rate": 2.8017241379310345e-05, "loss": 1.7725, "step": 5090 }, { "epoch": 2.330895795246801, "grad_norm": 106965.4375, "learning_rate": 2.796934865900383e-05, "loss": 1.7734, "step": 5100 }, { "epoch": 2.335466179159049, "grad_norm": 118704.640625, "learning_rate": 2.7921455938697322e-05, "loss": 1.6477, "step": 5110 }, { "epoch": 2.340036563071298, "grad_norm": 78607.4453125, "learning_rate": 2.787356321839081e-05, "loss": 1.7386, "step": 5120 }, { "epoch": 2.3446069469835464, "grad_norm": 83952.1171875, "learning_rate": 2.7825670498084296e-05, "loss": 1.64, "step": 5130 }, { "epoch": 2.3491773308957953, "grad_norm": 107545.0859375, "learning_rate": 2.777777777777778e-05, "loss": 1.6633, "step": 5140 }, { "epoch": 2.353747714808044, "grad_norm": 72284.5, "learning_rate": 2.7729885057471266e-05, "loss": 1.7249, "step": 5150 }, { "epoch": 2.3583180987202925, "grad_norm": 89877.7109375, "learning_rate": 2.7681992337164753e-05, "loss": 1.6901, "step": 5160 }, { "epoch": 2.362888482632541, "grad_norm": 138945.6875, "learning_rate": 2.763409961685824e-05, "loss": 1.6468, "step": 5170 }, { "epoch": 2.3674588665447898, "grad_norm": 58679.375, "learning_rate": 2.7586206896551727e-05, "loss": 1.6838, "step": 5180 }, { "epoch": 2.3720292504570386, "grad_norm": 95501.8671875, "learning_rate": 2.753831417624521e-05, "loss": 1.682, "step": 5190 }, { "epoch": 2.376599634369287, "grad_norm": 76119.296875, "learning_rate": 2.7490421455938697e-05, "loss": 1.8395, "step": 5200 }, { "epoch": 2.3811700182815354, "grad_norm": 108761.65625, "learning_rate": 2.7442528735632184e-05, "loss": 1.7638, "step": 5210 }, { "epoch": 2.3857404021937842, "grad_norm": 99530.703125, "learning_rate": 2.739463601532567e-05, "loss": 1.6688, "step": 5220 }, { "epoch": 2.390310786106033, "grad_norm": 73215.6171875, "learning_rate": 2.7346743295019158e-05, "loss": 1.6266, "step": 5230 }, { "epoch": 2.3948811700182815, "grad_norm": 94147.75, "learning_rate": 2.7298850574712648e-05, "loss": 1.7344, "step": 5240 }, { "epoch": 2.3994515539305303, "grad_norm": 80156.09375, "learning_rate": 2.7250957854406135e-05, "loss": 1.7074, "step": 5250 }, { "epoch": 2.4040219378427787, "grad_norm": 54975.70703125, "learning_rate": 2.720306513409962e-05, "loss": 1.6535, "step": 5260 }, { "epoch": 2.4085923217550276, "grad_norm": 64294.23828125, "learning_rate": 2.7155172413793105e-05, "loss": 1.6997, "step": 5270 }, { "epoch": 2.413162705667276, "grad_norm": 83260.0, "learning_rate": 2.7107279693486592e-05, "loss": 1.7617, "step": 5280 }, { "epoch": 2.417733089579525, "grad_norm": 79186.6484375, "learning_rate": 2.705938697318008e-05, "loss": 1.6661, "step": 5290 }, { "epoch": 2.422303473491773, "grad_norm": 98957.21875, "learning_rate": 2.7011494252873566e-05, "loss": 1.6771, "step": 5300 }, { "epoch": 2.426873857404022, "grad_norm": 71378.125, "learning_rate": 2.6963601532567053e-05, "loss": 1.6975, "step": 5310 }, { "epoch": 2.4314442413162705, "grad_norm": 81879.71875, "learning_rate": 2.6915708812260536e-05, "loss": 1.774, "step": 5320 }, { "epoch": 2.4360146252285193, "grad_norm": 57695.0078125, "learning_rate": 2.6867816091954023e-05, "loss": 1.6981, "step": 5330 }, { "epoch": 2.4405850091407677, "grad_norm": 84480.6328125, "learning_rate": 2.681992337164751e-05, "loss": 1.7748, "step": 5340 }, { "epoch": 2.4451553930530165, "grad_norm": 71991.0859375, "learning_rate": 2.6772030651340997e-05, "loss": 1.6582, "step": 5350 }, { "epoch": 2.449725776965265, "grad_norm": 84038.8984375, "learning_rate": 2.672413793103448e-05, "loss": 1.7313, "step": 5360 }, { "epoch": 2.454296160877514, "grad_norm": 61137.41015625, "learning_rate": 2.6676245210727967e-05, "loss": 1.686, "step": 5370 }, { "epoch": 2.458866544789762, "grad_norm": 82829.7734375, "learning_rate": 2.662835249042146e-05, "loss": 1.6642, "step": 5380 }, { "epoch": 2.463436928702011, "grad_norm": 77371.140625, "learning_rate": 2.6580459770114948e-05, "loss": 1.7747, "step": 5390 }, { "epoch": 2.4680073126142594, "grad_norm": 109782.984375, "learning_rate": 2.653256704980843e-05, "loss": 1.7706, "step": 5400 }, { "epoch": 2.4725776965265083, "grad_norm": 99484.78125, "learning_rate": 2.6484674329501918e-05, "loss": 1.7255, "step": 5410 }, { "epoch": 2.4771480804387567, "grad_norm": 97897.8671875, "learning_rate": 2.6436781609195405e-05, "loss": 1.8301, "step": 5420 }, { "epoch": 2.4817184643510055, "grad_norm": 69454.625, "learning_rate": 2.6388888888888892e-05, "loss": 1.833, "step": 5430 }, { "epoch": 2.4862888482632544, "grad_norm": 51202.9765625, "learning_rate": 2.6340996168582375e-05, "loss": 1.8042, "step": 5440 }, { "epoch": 2.4908592321755028, "grad_norm": 62593.1015625, "learning_rate": 2.6293103448275862e-05, "loss": 1.7295, "step": 5450 }, { "epoch": 2.495429616087751, "grad_norm": 62147.26953125, "learning_rate": 2.624521072796935e-05, "loss": 1.7026, "step": 5460 }, { "epoch": 2.5, "grad_norm": 105682.3515625, "learning_rate": 2.6197318007662836e-05, "loss": 1.6866, "step": 5470 }, { "epoch": 2.504570383912249, "grad_norm": 133388.953125, "learning_rate": 2.6149425287356323e-05, "loss": 1.6257, "step": 5480 }, { "epoch": 2.5091407678244972, "grad_norm": 78931.4609375, "learning_rate": 2.6101532567049806e-05, "loss": 1.7367, "step": 5490 }, { "epoch": 2.5137111517367456, "grad_norm": 58830.1484375, "learning_rate": 2.6053639846743293e-05, "loss": 1.6935, "step": 5500 }, { "epoch": 2.5182815356489945, "grad_norm": 154903.03125, "learning_rate": 2.600574712643678e-05, "loss": 1.6548, "step": 5510 }, { "epoch": 2.5228519195612433, "grad_norm": 71029.5859375, "learning_rate": 2.595785440613027e-05, "loss": 1.7459, "step": 5520 }, { "epoch": 2.5274223034734917, "grad_norm": 76854.0, "learning_rate": 2.5909961685823757e-05, "loss": 1.7737, "step": 5530 }, { "epoch": 2.53199268738574, "grad_norm": 78467.3984375, "learning_rate": 2.5862068965517244e-05, "loss": 1.728, "step": 5540 }, { "epoch": 2.536563071297989, "grad_norm": 81775.109375, "learning_rate": 2.581417624521073e-05, "loss": 1.6481, "step": 5550 }, { "epoch": 2.541133455210238, "grad_norm": 59482.62890625, "learning_rate": 2.5766283524904218e-05, "loss": 1.7565, "step": 5560 }, { "epoch": 2.545703839122486, "grad_norm": 114247.8671875, "learning_rate": 2.57183908045977e-05, "loss": 1.6725, "step": 5570 }, { "epoch": 2.550274223034735, "grad_norm": 69261.0390625, "learning_rate": 2.5670498084291188e-05, "loss": 1.7034, "step": 5580 }, { "epoch": 2.5548446069469835, "grad_norm": 88227.703125, "learning_rate": 2.5622605363984675e-05, "loss": 1.6888, "step": 5590 }, { "epoch": 2.5594149908592323, "grad_norm": 123599.8828125, "learning_rate": 2.5574712643678162e-05, "loss": 1.6814, "step": 5600 }, { "epoch": 2.5639853747714807, "grad_norm": 61095.8984375, "learning_rate": 2.552681992337165e-05, "loss": 1.6875, "step": 5610 }, { "epoch": 2.5685557586837295, "grad_norm": 71797.8671875, "learning_rate": 2.5478927203065132e-05, "loss": 1.6945, "step": 5620 }, { "epoch": 2.573126142595978, "grad_norm": 70069.7890625, "learning_rate": 2.543103448275862e-05, "loss": 1.6055, "step": 5630 }, { "epoch": 2.577696526508227, "grad_norm": 90758.25, "learning_rate": 2.5383141762452106e-05, "loss": 1.7105, "step": 5640 }, { "epoch": 2.582266910420475, "grad_norm": 73838.0390625, "learning_rate": 2.5335249042145593e-05, "loss": 1.7238, "step": 5650 }, { "epoch": 2.586837294332724, "grad_norm": 169500.65625, "learning_rate": 2.5287356321839083e-05, "loss": 1.6286, "step": 5660 }, { "epoch": 2.5914076782449724, "grad_norm": 86502.6484375, "learning_rate": 2.523946360153257e-05, "loss": 1.7394, "step": 5670 }, { "epoch": 2.5959780621572213, "grad_norm": 77630.203125, "learning_rate": 2.5191570881226057e-05, "loss": 1.6857, "step": 5680 }, { "epoch": 2.60054844606947, "grad_norm": 78792.4921875, "learning_rate": 2.5143678160919544e-05, "loss": 1.7818, "step": 5690 }, { "epoch": 2.6051188299817185, "grad_norm": 61021.87890625, "learning_rate": 2.5095785440613027e-05, "loss": 1.6825, "step": 5700 }, { "epoch": 2.609689213893967, "grad_norm": 93845.828125, "learning_rate": 2.5047892720306514e-05, "loss": 1.7818, "step": 5710 }, { "epoch": 2.6142595978062158, "grad_norm": 77104.4296875, "learning_rate": 2.5e-05, "loss": 1.6891, "step": 5720 }, { "epoch": 2.6188299817184646, "grad_norm": 67157.0625, "learning_rate": 2.4952107279693488e-05, "loss": 1.7418, "step": 5730 }, { "epoch": 2.623400365630713, "grad_norm": 117816.0078125, "learning_rate": 2.4904214559386975e-05, "loss": 1.6851, "step": 5740 }, { "epoch": 2.6279707495429614, "grad_norm": 64429.0546875, "learning_rate": 2.485632183908046e-05, "loss": 1.7577, "step": 5750 }, { "epoch": 2.6325411334552102, "grad_norm": 99433.84375, "learning_rate": 2.480842911877395e-05, "loss": 1.6719, "step": 5760 }, { "epoch": 2.637111517367459, "grad_norm": 129014.9609375, "learning_rate": 2.4760536398467436e-05, "loss": 1.8557, "step": 5770 }, { "epoch": 2.6416819012797075, "grad_norm": 51642.76171875, "learning_rate": 2.4712643678160922e-05, "loss": 1.71, "step": 5780 }, { "epoch": 2.646252285191956, "grad_norm": 92177.1875, "learning_rate": 2.4664750957854406e-05, "loss": 1.6415, "step": 5790 }, { "epoch": 2.6508226691042047, "grad_norm": 83833.5546875, "learning_rate": 2.4616858237547893e-05, "loss": 1.6863, "step": 5800 }, { "epoch": 2.6553930530164536, "grad_norm": 83966.53125, "learning_rate": 2.456896551724138e-05, "loss": 1.7626, "step": 5810 }, { "epoch": 2.659963436928702, "grad_norm": 94047.453125, "learning_rate": 2.4521072796934867e-05, "loss": 1.7408, "step": 5820 }, { "epoch": 2.664533820840951, "grad_norm": 107394.71875, "learning_rate": 2.4473180076628353e-05, "loss": 1.7078, "step": 5830 }, { "epoch": 2.669104204753199, "grad_norm": 96418.6328125, "learning_rate": 2.442528735632184e-05, "loss": 1.7023, "step": 5840 }, { "epoch": 2.673674588665448, "grad_norm": 115618.6015625, "learning_rate": 2.4377394636015327e-05, "loss": 1.7811, "step": 5850 }, { "epoch": 2.6782449725776964, "grad_norm": 102043.5078125, "learning_rate": 2.4329501915708814e-05, "loss": 1.761, "step": 5860 }, { "epoch": 2.6828153564899453, "grad_norm": 85863.6953125, "learning_rate": 2.42816091954023e-05, "loss": 1.786, "step": 5870 }, { "epoch": 2.6873857404021937, "grad_norm": 105787.890625, "learning_rate": 2.4233716475095784e-05, "loss": 1.6736, "step": 5880 }, { "epoch": 2.6919561243144425, "grad_norm": 87654.65625, "learning_rate": 2.418582375478927e-05, "loss": 1.7352, "step": 5890 }, { "epoch": 2.696526508226691, "grad_norm": 65512.890625, "learning_rate": 2.413793103448276e-05, "loss": 1.6165, "step": 5900 }, { "epoch": 2.7010968921389398, "grad_norm": 96425.09375, "learning_rate": 2.409003831417625e-05, "loss": 1.6336, "step": 5910 }, { "epoch": 2.705667276051188, "grad_norm": 64857.203125, "learning_rate": 2.4042145593869732e-05, "loss": 1.7171, "step": 5920 }, { "epoch": 2.710237659963437, "grad_norm": 59102.8828125, "learning_rate": 2.399425287356322e-05, "loss": 1.7046, "step": 5930 }, { "epoch": 2.7148080438756854, "grad_norm": 89212.4140625, "learning_rate": 2.3946360153256706e-05, "loss": 1.6989, "step": 5940 }, { "epoch": 2.7193784277879343, "grad_norm": 75463.0859375, "learning_rate": 2.3898467432950193e-05, "loss": 1.7416, "step": 5950 }, { "epoch": 2.7239488117001827, "grad_norm": 107514.09375, "learning_rate": 2.385057471264368e-05, "loss": 1.7762, "step": 5960 }, { "epoch": 2.7285191956124315, "grad_norm": 97020.46875, "learning_rate": 2.3802681992337166e-05, "loss": 1.6877, "step": 5970 }, { "epoch": 2.7330895795246803, "grad_norm": 100322.8515625, "learning_rate": 2.3754789272030653e-05, "loss": 1.7148, "step": 5980 }, { "epoch": 2.7376599634369287, "grad_norm": 66746.328125, "learning_rate": 2.370689655172414e-05, "loss": 1.7645, "step": 5990 }, { "epoch": 2.742230347349177, "grad_norm": 50676.11328125, "learning_rate": 2.3659003831417627e-05, "loss": 1.7107, "step": 6000 }, { "epoch": 2.746800731261426, "grad_norm": 53923.9921875, "learning_rate": 2.361111111111111e-05, "loss": 1.7802, "step": 6010 }, { "epoch": 2.751371115173675, "grad_norm": 61331.36328125, "learning_rate": 2.3563218390804597e-05, "loss": 1.682, "step": 6020 }, { "epoch": 2.7559414990859232, "grad_norm": 92563.5078125, "learning_rate": 2.3515325670498088e-05, "loss": 1.8056, "step": 6030 }, { "epoch": 2.7605118829981716, "grad_norm": 103113.2578125, "learning_rate": 2.3467432950191575e-05, "loss": 1.6654, "step": 6040 }, { "epoch": 2.7650822669104205, "grad_norm": 83672.0859375, "learning_rate": 2.3419540229885058e-05, "loss": 1.6853, "step": 6050 }, { "epoch": 2.7696526508226693, "grad_norm": 66880.90625, "learning_rate": 2.3371647509578545e-05, "loss": 1.7165, "step": 6060 }, { "epoch": 2.7742230347349177, "grad_norm": 159394.40625, "learning_rate": 2.3323754789272032e-05, "loss": 1.6777, "step": 6070 }, { "epoch": 2.778793418647166, "grad_norm": 103960.3203125, "learning_rate": 2.327586206896552e-05, "loss": 1.7345, "step": 6080 }, { "epoch": 2.783363802559415, "grad_norm": 89361.9609375, "learning_rate": 2.3227969348659002e-05, "loss": 1.8153, "step": 6090 }, { "epoch": 2.787934186471664, "grad_norm": 48453.51953125, "learning_rate": 2.3180076628352492e-05, "loss": 1.7499, "step": 6100 }, { "epoch": 2.792504570383912, "grad_norm": 60024.06640625, "learning_rate": 2.313218390804598e-05, "loss": 1.6522, "step": 6110 }, { "epoch": 2.797074954296161, "grad_norm": 76540.078125, "learning_rate": 2.3084291187739466e-05, "loss": 1.7614, "step": 6120 }, { "epoch": 2.8016453382084094, "grad_norm": 127741.5390625, "learning_rate": 2.303639846743295e-05, "loss": 1.7328, "step": 6130 }, { "epoch": 2.8062157221206583, "grad_norm": 92570.3828125, "learning_rate": 2.2988505747126437e-05, "loss": 1.6602, "step": 6140 }, { "epoch": 2.8107861060329067, "grad_norm": 74026.5, "learning_rate": 2.2940613026819923e-05, "loss": 1.6413, "step": 6150 }, { "epoch": 2.8153564899451555, "grad_norm": 142319.03125, "learning_rate": 2.289272030651341e-05, "loss": 1.6652, "step": 6160 }, { "epoch": 2.819926873857404, "grad_norm": 66725.7578125, "learning_rate": 2.2844827586206897e-05, "loss": 1.7805, "step": 6170 }, { "epoch": 2.8244972577696528, "grad_norm": 204509.578125, "learning_rate": 2.2796934865900384e-05, "loss": 1.7006, "step": 6180 }, { "epoch": 2.829067641681901, "grad_norm": 81922.28125, "learning_rate": 2.274904214559387e-05, "loss": 1.7177, "step": 6190 }, { "epoch": 2.83363802559415, "grad_norm": 79576.234375, "learning_rate": 2.2701149425287358e-05, "loss": 1.7878, "step": 6200 }, { "epoch": 2.8382084095063984, "grad_norm": 94908.828125, "learning_rate": 2.2653256704980845e-05, "loss": 1.6381, "step": 6210 }, { "epoch": 2.8427787934186473, "grad_norm": 54743.4921875, "learning_rate": 2.2605363984674328e-05, "loss": 1.6953, "step": 6220 }, { "epoch": 2.8473491773308957, "grad_norm": 81304.8203125, "learning_rate": 2.2557471264367815e-05, "loss": 1.7417, "step": 6230 }, { "epoch": 2.8519195612431445, "grad_norm": 61965.26171875, "learning_rate": 2.2509578544061305e-05, "loss": 1.7189, "step": 6240 }, { "epoch": 2.856489945155393, "grad_norm": 74507.875, "learning_rate": 2.2461685823754792e-05, "loss": 1.6891, "step": 6250 }, { "epoch": 2.8610603290676417, "grad_norm": 83252.703125, "learning_rate": 2.2413793103448276e-05, "loss": 1.6622, "step": 6260 }, { "epoch": 2.8656307129798906, "grad_norm": 74911.6484375, "learning_rate": 2.2365900383141763e-05, "loss": 1.684, "step": 6270 }, { "epoch": 2.870201096892139, "grad_norm": 67471.203125, "learning_rate": 2.231800766283525e-05, "loss": 1.7436, "step": 6280 }, { "epoch": 2.8747714808043874, "grad_norm": 54812.26953125, "learning_rate": 2.2270114942528736e-05, "loss": 1.7817, "step": 6290 }, { "epoch": 2.8793418647166362, "grad_norm": 93858.140625, "learning_rate": 2.2222222222222223e-05, "loss": 1.721, "step": 6300 }, { "epoch": 2.883912248628885, "grad_norm": 64988.40234375, "learning_rate": 2.217432950191571e-05, "loss": 1.7022, "step": 6310 }, { "epoch": 2.8884826325411335, "grad_norm": 70428.0546875, "learning_rate": 2.2126436781609197e-05, "loss": 1.7059, "step": 6320 }, { "epoch": 2.893053016453382, "grad_norm": 82495.4453125, "learning_rate": 2.2078544061302684e-05, "loss": 1.8106, "step": 6330 }, { "epoch": 2.8976234003656307, "grad_norm": 81380.859375, "learning_rate": 2.203065134099617e-05, "loss": 1.6888, "step": 6340 }, { "epoch": 2.9021937842778796, "grad_norm": 79387.8125, "learning_rate": 2.1982758620689654e-05, "loss": 1.6739, "step": 6350 }, { "epoch": 2.906764168190128, "grad_norm": 139392.453125, "learning_rate": 2.193486590038314e-05, "loss": 1.6741, "step": 6360 }, { "epoch": 2.9113345521023763, "grad_norm": 87657.875, "learning_rate": 2.1886973180076628e-05, "loss": 1.7429, "step": 6370 }, { "epoch": 2.915904936014625, "grad_norm": 62421.78125, "learning_rate": 2.183908045977012e-05, "loss": 1.7148, "step": 6380 }, { "epoch": 2.920475319926874, "grad_norm": 126881.4375, "learning_rate": 2.1791187739463602e-05, "loss": 1.6232, "step": 6390 }, { "epoch": 2.9250457038391224, "grad_norm": 102633.1015625, "learning_rate": 2.174329501915709e-05, "loss": 1.7049, "step": 6400 }, { "epoch": 2.9296160877513713, "grad_norm": 102789.2265625, "learning_rate": 2.1695402298850576e-05, "loss": 1.7001, "step": 6410 }, { "epoch": 2.9341864716636197, "grad_norm": 79177.21875, "learning_rate": 2.1647509578544062e-05, "loss": 1.7525, "step": 6420 }, { "epoch": 2.9387568555758685, "grad_norm": 80986.46875, "learning_rate": 2.159961685823755e-05, "loss": 1.6458, "step": 6430 }, { "epoch": 2.943327239488117, "grad_norm": 97659.375, "learning_rate": 2.1551724137931033e-05, "loss": 1.7678, "step": 6440 }, { "epoch": 2.9478976234003658, "grad_norm": 97846.609375, "learning_rate": 2.1503831417624523e-05, "loss": 1.7128, "step": 6450 }, { "epoch": 2.952468007312614, "grad_norm": 74223.375, "learning_rate": 2.145593869731801e-05, "loss": 1.8434, "step": 6460 }, { "epoch": 2.957038391224863, "grad_norm": 59553.359375, "learning_rate": 2.1408045977011497e-05, "loss": 1.7717, "step": 6470 }, { "epoch": 2.9616087751371114, "grad_norm": 57012.078125, "learning_rate": 2.136015325670498e-05, "loss": 1.6865, "step": 6480 }, { "epoch": 2.9661791590493602, "grad_norm": 57963.9609375, "learning_rate": 2.1312260536398467e-05, "loss": 1.6966, "step": 6490 }, { "epoch": 2.9707495429616086, "grad_norm": 82150.6015625, "learning_rate": 2.1264367816091954e-05, "loss": 1.7219, "step": 6500 }, { "epoch": 2.9753199268738575, "grad_norm": 49790.92578125, "learning_rate": 2.1216475095785444e-05, "loss": 1.7301, "step": 6510 }, { "epoch": 2.979890310786106, "grad_norm": 77082.4140625, "learning_rate": 2.1168582375478928e-05, "loss": 1.6742, "step": 6520 }, { "epoch": 2.9844606946983547, "grad_norm": 123639.671875, "learning_rate": 2.1120689655172415e-05, "loss": 1.6256, "step": 6530 }, { "epoch": 2.989031078610603, "grad_norm": 67240.890625, "learning_rate": 2.10727969348659e-05, "loss": 1.7623, "step": 6540 }, { "epoch": 2.993601462522852, "grad_norm": 71455.7890625, "learning_rate": 2.102490421455939e-05, "loss": 1.6758, "step": 6550 }, { "epoch": 2.998171846435101, "grad_norm": 155724.125, "learning_rate": 2.0977011494252875e-05, "loss": 1.6401, "step": 6560 }, { "epoch": 3.0, "eval_loss": 1.695529818534851, "eval_runtime": 345.831, "eval_samples_per_second": 43.374, "eval_steps_per_second": 1.356, "step": 6564 }, { "epoch": 3.002742230347349, "grad_norm": 71896.8515625, "learning_rate": 2.092911877394636e-05, "loss": 1.7537, "step": 6570 }, { "epoch": 3.0073126142595976, "grad_norm": 61401.01171875, "learning_rate": 2.088122605363985e-05, "loss": 1.6979, "step": 6580 }, { "epoch": 3.0118829981718465, "grad_norm": 108287.78125, "learning_rate": 2.0833333333333336e-05, "loss": 1.7115, "step": 6590 }, { "epoch": 3.016453382084095, "grad_norm": 108027.125, "learning_rate": 2.0785440613026823e-05, "loss": 1.6887, "step": 6600 }, { "epoch": 3.0210237659963437, "grad_norm": 89622.265625, "learning_rate": 2.0737547892720306e-05, "loss": 1.6484, "step": 6610 }, { "epoch": 3.025594149908592, "grad_norm": 116170.4921875, "learning_rate": 2.0689655172413793e-05, "loss": 1.723, "step": 6620 }, { "epoch": 3.030164533820841, "grad_norm": 76070.9765625, "learning_rate": 2.064176245210728e-05, "loss": 1.6649, "step": 6630 }, { "epoch": 3.03473491773309, "grad_norm": 86966.0859375, "learning_rate": 2.0593869731800767e-05, "loss": 1.6613, "step": 6640 }, { "epoch": 3.039305301645338, "grad_norm": 101902.5, "learning_rate": 2.0545977011494254e-05, "loss": 1.6718, "step": 6650 }, { "epoch": 3.043875685557587, "grad_norm": 45214.640625, "learning_rate": 2.049808429118774e-05, "loss": 1.7025, "step": 6660 }, { "epoch": 3.0484460694698354, "grad_norm": 61494.140625, "learning_rate": 2.0450191570881228e-05, "loss": 1.7289, "step": 6670 }, { "epoch": 3.0530164533820843, "grad_norm": 136512.90625, "learning_rate": 2.0402298850574715e-05, "loss": 1.7768, "step": 6680 }, { "epoch": 3.0575868372943327, "grad_norm": 106828.390625, "learning_rate": 2.03544061302682e-05, "loss": 1.7891, "step": 6690 }, { "epoch": 3.0621572212065815, "grad_norm": 54863.59765625, "learning_rate": 2.0306513409961685e-05, "loss": 1.6614, "step": 6700 }, { "epoch": 3.06672760511883, "grad_norm": 95806.8984375, "learning_rate": 2.0258620689655172e-05, "loss": 1.6815, "step": 6710 }, { "epoch": 3.0712979890310788, "grad_norm": 70664.6875, "learning_rate": 2.0210727969348662e-05, "loss": 1.6989, "step": 6720 }, { "epoch": 3.075868372943327, "grad_norm": 118271.9921875, "learning_rate": 2.016283524904215e-05, "loss": 1.8092, "step": 6730 }, { "epoch": 3.080438756855576, "grad_norm": 82462.28125, "learning_rate": 2.0114942528735632e-05, "loss": 1.8018, "step": 6740 }, { "epoch": 3.0850091407678244, "grad_norm": 69368.5546875, "learning_rate": 2.006704980842912e-05, "loss": 1.6884, "step": 6750 }, { "epoch": 3.0895795246800732, "grad_norm": 84815.4609375, "learning_rate": 2.0019157088122606e-05, "loss": 1.695, "step": 6760 }, { "epoch": 3.0941499085923216, "grad_norm": 117484.3125, "learning_rate": 1.9971264367816093e-05, "loss": 1.799, "step": 6770 }, { "epoch": 3.0987202925045705, "grad_norm": 74344.625, "learning_rate": 1.992337164750958e-05, "loss": 1.7173, "step": 6780 }, { "epoch": 3.103290676416819, "grad_norm": 61023.3359375, "learning_rate": 1.9875478927203067e-05, "loss": 1.7335, "step": 6790 }, { "epoch": 3.1078610603290677, "grad_norm": 80261.34375, "learning_rate": 1.9827586206896554e-05, "loss": 1.6618, "step": 6800 }, { "epoch": 3.112431444241316, "grad_norm": 44226.015625, "learning_rate": 1.977969348659004e-05, "loss": 1.6994, "step": 6810 }, { "epoch": 3.117001828153565, "grad_norm": 62552.55078125, "learning_rate": 1.9731800766283527e-05, "loss": 1.6826, "step": 6820 }, { "epoch": 3.1215722120658134, "grad_norm": 126776.40625, "learning_rate": 1.968390804597701e-05, "loss": 1.6274, "step": 6830 }, { "epoch": 3.126142595978062, "grad_norm": 71035.0234375, "learning_rate": 1.9636015325670498e-05, "loss": 1.8007, "step": 6840 }, { "epoch": 3.1307129798903106, "grad_norm": 78976.2265625, "learning_rate": 1.9588122605363985e-05, "loss": 1.727, "step": 6850 }, { "epoch": 3.1352833638025595, "grad_norm": 83373.7265625, "learning_rate": 1.9540229885057475e-05, "loss": 1.7554, "step": 6860 }, { "epoch": 3.139853747714808, "grad_norm": 165484.453125, "learning_rate": 1.949233716475096e-05, "loss": 1.621, "step": 6870 }, { "epoch": 3.1444241316270567, "grad_norm": 62350.18359375, "learning_rate": 1.9444444444444445e-05, "loss": 1.8118, "step": 6880 }, { "epoch": 3.148994515539305, "grad_norm": 64155.27734375, "learning_rate": 1.9396551724137932e-05, "loss": 1.7245, "step": 6890 }, { "epoch": 3.153564899451554, "grad_norm": 69873.4375, "learning_rate": 1.934865900383142e-05, "loss": 1.7768, "step": 6900 }, { "epoch": 3.1581352833638023, "grad_norm": 122451.703125, "learning_rate": 1.9300766283524903e-05, "loss": 1.7436, "step": 6910 }, { "epoch": 3.162705667276051, "grad_norm": 50876.828125, "learning_rate": 1.925287356321839e-05, "loss": 1.7615, "step": 6920 }, { "epoch": 3.1672760511883, "grad_norm": 68587.875, "learning_rate": 1.920498084291188e-05, "loss": 1.6389, "step": 6930 }, { "epoch": 3.1718464351005484, "grad_norm": 111575.640625, "learning_rate": 1.9157088122605367e-05, "loss": 1.7055, "step": 6940 }, { "epoch": 3.1764168190127973, "grad_norm": 68116.921875, "learning_rate": 1.910919540229885e-05, "loss": 1.7635, "step": 6950 }, { "epoch": 3.1809872029250457, "grad_norm": 75995.734375, "learning_rate": 1.9061302681992337e-05, "loss": 1.7208, "step": 6960 }, { "epoch": 3.1855575868372945, "grad_norm": 97217.1796875, "learning_rate": 1.9013409961685824e-05, "loss": 1.6942, "step": 6970 }, { "epoch": 3.190127970749543, "grad_norm": 125494.984375, "learning_rate": 1.896551724137931e-05, "loss": 1.7897, "step": 6980 }, { "epoch": 3.1946983546617918, "grad_norm": 102539.3046875, "learning_rate": 1.8917624521072798e-05, "loss": 1.7186, "step": 6990 }, { "epoch": 3.19926873857404, "grad_norm": 92514.1640625, "learning_rate": 1.8869731800766285e-05, "loss": 1.7805, "step": 7000 }, { "epoch": 3.203839122486289, "grad_norm": 86951.125, "learning_rate": 1.882183908045977e-05, "loss": 1.738, "step": 7010 }, { "epoch": 3.2084095063985374, "grad_norm": 99123.890625, "learning_rate": 1.8773946360153258e-05, "loss": 1.6605, "step": 7020 }, { "epoch": 3.2129798903107862, "grad_norm": 61411.390625, "learning_rate": 1.8726053639846745e-05, "loss": 1.6291, "step": 7030 }, { "epoch": 3.2175502742230346, "grad_norm": 69628.1953125, "learning_rate": 1.867816091954023e-05, "loss": 1.7693, "step": 7040 }, { "epoch": 3.2221206581352835, "grad_norm": 74736.0390625, "learning_rate": 1.8630268199233716e-05, "loss": 1.6428, "step": 7050 }, { "epoch": 3.226691042047532, "grad_norm": 93917.203125, "learning_rate": 1.8582375478927206e-05, "loss": 1.7189, "step": 7060 }, { "epoch": 3.2312614259597807, "grad_norm": 80656.5546875, "learning_rate": 1.8534482758620693e-05, "loss": 1.7485, "step": 7070 }, { "epoch": 3.235831809872029, "grad_norm": 72125.5546875, "learning_rate": 1.8486590038314176e-05, "loss": 1.644, "step": 7080 }, { "epoch": 3.240402193784278, "grad_norm": 86443.671875, "learning_rate": 1.8438697318007663e-05, "loss": 1.6747, "step": 7090 }, { "epoch": 3.2449725776965264, "grad_norm": 93107.2734375, "learning_rate": 1.839080459770115e-05, "loss": 1.705, "step": 7100 }, { "epoch": 3.249542961608775, "grad_norm": 68446.5078125, "learning_rate": 1.8342911877394637e-05, "loss": 1.7647, "step": 7110 }, { "epoch": 3.2541133455210236, "grad_norm": 91024.3125, "learning_rate": 1.8295019157088124e-05, "loss": 1.7323, "step": 7120 }, { "epoch": 3.2586837294332724, "grad_norm": 63286.3828125, "learning_rate": 1.824712643678161e-05, "loss": 1.7359, "step": 7130 }, { "epoch": 3.263254113345521, "grad_norm": 94550.0234375, "learning_rate": 1.8199233716475097e-05, "loss": 1.7065, "step": 7140 }, { "epoch": 3.2678244972577697, "grad_norm": 66774.734375, "learning_rate": 1.8151340996168584e-05, "loss": 1.6688, "step": 7150 }, { "epoch": 3.272394881170018, "grad_norm": 52998.1484375, "learning_rate": 1.810344827586207e-05, "loss": 1.7237, "step": 7160 }, { "epoch": 3.276965265082267, "grad_norm": 70673.5078125, "learning_rate": 1.8055555555555555e-05, "loss": 1.7609, "step": 7170 }, { "epoch": 3.2815356489945158, "grad_norm": 109352.7421875, "learning_rate": 1.800766283524904e-05, "loss": 1.6908, "step": 7180 }, { "epoch": 3.286106032906764, "grad_norm": 66609.03125, "learning_rate": 1.795977011494253e-05, "loss": 1.7241, "step": 7190 }, { "epoch": 3.2906764168190126, "grad_norm": 74225.8984375, "learning_rate": 1.791187739463602e-05, "loss": 1.7827, "step": 7200 }, { "epoch": 3.2952468007312614, "grad_norm": 116946.515625, "learning_rate": 1.7863984674329502e-05, "loss": 1.6128, "step": 7210 }, { "epoch": 3.2998171846435103, "grad_norm": 76768.5234375, "learning_rate": 1.781609195402299e-05, "loss": 1.7406, "step": 7220 }, { "epoch": 3.3043875685557587, "grad_norm": 107767.0625, "learning_rate": 1.7768199233716476e-05, "loss": 1.6913, "step": 7230 }, { "epoch": 3.3089579524680075, "grad_norm": 76932.5703125, "learning_rate": 1.7720306513409963e-05, "loss": 1.751, "step": 7240 }, { "epoch": 3.313528336380256, "grad_norm": 132700.34375, "learning_rate": 1.767241379310345e-05, "loss": 1.7305, "step": 7250 }, { "epoch": 3.3180987202925047, "grad_norm": 148178.984375, "learning_rate": 1.7624521072796933e-05, "loss": 1.6116, "step": 7260 }, { "epoch": 3.322669104204753, "grad_norm": 84747.203125, "learning_rate": 1.7576628352490424e-05, "loss": 1.7728, "step": 7270 }, { "epoch": 3.327239488117002, "grad_norm": 88323.5078125, "learning_rate": 1.752873563218391e-05, "loss": 1.6676, "step": 7280 }, { "epoch": 3.3318098720292504, "grad_norm": 55836.046875, "learning_rate": 1.7480842911877397e-05, "loss": 1.7439, "step": 7290 }, { "epoch": 3.3363802559414992, "grad_norm": 107173.328125, "learning_rate": 1.743295019157088e-05, "loss": 1.746, "step": 7300 }, { "epoch": 3.3409506398537476, "grad_norm": 63958.0078125, "learning_rate": 1.7385057471264368e-05, "loss": 1.6936, "step": 7310 }, { "epoch": 3.3455210237659965, "grad_norm": 100361.1484375, "learning_rate": 1.7337164750957855e-05, "loss": 1.6349, "step": 7320 }, { "epoch": 3.350091407678245, "grad_norm": 64250.95703125, "learning_rate": 1.728927203065134e-05, "loss": 1.715, "step": 7330 }, { "epoch": 3.3546617915904937, "grad_norm": 66948.6953125, "learning_rate": 1.7241379310344828e-05, "loss": 1.6291, "step": 7340 }, { "epoch": 3.359232175502742, "grad_norm": 83300.3125, "learning_rate": 1.7193486590038315e-05, "loss": 1.6724, "step": 7350 }, { "epoch": 3.363802559414991, "grad_norm": 88165.8125, "learning_rate": 1.7145593869731802e-05, "loss": 1.7536, "step": 7360 }, { "epoch": 3.3683729433272394, "grad_norm": 67886.234375, "learning_rate": 1.709770114942529e-05, "loss": 1.6798, "step": 7370 }, { "epoch": 3.372943327239488, "grad_norm": 64415.46484375, "learning_rate": 1.7049808429118776e-05, "loss": 1.7546, "step": 7380 }, { "epoch": 3.3775137111517366, "grad_norm": 75445.15625, "learning_rate": 1.700191570881226e-05, "loss": 1.6932, "step": 7390 }, { "epoch": 3.3820840950639854, "grad_norm": 122763.609375, "learning_rate": 1.6954022988505746e-05, "loss": 1.7015, "step": 7400 }, { "epoch": 3.386654478976234, "grad_norm": 113570.5546875, "learning_rate": 1.6906130268199236e-05, "loss": 1.7188, "step": 7410 }, { "epoch": 3.3912248628884827, "grad_norm": 103909.40625, "learning_rate": 1.6858237547892723e-05, "loss": 1.6516, "step": 7420 }, { "epoch": 3.395795246800731, "grad_norm": 72607.9375, "learning_rate": 1.6810344827586207e-05, "loss": 1.704, "step": 7430 }, { "epoch": 3.40036563071298, "grad_norm": 154061.578125, "learning_rate": 1.6762452107279694e-05, "loss": 1.769, "step": 7440 }, { "epoch": 3.4049360146252283, "grad_norm": 122622.734375, "learning_rate": 1.671455938697318e-05, "loss": 1.754, "step": 7450 }, { "epoch": 3.409506398537477, "grad_norm": 70141.734375, "learning_rate": 1.6666666666666667e-05, "loss": 1.7117, "step": 7460 }, { "epoch": 3.414076782449726, "grad_norm": 82556.859375, "learning_rate": 1.6618773946360154e-05, "loss": 1.7317, "step": 7470 }, { "epoch": 3.4186471663619744, "grad_norm": 83208.140625, "learning_rate": 1.657088122605364e-05, "loss": 1.7346, "step": 7480 }, { "epoch": 3.423217550274223, "grad_norm": 102812.6171875, "learning_rate": 1.6522988505747128e-05, "loss": 1.7444, "step": 7490 }, { "epoch": 3.4277879341864717, "grad_norm": 90799.8359375, "learning_rate": 1.6475095785440615e-05, "loss": 1.7343, "step": 7500 }, { "epoch": 3.4323583180987205, "grad_norm": 91094.5390625, "learning_rate": 1.6427203065134102e-05, "loss": 1.7624, "step": 7510 }, { "epoch": 3.436928702010969, "grad_norm": 69841.4296875, "learning_rate": 1.6379310344827585e-05, "loss": 1.7264, "step": 7520 }, { "epoch": 3.4414990859232177, "grad_norm": 122445.3515625, "learning_rate": 1.6331417624521072e-05, "loss": 1.8214, "step": 7530 }, { "epoch": 3.446069469835466, "grad_norm": 85346.1875, "learning_rate": 1.628352490421456e-05, "loss": 1.7169, "step": 7540 }, { "epoch": 3.450639853747715, "grad_norm": 46006.20703125, "learning_rate": 1.623563218390805e-05, "loss": 1.6682, "step": 7550 }, { "epoch": 3.4552102376599634, "grad_norm": 61423.13671875, "learning_rate": 1.6187739463601533e-05, "loss": 1.7414, "step": 7560 }, { "epoch": 3.4597806215722122, "grad_norm": 73209.5625, "learning_rate": 1.613984674329502e-05, "loss": 1.699, "step": 7570 }, { "epoch": 3.4643510054844606, "grad_norm": 81988.2265625, "learning_rate": 1.6091954022988507e-05, "loss": 1.6394, "step": 7580 }, { "epoch": 3.4689213893967095, "grad_norm": 128754.4140625, "learning_rate": 1.6044061302681994e-05, "loss": 1.7366, "step": 7590 }, { "epoch": 3.473491773308958, "grad_norm": 92045.3828125, "learning_rate": 1.5996168582375477e-05, "loss": 1.7727, "step": 7600 }, { "epoch": 3.4780621572212067, "grad_norm": 144295.390625, "learning_rate": 1.5948275862068967e-05, "loss": 1.6927, "step": 7610 }, { "epoch": 3.482632541133455, "grad_norm": 54716.375, "learning_rate": 1.5900383141762454e-05, "loss": 1.7297, "step": 7620 }, { "epoch": 3.487202925045704, "grad_norm": 80850.1328125, "learning_rate": 1.585249042145594e-05, "loss": 1.639, "step": 7630 }, { "epoch": 3.4917733089579523, "grad_norm": 114330.296875, "learning_rate": 1.5804597701149425e-05, "loss": 1.7673, "step": 7640 }, { "epoch": 3.496343692870201, "grad_norm": 58934.4921875, "learning_rate": 1.575670498084291e-05, "loss": 1.6763, "step": 7650 }, { "epoch": 3.5009140767824496, "grad_norm": 123695.609375, "learning_rate": 1.5708812260536398e-05, "loss": 1.7135, "step": 7660 }, { "epoch": 3.5054844606946984, "grad_norm": 83289.6640625, "learning_rate": 1.5660919540229885e-05, "loss": 1.688, "step": 7670 }, { "epoch": 3.510054844606947, "grad_norm": 100226.015625, "learning_rate": 1.5613026819923372e-05, "loss": 1.7304, "step": 7680 }, { "epoch": 3.5146252285191957, "grad_norm": 88909.984375, "learning_rate": 1.556513409961686e-05, "loss": 1.6723, "step": 7690 }, { "epoch": 3.519195612431444, "grad_norm": 66940.3515625, "learning_rate": 1.5517241379310346e-05, "loss": 1.6159, "step": 7700 }, { "epoch": 3.523765996343693, "grad_norm": 88044.171875, "learning_rate": 1.5469348659003833e-05, "loss": 1.7853, "step": 7710 }, { "epoch": 3.5283363802559418, "grad_norm": 85045.421875, "learning_rate": 1.542145593869732e-05, "loss": 1.7666, "step": 7720 }, { "epoch": 3.53290676416819, "grad_norm": 60147.796875, "learning_rate": 1.5373563218390803e-05, "loss": 1.6683, "step": 7730 }, { "epoch": 3.5374771480804386, "grad_norm": 82411.9609375, "learning_rate": 1.532567049808429e-05, "loss": 1.7323, "step": 7740 }, { "epoch": 3.5420475319926874, "grad_norm": 66054.59375, "learning_rate": 1.527777777777778e-05, "loss": 1.6714, "step": 7750 }, { "epoch": 3.5466179159049362, "grad_norm": 72301.625, "learning_rate": 1.5229885057471265e-05, "loss": 1.6496, "step": 7760 }, { "epoch": 3.5511882998171846, "grad_norm": 104870.84375, "learning_rate": 1.5181992337164752e-05, "loss": 1.729, "step": 7770 }, { "epoch": 3.555758683729433, "grad_norm": 155170.828125, "learning_rate": 1.5134099616858237e-05, "loss": 1.6892, "step": 7780 }, { "epoch": 3.560329067641682, "grad_norm": 103218.2578125, "learning_rate": 1.5086206896551724e-05, "loss": 1.68, "step": 7790 }, { "epoch": 3.5648994515539307, "grad_norm": 77247.625, "learning_rate": 1.5038314176245211e-05, "loss": 1.8163, "step": 7800 }, { "epoch": 3.569469835466179, "grad_norm": 62777.0859375, "learning_rate": 1.4990421455938696e-05, "loss": 1.7311, "step": 7810 }, { "epoch": 3.5740402193784275, "grad_norm": 79059.1328125, "learning_rate": 1.4942528735632185e-05, "loss": 1.714, "step": 7820 }, { "epoch": 3.5786106032906764, "grad_norm": 75726.578125, "learning_rate": 1.4894636015325672e-05, "loss": 1.724, "step": 7830 }, { "epoch": 3.583180987202925, "grad_norm": 62909.81640625, "learning_rate": 1.4846743295019159e-05, "loss": 1.6383, "step": 7840 }, { "epoch": 3.5877513711151736, "grad_norm": 80657.5234375, "learning_rate": 1.4798850574712644e-05, "loss": 1.7269, "step": 7850 }, { "epoch": 3.5923217550274225, "grad_norm": 73948.4375, "learning_rate": 1.475095785440613e-05, "loss": 1.6603, "step": 7860 }, { "epoch": 3.596892138939671, "grad_norm": 88141.875, "learning_rate": 1.4703065134099616e-05, "loss": 1.6842, "step": 7870 }, { "epoch": 3.6014625228519197, "grad_norm": 78391.890625, "learning_rate": 1.4655172413793103e-05, "loss": 1.6908, "step": 7880 }, { "epoch": 3.606032906764168, "grad_norm": 65879.515625, "learning_rate": 1.4607279693486591e-05, "loss": 1.7438, "step": 7890 }, { "epoch": 3.610603290676417, "grad_norm": 137562.15625, "learning_rate": 1.4559386973180078e-05, "loss": 1.704, "step": 7900 }, { "epoch": 3.6151736745886653, "grad_norm": 99103.4375, "learning_rate": 1.4511494252873564e-05, "loss": 1.7265, "step": 7910 }, { "epoch": 3.619744058500914, "grad_norm": 67414.4609375, "learning_rate": 1.446360153256705e-05, "loss": 1.7134, "step": 7920 }, { "epoch": 3.6243144424131626, "grad_norm": 109987.7265625, "learning_rate": 1.4415708812260537e-05, "loss": 1.6406, "step": 7930 }, { "epoch": 3.6288848263254114, "grad_norm": 143015.703125, "learning_rate": 1.4367816091954022e-05, "loss": 1.7694, "step": 7940 }, { "epoch": 3.63345521023766, "grad_norm": 87812.765625, "learning_rate": 1.431992337164751e-05, "loss": 1.7357, "step": 7950 }, { "epoch": 3.6380255941499087, "grad_norm": 84844.578125, "learning_rate": 1.4272030651340998e-05, "loss": 1.7893, "step": 7960 }, { "epoch": 3.642595978062157, "grad_norm": 72731.703125, "learning_rate": 1.4224137931034485e-05, "loss": 1.6586, "step": 7970 }, { "epoch": 3.647166361974406, "grad_norm": 81956.890625, "learning_rate": 1.417624521072797e-05, "loss": 1.7693, "step": 7980 }, { "epoch": 3.6517367458866543, "grad_norm": 69217.53125, "learning_rate": 1.4128352490421457e-05, "loss": 1.7928, "step": 7990 }, { "epoch": 3.656307129798903, "grad_norm": 54634.5703125, "learning_rate": 1.4080459770114942e-05, "loss": 1.7918, "step": 8000 }, { "epoch": 3.660877513711152, "grad_norm": 63817.8359375, "learning_rate": 1.4032567049808429e-05, "loss": 1.6763, "step": 8010 }, { "epoch": 3.6654478976234004, "grad_norm": 118554.0859375, "learning_rate": 1.3984674329501916e-05, "loss": 1.7032, "step": 8020 }, { "epoch": 3.670018281535649, "grad_norm": 96849.4453125, "learning_rate": 1.3936781609195404e-05, "loss": 1.6672, "step": 8030 }, { "epoch": 3.6745886654478976, "grad_norm": 138688.09375, "learning_rate": 1.388888888888889e-05, "loss": 1.6499, "step": 8040 }, { "epoch": 3.6791590493601465, "grad_norm": 71032.484375, "learning_rate": 1.3840996168582376e-05, "loss": 1.7859, "step": 8050 }, { "epoch": 3.683729433272395, "grad_norm": 105990.21875, "learning_rate": 1.3793103448275863e-05, "loss": 1.6542, "step": 8060 }, { "epoch": 3.6882998171846433, "grad_norm": 119098.0859375, "learning_rate": 1.3745210727969348e-05, "loss": 1.7186, "step": 8070 }, { "epoch": 3.692870201096892, "grad_norm": 61243.96484375, "learning_rate": 1.3697318007662835e-05, "loss": 1.7328, "step": 8080 }, { "epoch": 3.697440585009141, "grad_norm": 78637.296875, "learning_rate": 1.3649425287356324e-05, "loss": 1.6125, "step": 8090 }, { "epoch": 3.7020109689213894, "grad_norm": 88676.828125, "learning_rate": 1.360153256704981e-05, "loss": 1.6784, "step": 8100 }, { "epoch": 3.7065813528336378, "grad_norm": 80246.65625, "learning_rate": 1.3553639846743296e-05, "loss": 1.6786, "step": 8110 }, { "epoch": 3.7111517367458866, "grad_norm": 79097.7734375, "learning_rate": 1.3505747126436783e-05, "loss": 1.6875, "step": 8120 }, { "epoch": 3.7157221206581355, "grad_norm": 75883.453125, "learning_rate": 1.3457854406130268e-05, "loss": 1.6788, "step": 8130 }, { "epoch": 3.720292504570384, "grad_norm": 87841.7734375, "learning_rate": 1.3409961685823755e-05, "loss": 1.7938, "step": 8140 }, { "epoch": 3.7248628884826327, "grad_norm": 60471.46875, "learning_rate": 1.336206896551724e-05, "loss": 1.7017, "step": 8150 }, { "epoch": 3.729433272394881, "grad_norm": 117315.484375, "learning_rate": 1.331417624521073e-05, "loss": 1.652, "step": 8160 }, { "epoch": 3.73400365630713, "grad_norm": 81507.8984375, "learning_rate": 1.3266283524904216e-05, "loss": 1.7639, "step": 8170 }, { "epoch": 3.7385740402193783, "grad_norm": 110054.6328125, "learning_rate": 1.3218390804597702e-05, "loss": 1.7429, "step": 8180 }, { "epoch": 3.743144424131627, "grad_norm": 65638.3828125, "learning_rate": 1.3170498084291188e-05, "loss": 1.7599, "step": 8190 }, { "epoch": 3.7477148080438756, "grad_norm": 116608.078125, "learning_rate": 1.3122605363984675e-05, "loss": 1.7329, "step": 8200 }, { "epoch": 3.7522851919561244, "grad_norm": 87637.5078125, "learning_rate": 1.3074712643678161e-05, "loss": 1.7624, "step": 8210 }, { "epoch": 3.756855575868373, "grad_norm": 94494.8125, "learning_rate": 1.3026819923371647e-05, "loss": 1.7554, "step": 8220 }, { "epoch": 3.7614259597806217, "grad_norm": 69636.9296875, "learning_rate": 1.2978927203065135e-05, "loss": 1.758, "step": 8230 }, { "epoch": 3.76599634369287, "grad_norm": 73185.421875, "learning_rate": 1.2931034482758622e-05, "loss": 1.7235, "step": 8240 }, { "epoch": 3.770566727605119, "grad_norm": 94298.2265625, "learning_rate": 1.2883141762452109e-05, "loss": 1.6726, "step": 8250 }, { "epoch": 3.7751371115173673, "grad_norm": 99814.8671875, "learning_rate": 1.2835249042145594e-05, "loss": 1.6912, "step": 8260 }, { "epoch": 3.779707495429616, "grad_norm": 68422.03125, "learning_rate": 1.2787356321839081e-05, "loss": 1.7406, "step": 8270 }, { "epoch": 3.7842778793418645, "grad_norm": 82088.296875, "learning_rate": 1.2739463601532566e-05, "loss": 1.7526, "step": 8280 }, { "epoch": 3.7888482632541134, "grad_norm": 77173.703125, "learning_rate": 1.2691570881226053e-05, "loss": 1.7784, "step": 8290 }, { "epoch": 3.7934186471663622, "grad_norm": 60719.59375, "learning_rate": 1.2643678160919542e-05, "loss": 1.7124, "step": 8300 }, { "epoch": 3.7979890310786106, "grad_norm": 71459.625, "learning_rate": 1.2595785440613029e-05, "loss": 1.6099, "step": 8310 }, { "epoch": 3.802559414990859, "grad_norm": 57145.5546875, "learning_rate": 1.2547892720306514e-05, "loss": 1.7385, "step": 8320 }, { "epoch": 3.807129798903108, "grad_norm": 72725.9296875, "learning_rate": 1.25e-05, "loss": 1.7214, "step": 8330 }, { "epoch": 3.8117001828153567, "grad_norm": 135925.65625, "learning_rate": 1.2452107279693487e-05, "loss": 1.719, "step": 8340 }, { "epoch": 3.816270566727605, "grad_norm": 48055.78515625, "learning_rate": 1.2404214559386974e-05, "loss": 1.7011, "step": 8350 }, { "epoch": 3.8208409506398535, "grad_norm": 120826.3359375, "learning_rate": 1.2356321839080461e-05, "loss": 1.6556, "step": 8360 }, { "epoch": 3.8254113345521024, "grad_norm": 98967.4453125, "learning_rate": 1.2308429118773946e-05, "loss": 1.6559, "step": 8370 }, { "epoch": 3.829981718464351, "grad_norm": 67471.8125, "learning_rate": 1.2260536398467433e-05, "loss": 1.6941, "step": 8380 }, { "epoch": 3.8345521023765996, "grad_norm": 73149.8359375, "learning_rate": 1.221264367816092e-05, "loss": 1.7197, "step": 8390 }, { "epoch": 3.839122486288848, "grad_norm": 59362.51953125, "learning_rate": 1.2164750957854407e-05, "loss": 1.7067, "step": 8400 }, { "epoch": 3.843692870201097, "grad_norm": 103423.5859375, "learning_rate": 1.2116858237547892e-05, "loss": 1.5936, "step": 8410 }, { "epoch": 3.8482632541133457, "grad_norm": 68154.90625, "learning_rate": 1.206896551724138e-05, "loss": 1.7166, "step": 8420 }, { "epoch": 3.852833638025594, "grad_norm": 81413.09375, "learning_rate": 1.2021072796934866e-05, "loss": 1.7183, "step": 8430 }, { "epoch": 3.857404021937843, "grad_norm": 67458.328125, "learning_rate": 1.1973180076628353e-05, "loss": 1.7063, "step": 8440 }, { "epoch": 3.8619744058500913, "grad_norm": 81477.78125, "learning_rate": 1.192528735632184e-05, "loss": 1.6622, "step": 8450 }, { "epoch": 3.86654478976234, "grad_norm": 94965.6953125, "learning_rate": 1.1877394636015327e-05, "loss": 1.6777, "step": 8460 }, { "epoch": 3.8711151736745886, "grad_norm": 64403.4375, "learning_rate": 1.1829501915708814e-05, "loss": 1.7471, "step": 8470 }, { "epoch": 3.8756855575868374, "grad_norm": 72309.5859375, "learning_rate": 1.1781609195402299e-05, "loss": 1.7836, "step": 8480 }, { "epoch": 3.880255941499086, "grad_norm": 80551.765625, "learning_rate": 1.1733716475095787e-05, "loss": 1.6478, "step": 8490 }, { "epoch": 3.8848263254113347, "grad_norm": 86743.6015625, "learning_rate": 1.1685823754789272e-05, "loss": 1.6723, "step": 8500 }, { "epoch": 3.889396709323583, "grad_norm": 60500.5, "learning_rate": 1.163793103448276e-05, "loss": 1.779, "step": 8510 }, { "epoch": 3.893967093235832, "grad_norm": 81024.5703125, "learning_rate": 1.1590038314176246e-05, "loss": 1.5701, "step": 8520 }, { "epoch": 3.8985374771480803, "grad_norm": 80383.0234375, "learning_rate": 1.1542145593869733e-05, "loss": 1.7806, "step": 8530 }, { "epoch": 3.903107861060329, "grad_norm": 76206.0, "learning_rate": 1.1494252873563218e-05, "loss": 1.6846, "step": 8540 }, { "epoch": 3.9076782449725775, "grad_norm": 105761.515625, "learning_rate": 1.1446360153256705e-05, "loss": 1.7613, "step": 8550 }, { "epoch": 3.9122486288848264, "grad_norm": 76597.375, "learning_rate": 1.1398467432950192e-05, "loss": 1.6565, "step": 8560 }, { "epoch": 3.916819012797075, "grad_norm": 73221.8984375, "learning_rate": 1.1350574712643679e-05, "loss": 1.7441, "step": 8570 }, { "epoch": 3.9213893967093236, "grad_norm": 79467.125, "learning_rate": 1.1302681992337164e-05, "loss": 1.6712, "step": 8580 }, { "epoch": 3.9259597806215725, "grad_norm": 72400.3125, "learning_rate": 1.1254789272030653e-05, "loss": 1.6967, "step": 8590 }, { "epoch": 3.930530164533821, "grad_norm": 57804.4296875, "learning_rate": 1.1206896551724138e-05, "loss": 1.7333, "step": 8600 }, { "epoch": 3.9351005484460693, "grad_norm": 67051.7734375, "learning_rate": 1.1159003831417625e-05, "loss": 1.6614, "step": 8610 }, { "epoch": 3.939670932358318, "grad_norm": 104811.15625, "learning_rate": 1.1111111111111112e-05, "loss": 1.7326, "step": 8620 }, { "epoch": 3.944241316270567, "grad_norm": 110914.2265625, "learning_rate": 1.1063218390804599e-05, "loss": 1.6674, "step": 8630 }, { "epoch": 3.9488117001828154, "grad_norm": 79537.1875, "learning_rate": 1.1015325670498085e-05, "loss": 1.6492, "step": 8640 }, { "epoch": 3.9533820840950638, "grad_norm": 60097.46484375, "learning_rate": 1.096743295019157e-05, "loss": 1.6846, "step": 8650 }, { "epoch": 3.9579524680073126, "grad_norm": 118254.2265625, "learning_rate": 1.091954022988506e-05, "loss": 1.6961, "step": 8660 }, { "epoch": 3.9625228519195614, "grad_norm": 114773.8046875, "learning_rate": 1.0871647509578544e-05, "loss": 1.7114, "step": 8670 }, { "epoch": 3.96709323583181, "grad_norm": 87263.125, "learning_rate": 1.0823754789272031e-05, "loss": 1.7308, "step": 8680 }, { "epoch": 3.9716636197440582, "grad_norm": 66768.859375, "learning_rate": 1.0775862068965516e-05, "loss": 1.6712, "step": 8690 }, { "epoch": 3.976234003656307, "grad_norm": 119385.375, "learning_rate": 1.0727969348659005e-05, "loss": 1.7558, "step": 8700 }, { "epoch": 3.980804387568556, "grad_norm": 71487.484375, "learning_rate": 1.068007662835249e-05, "loss": 1.7924, "step": 8710 }, { "epoch": 3.9853747714808043, "grad_norm": 81396.0234375, "learning_rate": 1.0632183908045977e-05, "loss": 1.6232, "step": 8720 }, { "epoch": 3.989945155393053, "grad_norm": 89533.171875, "learning_rate": 1.0584291187739464e-05, "loss": 1.7174, "step": 8730 }, { "epoch": 3.9945155393053016, "grad_norm": 72157.4765625, "learning_rate": 1.053639846743295e-05, "loss": 1.7366, "step": 8740 }, { "epoch": 3.9990859232175504, "grad_norm": 71103.40625, "learning_rate": 1.0488505747126438e-05, "loss": 1.6741, "step": 8750 }, { "epoch": 4.0, "eval_loss": 1.6915712356567383, "eval_runtime": 345.8106, "eval_samples_per_second": 43.376, "eval_steps_per_second": 1.356, "step": 8752 }, { "epoch": 4.003656307129799, "grad_norm": 75867.875, "learning_rate": 1.0440613026819925e-05, "loss": 1.6729, "step": 8760 }, { "epoch": 4.008226691042047, "grad_norm": 72915.7890625, "learning_rate": 1.0392720306513411e-05, "loss": 1.7197, "step": 8770 }, { "epoch": 4.012797074954296, "grad_norm": 92003.453125, "learning_rate": 1.0344827586206897e-05, "loss": 1.6728, "step": 8780 }, { "epoch": 4.017367458866545, "grad_norm": 120198.3359375, "learning_rate": 1.0296934865900384e-05, "loss": 1.7737, "step": 8790 }, { "epoch": 4.021937842778794, "grad_norm": 102790.4375, "learning_rate": 1.024904214559387e-05, "loss": 1.7166, "step": 8800 }, { "epoch": 4.026508226691042, "grad_norm": 83954.15625, "learning_rate": 1.0201149425287357e-05, "loss": 1.6454, "step": 8810 }, { "epoch": 4.0310786106032905, "grad_norm": 100407.6484375, "learning_rate": 1.0153256704980842e-05, "loss": 1.7711, "step": 8820 }, { "epoch": 4.035648994515539, "grad_norm": 70433.90625, "learning_rate": 1.0105363984674331e-05, "loss": 1.7616, "step": 8830 }, { "epoch": 4.040219378427788, "grad_norm": 73853.703125, "learning_rate": 1.0057471264367816e-05, "loss": 1.7407, "step": 8840 }, { "epoch": 4.044789762340036, "grad_norm": 89838.96875, "learning_rate": 1.0009578544061303e-05, "loss": 1.6869, "step": 8850 }, { "epoch": 4.049360146252285, "grad_norm": 98299.5859375, "learning_rate": 9.96168582375479e-06, "loss": 1.6584, "step": 8860 }, { "epoch": 4.053930530164534, "grad_norm": 52650.97265625, "learning_rate": 9.913793103448277e-06, "loss": 1.7039, "step": 8870 }, { "epoch": 4.058500914076783, "grad_norm": 98332.7890625, "learning_rate": 9.865900383141764e-06, "loss": 1.7345, "step": 8880 }, { "epoch": 4.063071297989031, "grad_norm": 87076.296875, "learning_rate": 9.818007662835249e-06, "loss": 1.6923, "step": 8890 }, { "epoch": 4.0676416819012795, "grad_norm": 54348.390625, "learning_rate": 9.770114942528738e-06, "loss": 1.7026, "step": 8900 }, { "epoch": 4.072212065813528, "grad_norm": 57868.62109375, "learning_rate": 9.722222222222223e-06, "loss": 1.7003, "step": 8910 }, { "epoch": 4.076782449725777, "grad_norm": 65227.4375, "learning_rate": 9.67432950191571e-06, "loss": 1.5978, "step": 8920 }, { "epoch": 4.081352833638025, "grad_norm": 105873.453125, "learning_rate": 9.626436781609195e-06, "loss": 1.7575, "step": 8930 }, { "epoch": 4.085923217550274, "grad_norm": 112255.6640625, "learning_rate": 9.578544061302683e-06, "loss": 1.6445, "step": 8940 }, { "epoch": 4.090493601462523, "grad_norm": 65130.7109375, "learning_rate": 9.530651340996169e-06, "loss": 1.7077, "step": 8950 }, { "epoch": 4.095063985374772, "grad_norm": 116178.6796875, "learning_rate": 9.482758620689655e-06, "loss": 1.7243, "step": 8960 }, { "epoch": 4.0996343692870205, "grad_norm": 105348.765625, "learning_rate": 9.434865900383142e-06, "loss": 1.6588, "step": 8970 }, { "epoch": 4.1042047531992685, "grad_norm": 64916.08203125, "learning_rate": 9.386973180076629e-06, "loss": 1.6274, "step": 8980 }, { "epoch": 4.108775137111517, "grad_norm": 85616.9453125, "learning_rate": 9.339080459770114e-06, "loss": 1.7181, "step": 8990 }, { "epoch": 4.113345521023766, "grad_norm": 56802.30859375, "learning_rate": 9.291187739463603e-06, "loss": 1.7205, "step": 9000 }, { "epoch": 4.117915904936015, "grad_norm": 106071.6015625, "learning_rate": 9.243295019157088e-06, "loss": 1.663, "step": 9010 }, { "epoch": 4.122486288848263, "grad_norm": 84213.0859375, "learning_rate": 9.195402298850575e-06, "loss": 1.6674, "step": 9020 }, { "epoch": 4.127056672760512, "grad_norm": 83103.328125, "learning_rate": 9.147509578544062e-06, "loss": 1.7418, "step": 9030 }, { "epoch": 4.131627056672761, "grad_norm": 45266.80859375, "learning_rate": 9.099616858237549e-06, "loss": 1.6215, "step": 9040 }, { "epoch": 4.1361974405850095, "grad_norm": 83939.390625, "learning_rate": 9.051724137931036e-06, "loss": 1.7258, "step": 9050 }, { "epoch": 4.140767824497257, "grad_norm": 98675.046875, "learning_rate": 9.00383141762452e-06, "loss": 1.6511, "step": 9060 }, { "epoch": 4.145338208409506, "grad_norm": 78594.921875, "learning_rate": 8.95593869731801e-06, "loss": 1.6855, "step": 9070 }, { "epoch": 4.149908592321755, "grad_norm": 78093.4609375, "learning_rate": 8.908045977011495e-06, "loss": 1.7284, "step": 9080 }, { "epoch": 4.154478976234004, "grad_norm": 98573.2890625, "learning_rate": 8.860153256704981e-06, "loss": 1.686, "step": 9090 }, { "epoch": 4.159049360146252, "grad_norm": 56181.88671875, "learning_rate": 8.812260536398467e-06, "loss": 1.7138, "step": 9100 }, { "epoch": 4.163619744058501, "grad_norm": 75070.2421875, "learning_rate": 8.764367816091955e-06, "loss": 1.7444, "step": 9110 }, { "epoch": 4.16819012797075, "grad_norm": 62708.078125, "learning_rate": 8.71647509578544e-06, "loss": 1.7641, "step": 9120 }, { "epoch": 4.1727605118829985, "grad_norm": 98152.28125, "learning_rate": 8.668582375478927e-06, "loss": 1.7544, "step": 9130 }, { "epoch": 4.177330895795246, "grad_norm": 59807.0546875, "learning_rate": 8.620689655172414e-06, "loss": 1.727, "step": 9140 }, { "epoch": 4.181901279707495, "grad_norm": 61243.06640625, "learning_rate": 8.572796934865901e-06, "loss": 1.6752, "step": 9150 }, { "epoch": 4.186471663619744, "grad_norm": 69440.40625, "learning_rate": 8.524904214559388e-06, "loss": 1.7704, "step": 9160 }, { "epoch": 4.191042047531993, "grad_norm": 132799.5625, "learning_rate": 8.477011494252873e-06, "loss": 1.7243, "step": 9170 }, { "epoch": 4.195612431444241, "grad_norm": 60873.515625, "learning_rate": 8.429118773946362e-06, "loss": 1.7021, "step": 9180 }, { "epoch": 4.20018281535649, "grad_norm": 56033.59375, "learning_rate": 8.381226053639847e-06, "loss": 1.6721, "step": 9190 }, { "epoch": 4.204753199268739, "grad_norm": 65075.58984375, "learning_rate": 8.333333333333334e-06, "loss": 1.7053, "step": 9200 }, { "epoch": 4.209323583180987, "grad_norm": 65212.20703125, "learning_rate": 8.28544061302682e-06, "loss": 1.6667, "step": 9210 }, { "epoch": 4.213893967093236, "grad_norm": 98034.7109375, "learning_rate": 8.237547892720307e-06, "loss": 1.7291, "step": 9220 }, { "epoch": 4.218464351005484, "grad_norm": 112262.515625, "learning_rate": 8.189655172413793e-06, "loss": 1.7307, "step": 9230 }, { "epoch": 4.223034734917733, "grad_norm": 42643.26171875, "learning_rate": 8.14176245210728e-06, "loss": 1.7146, "step": 9240 }, { "epoch": 4.227605118829982, "grad_norm": 120319.0703125, "learning_rate": 8.093869731800766e-06, "loss": 1.5791, "step": 9250 }, { "epoch": 4.232175502742231, "grad_norm": 111697.4765625, "learning_rate": 8.045977011494253e-06, "loss": 1.7305, "step": 9260 }, { "epoch": 4.236745886654479, "grad_norm": 82615.4453125, "learning_rate": 7.998084291187739e-06, "loss": 1.6441, "step": 9270 }, { "epoch": 4.2413162705667276, "grad_norm": 112459.7890625, "learning_rate": 7.950191570881227e-06, "loss": 1.6667, "step": 9280 }, { "epoch": 4.245886654478976, "grad_norm": 89633.421875, "learning_rate": 7.902298850574712e-06, "loss": 1.7402, "step": 9290 }, { "epoch": 4.250457038391225, "grad_norm": 73259.6953125, "learning_rate": 7.854406130268199e-06, "loss": 1.725, "step": 9300 }, { "epoch": 4.255027422303473, "grad_norm": 87751.640625, "learning_rate": 7.806513409961686e-06, "loss": 1.7017, "step": 9310 }, { "epoch": 4.259597806215722, "grad_norm": 130956.0546875, "learning_rate": 7.758620689655173e-06, "loss": 1.6762, "step": 9320 }, { "epoch": 4.264168190127971, "grad_norm": 54888.203125, "learning_rate": 7.71072796934866e-06, "loss": 1.772, "step": 9330 }, { "epoch": 4.26873857404022, "grad_norm": 55581.7109375, "learning_rate": 7.662835249042145e-06, "loss": 1.7176, "step": 9340 }, { "epoch": 4.273308957952468, "grad_norm": 89327.875, "learning_rate": 7.614942528735633e-06, "loss": 1.7243, "step": 9350 }, { "epoch": 4.2778793418647165, "grad_norm": 76867.40625, "learning_rate": 7.567049808429119e-06, "loss": 1.7136, "step": 9360 }, { "epoch": 4.282449725776965, "grad_norm": 131182.859375, "learning_rate": 7.519157088122606e-06, "loss": 1.7012, "step": 9370 }, { "epoch": 4.287020109689214, "grad_norm": 80961.5546875, "learning_rate": 7.4712643678160925e-06, "loss": 1.7052, "step": 9380 }, { "epoch": 4.291590493601462, "grad_norm": 64866.6796875, "learning_rate": 7.423371647509579e-06, "loss": 1.7689, "step": 9390 }, { "epoch": 4.296160877513711, "grad_norm": 65769.59375, "learning_rate": 7.375478927203065e-06, "loss": 1.7517, "step": 9400 }, { "epoch": 4.30073126142596, "grad_norm": 54188.25, "learning_rate": 7.3275862068965514e-06, "loss": 1.7222, "step": 9410 }, { "epoch": 4.305301645338209, "grad_norm": 122926.7734375, "learning_rate": 7.279693486590039e-06, "loss": 1.6745, "step": 9420 }, { "epoch": 4.309872029250457, "grad_norm": 69720.5078125, "learning_rate": 7.231800766283525e-06, "loss": 1.7206, "step": 9430 }, { "epoch": 4.3144424131627055, "grad_norm": 127599.6953125, "learning_rate": 7.183908045977011e-06, "loss": 1.6919, "step": 9440 }, { "epoch": 4.319012797074954, "grad_norm": 75789.5546875, "learning_rate": 7.136015325670499e-06, "loss": 1.725, "step": 9450 }, { "epoch": 4.323583180987203, "grad_norm": 99748.046875, "learning_rate": 7.088122605363985e-06, "loss": 1.7164, "step": 9460 }, { "epoch": 4.328153564899452, "grad_norm": 85926.734375, "learning_rate": 7.040229885057471e-06, "loss": 1.7377, "step": 9470 }, { "epoch": 4.3327239488117, "grad_norm": 85478.2109375, "learning_rate": 6.992337164750958e-06, "loss": 1.7319, "step": 9480 }, { "epoch": 4.337294332723949, "grad_norm": 72827.2265625, "learning_rate": 6.944444444444445e-06, "loss": 1.7371, "step": 9490 }, { "epoch": 4.341864716636198, "grad_norm": 93393.625, "learning_rate": 6.896551724137932e-06, "loss": 1.6363, "step": 9500 }, { "epoch": 4.346435100548446, "grad_norm": 90090.8359375, "learning_rate": 6.848659003831418e-06, "loss": 1.6988, "step": 9510 }, { "epoch": 4.3510054844606945, "grad_norm": 85922.3203125, "learning_rate": 6.800766283524905e-06, "loss": 1.6765, "step": 9520 }, { "epoch": 4.355575868372943, "grad_norm": 94569.2109375, "learning_rate": 6.7528735632183914e-06, "loss": 1.6928, "step": 9530 }, { "epoch": 4.360146252285192, "grad_norm": 86991.3984375, "learning_rate": 6.7049808429118775e-06, "loss": 1.7658, "step": 9540 }, { "epoch": 4.364716636197441, "grad_norm": 129308.6171875, "learning_rate": 6.657088122605365e-06, "loss": 1.7118, "step": 9550 }, { "epoch": 4.369287020109689, "grad_norm": 58696.4765625, "learning_rate": 6.609195402298851e-06, "loss": 1.7136, "step": 9560 }, { "epoch": 4.373857404021938, "grad_norm": 72559.2265625, "learning_rate": 6.561302681992337e-06, "loss": 1.7245, "step": 9570 }, { "epoch": 4.378427787934187, "grad_norm": 96479.890625, "learning_rate": 6.513409961685823e-06, "loss": 1.7426, "step": 9580 }, { "epoch": 4.3829981718464355, "grad_norm": 52625.5859375, "learning_rate": 6.465517241379311e-06, "loss": 1.818, "step": 9590 }, { "epoch": 4.387568555758683, "grad_norm": 74031.9765625, "learning_rate": 6.417624521072797e-06, "loss": 1.688, "step": 9600 }, { "epoch": 4.392138939670932, "grad_norm": 70750.546875, "learning_rate": 6.369731800766283e-06, "loss": 1.6336, "step": 9610 }, { "epoch": 4.396709323583181, "grad_norm": 79010.375, "learning_rate": 6.321839080459771e-06, "loss": 1.6344, "step": 9620 }, { "epoch": 4.40127970749543, "grad_norm": 52663.4765625, "learning_rate": 6.273946360153257e-06, "loss": 1.7024, "step": 9630 }, { "epoch": 4.405850091407678, "grad_norm": 88580.4375, "learning_rate": 6.226053639846744e-06, "loss": 1.7704, "step": 9640 }, { "epoch": 4.410420475319927, "grad_norm": 75858.3828125, "learning_rate": 6.178160919540231e-06, "loss": 1.7163, "step": 9650 }, { "epoch": 4.414990859232176, "grad_norm": 85468.9296875, "learning_rate": 6.130268199233717e-06, "loss": 1.8792, "step": 9660 }, { "epoch": 4.4195612431444244, "grad_norm": 149377.140625, "learning_rate": 6.0823754789272035e-06, "loss": 1.7492, "step": 9670 }, { "epoch": 4.424131627056672, "grad_norm": 96749.546875, "learning_rate": 6.03448275862069e-06, "loss": 1.641, "step": 9680 }, { "epoch": 4.428702010968921, "grad_norm": 114815.234375, "learning_rate": 5.9865900383141764e-06, "loss": 1.7394, "step": 9690 }, { "epoch": 4.43327239488117, "grad_norm": 117656.1953125, "learning_rate": 5.938697318007663e-06, "loss": 1.7659, "step": 9700 }, { "epoch": 4.437842778793419, "grad_norm": 91634.71875, "learning_rate": 5.890804597701149e-06, "loss": 1.806, "step": 9710 }, { "epoch": 4.442413162705667, "grad_norm": 59309.10546875, "learning_rate": 5.842911877394636e-06, "loss": 1.7264, "step": 9720 }, { "epoch": 4.446983546617916, "grad_norm": 102864.578125, "learning_rate": 5.795019157088123e-06, "loss": 1.6442, "step": 9730 }, { "epoch": 4.451553930530165, "grad_norm": 48123.75, "learning_rate": 5.747126436781609e-06, "loss": 1.7215, "step": 9740 }, { "epoch": 4.456124314442413, "grad_norm": 59340.78125, "learning_rate": 5.699233716475096e-06, "loss": 1.7105, "step": 9750 }, { "epoch": 4.460694698354661, "grad_norm": 47793.0078125, "learning_rate": 5.651340996168582e-06, "loss": 1.773, "step": 9760 }, { "epoch": 4.46526508226691, "grad_norm": 68314.3828125, "learning_rate": 5.603448275862069e-06, "loss": 1.8067, "step": 9770 }, { "epoch": 4.469835466179159, "grad_norm": 127164.171875, "learning_rate": 5.555555555555556e-06, "loss": 1.712, "step": 9780 }, { "epoch": 4.474405850091408, "grad_norm": 108175.9296875, "learning_rate": 5.507662835249043e-06, "loss": 1.6775, "step": 9790 }, { "epoch": 4.478976234003657, "grad_norm": 83982.234375, "learning_rate": 5.45977011494253e-06, "loss": 1.7235, "step": 9800 }, { "epoch": 4.483546617915905, "grad_norm": 116926.3515625, "learning_rate": 5.411877394636016e-06, "loss": 1.7325, "step": 9810 }, { "epoch": 4.4881170018281535, "grad_norm": 85041.0234375, "learning_rate": 5.3639846743295025e-06, "loss": 1.7853, "step": 9820 }, { "epoch": 4.492687385740402, "grad_norm": 67453.5859375, "learning_rate": 5.3160919540229885e-06, "loss": 1.7712, "step": 9830 }, { "epoch": 4.497257769652651, "grad_norm": 120161.9140625, "learning_rate": 5.268199233716475e-06, "loss": 1.7047, "step": 9840 }, { "epoch": 4.501828153564899, "grad_norm": 91166.8984375, "learning_rate": 5.220306513409962e-06, "loss": 1.7825, "step": 9850 }, { "epoch": 4.506398537477148, "grad_norm": 80539.265625, "learning_rate": 5.172413793103448e-06, "loss": 1.7293, "step": 9860 }, { "epoch": 4.510968921389397, "grad_norm": 89111.5390625, "learning_rate": 5.124521072796935e-06, "loss": 1.6972, "step": 9870 }, { "epoch": 4.515539305301646, "grad_norm": 106499.9453125, "learning_rate": 5.076628352490421e-06, "loss": 1.6945, "step": 9880 }, { "epoch": 4.520109689213894, "grad_norm": 81342.203125, "learning_rate": 5.028735632183908e-06, "loss": 1.7311, "step": 9890 }, { "epoch": 4.5246800731261425, "grad_norm": 63680.8359375, "learning_rate": 4.980842911877395e-06, "loss": 1.6724, "step": 9900 }, { "epoch": 4.529250457038391, "grad_norm": 80776.640625, "learning_rate": 4.932950191570882e-06, "loss": 1.7283, "step": 9910 }, { "epoch": 4.53382084095064, "grad_norm": 107851.6328125, "learning_rate": 4.885057471264369e-06, "loss": 1.7028, "step": 9920 }, { "epoch": 4.538391224862888, "grad_norm": 79906.65625, "learning_rate": 4.837164750957855e-06, "loss": 1.6608, "step": 9930 }, { "epoch": 4.542961608775137, "grad_norm": 67892.0, "learning_rate": 4.789272030651342e-06, "loss": 1.6717, "step": 9940 }, { "epoch": 4.547531992687386, "grad_norm": 79051.953125, "learning_rate": 4.741379310344828e-06, "loss": 1.6859, "step": 9950 }, { "epoch": 4.552102376599635, "grad_norm": 107722.7109375, "learning_rate": 4.6934865900383146e-06, "loss": 1.768, "step": 9960 }, { "epoch": 4.556672760511883, "grad_norm": 73130.265625, "learning_rate": 4.6455938697318015e-06, "loss": 1.623, "step": 9970 }, { "epoch": 4.5612431444241315, "grad_norm": 96307.8984375, "learning_rate": 4.5977011494252875e-06, "loss": 1.7251, "step": 9980 }, { "epoch": 4.56581352833638, "grad_norm": 92756.6484375, "learning_rate": 4.549808429118774e-06, "loss": 1.7256, "step": 9990 }, { "epoch": 4.570383912248629, "grad_norm": 99314.5390625, "learning_rate": 4.50191570881226e-06, "loss": 1.7193, "step": 10000 }, { "epoch": 4.574954296160877, "grad_norm": 89754.9453125, "learning_rate": 4.454022988505747e-06, "loss": 1.7082, "step": 10010 }, { "epoch": 4.579524680073126, "grad_norm": 75742.890625, "learning_rate": 4.406130268199233e-06, "loss": 1.7386, "step": 10020 }, { "epoch": 4.584095063985375, "grad_norm": 95144.9921875, "learning_rate": 4.35823754789272e-06, "loss": 1.7612, "step": 10030 }, { "epoch": 4.588665447897624, "grad_norm": 74380.0546875, "learning_rate": 4.310344827586207e-06, "loss": 1.7231, "step": 10040 }, { "epoch": 4.5932358318098725, "grad_norm": 75351.3515625, "learning_rate": 4.262452107279694e-06, "loss": 1.6865, "step": 10050 }, { "epoch": 4.5978062157221204, "grad_norm": 126452.8359375, "learning_rate": 4.214559386973181e-06, "loss": 1.7256, "step": 10060 }, { "epoch": 4.602376599634369, "grad_norm": 50301.078125, "learning_rate": 4.166666666666667e-06, "loss": 1.7692, "step": 10070 }, { "epoch": 4.606946983546618, "grad_norm": 72251.7265625, "learning_rate": 4.118773946360154e-06, "loss": 1.7593, "step": 10080 }, { "epoch": 4.611517367458866, "grad_norm": 63932.359375, "learning_rate": 4.07088122605364e-06, "loss": 1.768, "step": 10090 }, { "epoch": 4.616087751371115, "grad_norm": 71254.5390625, "learning_rate": 4.022988505747127e-06, "loss": 1.736, "step": 10100 }, { "epoch": 4.620658135283364, "grad_norm": 105173.09375, "learning_rate": 3.9750957854406135e-06, "loss": 1.7123, "step": 10110 }, { "epoch": 4.625228519195613, "grad_norm": 55138.73828125, "learning_rate": 3.9272030651340996e-06, "loss": 1.7062, "step": 10120 }, { "epoch": 4.6297989031078615, "grad_norm": 56330.21484375, "learning_rate": 3.8793103448275865e-06, "loss": 1.6406, "step": 10130 }, { "epoch": 4.634369287020109, "grad_norm": 134460.59375, "learning_rate": 3.8314176245210725e-06, "loss": 1.7209, "step": 10140 }, { "epoch": 4.638939670932358, "grad_norm": 67907.453125, "learning_rate": 3.7835249042145594e-06, "loss": 1.7233, "step": 10150 }, { "epoch": 4.643510054844607, "grad_norm": 100771.4921875, "learning_rate": 3.7356321839080462e-06, "loss": 1.6853, "step": 10160 }, { "epoch": 4.648080438756856, "grad_norm": 54433.70703125, "learning_rate": 3.6877394636015327e-06, "loss": 1.5971, "step": 10170 }, { "epoch": 4.652650822669104, "grad_norm": 74801.8515625, "learning_rate": 3.6398467432950196e-06, "loss": 1.6912, "step": 10180 }, { "epoch": 4.657221206581353, "grad_norm": 89413.6328125, "learning_rate": 3.5919540229885056e-06, "loss": 1.7757, "step": 10190 }, { "epoch": 4.661791590493602, "grad_norm": 55121.12109375, "learning_rate": 3.5440613026819925e-06, "loss": 1.7154, "step": 10200 }, { "epoch": 4.66636197440585, "grad_norm": 71017.0078125, "learning_rate": 3.496168582375479e-06, "loss": 1.6741, "step": 10210 }, { "epoch": 4.670932358318098, "grad_norm": 70393.5859375, "learning_rate": 3.448275862068966e-06, "loss": 1.7049, "step": 10220 }, { "epoch": 4.675502742230347, "grad_norm": 68136.203125, "learning_rate": 3.4003831417624527e-06, "loss": 1.7331, "step": 10230 }, { "epoch": 4.680073126142596, "grad_norm": 92633.125, "learning_rate": 3.3524904214559387e-06, "loss": 1.7386, "step": 10240 }, { "epoch": 4.684643510054845, "grad_norm": 52863.0234375, "learning_rate": 3.3045977011494256e-06, "loss": 1.6676, "step": 10250 }, { "epoch": 4.689213893967093, "grad_norm": 52217.04296875, "learning_rate": 3.2567049808429117e-06, "loss": 1.6496, "step": 10260 }, { "epoch": 4.693784277879342, "grad_norm": 71589.21875, "learning_rate": 3.2088122605363985e-06, "loss": 1.7128, "step": 10270 }, { "epoch": 4.698354661791591, "grad_norm": 63616.85546875, "learning_rate": 3.1609195402298854e-06, "loss": 1.8258, "step": 10280 }, { "epoch": 4.702925045703839, "grad_norm": 51611.06640625, "learning_rate": 3.113026819923372e-06, "loss": 1.6953, "step": 10290 }, { "epoch": 4.707495429616088, "grad_norm": 65978.15625, "learning_rate": 3.0651340996168583e-06, "loss": 1.7285, "step": 10300 }, { "epoch": 4.712065813528336, "grad_norm": 68540.25, "learning_rate": 3.017241379310345e-06, "loss": 1.666, "step": 10310 }, { "epoch": 4.716636197440585, "grad_norm": 121031.6015625, "learning_rate": 2.9693486590038317e-06, "loss": 1.7578, "step": 10320 }, { "epoch": 4.721206581352834, "grad_norm": 90117.9296875, "learning_rate": 2.921455938697318e-06, "loss": 1.7317, "step": 10330 }, { "epoch": 4.725776965265082, "grad_norm": 113749.5390625, "learning_rate": 2.8735632183908046e-06, "loss": 1.7508, "step": 10340 }, { "epoch": 4.730347349177331, "grad_norm": 51193.17578125, "learning_rate": 2.825670498084291e-06, "loss": 1.6362, "step": 10350 }, { "epoch": 4.7349177330895795, "grad_norm": 90305.71875, "learning_rate": 2.777777777777778e-06, "loss": 1.7621, "step": 10360 }, { "epoch": 4.739488117001828, "grad_norm": 74391.09375, "learning_rate": 2.729885057471265e-06, "loss": 1.6677, "step": 10370 }, { "epoch": 4.744058500914077, "grad_norm": 82829.6875, "learning_rate": 2.6819923371647512e-06, "loss": 1.8085, "step": 10380 }, { "epoch": 4.748628884826325, "grad_norm": 102442.8515625, "learning_rate": 2.6340996168582377e-06, "loss": 1.6941, "step": 10390 }, { "epoch": 4.753199268738574, "grad_norm": 84212.0546875, "learning_rate": 2.586206896551724e-06, "loss": 1.6369, "step": 10400 }, { "epoch": 4.757769652650823, "grad_norm": 87777.5234375, "learning_rate": 2.5383141762452106e-06, "loss": 1.6629, "step": 10410 }, { "epoch": 4.762340036563071, "grad_norm": 72210.796875, "learning_rate": 2.4904214559386975e-06, "loss": 1.7738, "step": 10420 }, { "epoch": 4.76691042047532, "grad_norm": 72672.140625, "learning_rate": 2.4425287356321844e-06, "loss": 1.743, "step": 10430 }, { "epoch": 4.7714808043875685, "grad_norm": 52435.03515625, "learning_rate": 2.394636015325671e-06, "loss": 1.6503, "step": 10440 }, { "epoch": 4.776051188299817, "grad_norm": 81516.578125, "learning_rate": 2.3467432950191573e-06, "loss": 1.7072, "step": 10450 }, { "epoch": 4.780621572212066, "grad_norm": 64971.1328125, "learning_rate": 2.2988505747126437e-06, "loss": 1.7349, "step": 10460 }, { "epoch": 4.785191956124314, "grad_norm": 90203.71875, "learning_rate": 2.25095785440613e-06, "loss": 1.7305, "step": 10470 }, { "epoch": 4.789762340036563, "grad_norm": 85017.875, "learning_rate": 2.2030651340996167e-06, "loss": 1.642, "step": 10480 }, { "epoch": 4.794332723948812, "grad_norm": 88660.4765625, "learning_rate": 2.1551724137931035e-06, "loss": 1.7387, "step": 10490 }, { "epoch": 4.798903107861061, "grad_norm": 81671.3984375, "learning_rate": 2.1072796934865904e-06, "loss": 1.6963, "step": 10500 }, { "epoch": 4.803473491773309, "grad_norm": 82145.9453125, "learning_rate": 2.059386973180077e-06, "loss": 1.689, "step": 10510 }, { "epoch": 4.8080438756855575, "grad_norm": 60723.84375, "learning_rate": 2.0114942528735633e-06, "loss": 1.7065, "step": 10520 }, { "epoch": 4.812614259597806, "grad_norm": 79342.8515625, "learning_rate": 1.9636015325670498e-06, "loss": 1.6952, "step": 10530 }, { "epoch": 4.817184643510055, "grad_norm": 66902.59375, "learning_rate": 1.9157088122605362e-06, "loss": 1.6965, "step": 10540 }, { "epoch": 4.821755027422303, "grad_norm": 57767.9453125, "learning_rate": 1.8678160919540231e-06, "loss": 1.7267, "step": 10550 }, { "epoch": 4.826325411334552, "grad_norm": 57847.31640625, "learning_rate": 1.8199233716475098e-06, "loss": 1.5725, "step": 10560 }, { "epoch": 4.830895795246801, "grad_norm": 71927.8984375, "learning_rate": 1.7720306513409962e-06, "loss": 1.6501, "step": 10570 }, { "epoch": 4.83546617915905, "grad_norm": 74899.0234375, "learning_rate": 1.724137931034483e-06, "loss": 1.6678, "step": 10580 }, { "epoch": 4.840036563071298, "grad_norm": 92564.6875, "learning_rate": 1.6762452107279694e-06, "loss": 1.6991, "step": 10590 }, { "epoch": 4.844606946983546, "grad_norm": 65702.7109375, "learning_rate": 1.6283524904214558e-06, "loss": 1.7309, "step": 10600 }, { "epoch": 4.849177330895795, "grad_norm": 72665.1796875, "learning_rate": 1.5804597701149427e-06, "loss": 1.6152, "step": 10610 }, { "epoch": 4.853747714808044, "grad_norm": 94501.109375, "learning_rate": 1.5325670498084292e-06, "loss": 1.6927, "step": 10620 }, { "epoch": 4.858318098720293, "grad_norm": 113540.65625, "learning_rate": 1.4846743295019158e-06, "loss": 1.7562, "step": 10630 }, { "epoch": 4.862888482632541, "grad_norm": 60082.1484375, "learning_rate": 1.4367816091954023e-06, "loss": 1.743, "step": 10640 }, { "epoch": 4.86745886654479, "grad_norm": 77984.4921875, "learning_rate": 1.388888888888889e-06, "loss": 1.597, "step": 10650 }, { "epoch": 4.872029250457039, "grad_norm": 77285.3515625, "learning_rate": 1.3409961685823756e-06, "loss": 1.6401, "step": 10660 }, { "epoch": 4.876599634369287, "grad_norm": 62164.921875, "learning_rate": 1.293103448275862e-06, "loss": 1.7019, "step": 10670 }, { "epoch": 4.881170018281535, "grad_norm": 82581.1875, "learning_rate": 1.2452107279693487e-06, "loss": 1.6698, "step": 10680 }, { "epoch": 4.885740402193784, "grad_norm": 191003.421875, "learning_rate": 1.1973180076628354e-06, "loss": 1.7425, "step": 10690 }, { "epoch": 4.890310786106033, "grad_norm": 70496.2421875, "learning_rate": 1.1494252873563219e-06, "loss": 1.7575, "step": 10700 }, { "epoch": 4.894881170018282, "grad_norm": 96979.078125, "learning_rate": 1.1015325670498083e-06, "loss": 1.7798, "step": 10710 }, { "epoch": 4.89945155393053, "grad_norm": 75589.8046875, "learning_rate": 1.0536398467432952e-06, "loss": 1.6209, "step": 10720 }, { "epoch": 4.904021937842779, "grad_norm": 146879.5, "learning_rate": 1.0057471264367817e-06, "loss": 1.7368, "step": 10730 }, { "epoch": 4.908592321755028, "grad_norm": 80107.3125, "learning_rate": 9.578544061302681e-07, "loss": 1.7019, "step": 10740 }, { "epoch": 4.913162705667276, "grad_norm": 66398.5703125, "learning_rate": 9.099616858237549e-07, "loss": 1.7885, "step": 10750 }, { "epoch": 4.917733089579524, "grad_norm": 76734.59375, "learning_rate": 8.620689655172415e-07, "loss": 1.7817, "step": 10760 }, { "epoch": 4.922303473491773, "grad_norm": 79165.3203125, "learning_rate": 8.141762452107279e-07, "loss": 1.7004, "step": 10770 }, { "epoch": 4.926873857404022, "grad_norm": 61589.2109375, "learning_rate": 7.662835249042146e-07, "loss": 1.7177, "step": 10780 }, { "epoch": 4.931444241316271, "grad_norm": 91896.1015625, "learning_rate": 7.183908045977011e-07, "loss": 1.7116, "step": 10790 }, { "epoch": 4.936014625228519, "grad_norm": 76825.4296875, "learning_rate": 6.704980842911878e-07, "loss": 1.6501, "step": 10800 }, { "epoch": 4.940585009140768, "grad_norm": 79197.75, "learning_rate": 6.226053639846744e-07, "loss": 1.7284, "step": 10810 }, { "epoch": 4.9451553930530165, "grad_norm": 87847.4609375, "learning_rate": 5.747126436781609e-07, "loss": 1.7757, "step": 10820 }, { "epoch": 4.949725776965265, "grad_norm": 105324.3125, "learning_rate": 5.268199233716476e-07, "loss": 1.7017, "step": 10830 }, { "epoch": 4.954296160877513, "grad_norm": 78621.59375, "learning_rate": 4.789272030651341e-07, "loss": 1.6659, "step": 10840 }, { "epoch": 4.958866544789762, "grad_norm": 161616.046875, "learning_rate": 4.3103448275862073e-07, "loss": 1.7586, "step": 10850 }, { "epoch": 4.963436928702011, "grad_norm": 87202.765625, "learning_rate": 3.831417624521073e-07, "loss": 1.6147, "step": 10860 }, { "epoch": 4.96800731261426, "grad_norm": 87169.0390625, "learning_rate": 3.352490421455939e-07, "loss": 1.6565, "step": 10870 }, { "epoch": 4.972577696526509, "grad_norm": 93609.015625, "learning_rate": 2.8735632183908047e-07, "loss": 1.8099, "step": 10880 }, { "epoch": 4.977148080438757, "grad_norm": 120831.265625, "learning_rate": 2.3946360153256703e-07, "loss": 1.7018, "step": 10890 }, { "epoch": 4.9817184643510055, "grad_norm": 69430.453125, "learning_rate": 1.9157088122605365e-07, "loss": 1.6799, "step": 10900 }, { "epoch": 4.986288848263254, "grad_norm": 65192.02734375, "learning_rate": 1.4367816091954023e-07, "loss": 1.7379, "step": 10910 }, { "epoch": 4.990859232175502, "grad_norm": 89309.6953125, "learning_rate": 9.578544061302682e-08, "loss": 1.7807, "step": 10920 }, { "epoch": 4.995429616087751, "grad_norm": 65604.3203125, "learning_rate": 4.789272030651341e-08, "loss": 1.7125, "step": 10930 }, { "epoch": 5.0, "grad_norm": 114907.515625, "learning_rate": 0.0, "loss": 1.7282, "step": 10940 }, { "epoch": 5.0, "eval_loss": 1.6917415857315063, "eval_runtime": 346.3088, "eval_samples_per_second": 43.314, "eval_steps_per_second": 1.354, "step": 10940 } ], "logging_steps": 10, "max_steps": 10940, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }