diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29526 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 21063, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023738308882875184, + "grad_norm": 6.03125, + "learning_rate": 2.3719165085388994e-07, + "loss": 0.993, + "step": 5 + }, + { + "epoch": 0.00047476617765750367, + "grad_norm": 15.9375, + "learning_rate": 4.743833017077799e-07, + "loss": 0.9493, + "step": 10 + }, + { + "epoch": 0.0007121492664862555, + "grad_norm": 19.875, + "learning_rate": 7.115749525616699e-07, + "loss": 0.9772, + "step": 15 + }, + { + "epoch": 0.0009495323553150073, + "grad_norm": 12.1875, + "learning_rate": 9.487666034155598e-07, + "loss": 0.867, + "step": 20 + }, + { + "epoch": 0.0011869154441437593, + "grad_norm": 7.875, + "learning_rate": 1.1859582542694498e-06, + "loss": 0.9284, + "step": 25 + }, + { + "epoch": 0.001424298532972511, + "grad_norm": 16.0, + "learning_rate": 1.4231499051233397e-06, + "loss": 0.8649, + "step": 30 + }, + { + "epoch": 0.001661681621801263, + "grad_norm": 10.875, + "learning_rate": 1.6603415559772294e-06, + "loss": 0.9028, + "step": 35 + }, + { + "epoch": 0.0018990647106300147, + "grad_norm": 20.125, + "learning_rate": 1.8975332068311195e-06, + "loss": 0.8898, + "step": 40 + }, + { + "epoch": 0.0021364477994587666, + "grad_norm": 9.75, + "learning_rate": 2.1347248576850095e-06, + "loss": 0.892, + "step": 45 + }, + { + "epoch": 0.0023738308882875186, + "grad_norm": 9.8125, + "learning_rate": 2.3719165085388996e-06, + "loss": 0.8617, + "step": 50 + }, + { + "epoch": 0.00261121397711627, + "grad_norm": 6.9375, + "learning_rate": 2.6091081593927897e-06, + "loss": 0.8693, + "step": 55 + }, + { + "epoch": 0.002848597065945022, + "grad_norm": 8.5, + "learning_rate": 2.8462998102466794e-06, + "loss": 0.7502, + "step": 60 + }, + { + "epoch": 0.003085980154773774, + "grad_norm": 4.125, + "learning_rate": 3.0834914611005695e-06, + "loss": 0.7925, + "step": 65 + }, + { + "epoch": 0.003323363243602526, + "grad_norm": 4.40625, + "learning_rate": 3.320683111954459e-06, + "loss": 0.7839, + "step": 70 + }, + { + "epoch": 0.0035607463324312774, + "grad_norm": 8.125, + "learning_rate": 3.557874762808349e-06, + "loss": 0.7424, + "step": 75 + }, + { + "epoch": 0.0037981294212600294, + "grad_norm": 3.625, + "learning_rate": 3.795066413662239e-06, + "loss": 0.706, + "step": 80 + }, + { + "epoch": 0.004035512510088781, + "grad_norm": 5.84375, + "learning_rate": 4.032258064516129e-06, + "loss": 0.7192, + "step": 85 + }, + { + "epoch": 0.004272895598917533, + "grad_norm": 3.140625, + "learning_rate": 4.269449715370019e-06, + "loss": 0.6576, + "step": 90 + }, + { + "epoch": 0.004510278687746285, + "grad_norm": 4.65625, + "learning_rate": 4.5066413662239095e-06, + "loss": 0.6379, + "step": 95 + }, + { + "epoch": 0.004747661776575037, + "grad_norm": 3.703125, + "learning_rate": 4.743833017077799e-06, + "loss": 0.627, + "step": 100 + }, + { + "epoch": 0.004985044865403789, + "grad_norm": 1.890625, + "learning_rate": 4.981024667931689e-06, + "loss": 0.5905, + "step": 105 + }, + { + "epoch": 0.00522242795423254, + "grad_norm": 1.59375, + "learning_rate": 5.218216318785579e-06, + "loss": 0.5738, + "step": 110 + }, + { + "epoch": 0.0054598110430612926, + "grad_norm": 1.609375, + "learning_rate": 5.455407969639469e-06, + "loss": 0.5512, + "step": 115 + }, + { + "epoch": 0.005697194131890044, + "grad_norm": 2.421875, + "learning_rate": 5.692599620493359e-06, + "loss": 0.5384, + "step": 120 + }, + { + "epoch": 0.005934577220718796, + "grad_norm": 1.6953125, + "learning_rate": 5.9297912713472485e-06, + "loss": 0.518, + "step": 125 + }, + { + "epoch": 0.006171960309547548, + "grad_norm": 1.609375, + "learning_rate": 6.166982922201139e-06, + "loss": 0.5024, + "step": 130 + }, + { + "epoch": 0.0064093433983762995, + "grad_norm": 1.8046875, + "learning_rate": 6.404174573055029e-06, + "loss": 0.4808, + "step": 135 + }, + { + "epoch": 0.006646726487205052, + "grad_norm": 3.140625, + "learning_rate": 6.641366223908918e-06, + "loss": 0.4742, + "step": 140 + }, + { + "epoch": 0.006884109576033803, + "grad_norm": 3.421875, + "learning_rate": 6.878557874762809e-06, + "loss": 0.474, + "step": 145 + }, + { + "epoch": 0.007121492664862555, + "grad_norm": 1.390625, + "learning_rate": 7.115749525616698e-06, + "loss": 0.4557, + "step": 150 + }, + { + "epoch": 0.007358875753691307, + "grad_norm": 3.796875, + "learning_rate": 7.3529411764705884e-06, + "loss": 0.4491, + "step": 155 + }, + { + "epoch": 0.007596258842520059, + "grad_norm": 1.515625, + "learning_rate": 7.590132827324478e-06, + "loss": 0.4419, + "step": 160 + }, + { + "epoch": 0.007833641931348811, + "grad_norm": 2.109375, + "learning_rate": 7.827324478178369e-06, + "loss": 0.4351, + "step": 165 + }, + { + "epoch": 0.008071025020177562, + "grad_norm": 3.109375, + "learning_rate": 8.064516129032258e-06, + "loss": 0.4319, + "step": 170 + }, + { + "epoch": 0.008308408109006314, + "grad_norm": 8.25, + "learning_rate": 8.30170777988615e-06, + "loss": 0.4297, + "step": 175 + }, + { + "epoch": 0.008545791197835067, + "grad_norm": 1.2890625, + "learning_rate": 8.538899430740038e-06, + "loss": 0.4251, + "step": 180 + }, + { + "epoch": 0.008783174286663819, + "grad_norm": 2.703125, + "learning_rate": 8.776091081593928e-06, + "loss": 0.418, + "step": 185 + }, + { + "epoch": 0.00902055737549257, + "grad_norm": 3.171875, + "learning_rate": 9.013282732447819e-06, + "loss": 0.415, + "step": 190 + }, + { + "epoch": 0.009257940464321322, + "grad_norm": 5.8125, + "learning_rate": 9.250474383301707e-06, + "loss": 0.4173, + "step": 195 + }, + { + "epoch": 0.009495323553150074, + "grad_norm": 57.75, + "learning_rate": 9.487666034155598e-06, + "loss": 0.4148, + "step": 200 + }, + { + "epoch": 0.009732706641978825, + "grad_norm": 8.5625, + "learning_rate": 9.724857685009488e-06, + "loss": 0.4059, + "step": 205 + }, + { + "epoch": 0.009970089730807577, + "grad_norm": 4.34375, + "learning_rate": 9.962049335863378e-06, + "loss": 0.4042, + "step": 210 + }, + { + "epoch": 0.01020747281963633, + "grad_norm": 3.75, + "learning_rate": 1.0199240986717267e-05, + "loss": 0.4064, + "step": 215 + }, + { + "epoch": 0.01044485590846508, + "grad_norm": 6.15625, + "learning_rate": 1.0436432637571159e-05, + "loss": 0.402, + "step": 220 + }, + { + "epoch": 0.010682238997293833, + "grad_norm": 10.5, + "learning_rate": 1.0673624288425049e-05, + "loss": 0.399, + "step": 225 + }, + { + "epoch": 0.010919622086122585, + "grad_norm": 1.578125, + "learning_rate": 1.0910815939278938e-05, + "loss": 0.3985, + "step": 230 + }, + { + "epoch": 0.011157005174951336, + "grad_norm": 8.25, + "learning_rate": 1.1148007590132828e-05, + "loss": 0.3927, + "step": 235 + }, + { + "epoch": 0.011394388263780088, + "grad_norm": 1.390625, + "learning_rate": 1.1385199240986718e-05, + "loss": 0.3848, + "step": 240 + }, + { + "epoch": 0.01163177135260884, + "grad_norm": 5.0, + "learning_rate": 1.1622390891840607e-05, + "loss": 0.3867, + "step": 245 + }, + { + "epoch": 0.011869154441437591, + "grad_norm": 1.6328125, + "learning_rate": 1.1859582542694497e-05, + "loss": 0.388, + "step": 250 + }, + { + "epoch": 0.012106537530266344, + "grad_norm": 1.0390625, + "learning_rate": 1.2096774193548388e-05, + "loss": 0.3836, + "step": 255 + }, + { + "epoch": 0.012343920619095096, + "grad_norm": 15.5, + "learning_rate": 1.2333965844402278e-05, + "loss": 0.3892, + "step": 260 + }, + { + "epoch": 0.012581303707923848, + "grad_norm": 2.765625, + "learning_rate": 1.2571157495256166e-05, + "loss": 0.3915, + "step": 265 + }, + { + "epoch": 0.012818686796752599, + "grad_norm": 9.9375, + "learning_rate": 1.2808349146110058e-05, + "loss": 0.3791, + "step": 270 + }, + { + "epoch": 0.013056069885581351, + "grad_norm": 4.375, + "learning_rate": 1.3045540796963949e-05, + "loss": 0.378, + "step": 275 + }, + { + "epoch": 0.013293452974410104, + "grad_norm": 8.4375, + "learning_rate": 1.3282732447817835e-05, + "loss": 0.3747, + "step": 280 + }, + { + "epoch": 0.013530836063238854, + "grad_norm": 1.2578125, + "learning_rate": 1.3519924098671727e-05, + "loss": 0.3721, + "step": 285 + }, + { + "epoch": 0.013768219152067607, + "grad_norm": 1.8125, + "learning_rate": 1.3757115749525618e-05, + "loss": 0.3664, + "step": 290 + }, + { + "epoch": 0.014005602240896359, + "grad_norm": 1.296875, + "learning_rate": 1.3994307400379508e-05, + "loss": 0.3654, + "step": 295 + }, + { + "epoch": 0.01424298532972511, + "grad_norm": 3.359375, + "learning_rate": 1.4231499051233396e-05, + "loss": 0.3587, + "step": 300 + }, + { + "epoch": 0.014480368418553862, + "grad_norm": 1.96875, + "learning_rate": 1.4468690702087287e-05, + "loss": 0.3565, + "step": 305 + }, + { + "epoch": 0.014717751507382614, + "grad_norm": 2.25, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.3563, + "step": 310 + }, + { + "epoch": 0.014955134596211365, + "grad_norm": 4.96875, + "learning_rate": 1.4943074003795068e-05, + "loss": 0.3539, + "step": 315 + }, + { + "epoch": 0.015192517685040118, + "grad_norm": 2.5, + "learning_rate": 1.5180265654648956e-05, + "loss": 0.3512, + "step": 320 + }, + { + "epoch": 0.01542990077386887, + "grad_norm": 3.53125, + "learning_rate": 1.5417457305502848e-05, + "loss": 0.3454, + "step": 325 + }, + { + "epoch": 0.015667283862697622, + "grad_norm": 2.53125, + "learning_rate": 1.5654648956356737e-05, + "loss": 0.3439, + "step": 330 + }, + { + "epoch": 0.015904666951526373, + "grad_norm": 1.46875, + "learning_rate": 1.5891840607210627e-05, + "loss": 0.3427, + "step": 335 + }, + { + "epoch": 0.016142050040355124, + "grad_norm": 2.0, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.3417, + "step": 340 + }, + { + "epoch": 0.016379433129183878, + "grad_norm": 4.375, + "learning_rate": 1.6366223908918406e-05, + "loss": 0.3331, + "step": 345 + }, + { + "epoch": 0.01661681621801263, + "grad_norm": 5.25, + "learning_rate": 1.66034155597723e-05, + "loss": 0.338, + "step": 350 + }, + { + "epoch": 0.01685419930684138, + "grad_norm": 2.359375, + "learning_rate": 1.6840607210626186e-05, + "loss": 0.3295, + "step": 355 + }, + { + "epoch": 0.017091582395670133, + "grad_norm": 14.875, + "learning_rate": 1.7077798861480076e-05, + "loss": 0.3363, + "step": 360 + }, + { + "epoch": 0.017328965484498884, + "grad_norm": 3.6875, + "learning_rate": 1.731499051233397e-05, + "loss": 0.3254, + "step": 365 + }, + { + "epoch": 0.017566348573327638, + "grad_norm": 1.7578125, + "learning_rate": 1.7552182163187855e-05, + "loss": 0.3201, + "step": 370 + }, + { + "epoch": 0.01780373166215639, + "grad_norm": 1.390625, + "learning_rate": 1.7789373814041745e-05, + "loss": 0.3217, + "step": 375 + }, + { + "epoch": 0.01804111475098514, + "grad_norm": 10.4375, + "learning_rate": 1.8026565464895638e-05, + "loss": 0.3203, + "step": 380 + }, + { + "epoch": 0.018278497839813893, + "grad_norm": 1.1875, + "learning_rate": 1.8263757115749528e-05, + "loss": 0.3119, + "step": 385 + }, + { + "epoch": 0.018515880928642644, + "grad_norm": 1.2109375, + "learning_rate": 1.8500948766603414e-05, + "loss": 0.3157, + "step": 390 + }, + { + "epoch": 0.018753264017471395, + "grad_norm": 2.078125, + "learning_rate": 1.8738140417457307e-05, + "loss": 0.3172, + "step": 395 + }, + { + "epoch": 0.01899064710630015, + "grad_norm": 1.1640625, + "learning_rate": 1.8975332068311197e-05, + "loss": 0.3157, + "step": 400 + }, + { + "epoch": 0.0192280301951289, + "grad_norm": 1.109375, + "learning_rate": 1.9212523719165086e-05, + "loss": 0.3091, + "step": 405 + }, + { + "epoch": 0.01946541328395765, + "grad_norm": 27.0, + "learning_rate": 1.9449715370018976e-05, + "loss": 0.3054, + "step": 410 + }, + { + "epoch": 0.019702796372786404, + "grad_norm": 1.3203125, + "learning_rate": 1.9686907020872866e-05, + "loss": 0.3147, + "step": 415 + }, + { + "epoch": 0.019940179461615155, + "grad_norm": 1.3515625, + "learning_rate": 1.9924098671726755e-05, + "loss": 0.3029, + "step": 420 + }, + { + "epoch": 0.020177562550443905, + "grad_norm": 1.625, + "learning_rate": 2.0161290322580645e-05, + "loss": 0.3023, + "step": 425 + }, + { + "epoch": 0.02041494563927266, + "grad_norm": 1.6171875, + "learning_rate": 2.0398481973434535e-05, + "loss": 0.3032, + "step": 430 + }, + { + "epoch": 0.02065232872810141, + "grad_norm": 1.4921875, + "learning_rate": 2.0635673624288428e-05, + "loss": 0.2973, + "step": 435 + }, + { + "epoch": 0.02088971181693016, + "grad_norm": 1.2734375, + "learning_rate": 2.0872865275142318e-05, + "loss": 0.2932, + "step": 440 + }, + { + "epoch": 0.021127094905758915, + "grad_norm": 1.34375, + "learning_rate": 2.1110056925996204e-05, + "loss": 0.2915, + "step": 445 + }, + { + "epoch": 0.021364477994587665, + "grad_norm": 1.109375, + "learning_rate": 2.1347248576850097e-05, + "loss": 0.2929, + "step": 450 + }, + { + "epoch": 0.021601861083416416, + "grad_norm": 1.1796875, + "learning_rate": 2.1584440227703987e-05, + "loss": 0.2883, + "step": 455 + }, + { + "epoch": 0.02183924417224517, + "grad_norm": 1.203125, + "learning_rate": 2.1821631878557876e-05, + "loss": 0.2882, + "step": 460 + }, + { + "epoch": 0.02207662726107392, + "grad_norm": 1.7890625, + "learning_rate": 2.2058823529411766e-05, + "loss": 0.2876, + "step": 465 + }, + { + "epoch": 0.02231401034990267, + "grad_norm": 1.65625, + "learning_rate": 2.2296015180265656e-05, + "loss": 0.2888, + "step": 470 + }, + { + "epoch": 0.022551393438731426, + "grad_norm": 2.28125, + "learning_rate": 2.2533206831119546e-05, + "loss": 0.2841, + "step": 475 + }, + { + "epoch": 0.022788776527560176, + "grad_norm": 1.3984375, + "learning_rate": 2.2770398481973435e-05, + "loss": 0.284, + "step": 480 + }, + { + "epoch": 0.023026159616388927, + "grad_norm": 2.1875, + "learning_rate": 2.3007590132827325e-05, + "loss": 0.2825, + "step": 485 + }, + { + "epoch": 0.02326354270521768, + "grad_norm": 0.95703125, + "learning_rate": 2.3244781783681215e-05, + "loss": 0.2815, + "step": 490 + }, + { + "epoch": 0.02350092579404643, + "grad_norm": 2.78125, + "learning_rate": 2.3481973434535108e-05, + "loss": 0.2851, + "step": 495 + }, + { + "epoch": 0.023738308882875182, + "grad_norm": 1.6328125, + "learning_rate": 2.3719165085388994e-05, + "loss": 0.2821, + "step": 500 + }, + { + "epoch": 0.023975691971703936, + "grad_norm": 1.3046875, + "learning_rate": 2.3956356736242884e-05, + "loss": 0.2736, + "step": 505 + }, + { + "epoch": 0.024213075060532687, + "grad_norm": 1.5390625, + "learning_rate": 2.4193548387096777e-05, + "loss": 0.2786, + "step": 510 + }, + { + "epoch": 0.02445045814936144, + "grad_norm": 2.890625, + "learning_rate": 2.4430740037950663e-05, + "loss": 0.2786, + "step": 515 + }, + { + "epoch": 0.024687841238190192, + "grad_norm": 1.078125, + "learning_rate": 2.4667931688804556e-05, + "loss": 0.2805, + "step": 520 + }, + { + "epoch": 0.024925224327018942, + "grad_norm": 1.203125, + "learning_rate": 2.4905123339658446e-05, + "loss": 0.2755, + "step": 525 + }, + { + "epoch": 0.025162607415847697, + "grad_norm": 1.25, + "learning_rate": 2.5142314990512332e-05, + "loss": 0.2753, + "step": 530 + }, + { + "epoch": 0.025399990504676447, + "grad_norm": 1.640625, + "learning_rate": 2.5379506641366225e-05, + "loss": 0.2734, + "step": 535 + }, + { + "epoch": 0.025637373593505198, + "grad_norm": 1.1171875, + "learning_rate": 2.5616698292220115e-05, + "loss": 0.2695, + "step": 540 + }, + { + "epoch": 0.025874756682333952, + "grad_norm": 1.1875, + "learning_rate": 2.5853889943074005e-05, + "loss": 0.2759, + "step": 545 + }, + { + "epoch": 0.026112139771162703, + "grad_norm": 3.0, + "learning_rate": 2.6091081593927898e-05, + "loss": 0.2725, + "step": 550 + }, + { + "epoch": 0.026349522859991453, + "grad_norm": 2.0625, + "learning_rate": 2.6328273244781788e-05, + "loss": 0.2766, + "step": 555 + }, + { + "epoch": 0.026586905948820207, + "grad_norm": 1.046875, + "learning_rate": 2.656546489563567e-05, + "loss": 0.2722, + "step": 560 + }, + { + "epoch": 0.026824289037648958, + "grad_norm": 1.3828125, + "learning_rate": 2.6802656546489564e-05, + "loss": 0.2738, + "step": 565 + }, + { + "epoch": 0.02706167212647771, + "grad_norm": 1.0546875, + "learning_rate": 2.7039848197343453e-05, + "loss": 0.2722, + "step": 570 + }, + { + "epoch": 0.027299055215306463, + "grad_norm": 0.96484375, + "learning_rate": 2.7277039848197343e-05, + "loss": 0.2638, + "step": 575 + }, + { + "epoch": 0.027536438304135213, + "grad_norm": 1.328125, + "learning_rate": 2.7514231499051236e-05, + "loss": 0.2732, + "step": 580 + }, + { + "epoch": 0.027773821392963964, + "grad_norm": 1.1171875, + "learning_rate": 2.7751423149905126e-05, + "loss": 0.2713, + "step": 585 + }, + { + "epoch": 0.028011204481792718, + "grad_norm": 1.859375, + "learning_rate": 2.7988614800759016e-05, + "loss": 0.2721, + "step": 590 + }, + { + "epoch": 0.02824858757062147, + "grad_norm": 1.203125, + "learning_rate": 2.822580645161291e-05, + "loss": 0.2693, + "step": 595 + }, + { + "epoch": 0.02848597065945022, + "grad_norm": 1.203125, + "learning_rate": 2.846299810246679e-05, + "loss": 0.264, + "step": 600 + }, + { + "epoch": 0.028723353748278974, + "grad_norm": 1.234375, + "learning_rate": 2.8700189753320685e-05, + "loss": 0.2698, + "step": 605 + }, + { + "epoch": 0.028960736837107724, + "grad_norm": 2.75, + "learning_rate": 2.8937381404174574e-05, + "loss": 0.2677, + "step": 610 + }, + { + "epoch": 0.029198119925936475, + "grad_norm": 0.9453125, + "learning_rate": 2.9174573055028464e-05, + "loss": 0.2684, + "step": 615 + }, + { + "epoch": 0.02943550301476523, + "grad_norm": 1.3828125, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.2652, + "step": 620 + }, + { + "epoch": 0.02967288610359398, + "grad_norm": 1.2421875, + "learning_rate": 2.9648956356736247e-05, + "loss": 0.2622, + "step": 625 + }, + { + "epoch": 0.02991026919242273, + "grad_norm": 1.4453125, + "learning_rate": 2.9886148007590137e-05, + "loss": 0.2676, + "step": 630 + }, + { + "epoch": 0.030147652281251484, + "grad_norm": 1.0390625, + "learning_rate": 3.0123339658444023e-05, + "loss": 0.2665, + "step": 635 + }, + { + "epoch": 0.030385035370080235, + "grad_norm": 1.375, + "learning_rate": 3.0360531309297913e-05, + "loss": 0.2595, + "step": 640 + }, + { + "epoch": 0.030622418458908986, + "grad_norm": 1.046875, + "learning_rate": 3.0597722960151806e-05, + "loss": 0.2637, + "step": 645 + }, + { + "epoch": 0.03085980154773774, + "grad_norm": 1.109375, + "learning_rate": 3.0834914611005695e-05, + "loss": 0.2647, + "step": 650 + }, + { + "epoch": 0.03109718463656649, + "grad_norm": 0.8984375, + "learning_rate": 3.1072106261859585e-05, + "loss": 0.2639, + "step": 655 + }, + { + "epoch": 0.031334567725395245, + "grad_norm": 1.640625, + "learning_rate": 3.1309297912713475e-05, + "loss": 0.2638, + "step": 660 + }, + { + "epoch": 0.031571950814223995, + "grad_norm": 1.5, + "learning_rate": 3.1546489563567364e-05, + "loss": 0.2617, + "step": 665 + }, + { + "epoch": 0.031809333903052746, + "grad_norm": 1.5703125, + "learning_rate": 3.1783681214421254e-05, + "loss": 0.2634, + "step": 670 + }, + { + "epoch": 0.032046716991881496, + "grad_norm": 1.46875, + "learning_rate": 3.2020872865275144e-05, + "loss": 0.26, + "step": 675 + }, + { + "epoch": 0.03228410008071025, + "grad_norm": 2.0, + "learning_rate": 3.2258064516129034e-05, + "loss": 0.2587, + "step": 680 + }, + { + "epoch": 0.032521483169539005, + "grad_norm": 1.0078125, + "learning_rate": 3.249525616698292e-05, + "loss": 0.2663, + "step": 685 + }, + { + "epoch": 0.032758866258367755, + "grad_norm": 1.109375, + "learning_rate": 3.273244781783681e-05, + "loss": 0.2622, + "step": 690 + }, + { + "epoch": 0.032996249347196506, + "grad_norm": 1.3671875, + "learning_rate": 3.29696394686907e-05, + "loss": 0.2576, + "step": 695 + }, + { + "epoch": 0.03323363243602526, + "grad_norm": 0.98046875, + "learning_rate": 3.32068311195446e-05, + "loss": 0.2614, + "step": 700 + }, + { + "epoch": 0.03347101552485401, + "grad_norm": 1.609375, + "learning_rate": 3.344402277039848e-05, + "loss": 0.2586, + "step": 705 + }, + { + "epoch": 0.03370839861368276, + "grad_norm": 0.98046875, + "learning_rate": 3.368121442125237e-05, + "loss": 0.2617, + "step": 710 + }, + { + "epoch": 0.033945781702511515, + "grad_norm": 1.1484375, + "learning_rate": 3.391840607210626e-05, + "loss": 0.2608, + "step": 715 + }, + { + "epoch": 0.034183164791340266, + "grad_norm": 1.1015625, + "learning_rate": 3.415559772296015e-05, + "loss": 0.2566, + "step": 720 + }, + { + "epoch": 0.03442054788016902, + "grad_norm": 1.7109375, + "learning_rate": 3.439278937381404e-05, + "loss": 0.256, + "step": 725 + }, + { + "epoch": 0.03465793096899777, + "grad_norm": 1.1875, + "learning_rate": 3.462998102466794e-05, + "loss": 0.2636, + "step": 730 + }, + { + "epoch": 0.03489531405782652, + "grad_norm": 2.0, + "learning_rate": 3.486717267552183e-05, + "loss": 0.2573, + "step": 735 + }, + { + "epoch": 0.035132697146655276, + "grad_norm": 1.6796875, + "learning_rate": 3.510436432637571e-05, + "loss": 0.2541, + "step": 740 + }, + { + "epoch": 0.035370080235484026, + "grad_norm": 1.171875, + "learning_rate": 3.53415559772296e-05, + "loss": 0.2561, + "step": 745 + }, + { + "epoch": 0.03560746332431278, + "grad_norm": 1.0390625, + "learning_rate": 3.557874762808349e-05, + "loss": 0.258, + "step": 750 + }, + { + "epoch": 0.03584484641314153, + "grad_norm": 1.390625, + "learning_rate": 3.5815939278937386e-05, + "loss": 0.2513, + "step": 755 + }, + { + "epoch": 0.03608222950197028, + "grad_norm": 1.6171875, + "learning_rate": 3.6053130929791276e-05, + "loss": 0.2604, + "step": 760 + }, + { + "epoch": 0.03631961259079903, + "grad_norm": 1.8125, + "learning_rate": 3.6290322580645165e-05, + "loss": 0.2602, + "step": 765 + }, + { + "epoch": 0.036556995679627786, + "grad_norm": 1.1015625, + "learning_rate": 3.6527514231499055e-05, + "loss": 0.2526, + "step": 770 + }, + { + "epoch": 0.03679437876845654, + "grad_norm": 1.3125, + "learning_rate": 3.6764705882352945e-05, + "loss": 0.2548, + "step": 775 + }, + { + "epoch": 0.03703176185728529, + "grad_norm": 1.2265625, + "learning_rate": 3.700189753320683e-05, + "loss": 0.2536, + "step": 780 + }, + { + "epoch": 0.03726914494611404, + "grad_norm": 1.7265625, + "learning_rate": 3.7239089184060724e-05, + "loss": 0.2559, + "step": 785 + }, + { + "epoch": 0.03750652803494279, + "grad_norm": 0.87109375, + "learning_rate": 3.7476280834914614e-05, + "loss": 0.2551, + "step": 790 + }, + { + "epoch": 0.03774391112377154, + "grad_norm": 1.6640625, + "learning_rate": 3.7713472485768504e-05, + "loss": 0.2518, + "step": 795 + }, + { + "epoch": 0.0379812942126003, + "grad_norm": 1.0703125, + "learning_rate": 3.795066413662239e-05, + "loss": 0.2523, + "step": 800 + }, + { + "epoch": 0.03821867730142905, + "grad_norm": 1.2734375, + "learning_rate": 3.818785578747628e-05, + "loss": 0.2541, + "step": 805 + }, + { + "epoch": 0.0384560603902578, + "grad_norm": 1.0390625, + "learning_rate": 3.842504743833017e-05, + "loss": 0.2558, + "step": 810 + }, + { + "epoch": 0.03869344347908655, + "grad_norm": 1.421875, + "learning_rate": 3.866223908918406e-05, + "loss": 0.2529, + "step": 815 + }, + { + "epoch": 0.0389308265679153, + "grad_norm": 1.0, + "learning_rate": 3.889943074003795e-05, + "loss": 0.2549, + "step": 820 + }, + { + "epoch": 0.03916820965674405, + "grad_norm": 2.6875, + "learning_rate": 3.913662239089184e-05, + "loss": 0.2501, + "step": 825 + }, + { + "epoch": 0.03940559274557281, + "grad_norm": 1.2109375, + "learning_rate": 3.937381404174573e-05, + "loss": 0.2542, + "step": 830 + }, + { + "epoch": 0.03964297583440156, + "grad_norm": 3.515625, + "learning_rate": 3.961100569259962e-05, + "loss": 0.2523, + "step": 835 + }, + { + "epoch": 0.03988035892323031, + "grad_norm": 0.94921875, + "learning_rate": 3.984819734345351e-05, + "loss": 0.2507, + "step": 840 + }, + { + "epoch": 0.04011774201205906, + "grad_norm": 0.94921875, + "learning_rate": 4.008538899430741e-05, + "loss": 0.25, + "step": 845 + }, + { + "epoch": 0.04035512510088781, + "grad_norm": 1.1171875, + "learning_rate": 4.032258064516129e-05, + "loss": 0.2536, + "step": 850 + }, + { + "epoch": 0.04059250818971656, + "grad_norm": 0.96484375, + "learning_rate": 4.055977229601518e-05, + "loss": 0.2566, + "step": 855 + }, + { + "epoch": 0.04082989127854532, + "grad_norm": 0.90234375, + "learning_rate": 4.079696394686907e-05, + "loss": 0.2488, + "step": 860 + }, + { + "epoch": 0.04106727436737407, + "grad_norm": 0.86328125, + "learning_rate": 4.103415559772296e-05, + "loss": 0.253, + "step": 865 + }, + { + "epoch": 0.04130465745620282, + "grad_norm": 1.3515625, + "learning_rate": 4.1271347248576856e-05, + "loss": 0.2486, + "step": 870 + }, + { + "epoch": 0.04154204054503157, + "grad_norm": 1.078125, + "learning_rate": 4.1508538899430746e-05, + "loss": 0.2519, + "step": 875 + }, + { + "epoch": 0.04177942363386032, + "grad_norm": 1.4375, + "learning_rate": 4.1745730550284635e-05, + "loss": 0.2524, + "step": 880 + }, + { + "epoch": 0.04201680672268908, + "grad_norm": 1.046875, + "learning_rate": 4.198292220113852e-05, + "loss": 0.2548, + "step": 885 + }, + { + "epoch": 0.04225418981151783, + "grad_norm": 0.953125, + "learning_rate": 4.222011385199241e-05, + "loss": 0.252, + "step": 890 + }, + { + "epoch": 0.04249157290034658, + "grad_norm": 1.2109375, + "learning_rate": 4.24573055028463e-05, + "loss": 0.2532, + "step": 895 + }, + { + "epoch": 0.04272895598917533, + "grad_norm": 0.81640625, + "learning_rate": 4.2694497153700194e-05, + "loss": 0.2519, + "step": 900 + }, + { + "epoch": 0.04296633907800408, + "grad_norm": 1.2890625, + "learning_rate": 4.2931688804554084e-05, + "loss": 0.25, + "step": 905 + }, + { + "epoch": 0.04320372216683283, + "grad_norm": 1.171875, + "learning_rate": 4.3168880455407974e-05, + "loss": 0.2497, + "step": 910 + }, + { + "epoch": 0.04344110525566159, + "grad_norm": 1.4140625, + "learning_rate": 4.340607210626186e-05, + "loss": 0.2502, + "step": 915 + }, + { + "epoch": 0.04367848834449034, + "grad_norm": 0.79296875, + "learning_rate": 4.364326375711575e-05, + "loss": 0.2532, + "step": 920 + }, + { + "epoch": 0.04391587143331909, + "grad_norm": 0.78125, + "learning_rate": 4.388045540796964e-05, + "loss": 0.2477, + "step": 925 + }, + { + "epoch": 0.04415325452214784, + "grad_norm": 1.203125, + "learning_rate": 4.411764705882353e-05, + "loss": 0.2499, + "step": 930 + }, + { + "epoch": 0.04439063761097659, + "grad_norm": 1.25, + "learning_rate": 4.435483870967742e-05, + "loss": 0.2453, + "step": 935 + }, + { + "epoch": 0.04462802069980534, + "grad_norm": 1.484375, + "learning_rate": 4.459203036053131e-05, + "loss": 0.2523, + "step": 940 + }, + { + "epoch": 0.0448654037886341, + "grad_norm": 1.5, + "learning_rate": 4.48292220113852e-05, + "loss": 0.249, + "step": 945 + }, + { + "epoch": 0.04510278687746285, + "grad_norm": 1.203125, + "learning_rate": 4.506641366223909e-05, + "loss": 0.2482, + "step": 950 + }, + { + "epoch": 0.0453401699662916, + "grad_norm": 1.203125, + "learning_rate": 4.530360531309298e-05, + "loss": 0.248, + "step": 955 + }, + { + "epoch": 0.04557755305512035, + "grad_norm": 1.3203125, + "learning_rate": 4.554079696394687e-05, + "loss": 0.2487, + "step": 960 + }, + { + "epoch": 0.0458149361439491, + "grad_norm": 1.4140625, + "learning_rate": 4.577798861480076e-05, + "loss": 0.2479, + "step": 965 + }, + { + "epoch": 0.046052319232777854, + "grad_norm": 0.75390625, + "learning_rate": 4.601518026565465e-05, + "loss": 0.2479, + "step": 970 + }, + { + "epoch": 0.04628970232160661, + "grad_norm": 1.0390625, + "learning_rate": 4.625237191650854e-05, + "loss": 0.2423, + "step": 975 + }, + { + "epoch": 0.04652708541043536, + "grad_norm": 0.921875, + "learning_rate": 4.648956356736243e-05, + "loss": 0.2474, + "step": 980 + }, + { + "epoch": 0.04676446849926411, + "grad_norm": 0.89453125, + "learning_rate": 4.6726755218216326e-05, + "loss": 0.2494, + "step": 985 + }, + { + "epoch": 0.04700185158809286, + "grad_norm": 0.859375, + "learning_rate": 4.6963946869070216e-05, + "loss": 0.2464, + "step": 990 + }, + { + "epoch": 0.047239234676921614, + "grad_norm": 1.3125, + "learning_rate": 4.72011385199241e-05, + "loss": 0.2495, + "step": 995 + }, + { + "epoch": 0.047476617765750365, + "grad_norm": 1.0859375, + "learning_rate": 4.743833017077799e-05, + "loss": 0.251, + "step": 1000 + }, + { + "epoch": 0.04771400085457912, + "grad_norm": 1.0703125, + "learning_rate": 4.767552182163188e-05, + "loss": 0.2509, + "step": 1005 + }, + { + "epoch": 0.04795138394340787, + "grad_norm": 0.77734375, + "learning_rate": 4.791271347248577e-05, + "loss": 0.2444, + "step": 1010 + }, + { + "epoch": 0.048188767032236623, + "grad_norm": 1.2734375, + "learning_rate": 4.8149905123339664e-05, + "loss": 0.2512, + "step": 1015 + }, + { + "epoch": 0.048426150121065374, + "grad_norm": 0.8046875, + "learning_rate": 4.8387096774193554e-05, + "loss": 0.2437, + "step": 1020 + }, + { + "epoch": 0.048663533209894125, + "grad_norm": 0.75390625, + "learning_rate": 4.8624288425047443e-05, + "loss": 0.2486, + "step": 1025 + }, + { + "epoch": 0.04890091629872288, + "grad_norm": 0.98046875, + "learning_rate": 4.8861480075901326e-05, + "loss": 0.2476, + "step": 1030 + }, + { + "epoch": 0.04913829938755163, + "grad_norm": 0.8671875, + "learning_rate": 4.9098671726755216e-05, + "loss": 0.2467, + "step": 1035 + }, + { + "epoch": 0.049375682476380384, + "grad_norm": 0.73828125, + "learning_rate": 4.933586337760911e-05, + "loss": 0.2447, + "step": 1040 + }, + { + "epoch": 0.049613065565209134, + "grad_norm": 0.734375, + "learning_rate": 4.9573055028463e-05, + "loss": 0.2482, + "step": 1045 + }, + { + "epoch": 0.049850448654037885, + "grad_norm": 1.0, + "learning_rate": 4.981024667931689e-05, + "loss": 0.2468, + "step": 1050 + }, + { + "epoch": 0.050087831742866636, + "grad_norm": 0.890625, + "learning_rate": 4.9999999722667036e-05, + "loss": 0.245, + "step": 1055 + }, + { + "epoch": 0.05032521483169539, + "grad_norm": 0.77734375, + "learning_rate": 4.999999001601389e-05, + "loss": 0.2473, + "step": 1060 + }, + { + "epoch": 0.050562597920524144, + "grad_norm": 0.76953125, + "learning_rate": 4.999996644271922e-05, + "loss": 0.2435, + "step": 1065 + }, + { + "epoch": 0.050799981009352894, + "grad_norm": 1.171875, + "learning_rate": 4.999992900279755e-05, + "loss": 0.2468, + "step": 1070 + }, + { + "epoch": 0.051037364098181645, + "grad_norm": 1.2421875, + "learning_rate": 4.999987769627194e-05, + "loss": 0.247, + "step": 1075 + }, + { + "epoch": 0.051274747187010396, + "grad_norm": 1.0625, + "learning_rate": 4.9999812523174015e-05, + "loss": 0.2403, + "step": 1080 + }, + { + "epoch": 0.051512130275839146, + "grad_norm": 0.90234375, + "learning_rate": 4.9999733483543956e-05, + "loss": 0.2423, + "step": 1085 + }, + { + "epoch": 0.051749513364667904, + "grad_norm": 0.9609375, + "learning_rate": 4.999964057743047e-05, + "loss": 0.2454, + "step": 1090 + }, + { + "epoch": 0.051986896453496655, + "grad_norm": 0.98828125, + "learning_rate": 4.99995338048908e-05, + "loss": 0.2477, + "step": 1095 + }, + { + "epoch": 0.052224279542325405, + "grad_norm": 0.80078125, + "learning_rate": 4.999941316599075e-05, + "loss": 0.2407, + "step": 1100 + }, + { + "epoch": 0.052461662631154156, + "grad_norm": 0.85546875, + "learning_rate": 4.99992786608047e-05, + "loss": 0.2487, + "step": 1105 + }, + { + "epoch": 0.05269904571998291, + "grad_norm": 0.9375, + "learning_rate": 4.999913028941551e-05, + "loss": 0.248, + "step": 1110 + }, + { + "epoch": 0.05293642880881166, + "grad_norm": 1.0234375, + "learning_rate": 4.999896805191463e-05, + "loss": 0.25, + "step": 1115 + }, + { + "epoch": 0.053173811897640415, + "grad_norm": 0.75, + "learning_rate": 4.999879194840206e-05, + "loss": 0.2412, + "step": 1120 + }, + { + "epoch": 0.053411194986469165, + "grad_norm": 0.72265625, + "learning_rate": 4.999860197898631e-05, + "loss": 0.2481, + "step": 1125 + }, + { + "epoch": 0.053648578075297916, + "grad_norm": 0.78515625, + "learning_rate": 4.9998398143784466e-05, + "loss": 0.2418, + "step": 1130 + }, + { + "epoch": 0.05388596116412667, + "grad_norm": 0.99609375, + "learning_rate": 4.9998180442922165e-05, + "loss": 0.2485, + "step": 1135 + }, + { + "epoch": 0.05412334425295542, + "grad_norm": 0.87890625, + "learning_rate": 4.999794887653356e-05, + "loss": 0.2451, + "step": 1140 + }, + { + "epoch": 0.05436072734178417, + "grad_norm": 0.9453125, + "learning_rate": 4.9997703444761373e-05, + "loss": 0.2468, + "step": 1145 + }, + { + "epoch": 0.054598110430612926, + "grad_norm": 0.7890625, + "learning_rate": 4.999744414775685e-05, + "loss": 0.2445, + "step": 1150 + }, + { + "epoch": 0.054835493519441676, + "grad_norm": 0.82421875, + "learning_rate": 4.99971709856798e-05, + "loss": 0.2433, + "step": 1155 + }, + { + "epoch": 0.05507287660827043, + "grad_norm": 0.7578125, + "learning_rate": 4.9996883958698584e-05, + "loss": 0.2463, + "step": 1160 + }, + { + "epoch": 0.05531025969709918, + "grad_norm": 0.921875, + "learning_rate": 4.9996583066990076e-05, + "loss": 0.2443, + "step": 1165 + }, + { + "epoch": 0.05554764278592793, + "grad_norm": 1.046875, + "learning_rate": 4.999626831073972e-05, + "loss": 0.2365, + "step": 1170 + }, + { + "epoch": 0.055785025874756686, + "grad_norm": 0.91015625, + "learning_rate": 4.999593969014151e-05, + "loss": 0.2413, + "step": 1175 + }, + { + "epoch": 0.056022408963585436, + "grad_norm": 0.8203125, + "learning_rate": 4.9995597205397966e-05, + "loss": 0.2437, + "step": 1180 + }, + { + "epoch": 0.05625979205241419, + "grad_norm": 0.83984375, + "learning_rate": 4.9995240856720146e-05, + "loss": 0.2431, + "step": 1185 + }, + { + "epoch": 0.05649717514124294, + "grad_norm": 0.765625, + "learning_rate": 4.999487064432769e-05, + "loss": 0.2423, + "step": 1190 + }, + { + "epoch": 0.05673455823007169, + "grad_norm": 0.625, + "learning_rate": 4.999448656844874e-05, + "loss": 0.2386, + "step": 1195 + }, + { + "epoch": 0.05697194131890044, + "grad_norm": 1.125, + "learning_rate": 4.999408862932002e-05, + "loss": 0.2423, + "step": 1200 + }, + { + "epoch": 0.057209324407729197, + "grad_norm": 0.9296875, + "learning_rate": 4.999367682718675e-05, + "loss": 0.2455, + "step": 1205 + }, + { + "epoch": 0.05744670749655795, + "grad_norm": 0.80078125, + "learning_rate": 4.999325116230275e-05, + "loss": 0.2453, + "step": 1210 + }, + { + "epoch": 0.0576840905853867, + "grad_norm": 0.79296875, + "learning_rate": 4.999281163493034e-05, + "loss": 0.2472, + "step": 1215 + }, + { + "epoch": 0.05792147367421545, + "grad_norm": 0.91796875, + "learning_rate": 4.99923582453404e-05, + "loss": 0.2456, + "step": 1220 + }, + { + "epoch": 0.0581588567630442, + "grad_norm": 0.94140625, + "learning_rate": 4.999189099381235e-05, + "loss": 0.2411, + "step": 1225 + }, + { + "epoch": 0.05839623985187295, + "grad_norm": 1.0234375, + "learning_rate": 4.999140988063416e-05, + "loss": 0.2395, + "step": 1230 + }, + { + "epoch": 0.05863362294070171, + "grad_norm": 0.6015625, + "learning_rate": 4.9990914906102336e-05, + "loss": 0.2377, + "step": 1235 + }, + { + "epoch": 0.05887100602953046, + "grad_norm": 0.68359375, + "learning_rate": 4.9990406070521935e-05, + "loss": 0.2433, + "step": 1240 + }, + { + "epoch": 0.05910838911835921, + "grad_norm": 0.7109375, + "learning_rate": 4.998988337420654e-05, + "loss": 0.2397, + "step": 1245 + }, + { + "epoch": 0.05934577220718796, + "grad_norm": 0.7109375, + "learning_rate": 4.9989346817478286e-05, + "loss": 0.2427, + "step": 1250 + }, + { + "epoch": 0.05958315529601671, + "grad_norm": 0.80078125, + "learning_rate": 4.998879640066786e-05, + "loss": 0.2375, + "step": 1255 + }, + { + "epoch": 0.05982053838484546, + "grad_norm": 0.828125, + "learning_rate": 4.998823212411447e-05, + "loss": 0.2416, + "step": 1260 + }, + { + "epoch": 0.06005792147367422, + "grad_norm": 0.6953125, + "learning_rate": 4.998765398816589e-05, + "loss": 0.2423, + "step": 1265 + }, + { + "epoch": 0.06029530456250297, + "grad_norm": 0.82421875, + "learning_rate": 4.998706199317841e-05, + "loss": 0.2411, + "step": 1270 + }, + { + "epoch": 0.06053268765133172, + "grad_norm": 0.63671875, + "learning_rate": 4.9986456139516877e-05, + "loss": 0.2403, + "step": 1275 + }, + { + "epoch": 0.06077007074016047, + "grad_norm": 0.66796875, + "learning_rate": 4.998583642755468e-05, + "loss": 0.2406, + "step": 1280 + }, + { + "epoch": 0.06100745382898922, + "grad_norm": 0.734375, + "learning_rate": 4.998520285767375e-05, + "loss": 0.2385, + "step": 1285 + }, + { + "epoch": 0.06124483691781797, + "grad_norm": 0.94140625, + "learning_rate": 4.998455543026454e-05, + "loss": 0.2385, + "step": 1290 + }, + { + "epoch": 0.06148222000664673, + "grad_norm": 0.73046875, + "learning_rate": 4.9983894145726074e-05, + "loss": 0.2369, + "step": 1295 + }, + { + "epoch": 0.06171960309547548, + "grad_norm": 0.6484375, + "learning_rate": 4.9983219004465874e-05, + "loss": 0.2384, + "step": 1300 + }, + { + "epoch": 0.06195698618430423, + "grad_norm": 0.6796875, + "learning_rate": 4.998253000690004e-05, + "loss": 0.2425, + "step": 1305 + }, + { + "epoch": 0.06219436927313298, + "grad_norm": 0.77734375, + "learning_rate": 4.99818271534532e-05, + "loss": 0.2413, + "step": 1310 + }, + { + "epoch": 0.06243175236196173, + "grad_norm": 0.64453125, + "learning_rate": 4.998111044455853e-05, + "loss": 0.2405, + "step": 1315 + }, + { + "epoch": 0.06266913545079049, + "grad_norm": 0.72265625, + "learning_rate": 4.9980379880657705e-05, + "loss": 0.2368, + "step": 1320 + }, + { + "epoch": 0.06290651853961923, + "grad_norm": 0.71875, + "learning_rate": 4.9979635462201e-05, + "loss": 0.2421, + "step": 1325 + }, + { + "epoch": 0.06314390162844799, + "grad_norm": 0.71875, + "learning_rate": 4.997887718964717e-05, + "loss": 0.2386, + "step": 1330 + }, + { + "epoch": 0.06338128471727675, + "grad_norm": 0.66015625, + "learning_rate": 4.9978105063463546e-05, + "loss": 0.2427, + "step": 1335 + }, + { + "epoch": 0.06361866780610549, + "grad_norm": 0.75390625, + "learning_rate": 4.997731908412599e-05, + "loss": 0.2418, + "step": 1340 + }, + { + "epoch": 0.06385605089493425, + "grad_norm": 0.75390625, + "learning_rate": 4.997651925211889e-05, + "loss": 0.2404, + "step": 1345 + }, + { + "epoch": 0.06409343398376299, + "grad_norm": 0.72265625, + "learning_rate": 4.997570556793519e-05, + "loss": 0.2398, + "step": 1350 + }, + { + "epoch": 0.06433081707259175, + "grad_norm": 0.54296875, + "learning_rate": 4.997487803207635e-05, + "loss": 0.2388, + "step": 1355 + }, + { + "epoch": 0.0645682001614205, + "grad_norm": 0.73828125, + "learning_rate": 4.997403664505238e-05, + "loss": 0.2342, + "step": 1360 + }, + { + "epoch": 0.06480558325024925, + "grad_norm": 0.64453125, + "learning_rate": 4.9973181407381817e-05, + "loss": 0.236, + "step": 1365 + }, + { + "epoch": 0.06504296633907801, + "grad_norm": 0.75, + "learning_rate": 4.9972312319591754e-05, + "loss": 0.2405, + "step": 1370 + }, + { + "epoch": 0.06528034942790675, + "grad_norm": 1.21875, + "learning_rate": 4.99714293822178e-05, + "loss": 0.2393, + "step": 1375 + }, + { + "epoch": 0.06551773251673551, + "grad_norm": 0.765625, + "learning_rate": 4.9970532595804086e-05, + "loss": 0.2377, + "step": 1380 + }, + { + "epoch": 0.06575511560556425, + "grad_norm": 0.9609375, + "learning_rate": 4.996962196090332e-05, + "loss": 0.2401, + "step": 1385 + }, + { + "epoch": 0.06599249869439301, + "grad_norm": 0.76171875, + "learning_rate": 4.9968697478076725e-05, + "loss": 0.2417, + "step": 1390 + }, + { + "epoch": 0.06622988178322177, + "grad_norm": 0.6875, + "learning_rate": 4.996775914789405e-05, + "loss": 0.2416, + "step": 1395 + }, + { + "epoch": 0.06646726487205051, + "grad_norm": 0.95703125, + "learning_rate": 4.9966806970933566e-05, + "loss": 0.2352, + "step": 1400 + }, + { + "epoch": 0.06670464796087927, + "grad_norm": 0.72265625, + "learning_rate": 4.9965840947782114e-05, + "loss": 0.2374, + "step": 1405 + }, + { + "epoch": 0.06694203104970801, + "grad_norm": 0.78125, + "learning_rate": 4.996486107903504e-05, + "loss": 0.2355, + "step": 1410 + }, + { + "epoch": 0.06717941413853677, + "grad_norm": 0.83984375, + "learning_rate": 4.996386736529625e-05, + "loss": 0.2406, + "step": 1415 + }, + { + "epoch": 0.06741679722736552, + "grad_norm": 0.84375, + "learning_rate": 4.996285980717814e-05, + "loss": 0.2421, + "step": 1420 + }, + { + "epoch": 0.06765418031619427, + "grad_norm": 0.69921875, + "learning_rate": 4.9961838405301685e-05, + "loss": 0.2412, + "step": 1425 + }, + { + "epoch": 0.06789156340502303, + "grad_norm": 0.59765625, + "learning_rate": 4.996080316029636e-05, + "loss": 0.2392, + "step": 1430 + }, + { + "epoch": 0.06812894649385177, + "grad_norm": 0.6171875, + "learning_rate": 4.995975407280019e-05, + "loss": 0.2403, + "step": 1435 + }, + { + "epoch": 0.06836632958268053, + "grad_norm": 0.859375, + "learning_rate": 4.9958691143459716e-05, + "loss": 0.2396, + "step": 1440 + }, + { + "epoch": 0.06860371267150928, + "grad_norm": 0.65625, + "learning_rate": 4.995761437293e-05, + "loss": 0.2396, + "step": 1445 + }, + { + "epoch": 0.06884109576033803, + "grad_norm": 0.6953125, + "learning_rate": 4.9956523761874685e-05, + "loss": 0.2388, + "step": 1450 + }, + { + "epoch": 0.06907847884916679, + "grad_norm": 0.87109375, + "learning_rate": 4.9955419310965885e-05, + "loss": 0.235, + "step": 1455 + }, + { + "epoch": 0.06931586193799553, + "grad_norm": 0.72265625, + "learning_rate": 4.995430102088428e-05, + "loss": 0.2371, + "step": 1460 + }, + { + "epoch": 0.06955324502682429, + "grad_norm": 0.7421875, + "learning_rate": 4.995316889231906e-05, + "loss": 0.24, + "step": 1465 + }, + { + "epoch": 0.06979062811565304, + "grad_norm": 0.67578125, + "learning_rate": 4.995202292596795e-05, + "loss": 0.2384, + "step": 1470 + }, + { + "epoch": 0.0700280112044818, + "grad_norm": 0.57421875, + "learning_rate": 4.9950863122537216e-05, + "loss": 0.236, + "step": 1475 + }, + { + "epoch": 0.07026539429331055, + "grad_norm": 0.65625, + "learning_rate": 4.994968948274163e-05, + "loss": 0.236, + "step": 1480 + }, + { + "epoch": 0.0705027773821393, + "grad_norm": 0.74609375, + "learning_rate": 4.99485020073045e-05, + "loss": 0.2395, + "step": 1485 + }, + { + "epoch": 0.07074016047096805, + "grad_norm": 0.57421875, + "learning_rate": 4.994730069695766e-05, + "loss": 0.2363, + "step": 1490 + }, + { + "epoch": 0.0709775435597968, + "grad_norm": 0.6640625, + "learning_rate": 4.994608555244147e-05, + "loss": 0.2332, + "step": 1495 + }, + { + "epoch": 0.07121492664862555, + "grad_norm": 0.5625, + "learning_rate": 4.9944856574504836e-05, + "loss": 0.2363, + "step": 1500 + }, + { + "epoch": 0.0714523097374543, + "grad_norm": 0.72265625, + "learning_rate": 4.994361376390515e-05, + "loss": 0.2384, + "step": 1505 + }, + { + "epoch": 0.07168969282628306, + "grad_norm": 0.6015625, + "learning_rate": 4.994235712140837e-05, + "loss": 0.2338, + "step": 1510 + }, + { + "epoch": 0.07192707591511181, + "grad_norm": 0.69921875, + "learning_rate": 4.9941086647788935e-05, + "loss": 0.2403, + "step": 1515 + }, + { + "epoch": 0.07216445900394056, + "grad_norm": 0.6484375, + "learning_rate": 4.993980234382985e-05, + "loss": 0.2378, + "step": 1520 + }, + { + "epoch": 0.07240184209276931, + "grad_norm": 0.75390625, + "learning_rate": 4.993850421032263e-05, + "loss": 0.2377, + "step": 1525 + }, + { + "epoch": 0.07263922518159806, + "grad_norm": 0.82421875, + "learning_rate": 4.993719224806729e-05, + "loss": 0.2382, + "step": 1530 + }, + { + "epoch": 0.07287660827042682, + "grad_norm": 1.0703125, + "learning_rate": 4.9935866457872405e-05, + "loss": 0.2384, + "step": 1535 + }, + { + "epoch": 0.07311399135925557, + "grad_norm": 0.62890625, + "learning_rate": 4.993452684055504e-05, + "loss": 0.2382, + "step": 1540 + }, + { + "epoch": 0.07335137444808432, + "grad_norm": 0.6640625, + "learning_rate": 4.99331733969408e-05, + "loss": 0.2373, + "step": 1545 + }, + { + "epoch": 0.07358875753691307, + "grad_norm": 0.796875, + "learning_rate": 4.993180612786381e-05, + "loss": 0.2409, + "step": 1550 + }, + { + "epoch": 0.07382614062574182, + "grad_norm": 1.0, + "learning_rate": 4.993042503416671e-05, + "loss": 0.236, + "step": 1555 + }, + { + "epoch": 0.07406352371457058, + "grad_norm": 0.71875, + "learning_rate": 4.9929030116700647e-05, + "loss": 0.2368, + "step": 1560 + }, + { + "epoch": 0.07430090680339932, + "grad_norm": 0.5546875, + "learning_rate": 4.9927621376325324e-05, + "loss": 0.2358, + "step": 1565 + }, + { + "epoch": 0.07453828989222808, + "grad_norm": 0.546875, + "learning_rate": 4.992619881390893e-05, + "loss": 0.2389, + "step": 1570 + }, + { + "epoch": 0.07477567298105683, + "grad_norm": 0.68359375, + "learning_rate": 4.992476243032819e-05, + "loss": 0.2388, + "step": 1575 + }, + { + "epoch": 0.07501305606988558, + "grad_norm": 0.62109375, + "learning_rate": 4.9923312226468336e-05, + "loss": 0.2367, + "step": 1580 + }, + { + "epoch": 0.07525043915871434, + "grad_norm": 0.68359375, + "learning_rate": 4.992184820322312e-05, + "loss": 0.2392, + "step": 1585 + }, + { + "epoch": 0.07548782224754308, + "grad_norm": 0.75390625, + "learning_rate": 4.992037036149481e-05, + "loss": 0.2342, + "step": 1590 + }, + { + "epoch": 0.07572520533637184, + "grad_norm": 0.70703125, + "learning_rate": 4.991887870219421e-05, + "loss": 0.238, + "step": 1595 + }, + { + "epoch": 0.0759625884252006, + "grad_norm": 0.6328125, + "learning_rate": 4.9917373226240607e-05, + "loss": 0.238, + "step": 1600 + }, + { + "epoch": 0.07619997151402934, + "grad_norm": 0.71484375, + "learning_rate": 4.991585393456182e-05, + "loss": 0.2381, + "step": 1605 + }, + { + "epoch": 0.0764373546028581, + "grad_norm": 0.7421875, + "learning_rate": 4.991432082809419e-05, + "loss": 0.2389, + "step": 1610 + }, + { + "epoch": 0.07667473769168684, + "grad_norm": 0.6484375, + "learning_rate": 4.991277390778256e-05, + "loss": 0.2316, + "step": 1615 + }, + { + "epoch": 0.0769121207805156, + "grad_norm": 0.75390625, + "learning_rate": 4.991121317458029e-05, + "loss": 0.2383, + "step": 1620 + }, + { + "epoch": 0.07714950386934435, + "grad_norm": 0.60546875, + "learning_rate": 4.990963862944924e-05, + "loss": 0.2355, + "step": 1625 + }, + { + "epoch": 0.0773868869581731, + "grad_norm": 0.6875, + "learning_rate": 4.990805027335981e-05, + "loss": 0.2365, + "step": 1630 + }, + { + "epoch": 0.07762427004700186, + "grad_norm": 0.79296875, + "learning_rate": 4.9906448107290904e-05, + "loss": 0.2381, + "step": 1635 + }, + { + "epoch": 0.0778616531358306, + "grad_norm": 0.89453125, + "learning_rate": 4.990483213222992e-05, + "loss": 0.234, + "step": 1640 + }, + { + "epoch": 0.07809903622465936, + "grad_norm": 0.5625, + "learning_rate": 4.990320234917276e-05, + "loss": 0.2314, + "step": 1645 + }, + { + "epoch": 0.0783364193134881, + "grad_norm": 0.890625, + "learning_rate": 4.990155875912387e-05, + "loss": 0.2371, + "step": 1650 + }, + { + "epoch": 0.07857380240231686, + "grad_norm": 0.56640625, + "learning_rate": 4.989990136309619e-05, + "loss": 0.2376, + "step": 1655 + }, + { + "epoch": 0.07881118549114562, + "grad_norm": 0.98828125, + "learning_rate": 4.9898230162111154e-05, + "loss": 0.2372, + "step": 1660 + }, + { + "epoch": 0.07904856857997436, + "grad_norm": 0.83984375, + "learning_rate": 4.9896545157198726e-05, + "loss": 0.2404, + "step": 1665 + }, + { + "epoch": 0.07928595166880312, + "grad_norm": 0.8359375, + "learning_rate": 4.9894846349397365e-05, + "loss": 0.2357, + "step": 1670 + }, + { + "epoch": 0.07952333475763186, + "grad_norm": 0.93359375, + "learning_rate": 4.9893133739754024e-05, + "loss": 0.24, + "step": 1675 + }, + { + "epoch": 0.07976071784646062, + "grad_norm": 0.66015625, + "learning_rate": 4.9891407329324195e-05, + "loss": 0.2319, + "step": 1680 + }, + { + "epoch": 0.07999810093528938, + "grad_norm": 0.7734375, + "learning_rate": 4.988966711917185e-05, + "loss": 0.2391, + "step": 1685 + }, + { + "epoch": 0.08023548402411812, + "grad_norm": 0.67578125, + "learning_rate": 4.9887913110369466e-05, + "loss": 0.235, + "step": 1690 + }, + { + "epoch": 0.08047286711294688, + "grad_norm": 0.7265625, + "learning_rate": 4.988614530399805e-05, + "loss": 0.2316, + "step": 1695 + }, + { + "epoch": 0.08071025020177562, + "grad_norm": 0.66015625, + "learning_rate": 4.988436370114707e-05, + "loss": 0.2359, + "step": 1700 + }, + { + "epoch": 0.08094763329060438, + "grad_norm": 0.6953125, + "learning_rate": 4.988256830291453e-05, + "loss": 0.236, + "step": 1705 + }, + { + "epoch": 0.08118501637943312, + "grad_norm": 0.6796875, + "learning_rate": 4.988075911040693e-05, + "loss": 0.2357, + "step": 1710 + }, + { + "epoch": 0.08142239946826188, + "grad_norm": 0.62109375, + "learning_rate": 4.9878936124739255e-05, + "loss": 0.2328, + "step": 1715 + }, + { + "epoch": 0.08165978255709064, + "grad_norm": 0.52734375, + "learning_rate": 4.987709934703502e-05, + "loss": 0.2269, + "step": 1720 + }, + { + "epoch": 0.08189716564591938, + "grad_norm": 0.546875, + "learning_rate": 4.987524877842622e-05, + "loss": 0.2338, + "step": 1725 + }, + { + "epoch": 0.08213454873474814, + "grad_norm": 0.5859375, + "learning_rate": 4.987338442005333e-05, + "loss": 0.2351, + "step": 1730 + }, + { + "epoch": 0.08237193182357688, + "grad_norm": 0.609375, + "learning_rate": 4.987150627306535e-05, + "loss": 0.2349, + "step": 1735 + }, + { + "epoch": 0.08260931491240564, + "grad_norm": 0.59765625, + "learning_rate": 4.986961433861981e-05, + "loss": 0.2348, + "step": 1740 + }, + { + "epoch": 0.0828466980012344, + "grad_norm": 0.7109375, + "learning_rate": 4.986770861788266e-05, + "loss": 0.2371, + "step": 1745 + }, + { + "epoch": 0.08308408109006314, + "grad_norm": 0.63671875, + "learning_rate": 4.986578911202841e-05, + "loss": 0.237, + "step": 1750 + }, + { + "epoch": 0.0833214641788919, + "grad_norm": 0.72265625, + "learning_rate": 4.986385582224003e-05, + "loss": 0.2386, + "step": 1755 + }, + { + "epoch": 0.08355884726772064, + "grad_norm": 0.8046875, + "learning_rate": 4.9861908749709004e-05, + "loss": 0.2349, + "step": 1760 + }, + { + "epoch": 0.0837962303565494, + "grad_norm": 0.84375, + "learning_rate": 4.98599478956353e-05, + "loss": 0.2375, + "step": 1765 + }, + { + "epoch": 0.08403361344537816, + "grad_norm": 0.6171875, + "learning_rate": 4.985797326122739e-05, + "loss": 0.2333, + "step": 1770 + }, + { + "epoch": 0.0842709965342069, + "grad_norm": 0.6796875, + "learning_rate": 4.985598484770222e-05, + "loss": 0.2365, + "step": 1775 + }, + { + "epoch": 0.08450837962303566, + "grad_norm": 0.96875, + "learning_rate": 4.985398265628526e-05, + "loss": 0.2406, + "step": 1780 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 0.96875, + "learning_rate": 4.985196668821043e-05, + "loss": 0.2353, + "step": 1785 + }, + { + "epoch": 0.08498314580069316, + "grad_norm": 0.72265625, + "learning_rate": 4.984993694472018e-05, + "loss": 0.2314, + "step": 1790 + }, + { + "epoch": 0.0852205288895219, + "grad_norm": 1.0390625, + "learning_rate": 4.9847893427065415e-05, + "loss": 0.2329, + "step": 1795 + }, + { + "epoch": 0.08545791197835066, + "grad_norm": 0.6015625, + "learning_rate": 4.984583613650556e-05, + "loss": 0.2371, + "step": 1800 + }, + { + "epoch": 0.08569529506717942, + "grad_norm": 0.73828125, + "learning_rate": 4.98437650743085e-05, + "loss": 0.2379, + "step": 1805 + }, + { + "epoch": 0.08593267815600816, + "grad_norm": 0.5859375, + "learning_rate": 4.9841680241750636e-05, + "loss": 0.2365, + "step": 1810 + }, + { + "epoch": 0.08617006124483692, + "grad_norm": 0.51171875, + "learning_rate": 4.983958164011683e-05, + "loss": 0.2317, + "step": 1815 + }, + { + "epoch": 0.08640744433366566, + "grad_norm": 0.80078125, + "learning_rate": 4.983746927070044e-05, + "loss": 0.2383, + "step": 1820 + }, + { + "epoch": 0.08664482742249442, + "grad_norm": 0.67578125, + "learning_rate": 4.9835343134803326e-05, + "loss": 0.2335, + "step": 1825 + }, + { + "epoch": 0.08688221051132318, + "grad_norm": 1.1171875, + "learning_rate": 4.98332032337358e-05, + "loss": 0.2386, + "step": 1830 + }, + { + "epoch": 0.08711959360015192, + "grad_norm": 0.78515625, + "learning_rate": 4.983104956881667e-05, + "loss": 0.2368, + "step": 1835 + }, + { + "epoch": 0.08735697668898068, + "grad_norm": 0.8359375, + "learning_rate": 4.9828882141373245e-05, + "loss": 0.2358, + "step": 1840 + }, + { + "epoch": 0.08759435977780942, + "grad_norm": 0.8515625, + "learning_rate": 4.982670095274129e-05, + "loss": 0.2373, + "step": 1845 + }, + { + "epoch": 0.08783174286663818, + "grad_norm": 0.55078125, + "learning_rate": 4.982450600426506e-05, + "loss": 0.2317, + "step": 1850 + }, + { + "epoch": 0.08806912595546693, + "grad_norm": 0.52734375, + "learning_rate": 4.9822297297297307e-05, + "loss": 0.2366, + "step": 1855 + }, + { + "epoch": 0.08830650904429568, + "grad_norm": 0.6015625, + "learning_rate": 4.982007483319923e-05, + "loss": 0.2354, + "step": 1860 + }, + { + "epoch": 0.08854389213312444, + "grad_norm": 0.64453125, + "learning_rate": 4.981783861334053e-05, + "loss": 0.233, + "step": 1865 + }, + { + "epoch": 0.08878127522195318, + "grad_norm": 0.62109375, + "learning_rate": 4.9815588639099374e-05, + "loss": 0.2359, + "step": 1870 + }, + { + "epoch": 0.08901865831078194, + "grad_norm": 0.6484375, + "learning_rate": 4.981332491186242e-05, + "loss": 0.2352, + "step": 1875 + }, + { + "epoch": 0.08925604139961069, + "grad_norm": 0.59765625, + "learning_rate": 4.9811047433024794e-05, + "loss": 0.2336, + "step": 1880 + }, + { + "epoch": 0.08949342448843944, + "grad_norm": 0.7578125, + "learning_rate": 4.980875620399009e-05, + "loss": 0.2335, + "step": 1885 + }, + { + "epoch": 0.0897308075772682, + "grad_norm": 0.63671875, + "learning_rate": 4.980645122617037e-05, + "loss": 0.2343, + "step": 1890 + }, + { + "epoch": 0.08996819066609694, + "grad_norm": 0.53125, + "learning_rate": 4.98041325009862e-05, + "loss": 0.235, + "step": 1895 + }, + { + "epoch": 0.0902055737549257, + "grad_norm": 0.6796875, + "learning_rate": 4.98018000298666e-05, + "loss": 0.2378, + "step": 1900 + }, + { + "epoch": 0.09044295684375445, + "grad_norm": 0.6796875, + "learning_rate": 4.979945381424905e-05, + "loss": 0.2325, + "step": 1905 + }, + { + "epoch": 0.0906803399325832, + "grad_norm": 0.71484375, + "learning_rate": 4.9797093855579515e-05, + "loss": 0.2332, + "step": 1910 + }, + { + "epoch": 0.09091772302141196, + "grad_norm": 0.70703125, + "learning_rate": 4.9794720155312436e-05, + "loss": 0.2337, + "step": 1915 + }, + { + "epoch": 0.0911551061102407, + "grad_norm": 0.5625, + "learning_rate": 4.97923327149107e-05, + "loss": 0.235, + "step": 1920 + }, + { + "epoch": 0.09139248919906946, + "grad_norm": 0.61328125, + "learning_rate": 4.978993153584569e-05, + "loss": 0.2366, + "step": 1925 + }, + { + "epoch": 0.0916298722878982, + "grad_norm": 0.61328125, + "learning_rate": 4.978751661959722e-05, + "loss": 0.2351, + "step": 1930 + }, + { + "epoch": 0.09186725537672696, + "grad_norm": 0.65234375, + "learning_rate": 4.9785087967653613e-05, + "loss": 0.2364, + "step": 1935 + }, + { + "epoch": 0.09210463846555571, + "grad_norm": 0.62109375, + "learning_rate": 4.9782645581511635e-05, + "loss": 0.2382, + "step": 1940 + }, + { + "epoch": 0.09234202155438447, + "grad_norm": 0.6171875, + "learning_rate": 4.9780189462676506e-05, + "loss": 0.2396, + "step": 1945 + }, + { + "epoch": 0.09257940464321322, + "grad_norm": 0.57421875, + "learning_rate": 4.977771961266193e-05, + "loss": 0.2327, + "step": 1950 + }, + { + "epoch": 0.09281678773204197, + "grad_norm": 0.5390625, + "learning_rate": 4.977523603299005e-05, + "loss": 0.2312, + "step": 1955 + }, + { + "epoch": 0.09305417082087072, + "grad_norm": 0.5625, + "learning_rate": 4.97727387251915e-05, + "loss": 0.2326, + "step": 1960 + }, + { + "epoch": 0.09329155390969947, + "grad_norm": 0.71875, + "learning_rate": 4.9770227690805364e-05, + "loss": 0.2337, + "step": 1965 + }, + { + "epoch": 0.09352893699852823, + "grad_norm": 0.7109375, + "learning_rate": 4.976770293137917e-05, + "loss": 0.2345, + "step": 1970 + }, + { + "epoch": 0.09376632008735698, + "grad_norm": 0.64453125, + "learning_rate": 4.976516444846891e-05, + "loss": 0.2331, + "step": 1975 + }, + { + "epoch": 0.09400370317618573, + "grad_norm": 0.90234375, + "learning_rate": 4.976261224363905e-05, + "loss": 0.237, + "step": 1980 + }, + { + "epoch": 0.09424108626501448, + "grad_norm": 0.56640625, + "learning_rate": 4.97600463184625e-05, + "loss": 0.2344, + "step": 1985 + }, + { + "epoch": 0.09447846935384323, + "grad_norm": 0.59765625, + "learning_rate": 4.9757466674520626e-05, + "loss": 0.236, + "step": 1990 + }, + { + "epoch": 0.09471585244267199, + "grad_norm": 0.6875, + "learning_rate": 4.9754873313403254e-05, + "loss": 0.2347, + "step": 1995 + }, + { + "epoch": 0.09495323553150073, + "grad_norm": 0.53125, + "learning_rate": 4.975226623670866e-05, + "loss": 0.2331, + "step": 2000 + }, + { + "epoch": 0.09519061862032949, + "grad_norm": 0.66796875, + "learning_rate": 4.974964544604357e-05, + "loss": 0.2389, + "step": 2005 + }, + { + "epoch": 0.09542800170915824, + "grad_norm": 0.73046875, + "learning_rate": 4.974701094302317e-05, + "loss": 0.2327, + "step": 2010 + }, + { + "epoch": 0.09566538479798699, + "grad_norm": 0.54296875, + "learning_rate": 4.974436272927109e-05, + "loss": 0.2325, + "step": 2015 + }, + { + "epoch": 0.09590276788681575, + "grad_norm": 0.640625, + "learning_rate": 4.974170080641941e-05, + "loss": 0.2348, + "step": 2020 + }, + { + "epoch": 0.09614015097564449, + "grad_norm": 0.875, + "learning_rate": 4.973902517610866e-05, + "loss": 0.2353, + "step": 2025 + }, + { + "epoch": 0.09637753406447325, + "grad_norm": 0.51171875, + "learning_rate": 4.973633583998783e-05, + "loss": 0.2335, + "step": 2030 + }, + { + "epoch": 0.096614917153302, + "grad_norm": 0.5078125, + "learning_rate": 4.9733632799714334e-05, + "loss": 0.2322, + "step": 2035 + }, + { + "epoch": 0.09685230024213075, + "grad_norm": 0.55078125, + "learning_rate": 4.973091605695405e-05, + "loss": 0.2335, + "step": 2040 + }, + { + "epoch": 0.0970896833309595, + "grad_norm": 0.68359375, + "learning_rate": 4.972818561338128e-05, + "loss": 0.2335, + "step": 2045 + }, + { + "epoch": 0.09732706641978825, + "grad_norm": 0.56640625, + "learning_rate": 4.972544147067881e-05, + "loss": 0.2344, + "step": 2050 + }, + { + "epoch": 0.09756444950861701, + "grad_norm": 0.62890625, + "learning_rate": 4.972268363053782e-05, + "loss": 0.2316, + "step": 2055 + }, + { + "epoch": 0.09780183259744576, + "grad_norm": 0.61328125, + "learning_rate": 4.971991209465796e-05, + "loss": 0.2344, + "step": 2060 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 0.6953125, + "learning_rate": 4.971712686474732e-05, + "loss": 0.2305, + "step": 2065 + }, + { + "epoch": 0.09827659877510327, + "grad_norm": 0.60546875, + "learning_rate": 4.971432794252243e-05, + "loss": 0.237, + "step": 2070 + }, + { + "epoch": 0.09851398186393201, + "grad_norm": 0.5703125, + "learning_rate": 4.971151532970823e-05, + "loss": 0.2315, + "step": 2075 + }, + { + "epoch": 0.09875136495276077, + "grad_norm": 0.6015625, + "learning_rate": 4.970868902803814e-05, + "loss": 0.2368, + "step": 2080 + }, + { + "epoch": 0.09898874804158951, + "grad_norm": 0.96875, + "learning_rate": 4.9705849039253995e-05, + "loss": 0.2341, + "step": 2085 + }, + { + "epoch": 0.09922613113041827, + "grad_norm": 0.7265625, + "learning_rate": 4.9702995365106065e-05, + "loss": 0.2337, + "step": 2090 + }, + { + "epoch": 0.09946351421924703, + "grad_norm": 0.546875, + "learning_rate": 4.970012800735305e-05, + "loss": 0.2371, + "step": 2095 + }, + { + "epoch": 0.09970089730807577, + "grad_norm": 0.74609375, + "learning_rate": 4.96972469677621e-05, + "loss": 0.2295, + "step": 2100 + }, + { + "epoch": 0.09993828039690453, + "grad_norm": 0.59375, + "learning_rate": 4.969435224810878e-05, + "loss": 0.2346, + "step": 2105 + }, + { + "epoch": 0.10017566348573327, + "grad_norm": 0.60546875, + "learning_rate": 4.96914438501771e-05, + "loss": 0.2358, + "step": 2110 + }, + { + "epoch": 0.10041304657456203, + "grad_norm": 0.5234375, + "learning_rate": 4.9688521775759476e-05, + "loss": 0.2363, + "step": 2115 + }, + { + "epoch": 0.10065042966339079, + "grad_norm": 0.6328125, + "learning_rate": 4.968558602665679e-05, + "loss": 0.235, + "step": 2120 + }, + { + "epoch": 0.10088781275221953, + "grad_norm": 0.796875, + "learning_rate": 4.968263660467831e-05, + "loss": 0.2315, + "step": 2125 + }, + { + "epoch": 0.10112519584104829, + "grad_norm": 0.578125, + "learning_rate": 4.967967351164177e-05, + "loss": 0.2293, + "step": 2130 + }, + { + "epoch": 0.10136257892987703, + "grad_norm": 0.546875, + "learning_rate": 4.96766967493733e-05, + "loss": 0.2311, + "step": 2135 + }, + { + "epoch": 0.10159996201870579, + "grad_norm": 0.58203125, + "learning_rate": 4.967370631970747e-05, + "loss": 0.2353, + "step": 2140 + }, + { + "epoch": 0.10183734510753453, + "grad_norm": 0.67578125, + "learning_rate": 4.967070222448727e-05, + "loss": 0.2336, + "step": 2145 + }, + { + "epoch": 0.10207472819636329, + "grad_norm": 0.64453125, + "learning_rate": 4.96676844655641e-05, + "loss": 0.2341, + "step": 2150 + }, + { + "epoch": 0.10231211128519205, + "grad_norm": 0.5859375, + "learning_rate": 4.96646530447978e-05, + "loss": 0.2339, + "step": 2155 + }, + { + "epoch": 0.10254949437402079, + "grad_norm": 0.6015625, + "learning_rate": 4.9661607964056605e-05, + "loss": 0.2365, + "step": 2160 + }, + { + "epoch": 0.10278687746284955, + "grad_norm": 0.52734375, + "learning_rate": 4.965854922521722e-05, + "loss": 0.2308, + "step": 2165 + }, + { + "epoch": 0.10302426055167829, + "grad_norm": 0.90625, + "learning_rate": 4.965547683016469e-05, + "loss": 0.2316, + "step": 2170 + }, + { + "epoch": 0.10326164364050705, + "grad_norm": 0.66015625, + "learning_rate": 4.965239078079255e-05, + "loss": 0.2342, + "step": 2175 + }, + { + "epoch": 0.10349902672933581, + "grad_norm": 1.0078125, + "learning_rate": 4.96492910790027e-05, + "loss": 0.2346, + "step": 2180 + }, + { + "epoch": 0.10373640981816455, + "grad_norm": 0.54296875, + "learning_rate": 4.964617772670548e-05, + "loss": 0.2318, + "step": 2185 + }, + { + "epoch": 0.10397379290699331, + "grad_norm": 0.56640625, + "learning_rate": 4.9643050725819635e-05, + "loss": 0.2356, + "step": 2190 + }, + { + "epoch": 0.10421117599582205, + "grad_norm": 0.609375, + "learning_rate": 4.9639910078272315e-05, + "loss": 0.2377, + "step": 2195 + }, + { + "epoch": 0.10444855908465081, + "grad_norm": 0.494140625, + "learning_rate": 4.96367557859991e-05, + "loss": 0.2306, + "step": 2200 + }, + { + "epoch": 0.10468594217347957, + "grad_norm": 0.58203125, + "learning_rate": 4.9633587850943945e-05, + "loss": 0.2284, + "step": 2205 + }, + { + "epoch": 0.10492332526230831, + "grad_norm": 0.58984375, + "learning_rate": 4.9630406275059247e-05, + "loss": 0.2346, + "step": 2210 + }, + { + "epoch": 0.10516070835113707, + "grad_norm": 0.7265625, + "learning_rate": 4.962721106030581e-05, + "loss": 0.2345, + "step": 2215 + }, + { + "epoch": 0.10539809143996581, + "grad_norm": 0.609375, + "learning_rate": 4.96240022086528e-05, + "loss": 0.235, + "step": 2220 + }, + { + "epoch": 0.10563547452879457, + "grad_norm": 0.578125, + "learning_rate": 4.962077972207784e-05, + "loss": 0.2324, + "step": 2225 + }, + { + "epoch": 0.10587285761762331, + "grad_norm": 0.5078125, + "learning_rate": 4.961754360256693e-05, + "loss": 0.2281, + "step": 2230 + }, + { + "epoch": 0.10611024070645207, + "grad_norm": 0.69921875, + "learning_rate": 4.961429385211447e-05, + "loss": 0.2321, + "step": 2235 + }, + { + "epoch": 0.10634762379528083, + "grad_norm": 0.8828125, + "learning_rate": 4.9611030472723266e-05, + "loss": 0.2307, + "step": 2240 + }, + { + "epoch": 0.10658500688410957, + "grad_norm": 0.9765625, + "learning_rate": 4.960775346640453e-05, + "loss": 0.2292, + "step": 2245 + }, + { + "epoch": 0.10682238997293833, + "grad_norm": 0.57421875, + "learning_rate": 4.9604462835177866e-05, + "loss": 0.2313, + "step": 2250 + }, + { + "epoch": 0.10705977306176707, + "grad_norm": 0.54296875, + "learning_rate": 4.960115858107128e-05, + "loss": 0.2342, + "step": 2255 + }, + { + "epoch": 0.10729715615059583, + "grad_norm": 0.59765625, + "learning_rate": 4.9597840706121153e-05, + "loss": 0.2345, + "step": 2260 + }, + { + "epoch": 0.10753453923942459, + "grad_norm": 0.57421875, + "learning_rate": 4.959450921237228e-05, + "loss": 0.2344, + "step": 2265 + }, + { + "epoch": 0.10777192232825333, + "grad_norm": 0.61328125, + "learning_rate": 4.959116410187786e-05, + "loss": 0.2304, + "step": 2270 + }, + { + "epoch": 0.10800930541708209, + "grad_norm": 0.7578125, + "learning_rate": 4.958780537669945e-05, + "loss": 0.2309, + "step": 2275 + }, + { + "epoch": 0.10824668850591083, + "grad_norm": 0.609375, + "learning_rate": 4.958443303890704e-05, + "loss": 0.2338, + "step": 2280 + }, + { + "epoch": 0.10848407159473959, + "grad_norm": 0.5703125, + "learning_rate": 4.9581047090578966e-05, + "loss": 0.2313, + "step": 2285 + }, + { + "epoch": 0.10872145468356834, + "grad_norm": 0.6640625, + "learning_rate": 4.957764753380197e-05, + "loss": 0.2334, + "step": 2290 + }, + { + "epoch": 0.1089588377723971, + "grad_norm": 0.4921875, + "learning_rate": 4.95742343706712e-05, + "loss": 0.2316, + "step": 2295 + }, + { + "epoch": 0.10919622086122585, + "grad_norm": 0.75, + "learning_rate": 4.957080760329017e-05, + "loss": 0.2295, + "step": 2300 + }, + { + "epoch": 0.1094336039500546, + "grad_norm": 0.5859375, + "learning_rate": 4.956736723377078e-05, + "loss": 0.2343, + "step": 2305 + }, + { + "epoch": 0.10967098703888335, + "grad_norm": 0.458984375, + "learning_rate": 4.95639132642333e-05, + "loss": 0.2302, + "step": 2310 + }, + { + "epoch": 0.1099083701277121, + "grad_norm": 0.6015625, + "learning_rate": 4.956044569680641e-05, + "loss": 0.2349, + "step": 2315 + }, + { + "epoch": 0.11014575321654085, + "grad_norm": 0.466796875, + "learning_rate": 4.955696453362716e-05, + "loss": 0.2324, + "step": 2320 + }, + { + "epoch": 0.11038313630536961, + "grad_norm": 0.6171875, + "learning_rate": 4.955346977684097e-05, + "loss": 0.2327, + "step": 2325 + }, + { + "epoch": 0.11062051939419836, + "grad_norm": 0.59765625, + "learning_rate": 4.9549961428601624e-05, + "loss": 0.233, + "step": 2330 + }, + { + "epoch": 0.11085790248302711, + "grad_norm": 0.61328125, + "learning_rate": 4.954643949107133e-05, + "loss": 0.2318, + "step": 2335 + }, + { + "epoch": 0.11109528557185586, + "grad_norm": 0.52734375, + "learning_rate": 4.9542903966420626e-05, + "loss": 0.2321, + "step": 2340 + }, + { + "epoch": 0.11133266866068461, + "grad_norm": 0.56640625, + "learning_rate": 4.953935485682845e-05, + "loss": 0.2361, + "step": 2345 + }, + { + "epoch": 0.11157005174951337, + "grad_norm": 0.578125, + "learning_rate": 4.9535792164482095e-05, + "loss": 0.2348, + "step": 2350 + }, + { + "epoch": 0.11180743483834212, + "grad_norm": 0.50390625, + "learning_rate": 4.9532215891577225e-05, + "loss": 0.231, + "step": 2355 + }, + { + "epoch": 0.11204481792717087, + "grad_norm": 0.5078125, + "learning_rate": 4.952862604031789e-05, + "loss": 0.2279, + "step": 2360 + }, + { + "epoch": 0.11228220101599962, + "grad_norm": 0.609375, + "learning_rate": 4.952502261291651e-05, + "loss": 0.2297, + "step": 2365 + }, + { + "epoch": 0.11251958410482837, + "grad_norm": 0.546875, + "learning_rate": 4.9521405611593834e-05, + "loss": 0.234, + "step": 2370 + }, + { + "epoch": 0.11275696719365712, + "grad_norm": 0.6171875, + "learning_rate": 4.9517775038579024e-05, + "loss": 0.233, + "step": 2375 + }, + { + "epoch": 0.11299435028248588, + "grad_norm": 0.55078125, + "learning_rate": 4.951413089610958e-05, + "loss": 0.2319, + "step": 2380 + }, + { + "epoch": 0.11323173337131463, + "grad_norm": 0.494140625, + "learning_rate": 4.951047318643136e-05, + "loss": 0.233, + "step": 2385 + }, + { + "epoch": 0.11346911646014338, + "grad_norm": 0.7265625, + "learning_rate": 4.950680191179862e-05, + "loss": 0.2351, + "step": 2390 + }, + { + "epoch": 0.11370649954897213, + "grad_norm": 0.63671875, + "learning_rate": 4.950311707447392e-05, + "loss": 0.2301, + "step": 2395 + }, + { + "epoch": 0.11394388263780088, + "grad_norm": 0.474609375, + "learning_rate": 4.949941867672822e-05, + "loss": 0.2334, + "step": 2400 + }, + { + "epoch": 0.11418126572662964, + "grad_norm": 0.75390625, + "learning_rate": 4.9495706720840845e-05, + "loss": 0.2311, + "step": 2405 + }, + { + "epoch": 0.11441864881545839, + "grad_norm": 0.6015625, + "learning_rate": 4.949198120909943e-05, + "loss": 0.23, + "step": 2410 + }, + { + "epoch": 0.11465603190428714, + "grad_norm": 0.546875, + "learning_rate": 4.9488242143800004e-05, + "loss": 0.2299, + "step": 2415 + }, + { + "epoch": 0.1148934149931159, + "grad_norm": 0.55078125, + "learning_rate": 4.948448952724693e-05, + "loss": 0.235, + "step": 2420 + }, + { + "epoch": 0.11513079808194464, + "grad_norm": 0.5, + "learning_rate": 4.948072336175293e-05, + "loss": 0.2318, + "step": 2425 + }, + { + "epoch": 0.1153681811707734, + "grad_norm": 0.74609375, + "learning_rate": 4.9476943649639066e-05, + "loss": 0.2272, + "step": 2430 + }, + { + "epoch": 0.11560556425960214, + "grad_norm": 0.62890625, + "learning_rate": 4.9473150393234784e-05, + "loss": 0.2286, + "step": 2435 + }, + { + "epoch": 0.1158429473484309, + "grad_norm": 0.56640625, + "learning_rate": 4.946934359487782e-05, + "loss": 0.2354, + "step": 2440 + }, + { + "epoch": 0.11608033043725965, + "grad_norm": 0.69921875, + "learning_rate": 4.9465523256914314e-05, + "loss": 0.2289, + "step": 2445 + }, + { + "epoch": 0.1163177135260884, + "grad_norm": 0.59765625, + "learning_rate": 4.946168938169869e-05, + "loss": 0.2293, + "step": 2450 + }, + { + "epoch": 0.11655509661491716, + "grad_norm": 0.6328125, + "learning_rate": 4.945784197159379e-05, + "loss": 0.2313, + "step": 2455 + }, + { + "epoch": 0.1167924797037459, + "grad_norm": 0.62109375, + "learning_rate": 4.9453981028970715e-05, + "loss": 0.2368, + "step": 2460 + }, + { + "epoch": 0.11702986279257466, + "grad_norm": 0.640625, + "learning_rate": 4.9450106556208975e-05, + "loss": 0.2315, + "step": 2465 + }, + { + "epoch": 0.11726724588140341, + "grad_norm": 0.578125, + "learning_rate": 4.944621855569637e-05, + "loss": 0.2329, + "step": 2470 + }, + { + "epoch": 0.11750462897023216, + "grad_norm": 0.453125, + "learning_rate": 4.9442317029829064e-05, + "loss": 0.2287, + "step": 2475 + }, + { + "epoch": 0.11774201205906092, + "grad_norm": 0.515625, + "learning_rate": 4.943840198101156e-05, + "loss": 0.234, + "step": 2480 + }, + { + "epoch": 0.11797939514788966, + "grad_norm": 0.56640625, + "learning_rate": 4.9434473411656666e-05, + "loss": 0.2294, + "step": 2485 + }, + { + "epoch": 0.11821677823671842, + "grad_norm": 0.5234375, + "learning_rate": 4.9430531324185555e-05, + "loss": 0.2308, + "step": 2490 + }, + { + "epoch": 0.11845416132554717, + "grad_norm": 0.71484375, + "learning_rate": 4.942657572102772e-05, + "loss": 0.2347, + "step": 2495 + }, + { + "epoch": 0.11869154441437592, + "grad_norm": 0.52734375, + "learning_rate": 4.942260660462097e-05, + "loss": 0.23, + "step": 2500 + }, + { + "epoch": 0.11892892750320468, + "grad_norm": 0.546875, + "learning_rate": 4.9418623977411455e-05, + "loss": 0.2308, + "step": 2505 + }, + { + "epoch": 0.11916631059203342, + "grad_norm": 0.490234375, + "learning_rate": 4.941462784185366e-05, + "loss": 0.2304, + "step": 2510 + }, + { + "epoch": 0.11940369368086218, + "grad_norm": 0.81640625, + "learning_rate": 4.9410618200410374e-05, + "loss": 0.235, + "step": 2515 + }, + { + "epoch": 0.11964107676969092, + "grad_norm": 0.703125, + "learning_rate": 4.940659505555273e-05, + "loss": 0.2389, + "step": 2520 + }, + { + "epoch": 0.11987845985851968, + "grad_norm": 0.84765625, + "learning_rate": 4.9402558409760176e-05, + "loss": 0.2329, + "step": 2525 + }, + { + "epoch": 0.12011584294734844, + "grad_norm": 0.9296875, + "learning_rate": 4.9398508265520455e-05, + "loss": 0.2325, + "step": 2530 + }, + { + "epoch": 0.12035322603617718, + "grad_norm": 0.53515625, + "learning_rate": 4.9394444625329686e-05, + "loss": 0.2307, + "step": 2535 + }, + { + "epoch": 0.12059060912500594, + "grad_norm": 0.62890625, + "learning_rate": 4.9390367491692255e-05, + "loss": 0.2294, + "step": 2540 + }, + { + "epoch": 0.12082799221383468, + "grad_norm": 0.8828125, + "learning_rate": 4.938627686712087e-05, + "loss": 0.2339, + "step": 2545 + }, + { + "epoch": 0.12106537530266344, + "grad_norm": 0.68359375, + "learning_rate": 4.9382172754136574e-05, + "loss": 0.2292, + "step": 2550 + }, + { + "epoch": 0.1213027583914922, + "grad_norm": 0.6796875, + "learning_rate": 4.9378055155268726e-05, + "loss": 0.2302, + "step": 2555 + }, + { + "epoch": 0.12154014148032094, + "grad_norm": 0.76171875, + "learning_rate": 4.937392407305497e-05, + "loss": 0.2301, + "step": 2560 + }, + { + "epoch": 0.1217775245691497, + "grad_norm": 0.66015625, + "learning_rate": 4.936977951004127e-05, + "loss": 0.2314, + "step": 2565 + }, + { + "epoch": 0.12201490765797844, + "grad_norm": 0.85546875, + "learning_rate": 4.9365621468781905e-05, + "loss": 0.2317, + "step": 2570 + }, + { + "epoch": 0.1222522907468072, + "grad_norm": 0.578125, + "learning_rate": 4.936144995183947e-05, + "loss": 0.2319, + "step": 2575 + }, + { + "epoch": 0.12248967383563594, + "grad_norm": 0.578125, + "learning_rate": 4.935726496178483e-05, + "loss": 0.2309, + "step": 2580 + }, + { + "epoch": 0.1227270569244647, + "grad_norm": 0.4453125, + "learning_rate": 4.935306650119719e-05, + "loss": 0.2333, + "step": 2585 + }, + { + "epoch": 0.12296444001329346, + "grad_norm": 0.60546875, + "learning_rate": 4.934885457266404e-05, + "loss": 0.2308, + "step": 2590 + }, + { + "epoch": 0.1232018231021222, + "grad_norm": 0.6484375, + "learning_rate": 4.9344629178781165e-05, + "loss": 0.2287, + "step": 2595 + }, + { + "epoch": 0.12343920619095096, + "grad_norm": 0.8203125, + "learning_rate": 4.934039032215267e-05, + "loss": 0.2306, + "step": 2600 + }, + { + "epoch": 0.1236765892797797, + "grad_norm": 0.48046875, + "learning_rate": 4.933613800539093e-05, + "loss": 0.2297, + "step": 2605 + }, + { + "epoch": 0.12391397236860846, + "grad_norm": 0.57421875, + "learning_rate": 4.9331872231116624e-05, + "loss": 0.2284, + "step": 2610 + }, + { + "epoch": 0.12415135545743722, + "grad_norm": 0.66015625, + "learning_rate": 4.932759300195875e-05, + "loss": 0.2291, + "step": 2615 + }, + { + "epoch": 0.12438873854626596, + "grad_norm": 0.609375, + "learning_rate": 4.9323300320554566e-05, + "loss": 0.2289, + "step": 2620 + }, + { + "epoch": 0.12462612163509472, + "grad_norm": 0.55078125, + "learning_rate": 4.931899418954964e-05, + "loss": 0.2321, + "step": 2625 + }, + { + "epoch": 0.12486350472392346, + "grad_norm": 0.5, + "learning_rate": 4.93146746115978e-05, + "loss": 0.2335, + "step": 2630 + }, + { + "epoch": 0.12510088781275222, + "grad_norm": 0.5, + "learning_rate": 4.9310341589361195e-05, + "loss": 0.2329, + "step": 2635 + }, + { + "epoch": 0.12533827090158098, + "grad_norm": 0.54296875, + "learning_rate": 4.930599512551025e-05, + "loss": 0.2311, + "step": 2640 + }, + { + "epoch": 0.12557565399040974, + "grad_norm": 0.6015625, + "learning_rate": 4.9301635222723666e-05, + "loss": 0.2332, + "step": 2645 + }, + { + "epoch": 0.12581303707923847, + "grad_norm": 0.4921875, + "learning_rate": 4.929726188368844e-05, + "loss": 0.2271, + "step": 2650 + }, + { + "epoch": 0.12605042016806722, + "grad_norm": 0.494140625, + "learning_rate": 4.9292875111099825e-05, + "loss": 0.2275, + "step": 2655 + }, + { + "epoch": 0.12628780325689598, + "grad_norm": 0.55078125, + "learning_rate": 4.928847490766138e-05, + "loss": 0.2327, + "step": 2660 + }, + { + "epoch": 0.12652518634572474, + "grad_norm": 0.58984375, + "learning_rate": 4.928406127608494e-05, + "loss": 0.2322, + "step": 2665 + }, + { + "epoch": 0.1267625694345535, + "grad_norm": 0.74609375, + "learning_rate": 4.927963421909058e-05, + "loss": 0.2277, + "step": 2670 + }, + { + "epoch": 0.12699995252338223, + "grad_norm": 0.734375, + "learning_rate": 4.927519373940669e-05, + "loss": 0.2282, + "step": 2675 + }, + { + "epoch": 0.12723733561221098, + "grad_norm": 0.578125, + "learning_rate": 4.9270739839769934e-05, + "loss": 0.2301, + "step": 2680 + }, + { + "epoch": 0.12747471870103974, + "grad_norm": 0.68359375, + "learning_rate": 4.926627252292519e-05, + "loss": 0.232, + "step": 2685 + }, + { + "epoch": 0.1277121017898685, + "grad_norm": 0.62890625, + "learning_rate": 4.926179179162569e-05, + "loss": 0.237, + "step": 2690 + }, + { + "epoch": 0.12794948487869723, + "grad_norm": 0.55078125, + "learning_rate": 4.925729764863285e-05, + "loss": 0.234, + "step": 2695 + }, + { + "epoch": 0.12818686796752599, + "grad_norm": 0.66015625, + "learning_rate": 4.925279009671642e-05, + "loss": 0.2348, + "step": 2700 + }, + { + "epoch": 0.12842425105635474, + "grad_norm": 0.5625, + "learning_rate": 4.9248269138654356e-05, + "loss": 0.2312, + "step": 2705 + }, + { + "epoch": 0.1286616341451835, + "grad_norm": 0.71875, + "learning_rate": 4.924373477723292e-05, + "loss": 0.2289, + "step": 2710 + }, + { + "epoch": 0.12889901723401226, + "grad_norm": 0.59375, + "learning_rate": 4.923918701524663e-05, + "loss": 0.2326, + "step": 2715 + }, + { + "epoch": 0.129136400322841, + "grad_norm": 0.5546875, + "learning_rate": 4.9234625855498235e-05, + "loss": 0.2337, + "step": 2720 + }, + { + "epoch": 0.12937378341166975, + "grad_norm": 0.51953125, + "learning_rate": 4.9230051300798754e-05, + "loss": 0.231, + "step": 2725 + }, + { + "epoch": 0.1296111665004985, + "grad_norm": 0.52734375, + "learning_rate": 4.922546335396746e-05, + "loss": 0.2294, + "step": 2730 + }, + { + "epoch": 0.12984854958932726, + "grad_norm": 0.486328125, + "learning_rate": 4.922086201783191e-05, + "loss": 0.2311, + "step": 2735 + }, + { + "epoch": 0.13008593267815602, + "grad_norm": 0.69140625, + "learning_rate": 4.9216247295227865e-05, + "loss": 0.2312, + "step": 2740 + }, + { + "epoch": 0.13032331576698475, + "grad_norm": 0.498046875, + "learning_rate": 4.921161918899936e-05, + "loss": 0.2315, + "step": 2745 + }, + { + "epoch": 0.1305606988558135, + "grad_norm": 0.58203125, + "learning_rate": 4.920697770199868e-05, + "loss": 0.2327, + "step": 2750 + }, + { + "epoch": 0.13079808194464226, + "grad_norm": 0.5703125, + "learning_rate": 4.9202322837086356e-05, + "loss": 0.2332, + "step": 2755 + }, + { + "epoch": 0.13103546503347102, + "grad_norm": 0.546875, + "learning_rate": 4.919765459713115e-05, + "loss": 0.2274, + "step": 2760 + }, + { + "epoch": 0.13127284812229978, + "grad_norm": 0.58984375, + "learning_rate": 4.919297298501008e-05, + "loss": 0.2299, + "step": 2765 + }, + { + "epoch": 0.1315102312111285, + "grad_norm": 0.546875, + "learning_rate": 4.91882780036084e-05, + "loss": 0.2341, + "step": 2770 + }, + { + "epoch": 0.13174761429995727, + "grad_norm": 0.64453125, + "learning_rate": 4.918356965581962e-05, + "loss": 0.2272, + "step": 2775 + }, + { + "epoch": 0.13198499738878602, + "grad_norm": 0.63671875, + "learning_rate": 4.917884794454547e-05, + "loss": 0.2309, + "step": 2780 + }, + { + "epoch": 0.13222238047761478, + "grad_norm": 0.59375, + "learning_rate": 4.917411287269589e-05, + "loss": 0.2282, + "step": 2785 + }, + { + "epoch": 0.13245976356644354, + "grad_norm": 0.58984375, + "learning_rate": 4.916936444318912e-05, + "loss": 0.2262, + "step": 2790 + }, + { + "epoch": 0.13269714665527227, + "grad_norm": 0.484375, + "learning_rate": 4.916460265895158e-05, + "loss": 0.2315, + "step": 2795 + }, + { + "epoch": 0.13293452974410103, + "grad_norm": 0.478515625, + "learning_rate": 4.915982752291793e-05, + "loss": 0.2339, + "step": 2800 + }, + { + "epoch": 0.13317191283292978, + "grad_norm": 0.6171875, + "learning_rate": 4.915503903803108e-05, + "loss": 0.2342, + "step": 2805 + }, + { + "epoch": 0.13340929592175854, + "grad_norm": 0.6015625, + "learning_rate": 4.915023720724214e-05, + "loss": 0.2349, + "step": 2810 + }, + { + "epoch": 0.1336466790105873, + "grad_norm": 0.5078125, + "learning_rate": 4.914542203351046e-05, + "loss": 0.2264, + "step": 2815 + }, + { + "epoch": 0.13388406209941603, + "grad_norm": 0.51953125, + "learning_rate": 4.9140593519803604e-05, + "loss": 0.2296, + "step": 2820 + }, + { + "epoch": 0.1341214451882448, + "grad_norm": 0.54296875, + "learning_rate": 4.913575166909738e-05, + "loss": 0.2306, + "step": 2825 + }, + { + "epoch": 0.13435882827707354, + "grad_norm": 0.64453125, + "learning_rate": 4.913089648437578e-05, + "loss": 0.2291, + "step": 2830 + }, + { + "epoch": 0.1345962113659023, + "grad_norm": 0.56640625, + "learning_rate": 4.912602796863104e-05, + "loss": 0.2312, + "step": 2835 + }, + { + "epoch": 0.13483359445473103, + "grad_norm": 0.64453125, + "learning_rate": 4.9121146124863606e-05, + "loss": 0.2306, + "step": 2840 + }, + { + "epoch": 0.1350709775435598, + "grad_norm": 0.5625, + "learning_rate": 4.911625095608213e-05, + "loss": 0.2367, + "step": 2845 + }, + { + "epoch": 0.13530836063238855, + "grad_norm": 0.486328125, + "learning_rate": 4.911134246530349e-05, + "loss": 0.2321, + "step": 2850 + }, + { + "epoch": 0.1355457437212173, + "grad_norm": 0.76953125, + "learning_rate": 4.9106420655552756e-05, + "loss": 0.2336, + "step": 2855 + }, + { + "epoch": 0.13578312681004606, + "grad_norm": 0.546875, + "learning_rate": 4.9101485529863225e-05, + "loss": 0.2319, + "step": 2860 + }, + { + "epoch": 0.1360205098988748, + "grad_norm": 0.5859375, + "learning_rate": 4.9096537091276394e-05, + "loss": 0.2283, + "step": 2865 + }, + { + "epoch": 0.13625789298770355, + "grad_norm": 0.59375, + "learning_rate": 4.909157534284196e-05, + "loss": 0.2305, + "step": 2870 + }, + { + "epoch": 0.1364952760765323, + "grad_norm": 0.462890625, + "learning_rate": 4.908660028761782e-05, + "loss": 0.2322, + "step": 2875 + }, + { + "epoch": 0.13673265916536106, + "grad_norm": 0.4921875, + "learning_rate": 4.908161192867009e-05, + "loss": 0.2305, + "step": 2880 + }, + { + "epoch": 0.13697004225418982, + "grad_norm": 0.51953125, + "learning_rate": 4.9076610269073064e-05, + "loss": 0.2309, + "step": 2885 + }, + { + "epoch": 0.13720742534301855, + "grad_norm": 0.478515625, + "learning_rate": 4.907159531190925e-05, + "loss": 0.2275, + "step": 2890 + }, + { + "epoch": 0.1374448084318473, + "grad_norm": 0.58984375, + "learning_rate": 4.9066567060269334e-05, + "loss": 0.229, + "step": 2895 + }, + { + "epoch": 0.13768219152067607, + "grad_norm": 0.515625, + "learning_rate": 4.906152551725221e-05, + "loss": 0.2304, + "step": 2900 + }, + { + "epoch": 0.13791957460950482, + "grad_norm": 0.5078125, + "learning_rate": 4.905647068596495e-05, + "loss": 0.2271, + "step": 2905 + }, + { + "epoch": 0.13815695769833358, + "grad_norm": 0.55078125, + "learning_rate": 4.905140256952284e-05, + "loss": 0.2253, + "step": 2910 + }, + { + "epoch": 0.1383943407871623, + "grad_norm": 0.6015625, + "learning_rate": 4.904632117104933e-05, + "loss": 0.2286, + "step": 2915 + }, + { + "epoch": 0.13863172387599107, + "grad_norm": 0.5078125, + "learning_rate": 4.904122649367605e-05, + "loss": 0.2294, + "step": 2920 + }, + { + "epoch": 0.13886910696481983, + "grad_norm": 0.58203125, + "learning_rate": 4.9036118540542836e-05, + "loss": 0.2277, + "step": 2925 + }, + { + "epoch": 0.13910649005364858, + "grad_norm": 0.494140625, + "learning_rate": 4.9030997314797705e-05, + "loss": 0.2304, + "step": 2930 + }, + { + "epoch": 0.13934387314247734, + "grad_norm": 0.482421875, + "learning_rate": 4.902586281959683e-05, + "loss": 0.2279, + "step": 2935 + }, + { + "epoch": 0.13958125623130607, + "grad_norm": 0.5703125, + "learning_rate": 4.902071505810459e-05, + "loss": 0.2333, + "step": 2940 + }, + { + "epoch": 0.13981863932013483, + "grad_norm": 0.859375, + "learning_rate": 4.901555403349352e-05, + "loss": 0.2289, + "step": 2945 + }, + { + "epoch": 0.1400560224089636, + "grad_norm": 0.484375, + "learning_rate": 4.901037974894433e-05, + "loss": 0.2347, + "step": 2950 + }, + { + "epoch": 0.14029340549779235, + "grad_norm": 0.57421875, + "learning_rate": 4.900519220764593e-05, + "loss": 0.2324, + "step": 2955 + }, + { + "epoch": 0.1405307885866211, + "grad_norm": 0.58203125, + "learning_rate": 4.899999141279536e-05, + "loss": 0.2261, + "step": 2960 + }, + { + "epoch": 0.14076817167544983, + "grad_norm": 0.45703125, + "learning_rate": 4.8994777367597836e-05, + "loss": 0.2315, + "step": 2965 + }, + { + "epoch": 0.1410055547642786, + "grad_norm": 0.58203125, + "learning_rate": 4.8989550075266784e-05, + "loss": 0.2299, + "step": 2970 + }, + { + "epoch": 0.14124293785310735, + "grad_norm": 0.74609375, + "learning_rate": 4.898430953902372e-05, + "loss": 0.2332, + "step": 2975 + }, + { + "epoch": 0.1414803209419361, + "grad_norm": 0.72265625, + "learning_rate": 4.8979055762098396e-05, + "loss": 0.2314, + "step": 2980 + }, + { + "epoch": 0.14171770403076484, + "grad_norm": 0.59765625, + "learning_rate": 4.897378874772867e-05, + "loss": 0.2314, + "step": 2985 + }, + { + "epoch": 0.1419550871195936, + "grad_norm": 0.59375, + "learning_rate": 4.8968508499160594e-05, + "loss": 0.2294, + "step": 2990 + }, + { + "epoch": 0.14219247020842235, + "grad_norm": 0.5546875, + "learning_rate": 4.896321501964836e-05, + "loss": 0.2284, + "step": 2995 + }, + { + "epoch": 0.1424298532972511, + "grad_norm": 0.5625, + "learning_rate": 4.8957908312454296e-05, + "loss": 0.2325, + "step": 3000 + }, + { + "epoch": 0.14266723638607987, + "grad_norm": 0.67578125, + "learning_rate": 4.895258838084892e-05, + "loss": 0.2277, + "step": 3005 + }, + { + "epoch": 0.1429046194749086, + "grad_norm": 0.609375, + "learning_rate": 4.8947255228110874e-05, + "loss": 0.228, + "step": 3010 + }, + { + "epoch": 0.14314200256373735, + "grad_norm": 0.640625, + "learning_rate": 4.894190885752697e-05, + "loss": 0.2319, + "step": 3015 + }, + { + "epoch": 0.1433793856525661, + "grad_norm": 0.546875, + "learning_rate": 4.8936549272392124e-05, + "loss": 0.2279, + "step": 3020 + }, + { + "epoch": 0.14361676874139487, + "grad_norm": 0.59375, + "learning_rate": 4.893117647600945e-05, + "loss": 0.2312, + "step": 3025 + }, + { + "epoch": 0.14385415183022363, + "grad_norm": 0.5, + "learning_rate": 4.892579047169017e-05, + "loss": 0.2297, + "step": 3030 + }, + { + "epoch": 0.14409153491905236, + "grad_norm": 0.671875, + "learning_rate": 4.892039126275365e-05, + "loss": 0.2306, + "step": 3035 + }, + { + "epoch": 0.1443289180078811, + "grad_norm": 0.6171875, + "learning_rate": 4.8914978852527394e-05, + "loss": 0.2304, + "step": 3040 + }, + { + "epoch": 0.14456630109670987, + "grad_norm": 0.59375, + "learning_rate": 4.890955324434706e-05, + "loss": 0.2311, + "step": 3045 + }, + { + "epoch": 0.14480368418553863, + "grad_norm": 0.703125, + "learning_rate": 4.890411444155641e-05, + "loss": 0.2294, + "step": 3050 + }, + { + "epoch": 0.14504106727436739, + "grad_norm": 0.76171875, + "learning_rate": 4.889866244750736e-05, + "loss": 0.2272, + "step": 3055 + }, + { + "epoch": 0.14527845036319612, + "grad_norm": 0.55078125, + "learning_rate": 4.889319726555996e-05, + "loss": 0.2332, + "step": 3060 + }, + { + "epoch": 0.14551583345202487, + "grad_norm": 0.63671875, + "learning_rate": 4.8887718899082355e-05, + "loss": 0.2331, + "step": 3065 + }, + { + "epoch": 0.14575321654085363, + "grad_norm": 0.515625, + "learning_rate": 4.888222735145086e-05, + "loss": 0.2261, + "step": 3070 + }, + { + "epoch": 0.1459905996296824, + "grad_norm": 0.51953125, + "learning_rate": 4.8876722626049884e-05, + "loss": 0.2279, + "step": 3075 + }, + { + "epoch": 0.14622798271851115, + "grad_norm": 0.50390625, + "learning_rate": 4.8871204726271946e-05, + "loss": 0.2258, + "step": 3080 + }, + { + "epoch": 0.14646536580733988, + "grad_norm": 0.5078125, + "learning_rate": 4.886567365551772e-05, + "loss": 0.2305, + "step": 3085 + }, + { + "epoch": 0.14670274889616863, + "grad_norm": 0.5078125, + "learning_rate": 4.886012941719599e-05, + "loss": 0.2314, + "step": 3090 + }, + { + "epoch": 0.1469401319849974, + "grad_norm": 0.68359375, + "learning_rate": 4.8854572014723616e-05, + "loss": 0.2293, + "step": 3095 + }, + { + "epoch": 0.14717751507382615, + "grad_norm": 0.69140625, + "learning_rate": 4.8849001451525626e-05, + "loss": 0.231, + "step": 3100 + }, + { + "epoch": 0.1474148981626549, + "grad_norm": 0.640625, + "learning_rate": 4.884341773103511e-05, + "loss": 0.2295, + "step": 3105 + }, + { + "epoch": 0.14765228125148364, + "grad_norm": 0.51953125, + "learning_rate": 4.8837820856693314e-05, + "loss": 0.2254, + "step": 3110 + }, + { + "epoch": 0.1478896643403124, + "grad_norm": 0.515625, + "learning_rate": 4.883221083194955e-05, + "loss": 0.2297, + "step": 3115 + }, + { + "epoch": 0.14812704742914115, + "grad_norm": 0.63671875, + "learning_rate": 4.882658766026125e-05, + "loss": 0.2293, + "step": 3120 + }, + { + "epoch": 0.1483644305179699, + "grad_norm": 0.54296875, + "learning_rate": 4.882095134509396e-05, + "loss": 0.2291, + "step": 3125 + }, + { + "epoch": 0.14860181360679864, + "grad_norm": 0.6328125, + "learning_rate": 4.8815301889921305e-05, + "loss": 0.2279, + "step": 3130 + }, + { + "epoch": 0.1488391966956274, + "grad_norm": 0.48046875, + "learning_rate": 4.880963929822502e-05, + "loss": 0.2261, + "step": 3135 + }, + { + "epoch": 0.14907657978445615, + "grad_norm": 0.52734375, + "learning_rate": 4.880396357349495e-05, + "loss": 0.2287, + "step": 3140 + }, + { + "epoch": 0.1493139628732849, + "grad_norm": 0.546875, + "learning_rate": 4.8798274719228996e-05, + "loss": 0.2329, + "step": 3145 + }, + { + "epoch": 0.14955134596211367, + "grad_norm": 0.578125, + "learning_rate": 4.879257273893319e-05, + "loss": 0.2319, + "step": 3150 + }, + { + "epoch": 0.1497887290509424, + "grad_norm": 0.59765625, + "learning_rate": 4.8786857636121616e-05, + "loss": 0.2303, + "step": 3155 + }, + { + "epoch": 0.15002611213977116, + "grad_norm": 0.5390625, + "learning_rate": 4.878112941431648e-05, + "loss": 0.2315, + "step": 3160 + }, + { + "epoch": 0.1502634952285999, + "grad_norm": 0.59375, + "learning_rate": 4.877538807704807e-05, + "loss": 0.2299, + "step": 3165 + }, + { + "epoch": 0.15050087831742867, + "grad_norm": 0.60546875, + "learning_rate": 4.876963362785473e-05, + "loss": 0.2343, + "step": 3170 + }, + { + "epoch": 0.15073826140625743, + "grad_norm": 0.640625, + "learning_rate": 4.876386607028291e-05, + "loss": 0.2295, + "step": 3175 + }, + { + "epoch": 0.15097564449508616, + "grad_norm": 0.47265625, + "learning_rate": 4.8758085407887105e-05, + "loss": 0.2286, + "step": 3180 + }, + { + "epoch": 0.15121302758391492, + "grad_norm": 0.53125, + "learning_rate": 4.875229164422995e-05, + "loss": 0.2305, + "step": 3185 + }, + { + "epoch": 0.15145041067274367, + "grad_norm": 0.52734375, + "learning_rate": 4.874648478288209e-05, + "loss": 0.2273, + "step": 3190 + }, + { + "epoch": 0.15168779376157243, + "grad_norm": 0.59375, + "learning_rate": 4.8740664827422265e-05, + "loss": 0.2295, + "step": 3195 + }, + { + "epoch": 0.1519251768504012, + "grad_norm": 0.546875, + "learning_rate": 4.8734831781437304e-05, + "loss": 0.2296, + "step": 3200 + }, + { + "epoch": 0.15216255993922992, + "grad_norm": 0.5234375, + "learning_rate": 4.8728985648522075e-05, + "loss": 0.2304, + "step": 3205 + }, + { + "epoch": 0.15239994302805868, + "grad_norm": 0.486328125, + "learning_rate": 4.872312643227952e-05, + "loss": 0.2354, + "step": 3210 + }, + { + "epoch": 0.15263732611688743, + "grad_norm": 0.60546875, + "learning_rate": 4.871725413632066e-05, + "loss": 0.2296, + "step": 3215 + }, + { + "epoch": 0.1528747092057162, + "grad_norm": 0.474609375, + "learning_rate": 4.8711368764264555e-05, + "loss": 0.2297, + "step": 3220 + }, + { + "epoch": 0.15311209229454495, + "grad_norm": 0.5078125, + "learning_rate": 4.870547031973834e-05, + "loss": 0.232, + "step": 3225 + }, + { + "epoch": 0.15334947538337368, + "grad_norm": 0.6171875, + "learning_rate": 4.869955880637719e-05, + "loss": 0.2322, + "step": 3230 + }, + { + "epoch": 0.15358685847220244, + "grad_norm": 0.578125, + "learning_rate": 4.8693634227824355e-05, + "loss": 0.2311, + "step": 3235 + }, + { + "epoch": 0.1538242415610312, + "grad_norm": 0.53125, + "learning_rate": 4.868769658773111e-05, + "loss": 0.2308, + "step": 3240 + }, + { + "epoch": 0.15406162464985995, + "grad_norm": 0.470703125, + "learning_rate": 4.8681745889756816e-05, + "loss": 0.2286, + "step": 3245 + }, + { + "epoch": 0.1542990077386887, + "grad_norm": 0.65625, + "learning_rate": 4.867578213756885e-05, + "loss": 0.2297, + "step": 3250 + }, + { + "epoch": 0.15453639082751744, + "grad_norm": 0.62890625, + "learning_rate": 4.8669805334842634e-05, + "loss": 0.2302, + "step": 3255 + }, + { + "epoch": 0.1547737739163462, + "grad_norm": 0.57421875, + "learning_rate": 4.8663815485261666e-05, + "loss": 0.2311, + "step": 3260 + }, + { + "epoch": 0.15501115700517495, + "grad_norm": 0.5078125, + "learning_rate": 4.865781259251744e-05, + "loss": 0.229, + "step": 3265 + }, + { + "epoch": 0.1552485400940037, + "grad_norm": 0.5625, + "learning_rate": 4.865179666030954e-05, + "loss": 0.2346, + "step": 3270 + }, + { + "epoch": 0.15548592318283244, + "grad_norm": 0.55859375, + "learning_rate": 4.864576769234553e-05, + "loss": 0.2287, + "step": 3275 + }, + { + "epoch": 0.1557233062716612, + "grad_norm": 0.5625, + "learning_rate": 4.863972569234106e-05, + "loss": 0.2326, + "step": 3280 + }, + { + "epoch": 0.15596068936048996, + "grad_norm": 0.48828125, + "learning_rate": 4.863367066401977e-05, + "loss": 0.2242, + "step": 3285 + }, + { + "epoch": 0.15619807244931871, + "grad_norm": 0.53125, + "learning_rate": 4.862760261111335e-05, + "loss": 0.2321, + "step": 3290 + }, + { + "epoch": 0.15643545553814747, + "grad_norm": 0.53125, + "learning_rate": 4.8621521537361516e-05, + "loss": 0.2286, + "step": 3295 + }, + { + "epoch": 0.1566728386269762, + "grad_norm": 0.50390625, + "learning_rate": 4.8615427446512e-05, + "loss": 0.2288, + "step": 3300 + }, + { + "epoch": 0.15691022171580496, + "grad_norm": 0.474609375, + "learning_rate": 4.860932034232058e-05, + "loss": 0.2331, + "step": 3305 + }, + { + "epoch": 0.15714760480463372, + "grad_norm": 0.482421875, + "learning_rate": 4.8603200228551006e-05, + "loss": 0.2311, + "step": 3310 + }, + { + "epoch": 0.15738498789346247, + "grad_norm": 0.46484375, + "learning_rate": 4.8597067108975104e-05, + "loss": 0.2313, + "step": 3315 + }, + { + "epoch": 0.15762237098229123, + "grad_norm": 0.54296875, + "learning_rate": 4.859092098737267e-05, + "loss": 0.2275, + "step": 3320 + }, + { + "epoch": 0.15785975407111996, + "grad_norm": 0.5078125, + "learning_rate": 4.8584761867531554e-05, + "loss": 0.2316, + "step": 3325 + }, + { + "epoch": 0.15809713715994872, + "grad_norm": 0.5625, + "learning_rate": 4.857858975324757e-05, + "loss": 0.2297, + "step": 3330 + }, + { + "epoch": 0.15833452024877748, + "grad_norm": 0.46484375, + "learning_rate": 4.857240464832457e-05, + "loss": 0.2249, + "step": 3335 + }, + { + "epoch": 0.15857190333760623, + "grad_norm": 0.4453125, + "learning_rate": 4.856620655657441e-05, + "loss": 0.2292, + "step": 3340 + }, + { + "epoch": 0.158809286426435, + "grad_norm": 0.42578125, + "learning_rate": 4.855999548181695e-05, + "loss": 0.2261, + "step": 3345 + }, + { + "epoch": 0.15904666951526372, + "grad_norm": 0.47265625, + "learning_rate": 4.855377142788003e-05, + "loss": 0.229, + "step": 3350 + }, + { + "epoch": 0.15928405260409248, + "grad_norm": 0.47265625, + "learning_rate": 4.854753439859952e-05, + "loss": 0.2349, + "step": 3355 + }, + { + "epoch": 0.15952143569292124, + "grad_norm": 0.68359375, + "learning_rate": 4.854128439781927e-05, + "loss": 0.2287, + "step": 3360 + }, + { + "epoch": 0.15975881878175, + "grad_norm": 0.62109375, + "learning_rate": 4.853502142939113e-05, + "loss": 0.2279, + "step": 3365 + }, + { + "epoch": 0.15999620187057875, + "grad_norm": 0.474609375, + "learning_rate": 4.852874549717494e-05, + "loss": 0.226, + "step": 3370 + }, + { + "epoch": 0.16023358495940748, + "grad_norm": 0.671875, + "learning_rate": 4.8522456605038515e-05, + "loss": 0.2279, + "step": 3375 + }, + { + "epoch": 0.16047096804823624, + "grad_norm": 0.52734375, + "learning_rate": 4.8516154756857686e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.160708351137065, + "grad_norm": 0.5546875, + "learning_rate": 4.8509839956516246e-05, + "loss": 0.2289, + "step": 3385 + }, + { + "epoch": 0.16094573422589376, + "grad_norm": 0.5234375, + "learning_rate": 4.8503512207905985e-05, + "loss": 0.2269, + "step": 3390 + }, + { + "epoch": 0.1611831173147225, + "grad_norm": 0.5703125, + "learning_rate": 4.849717151492665e-05, + "loss": 0.2311, + "step": 3395 + }, + { + "epoch": 0.16142050040355124, + "grad_norm": 0.6328125, + "learning_rate": 4.8490817881486e-05, + "loss": 0.2317, + "step": 3400 + }, + { + "epoch": 0.16165788349238, + "grad_norm": 0.484375, + "learning_rate": 4.848445131149973e-05, + "loss": 0.2312, + "step": 3405 + }, + { + "epoch": 0.16189526658120876, + "grad_norm": 0.71484375, + "learning_rate": 4.847807180889155e-05, + "loss": 0.2328, + "step": 3410 + }, + { + "epoch": 0.16213264967003752, + "grad_norm": 0.796875, + "learning_rate": 4.847167937759311e-05, + "loss": 0.2326, + "step": 3415 + }, + { + "epoch": 0.16237003275886625, + "grad_norm": 0.64453125, + "learning_rate": 4.8465274021544026e-05, + "loss": 0.2327, + "step": 3420 + }, + { + "epoch": 0.162607415847695, + "grad_norm": 0.490234375, + "learning_rate": 4.84588557446919e-05, + "loss": 0.2307, + "step": 3425 + }, + { + "epoch": 0.16284479893652376, + "grad_norm": 0.58203125, + "learning_rate": 4.8452424550992286e-05, + "loss": 0.2272, + "step": 3430 + }, + { + "epoch": 0.16308218202535252, + "grad_norm": 0.52734375, + "learning_rate": 4.8445980444408706e-05, + "loss": 0.2288, + "step": 3435 + }, + { + "epoch": 0.16331956511418128, + "grad_norm": 0.52734375, + "learning_rate": 4.843952342891262e-05, + "loss": 0.2317, + "step": 3440 + }, + { + "epoch": 0.16355694820301, + "grad_norm": 0.494140625, + "learning_rate": 4.843305350848346e-05, + "loss": 0.2296, + "step": 3445 + }, + { + "epoch": 0.16379433129183876, + "grad_norm": 0.55859375, + "learning_rate": 4.842657068710862e-05, + "loss": 0.2304, + "step": 3450 + }, + { + "epoch": 0.16403171438066752, + "grad_norm": 0.5390625, + "learning_rate": 4.842007496878342e-05, + "loss": 0.2284, + "step": 3455 + }, + { + "epoch": 0.16426909746949628, + "grad_norm": 0.6171875, + "learning_rate": 4.841356635751115e-05, + "loss": 0.2316, + "step": 3460 + }, + { + "epoch": 0.16450648055832504, + "grad_norm": 0.7890625, + "learning_rate": 4.840704485730303e-05, + "loss": 0.2329, + "step": 3465 + }, + { + "epoch": 0.16474386364715377, + "grad_norm": 0.74609375, + "learning_rate": 4.840051047217824e-05, + "loss": 0.2334, + "step": 3470 + }, + { + "epoch": 0.16498124673598252, + "grad_norm": 0.498046875, + "learning_rate": 4.8393963206163886e-05, + "loss": 0.2285, + "step": 3475 + }, + { + "epoch": 0.16521862982481128, + "grad_norm": 0.515625, + "learning_rate": 4.8387403063295017e-05, + "loss": 0.2304, + "step": 3480 + }, + { + "epoch": 0.16545601291364004, + "grad_norm": 0.61328125, + "learning_rate": 4.838083004761462e-05, + "loss": 0.2279, + "step": 3485 + }, + { + "epoch": 0.1656933960024688, + "grad_norm": 0.52734375, + "learning_rate": 4.837424416317362e-05, + "loss": 0.2295, + "step": 3490 + }, + { + "epoch": 0.16593077909129753, + "grad_norm": 0.54296875, + "learning_rate": 4.836764541403087e-05, + "loss": 0.2302, + "step": 3495 + }, + { + "epoch": 0.16616816218012628, + "grad_norm": 0.462890625, + "learning_rate": 4.836103380425313e-05, + "loss": 0.2275, + "step": 3500 + }, + { + "epoch": 0.16640554526895504, + "grad_norm": 0.52734375, + "learning_rate": 4.835440933791513e-05, + "loss": 0.2283, + "step": 3505 + }, + { + "epoch": 0.1666429283577838, + "grad_norm": 0.4609375, + "learning_rate": 4.834777201909948e-05, + "loss": 0.2318, + "step": 3510 + }, + { + "epoch": 0.16688031144661256, + "grad_norm": 0.55859375, + "learning_rate": 4.8341121851896734e-05, + "loss": 0.2257, + "step": 3515 + }, + { + "epoch": 0.16711769453544129, + "grad_norm": 0.62890625, + "learning_rate": 4.833445884040537e-05, + "loss": 0.2303, + "step": 3520 + }, + { + "epoch": 0.16735507762427004, + "grad_norm": 0.79296875, + "learning_rate": 4.832778298873176e-05, + "loss": 0.2315, + "step": 3525 + }, + { + "epoch": 0.1675924607130988, + "grad_norm": 0.50390625, + "learning_rate": 4.832109430099021e-05, + "loss": 0.228, + "step": 3530 + }, + { + "epoch": 0.16782984380192756, + "grad_norm": 0.58984375, + "learning_rate": 4.831439278130291e-05, + "loss": 0.2241, + "step": 3535 + }, + { + "epoch": 0.16806722689075632, + "grad_norm": 0.625, + "learning_rate": 4.83076784338e-05, + "loss": 0.2315, + "step": 3540 + }, + { + "epoch": 0.16830460997958505, + "grad_norm": 0.6171875, + "learning_rate": 4.830095126261948e-05, + "loss": 0.2279, + "step": 3545 + }, + { + "epoch": 0.1685419930684138, + "grad_norm": 0.4921875, + "learning_rate": 4.82942112719073e-05, + "loss": 0.2359, + "step": 3550 + }, + { + "epoch": 0.16877937615724256, + "grad_norm": 0.65625, + "learning_rate": 4.828745846581726e-05, + "loss": 0.2282, + "step": 3555 + }, + { + "epoch": 0.16901675924607132, + "grad_norm": 0.5078125, + "learning_rate": 4.82806928485111e-05, + "loss": 0.231, + "step": 3560 + }, + { + "epoch": 0.16925414233490005, + "grad_norm": 0.54296875, + "learning_rate": 4.827391442415843e-05, + "loss": 0.2284, + "step": 3565 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 0.50390625, + "learning_rate": 4.8267123196936767e-05, + "loss": 0.2263, + "step": 3570 + }, + { + "epoch": 0.16972890851255756, + "grad_norm": 0.60546875, + "learning_rate": 4.826031917103151e-05, + "loss": 0.2312, + "step": 3575 + }, + { + "epoch": 0.16996629160138632, + "grad_norm": 0.54296875, + "learning_rate": 4.8253502350635957e-05, + "loss": 0.2319, + "step": 3580 + }, + { + "epoch": 0.17020367469021508, + "grad_norm": 0.50390625, + "learning_rate": 4.824667273995127e-05, + "loss": 0.2323, + "step": 3585 + }, + { + "epoch": 0.1704410577790438, + "grad_norm": 0.5078125, + "learning_rate": 4.8239830343186526e-05, + "loss": 0.2315, + "step": 3590 + }, + { + "epoch": 0.17067844086787257, + "grad_norm": 0.57421875, + "learning_rate": 4.823297516455864e-05, + "loss": 0.2255, + "step": 3595 + }, + { + "epoch": 0.17091582395670132, + "grad_norm": 0.58984375, + "learning_rate": 4.8226107208292444e-05, + "loss": 0.2277, + "step": 3600 + }, + { + "epoch": 0.17115320704553008, + "grad_norm": 0.58203125, + "learning_rate": 4.8219226478620634e-05, + "loss": 0.2318, + "step": 3605 + }, + { + "epoch": 0.17139059013435884, + "grad_norm": 0.484375, + "learning_rate": 4.8212332979783746e-05, + "loss": 0.2255, + "step": 3610 + }, + { + "epoch": 0.17162797322318757, + "grad_norm": 0.453125, + "learning_rate": 4.820542671603024e-05, + "loss": 0.2276, + "step": 3615 + }, + { + "epoch": 0.17186535631201633, + "grad_norm": 0.53515625, + "learning_rate": 4.81985076916164e-05, + "loss": 0.2251, + "step": 3620 + }, + { + "epoch": 0.17210273940084508, + "grad_norm": 0.4765625, + "learning_rate": 4.819157591080639e-05, + "loss": 0.2312, + "step": 3625 + }, + { + "epoch": 0.17234012248967384, + "grad_norm": 0.5, + "learning_rate": 4.8184631377872244e-05, + "loss": 0.2251, + "step": 3630 + }, + { + "epoch": 0.1725775055785026, + "grad_norm": 0.515625, + "learning_rate": 4.8177674097093835e-05, + "loss": 0.2303, + "step": 3635 + }, + { + "epoch": 0.17281488866733133, + "grad_norm": 0.515625, + "learning_rate": 4.8170704072758904e-05, + "loss": 0.2273, + "step": 3640 + }, + { + "epoch": 0.1730522717561601, + "grad_norm": 0.54296875, + "learning_rate": 4.816372130916305e-05, + "loss": 0.2294, + "step": 3645 + }, + { + "epoch": 0.17328965484498884, + "grad_norm": 0.69921875, + "learning_rate": 4.815672581060972e-05, + "loss": 0.228, + "step": 3650 + }, + { + "epoch": 0.1735270379338176, + "grad_norm": 0.63671875, + "learning_rate": 4.814971758141021e-05, + "loss": 0.2305, + "step": 3655 + }, + { + "epoch": 0.17376442102264636, + "grad_norm": 0.53125, + "learning_rate": 4.814269662588365e-05, + "loss": 0.233, + "step": 3660 + }, + { + "epoch": 0.1740018041114751, + "grad_norm": 0.5078125, + "learning_rate": 4.813566294835703e-05, + "loss": 0.2284, + "step": 3665 + }, + { + "epoch": 0.17423918720030385, + "grad_norm": 0.60546875, + "learning_rate": 4.8128616553165175e-05, + "loss": 0.2284, + "step": 3670 + }, + { + "epoch": 0.1744765702891326, + "grad_norm": 0.5546875, + "learning_rate": 4.812155744465073e-05, + "loss": 0.2327, + "step": 3675 + }, + { + "epoch": 0.17471395337796136, + "grad_norm": 0.49609375, + "learning_rate": 4.811448562716422e-05, + "loss": 0.2279, + "step": 3680 + }, + { + "epoch": 0.17495133646679012, + "grad_norm": 0.494140625, + "learning_rate": 4.8107401105063957e-05, + "loss": 0.2299, + "step": 3685 + }, + { + "epoch": 0.17518871955561885, + "grad_norm": 0.70703125, + "learning_rate": 4.81003038827161e-05, + "loss": 0.2288, + "step": 3690 + }, + { + "epoch": 0.1754261026444476, + "grad_norm": 0.515625, + "learning_rate": 4.809319396449463e-05, + "loss": 0.2301, + "step": 3695 + }, + { + "epoch": 0.17566348573327636, + "grad_norm": 0.515625, + "learning_rate": 4.8086071354781364e-05, + "loss": 0.2263, + "step": 3700 + }, + { + "epoch": 0.17590086882210512, + "grad_norm": 0.5078125, + "learning_rate": 4.807893605796594e-05, + "loss": 0.2279, + "step": 3705 + }, + { + "epoch": 0.17613825191093385, + "grad_norm": 0.48828125, + "learning_rate": 4.80717880784458e-05, + "loss": 0.2282, + "step": 3710 + }, + { + "epoch": 0.1763756349997626, + "grad_norm": 0.486328125, + "learning_rate": 4.8064627420626215e-05, + "loss": 0.2256, + "step": 3715 + }, + { + "epoch": 0.17661301808859137, + "grad_norm": 0.5078125, + "learning_rate": 4.805745408892026e-05, + "loss": 0.2282, + "step": 3720 + }, + { + "epoch": 0.17685040117742012, + "grad_norm": 0.70703125, + "learning_rate": 4.805026808774883e-05, + "loss": 0.2284, + "step": 3725 + }, + { + "epoch": 0.17708778426624888, + "grad_norm": 0.58203125, + "learning_rate": 4.804306942154063e-05, + "loss": 0.2287, + "step": 3730 + }, + { + "epoch": 0.1773251673550776, + "grad_norm": 0.53515625, + "learning_rate": 4.803585809473216e-05, + "loss": 0.2282, + "step": 3735 + }, + { + "epoch": 0.17756255044390637, + "grad_norm": 0.578125, + "learning_rate": 4.802863411176774e-05, + "loss": 0.2269, + "step": 3740 + }, + { + "epoch": 0.17779993353273513, + "grad_norm": 0.5625, + "learning_rate": 4.802139747709945e-05, + "loss": 0.2254, + "step": 3745 + }, + { + "epoch": 0.17803731662156388, + "grad_norm": 0.478515625, + "learning_rate": 4.8014148195187226e-05, + "loss": 0.2304, + "step": 3750 + }, + { + "epoch": 0.17827469971039264, + "grad_norm": 0.54296875, + "learning_rate": 4.800688627049874e-05, + "loss": 0.2265, + "step": 3755 + }, + { + "epoch": 0.17851208279922137, + "grad_norm": 0.6953125, + "learning_rate": 4.799961170750951e-05, + "loss": 0.2326, + "step": 3760 + }, + { + "epoch": 0.17874946588805013, + "grad_norm": 0.59765625, + "learning_rate": 4.79923245107028e-05, + "loss": 0.2303, + "step": 3765 + }, + { + "epoch": 0.1789868489768789, + "grad_norm": 0.63671875, + "learning_rate": 4.798502468456967e-05, + "loss": 0.2338, + "step": 3770 + }, + { + "epoch": 0.17922423206570764, + "grad_norm": 0.59375, + "learning_rate": 4.797771223360899e-05, + "loss": 0.2258, + "step": 3775 + }, + { + "epoch": 0.1794616151545364, + "grad_norm": 0.45703125, + "learning_rate": 4.797038716232737e-05, + "loss": 0.2305, + "step": 3780 + }, + { + "epoch": 0.17969899824336513, + "grad_norm": 0.54296875, + "learning_rate": 4.796304947523923e-05, + "loss": 0.2272, + "step": 3785 + }, + { + "epoch": 0.1799363813321939, + "grad_norm": 0.466796875, + "learning_rate": 4.795569917686676e-05, + "loss": 0.2268, + "step": 3790 + }, + { + "epoch": 0.18017376442102265, + "grad_norm": 0.46875, + "learning_rate": 4.7948336271739894e-05, + "loss": 0.2324, + "step": 3795 + }, + { + "epoch": 0.1804111475098514, + "grad_norm": 0.52734375, + "learning_rate": 4.794096076439638e-05, + "loss": 0.2293, + "step": 3800 + }, + { + "epoch": 0.18064853059868016, + "grad_norm": 0.51171875, + "learning_rate": 4.793357265938169e-05, + "loss": 0.2272, + "step": 3805 + }, + { + "epoch": 0.1808859136875089, + "grad_norm": 0.515625, + "learning_rate": 4.792617196124908e-05, + "loss": 0.2255, + "step": 3810 + }, + { + "epoch": 0.18112329677633765, + "grad_norm": 0.51953125, + "learning_rate": 4.7918758674559595e-05, + "loss": 0.2314, + "step": 3815 + }, + { + "epoch": 0.1813606798651664, + "grad_norm": 0.490234375, + "learning_rate": 4.791133280388198e-05, + "loss": 0.2269, + "step": 3820 + }, + { + "epoch": 0.18159806295399517, + "grad_norm": 0.462890625, + "learning_rate": 4.7903894353792764e-05, + "loss": 0.2316, + "step": 3825 + }, + { + "epoch": 0.18183544604282392, + "grad_norm": 0.470703125, + "learning_rate": 4.789644332887626e-05, + "loss": 0.2293, + "step": 3830 + }, + { + "epoch": 0.18207282913165265, + "grad_norm": 0.44140625, + "learning_rate": 4.788897973372447e-05, + "loss": 0.2305, + "step": 3835 + }, + { + "epoch": 0.1823102122204814, + "grad_norm": 0.458984375, + "learning_rate": 4.7881503572937186e-05, + "loss": 0.2316, + "step": 3840 + }, + { + "epoch": 0.18254759530931017, + "grad_norm": 0.4921875, + "learning_rate": 4.787401485112193e-05, + "loss": 0.2306, + "step": 3845 + }, + { + "epoch": 0.18278497839813893, + "grad_norm": 0.4375, + "learning_rate": 4.7866513572893975e-05, + "loss": 0.2281, + "step": 3850 + }, + { + "epoch": 0.18302236148696766, + "grad_norm": 0.5390625, + "learning_rate": 4.7858999742876315e-05, + "loss": 0.2322, + "step": 3855 + }, + { + "epoch": 0.1832597445757964, + "grad_norm": 0.462890625, + "learning_rate": 4.785147336569969e-05, + "loss": 0.2298, + "step": 3860 + }, + { + "epoch": 0.18349712766462517, + "grad_norm": 0.51953125, + "learning_rate": 4.784393444600257e-05, + "loss": 0.2296, + "step": 3865 + }, + { + "epoch": 0.18373451075345393, + "grad_norm": 0.5078125, + "learning_rate": 4.783638298843117e-05, + "loss": 0.2273, + "step": 3870 + }, + { + "epoch": 0.18397189384228269, + "grad_norm": 0.5625, + "learning_rate": 4.7828818997639404e-05, + "loss": 0.2319, + "step": 3875 + }, + { + "epoch": 0.18420927693111142, + "grad_norm": 0.447265625, + "learning_rate": 4.7821242478288935e-05, + "loss": 0.2291, + "step": 3880 + }, + { + "epoch": 0.18444666001994017, + "grad_norm": 0.6484375, + "learning_rate": 4.781365343504913e-05, + "loss": 0.2254, + "step": 3885 + }, + { + "epoch": 0.18468404310876893, + "grad_norm": 0.546875, + "learning_rate": 4.7806051872597095e-05, + "loss": 0.2258, + "step": 3890 + }, + { + "epoch": 0.1849214261975977, + "grad_norm": 0.5625, + "learning_rate": 4.779843779561762e-05, + "loss": 0.2317, + "step": 3895 + }, + { + "epoch": 0.18515880928642645, + "grad_norm": 0.6015625, + "learning_rate": 4.7790811208803244e-05, + "loss": 0.2298, + "step": 3900 + }, + { + "epoch": 0.18539619237525518, + "grad_norm": 0.55859375, + "learning_rate": 4.778317211685418e-05, + "loss": 0.2306, + "step": 3905 + }, + { + "epoch": 0.18563357546408393, + "grad_norm": 0.5703125, + "learning_rate": 4.7775520524478374e-05, + "loss": 0.2299, + "step": 3910 + }, + { + "epoch": 0.1858709585529127, + "grad_norm": 0.49609375, + "learning_rate": 4.7767856436391476e-05, + "loss": 0.23, + "step": 3915 + }, + { + "epoch": 0.18610834164174145, + "grad_norm": 0.53125, + "learning_rate": 4.7760179857316815e-05, + "loss": 0.2305, + "step": 3920 + }, + { + "epoch": 0.1863457247305702, + "grad_norm": 0.54296875, + "learning_rate": 4.775249079198544e-05, + "loss": 0.2336, + "step": 3925 + }, + { + "epoch": 0.18658310781939894, + "grad_norm": 0.47265625, + "learning_rate": 4.774478924513608e-05, + "loss": 0.2272, + "step": 3930 + }, + { + "epoch": 0.1868204909082277, + "grad_norm": 0.5546875, + "learning_rate": 4.773707522151517e-05, + "loss": 0.2272, + "step": 3935 + }, + { + "epoch": 0.18705787399705645, + "grad_norm": 0.66796875, + "learning_rate": 4.7729348725876826e-05, + "loss": 0.2286, + "step": 3940 + }, + { + "epoch": 0.1872952570858852, + "grad_norm": 0.63671875, + "learning_rate": 4.7721609762982856e-05, + "loss": 0.2294, + "step": 3945 + }, + { + "epoch": 0.18753264017471397, + "grad_norm": 0.546875, + "learning_rate": 4.771385833760274e-05, + "loss": 0.2244, + "step": 3950 + }, + { + "epoch": 0.1877700232635427, + "grad_norm": 0.5078125, + "learning_rate": 4.770609445451364e-05, + "loss": 0.2311, + "step": 3955 + }, + { + "epoch": 0.18800740635237145, + "grad_norm": 0.5234375, + "learning_rate": 4.7698318118500424e-05, + "loss": 0.2287, + "step": 3960 + }, + { + "epoch": 0.1882447894412002, + "grad_norm": 0.5546875, + "learning_rate": 4.7690529334355596e-05, + "loss": 0.2299, + "step": 3965 + }, + { + "epoch": 0.18848217253002897, + "grad_norm": 0.5078125, + "learning_rate": 4.7682728106879356e-05, + "loss": 0.2289, + "step": 3970 + }, + { + "epoch": 0.18871955561885773, + "grad_norm": 0.48828125, + "learning_rate": 4.767491444087955e-05, + "loss": 0.2291, + "step": 3975 + }, + { + "epoch": 0.18895693870768646, + "grad_norm": 0.51171875, + "learning_rate": 4.766708834117172e-05, + "loss": 0.2285, + "step": 3980 + }, + { + "epoch": 0.1891943217965152, + "grad_norm": 0.515625, + "learning_rate": 4.7659249812579055e-05, + "loss": 0.2304, + "step": 3985 + }, + { + "epoch": 0.18943170488534397, + "grad_norm": 0.431640625, + "learning_rate": 4.765139885993241e-05, + "loss": 0.228, + "step": 3990 + }, + { + "epoch": 0.18966908797417273, + "grad_norm": 0.48828125, + "learning_rate": 4.7643535488070265e-05, + "loss": 0.2315, + "step": 3995 + }, + { + "epoch": 0.18990647106300146, + "grad_norm": 0.490234375, + "learning_rate": 4.7635659701838805e-05, + "loss": 0.2295, + "step": 4000 + }, + { + "epoch": 0.19014385415183022, + "grad_norm": 0.55859375, + "learning_rate": 4.762777150609183e-05, + "loss": 0.2306, + "step": 4005 + }, + { + "epoch": 0.19038123724065897, + "grad_norm": 0.5859375, + "learning_rate": 4.761987090569081e-05, + "loss": 0.2298, + "step": 4010 + }, + { + "epoch": 0.19061862032948773, + "grad_norm": 0.58984375, + "learning_rate": 4.761195790550484e-05, + "loss": 0.2285, + "step": 4015 + }, + { + "epoch": 0.1908560034183165, + "grad_norm": 0.609375, + "learning_rate": 4.760403251041067e-05, + "loss": 0.2255, + "step": 4020 + }, + { + "epoch": 0.19109338650714522, + "grad_norm": 0.474609375, + "learning_rate": 4.7596094725292676e-05, + "loss": 0.2316, + "step": 4025 + }, + { + "epoch": 0.19133076959597398, + "grad_norm": 0.484375, + "learning_rate": 4.7588144555042896e-05, + "loss": 0.2316, + "step": 4030 + }, + { + "epoch": 0.19156815268480273, + "grad_norm": 0.47265625, + "learning_rate": 4.758018200456097e-05, + "loss": 0.2295, + "step": 4035 + }, + { + "epoch": 0.1918055357736315, + "grad_norm": 0.498046875, + "learning_rate": 4.757220707875418e-05, + "loss": 0.2289, + "step": 4040 + }, + { + "epoch": 0.19204291886246025, + "grad_norm": 0.62109375, + "learning_rate": 4.756421978253745e-05, + "loss": 0.2285, + "step": 4045 + }, + { + "epoch": 0.19228030195128898, + "grad_norm": 0.5, + "learning_rate": 4.755622012083331e-05, + "loss": 0.2279, + "step": 4050 + }, + { + "epoch": 0.19251768504011774, + "grad_norm": 0.61328125, + "learning_rate": 4.75482080985719e-05, + "loss": 0.2301, + "step": 4055 + }, + { + "epoch": 0.1927550681289465, + "grad_norm": 0.474609375, + "learning_rate": 4.7540183720691006e-05, + "loss": 0.2295, + "step": 4060 + }, + { + "epoch": 0.19299245121777525, + "grad_norm": 0.50390625, + "learning_rate": 4.753214699213602e-05, + "loss": 0.2277, + "step": 4065 + }, + { + "epoch": 0.193229834306604, + "grad_norm": 0.53515625, + "learning_rate": 4.752409791785993e-05, + "loss": 0.2294, + "step": 4070 + }, + { + "epoch": 0.19346721739543274, + "grad_norm": 0.625, + "learning_rate": 4.751603650282335e-05, + "loss": 0.229, + "step": 4075 + }, + { + "epoch": 0.1937046004842615, + "grad_norm": 0.52734375, + "learning_rate": 4.75079627519945e-05, + "loss": 0.2295, + "step": 4080 + }, + { + "epoch": 0.19394198357309025, + "grad_norm": 0.47265625, + "learning_rate": 4.749987667034918e-05, + "loss": 0.229, + "step": 4085 + }, + { + "epoch": 0.194179366661919, + "grad_norm": 0.50390625, + "learning_rate": 4.7491778262870825e-05, + "loss": 0.2263, + "step": 4090 + }, + { + "epoch": 0.19441674975074777, + "grad_norm": 0.408203125, + "learning_rate": 4.748366753455042e-05, + "loss": 0.2257, + "step": 4095 + }, + { + "epoch": 0.1946541328395765, + "grad_norm": 0.478515625, + "learning_rate": 4.74755444903866e-05, + "loss": 0.2286, + "step": 4100 + }, + { + "epoch": 0.19489151592840526, + "grad_norm": 0.48828125, + "learning_rate": 4.7467409135385535e-05, + "loss": 0.2297, + "step": 4105 + }, + { + "epoch": 0.19512889901723401, + "grad_norm": 0.62890625, + "learning_rate": 4.745926147456102e-05, + "loss": 0.2269, + "step": 4110 + }, + { + "epoch": 0.19536628210606277, + "grad_norm": 0.61328125, + "learning_rate": 4.745110151293442e-05, + "loss": 0.2268, + "step": 4115 + }, + { + "epoch": 0.19560366519489153, + "grad_norm": 0.7109375, + "learning_rate": 4.744292925553468e-05, + "loss": 0.227, + "step": 4120 + }, + { + "epoch": 0.19584104828372026, + "grad_norm": 0.703125, + "learning_rate": 4.7434744707398335e-05, + "loss": 0.2252, + "step": 4125 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.59375, + "learning_rate": 4.7426547873569466e-05, + "loss": 0.2294, + "step": 4130 + }, + { + "epoch": 0.19631581446137777, + "grad_norm": 0.50390625, + "learning_rate": 4.741833875909977e-05, + "loss": 0.2268, + "step": 4135 + }, + { + "epoch": 0.19655319755020653, + "grad_norm": 0.53125, + "learning_rate": 4.7410117369048455e-05, + "loss": 0.2282, + "step": 4140 + }, + { + "epoch": 0.19679058063903526, + "grad_norm": 0.4921875, + "learning_rate": 4.7401883708482356e-05, + "loss": 0.23, + "step": 4145 + }, + { + "epoch": 0.19702796372786402, + "grad_norm": 0.6875, + "learning_rate": 4.739363778247582e-05, + "loss": 0.2262, + "step": 4150 + }, + { + "epoch": 0.19726534681669278, + "grad_norm": 0.796875, + "learning_rate": 4.738537959611078e-05, + "loss": 0.2208, + "step": 4155 + }, + { + "epoch": 0.19750272990552153, + "grad_norm": 0.443359375, + "learning_rate": 4.737710915447672e-05, + "loss": 0.2284, + "step": 4160 + }, + { + "epoch": 0.1977401129943503, + "grad_norm": 0.6171875, + "learning_rate": 4.736882646267068e-05, + "loss": 0.2269, + "step": 4165 + }, + { + "epoch": 0.19797749608317902, + "grad_norm": 0.59375, + "learning_rate": 4.736053152579724e-05, + "loss": 0.2254, + "step": 4170 + }, + { + "epoch": 0.19821487917200778, + "grad_norm": 0.54296875, + "learning_rate": 4.735222434896853e-05, + "loss": 0.2257, + "step": 4175 + }, + { + "epoch": 0.19845226226083654, + "grad_norm": 0.48828125, + "learning_rate": 4.7343904937304233e-05, + "loss": 0.2272, + "step": 4180 + }, + { + "epoch": 0.1986896453496653, + "grad_norm": 0.55859375, + "learning_rate": 4.7335573295931554e-05, + "loss": 0.2289, + "step": 4185 + }, + { + "epoch": 0.19892702843849405, + "grad_norm": 0.50390625, + "learning_rate": 4.732722942998526e-05, + "loss": 0.2271, + "step": 4190 + }, + { + "epoch": 0.19916441152732278, + "grad_norm": 0.453125, + "learning_rate": 4.731887334460762e-05, + "loss": 0.2315, + "step": 4195 + }, + { + "epoch": 0.19940179461615154, + "grad_norm": 0.52734375, + "learning_rate": 4.731050504494847e-05, + "loss": 0.2297, + "step": 4200 + }, + { + "epoch": 0.1996391777049803, + "grad_norm": 0.59375, + "learning_rate": 4.7302124536165135e-05, + "loss": 0.2288, + "step": 4205 + }, + { + "epoch": 0.19987656079380905, + "grad_norm": 0.53125, + "learning_rate": 4.72937318234225e-05, + "loss": 0.2285, + "step": 4210 + }, + { + "epoch": 0.2001139438826378, + "grad_norm": 0.490234375, + "learning_rate": 4.7285326911892954e-05, + "loss": 0.2306, + "step": 4215 + }, + { + "epoch": 0.20035132697146654, + "grad_norm": 0.5078125, + "learning_rate": 4.7276909806756394e-05, + "loss": 0.2276, + "step": 4220 + }, + { + "epoch": 0.2005887100602953, + "grad_norm": 0.462890625, + "learning_rate": 4.7268480513200256e-05, + "loss": 0.2269, + "step": 4225 + }, + { + "epoch": 0.20082609314912406, + "grad_norm": 0.44921875, + "learning_rate": 4.7260039036419474e-05, + "loss": 0.229, + "step": 4230 + }, + { + "epoch": 0.20106347623795282, + "grad_norm": 0.64453125, + "learning_rate": 4.7251585381616484e-05, + "loss": 0.2288, + "step": 4235 + }, + { + "epoch": 0.20130085932678157, + "grad_norm": 0.5078125, + "learning_rate": 4.724311955400125e-05, + "loss": 0.2269, + "step": 4240 + }, + { + "epoch": 0.2015382424156103, + "grad_norm": 0.52734375, + "learning_rate": 4.723464155879121e-05, + "loss": 0.2296, + "step": 4245 + }, + { + "epoch": 0.20177562550443906, + "grad_norm": 0.6015625, + "learning_rate": 4.722615140121131e-05, + "loss": 0.2261, + "step": 4250 + }, + { + "epoch": 0.20201300859326782, + "grad_norm": 0.53515625, + "learning_rate": 4.721764908649401e-05, + "loss": 0.2286, + "step": 4255 + }, + { + "epoch": 0.20225039168209658, + "grad_norm": 0.4609375, + "learning_rate": 4.720913461987923e-05, + "loss": 0.2282, + "step": 4260 + }, + { + "epoch": 0.20248777477092533, + "grad_norm": 0.41015625, + "learning_rate": 4.720060800661442e-05, + "loss": 0.2294, + "step": 4265 + }, + { + "epoch": 0.20272515785975406, + "grad_norm": 0.55078125, + "learning_rate": 4.719206925195448e-05, + "loss": 0.2297, + "step": 4270 + }, + { + "epoch": 0.20296254094858282, + "grad_norm": 0.546875, + "learning_rate": 4.718351836116179e-05, + "loss": 0.2333, + "step": 4275 + }, + { + "epoch": 0.20319992403741158, + "grad_norm": 0.494140625, + "learning_rate": 4.717495533950625e-05, + "loss": 0.226, + "step": 4280 + }, + { + "epoch": 0.20343730712624034, + "grad_norm": 0.4453125, + "learning_rate": 4.7166380192265205e-05, + "loss": 0.2273, + "step": 4285 + }, + { + "epoch": 0.20367469021506907, + "grad_norm": 0.451171875, + "learning_rate": 4.7157792924723476e-05, + "loss": 0.2269, + "step": 4290 + }, + { + "epoch": 0.20391207330389782, + "grad_norm": 0.53125, + "learning_rate": 4.714919354217336e-05, + "loss": 0.2253, + "step": 4295 + }, + { + "epoch": 0.20414945639272658, + "grad_norm": 0.46484375, + "learning_rate": 4.714058204991462e-05, + "loss": 0.2273, + "step": 4300 + }, + { + "epoch": 0.20438683948155534, + "grad_norm": 0.53125, + "learning_rate": 4.7131958453254466e-05, + "loss": 0.2282, + "step": 4305 + }, + { + "epoch": 0.2046242225703841, + "grad_norm": 0.49609375, + "learning_rate": 4.7123322757507586e-05, + "loss": 0.2282, + "step": 4310 + }, + { + "epoch": 0.20486160565921283, + "grad_norm": 0.5625, + "learning_rate": 4.711467496799613e-05, + "loss": 0.231, + "step": 4315 + }, + { + "epoch": 0.20509898874804158, + "grad_norm": 0.53125, + "learning_rate": 4.7106015090049686e-05, + "loss": 0.2252, + "step": 4320 + }, + { + "epoch": 0.20533637183687034, + "grad_norm": 0.486328125, + "learning_rate": 4.70973431290053e-05, + "loss": 0.2255, + "step": 4325 + }, + { + "epoch": 0.2055737549256991, + "grad_norm": 0.498046875, + "learning_rate": 4.708865909020745e-05, + "loss": 0.2289, + "step": 4330 + }, + { + "epoch": 0.20581113801452786, + "grad_norm": 0.58984375, + "learning_rate": 4.7079962979008086e-05, + "loss": 0.2311, + "step": 4335 + }, + { + "epoch": 0.20604852110335659, + "grad_norm": 0.5390625, + "learning_rate": 4.707125480076657e-05, + "loss": 0.2235, + "step": 4340 + }, + { + "epoch": 0.20628590419218534, + "grad_norm": 0.640625, + "learning_rate": 4.706253456084972e-05, + "loss": 0.2303, + "step": 4345 + }, + { + "epoch": 0.2065232872810141, + "grad_norm": 0.734375, + "learning_rate": 4.705380226463178e-05, + "loss": 0.2319, + "step": 4350 + }, + { + "epoch": 0.20676067036984286, + "grad_norm": 0.5234375, + "learning_rate": 4.704505791749441e-05, + "loss": 0.2297, + "step": 4355 + }, + { + "epoch": 0.20699805345867162, + "grad_norm": 0.5859375, + "learning_rate": 4.7036301524826735e-05, + "loss": 0.223, + "step": 4360 + }, + { + "epoch": 0.20723543654750035, + "grad_norm": 0.5703125, + "learning_rate": 4.702753309202526e-05, + "loss": 0.2294, + "step": 4365 + }, + { + "epoch": 0.2074728196363291, + "grad_norm": 0.498046875, + "learning_rate": 4.701875262449395e-05, + "loss": 0.227, + "step": 4370 + }, + { + "epoch": 0.20771020272515786, + "grad_norm": 0.4375, + "learning_rate": 4.700996012764414e-05, + "loss": 0.2267, + "step": 4375 + }, + { + "epoch": 0.20794758581398662, + "grad_norm": 0.58984375, + "learning_rate": 4.700115560689463e-05, + "loss": 0.2278, + "step": 4380 + }, + { + "epoch": 0.20818496890281538, + "grad_norm": 0.5078125, + "learning_rate": 4.69923390676716e-05, + "loss": 0.225, + "step": 4385 + }, + { + "epoch": 0.2084223519916441, + "grad_norm": 0.53125, + "learning_rate": 4.698351051540864e-05, + "loss": 0.2291, + "step": 4390 + }, + { + "epoch": 0.20865973508047286, + "grad_norm": 0.390625, + "learning_rate": 4.697466995554674e-05, + "loss": 0.2262, + "step": 4395 + }, + { + "epoch": 0.20889711816930162, + "grad_norm": 0.5859375, + "learning_rate": 4.696581739353431e-05, + "loss": 0.2292, + "step": 4400 + }, + { + "epoch": 0.20913450125813038, + "grad_norm": 0.5078125, + "learning_rate": 4.6956952834827135e-05, + "loss": 0.2275, + "step": 4405 + }, + { + "epoch": 0.20937188434695914, + "grad_norm": 0.515625, + "learning_rate": 4.694807628488841e-05, + "loss": 0.2303, + "step": 4410 + }, + { + "epoch": 0.20960926743578787, + "grad_norm": 0.486328125, + "learning_rate": 4.69391877491887e-05, + "loss": 0.2284, + "step": 4415 + }, + { + "epoch": 0.20984665052461662, + "grad_norm": 0.51171875, + "learning_rate": 4.6930287233205976e-05, + "loss": 0.2243, + "step": 4420 + }, + { + "epoch": 0.21008403361344538, + "grad_norm": 0.59375, + "learning_rate": 4.692137474242559e-05, + "loss": 0.2245, + "step": 4425 + }, + { + "epoch": 0.21032141670227414, + "grad_norm": 0.55078125, + "learning_rate": 4.6912450282340264e-05, + "loss": 0.2262, + "step": 4430 + }, + { + "epoch": 0.21055879979110287, + "grad_norm": 0.73046875, + "learning_rate": 4.690351385845011e-05, + "loss": 0.2301, + "step": 4435 + }, + { + "epoch": 0.21079618287993163, + "grad_norm": 0.5703125, + "learning_rate": 4.689456547626259e-05, + "loss": 0.2308, + "step": 4440 + }, + { + "epoch": 0.21103356596876038, + "grad_norm": 0.4765625, + "learning_rate": 4.688560514129257e-05, + "loss": 0.2298, + "step": 4445 + }, + { + "epoch": 0.21127094905758914, + "grad_norm": 0.50390625, + "learning_rate": 4.687663285906225e-05, + "loss": 0.2281, + "step": 4450 + }, + { + "epoch": 0.2115083321464179, + "grad_norm": 0.453125, + "learning_rate": 4.68676486351012e-05, + "loss": 0.2244, + "step": 4455 + }, + { + "epoch": 0.21174571523524663, + "grad_norm": 0.5234375, + "learning_rate": 4.685865247494638e-05, + "loss": 0.2328, + "step": 4460 + }, + { + "epoch": 0.2119830983240754, + "grad_norm": 0.48828125, + "learning_rate": 4.684964438414206e-05, + "loss": 0.2292, + "step": 4465 + }, + { + "epoch": 0.21222048141290414, + "grad_norm": 0.5390625, + "learning_rate": 4.68406243682399e-05, + "loss": 0.2289, + "step": 4470 + }, + { + "epoch": 0.2124578645017329, + "grad_norm": 0.58984375, + "learning_rate": 4.6831592432798896e-05, + "loss": 0.2285, + "step": 4475 + }, + { + "epoch": 0.21269524759056166, + "grad_norm": 0.455078125, + "learning_rate": 4.682254858338537e-05, + "loss": 0.2246, + "step": 4480 + }, + { + "epoch": 0.2129326306793904, + "grad_norm": 0.45703125, + "learning_rate": 4.681349282557303e-05, + "loss": 0.2295, + "step": 4485 + }, + { + "epoch": 0.21317001376821915, + "grad_norm": 0.63671875, + "learning_rate": 4.680442516494287e-05, + "loss": 0.2287, + "step": 4490 + }, + { + "epoch": 0.2134073968570479, + "grad_norm": 0.47265625, + "learning_rate": 4.679534560708328e-05, + "loss": 0.2267, + "step": 4495 + }, + { + "epoch": 0.21364477994587666, + "grad_norm": 0.53125, + "learning_rate": 4.678625415758993e-05, + "loss": 0.2313, + "step": 4500 + }, + { + "epoch": 0.21388216303470542, + "grad_norm": 0.44921875, + "learning_rate": 4.677715082206584e-05, + "loss": 0.2301, + "step": 4505 + }, + { + "epoch": 0.21411954612353415, + "grad_norm": 0.439453125, + "learning_rate": 4.676803560612136e-05, + "loss": 0.2326, + "step": 4510 + }, + { + "epoch": 0.2143569292123629, + "grad_norm": 0.458984375, + "learning_rate": 4.675890851537415e-05, + "loss": 0.2284, + "step": 4515 + }, + { + "epoch": 0.21459431230119166, + "grad_norm": 0.5546875, + "learning_rate": 4.6749769555449214e-05, + "loss": 0.2237, + "step": 4520 + }, + { + "epoch": 0.21483169539002042, + "grad_norm": 0.51171875, + "learning_rate": 4.674061873197882e-05, + "loss": 0.2293, + "step": 4525 + }, + { + "epoch": 0.21506907847884918, + "grad_norm": 0.59765625, + "learning_rate": 4.67314560506026e-05, + "loss": 0.2303, + "step": 4530 + }, + { + "epoch": 0.2153064615676779, + "grad_norm": 0.5859375, + "learning_rate": 4.672228151696747e-05, + "loss": 0.2267, + "step": 4535 + }, + { + "epoch": 0.21554384465650667, + "grad_norm": 0.5859375, + "learning_rate": 4.671309513672765e-05, + "loss": 0.2324, + "step": 4540 + }, + { + "epoch": 0.21578122774533542, + "grad_norm": 0.5234375, + "learning_rate": 4.670389691554466e-05, + "loss": 0.2294, + "step": 4545 + }, + { + "epoch": 0.21601861083416418, + "grad_norm": 0.43359375, + "learning_rate": 4.6694686859087324e-05, + "loss": 0.2281, + "step": 4550 + }, + { + "epoch": 0.21625599392299294, + "grad_norm": 0.62109375, + "learning_rate": 4.668546497303177e-05, + "loss": 0.2282, + "step": 4555 + }, + { + "epoch": 0.21649337701182167, + "grad_norm": 0.6328125, + "learning_rate": 4.6676231263061386e-05, + "loss": 0.2269, + "step": 4560 + }, + { + "epoch": 0.21673076010065043, + "grad_norm": 0.61328125, + "learning_rate": 4.6666985734866884e-05, + "loss": 0.2301, + "step": 4565 + }, + { + "epoch": 0.21696814318947918, + "grad_norm": 0.4453125, + "learning_rate": 4.665772839414623e-05, + "loss": 0.2258, + "step": 4570 + }, + { + "epoch": 0.21720552627830794, + "grad_norm": 0.455078125, + "learning_rate": 4.664845924660469e-05, + "loss": 0.225, + "step": 4575 + }, + { + "epoch": 0.21744290936713667, + "grad_norm": 0.482421875, + "learning_rate": 4.663917829795478e-05, + "loss": 0.2323, + "step": 4580 + }, + { + "epoch": 0.21768029245596543, + "grad_norm": 0.5390625, + "learning_rate": 4.662988555391632e-05, + "loss": 0.2253, + "step": 4585 + }, + { + "epoch": 0.2179176755447942, + "grad_norm": 0.484375, + "learning_rate": 4.662058102021638e-05, + "loss": 0.2253, + "step": 4590 + }, + { + "epoch": 0.21815505863362294, + "grad_norm": 0.64453125, + "learning_rate": 4.661126470258931e-05, + "loss": 0.229, + "step": 4595 + }, + { + "epoch": 0.2183924417224517, + "grad_norm": 0.478515625, + "learning_rate": 4.660193660677671e-05, + "loss": 0.2258, + "step": 4600 + }, + { + "epoch": 0.21862982481128043, + "grad_norm": 0.45703125, + "learning_rate": 4.659259673852743e-05, + "loss": 0.2262, + "step": 4605 + }, + { + "epoch": 0.2188672079001092, + "grad_norm": 0.484375, + "learning_rate": 4.658324510359762e-05, + "loss": 0.2272, + "step": 4610 + }, + { + "epoch": 0.21910459098893795, + "grad_norm": 0.455078125, + "learning_rate": 4.6573881707750616e-05, + "loss": 0.2266, + "step": 4615 + }, + { + "epoch": 0.2193419740777667, + "grad_norm": 0.51953125, + "learning_rate": 4.6564506556757054e-05, + "loss": 0.2307, + "step": 4620 + }, + { + "epoch": 0.21957935716659546, + "grad_norm": 0.427734375, + "learning_rate": 4.655511965639479e-05, + "loss": 0.229, + "step": 4625 + }, + { + "epoch": 0.2198167402554242, + "grad_norm": 0.486328125, + "learning_rate": 4.654572101244893e-05, + "loss": 0.2278, + "step": 4630 + }, + { + "epoch": 0.22005412334425295, + "grad_norm": 0.54296875, + "learning_rate": 4.653631063071182e-05, + "loss": 0.2274, + "step": 4635 + }, + { + "epoch": 0.2202915064330817, + "grad_norm": 0.52734375, + "learning_rate": 4.652688851698304e-05, + "loss": 0.2275, + "step": 4640 + }, + { + "epoch": 0.22052888952191047, + "grad_norm": 0.443359375, + "learning_rate": 4.651745467706938e-05, + "loss": 0.2302, + "step": 4645 + }, + { + "epoch": 0.22076627261073922, + "grad_norm": 0.6015625, + "learning_rate": 4.6508009116784885e-05, + "loss": 0.2227, + "step": 4650 + }, + { + "epoch": 0.22100365569956795, + "grad_norm": 0.5, + "learning_rate": 4.6498551841950805e-05, + "loss": 0.2262, + "step": 4655 + }, + { + "epoch": 0.2212410387883967, + "grad_norm": 0.5078125, + "learning_rate": 4.6489082858395616e-05, + "loss": 0.2295, + "step": 4660 + }, + { + "epoch": 0.22147842187722547, + "grad_norm": 0.515625, + "learning_rate": 4.6479602171955014e-05, + "loss": 0.2285, + "step": 4665 + }, + { + "epoch": 0.22171580496605423, + "grad_norm": 0.5078125, + "learning_rate": 4.6470109788471893e-05, + "loss": 0.2283, + "step": 4670 + }, + { + "epoch": 0.22195318805488298, + "grad_norm": 0.46484375, + "learning_rate": 4.6460605713796387e-05, + "loss": 0.232, + "step": 4675 + }, + { + "epoch": 0.2221905711437117, + "grad_norm": 0.5390625, + "learning_rate": 4.645108995378578e-05, + "loss": 0.2285, + "step": 4680 + }, + { + "epoch": 0.22242795423254047, + "grad_norm": 0.462890625, + "learning_rate": 4.6441562514304625e-05, + "loss": 0.2317, + "step": 4685 + }, + { + "epoch": 0.22266533732136923, + "grad_norm": 0.5, + "learning_rate": 4.643202340122462e-05, + "loss": 0.2293, + "step": 4690 + }, + { + "epoch": 0.22290272041019799, + "grad_norm": 0.4765625, + "learning_rate": 4.6422472620424686e-05, + "loss": 0.2281, + "step": 4695 + }, + { + "epoch": 0.22314010349902674, + "grad_norm": 0.482421875, + "learning_rate": 4.6412910177790926e-05, + "loss": 0.2294, + "step": 4700 + }, + { + "epoch": 0.22337748658785547, + "grad_norm": 0.546875, + "learning_rate": 4.640333607921662e-05, + "loss": 0.2274, + "step": 4705 + }, + { + "epoch": 0.22361486967668423, + "grad_norm": 0.51953125, + "learning_rate": 4.6393750330602244e-05, + "loss": 0.2251, + "step": 4710 + }, + { + "epoch": 0.223852252765513, + "grad_norm": 0.60546875, + "learning_rate": 4.638415293785546e-05, + "loss": 0.2318, + "step": 4715 + }, + { + "epoch": 0.22408963585434175, + "grad_norm": 0.5546875, + "learning_rate": 4.637454390689109e-05, + "loss": 0.2274, + "step": 4720 + }, + { + "epoch": 0.22432701894317048, + "grad_norm": 0.62890625, + "learning_rate": 4.636492324363115e-05, + "loss": 0.2278, + "step": 4725 + }, + { + "epoch": 0.22456440203199923, + "grad_norm": 0.63671875, + "learning_rate": 4.635529095400479e-05, + "loss": 0.2314, + "step": 4730 + }, + { + "epoch": 0.224801785120828, + "grad_norm": 0.5703125, + "learning_rate": 4.634564704394836e-05, + "loss": 0.226, + "step": 4735 + }, + { + "epoch": 0.22503916820965675, + "grad_norm": 0.59765625, + "learning_rate": 4.6335991519405354e-05, + "loss": 0.2237, + "step": 4740 + }, + { + "epoch": 0.2252765512984855, + "grad_norm": 0.51171875, + "learning_rate": 4.6326324386326424e-05, + "loss": 0.2302, + "step": 4745 + }, + { + "epoch": 0.22551393438731424, + "grad_norm": 0.5078125, + "learning_rate": 4.6316645650669385e-05, + "loss": 0.2322, + "step": 4750 + }, + { + "epoch": 0.225751317476143, + "grad_norm": 0.59765625, + "learning_rate": 4.630695531839919e-05, + "loss": 0.2277, + "step": 4755 + }, + { + "epoch": 0.22598870056497175, + "grad_norm": 0.51953125, + "learning_rate": 4.629725339548796e-05, + "loss": 0.2301, + "step": 4760 + }, + { + "epoch": 0.2262260836538005, + "grad_norm": 0.65234375, + "learning_rate": 4.628753988791495e-05, + "loss": 0.2273, + "step": 4765 + }, + { + "epoch": 0.22646346674262927, + "grad_norm": 0.71875, + "learning_rate": 4.6277814801666524e-05, + "loss": 0.2294, + "step": 4770 + }, + { + "epoch": 0.226700849831458, + "grad_norm": 0.5703125, + "learning_rate": 4.626807814273624e-05, + "loss": 0.229, + "step": 4775 + }, + { + "epoch": 0.22693823292028675, + "grad_norm": 0.458984375, + "learning_rate": 4.6258329917124724e-05, + "loss": 0.2263, + "step": 4780 + }, + { + "epoch": 0.2271756160091155, + "grad_norm": 0.5078125, + "learning_rate": 4.624857013083979e-05, + "loss": 0.2241, + "step": 4785 + }, + { + "epoch": 0.22741299909794427, + "grad_norm": 0.6484375, + "learning_rate": 4.6238798789896336e-05, + "loss": 0.2261, + "step": 4790 + }, + { + "epoch": 0.22765038218677303, + "grad_norm": 0.6171875, + "learning_rate": 4.6229015900316404e-05, + "loss": 0.2291, + "step": 4795 + }, + { + "epoch": 0.22788776527560176, + "grad_norm": 0.55859375, + "learning_rate": 4.621922146812914e-05, + "loss": 0.2269, + "step": 4800 + }, + { + "epoch": 0.2281251483644305, + "grad_norm": 0.482421875, + "learning_rate": 4.62094154993708e-05, + "loss": 0.2291, + "step": 4805 + }, + { + "epoch": 0.22836253145325927, + "grad_norm": 0.71484375, + "learning_rate": 4.619959800008477e-05, + "loss": 0.2288, + "step": 4810 + }, + { + "epoch": 0.22859991454208803, + "grad_norm": 0.66015625, + "learning_rate": 4.6189768976321524e-05, + "loss": 0.2298, + "step": 4815 + }, + { + "epoch": 0.22883729763091679, + "grad_norm": 0.57421875, + "learning_rate": 4.6179928434138644e-05, + "loss": 0.2257, + "step": 4820 + }, + { + "epoch": 0.22907468071974552, + "grad_norm": 0.4921875, + "learning_rate": 4.6170076379600825e-05, + "loss": 0.2261, + "step": 4825 + }, + { + "epoch": 0.22931206380857427, + "grad_norm": 0.43359375, + "learning_rate": 4.616021281877982e-05, + "loss": 0.2284, + "step": 4830 + }, + { + "epoch": 0.22954944689740303, + "grad_norm": 0.5, + "learning_rate": 4.6150337757754515e-05, + "loss": 0.2277, + "step": 4835 + }, + { + "epoch": 0.2297868299862318, + "grad_norm": 0.470703125, + "learning_rate": 4.614045120261086e-05, + "loss": 0.2285, + "step": 4840 + }, + { + "epoch": 0.23002421307506055, + "grad_norm": 0.5546875, + "learning_rate": 4.6130553159441884e-05, + "loss": 0.2249, + "step": 4845 + }, + { + "epoch": 0.23026159616388928, + "grad_norm": 0.4296875, + "learning_rate": 4.612064363434772e-05, + "loss": 0.2274, + "step": 4850 + }, + { + "epoch": 0.23049897925271803, + "grad_norm": 0.6640625, + "learning_rate": 4.611072263343556e-05, + "loss": 0.2306, + "step": 4855 + }, + { + "epoch": 0.2307363623415468, + "grad_norm": 0.48828125, + "learning_rate": 4.610079016281967e-05, + "loss": 0.2259, + "step": 4860 + }, + { + "epoch": 0.23097374543037555, + "grad_norm": 0.5234375, + "learning_rate": 4.6090846228621384e-05, + "loss": 0.2279, + "step": 4865 + }, + { + "epoch": 0.23121112851920428, + "grad_norm": 0.400390625, + "learning_rate": 4.608089083696911e-05, + "loss": 0.2265, + "step": 4870 + }, + { + "epoch": 0.23144851160803304, + "grad_norm": 0.5078125, + "learning_rate": 4.60709239939983e-05, + "loss": 0.2239, + "step": 4875 + }, + { + "epoch": 0.2316858946968618, + "grad_norm": 0.48046875, + "learning_rate": 4.606094570585149e-05, + "loss": 0.2281, + "step": 4880 + }, + { + "epoch": 0.23192327778569055, + "grad_norm": 0.5078125, + "learning_rate": 4.605095597867823e-05, + "loss": 0.2284, + "step": 4885 + }, + { + "epoch": 0.2321606608745193, + "grad_norm": 0.484375, + "learning_rate": 4.604095481863517e-05, + "loss": 0.2241, + "step": 4890 + }, + { + "epoch": 0.23239804396334804, + "grad_norm": 0.46875, + "learning_rate": 4.603094223188596e-05, + "loss": 0.2299, + "step": 4895 + }, + { + "epoch": 0.2326354270521768, + "grad_norm": 0.52734375, + "learning_rate": 4.6020918224601324e-05, + "loss": 0.2255, + "step": 4900 + }, + { + "epoch": 0.23287281014100555, + "grad_norm": 0.5390625, + "learning_rate": 4.6010882802959e-05, + "loss": 0.2247, + "step": 4905 + }, + { + "epoch": 0.2331101932298343, + "grad_norm": 0.50390625, + "learning_rate": 4.6000835973143784e-05, + "loss": 0.2229, + "step": 4910 + }, + { + "epoch": 0.23334757631866307, + "grad_norm": 0.52734375, + "learning_rate": 4.5990777741347485e-05, + "loss": 0.229, + "step": 4915 + }, + { + "epoch": 0.2335849594074918, + "grad_norm": 0.46875, + "learning_rate": 4.598070811376895e-05, + "loss": 0.2275, + "step": 4920 + }, + { + "epoch": 0.23382234249632056, + "grad_norm": 0.478515625, + "learning_rate": 4.597062709661404e-05, + "loss": 0.227, + "step": 4925 + }, + { + "epoch": 0.23405972558514931, + "grad_norm": 0.474609375, + "learning_rate": 4.596053469609564e-05, + "loss": 0.2277, + "step": 4930 + }, + { + "epoch": 0.23429710867397807, + "grad_norm": 0.50390625, + "learning_rate": 4.595043091843366e-05, + "loss": 0.2272, + "step": 4935 + }, + { + "epoch": 0.23453449176280683, + "grad_norm": 0.6328125, + "learning_rate": 4.5940315769855004e-05, + "loss": 0.2284, + "step": 4940 + }, + { + "epoch": 0.23477187485163556, + "grad_norm": 0.54296875, + "learning_rate": 4.593018925659359e-05, + "loss": 0.2277, + "step": 4945 + }, + { + "epoch": 0.23500925794046432, + "grad_norm": 0.61328125, + "learning_rate": 4.592005138489034e-05, + "loss": 0.2299, + "step": 4950 + }, + { + "epoch": 0.23524664102929307, + "grad_norm": 0.427734375, + "learning_rate": 4.590990216099319e-05, + "loss": 0.2252, + "step": 4955 + }, + { + "epoch": 0.23548402411812183, + "grad_norm": 0.609375, + "learning_rate": 4.589974159115705e-05, + "loss": 0.2275, + "step": 4960 + }, + { + "epoch": 0.2357214072069506, + "grad_norm": 0.65625, + "learning_rate": 4.588956968164383e-05, + "loss": 0.227, + "step": 4965 + }, + { + "epoch": 0.23595879029577932, + "grad_norm": 0.61328125, + "learning_rate": 4.587938643872246e-05, + "loss": 0.2301, + "step": 4970 + }, + { + "epoch": 0.23619617338460808, + "grad_norm": 0.63671875, + "learning_rate": 4.586919186866879e-05, + "loss": 0.2333, + "step": 4975 + }, + { + "epoch": 0.23643355647343683, + "grad_norm": 0.5078125, + "learning_rate": 4.58589859777657e-05, + "loss": 0.2302, + "step": 4980 + }, + { + "epoch": 0.2366709395622656, + "grad_norm": 0.515625, + "learning_rate": 4.584876877230304e-05, + "loss": 0.2266, + "step": 4985 + }, + { + "epoch": 0.23690832265109435, + "grad_norm": 0.57421875, + "learning_rate": 4.5838540258577625e-05, + "loss": 0.2286, + "step": 4990 + }, + { + "epoch": 0.23714570573992308, + "grad_norm": 0.55078125, + "learning_rate": 4.5828300442893236e-05, + "loss": 0.2284, + "step": 4995 + }, + { + "epoch": 0.23738308882875184, + "grad_norm": 0.6484375, + "learning_rate": 4.5818049331560634e-05, + "loss": 0.2307, + "step": 5000 + }, + { + "epoch": 0.2376204719175806, + "grad_norm": 0.53125, + "learning_rate": 4.580778693089753e-05, + "loss": 0.2271, + "step": 5005 + }, + { + "epoch": 0.23785785500640935, + "grad_norm": 0.84765625, + "learning_rate": 4.579751324722859e-05, + "loss": 0.2276, + "step": 5010 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.67578125, + "learning_rate": 4.578722828688543e-05, + "loss": 0.2251, + "step": 5015 + }, + { + "epoch": 0.23833262118406684, + "grad_norm": 0.5, + "learning_rate": 4.577693205620663e-05, + "loss": 0.2299, + "step": 5020 + }, + { + "epoch": 0.2385700042728956, + "grad_norm": 0.58984375, + "learning_rate": 4.576662456153773e-05, + "loss": 0.2277, + "step": 5025 + }, + { + "epoch": 0.23880738736172435, + "grad_norm": 0.52734375, + "learning_rate": 4.5756305809231176e-05, + "loss": 0.227, + "step": 5030 + }, + { + "epoch": 0.2390447704505531, + "grad_norm": 0.5, + "learning_rate": 4.5745975805646366e-05, + "loss": 0.2311, + "step": 5035 + }, + { + "epoch": 0.23928215353938184, + "grad_norm": 0.498046875, + "learning_rate": 4.573563455714964e-05, + "loss": 0.2275, + "step": 5040 + }, + { + "epoch": 0.2395195366282106, + "grad_norm": 0.53515625, + "learning_rate": 4.572528207011425e-05, + "loss": 0.2287, + "step": 5045 + }, + { + "epoch": 0.23975691971703936, + "grad_norm": 0.49609375, + "learning_rate": 4.57149183509204e-05, + "loss": 0.2253, + "step": 5050 + }, + { + "epoch": 0.23999430280586811, + "grad_norm": 0.57421875, + "learning_rate": 4.570454340595521e-05, + "loss": 0.2316, + "step": 5055 + }, + { + "epoch": 0.24023168589469687, + "grad_norm": 0.462890625, + "learning_rate": 4.569415724161269e-05, + "loss": 0.2296, + "step": 5060 + }, + { + "epoch": 0.2404690689835256, + "grad_norm": 0.486328125, + "learning_rate": 4.56837598642938e-05, + "loss": 0.2238, + "step": 5065 + }, + { + "epoch": 0.24070645207235436, + "grad_norm": 0.48828125, + "learning_rate": 4.567335128040639e-05, + "loss": 0.2296, + "step": 5070 + }, + { + "epoch": 0.24094383516118312, + "grad_norm": 0.56640625, + "learning_rate": 4.566293149636523e-05, + "loss": 0.2239, + "step": 5075 + }, + { + "epoch": 0.24118121825001188, + "grad_norm": 0.6640625, + "learning_rate": 4.5652500518591975e-05, + "loss": 0.2252, + "step": 5080 + }, + { + "epoch": 0.24141860133884063, + "grad_norm": 0.52734375, + "learning_rate": 4.5642058353515204e-05, + "loss": 0.231, + "step": 5085 + }, + { + "epoch": 0.24165598442766936, + "grad_norm": 0.54296875, + "learning_rate": 4.5631605007570366e-05, + "loss": 0.2297, + "step": 5090 + }, + { + "epoch": 0.24189336751649812, + "grad_norm": 0.5390625, + "learning_rate": 4.5621140487199804e-05, + "loss": 0.2278, + "step": 5095 + }, + { + "epoch": 0.24213075060532688, + "grad_norm": 0.53125, + "learning_rate": 4.561066479885277e-05, + "loss": 0.2292, + "step": 5100 + }, + { + "epoch": 0.24236813369415564, + "grad_norm": 0.48046875, + "learning_rate": 4.560017794898538e-05, + "loss": 0.2265, + "step": 5105 + }, + { + "epoch": 0.2426055167829844, + "grad_norm": 0.455078125, + "learning_rate": 4.5589679944060616e-05, + "loss": 0.2244, + "step": 5110 + }, + { + "epoch": 0.24284289987181312, + "grad_norm": 0.486328125, + "learning_rate": 4.557917079054837e-05, + "loss": 0.231, + "step": 5115 + }, + { + "epoch": 0.24308028296064188, + "grad_norm": 0.490234375, + "learning_rate": 4.5568650494925376e-05, + "loss": 0.2248, + "step": 5120 + }, + { + "epoch": 0.24331766604947064, + "grad_norm": 0.5390625, + "learning_rate": 4.555811906367524e-05, + "loss": 0.2281, + "step": 5125 + }, + { + "epoch": 0.2435550491382994, + "grad_norm": 0.44921875, + "learning_rate": 4.554757650328845e-05, + "loss": 0.2311, + "step": 5130 + }, + { + "epoch": 0.24379243222712815, + "grad_norm": 0.5625, + "learning_rate": 4.553702282026232e-05, + "loss": 0.2271, + "step": 5135 + }, + { + "epoch": 0.24402981531595688, + "grad_norm": 0.5078125, + "learning_rate": 4.552645802110105e-05, + "loss": 0.2273, + "step": 5140 + }, + { + "epoch": 0.24426719840478564, + "grad_norm": 0.45703125, + "learning_rate": 4.551588211231568e-05, + "loss": 0.2277, + "step": 5145 + }, + { + "epoch": 0.2445045814936144, + "grad_norm": 0.470703125, + "learning_rate": 4.5505295100424076e-05, + "loss": 0.2287, + "step": 5150 + }, + { + "epoch": 0.24474196458244316, + "grad_norm": 0.435546875, + "learning_rate": 4.5494696991950987e-05, + "loss": 0.2265, + "step": 5155 + }, + { + "epoch": 0.24497934767127189, + "grad_norm": 0.6484375, + "learning_rate": 4.548408779342797e-05, + "loss": 0.2288, + "step": 5160 + }, + { + "epoch": 0.24521673076010064, + "grad_norm": 0.50390625, + "learning_rate": 4.547346751139342e-05, + "loss": 0.2246, + "step": 5165 + }, + { + "epoch": 0.2454541138489294, + "grad_norm": 0.5, + "learning_rate": 4.546283615239258e-05, + "loss": 0.2268, + "step": 5170 + }, + { + "epoch": 0.24569149693775816, + "grad_norm": 0.6484375, + "learning_rate": 4.545219372297749e-05, + "loss": 0.2308, + "step": 5175 + }, + { + "epoch": 0.24592888002658692, + "grad_norm": 0.5859375, + "learning_rate": 4.544154022970705e-05, + "loss": 0.2282, + "step": 5180 + }, + { + "epoch": 0.24616626311541565, + "grad_norm": 0.53125, + "learning_rate": 4.5430875679146955e-05, + "loss": 0.2288, + "step": 5185 + }, + { + "epoch": 0.2464036462042444, + "grad_norm": 0.51953125, + "learning_rate": 4.542020007786972e-05, + "loss": 0.2258, + "step": 5190 + }, + { + "epoch": 0.24664102929307316, + "grad_norm": 0.640625, + "learning_rate": 4.540951343245465e-05, + "loss": 0.2248, + "step": 5195 + }, + { + "epoch": 0.24687841238190192, + "grad_norm": 0.51953125, + "learning_rate": 4.539881574948791e-05, + "loss": 0.2254, + "step": 5200 + }, + { + "epoch": 0.24711579547073068, + "grad_norm": 0.55078125, + "learning_rate": 4.5388107035562406e-05, + "loss": 0.2229, + "step": 5205 + }, + { + "epoch": 0.2473531785595594, + "grad_norm": 0.57421875, + "learning_rate": 4.5377387297277876e-05, + "loss": 0.2268, + "step": 5210 + }, + { + "epoch": 0.24759056164838816, + "grad_norm": 0.578125, + "learning_rate": 4.536665654124085e-05, + "loss": 0.2315, + "step": 5215 + }, + { + "epoch": 0.24782794473721692, + "grad_norm": 0.546875, + "learning_rate": 4.535591477406466e-05, + "loss": 0.2258, + "step": 5220 + }, + { + "epoch": 0.24806532782604568, + "grad_norm": 0.60546875, + "learning_rate": 4.534516200236937e-05, + "loss": 0.2288, + "step": 5225 + }, + { + "epoch": 0.24830271091487444, + "grad_norm": 0.470703125, + "learning_rate": 4.533439823278189e-05, + "loss": 0.2278, + "step": 5230 + }, + { + "epoch": 0.24854009400370317, + "grad_norm": 0.4765625, + "learning_rate": 4.532362347193589e-05, + "loss": 0.2234, + "step": 5235 + }, + { + "epoch": 0.24877747709253192, + "grad_norm": 0.5390625, + "learning_rate": 4.531283772647178e-05, + "loss": 0.2267, + "step": 5240 + }, + { + "epoch": 0.24901486018136068, + "grad_norm": 0.470703125, + "learning_rate": 4.530204100303678e-05, + "loss": 0.2249, + "step": 5245 + }, + { + "epoch": 0.24925224327018944, + "grad_norm": 0.50390625, + "learning_rate": 4.529123330828487e-05, + "loss": 0.2257, + "step": 5250 + }, + { + "epoch": 0.2494896263590182, + "grad_norm": 0.462890625, + "learning_rate": 4.5280414648876765e-05, + "loss": 0.2289, + "step": 5255 + }, + { + "epoch": 0.24972700944784693, + "grad_norm": 0.5625, + "learning_rate": 4.526958503147997e-05, + "loss": 0.225, + "step": 5260 + }, + { + "epoch": 0.24996439253667568, + "grad_norm": 0.578125, + "learning_rate": 4.525874446276873e-05, + "loss": 0.2296, + "step": 5265 + }, + { + "epoch": 0.25020177562550444, + "grad_norm": 0.54296875, + "learning_rate": 4.524789294942402e-05, + "loss": 0.2267, + "step": 5270 + }, + { + "epoch": 0.2504391587143332, + "grad_norm": 0.435546875, + "learning_rate": 4.52370304981336e-05, + "loss": 0.2239, + "step": 5275 + }, + { + "epoch": 0.25067654180316196, + "grad_norm": 0.46875, + "learning_rate": 4.5226157115591937e-05, + "loss": 0.2232, + "step": 5280 + }, + { + "epoch": 0.2509139248919907, + "grad_norm": 0.427734375, + "learning_rate": 4.521527280850025e-05, + "loss": 0.2255, + "step": 5285 + }, + { + "epoch": 0.25115130798081947, + "grad_norm": 0.51171875, + "learning_rate": 4.5204377583566494e-05, + "loss": 0.228, + "step": 5290 + }, + { + "epoch": 0.2513886910696482, + "grad_norm": 0.51953125, + "learning_rate": 4.5193471447505335e-05, + "loss": 0.2265, + "step": 5295 + }, + { + "epoch": 0.25162607415847693, + "grad_norm": 0.52734375, + "learning_rate": 4.518255440703818e-05, + "loss": 0.2276, + "step": 5300 + }, + { + "epoch": 0.2518634572473057, + "grad_norm": 0.53125, + "learning_rate": 4.517162646889315e-05, + "loss": 0.226, + "step": 5305 + }, + { + "epoch": 0.25210084033613445, + "grad_norm": 0.490234375, + "learning_rate": 4.516068763980509e-05, + "loss": 0.2297, + "step": 5310 + }, + { + "epoch": 0.2523382234249632, + "grad_norm": 0.48828125, + "learning_rate": 4.514973792651555e-05, + "loss": 0.2279, + "step": 5315 + }, + { + "epoch": 0.25257560651379196, + "grad_norm": 0.5078125, + "learning_rate": 4.5138777335772784e-05, + "loss": 0.2283, + "step": 5320 + }, + { + "epoch": 0.2528129896026207, + "grad_norm": 0.53515625, + "learning_rate": 4.512780587433175e-05, + "loss": 0.2253, + "step": 5325 + }, + { + "epoch": 0.2530503726914495, + "grad_norm": 0.447265625, + "learning_rate": 4.5116823548954116e-05, + "loss": 0.2262, + "step": 5330 + }, + { + "epoch": 0.25328775578027823, + "grad_norm": 0.439453125, + "learning_rate": 4.510583036640824e-05, + "loss": 0.2233, + "step": 5335 + }, + { + "epoch": 0.253525138869107, + "grad_norm": 0.494140625, + "learning_rate": 4.5094826333469165e-05, + "loss": 0.2256, + "step": 5340 + }, + { + "epoch": 0.2537625219579357, + "grad_norm": 0.515625, + "learning_rate": 4.5083811456918635e-05, + "loss": 0.2276, + "step": 5345 + }, + { + "epoch": 0.25399990504676445, + "grad_norm": 0.5625, + "learning_rate": 4.5072785743545056e-05, + "loss": 0.2266, + "step": 5350 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 0.4765625, + "learning_rate": 4.5061749200143535e-05, + "loss": 0.2227, + "step": 5355 + }, + { + "epoch": 0.25447467122442197, + "grad_norm": 0.4296875, + "learning_rate": 4.505070183351584e-05, + "loss": 0.2227, + "step": 5360 + }, + { + "epoch": 0.2547120543132507, + "grad_norm": 0.474609375, + "learning_rate": 4.503964365047042e-05, + "loss": 0.2265, + "step": 5365 + }, + { + "epoch": 0.2549494374020795, + "grad_norm": 0.421875, + "learning_rate": 4.502857465782237e-05, + "loss": 0.2265, + "step": 5370 + }, + { + "epoch": 0.25518682049090824, + "grad_norm": 0.5546875, + "learning_rate": 4.5017494862393474e-05, + "loss": 0.2262, + "step": 5375 + }, + { + "epoch": 0.255424203579737, + "grad_norm": 0.478515625, + "learning_rate": 4.500640427101215e-05, + "loss": 0.2241, + "step": 5380 + }, + { + "epoch": 0.25566158666856575, + "grad_norm": 0.55078125, + "learning_rate": 4.499530289051348e-05, + "loss": 0.2267, + "step": 5385 + }, + { + "epoch": 0.25589896975739446, + "grad_norm": 0.51953125, + "learning_rate": 4.4984190727739186e-05, + "loss": 0.2299, + "step": 5390 + }, + { + "epoch": 0.2561363528462232, + "grad_norm": 0.53125, + "learning_rate": 4.497306778953765e-05, + "loss": 0.2324, + "step": 5395 + }, + { + "epoch": 0.25637373593505197, + "grad_norm": 0.44921875, + "learning_rate": 4.4961934082763915e-05, + "loss": 0.2256, + "step": 5400 + }, + { + "epoch": 0.25661111902388073, + "grad_norm": 0.4296875, + "learning_rate": 4.495078961427959e-05, + "loss": 0.2269, + "step": 5405 + }, + { + "epoch": 0.2568485021127095, + "grad_norm": 0.5703125, + "learning_rate": 4.493963439095299e-05, + "loss": 0.229, + "step": 5410 + }, + { + "epoch": 0.25708588520153824, + "grad_norm": 0.51953125, + "learning_rate": 4.492846841965901e-05, + "loss": 0.2248, + "step": 5415 + }, + { + "epoch": 0.257323268290367, + "grad_norm": 0.486328125, + "learning_rate": 4.49172917072792e-05, + "loss": 0.2239, + "step": 5420 + }, + { + "epoch": 0.25756065137919576, + "grad_norm": 0.5390625, + "learning_rate": 4.4906104260701716e-05, + "loss": 0.2291, + "step": 5425 + }, + { + "epoch": 0.2577980344680245, + "grad_norm": 0.484375, + "learning_rate": 4.489490608682132e-05, + "loss": 0.2239, + "step": 5430 + }, + { + "epoch": 0.2580354175568533, + "grad_norm": 0.51171875, + "learning_rate": 4.488369719253941e-05, + "loss": 0.2284, + "step": 5435 + }, + { + "epoch": 0.258272800645682, + "grad_norm": 0.546875, + "learning_rate": 4.4872477584763966e-05, + "loss": 0.2245, + "step": 5440 + }, + { + "epoch": 0.25851018373451073, + "grad_norm": 0.49609375, + "learning_rate": 4.486124727040958e-05, + "loss": 0.2304, + "step": 5445 + }, + { + "epoch": 0.2587475668233395, + "grad_norm": 0.482421875, + "learning_rate": 4.485000625639746e-05, + "loss": 0.2265, + "step": 5450 + }, + { + "epoch": 0.25898494991216825, + "grad_norm": 0.55078125, + "learning_rate": 4.4838754549655376e-05, + "loss": 0.223, + "step": 5455 + }, + { + "epoch": 0.259222333000997, + "grad_norm": 0.53125, + "learning_rate": 4.482749215711771e-05, + "loss": 0.2259, + "step": 5460 + }, + { + "epoch": 0.25945971608982576, + "grad_norm": 0.5390625, + "learning_rate": 4.4816219085725404e-05, + "loss": 0.227, + "step": 5465 + }, + { + "epoch": 0.2596970991786545, + "grad_norm": 0.5234375, + "learning_rate": 4.4804935342426036e-05, + "loss": 0.2301, + "step": 5470 + }, + { + "epoch": 0.2599344822674833, + "grad_norm": 0.50390625, + "learning_rate": 4.4793640934173705e-05, + "loss": 0.2292, + "step": 5475 + }, + { + "epoch": 0.26017186535631204, + "grad_norm": 0.5078125, + "learning_rate": 4.4782335867929096e-05, + "loss": 0.2212, + "step": 5480 + }, + { + "epoch": 0.2604092484451408, + "grad_norm": 0.494140625, + "learning_rate": 4.477102015065949e-05, + "loss": 0.2268, + "step": 5485 + }, + { + "epoch": 0.2606466315339695, + "grad_norm": 0.46875, + "learning_rate": 4.47596937893387e-05, + "loss": 0.2267, + "step": 5490 + }, + { + "epoch": 0.26088401462279825, + "grad_norm": 0.4921875, + "learning_rate": 4.4748356790947114e-05, + "loss": 0.2307, + "step": 5495 + }, + { + "epoch": 0.261121397711627, + "grad_norm": 0.50390625, + "learning_rate": 4.4737009162471666e-05, + "loss": 0.2288, + "step": 5500 + }, + { + "epoch": 0.26135878080045577, + "grad_norm": 0.4765625, + "learning_rate": 4.4725650910905866e-05, + "loss": 0.2252, + "step": 5505 + }, + { + "epoch": 0.2615961638892845, + "grad_norm": 0.466796875, + "learning_rate": 4.471428204324972e-05, + "loss": 0.2289, + "step": 5510 + }, + { + "epoch": 0.2618335469781133, + "grad_norm": 0.51171875, + "learning_rate": 4.4702902566509846e-05, + "loss": 0.2272, + "step": 5515 + }, + { + "epoch": 0.26207093006694204, + "grad_norm": 0.53125, + "learning_rate": 4.469151248769934e-05, + "loss": 0.2294, + "step": 5520 + }, + { + "epoch": 0.2623083131557708, + "grad_norm": 0.51171875, + "learning_rate": 4.4680111813837864e-05, + "loss": 0.2288, + "step": 5525 + }, + { + "epoch": 0.26254569624459956, + "grad_norm": 0.484375, + "learning_rate": 4.4668700551951607e-05, + "loss": 0.2255, + "step": 5530 + }, + { + "epoch": 0.26278307933342826, + "grad_norm": 0.498046875, + "learning_rate": 4.4657278709073266e-05, + "loss": 0.2273, + "step": 5535 + }, + { + "epoch": 0.263020462422257, + "grad_norm": 0.44921875, + "learning_rate": 4.464584629224207e-05, + "loss": 0.2259, + "step": 5540 + }, + { + "epoch": 0.2632578455110858, + "grad_norm": 0.470703125, + "learning_rate": 4.463440330850378e-05, + "loss": 0.2277, + "step": 5545 + }, + { + "epoch": 0.26349522859991453, + "grad_norm": 0.51171875, + "learning_rate": 4.462294976491064e-05, + "loss": 0.2275, + "step": 5550 + }, + { + "epoch": 0.2637326116887433, + "grad_norm": 0.5390625, + "learning_rate": 4.461148566852143e-05, + "loss": 0.2254, + "step": 5555 + }, + { + "epoch": 0.26396999477757205, + "grad_norm": 0.546875, + "learning_rate": 4.460001102640141e-05, + "loss": 0.2233, + "step": 5560 + }, + { + "epoch": 0.2642073778664008, + "grad_norm": 0.484375, + "learning_rate": 4.4588525845622356e-05, + "loss": 0.2276, + "step": 5565 + }, + { + "epoch": 0.26444476095522956, + "grad_norm": 0.60546875, + "learning_rate": 4.4577030133262534e-05, + "loss": 0.2292, + "step": 5570 + }, + { + "epoch": 0.2646821440440583, + "grad_norm": 0.5, + "learning_rate": 4.4565523896406696e-05, + "loss": 0.2266, + "step": 5575 + }, + { + "epoch": 0.2649195271328871, + "grad_norm": 0.5703125, + "learning_rate": 4.455400714214607e-05, + "loss": 0.2275, + "step": 5580 + }, + { + "epoch": 0.2651569102217158, + "grad_norm": 0.5625, + "learning_rate": 4.4542479877578404e-05, + "loss": 0.2295, + "step": 5585 + }, + { + "epoch": 0.26539429331054454, + "grad_norm": 0.44921875, + "learning_rate": 4.453094210980788e-05, + "loss": 0.2287, + "step": 5590 + }, + { + "epoch": 0.2656316763993733, + "grad_norm": 0.52734375, + "learning_rate": 4.451939384594519e-05, + "loss": 0.2249, + "step": 5595 + }, + { + "epoch": 0.26586905948820205, + "grad_norm": 0.52734375, + "learning_rate": 4.450783509310745e-05, + "loss": 0.2263, + "step": 5600 + }, + { + "epoch": 0.2661064425770308, + "grad_norm": 0.45703125, + "learning_rate": 4.44962658584183e-05, + "loss": 0.2255, + "step": 5605 + }, + { + "epoch": 0.26634382566585957, + "grad_norm": 0.48828125, + "learning_rate": 4.448468614900777e-05, + "loss": 0.2258, + "step": 5610 + }, + { + "epoch": 0.2665812087546883, + "grad_norm": 0.4609375, + "learning_rate": 4.4473095972012414e-05, + "loss": 0.2264, + "step": 5615 + }, + { + "epoch": 0.2668185918435171, + "grad_norm": 0.458984375, + "learning_rate": 4.4461495334575196e-05, + "loss": 0.2255, + "step": 5620 + }, + { + "epoch": 0.26705597493234584, + "grad_norm": 0.43359375, + "learning_rate": 4.4449884243845524e-05, + "loss": 0.2286, + "step": 5625 + }, + { + "epoch": 0.2672933580211746, + "grad_norm": 0.57421875, + "learning_rate": 4.443826270697927e-05, + "loss": 0.223, + "step": 5630 + }, + { + "epoch": 0.2675307411100033, + "grad_norm": 0.50390625, + "learning_rate": 4.442663073113873e-05, + "loss": 0.229, + "step": 5635 + }, + { + "epoch": 0.26776812419883206, + "grad_norm": 0.451171875, + "learning_rate": 4.4414988323492645e-05, + "loss": 0.2265, + "step": 5640 + }, + { + "epoch": 0.2680055072876608, + "grad_norm": 0.51171875, + "learning_rate": 4.4403335491216176e-05, + "loss": 0.2261, + "step": 5645 + }, + { + "epoch": 0.2682428903764896, + "grad_norm": 0.48046875, + "learning_rate": 4.43916722414909e-05, + "loss": 0.2282, + "step": 5650 + }, + { + "epoch": 0.26848027346531833, + "grad_norm": 0.6484375, + "learning_rate": 4.437999858150484e-05, + "loss": 0.2318, + "step": 5655 + }, + { + "epoch": 0.2687176565541471, + "grad_norm": 0.47265625, + "learning_rate": 4.436831451845242e-05, + "loss": 0.2305, + "step": 5660 + }, + { + "epoch": 0.26895503964297585, + "grad_norm": 0.5390625, + "learning_rate": 4.435662005953446e-05, + "loss": 0.2288, + "step": 5665 + }, + { + "epoch": 0.2691924227318046, + "grad_norm": 0.4609375, + "learning_rate": 4.434491521195821e-05, + "loss": 0.2265, + "step": 5670 + }, + { + "epoch": 0.26942980582063336, + "grad_norm": 0.5234375, + "learning_rate": 4.433319998293732e-05, + "loss": 0.2244, + "step": 5675 + }, + { + "epoch": 0.26966718890946206, + "grad_norm": 0.443359375, + "learning_rate": 4.432147437969182e-05, + "loss": 0.2235, + "step": 5680 + }, + { + "epoch": 0.2699045719982908, + "grad_norm": 0.546875, + "learning_rate": 4.430973840944815e-05, + "loss": 0.2283, + "step": 5685 + }, + { + "epoch": 0.2701419550871196, + "grad_norm": 0.482421875, + "learning_rate": 4.429799207943914e-05, + "loss": 0.2252, + "step": 5690 + }, + { + "epoch": 0.27037933817594834, + "grad_norm": 0.4453125, + "learning_rate": 4.4286235396904e-05, + "loss": 0.2297, + "step": 5695 + }, + { + "epoch": 0.2706167212647771, + "grad_norm": 0.474609375, + "learning_rate": 4.4274468369088315e-05, + "loss": 0.2279, + "step": 5700 + }, + { + "epoch": 0.27085410435360585, + "grad_norm": 0.482421875, + "learning_rate": 4.4262691003244056e-05, + "loss": 0.2267, + "step": 5705 + }, + { + "epoch": 0.2710914874424346, + "grad_norm": 0.546875, + "learning_rate": 4.4250903306629554e-05, + "loss": 0.2265, + "step": 5710 + }, + { + "epoch": 0.27132887053126337, + "grad_norm": 0.5234375, + "learning_rate": 4.423910528650951e-05, + "loss": 0.2234, + "step": 5715 + }, + { + "epoch": 0.2715662536200921, + "grad_norm": 0.60546875, + "learning_rate": 4.4227296950155006e-05, + "loss": 0.2248, + "step": 5720 + }, + { + "epoch": 0.2718036367089209, + "grad_norm": 0.53515625, + "learning_rate": 4.4215478304843455e-05, + "loss": 0.232, + "step": 5725 + }, + { + "epoch": 0.2720410197977496, + "grad_norm": 0.43359375, + "learning_rate": 4.4203649357858625e-05, + "loss": 0.2236, + "step": 5730 + }, + { + "epoch": 0.27227840288657834, + "grad_norm": 0.478515625, + "learning_rate": 4.4191810116490666e-05, + "loss": 0.2248, + "step": 5735 + }, + { + "epoch": 0.2725157859754071, + "grad_norm": 0.48828125, + "learning_rate": 4.4179960588036026e-05, + "loss": 0.2287, + "step": 5740 + }, + { + "epoch": 0.27275316906423586, + "grad_norm": 0.50390625, + "learning_rate": 4.4168100779797533e-05, + "loss": 0.2257, + "step": 5745 + }, + { + "epoch": 0.2729905521530646, + "grad_norm": 0.51953125, + "learning_rate": 4.415623069908432e-05, + "loss": 0.2304, + "step": 5750 + }, + { + "epoch": 0.27322793524189337, + "grad_norm": 0.56640625, + "learning_rate": 4.414435035321187e-05, + "loss": 0.2256, + "step": 5755 + }, + { + "epoch": 0.27346531833072213, + "grad_norm": 0.5390625, + "learning_rate": 4.4132459749501975e-05, + "loss": 0.2248, + "step": 5760 + }, + { + "epoch": 0.2737027014195509, + "grad_norm": 0.498046875, + "learning_rate": 4.412055889528278e-05, + "loss": 0.2286, + "step": 5765 + }, + { + "epoch": 0.27394008450837964, + "grad_norm": 0.478515625, + "learning_rate": 4.410864779788872e-05, + "loss": 0.223, + "step": 5770 + }, + { + "epoch": 0.2741774675972084, + "grad_norm": 0.54296875, + "learning_rate": 4.409672646466054e-05, + "loss": 0.2262, + "step": 5775 + }, + { + "epoch": 0.2744148506860371, + "grad_norm": 0.46484375, + "learning_rate": 4.4084794902945306e-05, + "loss": 0.2306, + "step": 5780 + }, + { + "epoch": 0.27465223377486586, + "grad_norm": 0.470703125, + "learning_rate": 4.4072853120096405e-05, + "loss": 0.2273, + "step": 5785 + }, + { + "epoch": 0.2748896168636946, + "grad_norm": 0.458984375, + "learning_rate": 4.4060901123473476e-05, + "loss": 0.2279, + "step": 5790 + }, + { + "epoch": 0.2751269999525234, + "grad_norm": 0.51953125, + "learning_rate": 4.4048938920442496e-05, + "loss": 0.2237, + "step": 5795 + }, + { + "epoch": 0.27536438304135213, + "grad_norm": 0.49609375, + "learning_rate": 4.4036966518375706e-05, + "loss": 0.2261, + "step": 5800 + }, + { + "epoch": 0.2756017661301809, + "grad_norm": 0.5078125, + "learning_rate": 4.402498392465165e-05, + "loss": 0.2298, + "step": 5805 + }, + { + "epoch": 0.27583914921900965, + "grad_norm": 0.55078125, + "learning_rate": 4.401299114665516e-05, + "loss": 0.2268, + "step": 5810 + }, + { + "epoch": 0.2760765323078384, + "grad_norm": 0.43359375, + "learning_rate": 4.40009881917773e-05, + "loss": 0.229, + "step": 5815 + }, + { + "epoch": 0.27631391539666716, + "grad_norm": 0.4765625, + "learning_rate": 4.398897506741547e-05, + "loss": 0.2236, + "step": 5820 + }, + { + "epoch": 0.27655129848549587, + "grad_norm": 0.62890625, + "learning_rate": 4.397695178097327e-05, + "loss": 0.2302, + "step": 5825 + }, + { + "epoch": 0.2767886815743246, + "grad_norm": 0.408203125, + "learning_rate": 4.396491833986062e-05, + "loss": 0.2287, + "step": 5830 + }, + { + "epoch": 0.2770260646631534, + "grad_norm": 0.515625, + "learning_rate": 4.395287475149367e-05, + "loss": 0.2259, + "step": 5835 + }, + { + "epoch": 0.27726344775198214, + "grad_norm": 0.45703125, + "learning_rate": 4.394082102329482e-05, + "loss": 0.2267, + "step": 5840 + }, + { + "epoch": 0.2775008308408109, + "grad_norm": 0.48046875, + "learning_rate": 4.392875716269274e-05, + "loss": 0.2297, + "step": 5845 + }, + { + "epoch": 0.27773821392963965, + "grad_norm": 0.423828125, + "learning_rate": 4.391668317712233e-05, + "loss": 0.227, + "step": 5850 + }, + { + "epoch": 0.2779755970184684, + "grad_norm": 0.458984375, + "learning_rate": 4.390459907402472e-05, + "loss": 0.2216, + "step": 5855 + }, + { + "epoch": 0.27821298010729717, + "grad_norm": 0.451171875, + "learning_rate": 4.389250486084731e-05, + "loss": 0.227, + "step": 5860 + }, + { + "epoch": 0.2784503631961259, + "grad_norm": 0.4921875, + "learning_rate": 4.3880400545043686e-05, + "loss": 0.2252, + "step": 5865 + }, + { + "epoch": 0.2786877462849547, + "grad_norm": 0.447265625, + "learning_rate": 4.3868286134073704e-05, + "loss": 0.226, + "step": 5870 + }, + { + "epoch": 0.2789251293737834, + "grad_norm": 0.50390625, + "learning_rate": 4.38561616354034e-05, + "loss": 0.2236, + "step": 5875 + }, + { + "epoch": 0.27916251246261214, + "grad_norm": 0.515625, + "learning_rate": 4.384402705650506e-05, + "loss": 0.2246, + "step": 5880 + }, + { + "epoch": 0.2793998955514409, + "grad_norm": 0.470703125, + "learning_rate": 4.3831882404857165e-05, + "loss": 0.2254, + "step": 5885 + }, + { + "epoch": 0.27963727864026966, + "grad_norm": 0.5859375, + "learning_rate": 4.3819727687944416e-05, + "loss": 0.2248, + "step": 5890 + }, + { + "epoch": 0.2798746617290984, + "grad_norm": 0.51953125, + "learning_rate": 4.3807562913257696e-05, + "loss": 0.2273, + "step": 5895 + }, + { + "epoch": 0.2801120448179272, + "grad_norm": 0.451171875, + "learning_rate": 4.3795388088294116e-05, + "loss": 0.2283, + "step": 5900 + }, + { + "epoch": 0.28034942790675593, + "grad_norm": 0.4609375, + "learning_rate": 4.378320322055695e-05, + "loss": 0.225, + "step": 5905 + }, + { + "epoch": 0.2805868109955847, + "grad_norm": 0.53125, + "learning_rate": 4.377100831755569e-05, + "loss": 0.2281, + "step": 5910 + }, + { + "epoch": 0.28082419408441345, + "grad_norm": 0.4609375, + "learning_rate": 4.3758803386805984e-05, + "loss": 0.2252, + "step": 5915 + }, + { + "epoch": 0.2810615771732422, + "grad_norm": 0.58984375, + "learning_rate": 4.37465884358297e-05, + "loss": 0.2265, + "step": 5920 + }, + { + "epoch": 0.2812989602620709, + "grad_norm": 0.53125, + "learning_rate": 4.373436347215483e-05, + "loss": 0.2289, + "step": 5925 + }, + { + "epoch": 0.28153634335089966, + "grad_norm": 0.62890625, + "learning_rate": 4.372212850331558e-05, + "loss": 0.2295, + "step": 5930 + }, + { + "epoch": 0.2817737264397284, + "grad_norm": 0.48828125, + "learning_rate": 4.37098835368523e-05, + "loss": 0.2282, + "step": 5935 + }, + { + "epoch": 0.2820111095285572, + "grad_norm": 0.482421875, + "learning_rate": 4.3697628580311505e-05, + "loss": 0.2275, + "step": 5940 + }, + { + "epoch": 0.28224849261738594, + "grad_norm": 0.46484375, + "learning_rate": 4.368536364124587e-05, + "loss": 0.2247, + "step": 5945 + }, + { + "epoch": 0.2824858757062147, + "grad_norm": 0.453125, + "learning_rate": 4.367308872721423e-05, + "loss": 0.2271, + "step": 5950 + }, + { + "epoch": 0.28272325879504345, + "grad_norm": 0.6171875, + "learning_rate": 4.366080384578154e-05, + "loss": 0.2313, + "step": 5955 + }, + { + "epoch": 0.2829606418838722, + "grad_norm": 0.46484375, + "learning_rate": 4.3648509004518946e-05, + "loss": 0.224, + "step": 5960 + }, + { + "epoch": 0.28319802497270097, + "grad_norm": 0.482421875, + "learning_rate": 4.363620421100367e-05, + "loss": 0.2288, + "step": 5965 + }, + { + "epoch": 0.28343540806152967, + "grad_norm": 0.435546875, + "learning_rate": 4.362388947281912e-05, + "loss": 0.2289, + "step": 5970 + }, + { + "epoch": 0.2836727911503584, + "grad_norm": 0.443359375, + "learning_rate": 4.361156479755481e-05, + "loss": 0.2263, + "step": 5975 + }, + { + "epoch": 0.2839101742391872, + "grad_norm": 0.462890625, + "learning_rate": 4.359923019280638e-05, + "loss": 0.232, + "step": 5980 + }, + { + "epoch": 0.28414755732801594, + "grad_norm": 0.46484375, + "learning_rate": 4.3586885666175584e-05, + "loss": 0.2226, + "step": 5985 + }, + { + "epoch": 0.2843849404168447, + "grad_norm": 0.4921875, + "learning_rate": 4.3574531225270316e-05, + "loss": 0.2255, + "step": 5990 + }, + { + "epoch": 0.28462232350567346, + "grad_norm": 0.51953125, + "learning_rate": 4.3562166877704533e-05, + "loss": 0.2296, + "step": 5995 + }, + { + "epoch": 0.2848597065945022, + "grad_norm": 0.5234375, + "learning_rate": 4.3549792631098356e-05, + "loss": 0.2322, + "step": 6000 + }, + { + "epoch": 0.285097089683331, + "grad_norm": 0.546875, + "learning_rate": 4.3537408493077966e-05, + "loss": 0.2257, + "step": 6005 + }, + { + "epoch": 0.28533447277215973, + "grad_norm": 0.64453125, + "learning_rate": 4.3525014471275646e-05, + "loss": 0.2257, + "step": 6010 + }, + { + "epoch": 0.2855718558609885, + "grad_norm": 0.462890625, + "learning_rate": 4.351261057332977e-05, + "loss": 0.2279, + "step": 6015 + }, + { + "epoch": 0.2858092389498172, + "grad_norm": 0.51953125, + "learning_rate": 4.350019680688482e-05, + "loss": 0.2278, + "step": 6020 + }, + { + "epoch": 0.28604662203864595, + "grad_norm": 0.50390625, + "learning_rate": 4.348777317959133e-05, + "loss": 0.2274, + "step": 6025 + }, + { + "epoch": 0.2862840051274747, + "grad_norm": 0.515625, + "learning_rate": 4.347533969910594e-05, + "loss": 0.2236, + "step": 6030 + }, + { + "epoch": 0.28652138821630346, + "grad_norm": 0.478515625, + "learning_rate": 4.346289637309133e-05, + "loss": 0.2303, + "step": 6035 + }, + { + "epoch": 0.2867587713051322, + "grad_norm": 0.578125, + "learning_rate": 4.345044320921628e-05, + "loss": 0.2259, + "step": 6040 + }, + { + "epoch": 0.286996154393961, + "grad_norm": 0.5078125, + "learning_rate": 4.34379802151556e-05, + "loss": 0.2246, + "step": 6045 + }, + { + "epoch": 0.28723353748278974, + "grad_norm": 0.546875, + "learning_rate": 4.342550739859021e-05, + "loss": 0.2253, + "step": 6050 + }, + { + "epoch": 0.2874709205716185, + "grad_norm": 0.50390625, + "learning_rate": 4.341302476720701e-05, + "loss": 0.2289, + "step": 6055 + }, + { + "epoch": 0.28770830366044725, + "grad_norm": 0.53125, + "learning_rate": 4.3400532328699016e-05, + "loss": 0.2269, + "step": 6060 + }, + { + "epoch": 0.287945686749276, + "grad_norm": 0.5, + "learning_rate": 4.338803009076525e-05, + "loss": 0.2227, + "step": 6065 + }, + { + "epoch": 0.2881830698381047, + "grad_norm": 0.498046875, + "learning_rate": 4.3375518061110785e-05, + "loss": 0.2205, + "step": 6070 + }, + { + "epoch": 0.28842045292693347, + "grad_norm": 0.5546875, + "learning_rate": 4.336299624744674e-05, + "loss": 0.2258, + "step": 6075 + }, + { + "epoch": 0.2886578360157622, + "grad_norm": 0.4375, + "learning_rate": 4.335046465749022e-05, + "loss": 0.2271, + "step": 6080 + }, + { + "epoch": 0.288895219104591, + "grad_norm": 0.486328125, + "learning_rate": 4.3337923298964426e-05, + "loss": 0.2268, + "step": 6085 + }, + { + "epoch": 0.28913260219341974, + "grad_norm": 0.59375, + "learning_rate": 4.3325372179598524e-05, + "loss": 0.2276, + "step": 6090 + }, + { + "epoch": 0.2893699852822485, + "grad_norm": 0.439453125, + "learning_rate": 4.33128113071277e-05, + "loss": 0.2235, + "step": 6095 + }, + { + "epoch": 0.28960736837107726, + "grad_norm": 0.54296875, + "learning_rate": 4.3300240689293185e-05, + "loss": 0.2288, + "step": 6100 + }, + { + "epoch": 0.289844751459906, + "grad_norm": 0.45703125, + "learning_rate": 4.328766033384217e-05, + "loss": 0.2284, + "step": 6105 + }, + { + "epoch": 0.29008213454873477, + "grad_norm": 0.40625, + "learning_rate": 4.3275070248527896e-05, + "loss": 0.228, + "step": 6110 + }, + { + "epoch": 0.2903195176375635, + "grad_norm": 0.490234375, + "learning_rate": 4.326247044110956e-05, + "loss": 0.2318, + "step": 6115 + }, + { + "epoch": 0.29055690072639223, + "grad_norm": 0.4140625, + "learning_rate": 4.324986091935236e-05, + "loss": 0.2255, + "step": 6120 + }, + { + "epoch": 0.290794283815221, + "grad_norm": 0.470703125, + "learning_rate": 4.3237241691027495e-05, + "loss": 0.2262, + "step": 6125 + }, + { + "epoch": 0.29103166690404975, + "grad_norm": 0.482421875, + "learning_rate": 4.3224612763912146e-05, + "loss": 0.2256, + "step": 6130 + }, + { + "epoch": 0.2912690499928785, + "grad_norm": 0.6015625, + "learning_rate": 4.321197414578945e-05, + "loss": 0.2253, + "step": 6135 + }, + { + "epoch": 0.29150643308170726, + "grad_norm": 0.462890625, + "learning_rate": 4.3199325844448534e-05, + "loss": 0.2282, + "step": 6140 + }, + { + "epoch": 0.291743816170536, + "grad_norm": 0.443359375, + "learning_rate": 4.318666786768449e-05, + "loss": 0.2309, + "step": 6145 + }, + { + "epoch": 0.2919811992593648, + "grad_norm": 0.4921875, + "learning_rate": 4.317400022329838e-05, + "loss": 0.2287, + "step": 6150 + }, + { + "epoch": 0.29221858234819353, + "grad_norm": 0.5546875, + "learning_rate": 4.31613229190972e-05, + "loss": 0.2257, + "step": 6155 + }, + { + "epoch": 0.2924559654370223, + "grad_norm": 0.4453125, + "learning_rate": 4.314863596289393e-05, + "loss": 0.2278, + "step": 6160 + }, + { + "epoch": 0.292693348525851, + "grad_norm": 0.482421875, + "learning_rate": 4.3135939362507463e-05, + "loss": 0.2247, + "step": 6165 + }, + { + "epoch": 0.29293073161467975, + "grad_norm": 0.6171875, + "learning_rate": 4.3123233125762666e-05, + "loss": 0.2226, + "step": 6170 + }, + { + "epoch": 0.2931681147035085, + "grad_norm": 0.58984375, + "learning_rate": 4.3110517260490346e-05, + "loss": 0.2261, + "step": 6175 + }, + { + "epoch": 0.29340549779233727, + "grad_norm": 0.55859375, + "learning_rate": 4.3097791774527226e-05, + "loss": 0.2262, + "step": 6180 + }, + { + "epoch": 0.293642880881166, + "grad_norm": 0.56640625, + "learning_rate": 4.3085056675715965e-05, + "loss": 0.2285, + "step": 6185 + }, + { + "epoch": 0.2938802639699948, + "grad_norm": 0.66796875, + "learning_rate": 4.307231197190514e-05, + "loss": 0.2244, + "step": 6190 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.58203125, + "learning_rate": 4.3059557670949264e-05, + "loss": 0.2244, + "step": 6195 + }, + { + "epoch": 0.2943550301476523, + "grad_norm": 0.609375, + "learning_rate": 4.3046793780708746e-05, + "loss": 0.2301, + "step": 6200 + }, + { + "epoch": 0.29459241323648105, + "grad_norm": 0.5625, + "learning_rate": 4.303402030904991e-05, + "loss": 0.2285, + "step": 6205 + }, + { + "epoch": 0.2948297963253098, + "grad_norm": 0.5390625, + "learning_rate": 4.3021237263845005e-05, + "loss": 0.2218, + "step": 6210 + }, + { + "epoch": 0.2950671794141385, + "grad_norm": 0.546875, + "learning_rate": 4.300844465297215e-05, + "loss": 0.2232, + "step": 6215 + }, + { + "epoch": 0.29530456250296727, + "grad_norm": 0.6328125, + "learning_rate": 4.299564248431539e-05, + "loss": 0.2269, + "step": 6220 + }, + { + "epoch": 0.29554194559179603, + "grad_norm": 0.6328125, + "learning_rate": 4.298283076576461e-05, + "loss": 0.2269, + "step": 6225 + }, + { + "epoch": 0.2957793286806248, + "grad_norm": 0.62890625, + "learning_rate": 4.297000950521565e-05, + "loss": 0.2296, + "step": 6230 + }, + { + "epoch": 0.29601671176945354, + "grad_norm": 0.59765625, + "learning_rate": 4.295717871057017e-05, + "loss": 0.2268, + "step": 6235 + }, + { + "epoch": 0.2962540948582823, + "grad_norm": 0.5234375, + "learning_rate": 4.294433838973574e-05, + "loss": 0.2261, + "step": 6240 + }, + { + "epoch": 0.29649147794711106, + "grad_norm": 0.486328125, + "learning_rate": 4.2931488550625796e-05, + "loss": 0.2284, + "step": 6245 + }, + { + "epoch": 0.2967288610359398, + "grad_norm": 0.5234375, + "learning_rate": 4.2918629201159624e-05, + "loss": 0.2314, + "step": 6250 + }, + { + "epoch": 0.2969662441247686, + "grad_norm": 0.51953125, + "learning_rate": 4.2905760349262397e-05, + "loss": 0.2293, + "step": 6255 + }, + { + "epoch": 0.2972036272135973, + "grad_norm": 0.431640625, + "learning_rate": 4.2892882002865114e-05, + "loss": 0.2248, + "step": 6260 + }, + { + "epoch": 0.29744101030242603, + "grad_norm": 0.45703125, + "learning_rate": 4.287999416990465e-05, + "loss": 0.2276, + "step": 6265 + }, + { + "epoch": 0.2976783933912548, + "grad_norm": 0.494140625, + "learning_rate": 4.286709685832372e-05, + "loss": 0.2288, + "step": 6270 + }, + { + "epoch": 0.29791577648008355, + "grad_norm": 0.40234375, + "learning_rate": 4.285419007607087e-05, + "loss": 0.2269, + "step": 6275 + }, + { + "epoch": 0.2981531595689123, + "grad_norm": 0.421875, + "learning_rate": 4.28412738311005e-05, + "loss": 0.2282, + "step": 6280 + }, + { + "epoch": 0.29839054265774106, + "grad_norm": 0.404296875, + "learning_rate": 4.2828348131372834e-05, + "loss": 0.226, + "step": 6285 + }, + { + "epoch": 0.2986279257465698, + "grad_norm": 0.4921875, + "learning_rate": 4.281541298485392e-05, + "loss": 0.2263, + "step": 6290 + }, + { + "epoch": 0.2988653088353986, + "grad_norm": 0.48828125, + "learning_rate": 4.2802468399515613e-05, + "loss": 0.2265, + "step": 6295 + }, + { + "epoch": 0.29910269192422734, + "grad_norm": 0.4609375, + "learning_rate": 4.278951438333563e-05, + "loss": 0.2281, + "step": 6300 + }, + { + "epoch": 0.2993400750130561, + "grad_norm": 0.46484375, + "learning_rate": 4.277655094429745e-05, + "loss": 0.222, + "step": 6305 + }, + { + "epoch": 0.2995774581018848, + "grad_norm": 0.3984375, + "learning_rate": 4.2763578090390394e-05, + "loss": 0.2257, + "step": 6310 + }, + { + "epoch": 0.29981484119071355, + "grad_norm": 0.490234375, + "learning_rate": 4.275059582960957e-05, + "loss": 0.2246, + "step": 6315 + }, + { + "epoch": 0.3000522242795423, + "grad_norm": 0.46484375, + "learning_rate": 4.273760416995589e-05, + "loss": 0.2256, + "step": 6320 + }, + { + "epoch": 0.30028960736837107, + "grad_norm": 0.51953125, + "learning_rate": 4.272460311943604e-05, + "loss": 0.2249, + "step": 6325 + }, + { + "epoch": 0.3005269904571998, + "grad_norm": 0.44921875, + "learning_rate": 4.271159268606252e-05, + "loss": 0.2251, + "step": 6330 + }, + { + "epoch": 0.3007643735460286, + "grad_norm": 0.51171875, + "learning_rate": 4.2698572877853605e-05, + "loss": 0.2266, + "step": 6335 + }, + { + "epoch": 0.30100175663485734, + "grad_norm": 0.52734375, + "learning_rate": 4.2685543702833334e-05, + "loss": 0.2291, + "step": 6340 + }, + { + "epoch": 0.3012391397236861, + "grad_norm": 0.484375, + "learning_rate": 4.267250516903154e-05, + "loss": 0.2269, + "step": 6345 + }, + { + "epoch": 0.30147652281251486, + "grad_norm": 0.41796875, + "learning_rate": 4.26594572844838e-05, + "loss": 0.2271, + "step": 6350 + }, + { + "epoch": 0.3017139059013436, + "grad_norm": 0.5390625, + "learning_rate": 4.264640005723147e-05, + "loss": 0.2289, + "step": 6355 + }, + { + "epoch": 0.3019512889901723, + "grad_norm": 0.48046875, + "learning_rate": 4.263333349532166e-05, + "loss": 0.2293, + "step": 6360 + }, + { + "epoch": 0.3021886720790011, + "grad_norm": 0.5625, + "learning_rate": 4.262025760680723e-05, + "loss": 0.2295, + "step": 6365 + }, + { + "epoch": 0.30242605516782983, + "grad_norm": 0.4453125, + "learning_rate": 4.2607172399746796e-05, + "loss": 0.2273, + "step": 6370 + }, + { + "epoch": 0.3026634382566586, + "grad_norm": 0.484375, + "learning_rate": 4.2594077882204706e-05, + "loss": 0.2255, + "step": 6375 + }, + { + "epoch": 0.30290082134548735, + "grad_norm": 0.55078125, + "learning_rate": 4.258097406225106e-05, + "loss": 0.2235, + "step": 6380 + }, + { + "epoch": 0.3031382044343161, + "grad_norm": 0.5625, + "learning_rate": 4.256786094796168e-05, + "loss": 0.2246, + "step": 6385 + }, + { + "epoch": 0.30337558752314486, + "grad_norm": 0.412109375, + "learning_rate": 4.2554738547418114e-05, + "loss": 0.2289, + "step": 6390 + }, + { + "epoch": 0.3036129706119736, + "grad_norm": 0.4765625, + "learning_rate": 4.254160686870764e-05, + "loss": 0.2268, + "step": 6395 + }, + { + "epoch": 0.3038503537008024, + "grad_norm": 0.427734375, + "learning_rate": 4.2528465919923245e-05, + "loss": 0.2244, + "step": 6400 + }, + { + "epoch": 0.3040877367896311, + "grad_norm": 0.54296875, + "learning_rate": 4.251531570916365e-05, + "loss": 0.2243, + "step": 6405 + }, + { + "epoch": 0.30432511987845984, + "grad_norm": 0.43359375, + "learning_rate": 4.250215624453326e-05, + "loss": 0.2198, + "step": 6410 + }, + { + "epoch": 0.3045625029672886, + "grad_norm": 0.42578125, + "learning_rate": 4.248898753414219e-05, + "loss": 0.2218, + "step": 6415 + }, + { + "epoch": 0.30479988605611735, + "grad_norm": 0.42578125, + "learning_rate": 4.247580958610628e-05, + "loss": 0.2213, + "step": 6420 + }, + { + "epoch": 0.3050372691449461, + "grad_norm": 0.384765625, + "learning_rate": 4.2462622408547005e-05, + "loss": 0.2255, + "step": 6425 + }, + { + "epoch": 0.30527465223377487, + "grad_norm": 0.443359375, + "learning_rate": 4.244942600959159e-05, + "loss": 0.2277, + "step": 6430 + }, + { + "epoch": 0.3055120353226036, + "grad_norm": 0.478515625, + "learning_rate": 4.24362203973729e-05, + "loss": 0.2259, + "step": 6435 + }, + { + "epoch": 0.3057494184114324, + "grad_norm": 0.439453125, + "learning_rate": 4.2423005580029504e-05, + "loss": 0.2291, + "step": 6440 + }, + { + "epoch": 0.30598680150026114, + "grad_norm": 0.4453125, + "learning_rate": 4.2409781565705614e-05, + "loss": 0.2221, + "step": 6445 + }, + { + "epoch": 0.3062241845890899, + "grad_norm": 0.439453125, + "learning_rate": 4.239654836255116e-05, + "loss": 0.2252, + "step": 6450 + }, + { + "epoch": 0.3064615676779186, + "grad_norm": 0.53125, + "learning_rate": 4.238330597872168e-05, + "loss": 0.2253, + "step": 6455 + }, + { + "epoch": 0.30669895076674736, + "grad_norm": 0.51171875, + "learning_rate": 4.237005442237841e-05, + "loss": 0.2264, + "step": 6460 + }, + { + "epoch": 0.3069363338555761, + "grad_norm": 0.46875, + "learning_rate": 4.235679370168822e-05, + "loss": 0.2258, + "step": 6465 + }, + { + "epoch": 0.3071737169444049, + "grad_norm": 0.46875, + "learning_rate": 4.234352382482363e-05, + "loss": 0.2219, + "step": 6470 + }, + { + "epoch": 0.30741110003323363, + "grad_norm": 0.41796875, + "learning_rate": 4.2330244799962795e-05, + "loss": 0.2253, + "step": 6475 + }, + { + "epoch": 0.3076484831220624, + "grad_norm": 0.4140625, + "learning_rate": 4.231695663528953e-05, + "loss": 0.2269, + "step": 6480 + }, + { + "epoch": 0.30788586621089115, + "grad_norm": 0.4921875, + "learning_rate": 4.230365933899328e-05, + "loss": 0.2263, + "step": 6485 + }, + { + "epoch": 0.3081232492997199, + "grad_norm": 0.53125, + "learning_rate": 4.229035291926909e-05, + "loss": 0.2268, + "step": 6490 + }, + { + "epoch": 0.30836063238854866, + "grad_norm": 0.470703125, + "learning_rate": 4.227703738431765e-05, + "loss": 0.2305, + "step": 6495 + }, + { + "epoch": 0.3085980154773774, + "grad_norm": 0.515625, + "learning_rate": 4.226371274234527e-05, + "loss": 0.2231, + "step": 6500 + }, + { + "epoch": 0.3088353985662061, + "grad_norm": 0.48828125, + "learning_rate": 4.2250379001563854e-05, + "loss": 0.2249, + "step": 6505 + }, + { + "epoch": 0.3090727816550349, + "grad_norm": 0.53125, + "learning_rate": 4.223703617019094e-05, + "loss": 0.2238, + "step": 6510 + }, + { + "epoch": 0.30931016474386364, + "grad_norm": 0.462890625, + "learning_rate": 4.222368425644965e-05, + "loss": 0.2258, + "step": 6515 + }, + { + "epoch": 0.3095475478326924, + "grad_norm": 0.423828125, + "learning_rate": 4.2210323268568704e-05, + "loss": 0.2239, + "step": 6520 + }, + { + "epoch": 0.30978493092152115, + "grad_norm": 0.455078125, + "learning_rate": 4.219695321478242e-05, + "loss": 0.2261, + "step": 6525 + }, + { + "epoch": 0.3100223140103499, + "grad_norm": 0.423828125, + "learning_rate": 4.21835741033307e-05, + "loss": 0.2258, + "step": 6530 + }, + { + "epoch": 0.31025969709917867, + "grad_norm": 0.498046875, + "learning_rate": 4.217018594245903e-05, + "loss": 0.2249, + "step": 6535 + }, + { + "epoch": 0.3104970801880074, + "grad_norm": 0.470703125, + "learning_rate": 4.215678874041848e-05, + "loss": 0.2287, + "step": 6540 + }, + { + "epoch": 0.3107344632768362, + "grad_norm": 0.58984375, + "learning_rate": 4.2143382505465676e-05, + "loss": 0.2256, + "step": 6545 + }, + { + "epoch": 0.3109718463656649, + "grad_norm": 0.72265625, + "learning_rate": 4.212996724586282e-05, + "loss": 0.2269, + "step": 6550 + }, + { + "epoch": 0.31120922945449364, + "grad_norm": 0.455078125, + "learning_rate": 4.211654296987768e-05, + "loss": 0.2288, + "step": 6555 + }, + { + "epoch": 0.3114466125433224, + "grad_norm": 0.490234375, + "learning_rate": 4.210310968578357e-05, + "loss": 0.224, + "step": 6560 + }, + { + "epoch": 0.31168399563215116, + "grad_norm": 0.43359375, + "learning_rate": 4.2089667401859374e-05, + "loss": 0.2304, + "step": 6565 + }, + { + "epoch": 0.3119213787209799, + "grad_norm": 0.47265625, + "learning_rate": 4.207621612638949e-05, + "loss": 0.2277, + "step": 6570 + }, + { + "epoch": 0.31215876180980867, + "grad_norm": 0.50390625, + "learning_rate": 4.2062755867663895e-05, + "loss": 0.2256, + "step": 6575 + }, + { + "epoch": 0.31239614489863743, + "grad_norm": 0.453125, + "learning_rate": 4.2049286633978085e-05, + "loss": 0.2256, + "step": 6580 + }, + { + "epoch": 0.3126335279874662, + "grad_norm": 0.4765625, + "learning_rate": 4.203580843363308e-05, + "loss": 0.2271, + "step": 6585 + }, + { + "epoch": 0.31287091107629494, + "grad_norm": 0.46875, + "learning_rate": 4.202232127493543e-05, + "loss": 0.2258, + "step": 6590 + }, + { + "epoch": 0.3131082941651237, + "grad_norm": 0.4375, + "learning_rate": 4.200882516619723e-05, + "loss": 0.2238, + "step": 6595 + }, + { + "epoch": 0.3133456772539524, + "grad_norm": 0.52734375, + "learning_rate": 4.199532011573605e-05, + "loss": 0.2255, + "step": 6600 + }, + { + "epoch": 0.31358306034278116, + "grad_norm": 0.56640625, + "learning_rate": 4.198180613187499e-05, + "loss": 0.2265, + "step": 6605 + }, + { + "epoch": 0.3138204434316099, + "grad_norm": 0.5546875, + "learning_rate": 4.1968283222942666e-05, + "loss": 0.2266, + "step": 6610 + }, + { + "epoch": 0.3140578265204387, + "grad_norm": 0.478515625, + "learning_rate": 4.195475139727318e-05, + "loss": 0.2265, + "step": 6615 + }, + { + "epoch": 0.31429520960926743, + "grad_norm": 0.482421875, + "learning_rate": 4.1941210663206155e-05, + "loss": 0.2232, + "step": 6620 + }, + { + "epoch": 0.3145325926980962, + "grad_norm": 0.423828125, + "learning_rate": 4.192766102908665e-05, + "loss": 0.2275, + "step": 6625 + }, + { + "epoch": 0.31476997578692495, + "grad_norm": 0.44140625, + "learning_rate": 4.1914102503265274e-05, + "loss": 0.2259, + "step": 6630 + }, + { + "epoch": 0.3150073588757537, + "grad_norm": 0.5, + "learning_rate": 4.190053509409806e-05, + "loss": 0.2309, + "step": 6635 + }, + { + "epoch": 0.31524474196458246, + "grad_norm": 0.455078125, + "learning_rate": 4.1886958809946564e-05, + "loss": 0.2235, + "step": 6640 + }, + { + "epoch": 0.3154821250534112, + "grad_norm": 0.435546875, + "learning_rate": 4.187337365917776e-05, + "loss": 0.2242, + "step": 6645 + }, + { + "epoch": 0.3157195081422399, + "grad_norm": 0.50390625, + "learning_rate": 4.185977965016413e-05, + "loss": 0.2264, + "step": 6650 + }, + { + "epoch": 0.3159568912310687, + "grad_norm": 0.5078125, + "learning_rate": 4.184617679128362e-05, + "loss": 0.2269, + "step": 6655 + }, + { + "epoch": 0.31619427431989744, + "grad_norm": 0.5, + "learning_rate": 4.1832565090919576e-05, + "loss": 0.2277, + "step": 6660 + }, + { + "epoch": 0.3164316574087262, + "grad_norm": 0.5, + "learning_rate": 4.181894455746083e-05, + "loss": 0.2253, + "step": 6665 + }, + { + "epoch": 0.31666904049755495, + "grad_norm": 0.56640625, + "learning_rate": 4.180531519930169e-05, + "loss": 0.226, + "step": 6670 + }, + { + "epoch": 0.3169064235863837, + "grad_norm": 0.474609375, + "learning_rate": 4.179167702484183e-05, + "loss": 0.2259, + "step": 6675 + }, + { + "epoch": 0.31714380667521247, + "grad_norm": 0.5, + "learning_rate": 4.177803004248641e-05, + "loss": 0.2257, + "step": 6680 + }, + { + "epoch": 0.3173811897640412, + "grad_norm": 0.5078125, + "learning_rate": 4.1764374260646e-05, + "loss": 0.2229, + "step": 6685 + }, + { + "epoch": 0.31761857285287, + "grad_norm": 0.3828125, + "learning_rate": 4.17507096877366e-05, + "loss": 0.2241, + "step": 6690 + }, + { + "epoch": 0.3178559559416987, + "grad_norm": 0.462890625, + "learning_rate": 4.173703633217962e-05, + "loss": 0.226, + "step": 6695 + }, + { + "epoch": 0.31809333903052744, + "grad_norm": 0.482421875, + "learning_rate": 4.172335420240189e-05, + "loss": 0.2259, + "step": 6700 + }, + { + "epoch": 0.3183307221193562, + "grad_norm": 0.49609375, + "learning_rate": 4.170966330683564e-05, + "loss": 0.2265, + "step": 6705 + }, + { + "epoch": 0.31856810520818496, + "grad_norm": 0.470703125, + "learning_rate": 4.1695963653918516e-05, + "loss": 0.2235, + "step": 6710 + }, + { + "epoch": 0.3188054882970137, + "grad_norm": 0.458984375, + "learning_rate": 4.168225525209355e-05, + "loss": 0.2239, + "step": 6715 + }, + { + "epoch": 0.3190428713858425, + "grad_norm": 0.53125, + "learning_rate": 4.1668538109809155e-05, + "loss": 0.2244, + "step": 6720 + }, + { + "epoch": 0.31928025447467123, + "grad_norm": 0.62109375, + "learning_rate": 4.165481223551915e-05, + "loss": 0.2261, + "step": 6725 + }, + { + "epoch": 0.3195176375635, + "grad_norm": 0.671875, + "learning_rate": 4.164107763768274e-05, + "loss": 0.2313, + "step": 6730 + }, + { + "epoch": 0.31975502065232875, + "grad_norm": 0.515625, + "learning_rate": 4.162733432476448e-05, + "loss": 0.2277, + "step": 6735 + }, + { + "epoch": 0.3199924037411575, + "grad_norm": 0.515625, + "learning_rate": 4.1613582305234314e-05, + "loss": 0.2278, + "step": 6740 + }, + { + "epoch": 0.3202297868299862, + "grad_norm": 0.416015625, + "learning_rate": 4.159982158756756e-05, + "loss": 0.2237, + "step": 6745 + }, + { + "epoch": 0.32046716991881496, + "grad_norm": 0.5546875, + "learning_rate": 4.158605218024487e-05, + "loss": 0.2258, + "step": 6750 + }, + { + "epoch": 0.3207045530076437, + "grad_norm": 0.484375, + "learning_rate": 4.157227409175228e-05, + "loss": 0.2318, + "step": 6755 + }, + { + "epoch": 0.3209419360964725, + "grad_norm": 0.484375, + "learning_rate": 4.1558487330581153e-05, + "loss": 0.2237, + "step": 6760 + }, + { + "epoch": 0.32117931918530124, + "grad_norm": 0.462890625, + "learning_rate": 4.1544691905228215e-05, + "loss": 0.2177, + "step": 6765 + }, + { + "epoch": 0.32141670227413, + "grad_norm": 0.466796875, + "learning_rate": 4.1530887824195526e-05, + "loss": 0.226, + "step": 6770 + }, + { + "epoch": 0.32165408536295875, + "grad_norm": 0.4765625, + "learning_rate": 4.151707509599047e-05, + "loss": 0.2283, + "step": 6775 + }, + { + "epoch": 0.3218914684517875, + "grad_norm": 0.55078125, + "learning_rate": 4.1503253729125776e-05, + "loss": 0.2257, + "step": 6780 + }, + { + "epoch": 0.32212885154061627, + "grad_norm": 0.447265625, + "learning_rate": 4.148942373211948e-05, + "loss": 0.2239, + "step": 6785 + }, + { + "epoch": 0.322366234629445, + "grad_norm": 0.482421875, + "learning_rate": 4.147558511349496e-05, + "loss": 0.2284, + "step": 6790 + }, + { + "epoch": 0.3226036177182737, + "grad_norm": 0.451171875, + "learning_rate": 4.146173788178088e-05, + "loss": 0.2259, + "step": 6795 + }, + { + "epoch": 0.3228410008071025, + "grad_norm": 0.4453125, + "learning_rate": 4.1447882045511234e-05, + "loss": 0.2296, + "step": 6800 + }, + { + "epoch": 0.32307838389593124, + "grad_norm": 0.478515625, + "learning_rate": 4.143401761322531e-05, + "loss": 0.2279, + "step": 6805 + }, + { + "epoch": 0.32331576698476, + "grad_norm": 0.4609375, + "learning_rate": 4.1420144593467685e-05, + "loss": 0.2247, + "step": 6810 + }, + { + "epoch": 0.32355315007358876, + "grad_norm": 0.443359375, + "learning_rate": 4.1406262994788256e-05, + "loss": 0.2228, + "step": 6815 + }, + { + "epoch": 0.3237905331624175, + "grad_norm": 0.470703125, + "learning_rate": 4.139237282574217e-05, + "loss": 0.2236, + "step": 6820 + }, + { + "epoch": 0.3240279162512463, + "grad_norm": 0.625, + "learning_rate": 4.1378474094889895e-05, + "loss": 0.2244, + "step": 6825 + }, + { + "epoch": 0.32426529934007503, + "grad_norm": 0.515625, + "learning_rate": 4.136456681079715e-05, + "loss": 0.2255, + "step": 6830 + }, + { + "epoch": 0.3245026824289038, + "grad_norm": 0.474609375, + "learning_rate": 4.1350650982034915e-05, + "loss": 0.2254, + "step": 6835 + }, + { + "epoch": 0.3247400655177325, + "grad_norm": 0.515625, + "learning_rate": 4.133672661717946e-05, + "loss": 0.2284, + "step": 6840 + }, + { + "epoch": 0.32497744860656125, + "grad_norm": 0.451171875, + "learning_rate": 4.132279372481232e-05, + "loss": 0.2277, + "step": 6845 + }, + { + "epoch": 0.32521483169539, + "grad_norm": 0.482421875, + "learning_rate": 4.130885231352026e-05, + "loss": 0.2262, + "step": 6850 + }, + { + "epoch": 0.32545221478421876, + "grad_norm": 0.431640625, + "learning_rate": 4.129490239189531e-05, + "loss": 0.2276, + "step": 6855 + }, + { + "epoch": 0.3256895978730475, + "grad_norm": 0.5703125, + "learning_rate": 4.1280943968534745e-05, + "loss": 0.2268, + "step": 6860 + }, + { + "epoch": 0.3259269809618763, + "grad_norm": 0.4375, + "learning_rate": 4.126697705204109e-05, + "loss": 0.224, + "step": 6865 + }, + { + "epoch": 0.32616436405070504, + "grad_norm": 0.435546875, + "learning_rate": 4.125300165102207e-05, + "loss": 0.2265, + "step": 6870 + }, + { + "epoch": 0.3264017471395338, + "grad_norm": 0.421875, + "learning_rate": 4.1239017774090674e-05, + "loss": 0.2293, + "step": 6875 + }, + { + "epoch": 0.32663913022836255, + "grad_norm": 0.447265625, + "learning_rate": 4.122502542986511e-05, + "loss": 0.2283, + "step": 6880 + }, + { + "epoch": 0.3268765133171913, + "grad_norm": 0.51953125, + "learning_rate": 4.121102462696877e-05, + "loss": 0.2278, + "step": 6885 + }, + { + "epoch": 0.32711389640602, + "grad_norm": 0.44921875, + "learning_rate": 4.119701537403031e-05, + "loss": 0.2261, + "step": 6890 + }, + { + "epoch": 0.32735127949484877, + "grad_norm": 0.482421875, + "learning_rate": 4.118299767968356e-05, + "loss": 0.2281, + "step": 6895 + }, + { + "epoch": 0.3275886625836775, + "grad_norm": 0.50390625, + "learning_rate": 4.116897155256755e-05, + "loss": 0.2319, + "step": 6900 + }, + { + "epoch": 0.3278260456725063, + "grad_norm": 0.466796875, + "learning_rate": 4.115493700132653e-05, + "loss": 0.2259, + "step": 6905 + }, + { + "epoch": 0.32806342876133504, + "grad_norm": 0.5546875, + "learning_rate": 4.114089403460994e-05, + "loss": 0.2253, + "step": 6910 + }, + { + "epoch": 0.3283008118501638, + "grad_norm": 0.52734375, + "learning_rate": 4.112684266107238e-05, + "loss": 0.2268, + "step": 6915 + }, + { + "epoch": 0.32853819493899256, + "grad_norm": 0.5546875, + "learning_rate": 4.111278288937366e-05, + "loss": 0.2284, + "step": 6920 + }, + { + "epoch": 0.3287755780278213, + "grad_norm": 0.466796875, + "learning_rate": 4.109871472817874e-05, + "loss": 0.2242, + "step": 6925 + }, + { + "epoch": 0.32901296111665007, + "grad_norm": 0.474609375, + "learning_rate": 4.108463818615778e-05, + "loss": 0.223, + "step": 6930 + }, + { + "epoch": 0.32925034420547883, + "grad_norm": 0.4140625, + "learning_rate": 4.1070553271986065e-05, + "loss": 0.2277, + "step": 6935 + }, + { + "epoch": 0.32948772729430753, + "grad_norm": 0.435546875, + "learning_rate": 4.105645999434409e-05, + "loss": 0.2276, + "step": 6940 + }, + { + "epoch": 0.3297251103831363, + "grad_norm": 0.45703125, + "learning_rate": 4.104235836191746e-05, + "loss": 0.2298, + "step": 6945 + }, + { + "epoch": 0.32996249347196505, + "grad_norm": 0.53125, + "learning_rate": 4.1028248383396954e-05, + "loss": 0.2234, + "step": 6950 + }, + { + "epoch": 0.3301998765607938, + "grad_norm": 0.578125, + "learning_rate": 4.101413006747849e-05, + "loss": 0.2279, + "step": 6955 + }, + { + "epoch": 0.33043725964962256, + "grad_norm": 0.52734375, + "learning_rate": 4.1000003422863115e-05, + "loss": 0.2252, + "step": 6960 + }, + { + "epoch": 0.3306746427384513, + "grad_norm": 0.39453125, + "learning_rate": 4.0985868458257024e-05, + "loss": 0.2257, + "step": 6965 + }, + { + "epoch": 0.3309120258272801, + "grad_norm": 0.484375, + "learning_rate": 4.097172518237152e-05, + "loss": 0.2265, + "step": 6970 + }, + { + "epoch": 0.33114940891610883, + "grad_norm": 0.498046875, + "learning_rate": 4.095757360392305e-05, + "loss": 0.2261, + "step": 6975 + }, + { + "epoch": 0.3313867920049376, + "grad_norm": 0.44140625, + "learning_rate": 4.0943413731633166e-05, + "loss": 0.2273, + "step": 6980 + }, + { + "epoch": 0.3316241750937663, + "grad_norm": 0.53515625, + "learning_rate": 4.092924557422853e-05, + "loss": 0.2311, + "step": 6985 + }, + { + "epoch": 0.33186155818259505, + "grad_norm": 0.5078125, + "learning_rate": 4.091506914044091e-05, + "loss": 0.2284, + "step": 6990 + }, + { + "epoch": 0.3320989412714238, + "grad_norm": 0.45703125, + "learning_rate": 4.090088443900718e-05, + "loss": 0.2255, + "step": 6995 + }, + { + "epoch": 0.33233632436025257, + "grad_norm": 0.5, + "learning_rate": 4.088669147866931e-05, + "loss": 0.2257, + "step": 7000 + }, + { + "epoch": 0.3325737074490813, + "grad_norm": 0.462890625, + "learning_rate": 4.0872490268174356e-05, + "loss": 0.2267, + "step": 7005 + }, + { + "epoch": 0.3328110905379101, + "grad_norm": 0.478515625, + "learning_rate": 4.0858280816274464e-05, + "loss": 0.2281, + "step": 7010 + }, + { + "epoch": 0.33304847362673884, + "grad_norm": 0.48828125, + "learning_rate": 4.084406313172684e-05, + "loss": 0.2243, + "step": 7015 + }, + { + "epoch": 0.3332858567155676, + "grad_norm": 0.470703125, + "learning_rate": 4.082983722329379e-05, + "loss": 0.2249, + "step": 7020 + }, + { + "epoch": 0.33352323980439635, + "grad_norm": 0.435546875, + "learning_rate": 4.081560309974268e-05, + "loss": 0.2265, + "step": 7025 + }, + { + "epoch": 0.3337606228932251, + "grad_norm": 0.51953125, + "learning_rate": 4.080136076984593e-05, + "loss": 0.2242, + "step": 7030 + }, + { + "epoch": 0.3339980059820538, + "grad_norm": 0.458984375, + "learning_rate": 4.078711024238102e-05, + "loss": 0.2262, + "step": 7035 + }, + { + "epoch": 0.33423538907088257, + "grad_norm": 0.462890625, + "learning_rate": 4.0772851526130487e-05, + "loss": 0.2253, + "step": 7040 + }, + { + "epoch": 0.33447277215971133, + "grad_norm": 0.451171875, + "learning_rate": 4.075858462988191e-05, + "loss": 0.2245, + "step": 7045 + }, + { + "epoch": 0.3347101552485401, + "grad_norm": 0.453125, + "learning_rate": 4.074430956242793e-05, + "loss": 0.2289, + "step": 7050 + }, + { + "epoch": 0.33494753833736884, + "grad_norm": 0.51953125, + "learning_rate": 4.073002633256618e-05, + "loss": 0.2267, + "step": 7055 + }, + { + "epoch": 0.3351849214261976, + "grad_norm": 0.4609375, + "learning_rate": 4.071573494909937e-05, + "loss": 0.2278, + "step": 7060 + }, + { + "epoch": 0.33542230451502636, + "grad_norm": 0.486328125, + "learning_rate": 4.07014354208352e-05, + "loss": 0.2226, + "step": 7065 + }, + { + "epoch": 0.3356596876038551, + "grad_norm": 0.41796875, + "learning_rate": 4.0687127756586416e-05, + "loss": 0.2266, + "step": 7070 + }, + { + "epoch": 0.3358970706926839, + "grad_norm": 0.41796875, + "learning_rate": 4.0672811965170765e-05, + "loss": 0.2267, + "step": 7075 + }, + { + "epoch": 0.33613445378151263, + "grad_norm": 0.498046875, + "learning_rate": 4.0658488055410996e-05, + "loss": 0.2285, + "step": 7080 + }, + { + "epoch": 0.33637183687034133, + "grad_norm": 0.412109375, + "learning_rate": 4.064415603613487e-05, + "loss": 0.2272, + "step": 7085 + }, + { + "epoch": 0.3366092199591701, + "grad_norm": 0.443359375, + "learning_rate": 4.062981591617516e-05, + "loss": 0.229, + "step": 7090 + }, + { + "epoch": 0.33684660304799885, + "grad_norm": 0.4609375, + "learning_rate": 4.0615467704369594e-05, + "loss": 0.2241, + "step": 7095 + }, + { + "epoch": 0.3370839861368276, + "grad_norm": 0.5859375, + "learning_rate": 4.060111140956093e-05, + "loss": 0.2228, + "step": 7100 + }, + { + "epoch": 0.33732136922565636, + "grad_norm": 0.4609375, + "learning_rate": 4.0586747040596884e-05, + "loss": 0.2227, + "step": 7105 + }, + { + "epoch": 0.3375587523144851, + "grad_norm": 0.5546875, + "learning_rate": 4.057237460633013e-05, + "loss": 0.2282, + "step": 7110 + }, + { + "epoch": 0.3377961354033139, + "grad_norm": 0.5234375, + "learning_rate": 4.055799411561836e-05, + "loss": 0.2298, + "step": 7115 + }, + { + "epoch": 0.33803351849214264, + "grad_norm": 0.451171875, + "learning_rate": 4.054360557732421e-05, + "loss": 0.2254, + "step": 7120 + }, + { + "epoch": 0.3382709015809714, + "grad_norm": 0.44140625, + "learning_rate": 4.052920900031524e-05, + "loss": 0.2263, + "step": 7125 + }, + { + "epoch": 0.3385082846698001, + "grad_norm": 0.5078125, + "learning_rate": 4.051480439346402e-05, + "loss": 0.2222, + "step": 7130 + }, + { + "epoch": 0.33874566775862885, + "grad_norm": 0.4140625, + "learning_rate": 4.0500391765648014e-05, + "loss": 0.2272, + "step": 7135 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 0.44140625, + "learning_rate": 4.0485971125749704e-05, + "loss": 0.2267, + "step": 7140 + }, + { + "epoch": 0.33922043393628637, + "grad_norm": 0.5703125, + "learning_rate": 4.0471542482656424e-05, + "loss": 0.2272, + "step": 7145 + }, + { + "epoch": 0.3394578170251151, + "grad_norm": 0.5234375, + "learning_rate": 4.045710584526051e-05, + "loss": 0.2267, + "step": 7150 + }, + { + "epoch": 0.3396952001139439, + "grad_norm": 0.4609375, + "learning_rate": 4.0442661222459175e-05, + "loss": 0.2244, + "step": 7155 + }, + { + "epoch": 0.33993258320277264, + "grad_norm": 0.546875, + "learning_rate": 4.0428208623154586e-05, + "loss": 0.2303, + "step": 7160 + }, + { + "epoch": 0.3401699662916014, + "grad_norm": 0.5546875, + "learning_rate": 4.041374805625381e-05, + "loss": 0.2243, + "step": 7165 + }, + { + "epoch": 0.34040734938043016, + "grad_norm": 0.42578125, + "learning_rate": 4.039927953066884e-05, + "loss": 0.2262, + "step": 7170 + }, + { + "epoch": 0.3406447324692589, + "grad_norm": 0.46484375, + "learning_rate": 4.038480305531654e-05, + "loss": 0.2254, + "step": 7175 + }, + { + "epoch": 0.3408821155580876, + "grad_norm": 0.48046875, + "learning_rate": 4.037031863911871e-05, + "loss": 0.2216, + "step": 7180 + }, + { + "epoch": 0.3411194986469164, + "grad_norm": 0.421875, + "learning_rate": 4.0355826291002033e-05, + "loss": 0.2262, + "step": 7185 + }, + { + "epoch": 0.34135688173574513, + "grad_norm": 0.51171875, + "learning_rate": 4.0341326019898075e-05, + "loss": 0.2234, + "step": 7190 + }, + { + "epoch": 0.3415942648245739, + "grad_norm": 0.6015625, + "learning_rate": 4.032681783474327e-05, + "loss": 0.2288, + "step": 7195 + }, + { + "epoch": 0.34183164791340265, + "grad_norm": 0.482421875, + "learning_rate": 4.031230174447897e-05, + "loss": 0.2239, + "step": 7200 + }, + { + "epoch": 0.3420690310022314, + "grad_norm": 0.490234375, + "learning_rate": 4.0297777758051366e-05, + "loss": 0.2251, + "step": 7205 + }, + { + "epoch": 0.34230641409106016, + "grad_norm": 0.41015625, + "learning_rate": 4.028324588441153e-05, + "loss": 0.2235, + "step": 7210 + }, + { + "epoch": 0.3425437971798889, + "grad_norm": 0.52734375, + "learning_rate": 4.0268706132515377e-05, + "loss": 0.2255, + "step": 7215 + }, + { + "epoch": 0.3427811802687177, + "grad_norm": 0.50390625, + "learning_rate": 4.025415851132371e-05, + "loss": 0.2251, + "step": 7220 + }, + { + "epoch": 0.34301856335754644, + "grad_norm": 0.41796875, + "learning_rate": 4.023960302980214e-05, + "loss": 0.2271, + "step": 7225 + }, + { + "epoch": 0.34325594644637514, + "grad_norm": 0.45703125, + "learning_rate": 4.0225039696921154e-05, + "loss": 0.2247, + "step": 7230 + }, + { + "epoch": 0.3434933295352039, + "grad_norm": 0.4921875, + "learning_rate": 4.0210468521656065e-05, + "loss": 0.2218, + "step": 7235 + }, + { + "epoch": 0.34373071262403265, + "grad_norm": 0.46484375, + "learning_rate": 4.0195889512987035e-05, + "loss": 0.2232, + "step": 7240 + }, + { + "epoch": 0.3439680957128614, + "grad_norm": 0.52734375, + "learning_rate": 4.018130267989902e-05, + "loss": 0.2296, + "step": 7245 + }, + { + "epoch": 0.34420547880169017, + "grad_norm": 0.4609375, + "learning_rate": 4.016670803138183e-05, + "loss": 0.2233, + "step": 7250 + }, + { + "epoch": 0.3444428618905189, + "grad_norm": 0.46875, + "learning_rate": 4.015210557643009e-05, + "loss": 0.2258, + "step": 7255 + }, + { + "epoch": 0.3446802449793477, + "grad_norm": 0.44921875, + "learning_rate": 4.013749532404321e-05, + "loss": 0.2267, + "step": 7260 + }, + { + "epoch": 0.34491762806817644, + "grad_norm": 0.40625, + "learning_rate": 4.012287728322542e-05, + "loss": 0.2236, + "step": 7265 + }, + { + "epoch": 0.3451550111570052, + "grad_norm": 0.416015625, + "learning_rate": 4.010825146298577e-05, + "loss": 0.2292, + "step": 7270 + }, + { + "epoch": 0.3453923942458339, + "grad_norm": 0.39453125, + "learning_rate": 4.009361787233808e-05, + "loss": 0.224, + "step": 7275 + }, + { + "epoch": 0.34562977733466266, + "grad_norm": 0.5390625, + "learning_rate": 4.007897652030096e-05, + "loss": 0.2309, + "step": 7280 + }, + { + "epoch": 0.3458671604234914, + "grad_norm": 0.443359375, + "learning_rate": 4.0064327415897815e-05, + "loss": 0.2247, + "step": 7285 + }, + { + "epoch": 0.3461045435123202, + "grad_norm": 0.53515625, + "learning_rate": 4.00496705681568e-05, + "loss": 0.2248, + "step": 7290 + }, + { + "epoch": 0.34634192660114893, + "grad_norm": 0.48046875, + "learning_rate": 4.00350059861109e-05, + "loss": 0.2248, + "step": 7295 + }, + { + "epoch": 0.3465793096899777, + "grad_norm": 0.490234375, + "learning_rate": 4.00203336787978e-05, + "loss": 0.2253, + "step": 7300 + }, + { + "epoch": 0.34681669277880645, + "grad_norm": 0.51953125, + "learning_rate": 4.000565365525999e-05, + "loss": 0.2235, + "step": 7305 + }, + { + "epoch": 0.3470540758676352, + "grad_norm": 0.47265625, + "learning_rate": 3.99909659245447e-05, + "loss": 0.2301, + "step": 7310 + }, + { + "epoch": 0.34729145895646396, + "grad_norm": 0.484375, + "learning_rate": 3.997627049570392e-05, + "loss": 0.2229, + "step": 7315 + }, + { + "epoch": 0.3475288420452927, + "grad_norm": 0.494140625, + "learning_rate": 3.996156737779436e-05, + "loss": 0.2261, + "step": 7320 + }, + { + "epoch": 0.3477662251341214, + "grad_norm": 0.5, + "learning_rate": 3.994685657987749e-05, + "loss": 0.2225, + "step": 7325 + }, + { + "epoch": 0.3480036082229502, + "grad_norm": 0.451171875, + "learning_rate": 3.993213811101951e-05, + "loss": 0.2273, + "step": 7330 + }, + { + "epoch": 0.34824099131177894, + "grad_norm": 0.4921875, + "learning_rate": 3.991741198029135e-05, + "loss": 0.2209, + "step": 7335 + }, + { + "epoch": 0.3484783744006077, + "grad_norm": 0.40234375, + "learning_rate": 3.990267819676865e-05, + "loss": 0.2244, + "step": 7340 + }, + { + "epoch": 0.34871575748943645, + "grad_norm": 0.458984375, + "learning_rate": 3.988793676953177e-05, + "loss": 0.2256, + "step": 7345 + }, + { + "epoch": 0.3489531405782652, + "grad_norm": 0.69140625, + "learning_rate": 3.98731877076658e-05, + "loss": 0.2259, + "step": 7350 + }, + { + "epoch": 0.34919052366709397, + "grad_norm": 0.56640625, + "learning_rate": 3.985843102026051e-05, + "loss": 0.2262, + "step": 7355 + }, + { + "epoch": 0.3494279067559227, + "grad_norm": 0.44140625, + "learning_rate": 3.984366671641037e-05, + "loss": 0.2216, + "step": 7360 + }, + { + "epoch": 0.3496652898447515, + "grad_norm": 0.486328125, + "learning_rate": 3.9828894805214576e-05, + "loss": 0.2259, + "step": 7365 + }, + { + "epoch": 0.34990267293358024, + "grad_norm": 0.48828125, + "learning_rate": 3.981411529577697e-05, + "loss": 0.2223, + "step": 7370 + }, + { + "epoch": 0.35014005602240894, + "grad_norm": 0.455078125, + "learning_rate": 3.97993281972061e-05, + "loss": 0.2245, + "step": 7375 + }, + { + "epoch": 0.3503774391112377, + "grad_norm": 0.55078125, + "learning_rate": 3.97845335186152e-05, + "loss": 0.2295, + "step": 7380 + }, + { + "epoch": 0.35061482220006646, + "grad_norm": 0.458984375, + "learning_rate": 3.976973126912215e-05, + "loss": 0.2235, + "step": 7385 + }, + { + "epoch": 0.3508522052888952, + "grad_norm": 0.474609375, + "learning_rate": 3.9754921457849516e-05, + "loss": 0.2274, + "step": 7390 + }, + { + "epoch": 0.35108958837772397, + "grad_norm": 0.443359375, + "learning_rate": 3.974010409392451e-05, + "loss": 0.2258, + "step": 7395 + }, + { + "epoch": 0.35132697146655273, + "grad_norm": 0.486328125, + "learning_rate": 3.9725279186479006e-05, + "loss": 0.2264, + "step": 7400 + }, + { + "epoch": 0.3515643545553815, + "grad_norm": 0.5234375, + "learning_rate": 3.971044674464954e-05, + "loss": 0.222, + "step": 7405 + }, + { + "epoch": 0.35180173764421024, + "grad_norm": 0.5234375, + "learning_rate": 3.969560677757727e-05, + "loss": 0.2235, + "step": 7410 + }, + { + "epoch": 0.352039120733039, + "grad_norm": 0.447265625, + "learning_rate": 3.9680759294408e-05, + "loss": 0.2244, + "step": 7415 + }, + { + "epoch": 0.3522765038218677, + "grad_norm": 0.55078125, + "learning_rate": 3.966590430429216e-05, + "loss": 0.2301, + "step": 7420 + }, + { + "epoch": 0.35251388691069646, + "grad_norm": 0.53515625, + "learning_rate": 3.9651041816384824e-05, + "loss": 0.2288, + "step": 7425 + }, + { + "epoch": 0.3527512699995252, + "grad_norm": 0.51953125, + "learning_rate": 3.9636171839845675e-05, + "loss": 0.2247, + "step": 7430 + }, + { + "epoch": 0.352988653088354, + "grad_norm": 0.5078125, + "learning_rate": 3.962129438383901e-05, + "loss": 0.2238, + "step": 7435 + }, + { + "epoch": 0.35322603617718273, + "grad_norm": 0.453125, + "learning_rate": 3.960640945753372e-05, + "loss": 0.2249, + "step": 7440 + }, + { + "epoch": 0.3534634192660115, + "grad_norm": 0.416015625, + "learning_rate": 3.959151707010335e-05, + "loss": 0.2252, + "step": 7445 + }, + { + "epoch": 0.35370080235484025, + "grad_norm": 0.48046875, + "learning_rate": 3.9576617230725984e-05, + "loss": 0.2259, + "step": 7450 + }, + { + "epoch": 0.353938185443669, + "grad_norm": 0.40234375, + "learning_rate": 3.956170994858434e-05, + "loss": 0.2234, + "step": 7455 + }, + { + "epoch": 0.35417556853249776, + "grad_norm": 0.5234375, + "learning_rate": 3.954679523286569e-05, + "loss": 0.2258, + "step": 7460 + }, + { + "epoch": 0.3544129516213265, + "grad_norm": 0.447265625, + "learning_rate": 3.953187309276194e-05, + "loss": 0.2248, + "step": 7465 + }, + { + "epoch": 0.3546503347101552, + "grad_norm": 0.5390625, + "learning_rate": 3.95169435374695e-05, + "loss": 0.2278, + "step": 7470 + }, + { + "epoch": 0.354887717798984, + "grad_norm": 0.5, + "learning_rate": 3.950200657618942e-05, + "loss": 0.2256, + "step": 7475 + }, + { + "epoch": 0.35512510088781274, + "grad_norm": 0.42578125, + "learning_rate": 3.9487062218127246e-05, + "loss": 0.2253, + "step": 7480 + }, + { + "epoch": 0.3553624839766415, + "grad_norm": 0.482421875, + "learning_rate": 3.9472110472493144e-05, + "loss": 0.2249, + "step": 7485 + }, + { + "epoch": 0.35559986706547025, + "grad_norm": 0.515625, + "learning_rate": 3.945715134850179e-05, + "loss": 0.2249, + "step": 7490 + }, + { + "epoch": 0.355837250154299, + "grad_norm": 0.494140625, + "learning_rate": 3.944218485537245e-05, + "loss": 0.2224, + "step": 7495 + }, + { + "epoch": 0.35607463324312777, + "grad_norm": 0.478515625, + "learning_rate": 3.942721100232887e-05, + "loss": 0.2275, + "step": 7500 + }, + { + "epoch": 0.3563120163319565, + "grad_norm": 0.455078125, + "learning_rate": 3.94122297985994e-05, + "loss": 0.2248, + "step": 7505 + }, + { + "epoch": 0.3565493994207853, + "grad_norm": 0.5078125, + "learning_rate": 3.9397241253416864e-05, + "loss": 0.225, + "step": 7510 + }, + { + "epoch": 0.35678678250961404, + "grad_norm": 0.3984375, + "learning_rate": 3.938224537601865e-05, + "loss": 0.2218, + "step": 7515 + }, + { + "epoch": 0.35702416559844274, + "grad_norm": 0.44921875, + "learning_rate": 3.936724217564665e-05, + "loss": 0.2282, + "step": 7520 + }, + { + "epoch": 0.3572615486872715, + "grad_norm": 0.54296875, + "learning_rate": 3.9352231661547244e-05, + "loss": 0.2222, + "step": 7525 + }, + { + "epoch": 0.35749893177610026, + "grad_norm": 0.49609375, + "learning_rate": 3.933721384297137e-05, + "loss": 0.2239, + "step": 7530 + }, + { + "epoch": 0.357736314864929, + "grad_norm": 0.484375, + "learning_rate": 3.932218872917443e-05, + "loss": 0.2261, + "step": 7535 + }, + { + "epoch": 0.3579736979537578, + "grad_norm": 0.486328125, + "learning_rate": 3.930715632941633e-05, + "loss": 0.2312, + "step": 7540 + }, + { + "epoch": 0.35821108104258653, + "grad_norm": 0.396484375, + "learning_rate": 3.92921166529615e-05, + "loss": 0.2254, + "step": 7545 + }, + { + "epoch": 0.3584484641314153, + "grad_norm": 0.470703125, + "learning_rate": 3.927706970907877e-05, + "loss": 0.2254, + "step": 7550 + }, + { + "epoch": 0.35868584722024405, + "grad_norm": 0.484375, + "learning_rate": 3.926201550704155e-05, + "loss": 0.2261, + "step": 7555 + }, + { + "epoch": 0.3589232303090728, + "grad_norm": 0.44140625, + "learning_rate": 3.924695405612765e-05, + "loss": 0.2243, + "step": 7560 + }, + { + "epoch": 0.3591606133979015, + "grad_norm": 0.515625, + "learning_rate": 3.9231885365619385e-05, + "loss": 0.225, + "step": 7565 + }, + { + "epoch": 0.35939799648673026, + "grad_norm": 0.40234375, + "learning_rate": 3.921680944480352e-05, + "loss": 0.2231, + "step": 7570 + }, + { + "epoch": 0.359635379575559, + "grad_norm": 0.462890625, + "learning_rate": 3.920172630297127e-05, + "loss": 0.2266, + "step": 7575 + }, + { + "epoch": 0.3598727626643878, + "grad_norm": 0.44140625, + "learning_rate": 3.9186635949418317e-05, + "loss": 0.2261, + "step": 7580 + }, + { + "epoch": 0.36011014575321654, + "grad_norm": 0.455078125, + "learning_rate": 3.917153839344476e-05, + "loss": 0.2246, + "step": 7585 + }, + { + "epoch": 0.3603475288420453, + "grad_norm": 0.51953125, + "learning_rate": 3.9156433644355175e-05, + "loss": 0.2234, + "step": 7590 + }, + { + "epoch": 0.36058491193087405, + "grad_norm": 0.47265625, + "learning_rate": 3.914132171145854e-05, + "loss": 0.2278, + "step": 7595 + }, + { + "epoch": 0.3608222950197028, + "grad_norm": 0.58984375, + "learning_rate": 3.9126202604068265e-05, + "loss": 0.2262, + "step": 7600 + }, + { + "epoch": 0.36105967810853157, + "grad_norm": 0.455078125, + "learning_rate": 3.91110763315022e-05, + "loss": 0.2209, + "step": 7605 + }, + { + "epoch": 0.3612970611973603, + "grad_norm": 0.427734375, + "learning_rate": 3.909594290308259e-05, + "loss": 0.2219, + "step": 7610 + }, + { + "epoch": 0.361534444286189, + "grad_norm": 0.427734375, + "learning_rate": 3.90808023281361e-05, + "loss": 0.2251, + "step": 7615 + }, + { + "epoch": 0.3617718273750178, + "grad_norm": 0.431640625, + "learning_rate": 3.906565461599379e-05, + "loss": 0.2269, + "step": 7620 + }, + { + "epoch": 0.36200921046384654, + "grad_norm": 0.494140625, + "learning_rate": 3.905049977599114e-05, + "loss": 0.2271, + "step": 7625 + }, + { + "epoch": 0.3622465935526753, + "grad_norm": 0.49609375, + "learning_rate": 3.9035337817468e-05, + "loss": 0.2247, + "step": 7630 + }, + { + "epoch": 0.36248397664150406, + "grad_norm": 0.4140625, + "learning_rate": 3.902016874976862e-05, + "loss": 0.2243, + "step": 7635 + }, + { + "epoch": 0.3627213597303328, + "grad_norm": 0.423828125, + "learning_rate": 3.900499258224162e-05, + "loss": 0.2233, + "step": 7640 + }, + { + "epoch": 0.3629587428191616, + "grad_norm": 0.51953125, + "learning_rate": 3.8989809324240006e-05, + "loss": 0.2287, + "step": 7645 + }, + { + "epoch": 0.36319612590799033, + "grad_norm": 0.478515625, + "learning_rate": 3.897461898512116e-05, + "loss": 0.223, + "step": 7650 + }, + { + "epoch": 0.3634335089968191, + "grad_norm": 0.5234375, + "learning_rate": 3.8959421574246816e-05, + "loss": 0.2251, + "step": 7655 + }, + { + "epoch": 0.36367089208564785, + "grad_norm": 0.44140625, + "learning_rate": 3.894421710098306e-05, + "loss": 0.2305, + "step": 7660 + }, + { + "epoch": 0.36390827517447655, + "grad_norm": 0.498046875, + "learning_rate": 3.892900557470034e-05, + "loss": 0.2247, + "step": 7665 + }, + { + "epoch": 0.3641456582633053, + "grad_norm": 0.486328125, + "learning_rate": 3.891378700477347e-05, + "loss": 0.2265, + "step": 7670 + }, + { + "epoch": 0.36438304135213406, + "grad_norm": 0.482421875, + "learning_rate": 3.889856140058156e-05, + "loss": 0.2273, + "step": 7675 + }, + { + "epoch": 0.3646204244409628, + "grad_norm": 0.458984375, + "learning_rate": 3.888332877150809e-05, + "loss": 0.2262, + "step": 7680 + }, + { + "epoch": 0.3648578075297916, + "grad_norm": 0.470703125, + "learning_rate": 3.886808912694087e-05, + "loss": 0.2245, + "step": 7685 + }, + { + "epoch": 0.36509519061862034, + "grad_norm": 0.396484375, + "learning_rate": 3.8852842476272006e-05, + "loss": 0.2259, + "step": 7690 + }, + { + "epoch": 0.3653325737074491, + "grad_norm": 0.4296875, + "learning_rate": 3.8837588828897954e-05, + "loss": 0.2258, + "step": 7695 + }, + { + "epoch": 0.36556995679627785, + "grad_norm": 0.54296875, + "learning_rate": 3.8822328194219463e-05, + "loss": 0.2259, + "step": 7700 + }, + { + "epoch": 0.3658073398851066, + "grad_norm": 0.55859375, + "learning_rate": 3.8807060581641594e-05, + "loss": 0.2297, + "step": 7705 + }, + { + "epoch": 0.3660447229739353, + "grad_norm": 0.44140625, + "learning_rate": 3.87917860005737e-05, + "loss": 0.2252, + "step": 7710 + }, + { + "epoch": 0.36628210606276407, + "grad_norm": 0.4765625, + "learning_rate": 3.877650446042944e-05, + "loss": 0.2265, + "step": 7715 + }, + { + "epoch": 0.3665194891515928, + "grad_norm": 0.51171875, + "learning_rate": 3.876121597062677e-05, + "loss": 0.2262, + "step": 7720 + }, + { + "epoch": 0.3667568722404216, + "grad_norm": 0.408203125, + "learning_rate": 3.874592054058789e-05, + "loss": 0.221, + "step": 7725 + }, + { + "epoch": 0.36699425532925034, + "grad_norm": 0.486328125, + "learning_rate": 3.873061817973933e-05, + "loss": 0.2255, + "step": 7730 + }, + { + "epoch": 0.3672316384180791, + "grad_norm": 0.4296875, + "learning_rate": 3.8715308897511856e-05, + "loss": 0.223, + "step": 7735 + }, + { + "epoch": 0.36746902150690786, + "grad_norm": 0.484375, + "learning_rate": 3.8699992703340506e-05, + "loss": 0.2222, + "step": 7740 + }, + { + "epoch": 0.3677064045957366, + "grad_norm": 0.484375, + "learning_rate": 3.868466960666458e-05, + "loss": 0.2268, + "step": 7745 + }, + { + "epoch": 0.36794378768456537, + "grad_norm": 0.56640625, + "learning_rate": 3.8669339616927644e-05, + "loss": 0.2295, + "step": 7750 + }, + { + "epoch": 0.36818117077339413, + "grad_norm": 0.486328125, + "learning_rate": 3.865400274357748e-05, + "loss": 0.2233, + "step": 7755 + }, + { + "epoch": 0.36841855386222283, + "grad_norm": 0.462890625, + "learning_rate": 3.863865899606615e-05, + "loss": 0.2257, + "step": 7760 + }, + { + "epoch": 0.3686559369510516, + "grad_norm": 0.439453125, + "learning_rate": 3.8623308383849936e-05, + "loss": 0.2265, + "step": 7765 + }, + { + "epoch": 0.36889332003988035, + "grad_norm": 0.59765625, + "learning_rate": 3.8607950916389344e-05, + "loss": 0.22, + "step": 7770 + }, + { + "epoch": 0.3691307031287091, + "grad_norm": 0.466796875, + "learning_rate": 3.859258660314912e-05, + "loss": 0.2237, + "step": 7775 + }, + { + "epoch": 0.36936808621753786, + "grad_norm": 0.453125, + "learning_rate": 3.85772154535982e-05, + "loss": 0.2245, + "step": 7780 + }, + { + "epoch": 0.3696054693063666, + "grad_norm": 0.486328125, + "learning_rate": 3.856183747720978e-05, + "loss": 0.2239, + "step": 7785 + }, + { + "epoch": 0.3698428523951954, + "grad_norm": 0.4375, + "learning_rate": 3.854645268346121e-05, + "loss": 0.2254, + "step": 7790 + }, + { + "epoch": 0.37008023548402413, + "grad_norm": 0.5078125, + "learning_rate": 3.8531061081834104e-05, + "loss": 0.2259, + "step": 7795 + }, + { + "epoch": 0.3703176185728529, + "grad_norm": 0.4765625, + "learning_rate": 3.8515662681814194e-05, + "loss": 0.2264, + "step": 7800 + }, + { + "epoch": 0.37055500166168165, + "grad_norm": 0.443359375, + "learning_rate": 3.8500257492891484e-05, + "loss": 0.2266, + "step": 7805 + }, + { + "epoch": 0.37079238475051035, + "grad_norm": 0.494140625, + "learning_rate": 3.8484845524560095e-05, + "loss": 0.2253, + "step": 7810 + }, + { + "epoch": 0.3710297678393391, + "grad_norm": 0.466796875, + "learning_rate": 3.8469426786318357e-05, + "loss": 0.2244, + "step": 7815 + }, + { + "epoch": 0.37126715092816787, + "grad_norm": 0.6015625, + "learning_rate": 3.845400128766877e-05, + "loss": 0.2253, + "step": 7820 + }, + { + "epoch": 0.3715045340169966, + "grad_norm": 0.48828125, + "learning_rate": 3.843856903811801e-05, + "loss": 0.2266, + "step": 7825 + }, + { + "epoch": 0.3717419171058254, + "grad_norm": 0.361328125, + "learning_rate": 3.8423130047176886e-05, + "loss": 0.2261, + "step": 7830 + }, + { + "epoch": 0.37197930019465414, + "grad_norm": 0.474609375, + "learning_rate": 3.8407684324360396e-05, + "loss": 0.225, + "step": 7835 + }, + { + "epoch": 0.3722166832834829, + "grad_norm": 0.62109375, + "learning_rate": 3.839223187918764e-05, + "loss": 0.2287, + "step": 7840 + }, + { + "epoch": 0.37245406637231165, + "grad_norm": 0.7109375, + "learning_rate": 3.837677272118192e-05, + "loss": 0.2264, + "step": 7845 + }, + { + "epoch": 0.3726914494611404, + "grad_norm": 0.578125, + "learning_rate": 3.8361306859870624e-05, + "loss": 0.2257, + "step": 7850 + }, + { + "epoch": 0.3729288325499691, + "grad_norm": 0.40625, + "learning_rate": 3.8345834304785305e-05, + "loss": 0.2207, + "step": 7855 + }, + { + "epoch": 0.37316621563879787, + "grad_norm": 0.408203125, + "learning_rate": 3.8330355065461625e-05, + "loss": 0.2227, + "step": 7860 + }, + { + "epoch": 0.37340359872762663, + "grad_norm": 0.4140625, + "learning_rate": 3.831486915143937e-05, + "loss": 0.2232, + "step": 7865 + }, + { + "epoch": 0.3736409818164554, + "grad_norm": 0.5703125, + "learning_rate": 3.829937657226243e-05, + "loss": 0.2233, + "step": 7870 + }, + { + "epoch": 0.37387836490528414, + "grad_norm": 0.48828125, + "learning_rate": 3.8283877337478816e-05, + "loss": 0.2278, + "step": 7875 + }, + { + "epoch": 0.3741157479941129, + "grad_norm": 0.490234375, + "learning_rate": 3.826837145664064e-05, + "loss": 0.2248, + "step": 7880 + }, + { + "epoch": 0.37435313108294166, + "grad_norm": 0.47265625, + "learning_rate": 3.82528589393041e-05, + "loss": 0.2248, + "step": 7885 + }, + { + "epoch": 0.3745905141717704, + "grad_norm": 0.447265625, + "learning_rate": 3.823733979502949e-05, + "loss": 0.228, + "step": 7890 + }, + { + "epoch": 0.3748278972605992, + "grad_norm": 0.486328125, + "learning_rate": 3.82218140333812e-05, + "loss": 0.2237, + "step": 7895 + }, + { + "epoch": 0.37506528034942793, + "grad_norm": 0.423828125, + "learning_rate": 3.820628166392766e-05, + "loss": 0.2226, + "step": 7900 + }, + { + "epoch": 0.37530266343825663, + "grad_norm": 0.443359375, + "learning_rate": 3.8190742696241426e-05, + "loss": 0.2249, + "step": 7905 + }, + { + "epoch": 0.3755400465270854, + "grad_norm": 0.4375, + "learning_rate": 3.817519713989909e-05, + "loss": 0.2269, + "step": 7910 + }, + { + "epoch": 0.37577742961591415, + "grad_norm": 0.50390625, + "learning_rate": 3.815964500448129e-05, + "loss": 0.2252, + "step": 7915 + }, + { + "epoch": 0.3760148127047429, + "grad_norm": 0.482421875, + "learning_rate": 3.8144086299572746e-05, + "loss": 0.2255, + "step": 7920 + }, + { + "epoch": 0.37625219579357166, + "grad_norm": 0.5234375, + "learning_rate": 3.812852103476222e-05, + "loss": 0.2204, + "step": 7925 + }, + { + "epoch": 0.3764895788824004, + "grad_norm": 0.490234375, + "learning_rate": 3.8112949219642514e-05, + "loss": 0.2241, + "step": 7930 + }, + { + "epoch": 0.3767269619712292, + "grad_norm": 0.44140625, + "learning_rate": 3.809737086381046e-05, + "loss": 0.2248, + "step": 7935 + }, + { + "epoch": 0.37696434506005794, + "grad_norm": 0.404296875, + "learning_rate": 3.8081785976866934e-05, + "loss": 0.2293, + "step": 7940 + }, + { + "epoch": 0.3772017281488867, + "grad_norm": 0.462890625, + "learning_rate": 3.806619456841682e-05, + "loss": 0.2235, + "step": 7945 + }, + { + "epoch": 0.37743911123771545, + "grad_norm": 0.474609375, + "learning_rate": 3.805059664806906e-05, + "loss": 0.2242, + "step": 7950 + }, + { + "epoch": 0.37767649432654415, + "grad_norm": 0.458984375, + "learning_rate": 3.8034992225436554e-05, + "loss": 0.2256, + "step": 7955 + }, + { + "epoch": 0.3779138774153729, + "grad_norm": 0.56640625, + "learning_rate": 3.801938131013625e-05, + "loss": 0.2303, + "step": 7960 + }, + { + "epoch": 0.37815126050420167, + "grad_norm": 0.54296875, + "learning_rate": 3.800376391178907e-05, + "loss": 0.2249, + "step": 7965 + }, + { + "epoch": 0.3783886435930304, + "grad_norm": 0.45703125, + "learning_rate": 3.798814004001997e-05, + "loss": 0.2226, + "step": 7970 + }, + { + "epoch": 0.3786260266818592, + "grad_norm": 0.50390625, + "learning_rate": 3.7972509704457835e-05, + "loss": 0.2295, + "step": 7975 + }, + { + "epoch": 0.37886340977068794, + "grad_norm": 0.5390625, + "learning_rate": 3.7956872914735605e-05, + "loss": 0.2246, + "step": 7980 + }, + { + "epoch": 0.3791007928595167, + "grad_norm": 0.447265625, + "learning_rate": 3.7941229680490144e-05, + "loss": 0.2272, + "step": 7985 + }, + { + "epoch": 0.37933817594834546, + "grad_norm": 0.490234375, + "learning_rate": 3.792558001136232e-05, + "loss": 0.2257, + "step": 7990 + }, + { + "epoch": 0.3795755590371742, + "grad_norm": 0.421875, + "learning_rate": 3.7909923916996924e-05, + "loss": 0.2251, + "step": 7995 + }, + { + "epoch": 0.3798129421260029, + "grad_norm": 0.4765625, + "learning_rate": 3.789426140704277e-05, + "loss": 0.2246, + "step": 8000 + }, + { + "epoch": 0.3800503252148317, + "grad_norm": 0.404296875, + "learning_rate": 3.787859249115258e-05, + "loss": 0.2215, + "step": 8005 + }, + { + "epoch": 0.38028770830366043, + "grad_norm": 0.49609375, + "learning_rate": 3.786291717898303e-05, + "loss": 0.224, + "step": 8010 + }, + { + "epoch": 0.3805250913924892, + "grad_norm": 0.416015625, + "learning_rate": 3.7847235480194736e-05, + "loss": 0.2263, + "step": 8015 + }, + { + "epoch": 0.38076247448131795, + "grad_norm": 0.55078125, + "learning_rate": 3.783154740445227e-05, + "loss": 0.2182, + "step": 8020 + }, + { + "epoch": 0.3809998575701467, + "grad_norm": 0.44921875, + "learning_rate": 3.7815852961424124e-05, + "loss": 0.2242, + "step": 8025 + }, + { + "epoch": 0.38123724065897546, + "grad_norm": 0.470703125, + "learning_rate": 3.780015216078271e-05, + "loss": 0.2255, + "step": 8030 + }, + { + "epoch": 0.3814746237478042, + "grad_norm": 0.5390625, + "learning_rate": 3.7784445012204356e-05, + "loss": 0.2242, + "step": 8035 + }, + { + "epoch": 0.381712006836633, + "grad_norm": 0.66796875, + "learning_rate": 3.776873152536931e-05, + "loss": 0.2232, + "step": 8040 + }, + { + "epoch": 0.38194938992546174, + "grad_norm": 0.42578125, + "learning_rate": 3.7753011709961704e-05, + "loss": 0.226, + "step": 8045 + }, + { + "epoch": 0.38218677301429044, + "grad_norm": 0.439453125, + "learning_rate": 3.7737285575669626e-05, + "loss": 0.2211, + "step": 8050 + }, + { + "epoch": 0.3824241561031192, + "grad_norm": 0.53515625, + "learning_rate": 3.7721553132184987e-05, + "loss": 0.2255, + "step": 8055 + }, + { + "epoch": 0.38266153919194795, + "grad_norm": 0.453125, + "learning_rate": 3.770581438920364e-05, + "loss": 0.2284, + "step": 8060 + }, + { + "epoch": 0.3828989222807767, + "grad_norm": 0.52734375, + "learning_rate": 3.7690069356425286e-05, + "loss": 0.2234, + "step": 8065 + }, + { + "epoch": 0.38313630536960547, + "grad_norm": 0.51953125, + "learning_rate": 3.7674318043553535e-05, + "loss": 0.2258, + "step": 8070 + }, + { + "epoch": 0.3833736884584342, + "grad_norm": 0.484375, + "learning_rate": 3.765856046029583e-05, + "loss": 0.2263, + "step": 8075 + }, + { + "epoch": 0.383611071547263, + "grad_norm": 0.453125, + "learning_rate": 3.764279661636352e-05, + "loss": 0.225, + "step": 8080 + }, + { + "epoch": 0.38384845463609174, + "grad_norm": 0.4765625, + "learning_rate": 3.762702652147177e-05, + "loss": 0.2227, + "step": 8085 + }, + { + "epoch": 0.3840858377249205, + "grad_norm": 0.49609375, + "learning_rate": 3.7611250185339625e-05, + "loss": 0.2265, + "step": 8090 + }, + { + "epoch": 0.38432322081374926, + "grad_norm": 0.478515625, + "learning_rate": 3.759546761768997e-05, + "loss": 0.2273, + "step": 8095 + }, + { + "epoch": 0.38456060390257796, + "grad_norm": 0.41796875, + "learning_rate": 3.7579678828249526e-05, + "loss": 0.225, + "step": 8100 + }, + { + "epoch": 0.3847979869914067, + "grad_norm": 0.47265625, + "learning_rate": 3.756388382674887e-05, + "loss": 0.2266, + "step": 8105 + }, + { + "epoch": 0.3850353700802355, + "grad_norm": 0.4296875, + "learning_rate": 3.754808262292235e-05, + "loss": 0.2233, + "step": 8110 + }, + { + "epoch": 0.38527275316906423, + "grad_norm": 0.458984375, + "learning_rate": 3.753227522650822e-05, + "loss": 0.2275, + "step": 8115 + }, + { + "epoch": 0.385510136257893, + "grad_norm": 0.447265625, + "learning_rate": 3.751646164724849e-05, + "loss": 0.2215, + "step": 8120 + }, + { + "epoch": 0.38574751934672175, + "grad_norm": 0.53515625, + "learning_rate": 3.750064189488899e-05, + "loss": 0.2261, + "step": 8125 + }, + { + "epoch": 0.3859849024355505, + "grad_norm": 0.431640625, + "learning_rate": 3.748481597917936e-05, + "loss": 0.2243, + "step": 8130 + }, + { + "epoch": 0.38622228552437926, + "grad_norm": 0.54296875, + "learning_rate": 3.746898390987305e-05, + "loss": 0.2212, + "step": 8135 + }, + { + "epoch": 0.386459668613208, + "grad_norm": 0.455078125, + "learning_rate": 3.7453145696727295e-05, + "loss": 0.2285, + "step": 8140 + }, + { + "epoch": 0.3866970517020367, + "grad_norm": 0.447265625, + "learning_rate": 3.7437301349503105e-05, + "loss": 0.2243, + "step": 8145 + }, + { + "epoch": 0.3869344347908655, + "grad_norm": 0.447265625, + "learning_rate": 3.742145087796527e-05, + "loss": 0.2196, + "step": 8150 + }, + { + "epoch": 0.38717181787969424, + "grad_norm": 0.455078125, + "learning_rate": 3.7405594291882394e-05, + "loss": 0.2257, + "step": 8155 + }, + { + "epoch": 0.387409200968523, + "grad_norm": 0.455078125, + "learning_rate": 3.738973160102679e-05, + "loss": 0.2253, + "step": 8160 + }, + { + "epoch": 0.38764658405735175, + "grad_norm": 0.482421875, + "learning_rate": 3.737386281517457e-05, + "loss": 0.2271, + "step": 8165 + }, + { + "epoch": 0.3878839671461805, + "grad_norm": 0.53515625, + "learning_rate": 3.73579879441056e-05, + "loss": 0.2281, + "step": 8170 + }, + { + "epoch": 0.38812135023500927, + "grad_norm": 0.4609375, + "learning_rate": 3.734210699760348e-05, + "loss": 0.2253, + "step": 8175 + }, + { + "epoch": 0.388358733323838, + "grad_norm": 0.56640625, + "learning_rate": 3.732621998545558e-05, + "loss": 0.2263, + "step": 8180 + }, + { + "epoch": 0.3885961164126668, + "grad_norm": 0.4609375, + "learning_rate": 3.731032691745298e-05, + "loss": 0.2263, + "step": 8185 + }, + { + "epoch": 0.38883349950149554, + "grad_norm": 0.4453125, + "learning_rate": 3.729442780339052e-05, + "loss": 0.2244, + "step": 8190 + }, + { + "epoch": 0.38907088259032424, + "grad_norm": 0.46484375, + "learning_rate": 3.727852265306674e-05, + "loss": 0.2275, + "step": 8195 + }, + { + "epoch": 0.389308265679153, + "grad_norm": 0.48828125, + "learning_rate": 3.726261147628392e-05, + "loss": 0.2246, + "step": 8200 + }, + { + "epoch": 0.38954564876798176, + "grad_norm": 0.39453125, + "learning_rate": 3.724669428284803e-05, + "loss": 0.2279, + "step": 8205 + }, + { + "epoch": 0.3897830318568105, + "grad_norm": 0.51953125, + "learning_rate": 3.723077108256878e-05, + "loss": 0.2289, + "step": 8210 + }, + { + "epoch": 0.39002041494563927, + "grad_norm": 0.47265625, + "learning_rate": 3.721484188525956e-05, + "loss": 0.2246, + "step": 8215 + }, + { + "epoch": 0.39025779803446803, + "grad_norm": 0.490234375, + "learning_rate": 3.7198906700737466e-05, + "loss": 0.2263, + "step": 8220 + }, + { + "epoch": 0.3904951811232968, + "grad_norm": 0.40234375, + "learning_rate": 3.718296553882327e-05, + "loss": 0.2263, + "step": 8225 + }, + { + "epoch": 0.39073256421212554, + "grad_norm": 0.48046875, + "learning_rate": 3.716701840934144e-05, + "loss": 0.2255, + "step": 8230 + }, + { + "epoch": 0.3909699473009543, + "grad_norm": 0.447265625, + "learning_rate": 3.715106532212014e-05, + "loss": 0.2301, + "step": 8235 + }, + { + "epoch": 0.39120733038978306, + "grad_norm": 0.52734375, + "learning_rate": 3.7135106286991154e-05, + "loss": 0.2297, + "step": 8240 + }, + { + "epoch": 0.39144471347861176, + "grad_norm": 0.44140625, + "learning_rate": 3.711914131378998e-05, + "loss": 0.2229, + "step": 8245 + }, + { + "epoch": 0.3916820965674405, + "grad_norm": 0.46484375, + "learning_rate": 3.710317041235576e-05, + "loss": 0.2262, + "step": 8250 + }, + { + "epoch": 0.3919194796562693, + "grad_norm": 0.5, + "learning_rate": 3.708719359253128e-05, + "loss": 0.2255, + "step": 8255 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.447265625, + "learning_rate": 3.707121086416299e-05, + "loss": 0.2233, + "step": 8260 + }, + { + "epoch": 0.3923942458339268, + "grad_norm": 0.49609375, + "learning_rate": 3.7055222237100965e-05, + "loss": 0.2247, + "step": 8265 + }, + { + "epoch": 0.39263162892275555, + "grad_norm": 0.49609375, + "learning_rate": 3.703922772119892e-05, + "loss": 0.2282, + "step": 8270 + }, + { + "epoch": 0.3928690120115843, + "grad_norm": 0.455078125, + "learning_rate": 3.702322732631422e-05, + "loss": 0.223, + "step": 8275 + }, + { + "epoch": 0.39310639510041306, + "grad_norm": 0.470703125, + "learning_rate": 3.700722106230781e-05, + "loss": 0.2244, + "step": 8280 + }, + { + "epoch": 0.3933437781892418, + "grad_norm": 0.50390625, + "learning_rate": 3.69912089390443e-05, + "loss": 0.2283, + "step": 8285 + }, + { + "epoch": 0.3935811612780705, + "grad_norm": 0.431640625, + "learning_rate": 3.697519096639188e-05, + "loss": 0.2239, + "step": 8290 + }, + { + "epoch": 0.3938185443668993, + "grad_norm": 0.453125, + "learning_rate": 3.695916715422236e-05, + "loss": 0.2264, + "step": 8295 + }, + { + "epoch": 0.39405592745572804, + "grad_norm": 0.4140625, + "learning_rate": 3.694313751241113e-05, + "loss": 0.2273, + "step": 8300 + }, + { + "epoch": 0.3942933105445568, + "grad_norm": 0.48046875, + "learning_rate": 3.692710205083719e-05, + "loss": 0.2247, + "step": 8305 + }, + { + "epoch": 0.39453069363338555, + "grad_norm": 0.451171875, + "learning_rate": 3.6911060779383126e-05, + "loss": 0.2236, + "step": 8310 + }, + { + "epoch": 0.3947680767222143, + "grad_norm": 0.40625, + "learning_rate": 3.6895013707935095e-05, + "loss": 0.2239, + "step": 8315 + }, + { + "epoch": 0.39500545981104307, + "grad_norm": 0.46875, + "learning_rate": 3.687896084638285e-05, + "loss": 0.2261, + "step": 8320 + }, + { + "epoch": 0.3952428428998718, + "grad_norm": 0.5546875, + "learning_rate": 3.686290220461967e-05, + "loss": 0.2239, + "step": 8325 + }, + { + "epoch": 0.3954802259887006, + "grad_norm": 0.482421875, + "learning_rate": 3.6846837792542446e-05, + "loss": 0.2249, + "step": 8330 + }, + { + "epoch": 0.39571760907752934, + "grad_norm": 0.58984375, + "learning_rate": 3.683076762005159e-05, + "loss": 0.2239, + "step": 8335 + }, + { + "epoch": 0.39595499216635804, + "grad_norm": 0.53125, + "learning_rate": 3.681469169705109e-05, + "loss": 0.223, + "step": 8340 + }, + { + "epoch": 0.3961923752551868, + "grad_norm": 0.56640625, + "learning_rate": 3.6798610033448434e-05, + "loss": 0.2253, + "step": 8345 + }, + { + "epoch": 0.39642975834401556, + "grad_norm": 0.50390625, + "learning_rate": 3.678252263915471e-05, + "loss": 0.2263, + "step": 8350 + }, + { + "epoch": 0.3966671414328443, + "grad_norm": 0.546875, + "learning_rate": 3.67664295240845e-05, + "loss": 0.2286, + "step": 8355 + }, + { + "epoch": 0.3969045245216731, + "grad_norm": 0.55859375, + "learning_rate": 3.675033069815591e-05, + "loss": 0.2292, + "step": 8360 + }, + { + "epoch": 0.39714190761050183, + "grad_norm": 0.58984375, + "learning_rate": 3.6734226171290556e-05, + "loss": 0.2274, + "step": 8365 + }, + { + "epoch": 0.3973792906993306, + "grad_norm": 0.49609375, + "learning_rate": 3.671811595341362e-05, + "loss": 0.2252, + "step": 8370 + }, + { + "epoch": 0.39761667378815935, + "grad_norm": 0.5, + "learning_rate": 3.670200005445374e-05, + "loss": 0.2269, + "step": 8375 + }, + { + "epoch": 0.3978540568769881, + "grad_norm": 0.42578125, + "learning_rate": 3.6685878484343075e-05, + "loss": 0.2248, + "step": 8380 + }, + { + "epoch": 0.39809143996581686, + "grad_norm": 0.484375, + "learning_rate": 3.666975125301726e-05, + "loss": 0.2249, + "step": 8385 + }, + { + "epoch": 0.39832882305464556, + "grad_norm": 0.482421875, + "learning_rate": 3.665361837041545e-05, + "loss": 0.2255, + "step": 8390 + }, + { + "epoch": 0.3985662061434743, + "grad_norm": 0.5, + "learning_rate": 3.663747984648026e-05, + "loss": 0.2284, + "step": 8395 + }, + { + "epoch": 0.3988035892323031, + "grad_norm": 0.474609375, + "learning_rate": 3.66213356911578e-05, + "loss": 0.2235, + "step": 8400 + }, + { + "epoch": 0.39904097232113184, + "grad_norm": 0.443359375, + "learning_rate": 3.6605185914397625e-05, + "loss": 0.2289, + "step": 8405 + }, + { + "epoch": 0.3992783554099606, + "grad_norm": 0.4140625, + "learning_rate": 3.658903052615278e-05, + "loss": 0.2219, + "step": 8410 + }, + { + "epoch": 0.39951573849878935, + "grad_norm": 0.416015625, + "learning_rate": 3.6572869536379736e-05, + "loss": 0.2271, + "step": 8415 + }, + { + "epoch": 0.3997531215876181, + "grad_norm": 0.41015625, + "learning_rate": 3.655670295503847e-05, + "loss": 0.2214, + "step": 8420 + }, + { + "epoch": 0.39999050467644687, + "grad_norm": 0.54296875, + "learning_rate": 3.654053079209235e-05, + "loss": 0.2273, + "step": 8425 + }, + { + "epoch": 0.4002278877652756, + "grad_norm": 0.44140625, + "learning_rate": 3.652435305750822e-05, + "loss": 0.2247, + "step": 8430 + }, + { + "epoch": 0.4004652708541043, + "grad_norm": 0.42578125, + "learning_rate": 3.650816976125633e-05, + "loss": 0.2264, + "step": 8435 + }, + { + "epoch": 0.4007026539429331, + "grad_norm": 0.408203125, + "learning_rate": 3.649198091331039e-05, + "loss": 0.227, + "step": 8440 + }, + { + "epoch": 0.40094003703176184, + "grad_norm": 0.4453125, + "learning_rate": 3.647578652364749e-05, + "loss": 0.2279, + "step": 8445 + }, + { + "epoch": 0.4011774201205906, + "grad_norm": 0.4296875, + "learning_rate": 3.645958660224818e-05, + "loss": 0.2251, + "step": 8450 + }, + { + "epoch": 0.40141480320941936, + "grad_norm": 0.43359375, + "learning_rate": 3.644338115909638e-05, + "loss": 0.225, + "step": 8455 + }, + { + "epoch": 0.4016521862982481, + "grad_norm": 0.458984375, + "learning_rate": 3.642717020417945e-05, + "loss": 0.2246, + "step": 8460 + }, + { + "epoch": 0.4018895693870769, + "grad_norm": 0.5234375, + "learning_rate": 3.641095374748811e-05, + "loss": 0.2265, + "step": 8465 + }, + { + "epoch": 0.40212695247590563, + "grad_norm": 0.59765625, + "learning_rate": 3.63947317990165e-05, + "loss": 0.2261, + "step": 8470 + }, + { + "epoch": 0.4023643355647344, + "grad_norm": 0.64453125, + "learning_rate": 3.6378504368762124e-05, + "loss": 0.2242, + "step": 8475 + }, + { + "epoch": 0.40260171865356315, + "grad_norm": 0.443359375, + "learning_rate": 3.636227146672589e-05, + "loss": 0.2224, + "step": 8480 + }, + { + "epoch": 0.40283910174239185, + "grad_norm": 0.50390625, + "learning_rate": 3.634603310291204e-05, + "loss": 0.2248, + "step": 8485 + }, + { + "epoch": 0.4030764848312206, + "grad_norm": 0.423828125, + "learning_rate": 3.632978928732823e-05, + "loss": 0.2293, + "step": 8490 + }, + { + "epoch": 0.40331386792004936, + "grad_norm": 0.474609375, + "learning_rate": 3.6313540029985414e-05, + "loss": 0.2236, + "step": 8495 + }, + { + "epoch": 0.4035512510088781, + "grad_norm": 0.5625, + "learning_rate": 3.6297285340897965e-05, + "loss": 0.2296, + "step": 8500 + }, + { + "epoch": 0.4037886340977069, + "grad_norm": 0.44140625, + "learning_rate": 3.6281025230083566e-05, + "loss": 0.2238, + "step": 8505 + }, + { + "epoch": 0.40402601718653564, + "grad_norm": 0.5234375, + "learning_rate": 3.626475970756324e-05, + "loss": 0.2295, + "step": 8510 + }, + { + "epoch": 0.4042634002753644, + "grad_norm": 0.51171875, + "learning_rate": 3.624848878336137e-05, + "loss": 0.2246, + "step": 8515 + }, + { + "epoch": 0.40450078336419315, + "grad_norm": 0.423828125, + "learning_rate": 3.623221246750565e-05, + "loss": 0.2259, + "step": 8520 + }, + { + "epoch": 0.4047381664530219, + "grad_norm": 0.3984375, + "learning_rate": 3.6215930770027083e-05, + "loss": 0.224, + "step": 8525 + }, + { + "epoch": 0.40497554954185067, + "grad_norm": 0.470703125, + "learning_rate": 3.619964370096002e-05, + "loss": 0.2281, + "step": 8530 + }, + { + "epoch": 0.40521293263067937, + "grad_norm": 0.38671875, + "learning_rate": 3.61833512703421e-05, + "loss": 0.2224, + "step": 8535 + }, + { + "epoch": 0.4054503157195081, + "grad_norm": 0.484375, + "learning_rate": 3.6167053488214286e-05, + "loss": 0.2222, + "step": 8540 + }, + { + "epoch": 0.4056876988083369, + "grad_norm": 0.44921875, + "learning_rate": 3.6150750364620825e-05, + "loss": 0.2248, + "step": 8545 + }, + { + "epoch": 0.40592508189716564, + "grad_norm": 0.5078125, + "learning_rate": 3.613444190960924e-05, + "loss": 0.2241, + "step": 8550 + }, + { + "epoch": 0.4061624649859944, + "grad_norm": 0.376953125, + "learning_rate": 3.6118128133230374e-05, + "loss": 0.2192, + "step": 8555 + }, + { + "epoch": 0.40639984807482316, + "grad_norm": 0.435546875, + "learning_rate": 3.610180904553832e-05, + "loss": 0.2236, + "step": 8560 + }, + { + "epoch": 0.4066372311636519, + "grad_norm": 0.44140625, + "learning_rate": 3.608548465659048e-05, + "loss": 0.2273, + "step": 8565 + }, + { + "epoch": 0.40687461425248067, + "grad_norm": 0.50390625, + "learning_rate": 3.606915497644748e-05, + "loss": 0.2271, + "step": 8570 + }, + { + "epoch": 0.40711199734130943, + "grad_norm": 0.44140625, + "learning_rate": 3.6052820015173236e-05, + "loss": 0.2222, + "step": 8575 + }, + { + "epoch": 0.40734938043013813, + "grad_norm": 0.431640625, + "learning_rate": 3.6036479782834906e-05, + "loss": 0.2236, + "step": 8580 + }, + { + "epoch": 0.4075867635189669, + "grad_norm": 0.44921875, + "learning_rate": 3.602013428950291e-05, + "loss": 0.2237, + "step": 8585 + }, + { + "epoch": 0.40782414660779565, + "grad_norm": 0.46484375, + "learning_rate": 3.6003783545250886e-05, + "loss": 0.2276, + "step": 8590 + }, + { + "epoch": 0.4080615296966244, + "grad_norm": 0.40625, + "learning_rate": 3.5987427560155744e-05, + "loss": 0.2244, + "step": 8595 + }, + { + "epoch": 0.40829891278545316, + "grad_norm": 0.486328125, + "learning_rate": 3.597106634429759e-05, + "loss": 0.2238, + "step": 8600 + }, + { + "epoch": 0.4085362958742819, + "grad_norm": 0.416015625, + "learning_rate": 3.595469990775976e-05, + "loss": 0.2254, + "step": 8605 + }, + { + "epoch": 0.4087736789631107, + "grad_norm": 0.455078125, + "learning_rate": 3.593832826062883e-05, + "loss": 0.2248, + "step": 8610 + }, + { + "epoch": 0.40901106205193943, + "grad_norm": 0.65625, + "learning_rate": 3.592195141299456e-05, + "loss": 0.2241, + "step": 8615 + }, + { + "epoch": 0.4092484451407682, + "grad_norm": 0.390625, + "learning_rate": 3.5905569374949935e-05, + "loss": 0.2249, + "step": 8620 + }, + { + "epoch": 0.40948582822959695, + "grad_norm": 0.427734375, + "learning_rate": 3.588918215659113e-05, + "loss": 0.22, + "step": 8625 + }, + { + "epoch": 0.40972321131842565, + "grad_norm": 0.51171875, + "learning_rate": 3.587278976801751e-05, + "loss": 0.2285, + "step": 8630 + }, + { + "epoch": 0.4099605944072544, + "grad_norm": 0.42578125, + "learning_rate": 3.585639221933162e-05, + "loss": 0.2283, + "step": 8635 + }, + { + "epoch": 0.41019797749608317, + "grad_norm": 0.423828125, + "learning_rate": 3.583998952063921e-05, + "loss": 0.2266, + "step": 8640 + }, + { + "epoch": 0.4104353605849119, + "grad_norm": 0.44921875, + "learning_rate": 3.582358168204919e-05, + "loss": 0.2227, + "step": 8645 + }, + { + "epoch": 0.4106727436737407, + "grad_norm": 0.404296875, + "learning_rate": 3.5807168713673624e-05, + "loss": 0.2275, + "step": 8650 + }, + { + "epoch": 0.41091012676256944, + "grad_norm": 0.5234375, + "learning_rate": 3.579075062562776e-05, + "loss": 0.2245, + "step": 8655 + }, + { + "epoch": 0.4111475098513982, + "grad_norm": 0.421875, + "learning_rate": 3.577432742802997e-05, + "loss": 0.2212, + "step": 8660 + }, + { + "epoch": 0.41138489294022695, + "grad_norm": 0.498046875, + "learning_rate": 3.575789913100183e-05, + "loss": 0.2257, + "step": 8665 + }, + { + "epoch": 0.4116222760290557, + "grad_norm": 0.44921875, + "learning_rate": 3.574146574466799e-05, + "loss": 0.2251, + "step": 8670 + }, + { + "epoch": 0.41185965911788447, + "grad_norm": 0.478515625, + "learning_rate": 3.5725027279156304e-05, + "loss": 0.2289, + "step": 8675 + }, + { + "epoch": 0.41209704220671317, + "grad_norm": 0.490234375, + "learning_rate": 3.5708583744597694e-05, + "loss": 0.2261, + "step": 8680 + }, + { + "epoch": 0.41233442529554193, + "grad_norm": 0.4296875, + "learning_rate": 3.569213515112626e-05, + "loss": 0.2289, + "step": 8685 + }, + { + "epoch": 0.4125718083843707, + "grad_norm": 0.455078125, + "learning_rate": 3.567568150887918e-05, + "loss": 0.2242, + "step": 8690 + }, + { + "epoch": 0.41280919147319944, + "grad_norm": 0.37890625, + "learning_rate": 3.565922282799676e-05, + "loss": 0.2211, + "step": 8695 + }, + { + "epoch": 0.4130465745620282, + "grad_norm": 0.5078125, + "learning_rate": 3.564275911862243e-05, + "loss": 0.2266, + "step": 8700 + }, + { + "epoch": 0.41328395765085696, + "grad_norm": 0.51953125, + "learning_rate": 3.562629039090266e-05, + "loss": 0.2276, + "step": 8705 + }, + { + "epoch": 0.4135213407396857, + "grad_norm": 0.435546875, + "learning_rate": 3.5609816654987095e-05, + "loss": 0.2245, + "step": 8710 + }, + { + "epoch": 0.4137587238285145, + "grad_norm": 0.515625, + "learning_rate": 3.559333792102839e-05, + "loss": 0.223, + "step": 8715 + }, + { + "epoch": 0.41399610691734323, + "grad_norm": 0.5390625, + "learning_rate": 3.557685419918233e-05, + "loss": 0.2285, + "step": 8720 + }, + { + "epoch": 0.41423349000617193, + "grad_norm": 0.53125, + "learning_rate": 3.556036549960776e-05, + "loss": 0.2253, + "step": 8725 + }, + { + "epoch": 0.4144708730950007, + "grad_norm": 0.58984375, + "learning_rate": 3.554387183246658e-05, + "loss": 0.2197, + "step": 8730 + }, + { + "epoch": 0.41470825618382945, + "grad_norm": 0.412109375, + "learning_rate": 3.552737320792377e-05, + "loss": 0.2245, + "step": 8735 + }, + { + "epoch": 0.4149456392726582, + "grad_norm": 0.435546875, + "learning_rate": 3.551086963614736e-05, + "loss": 0.2263, + "step": 8740 + }, + { + "epoch": 0.41518302236148696, + "grad_norm": 0.435546875, + "learning_rate": 3.549436112730841e-05, + "loss": 0.2249, + "step": 8745 + }, + { + "epoch": 0.4154204054503157, + "grad_norm": 0.4453125, + "learning_rate": 3.547784769158106e-05, + "loss": 0.2239, + "step": 8750 + }, + { + "epoch": 0.4156577885391445, + "grad_norm": 0.46484375, + "learning_rate": 3.546132933914245e-05, + "loss": 0.2282, + "step": 8755 + }, + { + "epoch": 0.41589517162797324, + "grad_norm": 0.41796875, + "learning_rate": 3.544480608017278e-05, + "loss": 0.2229, + "step": 8760 + }, + { + "epoch": 0.416132554716802, + "grad_norm": 0.51171875, + "learning_rate": 3.5428277924855236e-05, + "loss": 0.2226, + "step": 8765 + }, + { + "epoch": 0.41636993780563075, + "grad_norm": 0.55859375, + "learning_rate": 3.541174488337607e-05, + "loss": 0.2276, + "step": 8770 + }, + { + "epoch": 0.41660732089445945, + "grad_norm": 0.466796875, + "learning_rate": 3.5395206965924504e-05, + "loss": 0.2231, + "step": 8775 + }, + { + "epoch": 0.4168447039832882, + "grad_norm": 0.51171875, + "learning_rate": 3.537866418269279e-05, + "loss": 0.2234, + "step": 8780 + }, + { + "epoch": 0.41708208707211697, + "grad_norm": 0.44140625, + "learning_rate": 3.5362116543876164e-05, + "loss": 0.2244, + "step": 8785 + }, + { + "epoch": 0.4173194701609457, + "grad_norm": 0.484375, + "learning_rate": 3.534556405967287e-05, + "loss": 0.2263, + "step": 8790 + }, + { + "epoch": 0.4175568532497745, + "grad_norm": 0.470703125, + "learning_rate": 3.532900674028411e-05, + "loss": 0.2289, + "step": 8795 + }, + { + "epoch": 0.41779423633860324, + "grad_norm": 0.435546875, + "learning_rate": 3.531244459591411e-05, + "loss": 0.2251, + "step": 8800 + }, + { + "epoch": 0.418031619427432, + "grad_norm": 0.40625, + "learning_rate": 3.529587763677003e-05, + "loss": 0.2273, + "step": 8805 + }, + { + "epoch": 0.41826900251626076, + "grad_norm": 0.439453125, + "learning_rate": 3.5279305873062005e-05, + "loss": 0.2265, + "step": 8810 + }, + { + "epoch": 0.4185063856050895, + "grad_norm": 0.55078125, + "learning_rate": 3.5262729315003154e-05, + "loss": 0.2244, + "step": 8815 + }, + { + "epoch": 0.4187437686939183, + "grad_norm": 0.4921875, + "learning_rate": 3.5246147972809517e-05, + "loss": 0.2261, + "step": 8820 + }, + { + "epoch": 0.418981151782747, + "grad_norm": 0.43359375, + "learning_rate": 3.522956185670011e-05, + "loss": 0.2269, + "step": 8825 + }, + { + "epoch": 0.41921853487157573, + "grad_norm": 0.640625, + "learning_rate": 3.521297097689689e-05, + "loss": 0.2239, + "step": 8830 + }, + { + "epoch": 0.4194559179604045, + "grad_norm": 0.458984375, + "learning_rate": 3.519637534362472e-05, + "loss": 0.2222, + "step": 8835 + }, + { + "epoch": 0.41969330104923325, + "grad_norm": 0.427734375, + "learning_rate": 3.517977496711142e-05, + "loss": 0.2247, + "step": 8840 + }, + { + "epoch": 0.419930684138062, + "grad_norm": 0.48046875, + "learning_rate": 3.516316985758774e-05, + "loss": 0.2253, + "step": 8845 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 0.451171875, + "learning_rate": 3.5146560025287325e-05, + "loss": 0.2224, + "step": 8850 + }, + { + "epoch": 0.4204054503157195, + "grad_norm": 0.412109375, + "learning_rate": 3.512994548044673e-05, + "loss": 0.2284, + "step": 8855 + }, + { + "epoch": 0.4206428334045483, + "grad_norm": 0.421875, + "learning_rate": 3.511332623330544e-05, + "loss": 0.2257, + "step": 8860 + }, + { + "epoch": 0.42088021649337704, + "grad_norm": 0.46484375, + "learning_rate": 3.50967022941058e-05, + "loss": 0.2249, + "step": 8865 + }, + { + "epoch": 0.42111759958220574, + "grad_norm": 0.404296875, + "learning_rate": 3.508007367309309e-05, + "loss": 0.2243, + "step": 8870 + }, + { + "epoch": 0.4213549826710345, + "grad_norm": 0.54296875, + "learning_rate": 3.506344038051544e-05, + "loss": 0.2215, + "step": 8875 + }, + { + "epoch": 0.42159236575986325, + "grad_norm": 0.447265625, + "learning_rate": 3.5046802426623875e-05, + "loss": 0.2199, + "step": 8880 + }, + { + "epoch": 0.421829748848692, + "grad_norm": 0.458984375, + "learning_rate": 3.5030159821672296e-05, + "loss": 0.2266, + "step": 8885 + }, + { + "epoch": 0.42206713193752077, + "grad_norm": 0.42578125, + "learning_rate": 3.501351257591746e-05, + "loss": 0.2253, + "step": 8890 + }, + { + "epoch": 0.4223045150263495, + "grad_norm": 0.484375, + "learning_rate": 3.499686069961899e-05, + "loss": 0.2221, + "step": 8895 + }, + { + "epoch": 0.4225418981151783, + "grad_norm": 0.4140625, + "learning_rate": 3.4980204203039366e-05, + "loss": 0.2239, + "step": 8900 + }, + { + "epoch": 0.42277928120400704, + "grad_norm": 0.40625, + "learning_rate": 3.49635430964439e-05, + "loss": 0.2258, + "step": 8905 + }, + { + "epoch": 0.4230166642928358, + "grad_norm": 0.43359375, + "learning_rate": 3.4946877390100766e-05, + "loss": 0.2254, + "step": 8910 + }, + { + "epoch": 0.42325404738166456, + "grad_norm": 0.4609375, + "learning_rate": 3.493020709428096e-05, + "loss": 0.2272, + "step": 8915 + }, + { + "epoch": 0.42349143047049326, + "grad_norm": 0.52734375, + "learning_rate": 3.491353221925831e-05, + "loss": 0.2264, + "step": 8920 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 0.412109375, + "learning_rate": 3.489685277530946e-05, + "loss": 0.2226, + "step": 8925 + }, + { + "epoch": 0.4239661966481508, + "grad_norm": 0.484375, + "learning_rate": 3.488016877271388e-05, + "loss": 0.2251, + "step": 8930 + }, + { + "epoch": 0.42420357973697953, + "grad_norm": 0.5546875, + "learning_rate": 3.486348022175386e-05, + "loss": 0.2242, + "step": 8935 + }, + { + "epoch": 0.4244409628258083, + "grad_norm": 0.482421875, + "learning_rate": 3.484678713271445e-05, + "loss": 0.2268, + "step": 8940 + }, + { + "epoch": 0.42467834591463705, + "grad_norm": 0.51953125, + "learning_rate": 3.483008951588354e-05, + "loss": 0.226, + "step": 8945 + }, + { + "epoch": 0.4249157290034658, + "grad_norm": 0.5, + "learning_rate": 3.481338738155179e-05, + "loss": 0.2252, + "step": 8950 + }, + { + "epoch": 0.42515311209229456, + "grad_norm": 0.46875, + "learning_rate": 3.479668074001266e-05, + "loss": 0.2243, + "step": 8955 + }, + { + "epoch": 0.4253904951811233, + "grad_norm": 0.400390625, + "learning_rate": 3.477996960156237e-05, + "loss": 0.2266, + "step": 8960 + }, + { + "epoch": 0.4256278782699521, + "grad_norm": 0.4296875, + "learning_rate": 3.476325397649992e-05, + "loss": 0.2238, + "step": 8965 + }, + { + "epoch": 0.4258652613587808, + "grad_norm": 0.515625, + "learning_rate": 3.4746533875127065e-05, + "loss": 0.2257, + "step": 8970 + }, + { + "epoch": 0.42610264444760954, + "grad_norm": 0.5078125, + "learning_rate": 3.472980930774834e-05, + "loss": 0.2309, + "step": 8975 + }, + { + "epoch": 0.4263400275364383, + "grad_norm": 0.474609375, + "learning_rate": 3.4713080284671006e-05, + "loss": 0.2237, + "step": 8980 + }, + { + "epoch": 0.42657741062526705, + "grad_norm": 0.408203125, + "learning_rate": 3.469634681620511e-05, + "loss": 0.2206, + "step": 8985 + }, + { + "epoch": 0.4268147937140958, + "grad_norm": 0.56640625, + "learning_rate": 3.4679608912663385e-05, + "loss": 0.2207, + "step": 8990 + }, + { + "epoch": 0.42705217680292457, + "grad_norm": 0.439453125, + "learning_rate": 3.466286658436134e-05, + "loss": 0.2281, + "step": 8995 + }, + { + "epoch": 0.4272895598917533, + "grad_norm": 0.41015625, + "learning_rate": 3.464611984161719e-05, + "loss": 0.2286, + "step": 9000 + }, + { + "epoch": 0.4275269429805821, + "grad_norm": 0.412109375, + "learning_rate": 3.4629368694751877e-05, + "loss": 0.2254, + "step": 9005 + }, + { + "epoch": 0.42776432606941084, + "grad_norm": 0.55078125, + "learning_rate": 3.461261315408904e-05, + "loss": 0.2208, + "step": 9010 + }, + { + "epoch": 0.42800170915823954, + "grad_norm": 0.435546875, + "learning_rate": 3.4595853229955076e-05, + "loss": 0.2271, + "step": 9015 + }, + { + "epoch": 0.4282390922470683, + "grad_norm": 0.490234375, + "learning_rate": 3.457908893267904e-05, + "loss": 0.2245, + "step": 9020 + }, + { + "epoch": 0.42847647533589706, + "grad_norm": 0.3984375, + "learning_rate": 3.456232027259267e-05, + "loss": 0.2219, + "step": 9025 + }, + { + "epoch": 0.4287138584247258, + "grad_norm": 0.4609375, + "learning_rate": 3.454554726003044e-05, + "loss": 0.229, + "step": 9030 + }, + { + "epoch": 0.42895124151355457, + "grad_norm": 0.4375, + "learning_rate": 3.452876990532947e-05, + "loss": 0.227, + "step": 9035 + }, + { + "epoch": 0.42918862460238333, + "grad_norm": 0.462890625, + "learning_rate": 3.451198821882957e-05, + "loss": 0.2212, + "step": 9040 + }, + { + "epoch": 0.4294260076912121, + "grad_norm": 0.44921875, + "learning_rate": 3.449520221087321e-05, + "loss": 0.2213, + "step": 9045 + }, + { + "epoch": 0.42966339078004084, + "grad_norm": 0.46875, + "learning_rate": 3.4478411891805545e-05, + "loss": 0.2256, + "step": 9050 + }, + { + "epoch": 0.4299007738688696, + "grad_norm": 0.484375, + "learning_rate": 3.446161727197437e-05, + "loss": 0.2277, + "step": 9055 + }, + { + "epoch": 0.43013815695769836, + "grad_norm": 0.474609375, + "learning_rate": 3.444481836173011e-05, + "loss": 0.2212, + "step": 9060 + }, + { + "epoch": 0.43037554004652706, + "grad_norm": 0.43359375, + "learning_rate": 3.442801517142589e-05, + "loss": 0.2225, + "step": 9065 + }, + { + "epoch": 0.4306129231353558, + "grad_norm": 0.5078125, + "learning_rate": 3.441120771141741e-05, + "loss": 0.2227, + "step": 9070 + }, + { + "epoch": 0.4308503062241846, + "grad_norm": 0.439453125, + "learning_rate": 3.439439599206305e-05, + "loss": 0.2252, + "step": 9075 + }, + { + "epoch": 0.43108768931301333, + "grad_norm": 0.4609375, + "learning_rate": 3.437758002372379e-05, + "loss": 0.2235, + "step": 9080 + }, + { + "epoch": 0.4313250724018421, + "grad_norm": 0.45703125, + "learning_rate": 3.436075981676324e-05, + "loss": 0.2218, + "step": 9085 + }, + { + "epoch": 0.43156245549067085, + "grad_norm": 0.578125, + "learning_rate": 3.4343935381547615e-05, + "loss": 0.2245, + "step": 9090 + }, + { + "epoch": 0.4317998385794996, + "grad_norm": 0.419921875, + "learning_rate": 3.4327106728445734e-05, + "loss": 0.2254, + "step": 9095 + }, + { + "epoch": 0.43203722166832836, + "grad_norm": 0.412109375, + "learning_rate": 3.431027386782902e-05, + "loss": 0.2268, + "step": 9100 + }, + { + "epoch": 0.4322746047571571, + "grad_norm": 0.61328125, + "learning_rate": 3.429343681007148e-05, + "loss": 0.224, + "step": 9105 + }, + { + "epoch": 0.4325119878459859, + "grad_norm": 0.51171875, + "learning_rate": 3.4276595565549746e-05, + "loss": 0.2277, + "step": 9110 + }, + { + "epoch": 0.4327493709348146, + "grad_norm": 0.41796875, + "learning_rate": 3.425975014464297e-05, + "loss": 0.2222, + "step": 9115 + }, + { + "epoch": 0.43298675402364334, + "grad_norm": 0.447265625, + "learning_rate": 3.424290055773291e-05, + "loss": 0.2243, + "step": 9120 + }, + { + "epoch": 0.4332241371124721, + "grad_norm": 0.435546875, + "learning_rate": 3.422604681520391e-05, + "loss": 0.2295, + "step": 9125 + }, + { + "epoch": 0.43346152020130085, + "grad_norm": 0.404296875, + "learning_rate": 3.420918892744284e-05, + "loss": 0.2189, + "step": 9130 + }, + { + "epoch": 0.4336989032901296, + "grad_norm": 0.486328125, + "learning_rate": 3.4192326904839143e-05, + "loss": 0.2251, + "step": 9135 + }, + { + "epoch": 0.43393628637895837, + "grad_norm": 0.44921875, + "learning_rate": 3.417546075778481e-05, + "loss": 0.2204, + "step": 9140 + }, + { + "epoch": 0.4341736694677871, + "grad_norm": 0.462890625, + "learning_rate": 3.4158590496674365e-05, + "loss": 0.2249, + "step": 9145 + }, + { + "epoch": 0.4344110525566159, + "grad_norm": 0.447265625, + "learning_rate": 3.414171613190487e-05, + "loss": 0.2265, + "step": 9150 + }, + { + "epoch": 0.43464843564544464, + "grad_norm": 0.4765625, + "learning_rate": 3.412483767387592e-05, + "loss": 0.2268, + "step": 9155 + }, + { + "epoch": 0.43488581873427334, + "grad_norm": 0.48828125, + "learning_rate": 3.410795513298966e-05, + "loss": 0.2264, + "step": 9160 + }, + { + "epoch": 0.4351232018231021, + "grad_norm": 0.5, + "learning_rate": 3.4091068519650677e-05, + "loss": 0.2249, + "step": 9165 + }, + { + "epoch": 0.43536058491193086, + "grad_norm": 0.474609375, + "learning_rate": 3.407417784426615e-05, + "loss": 0.224, + "step": 9170 + }, + { + "epoch": 0.4355979680007596, + "grad_norm": 0.416015625, + "learning_rate": 3.40572831172457e-05, + "loss": 0.2241, + "step": 9175 + }, + { + "epoch": 0.4358353510895884, + "grad_norm": 0.51953125, + "learning_rate": 3.404038434900151e-05, + "loss": 0.2263, + "step": 9180 + }, + { + "epoch": 0.43607273417841713, + "grad_norm": 0.49609375, + "learning_rate": 3.402348154994816e-05, + "loss": 0.2272, + "step": 9185 + }, + { + "epoch": 0.4363101172672459, + "grad_norm": 0.45703125, + "learning_rate": 3.4006574730502806e-05, + "loss": 0.2274, + "step": 9190 + }, + { + "epoch": 0.43654750035607465, + "grad_norm": 0.455078125, + "learning_rate": 3.398966390108503e-05, + "loss": 0.2258, + "step": 9195 + }, + { + "epoch": 0.4367848834449034, + "grad_norm": 0.451171875, + "learning_rate": 3.39727490721169e-05, + "loss": 0.2196, + "step": 9200 + }, + { + "epoch": 0.43702226653373216, + "grad_norm": 0.435546875, + "learning_rate": 3.395583025402296e-05, + "loss": 0.2257, + "step": 9205 + }, + { + "epoch": 0.43725964962256086, + "grad_norm": 0.57421875, + "learning_rate": 3.393890745723018e-05, + "loss": 0.225, + "step": 9210 + }, + { + "epoch": 0.4374970327113896, + "grad_norm": 0.4453125, + "learning_rate": 3.392198069216801e-05, + "loss": 0.2285, + "step": 9215 + }, + { + "epoch": 0.4377344158002184, + "grad_norm": 0.455078125, + "learning_rate": 3.390504996926836e-05, + "loss": 0.2225, + "step": 9220 + }, + { + "epoch": 0.43797179888904714, + "grad_norm": 0.482421875, + "learning_rate": 3.388811529896552e-05, + "loss": 0.2276, + "step": 9225 + }, + { + "epoch": 0.4382091819778759, + "grad_norm": 0.494140625, + "learning_rate": 3.387117669169629e-05, + "loss": 0.2178, + "step": 9230 + }, + { + "epoch": 0.43844656506670465, + "grad_norm": 0.453125, + "learning_rate": 3.385423415789983e-05, + "loss": 0.2289, + "step": 9235 + }, + { + "epoch": 0.4386839481555334, + "grad_norm": 0.41796875, + "learning_rate": 3.3837287708017744e-05, + "loss": 0.2253, + "step": 9240 + }, + { + "epoch": 0.43892133124436217, + "grad_norm": 0.4296875, + "learning_rate": 3.382033735249407e-05, + "loss": 0.2258, + "step": 9245 + }, + { + "epoch": 0.4391587143331909, + "grad_norm": 0.453125, + "learning_rate": 3.380338310177522e-05, + "loss": 0.222, + "step": 9250 + }, + { + "epoch": 0.4393960974220197, + "grad_norm": 0.578125, + "learning_rate": 3.378642496631004e-05, + "loss": 0.2246, + "step": 9255 + }, + { + "epoch": 0.4396334805108484, + "grad_norm": 0.462890625, + "learning_rate": 3.376946295654973e-05, + "loss": 0.2256, + "step": 9260 + }, + { + "epoch": 0.43987086359967714, + "grad_norm": 0.50390625, + "learning_rate": 3.37524970829479e-05, + "loss": 0.2258, + "step": 9265 + }, + { + "epoch": 0.4401082466885059, + "grad_norm": 0.39453125, + "learning_rate": 3.373552735596057e-05, + "loss": 0.223, + "step": 9270 + }, + { + "epoch": 0.44034562977733466, + "grad_norm": 0.4609375, + "learning_rate": 3.371855378604607e-05, + "loss": 0.2286, + "step": 9275 + }, + { + "epoch": 0.4405830128661634, + "grad_norm": 0.53125, + "learning_rate": 3.3701576383665156e-05, + "loss": 0.2227, + "step": 9280 + }, + { + "epoch": 0.4408203959549922, + "grad_norm": 0.51171875, + "learning_rate": 3.368459515928092e-05, + "loss": 0.222, + "step": 9285 + }, + { + "epoch": 0.44105777904382093, + "grad_norm": 0.45703125, + "learning_rate": 3.3667610123358805e-05, + "loss": 0.2254, + "step": 9290 + }, + { + "epoch": 0.4412951621326497, + "grad_norm": 0.455078125, + "learning_rate": 3.3650621286366623e-05, + "loss": 0.2234, + "step": 9295 + }, + { + "epoch": 0.44153254522147845, + "grad_norm": 0.423828125, + "learning_rate": 3.36336286587745e-05, + "loss": 0.2275, + "step": 9300 + }, + { + "epoch": 0.44176992831030715, + "grad_norm": 0.443359375, + "learning_rate": 3.361663225105494e-05, + "loss": 0.2256, + "step": 9305 + }, + { + "epoch": 0.4420073113991359, + "grad_norm": 0.40625, + "learning_rate": 3.359963207368273e-05, + "loss": 0.2268, + "step": 9310 + }, + { + "epoch": 0.44224469448796466, + "grad_norm": 0.5703125, + "learning_rate": 3.3582628137135016e-05, + "loss": 0.2251, + "step": 9315 + }, + { + "epoch": 0.4424820775767934, + "grad_norm": 0.484375, + "learning_rate": 3.3565620451891234e-05, + "loss": 0.2246, + "step": 9320 + }, + { + "epoch": 0.4427194606656222, + "grad_norm": 0.53125, + "learning_rate": 3.354860902843315e-05, + "loss": 0.2281, + "step": 9325 + }, + { + "epoch": 0.44295684375445094, + "grad_norm": 0.443359375, + "learning_rate": 3.353159387724483e-05, + "loss": 0.2259, + "step": 9330 + }, + { + "epoch": 0.4431942268432797, + "grad_norm": 0.404296875, + "learning_rate": 3.351457500881263e-05, + "loss": 0.2235, + "step": 9335 + }, + { + "epoch": 0.44343160993210845, + "grad_norm": 0.4921875, + "learning_rate": 3.34975524336252e-05, + "loss": 0.2232, + "step": 9340 + }, + { + "epoch": 0.4436689930209372, + "grad_norm": 0.435546875, + "learning_rate": 3.348052616217348e-05, + "loss": 0.2245, + "step": 9345 + }, + { + "epoch": 0.44390637610976597, + "grad_norm": 0.484375, + "learning_rate": 3.346349620495068e-05, + "loss": 0.2259, + "step": 9350 + }, + { + "epoch": 0.44414375919859467, + "grad_norm": 0.4140625, + "learning_rate": 3.3446462572452284e-05, + "loss": 0.2275, + "step": 9355 + }, + { + "epoch": 0.4443811422874234, + "grad_norm": 0.5, + "learning_rate": 3.342942527517605e-05, + "loss": 0.2199, + "step": 9360 + }, + { + "epoch": 0.4446185253762522, + "grad_norm": 0.4375, + "learning_rate": 3.341238432362199e-05, + "loss": 0.2246, + "step": 9365 + }, + { + "epoch": 0.44485590846508094, + "grad_norm": 0.4765625, + "learning_rate": 3.339533972829234e-05, + "loss": 0.2259, + "step": 9370 + }, + { + "epoch": 0.4450932915539097, + "grad_norm": 0.43359375, + "learning_rate": 3.337829149969164e-05, + "loss": 0.2257, + "step": 9375 + }, + { + "epoch": 0.44533067464273846, + "grad_norm": 0.42578125, + "learning_rate": 3.3361239648326616e-05, + "loss": 0.2258, + "step": 9380 + }, + { + "epoch": 0.4455680577315672, + "grad_norm": 0.416015625, + "learning_rate": 3.334418418470624e-05, + "loss": 0.2239, + "step": 9385 + }, + { + "epoch": 0.44580544082039597, + "grad_norm": 0.44140625, + "learning_rate": 3.332712511934173e-05, + "loss": 0.2244, + "step": 9390 + }, + { + "epoch": 0.44604282390922473, + "grad_norm": 0.43359375, + "learning_rate": 3.3310062462746514e-05, + "loss": 0.2197, + "step": 9395 + }, + { + "epoch": 0.4462802069980535, + "grad_norm": 0.48046875, + "learning_rate": 3.329299622543621e-05, + "loss": 0.2258, + "step": 9400 + }, + { + "epoch": 0.4465175900868822, + "grad_norm": 0.427734375, + "learning_rate": 3.327592641792868e-05, + "loss": 0.2233, + "step": 9405 + }, + { + "epoch": 0.44675497317571095, + "grad_norm": 0.439453125, + "learning_rate": 3.325885305074396e-05, + "loss": 0.226, + "step": 9410 + }, + { + "epoch": 0.4469923562645397, + "grad_norm": 0.453125, + "learning_rate": 3.324177613440429e-05, + "loss": 0.2256, + "step": 9415 + }, + { + "epoch": 0.44722973935336846, + "grad_norm": 0.435546875, + "learning_rate": 3.322469567943411e-05, + "loss": 0.2246, + "step": 9420 + }, + { + "epoch": 0.4474671224421972, + "grad_norm": 0.453125, + "learning_rate": 3.3207611696360017e-05, + "loss": 0.2247, + "step": 9425 + }, + { + "epoch": 0.447704505531026, + "grad_norm": 0.515625, + "learning_rate": 3.319052419571078e-05, + "loss": 0.2236, + "step": 9430 + }, + { + "epoch": 0.44794188861985473, + "grad_norm": 0.46484375, + "learning_rate": 3.3173433188017354e-05, + "loss": 0.2317, + "step": 9435 + }, + { + "epoch": 0.4481792717086835, + "grad_norm": 0.44140625, + "learning_rate": 3.3156338683812867e-05, + "loss": 0.2249, + "step": 9440 + }, + { + "epoch": 0.44841665479751225, + "grad_norm": 0.462890625, + "learning_rate": 3.3139240693632555e-05, + "loss": 0.2232, + "step": 9445 + }, + { + "epoch": 0.44865403788634095, + "grad_norm": 0.52734375, + "learning_rate": 3.312213922801386e-05, + "loss": 0.2281, + "step": 9450 + }, + { + "epoch": 0.4488914209751697, + "grad_norm": 0.515625, + "learning_rate": 3.310503429749631e-05, + "loss": 0.222, + "step": 9455 + }, + { + "epoch": 0.44912880406399847, + "grad_norm": 0.55859375, + "learning_rate": 3.308792591262162e-05, + "loss": 0.2217, + "step": 9460 + }, + { + "epoch": 0.4493661871528272, + "grad_norm": 0.5625, + "learning_rate": 3.307081408393361e-05, + "loss": 0.2242, + "step": 9465 + }, + { + "epoch": 0.449603570241656, + "grad_norm": 0.53515625, + "learning_rate": 3.305369882197819e-05, + "loss": 0.2258, + "step": 9470 + }, + { + "epoch": 0.44984095333048474, + "grad_norm": 0.40234375, + "learning_rate": 3.3036580137303465e-05, + "loss": 0.2207, + "step": 9475 + }, + { + "epoch": 0.4500783364193135, + "grad_norm": 0.412109375, + "learning_rate": 3.301945804045956e-05, + "loss": 0.2228, + "step": 9480 + }, + { + "epoch": 0.45031571950814225, + "grad_norm": 0.4609375, + "learning_rate": 3.300233254199877e-05, + "loss": 0.2254, + "step": 9485 + }, + { + "epoch": 0.450553102596971, + "grad_norm": 0.44140625, + "learning_rate": 3.2985203652475455e-05, + "loss": 0.2231, + "step": 9490 + }, + { + "epoch": 0.45079048568579977, + "grad_norm": 0.515625, + "learning_rate": 3.296807138244606e-05, + "loss": 0.2278, + "step": 9495 + }, + { + "epoch": 0.45102786877462847, + "grad_norm": 0.435546875, + "learning_rate": 3.2950935742469155e-05, + "loss": 0.2243, + "step": 9500 + }, + { + "epoch": 0.45126525186345723, + "grad_norm": 0.5390625, + "learning_rate": 3.2933796743105334e-05, + "loss": 0.2274, + "step": 9505 + }, + { + "epoch": 0.451502634952286, + "grad_norm": 0.41796875, + "learning_rate": 3.291665439491729e-05, + "loss": 0.2229, + "step": 9510 + }, + { + "epoch": 0.45174001804111474, + "grad_norm": 0.462890625, + "learning_rate": 3.289950870846977e-05, + "loss": 0.2227, + "step": 9515 + }, + { + "epoch": 0.4519774011299435, + "grad_norm": 0.447265625, + "learning_rate": 3.2882359694329587e-05, + "loss": 0.2233, + "step": 9520 + }, + { + "epoch": 0.45221478421877226, + "grad_norm": 0.46484375, + "learning_rate": 3.28652073630656e-05, + "loss": 0.2227, + "step": 9525 + }, + { + "epoch": 0.452452167307601, + "grad_norm": 0.4765625, + "learning_rate": 3.284805172524871e-05, + "loss": 0.2246, + "step": 9530 + }, + { + "epoch": 0.4526895503964298, + "grad_norm": 0.494140625, + "learning_rate": 3.283089279145187e-05, + "loss": 0.2267, + "step": 9535 + }, + { + "epoch": 0.45292693348525853, + "grad_norm": 0.5, + "learning_rate": 3.2813730572250034e-05, + "loss": 0.222, + "step": 9540 + }, + { + "epoch": 0.4531643165740873, + "grad_norm": 0.47265625, + "learning_rate": 3.279656507822021e-05, + "loss": 0.2268, + "step": 9545 + }, + { + "epoch": 0.453401699662916, + "grad_norm": 0.404296875, + "learning_rate": 3.277939631994142e-05, + "loss": 0.2237, + "step": 9550 + }, + { + "epoch": 0.45363908275174475, + "grad_norm": 0.41015625, + "learning_rate": 3.276222430799468e-05, + "loss": 0.2227, + "step": 9555 + }, + { + "epoch": 0.4538764658405735, + "grad_norm": 0.3984375, + "learning_rate": 3.274504905296303e-05, + "loss": 0.2261, + "step": 9560 + }, + { + "epoch": 0.45411384892940226, + "grad_norm": 0.51953125, + "learning_rate": 3.27278705654315e-05, + "loss": 0.226, + "step": 9565 + }, + { + "epoch": 0.454351232018231, + "grad_norm": 0.3671875, + "learning_rate": 3.2710688855987114e-05, + "loss": 0.2223, + "step": 9570 + }, + { + "epoch": 0.4545886151070598, + "grad_norm": 0.45703125, + "learning_rate": 3.269350393521888e-05, + "loss": 0.2241, + "step": 9575 + }, + { + "epoch": 0.45482599819588854, + "grad_norm": 0.4453125, + "learning_rate": 3.2676315813717785e-05, + "loss": 0.2228, + "step": 9580 + }, + { + "epoch": 0.4550633812847173, + "grad_norm": 0.546875, + "learning_rate": 3.2659124502076793e-05, + "loss": 0.2216, + "step": 9585 + }, + { + "epoch": 0.45530076437354605, + "grad_norm": 0.515625, + "learning_rate": 3.264193001089084e-05, + "loss": 0.2262, + "step": 9590 + }, + { + "epoch": 0.45553814746237475, + "grad_norm": 0.4765625, + "learning_rate": 3.2624732350756784e-05, + "loss": 0.2227, + "step": 9595 + }, + { + "epoch": 0.4557755305512035, + "grad_norm": 0.453125, + "learning_rate": 3.26075315322735e-05, + "loss": 0.2227, + "step": 9600 + }, + { + "epoch": 0.45601291364003227, + "grad_norm": 0.423828125, + "learning_rate": 3.2590327566041753e-05, + "loss": 0.2271, + "step": 9605 + }, + { + "epoch": 0.456250296728861, + "grad_norm": 0.486328125, + "learning_rate": 3.257312046266427e-05, + "loss": 0.2253, + "step": 9610 + }, + { + "epoch": 0.4564876798176898, + "grad_norm": 0.447265625, + "learning_rate": 3.255591023274572e-05, + "loss": 0.2248, + "step": 9615 + }, + { + "epoch": 0.45672506290651854, + "grad_norm": 0.5078125, + "learning_rate": 3.253869688689268e-05, + "loss": 0.2217, + "step": 9620 + }, + { + "epoch": 0.4569624459953473, + "grad_norm": 0.416015625, + "learning_rate": 3.252148043571367e-05, + "loss": 0.2259, + "step": 9625 + }, + { + "epoch": 0.45719982908417606, + "grad_norm": 0.49609375, + "learning_rate": 3.250426088981909e-05, + "loss": 0.2247, + "step": 9630 + }, + { + "epoch": 0.4574372121730048, + "grad_norm": 0.46875, + "learning_rate": 3.248703825982129e-05, + "loss": 0.2281, + "step": 9635 + }, + { + "epoch": 0.45767459526183357, + "grad_norm": 0.39453125, + "learning_rate": 3.2469812556334484e-05, + "loss": 0.2289, + "step": 9640 + }, + { + "epoch": 0.4579119783506623, + "grad_norm": 0.490234375, + "learning_rate": 3.24525837899748e-05, + "loss": 0.224, + "step": 9645 + }, + { + "epoch": 0.45814936143949103, + "grad_norm": 0.490234375, + "learning_rate": 3.2435351971360244e-05, + "loss": 0.2208, + "step": 9650 + }, + { + "epoch": 0.4583867445283198, + "grad_norm": 0.48046875, + "learning_rate": 3.241811711111071e-05, + "loss": 0.2215, + "step": 9655 + }, + { + "epoch": 0.45862412761714855, + "grad_norm": 0.5625, + "learning_rate": 3.240087921984797e-05, + "loss": 0.2257, + "step": 9660 + }, + { + "epoch": 0.4588615107059773, + "grad_norm": 0.48828125, + "learning_rate": 3.238363830819565e-05, + "loss": 0.227, + "step": 9665 + }, + { + "epoch": 0.45909889379480606, + "grad_norm": 0.4296875, + "learning_rate": 3.236639438677925e-05, + "loss": 0.2227, + "step": 9670 + }, + { + "epoch": 0.4593362768836348, + "grad_norm": 0.5234375, + "learning_rate": 3.2349147466226125e-05, + "loss": 0.2252, + "step": 9675 + }, + { + "epoch": 0.4595736599724636, + "grad_norm": 0.5390625, + "learning_rate": 3.233189755716545e-05, + "loss": 0.2285, + "step": 9680 + }, + { + "epoch": 0.45981104306129233, + "grad_norm": 0.5, + "learning_rate": 3.23146446702283e-05, + "loss": 0.2241, + "step": 9685 + }, + { + "epoch": 0.4600484261501211, + "grad_norm": 0.44140625, + "learning_rate": 3.229738881604752e-05, + "loss": 0.2248, + "step": 9690 + }, + { + "epoch": 0.4602858092389498, + "grad_norm": 0.439453125, + "learning_rate": 3.228013000525784e-05, + "loss": 0.2259, + "step": 9695 + }, + { + "epoch": 0.46052319232777855, + "grad_norm": 0.48828125, + "learning_rate": 3.226286824849576e-05, + "loss": 0.2232, + "step": 9700 + }, + { + "epoch": 0.4607605754166073, + "grad_norm": 0.46484375, + "learning_rate": 3.224560355639964e-05, + "loss": 0.2245, + "step": 9705 + }, + { + "epoch": 0.46099795850543607, + "grad_norm": 0.4609375, + "learning_rate": 3.222833593960963e-05, + "loss": 0.2266, + "step": 9710 + }, + { + "epoch": 0.4612353415942648, + "grad_norm": 0.53125, + "learning_rate": 3.221106540876769e-05, + "loss": 0.2262, + "step": 9715 + }, + { + "epoch": 0.4614727246830936, + "grad_norm": 0.470703125, + "learning_rate": 3.219379197451756e-05, + "loss": 0.2264, + "step": 9720 + }, + { + "epoch": 0.46171010777192234, + "grad_norm": 0.423828125, + "learning_rate": 3.217651564750478e-05, + "loss": 0.2204, + "step": 9725 + }, + { + "epoch": 0.4619474908607511, + "grad_norm": 0.375, + "learning_rate": 3.2159236438376664e-05, + "loss": 0.2239, + "step": 9730 + }, + { + "epoch": 0.46218487394957986, + "grad_norm": 0.408203125, + "learning_rate": 3.2141954357782325e-05, + "loss": 0.2274, + "step": 9735 + }, + { + "epoch": 0.46242225703840856, + "grad_norm": 0.3984375, + "learning_rate": 3.212466941637261e-05, + "loss": 0.2231, + "step": 9740 + }, + { + "epoch": 0.4626596401272373, + "grad_norm": 0.46484375, + "learning_rate": 3.2107381624800184e-05, + "loss": 0.224, + "step": 9745 + }, + { + "epoch": 0.4628970232160661, + "grad_norm": 0.451171875, + "learning_rate": 3.20900909937194e-05, + "loss": 0.2282, + "step": 9750 + }, + { + "epoch": 0.46313440630489483, + "grad_norm": 0.408203125, + "learning_rate": 3.2072797533786415e-05, + "loss": 0.2251, + "step": 9755 + }, + { + "epoch": 0.4633717893937236, + "grad_norm": 0.4375, + "learning_rate": 3.20555012556591e-05, + "loss": 0.2262, + "step": 9760 + }, + { + "epoch": 0.46360917248255235, + "grad_norm": 0.48828125, + "learning_rate": 3.203820216999708e-05, + "loss": 0.2257, + "step": 9765 + }, + { + "epoch": 0.4638465555713811, + "grad_norm": 0.412109375, + "learning_rate": 3.2020900287461695e-05, + "loss": 0.2251, + "step": 9770 + }, + { + "epoch": 0.46408393866020986, + "grad_norm": 0.416015625, + "learning_rate": 3.200359561871603e-05, + "loss": 0.2228, + "step": 9775 + }, + { + "epoch": 0.4643213217490386, + "grad_norm": 0.4921875, + "learning_rate": 3.1986288174424856e-05, + "loss": 0.2261, + "step": 9780 + }, + { + "epoch": 0.4645587048378674, + "grad_norm": 0.48828125, + "learning_rate": 3.196897796525468e-05, + "loss": 0.2277, + "step": 9785 + }, + { + "epoch": 0.4647960879266961, + "grad_norm": 0.5703125, + "learning_rate": 3.19516650018737e-05, + "loss": 0.2267, + "step": 9790 + }, + { + "epoch": 0.46503347101552484, + "grad_norm": 0.46484375, + "learning_rate": 3.193434929495184e-05, + "loss": 0.2246, + "step": 9795 + }, + { + "epoch": 0.4652708541043536, + "grad_norm": 0.4921875, + "learning_rate": 3.1917030855160664e-05, + "loss": 0.2262, + "step": 9800 + }, + { + "epoch": 0.46550823719318235, + "grad_norm": 0.419921875, + "learning_rate": 3.1899709693173465e-05, + "loss": 0.2238, + "step": 9805 + }, + { + "epoch": 0.4657456202820111, + "grad_norm": 0.50390625, + "learning_rate": 3.188238581966518e-05, + "loss": 0.2278, + "step": 9810 + }, + { + "epoch": 0.46598300337083987, + "grad_norm": 0.470703125, + "learning_rate": 3.186505924531244e-05, + "loss": 0.2255, + "step": 9815 + }, + { + "epoch": 0.4662203864596686, + "grad_norm": 0.48828125, + "learning_rate": 3.184772998079355e-05, + "loss": 0.2265, + "step": 9820 + }, + { + "epoch": 0.4664577695484974, + "grad_norm": 0.408203125, + "learning_rate": 3.183039803678843e-05, + "loss": 0.2217, + "step": 9825 + }, + { + "epoch": 0.46669515263732614, + "grad_norm": 0.4375, + "learning_rate": 3.1813063423978704e-05, + "loss": 0.2237, + "step": 9830 + }, + { + "epoch": 0.4669325357261549, + "grad_norm": 0.462890625, + "learning_rate": 3.179572615304759e-05, + "loss": 0.222, + "step": 9835 + }, + { + "epoch": 0.4671699188149836, + "grad_norm": 0.412109375, + "learning_rate": 3.177838623467999e-05, + "loss": 0.228, + "step": 9840 + }, + { + "epoch": 0.46740730190381236, + "grad_norm": 0.435546875, + "learning_rate": 3.17610436795624e-05, + "loss": 0.2257, + "step": 9845 + }, + { + "epoch": 0.4676446849926411, + "grad_norm": 0.55078125, + "learning_rate": 3.174369849838297e-05, + "loss": 0.2224, + "step": 9850 + }, + { + "epoch": 0.46788206808146987, + "grad_norm": 0.494140625, + "learning_rate": 3.1726350701831454e-05, + "loss": 0.227, + "step": 9855 + }, + { + "epoch": 0.46811945117029863, + "grad_norm": 0.546875, + "learning_rate": 3.1709000300599224e-05, + "loss": 0.2269, + "step": 9860 + }, + { + "epoch": 0.4683568342591274, + "grad_norm": 0.44140625, + "learning_rate": 3.1691647305379245e-05, + "loss": 0.228, + "step": 9865 + }, + { + "epoch": 0.46859421734795614, + "grad_norm": 0.51953125, + "learning_rate": 3.1674291726866095e-05, + "loss": 0.2264, + "step": 9870 + }, + { + "epoch": 0.4688316004367849, + "grad_norm": 0.439453125, + "learning_rate": 3.165693357575593e-05, + "loss": 0.2228, + "step": 9875 + }, + { + "epoch": 0.46906898352561366, + "grad_norm": 0.466796875, + "learning_rate": 3.163957286274651e-05, + "loss": 0.2211, + "step": 9880 + }, + { + "epoch": 0.46930636661444236, + "grad_norm": 0.478515625, + "learning_rate": 3.162220959853716e-05, + "loss": 0.224, + "step": 9885 + }, + { + "epoch": 0.4695437497032711, + "grad_norm": 0.423828125, + "learning_rate": 3.160484379382877e-05, + "loss": 0.2203, + "step": 9890 + }, + { + "epoch": 0.4697811327920999, + "grad_norm": 0.390625, + "learning_rate": 3.1587475459323816e-05, + "loss": 0.2208, + "step": 9895 + }, + { + "epoch": 0.47001851588092863, + "grad_norm": 0.443359375, + "learning_rate": 3.1570104605726334e-05, + "loss": 0.2247, + "step": 9900 + }, + { + "epoch": 0.4702558989697574, + "grad_norm": 0.439453125, + "learning_rate": 3.155273124374189e-05, + "loss": 0.2233, + "step": 9905 + }, + { + "epoch": 0.47049328205858615, + "grad_norm": 0.419921875, + "learning_rate": 3.153535538407761e-05, + "loss": 0.2213, + "step": 9910 + }, + { + "epoch": 0.4707306651474149, + "grad_norm": 0.408203125, + "learning_rate": 3.1517977037442145e-05, + "loss": 0.2245, + "step": 9915 + }, + { + "epoch": 0.47096804823624366, + "grad_norm": 0.5859375, + "learning_rate": 3.150059621454572e-05, + "loss": 0.221, + "step": 9920 + }, + { + "epoch": 0.4712054313250724, + "grad_norm": 0.478515625, + "learning_rate": 3.148321292610003e-05, + "loss": 0.2261, + "step": 9925 + }, + { + "epoch": 0.4714428144139012, + "grad_norm": 0.421875, + "learning_rate": 3.146582718281833e-05, + "loss": 0.2221, + "step": 9930 + }, + { + "epoch": 0.4716801975027299, + "grad_norm": 0.47265625, + "learning_rate": 3.1448438995415365e-05, + "loss": 0.2269, + "step": 9935 + }, + { + "epoch": 0.47191758059155864, + "grad_norm": 0.43359375, + "learning_rate": 3.1431048374607405e-05, + "loss": 0.2252, + "step": 9940 + }, + { + "epoch": 0.4721549636803874, + "grad_norm": 0.47265625, + "learning_rate": 3.141365533111221e-05, + "loss": 0.2295, + "step": 9945 + }, + { + "epoch": 0.47239234676921615, + "grad_norm": 0.427734375, + "learning_rate": 3.139625987564903e-05, + "loss": 0.2229, + "step": 9950 + }, + { + "epoch": 0.4726297298580449, + "grad_norm": 0.46875, + "learning_rate": 3.13788620189386e-05, + "loss": 0.2238, + "step": 9955 + }, + { + "epoch": 0.47286711294687367, + "grad_norm": 0.453125, + "learning_rate": 3.136146177170316e-05, + "loss": 0.2239, + "step": 9960 + }, + { + "epoch": 0.4731044960357024, + "grad_norm": 0.5390625, + "learning_rate": 3.1344059144666375e-05, + "loss": 0.2221, + "step": 9965 + }, + { + "epoch": 0.4733418791245312, + "grad_norm": 0.427734375, + "learning_rate": 3.132665414855343e-05, + "loss": 0.2228, + "step": 9970 + }, + { + "epoch": 0.47357926221335994, + "grad_norm": 0.435546875, + "learning_rate": 3.130924679409093e-05, + "loss": 0.2253, + "step": 9975 + }, + { + "epoch": 0.4738166453021887, + "grad_norm": 0.5703125, + "learning_rate": 3.1291837092006954e-05, + "loss": 0.2221, + "step": 9980 + }, + { + "epoch": 0.4740540283910174, + "grad_norm": 0.484375, + "learning_rate": 3.127442505303101e-05, + "loss": 0.2236, + "step": 9985 + }, + { + "epoch": 0.47429141147984616, + "grad_norm": 0.46875, + "learning_rate": 3.125701068789408e-05, + "loss": 0.2213, + "step": 9990 + }, + { + "epoch": 0.4745287945686749, + "grad_norm": 0.6640625, + "learning_rate": 3.123959400732855e-05, + "loss": 0.2204, + "step": 9995 + }, + { + "epoch": 0.4747661776575037, + "grad_norm": 0.5390625, + "learning_rate": 3.1222175022068236e-05, + "loss": 0.2279, + "step": 10000 + }, + { + "epoch": 0.47500356074633243, + "grad_norm": 0.431640625, + "learning_rate": 3.120475374284838e-05, + "loss": 0.2218, + "step": 10005 + }, + { + "epoch": 0.4752409438351612, + "grad_norm": 0.4375, + "learning_rate": 3.118733018040564e-05, + "loss": 0.2274, + "step": 10010 + }, + { + "epoch": 0.47547832692398995, + "grad_norm": 0.4375, + "learning_rate": 3.116990434547807e-05, + "loss": 0.2254, + "step": 10015 + }, + { + "epoch": 0.4757157100128187, + "grad_norm": 0.439453125, + "learning_rate": 3.1152476248805145e-05, + "loss": 0.2259, + "step": 10020 + }, + { + "epoch": 0.47595309310164746, + "grad_norm": 0.494140625, + "learning_rate": 3.1135045901127726e-05, + "loss": 0.2232, + "step": 10025 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.443359375, + "learning_rate": 3.111761331318804e-05, + "loss": 0.2227, + "step": 10030 + }, + { + "epoch": 0.4764278592793049, + "grad_norm": 0.5, + "learning_rate": 3.1100178495729716e-05, + "loss": 0.224, + "step": 10035 + }, + { + "epoch": 0.4766652423681337, + "grad_norm": 0.455078125, + "learning_rate": 3.108274145949776e-05, + "loss": 0.2256, + "step": 10040 + }, + { + "epoch": 0.47690262545696244, + "grad_norm": 0.419921875, + "learning_rate": 3.1065302215238534e-05, + "loss": 0.2236, + "step": 10045 + }, + { + "epoch": 0.4771400085457912, + "grad_norm": 0.4609375, + "learning_rate": 3.1047860773699764e-05, + "loss": 0.2255, + "step": 10050 + }, + { + "epoch": 0.47737739163461995, + "grad_norm": 0.462890625, + "learning_rate": 3.103041714563054e-05, + "loss": 0.2226, + "step": 10055 + }, + { + "epoch": 0.4776147747234487, + "grad_norm": 0.396484375, + "learning_rate": 3.101297134178128e-05, + "loss": 0.2217, + "step": 10060 + }, + { + "epoch": 0.47785215781227747, + "grad_norm": 0.41796875, + "learning_rate": 3.099552337290375e-05, + "loss": 0.2229, + "step": 10065 + }, + { + "epoch": 0.4780895409011062, + "grad_norm": 0.42578125, + "learning_rate": 3.0978073249751066e-05, + "loss": 0.2244, + "step": 10070 + }, + { + "epoch": 0.478326923989935, + "grad_norm": 0.470703125, + "learning_rate": 3.096062098307765e-05, + "loss": 0.2216, + "step": 10075 + }, + { + "epoch": 0.4785643070787637, + "grad_norm": 0.44140625, + "learning_rate": 3.094316658363926e-05, + "loss": 0.2262, + "step": 10080 + }, + { + "epoch": 0.47880169016759244, + "grad_norm": 0.404296875, + "learning_rate": 3.092571006219296e-05, + "loss": 0.2228, + "step": 10085 + }, + { + "epoch": 0.4790390732564212, + "grad_norm": 0.5, + "learning_rate": 3.090825142949713e-05, + "loss": 0.225, + "step": 10090 + }, + { + "epoch": 0.47927645634524996, + "grad_norm": 0.39453125, + "learning_rate": 3.089079069631144e-05, + "loss": 0.2258, + "step": 10095 + }, + { + "epoch": 0.4795138394340787, + "grad_norm": 0.515625, + "learning_rate": 3.087332787339687e-05, + "loss": 0.2238, + "step": 10100 + }, + { + "epoch": 0.4797512225229075, + "grad_norm": 0.462890625, + "learning_rate": 3.085586297151567e-05, + "loss": 0.2208, + "step": 10105 + }, + { + "epoch": 0.47998860561173623, + "grad_norm": 0.486328125, + "learning_rate": 3.083839600143137e-05, + "loss": 0.2199, + "step": 10110 + }, + { + "epoch": 0.480225988700565, + "grad_norm": 0.40234375, + "learning_rate": 3.0820926973908806e-05, + "loss": 0.2238, + "step": 10115 + }, + { + "epoch": 0.48046337178939375, + "grad_norm": 0.427734375, + "learning_rate": 3.080345589971404e-05, + "loss": 0.2211, + "step": 10120 + }, + { + "epoch": 0.4807007548782225, + "grad_norm": 0.453125, + "learning_rate": 3.078598278961443e-05, + "loss": 0.2254, + "step": 10125 + }, + { + "epoch": 0.4809381379670512, + "grad_norm": 0.41796875, + "learning_rate": 3.076850765437857e-05, + "loss": 0.2219, + "step": 10130 + }, + { + "epoch": 0.48117552105587996, + "grad_norm": 0.462890625, + "learning_rate": 3.0751030504776315e-05, + "loss": 0.221, + "step": 10135 + }, + { + "epoch": 0.4814129041447087, + "grad_norm": 0.46875, + "learning_rate": 3.0733551351578724e-05, + "loss": 0.2242, + "step": 10140 + }, + { + "epoch": 0.4816502872335375, + "grad_norm": 0.421875, + "learning_rate": 3.071607020555815e-05, + "loss": 0.2239, + "step": 10145 + }, + { + "epoch": 0.48188767032236623, + "grad_norm": 0.578125, + "learning_rate": 3.069858707748813e-05, + "loss": 0.2208, + "step": 10150 + }, + { + "epoch": 0.482125053411195, + "grad_norm": 0.46484375, + "learning_rate": 3.068110197814344e-05, + "loss": 0.2246, + "step": 10155 + }, + { + "epoch": 0.48236243650002375, + "grad_norm": 0.45703125, + "learning_rate": 3.066361491830007e-05, + "loss": 0.2243, + "step": 10160 + }, + { + "epoch": 0.4825998195888525, + "grad_norm": 0.49609375, + "learning_rate": 3.064612590873521e-05, + "loss": 0.2263, + "step": 10165 + }, + { + "epoch": 0.48283720267768127, + "grad_norm": 0.458984375, + "learning_rate": 3.062863496022725e-05, + "loss": 0.2259, + "step": 10170 + }, + { + "epoch": 0.48307458576650997, + "grad_norm": 0.43359375, + "learning_rate": 3.0611142083555806e-05, + "loss": 0.2243, + "step": 10175 + }, + { + "epoch": 0.4833119688553387, + "grad_norm": 0.376953125, + "learning_rate": 3.059364728950163e-05, + "loss": 0.226, + "step": 10180 + }, + { + "epoch": 0.4835493519441675, + "grad_norm": 0.41015625, + "learning_rate": 3.0576150588846704e-05, + "loss": 0.2261, + "step": 10185 + }, + { + "epoch": 0.48378673503299624, + "grad_norm": 0.462890625, + "learning_rate": 3.0558651992374154e-05, + "loss": 0.2251, + "step": 10190 + }, + { + "epoch": 0.484024118121825, + "grad_norm": 0.453125, + "learning_rate": 3.0541151510868296e-05, + "loss": 0.226, + "step": 10195 + }, + { + "epoch": 0.48426150121065376, + "grad_norm": 0.478515625, + "learning_rate": 3.052364915511458e-05, + "loss": 0.223, + "step": 10200 + }, + { + "epoch": 0.4844988842994825, + "grad_norm": 0.443359375, + "learning_rate": 3.050614493589965e-05, + "loss": 0.2247, + "step": 10205 + }, + { + "epoch": 0.48473626738831127, + "grad_norm": 0.427734375, + "learning_rate": 3.0488638864011264e-05, + "loss": 0.2268, + "step": 10210 + }, + { + "epoch": 0.48497365047714003, + "grad_norm": 0.4609375, + "learning_rate": 3.0471130950238334e-05, + "loss": 0.2258, + "step": 10215 + }, + { + "epoch": 0.4852110335659688, + "grad_norm": 0.40234375, + "learning_rate": 3.0453621205370914e-05, + "loss": 0.219, + "step": 10220 + }, + { + "epoch": 0.4854484166547975, + "grad_norm": 0.365234375, + "learning_rate": 3.0436109640200173e-05, + "loss": 0.2202, + "step": 10225 + }, + { + "epoch": 0.48568579974362625, + "grad_norm": 0.40234375, + "learning_rate": 3.041859626551841e-05, + "loss": 0.2263, + "step": 10230 + }, + { + "epoch": 0.485923182832455, + "grad_norm": 0.451171875, + "learning_rate": 3.0401081092119045e-05, + "loss": 0.2268, + "step": 10235 + }, + { + "epoch": 0.48616056592128376, + "grad_norm": 0.453125, + "learning_rate": 3.0383564130796588e-05, + "loss": 0.2279, + "step": 10240 + }, + { + "epoch": 0.4863979490101125, + "grad_norm": 0.451171875, + "learning_rate": 3.0366045392346674e-05, + "loss": 0.2237, + "step": 10245 + }, + { + "epoch": 0.4866353320989413, + "grad_norm": 0.4453125, + "learning_rate": 3.034852488756602e-05, + "loss": 0.2227, + "step": 10250 + }, + { + "epoch": 0.48687271518777003, + "grad_norm": 0.484375, + "learning_rate": 3.033100262725243e-05, + "loss": 0.2274, + "step": 10255 + }, + { + "epoch": 0.4871100982765988, + "grad_norm": 0.52734375, + "learning_rate": 3.03134786222048e-05, + "loss": 0.2257, + "step": 10260 + }, + { + "epoch": 0.48734748136542755, + "grad_norm": 0.40625, + "learning_rate": 3.029595288322308e-05, + "loss": 0.2226, + "step": 10265 + }, + { + "epoch": 0.4875848644542563, + "grad_norm": 0.400390625, + "learning_rate": 3.0278425421108315e-05, + "loss": 0.221, + "step": 10270 + }, + { + "epoch": 0.487822247543085, + "grad_norm": 0.4765625, + "learning_rate": 3.026089624666259e-05, + "loss": 0.2235, + "step": 10275 + }, + { + "epoch": 0.48805963063191377, + "grad_norm": 0.447265625, + "learning_rate": 3.0243365370689075e-05, + "loss": 0.2242, + "step": 10280 + }, + { + "epoch": 0.4882970137207425, + "grad_norm": 0.4609375, + "learning_rate": 3.022583280399195e-05, + "loss": 0.226, + "step": 10285 + }, + { + "epoch": 0.4885343968095713, + "grad_norm": 0.482421875, + "learning_rate": 3.020829855737647e-05, + "loss": 0.2221, + "step": 10290 + }, + { + "epoch": 0.48877177989840004, + "grad_norm": 0.46875, + "learning_rate": 3.019076264164889e-05, + "loss": 0.2224, + "step": 10295 + }, + { + "epoch": 0.4890091629872288, + "grad_norm": 0.51953125, + "learning_rate": 3.017322506761654e-05, + "loss": 0.2248, + "step": 10300 + }, + { + "epoch": 0.48924654607605755, + "grad_norm": 0.484375, + "learning_rate": 3.015568584608774e-05, + "loss": 0.2243, + "step": 10305 + }, + { + "epoch": 0.4894839291648863, + "grad_norm": 0.470703125, + "learning_rate": 3.0138144987871825e-05, + "loss": 0.2256, + "step": 10310 + }, + { + "epoch": 0.48972131225371507, + "grad_norm": 0.46484375, + "learning_rate": 3.012060250377915e-05, + "loss": 0.2266, + "step": 10315 + }, + { + "epoch": 0.48995869534254377, + "grad_norm": 0.466796875, + "learning_rate": 3.010305840462107e-05, + "loss": 0.2232, + "step": 10320 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.4375, + "learning_rate": 3.0085512701209935e-05, + "loss": 0.2228, + "step": 10325 + }, + { + "epoch": 0.4904334615202013, + "grad_norm": 0.4921875, + "learning_rate": 3.0067965404359077e-05, + "loss": 0.224, + "step": 10330 + }, + { + "epoch": 0.49067084460903004, + "grad_norm": 0.46875, + "learning_rate": 3.0050416524882807e-05, + "loss": 0.2248, + "step": 10335 + }, + { + "epoch": 0.4909082276978588, + "grad_norm": 0.51953125, + "learning_rate": 3.0032866073596445e-05, + "loss": 0.2242, + "step": 10340 + }, + { + "epoch": 0.49114561078668756, + "grad_norm": 0.478515625, + "learning_rate": 3.0015314061316236e-05, + "loss": 0.2251, + "step": 10345 + }, + { + "epoch": 0.4913829938755163, + "grad_norm": 0.4453125, + "learning_rate": 2.9997760498859407e-05, + "loss": 0.227, + "step": 10350 + }, + { + "epoch": 0.4916203769643451, + "grad_norm": 0.55078125, + "learning_rate": 2.998020539704414e-05, + "loss": 0.2267, + "step": 10355 + }, + { + "epoch": 0.49185776005317383, + "grad_norm": 0.52734375, + "learning_rate": 2.9962648766689573e-05, + "loss": 0.2246, + "step": 10360 + }, + { + "epoch": 0.4920951431420026, + "grad_norm": 0.470703125, + "learning_rate": 2.9945090618615756e-05, + "loss": 0.2257, + "step": 10365 + }, + { + "epoch": 0.4923325262308313, + "grad_norm": 0.5, + "learning_rate": 2.992753096364372e-05, + "loss": 0.2241, + "step": 10370 + }, + { + "epoch": 0.49256990931966005, + "grad_norm": 0.49609375, + "learning_rate": 2.9909969812595385e-05, + "loss": 0.2245, + "step": 10375 + }, + { + "epoch": 0.4928072924084888, + "grad_norm": 0.455078125, + "learning_rate": 2.9892407176293612e-05, + "loss": 0.2261, + "step": 10380 + }, + { + "epoch": 0.49304467549731756, + "grad_norm": 0.5, + "learning_rate": 2.987484306556218e-05, + "loss": 0.2186, + "step": 10385 + }, + { + "epoch": 0.4932820585861463, + "grad_norm": 0.412109375, + "learning_rate": 2.9857277491225767e-05, + "loss": 0.2212, + "step": 10390 + }, + { + "epoch": 0.4935194416749751, + "grad_norm": 0.439453125, + "learning_rate": 2.9839710464109944e-05, + "loss": 0.2243, + "step": 10395 + }, + { + "epoch": 0.49375682476380384, + "grad_norm": 0.58203125, + "learning_rate": 2.9822141995041214e-05, + "loss": 0.2266, + "step": 10400 + }, + { + "epoch": 0.4939942078526326, + "grad_norm": 0.435546875, + "learning_rate": 2.9804572094846924e-05, + "loss": 0.2257, + "step": 10405 + }, + { + "epoch": 0.49423159094146135, + "grad_norm": 0.53125, + "learning_rate": 2.978700077435534e-05, + "loss": 0.2222, + "step": 10410 + }, + { + "epoch": 0.4944689740302901, + "grad_norm": 0.53125, + "learning_rate": 2.9769428044395568e-05, + "loss": 0.2271, + "step": 10415 + }, + { + "epoch": 0.4947063571191188, + "grad_norm": 0.4296875, + "learning_rate": 2.9751853915797617e-05, + "loss": 0.2225, + "step": 10420 + }, + { + "epoch": 0.49494374020794757, + "grad_norm": 0.5078125, + "learning_rate": 2.973427839939234e-05, + "loss": 0.2256, + "step": 10425 + }, + { + "epoch": 0.4951811232967763, + "grad_norm": 0.478515625, + "learning_rate": 2.971670150601144e-05, + "loss": 0.2247, + "step": 10430 + }, + { + "epoch": 0.4954185063856051, + "grad_norm": 0.408203125, + "learning_rate": 2.9699123246487487e-05, + "loss": 0.2238, + "step": 10435 + }, + { + "epoch": 0.49565588947443384, + "grad_norm": 0.388671875, + "learning_rate": 2.9681543631653873e-05, + "loss": 0.2215, + "step": 10440 + }, + { + "epoch": 0.4958932725632626, + "grad_norm": 0.453125, + "learning_rate": 2.9663962672344842e-05, + "loss": 0.2245, + "step": 10445 + }, + { + "epoch": 0.49613065565209136, + "grad_norm": 0.46875, + "learning_rate": 2.9646380379395456e-05, + "loss": 0.2213, + "step": 10450 + }, + { + "epoch": 0.4963680387409201, + "grad_norm": 0.447265625, + "learning_rate": 2.9628796763641607e-05, + "loss": 0.2276, + "step": 10455 + }, + { + "epoch": 0.49660542182974887, + "grad_norm": 0.5, + "learning_rate": 2.9611211835919982e-05, + "loss": 0.2258, + "step": 10460 + }, + { + "epoch": 0.4968428049185776, + "grad_norm": 0.423828125, + "learning_rate": 2.9593625607068116e-05, + "loss": 0.2213, + "step": 10465 + }, + { + "epoch": 0.49708018800740633, + "grad_norm": 0.66796875, + "learning_rate": 2.9576038087924297e-05, + "loss": 0.2233, + "step": 10470 + }, + { + "epoch": 0.4973175710962351, + "grad_norm": 0.478515625, + "learning_rate": 2.955844928932765e-05, + "loss": 0.2263, + "step": 10475 + }, + { + "epoch": 0.49755495418506385, + "grad_norm": 0.3828125, + "learning_rate": 2.9540859222118068e-05, + "loss": 0.2219, + "step": 10480 + }, + { + "epoch": 0.4977923372738926, + "grad_norm": 0.484375, + "learning_rate": 2.9523267897136226e-05, + "loss": 0.2252, + "step": 10485 + }, + { + "epoch": 0.49802972036272136, + "grad_norm": 0.478515625, + "learning_rate": 2.9505675325223575e-05, + "loss": 0.2244, + "step": 10490 + }, + { + "epoch": 0.4982671034515501, + "grad_norm": 0.421875, + "learning_rate": 2.9488081517222347e-05, + "loss": 0.2239, + "step": 10495 + }, + { + "epoch": 0.4985044865403789, + "grad_norm": 0.419921875, + "learning_rate": 2.9470486483975517e-05, + "loss": 0.2217, + "step": 10500 + }, + { + "epoch": 0.49874186962920763, + "grad_norm": 0.490234375, + "learning_rate": 2.9452890236326834e-05, + "loss": 0.2248, + "step": 10505 + }, + { + "epoch": 0.4989792527180364, + "grad_norm": 0.46875, + "learning_rate": 2.9435292785120775e-05, + "loss": 0.2266, + "step": 10510 + }, + { + "epoch": 0.4992166358068651, + "grad_norm": 0.53515625, + "learning_rate": 2.9417694141202572e-05, + "loss": 0.222, + "step": 10515 + }, + { + "epoch": 0.49945401889569385, + "grad_norm": 0.447265625, + "learning_rate": 2.9400094315418193e-05, + "loss": 0.2276, + "step": 10520 + }, + { + "epoch": 0.4996914019845226, + "grad_norm": 0.609375, + "learning_rate": 2.9382493318614323e-05, + "loss": 0.2228, + "step": 10525 + }, + { + "epoch": 0.49992878507335137, + "grad_norm": 0.40625, + "learning_rate": 2.9364891161638382e-05, + "loss": 0.2272, + "step": 10530 + }, + { + "epoch": 0.5001661681621802, + "grad_norm": 0.462890625, + "learning_rate": 2.9347287855338502e-05, + "loss": 0.2258, + "step": 10535 + }, + { + "epoch": 0.5004035512510089, + "grad_norm": 0.45703125, + "learning_rate": 2.932968341056351e-05, + "loss": 0.2237, + "step": 10540 + }, + { + "epoch": 0.5006409343398376, + "grad_norm": 0.37890625, + "learning_rate": 2.931207783816296e-05, + "loss": 0.2229, + "step": 10545 + }, + { + "epoch": 0.5008783174286664, + "grad_norm": 0.4453125, + "learning_rate": 2.9294471148987074e-05, + "loss": 0.2213, + "step": 10550 + }, + { + "epoch": 0.5011157005174951, + "grad_norm": 0.439453125, + "learning_rate": 2.927686335388678e-05, + "loss": 0.2223, + "step": 10555 + }, + { + "epoch": 0.5013530836063239, + "grad_norm": 0.515625, + "learning_rate": 2.925925446371368e-05, + "loss": 0.2272, + "step": 10560 + }, + { + "epoch": 0.5015904666951526, + "grad_norm": 0.443359375, + "learning_rate": 2.924164448932006e-05, + "loss": 0.2232, + "step": 10565 + }, + { + "epoch": 0.5018278497839814, + "grad_norm": 0.388671875, + "learning_rate": 2.9224033441558852e-05, + "loss": 0.222, + "step": 10570 + }, + { + "epoch": 0.5020652328728101, + "grad_norm": 0.416015625, + "learning_rate": 2.920642133128368e-05, + "loss": 0.2235, + "step": 10575 + }, + { + "epoch": 0.5023026159616389, + "grad_norm": 0.51953125, + "learning_rate": 2.9188808169348797e-05, + "loss": 0.2273, + "step": 10580 + }, + { + "epoch": 0.5025399990504676, + "grad_norm": 0.4375, + "learning_rate": 2.9171193966609124e-05, + "loss": 0.2228, + "step": 10585 + }, + { + "epoch": 0.5027773821392963, + "grad_norm": 0.462890625, + "learning_rate": 2.91535787339202e-05, + "loss": 0.2204, + "step": 10590 + }, + { + "epoch": 0.5030147652281252, + "grad_norm": 0.357421875, + "learning_rate": 2.913596248213823e-05, + "loss": 0.2246, + "step": 10595 + }, + { + "epoch": 0.5032521483169539, + "grad_norm": 0.5390625, + "learning_rate": 2.911834522212002e-05, + "loss": 0.2227, + "step": 10600 + }, + { + "epoch": 0.5034895314057827, + "grad_norm": 0.51953125, + "learning_rate": 2.9100726964723007e-05, + "loss": 0.2244, + "step": 10605 + }, + { + "epoch": 0.5037269144946114, + "grad_norm": 0.53515625, + "learning_rate": 2.9083107720805247e-05, + "loss": 0.2252, + "step": 10610 + }, + { + "epoch": 0.5039642975834402, + "grad_norm": 0.484375, + "learning_rate": 2.9065487501225393e-05, + "loss": 0.2223, + "step": 10615 + }, + { + "epoch": 0.5042016806722689, + "grad_norm": 0.44140625, + "learning_rate": 2.9047866316842713e-05, + "loss": 0.2248, + "step": 10620 + }, + { + "epoch": 0.5044390637610977, + "grad_norm": 0.48046875, + "learning_rate": 2.9030244178517058e-05, + "loss": 0.2217, + "step": 10625 + }, + { + "epoch": 0.5046764468499264, + "grad_norm": 0.41015625, + "learning_rate": 2.9012621097108876e-05, + "loss": 0.223, + "step": 10630 + }, + { + "epoch": 0.5049138299387551, + "grad_norm": 0.400390625, + "learning_rate": 2.899499708347919e-05, + "loss": 0.2255, + "step": 10635 + }, + { + "epoch": 0.5051512130275839, + "grad_norm": 0.44140625, + "learning_rate": 2.8977372148489607e-05, + "loss": 0.2282, + "step": 10640 + }, + { + "epoch": 0.5053885961164126, + "grad_norm": 0.427734375, + "learning_rate": 2.895974630300228e-05, + "loss": 0.2251, + "step": 10645 + }, + { + "epoch": 0.5056259792052414, + "grad_norm": 0.478515625, + "learning_rate": 2.894211955787995e-05, + "loss": 0.224, + "step": 10650 + }, + { + "epoch": 0.5058633622940701, + "grad_norm": 0.404296875, + "learning_rate": 2.8924491923985886e-05, + "loss": 0.2246, + "step": 10655 + }, + { + "epoch": 0.506100745382899, + "grad_norm": 0.48046875, + "learning_rate": 2.8906863412183933e-05, + "loss": 0.2223, + "step": 10660 + }, + { + "epoch": 0.5063381284717277, + "grad_norm": 0.58203125, + "learning_rate": 2.888923403333845e-05, + "loss": 0.2242, + "step": 10665 + }, + { + "epoch": 0.5065755115605565, + "grad_norm": 0.478515625, + "learning_rate": 2.887160379831435e-05, + "loss": 0.2309, + "step": 10670 + }, + { + "epoch": 0.5068128946493852, + "grad_norm": 0.41796875, + "learning_rate": 2.8853972717977075e-05, + "loss": 0.2205, + "step": 10675 + }, + { + "epoch": 0.507050277738214, + "grad_norm": 0.4375, + "learning_rate": 2.8836340803192565e-05, + "loss": 0.2292, + "step": 10680 + }, + { + "epoch": 0.5072876608270427, + "grad_norm": 0.453125, + "learning_rate": 2.8818708064827295e-05, + "loss": 0.2234, + "step": 10685 + }, + { + "epoch": 0.5075250439158714, + "grad_norm": 0.53125, + "learning_rate": 2.8801074513748237e-05, + "loss": 0.2262, + "step": 10690 + }, + { + "epoch": 0.5077624270047002, + "grad_norm": 0.486328125, + "learning_rate": 2.878344016082288e-05, + "loss": 0.224, + "step": 10695 + }, + { + "epoch": 0.5079998100935289, + "grad_norm": 0.455078125, + "learning_rate": 2.876580501691919e-05, + "loss": 0.2242, + "step": 10700 + }, + { + "epoch": 0.5082371931823577, + "grad_norm": 0.427734375, + "learning_rate": 2.874816909290562e-05, + "loss": 0.2226, + "step": 10705 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 0.41015625, + "learning_rate": 2.873053239965111e-05, + "loss": 0.2288, + "step": 10710 + }, + { + "epoch": 0.5087119593600152, + "grad_norm": 0.451171875, + "learning_rate": 2.8712894948025083e-05, + "loss": 0.2262, + "step": 10715 + }, + { + "epoch": 0.5089493424488439, + "grad_norm": 0.43359375, + "learning_rate": 2.8695256748897413e-05, + "loss": 0.2271, + "step": 10720 + }, + { + "epoch": 0.5091867255376727, + "grad_norm": 0.43359375, + "learning_rate": 2.867761781313844e-05, + "loss": 0.2217, + "step": 10725 + }, + { + "epoch": 0.5094241086265014, + "grad_norm": 0.43359375, + "learning_rate": 2.8659978151618965e-05, + "loss": 0.2254, + "step": 10730 + }, + { + "epoch": 0.5096614917153302, + "grad_norm": 0.55859375, + "learning_rate": 2.864233777521024e-05, + "loss": 0.2241, + "step": 10735 + }, + { + "epoch": 0.509898874804159, + "grad_norm": 0.62890625, + "learning_rate": 2.862469669478393e-05, + "loss": 0.2165, + "step": 10740 + }, + { + "epoch": 0.5101362578929877, + "grad_norm": 0.5546875, + "learning_rate": 2.8607054921212156e-05, + "loss": 0.2262, + "step": 10745 + }, + { + "epoch": 0.5103736409818165, + "grad_norm": 0.439453125, + "learning_rate": 2.8589412465367478e-05, + "loss": 0.2242, + "step": 10750 + }, + { + "epoch": 0.5106110240706452, + "grad_norm": 0.46484375, + "learning_rate": 2.857176933812285e-05, + "loss": 0.2234, + "step": 10755 + }, + { + "epoch": 0.510848407159474, + "grad_norm": 0.53125, + "learning_rate": 2.8554125550351645e-05, + "loss": 0.2249, + "step": 10760 + }, + { + "epoch": 0.5110857902483027, + "grad_norm": 0.51171875, + "learning_rate": 2.853648111292767e-05, + "loss": 0.2218, + "step": 10765 + }, + { + "epoch": 0.5113231733371315, + "grad_norm": 0.470703125, + "learning_rate": 2.851883603672509e-05, + "loss": 0.225, + "step": 10770 + }, + { + "epoch": 0.5115605564259602, + "grad_norm": 0.39453125, + "learning_rate": 2.8501190332618495e-05, + "loss": 0.2232, + "step": 10775 + }, + { + "epoch": 0.5117979395147889, + "grad_norm": 0.46875, + "learning_rate": 2.8483544011482856e-05, + "loss": 0.2224, + "step": 10780 + }, + { + "epoch": 0.5120353226036177, + "grad_norm": 0.4375, + "learning_rate": 2.8465897084193506e-05, + "loss": 0.2221, + "step": 10785 + }, + { + "epoch": 0.5122727056924464, + "grad_norm": 0.50390625, + "learning_rate": 2.844824956162618e-05, + "loss": 0.223, + "step": 10790 + }, + { + "epoch": 0.5125100887812752, + "grad_norm": 0.41015625, + "learning_rate": 2.8430601454656957e-05, + "loss": 0.2232, + "step": 10795 + }, + { + "epoch": 0.5127474718701039, + "grad_norm": 0.41796875, + "learning_rate": 2.8412952774162292e-05, + "loss": 0.223, + "step": 10800 + }, + { + "epoch": 0.5129848549589328, + "grad_norm": 0.5703125, + "learning_rate": 2.8395303531018985e-05, + "loss": 0.2227, + "step": 10805 + }, + { + "epoch": 0.5132222380477615, + "grad_norm": 0.47265625, + "learning_rate": 2.8377653736104177e-05, + "loss": 0.2212, + "step": 10810 + }, + { + "epoch": 0.5134596211365903, + "grad_norm": 0.482421875, + "learning_rate": 2.836000340029537e-05, + "loss": 0.2273, + "step": 10815 + }, + { + "epoch": 0.513697004225419, + "grad_norm": 0.455078125, + "learning_rate": 2.8342352534470373e-05, + "loss": 0.2229, + "step": 10820 + }, + { + "epoch": 0.5139343873142478, + "grad_norm": 0.50390625, + "learning_rate": 2.8324701149507342e-05, + "loss": 0.2218, + "step": 10825 + }, + { + "epoch": 0.5141717704030765, + "grad_norm": 0.5703125, + "learning_rate": 2.8307049256284746e-05, + "loss": 0.2239, + "step": 10830 + }, + { + "epoch": 0.5144091534919052, + "grad_norm": 0.419921875, + "learning_rate": 2.8289396865681356e-05, + "loss": 0.2246, + "step": 10835 + }, + { + "epoch": 0.514646536580734, + "grad_norm": 0.4140625, + "learning_rate": 2.8271743988576273e-05, + "loss": 0.2226, + "step": 10840 + }, + { + "epoch": 0.5148839196695627, + "grad_norm": 0.46484375, + "learning_rate": 2.8254090635848884e-05, + "loss": 0.226, + "step": 10845 + }, + { + "epoch": 0.5151213027583915, + "grad_norm": 0.3984375, + "learning_rate": 2.8236436818378857e-05, + "loss": 0.2257, + "step": 10850 + }, + { + "epoch": 0.5153586858472202, + "grad_norm": 0.462890625, + "learning_rate": 2.821878254704617e-05, + "loss": 0.2244, + "step": 10855 + }, + { + "epoch": 0.515596068936049, + "grad_norm": 0.466796875, + "learning_rate": 2.820112783273106e-05, + "loss": 0.2248, + "step": 10860 + }, + { + "epoch": 0.5158334520248777, + "grad_norm": 0.439453125, + "learning_rate": 2.8183472686314065e-05, + "loss": 0.2211, + "step": 10865 + }, + { + "epoch": 0.5160708351137065, + "grad_norm": 0.4609375, + "learning_rate": 2.8165817118675957e-05, + "loss": 0.2251, + "step": 10870 + }, + { + "epoch": 0.5163082182025353, + "grad_norm": 0.41015625, + "learning_rate": 2.8148161140697793e-05, + "loss": 0.2249, + "step": 10875 + }, + { + "epoch": 0.516545601291364, + "grad_norm": 0.5, + "learning_rate": 2.8130504763260856e-05, + "loss": 0.2307, + "step": 10880 + }, + { + "epoch": 0.5167829843801928, + "grad_norm": 0.484375, + "learning_rate": 2.8112847997246707e-05, + "loss": 0.2208, + "step": 10885 + }, + { + "epoch": 0.5170203674690215, + "grad_norm": 0.5, + "learning_rate": 2.8095190853537118e-05, + "loss": 0.2263, + "step": 10890 + }, + { + "epoch": 0.5172577505578503, + "grad_norm": 0.458984375, + "learning_rate": 2.8077533343014124e-05, + "loss": 0.2222, + "step": 10895 + }, + { + "epoch": 0.517495133646679, + "grad_norm": 0.4453125, + "learning_rate": 2.8059875476559944e-05, + "loss": 0.2228, + "step": 10900 + }, + { + "epoch": 0.5177325167355078, + "grad_norm": 0.462890625, + "learning_rate": 2.8042217265057064e-05, + "loss": 0.2268, + "step": 10905 + }, + { + "epoch": 0.5179698998243365, + "grad_norm": 0.455078125, + "learning_rate": 2.8024558719388143e-05, + "loss": 0.2217, + "step": 10910 + }, + { + "epoch": 0.5182072829131653, + "grad_norm": 0.41015625, + "learning_rate": 2.800689985043607e-05, + "loss": 0.2285, + "step": 10915 + }, + { + "epoch": 0.518444666001994, + "grad_norm": 0.453125, + "learning_rate": 2.798924066908392e-05, + "loss": 0.2236, + "step": 10920 + }, + { + "epoch": 0.5186820490908227, + "grad_norm": 0.455078125, + "learning_rate": 2.797158118621498e-05, + "loss": 0.2281, + "step": 10925 + }, + { + "epoch": 0.5189194321796515, + "grad_norm": 0.5078125, + "learning_rate": 2.7953921412712685e-05, + "loss": 0.2253, + "step": 10930 + }, + { + "epoch": 0.5191568152684802, + "grad_norm": 0.482421875, + "learning_rate": 2.7936261359460697e-05, + "loss": 0.2251, + "step": 10935 + }, + { + "epoch": 0.519394198357309, + "grad_norm": 0.462890625, + "learning_rate": 2.7918601037342816e-05, + "loss": 0.2237, + "step": 10940 + }, + { + "epoch": 0.5196315814461377, + "grad_norm": 0.47265625, + "learning_rate": 2.790094045724302e-05, + "loss": 0.222, + "step": 10945 + }, + { + "epoch": 0.5198689645349666, + "grad_norm": 0.66796875, + "learning_rate": 2.7883279630045445e-05, + "loss": 0.2244, + "step": 10950 + }, + { + "epoch": 0.5201063476237953, + "grad_norm": 0.466796875, + "learning_rate": 2.786561856663437e-05, + "loss": 0.2208, + "step": 10955 + }, + { + "epoch": 0.5203437307126241, + "grad_norm": 0.486328125, + "learning_rate": 2.7847957277894245e-05, + "loss": 0.2244, + "step": 10960 + }, + { + "epoch": 0.5205811138014528, + "grad_norm": 0.43359375, + "learning_rate": 2.7830295774709625e-05, + "loss": 0.2195, + "step": 10965 + }, + { + "epoch": 0.5208184968902816, + "grad_norm": 0.416015625, + "learning_rate": 2.7812634067965227e-05, + "loss": 0.2246, + "step": 10970 + }, + { + "epoch": 0.5210558799791103, + "grad_norm": 0.447265625, + "learning_rate": 2.779497216854587e-05, + "loss": 0.2257, + "step": 10975 + }, + { + "epoch": 0.521293263067939, + "grad_norm": 0.51953125, + "learning_rate": 2.7777310087336507e-05, + "loss": 0.2247, + "step": 10980 + }, + { + "epoch": 0.5215306461567678, + "grad_norm": 0.421875, + "learning_rate": 2.7759647835222197e-05, + "loss": 0.2257, + "step": 10985 + }, + { + "epoch": 0.5217680292455965, + "grad_norm": 0.3828125, + "learning_rate": 2.7741985423088114e-05, + "loss": 0.2199, + "step": 10990 + }, + { + "epoch": 0.5220054123344253, + "grad_norm": 0.462890625, + "learning_rate": 2.7724322861819508e-05, + "loss": 0.2217, + "step": 10995 + }, + { + "epoch": 0.522242795423254, + "grad_norm": 0.44921875, + "learning_rate": 2.770666016230174e-05, + "loss": 0.2261, + "step": 11000 + }, + { + "epoch": 0.5224801785120828, + "grad_norm": 0.470703125, + "learning_rate": 2.7688997335420253e-05, + "loss": 0.2238, + "step": 11005 + }, + { + "epoch": 0.5227175616009115, + "grad_norm": 0.416015625, + "learning_rate": 2.7671334392060577e-05, + "loss": 0.227, + "step": 11010 + }, + { + "epoch": 0.5229549446897404, + "grad_norm": 0.5, + "learning_rate": 2.7653671343108283e-05, + "loss": 0.2204, + "step": 11015 + }, + { + "epoch": 0.523192327778569, + "grad_norm": 0.474609375, + "learning_rate": 2.763600819944905e-05, + "loss": 0.2214, + "step": 11020 + }, + { + "epoch": 0.5234297108673978, + "grad_norm": 0.46875, + "learning_rate": 2.7618344971968575e-05, + "loss": 0.2248, + "step": 11025 + }, + { + "epoch": 0.5236670939562266, + "grad_norm": 0.55078125, + "learning_rate": 2.7600681671552636e-05, + "loss": 0.2229, + "step": 11030 + }, + { + "epoch": 0.5239044770450553, + "grad_norm": 0.51171875, + "learning_rate": 2.7583018309087038e-05, + "loss": 0.2243, + "step": 11035 + }, + { + "epoch": 0.5241418601338841, + "grad_norm": 0.51171875, + "learning_rate": 2.756535489545764e-05, + "loss": 0.2261, + "step": 11040 + }, + { + "epoch": 0.5243792432227128, + "grad_norm": 0.470703125, + "learning_rate": 2.7547691441550306e-05, + "loss": 0.2226, + "step": 11045 + }, + { + "epoch": 0.5246166263115416, + "grad_norm": 0.4765625, + "learning_rate": 2.7530027958250965e-05, + "loss": 0.2217, + "step": 11050 + }, + { + "epoch": 0.5248540094003703, + "grad_norm": 0.41015625, + "learning_rate": 2.7512364456445516e-05, + "loss": 0.2275, + "step": 11055 + }, + { + "epoch": 0.5250913924891991, + "grad_norm": 0.5078125, + "learning_rate": 2.7494700947019924e-05, + "loss": 0.2218, + "step": 11060 + }, + { + "epoch": 0.5253287755780278, + "grad_norm": 0.478515625, + "learning_rate": 2.7477037440860103e-05, + "loss": 0.2252, + "step": 11065 + }, + { + "epoch": 0.5255661586668565, + "grad_norm": 0.42578125, + "learning_rate": 2.7459373948852006e-05, + "loss": 0.2243, + "step": 11070 + }, + { + "epoch": 0.5258035417556853, + "grad_norm": 0.42578125, + "learning_rate": 2.7441710481881566e-05, + "loss": 0.2227, + "step": 11075 + }, + { + "epoch": 0.526040924844514, + "grad_norm": 0.416015625, + "learning_rate": 2.7424047050834678e-05, + "loss": 0.2256, + "step": 11080 + }, + { + "epoch": 0.5262783079333428, + "grad_norm": 0.57421875, + "learning_rate": 2.7406383666597256e-05, + "loss": 0.2264, + "step": 11085 + }, + { + "epoch": 0.5265156910221716, + "grad_norm": 0.453125, + "learning_rate": 2.7388720340055152e-05, + "loss": 0.2223, + "step": 11090 + }, + { + "epoch": 0.5267530741110004, + "grad_norm": 0.5390625, + "learning_rate": 2.7371057082094198e-05, + "loss": 0.2231, + "step": 11095 + }, + { + "epoch": 0.5269904571998291, + "grad_norm": 0.435546875, + "learning_rate": 2.7353393903600178e-05, + "loss": 0.2292, + "step": 11100 + }, + { + "epoch": 0.5272278402886579, + "grad_norm": 0.455078125, + "learning_rate": 2.733573081545883e-05, + "loss": 0.2251, + "step": 11105 + }, + { + "epoch": 0.5274652233774866, + "grad_norm": 0.423828125, + "learning_rate": 2.7318067828555826e-05, + "loss": 0.2243, + "step": 11110 + }, + { + "epoch": 0.5277026064663154, + "grad_norm": 0.39453125, + "learning_rate": 2.7300404953776803e-05, + "loss": 0.2271, + "step": 11115 + }, + { + "epoch": 0.5279399895551441, + "grad_norm": 0.427734375, + "learning_rate": 2.7282742202007296e-05, + "loss": 0.2209, + "step": 11120 + }, + { + "epoch": 0.5281773726439728, + "grad_norm": 0.478515625, + "learning_rate": 2.7265079584132775e-05, + "loss": 0.2212, + "step": 11125 + }, + { + "epoch": 0.5284147557328016, + "grad_norm": 0.51953125, + "learning_rate": 2.7247417111038643e-05, + "loss": 0.2225, + "step": 11130 + }, + { + "epoch": 0.5286521388216303, + "grad_norm": 0.4765625, + "learning_rate": 2.7229754793610195e-05, + "loss": 0.227, + "step": 11135 + }, + { + "epoch": 0.5288895219104591, + "grad_norm": 0.41796875, + "learning_rate": 2.7212092642732644e-05, + "loss": 0.2253, + "step": 11140 + }, + { + "epoch": 0.5291269049992878, + "grad_norm": 0.4921875, + "learning_rate": 2.7194430669291083e-05, + "loss": 0.2231, + "step": 11145 + }, + { + "epoch": 0.5293642880881166, + "grad_norm": 0.408203125, + "learning_rate": 2.717676888417051e-05, + "loss": 0.2249, + "step": 11150 + }, + { + "epoch": 0.5296016711769453, + "grad_norm": 0.5, + "learning_rate": 2.715910729825581e-05, + "loss": 0.2277, + "step": 11155 + }, + { + "epoch": 0.5298390542657742, + "grad_norm": 0.52734375, + "learning_rate": 2.7141445922431724e-05, + "loss": 0.2276, + "step": 11160 + }, + { + "epoch": 0.5300764373546029, + "grad_norm": 0.478515625, + "learning_rate": 2.71237847675829e-05, + "loss": 0.2228, + "step": 11165 + }, + { + "epoch": 0.5303138204434316, + "grad_norm": 0.40234375, + "learning_rate": 2.7106123844593795e-05, + "loss": 0.2198, + "step": 11170 + }, + { + "epoch": 0.5305512035322604, + "grad_norm": 0.4921875, + "learning_rate": 2.708846316434879e-05, + "loss": 0.2239, + "step": 11175 + }, + { + "epoch": 0.5307885866210891, + "grad_norm": 0.474609375, + "learning_rate": 2.7070802737732053e-05, + "loss": 0.2256, + "step": 11180 + }, + { + "epoch": 0.5310259697099179, + "grad_norm": 0.546875, + "learning_rate": 2.7053142575627638e-05, + "loss": 0.2271, + "step": 11185 + }, + { + "epoch": 0.5312633527987466, + "grad_norm": 0.498046875, + "learning_rate": 2.703548268891942e-05, + "loss": 0.223, + "step": 11190 + }, + { + "epoch": 0.5315007358875754, + "grad_norm": 0.44140625, + "learning_rate": 2.7017823088491118e-05, + "loss": 0.2235, + "step": 11195 + }, + { + "epoch": 0.5317381189764041, + "grad_norm": 0.54296875, + "learning_rate": 2.7000163785226246e-05, + "loss": 0.2244, + "step": 11200 + }, + { + "epoch": 0.5319755020652329, + "grad_norm": 0.435546875, + "learning_rate": 2.6982504790008167e-05, + "loss": 0.226, + "step": 11205 + }, + { + "epoch": 0.5322128851540616, + "grad_norm": 0.4453125, + "learning_rate": 2.6964846113720027e-05, + "loss": 0.2221, + "step": 11210 + }, + { + "epoch": 0.5324502682428903, + "grad_norm": 0.40234375, + "learning_rate": 2.6947187767244798e-05, + "loss": 0.2244, + "step": 11215 + }, + { + "epoch": 0.5326876513317191, + "grad_norm": 0.443359375, + "learning_rate": 2.6929529761465235e-05, + "loss": 0.2262, + "step": 11220 + }, + { + "epoch": 0.5329250344205478, + "grad_norm": 0.5703125, + "learning_rate": 2.6911872107263874e-05, + "loss": 0.2243, + "step": 11225 + }, + { + "epoch": 0.5331624175093767, + "grad_norm": 0.49609375, + "learning_rate": 2.6894214815523073e-05, + "loss": 0.2255, + "step": 11230 + }, + { + "epoch": 0.5333998005982054, + "grad_norm": 0.4609375, + "learning_rate": 2.6876557897124916e-05, + "loss": 0.2271, + "step": 11235 + }, + { + "epoch": 0.5336371836870342, + "grad_norm": 0.478515625, + "learning_rate": 2.68589013629513e-05, + "loss": 0.2245, + "step": 11240 + }, + { + "epoch": 0.5338745667758629, + "grad_norm": 0.4375, + "learning_rate": 2.6841245223883855e-05, + "loss": 0.2226, + "step": 11245 + }, + { + "epoch": 0.5341119498646917, + "grad_norm": 0.58984375, + "learning_rate": 2.6823589490803987e-05, + "loss": 0.2253, + "step": 11250 + }, + { + "epoch": 0.5343493329535204, + "grad_norm": 0.51953125, + "learning_rate": 2.680593417459283e-05, + "loss": 0.2267, + "step": 11255 + }, + { + "epoch": 0.5345867160423492, + "grad_norm": 0.47265625, + "learning_rate": 2.678827928613129e-05, + "loss": 0.2278, + "step": 11260 + }, + { + "epoch": 0.5348240991311779, + "grad_norm": 0.53125, + "learning_rate": 2.6770624836300003e-05, + "loss": 0.2228, + "step": 11265 + }, + { + "epoch": 0.5350614822200066, + "grad_norm": 0.4453125, + "learning_rate": 2.675297083597929e-05, + "loss": 0.2217, + "step": 11270 + }, + { + "epoch": 0.5352988653088354, + "grad_norm": 0.439453125, + "learning_rate": 2.673531729604927e-05, + "loss": 0.2235, + "step": 11275 + }, + { + "epoch": 0.5355362483976641, + "grad_norm": 0.453125, + "learning_rate": 2.6717664227389716e-05, + "loss": 0.2226, + "step": 11280 + }, + { + "epoch": 0.5357736314864929, + "grad_norm": 0.44140625, + "learning_rate": 2.6700011640880146e-05, + "loss": 0.2279, + "step": 11285 + }, + { + "epoch": 0.5360110145753216, + "grad_norm": 0.46484375, + "learning_rate": 2.6682359547399753e-05, + "loss": 0.2246, + "step": 11290 + }, + { + "epoch": 0.5362483976641504, + "grad_norm": 0.419921875, + "learning_rate": 2.6664707957827467e-05, + "loss": 0.2238, + "step": 11295 + }, + { + "epoch": 0.5364857807529791, + "grad_norm": 0.5390625, + "learning_rate": 2.6647056883041853e-05, + "loss": 0.2291, + "step": 11300 + }, + { + "epoch": 0.536723163841808, + "grad_norm": 0.412109375, + "learning_rate": 2.662940633392122e-05, + "loss": 0.2265, + "step": 11305 + }, + { + "epoch": 0.5369605469306367, + "grad_norm": 0.396484375, + "learning_rate": 2.66117563213435e-05, + "loss": 0.2214, + "step": 11310 + }, + { + "epoch": 0.5371979300194654, + "grad_norm": 0.625, + "learning_rate": 2.6594106856186317e-05, + "loss": 0.2252, + "step": 11315 + }, + { + "epoch": 0.5374353131082942, + "grad_norm": 0.462890625, + "learning_rate": 2.657645794932696e-05, + "loss": 0.2233, + "step": 11320 + }, + { + "epoch": 0.5376726961971229, + "grad_norm": 0.51171875, + "learning_rate": 2.6558809611642364e-05, + "loss": 0.2271, + "step": 11325 + }, + { + "epoch": 0.5379100792859517, + "grad_norm": 0.46875, + "learning_rate": 2.6541161854009134e-05, + "loss": 0.2229, + "step": 11330 + }, + { + "epoch": 0.5381474623747804, + "grad_norm": 0.478515625, + "learning_rate": 2.6523514687303485e-05, + "loss": 0.2257, + "step": 11335 + }, + { + "epoch": 0.5383848454636092, + "grad_norm": 0.421875, + "learning_rate": 2.6505868122401295e-05, + "loss": 0.2274, + "step": 11340 + }, + { + "epoch": 0.5386222285524379, + "grad_norm": 0.421875, + "learning_rate": 2.6488222170178062e-05, + "loss": 0.2194, + "step": 11345 + }, + { + "epoch": 0.5388596116412667, + "grad_norm": 0.4765625, + "learning_rate": 2.6470576841508905e-05, + "loss": 0.2299, + "step": 11350 + }, + { + "epoch": 0.5390969947300954, + "grad_norm": 0.48046875, + "learning_rate": 2.645293214726856e-05, + "loss": 0.2234, + "step": 11355 + }, + { + "epoch": 0.5393343778189241, + "grad_norm": 0.447265625, + "learning_rate": 2.6435288098331372e-05, + "loss": 0.2228, + "step": 11360 + }, + { + "epoch": 0.5395717609077529, + "grad_norm": 0.486328125, + "learning_rate": 2.6417644705571286e-05, + "loss": 0.2205, + "step": 11365 + }, + { + "epoch": 0.5398091439965816, + "grad_norm": 0.53125, + "learning_rate": 2.6400001979861843e-05, + "loss": 0.2261, + "step": 11370 + }, + { + "epoch": 0.5400465270854105, + "grad_norm": 0.458984375, + "learning_rate": 2.6382359932076185e-05, + "loss": 0.2278, + "step": 11375 + }, + { + "epoch": 0.5402839101742392, + "grad_norm": 0.462890625, + "learning_rate": 2.636471857308701e-05, + "loss": 0.224, + "step": 11380 + }, + { + "epoch": 0.540521293263068, + "grad_norm": 0.44140625, + "learning_rate": 2.6347077913766622e-05, + "loss": 0.2273, + "step": 11385 + }, + { + "epoch": 0.5407586763518967, + "grad_norm": 0.38671875, + "learning_rate": 2.632943796498687e-05, + "loss": 0.2232, + "step": 11390 + }, + { + "epoch": 0.5409960594407255, + "grad_norm": 0.4453125, + "learning_rate": 2.631179873761918e-05, + "loss": 0.2233, + "step": 11395 + }, + { + "epoch": 0.5412334425295542, + "grad_norm": 0.55078125, + "learning_rate": 2.6294160242534522e-05, + "loss": 0.2255, + "step": 11400 + }, + { + "epoch": 0.541470825618383, + "grad_norm": 0.447265625, + "learning_rate": 2.627652249060343e-05, + "loss": 0.224, + "step": 11405 + }, + { + "epoch": 0.5417082087072117, + "grad_norm": 0.494140625, + "learning_rate": 2.625888549269596e-05, + "loss": 0.2237, + "step": 11410 + }, + { + "epoch": 0.5419455917960404, + "grad_norm": 0.490234375, + "learning_rate": 2.624124925968171e-05, + "loss": 0.2257, + "step": 11415 + }, + { + "epoch": 0.5421829748848692, + "grad_norm": 0.484375, + "learning_rate": 2.6223613802429832e-05, + "loss": 0.2239, + "step": 11420 + }, + { + "epoch": 0.5424203579736979, + "grad_norm": 0.482421875, + "learning_rate": 2.6205979131808944e-05, + "loss": 0.2224, + "step": 11425 + }, + { + "epoch": 0.5426577410625267, + "grad_norm": 0.46484375, + "learning_rate": 2.618834525868725e-05, + "loss": 0.2232, + "step": 11430 + }, + { + "epoch": 0.5428951241513554, + "grad_norm": 0.443359375, + "learning_rate": 2.6170712193932394e-05, + "loss": 0.2195, + "step": 11435 + }, + { + "epoch": 0.5431325072401842, + "grad_norm": 0.41796875, + "learning_rate": 2.6153079948411584e-05, + "loss": 0.2186, + "step": 11440 + }, + { + "epoch": 0.543369890329013, + "grad_norm": 0.5234375, + "learning_rate": 2.6135448532991462e-05, + "loss": 0.2258, + "step": 11445 + }, + { + "epoch": 0.5436072734178418, + "grad_norm": 0.44140625, + "learning_rate": 2.6117817958538217e-05, + "loss": 0.226, + "step": 11450 + }, + { + "epoch": 0.5438446565066705, + "grad_norm": 0.4140625, + "learning_rate": 2.610018823591746e-05, + "loss": 0.2256, + "step": 11455 + }, + { + "epoch": 0.5440820395954992, + "grad_norm": 0.51953125, + "learning_rate": 2.6082559375994352e-05, + "loss": 0.2225, + "step": 11460 + }, + { + "epoch": 0.544319422684328, + "grad_norm": 0.578125, + "learning_rate": 2.6064931389633445e-05, + "loss": 0.2265, + "step": 11465 + }, + { + "epoch": 0.5445568057731567, + "grad_norm": 0.453125, + "learning_rate": 2.6047304287698794e-05, + "loss": 0.2248, + "step": 11470 + }, + { + "epoch": 0.5447941888619855, + "grad_norm": 0.42578125, + "learning_rate": 2.6029678081053903e-05, + "loss": 0.2227, + "step": 11475 + }, + { + "epoch": 0.5450315719508142, + "grad_norm": 0.484375, + "learning_rate": 2.6012052780561728e-05, + "loss": 0.2237, + "step": 11480 + }, + { + "epoch": 0.545268955039643, + "grad_norm": 0.431640625, + "learning_rate": 2.5994428397084658e-05, + "loss": 0.2266, + "step": 11485 + }, + { + "epoch": 0.5455063381284717, + "grad_norm": 0.443359375, + "learning_rate": 2.5976804941484516e-05, + "loss": 0.2272, + "step": 11490 + }, + { + "epoch": 0.5457437212173005, + "grad_norm": 0.5234375, + "learning_rate": 2.5959182424622568e-05, + "loss": 0.225, + "step": 11495 + }, + { + "epoch": 0.5459811043061292, + "grad_norm": 0.4453125, + "learning_rate": 2.5941560857359488e-05, + "loss": 0.2248, + "step": 11500 + }, + { + "epoch": 0.5462184873949579, + "grad_norm": 0.46484375, + "learning_rate": 2.592394025055536e-05, + "loss": 0.2245, + "step": 11505 + }, + { + "epoch": 0.5464558704837867, + "grad_norm": 0.4296875, + "learning_rate": 2.590632061506969e-05, + "loss": 0.2217, + "step": 11510 + }, + { + "epoch": 0.5466932535726154, + "grad_norm": 0.5234375, + "learning_rate": 2.5888701961761376e-05, + "loss": 0.2229, + "step": 11515 + }, + { + "epoch": 0.5469306366614443, + "grad_norm": 0.4296875, + "learning_rate": 2.587108430148872e-05, + "loss": 0.222, + "step": 11520 + }, + { + "epoch": 0.547168019750273, + "grad_norm": 0.44921875, + "learning_rate": 2.5853467645109392e-05, + "loss": 0.2294, + "step": 11525 + }, + { + "epoch": 0.5474054028391018, + "grad_norm": 0.55078125, + "learning_rate": 2.5835852003480475e-05, + "loss": 0.2242, + "step": 11530 + }, + { + "epoch": 0.5476427859279305, + "grad_norm": 0.494140625, + "learning_rate": 2.58182373874584e-05, + "loss": 0.2264, + "step": 11535 + }, + { + "epoch": 0.5478801690167593, + "grad_norm": 0.427734375, + "learning_rate": 2.5800623807898976e-05, + "loss": 0.2222, + "step": 11540 + }, + { + "epoch": 0.548117552105588, + "grad_norm": 0.423828125, + "learning_rate": 2.5783011275657366e-05, + "loss": 0.2197, + "step": 11545 + }, + { + "epoch": 0.5483549351944168, + "grad_norm": 0.47265625, + "learning_rate": 2.5765399801588108e-05, + "loss": 0.2267, + "step": 11550 + }, + { + "epoch": 0.5485923182832455, + "grad_norm": 0.408203125, + "learning_rate": 2.574778939654507e-05, + "loss": 0.22, + "step": 11555 + }, + { + "epoch": 0.5488297013720742, + "grad_norm": 0.42578125, + "learning_rate": 2.5730180071381437e-05, + "loss": 0.2242, + "step": 11560 + }, + { + "epoch": 0.549067084460903, + "grad_norm": 0.43359375, + "learning_rate": 2.5712571836949802e-05, + "loss": 0.2232, + "step": 11565 + }, + { + "epoch": 0.5493044675497317, + "grad_norm": 0.41015625, + "learning_rate": 2.5694964704101998e-05, + "loss": 0.2232, + "step": 11570 + }, + { + "epoch": 0.5495418506385605, + "grad_norm": 0.396484375, + "learning_rate": 2.567735868368925e-05, + "loss": 0.225, + "step": 11575 + }, + { + "epoch": 0.5497792337273892, + "grad_norm": 0.435546875, + "learning_rate": 2.565975378656204e-05, + "loss": 0.2249, + "step": 11580 + }, + { + "epoch": 0.550016616816218, + "grad_norm": 0.40234375, + "learning_rate": 2.5642150023570226e-05, + "loss": 0.2238, + "step": 11585 + }, + { + "epoch": 0.5502539999050468, + "grad_norm": 0.44140625, + "learning_rate": 2.5624547405562892e-05, + "loss": 0.2237, + "step": 11590 + }, + { + "epoch": 0.5504913829938756, + "grad_norm": 0.4609375, + "learning_rate": 2.5606945943388468e-05, + "loss": 0.2258, + "step": 11595 + }, + { + "epoch": 0.5507287660827043, + "grad_norm": 0.49609375, + "learning_rate": 2.5589345647894643e-05, + "loss": 0.2217, + "step": 11600 + }, + { + "epoch": 0.550966149171533, + "grad_norm": 0.41796875, + "learning_rate": 2.5571746529928413e-05, + "loss": 0.2199, + "step": 11605 + }, + { + "epoch": 0.5512035322603618, + "grad_norm": 0.486328125, + "learning_rate": 2.5554148600336032e-05, + "loss": 0.2254, + "step": 11610 + }, + { + "epoch": 0.5514409153491905, + "grad_norm": 0.447265625, + "learning_rate": 2.5536551869963015e-05, + "loss": 0.2245, + "step": 11615 + }, + { + "epoch": 0.5516782984380193, + "grad_norm": 0.482421875, + "learning_rate": 2.551895634965415e-05, + "loss": 0.2252, + "step": 11620 + }, + { + "epoch": 0.551915681526848, + "grad_norm": 0.470703125, + "learning_rate": 2.550136205025348e-05, + "loss": 0.2262, + "step": 11625 + }, + { + "epoch": 0.5521530646156768, + "grad_norm": 0.4765625, + "learning_rate": 2.5483768982604288e-05, + "loss": 0.2296, + "step": 11630 + }, + { + "epoch": 0.5523904477045055, + "grad_norm": 0.443359375, + "learning_rate": 2.5466177157549108e-05, + "loss": 0.2245, + "step": 11635 + }, + { + "epoch": 0.5526278307933343, + "grad_norm": 0.46484375, + "learning_rate": 2.5448586585929695e-05, + "loss": 0.2258, + "step": 11640 + }, + { + "epoch": 0.552865213882163, + "grad_norm": 0.44140625, + "learning_rate": 2.5430997278587042e-05, + "loss": 0.2261, + "step": 11645 + }, + { + "epoch": 0.5531025969709917, + "grad_norm": 0.447265625, + "learning_rate": 2.541340924636136e-05, + "loss": 0.2249, + "step": 11650 + }, + { + "epoch": 0.5533399800598205, + "grad_norm": 0.4609375, + "learning_rate": 2.539582250009207e-05, + "loss": 0.2263, + "step": 11655 + }, + { + "epoch": 0.5535773631486492, + "grad_norm": 0.4375, + "learning_rate": 2.53782370506178e-05, + "loss": 0.2247, + "step": 11660 + }, + { + "epoch": 0.5538147462374781, + "grad_norm": 0.365234375, + "learning_rate": 2.53606529087764e-05, + "loss": 0.2237, + "step": 11665 + }, + { + "epoch": 0.5540521293263068, + "grad_norm": 0.4921875, + "learning_rate": 2.5343070085404864e-05, + "loss": 0.2207, + "step": 11670 + }, + { + "epoch": 0.5542895124151356, + "grad_norm": 0.47265625, + "learning_rate": 2.532548859133944e-05, + "loss": 0.2232, + "step": 11675 + }, + { + "epoch": 0.5545268955039643, + "grad_norm": 0.404296875, + "learning_rate": 2.5307908437415502e-05, + "loss": 0.2257, + "step": 11680 + }, + { + "epoch": 0.5547642785927931, + "grad_norm": 0.45703125, + "learning_rate": 2.5290329634467634e-05, + "loss": 0.2231, + "step": 11685 + }, + { + "epoch": 0.5550016616816218, + "grad_norm": 0.3828125, + "learning_rate": 2.5272752193329553e-05, + "loss": 0.2261, + "step": 11690 + }, + { + "epoch": 0.5552390447704506, + "grad_norm": 0.490234375, + "learning_rate": 2.525517612483418e-05, + "loss": 0.2213, + "step": 11695 + }, + { + "epoch": 0.5554764278592793, + "grad_norm": 0.466796875, + "learning_rate": 2.523760143981354e-05, + "loss": 0.2247, + "step": 11700 + }, + { + "epoch": 0.555713810948108, + "grad_norm": 0.4296875, + "learning_rate": 2.5220028149098862e-05, + "loss": 0.2244, + "step": 11705 + }, + { + "epoch": 0.5559511940369368, + "grad_norm": 0.494140625, + "learning_rate": 2.520245626352046e-05, + "loss": 0.2207, + "step": 11710 + }, + { + "epoch": 0.5561885771257655, + "grad_norm": 0.388671875, + "learning_rate": 2.518488579390781e-05, + "loss": 0.2221, + "step": 11715 + }, + { + "epoch": 0.5564259602145943, + "grad_norm": 0.46484375, + "learning_rate": 2.516731675108952e-05, + "loss": 0.2294, + "step": 11720 + }, + { + "epoch": 0.556663343303423, + "grad_norm": 0.41015625, + "learning_rate": 2.514974914589331e-05, + "loss": 0.2204, + "step": 11725 + }, + { + "epoch": 0.5569007263922519, + "grad_norm": 0.4765625, + "learning_rate": 2.5132182989146015e-05, + "loss": 0.2252, + "step": 11730 + }, + { + "epoch": 0.5571381094810806, + "grad_norm": 0.486328125, + "learning_rate": 2.511461829167357e-05, + "loss": 0.2202, + "step": 11735 + }, + { + "epoch": 0.5573754925699094, + "grad_norm": 0.5625, + "learning_rate": 2.5097055064301033e-05, + "loss": 0.2316, + "step": 11740 + }, + { + "epoch": 0.5576128756587381, + "grad_norm": 0.4453125, + "learning_rate": 2.5079493317852526e-05, + "loss": 0.2229, + "step": 11745 + }, + { + "epoch": 0.5578502587475668, + "grad_norm": 0.45703125, + "learning_rate": 2.506193306315129e-05, + "loss": 0.2219, + "step": 11750 + }, + { + "epoch": 0.5580876418363956, + "grad_norm": 0.447265625, + "learning_rate": 2.5044374311019615e-05, + "loss": 0.2227, + "step": 11755 + }, + { + "epoch": 0.5583250249252243, + "grad_norm": 0.484375, + "learning_rate": 2.502681707227888e-05, + "loss": 0.228, + "step": 11760 + }, + { + "epoch": 0.5585624080140531, + "grad_norm": 0.50390625, + "learning_rate": 2.5009261357749547e-05, + "loss": 0.2256, + "step": 11765 + }, + { + "epoch": 0.5587997911028818, + "grad_norm": 0.53515625, + "learning_rate": 2.4991707178251107e-05, + "loss": 0.2234, + "step": 11770 + }, + { + "epoch": 0.5590371741917106, + "grad_norm": 0.5234375, + "learning_rate": 2.497415454460213e-05, + "loss": 0.2231, + "step": 11775 + }, + { + "epoch": 0.5592745572805393, + "grad_norm": 0.5, + "learning_rate": 2.4956603467620214e-05, + "loss": 0.2222, + "step": 11780 + }, + { + "epoch": 0.5595119403693681, + "grad_norm": 0.431640625, + "learning_rate": 2.493905395812202e-05, + "loss": 0.2272, + "step": 11785 + }, + { + "epoch": 0.5597493234581968, + "grad_norm": 0.458984375, + "learning_rate": 2.4921506026923232e-05, + "loss": 0.2209, + "step": 11790 + }, + { + "epoch": 0.5599867065470255, + "grad_norm": 0.421875, + "learning_rate": 2.4903959684838547e-05, + "loss": 0.222, + "step": 11795 + }, + { + "epoch": 0.5602240896358543, + "grad_norm": 0.466796875, + "learning_rate": 2.488641494268171e-05, + "loss": 0.224, + "step": 11800 + }, + { + "epoch": 0.560461472724683, + "grad_norm": 0.4140625, + "learning_rate": 2.486887181126545e-05, + "loss": 0.2275, + "step": 11805 + }, + { + "epoch": 0.5606988558135119, + "grad_norm": 0.482421875, + "learning_rate": 2.4851330301401542e-05, + "loss": 0.2256, + "step": 11810 + }, + { + "epoch": 0.5609362389023406, + "grad_norm": 0.4765625, + "learning_rate": 2.483379042390071e-05, + "loss": 0.2236, + "step": 11815 + }, + { + "epoch": 0.5611736219911694, + "grad_norm": 0.51171875, + "learning_rate": 2.4816252189572726e-05, + "loss": 0.2249, + "step": 11820 + }, + { + "epoch": 0.5614110050799981, + "grad_norm": 0.498046875, + "learning_rate": 2.4798715609226297e-05, + "loss": 0.2235, + "step": 11825 + }, + { + "epoch": 0.5616483881688269, + "grad_norm": 0.458984375, + "learning_rate": 2.4781180693669177e-05, + "loss": 0.226, + "step": 11830 + }, + { + "epoch": 0.5618857712576556, + "grad_norm": 0.44921875, + "learning_rate": 2.4763647453708018e-05, + "loss": 0.2235, + "step": 11835 + }, + { + "epoch": 0.5621231543464844, + "grad_norm": 0.45703125, + "learning_rate": 2.4746115900148497e-05, + "loss": 0.2269, + "step": 11840 + }, + { + "epoch": 0.5623605374353131, + "grad_norm": 0.474609375, + "learning_rate": 2.4728586043795216e-05, + "loss": 0.2237, + "step": 11845 + }, + { + "epoch": 0.5625979205241418, + "grad_norm": 0.458984375, + "learning_rate": 2.471105789545176e-05, + "loss": 0.2245, + "step": 11850 + }, + { + "epoch": 0.5628353036129706, + "grad_norm": 0.5078125, + "learning_rate": 2.4693531465920645e-05, + "loss": 0.2203, + "step": 11855 + }, + { + "epoch": 0.5630726867017993, + "grad_norm": 0.416015625, + "learning_rate": 2.4676006766003318e-05, + "loss": 0.2258, + "step": 11860 + }, + { + "epoch": 0.5633100697906281, + "grad_norm": 0.486328125, + "learning_rate": 2.4658483806500183e-05, + "loss": 0.2223, + "step": 11865 + }, + { + "epoch": 0.5635474528794568, + "grad_norm": 0.412109375, + "learning_rate": 2.4640962598210554e-05, + "loss": 0.2273, + "step": 11870 + }, + { + "epoch": 0.5637848359682857, + "grad_norm": 0.40625, + "learning_rate": 2.4623443151932672e-05, + "loss": 0.2212, + "step": 11875 + }, + { + "epoch": 0.5640222190571144, + "grad_norm": 0.41015625, + "learning_rate": 2.4605925478463697e-05, + "loss": 0.2253, + "step": 11880 + }, + { + "epoch": 0.5642596021459432, + "grad_norm": 0.51171875, + "learning_rate": 2.4588409588599696e-05, + "loss": 0.2211, + "step": 11885 + }, + { + "epoch": 0.5644969852347719, + "grad_norm": 0.390625, + "learning_rate": 2.457089549313562e-05, + "loss": 0.2225, + "step": 11890 + }, + { + "epoch": 0.5647343683236006, + "grad_norm": 0.4140625, + "learning_rate": 2.455338320286533e-05, + "loss": 0.2251, + "step": 11895 + }, + { + "epoch": 0.5649717514124294, + "grad_norm": 0.3984375, + "learning_rate": 2.4535872728581583e-05, + "loss": 0.2235, + "step": 11900 + }, + { + "epoch": 0.5652091345012581, + "grad_norm": 0.447265625, + "learning_rate": 2.4518364081075985e-05, + "loss": 0.2235, + "step": 11905 + }, + { + "epoch": 0.5654465175900869, + "grad_norm": 0.458984375, + "learning_rate": 2.4500857271139048e-05, + "loss": 0.2259, + "step": 11910 + }, + { + "epoch": 0.5656839006789156, + "grad_norm": 0.490234375, + "learning_rate": 2.4483352309560132e-05, + "loss": 0.2231, + "step": 11915 + }, + { + "epoch": 0.5659212837677444, + "grad_norm": 0.462890625, + "learning_rate": 2.4465849207127477e-05, + "loss": 0.2226, + "step": 11920 + }, + { + "epoch": 0.5661586668565731, + "grad_norm": 0.490234375, + "learning_rate": 2.4448347974628154e-05, + "loss": 0.2273, + "step": 11925 + }, + { + "epoch": 0.5663960499454019, + "grad_norm": 0.3984375, + "learning_rate": 2.44308486228481e-05, + "loss": 0.2235, + "step": 11930 + }, + { + "epoch": 0.5666334330342306, + "grad_norm": 0.416015625, + "learning_rate": 2.4413351162572086e-05, + "loss": 0.2205, + "step": 11935 + }, + { + "epoch": 0.5668708161230593, + "grad_norm": 0.400390625, + "learning_rate": 2.4395855604583713e-05, + "loss": 0.2229, + "step": 11940 + }, + { + "epoch": 0.5671081992118882, + "grad_norm": 0.4453125, + "learning_rate": 2.437836195966542e-05, + "loss": 0.2247, + "step": 11945 + }, + { + "epoch": 0.5673455823007169, + "grad_norm": 0.62109375, + "learning_rate": 2.4360870238598438e-05, + "loss": 0.2254, + "step": 11950 + }, + { + "epoch": 0.5675829653895457, + "grad_norm": 0.46484375, + "learning_rate": 2.4343380452162874e-05, + "loss": 0.2246, + "step": 11955 + }, + { + "epoch": 0.5678203484783744, + "grad_norm": 0.40625, + "learning_rate": 2.4325892611137563e-05, + "loss": 0.2244, + "step": 11960 + }, + { + "epoch": 0.5680577315672032, + "grad_norm": 0.41796875, + "learning_rate": 2.4308406726300224e-05, + "loss": 0.2256, + "step": 11965 + }, + { + "epoch": 0.5682951146560319, + "grad_norm": 0.439453125, + "learning_rate": 2.4290922808427284e-05, + "loss": 0.225, + "step": 11970 + }, + { + "epoch": 0.5685324977448607, + "grad_norm": 0.443359375, + "learning_rate": 2.4273440868294047e-05, + "loss": 0.2218, + "step": 11975 + }, + { + "epoch": 0.5687698808336894, + "grad_norm": 0.451171875, + "learning_rate": 2.4255960916674512e-05, + "loss": 0.2252, + "step": 11980 + }, + { + "epoch": 0.5690072639225182, + "grad_norm": 0.4765625, + "learning_rate": 2.4238482964341524e-05, + "loss": 0.2229, + "step": 11985 + }, + { + "epoch": 0.5692446470113469, + "grad_norm": 0.50390625, + "learning_rate": 2.4221007022066648e-05, + "loss": 0.2229, + "step": 11990 + }, + { + "epoch": 0.5694820301001756, + "grad_norm": 0.51171875, + "learning_rate": 2.420353310062024e-05, + "loss": 0.2279, + "step": 11995 + }, + { + "epoch": 0.5697194131890044, + "grad_norm": 0.435546875, + "learning_rate": 2.4186061210771388e-05, + "loss": 0.2201, + "step": 12000 + }, + { + "epoch": 0.5699567962778331, + "grad_norm": 0.4609375, + "learning_rate": 2.416859136328794e-05, + "loss": 0.2251, + "step": 12005 + }, + { + "epoch": 0.570194179366662, + "grad_norm": 0.443359375, + "learning_rate": 2.415112356893649e-05, + "loss": 0.2252, + "step": 12010 + }, + { + "epoch": 0.5704315624554906, + "grad_norm": 0.44921875, + "learning_rate": 2.4133657838482348e-05, + "loss": 0.2227, + "step": 12015 + }, + { + "epoch": 0.5706689455443195, + "grad_norm": 0.484375, + "learning_rate": 2.4116194182689582e-05, + "loss": 0.2224, + "step": 12020 + }, + { + "epoch": 0.5709063286331482, + "grad_norm": 0.38671875, + "learning_rate": 2.4098732612320945e-05, + "loss": 0.2256, + "step": 12025 + }, + { + "epoch": 0.571143711721977, + "grad_norm": 0.5859375, + "learning_rate": 2.4081273138137943e-05, + "loss": 0.2253, + "step": 12030 + }, + { + "epoch": 0.5713810948108057, + "grad_norm": 0.478515625, + "learning_rate": 2.406381577090075e-05, + "loss": 0.2248, + "step": 12035 + }, + { + "epoch": 0.5716184778996344, + "grad_norm": 0.451171875, + "learning_rate": 2.404636052136828e-05, + "loss": 0.2262, + "step": 12040 + }, + { + "epoch": 0.5718558609884632, + "grad_norm": 0.5546875, + "learning_rate": 2.4028907400298113e-05, + "loss": 0.2226, + "step": 12045 + }, + { + "epoch": 0.5720932440772919, + "grad_norm": 0.69140625, + "learning_rate": 2.4011456418446526e-05, + "loss": 0.2277, + "step": 12050 + }, + { + "epoch": 0.5723306271661207, + "grad_norm": 0.54296875, + "learning_rate": 2.3994007586568497e-05, + "loss": 0.2231, + "step": 12055 + }, + { + "epoch": 0.5725680102549494, + "grad_norm": 0.55078125, + "learning_rate": 2.3976560915417635e-05, + "loss": 0.2227, + "step": 12060 + }, + { + "epoch": 0.5728053933437782, + "grad_norm": 0.482421875, + "learning_rate": 2.395911641574627e-05, + "loss": 0.2264, + "step": 12065 + }, + { + "epoch": 0.5730427764326069, + "grad_norm": 0.45703125, + "learning_rate": 2.394167409830535e-05, + "loss": 0.2237, + "step": 12070 + }, + { + "epoch": 0.5732801595214357, + "grad_norm": 0.494140625, + "learning_rate": 2.3924233973844507e-05, + "loss": 0.2289, + "step": 12075 + }, + { + "epoch": 0.5735175426102644, + "grad_norm": 0.43359375, + "learning_rate": 2.3906796053111997e-05, + "loss": 0.2226, + "step": 12080 + }, + { + "epoch": 0.5737549256990931, + "grad_norm": 0.5, + "learning_rate": 2.388936034685475e-05, + "loss": 0.2238, + "step": 12085 + }, + { + "epoch": 0.573992308787922, + "grad_norm": 0.400390625, + "learning_rate": 2.387192686581829e-05, + "loss": 0.2206, + "step": 12090 + }, + { + "epoch": 0.5742296918767507, + "grad_norm": 0.439453125, + "learning_rate": 2.3854495620746825e-05, + "loss": 0.2217, + "step": 12095 + }, + { + "epoch": 0.5744670749655795, + "grad_norm": 0.43359375, + "learning_rate": 2.383706662238312e-05, + "loss": 0.2224, + "step": 12100 + }, + { + "epoch": 0.5747044580544082, + "grad_norm": 0.51953125, + "learning_rate": 2.3819639881468592e-05, + "loss": 0.2233, + "step": 12105 + }, + { + "epoch": 0.574941841143237, + "grad_norm": 0.466796875, + "learning_rate": 2.3802215408743282e-05, + "loss": 0.2258, + "step": 12110 + }, + { + "epoch": 0.5751792242320657, + "grad_norm": 0.53125, + "learning_rate": 2.378479321494579e-05, + "loss": 0.2279, + "step": 12115 + }, + { + "epoch": 0.5754166073208945, + "grad_norm": 0.58203125, + "learning_rate": 2.376737331081335e-05, + "loss": 0.2286, + "step": 12120 + }, + { + "epoch": 0.5756539904097232, + "grad_norm": 0.51171875, + "learning_rate": 2.3749955707081767e-05, + "loss": 0.2239, + "step": 12125 + }, + { + "epoch": 0.575891373498552, + "grad_norm": 0.474609375, + "learning_rate": 2.3732540414485428e-05, + "loss": 0.2243, + "step": 12130 + }, + { + "epoch": 0.5761287565873807, + "grad_norm": 0.45703125, + "learning_rate": 2.3715127443757295e-05, + "loss": 0.2225, + "step": 12135 + }, + { + "epoch": 0.5763661396762094, + "grad_norm": 0.416015625, + "learning_rate": 2.3697716805628912e-05, + "loss": 0.2217, + "step": 12140 + }, + { + "epoch": 0.5766035227650382, + "grad_norm": 0.412109375, + "learning_rate": 2.3680308510830374e-05, + "loss": 0.2229, + "step": 12145 + }, + { + "epoch": 0.5768409058538669, + "grad_norm": 0.54296875, + "learning_rate": 2.3662902570090323e-05, + "loss": 0.2222, + "step": 12150 + }, + { + "epoch": 0.5770782889426957, + "grad_norm": 0.404296875, + "learning_rate": 2.3645498994135977e-05, + "loss": 0.2269, + "step": 12155 + }, + { + "epoch": 0.5773156720315245, + "grad_norm": 0.42578125, + "learning_rate": 2.362809779369307e-05, + "loss": 0.2253, + "step": 12160 + }, + { + "epoch": 0.5775530551203533, + "grad_norm": 0.4296875, + "learning_rate": 2.361069897948589e-05, + "loss": 0.2254, + "step": 12165 + }, + { + "epoch": 0.577790438209182, + "grad_norm": 0.470703125, + "learning_rate": 2.3593302562237237e-05, + "loss": 0.2233, + "step": 12170 + }, + { + "epoch": 0.5780278212980108, + "grad_norm": 0.40234375, + "learning_rate": 2.3575908552668453e-05, + "loss": 0.2213, + "step": 12175 + }, + { + "epoch": 0.5782652043868395, + "grad_norm": 0.388671875, + "learning_rate": 2.3558516961499383e-05, + "loss": 0.2203, + "step": 12180 + }, + { + "epoch": 0.5785025874756682, + "grad_norm": 0.443359375, + "learning_rate": 2.3541127799448386e-05, + "loss": 0.2206, + "step": 12185 + }, + { + "epoch": 0.578739970564497, + "grad_norm": 0.462890625, + "learning_rate": 2.3523741077232337e-05, + "loss": 0.2261, + "step": 12190 + }, + { + "epoch": 0.5789773536533257, + "grad_norm": 0.369140625, + "learning_rate": 2.350635680556656e-05, + "loss": 0.2225, + "step": 12195 + }, + { + "epoch": 0.5792147367421545, + "grad_norm": 0.404296875, + "learning_rate": 2.3488974995164936e-05, + "loss": 0.2245, + "step": 12200 + }, + { + "epoch": 0.5794521198309832, + "grad_norm": 0.40625, + "learning_rate": 2.3471595656739776e-05, + "loss": 0.2222, + "step": 12205 + }, + { + "epoch": 0.579689502919812, + "grad_norm": 0.48828125, + "learning_rate": 2.3454218801001903e-05, + "loss": 0.227, + "step": 12210 + }, + { + "epoch": 0.5799268860086407, + "grad_norm": 0.42578125, + "learning_rate": 2.343684443866057e-05, + "loss": 0.2223, + "step": 12215 + }, + { + "epoch": 0.5801642690974695, + "grad_norm": 0.38671875, + "learning_rate": 2.3419472580423552e-05, + "loss": 0.2247, + "step": 12220 + }, + { + "epoch": 0.5804016521862982, + "grad_norm": 0.47265625, + "learning_rate": 2.3402103236997012e-05, + "loss": 0.2229, + "step": 12225 + }, + { + "epoch": 0.580639035275127, + "grad_norm": 0.3671875, + "learning_rate": 2.338473641908563e-05, + "loss": 0.2237, + "step": 12230 + }, + { + "epoch": 0.5808764183639558, + "grad_norm": 0.4296875, + "learning_rate": 2.3367372137392457e-05, + "loss": 0.219, + "step": 12235 + }, + { + "epoch": 0.5811138014527845, + "grad_norm": 0.4140625, + "learning_rate": 2.335001040261906e-05, + "loss": 0.2273, + "step": 12240 + }, + { + "epoch": 0.5813511845416133, + "grad_norm": 0.451171875, + "learning_rate": 2.333265122546538e-05, + "loss": 0.2258, + "step": 12245 + }, + { + "epoch": 0.581588567630442, + "grad_norm": 0.50390625, + "learning_rate": 2.331529461662979e-05, + "loss": 0.2221, + "step": 12250 + }, + { + "epoch": 0.5818259507192708, + "grad_norm": 0.404296875, + "learning_rate": 2.3297940586809107e-05, + "loss": 0.225, + "step": 12255 + }, + { + "epoch": 0.5820633338080995, + "grad_norm": 0.43359375, + "learning_rate": 2.3280589146698522e-05, + "loss": 0.2265, + "step": 12260 + }, + { + "epoch": 0.5823007168969283, + "grad_norm": 0.3984375, + "learning_rate": 2.326324030699167e-05, + "loss": 0.2245, + "step": 12265 + }, + { + "epoch": 0.582538099985757, + "grad_norm": 0.482421875, + "learning_rate": 2.3245894078380547e-05, + "loss": 0.2234, + "step": 12270 + }, + { + "epoch": 0.5827754830745858, + "grad_norm": 0.453125, + "learning_rate": 2.322855047155556e-05, + "loss": 0.2226, + "step": 12275 + }, + { + "epoch": 0.5830128661634145, + "grad_norm": 0.4375, + "learning_rate": 2.3211209497205487e-05, + "loss": 0.2205, + "step": 12280 + }, + { + "epoch": 0.5832502492522432, + "grad_norm": 0.4375, + "learning_rate": 2.3193871166017505e-05, + "loss": 0.2264, + "step": 12285 + }, + { + "epoch": 0.583487632341072, + "grad_norm": 0.427734375, + "learning_rate": 2.3176535488677143e-05, + "loss": 0.225, + "step": 12290 + }, + { + "epoch": 0.5837250154299007, + "grad_norm": 0.47265625, + "learning_rate": 2.3159202475868293e-05, + "loss": 0.2272, + "step": 12295 + }, + { + "epoch": 0.5839623985187296, + "grad_norm": 0.51953125, + "learning_rate": 2.3141872138273226e-05, + "loss": 0.2256, + "step": 12300 + }, + { + "epoch": 0.5841997816075583, + "grad_norm": 0.427734375, + "learning_rate": 2.312454448657253e-05, + "loss": 0.2263, + "step": 12305 + }, + { + "epoch": 0.5844371646963871, + "grad_norm": 0.470703125, + "learning_rate": 2.3107219531445185e-05, + "loss": 0.224, + "step": 12310 + }, + { + "epoch": 0.5846745477852158, + "grad_norm": 0.474609375, + "learning_rate": 2.3089897283568456e-05, + "loss": 0.2243, + "step": 12315 + }, + { + "epoch": 0.5849119308740446, + "grad_norm": 0.4765625, + "learning_rate": 2.3072577753617987e-05, + "loss": 0.2216, + "step": 12320 + }, + { + "epoch": 0.5851493139628733, + "grad_norm": 0.400390625, + "learning_rate": 2.3055260952267704e-05, + "loss": 0.2228, + "step": 12325 + }, + { + "epoch": 0.585386697051702, + "grad_norm": 0.431640625, + "learning_rate": 2.3037946890189894e-05, + "loss": 0.2275, + "step": 12330 + }, + { + "epoch": 0.5856240801405308, + "grad_norm": 0.458984375, + "learning_rate": 2.302063557805512e-05, + "loss": 0.2245, + "step": 12335 + }, + { + "epoch": 0.5858614632293595, + "grad_norm": 0.419921875, + "learning_rate": 2.3003327026532275e-05, + "loss": 0.2215, + "step": 12340 + }, + { + "epoch": 0.5860988463181883, + "grad_norm": 0.5078125, + "learning_rate": 2.2986021246288535e-05, + "loss": 0.224, + "step": 12345 + }, + { + "epoch": 0.586336229407017, + "grad_norm": 0.412109375, + "learning_rate": 2.2968718247989368e-05, + "loss": 0.2242, + "step": 12350 + }, + { + "epoch": 0.5865736124958458, + "grad_norm": 0.462890625, + "learning_rate": 2.295141804229855e-05, + "loss": 0.2206, + "step": 12355 + }, + { + "epoch": 0.5868109955846745, + "grad_norm": 0.451171875, + "learning_rate": 2.2934120639878093e-05, + "loss": 0.2226, + "step": 12360 + }, + { + "epoch": 0.5870483786735033, + "grad_norm": 0.416015625, + "learning_rate": 2.2916826051388342e-05, + "loss": 0.2252, + "step": 12365 + }, + { + "epoch": 0.587285761762332, + "grad_norm": 0.439453125, + "learning_rate": 2.2899534287487834e-05, + "loss": 0.2275, + "step": 12370 + }, + { + "epoch": 0.5875231448511608, + "grad_norm": 0.419921875, + "learning_rate": 2.2882245358833442e-05, + "loss": 0.2253, + "step": 12375 + }, + { + "epoch": 0.5877605279399896, + "grad_norm": 0.455078125, + "learning_rate": 2.2864959276080228e-05, + "loss": 0.2235, + "step": 12380 + }, + { + "epoch": 0.5879979110288183, + "grad_norm": 0.4140625, + "learning_rate": 2.2847676049881546e-05, + "loss": 0.2215, + "step": 12385 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.48046875, + "learning_rate": 2.283039569088895e-05, + "loss": 0.2256, + "step": 12390 + }, + { + "epoch": 0.5884726772064758, + "grad_norm": 0.50390625, + "learning_rate": 2.2813118209752256e-05, + "loss": 0.2193, + "step": 12395 + }, + { + "epoch": 0.5887100602953046, + "grad_norm": 0.443359375, + "learning_rate": 2.2795843617119495e-05, + "loss": 0.2236, + "step": 12400 + }, + { + "epoch": 0.5889474433841333, + "grad_norm": 0.48046875, + "learning_rate": 2.277857192363692e-05, + "loss": 0.2247, + "step": 12405 + }, + { + "epoch": 0.5891848264729621, + "grad_norm": 0.458984375, + "learning_rate": 2.276130313994901e-05, + "loss": 0.2281, + "step": 12410 + }, + { + "epoch": 0.5894222095617908, + "grad_norm": 0.498046875, + "learning_rate": 2.274403727669841e-05, + "loss": 0.2284, + "step": 12415 + }, + { + "epoch": 0.5896595926506196, + "grad_norm": 0.474609375, + "learning_rate": 2.272677434452602e-05, + "loss": 0.2243, + "step": 12420 + }, + { + "epoch": 0.5898969757394483, + "grad_norm": 0.39453125, + "learning_rate": 2.2709514354070893e-05, + "loss": 0.224, + "step": 12425 + }, + { + "epoch": 0.590134358828277, + "grad_norm": 0.498046875, + "learning_rate": 2.2692257315970288e-05, + "loss": 0.2268, + "step": 12430 + }, + { + "epoch": 0.5903717419171058, + "grad_norm": 0.404296875, + "learning_rate": 2.2675003240859636e-05, + "loss": 0.2275, + "step": 12435 + }, + { + "epoch": 0.5906091250059345, + "grad_norm": 0.5, + "learning_rate": 2.2657752139372547e-05, + "loss": 0.2242, + "step": 12440 + }, + { + "epoch": 0.5908465080947634, + "grad_norm": 0.65625, + "learning_rate": 2.2640504022140796e-05, + "loss": 0.2287, + "step": 12445 + }, + { + "epoch": 0.5910838911835921, + "grad_norm": 0.470703125, + "learning_rate": 2.2623258899794307e-05, + "loss": 0.2264, + "step": 12450 + }, + { + "epoch": 0.5913212742724209, + "grad_norm": 0.51953125, + "learning_rate": 2.2606016782961197e-05, + "loss": 0.219, + "step": 12455 + }, + { + "epoch": 0.5915586573612496, + "grad_norm": 0.46875, + "learning_rate": 2.258877768226768e-05, + "loss": 0.2238, + "step": 12460 + }, + { + "epoch": 0.5917960404500784, + "grad_norm": 0.4375, + "learning_rate": 2.257154160833815e-05, + "loss": 0.2244, + "step": 12465 + }, + { + "epoch": 0.5920334235389071, + "grad_norm": 0.474609375, + "learning_rate": 2.2554308571795107e-05, + "loss": 0.219, + "step": 12470 + }, + { + "epoch": 0.5922708066277358, + "grad_norm": 0.421875, + "learning_rate": 2.2537078583259215e-05, + "loss": 0.2247, + "step": 12475 + }, + { + "epoch": 0.5925081897165646, + "grad_norm": 0.419921875, + "learning_rate": 2.251985165334921e-05, + "loss": 0.2225, + "step": 12480 + }, + { + "epoch": 0.5927455728053933, + "grad_norm": 0.46875, + "learning_rate": 2.2502627792682e-05, + "loss": 0.2279, + "step": 12485 + }, + { + "epoch": 0.5929829558942221, + "grad_norm": 0.48046875, + "learning_rate": 2.248540701187255e-05, + "loss": 0.2252, + "step": 12490 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 0.498046875, + "learning_rate": 2.246818932153395e-05, + "loss": 0.2248, + "step": 12495 + }, + { + "epoch": 0.5934577220718796, + "grad_norm": 0.39453125, + "learning_rate": 2.2450974732277396e-05, + "loss": 0.2211, + "step": 12500 + }, + { + "epoch": 0.5936951051607083, + "grad_norm": 0.51953125, + "learning_rate": 2.2433763254712152e-05, + "loss": 0.2258, + "step": 12505 + }, + { + "epoch": 0.5939324882495371, + "grad_norm": 0.392578125, + "learning_rate": 2.241655489944558e-05, + "loss": 0.2231, + "step": 12510 + }, + { + "epoch": 0.5941698713383659, + "grad_norm": 0.435546875, + "learning_rate": 2.2399349677083103e-05, + "loss": 0.2217, + "step": 12515 + }, + { + "epoch": 0.5944072544271946, + "grad_norm": 0.419921875, + "learning_rate": 2.2382147598228226e-05, + "loss": 0.2275, + "step": 12520 + }, + { + "epoch": 0.5946446375160234, + "grad_norm": 0.474609375, + "learning_rate": 2.236494867348251e-05, + "loss": 0.2258, + "step": 12525 + }, + { + "epoch": 0.5948820206048521, + "grad_norm": 0.515625, + "learning_rate": 2.2347752913445587e-05, + "loss": 0.2236, + "step": 12530 + }, + { + "epoch": 0.5951194036936809, + "grad_norm": 0.431640625, + "learning_rate": 2.2330560328715106e-05, + "loss": 0.2255, + "step": 12535 + }, + { + "epoch": 0.5953567867825096, + "grad_norm": 0.41796875, + "learning_rate": 2.2313370929886783e-05, + "loss": 0.2243, + "step": 12540 + }, + { + "epoch": 0.5955941698713384, + "grad_norm": 0.376953125, + "learning_rate": 2.229618472755438e-05, + "loss": 0.2209, + "step": 12545 + }, + { + "epoch": 0.5958315529601671, + "grad_norm": 0.416015625, + "learning_rate": 2.2279001732309657e-05, + "loss": 0.2226, + "step": 12550 + }, + { + "epoch": 0.5960689360489959, + "grad_norm": 0.45703125, + "learning_rate": 2.2261821954742434e-05, + "loss": 0.224, + "step": 12555 + }, + { + "epoch": 0.5963063191378246, + "grad_norm": 0.40234375, + "learning_rate": 2.2244645405440513e-05, + "loss": 0.226, + "step": 12560 + }, + { + "epoch": 0.5965437022266534, + "grad_norm": 0.462890625, + "learning_rate": 2.222747209498973e-05, + "loss": 0.226, + "step": 12565 + }, + { + "epoch": 0.5967810853154821, + "grad_norm": 0.44921875, + "learning_rate": 2.2210302033973924e-05, + "loss": 0.2269, + "step": 12570 + }, + { + "epoch": 0.5970184684043108, + "grad_norm": 0.47265625, + "learning_rate": 2.219313523297492e-05, + "loss": 0.2256, + "step": 12575 + }, + { + "epoch": 0.5972558514931396, + "grad_norm": 0.478515625, + "learning_rate": 2.2175971702572535e-05, + "loss": 0.2272, + "step": 12580 + }, + { + "epoch": 0.5974932345819683, + "grad_norm": 0.48828125, + "learning_rate": 2.2158811453344585e-05, + "loss": 0.222, + "step": 12585 + }, + { + "epoch": 0.5977306176707972, + "grad_norm": 0.4453125, + "learning_rate": 2.2141654495866853e-05, + "loss": 0.2183, + "step": 12590 + }, + { + "epoch": 0.5979680007596259, + "grad_norm": 0.412109375, + "learning_rate": 2.212450084071308e-05, + "loss": 0.2251, + "step": 12595 + }, + { + "epoch": 0.5982053838484547, + "grad_norm": 0.474609375, + "learning_rate": 2.2107350498455004e-05, + "loss": 0.2209, + "step": 12600 + }, + { + "epoch": 0.5984427669372834, + "grad_norm": 0.45703125, + "learning_rate": 2.2090203479662282e-05, + "loss": 0.2238, + "step": 12605 + }, + { + "epoch": 0.5986801500261122, + "grad_norm": 0.46484375, + "learning_rate": 2.207305979490256e-05, + "loss": 0.2245, + "step": 12610 + }, + { + "epoch": 0.5989175331149409, + "grad_norm": 0.431640625, + "learning_rate": 2.2055919454741407e-05, + "loss": 0.2222, + "step": 12615 + }, + { + "epoch": 0.5991549162037696, + "grad_norm": 0.41015625, + "learning_rate": 2.203878246974235e-05, + "loss": 0.2214, + "step": 12620 + }, + { + "epoch": 0.5993922992925984, + "grad_norm": 0.470703125, + "learning_rate": 2.2021648850466803e-05, + "loss": 0.2203, + "step": 12625 + }, + { + "epoch": 0.5996296823814271, + "grad_norm": 0.4375, + "learning_rate": 2.2004518607474173e-05, + "loss": 0.2256, + "step": 12630 + }, + { + "epoch": 0.5998670654702559, + "grad_norm": 0.427734375, + "learning_rate": 2.1987391751321735e-05, + "loss": 0.2251, + "step": 12635 + }, + { + "epoch": 0.6001044485590846, + "grad_norm": 0.46484375, + "learning_rate": 2.1970268292564683e-05, + "loss": 0.2244, + "step": 12640 + }, + { + "epoch": 0.6003418316479134, + "grad_norm": 0.42578125, + "learning_rate": 2.195314824175615e-05, + "loss": 0.2236, + "step": 12645 + }, + { + "epoch": 0.6005792147367421, + "grad_norm": 0.4765625, + "learning_rate": 2.1936031609447117e-05, + "loss": 0.2247, + "step": 12650 + }, + { + "epoch": 0.600816597825571, + "grad_norm": 0.431640625, + "learning_rate": 2.191891840618652e-05, + "loss": 0.2239, + "step": 12655 + }, + { + "epoch": 0.6010539809143997, + "grad_norm": 0.4765625, + "learning_rate": 2.190180864252111e-05, + "loss": 0.2238, + "step": 12660 + }, + { + "epoch": 0.6012913640032284, + "grad_norm": 0.423828125, + "learning_rate": 2.188470232899559e-05, + "loss": 0.223, + "step": 12665 + }, + { + "epoch": 0.6015287470920572, + "grad_norm": 0.51953125, + "learning_rate": 2.1867599476152484e-05, + "loss": 0.2259, + "step": 12670 + }, + { + "epoch": 0.6017661301808859, + "grad_norm": 0.439453125, + "learning_rate": 2.185050009453221e-05, + "loss": 0.2231, + "step": 12675 + }, + { + "epoch": 0.6020035132697147, + "grad_norm": 0.4609375, + "learning_rate": 2.1833404194673045e-05, + "loss": 0.2237, + "step": 12680 + }, + { + "epoch": 0.6022408963585434, + "grad_norm": 0.458984375, + "learning_rate": 2.18163117871111e-05, + "loss": 0.2238, + "step": 12685 + }, + { + "epoch": 0.6024782794473722, + "grad_norm": 0.435546875, + "learning_rate": 2.1799222882380354e-05, + "loss": 0.2239, + "step": 12690 + }, + { + "epoch": 0.6027156625362009, + "grad_norm": 0.482421875, + "learning_rate": 2.1782137491012623e-05, + "loss": 0.227, + "step": 12695 + }, + { + "epoch": 0.6029530456250297, + "grad_norm": 0.443359375, + "learning_rate": 2.176505562353756e-05, + "loss": 0.2215, + "step": 12700 + }, + { + "epoch": 0.6031904287138584, + "grad_norm": 0.474609375, + "learning_rate": 2.1747977290482634e-05, + "loss": 0.2212, + "step": 12705 + }, + { + "epoch": 0.6034278118026872, + "grad_norm": 0.466796875, + "learning_rate": 2.1730902502373145e-05, + "loss": 0.2224, + "step": 12710 + }, + { + "epoch": 0.6036651948915159, + "grad_norm": 0.396484375, + "learning_rate": 2.1713831269732214e-05, + "loss": 0.2242, + "step": 12715 + }, + { + "epoch": 0.6039025779803446, + "grad_norm": 0.50390625, + "learning_rate": 2.169676360308076e-05, + "loss": 0.2182, + "step": 12720 + }, + { + "epoch": 0.6041399610691734, + "grad_norm": 0.431640625, + "learning_rate": 2.1679699512937507e-05, + "loss": 0.2232, + "step": 12725 + }, + { + "epoch": 0.6043773441580021, + "grad_norm": 0.451171875, + "learning_rate": 2.1662639009818974e-05, + "loss": 0.2186, + "step": 12730 + }, + { + "epoch": 0.604614727246831, + "grad_norm": 0.439453125, + "learning_rate": 2.1645582104239485e-05, + "loss": 0.2248, + "step": 12735 + }, + { + "epoch": 0.6048521103356597, + "grad_norm": 0.4453125, + "learning_rate": 2.1628528806711114e-05, + "loss": 0.2245, + "step": 12740 + }, + { + "epoch": 0.6050894934244885, + "grad_norm": 0.4921875, + "learning_rate": 2.161147912774375e-05, + "loss": 0.2251, + "step": 12745 + }, + { + "epoch": 0.6053268765133172, + "grad_norm": 0.482421875, + "learning_rate": 2.1594433077845006e-05, + "loss": 0.2241, + "step": 12750 + }, + { + "epoch": 0.605564259602146, + "grad_norm": 0.48046875, + "learning_rate": 2.1577390667520313e-05, + "loss": 0.2207, + "step": 12755 + }, + { + "epoch": 0.6058016426909747, + "grad_norm": 0.52734375, + "learning_rate": 2.1560351907272813e-05, + "loss": 0.2231, + "step": 12760 + }, + { + "epoch": 0.6060390257798034, + "grad_norm": 0.4609375, + "learning_rate": 2.1543316807603436e-05, + "loss": 0.221, + "step": 12765 + }, + { + "epoch": 0.6062764088686322, + "grad_norm": 0.4375, + "learning_rate": 2.1526285379010808e-05, + "loss": 0.2259, + "step": 12770 + }, + { + "epoch": 0.6065137919574609, + "grad_norm": 0.482421875, + "learning_rate": 2.1509257631991346e-05, + "loss": 0.227, + "step": 12775 + }, + { + "epoch": 0.6067511750462897, + "grad_norm": 0.486328125, + "learning_rate": 2.1492233577039167e-05, + "loss": 0.2195, + "step": 12780 + }, + { + "epoch": 0.6069885581351184, + "grad_norm": 0.43359375, + "learning_rate": 2.1475213224646107e-05, + "loss": 0.2252, + "step": 12785 + }, + { + "epoch": 0.6072259412239472, + "grad_norm": 0.474609375, + "learning_rate": 2.145819658530174e-05, + "loss": 0.2267, + "step": 12790 + }, + { + "epoch": 0.6074633243127759, + "grad_norm": 0.5078125, + "learning_rate": 2.1441183669493348e-05, + "loss": 0.2241, + "step": 12795 + }, + { + "epoch": 0.6077007074016048, + "grad_norm": 0.423828125, + "learning_rate": 2.142417448770591e-05, + "loss": 0.2242, + "step": 12800 + }, + { + "epoch": 0.6079380904904335, + "grad_norm": 0.375, + "learning_rate": 2.14071690504221e-05, + "loss": 0.2202, + "step": 12805 + }, + { + "epoch": 0.6081754735792622, + "grad_norm": 0.4375, + "learning_rate": 2.1390167368122305e-05, + "loss": 0.2232, + "step": 12810 + }, + { + "epoch": 0.608412856668091, + "grad_norm": 0.431640625, + "learning_rate": 2.137316945128457e-05, + "loss": 0.2265, + "step": 12815 + }, + { + "epoch": 0.6086502397569197, + "grad_norm": 0.490234375, + "learning_rate": 2.1356175310384645e-05, + "loss": 0.2209, + "step": 12820 + }, + { + "epoch": 0.6088876228457485, + "grad_norm": 0.431640625, + "learning_rate": 2.1339184955895936e-05, + "loss": 0.2208, + "step": 12825 + }, + { + "epoch": 0.6091250059345772, + "grad_norm": 0.53125, + "learning_rate": 2.1322198398289527e-05, + "loss": 0.2247, + "step": 12830 + }, + { + "epoch": 0.609362389023406, + "grad_norm": 0.515625, + "learning_rate": 2.130521564803416e-05, + "loss": 0.2213, + "step": 12835 + }, + { + "epoch": 0.6095997721122347, + "grad_norm": 0.4140625, + "learning_rate": 2.128823671559621e-05, + "loss": 0.222, + "step": 12840 + }, + { + "epoch": 0.6098371552010635, + "grad_norm": 0.462890625, + "learning_rate": 2.1271261611439726e-05, + "loss": 0.2195, + "step": 12845 + }, + { + "epoch": 0.6100745382898922, + "grad_norm": 0.412109375, + "learning_rate": 2.1254290346026383e-05, + "loss": 0.2234, + "step": 12850 + }, + { + "epoch": 0.610311921378721, + "grad_norm": 0.470703125, + "learning_rate": 2.1237322929815508e-05, + "loss": 0.2258, + "step": 12855 + }, + { + "epoch": 0.6105493044675497, + "grad_norm": 0.439453125, + "learning_rate": 2.1220359373264027e-05, + "loss": 0.2186, + "step": 12860 + }, + { + "epoch": 0.6107866875563784, + "grad_norm": 0.470703125, + "learning_rate": 2.1203399686826513e-05, + "loss": 0.2226, + "step": 12865 + }, + { + "epoch": 0.6110240706452073, + "grad_norm": 0.43359375, + "learning_rate": 2.118644388095513e-05, + "loss": 0.2252, + "step": 12870 + }, + { + "epoch": 0.611261453734036, + "grad_norm": 0.439453125, + "learning_rate": 2.116949196609968e-05, + "loss": 0.224, + "step": 12875 + }, + { + "epoch": 0.6114988368228648, + "grad_norm": 0.50390625, + "learning_rate": 2.1152543952707553e-05, + "loss": 0.2252, + "step": 12880 + }, + { + "epoch": 0.6117362199116935, + "grad_norm": 0.4296875, + "learning_rate": 2.1135599851223713e-05, + "loss": 0.2207, + "step": 12885 + }, + { + "epoch": 0.6119736030005223, + "grad_norm": 0.44921875, + "learning_rate": 2.1118659672090746e-05, + "loss": 0.219, + "step": 12890 + }, + { + "epoch": 0.612210986089351, + "grad_norm": 0.484375, + "learning_rate": 2.1101723425748803e-05, + "loss": 0.222, + "step": 12895 + }, + { + "epoch": 0.6124483691781798, + "grad_norm": 0.53515625, + "learning_rate": 2.1084791122635623e-05, + "loss": 0.2204, + "step": 12900 + }, + { + "epoch": 0.6126857522670085, + "grad_norm": 0.5078125, + "learning_rate": 2.10678627731865e-05, + "loss": 0.2242, + "step": 12905 + }, + { + "epoch": 0.6129231353558372, + "grad_norm": 0.431640625, + "learning_rate": 2.1050938387834302e-05, + "loss": 0.2191, + "step": 12910 + }, + { + "epoch": 0.613160518444666, + "grad_norm": 0.40625, + "learning_rate": 2.1034017977009447e-05, + "loss": 0.2213, + "step": 12915 + }, + { + "epoch": 0.6133979015334947, + "grad_norm": 0.412109375, + "learning_rate": 2.1017101551139905e-05, + "loss": 0.2257, + "step": 12920 + }, + { + "epoch": 0.6136352846223235, + "grad_norm": 0.427734375, + "learning_rate": 2.100018912065121e-05, + "loss": 0.2255, + "step": 12925 + }, + { + "epoch": 0.6138726677111522, + "grad_norm": 0.41015625, + "learning_rate": 2.098328069596639e-05, + "loss": 0.2218, + "step": 12930 + }, + { + "epoch": 0.614110050799981, + "grad_norm": 0.41796875, + "learning_rate": 2.096637628750605e-05, + "loss": 0.2215, + "step": 12935 + }, + { + "epoch": 0.6143474338888097, + "grad_norm": 0.45703125, + "learning_rate": 2.0949475905688293e-05, + "loss": 0.2215, + "step": 12940 + }, + { + "epoch": 0.6145848169776386, + "grad_norm": 0.46484375, + "learning_rate": 2.093257956092875e-05, + "loss": 0.2262, + "step": 12945 + }, + { + "epoch": 0.6148222000664673, + "grad_norm": 0.45703125, + "learning_rate": 2.0915687263640555e-05, + "loss": 0.2283, + "step": 12950 + }, + { + "epoch": 0.615059583155296, + "grad_norm": 0.435546875, + "learning_rate": 2.0898799024234363e-05, + "loss": 0.2244, + "step": 12955 + }, + { + "epoch": 0.6152969662441248, + "grad_norm": 0.435546875, + "learning_rate": 2.0881914853118305e-05, + "loss": 0.2257, + "step": 12960 + }, + { + "epoch": 0.6155343493329535, + "grad_norm": 0.421875, + "learning_rate": 2.0865034760698037e-05, + "loss": 0.2231, + "step": 12965 + }, + { + "epoch": 0.6157717324217823, + "grad_norm": 0.4609375, + "learning_rate": 2.0848158757376675e-05, + "loss": 0.2236, + "step": 12970 + }, + { + "epoch": 0.616009115510611, + "grad_norm": 0.470703125, + "learning_rate": 2.083128685355482e-05, + "loss": 0.2262, + "step": 12975 + }, + { + "epoch": 0.6162464985994398, + "grad_norm": 0.49609375, + "learning_rate": 2.0814419059630564e-05, + "loss": 0.2216, + "step": 12980 + }, + { + "epoch": 0.6164838816882685, + "grad_norm": 0.41015625, + "learning_rate": 2.0797555385999422e-05, + "loss": 0.2213, + "step": 12985 + }, + { + "epoch": 0.6167212647770973, + "grad_norm": 0.4296875, + "learning_rate": 2.0780695843054432e-05, + "loss": 0.2272, + "step": 12990 + }, + { + "epoch": 0.616958647865926, + "grad_norm": 0.45703125, + "learning_rate": 2.0763840441186023e-05, + "loss": 0.2228, + "step": 12995 + }, + { + "epoch": 0.6171960309547548, + "grad_norm": 0.462890625, + "learning_rate": 2.0746989190782133e-05, + "loss": 0.2226, + "step": 13000 + }, + { + "epoch": 0.6174334140435835, + "grad_norm": 0.453125, + "learning_rate": 2.0730142102228083e-05, + "loss": 0.2218, + "step": 13005 + }, + { + "epoch": 0.6176707971324122, + "grad_norm": 0.46484375, + "learning_rate": 2.0713299185906687e-05, + "loss": 0.2246, + "step": 13010 + }, + { + "epoch": 0.617908180221241, + "grad_norm": 0.400390625, + "learning_rate": 2.069646045219813e-05, + "loss": 0.2226, + "step": 13015 + }, + { + "epoch": 0.6181455633100698, + "grad_norm": 0.427734375, + "learning_rate": 2.0679625911480065e-05, + "loss": 0.2213, + "step": 13020 + }, + { + "epoch": 0.6183829463988986, + "grad_norm": 0.384765625, + "learning_rate": 2.0662795574127535e-05, + "loss": 0.2241, + "step": 13025 + }, + { + "epoch": 0.6186203294877273, + "grad_norm": 0.431640625, + "learning_rate": 2.0645969450513003e-05, + "loss": 0.2265, + "step": 13030 + }, + { + "epoch": 0.6188577125765561, + "grad_norm": 0.44921875, + "learning_rate": 2.0629147551006327e-05, + "loss": 0.2206, + "step": 13035 + }, + { + "epoch": 0.6190950956653848, + "grad_norm": 0.41015625, + "learning_rate": 2.0612329885974774e-05, + "loss": 0.2217, + "step": 13040 + }, + { + "epoch": 0.6193324787542136, + "grad_norm": 0.458984375, + "learning_rate": 2.0595516465782987e-05, + "loss": 0.2245, + "step": 13045 + }, + { + "epoch": 0.6195698618430423, + "grad_norm": 0.423828125, + "learning_rate": 2.057870730079301e-05, + "loss": 0.2207, + "step": 13050 + }, + { + "epoch": 0.619807244931871, + "grad_norm": 0.455078125, + "learning_rate": 2.0561902401364254e-05, + "loss": 0.2226, + "step": 13055 + }, + { + "epoch": 0.6200446280206998, + "grad_norm": 0.48828125, + "learning_rate": 2.0545101777853487e-05, + "loss": 0.2251, + "step": 13060 + }, + { + "epoch": 0.6202820111095285, + "grad_norm": 0.47265625, + "learning_rate": 2.0528305440614875e-05, + "loss": 0.2217, + "step": 13065 + }, + { + "epoch": 0.6205193941983573, + "grad_norm": 0.43359375, + "learning_rate": 2.0511513399999916e-05, + "loss": 0.222, + "step": 13070 + }, + { + "epoch": 0.620756777287186, + "grad_norm": 0.419921875, + "learning_rate": 2.0494725666357473e-05, + "loss": 0.2221, + "step": 13075 + }, + { + "epoch": 0.6209941603760148, + "grad_norm": 0.421875, + "learning_rate": 2.0477942250033742e-05, + "loss": 0.2203, + "step": 13080 + }, + { + "epoch": 0.6212315434648435, + "grad_norm": 0.419921875, + "learning_rate": 2.0461163161372277e-05, + "loss": 0.2258, + "step": 13085 + }, + { + "epoch": 0.6214689265536724, + "grad_norm": 0.484375, + "learning_rate": 2.0444388410713944e-05, + "loss": 0.225, + "step": 13090 + }, + { + "epoch": 0.6217063096425011, + "grad_norm": 0.46875, + "learning_rate": 2.042761800839695e-05, + "loss": 0.2241, + "step": 13095 + }, + { + "epoch": 0.6219436927313298, + "grad_norm": 0.5078125, + "learning_rate": 2.0410851964756816e-05, + "loss": 0.2265, + "step": 13100 + }, + { + "epoch": 0.6221810758201586, + "grad_norm": 0.40234375, + "learning_rate": 2.0394090290126373e-05, + "loss": 0.2232, + "step": 13105 + }, + { + "epoch": 0.6224184589089873, + "grad_norm": 0.455078125, + "learning_rate": 2.0377332994835778e-05, + "loss": 0.224, + "step": 13110 + }, + { + "epoch": 0.6226558419978161, + "grad_norm": 0.49609375, + "learning_rate": 2.0360580089212466e-05, + "loss": 0.2207, + "step": 13115 + }, + { + "epoch": 0.6228932250866448, + "grad_norm": 0.5078125, + "learning_rate": 2.0343831583581186e-05, + "loss": 0.2265, + "step": 13120 + }, + { + "epoch": 0.6231306081754736, + "grad_norm": 0.4453125, + "learning_rate": 2.032708748826397e-05, + "loss": 0.2279, + "step": 13125 + }, + { + "epoch": 0.6233679912643023, + "grad_norm": 0.4609375, + "learning_rate": 2.0310347813580095e-05, + "loss": 0.2244, + "step": 13130 + }, + { + "epoch": 0.6236053743531311, + "grad_norm": 0.4375, + "learning_rate": 2.0293612569846184e-05, + "loss": 0.2271, + "step": 13135 + }, + { + "epoch": 0.6238427574419598, + "grad_norm": 0.419921875, + "learning_rate": 2.027688176737606e-05, + "loss": 0.2231, + "step": 13140 + }, + { + "epoch": 0.6240801405307886, + "grad_norm": 0.50390625, + "learning_rate": 2.0260155416480875e-05, + "loss": 0.2215, + "step": 13145 + }, + { + "epoch": 0.6243175236196173, + "grad_norm": 0.41796875, + "learning_rate": 2.024343352746897e-05, + "loss": 0.2229, + "step": 13150 + }, + { + "epoch": 0.624554906708446, + "grad_norm": 0.447265625, + "learning_rate": 2.0226716110646e-05, + "loss": 0.222, + "step": 13155 + }, + { + "epoch": 0.6247922897972749, + "grad_norm": 0.4296875, + "learning_rate": 2.02100031763148e-05, + "loss": 0.2232, + "step": 13160 + }, + { + "epoch": 0.6250296728861036, + "grad_norm": 0.384765625, + "learning_rate": 2.0193294734775507e-05, + "loss": 0.2226, + "step": 13165 + }, + { + "epoch": 0.6252670559749324, + "grad_norm": 0.427734375, + "learning_rate": 2.017659079632544e-05, + "loss": 0.2245, + "step": 13170 + }, + { + "epoch": 0.6255044390637611, + "grad_norm": 0.423828125, + "learning_rate": 2.015989137125916e-05, + "loss": 0.2264, + "step": 13175 + }, + { + "epoch": 0.6257418221525899, + "grad_norm": 0.451171875, + "learning_rate": 2.0143196469868454e-05, + "loss": 0.2229, + "step": 13180 + }, + { + "epoch": 0.6259792052414186, + "grad_norm": 0.439453125, + "learning_rate": 2.01265061024423e-05, + "loss": 0.2194, + "step": 13185 + }, + { + "epoch": 0.6262165883302474, + "grad_norm": 0.47265625, + "learning_rate": 2.0109820279266915e-05, + "loss": 0.2247, + "step": 13190 + }, + { + "epoch": 0.6264539714190761, + "grad_norm": 0.435546875, + "learning_rate": 2.009313901062568e-05, + "loss": 0.2227, + "step": 13195 + }, + { + "epoch": 0.6266913545079048, + "grad_norm": 0.421875, + "learning_rate": 2.0076462306799192e-05, + "loss": 0.2255, + "step": 13200 + }, + { + "epoch": 0.6269287375967336, + "grad_norm": 0.453125, + "learning_rate": 2.005979017806523e-05, + "loss": 0.2237, + "step": 13205 + }, + { + "epoch": 0.6271661206855623, + "grad_norm": 0.431640625, + "learning_rate": 2.0043122634698753e-05, + "loss": 0.223, + "step": 13210 + }, + { + "epoch": 0.6274035037743911, + "grad_norm": 0.421875, + "learning_rate": 2.0026459686971876e-05, + "loss": 0.2226, + "step": 13215 + }, + { + "epoch": 0.6276408868632198, + "grad_norm": 0.419921875, + "learning_rate": 2.0009801345153917e-05, + "loss": 0.2236, + "step": 13220 + }, + { + "epoch": 0.6278782699520487, + "grad_norm": 0.53125, + "learning_rate": 1.999314761951133e-05, + "loss": 0.2244, + "step": 13225 + }, + { + "epoch": 0.6281156530408774, + "grad_norm": 0.40234375, + "learning_rate": 1.9976498520307723e-05, + "loss": 0.2216, + "step": 13230 + }, + { + "epoch": 0.6283530361297062, + "grad_norm": 0.435546875, + "learning_rate": 1.9959854057803872e-05, + "loss": 0.2238, + "step": 13235 + }, + { + "epoch": 0.6285904192185349, + "grad_norm": 0.451171875, + "learning_rate": 1.9943214242257675e-05, + "loss": 0.2235, + "step": 13240 + }, + { + "epoch": 0.6288278023073636, + "grad_norm": 0.48046875, + "learning_rate": 1.992657908392418e-05, + "loss": 0.226, + "step": 13245 + }, + { + "epoch": 0.6290651853961924, + "grad_norm": 0.482421875, + "learning_rate": 1.990994859305555e-05, + "loss": 0.2254, + "step": 13250 + }, + { + "epoch": 0.6293025684850211, + "grad_norm": 0.3984375, + "learning_rate": 1.9893322779901095e-05, + "loss": 0.2219, + "step": 13255 + }, + { + "epoch": 0.6295399515738499, + "grad_norm": 0.53125, + "learning_rate": 1.9876701654707213e-05, + "loss": 0.2228, + "step": 13260 + }, + { + "epoch": 0.6297773346626786, + "grad_norm": 0.447265625, + "learning_rate": 1.9860085227717444e-05, + "loss": 0.225, + "step": 13265 + }, + { + "epoch": 0.6300147177515074, + "grad_norm": 0.44140625, + "learning_rate": 1.984347350917241e-05, + "loss": 0.2221, + "step": 13270 + }, + { + "epoch": 0.6302521008403361, + "grad_norm": 0.54296875, + "learning_rate": 1.982686650930982e-05, + "loss": 0.2245, + "step": 13275 + }, + { + "epoch": 0.6304894839291649, + "grad_norm": 0.447265625, + "learning_rate": 1.9810264238364528e-05, + "loss": 0.2253, + "step": 13280 + }, + { + "epoch": 0.6307268670179936, + "grad_norm": 0.4140625, + "learning_rate": 1.9793666706568403e-05, + "loss": 0.2251, + "step": 13285 + }, + { + "epoch": 0.6309642501068224, + "grad_norm": 0.412109375, + "learning_rate": 1.9777073924150452e-05, + "loss": 0.2261, + "step": 13290 + }, + { + "epoch": 0.6312016331956511, + "grad_norm": 0.453125, + "learning_rate": 1.976048590133672e-05, + "loss": 0.2239, + "step": 13295 + }, + { + "epoch": 0.6314390162844798, + "grad_norm": 0.39453125, + "learning_rate": 1.974390264835034e-05, + "loss": 0.2215, + "step": 13300 + }, + { + "epoch": 0.6316763993733087, + "grad_norm": 0.4296875, + "learning_rate": 1.9727324175411473e-05, + "loss": 0.2214, + "step": 13305 + }, + { + "epoch": 0.6319137824621374, + "grad_norm": 0.41796875, + "learning_rate": 1.9710750492737383e-05, + "loss": 0.2286, + "step": 13310 + }, + { + "epoch": 0.6321511655509662, + "grad_norm": 0.462890625, + "learning_rate": 1.9694181610542338e-05, + "loss": 0.2224, + "step": 13315 + }, + { + "epoch": 0.6323885486397949, + "grad_norm": 0.4140625, + "learning_rate": 1.967761753903768e-05, + "loss": 0.2214, + "step": 13320 + }, + { + "epoch": 0.6326259317286237, + "grad_norm": 0.48828125, + "learning_rate": 1.9661058288431756e-05, + "loss": 0.222, + "step": 13325 + }, + { + "epoch": 0.6328633148174524, + "grad_norm": 0.462890625, + "learning_rate": 1.9644503868929958e-05, + "loss": 0.2204, + "step": 13330 + }, + { + "epoch": 0.6331006979062812, + "grad_norm": 0.44921875, + "learning_rate": 1.962795429073471e-05, + "loss": 0.2235, + "step": 13335 + }, + { + "epoch": 0.6333380809951099, + "grad_norm": 0.443359375, + "learning_rate": 1.9611409564045434e-05, + "loss": 0.2244, + "step": 13340 + }, + { + "epoch": 0.6335754640839386, + "grad_norm": 0.46875, + "learning_rate": 1.9594869699058578e-05, + "loss": 0.2209, + "step": 13345 + }, + { + "epoch": 0.6338128471727674, + "grad_norm": 0.451171875, + "learning_rate": 1.9578334705967567e-05, + "loss": 0.225, + "step": 13350 + }, + { + "epoch": 0.6340502302615961, + "grad_norm": 0.4609375, + "learning_rate": 1.9561804594962868e-05, + "loss": 0.2252, + "step": 13355 + }, + { + "epoch": 0.6342876133504249, + "grad_norm": 0.5234375, + "learning_rate": 1.954527937623189e-05, + "loss": 0.2247, + "step": 13360 + }, + { + "epoch": 0.6345249964392536, + "grad_norm": 0.416015625, + "learning_rate": 1.9528759059959063e-05, + "loss": 0.2209, + "step": 13365 + }, + { + "epoch": 0.6347623795280825, + "grad_norm": 0.388671875, + "learning_rate": 1.951224365632578e-05, + "loss": 0.2245, + "step": 13370 + }, + { + "epoch": 0.6349997626169112, + "grad_norm": 0.462890625, + "learning_rate": 1.9495733175510404e-05, + "loss": 0.2246, + "step": 13375 + }, + { + "epoch": 0.63523714570574, + "grad_norm": 0.4296875, + "learning_rate": 1.9479227627688286e-05, + "loss": 0.2215, + "step": 13380 + }, + { + "epoch": 0.6354745287945687, + "grad_norm": 0.4140625, + "learning_rate": 1.946272702303169e-05, + "loss": 0.2177, + "step": 13385 + }, + { + "epoch": 0.6357119118833974, + "grad_norm": 0.431640625, + "learning_rate": 1.94462313717099e-05, + "loss": 0.2237, + "step": 13390 + }, + { + "epoch": 0.6359492949722262, + "grad_norm": 0.4921875, + "learning_rate": 1.9429740683889076e-05, + "loss": 0.2221, + "step": 13395 + }, + { + "epoch": 0.6361866780610549, + "grad_norm": 0.39453125, + "learning_rate": 1.9413254969732385e-05, + "loss": 0.225, + "step": 13400 + }, + { + "epoch": 0.6364240611498837, + "grad_norm": 0.4453125, + "learning_rate": 1.9396774239399872e-05, + "loss": 0.2293, + "step": 13405 + }, + { + "epoch": 0.6366614442387124, + "grad_norm": 0.4375, + "learning_rate": 1.9380298503048554e-05, + "loss": 0.2183, + "step": 13410 + }, + { + "epoch": 0.6368988273275412, + "grad_norm": 0.51171875, + "learning_rate": 1.936382777083235e-05, + "loss": 0.2243, + "step": 13415 + }, + { + "epoch": 0.6371362104163699, + "grad_norm": 0.42578125, + "learning_rate": 1.9347362052902086e-05, + "loss": 0.224, + "step": 13420 + }, + { + "epoch": 0.6373735935051987, + "grad_norm": 0.41796875, + "learning_rate": 1.9330901359405516e-05, + "loss": 0.2248, + "step": 13425 + }, + { + "epoch": 0.6376109765940274, + "grad_norm": 0.4296875, + "learning_rate": 1.9314445700487288e-05, + "loss": 0.2217, + "step": 13430 + }, + { + "epoch": 0.6378483596828562, + "grad_norm": 0.453125, + "learning_rate": 1.929799508628896e-05, + "loss": 0.2237, + "step": 13435 + }, + { + "epoch": 0.638085742771685, + "grad_norm": 0.435546875, + "learning_rate": 1.9281549526948957e-05, + "loss": 0.2223, + "step": 13440 + }, + { + "epoch": 0.6383231258605137, + "grad_norm": 0.46875, + "learning_rate": 1.9265109032602614e-05, + "loss": 0.2273, + "step": 13445 + }, + { + "epoch": 0.6385605089493425, + "grad_norm": 0.50390625, + "learning_rate": 1.9248673613382124e-05, + "loss": 0.2243, + "step": 13450 + }, + { + "epoch": 0.6387978920381712, + "grad_norm": 0.470703125, + "learning_rate": 1.923224327941657e-05, + "loss": 0.2257, + "step": 13455 + }, + { + "epoch": 0.639035275127, + "grad_norm": 0.396484375, + "learning_rate": 1.9215818040831873e-05, + "loss": 0.2261, + "step": 13460 + }, + { + "epoch": 0.6392726582158287, + "grad_norm": 0.4140625, + "learning_rate": 1.9199397907750854e-05, + "loss": 0.2252, + "step": 13465 + }, + { + "epoch": 0.6395100413046575, + "grad_norm": 0.51953125, + "learning_rate": 1.9182982890293166e-05, + "loss": 0.2249, + "step": 13470 + }, + { + "epoch": 0.6397474243934862, + "grad_norm": 0.427734375, + "learning_rate": 1.9166572998575294e-05, + "loss": 0.2232, + "step": 13475 + }, + { + "epoch": 0.639984807482315, + "grad_norm": 0.4921875, + "learning_rate": 1.9150168242710594e-05, + "loss": 0.228, + "step": 13480 + }, + { + "epoch": 0.6402221905711437, + "grad_norm": 0.53125, + "learning_rate": 1.913376863280923e-05, + "loss": 0.2242, + "step": 13485 + }, + { + "epoch": 0.6404595736599724, + "grad_norm": 0.44140625, + "learning_rate": 1.911737417897822e-05, + "loss": 0.2242, + "step": 13490 + }, + { + "epoch": 0.6406969567488012, + "grad_norm": 0.408203125, + "learning_rate": 1.910098489132138e-05, + "loss": 0.2242, + "step": 13495 + }, + { + "epoch": 0.6409343398376299, + "grad_norm": 0.42578125, + "learning_rate": 1.9084600779939366e-05, + "loss": 0.2218, + "step": 13500 + }, + { + "epoch": 0.6411717229264587, + "grad_norm": 0.451171875, + "learning_rate": 1.9068221854929614e-05, + "loss": 0.2255, + "step": 13505 + }, + { + "epoch": 0.6414091060152874, + "grad_norm": 0.443359375, + "learning_rate": 1.9051848126386397e-05, + "loss": 0.2263, + "step": 13510 + }, + { + "epoch": 0.6416464891041163, + "grad_norm": 0.40625, + "learning_rate": 1.903547960440077e-05, + "loss": 0.2249, + "step": 13515 + }, + { + "epoch": 0.641883872192945, + "grad_norm": 0.50390625, + "learning_rate": 1.9019116299060557e-05, + "loss": 0.2203, + "step": 13520 + }, + { + "epoch": 0.6421212552817738, + "grad_norm": 0.404296875, + "learning_rate": 1.9002758220450417e-05, + "loss": 0.2258, + "step": 13525 + }, + { + "epoch": 0.6423586383706025, + "grad_norm": 0.4140625, + "learning_rate": 1.8986405378651732e-05, + "loss": 0.2209, + "step": 13530 + }, + { + "epoch": 0.6425960214594312, + "grad_norm": 0.3984375, + "learning_rate": 1.8970057783742712e-05, + "loss": 0.2217, + "step": 13535 + }, + { + "epoch": 0.64283340454826, + "grad_norm": 0.412109375, + "learning_rate": 1.895371544579828e-05, + "loss": 0.2243, + "step": 13540 + }, + { + "epoch": 0.6430707876370887, + "grad_norm": 0.453125, + "learning_rate": 1.8937378374890165e-05, + "loss": 0.2238, + "step": 13545 + }, + { + "epoch": 0.6433081707259175, + "grad_norm": 0.482421875, + "learning_rate": 1.892104658108681e-05, + "loss": 0.2246, + "step": 13550 + }, + { + "epoch": 0.6435455538147462, + "grad_norm": 0.421875, + "learning_rate": 1.8904720074453447e-05, + "loss": 0.2232, + "step": 13555 + }, + { + "epoch": 0.643782936903575, + "grad_norm": 0.48828125, + "learning_rate": 1.888839886505201e-05, + "loss": 0.2226, + "step": 13560 + }, + { + "epoch": 0.6440203199924037, + "grad_norm": 0.443359375, + "learning_rate": 1.8872082962941203e-05, + "loss": 0.2218, + "step": 13565 + }, + { + "epoch": 0.6442577030812325, + "grad_norm": 0.462890625, + "learning_rate": 1.8855772378176423e-05, + "loss": 0.2269, + "step": 13570 + }, + { + "epoch": 0.6444950861700612, + "grad_norm": 0.44921875, + "learning_rate": 1.883946712080981e-05, + "loss": 0.2262, + "step": 13575 + }, + { + "epoch": 0.64473246925889, + "grad_norm": 0.44140625, + "learning_rate": 1.8823167200890244e-05, + "loss": 0.2194, + "step": 13580 + }, + { + "epoch": 0.6449698523477188, + "grad_norm": 0.392578125, + "learning_rate": 1.880687262846327e-05, + "loss": 0.2189, + "step": 13585 + }, + { + "epoch": 0.6452072354365475, + "grad_norm": 0.515625, + "learning_rate": 1.879058341357116e-05, + "loss": 0.2216, + "step": 13590 + }, + { + "epoch": 0.6454446185253763, + "grad_norm": 0.44140625, + "learning_rate": 1.8774299566252894e-05, + "loss": 0.2196, + "step": 13595 + }, + { + "epoch": 0.645682001614205, + "grad_norm": 0.58984375, + "learning_rate": 1.8758021096544127e-05, + "loss": 0.2238, + "step": 13600 + }, + { + "epoch": 0.6459193847030338, + "grad_norm": 0.431640625, + "learning_rate": 1.8741748014477194e-05, + "loss": 0.2219, + "step": 13605 + }, + { + "epoch": 0.6461567677918625, + "grad_norm": 0.5703125, + "learning_rate": 1.8725480330081148e-05, + "loss": 0.2269, + "step": 13610 + }, + { + "epoch": 0.6463941508806913, + "grad_norm": 0.466796875, + "learning_rate": 1.8709218053381673e-05, + "loss": 0.2223, + "step": 13615 + }, + { + "epoch": 0.64663153396952, + "grad_norm": 0.447265625, + "learning_rate": 1.869296119440113e-05, + "loss": 0.224, + "step": 13620 + }, + { + "epoch": 0.6468689170583488, + "grad_norm": 0.38671875, + "learning_rate": 1.8676709763158557e-05, + "loss": 0.2206, + "step": 13625 + }, + { + "epoch": 0.6471063001471775, + "grad_norm": 0.458984375, + "learning_rate": 1.8660463769669634e-05, + "loss": 0.2259, + "step": 13630 + }, + { + "epoch": 0.6473436832360062, + "grad_norm": 0.56640625, + "learning_rate": 1.8644223223946695e-05, + "loss": 0.2294, + "step": 13635 + }, + { + "epoch": 0.647581066324835, + "grad_norm": 0.5390625, + "learning_rate": 1.8627988135998708e-05, + "loss": 0.2246, + "step": 13640 + }, + { + "epoch": 0.6478184494136637, + "grad_norm": 0.458984375, + "learning_rate": 1.8611758515831297e-05, + "loss": 0.2275, + "step": 13645 + }, + { + "epoch": 0.6480558325024925, + "grad_norm": 0.44140625, + "learning_rate": 1.8595534373446687e-05, + "loss": 0.2183, + "step": 13650 + }, + { + "epoch": 0.6482932155913212, + "grad_norm": 0.421875, + "learning_rate": 1.8579315718843758e-05, + "loss": 0.2236, + "step": 13655 + }, + { + "epoch": 0.6485305986801501, + "grad_norm": 0.421875, + "learning_rate": 1.8563102562017993e-05, + "loss": 0.2279, + "step": 13660 + }, + { + "epoch": 0.6487679817689788, + "grad_norm": 0.53515625, + "learning_rate": 1.854689491296146e-05, + "loss": 0.2225, + "step": 13665 + }, + { + "epoch": 0.6490053648578076, + "grad_norm": 0.443359375, + "learning_rate": 1.8530692781662895e-05, + "loss": 0.2231, + "step": 13670 + }, + { + "epoch": 0.6492427479466363, + "grad_norm": 0.421875, + "learning_rate": 1.8514496178107572e-05, + "loss": 0.2231, + "step": 13675 + }, + { + "epoch": 0.649480131035465, + "grad_norm": 0.46875, + "learning_rate": 1.8498305112277408e-05, + "loss": 0.2283, + "step": 13680 + }, + { + "epoch": 0.6497175141242938, + "grad_norm": 0.44921875, + "learning_rate": 1.8482119594150853e-05, + "loss": 0.2197, + "step": 13685 + }, + { + "epoch": 0.6499548972131225, + "grad_norm": 0.416015625, + "learning_rate": 1.8465939633703e-05, + "loss": 0.2224, + "step": 13690 + }, + { + "epoch": 0.6501922803019513, + "grad_norm": 0.48046875, + "learning_rate": 1.844976524090546e-05, + "loss": 0.2231, + "step": 13695 + }, + { + "epoch": 0.65042966339078, + "grad_norm": 0.40625, + "learning_rate": 1.843359642572646e-05, + "loss": 0.2214, + "step": 13700 + }, + { + "epoch": 0.6506670464796088, + "grad_norm": 0.435546875, + "learning_rate": 1.841743319813074e-05, + "loss": 0.2224, + "step": 13705 + }, + { + "epoch": 0.6509044295684375, + "grad_norm": 0.478515625, + "learning_rate": 1.8401275568079645e-05, + "loss": 0.2231, + "step": 13710 + }, + { + "epoch": 0.6511418126572663, + "grad_norm": 0.46875, + "learning_rate": 1.8385123545531045e-05, + "loss": 0.2242, + "step": 13715 + }, + { + "epoch": 0.651379195746095, + "grad_norm": 0.390625, + "learning_rate": 1.8368977140439354e-05, + "loss": 0.2205, + "step": 13720 + }, + { + "epoch": 0.6516165788349239, + "grad_norm": 0.4140625, + "learning_rate": 1.835283636275553e-05, + "loss": 0.2225, + "step": 13725 + }, + { + "epoch": 0.6518539619237526, + "grad_norm": 0.48046875, + "learning_rate": 1.8336701222427057e-05, + "loss": 0.2248, + "step": 13730 + }, + { + "epoch": 0.6520913450125813, + "grad_norm": 0.447265625, + "learning_rate": 1.8320571729397956e-05, + "loss": 0.225, + "step": 13735 + }, + { + "epoch": 0.6523287281014101, + "grad_norm": 0.43359375, + "learning_rate": 1.8304447893608757e-05, + "loss": 0.2213, + "step": 13740 + }, + { + "epoch": 0.6525661111902388, + "grad_norm": 0.51171875, + "learning_rate": 1.82883297249965e-05, + "loss": 0.2268, + "step": 13745 + }, + { + "epoch": 0.6528034942790676, + "grad_norm": 0.42578125, + "learning_rate": 1.827221723349475e-05, + "loss": 0.2212, + "step": 13750 + }, + { + "epoch": 0.6530408773678963, + "grad_norm": 0.462890625, + "learning_rate": 1.825611042903356e-05, + "loss": 0.2233, + "step": 13755 + }, + { + "epoch": 0.6532782604567251, + "grad_norm": 0.5390625, + "learning_rate": 1.8240009321539476e-05, + "loss": 0.2268, + "step": 13760 + }, + { + "epoch": 0.6535156435455538, + "grad_norm": 0.447265625, + "learning_rate": 1.822391392093553e-05, + "loss": 0.2243, + "step": 13765 + }, + { + "epoch": 0.6537530266343826, + "grad_norm": 0.443359375, + "learning_rate": 1.8207824237141267e-05, + "loss": 0.2222, + "step": 13770 + }, + { + "epoch": 0.6539904097232113, + "grad_norm": 0.3984375, + "learning_rate": 1.819174028007266e-05, + "loss": 0.2219, + "step": 13775 + }, + { + "epoch": 0.65422779281204, + "grad_norm": 0.40625, + "learning_rate": 1.81756620596422e-05, + "loss": 0.2231, + "step": 13780 + }, + { + "epoch": 0.6544651759008688, + "grad_norm": 0.453125, + "learning_rate": 1.8159589585758806e-05, + "loss": 0.2217, + "step": 13785 + }, + { + "epoch": 0.6547025589896975, + "grad_norm": 0.482421875, + "learning_rate": 1.8143522868327884e-05, + "loss": 0.2264, + "step": 13790 + }, + { + "epoch": 0.6549399420785263, + "grad_norm": 0.462890625, + "learning_rate": 1.8127461917251266e-05, + "loss": 0.2214, + "step": 13795 + }, + { + "epoch": 0.655177325167355, + "grad_norm": 0.4453125, + "learning_rate": 1.8111406742427245e-05, + "loss": 0.223, + "step": 13800 + }, + { + "epoch": 0.6554147082561839, + "grad_norm": 0.484375, + "learning_rate": 1.8095357353750553e-05, + "loss": 0.2197, + "step": 13805 + }, + { + "epoch": 0.6556520913450126, + "grad_norm": 0.4609375, + "learning_rate": 1.8079313761112372e-05, + "loss": 0.2234, + "step": 13810 + }, + { + "epoch": 0.6558894744338414, + "grad_norm": 0.41796875, + "learning_rate": 1.8063275974400274e-05, + "loss": 0.2235, + "step": 13815 + }, + { + "epoch": 0.6561268575226701, + "grad_norm": 0.447265625, + "learning_rate": 1.8047244003498283e-05, + "loss": 0.2242, + "step": 13820 + }, + { + "epoch": 0.6563642406114988, + "grad_norm": 0.423828125, + "learning_rate": 1.8031217858286824e-05, + "loss": 0.223, + "step": 13825 + }, + { + "epoch": 0.6566016237003276, + "grad_norm": 0.462890625, + "learning_rate": 1.8015197548642737e-05, + "loss": 0.2242, + "step": 13830 + }, + { + "epoch": 0.6568390067891563, + "grad_norm": 0.47265625, + "learning_rate": 1.799918308443928e-05, + "loss": 0.2225, + "step": 13835 + }, + { + "epoch": 0.6570763898779851, + "grad_norm": 0.44921875, + "learning_rate": 1.798317447554608e-05, + "loss": 0.2237, + "step": 13840 + }, + { + "epoch": 0.6573137729668138, + "grad_norm": 0.423828125, + "learning_rate": 1.796717173182918e-05, + "loss": 0.2261, + "step": 13845 + }, + { + "epoch": 0.6575511560556426, + "grad_norm": 0.453125, + "learning_rate": 1.7951174863150982e-05, + "loss": 0.2273, + "step": 13850 + }, + { + "epoch": 0.6577885391444713, + "grad_norm": 0.447265625, + "learning_rate": 1.7935183879370298e-05, + "loss": 0.2255, + "step": 13855 + }, + { + "epoch": 0.6580259222333001, + "grad_norm": 0.453125, + "learning_rate": 1.7919198790342313e-05, + "loss": 0.2261, + "step": 13860 + }, + { + "epoch": 0.6582633053221288, + "grad_norm": 0.546875, + "learning_rate": 1.7903219605918524e-05, + "loss": 0.2247, + "step": 13865 + }, + { + "epoch": 0.6585006884109577, + "grad_norm": 0.4296875, + "learning_rate": 1.788724633594686e-05, + "loss": 0.2254, + "step": 13870 + }, + { + "epoch": 0.6587380714997864, + "grad_norm": 0.5, + "learning_rate": 1.787127899027156e-05, + "loss": 0.2252, + "step": 13875 + }, + { + "epoch": 0.6589754545886151, + "grad_norm": 0.51953125, + "learning_rate": 1.785531757873325e-05, + "loss": 0.2297, + "step": 13880 + }, + { + "epoch": 0.6592128376774439, + "grad_norm": 0.421875, + "learning_rate": 1.7839362111168846e-05, + "loss": 0.225, + "step": 13885 + }, + { + "epoch": 0.6594502207662726, + "grad_norm": 0.39453125, + "learning_rate": 1.7823412597411642e-05, + "loss": 0.2207, + "step": 13890 + }, + { + "epoch": 0.6596876038551014, + "grad_norm": 0.40625, + "learning_rate": 1.780746904729125e-05, + "loss": 0.225, + "step": 13895 + }, + { + "epoch": 0.6599249869439301, + "grad_norm": 0.4453125, + "learning_rate": 1.7791531470633625e-05, + "loss": 0.2226, + "step": 13900 + }, + { + "epoch": 0.6601623700327589, + "grad_norm": 0.439453125, + "learning_rate": 1.7775599877261002e-05, + "loss": 0.2294, + "step": 13905 + }, + { + "epoch": 0.6603997531215876, + "grad_norm": 0.423828125, + "learning_rate": 1.7759674276991938e-05, + "loss": 0.2244, + "step": 13910 + }, + { + "epoch": 0.6606371362104164, + "grad_norm": 0.38671875, + "learning_rate": 1.774375467964134e-05, + "loss": 0.2227, + "step": 13915 + }, + { + "epoch": 0.6608745192992451, + "grad_norm": 0.470703125, + "learning_rate": 1.7727841095020365e-05, + "loss": 0.2255, + "step": 13920 + }, + { + "epoch": 0.6611119023880738, + "grad_norm": 0.447265625, + "learning_rate": 1.7711933532936487e-05, + "loss": 0.2235, + "step": 13925 + }, + { + "epoch": 0.6613492854769026, + "grad_norm": 0.46484375, + "learning_rate": 1.7696032003193452e-05, + "loss": 0.2235, + "step": 13930 + }, + { + "epoch": 0.6615866685657313, + "grad_norm": 0.4140625, + "learning_rate": 1.7680136515591322e-05, + "loss": 0.22, + "step": 13935 + }, + { + "epoch": 0.6618240516545602, + "grad_norm": 0.4296875, + "learning_rate": 1.7664247079926406e-05, + "loss": 0.2236, + "step": 13940 + }, + { + "epoch": 0.6620614347433889, + "grad_norm": 0.42578125, + "learning_rate": 1.764836370599129e-05, + "loss": 0.2237, + "step": 13945 + }, + { + "epoch": 0.6622988178322177, + "grad_norm": 0.462890625, + "learning_rate": 1.763248640357481e-05, + "loss": 0.2194, + "step": 13950 + }, + { + "epoch": 0.6625362009210464, + "grad_norm": 0.45703125, + "learning_rate": 1.76166151824621e-05, + "loss": 0.22, + "step": 13955 + }, + { + "epoch": 0.6627735840098752, + "grad_norm": 0.494140625, + "learning_rate": 1.7600750052434506e-05, + "loss": 0.2266, + "step": 13960 + }, + { + "epoch": 0.6630109670987039, + "grad_norm": 0.451171875, + "learning_rate": 1.7584891023269645e-05, + "loss": 0.2215, + "step": 13965 + }, + { + "epoch": 0.6632483501875326, + "grad_norm": 0.53515625, + "learning_rate": 1.7569038104741355e-05, + "loss": 0.2239, + "step": 13970 + }, + { + "epoch": 0.6634857332763614, + "grad_norm": 0.50390625, + "learning_rate": 1.7553191306619715e-05, + "loss": 0.2243, + "step": 13975 + }, + { + "epoch": 0.6637231163651901, + "grad_norm": 0.396484375, + "learning_rate": 1.7537350638671047e-05, + "loss": 0.2234, + "step": 13980 + }, + { + "epoch": 0.6639604994540189, + "grad_norm": 0.490234375, + "learning_rate": 1.752151611065788e-05, + "loss": 0.2222, + "step": 13985 + }, + { + "epoch": 0.6641978825428476, + "grad_norm": 0.4453125, + "learning_rate": 1.750568773233896e-05, + "loss": 0.2226, + "step": 13990 + }, + { + "epoch": 0.6644352656316764, + "grad_norm": 0.419921875, + "learning_rate": 1.7489865513469232e-05, + "loss": 0.2248, + "step": 13995 + }, + { + "epoch": 0.6646726487205051, + "grad_norm": 0.39453125, + "learning_rate": 1.7474049463799887e-05, + "loss": 0.2252, + "step": 14000 + }, + { + "epoch": 0.664910031809334, + "grad_norm": 0.4921875, + "learning_rate": 1.7458239593078258e-05, + "loss": 0.2208, + "step": 14005 + }, + { + "epoch": 0.6651474148981626, + "grad_norm": 0.427734375, + "learning_rate": 1.7442435911047896e-05, + "loss": 0.2261, + "step": 14010 + }, + { + "epoch": 0.6653847979869915, + "grad_norm": 0.40625, + "learning_rate": 1.742663842744856e-05, + "loss": 0.2232, + "step": 14015 + }, + { + "epoch": 0.6656221810758202, + "grad_norm": 0.498046875, + "learning_rate": 1.741084715201615e-05, + "loss": 0.2255, + "step": 14020 + }, + { + "epoch": 0.6658595641646489, + "grad_norm": 0.5, + "learning_rate": 1.739506209448276e-05, + "loss": 0.2261, + "step": 14025 + }, + { + "epoch": 0.6660969472534777, + "grad_norm": 0.41015625, + "learning_rate": 1.737928326457664e-05, + "loss": 0.2201, + "step": 14030 + }, + { + "epoch": 0.6663343303423064, + "grad_norm": 0.423828125, + "learning_rate": 1.7363510672022237e-05, + "loss": 0.2205, + "step": 14035 + }, + { + "epoch": 0.6665717134311352, + "grad_norm": 0.421875, + "learning_rate": 1.7347744326540112e-05, + "loss": 0.2256, + "step": 14040 + }, + { + "epoch": 0.6668090965199639, + "grad_norm": 0.439453125, + "learning_rate": 1.7331984237846986e-05, + "loss": 0.2244, + "step": 14045 + }, + { + "epoch": 0.6670464796087927, + "grad_norm": 0.47265625, + "learning_rate": 1.7316230415655738e-05, + "loss": 0.2215, + "step": 14050 + }, + { + "epoch": 0.6672838626976214, + "grad_norm": 0.4765625, + "learning_rate": 1.730048286967538e-05, + "loss": 0.2234, + "step": 14055 + }, + { + "epoch": 0.6675212457864502, + "grad_norm": 0.46484375, + "learning_rate": 1.7284741609611045e-05, + "loss": 0.2249, + "step": 14060 + }, + { + "epoch": 0.6677586288752789, + "grad_norm": 0.41796875, + "learning_rate": 1.726900664516401e-05, + "loss": 0.226, + "step": 14065 + }, + { + "epoch": 0.6679960119641076, + "grad_norm": 0.490234375, + "learning_rate": 1.7253277986031657e-05, + "loss": 0.2262, + "step": 14070 + }, + { + "epoch": 0.6682333950529364, + "grad_norm": 0.458984375, + "learning_rate": 1.7237555641907472e-05, + "loss": 0.2225, + "step": 14075 + }, + { + "epoch": 0.6684707781417651, + "grad_norm": 0.482421875, + "learning_rate": 1.722183962248109e-05, + "loss": 0.2231, + "step": 14080 + }, + { + "epoch": 0.668708161230594, + "grad_norm": 0.439453125, + "learning_rate": 1.7206129937438207e-05, + "loss": 0.2234, + "step": 14085 + }, + { + "epoch": 0.6689455443194227, + "grad_norm": 0.380859375, + "learning_rate": 1.719042659646064e-05, + "loss": 0.2241, + "step": 14090 + }, + { + "epoch": 0.6691829274082515, + "grad_norm": 0.451171875, + "learning_rate": 1.717472960922627e-05, + "loss": 0.2193, + "step": 14095 + }, + { + "epoch": 0.6694203104970802, + "grad_norm": 0.42578125, + "learning_rate": 1.7159038985409098e-05, + "loss": 0.2203, + "step": 14100 + }, + { + "epoch": 0.669657693585909, + "grad_norm": 0.46484375, + "learning_rate": 1.714335473467918e-05, + "loss": 0.2232, + "step": 14105 + }, + { + "epoch": 0.6698950766747377, + "grad_norm": 0.40234375, + "learning_rate": 1.7127676866702634e-05, + "loss": 0.2246, + "step": 14110 + }, + { + "epoch": 0.6701324597635664, + "grad_norm": 0.42578125, + "learning_rate": 1.7112005391141673e-05, + "loss": 0.2232, + "step": 14115 + }, + { + "epoch": 0.6703698428523952, + "grad_norm": 0.55859375, + "learning_rate": 1.7096340317654547e-05, + "loss": 0.2238, + "step": 14120 + }, + { + "epoch": 0.6706072259412239, + "grad_norm": 0.439453125, + "learning_rate": 1.7080681655895587e-05, + "loss": 0.2252, + "step": 14125 + }, + { + "epoch": 0.6708446090300527, + "grad_norm": 0.51953125, + "learning_rate": 1.7065029415515125e-05, + "loss": 0.2243, + "step": 14130 + }, + { + "epoch": 0.6710819921188814, + "grad_norm": 0.427734375, + "learning_rate": 1.7049383606159596e-05, + "loss": 0.2201, + "step": 14135 + }, + { + "epoch": 0.6713193752077102, + "grad_norm": 0.482421875, + "learning_rate": 1.7033744237471416e-05, + "loss": 0.2286, + "step": 14140 + }, + { + "epoch": 0.6715567582965389, + "grad_norm": 0.423828125, + "learning_rate": 1.7018111319089087e-05, + "loss": 0.2239, + "step": 14145 + }, + { + "epoch": 0.6717941413853677, + "grad_norm": 0.4296875, + "learning_rate": 1.7002484860647082e-05, + "loss": 0.2215, + "step": 14150 + }, + { + "epoch": 0.6720315244741965, + "grad_norm": 0.38671875, + "learning_rate": 1.6986864871775908e-05, + "loss": 0.2254, + "step": 14155 + }, + { + "epoch": 0.6722689075630253, + "grad_norm": 0.48828125, + "learning_rate": 1.697125136210212e-05, + "loss": 0.2235, + "step": 14160 + }, + { + "epoch": 0.672506290651854, + "grad_norm": 0.40625, + "learning_rate": 1.695564434124824e-05, + "loss": 0.2252, + "step": 14165 + }, + { + "epoch": 0.6727436737406827, + "grad_norm": 0.44140625, + "learning_rate": 1.6940043818832803e-05, + "loss": 0.2249, + "step": 14170 + }, + { + "epoch": 0.6729810568295115, + "grad_norm": 0.419921875, + "learning_rate": 1.6924449804470332e-05, + "loss": 0.2255, + "step": 14175 + }, + { + "epoch": 0.6732184399183402, + "grad_norm": 0.4765625, + "learning_rate": 1.6908862307771362e-05, + "loss": 0.2254, + "step": 14180 + }, + { + "epoch": 0.673455823007169, + "grad_norm": 0.439453125, + "learning_rate": 1.6893281338342392e-05, + "loss": 0.2302, + "step": 14185 + }, + { + "epoch": 0.6736932060959977, + "grad_norm": 0.48046875, + "learning_rate": 1.68777069057859e-05, + "loss": 0.2268, + "step": 14190 + }, + { + "epoch": 0.6739305891848265, + "grad_norm": 0.453125, + "learning_rate": 1.6862139019700335e-05, + "loss": 0.224, + "step": 14195 + }, + { + "epoch": 0.6741679722736552, + "grad_norm": 0.43359375, + "learning_rate": 1.6846577689680124e-05, + "loss": 0.2204, + "step": 14200 + }, + { + "epoch": 0.674405355362484, + "grad_norm": 0.384765625, + "learning_rate": 1.6831022925315636e-05, + "loss": 0.2238, + "step": 14205 + }, + { + "epoch": 0.6746427384513127, + "grad_norm": 0.4140625, + "learning_rate": 1.6815474736193208e-05, + "loss": 0.2266, + "step": 14210 + }, + { + "epoch": 0.6748801215401414, + "grad_norm": 0.51171875, + "learning_rate": 1.6799933131895114e-05, + "loss": 0.2271, + "step": 14215 + }, + { + "epoch": 0.6751175046289702, + "grad_norm": 0.443359375, + "learning_rate": 1.6784398121999564e-05, + "loss": 0.2191, + "step": 14220 + }, + { + "epoch": 0.675354887717799, + "grad_norm": 0.46875, + "learning_rate": 1.6768869716080732e-05, + "loss": 0.2252, + "step": 14225 + }, + { + "epoch": 0.6755922708066278, + "grad_norm": 0.498046875, + "learning_rate": 1.6753347923708697e-05, + "loss": 0.2281, + "step": 14230 + }, + { + "epoch": 0.6758296538954565, + "grad_norm": 0.494140625, + "learning_rate": 1.6737832754449466e-05, + "loss": 0.2225, + "step": 14235 + }, + { + "epoch": 0.6760670369842853, + "grad_norm": 0.439453125, + "learning_rate": 1.6722324217864965e-05, + "loss": 0.2213, + "step": 14240 + }, + { + "epoch": 0.676304420073114, + "grad_norm": 0.423828125, + "learning_rate": 1.6706822323513044e-05, + "loss": 0.2228, + "step": 14245 + }, + { + "epoch": 0.6765418031619428, + "grad_norm": 0.4375, + "learning_rate": 1.669132708094746e-05, + "loss": 0.226, + "step": 14250 + }, + { + "epoch": 0.6767791862507715, + "grad_norm": 0.3984375, + "learning_rate": 1.6675838499717822e-05, + "loss": 0.2229, + "step": 14255 + }, + { + "epoch": 0.6770165693396002, + "grad_norm": 0.412109375, + "learning_rate": 1.6660356589369703e-05, + "loss": 0.2192, + "step": 14260 + }, + { + "epoch": 0.677253952428429, + "grad_norm": 0.4609375, + "learning_rate": 1.6644881359444518e-05, + "loss": 0.2233, + "step": 14265 + }, + { + "epoch": 0.6774913355172577, + "grad_norm": 0.416015625, + "learning_rate": 1.6629412819479616e-05, + "loss": 0.2261, + "step": 14270 + }, + { + "epoch": 0.6777287186060865, + "grad_norm": 0.44921875, + "learning_rate": 1.6613950979008135e-05, + "loss": 0.226, + "step": 14275 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 0.5390625, + "learning_rate": 1.6598495847559175e-05, + "loss": 0.225, + "step": 14280 + }, + { + "epoch": 0.678203484783744, + "grad_norm": 0.4609375, + "learning_rate": 1.6583047434657643e-05, + "loss": 0.2241, + "step": 14285 + }, + { + "epoch": 0.6784408678725727, + "grad_norm": 0.45703125, + "learning_rate": 1.656760574982435e-05, + "loss": 0.2249, + "step": 14290 + }, + { + "epoch": 0.6786782509614016, + "grad_norm": 0.419921875, + "learning_rate": 1.6552170802575906e-05, + "loss": 0.224, + "step": 14295 + }, + { + "epoch": 0.6789156340502303, + "grad_norm": 0.396484375, + "learning_rate": 1.653674260242482e-05, + "loss": 0.2219, + "step": 14300 + }, + { + "epoch": 0.6791530171390591, + "grad_norm": 0.435546875, + "learning_rate": 1.6521321158879415e-05, + "loss": 0.2238, + "step": 14305 + }, + { + "epoch": 0.6793904002278878, + "grad_norm": 0.470703125, + "learning_rate": 1.6505906481443854e-05, + "loss": 0.2232, + "step": 14310 + }, + { + "epoch": 0.6796277833167165, + "grad_norm": 0.451171875, + "learning_rate": 1.6490498579618136e-05, + "loss": 0.2241, + "step": 14315 + }, + { + "epoch": 0.6798651664055453, + "grad_norm": 0.50390625, + "learning_rate": 1.647509746289807e-05, + "loss": 0.2235, + "step": 14320 + }, + { + "epoch": 0.680102549494374, + "grad_norm": 0.45703125, + "learning_rate": 1.6459703140775316e-05, + "loss": 0.2301, + "step": 14325 + }, + { + "epoch": 0.6803399325832028, + "grad_norm": 0.4453125, + "learning_rate": 1.644431562273731e-05, + "loss": 0.2239, + "step": 14330 + }, + { + "epoch": 0.6805773156720315, + "grad_norm": 0.400390625, + "learning_rate": 1.6428934918267314e-05, + "loss": 0.2255, + "step": 14335 + }, + { + "epoch": 0.6808146987608603, + "grad_norm": 0.52734375, + "learning_rate": 1.6413561036844373e-05, + "loss": 0.2223, + "step": 14340 + }, + { + "epoch": 0.681052081849689, + "grad_norm": 0.466796875, + "learning_rate": 1.6398193987943362e-05, + "loss": 0.2248, + "step": 14345 + }, + { + "epoch": 0.6812894649385178, + "grad_norm": 0.408203125, + "learning_rate": 1.6382833781034914e-05, + "loss": 0.2236, + "step": 14350 + }, + { + "epoch": 0.6815268480273465, + "grad_norm": 0.474609375, + "learning_rate": 1.6367480425585447e-05, + "loss": 0.2245, + "step": 14355 + }, + { + "epoch": 0.6817642311161752, + "grad_norm": 0.474609375, + "learning_rate": 1.635213393105717e-05, + "loss": 0.222, + "step": 14360 + }, + { + "epoch": 0.682001614205004, + "grad_norm": 0.45703125, + "learning_rate": 1.633679430690806e-05, + "loss": 0.223, + "step": 14365 + }, + { + "epoch": 0.6822389972938327, + "grad_norm": 0.4921875, + "learning_rate": 1.632146156259186e-05, + "loss": 0.2269, + "step": 14370 + }, + { + "epoch": 0.6824763803826616, + "grad_norm": 0.431640625, + "learning_rate": 1.6306135707558065e-05, + "loss": 0.2254, + "step": 14375 + }, + { + "epoch": 0.6827137634714903, + "grad_norm": 0.515625, + "learning_rate": 1.629081675125193e-05, + "loss": 0.2256, + "step": 14380 + }, + { + "epoch": 0.6829511465603191, + "grad_norm": 0.412109375, + "learning_rate": 1.6275504703114458e-05, + "loss": 0.2261, + "step": 14385 + }, + { + "epoch": 0.6831885296491478, + "grad_norm": 0.435546875, + "learning_rate": 1.6260199572582395e-05, + "loss": 0.2246, + "step": 14390 + }, + { + "epoch": 0.6834259127379766, + "grad_norm": 0.453125, + "learning_rate": 1.6244901369088237e-05, + "loss": 0.2254, + "step": 14395 + }, + { + "epoch": 0.6836632958268053, + "grad_norm": 0.427734375, + "learning_rate": 1.622961010206017e-05, + "loss": 0.223, + "step": 14400 + }, + { + "epoch": 0.683900678915634, + "grad_norm": 0.474609375, + "learning_rate": 1.6214325780922154e-05, + "loss": 0.227, + "step": 14405 + }, + { + "epoch": 0.6841380620044628, + "grad_norm": 0.451171875, + "learning_rate": 1.6199048415093842e-05, + "loss": 0.2262, + "step": 14410 + }, + { + "epoch": 0.6843754450932915, + "grad_norm": 0.482421875, + "learning_rate": 1.61837780139906e-05, + "loss": 0.2241, + "step": 14415 + }, + { + "epoch": 0.6846128281821203, + "grad_norm": 0.40234375, + "learning_rate": 1.61685145870235e-05, + "loss": 0.2251, + "step": 14420 + }, + { + "epoch": 0.684850211270949, + "grad_norm": 0.42578125, + "learning_rate": 1.6153258143599344e-05, + "loss": 0.2202, + "step": 14425 + }, + { + "epoch": 0.6850875943597778, + "grad_norm": 0.431640625, + "learning_rate": 1.613800869312059e-05, + "loss": 0.2228, + "step": 14430 + }, + { + "epoch": 0.6853249774486065, + "grad_norm": 0.3984375, + "learning_rate": 1.612276624498542e-05, + "loss": 0.2227, + "step": 14435 + }, + { + "epoch": 0.6855623605374354, + "grad_norm": 0.4375, + "learning_rate": 1.6107530808587667e-05, + "loss": 0.2215, + "step": 14440 + }, + { + "epoch": 0.6857997436262641, + "grad_norm": 0.5, + "learning_rate": 1.6092302393316876e-05, + "loss": 0.2234, + "step": 14445 + }, + { + "epoch": 0.6860371267150929, + "grad_norm": 0.423828125, + "learning_rate": 1.607708100855825e-05, + "loss": 0.2269, + "step": 14450 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.384765625, + "learning_rate": 1.6061866663692652e-05, + "loss": 0.2227, + "step": 14455 + }, + { + "epoch": 0.6865118928927503, + "grad_norm": 0.439453125, + "learning_rate": 1.6046659368096617e-05, + "loss": 0.2177, + "step": 14460 + }, + { + "epoch": 0.6867492759815791, + "grad_norm": 0.439453125, + "learning_rate": 1.6031459131142325e-05, + "loss": 0.2248, + "step": 14465 + }, + { + "epoch": 0.6869866590704078, + "grad_norm": 0.435546875, + "learning_rate": 1.6016265962197626e-05, + "loss": 0.2207, + "step": 14470 + }, + { + "epoch": 0.6872240421592366, + "grad_norm": 0.427734375, + "learning_rate": 1.600107987062599e-05, + "loss": 0.2259, + "step": 14475 + }, + { + "epoch": 0.6874614252480653, + "grad_norm": 0.51171875, + "learning_rate": 1.5985900865786547e-05, + "loss": 0.2265, + "step": 14480 + }, + { + "epoch": 0.6876988083368941, + "grad_norm": 0.451171875, + "learning_rate": 1.5970728957034032e-05, + "loss": 0.2217, + "step": 14485 + }, + { + "epoch": 0.6879361914257228, + "grad_norm": 0.43359375, + "learning_rate": 1.595556415371884e-05, + "loss": 0.222, + "step": 14490 + }, + { + "epoch": 0.6881735745145516, + "grad_norm": 0.58984375, + "learning_rate": 1.5940406465186975e-05, + "loss": 0.2258, + "step": 14495 + }, + { + "epoch": 0.6884109576033803, + "grad_norm": 0.484375, + "learning_rate": 1.5925255900780022e-05, + "loss": 0.2241, + "step": 14500 + }, + { + "epoch": 0.688648340692209, + "grad_norm": 0.46484375, + "learning_rate": 1.5910112469835232e-05, + "loss": 0.2215, + "step": 14505 + }, + { + "epoch": 0.6888857237810379, + "grad_norm": 0.43359375, + "learning_rate": 1.589497618168541e-05, + "loss": 0.2225, + "step": 14510 + }, + { + "epoch": 0.6891231068698666, + "grad_norm": 0.451171875, + "learning_rate": 1.5879847045659025e-05, + "loss": 0.2219, + "step": 14515 + }, + { + "epoch": 0.6893604899586954, + "grad_norm": 0.50390625, + "learning_rate": 1.586472507108004e-05, + "loss": 0.2263, + "step": 14520 + }, + { + "epoch": 0.6895978730475241, + "grad_norm": 0.42578125, + "learning_rate": 1.5849610267268093e-05, + "loss": 0.2236, + "step": 14525 + }, + { + "epoch": 0.6898352561363529, + "grad_norm": 0.486328125, + "learning_rate": 1.583450264353835e-05, + "loss": 0.2275, + "step": 14530 + }, + { + "epoch": 0.6900726392251816, + "grad_norm": 0.458984375, + "learning_rate": 1.5819402209201602e-05, + "loss": 0.2229, + "step": 14535 + }, + { + "epoch": 0.6903100223140104, + "grad_norm": 0.451171875, + "learning_rate": 1.5804308973564137e-05, + "loss": 0.2205, + "step": 14540 + }, + { + "epoch": 0.6905474054028391, + "grad_norm": 0.4453125, + "learning_rate": 1.5789222945927877e-05, + "loss": 0.2245, + "step": 14545 + }, + { + "epoch": 0.6907847884916678, + "grad_norm": 0.42578125, + "learning_rate": 1.5774144135590265e-05, + "loss": 0.2235, + "step": 14550 + }, + { + "epoch": 0.6910221715804966, + "grad_norm": 0.423828125, + "learning_rate": 1.57590725518443e-05, + "loss": 0.2292, + "step": 14555 + }, + { + "epoch": 0.6912595546693253, + "grad_norm": 0.43359375, + "learning_rate": 1.574400820397853e-05, + "loss": 0.2206, + "step": 14560 + }, + { + "epoch": 0.6914969377581541, + "grad_norm": 0.5078125, + "learning_rate": 1.572895110127704e-05, + "loss": 0.2258, + "step": 14565 + }, + { + "epoch": 0.6917343208469828, + "grad_norm": 0.431640625, + "learning_rate": 1.5713901253019463e-05, + "loss": 0.2245, + "step": 14570 + }, + { + "epoch": 0.6919717039358116, + "grad_norm": 0.44140625, + "learning_rate": 1.5698858668480955e-05, + "loss": 0.2228, + "step": 14575 + }, + { + "epoch": 0.6922090870246403, + "grad_norm": 0.43359375, + "learning_rate": 1.5683823356932183e-05, + "loss": 0.2236, + "step": 14580 + }, + { + "epoch": 0.6924464701134692, + "grad_norm": 0.5234375, + "learning_rate": 1.5668795327639335e-05, + "loss": 0.2242, + "step": 14585 + }, + { + "epoch": 0.6926838532022979, + "grad_norm": 0.455078125, + "learning_rate": 1.5653774589864134e-05, + "loss": 0.2249, + "step": 14590 + }, + { + "epoch": 0.6929212362911267, + "grad_norm": 0.453125, + "learning_rate": 1.5638761152863785e-05, + "loss": 0.2202, + "step": 14595 + }, + { + "epoch": 0.6931586193799554, + "grad_norm": 0.447265625, + "learning_rate": 1.5623755025891003e-05, + "loss": 0.2214, + "step": 14600 + }, + { + "epoch": 0.6933960024687841, + "grad_norm": 0.47265625, + "learning_rate": 1.560875621819399e-05, + "loss": 0.2279, + "step": 14605 + }, + { + "epoch": 0.6936333855576129, + "grad_norm": 0.515625, + "learning_rate": 1.559376473901644e-05, + "loss": 0.218, + "step": 14610 + }, + { + "epoch": 0.6938707686464416, + "grad_norm": 0.4375, + "learning_rate": 1.557878059759755e-05, + "loss": 0.2237, + "step": 14615 + }, + { + "epoch": 0.6941081517352704, + "grad_norm": 0.455078125, + "learning_rate": 1.556380380317197e-05, + "loss": 0.2212, + "step": 14620 + }, + { + "epoch": 0.6943455348240991, + "grad_norm": 0.462890625, + "learning_rate": 1.5548834364969823e-05, + "loss": 0.2219, + "step": 14625 + }, + { + "epoch": 0.6945829179129279, + "grad_norm": 0.515625, + "learning_rate": 1.5533872292216704e-05, + "loss": 0.2256, + "step": 14630 + }, + { + "epoch": 0.6948203010017566, + "grad_norm": 0.498046875, + "learning_rate": 1.5518917594133685e-05, + "loss": 0.2236, + "step": 14635 + }, + { + "epoch": 0.6950576840905854, + "grad_norm": 0.43359375, + "learning_rate": 1.5503970279937285e-05, + "loss": 0.2237, + "step": 14640 + }, + { + "epoch": 0.6952950671794141, + "grad_norm": 0.38671875, + "learning_rate": 1.548903035883943e-05, + "loss": 0.2265, + "step": 14645 + }, + { + "epoch": 0.6955324502682428, + "grad_norm": 0.50390625, + "learning_rate": 1.5474097840047555e-05, + "loss": 0.225, + "step": 14650 + }, + { + "epoch": 0.6957698333570717, + "grad_norm": 0.431640625, + "learning_rate": 1.545917273276449e-05, + "loss": 0.2203, + "step": 14655 + }, + { + "epoch": 0.6960072164459004, + "grad_norm": 0.42578125, + "learning_rate": 1.544425504618853e-05, + "loss": 0.2232, + "step": 14660 + }, + { + "epoch": 0.6962445995347292, + "grad_norm": 0.4609375, + "learning_rate": 1.542934478951335e-05, + "loss": 0.2191, + "step": 14665 + }, + { + "epoch": 0.6964819826235579, + "grad_norm": 0.53515625, + "learning_rate": 1.5414441971928096e-05, + "loss": 0.2193, + "step": 14670 + }, + { + "epoch": 0.6967193657123867, + "grad_norm": 0.578125, + "learning_rate": 1.5399546602617284e-05, + "loss": 0.2242, + "step": 14675 + }, + { + "epoch": 0.6969567488012154, + "grad_norm": 0.51953125, + "learning_rate": 1.53846586907609e-05, + "loss": 0.2264, + "step": 14680 + }, + { + "epoch": 0.6971941318900442, + "grad_norm": 0.50390625, + "learning_rate": 1.536977824553425e-05, + "loss": 0.2213, + "step": 14685 + }, + { + "epoch": 0.6974315149788729, + "grad_norm": 0.4609375, + "learning_rate": 1.5354905276108117e-05, + "loss": 0.2265, + "step": 14690 + }, + { + "epoch": 0.6976688980677016, + "grad_norm": 0.46484375, + "learning_rate": 1.5340039791648636e-05, + "loss": 0.2214, + "step": 14695 + }, + { + "epoch": 0.6979062811565304, + "grad_norm": 0.453125, + "learning_rate": 1.5325181801317337e-05, + "loss": 0.2194, + "step": 14700 + }, + { + "epoch": 0.6981436642453591, + "grad_norm": 0.435546875, + "learning_rate": 1.5310331314271127e-05, + "loss": 0.2258, + "step": 14705 + }, + { + "epoch": 0.6983810473341879, + "grad_norm": 0.43359375, + "learning_rate": 1.5295488339662296e-05, + "loss": 0.2242, + "step": 14710 + }, + { + "epoch": 0.6986184304230166, + "grad_norm": 0.4375, + "learning_rate": 1.5280652886638507e-05, + "loss": 0.2206, + "step": 14715 + }, + { + "epoch": 0.6988558135118454, + "grad_norm": 0.486328125, + "learning_rate": 1.526582496434278e-05, + "loss": 0.2261, + "step": 14720 + }, + { + "epoch": 0.6990931966006741, + "grad_norm": 0.44921875, + "learning_rate": 1.5251004581913491e-05, + "loss": 0.2214, + "step": 14725 + }, + { + "epoch": 0.699330579689503, + "grad_norm": 0.439453125, + "learning_rate": 1.5236191748484374e-05, + "loss": 0.221, + "step": 14730 + }, + { + "epoch": 0.6995679627783317, + "grad_norm": 0.421875, + "learning_rate": 1.5221386473184524e-05, + "loss": 0.2198, + "step": 14735 + }, + { + "epoch": 0.6998053458671605, + "grad_norm": 0.58203125, + "learning_rate": 1.5206588765138354e-05, + "loss": 0.2212, + "step": 14740 + }, + { + "epoch": 0.7000427289559892, + "grad_norm": 0.41796875, + "learning_rate": 1.5191798633465626e-05, + "loss": 0.2256, + "step": 14745 + }, + { + "epoch": 0.7002801120448179, + "grad_norm": 0.4140625, + "learning_rate": 1.5177016087281432e-05, + "loss": 0.2225, + "step": 14750 + }, + { + "epoch": 0.7005174951336467, + "grad_norm": 0.484375, + "learning_rate": 1.5162241135696178e-05, + "loss": 0.2241, + "step": 14755 + }, + { + "epoch": 0.7007548782224754, + "grad_norm": 0.52734375, + "learning_rate": 1.5147473787815613e-05, + "loss": 0.2233, + "step": 14760 + }, + { + "epoch": 0.7009922613113042, + "grad_norm": 0.46875, + "learning_rate": 1.5132714052740788e-05, + "loss": 0.2209, + "step": 14765 + }, + { + "epoch": 0.7012296444001329, + "grad_norm": 0.51171875, + "learning_rate": 1.511796193956805e-05, + "loss": 0.2245, + "step": 14770 + }, + { + "epoch": 0.7014670274889617, + "grad_norm": 0.462890625, + "learning_rate": 1.5103217457389056e-05, + "loss": 0.227, + "step": 14775 + }, + { + "epoch": 0.7017044105777904, + "grad_norm": 0.4296875, + "learning_rate": 1.5088480615290779e-05, + "loss": 0.2238, + "step": 14780 + }, + { + "epoch": 0.7019417936666192, + "grad_norm": 0.45703125, + "learning_rate": 1.5073751422355465e-05, + "loss": 0.2212, + "step": 14785 + }, + { + "epoch": 0.7021791767554479, + "grad_norm": 0.4296875, + "learning_rate": 1.5059029887660636e-05, + "loss": 0.2235, + "step": 14790 + }, + { + "epoch": 0.7024165598442766, + "grad_norm": 0.42578125, + "learning_rate": 1.5044316020279117e-05, + "loss": 0.2234, + "step": 14795 + }, + { + "epoch": 0.7026539429331055, + "grad_norm": 0.44140625, + "learning_rate": 1.502960982927899e-05, + "loss": 0.2213, + "step": 14800 + }, + { + "epoch": 0.7028913260219342, + "grad_norm": 0.404296875, + "learning_rate": 1.5014911323723627e-05, + "loss": 0.2217, + "step": 14805 + }, + { + "epoch": 0.703128709110763, + "grad_norm": 0.474609375, + "learning_rate": 1.5000220512671632e-05, + "loss": 0.2215, + "step": 14810 + }, + { + "epoch": 0.7033660921995917, + "grad_norm": 0.44140625, + "learning_rate": 1.4985537405176898e-05, + "loss": 0.2265, + "step": 14815 + }, + { + "epoch": 0.7036034752884205, + "grad_norm": 0.38671875, + "learning_rate": 1.4970862010288555e-05, + "loss": 0.2179, + "step": 14820 + }, + { + "epoch": 0.7038408583772492, + "grad_norm": 0.43359375, + "learning_rate": 1.4956194337050982e-05, + "loss": 0.2196, + "step": 14825 + }, + { + "epoch": 0.704078241466078, + "grad_norm": 0.5390625, + "learning_rate": 1.4941534394503792e-05, + "loss": 0.2224, + "step": 14830 + }, + { + "epoch": 0.7043156245549067, + "grad_norm": 0.416015625, + "learning_rate": 1.4926882191681854e-05, + "loss": 0.2243, + "step": 14835 + }, + { + "epoch": 0.7045530076437354, + "grad_norm": 0.451171875, + "learning_rate": 1.4912237737615248e-05, + "loss": 0.2257, + "step": 14840 + }, + { + "epoch": 0.7047903907325642, + "grad_norm": 0.42578125, + "learning_rate": 1.489760104132928e-05, + "loss": 0.2279, + "step": 14845 + }, + { + "epoch": 0.7050277738213929, + "grad_norm": 0.423828125, + "learning_rate": 1.4882972111844485e-05, + "loss": 0.2256, + "step": 14850 + }, + { + "epoch": 0.7052651569102217, + "grad_norm": 0.45703125, + "learning_rate": 1.4868350958176597e-05, + "loss": 0.2218, + "step": 14855 + }, + { + "epoch": 0.7055025399990504, + "grad_norm": 0.400390625, + "learning_rate": 1.4853737589336581e-05, + "loss": 0.2197, + "step": 14860 + }, + { + "epoch": 0.7057399230878793, + "grad_norm": 0.51171875, + "learning_rate": 1.4839132014330587e-05, + "loss": 0.2265, + "step": 14865 + }, + { + "epoch": 0.705977306176708, + "grad_norm": 0.40625, + "learning_rate": 1.4824534242159955e-05, + "loss": 0.2211, + "step": 14870 + }, + { + "epoch": 0.7062146892655368, + "grad_norm": 0.41015625, + "learning_rate": 1.480994428182122e-05, + "loss": 0.2234, + "step": 14875 + }, + { + "epoch": 0.7064520723543655, + "grad_norm": 0.431640625, + "learning_rate": 1.4795362142306126e-05, + "loss": 0.2242, + "step": 14880 + }, + { + "epoch": 0.7066894554431943, + "grad_norm": 0.470703125, + "learning_rate": 1.4780787832601583e-05, + "loss": 0.2238, + "step": 14885 + }, + { + "epoch": 0.706926838532023, + "grad_norm": 0.4609375, + "learning_rate": 1.4766221361689642e-05, + "loss": 0.2226, + "step": 14890 + }, + { + "epoch": 0.7071642216208517, + "grad_norm": 0.447265625, + "learning_rate": 1.4751662738547584e-05, + "loss": 0.2233, + "step": 14895 + }, + { + "epoch": 0.7074016047096805, + "grad_norm": 0.412109375, + "learning_rate": 1.4737111972147794e-05, + "loss": 0.2228, + "step": 14900 + }, + { + "epoch": 0.7076389877985092, + "grad_norm": 0.384765625, + "learning_rate": 1.4722569071457882e-05, + "loss": 0.2242, + "step": 14905 + }, + { + "epoch": 0.707876370887338, + "grad_norm": 0.3984375, + "learning_rate": 1.4708034045440528e-05, + "loss": 0.219, + "step": 14910 + }, + { + "epoch": 0.7081137539761667, + "grad_norm": 0.46875, + "learning_rate": 1.469350690305363e-05, + "loss": 0.2192, + "step": 14915 + }, + { + "epoch": 0.7083511370649955, + "grad_norm": 0.43359375, + "learning_rate": 1.4678987653250187e-05, + "loss": 0.2258, + "step": 14920 + }, + { + "epoch": 0.7085885201538242, + "grad_norm": 0.458984375, + "learning_rate": 1.4664476304978367e-05, + "loss": 0.2225, + "step": 14925 + }, + { + "epoch": 0.708825903242653, + "grad_norm": 0.447265625, + "learning_rate": 1.4649972867181414e-05, + "loss": 0.2201, + "step": 14930 + }, + { + "epoch": 0.7090632863314817, + "grad_norm": 0.4765625, + "learning_rate": 1.463547734879776e-05, + "loss": 0.2246, + "step": 14935 + }, + { + "epoch": 0.7093006694203104, + "grad_norm": 0.46875, + "learning_rate": 1.4620989758760919e-05, + "loss": 0.2237, + "step": 14940 + }, + { + "epoch": 0.7095380525091393, + "grad_norm": 0.443359375, + "learning_rate": 1.4606510105999521e-05, + "loss": 0.2272, + "step": 14945 + }, + { + "epoch": 0.709775435597968, + "grad_norm": 0.427734375, + "learning_rate": 1.4592038399437318e-05, + "loss": 0.222, + "step": 14950 + }, + { + "epoch": 0.7100128186867968, + "grad_norm": 0.40625, + "learning_rate": 1.4577574647993145e-05, + "loss": 0.2208, + "step": 14955 + }, + { + "epoch": 0.7102502017756255, + "grad_norm": 0.40625, + "learning_rate": 1.456311886058096e-05, + "loss": 0.2228, + "step": 14960 + }, + { + "epoch": 0.7104875848644543, + "grad_norm": 0.53515625, + "learning_rate": 1.4548671046109791e-05, + "loss": 0.2249, + "step": 14965 + }, + { + "epoch": 0.710724967953283, + "grad_norm": 0.41015625, + "learning_rate": 1.4534231213483767e-05, + "loss": 0.2204, + "step": 14970 + }, + { + "epoch": 0.7109623510421118, + "grad_norm": 0.470703125, + "learning_rate": 1.4519799371602072e-05, + "loss": 0.2225, + "step": 14975 + }, + { + "epoch": 0.7111997341309405, + "grad_norm": 0.416015625, + "learning_rate": 1.4505375529359014e-05, + "loss": 0.2226, + "step": 14980 + }, + { + "epoch": 0.7114371172197692, + "grad_norm": 0.404296875, + "learning_rate": 1.4490959695643924e-05, + "loss": 0.2231, + "step": 14985 + }, + { + "epoch": 0.711674500308598, + "grad_norm": 0.451171875, + "learning_rate": 1.4476551879341215e-05, + "loss": 0.223, + "step": 14990 + }, + { + "epoch": 0.7119118833974267, + "grad_norm": 0.447265625, + "learning_rate": 1.4462152089330367e-05, + "loss": 0.2252, + "step": 14995 + }, + { + "epoch": 0.7121492664862555, + "grad_norm": 0.478515625, + "learning_rate": 1.4447760334485882e-05, + "loss": 0.2218, + "step": 15000 + }, + { + "epoch": 0.7123866495750842, + "grad_norm": 0.453125, + "learning_rate": 1.443337662367737e-05, + "loss": 0.2244, + "step": 15005 + }, + { + "epoch": 0.712624032663913, + "grad_norm": 0.416015625, + "learning_rate": 1.441900096576942e-05, + "loss": 0.2252, + "step": 15010 + }, + { + "epoch": 0.7128614157527418, + "grad_norm": 0.41796875, + "learning_rate": 1.440463336962169e-05, + "loss": 0.2246, + "step": 15015 + }, + { + "epoch": 0.7130987988415706, + "grad_norm": 0.4375, + "learning_rate": 1.4390273844088859e-05, + "loss": 0.2258, + "step": 15020 + }, + { + "epoch": 0.7133361819303993, + "grad_norm": 0.3984375, + "learning_rate": 1.4375922398020649e-05, + "loss": 0.2232, + "step": 15025 + }, + { + "epoch": 0.7135735650192281, + "grad_norm": 0.4140625, + "learning_rate": 1.4361579040261796e-05, + "loss": 0.2232, + "step": 15030 + }, + { + "epoch": 0.7138109481080568, + "grad_norm": 0.37109375, + "learning_rate": 1.434724377965202e-05, + "loss": 0.2257, + "step": 15035 + }, + { + "epoch": 0.7140483311968855, + "grad_norm": 0.36328125, + "learning_rate": 1.4332916625026101e-05, + "loss": 0.2179, + "step": 15040 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.404296875, + "learning_rate": 1.4318597585213783e-05, + "loss": 0.2249, + "step": 15045 + }, + { + "epoch": 0.714523097374543, + "grad_norm": 0.42578125, + "learning_rate": 1.430428666903985e-05, + "loss": 0.2278, + "step": 15050 + }, + { + "epoch": 0.7147604804633718, + "grad_norm": 0.484375, + "learning_rate": 1.4289983885324021e-05, + "loss": 0.2195, + "step": 15055 + }, + { + "epoch": 0.7149978635522005, + "grad_norm": 0.423828125, + "learning_rate": 1.4275689242881063e-05, + "loss": 0.2236, + "step": 15060 + }, + { + "epoch": 0.7152352466410293, + "grad_norm": 0.453125, + "learning_rate": 1.4261402750520681e-05, + "loss": 0.2224, + "step": 15065 + }, + { + "epoch": 0.715472629729858, + "grad_norm": 0.458984375, + "learning_rate": 1.4247124417047597e-05, + "loss": 0.2231, + "step": 15070 + }, + { + "epoch": 0.7157100128186868, + "grad_norm": 0.466796875, + "learning_rate": 1.4232854251261458e-05, + "loss": 0.2256, + "step": 15075 + }, + { + "epoch": 0.7159473959075155, + "grad_norm": 0.48046875, + "learning_rate": 1.421859226195692e-05, + "loss": 0.2233, + "step": 15080 + }, + { + "epoch": 0.7161847789963443, + "grad_norm": 0.53125, + "learning_rate": 1.4204338457923575e-05, + "loss": 0.2172, + "step": 15085 + }, + { + "epoch": 0.7164221620851731, + "grad_norm": 0.419921875, + "learning_rate": 1.4190092847945979e-05, + "loss": 0.2259, + "step": 15090 + }, + { + "epoch": 0.7166595451740018, + "grad_norm": 0.439453125, + "learning_rate": 1.4175855440803635e-05, + "loss": 0.2211, + "step": 15095 + }, + { + "epoch": 0.7168969282628306, + "grad_norm": 0.41796875, + "learning_rate": 1.4161626245270986e-05, + "loss": 0.2233, + "step": 15100 + }, + { + "epoch": 0.7171343113516593, + "grad_norm": 0.458984375, + "learning_rate": 1.4147405270117441e-05, + "loss": 0.224, + "step": 15105 + }, + { + "epoch": 0.7173716944404881, + "grad_norm": 0.423828125, + "learning_rate": 1.4133192524107305e-05, + "loss": 0.2221, + "step": 15110 + }, + { + "epoch": 0.7176090775293168, + "grad_norm": 0.416015625, + "learning_rate": 1.4118988015999834e-05, + "loss": 0.2225, + "step": 15115 + }, + { + "epoch": 0.7178464606181456, + "grad_norm": 0.458984375, + "learning_rate": 1.4104791754549195e-05, + "loss": 0.2239, + "step": 15120 + }, + { + "epoch": 0.7180838437069743, + "grad_norm": 0.447265625, + "learning_rate": 1.4090603748504488e-05, + "loss": 0.2213, + "step": 15125 + }, + { + "epoch": 0.718321226795803, + "grad_norm": 0.455078125, + "learning_rate": 1.407642400660972e-05, + "loss": 0.2209, + "step": 15130 + }, + { + "epoch": 0.7185586098846318, + "grad_norm": 0.447265625, + "learning_rate": 1.4062252537603793e-05, + "loss": 0.2221, + "step": 15135 + }, + { + "epoch": 0.7187959929734605, + "grad_norm": 0.41015625, + "learning_rate": 1.4048089350220522e-05, + "loss": 0.2222, + "step": 15140 + }, + { + "epoch": 0.7190333760622893, + "grad_norm": 0.478515625, + "learning_rate": 1.4033934453188608e-05, + "loss": 0.2205, + "step": 15145 + }, + { + "epoch": 0.719270759151118, + "grad_norm": 0.443359375, + "learning_rate": 1.4019787855231659e-05, + "loss": 0.2217, + "step": 15150 + }, + { + "epoch": 0.7195081422399469, + "grad_norm": 0.4375, + "learning_rate": 1.4005649565068157e-05, + "loss": 0.2239, + "step": 15155 + }, + { + "epoch": 0.7197455253287756, + "grad_norm": 0.431640625, + "learning_rate": 1.3991519591411467e-05, + "loss": 0.2217, + "step": 15160 + }, + { + "epoch": 0.7199829084176044, + "grad_norm": 0.494140625, + "learning_rate": 1.397739794296982e-05, + "loss": 0.2244, + "step": 15165 + }, + { + "epoch": 0.7202202915064331, + "grad_norm": 0.474609375, + "learning_rate": 1.3963284628446333e-05, + "loss": 0.2249, + "step": 15170 + }, + { + "epoch": 0.7204576745952619, + "grad_norm": 0.453125, + "learning_rate": 1.3949179656538974e-05, + "loss": 0.2208, + "step": 15175 + }, + { + "epoch": 0.7206950576840906, + "grad_norm": 0.4140625, + "learning_rate": 1.3935083035940577e-05, + "loss": 0.2243, + "step": 15180 + }, + { + "epoch": 0.7209324407729193, + "grad_norm": 0.453125, + "learning_rate": 1.3920994775338814e-05, + "loss": 0.2258, + "step": 15185 + }, + { + "epoch": 0.7211698238617481, + "grad_norm": 0.46875, + "learning_rate": 1.3906914883416221e-05, + "loss": 0.2277, + "step": 15190 + }, + { + "epoch": 0.7214072069505768, + "grad_norm": 0.546875, + "learning_rate": 1.389284336885019e-05, + "loss": 0.2246, + "step": 15195 + }, + { + "epoch": 0.7216445900394056, + "grad_norm": 0.427734375, + "learning_rate": 1.3878780240312893e-05, + "loss": 0.2234, + "step": 15200 + }, + { + "epoch": 0.7218819731282343, + "grad_norm": 0.4375, + "learning_rate": 1.3864725506471404e-05, + "loss": 0.224, + "step": 15205 + }, + { + "epoch": 0.7221193562170631, + "grad_norm": 0.453125, + "learning_rate": 1.385067917598758e-05, + "loss": 0.2248, + "step": 15210 + }, + { + "epoch": 0.7223567393058918, + "grad_norm": 0.412109375, + "learning_rate": 1.3836641257518116e-05, + "loss": 0.2226, + "step": 15215 + }, + { + "epoch": 0.7225941223947207, + "grad_norm": 0.498046875, + "learning_rate": 1.3822611759714505e-05, + "loss": 0.2207, + "step": 15220 + }, + { + "epoch": 0.7228315054835494, + "grad_norm": 0.447265625, + "learning_rate": 1.3808590691223074e-05, + "loss": 0.2171, + "step": 15225 + }, + { + "epoch": 0.723068888572378, + "grad_norm": 0.373046875, + "learning_rate": 1.3794578060684945e-05, + "loss": 0.222, + "step": 15230 + }, + { + "epoch": 0.7233062716612069, + "grad_norm": 0.447265625, + "learning_rate": 1.3780573876736034e-05, + "loss": 0.2238, + "step": 15235 + }, + { + "epoch": 0.7235436547500356, + "grad_norm": 0.462890625, + "learning_rate": 1.3766578148007054e-05, + "loss": 0.2251, + "step": 15240 + }, + { + "epoch": 0.7237810378388644, + "grad_norm": 0.416015625, + "learning_rate": 1.3752590883123505e-05, + "loss": 0.2246, + "step": 15245 + }, + { + "epoch": 0.7240184209276931, + "grad_norm": 0.45703125, + "learning_rate": 1.3738612090705687e-05, + "loss": 0.2237, + "step": 15250 + }, + { + "epoch": 0.7242558040165219, + "grad_norm": 0.44140625, + "learning_rate": 1.3724641779368663e-05, + "loss": 0.2228, + "step": 15255 + }, + { + "epoch": 0.7244931871053506, + "grad_norm": 0.4921875, + "learning_rate": 1.3710679957722267e-05, + "loss": 0.2243, + "step": 15260 + }, + { + "epoch": 0.7247305701941794, + "grad_norm": 0.458984375, + "learning_rate": 1.3696726634371104e-05, + "loss": 0.227, + "step": 15265 + }, + { + "epoch": 0.7249679532830081, + "grad_norm": 0.412109375, + "learning_rate": 1.3682781817914553e-05, + "loss": 0.2235, + "step": 15270 + }, + { + "epoch": 0.7252053363718368, + "grad_norm": 0.400390625, + "learning_rate": 1.3668845516946737e-05, + "loss": 0.2263, + "step": 15275 + }, + { + "epoch": 0.7254427194606656, + "grad_norm": 0.4375, + "learning_rate": 1.3654917740056535e-05, + "loss": 0.2214, + "step": 15280 + }, + { + "epoch": 0.7256801025494943, + "grad_norm": 0.421875, + "learning_rate": 1.3640998495827573e-05, + "loss": 0.2247, + "step": 15285 + }, + { + "epoch": 0.7259174856383231, + "grad_norm": 0.53515625, + "learning_rate": 1.3627087792838205e-05, + "loss": 0.225, + "step": 15290 + }, + { + "epoch": 0.7261548687271518, + "grad_norm": 0.4375, + "learning_rate": 1.3613185639661563e-05, + "loss": 0.2236, + "step": 15295 + }, + { + "epoch": 0.7263922518159807, + "grad_norm": 0.5078125, + "learning_rate": 1.3599292044865447e-05, + "loss": 0.2229, + "step": 15300 + }, + { + "epoch": 0.7266296349048094, + "grad_norm": 0.44921875, + "learning_rate": 1.3585407017012445e-05, + "loss": 0.2249, + "step": 15305 + }, + { + "epoch": 0.7268670179936382, + "grad_norm": 0.447265625, + "learning_rate": 1.3571530564659812e-05, + "loss": 0.2194, + "step": 15310 + }, + { + "epoch": 0.7271044010824669, + "grad_norm": 0.486328125, + "learning_rate": 1.3557662696359574e-05, + "loss": 0.2244, + "step": 15315 + }, + { + "epoch": 0.7273417841712957, + "grad_norm": 0.37109375, + "learning_rate": 1.3543803420658397e-05, + "loss": 0.2196, + "step": 15320 + }, + { + "epoch": 0.7275791672601244, + "grad_norm": 0.4609375, + "learning_rate": 1.3529952746097717e-05, + "loss": 0.2247, + "step": 15325 + }, + { + "epoch": 0.7278165503489531, + "grad_norm": 0.412109375, + "learning_rate": 1.3516110681213634e-05, + "loss": 0.2237, + "step": 15330 + }, + { + "epoch": 0.7280539334377819, + "grad_norm": 0.47265625, + "learning_rate": 1.3502277234536948e-05, + "loss": 0.223, + "step": 15335 + }, + { + "epoch": 0.7282913165266106, + "grad_norm": 0.423828125, + "learning_rate": 1.3488452414593153e-05, + "loss": 0.2251, + "step": 15340 + }, + { + "epoch": 0.7285286996154394, + "grad_norm": 0.43359375, + "learning_rate": 1.3474636229902409e-05, + "loss": 0.2271, + "step": 15345 + }, + { + "epoch": 0.7287660827042681, + "grad_norm": 0.384765625, + "learning_rate": 1.3460828688979587e-05, + "loss": 0.2248, + "step": 15350 + }, + { + "epoch": 0.7290034657930969, + "grad_norm": 0.439453125, + "learning_rate": 1.3447029800334204e-05, + "loss": 0.2248, + "step": 15355 + }, + { + "epoch": 0.7292408488819256, + "grad_norm": 0.443359375, + "learning_rate": 1.3433239572470452e-05, + "loss": 0.2279, + "step": 15360 + }, + { + "epoch": 0.7294782319707545, + "grad_norm": 0.52734375, + "learning_rate": 1.3419458013887179e-05, + "loss": 0.2237, + "step": 15365 + }, + { + "epoch": 0.7297156150595832, + "grad_norm": 0.43359375, + "learning_rate": 1.340568513307791e-05, + "loss": 0.2268, + "step": 15370 + }, + { + "epoch": 0.7299529981484119, + "grad_norm": 0.45703125, + "learning_rate": 1.3391920938530799e-05, + "loss": 0.2227, + "step": 15375 + }, + { + "epoch": 0.7301903812372407, + "grad_norm": 0.45703125, + "learning_rate": 1.3378165438728665e-05, + "loss": 0.2277, + "step": 15380 + }, + { + "epoch": 0.7304277643260694, + "grad_norm": 0.416015625, + "learning_rate": 1.3364418642148952e-05, + "loss": 0.2243, + "step": 15385 + }, + { + "epoch": 0.7306651474148982, + "grad_norm": 0.43359375, + "learning_rate": 1.3350680557263744e-05, + "loss": 0.2234, + "step": 15390 + }, + { + "epoch": 0.7309025305037269, + "grad_norm": 0.458984375, + "learning_rate": 1.3336951192539771e-05, + "loss": 0.2222, + "step": 15395 + }, + { + "epoch": 0.7311399135925557, + "grad_norm": 0.412109375, + "learning_rate": 1.332323055643838e-05, + "loss": 0.2242, + "step": 15400 + }, + { + "epoch": 0.7313772966813844, + "grad_norm": 0.412109375, + "learning_rate": 1.3309518657415526e-05, + "loss": 0.2274, + "step": 15405 + }, + { + "epoch": 0.7316146797702132, + "grad_norm": 0.4609375, + "learning_rate": 1.3295815503921788e-05, + "loss": 0.2256, + "step": 15410 + }, + { + "epoch": 0.7318520628590419, + "grad_norm": 0.453125, + "learning_rate": 1.3282121104402373e-05, + "loss": 0.2227, + "step": 15415 + }, + { + "epoch": 0.7320894459478706, + "grad_norm": 0.404296875, + "learning_rate": 1.326843546729707e-05, + "loss": 0.2239, + "step": 15420 + }, + { + "epoch": 0.7323268290366994, + "grad_norm": 0.462890625, + "learning_rate": 1.325475860104027e-05, + "loss": 0.2219, + "step": 15425 + }, + { + "epoch": 0.7325642121255281, + "grad_norm": 0.419921875, + "learning_rate": 1.3241090514060967e-05, + "loss": 0.221, + "step": 15430 + }, + { + "epoch": 0.732801595214357, + "grad_norm": 0.455078125, + "learning_rate": 1.3227431214782732e-05, + "loss": 0.2235, + "step": 15435 + }, + { + "epoch": 0.7330389783031857, + "grad_norm": 0.423828125, + "learning_rate": 1.3213780711623752e-05, + "loss": 0.2197, + "step": 15440 + }, + { + "epoch": 0.7332763613920145, + "grad_norm": 0.41796875, + "learning_rate": 1.3200139012996742e-05, + "loss": 0.2251, + "step": 15445 + }, + { + "epoch": 0.7335137444808432, + "grad_norm": 0.435546875, + "learning_rate": 1.318650612730904e-05, + "loss": 0.2221, + "step": 15450 + }, + { + "epoch": 0.733751127569672, + "grad_norm": 0.44921875, + "learning_rate": 1.3172882062962517e-05, + "loss": 0.2201, + "step": 15455 + }, + { + "epoch": 0.7339885106585007, + "grad_norm": 0.408203125, + "learning_rate": 1.3159266828353641e-05, + "loss": 0.2261, + "step": 15460 + }, + { + "epoch": 0.7342258937473295, + "grad_norm": 0.451171875, + "learning_rate": 1.3145660431873396e-05, + "loss": 0.2224, + "step": 15465 + }, + { + "epoch": 0.7344632768361582, + "grad_norm": 0.443359375, + "learning_rate": 1.3132062881907359e-05, + "loss": 0.2199, + "step": 15470 + }, + { + "epoch": 0.7347006599249869, + "grad_norm": 0.4140625, + "learning_rate": 1.3118474186835628e-05, + "loss": 0.2256, + "step": 15475 + }, + { + "epoch": 0.7349380430138157, + "grad_norm": 0.4296875, + "learning_rate": 1.3104894355032863e-05, + "loss": 0.2206, + "step": 15480 + }, + { + "epoch": 0.7351754261026444, + "grad_norm": 0.44140625, + "learning_rate": 1.3091323394868246e-05, + "loss": 0.2199, + "step": 15485 + }, + { + "epoch": 0.7354128091914732, + "grad_norm": 0.416015625, + "learning_rate": 1.3077761314705495e-05, + "loss": 0.2227, + "step": 15490 + }, + { + "epoch": 0.7356501922803019, + "grad_norm": 0.421875, + "learning_rate": 1.3064208122902866e-05, + "loss": 0.2212, + "step": 15495 + }, + { + "epoch": 0.7358875753691307, + "grad_norm": 0.46875, + "learning_rate": 1.3050663827813132e-05, + "loss": 0.2213, + "step": 15500 + }, + { + "epoch": 0.7361249584579594, + "grad_norm": 0.427734375, + "learning_rate": 1.3037128437783569e-05, + "loss": 0.2205, + "step": 15505 + }, + { + "epoch": 0.7363623415467883, + "grad_norm": 0.416015625, + "learning_rate": 1.302360196115598e-05, + "loss": 0.2212, + "step": 15510 + }, + { + "epoch": 0.736599724635617, + "grad_norm": 0.50390625, + "learning_rate": 1.3010084406266675e-05, + "loss": 0.2253, + "step": 15515 + }, + { + "epoch": 0.7368371077244457, + "grad_norm": 0.421875, + "learning_rate": 1.2996575781446468e-05, + "loss": 0.2236, + "step": 15520 + }, + { + "epoch": 0.7370744908132745, + "grad_norm": 0.392578125, + "learning_rate": 1.2983076095020653e-05, + "loss": 0.2209, + "step": 15525 + }, + { + "epoch": 0.7373118739021032, + "grad_norm": 0.443359375, + "learning_rate": 1.2969585355309021e-05, + "loss": 0.2227, + "step": 15530 + }, + { + "epoch": 0.737549256990932, + "grad_norm": 0.515625, + "learning_rate": 1.295610357062586e-05, + "loss": 0.2257, + "step": 15535 + }, + { + "epoch": 0.7377866400797607, + "grad_norm": 0.46875, + "learning_rate": 1.2942630749279938e-05, + "loss": 0.2212, + "step": 15540 + }, + { + "epoch": 0.7380240231685895, + "grad_norm": 0.4375, + "learning_rate": 1.2929166899574488e-05, + "loss": 0.2249, + "step": 15545 + }, + { + "epoch": 0.7382614062574182, + "grad_norm": 0.474609375, + "learning_rate": 1.291571202980722e-05, + "loss": 0.227, + "step": 15550 + }, + { + "epoch": 0.738498789346247, + "grad_norm": 0.404296875, + "learning_rate": 1.29022661482703e-05, + "loss": 0.2253, + "step": 15555 + }, + { + "epoch": 0.7387361724350757, + "grad_norm": 0.423828125, + "learning_rate": 1.2888829263250381e-05, + "loss": 0.2239, + "step": 15560 + }, + { + "epoch": 0.7389735555239044, + "grad_norm": 0.453125, + "learning_rate": 1.2875401383028546e-05, + "loss": 0.2219, + "step": 15565 + }, + { + "epoch": 0.7392109386127332, + "grad_norm": 0.455078125, + "learning_rate": 1.2861982515880333e-05, + "loss": 0.2234, + "step": 15570 + }, + { + "epoch": 0.7394483217015619, + "grad_norm": 0.423828125, + "learning_rate": 1.2848572670075734e-05, + "loss": 0.2218, + "step": 15575 + }, + { + "epoch": 0.7396857047903908, + "grad_norm": 0.53515625, + "learning_rate": 1.2835171853879167e-05, + "loss": 0.2219, + "step": 15580 + }, + { + "epoch": 0.7399230878792195, + "grad_norm": 0.43359375, + "learning_rate": 1.2821780075549516e-05, + "loss": 0.2264, + "step": 15585 + }, + { + "epoch": 0.7401604709680483, + "grad_norm": 0.4140625, + "learning_rate": 1.2808397343340039e-05, + "loss": 0.2246, + "step": 15590 + }, + { + "epoch": 0.740397854056877, + "grad_norm": 0.435546875, + "learning_rate": 1.2795023665498482e-05, + "loss": 0.224, + "step": 15595 + }, + { + "epoch": 0.7406352371457058, + "grad_norm": 0.404296875, + "learning_rate": 1.2781659050266965e-05, + "loss": 0.2223, + "step": 15600 + }, + { + "epoch": 0.7408726202345345, + "grad_norm": 0.46875, + "learning_rate": 1.2768303505882062e-05, + "loss": 0.2162, + "step": 15605 + }, + { + "epoch": 0.7411100033233633, + "grad_norm": 0.41796875, + "learning_rate": 1.27549570405747e-05, + "loss": 0.2233, + "step": 15610 + }, + { + "epoch": 0.741347386412192, + "grad_norm": 0.408203125, + "learning_rate": 1.2741619662570273e-05, + "loss": 0.2261, + "step": 15615 + }, + { + "epoch": 0.7415847695010207, + "grad_norm": 0.498046875, + "learning_rate": 1.2728291380088536e-05, + "loss": 0.2217, + "step": 15620 + }, + { + "epoch": 0.7418221525898495, + "grad_norm": 0.439453125, + "learning_rate": 1.2714972201343653e-05, + "loss": 0.2281, + "step": 15625 + }, + { + "epoch": 0.7420595356786782, + "grad_norm": 0.48046875, + "learning_rate": 1.2701662134544167e-05, + "loss": 0.2242, + "step": 15630 + }, + { + "epoch": 0.742296918767507, + "grad_norm": 0.443359375, + "learning_rate": 1.2688361187893006e-05, + "loss": 0.2233, + "step": 15635 + }, + { + "epoch": 0.7425343018563357, + "grad_norm": 0.408203125, + "learning_rate": 1.2675069369587506e-05, + "loss": 0.2232, + "step": 15640 + }, + { + "epoch": 0.7427716849451645, + "grad_norm": 0.435546875, + "learning_rate": 1.2661786687819332e-05, + "loss": 0.2242, + "step": 15645 + }, + { + "epoch": 0.7430090680339932, + "grad_norm": 0.455078125, + "learning_rate": 1.264851315077455e-05, + "loss": 0.2215, + "step": 15650 + }, + { + "epoch": 0.7432464511228221, + "grad_norm": 0.42578125, + "learning_rate": 1.2635248766633573e-05, + "loss": 0.22, + "step": 15655 + }, + { + "epoch": 0.7434838342116508, + "grad_norm": 0.439453125, + "learning_rate": 1.2621993543571193e-05, + "loss": 0.2235, + "step": 15660 + }, + { + "epoch": 0.7437212173004795, + "grad_norm": 0.431640625, + "learning_rate": 1.2608747489756536e-05, + "loss": 0.2234, + "step": 15665 + }, + { + "epoch": 0.7439586003893083, + "grad_norm": 0.4296875, + "learning_rate": 1.259551061335309e-05, + "loss": 0.2246, + "step": 15670 + }, + { + "epoch": 0.744195983478137, + "grad_norm": 0.435546875, + "learning_rate": 1.2582282922518674e-05, + "loss": 0.2228, + "step": 15675 + }, + { + "epoch": 0.7444333665669658, + "grad_norm": 0.40234375, + "learning_rate": 1.256906442540545e-05, + "loss": 0.221, + "step": 15680 + }, + { + "epoch": 0.7446707496557945, + "grad_norm": 0.419921875, + "learning_rate": 1.2555855130159933e-05, + "loss": 0.2249, + "step": 15685 + }, + { + "epoch": 0.7449081327446233, + "grad_norm": 0.48828125, + "learning_rate": 1.2542655044922944e-05, + "loss": 0.2247, + "step": 15690 + }, + { + "epoch": 0.745145515833452, + "grad_norm": 0.42578125, + "learning_rate": 1.2529464177829633e-05, + "loss": 0.2206, + "step": 15695 + }, + { + "epoch": 0.7453828989222808, + "grad_norm": 0.458984375, + "learning_rate": 1.2516282537009466e-05, + "loss": 0.2249, + "step": 15700 + }, + { + "epoch": 0.7456202820111095, + "grad_norm": 0.478515625, + "learning_rate": 1.2503110130586254e-05, + "loss": 0.2268, + "step": 15705 + }, + { + "epoch": 0.7458576650999382, + "grad_norm": 0.404296875, + "learning_rate": 1.2489946966678055e-05, + "loss": 0.2233, + "step": 15710 + }, + { + "epoch": 0.746095048188767, + "grad_norm": 0.431640625, + "learning_rate": 1.247679305339729e-05, + "loss": 0.2218, + "step": 15715 + }, + { + "epoch": 0.7463324312775957, + "grad_norm": 0.44921875, + "learning_rate": 1.2463648398850656e-05, + "loss": 0.2243, + "step": 15720 + }, + { + "epoch": 0.7465698143664246, + "grad_norm": 0.416015625, + "learning_rate": 1.2450513011139137e-05, + "loss": 0.2209, + "step": 15725 + }, + { + "epoch": 0.7468071974552533, + "grad_norm": 0.427734375, + "learning_rate": 1.2437386898358013e-05, + "loss": 0.2239, + "step": 15730 + }, + { + "epoch": 0.7470445805440821, + "grad_norm": 0.451171875, + "learning_rate": 1.2424270068596843e-05, + "loss": 0.2228, + "step": 15735 + }, + { + "epoch": 0.7472819636329108, + "grad_norm": 0.51171875, + "learning_rate": 1.2411162529939482e-05, + "loss": 0.2274, + "step": 15740 + }, + { + "epoch": 0.7475193467217396, + "grad_norm": 0.4296875, + "learning_rate": 1.2398064290464041e-05, + "loss": 0.2195, + "step": 15745 + }, + { + "epoch": 0.7477567298105683, + "grad_norm": 0.4296875, + "learning_rate": 1.2384975358242906e-05, + "loss": 0.2252, + "step": 15750 + }, + { + "epoch": 0.7479941128993971, + "grad_norm": 0.390625, + "learning_rate": 1.2371895741342714e-05, + "loss": 0.2203, + "step": 15755 + }, + { + "epoch": 0.7482314959882258, + "grad_norm": 0.453125, + "learning_rate": 1.2358825447824394e-05, + "loss": 0.2262, + "step": 15760 + }, + { + "epoch": 0.7484688790770545, + "grad_norm": 0.431640625, + "learning_rate": 1.2345764485743098e-05, + "loss": 0.224, + "step": 15765 + }, + { + "epoch": 0.7487062621658833, + "grad_norm": 0.451171875, + "learning_rate": 1.2332712863148236e-05, + "loss": 0.2206, + "step": 15770 + }, + { + "epoch": 0.748943645254712, + "grad_norm": 0.416015625, + "learning_rate": 1.231967058808347e-05, + "loss": 0.2224, + "step": 15775 + }, + { + "epoch": 0.7491810283435408, + "grad_norm": 0.42578125, + "learning_rate": 1.2306637668586681e-05, + "loss": 0.2194, + "step": 15780 + }, + { + "epoch": 0.7494184114323695, + "grad_norm": 0.494140625, + "learning_rate": 1.2293614112690014e-05, + "loss": 0.227, + "step": 15785 + }, + { + "epoch": 0.7496557945211983, + "grad_norm": 0.447265625, + "learning_rate": 1.2280599928419825e-05, + "loss": 0.2256, + "step": 15790 + }, + { + "epoch": 0.749893177610027, + "grad_norm": 0.44140625, + "learning_rate": 1.2267595123796692e-05, + "loss": 0.2207, + "step": 15795 + }, + { + "epoch": 0.7501305606988559, + "grad_norm": 0.478515625, + "learning_rate": 1.2254599706835408e-05, + "loss": 0.2223, + "step": 15800 + }, + { + "epoch": 0.7503679437876846, + "grad_norm": 0.40625, + "learning_rate": 1.2241613685545007e-05, + "loss": 0.2254, + "step": 15805 + }, + { + "epoch": 0.7506053268765133, + "grad_norm": 0.5234375, + "learning_rate": 1.2228637067928706e-05, + "loss": 0.2248, + "step": 15810 + }, + { + "epoch": 0.7508427099653421, + "grad_norm": 0.5234375, + "learning_rate": 1.2215669861983937e-05, + "loss": 0.224, + "step": 15815 + }, + { + "epoch": 0.7510800930541708, + "grad_norm": 0.482421875, + "learning_rate": 1.2202712075702332e-05, + "loss": 0.2271, + "step": 15820 + }, + { + "epoch": 0.7513174761429996, + "grad_norm": 0.435546875, + "learning_rate": 1.21897637170697e-05, + "loss": 0.2235, + "step": 15825 + }, + { + "epoch": 0.7515548592318283, + "grad_norm": 0.388671875, + "learning_rate": 1.2176824794066089e-05, + "loss": 0.2241, + "step": 15830 + }, + { + "epoch": 0.7517922423206571, + "grad_norm": 0.462890625, + "learning_rate": 1.216389531466566e-05, + "loss": 0.2199, + "step": 15835 + }, + { + "epoch": 0.7520296254094858, + "grad_norm": 0.443359375, + "learning_rate": 1.2150975286836817e-05, + "loss": 0.2222, + "step": 15840 + }, + { + "epoch": 0.7522670084983146, + "grad_norm": 0.470703125, + "learning_rate": 1.2138064718542104e-05, + "loss": 0.2235, + "step": 15845 + }, + { + "epoch": 0.7525043915871433, + "grad_norm": 0.431640625, + "learning_rate": 1.2125163617738261e-05, + "loss": 0.2254, + "step": 15850 + }, + { + "epoch": 0.752741774675972, + "grad_norm": 0.40234375, + "learning_rate": 1.2112271992376152e-05, + "loss": 0.222, + "step": 15855 + }, + { + "epoch": 0.7529791577648008, + "grad_norm": 0.42578125, + "learning_rate": 1.2099389850400853e-05, + "loss": 0.2215, + "step": 15860 + }, + { + "epoch": 0.7532165408536295, + "grad_norm": 0.400390625, + "learning_rate": 1.2086517199751552e-05, + "loss": 0.2231, + "step": 15865 + }, + { + "epoch": 0.7534539239424584, + "grad_norm": 0.419921875, + "learning_rate": 1.2073654048361612e-05, + "loss": 0.2273, + "step": 15870 + }, + { + "epoch": 0.7536913070312871, + "grad_norm": 0.419921875, + "learning_rate": 1.2060800404158535e-05, + "loss": 0.2238, + "step": 15875 + }, + { + "epoch": 0.7539286901201159, + "grad_norm": 0.462890625, + "learning_rate": 1.2047956275063951e-05, + "loss": 0.223, + "step": 15880 + }, + { + "epoch": 0.7541660732089446, + "grad_norm": 0.4140625, + "learning_rate": 1.203512166899366e-05, + "loss": 0.2223, + "step": 15885 + }, + { + "epoch": 0.7544034562977734, + "grad_norm": 0.462890625, + "learning_rate": 1.2022296593857561e-05, + "loss": 0.2173, + "step": 15890 + }, + { + "epoch": 0.7546408393866021, + "grad_norm": 0.423828125, + "learning_rate": 1.2009481057559687e-05, + "loss": 0.2262, + "step": 15895 + }, + { + "epoch": 0.7548782224754309, + "grad_norm": 0.484375, + "learning_rate": 1.1996675067998192e-05, + "loss": 0.2275, + "step": 15900 + }, + { + "epoch": 0.7551156055642596, + "grad_norm": 0.439453125, + "learning_rate": 1.198387863306536e-05, + "loss": 0.2252, + "step": 15905 + }, + { + "epoch": 0.7553529886530883, + "grad_norm": 0.462890625, + "learning_rate": 1.1971091760647569e-05, + "loss": 0.2239, + "step": 15910 + }, + { + "epoch": 0.7555903717419171, + "grad_norm": 0.421875, + "learning_rate": 1.1958314458625312e-05, + "loss": 0.2275, + "step": 15915 + }, + { + "epoch": 0.7558277548307458, + "grad_norm": 0.423828125, + "learning_rate": 1.1945546734873181e-05, + "loss": 0.2207, + "step": 15920 + }, + { + "epoch": 0.7560651379195746, + "grad_norm": 0.41015625, + "learning_rate": 1.1932788597259859e-05, + "loss": 0.2195, + "step": 15925 + }, + { + "epoch": 0.7563025210084033, + "grad_norm": 0.4375, + "learning_rate": 1.1920040053648142e-05, + "loss": 0.2218, + "step": 15930 + }, + { + "epoch": 0.7565399040972322, + "grad_norm": 0.435546875, + "learning_rate": 1.1907301111894895e-05, + "loss": 0.2223, + "step": 15935 + }, + { + "epoch": 0.7567772871860609, + "grad_norm": 0.376953125, + "learning_rate": 1.1894571779851065e-05, + "loss": 0.2218, + "step": 15940 + }, + { + "epoch": 0.7570146702748897, + "grad_norm": 0.41015625, + "learning_rate": 1.188185206536168e-05, + "loss": 0.2208, + "step": 15945 + }, + { + "epoch": 0.7572520533637184, + "grad_norm": 0.474609375, + "learning_rate": 1.1869141976265848e-05, + "loss": 0.2224, + "step": 15950 + }, + { + "epoch": 0.7574894364525471, + "grad_norm": 0.40625, + "learning_rate": 1.1856441520396742e-05, + "loss": 0.2203, + "step": 15955 + }, + { + "epoch": 0.7577268195413759, + "grad_norm": 0.435546875, + "learning_rate": 1.1843750705581589e-05, + "loss": 0.2258, + "step": 15960 + }, + { + "epoch": 0.7579642026302046, + "grad_norm": 0.431640625, + "learning_rate": 1.1831069539641685e-05, + "loss": 0.2228, + "step": 15965 + }, + { + "epoch": 0.7582015857190334, + "grad_norm": 0.376953125, + "learning_rate": 1.1818398030392364e-05, + "loss": 0.2198, + "step": 15970 + }, + { + "epoch": 0.7584389688078621, + "grad_norm": 0.462890625, + "learning_rate": 1.1805736185643038e-05, + "loss": 0.2257, + "step": 15975 + }, + { + "epoch": 0.7586763518966909, + "grad_norm": 0.419921875, + "learning_rate": 1.1793084013197126e-05, + "loss": 0.2224, + "step": 15980 + }, + { + "epoch": 0.7589137349855196, + "grad_norm": 0.50390625, + "learning_rate": 1.1780441520852117e-05, + "loss": 0.2224, + "step": 15985 + }, + { + "epoch": 0.7591511180743484, + "grad_norm": 0.4765625, + "learning_rate": 1.1767808716399506e-05, + "loss": 0.2236, + "step": 15990 + }, + { + "epoch": 0.7593885011631771, + "grad_norm": 0.4453125, + "learning_rate": 1.1755185607624863e-05, + "loss": 0.2205, + "step": 15995 + }, + { + "epoch": 0.7596258842520058, + "grad_norm": 0.478515625, + "learning_rate": 1.1742572202307716e-05, + "loss": 0.2219, + "step": 16000 + }, + { + "epoch": 0.7598632673408346, + "grad_norm": 0.390625, + "learning_rate": 1.1729968508221674e-05, + "loss": 0.2232, + "step": 16005 + }, + { + "epoch": 0.7601006504296633, + "grad_norm": 0.44140625, + "learning_rate": 1.171737453313433e-05, + "loss": 0.2192, + "step": 16010 + }, + { + "epoch": 0.7603380335184922, + "grad_norm": 0.490234375, + "learning_rate": 1.1704790284807293e-05, + "loss": 0.2222, + "step": 16015 + }, + { + "epoch": 0.7605754166073209, + "grad_norm": 0.41015625, + "learning_rate": 1.1692215770996178e-05, + "loss": 0.2272, + "step": 16020 + }, + { + "epoch": 0.7608127996961497, + "grad_norm": 0.44921875, + "learning_rate": 1.1679650999450591e-05, + "loss": 0.2219, + "step": 16025 + }, + { + "epoch": 0.7610501827849784, + "grad_norm": 0.40625, + "learning_rate": 1.1667095977914163e-05, + "loss": 0.2237, + "step": 16030 + }, + { + "epoch": 0.7612875658738072, + "grad_norm": 0.4140625, + "learning_rate": 1.1654550714124484e-05, + "loss": 0.2185, + "step": 16035 + }, + { + "epoch": 0.7615249489626359, + "grad_norm": 0.38671875, + "learning_rate": 1.1642015215813145e-05, + "loss": 0.2216, + "step": 16040 + }, + { + "epoch": 0.7617623320514647, + "grad_norm": 0.3984375, + "learning_rate": 1.1629489490705713e-05, + "loss": 0.2251, + "step": 16045 + }, + { + "epoch": 0.7619997151402934, + "grad_norm": 0.455078125, + "learning_rate": 1.1616973546521743e-05, + "loss": 0.2232, + "step": 16050 + }, + { + "epoch": 0.7622370982291221, + "grad_norm": 0.435546875, + "learning_rate": 1.1604467390974754e-05, + "loss": 0.2244, + "step": 16055 + }, + { + "epoch": 0.7624744813179509, + "grad_norm": 0.4375, + "learning_rate": 1.1591971031772227e-05, + "loss": 0.2269, + "step": 16060 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 0.404296875, + "learning_rate": 1.1579484476615617e-05, + "loss": 0.2215, + "step": 16065 + }, + { + "epoch": 0.7629492474956084, + "grad_norm": 0.470703125, + "learning_rate": 1.156700773320032e-05, + "loss": 0.2231, + "step": 16070 + }, + { + "epoch": 0.7631866305844371, + "grad_norm": 0.44140625, + "learning_rate": 1.1554540809215712e-05, + "loss": 0.2237, + "step": 16075 + }, + { + "epoch": 0.763424013673266, + "grad_norm": 0.388671875, + "learning_rate": 1.1542083712345094e-05, + "loss": 0.2259, + "step": 16080 + }, + { + "epoch": 0.7636613967620947, + "grad_norm": 0.373046875, + "learning_rate": 1.152963645026572e-05, + "loss": 0.2241, + "step": 16085 + }, + { + "epoch": 0.7638987798509235, + "grad_norm": 0.408203125, + "learning_rate": 1.1517199030648772e-05, + "loss": 0.2217, + "step": 16090 + }, + { + "epoch": 0.7641361629397522, + "grad_norm": 0.455078125, + "learning_rate": 1.1504771461159388e-05, + "loss": 0.2214, + "step": 16095 + }, + { + "epoch": 0.7643735460285809, + "grad_norm": 0.466796875, + "learning_rate": 1.1492353749456617e-05, + "loss": 0.2252, + "step": 16100 + }, + { + "epoch": 0.7646109291174097, + "grad_norm": 0.412109375, + "learning_rate": 1.147994590319344e-05, + "loss": 0.2183, + "step": 16105 + }, + { + "epoch": 0.7648483122062384, + "grad_norm": 0.453125, + "learning_rate": 1.1467547930016753e-05, + "loss": 0.2248, + "step": 16110 + }, + { + "epoch": 0.7650856952950672, + "grad_norm": 0.5, + "learning_rate": 1.1455159837567372e-05, + "loss": 0.2248, + "step": 16115 + }, + { + "epoch": 0.7653230783838959, + "grad_norm": 0.44921875, + "learning_rate": 1.1442781633480023e-05, + "loss": 0.2222, + "step": 16120 + }, + { + "epoch": 0.7655604614727247, + "grad_norm": 0.45703125, + "learning_rate": 1.1430413325383328e-05, + "loss": 0.2268, + "step": 16125 + }, + { + "epoch": 0.7657978445615534, + "grad_norm": 0.44921875, + "learning_rate": 1.1418054920899835e-05, + "loss": 0.2222, + "step": 16130 + }, + { + "epoch": 0.7660352276503822, + "grad_norm": 0.384765625, + "learning_rate": 1.1405706427645966e-05, + "loss": 0.2207, + "step": 16135 + }, + { + "epoch": 0.7662726107392109, + "grad_norm": 0.451171875, + "learning_rate": 1.1393367853232037e-05, + "loss": 0.2253, + "step": 16140 + }, + { + "epoch": 0.7665099938280396, + "grad_norm": 0.384765625, + "learning_rate": 1.1381039205262251e-05, + "loss": 0.2246, + "step": 16145 + }, + { + "epoch": 0.7667473769168685, + "grad_norm": 0.458984375, + "learning_rate": 1.1368720491334714e-05, + "loss": 0.2198, + "step": 16150 + }, + { + "epoch": 0.7669847600056972, + "grad_norm": 0.400390625, + "learning_rate": 1.1356411719041384e-05, + "loss": 0.2227, + "step": 16155 + }, + { + "epoch": 0.767222143094526, + "grad_norm": 0.51171875, + "learning_rate": 1.1344112895968099e-05, + "loss": 0.2241, + "step": 16160 + }, + { + "epoch": 0.7674595261833547, + "grad_norm": 0.4296875, + "learning_rate": 1.1331824029694572e-05, + "loss": 0.2257, + "step": 16165 + }, + { + "epoch": 0.7676969092721835, + "grad_norm": 0.486328125, + "learning_rate": 1.1319545127794367e-05, + "loss": 0.2232, + "step": 16170 + }, + { + "epoch": 0.7679342923610122, + "grad_norm": 0.41796875, + "learning_rate": 1.1307276197834928e-05, + "loss": 0.2206, + "step": 16175 + }, + { + "epoch": 0.768171675449841, + "grad_norm": 0.4140625, + "learning_rate": 1.1295017247377535e-05, + "loss": 0.2246, + "step": 16180 + }, + { + "epoch": 0.7684090585386697, + "grad_norm": 0.412109375, + "learning_rate": 1.1282768283977322e-05, + "loss": 0.22, + "step": 16185 + }, + { + "epoch": 0.7686464416274985, + "grad_norm": 0.421875, + "learning_rate": 1.1270529315183266e-05, + "loss": 0.221, + "step": 16190 + }, + { + "epoch": 0.7688838247163272, + "grad_norm": 0.470703125, + "learning_rate": 1.12583003485382e-05, + "loss": 0.2238, + "step": 16195 + }, + { + "epoch": 0.7691212078051559, + "grad_norm": 0.451171875, + "learning_rate": 1.124608139157877e-05, + "loss": 0.2247, + "step": 16200 + }, + { + "epoch": 0.7693585908939847, + "grad_norm": 0.4609375, + "learning_rate": 1.1233872451835468e-05, + "loss": 0.2269, + "step": 16205 + }, + { + "epoch": 0.7695959739828134, + "grad_norm": 0.484375, + "learning_rate": 1.1221673536832605e-05, + "loss": 0.2242, + "step": 16210 + }, + { + "epoch": 0.7698333570716422, + "grad_norm": 0.42578125, + "learning_rate": 1.1209484654088315e-05, + "loss": 0.2251, + "step": 16215 + }, + { + "epoch": 0.770070740160471, + "grad_norm": 0.41796875, + "learning_rate": 1.1197305811114565e-05, + "loss": 0.2252, + "step": 16220 + }, + { + "epoch": 0.7703081232492998, + "grad_norm": 0.41015625, + "learning_rate": 1.11851370154171e-05, + "loss": 0.2215, + "step": 16225 + }, + { + "epoch": 0.7705455063381285, + "grad_norm": 0.474609375, + "learning_rate": 1.1172978274495507e-05, + "loss": 0.2233, + "step": 16230 + }, + { + "epoch": 0.7707828894269573, + "grad_norm": 0.419921875, + "learning_rate": 1.1160829595843153e-05, + "loss": 0.2192, + "step": 16235 + }, + { + "epoch": 0.771020272515786, + "grad_norm": 0.484375, + "learning_rate": 1.1148690986947235e-05, + "loss": 0.2229, + "step": 16240 + }, + { + "epoch": 0.7712576556046147, + "grad_norm": 0.484375, + "learning_rate": 1.1136562455288693e-05, + "loss": 0.2242, + "step": 16245 + }, + { + "epoch": 0.7714950386934435, + "grad_norm": 0.482421875, + "learning_rate": 1.1124444008342306e-05, + "loss": 0.2286, + "step": 16250 + }, + { + "epoch": 0.7717324217822722, + "grad_norm": 0.44921875, + "learning_rate": 1.1112335653576612e-05, + "loss": 0.226, + "step": 16255 + }, + { + "epoch": 0.771969804871101, + "grad_norm": 0.41796875, + "learning_rate": 1.1100237398453933e-05, + "loss": 0.2248, + "step": 16260 + }, + { + "epoch": 0.7722071879599297, + "grad_norm": 0.421875, + "learning_rate": 1.1088149250430374e-05, + "loss": 0.2199, + "step": 16265 + }, + { + "epoch": 0.7724445710487585, + "grad_norm": 0.42578125, + "learning_rate": 1.1076071216955793e-05, + "loss": 0.2216, + "step": 16270 + }, + { + "epoch": 0.7726819541375872, + "grad_norm": 0.421875, + "learning_rate": 1.1064003305473845e-05, + "loss": 0.2228, + "step": 16275 + }, + { + "epoch": 0.772919337226416, + "grad_norm": 0.52734375, + "learning_rate": 1.1051945523421917e-05, + "loss": 0.2271, + "step": 16280 + }, + { + "epoch": 0.7731567203152447, + "grad_norm": 0.46484375, + "learning_rate": 1.1039897878231167e-05, + "loss": 0.2242, + "step": 16285 + }, + { + "epoch": 0.7733941034040734, + "grad_norm": 0.494140625, + "learning_rate": 1.1027860377326504e-05, + "loss": 0.2226, + "step": 16290 + }, + { + "epoch": 0.7736314864929023, + "grad_norm": 0.431640625, + "learning_rate": 1.101583302812659e-05, + "loss": 0.2256, + "step": 16295 + }, + { + "epoch": 0.773868869581731, + "grad_norm": 0.474609375, + "learning_rate": 1.1003815838043824e-05, + "loss": 0.2216, + "step": 16300 + }, + { + "epoch": 0.7741062526705598, + "grad_norm": 0.51171875, + "learning_rate": 1.0991808814484341e-05, + "loss": 0.2258, + "step": 16305 + }, + { + "epoch": 0.7743436357593885, + "grad_norm": 0.4375, + "learning_rate": 1.0979811964848016e-05, + "loss": 0.223, + "step": 16310 + }, + { + "epoch": 0.7745810188482173, + "grad_norm": 0.470703125, + "learning_rate": 1.0967825296528453e-05, + "loss": 0.2191, + "step": 16315 + }, + { + "epoch": 0.774818401937046, + "grad_norm": 0.515625, + "learning_rate": 1.0955848816912987e-05, + "loss": 0.2185, + "step": 16320 + }, + { + "epoch": 0.7750557850258748, + "grad_norm": 0.40625, + "learning_rate": 1.0943882533382662e-05, + "loss": 0.2267, + "step": 16325 + }, + { + "epoch": 0.7752931681147035, + "grad_norm": 0.416015625, + "learning_rate": 1.0931926453312249e-05, + "loss": 0.2229, + "step": 16330 + }, + { + "epoch": 0.7755305512035323, + "grad_norm": 0.443359375, + "learning_rate": 1.0919980584070214e-05, + "loss": 0.2223, + "step": 16335 + }, + { + "epoch": 0.775767934292361, + "grad_norm": 0.451171875, + "learning_rate": 1.0908044933018757e-05, + "loss": 0.2219, + "step": 16340 + }, + { + "epoch": 0.7760053173811897, + "grad_norm": 0.50390625, + "learning_rate": 1.0896119507513757e-05, + "loss": 0.2252, + "step": 16345 + }, + { + "epoch": 0.7762427004700185, + "grad_norm": 0.42578125, + "learning_rate": 1.0884204314904802e-05, + "loss": 0.2243, + "step": 16350 + }, + { + "epoch": 0.7764800835588472, + "grad_norm": 0.4765625, + "learning_rate": 1.0872299362535173e-05, + "loss": 0.2229, + "step": 16355 + }, + { + "epoch": 0.776717466647676, + "grad_norm": 0.5078125, + "learning_rate": 1.086040465774183e-05, + "loss": 0.222, + "step": 16360 + }, + { + "epoch": 0.7769548497365047, + "grad_norm": 0.4375, + "learning_rate": 1.0848520207855439e-05, + "loss": 0.2203, + "step": 16365 + }, + { + "epoch": 0.7771922328253336, + "grad_norm": 0.451171875, + "learning_rate": 1.0836646020200314e-05, + "loss": 0.2203, + "step": 16370 + }, + { + "epoch": 0.7774296159141623, + "grad_norm": 0.408203125, + "learning_rate": 1.082478210209448e-05, + "loss": 0.224, + "step": 16375 + }, + { + "epoch": 0.7776669990029911, + "grad_norm": 0.404296875, + "learning_rate": 1.0812928460849603e-05, + "loss": 0.2233, + "step": 16380 + }, + { + "epoch": 0.7779043820918198, + "grad_norm": 0.4375, + "learning_rate": 1.0801085103771044e-05, + "loss": 0.2208, + "step": 16385 + }, + { + "epoch": 0.7781417651806485, + "grad_norm": 0.515625, + "learning_rate": 1.0789252038157791e-05, + "loss": 0.2198, + "step": 16390 + }, + { + "epoch": 0.7783791482694773, + "grad_norm": 0.455078125, + "learning_rate": 1.0777429271302522e-05, + "loss": 0.2263, + "step": 16395 + }, + { + "epoch": 0.778616531358306, + "grad_norm": 0.5703125, + "learning_rate": 1.0765616810491544e-05, + "loss": 0.2252, + "step": 16400 + }, + { + "epoch": 0.7788539144471348, + "grad_norm": 0.4296875, + "learning_rate": 1.0753814663004843e-05, + "loss": 0.2274, + "step": 16405 + }, + { + "epoch": 0.7790912975359635, + "grad_norm": 0.458984375, + "learning_rate": 1.0742022836116005e-05, + "loss": 0.2241, + "step": 16410 + }, + { + "epoch": 0.7793286806247923, + "grad_norm": 0.490234375, + "learning_rate": 1.0730241337092287e-05, + "loss": 0.2239, + "step": 16415 + }, + { + "epoch": 0.779566063713621, + "grad_norm": 0.40234375, + "learning_rate": 1.071847017319458e-05, + "loss": 0.2274, + "step": 16420 + }, + { + "epoch": 0.7798034468024498, + "grad_norm": 0.41796875, + "learning_rate": 1.0706709351677397e-05, + "loss": 0.222, + "step": 16425 + }, + { + "epoch": 0.7800408298912785, + "grad_norm": 0.50390625, + "learning_rate": 1.0694958879788874e-05, + "loss": 0.2186, + "step": 16430 + }, + { + "epoch": 0.7802782129801072, + "grad_norm": 0.439453125, + "learning_rate": 1.0683218764770766e-05, + "loss": 0.2277, + "step": 16435 + }, + { + "epoch": 0.7805155960689361, + "grad_norm": 0.40234375, + "learning_rate": 1.0671489013858473e-05, + "loss": 0.2243, + "step": 16440 + }, + { + "epoch": 0.7807529791577648, + "grad_norm": 0.39453125, + "learning_rate": 1.0659769634280976e-05, + "loss": 0.2194, + "step": 16445 + }, + { + "epoch": 0.7809903622465936, + "grad_norm": 0.5234375, + "learning_rate": 1.0648060633260872e-05, + "loss": 0.2186, + "step": 16450 + }, + { + "epoch": 0.7812277453354223, + "grad_norm": 0.40625, + "learning_rate": 1.0636362018014371e-05, + "loss": 0.223, + "step": 16455 + }, + { + "epoch": 0.7814651284242511, + "grad_norm": 0.443359375, + "learning_rate": 1.0624673795751268e-05, + "loss": 0.2229, + "step": 16460 + }, + { + "epoch": 0.7817025115130798, + "grad_norm": 0.388671875, + "learning_rate": 1.0612995973674972e-05, + "loss": 0.2233, + "step": 16465 + }, + { + "epoch": 0.7819398946019086, + "grad_norm": 0.458984375, + "learning_rate": 1.0601328558982468e-05, + "loss": 0.2239, + "step": 16470 + }, + { + "epoch": 0.7821772776907373, + "grad_norm": 0.474609375, + "learning_rate": 1.0589671558864328e-05, + "loss": 0.2251, + "step": 16475 + }, + { + "epoch": 0.7824146607795661, + "grad_norm": 0.431640625, + "learning_rate": 1.0578024980504706e-05, + "loss": 0.2209, + "step": 16480 + }, + { + "epoch": 0.7826520438683948, + "grad_norm": 0.412109375, + "learning_rate": 1.0566388831081341e-05, + "loss": 0.2217, + "step": 16485 + }, + { + "epoch": 0.7828894269572235, + "grad_norm": 0.416015625, + "learning_rate": 1.0554763117765543e-05, + "loss": 0.2247, + "step": 16490 + }, + { + "epoch": 0.7831268100460523, + "grad_norm": 0.46875, + "learning_rate": 1.0543147847722174e-05, + "loss": 0.2268, + "step": 16495 + }, + { + "epoch": 0.783364193134881, + "grad_norm": 0.498046875, + "learning_rate": 1.0531543028109681e-05, + "loss": 0.2227, + "step": 16500 + }, + { + "epoch": 0.7836015762237099, + "grad_norm": 0.4453125, + "learning_rate": 1.0519948666080052e-05, + "loss": 0.2253, + "step": 16505 + }, + { + "epoch": 0.7838389593125386, + "grad_norm": 0.40234375, + "learning_rate": 1.0508364768778859e-05, + "loss": 0.2206, + "step": 16510 + }, + { + "epoch": 0.7840763424013674, + "grad_norm": 0.40625, + "learning_rate": 1.049679134334518e-05, + "loss": 0.2266, + "step": 16515 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.365234375, + "learning_rate": 1.0485228396911685e-05, + "loss": 0.2216, + "step": 16520 + }, + { + "epoch": 0.7845511085790249, + "grad_norm": 0.453125, + "learning_rate": 1.0473675936604555e-05, + "loss": 0.2216, + "step": 16525 + }, + { + "epoch": 0.7847884916678536, + "grad_norm": 0.474609375, + "learning_rate": 1.0462133969543523e-05, + "loss": 0.223, + "step": 16530 + }, + { + "epoch": 0.7850258747566823, + "grad_norm": 0.45703125, + "learning_rate": 1.0450602502841841e-05, + "loss": 0.2199, + "step": 16535 + }, + { + "epoch": 0.7852632578455111, + "grad_norm": 0.48828125, + "learning_rate": 1.0439081543606314e-05, + "loss": 0.2269, + "step": 16540 + }, + { + "epoch": 0.7855006409343398, + "grad_norm": 0.419921875, + "learning_rate": 1.0427571098937252e-05, + "loss": 0.2193, + "step": 16545 + }, + { + "epoch": 0.7857380240231686, + "grad_norm": 0.4140625, + "learning_rate": 1.0416071175928486e-05, + "loss": 0.2222, + "step": 16550 + }, + { + "epoch": 0.7859754071119973, + "grad_norm": 0.4375, + "learning_rate": 1.0404581781667369e-05, + "loss": 0.2229, + "step": 16555 + }, + { + "epoch": 0.7862127902008261, + "grad_norm": 0.4609375, + "learning_rate": 1.0393102923234755e-05, + "loss": 0.2238, + "step": 16560 + }, + { + "epoch": 0.7864501732896548, + "grad_norm": 0.4140625, + "learning_rate": 1.0381634607705023e-05, + "loss": 0.2236, + "step": 16565 + }, + { + "epoch": 0.7866875563784836, + "grad_norm": 0.47265625, + "learning_rate": 1.0370176842146042e-05, + "loss": 0.2196, + "step": 16570 + }, + { + "epoch": 0.7869249394673123, + "grad_norm": 0.419921875, + "learning_rate": 1.0358729633619182e-05, + "loss": 0.2201, + "step": 16575 + }, + { + "epoch": 0.787162322556141, + "grad_norm": 0.474609375, + "learning_rate": 1.034729298917929e-05, + "loss": 0.2246, + "step": 16580 + }, + { + "epoch": 0.7873997056449699, + "grad_norm": 0.494140625, + "learning_rate": 1.033586691587474e-05, + "loss": 0.2224, + "step": 16585 + }, + { + "epoch": 0.7876370887337986, + "grad_norm": 0.380859375, + "learning_rate": 1.0324451420747355e-05, + "loss": 0.223, + "step": 16590 + }, + { + "epoch": 0.7878744718226274, + "grad_norm": 0.4375, + "learning_rate": 1.031304651083246e-05, + "loss": 0.2237, + "step": 16595 + }, + { + "epoch": 0.7881118549114561, + "grad_norm": 0.42578125, + "learning_rate": 1.0301652193158845e-05, + "loss": 0.2242, + "step": 16600 + }, + { + "epoch": 0.7883492380002849, + "grad_norm": 0.380859375, + "learning_rate": 1.029026847474877e-05, + "loss": 0.2235, + "step": 16605 + }, + { + "epoch": 0.7885866210891136, + "grad_norm": 0.419921875, + "learning_rate": 1.0278895362617988e-05, + "loss": 0.2222, + "step": 16610 + }, + { + "epoch": 0.7888240041779424, + "grad_norm": 0.45703125, + "learning_rate": 1.0267532863775672e-05, + "loss": 0.2271, + "step": 16615 + }, + { + "epoch": 0.7890613872667711, + "grad_norm": 0.439453125, + "learning_rate": 1.0256180985224498e-05, + "loss": 0.2222, + "step": 16620 + }, + { + "epoch": 0.7892987703555999, + "grad_norm": 0.423828125, + "learning_rate": 1.0244839733960563e-05, + "loss": 0.2247, + "step": 16625 + }, + { + "epoch": 0.7895361534444286, + "grad_norm": 0.4453125, + "learning_rate": 1.0233509116973447e-05, + "loss": 0.2238, + "step": 16630 + }, + { + "epoch": 0.7897735365332573, + "grad_norm": 0.462890625, + "learning_rate": 1.0222189141246132e-05, + "loss": 0.2215, + "step": 16635 + }, + { + "epoch": 0.7900109196220861, + "grad_norm": 0.443359375, + "learning_rate": 1.021087981375509e-05, + "loss": 0.2238, + "step": 16640 + }, + { + "epoch": 0.7902483027109148, + "grad_norm": 0.46484375, + "learning_rate": 1.0199581141470194e-05, + "loss": 0.2222, + "step": 16645 + }, + { + "epoch": 0.7904856857997437, + "grad_norm": 0.4140625, + "learning_rate": 1.0188293131354782e-05, + "loss": 0.2255, + "step": 16650 + }, + { + "epoch": 0.7907230688885724, + "grad_norm": 0.43359375, + "learning_rate": 1.0177015790365583e-05, + "loss": 0.222, + "step": 16655 + }, + { + "epoch": 0.7909604519774012, + "grad_norm": 0.419921875, + "learning_rate": 1.0165749125452775e-05, + "loss": 0.2259, + "step": 16660 + }, + { + "epoch": 0.7911978350662299, + "grad_norm": 0.443359375, + "learning_rate": 1.0154493143559962e-05, + "loss": 0.2254, + "step": 16665 + }, + { + "epoch": 0.7914352181550587, + "grad_norm": 0.439453125, + "learning_rate": 1.0143247851624149e-05, + "loss": 0.2221, + "step": 16670 + }, + { + "epoch": 0.7916726012438874, + "grad_norm": 0.48046875, + "learning_rate": 1.0132013256575761e-05, + "loss": 0.2234, + "step": 16675 + }, + { + "epoch": 0.7919099843327161, + "grad_norm": 0.470703125, + "learning_rate": 1.0120789365338617e-05, + "loss": 0.2249, + "step": 16680 + }, + { + "epoch": 0.7921473674215449, + "grad_norm": 0.451171875, + "learning_rate": 1.0109576184829963e-05, + "loss": 0.22, + "step": 16685 + }, + { + "epoch": 0.7923847505103736, + "grad_norm": 0.408203125, + "learning_rate": 1.0098373721960429e-05, + "loss": 0.2234, + "step": 16690 + }, + { + "epoch": 0.7926221335992024, + "grad_norm": 0.470703125, + "learning_rate": 1.0087181983634039e-05, + "loss": 0.2227, + "step": 16695 + }, + { + "epoch": 0.7928595166880311, + "grad_norm": 0.46875, + "learning_rate": 1.007600097674821e-05, + "loss": 0.2213, + "step": 16700 + }, + { + "epoch": 0.7930968997768599, + "grad_norm": 0.435546875, + "learning_rate": 1.0064830708193738e-05, + "loss": 0.2221, + "step": 16705 + }, + { + "epoch": 0.7933342828656886, + "grad_norm": 0.48046875, + "learning_rate": 1.0053671184854824e-05, + "loss": 0.2214, + "step": 16710 + }, + { + "epoch": 0.7935716659545174, + "grad_norm": 0.462890625, + "learning_rate": 1.0042522413609028e-05, + "loss": 0.2214, + "step": 16715 + }, + { + "epoch": 0.7938090490433461, + "grad_norm": 0.400390625, + "learning_rate": 1.0031384401327283e-05, + "loss": 0.2235, + "step": 16720 + }, + { + "epoch": 0.7940464321321749, + "grad_norm": 0.443359375, + "learning_rate": 1.0020257154873888e-05, + "loss": 0.2241, + "step": 16725 + }, + { + "epoch": 0.7942838152210037, + "grad_norm": 0.384765625, + "learning_rate": 1.0009140681106531e-05, + "loss": 0.2208, + "step": 16730 + }, + { + "epoch": 0.7945211983098324, + "grad_norm": 0.4765625, + "learning_rate": 9.998034986876236e-06, + "loss": 0.224, + "step": 16735 + }, + { + "epoch": 0.7947585813986612, + "grad_norm": 0.43359375, + "learning_rate": 9.986940079027388e-06, + "loss": 0.2213, + "step": 16740 + }, + { + "epoch": 0.7949959644874899, + "grad_norm": 0.39453125, + "learning_rate": 9.975855964397735e-06, + "loss": 0.2231, + "step": 16745 + }, + { + "epoch": 0.7952333475763187, + "grad_norm": 0.392578125, + "learning_rate": 9.964782649818355e-06, + "loss": 0.2219, + "step": 16750 + }, + { + "epoch": 0.7954707306651474, + "grad_norm": 0.447265625, + "learning_rate": 9.953720142113705e-06, + "loss": 0.2242, + "step": 16755 + }, + { + "epoch": 0.7957081137539762, + "grad_norm": 0.47265625, + "learning_rate": 9.942668448101528e-06, + "loss": 0.2245, + "step": 16760 + }, + { + "epoch": 0.7959454968428049, + "grad_norm": 0.419921875, + "learning_rate": 9.931627574592952e-06, + "loss": 0.2203, + "step": 16765 + }, + { + "epoch": 0.7961828799316337, + "grad_norm": 0.4453125, + "learning_rate": 9.920597528392412e-06, + "loss": 0.2227, + "step": 16770 + }, + { + "epoch": 0.7964202630204624, + "grad_norm": 0.388671875, + "learning_rate": 9.909578316297685e-06, + "loss": 0.2204, + "step": 16775 + }, + { + "epoch": 0.7966576461092911, + "grad_norm": 0.43359375, + "learning_rate": 9.898569945099839e-06, + "loss": 0.2258, + "step": 16780 + }, + { + "epoch": 0.7968950291981199, + "grad_norm": 0.53125, + "learning_rate": 9.887572421583306e-06, + "loss": 0.2227, + "step": 16785 + }, + { + "epoch": 0.7971324122869486, + "grad_norm": 0.412109375, + "learning_rate": 9.87658575252579e-06, + "loss": 0.2212, + "step": 16790 + }, + { + "epoch": 0.7973697953757775, + "grad_norm": 0.458984375, + "learning_rate": 9.86560994469835e-06, + "loss": 0.2177, + "step": 16795 + }, + { + "epoch": 0.7976071784646062, + "grad_norm": 0.462890625, + "learning_rate": 9.854645004865307e-06, + "loss": 0.2267, + "step": 16800 + }, + { + "epoch": 0.797844561553435, + "grad_norm": 0.5, + "learning_rate": 9.8436909397843e-06, + "loss": 0.2258, + "step": 16805 + }, + { + "epoch": 0.7980819446422637, + "grad_norm": 0.419921875, + "learning_rate": 9.832747756206285e-06, + "loss": 0.2235, + "step": 16810 + }, + { + "epoch": 0.7983193277310925, + "grad_norm": 0.45703125, + "learning_rate": 9.821815460875491e-06, + "loss": 0.2228, + "step": 16815 + }, + { + "epoch": 0.7985567108199212, + "grad_norm": 0.455078125, + "learning_rate": 9.810894060529438e-06, + "loss": 0.23, + "step": 16820 + }, + { + "epoch": 0.7987940939087499, + "grad_norm": 0.46484375, + "learning_rate": 9.79998356189893e-06, + "loss": 0.223, + "step": 16825 + }, + { + "epoch": 0.7990314769975787, + "grad_norm": 0.451171875, + "learning_rate": 9.789083971708077e-06, + "loss": 0.2265, + "step": 16830 + }, + { + "epoch": 0.7992688600864074, + "grad_norm": 0.470703125, + "learning_rate": 9.77819529667423e-06, + "loss": 0.223, + "step": 16835 + }, + { + "epoch": 0.7995062431752362, + "grad_norm": 0.416015625, + "learning_rate": 9.767317543508037e-06, + "loss": 0.221, + "step": 16840 + }, + { + "epoch": 0.7997436262640649, + "grad_norm": 0.451171875, + "learning_rate": 9.756450718913404e-06, + "loss": 0.2215, + "step": 16845 + }, + { + "epoch": 0.7999810093528937, + "grad_norm": 0.4375, + "learning_rate": 9.745594829587501e-06, + "loss": 0.2206, + "step": 16850 + }, + { + "epoch": 0.8002183924417224, + "grad_norm": 0.4296875, + "learning_rate": 9.734749882220779e-06, + "loss": 0.2241, + "step": 16855 + }, + { + "epoch": 0.8004557755305513, + "grad_norm": 0.44921875, + "learning_rate": 9.723915883496917e-06, + "loss": 0.2199, + "step": 16860 + }, + { + "epoch": 0.80069315861938, + "grad_norm": 0.439453125, + "learning_rate": 9.713092840092864e-06, + "loss": 0.2233, + "step": 16865 + }, + { + "epoch": 0.8009305417082087, + "grad_norm": 0.435546875, + "learning_rate": 9.702280758678806e-06, + "loss": 0.2226, + "step": 16870 + }, + { + "epoch": 0.8011679247970375, + "grad_norm": 0.45703125, + "learning_rate": 9.691479645918189e-06, + "loss": 0.2258, + "step": 16875 + }, + { + "epoch": 0.8014053078858662, + "grad_norm": 0.439453125, + "learning_rate": 9.680689508467685e-06, + "loss": 0.2273, + "step": 16880 + }, + { + "epoch": 0.801642690974695, + "grad_norm": 0.4140625, + "learning_rate": 9.669910352977204e-06, + "loss": 0.2226, + "step": 16885 + }, + { + "epoch": 0.8018800740635237, + "grad_norm": 0.458984375, + "learning_rate": 9.659142186089893e-06, + "loss": 0.2249, + "step": 16890 + }, + { + "epoch": 0.8021174571523525, + "grad_norm": 0.46484375, + "learning_rate": 9.648385014442127e-06, + "loss": 0.2236, + "step": 16895 + }, + { + "epoch": 0.8023548402411812, + "grad_norm": 0.419921875, + "learning_rate": 9.637638844663504e-06, + "loss": 0.2237, + "step": 16900 + }, + { + "epoch": 0.80259222333001, + "grad_norm": 0.4296875, + "learning_rate": 9.626903683376828e-06, + "loss": 0.2256, + "step": 16905 + }, + { + "epoch": 0.8028296064188387, + "grad_norm": 0.408203125, + "learning_rate": 9.616179537198141e-06, + "loss": 0.2197, + "step": 16910 + }, + { + "epoch": 0.8030669895076675, + "grad_norm": 0.42578125, + "learning_rate": 9.605466412736675e-06, + "loss": 0.2232, + "step": 16915 + }, + { + "epoch": 0.8033043725964962, + "grad_norm": 0.4453125, + "learning_rate": 9.594764316594901e-06, + "loss": 0.2232, + "step": 16920 + }, + { + "epoch": 0.8035417556853249, + "grad_norm": 0.427734375, + "learning_rate": 9.584073255368444e-06, + "loss": 0.2246, + "step": 16925 + }, + { + "epoch": 0.8037791387741537, + "grad_norm": 0.396484375, + "learning_rate": 9.573393235646177e-06, + "loss": 0.2245, + "step": 16930 + }, + { + "epoch": 0.8040165218629824, + "grad_norm": 0.412109375, + "learning_rate": 9.562724264010139e-06, + "loss": 0.2262, + "step": 16935 + }, + { + "epoch": 0.8042539049518113, + "grad_norm": 0.478515625, + "learning_rate": 9.552066347035573e-06, + "loss": 0.2242, + "step": 16940 + }, + { + "epoch": 0.80449128804064, + "grad_norm": 0.419921875, + "learning_rate": 9.541419491290902e-06, + "loss": 0.2255, + "step": 16945 + }, + { + "epoch": 0.8047286711294688, + "grad_norm": 0.38671875, + "learning_rate": 9.530783703337727e-06, + "loss": 0.2212, + "step": 16950 + }, + { + "epoch": 0.8049660542182975, + "grad_norm": 0.37890625, + "learning_rate": 9.52015898973085e-06, + "loss": 0.2252, + "step": 16955 + }, + { + "epoch": 0.8052034373071263, + "grad_norm": 0.451171875, + "learning_rate": 9.509545357018232e-06, + "loss": 0.2219, + "step": 16960 + }, + { + "epoch": 0.805440820395955, + "grad_norm": 0.484375, + "learning_rate": 9.498942811740997e-06, + "loss": 0.2217, + "step": 16965 + }, + { + "epoch": 0.8056782034847837, + "grad_norm": 0.43359375, + "learning_rate": 9.488351360433447e-06, + "loss": 0.2213, + "step": 16970 + }, + { + "epoch": 0.8059155865736125, + "grad_norm": 0.43359375, + "learning_rate": 9.477771009623055e-06, + "loss": 0.2195, + "step": 16975 + }, + { + "epoch": 0.8061529696624412, + "grad_norm": 0.4453125, + "learning_rate": 9.467201765830435e-06, + "loss": 0.226, + "step": 16980 + }, + { + "epoch": 0.80639035275127, + "grad_norm": 0.396484375, + "learning_rate": 9.456643635569367e-06, + "loss": 0.2231, + "step": 16985 + }, + { + "epoch": 0.8066277358400987, + "grad_norm": 0.46484375, + "learning_rate": 9.446096625346778e-06, + "loss": 0.2238, + "step": 16990 + }, + { + "epoch": 0.8068651189289275, + "grad_norm": 0.40625, + "learning_rate": 9.435560741662735e-06, + "loss": 0.2282, + "step": 16995 + }, + { + "epoch": 0.8071025020177562, + "grad_norm": 0.384765625, + "learning_rate": 9.425035991010472e-06, + "loss": 0.223, + "step": 17000 + }, + { + "epoch": 0.807339885106585, + "grad_norm": 0.515625, + "learning_rate": 9.414522379876331e-06, + "loss": 0.2244, + "step": 17005 + }, + { + "epoch": 0.8075772681954138, + "grad_norm": 0.447265625, + "learning_rate": 9.404019914739808e-06, + "loss": 0.2259, + "step": 17010 + }, + { + "epoch": 0.8078146512842425, + "grad_norm": 0.44140625, + "learning_rate": 9.393528602073517e-06, + "loss": 0.2215, + "step": 17015 + }, + { + "epoch": 0.8080520343730713, + "grad_norm": 0.423828125, + "learning_rate": 9.383048448343226e-06, + "loss": 0.2253, + "step": 17020 + }, + { + "epoch": 0.8082894174619, + "grad_norm": 0.5546875, + "learning_rate": 9.372579460007778e-06, + "loss": 0.2215, + "step": 17025 + }, + { + "epoch": 0.8085268005507288, + "grad_norm": 0.421875, + "learning_rate": 9.362121643519186e-06, + "loss": 0.2223, + "step": 17030 + }, + { + "epoch": 0.8087641836395575, + "grad_norm": 0.4296875, + "learning_rate": 9.35167500532254e-06, + "loss": 0.2225, + "step": 17035 + }, + { + "epoch": 0.8090015667283863, + "grad_norm": 0.5, + "learning_rate": 9.341239551856071e-06, + "loss": 0.2242, + "step": 17040 + }, + { + "epoch": 0.809238949817215, + "grad_norm": 0.48046875, + "learning_rate": 9.33081528955109e-06, + "loss": 0.2216, + "step": 17045 + }, + { + "epoch": 0.8094763329060438, + "grad_norm": 0.4453125, + "learning_rate": 9.320402224832018e-06, + "loss": 0.2212, + "step": 17050 + }, + { + "epoch": 0.8097137159948725, + "grad_norm": 0.431640625, + "learning_rate": 9.310000364116395e-06, + "loss": 0.222, + "step": 17055 + }, + { + "epoch": 0.8099510990837013, + "grad_norm": 0.392578125, + "learning_rate": 9.299609713814827e-06, + "loss": 0.2255, + "step": 17060 + }, + { + "epoch": 0.81018848217253, + "grad_norm": 0.41015625, + "learning_rate": 9.289230280331029e-06, + "loss": 0.2233, + "step": 17065 + }, + { + "epoch": 0.8104258652613587, + "grad_norm": 0.431640625, + "learning_rate": 9.278862070061798e-06, + "loss": 0.2251, + "step": 17070 + }, + { + "epoch": 0.8106632483501875, + "grad_norm": 0.404296875, + "learning_rate": 9.26850508939702e-06, + "loss": 0.2226, + "step": 17075 + }, + { + "epoch": 0.8109006314390163, + "grad_norm": 0.4453125, + "learning_rate": 9.25815934471965e-06, + "loss": 0.2256, + "step": 17080 + }, + { + "epoch": 0.8111380145278451, + "grad_norm": 0.478515625, + "learning_rate": 9.247824842405724e-06, + "loss": 0.225, + "step": 17085 + }, + { + "epoch": 0.8113753976166738, + "grad_norm": 0.49609375, + "learning_rate": 9.237501588824352e-06, + "loss": 0.2205, + "step": 17090 + }, + { + "epoch": 0.8116127807055026, + "grad_norm": 0.447265625, + "learning_rate": 9.227189590337697e-06, + "loss": 0.2234, + "step": 17095 + }, + { + "epoch": 0.8118501637943313, + "grad_norm": 0.431640625, + "learning_rate": 9.216888853301012e-06, + "loss": 0.2226, + "step": 17100 + }, + { + "epoch": 0.8120875468831601, + "grad_norm": 0.416015625, + "learning_rate": 9.206599384062585e-06, + "loss": 0.2234, + "step": 17105 + }, + { + "epoch": 0.8123249299719888, + "grad_norm": 0.38671875, + "learning_rate": 9.196321188963775e-06, + "loss": 0.2273, + "step": 17110 + }, + { + "epoch": 0.8125623130608175, + "grad_norm": 0.416015625, + "learning_rate": 9.186054274338977e-06, + "loss": 0.226, + "step": 17115 + }, + { + "epoch": 0.8127996961496463, + "grad_norm": 0.56640625, + "learning_rate": 9.175798646515655e-06, + "loss": 0.2237, + "step": 17120 + }, + { + "epoch": 0.813037079238475, + "grad_norm": 0.462890625, + "learning_rate": 9.165554311814304e-06, + "loss": 0.2235, + "step": 17125 + }, + { + "epoch": 0.8132744623273038, + "grad_norm": 0.4140625, + "learning_rate": 9.155321276548454e-06, + "loss": 0.2244, + "step": 17130 + }, + { + "epoch": 0.8135118454161325, + "grad_norm": 0.404296875, + "learning_rate": 9.145099547024678e-06, + "loss": 0.224, + "step": 17135 + }, + { + "epoch": 0.8137492285049613, + "grad_norm": 0.4140625, + "learning_rate": 9.134889129542593e-06, + "loss": 0.2234, + "step": 17140 + }, + { + "epoch": 0.81398661159379, + "grad_norm": 0.439453125, + "learning_rate": 9.124690030394827e-06, + "loss": 0.2238, + "step": 17145 + }, + { + "epoch": 0.8142239946826189, + "grad_norm": 0.4765625, + "learning_rate": 9.11450225586703e-06, + "loss": 0.2256, + "step": 17150 + }, + { + "epoch": 0.8144613777714476, + "grad_norm": 0.404296875, + "learning_rate": 9.104325812237893e-06, + "loss": 0.2268, + "step": 17155 + }, + { + "epoch": 0.8146987608602763, + "grad_norm": 0.458984375, + "learning_rate": 9.094160705779101e-06, + "loss": 0.2211, + "step": 17160 + }, + { + "epoch": 0.8149361439491051, + "grad_norm": 0.49609375, + "learning_rate": 9.084006942755378e-06, + "loss": 0.2235, + "step": 17165 + }, + { + "epoch": 0.8151735270379338, + "grad_norm": 0.416015625, + "learning_rate": 9.073864529424423e-06, + "loss": 0.2262, + "step": 17170 + }, + { + "epoch": 0.8154109101267626, + "grad_norm": 0.5078125, + "learning_rate": 9.063733472036973e-06, + "loss": 0.2191, + "step": 17175 + }, + { + "epoch": 0.8156482932155913, + "grad_norm": 0.380859375, + "learning_rate": 9.053613776836745e-06, + "loss": 0.2225, + "step": 17180 + }, + { + "epoch": 0.8158856763044201, + "grad_norm": 0.47265625, + "learning_rate": 9.043505450060475e-06, + "loss": 0.2183, + "step": 17185 + }, + { + "epoch": 0.8161230593932488, + "grad_norm": 0.4375, + "learning_rate": 9.033408497937865e-06, + "loss": 0.2231, + "step": 17190 + }, + { + "epoch": 0.8163604424820776, + "grad_norm": 0.44921875, + "learning_rate": 9.023322926691614e-06, + "loss": 0.225, + "step": 17195 + }, + { + "epoch": 0.8165978255709063, + "grad_norm": 0.4921875, + "learning_rate": 9.013248742537433e-06, + "loss": 0.2218, + "step": 17200 + }, + { + "epoch": 0.8168352086597351, + "grad_norm": 0.46875, + "learning_rate": 9.003185951683988e-06, + "loss": 0.2198, + "step": 17205 + }, + { + "epoch": 0.8170725917485638, + "grad_norm": 0.412109375, + "learning_rate": 8.993134560332928e-06, + "loss": 0.2183, + "step": 17210 + }, + { + "epoch": 0.8173099748373925, + "grad_norm": 0.474609375, + "learning_rate": 8.983094574678877e-06, + "loss": 0.2215, + "step": 17215 + }, + { + "epoch": 0.8175473579262214, + "grad_norm": 0.478515625, + "learning_rate": 8.973066000909442e-06, + "loss": 0.2232, + "step": 17220 + }, + { + "epoch": 0.81778474101505, + "grad_norm": 0.42578125, + "learning_rate": 8.963048845205182e-06, + "loss": 0.2233, + "step": 17225 + }, + { + "epoch": 0.8180221241038789, + "grad_norm": 0.41796875, + "learning_rate": 8.953043113739626e-06, + "loss": 0.2253, + "step": 17230 + }, + { + "epoch": 0.8182595071927076, + "grad_norm": 0.478515625, + "learning_rate": 8.943048812679261e-06, + "loss": 0.2253, + "step": 17235 + }, + { + "epoch": 0.8184968902815364, + "grad_norm": 0.474609375, + "learning_rate": 8.933065948183525e-06, + "loss": 0.2268, + "step": 17240 + }, + { + "epoch": 0.8187342733703651, + "grad_norm": 0.412109375, + "learning_rate": 8.923094526404815e-06, + "loss": 0.2227, + "step": 17245 + }, + { + "epoch": 0.8189716564591939, + "grad_norm": 0.408203125, + "learning_rate": 8.913134553488478e-06, + "loss": 0.218, + "step": 17250 + }, + { + "epoch": 0.8192090395480226, + "grad_norm": 0.4140625, + "learning_rate": 8.903186035572795e-06, + "loss": 0.2254, + "step": 17255 + }, + { + "epoch": 0.8194464226368513, + "grad_norm": 0.451171875, + "learning_rate": 8.893248978788984e-06, + "loss": 0.2223, + "step": 17260 + }, + { + "epoch": 0.8196838057256801, + "grad_norm": 0.431640625, + "learning_rate": 8.883323389261224e-06, + "loss": 0.2247, + "step": 17265 + }, + { + "epoch": 0.8199211888145088, + "grad_norm": 0.42578125, + "learning_rate": 8.873409273106604e-06, + "loss": 0.2234, + "step": 17270 + }, + { + "epoch": 0.8201585719033376, + "grad_norm": 0.451171875, + "learning_rate": 8.863506636435146e-06, + "loss": 0.2217, + "step": 17275 + }, + { + "epoch": 0.8203959549921663, + "grad_norm": 0.427734375, + "learning_rate": 8.8536154853498e-06, + "loss": 0.2232, + "step": 17280 + }, + { + "epoch": 0.8206333380809951, + "grad_norm": 0.453125, + "learning_rate": 8.843735825946442e-06, + "loss": 0.2235, + "step": 17285 + }, + { + "epoch": 0.8208707211698238, + "grad_norm": 0.396484375, + "learning_rate": 8.833867664313863e-06, + "loss": 0.2188, + "step": 17290 + }, + { + "epoch": 0.8211081042586527, + "grad_norm": 0.4609375, + "learning_rate": 8.824011006533754e-06, + "loss": 0.2208, + "step": 17295 + }, + { + "epoch": 0.8213454873474814, + "grad_norm": 0.408203125, + "learning_rate": 8.814165858680737e-06, + "loss": 0.2215, + "step": 17300 + }, + { + "epoch": 0.8215828704363101, + "grad_norm": 0.419921875, + "learning_rate": 8.804332226822325e-06, + "loss": 0.2269, + "step": 17305 + }, + { + "epoch": 0.8218202535251389, + "grad_norm": 0.4375, + "learning_rate": 8.794510117018961e-06, + "loss": 0.2173, + "step": 17310 + }, + { + "epoch": 0.8220576366139676, + "grad_norm": 0.427734375, + "learning_rate": 8.784699535323943e-06, + "loss": 0.2215, + "step": 17315 + }, + { + "epoch": 0.8222950197027964, + "grad_norm": 0.427734375, + "learning_rate": 8.774900487783499e-06, + "loss": 0.2223, + "step": 17320 + }, + { + "epoch": 0.8225324027916251, + "grad_norm": 0.419921875, + "learning_rate": 8.76511298043673e-06, + "loss": 0.225, + "step": 17325 + }, + { + "epoch": 0.8227697858804539, + "grad_norm": 0.4140625, + "learning_rate": 8.75533701931565e-06, + "loss": 0.2259, + "step": 17330 + }, + { + "epoch": 0.8230071689692826, + "grad_norm": 0.4296875, + "learning_rate": 8.745572610445118e-06, + "loss": 0.224, + "step": 17335 + }, + { + "epoch": 0.8232445520581114, + "grad_norm": 0.40234375, + "learning_rate": 8.735819759842903e-06, + "loss": 0.2248, + "step": 17340 + }, + { + "epoch": 0.8234819351469401, + "grad_norm": 0.484375, + "learning_rate": 8.726078473519647e-06, + "loss": 0.2299, + "step": 17345 + }, + { + "epoch": 0.8237193182357689, + "grad_norm": 0.359375, + "learning_rate": 8.716348757478857e-06, + "loss": 0.2229, + "step": 17350 + }, + { + "epoch": 0.8239567013245976, + "grad_norm": 0.40625, + "learning_rate": 8.706630617716913e-06, + "loss": 0.2245, + "step": 17355 + }, + { + "epoch": 0.8241940844134263, + "grad_norm": 0.384765625, + "learning_rate": 8.69692406022305e-06, + "loss": 0.2213, + "step": 17360 + }, + { + "epoch": 0.8244314675022552, + "grad_norm": 0.41796875, + "learning_rate": 8.687229090979395e-06, + "loss": 0.2235, + "step": 17365 + }, + { + "epoch": 0.8246688505910839, + "grad_norm": 0.447265625, + "learning_rate": 8.677545715960903e-06, + "loss": 0.2197, + "step": 17370 + }, + { + "epoch": 0.8249062336799127, + "grad_norm": 0.38671875, + "learning_rate": 8.667873941135396e-06, + "loss": 0.2248, + "step": 17375 + }, + { + "epoch": 0.8251436167687414, + "grad_norm": 0.462890625, + "learning_rate": 8.658213772463536e-06, + "loss": 0.2233, + "step": 17380 + }, + { + "epoch": 0.8253809998575702, + "grad_norm": 0.3984375, + "learning_rate": 8.648565215898854e-06, + "loss": 0.2211, + "step": 17385 + }, + { + "epoch": 0.8256183829463989, + "grad_norm": 0.5078125, + "learning_rate": 8.638928277387703e-06, + "loss": 0.2221, + "step": 17390 + }, + { + "epoch": 0.8258557660352277, + "grad_norm": 0.458984375, + "learning_rate": 8.62930296286929e-06, + "loss": 0.225, + "step": 17395 + }, + { + "epoch": 0.8260931491240564, + "grad_norm": 0.39453125, + "learning_rate": 8.619689278275653e-06, + "loss": 0.2226, + "step": 17400 + }, + { + "epoch": 0.8263305322128851, + "grad_norm": 0.412109375, + "learning_rate": 8.61008722953165e-06, + "loss": 0.2241, + "step": 17405 + }, + { + "epoch": 0.8265679153017139, + "grad_norm": 0.42578125, + "learning_rate": 8.600496822554995e-06, + "loss": 0.2212, + "step": 17410 + }, + { + "epoch": 0.8268052983905426, + "grad_norm": 0.462890625, + "learning_rate": 8.590918063256206e-06, + "loss": 0.2256, + "step": 17415 + }, + { + "epoch": 0.8270426814793714, + "grad_norm": 0.5078125, + "learning_rate": 8.581350957538628e-06, + "loss": 0.2261, + "step": 17420 + }, + { + "epoch": 0.8272800645682001, + "grad_norm": 0.453125, + "learning_rate": 8.571795511298423e-06, + "loss": 0.2243, + "step": 17425 + }, + { + "epoch": 0.827517447657029, + "grad_norm": 0.43359375, + "learning_rate": 8.562251730424581e-06, + "loss": 0.2247, + "step": 17430 + }, + { + "epoch": 0.8277548307458577, + "grad_norm": 0.404296875, + "learning_rate": 8.552719620798874e-06, + "loss": 0.2245, + "step": 17435 + }, + { + "epoch": 0.8279922138346865, + "grad_norm": 0.427734375, + "learning_rate": 8.543199188295902e-06, + "loss": 0.2238, + "step": 17440 + }, + { + "epoch": 0.8282295969235152, + "grad_norm": 0.462890625, + "learning_rate": 8.533690438783074e-06, + "loss": 0.2214, + "step": 17445 + }, + { + "epoch": 0.8284669800123439, + "grad_norm": 0.439453125, + "learning_rate": 8.524193378120581e-06, + "loss": 0.2212, + "step": 17450 + }, + { + "epoch": 0.8287043631011727, + "grad_norm": 0.44921875, + "learning_rate": 8.514708012161421e-06, + "loss": 0.2235, + "step": 17455 + }, + { + "epoch": 0.8289417461900014, + "grad_norm": 0.408203125, + "learning_rate": 8.505234346751378e-06, + "loss": 0.2177, + "step": 17460 + }, + { + "epoch": 0.8291791292788302, + "grad_norm": 0.427734375, + "learning_rate": 8.495772387729034e-06, + "loss": 0.2218, + "step": 17465 + }, + { + "epoch": 0.8294165123676589, + "grad_norm": 0.408203125, + "learning_rate": 8.486322140925748e-06, + "loss": 0.2236, + "step": 17470 + }, + { + "epoch": 0.8296538954564877, + "grad_norm": 0.408203125, + "learning_rate": 8.47688361216567e-06, + "loss": 0.2228, + "step": 17475 + }, + { + "epoch": 0.8298912785453164, + "grad_norm": 0.51953125, + "learning_rate": 8.467456807265714e-06, + "loss": 0.2207, + "step": 17480 + }, + { + "epoch": 0.8301286616341452, + "grad_norm": 0.435546875, + "learning_rate": 8.458041732035575e-06, + "loss": 0.2238, + "step": 17485 + }, + { + "epoch": 0.8303660447229739, + "grad_norm": 0.451171875, + "learning_rate": 8.448638392277732e-06, + "loss": 0.2198, + "step": 17490 + }, + { + "epoch": 0.8306034278118027, + "grad_norm": 0.50390625, + "learning_rate": 8.439246793787417e-06, + "loss": 0.2217, + "step": 17495 + }, + { + "epoch": 0.8308408109006314, + "grad_norm": 0.4453125, + "learning_rate": 8.429866942352627e-06, + "loss": 0.2243, + "step": 17500 + }, + { + "epoch": 0.8310781939894601, + "grad_norm": 0.427734375, + "learning_rate": 8.420498843754118e-06, + "loss": 0.2222, + "step": 17505 + }, + { + "epoch": 0.831315577078289, + "grad_norm": 0.455078125, + "learning_rate": 8.411142503765415e-06, + "loss": 0.2218, + "step": 17510 + }, + { + "epoch": 0.8315529601671177, + "grad_norm": 0.416015625, + "learning_rate": 8.401797928152786e-06, + "loss": 0.2229, + "step": 17515 + }, + { + "epoch": 0.8317903432559465, + "grad_norm": 0.494140625, + "learning_rate": 8.392465122675244e-06, + "loss": 0.2236, + "step": 17520 + }, + { + "epoch": 0.8320277263447752, + "grad_norm": 0.40625, + "learning_rate": 8.383144093084558e-06, + "loss": 0.2209, + "step": 17525 + }, + { + "epoch": 0.832265109433604, + "grad_norm": 0.39453125, + "learning_rate": 8.373834845125238e-06, + "loss": 0.2264, + "step": 17530 + }, + { + "epoch": 0.8325024925224327, + "grad_norm": 0.4453125, + "learning_rate": 8.364537384534537e-06, + "loss": 0.2211, + "step": 17535 + }, + { + "epoch": 0.8327398756112615, + "grad_norm": 0.474609375, + "learning_rate": 8.355251717042423e-06, + "loss": 0.2236, + "step": 17540 + }, + { + "epoch": 0.8329772587000902, + "grad_norm": 0.439453125, + "learning_rate": 8.345977848371622e-06, + "loss": 0.2241, + "step": 17545 + }, + { + "epoch": 0.8332146417889189, + "grad_norm": 0.46875, + "learning_rate": 8.336715784237567e-06, + "loss": 0.2268, + "step": 17550 + }, + { + "epoch": 0.8334520248777477, + "grad_norm": 0.4765625, + "learning_rate": 8.327465530348444e-06, + "loss": 0.2201, + "step": 17555 + }, + { + "epoch": 0.8336894079665764, + "grad_norm": 0.53515625, + "learning_rate": 8.31822709240512e-06, + "loss": 0.2255, + "step": 17560 + }, + { + "epoch": 0.8339267910554052, + "grad_norm": 0.482421875, + "learning_rate": 8.309000476101222e-06, + "loss": 0.2248, + "step": 17565 + }, + { + "epoch": 0.8341641741442339, + "grad_norm": 0.4296875, + "learning_rate": 8.299785687123054e-06, + "loss": 0.217, + "step": 17570 + }, + { + "epoch": 0.8344015572330628, + "grad_norm": 0.412109375, + "learning_rate": 8.290582731149667e-06, + "loss": 0.2253, + "step": 17575 + }, + { + "epoch": 0.8346389403218915, + "grad_norm": 0.462890625, + "learning_rate": 8.28139161385279e-06, + "loss": 0.2235, + "step": 17580 + }, + { + "epoch": 0.8348763234107203, + "grad_norm": 0.412109375, + "learning_rate": 8.272212340896863e-06, + "loss": 0.224, + "step": 17585 + }, + { + "epoch": 0.835113706499549, + "grad_norm": 0.427734375, + "learning_rate": 8.263044917939038e-06, + "loss": 0.2233, + "step": 17590 + }, + { + "epoch": 0.8353510895883777, + "grad_norm": 0.4375, + "learning_rate": 8.253889350629154e-06, + "loss": 0.2198, + "step": 17595 + }, + { + "epoch": 0.8355884726772065, + "grad_norm": 0.45703125, + "learning_rate": 8.244745644609748e-06, + "loss": 0.2248, + "step": 17600 + }, + { + "epoch": 0.8358258557660352, + "grad_norm": 0.408203125, + "learning_rate": 8.235613805516029e-06, + "loss": 0.2182, + "step": 17605 + }, + { + "epoch": 0.836063238854864, + "grad_norm": 0.484375, + "learning_rate": 8.226493838975932e-06, + "loss": 0.2234, + "step": 17610 + }, + { + "epoch": 0.8363006219436927, + "grad_norm": 0.443359375, + "learning_rate": 8.217385750610038e-06, + "loss": 0.2231, + "step": 17615 + }, + { + "epoch": 0.8365380050325215, + "grad_norm": 0.431640625, + "learning_rate": 8.20828954603162e-06, + "loss": 0.2249, + "step": 17620 + }, + { + "epoch": 0.8367753881213502, + "grad_norm": 0.431640625, + "learning_rate": 8.199205230846624e-06, + "loss": 0.2228, + "step": 17625 + }, + { + "epoch": 0.837012771210179, + "grad_norm": 0.43359375, + "learning_rate": 8.190132810653686e-06, + "loss": 0.224, + "step": 17630 + }, + { + "epoch": 0.8372501542990077, + "grad_norm": 0.455078125, + "learning_rate": 8.181072291044085e-06, + "loss": 0.225, + "step": 17635 + }, + { + "epoch": 0.8374875373878365, + "grad_norm": 0.412109375, + "learning_rate": 8.172023677601784e-06, + "loss": 0.2203, + "step": 17640 + }, + { + "epoch": 0.8377249204766652, + "grad_norm": 0.41796875, + "learning_rate": 8.162986975903397e-06, + "loss": 0.2231, + "step": 17645 + }, + { + "epoch": 0.837962303565494, + "grad_norm": 0.435546875, + "learning_rate": 8.153962191518205e-06, + "loss": 0.2236, + "step": 17650 + }, + { + "epoch": 0.8381996866543228, + "grad_norm": 0.39453125, + "learning_rate": 8.144949330008144e-06, + "loss": 0.224, + "step": 17655 + }, + { + "epoch": 0.8384370697431515, + "grad_norm": 0.447265625, + "learning_rate": 8.135948396927797e-06, + "loss": 0.2258, + "step": 17660 + }, + { + "epoch": 0.8386744528319803, + "grad_norm": 0.423828125, + "learning_rate": 8.126959397824399e-06, + "loss": 0.2263, + "step": 17665 + }, + { + "epoch": 0.838911835920809, + "grad_norm": 0.4921875, + "learning_rate": 8.117982338237822e-06, + "loss": 0.2263, + "step": 17670 + }, + { + "epoch": 0.8391492190096378, + "grad_norm": 0.451171875, + "learning_rate": 8.1090172237006e-06, + "loss": 0.2259, + "step": 17675 + }, + { + "epoch": 0.8393866020984665, + "grad_norm": 0.431640625, + "learning_rate": 8.100064059737894e-06, + "loss": 0.2276, + "step": 17680 + }, + { + "epoch": 0.8396239851872953, + "grad_norm": 0.455078125, + "learning_rate": 8.091122851867476e-06, + "loss": 0.2225, + "step": 17685 + }, + { + "epoch": 0.839861368276124, + "grad_norm": 0.41796875, + "learning_rate": 8.082193605599792e-06, + "loss": 0.2276, + "step": 17690 + }, + { + "epoch": 0.8400987513649527, + "grad_norm": 0.51953125, + "learning_rate": 8.073276326437886e-06, + "loss": 0.2235, + "step": 17695 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 0.4453125, + "learning_rate": 8.064371019877448e-06, + "loss": 0.2252, + "step": 17700 + }, + { + "epoch": 0.8405735175426102, + "grad_norm": 0.44921875, + "learning_rate": 8.055477691406764e-06, + "loss": 0.2277, + "step": 17705 + }, + { + "epoch": 0.840810900631439, + "grad_norm": 0.427734375, + "learning_rate": 8.04659634650676e-06, + "loss": 0.222, + "step": 17710 + }, + { + "epoch": 0.8410482837202677, + "grad_norm": 0.50390625, + "learning_rate": 8.037726990650963e-06, + "loss": 0.223, + "step": 17715 + }, + { + "epoch": 0.8412856668090966, + "grad_norm": 0.466796875, + "learning_rate": 8.028869629305533e-06, + "loss": 0.224, + "step": 17720 + }, + { + "epoch": 0.8415230498979253, + "grad_norm": 0.46484375, + "learning_rate": 8.0200242679292e-06, + "loss": 0.2201, + "step": 17725 + }, + { + "epoch": 0.8417604329867541, + "grad_norm": 0.474609375, + "learning_rate": 8.011190911973328e-06, + "loss": 0.2221, + "step": 17730 + }, + { + "epoch": 0.8419978160755828, + "grad_norm": 0.478515625, + "learning_rate": 8.002369566881877e-06, + "loss": 0.2231, + "step": 17735 + }, + { + "epoch": 0.8422351991644115, + "grad_norm": 0.451171875, + "learning_rate": 7.9935602380914e-06, + "loss": 0.2199, + "step": 17740 + }, + { + "epoch": 0.8424725822532403, + "grad_norm": 0.4375, + "learning_rate": 7.984762931031048e-06, + "loss": 0.2222, + "step": 17745 + }, + { + "epoch": 0.842709965342069, + "grad_norm": 0.546875, + "learning_rate": 7.975977651122555e-06, + "loss": 0.2266, + "step": 17750 + }, + { + "epoch": 0.8429473484308978, + "grad_norm": 0.376953125, + "learning_rate": 7.967204403780258e-06, + "loss": 0.2213, + "step": 17755 + }, + { + "epoch": 0.8431847315197265, + "grad_norm": 0.39453125, + "learning_rate": 7.958443194411062e-06, + "loss": 0.223, + "step": 17760 + }, + { + "epoch": 0.8434221146085553, + "grad_norm": 0.3984375, + "learning_rate": 7.949694028414467e-06, + "loss": 0.2224, + "step": 17765 + }, + { + "epoch": 0.843659497697384, + "grad_norm": 0.4609375, + "learning_rate": 7.940956911182537e-06, + "loss": 0.2257, + "step": 17770 + }, + { + "epoch": 0.8438968807862128, + "grad_norm": 0.443359375, + "learning_rate": 7.932231848099924e-06, + "loss": 0.2248, + "step": 17775 + }, + { + "epoch": 0.8441342638750415, + "grad_norm": 0.421875, + "learning_rate": 7.923518844543842e-06, + "loss": 0.2245, + "step": 17780 + }, + { + "epoch": 0.8443716469638703, + "grad_norm": 0.4375, + "learning_rate": 7.914817905884077e-06, + "loss": 0.2233, + "step": 17785 + }, + { + "epoch": 0.844609030052699, + "grad_norm": 0.41796875, + "learning_rate": 7.906129037482975e-06, + "loss": 0.2216, + "step": 17790 + }, + { + "epoch": 0.8448464131415278, + "grad_norm": 0.404296875, + "learning_rate": 7.897452244695442e-06, + "loss": 0.2202, + "step": 17795 + }, + { + "epoch": 0.8450837962303566, + "grad_norm": 0.435546875, + "learning_rate": 7.888787532868958e-06, + "loss": 0.2218, + "step": 17800 + }, + { + "epoch": 0.8453211793191853, + "grad_norm": 0.408203125, + "learning_rate": 7.880134907343539e-06, + "loss": 0.2208, + "step": 17805 + }, + { + "epoch": 0.8455585624080141, + "grad_norm": 0.38671875, + "learning_rate": 7.87149437345176e-06, + "loss": 0.2224, + "step": 17810 + }, + { + "epoch": 0.8457959454968428, + "grad_norm": 0.4375, + "learning_rate": 7.862865936518735e-06, + "loss": 0.222, + "step": 17815 + }, + { + "epoch": 0.8460333285856716, + "grad_norm": 0.3984375, + "learning_rate": 7.854249601862147e-06, + "loss": 0.2184, + "step": 17820 + }, + { + "epoch": 0.8462707116745003, + "grad_norm": 0.404296875, + "learning_rate": 7.845645374792189e-06, + "loss": 0.2207, + "step": 17825 + }, + { + "epoch": 0.8465080947633291, + "grad_norm": 0.43359375, + "learning_rate": 7.837053260611607e-06, + "loss": 0.2233, + "step": 17830 + }, + { + "epoch": 0.8467454778521578, + "grad_norm": 0.458984375, + "learning_rate": 7.828473264615692e-06, + "loss": 0.2228, + "step": 17835 + }, + { + "epoch": 0.8469828609409865, + "grad_norm": 0.431640625, + "learning_rate": 7.819905392092252e-06, + "loss": 0.2232, + "step": 17840 + }, + { + "epoch": 0.8472202440298153, + "grad_norm": 0.419921875, + "learning_rate": 7.811349648321627e-06, + "loss": 0.2221, + "step": 17845 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 0.486328125, + "learning_rate": 7.802806038576676e-06, + "loss": 0.2232, + "step": 17850 + }, + { + "epoch": 0.8476950102074728, + "grad_norm": 0.439453125, + "learning_rate": 7.7942745681228e-06, + "loss": 0.225, + "step": 17855 + }, + { + "epoch": 0.8479323932963015, + "grad_norm": 0.47265625, + "learning_rate": 7.7857552422179e-06, + "loss": 0.2211, + "step": 17860 + }, + { + "epoch": 0.8481697763851304, + "grad_norm": 0.462890625, + "learning_rate": 7.777248066112397e-06, + "loss": 0.2183, + "step": 17865 + }, + { + "epoch": 0.8484071594739591, + "grad_norm": 0.466796875, + "learning_rate": 7.768753045049223e-06, + "loss": 0.2261, + "step": 17870 + }, + { + "epoch": 0.8486445425627879, + "grad_norm": 0.44921875, + "learning_rate": 7.760270184263824e-06, + "loss": 0.2197, + "step": 17875 + }, + { + "epoch": 0.8488819256516166, + "grad_norm": 0.466796875, + "learning_rate": 7.75179948898415e-06, + "loss": 0.2234, + "step": 17880 + }, + { + "epoch": 0.8491193087404453, + "grad_norm": 0.431640625, + "learning_rate": 7.743340964430647e-06, + "loss": 0.218, + "step": 17885 + }, + { + "epoch": 0.8493566918292741, + "grad_norm": 0.54296875, + "learning_rate": 7.734894615816271e-06, + "loss": 0.2283, + "step": 17890 + }, + { + "epoch": 0.8495940749181028, + "grad_norm": 0.41015625, + "learning_rate": 7.726460448346457e-06, + "loss": 0.2262, + "step": 17895 + }, + { + "epoch": 0.8498314580069316, + "grad_norm": 0.5546875, + "learning_rate": 7.718038467219159e-06, + "loss": 0.2255, + "step": 17900 + }, + { + "epoch": 0.8500688410957603, + "grad_norm": 0.3984375, + "learning_rate": 7.709628677624799e-06, + "loss": 0.2228, + "step": 17905 + }, + { + "epoch": 0.8503062241845891, + "grad_norm": 0.421875, + "learning_rate": 7.701231084746292e-06, + "loss": 0.2241, + "step": 17910 + }, + { + "epoch": 0.8505436072734178, + "grad_norm": 0.400390625, + "learning_rate": 7.692845693759029e-06, + "loss": 0.2264, + "step": 17915 + }, + { + "epoch": 0.8507809903622466, + "grad_norm": 0.4453125, + "learning_rate": 7.6844725098309e-06, + "loss": 0.2235, + "step": 17920 + }, + { + "epoch": 0.8510183734510753, + "grad_norm": 0.404296875, + "learning_rate": 7.676111538122262e-06, + "loss": 0.2205, + "step": 17925 + }, + { + "epoch": 0.8512557565399042, + "grad_norm": 0.498046875, + "learning_rate": 7.667762783785928e-06, + "loss": 0.2261, + "step": 17930 + }, + { + "epoch": 0.8514931396287329, + "grad_norm": 0.435546875, + "learning_rate": 7.659426251967212e-06, + "loss": 0.2252, + "step": 17935 + }, + { + "epoch": 0.8517305227175616, + "grad_norm": 0.42578125, + "learning_rate": 7.65110194780387e-06, + "loss": 0.2218, + "step": 17940 + }, + { + "epoch": 0.8519679058063904, + "grad_norm": 0.380859375, + "learning_rate": 7.642789876426148e-06, + "loss": 0.2216, + "step": 17945 + }, + { + "epoch": 0.8522052888952191, + "grad_norm": 0.404296875, + "learning_rate": 7.634490042956719e-06, + "loss": 0.2227, + "step": 17950 + }, + { + "epoch": 0.8524426719840479, + "grad_norm": 0.388671875, + "learning_rate": 7.6262024525107456e-06, + "loss": 0.22, + "step": 17955 + }, + { + "epoch": 0.8526800550728766, + "grad_norm": 0.40625, + "learning_rate": 7.617927110195825e-06, + "loss": 0.2224, + "step": 17960 + }, + { + "epoch": 0.8529174381617054, + "grad_norm": 0.447265625, + "learning_rate": 7.609664021112024e-06, + "loss": 0.2201, + "step": 17965 + }, + { + "epoch": 0.8531548212505341, + "grad_norm": 0.462890625, + "learning_rate": 7.6014131903518375e-06, + "loss": 0.2239, + "step": 17970 + }, + { + "epoch": 0.8533922043393629, + "grad_norm": 0.408203125, + "learning_rate": 7.593174623000209e-06, + "loss": 0.2222, + "step": 17975 + }, + { + "epoch": 0.8536295874281916, + "grad_norm": 0.45703125, + "learning_rate": 7.584948324134543e-06, + "loss": 0.2233, + "step": 17980 + }, + { + "epoch": 0.8538669705170203, + "grad_norm": 0.39453125, + "learning_rate": 7.576734298824666e-06, + "loss": 0.2214, + "step": 17985 + }, + { + "epoch": 0.8541043536058491, + "grad_norm": 0.404296875, + "learning_rate": 7.56853255213284e-06, + "loss": 0.2234, + "step": 17990 + }, + { + "epoch": 0.8543417366946778, + "grad_norm": 0.400390625, + "learning_rate": 7.5603430891137635e-06, + "loss": 0.2206, + "step": 17995 + }, + { + "epoch": 0.8545791197835066, + "grad_norm": 0.48828125, + "learning_rate": 7.552165914814576e-06, + "loss": 0.2241, + "step": 18000 + }, + { + "epoch": 0.8548165028723353, + "grad_norm": 0.455078125, + "learning_rate": 7.5440010342748235e-06, + "loss": 0.2221, + "step": 18005 + }, + { + "epoch": 0.8550538859611642, + "grad_norm": 0.44921875, + "learning_rate": 7.535848452526489e-06, + "loss": 0.2257, + "step": 18010 + }, + { + "epoch": 0.8552912690499929, + "grad_norm": 0.515625, + "learning_rate": 7.527708174593966e-06, + "loss": 0.2255, + "step": 18015 + }, + { + "epoch": 0.8555286521388217, + "grad_norm": 0.458984375, + "learning_rate": 7.519580205494079e-06, + "loss": 0.2227, + "step": 18020 + }, + { + "epoch": 0.8557660352276504, + "grad_norm": 0.427734375, + "learning_rate": 7.511464550236052e-06, + "loss": 0.2206, + "step": 18025 + }, + { + "epoch": 0.8560034183164791, + "grad_norm": 0.44921875, + "learning_rate": 7.50336121382153e-06, + "loss": 0.2197, + "step": 18030 + }, + { + "epoch": 0.8562408014053079, + "grad_norm": 0.4765625, + "learning_rate": 7.49527020124456e-06, + "loss": 0.2218, + "step": 18035 + }, + { + "epoch": 0.8564781844941366, + "grad_norm": 0.3984375, + "learning_rate": 7.487191517491593e-06, + "loss": 0.221, + "step": 18040 + }, + { + "epoch": 0.8567155675829654, + "grad_norm": 0.404296875, + "learning_rate": 7.4791251675414925e-06, + "loss": 0.2201, + "step": 18045 + }, + { + "epoch": 0.8569529506717941, + "grad_norm": 0.4375, + "learning_rate": 7.471071156365511e-06, + "loss": 0.2204, + "step": 18050 + }, + { + "epoch": 0.8571903337606229, + "grad_norm": 0.50390625, + "learning_rate": 7.463029488927299e-06, + "loss": 0.2243, + "step": 18055 + }, + { + "epoch": 0.8574277168494516, + "grad_norm": 0.5, + "learning_rate": 7.455000170182892e-06, + "loss": 0.2224, + "step": 18060 + }, + { + "epoch": 0.8576650999382804, + "grad_norm": 0.4140625, + "learning_rate": 7.446983205080735e-06, + "loss": 0.2212, + "step": 18065 + }, + { + "epoch": 0.8579024830271091, + "grad_norm": 0.4453125, + "learning_rate": 7.4389785985616454e-06, + "loss": 0.2211, + "step": 18070 + }, + { + "epoch": 0.858139866115938, + "grad_norm": 0.388671875, + "learning_rate": 7.4309863555588124e-06, + "loss": 0.2256, + "step": 18075 + }, + { + "epoch": 0.8583772492047667, + "grad_norm": 0.412109375, + "learning_rate": 7.4230064809978325e-06, + "loss": 0.2253, + "step": 18080 + }, + { + "epoch": 0.8586146322935954, + "grad_norm": 0.486328125, + "learning_rate": 7.415038979796657e-06, + "loss": 0.222, + "step": 18085 + }, + { + "epoch": 0.8588520153824242, + "grad_norm": 0.439453125, + "learning_rate": 7.407083856865636e-06, + "loss": 0.2245, + "step": 18090 + }, + { + "epoch": 0.8590893984712529, + "grad_norm": 0.400390625, + "learning_rate": 7.399141117107454e-06, + "loss": 0.2201, + "step": 18095 + }, + { + "epoch": 0.8593267815600817, + "grad_norm": 0.435546875, + "learning_rate": 7.391210765417203e-06, + "loss": 0.2233, + "step": 18100 + }, + { + "epoch": 0.8595641646489104, + "grad_norm": 0.40625, + "learning_rate": 7.38329280668231e-06, + "loss": 0.2221, + "step": 18105 + }, + { + "epoch": 0.8598015477377392, + "grad_norm": 0.55859375, + "learning_rate": 7.375387245782595e-06, + "loss": 0.2227, + "step": 18110 + }, + { + "epoch": 0.8600389308265679, + "grad_norm": 0.51171875, + "learning_rate": 7.367494087590196e-06, + "loss": 0.2218, + "step": 18115 + }, + { + "epoch": 0.8602763139153967, + "grad_norm": 0.515625, + "learning_rate": 7.359613336969648e-06, + "loss": 0.2216, + "step": 18120 + }, + { + "epoch": 0.8605136970042254, + "grad_norm": 0.404296875, + "learning_rate": 7.351744998777814e-06, + "loss": 0.2232, + "step": 18125 + }, + { + "epoch": 0.8607510800930541, + "grad_norm": 0.435546875, + "learning_rate": 7.343889077863915e-06, + "loss": 0.223, + "step": 18130 + }, + { + "epoch": 0.8609884631818829, + "grad_norm": 0.50390625, + "learning_rate": 7.336045579069525e-06, + "loss": 0.2169, + "step": 18135 + }, + { + "epoch": 0.8612258462707116, + "grad_norm": 0.4375, + "learning_rate": 7.328214507228546e-06, + "loss": 0.2221, + "step": 18140 + }, + { + "epoch": 0.8614632293595404, + "grad_norm": 0.431640625, + "learning_rate": 7.320395867167243e-06, + "loss": 0.2219, + "step": 18145 + }, + { + "epoch": 0.8617006124483692, + "grad_norm": 0.44140625, + "learning_rate": 7.312589663704206e-06, + "loss": 0.2231, + "step": 18150 + }, + { + "epoch": 0.861937995537198, + "grad_norm": 0.447265625, + "learning_rate": 7.304795901650357e-06, + "loss": 0.2243, + "step": 18155 + }, + { + "epoch": 0.8621753786260267, + "grad_norm": 0.419921875, + "learning_rate": 7.297014585808959e-06, + "loss": 0.225, + "step": 18160 + }, + { + "epoch": 0.8624127617148555, + "grad_norm": 0.392578125, + "learning_rate": 7.289245720975606e-06, + "loss": 0.2229, + "step": 18165 + }, + { + "epoch": 0.8626501448036842, + "grad_norm": 0.5, + "learning_rate": 7.281489311938208e-06, + "loss": 0.2268, + "step": 18170 + }, + { + "epoch": 0.8628875278925129, + "grad_norm": 0.4453125, + "learning_rate": 7.273745363477011e-06, + "loss": 0.2248, + "step": 18175 + }, + { + "epoch": 0.8631249109813417, + "grad_norm": 0.419921875, + "learning_rate": 7.266013880364568e-06, + "loss": 0.2206, + "step": 18180 + }, + { + "epoch": 0.8633622940701704, + "grad_norm": 0.486328125, + "learning_rate": 7.258294867365757e-06, + "loss": 0.2201, + "step": 18185 + }, + { + "epoch": 0.8635996771589992, + "grad_norm": 0.46875, + "learning_rate": 7.250588329237777e-06, + "loss": 0.2224, + "step": 18190 + }, + { + "epoch": 0.8638370602478279, + "grad_norm": 0.45703125, + "learning_rate": 7.24289427073013e-06, + "loss": 0.2262, + "step": 18195 + }, + { + "epoch": 0.8640744433366567, + "grad_norm": 0.4375, + "learning_rate": 7.23521269658463e-06, + "loss": 0.2216, + "step": 18200 + }, + { + "epoch": 0.8643118264254854, + "grad_norm": 0.42578125, + "learning_rate": 7.227543611535388e-06, + "loss": 0.2233, + "step": 18205 + }, + { + "epoch": 0.8645492095143142, + "grad_norm": 0.44140625, + "learning_rate": 7.219887020308838e-06, + "loss": 0.2226, + "step": 18210 + }, + { + "epoch": 0.864786592603143, + "grad_norm": 0.462890625, + "learning_rate": 7.212242927623701e-06, + "loss": 0.2254, + "step": 18215 + }, + { + "epoch": 0.8650239756919718, + "grad_norm": 0.451171875, + "learning_rate": 7.204611338190984e-06, + "loss": 0.2259, + "step": 18220 + }, + { + "epoch": 0.8652613587808005, + "grad_norm": 0.451171875, + "learning_rate": 7.196992256714015e-06, + "loss": 0.2208, + "step": 18225 + }, + { + "epoch": 0.8654987418696292, + "grad_norm": 0.44140625, + "learning_rate": 7.189385687888395e-06, + "loss": 0.2219, + "step": 18230 + }, + { + "epoch": 0.865736124958458, + "grad_norm": 0.3984375, + "learning_rate": 7.181791636402013e-06, + "loss": 0.2191, + "step": 18235 + }, + { + "epoch": 0.8659735080472867, + "grad_norm": 0.455078125, + "learning_rate": 7.174210106935051e-06, + "loss": 0.2225, + "step": 18240 + }, + { + "epoch": 0.8662108911361155, + "grad_norm": 0.451171875, + "learning_rate": 7.166641104159977e-06, + "loss": 0.2213, + "step": 18245 + }, + { + "epoch": 0.8664482742249442, + "grad_norm": 0.51171875, + "learning_rate": 7.1590846327415265e-06, + "loss": 0.2253, + "step": 18250 + }, + { + "epoch": 0.866685657313773, + "grad_norm": 0.41796875, + "learning_rate": 7.15154069733672e-06, + "loss": 0.2252, + "step": 18255 + }, + { + "epoch": 0.8669230404026017, + "grad_norm": 0.43359375, + "learning_rate": 7.144009302594848e-06, + "loss": 0.2231, + "step": 18260 + }, + { + "epoch": 0.8671604234914305, + "grad_norm": 0.443359375, + "learning_rate": 7.13649045315748e-06, + "loss": 0.227, + "step": 18265 + }, + { + "epoch": 0.8673978065802592, + "grad_norm": 0.408203125, + "learning_rate": 7.128984153658448e-06, + "loss": 0.2229, + "step": 18270 + }, + { + "epoch": 0.8676351896690879, + "grad_norm": 0.51953125, + "learning_rate": 7.121490408723847e-06, + "loss": 0.2244, + "step": 18275 + }, + { + "epoch": 0.8678725727579167, + "grad_norm": 0.376953125, + "learning_rate": 7.11400922297204e-06, + "loss": 0.2229, + "step": 18280 + }, + { + "epoch": 0.8681099558467454, + "grad_norm": 0.40234375, + "learning_rate": 7.106540601013644e-06, + "loss": 0.2253, + "step": 18285 + }, + { + "epoch": 0.8683473389355743, + "grad_norm": 0.466796875, + "learning_rate": 7.0990845474515426e-06, + "loss": 0.221, + "step": 18290 + }, + { + "epoch": 0.868584722024403, + "grad_norm": 0.400390625, + "learning_rate": 7.091641066880867e-06, + "loss": 0.2182, + "step": 18295 + }, + { + "epoch": 0.8688221051132318, + "grad_norm": 0.40625, + "learning_rate": 7.084210163888999e-06, + "loss": 0.2235, + "step": 18300 + }, + { + "epoch": 0.8690594882020605, + "grad_norm": 0.416015625, + "learning_rate": 7.076791843055562e-06, + "loss": 0.2222, + "step": 18305 + }, + { + "epoch": 0.8692968712908893, + "grad_norm": 0.46875, + "learning_rate": 7.069386108952448e-06, + "loss": 0.2239, + "step": 18310 + }, + { + "epoch": 0.869534254379718, + "grad_norm": 0.404296875, + "learning_rate": 7.061992966143775e-06, + "loss": 0.2234, + "step": 18315 + }, + { + "epoch": 0.8697716374685467, + "grad_norm": 0.396484375, + "learning_rate": 7.054612419185891e-06, + "loss": 0.2275, + "step": 18320 + }, + { + "epoch": 0.8700090205573755, + "grad_norm": 0.412109375, + "learning_rate": 7.047244472627404e-06, + "loss": 0.2264, + "step": 18325 + }, + { + "epoch": 0.8702464036462042, + "grad_norm": 0.455078125, + "learning_rate": 7.039889131009141e-06, + "loss": 0.2227, + "step": 18330 + }, + { + "epoch": 0.870483786735033, + "grad_norm": 0.4921875, + "learning_rate": 7.032546398864174e-06, + "loss": 0.2241, + "step": 18335 + }, + { + "epoch": 0.8707211698238617, + "grad_norm": 0.49609375, + "learning_rate": 7.025216280717783e-06, + "loss": 0.2264, + "step": 18340 + }, + { + "epoch": 0.8709585529126905, + "grad_norm": 0.400390625, + "learning_rate": 7.017898781087495e-06, + "loss": 0.2267, + "step": 18345 + }, + { + "epoch": 0.8711959360015192, + "grad_norm": 0.447265625, + "learning_rate": 7.010593904483047e-06, + "loss": 0.2262, + "step": 18350 + }, + { + "epoch": 0.871433319090348, + "grad_norm": 0.515625, + "learning_rate": 7.003301655406409e-06, + "loss": 0.2203, + "step": 18355 + }, + { + "epoch": 0.8716707021791767, + "grad_norm": 0.46484375, + "learning_rate": 6.996022038351756e-06, + "loss": 0.2222, + "step": 18360 + }, + { + "epoch": 0.8719080852680056, + "grad_norm": 0.4453125, + "learning_rate": 6.9887550578054765e-06, + "loss": 0.2195, + "step": 18365 + }, + { + "epoch": 0.8721454683568343, + "grad_norm": 0.400390625, + "learning_rate": 6.981500718246187e-06, + "loss": 0.2211, + "step": 18370 + }, + { + "epoch": 0.872382851445663, + "grad_norm": 0.451171875, + "learning_rate": 6.974259024144701e-06, + "loss": 0.2234, + "step": 18375 + }, + { + "epoch": 0.8726202345344918, + "grad_norm": 0.421875, + "learning_rate": 6.967029979964043e-06, + "loss": 0.2237, + "step": 18380 + }, + { + "epoch": 0.8728576176233205, + "grad_norm": 0.423828125, + "learning_rate": 6.959813590159434e-06, + "loss": 0.2225, + "step": 18385 + }, + { + "epoch": 0.8730950007121493, + "grad_norm": 0.41015625, + "learning_rate": 6.952609859178309e-06, + "loss": 0.2243, + "step": 18390 + }, + { + "epoch": 0.873332383800978, + "grad_norm": 0.40625, + "learning_rate": 6.9454187914602935e-06, + "loss": 0.2248, + "step": 18395 + }, + { + "epoch": 0.8735697668898068, + "grad_norm": 0.455078125, + "learning_rate": 6.938240391437209e-06, + "loss": 0.2258, + "step": 18400 + }, + { + "epoch": 0.8738071499786355, + "grad_norm": 0.443359375, + "learning_rate": 6.931074663533065e-06, + "loss": 0.2266, + "step": 18405 + }, + { + "epoch": 0.8740445330674643, + "grad_norm": 0.45703125, + "learning_rate": 6.9239216121640745e-06, + "loss": 0.2243, + "step": 18410 + }, + { + "epoch": 0.874281916156293, + "grad_norm": 0.458984375, + "learning_rate": 6.916781241738632e-06, + "loss": 0.2212, + "step": 18415 + }, + { + "epoch": 0.8745192992451217, + "grad_norm": 0.455078125, + "learning_rate": 6.90965355665731e-06, + "loss": 0.2263, + "step": 18420 + }, + { + "epoch": 0.8747566823339505, + "grad_norm": 0.427734375, + "learning_rate": 6.902538561312873e-06, + "loss": 0.2188, + "step": 18425 + }, + { + "epoch": 0.8749940654227792, + "grad_norm": 0.41015625, + "learning_rate": 6.89543626009025e-06, + "loss": 0.2248, + "step": 18430 + }, + { + "epoch": 0.8752314485116081, + "grad_norm": 0.4453125, + "learning_rate": 6.8883466573665695e-06, + "loss": 0.2228, + "step": 18435 + }, + { + "epoch": 0.8754688316004368, + "grad_norm": 0.423828125, + "learning_rate": 6.8812697575111174e-06, + "loss": 0.2249, + "step": 18440 + }, + { + "epoch": 0.8757062146892656, + "grad_norm": 0.40234375, + "learning_rate": 6.8742055648853515e-06, + "loss": 0.2261, + "step": 18445 + }, + { + "epoch": 0.8759435977780943, + "grad_norm": 0.400390625, + "learning_rate": 6.867154083842902e-06, + "loss": 0.223, + "step": 18450 + }, + { + "epoch": 0.8761809808669231, + "grad_norm": 0.423828125, + "learning_rate": 6.860115318729568e-06, + "loss": 0.2245, + "step": 18455 + }, + { + "epoch": 0.8764183639557518, + "grad_norm": 0.423828125, + "learning_rate": 6.853089273883313e-06, + "loss": 0.2215, + "step": 18460 + }, + { + "epoch": 0.8766557470445805, + "grad_norm": 0.39453125, + "learning_rate": 6.846075953634241e-06, + "loss": 0.2226, + "step": 18465 + }, + { + "epoch": 0.8768931301334093, + "grad_norm": 0.4453125, + "learning_rate": 6.839075362304645e-06, + "loss": 0.2261, + "step": 18470 + }, + { + "epoch": 0.877130513222238, + "grad_norm": 0.443359375, + "learning_rate": 6.832087504208947e-06, + "loss": 0.2208, + "step": 18475 + }, + { + "epoch": 0.8773678963110668, + "grad_norm": 0.384765625, + "learning_rate": 6.825112383653742e-06, + "loss": 0.2222, + "step": 18480 + }, + { + "epoch": 0.8776052793998955, + "grad_norm": 0.412109375, + "learning_rate": 6.818150004937754e-06, + "loss": 0.2219, + "step": 18485 + }, + { + "epoch": 0.8778426624887243, + "grad_norm": 0.431640625, + "learning_rate": 6.8112003723518755e-06, + "loss": 0.2274, + "step": 18490 + }, + { + "epoch": 0.878080045577553, + "grad_norm": 0.4296875, + "learning_rate": 6.804263490179122e-06, + "loss": 0.2233, + "step": 18495 + }, + { + "epoch": 0.8783174286663818, + "grad_norm": 0.416015625, + "learning_rate": 6.797339362694678e-06, + "loss": 0.2222, + "step": 18500 + }, + { + "epoch": 0.8785548117552106, + "grad_norm": 0.435546875, + "learning_rate": 6.790427994165838e-06, + "loss": 0.2249, + "step": 18505 + }, + { + "epoch": 0.8787921948440394, + "grad_norm": 0.478515625, + "learning_rate": 6.783529388852053e-06, + "loss": 0.2243, + "step": 18510 + }, + { + "epoch": 0.8790295779328681, + "grad_norm": 0.443359375, + "learning_rate": 6.776643551004903e-06, + "loss": 0.2227, + "step": 18515 + }, + { + "epoch": 0.8792669610216968, + "grad_norm": 0.44140625, + "learning_rate": 6.769770484868099e-06, + "loss": 0.2279, + "step": 18520 + }, + { + "epoch": 0.8795043441105256, + "grad_norm": 0.47265625, + "learning_rate": 6.762910194677474e-06, + "loss": 0.2223, + "step": 18525 + }, + { + "epoch": 0.8797417271993543, + "grad_norm": 0.40234375, + "learning_rate": 6.756062684660997e-06, + "loss": 0.2219, + "step": 18530 + }, + { + "epoch": 0.8799791102881831, + "grad_norm": 0.412109375, + "learning_rate": 6.749227959038759e-06, + "loss": 0.2241, + "step": 18535 + }, + { + "epoch": 0.8802164933770118, + "grad_norm": 0.4140625, + "learning_rate": 6.742406022022971e-06, + "loss": 0.2208, + "step": 18540 + }, + { + "epoch": 0.8804538764658406, + "grad_norm": 0.53515625, + "learning_rate": 6.735596877817959e-06, + "loss": 0.2217, + "step": 18545 + }, + { + "epoch": 0.8806912595546693, + "grad_norm": 0.412109375, + "learning_rate": 6.7288005306201694e-06, + "loss": 0.2231, + "step": 18550 + }, + { + "epoch": 0.8809286426434981, + "grad_norm": 0.412109375, + "learning_rate": 6.722016984618159e-06, + "loss": 0.2245, + "step": 18555 + }, + { + "epoch": 0.8811660257323268, + "grad_norm": 0.421875, + "learning_rate": 6.715246243992597e-06, + "loss": 0.2182, + "step": 18560 + }, + { + "epoch": 0.8814034088211555, + "grad_norm": 0.400390625, + "learning_rate": 6.708488312916261e-06, + "loss": 0.2239, + "step": 18565 + }, + { + "epoch": 0.8816407919099843, + "grad_norm": 0.466796875, + "learning_rate": 6.701743195554032e-06, + "loss": 0.2233, + "step": 18570 + }, + { + "epoch": 0.881878174998813, + "grad_norm": 0.416015625, + "learning_rate": 6.6950108960628905e-06, + "loss": 0.2211, + "step": 18575 + }, + { + "epoch": 0.8821155580876419, + "grad_norm": 0.4921875, + "learning_rate": 6.688291418591933e-06, + "loss": 0.2228, + "step": 18580 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.4375, + "learning_rate": 6.6815847672823364e-06, + "loss": 0.2258, + "step": 18585 + }, + { + "epoch": 0.8825903242652994, + "grad_norm": 0.451171875, + "learning_rate": 6.674890946267379e-06, + "loss": 0.2224, + "step": 18590 + }, + { + "epoch": 0.8828277073541281, + "grad_norm": 0.46875, + "learning_rate": 6.668209959672428e-06, + "loss": 0.2218, + "step": 18595 + }, + { + "epoch": 0.8830650904429569, + "grad_norm": 0.453125, + "learning_rate": 6.661541811614955e-06, + "loss": 0.2212, + "step": 18600 + }, + { + "epoch": 0.8833024735317856, + "grad_norm": 0.4453125, + "learning_rate": 6.654886506204508e-06, + "loss": 0.2242, + "step": 18605 + }, + { + "epoch": 0.8835398566206143, + "grad_norm": 0.40234375, + "learning_rate": 6.6482440475427105e-06, + "loss": 0.2225, + "step": 18610 + }, + { + "epoch": 0.8837772397094431, + "grad_norm": 0.412109375, + "learning_rate": 6.641614439723291e-06, + "loss": 0.2249, + "step": 18615 + }, + { + "epoch": 0.8840146227982718, + "grad_norm": 0.439453125, + "learning_rate": 6.634997686832038e-06, + "loss": 0.2249, + "step": 18620 + }, + { + "epoch": 0.8842520058871006, + "grad_norm": 0.458984375, + "learning_rate": 6.628393792946838e-06, + "loss": 0.2219, + "step": 18625 + }, + { + "epoch": 0.8844893889759293, + "grad_norm": 0.5078125, + "learning_rate": 6.6218027621376246e-06, + "loss": 0.2291, + "step": 18630 + }, + { + "epoch": 0.8847267720647581, + "grad_norm": 0.466796875, + "learning_rate": 6.615224598466437e-06, + "loss": 0.2198, + "step": 18635 + }, + { + "epoch": 0.8849641551535868, + "grad_norm": 0.4296875, + "learning_rate": 6.608659305987355e-06, + "loss": 0.223, + "step": 18640 + }, + { + "epoch": 0.8852015382424157, + "grad_norm": 0.408203125, + "learning_rate": 6.602106888746548e-06, + "loss": 0.22, + "step": 18645 + }, + { + "epoch": 0.8854389213312444, + "grad_norm": 0.4375, + "learning_rate": 6.595567350782231e-06, + "loss": 0.2219, + "step": 18650 + }, + { + "epoch": 0.8856763044200732, + "grad_norm": 0.451171875, + "learning_rate": 6.589040696124696e-06, + "loss": 0.2199, + "step": 18655 + }, + { + "epoch": 0.8859136875089019, + "grad_norm": 0.48046875, + "learning_rate": 6.582526928796297e-06, + "loss": 0.2263, + "step": 18660 + }, + { + "epoch": 0.8861510705977306, + "grad_norm": 0.40234375, + "learning_rate": 6.576026052811429e-06, + "loss": 0.2263, + "step": 18665 + }, + { + "epoch": 0.8863884536865594, + "grad_norm": 0.4453125, + "learning_rate": 6.569538072176558e-06, + "loss": 0.2227, + "step": 18670 + }, + { + "epoch": 0.8866258367753881, + "grad_norm": 0.474609375, + "learning_rate": 6.563062990890192e-06, + "loss": 0.2238, + "step": 18675 + }, + { + "epoch": 0.8868632198642169, + "grad_norm": 0.416015625, + "learning_rate": 6.556600812942902e-06, + "loss": 0.2197, + "step": 18680 + }, + { + "epoch": 0.8871006029530456, + "grad_norm": 0.41796875, + "learning_rate": 6.550151542317295e-06, + "loss": 0.2228, + "step": 18685 + }, + { + "epoch": 0.8873379860418744, + "grad_norm": 0.447265625, + "learning_rate": 6.543715182988027e-06, + "loss": 0.2246, + "step": 18690 + }, + { + "epoch": 0.8875753691307031, + "grad_norm": 0.40625, + "learning_rate": 6.537291738921792e-06, + "loss": 0.2232, + "step": 18695 + }, + { + "epoch": 0.8878127522195319, + "grad_norm": 0.4296875, + "learning_rate": 6.530881214077338e-06, + "loss": 0.2205, + "step": 18700 + }, + { + "epoch": 0.8880501353083606, + "grad_norm": 0.44140625, + "learning_rate": 6.524483612405439e-06, + "loss": 0.2218, + "step": 18705 + }, + { + "epoch": 0.8882875183971893, + "grad_norm": 0.427734375, + "learning_rate": 6.518098937848907e-06, + "loss": 0.2256, + "step": 18710 + }, + { + "epoch": 0.8885249014860181, + "grad_norm": 0.3671875, + "learning_rate": 6.511727194342588e-06, + "loss": 0.2223, + "step": 18715 + }, + { + "epoch": 0.8887622845748469, + "grad_norm": 0.41015625, + "learning_rate": 6.5053683858133545e-06, + "loss": 0.2213, + "step": 18720 + }, + { + "epoch": 0.8889996676636757, + "grad_norm": 0.392578125, + "learning_rate": 6.499022516180124e-06, + "loss": 0.2225, + "step": 18725 + }, + { + "epoch": 0.8892370507525044, + "grad_norm": 0.384765625, + "learning_rate": 6.492689589353809e-06, + "loss": 0.2186, + "step": 18730 + }, + { + "epoch": 0.8894744338413332, + "grad_norm": 0.41796875, + "learning_rate": 6.486369609237379e-06, + "loss": 0.2218, + "step": 18735 + }, + { + "epoch": 0.8897118169301619, + "grad_norm": 0.4609375, + "learning_rate": 6.4800625797257996e-06, + "loss": 0.2173, + "step": 18740 + }, + { + "epoch": 0.8899492000189907, + "grad_norm": 0.4921875, + "learning_rate": 6.473768504706075e-06, + "loss": 0.2282, + "step": 18745 + }, + { + "epoch": 0.8901865831078194, + "grad_norm": 0.453125, + "learning_rate": 6.467487388057202e-06, + "loss": 0.224, + "step": 18750 + }, + { + "epoch": 0.8904239661966481, + "grad_norm": 0.50390625, + "learning_rate": 6.46121923365021e-06, + "loss": 0.2274, + "step": 18755 + }, + { + "epoch": 0.8906613492854769, + "grad_norm": 0.447265625, + "learning_rate": 6.45496404534814e-06, + "loss": 0.2205, + "step": 18760 + }, + { + "epoch": 0.8908987323743056, + "grad_norm": 0.431640625, + "learning_rate": 6.448721827006027e-06, + "loss": 0.2258, + "step": 18765 + }, + { + "epoch": 0.8911361154631344, + "grad_norm": 0.423828125, + "learning_rate": 6.4424925824709225e-06, + "loss": 0.2249, + "step": 18770 + }, + { + "epoch": 0.8913734985519631, + "grad_norm": 0.41796875, + "learning_rate": 6.436276315581884e-06, + "loss": 0.2256, + "step": 18775 + }, + { + "epoch": 0.8916108816407919, + "grad_norm": 0.44921875, + "learning_rate": 6.430073030169973e-06, + "loss": 0.218, + "step": 18780 + }, + { + "epoch": 0.8918482647296206, + "grad_norm": 0.4296875, + "learning_rate": 6.42388273005824e-06, + "loss": 0.2274, + "step": 18785 + }, + { + "epoch": 0.8920856478184495, + "grad_norm": 0.416015625, + "learning_rate": 6.41770541906174e-06, + "loss": 0.2234, + "step": 18790 + }, + { + "epoch": 0.8923230309072782, + "grad_norm": 0.498046875, + "learning_rate": 6.411541100987521e-06, + "loss": 0.2253, + "step": 18795 + }, + { + "epoch": 0.892560413996107, + "grad_norm": 0.435546875, + "learning_rate": 6.405389779634625e-06, + "loss": 0.222, + "step": 18800 + }, + { + "epoch": 0.8927977970849357, + "grad_norm": 0.421875, + "learning_rate": 6.399251458794088e-06, + "loss": 0.2224, + "step": 18805 + }, + { + "epoch": 0.8930351801737644, + "grad_norm": 0.47265625, + "learning_rate": 6.393126142248924e-06, + "loss": 0.2225, + "step": 18810 + }, + { + "epoch": 0.8932725632625932, + "grad_norm": 0.5, + "learning_rate": 6.387013833774139e-06, + "loss": 0.225, + "step": 18815 + }, + { + "epoch": 0.8935099463514219, + "grad_norm": 0.416015625, + "learning_rate": 6.380914537136716e-06, + "loss": 0.2234, + "step": 18820 + }, + { + "epoch": 0.8937473294402507, + "grad_norm": 0.431640625, + "learning_rate": 6.374828256095633e-06, + "loss": 0.2244, + "step": 18825 + }, + { + "epoch": 0.8939847125290794, + "grad_norm": 0.50390625, + "learning_rate": 6.368754994401832e-06, + "loss": 0.2236, + "step": 18830 + }, + { + "epoch": 0.8942220956179082, + "grad_norm": 0.412109375, + "learning_rate": 6.36269475579824e-06, + "loss": 0.2203, + "step": 18835 + }, + { + "epoch": 0.8944594787067369, + "grad_norm": 0.50390625, + "learning_rate": 6.356647544019746e-06, + "loss": 0.2239, + "step": 18840 + }, + { + "epoch": 0.8946968617955657, + "grad_norm": 0.458984375, + "learning_rate": 6.3506133627932275e-06, + "loss": 0.2246, + "step": 18845 + }, + { + "epoch": 0.8949342448843944, + "grad_norm": 0.412109375, + "learning_rate": 6.344592215837526e-06, + "loss": 0.2246, + "step": 18850 + }, + { + "epoch": 0.8951716279732231, + "grad_norm": 0.41796875, + "learning_rate": 6.338584106863435e-06, + "loss": 0.2213, + "step": 18855 + }, + { + "epoch": 0.895409011062052, + "grad_norm": 0.458984375, + "learning_rate": 6.332589039573733e-06, + "loss": 0.2269, + "step": 18860 + }, + { + "epoch": 0.8956463941508807, + "grad_norm": 0.423828125, + "learning_rate": 6.326607017663148e-06, + "loss": 0.2252, + "step": 18865 + }, + { + "epoch": 0.8958837772397095, + "grad_norm": 0.431640625, + "learning_rate": 6.320638044818385e-06, + "loss": 0.2217, + "step": 18870 + }, + { + "epoch": 0.8961211603285382, + "grad_norm": 0.5859375, + "learning_rate": 6.314682124718075e-06, + "loss": 0.223, + "step": 18875 + }, + { + "epoch": 0.896358543417367, + "grad_norm": 0.46484375, + "learning_rate": 6.308739261032841e-06, + "loss": 0.2208, + "step": 18880 + }, + { + "epoch": 0.8965959265061957, + "grad_norm": 0.49609375, + "learning_rate": 6.3028094574252325e-06, + "loss": 0.2227, + "step": 18885 + }, + { + "epoch": 0.8968333095950245, + "grad_norm": 0.404296875, + "learning_rate": 6.296892717549774e-06, + "loss": 0.2219, + "step": 18890 + }, + { + "epoch": 0.8970706926838532, + "grad_norm": 0.4296875, + "learning_rate": 6.290989045052909e-06, + "loss": 0.2249, + "step": 18895 + }, + { + "epoch": 0.8973080757726819, + "grad_norm": 0.4609375, + "learning_rate": 6.2850984435730505e-06, + "loss": 0.2238, + "step": 18900 + }, + { + "epoch": 0.8975454588615107, + "grad_norm": 0.45703125, + "learning_rate": 6.279220916740555e-06, + "loss": 0.2247, + "step": 18905 + }, + { + "epoch": 0.8977828419503394, + "grad_norm": 0.515625, + "learning_rate": 6.2733564681777075e-06, + "loss": 0.226, + "step": 18910 + }, + { + "epoch": 0.8980202250391682, + "grad_norm": 0.482421875, + "learning_rate": 6.267505101498747e-06, + "loss": 0.2233, + "step": 18915 + }, + { + "epoch": 0.8982576081279969, + "grad_norm": 0.44140625, + "learning_rate": 6.261666820309837e-06, + "loss": 0.2209, + "step": 18920 + }, + { + "epoch": 0.8984949912168257, + "grad_norm": 0.431640625, + "learning_rate": 6.255841628209092e-06, + "loss": 0.2252, + "step": 18925 + }, + { + "epoch": 0.8987323743056544, + "grad_norm": 0.419921875, + "learning_rate": 6.250029528786547e-06, + "loss": 0.2252, + "step": 18930 + }, + { + "epoch": 0.8989697573944833, + "grad_norm": 0.43359375, + "learning_rate": 6.244230525624174e-06, + "loss": 0.2208, + "step": 18935 + }, + { + "epoch": 0.899207140483312, + "grad_norm": 0.421875, + "learning_rate": 6.238444622295868e-06, + "loss": 0.2273, + "step": 18940 + }, + { + "epoch": 0.8994445235721408, + "grad_norm": 0.44921875, + "learning_rate": 6.232671822367461e-06, + "loss": 0.2242, + "step": 18945 + }, + { + "epoch": 0.8996819066609695, + "grad_norm": 0.44140625, + "learning_rate": 6.226912129396699e-06, + "loss": 0.2216, + "step": 18950 + }, + { + "epoch": 0.8999192897497982, + "grad_norm": 0.443359375, + "learning_rate": 6.221165546933258e-06, + "loss": 0.2246, + "step": 18955 + }, + { + "epoch": 0.900156672838627, + "grad_norm": 0.38671875, + "learning_rate": 6.2154320785187276e-06, + "loss": 0.2207, + "step": 18960 + }, + { + "epoch": 0.9003940559274557, + "grad_norm": 0.42578125, + "learning_rate": 6.209711727686618e-06, + "loss": 0.2226, + "step": 18965 + }, + { + "epoch": 0.9006314390162845, + "grad_norm": 0.451171875, + "learning_rate": 6.20400449796236e-06, + "loss": 0.2224, + "step": 18970 + }, + { + "epoch": 0.9008688221051132, + "grad_norm": 0.419921875, + "learning_rate": 6.1983103928632875e-06, + "loss": 0.2226, + "step": 18975 + }, + { + "epoch": 0.901106205193942, + "grad_norm": 0.455078125, + "learning_rate": 6.192629415898657e-06, + "loss": 0.2216, + "step": 18980 + }, + { + "epoch": 0.9013435882827707, + "grad_norm": 0.50390625, + "learning_rate": 6.18696157056962e-06, + "loss": 0.2259, + "step": 18985 + }, + { + "epoch": 0.9015809713715995, + "grad_norm": 0.490234375, + "learning_rate": 6.181306860369256e-06, + "loss": 0.2223, + "step": 18990 + }, + { + "epoch": 0.9018183544604282, + "grad_norm": 0.44921875, + "learning_rate": 6.175665288782531e-06, + "loss": 0.2228, + "step": 18995 + }, + { + "epoch": 0.9020557375492569, + "grad_norm": 0.451171875, + "learning_rate": 6.170036859286317e-06, + "loss": 0.2227, + "step": 19000 + }, + { + "epoch": 0.9022931206380858, + "grad_norm": 0.44140625, + "learning_rate": 6.164421575349397e-06, + "loss": 0.2216, + "step": 19005 + }, + { + "epoch": 0.9025305037269145, + "grad_norm": 0.46484375, + "learning_rate": 6.1588194404324345e-06, + "loss": 0.2258, + "step": 19010 + }, + { + "epoch": 0.9027678868157433, + "grad_norm": 0.4375, + "learning_rate": 6.153230457988011e-06, + "loss": 0.2214, + "step": 19015 + }, + { + "epoch": 0.903005269904572, + "grad_norm": 0.416015625, + "learning_rate": 6.14765463146058e-06, + "loss": 0.2227, + "step": 19020 + }, + { + "epoch": 0.9032426529934008, + "grad_norm": 0.447265625, + "learning_rate": 6.142091964286509e-06, + "loss": 0.2238, + "step": 19025 + }, + { + "epoch": 0.9034800360822295, + "grad_norm": 0.4296875, + "learning_rate": 6.136542459894033e-06, + "loss": 0.2245, + "step": 19030 + }, + { + "epoch": 0.9037174191710583, + "grad_norm": 0.53125, + "learning_rate": 6.131006121703301e-06, + "loss": 0.225, + "step": 19035 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 0.43359375, + "learning_rate": 6.1254829531263175e-06, + "loss": 0.2252, + "step": 19040 + }, + { + "epoch": 0.9041921853487157, + "grad_norm": 0.4296875, + "learning_rate": 6.119972957566996e-06, + "loss": 0.2241, + "step": 19045 + }, + { + "epoch": 0.9044295684375445, + "grad_norm": 0.384765625, + "learning_rate": 6.11447613842112e-06, + "loss": 0.2229, + "step": 19050 + }, + { + "epoch": 0.9046669515263732, + "grad_norm": 0.478515625, + "learning_rate": 6.108992499076352e-06, + "loss": 0.2209, + "step": 19055 + }, + { + "epoch": 0.904904334615202, + "grad_norm": 0.392578125, + "learning_rate": 6.103522042912236e-06, + "loss": 0.2233, + "step": 19060 + }, + { + "epoch": 0.9051417177040307, + "grad_norm": 0.421875, + "learning_rate": 6.098064773300183e-06, + "loss": 0.2221, + "step": 19065 + }, + { + "epoch": 0.9053791007928595, + "grad_norm": 0.466796875, + "learning_rate": 6.092620693603495e-06, + "loss": 0.2232, + "step": 19070 + }, + { + "epoch": 0.9056164838816883, + "grad_norm": 0.421875, + "learning_rate": 6.087189807177325e-06, + "loss": 0.221, + "step": 19075 + }, + { + "epoch": 0.9058538669705171, + "grad_norm": 0.435546875, + "learning_rate": 6.081772117368707e-06, + "loss": 0.2198, + "step": 19080 + }, + { + "epoch": 0.9060912500593458, + "grad_norm": 0.400390625, + "learning_rate": 6.076367627516534e-06, + "loss": 0.2224, + "step": 19085 + }, + { + "epoch": 0.9063286331481746, + "grad_norm": 0.453125, + "learning_rate": 6.070976340951577e-06, + "loss": 0.2235, + "step": 19090 + }, + { + "epoch": 0.9065660162370033, + "grad_norm": 0.447265625, + "learning_rate": 6.065598260996454e-06, + "loss": 0.2259, + "step": 19095 + }, + { + "epoch": 0.906803399325832, + "grad_norm": 0.421875, + "learning_rate": 6.060233390965654e-06, + "loss": 0.2233, + "step": 19100 + }, + { + "epoch": 0.9070407824146608, + "grad_norm": 0.4921875, + "learning_rate": 6.0548817341655215e-06, + "loss": 0.2229, + "step": 19105 + }, + { + "epoch": 0.9072781655034895, + "grad_norm": 0.41015625, + "learning_rate": 6.049543293894259e-06, + "loss": 0.2225, + "step": 19110 + }, + { + "epoch": 0.9075155485923183, + "grad_norm": 0.494140625, + "learning_rate": 6.044218073441922e-06, + "loss": 0.2268, + "step": 19115 + }, + { + "epoch": 0.907752931681147, + "grad_norm": 0.515625, + "learning_rate": 6.03890607609042e-06, + "loss": 0.2222, + "step": 19120 + }, + { + "epoch": 0.9079903147699758, + "grad_norm": 0.470703125, + "learning_rate": 6.033607305113514e-06, + "loss": 0.2247, + "step": 19125 + }, + { + "epoch": 0.9082276978588045, + "grad_norm": 0.474609375, + "learning_rate": 6.028321763776809e-06, + "loss": 0.2204, + "step": 19130 + }, + { + "epoch": 0.9084650809476333, + "grad_norm": 0.384765625, + "learning_rate": 6.0230494553377655e-06, + "loss": 0.2245, + "step": 19135 + }, + { + "epoch": 0.908702464036462, + "grad_norm": 0.3828125, + "learning_rate": 6.017790383045676e-06, + "loss": 0.2216, + "step": 19140 + }, + { + "epoch": 0.9089398471252907, + "grad_norm": 0.400390625, + "learning_rate": 6.01254455014169e-06, + "loss": 0.2266, + "step": 19145 + }, + { + "epoch": 0.9091772302141196, + "grad_norm": 0.44921875, + "learning_rate": 6.007311959858789e-06, + "loss": 0.2264, + "step": 19150 + }, + { + "epoch": 0.9094146133029483, + "grad_norm": 0.439453125, + "learning_rate": 6.002092615421789e-06, + "loss": 0.2246, + "step": 19155 + }, + { + "epoch": 0.9096519963917771, + "grad_norm": 0.416015625, + "learning_rate": 5.9968865200473554e-06, + "loss": 0.2189, + "step": 19160 + }, + { + "epoch": 0.9098893794806058, + "grad_norm": 0.439453125, + "learning_rate": 5.991693676943974e-06, + "loss": 0.2215, + "step": 19165 + }, + { + "epoch": 0.9101267625694346, + "grad_norm": 0.392578125, + "learning_rate": 5.986514089311978e-06, + "loss": 0.2234, + "step": 19170 + }, + { + "epoch": 0.9103641456582633, + "grad_norm": 0.453125, + "learning_rate": 5.9813477603435196e-06, + "loss": 0.2205, + "step": 19175 + }, + { + "epoch": 0.9106015287470921, + "grad_norm": 0.43359375, + "learning_rate": 5.976194693222584e-06, + "loss": 0.2239, + "step": 19180 + }, + { + "epoch": 0.9108389118359208, + "grad_norm": 0.447265625, + "learning_rate": 5.97105489112498e-06, + "loss": 0.224, + "step": 19185 + }, + { + "epoch": 0.9110762949247495, + "grad_norm": 0.390625, + "learning_rate": 5.965928357218352e-06, + "loss": 0.2223, + "step": 19190 + }, + { + "epoch": 0.9113136780135783, + "grad_norm": 0.50390625, + "learning_rate": 5.960815094662152e-06, + "loss": 0.22, + "step": 19195 + }, + { + "epoch": 0.911551061102407, + "grad_norm": 0.427734375, + "learning_rate": 5.955715106607669e-06, + "loss": 0.222, + "step": 19200 + }, + { + "epoch": 0.9117884441912358, + "grad_norm": 0.421875, + "learning_rate": 5.9506283961979934e-06, + "loss": 0.2266, + "step": 19205 + }, + { + "epoch": 0.9120258272800645, + "grad_norm": 0.4453125, + "learning_rate": 5.945554966568042e-06, + "loss": 0.222, + "step": 19210 + }, + { + "epoch": 0.9122632103688934, + "grad_norm": 0.404296875, + "learning_rate": 5.940494820844555e-06, + "loss": 0.2262, + "step": 19215 + }, + { + "epoch": 0.912500593457722, + "grad_norm": 0.494140625, + "learning_rate": 5.935447962146071e-06, + "loss": 0.2256, + "step": 19220 + }, + { + "epoch": 0.9127379765465509, + "grad_norm": 0.4765625, + "learning_rate": 5.930414393582946e-06, + "loss": 0.2229, + "step": 19225 + }, + { + "epoch": 0.9129753596353796, + "grad_norm": 0.419921875, + "learning_rate": 5.925394118257345e-06, + "loss": 0.2208, + "step": 19230 + }, + { + "epoch": 0.9132127427242084, + "grad_norm": 0.400390625, + "learning_rate": 5.9203871392632444e-06, + "loss": 0.2237, + "step": 19235 + }, + { + "epoch": 0.9134501258130371, + "grad_norm": 0.427734375, + "learning_rate": 5.915393459686419e-06, + "loss": 0.221, + "step": 19240 + }, + { + "epoch": 0.9136875089018658, + "grad_norm": 0.47265625, + "learning_rate": 5.9104130826044546e-06, + "loss": 0.2234, + "step": 19245 + }, + { + "epoch": 0.9139248919906946, + "grad_norm": 0.482421875, + "learning_rate": 5.905446011086732e-06, + "loss": 0.2237, + "step": 19250 + }, + { + "epoch": 0.9141622750795233, + "grad_norm": 0.458984375, + "learning_rate": 5.9004922481944325e-06, + "loss": 0.2231, + "step": 19255 + }, + { + "epoch": 0.9143996581683521, + "grad_norm": 0.4296875, + "learning_rate": 5.8955517969805465e-06, + "loss": 0.227, + "step": 19260 + }, + { + "epoch": 0.9146370412571808, + "grad_norm": 0.3984375, + "learning_rate": 5.89062466048984e-06, + "loss": 0.2254, + "step": 19265 + }, + { + "epoch": 0.9148744243460096, + "grad_norm": 0.478515625, + "learning_rate": 5.885710841758891e-06, + "loss": 0.2252, + "step": 19270 + }, + { + "epoch": 0.9151118074348383, + "grad_norm": 0.44140625, + "learning_rate": 5.880810343816064e-06, + "loss": 0.2219, + "step": 19275 + }, + { + "epoch": 0.9153491905236671, + "grad_norm": 0.3984375, + "learning_rate": 5.875923169681516e-06, + "loss": 0.227, + "step": 19280 + }, + { + "epoch": 0.9155865736124958, + "grad_norm": 0.40234375, + "learning_rate": 5.871049322367181e-06, + "loss": 0.226, + "step": 19285 + }, + { + "epoch": 0.9158239567013245, + "grad_norm": 0.400390625, + "learning_rate": 5.866188804876801e-06, + "loss": 0.2189, + "step": 19290 + }, + { + "epoch": 0.9160613397901534, + "grad_norm": 0.42578125, + "learning_rate": 5.861341620205882e-06, + "loss": 0.2243, + "step": 19295 + }, + { + "epoch": 0.9162987228789821, + "grad_norm": 0.50390625, + "learning_rate": 5.856507771341724e-06, + "loss": 0.2259, + "step": 19300 + }, + { + "epoch": 0.9165361059678109, + "grad_norm": 0.3984375, + "learning_rate": 5.851687261263409e-06, + "loss": 0.2214, + "step": 19305 + }, + { + "epoch": 0.9167734890566396, + "grad_norm": 0.396484375, + "learning_rate": 5.846880092941789e-06, + "loss": 0.2239, + "step": 19310 + }, + { + "epoch": 0.9170108721454684, + "grad_norm": 0.435546875, + "learning_rate": 5.842086269339506e-06, + "loss": 0.2254, + "step": 19315 + }, + { + "epoch": 0.9172482552342971, + "grad_norm": 0.48046875, + "learning_rate": 5.837305793410971e-06, + "loss": 0.2233, + "step": 19320 + }, + { + "epoch": 0.9174856383231259, + "grad_norm": 0.47265625, + "learning_rate": 5.832538668102369e-06, + "loss": 0.2253, + "step": 19325 + }, + { + "epoch": 0.9177230214119546, + "grad_norm": 0.484375, + "learning_rate": 5.827784896351651e-06, + "loss": 0.2235, + "step": 19330 + }, + { + "epoch": 0.9179604045007833, + "grad_norm": 0.451171875, + "learning_rate": 5.823044481088556e-06, + "loss": 0.2219, + "step": 19335 + }, + { + "epoch": 0.9181977875896121, + "grad_norm": 0.416015625, + "learning_rate": 5.818317425234576e-06, + "loss": 0.2227, + "step": 19340 + }, + { + "epoch": 0.9184351706784408, + "grad_norm": 0.41015625, + "learning_rate": 5.813603731702971e-06, + "loss": 0.2224, + "step": 19345 + }, + { + "epoch": 0.9186725537672696, + "grad_norm": 0.466796875, + "learning_rate": 5.808903403398774e-06, + "loss": 0.2256, + "step": 19350 + }, + { + "epoch": 0.9189099368560983, + "grad_norm": 0.455078125, + "learning_rate": 5.80421644321877e-06, + "loss": 0.2204, + "step": 19355 + }, + { + "epoch": 0.9191473199449272, + "grad_norm": 0.466796875, + "learning_rate": 5.799542854051515e-06, + "loss": 0.2218, + "step": 19360 + }, + { + "epoch": 0.9193847030337559, + "grad_norm": 0.431640625, + "learning_rate": 5.79488263877732e-06, + "loss": 0.223, + "step": 19365 + }, + { + "epoch": 0.9196220861225847, + "grad_norm": 0.427734375, + "learning_rate": 5.7902358002682565e-06, + "loss": 0.2265, + "step": 19370 + }, + { + "epoch": 0.9198594692114134, + "grad_norm": 0.427734375, + "learning_rate": 5.7856023413881444e-06, + "loss": 0.2188, + "step": 19375 + }, + { + "epoch": 0.9200968523002422, + "grad_norm": 0.447265625, + "learning_rate": 5.780982264992571e-06, + "loss": 0.2238, + "step": 19380 + }, + { + "epoch": 0.9203342353890709, + "grad_norm": 0.4296875, + "learning_rate": 5.776375573928863e-06, + "loss": 0.2213, + "step": 19385 + }, + { + "epoch": 0.9205716184778996, + "grad_norm": 0.45703125, + "learning_rate": 5.771782271036107e-06, + "loss": 0.2254, + "step": 19390 + }, + { + "epoch": 0.9208090015667284, + "grad_norm": 0.421875, + "learning_rate": 5.7672023591451275e-06, + "loss": 0.2211, + "step": 19395 + }, + { + "epoch": 0.9210463846555571, + "grad_norm": 0.423828125, + "learning_rate": 5.762635841078508e-06, + "loss": 0.222, + "step": 19400 + }, + { + "epoch": 0.9212837677443859, + "grad_norm": 0.431640625, + "learning_rate": 5.758082719650579e-06, + "loss": 0.2256, + "step": 19405 + }, + { + "epoch": 0.9215211508332146, + "grad_norm": 0.4609375, + "learning_rate": 5.753542997667398e-06, + "loss": 0.2221, + "step": 19410 + }, + { + "epoch": 0.9217585339220434, + "grad_norm": 0.46484375, + "learning_rate": 5.749016677926778e-06, + "loss": 0.222, + "step": 19415 + }, + { + "epoch": 0.9219959170108721, + "grad_norm": 0.494140625, + "learning_rate": 5.744503763218272e-06, + "loss": 0.2294, + "step": 19420 + }, + { + "epoch": 0.922233300099701, + "grad_norm": 0.44140625, + "learning_rate": 5.740004256323171e-06, + "loss": 0.2222, + "step": 19425 + }, + { + "epoch": 0.9224706831885296, + "grad_norm": 0.4609375, + "learning_rate": 5.735518160014493e-06, + "loss": 0.2252, + "step": 19430 + }, + { + "epoch": 0.9227080662773584, + "grad_norm": 0.4375, + "learning_rate": 5.731045477057006e-06, + "loss": 0.2276, + "step": 19435 + }, + { + "epoch": 0.9229454493661872, + "grad_norm": 0.42578125, + "learning_rate": 5.7265862102072e-06, + "loss": 0.2211, + "step": 19440 + }, + { + "epoch": 0.9231828324550159, + "grad_norm": 0.4765625, + "learning_rate": 5.722140362213302e-06, + "loss": 0.224, + "step": 19445 + }, + { + "epoch": 0.9234202155438447, + "grad_norm": 0.478515625, + "learning_rate": 5.717707935815266e-06, + "loss": 0.2237, + "step": 19450 + }, + { + "epoch": 0.9236575986326734, + "grad_norm": 0.404296875, + "learning_rate": 5.713288933744777e-06, + "loss": 0.223, + "step": 19455 + }, + { + "epoch": 0.9238949817215022, + "grad_norm": 0.453125, + "learning_rate": 5.7088833587252504e-06, + "loss": 0.2239, + "step": 19460 + }, + { + "epoch": 0.9241323648103309, + "grad_norm": 0.400390625, + "learning_rate": 5.70449121347182e-06, + "loss": 0.2267, + "step": 19465 + }, + { + "epoch": 0.9243697478991597, + "grad_norm": 0.439453125, + "learning_rate": 5.7001125006913406e-06, + "loss": 0.2214, + "step": 19470 + }, + { + "epoch": 0.9246071309879884, + "grad_norm": 0.4375, + "learning_rate": 5.695747223082394e-06, + "loss": 0.2243, + "step": 19475 + }, + { + "epoch": 0.9248445140768171, + "grad_norm": 0.3984375, + "learning_rate": 5.691395383335288e-06, + "loss": 0.2237, + "step": 19480 + }, + { + "epoch": 0.9250818971656459, + "grad_norm": 0.443359375, + "learning_rate": 5.687056984132033e-06, + "loss": 0.2225, + "step": 19485 + }, + { + "epoch": 0.9253192802544746, + "grad_norm": 0.41015625, + "learning_rate": 5.682732028146372e-06, + "loss": 0.2258, + "step": 19490 + }, + { + "epoch": 0.9255566633433034, + "grad_norm": 0.455078125, + "learning_rate": 5.678420518043747e-06, + "loss": 0.2193, + "step": 19495 + }, + { + "epoch": 0.9257940464321321, + "grad_norm": 0.40234375, + "learning_rate": 5.67412245648133e-06, + "loss": 0.2225, + "step": 19500 + }, + { + "epoch": 0.926031429520961, + "grad_norm": 0.52734375, + "learning_rate": 5.669837846107992e-06, + "loss": 0.223, + "step": 19505 + }, + { + "epoch": 0.9262688126097897, + "grad_norm": 0.4140625, + "learning_rate": 5.665566689564322e-06, + "loss": 0.2215, + "step": 19510 + }, + { + "epoch": 0.9265061956986185, + "grad_norm": 0.52734375, + "learning_rate": 5.661308989482615e-06, + "loss": 0.2229, + "step": 19515 + }, + { + "epoch": 0.9267435787874472, + "grad_norm": 0.4765625, + "learning_rate": 5.657064748486865e-06, + "loss": 0.2282, + "step": 19520 + }, + { + "epoch": 0.926980961876276, + "grad_norm": 0.416015625, + "learning_rate": 5.652833969192789e-06, + "loss": 0.2255, + "step": 19525 + }, + { + "epoch": 0.9272183449651047, + "grad_norm": 0.423828125, + "learning_rate": 5.648616654207791e-06, + "loss": 0.2262, + "step": 19530 + }, + { + "epoch": 0.9274557280539334, + "grad_norm": 0.427734375, + "learning_rate": 5.644412806130981e-06, + "loss": 0.2251, + "step": 19535 + }, + { + "epoch": 0.9276931111427622, + "grad_norm": 0.4140625, + "learning_rate": 5.640222427553178e-06, + "loss": 0.2249, + "step": 19540 + }, + { + "epoch": 0.9279304942315909, + "grad_norm": 0.5, + "learning_rate": 5.636045521056887e-06, + "loss": 0.2257, + "step": 19545 + }, + { + "epoch": 0.9281678773204197, + "grad_norm": 0.494140625, + "learning_rate": 5.6318820892163195e-06, + "loss": 0.2245, + "step": 19550 + }, + { + "epoch": 0.9284052604092484, + "grad_norm": 0.484375, + "learning_rate": 5.6277321345973795e-06, + "loss": 0.2241, + "step": 19555 + }, + { + "epoch": 0.9286426434980772, + "grad_norm": 0.453125, + "learning_rate": 5.623595659757664e-06, + "loss": 0.2239, + "step": 19560 + }, + { + "epoch": 0.9288800265869059, + "grad_norm": 0.388671875, + "learning_rate": 5.619472667246464e-06, + "loss": 0.2239, + "step": 19565 + }, + { + "epoch": 0.9291174096757348, + "grad_norm": 0.4296875, + "learning_rate": 5.615363159604763e-06, + "loss": 0.2261, + "step": 19570 + }, + { + "epoch": 0.9293547927645635, + "grad_norm": 0.41015625, + "learning_rate": 5.611267139365227e-06, + "loss": 0.2239, + "step": 19575 + }, + { + "epoch": 0.9295921758533922, + "grad_norm": 0.53515625, + "learning_rate": 5.60718460905222e-06, + "loss": 0.2258, + "step": 19580 + }, + { + "epoch": 0.929829558942221, + "grad_norm": 0.4921875, + "learning_rate": 5.6031155711817864e-06, + "loss": 0.2269, + "step": 19585 + }, + { + "epoch": 0.9300669420310497, + "grad_norm": 0.439453125, + "learning_rate": 5.599060028261654e-06, + "loss": 0.2231, + "step": 19590 + }, + { + "epoch": 0.9303043251198785, + "grad_norm": 0.42578125, + "learning_rate": 5.5950179827912355e-06, + "loss": 0.227, + "step": 19595 + }, + { + "epoch": 0.9305417082087072, + "grad_norm": 0.4453125, + "learning_rate": 5.590989437261621e-06, + "loss": 0.2256, + "step": 19600 + }, + { + "epoch": 0.930779091297536, + "grad_norm": 0.4296875, + "learning_rate": 5.586974394155595e-06, + "loss": 0.2265, + "step": 19605 + }, + { + "epoch": 0.9310164743863647, + "grad_norm": 0.443359375, + "learning_rate": 5.5829728559475995e-06, + "loss": 0.2226, + "step": 19610 + }, + { + "epoch": 0.9312538574751935, + "grad_norm": 0.396484375, + "learning_rate": 5.578984825103772e-06, + "loss": 0.2264, + "step": 19615 + }, + { + "epoch": 0.9314912405640222, + "grad_norm": 0.384765625, + "learning_rate": 5.575010304081913e-06, + "loss": 0.2246, + "step": 19620 + }, + { + "epoch": 0.9317286236528509, + "grad_norm": 0.466796875, + "learning_rate": 5.571049295331506e-06, + "loss": 0.223, + "step": 19625 + }, + { + "epoch": 0.9319660067416797, + "grad_norm": 0.427734375, + "learning_rate": 5.567101801293697e-06, + "loss": 0.2217, + "step": 19630 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 0.462890625, + "learning_rate": 5.563167824401318e-06, + "loss": 0.2219, + "step": 19635 + }, + { + "epoch": 0.9324407729193372, + "grad_norm": 0.423828125, + "learning_rate": 5.559247367078852e-06, + "loss": 0.2214, + "step": 19640 + }, + { + "epoch": 0.932678156008166, + "grad_norm": 0.427734375, + "learning_rate": 5.555340431742457e-06, + "loss": 0.2223, + "step": 19645 + }, + { + "epoch": 0.9329155390969948, + "grad_norm": 0.427734375, + "learning_rate": 5.5514470207999724e-06, + "loss": 0.2214, + "step": 19650 + }, + { + "epoch": 0.9331529221858235, + "grad_norm": 0.5078125, + "learning_rate": 5.5475671366508765e-06, + "loss": 0.2238, + "step": 19655 + }, + { + "epoch": 0.9333903052746523, + "grad_norm": 0.43359375, + "learning_rate": 5.5437007816863315e-06, + "loss": 0.2235, + "step": 19660 + }, + { + "epoch": 0.933627688363481, + "grad_norm": 0.443359375, + "learning_rate": 5.539847958289152e-06, + "loss": 0.2194, + "step": 19665 + }, + { + "epoch": 0.9338650714523098, + "grad_norm": 0.49609375, + "learning_rate": 5.536008668833818e-06, + "loss": 0.2252, + "step": 19670 + }, + { + "epoch": 0.9341024545411385, + "grad_norm": 0.419921875, + "learning_rate": 5.532182915686461e-06, + "loss": 0.2261, + "step": 19675 + }, + { + "epoch": 0.9343398376299672, + "grad_norm": 0.4453125, + "learning_rate": 5.528370701204879e-06, + "loss": 0.2211, + "step": 19680 + }, + { + "epoch": 0.934577220718796, + "grad_norm": 0.431640625, + "learning_rate": 5.524572027738523e-06, + "loss": 0.2228, + "step": 19685 + }, + { + "epoch": 0.9348146038076247, + "grad_norm": 0.4765625, + "learning_rate": 5.5207868976284975e-06, + "loss": 0.2228, + "step": 19690 + }, + { + "epoch": 0.9350519868964535, + "grad_norm": 0.447265625, + "learning_rate": 5.517015313207559e-06, + "loss": 0.2226, + "step": 19695 + }, + { + "epoch": 0.9352893699852822, + "grad_norm": 0.4453125, + "learning_rate": 5.513257276800121e-06, + "loss": 0.2236, + "step": 19700 + }, + { + "epoch": 0.935526753074111, + "grad_norm": 0.5, + "learning_rate": 5.50951279072224e-06, + "loss": 0.2245, + "step": 19705 + }, + { + "epoch": 0.9357641361629397, + "grad_norm": 0.431640625, + "learning_rate": 5.505781857281632e-06, + "loss": 0.2194, + "step": 19710 + }, + { + "epoch": 0.9360015192517686, + "grad_norm": 0.416015625, + "learning_rate": 5.502064478777647e-06, + "loss": 0.2245, + "step": 19715 + }, + { + "epoch": 0.9362389023405973, + "grad_norm": 0.427734375, + "learning_rate": 5.4983606575012915e-06, + "loss": 0.2219, + "step": 19720 + }, + { + "epoch": 0.936476285429426, + "grad_norm": 0.443359375, + "learning_rate": 5.494670395735217e-06, + "loss": 0.2216, + "step": 19725 + }, + { + "epoch": 0.9367136685182548, + "grad_norm": 0.421875, + "learning_rate": 5.490993695753712e-06, + "loss": 0.2211, + "step": 19730 + }, + { + "epoch": 0.9369510516070835, + "grad_norm": 0.43359375, + "learning_rate": 5.487330559822711e-06, + "loss": 0.2223, + "step": 19735 + }, + { + "epoch": 0.9371884346959123, + "grad_norm": 0.482421875, + "learning_rate": 5.483680990199788e-06, + "loss": 0.2299, + "step": 19740 + }, + { + "epoch": 0.937425817784741, + "grad_norm": 0.423828125, + "learning_rate": 5.48004498913415e-06, + "loss": 0.219, + "step": 19745 + }, + { + "epoch": 0.9376632008735698, + "grad_norm": 0.486328125, + "learning_rate": 5.476422558866659e-06, + "loss": 0.2225, + "step": 19750 + }, + { + "epoch": 0.9379005839623985, + "grad_norm": 0.40234375, + "learning_rate": 5.472813701629796e-06, + "loss": 0.2265, + "step": 19755 + }, + { + "epoch": 0.9381379670512273, + "grad_norm": 0.37109375, + "learning_rate": 5.469218419647681e-06, + "loss": 0.2165, + "step": 19760 + }, + { + "epoch": 0.938375350140056, + "grad_norm": 0.404296875, + "learning_rate": 5.465636715136074e-06, + "loss": 0.2237, + "step": 19765 + }, + { + "epoch": 0.9386127332288847, + "grad_norm": 0.46484375, + "learning_rate": 5.462068590302365e-06, + "loss": 0.2253, + "step": 19770 + }, + { + "epoch": 0.9388501163177135, + "grad_norm": 0.40625, + "learning_rate": 5.458514047345568e-06, + "loss": 0.2208, + "step": 19775 + }, + { + "epoch": 0.9390874994065422, + "grad_norm": 0.396484375, + "learning_rate": 5.454973088456336e-06, + "loss": 0.223, + "step": 19780 + }, + { + "epoch": 0.939324882495371, + "grad_norm": 0.4453125, + "learning_rate": 5.451445715816942e-06, + "loss": 0.2221, + "step": 19785 + }, + { + "epoch": 0.9395622655841998, + "grad_norm": 0.423828125, + "learning_rate": 5.447931931601291e-06, + "loss": 0.2221, + "step": 19790 + }, + { + "epoch": 0.9397996486730286, + "grad_norm": 0.455078125, + "learning_rate": 5.444431737974915e-06, + "loss": 0.2201, + "step": 19795 + }, + { + "epoch": 0.9400370317618573, + "grad_norm": 0.51171875, + "learning_rate": 5.440945137094963e-06, + "loss": 0.2231, + "step": 19800 + }, + { + "epoch": 0.9402744148506861, + "grad_norm": 0.416015625, + "learning_rate": 5.437472131110213e-06, + "loss": 0.2193, + "step": 19805 + }, + { + "epoch": 0.9405117979395148, + "grad_norm": 0.421875, + "learning_rate": 5.434012722161063e-06, + "loss": 0.223, + "step": 19810 + }, + { + "epoch": 0.9407491810283436, + "grad_norm": 0.427734375, + "learning_rate": 5.4305669123795355e-06, + "loss": 0.2259, + "step": 19815 + }, + { + "epoch": 0.9409865641171723, + "grad_norm": 0.419921875, + "learning_rate": 5.427134703889259e-06, + "loss": 0.2229, + "step": 19820 + }, + { + "epoch": 0.941223947206001, + "grad_norm": 0.494140625, + "learning_rate": 5.423716098805493e-06, + "loss": 0.2257, + "step": 19825 + }, + { + "epoch": 0.9414613302948298, + "grad_norm": 0.490234375, + "learning_rate": 5.420311099235107e-06, + "loss": 0.2217, + "step": 19830 + }, + { + "epoch": 0.9416987133836585, + "grad_norm": 0.455078125, + "learning_rate": 5.4169197072765895e-06, + "loss": 0.2232, + "step": 19835 + }, + { + "epoch": 0.9419360964724873, + "grad_norm": 0.47265625, + "learning_rate": 5.413541925020038e-06, + "loss": 0.2254, + "step": 19840 + }, + { + "epoch": 0.942173479561316, + "grad_norm": 0.4140625, + "learning_rate": 5.410177754547158e-06, + "loss": 0.2232, + "step": 19845 + }, + { + "epoch": 0.9424108626501448, + "grad_norm": 0.44140625, + "learning_rate": 5.406827197931282e-06, + "loss": 0.2248, + "step": 19850 + }, + { + "epoch": 0.9426482457389735, + "grad_norm": 0.44140625, + "learning_rate": 5.403490257237342e-06, + "loss": 0.2224, + "step": 19855 + }, + { + "epoch": 0.9428856288278024, + "grad_norm": 0.5078125, + "learning_rate": 5.400166934521874e-06, + "loss": 0.221, + "step": 19860 + }, + { + "epoch": 0.9431230119166311, + "grad_norm": 0.4453125, + "learning_rate": 5.396857231833028e-06, + "loss": 0.2256, + "step": 19865 + }, + { + "epoch": 0.9433603950054598, + "grad_norm": 0.458984375, + "learning_rate": 5.39356115121056e-06, + "loss": 0.2181, + "step": 19870 + }, + { + "epoch": 0.9435977780942886, + "grad_norm": 0.412109375, + "learning_rate": 5.3902786946858314e-06, + "loss": 0.2238, + "step": 19875 + }, + { + "epoch": 0.9438351611831173, + "grad_norm": 0.435546875, + "learning_rate": 5.387009864281804e-06, + "loss": 0.2195, + "step": 19880 + }, + { + "epoch": 0.9440725442719461, + "grad_norm": 0.421875, + "learning_rate": 5.383754662013042e-06, + "loss": 0.2228, + "step": 19885 + }, + { + "epoch": 0.9443099273607748, + "grad_norm": 0.546875, + "learning_rate": 5.380513089885711e-06, + "loss": 0.2244, + "step": 19890 + }, + { + "epoch": 0.9445473104496036, + "grad_norm": 0.4453125, + "learning_rate": 5.377285149897577e-06, + "loss": 0.2262, + "step": 19895 + }, + { + "epoch": 0.9447846935384323, + "grad_norm": 0.447265625, + "learning_rate": 5.3740708440380085e-06, + "loss": 0.2207, + "step": 19900 + }, + { + "epoch": 0.9450220766272611, + "grad_norm": 0.453125, + "learning_rate": 5.370870174287965e-06, + "loss": 0.2238, + "step": 19905 + }, + { + "epoch": 0.9452594597160898, + "grad_norm": 0.40234375, + "learning_rate": 5.367683142620001e-06, + "loss": 0.2212, + "step": 19910 + }, + { + "epoch": 0.9454968428049185, + "grad_norm": 0.466796875, + "learning_rate": 5.364509750998279e-06, + "loss": 0.2274, + "step": 19915 + }, + { + "epoch": 0.9457342258937473, + "grad_norm": 0.4140625, + "learning_rate": 5.361350001378539e-06, + "loss": 0.222, + "step": 19920 + }, + { + "epoch": 0.945971608982576, + "grad_norm": 0.4296875, + "learning_rate": 5.35820389570812e-06, + "loss": 0.2242, + "step": 19925 + }, + { + "epoch": 0.9462089920714049, + "grad_norm": 0.421875, + "learning_rate": 5.355071435925956e-06, + "loss": 0.2241, + "step": 19930 + }, + { + "epoch": 0.9464463751602336, + "grad_norm": 0.423828125, + "learning_rate": 5.351952623962563e-06, + "loss": 0.219, + "step": 19935 + }, + { + "epoch": 0.9466837582490624, + "grad_norm": 0.421875, + "learning_rate": 5.348847461740058e-06, + "loss": 0.2251, + "step": 19940 + }, + { + "epoch": 0.9469211413378911, + "grad_norm": 0.42578125, + "learning_rate": 5.34575595117213e-06, + "loss": 0.225, + "step": 19945 + }, + { + "epoch": 0.9471585244267199, + "grad_norm": 0.44921875, + "learning_rate": 5.342678094164067e-06, + "loss": 0.2287, + "step": 19950 + }, + { + "epoch": 0.9473959075155486, + "grad_norm": 0.419921875, + "learning_rate": 5.3396138926127424e-06, + "loss": 0.2221, + "step": 19955 + }, + { + "epoch": 0.9476332906043774, + "grad_norm": 0.40625, + "learning_rate": 5.336563348406602e-06, + "loss": 0.2194, + "step": 19960 + }, + { + "epoch": 0.9478706736932061, + "grad_norm": 0.458984375, + "learning_rate": 5.3335264634256855e-06, + "loss": 0.2244, + "step": 19965 + }, + { + "epoch": 0.9481080567820348, + "grad_norm": 0.453125, + "learning_rate": 5.330503239541612e-06, + "loss": 0.2216, + "step": 19970 + }, + { + "epoch": 0.9483454398708636, + "grad_norm": 0.455078125, + "learning_rate": 5.327493678617581e-06, + "loss": 0.2233, + "step": 19975 + }, + { + "epoch": 0.9485828229596923, + "grad_norm": 0.421875, + "learning_rate": 5.324497782508371e-06, + "loss": 0.2197, + "step": 19980 + }, + { + "epoch": 0.9488202060485211, + "grad_norm": 0.466796875, + "learning_rate": 5.321515553060338e-06, + "loss": 0.2217, + "step": 19985 + }, + { + "epoch": 0.9490575891373498, + "grad_norm": 0.453125, + "learning_rate": 5.318546992111413e-06, + "loss": 0.2249, + "step": 19990 + }, + { + "epoch": 0.9492949722261786, + "grad_norm": 0.4453125, + "learning_rate": 5.315592101491115e-06, + "loss": 0.2261, + "step": 19995 + }, + { + "epoch": 0.9495323553150073, + "grad_norm": 0.419921875, + "learning_rate": 5.312650883020524e-06, + "loss": 0.2245, + "step": 20000 + }, + { + "epoch": 0.9497697384038362, + "grad_norm": 0.498046875, + "learning_rate": 5.309723338512304e-06, + "loss": 0.2205, + "step": 20005 + }, + { + "epoch": 0.9500071214926649, + "grad_norm": 0.38671875, + "learning_rate": 5.30680946977068e-06, + "loss": 0.2194, + "step": 20010 + }, + { + "epoch": 0.9502445045814936, + "grad_norm": 0.4140625, + "learning_rate": 5.303909278591461e-06, + "loss": 0.2263, + "step": 20015 + }, + { + "epoch": 0.9504818876703224, + "grad_norm": 0.490234375, + "learning_rate": 5.301022766762022e-06, + "loss": 0.222, + "step": 20020 + }, + { + "epoch": 0.9507192707591511, + "grad_norm": 0.4453125, + "learning_rate": 5.298149936061306e-06, + "loss": 0.2232, + "step": 20025 + }, + { + "epoch": 0.9509566538479799, + "grad_norm": 0.435546875, + "learning_rate": 5.295290788259823e-06, + "loss": 0.2229, + "step": 20030 + }, + { + "epoch": 0.9511940369368086, + "grad_norm": 0.392578125, + "learning_rate": 5.292445325119657e-06, + "loss": 0.2238, + "step": 20035 + }, + { + "epoch": 0.9514314200256374, + "grad_norm": 0.484375, + "learning_rate": 5.289613548394455e-06, + "loss": 0.2238, + "step": 20040 + }, + { + "epoch": 0.9516688031144661, + "grad_norm": 0.43359375, + "learning_rate": 5.286795459829422e-06, + "loss": 0.2255, + "step": 20045 + }, + { + "epoch": 0.9519061862032949, + "grad_norm": 0.427734375, + "learning_rate": 5.283991061161339e-06, + "loss": 0.2246, + "step": 20050 + }, + { + "epoch": 0.9521435692921236, + "grad_norm": 0.431640625, + "learning_rate": 5.281200354118539e-06, + "loss": 0.2224, + "step": 20055 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.45703125, + "learning_rate": 5.278423340420927e-06, + "loss": 0.2261, + "step": 20060 + }, + { + "epoch": 0.9526183354697811, + "grad_norm": 0.439453125, + "learning_rate": 5.275660021779959e-06, + "loss": 0.225, + "step": 20065 + }, + { + "epoch": 0.9528557185586098, + "grad_norm": 0.44921875, + "learning_rate": 5.272910399898661e-06, + "loss": 0.2237, + "step": 20070 + }, + { + "epoch": 0.9530931016474387, + "grad_norm": 0.4453125, + "learning_rate": 5.270174476471609e-06, + "loss": 0.2244, + "step": 20075 + }, + { + "epoch": 0.9533304847362674, + "grad_norm": 0.498046875, + "learning_rate": 5.267452253184943e-06, + "loss": 0.2239, + "step": 20080 + }, + { + "epoch": 0.9535678678250962, + "grad_norm": 0.421875, + "learning_rate": 5.264743731716352e-06, + "loss": 0.2218, + "step": 20085 + }, + { + "epoch": 0.9538052509139249, + "grad_norm": 0.39453125, + "learning_rate": 5.262048913735088e-06, + "loss": 0.2212, + "step": 20090 + }, + { + "epoch": 0.9540426340027537, + "grad_norm": 0.39453125, + "learning_rate": 5.259367800901958e-06, + "loss": 0.2244, + "step": 20095 + }, + { + "epoch": 0.9542800170915824, + "grad_norm": 0.4296875, + "learning_rate": 5.256700394869315e-06, + "loss": 0.2237, + "step": 20100 + }, + { + "epoch": 0.9545174001804112, + "grad_norm": 0.462890625, + "learning_rate": 5.254046697281073e-06, + "loss": 0.2245, + "step": 20105 + }, + { + "epoch": 0.9547547832692399, + "grad_norm": 0.455078125, + "learning_rate": 5.251406709772689e-06, + "loss": 0.2252, + "step": 20110 + }, + { + "epoch": 0.9549921663580686, + "grad_norm": 0.40234375, + "learning_rate": 5.248780433971183e-06, + "loss": 0.2204, + "step": 20115 + }, + { + "epoch": 0.9552295494468974, + "grad_norm": 0.375, + "learning_rate": 5.246167871495108e-06, + "loss": 0.2253, + "step": 20120 + }, + { + "epoch": 0.9554669325357261, + "grad_norm": 0.48046875, + "learning_rate": 5.243569023954579e-06, + "loss": 0.2214, + "step": 20125 + }, + { + "epoch": 0.9557043156245549, + "grad_norm": 0.466796875, + "learning_rate": 5.240983892951256e-06, + "loss": 0.2265, + "step": 20130 + }, + { + "epoch": 0.9559416987133836, + "grad_norm": 0.474609375, + "learning_rate": 5.238412480078337e-06, + "loss": 0.2273, + "step": 20135 + }, + { + "epoch": 0.9561790818022124, + "grad_norm": 0.462890625, + "learning_rate": 5.235854786920579e-06, + "loss": 0.2216, + "step": 20140 + }, + { + "epoch": 0.9564164648910412, + "grad_norm": 0.4453125, + "learning_rate": 5.233310815054275e-06, + "loss": 0.2179, + "step": 20145 + }, + { + "epoch": 0.95665384797987, + "grad_norm": 0.474609375, + "learning_rate": 5.230780566047259e-06, + "loss": 0.2236, + "step": 20150 + }, + { + "epoch": 0.9568912310686987, + "grad_norm": 0.45703125, + "learning_rate": 5.228264041458915e-06, + "loss": 0.2227, + "step": 20155 + }, + { + "epoch": 0.9571286141575274, + "grad_norm": 0.39453125, + "learning_rate": 5.2257612428401676e-06, + "loss": 0.2256, + "step": 20160 + }, + { + "epoch": 0.9573659972463562, + "grad_norm": 0.40234375, + "learning_rate": 5.223272171733476e-06, + "loss": 0.2233, + "step": 20165 + }, + { + "epoch": 0.9576033803351849, + "grad_norm": 0.5, + "learning_rate": 5.220796829672847e-06, + "loss": 0.2241, + "step": 20170 + }, + { + "epoch": 0.9578407634240137, + "grad_norm": 0.40625, + "learning_rate": 5.218335218183821e-06, + "loss": 0.2206, + "step": 20175 + }, + { + "epoch": 0.9580781465128424, + "grad_norm": 0.455078125, + "learning_rate": 5.215887338783476e-06, + "loss": 0.2212, + "step": 20180 + }, + { + "epoch": 0.9583155296016712, + "grad_norm": 0.458984375, + "learning_rate": 5.213453192980435e-06, + "loss": 0.2239, + "step": 20185 + }, + { + "epoch": 0.9585529126904999, + "grad_norm": 0.408203125, + "learning_rate": 5.211032782274844e-06, + "loss": 0.2229, + "step": 20190 + }, + { + "epoch": 0.9587902957793287, + "grad_norm": 0.423828125, + "learning_rate": 5.208626108158396e-06, + "loss": 0.2241, + "step": 20195 + }, + { + "epoch": 0.9590276788681574, + "grad_norm": 0.427734375, + "learning_rate": 5.20623317211431e-06, + "loss": 0.2209, + "step": 20200 + }, + { + "epoch": 0.9592650619569861, + "grad_norm": 0.44921875, + "learning_rate": 5.203853975617345e-06, + "loss": 0.2175, + "step": 20205 + }, + { + "epoch": 0.959502445045815, + "grad_norm": 0.427734375, + "learning_rate": 5.201488520133786e-06, + "loss": 0.2217, + "step": 20210 + }, + { + "epoch": 0.9597398281346436, + "grad_norm": 0.4453125, + "learning_rate": 5.199136807121454e-06, + "loss": 0.2237, + "step": 20215 + }, + { + "epoch": 0.9599772112234725, + "grad_norm": 0.3984375, + "learning_rate": 5.1967988380296955e-06, + "loss": 0.2196, + "step": 20220 + }, + { + "epoch": 0.9602145943123012, + "grad_norm": 0.490234375, + "learning_rate": 5.1944746142993995e-06, + "loss": 0.2246, + "step": 20225 + }, + { + "epoch": 0.96045197740113, + "grad_norm": 0.439453125, + "learning_rate": 5.192164137362965e-06, + "loss": 0.221, + "step": 20230 + }, + { + "epoch": 0.9606893604899587, + "grad_norm": 0.4375, + "learning_rate": 5.189867408644332e-06, + "loss": 0.2233, + "step": 20235 + }, + { + "epoch": 0.9609267435787875, + "grad_norm": 0.3984375, + "learning_rate": 5.187584429558964e-06, + "loss": 0.2222, + "step": 20240 + }, + { + "epoch": 0.9611641266676162, + "grad_norm": 0.4375, + "learning_rate": 5.185315201513848e-06, + "loss": 0.2212, + "step": 20245 + }, + { + "epoch": 0.961401509756445, + "grad_norm": 0.380859375, + "learning_rate": 5.183059725907505e-06, + "loss": 0.2246, + "step": 20250 + }, + { + "epoch": 0.9616388928452737, + "grad_norm": 0.4453125, + "learning_rate": 5.180818004129968e-06, + "loss": 0.2247, + "step": 20255 + }, + { + "epoch": 0.9618762759341024, + "grad_norm": 0.4375, + "learning_rate": 5.178590037562804e-06, + "loss": 0.2165, + "step": 20260 + }, + { + "epoch": 0.9621136590229312, + "grad_norm": 0.482421875, + "learning_rate": 5.176375827579097e-06, + "loss": 0.2217, + "step": 20265 + }, + { + "epoch": 0.9623510421117599, + "grad_norm": 0.470703125, + "learning_rate": 5.1741753755434535e-06, + "loss": 0.2228, + "step": 20270 + }, + { + "epoch": 0.9625884252005887, + "grad_norm": 0.490234375, + "learning_rate": 5.1719886828120065e-06, + "loss": 0.2221, + "step": 20275 + }, + { + "epoch": 0.9628258082894174, + "grad_norm": 0.45703125, + "learning_rate": 5.1698157507323995e-06, + "loss": 0.2221, + "step": 20280 + }, + { + "epoch": 0.9630631913782463, + "grad_norm": 0.47265625, + "learning_rate": 5.167656580643803e-06, + "loss": 0.2207, + "step": 20285 + }, + { + "epoch": 0.963300574467075, + "grad_norm": 0.490234375, + "learning_rate": 5.165511173876904e-06, + "loss": 0.225, + "step": 20290 + }, + { + "epoch": 0.9635379575559038, + "grad_norm": 0.388671875, + "learning_rate": 5.163379531753907e-06, + "loss": 0.2237, + "step": 20295 + }, + { + "epoch": 0.9637753406447325, + "grad_norm": 0.5, + "learning_rate": 5.16126165558853e-06, + "loss": 0.2222, + "step": 20300 + }, + { + "epoch": 0.9640127237335612, + "grad_norm": 0.60546875, + "learning_rate": 5.159157546686015e-06, + "loss": 0.2219, + "step": 20305 + }, + { + "epoch": 0.96425010682239, + "grad_norm": 0.431640625, + "learning_rate": 5.157067206343113e-06, + "loss": 0.2238, + "step": 20310 + }, + { + "epoch": 0.9644874899112187, + "grad_norm": 0.39453125, + "learning_rate": 5.154990635848092e-06, + "loss": 0.2214, + "step": 20315 + }, + { + "epoch": 0.9647248730000475, + "grad_norm": 0.44140625, + "learning_rate": 5.1529278364807336e-06, + "loss": 0.223, + "step": 20320 + }, + { + "epoch": 0.9649622560888762, + "grad_norm": 0.470703125, + "learning_rate": 5.150878809512328e-06, + "loss": 0.2241, + "step": 20325 + }, + { + "epoch": 0.965199639177705, + "grad_norm": 0.494140625, + "learning_rate": 5.148843556205683e-06, + "loss": 0.2251, + "step": 20330 + }, + { + "epoch": 0.9654370222665337, + "grad_norm": 0.380859375, + "learning_rate": 5.146822077815119e-06, + "loss": 0.2238, + "step": 20335 + }, + { + "epoch": 0.9656744053553625, + "grad_norm": 0.39453125, + "learning_rate": 5.144814375586458e-06, + "loss": 0.2258, + "step": 20340 + }, + { + "epoch": 0.9659117884441912, + "grad_norm": 0.419921875, + "learning_rate": 5.1428204507570446e-06, + "loss": 0.2243, + "step": 20345 + }, + { + "epoch": 0.9661491715330199, + "grad_norm": 0.421875, + "learning_rate": 5.140840304555722e-06, + "loss": 0.2249, + "step": 20350 + }, + { + "epoch": 0.9663865546218487, + "grad_norm": 0.43359375, + "learning_rate": 5.138873938202843e-06, + "loss": 0.2176, + "step": 20355 + }, + { + "epoch": 0.9666239377106774, + "grad_norm": 0.5546875, + "learning_rate": 5.1369213529102735e-06, + "loss": 0.2266, + "step": 20360 + }, + { + "epoch": 0.9668613207995063, + "grad_norm": 0.486328125, + "learning_rate": 5.134982549881383e-06, + "loss": 0.2245, + "step": 20365 + }, + { + "epoch": 0.967098703888335, + "grad_norm": 0.419921875, + "learning_rate": 5.133057530311045e-06, + "loss": 0.2244, + "step": 20370 + }, + { + "epoch": 0.9673360869771638, + "grad_norm": 0.42578125, + "learning_rate": 5.131146295385643e-06, + "loss": 0.2269, + "step": 20375 + }, + { + "epoch": 0.9675734700659925, + "grad_norm": 0.392578125, + "learning_rate": 5.129248846283058e-06, + "loss": 0.2253, + "step": 20380 + }, + { + "epoch": 0.9678108531548213, + "grad_norm": 0.412109375, + "learning_rate": 5.127365184172685e-06, + "loss": 0.2235, + "step": 20385 + }, + { + "epoch": 0.96804823624365, + "grad_norm": 0.4296875, + "learning_rate": 5.125495310215413e-06, + "loss": 0.2209, + "step": 20390 + }, + { + "epoch": 0.9682856193324788, + "grad_norm": 0.455078125, + "learning_rate": 5.123639225563635e-06, + "loss": 0.2236, + "step": 20395 + }, + { + "epoch": 0.9685230024213075, + "grad_norm": 0.486328125, + "learning_rate": 5.121796931361251e-06, + "loss": 0.2195, + "step": 20400 + }, + { + "epoch": 0.9687603855101362, + "grad_norm": 0.45703125, + "learning_rate": 5.119968428743658e-06, + "loss": 0.222, + "step": 20405 + }, + { + "epoch": 0.968997768598965, + "grad_norm": 0.39453125, + "learning_rate": 5.118153718837755e-06, + "loss": 0.2227, + "step": 20410 + }, + { + "epoch": 0.9692351516877937, + "grad_norm": 0.419921875, + "learning_rate": 5.116352802761936e-06, + "loss": 0.2194, + "step": 20415 + }, + { + "epoch": 0.9694725347766225, + "grad_norm": 0.388671875, + "learning_rate": 5.114565681626099e-06, + "loss": 0.2224, + "step": 20420 + }, + { + "epoch": 0.9697099178654512, + "grad_norm": 0.46484375, + "learning_rate": 5.112792356531637e-06, + "loss": 0.2254, + "step": 20425 + }, + { + "epoch": 0.9699473009542801, + "grad_norm": 0.404296875, + "learning_rate": 5.111032828571447e-06, + "loss": 0.2193, + "step": 20430 + }, + { + "epoch": 0.9701846840431088, + "grad_norm": 0.478515625, + "learning_rate": 5.109287098829913e-06, + "loss": 0.2231, + "step": 20435 + }, + { + "epoch": 0.9704220671319376, + "grad_norm": 0.46875, + "learning_rate": 5.1075551683829224e-06, + "loss": 0.2248, + "step": 20440 + }, + { + "epoch": 0.9706594502207663, + "grad_norm": 0.4609375, + "learning_rate": 5.105837038297855e-06, + "loss": 0.2206, + "step": 20445 + }, + { + "epoch": 0.970896833309595, + "grad_norm": 0.439453125, + "learning_rate": 5.10413270963359e-06, + "loss": 0.2224, + "step": 20450 + }, + { + "epoch": 0.9711342163984238, + "grad_norm": 0.41796875, + "learning_rate": 5.1024421834404926e-06, + "loss": 0.2195, + "step": 20455 + }, + { + "epoch": 0.9713715994872525, + "grad_norm": 0.4453125, + "learning_rate": 5.100765460760431e-06, + "loss": 0.2225, + "step": 20460 + }, + { + "epoch": 0.9716089825760813, + "grad_norm": 0.494140625, + "learning_rate": 5.099102542626757e-06, + "loss": 0.2258, + "step": 20465 + }, + { + "epoch": 0.97184636566491, + "grad_norm": 0.408203125, + "learning_rate": 5.097453430064324e-06, + "loss": 0.2216, + "step": 20470 + }, + { + "epoch": 0.9720837487537388, + "grad_norm": 0.392578125, + "learning_rate": 5.09581812408947e-06, + "loss": 0.2234, + "step": 20475 + }, + { + "epoch": 0.9723211318425675, + "grad_norm": 0.39453125, + "learning_rate": 5.094196625710026e-06, + "loss": 0.223, + "step": 20480 + }, + { + "epoch": 0.9725585149313963, + "grad_norm": 0.419921875, + "learning_rate": 5.092588935925316e-06, + "loss": 0.2177, + "step": 20485 + }, + { + "epoch": 0.972795898020225, + "grad_norm": 0.42578125, + "learning_rate": 5.090995055726152e-06, + "loss": 0.2219, + "step": 20490 + }, + { + "epoch": 0.9730332811090537, + "grad_norm": 0.421875, + "learning_rate": 5.0894149860948345e-06, + "loss": 0.2217, + "step": 20495 + }, + { + "epoch": 0.9732706641978826, + "grad_norm": 0.4921875, + "learning_rate": 5.0878487280051495e-06, + "loss": 0.2197, + "step": 20500 + }, + { + "epoch": 0.9735080472867113, + "grad_norm": 0.50390625, + "learning_rate": 5.08629628242238e-06, + "loss": 0.2217, + "step": 20505 + }, + { + "epoch": 0.9737454303755401, + "grad_norm": 0.443359375, + "learning_rate": 5.084757650303292e-06, + "loss": 0.2205, + "step": 20510 + }, + { + "epoch": 0.9739828134643688, + "grad_norm": 0.44140625, + "learning_rate": 5.083232832596133e-06, + "loss": 0.2234, + "step": 20515 + }, + { + "epoch": 0.9742201965531976, + "grad_norm": 0.423828125, + "learning_rate": 5.081721830240644e-06, + "loss": 0.2212, + "step": 20520 + }, + { + "epoch": 0.9744575796420263, + "grad_norm": 0.419921875, + "learning_rate": 5.0802246441680455e-06, + "loss": 0.2176, + "step": 20525 + }, + { + "epoch": 0.9746949627308551, + "grad_norm": 0.392578125, + "learning_rate": 5.078741275301051e-06, + "loss": 0.2215, + "step": 20530 + }, + { + "epoch": 0.9749323458196838, + "grad_norm": 0.447265625, + "learning_rate": 5.07727172455385e-06, + "loss": 0.2249, + "step": 20535 + }, + { + "epoch": 0.9751697289085126, + "grad_norm": 0.439453125, + "learning_rate": 5.075815992832124e-06, + "loss": 0.2212, + "step": 20540 + }, + { + "epoch": 0.9754071119973413, + "grad_norm": 0.4140625, + "learning_rate": 5.0743740810330304e-06, + "loss": 0.225, + "step": 20545 + }, + { + "epoch": 0.97564449508617, + "grad_norm": 0.41796875, + "learning_rate": 5.072945990045213e-06, + "loss": 0.2194, + "step": 20550 + }, + { + "epoch": 0.9758818781749988, + "grad_norm": 0.419921875, + "learning_rate": 5.071531720748802e-06, + "loss": 0.2229, + "step": 20555 + }, + { + "epoch": 0.9761192612638275, + "grad_norm": 0.39453125, + "learning_rate": 5.0701312740154004e-06, + "loss": 0.2255, + "step": 20560 + }, + { + "epoch": 0.9763566443526563, + "grad_norm": 0.416015625, + "learning_rate": 5.0687446507080995e-06, + "loss": 0.2236, + "step": 20565 + }, + { + "epoch": 0.976594027441485, + "grad_norm": 0.451171875, + "learning_rate": 5.067371851681466e-06, + "loss": 0.223, + "step": 20570 + }, + { + "epoch": 0.9768314105303139, + "grad_norm": 0.40234375, + "learning_rate": 5.066012877781554e-06, + "loss": 0.2252, + "step": 20575 + }, + { + "epoch": 0.9770687936191426, + "grad_norm": 0.43359375, + "learning_rate": 5.06466772984589e-06, + "loss": 0.2251, + "step": 20580 + }, + { + "epoch": 0.9773061767079714, + "grad_norm": 0.43359375, + "learning_rate": 5.063336408703486e-06, + "loss": 0.2205, + "step": 20585 + }, + { + "epoch": 0.9775435597968001, + "grad_norm": 0.431640625, + "learning_rate": 5.0620189151748235e-06, + "loss": 0.2251, + "step": 20590 + }, + { + "epoch": 0.9777809428856288, + "grad_norm": 0.43359375, + "learning_rate": 5.060715250071874e-06, + "loss": 0.2198, + "step": 20595 + }, + { + "epoch": 0.9780183259744576, + "grad_norm": 0.451171875, + "learning_rate": 5.059425414198073e-06, + "loss": 0.2259, + "step": 20600 + }, + { + "epoch": 0.9782557090632863, + "grad_norm": 0.435546875, + "learning_rate": 5.0581494083483495e-06, + "loss": 0.2266, + "step": 20605 + }, + { + "epoch": 0.9784930921521151, + "grad_norm": 0.41796875, + "learning_rate": 5.056887233309092e-06, + "loss": 0.2199, + "step": 20610 + }, + { + "epoch": 0.9787304752409438, + "grad_norm": 0.49609375, + "learning_rate": 5.0556388898581785e-06, + "loss": 0.219, + "step": 20615 + }, + { + "epoch": 0.9789678583297726, + "grad_norm": 0.421875, + "learning_rate": 5.054404378764955e-06, + "loss": 0.2217, + "step": 20620 + }, + { + "epoch": 0.9792052414186013, + "grad_norm": 0.4296875, + "learning_rate": 5.053183700790245e-06, + "loss": 0.2234, + "step": 20625 + }, + { + "epoch": 0.9794426245074301, + "grad_norm": 0.4140625, + "learning_rate": 5.051976856686348e-06, + "loss": 0.2258, + "step": 20630 + }, + { + "epoch": 0.9796800075962588, + "grad_norm": 0.46875, + "learning_rate": 5.050783847197036e-06, + "loss": 0.2218, + "step": 20635 + }, + { + "epoch": 0.9799173906850875, + "grad_norm": 0.431640625, + "learning_rate": 5.049604673057554e-06, + "loss": 0.2221, + "step": 20640 + }, + { + "epoch": 0.9801547737739164, + "grad_norm": 0.44140625, + "learning_rate": 5.048439334994625e-06, + "loss": 0.225, + "step": 20645 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.4609375, + "learning_rate": 5.047287833726437e-06, + "loss": 0.2237, + "step": 20650 + }, + { + "epoch": 0.9806295399515739, + "grad_norm": 0.419921875, + "learning_rate": 5.04615016996266e-06, + "loss": 0.2247, + "step": 20655 + }, + { + "epoch": 0.9808669230404026, + "grad_norm": 0.451171875, + "learning_rate": 5.045026344404426e-06, + "loss": 0.2233, + "step": 20660 + }, + { + "epoch": 0.9811043061292314, + "grad_norm": 0.392578125, + "learning_rate": 5.043916357744347e-06, + "loss": 0.2226, + "step": 20665 + }, + { + "epoch": 0.9813416892180601, + "grad_norm": 0.47265625, + "learning_rate": 5.042820210666502e-06, + "loss": 0.2224, + "step": 20670 + }, + { + "epoch": 0.9815790723068889, + "grad_norm": 0.439453125, + "learning_rate": 5.04173790384644e-06, + "loss": 0.221, + "step": 20675 + }, + { + "epoch": 0.9818164553957176, + "grad_norm": 0.44140625, + "learning_rate": 5.040669437951186e-06, + "loss": 0.2229, + "step": 20680 + }, + { + "epoch": 0.9820538384845464, + "grad_norm": 0.439453125, + "learning_rate": 5.0396148136392255e-06, + "loss": 0.2242, + "step": 20685 + }, + { + "epoch": 0.9822912215733751, + "grad_norm": 0.392578125, + "learning_rate": 5.03857403156052e-06, + "loss": 0.223, + "step": 20690 + }, + { + "epoch": 0.9825286046622038, + "grad_norm": 0.41015625, + "learning_rate": 5.037547092356501e-06, + "loss": 0.2272, + "step": 20695 + }, + { + "epoch": 0.9827659877510326, + "grad_norm": 0.4921875, + "learning_rate": 5.036533996660064e-06, + "loss": 0.2218, + "step": 20700 + }, + { + "epoch": 0.9830033708398613, + "grad_norm": 0.431640625, + "learning_rate": 5.035534745095576e-06, + "loss": 0.2275, + "step": 20705 + }, + { + "epoch": 0.9832407539286901, + "grad_norm": 0.5390625, + "learning_rate": 5.034549338278872e-06, + "loss": 0.2249, + "step": 20710 + }, + { + "epoch": 0.9834781370175188, + "grad_norm": 0.41796875, + "learning_rate": 5.033577776817254e-06, + "loss": 0.2221, + "step": 20715 + }, + { + "epoch": 0.9837155201063477, + "grad_norm": 0.458984375, + "learning_rate": 5.03262006130949e-06, + "loss": 0.2222, + "step": 20720 + }, + { + "epoch": 0.9839529031951764, + "grad_norm": 0.40234375, + "learning_rate": 5.031676192345815e-06, + "loss": 0.2203, + "step": 20725 + }, + { + "epoch": 0.9841902862840052, + "grad_norm": 0.4140625, + "learning_rate": 5.030746170507934e-06, + "loss": 0.2258, + "step": 20730 + }, + { + "epoch": 0.9844276693728339, + "grad_norm": 0.44921875, + "learning_rate": 5.0298299963690096e-06, + "loss": 0.2217, + "step": 20735 + }, + { + "epoch": 0.9846650524616626, + "grad_norm": 0.40234375, + "learning_rate": 5.028927670493684e-06, + "loss": 0.2207, + "step": 20740 + }, + { + "epoch": 0.9849024355504914, + "grad_norm": 0.44140625, + "learning_rate": 5.028039193438049e-06, + "loss": 0.2225, + "step": 20745 + }, + { + "epoch": 0.9851398186393201, + "grad_norm": 0.546875, + "learning_rate": 5.0271645657496725e-06, + "loss": 0.2264, + "step": 20750 + }, + { + "epoch": 0.9853772017281489, + "grad_norm": 0.419921875, + "learning_rate": 5.026303787967583e-06, + "loss": 0.2216, + "step": 20755 + }, + { + "epoch": 0.9856145848169776, + "grad_norm": 0.421875, + "learning_rate": 5.025456860622273e-06, + "loss": 0.2231, + "step": 20760 + }, + { + "epoch": 0.9858519679058064, + "grad_norm": 0.375, + "learning_rate": 5.024623784235702e-06, + "loss": 0.2232, + "step": 20765 + }, + { + "epoch": 0.9860893509946351, + "grad_norm": 0.419921875, + "learning_rate": 5.023804559321289e-06, + "loss": 0.2267, + "step": 20770 + }, + { + "epoch": 0.9863267340834639, + "grad_norm": 0.44140625, + "learning_rate": 5.022999186383922e-06, + "loss": 0.2245, + "step": 20775 + }, + { + "epoch": 0.9865641171722926, + "grad_norm": 0.453125, + "learning_rate": 5.022207665919942e-06, + "loss": 0.2219, + "step": 20780 + }, + { + "epoch": 0.9868015002611213, + "grad_norm": 0.404296875, + "learning_rate": 5.0214299984171685e-06, + "loss": 0.2204, + "step": 20785 + }, + { + "epoch": 0.9870388833499502, + "grad_norm": 0.427734375, + "learning_rate": 5.020666184354867e-06, + "loss": 0.2232, + "step": 20790 + }, + { + "epoch": 0.9872762664387789, + "grad_norm": 0.3828125, + "learning_rate": 5.019916224203776e-06, + "loss": 0.2221, + "step": 20795 + }, + { + "epoch": 0.9875136495276077, + "grad_norm": 0.431640625, + "learning_rate": 5.019180118426092e-06, + "loss": 0.2231, + "step": 20800 + }, + { + "epoch": 0.9877510326164364, + "grad_norm": 0.4453125, + "learning_rate": 5.018457867475475e-06, + "loss": 0.2247, + "step": 20805 + }, + { + "epoch": 0.9879884157052652, + "grad_norm": 0.44921875, + "learning_rate": 5.01774947179704e-06, + "loss": 0.2234, + "step": 20810 + }, + { + "epoch": 0.9882257987940939, + "grad_norm": 0.490234375, + "learning_rate": 5.017054931827374e-06, + "loss": 0.2261, + "step": 20815 + }, + { + "epoch": 0.9884631818829227, + "grad_norm": 0.439453125, + "learning_rate": 5.016374247994518e-06, + "loss": 0.223, + "step": 20820 + }, + { + "epoch": 0.9887005649717514, + "grad_norm": 0.380859375, + "learning_rate": 5.015707420717969e-06, + "loss": 0.2211, + "step": 20825 + }, + { + "epoch": 0.9889379480605802, + "grad_norm": 0.52734375, + "learning_rate": 5.015054450408696e-06, + "loss": 0.2222, + "step": 20830 + }, + { + "epoch": 0.9891753311494089, + "grad_norm": 0.3984375, + "learning_rate": 5.014415337469117e-06, + "loss": 0.2248, + "step": 20835 + }, + { + "epoch": 0.9894127142382376, + "grad_norm": 0.4609375, + "learning_rate": 5.013790082293118e-06, + "loss": 0.2252, + "step": 20840 + }, + { + "epoch": 0.9896500973270664, + "grad_norm": 0.44921875, + "learning_rate": 5.01317868526604e-06, + "loss": 0.2258, + "step": 20845 + }, + { + "epoch": 0.9898874804158951, + "grad_norm": 0.431640625, + "learning_rate": 5.012581146764683e-06, + "loss": 0.2221, + "step": 20850 + }, + { + "epoch": 0.990124863504724, + "grad_norm": 0.4921875, + "learning_rate": 5.011997467157306e-06, + "loss": 0.2233, + "step": 20855 + }, + { + "epoch": 0.9903622465935527, + "grad_norm": 0.4453125, + "learning_rate": 5.011427646803632e-06, + "loss": 0.2248, + "step": 20860 + }, + { + "epoch": 0.9905996296823815, + "grad_norm": 0.423828125, + "learning_rate": 5.010871686054833e-06, + "loss": 0.2254, + "step": 20865 + }, + { + "epoch": 0.9908370127712102, + "grad_norm": 0.48046875, + "learning_rate": 5.010329585253552e-06, + "loss": 0.2225, + "step": 20870 + }, + { + "epoch": 0.991074395860039, + "grad_norm": 0.447265625, + "learning_rate": 5.009801344733879e-06, + "loss": 0.2233, + "step": 20875 + }, + { + "epoch": 0.9913117789488677, + "grad_norm": 0.439453125, + "learning_rate": 5.009286964821368e-06, + "loss": 0.2232, + "step": 20880 + }, + { + "epoch": 0.9915491620376964, + "grad_norm": 0.41796875, + "learning_rate": 5.0087864458330265e-06, + "loss": 0.2208, + "step": 20885 + }, + { + "epoch": 0.9917865451265252, + "grad_norm": 0.421875, + "learning_rate": 5.008299788077325e-06, + "loss": 0.2222, + "step": 20890 + }, + { + "epoch": 0.9920239282153539, + "grad_norm": 0.40625, + "learning_rate": 5.007826991854186e-06, + "loss": 0.2227, + "step": 20895 + }, + { + "epoch": 0.9922613113041827, + "grad_norm": 0.43359375, + "learning_rate": 5.007368057454995e-06, + "loss": 0.223, + "step": 20900 + }, + { + "epoch": 0.9924986943930114, + "grad_norm": 0.443359375, + "learning_rate": 5.006922985162586e-06, + "loss": 0.2169, + "step": 20905 + }, + { + "epoch": 0.9927360774818402, + "grad_norm": 0.396484375, + "learning_rate": 5.006491775251261e-06, + "loss": 0.2231, + "step": 20910 + }, + { + "epoch": 0.9929734605706689, + "grad_norm": 0.443359375, + "learning_rate": 5.006074427986766e-06, + "loss": 0.2226, + "step": 20915 + }, + { + "epoch": 0.9932108436594977, + "grad_norm": 0.451171875, + "learning_rate": 5.0056709436263185e-06, + "loss": 0.225, + "step": 20920 + }, + { + "epoch": 0.9934482267483264, + "grad_norm": 0.482421875, + "learning_rate": 5.0052813224185765e-06, + "loss": 0.2242, + "step": 20925 + }, + { + "epoch": 0.9936856098371551, + "grad_norm": 0.44140625, + "learning_rate": 5.0049055646036656e-06, + "loss": 0.2203, + "step": 20930 + }, + { + "epoch": 0.993922992925984, + "grad_norm": 0.50390625, + "learning_rate": 5.004543670413163e-06, + "loss": 0.2219, + "step": 20935 + }, + { + "epoch": 0.9941603760148127, + "grad_norm": 0.419921875, + "learning_rate": 5.004195640070101e-06, + "loss": 0.218, + "step": 20940 + }, + { + "epoch": 0.9943977591036415, + "grad_norm": 0.443359375, + "learning_rate": 5.003861473788972e-06, + "loss": 0.226, + "step": 20945 + }, + { + "epoch": 0.9946351421924702, + "grad_norm": 0.421875, + "learning_rate": 5.0035411717757195e-06, + "loss": 0.2249, + "step": 20950 + }, + { + "epoch": 0.994872525281299, + "grad_norm": 0.412109375, + "learning_rate": 5.003234734227743e-06, + "loss": 0.2255, + "step": 20955 + }, + { + "epoch": 0.9951099083701277, + "grad_norm": 0.423828125, + "learning_rate": 5.002942161333901e-06, + "loss": 0.2257, + "step": 20960 + }, + { + "epoch": 0.9953472914589565, + "grad_norm": 0.40625, + "learning_rate": 5.002663453274505e-06, + "loss": 0.2232, + "step": 20965 + }, + { + "epoch": 0.9955846745477852, + "grad_norm": 0.384765625, + "learning_rate": 5.002398610221319e-06, + "loss": 0.2195, + "step": 20970 + }, + { + "epoch": 0.995822057636614, + "grad_norm": 0.404296875, + "learning_rate": 5.002147632337564e-06, + "loss": 0.2194, + "step": 20975 + }, + { + "epoch": 0.9960594407254427, + "grad_norm": 0.48828125, + "learning_rate": 5.001910519777919e-06, + "loss": 0.2213, + "step": 20980 + }, + { + "epoch": 0.9962968238142714, + "grad_norm": 0.408203125, + "learning_rate": 5.001687272688517e-06, + "loss": 0.2266, + "step": 20985 + }, + { + "epoch": 0.9965342069031002, + "grad_norm": 0.4140625, + "learning_rate": 5.001477891206939e-06, + "loss": 0.2211, + "step": 20990 + }, + { + "epoch": 0.9967715899919289, + "grad_norm": 0.439453125, + "learning_rate": 5.001282375462229e-06, + "loss": 0.2245, + "step": 20995 + }, + { + "epoch": 0.9970089730807578, + "grad_norm": 0.498046875, + "learning_rate": 5.001100725574883e-06, + "loss": 0.2252, + "step": 21000 + }, + { + "epoch": 0.9972463561695865, + "grad_norm": 0.486328125, + "learning_rate": 5.000932941656851e-06, + "loss": 0.2212, + "step": 21005 + }, + { + "epoch": 0.9974837392584153, + "grad_norm": 0.435546875, + "learning_rate": 5.0007790238115375e-06, + "loss": 0.2215, + "step": 21010 + }, + { + "epoch": 0.997721122347244, + "grad_norm": 0.466796875, + "learning_rate": 5.000638972133797e-06, + "loss": 0.2214, + "step": 21015 + }, + { + "epoch": 0.9979585054360728, + "grad_norm": 0.44921875, + "learning_rate": 5.000512786709948e-06, + "loss": 0.2208, + "step": 21020 + }, + { + "epoch": 0.9981958885249015, + "grad_norm": 0.439453125, + "learning_rate": 5.000400467617755e-06, + "loss": 0.2237, + "step": 21025 + }, + { + "epoch": 0.9984332716137302, + "grad_norm": 0.412109375, + "learning_rate": 5.000302014926443e-06, + "loss": 0.2256, + "step": 21030 + }, + { + "epoch": 0.998670654702559, + "grad_norm": 0.43359375, + "learning_rate": 5.0002174286966856e-06, + "loss": 0.2231, + "step": 21035 + }, + { + "epoch": 0.9989080377913877, + "grad_norm": 0.431640625, + "learning_rate": 5.000146708980612e-06, + "loss": 0.2259, + "step": 21040 + }, + { + "epoch": 0.9991454208802165, + "grad_norm": 0.462890625, + "learning_rate": 5.000089855821809e-06, + "loss": 0.2235, + "step": 21045 + }, + { + "epoch": 0.9993828039690452, + "grad_norm": 0.5546875, + "learning_rate": 5.000046869255312e-06, + "loss": 0.2198, + "step": 21050 + }, + { + "epoch": 0.999620187057874, + "grad_norm": 0.43359375, + "learning_rate": 5.000017749307616e-06, + "loss": 0.2214, + "step": 21055 + }, + { + "epoch": 0.9998575701467027, + "grad_norm": 0.4765625, + "learning_rate": 5.0000024959966645e-06, + "loss": 0.2229, + "step": 21060 + }, + { + "epoch": 1.0, + "step": 21063, + "total_flos": 5.928116376894964e+18, + "train_loss": 0.23212106142810157, + "train_runtime": 36092.2941, + "train_samples_per_second": 4.669, + "train_steps_per_second": 0.584 + } + ], + "logging_steps": 5, + "max_steps": 21063, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.928116376894964e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}