diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,85619 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 61000, + "global_step": 122230, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016362595107584062, + "grad_norm": 6.450747489929199, + "learning_rate": 8.181297553792031e-09, + "loss": 0.4893, + "step": 10 + }, + { + "epoch": 0.00032725190215168124, + "grad_norm": 6.003659725189209, + "learning_rate": 1.6362595107584062e-08, + "loss": 0.4797, + "step": 20 + }, + { + "epoch": 0.0004908778532275219, + "grad_norm": 7.656006813049316, + "learning_rate": 2.4543892661376094e-08, + "loss": 0.4769, + "step": 30 + }, + { + "epoch": 0.0006545038043033625, + "grad_norm": 4.77230978012085, + "learning_rate": 3.2725190215168125e-08, + "loss": 0.4858, + "step": 40 + }, + { + "epoch": 0.0008181297553792031, + "grad_norm": 10.098441123962402, + "learning_rate": 4.090648776896016e-08, + "loss": 0.5014, + "step": 50 + }, + { + "epoch": 0.0009817557064550437, + "grad_norm": 7.6441650390625, + "learning_rate": 4.908778532275219e-08, + "loss": 0.4555, + "step": 60 + }, + { + "epoch": 0.0011453816575308843, + "grad_norm": 6.458242416381836, + "learning_rate": 5.7269082876544225e-08, + "loss": 0.481, + "step": 70 + }, + { + "epoch": 0.001309007608606725, + "grad_norm": 5.756002902984619, + "learning_rate": 6.545038043033625e-08, + "loss": 0.4646, + "step": 80 + }, + { + "epoch": 0.0014726335596825656, + "grad_norm": 6.363128662109375, + "learning_rate": 7.363167798412829e-08, + "loss": 0.4534, + "step": 90 + }, + { + "epoch": 0.0016362595107584062, + "grad_norm": 8.146249771118164, + "learning_rate": 8.181297553792032e-08, + "loss": 0.4646, + "step": 100 + }, + { + "epoch": 0.0017998854618342468, + "grad_norm": 6.564874649047852, + "learning_rate": 8.999427309171235e-08, + "loss": 0.4325, + "step": 110 + }, + { + "epoch": 0.0019635114129100874, + "grad_norm": 6.186850070953369, + "learning_rate": 9.817557064550437e-08, + "loss": 0.4514, + "step": 120 + }, + { + "epoch": 0.002127137363985928, + "grad_norm": 7.0988969802856445, + "learning_rate": 1.0635686819929643e-07, + "loss": 0.4162, + "step": 130 + }, + { + "epoch": 0.0022907633150617687, + "grad_norm": 5.64640998840332, + "learning_rate": 1.1453816575308845e-07, + "loss": 0.3946, + "step": 140 + }, + { + "epoch": 0.0024543892661376093, + "grad_norm": 5.670436859130859, + "learning_rate": 1.2271946330688047e-07, + "loss": 0.3344, + "step": 150 + }, + { + "epoch": 0.00261801521721345, + "grad_norm": 5.174366474151611, + "learning_rate": 1.309007608606725e-07, + "loss": 0.3333, + "step": 160 + }, + { + "epoch": 0.0027816411682892906, + "grad_norm": 3.695723533630371, + "learning_rate": 1.3908205841446455e-07, + "loss": 0.2925, + "step": 170 + }, + { + "epoch": 0.002945267119365131, + "grad_norm": 4.6453046798706055, + "learning_rate": 1.4726335596825657e-07, + "loss": 0.2809, + "step": 180 + }, + { + "epoch": 0.003108893070440972, + "grad_norm": 2.976259469985962, + "learning_rate": 1.554446535220486e-07, + "loss": 0.2577, + "step": 190 + }, + { + "epoch": 0.0032725190215168124, + "grad_norm": 2.8548524379730225, + "learning_rate": 1.6362595107584065e-07, + "loss": 0.2132, + "step": 200 + }, + { + "epoch": 0.003436144972592653, + "grad_norm": 2.73585844039917, + "learning_rate": 1.7180724862963267e-07, + "loss": 0.1914, + "step": 210 + }, + { + "epoch": 0.0035997709236684937, + "grad_norm": 2.198151111602783, + "learning_rate": 1.799885461834247e-07, + "loss": 0.1853, + "step": 220 + }, + { + "epoch": 0.0037633968747443343, + "grad_norm": 2.124385118484497, + "learning_rate": 1.8816984373721675e-07, + "loss": 0.178, + "step": 230 + }, + { + "epoch": 0.003927022825820175, + "grad_norm": 1.9803823232650757, + "learning_rate": 1.9635114129100875e-07, + "loss": 0.1603, + "step": 240 + }, + { + "epoch": 0.004090648776896016, + "grad_norm": 1.675918698310852, + "learning_rate": 2.045324388448008e-07, + "loss": 0.1302, + "step": 250 + }, + { + "epoch": 0.004254274727971856, + "grad_norm": 2.0964345932006836, + "learning_rate": 2.1271373639859285e-07, + "loss": 0.1478, + "step": 260 + }, + { + "epoch": 0.004417900679047697, + "grad_norm": 1.632344365119934, + "learning_rate": 2.2089503395238485e-07, + "loss": 0.1341, + "step": 270 + }, + { + "epoch": 0.004581526630123537, + "grad_norm": 1.394518494606018, + "learning_rate": 2.290763315061769e-07, + "loss": 0.1131, + "step": 280 + }, + { + "epoch": 0.004745152581199378, + "grad_norm": 2.0453503131866455, + "learning_rate": 2.3725762905996892e-07, + "loss": 0.1112, + "step": 290 + }, + { + "epoch": 0.004908778532275219, + "grad_norm": 1.4522521495819092, + "learning_rate": 2.4543892661376095e-07, + "loss": 0.1101, + "step": 300 + }, + { + "epoch": 0.00507240448335106, + "grad_norm": 1.6254401206970215, + "learning_rate": 2.53620224167553e-07, + "loss": 0.108, + "step": 310 + }, + { + "epoch": 0.0052360304344269, + "grad_norm": 1.332760214805603, + "learning_rate": 2.61801521721345e-07, + "loss": 0.0942, + "step": 320 + }, + { + "epoch": 0.005399656385502741, + "grad_norm": 1.449270486831665, + "learning_rate": 2.6998281927513705e-07, + "loss": 0.0873, + "step": 330 + }, + { + "epoch": 0.005563282336578581, + "grad_norm": 1.3381404876708984, + "learning_rate": 2.781641168289291e-07, + "loss": 0.0732, + "step": 340 + }, + { + "epoch": 0.005726908287654422, + "grad_norm": 1.650217890739441, + "learning_rate": 2.863454143827211e-07, + "loss": 0.0941, + "step": 350 + }, + { + "epoch": 0.005890534238730262, + "grad_norm": 1.642314076423645, + "learning_rate": 2.9452671193651315e-07, + "loss": 0.0854, + "step": 360 + }, + { + "epoch": 0.006054160189806103, + "grad_norm": 1.5752111673355103, + "learning_rate": 3.027080094903052e-07, + "loss": 0.0859, + "step": 370 + }, + { + "epoch": 0.006217786140881944, + "grad_norm": 2.0331008434295654, + "learning_rate": 3.108893070440972e-07, + "loss": 0.0832, + "step": 380 + }, + { + "epoch": 0.006381412091957785, + "grad_norm": 1.6878950595855713, + "learning_rate": 3.1907060459788925e-07, + "loss": 0.063, + "step": 390 + }, + { + "epoch": 0.006545038043033625, + "grad_norm": 1.2223267555236816, + "learning_rate": 3.272519021516813e-07, + "loss": 0.0877, + "step": 400 + }, + { + "epoch": 0.006708663994109466, + "grad_norm": 1.5864821672439575, + "learning_rate": 3.3543319970547335e-07, + "loss": 0.0768, + "step": 410 + }, + { + "epoch": 0.006872289945185306, + "grad_norm": 1.4420173168182373, + "learning_rate": 3.4361449725926535e-07, + "loss": 0.0768, + "step": 420 + }, + { + "epoch": 0.007035915896261147, + "grad_norm": 1.4067822694778442, + "learning_rate": 3.5179579481305735e-07, + "loss": 0.0627, + "step": 430 + }, + { + "epoch": 0.007199541847336987, + "grad_norm": 1.43185555934906, + "learning_rate": 3.599770923668494e-07, + "loss": 0.078, + "step": 440 + }, + { + "epoch": 0.007363167798412828, + "grad_norm": 1.3940660953521729, + "learning_rate": 3.6815838992064145e-07, + "loss": 0.074, + "step": 450 + }, + { + "epoch": 0.0075267937494886685, + "grad_norm": 1.532396912574768, + "learning_rate": 3.763396874744335e-07, + "loss": 0.0712, + "step": 460 + }, + { + "epoch": 0.00769041970056451, + "grad_norm": 1.0307947397232056, + "learning_rate": 3.845209850282255e-07, + "loss": 0.06, + "step": 470 + }, + { + "epoch": 0.00785404565164035, + "grad_norm": 1.2340439558029175, + "learning_rate": 3.927022825820175e-07, + "loss": 0.0723, + "step": 480 + }, + { + "epoch": 0.00801767160271619, + "grad_norm": 1.3179080486297607, + "learning_rate": 4.0088358013580955e-07, + "loss": 0.0575, + "step": 490 + }, + { + "epoch": 0.008181297553792032, + "grad_norm": 1.514940857887268, + "learning_rate": 4.090648776896016e-07, + "loss": 0.0609, + "step": 500 + }, + { + "epoch": 0.008344923504867872, + "grad_norm": 1.3926035165786743, + "learning_rate": 4.1724617524339365e-07, + "loss": 0.0548, + "step": 510 + }, + { + "epoch": 0.008508549455943712, + "grad_norm": 1.7142009735107422, + "learning_rate": 4.254274727971857e-07, + "loss": 0.0712, + "step": 520 + }, + { + "epoch": 0.008672175407019552, + "grad_norm": 1.7795113325119019, + "learning_rate": 4.336087703509777e-07, + "loss": 0.0631, + "step": 530 + }, + { + "epoch": 0.008835801358095394, + "grad_norm": 1.0693011283874512, + "learning_rate": 4.417900679047697e-07, + "loss": 0.0551, + "step": 540 + }, + { + "epoch": 0.008999427309171235, + "grad_norm": 1.2251043319702148, + "learning_rate": 4.4997136545856175e-07, + "loss": 0.0483, + "step": 550 + }, + { + "epoch": 0.009163053260247075, + "grad_norm": 1.2896251678466797, + "learning_rate": 4.581526630123538e-07, + "loss": 0.0549, + "step": 560 + }, + { + "epoch": 0.009326679211322917, + "grad_norm": 1.7514303922653198, + "learning_rate": 4.6633396056614585e-07, + "loss": 0.0491, + "step": 570 + }, + { + "epoch": 0.009490305162398757, + "grad_norm": 1.3628102540969849, + "learning_rate": 4.7451525811993785e-07, + "loss": 0.0584, + "step": 580 + }, + { + "epoch": 0.009653931113474597, + "grad_norm": 1.2788424491882324, + "learning_rate": 4.826965556737298e-07, + "loss": 0.0512, + "step": 590 + }, + { + "epoch": 0.009817557064550437, + "grad_norm": 1.4374266862869263, + "learning_rate": 4.908778532275219e-07, + "loss": 0.048, + "step": 600 + }, + { + "epoch": 0.00998118301562628, + "grad_norm": 1.0559022426605225, + "learning_rate": 4.99059150781314e-07, + "loss": 0.048, + "step": 610 + }, + { + "epoch": 0.01014480896670212, + "grad_norm": 1.654038906097412, + "learning_rate": 5.07240448335106e-07, + "loss": 0.0463, + "step": 620 + }, + { + "epoch": 0.01030843491777796, + "grad_norm": 1.4024804830551147, + "learning_rate": 5.15421745888898e-07, + "loss": 0.0444, + "step": 630 + }, + { + "epoch": 0.0104720608688538, + "grad_norm": 1.0499006509780884, + "learning_rate": 5.2360304344269e-07, + "loss": 0.0561, + "step": 640 + }, + { + "epoch": 0.010635686819929642, + "grad_norm": 1.0363799333572388, + "learning_rate": 5.31784340996482e-07, + "loss": 0.0366, + "step": 650 + }, + { + "epoch": 0.010799312771005482, + "grad_norm": 0.8694953918457031, + "learning_rate": 5.399656385502741e-07, + "loss": 0.0416, + "step": 660 + }, + { + "epoch": 0.010962938722081322, + "grad_norm": 1.33260178565979, + "learning_rate": 5.481469361040661e-07, + "loss": 0.0476, + "step": 670 + }, + { + "epoch": 0.011126564673157162, + "grad_norm": 1.2754524946212769, + "learning_rate": 5.563282336578582e-07, + "loss": 0.0479, + "step": 680 + }, + { + "epoch": 0.011290190624233004, + "grad_norm": 1.1440926790237427, + "learning_rate": 5.645095312116501e-07, + "loss": 0.0419, + "step": 690 + }, + { + "epoch": 0.011453816575308844, + "grad_norm": 1.6302803754806519, + "learning_rate": 5.726908287654422e-07, + "loss": 0.0427, + "step": 700 + }, + { + "epoch": 0.011617442526384684, + "grad_norm": 1.178799033164978, + "learning_rate": 5.808721263192342e-07, + "loss": 0.0415, + "step": 710 + }, + { + "epoch": 0.011781068477460525, + "grad_norm": 1.1128038167953491, + "learning_rate": 5.890534238730263e-07, + "loss": 0.0439, + "step": 720 + }, + { + "epoch": 0.011944694428536367, + "grad_norm": 1.201760172843933, + "learning_rate": 5.972347214268183e-07, + "loss": 0.0376, + "step": 730 + }, + { + "epoch": 0.012108320379612207, + "grad_norm": 1.2795042991638184, + "learning_rate": 6.054160189806104e-07, + "loss": 0.0419, + "step": 740 + }, + { + "epoch": 0.012271946330688047, + "grad_norm": 1.1800705194473267, + "learning_rate": 6.135973165344023e-07, + "loss": 0.0467, + "step": 750 + }, + { + "epoch": 0.012435572281763887, + "grad_norm": 1.050722599029541, + "learning_rate": 6.217786140881944e-07, + "loss": 0.0408, + "step": 760 + }, + { + "epoch": 0.012599198232839729, + "grad_norm": 1.1177846193313599, + "learning_rate": 6.299599116419864e-07, + "loss": 0.0506, + "step": 770 + }, + { + "epoch": 0.01276282418391557, + "grad_norm": 1.4573966264724731, + "learning_rate": 6.381412091957785e-07, + "loss": 0.0389, + "step": 780 + }, + { + "epoch": 0.01292645013499141, + "grad_norm": 1.0579060316085815, + "learning_rate": 6.463225067495705e-07, + "loss": 0.0447, + "step": 790 + }, + { + "epoch": 0.01309007608606725, + "grad_norm": 1.2879925966262817, + "learning_rate": 6.545038043033626e-07, + "loss": 0.0423, + "step": 800 + }, + { + "epoch": 0.013253702037143092, + "grad_norm": 0.8808040022850037, + "learning_rate": 6.626851018571547e-07, + "loss": 0.031, + "step": 810 + }, + { + "epoch": 0.013417327988218932, + "grad_norm": 1.0718401670455933, + "learning_rate": 6.708663994109467e-07, + "loss": 0.0448, + "step": 820 + }, + { + "epoch": 0.013580953939294772, + "grad_norm": 1.5772994756698608, + "learning_rate": 6.790476969647386e-07, + "loss": 0.0388, + "step": 830 + }, + { + "epoch": 0.013744579890370612, + "grad_norm": 0.9327505826950073, + "learning_rate": 6.872289945185307e-07, + "loss": 0.0381, + "step": 840 + }, + { + "epoch": 0.013908205841446454, + "grad_norm": 1.0323652029037476, + "learning_rate": 6.954102920723226e-07, + "loss": 0.0345, + "step": 850 + }, + { + "epoch": 0.014071831792522294, + "grad_norm": 1.0653870105743408, + "learning_rate": 7.035915896261147e-07, + "loss": 0.0364, + "step": 860 + }, + { + "epoch": 0.014235457743598134, + "grad_norm": 0.8184215426445007, + "learning_rate": 7.117728871799067e-07, + "loss": 0.0324, + "step": 870 + }, + { + "epoch": 0.014399083694673975, + "grad_norm": 1.2273850440979004, + "learning_rate": 7.199541847336988e-07, + "loss": 0.0419, + "step": 880 + }, + { + "epoch": 0.014562709645749817, + "grad_norm": 0.9224192500114441, + "learning_rate": 7.281354822874908e-07, + "loss": 0.0267, + "step": 890 + }, + { + "epoch": 0.014726335596825657, + "grad_norm": 1.046655535697937, + "learning_rate": 7.363167798412829e-07, + "loss": 0.0353, + "step": 900 + }, + { + "epoch": 0.014889961547901497, + "grad_norm": 0.9340174198150635, + "learning_rate": 7.44498077395075e-07, + "loss": 0.0289, + "step": 910 + }, + { + "epoch": 0.015053587498977337, + "grad_norm": 1.1622369289398193, + "learning_rate": 7.52679374948867e-07, + "loss": 0.0321, + "step": 920 + }, + { + "epoch": 0.015217213450053179, + "grad_norm": 0.6142399311065674, + "learning_rate": 7.60860672502659e-07, + "loss": 0.0304, + "step": 930 + }, + { + "epoch": 0.01538083940112902, + "grad_norm": 1.0446439981460571, + "learning_rate": 7.69041970056451e-07, + "loss": 0.0312, + "step": 940 + }, + { + "epoch": 0.01554446535220486, + "grad_norm": 0.8211088180541992, + "learning_rate": 7.77223267610243e-07, + "loss": 0.0312, + "step": 950 + }, + { + "epoch": 0.0157080913032807, + "grad_norm": 1.1001378297805786, + "learning_rate": 7.85404565164035e-07, + "loss": 0.034, + "step": 960 + }, + { + "epoch": 0.01587171725435654, + "grad_norm": 1.4075452089309692, + "learning_rate": 7.93585862717827e-07, + "loss": 0.0216, + "step": 970 + }, + { + "epoch": 0.01603534320543238, + "grad_norm": 1.2252289056777954, + "learning_rate": 8.017671602716191e-07, + "loss": 0.0262, + "step": 980 + }, + { + "epoch": 0.016198969156508224, + "grad_norm": 0.9565718173980713, + "learning_rate": 8.099484578254111e-07, + "loss": 0.0422, + "step": 990 + }, + { + "epoch": 0.016362595107584064, + "grad_norm": 1.0116455554962158, + "learning_rate": 8.181297553792032e-07, + "loss": 0.022, + "step": 1000 + }, + { + "epoch": 0.016526221058659904, + "grad_norm": 1.3713945150375366, + "learning_rate": 8.263110529329952e-07, + "loss": 0.0281, + "step": 1010 + }, + { + "epoch": 0.016689847009735744, + "grad_norm": 0.9518176913261414, + "learning_rate": 8.344923504867873e-07, + "loss": 0.0307, + "step": 1020 + }, + { + "epoch": 0.016853472960811584, + "grad_norm": 1.0284076929092407, + "learning_rate": 8.426736480405793e-07, + "loss": 0.0325, + "step": 1030 + }, + { + "epoch": 0.017017098911887425, + "grad_norm": 0.9852408766746521, + "learning_rate": 8.508549455943714e-07, + "loss": 0.0341, + "step": 1040 + }, + { + "epoch": 0.017180724862963265, + "grad_norm": 1.060655117034912, + "learning_rate": 8.590362431481633e-07, + "loss": 0.0347, + "step": 1050 + }, + { + "epoch": 0.017344350814039105, + "grad_norm": 1.2484509944915771, + "learning_rate": 8.672175407019554e-07, + "loss": 0.0271, + "step": 1060 + }, + { + "epoch": 0.01750797676511495, + "grad_norm": 1.2672250270843506, + "learning_rate": 8.753988382557474e-07, + "loss": 0.0318, + "step": 1070 + }, + { + "epoch": 0.01767160271619079, + "grad_norm": 0.7602401971817017, + "learning_rate": 8.835801358095394e-07, + "loss": 0.0284, + "step": 1080 + }, + { + "epoch": 0.01783522866726663, + "grad_norm": 1.3558201789855957, + "learning_rate": 8.917614333633314e-07, + "loss": 0.0273, + "step": 1090 + }, + { + "epoch": 0.01799885461834247, + "grad_norm": 0.8952217698097229, + "learning_rate": 8.999427309171235e-07, + "loss": 0.0368, + "step": 1100 + }, + { + "epoch": 0.01816248056941831, + "grad_norm": 1.0355019569396973, + "learning_rate": 9.081240284709155e-07, + "loss": 0.0248, + "step": 1110 + }, + { + "epoch": 0.01832610652049415, + "grad_norm": 0.6996469497680664, + "learning_rate": 9.163053260247076e-07, + "loss": 0.0224, + "step": 1120 + }, + { + "epoch": 0.01848973247156999, + "grad_norm": 1.456304907798767, + "learning_rate": 9.244866235784996e-07, + "loss": 0.0305, + "step": 1130 + }, + { + "epoch": 0.018653358422645833, + "grad_norm": 0.8071394562721252, + "learning_rate": 9.326679211322917e-07, + "loss": 0.0221, + "step": 1140 + }, + { + "epoch": 0.018816984373721674, + "grad_norm": 1.0477321147918701, + "learning_rate": 9.408492186860837e-07, + "loss": 0.0299, + "step": 1150 + }, + { + "epoch": 0.018980610324797514, + "grad_norm": 0.8900778293609619, + "learning_rate": 9.490305162398757e-07, + "loss": 0.0211, + "step": 1160 + }, + { + "epoch": 0.019144236275873354, + "grad_norm": 1.1308872699737549, + "learning_rate": 9.572118137936676e-07, + "loss": 0.0215, + "step": 1170 + }, + { + "epoch": 0.019307862226949194, + "grad_norm": 0.9169854521751404, + "learning_rate": 9.653931113474597e-07, + "loss": 0.0232, + "step": 1180 + }, + { + "epoch": 0.019471488178025034, + "grad_norm": 0.8490586876869202, + "learning_rate": 9.735744089012517e-07, + "loss": 0.0182, + "step": 1190 + }, + { + "epoch": 0.019635114129100874, + "grad_norm": 0.9158782958984375, + "learning_rate": 9.817557064550438e-07, + "loss": 0.0335, + "step": 1200 + }, + { + "epoch": 0.019798740080176715, + "grad_norm": 0.9684118628501892, + "learning_rate": 9.899370040088358e-07, + "loss": 0.0258, + "step": 1210 + }, + { + "epoch": 0.01996236603125256, + "grad_norm": 0.8719280362129211, + "learning_rate": 9.98118301562628e-07, + "loss": 0.0223, + "step": 1220 + }, + { + "epoch": 0.0201259919823284, + "grad_norm": 0.8022724986076355, + "learning_rate": 1.00629959911642e-06, + "loss": 0.0244, + "step": 1230 + }, + { + "epoch": 0.02028961793340424, + "grad_norm": 1.1774309873580933, + "learning_rate": 1.014480896670212e-06, + "loss": 0.025, + "step": 1240 + }, + { + "epoch": 0.02045324388448008, + "grad_norm": 0.7491937875747681, + "learning_rate": 1.022662194224004e-06, + "loss": 0.0246, + "step": 1250 + }, + { + "epoch": 0.02061686983555592, + "grad_norm": 0.7116556763648987, + "learning_rate": 1.030843491777796e-06, + "loss": 0.0275, + "step": 1260 + }, + { + "epoch": 0.02078049578663176, + "grad_norm": 0.9816871881484985, + "learning_rate": 1.039024789331588e-06, + "loss": 0.0247, + "step": 1270 + }, + { + "epoch": 0.0209441217377076, + "grad_norm": 0.6181216835975647, + "learning_rate": 1.04720608688538e-06, + "loss": 0.0235, + "step": 1280 + }, + { + "epoch": 0.02110774768878344, + "grad_norm": 1.024975299835205, + "learning_rate": 1.055387384439172e-06, + "loss": 0.0277, + "step": 1290 + }, + { + "epoch": 0.021271373639859283, + "grad_norm": 1.5072187185287476, + "learning_rate": 1.063568681992964e-06, + "loss": 0.0258, + "step": 1300 + }, + { + "epoch": 0.021434999590935123, + "grad_norm": 0.7752158641815186, + "learning_rate": 1.0717499795467561e-06, + "loss": 0.0235, + "step": 1310 + }, + { + "epoch": 0.021598625542010964, + "grad_norm": 1.258660912513733, + "learning_rate": 1.0799312771005482e-06, + "loss": 0.0255, + "step": 1320 + }, + { + "epoch": 0.021762251493086804, + "grad_norm": 0.997409999370575, + "learning_rate": 1.0881125746543402e-06, + "loss": 0.0181, + "step": 1330 + }, + { + "epoch": 0.021925877444162644, + "grad_norm": 1.1911503076553345, + "learning_rate": 1.0962938722081323e-06, + "loss": 0.0189, + "step": 1340 + }, + { + "epoch": 0.022089503395238484, + "grad_norm": 0.8701474070549011, + "learning_rate": 1.1044751697619243e-06, + "loss": 0.0257, + "step": 1350 + }, + { + "epoch": 0.022253129346314324, + "grad_norm": 0.8997549414634705, + "learning_rate": 1.1126564673157164e-06, + "loss": 0.0246, + "step": 1360 + }, + { + "epoch": 0.022416755297390165, + "grad_norm": 1.4192978143692017, + "learning_rate": 1.1208377648695084e-06, + "loss": 0.0306, + "step": 1370 + }, + { + "epoch": 0.022580381248466008, + "grad_norm": 0.9754282832145691, + "learning_rate": 1.1290190624233003e-06, + "loss": 0.0278, + "step": 1380 + }, + { + "epoch": 0.02274400719954185, + "grad_norm": 0.9182608127593994, + "learning_rate": 1.1372003599770923e-06, + "loss": 0.0223, + "step": 1390 + }, + { + "epoch": 0.02290763315061769, + "grad_norm": 0.9532468914985657, + "learning_rate": 1.1453816575308844e-06, + "loss": 0.0249, + "step": 1400 + }, + { + "epoch": 0.02307125910169353, + "grad_norm": 0.8933977484703064, + "learning_rate": 1.1535629550846764e-06, + "loss": 0.0257, + "step": 1410 + }, + { + "epoch": 0.02323488505276937, + "grad_norm": 0.9456914067268372, + "learning_rate": 1.1617442526384685e-06, + "loss": 0.0256, + "step": 1420 + }, + { + "epoch": 0.02339851100384521, + "grad_norm": 0.9724444150924683, + "learning_rate": 1.1699255501922605e-06, + "loss": 0.0245, + "step": 1430 + }, + { + "epoch": 0.02356213695492105, + "grad_norm": 0.8060034513473511, + "learning_rate": 1.1781068477460526e-06, + "loss": 0.0288, + "step": 1440 + }, + { + "epoch": 0.02372576290599689, + "grad_norm": 0.8337758779525757, + "learning_rate": 1.1862881452998446e-06, + "loss": 0.0209, + "step": 1450 + }, + { + "epoch": 0.023889388857072733, + "grad_norm": 0.6603108048439026, + "learning_rate": 1.1944694428536367e-06, + "loss": 0.024, + "step": 1460 + }, + { + "epoch": 0.024053014808148573, + "grad_norm": 1.0457243919372559, + "learning_rate": 1.2026507404074287e-06, + "loss": 0.0168, + "step": 1470 + }, + { + "epoch": 0.024216640759224414, + "grad_norm": 0.9682090878486633, + "learning_rate": 1.2108320379612208e-06, + "loss": 0.024, + "step": 1480 + }, + { + "epoch": 0.024380266710300254, + "grad_norm": 0.7535257339477539, + "learning_rate": 1.2190133355150126e-06, + "loss": 0.0189, + "step": 1490 + }, + { + "epoch": 0.024543892661376094, + "grad_norm": 0.6845926642417908, + "learning_rate": 1.2271946330688047e-06, + "loss": 0.0194, + "step": 1500 + }, + { + "epoch": 0.024707518612451934, + "grad_norm": 0.9407981038093567, + "learning_rate": 1.2353759306225967e-06, + "loss": 0.019, + "step": 1510 + }, + { + "epoch": 0.024871144563527774, + "grad_norm": 0.97010737657547, + "learning_rate": 1.2435572281763888e-06, + "loss": 0.0222, + "step": 1520 + }, + { + "epoch": 0.025034770514603615, + "grad_norm": 1.3376246690750122, + "learning_rate": 1.2517385257301808e-06, + "loss": 0.028, + "step": 1530 + }, + { + "epoch": 0.025198396465679458, + "grad_norm": 0.6298123002052307, + "learning_rate": 1.2599198232839729e-06, + "loss": 0.0315, + "step": 1540 + }, + { + "epoch": 0.0253620224167553, + "grad_norm": 0.7916223406791687, + "learning_rate": 1.268101120837765e-06, + "loss": 0.0243, + "step": 1550 + }, + { + "epoch": 0.02552564836783114, + "grad_norm": 0.7279757857322693, + "learning_rate": 1.276282418391557e-06, + "loss": 0.0192, + "step": 1560 + }, + { + "epoch": 0.02568927431890698, + "grad_norm": 0.5921642780303955, + "learning_rate": 1.284463715945349e-06, + "loss": 0.0198, + "step": 1570 + }, + { + "epoch": 0.02585290026998282, + "grad_norm": 0.6908919811248779, + "learning_rate": 1.292645013499141e-06, + "loss": 0.0188, + "step": 1580 + }, + { + "epoch": 0.02601652622105866, + "grad_norm": 0.7825183272361755, + "learning_rate": 1.3008263110529331e-06, + "loss": 0.0233, + "step": 1590 + }, + { + "epoch": 0.0261801521721345, + "grad_norm": 0.5276881456375122, + "learning_rate": 1.3090076086067252e-06, + "loss": 0.024, + "step": 1600 + }, + { + "epoch": 0.02634377812321034, + "grad_norm": 0.7823368310928345, + "learning_rate": 1.3171889061605172e-06, + "loss": 0.021, + "step": 1610 + }, + { + "epoch": 0.026507404074286183, + "grad_norm": 0.531443178653717, + "learning_rate": 1.3253702037143093e-06, + "loss": 0.0189, + "step": 1620 + }, + { + "epoch": 0.026671030025362023, + "grad_norm": 0.7160040140151978, + "learning_rate": 1.3335515012681014e-06, + "loss": 0.0142, + "step": 1630 + }, + { + "epoch": 0.026834655976437864, + "grad_norm": 0.945673406124115, + "learning_rate": 1.3417327988218934e-06, + "loss": 0.019, + "step": 1640 + }, + { + "epoch": 0.026998281927513704, + "grad_norm": 1.1356985569000244, + "learning_rate": 1.3499140963756852e-06, + "loss": 0.0231, + "step": 1650 + }, + { + "epoch": 0.027161907878589544, + "grad_norm": 0.8492401838302612, + "learning_rate": 1.3580953939294773e-06, + "loss": 0.0236, + "step": 1660 + }, + { + "epoch": 0.027325533829665384, + "grad_norm": 0.721347987651825, + "learning_rate": 1.3662766914832693e-06, + "loss": 0.0204, + "step": 1670 + }, + { + "epoch": 0.027489159780741224, + "grad_norm": 0.9221570491790771, + "learning_rate": 1.3744579890370614e-06, + "loss": 0.0233, + "step": 1680 + }, + { + "epoch": 0.027652785731817068, + "grad_norm": 0.7661713361740112, + "learning_rate": 1.3826392865908532e-06, + "loss": 0.0211, + "step": 1690 + }, + { + "epoch": 0.027816411682892908, + "grad_norm": 0.9114108085632324, + "learning_rate": 1.3908205841446453e-06, + "loss": 0.0208, + "step": 1700 + }, + { + "epoch": 0.02798003763396875, + "grad_norm": 0.7525148391723633, + "learning_rate": 1.3990018816984373e-06, + "loss": 0.0227, + "step": 1710 + }, + { + "epoch": 0.02814366358504459, + "grad_norm": 0.7764679789543152, + "learning_rate": 1.4071831792522294e-06, + "loss": 0.0264, + "step": 1720 + }, + { + "epoch": 0.02830728953612043, + "grad_norm": 0.6332064867019653, + "learning_rate": 1.4153644768060214e-06, + "loss": 0.0171, + "step": 1730 + }, + { + "epoch": 0.02847091548719627, + "grad_norm": 0.5143387317657471, + "learning_rate": 1.4235457743598135e-06, + "loss": 0.0156, + "step": 1740 + }, + { + "epoch": 0.02863454143827211, + "grad_norm": 0.7839280962944031, + "learning_rate": 1.4317270719136055e-06, + "loss": 0.0161, + "step": 1750 + }, + { + "epoch": 0.02879816738934795, + "grad_norm": 0.7096639275550842, + "learning_rate": 1.4399083694673976e-06, + "loss": 0.0206, + "step": 1760 + }, + { + "epoch": 0.028961793340423793, + "grad_norm": 0.9041060209274292, + "learning_rate": 1.4480896670211896e-06, + "loss": 0.0264, + "step": 1770 + }, + { + "epoch": 0.029125419291499633, + "grad_norm": 0.6815425157546997, + "learning_rate": 1.4562709645749817e-06, + "loss": 0.0221, + "step": 1780 + }, + { + "epoch": 0.029289045242575473, + "grad_norm": 1.123404622077942, + "learning_rate": 1.4644522621287737e-06, + "loss": 0.0183, + "step": 1790 + }, + { + "epoch": 0.029452671193651313, + "grad_norm": 0.8858315944671631, + "learning_rate": 1.4726335596825658e-06, + "loss": 0.0203, + "step": 1800 + }, + { + "epoch": 0.029616297144727154, + "grad_norm": 0.8124191761016846, + "learning_rate": 1.4808148572363578e-06, + "loss": 0.0118, + "step": 1810 + }, + { + "epoch": 0.029779923095802994, + "grad_norm": 0.8867793679237366, + "learning_rate": 1.48899615479015e-06, + "loss": 0.0133, + "step": 1820 + }, + { + "epoch": 0.029943549046878834, + "grad_norm": 0.7105574607849121, + "learning_rate": 1.497177452343942e-06, + "loss": 0.024, + "step": 1830 + }, + { + "epoch": 0.030107174997954674, + "grad_norm": 0.9919645190238953, + "learning_rate": 1.505358749897734e-06, + "loss": 0.016, + "step": 1840 + }, + { + "epoch": 0.030270800949030518, + "grad_norm": 0.49908286333084106, + "learning_rate": 1.513540047451526e-06, + "loss": 0.0201, + "step": 1850 + }, + { + "epoch": 0.030434426900106358, + "grad_norm": 0.7252497673034668, + "learning_rate": 1.521721345005318e-06, + "loss": 0.0179, + "step": 1860 + }, + { + "epoch": 0.030598052851182198, + "grad_norm": 0.666792631149292, + "learning_rate": 1.52990264255911e-06, + "loss": 0.0267, + "step": 1870 + }, + { + "epoch": 0.03076167880225804, + "grad_norm": 1.1081174612045288, + "learning_rate": 1.538083940112902e-06, + "loss": 0.0242, + "step": 1880 + }, + { + "epoch": 0.03092530475333388, + "grad_norm": 0.8116829991340637, + "learning_rate": 1.546265237666694e-06, + "loss": 0.0182, + "step": 1890 + }, + { + "epoch": 0.03108893070440972, + "grad_norm": 0.7736301422119141, + "learning_rate": 1.554446535220486e-06, + "loss": 0.0178, + "step": 1900 + }, + { + "epoch": 0.03125255665548556, + "grad_norm": 0.6938651204109192, + "learning_rate": 1.5626278327742781e-06, + "loss": 0.0195, + "step": 1910 + }, + { + "epoch": 0.0314161826065614, + "grad_norm": 0.9768052101135254, + "learning_rate": 1.57080913032807e-06, + "loss": 0.0177, + "step": 1920 + }, + { + "epoch": 0.03157980855763724, + "grad_norm": 0.8962604999542236, + "learning_rate": 1.578990427881862e-06, + "loss": 0.017, + "step": 1930 + }, + { + "epoch": 0.03174343450871308, + "grad_norm": 0.7683470845222473, + "learning_rate": 1.587171725435654e-06, + "loss": 0.017, + "step": 1940 + }, + { + "epoch": 0.03190706045978892, + "grad_norm": 0.8589656352996826, + "learning_rate": 1.5953530229894461e-06, + "loss": 0.0134, + "step": 1950 + }, + { + "epoch": 0.03207068641086476, + "grad_norm": 0.47683417797088623, + "learning_rate": 1.6035343205432382e-06, + "loss": 0.0209, + "step": 1960 + }, + { + "epoch": 0.032234312361940604, + "grad_norm": 0.7166604399681091, + "learning_rate": 1.6117156180970302e-06, + "loss": 0.0147, + "step": 1970 + }, + { + "epoch": 0.03239793831301645, + "grad_norm": 0.7298545241355896, + "learning_rate": 1.6198969156508223e-06, + "loss": 0.0213, + "step": 1980 + }, + { + "epoch": 0.032561564264092284, + "grad_norm": 0.741427481174469, + "learning_rate": 1.6280782132046143e-06, + "loss": 0.0178, + "step": 1990 + }, + { + "epoch": 0.03272519021516813, + "grad_norm": 0.6702003479003906, + "learning_rate": 1.6362595107584064e-06, + "loss": 0.019, + "step": 2000 + }, + { + "epoch": 0.032888816166243964, + "grad_norm": 1.0223054885864258, + "learning_rate": 1.6444408083121984e-06, + "loss": 0.0219, + "step": 2010 + }, + { + "epoch": 0.03305244211731981, + "grad_norm": 0.7337533831596375, + "learning_rate": 1.6526221058659905e-06, + "loss": 0.0215, + "step": 2020 + }, + { + "epoch": 0.033216068068395645, + "grad_norm": 0.698671281337738, + "learning_rate": 1.6608034034197825e-06, + "loss": 0.0151, + "step": 2030 + }, + { + "epoch": 0.03337969401947149, + "grad_norm": 0.8325813412666321, + "learning_rate": 1.6689847009735746e-06, + "loss": 0.0161, + "step": 2040 + }, + { + "epoch": 0.03354331997054733, + "grad_norm": 0.7636258006095886, + "learning_rate": 1.6771659985273666e-06, + "loss": 0.0211, + "step": 2050 + }, + { + "epoch": 0.03370694592162317, + "grad_norm": 0.6166092157363892, + "learning_rate": 1.6853472960811587e-06, + "loss": 0.0145, + "step": 2060 + }, + { + "epoch": 0.03387057187269901, + "grad_norm": 0.47421813011169434, + "learning_rate": 1.6935285936349507e-06, + "loss": 0.0159, + "step": 2070 + }, + { + "epoch": 0.03403419782377485, + "grad_norm": 0.627687394618988, + "learning_rate": 1.7017098911887428e-06, + "loss": 0.0153, + "step": 2080 + }, + { + "epoch": 0.03419782377485069, + "grad_norm": 0.5683699250221252, + "learning_rate": 1.7098911887425346e-06, + "loss": 0.0152, + "step": 2090 + }, + { + "epoch": 0.03436144972592653, + "grad_norm": 0.702182948589325, + "learning_rate": 1.7180724862963267e-06, + "loss": 0.0147, + "step": 2100 + }, + { + "epoch": 0.03452507567700237, + "grad_norm": 0.6274211406707764, + "learning_rate": 1.7262537838501187e-06, + "loss": 0.0295, + "step": 2110 + }, + { + "epoch": 0.03468870162807821, + "grad_norm": 0.6848294734954834, + "learning_rate": 1.7344350814039108e-06, + "loss": 0.0156, + "step": 2120 + }, + { + "epoch": 0.034852327579154053, + "grad_norm": 0.7503065466880798, + "learning_rate": 1.7426163789577028e-06, + "loss": 0.0205, + "step": 2130 + }, + { + "epoch": 0.0350159535302299, + "grad_norm": 0.3985742926597595, + "learning_rate": 1.750797676511495e-06, + "loss": 0.0133, + "step": 2140 + }, + { + "epoch": 0.035179579481305734, + "grad_norm": 0.5512229204177856, + "learning_rate": 1.7589789740652867e-06, + "loss": 0.022, + "step": 2150 + }, + { + "epoch": 0.03534320543238158, + "grad_norm": 0.3374076783657074, + "learning_rate": 1.7671602716190788e-06, + "loss": 0.0163, + "step": 2160 + }, + { + "epoch": 0.035506831383457414, + "grad_norm": 0.680812418460846, + "learning_rate": 1.7753415691728708e-06, + "loss": 0.0154, + "step": 2170 + }, + { + "epoch": 0.03567045733453326, + "grad_norm": 0.7778629660606384, + "learning_rate": 1.7835228667266629e-06, + "loss": 0.0181, + "step": 2180 + }, + { + "epoch": 0.035834083285609095, + "grad_norm": 0.9291898012161255, + "learning_rate": 1.791704164280455e-06, + "loss": 0.0167, + "step": 2190 + }, + { + "epoch": 0.03599770923668494, + "grad_norm": 0.9499353170394897, + "learning_rate": 1.799885461834247e-06, + "loss": 0.0146, + "step": 2200 + }, + { + "epoch": 0.03616133518776078, + "grad_norm": 1.1274068355560303, + "learning_rate": 1.808066759388039e-06, + "loss": 0.0212, + "step": 2210 + }, + { + "epoch": 0.03632496113883662, + "grad_norm": 0.7596690058708191, + "learning_rate": 1.816248056941831e-06, + "loss": 0.017, + "step": 2220 + }, + { + "epoch": 0.03648858708991246, + "grad_norm": 0.9302141070365906, + "learning_rate": 1.8244293544956231e-06, + "loss": 0.0231, + "step": 2230 + }, + { + "epoch": 0.0366522130409883, + "grad_norm": 0.8380283713340759, + "learning_rate": 1.8326106520494152e-06, + "loss": 0.0132, + "step": 2240 + }, + { + "epoch": 0.03681583899206414, + "grad_norm": 0.7264215350151062, + "learning_rate": 1.8407919496032072e-06, + "loss": 0.0175, + "step": 2250 + }, + { + "epoch": 0.03697946494313998, + "grad_norm": 0.6638109683990479, + "learning_rate": 1.8489732471569993e-06, + "loss": 0.0145, + "step": 2260 + }, + { + "epoch": 0.03714309089421582, + "grad_norm": 0.6979623436927795, + "learning_rate": 1.8571545447107913e-06, + "loss": 0.015, + "step": 2270 + }, + { + "epoch": 0.03730671684529167, + "grad_norm": 0.4812341332435608, + "learning_rate": 1.8653358422645834e-06, + "loss": 0.0143, + "step": 2280 + }, + { + "epoch": 0.0374703427963675, + "grad_norm": 0.5599238872528076, + "learning_rate": 1.8735171398183754e-06, + "loss": 0.0131, + "step": 2290 + }, + { + "epoch": 0.03763396874744335, + "grad_norm": 0.9793543815612793, + "learning_rate": 1.8816984373721675e-06, + "loss": 0.0196, + "step": 2300 + }, + { + "epoch": 0.037797594698519184, + "grad_norm": 0.5340746641159058, + "learning_rate": 1.8898797349259593e-06, + "loss": 0.0161, + "step": 2310 + }, + { + "epoch": 0.03796122064959503, + "grad_norm": 0.47447285056114197, + "learning_rate": 1.8980610324797514e-06, + "loss": 0.0147, + "step": 2320 + }, + { + "epoch": 0.038124846600670864, + "grad_norm": 0.9897047877311707, + "learning_rate": 1.9062423300335434e-06, + "loss": 0.0196, + "step": 2330 + }, + { + "epoch": 0.03828847255174671, + "grad_norm": 1.0041579008102417, + "learning_rate": 1.9144236275873353e-06, + "loss": 0.0155, + "step": 2340 + }, + { + "epoch": 0.038452098502822545, + "grad_norm": 0.6637380123138428, + "learning_rate": 1.9226049251411275e-06, + "loss": 0.016, + "step": 2350 + }, + { + "epoch": 0.03861572445389839, + "grad_norm": 0.5131836533546448, + "learning_rate": 1.9307862226949194e-06, + "loss": 0.0147, + "step": 2360 + }, + { + "epoch": 0.03877935040497423, + "grad_norm": 0.7195000052452087, + "learning_rate": 1.9389675202487116e-06, + "loss": 0.0137, + "step": 2370 + }, + { + "epoch": 0.03894297635605007, + "grad_norm": 0.5455965995788574, + "learning_rate": 1.9471488178025035e-06, + "loss": 0.014, + "step": 2380 + }, + { + "epoch": 0.03910660230712591, + "grad_norm": 1.1191215515136719, + "learning_rate": 1.9553301153562957e-06, + "loss": 0.0131, + "step": 2390 + }, + { + "epoch": 0.03927022825820175, + "grad_norm": 0.4611370265483856, + "learning_rate": 1.9635114129100876e-06, + "loss": 0.0111, + "step": 2400 + }, + { + "epoch": 0.03943385420927759, + "grad_norm": 0.7953206896781921, + "learning_rate": 1.97169271046388e-06, + "loss": 0.0131, + "step": 2410 + }, + { + "epoch": 0.03959748016035343, + "grad_norm": 0.5053262710571289, + "learning_rate": 1.9798740080176717e-06, + "loss": 0.0124, + "step": 2420 + }, + { + "epoch": 0.03976110611142927, + "grad_norm": 0.7063040137290955, + "learning_rate": 1.988055305571464e-06, + "loss": 0.0163, + "step": 2430 + }, + { + "epoch": 0.03992473206250512, + "grad_norm": 0.7882359623908997, + "learning_rate": 1.996236603125256e-06, + "loss": 0.0176, + "step": 2440 + }, + { + "epoch": 0.04008835801358095, + "grad_norm": 0.5648556351661682, + "learning_rate": 2.004417900679048e-06, + "loss": 0.0178, + "step": 2450 + }, + { + "epoch": 0.0402519839646568, + "grad_norm": 0.7448477745056152, + "learning_rate": 2.01259919823284e-06, + "loss": 0.0164, + "step": 2460 + }, + { + "epoch": 0.040415609915732634, + "grad_norm": 0.9597516655921936, + "learning_rate": 2.020780495786632e-06, + "loss": 0.0124, + "step": 2470 + }, + { + "epoch": 0.04057923586680848, + "grad_norm": 0.5111032128334045, + "learning_rate": 2.028961793340424e-06, + "loss": 0.0121, + "step": 2480 + }, + { + "epoch": 0.040742861817884314, + "grad_norm": 0.8775847554206848, + "learning_rate": 2.0371430908942163e-06, + "loss": 0.0168, + "step": 2490 + }, + { + "epoch": 0.04090648776896016, + "grad_norm": 0.6906808614730835, + "learning_rate": 2.045324388448008e-06, + "loss": 0.0121, + "step": 2500 + }, + { + "epoch": 0.041070113720035994, + "grad_norm": 0.39874231815338135, + "learning_rate": 2.0535056860018004e-06, + "loss": 0.0159, + "step": 2510 + }, + { + "epoch": 0.04123373967111184, + "grad_norm": 0.9298045039176941, + "learning_rate": 2.061686983555592e-06, + "loss": 0.0147, + "step": 2520 + }, + { + "epoch": 0.04139736562218768, + "grad_norm": 0.5053309798240662, + "learning_rate": 2.069868281109384e-06, + "loss": 0.015, + "step": 2530 + }, + { + "epoch": 0.04156099157326352, + "grad_norm": 0.6905393004417419, + "learning_rate": 2.078049578663176e-06, + "loss": 0.0158, + "step": 2540 + }, + { + "epoch": 0.04172461752433936, + "grad_norm": 0.6831179261207581, + "learning_rate": 2.086230876216968e-06, + "loss": 0.0142, + "step": 2550 + }, + { + "epoch": 0.0418882434754152, + "grad_norm": 0.7343636751174927, + "learning_rate": 2.09441217377076e-06, + "loss": 0.0142, + "step": 2560 + }, + { + "epoch": 0.04205186942649104, + "grad_norm": 0.6231259703636169, + "learning_rate": 2.1025934713245522e-06, + "loss": 0.0103, + "step": 2570 + }, + { + "epoch": 0.04221549537756688, + "grad_norm": 0.9527928233146667, + "learning_rate": 2.110774768878344e-06, + "loss": 0.0105, + "step": 2580 + }, + { + "epoch": 0.04237912132864272, + "grad_norm": 0.40905502438545227, + "learning_rate": 2.1189560664321363e-06, + "loss": 0.0151, + "step": 2590 + }, + { + "epoch": 0.04254274727971857, + "grad_norm": 0.21875037252902985, + "learning_rate": 2.127137363985928e-06, + "loss": 0.0108, + "step": 2600 + }, + { + "epoch": 0.0427063732307944, + "grad_norm": 0.22272948920726776, + "learning_rate": 2.1353186615397204e-06, + "loss": 0.0127, + "step": 2610 + }, + { + "epoch": 0.04286999918187025, + "grad_norm": 1.1066583395004272, + "learning_rate": 2.1434999590935123e-06, + "loss": 0.011, + "step": 2620 + }, + { + "epoch": 0.043033625132946084, + "grad_norm": 0.6232113838195801, + "learning_rate": 2.1516812566473045e-06, + "loss": 0.012, + "step": 2630 + }, + { + "epoch": 0.04319725108402193, + "grad_norm": 0.7276982069015503, + "learning_rate": 2.1598625542010964e-06, + "loss": 0.0156, + "step": 2640 + }, + { + "epoch": 0.043360877035097764, + "grad_norm": 0.773159384727478, + "learning_rate": 2.1680438517548887e-06, + "loss": 0.012, + "step": 2650 + }, + { + "epoch": 0.04352450298617361, + "grad_norm": 0.5005454421043396, + "learning_rate": 2.1762251493086805e-06, + "loss": 0.0161, + "step": 2660 + }, + { + "epoch": 0.043688128937249444, + "grad_norm": 0.5888230800628662, + "learning_rate": 2.1844064468624728e-06, + "loss": 0.0111, + "step": 2670 + }, + { + "epoch": 0.04385175488832529, + "grad_norm": 0.7097195386886597, + "learning_rate": 2.1925877444162646e-06, + "loss": 0.0113, + "step": 2680 + }, + { + "epoch": 0.04401538083940113, + "grad_norm": 0.4649309813976288, + "learning_rate": 2.200769041970057e-06, + "loss": 0.0137, + "step": 2690 + }, + { + "epoch": 0.04417900679047697, + "grad_norm": 0.5561385154724121, + "learning_rate": 2.2089503395238487e-06, + "loss": 0.0142, + "step": 2700 + }, + { + "epoch": 0.04434263274155281, + "grad_norm": 0.6173912286758423, + "learning_rate": 2.217131637077641e-06, + "loss": 0.0141, + "step": 2710 + }, + { + "epoch": 0.04450625869262865, + "grad_norm": 0.38627490401268005, + "learning_rate": 2.225312934631433e-06, + "loss": 0.0119, + "step": 2720 + }, + { + "epoch": 0.04466988464370449, + "grad_norm": 0.4352302551269531, + "learning_rate": 2.233494232185225e-06, + "loss": 0.013, + "step": 2730 + }, + { + "epoch": 0.04483351059478033, + "grad_norm": 0.5472245812416077, + "learning_rate": 2.241675529739017e-06, + "loss": 0.0117, + "step": 2740 + }, + { + "epoch": 0.04499713654585617, + "grad_norm": 0.8657650947570801, + "learning_rate": 2.2498568272928087e-06, + "loss": 0.0168, + "step": 2750 + }, + { + "epoch": 0.045160762496932016, + "grad_norm": 0.4442863166332245, + "learning_rate": 2.2580381248466006e-06, + "loss": 0.0105, + "step": 2760 + }, + { + "epoch": 0.04532438844800785, + "grad_norm": 0.6586319804191589, + "learning_rate": 2.266219422400393e-06, + "loss": 0.0112, + "step": 2770 + }, + { + "epoch": 0.0454880143990837, + "grad_norm": 1.1256358623504639, + "learning_rate": 2.2744007199541847e-06, + "loss": 0.019, + "step": 2780 + }, + { + "epoch": 0.045651640350159534, + "grad_norm": 0.8728662729263306, + "learning_rate": 2.282582017507977e-06, + "loss": 0.0131, + "step": 2790 + }, + { + "epoch": 0.04581526630123538, + "grad_norm": 0.5684436559677124, + "learning_rate": 2.2907633150617688e-06, + "loss": 0.0133, + "step": 2800 + }, + { + "epoch": 0.045978892252311214, + "grad_norm": 0.41163626313209534, + "learning_rate": 2.298944612615561e-06, + "loss": 0.0149, + "step": 2810 + }, + { + "epoch": 0.04614251820338706, + "grad_norm": 0.318732351064682, + "learning_rate": 2.307125910169353e-06, + "loss": 0.0105, + "step": 2820 + }, + { + "epoch": 0.0463061441544629, + "grad_norm": 0.47379904985427856, + "learning_rate": 2.315307207723145e-06, + "loss": 0.0128, + "step": 2830 + }, + { + "epoch": 0.04646977010553874, + "grad_norm": 0.5605900287628174, + "learning_rate": 2.323488505276937e-06, + "loss": 0.0126, + "step": 2840 + }, + { + "epoch": 0.04663339605661458, + "grad_norm": 0.6339355111122131, + "learning_rate": 2.3316698028307292e-06, + "loss": 0.0156, + "step": 2850 + }, + { + "epoch": 0.04679702200769042, + "grad_norm": 0.4461880028247833, + "learning_rate": 2.339851100384521e-06, + "loss": 0.0133, + "step": 2860 + }, + { + "epoch": 0.04696064795876626, + "grad_norm": 0.493367075920105, + "learning_rate": 2.3480323979383133e-06, + "loss": 0.0141, + "step": 2870 + }, + { + "epoch": 0.0471242739098421, + "grad_norm": 0.7958556413650513, + "learning_rate": 2.356213695492105e-06, + "loss": 0.018, + "step": 2880 + }, + { + "epoch": 0.04728789986091794, + "grad_norm": 0.7448601126670837, + "learning_rate": 2.3643949930458975e-06, + "loss": 0.0121, + "step": 2890 + }, + { + "epoch": 0.04745152581199378, + "grad_norm": 0.20599818229675293, + "learning_rate": 2.3725762905996893e-06, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.04761515176306962, + "grad_norm": 0.5559225678443909, + "learning_rate": 2.3807575881534816e-06, + "loss": 0.0142, + "step": 2910 + }, + { + "epoch": 0.047778777714145466, + "grad_norm": 0.8967880606651306, + "learning_rate": 2.3889388857072734e-06, + "loss": 0.0159, + "step": 2920 + }, + { + "epoch": 0.0479424036652213, + "grad_norm": 0.28435468673706055, + "learning_rate": 2.3971201832610657e-06, + "loss": 0.0113, + "step": 2930 + }, + { + "epoch": 0.04810602961629715, + "grad_norm": 0.4912911355495453, + "learning_rate": 2.4053014808148575e-06, + "loss": 0.0149, + "step": 2940 + }, + { + "epoch": 0.048269655567372984, + "grad_norm": 0.6454976797103882, + "learning_rate": 2.4134827783686498e-06, + "loss": 0.0137, + "step": 2950 + }, + { + "epoch": 0.04843328151844883, + "grad_norm": 0.4994121491909027, + "learning_rate": 2.4216640759224416e-06, + "loss": 0.0112, + "step": 2960 + }, + { + "epoch": 0.048596907469524664, + "grad_norm": 0.6314337849617004, + "learning_rate": 2.4298453734762334e-06, + "loss": 0.0113, + "step": 2970 + }, + { + "epoch": 0.04876053342060051, + "grad_norm": 0.4771454334259033, + "learning_rate": 2.4380266710300253e-06, + "loss": 0.0132, + "step": 2980 + }, + { + "epoch": 0.04892415937167635, + "grad_norm": 0.4665985703468323, + "learning_rate": 2.4462079685838175e-06, + "loss": 0.0112, + "step": 2990 + }, + { + "epoch": 0.04908778532275219, + "grad_norm": 0.7331014275550842, + "learning_rate": 2.4543892661376094e-06, + "loss": 0.0122, + "step": 3000 + }, + { + "epoch": 0.04925141127382803, + "grad_norm": 0.614673376083374, + "learning_rate": 2.4625705636914016e-06, + "loss": 0.014, + "step": 3010 + }, + { + "epoch": 0.04941503722490387, + "grad_norm": 0.693922758102417, + "learning_rate": 2.4707518612451935e-06, + "loss": 0.0107, + "step": 3020 + }, + { + "epoch": 0.04957866317597971, + "grad_norm": 0.9129398465156555, + "learning_rate": 2.4789331587989857e-06, + "loss": 0.0088, + "step": 3030 + }, + { + "epoch": 0.04974228912705555, + "grad_norm": 0.4695659279823303, + "learning_rate": 2.4871144563527776e-06, + "loss": 0.0153, + "step": 3040 + }, + { + "epoch": 0.04990591507813139, + "grad_norm": 0.41599157452583313, + "learning_rate": 2.49529575390657e-06, + "loss": 0.0128, + "step": 3050 + }, + { + "epoch": 0.05006954102920723, + "grad_norm": 0.6311373710632324, + "learning_rate": 2.5034770514603617e-06, + "loss": 0.0157, + "step": 3060 + }, + { + "epoch": 0.05023316698028307, + "grad_norm": 1.2424720525741577, + "learning_rate": 2.511658349014154e-06, + "loss": 0.0098, + "step": 3070 + }, + { + "epoch": 0.050396792931358916, + "grad_norm": 1.679995059967041, + "learning_rate": 2.5198396465679458e-06, + "loss": 0.0155, + "step": 3080 + }, + { + "epoch": 0.05056041888243475, + "grad_norm": 0.5821516513824463, + "learning_rate": 2.528020944121738e-06, + "loss": 0.0125, + "step": 3090 + }, + { + "epoch": 0.0507240448335106, + "grad_norm": 0.8905674815177917, + "learning_rate": 2.53620224167553e-06, + "loss": 0.0114, + "step": 3100 + }, + { + "epoch": 0.05088767078458643, + "grad_norm": 0.78466796875, + "learning_rate": 2.544383539229322e-06, + "loss": 0.0112, + "step": 3110 + }, + { + "epoch": 0.05105129673566228, + "grad_norm": 0.8018458485603333, + "learning_rate": 2.552564836783114e-06, + "loss": 0.0145, + "step": 3120 + }, + { + "epoch": 0.051214922686738114, + "grad_norm": 0.6496682167053223, + "learning_rate": 2.5607461343369063e-06, + "loss": 0.0116, + "step": 3130 + }, + { + "epoch": 0.05137854863781396, + "grad_norm": 0.3492189049720764, + "learning_rate": 2.568927431890698e-06, + "loss": 0.0098, + "step": 3140 + }, + { + "epoch": 0.0515421745888898, + "grad_norm": 0.5741250514984131, + "learning_rate": 2.5771087294444904e-06, + "loss": 0.0122, + "step": 3150 + }, + { + "epoch": 0.05170580053996564, + "grad_norm": 0.42342206835746765, + "learning_rate": 2.585290026998282e-06, + "loss": 0.0135, + "step": 3160 + }, + { + "epoch": 0.05186942649104148, + "grad_norm": 0.4527572691440582, + "learning_rate": 2.5934713245520745e-06, + "loss": 0.0091, + "step": 3170 + }, + { + "epoch": 0.05203305244211732, + "grad_norm": 0.6240607500076294, + "learning_rate": 2.6016526221058663e-06, + "loss": 0.0151, + "step": 3180 + }, + { + "epoch": 0.05219667839319316, + "grad_norm": 0.4512414038181305, + "learning_rate": 2.6098339196596586e-06, + "loss": 0.0122, + "step": 3190 + }, + { + "epoch": 0.052360304344269, + "grad_norm": 0.8511449098587036, + "learning_rate": 2.6180152172134504e-06, + "loss": 0.0139, + "step": 3200 + }, + { + "epoch": 0.05252393029534484, + "grad_norm": 0.5736969113349915, + "learning_rate": 2.6261965147672427e-06, + "loss": 0.0124, + "step": 3210 + }, + { + "epoch": 0.05268755624642068, + "grad_norm": 0.6660225987434387, + "learning_rate": 2.6343778123210345e-06, + "loss": 0.0135, + "step": 3220 + }, + { + "epoch": 0.05285118219749652, + "grad_norm": 0.5948995351791382, + "learning_rate": 2.6425591098748268e-06, + "loss": 0.0125, + "step": 3230 + }, + { + "epoch": 0.053014808148572366, + "grad_norm": 0.6419150829315186, + "learning_rate": 2.6507404074286186e-06, + "loss": 0.013, + "step": 3240 + }, + { + "epoch": 0.0531784340996482, + "grad_norm": 0.5840580463409424, + "learning_rate": 2.6589217049824104e-06, + "loss": 0.0099, + "step": 3250 + }, + { + "epoch": 0.05334206005072405, + "grad_norm": 0.7142891883850098, + "learning_rate": 2.6671030025362027e-06, + "loss": 0.0122, + "step": 3260 + }, + { + "epoch": 0.05350568600179988, + "grad_norm": 0.7479686737060547, + "learning_rate": 2.6752843000899945e-06, + "loss": 0.0125, + "step": 3270 + }, + { + "epoch": 0.05366931195287573, + "grad_norm": 0.18943554162979126, + "learning_rate": 2.683465597643787e-06, + "loss": 0.0118, + "step": 3280 + }, + { + "epoch": 0.053832937903951564, + "grad_norm": 0.47382476925849915, + "learning_rate": 2.6916468951975786e-06, + "loss": 0.0156, + "step": 3290 + }, + { + "epoch": 0.05399656385502741, + "grad_norm": 0.290359765291214, + "learning_rate": 2.6998281927513705e-06, + "loss": 0.0117, + "step": 3300 + }, + { + "epoch": 0.05416018980610325, + "grad_norm": 0.4674018621444702, + "learning_rate": 2.7080094903051623e-06, + "loss": 0.0101, + "step": 3310 + }, + { + "epoch": 0.05432381575717909, + "grad_norm": 0.6049782037734985, + "learning_rate": 2.7161907878589546e-06, + "loss": 0.0093, + "step": 3320 + }, + { + "epoch": 0.05448744170825493, + "grad_norm": 0.5735464096069336, + "learning_rate": 2.7243720854127464e-06, + "loss": 0.0106, + "step": 3330 + }, + { + "epoch": 0.05465106765933077, + "grad_norm": 0.4310954511165619, + "learning_rate": 2.7325533829665387e-06, + "loss": 0.0088, + "step": 3340 + }, + { + "epoch": 0.05481469361040661, + "grad_norm": 0.49559855461120605, + "learning_rate": 2.7407346805203305e-06, + "loss": 0.0111, + "step": 3350 + }, + { + "epoch": 0.05497831956148245, + "grad_norm": 0.47451549768447876, + "learning_rate": 2.748915978074123e-06, + "loss": 0.0092, + "step": 3360 + }, + { + "epoch": 0.05514194551255829, + "grad_norm": 0.4467151463031769, + "learning_rate": 2.7570972756279146e-06, + "loss": 0.0096, + "step": 3370 + }, + { + "epoch": 0.055305571463634136, + "grad_norm": 0.6338276267051697, + "learning_rate": 2.7652785731817065e-06, + "loss": 0.0189, + "step": 3380 + }, + { + "epoch": 0.05546919741470997, + "grad_norm": 0.4194999933242798, + "learning_rate": 2.7734598707354987e-06, + "loss": 0.0151, + "step": 3390 + }, + { + "epoch": 0.055632823365785816, + "grad_norm": 0.42343956232070923, + "learning_rate": 2.7816411682892906e-06, + "loss": 0.0089, + "step": 3400 + }, + { + "epoch": 0.05579644931686165, + "grad_norm": 0.5507587790489197, + "learning_rate": 2.789822465843083e-06, + "loss": 0.0121, + "step": 3410 + }, + { + "epoch": 0.0559600752679375, + "grad_norm": 0.6585989594459534, + "learning_rate": 2.7980037633968747e-06, + "loss": 0.0131, + "step": 3420 + }, + { + "epoch": 0.05612370121901333, + "grad_norm": 0.3411996066570282, + "learning_rate": 2.806185060950667e-06, + "loss": 0.0097, + "step": 3430 + }, + { + "epoch": 0.05628732717008918, + "grad_norm": 0.594822347164154, + "learning_rate": 2.8143663585044588e-06, + "loss": 0.0119, + "step": 3440 + }, + { + "epoch": 0.056450953121165014, + "grad_norm": 0.7176277041435242, + "learning_rate": 2.822547656058251e-06, + "loss": 0.0127, + "step": 3450 + }, + { + "epoch": 0.05661457907224086, + "grad_norm": 0.2711975872516632, + "learning_rate": 2.830728953612043e-06, + "loss": 0.0103, + "step": 3460 + }, + { + "epoch": 0.0567782050233167, + "grad_norm": 0.4954574406147003, + "learning_rate": 2.838910251165835e-06, + "loss": 0.0141, + "step": 3470 + }, + { + "epoch": 0.05694183097439254, + "grad_norm": 0.7648645043373108, + "learning_rate": 2.847091548719627e-06, + "loss": 0.0097, + "step": 3480 + }, + { + "epoch": 0.05710545692546838, + "grad_norm": 0.7074746489524841, + "learning_rate": 2.8552728462734192e-06, + "loss": 0.0097, + "step": 3490 + }, + { + "epoch": 0.05726908287654422, + "grad_norm": 0.7583217024803162, + "learning_rate": 2.863454143827211e-06, + "loss": 0.0093, + "step": 3500 + }, + { + "epoch": 0.05743270882762006, + "grad_norm": 0.4210297763347626, + "learning_rate": 2.8716354413810033e-06, + "loss": 0.0123, + "step": 3510 + }, + { + "epoch": 0.0575963347786959, + "grad_norm": 0.3861912786960602, + "learning_rate": 2.879816738934795e-06, + "loss": 0.0112, + "step": 3520 + }, + { + "epoch": 0.05775996072977174, + "grad_norm": 0.500217080116272, + "learning_rate": 2.8879980364885874e-06, + "loss": 0.0126, + "step": 3530 + }, + { + "epoch": 0.057923586680847586, + "grad_norm": 0.2987768352031708, + "learning_rate": 2.8961793340423793e-06, + "loss": 0.015, + "step": 3540 + }, + { + "epoch": 0.05808721263192342, + "grad_norm": 0.4511184096336365, + "learning_rate": 2.9043606315961715e-06, + "loss": 0.0119, + "step": 3550 + }, + { + "epoch": 0.058250838582999266, + "grad_norm": 0.7617406845092773, + "learning_rate": 2.9125419291499634e-06, + "loss": 0.0099, + "step": 3560 + }, + { + "epoch": 0.0584144645340751, + "grad_norm": 0.5174741148948669, + "learning_rate": 2.9207232267037557e-06, + "loss": 0.0123, + "step": 3570 + }, + { + "epoch": 0.058578090485150947, + "grad_norm": 0.5802791714668274, + "learning_rate": 2.9289045242575475e-06, + "loss": 0.0112, + "step": 3580 + }, + { + "epoch": 0.05874171643622678, + "grad_norm": 0.74256432056427, + "learning_rate": 2.9370858218113398e-06, + "loss": 0.0107, + "step": 3590 + }, + { + "epoch": 0.05890534238730263, + "grad_norm": 0.5330492258071899, + "learning_rate": 2.9452671193651316e-06, + "loss": 0.011, + "step": 3600 + }, + { + "epoch": 0.059068968338378464, + "grad_norm": 0.5939335823059082, + "learning_rate": 2.953448416918924e-06, + "loss": 0.0129, + "step": 3610 + }, + { + "epoch": 0.05923259428945431, + "grad_norm": 0.4373001158237457, + "learning_rate": 2.9616297144727157e-06, + "loss": 0.0113, + "step": 3620 + }, + { + "epoch": 0.05939622024053015, + "grad_norm": 0.6011515855789185, + "learning_rate": 2.969811012026508e-06, + "loss": 0.0148, + "step": 3630 + }, + { + "epoch": 0.05955984619160599, + "grad_norm": 0.46428272128105164, + "learning_rate": 2.9779923095803e-06, + "loss": 0.0129, + "step": 3640 + }, + { + "epoch": 0.05972347214268183, + "grad_norm": 0.6666850447654724, + "learning_rate": 2.986173607134092e-06, + "loss": 0.0107, + "step": 3650 + }, + { + "epoch": 0.05988709809375767, + "grad_norm": 0.6814191341400146, + "learning_rate": 2.994354904687884e-06, + "loss": 0.0156, + "step": 3660 + }, + { + "epoch": 0.06005072404483351, + "grad_norm": 0.4391625225543976, + "learning_rate": 3.002536202241676e-06, + "loss": 0.0125, + "step": 3670 + }, + { + "epoch": 0.06021434999590935, + "grad_norm": 0.45413750410079956, + "learning_rate": 3.010717499795468e-06, + "loss": 0.0096, + "step": 3680 + }, + { + "epoch": 0.06037797594698519, + "grad_norm": 0.573988676071167, + "learning_rate": 3.0188987973492603e-06, + "loss": 0.0098, + "step": 3690 + }, + { + "epoch": 0.060541601898061036, + "grad_norm": 0.4897746741771698, + "learning_rate": 3.027080094903052e-06, + "loss": 0.0141, + "step": 3700 + }, + { + "epoch": 0.06070522784913687, + "grad_norm": 0.4950472116470337, + "learning_rate": 3.035261392456844e-06, + "loss": 0.0071, + "step": 3710 + }, + { + "epoch": 0.060868853800212716, + "grad_norm": 0.2800094783306122, + "learning_rate": 3.043442690010636e-06, + "loss": 0.0151, + "step": 3720 + }, + { + "epoch": 0.06103247975128855, + "grad_norm": 0.5631102919578552, + "learning_rate": 3.051623987564428e-06, + "loss": 0.0129, + "step": 3730 + }, + { + "epoch": 0.061196105702364396, + "grad_norm": 0.4482916593551636, + "learning_rate": 3.05980528511822e-06, + "loss": 0.0115, + "step": 3740 + }, + { + "epoch": 0.06135973165344023, + "grad_norm": 0.7123658657073975, + "learning_rate": 3.0679865826720117e-06, + "loss": 0.0137, + "step": 3750 + }, + { + "epoch": 0.06152335760451608, + "grad_norm": 0.535690188407898, + "learning_rate": 3.076167880225804e-06, + "loss": 0.0122, + "step": 3760 + }, + { + "epoch": 0.061686983555591914, + "grad_norm": 0.20516180992126465, + "learning_rate": 3.084349177779596e-06, + "loss": 0.0126, + "step": 3770 + }, + { + "epoch": 0.06185060950666776, + "grad_norm": 0.48727574944496155, + "learning_rate": 3.092530475333388e-06, + "loss": 0.011, + "step": 3780 + }, + { + "epoch": 0.0620142354577436, + "grad_norm": 0.40629613399505615, + "learning_rate": 3.10071177288718e-06, + "loss": 0.0108, + "step": 3790 + }, + { + "epoch": 0.06217786140881944, + "grad_norm": 0.4044995605945587, + "learning_rate": 3.108893070440972e-06, + "loss": 0.011, + "step": 3800 + }, + { + "epoch": 0.06234148735989528, + "grad_norm": 0.7297495007514954, + "learning_rate": 3.117074367994764e-06, + "loss": 0.0101, + "step": 3810 + }, + { + "epoch": 0.06250511331097112, + "grad_norm": 0.47363176941871643, + "learning_rate": 3.1252556655485563e-06, + "loss": 0.0092, + "step": 3820 + }, + { + "epoch": 0.06266873926204695, + "grad_norm": 0.38744133710861206, + "learning_rate": 3.133436963102348e-06, + "loss": 0.0183, + "step": 3830 + }, + { + "epoch": 0.0628323652131228, + "grad_norm": 0.5381457209587097, + "learning_rate": 3.14161826065614e-06, + "loss": 0.0103, + "step": 3840 + }, + { + "epoch": 0.06299599116419864, + "grad_norm": 0.3486247658729553, + "learning_rate": 3.1497995582099322e-06, + "loss": 0.0088, + "step": 3850 + }, + { + "epoch": 0.06315961711527449, + "grad_norm": 0.4323013722896576, + "learning_rate": 3.157980855763724e-06, + "loss": 0.0145, + "step": 3860 + }, + { + "epoch": 0.06332324306635033, + "grad_norm": 0.28973954916000366, + "learning_rate": 3.1661621533175163e-06, + "loss": 0.0097, + "step": 3870 + }, + { + "epoch": 0.06348686901742616, + "grad_norm": 0.6435532569885254, + "learning_rate": 3.174343450871308e-06, + "loss": 0.0075, + "step": 3880 + }, + { + "epoch": 0.063650494968502, + "grad_norm": 0.3044186532497406, + "learning_rate": 3.1825247484251004e-06, + "loss": 0.0122, + "step": 3890 + }, + { + "epoch": 0.06381412091957785, + "grad_norm": 0.5144074559211731, + "learning_rate": 3.1907060459788923e-06, + "loss": 0.0068, + "step": 3900 + }, + { + "epoch": 0.06397774687065369, + "grad_norm": 0.3963913917541504, + "learning_rate": 3.1988873435326845e-06, + "loss": 0.0086, + "step": 3910 + }, + { + "epoch": 0.06414137282172952, + "grad_norm": 0.6809449195861816, + "learning_rate": 3.2070686410864764e-06, + "loss": 0.0102, + "step": 3920 + }, + { + "epoch": 0.06430499877280536, + "grad_norm": 0.5388890504837036, + "learning_rate": 3.2152499386402686e-06, + "loss": 0.0121, + "step": 3930 + }, + { + "epoch": 0.06446862472388121, + "grad_norm": 0.5491735935211182, + "learning_rate": 3.2234312361940605e-06, + "loss": 0.0066, + "step": 3940 + }, + { + "epoch": 0.06463225067495705, + "grad_norm": 0.624646782875061, + "learning_rate": 3.2316125337478527e-06, + "loss": 0.0103, + "step": 3950 + }, + { + "epoch": 0.0647958766260329, + "grad_norm": 0.24559050798416138, + "learning_rate": 3.2397938313016446e-06, + "loss": 0.0098, + "step": 3960 + }, + { + "epoch": 0.06495950257710872, + "grad_norm": 0.5081912875175476, + "learning_rate": 3.247975128855437e-06, + "loss": 0.0116, + "step": 3970 + }, + { + "epoch": 0.06512312852818457, + "grad_norm": 0.5179296135902405, + "learning_rate": 3.2561564264092287e-06, + "loss": 0.0118, + "step": 3980 + }, + { + "epoch": 0.06528675447926041, + "grad_norm": 0.3036271035671234, + "learning_rate": 3.264337723963021e-06, + "loss": 0.0113, + "step": 3990 + }, + { + "epoch": 0.06545038043033626, + "grad_norm": 0.4160747826099396, + "learning_rate": 3.2725190215168128e-06, + "loss": 0.014, + "step": 4000 + }, + { + "epoch": 0.0656140063814121, + "grad_norm": 0.6022220253944397, + "learning_rate": 3.280700319070605e-06, + "loss": 0.0096, + "step": 4010 + }, + { + "epoch": 0.06577763233248793, + "grad_norm": 0.5766621232032776, + "learning_rate": 3.288881616624397e-06, + "loss": 0.0099, + "step": 4020 + }, + { + "epoch": 0.06594125828356377, + "grad_norm": 0.3600672483444214, + "learning_rate": 3.297062914178189e-06, + "loss": 0.0131, + "step": 4030 + }, + { + "epoch": 0.06610488423463962, + "grad_norm": 0.6038428544998169, + "learning_rate": 3.305244211731981e-06, + "loss": 0.0082, + "step": 4040 + }, + { + "epoch": 0.06626851018571546, + "grad_norm": 0.37434881925582886, + "learning_rate": 3.3134255092857733e-06, + "loss": 0.0129, + "step": 4050 + }, + { + "epoch": 0.06643213613679129, + "grad_norm": 0.5213479995727539, + "learning_rate": 3.321606806839565e-06, + "loss": 0.009, + "step": 4060 + }, + { + "epoch": 0.06659576208786713, + "grad_norm": 0.28980115056037903, + "learning_rate": 3.3297881043933574e-06, + "loss": 0.0082, + "step": 4070 + }, + { + "epoch": 0.06675938803894298, + "grad_norm": 0.6069865226745605, + "learning_rate": 3.337969401947149e-06, + "loss": 0.0087, + "step": 4080 + }, + { + "epoch": 0.06692301399001882, + "grad_norm": 0.5408406853675842, + "learning_rate": 3.3461506995009415e-06, + "loss": 0.0118, + "step": 4090 + }, + { + "epoch": 0.06708663994109466, + "grad_norm": 0.8828417062759399, + "learning_rate": 3.3543319970547333e-06, + "loss": 0.008, + "step": 4100 + }, + { + "epoch": 0.0672502658921705, + "grad_norm": 0.5868886709213257, + "learning_rate": 3.3625132946085256e-06, + "loss": 0.0104, + "step": 4110 + }, + { + "epoch": 0.06741389184324634, + "grad_norm": 0.4467138946056366, + "learning_rate": 3.3706945921623174e-06, + "loss": 0.0148, + "step": 4120 + }, + { + "epoch": 0.06757751779432218, + "grad_norm": 1.0529546737670898, + "learning_rate": 3.3788758897161097e-06, + "loss": 0.0095, + "step": 4130 + }, + { + "epoch": 0.06774114374539802, + "grad_norm": 0.8397409915924072, + "learning_rate": 3.3870571872699015e-06, + "loss": 0.0088, + "step": 4140 + }, + { + "epoch": 0.06790476969647385, + "grad_norm": 0.6100178956985474, + "learning_rate": 3.3952384848236938e-06, + "loss": 0.0113, + "step": 4150 + }, + { + "epoch": 0.0680683956475497, + "grad_norm": 0.4465765953063965, + "learning_rate": 3.4034197823774856e-06, + "loss": 0.0079, + "step": 4160 + }, + { + "epoch": 0.06823202159862554, + "grad_norm": 0.2734662592411041, + "learning_rate": 3.4116010799312774e-06, + "loss": 0.0055, + "step": 4170 + }, + { + "epoch": 0.06839564754970139, + "grad_norm": 0.3067326545715332, + "learning_rate": 3.4197823774850693e-06, + "loss": 0.0113, + "step": 4180 + }, + { + "epoch": 0.06855927350077723, + "grad_norm": 0.47589603066444397, + "learning_rate": 3.427963675038861e-06, + "loss": 0.007, + "step": 4190 + }, + { + "epoch": 0.06872289945185306, + "grad_norm": 0.4676600694656372, + "learning_rate": 3.4361449725926534e-06, + "loss": 0.0129, + "step": 4200 + }, + { + "epoch": 0.0688865254029289, + "grad_norm": 0.6486285328865051, + "learning_rate": 3.4443262701464452e-06, + "loss": 0.0107, + "step": 4210 + }, + { + "epoch": 0.06905015135400475, + "grad_norm": 0.28594696521759033, + "learning_rate": 3.4525075677002375e-06, + "loss": 0.0099, + "step": 4220 + }, + { + "epoch": 0.06921377730508059, + "grad_norm": 0.49257415533065796, + "learning_rate": 3.4606888652540293e-06, + "loss": 0.0086, + "step": 4230 + }, + { + "epoch": 0.06937740325615642, + "grad_norm": 0.5153892040252686, + "learning_rate": 3.4688701628078216e-06, + "loss": 0.0103, + "step": 4240 + }, + { + "epoch": 0.06954102920723226, + "grad_norm": 0.5089650750160217, + "learning_rate": 3.4770514603616134e-06, + "loss": 0.0239, + "step": 4250 + }, + { + "epoch": 0.06970465515830811, + "grad_norm": 0.3236147463321686, + "learning_rate": 3.4852327579154057e-06, + "loss": 0.0129, + "step": 4260 + }, + { + "epoch": 0.06986828110938395, + "grad_norm": 0.43026453256607056, + "learning_rate": 3.4934140554691975e-06, + "loss": 0.0111, + "step": 4270 + }, + { + "epoch": 0.0700319070604598, + "grad_norm": 0.4745500385761261, + "learning_rate": 3.50159535302299e-06, + "loss": 0.0077, + "step": 4280 + }, + { + "epoch": 0.07019553301153562, + "grad_norm": 0.6749144792556763, + "learning_rate": 3.5097766505767816e-06, + "loss": 0.0123, + "step": 4290 + }, + { + "epoch": 0.07035915896261147, + "grad_norm": 0.3357636034488678, + "learning_rate": 3.5179579481305735e-06, + "loss": 0.0076, + "step": 4300 + }, + { + "epoch": 0.07052278491368731, + "grad_norm": 0.5458382368087769, + "learning_rate": 3.5261392456843657e-06, + "loss": 0.0076, + "step": 4310 + }, + { + "epoch": 0.07068641086476316, + "grad_norm": 0.3887699246406555, + "learning_rate": 3.5343205432381576e-06, + "loss": 0.0075, + "step": 4320 + }, + { + "epoch": 0.070850036815839, + "grad_norm": 0.8280920386314392, + "learning_rate": 3.54250184079195e-06, + "loss": 0.014, + "step": 4330 + }, + { + "epoch": 0.07101366276691483, + "grad_norm": 0.3678774833679199, + "learning_rate": 3.5506831383457417e-06, + "loss": 0.0103, + "step": 4340 + }, + { + "epoch": 0.07117728871799067, + "grad_norm": 0.4992738366127014, + "learning_rate": 3.558864435899534e-06, + "loss": 0.0079, + "step": 4350 + }, + { + "epoch": 0.07134091466906652, + "grad_norm": 0.43329018354415894, + "learning_rate": 3.5670457334533258e-06, + "loss": 0.0088, + "step": 4360 + }, + { + "epoch": 0.07150454062014236, + "grad_norm": 0.4352242052555084, + "learning_rate": 3.575227031007118e-06, + "loss": 0.0149, + "step": 4370 + }, + { + "epoch": 0.07166816657121819, + "grad_norm": 0.037003155797719955, + "learning_rate": 3.58340832856091e-06, + "loss": 0.0167, + "step": 4380 + }, + { + "epoch": 0.07183179252229403, + "grad_norm": 0.39874985814094543, + "learning_rate": 3.591589626114702e-06, + "loss": 0.0109, + "step": 4390 + }, + { + "epoch": 0.07199541847336988, + "grad_norm": 0.5194030404090881, + "learning_rate": 3.599770923668494e-06, + "loss": 0.0099, + "step": 4400 + }, + { + "epoch": 0.07215904442444572, + "grad_norm": 0.6235454082489014, + "learning_rate": 3.6079522212222862e-06, + "loss": 0.0069, + "step": 4410 + }, + { + "epoch": 0.07232267037552156, + "grad_norm": 0.3940913677215576, + "learning_rate": 3.616133518776078e-06, + "loss": 0.0107, + "step": 4420 + }, + { + "epoch": 0.0724862963265974, + "grad_norm": 0.38981983065605164, + "learning_rate": 3.6243148163298703e-06, + "loss": 0.0117, + "step": 4430 + }, + { + "epoch": 0.07264992227767324, + "grad_norm": 0.5303213000297546, + "learning_rate": 3.632496113883662e-06, + "loss": 0.0095, + "step": 4440 + }, + { + "epoch": 0.07281354822874908, + "grad_norm": 0.588760256767273, + "learning_rate": 3.6406774114374544e-06, + "loss": 0.0131, + "step": 4450 + }, + { + "epoch": 0.07297717417982492, + "grad_norm": 0.33309927582740784, + "learning_rate": 3.6488587089912463e-06, + "loss": 0.0098, + "step": 4460 + }, + { + "epoch": 0.07314080013090075, + "grad_norm": 0.591569721698761, + "learning_rate": 3.6570400065450385e-06, + "loss": 0.009, + "step": 4470 + }, + { + "epoch": 0.0733044260819766, + "grad_norm": 0.3031422197818756, + "learning_rate": 3.6652213040988304e-06, + "loss": 0.0082, + "step": 4480 + }, + { + "epoch": 0.07346805203305244, + "grad_norm": 0.43153369426727295, + "learning_rate": 3.6734026016526226e-06, + "loss": 0.007, + "step": 4490 + }, + { + "epoch": 0.07363167798412829, + "grad_norm": 0.5362528562545776, + "learning_rate": 3.6815838992064145e-06, + "loss": 0.0088, + "step": 4500 + }, + { + "epoch": 0.07379530393520413, + "grad_norm": 0.5695440769195557, + "learning_rate": 3.6897651967602068e-06, + "loss": 0.014, + "step": 4510 + }, + { + "epoch": 0.07395892988627996, + "grad_norm": 0.26773330569267273, + "learning_rate": 3.6979464943139986e-06, + "loss": 0.0123, + "step": 4520 + }, + { + "epoch": 0.0741225558373558, + "grad_norm": 0.544951856136322, + "learning_rate": 3.706127791867791e-06, + "loss": 0.0133, + "step": 4530 + }, + { + "epoch": 0.07428618178843165, + "grad_norm": 0.3702234625816345, + "learning_rate": 3.7143090894215827e-06, + "loss": 0.0102, + "step": 4540 + }, + { + "epoch": 0.07444980773950749, + "grad_norm": 0.34125208854675293, + "learning_rate": 3.722490386975375e-06, + "loss": 0.0094, + "step": 4550 + }, + { + "epoch": 0.07461343369058333, + "grad_norm": 0.3214631676673889, + "learning_rate": 3.730671684529167e-06, + "loss": 0.0109, + "step": 4560 + }, + { + "epoch": 0.07477705964165916, + "grad_norm": 0.3399805426597595, + "learning_rate": 3.738852982082959e-06, + "loss": 0.0077, + "step": 4570 + }, + { + "epoch": 0.074940685592735, + "grad_norm": 0.5178592801094055, + "learning_rate": 3.747034279636751e-06, + "loss": 0.0082, + "step": 4580 + }, + { + "epoch": 0.07510431154381085, + "grad_norm": 0.5321545004844666, + "learning_rate": 3.755215577190543e-06, + "loss": 0.0074, + "step": 4590 + }, + { + "epoch": 0.0752679374948867, + "grad_norm": 0.4659704566001892, + "learning_rate": 3.763396874744335e-06, + "loss": 0.0101, + "step": 4600 + }, + { + "epoch": 0.07543156344596252, + "grad_norm": 0.4055034816265106, + "learning_rate": 3.7715781722981273e-06, + "loss": 0.0093, + "step": 4610 + }, + { + "epoch": 0.07559518939703837, + "grad_norm": 0.5214939117431641, + "learning_rate": 3.7797594698519187e-06, + "loss": 0.0087, + "step": 4620 + }, + { + "epoch": 0.07575881534811421, + "grad_norm": 0.7674144506454468, + "learning_rate": 3.7879407674057105e-06, + "loss": 0.0127, + "step": 4630 + }, + { + "epoch": 0.07592244129919005, + "grad_norm": 0.5002004504203796, + "learning_rate": 3.7961220649595028e-06, + "loss": 0.0121, + "step": 4640 + }, + { + "epoch": 0.0760860672502659, + "grad_norm": 0.6141252517700195, + "learning_rate": 3.8043033625132946e-06, + "loss": 0.0104, + "step": 4650 + }, + { + "epoch": 0.07624969320134173, + "grad_norm": 0.41309961676597595, + "learning_rate": 3.812484660067087e-06, + "loss": 0.0098, + "step": 4660 + }, + { + "epoch": 0.07641331915241757, + "grad_norm": 0.18280814588069916, + "learning_rate": 3.820665957620879e-06, + "loss": 0.0119, + "step": 4670 + }, + { + "epoch": 0.07657694510349342, + "grad_norm": 0.33989739418029785, + "learning_rate": 3.8288472551746706e-06, + "loss": 0.0101, + "step": 4680 + }, + { + "epoch": 0.07674057105456926, + "grad_norm": 0.40718260407447815, + "learning_rate": 3.837028552728463e-06, + "loss": 0.0108, + "step": 4690 + }, + { + "epoch": 0.07690419700564509, + "grad_norm": 0.9239081144332886, + "learning_rate": 3.845209850282255e-06, + "loss": 0.0104, + "step": 4700 + }, + { + "epoch": 0.07706782295672093, + "grad_norm": 0.4057515859603882, + "learning_rate": 3.853391147836047e-06, + "loss": 0.0098, + "step": 4710 + }, + { + "epoch": 0.07723144890779678, + "grad_norm": 0.3407362401485443, + "learning_rate": 3.861572445389839e-06, + "loss": 0.0079, + "step": 4720 + }, + { + "epoch": 0.07739507485887262, + "grad_norm": 1.244737148284912, + "learning_rate": 3.869753742943631e-06, + "loss": 0.0099, + "step": 4730 + }, + { + "epoch": 0.07755870080994846, + "grad_norm": 0.47368761897087097, + "learning_rate": 3.877935040497423e-06, + "loss": 0.0104, + "step": 4740 + }, + { + "epoch": 0.0777223267610243, + "grad_norm": 0.5678014755249023, + "learning_rate": 3.8861163380512156e-06, + "loss": 0.015, + "step": 4750 + }, + { + "epoch": 0.07788595271210014, + "grad_norm": 0.38244327902793884, + "learning_rate": 3.894297635605007e-06, + "loss": 0.0069, + "step": 4760 + }, + { + "epoch": 0.07804957866317598, + "grad_norm": 0.2525394558906555, + "learning_rate": 3.902478933158799e-06, + "loss": 0.0088, + "step": 4770 + }, + { + "epoch": 0.07821320461425182, + "grad_norm": 0.3700346350669861, + "learning_rate": 3.9106602307125915e-06, + "loss": 0.0068, + "step": 4780 + }, + { + "epoch": 0.07837683056532765, + "grad_norm": 0.3476503789424896, + "learning_rate": 3.918841528266383e-06, + "loss": 0.0075, + "step": 4790 + }, + { + "epoch": 0.0785404565164035, + "grad_norm": 0.6082961559295654, + "learning_rate": 3.927022825820175e-06, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.07870408246747934, + "grad_norm": 0.5181087255477905, + "learning_rate": 3.9352041233739674e-06, + "loss": 0.0085, + "step": 4810 + }, + { + "epoch": 0.07886770841855519, + "grad_norm": 0.5160667896270752, + "learning_rate": 3.94338542092776e-06, + "loss": 0.0092, + "step": 4820 + }, + { + "epoch": 0.07903133436963103, + "grad_norm": 0.8015368580818176, + "learning_rate": 3.951566718481551e-06, + "loss": 0.0092, + "step": 4830 + }, + { + "epoch": 0.07919496032070686, + "grad_norm": 0.6088091731071472, + "learning_rate": 3.959748016035343e-06, + "loss": 0.009, + "step": 4840 + }, + { + "epoch": 0.0793585862717827, + "grad_norm": 0.38970452547073364, + "learning_rate": 3.967929313589136e-06, + "loss": 0.0092, + "step": 4850 + }, + { + "epoch": 0.07952221222285855, + "grad_norm": 0.41453033685684204, + "learning_rate": 3.976110611142928e-06, + "loss": 0.0113, + "step": 4860 + }, + { + "epoch": 0.07968583817393439, + "grad_norm": 0.3551105856895447, + "learning_rate": 3.984291908696719e-06, + "loss": 0.009, + "step": 4870 + }, + { + "epoch": 0.07984946412501023, + "grad_norm": 0.2863304316997528, + "learning_rate": 3.992473206250512e-06, + "loss": 0.0084, + "step": 4880 + }, + { + "epoch": 0.08001309007608606, + "grad_norm": 0.2288428395986557, + "learning_rate": 4.000654503804304e-06, + "loss": 0.0061, + "step": 4890 + }, + { + "epoch": 0.0801767160271619, + "grad_norm": 0.3824857771396637, + "learning_rate": 4.008835801358096e-06, + "loss": 0.0097, + "step": 4900 + }, + { + "epoch": 0.08034034197823775, + "grad_norm": 0.4002744257450104, + "learning_rate": 4.0170170989118875e-06, + "loss": 0.0076, + "step": 4910 + }, + { + "epoch": 0.0805039679293136, + "grad_norm": 0.47109052538871765, + "learning_rate": 4.02519839646568e-06, + "loss": 0.0073, + "step": 4920 + }, + { + "epoch": 0.08066759388038942, + "grad_norm": 0.46207544207572937, + "learning_rate": 4.033379694019472e-06, + "loss": 0.0102, + "step": 4930 + }, + { + "epoch": 0.08083121983146527, + "grad_norm": 0.6554605960845947, + "learning_rate": 4.041560991573264e-06, + "loss": 0.0092, + "step": 4940 + }, + { + "epoch": 0.08099484578254111, + "grad_norm": 0.7463805079460144, + "learning_rate": 4.049742289127056e-06, + "loss": 0.009, + "step": 4950 + }, + { + "epoch": 0.08115847173361695, + "grad_norm": 0.5068933367729187, + "learning_rate": 4.057923586680848e-06, + "loss": 0.0119, + "step": 4960 + }, + { + "epoch": 0.0813220976846928, + "grad_norm": 0.17575666308403015, + "learning_rate": 4.06610488423464e-06, + "loss": 0.0089, + "step": 4970 + }, + { + "epoch": 0.08148572363576863, + "grad_norm": 0.553342342376709, + "learning_rate": 4.0742861817884325e-06, + "loss": 0.0087, + "step": 4980 + }, + { + "epoch": 0.08164934958684447, + "grad_norm": 0.25047600269317627, + "learning_rate": 4.082467479342224e-06, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.08181297553792032, + "grad_norm": 0.2992837131023407, + "learning_rate": 4.090648776896016e-06, + "loss": 0.0104, + "step": 5000 + }, + { + "epoch": 0.08197660148899616, + "grad_norm": 0.630729615688324, + "learning_rate": 4.0988300744498085e-06, + "loss": 0.0095, + "step": 5010 + }, + { + "epoch": 0.08214022744007199, + "grad_norm": 0.41234448552131653, + "learning_rate": 4.107011372003601e-06, + "loss": 0.0064, + "step": 5020 + }, + { + "epoch": 0.08230385339114783, + "grad_norm": 0.45269840955734253, + "learning_rate": 4.115192669557392e-06, + "loss": 0.0092, + "step": 5030 + }, + { + "epoch": 0.08246747934222368, + "grad_norm": 0.18220584094524384, + "learning_rate": 4.123373967111184e-06, + "loss": 0.0099, + "step": 5040 + }, + { + "epoch": 0.08263110529329952, + "grad_norm": 0.5539675951004028, + "learning_rate": 4.131555264664977e-06, + "loss": 0.0106, + "step": 5050 + }, + { + "epoch": 0.08279473124437536, + "grad_norm": 0.39362144470214844, + "learning_rate": 4.139736562218768e-06, + "loss": 0.0091, + "step": 5060 + }, + { + "epoch": 0.0829583571954512, + "grad_norm": 0.24591320753097534, + "learning_rate": 4.14791785977256e-06, + "loss": 0.0103, + "step": 5070 + }, + { + "epoch": 0.08312198314652704, + "grad_norm": 0.41742563247680664, + "learning_rate": 4.156099157326352e-06, + "loss": 0.0079, + "step": 5080 + }, + { + "epoch": 0.08328560909760288, + "grad_norm": 0.28923746943473816, + "learning_rate": 4.164280454880144e-06, + "loss": 0.0097, + "step": 5090 + }, + { + "epoch": 0.08344923504867872, + "grad_norm": 0.19561634957790375, + "learning_rate": 4.172461752433936e-06, + "loss": 0.0086, + "step": 5100 + }, + { + "epoch": 0.08361286099975457, + "grad_norm": 0.3338703513145447, + "learning_rate": 4.1806430499877285e-06, + "loss": 0.0081, + "step": 5110 + }, + { + "epoch": 0.0837764869508304, + "grad_norm": 0.6227090954780579, + "learning_rate": 4.18882434754152e-06, + "loss": 0.0132, + "step": 5120 + }, + { + "epoch": 0.08394011290190624, + "grad_norm": 0.5862659215927124, + "learning_rate": 4.197005645095312e-06, + "loss": 0.0082, + "step": 5130 + }, + { + "epoch": 0.08410373885298209, + "grad_norm": 0.40953049063682556, + "learning_rate": 4.2051869426491045e-06, + "loss": 0.0131, + "step": 5140 + }, + { + "epoch": 0.08426736480405793, + "grad_norm": 0.1880975216627121, + "learning_rate": 4.213368240202897e-06, + "loss": 0.0067, + "step": 5150 + }, + { + "epoch": 0.08443099075513376, + "grad_norm": 0.3912363648414612, + "learning_rate": 4.221549537756688e-06, + "loss": 0.0132, + "step": 5160 + }, + { + "epoch": 0.0845946167062096, + "grad_norm": 0.7005236148834229, + "learning_rate": 4.2297308353104804e-06, + "loss": 0.0133, + "step": 5170 + }, + { + "epoch": 0.08475824265728545, + "grad_norm": 0.25200846791267395, + "learning_rate": 4.237912132864273e-06, + "loss": 0.0085, + "step": 5180 + }, + { + "epoch": 0.08492186860836129, + "grad_norm": 0.341741681098938, + "learning_rate": 4.246093430418065e-06, + "loss": 0.0079, + "step": 5190 + }, + { + "epoch": 0.08508549455943713, + "grad_norm": 0.4006780982017517, + "learning_rate": 4.254274727971856e-06, + "loss": 0.013, + "step": 5200 + }, + { + "epoch": 0.08524912051051296, + "grad_norm": 0.4393010139465332, + "learning_rate": 4.262456025525649e-06, + "loss": 0.0107, + "step": 5210 + }, + { + "epoch": 0.0854127464615888, + "grad_norm": 0.2482386976480484, + "learning_rate": 4.270637323079441e-06, + "loss": 0.0122, + "step": 5220 + }, + { + "epoch": 0.08557637241266465, + "grad_norm": 0.2598719000816345, + "learning_rate": 4.278818620633232e-06, + "loss": 0.0074, + "step": 5230 + }, + { + "epoch": 0.0857399983637405, + "grad_norm": 0.28431206941604614, + "learning_rate": 4.2869999181870246e-06, + "loss": 0.0091, + "step": 5240 + }, + { + "epoch": 0.08590362431481632, + "grad_norm": 0.1387432962656021, + "learning_rate": 4.295181215740817e-06, + "loss": 0.0101, + "step": 5250 + }, + { + "epoch": 0.08606725026589217, + "grad_norm": 0.22250521183013916, + "learning_rate": 4.303362513294609e-06, + "loss": 0.0093, + "step": 5260 + }, + { + "epoch": 0.08623087621696801, + "grad_norm": 0.4605426788330078, + "learning_rate": 4.3115438108484005e-06, + "loss": 0.0059, + "step": 5270 + }, + { + "epoch": 0.08639450216804385, + "grad_norm": 0.19569845497608185, + "learning_rate": 4.319725108402193e-06, + "loss": 0.0075, + "step": 5280 + }, + { + "epoch": 0.0865581281191197, + "grad_norm": 0.39840999245643616, + "learning_rate": 4.327906405955985e-06, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.08672175407019553, + "grad_norm": 0.35140636563301086, + "learning_rate": 4.336087703509777e-06, + "loss": 0.0097, + "step": 5300 + }, + { + "epoch": 0.08688538002127137, + "grad_norm": 0.37227746844291687, + "learning_rate": 4.344269001063569e-06, + "loss": 0.007, + "step": 5310 + }, + { + "epoch": 0.08704900597234722, + "grad_norm": 0.37834489345550537, + "learning_rate": 4.352450298617361e-06, + "loss": 0.0114, + "step": 5320 + }, + { + "epoch": 0.08721263192342306, + "grad_norm": 0.18501155078411102, + "learning_rate": 4.360631596171153e-06, + "loss": 0.0103, + "step": 5330 + }, + { + "epoch": 0.08737625787449889, + "grad_norm": 0.4748477339744568, + "learning_rate": 4.3688128937249455e-06, + "loss": 0.0088, + "step": 5340 + }, + { + "epoch": 0.08753988382557473, + "grad_norm": 0.42865192890167236, + "learning_rate": 4.376994191278737e-06, + "loss": 0.0075, + "step": 5350 + }, + { + "epoch": 0.08770350977665058, + "grad_norm": 0.3388519585132599, + "learning_rate": 4.385175488832529e-06, + "loss": 0.0066, + "step": 5360 + }, + { + "epoch": 0.08786713572772642, + "grad_norm": 0.3161774277687073, + "learning_rate": 4.3933567863863214e-06, + "loss": 0.0079, + "step": 5370 + }, + { + "epoch": 0.08803076167880226, + "grad_norm": 0.3384650945663452, + "learning_rate": 4.401538083940114e-06, + "loss": 0.0128, + "step": 5380 + }, + { + "epoch": 0.0881943876298781, + "grad_norm": 0.39423316717147827, + "learning_rate": 4.409719381493905e-06, + "loss": 0.0093, + "step": 5390 + }, + { + "epoch": 0.08835801358095394, + "grad_norm": 0.5899034142494202, + "learning_rate": 4.417900679047697e-06, + "loss": 0.0087, + "step": 5400 + }, + { + "epoch": 0.08852163953202978, + "grad_norm": 0.3199603855609894, + "learning_rate": 4.42608197660149e-06, + "loss": 0.0076, + "step": 5410 + }, + { + "epoch": 0.08868526548310562, + "grad_norm": 0.334506630897522, + "learning_rate": 4.434263274155282e-06, + "loss": 0.0064, + "step": 5420 + }, + { + "epoch": 0.08884889143418147, + "grad_norm": 0.24528226256370544, + "learning_rate": 4.442444571709073e-06, + "loss": 0.0063, + "step": 5430 + }, + { + "epoch": 0.0890125173852573, + "grad_norm": 0.5676206946372986, + "learning_rate": 4.450625869262866e-06, + "loss": 0.0079, + "step": 5440 + }, + { + "epoch": 0.08917614333633314, + "grad_norm": 0.13611209392547607, + "learning_rate": 4.458807166816658e-06, + "loss": 0.0056, + "step": 5450 + }, + { + "epoch": 0.08933976928740898, + "grad_norm": 0.34977057576179504, + "learning_rate": 4.46698846437045e-06, + "loss": 0.0095, + "step": 5460 + }, + { + "epoch": 0.08950339523848483, + "grad_norm": 0.39309054613113403, + "learning_rate": 4.4751697619242415e-06, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.08966702118956066, + "grad_norm": 0.35844671726226807, + "learning_rate": 4.483351059478034e-06, + "loss": 0.0112, + "step": 5480 + }, + { + "epoch": 0.0898306471406365, + "grad_norm": 0.3857707977294922, + "learning_rate": 4.491532357031826e-06, + "loss": 0.0071, + "step": 5490 + }, + { + "epoch": 0.08999427309171235, + "grad_norm": 0.2441084235906601, + "learning_rate": 4.4997136545856175e-06, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.09015789904278819, + "grad_norm": 0.18826180696487427, + "learning_rate": 4.50789495213941e-06, + "loss": 0.008, + "step": 5510 + }, + { + "epoch": 0.09032152499386403, + "grad_norm": 0.36036619544029236, + "learning_rate": 4.516076249693201e-06, + "loss": 0.0072, + "step": 5520 + }, + { + "epoch": 0.09048515094493986, + "grad_norm": 0.34049272537231445, + "learning_rate": 4.524257547246993e-06, + "loss": 0.0096, + "step": 5530 + }, + { + "epoch": 0.0906487768960157, + "grad_norm": 0.32959064841270447, + "learning_rate": 4.532438844800786e-06, + "loss": 0.008, + "step": 5540 + }, + { + "epoch": 0.09081240284709155, + "grad_norm": 0.29600727558135986, + "learning_rate": 4.540620142354578e-06, + "loss": 0.0103, + "step": 5550 + }, + { + "epoch": 0.0909760287981674, + "grad_norm": 0.3910512626171112, + "learning_rate": 4.548801439908369e-06, + "loss": 0.0074, + "step": 5560 + }, + { + "epoch": 0.09113965474924322, + "grad_norm": 0.28621724247932434, + "learning_rate": 4.556982737462162e-06, + "loss": 0.0103, + "step": 5570 + }, + { + "epoch": 0.09130328070031907, + "grad_norm": 0.3845384120941162, + "learning_rate": 4.565164035015954e-06, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.09146690665139491, + "grad_norm": 0.07460370659828186, + "learning_rate": 4.573345332569746e-06, + "loss": 0.0068, + "step": 5590 + }, + { + "epoch": 0.09163053260247075, + "grad_norm": 0.36962124705314636, + "learning_rate": 4.5815266301235376e-06, + "loss": 0.0084, + "step": 5600 + }, + { + "epoch": 0.0917941585535466, + "grad_norm": 0.5404472947120667, + "learning_rate": 4.58970792767733e-06, + "loss": 0.0077, + "step": 5610 + }, + { + "epoch": 0.09195778450462243, + "grad_norm": 0.3260168731212616, + "learning_rate": 4.597889225231122e-06, + "loss": 0.0121, + "step": 5620 + }, + { + "epoch": 0.09212141045569827, + "grad_norm": 0.5107181668281555, + "learning_rate": 4.606070522784914e-06, + "loss": 0.0092, + "step": 5630 + }, + { + "epoch": 0.09228503640677412, + "grad_norm": 0.3145920932292938, + "learning_rate": 4.614251820338706e-06, + "loss": 0.0124, + "step": 5640 + }, + { + "epoch": 0.09244866235784996, + "grad_norm": 0.336105614900589, + "learning_rate": 4.622433117892498e-06, + "loss": 0.0071, + "step": 5650 + }, + { + "epoch": 0.0926122883089258, + "grad_norm": 0.27319103479385376, + "learning_rate": 4.63061441544629e-06, + "loss": 0.0063, + "step": 5660 + }, + { + "epoch": 0.09277591426000163, + "grad_norm": 0.38768184185028076, + "learning_rate": 4.6387957130000826e-06, + "loss": 0.0096, + "step": 5670 + }, + { + "epoch": 0.09293954021107748, + "grad_norm": 0.2965134382247925, + "learning_rate": 4.646977010553874e-06, + "loss": 0.009, + "step": 5680 + }, + { + "epoch": 0.09310316616215332, + "grad_norm": 0.4633827209472656, + "learning_rate": 4.655158308107666e-06, + "loss": 0.0074, + "step": 5690 + }, + { + "epoch": 0.09326679211322916, + "grad_norm": 0.18506695330142975, + "learning_rate": 4.6633396056614585e-06, + "loss": 0.0095, + "step": 5700 + }, + { + "epoch": 0.093430418064305, + "grad_norm": 0.3332703709602356, + "learning_rate": 4.67152090321525e-06, + "loss": 0.0163, + "step": 5710 + }, + { + "epoch": 0.09359404401538084, + "grad_norm": 0.15415699779987335, + "learning_rate": 4.679702200769042e-06, + "loss": 0.008, + "step": 5720 + }, + { + "epoch": 0.09375766996645668, + "grad_norm": 0.2118310183286667, + "learning_rate": 4.6878834983228344e-06, + "loss": 0.0074, + "step": 5730 + }, + { + "epoch": 0.09392129591753252, + "grad_norm": 0.546164333820343, + "learning_rate": 4.696064795876627e-06, + "loss": 0.0087, + "step": 5740 + }, + { + "epoch": 0.09408492186860837, + "grad_norm": 0.2127242386341095, + "learning_rate": 4.704246093430418e-06, + "loss": 0.0075, + "step": 5750 + }, + { + "epoch": 0.0942485478196842, + "grad_norm": 0.3988535404205322, + "learning_rate": 4.71242739098421e-06, + "loss": 0.0107, + "step": 5760 + }, + { + "epoch": 0.09441217377076004, + "grad_norm": 0.32979124784469604, + "learning_rate": 4.720608688538003e-06, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.09457579972183588, + "grad_norm": 0.3586482107639313, + "learning_rate": 4.728789986091795e-06, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.09473942567291173, + "grad_norm": 0.2211388796567917, + "learning_rate": 4.736971283645586e-06, + "loss": 0.009, + "step": 5790 + }, + { + "epoch": 0.09490305162398756, + "grad_norm": 0.6729218363761902, + "learning_rate": 4.745152581199379e-06, + "loss": 0.0102, + "step": 5800 + }, + { + "epoch": 0.0950666775750634, + "grad_norm": 0.4604661464691162, + "learning_rate": 4.753333878753171e-06, + "loss": 0.0081, + "step": 5810 + }, + { + "epoch": 0.09523030352613925, + "grad_norm": 0.25674644112586975, + "learning_rate": 4.761515176306963e-06, + "loss": 0.008, + "step": 5820 + }, + { + "epoch": 0.09539392947721509, + "grad_norm": 0.2134886533021927, + "learning_rate": 4.7696964738607545e-06, + "loss": 0.0106, + "step": 5830 + }, + { + "epoch": 0.09555755542829093, + "grad_norm": 0.10825169086456299, + "learning_rate": 4.777877771414547e-06, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.09572118137936676, + "grad_norm": 0.24757535755634308, + "learning_rate": 4.786059068968339e-06, + "loss": 0.0091, + "step": 5850 + }, + { + "epoch": 0.0958848073304426, + "grad_norm": 0.25470975041389465, + "learning_rate": 4.794240366522131e-06, + "loss": 0.0094, + "step": 5860 + }, + { + "epoch": 0.09604843328151845, + "grad_norm": 0.3910082280635834, + "learning_rate": 4.802421664075923e-06, + "loss": 0.0098, + "step": 5870 + }, + { + "epoch": 0.0962120592325943, + "grad_norm": 0.24805599451065063, + "learning_rate": 4.810602961629715e-06, + "loss": 0.0053, + "step": 5880 + }, + { + "epoch": 0.09637568518367012, + "grad_norm": 0.3489004671573639, + "learning_rate": 4.818784259183507e-06, + "loss": 0.0067, + "step": 5890 + }, + { + "epoch": 0.09653931113474597, + "grad_norm": 0.5266171097755432, + "learning_rate": 4.8269655567372995e-06, + "loss": 0.0096, + "step": 5900 + }, + { + "epoch": 0.09670293708582181, + "grad_norm": 0.2186962217092514, + "learning_rate": 4.835146854291091e-06, + "loss": 0.0075, + "step": 5910 + }, + { + "epoch": 0.09686656303689765, + "grad_norm": 0.39976468682289124, + "learning_rate": 4.843328151844883e-06, + "loss": 0.0067, + "step": 5920 + }, + { + "epoch": 0.0970301889879735, + "grad_norm": 0.23606425523757935, + "learning_rate": 4.8515094493986755e-06, + "loss": 0.0063, + "step": 5930 + }, + { + "epoch": 0.09719381493904933, + "grad_norm": 0.1271335631608963, + "learning_rate": 4.859690746952467e-06, + "loss": 0.0064, + "step": 5940 + }, + { + "epoch": 0.09735744089012517, + "grad_norm": 0.41366955637931824, + "learning_rate": 4.867872044506259e-06, + "loss": 0.0105, + "step": 5950 + }, + { + "epoch": 0.09752106684120102, + "grad_norm": 0.533977210521698, + "learning_rate": 4.8760533420600506e-06, + "loss": 0.0095, + "step": 5960 + }, + { + "epoch": 0.09768469279227686, + "grad_norm": 0.4168972969055176, + "learning_rate": 4.884234639613843e-06, + "loss": 0.0093, + "step": 5970 + }, + { + "epoch": 0.0978483187433527, + "grad_norm": 0.8071966767311096, + "learning_rate": 4.892415937167635e-06, + "loss": 0.0076, + "step": 5980 + }, + { + "epoch": 0.09801194469442853, + "grad_norm": 0.5576759576797485, + "learning_rate": 4.900597234721427e-06, + "loss": 0.006, + "step": 5990 + }, + { + "epoch": 0.09817557064550438, + "grad_norm": 0.3682885468006134, + "learning_rate": 4.908778532275219e-06, + "loss": 0.0089, + "step": 6000 + }, + { + "epoch": 0.09833919659658022, + "grad_norm": 0.17927001416683197, + "learning_rate": 4.916959829829011e-06, + "loss": 0.0104, + "step": 6010 + }, + { + "epoch": 0.09850282254765606, + "grad_norm": 0.3189915716648102, + "learning_rate": 4.925141127382803e-06, + "loss": 0.0089, + "step": 6020 + }, + { + "epoch": 0.09866644849873189, + "grad_norm": 0.16469670832157135, + "learning_rate": 4.9333224249365955e-06, + "loss": 0.0057, + "step": 6030 + }, + { + "epoch": 0.09883007444980774, + "grad_norm": 0.3925216495990753, + "learning_rate": 4.941503722490387e-06, + "loss": 0.0073, + "step": 6040 + }, + { + "epoch": 0.09899370040088358, + "grad_norm": 0.2063838541507721, + "learning_rate": 4.949685020044179e-06, + "loss": 0.0068, + "step": 6050 + }, + { + "epoch": 0.09915732635195942, + "grad_norm": 0.05661248788237572, + "learning_rate": 4.9578663175979715e-06, + "loss": 0.0051, + "step": 6060 + }, + { + "epoch": 0.09932095230303527, + "grad_norm": 0.6064292192459106, + "learning_rate": 4.966047615151764e-06, + "loss": 0.0073, + "step": 6070 + }, + { + "epoch": 0.0994845782541111, + "grad_norm": 0.4550766050815582, + "learning_rate": 4.974228912705555e-06, + "loss": 0.0062, + "step": 6080 + }, + { + "epoch": 0.09964820420518694, + "grad_norm": 0.2789970338344574, + "learning_rate": 4.9824102102593474e-06, + "loss": 0.01, + "step": 6090 + }, + { + "epoch": 0.09981183015626278, + "grad_norm": 0.3552861213684082, + "learning_rate": 4.99059150781314e-06, + "loss": 0.0047, + "step": 6100 + }, + { + "epoch": 0.09997545610733863, + "grad_norm": 0.13682997226715088, + "learning_rate": 4.998772805366932e-06, + "loss": 0.0069, + "step": 6110 + }, + { + "epoch": 0.10013908205841446, + "grad_norm": 0.2569979131221771, + "learning_rate": 5.006954102920723e-06, + "loss": 0.0051, + "step": 6120 + }, + { + "epoch": 0.1003027080094903, + "grad_norm": 0.28847816586494446, + "learning_rate": 5.015135400474516e-06, + "loss": 0.0103, + "step": 6130 + }, + { + "epoch": 0.10046633396056615, + "grad_norm": 0.43194544315338135, + "learning_rate": 5.023316698028308e-06, + "loss": 0.0073, + "step": 6140 + }, + { + "epoch": 0.10062995991164199, + "grad_norm": 0.3517124652862549, + "learning_rate": 5.031497995582099e-06, + "loss": 0.0093, + "step": 6150 + }, + { + "epoch": 0.10079358586271783, + "grad_norm": 0.46990880370140076, + "learning_rate": 5.0396792931358916e-06, + "loss": 0.0092, + "step": 6160 + }, + { + "epoch": 0.10095721181379366, + "grad_norm": 0.43302083015441895, + "learning_rate": 5.047860590689684e-06, + "loss": 0.0099, + "step": 6170 + }, + { + "epoch": 0.1011208377648695, + "grad_norm": 0.7122765183448792, + "learning_rate": 5.056041888243476e-06, + "loss": 0.0084, + "step": 6180 + }, + { + "epoch": 0.10128446371594535, + "grad_norm": 0.4819414019584656, + "learning_rate": 5.0642231857972675e-06, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.1014480896670212, + "grad_norm": 0.4990685284137726, + "learning_rate": 5.07240448335106e-06, + "loss": 0.0083, + "step": 6200 + }, + { + "epoch": 0.10161171561809704, + "grad_norm": 0.5149896144866943, + "learning_rate": 5.080585780904852e-06, + "loss": 0.0059, + "step": 6210 + }, + { + "epoch": 0.10177534156917287, + "grad_norm": 0.20002731680870056, + "learning_rate": 5.088767078458644e-06, + "loss": 0.009, + "step": 6220 + }, + { + "epoch": 0.10193896752024871, + "grad_norm": 0.46108001470565796, + "learning_rate": 5.096948376012436e-06, + "loss": 0.0081, + "step": 6230 + }, + { + "epoch": 0.10210259347132455, + "grad_norm": 0.24710477888584137, + "learning_rate": 5.105129673566228e-06, + "loss": 0.007, + "step": 6240 + }, + { + "epoch": 0.1022662194224004, + "grad_norm": 0.24435612559318542, + "learning_rate": 5.11331097112002e-06, + "loss": 0.0058, + "step": 6250 + }, + { + "epoch": 0.10242984537347623, + "grad_norm": 0.3854849934577942, + "learning_rate": 5.1214922686738125e-06, + "loss": 0.0056, + "step": 6260 + }, + { + "epoch": 0.10259347132455207, + "grad_norm": 0.13587263226509094, + "learning_rate": 5.129673566227604e-06, + "loss": 0.0066, + "step": 6270 + }, + { + "epoch": 0.10275709727562791, + "grad_norm": 0.44002869725227356, + "learning_rate": 5.137854863781396e-06, + "loss": 0.0099, + "step": 6280 + }, + { + "epoch": 0.10292072322670376, + "grad_norm": 0.4461953938007355, + "learning_rate": 5.1460361613351884e-06, + "loss": 0.0088, + "step": 6290 + }, + { + "epoch": 0.1030843491777796, + "grad_norm": 0.26877957582473755, + "learning_rate": 5.154217458888981e-06, + "loss": 0.0068, + "step": 6300 + }, + { + "epoch": 0.10324797512885543, + "grad_norm": 0.34392890334129333, + "learning_rate": 5.162398756442772e-06, + "loss": 0.0077, + "step": 6310 + }, + { + "epoch": 0.10341160107993128, + "grad_norm": 0.30123746395111084, + "learning_rate": 5.170580053996564e-06, + "loss": 0.0122, + "step": 6320 + }, + { + "epoch": 0.10357522703100712, + "grad_norm": 0.33717355132102966, + "learning_rate": 5.178761351550357e-06, + "loss": 0.0099, + "step": 6330 + }, + { + "epoch": 0.10373885298208296, + "grad_norm": 0.2597667872905731, + "learning_rate": 5.186942649104149e-06, + "loss": 0.0093, + "step": 6340 + }, + { + "epoch": 0.10390247893315879, + "grad_norm": 0.27270177006721497, + "learning_rate": 5.19512394665794e-06, + "loss": 0.008, + "step": 6350 + }, + { + "epoch": 0.10406610488423464, + "grad_norm": 0.21485239267349243, + "learning_rate": 5.203305244211733e-06, + "loss": 0.0105, + "step": 6360 + }, + { + "epoch": 0.10422973083531048, + "grad_norm": 0.42165106534957886, + "learning_rate": 5.211486541765525e-06, + "loss": 0.0083, + "step": 6370 + }, + { + "epoch": 0.10439335678638632, + "grad_norm": 0.6050042510032654, + "learning_rate": 5.219667839319317e-06, + "loss": 0.0063, + "step": 6380 + }, + { + "epoch": 0.10455698273746217, + "grad_norm": 0.19433382153511047, + "learning_rate": 5.2278491368731085e-06, + "loss": 0.0075, + "step": 6390 + }, + { + "epoch": 0.104720608688538, + "grad_norm": 0.33693307638168335, + "learning_rate": 5.236030434426901e-06, + "loss": 0.0086, + "step": 6400 + }, + { + "epoch": 0.10488423463961384, + "grad_norm": 1.0649855136871338, + "learning_rate": 5.244211731980693e-06, + "loss": 0.0099, + "step": 6410 + }, + { + "epoch": 0.10504786059068968, + "grad_norm": 0.13170677423477173, + "learning_rate": 5.252393029534485e-06, + "loss": 0.0072, + "step": 6420 + }, + { + "epoch": 0.10521148654176553, + "grad_norm": 0.4372705817222595, + "learning_rate": 5.260574327088277e-06, + "loss": 0.0093, + "step": 6430 + }, + { + "epoch": 0.10537511249284136, + "grad_norm": 0.15160751342773438, + "learning_rate": 5.268755624642069e-06, + "loss": 0.006, + "step": 6440 + }, + { + "epoch": 0.1055387384439172, + "grad_norm": 0.13425381481647491, + "learning_rate": 5.276936922195861e-06, + "loss": 0.0081, + "step": 6450 + }, + { + "epoch": 0.10570236439499305, + "grad_norm": 0.2675306499004364, + "learning_rate": 5.2851182197496535e-06, + "loss": 0.0088, + "step": 6460 + }, + { + "epoch": 0.10586599034606889, + "grad_norm": 0.5187183618545532, + "learning_rate": 5.293299517303445e-06, + "loss": 0.0102, + "step": 6470 + }, + { + "epoch": 0.10602961629714473, + "grad_norm": 0.24732357263565063, + "learning_rate": 5.301480814857237e-06, + "loss": 0.0079, + "step": 6480 + }, + { + "epoch": 0.10619324224822056, + "grad_norm": 0.8667769432067871, + "learning_rate": 5.3096621124110295e-06, + "loss": 0.011, + "step": 6490 + }, + { + "epoch": 0.1063568681992964, + "grad_norm": 0.383929044008255, + "learning_rate": 5.317843409964821e-06, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.10652049415037225, + "grad_norm": 0.24571135640144348, + "learning_rate": 5.326024707518613e-06, + "loss": 0.0095, + "step": 6510 + }, + { + "epoch": 0.1066841201014481, + "grad_norm": 0.33501678705215454, + "learning_rate": 5.334206005072405e-06, + "loss": 0.0082, + "step": 6520 + }, + { + "epoch": 0.10684774605252394, + "grad_norm": 0.3351859450340271, + "learning_rate": 5.342387302626198e-06, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.10701137200359977, + "grad_norm": 0.3088167607784271, + "learning_rate": 5.350568600179989e-06, + "loss": 0.0093, + "step": 6540 + }, + { + "epoch": 0.10717499795467561, + "grad_norm": 0.48187342286109924, + "learning_rate": 5.358749897733781e-06, + "loss": 0.0062, + "step": 6550 + }, + { + "epoch": 0.10733862390575145, + "grad_norm": 0.5277681946754456, + "learning_rate": 5.366931195287574e-06, + "loss": 0.0071, + "step": 6560 + }, + { + "epoch": 0.1075022498568273, + "grad_norm": 0.33420518040657043, + "learning_rate": 5.375112492841366e-06, + "loss": 0.0101, + "step": 6570 + }, + { + "epoch": 0.10766587580790313, + "grad_norm": 0.4426640570163727, + "learning_rate": 5.383293790395157e-06, + "loss": 0.0059, + "step": 6580 + }, + { + "epoch": 0.10782950175897897, + "grad_norm": 1.3722050189971924, + "learning_rate": 5.3914750879489496e-06, + "loss": 0.0094, + "step": 6590 + }, + { + "epoch": 0.10799312771005481, + "grad_norm": 0.5037874579429626, + "learning_rate": 5.399656385502741e-06, + "loss": 0.0078, + "step": 6600 + }, + { + "epoch": 0.10815675366113066, + "grad_norm": 0.3484451174736023, + "learning_rate": 5.407837683056532e-06, + "loss": 0.0157, + "step": 6610 + }, + { + "epoch": 0.1083203796122065, + "grad_norm": 0.4415138363838196, + "learning_rate": 5.416018980610325e-06, + "loss": 0.0067, + "step": 6620 + }, + { + "epoch": 0.10848400556328233, + "grad_norm": 0.34858110547065735, + "learning_rate": 5.424200278164117e-06, + "loss": 0.0054, + "step": 6630 + }, + { + "epoch": 0.10864763151435818, + "grad_norm": 0.2884027361869812, + "learning_rate": 5.432381575717909e-06, + "loss": 0.0108, + "step": 6640 + }, + { + "epoch": 0.10881125746543402, + "grad_norm": 0.1966114342212677, + "learning_rate": 5.440562873271701e-06, + "loss": 0.0049, + "step": 6650 + }, + { + "epoch": 0.10897488341650986, + "grad_norm": 0.1636902391910553, + "learning_rate": 5.448744170825493e-06, + "loss": 0.0071, + "step": 6660 + }, + { + "epoch": 0.10913850936758569, + "grad_norm": 0.21612730622291565, + "learning_rate": 5.456925468379285e-06, + "loss": 0.0077, + "step": 6670 + }, + { + "epoch": 0.10930213531866154, + "grad_norm": 0.36195504665374756, + "learning_rate": 5.465106765933077e-06, + "loss": 0.0069, + "step": 6680 + }, + { + "epoch": 0.10946576126973738, + "grad_norm": 0.4107455909252167, + "learning_rate": 5.473288063486869e-06, + "loss": 0.0072, + "step": 6690 + }, + { + "epoch": 0.10962938722081322, + "grad_norm": 0.3034416437149048, + "learning_rate": 5.481469361040661e-06, + "loss": 0.0119, + "step": 6700 + }, + { + "epoch": 0.10979301317188907, + "grad_norm": 0.3874293267726898, + "learning_rate": 5.489650658594453e-06, + "loss": 0.0069, + "step": 6710 + }, + { + "epoch": 0.1099566391229649, + "grad_norm": 0.6343787908554077, + "learning_rate": 5.497831956148246e-06, + "loss": 0.0124, + "step": 6720 + }, + { + "epoch": 0.11012026507404074, + "grad_norm": 0.20312941074371338, + "learning_rate": 5.506013253702037e-06, + "loss": 0.0065, + "step": 6730 + }, + { + "epoch": 0.11028389102511658, + "grad_norm": 0.2851080000400543, + "learning_rate": 5.514194551255829e-06, + "loss": 0.006, + "step": 6740 + }, + { + "epoch": 0.11044751697619243, + "grad_norm": 0.13193948566913605, + "learning_rate": 5.5223758488096215e-06, + "loss": 0.0063, + "step": 6750 + }, + { + "epoch": 0.11061114292726827, + "grad_norm": 0.5235950946807861, + "learning_rate": 5.530557146363413e-06, + "loss": 0.0091, + "step": 6760 + }, + { + "epoch": 0.1107747688783441, + "grad_norm": 0.3150160014629364, + "learning_rate": 5.538738443917205e-06, + "loss": 0.007, + "step": 6770 + }, + { + "epoch": 0.11093839482941995, + "grad_norm": 0.9744367003440857, + "learning_rate": 5.5469197414709975e-06, + "loss": 0.0067, + "step": 6780 + }, + { + "epoch": 0.11110202078049579, + "grad_norm": 0.4049106538295746, + "learning_rate": 5.55510103902479e-06, + "loss": 0.0055, + "step": 6790 + }, + { + "epoch": 0.11126564673157163, + "grad_norm": 0.18236057460308075, + "learning_rate": 5.563282336578581e-06, + "loss": 0.0054, + "step": 6800 + }, + { + "epoch": 0.11142927268264746, + "grad_norm": 0.2937847971916199, + "learning_rate": 5.571463634132373e-06, + "loss": 0.0116, + "step": 6810 + }, + { + "epoch": 0.1115928986337233, + "grad_norm": 0.3066152334213257, + "learning_rate": 5.579644931686166e-06, + "loss": 0.0107, + "step": 6820 + }, + { + "epoch": 0.11175652458479915, + "grad_norm": 0.4665585458278656, + "learning_rate": 5.587826229239958e-06, + "loss": 0.0097, + "step": 6830 + }, + { + "epoch": 0.111920150535875, + "grad_norm": 0.11566019058227539, + "learning_rate": 5.596007526793749e-06, + "loss": 0.0086, + "step": 6840 + }, + { + "epoch": 0.11208377648695084, + "grad_norm": 0.29676222801208496, + "learning_rate": 5.604188824347542e-06, + "loss": 0.0082, + "step": 6850 + }, + { + "epoch": 0.11224740243802667, + "grad_norm": 0.3066006898880005, + "learning_rate": 5.612370121901334e-06, + "loss": 0.0094, + "step": 6860 + }, + { + "epoch": 0.11241102838910251, + "grad_norm": 0.44505757093429565, + "learning_rate": 5.620551419455126e-06, + "loss": 0.0138, + "step": 6870 + }, + { + "epoch": 0.11257465434017835, + "grad_norm": 0.2828943729400635, + "learning_rate": 5.6287327170089176e-06, + "loss": 0.0094, + "step": 6880 + }, + { + "epoch": 0.1127382802912542, + "grad_norm": 0.19990353286266327, + "learning_rate": 5.63691401456271e-06, + "loss": 0.0039, + "step": 6890 + }, + { + "epoch": 0.11290190624233003, + "grad_norm": 0.11108624190092087, + "learning_rate": 5.645095312116502e-06, + "loss": 0.0117, + "step": 6900 + }, + { + "epoch": 0.11306553219340587, + "grad_norm": 0.2445783168077469, + "learning_rate": 5.653276609670294e-06, + "loss": 0.0078, + "step": 6910 + }, + { + "epoch": 0.11322915814448171, + "grad_norm": 0.1478024274110794, + "learning_rate": 5.661457907224086e-06, + "loss": 0.0068, + "step": 6920 + }, + { + "epoch": 0.11339278409555756, + "grad_norm": 0.30458658933639526, + "learning_rate": 5.669639204777878e-06, + "loss": 0.008, + "step": 6930 + }, + { + "epoch": 0.1135564100466334, + "grad_norm": 0.254777729511261, + "learning_rate": 5.67782050233167e-06, + "loss": 0.0073, + "step": 6940 + }, + { + "epoch": 0.11372003599770923, + "grad_norm": 0.3834642469882965, + "learning_rate": 5.6860017998854625e-06, + "loss": 0.0062, + "step": 6950 + }, + { + "epoch": 0.11388366194878508, + "grad_norm": 0.275937557220459, + "learning_rate": 5.694183097439254e-06, + "loss": 0.0077, + "step": 6960 + }, + { + "epoch": 0.11404728789986092, + "grad_norm": 0.2779551148414612, + "learning_rate": 5.702364394993046e-06, + "loss": 0.0096, + "step": 6970 + }, + { + "epoch": 0.11421091385093676, + "grad_norm": 0.1703820675611496, + "learning_rate": 5.7105456925468385e-06, + "loss": 0.0087, + "step": 6980 + }, + { + "epoch": 0.11437453980201259, + "grad_norm": 0.26137998700141907, + "learning_rate": 5.718726990100631e-06, + "loss": 0.0079, + "step": 6990 + }, + { + "epoch": 0.11453816575308844, + "grad_norm": 0.3290702700614929, + "learning_rate": 5.726908287654422e-06, + "loss": 0.0062, + "step": 7000 + }, + { + "epoch": 0.11470179170416428, + "grad_norm": 0.2608306109905243, + "learning_rate": 5.7350895852082144e-06, + "loss": 0.0062, + "step": 7010 + }, + { + "epoch": 0.11486541765524012, + "grad_norm": 0.36139845848083496, + "learning_rate": 5.743270882762007e-06, + "loss": 0.0068, + "step": 7020 + }, + { + "epoch": 0.11502904360631597, + "grad_norm": 0.32653486728668213, + "learning_rate": 5.751452180315799e-06, + "loss": 0.0093, + "step": 7030 + }, + { + "epoch": 0.1151926695573918, + "grad_norm": 0.4448296129703522, + "learning_rate": 5.75963347786959e-06, + "loss": 0.0102, + "step": 7040 + }, + { + "epoch": 0.11535629550846764, + "grad_norm": 0.7444778084754944, + "learning_rate": 5.767814775423383e-06, + "loss": 0.0078, + "step": 7050 + }, + { + "epoch": 0.11551992145954348, + "grad_norm": 0.1455536186695099, + "learning_rate": 5.775996072977175e-06, + "loss": 0.0079, + "step": 7060 + }, + { + "epoch": 0.11568354741061933, + "grad_norm": 0.10789697617292404, + "learning_rate": 5.784177370530966e-06, + "loss": 0.0097, + "step": 7070 + }, + { + "epoch": 0.11584717336169517, + "grad_norm": 0.3874007761478424, + "learning_rate": 5.7923586680847586e-06, + "loss": 0.0116, + "step": 7080 + }, + { + "epoch": 0.116010799312771, + "grad_norm": 0.10515554994344711, + "learning_rate": 5.800539965638551e-06, + "loss": 0.0092, + "step": 7090 + }, + { + "epoch": 0.11617442526384684, + "grad_norm": 0.6693029403686523, + "learning_rate": 5.808721263192343e-06, + "loss": 0.0081, + "step": 7100 + }, + { + "epoch": 0.11633805121492269, + "grad_norm": 0.2853403091430664, + "learning_rate": 5.8169025607461345e-06, + "loss": 0.0073, + "step": 7110 + }, + { + "epoch": 0.11650167716599853, + "grad_norm": 0.27809441089630127, + "learning_rate": 5.825083858299927e-06, + "loss": 0.0084, + "step": 7120 + }, + { + "epoch": 0.11666530311707436, + "grad_norm": 0.340063214302063, + "learning_rate": 5.833265155853719e-06, + "loss": 0.0086, + "step": 7130 + }, + { + "epoch": 0.1168289290681502, + "grad_norm": 0.1692095696926117, + "learning_rate": 5.841446453407511e-06, + "loss": 0.0047, + "step": 7140 + }, + { + "epoch": 0.11699255501922605, + "grad_norm": 0.19236648082733154, + "learning_rate": 5.849627750961303e-06, + "loss": 0.0086, + "step": 7150 + }, + { + "epoch": 0.11715618097030189, + "grad_norm": 0.3904006779193878, + "learning_rate": 5.857809048515095e-06, + "loss": 0.0096, + "step": 7160 + }, + { + "epoch": 0.11731980692137774, + "grad_norm": 0.40139034390449524, + "learning_rate": 5.865990346068887e-06, + "loss": 0.0066, + "step": 7170 + }, + { + "epoch": 0.11748343287245357, + "grad_norm": 0.1781042069196701, + "learning_rate": 5.8741716436226795e-06, + "loss": 0.0073, + "step": 7180 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.3853304982185364, + "learning_rate": 5.882352941176471e-06, + "loss": 0.0091, + "step": 7190 + }, + { + "epoch": 0.11781068477460525, + "grad_norm": 0.19263824820518494, + "learning_rate": 5.890534238730263e-06, + "loss": 0.0046, + "step": 7200 + }, + { + "epoch": 0.1179743107256811, + "grad_norm": 0.1833399087190628, + "learning_rate": 5.8987155362840554e-06, + "loss": 0.0069, + "step": 7210 + }, + { + "epoch": 0.11813793667675693, + "grad_norm": 0.4048713743686676, + "learning_rate": 5.906896833837848e-06, + "loss": 0.0071, + "step": 7220 + }, + { + "epoch": 0.11830156262783277, + "grad_norm": 0.5216386318206787, + "learning_rate": 5.915078131391639e-06, + "loss": 0.0046, + "step": 7230 + }, + { + "epoch": 0.11846518857890861, + "grad_norm": 0.2489604651927948, + "learning_rate": 5.923259428945431e-06, + "loss": 0.0095, + "step": 7240 + }, + { + "epoch": 0.11862881452998446, + "grad_norm": 0.17428308725357056, + "learning_rate": 5.931440726499224e-06, + "loss": 0.0038, + "step": 7250 + }, + { + "epoch": 0.1187924404810603, + "grad_norm": 0.06524112820625305, + "learning_rate": 5.939622024053016e-06, + "loss": 0.0106, + "step": 7260 + }, + { + "epoch": 0.11895606643213613, + "grad_norm": 0.3374137878417969, + "learning_rate": 5.947803321606807e-06, + "loss": 0.0065, + "step": 7270 + }, + { + "epoch": 0.11911969238321198, + "grad_norm": 0.13164451718330383, + "learning_rate": 5.9559846191606e-06, + "loss": 0.0064, + "step": 7280 + }, + { + "epoch": 0.11928331833428782, + "grad_norm": 0.46344688534736633, + "learning_rate": 5.964165916714392e-06, + "loss": 0.0079, + "step": 7290 + }, + { + "epoch": 0.11944694428536366, + "grad_norm": 0.2627471089363098, + "learning_rate": 5.972347214268184e-06, + "loss": 0.0054, + "step": 7300 + }, + { + "epoch": 0.11961057023643949, + "grad_norm": 0.1846577227115631, + "learning_rate": 5.9805285118219755e-06, + "loss": 0.008, + "step": 7310 + }, + { + "epoch": 0.11977419618751534, + "grad_norm": 0.20860180258750916, + "learning_rate": 5.988709809375768e-06, + "loss": 0.0077, + "step": 7320 + }, + { + "epoch": 0.11993782213859118, + "grad_norm": 0.48983854055404663, + "learning_rate": 5.99689110692956e-06, + "loss": 0.0065, + "step": 7330 + }, + { + "epoch": 0.12010144808966702, + "grad_norm": 0.22185811400413513, + "learning_rate": 6.005072404483352e-06, + "loss": 0.0064, + "step": 7340 + }, + { + "epoch": 0.12026507404074287, + "grad_norm": 0.29743048548698425, + "learning_rate": 6.013253702037144e-06, + "loss": 0.0127, + "step": 7350 + }, + { + "epoch": 0.1204286999918187, + "grad_norm": 0.24011510610580444, + "learning_rate": 6.021434999590936e-06, + "loss": 0.0082, + "step": 7360 + }, + { + "epoch": 0.12059232594289454, + "grad_norm": 0.5802230834960938, + "learning_rate": 6.029616297144728e-06, + "loss": 0.0092, + "step": 7370 + }, + { + "epoch": 0.12075595189397038, + "grad_norm": 0.29700222611427307, + "learning_rate": 6.0377975946985205e-06, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.12091957784504623, + "grad_norm": 0.4346696436405182, + "learning_rate": 6.045978892252312e-06, + "loss": 0.0126, + "step": 7390 + }, + { + "epoch": 0.12108320379612207, + "grad_norm": 0.27801400423049927, + "learning_rate": 6.054160189806104e-06, + "loss": 0.0058, + "step": 7400 + }, + { + "epoch": 0.1212468297471979, + "grad_norm": 0.17639249563217163, + "learning_rate": 6.0623414873598965e-06, + "loss": 0.0058, + "step": 7410 + }, + { + "epoch": 0.12141045569827374, + "grad_norm": 0.20532791316509247, + "learning_rate": 6.070522784913688e-06, + "loss": 0.0102, + "step": 7420 + }, + { + "epoch": 0.12157408164934959, + "grad_norm": 0.28346556425094604, + "learning_rate": 6.07870408246748e-06, + "loss": 0.0085, + "step": 7430 + }, + { + "epoch": 0.12173770760042543, + "grad_norm": 0.21420612931251526, + "learning_rate": 6.086885380021272e-06, + "loss": 0.006, + "step": 7440 + }, + { + "epoch": 0.12190133355150126, + "grad_norm": 0.14299224317073822, + "learning_rate": 6.095066677575065e-06, + "loss": 0.0078, + "step": 7450 + }, + { + "epoch": 0.1220649595025771, + "grad_norm": 0.3003006875514984, + "learning_rate": 6.103247975128856e-06, + "loss": 0.0093, + "step": 7460 + }, + { + "epoch": 0.12222858545365295, + "grad_norm": 0.22845470905303955, + "learning_rate": 6.111429272682648e-06, + "loss": 0.0052, + "step": 7470 + }, + { + "epoch": 0.12239221140472879, + "grad_norm": 0.2701752483844757, + "learning_rate": 6.11961057023644e-06, + "loss": 0.0046, + "step": 7480 + }, + { + "epoch": 0.12255583735580464, + "grad_norm": 0.20579245686531067, + "learning_rate": 6.127791867790231e-06, + "loss": 0.0065, + "step": 7490 + }, + { + "epoch": 0.12271946330688047, + "grad_norm": 0.31625252962112427, + "learning_rate": 6.1359731653440234e-06, + "loss": 0.0077, + "step": 7500 + }, + { + "epoch": 0.12288308925795631, + "grad_norm": 0.2839002013206482, + "learning_rate": 6.144154462897816e-06, + "loss": 0.0107, + "step": 7510 + }, + { + "epoch": 0.12304671520903215, + "grad_norm": 0.9018831253051758, + "learning_rate": 6.152335760451608e-06, + "loss": 0.0081, + "step": 7520 + }, + { + "epoch": 0.123210341160108, + "grad_norm": 0.6090940237045288, + "learning_rate": 6.160517058005399e-06, + "loss": 0.0052, + "step": 7530 + }, + { + "epoch": 0.12337396711118383, + "grad_norm": 0.3708903193473816, + "learning_rate": 6.168698355559192e-06, + "loss": 0.0098, + "step": 7540 + }, + { + "epoch": 0.12353759306225967, + "grad_norm": 0.2477022409439087, + "learning_rate": 6.176879653112984e-06, + "loss": 0.0084, + "step": 7550 + }, + { + "epoch": 0.12370121901333551, + "grad_norm": 0.21271197497844696, + "learning_rate": 6.185060950666776e-06, + "loss": 0.0073, + "step": 7560 + }, + { + "epoch": 0.12386484496441136, + "grad_norm": 0.45584172010421753, + "learning_rate": 6.193242248220568e-06, + "loss": 0.0077, + "step": 7570 + }, + { + "epoch": 0.1240284709154872, + "grad_norm": 0.2077159285545349, + "learning_rate": 6.20142354577436e-06, + "loss": 0.0071, + "step": 7580 + }, + { + "epoch": 0.12419209686656303, + "grad_norm": 0.3583526313304901, + "learning_rate": 6.209604843328152e-06, + "loss": 0.0095, + "step": 7590 + }, + { + "epoch": 0.12435572281763888, + "grad_norm": 0.3761270046234131, + "learning_rate": 6.217786140881944e-06, + "loss": 0.0079, + "step": 7600 + }, + { + "epoch": 0.12451934876871472, + "grad_norm": 0.36139416694641113, + "learning_rate": 6.225967438435736e-06, + "loss": 0.0079, + "step": 7610 + }, + { + "epoch": 0.12468297471979056, + "grad_norm": 0.33850106596946716, + "learning_rate": 6.234148735989528e-06, + "loss": 0.0081, + "step": 7620 + }, + { + "epoch": 0.1248466006708664, + "grad_norm": 0.5654309988021851, + "learning_rate": 6.24233003354332e-06, + "loss": 0.0063, + "step": 7630 + }, + { + "epoch": 0.12501022662194225, + "grad_norm": 0.4743385314941406, + "learning_rate": 6.250511331097113e-06, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.12517385257301808, + "grad_norm": 0.4442897439002991, + "learning_rate": 6.258692628650904e-06, + "loss": 0.0058, + "step": 7650 + }, + { + "epoch": 0.1253374785240939, + "grad_norm": 0.3616870045661926, + "learning_rate": 6.266873926204696e-06, + "loss": 0.0064, + "step": 7660 + }, + { + "epoch": 0.12550110447516977, + "grad_norm": 0.2377433180809021, + "learning_rate": 6.2750552237584885e-06, + "loss": 0.0058, + "step": 7670 + }, + { + "epoch": 0.1256647304262456, + "grad_norm": 0.29012423753738403, + "learning_rate": 6.28323652131228e-06, + "loss": 0.0088, + "step": 7680 + }, + { + "epoch": 0.12582835637732145, + "grad_norm": 0.5089976191520691, + "learning_rate": 6.291417818866072e-06, + "loss": 0.0113, + "step": 7690 + }, + { + "epoch": 0.12599198232839728, + "grad_norm": 0.19199173152446747, + "learning_rate": 6.2995991164198645e-06, + "loss": 0.0059, + "step": 7700 + }, + { + "epoch": 0.1261556082794731, + "grad_norm": 0.16154931485652924, + "learning_rate": 6.307780413973657e-06, + "loss": 0.0059, + "step": 7710 + }, + { + "epoch": 0.12631923423054897, + "grad_norm": 0.18283475935459137, + "learning_rate": 6.315961711527448e-06, + "loss": 0.0052, + "step": 7720 + }, + { + "epoch": 0.1264828601816248, + "grad_norm": 0.08653350174427032, + "learning_rate": 6.32414300908124e-06, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.12664648613270066, + "grad_norm": 0.44227510690689087, + "learning_rate": 6.332324306635033e-06, + "loss": 0.0075, + "step": 7740 + }, + { + "epoch": 0.1268101120837765, + "grad_norm": 0.45884716510772705, + "learning_rate": 6.340505604188825e-06, + "loss": 0.007, + "step": 7750 + }, + { + "epoch": 0.12697373803485232, + "grad_norm": 0.3251267671585083, + "learning_rate": 6.348686901742616e-06, + "loss": 0.009, + "step": 7760 + }, + { + "epoch": 0.12713736398592818, + "grad_norm": 0.39178067445755005, + "learning_rate": 6.356868199296409e-06, + "loss": 0.0105, + "step": 7770 + }, + { + "epoch": 0.127300989937004, + "grad_norm": 0.5826765894889832, + "learning_rate": 6.365049496850201e-06, + "loss": 0.0056, + "step": 7780 + }, + { + "epoch": 0.12746461588807986, + "grad_norm": 0.34660714864730835, + "learning_rate": 6.373230794403993e-06, + "loss": 0.0099, + "step": 7790 + }, + { + "epoch": 0.1276282418391557, + "grad_norm": 0.17637041211128235, + "learning_rate": 6.3814120919577845e-06, + "loss": 0.0089, + "step": 7800 + }, + { + "epoch": 0.12779186779023152, + "grad_norm": 0.40063443779945374, + "learning_rate": 6.389593389511577e-06, + "loss": 0.0101, + "step": 7810 + }, + { + "epoch": 0.12795549374130738, + "grad_norm": 0.38279202580451965, + "learning_rate": 6.397774687065369e-06, + "loss": 0.0035, + "step": 7820 + }, + { + "epoch": 0.1281191196923832, + "grad_norm": 0.49693751335144043, + "learning_rate": 6.405955984619161e-06, + "loss": 0.0068, + "step": 7830 + }, + { + "epoch": 0.12828274564345904, + "grad_norm": 0.40038615465164185, + "learning_rate": 6.414137282172953e-06, + "loss": 0.0088, + "step": 7840 + }, + { + "epoch": 0.1284463715945349, + "grad_norm": 0.10365238785743713, + "learning_rate": 6.422318579726745e-06, + "loss": 0.004, + "step": 7850 + }, + { + "epoch": 0.12860999754561073, + "grad_norm": 0.16106294095516205, + "learning_rate": 6.430499877280537e-06, + "loss": 0.009, + "step": 7860 + }, + { + "epoch": 0.12877362349668658, + "grad_norm": 0.6523407697677612, + "learning_rate": 6.4386811748343295e-06, + "loss": 0.0054, + "step": 7870 + }, + { + "epoch": 0.12893724944776241, + "grad_norm": 0.2684585750102997, + "learning_rate": 6.446862472388121e-06, + "loss": 0.0075, + "step": 7880 + }, + { + "epoch": 0.12910087539883824, + "grad_norm": 0.2763690948486328, + "learning_rate": 6.455043769941913e-06, + "loss": 0.0071, + "step": 7890 + }, + { + "epoch": 0.1292645013499141, + "grad_norm": 0.2814697325229645, + "learning_rate": 6.4632250674957055e-06, + "loss": 0.0079, + "step": 7900 + }, + { + "epoch": 0.12942812730098993, + "grad_norm": 0.3585580289363861, + "learning_rate": 6.471406365049498e-06, + "loss": 0.0105, + "step": 7910 + }, + { + "epoch": 0.1295917532520658, + "grad_norm": 0.16478231549263, + "learning_rate": 6.479587662603289e-06, + "loss": 0.0056, + "step": 7920 + }, + { + "epoch": 0.12975537920314162, + "grad_norm": 0.2760835289955139, + "learning_rate": 6.4877689601570814e-06, + "loss": 0.0057, + "step": 7930 + }, + { + "epoch": 0.12991900515421745, + "grad_norm": 0.11307619512081146, + "learning_rate": 6.495950257710874e-06, + "loss": 0.0053, + "step": 7940 + }, + { + "epoch": 0.1300826311052933, + "grad_norm": 0.2428617775440216, + "learning_rate": 6.504131555264666e-06, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.13024625705636914, + "grad_norm": 0.31759703159332275, + "learning_rate": 6.512312852818457e-06, + "loss": 0.0054, + "step": 7960 + }, + { + "epoch": 0.130409883007445, + "grad_norm": 0.25207120180130005, + "learning_rate": 6.52049415037225e-06, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.13057350895852082, + "grad_norm": 0.1636662483215332, + "learning_rate": 6.528675447926042e-06, + "loss": 0.0061, + "step": 7980 + }, + { + "epoch": 0.13073713490959665, + "grad_norm": 0.27267494797706604, + "learning_rate": 6.536856745479834e-06, + "loss": 0.008, + "step": 7990 + }, + { + "epoch": 0.1309007608606725, + "grad_norm": 0.4470721185207367, + "learning_rate": 6.5450380430336256e-06, + "loss": 0.0088, + "step": 8000 + }, + { + "epoch": 0.13106438681174834, + "grad_norm": 0.47626909613609314, + "learning_rate": 6.553219340587418e-06, + "loss": 0.0081, + "step": 8010 + }, + { + "epoch": 0.1312280127628242, + "grad_norm": 0.4098890721797943, + "learning_rate": 6.56140063814121e-06, + "loss": 0.0057, + "step": 8020 + }, + { + "epoch": 0.13139163871390003, + "grad_norm": 0.35376524925231934, + "learning_rate": 6.5695819356950015e-06, + "loss": 0.0069, + "step": 8030 + }, + { + "epoch": 0.13155526466497586, + "grad_norm": 0.10104166716337204, + "learning_rate": 6.577763233248794e-06, + "loss": 0.0068, + "step": 8040 + }, + { + "epoch": 0.13171889061605171, + "grad_norm": 0.20320932567119598, + "learning_rate": 6.585944530802586e-06, + "loss": 0.0052, + "step": 8050 + }, + { + "epoch": 0.13188251656712754, + "grad_norm": 0.4781796634197235, + "learning_rate": 6.594125828356378e-06, + "loss": 0.0042, + "step": 8060 + }, + { + "epoch": 0.13204614251820337, + "grad_norm": 0.35332396626472473, + "learning_rate": 6.60230712591017e-06, + "loss": 0.0064, + "step": 8070 + }, + { + "epoch": 0.13220976846927923, + "grad_norm": 0.2873738706111908, + "learning_rate": 6.610488423463962e-06, + "loss": 0.0066, + "step": 8080 + }, + { + "epoch": 0.13237339442035506, + "grad_norm": 0.27414822578430176, + "learning_rate": 6.618669721017754e-06, + "loss": 0.0063, + "step": 8090 + }, + { + "epoch": 0.13253702037143092, + "grad_norm": 0.24895186722278595, + "learning_rate": 6.6268510185715465e-06, + "loss": 0.0049, + "step": 8100 + }, + { + "epoch": 0.13270064632250675, + "grad_norm": 0.17847618460655212, + "learning_rate": 6.635032316125338e-06, + "loss": 0.0055, + "step": 8110 + }, + { + "epoch": 0.13286427227358258, + "grad_norm": 0.28083527088165283, + "learning_rate": 6.64321361367913e-06, + "loss": 0.0058, + "step": 8120 + }, + { + "epoch": 0.13302789822465844, + "grad_norm": 0.4541753828525543, + "learning_rate": 6.6513949112329224e-06, + "loss": 0.0073, + "step": 8130 + }, + { + "epoch": 0.13319152417573427, + "grad_norm": 0.41221702098846436, + "learning_rate": 6.659576208786715e-06, + "loss": 0.0048, + "step": 8140 + }, + { + "epoch": 0.13335515012681012, + "grad_norm": 0.49959853291511536, + "learning_rate": 6.667757506340506e-06, + "loss": 0.0073, + "step": 8150 + }, + { + "epoch": 0.13351877607788595, + "grad_norm": 0.2927177846431732, + "learning_rate": 6.675938803894298e-06, + "loss": 0.0101, + "step": 8160 + }, + { + "epoch": 0.13368240202896178, + "grad_norm": 0.42072921991348267, + "learning_rate": 6.684120101448091e-06, + "loss": 0.0066, + "step": 8170 + }, + { + "epoch": 0.13384602798003764, + "grad_norm": 0.49477970600128174, + "learning_rate": 6.692301399001883e-06, + "loss": 0.0077, + "step": 8180 + }, + { + "epoch": 0.13400965393111347, + "grad_norm": 0.2319003790616989, + "learning_rate": 6.700482696555674e-06, + "loss": 0.006, + "step": 8190 + }, + { + "epoch": 0.13417327988218933, + "grad_norm": 0.44070616364479065, + "learning_rate": 6.708663994109467e-06, + "loss": 0.0106, + "step": 8200 + }, + { + "epoch": 0.13433690583326516, + "grad_norm": 0.39652708172798157, + "learning_rate": 6.716845291663259e-06, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.134500531784341, + "grad_norm": 0.3773319125175476, + "learning_rate": 6.725026589217051e-06, + "loss": 0.0062, + "step": 8220 + }, + { + "epoch": 0.13466415773541685, + "grad_norm": 0.2651503384113312, + "learning_rate": 6.7332078867708425e-06, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.13482778368649267, + "grad_norm": 0.2574477195739746, + "learning_rate": 6.741389184324635e-06, + "loss": 0.0067, + "step": 8240 + }, + { + "epoch": 0.1349914096375685, + "grad_norm": 0.18956246972084045, + "learning_rate": 6.749570481878427e-06, + "loss": 0.0062, + "step": 8250 + }, + { + "epoch": 0.13515503558864436, + "grad_norm": 0.40049681067466736, + "learning_rate": 6.757751779432219e-06, + "loss": 0.0064, + "step": 8260 + }, + { + "epoch": 0.1353186615397202, + "grad_norm": 0.3965357840061188, + "learning_rate": 6.765933076986011e-06, + "loss": 0.0056, + "step": 8270 + }, + { + "epoch": 0.13548228749079605, + "grad_norm": 0.2578824758529663, + "learning_rate": 6.774114374539803e-06, + "loss": 0.0064, + "step": 8280 + }, + { + "epoch": 0.13564591344187188, + "grad_norm": 0.3541015386581421, + "learning_rate": 6.782295672093595e-06, + "loss": 0.0079, + "step": 8290 + }, + { + "epoch": 0.1358095393929477, + "grad_norm": 0.4172951579093933, + "learning_rate": 6.7904769696473875e-06, + "loss": 0.0104, + "step": 8300 + }, + { + "epoch": 0.13597316534402357, + "grad_norm": 0.21050603687763214, + "learning_rate": 6.798658267201179e-06, + "loss": 0.0113, + "step": 8310 + }, + { + "epoch": 0.1361367912950994, + "grad_norm": 0.22825267910957336, + "learning_rate": 6.806839564754971e-06, + "loss": 0.0076, + "step": 8320 + }, + { + "epoch": 0.13630041724617525, + "grad_norm": 0.22266684472560883, + "learning_rate": 6.8150208623087635e-06, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.13646404319725108, + "grad_norm": 0.15727634727954865, + "learning_rate": 6.823202159862555e-06, + "loss": 0.0078, + "step": 8340 + }, + { + "epoch": 0.1366276691483269, + "grad_norm": 0.20662276446819305, + "learning_rate": 6.831383457416347e-06, + "loss": 0.0053, + "step": 8350 + }, + { + "epoch": 0.13679129509940277, + "grad_norm": 0.3878669738769531, + "learning_rate": 6.8395647549701386e-06, + "loss": 0.0045, + "step": 8360 + }, + { + "epoch": 0.1369549210504786, + "grad_norm": 0.3964359760284424, + "learning_rate": 6.84774605252393e-06, + "loss": 0.0055, + "step": 8370 + }, + { + "epoch": 0.13711854700155446, + "grad_norm": 0.44892069697380066, + "learning_rate": 6.855927350077722e-06, + "loss": 0.0083, + "step": 8380 + }, + { + "epoch": 0.1372821729526303, + "grad_norm": 0.32658880949020386, + "learning_rate": 6.8641086476315145e-06, + "loss": 0.0066, + "step": 8390 + }, + { + "epoch": 0.13744579890370612, + "grad_norm": 0.21312038600444794, + "learning_rate": 6.872289945185307e-06, + "loss": 0.0085, + "step": 8400 + }, + { + "epoch": 0.13760942485478198, + "grad_norm": 0.3220922350883484, + "learning_rate": 6.880471242739098e-06, + "loss": 0.0058, + "step": 8410 + }, + { + "epoch": 0.1377730508058578, + "grad_norm": 0.2851550877094269, + "learning_rate": 6.8886525402928904e-06, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.13793667675693366, + "grad_norm": 0.5871424674987793, + "learning_rate": 6.896833837846683e-06, + "loss": 0.0083, + "step": 8430 + }, + { + "epoch": 0.1381003027080095, + "grad_norm": 0.21022167801856995, + "learning_rate": 6.905015135400475e-06, + "loss": 0.007, + "step": 8440 + }, + { + "epoch": 0.13826392865908532, + "grad_norm": 0.3552643358707428, + "learning_rate": 6.913196432954266e-06, + "loss": 0.0072, + "step": 8450 + }, + { + "epoch": 0.13842755461016118, + "grad_norm": 0.2583826184272766, + "learning_rate": 6.921377730508059e-06, + "loss": 0.007, + "step": 8460 + }, + { + "epoch": 0.138591180561237, + "grad_norm": 0.05755360797047615, + "learning_rate": 6.929559028061851e-06, + "loss": 0.0083, + "step": 8470 + }, + { + "epoch": 0.13875480651231284, + "grad_norm": 0.34241628646850586, + "learning_rate": 6.937740325615643e-06, + "loss": 0.0065, + "step": 8480 + }, + { + "epoch": 0.1389184324633887, + "grad_norm": 0.16852635145187378, + "learning_rate": 6.945921623169435e-06, + "loss": 0.0092, + "step": 8490 + }, + { + "epoch": 0.13908205841446453, + "grad_norm": 0.3847416937351227, + "learning_rate": 6.954102920723227e-06, + "loss": 0.0058, + "step": 8500 + }, + { + "epoch": 0.13924568436554038, + "grad_norm": 0.4543159008026123, + "learning_rate": 6.962284218277019e-06, + "loss": 0.0075, + "step": 8510 + }, + { + "epoch": 0.13940931031661621, + "grad_norm": 0.20325085520744324, + "learning_rate": 6.970465515830811e-06, + "loss": 0.0051, + "step": 8520 + }, + { + "epoch": 0.13957293626769204, + "grad_norm": 0.26069197058677673, + "learning_rate": 6.978646813384603e-06, + "loss": 0.0084, + "step": 8530 + }, + { + "epoch": 0.1397365622187679, + "grad_norm": 0.27305662631988525, + "learning_rate": 6.986828110938395e-06, + "loss": 0.0067, + "step": 8540 + }, + { + "epoch": 0.13990018816984373, + "grad_norm": 0.14777232706546783, + "learning_rate": 6.995009408492187e-06, + "loss": 0.0065, + "step": 8550 + }, + { + "epoch": 0.1400638141209196, + "grad_norm": 0.22705182433128357, + "learning_rate": 7.00319070604598e-06, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.14022744007199542, + "grad_norm": 0.2742723822593689, + "learning_rate": 7.011372003599771e-06, + "loss": 0.0074, + "step": 8570 + }, + { + "epoch": 0.14039106602307125, + "grad_norm": 0.5873104929924011, + "learning_rate": 7.019553301153563e-06, + "loss": 0.0079, + "step": 8580 + }, + { + "epoch": 0.1405546919741471, + "grad_norm": 0.4223281145095825, + "learning_rate": 7.0277345987073555e-06, + "loss": 0.0091, + "step": 8590 + }, + { + "epoch": 0.14071831792522294, + "grad_norm": 0.31218650937080383, + "learning_rate": 7.035915896261147e-06, + "loss": 0.0085, + "step": 8600 + }, + { + "epoch": 0.1408819438762988, + "grad_norm": 0.19783629477024078, + "learning_rate": 7.044097193814939e-06, + "loss": 0.0052, + "step": 8610 + }, + { + "epoch": 0.14104556982737462, + "grad_norm": 0.22947272658348083, + "learning_rate": 7.0522784913687315e-06, + "loss": 0.0062, + "step": 8620 + }, + { + "epoch": 0.14120919577845045, + "grad_norm": 0.2573065459728241, + "learning_rate": 7.060459788922524e-06, + "loss": 0.0082, + "step": 8630 + }, + { + "epoch": 0.1413728217295263, + "grad_norm": 0.13020943105220795, + "learning_rate": 7.068641086476315e-06, + "loss": 0.01, + "step": 8640 + }, + { + "epoch": 0.14153644768060214, + "grad_norm": 0.11191508919000626, + "learning_rate": 7.076822384030107e-06, + "loss": 0.0072, + "step": 8650 + }, + { + "epoch": 0.141700073631678, + "grad_norm": 0.16540056467056274, + "learning_rate": 7.0850036815839e-06, + "loss": 0.0056, + "step": 8660 + }, + { + "epoch": 0.14186369958275383, + "grad_norm": 0.3133178949356079, + "learning_rate": 7.093184979137692e-06, + "loss": 0.0097, + "step": 8670 + }, + { + "epoch": 0.14202732553382966, + "grad_norm": 0.4946599304676056, + "learning_rate": 7.101366276691483e-06, + "loss": 0.0067, + "step": 8680 + }, + { + "epoch": 0.14219095148490551, + "grad_norm": 0.3116682767868042, + "learning_rate": 7.109547574245276e-06, + "loss": 0.0058, + "step": 8690 + }, + { + "epoch": 0.14235457743598134, + "grad_norm": 0.5580878257751465, + "learning_rate": 7.117728871799068e-06, + "loss": 0.008, + "step": 8700 + }, + { + "epoch": 0.14251820338705717, + "grad_norm": 0.26291677355766296, + "learning_rate": 7.12591016935286e-06, + "loss": 0.0088, + "step": 8710 + }, + { + "epoch": 0.14268182933813303, + "grad_norm": 0.4729841649532318, + "learning_rate": 7.1340914669066515e-06, + "loss": 0.01, + "step": 8720 + }, + { + "epoch": 0.14284545528920886, + "grad_norm": 0.08003254979848862, + "learning_rate": 7.142272764460444e-06, + "loss": 0.0132, + "step": 8730 + }, + { + "epoch": 0.14300908124028472, + "grad_norm": 0.37936556339263916, + "learning_rate": 7.150454062014236e-06, + "loss": 0.0078, + "step": 8740 + }, + { + "epoch": 0.14317270719136055, + "grad_norm": 0.41632506251335144, + "learning_rate": 7.158635359568028e-06, + "loss": 0.0103, + "step": 8750 + }, + { + "epoch": 0.14333633314243638, + "grad_norm": 0.1888243407011032, + "learning_rate": 7.16681665712182e-06, + "loss": 0.0074, + "step": 8760 + }, + { + "epoch": 0.14349995909351224, + "grad_norm": 0.6259981989860535, + "learning_rate": 7.174997954675612e-06, + "loss": 0.0095, + "step": 8770 + }, + { + "epoch": 0.14366358504458807, + "grad_norm": 0.17268413305282593, + "learning_rate": 7.183179252229404e-06, + "loss": 0.0056, + "step": 8780 + }, + { + "epoch": 0.14382721099566392, + "grad_norm": 0.18798314034938812, + "learning_rate": 7.1913605497831965e-06, + "loss": 0.0056, + "step": 8790 + }, + { + "epoch": 0.14399083694673975, + "grad_norm": 0.36550891399383545, + "learning_rate": 7.199541847336988e-06, + "loss": 0.0064, + "step": 8800 + }, + { + "epoch": 0.14415446289781558, + "grad_norm": 0.3121103048324585, + "learning_rate": 7.20772314489078e-06, + "loss": 0.0045, + "step": 8810 + }, + { + "epoch": 0.14431808884889144, + "grad_norm": 0.2535233199596405, + "learning_rate": 7.2159044424445725e-06, + "loss": 0.0065, + "step": 8820 + }, + { + "epoch": 0.14448171479996727, + "grad_norm": 0.36904042959213257, + "learning_rate": 7.224085739998365e-06, + "loss": 0.0071, + "step": 8830 + }, + { + "epoch": 0.14464534075104313, + "grad_norm": 0.07922352105379105, + "learning_rate": 7.232267037552156e-06, + "loss": 0.0046, + "step": 8840 + }, + { + "epoch": 0.14480896670211896, + "grad_norm": 0.22944341599941254, + "learning_rate": 7.2404483351059484e-06, + "loss": 0.0076, + "step": 8850 + }, + { + "epoch": 0.1449725926531948, + "grad_norm": 0.19778549671173096, + "learning_rate": 7.248629632659741e-06, + "loss": 0.0081, + "step": 8860 + }, + { + "epoch": 0.14513621860427064, + "grad_norm": 0.20291730761528015, + "learning_rate": 7.256810930213533e-06, + "loss": 0.0056, + "step": 8870 + }, + { + "epoch": 0.14529984455534647, + "grad_norm": 0.17763963341712952, + "learning_rate": 7.264992227767324e-06, + "loss": 0.0075, + "step": 8880 + }, + { + "epoch": 0.14546347050642233, + "grad_norm": 0.35394352674484253, + "learning_rate": 7.273173525321117e-06, + "loss": 0.0061, + "step": 8890 + }, + { + "epoch": 0.14562709645749816, + "grad_norm": 0.3305015563964844, + "learning_rate": 7.281354822874909e-06, + "loss": 0.0058, + "step": 8900 + }, + { + "epoch": 0.145790722408574, + "grad_norm": 0.25384292006492615, + "learning_rate": 7.289536120428701e-06, + "loss": 0.0051, + "step": 8910 + }, + { + "epoch": 0.14595434835964985, + "grad_norm": 0.9204688668251038, + "learning_rate": 7.2977174179824926e-06, + "loss": 0.0081, + "step": 8920 + }, + { + "epoch": 0.14611797431072568, + "grad_norm": 0.20166493952274323, + "learning_rate": 7.305898715536285e-06, + "loss": 0.0073, + "step": 8930 + }, + { + "epoch": 0.1462816002618015, + "grad_norm": 0.2840687930583954, + "learning_rate": 7.314080013090077e-06, + "loss": 0.0054, + "step": 8940 + }, + { + "epoch": 0.14644522621287737, + "grad_norm": 0.2120349258184433, + "learning_rate": 7.3222613106438685e-06, + "loss": 0.0055, + "step": 8950 + }, + { + "epoch": 0.1466088521639532, + "grad_norm": 0.29277750849723816, + "learning_rate": 7.330442608197661e-06, + "loss": 0.0047, + "step": 8960 + }, + { + "epoch": 0.14677247811502905, + "grad_norm": 0.5808455348014832, + "learning_rate": 7.338623905751453e-06, + "loss": 0.007, + "step": 8970 + }, + { + "epoch": 0.14693610406610488, + "grad_norm": 0.2462371289730072, + "learning_rate": 7.346805203305245e-06, + "loss": 0.0057, + "step": 8980 + }, + { + "epoch": 0.1470997300171807, + "grad_norm": 0.44763612747192383, + "learning_rate": 7.354986500859037e-06, + "loss": 0.0056, + "step": 8990 + }, + { + "epoch": 0.14726335596825657, + "grad_norm": 0.230062335729599, + "learning_rate": 7.363167798412829e-06, + "loss": 0.0056, + "step": 9000 + }, + { + "epoch": 0.1474269819193324, + "grad_norm": 0.25067564845085144, + "learning_rate": 7.371349095966621e-06, + "loss": 0.0074, + "step": 9010 + }, + { + "epoch": 0.14759060787040826, + "grad_norm": 0.2506806254386902, + "learning_rate": 7.3795303935204135e-06, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.1477542338214841, + "grad_norm": 0.35419762134552, + "learning_rate": 7.387711691074205e-06, + "loss": 0.0065, + "step": 9030 + }, + { + "epoch": 0.14791785977255992, + "grad_norm": 0.33476555347442627, + "learning_rate": 7.395892988627997e-06, + "loss": 0.0065, + "step": 9040 + }, + { + "epoch": 0.14808148572363578, + "grad_norm": 0.22554193437099457, + "learning_rate": 7.4040742861817894e-06, + "loss": 0.0069, + "step": 9050 + }, + { + "epoch": 0.1482451116747116, + "grad_norm": 0.19346824288368225, + "learning_rate": 7.412255583735582e-06, + "loss": 0.0072, + "step": 9060 + }, + { + "epoch": 0.14840873762578746, + "grad_norm": 0.4060174822807312, + "learning_rate": 7.420436881289373e-06, + "loss": 0.008, + "step": 9070 + }, + { + "epoch": 0.1485723635768633, + "grad_norm": 0.32242077589035034, + "learning_rate": 7.428618178843165e-06, + "loss": 0.0077, + "step": 9080 + }, + { + "epoch": 0.14873598952793912, + "grad_norm": 0.28858497738838196, + "learning_rate": 7.436799476396958e-06, + "loss": 0.0046, + "step": 9090 + }, + { + "epoch": 0.14889961547901498, + "grad_norm": 0.1785849779844284, + "learning_rate": 7.44498077395075e-06, + "loss": 0.0056, + "step": 9100 + }, + { + "epoch": 0.1490632414300908, + "grad_norm": 0.12223095446825027, + "learning_rate": 7.453162071504541e-06, + "loss": 0.0055, + "step": 9110 + }, + { + "epoch": 0.14922686738116667, + "grad_norm": 0.20449209213256836, + "learning_rate": 7.461343369058334e-06, + "loss": 0.0101, + "step": 9120 + }, + { + "epoch": 0.1493904933322425, + "grad_norm": 0.1937507539987564, + "learning_rate": 7.469524666612126e-06, + "loss": 0.0073, + "step": 9130 + }, + { + "epoch": 0.14955411928331833, + "grad_norm": 0.15817973017692566, + "learning_rate": 7.477705964165918e-06, + "loss": 0.0054, + "step": 9140 + }, + { + "epoch": 0.14971774523439418, + "grad_norm": 0.3396613299846649, + "learning_rate": 7.4858872617197095e-06, + "loss": 0.0066, + "step": 9150 + }, + { + "epoch": 0.14988137118547, + "grad_norm": 0.3950789272785187, + "learning_rate": 7.494068559273502e-06, + "loss": 0.009, + "step": 9160 + }, + { + "epoch": 0.15004499713654584, + "grad_norm": 0.2544097304344177, + "learning_rate": 7.502249856827294e-06, + "loss": 0.0076, + "step": 9170 + }, + { + "epoch": 0.1502086230876217, + "grad_norm": 0.3011780381202698, + "learning_rate": 7.510431154381086e-06, + "loss": 0.0066, + "step": 9180 + }, + { + "epoch": 0.15037224903869753, + "grad_norm": 0.2967643737792969, + "learning_rate": 7.518612451934878e-06, + "loss": 0.006, + "step": 9190 + }, + { + "epoch": 0.1505358749897734, + "grad_norm": 0.34138962626457214, + "learning_rate": 7.52679374948867e-06, + "loss": 0.0072, + "step": 9200 + }, + { + "epoch": 0.15069950094084922, + "grad_norm": 0.36358416080474854, + "learning_rate": 7.534975047042462e-06, + "loss": 0.0052, + "step": 9210 + }, + { + "epoch": 0.15086312689192505, + "grad_norm": 0.06522957235574722, + "learning_rate": 7.5431563445962545e-06, + "loss": 0.0043, + "step": 9220 + }, + { + "epoch": 0.1510267528430009, + "grad_norm": 0.2662425935268402, + "learning_rate": 7.551337642150046e-06, + "loss": 0.0073, + "step": 9230 + }, + { + "epoch": 0.15119037879407674, + "grad_norm": 0.11464164406061172, + "learning_rate": 7.559518939703837e-06, + "loss": 0.0053, + "step": 9240 + }, + { + "epoch": 0.1513540047451526, + "grad_norm": 0.13053202629089355, + "learning_rate": 7.567700237257629e-06, + "loss": 0.004, + "step": 9250 + }, + { + "epoch": 0.15151763069622842, + "grad_norm": 0.21175773441791534, + "learning_rate": 7.575881534811421e-06, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.15168125664730425, + "grad_norm": 0.11414758116006851, + "learning_rate": 7.584062832365213e-06, + "loss": 0.0051, + "step": 9270 + }, + { + "epoch": 0.1518448825983801, + "grad_norm": 0.21577127277851105, + "learning_rate": 7.5922441299190056e-06, + "loss": 0.0045, + "step": 9280 + }, + { + "epoch": 0.15200850854945594, + "grad_norm": 0.45842885971069336, + "learning_rate": 7.600425427472797e-06, + "loss": 0.0055, + "step": 9290 + }, + { + "epoch": 0.1521721345005318, + "grad_norm": 0.40762707591056824, + "learning_rate": 7.608606725026589e-06, + "loss": 0.0053, + "step": 9300 + }, + { + "epoch": 0.15233576045160763, + "grad_norm": 0.1374872475862503, + "learning_rate": 7.6167880225803815e-06, + "loss": 0.0088, + "step": 9310 + }, + { + "epoch": 0.15249938640268346, + "grad_norm": 0.23885613679885864, + "learning_rate": 7.624969320134174e-06, + "loss": 0.0082, + "step": 9320 + }, + { + "epoch": 0.15266301235375931, + "grad_norm": 0.09828246384859085, + "learning_rate": 7.633150617687965e-06, + "loss": 0.005, + "step": 9330 + }, + { + "epoch": 0.15282663830483514, + "grad_norm": 0.39649564027786255, + "learning_rate": 7.641331915241758e-06, + "loss": 0.0085, + "step": 9340 + }, + { + "epoch": 0.15299026425591097, + "grad_norm": 0.40593019127845764, + "learning_rate": 7.64951321279555e-06, + "loss": 0.0127, + "step": 9350 + }, + { + "epoch": 0.15315389020698683, + "grad_norm": 0.26107296347618103, + "learning_rate": 7.657694510349341e-06, + "loss": 0.007, + "step": 9360 + }, + { + "epoch": 0.15331751615806266, + "grad_norm": 0.39782312512397766, + "learning_rate": 7.665875807903134e-06, + "loss": 0.0084, + "step": 9370 + }, + { + "epoch": 0.15348114210913852, + "grad_norm": 0.40781745314598083, + "learning_rate": 7.674057105456926e-06, + "loss": 0.0066, + "step": 9380 + }, + { + "epoch": 0.15364476806021435, + "grad_norm": 0.27887535095214844, + "learning_rate": 7.682238403010717e-06, + "loss": 0.0052, + "step": 9390 + }, + { + "epoch": 0.15380839401129018, + "grad_norm": 0.2981220781803131, + "learning_rate": 7.69041970056451e-06, + "loss": 0.0063, + "step": 9400 + }, + { + "epoch": 0.15397201996236604, + "grad_norm": 0.23114238679409027, + "learning_rate": 7.698600998118302e-06, + "loss": 0.0074, + "step": 9410 + }, + { + "epoch": 0.15413564591344187, + "grad_norm": 0.26542651653289795, + "learning_rate": 7.706782295672095e-06, + "loss": 0.0042, + "step": 9420 + }, + { + "epoch": 0.15429927186451772, + "grad_norm": 0.41108861565589905, + "learning_rate": 7.714963593225886e-06, + "loss": 0.0072, + "step": 9430 + }, + { + "epoch": 0.15446289781559355, + "grad_norm": 0.06760261207818985, + "learning_rate": 7.723144890779678e-06, + "loss": 0.0082, + "step": 9440 + }, + { + "epoch": 0.15462652376666938, + "grad_norm": 0.4643831253051758, + "learning_rate": 7.73132618833347e-06, + "loss": 0.0081, + "step": 9450 + }, + { + "epoch": 0.15479014971774524, + "grad_norm": 0.5153923630714417, + "learning_rate": 7.739507485887262e-06, + "loss": 0.0078, + "step": 9460 + }, + { + "epoch": 0.15495377566882107, + "grad_norm": 0.45792943239212036, + "learning_rate": 7.747688783441053e-06, + "loss": 0.0071, + "step": 9470 + }, + { + "epoch": 0.15511740161989693, + "grad_norm": 0.27561572194099426, + "learning_rate": 7.755870080994847e-06, + "loss": 0.0083, + "step": 9480 + }, + { + "epoch": 0.15528102757097276, + "grad_norm": 0.08852320164442062, + "learning_rate": 7.764051378548638e-06, + "loss": 0.007, + "step": 9490 + }, + { + "epoch": 0.1554446535220486, + "grad_norm": 0.42838406562805176, + "learning_rate": 7.772232676102431e-06, + "loss": 0.0056, + "step": 9500 + }, + { + "epoch": 0.15560827947312444, + "grad_norm": 0.5333880186080933, + "learning_rate": 7.780413973656223e-06, + "loss": 0.0058, + "step": 9510 + }, + { + "epoch": 0.15577190542420027, + "grad_norm": 0.12857618927955627, + "learning_rate": 7.788595271210014e-06, + "loss": 0.0043, + "step": 9520 + }, + { + "epoch": 0.15593553137527613, + "grad_norm": 0.2167769968509674, + "learning_rate": 7.796776568763807e-06, + "loss": 0.008, + "step": 9530 + }, + { + "epoch": 0.15609915732635196, + "grad_norm": 0.17934668064117432, + "learning_rate": 7.804957866317598e-06, + "loss": 0.0057, + "step": 9540 + }, + { + "epoch": 0.1562627832774278, + "grad_norm": 0.22155557572841644, + "learning_rate": 7.81313916387139e-06, + "loss": 0.006, + "step": 9550 + }, + { + "epoch": 0.15642640922850365, + "grad_norm": 0.3502984344959259, + "learning_rate": 7.821320461425183e-06, + "loss": 0.0058, + "step": 9560 + }, + { + "epoch": 0.15659003517957948, + "grad_norm": 0.07560534030199051, + "learning_rate": 7.829501758978974e-06, + "loss": 0.0089, + "step": 9570 + }, + { + "epoch": 0.1567536611306553, + "grad_norm": 0.19566108286380768, + "learning_rate": 7.837683056532766e-06, + "loss": 0.0056, + "step": 9580 + }, + { + "epoch": 0.15691728708173117, + "grad_norm": 0.17508867383003235, + "learning_rate": 7.845864354086559e-06, + "loss": 0.0077, + "step": 9590 + }, + { + "epoch": 0.157080913032807, + "grad_norm": 0.31950974464416504, + "learning_rate": 7.85404565164035e-06, + "loss": 0.0097, + "step": 9600 + }, + { + "epoch": 0.15724453898388285, + "grad_norm": 0.210434690117836, + "learning_rate": 7.862226949194143e-06, + "loss": 0.0068, + "step": 9610 + }, + { + "epoch": 0.15740816493495868, + "grad_norm": 0.13805869221687317, + "learning_rate": 7.870408246747935e-06, + "loss": 0.0054, + "step": 9620 + }, + { + "epoch": 0.1575717908860345, + "grad_norm": 0.1168903037905693, + "learning_rate": 7.878589544301726e-06, + "loss": 0.0039, + "step": 9630 + }, + { + "epoch": 0.15773541683711037, + "grad_norm": 0.48040199279785156, + "learning_rate": 7.88677084185552e-06, + "loss": 0.0092, + "step": 9640 + }, + { + "epoch": 0.1578990427881862, + "grad_norm": 0.20810607075691223, + "learning_rate": 7.89495213940931e-06, + "loss": 0.0067, + "step": 9650 + }, + { + "epoch": 0.15806266873926206, + "grad_norm": 0.2741049826145172, + "learning_rate": 7.903133436963102e-06, + "loss": 0.0068, + "step": 9660 + }, + { + "epoch": 0.1582262946903379, + "grad_norm": 0.25321799516677856, + "learning_rate": 7.911314734516895e-06, + "loss": 0.0078, + "step": 9670 + }, + { + "epoch": 0.15838992064141372, + "grad_norm": 0.14259810745716095, + "learning_rate": 7.919496032070687e-06, + "loss": 0.0073, + "step": 9680 + }, + { + "epoch": 0.15855354659248957, + "grad_norm": 0.29586896300315857, + "learning_rate": 7.92767732962448e-06, + "loss": 0.0094, + "step": 9690 + }, + { + "epoch": 0.1587171725435654, + "grad_norm": 0.8661742806434631, + "learning_rate": 7.935858627178271e-06, + "loss": 0.0058, + "step": 9700 + }, + { + "epoch": 0.15888079849464126, + "grad_norm": 0.4148622155189514, + "learning_rate": 7.944039924732063e-06, + "loss": 0.006, + "step": 9710 + }, + { + "epoch": 0.1590444244457171, + "grad_norm": 0.2772292196750641, + "learning_rate": 7.952221222285856e-06, + "loss": 0.0055, + "step": 9720 + }, + { + "epoch": 0.15920805039679292, + "grad_norm": 0.3952766954898834, + "learning_rate": 7.960402519839647e-06, + "loss": 0.0064, + "step": 9730 + }, + { + "epoch": 0.15937167634786878, + "grad_norm": 0.19145947694778442, + "learning_rate": 7.968583817393439e-06, + "loss": 0.0039, + "step": 9740 + }, + { + "epoch": 0.1595353022989446, + "grad_norm": 0.255990594625473, + "learning_rate": 7.976765114947232e-06, + "loss": 0.0056, + "step": 9750 + }, + { + "epoch": 0.15969892825002047, + "grad_norm": 0.05305280536413193, + "learning_rate": 7.984946412501023e-06, + "loss": 0.0047, + "step": 9760 + }, + { + "epoch": 0.1598625542010963, + "grad_norm": 0.07129017263650894, + "learning_rate": 7.993127710054816e-06, + "loss": 0.0054, + "step": 9770 + }, + { + "epoch": 0.16002618015217213, + "grad_norm": 0.2662150263786316, + "learning_rate": 8.001309007608608e-06, + "loss": 0.0058, + "step": 9780 + }, + { + "epoch": 0.16018980610324798, + "grad_norm": 0.3693317174911499, + "learning_rate": 8.009490305162399e-06, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.1603534320543238, + "grad_norm": 0.24876077473163605, + "learning_rate": 8.017671602716192e-06, + "loss": 0.0081, + "step": 9800 + }, + { + "epoch": 0.16051705800539964, + "grad_norm": 0.14239034056663513, + "learning_rate": 8.025852900269984e-06, + "loss": 0.0067, + "step": 9810 + }, + { + "epoch": 0.1606806839564755, + "grad_norm": 0.6243664622306824, + "learning_rate": 8.034034197823775e-06, + "loss": 0.0057, + "step": 9820 + }, + { + "epoch": 0.16084430990755133, + "grad_norm": 0.5805561542510986, + "learning_rate": 8.042215495377568e-06, + "loss": 0.0074, + "step": 9830 + }, + { + "epoch": 0.1610079358586272, + "grad_norm": 0.18034577369689941, + "learning_rate": 8.05039679293136e-06, + "loss": 0.0053, + "step": 9840 + }, + { + "epoch": 0.16117156180970302, + "grad_norm": 0.20209714770317078, + "learning_rate": 8.058578090485151e-06, + "loss": 0.0087, + "step": 9850 + }, + { + "epoch": 0.16133518776077885, + "grad_norm": 0.2705497443675995, + "learning_rate": 8.066759388038944e-06, + "loss": 0.0078, + "step": 9860 + }, + { + "epoch": 0.1614988137118547, + "grad_norm": 0.35468733310699463, + "learning_rate": 8.074940685592736e-06, + "loss": 0.0074, + "step": 9870 + }, + { + "epoch": 0.16166243966293053, + "grad_norm": 0.14945657551288605, + "learning_rate": 8.083121983146529e-06, + "loss": 0.0077, + "step": 9880 + }, + { + "epoch": 0.1618260656140064, + "grad_norm": 0.2539266347885132, + "learning_rate": 8.09130328070032e-06, + "loss": 0.0032, + "step": 9890 + }, + { + "epoch": 0.16198969156508222, + "grad_norm": 0.2711421549320221, + "learning_rate": 8.099484578254111e-06, + "loss": 0.0085, + "step": 9900 + }, + { + "epoch": 0.16215331751615805, + "grad_norm": 0.2099442034959793, + "learning_rate": 8.107665875807905e-06, + "loss": 0.0075, + "step": 9910 + }, + { + "epoch": 0.1623169434672339, + "grad_norm": 0.11686094850301743, + "learning_rate": 8.115847173361696e-06, + "loss": 0.0059, + "step": 9920 + }, + { + "epoch": 0.16248056941830974, + "grad_norm": 0.18301193416118622, + "learning_rate": 8.124028470915487e-06, + "loss": 0.0068, + "step": 9930 + }, + { + "epoch": 0.1626441953693856, + "grad_norm": 0.1897609382867813, + "learning_rate": 8.13220976846928e-06, + "loss": 0.0079, + "step": 9940 + }, + { + "epoch": 0.16280782132046143, + "grad_norm": 0.32352903485298157, + "learning_rate": 8.140391066023072e-06, + "loss": 0.0074, + "step": 9950 + }, + { + "epoch": 0.16297144727153726, + "grad_norm": 0.10327289253473282, + "learning_rate": 8.148572363576865e-06, + "loss": 0.0032, + "step": 9960 + }, + { + "epoch": 0.16313507322261311, + "grad_norm": 0.18615023791790009, + "learning_rate": 8.156753661130656e-06, + "loss": 0.0085, + "step": 9970 + }, + { + "epoch": 0.16329869917368894, + "grad_norm": 0.26965397596359253, + "learning_rate": 8.164934958684448e-06, + "loss": 0.0055, + "step": 9980 + }, + { + "epoch": 0.1634623251247648, + "grad_norm": 0.15522529184818268, + "learning_rate": 8.173116256238241e-06, + "loss": 0.007, + "step": 9990 + }, + { + "epoch": 0.16362595107584063, + "grad_norm": 0.22529001533985138, + "learning_rate": 8.181297553792032e-06, + "loss": 0.005, + "step": 10000 + }, + { + "epoch": 0.16378957702691646, + "grad_norm": 0.24302367866039276, + "learning_rate": 8.189478851345824e-06, + "loss": 0.0082, + "step": 10010 + }, + { + "epoch": 0.16395320297799232, + "grad_norm": 0.25101688504219055, + "learning_rate": 8.197660148899617e-06, + "loss": 0.0071, + "step": 10020 + }, + { + "epoch": 0.16411682892906815, + "grad_norm": 0.3425748348236084, + "learning_rate": 8.205841446453408e-06, + "loss": 0.0091, + "step": 10030 + }, + { + "epoch": 0.16428045488014398, + "grad_norm": 0.270454466342926, + "learning_rate": 8.214022744007201e-06, + "loss": 0.0054, + "step": 10040 + }, + { + "epoch": 0.16444408083121984, + "grad_norm": 0.21871308982372284, + "learning_rate": 8.222204041560993e-06, + "loss": 0.0063, + "step": 10050 + }, + { + "epoch": 0.16460770678229567, + "grad_norm": 0.21748510003089905, + "learning_rate": 8.230385339114784e-06, + "loss": 0.0057, + "step": 10060 + }, + { + "epoch": 0.16477133273337152, + "grad_norm": 0.365843266248703, + "learning_rate": 8.238566636668577e-06, + "loss": 0.007, + "step": 10070 + }, + { + "epoch": 0.16493495868444735, + "grad_norm": 0.20976294577121735, + "learning_rate": 8.246747934222369e-06, + "loss": 0.0066, + "step": 10080 + }, + { + "epoch": 0.16509858463552318, + "grad_norm": 0.37279391288757324, + "learning_rate": 8.25492923177616e-06, + "loss": 0.0063, + "step": 10090 + }, + { + "epoch": 0.16526221058659904, + "grad_norm": 0.22895392775535583, + "learning_rate": 8.263110529329953e-06, + "loss": 0.0065, + "step": 10100 + }, + { + "epoch": 0.16542583653767487, + "grad_norm": 0.27912771701812744, + "learning_rate": 8.271291826883745e-06, + "loss": 0.0091, + "step": 10110 + }, + { + "epoch": 0.16558946248875073, + "grad_norm": 0.4469371736049652, + "learning_rate": 8.279473124437536e-06, + "loss": 0.0064, + "step": 10120 + }, + { + "epoch": 0.16575308843982656, + "grad_norm": 0.3388589322566986, + "learning_rate": 8.287654421991328e-06, + "loss": 0.0076, + "step": 10130 + }, + { + "epoch": 0.1659167143909024, + "grad_norm": 0.22695420682430267, + "learning_rate": 8.29583571954512e-06, + "loss": 0.0059, + "step": 10140 + }, + { + "epoch": 0.16608034034197824, + "grad_norm": 0.1658775806427002, + "learning_rate": 8.304017017098912e-06, + "loss": 0.0088, + "step": 10150 + }, + { + "epoch": 0.16624396629305407, + "grad_norm": 0.2975265383720398, + "learning_rate": 8.312198314652704e-06, + "loss": 0.0076, + "step": 10160 + }, + { + "epoch": 0.16640759224412993, + "grad_norm": 0.1491502821445465, + "learning_rate": 8.320379612206497e-06, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.16657121819520576, + "grad_norm": 0.2581508457660675, + "learning_rate": 8.328560909760288e-06, + "loss": 0.0082, + "step": 10180 + }, + { + "epoch": 0.1667348441462816, + "grad_norm": 0.17519281804561615, + "learning_rate": 8.33674220731408e-06, + "loss": 0.0042, + "step": 10190 + }, + { + "epoch": 0.16689847009735745, + "grad_norm": 0.1720351129770279, + "learning_rate": 8.344923504867873e-06, + "loss": 0.0056, + "step": 10200 + }, + { + "epoch": 0.16706209604843328, + "grad_norm": 0.22692690789699554, + "learning_rate": 8.353104802421664e-06, + "loss": 0.0077, + "step": 10210 + }, + { + "epoch": 0.16722572199950914, + "grad_norm": 0.16719768941402435, + "learning_rate": 8.361286099975457e-06, + "loss": 0.0058, + "step": 10220 + }, + { + "epoch": 0.16738934795058497, + "grad_norm": 0.21581275761127472, + "learning_rate": 8.369467397529249e-06, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.1675529739016608, + "grad_norm": 0.40083274245262146, + "learning_rate": 8.37764869508304e-06, + "loss": 0.0083, + "step": 10240 + }, + { + "epoch": 0.16771659985273665, + "grad_norm": 0.5954267382621765, + "learning_rate": 8.385829992636833e-06, + "loss": 0.0063, + "step": 10250 + }, + { + "epoch": 0.16788022580381248, + "grad_norm": 0.3549838066101074, + "learning_rate": 8.394011290190624e-06, + "loss": 0.0068, + "step": 10260 + }, + { + "epoch": 0.1680438517548883, + "grad_norm": 0.1814076453447342, + "learning_rate": 8.402192587744416e-06, + "loss": 0.0105, + "step": 10270 + }, + { + "epoch": 0.16820747770596417, + "grad_norm": 0.19932052493095398, + "learning_rate": 8.410373885298209e-06, + "loss": 0.0082, + "step": 10280 + }, + { + "epoch": 0.16837110365704, + "grad_norm": 0.1761980503797531, + "learning_rate": 8.418555182852e-06, + "loss": 0.0059, + "step": 10290 + }, + { + "epoch": 0.16853472960811586, + "grad_norm": 0.2077781707048416, + "learning_rate": 8.426736480405793e-06, + "loss": 0.0082, + "step": 10300 + }, + { + "epoch": 0.1686983555591917, + "grad_norm": 0.3204112946987152, + "learning_rate": 8.434917777959585e-06, + "loss": 0.0067, + "step": 10310 + }, + { + "epoch": 0.16886198151026752, + "grad_norm": 0.08514495193958282, + "learning_rate": 8.443099075513376e-06, + "loss": 0.007, + "step": 10320 + }, + { + "epoch": 0.16902560746134337, + "grad_norm": 0.16202858090400696, + "learning_rate": 8.45128037306717e-06, + "loss": 0.0086, + "step": 10330 + }, + { + "epoch": 0.1691892334124192, + "grad_norm": 0.3120019733905792, + "learning_rate": 8.459461670620961e-06, + "loss": 0.0054, + "step": 10340 + }, + { + "epoch": 0.16935285936349506, + "grad_norm": 0.7790592312812805, + "learning_rate": 8.467642968174752e-06, + "loss": 0.0085, + "step": 10350 + }, + { + "epoch": 0.1695164853145709, + "grad_norm": 0.6006662249565125, + "learning_rate": 8.475824265728545e-06, + "loss": 0.0081, + "step": 10360 + }, + { + "epoch": 0.16968011126564672, + "grad_norm": 0.34162044525146484, + "learning_rate": 8.484005563282337e-06, + "loss": 0.0061, + "step": 10370 + }, + { + "epoch": 0.16984373721672258, + "grad_norm": 0.061774272471666336, + "learning_rate": 8.49218686083613e-06, + "loss": 0.0069, + "step": 10380 + }, + { + "epoch": 0.1700073631677984, + "grad_norm": 0.22786760330200195, + "learning_rate": 8.500368158389921e-06, + "loss": 0.0114, + "step": 10390 + }, + { + "epoch": 0.17017098911887427, + "grad_norm": 0.3290214240550995, + "learning_rate": 8.508549455943713e-06, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.1703346150699501, + "grad_norm": 0.19510148465633392, + "learning_rate": 8.516730753497506e-06, + "loss": 0.0063, + "step": 10410 + }, + { + "epoch": 0.17049824102102593, + "grad_norm": 0.07396508753299713, + "learning_rate": 8.524912051051297e-06, + "loss": 0.0051, + "step": 10420 + }, + { + "epoch": 0.17066186697210178, + "grad_norm": 0.25472307205200195, + "learning_rate": 8.533093348605089e-06, + "loss": 0.0079, + "step": 10430 + }, + { + "epoch": 0.1708254929231776, + "grad_norm": 0.18875336647033691, + "learning_rate": 8.541274646158882e-06, + "loss": 0.004, + "step": 10440 + }, + { + "epoch": 0.17098911887425344, + "grad_norm": 0.21490013599395752, + "learning_rate": 8.549455943712673e-06, + "loss": 0.0047, + "step": 10450 + }, + { + "epoch": 0.1711527448253293, + "grad_norm": 0.08458748459815979, + "learning_rate": 8.557637241266465e-06, + "loss": 0.0075, + "step": 10460 + }, + { + "epoch": 0.17131637077640513, + "grad_norm": 0.29996681213378906, + "learning_rate": 8.565818538820258e-06, + "loss": 0.0066, + "step": 10470 + }, + { + "epoch": 0.171479996727481, + "grad_norm": 0.17372117936611176, + "learning_rate": 8.573999836374049e-06, + "loss": 0.008, + "step": 10480 + }, + { + "epoch": 0.17164362267855682, + "grad_norm": 0.21046558022499084, + "learning_rate": 8.582181133927842e-06, + "loss": 0.0099, + "step": 10490 + }, + { + "epoch": 0.17180724862963265, + "grad_norm": 0.48645058274269104, + "learning_rate": 8.590362431481634e-06, + "loss": 0.0048, + "step": 10500 + }, + { + "epoch": 0.1719708745807085, + "grad_norm": 0.16859287023544312, + "learning_rate": 8.598543729035425e-06, + "loss": 0.0076, + "step": 10510 + }, + { + "epoch": 0.17213450053178433, + "grad_norm": 0.37864935398101807, + "learning_rate": 8.606725026589218e-06, + "loss": 0.0075, + "step": 10520 + }, + { + "epoch": 0.1722981264828602, + "grad_norm": 0.2491617202758789, + "learning_rate": 8.61490632414301e-06, + "loss": 0.0059, + "step": 10530 + }, + { + "epoch": 0.17246175243393602, + "grad_norm": 0.29351696372032166, + "learning_rate": 8.623087621696801e-06, + "loss": 0.0086, + "step": 10540 + }, + { + "epoch": 0.17262537838501185, + "grad_norm": 0.30968332290649414, + "learning_rate": 8.631268919250594e-06, + "loss": 0.0075, + "step": 10550 + }, + { + "epoch": 0.1727890043360877, + "grad_norm": 0.11242213845252991, + "learning_rate": 8.639450216804386e-06, + "loss": 0.007, + "step": 10560 + }, + { + "epoch": 0.17295263028716354, + "grad_norm": 0.0518312007188797, + "learning_rate": 8.647631514358179e-06, + "loss": 0.0079, + "step": 10570 + }, + { + "epoch": 0.1731162562382394, + "grad_norm": 0.14475904405117035, + "learning_rate": 8.65581281191197e-06, + "loss": 0.0055, + "step": 10580 + }, + { + "epoch": 0.17327988218931523, + "grad_norm": 0.08665602654218674, + "learning_rate": 8.663994109465761e-06, + "loss": 0.0061, + "step": 10590 + }, + { + "epoch": 0.17344350814039106, + "grad_norm": 0.23037134110927582, + "learning_rate": 8.672175407019555e-06, + "loss": 0.0073, + "step": 10600 + }, + { + "epoch": 0.1736071340914669, + "grad_norm": 0.33483126759529114, + "learning_rate": 8.680356704573346e-06, + "loss": 0.0034, + "step": 10610 + }, + { + "epoch": 0.17377076004254274, + "grad_norm": 0.3462408781051636, + "learning_rate": 8.688538002127137e-06, + "loss": 0.0058, + "step": 10620 + }, + { + "epoch": 0.1739343859936186, + "grad_norm": 0.2189795821905136, + "learning_rate": 8.69671929968093e-06, + "loss": 0.005, + "step": 10630 + }, + { + "epoch": 0.17409801194469443, + "grad_norm": 0.14973264932632446, + "learning_rate": 8.704900597234722e-06, + "loss": 0.0064, + "step": 10640 + }, + { + "epoch": 0.17426163789577026, + "grad_norm": 0.2500171959400177, + "learning_rate": 8.713081894788515e-06, + "loss": 0.0052, + "step": 10650 + }, + { + "epoch": 0.17442526384684612, + "grad_norm": 0.349170982837677, + "learning_rate": 8.721263192342306e-06, + "loss": 0.005, + "step": 10660 + }, + { + "epoch": 0.17458888979792195, + "grad_norm": 0.22084467113018036, + "learning_rate": 8.729444489896098e-06, + "loss": 0.0045, + "step": 10670 + }, + { + "epoch": 0.17475251574899778, + "grad_norm": 0.2274869978427887, + "learning_rate": 8.737625787449891e-06, + "loss": 0.0084, + "step": 10680 + }, + { + "epoch": 0.17491614170007364, + "grad_norm": 0.24188733100891113, + "learning_rate": 8.745807085003682e-06, + "loss": 0.0045, + "step": 10690 + }, + { + "epoch": 0.17507976765114946, + "grad_norm": 0.16240046918392181, + "learning_rate": 8.753988382557474e-06, + "loss": 0.0059, + "step": 10700 + }, + { + "epoch": 0.17524339360222532, + "grad_norm": 0.12881557643413544, + "learning_rate": 8.762169680111267e-06, + "loss": 0.0041, + "step": 10710 + }, + { + "epoch": 0.17540701955330115, + "grad_norm": 0.24191954731941223, + "learning_rate": 8.770350977665058e-06, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.17557064550437698, + "grad_norm": 0.08183182030916214, + "learning_rate": 8.778532275218851e-06, + "loss": 0.0063, + "step": 10730 + }, + { + "epoch": 0.17573427145545284, + "grad_norm": 0.29195988178253174, + "learning_rate": 8.786713572772643e-06, + "loss": 0.0059, + "step": 10740 + }, + { + "epoch": 0.17589789740652867, + "grad_norm": 0.19651706516742706, + "learning_rate": 8.794894870326434e-06, + "loss": 0.0034, + "step": 10750 + }, + { + "epoch": 0.17606152335760453, + "grad_norm": 0.34899449348449707, + "learning_rate": 8.803076167880227e-06, + "loss": 0.0067, + "step": 10760 + }, + { + "epoch": 0.17622514930868036, + "grad_norm": 0.06525447964668274, + "learning_rate": 8.811257465434019e-06, + "loss": 0.0054, + "step": 10770 + }, + { + "epoch": 0.1763887752597562, + "grad_norm": 0.34989553689956665, + "learning_rate": 8.81943876298781e-06, + "loss": 0.0051, + "step": 10780 + }, + { + "epoch": 0.17655240121083204, + "grad_norm": 0.3499177396297455, + "learning_rate": 8.827620060541603e-06, + "loss": 0.0053, + "step": 10790 + }, + { + "epoch": 0.17671602716190787, + "grad_norm": 0.02204432711005211, + "learning_rate": 8.835801358095395e-06, + "loss": 0.004, + "step": 10800 + }, + { + "epoch": 0.17687965311298373, + "grad_norm": 0.17593590915203094, + "learning_rate": 8.843982655649186e-06, + "loss": 0.0057, + "step": 10810 + }, + { + "epoch": 0.17704327906405956, + "grad_norm": 0.034615494310855865, + "learning_rate": 8.85216395320298e-06, + "loss": 0.0073, + "step": 10820 + }, + { + "epoch": 0.1772069050151354, + "grad_norm": 0.22318828105926514, + "learning_rate": 8.86034525075677e-06, + "loss": 0.0044, + "step": 10830 + }, + { + "epoch": 0.17737053096621125, + "grad_norm": 0.24265556037425995, + "learning_rate": 8.868526548310564e-06, + "loss": 0.0039, + "step": 10840 + }, + { + "epoch": 0.17753415691728708, + "grad_norm": 0.06994540989398956, + "learning_rate": 8.876707845864355e-06, + "loss": 0.0065, + "step": 10850 + }, + { + "epoch": 0.17769778286836294, + "grad_norm": 0.2045663446187973, + "learning_rate": 8.884889143418147e-06, + "loss": 0.0058, + "step": 10860 + }, + { + "epoch": 0.17786140881943877, + "grad_norm": 0.38222843408584595, + "learning_rate": 8.89307044097194e-06, + "loss": 0.0073, + "step": 10870 + }, + { + "epoch": 0.1780250347705146, + "grad_norm": 0.29041460156440735, + "learning_rate": 8.901251738525731e-06, + "loss": 0.0054, + "step": 10880 + }, + { + "epoch": 0.17818866072159045, + "grad_norm": 0.2893683910369873, + "learning_rate": 8.909433036079523e-06, + "loss": 0.0059, + "step": 10890 + }, + { + "epoch": 0.17835228667266628, + "grad_norm": 0.3193666338920593, + "learning_rate": 8.917614333633316e-06, + "loss": 0.0057, + "step": 10900 + }, + { + "epoch": 0.1785159126237421, + "grad_norm": 0.4353903830051422, + "learning_rate": 8.925795631187107e-06, + "loss": 0.0065, + "step": 10910 + }, + { + "epoch": 0.17867953857481797, + "grad_norm": 0.15185204148292542, + "learning_rate": 8.9339769287409e-06, + "loss": 0.0052, + "step": 10920 + }, + { + "epoch": 0.1788431645258938, + "grad_norm": 0.01609669253230095, + "learning_rate": 8.942158226294692e-06, + "loss": 0.0075, + "step": 10930 + }, + { + "epoch": 0.17900679047696966, + "grad_norm": 0.20663630962371826, + "learning_rate": 8.950339523848483e-06, + "loss": 0.0053, + "step": 10940 + }, + { + "epoch": 0.1791704164280455, + "grad_norm": 0.17413762211799622, + "learning_rate": 8.958520821402276e-06, + "loss": 0.0077, + "step": 10950 + }, + { + "epoch": 0.17933404237912132, + "grad_norm": 0.18656152486801147, + "learning_rate": 8.966702118956068e-06, + "loss": 0.0076, + "step": 10960 + }, + { + "epoch": 0.17949766833019717, + "grad_norm": 0.2067999392747879, + "learning_rate": 8.974883416509859e-06, + "loss": 0.0071, + "step": 10970 + }, + { + "epoch": 0.179661294281273, + "grad_norm": 0.182582288980484, + "learning_rate": 8.983064714063652e-06, + "loss": 0.0052, + "step": 10980 + }, + { + "epoch": 0.17982492023234886, + "grad_norm": 0.14651651680469513, + "learning_rate": 8.991246011617444e-06, + "loss": 0.0074, + "step": 10990 + }, + { + "epoch": 0.1799885461834247, + "grad_norm": 0.15958207845687866, + "learning_rate": 8.999427309171235e-06, + "loss": 0.0069, + "step": 11000 + }, + { + "epoch": 0.18015217213450052, + "grad_norm": 0.2562597990036011, + "learning_rate": 9.007608606725026e-06, + "loss": 0.0059, + "step": 11010 + }, + { + "epoch": 0.18031579808557638, + "grad_norm": 0.19369298219680786, + "learning_rate": 9.01578990427882e-06, + "loss": 0.0074, + "step": 11020 + }, + { + "epoch": 0.1804794240366522, + "grad_norm": 0.06816922128200531, + "learning_rate": 9.023971201832611e-06, + "loss": 0.0088, + "step": 11030 + }, + { + "epoch": 0.18064304998772807, + "grad_norm": 0.3692319095134735, + "learning_rate": 9.032152499386402e-06, + "loss": 0.0062, + "step": 11040 + }, + { + "epoch": 0.1808066759388039, + "grad_norm": 0.39277228713035583, + "learning_rate": 9.040333796940195e-06, + "loss": 0.005, + "step": 11050 + }, + { + "epoch": 0.18097030188987973, + "grad_norm": 0.31312334537506104, + "learning_rate": 9.048515094493987e-06, + "loss": 0.0078, + "step": 11060 + }, + { + "epoch": 0.18113392784095558, + "grad_norm": 0.11233723163604736, + "learning_rate": 9.056696392047778e-06, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.1812975537920314, + "grad_norm": 0.21046818792819977, + "learning_rate": 9.064877689601571e-06, + "loss": 0.0093, + "step": 11080 + }, + { + "epoch": 0.18146117974310727, + "grad_norm": 0.22377535700798035, + "learning_rate": 9.073058987155363e-06, + "loss": 0.0075, + "step": 11090 + }, + { + "epoch": 0.1816248056941831, + "grad_norm": 0.12424563616514206, + "learning_rate": 9.081240284709156e-06, + "loss": 0.0037, + "step": 11100 + }, + { + "epoch": 0.18178843164525893, + "grad_norm": 0.41122204065322876, + "learning_rate": 9.089421582262947e-06, + "loss": 0.0068, + "step": 11110 + }, + { + "epoch": 0.1819520575963348, + "grad_norm": 0.2757875919342041, + "learning_rate": 9.097602879816739e-06, + "loss": 0.0064, + "step": 11120 + }, + { + "epoch": 0.18211568354741062, + "grad_norm": 0.20527689158916473, + "learning_rate": 9.105784177370532e-06, + "loss": 0.004, + "step": 11130 + }, + { + "epoch": 0.18227930949848645, + "grad_norm": 0.24801987409591675, + "learning_rate": 9.113965474924323e-06, + "loss": 0.0058, + "step": 11140 + }, + { + "epoch": 0.1824429354495623, + "grad_norm": 0.4577740430831909, + "learning_rate": 9.122146772478115e-06, + "loss": 0.0051, + "step": 11150 + }, + { + "epoch": 0.18260656140063813, + "grad_norm": 0.0874112993478775, + "learning_rate": 9.130328070031908e-06, + "loss": 0.0052, + "step": 11160 + }, + { + "epoch": 0.182770187351714, + "grad_norm": 0.07761070132255554, + "learning_rate": 9.1385093675857e-06, + "loss": 0.0048, + "step": 11170 + }, + { + "epoch": 0.18293381330278982, + "grad_norm": 0.17391850054264069, + "learning_rate": 9.146690665139492e-06, + "loss": 0.0054, + "step": 11180 + }, + { + "epoch": 0.18309743925386565, + "grad_norm": 0.2146953046321869, + "learning_rate": 9.154871962693284e-06, + "loss": 0.0066, + "step": 11190 + }, + { + "epoch": 0.1832610652049415, + "grad_norm": 0.11834721267223358, + "learning_rate": 9.163053260247075e-06, + "loss": 0.0032, + "step": 11200 + }, + { + "epoch": 0.18342469115601734, + "grad_norm": 0.3615259826183319, + "learning_rate": 9.171234557800868e-06, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.1835883171070932, + "grad_norm": 0.3771328330039978, + "learning_rate": 9.17941585535466e-06, + "loss": 0.0053, + "step": 11220 + }, + { + "epoch": 0.18375194305816903, + "grad_norm": 0.11126955598592758, + "learning_rate": 9.187597152908451e-06, + "loss": 0.0041, + "step": 11230 + }, + { + "epoch": 0.18391556900924486, + "grad_norm": 0.29467493295669556, + "learning_rate": 9.195778450462244e-06, + "loss": 0.0062, + "step": 11240 + }, + { + "epoch": 0.1840791949603207, + "grad_norm": 0.20356979966163635, + "learning_rate": 9.203959748016036e-06, + "loss": 0.006, + "step": 11250 + }, + { + "epoch": 0.18424282091139654, + "grad_norm": 0.12055736780166626, + "learning_rate": 9.212141045569829e-06, + "loss": 0.0055, + "step": 11260 + }, + { + "epoch": 0.1844064468624724, + "grad_norm": 0.43545442819595337, + "learning_rate": 9.22032234312362e-06, + "loss": 0.0068, + "step": 11270 + }, + { + "epoch": 0.18457007281354823, + "grad_norm": 0.13396865129470825, + "learning_rate": 9.228503640677412e-06, + "loss": 0.0052, + "step": 11280 + }, + { + "epoch": 0.18473369876462406, + "grad_norm": 0.2616546154022217, + "learning_rate": 9.236684938231205e-06, + "loss": 0.0061, + "step": 11290 + }, + { + "epoch": 0.18489732471569992, + "grad_norm": 0.14696097373962402, + "learning_rate": 9.244866235784996e-06, + "loss": 0.0057, + "step": 11300 + }, + { + "epoch": 0.18506095066677575, + "grad_norm": 0.25762873888015747, + "learning_rate": 9.253047533338787e-06, + "loss": 0.0085, + "step": 11310 + }, + { + "epoch": 0.1852245766178516, + "grad_norm": 0.1531849205493927, + "learning_rate": 9.26122883089258e-06, + "loss": 0.0044, + "step": 11320 + }, + { + "epoch": 0.18538820256892743, + "grad_norm": 0.1525973379611969, + "learning_rate": 9.269410128446372e-06, + "loss": 0.0061, + "step": 11330 + }, + { + "epoch": 0.18555182852000326, + "grad_norm": 0.16310057044029236, + "learning_rate": 9.277591426000165e-06, + "loss": 0.0076, + "step": 11340 + }, + { + "epoch": 0.18571545447107912, + "grad_norm": 0.1427851766347885, + "learning_rate": 9.285772723553957e-06, + "loss": 0.0053, + "step": 11350 + }, + { + "epoch": 0.18587908042215495, + "grad_norm": 0.17253410816192627, + "learning_rate": 9.293954021107748e-06, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.18604270637323078, + "grad_norm": 0.23499998450279236, + "learning_rate": 9.302135318661541e-06, + "loss": 0.0091, + "step": 11370 + }, + { + "epoch": 0.18620633232430664, + "grad_norm": 0.3588450253009796, + "learning_rate": 9.310316616215332e-06, + "loss": 0.0086, + "step": 11380 + }, + { + "epoch": 0.18636995827538247, + "grad_norm": 0.1670575886964798, + "learning_rate": 9.318497913769124e-06, + "loss": 0.0093, + "step": 11390 + }, + { + "epoch": 0.18653358422645833, + "grad_norm": 0.412836492061615, + "learning_rate": 9.326679211322917e-06, + "loss": 0.0076, + "step": 11400 + }, + { + "epoch": 0.18669721017753416, + "grad_norm": 0.25285816192626953, + "learning_rate": 9.334860508876708e-06, + "loss": 0.0057, + "step": 11410 + }, + { + "epoch": 0.18686083612861, + "grad_norm": 0.5399177670478821, + "learning_rate": 9.3430418064305e-06, + "loss": 0.0078, + "step": 11420 + }, + { + "epoch": 0.18702446207968584, + "grad_norm": 0.15615855157375336, + "learning_rate": 9.351223103984293e-06, + "loss": 0.0056, + "step": 11430 + }, + { + "epoch": 0.18718808803076167, + "grad_norm": 0.1417391151189804, + "learning_rate": 9.359404401538084e-06, + "loss": 0.0054, + "step": 11440 + }, + { + "epoch": 0.18735171398183753, + "grad_norm": 0.414683997631073, + "learning_rate": 9.367585699091877e-06, + "loss": 0.0097, + "step": 11450 + }, + { + "epoch": 0.18751533993291336, + "grad_norm": 0.27592965960502625, + "learning_rate": 9.375766996645669e-06, + "loss": 0.0054, + "step": 11460 + }, + { + "epoch": 0.1876789658839892, + "grad_norm": 0.08436980098485947, + "learning_rate": 9.38394829419946e-06, + "loss": 0.0062, + "step": 11470 + }, + { + "epoch": 0.18784259183506505, + "grad_norm": 0.22398467361927032, + "learning_rate": 9.392129591753253e-06, + "loss": 0.0058, + "step": 11480 + }, + { + "epoch": 0.18800621778614088, + "grad_norm": 0.14039935171604156, + "learning_rate": 9.400310889307045e-06, + "loss": 0.0051, + "step": 11490 + }, + { + "epoch": 0.18816984373721674, + "grad_norm": 0.17133980989456177, + "learning_rate": 9.408492186860836e-06, + "loss": 0.0053, + "step": 11500 + }, + { + "epoch": 0.18833346968829257, + "grad_norm": 0.17555159330368042, + "learning_rate": 9.41667348441463e-06, + "loss": 0.0048, + "step": 11510 + }, + { + "epoch": 0.1884970956393684, + "grad_norm": 0.29000338912010193, + "learning_rate": 9.42485478196842e-06, + "loss": 0.0095, + "step": 11520 + }, + { + "epoch": 0.18866072159044425, + "grad_norm": 0.1891888976097107, + "learning_rate": 9.433036079522214e-06, + "loss": 0.0075, + "step": 11530 + }, + { + "epoch": 0.18882434754152008, + "grad_norm": 0.14998741447925568, + "learning_rate": 9.441217377076005e-06, + "loss": 0.0061, + "step": 11540 + }, + { + "epoch": 0.1889879734925959, + "grad_norm": 0.49098023772239685, + "learning_rate": 9.449398674629797e-06, + "loss": 0.0068, + "step": 11550 + }, + { + "epoch": 0.18915159944367177, + "grad_norm": 0.14602211117744446, + "learning_rate": 9.45757997218359e-06, + "loss": 0.0069, + "step": 11560 + }, + { + "epoch": 0.1893152253947476, + "grad_norm": 0.36751922965049744, + "learning_rate": 9.465761269737381e-06, + "loss": 0.0084, + "step": 11570 + }, + { + "epoch": 0.18947885134582346, + "grad_norm": 0.12051521986722946, + "learning_rate": 9.473942567291173e-06, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.1896424772968993, + "grad_norm": 0.10984425246715546, + "learning_rate": 9.482123864844966e-06, + "loss": 0.0043, + "step": 11590 + }, + { + "epoch": 0.18980610324797512, + "grad_norm": 0.0915994793176651, + "learning_rate": 9.490305162398757e-06, + "loss": 0.0078, + "step": 11600 + }, + { + "epoch": 0.18996972919905097, + "grad_norm": 0.20405228435993195, + "learning_rate": 9.49848645995255e-06, + "loss": 0.0046, + "step": 11610 + }, + { + "epoch": 0.1901333551501268, + "grad_norm": 0.13243839144706726, + "learning_rate": 9.506667757506342e-06, + "loss": 0.0079, + "step": 11620 + }, + { + "epoch": 0.19029698110120266, + "grad_norm": 0.04314020648598671, + "learning_rate": 9.514849055060133e-06, + "loss": 0.0044, + "step": 11630 + }, + { + "epoch": 0.1904606070522785, + "grad_norm": 0.3143502175807953, + "learning_rate": 9.523030352613926e-06, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.19062423300335432, + "grad_norm": 0.18950845301151276, + "learning_rate": 9.531211650167718e-06, + "loss": 0.0045, + "step": 11650 + }, + { + "epoch": 0.19078785895443018, + "grad_norm": 0.27728721499443054, + "learning_rate": 9.539392947721509e-06, + "loss": 0.0052, + "step": 11660 + }, + { + "epoch": 0.190951484905506, + "grad_norm": 0.10692378878593445, + "learning_rate": 9.547574245275302e-06, + "loss": 0.0057, + "step": 11670 + }, + { + "epoch": 0.19111511085658187, + "grad_norm": 0.06206267699599266, + "learning_rate": 9.555755542829094e-06, + "loss": 0.005, + "step": 11680 + }, + { + "epoch": 0.1912787368076577, + "grad_norm": 0.3834879696369171, + "learning_rate": 9.563936840382887e-06, + "loss": 0.0082, + "step": 11690 + }, + { + "epoch": 0.19144236275873353, + "grad_norm": 0.10655863583087921, + "learning_rate": 9.572118137936678e-06, + "loss": 0.004, + "step": 11700 + }, + { + "epoch": 0.19160598870980938, + "grad_norm": 0.34981513023376465, + "learning_rate": 9.58029943549047e-06, + "loss": 0.0071, + "step": 11710 + }, + { + "epoch": 0.1917696146608852, + "grad_norm": 0.3693655729293823, + "learning_rate": 9.588480733044263e-06, + "loss": 0.0063, + "step": 11720 + }, + { + "epoch": 0.19193324061196107, + "grad_norm": 0.3498647212982178, + "learning_rate": 9.596662030598054e-06, + "loss": 0.0055, + "step": 11730 + }, + { + "epoch": 0.1920968665630369, + "grad_norm": 0.2284696251153946, + "learning_rate": 9.604843328151845e-06, + "loss": 0.0035, + "step": 11740 + }, + { + "epoch": 0.19226049251411273, + "grad_norm": 0.03513551875948906, + "learning_rate": 9.613024625705639e-06, + "loss": 0.0061, + "step": 11750 + }, + { + "epoch": 0.1924241184651886, + "grad_norm": 0.1508800983428955, + "learning_rate": 9.62120592325943e-06, + "loss": 0.0046, + "step": 11760 + }, + { + "epoch": 0.19258774441626442, + "grad_norm": 0.3604476749897003, + "learning_rate": 9.629387220813221e-06, + "loss": 0.0071, + "step": 11770 + }, + { + "epoch": 0.19275137036734025, + "grad_norm": 0.28478652238845825, + "learning_rate": 9.637568518367015e-06, + "loss": 0.0051, + "step": 11780 + }, + { + "epoch": 0.1929149963184161, + "grad_norm": 0.06905519962310791, + "learning_rate": 9.645749815920806e-06, + "loss": 0.0045, + "step": 11790 + }, + { + "epoch": 0.19307862226949193, + "grad_norm": 0.15411260724067688, + "learning_rate": 9.653931113474599e-06, + "loss": 0.0063, + "step": 11800 + }, + { + "epoch": 0.1932422482205678, + "grad_norm": 1.007348895072937, + "learning_rate": 9.66211241102839e-06, + "loss": 0.0041, + "step": 11810 + }, + { + "epoch": 0.19340587417164362, + "grad_norm": 0.29335278272628784, + "learning_rate": 9.670293708582182e-06, + "loss": 0.0073, + "step": 11820 + }, + { + "epoch": 0.19356950012271945, + "grad_norm": 0.1454388052225113, + "learning_rate": 9.678475006135975e-06, + "loss": 0.0054, + "step": 11830 + }, + { + "epoch": 0.1937331260737953, + "grad_norm": 1.1332452297210693, + "learning_rate": 9.686656303689766e-06, + "loss": 0.0045, + "step": 11840 + }, + { + "epoch": 0.19389675202487114, + "grad_norm": 0.3866525888442993, + "learning_rate": 9.694837601243558e-06, + "loss": 0.0047, + "step": 11850 + }, + { + "epoch": 0.194060377975947, + "grad_norm": 0.22672446072101593, + "learning_rate": 9.703018898797351e-06, + "loss": 0.0069, + "step": 11860 + }, + { + "epoch": 0.19422400392702283, + "grad_norm": 0.22819767892360687, + "learning_rate": 9.711200196351142e-06, + "loss": 0.0078, + "step": 11870 + }, + { + "epoch": 0.19438762987809866, + "grad_norm": 0.24415896832942963, + "learning_rate": 9.719381493904934e-06, + "loss": 0.0073, + "step": 11880 + }, + { + "epoch": 0.1945512558291745, + "grad_norm": 0.20112735033035278, + "learning_rate": 9.727562791458725e-06, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.19471488178025034, + "grad_norm": 0.4894067049026489, + "learning_rate": 9.735744089012518e-06, + "loss": 0.0076, + "step": 11900 + }, + { + "epoch": 0.1948785077313262, + "grad_norm": 0.12639126181602478, + "learning_rate": 9.74392538656631e-06, + "loss": 0.0038, + "step": 11910 + }, + { + "epoch": 0.19504213368240203, + "grad_norm": 0.08692676573991776, + "learning_rate": 9.752106684120101e-06, + "loss": 0.0057, + "step": 11920 + }, + { + "epoch": 0.19520575963347786, + "grad_norm": 0.07425641268491745, + "learning_rate": 9.760287981673894e-06, + "loss": 0.004, + "step": 11930 + }, + { + "epoch": 0.19536938558455372, + "grad_norm": 0.19665846228599548, + "learning_rate": 9.768469279227686e-06, + "loss": 0.0057, + "step": 11940 + }, + { + "epoch": 0.19553301153562955, + "grad_norm": 0.29606568813323975, + "learning_rate": 9.776650576781477e-06, + "loss": 0.0061, + "step": 11950 + }, + { + "epoch": 0.1956966374867054, + "grad_norm": 0.26980194449424744, + "learning_rate": 9.78483187433527e-06, + "loss": 0.0041, + "step": 11960 + }, + { + "epoch": 0.19586026343778123, + "grad_norm": 0.3186739385128021, + "learning_rate": 9.793013171889062e-06, + "loss": 0.0063, + "step": 11970 + }, + { + "epoch": 0.19602388938885706, + "grad_norm": 0.24271419644355774, + "learning_rate": 9.801194469442855e-06, + "loss": 0.0054, + "step": 11980 + }, + { + "epoch": 0.19618751533993292, + "grad_norm": 0.2628874182701111, + "learning_rate": 9.809375766996646e-06, + "loss": 0.0099, + "step": 11990 + }, + { + "epoch": 0.19635114129100875, + "grad_norm": 0.1418476551771164, + "learning_rate": 9.817557064550438e-06, + "loss": 0.0062, + "step": 12000 + }, + { + "epoch": 0.19651476724208458, + "grad_norm": 0.06236148253083229, + "learning_rate": 9.82573836210423e-06, + "loss": 0.0073, + "step": 12010 + }, + { + "epoch": 0.19667839319316044, + "grad_norm": 0.23187607526779175, + "learning_rate": 9.833919659658022e-06, + "loss": 0.0072, + "step": 12020 + }, + { + "epoch": 0.19684201914423627, + "grad_norm": 0.2311960905790329, + "learning_rate": 9.842100957211813e-06, + "loss": 0.007, + "step": 12030 + }, + { + "epoch": 0.19700564509531213, + "grad_norm": 0.11824999749660492, + "learning_rate": 9.850282254765607e-06, + "loss": 0.0069, + "step": 12040 + }, + { + "epoch": 0.19716927104638796, + "grad_norm": 0.16228464245796204, + "learning_rate": 9.858463552319398e-06, + "loss": 0.0067, + "step": 12050 + }, + { + "epoch": 0.19733289699746379, + "grad_norm": 0.23480363190174103, + "learning_rate": 9.866644849873191e-06, + "loss": 0.0101, + "step": 12060 + }, + { + "epoch": 0.19749652294853964, + "grad_norm": 0.1719197928905487, + "learning_rate": 9.874826147426983e-06, + "loss": 0.007, + "step": 12070 + }, + { + "epoch": 0.19766014889961547, + "grad_norm": 0.12306105345487595, + "learning_rate": 9.883007444980774e-06, + "loss": 0.0065, + "step": 12080 + }, + { + "epoch": 0.19782377485069133, + "grad_norm": 0.37918901443481445, + "learning_rate": 9.891188742534567e-06, + "loss": 0.0064, + "step": 12090 + }, + { + "epoch": 0.19798740080176716, + "grad_norm": 0.3092987835407257, + "learning_rate": 9.899370040088358e-06, + "loss": 0.0077, + "step": 12100 + }, + { + "epoch": 0.198151026752843, + "grad_norm": 0.14266997575759888, + "learning_rate": 9.90755133764215e-06, + "loss": 0.0053, + "step": 12110 + }, + { + "epoch": 0.19831465270391885, + "grad_norm": 0.24467043578624725, + "learning_rate": 9.915732635195943e-06, + "loss": 0.0052, + "step": 12120 + }, + { + "epoch": 0.19847827865499468, + "grad_norm": 0.01567120850086212, + "learning_rate": 9.923913932749734e-06, + "loss": 0.0041, + "step": 12130 + }, + { + "epoch": 0.19864190460607054, + "grad_norm": 0.26190006732940674, + "learning_rate": 9.932095230303527e-06, + "loss": 0.0062, + "step": 12140 + }, + { + "epoch": 0.19880553055714636, + "grad_norm": 0.5142116546630859, + "learning_rate": 9.940276527857319e-06, + "loss": 0.0076, + "step": 12150 + }, + { + "epoch": 0.1989691565082222, + "grad_norm": 0.1658213585615158, + "learning_rate": 9.94845782541111e-06, + "loss": 0.0044, + "step": 12160 + }, + { + "epoch": 0.19913278245929805, + "grad_norm": 0.17540432512760162, + "learning_rate": 9.956639122964903e-06, + "loss": 0.0099, + "step": 12170 + }, + { + "epoch": 0.19929640841037388, + "grad_norm": 0.30687403678894043, + "learning_rate": 9.964820420518695e-06, + "loss": 0.0079, + "step": 12180 + }, + { + "epoch": 0.19946003436144974, + "grad_norm": 0.14110256731510162, + "learning_rate": 9.973001718072486e-06, + "loss": 0.005, + "step": 12190 + }, + { + "epoch": 0.19962366031252557, + "grad_norm": 0.33628714084625244, + "learning_rate": 9.98118301562628e-06, + "loss": 0.0058, + "step": 12200 + }, + { + "epoch": 0.1997872862636014, + "grad_norm": 0.0796908438205719, + "learning_rate": 9.98936431318007e-06, + "loss": 0.0037, + "step": 12210 + }, + { + "epoch": 0.19995091221467726, + "grad_norm": 0.5572142004966736, + "learning_rate": 9.997545610733864e-06, + "loss": 0.0068, + "step": 12220 + }, + { + "epoch": 0.2001145381657531, + "grad_norm": 0.3793887197971344, + "learning_rate": 9.999999900093168e-06, + "loss": 0.0065, + "step": 12230 + }, + { + "epoch": 0.20027816411682892, + "grad_norm": 0.19541969895362854, + "learning_rate": 9.99999941075359e-06, + "loss": 0.0071, + "step": 12240 + }, + { + "epoch": 0.20044179006790477, + "grad_norm": 0.24930810928344727, + "learning_rate": 9.999998513631071e-06, + "loss": 0.0067, + "step": 12250 + }, + { + "epoch": 0.2006054160189806, + "grad_norm": 0.26440510153770447, + "learning_rate": 9.999997208725685e-06, + "loss": 0.0038, + "step": 12260 + }, + { + "epoch": 0.20076904197005646, + "grad_norm": 0.12102105468511581, + "learning_rate": 9.999995496037538e-06, + "loss": 0.0036, + "step": 12270 + }, + { + "epoch": 0.2009326679211323, + "grad_norm": 0.276606947183609, + "learning_rate": 9.99999337556677e-06, + "loss": 0.0052, + "step": 12280 + }, + { + "epoch": 0.20109629387220812, + "grad_norm": 0.2862796187400818, + "learning_rate": 9.999990847313552e-06, + "loss": 0.0082, + "step": 12290 + }, + { + "epoch": 0.20125991982328398, + "grad_norm": 0.16471347212791443, + "learning_rate": 9.999987911278094e-06, + "loss": 0.0056, + "step": 12300 + }, + { + "epoch": 0.2014235457743598, + "grad_norm": 0.30309781432151794, + "learning_rate": 9.999984567460635e-06, + "loss": 0.0062, + "step": 12310 + }, + { + "epoch": 0.20158717172543567, + "grad_norm": 0.12292275577783585, + "learning_rate": 9.999980815861443e-06, + "loss": 0.0054, + "step": 12320 + }, + { + "epoch": 0.2017507976765115, + "grad_norm": 0.29708683490753174, + "learning_rate": 9.999976656480828e-06, + "loss": 0.0073, + "step": 12330 + }, + { + "epoch": 0.20191442362758732, + "grad_norm": 0.17786575853824615, + "learning_rate": 9.999972089319127e-06, + "loss": 0.0049, + "step": 12340 + }, + { + "epoch": 0.20207804957866318, + "grad_norm": 0.3271494209766388, + "learning_rate": 9.999967114376716e-06, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.202241675529739, + "grad_norm": 0.18429258465766907, + "learning_rate": 9.999961731653998e-06, + "loss": 0.0049, + "step": 12360 + }, + { + "epoch": 0.20240530148081487, + "grad_norm": 0.09776465594768524, + "learning_rate": 9.99995594115141e-06, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.2025689274318907, + "grad_norm": 0.2743496596813202, + "learning_rate": 9.999949742869431e-06, + "loss": 0.0062, + "step": 12380 + }, + { + "epoch": 0.20273255338296653, + "grad_norm": 0.13429966568946838, + "learning_rate": 9.99994313680856e-06, + "loss": 0.0117, + "step": 12390 + }, + { + "epoch": 0.2028961793340424, + "grad_norm": 0.2560930550098419, + "learning_rate": 9.999936122969339e-06, + "loss": 0.0036, + "step": 12400 + }, + { + "epoch": 0.20305980528511822, + "grad_norm": 0.18724317848682404, + "learning_rate": 9.999928701352337e-06, + "loss": 0.0059, + "step": 12410 + }, + { + "epoch": 0.20322343123619407, + "grad_norm": 0.41032344102859497, + "learning_rate": 9.999920871958163e-06, + "loss": 0.006, + "step": 12420 + }, + { + "epoch": 0.2033870571872699, + "grad_norm": 0.324008584022522, + "learning_rate": 9.999912634787453e-06, + "loss": 0.0058, + "step": 12430 + }, + { + "epoch": 0.20355068313834573, + "grad_norm": 0.21129874885082245, + "learning_rate": 9.99990398984088e-06, + "loss": 0.0065, + "step": 12440 + }, + { + "epoch": 0.2037143090894216, + "grad_norm": 0.21353356540203094, + "learning_rate": 9.999894937119149e-06, + "loss": 0.0044, + "step": 12450 + }, + { + "epoch": 0.20387793504049742, + "grad_norm": 0.16450877487659454, + "learning_rate": 9.999885476622998e-06, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.20404156099157325, + "grad_norm": 0.39673373103141785, + "learning_rate": 9.999875608353199e-06, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.2042051869426491, + "grad_norm": 0.260753870010376, + "learning_rate": 9.999865332310556e-06, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.20436881289372494, + "grad_norm": 0.2538551688194275, + "learning_rate": 9.999854648495907e-06, + "loss": 0.0048, + "step": 12490 + }, + { + "epoch": 0.2045324388448008, + "grad_norm": 0.12521472573280334, + "learning_rate": 9.999843556910125e-06, + "loss": 0.0076, + "step": 12500 + }, + { + "epoch": 0.20469606479587663, + "grad_norm": 4.710920810699463, + "learning_rate": 9.999832057554111e-06, + "loss": 0.0067, + "step": 12510 + }, + { + "epoch": 0.20485969074695246, + "grad_norm": 1.7755043506622314, + "learning_rate": 9.999820150428808e-06, + "loss": 0.0155, + "step": 12520 + }, + { + "epoch": 0.2050233166980283, + "grad_norm": 0.1261700689792633, + "learning_rate": 9.999807835535184e-06, + "loss": 0.0085, + "step": 12530 + }, + { + "epoch": 0.20518694264910414, + "grad_norm": 0.26906171441078186, + "learning_rate": 9.999795112874242e-06, + "loss": 0.0059, + "step": 12540 + }, + { + "epoch": 0.20535056860018, + "grad_norm": 0.13919776678085327, + "learning_rate": 9.999781982447024e-06, + "loss": 0.0038, + "step": 12550 + }, + { + "epoch": 0.20551419455125583, + "grad_norm": 0.5989676117897034, + "learning_rate": 9.999768444254596e-06, + "loss": 0.0062, + "step": 12560 + }, + { + "epoch": 0.20567782050233166, + "grad_norm": 0.268352210521698, + "learning_rate": 9.999754498298064e-06, + "loss": 0.0051, + "step": 12570 + }, + { + "epoch": 0.20584144645340752, + "grad_norm": 0.2456214874982834, + "learning_rate": 9.999740144578568e-06, + "loss": 0.0087, + "step": 12580 + }, + { + "epoch": 0.20600507240448335, + "grad_norm": 0.31719282269477844, + "learning_rate": 9.999725383097275e-06, + "loss": 0.0061, + "step": 12590 + }, + { + "epoch": 0.2061686983555592, + "grad_norm": 0.28815433382987976, + "learning_rate": 9.99971021385539e-06, + "loss": 0.0065, + "step": 12600 + }, + { + "epoch": 0.20633232430663503, + "grad_norm": 0.14909744262695312, + "learning_rate": 9.999694636854151e-06, + "loss": 0.0055, + "step": 12610 + }, + { + "epoch": 0.20649595025771086, + "grad_norm": 0.03553149849176407, + "learning_rate": 9.999678652094828e-06, + "loss": 0.0056, + "step": 12620 + }, + { + "epoch": 0.20665957620878672, + "grad_norm": 0.030735662207007408, + "learning_rate": 9.999662259578725e-06, + "loss": 0.0066, + "step": 12630 + }, + { + "epoch": 0.20682320215986255, + "grad_norm": 0.28481221199035645, + "learning_rate": 9.999645459307176e-06, + "loss": 0.0063, + "step": 12640 + }, + { + "epoch": 0.20698682811093838, + "grad_norm": 0.3420799970626831, + "learning_rate": 9.999628251281556e-06, + "loss": 0.0074, + "step": 12650 + }, + { + "epoch": 0.20715045406201424, + "grad_norm": 0.10076165944337845, + "learning_rate": 9.999610635503266e-06, + "loss": 0.0052, + "step": 12660 + }, + { + "epoch": 0.20731408001309007, + "grad_norm": 0.10481889545917511, + "learning_rate": 9.999592611973743e-06, + "loss": 0.0072, + "step": 12670 + }, + { + "epoch": 0.20747770596416593, + "grad_norm": 0.37956297397613525, + "learning_rate": 9.999574180694456e-06, + "loss": 0.0082, + "step": 12680 + }, + { + "epoch": 0.20764133191524176, + "grad_norm": 0.09679333120584488, + "learning_rate": 9.999555341666908e-06, + "loss": 0.0043, + "step": 12690 + }, + { + "epoch": 0.20780495786631759, + "grad_norm": 0.3721299469470978, + "learning_rate": 9.999536094892637e-06, + "loss": 0.0058, + "step": 12700 + }, + { + "epoch": 0.20796858381739344, + "grad_norm": 0.1958468109369278, + "learning_rate": 9.999516440373212e-06, + "loss": 0.0058, + "step": 12710 + }, + { + "epoch": 0.20813220976846927, + "grad_norm": 0.10566503554582596, + "learning_rate": 9.999496378110236e-06, + "loss": 0.0036, + "step": 12720 + }, + { + "epoch": 0.20829583571954513, + "grad_norm": 0.10382892191410065, + "learning_rate": 9.999475908105345e-06, + "loss": 0.0042, + "step": 12730 + }, + { + "epoch": 0.20845946167062096, + "grad_norm": 0.25233131647109985, + "learning_rate": 9.999455030360207e-06, + "loss": 0.0086, + "step": 12740 + }, + { + "epoch": 0.2086230876216968, + "grad_norm": 0.13550812005996704, + "learning_rate": 9.999433744876528e-06, + "loss": 0.0058, + "step": 12750 + }, + { + "epoch": 0.20878671357277265, + "grad_norm": 0.10901674628257751, + "learning_rate": 9.999412051656044e-06, + "loss": 0.008, + "step": 12760 + }, + { + "epoch": 0.20895033952384848, + "grad_norm": 0.22533440589904785, + "learning_rate": 9.99938995070052e-06, + "loss": 0.0083, + "step": 12770 + }, + { + "epoch": 0.20911396547492433, + "grad_norm": 0.07614787667989731, + "learning_rate": 9.999367442011763e-06, + "loss": 0.0047, + "step": 12780 + }, + { + "epoch": 0.20927759142600016, + "grad_norm": 0.23862454295158386, + "learning_rate": 9.999344525591604e-06, + "loss": 0.0073, + "step": 12790 + }, + { + "epoch": 0.209441217377076, + "grad_norm": 0.0961994156241417, + "learning_rate": 9.999321201441916e-06, + "loss": 0.0085, + "step": 12800 + }, + { + "epoch": 0.20960484332815185, + "grad_norm": 0.09173143655061722, + "learning_rate": 9.999297469564601e-06, + "loss": 0.0039, + "step": 12810 + }, + { + "epoch": 0.20976846927922768, + "grad_norm": 0.2273540049791336, + "learning_rate": 9.999273329961594e-06, + "loss": 0.0055, + "step": 12820 + }, + { + "epoch": 0.20993209523030354, + "grad_norm": 0.12911464273929596, + "learning_rate": 9.99924878263486e-06, + "loss": 0.0061, + "step": 12830 + }, + { + "epoch": 0.21009572118137937, + "grad_norm": 0.03453594818711281, + "learning_rate": 9.999223827586406e-06, + "loss": 0.0055, + "step": 12840 + }, + { + "epoch": 0.2102593471324552, + "grad_norm": 0.31178250908851624, + "learning_rate": 9.999198464818268e-06, + "loss": 0.0053, + "step": 12850 + }, + { + "epoch": 0.21042297308353106, + "grad_norm": 0.09923781454563141, + "learning_rate": 9.999172694332508e-06, + "loss": 0.0051, + "step": 12860 + }, + { + "epoch": 0.21058659903460689, + "grad_norm": 0.14487408101558685, + "learning_rate": 9.999146516131234e-06, + "loss": 0.0033, + "step": 12870 + }, + { + "epoch": 0.21075022498568272, + "grad_norm": 0.15765681862831116, + "learning_rate": 9.999119930216576e-06, + "loss": 0.0044, + "step": 12880 + }, + { + "epoch": 0.21091385093675857, + "grad_norm": 0.33947205543518066, + "learning_rate": 9.999092936590708e-06, + "loss": 0.0039, + "step": 12890 + }, + { + "epoch": 0.2110774768878344, + "grad_norm": 0.2355516254901886, + "learning_rate": 9.999065535255828e-06, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.21124110283891026, + "grad_norm": 0.17342792451381683, + "learning_rate": 9.99903772621417e-06, + "loss": 0.0065, + "step": 12910 + }, + { + "epoch": 0.2114047287899861, + "grad_norm": 0.22083014249801636, + "learning_rate": 9.999009509468003e-06, + "loss": 0.0057, + "step": 12920 + }, + { + "epoch": 0.21156835474106192, + "grad_norm": 0.24771888554096222, + "learning_rate": 9.99898088501963e-06, + "loss": 0.0035, + "step": 12930 + }, + { + "epoch": 0.21173198069213778, + "grad_norm": 0.2606211304664612, + "learning_rate": 9.998951852871384e-06, + "loss": 0.0048, + "step": 12940 + }, + { + "epoch": 0.2118956066432136, + "grad_norm": 0.20061686635017395, + "learning_rate": 9.998922413025632e-06, + "loss": 0.0036, + "step": 12950 + }, + { + "epoch": 0.21205923259428947, + "grad_norm": 0.24848076701164246, + "learning_rate": 9.998892565484776e-06, + "loss": 0.0055, + "step": 12960 + }, + { + "epoch": 0.2122228585453653, + "grad_norm": 0.20416420698165894, + "learning_rate": 9.99886231025125e-06, + "loss": 0.0053, + "step": 12970 + }, + { + "epoch": 0.21238648449644112, + "grad_norm": 0.16592179238796234, + "learning_rate": 9.998831647327521e-06, + "loss": 0.0067, + "step": 12980 + }, + { + "epoch": 0.21255011044751698, + "grad_norm": 0.2822223901748657, + "learning_rate": 9.998800576716092e-06, + "loss": 0.0071, + "step": 12990 + }, + { + "epoch": 0.2127137363985928, + "grad_norm": 0.17762772738933563, + "learning_rate": 9.998769098419494e-06, + "loss": 0.004, + "step": 13000 + }, + { + "epoch": 0.21287736234966867, + "grad_norm": 0.2055288404226303, + "learning_rate": 9.998737212440296e-06, + "loss": 0.0026, + "step": 13010 + }, + { + "epoch": 0.2130409883007445, + "grad_norm": 0.11760883033275604, + "learning_rate": 9.998704918781097e-06, + "loss": 0.0038, + "step": 13020 + }, + { + "epoch": 0.21320461425182033, + "grad_norm": 0.24624723196029663, + "learning_rate": 9.998672217444533e-06, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.2133682402028962, + "grad_norm": 0.280038446187973, + "learning_rate": 9.99863910843327e-06, + "loss": 0.0072, + "step": 13040 + }, + { + "epoch": 0.21353186615397202, + "grad_norm": 0.23071032762527466, + "learning_rate": 9.998605591750009e-06, + "loss": 0.0061, + "step": 13050 + }, + { + "epoch": 0.21369549210504787, + "grad_norm": 0.11674752086400986, + "learning_rate": 9.998571667397481e-06, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.2138591180561237, + "grad_norm": 0.627616286277771, + "learning_rate": 9.998537335378456e-06, + "loss": 0.0052, + "step": 13070 + }, + { + "epoch": 0.21402274400719953, + "grad_norm": 0.4384259879589081, + "learning_rate": 9.998502595695732e-06, + "loss": 0.0065, + "step": 13080 + }, + { + "epoch": 0.2141863699582754, + "grad_norm": 0.15268398821353912, + "learning_rate": 9.998467448352141e-06, + "loss": 0.0042, + "step": 13090 + }, + { + "epoch": 0.21434999590935122, + "grad_norm": 0.1323525309562683, + "learning_rate": 9.998431893350552e-06, + "loss": 0.0061, + "step": 13100 + }, + { + "epoch": 0.21451362186042705, + "grad_norm": 0.1803782433271408, + "learning_rate": 9.998395930693865e-06, + "loss": 0.0047, + "step": 13110 + }, + { + "epoch": 0.2146772478115029, + "grad_norm": 0.1619434803724289, + "learning_rate": 9.998359560385011e-06, + "loss": 0.0042, + "step": 13120 + }, + { + "epoch": 0.21484087376257874, + "grad_norm": 0.11872068792581558, + "learning_rate": 9.998322782426957e-06, + "loss": 0.0054, + "step": 13130 + }, + { + "epoch": 0.2150044997136546, + "grad_norm": 0.14572450518608093, + "learning_rate": 9.998285596822704e-06, + "loss": 0.0059, + "step": 13140 + }, + { + "epoch": 0.21516812566473043, + "grad_norm": 0.10185088962316513, + "learning_rate": 9.998248003575282e-06, + "loss": 0.0063, + "step": 13150 + }, + { + "epoch": 0.21533175161580626, + "grad_norm": 0.06390407681465149, + "learning_rate": 9.998210002687758e-06, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.2154953775668821, + "grad_norm": 0.29848599433898926, + "learning_rate": 9.998171594163232e-06, + "loss": 0.0065, + "step": 13170 + }, + { + "epoch": 0.21565900351795794, + "grad_norm": 0.2984239161014557, + "learning_rate": 9.998132778004837e-06, + "loss": 0.0076, + "step": 13180 + }, + { + "epoch": 0.2158226294690338, + "grad_norm": 0.17627207934856415, + "learning_rate": 9.998093554215735e-06, + "loss": 0.0084, + "step": 13190 + }, + { + "epoch": 0.21598625542010963, + "grad_norm": 0.18145498633384705, + "learning_rate": 9.998053922799131e-06, + "loss": 0.0062, + "step": 13200 + }, + { + "epoch": 0.21614988137118546, + "grad_norm": 0.23339824378490448, + "learning_rate": 9.998013883758252e-06, + "loss": 0.0056, + "step": 13210 + }, + { + "epoch": 0.21631350732226132, + "grad_norm": 0.3056366443634033, + "learning_rate": 9.997973437096366e-06, + "loss": 0.0046, + "step": 13220 + }, + { + "epoch": 0.21647713327333715, + "grad_norm": 0.10920435190200806, + "learning_rate": 9.997932582816771e-06, + "loss": 0.0056, + "step": 13230 + }, + { + "epoch": 0.216640759224413, + "grad_norm": 0.044270556420087814, + "learning_rate": 9.9978913209228e-06, + "loss": 0.005, + "step": 13240 + }, + { + "epoch": 0.21680438517548883, + "grad_norm": 0.11681827902793884, + "learning_rate": 9.997849651417815e-06, + "loss": 0.006, + "step": 13250 + }, + { + "epoch": 0.21696801112656466, + "grad_norm": 0.22933712601661682, + "learning_rate": 9.997807574305218e-06, + "loss": 0.0065, + "step": 13260 + }, + { + "epoch": 0.21713163707764052, + "grad_norm": 0.24406129121780396, + "learning_rate": 9.997765089588439e-06, + "loss": 0.0057, + "step": 13270 + }, + { + "epoch": 0.21729526302871635, + "grad_norm": 0.20547881722450256, + "learning_rate": 9.997722197270942e-06, + "loss": 0.0037, + "step": 13280 + }, + { + "epoch": 0.2174588889797922, + "grad_norm": 0.14097577333450317, + "learning_rate": 9.997678897356227e-06, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.21762251493086804, + "grad_norm": 0.27782827615737915, + "learning_rate": 9.997635189847827e-06, + "loss": 0.0048, + "step": 13300 + }, + { + "epoch": 0.21778614088194387, + "grad_norm": 0.09930739551782608, + "learning_rate": 9.997591074749302e-06, + "loss": 0.0061, + "step": 13310 + }, + { + "epoch": 0.21794976683301973, + "grad_norm": 0.17293070256710052, + "learning_rate": 9.997546552064252e-06, + "loss": 0.0044, + "step": 13320 + }, + { + "epoch": 0.21811339278409556, + "grad_norm": 0.2866891622543335, + "learning_rate": 9.997501621796309e-06, + "loss": 0.0076, + "step": 13330 + }, + { + "epoch": 0.21827701873517139, + "grad_norm": 0.2108461856842041, + "learning_rate": 9.997456283949135e-06, + "loss": 0.005, + "step": 13340 + }, + { + "epoch": 0.21844064468624724, + "grad_norm": 0.17102141678333282, + "learning_rate": 9.997410538526434e-06, + "loss": 0.0074, + "step": 13350 + }, + { + "epoch": 0.21860427063732307, + "grad_norm": 0.17764686048030853, + "learning_rate": 9.997364385531928e-06, + "loss": 0.0098, + "step": 13360 + }, + { + "epoch": 0.21876789658839893, + "grad_norm": 0.25975266098976135, + "learning_rate": 9.997317824969385e-06, + "loss": 0.0077, + "step": 13370 + }, + { + "epoch": 0.21893152253947476, + "grad_norm": 0.10543973743915558, + "learning_rate": 9.997270856842605e-06, + "loss": 0.0043, + "step": 13380 + }, + { + "epoch": 0.2190951484905506, + "grad_norm": 0.07928754389286041, + "learning_rate": 9.997223481155417e-06, + "loss": 0.0054, + "step": 13390 + }, + { + "epoch": 0.21925877444162645, + "grad_norm": 0.25949251651763916, + "learning_rate": 9.99717569791168e-06, + "loss": 0.0053, + "step": 13400 + }, + { + "epoch": 0.21942240039270228, + "grad_norm": 0.21976803243160248, + "learning_rate": 9.997127507115297e-06, + "loss": 0.0077, + "step": 13410 + }, + { + "epoch": 0.21958602634377813, + "grad_norm": 0.10809529572725296, + "learning_rate": 9.997078908770197e-06, + "loss": 0.0046, + "step": 13420 + }, + { + "epoch": 0.21974965229485396, + "grad_norm": 0.3533380329608917, + "learning_rate": 9.99702990288034e-06, + "loss": 0.009, + "step": 13430 + }, + { + "epoch": 0.2199132782459298, + "grad_norm": 0.07063060253858566, + "learning_rate": 9.996980489449728e-06, + "loss": 0.0083, + "step": 13440 + }, + { + "epoch": 0.22007690419700565, + "grad_norm": 0.14417965710163116, + "learning_rate": 9.996930668482388e-06, + "loss": 0.0048, + "step": 13450 + }, + { + "epoch": 0.22024053014808148, + "grad_norm": 0.13901092112064362, + "learning_rate": 9.996880439982382e-06, + "loss": 0.0033, + "step": 13460 + }, + { + "epoch": 0.22040415609915734, + "grad_norm": 0.21198055148124695, + "learning_rate": 9.99682980395381e-06, + "loss": 0.0064, + "step": 13470 + }, + { + "epoch": 0.22056778205023317, + "grad_norm": 0.2886105179786682, + "learning_rate": 9.9967787604008e-06, + "loss": 0.0042, + "step": 13480 + }, + { + "epoch": 0.220731408001309, + "grad_norm": 0.09032531082630157, + "learning_rate": 9.996727309327514e-06, + "loss": 0.0055, + "step": 13490 + }, + { + "epoch": 0.22089503395238486, + "grad_norm": 0.23418252170085907, + "learning_rate": 9.99667545073815e-06, + "loss": 0.0079, + "step": 13500 + }, + { + "epoch": 0.22105865990346069, + "grad_norm": 0.05176910385489464, + "learning_rate": 9.996623184636936e-06, + "loss": 0.004, + "step": 13510 + }, + { + "epoch": 0.22122228585453654, + "grad_norm": 0.15251822769641876, + "learning_rate": 9.996570511028135e-06, + "loss": 0.0072, + "step": 13520 + }, + { + "epoch": 0.22138591180561237, + "grad_norm": 0.34697702527046204, + "learning_rate": 9.996517429916041e-06, + "loss": 0.0071, + "step": 13530 + }, + { + "epoch": 0.2215495377566882, + "grad_norm": 0.2799191176891327, + "learning_rate": 9.996463941304987e-06, + "loss": 0.0054, + "step": 13540 + }, + { + "epoch": 0.22171316370776406, + "grad_norm": 0.8140790462493896, + "learning_rate": 9.996410045199334e-06, + "loss": 0.0039, + "step": 13550 + }, + { + "epoch": 0.2218767896588399, + "grad_norm": 0.16691988706588745, + "learning_rate": 9.996355741603475e-06, + "loss": 0.0058, + "step": 13560 + }, + { + "epoch": 0.22204041560991572, + "grad_norm": 0.26519906520843506, + "learning_rate": 9.996301030521842e-06, + "loss": 0.0093, + "step": 13570 + }, + { + "epoch": 0.22220404156099158, + "grad_norm": 0.0860215499997139, + "learning_rate": 9.996245911958896e-06, + "loss": 0.0053, + "step": 13580 + }, + { + "epoch": 0.2223676675120674, + "grad_norm": 0.14534030854701996, + "learning_rate": 9.996190385919131e-06, + "loss": 0.005, + "step": 13590 + }, + { + "epoch": 0.22253129346314326, + "grad_norm": 0.2794528603553772, + "learning_rate": 9.996134452407077e-06, + "loss": 0.0065, + "step": 13600 + }, + { + "epoch": 0.2226949194142191, + "grad_norm": 0.27352064847946167, + "learning_rate": 9.996078111427297e-06, + "loss": 0.0067, + "step": 13610 + }, + { + "epoch": 0.22285854536529492, + "grad_norm": 0.12785963714122772, + "learning_rate": 9.996021362984383e-06, + "loss": 0.007, + "step": 13620 + }, + { + "epoch": 0.22302217131637078, + "grad_norm": 0.11861082911491394, + "learning_rate": 9.995964207082964e-06, + "loss": 0.0064, + "step": 13630 + }, + { + "epoch": 0.2231857972674466, + "grad_norm": 0.27641215920448303, + "learning_rate": 9.995906643727703e-06, + "loss": 0.0042, + "step": 13640 + }, + { + "epoch": 0.22334942321852247, + "grad_norm": 0.14083783328533173, + "learning_rate": 9.995848672923293e-06, + "loss": 0.0036, + "step": 13650 + }, + { + "epoch": 0.2235130491695983, + "grad_norm": 0.19127905368804932, + "learning_rate": 9.995790294674464e-06, + "loss": 0.0044, + "step": 13660 + }, + { + "epoch": 0.22367667512067413, + "grad_norm": 0.22909848392009735, + "learning_rate": 9.995731508985974e-06, + "loss": 0.0047, + "step": 13670 + }, + { + "epoch": 0.22384030107175, + "grad_norm": 0.12029135227203369, + "learning_rate": 9.99567231586262e-06, + "loss": 0.0067, + "step": 13680 + }, + { + "epoch": 0.22400392702282582, + "grad_norm": 0.2586616277694702, + "learning_rate": 9.995612715309228e-06, + "loss": 0.0043, + "step": 13690 + }, + { + "epoch": 0.22416755297390167, + "grad_norm": 0.21871019899845123, + "learning_rate": 9.99555270733066e-06, + "loss": 0.0061, + "step": 13700 + }, + { + "epoch": 0.2243311789249775, + "grad_norm": 0.21606074273586273, + "learning_rate": 9.99549229193181e-06, + "loss": 0.0034, + "step": 13710 + }, + { + "epoch": 0.22449480487605333, + "grad_norm": 0.3272698223590851, + "learning_rate": 9.995431469117604e-06, + "loss": 0.0045, + "step": 13720 + }, + { + "epoch": 0.2246584308271292, + "grad_norm": 0.15977269411087036, + "learning_rate": 9.995370238893002e-06, + "loss": 0.0077, + "step": 13730 + }, + { + "epoch": 0.22482205677820502, + "grad_norm": 0.1499793827533722, + "learning_rate": 9.995308601263001e-06, + "loss": 0.0053, + "step": 13740 + }, + { + "epoch": 0.22498568272928085, + "grad_norm": 0.13406315445899963, + "learning_rate": 9.995246556232626e-06, + "loss": 0.0048, + "step": 13750 + }, + { + "epoch": 0.2251493086803567, + "grad_norm": 0.13505113124847412, + "learning_rate": 9.995184103806937e-06, + "loss": 0.0046, + "step": 13760 + }, + { + "epoch": 0.22531293463143254, + "grad_norm": 0.22426843643188477, + "learning_rate": 9.995121243991028e-06, + "loss": 0.0102, + "step": 13770 + }, + { + "epoch": 0.2254765605825084, + "grad_norm": 0.16202637553215027, + "learning_rate": 9.995057976790024e-06, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.22564018653358422, + "grad_norm": 0.23404191434383392, + "learning_rate": 9.994994302209087e-06, + "loss": 0.0069, + "step": 13790 + }, + { + "epoch": 0.22580381248466005, + "grad_norm": 0.1363525688648224, + "learning_rate": 9.994930220253407e-06, + "loss": 0.0054, + "step": 13800 + }, + { + "epoch": 0.2259674384357359, + "grad_norm": 0.1189674362540245, + "learning_rate": 9.994865730928214e-06, + "loss": 0.0049, + "step": 13810 + }, + { + "epoch": 0.22613106438681174, + "grad_norm": 0.18299731612205505, + "learning_rate": 9.994800834238767e-06, + "loss": 0.0058, + "step": 13820 + }, + { + "epoch": 0.2262946903378876, + "grad_norm": 0.09564902633428574, + "learning_rate": 9.994735530190356e-06, + "loss": 0.0052, + "step": 13830 + }, + { + "epoch": 0.22645831628896343, + "grad_norm": 0.11441633105278015, + "learning_rate": 9.994669818788311e-06, + "loss": 0.0075, + "step": 13840 + }, + { + "epoch": 0.22662194224003926, + "grad_norm": 0.17015090584754944, + "learning_rate": 9.994603700037988e-06, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.22678556819111512, + "grad_norm": 0.3901327848434448, + "learning_rate": 9.994537173944779e-06, + "loss": 0.0059, + "step": 13860 + }, + { + "epoch": 0.22694919414219095, + "grad_norm": 0.26342910528182983, + "learning_rate": 9.994470240514111e-06, + "loss": 0.0065, + "step": 13870 + }, + { + "epoch": 0.2271128200932668, + "grad_norm": 0.13855819404125214, + "learning_rate": 9.994402899751445e-06, + "loss": 0.0058, + "step": 13880 + }, + { + "epoch": 0.22727644604434263, + "grad_norm": 0.0944550558924675, + "learning_rate": 9.994335151662268e-06, + "loss": 0.005, + "step": 13890 + }, + { + "epoch": 0.22744007199541846, + "grad_norm": 0.14530417323112488, + "learning_rate": 9.994266996252111e-06, + "loss": 0.0062, + "step": 13900 + }, + { + "epoch": 0.22760369794649432, + "grad_norm": 0.19953951239585876, + "learning_rate": 9.994198433526529e-06, + "loss": 0.0055, + "step": 13910 + }, + { + "epoch": 0.22776732389757015, + "grad_norm": 0.1666042059659958, + "learning_rate": 9.994129463491114e-06, + "loss": 0.0046, + "step": 13920 + }, + { + "epoch": 0.227930949848646, + "grad_norm": 0.07594744861125946, + "learning_rate": 9.994060086151491e-06, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.22809457579972184, + "grad_norm": 0.23996597528457642, + "learning_rate": 9.99399030151332e-06, + "loss": 0.004, + "step": 13940 + }, + { + "epoch": 0.22825820175079767, + "grad_norm": 0.19474714994430542, + "learning_rate": 9.99392010958229e-06, + "loss": 0.0041, + "step": 13950 + }, + { + "epoch": 0.22842182770187353, + "grad_norm": 0.07609615474939346, + "learning_rate": 9.993849510364127e-06, + "loss": 0.004, + "step": 13960 + }, + { + "epoch": 0.22858545365294936, + "grad_norm": 0.27029407024383545, + "learning_rate": 9.993778503864588e-06, + "loss": 0.0051, + "step": 13970 + }, + { + "epoch": 0.22874907960402519, + "grad_norm": 0.24098403751850128, + "learning_rate": 9.993707090089463e-06, + "loss": 0.0054, + "step": 13980 + }, + { + "epoch": 0.22891270555510104, + "grad_norm": 0.12854556739330292, + "learning_rate": 9.993635269044582e-06, + "loss": 0.0063, + "step": 13990 + }, + { + "epoch": 0.22907633150617687, + "grad_norm": 0.08363831788301468, + "learning_rate": 9.993563040735796e-06, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.22923995745725273, + "grad_norm": 0.5338570475578308, + "learning_rate": 9.993490405168997e-06, + "loss": 0.0045, + "step": 14010 + }, + { + "epoch": 0.22940358340832856, + "grad_norm": 0.31365853548049927, + "learning_rate": 9.99341736235011e-06, + "loss": 0.008, + "step": 14020 + }, + { + "epoch": 0.2295672093594044, + "grad_norm": 0.1387081742286682, + "learning_rate": 9.993343912285093e-06, + "loss": 0.0029, + "step": 14030 + }, + { + "epoch": 0.22973083531048025, + "grad_norm": 0.07173755019903183, + "learning_rate": 9.993270054979935e-06, + "loss": 0.0063, + "step": 14040 + }, + { + "epoch": 0.22989446126155608, + "grad_norm": 0.22205767035484314, + "learning_rate": 9.993195790440661e-06, + "loss": 0.0045, + "step": 14050 + }, + { + "epoch": 0.23005808721263193, + "grad_norm": 0.21055075526237488, + "learning_rate": 9.993121118673326e-06, + "loss": 0.0046, + "step": 14060 + }, + { + "epoch": 0.23022171316370776, + "grad_norm": 0.33890509605407715, + "learning_rate": 9.99304603968402e-06, + "loss": 0.0044, + "step": 14070 + }, + { + "epoch": 0.2303853391147836, + "grad_norm": 0.3940201699733734, + "learning_rate": 9.992970553478867e-06, + "loss": 0.0098, + "step": 14080 + }, + { + "epoch": 0.23054896506585945, + "grad_norm": 0.15668921172618866, + "learning_rate": 9.992894660064023e-06, + "loss": 0.0047, + "step": 14090 + }, + { + "epoch": 0.23071259101693528, + "grad_norm": 0.2652505040168762, + "learning_rate": 9.992818359445678e-06, + "loss": 0.0044, + "step": 14100 + }, + { + "epoch": 0.23087621696801114, + "grad_norm": 0.11571596562862396, + "learning_rate": 9.992741651630055e-06, + "loss": 0.0026, + "step": 14110 + }, + { + "epoch": 0.23103984291908697, + "grad_norm": 0.33519086241722107, + "learning_rate": 9.992664536623409e-06, + "loss": 0.0048, + "step": 14120 + }, + { + "epoch": 0.2312034688701628, + "grad_norm": 0.2683626711368561, + "learning_rate": 9.99258701443203e-06, + "loss": 0.0064, + "step": 14130 + }, + { + "epoch": 0.23136709482123866, + "grad_norm": 0.060153067111968994, + "learning_rate": 9.992509085062241e-06, + "loss": 0.0049, + "step": 14140 + }, + { + "epoch": 0.23153072077231449, + "grad_norm": 0.051124393939971924, + "learning_rate": 9.992430748520396e-06, + "loss": 0.0073, + "step": 14150 + }, + { + "epoch": 0.23169434672339034, + "grad_norm": 0.14996354281902313, + "learning_rate": 9.992352004812887e-06, + "loss": 0.004, + "step": 14160 + }, + { + "epoch": 0.23185797267446617, + "grad_norm": 0.17824135720729828, + "learning_rate": 9.992272853946133e-06, + "loss": 0.005, + "step": 14170 + }, + { + "epoch": 0.232021598625542, + "grad_norm": 0.3842923045158386, + "learning_rate": 9.99219329592659e-06, + "loss": 0.0074, + "step": 14180 + }, + { + "epoch": 0.23218522457661786, + "grad_norm": 0.3007977306842804, + "learning_rate": 9.992113330760744e-06, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.2323488505276937, + "grad_norm": 0.3273521065711975, + "learning_rate": 9.992032958455122e-06, + "loss": 0.0057, + "step": 14200 + }, + { + "epoch": 0.23251247647876952, + "grad_norm": 0.27948787808418274, + "learning_rate": 9.991952179016277e-06, + "loss": 0.0072, + "step": 14210 + }, + { + "epoch": 0.23267610242984538, + "grad_norm": 0.07427410036325455, + "learning_rate": 9.991870992450794e-06, + "loss": 0.0058, + "step": 14220 + }, + { + "epoch": 0.2328397283809212, + "grad_norm": 0.16711176931858063, + "learning_rate": 9.9917893987653e-06, + "loss": 0.0063, + "step": 14230 + }, + { + "epoch": 0.23300335433199706, + "grad_norm": 0.15623080730438232, + "learning_rate": 9.991707397966443e-06, + "loss": 0.0047, + "step": 14240 + }, + { + "epoch": 0.2331669802830729, + "grad_norm": 0.8671354651451111, + "learning_rate": 9.991624990060915e-06, + "loss": 0.0065, + "step": 14250 + }, + { + "epoch": 0.23333060623414872, + "grad_norm": 0.30463269352912903, + "learning_rate": 9.991542175055436e-06, + "loss": 0.0053, + "step": 14260 + }, + { + "epoch": 0.23349423218522458, + "grad_norm": 0.14944496750831604, + "learning_rate": 9.99145895295676e-06, + "loss": 0.0064, + "step": 14270 + }, + { + "epoch": 0.2336578581363004, + "grad_norm": 0.1360207349061966, + "learning_rate": 9.991375323771673e-06, + "loss": 0.0041, + "step": 14280 + }, + { + "epoch": 0.23382148408737627, + "grad_norm": 0.24179519712924957, + "learning_rate": 9.991291287506998e-06, + "loss": 0.0047, + "step": 14290 + }, + { + "epoch": 0.2339851100384521, + "grad_norm": 0.2822986841201782, + "learning_rate": 9.991206844169588e-06, + "loss": 0.0069, + "step": 14300 + }, + { + "epoch": 0.23414873598952793, + "grad_norm": 0.21003149449825287, + "learning_rate": 9.991121993766329e-06, + "loss": 0.0046, + "step": 14310 + }, + { + "epoch": 0.23431236194060379, + "grad_norm": 0.559204638004303, + "learning_rate": 9.99103673630414e-06, + "loss": 0.009, + "step": 14320 + }, + { + "epoch": 0.23447598789167962, + "grad_norm": 0.18115448951721191, + "learning_rate": 9.990951071789977e-06, + "loss": 0.0056, + "step": 14330 + }, + { + "epoch": 0.23463961384275547, + "grad_norm": 0.14334458112716675, + "learning_rate": 9.990865000230825e-06, + "loss": 0.0053, + "step": 14340 + }, + { + "epoch": 0.2348032397938313, + "grad_norm": 0.3625998795032501, + "learning_rate": 9.990778521633703e-06, + "loss": 0.0078, + "step": 14350 + }, + { + "epoch": 0.23496686574490713, + "grad_norm": 0.14576828479766846, + "learning_rate": 9.990691636005667e-06, + "loss": 0.0067, + "step": 14360 + }, + { + "epoch": 0.235130491695983, + "grad_norm": 0.13885726034641266, + "learning_rate": 9.990604343353799e-06, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.26105016469955444, + "learning_rate": 9.990516643685222e-06, + "loss": 0.0064, + "step": 14380 + }, + { + "epoch": 0.23545774359813468, + "grad_norm": 0.06199558824300766, + "learning_rate": 9.990428537007084e-06, + "loss": 0.0073, + "step": 14390 + }, + { + "epoch": 0.2356213695492105, + "grad_norm": 0.18994049727916718, + "learning_rate": 9.990340023326577e-06, + "loss": 0.0072, + "step": 14400 + }, + { + "epoch": 0.23578499550028634, + "grad_norm": 0.2793636918067932, + "learning_rate": 9.990251102650914e-06, + "loss": 0.0058, + "step": 14410 + }, + { + "epoch": 0.2359486214513622, + "grad_norm": 0.2294287085533142, + "learning_rate": 9.99016177498735e-06, + "loss": 0.0049, + "step": 14420 + }, + { + "epoch": 0.23611224740243802, + "grad_norm": 0.21436075866222382, + "learning_rate": 9.990072040343169e-06, + "loss": 0.0064, + "step": 14430 + }, + { + "epoch": 0.23627587335351385, + "grad_norm": 0.1207033172249794, + "learning_rate": 9.989981898725692e-06, + "loss": 0.0052, + "step": 14440 + }, + { + "epoch": 0.2364394993045897, + "grad_norm": 0.18606725335121155, + "learning_rate": 9.989891350142266e-06, + "loss": 0.0088, + "step": 14450 + }, + { + "epoch": 0.23660312525566554, + "grad_norm": 0.12256237119436264, + "learning_rate": 9.98980039460028e-06, + "loss": 0.0037, + "step": 14460 + }, + { + "epoch": 0.2367667512067414, + "grad_norm": 0.5557464361190796, + "learning_rate": 9.989709032107151e-06, + "loss": 0.0055, + "step": 14470 + }, + { + "epoch": 0.23693037715781723, + "grad_norm": 0.28305840492248535, + "learning_rate": 9.989617262670328e-06, + "loss": 0.004, + "step": 14480 + }, + { + "epoch": 0.23709400310889306, + "grad_norm": 0.4966054856777191, + "learning_rate": 9.989525086297299e-06, + "loss": 0.0047, + "step": 14490 + }, + { + "epoch": 0.23725762905996892, + "grad_norm": 0.13753916323184967, + "learning_rate": 9.98943250299558e-06, + "loss": 0.003, + "step": 14500 + }, + { + "epoch": 0.23742125501104475, + "grad_norm": 0.059442464262247086, + "learning_rate": 9.98933951277272e-06, + "loss": 0.0047, + "step": 14510 + }, + { + "epoch": 0.2375848809621206, + "grad_norm": 0.22410885989665985, + "learning_rate": 9.989246115636304e-06, + "loss": 0.0038, + "step": 14520 + }, + { + "epoch": 0.23774850691319643, + "grad_norm": 0.25762972235679626, + "learning_rate": 9.989152311593953e-06, + "loss": 0.0059, + "step": 14530 + }, + { + "epoch": 0.23791213286427226, + "grad_norm": 0.2537146508693695, + "learning_rate": 9.989058100653312e-06, + "loss": 0.0034, + "step": 14540 + }, + { + "epoch": 0.23807575881534812, + "grad_norm": 0.30126863718032837, + "learning_rate": 9.988963482822066e-06, + "loss": 0.0045, + "step": 14550 + }, + { + "epoch": 0.23823938476642395, + "grad_norm": 0.24404503405094147, + "learning_rate": 9.988868458107931e-06, + "loss": 0.0041, + "step": 14560 + }, + { + "epoch": 0.2384030107174998, + "grad_norm": 0.36956584453582764, + "learning_rate": 9.988773026518661e-06, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.23856663666857564, + "grad_norm": 0.11267326027154922, + "learning_rate": 9.988677188062036e-06, + "loss": 0.0049, + "step": 14580 + }, + { + "epoch": 0.23873026261965147, + "grad_norm": 0.1392108052968979, + "learning_rate": 9.988580942745871e-06, + "loss": 0.0083, + "step": 14590 + }, + { + "epoch": 0.23889388857072733, + "grad_norm": 0.07403679937124252, + "learning_rate": 9.988484290578018e-06, + "loss": 0.0035, + "step": 14600 + }, + { + "epoch": 0.23905751452180315, + "grad_norm": 0.2874336242675781, + "learning_rate": 9.988387231566358e-06, + "loss": 0.0066, + "step": 14610 + }, + { + "epoch": 0.23922114047287898, + "grad_norm": 0.15391848981380463, + "learning_rate": 9.988289765718808e-06, + "loss": 0.0044, + "step": 14620 + }, + { + "epoch": 0.23938476642395484, + "grad_norm": 0.07161050289869308, + "learning_rate": 9.988191893043317e-06, + "loss": 0.0037, + "step": 14630 + }, + { + "epoch": 0.23954839237503067, + "grad_norm": 0.30230942368507385, + "learning_rate": 9.988093613547864e-06, + "loss": 0.0053, + "step": 14640 + }, + { + "epoch": 0.23971201832610653, + "grad_norm": 0.11311539262533188, + "learning_rate": 9.987994927240469e-06, + "loss": 0.0048, + "step": 14650 + }, + { + "epoch": 0.23987564427718236, + "grad_norm": 0.06841092556715012, + "learning_rate": 9.987895834129177e-06, + "loss": 0.0069, + "step": 14660 + }, + { + "epoch": 0.2400392702282582, + "grad_norm": 0.15750084817409515, + "learning_rate": 9.987796334222073e-06, + "loss": 0.0055, + "step": 14670 + }, + { + "epoch": 0.24020289617933405, + "grad_norm": 0.06570466607809067, + "learning_rate": 9.987696427527268e-06, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.24036652213040988, + "grad_norm": 0.14742617309093475, + "learning_rate": 9.987596114052912e-06, + "loss": 0.0026, + "step": 14690 + }, + { + "epoch": 0.24053014808148573, + "grad_norm": 0.08288706094026566, + "learning_rate": 9.987495393807188e-06, + "loss": 0.0042, + "step": 14700 + }, + { + "epoch": 0.24069377403256156, + "grad_norm": 0.3669871985912323, + "learning_rate": 9.987394266798308e-06, + "loss": 0.0068, + "step": 14710 + }, + { + "epoch": 0.2408573999836374, + "grad_norm": 0.1870093047618866, + "learning_rate": 9.987292733034518e-06, + "loss": 0.004, + "step": 14720 + }, + { + "epoch": 0.24102102593471325, + "grad_norm": 0.0927494689822197, + "learning_rate": 9.987190792524101e-06, + "loss": 0.0045, + "step": 14730 + }, + { + "epoch": 0.24118465188578908, + "grad_norm": 0.07338996231555939, + "learning_rate": 9.987088445275375e-06, + "loss": 0.0051, + "step": 14740 + }, + { + "epoch": 0.24134827783686494, + "grad_norm": 0.2542848289012909, + "learning_rate": 9.986985691296679e-06, + "loss": 0.0049, + "step": 14750 + }, + { + "epoch": 0.24151190378794077, + "grad_norm": 0.14750252664089203, + "learning_rate": 9.986882530596398e-06, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.2416755297390166, + "grad_norm": 0.13782623410224915, + "learning_rate": 9.986778963182945e-06, + "loss": 0.0054, + "step": 14770 + }, + { + "epoch": 0.24183915569009246, + "grad_norm": 0.0877394899725914, + "learning_rate": 9.986674989064768e-06, + "loss": 0.0035, + "step": 14780 + }, + { + "epoch": 0.24200278164116829, + "grad_norm": 0.1941722333431244, + "learning_rate": 9.986570608250343e-06, + "loss": 0.0033, + "step": 14790 + }, + { + "epoch": 0.24216640759224414, + "grad_norm": 0.07986953109502792, + "learning_rate": 9.986465820748186e-06, + "loss": 0.0071, + "step": 14800 + }, + { + "epoch": 0.24233003354331997, + "grad_norm": 0.15755994617938995, + "learning_rate": 9.98636062656684e-06, + "loss": 0.0048, + "step": 14810 + }, + { + "epoch": 0.2424936594943958, + "grad_norm": 0.16584065556526184, + "learning_rate": 9.986255025714888e-06, + "loss": 0.0058, + "step": 14820 + }, + { + "epoch": 0.24265728544547166, + "grad_norm": 0.1625998318195343, + "learning_rate": 9.98614901820094e-06, + "loss": 0.0078, + "step": 14830 + }, + { + "epoch": 0.2428209113965475, + "grad_norm": 0.0650317594408989, + "learning_rate": 9.986042604033644e-06, + "loss": 0.0047, + "step": 14840 + }, + { + "epoch": 0.24298453734762332, + "grad_norm": 0.1152484193444252, + "learning_rate": 9.985935783221676e-06, + "loss": 0.0052, + "step": 14850 + }, + { + "epoch": 0.24314816329869918, + "grad_norm": 0.14701791107654572, + "learning_rate": 9.985828555773748e-06, + "loss": 0.0059, + "step": 14860 + }, + { + "epoch": 0.243311789249775, + "grad_norm": 0.33101290464401245, + "learning_rate": 9.98572092169861e-06, + "loss": 0.0038, + "step": 14870 + }, + { + "epoch": 0.24347541520085086, + "grad_norm": 0.1879790723323822, + "learning_rate": 9.985612881005034e-06, + "loss": 0.0044, + "step": 14880 + }, + { + "epoch": 0.2436390411519267, + "grad_norm": 0.09927515685558319, + "learning_rate": 9.985504433701834e-06, + "loss": 0.0039, + "step": 14890 + }, + { + "epoch": 0.24380266710300252, + "grad_norm": 0.13312315940856934, + "learning_rate": 9.985395579797855e-06, + "loss": 0.0032, + "step": 14900 + }, + { + "epoch": 0.24396629305407838, + "grad_norm": 0.176203653216362, + "learning_rate": 9.985286319301972e-06, + "loss": 0.0085, + "step": 14910 + }, + { + "epoch": 0.2441299190051542, + "grad_norm": 0.17916902899742126, + "learning_rate": 9.985176652223101e-06, + "loss": 0.0038, + "step": 14920 + }, + { + "epoch": 0.24429354495623007, + "grad_norm": 0.1555701494216919, + "learning_rate": 9.985066578570184e-06, + "loss": 0.0077, + "step": 14930 + }, + { + "epoch": 0.2444571709073059, + "grad_norm": 1.00691819190979, + "learning_rate": 9.984956098352194e-06, + "loss": 0.0044, + "step": 14940 + }, + { + "epoch": 0.24462079685838173, + "grad_norm": 0.1658470630645752, + "learning_rate": 9.984845211578147e-06, + "loss": 0.0044, + "step": 14950 + }, + { + "epoch": 0.24478442280945759, + "grad_norm": 0.22211848199367523, + "learning_rate": 9.984733918257085e-06, + "loss": 0.0048, + "step": 14960 + }, + { + "epoch": 0.24494804876053342, + "grad_norm": 0.8488159775733948, + "learning_rate": 9.984622218398083e-06, + "loss": 0.0067, + "step": 14970 + }, + { + "epoch": 0.24511167471160927, + "grad_norm": 0.3361782431602478, + "learning_rate": 9.984510112010253e-06, + "loss": 0.0075, + "step": 14980 + }, + { + "epoch": 0.2452753006626851, + "grad_norm": 0.16249647736549377, + "learning_rate": 9.984397599102735e-06, + "loss": 0.0063, + "step": 14990 + }, + { + "epoch": 0.24543892661376093, + "grad_norm": 0.21449130773544312, + "learning_rate": 9.98428467968471e-06, + "loss": 0.0058, + "step": 15000 + }, + { + "epoch": 0.2456025525648368, + "grad_norm": 0.11375071108341217, + "learning_rate": 9.984171353765383e-06, + "loss": 0.0031, + "step": 15010 + }, + { + "epoch": 0.24576617851591262, + "grad_norm": 0.11024336516857147, + "learning_rate": 9.984057621353997e-06, + "loss": 0.0045, + "step": 15020 + }, + { + "epoch": 0.24592980446698848, + "grad_norm": 0.08600004762411118, + "learning_rate": 9.98394348245983e-06, + "loss": 0.0054, + "step": 15030 + }, + { + "epoch": 0.2460934304180643, + "grad_norm": 0.22953863441944122, + "learning_rate": 9.98382893709219e-06, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.24625705636914014, + "grad_norm": 0.11961615085601807, + "learning_rate": 9.983713985260418e-06, + "loss": 0.0058, + "step": 15050 + }, + { + "epoch": 0.246420682320216, + "grad_norm": 0.34285932779312134, + "learning_rate": 9.98359862697389e-06, + "loss": 0.0042, + "step": 15060 + }, + { + "epoch": 0.24658430827129182, + "grad_norm": 0.07007353007793427, + "learning_rate": 9.983482862242011e-06, + "loss": 0.0045, + "step": 15070 + }, + { + "epoch": 0.24674793422236765, + "grad_norm": 0.3148964047431946, + "learning_rate": 9.983366691074228e-06, + "loss": 0.0064, + "step": 15080 + }, + { + "epoch": 0.2469115601734435, + "grad_norm": 0.08215848356485367, + "learning_rate": 9.983250113480009e-06, + "loss": 0.0034, + "step": 15090 + }, + { + "epoch": 0.24707518612451934, + "grad_norm": 0.2079235315322876, + "learning_rate": 9.983133129468869e-06, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 0.2472388120755952, + "grad_norm": 0.18692241609096527, + "learning_rate": 9.983015739050343e-06, + "loss": 0.0066, + "step": 15110 + }, + { + "epoch": 0.24740243802667103, + "grad_norm": 0.27569684386253357, + "learning_rate": 9.982897942234008e-06, + "loss": 0.0042, + "step": 15120 + }, + { + "epoch": 0.24756606397774686, + "grad_norm": 0.1384645253419876, + "learning_rate": 9.98277973902947e-06, + "loss": 0.0042, + "step": 15130 + }, + { + "epoch": 0.24772968992882272, + "grad_norm": 0.1159195825457573, + "learning_rate": 9.982661129446369e-06, + "loss": 0.0052, + "step": 15140 + }, + { + "epoch": 0.24789331587989855, + "grad_norm": 0.031648822128772736, + "learning_rate": 9.982542113494378e-06, + "loss": 0.0039, + "step": 15150 + }, + { + "epoch": 0.2480569418309744, + "grad_norm": 0.12593811750411987, + "learning_rate": 9.982422691183206e-06, + "loss": 0.0046, + "step": 15160 + }, + { + "epoch": 0.24822056778205023, + "grad_norm": 0.22473861277103424, + "learning_rate": 9.982302862522591e-06, + "loss": 0.0043, + "step": 15170 + }, + { + "epoch": 0.24838419373312606, + "grad_norm": 0.1660010665655136, + "learning_rate": 9.982182627522304e-06, + "loss": 0.0058, + "step": 15180 + }, + { + "epoch": 0.24854781968420192, + "grad_norm": 0.13586291670799255, + "learning_rate": 9.982061986192153e-06, + "loss": 0.0057, + "step": 15190 + }, + { + "epoch": 0.24871144563527775, + "grad_norm": 0.2331942319869995, + "learning_rate": 9.981940938541977e-06, + "loss": 0.0046, + "step": 15200 + }, + { + "epoch": 0.2488750715863536, + "grad_norm": 0.21079044044017792, + "learning_rate": 9.981819484581649e-06, + "loss": 0.0026, + "step": 15210 + }, + { + "epoch": 0.24903869753742944, + "grad_norm": 0.12120892107486725, + "learning_rate": 9.981697624321073e-06, + "loss": 0.0041, + "step": 15220 + }, + { + "epoch": 0.24920232348850527, + "grad_norm": 0.045738670974969864, + "learning_rate": 9.981575357770187e-06, + "loss": 0.0073, + "step": 15230 + }, + { + "epoch": 0.24936594943958112, + "grad_norm": 0.08233648538589478, + "learning_rate": 9.981452684938966e-06, + "loss": 0.0038, + "step": 15240 + }, + { + "epoch": 0.24952957539065695, + "grad_norm": 0.10493548214435577, + "learning_rate": 9.981329605837412e-06, + "loss": 0.0039, + "step": 15250 + }, + { + "epoch": 0.2496932013417328, + "grad_norm": 0.22596482932567596, + "learning_rate": 9.981206120475561e-06, + "loss": 0.006, + "step": 15260 + }, + { + "epoch": 0.24985682729280864, + "grad_norm": 0.07747557759284973, + "learning_rate": 9.981082228863487e-06, + "loss": 0.0041, + "step": 15270 + }, + { + "epoch": 0.2500204532438845, + "grad_norm": 0.37612032890319824, + "learning_rate": 9.980957931011294e-06, + "loss": 0.0049, + "step": 15280 + }, + { + "epoch": 0.25018407919496033, + "grad_norm": 0.18529130518436432, + "learning_rate": 9.980833226929118e-06, + "loss": 0.0045, + "step": 15290 + }, + { + "epoch": 0.25034770514603616, + "grad_norm": 0.268167644739151, + "learning_rate": 9.98070811662713e-06, + "loss": 0.0047, + "step": 15300 + }, + { + "epoch": 0.250511331097112, + "grad_norm": 0.08909649401903152, + "learning_rate": 9.980582600115536e-06, + "loss": 0.0044, + "step": 15310 + }, + { + "epoch": 0.2506749570481878, + "grad_norm": 0.17307843267917633, + "learning_rate": 9.98045667740457e-06, + "loss": 0.0042, + "step": 15320 + }, + { + "epoch": 0.2508385829992637, + "grad_norm": 0.07683459669351578, + "learning_rate": 9.980330348504502e-06, + "loss": 0.0048, + "step": 15330 + }, + { + "epoch": 0.25100220895033953, + "grad_norm": 0.19765128195285797, + "learning_rate": 9.980203613425636e-06, + "loss": 0.0036, + "step": 15340 + }, + { + "epoch": 0.25116583490141536, + "grad_norm": 0.31956881284713745, + "learning_rate": 9.980076472178307e-06, + "loss": 0.0047, + "step": 15350 + }, + { + "epoch": 0.2513294608524912, + "grad_norm": 0.2477467805147171, + "learning_rate": 9.979948924772884e-06, + "loss": 0.0073, + "step": 15360 + }, + { + "epoch": 0.251493086803567, + "grad_norm": 0.20623217523097992, + "learning_rate": 9.979820971219768e-06, + "loss": 0.0037, + "step": 15370 + }, + { + "epoch": 0.2516567127546429, + "grad_norm": 0.0902077704668045, + "learning_rate": 9.9796926115294e-06, + "loss": 0.0042, + "step": 15380 + }, + { + "epoch": 0.25182033870571874, + "grad_norm": 0.38105589151382446, + "learning_rate": 9.979563845712244e-06, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.25198396465679457, + "grad_norm": 0.09033389389514923, + "learning_rate": 9.979434673778803e-06, + "loss": 0.0057, + "step": 15400 + }, + { + "epoch": 0.2521475906078704, + "grad_norm": 0.08248700946569443, + "learning_rate": 9.97930509573961e-06, + "loss": 0.004, + "step": 15410 + }, + { + "epoch": 0.2523112165589462, + "grad_norm": 0.5345564484596252, + "learning_rate": 9.979175111605235e-06, + "loss": 0.0056, + "step": 15420 + }, + { + "epoch": 0.2524748425100221, + "grad_norm": 0.19937844574451447, + "learning_rate": 9.97904472138628e-06, + "loss": 0.0038, + "step": 15430 + }, + { + "epoch": 0.25263846846109794, + "grad_norm": 0.11802786588668823, + "learning_rate": 9.978913925093375e-06, + "loss": 0.0056, + "step": 15440 + }, + { + "epoch": 0.2528020944121738, + "grad_norm": 0.14457106590270996, + "learning_rate": 9.978782722737192e-06, + "loss": 0.0041, + "step": 15450 + }, + { + "epoch": 0.2529657203632496, + "grad_norm": 0.07589415460824966, + "learning_rate": 9.978651114328429e-06, + "loss": 0.0076, + "step": 15460 + }, + { + "epoch": 0.25312934631432543, + "grad_norm": 0.18440186977386475, + "learning_rate": 9.978519099877819e-06, + "loss": 0.0024, + "step": 15470 + }, + { + "epoch": 0.2532929722654013, + "grad_norm": 0.2740703523159027, + "learning_rate": 9.97838667939613e-06, + "loss": 0.0068, + "step": 15480 + }, + { + "epoch": 0.25345659821647715, + "grad_norm": 0.36186614632606506, + "learning_rate": 9.978253852894162e-06, + "loss": 0.0084, + "step": 15490 + }, + { + "epoch": 0.253620224167553, + "grad_norm": 0.11956515908241272, + "learning_rate": 9.978120620382748e-06, + "loss": 0.008, + "step": 15500 + }, + { + "epoch": 0.2537838501186288, + "grad_norm": 0.016868753358721733, + "learning_rate": 9.97798698187275e-06, + "loss": 0.0058, + "step": 15510 + }, + { + "epoch": 0.25394747606970464, + "grad_norm": 0.13054509460926056, + "learning_rate": 9.977852937375074e-06, + "loss": 0.004, + "step": 15520 + }, + { + "epoch": 0.2541111020207805, + "grad_norm": 0.24364294111728668, + "learning_rate": 9.977718486900647e-06, + "loss": 0.0046, + "step": 15530 + }, + { + "epoch": 0.25427472797185635, + "grad_norm": 0.05926011502742767, + "learning_rate": 9.977583630460437e-06, + "loss": 0.0046, + "step": 15540 + }, + { + "epoch": 0.2544383539229322, + "grad_norm": 0.1436033844947815, + "learning_rate": 9.977448368065438e-06, + "loss": 0.0089, + "step": 15550 + }, + { + "epoch": 0.254601979874008, + "grad_norm": 0.30904632806777954, + "learning_rate": 9.977312699726689e-06, + "loss": 0.0051, + "step": 15560 + }, + { + "epoch": 0.25476560582508384, + "grad_norm": 0.16983985900878906, + "learning_rate": 9.977176625455248e-06, + "loss": 0.0033, + "step": 15570 + }, + { + "epoch": 0.2549292317761597, + "grad_norm": 0.1800442487001419, + "learning_rate": 9.977040145262216e-06, + "loss": 0.0049, + "step": 15580 + }, + { + "epoch": 0.25509285772723556, + "grad_norm": 0.13681121170520782, + "learning_rate": 9.976903259158723e-06, + "loss": 0.0052, + "step": 15590 + }, + { + "epoch": 0.2552564836783114, + "grad_norm": 0.3236236274242401, + "learning_rate": 9.976765967155933e-06, + "loss": 0.0044, + "step": 15600 + }, + { + "epoch": 0.2554201096293872, + "grad_norm": 0.21640510857105255, + "learning_rate": 9.976628269265044e-06, + "loss": 0.0066, + "step": 15610 + }, + { + "epoch": 0.25558373558046305, + "grad_norm": 0.0908389464020729, + "learning_rate": 9.976490165497283e-06, + "loss": 0.0049, + "step": 15620 + }, + { + "epoch": 0.2557473615315389, + "grad_norm": 0.3596591353416443, + "learning_rate": 9.976351655863919e-06, + "loss": 0.0034, + "step": 15630 + }, + { + "epoch": 0.25591098748261476, + "grad_norm": 0.10676617920398712, + "learning_rate": 9.976212740376241e-06, + "loss": 0.0035, + "step": 15640 + }, + { + "epoch": 0.2560746134336906, + "grad_norm": 0.22603820264339447, + "learning_rate": 9.976073419045585e-06, + "loss": 0.0055, + "step": 15650 + }, + { + "epoch": 0.2562382393847664, + "grad_norm": 0.4517509639263153, + "learning_rate": 9.97593369188331e-06, + "loss": 0.0031, + "step": 15660 + }, + { + "epoch": 0.25640186533584225, + "grad_norm": 0.23560291528701782, + "learning_rate": 9.975793558900814e-06, + "loss": 0.0066, + "step": 15670 + }, + { + "epoch": 0.2565654912869181, + "grad_norm": 0.05763200297951698, + "learning_rate": 9.975653020109524e-06, + "loss": 0.0046, + "step": 15680 + }, + { + "epoch": 0.25672911723799396, + "grad_norm": 0.07072847336530685, + "learning_rate": 9.975512075520901e-06, + "loss": 0.0045, + "step": 15690 + }, + { + "epoch": 0.2568927431890698, + "grad_norm": 0.11106191575527191, + "learning_rate": 9.975370725146443e-06, + "loss": 0.0043, + "step": 15700 + }, + { + "epoch": 0.2570563691401456, + "grad_norm": 0.12212468683719635, + "learning_rate": 9.975228968997674e-06, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.25721999509122145, + "grad_norm": 0.2445971667766571, + "learning_rate": 9.97508680708616e-06, + "loss": 0.0058, + "step": 15720 + }, + { + "epoch": 0.2573836210422973, + "grad_norm": 0.11314980685710907, + "learning_rate": 9.974944239423492e-06, + "loss": 0.0061, + "step": 15730 + }, + { + "epoch": 0.25754724699337317, + "grad_norm": 0.10585033893585205, + "learning_rate": 9.974801266021296e-06, + "loss": 0.0098, + "step": 15740 + }, + { + "epoch": 0.257710872944449, + "grad_norm": 0.2456834465265274, + "learning_rate": 9.974657886891237e-06, + "loss": 0.0057, + "step": 15750 + }, + { + "epoch": 0.25787449889552483, + "grad_norm": 0.12119577825069427, + "learning_rate": 9.974514102045006e-06, + "loss": 0.0042, + "step": 15760 + }, + { + "epoch": 0.25803812484660066, + "grad_norm": 0.25558027625083923, + "learning_rate": 9.974369911494329e-06, + "loss": 0.0049, + "step": 15770 + }, + { + "epoch": 0.2582017507976765, + "grad_norm": 0.11136852204799652, + "learning_rate": 9.974225315250965e-06, + "loss": 0.0042, + "step": 15780 + }, + { + "epoch": 0.2583653767487524, + "grad_norm": 0.04781457781791687, + "learning_rate": 9.97408031332671e-06, + "loss": 0.0026, + "step": 15790 + }, + { + "epoch": 0.2585290026998282, + "grad_norm": 0.5310918688774109, + "learning_rate": 9.973934905733387e-06, + "loss": 0.004, + "step": 15800 + }, + { + "epoch": 0.25869262865090403, + "grad_norm": 0.008327346295118332, + "learning_rate": 9.973789092482856e-06, + "loss": 0.0032, + "step": 15810 + }, + { + "epoch": 0.25885625460197986, + "grad_norm": 0.13971813023090363, + "learning_rate": 9.973642873587009e-06, + "loss": 0.0041, + "step": 15820 + }, + { + "epoch": 0.2590198805530557, + "grad_norm": 0.20547564327716827, + "learning_rate": 9.97349624905777e-06, + "loss": 0.0036, + "step": 15830 + }, + { + "epoch": 0.2591835065041316, + "grad_norm": 0.23655913770198822, + "learning_rate": 9.9733492189071e-06, + "loss": 0.0039, + "step": 15840 + }, + { + "epoch": 0.2593471324552074, + "grad_norm": 0.10760346800088882, + "learning_rate": 9.973201783146989e-06, + "loss": 0.0047, + "step": 15850 + }, + { + "epoch": 0.25951075840628324, + "grad_norm": 0.10793501883745193, + "learning_rate": 9.973053941789458e-06, + "loss": 0.004, + "step": 15860 + }, + { + "epoch": 0.25967438435735907, + "grad_norm": 0.07372265309095383, + "learning_rate": 9.972905694846569e-06, + "loss": 0.0034, + "step": 15870 + }, + { + "epoch": 0.2598380103084349, + "grad_norm": 0.17902660369873047, + "learning_rate": 9.97275704233041e-06, + "loss": 0.0054, + "step": 15880 + }, + { + "epoch": 0.2600016362595108, + "grad_norm": 0.24299824237823486, + "learning_rate": 9.972607984253107e-06, + "loss": 0.0031, + "step": 15890 + }, + { + "epoch": 0.2601652622105866, + "grad_norm": 0.39346474409103394, + "learning_rate": 9.972458520626814e-06, + "loss": 0.0039, + "step": 15900 + }, + { + "epoch": 0.26032888816166244, + "grad_norm": 0.13941121101379395, + "learning_rate": 9.972308651463722e-06, + "loss": 0.0043, + "step": 15910 + }, + { + "epoch": 0.26049251411273827, + "grad_norm": 0.06831086426973343, + "learning_rate": 9.972158376776053e-06, + "loss": 0.0088, + "step": 15920 + }, + { + "epoch": 0.2606561400638141, + "grad_norm": 0.24526947736740112, + "learning_rate": 9.972007696576065e-06, + "loss": 0.0073, + "step": 15930 + }, + { + "epoch": 0.26081976601489, + "grad_norm": 0.1682722270488739, + "learning_rate": 9.971856610876043e-06, + "loss": 0.0031, + "step": 15940 + }, + { + "epoch": 0.2609833919659658, + "grad_norm": 0.11419714987277985, + "learning_rate": 9.971705119688314e-06, + "loss": 0.0036, + "step": 15950 + }, + { + "epoch": 0.26114701791704165, + "grad_norm": 0.7184643149375916, + "learning_rate": 9.97155322302523e-06, + "loss": 0.0108, + "step": 15960 + }, + { + "epoch": 0.2613106438681175, + "grad_norm": 0.15539275109767914, + "learning_rate": 9.97140092089918e-06, + "loss": 0.006, + "step": 15970 + }, + { + "epoch": 0.2614742698191933, + "grad_norm": 0.31286701560020447, + "learning_rate": 9.971248213322585e-06, + "loss": 0.004, + "step": 15980 + }, + { + "epoch": 0.2616378957702692, + "grad_norm": 0.17413610219955444, + "learning_rate": 9.971095100307898e-06, + "loss": 0.006, + "step": 15990 + }, + { + "epoch": 0.261801521721345, + "grad_norm": 0.09632241725921631, + "learning_rate": 9.970941581867608e-06, + "loss": 0.005, + "step": 16000 + }, + { + "epoch": 0.26196514767242085, + "grad_norm": 0.12612801790237427, + "learning_rate": 9.970787658014235e-06, + "loss": 0.0055, + "step": 16010 + }, + { + "epoch": 0.2621287736234967, + "grad_norm": 0.19577094912528992, + "learning_rate": 9.970633328760335e-06, + "loss": 0.0043, + "step": 16020 + }, + { + "epoch": 0.2622923995745725, + "grad_norm": 0.08477102965116501, + "learning_rate": 9.970478594118491e-06, + "loss": 0.0071, + "step": 16030 + }, + { + "epoch": 0.2624560255256484, + "grad_norm": 0.24222905933856964, + "learning_rate": 9.970323454101324e-06, + "loss": 0.0043, + "step": 16040 + }, + { + "epoch": 0.2626196514767242, + "grad_norm": 0.2804945409297943, + "learning_rate": 9.970167908721486e-06, + "loss": 0.0053, + "step": 16050 + }, + { + "epoch": 0.26278327742780005, + "grad_norm": 0.1598391830921173, + "learning_rate": 9.970011957991664e-06, + "loss": 0.0046, + "step": 16060 + }, + { + "epoch": 0.2629469033788759, + "grad_norm": 0.08779624104499817, + "learning_rate": 9.969855601924577e-06, + "loss": 0.0034, + "step": 16070 + }, + { + "epoch": 0.2631105293299517, + "grad_norm": 0.1460345834493637, + "learning_rate": 9.969698840532974e-06, + "loss": 0.003, + "step": 16080 + }, + { + "epoch": 0.26327415528102754, + "grad_norm": 0.3892747163772583, + "learning_rate": 9.969541673829643e-06, + "loss": 0.005, + "step": 16090 + }, + { + "epoch": 0.26343778123210343, + "grad_norm": 0.13804906606674194, + "learning_rate": 9.9693841018274e-06, + "loss": 0.0058, + "step": 16100 + }, + { + "epoch": 0.26360140718317926, + "grad_norm": 0.14759685099124908, + "learning_rate": 9.969226124539097e-06, + "loss": 0.0059, + "step": 16110 + }, + { + "epoch": 0.2637650331342551, + "grad_norm": 0.08929809927940369, + "learning_rate": 9.96906774197762e-06, + "loss": 0.0049, + "step": 16120 + }, + { + "epoch": 0.2639286590853309, + "grad_norm": 0.11297618597745895, + "learning_rate": 9.968908954155883e-06, + "loss": 0.0046, + "step": 16130 + }, + { + "epoch": 0.26409228503640675, + "grad_norm": 0.21108055114746094, + "learning_rate": 9.968749761086837e-06, + "loss": 0.0034, + "step": 16140 + }, + { + "epoch": 0.26425591098748263, + "grad_norm": 0.09542800486087799, + "learning_rate": 9.968590162783467e-06, + "loss": 0.0044, + "step": 16150 + }, + { + "epoch": 0.26441953693855846, + "grad_norm": 0.23468288779258728, + "learning_rate": 9.968430159258785e-06, + "loss": 0.004, + "step": 16160 + }, + { + "epoch": 0.2645831628896343, + "grad_norm": 0.06206181272864342, + "learning_rate": 9.968269750525846e-06, + "loss": 0.0027, + "step": 16170 + }, + { + "epoch": 0.2647467888407101, + "grad_norm": 0.3007090389728546, + "learning_rate": 9.968108936597729e-06, + "loss": 0.0072, + "step": 16180 + }, + { + "epoch": 0.26491041479178595, + "grad_norm": 0.23993346095085144, + "learning_rate": 9.967947717487549e-06, + "loss": 0.0108, + "step": 16190 + }, + { + "epoch": 0.26507404074286184, + "grad_norm": 0.11867068707942963, + "learning_rate": 9.967786093208457e-06, + "loss": 0.0026, + "step": 16200 + }, + { + "epoch": 0.26523766669393767, + "grad_norm": 0.2238997220993042, + "learning_rate": 9.967624063773634e-06, + "loss": 0.0048, + "step": 16210 + }, + { + "epoch": 0.2654012926450135, + "grad_norm": 0.2756165564060211, + "learning_rate": 9.967461629196291e-06, + "loss": 0.0036, + "step": 16220 + }, + { + "epoch": 0.2655649185960893, + "grad_norm": 0.21465995907783508, + "learning_rate": 9.96729878948968e-06, + "loss": 0.0065, + "step": 16230 + }, + { + "epoch": 0.26572854454716516, + "grad_norm": 0.1759486198425293, + "learning_rate": 9.96713554466708e-06, + "loss": 0.0046, + "step": 16240 + }, + { + "epoch": 0.26589217049824104, + "grad_norm": 0.3826967477798462, + "learning_rate": 9.966971894741804e-06, + "loss": 0.0037, + "step": 16250 + }, + { + "epoch": 0.2660557964493169, + "grad_norm": 0.33866703510284424, + "learning_rate": 9.9668078397272e-06, + "loss": 0.0031, + "step": 16260 + }, + { + "epoch": 0.2662194224003927, + "grad_norm": 0.23178279399871826, + "learning_rate": 9.966643379636646e-06, + "loss": 0.0058, + "step": 16270 + }, + { + "epoch": 0.26638304835146853, + "grad_norm": 0.2155774086713791, + "learning_rate": 9.966478514483557e-06, + "loss": 0.0038, + "step": 16280 + }, + { + "epoch": 0.26654667430254436, + "grad_norm": 0.21046411991119385, + "learning_rate": 9.966313244281377e-06, + "loss": 0.0053, + "step": 16290 + }, + { + "epoch": 0.26671030025362025, + "grad_norm": 0.05194896087050438, + "learning_rate": 9.966147569043584e-06, + "loss": 0.0043, + "step": 16300 + }, + { + "epoch": 0.2668739262046961, + "grad_norm": 0.11445903778076172, + "learning_rate": 9.965981488783697e-06, + "loss": 0.0052, + "step": 16310 + }, + { + "epoch": 0.2670375521557719, + "grad_norm": 0.09579271823167801, + "learning_rate": 9.96581500351525e-06, + "loss": 0.0057, + "step": 16320 + }, + { + "epoch": 0.26720117810684774, + "grad_norm": 0.11837436258792877, + "learning_rate": 9.965648113251828e-06, + "loss": 0.0065, + "step": 16330 + }, + { + "epoch": 0.26736480405792357, + "grad_norm": 0.03708185628056526, + "learning_rate": 9.965480818007042e-06, + "loss": 0.0064, + "step": 16340 + }, + { + "epoch": 0.26752843000899945, + "grad_norm": 0.14848802983760834, + "learning_rate": 9.965313117794532e-06, + "loss": 0.0036, + "step": 16350 + }, + { + "epoch": 0.2676920559600753, + "grad_norm": 0.13026051223278046, + "learning_rate": 9.96514501262798e-06, + "loss": 0.0064, + "step": 16360 + }, + { + "epoch": 0.2678556819111511, + "grad_norm": 0.09151672571897507, + "learning_rate": 9.964976502521093e-06, + "loss": 0.0061, + "step": 16370 + }, + { + "epoch": 0.26801930786222694, + "grad_norm": 0.11466898769140244, + "learning_rate": 9.964807587487614e-06, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 0.26818293381330277, + "grad_norm": 0.23110909759998322, + "learning_rate": 9.964638267541321e-06, + "loss": 0.0089, + "step": 16390 + }, + { + "epoch": 0.26834655976437866, + "grad_norm": 0.18098123371601105, + "learning_rate": 9.964468542696022e-06, + "loss": 0.0051, + "step": 16400 + }, + { + "epoch": 0.2685101857154545, + "grad_norm": 0.5774562358856201, + "learning_rate": 9.964298412965558e-06, + "loss": 0.0041, + "step": 16410 + }, + { + "epoch": 0.2686738116665303, + "grad_norm": 0.36918169260025024, + "learning_rate": 9.964127878363805e-06, + "loss": 0.0069, + "step": 16420 + }, + { + "epoch": 0.26883743761760615, + "grad_norm": 0.18472912907600403, + "learning_rate": 9.963956938904674e-06, + "loss": 0.0041, + "step": 16430 + }, + { + "epoch": 0.269001063568682, + "grad_norm": 0.6528200507164001, + "learning_rate": 9.963785594602103e-06, + "loss": 0.0058, + "step": 16440 + }, + { + "epoch": 0.26916468951975786, + "grad_norm": 0.36004722118377686, + "learning_rate": 9.963613845470066e-06, + "loss": 0.005, + "step": 16450 + }, + { + "epoch": 0.2693283154708337, + "grad_norm": 0.3867912292480469, + "learning_rate": 9.963441691522573e-06, + "loss": 0.006, + "step": 16460 + }, + { + "epoch": 0.2694919414219095, + "grad_norm": 0.5443325638771057, + "learning_rate": 9.963269132773661e-06, + "loss": 0.0074, + "step": 16470 + }, + { + "epoch": 0.26965556737298535, + "grad_norm": 0.2045241743326187, + "learning_rate": 9.963096169237407e-06, + "loss": 0.0078, + "step": 16480 + }, + { + "epoch": 0.2698191933240612, + "grad_norm": 0.4688001275062561, + "learning_rate": 9.962922800927915e-06, + "loss": 0.005, + "step": 16490 + }, + { + "epoch": 0.269982819275137, + "grad_norm": 0.17226062715053558, + "learning_rate": 9.962749027859325e-06, + "loss": 0.0047, + "step": 16500 + }, + { + "epoch": 0.2701464452262129, + "grad_norm": 0.2984844446182251, + "learning_rate": 9.962574850045807e-06, + "loss": 0.0064, + "step": 16510 + }, + { + "epoch": 0.2703100711772887, + "grad_norm": 0.18127554655075073, + "learning_rate": 9.962400267501568e-06, + "loss": 0.0058, + "step": 16520 + }, + { + "epoch": 0.27047369712836455, + "grad_norm": 0.17778706550598145, + "learning_rate": 9.96222528024085e-06, + "loss": 0.0041, + "step": 16530 + }, + { + "epoch": 0.2706373230794404, + "grad_norm": 0.20506048202514648, + "learning_rate": 9.962049888277918e-06, + "loss": 0.0036, + "step": 16540 + }, + { + "epoch": 0.2708009490305162, + "grad_norm": 0.22299182415008545, + "learning_rate": 9.961874091627082e-06, + "loss": 0.0101, + "step": 16550 + }, + { + "epoch": 0.2709645749815921, + "grad_norm": 0.14671024680137634, + "learning_rate": 9.961697890302675e-06, + "loss": 0.0064, + "step": 16560 + }, + { + "epoch": 0.27112820093266793, + "grad_norm": 0.08210575580596924, + "learning_rate": 9.96152128431907e-06, + "loss": 0.004, + "step": 16570 + }, + { + "epoch": 0.27129182688374376, + "grad_norm": 0.21329963207244873, + "learning_rate": 9.96134427369067e-06, + "loss": 0.005, + "step": 16580 + }, + { + "epoch": 0.2714554528348196, + "grad_norm": 0.1510000228881836, + "learning_rate": 9.96116685843191e-06, + "loss": 0.0035, + "step": 16590 + }, + { + "epoch": 0.2716190787858954, + "grad_norm": 0.5466216206550598, + "learning_rate": 9.96098903855726e-06, + "loss": 0.005, + "step": 16600 + }, + { + "epoch": 0.2717827047369713, + "grad_norm": 0.17150095105171204, + "learning_rate": 9.960810814081225e-06, + "loss": 0.0053, + "step": 16610 + }, + { + "epoch": 0.27194633068804713, + "grad_norm": 0.10637058317661285, + "learning_rate": 9.960632185018335e-06, + "loss": 0.0036, + "step": 16620 + }, + { + "epoch": 0.27210995663912296, + "grad_norm": 0.06568179279565811, + "learning_rate": 9.960453151383164e-06, + "loss": 0.0034, + "step": 16630 + }, + { + "epoch": 0.2722735825901988, + "grad_norm": 0.1168641522526741, + "learning_rate": 9.96027371319031e-06, + "loss": 0.0045, + "step": 16640 + }, + { + "epoch": 0.2724372085412746, + "grad_norm": 0.10334012657403946, + "learning_rate": 9.960093870454408e-06, + "loss": 0.0047, + "step": 16650 + }, + { + "epoch": 0.2726008344923505, + "grad_norm": 0.22061508893966675, + "learning_rate": 9.959913623190127e-06, + "loss": 0.0055, + "step": 16660 + }, + { + "epoch": 0.27276446044342634, + "grad_norm": 0.20489159226417542, + "learning_rate": 9.959732971412165e-06, + "loss": 0.005, + "step": 16670 + }, + { + "epoch": 0.27292808639450217, + "grad_norm": 0.17072172462940216, + "learning_rate": 9.959551915135255e-06, + "loss": 0.0025, + "step": 16680 + }, + { + "epoch": 0.273091712345578, + "grad_norm": 0.4690852463245392, + "learning_rate": 9.959370454374166e-06, + "loss": 0.0052, + "step": 16690 + }, + { + "epoch": 0.2732553382966538, + "grad_norm": 0.1498551368713379, + "learning_rate": 9.959188589143695e-06, + "loss": 0.0058, + "step": 16700 + }, + { + "epoch": 0.2734189642477297, + "grad_norm": 0.24540501832962036, + "learning_rate": 9.959006319458676e-06, + "loss": 0.0041, + "step": 16710 + }, + { + "epoch": 0.27358259019880554, + "grad_norm": 0.06451457738876343, + "learning_rate": 9.958823645333975e-06, + "loss": 0.0037, + "step": 16720 + }, + { + "epoch": 0.27374621614988137, + "grad_norm": 0.13698600232601166, + "learning_rate": 9.958640566784488e-06, + "loss": 0.0076, + "step": 16730 + }, + { + "epoch": 0.2739098421009572, + "grad_norm": 0.07890374213457108, + "learning_rate": 9.958457083825147e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 0.27407346805203303, + "grad_norm": 0.14798398315906525, + "learning_rate": 9.958273196470915e-06, + "loss": 0.0026, + "step": 16750 + }, + { + "epoch": 0.2742370940031089, + "grad_norm": 0.2839567959308624, + "learning_rate": 9.958088904736793e-06, + "loss": 0.0041, + "step": 16760 + }, + { + "epoch": 0.27440071995418475, + "grad_norm": 0.19271968305110931, + "learning_rate": 9.957904208637807e-06, + "loss": 0.0066, + "step": 16770 + }, + { + "epoch": 0.2745643459052606, + "grad_norm": 0.11610250174999237, + "learning_rate": 9.957719108189023e-06, + "loss": 0.0045, + "step": 16780 + }, + { + "epoch": 0.2747279718563364, + "grad_norm": 0.2309107631444931, + "learning_rate": 9.957533603405536e-06, + "loss": 0.0053, + "step": 16790 + }, + { + "epoch": 0.27489159780741224, + "grad_norm": 0.11249174177646637, + "learning_rate": 9.957347694302475e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 0.2750552237584881, + "grad_norm": 0.25255057215690613, + "learning_rate": 9.957161380895002e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "epoch": 0.27521884970956395, + "grad_norm": 0.04065065085887909, + "learning_rate": 9.956974663198314e-06, + "loss": 0.0039, + "step": 16820 + }, + { + "epoch": 0.2753824756606398, + "grad_norm": 0.13424116373062134, + "learning_rate": 9.956787541227635e-06, + "loss": 0.0036, + "step": 16830 + }, + { + "epoch": 0.2755461016117156, + "grad_norm": 0.4534515142440796, + "learning_rate": 9.95660001499823e-06, + "loss": 0.0067, + "step": 16840 + }, + { + "epoch": 0.27570972756279144, + "grad_norm": 0.11763358861207962, + "learning_rate": 9.956412084525392e-06, + "loss": 0.0025, + "step": 16850 + }, + { + "epoch": 0.2758733535138673, + "grad_norm": 0.1640862673521042, + "learning_rate": 9.956223749824447e-06, + "loss": 0.0071, + "step": 16860 + }, + { + "epoch": 0.27603697946494316, + "grad_norm": 0.04584338888525963, + "learning_rate": 9.956035010910757e-06, + "loss": 0.0042, + "step": 16870 + }, + { + "epoch": 0.276200605416019, + "grad_norm": 0.3774982988834381, + "learning_rate": 9.95584586779971e-06, + "loss": 0.0057, + "step": 16880 + }, + { + "epoch": 0.2763642313670948, + "grad_norm": 0.30738601088523865, + "learning_rate": 9.955656320506738e-06, + "loss": 0.0036, + "step": 16890 + }, + { + "epoch": 0.27652785731817064, + "grad_norm": 0.2193276435136795, + "learning_rate": 9.955466369047297e-06, + "loss": 0.0071, + "step": 16900 + }, + { + "epoch": 0.27669148326924653, + "grad_norm": 0.271788090467453, + "learning_rate": 9.955276013436877e-06, + "loss": 0.0053, + "step": 16910 + }, + { + "epoch": 0.27685510922032236, + "grad_norm": 0.2519876956939697, + "learning_rate": 9.955085253691006e-06, + "loss": 0.0068, + "step": 16920 + }, + { + "epoch": 0.2770187351713982, + "grad_norm": 0.15955935418605804, + "learning_rate": 9.95489408982524e-06, + "loss": 0.0032, + "step": 16930 + }, + { + "epoch": 0.277182361122474, + "grad_norm": 0.1520773321390152, + "learning_rate": 9.954702521855171e-06, + "loss": 0.0049, + "step": 16940 + }, + { + "epoch": 0.27734598707354985, + "grad_norm": 0.132028728723526, + "learning_rate": 9.954510549796421e-06, + "loss": 0.0059, + "step": 16950 + }, + { + "epoch": 0.2775096130246257, + "grad_norm": 0.14376412332057953, + "learning_rate": 9.954318173664648e-06, + "loss": 0.0054, + "step": 16960 + }, + { + "epoch": 0.27767323897570156, + "grad_norm": 0.1658952385187149, + "learning_rate": 9.95412539347554e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 0.2778368649267774, + "grad_norm": 0.008403602056205273, + "learning_rate": 9.95393220924482e-06, + "loss": 0.0065, + "step": 16980 + }, + { + "epoch": 0.2780004908778532, + "grad_norm": 0.20611797273159027, + "learning_rate": 9.953738620988244e-06, + "loss": 0.0035, + "step": 16990 + }, + { + "epoch": 0.27816411682892905, + "grad_norm": 0.21646977961063385, + "learning_rate": 9.9535446287216e-06, + "loss": 0.003, + "step": 17000 + }, + { + "epoch": 0.2783277427800049, + "grad_norm": 0.09182654321193695, + "learning_rate": 9.95335023246071e-06, + "loss": 0.0054, + "step": 17010 + }, + { + "epoch": 0.27849136873108077, + "grad_norm": 0.1209140196442604, + "learning_rate": 9.953155432221428e-06, + "loss": 0.0095, + "step": 17020 + }, + { + "epoch": 0.2786549946821566, + "grad_norm": 0.1542261838912964, + "learning_rate": 9.95296022801964e-06, + "loss": 0.0029, + "step": 17030 + }, + { + "epoch": 0.27881862063323243, + "grad_norm": 0.008339930325746536, + "learning_rate": 9.95276461987127e-06, + "loss": 0.0036, + "step": 17040 + }, + { + "epoch": 0.27898224658430826, + "grad_norm": 0.2207038402557373, + "learning_rate": 9.952568607792265e-06, + "loss": 0.0047, + "step": 17050 + }, + { + "epoch": 0.2791458725353841, + "grad_norm": 0.40690934658050537, + "learning_rate": 9.952372191798615e-06, + "loss": 0.0049, + "step": 17060 + }, + { + "epoch": 0.27930949848646, + "grad_norm": 0.18785670399665833, + "learning_rate": 9.952175371906339e-06, + "loss": 0.0037, + "step": 17070 + }, + { + "epoch": 0.2794731244375358, + "grad_norm": 0.17055024206638336, + "learning_rate": 9.951978148131489e-06, + "loss": 0.0056, + "step": 17080 + }, + { + "epoch": 0.27963675038861163, + "grad_norm": 0.054821744561195374, + "learning_rate": 9.951780520490149e-06, + "loss": 0.0031, + "step": 17090 + }, + { + "epoch": 0.27980037633968746, + "grad_norm": 0.13429221510887146, + "learning_rate": 9.951582488998436e-06, + "loss": 0.006, + "step": 17100 + }, + { + "epoch": 0.2799640022907633, + "grad_norm": 0.033095214515924454, + "learning_rate": 9.951384053672504e-06, + "loss": 0.0042, + "step": 17110 + }, + { + "epoch": 0.2801276282418392, + "grad_norm": 0.19814211130142212, + "learning_rate": 9.951185214528534e-06, + "loss": 0.0071, + "step": 17120 + }, + { + "epoch": 0.280291254192915, + "grad_norm": 0.09839620441198349, + "learning_rate": 9.950985971582743e-06, + "loss": 0.0038, + "step": 17130 + }, + { + "epoch": 0.28045488014399084, + "grad_norm": 0.17991285026073456, + "learning_rate": 9.95078632485138e-06, + "loss": 0.0039, + "step": 17140 + }, + { + "epoch": 0.28061850609506667, + "grad_norm": 0.176238551735878, + "learning_rate": 9.950586274350728e-06, + "loss": 0.0089, + "step": 17150 + }, + { + "epoch": 0.2807821320461425, + "grad_norm": 0.18992644548416138, + "learning_rate": 9.950385820097104e-06, + "loss": 0.0052, + "step": 17160 + }, + { + "epoch": 0.2809457579972184, + "grad_norm": 0.19561715424060822, + "learning_rate": 9.950184962106855e-06, + "loss": 0.005, + "step": 17170 + }, + { + "epoch": 0.2811093839482942, + "grad_norm": 0.16559667885303497, + "learning_rate": 9.949983700396363e-06, + "loss": 0.0049, + "step": 17180 + }, + { + "epoch": 0.28127300989937004, + "grad_norm": 0.19417065382003784, + "learning_rate": 9.949782034982042e-06, + "loss": 0.0042, + "step": 17190 + }, + { + "epoch": 0.28143663585044587, + "grad_norm": 0.33402079343795776, + "learning_rate": 9.949579965880338e-06, + "loss": 0.0065, + "step": 17200 + }, + { + "epoch": 0.2816002618015217, + "grad_norm": 0.2246643304824829, + "learning_rate": 9.949377493107732e-06, + "loss": 0.005, + "step": 17210 + }, + { + "epoch": 0.2817638877525976, + "grad_norm": 0.17307962477207184, + "learning_rate": 9.949174616680736e-06, + "loss": 0.0059, + "step": 17220 + }, + { + "epoch": 0.2819275137036734, + "grad_norm": 0.5333188772201538, + "learning_rate": 9.948971336615897e-06, + "loss": 0.0039, + "step": 17230 + }, + { + "epoch": 0.28209113965474925, + "grad_norm": 0.1200670599937439, + "learning_rate": 9.948767652929796e-06, + "loss": 0.0048, + "step": 17240 + }, + { + "epoch": 0.2822547656058251, + "grad_norm": 0.23842327296733856, + "learning_rate": 9.948563565639041e-06, + "loss": 0.0049, + "step": 17250 + }, + { + "epoch": 0.2824183915569009, + "grad_norm": 0.19713306427001953, + "learning_rate": 9.948359074760277e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 0.2825820175079768, + "grad_norm": 0.1332864612340927, + "learning_rate": 9.948154180310184e-06, + "loss": 0.005, + "step": 17270 + }, + { + "epoch": 0.2827456434590526, + "grad_norm": 0.11713171005249023, + "learning_rate": 9.94794888230547e-06, + "loss": 0.0046, + "step": 17280 + }, + { + "epoch": 0.28290926941012845, + "grad_norm": 0.11685027182102203, + "learning_rate": 9.947743180762881e-06, + "loss": 0.0052, + "step": 17290 + }, + { + "epoch": 0.2830728953612043, + "grad_norm": 0.07329851388931274, + "learning_rate": 9.947537075699193e-06, + "loss": 0.0041, + "step": 17300 + }, + { + "epoch": 0.2832365213122801, + "grad_norm": 0.1772112399339676, + "learning_rate": 9.94733056713121e-06, + "loss": 0.0032, + "step": 17310 + }, + { + "epoch": 0.283400147263356, + "grad_norm": 0.1411488801240921, + "learning_rate": 9.94712365507578e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 0.2835637732144318, + "grad_norm": 0.18904012441635132, + "learning_rate": 9.946916339549777e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 0.28372739916550765, + "grad_norm": 0.1454576551914215, + "learning_rate": 9.946708620570108e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 0.2838910251165835, + "grad_norm": 0.14934088289737701, + "learning_rate": 9.946500498153712e-06, + "loss": 0.003, + "step": 17350 + }, + { + "epoch": 0.2840546510676593, + "grad_norm": 0.3014727532863617, + "learning_rate": 9.946291972317567e-06, + "loss": 0.004, + "step": 17360 + }, + { + "epoch": 0.28421827701873514, + "grad_norm": 0.0963466539978981, + "learning_rate": 9.946083043078677e-06, + "loss": 0.0035, + "step": 17370 + }, + { + "epoch": 0.28438190296981103, + "grad_norm": 0.259141743183136, + "learning_rate": 9.945873710454084e-06, + "loss": 0.0033, + "step": 17380 + }, + { + "epoch": 0.28454552892088686, + "grad_norm": 0.06163093075156212, + "learning_rate": 9.945663974460856e-06, + "loss": 0.0052, + "step": 17390 + }, + { + "epoch": 0.2847091548719627, + "grad_norm": 0.12710176408290863, + "learning_rate": 9.945453835116101e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 0.2848727808230385, + "grad_norm": 0.39482325315475464, + "learning_rate": 9.945243292436958e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 0.28503640677411435, + "grad_norm": 0.10586988180875778, + "learning_rate": 9.945032346440597e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 0.28520003272519023, + "grad_norm": 0.2811430096626282, + "learning_rate": 9.944820997144222e-06, + "loss": 0.0051, + "step": 17430 + }, + { + "epoch": 0.28536365867626606, + "grad_norm": 0.23797577619552612, + "learning_rate": 9.94460924456507e-06, + "loss": 0.0042, + "step": 17440 + }, + { + "epoch": 0.2855272846273419, + "grad_norm": 0.21866446733474731, + "learning_rate": 9.944397088720412e-06, + "loss": 0.0063, + "step": 17450 + }, + { + "epoch": 0.2856909105784177, + "grad_norm": 0.21976706385612488, + "learning_rate": 9.944184529627549e-06, + "loss": 0.006, + "step": 17460 + }, + { + "epoch": 0.28585453652949355, + "grad_norm": 0.1605653464794159, + "learning_rate": 9.943971567303815e-06, + "loss": 0.0045, + "step": 17470 + }, + { + "epoch": 0.28601816248056944, + "grad_norm": 0.11667539924383163, + "learning_rate": 9.943758201766585e-06, + "loss": 0.0025, + "step": 17480 + }, + { + "epoch": 0.28618178843164527, + "grad_norm": 0.213748961687088, + "learning_rate": 9.943544433033254e-06, + "loss": 0.0037, + "step": 17490 + }, + { + "epoch": 0.2863454143827211, + "grad_norm": 0.067848339676857, + "learning_rate": 9.94333026112126e-06, + "loss": 0.0042, + "step": 17500 + }, + { + "epoch": 0.2865090403337969, + "grad_norm": 0.12439949065446854, + "learning_rate": 9.943115686048067e-06, + "loss": 0.0026, + "step": 17510 + }, + { + "epoch": 0.28667266628487276, + "grad_norm": 0.37913748621940613, + "learning_rate": 9.942900707831178e-06, + "loss": 0.0081, + "step": 17520 + }, + { + "epoch": 0.28683629223594864, + "grad_norm": 0.23941577970981598, + "learning_rate": 9.942685326488122e-06, + "loss": 0.0036, + "step": 17530 + }, + { + "epoch": 0.28699991818702447, + "grad_norm": 0.14176420867443085, + "learning_rate": 9.942469542036468e-06, + "loss": 0.003, + "step": 17540 + }, + { + "epoch": 0.2871635441381003, + "grad_norm": 0.11878236383199692, + "learning_rate": 9.942253354493816e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 0.28732717008917613, + "grad_norm": 0.24840830266475677, + "learning_rate": 9.942036763877794e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 0.28749079604025196, + "grad_norm": 0.2546611428260803, + "learning_rate": 9.941819770206067e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 0.28765442199132785, + "grad_norm": 0.09517211467027664, + "learning_rate": 9.941602373496334e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 0.2878180479424037, + "grad_norm": 0.20494259893894196, + "learning_rate": 9.941384573766324e-06, + "loss": 0.0042, + "step": 17590 + }, + { + "epoch": 0.2879816738934795, + "grad_norm": 0.5144269466400146, + "learning_rate": 9.9411663710338e-06, + "loss": 0.0092, + "step": 17600 + }, + { + "epoch": 0.28814529984455534, + "grad_norm": 0.6665263772010803, + "learning_rate": 9.940947765316559e-06, + "loss": 0.0042, + "step": 17610 + }, + { + "epoch": 0.28830892579563117, + "grad_norm": 0.14284846186637878, + "learning_rate": 9.940728756632427e-06, + "loss": 0.0029, + "step": 17620 + }, + { + "epoch": 0.28847255174670705, + "grad_norm": 0.23586921393871307, + "learning_rate": 9.94050934499927e-06, + "loss": 0.0092, + "step": 17630 + }, + { + "epoch": 0.2886361776977829, + "grad_norm": 0.23336808383464813, + "learning_rate": 9.940289530434978e-06, + "loss": 0.0047, + "step": 17640 + }, + { + "epoch": 0.2887998036488587, + "grad_norm": 0.2163964956998825, + "learning_rate": 9.940069312957481e-06, + "loss": 0.004, + "step": 17650 + }, + { + "epoch": 0.28896342959993454, + "grad_norm": 0.15693232417106628, + "learning_rate": 9.939848692584737e-06, + "loss": 0.0047, + "step": 17660 + }, + { + "epoch": 0.28912705555101037, + "grad_norm": 0.1399923861026764, + "learning_rate": 9.939627669334741e-06, + "loss": 0.0088, + "step": 17670 + }, + { + "epoch": 0.28929068150208626, + "grad_norm": 0.23836109042167664, + "learning_rate": 9.93940624322552e-06, + "loss": 0.0045, + "step": 17680 + }, + { + "epoch": 0.2894543074531621, + "grad_norm": 0.050740379840135574, + "learning_rate": 9.93918441427513e-06, + "loss": 0.0039, + "step": 17690 + }, + { + "epoch": 0.2896179334042379, + "grad_norm": 0.05332972854375839, + "learning_rate": 9.938962182501663e-06, + "loss": 0.004, + "step": 17700 + }, + { + "epoch": 0.28978155935531374, + "grad_norm": 0.1492157131433487, + "learning_rate": 9.938739547923245e-06, + "loss": 0.0039, + "step": 17710 + }, + { + "epoch": 0.2899451853063896, + "grad_norm": 0.31322428584098816, + "learning_rate": 9.938516510558034e-06, + "loss": 0.0039, + "step": 17720 + }, + { + "epoch": 0.29010881125746546, + "grad_norm": 0.1721370667219162, + "learning_rate": 9.938293070424217e-06, + "loss": 0.0054, + "step": 17730 + }, + { + "epoch": 0.2902724372085413, + "grad_norm": 0.30744728446006775, + "learning_rate": 9.938069227540017e-06, + "loss": 0.006, + "step": 17740 + }, + { + "epoch": 0.2904360631596171, + "grad_norm": 0.05862875282764435, + "learning_rate": 9.937844981923695e-06, + "loss": 0.005, + "step": 17750 + }, + { + "epoch": 0.29059968911069295, + "grad_norm": 0.1096782386302948, + "learning_rate": 9.937620333593534e-06, + "loss": 0.0038, + "step": 17760 + }, + { + "epoch": 0.2907633150617688, + "grad_norm": 0.11893852800130844, + "learning_rate": 9.937395282567859e-06, + "loss": 0.005, + "step": 17770 + }, + { + "epoch": 0.29092694101284466, + "grad_norm": 0.09523512423038483, + "learning_rate": 9.937169828865023e-06, + "loss": 0.0039, + "step": 17780 + }, + { + "epoch": 0.2910905669639205, + "grad_norm": 0.0739772841334343, + "learning_rate": 9.936943972503412e-06, + "loss": 0.0051, + "step": 17790 + }, + { + "epoch": 0.2912541929149963, + "grad_norm": 0.24942955374717712, + "learning_rate": 9.93671771350145e-06, + "loss": 0.0031, + "step": 17800 + }, + { + "epoch": 0.29141781886607215, + "grad_norm": 0.09600315243005753, + "learning_rate": 9.936491051877584e-06, + "loss": 0.0037, + "step": 17810 + }, + { + "epoch": 0.291581444817148, + "grad_norm": 0.12371202558279037, + "learning_rate": 9.936263987650305e-06, + "loss": 0.0074, + "step": 17820 + }, + { + "epoch": 0.2917450707682238, + "grad_norm": 0.047643277794122696, + "learning_rate": 9.936036520838131e-06, + "loss": 0.003, + "step": 17830 + }, + { + "epoch": 0.2919086967192997, + "grad_norm": 0.06473499536514282, + "learning_rate": 9.93580865145961e-06, + "loss": 0.0024, + "step": 17840 + }, + { + "epoch": 0.29207232267037553, + "grad_norm": 0.07858459651470184, + "learning_rate": 9.935580379533331e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 0.29223594862145136, + "grad_norm": 0.15511786937713623, + "learning_rate": 9.935351705077907e-06, + "loss": 0.0058, + "step": 17860 + }, + { + "epoch": 0.2923995745725272, + "grad_norm": 0.13278204202651978, + "learning_rate": 9.93512262811199e-06, + "loss": 0.0034, + "step": 17870 + }, + { + "epoch": 0.292563200523603, + "grad_norm": 0.3024803698062897, + "learning_rate": 9.934893148654263e-06, + "loss": 0.0034, + "step": 17880 + }, + { + "epoch": 0.2927268264746789, + "grad_norm": 0.06911783665418625, + "learning_rate": 9.934663266723438e-06, + "loss": 0.0047, + "step": 17890 + }, + { + "epoch": 0.29289045242575473, + "grad_norm": 0.21858294308185577, + "learning_rate": 9.934432982338268e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 0.29305407837683056, + "grad_norm": 0.29643192887306213, + "learning_rate": 9.934202295517533e-06, + "loss": 0.0072, + "step": 17910 + }, + { + "epoch": 0.2932177043279064, + "grad_norm": 0.21975645422935486, + "learning_rate": 9.933971206280047e-06, + "loss": 0.0039, + "step": 17920 + }, + { + "epoch": 0.2933813302789822, + "grad_norm": 0.1387801170349121, + "learning_rate": 9.933739714644653e-06, + "loss": 0.0033, + "step": 17930 + }, + { + "epoch": 0.2935449562300581, + "grad_norm": 0.25593283772468567, + "learning_rate": 9.933507820630237e-06, + "loss": 0.0051, + "step": 17940 + }, + { + "epoch": 0.29370858218113394, + "grad_norm": 0.17318041622638702, + "learning_rate": 9.933275524255707e-06, + "loss": 0.0048, + "step": 17950 + }, + { + "epoch": 0.29387220813220977, + "grad_norm": 0.08971473574638367, + "learning_rate": 9.93304282554001e-06, + "loss": 0.0026, + "step": 17960 + }, + { + "epoch": 0.2940358340832856, + "grad_norm": 0.00987961608916521, + "learning_rate": 9.932809724502124e-06, + "loss": 0.0068, + "step": 17970 + }, + { + "epoch": 0.2941994600343614, + "grad_norm": 0.28710561990737915, + "learning_rate": 9.93257622116106e-06, + "loss": 0.0052, + "step": 17980 + }, + { + "epoch": 0.2943630859854373, + "grad_norm": 0.1490069329738617, + "learning_rate": 9.93234231553586e-06, + "loss": 0.006, + "step": 17990 + }, + { + "epoch": 0.29452671193651314, + "grad_norm": 0.11392813920974731, + "learning_rate": 9.932108007645602e-06, + "loss": 0.0037, + "step": 18000 + }, + { + "epoch": 0.29469033788758897, + "grad_norm": 0.1317816823720932, + "learning_rate": 9.931873297509396e-06, + "loss": 0.0035, + "step": 18010 + }, + { + "epoch": 0.2948539638386648, + "grad_norm": 0.16625213623046875, + "learning_rate": 9.931638185146383e-06, + "loss": 0.005, + "step": 18020 + }, + { + "epoch": 0.29501758978974063, + "grad_norm": 0.2743920385837555, + "learning_rate": 9.93140267057574e-06, + "loss": 0.0074, + "step": 18030 + }, + { + "epoch": 0.2951812157408165, + "grad_norm": 0.1882653832435608, + "learning_rate": 9.931166753816673e-06, + "loss": 0.0033, + "step": 18040 + }, + { + "epoch": 0.29534484169189235, + "grad_norm": 0.2246755063533783, + "learning_rate": 9.930930434888422e-06, + "loss": 0.0032, + "step": 18050 + }, + { + "epoch": 0.2955084676429682, + "grad_norm": 0.10592489689588547, + "learning_rate": 9.930693713810262e-06, + "loss": 0.0063, + "step": 18060 + }, + { + "epoch": 0.295672093594044, + "grad_norm": 0.007365428376942873, + "learning_rate": 9.930456590601499e-06, + "loss": 0.0089, + "step": 18070 + }, + { + "epoch": 0.29583571954511984, + "grad_norm": 0.09267830103635788, + "learning_rate": 9.93021906528147e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 0.2959993454961957, + "grad_norm": 0.23627789318561554, + "learning_rate": 9.929981137869548e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 0.29616297144727155, + "grad_norm": 0.851901113986969, + "learning_rate": 9.929742808385139e-06, + "loss": 0.0068, + "step": 18100 + }, + { + "epoch": 0.2963265973983474, + "grad_norm": 0.2420031875371933, + "learning_rate": 9.929504076847677e-06, + "loss": 0.0063, + "step": 18110 + }, + { + "epoch": 0.2964902233494232, + "grad_norm": 0.22772027552127838, + "learning_rate": 9.929264943276635e-06, + "loss": 0.0067, + "step": 18120 + }, + { + "epoch": 0.29665384930049904, + "grad_norm": 0.15481628477573395, + "learning_rate": 9.929025407691516e-06, + "loss": 0.004, + "step": 18130 + }, + { + "epoch": 0.2968174752515749, + "grad_norm": 0.10913346707820892, + "learning_rate": 9.928785470111852e-06, + "loss": 0.0057, + "step": 18140 + }, + { + "epoch": 0.29698110120265075, + "grad_norm": 0.3253653049468994, + "learning_rate": 9.928545130557216e-06, + "loss": 0.0058, + "step": 18150 + }, + { + "epoch": 0.2971447271537266, + "grad_norm": 0.16288554668426514, + "learning_rate": 9.928304389047209e-06, + "loss": 0.0066, + "step": 18160 + }, + { + "epoch": 0.2973083531048024, + "grad_norm": 0.2058979570865631, + "learning_rate": 9.928063245601463e-06, + "loss": 0.0049, + "step": 18170 + }, + { + "epoch": 0.29747197905587824, + "grad_norm": 0.17294132709503174, + "learning_rate": 9.927821700239643e-06, + "loss": 0.0052, + "step": 18180 + }, + { + "epoch": 0.29763560500695413, + "grad_norm": 0.13950498402118683, + "learning_rate": 9.927579752981454e-06, + "loss": 0.0065, + "step": 18190 + }, + { + "epoch": 0.29779923095802996, + "grad_norm": 0.249586284160614, + "learning_rate": 9.927337403846622e-06, + "loss": 0.0046, + "step": 18200 + }, + { + "epoch": 0.2979628569091058, + "grad_norm": 0.0983256921172142, + "learning_rate": 9.927094652854919e-06, + "loss": 0.0037, + "step": 18210 + }, + { + "epoch": 0.2981264828601816, + "grad_norm": 0.18704041838645935, + "learning_rate": 9.926851500026138e-06, + "loss": 0.0034, + "step": 18220 + }, + { + "epoch": 0.29829010881125745, + "grad_norm": 0.28302499651908875, + "learning_rate": 9.926607945380111e-06, + "loss": 0.0053, + "step": 18230 + }, + { + "epoch": 0.29845373476233333, + "grad_norm": 0.1998574584722519, + "learning_rate": 9.926363988936703e-06, + "loss": 0.0046, + "step": 18240 + }, + { + "epoch": 0.29861736071340916, + "grad_norm": 0.08697161078453064, + "learning_rate": 9.926119630715808e-06, + "loss": 0.0028, + "step": 18250 + }, + { + "epoch": 0.298780986664485, + "grad_norm": 0.11392093449831009, + "learning_rate": 9.925874870737356e-06, + "loss": 0.0045, + "step": 18260 + }, + { + "epoch": 0.2989446126155608, + "grad_norm": 0.243545800447464, + "learning_rate": 9.92562970902131e-06, + "loss": 0.0026, + "step": 18270 + }, + { + "epoch": 0.29910823856663665, + "grad_norm": 0.19214607775211334, + "learning_rate": 9.925384145587662e-06, + "loss": 0.0037, + "step": 18280 + }, + { + "epoch": 0.2992718645177125, + "grad_norm": 0.2569046914577484, + "learning_rate": 9.92513818045644e-06, + "loss": 0.0045, + "step": 18290 + }, + { + "epoch": 0.29943549046878837, + "grad_norm": 0.2964117228984833, + "learning_rate": 9.924891813647707e-06, + "loss": 0.0046, + "step": 18300 + }, + { + "epoch": 0.2995991164198642, + "grad_norm": 0.06764807552099228, + "learning_rate": 9.924645045181552e-06, + "loss": 0.0045, + "step": 18310 + }, + { + "epoch": 0.29976274237094, + "grad_norm": 0.565817654132843, + "learning_rate": 9.924397875078103e-06, + "loss": 0.0055, + "step": 18320 + }, + { + "epoch": 0.29992636832201586, + "grad_norm": 0.33004099130630493, + "learning_rate": 9.924150303357517e-06, + "loss": 0.0046, + "step": 18330 + }, + { + "epoch": 0.3000899942730917, + "grad_norm": 0.12197458744049072, + "learning_rate": 9.923902330039986e-06, + "loss": 0.0059, + "step": 18340 + }, + { + "epoch": 0.30025362022416757, + "grad_norm": 0.3221333920955658, + "learning_rate": 9.923653955145733e-06, + "loss": 0.0036, + "step": 18350 + }, + { + "epoch": 0.3004172461752434, + "grad_norm": 0.15347731113433838, + "learning_rate": 9.923405178695016e-06, + "loss": 0.0052, + "step": 18360 + }, + { + "epoch": 0.30058087212631923, + "grad_norm": 0.08454122394323349, + "learning_rate": 9.923156000708123e-06, + "loss": 0.0044, + "step": 18370 + }, + { + "epoch": 0.30074449807739506, + "grad_norm": 0.11487319320440292, + "learning_rate": 9.922906421205376e-06, + "loss": 0.004, + "step": 18380 + }, + { + "epoch": 0.3009081240284709, + "grad_norm": 0.18744653463363647, + "learning_rate": 9.922656440207133e-06, + "loss": 0.0041, + "step": 18390 + }, + { + "epoch": 0.3010717499795468, + "grad_norm": 0.18605977296829224, + "learning_rate": 9.922406057733776e-06, + "loss": 0.0032, + "step": 18400 + }, + { + "epoch": 0.3012353759306226, + "grad_norm": 0.0707220807671547, + "learning_rate": 9.92215527380573e-06, + "loss": 0.0035, + "step": 18410 + }, + { + "epoch": 0.30139900188169844, + "grad_norm": 0.05524848401546478, + "learning_rate": 9.921904088443447e-06, + "loss": 0.0045, + "step": 18420 + }, + { + "epoch": 0.30156262783277427, + "grad_norm": 0.053524669259786606, + "learning_rate": 9.92165250166741e-06, + "loss": 0.0074, + "step": 18430 + }, + { + "epoch": 0.3017262537838501, + "grad_norm": 0.08276185393333435, + "learning_rate": 9.92140051349814e-06, + "loss": 0.005, + "step": 18440 + }, + { + "epoch": 0.301889879734926, + "grad_norm": 0.09610067307949066, + "learning_rate": 9.921148123956191e-06, + "loss": 0.0036, + "step": 18450 + }, + { + "epoch": 0.3020535056860018, + "grad_norm": 0.20032677054405212, + "learning_rate": 9.920895333062142e-06, + "loss": 0.0043, + "step": 18460 + }, + { + "epoch": 0.30221713163707764, + "grad_norm": 0.14135132730007172, + "learning_rate": 9.920642140836613e-06, + "loss": 0.0054, + "step": 18470 + }, + { + "epoch": 0.30238075758815347, + "grad_norm": 0.11000286042690277, + "learning_rate": 9.920388547300252e-06, + "loss": 0.0035, + "step": 18480 + }, + { + "epoch": 0.3025443835392293, + "grad_norm": 0.3235343098640442, + "learning_rate": 9.920134552473741e-06, + "loss": 0.0052, + "step": 18490 + }, + { + "epoch": 0.3027080094903052, + "grad_norm": 0.12839002907276154, + "learning_rate": 9.919880156377796e-06, + "loss": 0.0069, + "step": 18500 + }, + { + "epoch": 0.302871635441381, + "grad_norm": 0.125684916973114, + "learning_rate": 9.919625359033166e-06, + "loss": 0.0038, + "step": 18510 + }, + { + "epoch": 0.30303526139245685, + "grad_norm": 0.2214241772890091, + "learning_rate": 9.919370160460629e-06, + "loss": 0.0056, + "step": 18520 + }, + { + "epoch": 0.3031988873435327, + "grad_norm": 0.1472001075744629, + "learning_rate": 9.919114560680997e-06, + "loss": 0.0032, + "step": 18530 + }, + { + "epoch": 0.3033625132946085, + "grad_norm": 0.24738214910030365, + "learning_rate": 9.91885855971512e-06, + "loss": 0.0036, + "step": 18540 + }, + { + "epoch": 0.3035261392456844, + "grad_norm": 0.07262137532234192, + "learning_rate": 9.918602157583874e-06, + "loss": 0.0056, + "step": 18550 + }, + { + "epoch": 0.3036897651967602, + "grad_norm": 0.2765899896621704, + "learning_rate": 9.918345354308169e-06, + "loss": 0.0043, + "step": 18560 + }, + { + "epoch": 0.30385339114783605, + "grad_norm": 0.1799459308385849, + "learning_rate": 9.918088149908951e-06, + "loss": 0.0068, + "step": 18570 + }, + { + "epoch": 0.3040170170989119, + "grad_norm": 0.07286353409290314, + "learning_rate": 9.917830544407197e-06, + "loss": 0.0036, + "step": 18580 + }, + { + "epoch": 0.3041806430499877, + "grad_norm": 0.1088128387928009, + "learning_rate": 9.917572537823915e-06, + "loss": 0.003, + "step": 18590 + }, + { + "epoch": 0.3043442690010636, + "grad_norm": 0.08138229697942734, + "learning_rate": 9.917314130180149e-06, + "loss": 0.0044, + "step": 18600 + }, + { + "epoch": 0.3045078949521394, + "grad_norm": 0.29889506101608276, + "learning_rate": 9.917055321496972e-06, + "loss": 0.0044, + "step": 18610 + }, + { + "epoch": 0.30467152090321525, + "grad_norm": 0.1717907041311264, + "learning_rate": 9.916796111795491e-06, + "loss": 0.0051, + "step": 18620 + }, + { + "epoch": 0.3048351468542911, + "grad_norm": 0.1286926567554474, + "learning_rate": 9.916536501096849e-06, + "loss": 0.0058, + "step": 18630 + }, + { + "epoch": 0.3049987728053669, + "grad_norm": 0.1292576640844345, + "learning_rate": 9.916276489422218e-06, + "loss": 0.0063, + "step": 18640 + }, + { + "epoch": 0.3051623987564428, + "grad_norm": 0.031184492632746696, + "learning_rate": 9.916016076792802e-06, + "loss": 0.0045, + "step": 18650 + }, + { + "epoch": 0.30532602470751863, + "grad_norm": 0.04804946482181549, + "learning_rate": 9.91575526322984e-06, + "loss": 0.0057, + "step": 18660 + }, + { + "epoch": 0.30548965065859446, + "grad_norm": 0.17744258046150208, + "learning_rate": 9.915494048754605e-06, + "loss": 0.0036, + "step": 18670 + }, + { + "epoch": 0.3056532766096703, + "grad_norm": 0.0896824449300766, + "learning_rate": 9.915232433388397e-06, + "loss": 0.0027, + "step": 18680 + }, + { + "epoch": 0.3058169025607461, + "grad_norm": 0.15027998387813568, + "learning_rate": 9.914970417152558e-06, + "loss": 0.0044, + "step": 18690 + }, + { + "epoch": 0.30598052851182195, + "grad_norm": 0.05921371653676033, + "learning_rate": 9.914708000068452e-06, + "loss": 0.0068, + "step": 18700 + }, + { + "epoch": 0.30614415446289783, + "grad_norm": 0.09498249739408493, + "learning_rate": 9.914445182157484e-06, + "loss": 0.0038, + "step": 18710 + }, + { + "epoch": 0.30630778041397366, + "grad_norm": 0.13720545172691345, + "learning_rate": 9.914181963441087e-06, + "loss": 0.0036, + "step": 18720 + }, + { + "epoch": 0.3064714063650495, + "grad_norm": 0.037773698568344116, + "learning_rate": 9.913918343940728e-06, + "loss": 0.0061, + "step": 18730 + }, + { + "epoch": 0.3066350323161253, + "grad_norm": 0.037387363612651825, + "learning_rate": 9.913654323677907e-06, + "loss": 0.0038, + "step": 18740 + }, + { + "epoch": 0.30679865826720115, + "grad_norm": 0.1995997279882431, + "learning_rate": 9.913389902674158e-06, + "loss": 0.004, + "step": 18750 + }, + { + "epoch": 0.30696228421827704, + "grad_norm": 0.07831252366304398, + "learning_rate": 9.913125080951046e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 0.30712591016935287, + "grad_norm": 0.12026140838861465, + "learning_rate": 9.912859858530167e-06, + "loss": 0.0036, + "step": 18770 + }, + { + "epoch": 0.3072895361204287, + "grad_norm": 0.18794669210910797, + "learning_rate": 9.912594235433152e-06, + "loss": 0.0042, + "step": 18780 + }, + { + "epoch": 0.3074531620715045, + "grad_norm": 0.3090685307979584, + "learning_rate": 9.912328211681667e-06, + "loss": 0.0051, + "step": 18790 + }, + { + "epoch": 0.30761678802258036, + "grad_norm": 0.195156529545784, + "learning_rate": 9.912061787297405e-06, + "loss": 0.0026, + "step": 18800 + }, + { + "epoch": 0.30778041397365624, + "grad_norm": 0.1345290243625641, + "learning_rate": 9.911794962302098e-06, + "loss": 0.007, + "step": 18810 + }, + { + "epoch": 0.30794403992473207, + "grad_norm": 0.1819356381893158, + "learning_rate": 9.911527736717503e-06, + "loss": 0.0027, + "step": 18820 + }, + { + "epoch": 0.3081076658758079, + "grad_norm": 0.10387175530195236, + "learning_rate": 9.911260110565416e-06, + "loss": 0.0039, + "step": 18830 + }, + { + "epoch": 0.30827129182688373, + "grad_norm": 0.09819977730512619, + "learning_rate": 9.910992083867665e-06, + "loss": 0.0037, + "step": 18840 + }, + { + "epoch": 0.30843491777795956, + "grad_norm": 0.091976098716259, + "learning_rate": 9.910723656646108e-06, + "loss": 0.0038, + "step": 18850 + }, + { + "epoch": 0.30859854372903545, + "grad_norm": 0.24055665731430054, + "learning_rate": 9.910454828922638e-06, + "loss": 0.0037, + "step": 18860 + }, + { + "epoch": 0.3087621696801113, + "grad_norm": 0.1224713996052742, + "learning_rate": 9.910185600719179e-06, + "loss": 0.006, + "step": 18870 + }, + { + "epoch": 0.3089257956311871, + "grad_norm": 0.002193062799051404, + "learning_rate": 9.909915972057688e-06, + "loss": 0.0019, + "step": 18880 + }, + { + "epoch": 0.30908942158226294, + "grad_norm": 0.41805511713027954, + "learning_rate": 9.909645942960156e-06, + "loss": 0.0063, + "step": 18890 + }, + { + "epoch": 0.30925304753333877, + "grad_norm": 0.12885433435440063, + "learning_rate": 9.909375513448603e-06, + "loss": 0.0059, + "step": 18900 + }, + { + "epoch": 0.30941667348441465, + "grad_norm": 0.12093693763017654, + "learning_rate": 9.909104683545088e-06, + "loss": 0.0044, + "step": 18910 + }, + { + "epoch": 0.3095802994354905, + "grad_norm": 0.15998868644237518, + "learning_rate": 9.908833453271695e-06, + "loss": 0.003, + "step": 18920 + }, + { + "epoch": 0.3097439253865663, + "grad_norm": 0.06271903216838837, + "learning_rate": 9.90856182265055e-06, + "loss": 0.0028, + "step": 18930 + }, + { + "epoch": 0.30990755133764214, + "grad_norm": 0.029513388872146606, + "learning_rate": 9.908289791703801e-06, + "loss": 0.0046, + "step": 18940 + }, + { + "epoch": 0.31007117728871797, + "grad_norm": 0.1227802038192749, + "learning_rate": 9.908017360453636e-06, + "loss": 0.0042, + "step": 18950 + }, + { + "epoch": 0.31023480323979385, + "grad_norm": 0.28032901883125305, + "learning_rate": 9.907744528922274e-06, + "loss": 0.0055, + "step": 18960 + }, + { + "epoch": 0.3103984291908697, + "grad_norm": 0.1124168261885643, + "learning_rate": 9.907471297131967e-06, + "loss": 0.0041, + "step": 18970 + }, + { + "epoch": 0.3105620551419455, + "grad_norm": 0.07405764609575272, + "learning_rate": 9.907197665104997e-06, + "loss": 0.0045, + "step": 18980 + }, + { + "epoch": 0.31072568109302134, + "grad_norm": 0.08645839989185333, + "learning_rate": 9.906923632863682e-06, + "loss": 0.005, + "step": 18990 + }, + { + "epoch": 0.3108893070440972, + "grad_norm": 0.09636373817920685, + "learning_rate": 9.906649200430367e-06, + "loss": 0.0037, + "step": 19000 + }, + { + "epoch": 0.31105293299517306, + "grad_norm": 0.03481016680598259, + "learning_rate": 9.90637436782744e-06, + "loss": 0.004, + "step": 19010 + }, + { + "epoch": 0.3112165589462489, + "grad_norm": 0.3704533576965332, + "learning_rate": 9.906099135077312e-06, + "loss": 0.0051, + "step": 19020 + }, + { + "epoch": 0.3113801848973247, + "grad_norm": 0.36298590898513794, + "learning_rate": 9.90582350220243e-06, + "loss": 0.0069, + "step": 19030 + }, + { + "epoch": 0.31154381084840055, + "grad_norm": 0.17100781202316284, + "learning_rate": 9.905547469225274e-06, + "loss": 0.0037, + "step": 19040 + }, + { + "epoch": 0.3117074367994764, + "grad_norm": 0.19253915548324585, + "learning_rate": 9.905271036168357e-06, + "loss": 0.0044, + "step": 19050 + }, + { + "epoch": 0.31187106275055226, + "grad_norm": 0.1165117397904396, + "learning_rate": 9.904994203054224e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 0.3120346887016281, + "grad_norm": 0.11860129982233047, + "learning_rate": 9.90471696990545e-06, + "loss": 0.0055, + "step": 19070 + }, + { + "epoch": 0.3121983146527039, + "grad_norm": 0.09898581355810165, + "learning_rate": 9.90443933674465e-06, + "loss": 0.007, + "step": 19080 + }, + { + "epoch": 0.31236194060377975, + "grad_norm": 0.0958847850561142, + "learning_rate": 9.904161303594461e-06, + "loss": 0.0036, + "step": 19090 + }, + { + "epoch": 0.3125255665548556, + "grad_norm": 0.1846855878829956, + "learning_rate": 9.903882870477563e-06, + "loss": 0.0029, + "step": 19100 + }, + { + "epoch": 0.31268919250593147, + "grad_norm": 0.4415309429168701, + "learning_rate": 9.903604037416664e-06, + "loss": 0.0037, + "step": 19110 + }, + { + "epoch": 0.3128528184570073, + "grad_norm": 0.19907858967781067, + "learning_rate": 9.903324804434503e-06, + "loss": 0.0052, + "step": 19120 + }, + { + "epoch": 0.3130164444080831, + "grad_norm": 0.22549308836460114, + "learning_rate": 9.903045171553851e-06, + "loss": 0.0041, + "step": 19130 + }, + { + "epoch": 0.31318007035915896, + "grad_norm": 0.08515822887420654, + "learning_rate": 9.90276513879752e-06, + "loss": 0.003, + "step": 19140 + }, + { + "epoch": 0.3133436963102348, + "grad_norm": 0.08104472607374191, + "learning_rate": 9.902484706188341e-06, + "loss": 0.0043, + "step": 19150 + }, + { + "epoch": 0.3135073222613106, + "grad_norm": 0.33319786190986633, + "learning_rate": 9.90220387374919e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 0.3136709482123865, + "grad_norm": 0.31271547079086304, + "learning_rate": 9.901922641502972e-06, + "loss": 0.0051, + "step": 19170 + }, + { + "epoch": 0.31383457416346233, + "grad_norm": 0.14193718135356903, + "learning_rate": 9.90164100947262e-06, + "loss": 0.0046, + "step": 19180 + }, + { + "epoch": 0.31399820011453816, + "grad_norm": 0.332012414932251, + "learning_rate": 9.901358977681103e-06, + "loss": 0.0052, + "step": 19190 + }, + { + "epoch": 0.314161826065614, + "grad_norm": 0.17114713788032532, + "learning_rate": 9.901076546151425e-06, + "loss": 0.0048, + "step": 19200 + }, + { + "epoch": 0.3143254520166898, + "grad_norm": 0.1274709552526474, + "learning_rate": 9.900793714906618e-06, + "loss": 0.0042, + "step": 19210 + }, + { + "epoch": 0.3144890779677657, + "grad_norm": 0.28423044085502625, + "learning_rate": 9.900510483969749e-06, + "loss": 0.007, + "step": 19220 + }, + { + "epoch": 0.31465270391884154, + "grad_norm": 0.06696786731481552, + "learning_rate": 9.900226853363919e-06, + "loss": 0.0037, + "step": 19230 + }, + { + "epoch": 0.31481632986991737, + "grad_norm": 0.12529677152633667, + "learning_rate": 9.899942823112259e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 0.3149799558209932, + "grad_norm": 0.1957942396402359, + "learning_rate": 9.899658393237934e-06, + "loss": 0.0053, + "step": 19250 + }, + { + "epoch": 0.315143581772069, + "grad_norm": 0.10554284602403641, + "learning_rate": 9.899373563764138e-06, + "loss": 0.0043, + "step": 19260 + }, + { + "epoch": 0.3153072077231449, + "grad_norm": 0.11743362993001938, + "learning_rate": 9.899088334714106e-06, + "loss": 0.0039, + "step": 19270 + }, + { + "epoch": 0.31547083367422074, + "grad_norm": 0.09323113411664963, + "learning_rate": 9.898802706111095e-06, + "loss": 0.0044, + "step": 19280 + }, + { + "epoch": 0.31563445962529657, + "grad_norm": 0.23355872929096222, + "learning_rate": 9.898516677978404e-06, + "loss": 0.0036, + "step": 19290 + }, + { + "epoch": 0.3157980855763724, + "grad_norm": 0.055486563593149185, + "learning_rate": 9.89823025033936e-06, + "loss": 0.0046, + "step": 19300 + }, + { + "epoch": 0.31596171152744823, + "grad_norm": 0.2832466661930084, + "learning_rate": 9.89794342321732e-06, + "loss": 0.0061, + "step": 19310 + }, + { + "epoch": 0.3161253374785241, + "grad_norm": 0.15053504705429077, + "learning_rate": 9.897656196635678e-06, + "loss": 0.0028, + "step": 19320 + }, + { + "epoch": 0.31628896342959995, + "grad_norm": 0.19460567831993103, + "learning_rate": 9.897368570617862e-06, + "loss": 0.0035, + "step": 19330 + }, + { + "epoch": 0.3164525893806758, + "grad_norm": 0.13320541381835938, + "learning_rate": 9.897080545187328e-06, + "loss": 0.0044, + "step": 19340 + }, + { + "epoch": 0.3166162153317516, + "grad_norm": 0.1881943941116333, + "learning_rate": 9.896792120367564e-06, + "loss": 0.0048, + "step": 19350 + }, + { + "epoch": 0.31677984128282743, + "grad_norm": 0.11229316145181656, + "learning_rate": 9.896503296182096e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 0.3169434672339033, + "grad_norm": 0.07651843130588531, + "learning_rate": 9.896214072654478e-06, + "loss": 0.0045, + "step": 19370 + }, + { + "epoch": 0.31710709318497915, + "grad_norm": 0.46119266748428345, + "learning_rate": 9.8959244498083e-06, + "loss": 0.0064, + "step": 19380 + }, + { + "epoch": 0.317270719136055, + "grad_norm": 0.3152843117713928, + "learning_rate": 9.89563442766718e-06, + "loss": 0.0051, + "step": 19390 + }, + { + "epoch": 0.3174343450871308, + "grad_norm": 0.164754256606102, + "learning_rate": 9.895344006254773e-06, + "loss": 0.0042, + "step": 19400 + }, + { + "epoch": 0.31759797103820664, + "grad_norm": 0.23276254534721375, + "learning_rate": 9.895053185594762e-06, + "loss": 0.0032, + "step": 19410 + }, + { + "epoch": 0.3177615969892825, + "grad_norm": 0.31304922699928284, + "learning_rate": 9.894761965710871e-06, + "loss": 0.0049, + "step": 19420 + }, + { + "epoch": 0.31792522294035835, + "grad_norm": 0.06616653501987457, + "learning_rate": 9.894470346626846e-06, + "loss": 0.0069, + "step": 19430 + }, + { + "epoch": 0.3180888488914342, + "grad_norm": 0.15861405432224274, + "learning_rate": 9.894178328366473e-06, + "loss": 0.0048, + "step": 19440 + }, + { + "epoch": 0.31825247484251, + "grad_norm": 0.24078448116779327, + "learning_rate": 9.893885910953564e-06, + "loss": 0.0047, + "step": 19450 + }, + { + "epoch": 0.31841610079358584, + "grad_norm": 0.05956120043992996, + "learning_rate": 9.893593094411973e-06, + "loss": 0.0034, + "step": 19460 + }, + { + "epoch": 0.31857972674466173, + "grad_norm": 0.11779209226369858, + "learning_rate": 9.89329987876558e-06, + "loss": 0.0048, + "step": 19470 + }, + { + "epoch": 0.31874335269573756, + "grad_norm": 0.3354147970676422, + "learning_rate": 9.893006264038294e-06, + "loss": 0.0028, + "step": 19480 + }, + { + "epoch": 0.3189069786468134, + "grad_norm": 0.09096290171146393, + "learning_rate": 9.892712250254067e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 0.3190706045978892, + "grad_norm": 0.20709989964962006, + "learning_rate": 9.892417837436874e-06, + "loss": 0.005, + "step": 19500 + }, + { + "epoch": 0.31923423054896505, + "grad_norm": 0.11091521382331848, + "learning_rate": 9.892123025610728e-06, + "loss": 0.0048, + "step": 19510 + }, + { + "epoch": 0.31939785650004093, + "grad_norm": 0.31609076261520386, + "learning_rate": 9.891827814799672e-06, + "loss": 0.0028, + "step": 19520 + }, + { + "epoch": 0.31956148245111676, + "grad_norm": 0.10027654469013214, + "learning_rate": 9.891532205027783e-06, + "loss": 0.0042, + "step": 19530 + }, + { + "epoch": 0.3197251084021926, + "grad_norm": 0.36275723576545715, + "learning_rate": 9.891236196319172e-06, + "loss": 0.0037, + "step": 19540 + }, + { + "epoch": 0.3198887343532684, + "grad_norm": 0.17737677693367004, + "learning_rate": 9.890939788697975e-06, + "loss": 0.0031, + "step": 19550 + }, + { + "epoch": 0.32005236030434425, + "grad_norm": 0.15842317044734955, + "learning_rate": 9.890642982188372e-06, + "loss": 0.0026, + "step": 19560 + }, + { + "epoch": 0.3202159862554201, + "grad_norm": 0.04089176282286644, + "learning_rate": 9.890345776814565e-06, + "loss": 0.0036, + "step": 19570 + }, + { + "epoch": 0.32037961220649597, + "grad_norm": 0.2341189682483673, + "learning_rate": 9.890048172600795e-06, + "loss": 0.0058, + "step": 19580 + }, + { + "epoch": 0.3205432381575718, + "grad_norm": 0.12738503515720367, + "learning_rate": 9.889750169571332e-06, + "loss": 0.0041, + "step": 19590 + }, + { + "epoch": 0.3207068641086476, + "grad_norm": 0.03597637265920639, + "learning_rate": 9.889451767750484e-06, + "loss": 0.0048, + "step": 19600 + }, + { + "epoch": 0.32087049005972346, + "grad_norm": 0.10549870878458023, + "learning_rate": 9.889152967162586e-06, + "loss": 0.0051, + "step": 19610 + }, + { + "epoch": 0.3210341160107993, + "grad_norm": 0.10223033279180527, + "learning_rate": 9.888853767832003e-06, + "loss": 0.0043, + "step": 19620 + }, + { + "epoch": 0.32119774196187517, + "grad_norm": 0.10213784128427505, + "learning_rate": 9.888554169783143e-06, + "loss": 0.0057, + "step": 19630 + }, + { + "epoch": 0.321361367912951, + "grad_norm": 0.07770213484764099, + "learning_rate": 9.888254173040434e-06, + "loss": 0.0036, + "step": 19640 + }, + { + "epoch": 0.32152499386402683, + "grad_norm": 0.41759949922561646, + "learning_rate": 9.887953777628349e-06, + "loss": 0.006, + "step": 19650 + }, + { + "epoch": 0.32168861981510266, + "grad_norm": 0.22137853503227234, + "learning_rate": 9.887652983571383e-06, + "loss": 0.0042, + "step": 19660 + }, + { + "epoch": 0.3218522457661785, + "grad_norm": 0.18771515786647797, + "learning_rate": 9.887351790894069e-06, + "loss": 0.004, + "step": 19670 + }, + { + "epoch": 0.3220158717172544, + "grad_norm": 0.1392206996679306, + "learning_rate": 9.887050199620972e-06, + "loss": 0.006, + "step": 19680 + }, + { + "epoch": 0.3221794976683302, + "grad_norm": 0.23520411550998688, + "learning_rate": 9.886748209776687e-06, + "loss": 0.005, + "step": 19690 + }, + { + "epoch": 0.32234312361940604, + "grad_norm": 0.4393717646598816, + "learning_rate": 9.886445821385844e-06, + "loss": 0.0042, + "step": 19700 + }, + { + "epoch": 0.32250674957048187, + "grad_norm": 0.04335736110806465, + "learning_rate": 9.886143034473104e-06, + "loss": 0.0022, + "step": 19710 + }, + { + "epoch": 0.3226703755215577, + "grad_norm": 0.07217326760292053, + "learning_rate": 9.885839849063163e-06, + "loss": 0.0044, + "step": 19720 + }, + { + "epoch": 0.3228340014726336, + "grad_norm": 0.07146336883306503, + "learning_rate": 9.885536265180748e-06, + "loss": 0.003, + "step": 19730 + }, + { + "epoch": 0.3229976274237094, + "grad_norm": 0.08882381021976471, + "learning_rate": 9.885232282850616e-06, + "loss": 0.0051, + "step": 19740 + }, + { + "epoch": 0.32316125337478524, + "grad_norm": 0.2379351705312729, + "learning_rate": 9.884927902097561e-06, + "loss": 0.0052, + "step": 19750 + }, + { + "epoch": 0.32332487932586107, + "grad_norm": 0.142432302236557, + "learning_rate": 9.884623122946405e-06, + "loss": 0.0036, + "step": 19760 + }, + { + "epoch": 0.3234885052769369, + "grad_norm": 0.08389263600111008, + "learning_rate": 9.884317945422007e-06, + "loss": 0.0046, + "step": 19770 + }, + { + "epoch": 0.3236521312280128, + "grad_norm": 0.10304949432611465, + "learning_rate": 9.884012369549255e-06, + "loss": 0.0028, + "step": 19780 + }, + { + "epoch": 0.3238157571790886, + "grad_norm": 0.08805543184280396, + "learning_rate": 9.883706395353072e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 0.32397938313016444, + "grad_norm": 0.14350149035453796, + "learning_rate": 9.88340002285841e-06, + "loss": 0.0055, + "step": 19800 + }, + { + "epoch": 0.3241430090812403, + "grad_norm": 0.2578975260257721, + "learning_rate": 9.883093252090257e-06, + "loss": 0.0044, + "step": 19810 + }, + { + "epoch": 0.3243066350323161, + "grad_norm": 0.08603624999523163, + "learning_rate": 9.882786083073632e-06, + "loss": 0.0037, + "step": 19820 + }, + { + "epoch": 0.324470260983392, + "grad_norm": 0.058858297765254974, + "learning_rate": 9.882478515833587e-06, + "loss": 0.0039, + "step": 19830 + }, + { + "epoch": 0.3246338869344678, + "grad_norm": 0.20485638082027435, + "learning_rate": 9.882170550395205e-06, + "loss": 0.0038, + "step": 19840 + }, + { + "epoch": 0.32479751288554365, + "grad_norm": 0.11642660200595856, + "learning_rate": 9.881862186783605e-06, + "loss": 0.0036, + "step": 19850 + }, + { + "epoch": 0.3249611388366195, + "grad_norm": 0.12272995710372925, + "learning_rate": 9.881553425023933e-06, + "loss": 0.0051, + "step": 19860 + }, + { + "epoch": 0.3251247647876953, + "grad_norm": 0.4291192293167114, + "learning_rate": 9.881244265141374e-06, + "loss": 0.0045, + "step": 19870 + }, + { + "epoch": 0.3252883907387712, + "grad_norm": 0.10232923924922943, + "learning_rate": 9.880934707161138e-06, + "loss": 0.0072, + "step": 19880 + }, + { + "epoch": 0.325452016689847, + "grad_norm": 0.11810983717441559, + "learning_rate": 9.880624751108476e-06, + "loss": 0.0044, + "step": 19890 + }, + { + "epoch": 0.32561564264092285, + "grad_norm": 0.2597808539867401, + "learning_rate": 9.880314397008663e-06, + "loss": 0.0057, + "step": 19900 + }, + { + "epoch": 0.3257792685919987, + "grad_norm": 0.2902827262878418, + "learning_rate": 9.880003644887013e-06, + "loss": 0.0061, + "step": 19910 + }, + { + "epoch": 0.3259428945430745, + "grad_norm": 0.2732642590999603, + "learning_rate": 9.879692494768868e-06, + "loss": 0.0039, + "step": 19920 + }, + { + "epoch": 0.3261065204941504, + "grad_norm": 0.21653032302856445, + "learning_rate": 9.879380946679605e-06, + "loss": 0.0037, + "step": 19930 + }, + { + "epoch": 0.32627014644522623, + "grad_norm": 0.14108705520629883, + "learning_rate": 9.879069000644635e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 0.32643377239630206, + "grad_norm": 0.12505283951759338, + "learning_rate": 9.878756656689395e-06, + "loss": 0.0051, + "step": 19950 + }, + { + "epoch": 0.3265973983473779, + "grad_norm": 0.13990822434425354, + "learning_rate": 9.878443914839362e-06, + "loss": 0.0029, + "step": 19960 + }, + { + "epoch": 0.3267610242984537, + "grad_norm": 0.1480061113834381, + "learning_rate": 9.878130775120041e-06, + "loss": 0.0057, + "step": 19970 + }, + { + "epoch": 0.3269246502495296, + "grad_norm": 0.1516495645046234, + "learning_rate": 9.877817237556972e-06, + "loss": 0.0031, + "step": 19980 + }, + { + "epoch": 0.32708827620060543, + "grad_norm": 0.1430971622467041, + "learning_rate": 9.877503302175724e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 0.32725190215168126, + "grad_norm": 0.17753866314888, + "learning_rate": 9.8771889690019e-06, + "loss": 0.0066, + "step": 20000 + }, + { + "epoch": 0.3274155281027571, + "grad_norm": 0.10308807343244553, + "learning_rate": 9.87687423806114e-06, + "loss": 0.0034, + "step": 20010 + }, + { + "epoch": 0.3275791540538329, + "grad_norm": 0.13336046040058136, + "learning_rate": 9.876559109379108e-06, + "loss": 0.0039, + "step": 20020 + }, + { + "epoch": 0.32774278000490875, + "grad_norm": 0.1490883231163025, + "learning_rate": 9.876243582981507e-06, + "loss": 0.0025, + "step": 20030 + }, + { + "epoch": 0.32790640595598464, + "grad_norm": 0.0983930230140686, + "learning_rate": 9.87592765889407e-06, + "loss": 0.0048, + "step": 20040 + }, + { + "epoch": 0.32807003190706047, + "grad_norm": 0.15341004729270935, + "learning_rate": 9.875611337142561e-06, + "loss": 0.0054, + "step": 20050 + }, + { + "epoch": 0.3282336578581363, + "grad_norm": 0.1860189288854599, + "learning_rate": 9.875294617752782e-06, + "loss": 0.0058, + "step": 20060 + }, + { + "epoch": 0.3283972838092121, + "grad_norm": 0.45263639092445374, + "learning_rate": 9.87497750075056e-06, + "loss": 0.0054, + "step": 20070 + }, + { + "epoch": 0.32856090976028796, + "grad_norm": 0.04703308641910553, + "learning_rate": 9.874659986161758e-06, + "loss": 0.0022, + "step": 20080 + }, + { + "epoch": 0.32872453571136384, + "grad_norm": 0.4157925248146057, + "learning_rate": 9.874342074012275e-06, + "loss": 0.0073, + "step": 20090 + }, + { + "epoch": 0.32888816166243967, + "grad_norm": 0.0724644586443901, + "learning_rate": 9.874023764328034e-06, + "loss": 0.0034, + "step": 20100 + }, + { + "epoch": 0.3290517876135155, + "grad_norm": 0.16821777820587158, + "learning_rate": 9.873705057134998e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 0.32921541356459133, + "grad_norm": 0.15424877405166626, + "learning_rate": 9.87338595245916e-06, + "loss": 0.0067, + "step": 20120 + }, + { + "epoch": 0.32937903951566716, + "grad_norm": 0.018444593995809555, + "learning_rate": 9.873066450326546e-06, + "loss": 0.0063, + "step": 20130 + }, + { + "epoch": 0.32954266546674305, + "grad_norm": 0.2621767520904541, + "learning_rate": 9.87274655076321e-06, + "loss": 0.0057, + "step": 20140 + }, + { + "epoch": 0.3297062914178189, + "grad_norm": 0.1759834885597229, + "learning_rate": 9.872426253795244e-06, + "loss": 0.0041, + "step": 20150 + }, + { + "epoch": 0.3298699173688947, + "grad_norm": 0.08095880597829819, + "learning_rate": 9.87210555944877e-06, + "loss": 0.0057, + "step": 20160 + }, + { + "epoch": 0.33003354331997053, + "grad_norm": 0.3416996896266937, + "learning_rate": 9.871784467749944e-06, + "loss": 0.0086, + "step": 20170 + }, + { + "epoch": 0.33019716927104636, + "grad_norm": 0.20829014480113983, + "learning_rate": 9.87146297872495e-06, + "loss": 0.0036, + "step": 20180 + }, + { + "epoch": 0.33036079522212225, + "grad_norm": 0.1080484390258789, + "learning_rate": 9.871141092400012e-06, + "loss": 0.0042, + "step": 20190 + }, + { + "epoch": 0.3305244211731981, + "grad_norm": 0.4624972939491272, + "learning_rate": 9.87081880880138e-06, + "loss": 0.0043, + "step": 20200 + }, + { + "epoch": 0.3306880471242739, + "grad_norm": 0.1670476645231247, + "learning_rate": 9.870496127955335e-06, + "loss": 0.0059, + "step": 20210 + }, + { + "epoch": 0.33085167307534974, + "grad_norm": 0.29990607500076294, + "learning_rate": 9.8701730498882e-06, + "loss": 0.0069, + "step": 20220 + }, + { + "epoch": 0.33101529902642557, + "grad_norm": 0.14530187845230103, + "learning_rate": 9.869849574626317e-06, + "loss": 0.0036, + "step": 20230 + }, + { + "epoch": 0.33117892497750145, + "grad_norm": 0.11519566178321838, + "learning_rate": 9.869525702196074e-06, + "loss": 0.0024, + "step": 20240 + }, + { + "epoch": 0.3313425509285773, + "grad_norm": 0.37026020884513855, + "learning_rate": 9.86920143262388e-06, + "loss": 0.0052, + "step": 20250 + }, + { + "epoch": 0.3315061768796531, + "grad_norm": 0.1430608183145523, + "learning_rate": 9.868876765936186e-06, + "loss": 0.0045, + "step": 20260 + }, + { + "epoch": 0.33166980283072894, + "grad_norm": 0.12555855512619019, + "learning_rate": 9.868551702159466e-06, + "loss": 0.0029, + "step": 20270 + }, + { + "epoch": 0.3318334287818048, + "grad_norm": 0.17927005887031555, + "learning_rate": 9.868226241320237e-06, + "loss": 0.0036, + "step": 20280 + }, + { + "epoch": 0.33199705473288066, + "grad_norm": 0.1316860020160675, + "learning_rate": 9.867900383445035e-06, + "loss": 0.0033, + "step": 20290 + }, + { + "epoch": 0.3321606806839565, + "grad_norm": 0.07333186268806458, + "learning_rate": 9.867574128560442e-06, + "loss": 0.0056, + "step": 20300 + }, + { + "epoch": 0.3323243066350323, + "grad_norm": 0.18035760521888733, + "learning_rate": 9.867247476693064e-06, + "loss": 0.004, + "step": 20310 + }, + { + "epoch": 0.33248793258610815, + "grad_norm": 0.1360657960176468, + "learning_rate": 9.86692042786954e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 0.332651558537184, + "grad_norm": 0.14488337934017181, + "learning_rate": 9.866592982116547e-06, + "loss": 0.0055, + "step": 20330 + }, + { + "epoch": 0.33281518448825986, + "grad_norm": 0.14830628037452698, + "learning_rate": 9.866265139460787e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 0.3329788104393357, + "grad_norm": 0.25210511684417725, + "learning_rate": 9.865936899928998e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 0.3331424363904115, + "grad_norm": 0.06869592517614365, + "learning_rate": 9.86560826354795e-06, + "loss": 0.0045, + "step": 20360 + }, + { + "epoch": 0.33330606234148735, + "grad_norm": 0.11608990281820297, + "learning_rate": 9.865279230344448e-06, + "loss": 0.004, + "step": 20370 + }, + { + "epoch": 0.3334696882925632, + "grad_norm": 0.2703023850917816, + "learning_rate": 9.864949800345325e-06, + "loss": 0.0042, + "step": 20380 + }, + { + "epoch": 0.33363331424363907, + "grad_norm": 0.10395345836877823, + "learning_rate": 9.864619973577448e-06, + "loss": 0.0046, + "step": 20390 + }, + { + "epoch": 0.3337969401947149, + "grad_norm": 0.16700316965579987, + "learning_rate": 9.864289750067715e-06, + "loss": 0.0039, + "step": 20400 + }, + { + "epoch": 0.3339605661457907, + "grad_norm": 0.08299065381288528, + "learning_rate": 9.863959129843061e-06, + "loss": 0.0041, + "step": 20410 + }, + { + "epoch": 0.33412419209686656, + "grad_norm": 0.1187463104724884, + "learning_rate": 9.86362811293045e-06, + "loss": 0.0061, + "step": 20420 + }, + { + "epoch": 0.3342878180479424, + "grad_norm": 0.16819888353347778, + "learning_rate": 9.863296699356876e-06, + "loss": 0.0049, + "step": 20430 + }, + { + "epoch": 0.33445144399901827, + "grad_norm": 0.11607053130865097, + "learning_rate": 9.862964889149372e-06, + "loss": 0.0059, + "step": 20440 + }, + { + "epoch": 0.3346150699500941, + "grad_norm": 0.3275061845779419, + "learning_rate": 9.862632682334994e-06, + "loss": 0.0038, + "step": 20450 + }, + { + "epoch": 0.33477869590116993, + "grad_norm": 0.10634026676416397, + "learning_rate": 9.86230007894084e-06, + "loss": 0.0025, + "step": 20460 + }, + { + "epoch": 0.33494232185224576, + "grad_norm": 0.10460998117923737, + "learning_rate": 9.861967078994035e-06, + "loss": 0.0029, + "step": 20470 + }, + { + "epoch": 0.3351059478033216, + "grad_norm": 0.209858238697052, + "learning_rate": 9.861633682521736e-06, + "loss": 0.0104, + "step": 20480 + }, + { + "epoch": 0.3352695737543974, + "grad_norm": 0.18016286194324493, + "learning_rate": 9.861299889551135e-06, + "loss": 0.0041, + "step": 20490 + }, + { + "epoch": 0.3354331997054733, + "grad_norm": 0.38085615634918213, + "learning_rate": 9.860965700109453e-06, + "loss": 0.0053, + "step": 20500 + }, + { + "epoch": 0.33559682565654914, + "grad_norm": 0.2172110229730606, + "learning_rate": 9.860631114223948e-06, + "loss": 0.0048, + "step": 20510 + }, + { + "epoch": 0.33576045160762497, + "grad_norm": 0.06819513440132141, + "learning_rate": 9.860296131921909e-06, + "loss": 0.0052, + "step": 20520 + }, + { + "epoch": 0.3359240775587008, + "grad_norm": 0.1689801961183548, + "learning_rate": 9.859960753230651e-06, + "loss": 0.0076, + "step": 20530 + }, + { + "epoch": 0.3360877035097766, + "grad_norm": 0.19185730814933777, + "learning_rate": 9.859624978177529e-06, + "loss": 0.0056, + "step": 20540 + }, + { + "epoch": 0.3362513294608525, + "grad_norm": 0.09345399588346481, + "learning_rate": 9.859288806789929e-06, + "loss": 0.0038, + "step": 20550 + }, + { + "epoch": 0.33641495541192834, + "grad_norm": 0.11281021684408188, + "learning_rate": 9.858952239095265e-06, + "loss": 0.0032, + "step": 20560 + }, + { + "epoch": 0.33657858136300417, + "grad_norm": 0.061719730496406555, + "learning_rate": 9.858615275120989e-06, + "loss": 0.0031, + "step": 20570 + }, + { + "epoch": 0.33674220731408, + "grad_norm": 0.2039213925600052, + "learning_rate": 9.858277914894581e-06, + "loss": 0.0052, + "step": 20580 + }, + { + "epoch": 0.33690583326515583, + "grad_norm": 0.11667054146528244, + "learning_rate": 9.857940158443558e-06, + "loss": 0.0038, + "step": 20590 + }, + { + "epoch": 0.3370694592162317, + "grad_norm": 0.02627541869878769, + "learning_rate": 9.85760200579546e-06, + "loss": 0.003, + "step": 20600 + }, + { + "epoch": 0.33723308516730754, + "grad_norm": 0.21701110899448395, + "learning_rate": 9.857263456977872e-06, + "loss": 0.0031, + "step": 20610 + }, + { + "epoch": 0.3373967111183834, + "grad_norm": 0.06111739203333855, + "learning_rate": 9.8569245120184e-06, + "loss": 0.0025, + "step": 20620 + }, + { + "epoch": 0.3375603370694592, + "grad_norm": 0.06633685529232025, + "learning_rate": 9.856585170944693e-06, + "loss": 0.0026, + "step": 20630 + }, + { + "epoch": 0.33772396302053503, + "grad_norm": 0.1305380016565323, + "learning_rate": 9.856245433784419e-06, + "loss": 0.0033, + "step": 20640 + }, + { + "epoch": 0.3378875889716109, + "grad_norm": 0.09376155585050583, + "learning_rate": 9.855905300565293e-06, + "loss": 0.005, + "step": 20650 + }, + { + "epoch": 0.33805121492268675, + "grad_norm": 0.11227598041296005, + "learning_rate": 9.85556477131505e-06, + "loss": 0.0027, + "step": 20660 + }, + { + "epoch": 0.3382148408737626, + "grad_norm": 0.1978847086429596, + "learning_rate": 9.855223846061466e-06, + "loss": 0.0032, + "step": 20670 + }, + { + "epoch": 0.3383784668248384, + "grad_norm": 0.11767446249723434, + "learning_rate": 9.854882524832343e-06, + "loss": 0.005, + "step": 20680 + }, + { + "epoch": 0.33854209277591424, + "grad_norm": 0.10278233140707016, + "learning_rate": 9.854540807655519e-06, + "loss": 0.003, + "step": 20690 + }, + { + "epoch": 0.3387057187269901, + "grad_norm": 0.16309624910354614, + "learning_rate": 9.854198694558862e-06, + "loss": 0.006, + "step": 20700 + }, + { + "epoch": 0.33886934467806595, + "grad_norm": 0.1410336196422577, + "learning_rate": 9.853856185570276e-06, + "loss": 0.0035, + "step": 20710 + }, + { + "epoch": 0.3390329706291418, + "grad_norm": 0.06369351595640182, + "learning_rate": 9.853513280717695e-06, + "loss": 0.0029, + "step": 20720 + }, + { + "epoch": 0.3391965965802176, + "grad_norm": 0.20060832798480988, + "learning_rate": 9.853169980029083e-06, + "loss": 0.006, + "step": 20730 + }, + { + "epoch": 0.33936022253129344, + "grad_norm": 0.0910542905330658, + "learning_rate": 9.852826283532439e-06, + "loss": 0.0038, + "step": 20740 + }, + { + "epoch": 0.33952384848236933, + "grad_norm": 0.1707943081855774, + "learning_rate": 9.852482191255794e-06, + "loss": 0.0035, + "step": 20750 + }, + { + "epoch": 0.33968747443344516, + "grad_norm": 0.0393177792429924, + "learning_rate": 9.852137703227212e-06, + "loss": 0.0031, + "step": 20760 + }, + { + "epoch": 0.339851100384521, + "grad_norm": 0.11621933430433273, + "learning_rate": 9.851792819474785e-06, + "loss": 0.0032, + "step": 20770 + }, + { + "epoch": 0.3400147263355968, + "grad_norm": 0.11890273541212082, + "learning_rate": 9.851447540026645e-06, + "loss": 0.0037, + "step": 20780 + }, + { + "epoch": 0.34017835228667265, + "grad_norm": 0.1648484319448471, + "learning_rate": 9.851101864910949e-06, + "loss": 0.0039, + "step": 20790 + }, + { + "epoch": 0.34034197823774853, + "grad_norm": 0.23658430576324463, + "learning_rate": 9.850755794155891e-06, + "loss": 0.0055, + "step": 20800 + }, + { + "epoch": 0.34050560418882436, + "grad_norm": 0.07108192890882492, + "learning_rate": 9.850409327789692e-06, + "loss": 0.0029, + "step": 20810 + }, + { + "epoch": 0.3406692301399002, + "grad_norm": 0.10644032061100006, + "learning_rate": 9.850062465840611e-06, + "loss": 0.0037, + "step": 20820 + }, + { + "epoch": 0.340832856090976, + "grad_norm": 0.013563537038862705, + "learning_rate": 9.849715208336938e-06, + "loss": 0.0045, + "step": 20830 + }, + { + "epoch": 0.34099648204205185, + "grad_norm": 0.16491533815860748, + "learning_rate": 9.849367555306993e-06, + "loss": 0.0043, + "step": 20840 + }, + { + "epoch": 0.34116010799312774, + "grad_norm": 0.18684102594852448, + "learning_rate": 9.849019506779127e-06, + "loss": 0.006, + "step": 20850 + }, + { + "epoch": 0.34132373394420357, + "grad_norm": 0.09518221020698547, + "learning_rate": 9.84867106278173e-06, + "loss": 0.0061, + "step": 20860 + }, + { + "epoch": 0.3414873598952794, + "grad_norm": 0.053158923983573914, + "learning_rate": 9.848322223343217e-06, + "loss": 0.0038, + "step": 20870 + }, + { + "epoch": 0.3416509858463552, + "grad_norm": 0.11258822679519653, + "learning_rate": 9.847972988492038e-06, + "loss": 0.0026, + "step": 20880 + }, + { + "epoch": 0.34181461179743106, + "grad_norm": 0.18029457330703735, + "learning_rate": 9.847623358256678e-06, + "loss": 0.0042, + "step": 20890 + }, + { + "epoch": 0.3419782377485069, + "grad_norm": 0.11698746681213379, + "learning_rate": 9.847273332665648e-06, + "loss": 0.0105, + "step": 20900 + }, + { + "epoch": 0.34214186369958277, + "grad_norm": 0.3530029058456421, + "learning_rate": 9.846922911747498e-06, + "loss": 0.0072, + "step": 20910 + }, + { + "epoch": 0.3423054896506586, + "grad_norm": 0.2188953161239624, + "learning_rate": 9.846572095530807e-06, + "loss": 0.0057, + "step": 20920 + }, + { + "epoch": 0.34246911560173443, + "grad_norm": 0.18808463215827942, + "learning_rate": 9.846220884044183e-06, + "loss": 0.005, + "step": 20930 + }, + { + "epoch": 0.34263274155281026, + "grad_norm": 0.11173252761363983, + "learning_rate": 9.845869277316273e-06, + "loss": 0.0042, + "step": 20940 + }, + { + "epoch": 0.3427963675038861, + "grad_norm": 0.10642429441213608, + "learning_rate": 9.845517275375752e-06, + "loss": 0.0028, + "step": 20950 + }, + { + "epoch": 0.342959993454962, + "grad_norm": 0.08499259501695633, + "learning_rate": 9.845164878251327e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 0.3431236194060378, + "grad_norm": 0.06196568161249161, + "learning_rate": 9.84481208597174e-06, + "loss": 0.0091, + "step": 20970 + }, + { + "epoch": 0.34328724535711364, + "grad_norm": 0.07904001325368881, + "learning_rate": 9.844458898565763e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 0.34345087130818946, + "grad_norm": 0.06873547285795212, + "learning_rate": 9.8441053160622e-06, + "loss": 0.0032, + "step": 20990 + }, + { + "epoch": 0.3436144972592653, + "grad_norm": 0.12111184000968933, + "learning_rate": 9.84375133848989e-06, + "loss": 0.0029, + "step": 21000 + }, + { + "epoch": 0.3437781232103412, + "grad_norm": 0.32035741209983826, + "learning_rate": 9.843396965877698e-06, + "loss": 0.0051, + "step": 21010 + }, + { + "epoch": 0.343941749161417, + "grad_norm": 0.18931397795677185, + "learning_rate": 9.843042198254531e-06, + "loss": 0.003, + "step": 21020 + }, + { + "epoch": 0.34410537511249284, + "grad_norm": 0.23826611042022705, + "learning_rate": 9.842687035649319e-06, + "loss": 0.0034, + "step": 21030 + }, + { + "epoch": 0.34426900106356867, + "grad_norm": 0.08468547463417053, + "learning_rate": 9.842331478091027e-06, + "loss": 0.0042, + "step": 21040 + }, + { + "epoch": 0.3444326270146445, + "grad_norm": 0.07451336830854416, + "learning_rate": 9.841975525608655e-06, + "loss": 0.0029, + "step": 21050 + }, + { + "epoch": 0.3445962529657204, + "grad_norm": 0.01555787306278944, + "learning_rate": 9.841619178231234e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 0.3447598789167962, + "grad_norm": 0.2597961127758026, + "learning_rate": 9.841262435987825e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 0.34492350486787204, + "grad_norm": 0.15131132304668427, + "learning_rate": 9.840905298907523e-06, + "loss": 0.0028, + "step": 21080 + }, + { + "epoch": 0.3450871308189479, + "grad_norm": 0.0743577852845192, + "learning_rate": 9.840547767019456e-06, + "loss": 0.0025, + "step": 21090 + }, + { + "epoch": 0.3452507567700237, + "grad_norm": 0.1728866696357727, + "learning_rate": 9.84018984035278e-06, + "loss": 0.0047, + "step": 21100 + }, + { + "epoch": 0.3454143827210996, + "grad_norm": 0.14121197164058685, + "learning_rate": 9.83983151893669e-06, + "loss": 0.0044, + "step": 21110 + }, + { + "epoch": 0.3455780086721754, + "grad_norm": 0.46367713809013367, + "learning_rate": 9.839472802800407e-06, + "loss": 0.0037, + "step": 21120 + }, + { + "epoch": 0.34574163462325125, + "grad_norm": 0.3309983015060425, + "learning_rate": 9.839113691973188e-06, + "loss": 0.005, + "step": 21130 + }, + { + "epoch": 0.3459052605743271, + "grad_norm": 0.1667407900094986, + "learning_rate": 9.838754186484321e-06, + "loss": 0.0036, + "step": 21140 + }, + { + "epoch": 0.3460688865254029, + "grad_norm": 0.23879320919513702, + "learning_rate": 9.838394286363124e-06, + "loss": 0.0041, + "step": 21150 + }, + { + "epoch": 0.3462325124764788, + "grad_norm": 0.20847830176353455, + "learning_rate": 9.83803399163895e-06, + "loss": 0.0036, + "step": 21160 + }, + { + "epoch": 0.3463961384275546, + "grad_norm": 0.4925137758255005, + "learning_rate": 9.837673302341187e-06, + "loss": 0.006, + "step": 21170 + }, + { + "epoch": 0.34655976437863045, + "grad_norm": 0.239491805434227, + "learning_rate": 9.837312218499248e-06, + "loss": 0.0061, + "step": 21180 + }, + { + "epoch": 0.3467233903297063, + "grad_norm": 0.08568806946277618, + "learning_rate": 9.83695074014258e-06, + "loss": 0.0047, + "step": 21190 + }, + { + "epoch": 0.3468870162807821, + "grad_norm": 0.1617346704006195, + "learning_rate": 9.836588867300669e-06, + "loss": 0.0039, + "step": 21200 + }, + { + "epoch": 0.347050642231858, + "grad_norm": 0.3689493238925934, + "learning_rate": 9.836226600003025e-06, + "loss": 0.0063, + "step": 21210 + }, + { + "epoch": 0.3472142681829338, + "grad_norm": 0.06783358007669449, + "learning_rate": 9.835863938279193e-06, + "loss": 0.0036, + "step": 21220 + }, + { + "epoch": 0.34737789413400966, + "grad_norm": 0.06754415482282639, + "learning_rate": 9.83550088215875e-06, + "loss": 0.0043, + "step": 21230 + }, + { + "epoch": 0.3475415200850855, + "grad_norm": 0.03541114926338196, + "learning_rate": 9.835137431671308e-06, + "loss": 0.0026, + "step": 21240 + }, + { + "epoch": 0.3477051460361613, + "grad_norm": 0.08518331497907639, + "learning_rate": 9.834773586846509e-06, + "loss": 0.0064, + "step": 21250 + }, + { + "epoch": 0.3478687719872372, + "grad_norm": 0.19849002361297607, + "learning_rate": 9.834409347714025e-06, + "loss": 0.0043, + "step": 21260 + }, + { + "epoch": 0.34803239793831303, + "grad_norm": 0.2208586037158966, + "learning_rate": 9.834044714303562e-06, + "loss": 0.0041, + "step": 21270 + }, + { + "epoch": 0.34819602388938886, + "grad_norm": 0.06353924423456192, + "learning_rate": 9.833679686644858e-06, + "loss": 0.0029, + "step": 21280 + }, + { + "epoch": 0.3483596498404647, + "grad_norm": 0.2002340853214264, + "learning_rate": 9.833314264767686e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 0.3485232757915405, + "grad_norm": 0.21066778898239136, + "learning_rate": 9.832948448701846e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 0.3486869017426164, + "grad_norm": 0.08138404041528702, + "learning_rate": 9.832582238477173e-06, + "loss": 0.0048, + "step": 21310 + }, + { + "epoch": 0.34885052769369224, + "grad_norm": 0.07733263820409775, + "learning_rate": 9.832215634123537e-06, + "loss": 0.0045, + "step": 21320 + }, + { + "epoch": 0.34901415364476807, + "grad_norm": 0.038150470703840256, + "learning_rate": 9.831848635670831e-06, + "loss": 0.0031, + "step": 21330 + }, + { + "epoch": 0.3491777795958439, + "grad_norm": 0.21571916341781616, + "learning_rate": 9.831481243148992e-06, + "loss": 0.006, + "step": 21340 + }, + { + "epoch": 0.3493414055469197, + "grad_norm": 0.16414399445056915, + "learning_rate": 9.831113456587981e-06, + "loss": 0.0035, + "step": 21350 + }, + { + "epoch": 0.34950503149799556, + "grad_norm": 0.07113195210695267, + "learning_rate": 9.830745276017792e-06, + "loss": 0.0053, + "step": 21360 + }, + { + "epoch": 0.34966865744907144, + "grad_norm": 0.042324427515268326, + "learning_rate": 9.830376701468456e-06, + "loss": 0.0038, + "step": 21370 + }, + { + "epoch": 0.34983228340014727, + "grad_norm": 0.0561634860932827, + "learning_rate": 9.830007732970028e-06, + "loss": 0.0032, + "step": 21380 + }, + { + "epoch": 0.3499959093512231, + "grad_norm": 0.1563832312822342, + "learning_rate": 9.829638370552605e-06, + "loss": 0.01, + "step": 21390 + }, + { + "epoch": 0.35015953530229893, + "grad_norm": 0.12699761986732483, + "learning_rate": 9.829268614246306e-06, + "loss": 0.0037, + "step": 21400 + }, + { + "epoch": 0.35032316125337476, + "grad_norm": 0.19512920081615448, + "learning_rate": 9.828898464081291e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 0.35048678720445064, + "grad_norm": 0.09152159839868546, + "learning_rate": 9.828527920087746e-06, + "loss": 0.0038, + "step": 21420 + }, + { + "epoch": 0.3506504131555265, + "grad_norm": 0.07050028443336487, + "learning_rate": 9.828156982295893e-06, + "loss": 0.0028, + "step": 21430 + }, + { + "epoch": 0.3508140391066023, + "grad_norm": 0.19286592304706573, + "learning_rate": 9.827785650735982e-06, + "loss": 0.0018, + "step": 21440 + }, + { + "epoch": 0.35097766505767813, + "grad_norm": 0.09797953814268112, + "learning_rate": 9.827413925438299e-06, + "loss": 0.0056, + "step": 21450 + }, + { + "epoch": 0.35114129100875396, + "grad_norm": 0.15946625173091888, + "learning_rate": 9.82704180643316e-06, + "loss": 0.0065, + "step": 21460 + }, + { + "epoch": 0.35130491695982985, + "grad_norm": 0.1895676553249359, + "learning_rate": 9.826669293750915e-06, + "loss": 0.0049, + "step": 21470 + }, + { + "epoch": 0.3514685429109057, + "grad_norm": 0.1782289296388626, + "learning_rate": 9.826296387421945e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 0.3516321688619815, + "grad_norm": 0.1732427179813385, + "learning_rate": 9.825923087476662e-06, + "loss": 0.0061, + "step": 21490 + }, + { + "epoch": 0.35179579481305734, + "grad_norm": 0.1630256474018097, + "learning_rate": 9.82554939394551e-06, + "loss": 0.0046, + "step": 21500 + }, + { + "epoch": 0.35195942076413317, + "grad_norm": 0.09715411812067032, + "learning_rate": 9.825175306858968e-06, + "loss": 0.0029, + "step": 21510 + }, + { + "epoch": 0.35212304671520905, + "grad_norm": 0.10771908611059189, + "learning_rate": 9.824800826247544e-06, + "loss": 0.0027, + "step": 21520 + }, + { + "epoch": 0.3522866726662849, + "grad_norm": 0.16415317356586456, + "learning_rate": 9.824425952141781e-06, + "loss": 0.0045, + "step": 21530 + }, + { + "epoch": 0.3524502986173607, + "grad_norm": 0.08979246020317078, + "learning_rate": 9.824050684572251e-06, + "loss": 0.0043, + "step": 21540 + }, + { + "epoch": 0.35261392456843654, + "grad_norm": 0.0841081440448761, + "learning_rate": 9.82367502356956e-06, + "loss": 0.0033, + "step": 21550 + }, + { + "epoch": 0.3527775505195124, + "grad_norm": 0.11242271214723587, + "learning_rate": 9.823298969164345e-06, + "loss": 0.0034, + "step": 21560 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.07970384508371353, + "learning_rate": 9.822922521387277e-06, + "loss": 0.0038, + "step": 21570 + }, + { + "epoch": 0.3531048024216641, + "grad_norm": 0.13367889821529388, + "learning_rate": 9.822545680269056e-06, + "loss": 0.0048, + "step": 21580 + }, + { + "epoch": 0.3532684283727399, + "grad_norm": 0.16324225068092346, + "learning_rate": 9.822168445840418e-06, + "loss": 0.0032, + "step": 21590 + }, + { + "epoch": 0.35343205432381575, + "grad_norm": 0.11214855313301086, + "learning_rate": 9.821790818132128e-06, + "loss": 0.0036, + "step": 21600 + }, + { + "epoch": 0.3535956802748916, + "grad_norm": 0.060387782752513885, + "learning_rate": 9.821412797174985e-06, + "loss": 0.0051, + "step": 21610 + }, + { + "epoch": 0.35375930622596746, + "grad_norm": 0.17262721061706543, + "learning_rate": 9.821034382999817e-06, + "loss": 0.0049, + "step": 21620 + }, + { + "epoch": 0.3539229321770433, + "grad_norm": 0.16682997345924377, + "learning_rate": 9.820655575637488e-06, + "loss": 0.0043, + "step": 21630 + }, + { + "epoch": 0.3540865581281191, + "grad_norm": 0.06710716336965561, + "learning_rate": 9.82027637511889e-06, + "loss": 0.0049, + "step": 21640 + }, + { + "epoch": 0.35425018407919495, + "grad_norm": 0.14109598100185394, + "learning_rate": 9.819896781474953e-06, + "loss": 0.0033, + "step": 21650 + }, + { + "epoch": 0.3544138100302708, + "grad_norm": 0.13810548186302185, + "learning_rate": 9.819516794736632e-06, + "loss": 0.0074, + "step": 21660 + }, + { + "epoch": 0.35457743598134667, + "grad_norm": 0.15594951808452606, + "learning_rate": 9.81913641493492e-06, + "loss": 0.0057, + "step": 21670 + }, + { + "epoch": 0.3547410619324225, + "grad_norm": 0.046292707324028015, + "learning_rate": 9.818755642100836e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 0.3549046878834983, + "grad_norm": 0.12565389275550842, + "learning_rate": 9.818374476265438e-06, + "loss": 0.0094, + "step": 21690 + }, + { + "epoch": 0.35506831383457416, + "grad_norm": 0.0537894144654274, + "learning_rate": 9.81799291745981e-06, + "loss": 0.0037, + "step": 21700 + }, + { + "epoch": 0.35523193978565, + "grad_norm": 0.24739708006381989, + "learning_rate": 9.817610965715072e-06, + "loss": 0.0043, + "step": 21710 + }, + { + "epoch": 0.35539556573672587, + "grad_norm": 0.17831526696681976, + "learning_rate": 9.817228621062377e-06, + "loss": 0.0037, + "step": 21720 + }, + { + "epoch": 0.3555591916878017, + "grad_norm": 0.06960579007863998, + "learning_rate": 9.816845883532902e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 0.35572281763887753, + "grad_norm": 0.09768445789813995, + "learning_rate": 9.816462753157867e-06, + "loss": 0.004, + "step": 21740 + }, + { + "epoch": 0.35588644358995336, + "grad_norm": 0.1259879767894745, + "learning_rate": 9.816079229968515e-06, + "loss": 0.004, + "step": 21750 + }, + { + "epoch": 0.3560500695410292, + "grad_norm": 0.11670015007257462, + "learning_rate": 9.81569531399613e-06, + "loss": 0.0035, + "step": 21760 + }, + { + "epoch": 0.356213695492105, + "grad_norm": 0.07601787149906158, + "learning_rate": 9.815311005272017e-06, + "loss": 0.0054, + "step": 21770 + }, + { + "epoch": 0.3563773214431809, + "grad_norm": 0.2184062898159027, + "learning_rate": 9.814926303827524e-06, + "loss": 0.005, + "step": 21780 + }, + { + "epoch": 0.35654094739425674, + "grad_norm": 0.3103298544883728, + "learning_rate": 9.81454120969402e-06, + "loss": 0.003, + "step": 21790 + }, + { + "epoch": 0.35670457334533257, + "grad_norm": 0.3028302788734436, + "learning_rate": 9.814155722902919e-06, + "loss": 0.0078, + "step": 21800 + }, + { + "epoch": 0.3568681992964084, + "grad_norm": 0.06297461688518524, + "learning_rate": 9.813769843485654e-06, + "loss": 0.0032, + "step": 21810 + }, + { + "epoch": 0.3570318252474842, + "grad_norm": 0.14724983274936676, + "learning_rate": 9.813383571473699e-06, + "loss": 0.0043, + "step": 21820 + }, + { + "epoch": 0.3571954511985601, + "grad_norm": 0.09313119202852249, + "learning_rate": 9.812996906898556e-06, + "loss": 0.0042, + "step": 21830 + }, + { + "epoch": 0.35735907714963594, + "grad_norm": 0.07065626233816147, + "learning_rate": 9.812609849791762e-06, + "loss": 0.0036, + "step": 21840 + }, + { + "epoch": 0.35752270310071177, + "grad_norm": 0.053032826632261276, + "learning_rate": 9.81222240018488e-06, + "loss": 0.0038, + "step": 21850 + }, + { + "epoch": 0.3576863290517876, + "grad_norm": 0.16859860718250275, + "learning_rate": 9.811834558109514e-06, + "loss": 0.003, + "step": 21860 + }, + { + "epoch": 0.35784995500286343, + "grad_norm": 0.19317512214183807, + "learning_rate": 9.81144632359729e-06, + "loss": 0.0049, + "step": 21870 + }, + { + "epoch": 0.3580135809539393, + "grad_norm": 0.10547548532485962, + "learning_rate": 9.811057696679878e-06, + "loss": 0.0028, + "step": 21880 + }, + { + "epoch": 0.35817720690501514, + "grad_norm": 0.1879497915506363, + "learning_rate": 9.810668677388967e-06, + "loss": 0.0045, + "step": 21890 + }, + { + "epoch": 0.358340832856091, + "grad_norm": 0.32673487067222595, + "learning_rate": 9.810279265756285e-06, + "loss": 0.0058, + "step": 21900 + }, + { + "epoch": 0.3585044588071668, + "grad_norm": 0.26756083965301514, + "learning_rate": 9.809889461813592e-06, + "loss": 0.0053, + "step": 21910 + }, + { + "epoch": 0.35866808475824263, + "grad_norm": 0.23736342787742615, + "learning_rate": 9.809499265592678e-06, + "loss": 0.0046, + "step": 21920 + }, + { + "epoch": 0.3588317107093185, + "grad_norm": 0.14139366149902344, + "learning_rate": 9.809108677125371e-06, + "loss": 0.0055, + "step": 21930 + }, + { + "epoch": 0.35899533666039435, + "grad_norm": 0.061194807291030884, + "learning_rate": 9.80871769644352e-06, + "loss": 0.0032, + "step": 21940 + }, + { + "epoch": 0.3591589626114702, + "grad_norm": 0.18361642956733704, + "learning_rate": 9.808326323579014e-06, + "loss": 0.0051, + "step": 21950 + }, + { + "epoch": 0.359322588562546, + "grad_norm": 0.0626908540725708, + "learning_rate": 9.807934558563774e-06, + "loss": 0.0029, + "step": 21960 + }, + { + "epoch": 0.35948621451362184, + "grad_norm": 0.08477026224136353, + "learning_rate": 9.807542401429748e-06, + "loss": 0.0033, + "step": 21970 + }, + { + "epoch": 0.3596498404646977, + "grad_norm": 0.040161699056625366, + "learning_rate": 9.807149852208922e-06, + "loss": 0.0051, + "step": 21980 + }, + { + "epoch": 0.35981346641577355, + "grad_norm": 0.1196545660495758, + "learning_rate": 9.806756910933308e-06, + "loss": 0.0042, + "step": 21990 + }, + { + "epoch": 0.3599770923668494, + "grad_norm": 0.09050813317298889, + "learning_rate": 9.806363577634954e-06, + "loss": 0.0037, + "step": 22000 + }, + { + "epoch": 0.3601407183179252, + "grad_norm": 0.0685819610953331, + "learning_rate": 9.80596985234594e-06, + "loss": 0.003, + "step": 22010 + }, + { + "epoch": 0.36030434426900104, + "grad_norm": 0.23909932374954224, + "learning_rate": 9.805575735098376e-06, + "loss": 0.0049, + "step": 22020 + }, + { + "epoch": 0.3604679702200769, + "grad_norm": 0.09177765995264053, + "learning_rate": 9.805181225924406e-06, + "loss": 0.0051, + "step": 22030 + }, + { + "epoch": 0.36063159617115276, + "grad_norm": 0.19640956819057465, + "learning_rate": 9.804786324856202e-06, + "loss": 0.0045, + "step": 22040 + }, + { + "epoch": 0.3607952221222286, + "grad_norm": 0.2347734123468399, + "learning_rate": 9.804391031925972e-06, + "loss": 0.0031, + "step": 22050 + }, + { + "epoch": 0.3609588480733044, + "grad_norm": 0.3181383013725281, + "learning_rate": 9.803995347165958e-06, + "loss": 0.0037, + "step": 22060 + }, + { + "epoch": 0.36112247402438025, + "grad_norm": 0.18287114799022675, + "learning_rate": 9.803599270608427e-06, + "loss": 0.0061, + "step": 22070 + }, + { + "epoch": 0.36128609997545613, + "grad_norm": 0.4434426426887512, + "learning_rate": 9.803202802285682e-06, + "loss": 0.0046, + "step": 22080 + }, + { + "epoch": 0.36144972592653196, + "grad_norm": 0.19482307136058807, + "learning_rate": 9.802805942230061e-06, + "loss": 0.0038, + "step": 22090 + }, + { + "epoch": 0.3616133518776078, + "grad_norm": 0.139286071062088, + "learning_rate": 9.802408690473925e-06, + "loss": 0.0042, + "step": 22100 + }, + { + "epoch": 0.3617769778286836, + "grad_norm": 0.11104580760002136, + "learning_rate": 9.802011047049675e-06, + "loss": 0.0045, + "step": 22110 + }, + { + "epoch": 0.36194060377975945, + "grad_norm": 0.2055734097957611, + "learning_rate": 9.801613011989745e-06, + "loss": 0.0033, + "step": 22120 + }, + { + "epoch": 0.36210422973083534, + "grad_norm": 0.07899788022041321, + "learning_rate": 9.801214585326592e-06, + "loss": 0.0055, + "step": 22130 + }, + { + "epoch": 0.36226785568191117, + "grad_norm": 0.13910041749477386, + "learning_rate": 9.800815767092712e-06, + "loss": 0.0047, + "step": 22140 + }, + { + "epoch": 0.362431481632987, + "grad_norm": 0.0787155032157898, + "learning_rate": 9.800416557320634e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 0.3625951075840628, + "grad_norm": 0.05051407963037491, + "learning_rate": 9.800016956042913e-06, + "loss": 0.0046, + "step": 22160 + }, + { + "epoch": 0.36275873353513866, + "grad_norm": 0.2281433492898941, + "learning_rate": 9.79961696329214e-06, + "loss": 0.0031, + "step": 22170 + }, + { + "epoch": 0.36292235948621454, + "grad_norm": 0.06710359454154968, + "learning_rate": 9.799216579100937e-06, + "loss": 0.0048, + "step": 22180 + }, + { + "epoch": 0.36308598543729037, + "grad_norm": 0.08895856887102127, + "learning_rate": 9.798815803501958e-06, + "loss": 0.006, + "step": 22190 + }, + { + "epoch": 0.3632496113883662, + "grad_norm": 0.164419025182724, + "learning_rate": 9.798414636527891e-06, + "loss": 0.0038, + "step": 22200 + }, + { + "epoch": 0.36341323733944203, + "grad_norm": 0.1343121975660324, + "learning_rate": 9.79801307821145e-06, + "loss": 0.0053, + "step": 22210 + }, + { + "epoch": 0.36357686329051786, + "grad_norm": 0.09868015348911285, + "learning_rate": 9.797611128585388e-06, + "loss": 0.0048, + "step": 22220 + }, + { + "epoch": 0.3637404892415937, + "grad_norm": 0.2300853580236435, + "learning_rate": 9.797208787682482e-06, + "loss": 0.006, + "step": 22230 + }, + { + "epoch": 0.3639041151926696, + "grad_norm": 0.1900927722454071, + "learning_rate": 9.796806055535551e-06, + "loss": 0.0039, + "step": 22240 + }, + { + "epoch": 0.3640677411437454, + "grad_norm": 0.2144131362438202, + "learning_rate": 9.796402932177438e-06, + "loss": 0.0056, + "step": 22250 + }, + { + "epoch": 0.36423136709482123, + "grad_norm": 0.05185553804039955, + "learning_rate": 9.795999417641022e-06, + "loss": 0.0034, + "step": 22260 + }, + { + "epoch": 0.36439499304589706, + "grad_norm": 0.060830626636743546, + "learning_rate": 9.795595511959208e-06, + "loss": 0.0043, + "step": 22270 + }, + { + "epoch": 0.3645586189969729, + "grad_norm": 0.1453610509634018, + "learning_rate": 9.795191215164943e-06, + "loss": 0.0036, + "step": 22280 + }, + { + "epoch": 0.3647222449480488, + "grad_norm": 0.1452437788248062, + "learning_rate": 9.794786527291196e-06, + "loss": 0.0032, + "step": 22290 + }, + { + "epoch": 0.3648858708991246, + "grad_norm": 0.1662825047969818, + "learning_rate": 9.794381448370974e-06, + "loss": 0.0068, + "step": 22300 + }, + { + "epoch": 0.36504949685020044, + "grad_norm": 0.08057316392660141, + "learning_rate": 9.793975978437312e-06, + "loss": 0.0032, + "step": 22310 + }, + { + "epoch": 0.36521312280127627, + "grad_norm": 0.064130499958992, + "learning_rate": 9.793570117523279e-06, + "loss": 0.0028, + "step": 22320 + }, + { + "epoch": 0.3653767487523521, + "grad_norm": 0.13349877297878265, + "learning_rate": 9.793163865661977e-06, + "loss": 0.003, + "step": 22330 + }, + { + "epoch": 0.365540374703428, + "grad_norm": 0.07337766885757446, + "learning_rate": 9.792757222886538e-06, + "loss": 0.0026, + "step": 22340 + }, + { + "epoch": 0.3657040006545038, + "grad_norm": 0.009043429978191853, + "learning_rate": 9.792350189230125e-06, + "loss": 0.0059, + "step": 22350 + }, + { + "epoch": 0.36586762660557964, + "grad_norm": 0.06668829917907715, + "learning_rate": 9.791942764725938e-06, + "loss": 0.0025, + "step": 22360 + }, + { + "epoch": 0.3660312525566555, + "grad_norm": 0.11217039078474045, + "learning_rate": 9.791534949407202e-06, + "loss": 0.0043, + "step": 22370 + }, + { + "epoch": 0.3661948785077313, + "grad_norm": 0.08812987804412842, + "learning_rate": 9.791126743307176e-06, + "loss": 0.0038, + "step": 22380 + }, + { + "epoch": 0.3663585044588072, + "grad_norm": 0.195985808968544, + "learning_rate": 9.790718146459154e-06, + "loss": 0.0036, + "step": 22390 + }, + { + "epoch": 0.366522130409883, + "grad_norm": 0.14922137558460236, + "learning_rate": 9.79030915889646e-06, + "loss": 0.0025, + "step": 22400 + }, + { + "epoch": 0.36668575636095885, + "grad_norm": 0.1597341001033783, + "learning_rate": 9.78989978065245e-06, + "loss": 0.0042, + "step": 22410 + }, + { + "epoch": 0.3668493823120347, + "grad_norm": 0.017290370538830757, + "learning_rate": 9.78949001176051e-06, + "loss": 0.0035, + "step": 22420 + }, + { + "epoch": 0.3670130082631105, + "grad_norm": 0.14993682503700256, + "learning_rate": 9.78907985225406e-06, + "loss": 0.0031, + "step": 22430 + }, + { + "epoch": 0.3671766342141864, + "grad_norm": 0.05891666188836098, + "learning_rate": 9.788669302166551e-06, + "loss": 0.0034, + "step": 22440 + }, + { + "epoch": 0.3673402601652622, + "grad_norm": 0.135666623711586, + "learning_rate": 9.788258361531466e-06, + "loss": 0.0027, + "step": 22450 + }, + { + "epoch": 0.36750388611633805, + "grad_norm": 0.10232146829366684, + "learning_rate": 9.78784703038232e-06, + "loss": 0.0042, + "step": 22460 + }, + { + "epoch": 0.3676675120674139, + "grad_norm": 0.15934139490127563, + "learning_rate": 9.78743530875266e-06, + "loss": 0.0035, + "step": 22470 + }, + { + "epoch": 0.3678311380184897, + "grad_norm": 0.08764767646789551, + "learning_rate": 9.787023196676064e-06, + "loss": 0.0051, + "step": 22480 + }, + { + "epoch": 0.3679947639695656, + "grad_norm": 0.07965680956840515, + "learning_rate": 9.786610694186143e-06, + "loss": 0.0049, + "step": 22490 + }, + { + "epoch": 0.3681583899206414, + "grad_norm": 0.07782886922359467, + "learning_rate": 9.786197801316539e-06, + "loss": 0.0026, + "step": 22500 + }, + { + "epoch": 0.36832201587171726, + "grad_norm": 0.12102463841438293, + "learning_rate": 9.785784518100928e-06, + "loss": 0.0062, + "step": 22510 + }, + { + "epoch": 0.3684856418227931, + "grad_norm": 0.02740984596312046, + "learning_rate": 9.785370844573014e-06, + "loss": 0.0037, + "step": 22520 + }, + { + "epoch": 0.3686492677738689, + "grad_norm": 0.059107620269060135, + "learning_rate": 9.784956780766534e-06, + "loss": 0.003, + "step": 22530 + }, + { + "epoch": 0.3688128937249448, + "grad_norm": 0.30199170112609863, + "learning_rate": 9.784542326715259e-06, + "loss": 0.0043, + "step": 22540 + }, + { + "epoch": 0.36897651967602063, + "grad_norm": 0.15318413078784943, + "learning_rate": 9.78412748245299e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 0.36914014562709646, + "grad_norm": 0.37131184339523315, + "learning_rate": 9.783712248013561e-06, + "loss": 0.004, + "step": 22560 + }, + { + "epoch": 0.3693037715781723, + "grad_norm": 0.0888022854924202, + "learning_rate": 9.783296623430838e-06, + "loss": 0.005, + "step": 22570 + }, + { + "epoch": 0.3694673975292481, + "grad_norm": 0.12317230552434921, + "learning_rate": 9.782880608738714e-06, + "loss": 0.0022, + "step": 22580 + }, + { + "epoch": 0.369631023480324, + "grad_norm": 0.17772452533245087, + "learning_rate": 9.782464203971122e-06, + "loss": 0.0041, + "step": 22590 + }, + { + "epoch": 0.36979464943139984, + "grad_norm": 0.15841124951839447, + "learning_rate": 9.78204740916202e-06, + "loss": 0.0049, + "step": 22600 + }, + { + "epoch": 0.36995827538247567, + "grad_norm": 0.1106036975979805, + "learning_rate": 9.781630224345402e-06, + "loss": 0.0041, + "step": 22610 + }, + { + "epoch": 0.3701219013335515, + "grad_norm": 0.13645784556865692, + "learning_rate": 9.781212649555291e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 0.3702855272846273, + "grad_norm": 0.2242395579814911, + "learning_rate": 9.780794684825743e-06, + "loss": 0.0062, + "step": 22630 + }, + { + "epoch": 0.3704491532357032, + "grad_norm": 0.09353913366794586, + "learning_rate": 9.780376330190847e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 0.37061277918677904, + "grad_norm": 0.13247133791446686, + "learning_rate": 9.779957585684721e-06, + "loss": 0.0054, + "step": 22650 + }, + { + "epoch": 0.37077640513785487, + "grad_norm": 0.14868858456611633, + "learning_rate": 9.779538451341519e-06, + "loss": 0.0055, + "step": 22660 + }, + { + "epoch": 0.3709400310889307, + "grad_norm": 0.15545055270195007, + "learning_rate": 9.77911892719542e-06, + "loss": 0.0082, + "step": 22670 + }, + { + "epoch": 0.37110365704000653, + "grad_norm": 0.2338918149471283, + "learning_rate": 9.778699013280642e-06, + "loss": 0.0055, + "step": 22680 + }, + { + "epoch": 0.37126728299108236, + "grad_norm": 0.1080441102385521, + "learning_rate": 9.77827870963143e-06, + "loss": 0.0036, + "step": 22690 + }, + { + "epoch": 0.37143090894215824, + "grad_norm": 0.1431145966053009, + "learning_rate": 9.777858016282067e-06, + "loss": 0.0048, + "step": 22700 + }, + { + "epoch": 0.3715945348932341, + "grad_norm": 0.11440259218215942, + "learning_rate": 9.777436933266857e-06, + "loss": 0.0028, + "step": 22710 + }, + { + "epoch": 0.3717581608443099, + "grad_norm": 0.26593178510665894, + "learning_rate": 9.777015460620144e-06, + "loss": 0.0024, + "step": 22720 + }, + { + "epoch": 0.37192178679538573, + "grad_norm": 0.03324040398001671, + "learning_rate": 9.776593598376305e-06, + "loss": 0.0031, + "step": 22730 + }, + { + "epoch": 0.37208541274646156, + "grad_norm": 0.07390396296977997, + "learning_rate": 9.776171346569743e-06, + "loss": 0.0019, + "step": 22740 + }, + { + "epoch": 0.37224903869753745, + "grad_norm": 0.06852605193853378, + "learning_rate": 9.775748705234897e-06, + "loss": 0.0036, + "step": 22750 + }, + { + "epoch": 0.3724126646486133, + "grad_norm": 0.1449580192565918, + "learning_rate": 9.775325674406233e-06, + "loss": 0.0032, + "step": 22760 + }, + { + "epoch": 0.3725762905996891, + "grad_norm": 0.04383785277605057, + "learning_rate": 9.774902254118255e-06, + "loss": 0.0024, + "step": 22770 + }, + { + "epoch": 0.37273991655076494, + "grad_norm": 0.1101393923163414, + "learning_rate": 9.774478444405494e-06, + "loss": 0.0027, + "step": 22780 + }, + { + "epoch": 0.37290354250184077, + "grad_norm": 0.012634593062102795, + "learning_rate": 9.774054245302516e-06, + "loss": 0.0039, + "step": 22790 + }, + { + "epoch": 0.37306716845291665, + "grad_norm": 0.06818386912345886, + "learning_rate": 9.773629656843917e-06, + "loss": 0.0049, + "step": 22800 + }, + { + "epoch": 0.3732307944039925, + "grad_norm": 0.09158064424991608, + "learning_rate": 9.773204679064324e-06, + "loss": 0.0032, + "step": 22810 + }, + { + "epoch": 0.3733944203550683, + "grad_norm": 0.0785619467496872, + "learning_rate": 9.772779311998398e-06, + "loss": 0.0038, + "step": 22820 + }, + { + "epoch": 0.37355804630614414, + "grad_norm": 0.077456995844841, + "learning_rate": 9.772353555680828e-06, + "loss": 0.0031, + "step": 22830 + }, + { + "epoch": 0.37372167225722, + "grad_norm": 0.08676807582378387, + "learning_rate": 9.77192741014634e-06, + "loss": 0.0025, + "step": 22840 + }, + { + "epoch": 0.37388529820829586, + "grad_norm": 0.18085947632789612, + "learning_rate": 9.771500875429687e-06, + "loss": 0.0033, + "step": 22850 + }, + { + "epoch": 0.3740489241593717, + "grad_norm": 0.07374570518732071, + "learning_rate": 9.771073951565657e-06, + "loss": 0.0053, + "step": 22860 + }, + { + "epoch": 0.3742125501104475, + "grad_norm": 0.060111820697784424, + "learning_rate": 9.770646638589069e-06, + "loss": 0.0038, + "step": 22870 + }, + { + "epoch": 0.37437617606152335, + "grad_norm": 0.12110818177461624, + "learning_rate": 9.770218936534773e-06, + "loss": 0.0037, + "step": 22880 + }, + { + "epoch": 0.3745398020125992, + "grad_norm": 0.22021162509918213, + "learning_rate": 9.769790845437647e-06, + "loss": 0.0031, + "step": 22890 + }, + { + "epoch": 0.37470342796367506, + "grad_norm": 0.08133351802825928, + "learning_rate": 9.769362365332611e-06, + "loss": 0.0028, + "step": 22900 + }, + { + "epoch": 0.3748670539147509, + "grad_norm": 0.3176220655441284, + "learning_rate": 9.768933496254607e-06, + "loss": 0.0026, + "step": 22910 + }, + { + "epoch": 0.3750306798658267, + "grad_norm": 0.06265996396541595, + "learning_rate": 9.76850423823861e-06, + "loss": 0.0024, + "step": 22920 + }, + { + "epoch": 0.37519430581690255, + "grad_norm": 0.014191754162311554, + "learning_rate": 9.768074591319634e-06, + "loss": 0.0024, + "step": 22930 + }, + { + "epoch": 0.3753579317679784, + "grad_norm": 0.19419102370738983, + "learning_rate": 9.767644555532715e-06, + "loss": 0.0046, + "step": 22940 + }, + { + "epoch": 0.37552155771905427, + "grad_norm": 0.11108149588108063, + "learning_rate": 9.767214130912928e-06, + "loss": 0.0032, + "step": 22950 + }, + { + "epoch": 0.3756851836701301, + "grad_norm": 0.10928455740213394, + "learning_rate": 9.766783317495373e-06, + "loss": 0.0033, + "step": 22960 + }, + { + "epoch": 0.3758488096212059, + "grad_norm": 0.06540073454380035, + "learning_rate": 9.766352115315191e-06, + "loss": 0.0034, + "step": 22970 + }, + { + "epoch": 0.37601243557228176, + "grad_norm": 0.24643945693969727, + "learning_rate": 9.765920524407548e-06, + "loss": 0.0029, + "step": 22980 + }, + { + "epoch": 0.3761760615233576, + "grad_norm": 0.2234330177307129, + "learning_rate": 9.765488544807642e-06, + "loss": 0.0047, + "step": 22990 + }, + { + "epoch": 0.37633968747443347, + "grad_norm": 0.19479380548000336, + "learning_rate": 9.765056176550703e-06, + "loss": 0.0048, + "step": 23000 + }, + { + "epoch": 0.3765033134255093, + "grad_norm": 0.18185527622699738, + "learning_rate": 9.764623419671995e-06, + "loss": 0.0042, + "step": 23010 + }, + { + "epoch": 0.37666693937658513, + "grad_norm": 0.04639330506324768, + "learning_rate": 9.76419027420681e-06, + "loss": 0.0035, + "step": 23020 + }, + { + "epoch": 0.37683056532766096, + "grad_norm": 0.15931709110736847, + "learning_rate": 9.763756740190475e-06, + "loss": 0.004, + "step": 23030 + }, + { + "epoch": 0.3769941912787368, + "grad_norm": 0.1329541802406311, + "learning_rate": 9.76332281765835e-06, + "loss": 0.0051, + "step": 23040 + }, + { + "epoch": 0.3771578172298127, + "grad_norm": 0.25017082691192627, + "learning_rate": 9.762888506645822e-06, + "loss": 0.0046, + "step": 23050 + }, + { + "epoch": 0.3773214431808885, + "grad_norm": 0.14795063436031342, + "learning_rate": 9.76245380718831e-06, + "loss": 0.0039, + "step": 23060 + }, + { + "epoch": 0.37748506913196433, + "grad_norm": 0.04935089871287346, + "learning_rate": 9.762018719321272e-06, + "loss": 0.0027, + "step": 23070 + }, + { + "epoch": 0.37764869508304016, + "grad_norm": 0.06639645993709564, + "learning_rate": 9.761583243080187e-06, + "loss": 0.0038, + "step": 23080 + }, + { + "epoch": 0.377812321034116, + "grad_norm": 0.17891868948936462, + "learning_rate": 9.761147378500573e-06, + "loss": 0.0042, + "step": 23090 + }, + { + "epoch": 0.3779759469851918, + "grad_norm": 0.18009509146213531, + "learning_rate": 9.760711125617979e-06, + "loss": 0.0029, + "step": 23100 + }, + { + "epoch": 0.3781395729362677, + "grad_norm": 0.08619875460863113, + "learning_rate": 9.760274484467981e-06, + "loss": 0.0031, + "step": 23110 + }, + { + "epoch": 0.37830319888734354, + "grad_norm": 0.041656337678432465, + "learning_rate": 9.759837455086193e-06, + "loss": 0.0027, + "step": 23120 + }, + { + "epoch": 0.37846682483841937, + "grad_norm": 0.0884852409362793, + "learning_rate": 9.759400037508257e-06, + "loss": 0.0043, + "step": 23130 + }, + { + "epoch": 0.3786304507894952, + "grad_norm": 0.22761377692222595, + "learning_rate": 9.758962231769846e-06, + "loss": 0.0033, + "step": 23140 + }, + { + "epoch": 0.37879407674057103, + "grad_norm": 0.24562352895736694, + "learning_rate": 9.758524037906666e-06, + "loss": 0.0042, + "step": 23150 + }, + { + "epoch": 0.3789577026916469, + "grad_norm": 0.07740072906017303, + "learning_rate": 9.758085455954457e-06, + "loss": 0.0037, + "step": 23160 + }, + { + "epoch": 0.37912132864272274, + "grad_norm": 0.12342134863138199, + "learning_rate": 9.757646485948986e-06, + "loss": 0.0038, + "step": 23170 + }, + { + "epoch": 0.3792849545937986, + "grad_norm": 0.13873374462127686, + "learning_rate": 9.757207127926054e-06, + "loss": 0.0036, + "step": 23180 + }, + { + "epoch": 0.3794485805448744, + "grad_norm": 0.07392168045043945, + "learning_rate": 9.756767381921495e-06, + "loss": 0.003, + "step": 23190 + }, + { + "epoch": 0.37961220649595023, + "grad_norm": 0.2070484757423401, + "learning_rate": 9.756327247971171e-06, + "loss": 0.0032, + "step": 23200 + }, + { + "epoch": 0.3797758324470261, + "grad_norm": 0.17215843498706818, + "learning_rate": 9.75588672611098e-06, + "loss": 0.0055, + "step": 23210 + }, + { + "epoch": 0.37993945839810195, + "grad_norm": 0.20391426980495453, + "learning_rate": 9.75544581637685e-06, + "loss": 0.0047, + "step": 23220 + }, + { + "epoch": 0.3801030843491778, + "grad_norm": 0.07884842902421951, + "learning_rate": 9.755004518804736e-06, + "loss": 0.0033, + "step": 23230 + }, + { + "epoch": 0.3802667103002536, + "grad_norm": 0.05794193968176842, + "learning_rate": 9.754562833430632e-06, + "loss": 0.0036, + "step": 23240 + }, + { + "epoch": 0.38043033625132944, + "grad_norm": 0.05548607185482979, + "learning_rate": 9.75412076029056e-06, + "loss": 0.0029, + "step": 23250 + }, + { + "epoch": 0.3805939622024053, + "grad_norm": 0.08710295706987381, + "learning_rate": 9.753678299420574e-06, + "loss": 0.0028, + "step": 23260 + }, + { + "epoch": 0.38075758815348115, + "grad_norm": 0.26748549938201904, + "learning_rate": 9.75323545085676e-06, + "loss": 0.0039, + "step": 23270 + }, + { + "epoch": 0.380921214104557, + "grad_norm": 0.11372175812721252, + "learning_rate": 9.752792214635232e-06, + "loss": 0.0032, + "step": 23280 + }, + { + "epoch": 0.3810848400556328, + "grad_norm": 0.17136171460151672, + "learning_rate": 9.752348590792144e-06, + "loss": 0.0048, + "step": 23290 + }, + { + "epoch": 0.38124846600670864, + "grad_norm": 0.26676544547080994, + "learning_rate": 9.751904579363673e-06, + "loss": 0.0031, + "step": 23300 + }, + { + "epoch": 0.3814120919577845, + "grad_norm": 0.12151642888784409, + "learning_rate": 9.751460180386032e-06, + "loss": 0.0051, + "step": 23310 + }, + { + "epoch": 0.38157571790886036, + "grad_norm": 0.2721957266330719, + "learning_rate": 9.751015393895465e-06, + "loss": 0.0033, + "step": 23320 + }, + { + "epoch": 0.3817393438599362, + "grad_norm": 0.049269482493400574, + "learning_rate": 9.750570219928245e-06, + "loss": 0.0032, + "step": 23330 + }, + { + "epoch": 0.381902969811012, + "grad_norm": 0.3966676592826843, + "learning_rate": 9.750124658520682e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 0.38206659576208785, + "grad_norm": 0.10777287930250168, + "learning_rate": 9.749678709709112e-06, + "loss": 0.0042, + "step": 23350 + }, + { + "epoch": 0.38223022171316373, + "grad_norm": 0.24577341973781586, + "learning_rate": 9.749232373529907e-06, + "loss": 0.004, + "step": 23360 + }, + { + "epoch": 0.38239384766423956, + "grad_norm": 0.07021196186542511, + "learning_rate": 9.748785650019468e-06, + "loss": 0.0037, + "step": 23370 + }, + { + "epoch": 0.3825574736153154, + "grad_norm": 0.2510216236114502, + "learning_rate": 9.748338539214229e-06, + "loss": 0.0043, + "step": 23380 + }, + { + "epoch": 0.3827210995663912, + "grad_norm": 0.19000519812107086, + "learning_rate": 9.747891041150654e-06, + "loss": 0.0037, + "step": 23390 + }, + { + "epoch": 0.38288472551746705, + "grad_norm": 0.0985528975725174, + "learning_rate": 9.747443155865238e-06, + "loss": 0.0039, + "step": 23400 + }, + { + "epoch": 0.38304835146854294, + "grad_norm": 0.172266885638237, + "learning_rate": 9.746994883394512e-06, + "loss": 0.0035, + "step": 23410 + }, + { + "epoch": 0.38321197741961877, + "grad_norm": 0.07203979045152664, + "learning_rate": 9.746546223775032e-06, + "loss": 0.0031, + "step": 23420 + }, + { + "epoch": 0.3833756033706946, + "grad_norm": 0.21380412578582764, + "learning_rate": 9.746097177043393e-06, + "loss": 0.0062, + "step": 23430 + }, + { + "epoch": 0.3835392293217704, + "grad_norm": 0.07951556146144867, + "learning_rate": 9.745647743236216e-06, + "loss": 0.0026, + "step": 23440 + }, + { + "epoch": 0.38370285527284625, + "grad_norm": 0.09176522493362427, + "learning_rate": 9.745197922390153e-06, + "loss": 0.005, + "step": 23450 + }, + { + "epoch": 0.38386648122392214, + "grad_norm": 0.14726032316684723, + "learning_rate": 9.744747714541894e-06, + "loss": 0.0028, + "step": 23460 + }, + { + "epoch": 0.38403010717499797, + "grad_norm": 0.05336847901344299, + "learning_rate": 9.744297119728152e-06, + "loss": 0.0039, + "step": 23470 + }, + { + "epoch": 0.3841937331260738, + "grad_norm": 0.14600268006324768, + "learning_rate": 9.74384613798568e-06, + "loss": 0.005, + "step": 23480 + }, + { + "epoch": 0.38435735907714963, + "grad_norm": 0.10225504636764526, + "learning_rate": 9.743394769351258e-06, + "loss": 0.0025, + "step": 23490 + }, + { + "epoch": 0.38452098502822546, + "grad_norm": 0.08892350643873215, + "learning_rate": 9.742943013861695e-06, + "loss": 0.0036, + "step": 23500 + }, + { + "epoch": 0.38468461097930134, + "grad_norm": 0.14312788844108582, + "learning_rate": 9.742490871553837e-06, + "loss": 0.0026, + "step": 23510 + }, + { + "epoch": 0.3848482369303772, + "grad_norm": 0.24043717980384827, + "learning_rate": 9.74203834246456e-06, + "loss": 0.0081, + "step": 23520 + }, + { + "epoch": 0.385011862881453, + "grad_norm": 0.08359953761100769, + "learning_rate": 9.74158542663077e-06, + "loss": 0.0037, + "step": 23530 + }, + { + "epoch": 0.38517548883252883, + "grad_norm": 0.05971476808190346, + "learning_rate": 9.741132124089403e-06, + "loss": 0.0022, + "step": 23540 + }, + { + "epoch": 0.38533911478360466, + "grad_norm": 0.09047897160053253, + "learning_rate": 9.740678434877433e-06, + "loss": 0.0049, + "step": 23550 + }, + { + "epoch": 0.3855027407346805, + "grad_norm": 0.06716237962245941, + "learning_rate": 9.740224359031858e-06, + "loss": 0.0025, + "step": 23560 + }, + { + "epoch": 0.3856663666857564, + "grad_norm": 0.2412993162870407, + "learning_rate": 9.73976989658971e-06, + "loss": 0.0044, + "step": 23570 + }, + { + "epoch": 0.3858299926368322, + "grad_norm": 0.14340847730636597, + "learning_rate": 9.739315047588059e-06, + "loss": 0.0034, + "step": 23580 + }, + { + "epoch": 0.38599361858790804, + "grad_norm": 0.04268035292625427, + "learning_rate": 9.738859812063994e-06, + "loss": 0.0026, + "step": 23590 + }, + { + "epoch": 0.38615724453898387, + "grad_norm": 0.1195569857954979, + "learning_rate": 9.738404190054646e-06, + "loss": 0.0061, + "step": 23600 + }, + { + "epoch": 0.3863208704900597, + "grad_norm": 0.10575102269649506, + "learning_rate": 9.737948181597176e-06, + "loss": 0.0041, + "step": 23610 + }, + { + "epoch": 0.3864844964411356, + "grad_norm": 0.1247902438044548, + "learning_rate": 9.73749178672877e-06, + "loss": 0.0025, + "step": 23620 + }, + { + "epoch": 0.3866481223922114, + "grad_norm": 0.08549071848392487, + "learning_rate": 9.737035005486653e-06, + "loss": 0.0024, + "step": 23630 + }, + { + "epoch": 0.38681174834328724, + "grad_norm": 0.0867099016904831, + "learning_rate": 9.73657783790808e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 0.3869753742943631, + "grad_norm": 0.08126109093427658, + "learning_rate": 9.73612028403033e-06, + "loss": 0.0036, + "step": 23650 + }, + { + "epoch": 0.3871390002454389, + "grad_norm": 0.08932679891586304, + "learning_rate": 9.735662343890723e-06, + "loss": 0.004, + "step": 23660 + }, + { + "epoch": 0.3873026261965148, + "grad_norm": 0.1253349781036377, + "learning_rate": 9.735204017526611e-06, + "loss": 0.0033, + "step": 23670 + }, + { + "epoch": 0.3874662521475906, + "grad_norm": 0.0405646376311779, + "learning_rate": 9.734745304975368e-06, + "loss": 0.0055, + "step": 23680 + }, + { + "epoch": 0.38762987809866645, + "grad_norm": 0.10145255923271179, + "learning_rate": 9.734286206274408e-06, + "loss": 0.0042, + "step": 23690 + }, + { + "epoch": 0.3877935040497423, + "grad_norm": 0.032671693712472916, + "learning_rate": 9.733826721461171e-06, + "loss": 0.003, + "step": 23700 + }, + { + "epoch": 0.3879571300008181, + "grad_norm": 0.016059959307312965, + "learning_rate": 9.733366850573133e-06, + "loss": 0.0034, + "step": 23710 + }, + { + "epoch": 0.388120755951894, + "grad_norm": 0.08102528750896454, + "learning_rate": 9.732906593647799e-06, + "loss": 0.0044, + "step": 23720 + }, + { + "epoch": 0.3882843819029698, + "grad_norm": 0.06434213370084763, + "learning_rate": 9.732445950722706e-06, + "loss": 0.0034, + "step": 23730 + }, + { + "epoch": 0.38844800785404565, + "grad_norm": 0.10787647217512131, + "learning_rate": 9.731984921835422e-06, + "loss": 0.0026, + "step": 23740 + }, + { + "epoch": 0.3886116338051215, + "grad_norm": 0.10055384039878845, + "learning_rate": 9.731523507023547e-06, + "loss": 0.0035, + "step": 23750 + }, + { + "epoch": 0.3887752597561973, + "grad_norm": 0.21888549625873566, + "learning_rate": 9.731061706324714e-06, + "loss": 0.0036, + "step": 23760 + }, + { + "epoch": 0.3889388857072732, + "grad_norm": 0.14533309638500214, + "learning_rate": 9.730599519776584e-06, + "loss": 0.0056, + "step": 23770 + }, + { + "epoch": 0.389102511658349, + "grad_norm": 0.2747025191783905, + "learning_rate": 9.730136947416853e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 0.38926613760942486, + "grad_norm": 0.1449516862630844, + "learning_rate": 9.729673989283244e-06, + "loss": 0.0042, + "step": 23790 + }, + { + "epoch": 0.3894297635605007, + "grad_norm": 0.17499899864196777, + "learning_rate": 9.729210645413518e-06, + "loss": 0.0061, + "step": 23800 + }, + { + "epoch": 0.3895933895115765, + "grad_norm": 0.0841960534453392, + "learning_rate": 9.728746915845461e-06, + "loss": 0.0039, + "step": 23810 + }, + { + "epoch": 0.3897570154626524, + "grad_norm": 0.10993972420692444, + "learning_rate": 9.728282800616896e-06, + "loss": 0.0062, + "step": 23820 + }, + { + "epoch": 0.38992064141372823, + "grad_norm": 0.06402769684791565, + "learning_rate": 9.72781829976567e-06, + "loss": 0.0045, + "step": 23830 + }, + { + "epoch": 0.39008426736480406, + "grad_norm": 0.046928927302360535, + "learning_rate": 9.72735341332967e-06, + "loss": 0.0025, + "step": 23840 + }, + { + "epoch": 0.3902478933158799, + "grad_norm": 0.04450773447751999, + "learning_rate": 9.72688814134681e-06, + "loss": 0.005, + "step": 23850 + }, + { + "epoch": 0.3904115192669557, + "grad_norm": 0.07732709497213364, + "learning_rate": 9.726422483855034e-06, + "loss": 0.0033, + "step": 23860 + }, + { + "epoch": 0.3905751452180316, + "grad_norm": 0.16074153780937195, + "learning_rate": 9.72595644089232e-06, + "loss": 0.004, + "step": 23870 + }, + { + "epoch": 0.39073877116910744, + "grad_norm": 0.24663259088993073, + "learning_rate": 9.725490012496682e-06, + "loss": 0.0058, + "step": 23880 + }, + { + "epoch": 0.39090239712018326, + "grad_norm": 0.3892887234687805, + "learning_rate": 9.725023198706154e-06, + "loss": 0.0044, + "step": 23890 + }, + { + "epoch": 0.3910660230712591, + "grad_norm": 0.17836634814739227, + "learning_rate": 9.724555999558809e-06, + "loss": 0.0032, + "step": 23900 + }, + { + "epoch": 0.3912296490223349, + "grad_norm": 0.016964320093393326, + "learning_rate": 9.724088415092752e-06, + "loss": 0.0036, + "step": 23910 + }, + { + "epoch": 0.3913932749734108, + "grad_norm": 0.07119446992874146, + "learning_rate": 9.723620445346116e-06, + "loss": 0.0027, + "step": 23920 + }, + { + "epoch": 0.39155690092448664, + "grad_norm": 0.04636238515377045, + "learning_rate": 9.723152090357066e-06, + "loss": 0.0023, + "step": 23930 + }, + { + "epoch": 0.39172052687556247, + "grad_norm": 0.08750925213098526, + "learning_rate": 9.722683350163804e-06, + "loss": 0.0046, + "step": 23940 + }, + { + "epoch": 0.3918841528266383, + "grad_norm": 0.18636532127857208, + "learning_rate": 9.722214224804555e-06, + "loss": 0.0051, + "step": 23950 + }, + { + "epoch": 0.39204777877771413, + "grad_norm": 0.2473328560590744, + "learning_rate": 9.72174471431758e-06, + "loss": 0.0044, + "step": 23960 + }, + { + "epoch": 0.39221140472878996, + "grad_norm": 0.37887436151504517, + "learning_rate": 9.721274818741171e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 0.39237503067986584, + "grad_norm": 0.09699217230081558, + "learning_rate": 9.720804538113651e-06, + "loss": 0.0032, + "step": 23980 + }, + { + "epoch": 0.3925386566309417, + "grad_norm": 0.024318208917975426, + "learning_rate": 9.720333872473373e-06, + "loss": 0.0056, + "step": 23990 + }, + { + "epoch": 0.3927022825820175, + "grad_norm": 0.09461870044469833, + "learning_rate": 9.719862821858726e-06, + "loss": 0.0037, + "step": 24000 + }, + { + "epoch": 0.39286590853309333, + "grad_norm": 0.12473174184560776, + "learning_rate": 9.719391386308125e-06, + "loss": 0.0043, + "step": 24010 + }, + { + "epoch": 0.39302953448416916, + "grad_norm": 0.2460101693868637, + "learning_rate": 9.71891956586002e-06, + "loss": 0.0036, + "step": 24020 + }, + { + "epoch": 0.39319316043524505, + "grad_norm": 0.0626598596572876, + "learning_rate": 9.71844736055289e-06, + "loss": 0.0042, + "step": 24030 + }, + { + "epoch": 0.3933567863863209, + "grad_norm": 0.06889753043651581, + "learning_rate": 9.717974770425246e-06, + "loss": 0.0032, + "step": 24040 + }, + { + "epoch": 0.3935204123373967, + "grad_norm": 0.18909382820129395, + "learning_rate": 9.717501795515632e-06, + "loss": 0.0045, + "step": 24050 + }, + { + "epoch": 0.39368403828847254, + "grad_norm": 0.09070835262537003, + "learning_rate": 9.717028435862623e-06, + "loss": 0.0022, + "step": 24060 + }, + { + "epoch": 0.39384766423954837, + "grad_norm": 0.11403269320726395, + "learning_rate": 9.716554691504822e-06, + "loss": 0.003, + "step": 24070 + }, + { + "epoch": 0.39401129019062425, + "grad_norm": 0.2606067657470703, + "learning_rate": 9.716080562480867e-06, + "loss": 0.0062, + "step": 24080 + }, + { + "epoch": 0.3941749161417001, + "grad_norm": 0.05964642018079758, + "learning_rate": 9.715606048829429e-06, + "loss": 0.0054, + "step": 24090 + }, + { + "epoch": 0.3943385420927759, + "grad_norm": 0.11207167059183121, + "learning_rate": 9.715131150589203e-06, + "loss": 0.0021, + "step": 24100 + }, + { + "epoch": 0.39450216804385174, + "grad_norm": 0.3053283095359802, + "learning_rate": 9.714655867798926e-06, + "loss": 0.0038, + "step": 24110 + }, + { + "epoch": 0.39466579399492757, + "grad_norm": 0.37912648916244507, + "learning_rate": 9.714180200497353e-06, + "loss": 0.0086, + "step": 24120 + }, + { + "epoch": 0.39482941994600346, + "grad_norm": 0.3171529471874237, + "learning_rate": 9.713704148723286e-06, + "loss": 0.004, + "step": 24130 + }, + { + "epoch": 0.3949930458970793, + "grad_norm": 0.20151226222515106, + "learning_rate": 9.713227712515543e-06, + "loss": 0.0034, + "step": 24140 + }, + { + "epoch": 0.3951566718481551, + "grad_norm": 0.3001769185066223, + "learning_rate": 9.712750891912986e-06, + "loss": 0.0041, + "step": 24150 + }, + { + "epoch": 0.39532029779923095, + "grad_norm": 0.15295594930648804, + "learning_rate": 9.712273686954498e-06, + "loss": 0.007, + "step": 24160 + }, + { + "epoch": 0.3954839237503068, + "grad_norm": 0.19454239308834076, + "learning_rate": 9.711796097679e-06, + "loss": 0.0029, + "step": 24170 + }, + { + "epoch": 0.39564754970138266, + "grad_norm": 0.09349364042282104, + "learning_rate": 9.711318124125445e-06, + "loss": 0.0031, + "step": 24180 + }, + { + "epoch": 0.3958111756524585, + "grad_norm": 0.12422489374876022, + "learning_rate": 9.710839766332814e-06, + "loss": 0.0032, + "step": 24190 + }, + { + "epoch": 0.3959748016035343, + "grad_norm": 0.11687396466732025, + "learning_rate": 9.710361024340118e-06, + "loss": 0.0035, + "step": 24200 + }, + { + "epoch": 0.39613842755461015, + "grad_norm": 0.05111733451485634, + "learning_rate": 9.709881898186403e-06, + "loss": 0.0045, + "step": 24210 + }, + { + "epoch": 0.396302053505686, + "grad_norm": 0.15105533599853516, + "learning_rate": 9.709402387910745e-06, + "loss": 0.0046, + "step": 24220 + }, + { + "epoch": 0.39646567945676187, + "grad_norm": 0.271156370639801, + "learning_rate": 9.70892249355225e-06, + "loss": 0.004, + "step": 24230 + }, + { + "epoch": 0.3966293054078377, + "grad_norm": 0.1677645742893219, + "learning_rate": 9.70844221515006e-06, + "loss": 0.0037, + "step": 24240 + }, + { + "epoch": 0.3967929313589135, + "grad_norm": 0.12438059598207474, + "learning_rate": 9.707961552743341e-06, + "loss": 0.0031, + "step": 24250 + }, + { + "epoch": 0.39695655730998936, + "grad_norm": 0.3922032415866852, + "learning_rate": 9.707480506371295e-06, + "loss": 0.0054, + "step": 24260 + }, + { + "epoch": 0.3971201832610652, + "grad_norm": 0.12613599002361298, + "learning_rate": 9.706999076073157e-06, + "loss": 0.0045, + "step": 24270 + }, + { + "epoch": 0.39728380921214107, + "grad_norm": 0.06518051028251648, + "learning_rate": 9.706517261888187e-06, + "loss": 0.003, + "step": 24280 + }, + { + "epoch": 0.3974474351632169, + "grad_norm": 0.041631996631622314, + "learning_rate": 9.706035063855682e-06, + "loss": 0.002, + "step": 24290 + }, + { + "epoch": 0.39761106111429273, + "grad_norm": 0.12702889740467072, + "learning_rate": 9.705552482014969e-06, + "loss": 0.0032, + "step": 24300 + }, + { + "epoch": 0.39777468706536856, + "grad_norm": 0.10419308394193649, + "learning_rate": 9.705069516405405e-06, + "loss": 0.0063, + "step": 24310 + }, + { + "epoch": 0.3979383130164444, + "grad_norm": 0.3529118299484253, + "learning_rate": 9.704586167066382e-06, + "loss": 0.0034, + "step": 24320 + }, + { + "epoch": 0.3981019389675203, + "grad_norm": 0.09571299701929092, + "learning_rate": 9.704102434037314e-06, + "loss": 0.0023, + "step": 24330 + }, + { + "epoch": 0.3982655649185961, + "grad_norm": 0.08594782650470734, + "learning_rate": 9.703618317357657e-06, + "loss": 0.0035, + "step": 24340 + }, + { + "epoch": 0.39842919086967193, + "grad_norm": 0.05259867385029793, + "learning_rate": 9.703133817066894e-06, + "loss": 0.0021, + "step": 24350 + }, + { + "epoch": 0.39859281682074776, + "grad_norm": 0.18884804844856262, + "learning_rate": 9.702648933204537e-06, + "loss": 0.0047, + "step": 24360 + }, + { + "epoch": 0.3987564427718236, + "grad_norm": 0.24906374514102936, + "learning_rate": 9.702163665810135e-06, + "loss": 0.0035, + "step": 24370 + }, + { + "epoch": 0.3989200687228995, + "grad_norm": 0.11324939131736755, + "learning_rate": 9.70167801492326e-06, + "loss": 0.003, + "step": 24380 + }, + { + "epoch": 0.3990836946739753, + "grad_norm": 0.11663619428873062, + "learning_rate": 9.701191980583524e-06, + "loss": 0.0034, + "step": 24390 + }, + { + "epoch": 0.39924732062505114, + "grad_norm": 0.3687373697757721, + "learning_rate": 9.700705562830566e-06, + "loss": 0.0037, + "step": 24400 + }, + { + "epoch": 0.39941094657612697, + "grad_norm": 0.07192040234804153, + "learning_rate": 9.700218761704054e-06, + "loss": 0.0022, + "step": 24410 + }, + { + "epoch": 0.3995745725272028, + "grad_norm": 0.03650715947151184, + "learning_rate": 9.699731577243692e-06, + "loss": 0.0046, + "step": 24420 + }, + { + "epoch": 0.39973819847827863, + "grad_norm": 0.18229758739471436, + "learning_rate": 9.69924400948921e-06, + "loss": 0.0045, + "step": 24430 + }, + { + "epoch": 0.3999018244293545, + "grad_norm": 0.3506583869457245, + "learning_rate": 9.698756058480378e-06, + "loss": 0.0021, + "step": 24440 + }, + { + "epoch": 0.40006545038043034, + "grad_norm": 0.18497774004936218, + "learning_rate": 9.698267724256988e-06, + "loss": 0.0034, + "step": 24450 + }, + { + "epoch": 0.4002290763315062, + "grad_norm": 0.12030255049467087, + "learning_rate": 9.697779006858866e-06, + "loss": 0.0037, + "step": 24460 + }, + { + "epoch": 0.400392702282582, + "grad_norm": 0.1044439971446991, + "learning_rate": 9.697289906325873e-06, + "loss": 0.0028, + "step": 24470 + }, + { + "epoch": 0.40055632823365783, + "grad_norm": 0.09643489122390747, + "learning_rate": 9.696800422697896e-06, + "loss": 0.004, + "step": 24480 + }, + { + "epoch": 0.4007199541847337, + "grad_norm": 0.08540601283311844, + "learning_rate": 9.696310556014856e-06, + "loss": 0.0054, + "step": 24490 + }, + { + "epoch": 0.40088358013580955, + "grad_norm": 0.11384651809930801, + "learning_rate": 9.695820306316705e-06, + "loss": 0.0033, + "step": 24500 + }, + { + "epoch": 0.4010472060868854, + "grad_norm": 0.024184083566069603, + "learning_rate": 9.695329673643427e-06, + "loss": 0.0034, + "step": 24510 + }, + { + "epoch": 0.4012108320379612, + "grad_norm": 0.06803640723228455, + "learning_rate": 9.694838658035034e-06, + "loss": 0.0025, + "step": 24520 + }, + { + "epoch": 0.40137445798903704, + "grad_norm": 0.059210050851106644, + "learning_rate": 9.694347259531576e-06, + "loss": 0.0035, + "step": 24530 + }, + { + "epoch": 0.4015380839401129, + "grad_norm": 0.22898975014686584, + "learning_rate": 9.693855478173127e-06, + "loss": 0.0063, + "step": 24540 + }, + { + "epoch": 0.40170170989118875, + "grad_norm": 0.04901501163840294, + "learning_rate": 9.693363313999792e-06, + "loss": 0.0039, + "step": 24550 + }, + { + "epoch": 0.4018653358422646, + "grad_norm": 0.3988078832626343, + "learning_rate": 9.692870767051717e-06, + "loss": 0.0044, + "step": 24560 + }, + { + "epoch": 0.4020289617933404, + "grad_norm": 0.06453163921833038, + "learning_rate": 9.692377837369066e-06, + "loss": 0.0035, + "step": 24570 + }, + { + "epoch": 0.40219258774441624, + "grad_norm": 0.1123410314321518, + "learning_rate": 9.691884524992045e-06, + "loss": 0.0035, + "step": 24580 + }, + { + "epoch": 0.4023562136954921, + "grad_norm": 0.20643159747123718, + "learning_rate": 9.691390829960886e-06, + "loss": 0.0029, + "step": 24590 + }, + { + "epoch": 0.40251983964656796, + "grad_norm": 0.27239346504211426, + "learning_rate": 9.690896752315851e-06, + "loss": 0.0041, + "step": 24600 + }, + { + "epoch": 0.4026834655976438, + "grad_norm": 0.08087146282196045, + "learning_rate": 9.690402292097237e-06, + "loss": 0.0035, + "step": 24610 + }, + { + "epoch": 0.4028470915487196, + "grad_norm": 0.0781392976641655, + "learning_rate": 9.689907449345369e-06, + "loss": 0.0016, + "step": 24620 + }, + { + "epoch": 0.40301071749979545, + "grad_norm": 0.16768626868724823, + "learning_rate": 9.689412224100607e-06, + "loss": 0.0025, + "step": 24630 + }, + { + "epoch": 0.40317434345087133, + "grad_norm": 0.37064576148986816, + "learning_rate": 9.688916616403338e-06, + "loss": 0.0059, + "step": 24640 + }, + { + "epoch": 0.40333796940194716, + "grad_norm": 0.051520735025405884, + "learning_rate": 9.688420626293984e-06, + "loss": 0.004, + "step": 24650 + }, + { + "epoch": 0.403501595353023, + "grad_norm": 0.06874562799930573, + "learning_rate": 9.687924253812994e-06, + "loss": 0.0037, + "step": 24660 + }, + { + "epoch": 0.4036652213040988, + "grad_norm": 0.0444541834294796, + "learning_rate": 9.687427499000852e-06, + "loss": 0.005, + "step": 24670 + }, + { + "epoch": 0.40382884725517465, + "grad_norm": 0.26220789551734924, + "learning_rate": 9.686930361898073e-06, + "loss": 0.0033, + "step": 24680 + }, + { + "epoch": 0.40399247320625054, + "grad_norm": 0.12733012437820435, + "learning_rate": 9.686432842545197e-06, + "loss": 0.0039, + "step": 24690 + }, + { + "epoch": 0.40415609915732637, + "grad_norm": 0.08577050268650055, + "learning_rate": 9.685934940982806e-06, + "loss": 0.0041, + "step": 24700 + }, + { + "epoch": 0.4043197251084022, + "grad_norm": 0.14089174568653107, + "learning_rate": 9.685436657251501e-06, + "loss": 0.0036, + "step": 24710 + }, + { + "epoch": 0.404483351059478, + "grad_norm": 0.19395211338996887, + "learning_rate": 9.684937991391924e-06, + "loss": 0.0034, + "step": 24720 + }, + { + "epoch": 0.40464697701055385, + "grad_norm": 0.15160194039344788, + "learning_rate": 9.684438943444747e-06, + "loss": 0.0032, + "step": 24730 + }, + { + "epoch": 0.40481060296162974, + "grad_norm": 0.06268467754125595, + "learning_rate": 9.683939513450665e-06, + "loss": 0.0042, + "step": 24740 + }, + { + "epoch": 0.40497422891270557, + "grad_norm": 0.0942181795835495, + "learning_rate": 9.683439701450413e-06, + "loss": 0.0048, + "step": 24750 + }, + { + "epoch": 0.4051378548637814, + "grad_norm": 0.20840847492218018, + "learning_rate": 9.682939507484754e-06, + "loss": 0.0043, + "step": 24760 + }, + { + "epoch": 0.40530148081485723, + "grad_norm": 0.14479120075702667, + "learning_rate": 9.68243893159448e-06, + "loss": 0.0026, + "step": 24770 + }, + { + "epoch": 0.40546510676593306, + "grad_norm": 0.09427852183580399, + "learning_rate": 9.68193797382042e-06, + "loss": 0.0035, + "step": 24780 + }, + { + "epoch": 0.40562873271700894, + "grad_norm": 0.1335517019033432, + "learning_rate": 9.681436634203426e-06, + "loss": 0.0045, + "step": 24790 + }, + { + "epoch": 0.4057923586680848, + "grad_norm": 0.12426796555519104, + "learning_rate": 9.680934912784388e-06, + "loss": 0.0037, + "step": 24800 + }, + { + "epoch": 0.4059559846191606, + "grad_norm": 0.3622891902923584, + "learning_rate": 9.680432809604225e-06, + "loss": 0.0026, + "step": 24810 + }, + { + "epoch": 0.40611961057023643, + "grad_norm": 0.09143351763486862, + "learning_rate": 9.679930324703888e-06, + "loss": 0.0036, + "step": 24820 + }, + { + "epoch": 0.40628323652131226, + "grad_norm": 0.26484695076942444, + "learning_rate": 9.679427458124352e-06, + "loss": 0.0058, + "step": 24830 + }, + { + "epoch": 0.40644686247238815, + "grad_norm": 0.1457994282245636, + "learning_rate": 9.678924209906637e-06, + "loss": 0.0045, + "step": 24840 + }, + { + "epoch": 0.406610488423464, + "grad_norm": 0.19691668450832367, + "learning_rate": 9.678420580091781e-06, + "loss": 0.0026, + "step": 24850 + }, + { + "epoch": 0.4067741143745398, + "grad_norm": 0.06473774462938309, + "learning_rate": 9.67791656872086e-06, + "loss": 0.003, + "step": 24860 + }, + { + "epoch": 0.40693774032561564, + "grad_norm": 0.3390854597091675, + "learning_rate": 9.67741217583498e-06, + "loss": 0.0044, + "step": 24870 + }, + { + "epoch": 0.40710136627669147, + "grad_norm": 0.0831959918141365, + "learning_rate": 9.676907401475277e-06, + "loss": 0.0038, + "step": 24880 + }, + { + "epoch": 0.4072649922277673, + "grad_norm": 0.05004332587122917, + "learning_rate": 9.676402245682916e-06, + "loss": 0.0045, + "step": 24890 + }, + { + "epoch": 0.4074286181788432, + "grad_norm": 0.12332847714424133, + "learning_rate": 9.675896708499102e-06, + "loss": 0.0028, + "step": 24900 + }, + { + "epoch": 0.407592244129919, + "grad_norm": 0.26546967029571533, + "learning_rate": 9.675390789965058e-06, + "loss": 0.0027, + "step": 24910 + }, + { + "epoch": 0.40775587008099484, + "grad_norm": 0.09480591863393784, + "learning_rate": 9.67488449012205e-06, + "loss": 0.0032, + "step": 24920 + }, + { + "epoch": 0.40791949603207067, + "grad_norm": 0.11836016178131104, + "learning_rate": 9.674377809011368e-06, + "loss": 0.0049, + "step": 24930 + }, + { + "epoch": 0.4080831219831465, + "grad_norm": 0.2503274083137512, + "learning_rate": 9.673870746674336e-06, + "loss": 0.0039, + "step": 24940 + }, + { + "epoch": 0.4082467479342224, + "grad_norm": 0.115508534014225, + "learning_rate": 9.673363303152306e-06, + "loss": 0.0043, + "step": 24950 + }, + { + "epoch": 0.4084103738852982, + "grad_norm": 0.16050244867801666, + "learning_rate": 9.672855478486668e-06, + "loss": 0.0038, + "step": 24960 + }, + { + "epoch": 0.40857399983637405, + "grad_norm": 0.05900110304355621, + "learning_rate": 9.672347272718835e-06, + "loss": 0.0057, + "step": 24970 + }, + { + "epoch": 0.4087376257874499, + "grad_norm": 0.21688447892665863, + "learning_rate": 9.671838685890252e-06, + "loss": 0.0044, + "step": 24980 + }, + { + "epoch": 0.4089012517385257, + "grad_norm": 0.08749242126941681, + "learning_rate": 9.671329718042404e-06, + "loss": 0.0037, + "step": 24990 + }, + { + "epoch": 0.4090648776896016, + "grad_norm": 0.14104853570461273, + "learning_rate": 9.670820369216795e-06, + "loss": 0.0029, + "step": 25000 + }, + { + "epoch": 0.4092285036406774, + "grad_norm": 0.08482057601213455, + "learning_rate": 9.670310639454969e-06, + "loss": 0.0057, + "step": 25010 + }, + { + "epoch": 0.40939212959175325, + "grad_norm": 0.062256261706352234, + "learning_rate": 9.669800528798498e-06, + "loss": 0.0038, + "step": 25020 + }, + { + "epoch": 0.4095557555428291, + "grad_norm": 0.16285395622253418, + "learning_rate": 9.669290037288984e-06, + "loss": 0.0035, + "step": 25030 + }, + { + "epoch": 0.4097193814939049, + "grad_norm": 0.029890358448028564, + "learning_rate": 9.668779164968061e-06, + "loss": 0.0052, + "step": 25040 + }, + { + "epoch": 0.4098830074449808, + "grad_norm": 0.07181970030069351, + "learning_rate": 9.668267911877392e-06, + "loss": 0.0036, + "step": 25050 + }, + { + "epoch": 0.4100466333960566, + "grad_norm": 0.13503605127334595, + "learning_rate": 9.667756278058676e-06, + "loss": 0.0054, + "step": 25060 + }, + { + "epoch": 0.41021025934713246, + "grad_norm": 0.2239055335521698, + "learning_rate": 9.667244263553639e-06, + "loss": 0.0026, + "step": 25070 + }, + { + "epoch": 0.4103738852982083, + "grad_norm": 0.06981676816940308, + "learning_rate": 9.66673186840404e-06, + "loss": 0.0037, + "step": 25080 + }, + { + "epoch": 0.4105375112492841, + "grad_norm": 0.045035943388938904, + "learning_rate": 9.666219092651667e-06, + "loss": 0.0033, + "step": 25090 + }, + { + "epoch": 0.41070113720036, + "grad_norm": 0.04974318668246269, + "learning_rate": 9.66570593633834e-06, + "loss": 0.0039, + "step": 25100 + }, + { + "epoch": 0.41086476315143583, + "grad_norm": 0.20097310841083527, + "learning_rate": 9.665192399505912e-06, + "loss": 0.0036, + "step": 25110 + }, + { + "epoch": 0.41102838910251166, + "grad_norm": 0.5315530300140381, + "learning_rate": 9.664678482196264e-06, + "loss": 0.0038, + "step": 25120 + }, + { + "epoch": 0.4111920150535875, + "grad_norm": 0.11775261908769608, + "learning_rate": 9.664164184451309e-06, + "loss": 0.0027, + "step": 25130 + }, + { + "epoch": 0.4113556410046633, + "grad_norm": 0.04819796234369278, + "learning_rate": 9.663649506312991e-06, + "loss": 0.0061, + "step": 25140 + }, + { + "epoch": 0.4115192669557392, + "grad_norm": 0.12614324688911438, + "learning_rate": 9.663134447823287e-06, + "loss": 0.0032, + "step": 25150 + }, + { + "epoch": 0.41168289290681503, + "grad_norm": 0.19738654792308807, + "learning_rate": 9.662619009024203e-06, + "loss": 0.0032, + "step": 25160 + }, + { + "epoch": 0.41184651885789086, + "grad_norm": 0.11247184127569199, + "learning_rate": 9.662103189957777e-06, + "loss": 0.006, + "step": 25170 + }, + { + "epoch": 0.4120101448089667, + "grad_norm": 0.05924522876739502, + "learning_rate": 9.661586990666077e-06, + "loss": 0.004, + "step": 25180 + }, + { + "epoch": 0.4121737707600425, + "grad_norm": 0.08525634557008743, + "learning_rate": 9.6610704111912e-06, + "loss": 0.0034, + "step": 25190 + }, + { + "epoch": 0.4123373967111184, + "grad_norm": 0.06503406912088394, + "learning_rate": 9.66055345157528e-06, + "loss": 0.0063, + "step": 25200 + }, + { + "epoch": 0.41250102266219424, + "grad_norm": 0.06928110122680664, + "learning_rate": 9.660036111860478e-06, + "loss": 0.0052, + "step": 25210 + }, + { + "epoch": 0.41266464861327007, + "grad_norm": 0.1879051774740219, + "learning_rate": 9.659518392088985e-06, + "loss": 0.0024, + "step": 25220 + }, + { + "epoch": 0.4128282745643459, + "grad_norm": 0.1871768981218338, + "learning_rate": 9.659000292303023e-06, + "loss": 0.0043, + "step": 25230 + }, + { + "epoch": 0.41299190051542173, + "grad_norm": 0.09390852600336075, + "learning_rate": 9.658481812544851e-06, + "loss": 0.0027, + "step": 25240 + }, + { + "epoch": 0.4131555264664976, + "grad_norm": 0.16996236145496368, + "learning_rate": 9.65796295285675e-06, + "loss": 0.0034, + "step": 25250 + }, + { + "epoch": 0.41331915241757344, + "grad_norm": 0.18050551414489746, + "learning_rate": 9.65744371328104e-06, + "loss": 0.0039, + "step": 25260 + }, + { + "epoch": 0.4134827783686493, + "grad_norm": 0.19516703486442566, + "learning_rate": 9.656924093860064e-06, + "loss": 0.0025, + "step": 25270 + }, + { + "epoch": 0.4136464043197251, + "grad_norm": 0.12207509577274323, + "learning_rate": 9.656404094636207e-06, + "loss": 0.0041, + "step": 25280 + }, + { + "epoch": 0.41381003027080093, + "grad_norm": 0.15849445760250092, + "learning_rate": 9.655883715651873e-06, + "loss": 0.0033, + "step": 25290 + }, + { + "epoch": 0.41397365622187676, + "grad_norm": 0.15875327587127686, + "learning_rate": 9.655362956949503e-06, + "loss": 0.0035, + "step": 25300 + }, + { + "epoch": 0.41413728217295265, + "grad_norm": 0.12671710550785065, + "learning_rate": 9.654841818571568e-06, + "loss": 0.0029, + "step": 25310 + }, + { + "epoch": 0.4143009081240285, + "grad_norm": 0.12516403198242188, + "learning_rate": 9.654320300560573e-06, + "loss": 0.0044, + "step": 25320 + }, + { + "epoch": 0.4144645340751043, + "grad_norm": 0.012723371386528015, + "learning_rate": 9.65379840295905e-06, + "loss": 0.0052, + "step": 25330 + }, + { + "epoch": 0.41462816002618014, + "grad_norm": 0.23379164934158325, + "learning_rate": 9.653276125809564e-06, + "loss": 0.0047, + "step": 25340 + }, + { + "epoch": 0.41479178597725597, + "grad_norm": 0.03316957876086235, + "learning_rate": 9.652753469154707e-06, + "loss": 0.0032, + "step": 25350 + }, + { + "epoch": 0.41495541192833185, + "grad_norm": 0.05698050558567047, + "learning_rate": 9.652230433037106e-06, + "loss": 0.0031, + "step": 25360 + }, + { + "epoch": 0.4151190378794077, + "grad_norm": 0.08126674592494965, + "learning_rate": 9.651707017499421e-06, + "loss": 0.0042, + "step": 25370 + }, + { + "epoch": 0.4152826638304835, + "grad_norm": 0.04867204651236534, + "learning_rate": 9.651183222584338e-06, + "loss": 0.0034, + "step": 25380 + }, + { + "epoch": 0.41544628978155934, + "grad_norm": 0.08237331360578537, + "learning_rate": 9.650659048334577e-06, + "loss": 0.0029, + "step": 25390 + }, + { + "epoch": 0.41560991573263517, + "grad_norm": 0.041710592806339264, + "learning_rate": 9.650134494792884e-06, + "loss": 0.0029, + "step": 25400 + }, + { + "epoch": 0.41577354168371106, + "grad_norm": 0.030708497390151024, + "learning_rate": 9.649609562002045e-06, + "loss": 0.0036, + "step": 25410 + }, + { + "epoch": 0.4159371676347869, + "grad_norm": 0.059733130037784576, + "learning_rate": 9.64908425000487e-06, + "loss": 0.0036, + "step": 25420 + }, + { + "epoch": 0.4161007935858627, + "grad_norm": 0.06851238012313843, + "learning_rate": 9.648558558844198e-06, + "loss": 0.002, + "step": 25430 + }, + { + "epoch": 0.41626441953693855, + "grad_norm": 0.15898770093917847, + "learning_rate": 9.64803248856291e-06, + "loss": 0.0047, + "step": 25440 + }, + { + "epoch": 0.4164280454880144, + "grad_norm": 0.12070489674806595, + "learning_rate": 9.647506039203902e-06, + "loss": 0.0044, + "step": 25450 + }, + { + "epoch": 0.41659167143909026, + "grad_norm": 0.037828654050827026, + "learning_rate": 9.646979210810117e-06, + "loss": 0.0042, + "step": 25460 + }, + { + "epoch": 0.4167552973901661, + "grad_norm": 0.06421557813882828, + "learning_rate": 9.646452003424515e-06, + "loss": 0.0037, + "step": 25470 + }, + { + "epoch": 0.4169189233412419, + "grad_norm": 0.08535627275705338, + "learning_rate": 9.645924417090098e-06, + "loss": 0.0023, + "step": 25480 + }, + { + "epoch": 0.41708254929231775, + "grad_norm": 0.1495044231414795, + "learning_rate": 9.645396451849893e-06, + "loss": 0.0038, + "step": 25490 + }, + { + "epoch": 0.4172461752433936, + "grad_norm": 0.1131482869386673, + "learning_rate": 9.644868107746957e-06, + "loss": 0.0043, + "step": 25500 + }, + { + "epoch": 0.41740980119446947, + "grad_norm": 0.03557871654629707, + "learning_rate": 9.644339384824381e-06, + "loss": 0.0037, + "step": 25510 + }, + { + "epoch": 0.4175734271455453, + "grad_norm": 0.11914899200201035, + "learning_rate": 9.643810283125287e-06, + "loss": 0.0031, + "step": 25520 + }, + { + "epoch": 0.4177370530966211, + "grad_norm": 0.4008508026599884, + "learning_rate": 9.643280802692827e-06, + "loss": 0.0037, + "step": 25530 + }, + { + "epoch": 0.41790067904769695, + "grad_norm": 0.15735241770744324, + "learning_rate": 9.64275094357018e-06, + "loss": 0.004, + "step": 25540 + }, + { + "epoch": 0.4180643049987728, + "grad_norm": 0.06464161723852158, + "learning_rate": 9.642220705800565e-06, + "loss": 0.0031, + "step": 25550 + }, + { + "epoch": 0.41822793094984867, + "grad_norm": 0.22424744069576263, + "learning_rate": 9.641690089427222e-06, + "loss": 0.0038, + "step": 25560 + }, + { + "epoch": 0.4183915569009245, + "grad_norm": 0.28786009550094604, + "learning_rate": 9.641159094493428e-06, + "loss": 0.0029, + "step": 25570 + }, + { + "epoch": 0.41855518285200033, + "grad_norm": 0.04271455854177475, + "learning_rate": 9.640627721042488e-06, + "loss": 0.0035, + "step": 25580 + }, + { + "epoch": 0.41871880880307616, + "grad_norm": 0.023068362846970558, + "learning_rate": 9.64009596911774e-06, + "loss": 0.0043, + "step": 25590 + }, + { + "epoch": 0.418882434754152, + "grad_norm": 0.23619158565998077, + "learning_rate": 9.639563838762552e-06, + "loss": 0.0066, + "step": 25600 + }, + { + "epoch": 0.4190460607052279, + "grad_norm": 0.1617545187473297, + "learning_rate": 9.639031330020323e-06, + "loss": 0.0026, + "step": 25610 + }, + { + "epoch": 0.4192096866563037, + "grad_norm": 0.6730504035949707, + "learning_rate": 9.63849844293448e-06, + "loss": 0.0065, + "step": 25620 + }, + { + "epoch": 0.41937331260737953, + "grad_norm": 0.10138659924268723, + "learning_rate": 9.637965177548488e-06, + "loss": 0.0038, + "step": 25630 + }, + { + "epoch": 0.41953693855845536, + "grad_norm": 0.04692848399281502, + "learning_rate": 9.637431533905834e-06, + "loss": 0.0039, + "step": 25640 + }, + { + "epoch": 0.4197005645095312, + "grad_norm": 0.1432286500930786, + "learning_rate": 9.636897512050044e-06, + "loss": 0.0029, + "step": 25650 + }, + { + "epoch": 0.4198641904606071, + "grad_norm": 0.14014388620853424, + "learning_rate": 9.636363112024668e-06, + "loss": 0.0035, + "step": 25660 + }, + { + "epoch": 0.4200278164116829, + "grad_norm": 0.06834814697504044, + "learning_rate": 9.635828333873291e-06, + "loss": 0.0044, + "step": 25670 + }, + { + "epoch": 0.42019144236275874, + "grad_norm": 0.1365925818681717, + "learning_rate": 9.635293177639526e-06, + "loss": 0.0031, + "step": 25680 + }, + { + "epoch": 0.42035506831383457, + "grad_norm": 0.04953382536768913, + "learning_rate": 9.634757643367023e-06, + "loss": 0.0031, + "step": 25690 + }, + { + "epoch": 0.4205186942649104, + "grad_norm": 0.09412840753793716, + "learning_rate": 9.634221731099454e-06, + "loss": 0.0021, + "step": 25700 + }, + { + "epoch": 0.4206823202159863, + "grad_norm": 0.28074344992637634, + "learning_rate": 9.633685440880527e-06, + "loss": 0.0037, + "step": 25710 + }, + { + "epoch": 0.4208459461670621, + "grad_norm": 0.10350345820188522, + "learning_rate": 9.633148772753981e-06, + "loss": 0.0026, + "step": 25720 + }, + { + "epoch": 0.42100957211813794, + "grad_norm": 0.2897213399410248, + "learning_rate": 9.632611726763584e-06, + "loss": 0.0039, + "step": 25730 + }, + { + "epoch": 0.42117319806921377, + "grad_norm": 0.12776196002960205, + "learning_rate": 9.632074302953135e-06, + "loss": 0.0025, + "step": 25740 + }, + { + "epoch": 0.4213368240202896, + "grad_norm": 0.10490020364522934, + "learning_rate": 9.63153650136647e-06, + "loss": 0.0028, + "step": 25750 + }, + { + "epoch": 0.42150044997136543, + "grad_norm": 0.20296016335487366, + "learning_rate": 9.630998322047442e-06, + "loss": 0.004, + "step": 25760 + }, + { + "epoch": 0.4216640759224413, + "grad_norm": 0.6429498791694641, + "learning_rate": 9.630459765039948e-06, + "loss": 0.0032, + "step": 25770 + }, + { + "epoch": 0.42182770187351715, + "grad_norm": 0.10263705253601074, + "learning_rate": 9.629920830387908e-06, + "loss": 0.0037, + "step": 25780 + }, + { + "epoch": 0.421991327824593, + "grad_norm": 0.09050842374563217, + "learning_rate": 9.62938151813528e-06, + "loss": 0.0034, + "step": 25790 + }, + { + "epoch": 0.4221549537756688, + "grad_norm": 0.06890574097633362, + "learning_rate": 9.628841828326046e-06, + "loss": 0.005, + "step": 25800 + }, + { + "epoch": 0.42231857972674464, + "grad_norm": 0.42209726572036743, + "learning_rate": 9.628301761004219e-06, + "loss": 0.0043, + "step": 25810 + }, + { + "epoch": 0.4224822056778205, + "grad_norm": 0.04127822816371918, + "learning_rate": 9.627761316213848e-06, + "loss": 0.0026, + "step": 25820 + }, + { + "epoch": 0.42264583162889635, + "grad_norm": 0.09475449472665787, + "learning_rate": 9.627220493999008e-06, + "loss": 0.0032, + "step": 25830 + }, + { + "epoch": 0.4228094575799722, + "grad_norm": 0.07014153897762299, + "learning_rate": 9.626679294403809e-06, + "loss": 0.0039, + "step": 25840 + }, + { + "epoch": 0.422973083531048, + "grad_norm": 0.08209510147571564, + "learning_rate": 9.626137717472387e-06, + "loss": 0.0054, + "step": 25850 + }, + { + "epoch": 0.42313670948212384, + "grad_norm": 0.08755461871623993, + "learning_rate": 9.625595763248915e-06, + "loss": 0.0029, + "step": 25860 + }, + { + "epoch": 0.4233003354331997, + "grad_norm": 0.17248448729515076, + "learning_rate": 9.62505343177759e-06, + "loss": 0.0037, + "step": 25870 + }, + { + "epoch": 0.42346396138427556, + "grad_norm": 0.19342680275440216, + "learning_rate": 9.62451072310264e-06, + "loss": 0.0031, + "step": 25880 + }, + { + "epoch": 0.4236275873353514, + "grad_norm": 0.06711114943027496, + "learning_rate": 9.62396763726833e-06, + "loss": 0.0024, + "step": 25890 + }, + { + "epoch": 0.4237912132864272, + "grad_norm": 0.07085072994232178, + "learning_rate": 9.623424174318953e-06, + "loss": 0.0039, + "step": 25900 + }, + { + "epoch": 0.42395483923750305, + "grad_norm": 0.1373291313648224, + "learning_rate": 9.62288033429883e-06, + "loss": 0.0034, + "step": 25910 + }, + { + "epoch": 0.42411846518857893, + "grad_norm": 0.12586650252342224, + "learning_rate": 9.622336117252314e-06, + "loss": 0.0034, + "step": 25920 + }, + { + "epoch": 0.42428209113965476, + "grad_norm": 0.266230046749115, + "learning_rate": 9.621791523223792e-06, + "loss": 0.0041, + "step": 25930 + }, + { + "epoch": 0.4244457170907306, + "grad_norm": 0.06697499006986618, + "learning_rate": 9.62124655225768e-06, + "loss": 0.0043, + "step": 25940 + }, + { + "epoch": 0.4246093430418064, + "grad_norm": 0.01914454810321331, + "learning_rate": 9.620701204398419e-06, + "loss": 0.0019, + "step": 25950 + }, + { + "epoch": 0.42477296899288225, + "grad_norm": 0.11346393823623657, + "learning_rate": 9.62015547969049e-06, + "loss": 0.0035, + "step": 25960 + }, + { + "epoch": 0.42493659494395813, + "grad_norm": 0.1259746253490448, + "learning_rate": 9.619609378178398e-06, + "loss": 0.0027, + "step": 25970 + }, + { + "epoch": 0.42510022089503396, + "grad_norm": 0.14006903767585754, + "learning_rate": 9.619062899906684e-06, + "loss": 0.0026, + "step": 25980 + }, + { + "epoch": 0.4252638468461098, + "grad_norm": 0.09713710099458694, + "learning_rate": 9.618516044919914e-06, + "loss": 0.0021, + "step": 25990 + }, + { + "epoch": 0.4254274727971856, + "grad_norm": 0.19937124848365784, + "learning_rate": 9.61796881326269e-06, + "loss": 0.0029, + "step": 26000 + }, + { + "epoch": 0.42559109874826145, + "grad_norm": 0.15069615840911865, + "learning_rate": 9.617421204979642e-06, + "loss": 0.0052, + "step": 26010 + }, + { + "epoch": 0.42575472469933734, + "grad_norm": 0.21594305336475372, + "learning_rate": 9.616873220115429e-06, + "loss": 0.0044, + "step": 26020 + }, + { + "epoch": 0.42591835065041317, + "grad_norm": 0.17940425872802734, + "learning_rate": 9.616324858714743e-06, + "loss": 0.004, + "step": 26030 + }, + { + "epoch": 0.426081976601489, + "grad_norm": 0.11648151278495789, + "learning_rate": 9.61577612082231e-06, + "loss": 0.0027, + "step": 26040 + }, + { + "epoch": 0.42624560255256483, + "grad_norm": 0.17684414982795715, + "learning_rate": 9.61522700648288e-06, + "loss": 0.0041, + "step": 26050 + }, + { + "epoch": 0.42640922850364066, + "grad_norm": 0.027457231655716896, + "learning_rate": 9.614677515741238e-06, + "loss": 0.0028, + "step": 26060 + }, + { + "epoch": 0.42657285445471654, + "grad_norm": 0.06978648155927658, + "learning_rate": 9.614127648642197e-06, + "loss": 0.003, + "step": 26070 + }, + { + "epoch": 0.4267364804057924, + "grad_norm": 0.21632900834083557, + "learning_rate": 9.613577405230605e-06, + "loss": 0.0046, + "step": 26080 + }, + { + "epoch": 0.4269001063568682, + "grad_norm": 0.03772740811109543, + "learning_rate": 9.613026785551336e-06, + "loss": 0.004, + "step": 26090 + }, + { + "epoch": 0.42706373230794403, + "grad_norm": 0.057381242513656616, + "learning_rate": 9.612475789649297e-06, + "loss": 0.0035, + "step": 26100 + }, + { + "epoch": 0.42722735825901986, + "grad_norm": 0.10717799514532089, + "learning_rate": 9.611924417569424e-06, + "loss": 0.0026, + "step": 26110 + }, + { + "epoch": 0.42739098421009575, + "grad_norm": 0.185784250497818, + "learning_rate": 9.61137266935669e-06, + "loss": 0.0044, + "step": 26120 + }, + { + "epoch": 0.4275546101611716, + "grad_norm": 0.45703235268592834, + "learning_rate": 9.610820545056089e-06, + "loss": 0.0036, + "step": 26130 + }, + { + "epoch": 0.4277182361122474, + "grad_norm": 0.11393826454877853, + "learning_rate": 9.610268044712651e-06, + "loss": 0.0039, + "step": 26140 + }, + { + "epoch": 0.42788186206332324, + "grad_norm": 0.1036711037158966, + "learning_rate": 9.609715168371439e-06, + "loss": 0.0024, + "step": 26150 + }, + { + "epoch": 0.42804548801439907, + "grad_norm": 0.07201945036649704, + "learning_rate": 9.609161916077538e-06, + "loss": 0.0035, + "step": 26160 + }, + { + "epoch": 0.4282091139654749, + "grad_norm": 0.16630186140537262, + "learning_rate": 9.608608287876075e-06, + "loss": 0.0048, + "step": 26170 + }, + { + "epoch": 0.4283727399165508, + "grad_norm": 0.09769545495510101, + "learning_rate": 9.608054283812199e-06, + "loss": 0.0033, + "step": 26180 + }, + { + "epoch": 0.4285363658676266, + "grad_norm": 0.11515926569700241, + "learning_rate": 9.607499903931094e-06, + "loss": 0.0032, + "step": 26190 + }, + { + "epoch": 0.42869999181870244, + "grad_norm": 0.01391480304300785, + "learning_rate": 9.606945148277974e-06, + "loss": 0.0031, + "step": 26200 + }, + { + "epoch": 0.42886361776977827, + "grad_norm": 0.2259417474269867, + "learning_rate": 9.606390016898081e-06, + "loss": 0.0032, + "step": 26210 + }, + { + "epoch": 0.4290272437208541, + "grad_norm": 0.045059408992528915, + "learning_rate": 9.605834509836688e-06, + "loss": 0.0044, + "step": 26220 + }, + { + "epoch": 0.42919086967193, + "grad_norm": 0.05494910478591919, + "learning_rate": 9.605278627139107e-06, + "loss": 0.0027, + "step": 26230 + }, + { + "epoch": 0.4293544956230058, + "grad_norm": 0.08120948821306229, + "learning_rate": 9.604722368850668e-06, + "loss": 0.0027, + "step": 26240 + }, + { + "epoch": 0.42951812157408165, + "grad_norm": 0.12476938962936401, + "learning_rate": 9.60416573501674e-06, + "loss": 0.0039, + "step": 26250 + }, + { + "epoch": 0.4296817475251575, + "grad_norm": 0.256864994764328, + "learning_rate": 9.603608725682717e-06, + "loss": 0.0059, + "step": 26260 + }, + { + "epoch": 0.4298453734762333, + "grad_norm": 0.09119724482297897, + "learning_rate": 9.603051340894031e-06, + "loss": 0.0023, + "step": 26270 + }, + { + "epoch": 0.4300089994273092, + "grad_norm": 0.040157802402973175, + "learning_rate": 9.602493580696137e-06, + "loss": 0.0025, + "step": 26280 + }, + { + "epoch": 0.430172625378385, + "grad_norm": 0.12991248071193695, + "learning_rate": 9.601935445134528e-06, + "loss": 0.0041, + "step": 26290 + }, + { + "epoch": 0.43033625132946085, + "grad_norm": 0.040545277297496796, + "learning_rate": 9.60137693425472e-06, + "loss": 0.0028, + "step": 26300 + }, + { + "epoch": 0.4304998772805367, + "grad_norm": 0.15685582160949707, + "learning_rate": 9.600818048102265e-06, + "loss": 0.0047, + "step": 26310 + }, + { + "epoch": 0.4306635032316125, + "grad_norm": 0.3727242052555084, + "learning_rate": 9.600258786722743e-06, + "loss": 0.0035, + "step": 26320 + }, + { + "epoch": 0.4308271291826884, + "grad_norm": 0.2849130928516388, + "learning_rate": 9.599699150161765e-06, + "loss": 0.0038, + "step": 26330 + }, + { + "epoch": 0.4309907551337642, + "grad_norm": 0.1869385689496994, + "learning_rate": 9.599139138464975e-06, + "loss": 0.0057, + "step": 26340 + }, + { + "epoch": 0.43115438108484005, + "grad_norm": 0.19602486491203308, + "learning_rate": 9.598578751678042e-06, + "loss": 0.0026, + "step": 26350 + }, + { + "epoch": 0.4313180070359159, + "grad_norm": 0.1786963790655136, + "learning_rate": 9.598017989846675e-06, + "loss": 0.004, + "step": 26360 + }, + { + "epoch": 0.4314816329869917, + "grad_norm": 0.10797512531280518, + "learning_rate": 9.597456853016602e-06, + "loss": 0.0031, + "step": 26370 + }, + { + "epoch": 0.4316452589380676, + "grad_norm": 0.049385931342840195, + "learning_rate": 9.596895341233592e-06, + "loss": 0.0041, + "step": 26380 + }, + { + "epoch": 0.43180888488914343, + "grad_norm": 0.11708057671785355, + "learning_rate": 9.596333454543436e-06, + "loss": 0.0032, + "step": 26390 + }, + { + "epoch": 0.43197251084021926, + "grad_norm": 0.2220579832792282, + "learning_rate": 9.595771192991962e-06, + "loss": 0.0035, + "step": 26400 + }, + { + "epoch": 0.4321361367912951, + "grad_norm": 0.08392468094825745, + "learning_rate": 9.595208556625027e-06, + "loss": 0.0039, + "step": 26410 + }, + { + "epoch": 0.4322997627423709, + "grad_norm": 0.058450691401958466, + "learning_rate": 9.594645545488516e-06, + "loss": 0.0061, + "step": 26420 + }, + { + "epoch": 0.4324633886934468, + "grad_norm": 0.025988183915615082, + "learning_rate": 9.594082159628346e-06, + "loss": 0.0116, + "step": 26430 + }, + { + "epoch": 0.43262701464452263, + "grad_norm": 0.11620432138442993, + "learning_rate": 9.593518399090467e-06, + "loss": 0.0031, + "step": 26440 + }, + { + "epoch": 0.43279064059559846, + "grad_norm": 0.38806021213531494, + "learning_rate": 9.592954263920854e-06, + "loss": 0.0031, + "step": 26450 + }, + { + "epoch": 0.4329542665466743, + "grad_norm": 0.045491643249988556, + "learning_rate": 9.592389754165518e-06, + "loss": 0.0032, + "step": 26460 + }, + { + "epoch": 0.4331178924977501, + "grad_norm": 0.35154813528060913, + "learning_rate": 9.591824869870498e-06, + "loss": 0.003, + "step": 26470 + }, + { + "epoch": 0.433281518448826, + "grad_norm": 0.1841619610786438, + "learning_rate": 9.591259611081868e-06, + "loss": 0.0025, + "step": 26480 + }, + { + "epoch": 0.43344514439990184, + "grad_norm": 0.20501504838466644, + "learning_rate": 9.59069397784572e-06, + "loss": 0.0039, + "step": 26490 + }, + { + "epoch": 0.43360877035097767, + "grad_norm": 0.6021678447723389, + "learning_rate": 9.590127970208193e-06, + "loss": 0.0094, + "step": 26500 + }, + { + "epoch": 0.4337723963020535, + "grad_norm": 0.0889105498790741, + "learning_rate": 9.589561588215444e-06, + "loss": 0.0043, + "step": 26510 + }, + { + "epoch": 0.4339360222531293, + "grad_norm": 0.28999319672584534, + "learning_rate": 9.588994831913668e-06, + "loss": 0.0028, + "step": 26520 + }, + { + "epoch": 0.4340996482042052, + "grad_norm": 0.04398936778306961, + "learning_rate": 9.588427701349086e-06, + "loss": 0.0029, + "step": 26530 + }, + { + "epoch": 0.43426327415528104, + "grad_norm": 0.046272728592157364, + "learning_rate": 9.587860196567954e-06, + "loss": 0.0025, + "step": 26540 + }, + { + "epoch": 0.4344269001063569, + "grad_norm": 0.07538960129022598, + "learning_rate": 9.58729231761655e-06, + "loss": 0.0019, + "step": 26550 + }, + { + "epoch": 0.4345905260574327, + "grad_norm": 0.051992617547512054, + "learning_rate": 9.586724064541195e-06, + "loss": 0.0023, + "step": 26560 + }, + { + "epoch": 0.43475415200850853, + "grad_norm": 0.1269051730632782, + "learning_rate": 9.586155437388229e-06, + "loss": 0.0039, + "step": 26570 + }, + { + "epoch": 0.4349177779595844, + "grad_norm": 0.1950068324804306, + "learning_rate": 9.585586436204028e-06, + "loss": 0.0025, + "step": 26580 + }, + { + "epoch": 0.43508140391066025, + "grad_norm": 0.06404326856136322, + "learning_rate": 9.585017061035e-06, + "loss": 0.0058, + "step": 26590 + }, + { + "epoch": 0.4352450298617361, + "grad_norm": 0.15282125771045685, + "learning_rate": 9.584447311927582e-06, + "loss": 0.0035, + "step": 26600 + }, + { + "epoch": 0.4354086558128119, + "grad_norm": 0.06417321413755417, + "learning_rate": 9.583877188928236e-06, + "loss": 0.0028, + "step": 26610 + }, + { + "epoch": 0.43557228176388774, + "grad_norm": 0.10315041989088058, + "learning_rate": 9.583306692083464e-06, + "loss": 0.0024, + "step": 26620 + }, + { + "epoch": 0.43573590771496357, + "grad_norm": 0.23699307441711426, + "learning_rate": 9.58273582143979e-06, + "loss": 0.009, + "step": 26630 + }, + { + "epoch": 0.43589953366603945, + "grad_norm": 0.07633335143327713, + "learning_rate": 9.582164577043776e-06, + "loss": 0.0045, + "step": 26640 + }, + { + "epoch": 0.4360631596171153, + "grad_norm": 0.17290852963924408, + "learning_rate": 9.581592958942008e-06, + "loss": 0.0033, + "step": 26650 + }, + { + "epoch": 0.4362267855681911, + "grad_norm": 0.21958240866661072, + "learning_rate": 9.581020967181106e-06, + "loss": 0.0033, + "step": 26660 + }, + { + "epoch": 0.43639041151926694, + "grad_norm": 0.17968712747097015, + "learning_rate": 9.58044860180772e-06, + "loss": 0.0048, + "step": 26670 + }, + { + "epoch": 0.43655403747034277, + "grad_norm": 0.21733339130878448, + "learning_rate": 9.579875862868533e-06, + "loss": 0.0033, + "step": 26680 + }, + { + "epoch": 0.43671766342141866, + "grad_norm": 0.11233142763376236, + "learning_rate": 9.57930275041025e-06, + "loss": 0.0066, + "step": 26690 + }, + { + "epoch": 0.4368812893724945, + "grad_norm": 0.09297332912683487, + "learning_rate": 9.578729264479615e-06, + "loss": 0.0032, + "step": 26700 + }, + { + "epoch": 0.4370449153235703, + "grad_norm": 0.06843338161706924, + "learning_rate": 9.5781554051234e-06, + "loss": 0.0047, + "step": 26710 + }, + { + "epoch": 0.43720854127464615, + "grad_norm": 0.07784595340490341, + "learning_rate": 9.577581172388406e-06, + "loss": 0.0024, + "step": 26720 + }, + { + "epoch": 0.437372167225722, + "grad_norm": 0.21292120218276978, + "learning_rate": 9.577006566321465e-06, + "loss": 0.0048, + "step": 26730 + }, + { + "epoch": 0.43753579317679786, + "grad_norm": 0.13139836490154266, + "learning_rate": 9.576431586969441e-06, + "loss": 0.0033, + "step": 26740 + }, + { + "epoch": 0.4376994191278737, + "grad_norm": 0.2480802983045578, + "learning_rate": 9.575856234379228e-06, + "loss": 0.0046, + "step": 26750 + }, + { + "epoch": 0.4378630450789495, + "grad_norm": 0.31265008449554443, + "learning_rate": 9.575280508597748e-06, + "loss": 0.0062, + "step": 26760 + }, + { + "epoch": 0.43802667103002535, + "grad_norm": 0.10543932020664215, + "learning_rate": 9.574704409671958e-06, + "loss": 0.0029, + "step": 26770 + }, + { + "epoch": 0.4381902969811012, + "grad_norm": 0.20948873460292816, + "learning_rate": 9.574127937648838e-06, + "loss": 0.0041, + "step": 26780 + }, + { + "epoch": 0.43835392293217706, + "grad_norm": 0.24258270859718323, + "learning_rate": 9.573551092575407e-06, + "loss": 0.0044, + "step": 26790 + }, + { + "epoch": 0.4385175488832529, + "grad_norm": 0.1120290756225586, + "learning_rate": 9.572973874498711e-06, + "loss": 0.0081, + "step": 26800 + }, + { + "epoch": 0.4386811748343287, + "grad_norm": 0.14425401389598846, + "learning_rate": 9.572396283465824e-06, + "loss": 0.0022, + "step": 26810 + }, + { + "epoch": 0.43884480078540455, + "grad_norm": 0.05229343846440315, + "learning_rate": 9.571818319523851e-06, + "loss": 0.004, + "step": 26820 + }, + { + "epoch": 0.4390084267364804, + "grad_norm": 0.1308344304561615, + "learning_rate": 9.571239982719932e-06, + "loss": 0.0036, + "step": 26830 + }, + { + "epoch": 0.43917205268755627, + "grad_norm": 0.0666099488735199, + "learning_rate": 9.570661273101233e-06, + "loss": 0.003, + "step": 26840 + }, + { + "epoch": 0.4393356786386321, + "grad_norm": 0.10902194678783417, + "learning_rate": 9.57008219071495e-06, + "loss": 0.0057, + "step": 26850 + }, + { + "epoch": 0.43949930458970793, + "grad_norm": 0.10403379797935486, + "learning_rate": 9.569502735608314e-06, + "loss": 0.0043, + "step": 26860 + }, + { + "epoch": 0.43966293054078376, + "grad_norm": 0.09467127174139023, + "learning_rate": 9.568922907828579e-06, + "loss": 0.0029, + "step": 26870 + }, + { + "epoch": 0.4398265564918596, + "grad_norm": 0.28333282470703125, + "learning_rate": 9.568342707423038e-06, + "loss": 0.0052, + "step": 26880 + }, + { + "epoch": 0.4399901824429355, + "grad_norm": 0.14598549902439117, + "learning_rate": 9.567762134439009e-06, + "loss": 0.0031, + "step": 26890 + }, + { + "epoch": 0.4401538083940113, + "grad_norm": 0.061146821826696396, + "learning_rate": 9.567181188923841e-06, + "loss": 0.0029, + "step": 26900 + }, + { + "epoch": 0.44031743434508713, + "grad_norm": 0.20208175480365753, + "learning_rate": 9.566599870924914e-06, + "loss": 0.0033, + "step": 26910 + }, + { + "epoch": 0.44048106029616296, + "grad_norm": 0.09864014387130737, + "learning_rate": 9.566018180489638e-06, + "loss": 0.0029, + "step": 26920 + }, + { + "epoch": 0.4406446862472388, + "grad_norm": 0.06830285489559174, + "learning_rate": 9.565436117665455e-06, + "loss": 0.0069, + "step": 26930 + }, + { + "epoch": 0.4408083121983147, + "grad_norm": 0.1314922273159027, + "learning_rate": 9.564853682499835e-06, + "loss": 0.0039, + "step": 26940 + }, + { + "epoch": 0.4409719381493905, + "grad_norm": 0.04637869819998741, + "learning_rate": 9.564270875040279e-06, + "loss": 0.0033, + "step": 26950 + }, + { + "epoch": 0.44113556410046634, + "grad_norm": 0.11996118724346161, + "learning_rate": 9.563687695334321e-06, + "loss": 0.0027, + "step": 26960 + }, + { + "epoch": 0.44129919005154217, + "grad_norm": 0.1876671016216278, + "learning_rate": 9.56310414342952e-06, + "loss": 0.0057, + "step": 26970 + }, + { + "epoch": 0.441462816002618, + "grad_norm": 0.31153422594070435, + "learning_rate": 9.56252021937347e-06, + "loss": 0.0042, + "step": 26980 + }, + { + "epoch": 0.4416264419536939, + "grad_norm": 0.10770515352487564, + "learning_rate": 9.561935923213794e-06, + "loss": 0.0035, + "step": 26990 + }, + { + "epoch": 0.4417900679047697, + "grad_norm": 0.11300802230834961, + "learning_rate": 9.561351254998146e-06, + "loss": 0.0028, + "step": 27000 + }, + { + "epoch": 0.44195369385584554, + "grad_norm": 0.038234710693359375, + "learning_rate": 9.560766214774208e-06, + "loss": 0.0024, + "step": 27010 + }, + { + "epoch": 0.44211731980692137, + "grad_norm": 0.11550666391849518, + "learning_rate": 9.560180802589695e-06, + "loss": 0.0054, + "step": 27020 + }, + { + "epoch": 0.4422809457579972, + "grad_norm": 0.1973753571510315, + "learning_rate": 9.559595018492351e-06, + "loss": 0.0033, + "step": 27030 + }, + { + "epoch": 0.4424445717090731, + "grad_norm": 0.0685434564948082, + "learning_rate": 9.55900886252995e-06, + "loss": 0.0035, + "step": 27040 + }, + { + "epoch": 0.4426081976601489, + "grad_norm": 0.09395457059144974, + "learning_rate": 9.558422334750297e-06, + "loss": 0.003, + "step": 27050 + }, + { + "epoch": 0.44277182361122475, + "grad_norm": 0.2162901610136032, + "learning_rate": 9.557835435201228e-06, + "loss": 0.003, + "step": 27060 + }, + { + "epoch": 0.4429354495623006, + "grad_norm": 0.04899590089917183, + "learning_rate": 9.557248163930606e-06, + "loss": 0.0023, + "step": 27070 + }, + { + "epoch": 0.4430990755133764, + "grad_norm": 0.0845181941986084, + "learning_rate": 9.556660520986332e-06, + "loss": 0.0036, + "step": 27080 + }, + { + "epoch": 0.44326270146445224, + "grad_norm": 0.08560560643672943, + "learning_rate": 9.556072506416327e-06, + "loss": 0.0025, + "step": 27090 + }, + { + "epoch": 0.4434263274155281, + "grad_norm": 0.1488703340291977, + "learning_rate": 9.55548412026855e-06, + "loss": 0.0051, + "step": 27100 + }, + { + "epoch": 0.44358995336660395, + "grad_norm": 0.046460285782814026, + "learning_rate": 9.554895362590988e-06, + "loss": 0.0025, + "step": 27110 + }, + { + "epoch": 0.4437535793176798, + "grad_norm": 0.14281921088695526, + "learning_rate": 9.554306233431656e-06, + "loss": 0.0028, + "step": 27120 + }, + { + "epoch": 0.4439172052687556, + "grad_norm": 0.07671947032213211, + "learning_rate": 9.553716732838603e-06, + "loss": 0.0033, + "step": 27130 + }, + { + "epoch": 0.44408083121983144, + "grad_norm": 0.38148581981658936, + "learning_rate": 9.553126860859908e-06, + "loss": 0.003, + "step": 27140 + }, + { + "epoch": 0.4442444571709073, + "grad_norm": 0.02350139617919922, + "learning_rate": 9.552536617543675e-06, + "loss": 0.0029, + "step": 27150 + }, + { + "epoch": 0.44440808312198316, + "grad_norm": 0.0743347704410553, + "learning_rate": 9.551946002938046e-06, + "loss": 0.0029, + "step": 27160 + }, + { + "epoch": 0.444571709073059, + "grad_norm": 0.22405192255973816, + "learning_rate": 9.551355017091189e-06, + "loss": 0.003, + "step": 27170 + }, + { + "epoch": 0.4447353350241348, + "grad_norm": 0.08125679939985275, + "learning_rate": 9.550763660051299e-06, + "loss": 0.0017, + "step": 27180 + }, + { + "epoch": 0.44489896097521064, + "grad_norm": 0.08913346379995346, + "learning_rate": 9.55017193186661e-06, + "loss": 0.0034, + "step": 27190 + }, + { + "epoch": 0.44506258692628653, + "grad_norm": 0.3011961877346039, + "learning_rate": 9.549579832585379e-06, + "loss": 0.0054, + "step": 27200 + }, + { + "epoch": 0.44522621287736236, + "grad_norm": 0.09054125845432281, + "learning_rate": 9.548987362255896e-06, + "loss": 0.0031, + "step": 27210 + }, + { + "epoch": 0.4453898388284382, + "grad_norm": 0.18851423263549805, + "learning_rate": 9.548394520926479e-06, + "loss": 0.0038, + "step": 27220 + }, + { + "epoch": 0.445553464779514, + "grad_norm": 0.08263241499662399, + "learning_rate": 9.547801308645482e-06, + "loss": 0.0023, + "step": 27230 + }, + { + "epoch": 0.44571709073058985, + "grad_norm": 0.11365669965744019, + "learning_rate": 9.547207725461281e-06, + "loss": 0.0023, + "step": 27240 + }, + { + "epoch": 0.44588071668166573, + "grad_norm": 0.04128199443221092, + "learning_rate": 9.54661377142229e-06, + "loss": 0.0017, + "step": 27250 + }, + { + "epoch": 0.44604434263274156, + "grad_norm": 0.37932923436164856, + "learning_rate": 9.546019446576949e-06, + "loss": 0.0045, + "step": 27260 + }, + { + "epoch": 0.4462079685838174, + "grad_norm": 0.11917036026716232, + "learning_rate": 9.545424750973728e-06, + "loss": 0.0037, + "step": 27270 + }, + { + "epoch": 0.4463715945348932, + "grad_norm": 0.07910063862800598, + "learning_rate": 9.54482968466113e-06, + "loss": 0.0026, + "step": 27280 + }, + { + "epoch": 0.44653522048596905, + "grad_norm": 0.10809221118688583, + "learning_rate": 9.544234247687688e-06, + "loss": 0.0035, + "step": 27290 + }, + { + "epoch": 0.44669884643704494, + "grad_norm": 0.06450322270393372, + "learning_rate": 9.543638440101958e-06, + "loss": 0.0021, + "step": 27300 + }, + { + "epoch": 0.44686247238812077, + "grad_norm": 0.06999453157186508, + "learning_rate": 9.543042261952537e-06, + "loss": 0.0027, + "step": 27310 + }, + { + "epoch": 0.4470260983391966, + "grad_norm": 0.04958852007985115, + "learning_rate": 9.542445713288044e-06, + "loss": 0.0038, + "step": 27320 + }, + { + "epoch": 0.44718972429027243, + "grad_norm": 0.13095538318157196, + "learning_rate": 9.541848794157136e-06, + "loss": 0.002, + "step": 27330 + }, + { + "epoch": 0.44735335024134826, + "grad_norm": 0.03661338984966278, + "learning_rate": 9.541251504608493e-06, + "loss": 0.005, + "step": 27340 + }, + { + "epoch": 0.44751697619242414, + "grad_norm": 0.06599044054746628, + "learning_rate": 9.540653844690826e-06, + "loss": 0.0034, + "step": 27350 + }, + { + "epoch": 0.4476806021435, + "grad_norm": 0.1458204835653305, + "learning_rate": 9.540055814452883e-06, + "loss": 0.0034, + "step": 27360 + }, + { + "epoch": 0.4478442280945758, + "grad_norm": 0.013831786811351776, + "learning_rate": 9.53945741394343e-06, + "loss": 0.0032, + "step": 27370 + }, + { + "epoch": 0.44800785404565163, + "grad_norm": 0.3614981472492218, + "learning_rate": 9.538858643211278e-06, + "loss": 0.0029, + "step": 27380 + }, + { + "epoch": 0.44817147999672746, + "grad_norm": 0.032748524099588394, + "learning_rate": 9.538259502305257e-06, + "loss": 0.0033, + "step": 27390 + }, + { + "epoch": 0.44833510594780335, + "grad_norm": 0.11110106110572815, + "learning_rate": 9.537659991274232e-06, + "loss": 0.003, + "step": 27400 + }, + { + "epoch": 0.4484987318988792, + "grad_norm": 0.1370764821767807, + "learning_rate": 9.537060110167095e-06, + "loss": 0.0027, + "step": 27410 + }, + { + "epoch": 0.448662357849955, + "grad_norm": 0.07810279726982117, + "learning_rate": 9.536459859032774e-06, + "loss": 0.0028, + "step": 27420 + }, + { + "epoch": 0.44882598380103084, + "grad_norm": 0.13732978701591492, + "learning_rate": 9.535859237920219e-06, + "loss": 0.002, + "step": 27430 + }, + { + "epoch": 0.44898960975210667, + "grad_norm": 0.06112179160118103, + "learning_rate": 9.535258246878418e-06, + "loss": 0.0017, + "step": 27440 + }, + { + "epoch": 0.44915323570318255, + "grad_norm": 0.14673815667629242, + "learning_rate": 9.534656885956386e-06, + "loss": 0.0036, + "step": 27450 + }, + { + "epoch": 0.4493168616542584, + "grad_norm": 0.06018859148025513, + "learning_rate": 9.534055155203164e-06, + "loss": 0.0059, + "step": 27460 + }, + { + "epoch": 0.4494804876053342, + "grad_norm": 0.06915943324565887, + "learning_rate": 9.533453054667831e-06, + "loss": 0.0045, + "step": 27470 + }, + { + "epoch": 0.44964411355641004, + "grad_norm": 0.017338022589683533, + "learning_rate": 9.53285058439949e-06, + "loss": 0.0047, + "step": 27480 + }, + { + "epoch": 0.44980773950748587, + "grad_norm": 0.2309543639421463, + "learning_rate": 9.532247744447281e-06, + "loss": 0.0028, + "step": 27490 + }, + { + "epoch": 0.4499713654585617, + "grad_norm": 0.1974831372499466, + "learning_rate": 9.531644534860363e-06, + "loss": 0.0033, + "step": 27500 + }, + { + "epoch": 0.4501349914096376, + "grad_norm": 0.11578907817602158, + "learning_rate": 9.531040955687935e-06, + "loss": 0.0038, + "step": 27510 + }, + { + "epoch": 0.4502986173607134, + "grad_norm": 0.09191634505987167, + "learning_rate": 9.530437006979223e-06, + "loss": 0.003, + "step": 27520 + }, + { + "epoch": 0.45046224331178925, + "grad_norm": 0.1568758636713028, + "learning_rate": 9.529832688783483e-06, + "loss": 0.0043, + "step": 27530 + }, + { + "epoch": 0.4506258692628651, + "grad_norm": 0.05258748680353165, + "learning_rate": 9.52922800115e-06, + "loss": 0.0041, + "step": 27540 + }, + { + "epoch": 0.4507894952139409, + "grad_norm": 0.18043150007724762, + "learning_rate": 9.528622944128089e-06, + "loss": 0.0025, + "step": 27550 + }, + { + "epoch": 0.4509531211650168, + "grad_norm": 0.15081089735031128, + "learning_rate": 9.528017517767103e-06, + "loss": 0.0029, + "step": 27560 + }, + { + "epoch": 0.4511167471160926, + "grad_norm": 0.07562534511089325, + "learning_rate": 9.52741172211641e-06, + "loss": 0.0029, + "step": 27570 + }, + { + "epoch": 0.45128037306716845, + "grad_norm": 0.2528219521045685, + "learning_rate": 9.526805557225422e-06, + "loss": 0.003, + "step": 27580 + }, + { + "epoch": 0.4514439990182443, + "grad_norm": 0.19838450849056244, + "learning_rate": 9.526199023143574e-06, + "loss": 0.0104, + "step": 27590 + }, + { + "epoch": 0.4516076249693201, + "grad_norm": 0.048178285360336304, + "learning_rate": 9.525592119920333e-06, + "loss": 0.0047, + "step": 27600 + }, + { + "epoch": 0.451771250920396, + "grad_norm": 0.09211842715740204, + "learning_rate": 9.524984847605197e-06, + "loss": 0.0027, + "step": 27610 + }, + { + "epoch": 0.4519348768714718, + "grad_norm": 0.34548458456993103, + "learning_rate": 9.524377206247691e-06, + "loss": 0.0049, + "step": 27620 + }, + { + "epoch": 0.45209850282254765, + "grad_norm": 0.08648984134197235, + "learning_rate": 9.523769195897374e-06, + "loss": 0.0034, + "step": 27630 + }, + { + "epoch": 0.4522621287736235, + "grad_norm": 0.08408576995134354, + "learning_rate": 9.523160816603832e-06, + "loss": 0.0039, + "step": 27640 + }, + { + "epoch": 0.4524257547246993, + "grad_norm": 0.16873887181282043, + "learning_rate": 9.522552068416684e-06, + "loss": 0.0033, + "step": 27650 + }, + { + "epoch": 0.4525893806757752, + "grad_norm": 0.1678036004304886, + "learning_rate": 9.521942951385576e-06, + "loss": 0.0038, + "step": 27660 + }, + { + "epoch": 0.45275300662685103, + "grad_norm": 0.24030046164989471, + "learning_rate": 9.521333465560186e-06, + "loss": 0.0035, + "step": 27670 + }, + { + "epoch": 0.45291663257792686, + "grad_norm": 0.09746508300304413, + "learning_rate": 9.52072361099022e-06, + "loss": 0.0038, + "step": 27680 + }, + { + "epoch": 0.4530802585290027, + "grad_norm": 0.111379474401474, + "learning_rate": 9.520113387725418e-06, + "loss": 0.003, + "step": 27690 + }, + { + "epoch": 0.4532438844800785, + "grad_norm": 0.08142360299825668, + "learning_rate": 9.519502795815548e-06, + "loss": 0.0035, + "step": 27700 + }, + { + "epoch": 0.4534075104311544, + "grad_norm": 0.12644042074680328, + "learning_rate": 9.518891835310404e-06, + "loss": 0.0029, + "step": 27710 + }, + { + "epoch": 0.45357113638223023, + "grad_norm": 0.1252516359090805, + "learning_rate": 9.518280506259819e-06, + "loss": 0.0025, + "step": 27720 + }, + { + "epoch": 0.45373476233330606, + "grad_norm": 0.08867057412862778, + "learning_rate": 9.517668808713647e-06, + "loss": 0.0036, + "step": 27730 + }, + { + "epoch": 0.4538983882843819, + "grad_norm": 0.060929544270038605, + "learning_rate": 9.517056742721779e-06, + "loss": 0.0037, + "step": 27740 + }, + { + "epoch": 0.4540620142354577, + "grad_norm": 0.1040935218334198, + "learning_rate": 9.51644430833413e-06, + "loss": 0.0034, + "step": 27750 + }, + { + "epoch": 0.4542256401865336, + "grad_norm": 0.10028395056724548, + "learning_rate": 9.51583150560065e-06, + "loss": 0.0027, + "step": 27760 + }, + { + "epoch": 0.45438926613760944, + "grad_norm": 0.17439784109592438, + "learning_rate": 9.515218334571317e-06, + "loss": 0.0039, + "step": 27770 + }, + { + "epoch": 0.45455289208868527, + "grad_norm": 0.24616345763206482, + "learning_rate": 9.514604795296139e-06, + "loss": 0.0028, + "step": 27780 + }, + { + "epoch": 0.4547165180397611, + "grad_norm": 0.03100091964006424, + "learning_rate": 9.513990887825151e-06, + "loss": 0.0026, + "step": 27790 + }, + { + "epoch": 0.4548801439908369, + "grad_norm": 0.4011678993701935, + "learning_rate": 9.513376612208427e-06, + "loss": 0.0021, + "step": 27800 + }, + { + "epoch": 0.4550437699419128, + "grad_norm": 0.0659242570400238, + "learning_rate": 9.512761968496062e-06, + "loss": 0.0038, + "step": 27810 + }, + { + "epoch": 0.45520739589298864, + "grad_norm": 0.184167742729187, + "learning_rate": 9.512146956738184e-06, + "loss": 0.0043, + "step": 27820 + }, + { + "epoch": 0.45537102184406447, + "grad_norm": 0.026814324781298637, + "learning_rate": 9.51153157698495e-06, + "loss": 0.002, + "step": 27830 + }, + { + "epoch": 0.4555346477951403, + "grad_norm": 0.03376597538590431, + "learning_rate": 9.510915829286553e-06, + "loss": 0.0029, + "step": 27840 + }, + { + "epoch": 0.45569827374621613, + "grad_norm": 0.09375806152820587, + "learning_rate": 9.510299713693209e-06, + "loss": 0.0048, + "step": 27850 + }, + { + "epoch": 0.455861899697292, + "grad_norm": 0.07186637818813324, + "learning_rate": 9.509683230255165e-06, + "loss": 0.0036, + "step": 27860 + }, + { + "epoch": 0.45602552564836785, + "grad_norm": 0.16127312183380127, + "learning_rate": 9.509066379022697e-06, + "loss": 0.0032, + "step": 27870 + }, + { + "epoch": 0.4561891515994437, + "grad_norm": 0.08908277750015259, + "learning_rate": 9.508449160046121e-06, + "loss": 0.0036, + "step": 27880 + }, + { + "epoch": 0.4563527775505195, + "grad_norm": 0.11411503702402115, + "learning_rate": 9.507831573375767e-06, + "loss": 0.0032, + "step": 27890 + }, + { + "epoch": 0.45651640350159534, + "grad_norm": 0.05092979967594147, + "learning_rate": 9.507213619062008e-06, + "loss": 0.0026, + "step": 27900 + }, + { + "epoch": 0.4566800294526712, + "grad_norm": 0.08310269564390182, + "learning_rate": 9.50659529715524e-06, + "loss": 0.0037, + "step": 27910 + }, + { + "epoch": 0.45684365540374705, + "grad_norm": 0.06380978971719742, + "learning_rate": 9.505976607705895e-06, + "loss": 0.0044, + "step": 27920 + }, + { + "epoch": 0.4570072813548229, + "grad_norm": 0.064381442964077, + "learning_rate": 9.505357550764427e-06, + "loss": 0.0033, + "step": 27930 + }, + { + "epoch": 0.4571709073058987, + "grad_norm": 0.22628238797187805, + "learning_rate": 9.504738126381325e-06, + "loss": 0.0059, + "step": 27940 + }, + { + "epoch": 0.45733453325697454, + "grad_norm": 0.05161839351058006, + "learning_rate": 9.504118334607108e-06, + "loss": 0.0035, + "step": 27950 + }, + { + "epoch": 0.45749815920805037, + "grad_norm": 0.09953735768795013, + "learning_rate": 9.503498175492326e-06, + "loss": 0.0024, + "step": 27960 + }, + { + "epoch": 0.45766178515912626, + "grad_norm": 0.05142979323863983, + "learning_rate": 9.502877649087552e-06, + "loss": 0.0023, + "step": 27970 + }, + { + "epoch": 0.4578254111102021, + "grad_norm": 0.2211284190416336, + "learning_rate": 9.5022567554434e-06, + "loss": 0.0049, + "step": 27980 + }, + { + "epoch": 0.4579890370612779, + "grad_norm": 0.08128587156534195, + "learning_rate": 9.501635494610503e-06, + "loss": 0.0025, + "step": 27990 + }, + { + "epoch": 0.45815266301235374, + "grad_norm": 0.11402089893817902, + "learning_rate": 9.50101386663953e-06, + "loss": 0.0026, + "step": 28000 + }, + { + "epoch": 0.4583162889634296, + "grad_norm": 0.070767842233181, + "learning_rate": 9.500391871581182e-06, + "loss": 0.0034, + "step": 28010 + }, + { + "epoch": 0.45847991491450546, + "grad_norm": 0.36080360412597656, + "learning_rate": 9.499769509486185e-06, + "loss": 0.0052, + "step": 28020 + }, + { + "epoch": 0.4586435408655813, + "grad_norm": 0.014667628332972527, + "learning_rate": 9.499146780405294e-06, + "loss": 0.0038, + "step": 28030 + }, + { + "epoch": 0.4588071668166571, + "grad_norm": 0.09940271824598312, + "learning_rate": 9.498523684389301e-06, + "loss": 0.0042, + "step": 28040 + }, + { + "epoch": 0.45897079276773295, + "grad_norm": 0.09149282425642014, + "learning_rate": 9.497900221489022e-06, + "loss": 0.0053, + "step": 28050 + }, + { + "epoch": 0.4591344187188088, + "grad_norm": 0.045744918286800385, + "learning_rate": 9.497276391755302e-06, + "loss": 0.0023, + "step": 28060 + }, + { + "epoch": 0.45929804466988466, + "grad_norm": 0.1874120682477951, + "learning_rate": 9.496652195239023e-06, + "loss": 0.0039, + "step": 28070 + }, + { + "epoch": 0.4594616706209605, + "grad_norm": 0.07139880955219269, + "learning_rate": 9.496027631991086e-06, + "loss": 0.0034, + "step": 28080 + }, + { + "epoch": 0.4596252965720363, + "grad_norm": 0.2532361149787903, + "learning_rate": 9.495402702062434e-06, + "loss": 0.0053, + "step": 28090 + }, + { + "epoch": 0.45978892252311215, + "grad_norm": 0.05640101060271263, + "learning_rate": 9.494777405504035e-06, + "loss": 0.0036, + "step": 28100 + }, + { + "epoch": 0.459952548474188, + "grad_norm": 0.04346856102347374, + "learning_rate": 9.494151742366882e-06, + "loss": 0.0039, + "step": 28110 + }, + { + "epoch": 0.46011617442526387, + "grad_norm": 0.17160440981388092, + "learning_rate": 9.493525712702e-06, + "loss": 0.0037, + "step": 28120 + }, + { + "epoch": 0.4602798003763397, + "grad_norm": 0.11071068793535233, + "learning_rate": 9.492899316560454e-06, + "loss": 0.0027, + "step": 28130 + }, + { + "epoch": 0.46044342632741553, + "grad_norm": 0.04852447658777237, + "learning_rate": 9.492272553993323e-06, + "loss": 0.0031, + "step": 28140 + }, + { + "epoch": 0.46060705227849136, + "grad_norm": 0.08421175926923752, + "learning_rate": 9.491645425051728e-06, + "loss": 0.0025, + "step": 28150 + }, + { + "epoch": 0.4607706782295672, + "grad_norm": 0.11441599577665329, + "learning_rate": 9.491017929786812e-06, + "loss": 0.0042, + "step": 28160 + }, + { + "epoch": 0.4609343041806431, + "grad_norm": 0.21369783580303192, + "learning_rate": 9.490390068249755e-06, + "loss": 0.0059, + "step": 28170 + }, + { + "epoch": 0.4610979301317189, + "grad_norm": 0.10642528533935547, + "learning_rate": 9.489761840491761e-06, + "loss": 0.0041, + "step": 28180 + }, + { + "epoch": 0.46126155608279473, + "grad_norm": 0.08429161459207535, + "learning_rate": 9.489133246564068e-06, + "loss": 0.0036, + "step": 28190 + }, + { + "epoch": 0.46142518203387056, + "grad_norm": 0.10386421531438828, + "learning_rate": 9.488504286517939e-06, + "loss": 0.0026, + "step": 28200 + }, + { + "epoch": 0.4615888079849464, + "grad_norm": 0.11419639736413956, + "learning_rate": 9.487874960404673e-06, + "loss": 0.004, + "step": 28210 + }, + { + "epoch": 0.4617524339360223, + "grad_norm": 0.047336872667074203, + "learning_rate": 9.487245268275593e-06, + "loss": 0.0018, + "step": 28220 + }, + { + "epoch": 0.4619160598870981, + "grad_norm": 0.06023683398962021, + "learning_rate": 9.486615210182057e-06, + "loss": 0.0045, + "step": 28230 + }, + { + "epoch": 0.46207968583817394, + "grad_norm": 0.050592176616191864, + "learning_rate": 9.485984786175448e-06, + "loss": 0.0054, + "step": 28240 + }, + { + "epoch": 0.46224331178924977, + "grad_norm": 0.0629870668053627, + "learning_rate": 9.485353996307185e-06, + "loss": 0.0026, + "step": 28250 + }, + { + "epoch": 0.4624069377403256, + "grad_norm": 0.10153938084840775, + "learning_rate": 9.484722840628708e-06, + "loss": 0.0027, + "step": 28260 + }, + { + "epoch": 0.4625705636914015, + "grad_norm": 0.09818442910909653, + "learning_rate": 9.484091319191495e-06, + "loss": 0.0023, + "step": 28270 + }, + { + "epoch": 0.4627341896424773, + "grad_norm": 0.10715015232563019, + "learning_rate": 9.48345943204705e-06, + "loss": 0.0047, + "step": 28280 + }, + { + "epoch": 0.46289781559355314, + "grad_norm": 0.19105903804302216, + "learning_rate": 9.482827179246909e-06, + "loss": 0.0023, + "step": 28290 + }, + { + "epoch": 0.46306144154462897, + "grad_norm": 0.10577499866485596, + "learning_rate": 9.482194560842635e-06, + "loss": 0.0038, + "step": 28300 + }, + { + "epoch": 0.4632250674957048, + "grad_norm": 0.33034053444862366, + "learning_rate": 9.481561576885821e-06, + "loss": 0.0073, + "step": 28310 + }, + { + "epoch": 0.4633886934467807, + "grad_norm": 0.14007116854190826, + "learning_rate": 9.480928227428095e-06, + "loss": 0.0031, + "step": 28320 + }, + { + "epoch": 0.4635523193978565, + "grad_norm": 0.1659744828939438, + "learning_rate": 9.480294512521105e-06, + "loss": 0.004, + "step": 28330 + }, + { + "epoch": 0.46371594534893235, + "grad_norm": 0.08970291912555695, + "learning_rate": 9.47966043221654e-06, + "loss": 0.0042, + "step": 28340 + }, + { + "epoch": 0.4638795713000082, + "grad_norm": 0.10167667269706726, + "learning_rate": 9.479025986566111e-06, + "loss": 0.0029, + "step": 28350 + }, + { + "epoch": 0.464043197251084, + "grad_norm": 0.14350557327270508, + "learning_rate": 9.47839117562156e-06, + "loss": 0.0045, + "step": 28360 + }, + { + "epoch": 0.46420682320215984, + "grad_norm": 0.026401014998555183, + "learning_rate": 9.477755999434662e-06, + "loss": 0.0032, + "step": 28370 + }, + { + "epoch": 0.4643704491532357, + "grad_norm": 0.03000246360898018, + "learning_rate": 9.477120458057221e-06, + "loss": 0.0028, + "step": 28380 + }, + { + "epoch": 0.46453407510431155, + "grad_norm": 0.17039255797863007, + "learning_rate": 9.476484551541065e-06, + "loss": 0.0024, + "step": 28390 + }, + { + "epoch": 0.4646977010553874, + "grad_norm": 0.2673710584640503, + "learning_rate": 9.475848279938063e-06, + "loss": 0.0044, + "step": 28400 + }, + { + "epoch": 0.4648613270064632, + "grad_norm": 0.1765543669462204, + "learning_rate": 9.475211643300099e-06, + "loss": 0.0045, + "step": 28410 + }, + { + "epoch": 0.46502495295753904, + "grad_norm": 0.12987369298934937, + "learning_rate": 9.474574641679102e-06, + "loss": 0.0035, + "step": 28420 + }, + { + "epoch": 0.4651885789086149, + "grad_norm": 0.1911579817533493, + "learning_rate": 9.473937275127022e-06, + "loss": 0.0026, + "step": 28430 + }, + { + "epoch": 0.46535220485969075, + "grad_norm": 0.06673678755760193, + "learning_rate": 9.47329954369584e-06, + "loss": 0.0021, + "step": 28440 + }, + { + "epoch": 0.4655158308107666, + "grad_norm": 0.08422823250293732, + "learning_rate": 9.472661447437563e-06, + "loss": 0.0029, + "step": 28450 + }, + { + "epoch": 0.4656794567618424, + "grad_norm": 0.053204409778118134, + "learning_rate": 9.472022986404238e-06, + "loss": 0.0029, + "step": 28460 + }, + { + "epoch": 0.46584308271291824, + "grad_norm": 0.022931329905986786, + "learning_rate": 9.471384160647933e-06, + "loss": 0.0044, + "step": 28470 + }, + { + "epoch": 0.46600670866399413, + "grad_norm": 0.04039199650287628, + "learning_rate": 9.47074497022075e-06, + "loss": 0.0032, + "step": 28480 + }, + { + "epoch": 0.46617033461506996, + "grad_norm": 0.04516413062810898, + "learning_rate": 9.470105415174817e-06, + "loss": 0.0029, + "step": 28490 + }, + { + "epoch": 0.4663339605661458, + "grad_norm": 0.11811202019453049, + "learning_rate": 9.469465495562293e-06, + "loss": 0.0023, + "step": 28500 + }, + { + "epoch": 0.4664975865172216, + "grad_norm": 0.12220500409603119, + "learning_rate": 9.468825211435373e-06, + "loss": 0.0046, + "step": 28510 + }, + { + "epoch": 0.46666121246829745, + "grad_norm": 0.048121124505996704, + "learning_rate": 9.468184562846271e-06, + "loss": 0.0035, + "step": 28520 + }, + { + "epoch": 0.46682483841937333, + "grad_norm": 0.062382303178310394, + "learning_rate": 9.467543549847238e-06, + "loss": 0.0034, + "step": 28530 + }, + { + "epoch": 0.46698846437044916, + "grad_norm": 0.02163584530353546, + "learning_rate": 9.466902172490555e-06, + "loss": 0.0025, + "step": 28540 + }, + { + "epoch": 0.467152090321525, + "grad_norm": 0.3023717701435089, + "learning_rate": 9.466260430828526e-06, + "loss": 0.0032, + "step": 28550 + }, + { + "epoch": 0.4673157162726008, + "grad_norm": 0.17531529068946838, + "learning_rate": 9.465618324913494e-06, + "loss": 0.0036, + "step": 28560 + }, + { + "epoch": 0.46747934222367665, + "grad_norm": 0.04204995930194855, + "learning_rate": 9.464975854797824e-06, + "loss": 0.0043, + "step": 28570 + }, + { + "epoch": 0.46764296817475254, + "grad_norm": 0.22975222766399384, + "learning_rate": 9.464333020533914e-06, + "loss": 0.0059, + "step": 28580 + }, + { + "epoch": 0.46780659412582837, + "grad_norm": 0.08279221504926682, + "learning_rate": 9.463689822174191e-06, + "loss": 0.0025, + "step": 28590 + }, + { + "epoch": 0.4679702200769042, + "grad_norm": 0.1958877444267273, + "learning_rate": 9.463046259771113e-06, + "loss": 0.0025, + "step": 28600 + }, + { + "epoch": 0.46813384602798, + "grad_norm": 0.03067268803715706, + "learning_rate": 9.462402333377169e-06, + "loss": 0.0029, + "step": 28610 + }, + { + "epoch": 0.46829747197905586, + "grad_norm": 0.03204172104597092, + "learning_rate": 9.461758043044871e-06, + "loss": 0.0042, + "step": 28620 + }, + { + "epoch": 0.46846109793013174, + "grad_norm": 0.045231226831674576, + "learning_rate": 9.461113388826768e-06, + "loss": 0.0025, + "step": 28630 + }, + { + "epoch": 0.46862472388120757, + "grad_norm": 0.03135726973414421, + "learning_rate": 9.460468370775435e-06, + "loss": 0.0072, + "step": 28640 + }, + { + "epoch": 0.4687883498322834, + "grad_norm": 0.16129206120967865, + "learning_rate": 9.459822988943477e-06, + "loss": 0.0048, + "step": 28650 + }, + { + "epoch": 0.46895197578335923, + "grad_norm": 0.19696134328842163, + "learning_rate": 9.459177243383531e-06, + "loss": 0.0051, + "step": 28660 + }, + { + "epoch": 0.46911560173443506, + "grad_norm": 0.07071767747402191, + "learning_rate": 9.45853113414826e-06, + "loss": 0.005, + "step": 28670 + }, + { + "epoch": 0.46927922768551095, + "grad_norm": 0.06767883896827698, + "learning_rate": 9.457884661290358e-06, + "loss": 0.0016, + "step": 28680 + }, + { + "epoch": 0.4694428536365868, + "grad_norm": 0.21728633344173431, + "learning_rate": 9.457237824862549e-06, + "loss": 0.0031, + "step": 28690 + }, + { + "epoch": 0.4696064795876626, + "grad_norm": 0.41223353147506714, + "learning_rate": 9.45659062491759e-06, + "loss": 0.0064, + "step": 28700 + }, + { + "epoch": 0.46977010553873844, + "grad_norm": 0.054228391498327255, + "learning_rate": 9.455943061508261e-06, + "loss": 0.0036, + "step": 28710 + }, + { + "epoch": 0.46993373148981427, + "grad_norm": 0.043424446135759354, + "learning_rate": 9.455295134687377e-06, + "loss": 0.0036, + "step": 28720 + }, + { + "epoch": 0.47009735744089015, + "grad_norm": 0.05108136683702469, + "learning_rate": 9.45464684450778e-06, + "loss": 0.0038, + "step": 28730 + }, + { + "epoch": 0.470260983391966, + "grad_norm": 0.07761649787425995, + "learning_rate": 9.453998191022343e-06, + "loss": 0.0016, + "step": 28740 + }, + { + "epoch": 0.4704246093430418, + "grad_norm": 0.10181140899658203, + "learning_rate": 9.453349174283965e-06, + "loss": 0.0055, + "step": 28750 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.12838008999824524, + "learning_rate": 9.452699794345583e-06, + "loss": 0.003, + "step": 28760 + }, + { + "epoch": 0.47075186124519347, + "grad_norm": 0.06869093328714371, + "learning_rate": 9.452050051260152e-06, + "loss": 0.0022, + "step": 28770 + }, + { + "epoch": 0.47091548719626936, + "grad_norm": 0.1513408124446869, + "learning_rate": 9.451399945080668e-06, + "loss": 0.0036, + "step": 28780 + }, + { + "epoch": 0.4710791131473452, + "grad_norm": 0.20629248023033142, + "learning_rate": 9.45074947586015e-06, + "loss": 0.0045, + "step": 28790 + }, + { + "epoch": 0.471242739098421, + "grad_norm": 0.12404251098632812, + "learning_rate": 9.450098643651647e-06, + "loss": 0.0044, + "step": 28800 + }, + { + "epoch": 0.47140636504949684, + "grad_norm": 0.22954332828521729, + "learning_rate": 9.449447448508238e-06, + "loss": 0.0026, + "step": 28810 + }, + { + "epoch": 0.4715699910005727, + "grad_norm": 0.2752246856689453, + "learning_rate": 9.448795890483035e-06, + "loss": 0.0039, + "step": 28820 + }, + { + "epoch": 0.4717336169516485, + "grad_norm": 0.11914423108100891, + "learning_rate": 9.448143969629175e-06, + "loss": 0.0033, + "step": 28830 + }, + { + "epoch": 0.4718972429027244, + "grad_norm": 0.07998602092266083, + "learning_rate": 9.447491685999825e-06, + "loss": 0.02, + "step": 28840 + }, + { + "epoch": 0.4720608688538002, + "grad_norm": 0.13012129068374634, + "learning_rate": 9.446839039648187e-06, + "loss": 0.0029, + "step": 28850 + }, + { + "epoch": 0.47222449480487605, + "grad_norm": 0.21434567868709564, + "learning_rate": 9.446186030627486e-06, + "loss": 0.0035, + "step": 28860 + }, + { + "epoch": 0.4723881207559519, + "grad_norm": 0.11092668771743774, + "learning_rate": 9.44553265899098e-06, + "loss": 0.0033, + "step": 28870 + }, + { + "epoch": 0.4725517467070277, + "grad_norm": 0.02316344901919365, + "learning_rate": 9.444878924791952e-06, + "loss": 0.0055, + "step": 28880 + }, + { + "epoch": 0.4727153726581036, + "grad_norm": 0.041690804064273834, + "learning_rate": 9.444224828083725e-06, + "loss": 0.0028, + "step": 28890 + }, + { + "epoch": 0.4728789986091794, + "grad_norm": 0.07620447874069214, + "learning_rate": 9.44357036891964e-06, + "loss": 0.003, + "step": 28900 + }, + { + "epoch": 0.47304262456025525, + "grad_norm": 0.2158164083957672, + "learning_rate": 9.442915547353075e-06, + "loss": 0.003, + "step": 28910 + }, + { + "epoch": 0.4732062505113311, + "grad_norm": 0.2905883193016052, + "learning_rate": 9.442260363437435e-06, + "loss": 0.0071, + "step": 28920 + }, + { + "epoch": 0.4733698764624069, + "grad_norm": 0.2301294356584549, + "learning_rate": 9.441604817226151e-06, + "loss": 0.003, + "step": 28930 + }, + { + "epoch": 0.4735335024134828, + "grad_norm": 0.0769018903374672, + "learning_rate": 9.440948908772692e-06, + "loss": 0.0052, + "step": 28940 + }, + { + "epoch": 0.47369712836455863, + "grad_norm": 0.0813213661313057, + "learning_rate": 9.440292638130548e-06, + "loss": 0.0045, + "step": 28950 + }, + { + "epoch": 0.47386075431563446, + "grad_norm": 0.09577547013759613, + "learning_rate": 9.439636005353244e-06, + "loss": 0.0036, + "step": 28960 + }, + { + "epoch": 0.4740243802667103, + "grad_norm": 0.04842466115951538, + "learning_rate": 9.438979010494333e-06, + "loss": 0.0024, + "step": 28970 + }, + { + "epoch": 0.4741880062177861, + "grad_norm": 0.24843230843544006, + "learning_rate": 9.438321653607397e-06, + "loss": 0.0032, + "step": 28980 + }, + { + "epoch": 0.474351632168862, + "grad_norm": 0.24147559702396393, + "learning_rate": 9.437663934746045e-06, + "loss": 0.0051, + "step": 28990 + }, + { + "epoch": 0.47451525811993783, + "grad_norm": 0.05181020125746727, + "learning_rate": 9.437005853963923e-06, + "loss": 0.0044, + "step": 29000 + }, + { + "epoch": 0.47467888407101366, + "grad_norm": 0.13449133932590485, + "learning_rate": 9.436347411314698e-06, + "loss": 0.0036, + "step": 29010 + }, + { + "epoch": 0.4748425100220895, + "grad_norm": 0.1109488382935524, + "learning_rate": 9.435688606852073e-06, + "loss": 0.003, + "step": 29020 + }, + { + "epoch": 0.4750061359731653, + "grad_norm": 0.11689011752605438, + "learning_rate": 9.435029440629776e-06, + "loss": 0.0039, + "step": 29030 + }, + { + "epoch": 0.4751697619242412, + "grad_norm": 0.023432515561580658, + "learning_rate": 9.434369912701568e-06, + "loss": 0.0022, + "step": 29040 + }, + { + "epoch": 0.47533338787531704, + "grad_norm": 0.11035023629665375, + "learning_rate": 9.433710023121236e-06, + "loss": 0.0032, + "step": 29050 + }, + { + "epoch": 0.47549701382639287, + "grad_norm": 0.22261519730091095, + "learning_rate": 9.433049771942599e-06, + "loss": 0.0045, + "step": 29060 + }, + { + "epoch": 0.4756606397774687, + "grad_norm": 0.16968069970607758, + "learning_rate": 9.432389159219507e-06, + "loss": 0.0028, + "step": 29070 + }, + { + "epoch": 0.4758242657285445, + "grad_norm": 0.06222948804497719, + "learning_rate": 9.431728185005834e-06, + "loss": 0.0025, + "step": 29080 + }, + { + "epoch": 0.4759878916796204, + "grad_norm": 0.1742427796125412, + "learning_rate": 9.431066849355488e-06, + "loss": 0.0047, + "step": 29090 + }, + { + "epoch": 0.47615151763069624, + "grad_norm": 0.20284488797187805, + "learning_rate": 9.430405152322407e-06, + "loss": 0.0024, + "step": 29100 + }, + { + "epoch": 0.47631514358177207, + "grad_norm": 0.07978978753089905, + "learning_rate": 9.429743093960555e-06, + "loss": 0.0034, + "step": 29110 + }, + { + "epoch": 0.4764787695328479, + "grad_norm": 0.07165643572807312, + "learning_rate": 9.429080674323927e-06, + "loss": 0.0034, + "step": 29120 + }, + { + "epoch": 0.47664239548392373, + "grad_norm": 0.08576705306768417, + "learning_rate": 9.428417893466549e-06, + "loss": 0.0038, + "step": 29130 + }, + { + "epoch": 0.4768060214349996, + "grad_norm": 0.1268194317817688, + "learning_rate": 9.427754751442473e-06, + "loss": 0.005, + "step": 29140 + }, + { + "epoch": 0.47696964738607545, + "grad_norm": 0.10577467083930969, + "learning_rate": 9.427091248305784e-06, + "loss": 0.0036, + "step": 29150 + }, + { + "epoch": 0.4771332733371513, + "grad_norm": 0.06506650894880295, + "learning_rate": 9.426427384110596e-06, + "loss": 0.0027, + "step": 29160 + }, + { + "epoch": 0.4772968992882271, + "grad_norm": 0.017393868416547775, + "learning_rate": 9.42576315891105e-06, + "loss": 0.0027, + "step": 29170 + }, + { + "epoch": 0.47746052523930294, + "grad_norm": 0.08368945121765137, + "learning_rate": 9.42509857276132e-06, + "loss": 0.0033, + "step": 29180 + }, + { + "epoch": 0.4776241511903788, + "grad_norm": 0.14434656500816345, + "learning_rate": 9.424433625715602e-06, + "loss": 0.0032, + "step": 29190 + }, + { + "epoch": 0.47778777714145465, + "grad_norm": 0.07934143394231796, + "learning_rate": 9.423768317828134e-06, + "loss": 0.0061, + "step": 29200 + }, + { + "epoch": 0.4779514030925305, + "grad_norm": 0.05839494988322258, + "learning_rate": 9.423102649153173e-06, + "loss": 0.0031, + "step": 29210 + }, + { + "epoch": 0.4781150290436063, + "grad_norm": 0.11376013606786728, + "learning_rate": 9.422436619745008e-06, + "loss": 0.0028, + "step": 29220 + }, + { + "epoch": 0.47827865499468214, + "grad_norm": 0.10313332080841064, + "learning_rate": 9.421770229657959e-06, + "loss": 0.0024, + "step": 29230 + }, + { + "epoch": 0.47844228094575797, + "grad_norm": 0.1202329769730568, + "learning_rate": 9.421103478946374e-06, + "loss": 0.003, + "step": 29240 + }, + { + "epoch": 0.47860590689683385, + "grad_norm": 0.11629094928503036, + "learning_rate": 9.420436367664631e-06, + "loss": 0.0034, + "step": 29250 + }, + { + "epoch": 0.4787695328479097, + "grad_norm": 0.10433991998434067, + "learning_rate": 9.41976889586714e-06, + "loss": 0.0048, + "step": 29260 + }, + { + "epoch": 0.4789331587989855, + "grad_norm": 0.11831417679786682, + "learning_rate": 9.419101063608331e-06, + "loss": 0.0033, + "step": 29270 + }, + { + "epoch": 0.47909678475006134, + "grad_norm": 0.15843774378299713, + "learning_rate": 9.418432870942675e-06, + "loss": 0.0046, + "step": 29280 + }, + { + "epoch": 0.4792604107011372, + "grad_norm": 0.054176975041627884, + "learning_rate": 9.417764317924669e-06, + "loss": 0.0024, + "step": 29290 + }, + { + "epoch": 0.47942403665221306, + "grad_norm": 0.03427838906645775, + "learning_rate": 9.417095404608834e-06, + "loss": 0.0032, + "step": 29300 + }, + { + "epoch": 0.4795876626032889, + "grad_norm": 0.04099629446864128, + "learning_rate": 9.416426131049727e-06, + "loss": 0.0018, + "step": 29310 + }, + { + "epoch": 0.4797512885543647, + "grad_norm": 0.10742400586605072, + "learning_rate": 9.41575649730193e-06, + "loss": 0.0024, + "step": 29320 + }, + { + "epoch": 0.47991491450544055, + "grad_norm": 0.17142325639724731, + "learning_rate": 9.415086503420059e-06, + "loss": 0.0046, + "step": 29330 + }, + { + "epoch": 0.4800785404565164, + "grad_norm": 0.09329265356063843, + "learning_rate": 9.414416149458751e-06, + "loss": 0.0029, + "step": 29340 + }, + { + "epoch": 0.48024216640759226, + "grad_norm": 0.04275538772344589, + "learning_rate": 9.413745435472683e-06, + "loss": 0.0047, + "step": 29350 + }, + { + "epoch": 0.4804057923586681, + "grad_norm": 0.004263002425432205, + "learning_rate": 9.413074361516554e-06, + "loss": 0.003, + "step": 29360 + }, + { + "epoch": 0.4805694183097439, + "grad_norm": 0.2757914662361145, + "learning_rate": 9.412402927645095e-06, + "loss": 0.004, + "step": 29370 + }, + { + "epoch": 0.48073304426081975, + "grad_norm": 0.09664005041122437, + "learning_rate": 9.411731133913063e-06, + "loss": 0.0033, + "step": 29380 + }, + { + "epoch": 0.4808966702118956, + "grad_norm": 0.20267799496650696, + "learning_rate": 9.411058980375251e-06, + "loss": 0.003, + "step": 29390 + }, + { + "epoch": 0.48106029616297147, + "grad_norm": 0.04491041228175163, + "learning_rate": 9.410386467086479e-06, + "loss": 0.0026, + "step": 29400 + }, + { + "epoch": 0.4812239221140473, + "grad_norm": 0.28391507267951965, + "learning_rate": 9.409713594101589e-06, + "loss": 0.0021, + "step": 29410 + }, + { + "epoch": 0.4813875480651231, + "grad_norm": 0.18214187026023865, + "learning_rate": 9.409040361475463e-06, + "loss": 0.0038, + "step": 29420 + }, + { + "epoch": 0.48155117401619896, + "grad_norm": 0.044792406260967255, + "learning_rate": 9.408366769263005e-06, + "loss": 0.0031, + "step": 29430 + }, + { + "epoch": 0.4817147999672748, + "grad_norm": 0.12545135617256165, + "learning_rate": 9.407692817519151e-06, + "loss": 0.0031, + "step": 29440 + }, + { + "epoch": 0.4818784259183507, + "grad_norm": 0.11244130879640579, + "learning_rate": 9.407018506298868e-06, + "loss": 0.0023, + "step": 29450 + }, + { + "epoch": 0.4820420518694265, + "grad_norm": 0.027645418420433998, + "learning_rate": 9.40634383565715e-06, + "loss": 0.0035, + "step": 29460 + }, + { + "epoch": 0.48220567782050233, + "grad_norm": 0.17549319565296173, + "learning_rate": 9.405668805649023e-06, + "loss": 0.0046, + "step": 29470 + }, + { + "epoch": 0.48236930377157816, + "grad_norm": 0.12509573996067047, + "learning_rate": 9.404993416329533e-06, + "loss": 0.0047, + "step": 29480 + }, + { + "epoch": 0.482532929722654, + "grad_norm": 0.1549886018037796, + "learning_rate": 9.40431766775377e-06, + "loss": 0.0023, + "step": 29490 + }, + { + "epoch": 0.4826965556737299, + "grad_norm": 0.2679791748523712, + "learning_rate": 9.40364155997684e-06, + "loss": 0.0043, + "step": 29500 + }, + { + "epoch": 0.4828601816248057, + "grad_norm": 0.11589089781045914, + "learning_rate": 9.402965093053888e-06, + "loss": 0.0044, + "step": 29510 + }, + { + "epoch": 0.48302380757588154, + "grad_norm": 0.1716080904006958, + "learning_rate": 9.402288267040085e-06, + "loss": 0.0046, + "step": 29520 + }, + { + "epoch": 0.48318743352695737, + "grad_norm": 0.16904345154762268, + "learning_rate": 9.401611081990628e-06, + "loss": 0.003, + "step": 29530 + }, + { + "epoch": 0.4833510594780332, + "grad_norm": 0.11915217339992523, + "learning_rate": 9.400933537960745e-06, + "loss": 0.0023, + "step": 29540 + }, + { + "epoch": 0.4835146854291091, + "grad_norm": 0.061145078390836716, + "learning_rate": 9.400255635005699e-06, + "loss": 0.0026, + "step": 29550 + }, + { + "epoch": 0.4836783113801849, + "grad_norm": 0.08691444247961044, + "learning_rate": 9.399577373180773e-06, + "loss": 0.0039, + "step": 29560 + }, + { + "epoch": 0.48384193733126074, + "grad_norm": 0.05766739696264267, + "learning_rate": 9.398898752541284e-06, + "loss": 0.0031, + "step": 29570 + }, + { + "epoch": 0.48400556328233657, + "grad_norm": 0.08079709112644196, + "learning_rate": 9.39821977314258e-06, + "loss": 0.0041, + "step": 29580 + }, + { + "epoch": 0.4841691892334124, + "grad_norm": 0.07256931811571121, + "learning_rate": 9.397540435040034e-06, + "loss": 0.0032, + "step": 29590 + }, + { + "epoch": 0.4843328151844883, + "grad_norm": 0.06149398535490036, + "learning_rate": 9.396860738289054e-06, + "loss": 0.0054, + "step": 29600 + }, + { + "epoch": 0.4844964411355641, + "grad_norm": 0.1838526725769043, + "learning_rate": 9.396180682945071e-06, + "loss": 0.0022, + "step": 29610 + }, + { + "epoch": 0.48466006708663995, + "grad_norm": 0.021014414727687836, + "learning_rate": 9.395500269063546e-06, + "loss": 0.0027, + "step": 29620 + }, + { + "epoch": 0.4848236930377158, + "grad_norm": 0.18143707513809204, + "learning_rate": 9.394819496699977e-06, + "loss": 0.0038, + "step": 29630 + }, + { + "epoch": 0.4849873189887916, + "grad_norm": 0.041861046105623245, + "learning_rate": 9.394138365909882e-06, + "loss": 0.0037, + "step": 29640 + }, + { + "epoch": 0.4851509449398675, + "grad_norm": 0.12975075840950012, + "learning_rate": 9.393456876748812e-06, + "loss": 0.003, + "step": 29650 + }, + { + "epoch": 0.4853145708909433, + "grad_norm": 0.31343191862106323, + "learning_rate": 9.392775029272345e-06, + "loss": 0.0035, + "step": 29660 + }, + { + "epoch": 0.48547819684201915, + "grad_norm": 0.07164093106985092, + "learning_rate": 9.392092823536095e-06, + "loss": 0.0029, + "step": 29670 + }, + { + "epoch": 0.485641822793095, + "grad_norm": 0.08914093673229218, + "learning_rate": 9.391410259595695e-06, + "loss": 0.003, + "step": 29680 + }, + { + "epoch": 0.4858054487441708, + "grad_norm": 0.06592705845832825, + "learning_rate": 9.390727337506815e-06, + "loss": 0.0048, + "step": 29690 + }, + { + "epoch": 0.48596907469524664, + "grad_norm": 0.0723566859960556, + "learning_rate": 9.390044057325154e-06, + "loss": 0.0025, + "step": 29700 + }, + { + "epoch": 0.4861327006463225, + "grad_norm": 0.15704552829265594, + "learning_rate": 9.389360419106435e-06, + "loss": 0.0034, + "step": 29710 + }, + { + "epoch": 0.48629632659739835, + "grad_norm": 0.07331628352403641, + "learning_rate": 9.388676422906413e-06, + "loss": 0.0047, + "step": 29720 + }, + { + "epoch": 0.4864599525484742, + "grad_norm": 0.07547681778669357, + "learning_rate": 9.387992068780876e-06, + "loss": 0.0035, + "step": 29730 + }, + { + "epoch": 0.48662357849955, + "grad_norm": 0.1840086132287979, + "learning_rate": 9.387307356785634e-06, + "loss": 0.004, + "step": 29740 + }, + { + "epoch": 0.48678720445062584, + "grad_norm": 0.13028809428215027, + "learning_rate": 9.386622286976529e-06, + "loss": 0.0038, + "step": 29750 + }, + { + "epoch": 0.48695083040170173, + "grad_norm": 0.048142917454242706, + "learning_rate": 9.385936859409436e-06, + "loss": 0.0023, + "step": 29760 + }, + { + "epoch": 0.48711445635277756, + "grad_norm": 0.08636688441038132, + "learning_rate": 9.385251074140256e-06, + "loss": 0.0026, + "step": 29770 + }, + { + "epoch": 0.4872780823038534, + "grad_norm": 0.1748528629541397, + "learning_rate": 9.384564931224918e-06, + "loss": 0.0048, + "step": 29780 + }, + { + "epoch": 0.4874417082549292, + "grad_norm": 0.11434413492679596, + "learning_rate": 9.38387843071938e-06, + "loss": 0.0021, + "step": 29790 + }, + { + "epoch": 0.48760533420600505, + "grad_norm": 0.09903785586357117, + "learning_rate": 9.383191572679635e-06, + "loss": 0.006, + "step": 29800 + }, + { + "epoch": 0.48776896015708093, + "grad_norm": 0.1673927903175354, + "learning_rate": 9.382504357161698e-06, + "loss": 0.0024, + "step": 29810 + }, + { + "epoch": 0.48793258610815676, + "grad_norm": 0.04580897465348244, + "learning_rate": 9.381816784221613e-06, + "loss": 0.0023, + "step": 29820 + }, + { + "epoch": 0.4880962120592326, + "grad_norm": 0.03202640637755394, + "learning_rate": 9.381128853915462e-06, + "loss": 0.0022, + "step": 29830 + }, + { + "epoch": 0.4882598380103084, + "grad_norm": 0.14334113895893097, + "learning_rate": 9.380440566299347e-06, + "loss": 0.0037, + "step": 29840 + }, + { + "epoch": 0.48842346396138425, + "grad_norm": 0.1461249738931656, + "learning_rate": 9.379751921429403e-06, + "loss": 0.0034, + "step": 29850 + }, + { + "epoch": 0.48858708991246014, + "grad_norm": 0.10982855409383774, + "learning_rate": 9.379062919361794e-06, + "loss": 0.0034, + "step": 29860 + }, + { + "epoch": 0.48875071586353597, + "grad_norm": 0.13848136365413666, + "learning_rate": 9.378373560152711e-06, + "loss": 0.0039, + "step": 29870 + }, + { + "epoch": 0.4889143418146118, + "grad_norm": 0.1193138062953949, + "learning_rate": 9.377683843858378e-06, + "loss": 0.0041, + "step": 29880 + }, + { + "epoch": 0.4890779677656876, + "grad_norm": 0.09822847694158554, + "learning_rate": 9.376993770535046e-06, + "loss": 0.0029, + "step": 29890 + }, + { + "epoch": 0.48924159371676346, + "grad_norm": 0.13098573684692383, + "learning_rate": 9.376303340238993e-06, + "loss": 0.0025, + "step": 29900 + }, + { + "epoch": 0.48940521966783934, + "grad_norm": 0.05100260302424431, + "learning_rate": 9.375612553026527e-06, + "loss": 0.0033, + "step": 29910 + }, + { + "epoch": 0.48956884561891517, + "grad_norm": 0.20293666422367096, + "learning_rate": 9.37492140895399e-06, + "loss": 0.0028, + "step": 29920 + }, + { + "epoch": 0.489732471569991, + "grad_norm": 0.10998491197824478, + "learning_rate": 9.374229908077748e-06, + "loss": 0.0025, + "step": 29930 + }, + { + "epoch": 0.48989609752106683, + "grad_norm": 0.0260187815874815, + "learning_rate": 9.373538050454197e-06, + "loss": 0.0021, + "step": 29940 + }, + { + "epoch": 0.49005972347214266, + "grad_norm": 0.15895287692546844, + "learning_rate": 9.37284583613976e-06, + "loss": 0.0033, + "step": 29950 + }, + { + "epoch": 0.49022334942321855, + "grad_norm": 0.11989407986402512, + "learning_rate": 9.372153265190898e-06, + "loss": 0.0019, + "step": 29960 + }, + { + "epoch": 0.4903869753742944, + "grad_norm": 0.0730937048792839, + "learning_rate": 9.37146033766409e-06, + "loss": 0.004, + "step": 29970 + }, + { + "epoch": 0.4905506013253702, + "grad_norm": 0.08417189866304398, + "learning_rate": 9.370767053615849e-06, + "loss": 0.0032, + "step": 29980 + }, + { + "epoch": 0.49071422727644604, + "grad_norm": 0.18247011303901672, + "learning_rate": 9.370073413102718e-06, + "loss": 0.0038, + "step": 29990 + }, + { + "epoch": 0.49087785322752187, + "grad_norm": 0.18512503802776337, + "learning_rate": 9.369379416181268e-06, + "loss": 0.0018, + "step": 30000 + }, + { + "epoch": 0.49104147917859775, + "grad_norm": 0.2067500352859497, + "learning_rate": 9.368685062908098e-06, + "loss": 0.0025, + "step": 30010 + }, + { + "epoch": 0.4912051051296736, + "grad_norm": 0.2376611977815628, + "learning_rate": 9.367990353339838e-06, + "loss": 0.0042, + "step": 30020 + }, + { + "epoch": 0.4913687310807494, + "grad_norm": 0.05311886593699455, + "learning_rate": 9.367295287533148e-06, + "loss": 0.0033, + "step": 30030 + }, + { + "epoch": 0.49153235703182524, + "grad_norm": 0.14123480021953583, + "learning_rate": 9.36659986554471e-06, + "loss": 0.0043, + "step": 30040 + }, + { + "epoch": 0.49169598298290107, + "grad_norm": 0.29360732436180115, + "learning_rate": 9.365904087431246e-06, + "loss": 0.0021, + "step": 30050 + }, + { + "epoch": 0.49185960893397696, + "grad_norm": 0.08756820857524872, + "learning_rate": 9.365207953249497e-06, + "loss": 0.0051, + "step": 30060 + }, + { + "epoch": 0.4920232348850528, + "grad_norm": 0.06855221837759018, + "learning_rate": 9.364511463056241e-06, + "loss": 0.0033, + "step": 30070 + }, + { + "epoch": 0.4921868608361286, + "grad_norm": 0.19539619982242584, + "learning_rate": 9.363814616908278e-06, + "loss": 0.0033, + "step": 30080 + }, + { + "epoch": 0.49235048678720444, + "grad_norm": 0.06320386379957199, + "learning_rate": 9.363117414862441e-06, + "loss": 0.0031, + "step": 30090 + }, + { + "epoch": 0.4925141127382803, + "grad_norm": 0.04635658487677574, + "learning_rate": 9.362419856975593e-06, + "loss": 0.0021, + "step": 30100 + }, + { + "epoch": 0.49267773868935616, + "grad_norm": 0.11186891049146652, + "learning_rate": 9.361721943304625e-06, + "loss": 0.0037, + "step": 30110 + }, + { + "epoch": 0.492841364640432, + "grad_norm": 0.31901976466178894, + "learning_rate": 9.361023673906454e-06, + "loss": 0.0033, + "step": 30120 + }, + { + "epoch": 0.4930049905915078, + "grad_norm": 0.05110229179263115, + "learning_rate": 9.36032504883803e-06, + "loss": 0.0026, + "step": 30130 + }, + { + "epoch": 0.49316861654258365, + "grad_norm": 0.05644770339131355, + "learning_rate": 9.359626068156329e-06, + "loss": 0.0024, + "step": 30140 + }, + { + "epoch": 0.4933322424936595, + "grad_norm": 0.0804319903254509, + "learning_rate": 9.35892673191836e-06, + "loss": 0.0046, + "step": 30150 + }, + { + "epoch": 0.4934958684447353, + "grad_norm": 0.17363384366035461, + "learning_rate": 9.358227040181156e-06, + "loss": 0.0035, + "step": 30160 + }, + { + "epoch": 0.4936594943958112, + "grad_norm": 0.22661608457565308, + "learning_rate": 9.357526993001784e-06, + "loss": 0.0034, + "step": 30170 + }, + { + "epoch": 0.493823120346887, + "grad_norm": 0.16821138560771942, + "learning_rate": 9.356826590437334e-06, + "loss": 0.0028, + "step": 30180 + }, + { + "epoch": 0.49398674629796285, + "grad_norm": 0.11628932505846024, + "learning_rate": 9.356125832544932e-06, + "loss": 0.003, + "step": 30190 + }, + { + "epoch": 0.4941503722490387, + "grad_norm": 0.05957257002592087, + "learning_rate": 9.355424719381729e-06, + "loss": 0.0033, + "step": 30200 + }, + { + "epoch": 0.4943139982001145, + "grad_norm": 0.22713930904865265, + "learning_rate": 9.354723251004902e-06, + "loss": 0.0028, + "step": 30210 + }, + { + "epoch": 0.4944776241511904, + "grad_norm": 0.10876328498125076, + "learning_rate": 9.354021427471665e-06, + "loss": 0.0042, + "step": 30220 + }, + { + "epoch": 0.49464125010226623, + "grad_norm": 0.0741264745593071, + "learning_rate": 9.353319248839251e-06, + "loss": 0.0038, + "step": 30230 + }, + { + "epoch": 0.49480487605334206, + "grad_norm": 0.4156412184238434, + "learning_rate": 9.352616715164933e-06, + "loss": 0.0034, + "step": 30240 + }, + { + "epoch": 0.4949685020044179, + "grad_norm": 0.21535980701446533, + "learning_rate": 9.351913826506003e-06, + "loss": 0.0035, + "step": 30250 + }, + { + "epoch": 0.4951321279554937, + "grad_norm": 0.08432229608297348, + "learning_rate": 9.351210582919789e-06, + "loss": 0.0039, + "step": 30260 + }, + { + "epoch": 0.4952957539065696, + "grad_norm": 0.09170220792293549, + "learning_rate": 9.350506984463643e-06, + "loss": 0.003, + "step": 30270 + }, + { + "epoch": 0.49545937985764543, + "grad_norm": 0.15254619717597961, + "learning_rate": 9.34980303119495e-06, + "loss": 0.0023, + "step": 30280 + }, + { + "epoch": 0.49562300580872126, + "grad_norm": 0.06740779429674149, + "learning_rate": 9.349098723171119e-06, + "loss": 0.0036, + "step": 30290 + }, + { + "epoch": 0.4957866317597971, + "grad_norm": 0.09019188582897186, + "learning_rate": 9.348394060449594e-06, + "loss": 0.0023, + "step": 30300 + }, + { + "epoch": 0.4959502577108729, + "grad_norm": 0.11102591454982758, + "learning_rate": 9.347689043087846e-06, + "loss": 0.0035, + "step": 30310 + }, + { + "epoch": 0.4961138836619488, + "grad_norm": 0.09559548646211624, + "learning_rate": 9.34698367114337e-06, + "loss": 0.0032, + "step": 30320 + }, + { + "epoch": 0.49627750961302464, + "grad_norm": 0.20734286308288574, + "learning_rate": 9.346277944673696e-06, + "loss": 0.0027, + "step": 30330 + }, + { + "epoch": 0.49644113556410047, + "grad_norm": 0.1631333976984024, + "learning_rate": 9.34557186373638e-06, + "loss": 0.0042, + "step": 30340 + }, + { + "epoch": 0.4966047615151763, + "grad_norm": 0.07738049328327179, + "learning_rate": 9.344865428389007e-06, + "loss": 0.0041, + "step": 30350 + }, + { + "epoch": 0.4967683874662521, + "grad_norm": 0.033862121403217316, + "learning_rate": 9.344158638689193e-06, + "loss": 0.0032, + "step": 30360 + }, + { + "epoch": 0.496932013417328, + "grad_norm": 0.047343477606773376, + "learning_rate": 9.34345149469458e-06, + "loss": 0.0021, + "step": 30370 + }, + { + "epoch": 0.49709563936840384, + "grad_norm": 0.04529246687889099, + "learning_rate": 9.342743996462841e-06, + "loss": 0.0031, + "step": 30380 + }, + { + "epoch": 0.49725926531947967, + "grad_norm": 0.04019023850560188, + "learning_rate": 9.342036144051678e-06, + "loss": 0.0053, + "step": 30390 + }, + { + "epoch": 0.4974228912705555, + "grad_norm": 0.10508981347084045, + "learning_rate": 9.341327937518818e-06, + "loss": 0.0032, + "step": 30400 + }, + { + "epoch": 0.49758651722163133, + "grad_norm": 0.07485896348953247, + "learning_rate": 9.340619376922023e-06, + "loss": 0.0046, + "step": 30410 + }, + { + "epoch": 0.4977501431727072, + "grad_norm": 0.13404831290245056, + "learning_rate": 9.33991046231908e-06, + "loss": 0.0061, + "step": 30420 + }, + { + "epoch": 0.49791376912378305, + "grad_norm": 0.047028373926877975, + "learning_rate": 9.339201193767804e-06, + "loss": 0.0028, + "step": 30430 + }, + { + "epoch": 0.4980773950748589, + "grad_norm": 0.22291763126850128, + "learning_rate": 9.338491571326043e-06, + "loss": 0.0016, + "step": 30440 + }, + { + "epoch": 0.4982410210259347, + "grad_norm": 0.06728585809469223, + "learning_rate": 9.33778159505167e-06, + "loss": 0.0045, + "step": 30450 + }, + { + "epoch": 0.49840464697701053, + "grad_norm": 0.07716561108827591, + "learning_rate": 9.337071265002589e-06, + "loss": 0.0026, + "step": 30460 + }, + { + "epoch": 0.4985682729280864, + "grad_norm": 0.09710460156202316, + "learning_rate": 9.33636058123673e-06, + "loss": 0.003, + "step": 30470 + }, + { + "epoch": 0.49873189887916225, + "grad_norm": 0.09153927862644196, + "learning_rate": 9.335649543812057e-06, + "loss": 0.0027, + "step": 30480 + }, + { + "epoch": 0.4988955248302381, + "grad_norm": 0.07671051472425461, + "learning_rate": 9.33493815278656e-06, + "loss": 0.0027, + "step": 30490 + }, + { + "epoch": 0.4990591507813139, + "grad_norm": 0.07697685807943344, + "learning_rate": 9.334226408218253e-06, + "loss": 0.0039, + "step": 30500 + }, + { + "epoch": 0.49922277673238974, + "grad_norm": 0.04844066500663757, + "learning_rate": 9.333514310165188e-06, + "loss": 0.0045, + "step": 30510 + }, + { + "epoch": 0.4993864026834656, + "grad_norm": 0.08367463946342468, + "learning_rate": 9.332801858685438e-06, + "loss": 0.004, + "step": 30520 + }, + { + "epoch": 0.49955002863454145, + "grad_norm": 0.09480585902929306, + "learning_rate": 9.332089053837112e-06, + "loss": 0.0026, + "step": 30530 + }, + { + "epoch": 0.4997136545856173, + "grad_norm": 0.23385405540466309, + "learning_rate": 9.331375895678341e-06, + "loss": 0.0033, + "step": 30540 + }, + { + "epoch": 0.4998772805366931, + "grad_norm": 0.04920531436800957, + "learning_rate": 9.330662384267289e-06, + "loss": 0.0044, + "step": 30550 + }, + { + "epoch": 0.500040906487769, + "grad_norm": 0.06316211074590683, + "learning_rate": 9.329948519662147e-06, + "loss": 0.0031, + "step": 30560 + }, + { + "epoch": 0.5002045324388448, + "grad_norm": 0.10192988067865372, + "learning_rate": 9.329234301921135e-06, + "loss": 0.0048, + "step": 30570 + }, + { + "epoch": 0.5003681583899207, + "grad_norm": 0.1315154731273651, + "learning_rate": 9.328519731102501e-06, + "loss": 0.0028, + "step": 30580 + }, + { + "epoch": 0.5005317843409964, + "grad_norm": 0.10005255043506622, + "learning_rate": 9.327804807264528e-06, + "loss": 0.0032, + "step": 30590 + }, + { + "epoch": 0.5006954102920723, + "grad_norm": 0.0728859156370163, + "learning_rate": 9.327089530465517e-06, + "loss": 0.0026, + "step": 30600 + }, + { + "epoch": 0.5008590362431482, + "grad_norm": 0.08706450462341309, + "learning_rate": 9.326373900763807e-06, + "loss": 0.0038, + "step": 30610 + }, + { + "epoch": 0.501022662194224, + "grad_norm": 0.07957324385643005, + "learning_rate": 9.32565791821776e-06, + "loss": 0.0033, + "step": 30620 + }, + { + "epoch": 0.5011862881452999, + "grad_norm": 0.13891012966632843, + "learning_rate": 9.324941582885772e-06, + "loss": 0.003, + "step": 30630 + }, + { + "epoch": 0.5013499140963756, + "grad_norm": 0.042739301919937134, + "learning_rate": 9.324224894826263e-06, + "loss": 0.0043, + "step": 30640 + }, + { + "epoch": 0.5015135400474515, + "grad_norm": 0.07387175410985947, + "learning_rate": 9.323507854097684e-06, + "loss": 0.0032, + "step": 30650 + }, + { + "epoch": 0.5016771659985274, + "grad_norm": 0.10685703903436661, + "learning_rate": 9.322790460758513e-06, + "loss": 0.0041, + "step": 30660 + }, + { + "epoch": 0.5018407919496032, + "grad_norm": 0.014521356672048569, + "learning_rate": 9.322072714867261e-06, + "loss": 0.0034, + "step": 30670 + }, + { + "epoch": 0.5020044179006791, + "grad_norm": 0.09038981050252914, + "learning_rate": 9.32135461648246e-06, + "loss": 0.0021, + "step": 30680 + }, + { + "epoch": 0.5021680438517548, + "grad_norm": 0.07703753560781479, + "learning_rate": 9.320636165662684e-06, + "loss": 0.003, + "step": 30690 + }, + { + "epoch": 0.5023316698028307, + "grad_norm": 0.04458431154489517, + "learning_rate": 9.31991736246652e-06, + "loss": 0.0022, + "step": 30700 + }, + { + "epoch": 0.5024952957539066, + "grad_norm": 0.23633258044719696, + "learning_rate": 9.319198206952592e-06, + "loss": 0.0058, + "step": 30710 + }, + { + "epoch": 0.5026589217049824, + "grad_norm": 0.08995245397090912, + "learning_rate": 9.318478699179555e-06, + "loss": 0.0043, + "step": 30720 + }, + { + "epoch": 0.5028225476560583, + "grad_norm": 0.3186088800430298, + "learning_rate": 9.317758839206088e-06, + "loss": 0.0027, + "step": 30730 + }, + { + "epoch": 0.502986173607134, + "grad_norm": 0.15831568837165833, + "learning_rate": 9.3170386270909e-06, + "loss": 0.0023, + "step": 30740 + }, + { + "epoch": 0.5031497995582099, + "grad_norm": 0.06968633085489273, + "learning_rate": 9.31631806289273e-06, + "loss": 0.0033, + "step": 30750 + }, + { + "epoch": 0.5033134255092858, + "grad_norm": 0.03314252942800522, + "learning_rate": 9.315597146670343e-06, + "loss": 0.0024, + "step": 30760 + }, + { + "epoch": 0.5034770514603616, + "grad_norm": 0.11809874325990677, + "learning_rate": 9.314875878482536e-06, + "loss": 0.0035, + "step": 30770 + }, + { + "epoch": 0.5036406774114375, + "grad_norm": 0.219374880194664, + "learning_rate": 9.314154258388135e-06, + "loss": 0.0029, + "step": 30780 + }, + { + "epoch": 0.5038043033625133, + "grad_norm": 0.08316024392843246, + "learning_rate": 9.31343228644599e-06, + "loss": 0.003, + "step": 30790 + }, + { + "epoch": 0.5039679293135891, + "grad_norm": 0.14430877566337585, + "learning_rate": 9.31270996271498e-06, + "loss": 0.003, + "step": 30800 + }, + { + "epoch": 0.504131555264665, + "grad_norm": 0.17799600958824158, + "learning_rate": 9.311987287254022e-06, + "loss": 0.0021, + "step": 30810 + }, + { + "epoch": 0.5042951812157408, + "grad_norm": 0.3822096586227417, + "learning_rate": 9.311264260122051e-06, + "loss": 0.0022, + "step": 30820 + }, + { + "epoch": 0.5044588071668167, + "grad_norm": 0.055685460567474365, + "learning_rate": 9.310540881378034e-06, + "loss": 0.0032, + "step": 30830 + }, + { + "epoch": 0.5046224331178925, + "grad_norm": 0.1303826868534088, + "learning_rate": 9.309817151080969e-06, + "loss": 0.0024, + "step": 30840 + }, + { + "epoch": 0.5047860590689683, + "grad_norm": 0.06911187618970871, + "learning_rate": 9.30909306928988e-06, + "loss": 0.0026, + "step": 30850 + }, + { + "epoch": 0.5049496850200442, + "grad_norm": 0.05135410279035568, + "learning_rate": 9.30836863606382e-06, + "loss": 0.0041, + "step": 30860 + }, + { + "epoch": 0.50511331097112, + "grad_norm": 0.24860428273677826, + "learning_rate": 9.307643851461874e-06, + "loss": 0.0035, + "step": 30870 + }, + { + "epoch": 0.5052769369221959, + "grad_norm": 0.18532079458236694, + "learning_rate": 9.306918715543152e-06, + "loss": 0.0035, + "step": 30880 + }, + { + "epoch": 0.5054405628732717, + "grad_norm": 0.05449739098548889, + "learning_rate": 9.306193228366791e-06, + "loss": 0.0028, + "step": 30890 + }, + { + "epoch": 0.5056041888243475, + "grad_norm": 0.07781044393777847, + "learning_rate": 9.305467389991964e-06, + "loss": 0.0033, + "step": 30900 + }, + { + "epoch": 0.5057678147754234, + "grad_norm": 0.15199995040893555, + "learning_rate": 9.304741200477863e-06, + "loss": 0.0029, + "step": 30910 + }, + { + "epoch": 0.5059314407264992, + "grad_norm": 0.2231421023607254, + "learning_rate": 9.304014659883716e-06, + "loss": 0.0018, + "step": 30920 + }, + { + "epoch": 0.5060950666775751, + "grad_norm": 0.08540011942386627, + "learning_rate": 9.303287768268775e-06, + "loss": 0.0027, + "step": 30930 + }, + { + "epoch": 0.5062586926286509, + "grad_norm": 0.0498240664601326, + "learning_rate": 9.302560525692329e-06, + "loss": 0.0019, + "step": 30940 + }, + { + "epoch": 0.5064223185797267, + "grad_norm": 0.2611545920372009, + "learning_rate": 9.301832932213681e-06, + "loss": 0.0031, + "step": 30950 + }, + { + "epoch": 0.5065859445308026, + "grad_norm": 0.3127445578575134, + "learning_rate": 9.301104987892177e-06, + "loss": 0.0022, + "step": 30960 + }, + { + "epoch": 0.5067495704818784, + "grad_norm": 0.1496264934539795, + "learning_rate": 9.300376692787185e-06, + "loss": 0.0019, + "step": 30970 + }, + { + "epoch": 0.5069131964329543, + "grad_norm": 0.06906845420598984, + "learning_rate": 9.299648046958098e-06, + "loss": 0.0026, + "step": 30980 + }, + { + "epoch": 0.5070768223840301, + "grad_norm": 0.11016680300235748, + "learning_rate": 9.298919050464348e-06, + "loss": 0.0045, + "step": 30990 + }, + { + "epoch": 0.507240448335106, + "grad_norm": 0.06764791905879974, + "learning_rate": 9.298189703365385e-06, + "loss": 0.0033, + "step": 31000 + }, + { + "epoch": 0.5074040742861818, + "grad_norm": 0.06481602787971497, + "learning_rate": 9.297460005720694e-06, + "loss": 0.0023, + "step": 31010 + }, + { + "epoch": 0.5075677002372576, + "grad_norm": 0.19831061363220215, + "learning_rate": 9.296729957589784e-06, + "loss": 0.0043, + "step": 31020 + }, + { + "epoch": 0.5077313261883335, + "grad_norm": 0.08853733539581299, + "learning_rate": 9.2959995590322e-06, + "loss": 0.0026, + "step": 31030 + }, + { + "epoch": 0.5078949521394093, + "grad_norm": 0.11338179558515549, + "learning_rate": 9.295268810107508e-06, + "loss": 0.0051, + "step": 31040 + }, + { + "epoch": 0.5080585780904852, + "grad_norm": 0.18170523643493652, + "learning_rate": 9.294537710875305e-06, + "loss": 0.002, + "step": 31050 + }, + { + "epoch": 0.508222204041561, + "grad_norm": 0.14694349467754364, + "learning_rate": 9.293806261395218e-06, + "loss": 0.004, + "step": 31060 + }, + { + "epoch": 0.5083858299926368, + "grad_norm": 0.18828746676445007, + "learning_rate": 9.2930744617269e-06, + "loss": 0.0045, + "step": 31070 + }, + { + "epoch": 0.5085494559437127, + "grad_norm": 0.13360236585140228, + "learning_rate": 9.292342311930038e-06, + "loss": 0.003, + "step": 31080 + }, + { + "epoch": 0.5087130818947885, + "grad_norm": 0.04547407850623131, + "learning_rate": 9.291609812064338e-06, + "loss": 0.0028, + "step": 31090 + }, + { + "epoch": 0.5088767078458644, + "grad_norm": 0.06315422058105469, + "learning_rate": 9.290876962189543e-06, + "loss": 0.002, + "step": 31100 + }, + { + "epoch": 0.5090403337969402, + "grad_norm": 0.3497620224952698, + "learning_rate": 9.290143762365423e-06, + "loss": 0.0043, + "step": 31110 + }, + { + "epoch": 0.509203959748016, + "grad_norm": 0.08874396234750748, + "learning_rate": 9.289410212651774e-06, + "loss": 0.0057, + "step": 31120 + }, + { + "epoch": 0.5093675856990919, + "grad_norm": 0.1794067621231079, + "learning_rate": 9.288676313108422e-06, + "loss": 0.0034, + "step": 31130 + }, + { + "epoch": 0.5095312116501677, + "grad_norm": 0.0529162734746933, + "learning_rate": 9.28794206379522e-06, + "loss": 0.0034, + "step": 31140 + }, + { + "epoch": 0.5096948376012436, + "grad_norm": 0.18635709583759308, + "learning_rate": 9.287207464772054e-06, + "loss": 0.0022, + "step": 31150 + }, + { + "epoch": 0.5098584635523195, + "grad_norm": 0.10029938817024231, + "learning_rate": 9.286472516098832e-06, + "loss": 0.0023, + "step": 31160 + }, + { + "epoch": 0.5100220895033952, + "grad_norm": 0.06586988270282745, + "learning_rate": 9.285737217835496e-06, + "loss": 0.003, + "step": 31170 + }, + { + "epoch": 0.5101857154544711, + "grad_norm": 0.1123717799782753, + "learning_rate": 9.285001570042014e-06, + "loss": 0.0021, + "step": 31180 + }, + { + "epoch": 0.5103493414055469, + "grad_norm": 0.081089086830616, + "learning_rate": 9.284265572778381e-06, + "loss": 0.0024, + "step": 31190 + }, + { + "epoch": 0.5105129673566228, + "grad_norm": 0.06498655676841736, + "learning_rate": 9.283529226104626e-06, + "loss": 0.0034, + "step": 31200 + }, + { + "epoch": 0.5106765933076987, + "grad_norm": 0.0598677322268486, + "learning_rate": 9.282792530080801e-06, + "loss": 0.002, + "step": 31210 + }, + { + "epoch": 0.5108402192587744, + "grad_norm": 0.042015474289655685, + "learning_rate": 9.28205548476699e-06, + "loss": 0.0025, + "step": 31220 + }, + { + "epoch": 0.5110038452098503, + "grad_norm": 0.03508414700627327, + "learning_rate": 9.2813180902233e-06, + "loss": 0.0025, + "step": 31230 + }, + { + "epoch": 0.5111674711609261, + "grad_norm": 0.059798464179039, + "learning_rate": 9.280580346509873e-06, + "loss": 0.0035, + "step": 31240 + }, + { + "epoch": 0.511331097112002, + "grad_norm": 0.06756911426782608, + "learning_rate": 9.279842253686878e-06, + "loss": 0.0018, + "step": 31250 + }, + { + "epoch": 0.5114947230630777, + "grad_norm": 0.07574374973773956, + "learning_rate": 9.279103811814509e-06, + "loss": 0.0024, + "step": 31260 + }, + { + "epoch": 0.5116583490141536, + "grad_norm": 0.08296548575162888, + "learning_rate": 9.278365020952993e-06, + "loss": 0.0034, + "step": 31270 + }, + { + "epoch": 0.5118219749652295, + "grad_norm": 0.10783910751342773, + "learning_rate": 9.277625881162582e-06, + "loss": 0.0025, + "step": 31280 + }, + { + "epoch": 0.5119856009163053, + "grad_norm": 0.03009295091032982, + "learning_rate": 9.276886392503558e-06, + "loss": 0.0021, + "step": 31290 + }, + { + "epoch": 0.5121492268673812, + "grad_norm": 0.059914231300354004, + "learning_rate": 9.27614655503623e-06, + "loss": 0.0032, + "step": 31300 + }, + { + "epoch": 0.512312852818457, + "grad_norm": 0.0674881637096405, + "learning_rate": 9.275406368820938e-06, + "loss": 0.0032, + "step": 31310 + }, + { + "epoch": 0.5124764787695328, + "grad_norm": 0.12186701595783234, + "learning_rate": 9.274665833918049e-06, + "loss": 0.0036, + "step": 31320 + }, + { + "epoch": 0.5126401047206087, + "grad_norm": 0.24976617097854614, + "learning_rate": 9.273924950387958e-06, + "loss": 0.0036, + "step": 31330 + }, + { + "epoch": 0.5128037306716845, + "grad_norm": 0.06949980556964874, + "learning_rate": 9.27318371829109e-06, + "loss": 0.0026, + "step": 31340 + }, + { + "epoch": 0.5129673566227604, + "grad_norm": 0.1329091340303421, + "learning_rate": 9.272442137687895e-06, + "loss": 0.0032, + "step": 31350 + }, + { + "epoch": 0.5131309825738362, + "grad_norm": 0.12890006601810455, + "learning_rate": 9.271700208638856e-06, + "loss": 0.0058, + "step": 31360 + }, + { + "epoch": 0.513294608524912, + "grad_norm": 0.10463836044073105, + "learning_rate": 9.270957931204482e-06, + "loss": 0.0053, + "step": 31370 + }, + { + "epoch": 0.5134582344759879, + "grad_norm": 0.0823264941573143, + "learning_rate": 9.270215305445311e-06, + "loss": 0.0032, + "step": 31380 + }, + { + "epoch": 0.5136218604270637, + "grad_norm": 0.18861626088619232, + "learning_rate": 9.269472331421907e-06, + "loss": 0.0025, + "step": 31390 + }, + { + "epoch": 0.5137854863781396, + "grad_norm": 0.1712394505739212, + "learning_rate": 9.268729009194865e-06, + "loss": 0.004, + "step": 31400 + }, + { + "epoch": 0.5139491123292154, + "grad_norm": 0.10809671878814697, + "learning_rate": 9.26798533882481e-06, + "loss": 0.0022, + "step": 31410 + }, + { + "epoch": 0.5141127382802912, + "grad_norm": 0.07816719263792038, + "learning_rate": 9.267241320372391e-06, + "loss": 0.0057, + "step": 31420 + }, + { + "epoch": 0.5142763642313671, + "grad_norm": 0.04988302290439606, + "learning_rate": 9.266496953898289e-06, + "loss": 0.0037, + "step": 31430 + }, + { + "epoch": 0.5144399901824429, + "grad_norm": 0.30293551087379456, + "learning_rate": 9.26575223946321e-06, + "loss": 0.0027, + "step": 31440 + }, + { + "epoch": 0.5146036161335188, + "grad_norm": 0.1359817236661911, + "learning_rate": 9.265007177127894e-06, + "loss": 0.0041, + "step": 31450 + }, + { + "epoch": 0.5147672420845946, + "grad_norm": 0.07000212371349335, + "learning_rate": 9.264261766953101e-06, + "loss": 0.0044, + "step": 31460 + }, + { + "epoch": 0.5149308680356705, + "grad_norm": 0.12727925181388855, + "learning_rate": 9.263516008999628e-06, + "loss": 0.0035, + "step": 31470 + }, + { + "epoch": 0.5150944939867463, + "grad_norm": 0.1488708108663559, + "learning_rate": 9.262769903328294e-06, + "loss": 0.0048, + "step": 31480 + }, + { + "epoch": 0.5152581199378221, + "grad_norm": 0.05991646647453308, + "learning_rate": 9.262023449999951e-06, + "loss": 0.0097, + "step": 31490 + }, + { + "epoch": 0.515421745888898, + "grad_norm": 0.11776924878358841, + "learning_rate": 9.261276649075474e-06, + "loss": 0.0041, + "step": 31500 + }, + { + "epoch": 0.5155853718399738, + "grad_norm": 0.11384090036153793, + "learning_rate": 9.260529500615774e-06, + "loss": 0.0029, + "step": 31510 + }, + { + "epoch": 0.5157489977910497, + "grad_norm": 0.22239993512630463, + "learning_rate": 9.259782004681783e-06, + "loss": 0.0022, + "step": 31520 + }, + { + "epoch": 0.5159126237421255, + "grad_norm": 0.18100106716156006, + "learning_rate": 9.259034161334464e-06, + "loss": 0.0032, + "step": 31530 + }, + { + "epoch": 0.5160762496932013, + "grad_norm": 0.15316882729530334, + "learning_rate": 9.25828597063481e-06, + "loss": 0.0023, + "step": 31540 + }, + { + "epoch": 0.5162398756442772, + "grad_norm": 0.1278257817029953, + "learning_rate": 9.25753743264384e-06, + "loss": 0.0043, + "step": 31550 + }, + { + "epoch": 0.516403501595353, + "grad_norm": 0.06046155095100403, + "learning_rate": 9.256788547422601e-06, + "loss": 0.0061, + "step": 31560 + }, + { + "epoch": 0.5165671275464289, + "grad_norm": 0.10817162692546844, + "learning_rate": 9.256039315032172e-06, + "loss": 0.0023, + "step": 31570 + }, + { + "epoch": 0.5167307534975047, + "grad_norm": 0.14828264713287354, + "learning_rate": 9.255289735533656e-06, + "loss": 0.0053, + "step": 31580 + }, + { + "epoch": 0.5168943794485805, + "grad_norm": 0.1993962526321411, + "learning_rate": 9.254539808988189e-06, + "loss": 0.0049, + "step": 31590 + }, + { + "epoch": 0.5170580053996564, + "grad_norm": 0.0732942596077919, + "learning_rate": 9.253789535456929e-06, + "loss": 0.0042, + "step": 31600 + }, + { + "epoch": 0.5172216313507322, + "grad_norm": 0.12835919857025146, + "learning_rate": 9.253038915001066e-06, + "loss": 0.0038, + "step": 31610 + }, + { + "epoch": 0.5173852573018081, + "grad_norm": 0.21196874976158142, + "learning_rate": 9.252287947681822e-06, + "loss": 0.0031, + "step": 31620 + }, + { + "epoch": 0.517548883252884, + "grad_norm": 0.05480005964636803, + "learning_rate": 9.251536633560439e-06, + "loss": 0.0038, + "step": 31630 + }, + { + "epoch": 0.5177125092039597, + "grad_norm": 0.06896719336509705, + "learning_rate": 9.250784972698192e-06, + "loss": 0.0029, + "step": 31640 + }, + { + "epoch": 0.5178761351550356, + "grad_norm": 0.05573992058634758, + "learning_rate": 9.250032965156386e-06, + "loss": 0.0027, + "step": 31650 + }, + { + "epoch": 0.5180397611061114, + "grad_norm": 0.06220722198486328, + "learning_rate": 9.249280610996352e-06, + "loss": 0.0026, + "step": 31660 + }, + { + "epoch": 0.5182033870571873, + "grad_norm": 0.12922725081443787, + "learning_rate": 9.248527910279447e-06, + "loss": 0.0029, + "step": 31670 + }, + { + "epoch": 0.5183670130082632, + "grad_norm": 0.06652378290891647, + "learning_rate": 9.247774863067063e-06, + "loss": 0.0035, + "step": 31680 + }, + { + "epoch": 0.5185306389593389, + "grad_norm": 0.1481589525938034, + "learning_rate": 9.24702146942061e-06, + "loss": 0.0023, + "step": 31690 + }, + { + "epoch": 0.5186942649104148, + "grad_norm": 0.7701807618141174, + "learning_rate": 9.24626772940154e-06, + "loss": 0.0026, + "step": 31700 + }, + { + "epoch": 0.5188578908614906, + "grad_norm": 0.04232223704457283, + "learning_rate": 9.245513643071317e-06, + "loss": 0.0037, + "step": 31710 + }, + { + "epoch": 0.5190215168125665, + "grad_norm": 0.09090936928987503, + "learning_rate": 9.244759210491448e-06, + "loss": 0.0041, + "step": 31720 + }, + { + "epoch": 0.5191851427636424, + "grad_norm": 0.07332294434309006, + "learning_rate": 9.244004431723458e-06, + "loss": 0.0025, + "step": 31730 + }, + { + "epoch": 0.5193487687147181, + "grad_norm": 0.12391065806150436, + "learning_rate": 9.243249306828907e-06, + "loss": 0.0024, + "step": 31740 + }, + { + "epoch": 0.519512394665794, + "grad_norm": 0.04803864285349846, + "learning_rate": 9.24249383586938e-06, + "loss": 0.0033, + "step": 31750 + }, + { + "epoch": 0.5196760206168698, + "grad_norm": 0.04341021552681923, + "learning_rate": 9.241738018906487e-06, + "loss": 0.0026, + "step": 31760 + }, + { + "epoch": 0.5198396465679457, + "grad_norm": 0.07323268055915833, + "learning_rate": 9.240981856001876e-06, + "loss": 0.002, + "step": 31770 + }, + { + "epoch": 0.5200032725190216, + "grad_norm": 0.07374236732721329, + "learning_rate": 9.240225347217213e-06, + "loss": 0.004, + "step": 31780 + }, + { + "epoch": 0.5201668984700973, + "grad_norm": 0.30002522468566895, + "learning_rate": 9.239468492614197e-06, + "loss": 0.0058, + "step": 31790 + }, + { + "epoch": 0.5203305244211732, + "grad_norm": 0.14853738248348236, + "learning_rate": 9.238711292254553e-06, + "loss": 0.0032, + "step": 31800 + }, + { + "epoch": 0.520494150372249, + "grad_norm": 0.04692799225449562, + "learning_rate": 9.23795374620004e-06, + "loss": 0.0022, + "step": 31810 + }, + { + "epoch": 0.5206577763233249, + "grad_norm": 0.06869132816791534, + "learning_rate": 9.237195854512436e-06, + "loss": 0.0041, + "step": 31820 + }, + { + "epoch": 0.5208214022744008, + "grad_norm": 0.04035910964012146, + "learning_rate": 9.236437617253556e-06, + "loss": 0.0035, + "step": 31830 + }, + { + "epoch": 0.5209850282254765, + "grad_norm": 0.04523344337940216, + "learning_rate": 9.235679034485237e-06, + "loss": 0.0028, + "step": 31840 + }, + { + "epoch": 0.5211486541765524, + "grad_norm": 0.20912596583366394, + "learning_rate": 9.234920106269346e-06, + "loss": 0.0029, + "step": 31850 + }, + { + "epoch": 0.5213122801276282, + "grad_norm": 0.10722243785858154, + "learning_rate": 9.234160832667781e-06, + "loss": 0.0019, + "step": 31860 + }, + { + "epoch": 0.5214759060787041, + "grad_norm": 0.18421198427677155, + "learning_rate": 9.233401213742464e-06, + "loss": 0.0041, + "step": 31870 + }, + { + "epoch": 0.52163953202978, + "grad_norm": 0.23088569939136505, + "learning_rate": 9.232641249555348e-06, + "loss": 0.0047, + "step": 31880 + }, + { + "epoch": 0.5218031579808557, + "grad_norm": 0.17881347239017487, + "learning_rate": 9.23188094016841e-06, + "loss": 0.0034, + "step": 31890 + }, + { + "epoch": 0.5219667839319316, + "grad_norm": 0.06535833328962326, + "learning_rate": 9.231120285643662e-06, + "loss": 0.0035, + "step": 31900 + }, + { + "epoch": 0.5221304098830074, + "grad_norm": 0.12647125124931335, + "learning_rate": 9.23035928604314e-06, + "loss": 0.0033, + "step": 31910 + }, + { + "epoch": 0.5222940358340833, + "grad_norm": 0.025344140827655792, + "learning_rate": 9.229597941428907e-06, + "loss": 0.0024, + "step": 31920 + }, + { + "epoch": 0.5224576617851592, + "grad_norm": 0.09759747982025146, + "learning_rate": 9.228836251863055e-06, + "loss": 0.0026, + "step": 31930 + }, + { + "epoch": 0.522621287736235, + "grad_norm": 0.06471876800060272, + "learning_rate": 9.228074217407707e-06, + "loss": 0.0025, + "step": 31940 + }, + { + "epoch": 0.5227849136873108, + "grad_norm": 0.07845073193311691, + "learning_rate": 9.22731183812501e-06, + "loss": 0.0033, + "step": 31950 + }, + { + "epoch": 0.5229485396383866, + "grad_norm": 0.3236940801143646, + "learning_rate": 9.226549114077143e-06, + "loss": 0.0034, + "step": 31960 + }, + { + "epoch": 0.5231121655894625, + "grad_norm": 0.02920597791671753, + "learning_rate": 9.225786045326311e-06, + "loss": 0.0019, + "step": 31970 + }, + { + "epoch": 0.5232757915405384, + "grad_norm": 0.1861599236726761, + "learning_rate": 9.225022631934745e-06, + "loss": 0.0028, + "step": 31980 + }, + { + "epoch": 0.5234394174916142, + "grad_norm": 0.07145814597606659, + "learning_rate": 9.224258873964708e-06, + "loss": 0.002, + "step": 31990 + }, + { + "epoch": 0.52360304344269, + "grad_norm": 0.042243584990501404, + "learning_rate": 9.223494771478492e-06, + "loss": 0.0017, + "step": 32000 + }, + { + "epoch": 0.5237666693937658, + "grad_norm": 0.05444829538464546, + "learning_rate": 9.22273032453841e-06, + "loss": 0.0032, + "step": 32010 + }, + { + "epoch": 0.5239302953448417, + "grad_norm": 0.11444508284330368, + "learning_rate": 9.221965533206808e-06, + "loss": 0.0032, + "step": 32020 + }, + { + "epoch": 0.5240939212959176, + "grad_norm": 0.10446197539567947, + "learning_rate": 9.221200397546065e-06, + "loss": 0.0048, + "step": 32030 + }, + { + "epoch": 0.5242575472469934, + "grad_norm": 0.15641173720359802, + "learning_rate": 9.220434917618576e-06, + "loss": 0.0037, + "step": 32040 + }, + { + "epoch": 0.5244211731980692, + "grad_norm": 0.06004998832941055, + "learning_rate": 9.219669093486777e-06, + "loss": 0.0025, + "step": 32050 + }, + { + "epoch": 0.524584799149145, + "grad_norm": 0.13192476332187653, + "learning_rate": 9.218902925213122e-06, + "loss": 0.0027, + "step": 32060 + }, + { + "epoch": 0.5247484251002209, + "grad_norm": 0.1496613323688507, + "learning_rate": 9.218136412860099e-06, + "loss": 0.0047, + "step": 32070 + }, + { + "epoch": 0.5249120510512968, + "grad_norm": 0.07085603475570679, + "learning_rate": 9.21736955649022e-06, + "loss": 0.0024, + "step": 32080 + }, + { + "epoch": 0.5250756770023726, + "grad_norm": 0.180672749876976, + "learning_rate": 9.21660235616603e-06, + "loss": 0.0014, + "step": 32090 + }, + { + "epoch": 0.5252393029534485, + "grad_norm": 0.05476313456892967, + "learning_rate": 9.215834811950097e-06, + "loss": 0.0029, + "step": 32100 + }, + { + "epoch": 0.5254029289045242, + "grad_norm": 0.09045332670211792, + "learning_rate": 9.215066923905022e-06, + "loss": 0.0018, + "step": 32110 + }, + { + "epoch": 0.5255665548556001, + "grad_norm": 0.0838485136628151, + "learning_rate": 9.214298692093429e-06, + "loss": 0.0029, + "step": 32120 + }, + { + "epoch": 0.5257301808066759, + "grad_norm": 0.12681303918361664, + "learning_rate": 9.21353011657797e-06, + "loss": 0.0046, + "step": 32130 + }, + { + "epoch": 0.5258938067577518, + "grad_norm": 0.09967485070228577, + "learning_rate": 9.212761197421334e-06, + "loss": 0.0034, + "step": 32140 + }, + { + "epoch": 0.5260574327088277, + "grad_norm": 0.02472381852567196, + "learning_rate": 9.211991934686227e-06, + "loss": 0.0014, + "step": 32150 + }, + { + "epoch": 0.5262210586599034, + "grad_norm": 0.1094222217798233, + "learning_rate": 9.211222328435388e-06, + "loss": 0.0026, + "step": 32160 + }, + { + "epoch": 0.5263846846109793, + "grad_norm": 0.22920668125152588, + "learning_rate": 9.210452378731583e-06, + "loss": 0.0034, + "step": 32170 + }, + { + "epoch": 0.5265483105620551, + "grad_norm": 0.06849930435419083, + "learning_rate": 9.209682085637608e-06, + "loss": 0.0043, + "step": 32180 + }, + { + "epoch": 0.526711936513131, + "grad_norm": 0.09366314113140106, + "learning_rate": 9.208911449216284e-06, + "loss": 0.0029, + "step": 32190 + }, + { + "epoch": 0.5268755624642069, + "grad_norm": 0.1349136084318161, + "learning_rate": 9.208140469530463e-06, + "loss": 0.0041, + "step": 32200 + }, + { + "epoch": 0.5270391884152826, + "grad_norm": 0.2200312614440918, + "learning_rate": 9.207369146643022e-06, + "loss": 0.002, + "step": 32210 + }, + { + "epoch": 0.5272028143663585, + "grad_norm": 0.15642328560352325, + "learning_rate": 9.20659748061687e-06, + "loss": 0.0028, + "step": 32220 + }, + { + "epoch": 0.5273664403174343, + "grad_norm": 0.13700298964977264, + "learning_rate": 9.205825471514937e-06, + "loss": 0.0029, + "step": 32230 + }, + { + "epoch": 0.5275300662685102, + "grad_norm": 0.08765758574008942, + "learning_rate": 9.20505311940019e-06, + "loss": 0.0017, + "step": 32240 + }, + { + "epoch": 0.5276936922195861, + "grad_norm": 0.17214705049991608, + "learning_rate": 9.204280424335615e-06, + "loss": 0.002, + "step": 32250 + }, + { + "epoch": 0.5278573181706618, + "grad_norm": 0.1650456041097641, + "learning_rate": 9.203507386384233e-06, + "loss": 0.0023, + "step": 32260 + }, + { + "epoch": 0.5280209441217377, + "grad_norm": 0.08436457067728043, + "learning_rate": 9.202734005609092e-06, + "loss": 0.0033, + "step": 32270 + }, + { + "epoch": 0.5281845700728135, + "grad_norm": 0.09092748165130615, + "learning_rate": 9.201960282073264e-06, + "loss": 0.0036, + "step": 32280 + }, + { + "epoch": 0.5283481960238894, + "grad_norm": 0.08706819266080856, + "learning_rate": 9.20118621583985e-06, + "loss": 0.0027, + "step": 32290 + }, + { + "epoch": 0.5285118219749653, + "grad_norm": 0.24460530281066895, + "learning_rate": 9.200411806971985e-06, + "loss": 0.0042, + "step": 32300 + }, + { + "epoch": 0.528675447926041, + "grad_norm": 0.2795489728450775, + "learning_rate": 9.199637055532822e-06, + "loss": 0.0037, + "step": 32310 + }, + { + "epoch": 0.5288390738771169, + "grad_norm": 0.06631495803594589, + "learning_rate": 9.198861961585548e-06, + "loss": 0.0027, + "step": 32320 + }, + { + "epoch": 0.5290026998281927, + "grad_norm": 0.16572906076908112, + "learning_rate": 9.198086525193381e-06, + "loss": 0.0039, + "step": 32330 + }, + { + "epoch": 0.5291663257792686, + "grad_norm": 0.049690622836351395, + "learning_rate": 9.197310746419558e-06, + "loss": 0.0049, + "step": 32340 + }, + { + "epoch": 0.5293299517303445, + "grad_norm": 0.05921037122607231, + "learning_rate": 9.19653462532735e-06, + "loss": 0.0031, + "step": 32350 + }, + { + "epoch": 0.5294935776814202, + "grad_norm": 0.07682120054960251, + "learning_rate": 9.19575816198006e-06, + "loss": 0.0017, + "step": 32360 + }, + { + "epoch": 0.5296572036324961, + "grad_norm": 0.3854997158050537, + "learning_rate": 9.194981356441006e-06, + "loss": 0.0029, + "step": 32370 + }, + { + "epoch": 0.5298208295835719, + "grad_norm": 0.06295394897460938, + "learning_rate": 9.194204208773547e-06, + "loss": 0.003, + "step": 32380 + }, + { + "epoch": 0.5299844555346478, + "grad_norm": 0.06632732599973679, + "learning_rate": 9.193426719041062e-06, + "loss": 0.0033, + "step": 32390 + }, + { + "epoch": 0.5301480814857237, + "grad_norm": 0.11335868388414383, + "learning_rate": 9.19264888730696e-06, + "loss": 0.0031, + "step": 32400 + }, + { + "epoch": 0.5303117074367995, + "grad_norm": 0.1577157974243164, + "learning_rate": 9.191870713634681e-06, + "loss": 0.0032, + "step": 32410 + }, + { + "epoch": 0.5304753333878753, + "grad_norm": 0.061971068382263184, + "learning_rate": 9.191092198087688e-06, + "loss": 0.0033, + "step": 32420 + }, + { + "epoch": 0.5306389593389511, + "grad_norm": 0.07954541593790054, + "learning_rate": 9.190313340729474e-06, + "loss": 0.0021, + "step": 32430 + }, + { + "epoch": 0.530802585290027, + "grad_norm": 0.04109479486942291, + "learning_rate": 9.189534141623562e-06, + "loss": 0.0034, + "step": 32440 + }, + { + "epoch": 0.5309662112411029, + "grad_norm": 0.24136632680892944, + "learning_rate": 9.188754600833499e-06, + "loss": 0.0045, + "step": 32450 + }, + { + "epoch": 0.5311298371921787, + "grad_norm": 0.04293668270111084, + "learning_rate": 9.18797471842286e-06, + "loss": 0.0039, + "step": 32460 + }, + { + "epoch": 0.5312934631432545, + "grad_norm": 0.2215060442686081, + "learning_rate": 9.187194494455254e-06, + "loss": 0.0045, + "step": 32470 + }, + { + "epoch": 0.5314570890943303, + "grad_norm": 0.09272361546754837, + "learning_rate": 9.18641392899431e-06, + "loss": 0.0042, + "step": 32480 + }, + { + "epoch": 0.5316207150454062, + "grad_norm": 0.048450734466314316, + "learning_rate": 9.18563302210369e-06, + "loss": 0.0025, + "step": 32490 + }, + { + "epoch": 0.5317843409964821, + "grad_norm": 0.2152061015367508, + "learning_rate": 9.184851773847081e-06, + "loss": 0.0038, + "step": 32500 + }, + { + "epoch": 0.5319479669475579, + "grad_norm": 0.08277581632137299, + "learning_rate": 9.1840701842882e-06, + "loss": 0.0028, + "step": 32510 + }, + { + "epoch": 0.5321115928986337, + "grad_norm": 0.053438831120729446, + "learning_rate": 9.183288253490789e-06, + "loss": 0.0038, + "step": 32520 + }, + { + "epoch": 0.5322752188497095, + "grad_norm": 0.21432149410247803, + "learning_rate": 9.182505981518622e-06, + "loss": 0.0038, + "step": 32530 + }, + { + "epoch": 0.5324388448007854, + "grad_norm": 0.17829272150993347, + "learning_rate": 9.181723368435498e-06, + "loss": 0.0033, + "step": 32540 + }, + { + "epoch": 0.5326024707518613, + "grad_norm": 0.06643196940422058, + "learning_rate": 9.180940414305241e-06, + "loss": 0.0033, + "step": 32550 + }, + { + "epoch": 0.5327660967029371, + "grad_norm": 0.07685931771993637, + "learning_rate": 9.180157119191711e-06, + "loss": 0.0029, + "step": 32560 + }, + { + "epoch": 0.532929722654013, + "grad_norm": 0.08952626585960388, + "learning_rate": 9.179373483158788e-06, + "loss": 0.002, + "step": 32570 + }, + { + "epoch": 0.5330933486050887, + "grad_norm": 0.09441076964139938, + "learning_rate": 9.178589506270382e-06, + "loss": 0.0027, + "step": 32580 + }, + { + "epoch": 0.5332569745561646, + "grad_norm": 0.0432206429541111, + "learning_rate": 9.177805188590435e-06, + "loss": 0.0023, + "step": 32590 + }, + { + "epoch": 0.5334206005072405, + "grad_norm": 0.1083078682422638, + "learning_rate": 9.177020530182908e-06, + "loss": 0.0021, + "step": 32600 + }, + { + "epoch": 0.5335842264583163, + "grad_norm": 0.10792893171310425, + "learning_rate": 9.176235531111799e-06, + "loss": 0.0026, + "step": 32610 + }, + { + "epoch": 0.5337478524093922, + "grad_norm": 0.1392124891281128, + "learning_rate": 9.17545019144113e-06, + "loss": 0.003, + "step": 32620 + }, + { + "epoch": 0.5339114783604679, + "grad_norm": 0.08289247751235962, + "learning_rate": 9.174664511234947e-06, + "loss": 0.0029, + "step": 32630 + }, + { + "epoch": 0.5340751043115438, + "grad_norm": 0.01622731238603592, + "learning_rate": 9.173878490557332e-06, + "loss": 0.002, + "step": 32640 + }, + { + "epoch": 0.5342387302626197, + "grad_norm": 0.04721998795866966, + "learning_rate": 9.173092129472388e-06, + "loss": 0.002, + "step": 32650 + }, + { + "epoch": 0.5344023562136955, + "grad_norm": 0.06475071609020233, + "learning_rate": 9.172305428044248e-06, + "loss": 0.0019, + "step": 32660 + }, + { + "epoch": 0.5345659821647714, + "grad_norm": 0.09046278893947601, + "learning_rate": 9.171518386337073e-06, + "loss": 0.0021, + "step": 32670 + }, + { + "epoch": 0.5347296081158471, + "grad_norm": 0.1746835708618164, + "learning_rate": 9.17073100441505e-06, + "loss": 0.0023, + "step": 32680 + }, + { + "epoch": 0.534893234066923, + "grad_norm": 0.13488642871379852, + "learning_rate": 9.169943282342398e-06, + "loss": 0.0022, + "step": 32690 + }, + { + "epoch": 0.5350568600179989, + "grad_norm": 0.1203932836651802, + "learning_rate": 9.169155220183357e-06, + "loss": 0.0023, + "step": 32700 + }, + { + "epoch": 0.5352204859690747, + "grad_norm": 0.05865379050374031, + "learning_rate": 9.168366818002203e-06, + "loss": 0.0019, + "step": 32710 + }, + { + "epoch": 0.5353841119201506, + "grad_norm": 0.05368148535490036, + "learning_rate": 9.167578075863232e-06, + "loss": 0.0022, + "step": 32720 + }, + { + "epoch": 0.5355477378712263, + "grad_norm": 0.1944589614868164, + "learning_rate": 9.166788993830773e-06, + "loss": 0.0035, + "step": 32730 + }, + { + "epoch": 0.5357113638223022, + "grad_norm": 0.25233814120292664, + "learning_rate": 9.16599957196918e-06, + "loss": 0.0026, + "step": 32740 + }, + { + "epoch": 0.5358749897733781, + "grad_norm": 0.04803385213017464, + "learning_rate": 9.165209810342835e-06, + "loss": 0.0026, + "step": 32750 + }, + { + "epoch": 0.5360386157244539, + "grad_norm": 0.11446692049503326, + "learning_rate": 9.16441970901615e-06, + "loss": 0.0042, + "step": 32760 + }, + { + "epoch": 0.5362022416755298, + "grad_norm": 0.1486598551273346, + "learning_rate": 9.163629268053564e-06, + "loss": 0.0035, + "step": 32770 + }, + { + "epoch": 0.5363658676266055, + "grad_norm": 0.13443703949451447, + "learning_rate": 9.162838487519539e-06, + "loss": 0.0073, + "step": 32780 + }, + { + "epoch": 0.5365294935776814, + "grad_norm": 0.11499325931072235, + "learning_rate": 9.16204736747857e-06, + "loss": 0.002, + "step": 32790 + }, + { + "epoch": 0.5366931195287573, + "grad_norm": 0.07831773161888123, + "learning_rate": 9.161255907995177e-06, + "loss": 0.0028, + "step": 32800 + }, + { + "epoch": 0.5368567454798331, + "grad_norm": 0.09263172000646591, + "learning_rate": 9.160464109133913e-06, + "loss": 0.0051, + "step": 32810 + }, + { + "epoch": 0.537020371430909, + "grad_norm": 0.041871313005685806, + "learning_rate": 9.15967197095935e-06, + "loss": 0.0019, + "step": 32820 + }, + { + "epoch": 0.5371839973819847, + "grad_norm": 0.20193035900592804, + "learning_rate": 9.158879493536092e-06, + "loss": 0.0019, + "step": 32830 + }, + { + "epoch": 0.5373476233330606, + "grad_norm": 0.04876742511987686, + "learning_rate": 9.158086676928773e-06, + "loss": 0.003, + "step": 32840 + }, + { + "epoch": 0.5375112492841365, + "grad_norm": 0.04884132370352745, + "learning_rate": 9.157293521202053e-06, + "loss": 0.0025, + "step": 32850 + }, + { + "epoch": 0.5376748752352123, + "grad_norm": 0.10001318901777267, + "learning_rate": 9.156500026420616e-06, + "loss": 0.0022, + "step": 32860 + }, + { + "epoch": 0.5378385011862882, + "grad_norm": 0.12856820225715637, + "learning_rate": 9.155706192649181e-06, + "loss": 0.0034, + "step": 32870 + }, + { + "epoch": 0.538002127137364, + "grad_norm": 0.05957594886422157, + "learning_rate": 9.154912019952485e-06, + "loss": 0.0032, + "step": 32880 + }, + { + "epoch": 0.5381657530884398, + "grad_norm": 0.043399207293987274, + "learning_rate": 9.154117508395303e-06, + "loss": 0.0022, + "step": 32890 + }, + { + "epoch": 0.5383293790395157, + "grad_norm": 0.07214365154504776, + "learning_rate": 9.153322658042429e-06, + "loss": 0.0037, + "step": 32900 + }, + { + "epoch": 0.5384930049905915, + "grad_norm": 0.08859546482563019, + "learning_rate": 9.152527468958692e-06, + "loss": 0.0014, + "step": 32910 + }, + { + "epoch": 0.5386566309416674, + "grad_norm": 0.17552126944065094, + "learning_rate": 9.15173194120894e-06, + "loss": 0.0025, + "step": 32920 + }, + { + "epoch": 0.5388202568927432, + "grad_norm": 0.07439539581537247, + "learning_rate": 9.150936074858057e-06, + "loss": 0.0043, + "step": 32930 + }, + { + "epoch": 0.538983882843819, + "grad_norm": 0.07390560209751129, + "learning_rate": 9.150139869970951e-06, + "loss": 0.0036, + "step": 32940 + }, + { + "epoch": 0.5391475087948949, + "grad_norm": 0.10978835821151733, + "learning_rate": 9.149343326612557e-06, + "loss": 0.002, + "step": 32950 + }, + { + "epoch": 0.5393111347459707, + "grad_norm": 0.02490217424929142, + "learning_rate": 9.14854644484784e-06, + "loss": 0.0018, + "step": 32960 + }, + { + "epoch": 0.5394747606970466, + "grad_norm": 0.13357634842395782, + "learning_rate": 9.147749224741788e-06, + "loss": 0.0033, + "step": 32970 + }, + { + "epoch": 0.5396383866481224, + "grad_norm": 0.11878160387277603, + "learning_rate": 9.14695166635942e-06, + "loss": 0.0022, + "step": 32980 + }, + { + "epoch": 0.5398020125991982, + "grad_norm": 0.14954353868961334, + "learning_rate": 9.146153769765786e-06, + "loss": 0.0023, + "step": 32990 + }, + { + "epoch": 0.539965638550274, + "grad_norm": 0.1649288684129715, + "learning_rate": 9.145355535025955e-06, + "loss": 0.0029, + "step": 33000 + }, + { + "epoch": 0.5401292645013499, + "grad_norm": 0.4143097698688507, + "learning_rate": 9.144556962205032e-06, + "loss": 0.0041, + "step": 33010 + }, + { + "epoch": 0.5402928904524258, + "grad_norm": 0.06574437767267227, + "learning_rate": 9.14375805136814e-06, + "loss": 0.0022, + "step": 33020 + }, + { + "epoch": 0.5404565164035016, + "grad_norm": 0.05873316526412964, + "learning_rate": 9.142958802580445e-06, + "loss": 0.0037, + "step": 33030 + }, + { + "epoch": 0.5406201423545774, + "grad_norm": 0.08040419965982437, + "learning_rate": 9.142159215907123e-06, + "loss": 0.0028, + "step": 33040 + }, + { + "epoch": 0.5407837683056532, + "grad_norm": 0.08465006947517395, + "learning_rate": 9.14135929141339e-06, + "loss": 0.0016, + "step": 33050 + }, + { + "epoch": 0.5409473942567291, + "grad_norm": 0.11002630740404129, + "learning_rate": 9.140559029164479e-06, + "loss": 0.0038, + "step": 33060 + }, + { + "epoch": 0.541111020207805, + "grad_norm": 0.040398065000772476, + "learning_rate": 9.139758429225664e-06, + "loss": 0.0023, + "step": 33070 + }, + { + "epoch": 0.5412746461588808, + "grad_norm": 0.11514900624752045, + "learning_rate": 9.138957491662237e-06, + "loss": 0.0034, + "step": 33080 + }, + { + "epoch": 0.5414382721099567, + "grad_norm": 0.06339000165462494, + "learning_rate": 9.138156216539517e-06, + "loss": 0.0022, + "step": 33090 + }, + { + "epoch": 0.5416018980610324, + "grad_norm": 0.0937352105975151, + "learning_rate": 9.137354603922857e-06, + "loss": 0.0031, + "step": 33100 + }, + { + "epoch": 0.5417655240121083, + "grad_norm": 0.09305834025144577, + "learning_rate": 9.136552653877631e-06, + "loss": 0.0027, + "step": 33110 + }, + { + "epoch": 0.5419291499631842, + "grad_norm": 0.06475158780813217, + "learning_rate": 9.135750366469245e-06, + "loss": 0.0034, + "step": 33120 + }, + { + "epoch": 0.54209277591426, + "grad_norm": 0.05281263589859009, + "learning_rate": 9.13494774176313e-06, + "loss": 0.0033, + "step": 33130 + }, + { + "epoch": 0.5422564018653359, + "grad_norm": 0.09146049618721008, + "learning_rate": 9.134144779824744e-06, + "loss": 0.0034, + "step": 33140 + }, + { + "epoch": 0.5424200278164116, + "grad_norm": 0.22583580017089844, + "learning_rate": 9.133341480719575e-06, + "loss": 0.0029, + "step": 33150 + }, + { + "epoch": 0.5425836537674875, + "grad_norm": 0.13174423575401306, + "learning_rate": 9.13253784451314e-06, + "loss": 0.0047, + "step": 33160 + }, + { + "epoch": 0.5427472797185634, + "grad_norm": 0.11868665367364883, + "learning_rate": 9.131733871270978e-06, + "loss": 0.0029, + "step": 33170 + }, + { + "epoch": 0.5429109056696392, + "grad_norm": 0.1526699662208557, + "learning_rate": 9.130929561058658e-06, + "loss": 0.0024, + "step": 33180 + }, + { + "epoch": 0.5430745316207151, + "grad_norm": 0.06724399328231812, + "learning_rate": 9.130124913941779e-06, + "loss": 0.0029, + "step": 33190 + }, + { + "epoch": 0.5432381575717908, + "grad_norm": 0.06776267290115356, + "learning_rate": 9.129319929985963e-06, + "loss": 0.0025, + "step": 33200 + }, + { + "epoch": 0.5434017835228667, + "grad_norm": 0.04374212399125099, + "learning_rate": 9.128514609256863e-06, + "loss": 0.003, + "step": 33210 + }, + { + "epoch": 0.5435654094739426, + "grad_norm": 0.13316085934638977, + "learning_rate": 9.127708951820156e-06, + "loss": 0.0033, + "step": 33220 + }, + { + "epoch": 0.5437290354250184, + "grad_norm": 0.061265528202056885, + "learning_rate": 9.126902957741552e-06, + "loss": 0.0016, + "step": 33230 + }, + { + "epoch": 0.5438926613760943, + "grad_norm": 0.0881817489862442, + "learning_rate": 9.126096627086785e-06, + "loss": 0.0084, + "step": 33240 + }, + { + "epoch": 0.54405628732717, + "grad_norm": 0.2160559743642807, + "learning_rate": 9.125289959921613e-06, + "loss": 0.0028, + "step": 33250 + }, + { + "epoch": 0.5442199132782459, + "grad_norm": 0.07653578370809555, + "learning_rate": 9.124482956311828e-06, + "loss": 0.0033, + "step": 33260 + }, + { + "epoch": 0.5443835392293218, + "grad_norm": 0.0900450050830841, + "learning_rate": 9.123675616323246e-06, + "loss": 0.0025, + "step": 33270 + }, + { + "epoch": 0.5445471651803976, + "grad_norm": 0.1406930685043335, + "learning_rate": 9.122867940021712e-06, + "loss": 0.0022, + "step": 33280 + }, + { + "epoch": 0.5447107911314735, + "grad_norm": 0.11342816799879074, + "learning_rate": 9.122059927473094e-06, + "loss": 0.0031, + "step": 33290 + }, + { + "epoch": 0.5448744170825492, + "grad_norm": 0.16277405619621277, + "learning_rate": 9.121251578743294e-06, + "loss": 0.0029, + "step": 33300 + }, + { + "epoch": 0.5450380430336251, + "grad_norm": 0.016029680147767067, + "learning_rate": 9.120442893898237e-06, + "loss": 0.0044, + "step": 33310 + }, + { + "epoch": 0.545201668984701, + "grad_norm": 0.13244576752185822, + "learning_rate": 9.119633873003875e-06, + "loss": 0.0017, + "step": 33320 + }, + { + "epoch": 0.5453652949357768, + "grad_norm": 0.2600537836551666, + "learning_rate": 9.118824516126193e-06, + "loss": 0.003, + "step": 33330 + }, + { + "epoch": 0.5455289208868527, + "grad_norm": 0.017338261008262634, + "learning_rate": 9.118014823331193e-06, + "loss": 0.0054, + "step": 33340 + }, + { + "epoch": 0.5456925468379284, + "grad_norm": 0.11253099143505096, + "learning_rate": 9.11720479468492e-06, + "loss": 0.0025, + "step": 33350 + }, + { + "epoch": 0.5458561727890043, + "grad_norm": 0.05320839583873749, + "learning_rate": 9.116394430253428e-06, + "loss": 0.0015, + "step": 33360 + }, + { + "epoch": 0.5460197987400802, + "grad_norm": 0.09717463701963425, + "learning_rate": 9.115583730102813e-06, + "loss": 0.0034, + "step": 33370 + }, + { + "epoch": 0.546183424691156, + "grad_norm": 0.011118598282337189, + "learning_rate": 9.11477269429919e-06, + "loss": 0.0027, + "step": 33380 + }, + { + "epoch": 0.5463470506422319, + "grad_norm": 0.16390582919120789, + "learning_rate": 9.113961322908707e-06, + "loss": 0.0032, + "step": 33390 + }, + { + "epoch": 0.5465106765933077, + "grad_norm": 0.07863014191389084, + "learning_rate": 9.113149615997535e-06, + "loss": 0.0037, + "step": 33400 + }, + { + "epoch": 0.5466743025443835, + "grad_norm": 0.04523053020238876, + "learning_rate": 9.112337573631875e-06, + "loss": 0.0047, + "step": 33410 + }, + { + "epoch": 0.5468379284954594, + "grad_norm": 0.06712795794010162, + "learning_rate": 9.111525195877952e-06, + "loss": 0.0023, + "step": 33420 + }, + { + "epoch": 0.5470015544465352, + "grad_norm": 0.06714444607496262, + "learning_rate": 9.110712482802026e-06, + "loss": 0.0027, + "step": 33430 + }, + { + "epoch": 0.5471651803976111, + "grad_norm": 0.1865249127149582, + "learning_rate": 9.109899434470373e-06, + "loss": 0.003, + "step": 33440 + }, + { + "epoch": 0.5473288063486869, + "grad_norm": 0.17515318095684052, + "learning_rate": 9.109086050949307e-06, + "loss": 0.0021, + "step": 33450 + }, + { + "epoch": 0.5474924322997627, + "grad_norm": 0.17247720062732697, + "learning_rate": 9.108272332305161e-06, + "loss": 0.0021, + "step": 33460 + }, + { + "epoch": 0.5476560582508386, + "grad_norm": 0.07520222663879395, + "learning_rate": 9.107458278604302e-06, + "loss": 0.0025, + "step": 33470 + }, + { + "epoch": 0.5478196842019144, + "grad_norm": 0.4124133288860321, + "learning_rate": 9.106643889913122e-06, + "loss": 0.0027, + "step": 33480 + }, + { + "epoch": 0.5479833101529903, + "grad_norm": 0.12948864698410034, + "learning_rate": 9.105829166298037e-06, + "loss": 0.0029, + "step": 33490 + }, + { + "epoch": 0.5481469361040661, + "grad_norm": 0.1656457632780075, + "learning_rate": 9.105014107825493e-06, + "loss": 0.0028, + "step": 33500 + }, + { + "epoch": 0.548310562055142, + "grad_norm": 0.2449573129415512, + "learning_rate": 9.104198714561968e-06, + "loss": 0.0025, + "step": 33510 + }, + { + "epoch": 0.5484741880062178, + "grad_norm": 0.06181897595524788, + "learning_rate": 9.103382986573957e-06, + "loss": 0.002, + "step": 33520 + }, + { + "epoch": 0.5486378139572936, + "grad_norm": 0.05906658619642258, + "learning_rate": 9.102566923927991e-06, + "loss": 0.006, + "step": 33530 + }, + { + "epoch": 0.5488014399083695, + "grad_norm": 0.07057667523622513, + "learning_rate": 9.101750526690626e-06, + "loss": 0.0017, + "step": 33540 + }, + { + "epoch": 0.5489650658594453, + "grad_norm": 0.09633629024028778, + "learning_rate": 9.100933794928442e-06, + "loss": 0.0035, + "step": 33550 + }, + { + "epoch": 0.5491286918105212, + "grad_norm": 0.19643664360046387, + "learning_rate": 9.100116728708052e-06, + "loss": 0.0027, + "step": 33560 + }, + { + "epoch": 0.549292317761597, + "grad_norm": 0.14608485996723175, + "learning_rate": 9.09929932809609e-06, + "loss": 0.0028, + "step": 33570 + }, + { + "epoch": 0.5494559437126728, + "grad_norm": 0.09090889990329742, + "learning_rate": 9.098481593159223e-06, + "loss": 0.0036, + "step": 33580 + }, + { + "epoch": 0.5496195696637487, + "grad_norm": 0.10197705030441284, + "learning_rate": 9.09766352396414e-06, + "loss": 0.0027, + "step": 33590 + }, + { + "epoch": 0.5497831956148245, + "grad_norm": 0.06614929437637329, + "learning_rate": 9.096845120577561e-06, + "loss": 0.0023, + "step": 33600 + }, + { + "epoch": 0.5499468215659004, + "grad_norm": 0.046114519238471985, + "learning_rate": 9.096026383066234e-06, + "loss": 0.0027, + "step": 33610 + }, + { + "epoch": 0.5501104475169762, + "grad_norm": 0.022980744019150734, + "learning_rate": 9.095207311496932e-06, + "loss": 0.0018, + "step": 33620 + }, + { + "epoch": 0.550274073468052, + "grad_norm": 0.05219675227999687, + "learning_rate": 9.094387905936452e-06, + "loss": 0.0034, + "step": 33630 + }, + { + "epoch": 0.5504376994191279, + "grad_norm": 0.03392721712589264, + "learning_rate": 9.093568166451627e-06, + "loss": 0.0021, + "step": 33640 + }, + { + "epoch": 0.5506013253702037, + "grad_norm": 0.10393422096967697, + "learning_rate": 9.092748093109309e-06, + "loss": 0.003, + "step": 33650 + }, + { + "epoch": 0.5507649513212796, + "grad_norm": 0.042780663818120956, + "learning_rate": 9.091927685976382e-06, + "loss": 0.0026, + "step": 33660 + }, + { + "epoch": 0.5509285772723554, + "grad_norm": 0.09566706418991089, + "learning_rate": 9.091106945119754e-06, + "loss": 0.0017, + "step": 33670 + }, + { + "epoch": 0.5510922032234312, + "grad_norm": 0.1961510330438614, + "learning_rate": 9.090285870606363e-06, + "loss": 0.0027, + "step": 33680 + }, + { + "epoch": 0.5512558291745071, + "grad_norm": 0.06689534336328506, + "learning_rate": 9.089464462503172e-06, + "loss": 0.0025, + "step": 33690 + }, + { + "epoch": 0.5514194551255829, + "grad_norm": 0.02609667181968689, + "learning_rate": 9.088642720877174e-06, + "loss": 0.0034, + "step": 33700 + }, + { + "epoch": 0.5515830810766588, + "grad_norm": 0.03313341364264488, + "learning_rate": 9.087820645795386e-06, + "loss": 0.0017, + "step": 33710 + }, + { + "epoch": 0.5517467070277347, + "grad_norm": 0.19971023499965668, + "learning_rate": 9.086998237324855e-06, + "loss": 0.0022, + "step": 33720 + }, + { + "epoch": 0.5519103329788104, + "grad_norm": 0.07108059525489807, + "learning_rate": 9.086175495532653e-06, + "loss": 0.0028, + "step": 33730 + }, + { + "epoch": 0.5520739589298863, + "grad_norm": 0.23887395858764648, + "learning_rate": 9.085352420485878e-06, + "loss": 0.0044, + "step": 33740 + }, + { + "epoch": 0.5522375848809621, + "grad_norm": 0.024461984634399414, + "learning_rate": 9.084529012251661e-06, + "loss": 0.0015, + "step": 33750 + }, + { + "epoch": 0.552401210832038, + "grad_norm": 0.03654909506440163, + "learning_rate": 9.083705270897153e-06, + "loss": 0.003, + "step": 33760 + }, + { + "epoch": 0.5525648367831139, + "grad_norm": 0.05137854442000389, + "learning_rate": 9.082881196489538e-06, + "loss": 0.004, + "step": 33770 + }, + { + "epoch": 0.5527284627341896, + "grad_norm": 0.1305990070104599, + "learning_rate": 9.082056789096024e-06, + "loss": 0.0013, + "step": 33780 + }, + { + "epoch": 0.5528920886852655, + "grad_norm": 0.06542232632637024, + "learning_rate": 9.081232048783847e-06, + "loss": 0.0018, + "step": 33790 + }, + { + "epoch": 0.5530557146363413, + "grad_norm": 0.2352975606918335, + "learning_rate": 9.080406975620269e-06, + "loss": 0.0036, + "step": 33800 + }, + { + "epoch": 0.5532193405874172, + "grad_norm": 0.049139637500047684, + "learning_rate": 9.07958156967258e-06, + "loss": 0.0027, + "step": 33810 + }, + { + "epoch": 0.5533829665384931, + "grad_norm": 0.18974940478801727, + "learning_rate": 9.078755831008099e-06, + "loss": 0.0035, + "step": 33820 + }, + { + "epoch": 0.5535465924895688, + "grad_norm": 0.0452951043844223, + "learning_rate": 9.077929759694171e-06, + "loss": 0.002, + "step": 33830 + }, + { + "epoch": 0.5537102184406447, + "grad_norm": 0.07449331879615784, + "learning_rate": 9.077103355798163e-06, + "loss": 0.002, + "step": 33840 + }, + { + "epoch": 0.5538738443917205, + "grad_norm": 0.2256460338830948, + "learning_rate": 9.076276619387478e-06, + "loss": 0.0053, + "step": 33850 + }, + { + "epoch": 0.5540374703427964, + "grad_norm": 0.029968148097395897, + "learning_rate": 9.075449550529542e-06, + "loss": 0.0025, + "step": 33860 + }, + { + "epoch": 0.5542010962938722, + "grad_norm": 0.06543967872858047, + "learning_rate": 9.074622149291805e-06, + "loss": 0.0026, + "step": 33870 + }, + { + "epoch": 0.554364722244948, + "grad_norm": 0.12374580651521683, + "learning_rate": 9.07379441574175e-06, + "loss": 0.0044, + "step": 33880 + }, + { + "epoch": 0.5545283481960239, + "grad_norm": 0.044838760048151016, + "learning_rate": 9.072966349946881e-06, + "loss": 0.0023, + "step": 33890 + }, + { + "epoch": 0.5546919741470997, + "grad_norm": 0.14062732458114624, + "learning_rate": 9.072137951974736e-06, + "loss": 0.002, + "step": 33900 + }, + { + "epoch": 0.5548556000981756, + "grad_norm": 0.126304030418396, + "learning_rate": 9.071309221892873e-06, + "loss": 0.003, + "step": 33910 + }, + { + "epoch": 0.5550192260492514, + "grad_norm": 0.06975863873958588, + "learning_rate": 9.07048015976888e-06, + "loss": 0.002, + "step": 33920 + }, + { + "epoch": 0.5551828520003272, + "grad_norm": 0.3728843331336975, + "learning_rate": 9.069650765670376e-06, + "loss": 0.0032, + "step": 33930 + }, + { + "epoch": 0.5553464779514031, + "grad_norm": 0.06448273360729218, + "learning_rate": 9.068821039665e-06, + "loss": 0.0029, + "step": 33940 + }, + { + "epoch": 0.5555101039024789, + "grad_norm": 0.027964968234300613, + "learning_rate": 9.067990981820427e-06, + "loss": 0.0043, + "step": 33950 + }, + { + "epoch": 0.5556737298535548, + "grad_norm": 0.12232799082994461, + "learning_rate": 9.067160592204346e-06, + "loss": 0.0029, + "step": 33960 + }, + { + "epoch": 0.5558373558046306, + "grad_norm": 0.0456613190472126, + "learning_rate": 9.066329870884488e-06, + "loss": 0.0036, + "step": 33970 + }, + { + "epoch": 0.5560009817557064, + "grad_norm": 0.06744259595870972, + "learning_rate": 9.0654988179286e-06, + "loss": 0.0025, + "step": 33980 + }, + { + "epoch": 0.5561646077067823, + "grad_norm": 0.08431285619735718, + "learning_rate": 9.06466743340446e-06, + "loss": 0.0036, + "step": 33990 + }, + { + "epoch": 0.5563282336578581, + "grad_norm": 0.01698785275220871, + "learning_rate": 9.063835717379872e-06, + "loss": 0.003, + "step": 34000 + }, + { + "epoch": 0.556491859608934, + "grad_norm": 0.02922319620847702, + "learning_rate": 9.06300366992267e-06, + "loss": 0.0026, + "step": 34010 + }, + { + "epoch": 0.5566554855600098, + "grad_norm": 0.050951965153217316, + "learning_rate": 9.062171291100713e-06, + "loss": 0.0024, + "step": 34020 + }, + { + "epoch": 0.5568191115110857, + "grad_norm": 0.05026979371905327, + "learning_rate": 9.061338580981884e-06, + "loss": 0.0027, + "step": 34030 + }, + { + "epoch": 0.5569827374621615, + "grad_norm": 0.16412629187107086, + "learning_rate": 9.0605055396341e-06, + "loss": 0.0028, + "step": 34040 + }, + { + "epoch": 0.5571463634132373, + "grad_norm": 0.16496610641479492, + "learning_rate": 9.0596721671253e-06, + "loss": 0.0033, + "step": 34050 + }, + { + "epoch": 0.5573099893643132, + "grad_norm": 0.0889497920870781, + "learning_rate": 9.05883846352345e-06, + "loss": 0.0019, + "step": 34060 + }, + { + "epoch": 0.557473615315389, + "grad_norm": 0.14715588092803955, + "learning_rate": 9.058004428896544e-06, + "loss": 0.0024, + "step": 34070 + }, + { + "epoch": 0.5576372412664649, + "grad_norm": 0.07355787605047226, + "learning_rate": 9.0571700633126e-06, + "loss": 0.0035, + "step": 34080 + }, + { + "epoch": 0.5578008672175407, + "grad_norm": 0.127385675907135, + "learning_rate": 9.056335366839674e-06, + "loss": 0.0033, + "step": 34090 + }, + { + "epoch": 0.5579644931686165, + "grad_norm": 0.1759384125471115, + "learning_rate": 9.055500339545834e-06, + "loss": 0.003, + "step": 34100 + }, + { + "epoch": 0.5581281191196924, + "grad_norm": 0.06901757419109344, + "learning_rate": 9.054664981499185e-06, + "loss": 0.003, + "step": 34110 + }, + { + "epoch": 0.5582917450707682, + "grad_norm": 0.04467601329088211, + "learning_rate": 9.053829292767855e-06, + "loss": 0.0032, + "step": 34120 + }, + { + "epoch": 0.5584553710218441, + "grad_norm": 0.08457455039024353, + "learning_rate": 9.05299327342e-06, + "loss": 0.002, + "step": 34130 + }, + { + "epoch": 0.55861899697292, + "grad_norm": 0.18473650515079498, + "learning_rate": 9.052156923523803e-06, + "loss": 0.0039, + "step": 34140 + }, + { + "epoch": 0.5587826229239957, + "grad_norm": 0.05865251272916794, + "learning_rate": 9.051320243147474e-06, + "loss": 0.0025, + "step": 34150 + }, + { + "epoch": 0.5589462488750716, + "grad_norm": 0.07468479126691818, + "learning_rate": 9.050483232359251e-06, + "loss": 0.002, + "step": 34160 + }, + { + "epoch": 0.5591098748261474, + "grad_norm": 0.0697217732667923, + "learning_rate": 9.049645891227395e-06, + "loss": 0.002, + "step": 34170 + }, + { + "epoch": 0.5592735007772233, + "grad_norm": 0.1778220534324646, + "learning_rate": 9.0488082198202e-06, + "loss": 0.0034, + "step": 34180 + }, + { + "epoch": 0.5594371267282992, + "grad_norm": 0.018835963681340218, + "learning_rate": 9.04797021820598e-06, + "loss": 0.003, + "step": 34190 + }, + { + "epoch": 0.5596007526793749, + "grad_norm": 0.06329817324876785, + "learning_rate": 9.047131886453081e-06, + "loss": 0.0033, + "step": 34200 + }, + { + "epoch": 0.5597643786304508, + "grad_norm": 0.11150910705327988, + "learning_rate": 9.046293224629876e-06, + "loss": 0.003, + "step": 34210 + }, + { + "epoch": 0.5599280045815266, + "grad_norm": 0.06717385351657867, + "learning_rate": 9.045454232804764e-06, + "loss": 0.002, + "step": 34220 + }, + { + "epoch": 0.5600916305326025, + "grad_norm": 0.04742787778377533, + "learning_rate": 9.044614911046166e-06, + "loss": 0.0019, + "step": 34230 + }, + { + "epoch": 0.5602552564836784, + "grad_norm": 0.13319995999336243, + "learning_rate": 9.043775259422539e-06, + "loss": 0.0027, + "step": 34240 + }, + { + "epoch": 0.5604188824347541, + "grad_norm": 0.02570258267223835, + "learning_rate": 9.042935278002356e-06, + "loss": 0.0015, + "step": 34250 + }, + { + "epoch": 0.56058250838583, + "grad_norm": 0.17978332936763763, + "learning_rate": 9.042094966854131e-06, + "loss": 0.0028, + "step": 34260 + }, + { + "epoch": 0.5607461343369058, + "grad_norm": 0.06665828824043274, + "learning_rate": 9.041254326046392e-06, + "loss": 0.0029, + "step": 34270 + }, + { + "epoch": 0.5609097602879817, + "grad_norm": 0.1326732039451599, + "learning_rate": 9.040413355647697e-06, + "loss": 0.0023, + "step": 34280 + }, + { + "epoch": 0.5610733862390576, + "grad_norm": 0.05947272479534149, + "learning_rate": 9.039572055726637e-06, + "loss": 0.0037, + "step": 34290 + }, + { + "epoch": 0.5612370121901333, + "grad_norm": 0.20495085418224335, + "learning_rate": 9.038730426351826e-06, + "loss": 0.0038, + "step": 34300 + }, + { + "epoch": 0.5614006381412092, + "grad_norm": 0.4255751967430115, + "learning_rate": 9.0378884675919e-06, + "loss": 0.0024, + "step": 34310 + }, + { + "epoch": 0.561564264092285, + "grad_norm": 0.10219269245862961, + "learning_rate": 9.037046179515529e-06, + "loss": 0.004, + "step": 34320 + }, + { + "epoch": 0.5617278900433609, + "grad_norm": 0.06360520422458649, + "learning_rate": 9.036203562191408e-06, + "loss": 0.0029, + "step": 34330 + }, + { + "epoch": 0.5618915159944368, + "grad_norm": 0.1153009831905365, + "learning_rate": 9.035360615688255e-06, + "loss": 0.0039, + "step": 34340 + }, + { + "epoch": 0.5620551419455125, + "grad_norm": 0.11293741315603256, + "learning_rate": 9.034517340074822e-06, + "loss": 0.0042, + "step": 34350 + }, + { + "epoch": 0.5622187678965884, + "grad_norm": 0.15900881588459015, + "learning_rate": 9.03367373541988e-06, + "loss": 0.0026, + "step": 34360 + }, + { + "epoch": 0.5623823938476642, + "grad_norm": 0.03355743736028671, + "learning_rate": 9.032829801792232e-06, + "loss": 0.0017, + "step": 34370 + }, + { + "epoch": 0.5625460197987401, + "grad_norm": 0.14817260205745697, + "learning_rate": 9.031985539260705e-06, + "loss": 0.0041, + "step": 34380 + }, + { + "epoch": 0.562709645749816, + "grad_norm": 0.18024475872516632, + "learning_rate": 9.031140947894158e-06, + "loss": 0.0059, + "step": 34390 + }, + { + "epoch": 0.5628732717008917, + "grad_norm": 0.037742141634225845, + "learning_rate": 9.030296027761469e-06, + "loss": 0.0017, + "step": 34400 + }, + { + "epoch": 0.5630368976519676, + "grad_norm": 0.08214405179023743, + "learning_rate": 9.029450778931548e-06, + "loss": 0.0016, + "step": 34410 + }, + { + "epoch": 0.5632005236030434, + "grad_norm": 0.10488860309123993, + "learning_rate": 9.028605201473331e-06, + "loss": 0.0027, + "step": 34420 + }, + { + "epoch": 0.5633641495541193, + "grad_norm": 0.07780135422945023, + "learning_rate": 9.02775929545578e-06, + "loss": 0.0026, + "step": 34430 + }, + { + "epoch": 0.5635277755051952, + "grad_norm": 0.15384942293167114, + "learning_rate": 9.026913060947887e-06, + "loss": 0.0027, + "step": 34440 + }, + { + "epoch": 0.563691401456271, + "grad_norm": 0.06369028985500336, + "learning_rate": 9.026066498018661e-06, + "loss": 0.0023, + "step": 34450 + }, + { + "epoch": 0.5638550274073468, + "grad_norm": 0.2127196192741394, + "learning_rate": 9.025219606737152e-06, + "loss": 0.0045, + "step": 34460 + }, + { + "epoch": 0.5640186533584226, + "grad_norm": 0.08209564536809921, + "learning_rate": 9.024372387172426e-06, + "loss": 0.0033, + "step": 34470 + }, + { + "epoch": 0.5641822793094985, + "grad_norm": 0.04950880631804466, + "learning_rate": 9.023524839393582e-06, + "loss": 0.0029, + "step": 34480 + }, + { + "epoch": 0.5643459052605744, + "grad_norm": 0.09371622651815414, + "learning_rate": 9.02267696346974e-06, + "loss": 0.004, + "step": 34490 + }, + { + "epoch": 0.5645095312116502, + "grad_norm": 0.14973004162311554, + "learning_rate": 9.021828759470051e-06, + "loss": 0.0034, + "step": 34500 + }, + { + "epoch": 0.564673157162726, + "grad_norm": 0.13089285790920258, + "learning_rate": 9.020980227463692e-06, + "loss": 0.0018, + "step": 34510 + }, + { + "epoch": 0.5648367831138018, + "grad_norm": 0.029335087165236473, + "learning_rate": 9.020131367519866e-06, + "loss": 0.003, + "step": 34520 + }, + { + "epoch": 0.5650004090648777, + "grad_norm": 0.1268162578344345, + "learning_rate": 9.019282179707805e-06, + "loss": 0.0034, + "step": 34530 + }, + { + "epoch": 0.5651640350159536, + "grad_norm": 0.03817616030573845, + "learning_rate": 9.01843266409676e-06, + "loss": 0.002, + "step": 34540 + }, + { + "epoch": 0.5653276609670294, + "grad_norm": 0.022283917292952538, + "learning_rate": 9.017582820756024e-06, + "loss": 0.0023, + "step": 34550 + }, + { + "epoch": 0.5654912869181052, + "grad_norm": 0.12352630496025085, + "learning_rate": 9.016732649754898e-06, + "loss": 0.0039, + "step": 34560 + }, + { + "epoch": 0.565654912869181, + "grad_norm": 0.1744757443666458, + "learning_rate": 9.015882151162727e-06, + "loss": 0.0036, + "step": 34570 + }, + { + "epoch": 0.5658185388202569, + "grad_norm": 0.07029528170824051, + "learning_rate": 9.015031325048869e-06, + "loss": 0.0026, + "step": 34580 + }, + { + "epoch": 0.5659821647713328, + "grad_norm": 0.12010376155376434, + "learning_rate": 9.014180171482716e-06, + "loss": 0.0023, + "step": 34590 + }, + { + "epoch": 0.5661457907224086, + "grad_norm": 0.13346995413303375, + "learning_rate": 9.013328690533685e-06, + "loss": 0.0033, + "step": 34600 + }, + { + "epoch": 0.5663094166734844, + "grad_norm": 0.17702341079711914, + "learning_rate": 9.012476882271222e-06, + "loss": 0.0025, + "step": 34610 + }, + { + "epoch": 0.5664730426245602, + "grad_norm": 0.03770411014556885, + "learning_rate": 9.011624746764796e-06, + "loss": 0.0057, + "step": 34620 + }, + { + "epoch": 0.5666366685756361, + "grad_norm": 0.047059137374162674, + "learning_rate": 9.010772284083904e-06, + "loss": 0.0016, + "step": 34630 + }, + { + "epoch": 0.566800294526712, + "grad_norm": 0.28435850143432617, + "learning_rate": 9.009919494298069e-06, + "loss": 0.0034, + "step": 34640 + }, + { + "epoch": 0.5669639204777878, + "grad_norm": 0.0524299219250679, + "learning_rate": 9.009066377476845e-06, + "loss": 0.0022, + "step": 34650 + }, + { + "epoch": 0.5671275464288636, + "grad_norm": 0.022958291694521904, + "learning_rate": 9.008212933689806e-06, + "loss": 0.0027, + "step": 34660 + }, + { + "epoch": 0.5672911723799394, + "grad_norm": 0.050073932856321335, + "learning_rate": 9.007359163006558e-06, + "loss": 0.0038, + "step": 34670 + }, + { + "epoch": 0.5674547983310153, + "grad_norm": 0.0715407058596611, + "learning_rate": 9.00650506549673e-06, + "loss": 0.0029, + "step": 34680 + }, + { + "epoch": 0.5676184242820912, + "grad_norm": 0.27536875009536743, + "learning_rate": 9.005650641229981e-06, + "loss": 0.0026, + "step": 34690 + }, + { + "epoch": 0.567782050233167, + "grad_norm": 0.10513325780630112, + "learning_rate": 9.004795890275995e-06, + "loss": 0.0025, + "step": 34700 + }, + { + "epoch": 0.5679456761842429, + "grad_norm": 0.125313863158226, + "learning_rate": 9.00394081270448e-06, + "loss": 0.0024, + "step": 34710 + }, + { + "epoch": 0.5681093021353186, + "grad_norm": 0.09218619018793106, + "learning_rate": 9.003085408585176e-06, + "loss": 0.0022, + "step": 34720 + }, + { + "epoch": 0.5682729280863945, + "grad_norm": 0.13089098036289215, + "learning_rate": 9.002229677987845e-06, + "loss": 0.0018, + "step": 34730 + }, + { + "epoch": 0.5684365540374703, + "grad_norm": 0.020297177135944366, + "learning_rate": 9.001373620982279e-06, + "loss": 0.0037, + "step": 34740 + }, + { + "epoch": 0.5686001799885462, + "grad_norm": 0.10721893608570099, + "learning_rate": 9.000517237638293e-06, + "loss": 0.0021, + "step": 34750 + }, + { + "epoch": 0.5687638059396221, + "grad_norm": 0.12039229273796082, + "learning_rate": 8.999660528025733e-06, + "loss": 0.0026, + "step": 34760 + }, + { + "epoch": 0.5689274318906978, + "grad_norm": 0.055747658014297485, + "learning_rate": 8.998803492214468e-06, + "loss": 0.0042, + "step": 34770 + }, + { + "epoch": 0.5690910578417737, + "grad_norm": 0.05545589327812195, + "learning_rate": 8.997946130274396e-06, + "loss": 0.0043, + "step": 34780 + }, + { + "epoch": 0.5692546837928495, + "grad_norm": 0.1255505383014679, + "learning_rate": 8.997088442275439e-06, + "loss": 0.0044, + "step": 34790 + }, + { + "epoch": 0.5694183097439254, + "grad_norm": 0.07918395847082138, + "learning_rate": 8.996230428287548e-06, + "loss": 0.004, + "step": 34800 + }, + { + "epoch": 0.5695819356950013, + "grad_norm": 0.17006701231002808, + "learning_rate": 8.9953720883807e-06, + "loss": 0.003, + "step": 34810 + }, + { + "epoch": 0.569745561646077, + "grad_norm": 0.1715926080942154, + "learning_rate": 8.994513422624898e-06, + "loss": 0.0035, + "step": 34820 + }, + { + "epoch": 0.5699091875971529, + "grad_norm": 0.19922161102294922, + "learning_rate": 8.99365443109017e-06, + "loss": 0.0037, + "step": 34830 + }, + { + "epoch": 0.5700728135482287, + "grad_norm": 0.12470365315675735, + "learning_rate": 8.992795113846577e-06, + "loss": 0.0035, + "step": 34840 + }, + { + "epoch": 0.5702364394993046, + "grad_norm": 0.07749177515506744, + "learning_rate": 8.991935470964198e-06, + "loss": 0.0021, + "step": 34850 + }, + { + "epoch": 0.5704000654503805, + "grad_norm": 0.12300921231508255, + "learning_rate": 8.991075502513144e-06, + "loss": 0.0024, + "step": 34860 + }, + { + "epoch": 0.5705636914014562, + "grad_norm": 0.19126859307289124, + "learning_rate": 8.990215208563549e-06, + "loss": 0.005, + "step": 34870 + }, + { + "epoch": 0.5707273173525321, + "grad_norm": 0.23512768745422363, + "learning_rate": 8.98935458918558e-06, + "loss": 0.0039, + "step": 34880 + }, + { + "epoch": 0.5708909433036079, + "grad_norm": 0.19217929244041443, + "learning_rate": 8.988493644449424e-06, + "loss": 0.0032, + "step": 34890 + }, + { + "epoch": 0.5710545692546838, + "grad_norm": 0.14963099360466003, + "learning_rate": 8.987632374425293e-06, + "loss": 0.005, + "step": 34900 + }, + { + "epoch": 0.5712181952057597, + "grad_norm": 0.03155124559998512, + "learning_rate": 8.986770779183434e-06, + "loss": 0.0038, + "step": 34910 + }, + { + "epoch": 0.5713818211568354, + "grad_norm": 0.14544332027435303, + "learning_rate": 8.985908858794115e-06, + "loss": 0.002, + "step": 34920 + }, + { + "epoch": 0.5715454471079113, + "grad_norm": 0.1308814287185669, + "learning_rate": 8.985046613327631e-06, + "loss": 0.0032, + "step": 34930 + }, + { + "epoch": 0.5717090730589871, + "grad_norm": 0.2157951295375824, + "learning_rate": 8.984184042854303e-06, + "loss": 0.0046, + "step": 34940 + }, + { + "epoch": 0.571872699010063, + "grad_norm": 0.10986444354057312, + "learning_rate": 8.98332114744448e-06, + "loss": 0.0023, + "step": 34950 + }, + { + "epoch": 0.5720363249611389, + "grad_norm": 0.1494085192680359, + "learning_rate": 8.982457927168537e-06, + "loss": 0.0041, + "step": 34960 + }, + { + "epoch": 0.5721999509122146, + "grad_norm": 0.10345122218132019, + "learning_rate": 8.981594382096875e-06, + "loss": 0.0037, + "step": 34970 + }, + { + "epoch": 0.5723635768632905, + "grad_norm": 0.022296082228422165, + "learning_rate": 8.980730512299922e-06, + "loss": 0.0026, + "step": 34980 + }, + { + "epoch": 0.5725272028143663, + "grad_norm": 0.19852212071418762, + "learning_rate": 8.97986631784813e-06, + "loss": 0.0025, + "step": 34990 + }, + { + "epoch": 0.5726908287654422, + "grad_norm": 0.17523986101150513, + "learning_rate": 8.979001798811984e-06, + "loss": 0.0043, + "step": 35000 + }, + { + "epoch": 0.5728544547165181, + "grad_norm": 0.03879449516534805, + "learning_rate": 8.978136955261989e-06, + "loss": 0.0024, + "step": 35010 + }, + { + "epoch": 0.5730180806675939, + "grad_norm": 0.21252746880054474, + "learning_rate": 8.977271787268678e-06, + "loss": 0.0035, + "step": 35020 + }, + { + "epoch": 0.5731817066186697, + "grad_norm": 0.025494545698165894, + "learning_rate": 8.97640629490261e-06, + "loss": 0.002, + "step": 35030 + }, + { + "epoch": 0.5733453325697455, + "grad_norm": 0.07665806263685226, + "learning_rate": 8.975540478234377e-06, + "loss": 0.0018, + "step": 35040 + }, + { + "epoch": 0.5735089585208214, + "grad_norm": 0.07661059498786926, + "learning_rate": 8.974674337334586e-06, + "loss": 0.0027, + "step": 35050 + }, + { + "epoch": 0.5736725844718973, + "grad_norm": 0.09893879294395447, + "learning_rate": 8.97380787227388e-06, + "loss": 0.0029, + "step": 35060 + }, + { + "epoch": 0.5738362104229731, + "grad_norm": 0.1142156571149826, + "learning_rate": 8.972941083122923e-06, + "loss": 0.0037, + "step": 35070 + }, + { + "epoch": 0.5739998363740489, + "grad_norm": 0.036033470183610916, + "learning_rate": 8.972073969952411e-06, + "loss": 0.0029, + "step": 35080 + }, + { + "epoch": 0.5741634623251247, + "grad_norm": 0.17171558737754822, + "learning_rate": 8.971206532833058e-06, + "loss": 0.0036, + "step": 35090 + }, + { + "epoch": 0.5743270882762006, + "grad_norm": 0.08280320465564728, + "learning_rate": 8.970338771835612e-06, + "loss": 0.0034, + "step": 35100 + }, + { + "epoch": 0.5744907142272765, + "grad_norm": 0.09014665335416794, + "learning_rate": 8.969470687030843e-06, + "loss": 0.0025, + "step": 35110 + }, + { + "epoch": 0.5746543401783523, + "grad_norm": 0.15907633304595947, + "learning_rate": 8.96860227848955e-06, + "loss": 0.0029, + "step": 35120 + }, + { + "epoch": 0.5748179661294281, + "grad_norm": 0.10765702277421951, + "learning_rate": 8.96773354628256e-06, + "loss": 0.002, + "step": 35130 + }, + { + "epoch": 0.5749815920805039, + "grad_norm": 0.0546712689101696, + "learning_rate": 8.96686449048072e-06, + "loss": 0.0028, + "step": 35140 + }, + { + "epoch": 0.5751452180315798, + "grad_norm": 0.1573525071144104, + "learning_rate": 8.965995111154909e-06, + "loss": 0.0023, + "step": 35150 + }, + { + "epoch": 0.5753088439826557, + "grad_norm": 0.11107318848371506, + "learning_rate": 8.965125408376029e-06, + "loss": 0.0022, + "step": 35160 + }, + { + "epoch": 0.5754724699337315, + "grad_norm": 0.0553484745323658, + "learning_rate": 8.964255382215012e-06, + "loss": 0.0032, + "step": 35170 + }, + { + "epoch": 0.5756360958848074, + "grad_norm": 0.16430622339248657, + "learning_rate": 8.963385032742813e-06, + "loss": 0.0016, + "step": 35180 + }, + { + "epoch": 0.5757997218358831, + "grad_norm": 0.15215294063091278, + "learning_rate": 8.962514360030417e-06, + "loss": 0.0055, + "step": 35190 + }, + { + "epoch": 0.575963347786959, + "grad_norm": 0.10163141787052155, + "learning_rate": 8.961643364148829e-06, + "loss": 0.003, + "step": 35200 + }, + { + "epoch": 0.5761269737380349, + "grad_norm": 0.16122987866401672, + "learning_rate": 8.960772045169088e-06, + "loss": 0.0046, + "step": 35210 + }, + { + "epoch": 0.5762905996891107, + "grad_norm": 0.15351475775241852, + "learning_rate": 8.959900403162255e-06, + "loss": 0.0034, + "step": 35220 + }, + { + "epoch": 0.5764542256401866, + "grad_norm": 0.06374000012874603, + "learning_rate": 8.959028438199417e-06, + "loss": 0.0043, + "step": 35230 + }, + { + "epoch": 0.5766178515912623, + "grad_norm": 0.04834722727537155, + "learning_rate": 8.958156150351692e-06, + "loss": 0.0029, + "step": 35240 + }, + { + "epoch": 0.5767814775423382, + "grad_norm": 0.04215531051158905, + "learning_rate": 8.957283539690215e-06, + "loss": 0.0028, + "step": 35250 + }, + { + "epoch": 0.5769451034934141, + "grad_norm": 0.04755663871765137, + "learning_rate": 8.956410606286157e-06, + "loss": 0.0019, + "step": 35260 + }, + { + "epoch": 0.5771087294444899, + "grad_norm": 0.11351222544908524, + "learning_rate": 8.955537350210712e-06, + "loss": 0.0018, + "step": 35270 + }, + { + "epoch": 0.5772723553955658, + "grad_norm": 0.1154792532324791, + "learning_rate": 8.954663771535098e-06, + "loss": 0.0025, + "step": 35280 + }, + { + "epoch": 0.5774359813466415, + "grad_norm": 0.00735984742641449, + "learning_rate": 8.953789870330562e-06, + "loss": 0.0022, + "step": 35290 + }, + { + "epoch": 0.5775996072977174, + "grad_norm": 0.08282400667667389, + "learning_rate": 8.952915646668376e-06, + "loss": 0.0017, + "step": 35300 + }, + { + "epoch": 0.5777632332487933, + "grad_norm": 0.10006248205900192, + "learning_rate": 8.952041100619837e-06, + "loss": 0.0026, + "step": 35310 + }, + { + "epoch": 0.5779268591998691, + "grad_norm": 0.04867241159081459, + "learning_rate": 8.951166232256273e-06, + "loss": 0.0032, + "step": 35320 + }, + { + "epoch": 0.578090485150945, + "grad_norm": 0.18728525936603546, + "learning_rate": 8.950291041649037e-06, + "loss": 0.0035, + "step": 35330 + }, + { + "epoch": 0.5782541111020207, + "grad_norm": 0.13466301560401917, + "learning_rate": 8.9494155288695e-06, + "loss": 0.0037, + "step": 35340 + }, + { + "epoch": 0.5784177370530966, + "grad_norm": 0.1026085615158081, + "learning_rate": 8.948539693989072e-06, + "loss": 0.0036, + "step": 35350 + }, + { + "epoch": 0.5785813630041725, + "grad_norm": 0.15514378249645233, + "learning_rate": 8.947663537079178e-06, + "loss": 0.0023, + "step": 35360 + }, + { + "epoch": 0.5787449889552483, + "grad_norm": 0.15846118330955505, + "learning_rate": 8.946787058211279e-06, + "loss": 0.0033, + "step": 35370 + }, + { + "epoch": 0.5789086149063242, + "grad_norm": 0.2537915110588074, + "learning_rate": 8.945910257456856e-06, + "loss": 0.0033, + "step": 35380 + }, + { + "epoch": 0.5790722408574, + "grad_norm": 0.09212382137775421, + "learning_rate": 8.945033134887417e-06, + "loss": 0.0046, + "step": 35390 + }, + { + "epoch": 0.5792358668084758, + "grad_norm": 0.11593269556760788, + "learning_rate": 8.944155690574497e-06, + "loss": 0.0053, + "step": 35400 + }, + { + "epoch": 0.5793994927595517, + "grad_norm": 0.11310001462697983, + "learning_rate": 8.943277924589659e-06, + "loss": 0.0041, + "step": 35410 + }, + { + "epoch": 0.5795631187106275, + "grad_norm": 0.006186493672430515, + "learning_rate": 8.942399837004489e-06, + "loss": 0.0027, + "step": 35420 + }, + { + "epoch": 0.5797267446617034, + "grad_norm": 0.07787127792835236, + "learning_rate": 8.9415214278906e-06, + "loss": 0.0025, + "step": 35430 + }, + { + "epoch": 0.5798903706127791, + "grad_norm": 0.17375636100769043, + "learning_rate": 8.940642697319637e-06, + "loss": 0.0038, + "step": 35440 + }, + { + "epoch": 0.580053996563855, + "grad_norm": 0.1743973195552826, + "learning_rate": 8.939763645363262e-06, + "loss": 0.0032, + "step": 35450 + }, + { + "epoch": 0.5802176225149309, + "grad_norm": 0.24443970620632172, + "learning_rate": 8.938884272093166e-06, + "loss": 0.003, + "step": 35460 + }, + { + "epoch": 0.5803812484660067, + "grad_norm": 0.03497815132141113, + "learning_rate": 8.938004577581071e-06, + "loss": 0.0033, + "step": 35470 + }, + { + "epoch": 0.5805448744170826, + "grad_norm": 0.11990688741207123, + "learning_rate": 8.937124561898723e-06, + "loss": 0.0028, + "step": 35480 + }, + { + "epoch": 0.5807085003681584, + "grad_norm": 0.05437789112329483, + "learning_rate": 8.936244225117887e-06, + "loss": 0.0028, + "step": 35490 + }, + { + "epoch": 0.5808721263192342, + "grad_norm": 0.06166590005159378, + "learning_rate": 8.935363567310367e-06, + "loss": 0.0021, + "step": 35500 + }, + { + "epoch": 0.5810357522703101, + "grad_norm": 0.005976386368274689, + "learning_rate": 8.934482588547983e-06, + "loss": 0.0019, + "step": 35510 + }, + { + "epoch": 0.5811993782213859, + "grad_norm": 0.0694926455616951, + "learning_rate": 8.933601288902587e-06, + "loss": 0.003, + "step": 35520 + }, + { + "epoch": 0.5813630041724618, + "grad_norm": 0.06724761426448822, + "learning_rate": 8.93271966844605e-06, + "loss": 0.0033, + "step": 35530 + }, + { + "epoch": 0.5815266301235376, + "grad_norm": 0.07907678186893463, + "learning_rate": 8.93183772725028e-06, + "loss": 0.0028, + "step": 35540 + }, + { + "epoch": 0.5816902560746134, + "grad_norm": 0.023092232644557953, + "learning_rate": 8.930955465387201e-06, + "loss": 0.0025, + "step": 35550 + }, + { + "epoch": 0.5818538820256893, + "grad_norm": 0.18264669179916382, + "learning_rate": 8.930072882928768e-06, + "loss": 0.0018, + "step": 35560 + }, + { + "epoch": 0.5820175079767651, + "grad_norm": 0.09096872061491013, + "learning_rate": 8.929189979946964e-06, + "loss": 0.0031, + "step": 35570 + }, + { + "epoch": 0.582181133927841, + "grad_norm": 0.03712155669927597, + "learning_rate": 8.92830675651379e-06, + "loss": 0.0033, + "step": 35580 + }, + { + "epoch": 0.5823447598789168, + "grad_norm": 0.08800095319747925, + "learning_rate": 8.927423212701284e-06, + "loss": 0.0024, + "step": 35590 + }, + { + "epoch": 0.5825083858299926, + "grad_norm": 0.13426180183887482, + "learning_rate": 8.926539348581505e-06, + "loss": 0.0043, + "step": 35600 + }, + { + "epoch": 0.5826720117810685, + "grad_norm": 0.04330554977059364, + "learning_rate": 8.925655164226534e-06, + "loss": 0.0028, + "step": 35610 + }, + { + "epoch": 0.5828356377321443, + "grad_norm": 0.03764757513999939, + "learning_rate": 8.924770659708483e-06, + "loss": 0.0032, + "step": 35620 + }, + { + "epoch": 0.5829992636832202, + "grad_norm": 0.10116902738809586, + "learning_rate": 8.923885835099493e-06, + "loss": 0.0042, + "step": 35630 + }, + { + "epoch": 0.583162889634296, + "grad_norm": 0.23714792728424072, + "learning_rate": 8.923000690471723e-06, + "loss": 0.0043, + "step": 35640 + }, + { + "epoch": 0.5833265155853719, + "grad_norm": 0.1362658441066742, + "learning_rate": 8.922115225897363e-06, + "loss": 0.0035, + "step": 35650 + }, + { + "epoch": 0.5834901415364476, + "grad_norm": 0.061279796063899994, + "learning_rate": 8.921229441448632e-06, + "loss": 0.0035, + "step": 35660 + }, + { + "epoch": 0.5836537674875235, + "grad_norm": 0.04157334193587303, + "learning_rate": 8.920343337197766e-06, + "loss": 0.0023, + "step": 35670 + }, + { + "epoch": 0.5838173934385994, + "grad_norm": 0.06357670575380325, + "learning_rate": 8.919456913217037e-06, + "loss": 0.0022, + "step": 35680 + }, + { + "epoch": 0.5839810193896752, + "grad_norm": 0.03231954947113991, + "learning_rate": 8.918570169578736e-06, + "loss": 0.0027, + "step": 35690 + }, + { + "epoch": 0.5841446453407511, + "grad_norm": 0.01109381765127182, + "learning_rate": 8.917683106355186e-06, + "loss": 0.0007, + "step": 35700 + }, + { + "epoch": 0.5843082712918268, + "grad_norm": 0.12514245510101318, + "learning_rate": 8.91679572361873e-06, + "loss": 0.0038, + "step": 35710 + }, + { + "epoch": 0.5844718972429027, + "grad_norm": 0.1741524487733841, + "learning_rate": 8.91590802144174e-06, + "loss": 0.0026, + "step": 35720 + }, + { + "epoch": 0.5846355231939786, + "grad_norm": 0.190440371632576, + "learning_rate": 8.915019999896616e-06, + "loss": 0.0029, + "step": 35730 + }, + { + "epoch": 0.5847991491450544, + "grad_norm": 0.05949242040514946, + "learning_rate": 8.91413165905578e-06, + "loss": 0.0025, + "step": 35740 + }, + { + "epoch": 0.5849627750961303, + "grad_norm": 0.08088551461696625, + "learning_rate": 8.913242998991684e-06, + "loss": 0.0024, + "step": 35750 + }, + { + "epoch": 0.585126401047206, + "grad_norm": 0.13037364184856415, + "learning_rate": 8.912354019776804e-06, + "loss": 0.0073, + "step": 35760 + }, + { + "epoch": 0.5852900269982819, + "grad_norm": 0.05738003924489021, + "learning_rate": 8.911464721483638e-06, + "loss": 0.0041, + "step": 35770 + }, + { + "epoch": 0.5854536529493578, + "grad_norm": 0.05892040207982063, + "learning_rate": 8.91057510418472e-06, + "loss": 0.0027, + "step": 35780 + }, + { + "epoch": 0.5856172789004336, + "grad_norm": 0.042426131665706635, + "learning_rate": 8.9096851679526e-06, + "loss": 0.0025, + "step": 35790 + }, + { + "epoch": 0.5857809048515095, + "grad_norm": 0.18582364916801453, + "learning_rate": 8.908794912859859e-06, + "loss": 0.0035, + "step": 35800 + }, + { + "epoch": 0.5859445308025852, + "grad_norm": 0.07756080478429794, + "learning_rate": 8.907904338979105e-06, + "loss": 0.0026, + "step": 35810 + }, + { + "epoch": 0.5861081567536611, + "grad_norm": 0.10745445638895035, + "learning_rate": 8.90701344638297e-06, + "loss": 0.0031, + "step": 35820 + }, + { + "epoch": 0.586271782704737, + "grad_norm": 0.08886758238077164, + "learning_rate": 8.90612223514411e-06, + "loss": 0.0014, + "step": 35830 + }, + { + "epoch": 0.5864354086558128, + "grad_norm": 0.08024553209543228, + "learning_rate": 8.905230705335209e-06, + "loss": 0.0022, + "step": 35840 + }, + { + "epoch": 0.5865990346068887, + "grad_norm": 0.015699956566095352, + "learning_rate": 8.90433885702898e-06, + "loss": 0.0017, + "step": 35850 + }, + { + "epoch": 0.5867626605579644, + "grad_norm": 0.02287263423204422, + "learning_rate": 8.903446690298156e-06, + "loss": 0.0045, + "step": 35860 + }, + { + "epoch": 0.5869262865090403, + "grad_norm": 0.021680114790797234, + "learning_rate": 8.902554205215503e-06, + "loss": 0.0017, + "step": 35870 + }, + { + "epoch": 0.5870899124601162, + "grad_norm": 0.14445845782756805, + "learning_rate": 8.901661401853806e-06, + "loss": 0.0025, + "step": 35880 + }, + { + "epoch": 0.587253538411192, + "grad_norm": 0.14091873168945312, + "learning_rate": 8.900768280285878e-06, + "loss": 0.0035, + "step": 35890 + }, + { + "epoch": 0.5874171643622679, + "grad_norm": 0.005799212027341127, + "learning_rate": 8.899874840584561e-06, + "loss": 0.0019, + "step": 35900 + }, + { + "epoch": 0.5875807903133436, + "grad_norm": 0.08871165663003922, + "learning_rate": 8.898981082822723e-06, + "loss": 0.002, + "step": 35910 + }, + { + "epoch": 0.5877444162644195, + "grad_norm": 0.062440838664770126, + "learning_rate": 8.89808700707325e-06, + "loss": 0.0027, + "step": 35920 + }, + { + "epoch": 0.5879080422154954, + "grad_norm": 0.0891856923699379, + "learning_rate": 8.897192613409065e-06, + "loss": 0.002, + "step": 35930 + }, + { + "epoch": 0.5880716681665712, + "grad_norm": 0.2031840682029724, + "learning_rate": 8.896297901903108e-06, + "loss": 0.0031, + "step": 35940 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.09049496799707413, + "learning_rate": 8.895402872628352e-06, + "loss": 0.0027, + "step": 35950 + }, + { + "epoch": 0.5883989200687229, + "grad_norm": 0.11323459446430206, + "learning_rate": 8.894507525657792e-06, + "loss": 0.0023, + "step": 35960 + }, + { + "epoch": 0.5885625460197987, + "grad_norm": 0.04053853824734688, + "learning_rate": 8.893611861064447e-06, + "loss": 0.0025, + "step": 35970 + }, + { + "epoch": 0.5887261719708746, + "grad_norm": 0.1629210114479065, + "learning_rate": 8.892715878921366e-06, + "loss": 0.003, + "step": 35980 + }, + { + "epoch": 0.5888897979219504, + "grad_norm": 0.06607711315155029, + "learning_rate": 8.891819579301623e-06, + "loss": 0.0033, + "step": 35990 + }, + { + "epoch": 0.5890534238730263, + "grad_norm": 0.08085120469331741, + "learning_rate": 8.890922962278315e-06, + "loss": 0.0022, + "step": 36000 + }, + { + "epoch": 0.5892170498241021, + "grad_norm": 0.07256773114204407, + "learning_rate": 8.89002602792457e-06, + "loss": 0.003, + "step": 36010 + }, + { + "epoch": 0.5893806757751779, + "grad_norm": 0.10660307109355927, + "learning_rate": 8.889128776313536e-06, + "loss": 0.0025, + "step": 36020 + }, + { + "epoch": 0.5895443017262538, + "grad_norm": 0.11291348189115524, + "learning_rate": 8.88823120751839e-06, + "loss": 0.0029, + "step": 36030 + }, + { + "epoch": 0.5897079276773296, + "grad_norm": 0.03786439076066017, + "learning_rate": 8.887333321612338e-06, + "loss": 0.003, + "step": 36040 + }, + { + "epoch": 0.5898715536284055, + "grad_norm": 0.13428544998168945, + "learning_rate": 8.886435118668604e-06, + "loss": 0.0031, + "step": 36050 + }, + { + "epoch": 0.5900351795794813, + "grad_norm": 0.11745154112577438, + "learning_rate": 8.885536598760445e-06, + "loss": 0.003, + "step": 36060 + }, + { + "epoch": 0.5901988055305571, + "grad_norm": 0.09285806119441986, + "learning_rate": 8.884637761961142e-06, + "loss": 0.0026, + "step": 36070 + }, + { + "epoch": 0.590362431481633, + "grad_norm": 0.07872680574655533, + "learning_rate": 8.883738608343997e-06, + "loss": 0.003, + "step": 36080 + }, + { + "epoch": 0.5905260574327088, + "grad_norm": 0.12723150849342346, + "learning_rate": 8.882839137982347e-06, + "loss": 0.002, + "step": 36090 + }, + { + "epoch": 0.5906896833837847, + "grad_norm": 0.03077523224055767, + "learning_rate": 8.881939350949547e-06, + "loss": 0.0022, + "step": 36100 + }, + { + "epoch": 0.5908533093348605, + "grad_norm": 0.15075184404850006, + "learning_rate": 8.88103924731898e-06, + "loss": 0.0031, + "step": 36110 + }, + { + "epoch": 0.5910169352859364, + "grad_norm": 0.06483548879623413, + "learning_rate": 8.880138827164057e-06, + "loss": 0.0019, + "step": 36120 + }, + { + "epoch": 0.5911805612370122, + "grad_norm": 0.07136178016662598, + "learning_rate": 8.879238090558214e-06, + "loss": 0.002, + "step": 36130 + }, + { + "epoch": 0.591344187188088, + "grad_norm": 0.12176304310560226, + "learning_rate": 8.87833703757491e-06, + "loss": 0.0033, + "step": 36140 + }, + { + "epoch": 0.5915078131391639, + "grad_norm": 0.12080062180757523, + "learning_rate": 8.877435668287633e-06, + "loss": 0.0016, + "step": 36150 + }, + { + "epoch": 0.5916714390902397, + "grad_norm": 0.045237571001052856, + "learning_rate": 8.876533982769893e-06, + "loss": 0.0029, + "step": 36160 + }, + { + "epoch": 0.5918350650413156, + "grad_norm": 0.05660109594464302, + "learning_rate": 8.875631981095232e-06, + "loss": 0.0025, + "step": 36170 + }, + { + "epoch": 0.5919986909923914, + "grad_norm": 0.1006564348936081, + "learning_rate": 8.874729663337213e-06, + "loss": 0.0042, + "step": 36180 + }, + { + "epoch": 0.5921623169434672, + "grad_norm": 0.14676658809185028, + "learning_rate": 8.873827029569424e-06, + "loss": 0.0033, + "step": 36190 + }, + { + "epoch": 0.5923259428945431, + "grad_norm": 0.027155034244060516, + "learning_rate": 8.872924079865483e-06, + "loss": 0.002, + "step": 36200 + }, + { + "epoch": 0.5924895688456189, + "grad_norm": 0.1798582375049591, + "learning_rate": 8.87202081429903e-06, + "loss": 0.0035, + "step": 36210 + }, + { + "epoch": 0.5926531947966948, + "grad_norm": 0.1418972611427307, + "learning_rate": 8.871117232943737e-06, + "loss": 0.0037, + "step": 36220 + }, + { + "epoch": 0.5928168207477706, + "grad_norm": 0.05614809691905975, + "learning_rate": 8.87021333587329e-06, + "loss": 0.0026, + "step": 36230 + }, + { + "epoch": 0.5929804466988464, + "grad_norm": 0.055441975593566895, + "learning_rate": 8.86930912316141e-06, + "loss": 0.0049, + "step": 36240 + }, + { + "epoch": 0.5931440726499223, + "grad_norm": 0.053149133920669556, + "learning_rate": 8.868404594881846e-06, + "loss": 0.0032, + "step": 36250 + }, + { + "epoch": 0.5933076986009981, + "grad_norm": 0.09360457956790924, + "learning_rate": 8.867499751108362e-06, + "loss": 0.002, + "step": 36260 + }, + { + "epoch": 0.593471324552074, + "grad_norm": 0.13754481077194214, + "learning_rate": 8.866594591914756e-06, + "loss": 0.0028, + "step": 36270 + }, + { + "epoch": 0.5936349505031498, + "grad_norm": 0.2108798772096634, + "learning_rate": 8.865689117374853e-06, + "loss": 0.0023, + "step": 36280 + }, + { + "epoch": 0.5937985764542256, + "grad_norm": 0.05230475962162018, + "learning_rate": 8.864783327562496e-06, + "loss": 0.002, + "step": 36290 + }, + { + "epoch": 0.5939622024053015, + "grad_norm": 0.055224016308784485, + "learning_rate": 8.86387722255156e-06, + "loss": 0.0025, + "step": 36300 + }, + { + "epoch": 0.5941258283563773, + "grad_norm": 0.09527760744094849, + "learning_rate": 8.862970802415945e-06, + "loss": 0.004, + "step": 36310 + }, + { + "epoch": 0.5942894543074532, + "grad_norm": 0.07829606533050537, + "learning_rate": 8.862064067229573e-06, + "loss": 0.0022, + "step": 36320 + }, + { + "epoch": 0.594453080258529, + "grad_norm": 0.08064805716276169, + "learning_rate": 8.861157017066396e-06, + "loss": 0.0042, + "step": 36330 + }, + { + "epoch": 0.5946167062096048, + "grad_norm": 0.18253670632839203, + "learning_rate": 8.86024965200039e-06, + "loss": 0.0037, + "step": 36340 + }, + { + "epoch": 0.5947803321606807, + "grad_norm": 0.06974855065345764, + "learning_rate": 8.859341972105555e-06, + "loss": 0.0029, + "step": 36350 + }, + { + "epoch": 0.5949439581117565, + "grad_norm": 0.14941421151161194, + "learning_rate": 8.858433977455921e-06, + "loss": 0.0026, + "step": 36360 + }, + { + "epoch": 0.5951075840628324, + "grad_norm": 0.2193268984556198, + "learning_rate": 8.857525668125538e-06, + "loss": 0.0035, + "step": 36370 + }, + { + "epoch": 0.5952712100139083, + "grad_norm": 0.032499201595783234, + "learning_rate": 8.856617044188486e-06, + "loss": 0.0021, + "step": 36380 + }, + { + "epoch": 0.595434835964984, + "grad_norm": 0.06605914235115051, + "learning_rate": 8.855708105718869e-06, + "loss": 0.0021, + "step": 36390 + }, + { + "epoch": 0.5955984619160599, + "grad_norm": 0.13698311150074005, + "learning_rate": 8.854798852790818e-06, + "loss": 0.0041, + "step": 36400 + }, + { + "epoch": 0.5957620878671357, + "grad_norm": 0.2649853825569153, + "learning_rate": 8.853889285478487e-06, + "loss": 0.0031, + "step": 36410 + }, + { + "epoch": 0.5959257138182116, + "grad_norm": 0.09784290939569473, + "learning_rate": 8.852979403856059e-06, + "loss": 0.0032, + "step": 36420 + }, + { + "epoch": 0.5960893397692875, + "grad_norm": 0.02051716484129429, + "learning_rate": 8.852069207997737e-06, + "loss": 0.0038, + "step": 36430 + }, + { + "epoch": 0.5962529657203632, + "grad_norm": 0.05615180358290672, + "learning_rate": 8.85115869797776e-06, + "loss": 0.0042, + "step": 36440 + }, + { + "epoch": 0.5964165916714391, + "grad_norm": 0.04665743559598923, + "learning_rate": 8.85024787387038e-06, + "loss": 0.0044, + "step": 36450 + }, + { + "epoch": 0.5965802176225149, + "grad_norm": 0.0609181746840477, + "learning_rate": 8.849336735749884e-06, + "loss": 0.0016, + "step": 36460 + }, + { + "epoch": 0.5967438435735908, + "grad_norm": 0.03353914991021156, + "learning_rate": 8.84842528369058e-06, + "loss": 0.0016, + "step": 36470 + }, + { + "epoch": 0.5969074695246667, + "grad_norm": 0.03775521740317345, + "learning_rate": 8.847513517766804e-06, + "loss": 0.0015, + "step": 36480 + }, + { + "epoch": 0.5970710954757424, + "grad_norm": 0.11112702637910843, + "learning_rate": 8.846601438052915e-06, + "loss": 0.0027, + "step": 36490 + }, + { + "epoch": 0.5972347214268183, + "grad_norm": 0.09484587609767914, + "learning_rate": 8.8456890446233e-06, + "loss": 0.0019, + "step": 36500 + }, + { + "epoch": 0.5973983473778941, + "grad_norm": 0.08690627664327621, + "learning_rate": 8.844776337552372e-06, + "loss": 0.0033, + "step": 36510 + }, + { + "epoch": 0.59756197332897, + "grad_norm": 0.1479829102754593, + "learning_rate": 8.843863316914566e-06, + "loss": 0.0025, + "step": 36520 + }, + { + "epoch": 0.5977255992800458, + "grad_norm": 0.10502468794584274, + "learning_rate": 8.842949982784346e-06, + "loss": 0.0021, + "step": 36530 + }, + { + "epoch": 0.5978892252311216, + "grad_norm": 0.06019517034292221, + "learning_rate": 8.842036335236199e-06, + "loss": 0.0032, + "step": 36540 + }, + { + "epoch": 0.5980528511821975, + "grad_norm": 0.26551783084869385, + "learning_rate": 8.841122374344642e-06, + "loss": 0.0046, + "step": 36550 + }, + { + "epoch": 0.5982164771332733, + "grad_norm": 0.11244402080774307, + "learning_rate": 8.840208100184212e-06, + "loss": 0.0031, + "step": 36560 + }, + { + "epoch": 0.5983801030843492, + "grad_norm": 0.06438885629177094, + "learning_rate": 8.839293512829474e-06, + "loss": 0.0022, + "step": 36570 + }, + { + "epoch": 0.598543729035425, + "grad_norm": 0.0021785805001854897, + "learning_rate": 8.83837861235502e-06, + "loss": 0.004, + "step": 36580 + }, + { + "epoch": 0.5987073549865009, + "grad_norm": 0.1145516037940979, + "learning_rate": 8.837463398835467e-06, + "loss": 0.0021, + "step": 36590 + }, + { + "epoch": 0.5988709809375767, + "grad_norm": 0.050754331052303314, + "learning_rate": 8.836547872345454e-06, + "loss": 0.0038, + "step": 36600 + }, + { + "epoch": 0.5990346068886525, + "grad_norm": 0.11124119162559509, + "learning_rate": 8.835632032959651e-06, + "loss": 0.0032, + "step": 36610 + }, + { + "epoch": 0.5991982328397284, + "grad_norm": 0.0438012033700943, + "learning_rate": 8.834715880752747e-06, + "loss": 0.0023, + "step": 36620 + }, + { + "epoch": 0.5993618587908042, + "grad_norm": 0.020244888961315155, + "learning_rate": 8.833799415799464e-06, + "loss": 0.0027, + "step": 36630 + }, + { + "epoch": 0.59952548474188, + "grad_norm": 0.13828010857105255, + "learning_rate": 8.832882638174545e-06, + "loss": 0.0031, + "step": 36640 + }, + { + "epoch": 0.5996891106929559, + "grad_norm": 0.13835541903972626, + "learning_rate": 8.83196554795276e-06, + "loss": 0.0021, + "step": 36650 + }, + { + "epoch": 0.5998527366440317, + "grad_norm": 0.20447050034999847, + "learning_rate": 8.831048145208898e-06, + "loss": 0.003, + "step": 36660 + }, + { + "epoch": 0.6000163625951076, + "grad_norm": 0.07340088486671448, + "learning_rate": 8.830130430017788e-06, + "loss": 0.004, + "step": 36670 + }, + { + "epoch": 0.6001799885461834, + "grad_norm": 0.09831058979034424, + "learning_rate": 8.829212402454269e-06, + "loss": 0.0032, + "step": 36680 + }, + { + "epoch": 0.6003436144972593, + "grad_norm": 0.09342899918556213, + "learning_rate": 8.828294062593214e-06, + "loss": 0.0028, + "step": 36690 + }, + { + "epoch": 0.6005072404483351, + "grad_norm": 0.05146569013595581, + "learning_rate": 8.827375410509522e-06, + "loss": 0.0035, + "step": 36700 + }, + { + "epoch": 0.6006708663994109, + "grad_norm": 0.02984634041786194, + "learning_rate": 8.826456446278111e-06, + "loss": 0.0039, + "step": 36710 + }, + { + "epoch": 0.6008344923504868, + "grad_norm": 0.09399478137493134, + "learning_rate": 8.825537169973932e-06, + "loss": 0.0021, + "step": 36720 + }, + { + "epoch": 0.6009981183015626, + "grad_norm": 0.02587178163230419, + "learning_rate": 8.824617581671958e-06, + "loss": 0.0039, + "step": 36730 + }, + { + "epoch": 0.6011617442526385, + "grad_norm": 0.07559508085250854, + "learning_rate": 8.823697681447186e-06, + "loss": 0.002, + "step": 36740 + }, + { + "epoch": 0.6013253702037143, + "grad_norm": 0.06608793139457703, + "learning_rate": 8.822777469374641e-06, + "loss": 0.0029, + "step": 36750 + }, + { + "epoch": 0.6014889961547901, + "grad_norm": 0.09661190211772919, + "learning_rate": 8.821856945529371e-06, + "loss": 0.0032, + "step": 36760 + }, + { + "epoch": 0.601652622105866, + "grad_norm": 0.09360766410827637, + "learning_rate": 8.820936109986451e-06, + "loss": 0.003, + "step": 36770 + }, + { + "epoch": 0.6018162480569418, + "grad_norm": 0.08921334147453308, + "learning_rate": 8.820014962820984e-06, + "loss": 0.0036, + "step": 36780 + }, + { + "epoch": 0.6019798740080177, + "grad_norm": 0.12837113440036774, + "learning_rate": 8.819093504108092e-06, + "loss": 0.0027, + "step": 36790 + }, + { + "epoch": 0.6021434999590936, + "grad_norm": 0.04904981702566147, + "learning_rate": 8.818171733922927e-06, + "loss": 0.0015, + "step": 36800 + }, + { + "epoch": 0.6023071259101693, + "grad_norm": 0.13305571675300598, + "learning_rate": 8.817249652340667e-06, + "loss": 0.0021, + "step": 36810 + }, + { + "epoch": 0.6024707518612452, + "grad_norm": 0.20025058090686798, + "learning_rate": 8.816327259436513e-06, + "loss": 0.0024, + "step": 36820 + }, + { + "epoch": 0.602634377812321, + "grad_norm": 0.1698462963104248, + "learning_rate": 8.81540455528569e-06, + "loss": 0.0029, + "step": 36830 + }, + { + "epoch": 0.6027980037633969, + "grad_norm": 0.031062902882695198, + "learning_rate": 8.814481539963456e-06, + "loss": 0.0055, + "step": 36840 + }, + { + "epoch": 0.6029616297144728, + "grad_norm": 0.06259023398160934, + "learning_rate": 8.813558213545082e-06, + "loss": 0.001, + "step": 36850 + }, + { + "epoch": 0.6031252556655485, + "grad_norm": 0.17906297743320465, + "learning_rate": 8.812634576105877e-06, + "loss": 0.0043, + "step": 36860 + }, + { + "epoch": 0.6032888816166244, + "grad_norm": 0.03559622913599014, + "learning_rate": 8.811710627721167e-06, + "loss": 0.0042, + "step": 36870 + }, + { + "epoch": 0.6034525075677002, + "grad_norm": 0.05674935504794121, + "learning_rate": 8.810786368466307e-06, + "loss": 0.0046, + "step": 36880 + }, + { + "epoch": 0.6036161335187761, + "grad_norm": 0.08643285930156708, + "learning_rate": 8.809861798416675e-06, + "loss": 0.0035, + "step": 36890 + }, + { + "epoch": 0.603779759469852, + "grad_norm": 0.11587459594011307, + "learning_rate": 8.808936917647678e-06, + "loss": 0.0037, + "step": 36900 + }, + { + "epoch": 0.6039433854209277, + "grad_norm": 0.07755623012781143, + "learning_rate": 8.808011726234746e-06, + "loss": 0.0034, + "step": 36910 + }, + { + "epoch": 0.6041070113720036, + "grad_norm": 0.08583834767341614, + "learning_rate": 8.807086224253332e-06, + "loss": 0.0028, + "step": 36920 + }, + { + "epoch": 0.6042706373230794, + "grad_norm": 0.08574008196592331, + "learning_rate": 8.806160411778919e-06, + "loss": 0.0036, + "step": 36930 + }, + { + "epoch": 0.6044342632741553, + "grad_norm": 0.1612904965877533, + "learning_rate": 8.805234288887012e-06, + "loss": 0.0037, + "step": 36940 + }, + { + "epoch": 0.6045978892252312, + "grad_norm": 0.038598284125328064, + "learning_rate": 8.804307855653145e-06, + "loss": 0.0029, + "step": 36950 + }, + { + "epoch": 0.6047615151763069, + "grad_norm": 0.2639501690864563, + "learning_rate": 8.80338111215287e-06, + "loss": 0.0019, + "step": 36960 + }, + { + "epoch": 0.6049251411273828, + "grad_norm": 0.10830695927143097, + "learning_rate": 8.802454058461774e-06, + "loss": 0.0045, + "step": 36970 + }, + { + "epoch": 0.6050887670784586, + "grad_norm": 0.05082109943032265, + "learning_rate": 8.80152669465546e-06, + "loss": 0.0042, + "step": 36980 + }, + { + "epoch": 0.6052523930295345, + "grad_norm": 0.14712199568748474, + "learning_rate": 8.800599020809566e-06, + "loss": 0.0019, + "step": 36990 + }, + { + "epoch": 0.6054160189806104, + "grad_norm": 0.10536909103393555, + "learning_rate": 8.799671036999743e-06, + "loss": 0.0027, + "step": 37000 + }, + { + "epoch": 0.6055796449316861, + "grad_norm": 0.09069158136844635, + "learning_rate": 8.79874274330168e-06, + "loss": 0.0033, + "step": 37010 + }, + { + "epoch": 0.605743270882762, + "grad_norm": 0.050764184445142746, + "learning_rate": 8.797814139791081e-06, + "loss": 0.0029, + "step": 37020 + }, + { + "epoch": 0.6059068968338378, + "grad_norm": 0.06032281741499901, + "learning_rate": 8.796885226543682e-06, + "loss": 0.0023, + "step": 37030 + }, + { + "epoch": 0.6060705227849137, + "grad_norm": 0.07672061771154404, + "learning_rate": 8.795956003635245e-06, + "loss": 0.0022, + "step": 37040 + }, + { + "epoch": 0.6062341487359896, + "grad_norm": 0.06515415012836456, + "learning_rate": 8.79502647114155e-06, + "loss": 0.0077, + "step": 37050 + }, + { + "epoch": 0.6063977746870653, + "grad_norm": 0.2183670997619629, + "learning_rate": 8.794096629138407e-06, + "loss": 0.0046, + "step": 37060 + }, + { + "epoch": 0.6065614006381412, + "grad_norm": 0.06741929799318314, + "learning_rate": 8.793166477701653e-06, + "loss": 0.0032, + "step": 37070 + }, + { + "epoch": 0.606725026589217, + "grad_norm": 0.19403107464313507, + "learning_rate": 8.792236016907146e-06, + "loss": 0.0038, + "step": 37080 + }, + { + "epoch": 0.6068886525402929, + "grad_norm": 0.17053139209747314, + "learning_rate": 8.79130524683077e-06, + "loss": 0.0017, + "step": 37090 + }, + { + "epoch": 0.6070522784913688, + "grad_norm": 0.04111277312040329, + "learning_rate": 8.790374167548438e-06, + "loss": 0.0022, + "step": 37100 + }, + { + "epoch": 0.6072159044424446, + "grad_norm": 0.08792674541473389, + "learning_rate": 8.789442779136086e-06, + "loss": 0.0036, + "step": 37110 + }, + { + "epoch": 0.6073795303935204, + "grad_norm": 0.13975588977336884, + "learning_rate": 8.788511081669676e-06, + "loss": 0.0022, + "step": 37120 + }, + { + "epoch": 0.6075431563445962, + "grad_norm": 0.060710933059453964, + "learning_rate": 8.787579075225187e-06, + "loss": 0.0024, + "step": 37130 + }, + { + "epoch": 0.6077067822956721, + "grad_norm": 0.048313673585653305, + "learning_rate": 8.786646759878638e-06, + "loss": 0.0019, + "step": 37140 + }, + { + "epoch": 0.607870408246748, + "grad_norm": 0.05173991620540619, + "learning_rate": 8.785714135706062e-06, + "loss": 0.0041, + "step": 37150 + }, + { + "epoch": 0.6080340341978238, + "grad_norm": 0.05603951960802078, + "learning_rate": 8.784781202783522e-06, + "loss": 0.0026, + "step": 37160 + }, + { + "epoch": 0.6081976601488996, + "grad_norm": 0.050557494163513184, + "learning_rate": 8.783847961187102e-06, + "loss": 0.0022, + "step": 37170 + }, + { + "epoch": 0.6083612860999754, + "grad_norm": 0.06078165024518967, + "learning_rate": 8.782914410992917e-06, + "loss": 0.0039, + "step": 37180 + }, + { + "epoch": 0.6085249120510513, + "grad_norm": 0.23364116251468658, + "learning_rate": 8.781980552277103e-06, + "loss": 0.0032, + "step": 37190 + }, + { + "epoch": 0.6086885380021272, + "grad_norm": 0.09005658328533173, + "learning_rate": 8.781046385115822e-06, + "loss": 0.0023, + "step": 37200 + }, + { + "epoch": 0.608852163953203, + "grad_norm": 0.03304097428917885, + "learning_rate": 8.780111909585262e-06, + "loss": 0.0029, + "step": 37210 + }, + { + "epoch": 0.6090157899042788, + "grad_norm": 0.23793883621692657, + "learning_rate": 8.779177125761637e-06, + "loss": 0.0084, + "step": 37220 + }, + { + "epoch": 0.6091794158553546, + "grad_norm": 0.18885046243667603, + "learning_rate": 8.778242033721185e-06, + "loss": 0.0023, + "step": 37230 + }, + { + "epoch": 0.6093430418064305, + "grad_norm": 0.1231994703412056, + "learning_rate": 8.777306633540164e-06, + "loss": 0.0027, + "step": 37240 + }, + { + "epoch": 0.6095066677575064, + "grad_norm": 0.1364826112985611, + "learning_rate": 8.776370925294867e-06, + "loss": 0.0026, + "step": 37250 + }, + { + "epoch": 0.6096702937085822, + "grad_norm": 0.051186852157115936, + "learning_rate": 8.775434909061606e-06, + "loss": 0.0024, + "step": 37260 + }, + { + "epoch": 0.609833919659658, + "grad_norm": 0.31306907534599304, + "learning_rate": 8.774498584916718e-06, + "loss": 0.0049, + "step": 37270 + }, + { + "epoch": 0.6099975456107338, + "grad_norm": 0.032567791640758514, + "learning_rate": 8.773561952936569e-06, + "loss": 0.0022, + "step": 37280 + }, + { + "epoch": 0.6101611715618097, + "grad_norm": 0.05053863301873207, + "learning_rate": 8.772625013197544e-06, + "loss": 0.0037, + "step": 37290 + }, + { + "epoch": 0.6103247975128856, + "grad_norm": 0.029592325910925865, + "learning_rate": 8.77168776577606e-06, + "loss": 0.002, + "step": 37300 + }, + { + "epoch": 0.6104884234639614, + "grad_norm": 0.27198657393455505, + "learning_rate": 8.770750210748554e-06, + "loss": 0.0021, + "step": 37310 + }, + { + "epoch": 0.6106520494150373, + "grad_norm": 0.35949841141700745, + "learning_rate": 8.769812348191489e-06, + "loss": 0.0033, + "step": 37320 + }, + { + "epoch": 0.610815675366113, + "grad_norm": 0.17255106568336487, + "learning_rate": 8.768874178181355e-06, + "loss": 0.002, + "step": 37330 + }, + { + "epoch": 0.6109793013171889, + "grad_norm": 0.10463345050811768, + "learning_rate": 8.767935700794666e-06, + "loss": 0.0042, + "step": 37340 + }, + { + "epoch": 0.6111429272682648, + "grad_norm": 0.05498625338077545, + "learning_rate": 8.766996916107962e-06, + "loss": 0.0032, + "step": 37350 + }, + { + "epoch": 0.6113065532193406, + "grad_norm": 0.21080386638641357, + "learning_rate": 8.766057824197804e-06, + "loss": 0.0036, + "step": 37360 + }, + { + "epoch": 0.6114701791704165, + "grad_norm": 0.03491256386041641, + "learning_rate": 8.765118425140784e-06, + "loss": 0.0023, + "step": 37370 + }, + { + "epoch": 0.6116338051214922, + "grad_norm": 0.08403094857931137, + "learning_rate": 8.764178719013516e-06, + "loss": 0.0031, + "step": 37380 + }, + { + "epoch": 0.6117974310725681, + "grad_norm": 0.045884132385253906, + "learning_rate": 8.763238705892638e-06, + "loss": 0.0019, + "step": 37390 + }, + { + "epoch": 0.6119610570236439, + "grad_norm": 0.06711523234844208, + "learning_rate": 8.762298385854814e-06, + "loss": 0.0013, + "step": 37400 + }, + { + "epoch": 0.6121246829747198, + "grad_norm": 0.13800066709518433, + "learning_rate": 8.761357758976737e-06, + "loss": 0.0029, + "step": 37410 + }, + { + "epoch": 0.6122883089257957, + "grad_norm": 0.07132541388273239, + "learning_rate": 8.760416825335115e-06, + "loss": 0.0021, + "step": 37420 + }, + { + "epoch": 0.6124519348768714, + "grad_norm": 0.06595374643802643, + "learning_rate": 8.759475585006691e-06, + "loss": 0.0034, + "step": 37430 + }, + { + "epoch": 0.6126155608279473, + "grad_norm": 0.1668674647808075, + "learning_rate": 8.758534038068231e-06, + "loss": 0.0022, + "step": 37440 + }, + { + "epoch": 0.6127791867790231, + "grad_norm": 0.06871428340673447, + "learning_rate": 8.757592184596522e-06, + "loss": 0.0018, + "step": 37450 + }, + { + "epoch": 0.612942812730099, + "grad_norm": 0.07337944954633713, + "learning_rate": 8.756650024668378e-06, + "loss": 0.004, + "step": 37460 + }, + { + "epoch": 0.6131064386811749, + "grad_norm": 0.13682547211647034, + "learning_rate": 8.75570755836064e-06, + "loss": 0.002, + "step": 37470 + }, + { + "epoch": 0.6132700646322506, + "grad_norm": 0.20227354764938354, + "learning_rate": 8.754764785750172e-06, + "loss": 0.0025, + "step": 37480 + }, + { + "epoch": 0.6134336905833265, + "grad_norm": 0.14616483449935913, + "learning_rate": 8.753821706913862e-06, + "loss": 0.0035, + "step": 37490 + }, + { + "epoch": 0.6135973165344023, + "grad_norm": 0.11802705377340317, + "learning_rate": 8.752878321928626e-06, + "loss": 0.0029, + "step": 37500 + }, + { + "epoch": 0.6137609424854782, + "grad_norm": 0.05202655866742134, + "learning_rate": 8.751934630871401e-06, + "loss": 0.0039, + "step": 37510 + }, + { + "epoch": 0.6139245684365541, + "grad_norm": 0.20219677686691284, + "learning_rate": 8.750990633819155e-06, + "loss": 0.0051, + "step": 37520 + }, + { + "epoch": 0.6140881943876298, + "grad_norm": 0.0687403529882431, + "learning_rate": 8.750046330848872e-06, + "loss": 0.0018, + "step": 37530 + }, + { + "epoch": 0.6142518203387057, + "grad_norm": 0.012231512926518917, + "learning_rate": 8.749101722037571e-06, + "loss": 0.0043, + "step": 37540 + }, + { + "epoch": 0.6144154462897815, + "grad_norm": 0.13095110654830933, + "learning_rate": 8.748156807462289e-06, + "loss": 0.0018, + "step": 37550 + }, + { + "epoch": 0.6145790722408574, + "grad_norm": 0.0067818365059792995, + "learning_rate": 8.747211587200088e-06, + "loss": 0.0022, + "step": 37560 + }, + { + "epoch": 0.6147426981919333, + "grad_norm": 0.0933321937918663, + "learning_rate": 8.746266061328062e-06, + "loss": 0.0024, + "step": 37570 + }, + { + "epoch": 0.614906324143009, + "grad_norm": 0.10935989767313004, + "learning_rate": 8.74532022992332e-06, + "loss": 0.0032, + "step": 37580 + }, + { + "epoch": 0.6150699500940849, + "grad_norm": 0.11650828272104263, + "learning_rate": 8.744374093063004e-06, + "loss": 0.0042, + "step": 37590 + }, + { + "epoch": 0.6152335760451607, + "grad_norm": 0.07681674510240555, + "learning_rate": 8.743427650824276e-06, + "loss": 0.0072, + "step": 37600 + }, + { + "epoch": 0.6153972019962366, + "grad_norm": 0.020434940233826637, + "learning_rate": 8.742480903284326e-06, + "loss": 0.002, + "step": 37610 + }, + { + "epoch": 0.6155608279473125, + "grad_norm": 0.12207379192113876, + "learning_rate": 8.741533850520364e-06, + "loss": 0.0029, + "step": 37620 + }, + { + "epoch": 0.6157244538983883, + "grad_norm": 0.03350047022104263, + "learning_rate": 8.740586492609633e-06, + "loss": 0.0021, + "step": 37630 + }, + { + "epoch": 0.6158880798494641, + "grad_norm": 0.1759958118200302, + "learning_rate": 8.739638829629394e-06, + "loss": 0.0036, + "step": 37640 + }, + { + "epoch": 0.6160517058005399, + "grad_norm": 0.08769656717777252, + "learning_rate": 8.738690861656933e-06, + "loss": 0.003, + "step": 37650 + }, + { + "epoch": 0.6162153317516158, + "grad_norm": 0.15247301757335663, + "learning_rate": 8.737742588769569e-06, + "loss": 0.0024, + "step": 37660 + }, + { + "epoch": 0.6163789577026917, + "grad_norm": 0.08341974765062332, + "learning_rate": 8.736794011044633e-06, + "loss": 0.0028, + "step": 37670 + }, + { + "epoch": 0.6165425836537675, + "grad_norm": 0.11260994523763657, + "learning_rate": 8.735845128559493e-06, + "loss": 0.0035, + "step": 37680 + }, + { + "epoch": 0.6167062096048433, + "grad_norm": 0.09610302001237869, + "learning_rate": 8.734895941391535e-06, + "loss": 0.0023, + "step": 37690 + }, + { + "epoch": 0.6168698355559191, + "grad_norm": 0.1326465606689453, + "learning_rate": 8.733946449618172e-06, + "loss": 0.002, + "step": 37700 + }, + { + "epoch": 0.617033461506995, + "grad_norm": 0.05639144778251648, + "learning_rate": 8.732996653316839e-06, + "loss": 0.0028, + "step": 37710 + }, + { + "epoch": 0.6171970874580709, + "grad_norm": 0.3435528874397278, + "learning_rate": 8.732046552565e-06, + "loss": 0.0028, + "step": 37720 + }, + { + "epoch": 0.6173607134091467, + "grad_norm": 0.1259031444787979, + "learning_rate": 8.731096147440141e-06, + "loss": 0.0032, + "step": 37730 + }, + { + "epoch": 0.6175243393602226, + "grad_norm": 0.03697971999645233, + "learning_rate": 8.730145438019776e-06, + "loss": 0.0044, + "step": 37740 + }, + { + "epoch": 0.6176879653112983, + "grad_norm": 0.1517125815153122, + "learning_rate": 8.72919442438144e-06, + "loss": 0.0029, + "step": 37750 + }, + { + "epoch": 0.6178515912623742, + "grad_norm": 0.0528755821287632, + "learning_rate": 8.728243106602694e-06, + "loss": 0.0025, + "step": 37760 + }, + { + "epoch": 0.6180152172134501, + "grad_norm": 0.1169549971818924, + "learning_rate": 8.727291484761124e-06, + "loss": 0.0042, + "step": 37770 + }, + { + "epoch": 0.6181788431645259, + "grad_norm": 0.15145055949687958, + "learning_rate": 8.726339558934343e-06, + "loss": 0.0035, + "step": 37780 + }, + { + "epoch": 0.6183424691156018, + "grad_norm": 0.026678087189793587, + "learning_rate": 8.725387329199986e-06, + "loss": 0.0016, + "step": 37790 + }, + { + "epoch": 0.6185060950666775, + "grad_norm": 0.003591701854020357, + "learning_rate": 8.724434795635712e-06, + "loss": 0.0029, + "step": 37800 + }, + { + "epoch": 0.6186697210177534, + "grad_norm": 0.02403920330107212, + "learning_rate": 8.723481958319209e-06, + "loss": 0.0024, + "step": 37810 + }, + { + "epoch": 0.6188333469688293, + "grad_norm": 0.10099305212497711, + "learning_rate": 8.722528817328186e-06, + "loss": 0.0048, + "step": 37820 + }, + { + "epoch": 0.6189969729199051, + "grad_norm": 0.07914862036705017, + "learning_rate": 8.721575372740377e-06, + "loss": 0.003, + "step": 37830 + }, + { + "epoch": 0.619160598870981, + "grad_norm": 0.29134613275527954, + "learning_rate": 8.720621624633542e-06, + "loss": 0.0041, + "step": 37840 + }, + { + "epoch": 0.6193242248220567, + "grad_norm": 0.09700335562229156, + "learning_rate": 8.719667573085467e-06, + "loss": 0.0041, + "step": 37850 + }, + { + "epoch": 0.6194878507731326, + "grad_norm": 0.17290009558200836, + "learning_rate": 8.718713218173958e-06, + "loss": 0.0041, + "step": 37860 + }, + { + "epoch": 0.6196514767242085, + "grad_norm": 0.10613647103309631, + "learning_rate": 8.717758559976853e-06, + "loss": 0.003, + "step": 37870 + }, + { + "epoch": 0.6198151026752843, + "grad_norm": 0.048731859773397446, + "learning_rate": 8.716803598572008e-06, + "loss": 0.0019, + "step": 37880 + }, + { + "epoch": 0.6199787286263602, + "grad_norm": 0.34867143630981445, + "learning_rate": 8.715848334037307e-06, + "loss": 0.0043, + "step": 37890 + }, + { + "epoch": 0.6201423545774359, + "grad_norm": 0.048855897039175034, + "learning_rate": 8.71489276645066e-06, + "loss": 0.0024, + "step": 37900 + }, + { + "epoch": 0.6203059805285118, + "grad_norm": 0.09153824299573898, + "learning_rate": 8.713936895889997e-06, + "loss": 0.0034, + "step": 37910 + }, + { + "epoch": 0.6204696064795877, + "grad_norm": 0.0667981207370758, + "learning_rate": 8.712980722433277e-06, + "loss": 0.0027, + "step": 37920 + }, + { + "epoch": 0.6206332324306635, + "grad_norm": 0.21692980825901031, + "learning_rate": 8.712024246158482e-06, + "loss": 0.0032, + "step": 37930 + }, + { + "epoch": 0.6207968583817394, + "grad_norm": 0.10792932659387589, + "learning_rate": 8.711067467143617e-06, + "loss": 0.0059, + "step": 37940 + }, + { + "epoch": 0.6209604843328151, + "grad_norm": 0.07672310620546341, + "learning_rate": 8.710110385466717e-06, + "loss": 0.0025, + "step": 37950 + }, + { + "epoch": 0.621124110283891, + "grad_norm": 0.6200510263442993, + "learning_rate": 8.709153001205837e-06, + "loss": 0.0034, + "step": 37960 + }, + { + "epoch": 0.6212877362349669, + "grad_norm": 0.15146096050739288, + "learning_rate": 8.70819531443906e-06, + "loss": 0.002, + "step": 37970 + }, + { + "epoch": 0.6214513621860427, + "grad_norm": 0.12634432315826416, + "learning_rate": 8.707237325244487e-06, + "loss": 0.002, + "step": 37980 + }, + { + "epoch": 0.6216149881371186, + "grad_norm": 0.10400105267763138, + "learning_rate": 8.70627903370025e-06, + "loss": 0.0032, + "step": 37990 + }, + { + "epoch": 0.6217786140881943, + "grad_norm": 0.06751594692468643, + "learning_rate": 8.705320439884505e-06, + "loss": 0.0019, + "step": 38000 + }, + { + "epoch": 0.6219422400392702, + "grad_norm": 0.04412994533777237, + "learning_rate": 8.704361543875433e-06, + "loss": 0.0026, + "step": 38010 + }, + { + "epoch": 0.6221058659903461, + "grad_norm": 0.06331221759319305, + "learning_rate": 8.703402345751237e-06, + "loss": 0.0019, + "step": 38020 + }, + { + "epoch": 0.6222694919414219, + "grad_norm": 0.07288952171802521, + "learning_rate": 8.702442845590145e-06, + "loss": 0.0027, + "step": 38030 + }, + { + "epoch": 0.6224331178924978, + "grad_norm": 0.06311749666929245, + "learning_rate": 8.701483043470412e-06, + "loss": 0.0033, + "step": 38040 + }, + { + "epoch": 0.6225967438435736, + "grad_norm": 0.029867738485336304, + "learning_rate": 8.700522939470313e-06, + "loss": 0.0036, + "step": 38050 + }, + { + "epoch": 0.6227603697946494, + "grad_norm": 0.04987639933824539, + "learning_rate": 8.699562533668155e-06, + "loss": 0.0018, + "step": 38060 + }, + { + "epoch": 0.6229239957457253, + "grad_norm": 0.056613337248563766, + "learning_rate": 8.698601826142265e-06, + "loss": 0.0021, + "step": 38070 + }, + { + "epoch": 0.6230876216968011, + "grad_norm": 0.11939631402492523, + "learning_rate": 8.697640816970993e-06, + "loss": 0.0025, + "step": 38080 + }, + { + "epoch": 0.623251247647877, + "grad_norm": 0.015964725986123085, + "learning_rate": 8.696679506232714e-06, + "loss": 0.003, + "step": 38090 + }, + { + "epoch": 0.6234148735989528, + "grad_norm": 0.1405145823955536, + "learning_rate": 8.695717894005836e-06, + "loss": 0.0023, + "step": 38100 + }, + { + "epoch": 0.6235784995500286, + "grad_norm": 0.30327147245407104, + "learning_rate": 8.694755980368778e-06, + "loss": 0.0051, + "step": 38110 + }, + { + "epoch": 0.6237421255011045, + "grad_norm": 0.06534552574157715, + "learning_rate": 8.693793765399993e-06, + "loss": 0.0019, + "step": 38120 + }, + { + "epoch": 0.6239057514521803, + "grad_norm": 0.0863557904958725, + "learning_rate": 8.692831249177956e-06, + "loss": 0.0017, + "step": 38130 + }, + { + "epoch": 0.6240693774032562, + "grad_norm": 0.1365835964679718, + "learning_rate": 8.691868431781167e-06, + "loss": 0.0032, + "step": 38140 + }, + { + "epoch": 0.624233003354332, + "grad_norm": 0.0490381121635437, + "learning_rate": 8.69090531328815e-06, + "loss": 0.0039, + "step": 38150 + }, + { + "epoch": 0.6243966293054078, + "grad_norm": 0.04853466898202896, + "learning_rate": 8.689941893777452e-06, + "loss": 0.0047, + "step": 38160 + }, + { + "epoch": 0.6245602552564837, + "grad_norm": 0.04490708187222481, + "learning_rate": 8.688978173327646e-06, + "loss": 0.0025, + "step": 38170 + }, + { + "epoch": 0.6247238812075595, + "grad_norm": 0.11523409932851791, + "learning_rate": 8.688014152017332e-06, + "loss": 0.0031, + "step": 38180 + }, + { + "epoch": 0.6248875071586354, + "grad_norm": 0.008040662854909897, + "learning_rate": 8.687049829925134e-06, + "loss": 0.0036, + "step": 38190 + }, + { + "epoch": 0.6250511331097112, + "grad_norm": 0.1238519474864006, + "learning_rate": 8.686085207129693e-06, + "loss": 0.0022, + "step": 38200 + }, + { + "epoch": 0.625214759060787, + "grad_norm": 0.0984954684972763, + "learning_rate": 8.685120283709687e-06, + "loss": 0.0021, + "step": 38210 + }, + { + "epoch": 0.6253783850118629, + "grad_norm": 0.051364999264478683, + "learning_rate": 8.684155059743806e-06, + "loss": 0.0026, + "step": 38220 + }, + { + "epoch": 0.6255420109629387, + "grad_norm": 0.09169857949018478, + "learning_rate": 8.683189535310774e-06, + "loss": 0.0018, + "step": 38230 + }, + { + "epoch": 0.6257056369140146, + "grad_norm": 0.015738578513264656, + "learning_rate": 8.682223710489333e-06, + "loss": 0.0022, + "step": 38240 + }, + { + "epoch": 0.6258692628650904, + "grad_norm": 0.26920127868652344, + "learning_rate": 8.681257585358256e-06, + "loss": 0.0025, + "step": 38250 + }, + { + "epoch": 0.6260328888161663, + "grad_norm": 0.0809926837682724, + "learning_rate": 8.680291159996334e-06, + "loss": 0.0026, + "step": 38260 + }, + { + "epoch": 0.626196514767242, + "grad_norm": 0.06103767827153206, + "learning_rate": 8.679324434482388e-06, + "loss": 0.0021, + "step": 38270 + }, + { + "epoch": 0.6263601407183179, + "grad_norm": 0.19600960612297058, + "learning_rate": 8.678357408895257e-06, + "loss": 0.0037, + "step": 38280 + }, + { + "epoch": 0.6265237666693938, + "grad_norm": 0.008998398669064045, + "learning_rate": 8.677390083313812e-06, + "loss": 0.0029, + "step": 38290 + }, + { + "epoch": 0.6266873926204696, + "grad_norm": 0.05034337192773819, + "learning_rate": 8.676422457816945e-06, + "loss": 0.0022, + "step": 38300 + }, + { + "epoch": 0.6268510185715455, + "grad_norm": 0.19930779933929443, + "learning_rate": 8.675454532483569e-06, + "loss": 0.0024, + "step": 38310 + }, + { + "epoch": 0.6270146445226212, + "grad_norm": 0.2219306379556656, + "learning_rate": 8.674486307392625e-06, + "loss": 0.0043, + "step": 38320 + }, + { + "epoch": 0.6271782704736971, + "grad_norm": 0.08216582983732224, + "learning_rate": 8.67351778262308e-06, + "loss": 0.0025, + "step": 38330 + }, + { + "epoch": 0.627341896424773, + "grad_norm": 0.024125108495354652, + "learning_rate": 8.672548958253925e-06, + "loss": 0.0021, + "step": 38340 + }, + { + "epoch": 0.6275055223758488, + "grad_norm": 0.06840266287326813, + "learning_rate": 8.67157983436417e-06, + "loss": 0.002, + "step": 38350 + }, + { + "epoch": 0.6276691483269247, + "grad_norm": 0.03997747227549553, + "learning_rate": 8.670610411032857e-06, + "loss": 0.0043, + "step": 38360 + }, + { + "epoch": 0.6278327742780004, + "grad_norm": 0.15534232556819916, + "learning_rate": 8.669640688339046e-06, + "loss": 0.0023, + "step": 38370 + }, + { + "epoch": 0.6279964002290763, + "grad_norm": 0.1500731110572815, + "learning_rate": 8.668670666361828e-06, + "loss": 0.0021, + "step": 38380 + }, + { + "epoch": 0.6281600261801522, + "grad_norm": 0.03249631077051163, + "learning_rate": 8.667700345180309e-06, + "loss": 0.0019, + "step": 38390 + }, + { + "epoch": 0.628323652131228, + "grad_norm": 0.09533066302537918, + "learning_rate": 8.66672972487363e-06, + "loss": 0.0022, + "step": 38400 + }, + { + "epoch": 0.6284872780823039, + "grad_norm": 0.07553412765264511, + "learning_rate": 8.66575880552095e-06, + "loss": 0.0034, + "step": 38410 + }, + { + "epoch": 0.6286509040333796, + "grad_norm": 0.04963284730911255, + "learning_rate": 8.664787587201454e-06, + "loss": 0.0041, + "step": 38420 + }, + { + "epoch": 0.6288145299844555, + "grad_norm": 0.15100255608558655, + "learning_rate": 8.663816069994351e-06, + "loss": 0.0029, + "step": 38430 + }, + { + "epoch": 0.6289781559355314, + "grad_norm": 0.021357066929340363, + "learning_rate": 8.662844253978873e-06, + "loss": 0.0027, + "step": 38440 + }, + { + "epoch": 0.6291417818866072, + "grad_norm": 0.09207446128129959, + "learning_rate": 8.661872139234282e-06, + "loss": 0.0037, + "step": 38450 + }, + { + "epoch": 0.6293054078376831, + "grad_norm": 0.08249254524707794, + "learning_rate": 8.660899725839857e-06, + "loss": 0.0023, + "step": 38460 + }, + { + "epoch": 0.6294690337887588, + "grad_norm": 0.08996037393808365, + "learning_rate": 8.659927013874907e-06, + "loss": 0.003, + "step": 38470 + }, + { + "epoch": 0.6296326597398347, + "grad_norm": 0.03326552361249924, + "learning_rate": 8.658954003418761e-06, + "loss": 0.0027, + "step": 38480 + }, + { + "epoch": 0.6297962856909106, + "grad_norm": 0.05299806967377663, + "learning_rate": 8.657980694550777e-06, + "loss": 0.0021, + "step": 38490 + }, + { + "epoch": 0.6299599116419864, + "grad_norm": 0.03193550929427147, + "learning_rate": 8.65700708735033e-06, + "loss": 0.0019, + "step": 38500 + }, + { + "epoch": 0.6301235375930623, + "grad_norm": 0.07028847187757492, + "learning_rate": 8.656033181896827e-06, + "loss": 0.0016, + "step": 38510 + }, + { + "epoch": 0.630287163544138, + "grad_norm": 0.05462953448295593, + "learning_rate": 8.655058978269699e-06, + "loss": 0.0033, + "step": 38520 + }, + { + "epoch": 0.6304507894952139, + "grad_norm": 0.012995736673474312, + "learning_rate": 8.654084476548397e-06, + "loss": 0.0021, + "step": 38530 + }, + { + "epoch": 0.6306144154462898, + "grad_norm": 0.21472889184951782, + "learning_rate": 8.653109676812395e-06, + "loss": 0.0031, + "step": 38540 + }, + { + "epoch": 0.6307780413973656, + "grad_norm": 0.07544241845607758, + "learning_rate": 8.652134579141198e-06, + "loss": 0.0019, + "step": 38550 + }, + { + "epoch": 0.6309416673484415, + "grad_norm": 0.03559279069304466, + "learning_rate": 8.651159183614331e-06, + "loss": 0.0032, + "step": 38560 + }, + { + "epoch": 0.6311052932995173, + "grad_norm": 0.051309216767549515, + "learning_rate": 8.650183490311341e-06, + "loss": 0.0025, + "step": 38570 + }, + { + "epoch": 0.6312689192505931, + "grad_norm": 0.18429972231388092, + "learning_rate": 8.649207499311805e-06, + "loss": 0.0052, + "step": 38580 + }, + { + "epoch": 0.631432545201669, + "grad_norm": 0.17924368381500244, + "learning_rate": 8.648231210695323e-06, + "loss": 0.0038, + "step": 38590 + }, + { + "epoch": 0.6315961711527448, + "grad_norm": 0.06036658212542534, + "learning_rate": 8.647254624541514e-06, + "loss": 0.0019, + "step": 38600 + }, + { + "epoch": 0.6317597971038207, + "grad_norm": 0.05907200649380684, + "learning_rate": 8.646277740930028e-06, + "loss": 0.002, + "step": 38610 + }, + { + "epoch": 0.6319234230548965, + "grad_norm": 0.0779176726937294, + "learning_rate": 8.645300559940535e-06, + "loss": 0.0017, + "step": 38620 + }, + { + "epoch": 0.6320870490059723, + "grad_norm": 0.251051127910614, + "learning_rate": 8.64432308165273e-06, + "loss": 0.0028, + "step": 38630 + }, + { + "epoch": 0.6322506749570482, + "grad_norm": 0.1337164044380188, + "learning_rate": 8.643345306146334e-06, + "loss": 0.0028, + "step": 38640 + }, + { + "epoch": 0.632414300908124, + "grad_norm": 0.10822925716638565, + "learning_rate": 8.64236723350109e-06, + "loss": 0.0034, + "step": 38650 + }, + { + "epoch": 0.6325779268591999, + "grad_norm": 0.1978202909231186, + "learning_rate": 8.641388863796767e-06, + "loss": 0.0024, + "step": 38660 + }, + { + "epoch": 0.6327415528102757, + "grad_norm": 0.16618888080120087, + "learning_rate": 8.640410197113158e-06, + "loss": 0.002, + "step": 38670 + }, + { + "epoch": 0.6329051787613516, + "grad_norm": 0.044764094054698944, + "learning_rate": 8.639431233530079e-06, + "loss": 0.0022, + "step": 38680 + }, + { + "epoch": 0.6330688047124274, + "grad_norm": 0.10753671079874039, + "learning_rate": 8.63845197312737e-06, + "loss": 0.0025, + "step": 38690 + }, + { + "epoch": 0.6332324306635032, + "grad_norm": 0.038792937994003296, + "learning_rate": 8.637472415984897e-06, + "loss": 0.0026, + "step": 38700 + }, + { + "epoch": 0.6333960566145791, + "grad_norm": 0.06564744561910629, + "learning_rate": 8.636492562182553e-06, + "loss": 0.0022, + "step": 38710 + }, + { + "epoch": 0.6335596825656549, + "grad_norm": 0.05353368818759918, + "learning_rate": 8.635512411800245e-06, + "loss": 0.003, + "step": 38720 + }, + { + "epoch": 0.6337233085167308, + "grad_norm": 0.03349089249968529, + "learning_rate": 8.634531964917914e-06, + "loss": 0.0023, + "step": 38730 + }, + { + "epoch": 0.6338869344678066, + "grad_norm": 0.03374045342206955, + "learning_rate": 8.63355122161552e-06, + "loss": 0.0028, + "step": 38740 + }, + { + "epoch": 0.6340505604188824, + "grad_norm": 0.1154458150267601, + "learning_rate": 8.632570181973054e-06, + "loss": 0.0023, + "step": 38750 + }, + { + "epoch": 0.6342141863699583, + "grad_norm": 0.06812771409749985, + "learning_rate": 8.631588846070522e-06, + "loss": 0.0018, + "step": 38760 + }, + { + "epoch": 0.6343778123210341, + "grad_norm": 0.12013090401887894, + "learning_rate": 8.630607213987959e-06, + "loss": 0.0032, + "step": 38770 + }, + { + "epoch": 0.63454143827211, + "grad_norm": 0.12253228574991226, + "learning_rate": 8.629625285805425e-06, + "loss": 0.0033, + "step": 38780 + }, + { + "epoch": 0.6347050642231858, + "grad_norm": 0.028028065338730812, + "learning_rate": 8.628643061602999e-06, + "loss": 0.0016, + "step": 38790 + }, + { + "epoch": 0.6348686901742616, + "grad_norm": 0.03966687619686127, + "learning_rate": 8.627660541460795e-06, + "loss": 0.0032, + "step": 38800 + }, + { + "epoch": 0.6350323161253375, + "grad_norm": 0.16435879468917847, + "learning_rate": 8.626677725458935e-06, + "loss": 0.0037, + "step": 38810 + }, + { + "epoch": 0.6351959420764133, + "grad_norm": 0.10439508408308029, + "learning_rate": 8.625694613677582e-06, + "loss": 0.0037, + "step": 38820 + }, + { + "epoch": 0.6353595680274892, + "grad_norm": 0.05926480516791344, + "learning_rate": 8.624711206196909e-06, + "loss": 0.0023, + "step": 38830 + }, + { + "epoch": 0.635523193978565, + "grad_norm": 0.3598000109195709, + "learning_rate": 8.623727503097126e-06, + "loss": 0.0024, + "step": 38840 + }, + { + "epoch": 0.6356868199296408, + "grad_norm": 0.02207525260746479, + "learning_rate": 8.622743504458455e-06, + "loss": 0.0038, + "step": 38850 + }, + { + "epoch": 0.6358504458807167, + "grad_norm": 0.03934333845973015, + "learning_rate": 8.62175921036115e-06, + "loss": 0.0027, + "step": 38860 + }, + { + "epoch": 0.6360140718317925, + "grad_norm": 0.018653923645615578, + "learning_rate": 8.620774620885484e-06, + "loss": 0.0031, + "step": 38870 + }, + { + "epoch": 0.6361776977828684, + "grad_norm": 0.1300826221704483, + "learning_rate": 8.619789736111762e-06, + "loss": 0.0025, + "step": 38880 + }, + { + "epoch": 0.6363413237339443, + "grad_norm": 0.1303454488515854, + "learning_rate": 8.618804556120302e-06, + "loss": 0.0032, + "step": 38890 + }, + { + "epoch": 0.63650494968502, + "grad_norm": 0.1254907250404358, + "learning_rate": 8.617819080991455e-06, + "loss": 0.0026, + "step": 38900 + }, + { + "epoch": 0.6366685756360959, + "grad_norm": 0.07402250915765762, + "learning_rate": 8.616833310805593e-06, + "loss": 0.0028, + "step": 38910 + }, + { + "epoch": 0.6368322015871717, + "grad_norm": 0.15269005298614502, + "learning_rate": 8.615847245643112e-06, + "loss": 0.0024, + "step": 38920 + }, + { + "epoch": 0.6369958275382476, + "grad_norm": 0.052248455584049225, + "learning_rate": 8.614860885584432e-06, + "loss": 0.0031, + "step": 38930 + }, + { + "epoch": 0.6371594534893235, + "grad_norm": 0.09626732021570206, + "learning_rate": 8.613874230709997e-06, + "loss": 0.003, + "step": 38940 + }, + { + "epoch": 0.6373230794403992, + "grad_norm": 0.04366279020905495, + "learning_rate": 8.612887281100277e-06, + "loss": 0.0027, + "step": 38950 + }, + { + "epoch": 0.6374867053914751, + "grad_norm": 0.1165156364440918, + "learning_rate": 8.61190003683576e-06, + "loss": 0.0031, + "step": 38960 + }, + { + "epoch": 0.6376503313425509, + "grad_norm": 0.26304417848587036, + "learning_rate": 8.610912497996965e-06, + "loss": 0.006, + "step": 38970 + }, + { + "epoch": 0.6378139572936268, + "grad_norm": 0.055768873542547226, + "learning_rate": 8.609924664664432e-06, + "loss": 0.0027, + "step": 38980 + }, + { + "epoch": 0.6379775832447027, + "grad_norm": 0.038172975182533264, + "learning_rate": 8.608936536918727e-06, + "loss": 0.0029, + "step": 38990 + }, + { + "epoch": 0.6381412091957784, + "grad_norm": 0.07032755762338638, + "learning_rate": 8.607948114840435e-06, + "loss": 0.0026, + "step": 39000 + }, + { + "epoch": 0.6383048351468543, + "grad_norm": 0.07438138872385025, + "learning_rate": 8.606959398510169e-06, + "loss": 0.005, + "step": 39010 + }, + { + "epoch": 0.6384684610979301, + "grad_norm": 0.0852329358458519, + "learning_rate": 8.60597038800857e-06, + "loss": 0.003, + "step": 39020 + }, + { + "epoch": 0.638632087049006, + "grad_norm": 0.024013763293623924, + "learning_rate": 8.60498108341629e-06, + "loss": 0.0016, + "step": 39030 + }, + { + "epoch": 0.6387957130000819, + "grad_norm": 0.17839962244033813, + "learning_rate": 8.603991484814022e-06, + "loss": 0.0042, + "step": 39040 + }, + { + "epoch": 0.6389593389511576, + "grad_norm": 0.21313296258449554, + "learning_rate": 8.603001592282469e-06, + "loss": 0.0018, + "step": 39050 + }, + { + "epoch": 0.6391229649022335, + "grad_norm": 0.1219082847237587, + "learning_rate": 8.602011405902364e-06, + "loss": 0.002, + "step": 39060 + }, + { + "epoch": 0.6392865908533093, + "grad_norm": 0.07517138868570328, + "learning_rate": 8.601020925754464e-06, + "loss": 0.0021, + "step": 39070 + }, + { + "epoch": 0.6394502168043852, + "grad_norm": 0.02984750270843506, + "learning_rate": 8.60003015191955e-06, + "loss": 0.0019, + "step": 39080 + }, + { + "epoch": 0.6396138427554611, + "grad_norm": 0.1653256118297577, + "learning_rate": 8.599039084478425e-06, + "loss": 0.0031, + "step": 39090 + }, + { + "epoch": 0.6397774687065368, + "grad_norm": 0.08026986569166183, + "learning_rate": 8.598047723511916e-06, + "loss": 0.0036, + "step": 39100 + }, + { + "epoch": 0.6399410946576127, + "grad_norm": 0.09979500621557236, + "learning_rate": 8.597056069100877e-06, + "loss": 0.0032, + "step": 39110 + }, + { + "epoch": 0.6401047206086885, + "grad_norm": 0.06817537546157837, + "learning_rate": 8.596064121326184e-06, + "loss": 0.0019, + "step": 39120 + }, + { + "epoch": 0.6402683465597644, + "grad_norm": 0.07053446769714355, + "learning_rate": 8.595071880268735e-06, + "loss": 0.0033, + "step": 39130 + }, + { + "epoch": 0.6404319725108402, + "grad_norm": 0.09091051667928696, + "learning_rate": 8.594079346009456e-06, + "loss": 0.0025, + "step": 39140 + }, + { + "epoch": 0.640595598461916, + "grad_norm": 0.09570232033729553, + "learning_rate": 8.593086518629292e-06, + "loss": 0.002, + "step": 39150 + }, + { + "epoch": 0.6407592244129919, + "grad_norm": 0.1417010873556137, + "learning_rate": 8.592093398209217e-06, + "loss": 0.0022, + "step": 39160 + }, + { + "epoch": 0.6409228503640677, + "grad_norm": 0.059278704226017, + "learning_rate": 8.591099984830228e-06, + "loss": 0.003, + "step": 39170 + }, + { + "epoch": 0.6410864763151436, + "grad_norm": 0.020414268597960472, + "learning_rate": 8.59010627857334e-06, + "loss": 0.0022, + "step": 39180 + }, + { + "epoch": 0.6412501022662194, + "grad_norm": 0.22856572270393372, + "learning_rate": 8.589112279519599e-06, + "loss": 0.0026, + "step": 39190 + }, + { + "epoch": 0.6414137282172953, + "grad_norm": 0.23743440210819244, + "learning_rate": 8.588117987750072e-06, + "loss": 0.0023, + "step": 39200 + }, + { + "epoch": 0.6415773541683711, + "grad_norm": 0.09644228965044022, + "learning_rate": 8.58712340334585e-06, + "loss": 0.0039, + "step": 39210 + }, + { + "epoch": 0.6417409801194469, + "grad_norm": 0.08064088970422745, + "learning_rate": 8.586128526388046e-06, + "loss": 0.0043, + "step": 39220 + }, + { + "epoch": 0.6419046060705228, + "grad_norm": 0.03328957036137581, + "learning_rate": 8.585133356957802e-06, + "loss": 0.0042, + "step": 39230 + }, + { + "epoch": 0.6420682320215986, + "grad_norm": 0.1000988557934761, + "learning_rate": 8.584137895136278e-06, + "loss": 0.0048, + "step": 39240 + }, + { + "epoch": 0.6422318579726745, + "grad_norm": 0.07038478553295135, + "learning_rate": 8.583142141004662e-06, + "loss": 0.0044, + "step": 39250 + }, + { + "epoch": 0.6423954839237503, + "grad_norm": 0.10646302253007889, + "learning_rate": 8.582146094644164e-06, + "loss": 0.0028, + "step": 39260 + }, + { + "epoch": 0.6425591098748261, + "grad_norm": 0.017525313422083855, + "learning_rate": 8.581149756136018e-06, + "loss": 0.0018, + "step": 39270 + }, + { + "epoch": 0.642722735825902, + "grad_norm": 0.03510558605194092, + "learning_rate": 8.580153125561482e-06, + "loss": 0.0026, + "step": 39280 + }, + { + "epoch": 0.6428863617769778, + "grad_norm": 0.043394170701503754, + "learning_rate": 8.579156203001838e-06, + "loss": 0.0021, + "step": 39290 + }, + { + "epoch": 0.6430499877280537, + "grad_norm": 0.1543533056974411, + "learning_rate": 8.578158988538392e-06, + "loss": 0.0028, + "step": 39300 + }, + { + "epoch": 0.6432136136791295, + "grad_norm": 0.015024775639176369, + "learning_rate": 8.577161482252472e-06, + "loss": 0.0022, + "step": 39310 + }, + { + "epoch": 0.6433772396302053, + "grad_norm": 0.19195719063282013, + "learning_rate": 8.576163684225432e-06, + "loss": 0.0029, + "step": 39320 + }, + { + "epoch": 0.6435408655812812, + "grad_norm": 0.06950744241476059, + "learning_rate": 8.57516559453865e-06, + "loss": 0.004, + "step": 39330 + }, + { + "epoch": 0.643704491532357, + "grad_norm": 0.06668994575738907, + "learning_rate": 8.574167213273524e-06, + "loss": 0.0022, + "step": 39340 + }, + { + "epoch": 0.6438681174834329, + "grad_norm": 0.2713773548603058, + "learning_rate": 8.573168540511482e-06, + "loss": 0.0032, + "step": 39350 + }, + { + "epoch": 0.6440317434345088, + "grad_norm": 0.3114817142486572, + "learning_rate": 8.572169576333971e-06, + "loss": 0.0016, + "step": 39360 + }, + { + "epoch": 0.6441953693855845, + "grad_norm": 0.09380172193050385, + "learning_rate": 8.571170320822461e-06, + "loss": 0.003, + "step": 39370 + }, + { + "epoch": 0.6443589953366604, + "grad_norm": 0.031921155750751495, + "learning_rate": 8.570170774058451e-06, + "loss": 0.0024, + "step": 39380 + }, + { + "epoch": 0.6445226212877362, + "grad_norm": 0.054590776562690735, + "learning_rate": 8.569170936123461e-06, + "loss": 0.0033, + "step": 39390 + }, + { + "epoch": 0.6446862472388121, + "grad_norm": 0.03549819439649582, + "learning_rate": 8.56817080709903e-06, + "loss": 0.0018, + "step": 39400 + }, + { + "epoch": 0.644849873189888, + "grad_norm": 0.1801433563232422, + "learning_rate": 8.56717038706673e-06, + "loss": 0.0042, + "step": 39410 + }, + { + "epoch": 0.6450134991409637, + "grad_norm": 0.06492079049348831, + "learning_rate": 8.566169676108149e-06, + "loss": 0.0048, + "step": 39420 + }, + { + "epoch": 0.6451771250920396, + "grad_norm": 0.07603912800550461, + "learning_rate": 8.565168674304902e-06, + "loss": 0.002, + "step": 39430 + }, + { + "epoch": 0.6453407510431154, + "grad_norm": 0.07331918925046921, + "learning_rate": 8.564167381738628e-06, + "loss": 0.0022, + "step": 39440 + }, + { + "epoch": 0.6455043769941913, + "grad_norm": 0.07943751662969589, + "learning_rate": 8.56316579849099e-06, + "loss": 0.0023, + "step": 39450 + }, + { + "epoch": 0.6456680029452672, + "grad_norm": 0.08440044522285461, + "learning_rate": 8.562163924643672e-06, + "loss": 0.0041, + "step": 39460 + }, + { + "epoch": 0.6458316288963429, + "grad_norm": 0.161666601896286, + "learning_rate": 8.561161760278383e-06, + "loss": 0.0061, + "step": 39470 + }, + { + "epoch": 0.6459952548474188, + "grad_norm": 0.0927104502916336, + "learning_rate": 8.560159305476857e-06, + "loss": 0.0038, + "step": 39480 + }, + { + "epoch": 0.6461588807984946, + "grad_norm": 0.023915352299809456, + "learning_rate": 8.55915656032085e-06, + "loss": 0.002, + "step": 39490 + }, + { + "epoch": 0.6463225067495705, + "grad_norm": 0.1288783848285675, + "learning_rate": 8.558153524892145e-06, + "loss": 0.0031, + "step": 39500 + }, + { + "epoch": 0.6464861327006464, + "grad_norm": 0.04643791913986206, + "learning_rate": 8.557150199272542e-06, + "loss": 0.007, + "step": 39510 + }, + { + "epoch": 0.6466497586517221, + "grad_norm": 0.08360448479652405, + "learning_rate": 8.556146583543874e-06, + "loss": 0.0025, + "step": 39520 + }, + { + "epoch": 0.646813384602798, + "grad_norm": 0.10117527842521667, + "learning_rate": 8.555142677787987e-06, + "loss": 0.0033, + "step": 39530 + }, + { + "epoch": 0.6469770105538738, + "grad_norm": 0.057454243302345276, + "learning_rate": 8.55413848208676e-06, + "loss": 0.0029, + "step": 39540 + }, + { + "epoch": 0.6471406365049497, + "grad_norm": 0.20288380980491638, + "learning_rate": 8.553133996522092e-06, + "loss": 0.0045, + "step": 39550 + }, + { + "epoch": 0.6473042624560256, + "grad_norm": 0.19805529713630676, + "learning_rate": 8.552129221175901e-06, + "loss": 0.0027, + "step": 39560 + }, + { + "epoch": 0.6474678884071013, + "grad_norm": 0.061083775013685226, + "learning_rate": 8.55112415613014e-06, + "loss": 0.0042, + "step": 39570 + }, + { + "epoch": 0.6476315143581772, + "grad_norm": 0.061113279312849045, + "learning_rate": 8.550118801466773e-06, + "loss": 0.0021, + "step": 39580 + }, + { + "epoch": 0.647795140309253, + "grad_norm": 0.03751441463828087, + "learning_rate": 8.549113157267794e-06, + "loss": 0.0036, + "step": 39590 + }, + { + "epoch": 0.6479587662603289, + "grad_norm": 0.11440392583608627, + "learning_rate": 8.548107223615224e-06, + "loss": 0.0018, + "step": 39600 + }, + { + "epoch": 0.6481223922114048, + "grad_norm": 0.07054636627435684, + "learning_rate": 8.547101000591096e-06, + "loss": 0.0023, + "step": 39610 + }, + { + "epoch": 0.6482860181624805, + "grad_norm": 0.16492822766304016, + "learning_rate": 8.546094488277482e-06, + "loss": 0.0038, + "step": 39620 + }, + { + "epoch": 0.6484496441135564, + "grad_norm": 0.06979059427976608, + "learning_rate": 8.545087686756467e-06, + "loss": 0.0027, + "step": 39630 + }, + { + "epoch": 0.6486132700646322, + "grad_norm": 0.052431054413318634, + "learning_rate": 8.544080596110159e-06, + "loss": 0.0024, + "step": 39640 + }, + { + "epoch": 0.6487768960157081, + "grad_norm": 0.1581445038318634, + "learning_rate": 8.543073216420697e-06, + "loss": 0.0024, + "step": 39650 + }, + { + "epoch": 0.648940521966784, + "grad_norm": 0.07132606953382492, + "learning_rate": 8.542065547770237e-06, + "loss": 0.002, + "step": 39660 + }, + { + "epoch": 0.6491041479178598, + "grad_norm": 0.07171407341957092, + "learning_rate": 8.541057590240963e-06, + "loss": 0.0032, + "step": 39670 + }, + { + "epoch": 0.6492677738689356, + "grad_norm": 0.03588040918111801, + "learning_rate": 8.54004934391508e-06, + "loss": 0.0025, + "step": 39680 + }, + { + "epoch": 0.6494313998200114, + "grad_norm": 0.04136650264263153, + "learning_rate": 8.539040808874816e-06, + "loss": 0.0025, + "step": 39690 + }, + { + "epoch": 0.6495950257710873, + "grad_norm": 0.09886538982391357, + "learning_rate": 8.538031985202424e-06, + "loss": 0.0017, + "step": 39700 + }, + { + "epoch": 0.6497586517221632, + "grad_norm": 0.07219899445772171, + "learning_rate": 8.537022872980184e-06, + "loss": 0.0024, + "step": 39710 + }, + { + "epoch": 0.649922277673239, + "grad_norm": 0.1638651043176651, + "learning_rate": 8.536013472290387e-06, + "loss": 0.0019, + "step": 39720 + }, + { + "epoch": 0.6500859036243148, + "grad_norm": 0.020979199558496475, + "learning_rate": 8.535003783215366e-06, + "loss": 0.002, + "step": 39730 + }, + { + "epoch": 0.6502495295753906, + "grad_norm": 0.13245879113674164, + "learning_rate": 8.533993805837463e-06, + "loss": 0.0029, + "step": 39740 + }, + { + "epoch": 0.6504131555264665, + "grad_norm": 0.03319181501865387, + "learning_rate": 8.532983540239048e-06, + "loss": 0.0044, + "step": 39750 + }, + { + "epoch": 0.6505767814775424, + "grad_norm": 0.009434771724045277, + "learning_rate": 8.531972986502517e-06, + "loss": 0.0023, + "step": 39760 + }, + { + "epoch": 0.6507404074286182, + "grad_norm": 0.12471001595258713, + "learning_rate": 8.530962144710285e-06, + "loss": 0.0021, + "step": 39770 + }, + { + "epoch": 0.650904033379694, + "grad_norm": 0.10073836892843246, + "learning_rate": 8.529951014944792e-06, + "loss": 0.0023, + "step": 39780 + }, + { + "epoch": 0.6510676593307698, + "grad_norm": 0.08077047020196915, + "learning_rate": 8.528939597288506e-06, + "loss": 0.0028, + "step": 39790 + }, + { + "epoch": 0.6512312852818457, + "grad_norm": 0.11728660762310028, + "learning_rate": 8.527927891823913e-06, + "loss": 0.0031, + "step": 39800 + }, + { + "epoch": 0.6513949112329216, + "grad_norm": 0.15916123986244202, + "learning_rate": 8.526915898633524e-06, + "loss": 0.0026, + "step": 39810 + }, + { + "epoch": 0.6515585371839974, + "grad_norm": 0.012175176292657852, + "learning_rate": 8.525903617799874e-06, + "loss": 0.0041, + "step": 39820 + }, + { + "epoch": 0.6517221631350733, + "grad_norm": 0.16962768137454987, + "learning_rate": 8.524891049405522e-06, + "loss": 0.0034, + "step": 39830 + }, + { + "epoch": 0.651885789086149, + "grad_norm": 0.0806095078587532, + "learning_rate": 8.523878193533047e-06, + "loss": 0.0023, + "step": 39840 + }, + { + "epoch": 0.6520494150372249, + "grad_norm": 0.05332040414214134, + "learning_rate": 8.522865050265056e-06, + "loss": 0.0022, + "step": 39850 + }, + { + "epoch": 0.6522130409883008, + "grad_norm": 0.04758303239941597, + "learning_rate": 8.521851619684178e-06, + "loss": 0.0032, + "step": 39860 + }, + { + "epoch": 0.6523766669393766, + "grad_norm": 0.0878957137465477, + "learning_rate": 8.520837901873065e-06, + "loss": 0.0055, + "step": 39870 + }, + { + "epoch": 0.6525402928904525, + "grad_norm": 0.04823160171508789, + "learning_rate": 8.519823896914391e-06, + "loss": 0.0028, + "step": 39880 + }, + { + "epoch": 0.6527039188415282, + "grad_norm": 0.30594176054000854, + "learning_rate": 8.518809604890856e-06, + "loss": 0.0036, + "step": 39890 + }, + { + "epoch": 0.6528675447926041, + "grad_norm": 0.08237633854150772, + "learning_rate": 8.51779502588518e-06, + "loss": 0.002, + "step": 39900 + }, + { + "epoch": 0.65303117074368, + "grad_norm": 0.04482179135084152, + "learning_rate": 8.516780159980112e-06, + "loss": 0.0046, + "step": 39910 + }, + { + "epoch": 0.6531947966947558, + "grad_norm": 0.08056865632534027, + "learning_rate": 8.515765007258418e-06, + "loss": 0.003, + "step": 39920 + }, + { + "epoch": 0.6533584226458317, + "grad_norm": 0.10867752134799957, + "learning_rate": 8.514749567802892e-06, + "loss": 0.002, + "step": 39930 + }, + { + "epoch": 0.6535220485969074, + "grad_norm": 0.08309381455183029, + "learning_rate": 8.51373384169635e-06, + "loss": 0.0017, + "step": 39940 + }, + { + "epoch": 0.6536856745479833, + "grad_norm": 0.048605818301439285, + "learning_rate": 8.51271782902163e-06, + "loss": 0.0032, + "step": 39950 + }, + { + "epoch": 0.6538493004990592, + "grad_norm": 0.033906593918800354, + "learning_rate": 8.511701529861595e-06, + "loss": 0.0027, + "step": 39960 + }, + { + "epoch": 0.654012926450135, + "grad_norm": 0.06378569453954697, + "learning_rate": 8.510684944299132e-06, + "loss": 0.0029, + "step": 39970 + }, + { + "epoch": 0.6541765524012109, + "grad_norm": 0.06641020625829697, + "learning_rate": 8.50966807241715e-06, + "loss": 0.0035, + "step": 39980 + }, + { + "epoch": 0.6543401783522866, + "grad_norm": 0.058488670736551285, + "learning_rate": 8.50865091429858e-06, + "loss": 0.0028, + "step": 39990 + }, + { + "epoch": 0.6545038043033625, + "grad_norm": 0.10854487866163254, + "learning_rate": 8.507633470026377e-06, + "loss": 0.002, + "step": 40000 + }, + { + "epoch": 0.6546674302544384, + "grad_norm": 0.09760761260986328, + "learning_rate": 8.506615739683524e-06, + "loss": 0.0024, + "step": 40010 + }, + { + "epoch": 0.6548310562055142, + "grad_norm": 0.08055496215820312, + "learning_rate": 8.505597723353022e-06, + "loss": 0.0042, + "step": 40020 + }, + { + "epoch": 0.6549946821565901, + "grad_norm": 0.09465897083282471, + "learning_rate": 8.504579421117896e-06, + "loss": 0.0027, + "step": 40030 + }, + { + "epoch": 0.6551583081076658, + "grad_norm": 0.09184083342552185, + "learning_rate": 8.503560833061196e-06, + "loss": 0.0019, + "step": 40040 + }, + { + "epoch": 0.6553219340587417, + "grad_norm": 0.046404317021369934, + "learning_rate": 8.502541959265996e-06, + "loss": 0.0018, + "step": 40050 + }, + { + "epoch": 0.6554855600098175, + "grad_norm": 0.027325669303536415, + "learning_rate": 8.501522799815389e-06, + "loss": 0.0023, + "step": 40060 + }, + { + "epoch": 0.6556491859608934, + "grad_norm": 0.1321181356906891, + "learning_rate": 8.500503354792497e-06, + "loss": 0.0034, + "step": 40070 + }, + { + "epoch": 0.6558128119119693, + "grad_norm": 0.04301869124174118, + "learning_rate": 8.49948362428046e-06, + "loss": 0.0019, + "step": 40080 + }, + { + "epoch": 0.655976437863045, + "grad_norm": 0.05978606641292572, + "learning_rate": 8.498463608362445e-06, + "loss": 0.0019, + "step": 40090 + }, + { + "epoch": 0.6561400638141209, + "grad_norm": 0.09832644462585449, + "learning_rate": 8.497443307121641e-06, + "loss": 0.0015, + "step": 40100 + }, + { + "epoch": 0.6563036897651967, + "grad_norm": 0.14280065894126892, + "learning_rate": 8.49642272064126e-06, + "loss": 0.0019, + "step": 40110 + }, + { + "epoch": 0.6564673157162726, + "grad_norm": 0.03534003719687462, + "learning_rate": 8.49540184900454e-06, + "loss": 0.002, + "step": 40120 + }, + { + "epoch": 0.6566309416673485, + "grad_norm": 0.03862011432647705, + "learning_rate": 8.494380692294734e-06, + "loss": 0.0049, + "step": 40130 + }, + { + "epoch": 0.6567945676184243, + "grad_norm": 0.13252069056034088, + "learning_rate": 8.49335925059513e-06, + "loss": 0.0032, + "step": 40140 + }, + { + "epoch": 0.6569581935695001, + "grad_norm": 0.21603387594223022, + "learning_rate": 8.49233752398903e-06, + "loss": 0.0072, + "step": 40150 + }, + { + "epoch": 0.6571218195205759, + "grad_norm": 0.11427640914916992, + "learning_rate": 8.491315512559764e-06, + "loss": 0.0023, + "step": 40160 + }, + { + "epoch": 0.6572854454716518, + "grad_norm": 0.12012048065662384, + "learning_rate": 8.490293216390682e-06, + "loss": 0.0027, + "step": 40170 + }, + { + "epoch": 0.6574490714227277, + "grad_norm": 0.036919478327035904, + "learning_rate": 8.489270635565161e-06, + "loss": 0.0028, + "step": 40180 + }, + { + "epoch": 0.6576126973738035, + "grad_norm": 0.0641576498746872, + "learning_rate": 8.4882477701666e-06, + "loss": 0.0018, + "step": 40190 + }, + { + "epoch": 0.6577763233248793, + "grad_norm": 0.043436549603939056, + "learning_rate": 8.487224620278415e-06, + "loss": 0.0044, + "step": 40200 + }, + { + "epoch": 0.6579399492759551, + "grad_norm": 0.16939841210842133, + "learning_rate": 8.486201185984059e-06, + "loss": 0.0032, + "step": 40210 + }, + { + "epoch": 0.658103575227031, + "grad_norm": 0.09159771353006363, + "learning_rate": 8.485177467366993e-06, + "loss": 0.0044, + "step": 40220 + }, + { + "epoch": 0.6582672011781069, + "grad_norm": 0.057120632380247116, + "learning_rate": 8.484153464510712e-06, + "loss": 0.0023, + "step": 40230 + }, + { + "epoch": 0.6584308271291827, + "grad_norm": 0.14374807476997375, + "learning_rate": 8.483129177498727e-06, + "loss": 0.0027, + "step": 40240 + }, + { + "epoch": 0.6585944530802585, + "grad_norm": 0.02444578893482685, + "learning_rate": 8.482104606414576e-06, + "loss": 0.003, + "step": 40250 + }, + { + "epoch": 0.6587580790313343, + "grad_norm": 0.04823712259531021, + "learning_rate": 8.481079751341823e-06, + "loss": 0.0029, + "step": 40260 + }, + { + "epoch": 0.6589217049824102, + "grad_norm": 0.024433519691228867, + "learning_rate": 8.480054612364048e-06, + "loss": 0.0014, + "step": 40270 + }, + { + "epoch": 0.6590853309334861, + "grad_norm": 0.03714948520064354, + "learning_rate": 8.479029189564858e-06, + "loss": 0.0033, + "step": 40280 + }, + { + "epoch": 0.6592489568845619, + "grad_norm": 0.08686116337776184, + "learning_rate": 8.478003483027886e-06, + "loss": 0.0046, + "step": 40290 + }, + { + "epoch": 0.6594125828356378, + "grad_norm": 0.03075013868510723, + "learning_rate": 8.476977492836782e-06, + "loss": 0.0031, + "step": 40300 + }, + { + "epoch": 0.6595762087867135, + "grad_norm": 0.06753110140562057, + "learning_rate": 8.475951219075222e-06, + "loss": 0.0023, + "step": 40310 + }, + { + "epoch": 0.6597398347377894, + "grad_norm": 0.04041937366127968, + "learning_rate": 8.47492466182691e-06, + "loss": 0.0032, + "step": 40320 + }, + { + "epoch": 0.6599034606888653, + "grad_norm": 0.04040047153830528, + "learning_rate": 8.473897821175563e-06, + "loss": 0.0051, + "step": 40330 + }, + { + "epoch": 0.6600670866399411, + "grad_norm": 0.03707794472575188, + "learning_rate": 8.47287069720493e-06, + "loss": 0.002, + "step": 40340 + }, + { + "epoch": 0.660230712591017, + "grad_norm": 0.11853500455617905, + "learning_rate": 8.471843289998777e-06, + "loss": 0.0028, + "step": 40350 + }, + { + "epoch": 0.6603943385420927, + "grad_norm": 0.20351850986480713, + "learning_rate": 8.470815599640898e-06, + "loss": 0.0042, + "step": 40360 + }, + { + "epoch": 0.6605579644931686, + "grad_norm": 0.08293548226356506, + "learning_rate": 8.46978762621511e-06, + "loss": 0.003, + "step": 40370 + }, + { + "epoch": 0.6607215904442445, + "grad_norm": 0.01352684572339058, + "learning_rate": 8.468759369805244e-06, + "loss": 0.003, + "step": 40380 + }, + { + "epoch": 0.6608852163953203, + "grad_norm": 0.067771315574646, + "learning_rate": 8.467730830495168e-06, + "loss": 0.0029, + "step": 40390 + }, + { + "epoch": 0.6610488423463962, + "grad_norm": 0.06476642936468124, + "learning_rate": 8.466702008368765e-06, + "loss": 0.0025, + "step": 40400 + }, + { + "epoch": 0.6612124682974719, + "grad_norm": 0.08182857930660248, + "learning_rate": 8.465672903509939e-06, + "loss": 0.0021, + "step": 40410 + }, + { + "epoch": 0.6613760942485478, + "grad_norm": 0.0771382749080658, + "learning_rate": 8.464643516002623e-06, + "loss": 0.0022, + "step": 40420 + }, + { + "epoch": 0.6615397201996237, + "grad_norm": 0.12741197645664215, + "learning_rate": 8.46361384593077e-06, + "loss": 0.0018, + "step": 40430 + }, + { + "epoch": 0.6617033461506995, + "grad_norm": 0.02982301637530327, + "learning_rate": 8.462583893378356e-06, + "loss": 0.0029, + "step": 40440 + }, + { + "epoch": 0.6618669721017754, + "grad_norm": 0.05917232111096382, + "learning_rate": 8.461553658429378e-06, + "loss": 0.0013, + "step": 40450 + }, + { + "epoch": 0.6620305980528511, + "grad_norm": 0.1927354782819748, + "learning_rate": 8.460523141167862e-06, + "loss": 0.0027, + "step": 40460 + }, + { + "epoch": 0.662194224003927, + "grad_norm": 0.25302886962890625, + "learning_rate": 8.459492341677853e-06, + "loss": 0.0026, + "step": 40470 + }, + { + "epoch": 0.6623578499550029, + "grad_norm": 0.32481902837753296, + "learning_rate": 8.45846126004342e-06, + "loss": 0.0011, + "step": 40480 + }, + { + "epoch": 0.6625214759060787, + "grad_norm": 0.05792341008782387, + "learning_rate": 8.457429896348653e-06, + "loss": 0.0022, + "step": 40490 + }, + { + "epoch": 0.6626851018571546, + "grad_norm": 0.04456359148025513, + "learning_rate": 8.456398250677665e-06, + "loss": 0.0021, + "step": 40500 + }, + { + "epoch": 0.6628487278082303, + "grad_norm": 0.19139991700649261, + "learning_rate": 8.455366323114596e-06, + "loss": 0.0048, + "step": 40510 + }, + { + "epoch": 0.6630123537593062, + "grad_norm": 0.1043798178434372, + "learning_rate": 8.454334113743605e-06, + "loss": 0.002, + "step": 40520 + }, + { + "epoch": 0.6631759797103821, + "grad_norm": 0.09990212321281433, + "learning_rate": 8.453301622648878e-06, + "loss": 0.0031, + "step": 40530 + }, + { + "epoch": 0.6633396056614579, + "grad_norm": 0.060381870716810226, + "learning_rate": 8.45226884991462e-06, + "loss": 0.0022, + "step": 40540 + }, + { + "epoch": 0.6635032316125338, + "grad_norm": 0.023226188495755196, + "learning_rate": 8.45123579562506e-06, + "loss": 0.0025, + "step": 40550 + }, + { + "epoch": 0.6636668575636095, + "grad_norm": 0.15302708745002747, + "learning_rate": 8.45020245986445e-06, + "loss": 0.0033, + "step": 40560 + }, + { + "epoch": 0.6638304835146854, + "grad_norm": 0.07055632025003433, + "learning_rate": 8.449168842717067e-06, + "loss": 0.0039, + "step": 40570 + }, + { + "epoch": 0.6639941094657613, + "grad_norm": 0.014842112548649311, + "learning_rate": 8.448134944267206e-06, + "loss": 0.0031, + "step": 40580 + }, + { + "epoch": 0.6641577354168371, + "grad_norm": 0.21070148050785065, + "learning_rate": 8.447100764599192e-06, + "loss": 0.0029, + "step": 40590 + }, + { + "epoch": 0.664321361367913, + "grad_norm": 0.17054150998592377, + "learning_rate": 8.446066303797366e-06, + "loss": 0.003, + "step": 40600 + }, + { + "epoch": 0.6644849873189888, + "grad_norm": 0.07928485423326492, + "learning_rate": 8.4450315619461e-06, + "loss": 0.0034, + "step": 40610 + }, + { + "epoch": 0.6646486132700646, + "grad_norm": 0.03261743113398552, + "learning_rate": 8.44399653912978e-06, + "loss": 0.0021, + "step": 40620 + }, + { + "epoch": 0.6648122392211405, + "grad_norm": 0.008574265986680984, + "learning_rate": 8.442961235432818e-06, + "loss": 0.0028, + "step": 40630 + }, + { + "epoch": 0.6649758651722163, + "grad_norm": 0.18837010860443115, + "learning_rate": 8.441925650939653e-06, + "loss": 0.0025, + "step": 40640 + }, + { + "epoch": 0.6651394911232922, + "grad_norm": 0.09124614298343658, + "learning_rate": 8.440889785734742e-06, + "loss": 0.0037, + "step": 40650 + }, + { + "epoch": 0.665303117074368, + "grad_norm": 0.13488143682479858, + "learning_rate": 8.439853639902566e-06, + "loss": 0.0028, + "step": 40660 + }, + { + "epoch": 0.6654667430254438, + "grad_norm": 0.19879809021949768, + "learning_rate": 8.438817213527632e-06, + "loss": 0.0024, + "step": 40670 + }, + { + "epoch": 0.6656303689765197, + "grad_norm": 0.07655355334281921, + "learning_rate": 8.437780506694465e-06, + "loss": 0.0039, + "step": 40680 + }, + { + "epoch": 0.6657939949275955, + "grad_norm": 0.06329560279846191, + "learning_rate": 8.436743519487615e-06, + "loss": 0.0029, + "step": 40690 + }, + { + "epoch": 0.6659576208786714, + "grad_norm": 0.07198058068752289, + "learning_rate": 8.43570625199166e-06, + "loss": 0.0018, + "step": 40700 + }, + { + "epoch": 0.6661212468297472, + "grad_norm": 0.12384290248155594, + "learning_rate": 8.43466870429119e-06, + "loss": 0.0045, + "step": 40710 + }, + { + "epoch": 0.666284872780823, + "grad_norm": 0.145598903298378, + "learning_rate": 8.433630876470826e-06, + "loss": 0.003, + "step": 40720 + }, + { + "epoch": 0.6664484987318989, + "grad_norm": 0.2404741793870926, + "learning_rate": 8.43259276861521e-06, + "loss": 0.0026, + "step": 40730 + }, + { + "epoch": 0.6666121246829747, + "grad_norm": 0.08709389716386795, + "learning_rate": 8.431554380809008e-06, + "loss": 0.0024, + "step": 40740 + }, + { + "epoch": 0.6667757506340506, + "grad_norm": 0.008196630515158176, + "learning_rate": 8.430515713136904e-06, + "loss": 0.0015, + "step": 40750 + }, + { + "epoch": 0.6669393765851264, + "grad_norm": 0.05855025723576546, + "learning_rate": 8.429476765683612e-06, + "loss": 0.0017, + "step": 40760 + }, + { + "epoch": 0.6671030025362022, + "grad_norm": 0.07866821438074112, + "learning_rate": 8.428437538533861e-06, + "loss": 0.0018, + "step": 40770 + }, + { + "epoch": 0.6672666284872781, + "grad_norm": 0.2005240023136139, + "learning_rate": 8.42739803177241e-06, + "loss": 0.0032, + "step": 40780 + }, + { + "epoch": 0.6674302544383539, + "grad_norm": 0.10873547196388245, + "learning_rate": 8.426358245484036e-06, + "loss": 0.002, + "step": 40790 + }, + { + "epoch": 0.6675938803894298, + "grad_norm": 0.0981588363647461, + "learning_rate": 8.425318179753542e-06, + "loss": 0.0019, + "step": 40800 + }, + { + "epoch": 0.6677575063405056, + "grad_norm": 0.1374509483575821, + "learning_rate": 8.42427783466575e-06, + "loss": 0.0029, + "step": 40810 + }, + { + "epoch": 0.6679211322915815, + "grad_norm": 0.049998167902231216, + "learning_rate": 8.42323721030551e-06, + "loss": 0.0007, + "step": 40820 + }, + { + "epoch": 0.6680847582426573, + "grad_norm": 0.0330437496304512, + "learning_rate": 8.422196306757689e-06, + "loss": 0.0015, + "step": 40830 + }, + { + "epoch": 0.6682483841937331, + "grad_norm": 0.0394873209297657, + "learning_rate": 8.42115512410718e-06, + "loss": 0.0025, + "step": 40840 + }, + { + "epoch": 0.668412010144809, + "grad_norm": 0.14858004450798035, + "learning_rate": 8.420113662438898e-06, + "loss": 0.0072, + "step": 40850 + }, + { + "epoch": 0.6685756360958848, + "grad_norm": 0.055894069373607635, + "learning_rate": 8.419071921837784e-06, + "loss": 0.0036, + "step": 40860 + }, + { + "epoch": 0.6687392620469607, + "grad_norm": 0.09399282932281494, + "learning_rate": 8.418029902388796e-06, + "loss": 0.0031, + "step": 40870 + }, + { + "epoch": 0.6689028879980365, + "grad_norm": 0.0983235090970993, + "learning_rate": 8.416987604176918e-06, + "loss": 0.0029, + "step": 40880 + }, + { + "epoch": 0.6690665139491123, + "grad_norm": 0.2799706757068634, + "learning_rate": 8.415945027287156e-06, + "loss": 0.0024, + "step": 40890 + }, + { + "epoch": 0.6692301399001882, + "grad_norm": 0.010169921442866325, + "learning_rate": 8.414902171804542e-06, + "loss": 0.0025, + "step": 40900 + }, + { + "epoch": 0.669393765851264, + "grad_norm": 0.03838230296969414, + "learning_rate": 8.413859037814123e-06, + "loss": 0.0021, + "step": 40910 + }, + { + "epoch": 0.6695573918023399, + "grad_norm": 0.17767493426799774, + "learning_rate": 8.412815625400976e-06, + "loss": 0.0022, + "step": 40920 + }, + { + "epoch": 0.6697210177534156, + "grad_norm": 0.07790587842464447, + "learning_rate": 8.4117719346502e-06, + "loss": 0.002, + "step": 40930 + }, + { + "epoch": 0.6698846437044915, + "grad_norm": 0.04043501242995262, + "learning_rate": 8.410727965646909e-06, + "loss": 0.005, + "step": 40940 + }, + { + "epoch": 0.6700482696555674, + "grad_norm": 0.032733093947172165, + "learning_rate": 8.409683718476253e-06, + "loss": 0.0022, + "step": 40950 + }, + { + "epoch": 0.6702118956066432, + "grad_norm": 0.046128902584314346, + "learning_rate": 8.408639193223392e-06, + "loss": 0.0076, + "step": 40960 + }, + { + "epoch": 0.6703755215577191, + "grad_norm": 0.16634483635425568, + "learning_rate": 8.407594389973517e-06, + "loss": 0.0035, + "step": 40970 + }, + { + "epoch": 0.6705391475087948, + "grad_norm": 0.025049185380339622, + "learning_rate": 8.406549308811835e-06, + "loss": 0.0015, + "step": 40980 + }, + { + "epoch": 0.6707027734598707, + "grad_norm": 0.08842090517282486, + "learning_rate": 8.405503949823583e-06, + "loss": 0.0021, + "step": 40990 + }, + { + "epoch": 0.6708663994109466, + "grad_norm": 0.07626570761203766, + "learning_rate": 8.404458313094015e-06, + "loss": 0.0029, + "step": 41000 + }, + { + "epoch": 0.6710300253620224, + "grad_norm": 0.07488197833299637, + "learning_rate": 8.403412398708411e-06, + "loss": 0.0019, + "step": 41010 + }, + { + "epoch": 0.6711936513130983, + "grad_norm": 0.05449520796537399, + "learning_rate": 8.40236620675207e-06, + "loss": 0.0021, + "step": 41020 + }, + { + "epoch": 0.671357277264174, + "grad_norm": 0.05049673840403557, + "learning_rate": 8.401319737310318e-06, + "loss": 0.0021, + "step": 41030 + }, + { + "epoch": 0.6715209032152499, + "grad_norm": 0.14458458125591278, + "learning_rate": 8.400272990468499e-06, + "loss": 0.0026, + "step": 41040 + }, + { + "epoch": 0.6716845291663258, + "grad_norm": 0.1568382829427719, + "learning_rate": 8.399225966311984e-06, + "loss": 0.0035, + "step": 41050 + }, + { + "epoch": 0.6718481551174016, + "grad_norm": 0.015643656253814697, + "learning_rate": 8.398178664926164e-06, + "loss": 0.0013, + "step": 41060 + }, + { + "epoch": 0.6720117810684775, + "grad_norm": 0.003948468714952469, + "learning_rate": 8.397131086396455e-06, + "loss": 0.0022, + "step": 41070 + }, + { + "epoch": 0.6721754070195533, + "grad_norm": 0.041831858456134796, + "learning_rate": 8.396083230808292e-06, + "loss": 0.0026, + "step": 41080 + }, + { + "epoch": 0.6723390329706291, + "grad_norm": 0.05406446382403374, + "learning_rate": 8.395035098247136e-06, + "loss": 0.0018, + "step": 41090 + }, + { + "epoch": 0.672502658921705, + "grad_norm": 0.12080182135105133, + "learning_rate": 8.393986688798468e-06, + "loss": 0.0021, + "step": 41100 + }, + { + "epoch": 0.6726662848727808, + "grad_norm": 0.07608948647975922, + "learning_rate": 8.392938002547793e-06, + "loss": 0.0035, + "step": 41110 + }, + { + "epoch": 0.6728299108238567, + "grad_norm": 0.13132861256599426, + "learning_rate": 8.391889039580637e-06, + "loss": 0.0026, + "step": 41120 + }, + { + "epoch": 0.6729935367749325, + "grad_norm": 0.09472689777612686, + "learning_rate": 8.390839799982552e-06, + "loss": 0.003, + "step": 41130 + }, + { + "epoch": 0.6731571627260083, + "grad_norm": 0.025278786197304726, + "learning_rate": 8.389790283839109e-06, + "loss": 0.002, + "step": 41140 + }, + { + "epoch": 0.6733207886770842, + "grad_norm": 1.022646188735962, + "learning_rate": 8.388740491235905e-06, + "loss": 0.0027, + "step": 41150 + }, + { + "epoch": 0.67348441462816, + "grad_norm": 0.21813036501407623, + "learning_rate": 8.387690422258554e-06, + "loss": 0.0031, + "step": 41160 + }, + { + "epoch": 0.6736480405792359, + "grad_norm": 0.4471426010131836, + "learning_rate": 8.386640076992701e-06, + "loss": 0.0023, + "step": 41170 + }, + { + "epoch": 0.6738116665303117, + "grad_norm": 0.16581852734088898, + "learning_rate": 8.385589455524002e-06, + "loss": 0.0024, + "step": 41180 + }, + { + "epoch": 0.6739752924813875, + "grad_norm": 0.04886772856116295, + "learning_rate": 8.384538557938147e-06, + "loss": 0.0027, + "step": 41190 + }, + { + "epoch": 0.6741389184324634, + "grad_norm": 0.13878095149993896, + "learning_rate": 8.383487384320844e-06, + "loss": 0.0032, + "step": 41200 + }, + { + "epoch": 0.6743025443835392, + "grad_norm": 0.03204534202814102, + "learning_rate": 8.38243593475782e-06, + "loss": 0.0016, + "step": 41210 + }, + { + "epoch": 0.6744661703346151, + "grad_norm": 0.26388972997665405, + "learning_rate": 8.38138420933483e-06, + "loss": 0.0032, + "step": 41220 + }, + { + "epoch": 0.6746297962856909, + "grad_norm": 0.1268002837896347, + "learning_rate": 8.380332208137648e-06, + "loss": 0.0041, + "step": 41230 + }, + { + "epoch": 0.6747934222367667, + "grad_norm": 0.09900958836078644, + "learning_rate": 8.379279931252072e-06, + "loss": 0.0032, + "step": 41240 + }, + { + "epoch": 0.6749570481878426, + "grad_norm": 0.26281067728996277, + "learning_rate": 8.378227378763923e-06, + "loss": 0.0031, + "step": 41250 + }, + { + "epoch": 0.6751206741389184, + "grad_norm": 0.25603729486465454, + "learning_rate": 8.377174550759043e-06, + "loss": 0.006, + "step": 41260 + }, + { + "epoch": 0.6752843000899943, + "grad_norm": 0.26240843534469604, + "learning_rate": 8.376121447323294e-06, + "loss": 0.0033, + "step": 41270 + }, + { + "epoch": 0.6754479260410701, + "grad_norm": 0.07707250118255615, + "learning_rate": 8.375068068542568e-06, + "loss": 0.0019, + "step": 41280 + }, + { + "epoch": 0.675611551992146, + "grad_norm": 0.06844502687454224, + "learning_rate": 8.374014414502774e-06, + "loss": 0.0045, + "step": 41290 + }, + { + "epoch": 0.6757751779432218, + "grad_norm": 0.3305200934410095, + "learning_rate": 8.372960485289843e-06, + "loss": 0.0029, + "step": 41300 + }, + { + "epoch": 0.6759388038942976, + "grad_norm": 0.056485649198293686, + "learning_rate": 8.37190628098973e-06, + "loss": 0.0034, + "step": 41310 + }, + { + "epoch": 0.6761024298453735, + "grad_norm": 0.08391249179840088, + "learning_rate": 8.370851801688413e-06, + "loss": 0.002, + "step": 41320 + }, + { + "epoch": 0.6762660557964493, + "grad_norm": 0.028164148330688477, + "learning_rate": 8.369797047471893e-06, + "loss": 0.0033, + "step": 41330 + }, + { + "epoch": 0.6764296817475252, + "grad_norm": 0.2291320413351059, + "learning_rate": 8.36874201842619e-06, + "loss": 0.0055, + "step": 41340 + }, + { + "epoch": 0.676593307698601, + "grad_norm": 0.02646239660680294, + "learning_rate": 8.367686714637348e-06, + "loss": 0.0016, + "step": 41350 + }, + { + "epoch": 0.6767569336496768, + "grad_norm": 0.033750392496585846, + "learning_rate": 8.36663113619144e-06, + "loss": 0.0014, + "step": 41360 + }, + { + "epoch": 0.6769205596007527, + "grad_norm": 0.041009679436683655, + "learning_rate": 8.365575283174546e-06, + "loss": 0.0019, + "step": 41370 + }, + { + "epoch": 0.6770841855518285, + "grad_norm": 0.2438313364982605, + "learning_rate": 8.364519155672783e-06, + "loss": 0.0032, + "step": 41380 + }, + { + "epoch": 0.6772478115029044, + "grad_norm": 0.25042006373405457, + "learning_rate": 8.363462753772287e-06, + "loss": 0.0014, + "step": 41390 + }, + { + "epoch": 0.6774114374539802, + "grad_norm": 0.04324944689869881, + "learning_rate": 8.362406077559212e-06, + "loss": 0.0022, + "step": 41400 + }, + { + "epoch": 0.677575063405056, + "grad_norm": 0.24871376156806946, + "learning_rate": 8.361349127119735e-06, + "loss": 0.0041, + "step": 41410 + }, + { + "epoch": 0.6777386893561319, + "grad_norm": 0.02890959195792675, + "learning_rate": 8.360291902540062e-06, + "loss": 0.0033, + "step": 41420 + }, + { + "epoch": 0.6779023153072077, + "grad_norm": 0.024548379704356194, + "learning_rate": 8.359234403906413e-06, + "loss": 0.0017, + "step": 41430 + }, + { + "epoch": 0.6780659412582836, + "grad_norm": 0.2140108197927475, + "learning_rate": 8.358176631305036e-06, + "loss": 0.0021, + "step": 41440 + }, + { + "epoch": 0.6782295672093595, + "grad_norm": 0.13933569192886353, + "learning_rate": 8.357118584822197e-06, + "loss": 0.0015, + "step": 41450 + }, + { + "epoch": 0.6783931931604352, + "grad_norm": 0.09648358076810837, + "learning_rate": 8.35606026454419e-06, + "loss": 0.0031, + "step": 41460 + }, + { + "epoch": 0.6785568191115111, + "grad_norm": 0.06922072917222977, + "learning_rate": 8.355001670557324e-06, + "loss": 0.003, + "step": 41470 + }, + { + "epoch": 0.6787204450625869, + "grad_norm": 0.1881045252084732, + "learning_rate": 8.353942802947938e-06, + "loss": 0.0026, + "step": 41480 + }, + { + "epoch": 0.6788840710136628, + "grad_norm": 0.04266451671719551, + "learning_rate": 8.352883661802388e-06, + "loss": 0.0028, + "step": 41490 + }, + { + "epoch": 0.6790476969647387, + "grad_norm": 0.10109952837228775, + "learning_rate": 8.351824247207053e-06, + "loss": 0.0043, + "step": 41500 + }, + { + "epoch": 0.6792113229158144, + "grad_norm": 0.10828981548547745, + "learning_rate": 8.350764559248336e-06, + "loss": 0.0018, + "step": 41510 + }, + { + "epoch": 0.6793749488668903, + "grad_norm": 0.23151905834674835, + "learning_rate": 8.349704598012664e-06, + "loss": 0.0021, + "step": 41520 + }, + { + "epoch": 0.6795385748179661, + "grad_norm": 0.08255578577518463, + "learning_rate": 8.34864436358648e-06, + "loss": 0.0017, + "step": 41530 + }, + { + "epoch": 0.679702200769042, + "grad_norm": 0.26133525371551514, + "learning_rate": 8.347583856056255e-06, + "loss": 0.0034, + "step": 41540 + }, + { + "epoch": 0.6798658267201179, + "grad_norm": 0.07293794304132462, + "learning_rate": 8.346523075508481e-06, + "loss": 0.0025, + "step": 41550 + }, + { + "epoch": 0.6800294526711936, + "grad_norm": 0.07824334502220154, + "learning_rate": 8.34546202202967e-06, + "loss": 0.0037, + "step": 41560 + }, + { + "epoch": 0.6801930786222695, + "grad_norm": 0.14194002747535706, + "learning_rate": 8.344400695706358e-06, + "loss": 0.0025, + "step": 41570 + }, + { + "epoch": 0.6803567045733453, + "grad_norm": 0.0412052646279335, + "learning_rate": 8.343339096625104e-06, + "loss": 0.0017, + "step": 41580 + }, + { + "epoch": 0.6805203305244212, + "grad_norm": 0.18909892439842224, + "learning_rate": 8.34227722487249e-06, + "loss": 0.0022, + "step": 41590 + }, + { + "epoch": 0.6806839564754971, + "grad_norm": 0.16024208068847656, + "learning_rate": 8.341215080535117e-06, + "loss": 0.0028, + "step": 41600 + }, + { + "epoch": 0.6808475824265728, + "grad_norm": 0.09370427578687668, + "learning_rate": 8.340152663699607e-06, + "loss": 0.003, + "step": 41610 + }, + { + "epoch": 0.6810112083776487, + "grad_norm": 0.1780400276184082, + "learning_rate": 8.339089974452613e-06, + "loss": 0.0028, + "step": 41620 + }, + { + "epoch": 0.6811748343287245, + "grad_norm": 0.05311296507716179, + "learning_rate": 8.3380270128808e-06, + "loss": 0.002, + "step": 41630 + }, + { + "epoch": 0.6813384602798004, + "grad_norm": 0.22057993710041046, + "learning_rate": 8.336963779070861e-06, + "loss": 0.0051, + "step": 41640 + }, + { + "epoch": 0.6815020862308763, + "grad_norm": 0.05289392173290253, + "learning_rate": 8.33590027310951e-06, + "loss": 0.0025, + "step": 41650 + }, + { + "epoch": 0.681665712181952, + "grad_norm": 0.03361356630921364, + "learning_rate": 8.33483649508348e-06, + "loss": 0.0025, + "step": 41660 + }, + { + "epoch": 0.6818293381330279, + "grad_norm": 0.04952887073159218, + "learning_rate": 8.333772445079533e-06, + "loss": 0.0032, + "step": 41670 + }, + { + "epoch": 0.6819929640841037, + "grad_norm": 0.06632176786661148, + "learning_rate": 8.33270812318445e-06, + "loss": 0.003, + "step": 41680 + }, + { + "epoch": 0.6821565900351796, + "grad_norm": 0.07646957039833069, + "learning_rate": 8.33164352948503e-06, + "loss": 0.0017, + "step": 41690 + }, + { + "epoch": 0.6823202159862555, + "grad_norm": 0.07562000304460526, + "learning_rate": 8.330578664068097e-06, + "loss": 0.0012, + "step": 41700 + }, + { + "epoch": 0.6824838419373312, + "grad_norm": 0.06474523991346359, + "learning_rate": 8.3295135270205e-06, + "loss": 0.0022, + "step": 41710 + }, + { + "epoch": 0.6826474678884071, + "grad_norm": 0.06082943081855774, + "learning_rate": 8.32844811842911e-06, + "loss": 0.0019, + "step": 41720 + }, + { + "epoch": 0.6828110938394829, + "grad_norm": 0.005953933112323284, + "learning_rate": 8.327382438380816e-06, + "loss": 0.0016, + "step": 41730 + }, + { + "epoch": 0.6829747197905588, + "grad_norm": 0.11157190054655075, + "learning_rate": 8.326316486962529e-06, + "loss": 0.0021, + "step": 41740 + }, + { + "epoch": 0.6831383457416347, + "grad_norm": 0.03926282376050949, + "learning_rate": 8.325250264261187e-06, + "loss": 0.0058, + "step": 41750 + }, + { + "epoch": 0.6833019716927105, + "grad_norm": 0.029198868200182915, + "learning_rate": 8.324183770363747e-06, + "loss": 0.0016, + "step": 41760 + }, + { + "epoch": 0.6834655976437863, + "grad_norm": 0.04374314472079277, + "learning_rate": 8.323117005357188e-06, + "loss": 0.0022, + "step": 41770 + }, + { + "epoch": 0.6836292235948621, + "grad_norm": 0.045673198997974396, + "learning_rate": 8.322049969328515e-06, + "loss": 0.0019, + "step": 41780 + }, + { + "epoch": 0.683792849545938, + "grad_norm": 0.047671206295490265, + "learning_rate": 8.320982662364746e-06, + "loss": 0.0067, + "step": 41790 + }, + { + "epoch": 0.6839564754970138, + "grad_norm": 0.07123294472694397, + "learning_rate": 8.319915084552932e-06, + "loss": 0.0024, + "step": 41800 + }, + { + "epoch": 0.6841201014480897, + "grad_norm": 0.018189528957009315, + "learning_rate": 8.318847235980138e-06, + "loss": 0.0019, + "step": 41810 + }, + { + "epoch": 0.6842837273991655, + "grad_norm": 0.03999166190624237, + "learning_rate": 8.317779116733455e-06, + "loss": 0.0035, + "step": 41820 + }, + { + "epoch": 0.6844473533502413, + "grad_norm": 0.2060263454914093, + "learning_rate": 8.316710726899994e-06, + "loss": 0.0032, + "step": 41830 + }, + { + "epoch": 0.6846109793013172, + "grad_norm": 0.05537264421582222, + "learning_rate": 8.315642066566893e-06, + "loss": 0.0031, + "step": 41840 + }, + { + "epoch": 0.684774605252393, + "grad_norm": 0.15973874926567078, + "learning_rate": 8.314573135821304e-06, + "loss": 0.0022, + "step": 41850 + }, + { + "epoch": 0.6849382312034689, + "grad_norm": 0.03887307271361351, + "learning_rate": 8.31350393475041e-06, + "loss": 0.0023, + "step": 41860 + }, + { + "epoch": 0.6851018571545447, + "grad_norm": 0.06278285384178162, + "learning_rate": 8.312434463441405e-06, + "loss": 0.0014, + "step": 41870 + }, + { + "epoch": 0.6852654831056205, + "grad_norm": 0.07525129616260529, + "learning_rate": 8.311364721981517e-06, + "loss": 0.0057, + "step": 41880 + }, + { + "epoch": 0.6854291090566964, + "grad_norm": 0.125365749001503, + "learning_rate": 8.31029471045799e-06, + "loss": 0.0027, + "step": 41890 + }, + { + "epoch": 0.6855927350077722, + "grad_norm": 0.19804522395133972, + "learning_rate": 8.309224428958087e-06, + "loss": 0.0033, + "step": 41900 + }, + { + "epoch": 0.6857563609588481, + "grad_norm": 0.1322184056043625, + "learning_rate": 8.3081538775691e-06, + "loss": 0.0024, + "step": 41910 + }, + { + "epoch": 0.685919986909924, + "grad_norm": 0.028758447617292404, + "learning_rate": 8.307083056378336e-06, + "loss": 0.0029, + "step": 41920 + }, + { + "epoch": 0.6860836128609997, + "grad_norm": 0.059308767318725586, + "learning_rate": 8.306011965473129e-06, + "loss": 0.003, + "step": 41930 + }, + { + "epoch": 0.6862472388120756, + "grad_norm": 0.11813205480575562, + "learning_rate": 8.304940604940836e-06, + "loss": 0.0034, + "step": 41940 + }, + { + "epoch": 0.6864108647631514, + "grad_norm": 0.2293846756219864, + "learning_rate": 8.303868974868831e-06, + "loss": 0.0023, + "step": 41950 + }, + { + "epoch": 0.6865744907142273, + "grad_norm": 0.13416968286037445, + "learning_rate": 8.302797075344514e-06, + "loss": 0.0037, + "step": 41960 + }, + { + "epoch": 0.6867381166653032, + "grad_norm": 0.12346319109201431, + "learning_rate": 8.301724906455305e-06, + "loss": 0.0029, + "step": 41970 + }, + { + "epoch": 0.6869017426163789, + "grad_norm": 0.07347419857978821, + "learning_rate": 8.300652468288643e-06, + "loss": 0.0025, + "step": 41980 + }, + { + "epoch": 0.6870653685674548, + "grad_norm": 0.150022953748703, + "learning_rate": 8.299579760931998e-06, + "loss": 0.0041, + "step": 41990 + }, + { + "epoch": 0.6872289945185306, + "grad_norm": 0.05635262653231621, + "learning_rate": 8.298506784472852e-06, + "loss": 0.0031, + "step": 42000 + }, + { + "epoch": 0.6873926204696065, + "grad_norm": 0.23792438209056854, + "learning_rate": 8.297433538998718e-06, + "loss": 0.0032, + "step": 42010 + }, + { + "epoch": 0.6875562464206824, + "grad_norm": 0.1262994259595871, + "learning_rate": 8.296360024597122e-06, + "loss": 0.0019, + "step": 42020 + }, + { + "epoch": 0.6877198723717581, + "grad_norm": 0.056801892817020416, + "learning_rate": 8.295286241355616e-06, + "loss": 0.0017, + "step": 42030 + }, + { + "epoch": 0.687883498322834, + "grad_norm": 0.10647229850292206, + "learning_rate": 8.294212189361778e-06, + "loss": 0.0041, + "step": 42040 + }, + { + "epoch": 0.6880471242739098, + "grad_norm": 0.10949555784463882, + "learning_rate": 8.2931378687032e-06, + "loss": 0.0036, + "step": 42050 + }, + { + "epoch": 0.6882107502249857, + "grad_norm": 0.03249487653374672, + "learning_rate": 8.292063279467503e-06, + "loss": 0.0024, + "step": 42060 + }, + { + "epoch": 0.6883743761760616, + "grad_norm": 0.10420724004507065, + "learning_rate": 8.290988421742325e-06, + "loss": 0.0031, + "step": 42070 + }, + { + "epoch": 0.6885380021271373, + "grad_norm": 0.03690098226070404, + "learning_rate": 8.289913295615328e-06, + "loss": 0.0038, + "step": 42080 + }, + { + "epoch": 0.6887016280782132, + "grad_norm": 0.11487086862325668, + "learning_rate": 8.288837901174198e-06, + "loss": 0.0039, + "step": 42090 + }, + { + "epoch": 0.688865254029289, + "grad_norm": 0.047081757336854935, + "learning_rate": 8.287762238506636e-06, + "loss": 0.0013, + "step": 42100 + }, + { + "epoch": 0.6890288799803649, + "grad_norm": 1.014747142791748, + "learning_rate": 8.286686307700371e-06, + "loss": 0.0038, + "step": 42110 + }, + { + "epoch": 0.6891925059314408, + "grad_norm": 0.07342509180307388, + "learning_rate": 8.285610108843156e-06, + "loss": 0.0023, + "step": 42120 + }, + { + "epoch": 0.6893561318825165, + "grad_norm": 0.08703934401273727, + "learning_rate": 8.284533642022756e-06, + "loss": 0.0038, + "step": 42130 + }, + { + "epoch": 0.6895197578335924, + "grad_norm": 0.09622837603092194, + "learning_rate": 8.28345690732697e-06, + "loss": 0.0039, + "step": 42140 + }, + { + "epoch": 0.6896833837846682, + "grad_norm": 0.04340745881199837, + "learning_rate": 8.282379904843606e-06, + "loss": 0.0017, + "step": 42150 + }, + { + "epoch": 0.6898470097357441, + "grad_norm": 0.13778144121170044, + "learning_rate": 8.281302634660509e-06, + "loss": 0.0046, + "step": 42160 + }, + { + "epoch": 0.69001063568682, + "grad_norm": 0.05705218017101288, + "learning_rate": 8.28022509686553e-06, + "loss": 0.0028, + "step": 42170 + }, + { + "epoch": 0.6901742616378957, + "grad_norm": 0.15162791311740875, + "learning_rate": 8.279147291546554e-06, + "loss": 0.0018, + "step": 42180 + }, + { + "epoch": 0.6903378875889716, + "grad_norm": 0.27879807353019714, + "learning_rate": 8.27806921879148e-06, + "loss": 0.0049, + "step": 42190 + }, + { + "epoch": 0.6905015135400474, + "grad_norm": 0.09431177377700806, + "learning_rate": 8.276990878688235e-06, + "loss": 0.0032, + "step": 42200 + }, + { + "epoch": 0.6906651394911233, + "grad_norm": 0.1065945103764534, + "learning_rate": 8.275912271324763e-06, + "loss": 0.0022, + "step": 42210 + }, + { + "epoch": 0.6908287654421992, + "grad_norm": 0.10742338001728058, + "learning_rate": 8.274833396789031e-06, + "loss": 0.0019, + "step": 42220 + }, + { + "epoch": 0.690992391393275, + "grad_norm": 0.11232302337884903, + "learning_rate": 8.27375425516903e-06, + "loss": 0.0034, + "step": 42230 + }, + { + "epoch": 0.6911560173443508, + "grad_norm": 0.07168204337358475, + "learning_rate": 8.27267484655277e-06, + "loss": 0.0014, + "step": 42240 + }, + { + "epoch": 0.6913196432954266, + "grad_norm": 0.017771488055586815, + "learning_rate": 8.271595171028283e-06, + "loss": 0.0016, + "step": 42250 + }, + { + "epoch": 0.6914832692465025, + "grad_norm": 0.1965777575969696, + "learning_rate": 8.270515228683626e-06, + "loss": 0.0032, + "step": 42260 + }, + { + "epoch": 0.6916468951975784, + "grad_norm": 0.025913791730999947, + "learning_rate": 8.269435019606875e-06, + "loss": 0.0044, + "step": 42270 + }, + { + "epoch": 0.6918105211486542, + "grad_norm": 0.058756064623594284, + "learning_rate": 8.268354543886126e-06, + "loss": 0.0047, + "step": 42280 + }, + { + "epoch": 0.69197414709973, + "grad_norm": 0.21434593200683594, + "learning_rate": 8.2672738016095e-06, + "loss": 0.0062, + "step": 42290 + }, + { + "epoch": 0.6921377730508058, + "grad_norm": 0.06954843550920486, + "learning_rate": 8.26619279286514e-06, + "loss": 0.003, + "step": 42300 + }, + { + "epoch": 0.6923013990018817, + "grad_norm": 0.023796798661351204, + "learning_rate": 8.265111517741206e-06, + "loss": 0.0025, + "step": 42310 + }, + { + "epoch": 0.6924650249529576, + "grad_norm": 0.0614449679851532, + "learning_rate": 8.26402997632589e-06, + "loss": 0.0023, + "step": 42320 + }, + { + "epoch": 0.6926286509040334, + "grad_norm": 0.11617767065763474, + "learning_rate": 8.26294816870739e-06, + "loss": 0.0026, + "step": 42330 + }, + { + "epoch": 0.6927922768551092, + "grad_norm": 0.07790637761354446, + "learning_rate": 8.26186609497394e-06, + "loss": 0.0023, + "step": 42340 + }, + { + "epoch": 0.692955902806185, + "grad_norm": 0.022893153131008148, + "learning_rate": 8.26078375521379e-06, + "loss": 0.0024, + "step": 42350 + }, + { + "epoch": 0.6931195287572609, + "grad_norm": 0.036024775356054306, + "learning_rate": 8.259701149515211e-06, + "loss": 0.0031, + "step": 42360 + }, + { + "epoch": 0.6932831547083368, + "grad_norm": 0.06227801367640495, + "learning_rate": 8.258618277966498e-06, + "loss": 0.0026, + "step": 42370 + }, + { + "epoch": 0.6934467806594126, + "grad_norm": 0.10701841861009598, + "learning_rate": 8.257535140655961e-06, + "loss": 0.0018, + "step": 42380 + }, + { + "epoch": 0.6936104066104885, + "grad_norm": 0.051589235663414, + "learning_rate": 8.256451737671945e-06, + "loss": 0.0015, + "step": 42390 + }, + { + "epoch": 0.6937740325615642, + "grad_norm": 0.09103359282016754, + "learning_rate": 8.255368069102803e-06, + "loss": 0.0027, + "step": 42400 + }, + { + "epoch": 0.6939376585126401, + "grad_norm": 0.03723711520433426, + "learning_rate": 8.254284135036916e-06, + "loss": 0.0013, + "step": 42410 + }, + { + "epoch": 0.694101284463716, + "grad_norm": 0.0831732377409935, + "learning_rate": 8.25319993556269e-06, + "loss": 0.0019, + "step": 42420 + }, + { + "epoch": 0.6942649104147918, + "grad_norm": 0.10774324089288712, + "learning_rate": 8.252115470768543e-06, + "loss": 0.0024, + "step": 42430 + }, + { + "epoch": 0.6944285363658677, + "grad_norm": 0.037433020770549774, + "learning_rate": 8.251030740742923e-06, + "loss": 0.0013, + "step": 42440 + }, + { + "epoch": 0.6945921623169434, + "grad_norm": 0.042912546545267105, + "learning_rate": 8.249945745574297e-06, + "loss": 0.0023, + "step": 42450 + }, + { + "epoch": 0.6947557882680193, + "grad_norm": 0.0421382300555706, + "learning_rate": 8.248860485351155e-06, + "loss": 0.002, + "step": 42460 + }, + { + "epoch": 0.6949194142190952, + "grad_norm": 0.13821718096733093, + "learning_rate": 8.247774960162003e-06, + "loss": 0.0043, + "step": 42470 + }, + { + "epoch": 0.695083040170171, + "grad_norm": 0.06397277861833572, + "learning_rate": 8.246689170095376e-06, + "loss": 0.0038, + "step": 42480 + }, + { + "epoch": 0.6952466661212469, + "grad_norm": 0.03633551672101021, + "learning_rate": 8.245603115239826e-06, + "loss": 0.0039, + "step": 42490 + }, + { + "epoch": 0.6954102920723226, + "grad_norm": 0.037556543946266174, + "learning_rate": 8.24451679568393e-06, + "loss": 0.0016, + "step": 42500 + }, + { + "epoch": 0.6955739180233985, + "grad_norm": 0.03047458454966545, + "learning_rate": 8.243430211516282e-06, + "loss": 0.0024, + "step": 42510 + }, + { + "epoch": 0.6957375439744744, + "grad_norm": 0.06643889099359512, + "learning_rate": 8.2423433628255e-06, + "loss": 0.0018, + "step": 42520 + }, + { + "epoch": 0.6959011699255502, + "grad_norm": 0.19985431432724, + "learning_rate": 8.241256249700227e-06, + "loss": 0.0013, + "step": 42530 + }, + { + "epoch": 0.6960647958766261, + "grad_norm": 0.07066982239484787, + "learning_rate": 8.24016887222912e-06, + "loss": 0.0024, + "step": 42540 + }, + { + "epoch": 0.6962284218277018, + "grad_norm": 0.29324936866760254, + "learning_rate": 8.239081230500865e-06, + "loss": 0.0052, + "step": 42550 + }, + { + "epoch": 0.6963920477787777, + "grad_norm": 0.06926723569631577, + "learning_rate": 8.237993324604166e-06, + "loss": 0.0018, + "step": 42560 + }, + { + "epoch": 0.6965556737298536, + "grad_norm": 0.034019507467746735, + "learning_rate": 8.236905154627746e-06, + "loss": 0.0022, + "step": 42570 + }, + { + "epoch": 0.6967192996809294, + "grad_norm": 0.06813473254442215, + "learning_rate": 8.235816720660355e-06, + "loss": 0.0034, + "step": 42580 + }, + { + "epoch": 0.6968829256320053, + "grad_norm": 0.06074924021959305, + "learning_rate": 8.23472802279076e-06, + "loss": 0.0024, + "step": 42590 + }, + { + "epoch": 0.697046551583081, + "grad_norm": 0.16627155244350433, + "learning_rate": 8.233639061107757e-06, + "loss": 0.0019, + "step": 42600 + }, + { + "epoch": 0.6972101775341569, + "grad_norm": 0.030916433781385422, + "learning_rate": 8.23254983570015e-06, + "loss": 0.0032, + "step": 42610 + }, + { + "epoch": 0.6973738034852328, + "grad_norm": 0.045743539929389954, + "learning_rate": 8.231460346656778e-06, + "loss": 0.0014, + "step": 42620 + }, + { + "epoch": 0.6975374294363086, + "grad_norm": 0.2038387656211853, + "learning_rate": 8.230370594066494e-06, + "loss": 0.0024, + "step": 42630 + }, + { + "epoch": 0.6977010553873845, + "grad_norm": 0.05940770357847214, + "learning_rate": 8.229280578018178e-06, + "loss": 0.0023, + "step": 42640 + }, + { + "epoch": 0.6978646813384602, + "grad_norm": 0.08583934605121613, + "learning_rate": 8.228190298600723e-06, + "loss": 0.0037, + "step": 42650 + }, + { + "epoch": 0.6980283072895361, + "grad_norm": 0.17937374114990234, + "learning_rate": 8.22709975590305e-06, + "loss": 0.0038, + "step": 42660 + }, + { + "epoch": 0.6981919332406119, + "grad_norm": 0.11187461018562317, + "learning_rate": 8.226008950014099e-06, + "loss": 0.0018, + "step": 42670 + }, + { + "epoch": 0.6983555591916878, + "grad_norm": 0.06361231207847595, + "learning_rate": 8.224917881022836e-06, + "loss": 0.0019, + "step": 42680 + }, + { + "epoch": 0.6985191851427637, + "grad_norm": 0.1189630925655365, + "learning_rate": 8.22382654901824e-06, + "loss": 0.002, + "step": 42690 + }, + { + "epoch": 0.6986828110938395, + "grad_norm": 0.07285558432340622, + "learning_rate": 8.222734954089323e-06, + "loss": 0.0015, + "step": 42700 + }, + { + "epoch": 0.6988464370449153, + "grad_norm": 0.10752551257610321, + "learning_rate": 8.221643096325105e-06, + "loss": 0.0031, + "step": 42710 + }, + { + "epoch": 0.6990100629959911, + "grad_norm": 0.04563531279563904, + "learning_rate": 8.22055097581464e-06, + "loss": 0.0016, + "step": 42720 + }, + { + "epoch": 0.699173688947067, + "grad_norm": 0.10486824810504913, + "learning_rate": 8.219458592646991e-06, + "loss": 0.0023, + "step": 42730 + }, + { + "epoch": 0.6993373148981429, + "grad_norm": 0.004063060972839594, + "learning_rate": 8.218365946911256e-06, + "loss": 0.0024, + "step": 42740 + }, + { + "epoch": 0.6995009408492187, + "grad_norm": 0.08407556265592575, + "learning_rate": 8.217273038696542e-06, + "loss": 0.0018, + "step": 42750 + }, + { + "epoch": 0.6996645668002945, + "grad_norm": 0.01555270329117775, + "learning_rate": 8.216179868091987e-06, + "loss": 0.0017, + "step": 42760 + }, + { + "epoch": 0.6998281927513703, + "grad_norm": 0.0795881599187851, + "learning_rate": 8.215086435186743e-06, + "loss": 0.0028, + "step": 42770 + }, + { + "epoch": 0.6999918187024462, + "grad_norm": 0.06360206753015518, + "learning_rate": 8.213992740069987e-06, + "loss": 0.0026, + "step": 42780 + }, + { + "epoch": 0.7001554446535221, + "grad_norm": 0.02852897346019745, + "learning_rate": 8.21289878283092e-06, + "loss": 0.0013, + "step": 42790 + }, + { + "epoch": 0.7003190706045979, + "grad_norm": 0.04118447005748749, + "learning_rate": 8.21180456355876e-06, + "loss": 0.0023, + "step": 42800 + }, + { + "epoch": 0.7004826965556737, + "grad_norm": 0.15166281163692474, + "learning_rate": 8.210710082342744e-06, + "loss": 0.0028, + "step": 42810 + }, + { + "epoch": 0.7006463225067495, + "grad_norm": 0.05840795114636421, + "learning_rate": 8.20961533927214e-06, + "loss": 0.0028, + "step": 42820 + }, + { + "epoch": 0.7008099484578254, + "grad_norm": 0.14578768610954285, + "learning_rate": 8.208520334436228e-06, + "loss": 0.0023, + "step": 42830 + }, + { + "epoch": 0.7009735744089013, + "grad_norm": 0.0719045028090477, + "learning_rate": 8.207425067924314e-06, + "loss": 0.0022, + "step": 42840 + }, + { + "epoch": 0.7011372003599771, + "grad_norm": 0.01881229318678379, + "learning_rate": 8.206329539825724e-06, + "loss": 0.0028, + "step": 42850 + }, + { + "epoch": 0.701300826311053, + "grad_norm": 0.1771376132965088, + "learning_rate": 8.205233750229806e-06, + "loss": 0.0024, + "step": 42860 + }, + { + "epoch": 0.7014644522621287, + "grad_norm": 0.0363045409321785, + "learning_rate": 8.20413769922593e-06, + "loss": 0.0031, + "step": 42870 + }, + { + "epoch": 0.7016280782132046, + "grad_norm": 0.11194594204425812, + "learning_rate": 8.203041386903483e-06, + "loss": 0.0035, + "step": 42880 + }, + { + "epoch": 0.7017917041642805, + "grad_norm": 0.04965907335281372, + "learning_rate": 8.201944813351879e-06, + "loss": 0.0019, + "step": 42890 + }, + { + "epoch": 0.7019553301153563, + "grad_norm": 0.07199104130268097, + "learning_rate": 8.200847978660549e-06, + "loss": 0.0029, + "step": 42900 + }, + { + "epoch": 0.7021189560664322, + "grad_norm": 0.1052938774228096, + "learning_rate": 8.199750882918947e-06, + "loss": 0.0018, + "step": 42910 + }, + { + "epoch": 0.7022825820175079, + "grad_norm": 0.027919912710785866, + "learning_rate": 8.198653526216552e-06, + "loss": 0.0025, + "step": 42920 + }, + { + "epoch": 0.7024462079685838, + "grad_norm": 0.06763521581888199, + "learning_rate": 8.197555908642857e-06, + "loss": 0.0015, + "step": 42930 + }, + { + "epoch": 0.7026098339196597, + "grad_norm": 0.07445783168077469, + "learning_rate": 8.196458030287381e-06, + "loss": 0.0026, + "step": 42940 + }, + { + "epoch": 0.7027734598707355, + "grad_norm": 0.010249611921608448, + "learning_rate": 8.195359891239662e-06, + "loss": 0.0029, + "step": 42950 + }, + { + "epoch": 0.7029370858218114, + "grad_norm": 0.06104375794529915, + "learning_rate": 8.194261491589265e-06, + "loss": 0.0024, + "step": 42960 + }, + { + "epoch": 0.7031007117728871, + "grad_norm": 0.056723762303590775, + "learning_rate": 8.193162831425766e-06, + "loss": 0.0023, + "step": 42970 + }, + { + "epoch": 0.703264337723963, + "grad_norm": 0.06246568262577057, + "learning_rate": 8.192063910838771e-06, + "loss": 0.0031, + "step": 42980 + }, + { + "epoch": 0.7034279636750389, + "grad_norm": 0.022989075630903244, + "learning_rate": 8.190964729917904e-06, + "loss": 0.0045, + "step": 42990 + }, + { + "epoch": 0.7035915896261147, + "grad_norm": 0.05999006703495979, + "learning_rate": 8.189865288752812e-06, + "loss": 0.0013, + "step": 43000 + }, + { + "epoch": 0.7037552155771906, + "grad_norm": 0.0738217905163765, + "learning_rate": 8.188765587433157e-06, + "loss": 0.0029, + "step": 43010 + }, + { + "epoch": 0.7039188415282663, + "grad_norm": 0.06145365908741951, + "learning_rate": 8.187665626048632e-06, + "loss": 0.0017, + "step": 43020 + }, + { + "epoch": 0.7040824674793422, + "grad_norm": 0.05235118046402931, + "learning_rate": 8.186565404688942e-06, + "loss": 0.0014, + "step": 43030 + }, + { + "epoch": 0.7042460934304181, + "grad_norm": 0.11701613664627075, + "learning_rate": 8.185464923443823e-06, + "loss": 0.0046, + "step": 43040 + }, + { + "epoch": 0.7044097193814939, + "grad_norm": 0.041148070245981216, + "learning_rate": 8.184364182403018e-06, + "loss": 0.002, + "step": 43050 + }, + { + "epoch": 0.7045733453325698, + "grad_norm": 0.03790149837732315, + "learning_rate": 8.183263181656308e-06, + "loss": 0.0043, + "step": 43060 + }, + { + "epoch": 0.7047369712836455, + "grad_norm": 0.02770465612411499, + "learning_rate": 8.18216192129348e-06, + "loss": 0.0026, + "step": 43070 + }, + { + "epoch": 0.7049005972347214, + "grad_norm": 0.05528177693486214, + "learning_rate": 8.181060401404354e-06, + "loss": 0.0031, + "step": 43080 + }, + { + "epoch": 0.7050642231857973, + "grad_norm": 0.03338836878538132, + "learning_rate": 8.179958622078765e-06, + "loss": 0.0016, + "step": 43090 + }, + { + "epoch": 0.7052278491368731, + "grad_norm": 0.09353293478488922, + "learning_rate": 8.178856583406569e-06, + "loss": 0.0026, + "step": 43100 + }, + { + "epoch": 0.705391475087949, + "grad_norm": 0.07116588205099106, + "learning_rate": 8.177754285477646e-06, + "loss": 0.0026, + "step": 43110 + }, + { + "epoch": 0.7055551010390247, + "grad_norm": 0.11429449170827866, + "learning_rate": 8.176651728381895e-06, + "loss": 0.0025, + "step": 43120 + }, + { + "epoch": 0.7057187269901006, + "grad_norm": 0.03536340966820717, + "learning_rate": 8.175548912209239e-06, + "loss": 0.0027, + "step": 43130 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.09896332770586014, + "learning_rate": 8.174445837049614e-06, + "loss": 0.0017, + "step": 43140 + }, + { + "epoch": 0.7060459788922523, + "grad_norm": 0.045147284865379333, + "learning_rate": 8.173342502992989e-06, + "loss": 0.0027, + "step": 43150 + }, + { + "epoch": 0.7062096048433282, + "grad_norm": 0.15183088183403015, + "learning_rate": 8.172238910129346e-06, + "loss": 0.0023, + "step": 43160 + }, + { + "epoch": 0.706373230794404, + "grad_norm": 0.09464211761951447, + "learning_rate": 8.171135058548692e-06, + "loss": 0.0023, + "step": 43170 + }, + { + "epoch": 0.7065368567454798, + "grad_norm": 0.026473268866539, + "learning_rate": 8.170030948341049e-06, + "loss": 0.0039, + "step": 43180 + }, + { + "epoch": 0.7067004826965557, + "grad_norm": 0.04579450935125351, + "learning_rate": 8.168926579596469e-06, + "loss": 0.0022, + "step": 43190 + }, + { + "epoch": 0.7068641086476315, + "grad_norm": 0.06653797626495361, + "learning_rate": 8.167821952405017e-06, + "loss": 0.0041, + "step": 43200 + }, + { + "epoch": 0.7070277345987074, + "grad_norm": 0.16703353822231293, + "learning_rate": 8.166717066856786e-06, + "loss": 0.0041, + "step": 43210 + }, + { + "epoch": 0.7071913605497832, + "grad_norm": 0.10521818697452545, + "learning_rate": 8.165611923041884e-06, + "loss": 0.0041, + "step": 43220 + }, + { + "epoch": 0.707354986500859, + "grad_norm": 0.0518835224211216, + "learning_rate": 8.164506521050446e-06, + "loss": 0.0078, + "step": 43230 + }, + { + "epoch": 0.7075186124519349, + "grad_norm": 0.06331518292427063, + "learning_rate": 8.163400860972621e-06, + "loss": 0.0025, + "step": 43240 + }, + { + "epoch": 0.7076822384030107, + "grad_norm": 0.07451155036687851, + "learning_rate": 8.162294942898586e-06, + "loss": 0.0016, + "step": 43250 + }, + { + "epoch": 0.7078458643540866, + "grad_norm": 0.02100847102701664, + "learning_rate": 8.161188766918533e-06, + "loss": 0.0029, + "step": 43260 + }, + { + "epoch": 0.7080094903051624, + "grad_norm": 0.06605824828147888, + "learning_rate": 8.160082333122679e-06, + "loss": 0.0021, + "step": 43270 + }, + { + "epoch": 0.7081731162562382, + "grad_norm": 0.03543270379304886, + "learning_rate": 8.158975641601263e-06, + "loss": 0.002, + "step": 43280 + }, + { + "epoch": 0.7083367422073141, + "grad_norm": 0.007369784638285637, + "learning_rate": 8.157868692444541e-06, + "loss": 0.0019, + "step": 43290 + }, + { + "epoch": 0.7085003681583899, + "grad_norm": 0.34923169016838074, + "learning_rate": 8.156761485742791e-06, + "loss": 0.0019, + "step": 43300 + }, + { + "epoch": 0.7086639941094658, + "grad_norm": 0.13776707649230957, + "learning_rate": 8.155654021586316e-06, + "loss": 0.0036, + "step": 43310 + }, + { + "epoch": 0.7088276200605416, + "grad_norm": 0.1732506901025772, + "learning_rate": 8.154546300065435e-06, + "loss": 0.002, + "step": 43320 + }, + { + "epoch": 0.7089912460116174, + "grad_norm": 0.1082187071442604, + "learning_rate": 8.15343832127049e-06, + "loss": 0.003, + "step": 43330 + }, + { + "epoch": 0.7091548719626933, + "grad_norm": 0.11107086390256882, + "learning_rate": 8.152330085291846e-06, + "loss": 0.0054, + "step": 43340 + }, + { + "epoch": 0.7093184979137691, + "grad_norm": 0.10254182666540146, + "learning_rate": 8.151221592219885e-06, + "loss": 0.005, + "step": 43350 + }, + { + "epoch": 0.709482123864845, + "grad_norm": 0.026903679594397545, + "learning_rate": 8.150112842145011e-06, + "loss": 0.0023, + "step": 43360 + }, + { + "epoch": 0.7096457498159208, + "grad_norm": 0.017902657389640808, + "learning_rate": 8.149003835157651e-06, + "loss": 0.0022, + "step": 43370 + }, + { + "epoch": 0.7098093757669967, + "grad_norm": 0.04412378370761871, + "learning_rate": 8.147894571348254e-06, + "loss": 0.0023, + "step": 43380 + }, + { + "epoch": 0.7099730017180725, + "grad_norm": 0.18464715778827667, + "learning_rate": 8.146785050807285e-06, + "loss": 0.0026, + "step": 43390 + }, + { + "epoch": 0.7101366276691483, + "grad_norm": 0.10496652126312256, + "learning_rate": 8.145675273625232e-06, + "loss": 0.0038, + "step": 43400 + }, + { + "epoch": 0.7103002536202242, + "grad_norm": 0.13076041638851166, + "learning_rate": 8.144565239892608e-06, + "loss": 0.0043, + "step": 43410 + }, + { + "epoch": 0.7104638795713, + "grad_norm": 0.25232309103012085, + "learning_rate": 8.143454949699942e-06, + "loss": 0.0025, + "step": 43420 + }, + { + "epoch": 0.7106275055223759, + "grad_norm": 0.037370793521404266, + "learning_rate": 8.142344403137785e-06, + "loss": 0.0036, + "step": 43430 + }, + { + "epoch": 0.7107911314734517, + "grad_norm": 0.039053935557603836, + "learning_rate": 8.14123360029671e-06, + "loss": 0.0017, + "step": 43440 + }, + { + "epoch": 0.7109547574245275, + "grad_norm": 0.007159300148487091, + "learning_rate": 8.140122541267308e-06, + "loss": 0.0019, + "step": 43450 + }, + { + "epoch": 0.7111183833756034, + "grad_norm": 0.14263097941875458, + "learning_rate": 8.139011226140197e-06, + "loss": 0.0022, + "step": 43460 + }, + { + "epoch": 0.7112820093266792, + "grad_norm": 0.1862632930278778, + "learning_rate": 8.13789965500601e-06, + "loss": 0.002, + "step": 43470 + }, + { + "epoch": 0.7114456352777551, + "grad_norm": 0.2040887027978897, + "learning_rate": 8.136787827955401e-06, + "loss": 0.0043, + "step": 43480 + }, + { + "epoch": 0.711609261228831, + "grad_norm": 0.077354297041893, + "learning_rate": 8.135675745079053e-06, + "loss": 0.0027, + "step": 43490 + }, + { + "epoch": 0.7117728871799067, + "grad_norm": 0.13910935819149017, + "learning_rate": 8.134563406467656e-06, + "loss": 0.0016, + "step": 43500 + }, + { + "epoch": 0.7119365131309826, + "grad_norm": 0.13739068806171417, + "learning_rate": 8.133450812211934e-06, + "loss": 0.0018, + "step": 43510 + }, + { + "epoch": 0.7121001390820584, + "grad_norm": 0.10940311849117279, + "learning_rate": 8.132337962402626e-06, + "loss": 0.0034, + "step": 43520 + }, + { + "epoch": 0.7122637650331343, + "grad_norm": 0.03299413621425629, + "learning_rate": 8.131224857130488e-06, + "loss": 0.0025, + "step": 43530 + }, + { + "epoch": 0.71242739098421, + "grad_norm": 0.031098751351237297, + "learning_rate": 8.130111496486306e-06, + "loss": 0.0026, + "step": 43540 + }, + { + "epoch": 0.7125910169352859, + "grad_norm": 0.09025661647319794, + "learning_rate": 8.128997880560878e-06, + "loss": 0.002, + "step": 43550 + }, + { + "epoch": 0.7127546428863618, + "grad_norm": 0.12498236447572708, + "learning_rate": 8.12788400944503e-06, + "loss": 0.0018, + "step": 43560 + }, + { + "epoch": 0.7129182688374376, + "grad_norm": 0.04543954133987427, + "learning_rate": 8.126769883229604e-06, + "loss": 0.0027, + "step": 43570 + }, + { + "epoch": 0.7130818947885135, + "grad_norm": 0.23174482583999634, + "learning_rate": 8.125655502005465e-06, + "loss": 0.0023, + "step": 43580 + }, + { + "epoch": 0.7132455207395892, + "grad_norm": 0.08633371442556381, + "learning_rate": 8.124540865863497e-06, + "loss": 0.0027, + "step": 43590 + }, + { + "epoch": 0.7134091466906651, + "grad_norm": 0.08456216752529144, + "learning_rate": 8.123425974894608e-06, + "loss": 0.0027, + "step": 43600 + }, + { + "epoch": 0.713572772641741, + "grad_norm": 0.07421570271253586, + "learning_rate": 8.122310829189721e-06, + "loss": 0.0044, + "step": 43610 + }, + { + "epoch": 0.7137363985928168, + "grad_norm": 0.14526040852069855, + "learning_rate": 8.121195428839787e-06, + "loss": 0.0027, + "step": 43620 + }, + { + "epoch": 0.7139000245438927, + "grad_norm": 0.052335113286972046, + "learning_rate": 8.120079773935774e-06, + "loss": 0.0041, + "step": 43630 + }, + { + "epoch": 0.7140636504949684, + "grad_norm": 0.03402649611234665, + "learning_rate": 8.118963864568669e-06, + "loss": 0.0025, + "step": 43640 + }, + { + "epoch": 0.7142272764460443, + "grad_norm": 0.015778332948684692, + "learning_rate": 8.117847700829483e-06, + "loss": 0.003, + "step": 43650 + }, + { + "epoch": 0.7143909023971202, + "grad_norm": 0.13444769382476807, + "learning_rate": 8.116731282809245e-06, + "loss": 0.0025, + "step": 43660 + }, + { + "epoch": 0.714554528348196, + "grad_norm": 0.11536898463964462, + "learning_rate": 8.11561461059901e-06, + "loss": 0.0023, + "step": 43670 + }, + { + "epoch": 0.7147181542992719, + "grad_norm": 0.023170851171016693, + "learning_rate": 8.114497684289846e-06, + "loss": 0.0021, + "step": 43680 + }, + { + "epoch": 0.7148817802503477, + "grad_norm": 0.026733066886663437, + "learning_rate": 8.113380503972848e-06, + "loss": 0.0017, + "step": 43690 + }, + { + "epoch": 0.7150454062014235, + "grad_norm": 0.04442456364631653, + "learning_rate": 8.112263069739129e-06, + "loss": 0.0032, + "step": 43700 + }, + { + "epoch": 0.7152090321524994, + "grad_norm": 0.04640334099531174, + "learning_rate": 8.111145381679821e-06, + "loss": 0.0016, + "step": 43710 + }, + { + "epoch": 0.7153726581035752, + "grad_norm": 0.009622717276215553, + "learning_rate": 8.110027439886084e-06, + "loss": 0.0024, + "step": 43720 + }, + { + "epoch": 0.7155362840546511, + "grad_norm": 0.01147310808300972, + "learning_rate": 8.108909244449088e-06, + "loss": 0.0021, + "step": 43730 + }, + { + "epoch": 0.7156999100057269, + "grad_norm": 0.08082813769578934, + "learning_rate": 8.107790795460032e-06, + "loss": 0.0033, + "step": 43740 + }, + { + "epoch": 0.7158635359568027, + "grad_norm": 0.13127946853637695, + "learning_rate": 8.10667209301013e-06, + "loss": 0.002, + "step": 43750 + }, + { + "epoch": 0.7160271619078786, + "grad_norm": 0.11432065814733505, + "learning_rate": 8.105553137190625e-06, + "loss": 0.0024, + "step": 43760 + }, + { + "epoch": 0.7161907878589544, + "grad_norm": 0.07971400767564774, + "learning_rate": 8.104433928092772e-06, + "loss": 0.0013, + "step": 43770 + }, + { + "epoch": 0.7163544138100303, + "grad_norm": 0.1732354462146759, + "learning_rate": 8.103314465807847e-06, + "loss": 0.0031, + "step": 43780 + }, + { + "epoch": 0.7165180397611061, + "grad_norm": 0.09525655955076218, + "learning_rate": 8.102194750427156e-06, + "loss": 0.002, + "step": 43790 + }, + { + "epoch": 0.716681665712182, + "grad_norm": 0.07279553264379501, + "learning_rate": 8.101074782042015e-06, + "loss": 0.0036, + "step": 43800 + }, + { + "epoch": 0.7168452916632578, + "grad_norm": 0.11666157096624374, + "learning_rate": 8.099954560743764e-06, + "loss": 0.003, + "step": 43810 + }, + { + "epoch": 0.7170089176143336, + "grad_norm": 0.025846201926469803, + "learning_rate": 8.098834086623768e-06, + "loss": 0.0023, + "step": 43820 + }, + { + "epoch": 0.7171725435654095, + "grad_norm": 0.072555772960186, + "learning_rate": 8.097713359773405e-06, + "loss": 0.0029, + "step": 43830 + }, + { + "epoch": 0.7173361695164853, + "grad_norm": 0.18477746844291687, + "learning_rate": 8.096592380284083e-06, + "loss": 0.0038, + "step": 43840 + }, + { + "epoch": 0.7174997954675612, + "grad_norm": 0.19694846868515015, + "learning_rate": 8.095471148247218e-06, + "loss": 0.0018, + "step": 43850 + }, + { + "epoch": 0.717663421418637, + "grad_norm": 0.13902299106121063, + "learning_rate": 8.094349663754259e-06, + "loss": 0.0019, + "step": 43860 + }, + { + "epoch": 0.7178270473697128, + "grad_norm": 0.044256895780563354, + "learning_rate": 8.093227926896671e-06, + "loss": 0.0018, + "step": 43870 + }, + { + "epoch": 0.7179906733207887, + "grad_norm": 0.33997824788093567, + "learning_rate": 8.092105937765934e-06, + "loss": 0.0025, + "step": 43880 + }, + { + "epoch": 0.7181542992718645, + "grad_norm": 0.0224887914955616, + "learning_rate": 8.09098369645356e-06, + "loss": 0.0032, + "step": 43890 + }, + { + "epoch": 0.7183179252229404, + "grad_norm": 0.0422448106110096, + "learning_rate": 8.089861203051072e-06, + "loss": 0.0026, + "step": 43900 + }, + { + "epoch": 0.7184815511740162, + "grad_norm": 0.0016009770333766937, + "learning_rate": 8.088738457650016e-06, + "loss": 0.0017, + "step": 43910 + }, + { + "epoch": 0.718645177125092, + "grad_norm": 0.04724181070923805, + "learning_rate": 8.08761546034196e-06, + "loss": 0.0015, + "step": 43920 + }, + { + "epoch": 0.7188088030761679, + "grad_norm": 0.09206213057041168, + "learning_rate": 8.086492211218493e-06, + "loss": 0.0036, + "step": 43930 + }, + { + "epoch": 0.7189724290272437, + "grad_norm": 0.029400011524558067, + "learning_rate": 8.085368710371221e-06, + "loss": 0.001, + "step": 43940 + }, + { + "epoch": 0.7191360549783196, + "grad_norm": 0.04279040917754173, + "learning_rate": 8.084244957891776e-06, + "loss": 0.0019, + "step": 43950 + }, + { + "epoch": 0.7192996809293954, + "grad_norm": 0.025080056861042976, + "learning_rate": 8.083120953871805e-06, + "loss": 0.0015, + "step": 43960 + }, + { + "epoch": 0.7194633068804712, + "grad_norm": 0.10105162858963013, + "learning_rate": 8.081996698402979e-06, + "loss": 0.0026, + "step": 43970 + }, + { + "epoch": 0.7196269328315471, + "grad_norm": 0.08337251842021942, + "learning_rate": 8.080872191576987e-06, + "loss": 0.0024, + "step": 43980 + }, + { + "epoch": 0.7197905587826229, + "grad_norm": 0.0339948907494545, + "learning_rate": 8.079747433485542e-06, + "loss": 0.0011, + "step": 43990 + }, + { + "epoch": 0.7199541847336988, + "grad_norm": 0.20203080773353577, + "learning_rate": 8.078622424220374e-06, + "loss": 0.0069, + "step": 44000 + }, + { + "epoch": 0.7201178106847747, + "grad_norm": 0.11097287386655807, + "learning_rate": 8.077497163873237e-06, + "loss": 0.0019, + "step": 44010 + }, + { + "epoch": 0.7202814366358504, + "grad_norm": 0.017145076766610146, + "learning_rate": 8.0763716525359e-06, + "loss": 0.002, + "step": 44020 + }, + { + "epoch": 0.7204450625869263, + "grad_norm": 0.04417567327618599, + "learning_rate": 8.07524589030016e-06, + "loss": 0.0104, + "step": 44030 + }, + { + "epoch": 0.7206086885380021, + "grad_norm": 0.07366751879453659, + "learning_rate": 8.074119877257824e-06, + "loss": 0.0021, + "step": 44040 + }, + { + "epoch": 0.720772314489078, + "grad_norm": 0.035887643694877625, + "learning_rate": 8.072993613500734e-06, + "loss": 0.0028, + "step": 44050 + }, + { + "epoch": 0.7209359404401539, + "grad_norm": 0.023754918947815895, + "learning_rate": 8.071867099120739e-06, + "loss": 0.0016, + "step": 44060 + }, + { + "epoch": 0.7210995663912296, + "grad_norm": 0.04827277734875679, + "learning_rate": 8.070740334209712e-06, + "loss": 0.0032, + "step": 44070 + }, + { + "epoch": 0.7212631923423055, + "grad_norm": 0.07769101113080978, + "learning_rate": 8.069613318859555e-06, + "loss": 0.0028, + "step": 44080 + }, + { + "epoch": 0.7214268182933813, + "grad_norm": 0.07465600967407227, + "learning_rate": 8.068486053162177e-06, + "loss": 0.0017, + "step": 44090 + }, + { + "epoch": 0.7215904442444572, + "grad_norm": 0.07614905387163162, + "learning_rate": 8.067358537209517e-06, + "loss": 0.003, + "step": 44100 + }, + { + "epoch": 0.7217540701955331, + "grad_norm": 0.05258912220597267, + "learning_rate": 8.066230771093531e-06, + "loss": 0.0026, + "step": 44110 + }, + { + "epoch": 0.7219176961466088, + "grad_norm": 0.04659995064139366, + "learning_rate": 8.065102754906195e-06, + "loss": 0.0027, + "step": 44120 + }, + { + "epoch": 0.7220813220976847, + "grad_norm": 0.022946927696466446, + "learning_rate": 8.063974488739506e-06, + "loss": 0.0019, + "step": 44130 + }, + { + "epoch": 0.7222449480487605, + "grad_norm": 0.0068640816025435925, + "learning_rate": 8.062845972685484e-06, + "loss": 0.0022, + "step": 44140 + }, + { + "epoch": 0.7224085739998364, + "grad_norm": 0.04052237421274185, + "learning_rate": 8.061717206836163e-06, + "loss": 0.0023, + "step": 44150 + }, + { + "epoch": 0.7225721999509123, + "grad_norm": 0.16379417479038239, + "learning_rate": 8.060588191283607e-06, + "loss": 0.0037, + "step": 44160 + }, + { + "epoch": 0.722735825901988, + "grad_norm": 0.10318569093942642, + "learning_rate": 8.059458926119888e-06, + "loss": 0.0028, + "step": 44170 + }, + { + "epoch": 0.7228994518530639, + "grad_norm": 0.019126171246170998, + "learning_rate": 8.05832941143711e-06, + "loss": 0.0015, + "step": 44180 + }, + { + "epoch": 0.7230630778041397, + "grad_norm": 0.02773362398147583, + "learning_rate": 8.05719964732739e-06, + "loss": 0.0025, + "step": 44190 + }, + { + "epoch": 0.7232267037552156, + "grad_norm": 0.05926528945565224, + "learning_rate": 8.056069633882868e-06, + "loss": 0.0021, + "step": 44200 + }, + { + "epoch": 0.7233903297062915, + "grad_norm": 0.06562968343496323, + "learning_rate": 8.054939371195704e-06, + "loss": 0.0019, + "step": 44210 + }, + { + "epoch": 0.7235539556573672, + "grad_norm": 0.0780167430639267, + "learning_rate": 8.053808859358079e-06, + "loss": 0.0016, + "step": 44220 + }, + { + "epoch": 0.7237175816084431, + "grad_norm": 0.05056006833910942, + "learning_rate": 8.052678098462193e-06, + "loss": 0.0023, + "step": 44230 + }, + { + "epoch": 0.7238812075595189, + "grad_norm": 0.032847411930561066, + "learning_rate": 8.051547088600267e-06, + "loss": 0.0025, + "step": 44240 + }, + { + "epoch": 0.7240448335105948, + "grad_norm": 0.05597968399524689, + "learning_rate": 8.050415829864544e-06, + "loss": 0.003, + "step": 44250 + }, + { + "epoch": 0.7242084594616707, + "grad_norm": 0.04562026262283325, + "learning_rate": 8.049284322347284e-06, + "loss": 0.0021, + "step": 44260 + }, + { + "epoch": 0.7243720854127464, + "grad_norm": 0.045441530644893646, + "learning_rate": 8.048152566140768e-06, + "loss": 0.0022, + "step": 44270 + }, + { + "epoch": 0.7245357113638223, + "grad_norm": 0.05194681137800217, + "learning_rate": 8.0470205613373e-06, + "loss": 0.0021, + "step": 44280 + }, + { + "epoch": 0.7246993373148981, + "grad_norm": 0.15032348036766052, + "learning_rate": 8.045888308029204e-06, + "loss": 0.0021, + "step": 44290 + }, + { + "epoch": 0.724862963265974, + "grad_norm": 0.08662296831607819, + "learning_rate": 8.044755806308818e-06, + "loss": 0.0014, + "step": 44300 + }, + { + "epoch": 0.7250265892170499, + "grad_norm": 0.06516364216804504, + "learning_rate": 8.043623056268509e-06, + "loss": 0.0021, + "step": 44310 + }, + { + "epoch": 0.7251902151681257, + "grad_norm": 0.398365318775177, + "learning_rate": 8.042490058000658e-06, + "loss": 0.002, + "step": 44320 + }, + { + "epoch": 0.7253538411192015, + "grad_norm": 0.04737105593085289, + "learning_rate": 8.04135681159767e-06, + "loss": 0.0024, + "step": 44330 + }, + { + "epoch": 0.7255174670702773, + "grad_norm": 0.08958578109741211, + "learning_rate": 8.040223317151967e-06, + "loss": 0.0015, + "step": 44340 + }, + { + "epoch": 0.7256810930213532, + "grad_norm": 0.10113321244716644, + "learning_rate": 8.039089574755996e-06, + "loss": 0.002, + "step": 44350 + }, + { + "epoch": 0.7258447189724291, + "grad_norm": 0.0900040715932846, + "learning_rate": 8.037955584502217e-06, + "loss": 0.0016, + "step": 44360 + }, + { + "epoch": 0.7260083449235049, + "grad_norm": 0.07053272426128387, + "learning_rate": 8.036821346483116e-06, + "loss": 0.002, + "step": 44370 + }, + { + "epoch": 0.7261719708745807, + "grad_norm": 0.032537445425987244, + "learning_rate": 8.0356868607912e-06, + "loss": 0.0012, + "step": 44380 + }, + { + "epoch": 0.7263355968256565, + "grad_norm": 0.15229865908622742, + "learning_rate": 8.034552127518994e-06, + "loss": 0.0028, + "step": 44390 + }, + { + "epoch": 0.7264992227767324, + "grad_norm": 0.14430421590805054, + "learning_rate": 8.033417146759037e-06, + "loss": 0.0016, + "step": 44400 + }, + { + "epoch": 0.7266628487278082, + "grad_norm": 0.07929670810699463, + "learning_rate": 8.0322819186039e-06, + "loss": 0.0038, + "step": 44410 + }, + { + "epoch": 0.7268264746788841, + "grad_norm": 0.0590423084795475, + "learning_rate": 8.031146443146165e-06, + "loss": 0.002, + "step": 44420 + }, + { + "epoch": 0.72699010062996, + "grad_norm": 0.15701057016849518, + "learning_rate": 8.030010720478439e-06, + "loss": 0.0031, + "step": 44430 + }, + { + "epoch": 0.7271537265810357, + "grad_norm": 0.07895903289318085, + "learning_rate": 8.02887475069335e-06, + "loss": 0.002, + "step": 44440 + }, + { + "epoch": 0.7273173525321116, + "grad_norm": 0.021112807095050812, + "learning_rate": 8.027738533883539e-06, + "loss": 0.0028, + "step": 44450 + }, + { + "epoch": 0.7274809784831874, + "grad_norm": 0.1281396895647049, + "learning_rate": 8.026602070141676e-06, + "loss": 0.0021, + "step": 44460 + }, + { + "epoch": 0.7276446044342633, + "grad_norm": 0.0907074436545372, + "learning_rate": 8.025465359560445e-06, + "loss": 0.0044, + "step": 44470 + }, + { + "epoch": 0.7278082303853391, + "grad_norm": 0.11169345676898956, + "learning_rate": 8.024328402232552e-06, + "loss": 0.0014, + "step": 44480 + }, + { + "epoch": 0.7279718563364149, + "grad_norm": 0.10047292709350586, + "learning_rate": 8.023191198250724e-06, + "loss": 0.0019, + "step": 44490 + }, + { + "epoch": 0.7281354822874908, + "grad_norm": 0.09887635707855225, + "learning_rate": 8.02205374770771e-06, + "loss": 0.0019, + "step": 44500 + }, + { + "epoch": 0.7282991082385666, + "grad_norm": 0.07256154716014862, + "learning_rate": 8.020916050696272e-06, + "loss": 0.0025, + "step": 44510 + }, + { + "epoch": 0.7284627341896425, + "grad_norm": 0.08774542808532715, + "learning_rate": 8.0197781073092e-06, + "loss": 0.003, + "step": 44520 + }, + { + "epoch": 0.7286263601407184, + "grad_norm": 0.03413913771510124, + "learning_rate": 8.0186399176393e-06, + "loss": 0.0017, + "step": 44530 + }, + { + "epoch": 0.7287899860917941, + "grad_norm": 0.1782178431749344, + "learning_rate": 8.017501481779398e-06, + "loss": 0.0045, + "step": 44540 + }, + { + "epoch": 0.72895361204287, + "grad_norm": 0.1473647803068161, + "learning_rate": 8.016362799822342e-06, + "loss": 0.0022, + "step": 44550 + }, + { + "epoch": 0.7291172379939458, + "grad_norm": 0.056445784866809845, + "learning_rate": 8.015223871860998e-06, + "loss": 0.0028, + "step": 44560 + }, + { + "epoch": 0.7292808639450217, + "grad_norm": 0.16442395746707916, + "learning_rate": 8.014084697988254e-06, + "loss": 0.0029, + "step": 44570 + }, + { + "epoch": 0.7294444898960976, + "grad_norm": 0.053557731211185455, + "learning_rate": 8.012945278297016e-06, + "loss": 0.003, + "step": 44580 + }, + { + "epoch": 0.7296081158471733, + "grad_norm": 0.14761383831501007, + "learning_rate": 8.011805612880212e-06, + "loss": 0.0031, + "step": 44590 + }, + { + "epoch": 0.7297717417982492, + "grad_norm": 0.062305767089128494, + "learning_rate": 8.01066570183079e-06, + "loss": 0.0028, + "step": 44600 + }, + { + "epoch": 0.729935367749325, + "grad_norm": 0.1019849106669426, + "learning_rate": 8.009525545241717e-06, + "loss": 0.0024, + "step": 44610 + }, + { + "epoch": 0.7300989937004009, + "grad_norm": 0.06322908401489258, + "learning_rate": 8.008385143205978e-06, + "loss": 0.0028, + "step": 44620 + }, + { + "epoch": 0.7302626196514768, + "grad_norm": 0.016959216445684433, + "learning_rate": 8.007244495816586e-06, + "loss": 0.0016, + "step": 44630 + }, + { + "epoch": 0.7304262456025525, + "grad_norm": 0.04674536734819412, + "learning_rate": 8.00610360316656e-06, + "loss": 0.0018, + "step": 44640 + }, + { + "epoch": 0.7305898715536284, + "grad_norm": 0.0854129046201706, + "learning_rate": 8.004962465348956e-06, + "loss": 0.0041, + "step": 44650 + }, + { + "epoch": 0.7307534975047042, + "grad_norm": 0.06493894010782242, + "learning_rate": 8.003821082456835e-06, + "loss": 0.0018, + "step": 44660 + }, + { + "epoch": 0.7309171234557801, + "grad_norm": 0.13204319775104523, + "learning_rate": 8.002679454583285e-06, + "loss": 0.0026, + "step": 44670 + }, + { + "epoch": 0.731080749406856, + "grad_norm": 0.09227117896080017, + "learning_rate": 8.001537581821416e-06, + "loss": 0.0019, + "step": 44680 + }, + { + "epoch": 0.7312443753579317, + "grad_norm": 0.131661519408226, + "learning_rate": 8.000395464264354e-06, + "loss": 0.0018, + "step": 44690 + }, + { + "epoch": 0.7314080013090076, + "grad_norm": 0.3169766068458557, + "learning_rate": 7.999253102005245e-06, + "loss": 0.004, + "step": 44700 + }, + { + "epoch": 0.7315716272600834, + "grad_norm": 0.10875862091779709, + "learning_rate": 7.99811049513726e-06, + "loss": 0.0017, + "step": 44710 + }, + { + "epoch": 0.7317352532111593, + "grad_norm": 0.06453468650579453, + "learning_rate": 7.996967643753581e-06, + "loss": 0.0017, + "step": 44720 + }, + { + "epoch": 0.7318988791622352, + "grad_norm": 0.16648490726947784, + "learning_rate": 7.995824547947419e-06, + "loss": 0.0027, + "step": 44730 + }, + { + "epoch": 0.732062505113311, + "grad_norm": 0.10406239330768585, + "learning_rate": 7.994681207811998e-06, + "loss": 0.0016, + "step": 44740 + }, + { + "epoch": 0.7322261310643868, + "grad_norm": 0.06713445484638214, + "learning_rate": 7.993537623440568e-06, + "loss": 0.0032, + "step": 44750 + }, + { + "epoch": 0.7323897570154626, + "grad_norm": 0.05412828549742699, + "learning_rate": 7.992393794926393e-06, + "loss": 0.0021, + "step": 44760 + }, + { + "epoch": 0.7325533829665385, + "grad_norm": 0.0850588008761406, + "learning_rate": 7.991249722362762e-06, + "loss": 0.0029, + "step": 44770 + }, + { + "epoch": 0.7327170089176144, + "grad_norm": 0.05673776566982269, + "learning_rate": 7.99010540584298e-06, + "loss": 0.0021, + "step": 44780 + }, + { + "epoch": 0.7328806348686902, + "grad_norm": 0.06593542546033859, + "learning_rate": 7.988960845460375e-06, + "loss": 0.0019, + "step": 44790 + }, + { + "epoch": 0.733044260819766, + "grad_norm": 0.10725630074739456, + "learning_rate": 7.987816041308293e-06, + "loss": 0.0017, + "step": 44800 + }, + { + "epoch": 0.7332078867708418, + "grad_norm": 0.06610086560249329, + "learning_rate": 7.9866709934801e-06, + "loss": 0.0012, + "step": 44810 + }, + { + "epoch": 0.7333715127219177, + "grad_norm": 0.20274235308170319, + "learning_rate": 7.985525702069183e-06, + "loss": 0.0017, + "step": 44820 + }, + { + "epoch": 0.7335351386729936, + "grad_norm": 0.12632769346237183, + "learning_rate": 7.984380167168947e-06, + "loss": 0.0031, + "step": 44830 + }, + { + "epoch": 0.7336987646240694, + "grad_norm": 0.13647542893886566, + "learning_rate": 7.983234388872816e-06, + "loss": 0.0023, + "step": 44840 + }, + { + "epoch": 0.7338623905751452, + "grad_norm": 0.10609569400548935, + "learning_rate": 7.98208836727424e-06, + "loss": 0.0028, + "step": 44850 + }, + { + "epoch": 0.734026016526221, + "grad_norm": 0.029697943478822708, + "learning_rate": 7.980942102466682e-06, + "loss": 0.0021, + "step": 44860 + }, + { + "epoch": 0.7341896424772969, + "grad_norm": 0.03464086726307869, + "learning_rate": 7.97979559454363e-06, + "loss": 0.0021, + "step": 44870 + }, + { + "epoch": 0.7343532684283728, + "grad_norm": 0.2556789815425873, + "learning_rate": 7.978648843598586e-06, + "loss": 0.0045, + "step": 44880 + }, + { + "epoch": 0.7345168943794486, + "grad_norm": 0.034051842987537384, + "learning_rate": 7.977501849725076e-06, + "loss": 0.003, + "step": 44890 + }, + { + "epoch": 0.7346805203305244, + "grad_norm": 0.06016731634736061, + "learning_rate": 7.976354613016646e-06, + "loss": 0.0019, + "step": 44900 + }, + { + "epoch": 0.7348441462816002, + "grad_norm": 0.07642786204814911, + "learning_rate": 7.97520713356686e-06, + "loss": 0.0023, + "step": 44910 + }, + { + "epoch": 0.7350077722326761, + "grad_norm": 0.10233766585588455, + "learning_rate": 7.974059411469304e-06, + "loss": 0.0031, + "step": 44920 + }, + { + "epoch": 0.735171398183752, + "grad_norm": 0.03389649838209152, + "learning_rate": 7.972911446817579e-06, + "loss": 0.0033, + "step": 44930 + }, + { + "epoch": 0.7353350241348278, + "grad_norm": 0.12942197918891907, + "learning_rate": 7.971763239705312e-06, + "loss": 0.0024, + "step": 44940 + }, + { + "epoch": 0.7354986500859036, + "grad_norm": 0.024554140865802765, + "learning_rate": 7.970614790226147e-06, + "loss": 0.0015, + "step": 44950 + }, + { + "epoch": 0.7356622760369794, + "grad_norm": 0.012263045646250248, + "learning_rate": 7.969466098473746e-06, + "loss": 0.0014, + "step": 44960 + }, + { + "epoch": 0.7358259019880553, + "grad_norm": 0.06102617084980011, + "learning_rate": 7.968317164541792e-06, + "loss": 0.0029, + "step": 44970 + }, + { + "epoch": 0.7359895279391312, + "grad_norm": 0.06320632994174957, + "learning_rate": 7.96716798852399e-06, + "loss": 0.0023, + "step": 44980 + }, + { + "epoch": 0.736153153890207, + "grad_norm": 0.07558856904506683, + "learning_rate": 7.966018570514063e-06, + "loss": 0.0016, + "step": 44990 + }, + { + "epoch": 0.7363167798412829, + "grad_norm": 0.0770316794514656, + "learning_rate": 7.964868910605751e-06, + "loss": 0.0023, + "step": 45000 + }, + { + "epoch": 0.7364804057923586, + "grad_norm": 0.06283899396657944, + "learning_rate": 7.963719008892818e-06, + "loss": 0.0038, + "step": 45010 + }, + { + "epoch": 0.7366440317434345, + "grad_norm": 0.05875033512711525, + "learning_rate": 7.962568865469048e-06, + "loss": 0.003, + "step": 45020 + }, + { + "epoch": 0.7368076576945104, + "grad_norm": 0.11941047012805939, + "learning_rate": 7.961418480428239e-06, + "loss": 0.0027, + "step": 45030 + }, + { + "epoch": 0.7369712836455862, + "grad_norm": 0.02768949791789055, + "learning_rate": 7.960267853864216e-06, + "loss": 0.0017, + "step": 45040 + }, + { + "epoch": 0.7371349095966621, + "grad_norm": 0.04157964885234833, + "learning_rate": 7.959116985870816e-06, + "loss": 0.0024, + "step": 45050 + }, + { + "epoch": 0.7372985355477378, + "grad_norm": 0.06325796991586685, + "learning_rate": 7.957965876541905e-06, + "loss": 0.0017, + "step": 45060 + }, + { + "epoch": 0.7374621614988137, + "grad_norm": 0.04996092990040779, + "learning_rate": 7.95681452597136e-06, + "loss": 0.0022, + "step": 45070 + }, + { + "epoch": 0.7376257874498896, + "grad_norm": 0.08422041684389114, + "learning_rate": 7.955662934253082e-06, + "loss": 0.0034, + "step": 45080 + }, + { + "epoch": 0.7377894134009654, + "grad_norm": 0.1696813404560089, + "learning_rate": 7.954511101480991e-06, + "loss": 0.0035, + "step": 45090 + }, + { + "epoch": 0.7379530393520413, + "grad_norm": 0.24613326787948608, + "learning_rate": 7.953359027749028e-06, + "loss": 0.0029, + "step": 45100 + }, + { + "epoch": 0.738116665303117, + "grad_norm": 0.13812978565692902, + "learning_rate": 7.95220671315115e-06, + "loss": 0.0022, + "step": 45110 + }, + { + "epoch": 0.7382802912541929, + "grad_norm": 0.056641675531864166, + "learning_rate": 7.951054157781335e-06, + "loss": 0.0031, + "step": 45120 + }, + { + "epoch": 0.7384439172052688, + "grad_norm": 0.22468949854373932, + "learning_rate": 7.949901361733585e-06, + "loss": 0.0035, + "step": 45130 + }, + { + "epoch": 0.7386075431563446, + "grad_norm": 0.06773383170366287, + "learning_rate": 7.948748325101916e-06, + "loss": 0.0018, + "step": 45140 + }, + { + "epoch": 0.7387711691074205, + "grad_norm": 0.10979906469583511, + "learning_rate": 7.947595047980367e-06, + "loss": 0.0012, + "step": 45150 + }, + { + "epoch": 0.7389347950584962, + "grad_norm": 0.06032257899641991, + "learning_rate": 7.946441530462994e-06, + "loss": 0.0011, + "step": 45160 + }, + { + "epoch": 0.7390984210095721, + "grad_norm": 0.019600093364715576, + "learning_rate": 7.945287772643872e-06, + "loss": 0.0024, + "step": 45170 + }, + { + "epoch": 0.739262046960648, + "grad_norm": 0.08960779756307602, + "learning_rate": 7.944133774617105e-06, + "loss": 0.0024, + "step": 45180 + }, + { + "epoch": 0.7394256729117238, + "grad_norm": 0.10188505053520203, + "learning_rate": 7.9429795364768e-06, + "loss": 0.0018, + "step": 45190 + }, + { + "epoch": 0.7395892988627997, + "grad_norm": 0.07792667299509048, + "learning_rate": 7.941825058317098e-06, + "loss": 0.003, + "step": 45200 + }, + { + "epoch": 0.7397529248138754, + "grad_norm": 0.11480510234832764, + "learning_rate": 7.940670340232154e-06, + "loss": 0.005, + "step": 45210 + }, + { + "epoch": 0.7399165507649513, + "grad_norm": 0.07940959930419922, + "learning_rate": 7.93951538231614e-06, + "loss": 0.001, + "step": 45220 + }, + { + "epoch": 0.7400801767160272, + "grad_norm": 0.05966443195939064, + "learning_rate": 7.938360184663254e-06, + "loss": 0.002, + "step": 45230 + }, + { + "epoch": 0.740243802667103, + "grad_norm": 0.10636087507009506, + "learning_rate": 7.93720474736771e-06, + "loss": 0.0024, + "step": 45240 + }, + { + "epoch": 0.7404074286181789, + "grad_norm": 0.17561720311641693, + "learning_rate": 7.936049070523739e-06, + "loss": 0.0021, + "step": 45250 + }, + { + "epoch": 0.7405710545692546, + "grad_norm": 0.044909875839948654, + "learning_rate": 7.934893154225594e-06, + "loss": 0.0025, + "step": 45260 + }, + { + "epoch": 0.7407346805203305, + "grad_norm": 0.09109053015708923, + "learning_rate": 7.93373699856755e-06, + "loss": 0.0025, + "step": 45270 + }, + { + "epoch": 0.7408983064714064, + "grad_norm": 0.05211983621120453, + "learning_rate": 7.932580603643896e-06, + "loss": 0.0013, + "step": 45280 + }, + { + "epoch": 0.7410619324224822, + "grad_norm": 0.09957781434059143, + "learning_rate": 7.931423969548948e-06, + "loss": 0.0029, + "step": 45290 + }, + { + "epoch": 0.7412255583735581, + "grad_norm": 0.06035226210951805, + "learning_rate": 7.930267096377032e-06, + "loss": 0.0012, + "step": 45300 + }, + { + "epoch": 0.7413891843246339, + "grad_norm": 0.15249688923358917, + "learning_rate": 7.929109984222503e-06, + "loss": 0.0044, + "step": 45310 + }, + { + "epoch": 0.7415528102757097, + "grad_norm": 0.16975709795951843, + "learning_rate": 7.927952633179728e-06, + "loss": 0.0036, + "step": 45320 + }, + { + "epoch": 0.7417164362267855, + "grad_norm": 0.028939656913280487, + "learning_rate": 7.926795043343099e-06, + "loss": 0.0038, + "step": 45330 + }, + { + "epoch": 0.7418800621778614, + "grad_norm": 0.0719069316983223, + "learning_rate": 7.925637214807026e-06, + "loss": 0.0019, + "step": 45340 + }, + { + "epoch": 0.7420436881289373, + "grad_norm": 0.024133216589689255, + "learning_rate": 7.924479147665931e-06, + "loss": 0.002, + "step": 45350 + }, + { + "epoch": 0.7422073140800131, + "grad_norm": 0.00805605947971344, + "learning_rate": 7.92332084201427e-06, + "loss": 0.0015, + "step": 45360 + }, + { + "epoch": 0.7423709400310889, + "grad_norm": 0.06430293619632721, + "learning_rate": 7.922162297946507e-06, + "loss": 0.0033, + "step": 45370 + }, + { + "epoch": 0.7425345659821647, + "grad_norm": 0.12935978174209595, + "learning_rate": 7.92100351555713e-06, + "loss": 0.0024, + "step": 45380 + }, + { + "epoch": 0.7426981919332406, + "grad_norm": 0.1596582531929016, + "learning_rate": 7.91984449494064e-06, + "loss": 0.0022, + "step": 45390 + }, + { + "epoch": 0.7428618178843165, + "grad_norm": 0.14239265024662018, + "learning_rate": 7.91868523619157e-06, + "loss": 0.003, + "step": 45400 + }, + { + "epoch": 0.7430254438353923, + "grad_norm": 0.051771629601716995, + "learning_rate": 7.917525739404464e-06, + "loss": 0.0013, + "step": 45410 + }, + { + "epoch": 0.7431890697864681, + "grad_norm": 0.0413055345416069, + "learning_rate": 7.916366004673882e-06, + "loss": 0.0015, + "step": 45420 + }, + { + "epoch": 0.7433526957375439, + "grad_norm": 0.10837849974632263, + "learning_rate": 7.915206032094412e-06, + "loss": 0.0029, + "step": 45430 + }, + { + "epoch": 0.7435163216886198, + "grad_norm": 0.12710237503051758, + "learning_rate": 7.914045821760658e-06, + "loss": 0.0024, + "step": 45440 + }, + { + "epoch": 0.7436799476396957, + "grad_norm": 0.03616471216082573, + "learning_rate": 7.912885373767238e-06, + "loss": 0.0012, + "step": 45450 + }, + { + "epoch": 0.7438435735907715, + "grad_norm": 0.17294131219387054, + "learning_rate": 7.911724688208801e-06, + "loss": 0.0016, + "step": 45460 + }, + { + "epoch": 0.7440071995418474, + "grad_norm": 0.03460134193301201, + "learning_rate": 7.910563765180002e-06, + "loss": 0.001, + "step": 45470 + }, + { + "epoch": 0.7441708254929231, + "grad_norm": 0.11427845805883408, + "learning_rate": 7.909402604775527e-06, + "loss": 0.0022, + "step": 45480 + }, + { + "epoch": 0.744334451443999, + "grad_norm": 0.1683957427740097, + "learning_rate": 7.908241207090073e-06, + "loss": 0.0024, + "step": 45490 + }, + { + "epoch": 0.7444980773950749, + "grad_norm": 0.08027928322553635, + "learning_rate": 7.907079572218361e-06, + "loss": 0.0034, + "step": 45500 + }, + { + "epoch": 0.7446617033461507, + "grad_norm": 0.004896610975265503, + "learning_rate": 7.90591770025513e-06, + "loss": 0.0017, + "step": 45510 + }, + { + "epoch": 0.7448253292972266, + "grad_norm": 0.06502117961645126, + "learning_rate": 7.90475559129514e-06, + "loss": 0.0016, + "step": 45520 + }, + { + "epoch": 0.7449889552483023, + "grad_norm": 0.07437731325626373, + "learning_rate": 7.903593245433162e-06, + "loss": 0.0014, + "step": 45530 + }, + { + "epoch": 0.7451525811993782, + "grad_norm": 0.012837845832109451, + "learning_rate": 7.902430662764002e-06, + "loss": 0.0011, + "step": 45540 + }, + { + "epoch": 0.7453162071504541, + "grad_norm": 0.02074851468205452, + "learning_rate": 7.901267843382472e-06, + "loss": 0.0019, + "step": 45550 + }, + { + "epoch": 0.7454798331015299, + "grad_norm": 0.0744505301117897, + "learning_rate": 7.900104787383407e-06, + "loss": 0.0017, + "step": 45560 + }, + { + "epoch": 0.7456434590526058, + "grad_norm": 0.07938183844089508, + "learning_rate": 7.898941494861661e-06, + "loss": 0.003, + "step": 45570 + }, + { + "epoch": 0.7458070850036815, + "grad_norm": 0.05233943834900856, + "learning_rate": 7.89777796591211e-06, + "loss": 0.002, + "step": 45580 + }, + { + "epoch": 0.7459707109547574, + "grad_norm": 0.007412207778543234, + "learning_rate": 7.896614200629648e-06, + "loss": 0.0046, + "step": 45590 + }, + { + "epoch": 0.7461343369058333, + "grad_norm": 0.08828859031200409, + "learning_rate": 7.895450199109186e-06, + "loss": 0.0015, + "step": 45600 + }, + { + "epoch": 0.7462979628569091, + "grad_norm": 0.048798076808452606, + "learning_rate": 7.89428596144566e-06, + "loss": 0.0024, + "step": 45610 + }, + { + "epoch": 0.746461588807985, + "grad_norm": 0.21522916853427887, + "learning_rate": 7.893121487734015e-06, + "loss": 0.003, + "step": 45620 + }, + { + "epoch": 0.7466252147590607, + "grad_norm": 0.047461673617362976, + "learning_rate": 7.891956778069227e-06, + "loss": 0.002, + "step": 45630 + }, + { + "epoch": 0.7467888407101366, + "grad_norm": 0.05639554187655449, + "learning_rate": 7.890791832546281e-06, + "loss": 0.0032, + "step": 45640 + }, + { + "epoch": 0.7469524666612125, + "grad_norm": 0.31747567653656006, + "learning_rate": 7.88962665126019e-06, + "loss": 0.0038, + "step": 45650 + }, + { + "epoch": 0.7471160926122883, + "grad_norm": 0.023616468533873558, + "learning_rate": 7.88846123430598e-06, + "loss": 0.0015, + "step": 45660 + }, + { + "epoch": 0.7472797185633642, + "grad_norm": 0.1407870650291443, + "learning_rate": 7.8872955817787e-06, + "loss": 0.0017, + "step": 45670 + }, + { + "epoch": 0.74744334451444, + "grad_norm": 0.07562875747680664, + "learning_rate": 7.886129693773416e-06, + "loss": 0.0018, + "step": 45680 + }, + { + "epoch": 0.7476069704655158, + "grad_norm": 0.013998042792081833, + "learning_rate": 7.884963570385214e-06, + "loss": 0.0013, + "step": 45690 + }, + { + "epoch": 0.7477705964165917, + "grad_norm": 0.04817408323287964, + "learning_rate": 7.883797211709196e-06, + "loss": 0.0012, + "step": 45700 + }, + { + "epoch": 0.7479342223676675, + "grad_norm": 0.19806192815303802, + "learning_rate": 7.88263061784049e-06, + "loss": 0.0033, + "step": 45710 + }, + { + "epoch": 0.7480978483187434, + "grad_norm": 0.10142921656370163, + "learning_rate": 7.88146378887424e-06, + "loss": 0.0022, + "step": 45720 + }, + { + "epoch": 0.7482614742698191, + "grad_norm": 0.06292014569044113, + "learning_rate": 7.880296724905605e-06, + "loss": 0.0036, + "step": 45730 + }, + { + "epoch": 0.748425100220895, + "grad_norm": 0.10621743649244308, + "learning_rate": 7.879129426029771e-06, + "loss": 0.0027, + "step": 45740 + }, + { + "epoch": 0.7485887261719709, + "grad_norm": 0.06463120877742767, + "learning_rate": 7.877961892341934e-06, + "loss": 0.0031, + "step": 45750 + }, + { + "epoch": 0.7487523521230467, + "grad_norm": 0.09279613196849823, + "learning_rate": 7.876794123937318e-06, + "loss": 0.0023, + "step": 45760 + }, + { + "epoch": 0.7489159780741226, + "grad_norm": 0.09475941210985184, + "learning_rate": 7.875626120911162e-06, + "loss": 0.0029, + "step": 45770 + }, + { + "epoch": 0.7490796040251984, + "grad_norm": 0.11217482388019562, + "learning_rate": 7.874457883358722e-06, + "loss": 0.0017, + "step": 45780 + }, + { + "epoch": 0.7492432299762742, + "grad_norm": 0.0697907954454422, + "learning_rate": 7.873289411375276e-06, + "loss": 0.0015, + "step": 45790 + }, + { + "epoch": 0.7494068559273501, + "grad_norm": 0.1602737009525299, + "learning_rate": 7.872120705056124e-06, + "loss": 0.0035, + "step": 45800 + }, + { + "epoch": 0.7495704818784259, + "grad_norm": 0.09871604293584824, + "learning_rate": 7.870951764496575e-06, + "loss": 0.0022, + "step": 45810 + }, + { + "epoch": 0.7497341078295018, + "grad_norm": 0.04293878749012947, + "learning_rate": 7.869782589791971e-06, + "loss": 0.0013, + "step": 45820 + }, + { + "epoch": 0.7498977337805776, + "grad_norm": 0.04196852818131447, + "learning_rate": 7.868613181037663e-06, + "loss": 0.0013, + "step": 45830 + }, + { + "epoch": 0.7500613597316534, + "grad_norm": 0.07524515688419342, + "learning_rate": 7.867443538329023e-06, + "loss": 0.003, + "step": 45840 + }, + { + "epoch": 0.7502249856827293, + "grad_norm": 0.07245465368032455, + "learning_rate": 7.866273661761445e-06, + "loss": 0.0018, + "step": 45850 + }, + { + "epoch": 0.7503886116338051, + "grad_norm": 0.06912210583686829, + "learning_rate": 7.865103551430338e-06, + "loss": 0.0019, + "step": 45860 + }, + { + "epoch": 0.750552237584881, + "grad_norm": 0.04246789216995239, + "learning_rate": 7.863933207431133e-06, + "loss": 0.0023, + "step": 45870 + }, + { + "epoch": 0.7507158635359568, + "grad_norm": 0.078510582447052, + "learning_rate": 7.86276262985928e-06, + "loss": 0.0035, + "step": 45880 + }, + { + "epoch": 0.7508794894870326, + "grad_norm": 0.049804095178842545, + "learning_rate": 7.861591818810246e-06, + "loss": 0.0024, + "step": 45890 + }, + { + "epoch": 0.7510431154381085, + "grad_norm": 0.040483035147190094, + "learning_rate": 7.86042077437952e-06, + "loss": 0.003, + "step": 45900 + }, + { + "epoch": 0.7512067413891843, + "grad_norm": 0.09779264032840729, + "learning_rate": 7.859249496662607e-06, + "loss": 0.0057, + "step": 45910 + }, + { + "epoch": 0.7513703673402602, + "grad_norm": 0.2889154851436615, + "learning_rate": 7.858077985755034e-06, + "loss": 0.006, + "step": 45920 + }, + { + "epoch": 0.751533993291336, + "grad_norm": 0.22923921048641205, + "learning_rate": 7.856906241752345e-06, + "loss": 0.003, + "step": 45930 + }, + { + "epoch": 0.7516976192424119, + "grad_norm": 0.04952012747526169, + "learning_rate": 7.855734264750102e-06, + "loss": 0.0035, + "step": 45940 + }, + { + "epoch": 0.7518612451934877, + "grad_norm": 0.3482204079627991, + "learning_rate": 7.854562054843888e-06, + "loss": 0.0019, + "step": 45950 + }, + { + "epoch": 0.7520248711445635, + "grad_norm": 0.08798300474882126, + "learning_rate": 7.853389612129305e-06, + "loss": 0.0037, + "step": 45960 + }, + { + "epoch": 0.7521884970956394, + "grad_norm": 0.05743544176220894, + "learning_rate": 7.852216936701972e-06, + "loss": 0.002, + "step": 45970 + }, + { + "epoch": 0.7523521230467152, + "grad_norm": 0.09211505204439163, + "learning_rate": 7.851044028657532e-06, + "loss": 0.0028, + "step": 45980 + }, + { + "epoch": 0.7525157489977911, + "grad_norm": 0.04254554584622383, + "learning_rate": 7.849870888091642e-06, + "loss": 0.0013, + "step": 45990 + }, + { + "epoch": 0.7526793749488669, + "grad_norm": 0.0790761262178421, + "learning_rate": 7.848697515099976e-06, + "loss": 0.0016, + "step": 46000 + }, + { + "epoch": 0.7528430008999427, + "grad_norm": 0.06166630983352661, + "learning_rate": 7.847523909778234e-06, + "loss": 0.0023, + "step": 46010 + }, + { + "epoch": 0.7530066268510186, + "grad_norm": 0.0754413977265358, + "learning_rate": 7.84635007222213e-06, + "loss": 0.002, + "step": 46020 + }, + { + "epoch": 0.7531702528020944, + "grad_norm": 0.09435712546110153, + "learning_rate": 7.845176002527399e-06, + "loss": 0.0025, + "step": 46030 + }, + { + "epoch": 0.7533338787531703, + "grad_norm": 0.06301447004079819, + "learning_rate": 7.844001700789791e-06, + "loss": 0.0014, + "step": 46040 + }, + { + "epoch": 0.7534975047042461, + "grad_norm": 0.05977576971054077, + "learning_rate": 7.842827167105083e-06, + "loss": 0.003, + "step": 46050 + }, + { + "epoch": 0.7536611306553219, + "grad_norm": 0.06579329073429108, + "learning_rate": 7.841652401569062e-06, + "loss": 0.0025, + "step": 46060 + }, + { + "epoch": 0.7538247566063978, + "grad_norm": 0.05518899857997894, + "learning_rate": 7.84047740427754e-06, + "loss": 0.0021, + "step": 46070 + }, + { + "epoch": 0.7539883825574736, + "grad_norm": 0.15784424543380737, + "learning_rate": 7.839302175326344e-06, + "loss": 0.002, + "step": 46080 + }, + { + "epoch": 0.7541520085085495, + "grad_norm": 0.055536169558763504, + "learning_rate": 7.838126714811323e-06, + "loss": 0.0018, + "step": 46090 + }, + { + "epoch": 0.7543156344596254, + "grad_norm": 0.09416177868843079, + "learning_rate": 7.836951022828343e-06, + "loss": 0.0022, + "step": 46100 + }, + { + "epoch": 0.7544792604107011, + "grad_norm": 0.05534076318144798, + "learning_rate": 7.83577509947329e-06, + "loss": 0.0017, + "step": 46110 + }, + { + "epoch": 0.754642886361777, + "grad_norm": 0.09154891222715378, + "learning_rate": 7.834598944842068e-06, + "loss": 0.0034, + "step": 46120 + }, + { + "epoch": 0.7548065123128528, + "grad_norm": 0.05945158377289772, + "learning_rate": 7.833422559030602e-06, + "loss": 0.0024, + "step": 46130 + }, + { + "epoch": 0.7549701382639287, + "grad_norm": 0.07742059230804443, + "learning_rate": 7.83224594213483e-06, + "loss": 0.0024, + "step": 46140 + }, + { + "epoch": 0.7551337642150046, + "grad_norm": 0.07154424488544464, + "learning_rate": 7.831069094250715e-06, + "loss": 0.0023, + "step": 46150 + }, + { + "epoch": 0.7552973901660803, + "grad_norm": 0.15334156155586243, + "learning_rate": 7.829892015474237e-06, + "loss": 0.0016, + "step": 46160 + }, + { + "epoch": 0.7554610161171562, + "grad_norm": 0.058418042957782745, + "learning_rate": 7.828714705901395e-06, + "loss": 0.0022, + "step": 46170 + }, + { + "epoch": 0.755624642068232, + "grad_norm": 0.06077789515256882, + "learning_rate": 7.827537165628206e-06, + "loss": 0.0017, + "step": 46180 + }, + { + "epoch": 0.7557882680193079, + "grad_norm": 0.08221083134412766, + "learning_rate": 7.826359394750706e-06, + "loss": 0.0011, + "step": 46190 + }, + { + "epoch": 0.7559518939703836, + "grad_norm": 0.03951855003833771, + "learning_rate": 7.825181393364949e-06, + "loss": 0.0026, + "step": 46200 + }, + { + "epoch": 0.7561155199214595, + "grad_norm": 0.09651871770620346, + "learning_rate": 7.82400316156701e-06, + "loss": 0.0015, + "step": 46210 + }, + { + "epoch": 0.7562791458725354, + "grad_norm": 0.0519305020570755, + "learning_rate": 7.82282469945298e-06, + "loss": 0.0013, + "step": 46220 + }, + { + "epoch": 0.7564427718236112, + "grad_norm": 0.049298834055662155, + "learning_rate": 7.821646007118974e-06, + "loss": 0.0018, + "step": 46230 + }, + { + "epoch": 0.7566063977746871, + "grad_norm": 0.058265797793865204, + "learning_rate": 7.820467084661118e-06, + "loss": 0.0027, + "step": 46240 + }, + { + "epoch": 0.7567700237257629, + "grad_norm": 0.12728208303451538, + "learning_rate": 7.819287932175563e-06, + "loss": 0.0033, + "step": 46250 + }, + { + "epoch": 0.7569336496768387, + "grad_norm": 0.05667201802134514, + "learning_rate": 7.818108549758477e-06, + "loss": 0.0028, + "step": 46260 + }, + { + "epoch": 0.7570972756279146, + "grad_norm": 0.06329859048128128, + "learning_rate": 7.816928937506045e-06, + "loss": 0.0048, + "step": 46270 + }, + { + "epoch": 0.7572609015789904, + "grad_norm": 0.039452340453863144, + "learning_rate": 7.815749095514474e-06, + "loss": 0.002, + "step": 46280 + }, + { + "epoch": 0.7574245275300663, + "grad_norm": 0.028742201626300812, + "learning_rate": 7.814569023879985e-06, + "loss": 0.0024, + "step": 46290 + }, + { + "epoch": 0.7575881534811421, + "grad_norm": 0.23006078600883484, + "learning_rate": 7.813388722698823e-06, + "loss": 0.0031, + "step": 46300 + }, + { + "epoch": 0.7577517794322179, + "grad_norm": 0.0632338598370552, + "learning_rate": 7.81220819206725e-06, + "loss": 0.0028, + "step": 46310 + }, + { + "epoch": 0.7579154053832938, + "grad_norm": 0.03981895372271538, + "learning_rate": 7.811027432081543e-06, + "loss": 0.0052, + "step": 46320 + }, + { + "epoch": 0.7580790313343696, + "grad_norm": 0.08559157699346542, + "learning_rate": 7.809846442838003e-06, + "loss": 0.0017, + "step": 46330 + }, + { + "epoch": 0.7582426572854455, + "grad_norm": 0.0455181747674942, + "learning_rate": 7.808665224432948e-06, + "loss": 0.0027, + "step": 46340 + }, + { + "epoch": 0.7584062832365213, + "grad_norm": 0.05995987728238106, + "learning_rate": 7.807483776962712e-06, + "loss": 0.0025, + "step": 46350 + }, + { + "epoch": 0.7585699091875971, + "grad_norm": 0.033405058085918427, + "learning_rate": 7.806302100523653e-06, + "loss": 0.0021, + "step": 46360 + }, + { + "epoch": 0.758733535138673, + "grad_norm": 0.06432349979877472, + "learning_rate": 7.80512019521214e-06, + "loss": 0.0015, + "step": 46370 + }, + { + "epoch": 0.7588971610897488, + "grad_norm": 0.04573766142129898, + "learning_rate": 7.80393806112457e-06, + "loss": 0.0014, + "step": 46380 + }, + { + "epoch": 0.7590607870408247, + "grad_norm": 0.03326824679970741, + "learning_rate": 7.80275569835735e-06, + "loss": 0.0013, + "step": 46390 + }, + { + "epoch": 0.7592244129919005, + "grad_norm": 0.05499792471528053, + "learning_rate": 7.801573107006912e-06, + "loss": 0.0024, + "step": 46400 + }, + { + "epoch": 0.7593880389429764, + "grad_norm": 0.09602683037519455, + "learning_rate": 7.8003902871697e-06, + "loss": 0.0032, + "step": 46410 + }, + { + "epoch": 0.7595516648940522, + "grad_norm": 0.018890516832470894, + "learning_rate": 7.799207238942188e-06, + "loss": 0.0024, + "step": 46420 + }, + { + "epoch": 0.759715290845128, + "grad_norm": 0.1350695937871933, + "learning_rate": 7.798023962420856e-06, + "loss": 0.0035, + "step": 46430 + }, + { + "epoch": 0.7598789167962039, + "grad_norm": 0.08974120020866394, + "learning_rate": 7.79684045770221e-06, + "loss": 0.0021, + "step": 46440 + }, + { + "epoch": 0.7600425427472797, + "grad_norm": 0.052961576730012894, + "learning_rate": 7.79565672488277e-06, + "loss": 0.0041, + "step": 46450 + }, + { + "epoch": 0.7602061686983556, + "grad_norm": 0.0438653938472271, + "learning_rate": 7.794472764059082e-06, + "loss": 0.0022, + "step": 46460 + }, + { + "epoch": 0.7603697946494314, + "grad_norm": 0.0634981170296669, + "learning_rate": 7.7932885753277e-06, + "loss": 0.0023, + "step": 46470 + }, + { + "epoch": 0.7605334206005072, + "grad_norm": 0.1865743100643158, + "learning_rate": 7.792104158785208e-06, + "loss": 0.0034, + "step": 46480 + }, + { + "epoch": 0.7606970465515831, + "grad_norm": 0.07153315842151642, + "learning_rate": 7.7909195145282e-06, + "loss": 0.0029, + "step": 46490 + }, + { + "epoch": 0.7608606725026589, + "grad_norm": 0.07196561247110367, + "learning_rate": 7.789734642653291e-06, + "loss": 0.0017, + "step": 46500 + }, + { + "epoch": 0.7610242984537348, + "grad_norm": 0.04044812172651291, + "learning_rate": 7.788549543257116e-06, + "loss": 0.0022, + "step": 46510 + }, + { + "epoch": 0.7611879244048106, + "grad_norm": 0.07842089235782623, + "learning_rate": 7.787364216436326e-06, + "loss": 0.0017, + "step": 46520 + }, + { + "epoch": 0.7613515503558864, + "grad_norm": 0.19229631125926971, + "learning_rate": 7.786178662287596e-06, + "loss": 0.0041, + "step": 46530 + }, + { + "epoch": 0.7615151763069623, + "grad_norm": 0.07726782560348511, + "learning_rate": 7.784992880907613e-06, + "loss": 0.0028, + "step": 46540 + }, + { + "epoch": 0.7616788022580381, + "grad_norm": 0.04832502454519272, + "learning_rate": 7.783806872393086e-06, + "loss": 0.0015, + "step": 46550 + }, + { + "epoch": 0.761842428209114, + "grad_norm": 0.0372101254761219, + "learning_rate": 7.782620636840743e-06, + "loss": 0.0023, + "step": 46560 + }, + { + "epoch": 0.7620060541601898, + "grad_norm": 0.10387596487998962, + "learning_rate": 7.781434174347327e-06, + "loss": 0.0028, + "step": 46570 + }, + { + "epoch": 0.7621696801112656, + "grad_norm": 0.04422289505600929, + "learning_rate": 7.780247485009604e-06, + "loss": 0.0015, + "step": 46580 + }, + { + "epoch": 0.7623333060623415, + "grad_norm": 0.028675392270088196, + "learning_rate": 7.779060568924355e-06, + "loss": 0.0025, + "step": 46590 + }, + { + "epoch": 0.7624969320134173, + "grad_norm": 0.06437677145004272, + "learning_rate": 7.77787342618838e-06, + "loss": 0.0022, + "step": 46600 + }, + { + "epoch": 0.7626605579644932, + "grad_norm": 0.20517979562282562, + "learning_rate": 7.776686056898501e-06, + "loss": 0.0054, + "step": 46610 + }, + { + "epoch": 0.762824183915569, + "grad_norm": 0.09521616995334625, + "learning_rate": 7.775498461151553e-06, + "loss": 0.0037, + "step": 46620 + }, + { + "epoch": 0.7629878098666448, + "grad_norm": 0.11395671963691711, + "learning_rate": 7.774310639044395e-06, + "loss": 0.002, + "step": 46630 + }, + { + "epoch": 0.7631514358177207, + "grad_norm": 0.21778123080730438, + "learning_rate": 7.7731225906739e-06, + "loss": 0.0025, + "step": 46640 + }, + { + "epoch": 0.7633150617687965, + "grad_norm": 0.15097777545452118, + "learning_rate": 7.77193431613696e-06, + "loss": 0.0015, + "step": 46650 + }, + { + "epoch": 0.7634786877198724, + "grad_norm": 0.032192833721637726, + "learning_rate": 7.77074581553049e-06, + "loss": 0.0024, + "step": 46660 + }, + { + "epoch": 0.7636423136709483, + "grad_norm": 0.1414719969034195, + "learning_rate": 7.769557088951419e-06, + "loss": 0.0016, + "step": 46670 + }, + { + "epoch": 0.763805939622024, + "grad_norm": 0.08889006078243256, + "learning_rate": 7.76836813649669e-06, + "loss": 0.0026, + "step": 46680 + }, + { + "epoch": 0.7639695655730999, + "grad_norm": 0.06315800547599792, + "learning_rate": 7.76717895826328e-06, + "loss": 0.0018, + "step": 46690 + }, + { + "epoch": 0.7641331915241757, + "grad_norm": 0.05180824175477028, + "learning_rate": 7.765989554348166e-06, + "loss": 0.0022, + "step": 46700 + }, + { + "epoch": 0.7642968174752516, + "grad_norm": 0.08959269523620605, + "learning_rate": 7.764799924848354e-06, + "loss": 0.0024, + "step": 46710 + }, + { + "epoch": 0.7644604434263275, + "grad_norm": 0.2625131905078888, + "learning_rate": 7.763610069860869e-06, + "loss": 0.0027, + "step": 46720 + }, + { + "epoch": 0.7646240693774032, + "grad_norm": 0.04443061724305153, + "learning_rate": 7.762419989482748e-06, + "loss": 0.0021, + "step": 46730 + }, + { + "epoch": 0.7647876953284791, + "grad_norm": 0.07516305148601532, + "learning_rate": 7.76122968381105e-06, + "loss": 0.0028, + "step": 46740 + }, + { + "epoch": 0.7649513212795549, + "grad_norm": 0.02677128277719021, + "learning_rate": 7.760039152942856e-06, + "loss": 0.002, + "step": 46750 + }, + { + "epoch": 0.7651149472306308, + "grad_norm": 0.08414687216281891, + "learning_rate": 7.758848396975258e-06, + "loss": 0.0022, + "step": 46760 + }, + { + "epoch": 0.7652785731817067, + "grad_norm": 0.10644156485795975, + "learning_rate": 7.757657416005373e-06, + "loss": 0.003, + "step": 46770 + }, + { + "epoch": 0.7654421991327824, + "grad_norm": 0.20370633900165558, + "learning_rate": 7.756466210130329e-06, + "loss": 0.0038, + "step": 46780 + }, + { + "epoch": 0.7656058250838583, + "grad_norm": 0.18757426738739014, + "learning_rate": 7.75527477944728e-06, + "loss": 0.0029, + "step": 46790 + }, + { + "epoch": 0.7657694510349341, + "grad_norm": 0.2388020008802414, + "learning_rate": 7.754083124053394e-06, + "loss": 0.002, + "step": 46800 + }, + { + "epoch": 0.76593307698601, + "grad_norm": 0.2437874674797058, + "learning_rate": 7.752891244045859e-06, + "loss": 0.0017, + "step": 46810 + }, + { + "epoch": 0.7660967029370859, + "grad_norm": 0.04364819452166557, + "learning_rate": 7.75169913952188e-06, + "loss": 0.0038, + "step": 46820 + }, + { + "epoch": 0.7662603288881616, + "grad_norm": 0.1228971779346466, + "learning_rate": 7.750506810578682e-06, + "loss": 0.002, + "step": 46830 + }, + { + "epoch": 0.7664239548392375, + "grad_norm": 0.09557469934225082, + "learning_rate": 7.749314257313506e-06, + "loss": 0.004, + "step": 46840 + }, + { + "epoch": 0.7665875807903133, + "grad_norm": 0.0702197328209877, + "learning_rate": 7.748121479823614e-06, + "loss": 0.0014, + "step": 46850 + }, + { + "epoch": 0.7667512067413892, + "grad_norm": 0.06272779405117035, + "learning_rate": 7.746928478206283e-06, + "loss": 0.0018, + "step": 46860 + }, + { + "epoch": 0.7669148326924651, + "grad_norm": 0.09017270058393478, + "learning_rate": 7.745735252558811e-06, + "loss": 0.0025, + "step": 46870 + }, + { + "epoch": 0.7670784586435409, + "grad_norm": 0.10913238674402237, + "learning_rate": 7.744541802978514e-06, + "loss": 0.0021, + "step": 46880 + }, + { + "epoch": 0.7672420845946167, + "grad_norm": 0.1820211559534073, + "learning_rate": 7.743348129562724e-06, + "loss": 0.002, + "step": 46890 + }, + { + "epoch": 0.7674057105456925, + "grad_norm": 0.12953536212444305, + "learning_rate": 7.742154232408796e-06, + "loss": 0.0028, + "step": 46900 + }, + { + "epoch": 0.7675693364967684, + "grad_norm": 0.03876791149377823, + "learning_rate": 7.740960111614097e-06, + "loss": 0.0026, + "step": 46910 + }, + { + "epoch": 0.7677329624478443, + "grad_norm": 0.09419603645801544, + "learning_rate": 7.73976576727602e-06, + "loss": 0.002, + "step": 46920 + }, + { + "epoch": 0.76789658839892, + "grad_norm": 0.08967334777116776, + "learning_rate": 7.738571199491965e-06, + "loss": 0.0027, + "step": 46930 + }, + { + "epoch": 0.7680602143499959, + "grad_norm": 0.11530376225709915, + "learning_rate": 7.737376408359362e-06, + "loss": 0.0036, + "step": 46940 + }, + { + "epoch": 0.7682238403010717, + "grad_norm": 0.02665570192039013, + "learning_rate": 7.736181393975653e-06, + "loss": 0.0029, + "step": 46950 + }, + { + "epoch": 0.7683874662521476, + "grad_norm": 0.08262048661708832, + "learning_rate": 7.734986156438296e-06, + "loss": 0.0037, + "step": 46960 + }, + { + "epoch": 0.7685510922032235, + "grad_norm": 0.06585193425416946, + "learning_rate": 7.733790695844776e-06, + "loss": 0.0017, + "step": 46970 + }, + { + "epoch": 0.7687147181542993, + "grad_norm": 0.06354914605617523, + "learning_rate": 7.732595012292587e-06, + "loss": 0.0015, + "step": 46980 + }, + { + "epoch": 0.7688783441053751, + "grad_norm": 0.20126089453697205, + "learning_rate": 7.731399105879246e-06, + "loss": 0.0016, + "step": 46990 + }, + { + "epoch": 0.7690419700564509, + "grad_norm": 0.07253695279359818, + "learning_rate": 7.730202976702288e-06, + "loss": 0.0028, + "step": 47000 + }, + { + "epoch": 0.7692055960075268, + "grad_norm": 0.025956712663173676, + "learning_rate": 7.729006624859263e-06, + "loss": 0.0018, + "step": 47010 + }, + { + "epoch": 0.7693692219586027, + "grad_norm": 0.018495842814445496, + "learning_rate": 7.727810050447742e-06, + "loss": 0.0015, + "step": 47020 + }, + { + "epoch": 0.7695328479096785, + "grad_norm": 0.07379436492919922, + "learning_rate": 7.726613253565315e-06, + "loss": 0.0031, + "step": 47030 + }, + { + "epoch": 0.7696964738607543, + "grad_norm": 0.02228795364499092, + "learning_rate": 7.725416234309589e-06, + "loss": 0.0021, + "step": 47040 + }, + { + "epoch": 0.7698600998118301, + "grad_norm": 0.2120303362607956, + "learning_rate": 7.724218992778185e-06, + "loss": 0.0063, + "step": 47050 + }, + { + "epoch": 0.770023725762906, + "grad_norm": 0.050658971071243286, + "learning_rate": 7.72302152906875e-06, + "loss": 0.0016, + "step": 47060 + }, + { + "epoch": 0.7701873517139818, + "grad_norm": 0.03249619901180267, + "learning_rate": 7.721823843278944e-06, + "loss": 0.0034, + "step": 47070 + }, + { + "epoch": 0.7703509776650577, + "grad_norm": 0.031035101041197777, + "learning_rate": 7.720625935506445e-06, + "loss": 0.0015, + "step": 47080 + }, + { + "epoch": 0.7705146036161336, + "grad_norm": 0.12945032119750977, + "learning_rate": 7.719427805848952e-06, + "loss": 0.0022, + "step": 47090 + }, + { + "epoch": 0.7706782295672093, + "grad_norm": 0.04619685932993889, + "learning_rate": 7.718229454404178e-06, + "loss": 0.0027, + "step": 47100 + }, + { + "epoch": 0.7708418555182852, + "grad_norm": 0.01806194894015789, + "learning_rate": 7.717030881269858e-06, + "loss": 0.0021, + "step": 47110 + }, + { + "epoch": 0.771005481469361, + "grad_norm": 0.06549072265625, + "learning_rate": 7.715832086543744e-06, + "loss": 0.0017, + "step": 47120 + }, + { + "epoch": 0.7711691074204369, + "grad_norm": 0.021185988560318947, + "learning_rate": 7.714633070323607e-06, + "loss": 0.0041, + "step": 47130 + }, + { + "epoch": 0.7713327333715128, + "grad_norm": 0.17615261673927307, + "learning_rate": 7.71343383270723e-06, + "loss": 0.0024, + "step": 47140 + }, + { + "epoch": 0.7714963593225885, + "grad_norm": 0.07218621671199799, + "learning_rate": 7.712234373792423e-06, + "loss": 0.0027, + "step": 47150 + }, + { + "epoch": 0.7716599852736644, + "grad_norm": 0.10249637812376022, + "learning_rate": 7.711034693677008e-06, + "loss": 0.002, + "step": 47160 + }, + { + "epoch": 0.7718236112247402, + "grad_norm": 0.061467766761779785, + "learning_rate": 7.709834792458826e-06, + "loss": 0.0016, + "step": 47170 + }, + { + "epoch": 0.7719872371758161, + "grad_norm": 0.08059611171483994, + "learning_rate": 7.70863467023574e-06, + "loss": 0.0017, + "step": 47180 + }, + { + "epoch": 0.772150863126892, + "grad_norm": 0.20557264983654022, + "learning_rate": 7.707434327105625e-06, + "loss": 0.0031, + "step": 47190 + }, + { + "epoch": 0.7723144890779677, + "grad_norm": 0.11062387377023697, + "learning_rate": 7.706233763166377e-06, + "loss": 0.0033, + "step": 47200 + }, + { + "epoch": 0.7724781150290436, + "grad_norm": 0.05701605603098869, + "learning_rate": 7.70503297851591e-06, + "loss": 0.0016, + "step": 47210 + }, + { + "epoch": 0.7726417409801194, + "grad_norm": 0.06543687731027603, + "learning_rate": 7.703831973252158e-06, + "loss": 0.0024, + "step": 47220 + }, + { + "epoch": 0.7728053669311953, + "grad_norm": 0.4352649748325348, + "learning_rate": 7.70263074747307e-06, + "loss": 0.0029, + "step": 47230 + }, + { + "epoch": 0.7729689928822712, + "grad_norm": 0.3466147780418396, + "learning_rate": 7.701429301276612e-06, + "loss": 0.0023, + "step": 47240 + }, + { + "epoch": 0.7731326188333469, + "grad_norm": 0.1285238116979599, + "learning_rate": 7.700227634760773e-06, + "loss": 0.0027, + "step": 47250 + }, + { + "epoch": 0.7732962447844228, + "grad_norm": 0.10287115722894669, + "learning_rate": 7.699025748023553e-06, + "loss": 0.0012, + "step": 47260 + }, + { + "epoch": 0.7734598707354986, + "grad_norm": 0.1444256603717804, + "learning_rate": 7.697823641162978e-06, + "loss": 0.0039, + "step": 47270 + }, + { + "epoch": 0.7736234966865745, + "grad_norm": 0.06911500543355942, + "learning_rate": 7.696621314277083e-06, + "loss": 0.002, + "step": 47280 + }, + { + "epoch": 0.7737871226376504, + "grad_norm": 0.032964158803224564, + "learning_rate": 7.695418767463931e-06, + "loss": 0.0031, + "step": 47290 + }, + { + "epoch": 0.7739507485887261, + "grad_norm": 0.014431367628276348, + "learning_rate": 7.694216000821592e-06, + "loss": 0.0022, + "step": 47300 + }, + { + "epoch": 0.774114374539802, + "grad_norm": 0.04793722182512283, + "learning_rate": 7.693013014448166e-06, + "loss": 0.0015, + "step": 47310 + }, + { + "epoch": 0.7742780004908778, + "grad_norm": 0.10975118726491928, + "learning_rate": 7.691809808441758e-06, + "loss": 0.0019, + "step": 47320 + }, + { + "epoch": 0.7744416264419537, + "grad_norm": 0.12805026769638062, + "learning_rate": 7.6906063829005e-06, + "loss": 0.0023, + "step": 47330 + }, + { + "epoch": 0.7746052523930296, + "grad_norm": 0.04912872612476349, + "learning_rate": 7.689402737922542e-06, + "loss": 0.0024, + "step": 47340 + }, + { + "epoch": 0.7747688783441053, + "grad_norm": 0.06140722706913948, + "learning_rate": 7.688198873606046e-06, + "loss": 0.0018, + "step": 47350 + }, + { + "epoch": 0.7749325042951812, + "grad_norm": 0.09005114436149597, + "learning_rate": 7.686994790049197e-06, + "loss": 0.0018, + "step": 47360 + }, + { + "epoch": 0.775096130246257, + "grad_norm": 0.14339587092399597, + "learning_rate": 7.685790487350194e-06, + "loss": 0.002, + "step": 47370 + }, + { + "epoch": 0.7752597561973329, + "grad_norm": 0.16206015646457672, + "learning_rate": 7.684585965607255e-06, + "loss": 0.0017, + "step": 47380 + }, + { + "epoch": 0.7754233821484088, + "grad_norm": 0.058691803365945816, + "learning_rate": 7.68338122491862e-06, + "loss": 0.0015, + "step": 47390 + }, + { + "epoch": 0.7755870080994846, + "grad_norm": 0.04804212599992752, + "learning_rate": 7.682176265382541e-06, + "loss": 0.0016, + "step": 47400 + }, + { + "epoch": 0.7757506340505604, + "grad_norm": 0.04845782741904259, + "learning_rate": 7.680971087097293e-06, + "loss": 0.0033, + "step": 47410 + }, + { + "epoch": 0.7759142600016362, + "grad_norm": 0.10536957532167435, + "learning_rate": 7.679765690161165e-06, + "loss": 0.0017, + "step": 47420 + }, + { + "epoch": 0.7760778859527121, + "grad_norm": 0.06472755968570709, + "learning_rate": 7.678560074672461e-06, + "loss": 0.0017, + "step": 47430 + }, + { + "epoch": 0.776241511903788, + "grad_norm": 0.19850881397724152, + "learning_rate": 7.677354240729514e-06, + "loss": 0.0034, + "step": 47440 + }, + { + "epoch": 0.7764051378548638, + "grad_norm": 0.07329036295413971, + "learning_rate": 7.676148188430664e-06, + "loss": 0.0019, + "step": 47450 + }, + { + "epoch": 0.7765687638059396, + "grad_norm": 0.12139184772968292, + "learning_rate": 7.67494191787427e-06, + "loss": 0.0037, + "step": 47460 + }, + { + "epoch": 0.7767323897570154, + "grad_norm": 0.04895130172371864, + "learning_rate": 7.673735429158717e-06, + "loss": 0.0035, + "step": 47470 + }, + { + "epoch": 0.7768960157080913, + "grad_norm": 0.1295814961194992, + "learning_rate": 7.672528722382398e-06, + "loss": 0.0018, + "step": 47480 + }, + { + "epoch": 0.7770596416591672, + "grad_norm": 0.11648079752922058, + "learning_rate": 7.67132179764373e-06, + "loss": 0.0032, + "step": 47490 + }, + { + "epoch": 0.777223267610243, + "grad_norm": 0.13318321108818054, + "learning_rate": 7.670114655041144e-06, + "loss": 0.0016, + "step": 47500 + }, + { + "epoch": 0.7773868935613188, + "grad_norm": 0.024613745510578156, + "learning_rate": 7.668907294673092e-06, + "loss": 0.0019, + "step": 47510 + }, + { + "epoch": 0.7775505195123946, + "grad_norm": 0.07957175374031067, + "learning_rate": 7.66769971663804e-06, + "loss": 0.0017, + "step": 47520 + }, + { + "epoch": 0.7777141454634705, + "grad_norm": 0.07746037095785141, + "learning_rate": 7.666491921034478e-06, + "loss": 0.0025, + "step": 47530 + }, + { + "epoch": 0.7778777714145464, + "grad_norm": 0.15120545029640198, + "learning_rate": 7.665283907960906e-06, + "loss": 0.0045, + "step": 47540 + }, + { + "epoch": 0.7780413973656222, + "grad_norm": 0.04857835918664932, + "learning_rate": 7.664075677515845e-06, + "loss": 0.0019, + "step": 47550 + }, + { + "epoch": 0.778205023316698, + "grad_norm": 0.07643720507621765, + "learning_rate": 7.662867229797837e-06, + "loss": 0.0024, + "step": 47560 + }, + { + "epoch": 0.7783686492677738, + "grad_norm": 0.07395889610052109, + "learning_rate": 7.661658564905437e-06, + "loss": 0.0024, + "step": 47570 + }, + { + "epoch": 0.7785322752188497, + "grad_norm": 0.06251849979162216, + "learning_rate": 7.660449682937222e-06, + "loss": 0.0029, + "step": 47580 + }, + { + "epoch": 0.7786959011699256, + "grad_norm": 0.08747665584087372, + "learning_rate": 7.65924058399178e-06, + "loss": 0.0027, + "step": 47590 + }, + { + "epoch": 0.7788595271210014, + "grad_norm": 0.0925385132431984, + "learning_rate": 7.658031268167724e-06, + "loss": 0.0014, + "step": 47600 + }, + { + "epoch": 0.7790231530720773, + "grad_norm": 0.04613550752401352, + "learning_rate": 7.656821735563683e-06, + "loss": 0.0013, + "step": 47610 + }, + { + "epoch": 0.779186779023153, + "grad_norm": 0.1335994005203247, + "learning_rate": 7.655611986278298e-06, + "loss": 0.0028, + "step": 47620 + }, + { + "epoch": 0.7793504049742289, + "grad_norm": 0.09227462857961655, + "learning_rate": 7.654402020410236e-06, + "loss": 0.0016, + "step": 47630 + }, + { + "epoch": 0.7795140309253048, + "grad_norm": 0.10981418937444687, + "learning_rate": 7.653191838058176e-06, + "loss": 0.002, + "step": 47640 + }, + { + "epoch": 0.7796776568763806, + "grad_norm": 0.4968116879463196, + "learning_rate": 7.651981439320816e-06, + "loss": 0.0022, + "step": 47650 + }, + { + "epoch": 0.7798412828274565, + "grad_norm": 0.12973618507385254, + "learning_rate": 7.650770824296873e-06, + "loss": 0.0019, + "step": 47660 + }, + { + "epoch": 0.7800049087785322, + "grad_norm": 0.07794830203056335, + "learning_rate": 7.649559993085083e-06, + "loss": 0.0023, + "step": 47670 + }, + { + "epoch": 0.7801685347296081, + "grad_norm": 0.03202817216515541, + "learning_rate": 7.648348945784193e-06, + "loss": 0.0034, + "step": 47680 + }, + { + "epoch": 0.780332160680684, + "grad_norm": 0.09393087774515152, + "learning_rate": 7.647137682492972e-06, + "loss": 0.0024, + "step": 47690 + }, + { + "epoch": 0.7804957866317598, + "grad_norm": 0.1074301078915596, + "learning_rate": 7.645926203310208e-06, + "loss": 0.0017, + "step": 47700 + }, + { + "epoch": 0.7806594125828357, + "grad_norm": 0.15869303047657013, + "learning_rate": 7.644714508334705e-06, + "loss": 0.0036, + "step": 47710 + }, + { + "epoch": 0.7808230385339114, + "grad_norm": 0.06684724241495132, + "learning_rate": 7.643502597665285e-06, + "loss": 0.0027, + "step": 47720 + }, + { + "epoch": 0.7809866644849873, + "grad_norm": 0.09461407363414764, + "learning_rate": 7.642290471400788e-06, + "loss": 0.0027, + "step": 47730 + }, + { + "epoch": 0.7811502904360632, + "grad_norm": 0.0623343363404274, + "learning_rate": 7.64107812964007e-06, + "loss": 0.0041, + "step": 47740 + }, + { + "epoch": 0.781313916387139, + "grad_norm": 0.036193910986185074, + "learning_rate": 7.639865572482004e-06, + "loss": 0.0011, + "step": 47750 + }, + { + "epoch": 0.7814775423382149, + "grad_norm": 0.07433672249317169, + "learning_rate": 7.638652800025484e-06, + "loss": 0.0027, + "step": 47760 + }, + { + "epoch": 0.7816411682892906, + "grad_norm": 0.038148537278175354, + "learning_rate": 7.63743981236942e-06, + "loss": 0.0032, + "step": 47770 + }, + { + "epoch": 0.7818047942403665, + "grad_norm": 0.1260686218738556, + "learning_rate": 7.636226609612739e-06, + "loss": 0.0049, + "step": 47780 + }, + { + "epoch": 0.7819684201914424, + "grad_norm": 0.059843964874744415, + "learning_rate": 7.635013191854383e-06, + "loss": 0.0014, + "step": 47790 + }, + { + "epoch": 0.7821320461425182, + "grad_norm": 0.14283470809459686, + "learning_rate": 7.633799559193317e-06, + "loss": 0.0038, + "step": 47800 + }, + { + "epoch": 0.7822956720935941, + "grad_norm": 0.14687062799930573, + "learning_rate": 7.63258571172852e-06, + "loss": 0.0037, + "step": 47810 + }, + { + "epoch": 0.7824592980446698, + "grad_norm": 0.025399206206202507, + "learning_rate": 7.631371649558988e-06, + "loss": 0.003, + "step": 47820 + }, + { + "epoch": 0.7826229239957457, + "grad_norm": 0.057816196233034134, + "learning_rate": 7.630157372783738e-06, + "loss": 0.0011, + "step": 47830 + }, + { + "epoch": 0.7827865499468216, + "grad_norm": 0.06773620843887329, + "learning_rate": 7.628942881501802e-06, + "loss": 0.0026, + "step": 47840 + }, + { + "epoch": 0.7829501758978974, + "grad_norm": 0.039248026907444, + "learning_rate": 7.627728175812228e-06, + "loss": 0.002, + "step": 47850 + }, + { + "epoch": 0.7831138018489733, + "grad_norm": 0.07992345094680786, + "learning_rate": 7.626513255814085e-06, + "loss": 0.0021, + "step": 47860 + }, + { + "epoch": 0.783277427800049, + "grad_norm": 0.11569804698228836, + "learning_rate": 7.625298121606457e-06, + "loss": 0.0029, + "step": 47870 + }, + { + "epoch": 0.7834410537511249, + "grad_norm": 0.05941597372293472, + "learning_rate": 7.624082773288446e-06, + "loss": 0.0025, + "step": 47880 + }, + { + "epoch": 0.7836046797022008, + "grad_norm": 0.19366200268268585, + "learning_rate": 7.622867210959171e-06, + "loss": 0.0014, + "step": 47890 + }, + { + "epoch": 0.7837683056532766, + "grad_norm": 0.09540849924087524, + "learning_rate": 7.6216514347177715e-06, + "loss": 0.0023, + "step": 47900 + }, + { + "epoch": 0.7839319316043525, + "grad_norm": 0.0654233768582344, + "learning_rate": 7.6204354446634e-06, + "loss": 0.0022, + "step": 47910 + }, + { + "epoch": 0.7840955575554283, + "grad_norm": 0.029068201780319214, + "learning_rate": 7.6192192408952284e-06, + "loss": 0.0018, + "step": 47920 + }, + { + "epoch": 0.7842591835065041, + "grad_norm": 0.036160003393888474, + "learning_rate": 7.618002823512447e-06, + "loss": 0.0023, + "step": 47930 + }, + { + "epoch": 0.7844228094575799, + "grad_norm": 0.05089482292532921, + "learning_rate": 7.616786192614264e-06, + "loss": 0.0057, + "step": 47940 + }, + { + "epoch": 0.7845864354086558, + "grad_norm": 0.06993527710437775, + "learning_rate": 7.6155693482999e-06, + "loss": 0.0035, + "step": 47950 + }, + { + "epoch": 0.7847500613597317, + "grad_norm": 0.021343685686588287, + "learning_rate": 7.614352290668601e-06, + "loss": 0.0018, + "step": 47960 + }, + { + "epoch": 0.7849136873108075, + "grad_norm": 0.04830536991357803, + "learning_rate": 7.613135019819623e-06, + "loss": 0.0029, + "step": 47970 + }, + { + "epoch": 0.7850773132618833, + "grad_norm": 0.24141362309455872, + "learning_rate": 7.6119175358522436e-06, + "loss": 0.0022, + "step": 47980 + }, + { + "epoch": 0.7852409392129591, + "grad_norm": 0.0946425348520279, + "learning_rate": 7.610699838865756e-06, + "loss": 0.0027, + "step": 47990 + }, + { + "epoch": 0.785404565164035, + "grad_norm": 0.08040356636047363, + "learning_rate": 7.609481928959473e-06, + "loss": 0.003, + "step": 48000 + }, + { + "epoch": 0.7855681911151109, + "grad_norm": 0.1269335299730301, + "learning_rate": 7.6082638062327205e-06, + "loss": 0.0023, + "step": 48010 + }, + { + "epoch": 0.7857318170661867, + "grad_norm": 0.04364291951060295, + "learning_rate": 7.607045470784847e-06, + "loss": 0.002, + "step": 48020 + }, + { + "epoch": 0.7858954430172626, + "grad_norm": 0.21546050906181335, + "learning_rate": 7.605826922715216e-06, + "loss": 0.0026, + "step": 48030 + }, + { + "epoch": 0.7860590689683383, + "grad_norm": 0.06948831677436829, + "learning_rate": 7.604608162123205e-06, + "loss": 0.0031, + "step": 48040 + }, + { + "epoch": 0.7862226949194142, + "grad_norm": 0.045591115951538086, + "learning_rate": 7.603389189108215e-06, + "loss": 0.0016, + "step": 48050 + }, + { + "epoch": 0.7863863208704901, + "grad_norm": 0.03939118608832359, + "learning_rate": 7.60217000376966e-06, + "loss": 0.0011, + "step": 48060 + }, + { + "epoch": 0.7865499468215659, + "grad_norm": 0.1619071215391159, + "learning_rate": 7.600950606206971e-06, + "loss": 0.0027, + "step": 48070 + }, + { + "epoch": 0.7867135727726418, + "grad_norm": 0.06290711462497711, + "learning_rate": 7.5997309965196035e-06, + "loss": 0.002, + "step": 48080 + }, + { + "epoch": 0.7868771987237175, + "grad_norm": 0.07672736048698425, + "learning_rate": 7.5985111748070185e-06, + "loss": 0.0016, + "step": 48090 + }, + { + "epoch": 0.7870408246747934, + "grad_norm": 0.07655113190412521, + "learning_rate": 7.597291141168704e-06, + "loss": 0.002, + "step": 48100 + }, + { + "epoch": 0.7872044506258693, + "grad_norm": 0.08286347985267639, + "learning_rate": 7.59607089570416e-06, + "loss": 0.0014, + "step": 48110 + }, + { + "epoch": 0.7873680765769451, + "grad_norm": 0.04497053474187851, + "learning_rate": 7.594850438512906e-06, + "loss": 0.0024, + "step": 48120 + }, + { + "epoch": 0.787531702528021, + "grad_norm": 0.0022733251098543406, + "learning_rate": 7.593629769694479e-06, + "loss": 0.0016, + "step": 48130 + }, + { + "epoch": 0.7876953284790967, + "grad_norm": 0.15311624109745026, + "learning_rate": 7.592408889348433e-06, + "loss": 0.0018, + "step": 48140 + }, + { + "epoch": 0.7878589544301726, + "grad_norm": 0.05552238970994949, + "learning_rate": 7.591187797574337e-06, + "loss": 0.0021, + "step": 48150 + }, + { + "epoch": 0.7880225803812485, + "grad_norm": 0.09556196630001068, + "learning_rate": 7.589966494471781e-06, + "loss": 0.0019, + "step": 48160 + }, + { + "epoch": 0.7881862063323243, + "grad_norm": 0.1586340218782425, + "learning_rate": 7.58874498014037e-06, + "loss": 0.0025, + "step": 48170 + }, + { + "epoch": 0.7883498322834002, + "grad_norm": 0.0378156341612339, + "learning_rate": 7.587523254679725e-06, + "loss": 0.0024, + "step": 48180 + }, + { + "epoch": 0.7885134582344759, + "grad_norm": 0.04207262396812439, + "learning_rate": 7.5863013181894885e-06, + "loss": 0.002, + "step": 48190 + }, + { + "epoch": 0.7886770841855518, + "grad_norm": 0.02107994630932808, + "learning_rate": 7.5850791707693135e-06, + "loss": 0.0038, + "step": 48200 + }, + { + "epoch": 0.7888407101366277, + "grad_norm": 0.08851795643568039, + "learning_rate": 7.58385681251888e-06, + "loss": 0.0027, + "step": 48210 + }, + { + "epoch": 0.7890043360877035, + "grad_norm": 0.053407587110996246, + "learning_rate": 7.582634243537872e-06, + "loss": 0.0012, + "step": 48220 + }, + { + "epoch": 0.7891679620387794, + "grad_norm": 0.014489405788481236, + "learning_rate": 7.581411463926004e-06, + "loss": 0.0025, + "step": 48230 + }, + { + "epoch": 0.7893315879898551, + "grad_norm": 0.11791495978832245, + "learning_rate": 7.580188473782999e-06, + "loss": 0.0023, + "step": 48240 + }, + { + "epoch": 0.789495213940931, + "grad_norm": 0.0511234886944294, + "learning_rate": 7.578965273208601e-06, + "loss": 0.0021, + "step": 48250 + }, + { + "epoch": 0.7896588398920069, + "grad_norm": 0.010010137222707272, + "learning_rate": 7.57774186230257e-06, + "loss": 0.0036, + "step": 48260 + }, + { + "epoch": 0.7898224658430827, + "grad_norm": 0.06014297530055046, + "learning_rate": 7.576518241164683e-06, + "loss": 0.0028, + "step": 48270 + }, + { + "epoch": 0.7899860917941586, + "grad_norm": 0.06189087778329849, + "learning_rate": 7.575294409894733e-06, + "loss": 0.0019, + "step": 48280 + }, + { + "epoch": 0.7901497177452343, + "grad_norm": 0.07456030696630478, + "learning_rate": 7.574070368592534e-06, + "loss": 0.0017, + "step": 48290 + }, + { + "epoch": 0.7903133436963102, + "grad_norm": 0.19125349819660187, + "learning_rate": 7.572846117357914e-06, + "loss": 0.0035, + "step": 48300 + }, + { + "epoch": 0.7904769696473861, + "grad_norm": 0.011991233564913273, + "learning_rate": 7.571621656290717e-06, + "loss": 0.0016, + "step": 48310 + }, + { + "epoch": 0.7906405955984619, + "grad_norm": 0.1326286643743515, + "learning_rate": 7.570396985490808e-06, + "loss": 0.0032, + "step": 48320 + }, + { + "epoch": 0.7908042215495378, + "grad_norm": 0.035251643508672714, + "learning_rate": 7.569172105058064e-06, + "loss": 0.0023, + "step": 48330 + }, + { + "epoch": 0.7909678475006136, + "grad_norm": 0.03234274312853813, + "learning_rate": 7.5679470150923876e-06, + "loss": 0.0025, + "step": 48340 + }, + { + "epoch": 0.7911314734516894, + "grad_norm": 0.08079449087381363, + "learning_rate": 7.566721715693688e-06, + "loss": 0.0036, + "step": 48350 + }, + { + "epoch": 0.7912950994027653, + "grad_norm": 0.06091868504881859, + "learning_rate": 7.565496206961897e-06, + "loss": 0.0031, + "step": 48360 + }, + { + "epoch": 0.7914587253538411, + "grad_norm": 0.05930716544389725, + "learning_rate": 7.564270488996966e-06, + "loss": 0.0036, + "step": 48370 + }, + { + "epoch": 0.791622351304917, + "grad_norm": 0.1460629254579544, + "learning_rate": 7.5630445618988566e-06, + "loss": 0.003, + "step": 48380 + }, + { + "epoch": 0.7917859772559928, + "grad_norm": 0.055982090532779694, + "learning_rate": 7.561818425767553e-06, + "loss": 0.0029, + "step": 48390 + }, + { + "epoch": 0.7919496032070686, + "grad_norm": 0.020283877849578857, + "learning_rate": 7.560592080703055e-06, + "loss": 0.0014, + "step": 48400 + }, + { + "epoch": 0.7921132291581445, + "grad_norm": 0.06365669518709183, + "learning_rate": 7.55936552680538e-06, + "loss": 0.002, + "step": 48410 + }, + { + "epoch": 0.7922768551092203, + "grad_norm": 0.17802797257900238, + "learning_rate": 7.558138764174558e-06, + "loss": 0.0018, + "step": 48420 + }, + { + "epoch": 0.7924404810602962, + "grad_norm": 0.1036376878619194, + "learning_rate": 7.556911792910644e-06, + "loss": 0.0026, + "step": 48430 + }, + { + "epoch": 0.792604107011372, + "grad_norm": 0.18839232623577118, + "learning_rate": 7.555684613113703e-06, + "loss": 0.0022, + "step": 48440 + }, + { + "epoch": 0.7927677329624478, + "grad_norm": 0.37500232458114624, + "learning_rate": 7.55445722488382e-06, + "loss": 0.0013, + "step": 48450 + }, + { + "epoch": 0.7929313589135237, + "grad_norm": 0.06019440293312073, + "learning_rate": 7.553229628321097e-06, + "loss": 0.0013, + "step": 48460 + }, + { + "epoch": 0.7930949848645995, + "grad_norm": 0.07467138767242432, + "learning_rate": 7.552001823525652e-06, + "loss": 0.0017, + "step": 48470 + }, + { + "epoch": 0.7932586108156754, + "grad_norm": 0.2778966724872589, + "learning_rate": 7.55077381059762e-06, + "loss": 0.0018, + "step": 48480 + }, + { + "epoch": 0.7934222367667512, + "grad_norm": 0.04924745112657547, + "learning_rate": 7.549545589637156e-06, + "loss": 0.0027, + "step": 48490 + }, + { + "epoch": 0.793585862717827, + "grad_norm": 0.10963691025972366, + "learning_rate": 7.548317160744427e-06, + "loss": 0.0028, + "step": 48500 + }, + { + "epoch": 0.7937494886689029, + "grad_norm": 0.09508652240037918, + "learning_rate": 7.547088524019622e-06, + "loss": 0.0023, + "step": 48510 + }, + { + "epoch": 0.7939131146199787, + "grad_norm": 0.3775133192539215, + "learning_rate": 7.545859679562942e-06, + "loss": 0.0056, + "step": 48520 + }, + { + "epoch": 0.7940767405710546, + "grad_norm": 0.19377544522285461, + "learning_rate": 7.544630627474609e-06, + "loss": 0.0013, + "step": 48530 + }, + { + "epoch": 0.7942403665221304, + "grad_norm": 0.09341082721948624, + "learning_rate": 7.543401367854859e-06, + "loss": 0.0025, + "step": 48540 + }, + { + "epoch": 0.7944039924732063, + "grad_norm": 0.10973850637674332, + "learning_rate": 7.542171900803945e-06, + "loss": 0.0036, + "step": 48550 + }, + { + "epoch": 0.7945676184242821, + "grad_norm": 0.08497956395149231, + "learning_rate": 7.540942226422143e-06, + "loss": 0.0027, + "step": 48560 + }, + { + "epoch": 0.7947312443753579, + "grad_norm": 0.07113735377788544, + "learning_rate": 7.5397123448097354e-06, + "loss": 0.0028, + "step": 48570 + }, + { + "epoch": 0.7948948703264338, + "grad_norm": 0.13081003725528717, + "learning_rate": 7.538482256067032e-06, + "loss": 0.002, + "step": 48580 + }, + { + "epoch": 0.7950584962775096, + "grad_norm": 0.026676848530769348, + "learning_rate": 7.537251960294352e-06, + "loss": 0.0016, + "step": 48590 + }, + { + "epoch": 0.7952221222285855, + "grad_norm": 0.13389082252979279, + "learning_rate": 7.536021457592033e-06, + "loss": 0.0016, + "step": 48600 + }, + { + "epoch": 0.7953857481796613, + "grad_norm": 0.01036846823990345, + "learning_rate": 7.534790748060434e-06, + "loss": 0.0019, + "step": 48610 + }, + { + "epoch": 0.7955493741307371, + "grad_norm": 0.04020111635327339, + "learning_rate": 7.533559831799925e-06, + "loss": 0.0021, + "step": 48620 + }, + { + "epoch": 0.795713000081813, + "grad_norm": 0.18633858859539032, + "learning_rate": 7.532328708910897e-06, + "loss": 0.0021, + "step": 48630 + }, + { + "epoch": 0.7958766260328888, + "grad_norm": 0.18273411691188812, + "learning_rate": 7.531097379493752e-06, + "loss": 0.0019, + "step": 48640 + }, + { + "epoch": 0.7960402519839647, + "grad_norm": 0.164573535323143, + "learning_rate": 7.52986584364892e-06, + "loss": 0.0027, + "step": 48650 + }, + { + "epoch": 0.7962038779350405, + "grad_norm": 0.05422735959291458, + "learning_rate": 7.528634101476835e-06, + "loss": 0.0028, + "step": 48660 + }, + { + "epoch": 0.7963675038861163, + "grad_norm": 0.09189562499523163, + "learning_rate": 7.5274021530779565e-06, + "loss": 0.0022, + "step": 48670 + }, + { + "epoch": 0.7965311298371922, + "grad_norm": 0.01544390432536602, + "learning_rate": 7.526169998552757e-06, + "loss": 0.0031, + "step": 48680 + }, + { + "epoch": 0.796694755788268, + "grad_norm": 0.021614916622638702, + "learning_rate": 7.524937638001728e-06, + "loss": 0.002, + "step": 48690 + }, + { + "epoch": 0.7968583817393439, + "grad_norm": 0.2687177062034607, + "learning_rate": 7.5237050715253755e-06, + "loss": 0.0045, + "step": 48700 + }, + { + "epoch": 0.7970220076904198, + "grad_norm": 0.2881016433238983, + "learning_rate": 7.522472299224224e-06, + "loss": 0.0075, + "step": 48710 + }, + { + "epoch": 0.7971856336414955, + "grad_norm": 0.07347449660301208, + "learning_rate": 7.521239321198813e-06, + "loss": 0.0032, + "step": 48720 + }, + { + "epoch": 0.7973492595925714, + "grad_norm": 0.09159993380308151, + "learning_rate": 7.520006137549702e-06, + "loss": 0.0025, + "step": 48730 + }, + { + "epoch": 0.7975128855436472, + "grad_norm": 0.06818033754825592, + "learning_rate": 7.518772748377463e-06, + "loss": 0.0025, + "step": 48740 + }, + { + "epoch": 0.7976765114947231, + "grad_norm": 0.07374434918165207, + "learning_rate": 7.5175391537826894e-06, + "loss": 0.0021, + "step": 48750 + }, + { + "epoch": 0.797840137445799, + "grad_norm": 0.08558545261621475, + "learning_rate": 7.516305353865988e-06, + "loss": 0.002, + "step": 48760 + }, + { + "epoch": 0.7980037633968747, + "grad_norm": 0.019095228984951973, + "learning_rate": 7.5150713487279826e-06, + "loss": 0.0019, + "step": 48770 + }, + { + "epoch": 0.7981673893479506, + "grad_norm": 0.2626863420009613, + "learning_rate": 7.513837138469315e-06, + "loss": 0.0031, + "step": 48780 + }, + { + "epoch": 0.7983310152990264, + "grad_norm": 0.11407846212387085, + "learning_rate": 7.512602723190643e-06, + "loss": 0.0024, + "step": 48790 + }, + { + "epoch": 0.7984946412501023, + "grad_norm": 0.032676588743925095, + "learning_rate": 7.5113681029926425e-06, + "loss": 0.002, + "step": 48800 + }, + { + "epoch": 0.798658267201178, + "grad_norm": 0.0805685818195343, + "learning_rate": 7.510133277976002e-06, + "loss": 0.0021, + "step": 48810 + }, + { + "epoch": 0.7988218931522539, + "grad_norm": 0.1352633535861969, + "learning_rate": 7.508898248241433e-06, + "loss": 0.0015, + "step": 48820 + }, + { + "epoch": 0.7989855191033298, + "grad_norm": 0.05259109288454056, + "learning_rate": 7.50766301388966e-06, + "loss": 0.0046, + "step": 48830 + }, + { + "epoch": 0.7991491450544056, + "grad_norm": 0.14377066493034363, + "learning_rate": 7.506427575021422e-06, + "loss": 0.0037, + "step": 48840 + }, + { + "epoch": 0.7993127710054815, + "grad_norm": 0.08536365628242493, + "learning_rate": 7.505191931737479e-06, + "loss": 0.002, + "step": 48850 + }, + { + "epoch": 0.7994763969565573, + "grad_norm": 0.12085624039173126, + "learning_rate": 7.503956084138604e-06, + "loss": 0.0027, + "step": 48860 + }, + { + "epoch": 0.7996400229076331, + "grad_norm": 0.025957763195037842, + "learning_rate": 7.502720032325592e-06, + "loss": 0.0021, + "step": 48870 + }, + { + "epoch": 0.799803648858709, + "grad_norm": 0.03195950761437416, + "learning_rate": 7.501483776399248e-06, + "loss": 0.0018, + "step": 48880 + }, + { + "epoch": 0.7999672748097848, + "grad_norm": 0.059326086193323135, + "learning_rate": 7.500247316460399e-06, + "loss": 0.0024, + "step": 48890 + }, + { + "epoch": 0.8001309007608607, + "grad_norm": 0.04250849783420563, + "learning_rate": 7.499010652609884e-06, + "loss": 0.0022, + "step": 48900 + }, + { + "epoch": 0.8002945267119365, + "grad_norm": 0.1613055318593979, + "learning_rate": 7.497773784948562e-06, + "loss": 0.0022, + "step": 48910 + }, + { + "epoch": 0.8004581526630123, + "grad_norm": 0.05244581773877144, + "learning_rate": 7.4965367135773095e-06, + "loss": 0.003, + "step": 48920 + }, + { + "epoch": 0.8006217786140882, + "grad_norm": 0.049232061952352524, + "learning_rate": 7.495299438597017e-06, + "loss": 0.002, + "step": 48930 + }, + { + "epoch": 0.800785404565164, + "grad_norm": 0.18946042656898499, + "learning_rate": 7.494061960108591e-06, + "loss": 0.0026, + "step": 48940 + }, + { + "epoch": 0.8009490305162399, + "grad_norm": 0.033381387591362, + "learning_rate": 7.4928242782129575e-06, + "loss": 0.0009, + "step": 48950 + }, + { + "epoch": 0.8011126564673157, + "grad_norm": 0.11923344433307648, + "learning_rate": 7.491586393011058e-06, + "loss": 0.0025, + "step": 48960 + }, + { + "epoch": 0.8012762824183915, + "grad_norm": 0.30180180072784424, + "learning_rate": 7.490348304603848e-06, + "loss": 0.0043, + "step": 48970 + }, + { + "epoch": 0.8014399083694674, + "grad_norm": 0.01668470911681652, + "learning_rate": 7.489110013092304e-06, + "loss": 0.0012, + "step": 48980 + }, + { + "epoch": 0.8016035343205432, + "grad_norm": 0.014078144915401936, + "learning_rate": 7.487871518577417e-06, + "loss": 0.0018, + "step": 48990 + }, + { + "epoch": 0.8017671602716191, + "grad_norm": 0.08132600039243698, + "learning_rate": 7.486632821160192e-06, + "loss": 0.0019, + "step": 49000 + }, + { + "epoch": 0.8019307862226949, + "grad_norm": 0.02935425005853176, + "learning_rate": 7.485393920941654e-06, + "loss": 0.0036, + "step": 49010 + }, + { + "epoch": 0.8020944121737708, + "grad_norm": 0.029493147507309914, + "learning_rate": 7.484154818022844e-06, + "loss": 0.0021, + "step": 49020 + }, + { + "epoch": 0.8022580381248466, + "grad_norm": 0.03932320699095726, + "learning_rate": 7.48291551250482e-06, + "loss": 0.0043, + "step": 49030 + }, + { + "epoch": 0.8024216640759224, + "grad_norm": 0.12742534279823303, + "learning_rate": 7.481676004488654e-06, + "loss": 0.0031, + "step": 49040 + }, + { + "epoch": 0.8025852900269983, + "grad_norm": 0.06556879729032516, + "learning_rate": 7.480436294075437e-06, + "loss": 0.0028, + "step": 49050 + }, + { + "epoch": 0.8027489159780741, + "grad_norm": 0.09390657395124435, + "learning_rate": 7.479196381366274e-06, + "loss": 0.0025, + "step": 49060 + }, + { + "epoch": 0.80291254192915, + "grad_norm": 0.13028788566589355, + "learning_rate": 7.477956266462289e-06, + "loss": 0.0018, + "step": 49070 + }, + { + "epoch": 0.8030761678802258, + "grad_norm": 0.20747381448745728, + "learning_rate": 7.476715949464621e-06, + "loss": 0.0021, + "step": 49080 + }, + { + "epoch": 0.8032397938313016, + "grad_norm": 0.0537344254553318, + "learning_rate": 7.475475430474428e-06, + "loss": 0.0016, + "step": 49090 + }, + { + "epoch": 0.8034034197823775, + "grad_norm": 0.16066353023052216, + "learning_rate": 7.4742347095928815e-06, + "loss": 0.0031, + "step": 49100 + }, + { + "epoch": 0.8035670457334533, + "grad_norm": 0.22176072001457214, + "learning_rate": 7.47299378692117e-06, + "loss": 0.0013, + "step": 49110 + }, + { + "epoch": 0.8037306716845292, + "grad_norm": 0.06017155572772026, + "learning_rate": 7.4717526625604984e-06, + "loss": 0.0024, + "step": 49120 + }, + { + "epoch": 0.803894297635605, + "grad_norm": 0.03117782063782215, + "learning_rate": 7.470511336612089e-06, + "loss": 0.0033, + "step": 49130 + }, + { + "epoch": 0.8040579235866808, + "grad_norm": 0.1709616333246231, + "learning_rate": 7.4692698091771805e-06, + "loss": 0.0016, + "step": 49140 + }, + { + "epoch": 0.8042215495377567, + "grad_norm": 0.027601182460784912, + "learning_rate": 7.468028080357028e-06, + "loss": 0.0026, + "step": 49150 + }, + { + "epoch": 0.8043851754888325, + "grad_norm": 0.07321617752313614, + "learning_rate": 7.466786150252903e-06, + "loss": 0.0025, + "step": 49160 + }, + { + "epoch": 0.8045488014399084, + "grad_norm": 0.0664752945303917, + "learning_rate": 7.465544018966091e-06, + "loss": 0.0039, + "step": 49170 + }, + { + "epoch": 0.8047124273909843, + "grad_norm": 0.046098992228507996, + "learning_rate": 7.464301686597898e-06, + "loss": 0.0026, + "step": 49180 + }, + { + "epoch": 0.80487605334206, + "grad_norm": 0.04111822694540024, + "learning_rate": 7.463059153249644e-06, + "loss": 0.004, + "step": 49190 + }, + { + "epoch": 0.8050396792931359, + "grad_norm": 0.06876882910728455, + "learning_rate": 7.461816419022664e-06, + "loss": 0.0022, + "step": 49200 + }, + { + "epoch": 0.8052033052442117, + "grad_norm": 0.11711051315069199, + "learning_rate": 7.460573484018314e-06, + "loss": 0.0019, + "step": 49210 + }, + { + "epoch": 0.8053669311952876, + "grad_norm": 0.13310858607292175, + "learning_rate": 7.459330348337963e-06, + "loss": 0.0024, + "step": 49220 + }, + { + "epoch": 0.8055305571463635, + "grad_norm": 0.3619876205921173, + "learning_rate": 7.458087012082995e-06, + "loss": 0.0023, + "step": 49230 + }, + { + "epoch": 0.8056941830974392, + "grad_norm": 0.08216134458780289, + "learning_rate": 7.456843475354813e-06, + "loss": 0.0021, + "step": 49240 + }, + { + "epoch": 0.8058578090485151, + "grad_norm": 0.2117079347372055, + "learning_rate": 7.455599738254837e-06, + "loss": 0.0046, + "step": 49250 + }, + { + "epoch": 0.8060214349995909, + "grad_norm": 0.02750161848962307, + "learning_rate": 7.4543558008845005e-06, + "loss": 0.0026, + "step": 49260 + }, + { + "epoch": 0.8061850609506668, + "grad_norm": 0.053104374557733536, + "learning_rate": 7.453111663345255e-06, + "loss": 0.001, + "step": 49270 + }, + { + "epoch": 0.8063486869017427, + "grad_norm": 0.029090924188494682, + "learning_rate": 7.451867325738568e-06, + "loss": 0.0015, + "step": 49280 + }, + { + "epoch": 0.8065123128528184, + "grad_norm": 0.11263518780469894, + "learning_rate": 7.450622788165926e-06, + "loss": 0.0019, + "step": 49290 + }, + { + "epoch": 0.8066759388038943, + "grad_norm": 0.07891736924648285, + "learning_rate": 7.449378050728826e-06, + "loss": 0.0026, + "step": 49300 + }, + { + "epoch": 0.8068395647549701, + "grad_norm": 0.22481808066368103, + "learning_rate": 7.448133113528785e-06, + "loss": 0.0046, + "step": 49310 + }, + { + "epoch": 0.807003190706046, + "grad_norm": 0.02819298394024372, + "learning_rate": 7.446887976667338e-06, + "loss": 0.0026, + "step": 49320 + }, + { + "epoch": 0.8071668166571219, + "grad_norm": 0.12364399433135986, + "learning_rate": 7.445642640246032e-06, + "loss": 0.0014, + "step": 49330 + }, + { + "epoch": 0.8073304426081976, + "grad_norm": 0.030073098838329315, + "learning_rate": 7.444397104366432e-06, + "loss": 0.0016, + "step": 49340 + }, + { + "epoch": 0.8074940685592735, + "grad_norm": 0.13526415824890137, + "learning_rate": 7.4431513691301215e-06, + "loss": 0.002, + "step": 49350 + }, + { + "epoch": 0.8076576945103493, + "grad_norm": 0.1513245701789856, + "learning_rate": 7.441905434638697e-06, + "loss": 0.0026, + "step": 49360 + }, + { + "epoch": 0.8078213204614252, + "grad_norm": 0.05508732423186302, + "learning_rate": 7.4406593009937746e-06, + "loss": 0.0018, + "step": 49370 + }, + { + "epoch": 0.8079849464125011, + "grad_norm": 0.008226108737289906, + "learning_rate": 7.4394129682969815e-06, + "loss": 0.0013, + "step": 49380 + }, + { + "epoch": 0.8081485723635768, + "grad_norm": 0.01117747463285923, + "learning_rate": 7.438166436649968e-06, + "loss": 0.0019, + "step": 49390 + }, + { + "epoch": 0.8083121983146527, + "grad_norm": 0.03988465294241905, + "learning_rate": 7.4369197061543955e-06, + "loss": 0.0024, + "step": 49400 + }, + { + "epoch": 0.8084758242657285, + "grad_norm": 0.13381509482860565, + "learning_rate": 7.435672776911942e-06, + "loss": 0.0024, + "step": 49410 + }, + { + "epoch": 0.8086394502168044, + "grad_norm": 0.13291525840759277, + "learning_rate": 7.434425649024304e-06, + "loss": 0.0023, + "step": 49420 + }, + { + "epoch": 0.8088030761678803, + "grad_norm": 0.0682954490184784, + "learning_rate": 7.4331783225931934e-06, + "loss": 0.003, + "step": 49430 + }, + { + "epoch": 0.808966702118956, + "grad_norm": 0.06010191887617111, + "learning_rate": 7.431930797720336e-06, + "loss": 0.0008, + "step": 49440 + }, + { + "epoch": 0.8091303280700319, + "grad_norm": 0.07638275623321533, + "learning_rate": 7.430683074507478e-06, + "loss": 0.0022, + "step": 49450 + }, + { + "epoch": 0.8092939540211077, + "grad_norm": 0.10926320403814316, + "learning_rate": 7.429435153056377e-06, + "loss": 0.0022, + "step": 49460 + }, + { + "epoch": 0.8094575799721836, + "grad_norm": 0.08077213913202286, + "learning_rate": 7.428187033468811e-06, + "loss": 0.0015, + "step": 49470 + }, + { + "epoch": 0.8096212059232595, + "grad_norm": 0.05735640600323677, + "learning_rate": 7.426938715846572e-06, + "loss": 0.0014, + "step": 49480 + }, + { + "epoch": 0.8097848318743353, + "grad_norm": 0.045037779957056046, + "learning_rate": 7.425690200291469e-06, + "loss": 0.0016, + "step": 49490 + }, + { + "epoch": 0.8099484578254111, + "grad_norm": 0.06679581105709076, + "learning_rate": 7.424441486905326e-06, + "loss": 0.0029, + "step": 49500 + }, + { + "epoch": 0.8101120837764869, + "grad_norm": 0.07620979100465775, + "learning_rate": 7.423192575789984e-06, + "loss": 0.0023, + "step": 49510 + }, + { + "epoch": 0.8102757097275628, + "grad_norm": 0.05226852372288704, + "learning_rate": 7.4219434670473e-06, + "loss": 0.0018, + "step": 49520 + }, + { + "epoch": 0.8104393356786387, + "grad_norm": 0.02041042409837246, + "learning_rate": 7.4206941607791474e-06, + "loss": 0.003, + "step": 49530 + }, + { + "epoch": 0.8106029616297145, + "grad_norm": 0.12250716239213943, + "learning_rate": 7.419444657087413e-06, + "loss": 0.0035, + "step": 49540 + }, + { + "epoch": 0.8107665875807903, + "grad_norm": 0.13922275602817535, + "learning_rate": 7.418194956074007e-06, + "loss": 0.0019, + "step": 49550 + }, + { + "epoch": 0.8109302135318661, + "grad_norm": 0.051301538944244385, + "learning_rate": 7.416945057840847e-06, + "loss": 0.0022, + "step": 49560 + }, + { + "epoch": 0.811093839482942, + "grad_norm": 0.06858295202255249, + "learning_rate": 7.41569496248987e-06, + "loss": 0.0018, + "step": 49570 + }, + { + "epoch": 0.8112574654340179, + "grad_norm": 0.035342417657375336, + "learning_rate": 7.414444670123031e-06, + "loss": 0.0021, + "step": 49580 + }, + { + "epoch": 0.8114210913850937, + "grad_norm": 0.07623059302568436, + "learning_rate": 7.413194180842299e-06, + "loss": 0.0032, + "step": 49590 + }, + { + "epoch": 0.8115847173361695, + "grad_norm": 0.028298957273364067, + "learning_rate": 7.41194349474966e-06, + "loss": 0.0013, + "step": 49600 + }, + { + "epoch": 0.8117483432872453, + "grad_norm": 0.020288070663809776, + "learning_rate": 7.410692611947117e-06, + "loss": 0.002, + "step": 49610 + }, + { + "epoch": 0.8119119692383212, + "grad_norm": 0.024038251489400864, + "learning_rate": 7.409441532536686e-06, + "loss": 0.0028, + "step": 49620 + }, + { + "epoch": 0.8120755951893971, + "grad_norm": 0.030338119715452194, + "learning_rate": 7.4081902566204015e-06, + "loss": 0.002, + "step": 49630 + }, + { + "epoch": 0.8122392211404729, + "grad_norm": 0.01666412688791752, + "learning_rate": 7.406938784300312e-06, + "loss": 0.0027, + "step": 49640 + }, + { + "epoch": 0.8124028470915488, + "grad_norm": 0.15088599920272827, + "learning_rate": 7.4056871156784844e-06, + "loss": 0.0024, + "step": 49650 + }, + { + "epoch": 0.8125664730426245, + "grad_norm": 0.03537040948867798, + "learning_rate": 7.404435250857002e-06, + "loss": 0.0026, + "step": 49660 + }, + { + "epoch": 0.8127300989937004, + "grad_norm": 0.1229194775223732, + "learning_rate": 7.403183189937959e-06, + "loss": 0.0017, + "step": 49670 + }, + { + "epoch": 0.8128937249447763, + "grad_norm": 0.1257980316877365, + "learning_rate": 7.4019309330234735e-06, + "loss": 0.0021, + "step": 49680 + }, + { + "epoch": 0.8130573508958521, + "grad_norm": 0.07160530984401703, + "learning_rate": 7.4006784802156715e-06, + "loss": 0.0013, + "step": 49690 + }, + { + "epoch": 0.813220976846928, + "grad_norm": 0.08276446163654327, + "learning_rate": 7.399425831616701e-06, + "loss": 0.002, + "step": 49700 + }, + { + "epoch": 0.8133846027980037, + "grad_norm": 0.03314686939120293, + "learning_rate": 7.3981729873287234e-06, + "loss": 0.0022, + "step": 49710 + }, + { + "epoch": 0.8135482287490796, + "grad_norm": 0.21276918053627014, + "learning_rate": 7.396919947453916e-06, + "loss": 0.0021, + "step": 49720 + }, + { + "epoch": 0.8137118547001554, + "grad_norm": 0.14992153644561768, + "learning_rate": 7.3956667120944735e-06, + "loss": 0.0033, + "step": 49730 + }, + { + "epoch": 0.8138754806512313, + "grad_norm": 0.0671754702925682, + "learning_rate": 7.394413281352604e-06, + "loss": 0.0021, + "step": 49740 + }, + { + "epoch": 0.8140391066023072, + "grad_norm": 0.09255821257829666, + "learning_rate": 7.393159655330534e-06, + "loss": 0.0017, + "step": 49750 + }, + { + "epoch": 0.8142027325533829, + "grad_norm": 0.08014623820781708, + "learning_rate": 7.391905834130504e-06, + "loss": 0.0028, + "step": 49760 + }, + { + "epoch": 0.8143663585044588, + "grad_norm": 0.09309299290180206, + "learning_rate": 7.3906518178547725e-06, + "loss": 0.0012, + "step": 49770 + }, + { + "epoch": 0.8145299844555346, + "grad_norm": 0.12776319682598114, + "learning_rate": 7.389397606605612e-06, + "loss": 0.003, + "step": 49780 + }, + { + "epoch": 0.8146936104066105, + "grad_norm": 0.07657567411661148, + "learning_rate": 7.388143200485314e-06, + "loss": 0.0028, + "step": 49790 + }, + { + "epoch": 0.8148572363576864, + "grad_norm": 0.05055200308561325, + "learning_rate": 7.38688859959618e-06, + "loss": 0.0017, + "step": 49800 + }, + { + "epoch": 0.8150208623087621, + "grad_norm": 0.007001059129834175, + "learning_rate": 7.385633804040534e-06, + "loss": 0.0018, + "step": 49810 + }, + { + "epoch": 0.815184488259838, + "grad_norm": 0.03933168202638626, + "learning_rate": 7.3843788139207105e-06, + "loss": 0.0041, + "step": 49820 + }, + { + "epoch": 0.8153481142109138, + "grad_norm": 0.059958551079034805, + "learning_rate": 7.383123629339064e-06, + "loss": 0.0025, + "step": 49830 + }, + { + "epoch": 0.8155117401619897, + "grad_norm": 0.048081789165735245, + "learning_rate": 7.3818682503979626e-06, + "loss": 0.0017, + "step": 49840 + }, + { + "epoch": 0.8156753661130656, + "grad_norm": 0.06314675509929657, + "learning_rate": 7.38061267719979e-06, + "loss": 0.002, + "step": 49850 + }, + { + "epoch": 0.8158389920641413, + "grad_norm": 0.11797534674406052, + "learning_rate": 7.379356909846946e-06, + "loss": 0.005, + "step": 49860 + }, + { + "epoch": 0.8160026180152172, + "grad_norm": 0.04023521766066551, + "learning_rate": 7.37810094844185e-06, + "loss": 0.002, + "step": 49870 + }, + { + "epoch": 0.816166243966293, + "grad_norm": 0.06246769800782204, + "learning_rate": 7.3768447930869306e-06, + "loss": 0.0023, + "step": 49880 + }, + { + "epoch": 0.8163298699173689, + "grad_norm": 0.07203426212072372, + "learning_rate": 7.375588443884636e-06, + "loss": 0.0026, + "step": 49890 + }, + { + "epoch": 0.8164934958684448, + "grad_norm": 0.0768526941537857, + "learning_rate": 7.37433190093743e-06, + "loss": 0.003, + "step": 49900 + }, + { + "epoch": 0.8166571218195205, + "grad_norm": 0.027911782264709473, + "learning_rate": 7.373075164347794e-06, + "loss": 0.0034, + "step": 49910 + }, + { + "epoch": 0.8168207477705964, + "grad_norm": 0.03574071452021599, + "learning_rate": 7.3718182342182196e-06, + "loss": 0.0023, + "step": 49920 + }, + { + "epoch": 0.8169843737216722, + "grad_norm": 0.0208068136125803, + "learning_rate": 7.370561110651221e-06, + "loss": 0.0016, + "step": 49930 + }, + { + "epoch": 0.8171479996727481, + "grad_norm": 0.055411193519830704, + "learning_rate": 7.369303793749323e-06, + "loss": 0.0016, + "step": 49940 + }, + { + "epoch": 0.817311625623824, + "grad_norm": 0.032277438789606094, + "learning_rate": 7.368046283615069e-06, + "loss": 0.0028, + "step": 49950 + }, + { + "epoch": 0.8174752515748998, + "grad_norm": 0.052767179906368256, + "learning_rate": 7.366788580351016e-06, + "loss": 0.0019, + "step": 49960 + }, + { + "epoch": 0.8176388775259756, + "grad_norm": 0.020436806604266167, + "learning_rate": 7.3655306840597395e-06, + "loss": 0.0017, + "step": 49970 + }, + { + "epoch": 0.8178025034770514, + "grad_norm": 0.07657230645418167, + "learning_rate": 7.364272594843829e-06, + "loss": 0.0015, + "step": 49980 + }, + { + "epoch": 0.8179661294281273, + "grad_norm": 0.016491534188389778, + "learning_rate": 7.36301431280589e-06, + "loss": 0.0027, + "step": 49990 + }, + { + "epoch": 0.8181297553792032, + "grad_norm": 0.024747731164097786, + "learning_rate": 7.361755838048542e-06, + "loss": 0.0017, + "step": 50000 + }, + { + "epoch": 0.818293381330279, + "grad_norm": 0.12155485153198242, + "learning_rate": 7.3604971706744235e-06, + "loss": 0.0031, + "step": 50010 + }, + { + "epoch": 0.8184570072813548, + "grad_norm": 0.08477406948804855, + "learning_rate": 7.359238310786187e-06, + "loss": 0.0027, + "step": 50020 + }, + { + "epoch": 0.8186206332324306, + "grad_norm": 0.11422862857580185, + "learning_rate": 7.357979258486501e-06, + "loss": 0.0025, + "step": 50030 + }, + { + "epoch": 0.8187842591835065, + "grad_norm": 0.14531439542770386, + "learning_rate": 7.3567200138780495e-06, + "loss": 0.0032, + "step": 50040 + }, + { + "epoch": 0.8189478851345824, + "grad_norm": 0.10129663348197937, + "learning_rate": 7.3554605770635315e-06, + "loss": 0.0022, + "step": 50050 + }, + { + "epoch": 0.8191115110856582, + "grad_norm": 0.01215177308768034, + "learning_rate": 7.354200948145662e-06, + "loss": 0.0021, + "step": 50060 + }, + { + "epoch": 0.819275137036734, + "grad_norm": 0.03285469114780426, + "learning_rate": 7.352941127227174e-06, + "loss": 0.0019, + "step": 50070 + }, + { + "epoch": 0.8194387629878098, + "grad_norm": 0.04930134490132332, + "learning_rate": 7.351681114410814e-06, + "loss": 0.0016, + "step": 50080 + }, + { + "epoch": 0.8196023889388857, + "grad_norm": 0.08564666658639908, + "learning_rate": 7.35042090979934e-06, + "loss": 0.0027, + "step": 50090 + }, + { + "epoch": 0.8197660148899616, + "grad_norm": 0.10493818670511246, + "learning_rate": 7.349160513495537e-06, + "loss": 0.003, + "step": 50100 + }, + { + "epoch": 0.8199296408410374, + "grad_norm": 0.07953081279993057, + "learning_rate": 7.347899925602193e-06, + "loss": 0.0027, + "step": 50110 + }, + { + "epoch": 0.8200932667921133, + "grad_norm": 0.03999391198158264, + "learning_rate": 7.346639146222121e-06, + "loss": 0.0017, + "step": 50120 + }, + { + "epoch": 0.820256892743189, + "grad_norm": 0.09898420423269272, + "learning_rate": 7.3453781754581425e-06, + "loss": 0.0036, + "step": 50130 + }, + { + "epoch": 0.8204205186942649, + "grad_norm": 0.11206120997667313, + "learning_rate": 7.3441170134131e-06, + "loss": 0.0026, + "step": 50140 + }, + { + "epoch": 0.8205841446453408, + "grad_norm": 0.0876530334353447, + "learning_rate": 7.342855660189849e-06, + "loss": 0.0022, + "step": 50150 + }, + { + "epoch": 0.8207477705964166, + "grad_norm": 0.05326462909579277, + "learning_rate": 7.3415941158912625e-06, + "loss": 0.0019, + "step": 50160 + }, + { + "epoch": 0.8209113965474925, + "grad_norm": 0.06661269813776016, + "learning_rate": 7.340332380620226e-06, + "loss": 0.0022, + "step": 50170 + }, + { + "epoch": 0.8210750224985682, + "grad_norm": 0.06276343762874603, + "learning_rate": 7.339070454479645e-06, + "loss": 0.0028, + "step": 50180 + }, + { + "epoch": 0.8212386484496441, + "grad_norm": 0.09733522683382034, + "learning_rate": 7.337808337572434e-06, + "loss": 0.0018, + "step": 50190 + }, + { + "epoch": 0.82140227440072, + "grad_norm": 0.05775713920593262, + "learning_rate": 7.336546030001529e-06, + "loss": 0.0032, + "step": 50200 + }, + { + "epoch": 0.8215659003517958, + "grad_norm": 0.06083736941218376, + "learning_rate": 7.335283531869879e-06, + "loss": 0.0018, + "step": 50210 + }, + { + "epoch": 0.8217295263028717, + "grad_norm": 0.09311966598033905, + "learning_rate": 7.334020843280451e-06, + "loss": 0.002, + "step": 50220 + }, + { + "epoch": 0.8218931522539474, + "grad_norm": 0.029629850760102272, + "learning_rate": 7.332757964336222e-06, + "loss": 0.0022, + "step": 50230 + }, + { + "epoch": 0.8220567782050233, + "grad_norm": 0.05260154604911804, + "learning_rate": 7.331494895140192e-06, + "loss": 0.0021, + "step": 50240 + }, + { + "epoch": 0.8222204041560992, + "grad_norm": 0.11988123506307602, + "learning_rate": 7.330231635795369e-06, + "loss": 0.0029, + "step": 50250 + }, + { + "epoch": 0.822384030107175, + "grad_norm": 0.07858241349458694, + "learning_rate": 7.3289681864047835e-06, + "loss": 0.0017, + "step": 50260 + }, + { + "epoch": 0.8225476560582509, + "grad_norm": 0.028480354696512222, + "learning_rate": 7.327704547071476e-06, + "loss": 0.0018, + "step": 50270 + }, + { + "epoch": 0.8227112820093266, + "grad_norm": 0.08678663522005081, + "learning_rate": 7.3264407178985055e-06, + "loss": 0.0023, + "step": 50280 + }, + { + "epoch": 0.8228749079604025, + "grad_norm": 0.01689848117530346, + "learning_rate": 7.325176698988945e-06, + "loss": 0.0015, + "step": 50290 + }, + { + "epoch": 0.8230385339114784, + "grad_norm": 0.02860642597079277, + "learning_rate": 7.323912490445884e-06, + "loss": 0.0022, + "step": 50300 + }, + { + "epoch": 0.8232021598625542, + "grad_norm": 0.682255208492279, + "learning_rate": 7.322648092372426e-06, + "loss": 0.0018, + "step": 50310 + }, + { + "epoch": 0.8233657858136301, + "grad_norm": 0.11716106534004211, + "learning_rate": 7.321383504871692e-06, + "loss": 0.0031, + "step": 50320 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.05334879085421562, + "learning_rate": 7.320118728046818e-06, + "loss": 0.0022, + "step": 50330 + }, + { + "epoch": 0.8236930377157817, + "grad_norm": 0.13172274827957153, + "learning_rate": 7.318853762000955e-06, + "loss": 0.0015, + "step": 50340 + }, + { + "epoch": 0.8238566636668576, + "grad_norm": 0.04253697395324707, + "learning_rate": 7.317588606837267e-06, + "loss": 0.0048, + "step": 50350 + }, + { + "epoch": 0.8240202896179334, + "grad_norm": 0.16102904081344604, + "learning_rate": 7.316323262658939e-06, + "loss": 0.0021, + "step": 50360 + }, + { + "epoch": 0.8241839155690093, + "grad_norm": 0.10712534934282303, + "learning_rate": 7.3150577295691646e-06, + "loss": 0.0014, + "step": 50370 + }, + { + "epoch": 0.824347541520085, + "grad_norm": 0.11200733482837677, + "learning_rate": 7.31379200767116e-06, + "loss": 0.0009, + "step": 50380 + }, + { + "epoch": 0.8245111674711609, + "grad_norm": 0.03960917890071869, + "learning_rate": 7.312526097068151e-06, + "loss": 0.0018, + "step": 50390 + }, + { + "epoch": 0.8246747934222368, + "grad_norm": 0.0998501107096672, + "learning_rate": 7.31125999786338e-06, + "loss": 0.002, + "step": 50400 + }, + { + "epoch": 0.8248384193733126, + "grad_norm": 0.17833255231380463, + "learning_rate": 7.309993710160109e-06, + "loss": 0.0024, + "step": 50410 + }, + { + "epoch": 0.8250020453243885, + "grad_norm": 0.05349275469779968, + "learning_rate": 7.30872723406161e-06, + "loss": 0.0024, + "step": 50420 + }, + { + "epoch": 0.8251656712754643, + "grad_norm": 0.04850715398788452, + "learning_rate": 7.307460569671172e-06, + "loss": 0.0022, + "step": 50430 + }, + { + "epoch": 0.8253292972265401, + "grad_norm": 0.10100100934505463, + "learning_rate": 7.306193717092101e-06, + "loss": 0.0022, + "step": 50440 + }, + { + "epoch": 0.825492923177616, + "grad_norm": 0.04609709605574608, + "learning_rate": 7.304926676427717e-06, + "loss": 0.0013, + "step": 50450 + }, + { + "epoch": 0.8256565491286918, + "grad_norm": 0.05325142666697502, + "learning_rate": 7.303659447781355e-06, + "loss": 0.0016, + "step": 50460 + }, + { + "epoch": 0.8258201750797677, + "grad_norm": 0.0937347561120987, + "learning_rate": 7.3023920312563665e-06, + "loss": 0.0037, + "step": 50470 + }, + { + "epoch": 0.8259838010308435, + "grad_norm": 0.09343674033880234, + "learning_rate": 7.301124426956117e-06, + "loss": 0.0021, + "step": 50480 + }, + { + "epoch": 0.8261474269819193, + "grad_norm": 0.21492396295070648, + "learning_rate": 7.299856634983988e-06, + "loss": 0.0024, + "step": 50490 + }, + { + "epoch": 0.8263110529329952, + "grad_norm": 0.01718193292617798, + "learning_rate": 7.298588655443377e-06, + "loss": 0.0032, + "step": 50500 + }, + { + "epoch": 0.826474678884071, + "grad_norm": 0.12893222272396088, + "learning_rate": 7.297320488437697e-06, + "loss": 0.003, + "step": 50510 + }, + { + "epoch": 0.8266383048351469, + "grad_norm": 0.02382524125277996, + "learning_rate": 7.296052134070373e-06, + "loss": 0.0016, + "step": 50520 + }, + { + "epoch": 0.8268019307862227, + "grad_norm": 0.1249120831489563, + "learning_rate": 7.294783592444849e-06, + "loss": 0.0023, + "step": 50530 + }, + { + "epoch": 0.8269655567372985, + "grad_norm": 0.09398402273654938, + "learning_rate": 7.293514863664581e-06, + "loss": 0.0024, + "step": 50540 + }, + { + "epoch": 0.8271291826883744, + "grad_norm": 0.03904254361987114, + "learning_rate": 7.292245947833047e-06, + "loss": 0.0019, + "step": 50550 + }, + { + "epoch": 0.8272928086394502, + "grad_norm": 0.05136486515402794, + "learning_rate": 7.29097684505373e-06, + "loss": 0.0026, + "step": 50560 + }, + { + "epoch": 0.8274564345905261, + "grad_norm": 0.06342675536870956, + "learning_rate": 7.289707555430136e-06, + "loss": 0.0016, + "step": 50570 + }, + { + "epoch": 0.8276200605416019, + "grad_norm": 0.07060058414936066, + "learning_rate": 7.288438079065786e-06, + "loss": 0.0021, + "step": 50580 + }, + { + "epoch": 0.8277836864926778, + "grad_norm": 0.05490008369088173, + "learning_rate": 7.287168416064211e-06, + "loss": 0.0019, + "step": 50590 + }, + { + "epoch": 0.8279473124437535, + "grad_norm": 0.02857094258069992, + "learning_rate": 7.285898566528962e-06, + "loss": 0.0014, + "step": 50600 + }, + { + "epoch": 0.8281109383948294, + "grad_norm": 0.04800453782081604, + "learning_rate": 7.284628530563602e-06, + "loss": 0.0014, + "step": 50610 + }, + { + "epoch": 0.8282745643459053, + "grad_norm": 0.07711932808160782, + "learning_rate": 7.283358308271713e-06, + "loss": 0.0023, + "step": 50620 + }, + { + "epoch": 0.8284381902969811, + "grad_norm": 0.013450018130242825, + "learning_rate": 7.282087899756889e-06, + "loss": 0.0013, + "step": 50630 + }, + { + "epoch": 0.828601816248057, + "grad_norm": 0.08079660683870316, + "learning_rate": 7.28081730512274e-06, + "loss": 0.0022, + "step": 50640 + }, + { + "epoch": 0.8287654421991327, + "grad_norm": 0.29463911056518555, + "learning_rate": 7.279546524472893e-06, + "loss": 0.0021, + "step": 50650 + }, + { + "epoch": 0.8289290681502086, + "grad_norm": 0.11921440809965134, + "learning_rate": 7.278275557910985e-06, + "loss": 0.0031, + "step": 50660 + }, + { + "epoch": 0.8290926941012845, + "grad_norm": 0.045350778847932816, + "learning_rate": 7.277004405540675e-06, + "loss": 0.0018, + "step": 50670 + }, + { + "epoch": 0.8292563200523603, + "grad_norm": 0.05217655003070831, + "learning_rate": 7.275733067465634e-06, + "loss": 0.0021, + "step": 50680 + }, + { + "epoch": 0.8294199460034362, + "grad_norm": 0.1516101360321045, + "learning_rate": 7.2744615437895435e-06, + "loss": 0.0017, + "step": 50690 + }, + { + "epoch": 0.8295835719545119, + "grad_norm": 0.16141226887702942, + "learning_rate": 7.27318983461611e-06, + "loss": 0.0029, + "step": 50700 + }, + { + "epoch": 0.8297471979055878, + "grad_norm": 0.042797550559043884, + "learning_rate": 7.271917940049048e-06, + "loss": 0.0017, + "step": 50710 + }, + { + "epoch": 0.8299108238566637, + "grad_norm": 0.06384925544261932, + "learning_rate": 7.270645860192087e-06, + "loss": 0.0026, + "step": 50720 + }, + { + "epoch": 0.8300744498077395, + "grad_norm": 0.018085643649101257, + "learning_rate": 7.269373595148976e-06, + "loss": 0.0016, + "step": 50730 + }, + { + "epoch": 0.8302380757588154, + "grad_norm": 0.05272732675075531, + "learning_rate": 7.2681011450234765e-06, + "loss": 0.0018, + "step": 50740 + }, + { + "epoch": 0.8304017017098911, + "grad_norm": 0.18431979417800903, + "learning_rate": 7.266828509919362e-06, + "loss": 0.0024, + "step": 50750 + }, + { + "epoch": 0.830565327660967, + "grad_norm": 0.0356517918407917, + "learning_rate": 7.2655556899404285e-06, + "loss": 0.0023, + "step": 50760 + }, + { + "epoch": 0.8307289536120429, + "grad_norm": 0.3630867600440979, + "learning_rate": 7.2642826851904814e-06, + "loss": 0.0024, + "step": 50770 + }, + { + "epoch": 0.8308925795631187, + "grad_norm": 0.2252519577741623, + "learning_rate": 7.263009495773341e-06, + "loss": 0.0021, + "step": 50780 + }, + { + "epoch": 0.8310562055141946, + "grad_norm": 0.08125129342079163, + "learning_rate": 7.261736121792846e-06, + "loss": 0.0025, + "step": 50790 + }, + { + "epoch": 0.8312198314652703, + "grad_norm": 0.21658673882484436, + "learning_rate": 7.260462563352848e-06, + "loss": 0.003, + "step": 50800 + }, + { + "epoch": 0.8313834574163462, + "grad_norm": 0.007952956482768059, + "learning_rate": 7.259188820557213e-06, + "loss": 0.0051, + "step": 50810 + }, + { + "epoch": 0.8315470833674221, + "grad_norm": 0.03570942580699921, + "learning_rate": 7.257914893509827e-06, + "loss": 0.0029, + "step": 50820 + }, + { + "epoch": 0.8317107093184979, + "grad_norm": 0.08199523389339447, + "learning_rate": 7.256640782314581e-06, + "loss": 0.0012, + "step": 50830 + }, + { + "epoch": 0.8318743352695738, + "grad_norm": 0.08071230351924896, + "learning_rate": 7.255366487075394e-06, + "loss": 0.0032, + "step": 50840 + }, + { + "epoch": 0.8320379612206495, + "grad_norm": 0.47581747174263, + "learning_rate": 7.254092007896187e-06, + "loss": 0.0027, + "step": 50850 + }, + { + "epoch": 0.8322015871717254, + "grad_norm": 0.09428759664297104, + "learning_rate": 7.252817344880907e-06, + "loss": 0.0019, + "step": 50860 + }, + { + "epoch": 0.8323652131228013, + "grad_norm": 0.08285456150770187, + "learning_rate": 7.251542498133508e-06, + "loss": 0.0032, + "step": 50870 + }, + { + "epoch": 0.8325288390738771, + "grad_norm": 0.0900750681757927, + "learning_rate": 7.250267467757963e-06, + "loss": 0.0023, + "step": 50880 + }, + { + "epoch": 0.832692465024953, + "grad_norm": 0.05965256318449974, + "learning_rate": 7.2489922538582605e-06, + "loss": 0.0021, + "step": 50890 + }, + { + "epoch": 0.8328560909760288, + "grad_norm": 0.05914861708879471, + "learning_rate": 7.247716856538399e-06, + "loss": 0.0021, + "step": 50900 + }, + { + "epoch": 0.8330197169271046, + "grad_norm": 0.06369437277317047, + "learning_rate": 7.246441275902401e-06, + "loss": 0.0031, + "step": 50910 + }, + { + "epoch": 0.8331833428781805, + "grad_norm": 0.06461750715970993, + "learning_rate": 7.245165512054294e-06, + "loss": 0.0017, + "step": 50920 + }, + { + "epoch": 0.8333469688292563, + "grad_norm": 0.15263749659061432, + "learning_rate": 7.243889565098127e-06, + "loss": 0.0022, + "step": 50930 + }, + { + "epoch": 0.8335105947803322, + "grad_norm": 0.036695919930934906, + "learning_rate": 7.242613435137961e-06, + "loss": 0.0028, + "step": 50940 + }, + { + "epoch": 0.833674220731408, + "grad_norm": 0.18260805308818817, + "learning_rate": 7.241337122277874e-06, + "loss": 0.0026, + "step": 50950 + }, + { + "epoch": 0.8338378466824838, + "grad_norm": 0.08658963441848755, + "learning_rate": 7.2400606266219564e-06, + "loss": 0.0022, + "step": 50960 + }, + { + "epoch": 0.8340014726335597, + "grad_norm": 0.012939597479999065, + "learning_rate": 7.238783948274315e-06, + "loss": 0.0026, + "step": 50970 + }, + { + "epoch": 0.8341650985846355, + "grad_norm": 0.11531227827072144, + "learning_rate": 7.237507087339073e-06, + "loss": 0.0018, + "step": 50980 + }, + { + "epoch": 0.8343287245357114, + "grad_norm": 0.15803728997707367, + "learning_rate": 7.236230043920365e-06, + "loss": 0.0039, + "step": 50990 + }, + { + "epoch": 0.8344923504867872, + "grad_norm": 0.025625668466091156, + "learning_rate": 7.234952818122343e-06, + "loss": 0.0021, + "step": 51000 + }, + { + "epoch": 0.834655976437863, + "grad_norm": 0.13776859641075134, + "learning_rate": 7.233675410049172e-06, + "loss": 0.0028, + "step": 51010 + }, + { + "epoch": 0.8348196023889389, + "grad_norm": 0.19835378229618073, + "learning_rate": 7.232397819805035e-06, + "loss": 0.0042, + "step": 51020 + }, + { + "epoch": 0.8349832283400147, + "grad_norm": 0.06411415338516235, + "learning_rate": 7.231120047494128e-06, + "loss": 0.003, + "step": 51030 + }, + { + "epoch": 0.8351468542910906, + "grad_norm": 0.08993500471115112, + "learning_rate": 7.2298420932206605e-06, + "loss": 0.0015, + "step": 51040 + }, + { + "epoch": 0.8353104802421664, + "grad_norm": 0.011704706586897373, + "learning_rate": 7.228563957088858e-06, + "loss": 0.0014, + "step": 51050 + }, + { + "epoch": 0.8354741061932422, + "grad_norm": 0.0824776142835617, + "learning_rate": 7.227285639202961e-06, + "loss": 0.0024, + "step": 51060 + }, + { + "epoch": 0.8356377321443181, + "grad_norm": 0.05209881812334061, + "learning_rate": 7.226007139667226e-06, + "loss": 0.0028, + "step": 51070 + }, + { + "epoch": 0.8358013580953939, + "grad_norm": 0.08226223289966583, + "learning_rate": 7.224728458585921e-06, + "loss": 0.0017, + "step": 51080 + }, + { + "epoch": 0.8359649840464698, + "grad_norm": 0.013612093403935432, + "learning_rate": 7.223449596063334e-06, + "loss": 0.0022, + "step": 51090 + }, + { + "epoch": 0.8361286099975456, + "grad_norm": 0.027713894844055176, + "learning_rate": 7.222170552203761e-06, + "loss": 0.0014, + "step": 51100 + }, + { + "epoch": 0.8362922359486215, + "grad_norm": 0.19508448243141174, + "learning_rate": 7.220891327111519e-06, + "loss": 0.0032, + "step": 51110 + }, + { + "epoch": 0.8364558618996973, + "grad_norm": 0.0737922340631485, + "learning_rate": 7.219611920890937e-06, + "loss": 0.0017, + "step": 51120 + }, + { + "epoch": 0.8366194878507731, + "grad_norm": 0.12655296921730042, + "learning_rate": 7.2183323336463576e-06, + "loss": 0.0027, + "step": 51130 + }, + { + "epoch": 0.836783113801849, + "grad_norm": 0.10299160331487656, + "learning_rate": 7.217052565482141e-06, + "loss": 0.0026, + "step": 51140 + }, + { + "epoch": 0.8369467397529248, + "grad_norm": 0.04209122806787491, + "learning_rate": 7.21577261650266e-06, + "loss": 0.0024, + "step": 51150 + }, + { + "epoch": 0.8371103657040007, + "grad_norm": 0.0948806181550026, + "learning_rate": 7.214492486812303e-06, + "loss": 0.0012, + "step": 51160 + }, + { + "epoch": 0.8372739916550765, + "grad_norm": 0.07744203507900238, + "learning_rate": 7.213212176515474e-06, + "loss": 0.0024, + "step": 51170 + }, + { + "epoch": 0.8374376176061523, + "grad_norm": 0.15388254821300507, + "learning_rate": 7.211931685716589e-06, + "loss": 0.0033, + "step": 51180 + }, + { + "epoch": 0.8376012435572282, + "grad_norm": 0.04585542902350426, + "learning_rate": 7.2106510145200814e-06, + "loss": 0.0031, + "step": 51190 + }, + { + "epoch": 0.837764869508304, + "grad_norm": 0.04853685572743416, + "learning_rate": 7.209370163030398e-06, + "loss": 0.0022, + "step": 51200 + }, + { + "epoch": 0.8379284954593799, + "grad_norm": 0.11210136860609055, + "learning_rate": 7.208089131352002e-06, + "loss": 0.0024, + "step": 51210 + }, + { + "epoch": 0.8380921214104557, + "grad_norm": 0.111039899289608, + "learning_rate": 7.2068079195893696e-06, + "loss": 0.002, + "step": 51220 + }, + { + "epoch": 0.8382557473615315, + "grad_norm": 0.034838687628507614, + "learning_rate": 7.20552652784699e-06, + "loss": 0.0021, + "step": 51230 + }, + { + "epoch": 0.8384193733126074, + "grad_norm": 2.0237979888916016, + "learning_rate": 7.20424495622937e-06, + "loss": 0.0032, + "step": 51240 + }, + { + "epoch": 0.8385829992636832, + "grad_norm": 0.04892745986580849, + "learning_rate": 7.202963204841033e-06, + "loss": 0.0021, + "step": 51250 + }, + { + "epoch": 0.8387466252147591, + "grad_norm": 0.12003655731678009, + "learning_rate": 7.201681273786511e-06, + "loss": 0.0028, + "step": 51260 + }, + { + "epoch": 0.838910251165835, + "grad_norm": 0.13234943151474, + "learning_rate": 7.2003991631703554e-06, + "loss": 0.0029, + "step": 51270 + }, + { + "epoch": 0.8390738771169107, + "grad_norm": 0.12959690392017365, + "learning_rate": 7.199116873097129e-06, + "loss": 0.0016, + "step": 51280 + }, + { + "epoch": 0.8392375030679866, + "grad_norm": 0.03561769425868988, + "learning_rate": 7.1978344036714145e-06, + "loss": 0.0024, + "step": 51290 + }, + { + "epoch": 0.8394011290190624, + "grad_norm": 0.0820692628622055, + "learning_rate": 7.196551754997802e-06, + "loss": 0.0019, + "step": 51300 + }, + { + "epoch": 0.8395647549701383, + "grad_norm": 0.22885608673095703, + "learning_rate": 7.1952689271809025e-06, + "loss": 0.0027, + "step": 51310 + }, + { + "epoch": 0.8397283809212142, + "grad_norm": 0.0791151151061058, + "learning_rate": 7.193985920325339e-06, + "loss": 0.0018, + "step": 51320 + }, + { + "epoch": 0.8398920068722899, + "grad_norm": 0.12285812199115753, + "learning_rate": 7.192702734535747e-06, + "loss": 0.0015, + "step": 51330 + }, + { + "epoch": 0.8400556328233658, + "grad_norm": 0.09805543720722198, + "learning_rate": 7.191419369916781e-06, + "loss": 0.0022, + "step": 51340 + }, + { + "epoch": 0.8402192587744416, + "grad_norm": 0.09020490199327469, + "learning_rate": 7.190135826573107e-06, + "loss": 0.0029, + "step": 51350 + }, + { + "epoch": 0.8403828847255175, + "grad_norm": 0.06600624322891235, + "learning_rate": 7.188852104609406e-06, + "loss": 0.0019, + "step": 51360 + }, + { + "epoch": 0.8405465106765934, + "grad_norm": 0.11946147680282593, + "learning_rate": 7.187568204130375e-06, + "loss": 0.0015, + "step": 51370 + }, + { + "epoch": 0.8407101366276691, + "grad_norm": 0.07183331251144409, + "learning_rate": 7.186284125240724e-06, + "loss": 0.0025, + "step": 51380 + }, + { + "epoch": 0.840873762578745, + "grad_norm": 0.13309527933597565, + "learning_rate": 7.184999868045178e-06, + "loss": 0.0015, + "step": 51390 + }, + { + "epoch": 0.8410373885298208, + "grad_norm": 0.004618125036358833, + "learning_rate": 7.183715432648477e-06, + "loss": 0.002, + "step": 51400 + }, + { + "epoch": 0.8412010144808967, + "grad_norm": 0.03110748715698719, + "learning_rate": 7.182430819155375e-06, + "loss": 0.0015, + "step": 51410 + }, + { + "epoch": 0.8413646404319726, + "grad_norm": 0.048091642558574677, + "learning_rate": 7.18114602767064e-06, + "loss": 0.0018, + "step": 51420 + }, + { + "epoch": 0.8415282663830483, + "grad_norm": 0.1743410974740982, + "learning_rate": 7.1798610582990565e-06, + "loss": 0.003, + "step": 51430 + }, + { + "epoch": 0.8416918923341242, + "grad_norm": 0.04619912430644035, + "learning_rate": 7.178575911145421e-06, + "loss": 0.0017, + "step": 51440 + }, + { + "epoch": 0.8418555182852, + "grad_norm": 0.03747618570923805, + "learning_rate": 7.177290586314547e-06, + "loss": 0.0015, + "step": 51450 + }, + { + "epoch": 0.8420191442362759, + "grad_norm": 0.07370032370090485, + "learning_rate": 7.17600508391126e-06, + "loss": 0.0019, + "step": 51460 + }, + { + "epoch": 0.8421827701873517, + "grad_norm": 0.037222664803266525, + "learning_rate": 7.174719404040402e-06, + "loss": 0.0027, + "step": 51470 + }, + { + "epoch": 0.8423463961384275, + "grad_norm": 0.03174416348338127, + "learning_rate": 7.1734335468068295e-06, + "loss": 0.0015, + "step": 51480 + }, + { + "epoch": 0.8425100220895034, + "grad_norm": 0.12124619632959366, + "learning_rate": 7.172147512315411e-06, + "loss": 0.0013, + "step": 51490 + }, + { + "epoch": 0.8426736480405792, + "grad_norm": 0.08187685906887054, + "learning_rate": 7.170861300671031e-06, + "loss": 0.0016, + "step": 51500 + }, + { + "epoch": 0.8428372739916551, + "grad_norm": 0.032854389399290085, + "learning_rate": 7.16957491197859e-06, + "loss": 0.0025, + "step": 51510 + }, + { + "epoch": 0.8430008999427309, + "grad_norm": 0.12143535166978836, + "learning_rate": 7.1682883463429995e-06, + "loss": 0.0016, + "step": 51520 + }, + { + "epoch": 0.8431645258938067, + "grad_norm": 0.03570203855633736, + "learning_rate": 7.167001603869191e-06, + "loss": 0.0014, + "step": 51530 + }, + { + "epoch": 0.8433281518448826, + "grad_norm": 0.10940305143594742, + "learning_rate": 7.165714684662102e-06, + "loss": 0.0034, + "step": 51540 + }, + { + "epoch": 0.8434917777959584, + "grad_norm": 0.04765204340219498, + "learning_rate": 7.164427588826692e-06, + "loss": 0.0015, + "step": 51550 + }, + { + "epoch": 0.8436554037470343, + "grad_norm": 0.13376373052597046, + "learning_rate": 7.1631403164679326e-06, + "loss": 0.002, + "step": 51560 + }, + { + "epoch": 0.8438190296981101, + "grad_norm": 0.017260512337088585, + "learning_rate": 7.161852867690808e-06, + "loss": 0.0018, + "step": 51570 + }, + { + "epoch": 0.843982655649186, + "grad_norm": 0.15488842129707336, + "learning_rate": 7.160565242600319e-06, + "loss": 0.0022, + "step": 51580 + }, + { + "epoch": 0.8441462816002618, + "grad_norm": 0.11717179417610168, + "learning_rate": 7.159277441301481e-06, + "loss": 0.0022, + "step": 51590 + }, + { + "epoch": 0.8443099075513376, + "grad_norm": 0.027377715334296227, + "learning_rate": 7.15798946389932e-06, + "loss": 0.0028, + "step": 51600 + }, + { + "epoch": 0.8444735335024135, + "grad_norm": 0.02526942454278469, + "learning_rate": 7.15670131049888e-06, + "loss": 0.0012, + "step": 51610 + }, + { + "epoch": 0.8446371594534893, + "grad_norm": 0.13770677149295807, + "learning_rate": 7.15541298120522e-06, + "loss": 0.0032, + "step": 51620 + }, + { + "epoch": 0.8448007854045652, + "grad_norm": 0.11904923617839813, + "learning_rate": 7.154124476123409e-06, + "loss": 0.0016, + "step": 51630 + }, + { + "epoch": 0.844964411355641, + "grad_norm": 0.022507907822728157, + "learning_rate": 7.152835795358537e-06, + "loss": 0.0023, + "step": 51640 + }, + { + "epoch": 0.8451280373067168, + "grad_norm": 0.02419656701385975, + "learning_rate": 7.151546939015701e-06, + "loss": 0.003, + "step": 51650 + }, + { + "epoch": 0.8452916632577927, + "grad_norm": 0.04865049198269844, + "learning_rate": 7.1502579072000165e-06, + "loss": 0.0023, + "step": 51660 + }, + { + "epoch": 0.8454552892088685, + "grad_norm": 0.16077959537506104, + "learning_rate": 7.148968700016614e-06, + "loss": 0.0054, + "step": 51670 + }, + { + "epoch": 0.8456189151599444, + "grad_norm": 0.06158830597996712, + "learning_rate": 7.147679317570635e-06, + "loss": 0.002, + "step": 51680 + }, + { + "epoch": 0.8457825411110202, + "grad_norm": 0.04843762516975403, + "learning_rate": 7.146389759967237e-06, + "loss": 0.0028, + "step": 51690 + }, + { + "epoch": 0.845946167062096, + "grad_norm": 0.046450987458229065, + "learning_rate": 7.145100027311594e-06, + "loss": 0.0022, + "step": 51700 + }, + { + "epoch": 0.8461097930131719, + "grad_norm": 0.031733885407447815, + "learning_rate": 7.14381011970889e-06, + "loss": 0.0022, + "step": 51710 + }, + { + "epoch": 0.8462734189642477, + "grad_norm": 0.05632362514734268, + "learning_rate": 7.142520037264328e-06, + "loss": 0.0015, + "step": 51720 + }, + { + "epoch": 0.8464370449153236, + "grad_norm": 0.0710706114768982, + "learning_rate": 7.1412297800831206e-06, + "loss": 0.0016, + "step": 51730 + }, + { + "epoch": 0.8466006708663995, + "grad_norm": 0.14027522504329681, + "learning_rate": 7.1399393482704975e-06, + "loss": 0.003, + "step": 51740 + }, + { + "epoch": 0.8467642968174752, + "grad_norm": 0.06512275338172913, + "learning_rate": 7.138648741931702e-06, + "loss": 0.0018, + "step": 51750 + }, + { + "epoch": 0.8469279227685511, + "grad_norm": 0.09841345995664597, + "learning_rate": 7.13735796117199e-06, + "loss": 0.0031, + "step": 51760 + }, + { + "epoch": 0.8470915487196269, + "grad_norm": 0.13326981663703918, + "learning_rate": 7.136067006096636e-06, + "loss": 0.0019, + "step": 51770 + }, + { + "epoch": 0.8472551746707028, + "grad_norm": 0.027064789086580276, + "learning_rate": 7.134775876810924e-06, + "loss": 0.0033, + "step": 51780 + }, + { + "epoch": 0.8474188006217787, + "grad_norm": 0.03956165164709091, + "learning_rate": 7.133484573420155e-06, + "loss": 0.0015, + "step": 51790 + }, + { + "epoch": 0.8475824265728544, + "grad_norm": 0.1570480316877365, + "learning_rate": 7.132193096029644e-06, + "loss": 0.0027, + "step": 51800 + }, + { + "epoch": 0.8477460525239303, + "grad_norm": 0.07073867321014404, + "learning_rate": 7.130901444744717e-06, + "loss": 0.0029, + "step": 51810 + }, + { + "epoch": 0.8479096784750061, + "grad_norm": 0.09911315143108368, + "learning_rate": 7.1296096196707185e-06, + "loss": 0.0026, + "step": 51820 + }, + { + "epoch": 0.848073304426082, + "grad_norm": 0.0867316722869873, + "learning_rate": 7.128317620913004e-06, + "loss": 0.0042, + "step": 51830 + }, + { + "epoch": 0.8482369303771579, + "grad_norm": 0.05428260192275047, + "learning_rate": 7.127025448576948e-06, + "loss": 0.0016, + "step": 51840 + }, + { + "epoch": 0.8484005563282336, + "grad_norm": 0.18079212307929993, + "learning_rate": 7.12573310276793e-06, + "loss": 0.0022, + "step": 51850 + }, + { + "epoch": 0.8485641822793095, + "grad_norm": 0.06461817771196365, + "learning_rate": 7.124440583591355e-06, + "loss": 0.0022, + "step": 51860 + }, + { + "epoch": 0.8487278082303853, + "grad_norm": 0.204007089138031, + "learning_rate": 7.123147891152632e-06, + "loss": 0.0022, + "step": 51870 + }, + { + "epoch": 0.8488914341814612, + "grad_norm": 0.2365105152130127, + "learning_rate": 7.121855025557192e-06, + "loss": 0.0023, + "step": 51880 + }, + { + "epoch": 0.8490550601325371, + "grad_norm": 0.038268886506557465, + "learning_rate": 7.120561986910475e-06, + "loss": 0.0016, + "step": 51890 + }, + { + "epoch": 0.8492186860836128, + "grad_norm": 0.07389094680547714, + "learning_rate": 7.119268775317936e-06, + "loss": 0.003, + "step": 51900 + }, + { + "epoch": 0.8493823120346887, + "grad_norm": 0.008556121960282326, + "learning_rate": 7.117975390885048e-06, + "loss": 0.0012, + "step": 51910 + }, + { + "epoch": 0.8495459379857645, + "grad_norm": 0.1838950216770172, + "learning_rate": 7.116681833717292e-06, + "loss": 0.0035, + "step": 51920 + }, + { + "epoch": 0.8497095639368404, + "grad_norm": 0.04003998264670372, + "learning_rate": 7.115388103920167e-06, + "loss": 0.0022, + "step": 51930 + }, + { + "epoch": 0.8498731898879163, + "grad_norm": 0.039213523268699646, + "learning_rate": 7.114094201599185e-06, + "loss": 0.002, + "step": 51940 + }, + { + "epoch": 0.850036815838992, + "grad_norm": 0.11654011160135269, + "learning_rate": 7.112800126859874e-06, + "loss": 0.0023, + "step": 51950 + }, + { + "epoch": 0.8502004417900679, + "grad_norm": 0.1593422293663025, + "learning_rate": 7.1115058798077715e-06, + "loss": 0.0074, + "step": 51960 + }, + { + "epoch": 0.8503640677411437, + "grad_norm": 0.05370429903268814, + "learning_rate": 7.110211460548435e-06, + "loss": 0.0016, + "step": 51970 + }, + { + "epoch": 0.8505276936922196, + "grad_norm": 0.09903787076473236, + "learning_rate": 7.10891686918743e-06, + "loss": 0.0023, + "step": 51980 + }, + { + "epoch": 0.8506913196432955, + "grad_norm": 0.1487654596567154, + "learning_rate": 7.107622105830342e-06, + "loss": 0.0025, + "step": 51990 + }, + { + "epoch": 0.8508549455943712, + "grad_norm": 0.029519138857722282, + "learning_rate": 7.106327170582764e-06, + "loss": 0.0017, + "step": 52000 + }, + { + "epoch": 0.8510185715454471, + "grad_norm": 0.016456831246614456, + "learning_rate": 7.105032063550309e-06, + "loss": 0.0016, + "step": 52010 + }, + { + "epoch": 0.8511821974965229, + "grad_norm": 0.02960352972149849, + "learning_rate": 7.1037367848386015e-06, + "loss": 0.0032, + "step": 52020 + }, + { + "epoch": 0.8513458234475988, + "grad_norm": 0.059133417904376984, + "learning_rate": 7.102441334553279e-06, + "loss": 0.0018, + "step": 52030 + }, + { + "epoch": 0.8515094493986747, + "grad_norm": 0.091533362865448, + "learning_rate": 7.101145712799994e-06, + "loss": 0.0025, + "step": 52040 + }, + { + "epoch": 0.8516730753497505, + "grad_norm": 0.04424267262220383, + "learning_rate": 7.0998499196844125e-06, + "loss": 0.0034, + "step": 52050 + }, + { + "epoch": 0.8518367013008263, + "grad_norm": 0.023693222552537918, + "learning_rate": 7.098553955312217e-06, + "loss": 0.0014, + "step": 52060 + }, + { + "epoch": 0.8520003272519021, + "grad_norm": 0.06740313768386841, + "learning_rate": 7.0972578197891e-06, + "loss": 0.0015, + "step": 52070 + }, + { + "epoch": 0.852163953202978, + "grad_norm": 0.037906866520643234, + "learning_rate": 7.095961513220772e-06, + "loss": 0.0013, + "step": 52080 + }, + { + "epoch": 0.8523275791540539, + "grad_norm": 0.03597046807408333, + "learning_rate": 7.094665035712954e-06, + "loss": 0.0015, + "step": 52090 + }, + { + "epoch": 0.8524912051051297, + "grad_norm": 0.12819840013980865, + "learning_rate": 7.093368387371382e-06, + "loss": 0.0014, + "step": 52100 + }, + { + "epoch": 0.8526548310562055, + "grad_norm": 0.23417770862579346, + "learning_rate": 7.092071568301806e-06, + "loss": 0.0016, + "step": 52110 + }, + { + "epoch": 0.8528184570072813, + "grad_norm": 0.01937873661518097, + "learning_rate": 7.0907745786099915e-06, + "loss": 0.0022, + "step": 52120 + }, + { + "epoch": 0.8529820829583572, + "grad_norm": 0.04607968032360077, + "learning_rate": 7.089477418401716e-06, + "loss": 0.0024, + "step": 52130 + }, + { + "epoch": 0.8531457089094331, + "grad_norm": 0.008417102508246899, + "learning_rate": 7.088180087782771e-06, + "loss": 0.0018, + "step": 52140 + }, + { + "epoch": 0.8533093348605089, + "grad_norm": 0.057662345468997955, + "learning_rate": 7.086882586858962e-06, + "loss": 0.0022, + "step": 52150 + }, + { + "epoch": 0.8534729608115847, + "grad_norm": 0.05995975434780121, + "learning_rate": 7.08558491573611e-06, + "loss": 0.0019, + "step": 52160 + }, + { + "epoch": 0.8536365867626605, + "grad_norm": 0.02329769730567932, + "learning_rate": 7.084287074520047e-06, + "loss": 0.0014, + "step": 52170 + }, + { + "epoch": 0.8538002127137364, + "grad_norm": 0.09681403636932373, + "learning_rate": 7.082989063316623e-06, + "loss": 0.0014, + "step": 52180 + }, + { + "epoch": 0.8539638386648123, + "grad_norm": 0.07927804440259933, + "learning_rate": 7.0816908822316975e-06, + "loss": 0.0011, + "step": 52190 + }, + { + "epoch": 0.8541274646158881, + "grad_norm": 0.027724415063858032, + "learning_rate": 7.080392531371145e-06, + "loss": 0.0037, + "step": 52200 + }, + { + "epoch": 0.854291090566964, + "grad_norm": 0.04706178605556488, + "learning_rate": 7.079094010840857e-06, + "loss": 0.0014, + "step": 52210 + }, + { + "epoch": 0.8544547165180397, + "grad_norm": 0.11963524669408798, + "learning_rate": 7.077795320746735e-06, + "loss": 0.003, + "step": 52220 + }, + { + "epoch": 0.8546183424691156, + "grad_norm": 0.054164644330739975, + "learning_rate": 7.076496461194696e-06, + "loss": 0.0029, + "step": 52230 + }, + { + "epoch": 0.8547819684201915, + "grad_norm": 0.011991985142230988, + "learning_rate": 7.07519743229067e-06, + "loss": 0.0013, + "step": 52240 + }, + { + "epoch": 0.8549455943712673, + "grad_norm": 0.08985766023397446, + "learning_rate": 7.073898234140602e-06, + "loss": 0.0031, + "step": 52250 + }, + { + "epoch": 0.8551092203223432, + "grad_norm": 0.18537844717502594, + "learning_rate": 7.0725988668504505e-06, + "loss": 0.0022, + "step": 52260 + }, + { + "epoch": 0.8552728462734189, + "grad_norm": 0.10014890134334564, + "learning_rate": 7.0712993305261865e-06, + "loss": 0.0016, + "step": 52270 + }, + { + "epoch": 0.8554364722244948, + "grad_norm": 0.08260821551084518, + "learning_rate": 7.069999625273796e-06, + "loss": 0.0017, + "step": 52280 + }, + { + "epoch": 0.8556000981755707, + "grad_norm": 0.04796145111322403, + "learning_rate": 7.068699751199279e-06, + "loss": 0.0018, + "step": 52290 + }, + { + "epoch": 0.8557637241266465, + "grad_norm": 0.007061548065394163, + "learning_rate": 7.067399708408649e-06, + "loss": 0.0034, + "step": 52300 + }, + { + "epoch": 0.8559273500777224, + "grad_norm": 0.07918252050876617, + "learning_rate": 7.066099497007932e-06, + "loss": 0.0013, + "step": 52310 + }, + { + "epoch": 0.8560909760287981, + "grad_norm": 0.061982057988643646, + "learning_rate": 7.06479911710317e-06, + "loss": 0.0015, + "step": 52320 + }, + { + "epoch": 0.856254601979874, + "grad_norm": 0.09538498520851135, + "learning_rate": 7.0634985688004174e-06, + "loss": 0.0016, + "step": 52330 + }, + { + "epoch": 0.8564182279309498, + "grad_norm": 0.1483614444732666, + "learning_rate": 7.062197852205742e-06, + "loss": 0.0018, + "step": 52340 + }, + { + "epoch": 0.8565818538820257, + "grad_norm": 0.08636424690485, + "learning_rate": 7.060896967425226e-06, + "loss": 0.0017, + "step": 52350 + }, + { + "epoch": 0.8567454798331016, + "grad_norm": 0.15219436585903168, + "learning_rate": 7.059595914564965e-06, + "loss": 0.0019, + "step": 52360 + }, + { + "epoch": 0.8569091057841773, + "grad_norm": 0.03879012539982796, + "learning_rate": 7.058294693731068e-06, + "loss": 0.002, + "step": 52370 + }, + { + "epoch": 0.8570727317352532, + "grad_norm": 0.04547334089875221, + "learning_rate": 7.05699330502966e-06, + "loss": 0.0031, + "step": 52380 + }, + { + "epoch": 0.857236357686329, + "grad_norm": 0.053423453122377396, + "learning_rate": 7.055691748566875e-06, + "loss": 0.0017, + "step": 52390 + }, + { + "epoch": 0.8573999836374049, + "grad_norm": 0.043873924762010574, + "learning_rate": 7.0543900244488665e-06, + "loss": 0.0021, + "step": 52400 + }, + { + "epoch": 0.8575636095884808, + "grad_norm": 0.025887412950396538, + "learning_rate": 7.053088132781795e-06, + "loss": 0.0019, + "step": 52410 + }, + { + "epoch": 0.8577272355395565, + "grad_norm": 0.08187703788280487, + "learning_rate": 7.051786073671843e-06, + "loss": 0.003, + "step": 52420 + }, + { + "epoch": 0.8578908614906324, + "grad_norm": 0.023876162245869637, + "learning_rate": 7.050483847225199e-06, + "loss": 0.0023, + "step": 52430 + }, + { + "epoch": 0.8580544874417082, + "grad_norm": 0.036614932119846344, + "learning_rate": 7.049181453548068e-06, + "loss": 0.0026, + "step": 52440 + }, + { + "epoch": 0.8582181133927841, + "grad_norm": 0.08037271350622177, + "learning_rate": 7.047878892746671e-06, + "loss": 0.0022, + "step": 52450 + }, + { + "epoch": 0.85838173934386, + "grad_norm": 0.07627733051776886, + "learning_rate": 7.046576164927239e-06, + "loss": 0.0017, + "step": 52460 + }, + { + "epoch": 0.8585453652949357, + "grad_norm": 0.20880989730358124, + "learning_rate": 7.045273270196017e-06, + "loss": 0.004, + "step": 52470 + }, + { + "epoch": 0.8587089912460116, + "grad_norm": 0.05583646893501282, + "learning_rate": 7.043970208659265e-06, + "loss": 0.0017, + "step": 52480 + }, + { + "epoch": 0.8588726171970874, + "grad_norm": 0.11493480950593948, + "learning_rate": 7.0426669804232586e-06, + "loss": 0.002, + "step": 52490 + }, + { + "epoch": 0.8590362431481633, + "grad_norm": 0.1680634617805481, + "learning_rate": 7.041363585594282e-06, + "loss": 0.0023, + "step": 52500 + }, + { + "epoch": 0.8591998690992392, + "grad_norm": 0.07949693500995636, + "learning_rate": 7.040060024278636e-06, + "loss": 0.0032, + "step": 52510 + }, + { + "epoch": 0.859363495050315, + "grad_norm": 0.04821572080254555, + "learning_rate": 7.038756296582638e-06, + "loss": 0.0025, + "step": 52520 + }, + { + "epoch": 0.8595271210013908, + "grad_norm": 0.14623624086380005, + "learning_rate": 7.03745240261261e-06, + "loss": 0.0031, + "step": 52530 + }, + { + "epoch": 0.8596907469524666, + "grad_norm": 0.07858574390411377, + "learning_rate": 7.036148342474898e-06, + "loss": 0.0015, + "step": 52540 + }, + { + "epoch": 0.8598543729035425, + "grad_norm": 0.04424387961626053, + "learning_rate": 7.034844116275853e-06, + "loss": 0.0026, + "step": 52550 + }, + { + "epoch": 0.8600179988546184, + "grad_norm": 0.009820517152547836, + "learning_rate": 7.033539724121846e-06, + "loss": 0.0017, + "step": 52560 + }, + { + "epoch": 0.8601816248056942, + "grad_norm": 0.07594398409128189, + "learning_rate": 7.032235166119258e-06, + "loss": 0.0023, + "step": 52570 + }, + { + "epoch": 0.86034525075677, + "grad_norm": 0.17292647063732147, + "learning_rate": 7.030930442374484e-06, + "loss": 0.0024, + "step": 52580 + }, + { + "epoch": 0.8605088767078458, + "grad_norm": 0.05539826303720474, + "learning_rate": 7.029625552993933e-06, + "loss": 0.0021, + "step": 52590 + }, + { + "epoch": 0.8606725026589217, + "grad_norm": 0.1398976594209671, + "learning_rate": 7.028320498084027e-06, + "loss": 0.0019, + "step": 52600 + }, + { + "epoch": 0.8608361286099976, + "grad_norm": 0.03826690465211868, + "learning_rate": 7.027015277751201e-06, + "loss": 0.0014, + "step": 52610 + }, + { + "epoch": 0.8609997545610734, + "grad_norm": 0.07009593397378922, + "learning_rate": 7.025709892101908e-06, + "loss": 0.0059, + "step": 52620 + }, + { + "epoch": 0.8611633805121492, + "grad_norm": 0.013434285297989845, + "learning_rate": 7.024404341242606e-06, + "loss": 0.0026, + "step": 52630 + }, + { + "epoch": 0.861327006463225, + "grad_norm": 0.14003880321979523, + "learning_rate": 7.023098625279774e-06, + "loss": 0.0028, + "step": 52640 + }, + { + "epoch": 0.8614906324143009, + "grad_norm": 0.04105537757277489, + "learning_rate": 7.021792744319902e-06, + "loss": 0.0033, + "step": 52650 + }, + { + "epoch": 0.8616542583653768, + "grad_norm": 0.06372685730457306, + "learning_rate": 7.020486698469492e-06, + "loss": 0.0028, + "step": 52660 + }, + { + "epoch": 0.8618178843164526, + "grad_norm": 0.07221833616495132, + "learning_rate": 7.019180487835062e-06, + "loss": 0.0022, + "step": 52670 + }, + { + "epoch": 0.8619815102675285, + "grad_norm": 0.06876964867115021, + "learning_rate": 7.01787411252314e-06, + "loss": 0.0014, + "step": 52680 + }, + { + "epoch": 0.8621451362186042, + "grad_norm": 0.04022214189171791, + "learning_rate": 7.016567572640272e-06, + "loss": 0.0019, + "step": 52690 + }, + { + "epoch": 0.8623087621696801, + "grad_norm": 0.11424021422863007, + "learning_rate": 7.015260868293013e-06, + "loss": 0.0017, + "step": 52700 + }, + { + "epoch": 0.862472388120756, + "grad_norm": 0.08551132678985596, + "learning_rate": 7.013953999587935e-06, + "loss": 0.0018, + "step": 52710 + }, + { + "epoch": 0.8626360140718318, + "grad_norm": 0.08215713500976562, + "learning_rate": 7.01264696663162e-06, + "loss": 0.0022, + "step": 52720 + }, + { + "epoch": 0.8627996400229077, + "grad_norm": 0.020387010648846626, + "learning_rate": 7.011339769530665e-06, + "loss": 0.002, + "step": 52730 + }, + { + "epoch": 0.8629632659739834, + "grad_norm": 0.08078915625810623, + "learning_rate": 7.010032408391683e-06, + "loss": 0.0018, + "step": 52740 + }, + { + "epoch": 0.8631268919250593, + "grad_norm": 0.8305329084396362, + "learning_rate": 7.008724883321297e-06, + "loss": 0.0026, + "step": 52750 + }, + { + "epoch": 0.8632905178761352, + "grad_norm": 0.05274273455142975, + "learning_rate": 7.007417194426143e-06, + "loss": 0.0019, + "step": 52760 + }, + { + "epoch": 0.863454143827211, + "grad_norm": 0.038838643580675125, + "learning_rate": 7.0061093418128726e-06, + "loss": 0.0031, + "step": 52770 + }, + { + "epoch": 0.8636177697782869, + "grad_norm": 0.07990029454231262, + "learning_rate": 7.00480132558815e-06, + "loss": 0.0017, + "step": 52780 + }, + { + "epoch": 0.8637813957293626, + "grad_norm": 0.009780745953321457, + "learning_rate": 7.003493145858651e-06, + "loss": 0.0035, + "step": 52790 + }, + { + "epoch": 0.8639450216804385, + "grad_norm": 0.04110005125403404, + "learning_rate": 7.002184802731069e-06, + "loss": 0.0019, + "step": 52800 + }, + { + "epoch": 0.8641086476315144, + "grad_norm": 0.12480364739894867, + "learning_rate": 7.000876296312104e-06, + "loss": 0.0035, + "step": 52810 + }, + { + "epoch": 0.8642722735825902, + "grad_norm": 0.04220016673207283, + "learning_rate": 6.999567626708479e-06, + "loss": 0.0026, + "step": 52820 + }, + { + "epoch": 0.8644358995336661, + "grad_norm": 0.05488692969083786, + "learning_rate": 6.998258794026919e-06, + "loss": 0.0063, + "step": 52830 + }, + { + "epoch": 0.8645995254847418, + "grad_norm": 0.0398465059697628, + "learning_rate": 6.996949798374172e-06, + "loss": 0.0022, + "step": 52840 + }, + { + "epoch": 0.8647631514358177, + "grad_norm": 0.007414890918880701, + "learning_rate": 6.995640639856993e-06, + "loss": 0.0024, + "step": 52850 + }, + { + "epoch": 0.8649267773868936, + "grad_norm": 0.06604649871587753, + "learning_rate": 6.9943313185821535e-06, + "loss": 0.0015, + "step": 52860 + }, + { + "epoch": 0.8650904033379694, + "grad_norm": 0.04440661892294884, + "learning_rate": 6.993021834656437e-06, + "loss": 0.0025, + "step": 52870 + }, + { + "epoch": 0.8652540292890453, + "grad_norm": 0.09880305081605911, + "learning_rate": 6.99171218818664e-06, + "loss": 0.0023, + "step": 52880 + }, + { + "epoch": 0.865417655240121, + "grad_norm": 0.04403670132160187, + "learning_rate": 6.990402379279573e-06, + "loss": 0.0015, + "step": 52890 + }, + { + "epoch": 0.8655812811911969, + "grad_norm": 0.06780622899532318, + "learning_rate": 6.98909240804206e-06, + "loss": 0.0018, + "step": 52900 + }, + { + "epoch": 0.8657449071422728, + "grad_norm": 0.09713041037321091, + "learning_rate": 6.9877822745809385e-06, + "loss": 0.0023, + "step": 52910 + }, + { + "epoch": 0.8659085330933486, + "grad_norm": 0.09721215069293976, + "learning_rate": 6.986471979003058e-06, + "loss": 0.0016, + "step": 52920 + }, + { + "epoch": 0.8660721590444245, + "grad_norm": 0.13830965757369995, + "learning_rate": 6.98516152141528e-06, + "loss": 0.002, + "step": 52930 + }, + { + "epoch": 0.8662357849955002, + "grad_norm": 0.03229491412639618, + "learning_rate": 6.983850901924484e-06, + "loss": 0.0012, + "step": 52940 + }, + { + "epoch": 0.8663994109465761, + "grad_norm": 0.20107528567314148, + "learning_rate": 6.982540120637558e-06, + "loss": 0.0029, + "step": 52950 + }, + { + "epoch": 0.866563036897652, + "grad_norm": 0.09906178712844849, + "learning_rate": 6.981229177661403e-06, + "loss": 0.002, + "step": 52960 + }, + { + "epoch": 0.8667266628487278, + "grad_norm": 0.05163905769586563, + "learning_rate": 6.979918073102938e-06, + "loss": 0.0021, + "step": 52970 + }, + { + "epoch": 0.8668902887998037, + "grad_norm": 0.07864928245544434, + "learning_rate": 6.978606807069092e-06, + "loss": 0.0029, + "step": 52980 + }, + { + "epoch": 0.8670539147508795, + "grad_norm": 0.21290278434753418, + "learning_rate": 6.977295379666807e-06, + "loss": 0.0026, + "step": 52990 + }, + { + "epoch": 0.8672175407019553, + "grad_norm": 0.08762305229902267, + "learning_rate": 6.975983791003037e-06, + "loss": 0.004, + "step": 53000 + }, + { + "epoch": 0.8673811666530312, + "grad_norm": 0.10097048431634903, + "learning_rate": 6.974672041184752e-06, + "loss": 0.0032, + "step": 53010 + }, + { + "epoch": 0.867544792604107, + "grad_norm": 0.03274586424231529, + "learning_rate": 6.973360130318933e-06, + "loss": 0.0032, + "step": 53020 + }, + { + "epoch": 0.8677084185551829, + "grad_norm": 0.003902699565514922, + "learning_rate": 6.972048058512577e-06, + "loss": 0.001, + "step": 53030 + }, + { + "epoch": 0.8678720445062587, + "grad_norm": 0.5004389882087708, + "learning_rate": 6.970735825872691e-06, + "loss": 0.0023, + "step": 53040 + }, + { + "epoch": 0.8680356704573345, + "grad_norm": 0.08635661005973816, + "learning_rate": 6.9694234325062945e-06, + "loss": 0.0026, + "step": 53050 + }, + { + "epoch": 0.8681992964084104, + "grad_norm": 0.087120920419693, + "learning_rate": 6.968110878520426e-06, + "loss": 0.0012, + "step": 53060 + }, + { + "epoch": 0.8683629223594862, + "grad_norm": 0.06626240909099579, + "learning_rate": 6.966798164022128e-06, + "loss": 0.0023, + "step": 53070 + }, + { + "epoch": 0.8685265483105621, + "grad_norm": 0.038560375571250916, + "learning_rate": 6.965485289118464e-06, + "loss": 0.002, + "step": 53080 + }, + { + "epoch": 0.8686901742616379, + "grad_norm": 0.01193628553301096, + "learning_rate": 6.964172253916507e-06, + "loss": 0.0031, + "step": 53090 + }, + { + "epoch": 0.8688538002127137, + "grad_norm": 0.26060912013053894, + "learning_rate": 6.962859058523344e-06, + "loss": 0.0024, + "step": 53100 + }, + { + "epoch": 0.8690174261637896, + "grad_norm": 0.10607622563838959, + "learning_rate": 6.961545703046073e-06, + "loss": 0.0022, + "step": 53110 + }, + { + "epoch": 0.8691810521148654, + "grad_norm": 0.03295673429965973, + "learning_rate": 6.96023218759181e-06, + "loss": 0.0018, + "step": 53120 + }, + { + "epoch": 0.8693446780659413, + "grad_norm": 0.04429740831255913, + "learning_rate": 6.958918512267678e-06, + "loss": 0.0023, + "step": 53130 + }, + { + "epoch": 0.8695083040170171, + "grad_norm": 0.05577025189995766, + "learning_rate": 6.957604677180818e-06, + "loss": 0.0015, + "step": 53140 + }, + { + "epoch": 0.869671929968093, + "grad_norm": 0.10671288520097733, + "learning_rate": 6.95629068243838e-06, + "loss": 0.0019, + "step": 53150 + }, + { + "epoch": 0.8698355559191688, + "grad_norm": 0.10934911668300629, + "learning_rate": 6.954976528147531e-06, + "loss": 0.004, + "step": 53160 + }, + { + "epoch": 0.8699991818702446, + "grad_norm": 0.07432562857866287, + "learning_rate": 6.953662214415446e-06, + "loss": 0.0023, + "step": 53170 + }, + { + "epoch": 0.8701628078213205, + "grad_norm": 0.2075672596693039, + "learning_rate": 6.952347741349319e-06, + "loss": 0.0018, + "step": 53180 + }, + { + "epoch": 0.8703264337723963, + "grad_norm": 0.1452101469039917, + "learning_rate": 6.951033109056353e-06, + "loss": 0.0021, + "step": 53190 + }, + { + "epoch": 0.8704900597234722, + "grad_norm": 0.06605714559555054, + "learning_rate": 6.949718317643764e-06, + "loss": 0.0025, + "step": 53200 + }, + { + "epoch": 0.8706536856745479, + "grad_norm": 0.09846769273281097, + "learning_rate": 6.948403367218781e-06, + "loss": 0.0033, + "step": 53210 + }, + { + "epoch": 0.8708173116256238, + "grad_norm": 0.05244271457195282, + "learning_rate": 6.94708825788865e-06, + "loss": 0.0012, + "step": 53220 + }, + { + "epoch": 0.8709809375766997, + "grad_norm": 0.06133155897259712, + "learning_rate": 6.945772989760626e-06, + "loss": 0.0055, + "step": 53230 + }, + { + "epoch": 0.8711445635277755, + "grad_norm": 0.1320413202047348, + "learning_rate": 6.944457562941975e-06, + "loss": 0.0027, + "step": 53240 + }, + { + "epoch": 0.8713081894788514, + "grad_norm": 0.006265302188694477, + "learning_rate": 6.943141977539982e-06, + "loss": 0.0009, + "step": 53250 + }, + { + "epoch": 0.8714718154299271, + "grad_norm": 0.11734145134687424, + "learning_rate": 6.941826233661941e-06, + "loss": 0.0018, + "step": 53260 + }, + { + "epoch": 0.871635441381003, + "grad_norm": 0.04843049868941307, + "learning_rate": 6.940510331415158e-06, + "loss": 0.0014, + "step": 53270 + }, + { + "epoch": 0.8717990673320789, + "grad_norm": 0.040032610297203064, + "learning_rate": 6.939194270906955e-06, + "loss": 0.002, + "step": 53280 + }, + { + "epoch": 0.8719626932831547, + "grad_norm": 0.06429308652877808, + "learning_rate": 6.937878052244664e-06, + "loss": 0.0022, + "step": 53290 + }, + { + "epoch": 0.8721263192342306, + "grad_norm": 0.043519239872694016, + "learning_rate": 6.936561675535634e-06, + "loss": 0.0024, + "step": 53300 + }, + { + "epoch": 0.8722899451853063, + "grad_norm": 0.0638105645775795, + "learning_rate": 6.935245140887221e-06, + "loss": 0.0027, + "step": 53310 + }, + { + "epoch": 0.8724535711363822, + "grad_norm": 0.17934434115886688, + "learning_rate": 6.9339284484068e-06, + "loss": 0.0026, + "step": 53320 + }, + { + "epoch": 0.8726171970874581, + "grad_norm": 0.06698743999004364, + "learning_rate": 6.932611598201754e-06, + "loss": 0.0036, + "step": 53330 + }, + { + "epoch": 0.8727808230385339, + "grad_norm": 0.0858529731631279, + "learning_rate": 6.93129459037948e-06, + "loss": 0.0025, + "step": 53340 + }, + { + "epoch": 0.8729444489896098, + "grad_norm": 0.07484275102615356, + "learning_rate": 6.929977425047392e-06, + "loss": 0.0021, + "step": 53350 + }, + { + "epoch": 0.8731080749406855, + "grad_norm": 0.24180133640766144, + "learning_rate": 6.928660102312911e-06, + "loss": 0.0012, + "step": 53360 + }, + { + "epoch": 0.8732717008917614, + "grad_norm": 0.0876980647444725, + "learning_rate": 6.9273426222834735e-06, + "loss": 0.0027, + "step": 53370 + }, + { + "epoch": 0.8734353268428373, + "grad_norm": 0.062733493745327, + "learning_rate": 6.92602498506653e-06, + "loss": 0.0024, + "step": 53380 + }, + { + "epoch": 0.8735989527939131, + "grad_norm": 0.12802116572856903, + "learning_rate": 6.92470719076954e-06, + "loss": 0.0063, + "step": 53390 + }, + { + "epoch": 0.873762578744989, + "grad_norm": 0.03013026900589466, + "learning_rate": 6.923389239499982e-06, + "loss": 0.0017, + "step": 53400 + }, + { + "epoch": 0.8739262046960647, + "grad_norm": 0.07950850576162338, + "learning_rate": 6.92207113136534e-06, + "loss": 0.0017, + "step": 53410 + }, + { + "epoch": 0.8740898306471406, + "grad_norm": 0.040680043399333954, + "learning_rate": 6.920752866473116e-06, + "loss": 0.0015, + "step": 53420 + }, + { + "epoch": 0.8742534565982165, + "grad_norm": 0.0779644325375557, + "learning_rate": 6.919434444930823e-06, + "loss": 0.0014, + "step": 53430 + }, + { + "epoch": 0.8744170825492923, + "grad_norm": 0.08873041719198227, + "learning_rate": 6.918115866845988e-06, + "loss": 0.0028, + "step": 53440 + }, + { + "epoch": 0.8745807085003682, + "grad_norm": 0.08217376470565796, + "learning_rate": 6.916797132326148e-06, + "loss": 0.0016, + "step": 53450 + }, + { + "epoch": 0.874744334451444, + "grad_norm": 0.12217187136411667, + "learning_rate": 6.915478241478857e-06, + "loss": 0.0015, + "step": 53460 + }, + { + "epoch": 0.8749079604025198, + "grad_norm": 0.11413371562957764, + "learning_rate": 6.914159194411676e-06, + "loss": 0.0038, + "step": 53470 + }, + { + "epoch": 0.8750715863535957, + "grad_norm": 0.08643936365842819, + "learning_rate": 6.9128399912321845e-06, + "loss": 0.0032, + "step": 53480 + }, + { + "epoch": 0.8752352123046715, + "grad_norm": 0.05037185177206993, + "learning_rate": 6.911520632047972e-06, + "loss": 0.0012, + "step": 53490 + }, + { + "epoch": 0.8753988382557474, + "grad_norm": 0.06223201006650925, + "learning_rate": 6.91020111696664e-06, + "loss": 0.0023, + "step": 53500 + }, + { + "epoch": 0.8755624642068232, + "grad_norm": 0.06319757550954819, + "learning_rate": 6.9088814460958035e-06, + "loss": 0.0024, + "step": 53510 + }, + { + "epoch": 0.875726090157899, + "grad_norm": 0.0652029812335968, + "learning_rate": 6.907561619543091e-06, + "loss": 0.0017, + "step": 53520 + }, + { + "epoch": 0.8758897161089749, + "grad_norm": 0.27320727705955505, + "learning_rate": 6.906241637416142e-06, + "loss": 0.0019, + "step": 53530 + }, + { + "epoch": 0.8760533420600507, + "grad_norm": 0.062421705573797226, + "learning_rate": 6.904921499822611e-06, + "loss": 0.003, + "step": 53540 + }, + { + "epoch": 0.8762169680111266, + "grad_norm": 0.0823979303240776, + "learning_rate": 6.903601206870164e-06, + "loss": 0.0018, + "step": 53550 + }, + { + "epoch": 0.8763805939622024, + "grad_norm": 0.13977567851543427, + "learning_rate": 6.90228075866648e-06, + "loss": 0.0024, + "step": 53560 + }, + { + "epoch": 0.8765442199132782, + "grad_norm": 0.0352754220366478, + "learning_rate": 6.900960155319248e-06, + "loss": 0.0009, + "step": 53570 + }, + { + "epoch": 0.8767078458643541, + "grad_norm": 0.057808853685855865, + "learning_rate": 6.899639396936173e-06, + "loss": 0.0012, + "step": 53580 + }, + { + "epoch": 0.8768714718154299, + "grad_norm": 0.17973849177360535, + "learning_rate": 6.898318483624973e-06, + "loss": 0.0013, + "step": 53590 + }, + { + "epoch": 0.8770350977665058, + "grad_norm": 0.12151037156581879, + "learning_rate": 6.896997415493376e-06, + "loss": 0.0022, + "step": 53600 + }, + { + "epoch": 0.8771987237175816, + "grad_norm": 0.025110622867941856, + "learning_rate": 6.895676192649124e-06, + "loss": 0.0022, + "step": 53610 + }, + { + "epoch": 0.8773623496686574, + "grad_norm": 0.18021991848945618, + "learning_rate": 6.894354815199971e-06, + "loss": 0.0016, + "step": 53620 + }, + { + "epoch": 0.8775259756197333, + "grad_norm": 0.03200940042734146, + "learning_rate": 6.893033283253685e-06, + "loss": 0.0021, + "step": 53630 + }, + { + "epoch": 0.8776896015708091, + "grad_norm": 0.05682295188307762, + "learning_rate": 6.8917115969180445e-06, + "loss": 0.0018, + "step": 53640 + }, + { + "epoch": 0.877853227521885, + "grad_norm": 0.1284276396036148, + "learning_rate": 6.890389756300842e-06, + "loss": 0.0013, + "step": 53650 + }, + { + "epoch": 0.8780168534729608, + "grad_norm": 0.026704085990786552, + "learning_rate": 6.8890677615098834e-06, + "loss": 0.0018, + "step": 53660 + }, + { + "epoch": 0.8781804794240367, + "grad_norm": 0.24410900473594666, + "learning_rate": 6.887745612652985e-06, + "loss": 0.003, + "step": 53670 + }, + { + "epoch": 0.8783441053751125, + "grad_norm": 0.009228730574250221, + "learning_rate": 6.886423309837978e-06, + "loss": 0.003, + "step": 53680 + }, + { + "epoch": 0.8785077313261883, + "grad_norm": 0.08482401072978973, + "learning_rate": 6.885100853172702e-06, + "loss": 0.0025, + "step": 53690 + }, + { + "epoch": 0.8786713572772642, + "grad_norm": 0.0654645711183548, + "learning_rate": 6.883778242765016e-06, + "loss": 0.0016, + "step": 53700 + }, + { + "epoch": 0.87883498322834, + "grad_norm": 0.08374740928411484, + "learning_rate": 6.8824554787227845e-06, + "loss": 0.0024, + "step": 53710 + }, + { + "epoch": 0.8789986091794159, + "grad_norm": 0.029770608991384506, + "learning_rate": 6.881132561153889e-06, + "loss": 0.0011, + "step": 53720 + }, + { + "epoch": 0.8791622351304917, + "grad_norm": 0.02724733017385006, + "learning_rate": 6.879809490166223e-06, + "loss": 0.0013, + "step": 53730 + }, + { + "epoch": 0.8793258610815675, + "grad_norm": 0.010225177742540836, + "learning_rate": 6.8784862658676895e-06, + "loss": 0.0029, + "step": 53740 + }, + { + "epoch": 0.8794894870326434, + "grad_norm": 0.17977429926395416, + "learning_rate": 6.877162888366208e-06, + "loss": 0.0022, + "step": 53750 + }, + { + "epoch": 0.8796531129837192, + "grad_norm": 0.12213508784770966, + "learning_rate": 6.875839357769707e-06, + "loss": 0.0034, + "step": 53760 + }, + { + "epoch": 0.8798167389347951, + "grad_norm": 0.026230571791529655, + "learning_rate": 6.874515674186131e-06, + "loss": 0.0024, + "step": 53770 + }, + { + "epoch": 0.879980364885871, + "grad_norm": 0.02563793957233429, + "learning_rate": 6.873191837723434e-06, + "loss": 0.0017, + "step": 53780 + }, + { + "epoch": 0.8801439908369467, + "grad_norm": 0.15670177340507507, + "learning_rate": 6.871867848489584e-06, + "loss": 0.0031, + "step": 53790 + }, + { + "epoch": 0.8803076167880226, + "grad_norm": 0.0036613966803997755, + "learning_rate": 6.870543706592561e-06, + "loss": 0.0009, + "step": 53800 + }, + { + "epoch": 0.8804712427390984, + "grad_norm": 0.0243604127317667, + "learning_rate": 6.869219412140359e-06, + "loss": 0.002, + "step": 53810 + }, + { + "epoch": 0.8806348686901743, + "grad_norm": 0.1405079960823059, + "learning_rate": 6.8678949652409785e-06, + "loss": 0.0031, + "step": 53820 + }, + { + "epoch": 0.8807984946412502, + "grad_norm": 0.029803363606333733, + "learning_rate": 6.866570366002442e-06, + "loss": 0.0021, + "step": 53830 + }, + { + "epoch": 0.8809621205923259, + "grad_norm": 0.08115305006504059, + "learning_rate": 6.8652456145327765e-06, + "loss": 0.0032, + "step": 53840 + }, + { + "epoch": 0.8811257465434018, + "grad_norm": 0.05830787867307663, + "learning_rate": 6.863920710940023e-06, + "loss": 0.0009, + "step": 53850 + }, + { + "epoch": 0.8812893724944776, + "grad_norm": 0.032207537442445755, + "learning_rate": 6.8625956553322395e-06, + "loss": 0.0013, + "step": 53860 + }, + { + "epoch": 0.8814529984455535, + "grad_norm": 0.05618473142385483, + "learning_rate": 6.861270447817492e-06, + "loss": 0.0015, + "step": 53870 + }, + { + "epoch": 0.8816166243966294, + "grad_norm": 0.07736112177371979, + "learning_rate": 6.859945088503858e-06, + "loss": 0.0017, + "step": 53880 + }, + { + "epoch": 0.8817802503477051, + "grad_norm": 0.06020403280854225, + "learning_rate": 6.858619577499431e-06, + "loss": 0.0022, + "step": 53890 + }, + { + "epoch": 0.881943876298781, + "grad_norm": 0.05019012466073036, + "learning_rate": 6.857293914912316e-06, + "loss": 0.0017, + "step": 53900 + }, + { + "epoch": 0.8821075022498568, + "grad_norm": 0.029793351888656616, + "learning_rate": 6.855968100850626e-06, + "loss": 0.0023, + "step": 53910 + }, + { + "epoch": 0.8822711282009327, + "grad_norm": 0.028437718749046326, + "learning_rate": 6.8546421354224946e-06, + "loss": 0.0012, + "step": 53920 + }, + { + "epoch": 0.8824347541520086, + "grad_norm": 0.03040366806089878, + "learning_rate": 6.853316018736059e-06, + "loss": 0.0024, + "step": 53930 + }, + { + "epoch": 0.8825983801030843, + "grad_norm": 0.11271120607852936, + "learning_rate": 6.8519897508994745e-06, + "loss": 0.0036, + "step": 53940 + }, + { + "epoch": 0.8827620060541602, + "grad_norm": 0.026811890304088593, + "learning_rate": 6.850663332020908e-06, + "loss": 0.0015, + "step": 53950 + }, + { + "epoch": 0.882925632005236, + "grad_norm": 0.11508350074291229, + "learning_rate": 6.849336762208536e-06, + "loss": 0.0019, + "step": 53960 + }, + { + "epoch": 0.8830892579563119, + "grad_norm": 0.022951634600758553, + "learning_rate": 6.848010041570551e-06, + "loss": 0.0017, + "step": 53970 + }, + { + "epoch": 0.8832528839073878, + "grad_norm": 0.10866470634937286, + "learning_rate": 6.846683170215153e-06, + "loss": 0.0025, + "step": 53980 + }, + { + "epoch": 0.8834165098584635, + "grad_norm": 0.17455822229385376, + "learning_rate": 6.8453561482505594e-06, + "loss": 0.0013, + "step": 53990 + }, + { + "epoch": 0.8835801358095394, + "grad_norm": 0.01109520997852087, + "learning_rate": 6.844028975784995e-06, + "loss": 0.0014, + "step": 54000 + }, + { + "epoch": 0.8837437617606152, + "grad_norm": 0.06662735342979431, + "learning_rate": 6.842701652926703e-06, + "loss": 0.0021, + "step": 54010 + }, + { + "epoch": 0.8839073877116911, + "grad_norm": 0.1067625880241394, + "learning_rate": 6.841374179783934e-06, + "loss": 0.0012, + "step": 54020 + }, + { + "epoch": 0.884071013662767, + "grad_norm": 0.03916068375110626, + "learning_rate": 6.840046556464951e-06, + "loss": 0.0015, + "step": 54030 + }, + { + "epoch": 0.8842346396138427, + "grad_norm": 0.03386598825454712, + "learning_rate": 6.838718783078031e-06, + "loss": 0.0029, + "step": 54040 + }, + { + "epoch": 0.8843982655649186, + "grad_norm": 0.02871730737388134, + "learning_rate": 6.837390859731463e-06, + "loss": 0.003, + "step": 54050 + }, + { + "epoch": 0.8845618915159944, + "grad_norm": 0.10410573333501816, + "learning_rate": 6.8360627865335486e-06, + "loss": 0.0016, + "step": 54060 + }, + { + "epoch": 0.8847255174670703, + "grad_norm": 0.11157534271478653, + "learning_rate": 6.834734563592599e-06, + "loss": 0.0028, + "step": 54070 + }, + { + "epoch": 0.8848891434181462, + "grad_norm": 0.11567296087741852, + "learning_rate": 6.833406191016941e-06, + "loss": 0.002, + "step": 54080 + }, + { + "epoch": 0.885052769369222, + "grad_norm": 0.10017170011997223, + "learning_rate": 6.8320776689149126e-06, + "loss": 0.0018, + "step": 54090 + }, + { + "epoch": 0.8852163953202978, + "grad_norm": 0.11726928502321243, + "learning_rate": 6.8307489973948625e-06, + "loss": 0.0036, + "step": 54100 + }, + { + "epoch": 0.8853800212713736, + "grad_norm": 0.04573062062263489, + "learning_rate": 6.829420176565153e-06, + "loss": 0.0012, + "step": 54110 + }, + { + "epoch": 0.8855436472224495, + "grad_norm": 0.06171920523047447, + "learning_rate": 6.828091206534157e-06, + "loss": 0.0022, + "step": 54120 + }, + { + "epoch": 0.8857072731735253, + "grad_norm": 0.08566936105489731, + "learning_rate": 6.826762087410264e-06, + "loss": 0.0029, + "step": 54130 + }, + { + "epoch": 0.8858708991246012, + "grad_norm": 0.0958702340722084, + "learning_rate": 6.8254328193018695e-06, + "loss": 0.0012, + "step": 54140 + }, + { + "epoch": 0.886034525075677, + "grad_norm": 0.24597111344337463, + "learning_rate": 6.824103402317384e-06, + "loss": 0.0039, + "step": 54150 + }, + { + "epoch": 0.8861981510267528, + "grad_norm": 0.16293805837631226, + "learning_rate": 6.8227738365652315e-06, + "loss": 0.0014, + "step": 54160 + }, + { + "epoch": 0.8863617769778287, + "grad_norm": 0.22481563687324524, + "learning_rate": 6.821444122153846e-06, + "loss": 0.0022, + "step": 54170 + }, + { + "epoch": 0.8865254029289045, + "grad_norm": 0.02314728870987892, + "learning_rate": 6.820114259191675e-06, + "loss": 0.0021, + "step": 54180 + }, + { + "epoch": 0.8866890288799804, + "grad_norm": 0.08620315045118332, + "learning_rate": 6.818784247787179e-06, + "loss": 0.0032, + "step": 54190 + }, + { + "epoch": 0.8868526548310562, + "grad_norm": 0.05162770301103592, + "learning_rate": 6.817454088048827e-06, + "loss": 0.0026, + "step": 54200 + }, + { + "epoch": 0.887016280782132, + "grad_norm": 0.02676309645175934, + "learning_rate": 6.816123780085103e-06, + "loss": 0.0025, + "step": 54210 + }, + { + "epoch": 0.8871799067332079, + "grad_norm": 0.020230954512953758, + "learning_rate": 6.814793324004503e-06, + "loss": 0.0021, + "step": 54220 + }, + { + "epoch": 0.8873435326842837, + "grad_norm": 0.05676576495170593, + "learning_rate": 6.813462719915533e-06, + "loss": 0.0014, + "step": 54230 + }, + { + "epoch": 0.8875071586353596, + "grad_norm": 0.07944746315479279, + "learning_rate": 6.812131967926714e-06, + "loss": 0.0017, + "step": 54240 + }, + { + "epoch": 0.8876707845864354, + "grad_norm": 0.010663527064025402, + "learning_rate": 6.8108010681465775e-06, + "loss": 0.0017, + "step": 54250 + }, + { + "epoch": 0.8878344105375112, + "grad_norm": 0.15500694513320923, + "learning_rate": 6.809470020683666e-06, + "loss": 0.0029, + "step": 54260 + }, + { + "epoch": 0.8879980364885871, + "grad_norm": 0.02343682385981083, + "learning_rate": 6.808138825646537e-06, + "loss": 0.0027, + "step": 54270 + }, + { + "epoch": 0.8881616624396629, + "grad_norm": 0.14620138704776764, + "learning_rate": 6.806807483143756e-06, + "loss": 0.0021, + "step": 54280 + }, + { + "epoch": 0.8883252883907388, + "grad_norm": 0.11204607784748077, + "learning_rate": 6.805475993283904e-06, + "loss": 0.0031, + "step": 54290 + }, + { + "epoch": 0.8884889143418147, + "grad_norm": 0.09329970180988312, + "learning_rate": 6.804144356175574e-06, + "loss": 0.0019, + "step": 54300 + }, + { + "epoch": 0.8886525402928904, + "grad_norm": 0.05720141530036926, + "learning_rate": 6.8028125719273675e-06, + "loss": 0.0011, + "step": 54310 + }, + { + "epoch": 0.8888161662439663, + "grad_norm": 0.07613986730575562, + "learning_rate": 6.8014806406479015e-06, + "loss": 0.0031, + "step": 54320 + }, + { + "epoch": 0.8889797921950421, + "grad_norm": 0.3952399492263794, + "learning_rate": 6.800148562445804e-06, + "loss": 0.0019, + "step": 54330 + }, + { + "epoch": 0.889143418146118, + "grad_norm": 0.007523656357079744, + "learning_rate": 6.7988163374297145e-06, + "loss": 0.0018, + "step": 54340 + }, + { + "epoch": 0.8893070440971939, + "grad_norm": 0.1378215104341507, + "learning_rate": 6.797483965708284e-06, + "loss": 0.0021, + "step": 54350 + }, + { + "epoch": 0.8894706700482696, + "grad_norm": 0.0389818400144577, + "learning_rate": 6.796151447390177e-06, + "loss": 0.0022, + "step": 54360 + }, + { + "epoch": 0.8896342959993455, + "grad_norm": 0.1969531923532486, + "learning_rate": 6.794818782584069e-06, + "loss": 0.0031, + "step": 54370 + }, + { + "epoch": 0.8897979219504213, + "grad_norm": 0.023818299174308777, + "learning_rate": 6.793485971398648e-06, + "loss": 0.0022, + "step": 54380 + }, + { + "epoch": 0.8899615479014972, + "grad_norm": 0.23318390548229218, + "learning_rate": 6.792153013942613e-06, + "loss": 0.0039, + "step": 54390 + }, + { + "epoch": 0.8901251738525731, + "grad_norm": 0.040484681725502014, + "learning_rate": 6.790819910324675e-06, + "loss": 0.0015, + "step": 54400 + }, + { + "epoch": 0.8902887998036488, + "grad_norm": 0.05555105581879616, + "learning_rate": 6.789486660653558e-06, + "loss": 0.0021, + "step": 54410 + }, + { + "epoch": 0.8904524257547247, + "grad_norm": 0.025553874671459198, + "learning_rate": 6.788153265037998e-06, + "loss": 0.0034, + "step": 54420 + }, + { + "epoch": 0.8906160517058005, + "grad_norm": 0.035305339843034744, + "learning_rate": 6.78681972358674e-06, + "loss": 0.002, + "step": 54430 + }, + { + "epoch": 0.8907796776568764, + "grad_norm": 0.00880409125238657, + "learning_rate": 6.785486036408546e-06, + "loss": 0.0026, + "step": 54440 + }, + { + "epoch": 0.8909433036079523, + "grad_norm": 0.05229269713163376, + "learning_rate": 6.784152203612188e-06, + "loss": 0.0009, + "step": 54450 + }, + { + "epoch": 0.891106929559028, + "grad_norm": 0.12757587432861328, + "learning_rate": 6.782818225306443e-06, + "loss": 0.0029, + "step": 54460 + }, + { + "epoch": 0.8912705555101039, + "grad_norm": 0.002629539230838418, + "learning_rate": 6.781484101600111e-06, + "loss": 0.0019, + "step": 54470 + }, + { + "epoch": 0.8914341814611797, + "grad_norm": 0.08326783776283264, + "learning_rate": 6.780149832601996e-06, + "loss": 0.004, + "step": 54480 + }, + { + "epoch": 0.8915978074122556, + "grad_norm": 0.04776367172598839, + "learning_rate": 6.778815418420916e-06, + "loss": 0.0025, + "step": 54490 + }, + { + "epoch": 0.8917614333633315, + "grad_norm": 0.08596014231443405, + "learning_rate": 6.777480859165703e-06, + "loss": 0.0042, + "step": 54500 + }, + { + "epoch": 0.8919250593144072, + "grad_norm": 0.1295817494392395, + "learning_rate": 6.7761461549452e-06, + "loss": 0.0017, + "step": 54510 + }, + { + "epoch": 0.8920886852654831, + "grad_norm": 0.06088557094335556, + "learning_rate": 6.774811305868259e-06, + "loss": 0.0016, + "step": 54520 + }, + { + "epoch": 0.8922523112165589, + "grad_norm": 0.08484260737895966, + "learning_rate": 6.773476312043748e-06, + "loss": 0.0019, + "step": 54530 + }, + { + "epoch": 0.8924159371676348, + "grad_norm": 0.09183470159769058, + "learning_rate": 6.7721411735805395e-06, + "loss": 0.0017, + "step": 54540 + }, + { + "epoch": 0.8925795631187107, + "grad_norm": 0.10159572958946228, + "learning_rate": 6.770805890587529e-06, + "loss": 0.0036, + "step": 54550 + }, + { + "epoch": 0.8927431890697864, + "grad_norm": 0.08508113771677017, + "learning_rate": 6.769470463173613e-06, + "loss": 0.0028, + "step": 54560 + }, + { + "epoch": 0.8929068150208623, + "grad_norm": 0.07915453612804413, + "learning_rate": 6.768134891447708e-06, + "loss": 0.0017, + "step": 54570 + }, + { + "epoch": 0.8930704409719381, + "grad_norm": 0.13049060106277466, + "learning_rate": 6.766799175518736e-06, + "loss": 0.0015, + "step": 54580 + }, + { + "epoch": 0.893234066923014, + "grad_norm": 0.07821011543273926, + "learning_rate": 6.7654633154956336e-06, + "loss": 0.002, + "step": 54590 + }, + { + "epoch": 0.8933976928740899, + "grad_norm": 0.07520847767591476, + "learning_rate": 6.76412731148735e-06, + "loss": 0.0016, + "step": 54600 + }, + { + "epoch": 0.8935613188251657, + "grad_norm": 0.07387443631887436, + "learning_rate": 6.762791163602846e-06, + "loss": 0.0024, + "step": 54610 + }, + { + "epoch": 0.8937249447762415, + "grad_norm": 0.041292138397693634, + "learning_rate": 6.761454871951092e-06, + "loss": 0.0023, + "step": 54620 + }, + { + "epoch": 0.8938885707273173, + "grad_norm": 0.06723076850175858, + "learning_rate": 6.760118436641072e-06, + "loss": 0.0016, + "step": 54630 + }, + { + "epoch": 0.8940521966783932, + "grad_norm": 0.011046124622225761, + "learning_rate": 6.75878185778178e-06, + "loss": 0.0017, + "step": 54640 + }, + { + "epoch": 0.8942158226294691, + "grad_norm": 0.07001950591802597, + "learning_rate": 6.757445135482223e-06, + "loss": 0.0038, + "step": 54650 + }, + { + "epoch": 0.8943794485805449, + "grad_norm": 0.15871793031692505, + "learning_rate": 6.756108269851421e-06, + "loss": 0.002, + "step": 54660 + }, + { + "epoch": 0.8945430745316207, + "grad_norm": 0.2555104196071625, + "learning_rate": 6.754771260998404e-06, + "loss": 0.0019, + "step": 54670 + }, + { + "epoch": 0.8947067004826965, + "grad_norm": 0.07030268013477325, + "learning_rate": 6.753434109032212e-06, + "loss": 0.0012, + "step": 54680 + }, + { + "epoch": 0.8948703264337724, + "grad_norm": 0.02897796221077442, + "learning_rate": 6.7520968140619e-06, + "loss": 0.0014, + "step": 54690 + }, + { + "epoch": 0.8950339523848483, + "grad_norm": 0.06385869532823563, + "learning_rate": 6.750759376196534e-06, + "loss": 0.0026, + "step": 54700 + }, + { + "epoch": 0.8951975783359241, + "grad_norm": 0.05134237930178642, + "learning_rate": 6.749421795545188e-06, + "loss": 0.0032, + "step": 54710 + }, + { + "epoch": 0.895361204287, + "grad_norm": 0.15613682568073273, + "learning_rate": 6.7480840722169536e-06, + "loss": 0.0025, + "step": 54720 + }, + { + "epoch": 0.8955248302380757, + "grad_norm": 0.0398799404501915, + "learning_rate": 6.74674620632093e-06, + "loss": 0.0024, + "step": 54730 + }, + { + "epoch": 0.8956884561891516, + "grad_norm": 0.037842899560928345, + "learning_rate": 6.745408197966228e-06, + "loss": 0.0016, + "step": 54740 + }, + { + "epoch": 0.8958520821402275, + "grad_norm": 0.04701778665184975, + "learning_rate": 6.744070047261973e-06, + "loss": 0.0019, + "step": 54750 + }, + { + "epoch": 0.8960157080913033, + "grad_norm": 0.02147512137889862, + "learning_rate": 6.742731754317297e-06, + "loss": 0.0022, + "step": 54760 + }, + { + "epoch": 0.8961793340423791, + "grad_norm": 0.01278098113834858, + "learning_rate": 6.74139331924135e-06, + "loss": 0.0016, + "step": 54770 + }, + { + "epoch": 0.8963429599934549, + "grad_norm": 0.044319625943899155, + "learning_rate": 6.740054742143288e-06, + "loss": 0.0026, + "step": 54780 + }, + { + "epoch": 0.8965065859445308, + "grad_norm": 0.16306254267692566, + "learning_rate": 6.738716023132281e-06, + "loss": 0.0021, + "step": 54790 + }, + { + "epoch": 0.8966702118956067, + "grad_norm": 0.07865049690008163, + "learning_rate": 6.737377162317511e-06, + "loss": 0.0023, + "step": 54800 + }, + { + "epoch": 0.8968338378466825, + "grad_norm": 0.05578213185071945, + "learning_rate": 6.7360381598081715e-06, + "loss": 0.0023, + "step": 54810 + }, + { + "epoch": 0.8969974637977584, + "grad_norm": 0.021231284365057945, + "learning_rate": 6.7346990157134664e-06, + "loss": 0.0017, + "step": 54820 + }, + { + "epoch": 0.8971610897488341, + "grad_norm": 0.03647147864103317, + "learning_rate": 6.73335973014261e-06, + "loss": 0.0017, + "step": 54830 + }, + { + "epoch": 0.89732471569991, + "grad_norm": 0.0962071418762207, + "learning_rate": 6.732020303204832e-06, + "loss": 0.0018, + "step": 54840 + }, + { + "epoch": 0.8974883416509859, + "grad_norm": 0.2508019804954529, + "learning_rate": 6.730680735009371e-06, + "loss": 0.0036, + "step": 54850 + }, + { + "epoch": 0.8976519676020617, + "grad_norm": 0.05875832587480545, + "learning_rate": 6.729341025665477e-06, + "loss": 0.0039, + "step": 54860 + }, + { + "epoch": 0.8978155935531376, + "grad_norm": 0.04013267159461975, + "learning_rate": 6.728001175282414e-06, + "loss": 0.0019, + "step": 54870 + }, + { + "epoch": 0.8979792195042133, + "grad_norm": 0.1119929701089859, + "learning_rate": 6.726661183969453e-06, + "loss": 0.0035, + "step": 54880 + }, + { + "epoch": 0.8981428454552892, + "grad_norm": 0.10151662677526474, + "learning_rate": 6.72532105183588e-06, + "loss": 0.0014, + "step": 54890 + }, + { + "epoch": 0.8983064714063651, + "grad_norm": 0.09523914009332657, + "learning_rate": 6.723980778990994e-06, + "loss": 0.0013, + "step": 54900 + }, + { + "epoch": 0.8984700973574409, + "grad_norm": 0.027653254568576813, + "learning_rate": 6.7226403655441e-06, + "loss": 0.0019, + "step": 54910 + }, + { + "epoch": 0.8986337233085168, + "grad_norm": 0.0768464058637619, + "learning_rate": 6.721299811604518e-06, + "loss": 0.0016, + "step": 54920 + }, + { + "epoch": 0.8987973492595925, + "grad_norm": 0.11579815298318863, + "learning_rate": 6.7199591172815816e-06, + "loss": 0.003, + "step": 54930 + }, + { + "epoch": 0.8989609752106684, + "grad_norm": 0.11288614571094513, + "learning_rate": 6.71861828268463e-06, + "loss": 0.0021, + "step": 54940 + }, + { + "epoch": 0.8991246011617443, + "grad_norm": 0.11000853031873703, + "learning_rate": 6.717277307923019e-06, + "loss": 0.0026, + "step": 54950 + }, + { + "epoch": 0.8992882271128201, + "grad_norm": 0.08903923630714417, + "learning_rate": 6.715936193106113e-06, + "loss": 0.0011, + "step": 54960 + }, + { + "epoch": 0.899451853063896, + "grad_norm": 0.06517019122838974, + "learning_rate": 6.71459493834329e-06, + "loss": 0.0024, + "step": 54970 + }, + { + "epoch": 0.8996154790149717, + "grad_norm": 0.16023556888103485, + "learning_rate": 6.713253543743936e-06, + "loss": 0.0019, + "step": 54980 + }, + { + "epoch": 0.8997791049660476, + "grad_norm": 0.027420731261372566, + "learning_rate": 6.711912009417454e-06, + "loss": 0.0027, + "step": 54990 + }, + { + "epoch": 0.8999427309171234, + "grad_norm": 0.10732796788215637, + "learning_rate": 6.710570335473252e-06, + "loss": 0.002, + "step": 55000 + }, + { + "epoch": 0.9001063568681993, + "grad_norm": 0.04320235177874565, + "learning_rate": 6.709228522020752e-06, + "loss": 0.0024, + "step": 55010 + }, + { + "epoch": 0.9002699828192752, + "grad_norm": 0.03951322287321091, + "learning_rate": 6.707886569169391e-06, + "loss": 0.0036, + "step": 55020 + }, + { + "epoch": 0.900433608770351, + "grad_norm": 0.05487572029232979, + "learning_rate": 6.7065444770286115e-06, + "loss": 0.0021, + "step": 55030 + }, + { + "epoch": 0.9005972347214268, + "grad_norm": 0.031976018100976944, + "learning_rate": 6.70520224570787e-06, + "loss": 0.0018, + "step": 55040 + }, + { + "epoch": 0.9007608606725026, + "grad_norm": 0.09151197224855423, + "learning_rate": 6.703859875316635e-06, + "loss": 0.0021, + "step": 55050 + }, + { + "epoch": 0.9009244866235785, + "grad_norm": 0.10370881855487823, + "learning_rate": 6.702517365964386e-06, + "loss": 0.0027, + "step": 55060 + }, + { + "epoch": 0.9010881125746544, + "grad_norm": 0.05246930569410324, + "learning_rate": 6.7011747177606135e-06, + "loss": 0.002, + "step": 55070 + }, + { + "epoch": 0.9012517385257302, + "grad_norm": 0.05457548052072525, + "learning_rate": 6.699831930814819e-06, + "loss": 0.0048, + "step": 55080 + }, + { + "epoch": 0.901415364476806, + "grad_norm": 0.04049844294786453, + "learning_rate": 6.698489005236515e-06, + "loss": 0.0022, + "step": 55090 + }, + { + "epoch": 0.9015789904278818, + "grad_norm": 0.060036513954401016, + "learning_rate": 6.697145941135227e-06, + "loss": 0.0023, + "step": 55100 + }, + { + "epoch": 0.9017426163789577, + "grad_norm": 0.042780354619026184, + "learning_rate": 6.69580273862049e-06, + "loss": 0.0021, + "step": 55110 + }, + { + "epoch": 0.9019062423300336, + "grad_norm": 0.10763350874185562, + "learning_rate": 6.694459397801851e-06, + "loss": 0.0031, + "step": 55120 + }, + { + "epoch": 0.9020698682811094, + "grad_norm": 0.06933766603469849, + "learning_rate": 6.693115918788869e-06, + "loss": 0.0024, + "step": 55130 + }, + { + "epoch": 0.9022334942321852, + "grad_norm": 0.0266238022595644, + "learning_rate": 6.691772301691113e-06, + "loss": 0.002, + "step": 55140 + }, + { + "epoch": 0.902397120183261, + "grad_norm": 0.06552164256572723, + "learning_rate": 6.6904285466181625e-06, + "loss": 0.0018, + "step": 55150 + }, + { + "epoch": 0.9025607461343369, + "grad_norm": 0.06240471452474594, + "learning_rate": 6.689084653679613e-06, + "loss": 0.0026, + "step": 55160 + }, + { + "epoch": 0.9027243720854128, + "grad_norm": 0.03741876035928726, + "learning_rate": 6.687740622985065e-06, + "loss": 0.0017, + "step": 55170 + }, + { + "epoch": 0.9028879980364886, + "grad_norm": 0.02448151633143425, + "learning_rate": 6.686396454644134e-06, + "loss": 0.002, + "step": 55180 + }, + { + "epoch": 0.9030516239875644, + "grad_norm": 0.022847145795822144, + "learning_rate": 6.6850521487664465e-06, + "loss": 0.0024, + "step": 55190 + }, + { + "epoch": 0.9032152499386402, + "grad_norm": 0.10724387317895889, + "learning_rate": 6.6837077054616376e-06, + "loss": 0.0023, + "step": 55200 + }, + { + "epoch": 0.9033788758897161, + "grad_norm": 0.055166326463222504, + "learning_rate": 6.682363124839357e-06, + "loss": 0.0015, + "step": 55210 + }, + { + "epoch": 0.903542501840792, + "grad_norm": 0.06569929420948029, + "learning_rate": 6.681018407009265e-06, + "loss": 0.0023, + "step": 55220 + }, + { + "epoch": 0.9037061277918678, + "grad_norm": 0.26221126317977905, + "learning_rate": 6.679673552081029e-06, + "loss": 0.003, + "step": 55230 + }, + { + "epoch": 0.9038697537429436, + "grad_norm": 0.04133813828229904, + "learning_rate": 6.678328560164336e-06, + "loss": 0.0016, + "step": 55240 + }, + { + "epoch": 0.9040333796940194, + "grad_norm": 0.05080864578485489, + "learning_rate": 6.676983431368873e-06, + "loss": 0.0026, + "step": 55250 + }, + { + "epoch": 0.9041970056450953, + "grad_norm": 0.1001199409365654, + "learning_rate": 6.675638165804348e-06, + "loss": 0.0031, + "step": 55260 + }, + { + "epoch": 0.9043606315961712, + "grad_norm": 0.09347344934940338, + "learning_rate": 6.674292763580474e-06, + "loss": 0.0024, + "step": 55270 + }, + { + "epoch": 0.904524257547247, + "grad_norm": 0.04194813594222069, + "learning_rate": 6.6729472248069804e-06, + "loss": 0.0016, + "step": 55280 + }, + { + "epoch": 0.9046878834983229, + "grad_norm": 0.06681618839502335, + "learning_rate": 6.671601549593603e-06, + "loss": 0.0012, + "step": 55290 + }, + { + "epoch": 0.9048515094493986, + "grad_norm": 0.08751630783081055, + "learning_rate": 6.670255738050089e-06, + "loss": 0.0015, + "step": 55300 + }, + { + "epoch": 0.9050151354004745, + "grad_norm": 0.09668885171413422, + "learning_rate": 6.6689097902862e-06, + "loss": 0.003, + "step": 55310 + }, + { + "epoch": 0.9051787613515504, + "grad_norm": 0.009523888118565083, + "learning_rate": 6.667563706411707e-06, + "loss": 0.0024, + "step": 55320 + }, + { + "epoch": 0.9053423873026262, + "grad_norm": 0.07577191293239594, + "learning_rate": 6.666217486536393e-06, + "loss": 0.0019, + "step": 55330 + }, + { + "epoch": 0.9055060132537021, + "grad_norm": 0.28219079971313477, + "learning_rate": 6.664871130770047e-06, + "loss": 0.0028, + "step": 55340 + }, + { + "epoch": 0.9056696392047778, + "grad_norm": 0.08842485398054123, + "learning_rate": 6.663524639222478e-06, + "loss": 0.0015, + "step": 55350 + }, + { + "epoch": 0.9058332651558537, + "grad_norm": 0.03614573925733566, + "learning_rate": 6.662178012003498e-06, + "loss": 0.0011, + "step": 55360 + }, + { + "epoch": 0.9059968911069296, + "grad_norm": 0.008397082798182964, + "learning_rate": 6.660831249222936e-06, + "loss": 0.0009, + "step": 55370 + }, + { + "epoch": 0.9061605170580054, + "grad_norm": 0.05506163835525513, + "learning_rate": 6.659484350990627e-06, + "loss": 0.0028, + "step": 55380 + }, + { + "epoch": 0.9063241430090813, + "grad_norm": 0.5430269241333008, + "learning_rate": 6.65813731741642e-06, + "loss": 0.0021, + "step": 55390 + }, + { + "epoch": 0.906487768960157, + "grad_norm": 0.10015229135751724, + "learning_rate": 6.656790148610176e-06, + "loss": 0.0028, + "step": 55400 + }, + { + "epoch": 0.9066513949112329, + "grad_norm": 0.04523171856999397, + "learning_rate": 6.655442844681763e-06, + "loss": 0.0017, + "step": 55410 + }, + { + "epoch": 0.9068150208623088, + "grad_norm": 0.16789762675762177, + "learning_rate": 6.654095405741067e-06, + "loss": 0.0011, + "step": 55420 + }, + { + "epoch": 0.9069786468133846, + "grad_norm": 0.03918357193470001, + "learning_rate": 6.652747831897975e-06, + "loss": 0.0043, + "step": 55430 + }, + { + "epoch": 0.9071422727644605, + "grad_norm": 0.09510177373886108, + "learning_rate": 6.651400123262392e-06, + "loss": 0.0022, + "step": 55440 + }, + { + "epoch": 0.9073058987155362, + "grad_norm": 0.08188805729150772, + "learning_rate": 6.6500522799442345e-06, + "loss": 0.0022, + "step": 55450 + }, + { + "epoch": 0.9074695246666121, + "grad_norm": 0.03387482091784477, + "learning_rate": 6.648704302053428e-06, + "loss": 0.0016, + "step": 55460 + }, + { + "epoch": 0.907633150617688, + "grad_norm": 0.03617274388670921, + "learning_rate": 6.647356189699907e-06, + "loss": 0.0015, + "step": 55470 + }, + { + "epoch": 0.9077967765687638, + "grad_norm": 0.11838515847921371, + "learning_rate": 6.646007942993619e-06, + "loss": 0.0018, + "step": 55480 + }, + { + "epoch": 0.9079604025198397, + "grad_norm": 0.0534416139125824, + "learning_rate": 6.644659562044526e-06, + "loss": 0.0015, + "step": 55490 + }, + { + "epoch": 0.9081240284709154, + "grad_norm": 0.009499077685177326, + "learning_rate": 6.643311046962593e-06, + "loss": 0.0022, + "step": 55500 + }, + { + "epoch": 0.9082876544219913, + "grad_norm": 0.047519501298666, + "learning_rate": 6.641962397857802e-06, + "loss": 0.0032, + "step": 55510 + }, + { + "epoch": 0.9084512803730672, + "grad_norm": 0.16934987902641296, + "learning_rate": 6.640613614840143e-06, + "loss": 0.0017, + "step": 55520 + }, + { + "epoch": 0.908614906324143, + "grad_norm": 0.13198237121105194, + "learning_rate": 6.639264698019622e-06, + "loss": 0.0027, + "step": 55530 + }, + { + "epoch": 0.9087785322752189, + "grad_norm": 0.21142633259296417, + "learning_rate": 6.637915647506248e-06, + "loss": 0.0032, + "step": 55540 + }, + { + "epoch": 0.9089421582262946, + "grad_norm": 0.04463517293334007, + "learning_rate": 6.636566463410047e-06, + "loss": 0.0016, + "step": 55550 + }, + { + "epoch": 0.9091057841773705, + "grad_norm": 0.06253872066736221, + "learning_rate": 6.635217145841053e-06, + "loss": 0.0022, + "step": 55560 + }, + { + "epoch": 0.9092694101284464, + "grad_norm": 0.057266756892204285, + "learning_rate": 6.633867694909313e-06, + "loss": 0.0014, + "step": 55570 + }, + { + "epoch": 0.9094330360795222, + "grad_norm": 0.10594163089990616, + "learning_rate": 6.632518110724882e-06, + "loss": 0.0019, + "step": 55580 + }, + { + "epoch": 0.9095966620305981, + "grad_norm": 0.027606617659330368, + "learning_rate": 6.631168393397827e-06, + "loss": 0.0014, + "step": 55590 + }, + { + "epoch": 0.9097602879816739, + "grad_norm": 0.05981414392590523, + "learning_rate": 6.629818543038229e-06, + "loss": 0.003, + "step": 55600 + }, + { + "epoch": 0.9099239139327497, + "grad_norm": 0.04365190863609314, + "learning_rate": 6.628468559756175e-06, + "loss": 0.0021, + "step": 55610 + }, + { + "epoch": 0.9100875398838256, + "grad_norm": 0.05930043011903763, + "learning_rate": 6.627118443661766e-06, + "loss": 0.0016, + "step": 55620 + }, + { + "epoch": 0.9102511658349014, + "grad_norm": 0.0032276480924338102, + "learning_rate": 6.6257681948651135e-06, + "loss": 0.0021, + "step": 55630 + }, + { + "epoch": 0.9104147917859773, + "grad_norm": 0.02424374409019947, + "learning_rate": 6.624417813476338e-06, + "loss": 0.0014, + "step": 55640 + }, + { + "epoch": 0.9105784177370531, + "grad_norm": 0.06509653478860855, + "learning_rate": 6.623067299605572e-06, + "loss": 0.002, + "step": 55650 + }, + { + "epoch": 0.9107420436881289, + "grad_norm": 0.019142763689160347, + "learning_rate": 6.621716653362959e-06, + "loss": 0.0022, + "step": 55660 + }, + { + "epoch": 0.9109056696392048, + "grad_norm": 0.02811865508556366, + "learning_rate": 6.620365874858654e-06, + "loss": 0.0017, + "step": 55670 + }, + { + "epoch": 0.9110692955902806, + "grad_norm": 0.07260759174823761, + "learning_rate": 6.6190149642028205e-06, + "loss": 0.0021, + "step": 55680 + }, + { + "epoch": 0.9112329215413565, + "grad_norm": 0.055015791207551956, + "learning_rate": 6.617663921505635e-06, + "loss": 0.0031, + "step": 55690 + }, + { + "epoch": 0.9113965474924323, + "grad_norm": 0.012044398114085197, + "learning_rate": 6.616312746877285e-06, + "loss": 0.0011, + "step": 55700 + }, + { + "epoch": 0.9115601734435081, + "grad_norm": 0.022058870643377304, + "learning_rate": 6.614961440427965e-06, + "loss": 0.0011, + "step": 55710 + }, + { + "epoch": 0.911723799394584, + "grad_norm": 0.023880165070295334, + "learning_rate": 6.613610002267885e-06, + "loss": 0.001, + "step": 55720 + }, + { + "epoch": 0.9118874253456598, + "grad_norm": 0.07042147219181061, + "learning_rate": 6.612258432507264e-06, + "loss": 0.0013, + "step": 55730 + }, + { + "epoch": 0.9120510512967357, + "grad_norm": 0.11518635600805283, + "learning_rate": 6.6109067312563304e-06, + "loss": 0.0027, + "step": 55740 + }, + { + "epoch": 0.9122146772478115, + "grad_norm": 0.033243972808122635, + "learning_rate": 6.609554898625324e-06, + "loss": 0.001, + "step": 55750 + }, + { + "epoch": 0.9123783031988874, + "grad_norm": 0.11612246185541153, + "learning_rate": 6.608202934724496e-06, + "loss": 0.0021, + "step": 55760 + }, + { + "epoch": 0.9125419291499632, + "grad_norm": 0.06864006817340851, + "learning_rate": 6.606850839664109e-06, + "loss": 0.001, + "step": 55770 + }, + { + "epoch": 0.912705555101039, + "grad_norm": 0.09203996509313583, + "learning_rate": 6.605498613554433e-06, + "loss": 0.0015, + "step": 55780 + }, + { + "epoch": 0.9128691810521149, + "grad_norm": 0.026516444981098175, + "learning_rate": 6.604146256505755e-06, + "loss": 0.0012, + "step": 55790 + }, + { + "epoch": 0.9130328070031907, + "grad_norm": 0.09326178580522537, + "learning_rate": 6.602793768628364e-06, + "loss": 0.0019, + "step": 55800 + }, + { + "epoch": 0.9131964329542666, + "grad_norm": 0.038885004818439484, + "learning_rate": 6.601441150032566e-06, + "loss": 0.0029, + "step": 55810 + }, + { + "epoch": 0.9133600589053424, + "grad_norm": 0.03841819614171982, + "learning_rate": 6.600088400828678e-06, + "loss": 0.0025, + "step": 55820 + }, + { + "epoch": 0.9135236848564182, + "grad_norm": 0.08481840044260025, + "learning_rate": 6.598735521127023e-06, + "loss": 0.0019, + "step": 55830 + }, + { + "epoch": 0.9136873108074941, + "grad_norm": 0.16896317899227142, + "learning_rate": 6.597382511037939e-06, + "loss": 0.0019, + "step": 55840 + }, + { + "epoch": 0.9138509367585699, + "grad_norm": 0.09156767278909683, + "learning_rate": 6.596029370671771e-06, + "loss": 0.0027, + "step": 55850 + }, + { + "epoch": 0.9140145627096458, + "grad_norm": 0.05977252125740051, + "learning_rate": 6.594676100138879e-06, + "loss": 0.0021, + "step": 55860 + }, + { + "epoch": 0.9141781886607215, + "grad_norm": 0.3450954258441925, + "learning_rate": 6.593322699549629e-06, + "loss": 0.0017, + "step": 55870 + }, + { + "epoch": 0.9143418146117974, + "grad_norm": 0.06577891856431961, + "learning_rate": 6.591969169014401e-06, + "loss": 0.0026, + "step": 55880 + }, + { + "epoch": 0.9145054405628733, + "grad_norm": 0.11398031562566757, + "learning_rate": 6.590615508643584e-06, + "loss": 0.0018, + "step": 55890 + }, + { + "epoch": 0.9146690665139491, + "grad_norm": 0.04285018518567085, + "learning_rate": 6.589261718547577e-06, + "loss": 0.0018, + "step": 55900 + }, + { + "epoch": 0.914832692465025, + "grad_norm": 0.01052926480770111, + "learning_rate": 6.587907798836792e-06, + "loss": 0.0023, + "step": 55910 + }, + { + "epoch": 0.9149963184161007, + "grad_norm": 0.06749926507472992, + "learning_rate": 6.586553749621651e-06, + "loss": 0.0017, + "step": 55920 + }, + { + "epoch": 0.9151599443671766, + "grad_norm": 0.051221270114183426, + "learning_rate": 6.585199571012581e-06, + "loss": 0.0015, + "step": 55930 + }, + { + "epoch": 0.9153235703182525, + "grad_norm": 0.06960927695035934, + "learning_rate": 6.5838452631200304e-06, + "loss": 0.001, + "step": 55940 + }, + { + "epoch": 0.9154871962693283, + "grad_norm": 0.05837035924196243, + "learning_rate": 6.582490826054446e-06, + "loss": 0.0017, + "step": 55950 + }, + { + "epoch": 0.9156508222204042, + "grad_norm": 0.05886893719434738, + "learning_rate": 6.5811362599262965e-06, + "loss": 0.0036, + "step": 55960 + }, + { + "epoch": 0.91581444817148, + "grad_norm": 0.11389671266078949, + "learning_rate": 6.579781564846052e-06, + "loss": 0.0027, + "step": 55970 + }, + { + "epoch": 0.9159780741225558, + "grad_norm": 0.06820515543222427, + "learning_rate": 6.578426740924197e-06, + "loss": 0.0034, + "step": 55980 + }, + { + "epoch": 0.9161417000736317, + "grad_norm": 0.21691784262657166, + "learning_rate": 6.577071788271229e-06, + "loss": 0.0027, + "step": 55990 + }, + { + "epoch": 0.9163053260247075, + "grad_norm": 0.11636079102754593, + "learning_rate": 6.5757167069976505e-06, + "loss": 0.0014, + "step": 56000 + }, + { + "epoch": 0.9164689519757834, + "grad_norm": 0.24772250652313232, + "learning_rate": 6.574361497213978e-06, + "loss": 0.003, + "step": 56010 + }, + { + "epoch": 0.9166325779268591, + "grad_norm": 0.02042141556739807, + "learning_rate": 6.573006159030739e-06, + "loss": 0.0027, + "step": 56020 + }, + { + "epoch": 0.916796203877935, + "grad_norm": 0.0673416405916214, + "learning_rate": 6.571650692558469e-06, + "loss": 0.0015, + "step": 56030 + }, + { + "epoch": 0.9169598298290109, + "grad_norm": 0.08954409509897232, + "learning_rate": 6.570295097907718e-06, + "loss": 0.0041, + "step": 56040 + }, + { + "epoch": 0.9171234557800867, + "grad_norm": 0.09719007462263107, + "learning_rate": 6.568939375189038e-06, + "loss": 0.0023, + "step": 56050 + }, + { + "epoch": 0.9172870817311626, + "grad_norm": 0.09046126902103424, + "learning_rate": 6.5675835245130025e-06, + "loss": 0.0024, + "step": 56060 + }, + { + "epoch": 0.9174507076822384, + "grad_norm": 0.06140618771314621, + "learning_rate": 6.566227545990189e-06, + "loss": 0.0026, + "step": 56070 + }, + { + "epoch": 0.9176143336333142, + "grad_norm": 0.0764274075627327, + "learning_rate": 6.564871439731184e-06, + "loss": 0.0018, + "step": 56080 + }, + { + "epoch": 0.9177779595843901, + "grad_norm": 0.02905842289328575, + "learning_rate": 6.563515205846589e-06, + "loss": 0.0021, + "step": 56090 + }, + { + "epoch": 0.9179415855354659, + "grad_norm": 0.04256400838494301, + "learning_rate": 6.562158844447013e-06, + "loss": 0.0017, + "step": 56100 + }, + { + "epoch": 0.9181052114865418, + "grad_norm": 0.08738549798727036, + "learning_rate": 6.560802355643077e-06, + "loss": 0.0033, + "step": 56110 + }, + { + "epoch": 0.9182688374376176, + "grad_norm": 0.03181087225675583, + "learning_rate": 6.5594457395454115e-06, + "loss": 0.0016, + "step": 56120 + }, + { + "epoch": 0.9184324633886934, + "grad_norm": 0.027590539306402206, + "learning_rate": 6.558088996264656e-06, + "loss": 0.0031, + "step": 56130 + }, + { + "epoch": 0.9185960893397693, + "grad_norm": 0.07463917881250381, + "learning_rate": 6.556732125911463e-06, + "loss": 0.0017, + "step": 56140 + }, + { + "epoch": 0.9187597152908451, + "grad_norm": 0.013279673643410206, + "learning_rate": 6.555375128596495e-06, + "loss": 0.0012, + "step": 56150 + }, + { + "epoch": 0.918923341241921, + "grad_norm": 0.09867870807647705, + "learning_rate": 6.554018004430424e-06, + "loss": 0.0017, + "step": 56160 + }, + { + "epoch": 0.9190869671929968, + "grad_norm": 0.22824370861053467, + "learning_rate": 6.552660753523931e-06, + "loss": 0.0014, + "step": 56170 + }, + { + "epoch": 0.9192505931440726, + "grad_norm": 0.033258065581321716, + "learning_rate": 6.55130337598771e-06, + "loss": 0.0025, + "step": 56180 + }, + { + "epoch": 0.9194142190951485, + "grad_norm": 0.033072132617235184, + "learning_rate": 6.549945871932463e-06, + "loss": 0.001, + "step": 56190 + }, + { + "epoch": 0.9195778450462243, + "grad_norm": 0.05221541225910187, + "learning_rate": 6.548588241468904e-06, + "loss": 0.0015, + "step": 56200 + }, + { + "epoch": 0.9197414709973002, + "grad_norm": 0.024098750203847885, + "learning_rate": 6.547230484707758e-06, + "loss": 0.0022, + "step": 56210 + }, + { + "epoch": 0.919905096948376, + "grad_norm": 0.04554083198308945, + "learning_rate": 6.545872601759756e-06, + "loss": 0.003, + "step": 56220 + }, + { + "epoch": 0.9200687228994519, + "grad_norm": 0.19266685843467712, + "learning_rate": 6.544514592735645e-06, + "loss": 0.0017, + "step": 56230 + }, + { + "epoch": 0.9202323488505277, + "grad_norm": 0.02726871706545353, + "learning_rate": 6.5431564577461795e-06, + "loss": 0.0017, + "step": 56240 + }, + { + "epoch": 0.9203959748016035, + "grad_norm": 0.05316530168056488, + "learning_rate": 6.541798196902123e-06, + "loss": 0.0018, + "step": 56250 + }, + { + "epoch": 0.9205596007526794, + "grad_norm": 0.03866003081202507, + "learning_rate": 6.5404398103142495e-06, + "loss": 0.0013, + "step": 56260 + }, + { + "epoch": 0.9207232267037552, + "grad_norm": 0.2395082712173462, + "learning_rate": 6.5390812980933485e-06, + "loss": 0.0031, + "step": 56270 + }, + { + "epoch": 0.9208868526548311, + "grad_norm": 0.038763076066970825, + "learning_rate": 6.537722660350212e-06, + "loss": 0.002, + "step": 56280 + }, + { + "epoch": 0.9210504786059069, + "grad_norm": 0.023509008809924126, + "learning_rate": 6.536363897195648e-06, + "loss": 0.0018, + "step": 56290 + }, + { + "epoch": 0.9212141045569827, + "grad_norm": 0.10502973943948746, + "learning_rate": 6.535005008740472e-06, + "loss": 0.0019, + "step": 56300 + }, + { + "epoch": 0.9213777305080586, + "grad_norm": 0.05079428851604462, + "learning_rate": 6.533645995095508e-06, + "loss": 0.0015, + "step": 56310 + }, + { + "epoch": 0.9215413564591344, + "grad_norm": 0.09697245061397552, + "learning_rate": 6.532286856371596e-06, + "loss": 0.0026, + "step": 56320 + }, + { + "epoch": 0.9217049824102103, + "grad_norm": 0.04611137881875038, + "learning_rate": 6.530927592679581e-06, + "loss": 0.0011, + "step": 56330 + }, + { + "epoch": 0.9218686083612861, + "grad_norm": 0.11595282703638077, + "learning_rate": 6.52956820413032e-06, + "loss": 0.0019, + "step": 56340 + }, + { + "epoch": 0.9220322343123619, + "grad_norm": 0.20963560044765472, + "learning_rate": 6.528208690834681e-06, + "loss": 0.0019, + "step": 56350 + }, + { + "epoch": 0.9221958602634378, + "grad_norm": 0.14181086421012878, + "learning_rate": 6.52684905290354e-06, + "loss": 0.002, + "step": 56360 + }, + { + "epoch": 0.9223594862145136, + "grad_norm": 0.019518647342920303, + "learning_rate": 6.525489290447785e-06, + "loss": 0.003, + "step": 56370 + }, + { + "epoch": 0.9225231121655895, + "grad_norm": 0.022483259439468384, + "learning_rate": 6.524129403578314e-06, + "loss": 0.0023, + "step": 56380 + }, + { + "epoch": 0.9226867381166654, + "grad_norm": 0.053641557693481445, + "learning_rate": 6.522769392406035e-06, + "loss": 0.0021, + "step": 56390 + }, + { + "epoch": 0.9228503640677411, + "grad_norm": 0.07724954187870026, + "learning_rate": 6.521409257041864e-06, + "loss": 0.0021, + "step": 56400 + }, + { + "epoch": 0.923013990018817, + "grad_norm": 0.09131883084774017, + "learning_rate": 6.520048997596732e-06, + "loss": 0.0014, + "step": 56410 + }, + { + "epoch": 0.9231776159698928, + "grad_norm": 0.0458979569375515, + "learning_rate": 6.518688614181575e-06, + "loss": 0.0015, + "step": 56420 + }, + { + "epoch": 0.9233412419209687, + "grad_norm": 0.06478177011013031, + "learning_rate": 6.517328106907339e-06, + "loss": 0.0011, + "step": 56430 + }, + { + "epoch": 0.9235048678720446, + "grad_norm": 0.1232982873916626, + "learning_rate": 6.5159674758849875e-06, + "loss": 0.0026, + "step": 56440 + }, + { + "epoch": 0.9236684938231203, + "grad_norm": 0.054083600640296936, + "learning_rate": 6.514606721225485e-06, + "loss": 0.0012, + "step": 56450 + }, + { + "epoch": 0.9238321197741962, + "grad_norm": 0.06533703953027725, + "learning_rate": 6.513245843039812e-06, + "loss": 0.0016, + "step": 56460 + }, + { + "epoch": 0.923995745725272, + "grad_norm": 0.019338658079504967, + "learning_rate": 6.511884841438958e-06, + "loss": 0.002, + "step": 56470 + }, + { + "epoch": 0.9241593716763479, + "grad_norm": 0.08700487017631531, + "learning_rate": 6.5105237165339186e-06, + "loss": 0.0031, + "step": 56480 + }, + { + "epoch": 0.9243229976274238, + "grad_norm": 0.0781564712524414, + "learning_rate": 6.509162468435704e-06, + "loss": 0.0028, + "step": 56490 + }, + { + "epoch": 0.9244866235784995, + "grad_norm": 0.0034663050901144743, + "learning_rate": 6.507801097255334e-06, + "loss": 0.002, + "step": 56500 + }, + { + "epoch": 0.9246502495295754, + "grad_norm": 0.007611203473061323, + "learning_rate": 6.506439603103836e-06, + "loss": 0.0014, + "step": 56510 + }, + { + "epoch": 0.9248138754806512, + "grad_norm": 0.06650002300739288, + "learning_rate": 6.50507798609225e-06, + "loss": 0.0016, + "step": 56520 + }, + { + "epoch": 0.9249775014317271, + "grad_norm": 0.07986923307180405, + "learning_rate": 6.503716246331623e-06, + "loss": 0.002, + "step": 56530 + }, + { + "epoch": 0.925141127382803, + "grad_norm": 0.14060962200164795, + "learning_rate": 6.502354383933015e-06, + "loss": 0.0017, + "step": 56540 + }, + { + "epoch": 0.9253047533338787, + "grad_norm": 0.012925864197313786, + "learning_rate": 6.5009923990074956e-06, + "loss": 0.0025, + "step": 56550 + }, + { + "epoch": 0.9254683792849546, + "grad_norm": 0.02445130981504917, + "learning_rate": 6.499630291666143e-06, + "loss": 0.0013, + "step": 56560 + }, + { + "epoch": 0.9256320052360304, + "grad_norm": 0.1041211262345314, + "learning_rate": 6.4982680620200455e-06, + "loss": 0.0021, + "step": 56570 + }, + { + "epoch": 0.9257956311871063, + "grad_norm": 0.12634573876857758, + "learning_rate": 6.496905710180304e-06, + "loss": 0.0026, + "step": 56580 + }, + { + "epoch": 0.9259592571381822, + "grad_norm": 0.05812188982963562, + "learning_rate": 6.495543236258024e-06, + "loss": 0.0024, + "step": 56590 + }, + { + "epoch": 0.9261228830892579, + "grad_norm": 0.24218805134296417, + "learning_rate": 6.494180640364326e-06, + "loss": 0.0032, + "step": 56600 + }, + { + "epoch": 0.9262865090403338, + "grad_norm": 0.036598656326532364, + "learning_rate": 6.492817922610339e-06, + "loss": 0.0028, + "step": 56610 + }, + { + "epoch": 0.9264501349914096, + "grad_norm": 0.12396228313446045, + "learning_rate": 6.491455083107201e-06, + "loss": 0.002, + "step": 56620 + }, + { + "epoch": 0.9266137609424855, + "grad_norm": 0.19481991231441498, + "learning_rate": 6.490092121966061e-06, + "loss": 0.0017, + "step": 56630 + }, + { + "epoch": 0.9267773868935614, + "grad_norm": 0.0038094609044492245, + "learning_rate": 6.488729039298077e-06, + "loss": 0.0025, + "step": 56640 + }, + { + "epoch": 0.9269410128446371, + "grad_norm": 0.08035171031951904, + "learning_rate": 6.4873658352144185e-06, + "loss": 0.003, + "step": 56650 + }, + { + "epoch": 0.927104638795713, + "grad_norm": 0.08920537680387497, + "learning_rate": 6.486002509826261e-06, + "loss": 0.0016, + "step": 56660 + }, + { + "epoch": 0.9272682647467888, + "grad_norm": 0.04345780983567238, + "learning_rate": 6.484639063244797e-06, + "loss": 0.0012, + "step": 56670 + }, + { + "epoch": 0.9274318906978647, + "grad_norm": 0.05059542506933212, + "learning_rate": 6.4832754955812204e-06, + "loss": 0.002, + "step": 56680 + }, + { + "epoch": 0.9275955166489406, + "grad_norm": 0.020313138142228127, + "learning_rate": 6.481911806946743e-06, + "loss": 0.0012, + "step": 56690 + }, + { + "epoch": 0.9277591426000164, + "grad_norm": 0.05528176575899124, + "learning_rate": 6.4805479974525786e-06, + "loss": 0.0019, + "step": 56700 + }, + { + "epoch": 0.9279227685510922, + "grad_norm": 0.06339520961046219, + "learning_rate": 6.479184067209958e-06, + "loss": 0.0019, + "step": 56710 + }, + { + "epoch": 0.928086394502168, + "grad_norm": 0.08688001334667206, + "learning_rate": 6.477820016330117e-06, + "loss": 0.0029, + "step": 56720 + }, + { + "epoch": 0.9282500204532439, + "grad_norm": 0.05946914106607437, + "learning_rate": 6.476455844924303e-06, + "loss": 0.0033, + "step": 56730 + }, + { + "epoch": 0.9284136464043197, + "grad_norm": 0.11161893606185913, + "learning_rate": 6.475091553103774e-06, + "loss": 0.0017, + "step": 56740 + }, + { + "epoch": 0.9285772723553956, + "grad_norm": 0.16973428428173065, + "learning_rate": 6.473727140979798e-06, + "loss": 0.0019, + "step": 56750 + }, + { + "epoch": 0.9287408983064714, + "grad_norm": 0.046578556299209595, + "learning_rate": 6.4723626086636486e-06, + "loss": 0.002, + "step": 56760 + }, + { + "epoch": 0.9289045242575472, + "grad_norm": 0.13174575567245483, + "learning_rate": 6.470997956266614e-06, + "loss": 0.0032, + "step": 56770 + }, + { + "epoch": 0.9290681502086231, + "grad_norm": 0.03182876110076904, + "learning_rate": 6.469633183899992e-06, + "loss": 0.0013, + "step": 56780 + }, + { + "epoch": 0.9292317761596989, + "grad_norm": 0.028548920527100563, + "learning_rate": 6.468268291675086e-06, + "loss": 0.0021, + "step": 56790 + }, + { + "epoch": 0.9293954021107748, + "grad_norm": 0.13532426953315735, + "learning_rate": 6.466903279703215e-06, + "loss": 0.0019, + "step": 56800 + }, + { + "epoch": 0.9295590280618506, + "grad_norm": 0.11298929154872894, + "learning_rate": 6.465538148095704e-06, + "loss": 0.0023, + "step": 56810 + }, + { + "epoch": 0.9297226540129264, + "grad_norm": 0.12641289830207825, + "learning_rate": 6.464172896963886e-06, + "loss": 0.0019, + "step": 56820 + }, + { + "epoch": 0.9298862799640023, + "grad_norm": 0.0678074061870575, + "learning_rate": 6.462807526419109e-06, + "loss": 0.0018, + "step": 56830 + }, + { + "epoch": 0.9300499059150781, + "grad_norm": 0.1123092770576477, + "learning_rate": 6.461442036572727e-06, + "loss": 0.0023, + "step": 56840 + }, + { + "epoch": 0.930213531866154, + "grad_norm": 0.05744916945695877, + "learning_rate": 6.460076427536105e-06, + "loss": 0.0018, + "step": 56850 + }, + { + "epoch": 0.9303771578172298, + "grad_norm": 0.12219595909118652, + "learning_rate": 6.4587106994206176e-06, + "loss": 0.0029, + "step": 56860 + }, + { + "epoch": 0.9305407837683056, + "grad_norm": 0.08028897643089294, + "learning_rate": 6.457344852337648e-06, + "loss": 0.0013, + "step": 56870 + }, + { + "epoch": 0.9307044097193815, + "grad_norm": 0.007140877656638622, + "learning_rate": 6.45597888639859e-06, + "loss": 0.0015, + "step": 56880 + }, + { + "epoch": 0.9308680356704573, + "grad_norm": 0.05165205895900726, + "learning_rate": 6.454612801714848e-06, + "loss": 0.0023, + "step": 56890 + }, + { + "epoch": 0.9310316616215332, + "grad_norm": 0.027849525213241577, + "learning_rate": 6.4532465983978354e-06, + "loss": 0.0008, + "step": 56900 + }, + { + "epoch": 0.931195287572609, + "grad_norm": 0.032669514417648315, + "learning_rate": 6.451880276558974e-06, + "loss": 0.0029, + "step": 56910 + }, + { + "epoch": 0.9313589135236848, + "grad_norm": 0.04178975522518158, + "learning_rate": 6.450513836309697e-06, + "loss": 0.0028, + "step": 56920 + }, + { + "epoch": 0.9315225394747607, + "grad_norm": 0.07078851014375687, + "learning_rate": 6.449147277761447e-06, + "loss": 0.0014, + "step": 56930 + }, + { + "epoch": 0.9316861654258365, + "grad_norm": 0.15249699354171753, + "learning_rate": 6.447780601025676e-06, + "loss": 0.0025, + "step": 56940 + }, + { + "epoch": 0.9318497913769124, + "grad_norm": 0.13929148018360138, + "learning_rate": 6.446413806213845e-06, + "loss": 0.0019, + "step": 56950 + }, + { + "epoch": 0.9320134173279883, + "grad_norm": 0.08149644732475281, + "learning_rate": 6.445046893437423e-06, + "loss": 0.002, + "step": 56960 + }, + { + "epoch": 0.932177043279064, + "grad_norm": 0.036492589861154556, + "learning_rate": 6.443679862807895e-06, + "loss": 0.0021, + "step": 56970 + }, + { + "epoch": 0.9323406692301399, + "grad_norm": 0.06172330677509308, + "learning_rate": 6.442312714436748e-06, + "loss": 0.0008, + "step": 56980 + }, + { + "epoch": 0.9325042951812157, + "grad_norm": 0.03780542314052582, + "learning_rate": 6.4409454484354835e-06, + "loss": 0.0023, + "step": 56990 + }, + { + "epoch": 0.9326679211322916, + "grad_norm": 0.0671754777431488, + "learning_rate": 6.4395780649156115e-06, + "loss": 0.0021, + "step": 57000 + }, + { + "epoch": 0.9328315470833675, + "grad_norm": 0.02179705537855625, + "learning_rate": 6.438210563988649e-06, + "loss": 0.0025, + "step": 57010 + }, + { + "epoch": 0.9329951730344432, + "grad_norm": 0.03471015393733978, + "learning_rate": 6.436842945766127e-06, + "loss": 0.0019, + "step": 57020 + }, + { + "epoch": 0.9331587989855191, + "grad_norm": 0.2868771255016327, + "learning_rate": 6.435475210359583e-06, + "loss": 0.0019, + "step": 57030 + }, + { + "epoch": 0.9333224249365949, + "grad_norm": 0.006691084709018469, + "learning_rate": 6.434107357880565e-06, + "loss": 0.0011, + "step": 57040 + }, + { + "epoch": 0.9334860508876708, + "grad_norm": 0.0503176786005497, + "learning_rate": 6.4327393884406295e-06, + "loss": 0.0036, + "step": 57050 + }, + { + "epoch": 0.9336496768387467, + "grad_norm": 0.10211300849914551, + "learning_rate": 6.431371302151344e-06, + "loss": 0.0034, + "step": 57060 + }, + { + "epoch": 0.9338133027898224, + "grad_norm": 0.18614116311073303, + "learning_rate": 6.430003099124285e-06, + "loss": 0.0021, + "step": 57070 + }, + { + "epoch": 0.9339769287408983, + "grad_norm": 0.12661704421043396, + "learning_rate": 6.428634779471039e-06, + "loss": 0.002, + "step": 57080 + }, + { + "epoch": 0.9341405546919741, + "grad_norm": 0.0122816301882267, + "learning_rate": 6.4272663433032e-06, + "loss": 0.0025, + "step": 57090 + }, + { + "epoch": 0.93430418064305, + "grad_norm": 0.0325094610452652, + "learning_rate": 6.4258977907323764e-06, + "loss": 0.0018, + "step": 57100 + }, + { + "epoch": 0.9344678065941259, + "grad_norm": 0.0916358008980751, + "learning_rate": 6.424529121870179e-06, + "loss": 0.0028, + "step": 57110 + }, + { + "epoch": 0.9346314325452016, + "grad_norm": 0.05081067234277725, + "learning_rate": 6.423160336828232e-06, + "loss": 0.0026, + "step": 57120 + }, + { + "epoch": 0.9347950584962775, + "grad_norm": 0.05515988543629646, + "learning_rate": 6.421791435718171e-06, + "loss": 0.0022, + "step": 57130 + }, + { + "epoch": 0.9349586844473533, + "grad_norm": 0.03500219061970711, + "learning_rate": 6.420422418651637e-06, + "loss": 0.0016, + "step": 57140 + }, + { + "epoch": 0.9351223103984292, + "grad_norm": 0.10497685521841049, + "learning_rate": 6.419053285740285e-06, + "loss": 0.0019, + "step": 57150 + }, + { + "epoch": 0.9352859363495051, + "grad_norm": 0.14834196865558624, + "learning_rate": 6.417684037095774e-06, + "loss": 0.0026, + "step": 57160 + }, + { + "epoch": 0.9354495623005809, + "grad_norm": 0.0800432413816452, + "learning_rate": 6.416314672829775e-06, + "loss": 0.0016, + "step": 57170 + }, + { + "epoch": 0.9356131882516567, + "grad_norm": 0.021429333835840225, + "learning_rate": 6.414945193053972e-06, + "loss": 0.0014, + "step": 57180 + }, + { + "epoch": 0.9357768142027325, + "grad_norm": 0.08037056773900986, + "learning_rate": 6.413575597880052e-06, + "loss": 0.0013, + "step": 57190 + }, + { + "epoch": 0.9359404401538084, + "grad_norm": 0.0472366102039814, + "learning_rate": 6.412205887419716e-06, + "loss": 0.0017, + "step": 57200 + }, + { + "epoch": 0.9361040661048843, + "grad_norm": 0.019507095217704773, + "learning_rate": 6.4108360617846735e-06, + "loss": 0.002, + "step": 57210 + }, + { + "epoch": 0.93626769205596, + "grad_norm": 0.019027838483452797, + "learning_rate": 6.40946612108664e-06, + "loss": 0.0015, + "step": 57220 + }, + { + "epoch": 0.9364313180070359, + "grad_norm": 0.06351927667856216, + "learning_rate": 6.408096065437346e-06, + "loss": 0.0015, + "step": 57230 + }, + { + "epoch": 0.9365949439581117, + "grad_norm": 0.05936245620250702, + "learning_rate": 6.406725894948528e-06, + "loss": 0.0019, + "step": 57240 + }, + { + "epoch": 0.9367585699091876, + "grad_norm": 0.044367656111717224, + "learning_rate": 6.405355609731931e-06, + "loss": 0.002, + "step": 57250 + }, + { + "epoch": 0.9369221958602635, + "grad_norm": 0.09417359530925751, + "learning_rate": 6.403985209899313e-06, + "loss": 0.0027, + "step": 57260 + }, + { + "epoch": 0.9370858218113393, + "grad_norm": 0.2645261585712433, + "learning_rate": 6.402614695562437e-06, + "loss": 0.0037, + "step": 57270 + }, + { + "epoch": 0.9372494477624151, + "grad_norm": 0.13693548738956451, + "learning_rate": 6.40124406683308e-06, + "loss": 0.0019, + "step": 57280 + }, + { + "epoch": 0.9374130737134909, + "grad_norm": 0.11871176958084106, + "learning_rate": 6.399873323823022e-06, + "loss": 0.0014, + "step": 57290 + }, + { + "epoch": 0.9375766996645668, + "grad_norm": 0.17951926589012146, + "learning_rate": 6.398502466644061e-06, + "loss": 0.0015, + "step": 57300 + }, + { + "epoch": 0.9377403256156427, + "grad_norm": 0.0491395965218544, + "learning_rate": 6.397131495407997e-06, + "loss": 0.0024, + "step": 57310 + }, + { + "epoch": 0.9379039515667185, + "grad_norm": 0.2597217559814453, + "learning_rate": 6.395760410226641e-06, + "loss": 0.002, + "step": 57320 + }, + { + "epoch": 0.9380675775177943, + "grad_norm": 0.13409483432769775, + "learning_rate": 6.394389211211813e-06, + "loss": 0.0014, + "step": 57330 + }, + { + "epoch": 0.9382312034688701, + "grad_norm": 0.06022542715072632, + "learning_rate": 6.393017898475346e-06, + "loss": 0.0028, + "step": 57340 + }, + { + "epoch": 0.938394829419946, + "grad_norm": 0.03732301667332649, + "learning_rate": 6.39164647212908e-06, + "loss": 0.0018, + "step": 57350 + }, + { + "epoch": 0.9385584553710219, + "grad_norm": 0.026547512039542198, + "learning_rate": 6.390274932284861e-06, + "loss": 0.002, + "step": 57360 + }, + { + "epoch": 0.9387220813220977, + "grad_norm": 0.10671032965183258, + "learning_rate": 6.38890327905455e-06, + "loss": 0.0016, + "step": 57370 + }, + { + "epoch": 0.9388857072731736, + "grad_norm": 0.08442399650812149, + "learning_rate": 6.387531512550013e-06, + "loss": 0.0024, + "step": 57380 + }, + { + "epoch": 0.9390493332242493, + "grad_norm": 0.008403069339692593, + "learning_rate": 6.386159632883125e-06, + "loss": 0.0016, + "step": 57390 + }, + { + "epoch": 0.9392129591753252, + "grad_norm": 0.03750469908118248, + "learning_rate": 6.384787640165775e-06, + "loss": 0.0016, + "step": 57400 + }, + { + "epoch": 0.9393765851264011, + "grad_norm": 0.09813922643661499, + "learning_rate": 6.383415534509856e-06, + "loss": 0.002, + "step": 57410 + }, + { + "epoch": 0.9395402110774769, + "grad_norm": 0.2458689659833908, + "learning_rate": 6.382043316027272e-06, + "loss": 0.0019, + "step": 57420 + }, + { + "epoch": 0.9397038370285528, + "grad_norm": 0.14486080408096313, + "learning_rate": 6.380670984829939e-06, + "loss": 0.0021, + "step": 57430 + }, + { + "epoch": 0.9398674629796285, + "grad_norm": 0.14126110076904297, + "learning_rate": 6.379298541029777e-06, + "loss": 0.0024, + "step": 57440 + }, + { + "epoch": 0.9400310889307044, + "grad_norm": 0.09587261080741882, + "learning_rate": 6.377925984738718e-06, + "loss": 0.0028, + "step": 57450 + }, + { + "epoch": 0.9401947148817803, + "grad_norm": 0.06749685853719711, + "learning_rate": 6.376553316068705e-06, + "loss": 0.0018, + "step": 57460 + }, + { + "epoch": 0.9403583408328561, + "grad_norm": 0.0586259700357914, + "learning_rate": 6.375180535131686e-06, + "loss": 0.0053, + "step": 57470 + }, + { + "epoch": 0.940521966783932, + "grad_norm": 0.1960633546113968, + "learning_rate": 6.373807642039622e-06, + "loss": 0.0033, + "step": 57480 + }, + { + "epoch": 0.9406855927350077, + "grad_norm": 0.05132591351866722, + "learning_rate": 6.372434636904481e-06, + "loss": 0.0012, + "step": 57490 + }, + { + "epoch": 0.9408492186860836, + "grad_norm": 0.05858321860432625, + "learning_rate": 6.37106151983824e-06, + "loss": 0.0022, + "step": 57500 + }, + { + "epoch": 0.9410128446371595, + "grad_norm": 0.1998930275440216, + "learning_rate": 6.3696882909528865e-06, + "loss": 0.0016, + "step": 57510 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.05957844853401184, + "learning_rate": 6.368314950360416e-06, + "loss": 0.0019, + "step": 57520 + }, + { + "epoch": 0.9413400965393112, + "grad_norm": 0.06080023944377899, + "learning_rate": 6.366941498172833e-06, + "loss": 0.0011, + "step": 57530 + }, + { + "epoch": 0.9415037224903869, + "grad_norm": 0.04228433221578598, + "learning_rate": 6.365567934502153e-06, + "loss": 0.0038, + "step": 57540 + }, + { + "epoch": 0.9416673484414628, + "grad_norm": 0.03258278965950012, + "learning_rate": 6.364194259460397e-06, + "loss": 0.0021, + "step": 57550 + }, + { + "epoch": 0.9418309743925387, + "grad_norm": 0.016062675043940544, + "learning_rate": 6.3628204731596e-06, + "loss": 0.0023, + "step": 57560 + }, + { + "epoch": 0.9419946003436145, + "grad_norm": 0.12362527847290039, + "learning_rate": 6.361446575711801e-06, + "loss": 0.0025, + "step": 57570 + }, + { + "epoch": 0.9421582262946904, + "grad_norm": 0.0918710008263588, + "learning_rate": 6.360072567229052e-06, + "loss": 0.0023, + "step": 57580 + }, + { + "epoch": 0.9423218522457661, + "grad_norm": 0.031730104237794876, + "learning_rate": 6.358698447823411e-06, + "loss": 0.0016, + "step": 57590 + }, + { + "epoch": 0.942485478196842, + "grad_norm": 0.04843221232295036, + "learning_rate": 6.357324217606948e-06, + "loss": 0.002, + "step": 57600 + }, + { + "epoch": 0.9426491041479178, + "grad_norm": 0.031060557812452316, + "learning_rate": 6.355949876691739e-06, + "loss": 0.0007, + "step": 57610 + }, + { + "epoch": 0.9428127300989937, + "grad_norm": 0.1420125663280487, + "learning_rate": 6.354575425189873e-06, + "loss": 0.0023, + "step": 57620 + }, + { + "epoch": 0.9429763560500696, + "grad_norm": 0.04344609007239342, + "learning_rate": 6.353200863213442e-06, + "loss": 0.0022, + "step": 57630 + }, + { + "epoch": 0.9431399820011453, + "grad_norm": 0.11959046870470047, + "learning_rate": 6.351826190874554e-06, + "loss": 0.0018, + "step": 57640 + }, + { + "epoch": 0.9433036079522212, + "grad_norm": 0.05591941624879837, + "learning_rate": 6.350451408285321e-06, + "loss": 0.0019, + "step": 57650 + }, + { + "epoch": 0.943467233903297, + "grad_norm": 0.047392845153808594, + "learning_rate": 6.349076515557865e-06, + "loss": 0.0023, + "step": 57660 + }, + { + "epoch": 0.9436308598543729, + "grad_norm": 0.16122494637966156, + "learning_rate": 6.3477015128043186e-06, + "loss": 0.0019, + "step": 57670 + }, + { + "epoch": 0.9437944858054488, + "grad_norm": 0.0406617671251297, + "learning_rate": 6.346326400136822e-06, + "loss": 0.0096, + "step": 57680 + }, + { + "epoch": 0.9439581117565246, + "grad_norm": 0.03396273031830788, + "learning_rate": 6.344951177667525e-06, + "loss": 0.0017, + "step": 57690 + }, + { + "epoch": 0.9441217377076004, + "grad_norm": 0.05563920736312866, + "learning_rate": 6.3435758455085875e-06, + "loss": 0.0016, + "step": 57700 + }, + { + "epoch": 0.9442853636586762, + "grad_norm": 0.11514732241630554, + "learning_rate": 6.342200403772173e-06, + "loss": 0.0023, + "step": 57710 + }, + { + "epoch": 0.9444489896097521, + "grad_norm": 0.03952418640255928, + "learning_rate": 6.340824852570461e-06, + "loss": 0.0036, + "step": 57720 + }, + { + "epoch": 0.944612615560828, + "grad_norm": 0.048269156366586685, + "learning_rate": 6.339449192015636e-06, + "loss": 0.0015, + "step": 57730 + }, + { + "epoch": 0.9447762415119038, + "grad_norm": 0.03346279263496399, + "learning_rate": 6.338073422219891e-06, + "loss": 0.0016, + "step": 57740 + }, + { + "epoch": 0.9449398674629796, + "grad_norm": 0.11679522693157196, + "learning_rate": 6.336697543295432e-06, + "loss": 0.0021, + "step": 57750 + }, + { + "epoch": 0.9451034934140554, + "grad_norm": 0.10253500938415527, + "learning_rate": 6.3353215553544686e-06, + "loss": 0.0031, + "step": 57760 + }, + { + "epoch": 0.9452671193651313, + "grad_norm": 0.08026383817195892, + "learning_rate": 6.333945458509222e-06, + "loss": 0.0028, + "step": 57770 + }, + { + "epoch": 0.9454307453162072, + "grad_norm": 0.05896885693073273, + "learning_rate": 6.332569252871923e-06, + "loss": 0.0017, + "step": 57780 + }, + { + "epoch": 0.945594371267283, + "grad_norm": 0.026631180197000504, + "learning_rate": 6.331192938554809e-06, + "loss": 0.0014, + "step": 57790 + }, + { + "epoch": 0.9457579972183588, + "grad_norm": 0.03806743025779724, + "learning_rate": 6.329816515670127e-06, + "loss": 0.0013, + "step": 57800 + }, + { + "epoch": 0.9459216231694346, + "grad_norm": 0.1733967363834381, + "learning_rate": 6.328439984330136e-06, + "loss": 0.003, + "step": 57810 + }, + { + "epoch": 0.9460852491205105, + "grad_norm": 0.18561020493507385, + "learning_rate": 6.327063344647098e-06, + "loss": 0.0014, + "step": 57820 + }, + { + "epoch": 0.9462488750715864, + "grad_norm": 0.07884349673986435, + "learning_rate": 6.32568659673329e-06, + "loss": 0.0021, + "step": 57830 + }, + { + "epoch": 0.9464125010226622, + "grad_norm": 0.11453360319137573, + "learning_rate": 6.324309740700993e-06, + "loss": 0.0023, + "step": 57840 + }, + { + "epoch": 0.946576126973738, + "grad_norm": 0.032564714550971985, + "learning_rate": 6.3229327766624996e-06, + "loss": 0.002, + "step": 57850 + }, + { + "epoch": 0.9467397529248138, + "grad_norm": 0.050830595195293427, + "learning_rate": 6.321555704730109e-06, + "loss": 0.0023, + "step": 57860 + }, + { + "epoch": 0.9469033788758897, + "grad_norm": 0.04336218163371086, + "learning_rate": 6.320178525016133e-06, + "loss": 0.0022, + "step": 57870 + }, + { + "epoch": 0.9470670048269656, + "grad_norm": 0.1325337290763855, + "learning_rate": 6.318801237632887e-06, + "loss": 0.0033, + "step": 57880 + }, + { + "epoch": 0.9472306307780414, + "grad_norm": 0.06420790404081345, + "learning_rate": 6.317423842692699e-06, + "loss": 0.0017, + "step": 57890 + }, + { + "epoch": 0.9473942567291173, + "grad_norm": 0.05658198520541191, + "learning_rate": 6.316046340307905e-06, + "loss": 0.0012, + "step": 57900 + }, + { + "epoch": 0.947557882680193, + "grad_norm": 0.04964936152100563, + "learning_rate": 6.314668730590849e-06, + "loss": 0.0037, + "step": 57910 + }, + { + "epoch": 0.9477215086312689, + "grad_norm": 0.03604147210717201, + "learning_rate": 6.313291013653884e-06, + "loss": 0.0023, + "step": 57920 + }, + { + "epoch": 0.9478851345823448, + "grad_norm": 0.05656878650188446, + "learning_rate": 6.311913189609372e-06, + "loss": 0.0019, + "step": 57930 + }, + { + "epoch": 0.9480487605334206, + "grad_norm": 0.009907550178468227, + "learning_rate": 6.3105352585696845e-06, + "loss": 0.0008, + "step": 57940 + }, + { + "epoch": 0.9482123864844965, + "grad_norm": 0.12604115903377533, + "learning_rate": 6.3091572206472e-06, + "loss": 0.0018, + "step": 57950 + }, + { + "epoch": 0.9483760124355722, + "grad_norm": 0.04691174626350403, + "learning_rate": 6.307779075954307e-06, + "loss": 0.0023, + "step": 57960 + }, + { + "epoch": 0.9485396383866481, + "grad_norm": 0.03298064321279526, + "learning_rate": 6.3064008246034e-06, + "loss": 0.0012, + "step": 57970 + }, + { + "epoch": 0.948703264337724, + "grad_norm": 0.08867722749710083, + "learning_rate": 6.305022466706889e-06, + "loss": 0.0027, + "step": 57980 + }, + { + "epoch": 0.9488668902887998, + "grad_norm": 0.09201445430517197, + "learning_rate": 6.303644002377185e-06, + "loss": 0.0015, + "step": 57990 + }, + { + "epoch": 0.9490305162398757, + "grad_norm": 0.21433325111865997, + "learning_rate": 6.3022654317267105e-06, + "loss": 0.0016, + "step": 58000 + }, + { + "epoch": 0.9491941421909514, + "grad_norm": 0.26147589087486267, + "learning_rate": 6.300886754867899e-06, + "loss": 0.0018, + "step": 58010 + }, + { + "epoch": 0.9493577681420273, + "grad_norm": 0.10692520439624786, + "learning_rate": 6.29950797191319e-06, + "loss": 0.0021, + "step": 58020 + }, + { + "epoch": 0.9495213940931032, + "grad_norm": 0.0820641741156578, + "learning_rate": 6.298129082975031e-06, + "loss": 0.0013, + "step": 58030 + }, + { + "epoch": 0.949685020044179, + "grad_norm": 0.15649184584617615, + "learning_rate": 6.296750088165882e-06, + "loss": 0.0022, + "step": 58040 + }, + { + "epoch": 0.9498486459952549, + "grad_norm": 0.10773397237062454, + "learning_rate": 6.295370987598206e-06, + "loss": 0.001, + "step": 58050 + }, + { + "epoch": 0.9500122719463306, + "grad_norm": 0.011783221736550331, + "learning_rate": 6.293991781384481e-06, + "loss": 0.0011, + "step": 58060 + }, + { + "epoch": 0.9501758978974065, + "grad_norm": 0.023752499371767044, + "learning_rate": 6.292612469637189e-06, + "loss": 0.0011, + "step": 58070 + }, + { + "epoch": 0.9503395238484824, + "grad_norm": 0.03099977970123291, + "learning_rate": 6.291233052468822e-06, + "loss": 0.0024, + "step": 58080 + }, + { + "epoch": 0.9505031497995582, + "grad_norm": 0.039513569325208664, + "learning_rate": 6.28985352999188e-06, + "loss": 0.0011, + "step": 58090 + }, + { + "epoch": 0.9506667757506341, + "grad_norm": 0.09049548953771591, + "learning_rate": 6.288473902318871e-06, + "loss": 0.0039, + "step": 58100 + }, + { + "epoch": 0.9508304017017098, + "grad_norm": 0.07914090901613235, + "learning_rate": 6.287094169562315e-06, + "loss": 0.0013, + "step": 58110 + }, + { + "epoch": 0.9509940276527857, + "grad_norm": 0.12448439747095108, + "learning_rate": 6.285714331834739e-06, + "loss": 0.0018, + "step": 58120 + }, + { + "epoch": 0.9511576536038616, + "grad_norm": 0.05316713824868202, + "learning_rate": 6.2843343892486756e-06, + "loss": 0.0024, + "step": 58130 + }, + { + "epoch": 0.9513212795549374, + "grad_norm": 0.039714016020298004, + "learning_rate": 6.2829543419166685e-06, + "loss": 0.0021, + "step": 58140 + }, + { + "epoch": 0.9514849055060133, + "grad_norm": 0.042398519814014435, + "learning_rate": 6.281574189951271e-06, + "loss": 0.0019, + "step": 58150 + }, + { + "epoch": 0.951648531457089, + "grad_norm": 0.030234944075345993, + "learning_rate": 6.280193933465042e-06, + "loss": 0.0014, + "step": 58160 + }, + { + "epoch": 0.9518121574081649, + "grad_norm": 0.07874390482902527, + "learning_rate": 6.2788135725705525e-06, + "loss": 0.0013, + "step": 58170 + }, + { + "epoch": 0.9519757833592408, + "grad_norm": 0.0872984528541565, + "learning_rate": 6.277433107380378e-06, + "loss": 0.0021, + "step": 58180 + }, + { + "epoch": 0.9521394093103166, + "grad_norm": 0.10335346311330795, + "learning_rate": 6.276052538007107e-06, + "loss": 0.0017, + "step": 58190 + }, + { + "epoch": 0.9523030352613925, + "grad_norm": 0.0530158132314682, + "learning_rate": 6.274671864563331e-06, + "loss": 0.0017, + "step": 58200 + }, + { + "epoch": 0.9524666612124683, + "grad_norm": 0.08727231621742249, + "learning_rate": 6.273291087161655e-06, + "loss": 0.0016, + "step": 58210 + }, + { + "epoch": 0.9526302871635441, + "grad_norm": 0.005082531366497278, + "learning_rate": 6.271910205914689e-06, + "loss": 0.0018, + "step": 58220 + }, + { + "epoch": 0.95279391311462, + "grad_norm": 0.0704679936170578, + "learning_rate": 6.270529220935056e-06, + "loss": 0.0013, + "step": 58230 + }, + { + "epoch": 0.9529575390656958, + "grad_norm": 0.09169528633356094, + "learning_rate": 6.2691481323353805e-06, + "loss": 0.0019, + "step": 58240 + }, + { + "epoch": 0.9531211650167717, + "grad_norm": 0.1520804464817047, + "learning_rate": 6.267766940228303e-06, + "loss": 0.0017, + "step": 58250 + }, + { + "epoch": 0.9532847909678475, + "grad_norm": 0.08106236904859543, + "learning_rate": 6.266385644726466e-06, + "loss": 0.0022, + "step": 58260 + }, + { + "epoch": 0.9534484169189233, + "grad_norm": 0.04998185858130455, + "learning_rate": 6.265004245942525e-06, + "loss": 0.0022, + "step": 58270 + }, + { + "epoch": 0.9536120428699992, + "grad_norm": 0.04886684566736221, + "learning_rate": 6.263622743989142e-06, + "loss": 0.0013, + "step": 58280 + }, + { + "epoch": 0.953775668821075, + "grad_norm": 0.06148363649845123, + "learning_rate": 6.262241138978986e-06, + "loss": 0.0014, + "step": 58290 + }, + { + "epoch": 0.9539392947721509, + "grad_norm": 0.06408237665891647, + "learning_rate": 6.260859431024738e-06, + "loss": 0.0021, + "step": 58300 + }, + { + "epoch": 0.9541029207232267, + "grad_norm": 0.05922083184123039, + "learning_rate": 6.259477620239085e-06, + "loss": 0.0016, + "step": 58310 + }, + { + "epoch": 0.9542665466743026, + "grad_norm": 0.17053700983524323, + "learning_rate": 6.258095706734721e-06, + "loss": 0.002, + "step": 58320 + }, + { + "epoch": 0.9544301726253784, + "grad_norm": 0.06367158889770508, + "learning_rate": 6.256713690624353e-06, + "loss": 0.0029, + "step": 58330 + }, + { + "epoch": 0.9545937985764542, + "grad_norm": 0.03940455615520477, + "learning_rate": 6.255331572020692e-06, + "loss": 0.0025, + "step": 58340 + }, + { + "epoch": 0.9547574245275301, + "grad_norm": 0.09376281499862671, + "learning_rate": 6.253949351036459e-06, + "loss": 0.0031, + "step": 58350 + }, + { + "epoch": 0.9549210504786059, + "grad_norm": 0.04461251199245453, + "learning_rate": 6.252567027784382e-06, + "loss": 0.0031, + "step": 58360 + }, + { + "epoch": 0.9550846764296818, + "grad_norm": 0.09415064007043839, + "learning_rate": 6.251184602377202e-06, + "loss": 0.0023, + "step": 58370 + }, + { + "epoch": 0.9552483023807576, + "grad_norm": 0.007300138007849455, + "learning_rate": 6.24980207492766e-06, + "loss": 0.0031, + "step": 58380 + }, + { + "epoch": 0.9554119283318334, + "grad_norm": 0.06237104535102844, + "learning_rate": 6.248419445548516e-06, + "loss": 0.0024, + "step": 58390 + }, + { + "epoch": 0.9555755542829093, + "grad_norm": 0.06175771728157997, + "learning_rate": 6.247036714352528e-06, + "loss": 0.0018, + "step": 58400 + }, + { + "epoch": 0.9557391802339851, + "grad_norm": 0.050690412521362305, + "learning_rate": 6.245653881452468e-06, + "loss": 0.001, + "step": 58410 + }, + { + "epoch": 0.955902806185061, + "grad_norm": 0.01614823006093502, + "learning_rate": 6.244270946961116e-06, + "loss": 0.0025, + "step": 58420 + }, + { + "epoch": 0.9560664321361368, + "grad_norm": 0.038604676723480225, + "learning_rate": 6.2428879109912585e-06, + "loss": 0.0024, + "step": 58430 + }, + { + "epoch": 0.9562300580872126, + "grad_norm": 0.04415985941886902, + "learning_rate": 6.241504773655692e-06, + "loss": 0.0015, + "step": 58440 + }, + { + "epoch": 0.9563936840382885, + "grad_norm": 0.027101153507828712, + "learning_rate": 6.240121535067219e-06, + "loss": 0.0022, + "step": 58450 + }, + { + "epoch": 0.9565573099893643, + "grad_norm": 0.027097007259726524, + "learning_rate": 6.238738195338655e-06, + "loss": 0.0014, + "step": 58460 + }, + { + "epoch": 0.9567209359404402, + "grad_norm": 0.04853229597210884, + "learning_rate": 6.237354754582817e-06, + "loss": 0.0023, + "step": 58470 + }, + { + "epoch": 0.9568845618915159, + "grad_norm": 0.023506224155426025, + "learning_rate": 6.235971212912535e-06, + "loss": 0.0025, + "step": 58480 + }, + { + "epoch": 0.9570481878425918, + "grad_norm": 0.1510210931301117, + "learning_rate": 6.234587570440647e-06, + "loss": 0.002, + "step": 58490 + }, + { + "epoch": 0.9572118137936677, + "grad_norm": 0.06374149024486542, + "learning_rate": 6.2332038272799955e-06, + "loss": 0.0019, + "step": 58500 + }, + { + "epoch": 0.9573754397447435, + "grad_norm": 0.032993730157613754, + "learning_rate": 6.231819983543436e-06, + "loss": 0.0021, + "step": 58510 + }, + { + "epoch": 0.9575390656958194, + "grad_norm": 0.11612825840711594, + "learning_rate": 6.2304360393438315e-06, + "loss": 0.0018, + "step": 58520 + }, + { + "epoch": 0.9577026916468951, + "grad_norm": 0.0704694539308548, + "learning_rate": 6.229051994794047e-06, + "loss": 0.0018, + "step": 58530 + }, + { + "epoch": 0.957866317597971, + "grad_norm": 0.049204569309949875, + "learning_rate": 6.227667850006967e-06, + "loss": 0.0014, + "step": 58540 + }, + { + "epoch": 0.9580299435490469, + "grad_norm": 0.00743493577465415, + "learning_rate": 6.226283605095471e-06, + "loss": 0.0027, + "step": 58550 + }, + { + "epoch": 0.9581935695001227, + "grad_norm": 0.026673590764403343, + "learning_rate": 6.224899260172458e-06, + "loss": 0.0015, + "step": 58560 + }, + { + "epoch": 0.9583571954511986, + "grad_norm": 0.017874035984277725, + "learning_rate": 6.223514815350827e-06, + "loss": 0.0019, + "step": 58570 + }, + { + "epoch": 0.9585208214022743, + "grad_norm": 0.10189425945281982, + "learning_rate": 6.222130270743492e-06, + "loss": 0.0031, + "step": 58580 + }, + { + "epoch": 0.9586844473533502, + "grad_norm": 0.025408370420336723, + "learning_rate": 6.22074562646337e-06, + "loss": 0.001, + "step": 58590 + }, + { + "epoch": 0.9588480733044261, + "grad_norm": 0.03035455197095871, + "learning_rate": 6.219360882623388e-06, + "loss": 0.0012, + "step": 58600 + }, + { + "epoch": 0.9590116992555019, + "grad_norm": 0.05683242529630661, + "learning_rate": 6.217976039336481e-06, + "loss": 0.0016, + "step": 58610 + }, + { + "epoch": 0.9591753252065778, + "grad_norm": 0.029838312417268753, + "learning_rate": 6.216591096715592e-06, + "loss": 0.0016, + "step": 58620 + }, + { + "epoch": 0.9593389511576536, + "grad_norm": 0.0384819433093071, + "learning_rate": 6.215206054873672e-06, + "loss": 0.0022, + "step": 58630 + }, + { + "epoch": 0.9595025771087294, + "grad_norm": 0.03671186789870262, + "learning_rate": 6.213820913923681e-06, + "loss": 0.0012, + "step": 58640 + }, + { + "epoch": 0.9596662030598053, + "grad_norm": 0.12970297038555145, + "learning_rate": 6.212435673978587e-06, + "loss": 0.0019, + "step": 58650 + }, + { + "epoch": 0.9598298290108811, + "grad_norm": 0.12293124198913574, + "learning_rate": 6.211050335151363e-06, + "loss": 0.0028, + "step": 58660 + }, + { + "epoch": 0.959993454961957, + "grad_norm": 0.04402632266283035, + "learning_rate": 6.209664897554995e-06, + "loss": 0.002, + "step": 58670 + }, + { + "epoch": 0.9601570809130328, + "grad_norm": 0.11219951510429382, + "learning_rate": 6.2082793613024716e-06, + "loss": 0.0028, + "step": 58680 + }, + { + "epoch": 0.9603207068641086, + "grad_norm": 0.05862412974238396, + "learning_rate": 6.206893726506796e-06, + "loss": 0.0018, + "step": 58690 + }, + { + "epoch": 0.9604843328151845, + "grad_norm": 0.06713079661130905, + "learning_rate": 6.205507993280975e-06, + "loss": 0.0017, + "step": 58700 + }, + { + "epoch": 0.9606479587662603, + "grad_norm": 0.11677181720733643, + "learning_rate": 6.204122161738022e-06, + "loss": 0.0012, + "step": 58710 + }, + { + "epoch": 0.9608115847173362, + "grad_norm": 0.04015107825398445, + "learning_rate": 6.202736231990965e-06, + "loss": 0.0026, + "step": 58720 + }, + { + "epoch": 0.960975210668412, + "grad_norm": 0.16898278892040253, + "learning_rate": 6.201350204152831e-06, + "loss": 0.0015, + "step": 58730 + }, + { + "epoch": 0.9611388366194878, + "grad_norm": 0.06074877455830574, + "learning_rate": 6.199964078336661e-06, + "loss": 0.0019, + "step": 58740 + }, + { + "epoch": 0.9613024625705637, + "grad_norm": 0.03581656515598297, + "learning_rate": 6.198577854655504e-06, + "loss": 0.0013, + "step": 58750 + }, + { + "epoch": 0.9614660885216395, + "grad_norm": 0.032737743109464645, + "learning_rate": 6.197191533222415e-06, + "loss": 0.0026, + "step": 58760 + }, + { + "epoch": 0.9616297144727154, + "grad_norm": 0.012240628711879253, + "learning_rate": 6.195805114150458e-06, + "loss": 0.0017, + "step": 58770 + }, + { + "epoch": 0.9617933404237912, + "grad_norm": 0.08106410503387451, + "learning_rate": 6.194418597552705e-06, + "loss": 0.0019, + "step": 58780 + }, + { + "epoch": 0.961956966374867, + "grad_norm": 0.05172346532344818, + "learning_rate": 6.1930319835422336e-06, + "loss": 0.0018, + "step": 58790 + }, + { + "epoch": 0.9621205923259429, + "grad_norm": 0.055812761187553406, + "learning_rate": 6.191645272232134e-06, + "loss": 0.002, + "step": 58800 + }, + { + "epoch": 0.9622842182770187, + "grad_norm": 0.03036658465862274, + "learning_rate": 6.190258463735499e-06, + "loss": 0.0016, + "step": 58810 + }, + { + "epoch": 0.9624478442280946, + "grad_norm": 0.08152364939451218, + "learning_rate": 6.1888715581654345e-06, + "loss": 0.0014, + "step": 58820 + }, + { + "epoch": 0.9626114701791704, + "grad_norm": 0.0864526778459549, + "learning_rate": 6.187484555635049e-06, + "loss": 0.0018, + "step": 58830 + }, + { + "epoch": 0.9627750961302463, + "grad_norm": 0.07235458493232727, + "learning_rate": 6.186097456257465e-06, + "loss": 0.0019, + "step": 58840 + }, + { + "epoch": 0.9629387220813221, + "grad_norm": 0.04425173997879028, + "learning_rate": 6.184710260145807e-06, + "loss": 0.0032, + "step": 58850 + }, + { + "epoch": 0.9631023480323979, + "grad_norm": 0.12765848636627197, + "learning_rate": 6.183322967413212e-06, + "loss": 0.0025, + "step": 58860 + }, + { + "epoch": 0.9632659739834738, + "grad_norm": 0.04378824308514595, + "learning_rate": 6.181935578172821e-06, + "loss": 0.0014, + "step": 58870 + }, + { + "epoch": 0.9634295999345496, + "grad_norm": 0.06265013664960861, + "learning_rate": 6.180548092537786e-06, + "loss": 0.0025, + "step": 58880 + }, + { + "epoch": 0.9635932258856255, + "grad_norm": 0.016524959355592728, + "learning_rate": 6.179160510621264e-06, + "loss": 0.0019, + "step": 58890 + }, + { + "epoch": 0.9637568518367013, + "grad_norm": 0.11609126627445221, + "learning_rate": 6.177772832536423e-06, + "loss": 0.0013, + "step": 58900 + }, + { + "epoch": 0.9639204777877771, + "grad_norm": 0.09382468461990356, + "learning_rate": 6.1763850583964365e-06, + "loss": 0.0017, + "step": 58910 + }, + { + "epoch": 0.964084103738853, + "grad_norm": 0.04451765492558479, + "learning_rate": 6.174997188314489e-06, + "loss": 0.0012, + "step": 58920 + }, + { + "epoch": 0.9642477296899288, + "grad_norm": 0.010108184069395065, + "learning_rate": 6.173609222403767e-06, + "loss": 0.0028, + "step": 58930 + }, + { + "epoch": 0.9644113556410047, + "grad_norm": 0.02133953385055065, + "learning_rate": 6.172221160777469e-06, + "loss": 0.002, + "step": 58940 + }, + { + "epoch": 0.9645749815920805, + "grad_norm": 0.049128394573926926, + "learning_rate": 6.170833003548803e-06, + "loss": 0.0017, + "step": 58950 + }, + { + "epoch": 0.9647386075431563, + "grad_norm": 0.08993761986494064, + "learning_rate": 6.1694447508309805e-06, + "loss": 0.0018, + "step": 58960 + }, + { + "epoch": 0.9649022334942322, + "grad_norm": 0.13894130289554596, + "learning_rate": 6.168056402737222e-06, + "loss": 0.0035, + "step": 58970 + }, + { + "epoch": 0.965065859445308, + "grad_norm": 0.1430618315935135, + "learning_rate": 6.166667959380759e-06, + "loss": 0.0019, + "step": 58980 + }, + { + "epoch": 0.9652294853963839, + "grad_norm": 0.014461737126111984, + "learning_rate": 6.1652794208748245e-06, + "loss": 0.0024, + "step": 58990 + }, + { + "epoch": 0.9653931113474598, + "grad_norm": 0.05005745589733124, + "learning_rate": 6.163890787332667e-06, + "loss": 0.002, + "step": 59000 + }, + { + "epoch": 0.9655567372985355, + "grad_norm": 0.09706447273492813, + "learning_rate": 6.162502058867536e-06, + "loss": 0.002, + "step": 59010 + }, + { + "epoch": 0.9657203632496114, + "grad_norm": 0.11365757882595062, + "learning_rate": 6.161113235592692e-06, + "loss": 0.0019, + "step": 59020 + }, + { + "epoch": 0.9658839892006872, + "grad_norm": 0.042573362588882446, + "learning_rate": 6.1597243176214025e-06, + "loss": 0.0013, + "step": 59030 + }, + { + "epoch": 0.9660476151517631, + "grad_norm": 0.04736004024744034, + "learning_rate": 6.158335305066945e-06, + "loss": 0.003, + "step": 59040 + }, + { + "epoch": 0.966211241102839, + "grad_norm": 0.13747192919254303, + "learning_rate": 6.1569461980426e-06, + "loss": 0.0019, + "step": 59050 + }, + { + "epoch": 0.9663748670539147, + "grad_norm": 0.03255385905504227, + "learning_rate": 6.155556996661659e-06, + "loss": 0.0015, + "step": 59060 + }, + { + "epoch": 0.9665384930049906, + "grad_norm": 0.0700443759560585, + "learning_rate": 6.154167701037421e-06, + "loss": 0.002, + "step": 59070 + }, + { + "epoch": 0.9667021189560664, + "grad_norm": 0.0969579815864563, + "learning_rate": 6.152778311283193e-06, + "loss": 0.0016, + "step": 59080 + }, + { + "epoch": 0.9668657449071423, + "grad_norm": 0.052093978971242905, + "learning_rate": 6.151388827512287e-06, + "loss": 0.0017, + "step": 59090 + }, + { + "epoch": 0.9670293708582182, + "grad_norm": 0.06126511096954346, + "learning_rate": 6.149999249838025e-06, + "loss": 0.002, + "step": 59100 + }, + { + "epoch": 0.9671929968092939, + "grad_norm": 0.09167136996984482, + "learning_rate": 6.1486095783737386e-06, + "loss": 0.0009, + "step": 59110 + }, + { + "epoch": 0.9673566227603698, + "grad_norm": 0.041624169796705246, + "learning_rate": 6.147219813232762e-06, + "loss": 0.0029, + "step": 59120 + }, + { + "epoch": 0.9675202487114456, + "grad_norm": 0.15207301080226898, + "learning_rate": 6.145829954528441e-06, + "loss": 0.0012, + "step": 59130 + }, + { + "epoch": 0.9676838746625215, + "grad_norm": 0.053871866315603256, + "learning_rate": 6.144440002374127e-06, + "loss": 0.0015, + "step": 59140 + }, + { + "epoch": 0.9678475006135974, + "grad_norm": 0.030999070033431053, + "learning_rate": 6.14304995688318e-06, + "loss": 0.0019, + "step": 59150 + }, + { + "epoch": 0.9680111265646731, + "grad_norm": 0.07165121287107468, + "learning_rate": 6.1416598181689675e-06, + "loss": 0.0018, + "step": 59160 + }, + { + "epoch": 0.968174752515749, + "grad_norm": 0.03951803222298622, + "learning_rate": 6.140269586344864e-06, + "loss": 0.0015, + "step": 59170 + }, + { + "epoch": 0.9683383784668248, + "grad_norm": 0.04415475204586983, + "learning_rate": 6.138879261524254e-06, + "loss": 0.0015, + "step": 59180 + }, + { + "epoch": 0.9685020044179007, + "grad_norm": 0.034847572445869446, + "learning_rate": 6.137488843820526e-06, + "loss": 0.0018, + "step": 59190 + }, + { + "epoch": 0.9686656303689766, + "grad_norm": 0.04615645483136177, + "learning_rate": 6.136098333347077e-06, + "loss": 0.0024, + "step": 59200 + }, + { + "epoch": 0.9688292563200523, + "grad_norm": 0.05982517451047897, + "learning_rate": 6.1347077302173145e-06, + "loss": 0.0026, + "step": 59210 + }, + { + "epoch": 0.9689928822711282, + "grad_norm": 0.09822215884923935, + "learning_rate": 6.133317034544649e-06, + "loss": 0.0029, + "step": 59220 + }, + { + "epoch": 0.969156508222204, + "grad_norm": 0.21471615135669708, + "learning_rate": 6.131926246442502e-06, + "loss": 0.003, + "step": 59230 + }, + { + "epoch": 0.9693201341732799, + "grad_norm": 0.07293573021888733, + "learning_rate": 6.130535366024302e-06, + "loss": 0.0013, + "step": 59240 + }, + { + "epoch": 0.9694837601243558, + "grad_norm": 0.03750929981470108, + "learning_rate": 6.129144393403483e-06, + "loss": 0.0013, + "step": 59250 + }, + { + "epoch": 0.9696473860754315, + "grad_norm": 0.3493478298187256, + "learning_rate": 6.1277533286934906e-06, + "loss": 0.0031, + "step": 59260 + }, + { + "epoch": 0.9698110120265074, + "grad_norm": 0.13050806522369385, + "learning_rate": 6.126362172007772e-06, + "loss": 0.0017, + "step": 59270 + }, + { + "epoch": 0.9699746379775832, + "grad_norm": 0.021530838683247566, + "learning_rate": 6.1249709234597884e-06, + "loss": 0.0051, + "step": 59280 + }, + { + "epoch": 0.9701382639286591, + "grad_norm": 0.12952467799186707, + "learning_rate": 6.123579583163003e-06, + "loss": 0.0017, + "step": 59290 + }, + { + "epoch": 0.970301889879735, + "grad_norm": 0.2073030173778534, + "learning_rate": 6.12218815123089e-06, + "loss": 0.0012, + "step": 59300 + }, + { + "epoch": 0.9704655158308108, + "grad_norm": 0.047026898711919785, + "learning_rate": 6.120796627776928e-06, + "loss": 0.002, + "step": 59310 + }, + { + "epoch": 0.9706291417818866, + "grad_norm": 0.11334921419620514, + "learning_rate": 6.119405012914608e-06, + "loss": 0.002, + "step": 59320 + }, + { + "epoch": 0.9707927677329624, + "grad_norm": 0.08938440680503845, + "learning_rate": 6.118013306757423e-06, + "loss": 0.0033, + "step": 59330 + }, + { + "epoch": 0.9709563936840383, + "grad_norm": 0.1321195811033249, + "learning_rate": 6.1166215094188764e-06, + "loss": 0.0019, + "step": 59340 + }, + { + "epoch": 0.9711200196351142, + "grad_norm": 0.057198166847229004, + "learning_rate": 6.115229621012479e-06, + "loss": 0.0024, + "step": 59350 + }, + { + "epoch": 0.97128364558619, + "grad_norm": 0.09630824625492096, + "learning_rate": 6.113837641651749e-06, + "loss": 0.0032, + "step": 59360 + }, + { + "epoch": 0.9714472715372658, + "grad_norm": 0.022952301427721977, + "learning_rate": 6.1124455714502085e-06, + "loss": 0.0011, + "step": 59370 + }, + { + "epoch": 0.9716108974883416, + "grad_norm": 0.03261130303144455, + "learning_rate": 6.111053410521394e-06, + "loss": 0.0019, + "step": 59380 + }, + { + "epoch": 0.9717745234394175, + "grad_norm": 0.2276962548494339, + "learning_rate": 6.1096611589788415e-06, + "loss": 0.0019, + "step": 59390 + }, + { + "epoch": 0.9719381493904933, + "grad_norm": 0.04965461418032646, + "learning_rate": 6.108268816936102e-06, + "loss": 0.0019, + "step": 59400 + }, + { + "epoch": 0.9721017753415692, + "grad_norm": 0.07307393848896027, + "learning_rate": 6.106876384506727e-06, + "loss": 0.0029, + "step": 59410 + }, + { + "epoch": 0.972265401292645, + "grad_norm": 0.054034411907196045, + "learning_rate": 6.10548386180428e-06, + "loss": 0.0025, + "step": 59420 + }, + { + "epoch": 0.9724290272437208, + "grad_norm": 0.09574251621961594, + "learning_rate": 6.104091248942331e-06, + "loss": 0.0019, + "step": 59430 + }, + { + "epoch": 0.9725926531947967, + "grad_norm": 0.08367104083299637, + "learning_rate": 6.102698546034456e-06, + "loss": 0.0018, + "step": 59440 + }, + { + "epoch": 0.9727562791458725, + "grad_norm": 0.035694271326065063, + "learning_rate": 6.10130575319424e-06, + "loss": 0.0024, + "step": 59450 + }, + { + "epoch": 0.9729199050969484, + "grad_norm": 0.01515640877187252, + "learning_rate": 6.0999128705352724e-06, + "loss": 0.0013, + "step": 59460 + }, + { + "epoch": 0.9730835310480243, + "grad_norm": 0.22956176102161407, + "learning_rate": 6.098519898171155e-06, + "loss": 0.0025, + "step": 59470 + }, + { + "epoch": 0.9732471569991, + "grad_norm": 0.09979180991649628, + "learning_rate": 6.097126836215491e-06, + "loss": 0.0019, + "step": 59480 + }, + { + "epoch": 0.9734107829501759, + "grad_norm": 0.10009083896875381, + "learning_rate": 6.095733684781895e-06, + "loss": 0.0025, + "step": 59490 + }, + { + "epoch": 0.9735744089012517, + "grad_norm": 0.048967428505420685, + "learning_rate": 6.0943404439839885e-06, + "loss": 0.002, + "step": 59500 + }, + { + "epoch": 0.9737380348523276, + "grad_norm": 0.04773823171854019, + "learning_rate": 6.092947113935397e-06, + "loss": 0.0021, + "step": 59510 + }, + { + "epoch": 0.9739016608034035, + "grad_norm": 0.04499290511012077, + "learning_rate": 6.091553694749759e-06, + "loss": 0.0022, + "step": 59520 + }, + { + "epoch": 0.9740652867544792, + "grad_norm": 0.07160546630620956, + "learning_rate": 6.0901601865407144e-06, + "loss": 0.0023, + "step": 59530 + }, + { + "epoch": 0.9742289127055551, + "grad_norm": 0.05784238129854202, + "learning_rate": 6.088766589421915e-06, + "loss": 0.0021, + "step": 59540 + }, + { + "epoch": 0.9743925386566309, + "grad_norm": 0.06750132888555527, + "learning_rate": 6.087372903507016e-06, + "loss": 0.0021, + "step": 59550 + }, + { + "epoch": 0.9745561646077068, + "grad_norm": 0.019278204068541527, + "learning_rate": 6.085979128909684e-06, + "loss": 0.0019, + "step": 59560 + }, + { + "epoch": 0.9747197905587827, + "grad_norm": 0.15544544160366058, + "learning_rate": 6.084585265743588e-06, + "loss": 0.0028, + "step": 59570 + }, + { + "epoch": 0.9748834165098584, + "grad_norm": 0.07146583497524261, + "learning_rate": 6.083191314122407e-06, + "loss": 0.0021, + "step": 59580 + }, + { + "epoch": 0.9750470424609343, + "grad_norm": 0.06504635512828827, + "learning_rate": 6.081797274159828e-06, + "loss": 0.0016, + "step": 59590 + }, + { + "epoch": 0.9752106684120101, + "grad_norm": 0.09232314676046371, + "learning_rate": 6.080403145969545e-06, + "loss": 0.0012, + "step": 59600 + }, + { + "epoch": 0.975374294363086, + "grad_norm": 0.04990586265921593, + "learning_rate": 6.079008929665257e-06, + "loss": 0.0019, + "step": 59610 + }, + { + "epoch": 0.9755379203141619, + "grad_norm": 0.2490033656358719, + "learning_rate": 6.077614625360672e-06, + "loss": 0.0037, + "step": 59620 + }, + { + "epoch": 0.9757015462652376, + "grad_norm": 0.18469533324241638, + "learning_rate": 6.076220233169504e-06, + "loss": 0.002, + "step": 59630 + }, + { + "epoch": 0.9758651722163135, + "grad_norm": 0.06195841729640961, + "learning_rate": 6.074825753205475e-06, + "loss": 0.0013, + "step": 59640 + }, + { + "epoch": 0.9760287981673893, + "grad_norm": 0.08911745995283127, + "learning_rate": 6.073431185582315e-06, + "loss": 0.002, + "step": 59650 + }, + { + "epoch": 0.9761924241184652, + "grad_norm": 0.06178278848528862, + "learning_rate": 6.072036530413759e-06, + "loss": 0.0023, + "step": 59660 + }, + { + "epoch": 0.9763560500695411, + "grad_norm": 0.007030330132693052, + "learning_rate": 6.070641787813552e-06, + "loss": 0.0016, + "step": 59670 + }, + { + "epoch": 0.9765196760206168, + "grad_norm": 0.05066562816500664, + "learning_rate": 6.0692469578954445e-06, + "loss": 0.0015, + "step": 59680 + }, + { + "epoch": 0.9766833019716927, + "grad_norm": 0.060841407626867294, + "learning_rate": 6.067852040773191e-06, + "loss": 0.0038, + "step": 59690 + }, + { + "epoch": 0.9768469279227685, + "grad_norm": 0.05238354951143265, + "learning_rate": 6.0664570365605595e-06, + "loss": 0.0029, + "step": 59700 + }, + { + "epoch": 0.9770105538738444, + "grad_norm": 0.009691527113318443, + "learning_rate": 6.065061945371319e-06, + "loss": 0.0016, + "step": 59710 + }, + { + "epoch": 0.9771741798249203, + "grad_norm": 0.0825946256518364, + "learning_rate": 6.0636667673192504e-06, + "loss": 0.0028, + "step": 59720 + }, + { + "epoch": 0.977337805775996, + "grad_norm": 0.03873920440673828, + "learning_rate": 6.06227150251814e-06, + "loss": 0.0017, + "step": 59730 + }, + { + "epoch": 0.9775014317270719, + "grad_norm": 0.06351128965616226, + "learning_rate": 6.06087615108178e-06, + "loss": 0.0013, + "step": 59740 + }, + { + "epoch": 0.9776650576781477, + "grad_norm": 0.006233478896319866, + "learning_rate": 6.059480713123968e-06, + "loss": 0.0022, + "step": 59750 + }, + { + "epoch": 0.9778286836292236, + "grad_norm": 0.1972426325082779, + "learning_rate": 6.058085188758517e-06, + "loss": 0.0021, + "step": 59760 + }, + { + "epoch": 0.9779923095802995, + "grad_norm": 0.012261101976037025, + "learning_rate": 6.056689578099236e-06, + "loss": 0.0015, + "step": 59770 + }, + { + "epoch": 0.9781559355313753, + "grad_norm": 0.12176188081502914, + "learning_rate": 6.055293881259948e-06, + "loss": 0.0019, + "step": 59780 + }, + { + "epoch": 0.9783195614824511, + "grad_norm": 0.13167357444763184, + "learning_rate": 6.053898098354483e-06, + "loss": 0.0024, + "step": 59790 + }, + { + "epoch": 0.9784831874335269, + "grad_norm": 0.12744878232479095, + "learning_rate": 6.052502229496674e-06, + "loss": 0.0011, + "step": 59800 + }, + { + "epoch": 0.9786468133846028, + "grad_norm": 0.027661819010972977, + "learning_rate": 6.051106274800363e-06, + "loss": 0.002, + "step": 59810 + }, + { + "epoch": 0.9788104393356787, + "grad_norm": 0.03930159658193588, + "learning_rate": 6.0497102343794025e-06, + "loss": 0.0021, + "step": 59820 + }, + { + "epoch": 0.9789740652867545, + "grad_norm": 0.016959572210907936, + "learning_rate": 6.0483141083476445e-06, + "loss": 0.0025, + "step": 59830 + }, + { + "epoch": 0.9791376912378303, + "grad_norm": 0.0502483956515789, + "learning_rate": 6.046917896818956e-06, + "loss": 0.0027, + "step": 59840 + }, + { + "epoch": 0.9793013171889061, + "grad_norm": 0.0883905366063118, + "learning_rate": 6.045521599907206e-06, + "loss": 0.0023, + "step": 59850 + }, + { + "epoch": 0.979464943139982, + "grad_norm": 0.44710573554039, + "learning_rate": 6.0441252177262695e-06, + "loss": 0.0019, + "step": 59860 + }, + { + "epoch": 0.9796285690910579, + "grad_norm": 0.04729650542140007, + "learning_rate": 6.042728750390034e-06, + "loss": 0.0014, + "step": 59870 + }, + { + "epoch": 0.9797921950421337, + "grad_norm": 0.050038114190101624, + "learning_rate": 6.04133219801239e-06, + "loss": 0.0014, + "step": 59880 + }, + { + "epoch": 0.9799558209932095, + "grad_norm": 0.06400980800390244, + "learning_rate": 6.039935560707234e-06, + "loss": 0.0025, + "step": 59890 + }, + { + "epoch": 0.9801194469442853, + "grad_norm": 0.0482458621263504, + "learning_rate": 6.038538838588472e-06, + "loss": 0.0017, + "step": 59900 + }, + { + "epoch": 0.9802830728953612, + "grad_norm": 0.07127697020769119, + "learning_rate": 6.0371420317700155e-06, + "loss": 0.0019, + "step": 59910 + }, + { + "epoch": 0.9804466988464371, + "grad_norm": 0.029114123433828354, + "learning_rate": 6.035745140365784e-06, + "loss": 0.0016, + "step": 59920 + }, + { + "epoch": 0.9806103247975129, + "grad_norm": 0.047244396060705185, + "learning_rate": 6.0343481644897026e-06, + "loss": 0.0018, + "step": 59930 + }, + { + "epoch": 0.9807739507485888, + "grad_norm": 0.004442477133125067, + "learning_rate": 6.032951104255704e-06, + "loss": 0.0021, + "step": 59940 + }, + { + "epoch": 0.9809375766996645, + "grad_norm": 0.054098594933748245, + "learning_rate": 6.031553959777727e-06, + "loss": 0.0014, + "step": 59950 + }, + { + "epoch": 0.9811012026507404, + "grad_norm": 0.04171818122267723, + "learning_rate": 6.030156731169719e-06, + "loss": 0.0018, + "step": 59960 + }, + { + "epoch": 0.9812648286018163, + "grad_norm": 0.09094149619340897, + "learning_rate": 6.028759418545635e-06, + "loss": 0.0016, + "step": 59970 + }, + { + "epoch": 0.9814284545528921, + "grad_norm": 0.035672642290592194, + "learning_rate": 6.027362022019431e-06, + "loss": 0.0016, + "step": 59980 + }, + { + "epoch": 0.981592080503968, + "grad_norm": 0.14837729930877686, + "learning_rate": 6.025964541705076e-06, + "loss": 0.0031, + "step": 59990 + }, + { + "epoch": 0.9817557064550437, + "grad_norm": 0.09441840648651123, + "learning_rate": 6.024566977716544e-06, + "loss": 0.0032, + "step": 60000 + }, + { + "epoch": 0.9819193324061196, + "grad_norm": 0.2889818549156189, + "learning_rate": 6.023169330167815e-06, + "loss": 0.0028, + "step": 60010 + }, + { + "epoch": 0.9820829583571955, + "grad_norm": 0.04635530337691307, + "learning_rate": 6.0217715991728766e-06, + "loss": 0.0021, + "step": 60020 + }, + { + "epoch": 0.9822465843082713, + "grad_norm": 0.04362974688410759, + "learning_rate": 6.020373784845723e-06, + "loss": 0.0021, + "step": 60030 + }, + { + "epoch": 0.9824102102593472, + "grad_norm": 0.0717148631811142, + "learning_rate": 6.018975887300356e-06, + "loss": 0.002, + "step": 60040 + }, + { + "epoch": 0.9825738362104229, + "grad_norm": 0.026522452011704445, + "learning_rate": 6.01757790665078e-06, + "loss": 0.0015, + "step": 60050 + }, + { + "epoch": 0.9827374621614988, + "grad_norm": 0.03239431604743004, + "learning_rate": 6.016179843011014e-06, + "loss": 0.0023, + "step": 60060 + }, + { + "epoch": 0.9829010881125747, + "grad_norm": 0.05888697877526283, + "learning_rate": 6.0147816964950764e-06, + "loss": 0.0023, + "step": 60070 + }, + { + "epoch": 0.9830647140636505, + "grad_norm": 0.03198877349495888, + "learning_rate": 6.0133834672169964e-06, + "loss": 0.0012, + "step": 60080 + }, + { + "epoch": 0.9832283400147264, + "grad_norm": 0.05001695826649666, + "learning_rate": 6.011985155290809e-06, + "loss": 0.0034, + "step": 60090 + }, + { + "epoch": 0.9833919659658021, + "grad_norm": 0.029652073979377747, + "learning_rate": 6.0105867608305555e-06, + "loss": 0.0018, + "step": 60100 + }, + { + "epoch": 0.983555591916878, + "grad_norm": 0.10867547243833542, + "learning_rate": 6.0091882839502845e-06, + "loss": 0.0021, + "step": 60110 + }, + { + "epoch": 0.9837192178679539, + "grad_norm": 0.04657530039548874, + "learning_rate": 6.007789724764049e-06, + "loss": 0.0026, + "step": 60120 + }, + { + "epoch": 0.9838828438190297, + "grad_norm": 0.05494051054120064, + "learning_rate": 6.006391083385914e-06, + "loss": 0.0032, + "step": 60130 + }, + { + "epoch": 0.9840464697701056, + "grad_norm": 0.038255564868450165, + "learning_rate": 6.004992359929946e-06, + "loss": 0.002, + "step": 60140 + }, + { + "epoch": 0.9842100957211813, + "grad_norm": 0.029380599036812782, + "learning_rate": 6.00359355451022e-06, + "loss": 0.0008, + "step": 60150 + }, + { + "epoch": 0.9843737216722572, + "grad_norm": 0.28330400586128235, + "learning_rate": 6.002194667240818e-06, + "loss": 0.0033, + "step": 60160 + }, + { + "epoch": 0.9845373476233331, + "grad_norm": 0.2177828699350357, + "learning_rate": 6.000795698235828e-06, + "loss": 0.0029, + "step": 60170 + }, + { + "epoch": 0.9847009735744089, + "grad_norm": 0.0665067657828331, + "learning_rate": 5.999396647609348e-06, + "loss": 0.0014, + "step": 60180 + }, + { + "epoch": 0.9848645995254848, + "grad_norm": 0.05530639365315437, + "learning_rate": 5.997997515475476e-06, + "loss": 0.0015, + "step": 60190 + }, + { + "epoch": 0.9850282254765605, + "grad_norm": 0.0063982997089624405, + "learning_rate": 5.996598301948322e-06, + "loss": 0.0013, + "step": 60200 + }, + { + "epoch": 0.9851918514276364, + "grad_norm": 0.01960398256778717, + "learning_rate": 5.995199007142002e-06, + "loss": 0.0009, + "step": 60210 + }, + { + "epoch": 0.9853554773787123, + "grad_norm": 0.10860680043697357, + "learning_rate": 5.993799631170637e-06, + "loss": 0.001, + "step": 60220 + }, + { + "epoch": 0.9855191033297881, + "grad_norm": 0.033449865877628326, + "learning_rate": 5.992400174148355e-06, + "loss": 0.0021, + "step": 60230 + }, + { + "epoch": 0.985682729280864, + "grad_norm": 0.12303087115287781, + "learning_rate": 5.991000636189292e-06, + "loss": 0.002, + "step": 60240 + }, + { + "epoch": 0.9858463552319398, + "grad_norm": 0.05814871937036514, + "learning_rate": 5.989601017407587e-06, + "loss": 0.0016, + "step": 60250 + }, + { + "epoch": 0.9860099811830156, + "grad_norm": 0.05798044055700302, + "learning_rate": 5.988201317917392e-06, + "loss": 0.0019, + "step": 60260 + }, + { + "epoch": 0.9861736071340914, + "grad_norm": 0.05535168945789337, + "learning_rate": 5.986801537832858e-06, + "loss": 0.0016, + "step": 60270 + }, + { + "epoch": 0.9863372330851673, + "grad_norm": 0.13906854391098022, + "learning_rate": 5.985401677268148e-06, + "loss": 0.0013, + "step": 60280 + }, + { + "epoch": 0.9865008590362432, + "grad_norm": 0.13225620985031128, + "learning_rate": 5.98400173633743e-06, + "loss": 0.0034, + "step": 60290 + }, + { + "epoch": 0.986664484987319, + "grad_norm": 0.08708301186561584, + "learning_rate": 5.982601715154879e-06, + "loss": 0.0022, + "step": 60300 + }, + { + "epoch": 0.9868281109383948, + "grad_norm": 0.0183541439473629, + "learning_rate": 5.981201613834675e-06, + "loss": 0.0022, + "step": 60310 + }, + { + "epoch": 0.9869917368894706, + "grad_norm": 0.014991115778684616, + "learning_rate": 5.9798014324910055e-06, + "loss": 0.0028, + "step": 60320 + }, + { + "epoch": 0.9871553628405465, + "grad_norm": 0.05471270531415939, + "learning_rate": 5.9784011712380666e-06, + "loss": 0.0016, + "step": 60330 + }, + { + "epoch": 0.9873189887916224, + "grad_norm": 0.06115105748176575, + "learning_rate": 5.977000830190055e-06, + "loss": 0.0015, + "step": 60340 + }, + { + "epoch": 0.9874826147426982, + "grad_norm": 0.02637716569006443, + "learning_rate": 5.97560040946118e-06, + "loss": 0.0018, + "step": 60350 + }, + { + "epoch": 0.987646240693774, + "grad_norm": 0.1435128152370453, + "learning_rate": 5.974199909165654e-06, + "loss": 0.0047, + "step": 60360 + }, + { + "epoch": 0.9878098666448498, + "grad_norm": 0.08010214567184448, + "learning_rate": 5.972799329417699e-06, + "loss": 0.0017, + "step": 60370 + }, + { + "epoch": 0.9879734925959257, + "grad_norm": 0.04892215505242348, + "learning_rate": 5.971398670331541e-06, + "loss": 0.0018, + "step": 60380 + }, + { + "epoch": 0.9881371185470016, + "grad_norm": 0.02184041030704975, + "learning_rate": 5.969997932021411e-06, + "loss": 0.0015, + "step": 60390 + }, + { + "epoch": 0.9883007444980774, + "grad_norm": 0.07250533252954483, + "learning_rate": 5.968597114601551e-06, + "loss": 0.0016, + "step": 60400 + }, + { + "epoch": 0.9884643704491533, + "grad_norm": 0.03990749269723892, + "learning_rate": 5.967196218186204e-06, + "loss": 0.0021, + "step": 60410 + }, + { + "epoch": 0.988627996400229, + "grad_norm": 0.07440242916345596, + "learning_rate": 5.965795242889626e-06, + "loss": 0.0018, + "step": 60420 + }, + { + "epoch": 0.9887916223513049, + "grad_norm": 0.016071777790784836, + "learning_rate": 5.964394188826074e-06, + "loss": 0.0013, + "step": 60430 + }, + { + "epoch": 0.9889552483023808, + "grad_norm": 0.049649856984615326, + "learning_rate": 5.962993056109812e-06, + "loss": 0.0018, + "step": 60440 + }, + { + "epoch": 0.9891188742534566, + "grad_norm": 0.028748657554388046, + "learning_rate": 5.961591844855112e-06, + "loss": 0.0017, + "step": 60450 + }, + { + "epoch": 0.9892825002045325, + "grad_norm": 0.06174496188759804, + "learning_rate": 5.960190555176255e-06, + "loss": 0.0015, + "step": 60460 + }, + { + "epoch": 0.9894461261556082, + "grad_norm": 0.10030088573694229, + "learning_rate": 5.9587891871875215e-06, + "loss": 0.0017, + "step": 60470 + }, + { + "epoch": 0.9896097521066841, + "grad_norm": 0.061132289469242096, + "learning_rate": 5.957387741003205e-06, + "loss": 0.0008, + "step": 60480 + }, + { + "epoch": 0.98977337805776, + "grad_norm": 0.04618103802204132, + "learning_rate": 5.955986216737602e-06, + "loss": 0.0025, + "step": 60490 + }, + { + "epoch": 0.9899370040088358, + "grad_norm": 0.1212206780910492, + "learning_rate": 5.954584614505015e-06, + "loss": 0.002, + "step": 60500 + }, + { + "epoch": 0.9901006299599117, + "grad_norm": 0.06300706416368484, + "learning_rate": 5.953182934419753e-06, + "loss": 0.0014, + "step": 60510 + }, + { + "epoch": 0.9902642559109874, + "grad_norm": 0.07991975545883179, + "learning_rate": 5.9517811765961365e-06, + "loss": 0.0017, + "step": 60520 + }, + { + "epoch": 0.9904278818620633, + "grad_norm": 0.015466952696442604, + "learning_rate": 5.950379341148484e-06, + "loss": 0.0021, + "step": 60530 + }, + { + "epoch": 0.9905915078131392, + "grad_norm": 0.010112815536558628, + "learning_rate": 5.948977428191126e-06, + "loss": 0.0017, + "step": 60540 + }, + { + "epoch": 0.990755133764215, + "grad_norm": 0.24883177876472473, + "learning_rate": 5.9475754378383985e-06, + "loss": 0.0024, + "step": 60550 + }, + { + "epoch": 0.9909187597152909, + "grad_norm": 0.03571660444140434, + "learning_rate": 5.946173370204642e-06, + "loss": 0.0017, + "step": 60560 + }, + { + "epoch": 0.9910823856663666, + "grad_norm": 0.06003880500793457, + "learning_rate": 5.944771225404204e-06, + "loss": 0.002, + "step": 60570 + }, + { + "epoch": 0.9912460116174425, + "grad_norm": 0.06538006663322449, + "learning_rate": 5.943369003551439e-06, + "loss": 0.0015, + "step": 60580 + }, + { + "epoch": 0.9914096375685184, + "grad_norm": 0.03284836187958717, + "learning_rate": 5.941966704760709e-06, + "loss": 0.0024, + "step": 60590 + }, + { + "epoch": 0.9915732635195942, + "grad_norm": 0.13619862496852875, + "learning_rate": 5.9405643291463775e-06, + "loss": 0.0033, + "step": 60600 + }, + { + "epoch": 0.9917368894706701, + "grad_norm": 0.014600131660699844, + "learning_rate": 5.939161876822821e-06, + "loss": 0.0022, + "step": 60610 + }, + { + "epoch": 0.9919005154217458, + "grad_norm": 0.06094701215624809, + "learning_rate": 5.937759347904417e-06, + "loss": 0.0017, + "step": 60620 + }, + { + "epoch": 0.9920641413728217, + "grad_norm": 0.12479595839977264, + "learning_rate": 5.93635674250555e-06, + "loss": 0.0019, + "step": 60630 + }, + { + "epoch": 0.9922277673238976, + "grad_norm": 0.0912921279668808, + "learning_rate": 5.934954060740614e-06, + "loss": 0.0022, + "step": 60640 + }, + { + "epoch": 0.9923913932749734, + "grad_norm": 0.05441417545080185, + "learning_rate": 5.9335513027240065e-06, + "loss": 0.0016, + "step": 60650 + }, + { + "epoch": 0.9925550192260493, + "grad_norm": 0.09336307644844055, + "learning_rate": 5.932148468570129e-06, + "loss": 0.0037, + "step": 60660 + }, + { + "epoch": 0.992718645177125, + "grad_norm": 0.0738450437784195, + "learning_rate": 5.930745558393396e-06, + "loss": 0.0016, + "step": 60670 + }, + { + "epoch": 0.9928822711282009, + "grad_norm": 0.01890706829726696, + "learning_rate": 5.92934257230822e-06, + "loss": 0.0011, + "step": 60680 + }, + { + "epoch": 0.9930458970792768, + "grad_norm": 0.05509449169039726, + "learning_rate": 5.927939510429026e-06, + "loss": 0.0029, + "step": 60690 + }, + { + "epoch": 0.9932095230303526, + "grad_norm": 0.15380942821502686, + "learning_rate": 5.926536372870243e-06, + "loss": 0.002, + "step": 60700 + }, + { + "epoch": 0.9933731489814285, + "grad_norm": 0.03707823529839516, + "learning_rate": 5.925133159746305e-06, + "loss": 0.0024, + "step": 60710 + }, + { + "epoch": 0.9935367749325043, + "grad_norm": 0.25121182203292847, + "learning_rate": 5.9237298711716565e-06, + "loss": 0.0017, + "step": 60720 + }, + { + "epoch": 0.9937004008835801, + "grad_norm": 0.047686394304037094, + "learning_rate": 5.92232650726074e-06, + "loss": 0.0023, + "step": 60730 + }, + { + "epoch": 0.993864026834656, + "grad_norm": 0.07264384627342224, + "learning_rate": 5.920923068128013e-06, + "loss": 0.0041, + "step": 60740 + }, + { + "epoch": 0.9940276527857318, + "grad_norm": 0.21187007427215576, + "learning_rate": 5.919519553887933e-06, + "loss": 0.0021, + "step": 60750 + }, + { + "epoch": 0.9941912787368077, + "grad_norm": 0.08301796764135361, + "learning_rate": 5.9181159646549666e-06, + "loss": 0.0027, + "step": 60760 + }, + { + "epoch": 0.9943549046878835, + "grad_norm": 0.04313843324780464, + "learning_rate": 5.916712300543586e-06, + "loss": 0.0022, + "step": 60770 + }, + { + "epoch": 0.9945185306389593, + "grad_norm": 0.08518585562705994, + "learning_rate": 5.915308561668269e-06, + "loss": 0.001, + "step": 60780 + }, + { + "epoch": 0.9946821565900352, + "grad_norm": 0.023523874580860138, + "learning_rate": 5.913904748143501e-06, + "loss": 0.0036, + "step": 60790 + }, + { + "epoch": 0.994845782541111, + "grad_norm": 0.14175419509410858, + "learning_rate": 5.91250086008377e-06, + "loss": 0.0025, + "step": 60800 + }, + { + "epoch": 0.9950094084921869, + "grad_norm": 0.16313813626766205, + "learning_rate": 5.911096897603573e-06, + "loss": 0.0025, + "step": 60810 + }, + { + "epoch": 0.9951730344432627, + "grad_norm": 0.021661773324012756, + "learning_rate": 5.909692860817413e-06, + "loss": 0.0009, + "step": 60820 + }, + { + "epoch": 0.9953366603943385, + "grad_norm": 0.1231006383895874, + "learning_rate": 5.908288749839799e-06, + "loss": 0.0026, + "step": 60830 + }, + { + "epoch": 0.9955002863454144, + "grad_norm": 0.027026686817407608, + "learning_rate": 5.906884564785245e-06, + "loss": 0.0014, + "step": 60840 + }, + { + "epoch": 0.9956639122964902, + "grad_norm": 0.026886874809861183, + "learning_rate": 5.90548030576827e-06, + "loss": 0.0018, + "step": 60850 + }, + { + "epoch": 0.9958275382475661, + "grad_norm": 0.07238604873418808, + "learning_rate": 5.9040759729034034e-06, + "loss": 0.0031, + "step": 60860 + }, + { + "epoch": 0.9959911641986419, + "grad_norm": 0.08987830579280853, + "learning_rate": 5.902671566305177e-06, + "loss": 0.0019, + "step": 60870 + }, + { + "epoch": 0.9961547901497178, + "grad_norm": 0.09531639516353607, + "learning_rate": 5.901267086088127e-06, + "loss": 0.0016, + "step": 60880 + }, + { + "epoch": 0.9963184161007936, + "grad_norm": 0.013638807460665703, + "learning_rate": 5.899862532366801e-06, + "loss": 0.0011, + "step": 60890 + }, + { + "epoch": 0.9964820420518694, + "grad_norm": 0.24744060635566711, + "learning_rate": 5.8984579052557485e-06, + "loss": 0.0014, + "step": 60900 + }, + { + "epoch": 0.9966456680029453, + "grad_norm": 0.008690830320119858, + "learning_rate": 5.897053204869526e-06, + "loss": 0.0025, + "step": 60910 + }, + { + "epoch": 0.9968092939540211, + "grad_norm": 0.016616791486740112, + "learning_rate": 5.895648431322696e-06, + "loss": 0.0016, + "step": 60920 + }, + { + "epoch": 0.996972919905097, + "grad_norm": 0.08524341136217117, + "learning_rate": 5.894243584729827e-06, + "loss": 0.0029, + "step": 60930 + }, + { + "epoch": 0.9971365458561728, + "grad_norm": 0.07097870856523514, + "learning_rate": 5.892838665205493e-06, + "loss": 0.0021, + "step": 60940 + }, + { + "epoch": 0.9973001718072486, + "grad_norm": 0.0369216650724411, + "learning_rate": 5.891433672864276e-06, + "loss": 0.002, + "step": 60950 + }, + { + "epoch": 0.9974637977583245, + "grad_norm": 0.06325913220643997, + "learning_rate": 5.890028607820761e-06, + "loss": 0.0036, + "step": 60960 + }, + { + "epoch": 0.9976274237094003, + "grad_norm": 0.128708153963089, + "learning_rate": 5.888623470189542e-06, + "loss": 0.0011, + "step": 60970 + }, + { + "epoch": 0.9977910496604762, + "grad_norm": 0.08021515607833862, + "learning_rate": 5.887218260085216e-06, + "loss": 0.0024, + "step": 60980 + }, + { + "epoch": 0.997954675611552, + "grad_norm": 0.052398569881916046, + "learning_rate": 5.885812977622386e-06, + "loss": 0.0034, + "step": 60990 + }, + { + "epoch": 0.9981183015626278, + "grad_norm": 0.12271636724472046, + "learning_rate": 5.8844076229156645e-06, + "loss": 0.0013, + "step": 61000 + }, + { + "epoch": 0.9981183015626278, + "eval_loss": 0.0012876364635303617, + "eval_runtime": 5.3865, + "eval_samples_per_second": 37.13, + "eval_steps_per_second": 9.282, + "step": 61000 + }, + { + "epoch": 0.9982819275137037, + "grad_norm": 0.06839025020599365, + "learning_rate": 5.883002196079665e-06, + "loss": 0.0014, + "step": 61010 + }, + { + "epoch": 0.9984455534647795, + "grad_norm": 0.04630940407514572, + "learning_rate": 5.881596697229011e-06, + "loss": 0.0017, + "step": 61020 + }, + { + "epoch": 0.9986091794158554, + "grad_norm": 0.0540347583591938, + "learning_rate": 5.880191126478332e-06, + "loss": 0.0014, + "step": 61030 + }, + { + "epoch": 0.9987728053669312, + "grad_norm": 0.020785707980394363, + "learning_rate": 5.8787854839422576e-06, + "loss": 0.0013, + "step": 61040 + }, + { + "epoch": 0.998936431318007, + "grad_norm": 0.05198941007256508, + "learning_rate": 5.877379769735429e-06, + "loss": 0.0012, + "step": 61050 + }, + { + "epoch": 0.9991000572690829, + "grad_norm": 0.008295542560517788, + "learning_rate": 5.8759739839724916e-06, + "loss": 0.0008, + "step": 61060 + }, + { + "epoch": 0.9992636832201587, + "grad_norm": 0.13869978487491608, + "learning_rate": 5.8745681267680965e-06, + "loss": 0.0011, + "step": 61070 + }, + { + "epoch": 0.9994273091712346, + "grad_norm": 0.03864474222064018, + "learning_rate": 5.873162198236901e-06, + "loss": 0.0019, + "step": 61080 + }, + { + "epoch": 0.9995909351223105, + "grad_norm": 0.051644012331962585, + "learning_rate": 5.871756198493567e-06, + "loss": 0.002, + "step": 61090 + }, + { + "epoch": 0.9997545610733862, + "grad_norm": 0.029904983937740326, + "learning_rate": 5.870350127652763e-06, + "loss": 0.001, + "step": 61100 + }, + { + "epoch": 0.9999181870244621, + "grad_norm": 0.03249731659889221, + "learning_rate": 5.868943985829163e-06, + "loss": 0.0015, + "step": 61110 + }, + { + "epoch": 1.000081812975538, + "grad_norm": 0.059132978320121765, + "learning_rate": 5.86753777313745e-06, + "loss": 0.0014, + "step": 61120 + }, + { + "epoch": 1.0002454389266138, + "grad_norm": 0.10480040311813354, + "learning_rate": 5.866131489692306e-06, + "loss": 0.0018, + "step": 61130 + }, + { + "epoch": 1.0004090648776895, + "grad_norm": 0.08471012115478516, + "learning_rate": 5.864725135608426e-06, + "loss": 0.0015, + "step": 61140 + }, + { + "epoch": 1.0005726908287655, + "grad_norm": 0.026128077879548073, + "learning_rate": 5.863318711000505e-06, + "loss": 0.0036, + "step": 61150 + }, + { + "epoch": 1.0007363167798413, + "grad_norm": 0.0371905192732811, + "learning_rate": 5.861912215983247e-06, + "loss": 0.0007, + "step": 61160 + }, + { + "epoch": 1.000899942730917, + "grad_norm": 0.051652416586875916, + "learning_rate": 5.860505650671362e-06, + "loss": 0.0015, + "step": 61170 + }, + { + "epoch": 1.0010635686819929, + "grad_norm": 0.05548330768942833, + "learning_rate": 5.859099015179563e-06, + "loss": 0.0012, + "step": 61180 + }, + { + "epoch": 1.0012271946330689, + "grad_norm": 0.028802618384361267, + "learning_rate": 5.8576923096225715e-06, + "loss": 0.0012, + "step": 61190 + }, + { + "epoch": 1.0013908205841446, + "grad_norm": 0.028903771191835403, + "learning_rate": 5.856285534115114e-06, + "loss": 0.0021, + "step": 61200 + }, + { + "epoch": 1.0015544465352204, + "grad_norm": 0.21797987818717957, + "learning_rate": 5.854878688771921e-06, + "loss": 0.0026, + "step": 61210 + }, + { + "epoch": 1.0017180724862964, + "grad_norm": 0.09580961614847183, + "learning_rate": 5.853471773707731e-06, + "loss": 0.0013, + "step": 61220 + }, + { + "epoch": 1.0018816984373722, + "grad_norm": 0.05054014176130295, + "learning_rate": 5.852064789037286e-06, + "loss": 0.0007, + "step": 61230 + }, + { + "epoch": 1.002045324388448, + "grad_norm": 0.04053964093327522, + "learning_rate": 5.8506577348753365e-06, + "loss": 0.0019, + "step": 61240 + }, + { + "epoch": 1.002208950339524, + "grad_norm": 0.07920017838478088, + "learning_rate": 5.849250611336637e-06, + "loss": 0.0025, + "step": 61250 + }, + { + "epoch": 1.0023725762905997, + "grad_norm": 0.03208623826503754, + "learning_rate": 5.847843418535946e-06, + "loss": 0.0007, + "step": 61260 + }, + { + "epoch": 1.0025362022416755, + "grad_norm": 0.0023011798039078712, + "learning_rate": 5.846436156588031e-06, + "loss": 0.0008, + "step": 61270 + }, + { + "epoch": 1.0026998281927513, + "grad_norm": 0.027438288554549217, + "learning_rate": 5.845028825607663e-06, + "loss": 0.0018, + "step": 61280 + }, + { + "epoch": 1.0028634541438273, + "grad_norm": 0.017761297523975372, + "learning_rate": 5.843621425709618e-06, + "loss": 0.0018, + "step": 61290 + }, + { + "epoch": 1.003027080094903, + "grad_norm": 0.12389422208070755, + "learning_rate": 5.84221395700868e-06, + "loss": 0.0026, + "step": 61300 + }, + { + "epoch": 1.0031907060459788, + "grad_norm": 0.030440891161561012, + "learning_rate": 5.840806419619638e-06, + "loss": 0.0009, + "step": 61310 + }, + { + "epoch": 1.0033543319970548, + "grad_norm": 0.035298582166433334, + "learning_rate": 5.839398813657284e-06, + "loss": 0.0012, + "step": 61320 + }, + { + "epoch": 1.0035179579481306, + "grad_norm": 0.06256187707185745, + "learning_rate": 5.83799113923642e-06, + "loss": 0.0015, + "step": 61330 + }, + { + "epoch": 1.0036815838992064, + "grad_norm": 0.05947040766477585, + "learning_rate": 5.836583396471849e-06, + "loss": 0.002, + "step": 61340 + }, + { + "epoch": 1.0038452098502824, + "grad_norm": 0.022738659754395485, + "learning_rate": 5.835175585478383e-06, + "loss": 0.0026, + "step": 61350 + }, + { + "epoch": 1.0040088358013581, + "grad_norm": 0.07788850367069244, + "learning_rate": 5.8337677063708364e-06, + "loss": 0.0017, + "step": 61360 + }, + { + "epoch": 1.004172461752434, + "grad_norm": 0.04353642091155052, + "learning_rate": 5.832359759264034e-06, + "loss": 0.0018, + "step": 61370 + }, + { + "epoch": 1.0043360877035097, + "grad_norm": 0.07055719196796417, + "learning_rate": 5.830951744272801e-06, + "loss": 0.0015, + "step": 61380 + }, + { + "epoch": 1.0044997136545857, + "grad_norm": 0.022255435585975647, + "learning_rate": 5.829543661511972e-06, + "loss": 0.0039, + "step": 61390 + }, + { + "epoch": 1.0046633396056615, + "grad_norm": 0.05845420062541962, + "learning_rate": 5.828135511096382e-06, + "loss": 0.0018, + "step": 61400 + }, + { + "epoch": 1.0048269655567372, + "grad_norm": 0.12708251178264618, + "learning_rate": 5.826727293140879e-06, + "loss": 0.0015, + "step": 61410 + }, + { + "epoch": 1.0049905915078132, + "grad_norm": 0.03990639001131058, + "learning_rate": 5.82531900776031e-06, + "loss": 0.0014, + "step": 61420 + }, + { + "epoch": 1.005154217458889, + "grad_norm": 0.09444354474544525, + "learning_rate": 5.823910655069531e-06, + "loss": 0.0019, + "step": 61430 + }, + { + "epoch": 1.0053178434099648, + "grad_norm": 0.07281923294067383, + "learning_rate": 5.822502235183402e-06, + "loss": 0.0018, + "step": 61440 + }, + { + "epoch": 1.0054814693610408, + "grad_norm": 0.02620009332895279, + "learning_rate": 5.821093748216789e-06, + "loss": 0.003, + "step": 61450 + }, + { + "epoch": 1.0056450953121165, + "grad_norm": 0.11269406229257584, + "learning_rate": 5.819685194284563e-06, + "loss": 0.0015, + "step": 61460 + }, + { + "epoch": 1.0058087212631923, + "grad_norm": 0.21514850854873657, + "learning_rate": 5.818276573501602e-06, + "loss": 0.0015, + "step": 61470 + }, + { + "epoch": 1.005972347214268, + "grad_norm": 0.16029570996761322, + "learning_rate": 5.816867885982787e-06, + "loss": 0.0011, + "step": 61480 + }, + { + "epoch": 1.006135973165344, + "grad_norm": 0.12328983843326569, + "learning_rate": 5.815459131843008e-06, + "loss": 0.0012, + "step": 61490 + }, + { + "epoch": 1.0062995991164199, + "grad_norm": 0.2211940586566925, + "learning_rate": 5.814050311197156e-06, + "loss": 0.0028, + "step": 61500 + }, + { + "epoch": 1.0064632250674956, + "grad_norm": 0.04661479964852333, + "learning_rate": 5.81264142416013e-06, + "loss": 0.002, + "step": 61510 + }, + { + "epoch": 1.0066268510185716, + "grad_norm": 0.04160916805267334, + "learning_rate": 5.811232470846834e-06, + "loss": 0.0014, + "step": 61520 + }, + { + "epoch": 1.0067904769696474, + "grad_norm": 0.05628017336130142, + "learning_rate": 5.809823451372178e-06, + "loss": 0.0011, + "step": 61530 + }, + { + "epoch": 1.0069541029207232, + "grad_norm": 0.00382031942717731, + "learning_rate": 5.808414365851076e-06, + "loss": 0.0018, + "step": 61540 + }, + { + "epoch": 1.007117728871799, + "grad_norm": 0.01784886233508587, + "learning_rate": 5.80700521439845e-06, + "loss": 0.0008, + "step": 61550 + }, + { + "epoch": 1.007281354822875, + "grad_norm": 0.013976208865642548, + "learning_rate": 5.805595997129223e-06, + "loss": 0.001, + "step": 61560 + }, + { + "epoch": 1.0074449807739507, + "grad_norm": 0.04486384615302086, + "learning_rate": 5.804186714158328e-06, + "loss": 0.002, + "step": 61570 + }, + { + "epoch": 1.0076086067250265, + "grad_norm": 0.21445812284946442, + "learning_rate": 5.802777365600701e-06, + "loss": 0.0017, + "step": 61580 + }, + { + "epoch": 1.0077722326761025, + "grad_norm": 0.05751577764749527, + "learning_rate": 5.801367951571283e-06, + "loss": 0.0024, + "step": 61590 + }, + { + "epoch": 1.0079358586271783, + "grad_norm": 0.03802460804581642, + "learning_rate": 5.799958472185021e-06, + "loss": 0.0009, + "step": 61600 + }, + { + "epoch": 1.008099484578254, + "grad_norm": 0.023952007293701172, + "learning_rate": 5.798548927556869e-06, + "loss": 0.0015, + "step": 61610 + }, + { + "epoch": 1.00826311052933, + "grad_norm": 0.042315080761909485, + "learning_rate": 5.797139317801782e-06, + "loss": 0.002, + "step": 61620 + }, + { + "epoch": 1.0084267364804058, + "grad_norm": 0.02831646241247654, + "learning_rate": 5.795729643034725e-06, + "loss": 0.001, + "step": 61630 + }, + { + "epoch": 1.0085903624314816, + "grad_norm": 0.029023466631770134, + "learning_rate": 5.7943199033706655e-06, + "loss": 0.0009, + "step": 61640 + }, + { + "epoch": 1.0087539883825574, + "grad_norm": 0.16637486219406128, + "learning_rate": 5.792910098924577e-06, + "loss": 0.0033, + "step": 61650 + }, + { + "epoch": 1.0089176143336334, + "grad_norm": 0.06188393011689186, + "learning_rate": 5.791500229811438e-06, + "loss": 0.0012, + "step": 61660 + }, + { + "epoch": 1.0090812402847091, + "grad_norm": 0.01722871884703636, + "learning_rate": 5.7900902961462345e-06, + "loss": 0.0024, + "step": 61670 + }, + { + "epoch": 1.009244866235785, + "grad_norm": 0.051801424473524094, + "learning_rate": 5.788680298043954e-06, + "loss": 0.0012, + "step": 61680 + }, + { + "epoch": 1.009408492186861, + "grad_norm": 0.025072526186704636, + "learning_rate": 5.787270235619593e-06, + "loss": 0.0014, + "step": 61690 + }, + { + "epoch": 1.0095721181379367, + "grad_norm": 0.042015112936496735, + "learning_rate": 5.785860108988148e-06, + "loss": 0.002, + "step": 61700 + }, + { + "epoch": 1.0097357440890125, + "grad_norm": 0.04066444933414459, + "learning_rate": 5.7844499182646276e-06, + "loss": 0.0016, + "step": 61710 + }, + { + "epoch": 1.0098993700400885, + "grad_norm": 0.060358818620443344, + "learning_rate": 5.783039663564041e-06, + "loss": 0.0013, + "step": 61720 + }, + { + "epoch": 1.0100629959911642, + "grad_norm": 0.08476940542459488, + "learning_rate": 5.781629345001402e-06, + "loss": 0.0022, + "step": 61730 + }, + { + "epoch": 1.01022662194224, + "grad_norm": 0.05279752239584923, + "learning_rate": 5.780218962691734e-06, + "loss": 0.0007, + "step": 61740 + }, + { + "epoch": 1.0103902478933158, + "grad_norm": 0.061872582882642746, + "learning_rate": 5.778808516750061e-06, + "loss": 0.002, + "step": 61750 + }, + { + "epoch": 1.0105538738443918, + "grad_norm": 0.04641154035925865, + "learning_rate": 5.777398007291416e-06, + "loss": 0.0011, + "step": 61760 + }, + { + "epoch": 1.0107174997954675, + "grad_norm": 0.0397113673388958, + "learning_rate": 5.775987434430834e-06, + "loss": 0.0017, + "step": 61770 + }, + { + "epoch": 1.0108811257465433, + "grad_norm": 0.02313457429409027, + "learning_rate": 5.7745767982833576e-06, + "loss": 0.0011, + "step": 61780 + }, + { + "epoch": 1.0110447516976193, + "grad_norm": 0.04160955920815468, + "learning_rate": 5.7731660989640324e-06, + "loss": 0.0011, + "step": 61790 + }, + { + "epoch": 1.011208377648695, + "grad_norm": 0.03348110616207123, + "learning_rate": 5.7717553365879105e-06, + "loss": 0.0008, + "step": 61800 + }, + { + "epoch": 1.0113720035997709, + "grad_norm": 0.10487332940101624, + "learning_rate": 5.770344511270049e-06, + "loss": 0.0016, + "step": 61810 + }, + { + "epoch": 1.0115356295508469, + "grad_norm": 0.06869300454854965, + "learning_rate": 5.768933623125511e-06, + "loss": 0.0006, + "step": 61820 + }, + { + "epoch": 1.0116992555019226, + "grad_norm": 0.03205036744475365, + "learning_rate": 5.767522672269362e-06, + "loss": 0.0011, + "step": 61830 + }, + { + "epoch": 1.0118628814529984, + "grad_norm": 0.08402548730373383, + "learning_rate": 5.766111658816676e-06, + "loss": 0.0013, + "step": 61840 + }, + { + "epoch": 1.0120265074040742, + "grad_norm": 0.0778636708855629, + "learning_rate": 5.7647005828825285e-06, + "loss": 0.001, + "step": 61850 + }, + { + "epoch": 1.0121901333551502, + "grad_norm": 0.09324698895215988, + "learning_rate": 5.763289444582005e-06, + "loss": 0.0015, + "step": 61860 + }, + { + "epoch": 1.012353759306226, + "grad_norm": 0.027487050741910934, + "learning_rate": 5.761878244030189e-06, + "loss": 0.0015, + "step": 61870 + }, + { + "epoch": 1.0125173852573017, + "grad_norm": 0.051138538867235184, + "learning_rate": 5.7604669813421765e-06, + "loss": 0.0021, + "step": 61880 + }, + { + "epoch": 1.0126810112083777, + "grad_norm": 0.08442410081624985, + "learning_rate": 5.7590556566330645e-06, + "loss": 0.0015, + "step": 61890 + }, + { + "epoch": 1.0128446371594535, + "grad_norm": 0.0564834401011467, + "learning_rate": 5.757644270017956e-06, + "loss": 0.002, + "step": 61900 + }, + { + "epoch": 1.0130082631105293, + "grad_norm": 0.056608185172080994, + "learning_rate": 5.756232821611958e-06, + "loss": 0.0018, + "step": 61910 + }, + { + "epoch": 1.0131718890616053, + "grad_norm": 0.26579341292381287, + "learning_rate": 5.7548213115301845e-06, + "loss": 0.0013, + "step": 61920 + }, + { + "epoch": 1.013335515012681, + "grad_norm": 0.023259269073605537, + "learning_rate": 5.753409739887753e-06, + "loss": 0.0012, + "step": 61930 + }, + { + "epoch": 1.0134991409637568, + "grad_norm": 0.08676396310329437, + "learning_rate": 5.751998106799786e-06, + "loss": 0.0028, + "step": 61940 + }, + { + "epoch": 1.0136627669148326, + "grad_norm": 0.010002187453210354, + "learning_rate": 5.750586412381413e-06, + "loss": 0.0011, + "step": 61950 + }, + { + "epoch": 1.0138263928659086, + "grad_norm": 0.23325270414352417, + "learning_rate": 5.749174656747764e-06, + "loss": 0.0016, + "step": 61960 + }, + { + "epoch": 1.0139900188169844, + "grad_norm": 0.07708834856748581, + "learning_rate": 5.7477628400139796e-06, + "loss": 0.0014, + "step": 61970 + }, + { + "epoch": 1.0141536447680601, + "grad_norm": 0.14373217523097992, + "learning_rate": 5.746350962295203e-06, + "loss": 0.001, + "step": 61980 + }, + { + "epoch": 1.0143172707191361, + "grad_norm": 0.01857146807014942, + "learning_rate": 5.74493902370658e-06, + "loss": 0.0015, + "step": 61990 + }, + { + "epoch": 1.014480896670212, + "grad_norm": 0.09846285730600357, + "learning_rate": 5.743527024363266e-06, + "loss": 0.0026, + "step": 62000 + }, + { + "epoch": 1.0146445226212877, + "grad_norm": 0.04691923037171364, + "learning_rate": 5.742114964380418e-06, + "loss": 0.0009, + "step": 62010 + }, + { + "epoch": 1.0148081485723637, + "grad_norm": 0.08016195893287659, + "learning_rate": 5.740702843873197e-06, + "loss": 0.0022, + "step": 62020 + }, + { + "epoch": 1.0149717745234395, + "grad_norm": 0.15680751204490662, + "learning_rate": 5.7392906629567725e-06, + "loss": 0.0018, + "step": 62030 + }, + { + "epoch": 1.0151354004745152, + "grad_norm": 0.08605265617370605, + "learning_rate": 5.737878421746317e-06, + "loss": 0.0014, + "step": 62040 + }, + { + "epoch": 1.015299026425591, + "grad_norm": 0.04326837137341499, + "learning_rate": 5.736466120357008e-06, + "loss": 0.0012, + "step": 62050 + }, + { + "epoch": 1.015462652376667, + "grad_norm": 0.08900121599435806, + "learning_rate": 5.735053758904028e-06, + "loss": 0.0019, + "step": 62060 + }, + { + "epoch": 1.0156262783277428, + "grad_norm": 0.09054692089557648, + "learning_rate": 5.733641337502563e-06, + "loss": 0.0015, + "step": 62070 + }, + { + "epoch": 1.0157899042788185, + "grad_norm": 0.0965246707201004, + "learning_rate": 5.732228856267808e-06, + "loss": 0.0013, + "step": 62080 + }, + { + "epoch": 1.0159535302298945, + "grad_norm": 0.2699553370475769, + "learning_rate": 5.730816315314958e-06, + "loss": 0.001, + "step": 62090 + }, + { + "epoch": 1.0161171561809703, + "grad_norm": 0.1329885870218277, + "learning_rate": 5.729403714759216e-06, + "loss": 0.0016, + "step": 62100 + }, + { + "epoch": 1.016280782132046, + "grad_norm": 0.05228469893336296, + "learning_rate": 5.727991054715789e-06, + "loss": 0.0023, + "step": 62110 + }, + { + "epoch": 1.016444408083122, + "grad_norm": 0.03670582175254822, + "learning_rate": 5.726578335299887e-06, + "loss": 0.0017, + "step": 62120 + }, + { + "epoch": 1.0166080340341979, + "grad_norm": 0.08487051725387573, + "learning_rate": 5.72516555662673e-06, + "loss": 0.0011, + "step": 62130 + }, + { + "epoch": 1.0167716599852736, + "grad_norm": 0.05869929492473602, + "learning_rate": 5.723752718811536e-06, + "loss": 0.002, + "step": 62140 + }, + { + "epoch": 1.0169352859363494, + "grad_norm": 0.03334204852581024, + "learning_rate": 5.722339821969534e-06, + "loss": 0.0022, + "step": 62150 + }, + { + "epoch": 1.0170989118874254, + "grad_norm": 0.028556037694215775, + "learning_rate": 5.720926866215952e-06, + "loss": 0.0007, + "step": 62160 + }, + { + "epoch": 1.0172625378385012, + "grad_norm": 0.03512772172689438, + "learning_rate": 5.719513851666028e-06, + "loss": 0.0014, + "step": 62170 + }, + { + "epoch": 1.017426163789577, + "grad_norm": 0.10795187950134277, + "learning_rate": 5.718100778435004e-06, + "loss": 0.0014, + "step": 62180 + }, + { + "epoch": 1.017589789740653, + "grad_norm": 0.06372316181659698, + "learning_rate": 5.716687646638122e-06, + "loss": 0.0017, + "step": 62190 + }, + { + "epoch": 1.0177534156917287, + "grad_norm": 0.0025340444408357143, + "learning_rate": 5.7152744563906345e-06, + "loss": 0.0019, + "step": 62200 + }, + { + "epoch": 1.0179170416428045, + "grad_norm": 0.017589906230568886, + "learning_rate": 5.713861207807796e-06, + "loss": 0.0025, + "step": 62210 + }, + { + "epoch": 1.0180806675938805, + "grad_norm": 0.05124212056398392, + "learning_rate": 5.712447901004865e-06, + "loss": 0.0016, + "step": 62220 + }, + { + "epoch": 1.0182442935449563, + "grad_norm": 0.5123308897018433, + "learning_rate": 5.711034536097109e-06, + "loss": 0.0021, + "step": 62230 + }, + { + "epoch": 1.018407919496032, + "grad_norm": 0.10106267035007477, + "learning_rate": 5.709621113199795e-06, + "loss": 0.0018, + "step": 62240 + }, + { + "epoch": 1.0185715454471078, + "grad_norm": 0.18311373889446259, + "learning_rate": 5.708207632428195e-06, + "loss": 0.0013, + "step": 62250 + }, + { + "epoch": 1.0187351713981838, + "grad_norm": 0.014049588702619076, + "learning_rate": 5.706794093897593e-06, + "loss": 0.0022, + "step": 62260 + }, + { + "epoch": 1.0188987973492596, + "grad_norm": 0.22256989777088165, + "learning_rate": 5.705380497723268e-06, + "loss": 0.0015, + "step": 62270 + }, + { + "epoch": 1.0190624233003354, + "grad_norm": 0.0713861808180809, + "learning_rate": 5.7039668440205096e-06, + "loss": 0.0015, + "step": 62280 + }, + { + "epoch": 1.0192260492514114, + "grad_norm": 0.0895313248038292, + "learning_rate": 5.702553132904611e-06, + "loss": 0.0017, + "step": 62290 + }, + { + "epoch": 1.0193896752024871, + "grad_norm": 0.09305835515260696, + "learning_rate": 5.701139364490869e-06, + "loss": 0.0019, + "step": 62300 + }, + { + "epoch": 1.019553301153563, + "grad_norm": 0.007383860647678375, + "learning_rate": 5.699725538894586e-06, + "loss": 0.0012, + "step": 62310 + }, + { + "epoch": 1.019716927104639, + "grad_norm": 0.03426968306303024, + "learning_rate": 5.698311656231068e-06, + "loss": 0.0024, + "step": 62320 + }, + { + "epoch": 1.0198805530557147, + "grad_norm": 0.08224494010210037, + "learning_rate": 5.696897716615629e-06, + "loss": 0.0019, + "step": 62330 + }, + { + "epoch": 1.0200441790067905, + "grad_norm": 0.007065152749419212, + "learning_rate": 5.695483720163582e-06, + "loss": 0.0026, + "step": 62340 + }, + { + "epoch": 1.0202078049578662, + "grad_norm": 0.10706252604722977, + "learning_rate": 5.694069666990249e-06, + "loss": 0.0016, + "step": 62350 + }, + { + "epoch": 1.0203714309089422, + "grad_norm": 0.05247305706143379, + "learning_rate": 5.6926555572109554e-06, + "loss": 0.0009, + "step": 62360 + }, + { + "epoch": 1.020535056860018, + "grad_norm": 0.04206337779760361, + "learning_rate": 5.691241390941031e-06, + "loss": 0.0018, + "step": 62370 + }, + { + "epoch": 1.0206986828110938, + "grad_norm": 0.017959145829081535, + "learning_rate": 5.689827168295811e-06, + "loss": 0.0022, + "step": 62380 + }, + { + "epoch": 1.0208623087621698, + "grad_norm": 0.09433262050151825, + "learning_rate": 5.688412889390633e-06, + "loss": 0.0019, + "step": 62390 + }, + { + "epoch": 1.0210259347132455, + "grad_norm": 0.03946654871106148, + "learning_rate": 5.686998554340843e-06, + "loss": 0.0027, + "step": 62400 + }, + { + "epoch": 1.0211895606643213, + "grad_norm": 0.029379980638623238, + "learning_rate": 5.685584163261788e-06, + "loss": 0.0016, + "step": 62410 + }, + { + "epoch": 1.0213531866153973, + "grad_norm": 0.014345060102641582, + "learning_rate": 5.6841697162688216e-06, + "loss": 0.0022, + "step": 62420 + }, + { + "epoch": 1.021516812566473, + "grad_norm": 0.09723767638206482, + "learning_rate": 5.6827552134773015e-06, + "loss": 0.0034, + "step": 62430 + }, + { + "epoch": 1.0216804385175489, + "grad_norm": 0.06897973269224167, + "learning_rate": 5.681340655002588e-06, + "loss": 0.0013, + "step": 62440 + }, + { + "epoch": 1.0218440644686246, + "grad_norm": 0.10360734909772873, + "learning_rate": 5.679926040960048e-06, + "loss": 0.0024, + "step": 62450 + }, + { + "epoch": 1.0220076904197006, + "grad_norm": 0.36921411752700806, + "learning_rate": 5.678511371465056e-06, + "loss": 0.0051, + "step": 62460 + }, + { + "epoch": 1.0221713163707764, + "grad_norm": 0.15142609179019928, + "learning_rate": 5.677096646632983e-06, + "loss": 0.0018, + "step": 62470 + }, + { + "epoch": 1.0223349423218522, + "grad_norm": 0.11854346096515656, + "learning_rate": 5.675681866579211e-06, + "loss": 0.0015, + "step": 62480 + }, + { + "epoch": 1.0224985682729282, + "grad_norm": 0.009892048314213753, + "learning_rate": 5.674267031419125e-06, + "loss": 0.0016, + "step": 62490 + }, + { + "epoch": 1.022662194224004, + "grad_norm": 0.05725774168968201, + "learning_rate": 5.672852141268115e-06, + "loss": 0.0011, + "step": 62500 + }, + { + "epoch": 1.0228258201750797, + "grad_norm": 0.02648870460689068, + "learning_rate": 5.671437196241572e-06, + "loss": 0.0016, + "step": 62510 + }, + { + "epoch": 1.0229894461261555, + "grad_norm": 0.03140348568558693, + "learning_rate": 5.670022196454898e-06, + "loss": 0.0011, + "step": 62520 + }, + { + "epoch": 1.0231530720772315, + "grad_norm": 0.1427689641714096, + "learning_rate": 5.668607142023491e-06, + "loss": 0.0013, + "step": 62530 + }, + { + "epoch": 1.0233166980283073, + "grad_norm": 0.25043416023254395, + "learning_rate": 5.667192033062762e-06, + "loss": 0.0017, + "step": 62540 + }, + { + "epoch": 1.023480323979383, + "grad_norm": 0.025813426822423935, + "learning_rate": 5.66577686968812e-06, + "loss": 0.001, + "step": 62550 + }, + { + "epoch": 1.023643949930459, + "grad_norm": 0.09358217567205429, + "learning_rate": 5.664361652014981e-06, + "loss": 0.0019, + "step": 62560 + }, + { + "epoch": 1.0238075758815348, + "grad_norm": 0.12662622332572937, + "learning_rate": 5.662946380158767e-06, + "loss": 0.0012, + "step": 62570 + }, + { + "epoch": 1.0239712018326106, + "grad_norm": 0.061406973749399185, + "learning_rate": 5.661531054234901e-06, + "loss": 0.0018, + "step": 62580 + }, + { + "epoch": 1.0241348277836866, + "grad_norm": 0.030233286321163177, + "learning_rate": 5.6601156743588135e-06, + "loss": 0.0008, + "step": 62590 + }, + { + "epoch": 1.0242984537347624, + "grad_norm": 0.024516692385077477, + "learning_rate": 5.658700240645938e-06, + "loss": 0.0017, + "step": 62600 + }, + { + "epoch": 1.0244620796858381, + "grad_norm": 0.20729252696037292, + "learning_rate": 5.657284753211712e-06, + "loss": 0.0022, + "step": 62610 + }, + { + "epoch": 1.024625705636914, + "grad_norm": 0.02973029948771, + "learning_rate": 5.655869212171577e-06, + "loss": 0.0009, + "step": 62620 + }, + { + "epoch": 1.02478933158799, + "grad_norm": 0.05634833872318268, + "learning_rate": 5.65445361764098e-06, + "loss": 0.001, + "step": 62630 + }, + { + "epoch": 1.0249529575390657, + "grad_norm": 0.02505139261484146, + "learning_rate": 5.6530379697353736e-06, + "loss": 0.0011, + "step": 62640 + }, + { + "epoch": 1.0251165834901415, + "grad_norm": 0.05938884988427162, + "learning_rate": 5.651622268570212e-06, + "loss": 0.0015, + "step": 62650 + }, + { + "epoch": 1.0252802094412174, + "grad_norm": 0.06533920019865036, + "learning_rate": 5.650206514260957e-06, + "loss": 0.0037, + "step": 62660 + }, + { + "epoch": 1.0254438353922932, + "grad_norm": 0.00967687088996172, + "learning_rate": 5.6487907069230685e-06, + "loss": 0.0023, + "step": 62670 + }, + { + "epoch": 1.025607461343369, + "grad_norm": 0.04941129311919212, + "learning_rate": 5.647374846672018e-06, + "loss": 0.0012, + "step": 62680 + }, + { + "epoch": 1.025771087294445, + "grad_norm": 0.14057283103466034, + "learning_rate": 5.645958933623277e-06, + "loss": 0.0009, + "step": 62690 + }, + { + "epoch": 1.0259347132455208, + "grad_norm": 0.07917727530002594, + "learning_rate": 5.644542967892323e-06, + "loss": 0.0023, + "step": 62700 + }, + { + "epoch": 1.0260983391965965, + "grad_norm": 0.06364534795284271, + "learning_rate": 5.643126949594639e-06, + "loss": 0.0016, + "step": 62710 + }, + { + "epoch": 1.0262619651476723, + "grad_norm": 0.03275671228766441, + "learning_rate": 5.641710878845708e-06, + "loss": 0.0011, + "step": 62720 + }, + { + "epoch": 1.0264255910987483, + "grad_norm": 0.028730599209666252, + "learning_rate": 5.640294755761021e-06, + "loss": 0.0016, + "step": 62730 + }, + { + "epoch": 1.026589217049824, + "grad_norm": 0.05976053699851036, + "learning_rate": 5.638878580456072e-06, + "loss": 0.0012, + "step": 62740 + }, + { + "epoch": 1.0267528430008999, + "grad_norm": 0.03755538910627365, + "learning_rate": 5.63746235304636e-06, + "loss": 0.0011, + "step": 62750 + }, + { + "epoch": 1.0269164689519759, + "grad_norm": 0.1466907262802124, + "learning_rate": 5.636046073647388e-06, + "loss": 0.0016, + "step": 62760 + }, + { + "epoch": 1.0270800949030516, + "grad_norm": 0.06574656814336777, + "learning_rate": 5.634629742374661e-06, + "loss": 0.001, + "step": 62770 + }, + { + "epoch": 1.0272437208541274, + "grad_norm": 0.009138481691479683, + "learning_rate": 5.6332133593436925e-06, + "loss": 0.0028, + "step": 62780 + }, + { + "epoch": 1.0274073468052034, + "grad_norm": 0.007317844778299332, + "learning_rate": 5.631796924669996e-06, + "loss": 0.0005, + "step": 62790 + }, + { + "epoch": 1.0275709727562792, + "grad_norm": 0.023709839209914207, + "learning_rate": 5.630380438469092e-06, + "loss": 0.0014, + "step": 62800 + }, + { + "epoch": 1.027734598707355, + "grad_norm": 0.0037192576564848423, + "learning_rate": 5.628963900856503e-06, + "loss": 0.0026, + "step": 62810 + }, + { + "epoch": 1.0278982246584307, + "grad_norm": 0.15787558257579803, + "learning_rate": 5.62754731194776e-06, + "loss": 0.0016, + "step": 62820 + }, + { + "epoch": 1.0280618506095067, + "grad_norm": 0.005675437394529581, + "learning_rate": 5.626130671858393e-06, + "loss": 0.0013, + "step": 62830 + }, + { + "epoch": 1.0282254765605825, + "grad_norm": 0.1373959630727768, + "learning_rate": 5.624713980703939e-06, + "loss": 0.0016, + "step": 62840 + }, + { + "epoch": 1.0283891025116583, + "grad_norm": 0.04313086345791817, + "learning_rate": 5.623297238599936e-06, + "loss": 0.0011, + "step": 62850 + }, + { + "epoch": 1.0285527284627343, + "grad_norm": 0.0606301911175251, + "learning_rate": 5.621880445661934e-06, + "loss": 0.0021, + "step": 62860 + }, + { + "epoch": 1.02871635441381, + "grad_norm": 0.06886827200651169, + "learning_rate": 5.620463602005476e-06, + "loss": 0.0018, + "step": 62870 + }, + { + "epoch": 1.0288799803648858, + "grad_norm": 0.04967603459954262, + "learning_rate": 5.619046707746118e-06, + "loss": 0.0007, + "step": 62880 + }, + { + "epoch": 1.0290436063159618, + "grad_norm": 0.028564803302288055, + "learning_rate": 5.617629762999419e-06, + "loss": 0.0016, + "step": 62890 + }, + { + "epoch": 1.0292072322670376, + "grad_norm": 0.011786828748881817, + "learning_rate": 5.616212767880936e-06, + "loss": 0.0015, + "step": 62900 + }, + { + "epoch": 1.0293708582181134, + "grad_norm": 0.11557739973068237, + "learning_rate": 5.614795722506236e-06, + "loss": 0.0017, + "step": 62910 + }, + { + "epoch": 1.0295344841691891, + "grad_norm": 0.05231388285756111, + "learning_rate": 5.613378626990889e-06, + "loss": 0.002, + "step": 62920 + }, + { + "epoch": 1.0296981101202651, + "grad_norm": 0.08543913066387177, + "learning_rate": 5.611961481450468e-06, + "loss": 0.0017, + "step": 62930 + }, + { + "epoch": 1.029861736071341, + "grad_norm": 0.06034789979457855, + "learning_rate": 5.610544286000552e-06, + "loss": 0.0012, + "step": 62940 + }, + { + "epoch": 1.0300253620224167, + "grad_norm": 0.03104674071073532, + "learning_rate": 5.60912704075672e-06, + "loss": 0.0012, + "step": 62950 + }, + { + "epoch": 1.0301889879734927, + "grad_norm": 0.043217677623033524, + "learning_rate": 5.6077097458345585e-06, + "loss": 0.0013, + "step": 62960 + }, + { + "epoch": 1.0303526139245685, + "grad_norm": 0.010800025425851345, + "learning_rate": 5.606292401349659e-06, + "loss": 0.0013, + "step": 62970 + }, + { + "epoch": 1.0305162398756442, + "grad_norm": 0.00527599174529314, + "learning_rate": 5.604875007417614e-06, + "loss": 0.0012, + "step": 62980 + }, + { + "epoch": 1.0306798658267202, + "grad_norm": 0.04410775750875473, + "learning_rate": 5.6034575641540215e-06, + "loss": 0.0015, + "step": 62990 + }, + { + "epoch": 1.030843491777796, + "grad_norm": 0.02884807623922825, + "learning_rate": 5.602040071674481e-06, + "loss": 0.0013, + "step": 63000 + }, + { + "epoch": 1.0310071177288718, + "grad_norm": 0.03039473108947277, + "learning_rate": 5.600622530094603e-06, + "loss": 0.0011, + "step": 63010 + }, + { + "epoch": 1.0311707436799475, + "grad_norm": 0.046265389770269394, + "learning_rate": 5.599204939529994e-06, + "loss": 0.0023, + "step": 63020 + }, + { + "epoch": 1.0313343696310235, + "grad_norm": 0.10824576765298843, + "learning_rate": 5.597787300096269e-06, + "loss": 0.0011, + "step": 63030 + }, + { + "epoch": 1.0314979955820993, + "grad_norm": 0.08840864896774292, + "learning_rate": 5.5963696119090445e-06, + "loss": 0.0017, + "step": 63040 + }, + { + "epoch": 1.031661621533175, + "grad_norm": 0.05065814033150673, + "learning_rate": 5.594951875083945e-06, + "loss": 0.0023, + "step": 63050 + }, + { + "epoch": 1.031825247484251, + "grad_norm": 0.06369905173778534, + "learning_rate": 5.5935340897365945e-06, + "loss": 0.0009, + "step": 63060 + }, + { + "epoch": 1.0319888734353269, + "grad_norm": 0.026613572612404823, + "learning_rate": 5.592116255982622e-06, + "loss": 0.0012, + "step": 63070 + }, + { + "epoch": 1.0321524993864026, + "grad_norm": 0.07370531558990479, + "learning_rate": 5.590698373937663e-06, + "loss": 0.0038, + "step": 63080 + }, + { + "epoch": 1.0323161253374786, + "grad_norm": 0.2020587921142578, + "learning_rate": 5.589280443717354e-06, + "loss": 0.0017, + "step": 63090 + }, + { + "epoch": 1.0324797512885544, + "grad_norm": 0.11497752368450165, + "learning_rate": 5.587862465437338e-06, + "loss": 0.0015, + "step": 63100 + }, + { + "epoch": 1.0326433772396302, + "grad_norm": 0.021445196121931076, + "learning_rate": 5.586444439213259e-06, + "loss": 0.0013, + "step": 63110 + }, + { + "epoch": 1.032807003190706, + "grad_norm": 0.06567702442407608, + "learning_rate": 5.585026365160766e-06, + "loss": 0.0015, + "step": 63120 + }, + { + "epoch": 1.032970629141782, + "grad_norm": 0.05859753116965294, + "learning_rate": 5.583608243395513e-06, + "loss": 0.0014, + "step": 63130 + }, + { + "epoch": 1.0331342550928577, + "grad_norm": 0.1056719496846199, + "learning_rate": 5.582190074033158e-06, + "loss": 0.0014, + "step": 63140 + }, + { + "epoch": 1.0332978810439335, + "grad_norm": 0.050497256219387054, + "learning_rate": 5.580771857189361e-06, + "loss": 0.0014, + "step": 63150 + }, + { + "epoch": 1.0334615069950095, + "grad_norm": 0.07371681183576584, + "learning_rate": 5.579353592979787e-06, + "loss": 0.0018, + "step": 63160 + }, + { + "epoch": 1.0336251329460853, + "grad_norm": 0.012518026866018772, + "learning_rate": 5.577935281520106e-06, + "loss": 0.0009, + "step": 63170 + }, + { + "epoch": 1.033788758897161, + "grad_norm": 0.08353175222873688, + "learning_rate": 5.576516922925988e-06, + "loss": 0.0015, + "step": 63180 + }, + { + "epoch": 1.033952384848237, + "grad_norm": 0.09575547277927399, + "learning_rate": 5.575098517313113e-06, + "loss": 0.0016, + "step": 63190 + }, + { + "epoch": 1.0341160107993128, + "grad_norm": 0.07352940738201141, + "learning_rate": 5.57368006479716e-06, + "loss": 0.0013, + "step": 63200 + }, + { + "epoch": 1.0342796367503886, + "grad_norm": 0.030067091807723045, + "learning_rate": 5.572261565493812e-06, + "loss": 0.0013, + "step": 63210 + }, + { + "epoch": 1.0344432627014644, + "grad_norm": 0.0376623272895813, + "learning_rate": 5.570843019518757e-06, + "loss": 0.0012, + "step": 63220 + }, + { + "epoch": 1.0346068886525404, + "grad_norm": 0.07964199781417847, + "learning_rate": 5.569424426987688e-06, + "loss": 0.0017, + "step": 63230 + }, + { + "epoch": 1.0347705146036161, + "grad_norm": 0.04377982020378113, + "learning_rate": 5.5680057880163e-06, + "loss": 0.0013, + "step": 63240 + }, + { + "epoch": 1.034934140554692, + "grad_norm": 0.06022028997540474, + "learning_rate": 5.5665871027202925e-06, + "loss": 0.0013, + "step": 63250 + }, + { + "epoch": 1.035097766505768, + "grad_norm": 0.042465586215257645, + "learning_rate": 5.5651683712153685e-06, + "loss": 0.0016, + "step": 63260 + }, + { + "epoch": 1.0352613924568437, + "grad_norm": 0.1468539535999298, + "learning_rate": 5.563749593617235e-06, + "loss": 0.0015, + "step": 63270 + }, + { + "epoch": 1.0354250184079195, + "grad_norm": 0.15182700753211975, + "learning_rate": 5.5623307700416026e-06, + "loss": 0.0017, + "step": 63280 + }, + { + "epoch": 1.0355886443589952, + "grad_norm": 0.053585443645715714, + "learning_rate": 5.560911900604187e-06, + "loss": 0.002, + "step": 63290 + }, + { + "epoch": 1.0357522703100712, + "grad_norm": 0.041000962257385254, + "learning_rate": 5.5594929854207045e-06, + "loss": 0.0012, + "step": 63300 + }, + { + "epoch": 1.035915896261147, + "grad_norm": 0.05506960302591324, + "learning_rate": 5.558074024606878e-06, + "loss": 0.0006, + "step": 63310 + }, + { + "epoch": 1.0360795222122228, + "grad_norm": 0.03488294407725334, + "learning_rate": 5.556655018278432e-06, + "loss": 0.0019, + "step": 63320 + }, + { + "epoch": 1.0362431481632988, + "grad_norm": 0.12159097194671631, + "learning_rate": 5.555235966551097e-06, + "loss": 0.0015, + "step": 63330 + }, + { + "epoch": 1.0364067741143745, + "grad_norm": 0.2699704170227051, + "learning_rate": 5.5538168695406046e-06, + "loss": 0.0016, + "step": 63340 + }, + { + "epoch": 1.0365704000654503, + "grad_norm": 0.06754027307033539, + "learning_rate": 5.552397727362694e-06, + "loss": 0.0014, + "step": 63350 + }, + { + "epoch": 1.0367340260165263, + "grad_norm": 0.09161163866519928, + "learning_rate": 5.550978540133103e-06, + "loss": 0.0019, + "step": 63360 + }, + { + "epoch": 1.036897651967602, + "grad_norm": 0.0479351170361042, + "learning_rate": 5.549559307967578e-06, + "loss": 0.0018, + "step": 63370 + }, + { + "epoch": 1.0370612779186779, + "grad_norm": 0.05759137496352196, + "learning_rate": 5.5481400309818645e-06, + "loss": 0.001, + "step": 63380 + }, + { + "epoch": 1.0372249038697539, + "grad_norm": 0.12185215204954147, + "learning_rate": 5.546720709291715e-06, + "loss": 0.0017, + "step": 63390 + }, + { + "epoch": 1.0373885298208296, + "grad_norm": 0.05699262022972107, + "learning_rate": 5.5453013430128855e-06, + "loss": 0.0017, + "step": 63400 + }, + { + "epoch": 1.0375521557719054, + "grad_norm": 0.10873331129550934, + "learning_rate": 5.543881932261134e-06, + "loss": 0.0011, + "step": 63410 + }, + { + "epoch": 1.0377157817229812, + "grad_norm": 0.11183402687311172, + "learning_rate": 5.542462477152222e-06, + "loss": 0.002, + "step": 63420 + }, + { + "epoch": 1.0378794076740572, + "grad_norm": 0.011344654485583305, + "learning_rate": 5.541042977801915e-06, + "loss": 0.0013, + "step": 63430 + }, + { + "epoch": 1.038043033625133, + "grad_norm": 0.0033544001635164022, + "learning_rate": 5.539623434325984e-06, + "loss": 0.0009, + "step": 63440 + }, + { + "epoch": 1.0382066595762087, + "grad_norm": 0.03782197833061218, + "learning_rate": 5.538203846840202e-06, + "loss": 0.0015, + "step": 63450 + }, + { + "epoch": 1.0383702855272847, + "grad_norm": 0.08692289143800735, + "learning_rate": 5.5367842154603456e-06, + "loss": 0.0015, + "step": 63460 + }, + { + "epoch": 1.0385339114783605, + "grad_norm": 0.22961123287677765, + "learning_rate": 5.535364540302195e-06, + "loss": 0.0045, + "step": 63470 + }, + { + "epoch": 1.0386975374294363, + "grad_norm": 0.07950034737586975, + "learning_rate": 5.5339448214815335e-06, + "loss": 0.001, + "step": 63480 + }, + { + "epoch": 1.038861163380512, + "grad_norm": 0.024478962644934654, + "learning_rate": 5.53252505911415e-06, + "loss": 0.0015, + "step": 63490 + }, + { + "epoch": 1.039024789331588, + "grad_norm": 0.06223538517951965, + "learning_rate": 5.531105253315833e-06, + "loss": 0.0031, + "step": 63500 + }, + { + "epoch": 1.0391884152826638, + "grad_norm": 0.007910685613751411, + "learning_rate": 5.52968540420238e-06, + "loss": 0.0016, + "step": 63510 + }, + { + "epoch": 1.0393520412337396, + "grad_norm": 0.08627380430698395, + "learning_rate": 5.528265511889585e-06, + "loss": 0.0009, + "step": 63520 + }, + { + "epoch": 1.0395156671848156, + "grad_norm": 0.22251693904399872, + "learning_rate": 5.526845576493255e-06, + "loss": 0.0022, + "step": 63530 + }, + { + "epoch": 1.0396792931358914, + "grad_norm": 0.05365417152643204, + "learning_rate": 5.525425598129191e-06, + "loss": 0.0018, + "step": 63540 + }, + { + "epoch": 1.0398429190869671, + "grad_norm": 0.05744471400976181, + "learning_rate": 5.524005576913203e-06, + "loss": 0.0009, + "step": 63550 + }, + { + "epoch": 1.0400065450380431, + "grad_norm": 0.08375339210033417, + "learning_rate": 5.522585512961103e-06, + "loss": 0.0046, + "step": 63560 + }, + { + "epoch": 1.040170170989119, + "grad_norm": 0.04766619950532913, + "learning_rate": 5.521165406388707e-06, + "loss": 0.0005, + "step": 63570 + }, + { + "epoch": 1.0403337969401947, + "grad_norm": 0.04508169740438461, + "learning_rate": 5.519745257311833e-06, + "loss": 0.004, + "step": 63580 + }, + { + "epoch": 1.0404974228912705, + "grad_norm": 0.0260345246642828, + "learning_rate": 5.518325065846304e-06, + "loss": 0.0019, + "step": 63590 + }, + { + "epoch": 1.0406610488423464, + "grad_norm": 0.05305233970284462, + "learning_rate": 5.516904832107947e-06, + "loss": 0.0015, + "step": 63600 + }, + { + "epoch": 1.0408246747934222, + "grad_norm": 0.010698510333895683, + "learning_rate": 5.5154845562125894e-06, + "loss": 0.0008, + "step": 63610 + }, + { + "epoch": 1.040988300744498, + "grad_norm": 0.2244066596031189, + "learning_rate": 5.514064238276067e-06, + "loss": 0.0008, + "step": 63620 + }, + { + "epoch": 1.041151926695574, + "grad_norm": 0.04117923602461815, + "learning_rate": 5.512643878414213e-06, + "loss": 0.0013, + "step": 63630 + }, + { + "epoch": 1.0413155526466498, + "grad_norm": 0.05810079723596573, + "learning_rate": 5.511223476742868e-06, + "loss": 0.002, + "step": 63640 + }, + { + "epoch": 1.0414791785977255, + "grad_norm": 0.01418503001332283, + "learning_rate": 5.509803033377876e-06, + "loss": 0.0022, + "step": 63650 + }, + { + "epoch": 1.0416428045488015, + "grad_norm": 0.11075062304735184, + "learning_rate": 5.508382548435084e-06, + "loss": 0.0011, + "step": 63660 + }, + { + "epoch": 1.0418064304998773, + "grad_norm": 0.032910291105508804, + "learning_rate": 5.5069620220303395e-06, + "loss": 0.0029, + "step": 63670 + }, + { + "epoch": 1.041970056450953, + "grad_norm": 0.01531054824590683, + "learning_rate": 5.505541454279498e-06, + "loss": 0.0009, + "step": 63680 + }, + { + "epoch": 1.0421336824020289, + "grad_norm": 0.02331588603556156, + "learning_rate": 5.504120845298416e-06, + "loss": 0.0023, + "step": 63690 + }, + { + "epoch": 1.0422973083531049, + "grad_norm": 0.07314090430736542, + "learning_rate": 5.502700195202952e-06, + "loss": 0.0013, + "step": 63700 + }, + { + "epoch": 1.0424609343041806, + "grad_norm": 0.08812125027179718, + "learning_rate": 5.501279504108972e-06, + "loss": 0.0026, + "step": 63710 + }, + { + "epoch": 1.0426245602552564, + "grad_norm": 0.04878391698002815, + "learning_rate": 5.49985877213234e-06, + "loss": 0.0019, + "step": 63720 + }, + { + "epoch": 1.0427881862063324, + "grad_norm": 0.0049498737789690495, + "learning_rate": 5.498437999388927e-06, + "loss": 0.0014, + "step": 63730 + }, + { + "epoch": 1.0429518121574082, + "grad_norm": 0.00988365150988102, + "learning_rate": 5.497017185994607e-06, + "loss": 0.0018, + "step": 63740 + }, + { + "epoch": 1.043115438108484, + "grad_norm": 0.02225433476269245, + "learning_rate": 5.4955963320652565e-06, + "loss": 0.002, + "step": 63750 + }, + { + "epoch": 1.04327906405956, + "grad_norm": 0.016612669453024864, + "learning_rate": 5.4941754377167555e-06, + "loss": 0.0015, + "step": 63760 + }, + { + "epoch": 1.0434426900106357, + "grad_norm": 0.16139495372772217, + "learning_rate": 5.492754503064987e-06, + "loss": 0.0027, + "step": 63770 + }, + { + "epoch": 1.0436063159617115, + "grad_norm": 0.017139604315161705, + "learning_rate": 5.4913335282258375e-06, + "loss": 0.002, + "step": 63780 + }, + { + "epoch": 1.0437699419127873, + "grad_norm": 0.07805650681257248, + "learning_rate": 5.489912513315197e-06, + "loss": 0.0042, + "step": 63790 + }, + { + "epoch": 1.0439335678638633, + "grad_norm": 0.06536448001861572, + "learning_rate": 5.488491458448958e-06, + "loss": 0.001, + "step": 63800 + }, + { + "epoch": 1.044097193814939, + "grad_norm": 0.027559954673051834, + "learning_rate": 5.487070363743019e-06, + "loss": 0.0015, + "step": 63810 + }, + { + "epoch": 1.0442608197660148, + "grad_norm": 0.049371443688869476, + "learning_rate": 5.485649229313277e-06, + "loss": 0.0008, + "step": 63820 + }, + { + "epoch": 1.0444244457170908, + "grad_norm": 0.05635242909193039, + "learning_rate": 5.484228055275638e-06, + "loss": 0.0017, + "step": 63830 + }, + { + "epoch": 1.0445880716681666, + "grad_norm": 0.02139674313366413, + "learning_rate": 5.482806841746005e-06, + "loss": 0.0008, + "step": 63840 + }, + { + "epoch": 1.0447516976192424, + "grad_norm": 0.03042641654610634, + "learning_rate": 5.4813855888402876e-06, + "loss": 0.0012, + "step": 63850 + }, + { + "epoch": 1.0449153235703184, + "grad_norm": 0.09243849664926529, + "learning_rate": 5.479964296674402e-06, + "loss": 0.0017, + "step": 63860 + }, + { + "epoch": 1.0450789495213941, + "grad_norm": 0.037908848375082016, + "learning_rate": 5.47854296536426e-06, + "loss": 0.0017, + "step": 63870 + }, + { + "epoch": 1.04524257547247, + "grad_norm": 0.04545162618160248, + "learning_rate": 5.4771215950257804e-06, + "loss": 0.002, + "step": 63880 + }, + { + "epoch": 1.0454062014235457, + "grad_norm": 0.013800349086523056, + "learning_rate": 5.4757001857748895e-06, + "loss": 0.0014, + "step": 63890 + }, + { + "epoch": 1.0455698273746217, + "grad_norm": 0.054980698972940445, + "learning_rate": 5.474278737727508e-06, + "loss": 0.0013, + "step": 63900 + }, + { + "epoch": 1.0457334533256974, + "grad_norm": 0.13138845562934875, + "learning_rate": 5.472857250999567e-06, + "loss": 0.0022, + "step": 63910 + }, + { + "epoch": 1.0458970792767732, + "grad_norm": 0.02803453616797924, + "learning_rate": 5.471435725706997e-06, + "loss": 0.0012, + "step": 63920 + }, + { + "epoch": 1.0460607052278492, + "grad_norm": 0.08206098526716232, + "learning_rate": 5.470014161965735e-06, + "loss": 0.002, + "step": 63930 + }, + { + "epoch": 1.046224331178925, + "grad_norm": 0.038597412407398224, + "learning_rate": 5.468592559891714e-06, + "loss": 0.0037, + "step": 63940 + }, + { + "epoch": 1.0463879571300008, + "grad_norm": 0.06735596060752869, + "learning_rate": 5.467170919600882e-06, + "loss": 0.0023, + "step": 63950 + }, + { + "epoch": 1.0465515830810768, + "grad_norm": 0.08162075281143188, + "learning_rate": 5.465749241209176e-06, + "loss": 0.0023, + "step": 63960 + }, + { + "epoch": 1.0467152090321525, + "grad_norm": 0.024629533290863037, + "learning_rate": 5.464327524832548e-06, + "loss": 0.0012, + "step": 63970 + }, + { + "epoch": 1.0468788349832283, + "grad_norm": 0.05165449529886246, + "learning_rate": 5.462905770586946e-06, + "loss": 0.0015, + "step": 63980 + }, + { + "epoch": 1.047042460934304, + "grad_norm": 0.03659361973404884, + "learning_rate": 5.461483978588325e-06, + "loss": 0.002, + "step": 63990 + }, + { + "epoch": 1.04720608688538, + "grad_norm": 0.027760332450270653, + "learning_rate": 5.460062148952641e-06, + "loss": 0.0018, + "step": 64000 + }, + { + "epoch": 1.0473697128364559, + "grad_norm": 0.06101588159799576, + "learning_rate": 5.4586402817958514e-06, + "loss": 0.0011, + "step": 64010 + }, + { + "epoch": 1.0475333387875316, + "grad_norm": 0.054217416793107986, + "learning_rate": 5.457218377233923e-06, + "loss": 0.0019, + "step": 64020 + }, + { + "epoch": 1.0476969647386076, + "grad_norm": 0.039044298231601715, + "learning_rate": 5.4557964353828185e-06, + "loss": 0.0008, + "step": 64030 + }, + { + "epoch": 1.0478605906896834, + "grad_norm": 0.08019902557134628, + "learning_rate": 5.454374456358508e-06, + "loss": 0.0013, + "step": 64040 + }, + { + "epoch": 1.0480242166407592, + "grad_norm": 0.05514813959598541, + "learning_rate": 5.4529524402769615e-06, + "loss": 0.001, + "step": 64050 + }, + { + "epoch": 1.048187842591835, + "grad_norm": 0.049595434218645096, + "learning_rate": 5.451530387254157e-06, + "loss": 0.002, + "step": 64060 + }, + { + "epoch": 1.048351468542911, + "grad_norm": 0.03588667884469032, + "learning_rate": 5.450108297406069e-06, + "loss": 0.0012, + "step": 64070 + }, + { + "epoch": 1.0485150944939867, + "grad_norm": 0.09109651297330856, + "learning_rate": 5.448686170848679e-06, + "loss": 0.0016, + "step": 64080 + }, + { + "epoch": 1.0486787204450625, + "grad_norm": 0.0289768036454916, + "learning_rate": 5.447264007697973e-06, + "loss": 0.0007, + "step": 64090 + }, + { + "epoch": 1.0488423463961385, + "grad_norm": 0.08702289313077927, + "learning_rate": 5.445841808069936e-06, + "loss": 0.0029, + "step": 64100 + }, + { + "epoch": 1.0490059723472143, + "grad_norm": 0.03332599997520447, + "learning_rate": 5.444419572080557e-06, + "loss": 0.0028, + "step": 64110 + }, + { + "epoch": 1.04916959829829, + "grad_norm": 0.06822742521762848, + "learning_rate": 5.442997299845832e-06, + "loss": 0.0012, + "step": 64120 + }, + { + "epoch": 1.049333224249366, + "grad_norm": 0.03312753140926361, + "learning_rate": 5.441574991481752e-06, + "loss": 0.0008, + "step": 64130 + }, + { + "epoch": 1.0494968502004418, + "grad_norm": 0.03403471037745476, + "learning_rate": 5.440152647104319e-06, + "loss": 0.0012, + "step": 64140 + }, + { + "epoch": 1.0496604761515176, + "grad_norm": 0.07530993968248367, + "learning_rate": 5.438730266829533e-06, + "loss": 0.0016, + "step": 64150 + }, + { + "epoch": 1.0498241021025936, + "grad_norm": 0.042741235345602036, + "learning_rate": 5.4373078507734e-06, + "loss": 0.0009, + "step": 64160 + }, + { + "epoch": 1.0499877280536694, + "grad_norm": 0.03204313665628433, + "learning_rate": 5.435885399051926e-06, + "loss": 0.0016, + "step": 64170 + }, + { + "epoch": 1.0501513540047451, + "grad_norm": 0.04970650374889374, + "learning_rate": 5.434462911781122e-06, + "loss": 0.0019, + "step": 64180 + }, + { + "epoch": 1.050314979955821, + "grad_norm": 0.10810843110084534, + "learning_rate": 5.433040389077001e-06, + "loss": 0.0014, + "step": 64190 + }, + { + "epoch": 1.050478605906897, + "grad_norm": 0.009705748409032822, + "learning_rate": 5.43161783105558e-06, + "loss": 0.0019, + "step": 64200 + }, + { + "epoch": 1.0506422318579727, + "grad_norm": 0.1793215125799179, + "learning_rate": 5.430195237832876e-06, + "loss": 0.0019, + "step": 64210 + }, + { + "epoch": 1.0508058578090484, + "grad_norm": 0.1737505942583084, + "learning_rate": 5.428772609524913e-06, + "loss": 0.002, + "step": 64220 + }, + { + "epoch": 1.0509694837601244, + "grad_norm": 0.05152524635195732, + "learning_rate": 5.427349946247714e-06, + "loss": 0.001, + "step": 64230 + }, + { + "epoch": 1.0511331097112002, + "grad_norm": 0.034666758030653, + "learning_rate": 5.425927248117308e-06, + "loss": 0.0013, + "step": 64240 + }, + { + "epoch": 1.051296735662276, + "grad_norm": 0.05017232149839401, + "learning_rate": 5.424504515249725e-06, + "loss": 0.0013, + "step": 64250 + }, + { + "epoch": 1.0514603616133518, + "grad_norm": 0.06550250202417374, + "learning_rate": 5.423081747760996e-06, + "loss": 0.0016, + "step": 64260 + }, + { + "epoch": 1.0516239875644278, + "grad_norm": 0.10068517923355103, + "learning_rate": 5.421658945767163e-06, + "loss": 0.0017, + "step": 64270 + }, + { + "epoch": 1.0517876135155035, + "grad_norm": 0.056817881762981415, + "learning_rate": 5.420236109384258e-06, + "loss": 0.0014, + "step": 64280 + }, + { + "epoch": 1.0519512394665793, + "grad_norm": 0.052699845284223557, + "learning_rate": 5.418813238728327e-06, + "loss": 0.0015, + "step": 64290 + }, + { + "epoch": 1.0521148654176553, + "grad_norm": 0.13146892189979553, + "learning_rate": 5.417390333915412e-06, + "loss": 0.0019, + "step": 64300 + }, + { + "epoch": 1.052278491368731, + "grad_norm": 0.04212433099746704, + "learning_rate": 5.415967395061562e-06, + "loss": 0.0018, + "step": 64310 + }, + { + "epoch": 1.0524421173198069, + "grad_norm": 0.031882937997579575, + "learning_rate": 5.414544422282826e-06, + "loss": 0.001, + "step": 64320 + }, + { + "epoch": 1.0526057432708829, + "grad_norm": 0.010787849314510822, + "learning_rate": 5.413121415695258e-06, + "loss": 0.0019, + "step": 64330 + }, + { + "epoch": 1.0527693692219586, + "grad_norm": 0.04263931140303612, + "learning_rate": 5.411698375414913e-06, + "loss": 0.001, + "step": 64340 + }, + { + "epoch": 1.0529329951730344, + "grad_norm": 0.0920867994427681, + "learning_rate": 5.410275301557849e-06, + "loss": 0.0016, + "step": 64350 + }, + { + "epoch": 1.0530966211241102, + "grad_norm": 0.12725453078746796, + "learning_rate": 5.408852194240127e-06, + "loss": 0.0013, + "step": 64360 + }, + { + "epoch": 1.0532602470751862, + "grad_norm": 0.014359544031322002, + "learning_rate": 5.407429053577811e-06, + "loss": 0.0014, + "step": 64370 + }, + { + "epoch": 1.053423873026262, + "grad_norm": 0.06531564146280289, + "learning_rate": 5.406005879686968e-06, + "loss": 0.0008, + "step": 64380 + }, + { + "epoch": 1.0535874989773377, + "grad_norm": 0.0077211507596075535, + "learning_rate": 5.404582672683667e-06, + "loss": 0.0014, + "step": 64390 + }, + { + "epoch": 1.0537511249284137, + "grad_norm": 0.1500065177679062, + "learning_rate": 5.403159432683979e-06, + "loss": 0.0014, + "step": 64400 + }, + { + "epoch": 1.0539147508794895, + "grad_norm": 0.06877102702856064, + "learning_rate": 5.40173615980398e-06, + "loss": 0.0018, + "step": 64410 + }, + { + "epoch": 1.0540783768305653, + "grad_norm": 0.06654974818229675, + "learning_rate": 5.400312854159746e-06, + "loss": 0.0018, + "step": 64420 + }, + { + "epoch": 1.0542420027816413, + "grad_norm": 0.0903286561369896, + "learning_rate": 5.398889515867358e-06, + "loss": 0.0013, + "step": 64430 + }, + { + "epoch": 1.054405628732717, + "grad_norm": 0.044767577201128006, + "learning_rate": 5.397466145042898e-06, + "loss": 0.0013, + "step": 64440 + }, + { + "epoch": 1.0545692546837928, + "grad_norm": 0.04485725238919258, + "learning_rate": 5.3960427418024515e-06, + "loss": 0.0015, + "step": 64450 + }, + { + "epoch": 1.0547328806348686, + "grad_norm": 0.10228991508483887, + "learning_rate": 5.394619306262106e-06, + "loss": 0.0014, + "step": 64460 + }, + { + "epoch": 1.0548965065859446, + "grad_norm": 0.029804689809679985, + "learning_rate": 5.393195838537954e-06, + "loss": 0.0015, + "step": 64470 + }, + { + "epoch": 1.0550601325370204, + "grad_norm": 0.02489466592669487, + "learning_rate": 5.391772338746086e-06, + "loss": 0.0016, + "step": 64480 + }, + { + "epoch": 1.0552237584880961, + "grad_norm": 0.05016709491610527, + "learning_rate": 5.390348807002599e-06, + "loss": 0.0008, + "step": 64490 + }, + { + "epoch": 1.0553873844391721, + "grad_norm": 0.07112365961074829, + "learning_rate": 5.388925243423591e-06, + "loss": 0.001, + "step": 64500 + }, + { + "epoch": 1.055551010390248, + "grad_norm": 0.04656639322638512, + "learning_rate": 5.387501648125165e-06, + "loss": 0.0011, + "step": 64510 + }, + { + "epoch": 1.0557146363413237, + "grad_norm": 0.12064625322818756, + "learning_rate": 5.386078021223422e-06, + "loss": 0.0028, + "step": 64520 + }, + { + "epoch": 1.0558782622923997, + "grad_norm": 0.05928665027022362, + "learning_rate": 5.384654362834469e-06, + "loss": 0.0017, + "step": 64530 + }, + { + "epoch": 1.0560418882434754, + "grad_norm": 0.06275463104248047, + "learning_rate": 5.3832306730744146e-06, + "loss": 0.0012, + "step": 64540 + }, + { + "epoch": 1.0562055141945512, + "grad_norm": 0.07323262840509415, + "learning_rate": 5.381806952059371e-06, + "loss": 0.0015, + "step": 64550 + }, + { + "epoch": 1.056369140145627, + "grad_norm": 0.0842098593711853, + "learning_rate": 5.380383199905451e-06, + "loss": 0.0016, + "step": 64560 + }, + { + "epoch": 1.056532766096703, + "grad_norm": 0.09260202199220657, + "learning_rate": 5.378959416728772e-06, + "loss": 0.0021, + "step": 64570 + }, + { + "epoch": 1.0566963920477788, + "grad_norm": 0.07597813755273819, + "learning_rate": 5.377535602645452e-06, + "loss": 0.0018, + "step": 64580 + }, + { + "epoch": 1.0568600179988545, + "grad_norm": 0.10727567970752716, + "learning_rate": 5.3761117577716126e-06, + "loss": 0.0016, + "step": 64590 + }, + { + "epoch": 1.0570236439499305, + "grad_norm": 0.04645923525094986, + "learning_rate": 5.374687882223378e-06, + "loss": 0.0011, + "step": 64600 + }, + { + "epoch": 1.0571872699010063, + "grad_norm": 0.07954151183366776, + "learning_rate": 5.373263976116875e-06, + "loss": 0.0012, + "step": 64610 + }, + { + "epoch": 1.057350895852082, + "grad_norm": 0.09331540018320084, + "learning_rate": 5.371840039568231e-06, + "loss": 0.0015, + "step": 64620 + }, + { + "epoch": 1.057514521803158, + "grad_norm": 0.028710681945085526, + "learning_rate": 5.3704160726935795e-06, + "loss": 0.0019, + "step": 64630 + }, + { + "epoch": 1.0576781477542339, + "grad_norm": 0.0997256487607956, + "learning_rate": 5.368992075609052e-06, + "loss": 0.0015, + "step": 64640 + }, + { + "epoch": 1.0578417737053096, + "grad_norm": 0.05940896272659302, + "learning_rate": 5.367568048430787e-06, + "loss": 0.0015, + "step": 64650 + }, + { + "epoch": 1.0580053996563854, + "grad_norm": 0.06382996588945389, + "learning_rate": 5.3661439912749225e-06, + "loss": 0.0014, + "step": 64660 + }, + { + "epoch": 1.0581690256074614, + "grad_norm": 0.06846608966588974, + "learning_rate": 5.3647199042576e-06, + "loss": 0.0011, + "step": 64670 + }, + { + "epoch": 1.0583326515585372, + "grad_norm": 0.03451616317033768, + "learning_rate": 5.363295787494963e-06, + "loss": 0.0019, + "step": 64680 + }, + { + "epoch": 1.058496277509613, + "grad_norm": 0.1504979431629181, + "learning_rate": 5.361871641103158e-06, + "loss": 0.002, + "step": 64690 + }, + { + "epoch": 1.058659903460689, + "grad_norm": 0.0033024942968040705, + "learning_rate": 5.360447465198332e-06, + "loss": 0.0013, + "step": 64700 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.03498594090342522, + "learning_rate": 5.359023259896638e-06, + "loss": 0.001, + "step": 64710 + }, + { + "epoch": 1.0589871553628405, + "grad_norm": 0.04379526153206825, + "learning_rate": 5.357599025314228e-06, + "loss": 0.0013, + "step": 64720 + }, + { + "epoch": 1.0591507813139165, + "grad_norm": 0.07929637283086777, + "learning_rate": 5.356174761567259e-06, + "loss": 0.0009, + "step": 64730 + }, + { + "epoch": 1.0593144072649923, + "grad_norm": 0.08806032687425613, + "learning_rate": 5.354750468771886e-06, + "loss": 0.001, + "step": 64740 + }, + { + "epoch": 1.059478033216068, + "grad_norm": 0.05688506364822388, + "learning_rate": 5.3533261470442746e-06, + "loss": 0.0019, + "step": 64750 + }, + { + "epoch": 1.0596416591671438, + "grad_norm": 0.01541043072938919, + "learning_rate": 5.351901796500583e-06, + "loss": 0.0015, + "step": 64760 + }, + { + "epoch": 1.0598052851182198, + "grad_norm": 0.04236844554543495, + "learning_rate": 5.350477417256978e-06, + "loss": 0.0014, + "step": 64770 + }, + { + "epoch": 1.0599689110692956, + "grad_norm": 0.0074849361553788185, + "learning_rate": 5.349053009429629e-06, + "loss": 0.0019, + "step": 64780 + }, + { + "epoch": 1.0601325370203714, + "grad_norm": 0.08951664716005325, + "learning_rate": 5.3476285731347024e-06, + "loss": 0.0013, + "step": 64790 + }, + { + "epoch": 1.0602961629714474, + "grad_norm": 0.012588960118591785, + "learning_rate": 5.346204108488373e-06, + "loss": 0.0015, + "step": 64800 + }, + { + "epoch": 1.0604597889225231, + "grad_norm": 0.08123738318681717, + "learning_rate": 5.344779615606815e-06, + "loss": 0.002, + "step": 64810 + }, + { + "epoch": 1.060623414873599, + "grad_norm": 0.004824167117476463, + "learning_rate": 5.343355094606203e-06, + "loss": 0.0028, + "step": 64820 + }, + { + "epoch": 1.060787040824675, + "grad_norm": 0.09063207358121872, + "learning_rate": 5.341930545602718e-06, + "loss": 0.0013, + "step": 64830 + }, + { + "epoch": 1.0609506667757507, + "grad_norm": 0.13622954487800598, + "learning_rate": 5.3405059687125415e-06, + "loss": 0.0023, + "step": 64840 + }, + { + "epoch": 1.0611142927268264, + "grad_norm": 0.03379924222826958, + "learning_rate": 5.339081364051857e-06, + "loss": 0.0021, + "step": 64850 + }, + { + "epoch": 1.0612779186779022, + "grad_norm": 0.03544510528445244, + "learning_rate": 5.337656731736851e-06, + "loss": 0.0019, + "step": 64860 + }, + { + "epoch": 1.0614415446289782, + "grad_norm": 0.04377711936831474, + "learning_rate": 5.336232071883709e-06, + "loss": 0.0012, + "step": 64870 + }, + { + "epoch": 1.061605170580054, + "grad_norm": 0.04801655933260918, + "learning_rate": 5.3348073846086235e-06, + "loss": 0.0009, + "step": 64880 + }, + { + "epoch": 1.0617687965311298, + "grad_norm": 0.08826633542776108, + "learning_rate": 5.3333826700277866e-06, + "loss": 0.002, + "step": 64890 + }, + { + "epoch": 1.0619324224822058, + "grad_norm": 0.04082094132900238, + "learning_rate": 5.3319579282573945e-06, + "loss": 0.0014, + "step": 64900 + }, + { + "epoch": 1.0620960484332815, + "grad_norm": 0.280418336391449, + "learning_rate": 5.330533159413642e-06, + "loss": 0.0014, + "step": 64910 + }, + { + "epoch": 1.0622596743843573, + "grad_norm": 0.038144439458847046, + "learning_rate": 5.32910836361273e-06, + "loss": 0.0013, + "step": 64920 + }, + { + "epoch": 1.0624233003354333, + "grad_norm": 0.07828328013420105, + "learning_rate": 5.3276835409708605e-06, + "loss": 0.001, + "step": 64930 + }, + { + "epoch": 1.062586926286509, + "grad_norm": 0.028683684766292572, + "learning_rate": 5.326258691604235e-06, + "loss": 0.0013, + "step": 64940 + }, + { + "epoch": 1.0627505522375849, + "grad_norm": 0.07371421158313751, + "learning_rate": 5.3248338156290615e-06, + "loss": 0.0027, + "step": 64950 + }, + { + "epoch": 1.0629141781886606, + "grad_norm": 0.03749288618564606, + "learning_rate": 5.323408913161546e-06, + "loss": 0.0015, + "step": 64960 + }, + { + "epoch": 1.0630778041397366, + "grad_norm": 0.06767037510871887, + "learning_rate": 5.321983984317901e-06, + "loss": 0.0016, + "step": 64970 + }, + { + "epoch": 1.0632414300908124, + "grad_norm": 0.018412822857499123, + "learning_rate": 5.320559029214338e-06, + "loss": 0.0015, + "step": 64980 + }, + { + "epoch": 1.0634050560418882, + "grad_norm": 0.07942145317792892, + "learning_rate": 5.319134047967072e-06, + "loss": 0.0014, + "step": 64990 + }, + { + "epoch": 1.0635686819929642, + "grad_norm": 0.12868405878543854, + "learning_rate": 5.317709040692316e-06, + "loss": 0.0016, + "step": 65000 + }, + { + "epoch": 1.06373230794404, + "grad_norm": 0.05411757901310921, + "learning_rate": 5.316284007506294e-06, + "loss": 0.0019, + "step": 65010 + }, + { + "epoch": 1.0638959338951157, + "grad_norm": 0.05384686216711998, + "learning_rate": 5.314858948525223e-06, + "loss": 0.0019, + "step": 65020 + }, + { + "epoch": 1.0640595598461915, + "grad_norm": 0.017097443342208862, + "learning_rate": 5.313433863865328e-06, + "loss": 0.0018, + "step": 65030 + }, + { + "epoch": 1.0642231857972675, + "grad_norm": 0.0339205339550972, + "learning_rate": 5.312008753642834e-06, + "loss": 0.003, + "step": 65040 + }, + { + "epoch": 1.0643868117483433, + "grad_norm": 0.044893499463796616, + "learning_rate": 5.310583617973966e-06, + "loss": 0.0012, + "step": 65050 + }, + { + "epoch": 1.064550437699419, + "grad_norm": 0.044282782822847366, + "learning_rate": 5.309158456974955e-06, + "loss": 0.0012, + "step": 65060 + }, + { + "epoch": 1.064714063650495, + "grad_norm": 0.03950934112071991, + "learning_rate": 5.307733270762033e-06, + "loss": 0.0013, + "step": 65070 + }, + { + "epoch": 1.0648776896015708, + "grad_norm": 0.0735439881682396, + "learning_rate": 5.306308059451433e-06, + "loss": 0.0016, + "step": 65080 + }, + { + "epoch": 1.0650413155526466, + "grad_norm": 0.014367000199854374, + "learning_rate": 5.304882823159389e-06, + "loss": 0.0008, + "step": 65090 + }, + { + "epoch": 1.0652049415037226, + "grad_norm": 0.12071038782596588, + "learning_rate": 5.303457562002139e-06, + "loss": 0.0019, + "step": 65100 + }, + { + "epoch": 1.0653685674547984, + "grad_norm": 0.07662677019834518, + "learning_rate": 5.302032276095923e-06, + "loss": 0.0009, + "step": 65110 + }, + { + "epoch": 1.0655321934058741, + "grad_norm": 0.03474804386496544, + "learning_rate": 5.300606965556983e-06, + "loss": 0.0028, + "step": 65120 + }, + { + "epoch": 1.0656958193569501, + "grad_norm": 0.07875080406665802, + "learning_rate": 5.29918163050156e-06, + "loss": 0.0009, + "step": 65130 + }, + { + "epoch": 1.065859445308026, + "grad_norm": 0.062419354915618896, + "learning_rate": 5.297756271045902e-06, + "loss": 0.0018, + "step": 65140 + }, + { + "epoch": 1.0660230712591017, + "grad_norm": 0.024496397003531456, + "learning_rate": 5.296330887306256e-06, + "loss": 0.0017, + "step": 65150 + }, + { + "epoch": 1.0661866972101774, + "grad_norm": 0.07439788430929184, + "learning_rate": 5.2949054793988695e-06, + "loss": 0.0015, + "step": 65160 + }, + { + "epoch": 1.0663503231612534, + "grad_norm": 0.020498117431998253, + "learning_rate": 5.2934800474399976e-06, + "loss": 0.0011, + "step": 65170 + }, + { + "epoch": 1.0665139491123292, + "grad_norm": 0.12638750672340393, + "learning_rate": 5.29205459154589e-06, + "loss": 0.0026, + "step": 65180 + }, + { + "epoch": 1.066677575063405, + "grad_norm": 0.09402655810117722, + "learning_rate": 5.290629111832804e-06, + "loss": 0.002, + "step": 65190 + }, + { + "epoch": 1.066841201014481, + "grad_norm": 0.02262144722044468, + "learning_rate": 5.2892036084169965e-06, + "loss": 0.0015, + "step": 65200 + }, + { + "epoch": 1.0670048269655568, + "grad_norm": 0.14120560884475708, + "learning_rate": 5.287778081414726e-06, + "loss": 0.0009, + "step": 65210 + }, + { + "epoch": 1.0671684529166325, + "grad_norm": 0.03046940267086029, + "learning_rate": 5.286352530942255e-06, + "loss": 0.0016, + "step": 65220 + }, + { + "epoch": 1.0673320788677083, + "grad_norm": 0.05819351598620415, + "learning_rate": 5.284926957115846e-06, + "loss": 0.0016, + "step": 65230 + }, + { + "epoch": 1.0674957048187843, + "grad_norm": 0.13080142438411713, + "learning_rate": 5.283501360051764e-06, + "loss": 0.0011, + "step": 65240 + }, + { + "epoch": 1.06765933076986, + "grad_norm": 0.008926796726882458, + "learning_rate": 5.282075739866275e-06, + "loss": 0.0023, + "step": 65250 + }, + { + "epoch": 1.0678229567209359, + "grad_norm": 0.09713287651538849, + "learning_rate": 5.280650096675648e-06, + "loss": 0.002, + "step": 65260 + }, + { + "epoch": 1.0679865826720119, + "grad_norm": 0.06894666701555252, + "learning_rate": 5.279224430596155e-06, + "loss": 0.0014, + "step": 65270 + }, + { + "epoch": 1.0681502086230876, + "grad_norm": 0.1052623763680458, + "learning_rate": 5.277798741744066e-06, + "loss": 0.002, + "step": 65280 + }, + { + "epoch": 1.0683138345741634, + "grad_norm": 0.10642183572053909, + "learning_rate": 5.276373030235658e-06, + "loss": 0.0018, + "step": 65290 + }, + { + "epoch": 1.0684774605252394, + "grad_norm": 0.11881567537784576, + "learning_rate": 5.274947296187205e-06, + "loss": 0.0023, + "step": 65300 + }, + { + "epoch": 1.0686410864763152, + "grad_norm": 0.09369634091854095, + "learning_rate": 5.273521539714989e-06, + "loss": 0.002, + "step": 65310 + }, + { + "epoch": 1.068804712427391, + "grad_norm": 0.07297854870557785, + "learning_rate": 5.272095760935284e-06, + "loss": 0.0018, + "step": 65320 + }, + { + "epoch": 1.068968338378467, + "grad_norm": 0.2970001697540283, + "learning_rate": 5.270669959964374e-06, + "loss": 0.0043, + "step": 65330 + }, + { + "epoch": 1.0691319643295427, + "grad_norm": 0.07514619827270508, + "learning_rate": 5.269244136918544e-06, + "loss": 0.0024, + "step": 65340 + }, + { + "epoch": 1.0692955902806185, + "grad_norm": 0.0616283044219017, + "learning_rate": 5.267818291914078e-06, + "loss": 0.0012, + "step": 65350 + }, + { + "epoch": 1.0694592162316943, + "grad_norm": 0.027152765542268753, + "learning_rate": 5.266392425067264e-06, + "loss": 0.0012, + "step": 65360 + }, + { + "epoch": 1.0696228421827703, + "grad_norm": 0.04551069438457489, + "learning_rate": 5.2649665364943896e-06, + "loss": 0.0013, + "step": 65370 + }, + { + "epoch": 1.069786468133846, + "grad_norm": 0.055360324680805206, + "learning_rate": 5.2635406263117465e-06, + "loss": 0.0015, + "step": 65380 + }, + { + "epoch": 1.0699500940849218, + "grad_norm": 0.04829278588294983, + "learning_rate": 5.262114694635625e-06, + "loss": 0.0018, + "step": 65390 + }, + { + "epoch": 1.0701137200359978, + "grad_norm": 0.016820231452584267, + "learning_rate": 5.2606887415823225e-06, + "loss": 0.0016, + "step": 65400 + }, + { + "epoch": 1.0702773459870736, + "grad_norm": 0.05498622730374336, + "learning_rate": 5.259262767268132e-06, + "loss": 0.0016, + "step": 65410 + }, + { + "epoch": 1.0704409719381494, + "grad_norm": 0.023811891674995422, + "learning_rate": 5.257836771809352e-06, + "loss": 0.0029, + "step": 65420 + }, + { + "epoch": 1.0706045978892251, + "grad_norm": 0.06769300252199173, + "learning_rate": 5.2564107553222845e-06, + "loss": 0.002, + "step": 65430 + }, + { + "epoch": 1.0707682238403011, + "grad_norm": 0.06867410242557526, + "learning_rate": 5.254984717923226e-06, + "loss": 0.0016, + "step": 65440 + }, + { + "epoch": 1.070931849791377, + "grad_norm": 0.015183850191533566, + "learning_rate": 5.253558659728482e-06, + "loss": 0.001, + "step": 65450 + }, + { + "epoch": 1.0710954757424527, + "grad_norm": 0.05597613379359245, + "learning_rate": 5.2521325808543565e-06, + "loss": 0.0016, + "step": 65460 + }, + { + "epoch": 1.0712591016935287, + "grad_norm": 0.10989191383123398, + "learning_rate": 5.250706481417156e-06, + "loss": 0.0024, + "step": 65470 + }, + { + "epoch": 1.0714227276446044, + "grad_norm": 0.0691683441400528, + "learning_rate": 5.249280361533187e-06, + "loss": 0.0014, + "step": 65480 + }, + { + "epoch": 1.0715863535956802, + "grad_norm": 0.026200102642178535, + "learning_rate": 5.24785422131876e-06, + "loss": 0.0011, + "step": 65490 + }, + { + "epoch": 1.0717499795467562, + "grad_norm": 0.033837590366601944, + "learning_rate": 5.246428060890186e-06, + "loss": 0.0019, + "step": 65500 + }, + { + "epoch": 1.071913605497832, + "grad_norm": 0.01459808275103569, + "learning_rate": 5.2450018803637784e-06, + "loss": 0.0018, + "step": 65510 + }, + { + "epoch": 1.0720772314489078, + "grad_norm": 0.14792582392692566, + "learning_rate": 5.24357567985585e-06, + "loss": 0.0014, + "step": 65520 + }, + { + "epoch": 1.0722408573999835, + "grad_norm": 0.04514022544026375, + "learning_rate": 5.242149459482718e-06, + "loss": 0.0011, + "step": 65530 + }, + { + "epoch": 1.0724044833510595, + "grad_norm": 0.0576908253133297, + "learning_rate": 5.240723219360701e-06, + "loss": 0.0015, + "step": 65540 + }, + { + "epoch": 1.0725681093021353, + "grad_norm": 0.2102101743221283, + "learning_rate": 5.239296959606116e-06, + "loss": 0.0027, + "step": 65550 + }, + { + "epoch": 1.072731735253211, + "grad_norm": 0.06453171372413635, + "learning_rate": 5.237870680335286e-06, + "loss": 0.002, + "step": 65560 + }, + { + "epoch": 1.072895361204287, + "grad_norm": 0.009780087508261204, + "learning_rate": 5.236444381664532e-06, + "loss": 0.0016, + "step": 65570 + }, + { + "epoch": 1.0730589871553629, + "grad_norm": 0.028725354000926018, + "learning_rate": 5.23501806371018e-06, + "loss": 0.0007, + "step": 65580 + }, + { + "epoch": 1.0732226131064386, + "grad_norm": 0.1053251177072525, + "learning_rate": 5.233591726588552e-06, + "loss": 0.0016, + "step": 65590 + }, + { + "epoch": 1.0733862390575146, + "grad_norm": 0.023830266669392586, + "learning_rate": 5.232165370415979e-06, + "loss": 0.0011, + "step": 65600 + }, + { + "epoch": 1.0735498650085904, + "grad_norm": 0.050636470317840576, + "learning_rate": 5.230738995308788e-06, + "loss": 0.001, + "step": 65610 + }, + { + "epoch": 1.0737134909596662, + "grad_norm": 0.16297264397144318, + "learning_rate": 5.229312601383311e-06, + "loss": 0.0019, + "step": 65620 + }, + { + "epoch": 1.073877116910742, + "grad_norm": 0.049096617847681046, + "learning_rate": 5.227886188755878e-06, + "loss": 0.0021, + "step": 65630 + }, + { + "epoch": 1.074040742861818, + "grad_norm": 0.062334369868040085, + "learning_rate": 5.226459757542822e-06, + "loss": 0.0029, + "step": 65640 + }, + { + "epoch": 1.0742043688128937, + "grad_norm": 0.07538007944822311, + "learning_rate": 5.2250333078604785e-06, + "loss": 0.0018, + "step": 65650 + }, + { + "epoch": 1.0743679947639695, + "grad_norm": 0.030422566458582878, + "learning_rate": 5.223606839825184e-06, + "loss": 0.0014, + "step": 65660 + }, + { + "epoch": 1.0745316207150455, + "grad_norm": 0.1259501874446869, + "learning_rate": 5.222180353553277e-06, + "loss": 0.0018, + "step": 65670 + }, + { + "epoch": 1.0746952466661213, + "grad_norm": 0.08239496499300003, + "learning_rate": 5.220753849161095e-06, + "loss": 0.0019, + "step": 65680 + }, + { + "epoch": 1.074858872617197, + "grad_norm": 0.1408461332321167, + "learning_rate": 5.219327326764981e-06, + "loss": 0.0022, + "step": 65690 + }, + { + "epoch": 1.075022498568273, + "grad_norm": 0.10267409682273865, + "learning_rate": 5.217900786481277e-06, + "loss": 0.0019, + "step": 65700 + }, + { + "epoch": 1.0751861245193488, + "grad_norm": 0.05968786031007767, + "learning_rate": 5.2164742284263244e-06, + "loss": 0.0011, + "step": 65710 + }, + { + "epoch": 1.0753497504704246, + "grad_norm": 0.02846963331103325, + "learning_rate": 5.215047652716473e-06, + "loss": 0.0015, + "step": 65720 + }, + { + "epoch": 1.0755133764215004, + "grad_norm": 0.05503297969698906, + "learning_rate": 5.213621059468065e-06, + "loss": 0.0015, + "step": 65730 + }, + { + "epoch": 1.0756770023725764, + "grad_norm": 0.01726103201508522, + "learning_rate": 5.212194448797449e-06, + "loss": 0.0011, + "step": 65740 + }, + { + "epoch": 1.0758406283236521, + "grad_norm": 0.09336838126182556, + "learning_rate": 5.210767820820978e-06, + "loss": 0.001, + "step": 65750 + }, + { + "epoch": 1.076004254274728, + "grad_norm": 0.0753527358174324, + "learning_rate": 5.209341175655e-06, + "loss": 0.0008, + "step": 65760 + }, + { + "epoch": 1.076167880225804, + "grad_norm": 0.06671423465013504, + "learning_rate": 5.207914513415867e-06, + "loss": 0.0011, + "step": 65770 + }, + { + "epoch": 1.0763315061768797, + "grad_norm": 0.04025735333561897, + "learning_rate": 5.206487834219934e-06, + "loss": 0.0016, + "step": 65780 + }, + { + "epoch": 1.0764951321279554, + "grad_norm": 0.045490920543670654, + "learning_rate": 5.205061138183557e-06, + "loss": 0.0015, + "step": 65790 + }, + { + "epoch": 1.0766587580790312, + "grad_norm": 0.012613928876817226, + "learning_rate": 5.20363442542309e-06, + "loss": 0.0018, + "step": 65800 + }, + { + "epoch": 1.0768223840301072, + "grad_norm": 0.0372697077691555, + "learning_rate": 5.202207696054893e-06, + "loss": 0.0012, + "step": 65810 + }, + { + "epoch": 1.076986009981183, + "grad_norm": 0.17838460206985474, + "learning_rate": 5.200780950195323e-06, + "loss": 0.0031, + "step": 65820 + }, + { + "epoch": 1.0771496359322588, + "grad_norm": 0.07225412875413895, + "learning_rate": 5.1993541879607425e-06, + "loss": 0.0026, + "step": 65830 + }, + { + "epoch": 1.0773132618833348, + "grad_norm": 0.021484797820448875, + "learning_rate": 5.197927409467513e-06, + "loss": 0.0018, + "step": 65840 + }, + { + "epoch": 1.0774768878344105, + "grad_norm": 0.08153106272220612, + "learning_rate": 5.196500614831997e-06, + "loss": 0.0015, + "step": 65850 + }, + { + "epoch": 1.0776405137854863, + "grad_norm": 0.0051868511363863945, + "learning_rate": 5.195073804170559e-06, + "loss": 0.001, + "step": 65860 + }, + { + "epoch": 1.0778041397365623, + "grad_norm": 0.027387188747525215, + "learning_rate": 5.193646977599566e-06, + "loss": 0.0007, + "step": 65870 + }, + { + "epoch": 1.077967765687638, + "grad_norm": 0.04696065932512283, + "learning_rate": 5.1922201352353844e-06, + "loss": 0.0022, + "step": 65880 + }, + { + "epoch": 1.0781313916387139, + "grad_norm": 0.06859666109085083, + "learning_rate": 5.190793277194383e-06, + "loss": 0.0014, + "step": 65890 + }, + { + "epoch": 1.0782950175897899, + "grad_norm": 0.08227351307868958, + "learning_rate": 5.189366403592929e-06, + "loss": 0.0021, + "step": 65900 + }, + { + "epoch": 1.0784586435408656, + "grad_norm": 0.062292248010635376, + "learning_rate": 5.1879395145473975e-06, + "loss": 0.0013, + "step": 65910 + }, + { + "epoch": 1.0786222694919414, + "grad_norm": 0.06601244956254959, + "learning_rate": 5.186512610174157e-06, + "loss": 0.0018, + "step": 65920 + }, + { + "epoch": 1.0787858954430172, + "grad_norm": 0.08488211035728455, + "learning_rate": 5.185085690589584e-06, + "loss": 0.0015, + "step": 65930 + }, + { + "epoch": 1.0789495213940932, + "grad_norm": 0.05413906276226044, + "learning_rate": 5.183658755910052e-06, + "loss": 0.0024, + "step": 65940 + }, + { + "epoch": 1.079113147345169, + "grad_norm": 0.020831607282161713, + "learning_rate": 5.182231806251936e-06, + "loss": 0.0016, + "step": 65950 + }, + { + "epoch": 1.0792767732962447, + "grad_norm": 0.13814033567905426, + "learning_rate": 5.180804841731613e-06, + "loss": 0.0039, + "step": 65960 + }, + { + "epoch": 1.0794403992473207, + "grad_norm": 0.049561165273189545, + "learning_rate": 5.179377862465464e-06, + "loss": 0.0011, + "step": 65970 + }, + { + "epoch": 1.0796040251983965, + "grad_norm": 0.033354684710502625, + "learning_rate": 5.1779508685698654e-06, + "loss": 0.0008, + "step": 65980 + }, + { + "epoch": 1.0797676511494723, + "grad_norm": 0.06770694255828857, + "learning_rate": 5.1765238601612e-06, + "loss": 0.002, + "step": 65990 + }, + { + "epoch": 1.079931277100548, + "grad_norm": 0.01733277551829815, + "learning_rate": 5.17509683735585e-06, + "loss": 0.0012, + "step": 66000 + }, + { + "epoch": 1.080094903051624, + "grad_norm": 0.04519897326827049, + "learning_rate": 5.173669800270196e-06, + "loss": 0.0009, + "step": 66010 + }, + { + "epoch": 1.0802585290026998, + "grad_norm": 0.07872622460126877, + "learning_rate": 5.1722427490206254e-06, + "loss": 0.0018, + "step": 66020 + }, + { + "epoch": 1.0804221549537756, + "grad_norm": 0.047858960926532745, + "learning_rate": 5.170815683723521e-06, + "loss": 0.0015, + "step": 66030 + }, + { + "epoch": 1.0805857809048516, + "grad_norm": 0.13418707251548767, + "learning_rate": 5.169388604495271e-06, + "loss": 0.0015, + "step": 66040 + }, + { + "epoch": 1.0807494068559274, + "grad_norm": 0.03831182047724724, + "learning_rate": 5.167961511452263e-06, + "loss": 0.0013, + "step": 66050 + }, + { + "epoch": 1.0809130328070031, + "grad_norm": 0.12964409589767456, + "learning_rate": 5.166534404710885e-06, + "loss": 0.0007, + "step": 66060 + }, + { + "epoch": 1.0810766587580791, + "grad_norm": 0.0464429073035717, + "learning_rate": 5.165107284387528e-06, + "loss": 0.0017, + "step": 66070 + }, + { + "epoch": 1.081240284709155, + "grad_norm": 0.06879943609237671, + "learning_rate": 5.163680150598583e-06, + "loss": 0.0018, + "step": 66080 + }, + { + "epoch": 1.0814039106602307, + "grad_norm": 0.18898960947990417, + "learning_rate": 5.162253003460442e-06, + "loss": 0.0029, + "step": 66090 + }, + { + "epoch": 1.0815675366113067, + "grad_norm": 0.035316988825798035, + "learning_rate": 5.160825843089496e-06, + "loss": 0.0014, + "step": 66100 + }, + { + "epoch": 1.0817311625623824, + "grad_norm": 0.04327777773141861, + "learning_rate": 5.159398669602143e-06, + "loss": 0.0011, + "step": 66110 + }, + { + "epoch": 1.0818947885134582, + "grad_norm": 0.1721632033586502, + "learning_rate": 5.157971483114778e-06, + "loss": 0.0039, + "step": 66120 + }, + { + "epoch": 1.082058414464534, + "grad_norm": 0.15825077891349792, + "learning_rate": 5.156544283743794e-06, + "loss": 0.0012, + "step": 66130 + }, + { + "epoch": 1.08222204041561, + "grad_norm": 0.14909835159778595, + "learning_rate": 5.155117071605592e-06, + "loss": 0.0013, + "step": 66140 + }, + { + "epoch": 1.0823856663666858, + "grad_norm": 0.06051696464419365, + "learning_rate": 5.1536898468165695e-06, + "loss": 0.0015, + "step": 66150 + }, + { + "epoch": 1.0825492923177615, + "grad_norm": 0.17413592338562012, + "learning_rate": 5.1522626094931265e-06, + "loss": 0.0027, + "step": 66160 + }, + { + "epoch": 1.0827129182688375, + "grad_norm": 0.01725386269390583, + "learning_rate": 5.150835359751661e-06, + "loss": 0.0013, + "step": 66170 + }, + { + "epoch": 1.0828765442199133, + "grad_norm": 0.05856165662407875, + "learning_rate": 5.149408097708578e-06, + "loss": 0.0026, + "step": 66180 + }, + { + "epoch": 1.083040170170989, + "grad_norm": 0.029762886464595795, + "learning_rate": 5.14798082348028e-06, + "loss": 0.0016, + "step": 66190 + }, + { + "epoch": 1.0832037961220649, + "grad_norm": 0.12964269518852234, + "learning_rate": 5.146553537183168e-06, + "loss": 0.0016, + "step": 66200 + }, + { + "epoch": 1.0833674220731409, + "grad_norm": 0.06808414310216904, + "learning_rate": 5.145126238933649e-06, + "loss": 0.0017, + "step": 66210 + }, + { + "epoch": 1.0835310480242166, + "grad_norm": 0.02311863750219345, + "learning_rate": 5.143698928848126e-06, + "loss": 0.0013, + "step": 66220 + }, + { + "epoch": 1.0836946739752924, + "grad_norm": 0.11196508258581161, + "learning_rate": 5.14227160704301e-06, + "loss": 0.0014, + "step": 66230 + }, + { + "epoch": 1.0838582999263684, + "grad_norm": 0.09211152791976929, + "learning_rate": 5.140844273634704e-06, + "loss": 0.001, + "step": 66240 + }, + { + "epoch": 1.0840219258774442, + "grad_norm": 0.008989924564957619, + "learning_rate": 5.139416928739619e-06, + "loss": 0.0012, + "step": 66250 + }, + { + "epoch": 1.08418555182852, + "grad_norm": 0.10878390073776245, + "learning_rate": 5.137989572474163e-06, + "loss": 0.0015, + "step": 66260 + }, + { + "epoch": 1.084349177779596, + "grad_norm": 0.028733471408486366, + "learning_rate": 5.136562204954748e-06, + "loss": 0.001, + "step": 66270 + }, + { + "epoch": 1.0845128037306717, + "grad_norm": 0.06863290071487427, + "learning_rate": 5.135134826297783e-06, + "loss": 0.0011, + "step": 66280 + }, + { + "epoch": 1.0846764296817475, + "grad_norm": 0.0035480589140206575, + "learning_rate": 5.1337074366196825e-06, + "loss": 0.0019, + "step": 66290 + }, + { + "epoch": 1.0848400556328233, + "grad_norm": 0.08949305862188339, + "learning_rate": 5.132280036036858e-06, + "loss": 0.0016, + "step": 66300 + }, + { + "epoch": 1.0850036815838993, + "grad_norm": 0.08759809285402298, + "learning_rate": 5.130852624665723e-06, + "loss": 0.0014, + "step": 66310 + }, + { + "epoch": 1.085167307534975, + "grad_norm": 0.15438930690288544, + "learning_rate": 5.129425202622693e-06, + "loss": 0.004, + "step": 66320 + }, + { + "epoch": 1.0853309334860508, + "grad_norm": 0.020953025668859482, + "learning_rate": 5.1279977700241845e-06, + "loss": 0.001, + "step": 66330 + }, + { + "epoch": 1.0854945594371268, + "grad_norm": 0.038243480026721954, + "learning_rate": 5.126570326986613e-06, + "loss": 0.0016, + "step": 66340 + }, + { + "epoch": 1.0856581853882026, + "grad_norm": 0.056142378598451614, + "learning_rate": 5.125142873626396e-06, + "loss": 0.0012, + "step": 66350 + }, + { + "epoch": 1.0858218113392784, + "grad_norm": 0.10768869519233704, + "learning_rate": 5.1237154100599525e-06, + "loss": 0.0012, + "step": 66360 + }, + { + "epoch": 1.0859854372903543, + "grad_norm": 0.043584391474723816, + "learning_rate": 5.122287936403701e-06, + "loss": 0.0024, + "step": 66370 + }, + { + "epoch": 1.0861490632414301, + "grad_norm": 0.2161857634782791, + "learning_rate": 5.120860452774062e-06, + "loss": 0.001, + "step": 66380 + }, + { + "epoch": 1.086312689192506, + "grad_norm": 0.12796850502490997, + "learning_rate": 5.119432959287456e-06, + "loss": 0.0016, + "step": 66390 + }, + { + "epoch": 1.0864763151435817, + "grad_norm": 0.07119136303663254, + "learning_rate": 5.118005456060303e-06, + "loss": 0.0012, + "step": 66400 + }, + { + "epoch": 1.0866399410946577, + "grad_norm": 0.07732295244932175, + "learning_rate": 5.116577943209027e-06, + "loss": 0.0028, + "step": 66410 + }, + { + "epoch": 1.0868035670457334, + "grad_norm": 0.04027533903717995, + "learning_rate": 5.11515042085005e-06, + "loss": 0.0024, + "step": 66420 + }, + { + "epoch": 1.0869671929968092, + "grad_norm": 0.043051790446043015, + "learning_rate": 5.113722889099797e-06, + "loss": 0.0014, + "step": 66430 + }, + { + "epoch": 1.0871308189478852, + "grad_norm": 0.0551178902387619, + "learning_rate": 5.112295348074692e-06, + "loss": 0.0014, + "step": 66440 + }, + { + "epoch": 1.087294444898961, + "grad_norm": 0.033098142594099045, + "learning_rate": 5.1108677978911604e-06, + "loss": 0.0018, + "step": 66450 + }, + { + "epoch": 1.0874580708500368, + "grad_norm": 0.002978342352434993, + "learning_rate": 5.109440238665628e-06, + "loss": 0.0021, + "step": 66460 + }, + { + "epoch": 1.0876216968011128, + "grad_norm": 0.07392004132270813, + "learning_rate": 5.108012670514523e-06, + "loss": 0.0019, + "step": 66470 + }, + { + "epoch": 1.0877853227521885, + "grad_norm": 0.024156300351023674, + "learning_rate": 5.106585093554272e-06, + "loss": 0.0021, + "step": 66480 + }, + { + "epoch": 1.0879489487032643, + "grad_norm": 0.05626894533634186, + "learning_rate": 5.105157507901302e-06, + "loss": 0.0016, + "step": 66490 + }, + { + "epoch": 1.08811257465434, + "grad_norm": 0.050983961671590805, + "learning_rate": 5.103729913672046e-06, + "loss": 0.0014, + "step": 66500 + }, + { + "epoch": 1.088276200605416, + "grad_norm": 0.13798587024211884, + "learning_rate": 5.102302310982929e-06, + "loss": 0.0021, + "step": 66510 + }, + { + "epoch": 1.0884398265564919, + "grad_norm": 0.0390687994658947, + "learning_rate": 5.100874699950384e-06, + "loss": 0.0011, + "step": 66520 + }, + { + "epoch": 1.0886034525075676, + "grad_norm": 0.03659537807106972, + "learning_rate": 5.099447080690842e-06, + "loss": 0.0014, + "step": 66530 + }, + { + "epoch": 1.0887670784586436, + "grad_norm": 0.03784247860312462, + "learning_rate": 5.0980194533207336e-06, + "loss": 0.0014, + "step": 66540 + }, + { + "epoch": 1.0889307044097194, + "grad_norm": 0.1008988469839096, + "learning_rate": 5.096591817956493e-06, + "loss": 0.0009, + "step": 66550 + }, + { + "epoch": 1.0890943303607952, + "grad_norm": 0.08490591496229172, + "learning_rate": 5.095164174714553e-06, + "loss": 0.0015, + "step": 66560 + }, + { + "epoch": 1.089257956311871, + "grad_norm": 0.021908758208155632, + "learning_rate": 5.0937365237113455e-06, + "loss": 0.0014, + "step": 66570 + }, + { + "epoch": 1.089421582262947, + "grad_norm": 0.12063805013895035, + "learning_rate": 5.092308865063307e-06, + "loss": 0.0014, + "step": 66580 + }, + { + "epoch": 1.0895852082140227, + "grad_norm": 0.06550191342830658, + "learning_rate": 5.09088119888687e-06, + "loss": 0.0013, + "step": 66590 + }, + { + "epoch": 1.0897488341650985, + "grad_norm": 0.06678319722414017, + "learning_rate": 5.089453525298474e-06, + "loss": 0.0009, + "step": 66600 + }, + { + "epoch": 1.0899124601161745, + "grad_norm": 0.03586084023118019, + "learning_rate": 5.088025844414551e-06, + "loss": 0.0018, + "step": 66610 + }, + { + "epoch": 1.0900760860672503, + "grad_norm": 0.18501341342926025, + "learning_rate": 5.086598156351541e-06, + "loss": 0.0015, + "step": 66620 + }, + { + "epoch": 1.090239712018326, + "grad_norm": 0.04540174454450607, + "learning_rate": 5.085170461225879e-06, + "loss": 0.002, + "step": 66630 + }, + { + "epoch": 1.090403337969402, + "grad_norm": 0.13158740103244781, + "learning_rate": 5.083742759154003e-06, + "loss": 0.0012, + "step": 66640 + }, + { + "epoch": 1.0905669639204778, + "grad_norm": 0.01592317968606949, + "learning_rate": 5.082315050252355e-06, + "loss": 0.0007, + "step": 66650 + }, + { + "epoch": 1.0907305898715536, + "grad_norm": 0.05693323165178299, + "learning_rate": 5.0808873346373685e-06, + "loss": 0.0011, + "step": 66660 + }, + { + "epoch": 1.0908942158226296, + "grad_norm": 0.043752219527959824, + "learning_rate": 5.079459612425488e-06, + "loss": 0.0017, + "step": 66670 + }, + { + "epoch": 1.0910578417737054, + "grad_norm": 0.004869421944022179, + "learning_rate": 5.078031883733152e-06, + "loss": 0.0015, + "step": 66680 + }, + { + "epoch": 1.0912214677247811, + "grad_norm": 0.04623570293188095, + "learning_rate": 5.0766041486768005e-06, + "loss": 0.0012, + "step": 66690 + }, + { + "epoch": 1.091385093675857, + "grad_norm": 0.04098409041762352, + "learning_rate": 5.075176407372875e-06, + "loss": 0.0018, + "step": 66700 + }, + { + "epoch": 1.091548719626933, + "grad_norm": 0.05187565088272095, + "learning_rate": 5.073748659937819e-06, + "loss": 0.0013, + "step": 66710 + }, + { + "epoch": 1.0917123455780087, + "grad_norm": 0.05179828405380249, + "learning_rate": 5.0723209064880705e-06, + "loss": 0.0014, + "step": 66720 + }, + { + "epoch": 1.0918759715290844, + "grad_norm": 0.0177445225417614, + "learning_rate": 5.070893147140077e-06, + "loss": 0.0011, + "step": 66730 + }, + { + "epoch": 1.0920395974801604, + "grad_norm": 0.10295765101909637, + "learning_rate": 5.069465382010279e-06, + "loss": 0.0047, + "step": 66740 + }, + { + "epoch": 1.0922032234312362, + "grad_norm": 0.030104324221611023, + "learning_rate": 5.068037611215121e-06, + "loss": 0.0013, + "step": 66750 + }, + { + "epoch": 1.092366849382312, + "grad_norm": 0.09781965613365173, + "learning_rate": 5.066609834871047e-06, + "loss": 0.0018, + "step": 66760 + }, + { + "epoch": 1.0925304753333878, + "grad_norm": 0.056559912860393524, + "learning_rate": 5.0651820530945e-06, + "loss": 0.0035, + "step": 66770 + }, + { + "epoch": 1.0926941012844638, + "grad_norm": 0.02647298388183117, + "learning_rate": 5.063754266001929e-06, + "loss": 0.0009, + "step": 66780 + }, + { + "epoch": 1.0928577272355395, + "grad_norm": 0.060371797531843185, + "learning_rate": 5.062326473709775e-06, + "loss": 0.0028, + "step": 66790 + }, + { + "epoch": 1.0930213531866153, + "grad_norm": 0.1852189004421234, + "learning_rate": 5.060898676334487e-06, + "loss": 0.0021, + "step": 66800 + }, + { + "epoch": 1.0931849791376913, + "grad_norm": 0.038718532770872116, + "learning_rate": 5.05947087399251e-06, + "loss": 0.0012, + "step": 66810 + }, + { + "epoch": 1.093348605088767, + "grad_norm": 0.10918682813644409, + "learning_rate": 5.058043066800291e-06, + "loss": 0.0011, + "step": 66820 + }, + { + "epoch": 1.0935122310398429, + "grad_norm": 0.03159303590655327, + "learning_rate": 5.0566152548742766e-06, + "loss": 0.0013, + "step": 66830 + }, + { + "epoch": 1.0936758569909188, + "grad_norm": 0.009500847198069096, + "learning_rate": 5.0551874383309145e-06, + "loss": 0.0015, + "step": 66840 + }, + { + "epoch": 1.0938394829419946, + "grad_norm": 0.020893540233373642, + "learning_rate": 5.053759617286652e-06, + "loss": 0.0007, + "step": 66850 + }, + { + "epoch": 1.0940031088930704, + "grad_norm": 0.0421232245862484, + "learning_rate": 5.05233179185794e-06, + "loss": 0.0014, + "step": 66860 + }, + { + "epoch": 1.0941667348441464, + "grad_norm": 0.10335648059844971, + "learning_rate": 5.050903962161223e-06, + "loss": 0.0018, + "step": 66870 + }, + { + "epoch": 1.0943303607952222, + "grad_norm": 0.12847822904586792, + "learning_rate": 5.049476128312954e-06, + "loss": 0.0027, + "step": 66880 + }, + { + "epoch": 1.094493986746298, + "grad_norm": 0.17413492500782013, + "learning_rate": 5.048048290429579e-06, + "loss": 0.0017, + "step": 66890 + }, + { + "epoch": 1.0946576126973737, + "grad_norm": 0.011131856590509415, + "learning_rate": 5.0466204486275495e-06, + "loss": 0.002, + "step": 66900 + }, + { + "epoch": 1.0948212386484497, + "grad_norm": 0.03642764315009117, + "learning_rate": 5.0451926030233145e-06, + "loss": 0.0014, + "step": 66910 + }, + { + "epoch": 1.0949848645995255, + "grad_norm": 0.16153410077095032, + "learning_rate": 5.043764753733326e-06, + "loss": 0.0017, + "step": 66920 + }, + { + "epoch": 1.0951484905506013, + "grad_norm": 0.056975413113832474, + "learning_rate": 5.042336900874031e-06, + "loss": 0.001, + "step": 66930 + }, + { + "epoch": 1.0953121165016773, + "grad_norm": 0.06315185129642487, + "learning_rate": 5.040909044561882e-06, + "loss": 0.0013, + "step": 66940 + }, + { + "epoch": 1.095475742452753, + "grad_norm": 0.038827188313007355, + "learning_rate": 5.039481184913333e-06, + "loss": 0.0008, + "step": 66950 + }, + { + "epoch": 1.0956393684038288, + "grad_norm": 0.04742514714598656, + "learning_rate": 5.038053322044832e-06, + "loss": 0.0026, + "step": 66960 + }, + { + "epoch": 1.0958029943549046, + "grad_norm": 0.03633485734462738, + "learning_rate": 5.03662545607283e-06, + "loss": 0.0015, + "step": 66970 + }, + { + "epoch": 1.0959666203059806, + "grad_norm": 0.0665416270494461, + "learning_rate": 5.0351975871137814e-06, + "loss": 0.0013, + "step": 66980 + }, + { + "epoch": 1.0961302462570564, + "grad_norm": 0.2053467482328415, + "learning_rate": 5.033769715284137e-06, + "loss": 0.0014, + "step": 66990 + }, + { + "epoch": 1.0962938722081321, + "grad_norm": 0.07712331414222717, + "learning_rate": 5.03234184070035e-06, + "loss": 0.0018, + "step": 67000 + }, + { + "epoch": 1.0964574981592081, + "grad_norm": 0.07719994336366653, + "learning_rate": 5.030913963478873e-06, + "loss": 0.0019, + "step": 67010 + }, + { + "epoch": 1.096621124110284, + "grad_norm": 0.23459912836551666, + "learning_rate": 5.029486083736157e-06, + "loss": 0.0018, + "step": 67020 + }, + { + "epoch": 1.0967847500613597, + "grad_norm": 0.05724671855568886, + "learning_rate": 5.028058201588657e-06, + "loss": 0.001, + "step": 67030 + }, + { + "epoch": 1.0969483760124357, + "grad_norm": 0.010152517817914486, + "learning_rate": 5.026630317152826e-06, + "loss": 0.0017, + "step": 67040 + }, + { + "epoch": 1.0971120019635114, + "grad_norm": 0.03183060884475708, + "learning_rate": 5.025202430545116e-06, + "loss": 0.0013, + "step": 67050 + }, + { + "epoch": 1.0972756279145872, + "grad_norm": 0.059750061482191086, + "learning_rate": 5.023774541881983e-06, + "loss": 0.0015, + "step": 67060 + }, + { + "epoch": 1.0974392538656632, + "grad_norm": 0.03474615141749382, + "learning_rate": 5.022346651279878e-06, + "loss": 0.0009, + "step": 67070 + }, + { + "epoch": 1.097602879816739, + "grad_norm": 0.03113647550344467, + "learning_rate": 5.020918758855257e-06, + "loss": 0.0033, + "step": 67080 + }, + { + "epoch": 1.0977665057678148, + "grad_norm": 0.0761863961815834, + "learning_rate": 5.019490864724572e-06, + "loss": 0.0021, + "step": 67090 + }, + { + "epoch": 1.0979301317188905, + "grad_norm": 0.08832691609859467, + "learning_rate": 5.018062969004279e-06, + "loss": 0.0011, + "step": 67100 + }, + { + "epoch": 1.0980937576699665, + "grad_norm": 0.05399898812174797, + "learning_rate": 5.016635071810832e-06, + "loss": 0.0016, + "step": 67110 + }, + { + "epoch": 1.0982573836210423, + "grad_norm": 0.026567207649350166, + "learning_rate": 5.015207173260684e-06, + "loss": 0.0008, + "step": 67120 + }, + { + "epoch": 1.098421009572118, + "grad_norm": 0.03639587387442589, + "learning_rate": 5.013779273470292e-06, + "loss": 0.0022, + "step": 67130 + }, + { + "epoch": 1.098584635523194, + "grad_norm": 0.08234965801239014, + "learning_rate": 5.012351372556109e-06, + "loss": 0.0016, + "step": 67140 + }, + { + "epoch": 1.0987482614742698, + "grad_norm": 0.12885858118534088, + "learning_rate": 5.01092347063459e-06, + "loss": 0.002, + "step": 67150 + }, + { + "epoch": 1.0989118874253456, + "grad_norm": 0.07344863563776016, + "learning_rate": 5.009495567822189e-06, + "loss": 0.0008, + "step": 67160 + }, + { + "epoch": 1.0990755133764214, + "grad_norm": 0.062437355518341064, + "learning_rate": 5.008067664235363e-06, + "loss": 0.001, + "step": 67170 + }, + { + "epoch": 1.0992391393274974, + "grad_norm": 0.05675366148352623, + "learning_rate": 5.006639759990566e-06, + "loss": 0.0012, + "step": 67180 + }, + { + "epoch": 1.0994027652785732, + "grad_norm": 0.05875685438513756, + "learning_rate": 5.005211855204251e-06, + "loss": 0.0009, + "step": 67190 + }, + { + "epoch": 1.099566391229649, + "grad_norm": 0.043574266135692596, + "learning_rate": 5.003783949992876e-06, + "loss": 0.0019, + "step": 67200 + }, + { + "epoch": 1.099730017180725, + "grad_norm": 0.0449739508330822, + "learning_rate": 5.002356044472896e-06, + "loss": 0.0008, + "step": 67210 + }, + { + "epoch": 1.0998936431318007, + "grad_norm": 0.014705897308886051, + "learning_rate": 5.0009281387607635e-06, + "loss": 0.005, + "step": 67220 + }, + { + "epoch": 1.1000572690828765, + "grad_norm": 0.01937803439795971, + "learning_rate": 4.999500232972936e-06, + "loss": 0.0019, + "step": 67230 + }, + { + "epoch": 1.1002208950339525, + "grad_norm": 0.04800884798169136, + "learning_rate": 4.998072327225868e-06, + "loss": 0.0017, + "step": 67240 + }, + { + "epoch": 1.1003845209850283, + "grad_norm": 0.032485850155353546, + "learning_rate": 4.996644421636014e-06, + "loss": 0.0019, + "step": 67250 + }, + { + "epoch": 1.100548146936104, + "grad_norm": 0.08788475394248962, + "learning_rate": 4.99521651631983e-06, + "loss": 0.0021, + "step": 67260 + }, + { + "epoch": 1.1007117728871798, + "grad_norm": 0.05829519405961037, + "learning_rate": 4.993788611393769e-06, + "loss": 0.0018, + "step": 67270 + }, + { + "epoch": 1.1008753988382558, + "grad_norm": 0.03683392331004143, + "learning_rate": 4.992360706974289e-06, + "loss": 0.0015, + "step": 67280 + }, + { + "epoch": 1.1010390247893316, + "grad_norm": 0.037919510155916214, + "learning_rate": 4.9909328031778435e-06, + "loss": 0.0016, + "step": 67290 + }, + { + "epoch": 1.1012026507404074, + "grad_norm": 0.05159285292029381, + "learning_rate": 4.9895049001208875e-06, + "loss": 0.0016, + "step": 67300 + }, + { + "epoch": 1.1013662766914833, + "grad_norm": 0.03023102693259716, + "learning_rate": 4.988076997919877e-06, + "loss": 0.0007, + "step": 67310 + }, + { + "epoch": 1.1015299026425591, + "grad_norm": 0.04483753815293312, + "learning_rate": 4.986649096691265e-06, + "loss": 0.001, + "step": 67320 + }, + { + "epoch": 1.101693528593635, + "grad_norm": 0.07059621810913086, + "learning_rate": 4.985221196551508e-06, + "loss": 0.0012, + "step": 67330 + }, + { + "epoch": 1.101857154544711, + "grad_norm": 0.14924487471580505, + "learning_rate": 4.9837932976170594e-06, + "loss": 0.0029, + "step": 67340 + }, + { + "epoch": 1.1020207804957867, + "grad_norm": 0.059832584112882614, + "learning_rate": 4.982365400004374e-06, + "loss": 0.001, + "step": 67350 + }, + { + "epoch": 1.1021844064468624, + "grad_norm": 0.09202532470226288, + "learning_rate": 4.980937503829907e-06, + "loss": 0.0009, + "step": 67360 + }, + { + "epoch": 1.1023480323979382, + "grad_norm": 0.09011992067098618, + "learning_rate": 4.979509609210112e-06, + "loss": 0.0015, + "step": 67370 + }, + { + "epoch": 1.1025116583490142, + "grad_norm": 0.021511957049369812, + "learning_rate": 4.978081716261445e-06, + "loss": 0.001, + "step": 67380 + }, + { + "epoch": 1.10267528430009, + "grad_norm": 0.006959167309105396, + "learning_rate": 4.976653825100357e-06, + "loss": 0.0018, + "step": 67390 + }, + { + "epoch": 1.1028389102511658, + "grad_norm": 0.040239367634058, + "learning_rate": 4.975225935843304e-06, + "loss": 0.0016, + "step": 67400 + }, + { + "epoch": 1.1030025362022418, + "grad_norm": 0.07228533923625946, + "learning_rate": 4.97379804860674e-06, + "loss": 0.0012, + "step": 67410 + }, + { + "epoch": 1.1031661621533175, + "grad_norm": 0.02421848475933075, + "learning_rate": 4.972370163507117e-06, + "loss": 0.0009, + "step": 67420 + }, + { + "epoch": 1.1033297881043933, + "grad_norm": 0.08054196089506149, + "learning_rate": 4.97094228066089e-06, + "loss": 0.0011, + "step": 67430 + }, + { + "epoch": 1.1034934140554693, + "grad_norm": 0.0896577313542366, + "learning_rate": 4.969514400184512e-06, + "loss": 0.002, + "step": 67440 + }, + { + "epoch": 1.103657040006545, + "grad_norm": 0.0888267308473587, + "learning_rate": 4.9680865221944355e-06, + "loss": 0.0014, + "step": 67450 + }, + { + "epoch": 1.1038206659576209, + "grad_norm": 0.05953643098473549, + "learning_rate": 4.9666586468071135e-06, + "loss": 0.002, + "step": 67460 + }, + { + "epoch": 1.1039842919086966, + "grad_norm": 0.06607147306203842, + "learning_rate": 4.965230774138999e-06, + "loss": 0.0011, + "step": 67470 + }, + { + "epoch": 1.1041479178597726, + "grad_norm": 0.024466248229146004, + "learning_rate": 4.963802904306545e-06, + "loss": 0.0014, + "step": 67480 + }, + { + "epoch": 1.1043115438108484, + "grad_norm": 0.04621598869562149, + "learning_rate": 4.962375037426202e-06, + "loss": 0.001, + "step": 67490 + }, + { + "epoch": 1.1044751697619242, + "grad_norm": 0.03939751535654068, + "learning_rate": 4.9609471736144234e-06, + "loss": 0.0015, + "step": 67500 + }, + { + "epoch": 1.1046387957130002, + "grad_norm": 0.24628588557243347, + "learning_rate": 4.959519312987659e-06, + "loss": 0.0015, + "step": 67510 + }, + { + "epoch": 1.104802421664076, + "grad_norm": 0.07973644882440567, + "learning_rate": 4.958091455662364e-06, + "loss": 0.0012, + "step": 67520 + }, + { + "epoch": 1.1049660476151517, + "grad_norm": 0.06189773604273796, + "learning_rate": 4.956663601754987e-06, + "loss": 0.0022, + "step": 67530 + }, + { + "epoch": 1.1051296735662275, + "grad_norm": 0.04373147338628769, + "learning_rate": 4.955235751381977e-06, + "loss": 0.0024, + "step": 67540 + }, + { + "epoch": 1.1052932995173035, + "grad_norm": 0.0825188085436821, + "learning_rate": 4.9538079046597885e-06, + "loss": 0.0027, + "step": 67550 + }, + { + "epoch": 1.1054569254683793, + "grad_norm": 0.0289162490516901, + "learning_rate": 4.952380061704871e-06, + "loss": 0.0013, + "step": 67560 + }, + { + "epoch": 1.105620551419455, + "grad_norm": 0.06399867683649063, + "learning_rate": 4.950952222633672e-06, + "loss": 0.0015, + "step": 67570 + }, + { + "epoch": 1.105784177370531, + "grad_norm": 0.09018483757972717, + "learning_rate": 4.9495243875626444e-06, + "loss": 0.001, + "step": 67580 + }, + { + "epoch": 1.1059478033216068, + "grad_norm": 0.08095432817935944, + "learning_rate": 4.948096556608235e-06, + "loss": 0.0015, + "step": 67590 + }, + { + "epoch": 1.1061114292726826, + "grad_norm": 0.0099486093968153, + "learning_rate": 4.9466687298868934e-06, + "loss": 0.0018, + "step": 67600 + }, + { + "epoch": 1.1062750552237586, + "grad_norm": 0.04577697440981865, + "learning_rate": 4.945240907515069e-06, + "loss": 0.0011, + "step": 67610 + }, + { + "epoch": 1.1064386811748343, + "grad_norm": 0.08428248018026352, + "learning_rate": 4.943813089609211e-06, + "loss": 0.0037, + "step": 67620 + }, + { + "epoch": 1.1066023071259101, + "grad_norm": 0.004601741675287485, + "learning_rate": 4.942385276285765e-06, + "loss": 0.0012, + "step": 67630 + }, + { + "epoch": 1.1067659330769861, + "grad_norm": 0.055516455322504044, + "learning_rate": 4.94095746766118e-06, + "loss": 0.0013, + "step": 67640 + }, + { + "epoch": 1.106929559028062, + "grad_norm": 0.06810334324836731, + "learning_rate": 4.939529663851903e-06, + "loss": 0.0016, + "step": 67650 + }, + { + "epoch": 1.1070931849791377, + "grad_norm": 0.13163498044013977, + "learning_rate": 4.938101864974381e-06, + "loss": 0.0017, + "step": 67660 + }, + { + "epoch": 1.1072568109302134, + "grad_norm": 0.04317409172654152, + "learning_rate": 4.9366740711450605e-06, + "loss": 0.0035, + "step": 67670 + }, + { + "epoch": 1.1074204368812894, + "grad_norm": 0.10430760681629181, + "learning_rate": 4.935246282480388e-06, + "loss": 0.0014, + "step": 67680 + }, + { + "epoch": 1.1075840628323652, + "grad_norm": 0.05480848252773285, + "learning_rate": 4.933818499096807e-06, + "loss": 0.0014, + "step": 67690 + }, + { + "epoch": 1.107747688783441, + "grad_norm": 0.08361303806304932, + "learning_rate": 4.932390721110765e-06, + "loss": 0.0012, + "step": 67700 + }, + { + "epoch": 1.107911314734517, + "grad_norm": 0.0881851464509964, + "learning_rate": 4.930962948638705e-06, + "loss": 0.0019, + "step": 67710 + }, + { + "epoch": 1.1080749406855928, + "grad_norm": 0.06896450370550156, + "learning_rate": 4.929535181797073e-06, + "loss": 0.0021, + "step": 67720 + }, + { + "epoch": 1.1082385666366685, + "grad_norm": 0.002867023227736354, + "learning_rate": 4.92810742070231e-06, + "loss": 0.0017, + "step": 67730 + }, + { + "epoch": 1.1084021925877443, + "grad_norm": 0.07280313223600388, + "learning_rate": 4.926679665470863e-06, + "loss": 0.0021, + "step": 67740 + }, + { + "epoch": 1.1085658185388203, + "grad_norm": 0.053313832730054855, + "learning_rate": 4.925251916219173e-06, + "loss": 0.0012, + "step": 67750 + }, + { + "epoch": 1.108729444489896, + "grad_norm": 0.08107008785009384, + "learning_rate": 4.923824173063681e-06, + "loss": 0.0013, + "step": 67760 + }, + { + "epoch": 1.1088930704409719, + "grad_norm": 0.07604475319385529, + "learning_rate": 4.922396436120832e-06, + "loss": 0.0018, + "step": 67770 + }, + { + "epoch": 1.1090566963920478, + "grad_norm": 0.013636584393680096, + "learning_rate": 4.920968705507065e-06, + "loss": 0.0009, + "step": 67780 + }, + { + "epoch": 1.1092203223431236, + "grad_norm": 0.030414769425988197, + "learning_rate": 4.91954098133882e-06, + "loss": 0.0028, + "step": 67790 + }, + { + "epoch": 1.1093839482941994, + "grad_norm": 0.034453511238098145, + "learning_rate": 4.918113263732541e-06, + "loss": 0.0026, + "step": 67800 + }, + { + "epoch": 1.1095475742452754, + "grad_norm": 0.11767400056123734, + "learning_rate": 4.916685552804664e-06, + "loss": 0.0017, + "step": 67810 + }, + { + "epoch": 1.1097112001963512, + "grad_norm": 0.11816064268350601, + "learning_rate": 4.915257848671631e-06, + "loss": 0.0023, + "step": 67820 + }, + { + "epoch": 1.109874826147427, + "grad_norm": 0.10383836925029755, + "learning_rate": 4.913830151449879e-06, + "loss": 0.0013, + "step": 67830 + }, + { + "epoch": 1.110038452098503, + "grad_norm": 0.14100036025047302, + "learning_rate": 4.912402461255847e-06, + "loss": 0.0016, + "step": 67840 + }, + { + "epoch": 1.1102020780495787, + "grad_norm": 0.13607031106948853, + "learning_rate": 4.910974778205972e-06, + "loss": 0.002, + "step": 67850 + }, + { + "epoch": 1.1103657040006545, + "grad_norm": 0.058741047978401184, + "learning_rate": 4.90954710241669e-06, + "loss": 0.0014, + "step": 67860 + }, + { + "epoch": 1.1105293299517303, + "grad_norm": 0.15582901239395142, + "learning_rate": 4.908119434004441e-06, + "loss": 0.0015, + "step": 67870 + }, + { + "epoch": 1.1106929559028063, + "grad_norm": 0.06155945733189583, + "learning_rate": 4.906691773085657e-06, + "loss": 0.0014, + "step": 67880 + }, + { + "epoch": 1.110856581853882, + "grad_norm": 0.09701423346996307, + "learning_rate": 4.905264119776775e-06, + "loss": 0.0007, + "step": 67890 + }, + { + "epoch": 1.1110202078049578, + "grad_norm": 0.01467121671885252, + "learning_rate": 4.903836474194229e-06, + "loss": 0.0017, + "step": 67900 + }, + { + "epoch": 1.1111838337560338, + "grad_norm": 0.032242126762866974, + "learning_rate": 4.902408836454453e-06, + "loss": 0.0011, + "step": 67910 + }, + { + "epoch": 1.1113474597071096, + "grad_norm": 0.013647392392158508, + "learning_rate": 4.90098120667388e-06, + "loss": 0.001, + "step": 67920 + }, + { + "epoch": 1.1115110856581853, + "grad_norm": 0.04842734709382057, + "learning_rate": 4.899553584968943e-06, + "loss": 0.0011, + "step": 67930 + }, + { + "epoch": 1.1116747116092611, + "grad_norm": 0.060524847358465195, + "learning_rate": 4.898125971456074e-06, + "loss": 0.0017, + "step": 67940 + }, + { + "epoch": 1.1118383375603371, + "grad_norm": 0.04990917816758156, + "learning_rate": 4.896698366251703e-06, + "loss": 0.0007, + "step": 67950 + }, + { + "epoch": 1.112001963511413, + "grad_norm": 0.07572031766176224, + "learning_rate": 4.895270769472263e-06, + "loss": 0.0015, + "step": 67960 + }, + { + "epoch": 1.1121655894624887, + "grad_norm": 0.09095306694507599, + "learning_rate": 4.893843181234182e-06, + "loss": 0.0021, + "step": 67970 + }, + { + "epoch": 1.1123292154135647, + "grad_norm": 0.11911226063966751, + "learning_rate": 4.892415601653891e-06, + "loss": 0.0024, + "step": 67980 + }, + { + "epoch": 1.1124928413646404, + "grad_norm": 0.07754993438720703, + "learning_rate": 4.890988030847817e-06, + "loss": 0.0017, + "step": 67990 + }, + { + "epoch": 1.1126564673157162, + "grad_norm": 0.04118460789322853, + "learning_rate": 4.8895604689323875e-06, + "loss": 0.0012, + "step": 68000 + }, + { + "epoch": 1.1128200932667922, + "grad_norm": 0.025269757956266403, + "learning_rate": 4.888132916024031e-06, + "loss": 0.0018, + "step": 68010 + }, + { + "epoch": 1.112983719217868, + "grad_norm": 0.004718205891549587, + "learning_rate": 4.886705372239174e-06, + "loss": 0.003, + "step": 68020 + }, + { + "epoch": 1.1131473451689438, + "grad_norm": 0.2105235904455185, + "learning_rate": 4.8852778376942405e-06, + "loss": 0.0027, + "step": 68030 + }, + { + "epoch": 1.1133109711200195, + "grad_norm": 0.05768294632434845, + "learning_rate": 4.883850312505656e-06, + "loss": 0.0021, + "step": 68040 + }, + { + "epoch": 1.1134745970710955, + "grad_norm": 0.07050131261348724, + "learning_rate": 4.882422796789846e-06, + "loss": 0.0016, + "step": 68050 + }, + { + "epoch": 1.1136382230221713, + "grad_norm": 0.3542935252189636, + "learning_rate": 4.880995290663231e-06, + "loss": 0.0014, + "step": 68060 + }, + { + "epoch": 1.113801848973247, + "grad_norm": 0.05983225256204605, + "learning_rate": 4.879567794242237e-06, + "loss": 0.002, + "step": 68070 + }, + { + "epoch": 1.113965474924323, + "grad_norm": 0.08799834549427032, + "learning_rate": 4.878140307643282e-06, + "loss": 0.0015, + "step": 68080 + }, + { + "epoch": 1.1141291008753988, + "grad_norm": 0.0234416201710701, + "learning_rate": 4.876712830982791e-06, + "loss": 0.0013, + "step": 68090 + }, + { + "epoch": 1.1142927268264746, + "grad_norm": 0.1094837486743927, + "learning_rate": 4.875285364377181e-06, + "loss": 0.0014, + "step": 68100 + }, + { + "epoch": 1.1144563527775506, + "grad_norm": 0.028089361265301704, + "learning_rate": 4.873857907942872e-06, + "loss": 0.0017, + "step": 68110 + }, + { + "epoch": 1.1146199787286264, + "grad_norm": 0.08715226501226425, + "learning_rate": 4.872430461796283e-06, + "loss": 0.0013, + "step": 68120 + }, + { + "epoch": 1.1147836046797022, + "grad_norm": 0.18056809902191162, + "learning_rate": 4.8710030260538325e-06, + "loss": 0.001, + "step": 68130 + }, + { + "epoch": 1.114947230630778, + "grad_norm": 0.03907276317477226, + "learning_rate": 4.869575600831936e-06, + "loss": 0.0018, + "step": 68140 + }, + { + "epoch": 1.115110856581854, + "grad_norm": 0.06399427354335785, + "learning_rate": 4.8681481862470085e-06, + "loss": 0.0018, + "step": 68150 + }, + { + "epoch": 1.1152744825329297, + "grad_norm": 0.007164820097386837, + "learning_rate": 4.866720782415467e-06, + "loss": 0.0008, + "step": 68160 + }, + { + "epoch": 1.1154381084840055, + "grad_norm": 0.02238679677248001, + "learning_rate": 4.865293389453725e-06, + "loss": 0.0018, + "step": 68170 + }, + { + "epoch": 1.1156017344350815, + "grad_norm": 0.06743773072957993, + "learning_rate": 4.863866007478197e-06, + "loss": 0.0011, + "step": 68180 + }, + { + "epoch": 1.1157653603861573, + "grad_norm": 0.10422379523515701, + "learning_rate": 4.8624386366052925e-06, + "loss": 0.0013, + "step": 68190 + }, + { + "epoch": 1.115928986337233, + "grad_norm": 0.09807144105434418, + "learning_rate": 4.861011276951426e-06, + "loss": 0.0021, + "step": 68200 + }, + { + "epoch": 1.116092612288309, + "grad_norm": 0.06299929320812225, + "learning_rate": 4.859583928633007e-06, + "loss": 0.001, + "step": 68210 + }, + { + "epoch": 1.1162562382393848, + "grad_norm": 0.053699031472206116, + "learning_rate": 4.8581565917664455e-06, + "loss": 0.0012, + "step": 68220 + }, + { + "epoch": 1.1164198641904606, + "grad_norm": 0.05500046908855438, + "learning_rate": 4.856729266468149e-06, + "loss": 0.0019, + "step": 68230 + }, + { + "epoch": 1.1165834901415364, + "grad_norm": 0.23045317828655243, + "learning_rate": 4.855301952854525e-06, + "loss": 0.0015, + "step": 68240 + }, + { + "epoch": 1.1167471160926123, + "grad_norm": 0.02562612295150757, + "learning_rate": 4.853874651041983e-06, + "loss": 0.001, + "step": 68250 + }, + { + "epoch": 1.1169107420436881, + "grad_norm": 0.04910077154636383, + "learning_rate": 4.852447361146926e-06, + "loss": 0.0017, + "step": 68260 + }, + { + "epoch": 1.117074367994764, + "grad_norm": 0.21876679360866547, + "learning_rate": 4.851020083285761e-06, + "loss": 0.0029, + "step": 68270 + }, + { + "epoch": 1.11723799394584, + "grad_norm": 0.03991933539509773, + "learning_rate": 4.84959281757489e-06, + "loss": 0.0013, + "step": 68280 + }, + { + "epoch": 1.1174016198969157, + "grad_norm": 0.04062545672059059, + "learning_rate": 4.848165564130719e-06, + "loss": 0.0012, + "step": 68290 + }, + { + "epoch": 1.1175652458479914, + "grad_norm": 0.10241885483264923, + "learning_rate": 4.846738323069647e-06, + "loss": 0.0015, + "step": 68300 + }, + { + "epoch": 1.1177288717990674, + "grad_norm": 0.017392035573720932, + "learning_rate": 4.8453110945080764e-06, + "loss": 0.0005, + "step": 68310 + }, + { + "epoch": 1.1178924977501432, + "grad_norm": 0.08320208638906479, + "learning_rate": 4.843883878562406e-06, + "loss": 0.0022, + "step": 68320 + }, + { + "epoch": 1.118056123701219, + "grad_norm": 0.031161842867732048, + "learning_rate": 4.8424566753490355e-06, + "loss": 0.0016, + "step": 68330 + }, + { + "epoch": 1.1182197496522948, + "grad_norm": 0.04601942375302315, + "learning_rate": 4.841029484984362e-06, + "loss": 0.0014, + "step": 68340 + }, + { + "epoch": 1.1183833756033708, + "grad_norm": 0.03770656883716583, + "learning_rate": 4.839602307584783e-06, + "loss": 0.0012, + "step": 68350 + }, + { + "epoch": 1.1185470015544465, + "grad_norm": 0.005928488448262215, + "learning_rate": 4.838175143266695e-06, + "loss": 0.0022, + "step": 68360 + }, + { + "epoch": 1.1187106275055223, + "grad_norm": 0.015430208295583725, + "learning_rate": 4.836747992146491e-06, + "loss": 0.0018, + "step": 68370 + }, + { + "epoch": 1.1188742534565983, + "grad_norm": 0.0399409644305706, + "learning_rate": 4.835320854340565e-06, + "loss": 0.0012, + "step": 68380 + }, + { + "epoch": 1.119037879407674, + "grad_norm": 0.04816249758005142, + "learning_rate": 4.833893729965311e-06, + "loss": 0.0014, + "step": 68390 + }, + { + "epoch": 1.1192015053587498, + "grad_norm": 0.04776677489280701, + "learning_rate": 4.832466619137119e-06, + "loss": 0.0017, + "step": 68400 + }, + { + "epoch": 1.1193651313098258, + "grad_norm": 0.05590014532208443, + "learning_rate": 4.831039521972379e-06, + "loss": 0.0015, + "step": 68410 + }, + { + "epoch": 1.1195287572609016, + "grad_norm": 0.02409372478723526, + "learning_rate": 4.829612438587481e-06, + "loss": 0.0019, + "step": 68420 + }, + { + "epoch": 1.1196923832119774, + "grad_norm": 0.03411567583680153, + "learning_rate": 4.828185369098813e-06, + "loss": 0.0015, + "step": 68430 + }, + { + "epoch": 1.1198560091630532, + "grad_norm": 0.008275383152067661, + "learning_rate": 4.826758313622761e-06, + "loss": 0.0011, + "step": 68440 + }, + { + "epoch": 1.1200196351141292, + "grad_norm": 0.5816841125488281, + "learning_rate": 4.825331272275712e-06, + "loss": 0.0012, + "step": 68450 + }, + { + "epoch": 1.120183261065205, + "grad_norm": 0.1427685171365738, + "learning_rate": 4.82390424517405e-06, + "loss": 0.0016, + "step": 68460 + }, + { + "epoch": 1.1203468870162807, + "grad_norm": 0.01641976833343506, + "learning_rate": 4.822477232434158e-06, + "loss": 0.001, + "step": 68470 + }, + { + "epoch": 1.1205105129673567, + "grad_norm": 0.0024519511498510838, + "learning_rate": 4.82105023417242e-06, + "loss": 0.0014, + "step": 68480 + }, + { + "epoch": 1.1206741389184325, + "grad_norm": 0.1062808409333229, + "learning_rate": 4.819623250505216e-06, + "loss": 0.0021, + "step": 68490 + }, + { + "epoch": 1.1208377648695083, + "grad_norm": 0.06138540431857109, + "learning_rate": 4.818196281548925e-06, + "loss": 0.0014, + "step": 68500 + }, + { + "epoch": 1.121001390820584, + "grad_norm": 0.05452272295951843, + "learning_rate": 4.816769327419928e-06, + "loss": 0.0013, + "step": 68510 + }, + { + "epoch": 1.12116501677166, + "grad_norm": 0.056255191564559937, + "learning_rate": 4.8153423882346005e-06, + "loss": 0.0013, + "step": 68520 + }, + { + "epoch": 1.1213286427227358, + "grad_norm": 0.00764923682436347, + "learning_rate": 4.813915464109321e-06, + "loss": 0.0021, + "step": 68530 + }, + { + "epoch": 1.1214922686738116, + "grad_norm": 0.09291795641183853, + "learning_rate": 4.812488555160461e-06, + "loss": 0.0016, + "step": 68540 + }, + { + "epoch": 1.1216558946248876, + "grad_norm": 0.06420211493968964, + "learning_rate": 4.811061661504398e-06, + "loss": 0.0015, + "step": 68550 + }, + { + "epoch": 1.1218195205759633, + "grad_norm": 0.05445067211985588, + "learning_rate": 4.809634783257502e-06, + "loss": 0.002, + "step": 68560 + }, + { + "epoch": 1.1219831465270391, + "grad_norm": 0.09114896506071091, + "learning_rate": 4.808207920536146e-06, + "loss": 0.0024, + "step": 68570 + }, + { + "epoch": 1.1221467724781151, + "grad_norm": 0.06437831372022629, + "learning_rate": 4.8067810734566996e-06, + "loss": 0.0015, + "step": 68580 + }, + { + "epoch": 1.122310398429191, + "grad_norm": 0.03637402132153511, + "learning_rate": 4.805354242135531e-06, + "loss": 0.0024, + "step": 68590 + }, + { + "epoch": 1.1224740243802667, + "grad_norm": 0.027112627401947975, + "learning_rate": 4.803927426689009e-06, + "loss": 0.0023, + "step": 68600 + }, + { + "epoch": 1.1226376503313427, + "grad_norm": 0.2992970049381256, + "learning_rate": 4.802500627233498e-06, + "loss": 0.0017, + "step": 68610 + }, + { + "epoch": 1.1228012762824184, + "grad_norm": 0.010519352741539478, + "learning_rate": 4.8010738438853635e-06, + "loss": 0.0012, + "step": 68620 + }, + { + "epoch": 1.1229649022334942, + "grad_norm": 0.004247406963258982, + "learning_rate": 4.79964707676097e-06, + "loss": 0.0015, + "step": 68630 + }, + { + "epoch": 1.12312852818457, + "grad_norm": 0.02733306773006916, + "learning_rate": 4.79822032597668e-06, + "loss": 0.0019, + "step": 68640 + }, + { + "epoch": 1.123292154135646, + "grad_norm": 0.06895174086093903, + "learning_rate": 4.796793591648853e-06, + "loss": 0.0019, + "step": 68650 + }, + { + "epoch": 1.1234557800867218, + "grad_norm": 0.04173388332128525, + "learning_rate": 4.79536687389385e-06, + "loss": 0.0015, + "step": 68660 + }, + { + "epoch": 1.1236194060377975, + "grad_norm": 0.032186008989810944, + "learning_rate": 4.793940172828028e-06, + "loss": 0.0012, + "step": 68670 + }, + { + "epoch": 1.1237830319888735, + "grad_norm": 0.07761038094758987, + "learning_rate": 4.792513488567743e-06, + "loss": 0.0009, + "step": 68680 + }, + { + "epoch": 1.1239466579399493, + "grad_norm": 0.14062942564487457, + "learning_rate": 4.791086821229355e-06, + "loss": 0.0017, + "step": 68690 + }, + { + "epoch": 1.124110283891025, + "grad_norm": 0.04103397578001022, + "learning_rate": 4.789660170929213e-06, + "loss": 0.0011, + "step": 68700 + }, + { + "epoch": 1.1242739098421008, + "grad_norm": 0.01348420511931181, + "learning_rate": 4.788233537783672e-06, + "loss": 0.0008, + "step": 68710 + }, + { + "epoch": 1.1244375357931768, + "grad_norm": 0.011913171038031578, + "learning_rate": 4.786806921909084e-06, + "loss": 0.0012, + "step": 68720 + }, + { + "epoch": 1.1246011617442526, + "grad_norm": 0.037176214158535004, + "learning_rate": 4.785380323421797e-06, + "loss": 0.0012, + "step": 68730 + }, + { + "epoch": 1.1247647876953284, + "grad_norm": 0.013256115838885307, + "learning_rate": 4.783953742438161e-06, + "loss": 0.0014, + "step": 68740 + }, + { + "epoch": 1.1249284136464044, + "grad_norm": 0.01954822614789009, + "learning_rate": 4.782527179074523e-06, + "loss": 0.0015, + "step": 68750 + }, + { + "epoch": 1.1250920395974802, + "grad_norm": 0.05509534478187561, + "learning_rate": 4.781100633447228e-06, + "loss": 0.0019, + "step": 68760 + }, + { + "epoch": 1.125255665548556, + "grad_norm": 0.009375140070915222, + "learning_rate": 4.779674105672621e-06, + "loss": 0.0016, + "step": 68770 + }, + { + "epoch": 1.125419291499632, + "grad_norm": 0.06476373225450516, + "learning_rate": 4.7782475958670435e-06, + "loss": 0.0013, + "step": 68780 + }, + { + "epoch": 1.1255829174507077, + "grad_norm": 0.015310265123844147, + "learning_rate": 4.776821104146839e-06, + "loss": 0.0014, + "step": 68790 + }, + { + "epoch": 1.1257465434017835, + "grad_norm": 0.06258490681648254, + "learning_rate": 4.7753946306283446e-06, + "loss": 0.0026, + "step": 68800 + }, + { + "epoch": 1.1259101693528595, + "grad_norm": 0.007618163712322712, + "learning_rate": 4.773968175427901e-06, + "loss": 0.001, + "step": 68810 + }, + { + "epoch": 1.1260737953039353, + "grad_norm": 0.11507266759872437, + "learning_rate": 4.772541738661844e-06, + "loss": 0.0012, + "step": 68820 + }, + { + "epoch": 1.126237421255011, + "grad_norm": 0.021891871467232704, + "learning_rate": 4.771115320446508e-06, + "loss": 0.0015, + "step": 68830 + }, + { + "epoch": 1.1264010472060868, + "grad_norm": 0.041682783514261246, + "learning_rate": 4.7696889208982275e-06, + "loss": 0.0026, + "step": 68840 + }, + { + "epoch": 1.1265646731571628, + "grad_norm": 0.016159404069185257, + "learning_rate": 4.768262540133337e-06, + "loss": 0.001, + "step": 68850 + }, + { + "epoch": 1.1267282991082386, + "grad_norm": 0.07486572116613388, + "learning_rate": 4.766836178268163e-06, + "loss": 0.0016, + "step": 68860 + }, + { + "epoch": 1.1268919250593143, + "grad_norm": 0.025512000545859337, + "learning_rate": 4.765409835419039e-06, + "loss": 0.0014, + "step": 68870 + }, + { + "epoch": 1.1270555510103903, + "grad_norm": 0.1999233365058899, + "learning_rate": 4.76398351170229e-06, + "loss": 0.0018, + "step": 68880 + }, + { + "epoch": 1.1272191769614661, + "grad_norm": 0.07580450177192688, + "learning_rate": 4.762557207234242e-06, + "loss": 0.0009, + "step": 68890 + }, + { + "epoch": 1.127382802912542, + "grad_norm": 0.0494706891477108, + "learning_rate": 4.761130922131221e-06, + "loss": 0.0013, + "step": 68900 + }, + { + "epoch": 1.1275464288636177, + "grad_norm": 0.06407617777585983, + "learning_rate": 4.759704656509549e-06, + "loss": 0.0006, + "step": 68910 + }, + { + "epoch": 1.1277100548146937, + "grad_norm": 0.0746607854962349, + "learning_rate": 4.758278410485547e-06, + "loss": 0.001, + "step": 68920 + }, + { + "epoch": 1.1278736807657694, + "grad_norm": 0.02848961390554905, + "learning_rate": 4.756852184175537e-06, + "loss": 0.0016, + "step": 68930 + }, + { + "epoch": 1.1280373067168452, + "grad_norm": 0.04006713628768921, + "learning_rate": 4.755425977695834e-06, + "loss": 0.0026, + "step": 68940 + }, + { + "epoch": 1.1282009326679212, + "grad_norm": 0.044217657297849655, + "learning_rate": 4.753999791162757e-06, + "loss": 0.0011, + "step": 68950 + }, + { + "epoch": 1.128364558618997, + "grad_norm": 0.0506523996591568, + "learning_rate": 4.75257362469262e-06, + "loss": 0.0023, + "step": 68960 + }, + { + "epoch": 1.1285281845700728, + "grad_norm": 0.05456908047199249, + "learning_rate": 4.7511474784017365e-06, + "loss": 0.0014, + "step": 68970 + }, + { + "epoch": 1.1286918105211488, + "grad_norm": 0.2570742666721344, + "learning_rate": 4.749721352406418e-06, + "loss": 0.0023, + "step": 68980 + }, + { + "epoch": 1.1288554364722245, + "grad_norm": 0.02927926741540432, + "learning_rate": 4.7482952468229745e-06, + "loss": 0.0014, + "step": 68990 + }, + { + "epoch": 1.1290190624233003, + "grad_norm": 0.059277646243572235, + "learning_rate": 4.746869161767714e-06, + "loss": 0.001, + "step": 69000 + }, + { + "epoch": 1.1291826883743763, + "grad_norm": 0.039500642567873, + "learning_rate": 4.745443097356943e-06, + "loss": 0.001, + "step": 69010 + }, + { + "epoch": 1.129346314325452, + "grad_norm": 0.038523104041814804, + "learning_rate": 4.744017053706967e-06, + "loss": 0.0028, + "step": 69020 + }, + { + "epoch": 1.1295099402765278, + "grad_norm": 0.14120125770568848, + "learning_rate": 4.74259103093409e-06, + "loss": 0.0029, + "step": 69030 + }, + { + "epoch": 1.1296735662276036, + "grad_norm": 0.014275015331804752, + "learning_rate": 4.741165029154612e-06, + "loss": 0.0016, + "step": 69040 + }, + { + "epoch": 1.1298371921786796, + "grad_norm": 0.031047916039824486, + "learning_rate": 4.739739048484834e-06, + "loss": 0.0013, + "step": 69050 + }, + { + "epoch": 1.1300008181297554, + "grad_norm": 0.02242126315832138, + "learning_rate": 4.7383130890410535e-06, + "loss": 0.0016, + "step": 69060 + }, + { + "epoch": 1.1301644440808312, + "grad_norm": 0.036595042794942856, + "learning_rate": 4.736887150939568e-06, + "loss": 0.0012, + "step": 69070 + }, + { + "epoch": 1.130328070031907, + "grad_norm": 0.05457611009478569, + "learning_rate": 4.73546123429667e-06, + "loss": 0.0012, + "step": 69080 + }, + { + "epoch": 1.130491695982983, + "grad_norm": 0.02000110037624836, + "learning_rate": 4.734035339228655e-06, + "loss": 0.0015, + "step": 69090 + }, + { + "epoch": 1.1306553219340587, + "grad_norm": 0.046778496354818344, + "learning_rate": 4.732609465851812e-06, + "loss": 0.0017, + "step": 69100 + }, + { + "epoch": 1.1308189478851345, + "grad_norm": 0.051911722868680954, + "learning_rate": 4.731183614282431e-06, + "loss": 0.0013, + "step": 69110 + }, + { + "epoch": 1.1309825738362105, + "grad_norm": 0.07886985689401627, + "learning_rate": 4.7297577846367996e-06, + "loss": 0.0019, + "step": 69120 + }, + { + "epoch": 1.1311461997872863, + "grad_norm": 0.06864001601934433, + "learning_rate": 4.728331977031205e-06, + "loss": 0.0014, + "step": 69130 + }, + { + "epoch": 1.131309825738362, + "grad_norm": 0.016339551657438278, + "learning_rate": 4.726906191581929e-06, + "loss": 0.0015, + "step": 69140 + }, + { + "epoch": 1.131473451689438, + "grad_norm": 0.07824890315532684, + "learning_rate": 4.725480428405255e-06, + "loss": 0.0014, + "step": 69150 + }, + { + "epoch": 1.1316370776405138, + "grad_norm": 0.042329344898462296, + "learning_rate": 4.724054687617464e-06, + "loss": 0.001, + "step": 69160 + }, + { + "epoch": 1.1318007035915896, + "grad_norm": 0.07561182230710983, + "learning_rate": 4.722628969334833e-06, + "loss": 0.0014, + "step": 69170 + }, + { + "epoch": 1.1319643295426656, + "grad_norm": 0.05738131329417229, + "learning_rate": 4.721203273673641e-06, + "loss": 0.0015, + "step": 69180 + }, + { + "epoch": 1.1321279554937413, + "grad_norm": 0.14139309525489807, + "learning_rate": 4.7197776007501605e-06, + "loss": 0.0016, + "step": 69190 + }, + { + "epoch": 1.1322915814448171, + "grad_norm": 0.0735301598906517, + "learning_rate": 4.7183519506806655e-06, + "loss": 0.0008, + "step": 69200 + }, + { + "epoch": 1.132455207395893, + "grad_norm": 0.03328855335712433, + "learning_rate": 4.7169263235814275e-06, + "loss": 0.0021, + "step": 69210 + }, + { + "epoch": 1.132618833346969, + "grad_norm": 0.09768014401197433, + "learning_rate": 4.715500719568715e-06, + "loss": 0.0013, + "step": 69220 + }, + { + "epoch": 1.1327824592980447, + "grad_norm": 0.031685058027505875, + "learning_rate": 4.7140751387587955e-06, + "loss": 0.0011, + "step": 69230 + }, + { + "epoch": 1.1329460852491204, + "grad_norm": 0.057616978883743286, + "learning_rate": 4.712649581267935e-06, + "loss": 0.0014, + "step": 69240 + }, + { + "epoch": 1.1331097112001964, + "grad_norm": 0.05230008810758591, + "learning_rate": 4.711224047212397e-06, + "loss": 0.0015, + "step": 69250 + }, + { + "epoch": 1.1332733371512722, + "grad_norm": 0.07187466323375702, + "learning_rate": 4.709798536708444e-06, + "loss": 0.0012, + "step": 69260 + }, + { + "epoch": 1.133436963102348, + "grad_norm": 0.12507106363773346, + "learning_rate": 4.708373049872334e-06, + "loss": 0.0024, + "step": 69270 + }, + { + "epoch": 1.1336005890534238, + "grad_norm": 0.019464466720819473, + "learning_rate": 4.706947586820327e-06, + "loss": 0.0014, + "step": 69280 + }, + { + "epoch": 1.1337642150044998, + "grad_norm": 0.05887354165315628, + "learning_rate": 4.705522147668677e-06, + "loss": 0.0008, + "step": 69290 + }, + { + "epoch": 1.1339278409555755, + "grad_norm": 0.033978305757045746, + "learning_rate": 4.704096732533638e-06, + "loss": 0.0009, + "step": 69300 + }, + { + "epoch": 1.1340914669066513, + "grad_norm": 0.034490495920181274, + "learning_rate": 4.702671341531464e-06, + "loss": 0.0013, + "step": 69310 + }, + { + "epoch": 1.1342550928577273, + "grad_norm": 0.08320074528455734, + "learning_rate": 4.701245974778403e-06, + "loss": 0.0017, + "step": 69320 + }, + { + "epoch": 1.134418718808803, + "grad_norm": 0.10139095783233643, + "learning_rate": 4.699820632390705e-06, + "loss": 0.0012, + "step": 69330 + }, + { + "epoch": 1.1345823447598788, + "grad_norm": 0.027408484369516373, + "learning_rate": 4.698395314484613e-06, + "loss": 0.001, + "step": 69340 + }, + { + "epoch": 1.1347459707109548, + "grad_norm": 0.02187284454703331, + "learning_rate": 4.696970021176375e-06, + "loss": 0.0013, + "step": 69350 + }, + { + "epoch": 1.1349095966620306, + "grad_norm": 0.13004529476165771, + "learning_rate": 4.69554475258223e-06, + "loss": 0.0016, + "step": 69360 + }, + { + "epoch": 1.1350732226131064, + "grad_norm": 0.04210156202316284, + "learning_rate": 4.694119508818419e-06, + "loss": 0.0012, + "step": 69370 + }, + { + "epoch": 1.1352368485641824, + "grad_norm": 0.07349762320518494, + "learning_rate": 4.692694290001181e-06, + "loss": 0.0014, + "step": 69380 + }, + { + "epoch": 1.1354004745152582, + "grad_norm": 0.03395136445760727, + "learning_rate": 4.69126909624675e-06, + "loss": 0.0022, + "step": 69390 + }, + { + "epoch": 1.135564100466334, + "grad_norm": 0.038457222282886505, + "learning_rate": 4.689843927671362e-06, + "loss": 0.0016, + "step": 69400 + }, + { + "epoch": 1.1357277264174097, + "grad_norm": 0.032105591148138046, + "learning_rate": 4.688418784391247e-06, + "loss": 0.0023, + "step": 69410 + }, + { + "epoch": 1.1358913523684857, + "grad_norm": 0.0332166850566864, + "learning_rate": 4.686993666522637e-06, + "loss": 0.0011, + "step": 69420 + }, + { + "epoch": 1.1360549783195615, + "grad_norm": 0.17723166942596436, + "learning_rate": 4.685568574181758e-06, + "loss": 0.0015, + "step": 69430 + }, + { + "epoch": 1.1362186042706373, + "grad_norm": 0.057184819132089615, + "learning_rate": 4.6841435074848376e-06, + "loss": 0.001, + "step": 69440 + }, + { + "epoch": 1.1363822302217133, + "grad_norm": 0.15394125878810883, + "learning_rate": 4.682718466548096e-06, + "loss": 0.0013, + "step": 69450 + }, + { + "epoch": 1.136545856172789, + "grad_norm": 0.015407206490635872, + "learning_rate": 4.6812934514877585e-06, + "loss": 0.0026, + "step": 69460 + }, + { + "epoch": 1.1367094821238648, + "grad_norm": 0.06906851381063461, + "learning_rate": 4.679868462420042e-06, + "loss": 0.0017, + "step": 69470 + }, + { + "epoch": 1.1368731080749406, + "grad_norm": 0.036438170820474625, + "learning_rate": 4.678443499461164e-06, + "loss": 0.0013, + "step": 69480 + }, + { + "epoch": 1.1370367340260166, + "grad_norm": 0.05990985035896301, + "learning_rate": 4.677018562727341e-06, + "loss": 0.0019, + "step": 69490 + }, + { + "epoch": 1.1372003599770923, + "grad_norm": 0.06944531947374344, + "learning_rate": 4.675593652334786e-06, + "loss": 0.0014, + "step": 69500 + }, + { + "epoch": 1.1373639859281681, + "grad_norm": 0.023260870948433876, + "learning_rate": 4.674168768399708e-06, + "loss": 0.0013, + "step": 69510 + }, + { + "epoch": 1.1375276118792441, + "grad_norm": 0.11983196437358856, + "learning_rate": 4.672743911038316e-06, + "loss": 0.0019, + "step": 69520 + }, + { + "epoch": 1.13769123783032, + "grad_norm": 0.005170944146811962, + "learning_rate": 4.671319080366819e-06, + "loss": 0.0018, + "step": 69530 + }, + { + "epoch": 1.1378548637813957, + "grad_norm": 0.040983088314533234, + "learning_rate": 4.669894276501418e-06, + "loss": 0.0014, + "step": 69540 + }, + { + "epoch": 1.1380184897324717, + "grad_norm": 0.012690916657447815, + "learning_rate": 4.6684694995583165e-06, + "loss": 0.0011, + "step": 69550 + }, + { + "epoch": 1.1381821156835474, + "grad_norm": 0.10029805451631546, + "learning_rate": 4.6670447496537154e-06, + "loss": 0.0009, + "step": 69560 + }, + { + "epoch": 1.1383457416346232, + "grad_norm": 0.031852830201387405, + "learning_rate": 4.6656200269038115e-06, + "loss": 0.0013, + "step": 69570 + }, + { + "epoch": 1.1385093675856992, + "grad_norm": 0.014501173980534077, + "learning_rate": 4.664195331424801e-06, + "loss": 0.0013, + "step": 69580 + }, + { + "epoch": 1.138672993536775, + "grad_norm": 0.047503355890512466, + "learning_rate": 4.662770663332876e-06, + "loss": 0.0019, + "step": 69590 + }, + { + "epoch": 1.1388366194878508, + "grad_norm": 0.11540369689464569, + "learning_rate": 4.661346022744229e-06, + "loss": 0.0006, + "step": 69600 + }, + { + "epoch": 1.1390002454389265, + "grad_norm": 0.015183909796178341, + "learning_rate": 4.659921409775047e-06, + "loss": 0.0011, + "step": 69610 + }, + { + "epoch": 1.1391638713900025, + "grad_norm": 0.028284432366490364, + "learning_rate": 4.658496824541518e-06, + "loss": 0.0013, + "step": 69620 + }, + { + "epoch": 1.1393274973410783, + "grad_norm": 0.03630579262971878, + "learning_rate": 4.657072267159828e-06, + "loss": 0.0014, + "step": 69630 + }, + { + "epoch": 1.139491123292154, + "grad_norm": 0.01199339423328638, + "learning_rate": 4.655647737746155e-06, + "loss": 0.0014, + "step": 69640 + }, + { + "epoch": 1.13965474924323, + "grad_norm": 0.046971119940280914, + "learning_rate": 4.654223236416682e-06, + "loss": 0.0023, + "step": 69650 + }, + { + "epoch": 1.1398183751943058, + "grad_norm": 0.025801360607147217, + "learning_rate": 4.652798763287585e-06, + "loss": 0.0012, + "step": 69660 + }, + { + "epoch": 1.1399820011453816, + "grad_norm": 0.03047754429280758, + "learning_rate": 4.65137431847504e-06, + "loss": 0.0014, + "step": 69670 + }, + { + "epoch": 1.1401456270964574, + "grad_norm": 0.06464429944753647, + "learning_rate": 4.6499499020952185e-06, + "loss": 0.0023, + "step": 69680 + }, + { + "epoch": 1.1403092530475334, + "grad_norm": 0.12241919338703156, + "learning_rate": 4.648525514264293e-06, + "loss": 0.002, + "step": 69690 + }, + { + "epoch": 1.1404728789986092, + "grad_norm": 0.05516223981976509, + "learning_rate": 4.64710115509843e-06, + "loss": 0.0017, + "step": 69700 + }, + { + "epoch": 1.140636504949685, + "grad_norm": 0.08677995204925537, + "learning_rate": 4.645676824713797e-06, + "loss": 0.0014, + "step": 69710 + }, + { + "epoch": 1.140800130900761, + "grad_norm": 0.051214709877967834, + "learning_rate": 4.644252523226556e-06, + "loss": 0.0015, + "step": 69720 + }, + { + "epoch": 1.1409637568518367, + "grad_norm": 0.044149212539196014, + "learning_rate": 4.64282825075287e-06, + "loss": 0.0012, + "step": 69730 + }, + { + "epoch": 1.1411273828029125, + "grad_norm": 0.0165726225823164, + "learning_rate": 4.641404007408895e-06, + "loss": 0.0008, + "step": 69740 + }, + { + "epoch": 1.1412910087539885, + "grad_norm": 0.04800814017653465, + "learning_rate": 4.63997979331079e-06, + "loss": 0.002, + "step": 69750 + }, + { + "epoch": 1.1414546347050643, + "grad_norm": 0.055821534246206284, + "learning_rate": 4.638555608574708e-06, + "loss": 0.0014, + "step": 69760 + }, + { + "epoch": 1.14161826065614, + "grad_norm": 0.1380537748336792, + "learning_rate": 4.6371314533168e-06, + "loss": 0.0015, + "step": 69770 + }, + { + "epoch": 1.141781886607216, + "grad_norm": 0.018859386444091797, + "learning_rate": 4.635707327653218e-06, + "loss": 0.0013, + "step": 69780 + }, + { + "epoch": 1.1419455125582918, + "grad_norm": 0.05863795429468155, + "learning_rate": 4.6342832317001054e-06, + "loss": 0.0015, + "step": 69790 + }, + { + "epoch": 1.1421091385093676, + "grad_norm": 0.05261906236410141, + "learning_rate": 4.632859165573609e-06, + "loss": 0.0009, + "step": 69800 + }, + { + "epoch": 1.1422727644604433, + "grad_norm": 0.024221068248152733, + "learning_rate": 4.631435129389869e-06, + "loss": 0.0016, + "step": 69810 + }, + { + "epoch": 1.1424363904115193, + "grad_norm": 0.14689457416534424, + "learning_rate": 4.630011123265028e-06, + "loss": 0.0016, + "step": 69820 + }, + { + "epoch": 1.1426000163625951, + "grad_norm": 0.11523108184337616, + "learning_rate": 4.628587147315219e-06, + "loss": 0.0013, + "step": 69830 + }, + { + "epoch": 1.142763642313671, + "grad_norm": 0.04516945034265518, + "learning_rate": 4.627163201656579e-06, + "loss": 0.0013, + "step": 69840 + }, + { + "epoch": 1.1429272682647467, + "grad_norm": 0.03280774876475334, + "learning_rate": 4.62573928640524e-06, + "loss": 0.0012, + "step": 69850 + }, + { + "epoch": 1.1430908942158227, + "grad_norm": 0.11012667417526245, + "learning_rate": 4.624315401677331e-06, + "loss": 0.0014, + "step": 69860 + }, + { + "epoch": 1.1432545201668984, + "grad_norm": 0.04875332489609718, + "learning_rate": 4.6228915475889795e-06, + "loss": 0.0021, + "step": 69870 + }, + { + "epoch": 1.1434181461179742, + "grad_norm": 0.08149484544992447, + "learning_rate": 4.621467724256311e-06, + "loss": 0.0012, + "step": 69880 + }, + { + "epoch": 1.1435817720690502, + "grad_norm": 0.04188698157668114, + "learning_rate": 4.620043931795446e-06, + "loss": 0.001, + "step": 69890 + }, + { + "epoch": 1.143745398020126, + "grad_norm": 0.008410933427512646, + "learning_rate": 4.618620170322506e-06, + "loss": 0.0016, + "step": 69900 + }, + { + "epoch": 1.1439090239712018, + "grad_norm": 0.12317752838134766, + "learning_rate": 4.617196439953608e-06, + "loss": 0.0024, + "step": 69910 + }, + { + "epoch": 1.1440726499222778, + "grad_norm": 0.2302798479795456, + "learning_rate": 4.615772740804866e-06, + "loss": 0.0015, + "step": 69920 + }, + { + "epoch": 1.1442362758733535, + "grad_norm": 0.03326624631881714, + "learning_rate": 4.61434907299239e-06, + "loss": 0.0026, + "step": 69930 + }, + { + "epoch": 1.1443999018244293, + "grad_norm": 0.029295239597558975, + "learning_rate": 4.612925436632293e-06, + "loss": 0.0008, + "step": 69940 + }, + { + "epoch": 1.1445635277755053, + "grad_norm": 0.06539978086948395, + "learning_rate": 4.61150183184068e-06, + "loss": 0.0011, + "step": 69950 + }, + { + "epoch": 1.144727153726581, + "grad_norm": 0.038999684154987335, + "learning_rate": 4.6100782587336566e-06, + "loss": 0.0014, + "step": 69960 + }, + { + "epoch": 1.1448907796776568, + "grad_norm": 0.10136260837316513, + "learning_rate": 4.608654717427323e-06, + "loss": 0.0023, + "step": 69970 + }, + { + "epoch": 1.1450544056287328, + "grad_norm": 0.04178288206458092, + "learning_rate": 4.607231208037779e-06, + "loss": 0.0012, + "step": 69980 + }, + { + "epoch": 1.1452180315798086, + "grad_norm": 0.04721640795469284, + "learning_rate": 4.605807730681122e-06, + "loss": 0.0016, + "step": 69990 + }, + { + "epoch": 1.1453816575308844, + "grad_norm": 0.0385037362575531, + "learning_rate": 4.604384285473445e-06, + "loss": 0.0008, + "step": 70000 + }, + { + "epoch": 1.1455452834819602, + "grad_norm": 0.09557905048131943, + "learning_rate": 4.602960872530839e-06, + "loss": 0.0013, + "step": 70010 + }, + { + "epoch": 1.1457089094330362, + "grad_norm": 0.036662496626377106, + "learning_rate": 4.601537491969394e-06, + "loss": 0.0011, + "step": 70020 + }, + { + "epoch": 1.145872535384112, + "grad_norm": 0.03240806236863136, + "learning_rate": 4.600114143905196e-06, + "loss": 0.001, + "step": 70030 + }, + { + "epoch": 1.1460361613351877, + "grad_norm": 0.03840579465031624, + "learning_rate": 4.598690828454327e-06, + "loss": 0.0013, + "step": 70040 + }, + { + "epoch": 1.1461997872862635, + "grad_norm": 0.002582667162641883, + "learning_rate": 4.597267545732869e-06, + "loss": 0.0012, + "step": 70050 + }, + { + "epoch": 1.1463634132373395, + "grad_norm": 0.04660514369606972, + "learning_rate": 4.5958442958569e-06, + "loss": 0.0014, + "step": 70060 + }, + { + "epoch": 1.1465270391884153, + "grad_norm": 0.17103545367717743, + "learning_rate": 4.594421078942496e-06, + "loss": 0.0015, + "step": 70070 + }, + { + "epoch": 1.146690665139491, + "grad_norm": 0.11617986112833023, + "learning_rate": 4.592997895105728e-06, + "loss": 0.0032, + "step": 70080 + }, + { + "epoch": 1.146854291090567, + "grad_norm": 0.017251331359148026, + "learning_rate": 4.591574744462666e-06, + "loss": 0.0023, + "step": 70090 + }, + { + "epoch": 1.1470179170416428, + "grad_norm": 0.3622962236404419, + "learning_rate": 4.59015162712938e-06, + "loss": 0.0015, + "step": 70100 + }, + { + "epoch": 1.1471815429927186, + "grad_norm": 0.01872352510690689, + "learning_rate": 4.588728543221932e-06, + "loss": 0.0016, + "step": 70110 + }, + { + "epoch": 1.1473451689437946, + "grad_norm": 0.09341420233249664, + "learning_rate": 4.587305492856385e-06, + "loss": 0.0018, + "step": 70120 + }, + { + "epoch": 1.1475087948948703, + "grad_norm": 0.05101931467652321, + "learning_rate": 4.585882476148797e-06, + "loss": 0.0009, + "step": 70130 + }, + { + "epoch": 1.1476724208459461, + "grad_norm": 0.15150775015354156, + "learning_rate": 4.584459493215228e-06, + "loss": 0.0019, + "step": 70140 + }, + { + "epoch": 1.1478360467970221, + "grad_norm": 0.06719735264778137, + "learning_rate": 4.583036544171726e-06, + "loss": 0.001, + "step": 70150 + }, + { + "epoch": 1.1479996727480979, + "grad_norm": 0.042329829186201096, + "learning_rate": 4.581613629134346e-06, + "loss": 0.0015, + "step": 70160 + }, + { + "epoch": 1.1481632986991737, + "grad_norm": 0.05940350517630577, + "learning_rate": 4.580190748219135e-06, + "loss": 0.001, + "step": 70170 + }, + { + "epoch": 1.1483269246502494, + "grad_norm": 0.04276195168495178, + "learning_rate": 4.578767901542138e-06, + "loss": 0.0013, + "step": 70180 + }, + { + "epoch": 1.1484905506013254, + "grad_norm": 0.07646219432353973, + "learning_rate": 4.577345089219397e-06, + "loss": 0.0015, + "step": 70190 + }, + { + "epoch": 1.1486541765524012, + "grad_norm": 0.003967093303799629, + "learning_rate": 4.575922311366954e-06, + "loss": 0.0025, + "step": 70200 + }, + { + "epoch": 1.148817802503477, + "grad_norm": 0.12254655361175537, + "learning_rate": 4.574499568100843e-06, + "loss": 0.0017, + "step": 70210 + }, + { + "epoch": 1.148981428454553, + "grad_norm": 0.05390444025397301, + "learning_rate": 4.5730768595371005e-06, + "loss": 0.0031, + "step": 70220 + }, + { + "epoch": 1.1491450544056288, + "grad_norm": 0.01958387717604637, + "learning_rate": 4.571654185791757e-06, + "loss": 0.0016, + "step": 70230 + }, + { + "epoch": 1.1493086803567045, + "grad_norm": 0.014004549011588097, + "learning_rate": 4.57023154698084e-06, + "loss": 0.0016, + "step": 70240 + }, + { + "epoch": 1.1494723063077803, + "grad_norm": 0.2683558762073517, + "learning_rate": 4.568808943220376e-06, + "loss": 0.0017, + "step": 70250 + }, + { + "epoch": 1.1496359322588563, + "grad_norm": 0.0585932619869709, + "learning_rate": 4.567386374626388e-06, + "loss": 0.0018, + "step": 70260 + }, + { + "epoch": 1.149799558209932, + "grad_norm": 0.038936033844947815, + "learning_rate": 4.565963841314895e-06, + "loss": 0.001, + "step": 70270 + }, + { + "epoch": 1.1499631841610078, + "grad_norm": 0.04698233678936958, + "learning_rate": 4.564541343401914e-06, + "loss": 0.0014, + "step": 70280 + }, + { + "epoch": 1.1501268101120838, + "grad_norm": 0.059814032167196274, + "learning_rate": 4.563118881003461e-06, + "loss": 0.002, + "step": 70290 + }, + { + "epoch": 1.1502904360631596, + "grad_norm": 0.04966720938682556, + "learning_rate": 4.561696454235544e-06, + "loss": 0.002, + "step": 70300 + }, + { + "epoch": 1.1504540620142354, + "grad_norm": 0.061718229204416275, + "learning_rate": 4.560274063214174e-06, + "loss": 0.0012, + "step": 70310 + }, + { + "epoch": 1.1506176879653114, + "grad_norm": 0.010380145162343979, + "learning_rate": 4.558851708055355e-06, + "loss": 0.0011, + "step": 70320 + }, + { + "epoch": 1.1507813139163872, + "grad_norm": 0.08529224991798401, + "learning_rate": 4.557429388875089e-06, + "loss": 0.0022, + "step": 70330 + }, + { + "epoch": 1.150944939867463, + "grad_norm": 0.025229481980204582, + "learning_rate": 4.556007105789377e-06, + "loss": 0.0018, + "step": 70340 + }, + { + "epoch": 1.151108565818539, + "grad_norm": 0.03809288144111633, + "learning_rate": 4.554584858914215e-06, + "loss": 0.0021, + "step": 70350 + }, + { + "epoch": 1.1512721917696147, + "grad_norm": 0.03015618771314621, + "learning_rate": 4.553162648365596e-06, + "loss": 0.0015, + "step": 70360 + }, + { + "epoch": 1.1514358177206905, + "grad_norm": 0.13260255753993988, + "learning_rate": 4.5517404742595115e-06, + "loss": 0.0013, + "step": 70370 + }, + { + "epoch": 1.1515994436717663, + "grad_norm": 0.06396839767694473, + "learning_rate": 4.550318336711949e-06, + "loss": 0.0013, + "step": 70380 + }, + { + "epoch": 1.1517630696228423, + "grad_norm": 0.06370850652456284, + "learning_rate": 4.548896235838893e-06, + "loss": 0.0006, + "step": 70390 + }, + { + "epoch": 1.151926695573918, + "grad_norm": 0.0769275575876236, + "learning_rate": 4.547474171756324e-06, + "loss": 0.0014, + "step": 70400 + }, + { + "epoch": 1.1520903215249938, + "grad_norm": 0.03137046843767166, + "learning_rate": 4.546052144580224e-06, + "loss": 0.0019, + "step": 70410 + }, + { + "epoch": 1.1522539474760698, + "grad_norm": 0.10049285739660263, + "learning_rate": 4.5446301544265645e-06, + "loss": 0.0009, + "step": 70420 + }, + { + "epoch": 1.1524175734271456, + "grad_norm": 0.06732705235481262, + "learning_rate": 4.543208201411321e-06, + "loss": 0.001, + "step": 70430 + }, + { + "epoch": 1.1525811993782213, + "grad_norm": 0.06824111938476562, + "learning_rate": 4.541786285650463e-06, + "loss": 0.0018, + "step": 70440 + }, + { + "epoch": 1.1527448253292971, + "grad_norm": 0.02567416988313198, + "learning_rate": 4.540364407259957e-06, + "loss": 0.0019, + "step": 70450 + }, + { + "epoch": 1.1529084512803731, + "grad_norm": 0.11214583367109299, + "learning_rate": 4.538942566355765e-06, + "loss": 0.0014, + "step": 70460 + }, + { + "epoch": 1.1530720772314489, + "grad_norm": 0.034722182899713516, + "learning_rate": 4.53752076305385e-06, + "loss": 0.0013, + "step": 70470 + }, + { + "epoch": 1.1532357031825247, + "grad_norm": 1.3014166355133057, + "learning_rate": 4.536098997470168e-06, + "loss": 0.0009, + "step": 70480 + }, + { + "epoch": 1.1533993291336007, + "grad_norm": 0.1458534449338913, + "learning_rate": 4.534677269720672e-06, + "loss": 0.001, + "step": 70490 + }, + { + "epoch": 1.1535629550846764, + "grad_norm": 0.05178714916110039, + "learning_rate": 4.533255579921315e-06, + "loss": 0.0016, + "step": 70500 + }, + { + "epoch": 1.1537265810357522, + "grad_norm": 0.0254295002669096, + "learning_rate": 4.531833928188046e-06, + "loss": 0.0011, + "step": 70510 + }, + { + "epoch": 1.1538902069868282, + "grad_norm": 0.05432586371898651, + "learning_rate": 4.530412314636808e-06, + "loss": 0.0012, + "step": 70520 + }, + { + "epoch": 1.154053832937904, + "grad_norm": 0.04637228697538376, + "learning_rate": 4.528990739383544e-06, + "loss": 0.0011, + "step": 70530 + }, + { + "epoch": 1.1542174588889798, + "grad_norm": 0.03168310970067978, + "learning_rate": 4.527569202544193e-06, + "loss": 0.0015, + "step": 70540 + }, + { + "epoch": 1.1543810848400557, + "grad_norm": 0.060829151421785355, + "learning_rate": 4.526147704234691e-06, + "loss": 0.0013, + "step": 70550 + }, + { + "epoch": 1.1545447107911315, + "grad_norm": 0.06875636428594589, + "learning_rate": 4.524726244570969e-06, + "loss": 0.001, + "step": 70560 + }, + { + "epoch": 1.1547083367422073, + "grad_norm": 0.05969448760151863, + "learning_rate": 4.5233048236689584e-06, + "loss": 0.0011, + "step": 70570 + }, + { + "epoch": 1.154871962693283, + "grad_norm": 0.005917946342378855, + "learning_rate": 4.521883441644583e-06, + "loss": 0.0015, + "step": 70580 + }, + { + "epoch": 1.155035588644359, + "grad_norm": 0.028081731870770454, + "learning_rate": 4.520462098613769e-06, + "loss": 0.0013, + "step": 70590 + }, + { + "epoch": 1.1551992145954348, + "grad_norm": 0.039777081459760666, + "learning_rate": 4.519040794692434e-06, + "loss": 0.0018, + "step": 70600 + }, + { + "epoch": 1.1553628405465106, + "grad_norm": 0.36060217022895813, + "learning_rate": 4.517619529996496e-06, + "loss": 0.0022, + "step": 70610 + }, + { + "epoch": 1.1555264664975866, + "grad_norm": 0.1988147497177124, + "learning_rate": 4.516198304641867e-06, + "loss": 0.0019, + "step": 70620 + }, + { + "epoch": 1.1556900924486624, + "grad_norm": 0.011198713444173336, + "learning_rate": 4.5147771187444595e-06, + "loss": 0.0018, + "step": 70630 + }, + { + "epoch": 1.1558537183997382, + "grad_norm": 0.04681457206606865, + "learning_rate": 4.513355972420178e-06, + "loss": 0.0007, + "step": 70640 + }, + { + "epoch": 1.156017344350814, + "grad_norm": 0.01267332024872303, + "learning_rate": 4.511934865784929e-06, + "loss": 0.0017, + "step": 70650 + }, + { + "epoch": 1.15618097030189, + "grad_norm": 0.08585552126169205, + "learning_rate": 4.510513798954611e-06, + "loss": 0.0019, + "step": 70660 + }, + { + "epoch": 1.1563445962529657, + "grad_norm": 0.03173115476965904, + "learning_rate": 4.5090927720451225e-06, + "loss": 0.0007, + "step": 70670 + }, + { + "epoch": 1.1565082222040415, + "grad_norm": 0.0903201550245285, + "learning_rate": 4.5076717851723565e-06, + "loss": 0.0013, + "step": 70680 + }, + { + "epoch": 1.1566718481551175, + "grad_norm": 0.0177631638944149, + "learning_rate": 4.506250838452206e-06, + "loss": 0.0014, + "step": 70690 + }, + { + "epoch": 1.1568354741061933, + "grad_norm": 0.2032691240310669, + "learning_rate": 4.504829932000556e-06, + "loss": 0.0023, + "step": 70700 + }, + { + "epoch": 1.156999100057269, + "grad_norm": 0.07930630445480347, + "learning_rate": 4.503409065933292e-06, + "loss": 0.0031, + "step": 70710 + }, + { + "epoch": 1.157162726008345, + "grad_norm": 0.008853563107550144, + "learning_rate": 4.501988240366296e-06, + "loss": 0.0009, + "step": 70720 + }, + { + "epoch": 1.1573263519594208, + "grad_norm": 0.20645996928215027, + "learning_rate": 4.500567455415444e-06, + "loss": 0.0017, + "step": 70730 + }, + { + "epoch": 1.1574899779104966, + "grad_norm": 0.021436849609017372, + "learning_rate": 4.499146711196611e-06, + "loss": 0.0022, + "step": 70740 + }, + { + "epoch": 1.1576536038615726, + "grad_norm": 0.06520280987024307, + "learning_rate": 4.497726007825669e-06, + "loss": 0.0013, + "step": 70750 + }, + { + "epoch": 1.1578172298126483, + "grad_norm": 0.04408663138747215, + "learning_rate": 4.496305345418485e-06, + "loss": 0.0035, + "step": 70760 + }, + { + "epoch": 1.1579808557637241, + "grad_norm": 0.05710349977016449, + "learning_rate": 4.494884724090922e-06, + "loss": 0.0013, + "step": 70770 + }, + { + "epoch": 1.1581444817148, + "grad_norm": 0.13150498270988464, + "learning_rate": 4.493464143958843e-06, + "loss": 0.0017, + "step": 70780 + }, + { + "epoch": 1.1583081076658759, + "grad_norm": 0.07247708737850189, + "learning_rate": 4.492043605138108e-06, + "loss": 0.0021, + "step": 70790 + }, + { + "epoch": 1.1584717336169517, + "grad_norm": 0.11595527082681656, + "learning_rate": 4.490623107744566e-06, + "loss": 0.0016, + "step": 70800 + }, + { + "epoch": 1.1586353595680274, + "grad_norm": 0.07763620465993881, + "learning_rate": 4.489202651894069e-06, + "loss": 0.0016, + "step": 70810 + }, + { + "epoch": 1.1587989855191032, + "grad_norm": 0.06980343163013458, + "learning_rate": 4.487782237702467e-06, + "loss": 0.0008, + "step": 70820 + }, + { + "epoch": 1.1589626114701792, + "grad_norm": 0.07394812256097794, + "learning_rate": 4.4863618652856026e-06, + "loss": 0.0009, + "step": 70830 + }, + { + "epoch": 1.159126237421255, + "grad_norm": 0.06336156278848648, + "learning_rate": 4.484941534759317e-06, + "loss": 0.0009, + "step": 70840 + }, + { + "epoch": 1.1592898633723308, + "grad_norm": 0.06244694069027901, + "learning_rate": 4.4835212462394475e-06, + "loss": 0.0015, + "step": 70850 + }, + { + "epoch": 1.1594534893234067, + "grad_norm": 0.045786213129758835, + "learning_rate": 4.482100999841828e-06, + "loss": 0.0008, + "step": 70860 + }, + { + "epoch": 1.1596171152744825, + "grad_norm": 0.07851941883563995, + "learning_rate": 4.4806807956822885e-06, + "loss": 0.0018, + "step": 70870 + }, + { + "epoch": 1.1597807412255583, + "grad_norm": 0.09095162153244019, + "learning_rate": 4.4792606338766565e-06, + "loss": 0.0014, + "step": 70880 + }, + { + "epoch": 1.1599443671766343, + "grad_norm": 0.005531649570912123, + "learning_rate": 4.477840514540756e-06, + "loss": 0.0014, + "step": 70890 + }, + { + "epoch": 1.16010799312771, + "grad_norm": 0.03555836156010628, + "learning_rate": 4.476420437790407e-06, + "loss": 0.0019, + "step": 70900 + }, + { + "epoch": 1.1602716190787858, + "grad_norm": 0.014247196726500988, + "learning_rate": 4.475000403741424e-06, + "loss": 0.0009, + "step": 70910 + }, + { + "epoch": 1.1604352450298618, + "grad_norm": 0.03097921423614025, + "learning_rate": 4.473580412509623e-06, + "loss": 0.0014, + "step": 70920 + }, + { + "epoch": 1.1605988709809376, + "grad_norm": 0.019966501742601395, + "learning_rate": 4.472160464210814e-06, + "loss": 0.0025, + "step": 70930 + }, + { + "epoch": 1.1607624969320134, + "grad_norm": 0.0065721082501113415, + "learning_rate": 4.470740558960799e-06, + "loss": 0.0019, + "step": 70940 + }, + { + "epoch": 1.1609261228830892, + "grad_norm": 0.03560645878314972, + "learning_rate": 4.469320696875385e-06, + "loss": 0.0013, + "step": 70950 + }, + { + "epoch": 1.1610897488341652, + "grad_norm": 0.027652781456708908, + "learning_rate": 4.467900878070369e-06, + "loss": 0.0012, + "step": 70960 + }, + { + "epoch": 1.161253374785241, + "grad_norm": 0.09856000542640686, + "learning_rate": 4.466481102661546e-06, + "loss": 0.0015, + "step": 70970 + }, + { + "epoch": 1.1614170007363167, + "grad_norm": 0.03611590340733528, + "learning_rate": 4.465061370764711e-06, + "loss": 0.0012, + "step": 70980 + }, + { + "epoch": 1.1615806266873927, + "grad_norm": 0.009698276408016682, + "learning_rate": 4.463641682495648e-06, + "loss": 0.0009, + "step": 70990 + }, + { + "epoch": 1.1617442526384685, + "grad_norm": 0.07362033426761627, + "learning_rate": 4.462222037970147e-06, + "loss": 0.0012, + "step": 71000 + }, + { + "epoch": 1.1619078785895443, + "grad_norm": 0.02714143507182598, + "learning_rate": 4.460802437303986e-06, + "loss": 0.0011, + "step": 71010 + }, + { + "epoch": 1.16207150454062, + "grad_norm": 0.020419219508767128, + "learning_rate": 4.459382880612943e-06, + "loss": 0.0013, + "step": 71020 + }, + { + "epoch": 1.162235130491696, + "grad_norm": 0.04754084348678589, + "learning_rate": 4.457963368012794e-06, + "loss": 0.0012, + "step": 71030 + }, + { + "epoch": 1.1623987564427718, + "grad_norm": 0.06855122745037079, + "learning_rate": 4.456543899619308e-06, + "loss": 0.0009, + "step": 71040 + }, + { + "epoch": 1.1625623823938476, + "grad_norm": 0.037121742963790894, + "learning_rate": 4.455124475548253e-06, + "loss": 0.0011, + "step": 71050 + }, + { + "epoch": 1.1627260083449236, + "grad_norm": 0.0170463677495718, + "learning_rate": 4.453705095915391e-06, + "loss": 0.0007, + "step": 71060 + }, + { + "epoch": 1.1628896342959993, + "grad_norm": 0.10404789447784424, + "learning_rate": 4.452285760836484e-06, + "loss": 0.0013, + "step": 71070 + }, + { + "epoch": 1.1630532602470751, + "grad_norm": 0.027800675481557846, + "learning_rate": 4.4508664704272855e-06, + "loss": 0.0015, + "step": 71080 + }, + { + "epoch": 1.1632168861981511, + "grad_norm": 0.05662749707698822, + "learning_rate": 4.44944722480355e-06, + "loss": 0.0015, + "step": 71090 + }, + { + "epoch": 1.1633805121492269, + "grad_norm": 0.07210488617420197, + "learning_rate": 4.448028024081026e-06, + "loss": 0.0015, + "step": 71100 + }, + { + "epoch": 1.1635441381003027, + "grad_norm": 0.04680858179926872, + "learning_rate": 4.446608868375458e-06, + "loss": 0.0017, + "step": 71110 + }, + { + "epoch": 1.1637077640513787, + "grad_norm": 0.006319742649793625, + "learning_rate": 4.4451897578025885e-06, + "loss": 0.0027, + "step": 71120 + }, + { + "epoch": 1.1638713900024544, + "grad_norm": 0.03585018590092659, + "learning_rate": 4.443770692478154e-06, + "loss": 0.0022, + "step": 71130 + }, + { + "epoch": 1.1640350159535302, + "grad_norm": 0.06705167889595032, + "learning_rate": 4.4423516725178895e-06, + "loss": 0.0011, + "step": 71140 + }, + { + "epoch": 1.164198641904606, + "grad_norm": 0.043585583567619324, + "learning_rate": 4.440932698037525e-06, + "loss": 0.0012, + "step": 71150 + }, + { + "epoch": 1.164362267855682, + "grad_norm": 0.042774494737386703, + "learning_rate": 4.439513769152788e-06, + "loss": 0.0025, + "step": 71160 + }, + { + "epoch": 1.1645258938067578, + "grad_norm": 0.08851239085197449, + "learning_rate": 4.438094885979401e-06, + "loss": 0.0009, + "step": 71170 + }, + { + "epoch": 1.1646895197578335, + "grad_norm": 0.018701424822211266, + "learning_rate": 4.436676048633083e-06, + "loss": 0.001, + "step": 71180 + }, + { + "epoch": 1.1648531457089095, + "grad_norm": 0.04496646672487259, + "learning_rate": 4.43525725722955e-06, + "loss": 0.0016, + "step": 71190 + }, + { + "epoch": 1.1650167716599853, + "grad_norm": 0.044203028082847595, + "learning_rate": 4.433838511884514e-06, + "loss": 0.0018, + "step": 71200 + }, + { + "epoch": 1.165180397611061, + "grad_norm": 0.2142196148633957, + "learning_rate": 4.432419812713683e-06, + "loss": 0.0014, + "step": 71210 + }, + { + "epoch": 1.1653440235621368, + "grad_norm": 0.02357359044253826, + "learning_rate": 4.4310011598327605e-06, + "loss": 0.0007, + "step": 71220 + }, + { + "epoch": 1.1655076495132128, + "grad_norm": 0.06358008831739426, + "learning_rate": 4.4295825533574475e-06, + "loss": 0.0025, + "step": 71230 + }, + { + "epoch": 1.1656712754642886, + "grad_norm": 0.009030045010149479, + "learning_rate": 4.428163993403441e-06, + "loss": 0.0007, + "step": 71240 + }, + { + "epoch": 1.1658349014153644, + "grad_norm": 0.12800511717796326, + "learning_rate": 4.426745480086435e-06, + "loss": 0.0011, + "step": 71250 + }, + { + "epoch": 1.1659985273664404, + "grad_norm": 0.018625780940055847, + "learning_rate": 4.425327013522116e-06, + "loss": 0.0016, + "step": 71260 + }, + { + "epoch": 1.1661621533175162, + "grad_norm": 0.04585704952478409, + "learning_rate": 4.423908593826169e-06, + "loss": 0.002, + "step": 71270 + }, + { + "epoch": 1.166325779268592, + "grad_norm": 0.07561267167329788, + "learning_rate": 4.422490221114279e-06, + "loss": 0.0005, + "step": 71280 + }, + { + "epoch": 1.166489405219668, + "grad_norm": 0.04992244020104408, + "learning_rate": 4.421071895502121e-06, + "loss": 0.001, + "step": 71290 + }, + { + "epoch": 1.1666530311707437, + "grad_norm": 0.0051212003454566, + "learning_rate": 4.419653617105369e-06, + "loss": 0.0006, + "step": 71300 + }, + { + "epoch": 1.1668166571218195, + "grad_norm": 0.06859225779771805, + "learning_rate": 4.418235386039695e-06, + "loss": 0.0017, + "step": 71310 + }, + { + "epoch": 1.1669802830728955, + "grad_norm": 0.07315724343061447, + "learning_rate": 4.416817202420762e-06, + "loss": 0.0009, + "step": 71320 + }, + { + "epoch": 1.1671439090239712, + "grad_norm": 0.027722079306840897, + "learning_rate": 4.415399066364235e-06, + "loss": 0.0014, + "step": 71330 + }, + { + "epoch": 1.167307534975047, + "grad_norm": 0.08444797247648239, + "learning_rate": 4.4139809779857705e-06, + "loss": 0.0012, + "step": 71340 + }, + { + "epoch": 1.1674711609261228, + "grad_norm": 0.08554187417030334, + "learning_rate": 4.412562937401024e-06, + "loss": 0.0008, + "step": 71350 + }, + { + "epoch": 1.1676347868771988, + "grad_norm": 0.04930735379457474, + "learning_rate": 4.411144944725645e-06, + "loss": 0.002, + "step": 71360 + }, + { + "epoch": 1.1677984128282746, + "grad_norm": 0.04503967985510826, + "learning_rate": 4.409727000075281e-06, + "loss": 0.0013, + "step": 71370 + }, + { + "epoch": 1.1679620387793503, + "grad_norm": 0.05321275442838669, + "learning_rate": 4.408309103565575e-06, + "loss": 0.0011, + "step": 71380 + }, + { + "epoch": 1.1681256647304263, + "grad_norm": 0.007587939966470003, + "learning_rate": 4.406891255312166e-06, + "loss": 0.0005, + "step": 71390 + }, + { + "epoch": 1.1682892906815021, + "grad_norm": 0.013766760006546974, + "learning_rate": 4.405473455430687e-06, + "loss": 0.0009, + "step": 71400 + }, + { + "epoch": 1.1684529166325779, + "grad_norm": 0.07259071618318558, + "learning_rate": 4.404055704036771e-06, + "loss": 0.0014, + "step": 71410 + }, + { + "epoch": 1.1686165425836537, + "grad_norm": 0.061089012771844864, + "learning_rate": 4.402638001246044e-06, + "loss": 0.0022, + "step": 71420 + }, + { + "epoch": 1.1687801685347297, + "grad_norm": 0.05355614051222801, + "learning_rate": 4.401220347174129e-06, + "loss": 0.0012, + "step": 71430 + }, + { + "epoch": 1.1689437944858054, + "grad_norm": 0.07587563991546631, + "learning_rate": 4.399802741936646e-06, + "loss": 0.0022, + "step": 71440 + }, + { + "epoch": 1.1691074204368812, + "grad_norm": 0.010608148761093616, + "learning_rate": 4.3983851856492084e-06, + "loss": 0.0007, + "step": 71450 + }, + { + "epoch": 1.1692710463879572, + "grad_norm": 0.07015317678451538, + "learning_rate": 4.396967678427428e-06, + "loss": 0.0018, + "step": 71460 + }, + { + "epoch": 1.169434672339033, + "grad_norm": 0.050035253167152405, + "learning_rate": 4.395550220386913e-06, + "loss": 0.0022, + "step": 71470 + }, + { + "epoch": 1.1695982982901088, + "grad_norm": 0.04335442930459976, + "learning_rate": 4.394132811643266e-06, + "loss": 0.0008, + "step": 71480 + }, + { + "epoch": 1.1697619242411847, + "grad_norm": 0.05794442817568779, + "learning_rate": 4.392715452312084e-06, + "loss": 0.0015, + "step": 71490 + }, + { + "epoch": 1.1699255501922605, + "grad_norm": 0.05498574301600456, + "learning_rate": 4.391298142508964e-06, + "loss": 0.0014, + "step": 71500 + }, + { + "epoch": 1.1700891761433363, + "grad_norm": 0.0739671140909195, + "learning_rate": 4.389880882349497e-06, + "loss": 0.0015, + "step": 71510 + }, + { + "epoch": 1.1702528020944123, + "grad_norm": 0.12752996385097504, + "learning_rate": 4.3884636719492694e-06, + "loss": 0.0045, + "step": 71520 + }, + { + "epoch": 1.170416428045488, + "grad_norm": 0.11580020189285278, + "learning_rate": 4.387046511423863e-06, + "loss": 0.0012, + "step": 71530 + }, + { + "epoch": 1.1705800539965638, + "grad_norm": 0.09024364501237869, + "learning_rate": 4.385629400888859e-06, + "loss": 0.0022, + "step": 71540 + }, + { + "epoch": 1.1707436799476396, + "grad_norm": 0.09970027208328247, + "learning_rate": 4.384212340459831e-06, + "loss": 0.0012, + "step": 71550 + }, + { + "epoch": 1.1709073058987156, + "grad_norm": 0.07124566286802292, + "learning_rate": 4.3827953302523485e-06, + "loss": 0.0013, + "step": 71560 + }, + { + "epoch": 1.1710709318497914, + "grad_norm": 0.05069902911782265, + "learning_rate": 4.38137837038198e-06, + "loss": 0.0013, + "step": 71570 + }, + { + "epoch": 1.1712345578008672, + "grad_norm": 0.037889789789915085, + "learning_rate": 4.379961460964287e-06, + "loss": 0.0018, + "step": 71580 + }, + { + "epoch": 1.171398183751943, + "grad_norm": 0.023270778357982635, + "learning_rate": 4.378544602114826e-06, + "loss": 0.0023, + "step": 71590 + }, + { + "epoch": 1.171561809703019, + "grad_norm": 0.05484585836529732, + "learning_rate": 4.377127793949154e-06, + "loss": 0.0009, + "step": 71600 + }, + { + "epoch": 1.1717254356540947, + "grad_norm": 0.05102138593792915, + "learning_rate": 4.375711036582819e-06, + "loss": 0.0022, + "step": 71610 + }, + { + "epoch": 1.1718890616051705, + "grad_norm": 0.04460024833679199, + "learning_rate": 4.374294330131369e-06, + "loss": 0.0012, + "step": 71620 + }, + { + "epoch": 1.1720526875562465, + "grad_norm": 0.15503236651420593, + "learning_rate": 4.372877674710344e-06, + "loss": 0.0017, + "step": 71630 + }, + { + "epoch": 1.1722163135073222, + "grad_norm": 0.10285632312297821, + "learning_rate": 4.371461070435283e-06, + "loss": 0.0037, + "step": 71640 + }, + { + "epoch": 1.172379939458398, + "grad_norm": 0.04496202617883682, + "learning_rate": 4.3700445174217175e-06, + "loss": 0.0014, + "step": 71650 + }, + { + "epoch": 1.172543565409474, + "grad_norm": 0.015057187527418137, + "learning_rate": 4.368628015785178e-06, + "loss": 0.0009, + "step": 71660 + }, + { + "epoch": 1.1727071913605498, + "grad_norm": 0.08583702892065048, + "learning_rate": 4.367211565641189e-06, + "loss": 0.0012, + "step": 71670 + }, + { + "epoch": 1.1728708173116256, + "grad_norm": 0.15659911930561066, + "learning_rate": 4.365795167105273e-06, + "loss": 0.001, + "step": 71680 + }, + { + "epoch": 1.1730344432627016, + "grad_norm": 0.011225015856325626, + "learning_rate": 4.3643788202929446e-06, + "loss": 0.0013, + "step": 71690 + }, + { + "epoch": 1.1731980692137773, + "grad_norm": 0.03144923597574234, + "learning_rate": 4.3629625253197176e-06, + "loss": 0.0017, + "step": 71700 + }, + { + "epoch": 1.1733616951648531, + "grad_norm": 0.14824244379997253, + "learning_rate": 4.361546282301099e-06, + "loss": 0.0019, + "step": 71710 + }, + { + "epoch": 1.173525321115929, + "grad_norm": 0.02394001930952072, + "learning_rate": 4.360130091352594e-06, + "loss": 0.0013, + "step": 71720 + }, + { + "epoch": 1.1736889470670049, + "grad_norm": 0.03789530694484711, + "learning_rate": 4.358713952589702e-06, + "loss": 0.0008, + "step": 71730 + }, + { + "epoch": 1.1738525730180807, + "grad_norm": 0.04036388546228409, + "learning_rate": 4.357297866127917e-06, + "loss": 0.0017, + "step": 71740 + }, + { + "epoch": 1.1740161989691564, + "grad_norm": 0.0778418779373169, + "learning_rate": 4.3558818320827325e-06, + "loss": 0.0012, + "step": 71750 + }, + { + "epoch": 1.1741798249202324, + "grad_norm": 0.056500453501939774, + "learning_rate": 4.354465850569634e-06, + "loss": 0.0013, + "step": 71760 + }, + { + "epoch": 1.1743434508713082, + "grad_norm": 0.054508715867996216, + "learning_rate": 4.3530499217041036e-06, + "loss": 0.002, + "step": 71770 + }, + { + "epoch": 1.174507076822384, + "grad_norm": 0.10001111775636673, + "learning_rate": 4.351634045601621e-06, + "loss": 0.0011, + "step": 71780 + }, + { + "epoch": 1.1746707027734598, + "grad_norm": 0.040598511695861816, + "learning_rate": 4.35021822237766e-06, + "loss": 0.0019, + "step": 71790 + }, + { + "epoch": 1.1748343287245357, + "grad_norm": 0.06470203399658203, + "learning_rate": 4.348802452147689e-06, + "loss": 0.0017, + "step": 71800 + }, + { + "epoch": 1.1749979546756115, + "grad_norm": 0.07201012223958969, + "learning_rate": 4.347386735027176e-06, + "loss": 0.0023, + "step": 71810 + }, + { + "epoch": 1.1751615806266873, + "grad_norm": 0.03699260577559471, + "learning_rate": 4.34597107113158e-06, + "loss": 0.001, + "step": 71820 + }, + { + "epoch": 1.1753252065777633, + "grad_norm": 0.002159158466383815, + "learning_rate": 4.344555460576358e-06, + "loss": 0.0029, + "step": 71830 + }, + { + "epoch": 1.175488832528839, + "grad_norm": 0.05194035544991493, + "learning_rate": 4.343139903476963e-06, + "loss": 0.0018, + "step": 71840 + }, + { + "epoch": 1.1756524584799148, + "grad_norm": 0.08302514255046844, + "learning_rate": 4.341724399948842e-06, + "loss": 0.0014, + "step": 71850 + }, + { + "epoch": 1.1758160844309908, + "grad_norm": 0.005879908334463835, + "learning_rate": 4.34030895010744e-06, + "loss": 0.0014, + "step": 71860 + }, + { + "epoch": 1.1759797103820666, + "grad_norm": 0.03511945903301239, + "learning_rate": 4.338893554068195e-06, + "loss": 0.0021, + "step": 71870 + }, + { + "epoch": 1.1761433363331424, + "grad_norm": 0.12251469492912292, + "learning_rate": 4.337478211946543e-06, + "loss": 0.0024, + "step": 71880 + }, + { + "epoch": 1.1763069622842184, + "grad_norm": 0.1464211642742157, + "learning_rate": 4.336062923857914e-06, + "loss": 0.0024, + "step": 71890 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.05445133149623871, + "learning_rate": 4.334647689917734e-06, + "loss": 0.0023, + "step": 71900 + }, + { + "epoch": 1.17663421418637, + "grad_norm": 0.06458507478237152, + "learning_rate": 4.333232510241424e-06, + "loss": 0.0014, + "step": 71910 + }, + { + "epoch": 1.1767978401374457, + "grad_norm": 0.016902439296245575, + "learning_rate": 4.331817384944402e-06, + "loss": 0.0019, + "step": 71920 + }, + { + "epoch": 1.1769614660885217, + "grad_norm": 0.02908494882285595, + "learning_rate": 4.330402314142081e-06, + "loss": 0.0014, + "step": 71930 + }, + { + "epoch": 1.1771250920395975, + "grad_norm": 0.002099070930853486, + "learning_rate": 4.328987297949869e-06, + "loss": 0.0018, + "step": 71940 + }, + { + "epoch": 1.1772887179906733, + "grad_norm": 0.01586686260998249, + "learning_rate": 4.32757233648317e-06, + "loss": 0.0008, + "step": 71950 + }, + { + "epoch": 1.1774523439417492, + "grad_norm": 0.06389237195253372, + "learning_rate": 4.3261574298573835e-06, + "loss": 0.0016, + "step": 71960 + }, + { + "epoch": 1.177615969892825, + "grad_norm": 0.04454488307237625, + "learning_rate": 4.3247425781879045e-06, + "loss": 0.0019, + "step": 71970 + }, + { + "epoch": 1.1777795958439008, + "grad_norm": 0.08135736733675003, + "learning_rate": 4.323327781590123e-06, + "loss": 0.0009, + "step": 71980 + }, + { + "epoch": 1.1779432217949766, + "grad_norm": 0.05804780498147011, + "learning_rate": 4.321913040179426e-06, + "loss": 0.0015, + "step": 71990 + }, + { + "epoch": 1.1781068477460526, + "grad_norm": 0.04606780409812927, + "learning_rate": 4.320498354071193e-06, + "loss": 0.0016, + "step": 72000 + }, + { + "epoch": 1.1782704736971283, + "grad_norm": 0.03280048072338104, + "learning_rate": 4.319083723380804e-06, + "loss": 0.0019, + "step": 72010 + }, + { + "epoch": 1.1784340996482041, + "grad_norm": 0.03805740550160408, + "learning_rate": 4.317669148223629e-06, + "loss": 0.0015, + "step": 72020 + }, + { + "epoch": 1.17859772559928, + "grad_norm": 0.046157293021678925, + "learning_rate": 4.316254628715038e-06, + "loss": 0.0011, + "step": 72030 + }, + { + "epoch": 1.1787613515503559, + "grad_norm": 0.0754895955324173, + "learning_rate": 4.314840164970392e-06, + "loss": 0.0015, + "step": 72040 + }, + { + "epoch": 1.1789249775014317, + "grad_norm": 0.0036848357412964106, + "learning_rate": 4.313425757105051e-06, + "loss": 0.0013, + "step": 72050 + }, + { + "epoch": 1.1790886034525077, + "grad_norm": 0.04650893062353134, + "learning_rate": 4.31201140523437e-06, + "loss": 0.0019, + "step": 72060 + }, + { + "epoch": 1.1792522294035834, + "grad_norm": 0.04566042125225067, + "learning_rate": 4.3105971094736975e-06, + "loss": 0.0015, + "step": 72070 + }, + { + "epoch": 1.1794158553546592, + "grad_norm": 0.1227894052863121, + "learning_rate": 4.309182869938379e-06, + "loss": 0.0029, + "step": 72080 + }, + { + "epoch": 1.1795794813057352, + "grad_norm": 0.04693392664194107, + "learning_rate": 4.307768686743756e-06, + "loss": 0.0014, + "step": 72090 + }, + { + "epoch": 1.179743107256811, + "grad_norm": 0.11026965826749802, + "learning_rate": 4.306354560005163e-06, + "loss": 0.0018, + "step": 72100 + }, + { + "epoch": 1.1799067332078867, + "grad_norm": 0.046785108745098114, + "learning_rate": 4.304940489837932e-06, + "loss": 0.0012, + "step": 72110 + }, + { + "epoch": 1.1800703591589625, + "grad_norm": 0.07196402549743652, + "learning_rate": 4.303526476357391e-06, + "loss": 0.0022, + "step": 72120 + }, + { + "epoch": 1.1802339851100385, + "grad_norm": 0.023471105843782425, + "learning_rate": 4.30211251967886e-06, + "loss": 0.0012, + "step": 72130 + }, + { + "epoch": 1.1803976110611143, + "grad_norm": 0.0903412401676178, + "learning_rate": 4.300698619917658e-06, + "loss": 0.0008, + "step": 72140 + }, + { + "epoch": 1.18056123701219, + "grad_norm": 0.01290425006300211, + "learning_rate": 4.2992847771890965e-06, + "loss": 0.001, + "step": 72150 + }, + { + "epoch": 1.180724862963266, + "grad_norm": 0.06439687311649323, + "learning_rate": 4.297870991608485e-06, + "loss": 0.0027, + "step": 72160 + }, + { + "epoch": 1.1808884889143418, + "grad_norm": 0.030832646414637566, + "learning_rate": 4.296457263291127e-06, + "loss": 0.0012, + "step": 72170 + }, + { + "epoch": 1.1810521148654176, + "grad_norm": 0.018244683742523193, + "learning_rate": 4.295043592352321e-06, + "loss": 0.0011, + "step": 72180 + }, + { + "epoch": 1.1812157408164934, + "grad_norm": 0.0346740260720253, + "learning_rate": 4.29362997890736e-06, + "loss": 0.001, + "step": 72190 + }, + { + "epoch": 1.1813793667675694, + "grad_norm": 0.032389428466558456, + "learning_rate": 4.2922164230715355e-06, + "loss": 0.0017, + "step": 72200 + }, + { + "epoch": 1.1815429927186452, + "grad_norm": 0.016945354640483856, + "learning_rate": 4.290802924960132e-06, + "loss": 0.0014, + "step": 72210 + }, + { + "epoch": 1.181706618669721, + "grad_norm": 0.030116165056824684, + "learning_rate": 4.289389484688429e-06, + "loss": 0.0016, + "step": 72220 + }, + { + "epoch": 1.181870244620797, + "grad_norm": 0.028042813763022423, + "learning_rate": 4.287976102371701e-06, + "loss": 0.0013, + "step": 72230 + }, + { + "epoch": 1.1820338705718727, + "grad_norm": 0.028691904619336128, + "learning_rate": 4.28656277812522e-06, + "loss": 0.0022, + "step": 72240 + }, + { + "epoch": 1.1821974965229485, + "grad_norm": 0.0035471832379698753, + "learning_rate": 4.285149512064252e-06, + "loss": 0.0008, + "step": 72250 + }, + { + "epoch": 1.1823611224740245, + "grad_norm": 0.011386027559638023, + "learning_rate": 4.283736304304057e-06, + "loss": 0.0017, + "step": 72260 + }, + { + "epoch": 1.1825247484251002, + "grad_norm": 0.12985564768314362, + "learning_rate": 4.282323154959892e-06, + "loss": 0.0019, + "step": 72270 + }, + { + "epoch": 1.182688374376176, + "grad_norm": 0.06605116277933121, + "learning_rate": 4.280910064147009e-06, + "loss": 0.0012, + "step": 72280 + }, + { + "epoch": 1.182852000327252, + "grad_norm": 0.031029747799038887, + "learning_rate": 4.279497031980654e-06, + "loss": 0.0007, + "step": 72290 + }, + { + "epoch": 1.1830156262783278, + "grad_norm": 0.04837345704436302, + "learning_rate": 4.278084058576071e-06, + "loss": 0.0015, + "step": 72300 + }, + { + "epoch": 1.1831792522294036, + "grad_norm": 0.04557085409760475, + "learning_rate": 4.276671144048495e-06, + "loss": 0.0008, + "step": 72310 + }, + { + "epoch": 1.1833428781804793, + "grad_norm": 0.05936750769615173, + "learning_rate": 4.27525828851316e-06, + "loss": 0.0012, + "step": 72320 + }, + { + "epoch": 1.1835065041315553, + "grad_norm": 0.08688812702894211, + "learning_rate": 4.273845492085293e-06, + "loss": 0.0019, + "step": 72330 + }, + { + "epoch": 1.1836701300826311, + "grad_norm": 0.024858104065060616, + "learning_rate": 4.272432754880117e-06, + "loss": 0.0012, + "step": 72340 + }, + { + "epoch": 1.1838337560337069, + "grad_norm": 0.08039912581443787, + "learning_rate": 4.27102007701285e-06, + "loss": 0.0013, + "step": 72350 + }, + { + "epoch": 1.1839973819847829, + "grad_norm": 0.04867817834019661, + "learning_rate": 4.269607458598705e-06, + "loss": 0.0012, + "step": 72360 + }, + { + "epoch": 1.1841610079358587, + "grad_norm": 0.020656369626522064, + "learning_rate": 4.268194899752891e-06, + "loss": 0.0015, + "step": 72370 + }, + { + "epoch": 1.1843246338869344, + "grad_norm": 0.05521941930055618, + "learning_rate": 4.2667824005906116e-06, + "loss": 0.0007, + "step": 72380 + }, + { + "epoch": 1.1844882598380102, + "grad_norm": 0.05653200298547745, + "learning_rate": 4.265369961227065e-06, + "loss": 0.0013, + "step": 72390 + }, + { + "epoch": 1.1846518857890862, + "grad_norm": 0.0017546155722811818, + "learning_rate": 4.2639575817774445e-06, + "loss": 0.0012, + "step": 72400 + }, + { + "epoch": 1.184815511740162, + "grad_norm": 0.10608597844839096, + "learning_rate": 4.262545262356939e-06, + "loss": 0.0018, + "step": 72410 + }, + { + "epoch": 1.1849791376912377, + "grad_norm": 0.07191050797700882, + "learning_rate": 4.261133003080733e-06, + "loss": 0.0019, + "step": 72420 + }, + { + "epoch": 1.1851427636423137, + "grad_norm": 0.030409125611186028, + "learning_rate": 4.259720804064007e-06, + "loss": 0.0005, + "step": 72430 + }, + { + "epoch": 1.1853063895933895, + "grad_norm": 0.04548092558979988, + "learning_rate": 4.258308665421932e-06, + "loss": 0.0012, + "step": 72440 + }, + { + "epoch": 1.1854700155444653, + "grad_norm": 0.037514809519052505, + "learning_rate": 4.256896587269679e-06, + "loss": 0.0024, + "step": 72450 + }, + { + "epoch": 1.1856336414955413, + "grad_norm": 0.04791383445262909, + "learning_rate": 4.255484569722412e-06, + "loss": 0.0012, + "step": 72460 + }, + { + "epoch": 1.185797267446617, + "grad_norm": 0.03941550850868225, + "learning_rate": 4.254072612895291e-06, + "loss": 0.001, + "step": 72470 + }, + { + "epoch": 1.1859608933976928, + "grad_norm": 0.11777466535568237, + "learning_rate": 4.252660716903469e-06, + "loss": 0.0017, + "step": 72480 + }, + { + "epoch": 1.1861245193487688, + "grad_norm": 0.07658804953098297, + "learning_rate": 4.251248881862096e-06, + "loss": 0.0011, + "step": 72490 + }, + { + "epoch": 1.1862881452998446, + "grad_norm": 0.0373791866004467, + "learning_rate": 4.249837107886318e-06, + "loss": 0.0012, + "step": 72500 + }, + { + "epoch": 1.1864517712509204, + "grad_norm": 0.059319544583559036, + "learning_rate": 4.248425395091273e-06, + "loss": 0.001, + "step": 72510 + }, + { + "epoch": 1.1866153972019962, + "grad_norm": 0.01449304074048996, + "learning_rate": 4.247013743592095e-06, + "loss": 0.0009, + "step": 72520 + }, + { + "epoch": 1.1867790231530722, + "grad_norm": 0.08761975914239883, + "learning_rate": 4.245602153503915e-06, + "loss": 0.0011, + "step": 72530 + }, + { + "epoch": 1.186942649104148, + "grad_norm": 0.038129281252622604, + "learning_rate": 4.244190624941857e-06, + "loss": 0.0009, + "step": 72540 + }, + { + "epoch": 1.1871062750552237, + "grad_norm": 0.03196340054273605, + "learning_rate": 4.24277915802104e-06, + "loss": 0.0017, + "step": 72550 + }, + { + "epoch": 1.1872699010062995, + "grad_norm": 0.043143380433321, + "learning_rate": 4.241367752856578e-06, + "loss": 0.0014, + "step": 72560 + }, + { + "epoch": 1.1874335269573755, + "grad_norm": 0.05428246408700943, + "learning_rate": 4.23995640956358e-06, + "loss": 0.001, + "step": 72570 + }, + { + "epoch": 1.1875971529084512, + "grad_norm": 0.014837591908872128, + "learning_rate": 4.238545128257154e-06, + "loss": 0.002, + "step": 72580 + }, + { + "epoch": 1.187760778859527, + "grad_norm": 0.024181470274925232, + "learning_rate": 4.237133909052394e-06, + "loss": 0.002, + "step": 72590 + }, + { + "epoch": 1.187924404810603, + "grad_norm": 0.04305800795555115, + "learning_rate": 4.235722752064398e-06, + "loss": 0.0009, + "step": 72600 + }, + { + "epoch": 1.1880880307616788, + "grad_norm": 0.14382657408714294, + "learning_rate": 4.2343116574082535e-06, + "loss": 0.0018, + "step": 72610 + }, + { + "epoch": 1.1882516567127546, + "grad_norm": 0.05452917516231537, + "learning_rate": 4.232900625199046e-06, + "loss": 0.0011, + "step": 72620 + }, + { + "epoch": 1.1884152826638306, + "grad_norm": 0.002851971657946706, + "learning_rate": 4.2314896555518525e-06, + "loss": 0.0009, + "step": 72630 + }, + { + "epoch": 1.1885789086149063, + "grad_norm": 0.028746698051691055, + "learning_rate": 4.230078748581749e-06, + "loss": 0.0013, + "step": 72640 + }, + { + "epoch": 1.1887425345659821, + "grad_norm": 0.08779220283031464, + "learning_rate": 4.228667904403803e-06, + "loss": 0.0019, + "step": 72650 + }, + { + "epoch": 1.188906160517058, + "grad_norm": 0.002911148825660348, + "learning_rate": 4.227257123133078e-06, + "loss": 0.0014, + "step": 72660 + }, + { + "epoch": 1.1890697864681339, + "grad_norm": 0.037185221910476685, + "learning_rate": 4.225846404884633e-06, + "loss": 0.0012, + "step": 72670 + }, + { + "epoch": 1.1892334124192097, + "grad_norm": 0.02794252336025238, + "learning_rate": 4.224435749773522e-06, + "loss": 0.0015, + "step": 72680 + }, + { + "epoch": 1.1893970383702854, + "grad_norm": 0.09844185411930084, + "learning_rate": 4.223025157914792e-06, + "loss": 0.0016, + "step": 72690 + }, + { + "epoch": 1.1895606643213614, + "grad_norm": 0.16257110238075256, + "learning_rate": 4.221614629423487e-06, + "loss": 0.0013, + "step": 72700 + }, + { + "epoch": 1.1897242902724372, + "grad_norm": 0.04554278403520584, + "learning_rate": 4.220204164414644e-06, + "loss": 0.0014, + "step": 72710 + }, + { + "epoch": 1.189887916223513, + "grad_norm": 0.17031581699848175, + "learning_rate": 4.218793763003296e-06, + "loss": 0.0018, + "step": 72720 + }, + { + "epoch": 1.190051542174589, + "grad_norm": 0.03764116391539574, + "learning_rate": 4.217383425304472e-06, + "loss": 0.0015, + "step": 72730 + }, + { + "epoch": 1.1902151681256647, + "grad_norm": 0.08119495958089828, + "learning_rate": 4.215973151433193e-06, + "loss": 0.001, + "step": 72740 + }, + { + "epoch": 1.1903787940767405, + "grad_norm": 0.040655434131622314, + "learning_rate": 4.2145629415044754e-06, + "loss": 0.0027, + "step": 72750 + }, + { + "epoch": 1.1905424200278163, + "grad_norm": 0.021082930266857147, + "learning_rate": 4.213152795633332e-06, + "loss": 0.0011, + "step": 72760 + }, + { + "epoch": 1.1907060459788923, + "grad_norm": 0.09340421855449677, + "learning_rate": 4.2117427139347696e-06, + "loss": 0.0013, + "step": 72770 + }, + { + "epoch": 1.190869671929968, + "grad_norm": 0.0315910279750824, + "learning_rate": 4.21033269652379e-06, + "loss": 0.0016, + "step": 72780 + }, + { + "epoch": 1.1910332978810438, + "grad_norm": 0.06637918949127197, + "learning_rate": 4.208922743515389e-06, + "loss": 0.0009, + "step": 72790 + }, + { + "epoch": 1.1911969238321198, + "grad_norm": 0.024217359721660614, + "learning_rate": 4.207512855024557e-06, + "loss": 0.0028, + "step": 72800 + }, + { + "epoch": 1.1913605497831956, + "grad_norm": 0.24659863114356995, + "learning_rate": 4.206103031166281e-06, + "loss": 0.0012, + "step": 72810 + }, + { + "epoch": 1.1915241757342714, + "grad_norm": 0.028071578592061996, + "learning_rate": 4.20469327205554e-06, + "loss": 0.0008, + "step": 72820 + }, + { + "epoch": 1.1916878016853474, + "grad_norm": 0.016061635687947273, + "learning_rate": 4.20328357780731e-06, + "loss": 0.0014, + "step": 72830 + }, + { + "epoch": 1.1918514276364232, + "grad_norm": 0.022765960544347763, + "learning_rate": 4.201873948536561e-06, + "loss": 0.001, + "step": 72840 + }, + { + "epoch": 1.192015053587499, + "grad_norm": 0.05193815752863884, + "learning_rate": 4.200464384358257e-06, + "loss": 0.0009, + "step": 72850 + }, + { + "epoch": 1.192178679538575, + "grad_norm": 0.04159295931458473, + "learning_rate": 4.199054885387359e-06, + "loss": 0.0011, + "step": 72860 + }, + { + "epoch": 1.1923423054896507, + "grad_norm": 0.07398375868797302, + "learning_rate": 4.197645451738819e-06, + "loss": 0.001, + "step": 72870 + }, + { + "epoch": 1.1925059314407265, + "grad_norm": 0.07873750478029251, + "learning_rate": 4.196236083527585e-06, + "loss": 0.0006, + "step": 72880 + }, + { + "epoch": 1.1926695573918022, + "grad_norm": 0.15952168405056, + "learning_rate": 4.194826780868602e-06, + "loss": 0.0018, + "step": 72890 + }, + { + "epoch": 1.1928331833428782, + "grad_norm": 0.05892939120531082, + "learning_rate": 4.193417543876806e-06, + "loss": 0.0014, + "step": 72900 + }, + { + "epoch": 1.192996809293954, + "grad_norm": 0.08886957168579102, + "learning_rate": 4.192008372667133e-06, + "loss": 0.0011, + "step": 72910 + }, + { + "epoch": 1.1931604352450298, + "grad_norm": 0.0091168861836195, + "learning_rate": 4.190599267354507e-06, + "loss": 0.0008, + "step": 72920 + }, + { + "epoch": 1.1933240611961058, + "grad_norm": 0.06435015052556992, + "learning_rate": 4.189190228053851e-06, + "loss": 0.0015, + "step": 72930 + }, + { + "epoch": 1.1934876871471816, + "grad_norm": 0.06907084584236145, + "learning_rate": 4.1877812548800816e-06, + "loss": 0.001, + "step": 72940 + }, + { + "epoch": 1.1936513130982573, + "grad_norm": 0.027643613517284393, + "learning_rate": 4.186372347948109e-06, + "loss": 0.0036, + "step": 72950 + }, + { + "epoch": 1.1938149390493331, + "grad_norm": 0.09798567742109299, + "learning_rate": 4.18496350737284e-06, + "loss": 0.0012, + "step": 72960 + }, + { + "epoch": 1.193978565000409, + "grad_norm": 0.026236360892653465, + "learning_rate": 4.1835547332691744e-06, + "loss": 0.0011, + "step": 72970 + }, + { + "epoch": 1.1941421909514849, + "grad_norm": 0.044981054961681366, + "learning_rate": 4.182146025752007e-06, + "loss": 0.0009, + "step": 72980 + }, + { + "epoch": 1.1943058169025607, + "grad_norm": 0.05679431930184364, + "learning_rate": 4.180737384936227e-06, + "loss": 0.0011, + "step": 72990 + }, + { + "epoch": 1.1944694428536367, + "grad_norm": 0.059831857681274414, + "learning_rate": 4.1793288109367185e-06, + "loss": 0.0016, + "step": 73000 + }, + { + "epoch": 1.1946330688047124, + "grad_norm": 0.08455739170312881, + "learning_rate": 4.17792030386836e-06, + "loss": 0.0013, + "step": 73010 + }, + { + "epoch": 1.1947966947557882, + "grad_norm": 0.03891890496015549, + "learning_rate": 4.176511863846024e-06, + "loss": 0.0026, + "step": 73020 + }, + { + "epoch": 1.1949603207068642, + "grad_norm": 0.10676612704992294, + "learning_rate": 4.17510349098458e-06, + "loss": 0.0014, + "step": 73030 + }, + { + "epoch": 1.19512394665794, + "grad_norm": 0.046449173241853714, + "learning_rate": 4.1736951853988875e-06, + "loss": 0.0013, + "step": 73040 + }, + { + "epoch": 1.1952875726090157, + "grad_norm": 0.0191277377307415, + "learning_rate": 4.1722869472038055e-06, + "loss": 0.0012, + "step": 73050 + }, + { + "epoch": 1.1954511985600917, + "grad_norm": 0.0341968759894371, + "learning_rate": 4.170878776514183e-06, + "loss": 0.0017, + "step": 73060 + }, + { + "epoch": 1.1956148245111675, + "grad_norm": 0.08055173605680466, + "learning_rate": 4.169470673444867e-06, + "loss": 0.001, + "step": 73070 + }, + { + "epoch": 1.1957784504622433, + "grad_norm": 0.016361836344003677, + "learning_rate": 4.168062638110697e-06, + "loss": 0.0005, + "step": 73080 + }, + { + "epoch": 1.195942076413319, + "grad_norm": 0.05318368598818779, + "learning_rate": 4.1666546706265074e-06, + "loss": 0.001, + "step": 73090 + }, + { + "epoch": 1.196105702364395, + "grad_norm": 0.05574622005224228, + "learning_rate": 4.165246771107128e-06, + "loss": 0.002, + "step": 73100 + }, + { + "epoch": 1.1962693283154708, + "grad_norm": 0.02988249436020851, + "learning_rate": 4.163838939667382e-06, + "loss": 0.0009, + "step": 73110 + }, + { + "epoch": 1.1964329542665466, + "grad_norm": 0.01302997674793005, + "learning_rate": 4.162431176422087e-06, + "loss": 0.0014, + "step": 73120 + }, + { + "epoch": 1.1965965802176226, + "grad_norm": 0.07631079852581024, + "learning_rate": 4.161023481486056e-06, + "loss": 0.0022, + "step": 73130 + }, + { + "epoch": 1.1967602061686984, + "grad_norm": 0.04220232740044594, + "learning_rate": 4.159615854974095e-06, + "loss": 0.0011, + "step": 73140 + }, + { + "epoch": 1.1969238321197742, + "grad_norm": 0.05432287976145744, + "learning_rate": 4.158208297001006e-06, + "loss": 0.0014, + "step": 73150 + }, + { + "epoch": 1.19708745807085, + "grad_norm": 0.00919813010841608, + "learning_rate": 4.1568008076815835e-06, + "loss": 0.0018, + "step": 73160 + }, + { + "epoch": 1.197251084021926, + "grad_norm": 0.09239612519741058, + "learning_rate": 4.155393387130618e-06, + "loss": 0.0015, + "step": 73170 + }, + { + "epoch": 1.1974147099730017, + "grad_norm": 0.01925583928823471, + "learning_rate": 4.1539860354628965e-06, + "loss": 0.0015, + "step": 73180 + }, + { + "epoch": 1.1975783359240775, + "grad_norm": 0.0316462516784668, + "learning_rate": 4.1525787527931945e-06, + "loss": 0.0012, + "step": 73190 + }, + { + "epoch": 1.1977419618751535, + "grad_norm": 0.06983600556850433, + "learning_rate": 4.151171539236286e-06, + "loss": 0.0022, + "step": 73200 + }, + { + "epoch": 1.1979055878262292, + "grad_norm": 0.07768736034631729, + "learning_rate": 4.149764394906938e-06, + "loss": 0.0016, + "step": 73210 + }, + { + "epoch": 1.198069213777305, + "grad_norm": 0.05781316012144089, + "learning_rate": 4.148357319919915e-06, + "loss": 0.0015, + "step": 73220 + }, + { + "epoch": 1.198232839728381, + "grad_norm": 0.06035643815994263, + "learning_rate": 4.146950314389969e-06, + "loss": 0.001, + "step": 73230 + }, + { + "epoch": 1.1983964656794568, + "grad_norm": 0.0749223455786705, + "learning_rate": 4.145543378431855e-06, + "loss": 0.0015, + "step": 73240 + }, + { + "epoch": 1.1985600916305326, + "grad_norm": 0.0970311164855957, + "learning_rate": 4.144136512160315e-06, + "loss": 0.0014, + "step": 73250 + }, + { + "epoch": 1.1987237175816086, + "grad_norm": 0.03978846222162247, + "learning_rate": 4.142729715690089e-06, + "loss": 0.0009, + "step": 73260 + }, + { + "epoch": 1.1988873435326843, + "grad_norm": 0.08357556164264679, + "learning_rate": 4.141322989135912e-06, + "loss": 0.001, + "step": 73270 + }, + { + "epoch": 1.19905096948376, + "grad_norm": 0.04062269255518913, + "learning_rate": 4.139916332612509e-06, + "loss": 0.0009, + "step": 73280 + }, + { + "epoch": 1.1992145954348359, + "grad_norm": 0.1443643867969513, + "learning_rate": 4.138509746234604e-06, + "loss": 0.0015, + "step": 73290 + }, + { + "epoch": 1.1993782213859119, + "grad_norm": 0.049388084560632706, + "learning_rate": 4.137103230116914e-06, + "loss": 0.002, + "step": 73300 + }, + { + "epoch": 1.1995418473369877, + "grad_norm": 0.06220998987555504, + "learning_rate": 4.135696784374148e-06, + "loss": 0.0012, + "step": 73310 + }, + { + "epoch": 1.1997054732880634, + "grad_norm": 0.045358214527368546, + "learning_rate": 4.134290409121012e-06, + "loss": 0.0012, + "step": 73320 + }, + { + "epoch": 1.1998690992391392, + "grad_norm": 0.11870374530553818, + "learning_rate": 4.1328841044722046e-06, + "loss": 0.0019, + "step": 73330 + }, + { + "epoch": 1.2000327251902152, + "grad_norm": 0.019955415278673172, + "learning_rate": 4.131477870542419e-06, + "loss": 0.0017, + "step": 73340 + }, + { + "epoch": 1.200196351141291, + "grad_norm": 0.03544195368885994, + "learning_rate": 4.130071707446344e-06, + "loss": 0.0011, + "step": 73350 + }, + { + "epoch": 1.2003599770923667, + "grad_norm": 0.047319281846284866, + "learning_rate": 4.128665615298661e-06, + "loss": 0.0021, + "step": 73360 + }, + { + "epoch": 1.2005236030434427, + "grad_norm": 0.043404094874858856, + "learning_rate": 4.127259594214044e-06, + "loss": 0.001, + "step": 73370 + }, + { + "epoch": 1.2006872289945185, + "grad_norm": 0.028605856001377106, + "learning_rate": 4.125853644307167e-06, + "loss": 0.0022, + "step": 73380 + }, + { + "epoch": 1.2008508549455943, + "grad_norm": 0.0625309944152832, + "learning_rate": 4.124447765692693e-06, + "loss": 0.0012, + "step": 73390 + }, + { + "epoch": 1.2010144808966703, + "grad_norm": 0.18719732761383057, + "learning_rate": 4.12304195848528e-06, + "loss": 0.0012, + "step": 73400 + }, + { + "epoch": 1.201178106847746, + "grad_norm": 0.04038171097636223, + "learning_rate": 4.1216362227995796e-06, + "loss": 0.0012, + "step": 73410 + }, + { + "epoch": 1.2013417327988218, + "grad_norm": 0.08253055065870285, + "learning_rate": 4.1202305587502425e-06, + "loss": 0.006, + "step": 73420 + }, + { + "epoch": 1.2015053587498978, + "grad_norm": 0.05317462980747223, + "learning_rate": 4.118824966451906e-06, + "loss": 0.0017, + "step": 73430 + }, + { + "epoch": 1.2016689847009736, + "grad_norm": 0.08571598678827286, + "learning_rate": 4.117419446019208e-06, + "loss": 0.0012, + "step": 73440 + }, + { + "epoch": 1.2018326106520494, + "grad_norm": 0.04198795557022095, + "learning_rate": 4.116013997566778e-06, + "loss": 0.0009, + "step": 73450 + }, + { + "epoch": 1.2019962366031254, + "grad_norm": 0.044771261513233185, + "learning_rate": 4.114608621209238e-06, + "loss": 0.0009, + "step": 73460 + }, + { + "epoch": 1.2021598625542012, + "grad_norm": 0.03170878067612648, + "learning_rate": 4.1132033170612065e-06, + "loss": 0.0014, + "step": 73470 + }, + { + "epoch": 1.202323488505277, + "grad_norm": 0.051643356680870056, + "learning_rate": 4.111798085237295e-06, + "loss": 0.0016, + "step": 73480 + }, + { + "epoch": 1.2024871144563527, + "grad_norm": 0.061087582260370255, + "learning_rate": 4.11039292585211e-06, + "loss": 0.003, + "step": 73490 + }, + { + "epoch": 1.2026507404074287, + "grad_norm": 0.1084022969007492, + "learning_rate": 4.108987839020252e-06, + "loss": 0.0016, + "step": 73500 + }, + { + "epoch": 1.2028143663585045, + "grad_norm": 0.026638222858309746, + "learning_rate": 4.1075828248563145e-06, + "loss": 0.0009, + "step": 73510 + }, + { + "epoch": 1.2029779923095802, + "grad_norm": 0.027507686987519264, + "learning_rate": 4.106177883474885e-06, + "loss": 0.0013, + "step": 73520 + }, + { + "epoch": 1.203141618260656, + "grad_norm": 0.045355118811130524, + "learning_rate": 4.104773014990546e-06, + "loss": 0.0011, + "step": 73530 + }, + { + "epoch": 1.203305244211732, + "grad_norm": 0.06505275517702103, + "learning_rate": 4.103368219517874e-06, + "loss": 0.0012, + "step": 73540 + }, + { + "epoch": 1.2034688701628078, + "grad_norm": 0.0592808797955513, + "learning_rate": 4.101963497171439e-06, + "loss": 0.0023, + "step": 73550 + }, + { + "epoch": 1.2036324961138836, + "grad_norm": 0.03584162890911102, + "learning_rate": 4.100558848065807e-06, + "loss": 0.0008, + "step": 73560 + }, + { + "epoch": 1.2037961220649596, + "grad_norm": 0.03921978920698166, + "learning_rate": 4.099154272315535e-06, + "loss": 0.0019, + "step": 73570 + }, + { + "epoch": 1.2039597480160353, + "grad_norm": 0.015199974179267883, + "learning_rate": 4.097749770035175e-06, + "loss": 0.0013, + "step": 73580 + }, + { + "epoch": 1.204123373967111, + "grad_norm": 0.0520598441362381, + "learning_rate": 4.096345341339274e-06, + "loss": 0.0016, + "step": 73590 + }, + { + "epoch": 1.204286999918187, + "grad_norm": 0.07999885827302933, + "learning_rate": 4.094940986342373e-06, + "loss": 0.0013, + "step": 73600 + }, + { + "epoch": 1.2044506258692629, + "grad_norm": 0.10286489129066467, + "learning_rate": 4.093536705159005e-06, + "loss": 0.0011, + "step": 73610 + }, + { + "epoch": 1.2046142518203387, + "grad_norm": 0.047496747225522995, + "learning_rate": 4.0921324979037e-06, + "loss": 0.0014, + "step": 73620 + }, + { + "epoch": 1.2047778777714147, + "grad_norm": 0.02437729761004448, + "learning_rate": 4.0907283646909795e-06, + "loss": 0.001, + "step": 73630 + }, + { + "epoch": 1.2049415037224904, + "grad_norm": 0.004412082489579916, + "learning_rate": 4.08932430563536e-06, + "loss": 0.0021, + "step": 73640 + }, + { + "epoch": 1.2051051296735662, + "grad_norm": 0.03516425937414169, + "learning_rate": 4.087920320851351e-06, + "loss": 0.0003, + "step": 73650 + }, + { + "epoch": 1.205268755624642, + "grad_norm": 0.04848313704133034, + "learning_rate": 4.086516410453458e-06, + "loss": 0.0016, + "step": 73660 + }, + { + "epoch": 1.205432381575718, + "grad_norm": 0.0047675808891654015, + "learning_rate": 4.085112574556179e-06, + "loss": 0.0012, + "step": 73670 + }, + { + "epoch": 1.2055960075267937, + "grad_norm": 0.11030463129281998, + "learning_rate": 4.083708813274005e-06, + "loss": 0.0016, + "step": 73680 + }, + { + "epoch": 1.2057596334778695, + "grad_norm": 0.10973118990659714, + "learning_rate": 4.082305126721424e-06, + "loss": 0.0025, + "step": 73690 + }, + { + "epoch": 1.2059232594289455, + "grad_norm": 0.01860835775732994, + "learning_rate": 4.080901515012914e-06, + "loss": 0.0017, + "step": 73700 + }, + { + "epoch": 1.2060868853800213, + "grad_norm": 0.0034170267172157764, + "learning_rate": 4.079497978262948e-06, + "loss": 0.0009, + "step": 73710 + }, + { + "epoch": 1.206250511331097, + "grad_norm": 0.08002614229917526, + "learning_rate": 4.078094516585997e-06, + "loss": 0.002, + "step": 73720 + }, + { + "epoch": 1.2064141372821728, + "grad_norm": 0.08197759091854095, + "learning_rate": 4.0766911300965195e-06, + "loss": 0.0016, + "step": 73730 + }, + { + "epoch": 1.2065777632332488, + "grad_norm": 0.004951344802975655, + "learning_rate": 4.0752878189089725e-06, + "loss": 0.0011, + "step": 73740 + }, + { + "epoch": 1.2067413891843246, + "grad_norm": 0.0046267276629805565, + "learning_rate": 4.073884583137805e-06, + "loss": 0.001, + "step": 73750 + }, + { + "epoch": 1.2069050151354004, + "grad_norm": 0.05993367359042168, + "learning_rate": 4.0724814228974595e-06, + "loss": 0.0009, + "step": 73760 + }, + { + "epoch": 1.2070686410864764, + "grad_norm": 0.06475260108709335, + "learning_rate": 4.071078338302374e-06, + "loss": 0.0011, + "step": 73770 + }, + { + "epoch": 1.2072322670375522, + "grad_norm": 0.04632718488574028, + "learning_rate": 4.0696753294669785e-06, + "loss": 0.0008, + "step": 73780 + }, + { + "epoch": 1.207395892988628, + "grad_norm": 0.07304283231496811, + "learning_rate": 4.068272396505697e-06, + "loss": 0.0014, + "step": 73790 + }, + { + "epoch": 1.207559518939704, + "grad_norm": 0.10290410369634628, + "learning_rate": 4.06686953953295e-06, + "loss": 0.0006, + "step": 73800 + }, + { + "epoch": 1.2077231448907797, + "grad_norm": 0.021572886034846306, + "learning_rate": 4.065466758663148e-06, + "loss": 0.0012, + "step": 73810 + }, + { + "epoch": 1.2078867708418555, + "grad_norm": 0.0545496791601181, + "learning_rate": 4.064064054010699e-06, + "loss": 0.0013, + "step": 73820 + }, + { + "epoch": 1.2080503967929315, + "grad_norm": 0.10006240010261536, + "learning_rate": 4.062661425690001e-06, + "loss": 0.0017, + "step": 73830 + }, + { + "epoch": 1.2082140227440072, + "grad_norm": 0.05202379450201988, + "learning_rate": 4.061258873815447e-06, + "loss": 0.0019, + "step": 73840 + }, + { + "epoch": 1.208377648695083, + "grad_norm": 0.04262712225317955, + "learning_rate": 4.059856398501426e-06, + "loss": 0.0012, + "step": 73850 + }, + { + "epoch": 1.2085412746461588, + "grad_norm": 0.053624287247657776, + "learning_rate": 4.0584539998623175e-06, + "loss": 0.0008, + "step": 73860 + }, + { + "epoch": 1.2087049005972348, + "grad_norm": 0.0672333836555481, + "learning_rate": 4.057051678012499e-06, + "loss": 0.001, + "step": 73870 + }, + { + "epoch": 1.2088685265483106, + "grad_norm": 0.023106999695301056, + "learning_rate": 4.055649433066336e-06, + "loss": 0.001, + "step": 73880 + }, + { + "epoch": 1.2090321524993863, + "grad_norm": 0.021868957206606865, + "learning_rate": 4.0542472651381925e-06, + "loss": 0.0017, + "step": 73890 + }, + { + "epoch": 1.2091957784504623, + "grad_norm": 0.008650953881442547, + "learning_rate": 4.052845174342424e-06, + "loss": 0.0007, + "step": 73900 + }, + { + "epoch": 1.209359404401538, + "grad_norm": 0.14851944148540497, + "learning_rate": 4.051443160793382e-06, + "loss": 0.0026, + "step": 73910 + }, + { + "epoch": 1.2095230303526139, + "grad_norm": 0.13709430396556854, + "learning_rate": 4.050041224605408e-06, + "loss": 0.0014, + "step": 73920 + }, + { + "epoch": 1.2096866563036897, + "grad_norm": 0.016379734501242638, + "learning_rate": 4.048639365892839e-06, + "loss": 0.0016, + "step": 73930 + }, + { + "epoch": 1.2098502822547657, + "grad_norm": 0.02267172932624817, + "learning_rate": 4.047237584770007e-06, + "loss": 0.0013, + "step": 73940 + }, + { + "epoch": 1.2100139082058414, + "grad_norm": 0.03803670033812523, + "learning_rate": 4.045835881351235e-06, + "loss": 0.0009, + "step": 73950 + }, + { + "epoch": 1.2101775341569172, + "grad_norm": 0.06610969454050064, + "learning_rate": 4.044434255750844e-06, + "loss": 0.0015, + "step": 73960 + }, + { + "epoch": 1.2103411601079932, + "grad_norm": 0.028342286124825478, + "learning_rate": 4.043032708083143e-06, + "loss": 0.001, + "step": 73970 + }, + { + "epoch": 1.210504786059069, + "grad_norm": 0.04162374511361122, + "learning_rate": 4.041631238462438e-06, + "loss": 0.0023, + "step": 73980 + }, + { + "epoch": 1.2106684120101447, + "grad_norm": 0.03267696127295494, + "learning_rate": 4.040229847003029e-06, + "loss": 0.0021, + "step": 73990 + }, + { + "epoch": 1.2108320379612207, + "grad_norm": 0.026449577882885933, + "learning_rate": 4.038828533819209e-06, + "loss": 0.0011, + "step": 74000 + }, + { + "epoch": 1.2109956639122965, + "grad_norm": 0.02258426509797573, + "learning_rate": 4.0374272990252625e-06, + "loss": 0.0011, + "step": 74010 + }, + { + "epoch": 1.2111592898633723, + "grad_norm": 0.0716397762298584, + "learning_rate": 4.03602614273547e-06, + "loss": 0.0027, + "step": 74020 + }, + { + "epoch": 1.2113229158144483, + "grad_norm": 0.05017632246017456, + "learning_rate": 4.034625065064108e-06, + "loss": 0.001, + "step": 74030 + }, + { + "epoch": 1.211486541765524, + "grad_norm": 0.05698603391647339, + "learning_rate": 4.033224066125439e-06, + "loss": 0.0013, + "step": 74040 + }, + { + "epoch": 1.2116501677165998, + "grad_norm": 0.06374849379062653, + "learning_rate": 4.031823146033727e-06, + "loss": 0.0012, + "step": 74050 + }, + { + "epoch": 1.2118137936676756, + "grad_norm": 0.13887757062911987, + "learning_rate": 4.030422304903225e-06, + "loss": 0.0016, + "step": 74060 + }, + { + "epoch": 1.2119774196187516, + "grad_norm": 0.18006059527397156, + "learning_rate": 4.02902154284818e-06, + "loss": 0.0013, + "step": 74070 + }, + { + "epoch": 1.2121410455698274, + "grad_norm": 0.10183005779981613, + "learning_rate": 4.027620859982836e-06, + "loss": 0.0009, + "step": 74080 + }, + { + "epoch": 1.2123046715209032, + "grad_norm": 0.07122647762298584, + "learning_rate": 4.0262202564214255e-06, + "loss": 0.0024, + "step": 74090 + }, + { + "epoch": 1.2124682974719792, + "grad_norm": 0.08233967423439026, + "learning_rate": 4.024819732278178e-06, + "loss": 0.0014, + "step": 74100 + }, + { + "epoch": 1.212631923423055, + "grad_norm": 0.03426395356655121, + "learning_rate": 4.023419287667315e-06, + "loss": 0.0011, + "step": 74110 + }, + { + "epoch": 1.2127955493741307, + "grad_norm": 0.11198526620864868, + "learning_rate": 4.022018922703052e-06, + "loss": 0.0013, + "step": 74120 + }, + { + "epoch": 1.2129591753252065, + "grad_norm": 0.004733164329081774, + "learning_rate": 4.020618637499599e-06, + "loss": 0.0013, + "step": 74130 + }, + { + "epoch": 1.2131228012762825, + "grad_norm": 0.07090175151824951, + "learning_rate": 4.019218432171158e-06, + "loss": 0.0008, + "step": 74140 + }, + { + "epoch": 1.2132864272273582, + "grad_norm": 0.11436349153518677, + "learning_rate": 4.017818306831925e-06, + "loss": 0.0011, + "step": 74150 + }, + { + "epoch": 1.213450053178434, + "grad_norm": 0.027770675718784332, + "learning_rate": 4.016418261596089e-06, + "loss": 0.0011, + "step": 74160 + }, + { + "epoch": 1.21361367912951, + "grad_norm": 0.007805570028722286, + "learning_rate": 4.015018296577832e-06, + "loss": 0.003, + "step": 74170 + }, + { + "epoch": 1.2137773050805858, + "grad_norm": 0.04702167958021164, + "learning_rate": 4.013618411891333e-06, + "loss": 0.0007, + "step": 74180 + }, + { + "epoch": 1.2139409310316616, + "grad_norm": 0.046294666826725006, + "learning_rate": 4.012218607650759e-06, + "loss": 0.001, + "step": 74190 + }, + { + "epoch": 1.2141045569827376, + "grad_norm": 0.018465016037225723, + "learning_rate": 4.010818883970275e-06, + "loss": 0.0009, + "step": 74200 + }, + { + "epoch": 1.2142681829338133, + "grad_norm": 0.0883779227733612, + "learning_rate": 4.0094192409640374e-06, + "loss": 0.0013, + "step": 74210 + }, + { + "epoch": 1.214431808884889, + "grad_norm": 0.04356410354375839, + "learning_rate": 4.008019678746197e-06, + "loss": 0.0014, + "step": 74220 + }, + { + "epoch": 1.214595434835965, + "grad_norm": 0.023246828466653824, + "learning_rate": 4.006620197430896e-06, + "loss": 0.001, + "step": 74230 + }, + { + "epoch": 1.2147590607870409, + "grad_norm": 0.03040974959731102, + "learning_rate": 4.005220797132271e-06, + "loss": 0.0038, + "step": 74240 + }, + { + "epoch": 1.2149226867381167, + "grad_norm": 0.023721804842352867, + "learning_rate": 4.003821477964455e-06, + "loss": 0.002, + "step": 74250 + }, + { + "epoch": 1.2150863126891924, + "grad_norm": 0.06594086438417435, + "learning_rate": 4.002422240041569e-06, + "loss": 0.0014, + "step": 74260 + }, + { + "epoch": 1.2152499386402684, + "grad_norm": 0.05709246173501015, + "learning_rate": 4.001023083477731e-06, + "loss": 0.0014, + "step": 74270 + }, + { + "epoch": 1.2154135645913442, + "grad_norm": 0.053758423775434494, + "learning_rate": 3.999624008387052e-06, + "loss": 0.0012, + "step": 74280 + }, + { + "epoch": 1.21557719054242, + "grad_norm": 0.2136073261499405, + "learning_rate": 3.9982250148836345e-06, + "loss": 0.0017, + "step": 74290 + }, + { + "epoch": 1.2157408164934957, + "grad_norm": 0.0475122295320034, + "learning_rate": 3.996826103081577e-06, + "loss": 0.0009, + "step": 74300 + }, + { + "epoch": 1.2159044424445717, + "grad_norm": 0.0780680775642395, + "learning_rate": 3.995427273094969e-06, + "loss": 0.0014, + "step": 74310 + }, + { + "epoch": 1.2160680683956475, + "grad_norm": 0.029846929013729095, + "learning_rate": 3.994028525037896e-06, + "loss": 0.0015, + "step": 74320 + }, + { + "epoch": 1.2162316943467233, + "grad_norm": 0.06027283892035484, + "learning_rate": 3.992629859024433e-06, + "loss": 0.0028, + "step": 74330 + }, + { + "epoch": 1.2163953202977993, + "grad_norm": 0.0844481885433197, + "learning_rate": 3.991231275168651e-06, + "loss": 0.0012, + "step": 74340 + }, + { + "epoch": 1.216558946248875, + "grad_norm": 0.036458928138017654, + "learning_rate": 3.989832773584614e-06, + "loss": 0.0009, + "step": 74350 + }, + { + "epoch": 1.2167225721999508, + "grad_norm": 0.09017597883939743, + "learning_rate": 3.9884343543863805e-06, + "loss": 0.001, + "step": 74360 + }, + { + "epoch": 1.2168861981510268, + "grad_norm": 0.037239089608192444, + "learning_rate": 3.987036017687998e-06, + "loss": 0.0011, + "step": 74370 + }, + { + "epoch": 1.2170498241021026, + "grad_norm": 0.02048400789499283, + "learning_rate": 3.985637763603512e-06, + "loss": 0.0011, + "step": 74380 + }, + { + "epoch": 1.2172134500531784, + "grad_norm": 0.15330049395561218, + "learning_rate": 3.984239592246959e-06, + "loss": 0.0012, + "step": 74390 + }, + { + "epoch": 1.2173770760042544, + "grad_norm": 0.049092765897512436, + "learning_rate": 3.982841503732369e-06, + "loss": 0.001, + "step": 74400 + }, + { + "epoch": 1.2175407019553302, + "grad_norm": 0.019493570551276207, + "learning_rate": 3.981443498173764e-06, + "loss": 0.0011, + "step": 74410 + }, + { + "epoch": 1.217704327906406, + "grad_norm": 0.10030219703912735, + "learning_rate": 3.980045575685163e-06, + "loss": 0.0016, + "step": 74420 + }, + { + "epoch": 1.2178679538574817, + "grad_norm": 0.07201150804758072, + "learning_rate": 3.978647736380575e-06, + "loss": 0.0008, + "step": 74430 + }, + { + "epoch": 1.2180315798085577, + "grad_norm": 0.09272464364767075, + "learning_rate": 3.977249980374002e-06, + "loss": 0.0012, + "step": 74440 + }, + { + "epoch": 1.2181952057596335, + "grad_norm": 0.20647422969341278, + "learning_rate": 3.975852307779441e-06, + "loss": 0.002, + "step": 74450 + }, + { + "epoch": 1.2183588317107092, + "grad_norm": 0.07151030004024506, + "learning_rate": 3.974454718710882e-06, + "loss": 0.0012, + "step": 74460 + }, + { + "epoch": 1.2185224576617852, + "grad_norm": 0.01113701332360506, + "learning_rate": 3.973057213282307e-06, + "loss": 0.0013, + "step": 74470 + }, + { + "epoch": 1.218686083612861, + "grad_norm": 0.013661486096680164, + "learning_rate": 3.971659791607691e-06, + "loss": 0.0015, + "step": 74480 + }, + { + "epoch": 1.2188497095639368, + "grad_norm": 0.021350817754864693, + "learning_rate": 3.9702624538010035e-06, + "loss": 0.0016, + "step": 74490 + }, + { + "epoch": 1.2190133355150126, + "grad_norm": 0.03898026421666145, + "learning_rate": 3.968865199976207e-06, + "loss": 0.0024, + "step": 74500 + }, + { + "epoch": 1.2191769614660886, + "grad_norm": 0.041934218257665634, + "learning_rate": 3.9674680302472565e-06, + "loss": 0.0013, + "step": 74510 + }, + { + "epoch": 1.2193405874171643, + "grad_norm": 0.045785367488861084, + "learning_rate": 3.966070944728101e-06, + "loss": 0.001, + "step": 74520 + }, + { + "epoch": 1.21950421336824, + "grad_norm": 0.07368072867393494, + "learning_rate": 3.964673943532682e-06, + "loss": 0.0022, + "step": 74530 + }, + { + "epoch": 1.219667839319316, + "grad_norm": 0.0951850563287735, + "learning_rate": 3.963277026774932e-06, + "loss": 0.0008, + "step": 74540 + }, + { + "epoch": 1.2198314652703919, + "grad_norm": 0.03721340000629425, + "learning_rate": 3.961880194568781e-06, + "loss": 0.0011, + "step": 74550 + }, + { + "epoch": 1.2199950912214677, + "grad_norm": 0.019539376720786095, + "learning_rate": 3.96048344702815e-06, + "loss": 0.0011, + "step": 74560 + }, + { + "epoch": 1.2201587171725436, + "grad_norm": 0.0532948337495327, + "learning_rate": 3.959086784266952e-06, + "loss": 0.0011, + "step": 74570 + }, + { + "epoch": 1.2203223431236194, + "grad_norm": 0.09206970036029816, + "learning_rate": 3.9576902063990945e-06, + "loss": 0.0021, + "step": 74580 + }, + { + "epoch": 1.2204859690746952, + "grad_norm": 0.050228238105773926, + "learning_rate": 3.956293713538477e-06, + "loss": 0.0021, + "step": 74590 + }, + { + "epoch": 1.2206495950257712, + "grad_norm": 0.03838194161653519, + "learning_rate": 3.954897305798993e-06, + "loss": 0.0013, + "step": 74600 + }, + { + "epoch": 1.220813220976847, + "grad_norm": 0.05389568209648132, + "learning_rate": 3.9535009832945305e-06, + "loss": 0.0011, + "step": 74610 + }, + { + "epoch": 1.2209768469279227, + "grad_norm": 0.07620038837194443, + "learning_rate": 3.952104746138966e-06, + "loss": 0.0023, + "step": 74620 + }, + { + "epoch": 1.2211404728789985, + "grad_norm": 0.031721144914627075, + "learning_rate": 3.9507085944461735e-06, + "loss": 0.0015, + "step": 74630 + }, + { + "epoch": 1.2213040988300745, + "grad_norm": 0.06377547979354858, + "learning_rate": 3.949312528330019e-06, + "loss": 0.0013, + "step": 74640 + }, + { + "epoch": 1.2214677247811503, + "grad_norm": 0.06723318994045258, + "learning_rate": 3.947916547904358e-06, + "loss": 0.0017, + "step": 74650 + }, + { + "epoch": 1.221631350732226, + "grad_norm": 0.08901673555374146, + "learning_rate": 3.946520653283045e-06, + "loss": 0.0013, + "step": 74660 + }, + { + "epoch": 1.221794976683302, + "grad_norm": 0.05046418681740761, + "learning_rate": 3.9451248445799224e-06, + "loss": 0.0007, + "step": 74670 + }, + { + "epoch": 1.2219586026343778, + "grad_norm": 0.12032615393400192, + "learning_rate": 3.94372912190883e-06, + "loss": 0.0023, + "step": 74680 + }, + { + "epoch": 1.2221222285854536, + "grad_norm": 0.07041841000318527, + "learning_rate": 3.9423334853835956e-06, + "loss": 0.0017, + "step": 74690 + }, + { + "epoch": 1.2222858545365294, + "grad_norm": 0.029427969828248024, + "learning_rate": 3.940937935118044e-06, + "loss": 0.0006, + "step": 74700 + }, + { + "epoch": 1.2224494804876054, + "grad_norm": 0.04333318769931793, + "learning_rate": 3.9395424712259915e-06, + "loss": 0.002, + "step": 74710 + }, + { + "epoch": 1.2226131064386812, + "grad_norm": 0.06791647523641586, + "learning_rate": 3.938147093821246e-06, + "loss": 0.0011, + "step": 74720 + }, + { + "epoch": 1.222776732389757, + "grad_norm": 0.0302668958902359, + "learning_rate": 3.936751803017612e-06, + "loss": 0.0006, + "step": 74730 + }, + { + "epoch": 1.222940358340833, + "grad_norm": 0.05871063843369484, + "learning_rate": 3.9353565989288825e-06, + "loss": 0.0015, + "step": 74740 + }, + { + "epoch": 1.2231039842919087, + "grad_norm": 0.0647435411810875, + "learning_rate": 3.933961481668847e-06, + "loss": 0.0015, + "step": 74750 + }, + { + "epoch": 1.2232676102429845, + "grad_norm": 0.08770201355218887, + "learning_rate": 3.932566451351286e-06, + "loss": 0.0033, + "step": 74760 + }, + { + "epoch": 1.2234312361940605, + "grad_norm": 0.07383829355239868, + "learning_rate": 3.931171508089974e-06, + "loss": 0.0018, + "step": 74770 + }, + { + "epoch": 1.2235948621451362, + "grad_norm": 0.0903746709227562, + "learning_rate": 3.9297766519986766e-06, + "loss": 0.0012, + "step": 74780 + }, + { + "epoch": 1.223758488096212, + "grad_norm": 0.017470426857471466, + "learning_rate": 3.928381883191155e-06, + "loss": 0.0016, + "step": 74790 + }, + { + "epoch": 1.223922114047288, + "grad_norm": 0.02390780858695507, + "learning_rate": 3.926987201781162e-06, + "loss": 0.0009, + "step": 74800 + }, + { + "epoch": 1.2240857399983638, + "grad_norm": 0.05151120573282242, + "learning_rate": 3.92559260788244e-06, + "loss": 0.0009, + "step": 74810 + }, + { + "epoch": 1.2242493659494396, + "grad_norm": 0.041096292436122894, + "learning_rate": 3.924198101608731e-06, + "loss": 0.001, + "step": 74820 + }, + { + "epoch": 1.2244129919005153, + "grad_norm": 0.03882312774658203, + "learning_rate": 3.922803683073763e-06, + "loss": 0.0009, + "step": 74830 + }, + { + "epoch": 1.2245766178515913, + "grad_norm": 0.004259459674358368, + "learning_rate": 3.921409352391262e-06, + "loss": 0.0007, + "step": 74840 + }, + { + "epoch": 1.224740243802667, + "grad_norm": 0.1282317042350769, + "learning_rate": 3.920015109674946e-06, + "loss": 0.0012, + "step": 74850 + }, + { + "epoch": 1.2249038697537429, + "grad_norm": 0.05232157185673714, + "learning_rate": 3.918620955038522e-06, + "loss": 0.0009, + "step": 74860 + }, + { + "epoch": 1.2250674957048189, + "grad_norm": 0.01994086243212223, + "learning_rate": 3.917226888595693e-06, + "loss": 0.001, + "step": 74870 + }, + { + "epoch": 1.2252311216558947, + "grad_norm": 0.04924019053578377, + "learning_rate": 3.915832910460157e-06, + "loss": 0.0015, + "step": 74880 + }, + { + "epoch": 1.2253947476069704, + "grad_norm": 0.21405909955501556, + "learning_rate": 3.914439020745599e-06, + "loss": 0.002, + "step": 74890 + }, + { + "epoch": 1.2255583735580462, + "grad_norm": 0.05332926660776138, + "learning_rate": 3.9130452195657006e-06, + "loss": 0.0012, + "step": 74900 + }, + { + "epoch": 1.2257219995091222, + "grad_norm": 0.029374608770012856, + "learning_rate": 3.911651507034137e-06, + "loss": 0.0014, + "step": 74910 + }, + { + "epoch": 1.225885625460198, + "grad_norm": 0.045646782964468, + "learning_rate": 3.9102578832645736e-06, + "loss": 0.0016, + "step": 74920 + }, + { + "epoch": 1.2260492514112737, + "grad_norm": 0.05251171067357063, + "learning_rate": 3.908864348370669e-06, + "loss": 0.0013, + "step": 74930 + }, + { + "epoch": 1.2262128773623497, + "grad_norm": 0.006718936376273632, + "learning_rate": 3.9074709024660765e-06, + "loss": 0.0015, + "step": 74940 + }, + { + "epoch": 1.2263765033134255, + "grad_norm": 0.018619919195771217, + "learning_rate": 3.906077545664439e-06, + "loss": 0.0006, + "step": 74950 + }, + { + "epoch": 1.2265401292645013, + "grad_norm": 0.08224703371524811, + "learning_rate": 3.904684278079396e-06, + "loss": 0.0014, + "step": 74960 + }, + { + "epoch": 1.2267037552155773, + "grad_norm": 0.009832433424890041, + "learning_rate": 3.903291099824576e-06, + "loss": 0.0007, + "step": 74970 + }, + { + "epoch": 1.226867381166653, + "grad_norm": 0.08725149184465408, + "learning_rate": 3.901898011013603e-06, + "loss": 0.0018, + "step": 74980 + }, + { + "epoch": 1.2270310071177288, + "grad_norm": 0.27727824449539185, + "learning_rate": 3.900505011760092e-06, + "loss": 0.0024, + "step": 74990 + }, + { + "epoch": 1.2271946330688048, + "grad_norm": 0.045055750757455826, + "learning_rate": 3.899112102177651e-06, + "loss": 0.0015, + "step": 75000 + }, + { + "epoch": 1.2273582590198806, + "grad_norm": 0.060741279274225235, + "learning_rate": 3.897719282379883e-06, + "loss": 0.001, + "step": 75010 + }, + { + "epoch": 1.2275218849709564, + "grad_norm": 0.08668284863233566, + "learning_rate": 3.896326552480379e-06, + "loss": 0.0014, + "step": 75020 + }, + { + "epoch": 1.2276855109220322, + "grad_norm": 0.05349934101104736, + "learning_rate": 3.894933912592726e-06, + "loss": 0.0013, + "step": 75030 + }, + { + "epoch": 1.2278491368731081, + "grad_norm": 0.06193646416068077, + "learning_rate": 3.893541362830503e-06, + "loss": 0.0021, + "step": 75040 + }, + { + "epoch": 1.228012762824184, + "grad_norm": 0.015309491194784641, + "learning_rate": 3.892148903307283e-06, + "loss": 0.001, + "step": 75050 + }, + { + "epoch": 1.2281763887752597, + "grad_norm": 0.029573263600468636, + "learning_rate": 3.890756534136629e-06, + "loss": 0.0017, + "step": 75060 + }, + { + "epoch": 1.2283400147263357, + "grad_norm": 0.05194779112935066, + "learning_rate": 3.889364255432098e-06, + "loss": 0.0011, + "step": 75070 + }, + { + "epoch": 1.2285036406774115, + "grad_norm": 0.06401064991950989, + "learning_rate": 3.8879720673072395e-06, + "loss": 0.0011, + "step": 75080 + }, + { + "epoch": 1.2286672666284872, + "grad_norm": 0.03716651722788811, + "learning_rate": 3.886579969875595e-06, + "loss": 0.001, + "step": 75090 + }, + { + "epoch": 1.228830892579563, + "grad_norm": 0.04947042465209961, + "learning_rate": 3.885187963250701e-06, + "loss": 0.0012, + "step": 75100 + }, + { + "epoch": 1.228994518530639, + "grad_norm": 0.06774580478668213, + "learning_rate": 3.8837960475460836e-06, + "loss": 0.0009, + "step": 75110 + }, + { + "epoch": 1.2291581444817148, + "grad_norm": 0.015685144811868668, + "learning_rate": 3.882404222875264e-06, + "loss": 0.0014, + "step": 75120 + }, + { + "epoch": 1.2293217704327906, + "grad_norm": 0.04278433322906494, + "learning_rate": 3.881012489351752e-06, + "loss": 0.0023, + "step": 75130 + }, + { + "epoch": 1.2294853963838666, + "grad_norm": 0.0625486895442009, + "learning_rate": 3.879620847089055e-06, + "loss": 0.0015, + "step": 75140 + }, + { + "epoch": 1.2296490223349423, + "grad_norm": 0.027868643403053284, + "learning_rate": 3.87822929620067e-06, + "loss": 0.0009, + "step": 75150 + }, + { + "epoch": 1.229812648286018, + "grad_norm": 0.017256226390600204, + "learning_rate": 3.876837836800086e-06, + "loss": 0.0019, + "step": 75160 + }, + { + "epoch": 1.229976274237094, + "grad_norm": 0.030246959999203682, + "learning_rate": 3.875446469000788e-06, + "loss": 0.0009, + "step": 75170 + }, + { + "epoch": 1.2301399001881699, + "grad_norm": 0.09192854166030884, + "learning_rate": 3.874055192916248e-06, + "loss": 0.001, + "step": 75180 + }, + { + "epoch": 1.2303035261392457, + "grad_norm": 0.027013687416911125, + "learning_rate": 3.872664008659938e-06, + "loss": 0.0008, + "step": 75190 + }, + { + "epoch": 1.2304671520903216, + "grad_norm": 0.11414414644241333, + "learning_rate": 3.871272916345314e-06, + "loss": 0.0016, + "step": 75200 + }, + { + "epoch": 1.2306307780413974, + "grad_norm": 0.23190844058990479, + "learning_rate": 3.869881916085832e-06, + "loss": 0.001, + "step": 75210 + }, + { + "epoch": 1.2307944039924732, + "grad_norm": 0.01144721731543541, + "learning_rate": 3.868491007994935e-06, + "loss": 0.0014, + "step": 75220 + }, + { + "epoch": 1.230958029943549, + "grad_norm": 0.028903638944029808, + "learning_rate": 3.8671001921860614e-06, + "loss": 0.0013, + "step": 75230 + }, + { + "epoch": 1.231121655894625, + "grad_norm": 0.004566123243421316, + "learning_rate": 3.8657094687726424e-06, + "loss": 0.0009, + "step": 75240 + }, + { + "epoch": 1.2312852818457007, + "grad_norm": 0.08521173149347305, + "learning_rate": 3.8643188378681e-06, + "loss": 0.0008, + "step": 75250 + }, + { + "epoch": 1.2314489077967765, + "grad_norm": 0.07526182383298874, + "learning_rate": 3.86292829958585e-06, + "loss": 0.0021, + "step": 75260 + }, + { + "epoch": 1.2316125337478523, + "grad_norm": 0.04024568572640419, + "learning_rate": 3.861537854039298e-06, + "loss": 0.0019, + "step": 75270 + }, + { + "epoch": 1.2317761596989283, + "grad_norm": 0.10666966438293457, + "learning_rate": 3.8601475013418455e-06, + "loss": 0.0014, + "step": 75280 + }, + { + "epoch": 1.231939785650004, + "grad_norm": 0.06438332796096802, + "learning_rate": 3.858757241606884e-06, + "loss": 0.002, + "step": 75290 + }, + { + "epoch": 1.2321034116010798, + "grad_norm": 0.036823879927396774, + "learning_rate": 3.8573670749478005e-06, + "loss": 0.0013, + "step": 75300 + }, + { + "epoch": 1.2322670375521558, + "grad_norm": 0.03285127878189087, + "learning_rate": 3.85597700147797e-06, + "loss": 0.0008, + "step": 75310 + }, + { + "epoch": 1.2324306635032316, + "grad_norm": 0.06669161468744278, + "learning_rate": 3.8545870213107626e-06, + "loss": 0.0026, + "step": 75320 + }, + { + "epoch": 1.2325942894543074, + "grad_norm": 0.04048169404268265, + "learning_rate": 3.853197134559541e-06, + "loss": 0.0013, + "step": 75330 + }, + { + "epoch": 1.2327579154053834, + "grad_norm": 0.03150138258934021, + "learning_rate": 3.85180734133766e-06, + "loss": 0.0011, + "step": 75340 + }, + { + "epoch": 1.2329215413564591, + "grad_norm": 0.12060286104679108, + "learning_rate": 3.850417641758466e-06, + "loss": 0.0014, + "step": 75350 + }, + { + "epoch": 1.233085167307535, + "grad_norm": 0.04195263609290123, + "learning_rate": 3.849028035935296e-06, + "loss": 0.001, + "step": 75360 + }, + { + "epoch": 1.233248793258611, + "grad_norm": 0.04212228208780289, + "learning_rate": 3.847638523981485e-06, + "loss": 0.001, + "step": 75370 + }, + { + "epoch": 1.2334124192096867, + "grad_norm": 0.06619065254926682, + "learning_rate": 3.846249106010355e-06, + "loss": 0.001, + "step": 75380 + }, + { + "epoch": 1.2335760451607625, + "grad_norm": 0.041517145931720734, + "learning_rate": 3.844859782135222e-06, + "loss": 0.0059, + "step": 75390 + }, + { + "epoch": 1.2337396711118382, + "grad_norm": 0.10518371313810349, + "learning_rate": 3.843470552469396e-06, + "loss": 0.0012, + "step": 75400 + }, + { + "epoch": 1.2339032970629142, + "grad_norm": 0.02846491150557995, + "learning_rate": 3.842081417126176e-06, + "loss": 0.001, + "step": 75410 + }, + { + "epoch": 1.23406692301399, + "grad_norm": 0.03170047700405121, + "learning_rate": 3.840692376218856e-06, + "loss": 0.001, + "step": 75420 + }, + { + "epoch": 1.2342305489650658, + "grad_norm": 0.042040154337882996, + "learning_rate": 3.839303429860722e-06, + "loss": 0.001, + "step": 75430 + }, + { + "epoch": 1.2343941749161418, + "grad_norm": 0.0888720378279686, + "learning_rate": 3.837914578165053e-06, + "loss": 0.0009, + "step": 75440 + }, + { + "epoch": 1.2345578008672176, + "grad_norm": 0.00809149257838726, + "learning_rate": 3.836525821245115e-06, + "loss": 0.0011, + "step": 75450 + }, + { + "epoch": 1.2347214268182933, + "grad_norm": 0.003200968960300088, + "learning_rate": 3.835137159214173e-06, + "loss": 0.001, + "step": 75460 + }, + { + "epoch": 1.234885052769369, + "grad_norm": 0.25250300765037537, + "learning_rate": 3.833748592185481e-06, + "loss": 0.0023, + "step": 75470 + }, + { + "epoch": 1.235048678720445, + "grad_norm": 0.22332283854484558, + "learning_rate": 3.832360120272286e-06, + "loss": 0.0019, + "step": 75480 + }, + { + "epoch": 1.2352123046715209, + "grad_norm": 0.015924707055091858, + "learning_rate": 3.830971743587827e-06, + "loss": 0.0014, + "step": 75490 + }, + { + "epoch": 1.2353759306225967, + "grad_norm": 0.027634384110569954, + "learning_rate": 3.829583462245335e-06, + "loss": 0.0015, + "step": 75500 + }, + { + "epoch": 1.2355395565736726, + "grad_norm": 0.09749562293291092, + "learning_rate": 3.8281952763580335e-06, + "loss": 0.0017, + "step": 75510 + }, + { + "epoch": 1.2357031825247484, + "grad_norm": 0.03955467790365219, + "learning_rate": 3.826807186039139e-06, + "loss": 0.0017, + "step": 75520 + }, + { + "epoch": 1.2358668084758242, + "grad_norm": 0.049399811774492264, + "learning_rate": 3.825419191401858e-06, + "loss": 0.001, + "step": 75530 + }, + { + "epoch": 1.2360304344269002, + "grad_norm": 0.03131980076432228, + "learning_rate": 3.824031292559392e-06, + "loss": 0.0009, + "step": 75540 + }, + { + "epoch": 1.236194060377976, + "grad_norm": 0.06004263088107109, + "learning_rate": 3.8226434896249334e-06, + "loss": 0.0011, + "step": 75550 + }, + { + "epoch": 1.2363576863290517, + "grad_norm": 0.2504695951938629, + "learning_rate": 3.821255782711665e-06, + "loss": 0.0012, + "step": 75560 + }, + { + "epoch": 1.2365213122801277, + "grad_norm": 0.05799531564116478, + "learning_rate": 3.819868171932765e-06, + "loss": 0.0009, + "step": 75570 + }, + { + "epoch": 1.2366849382312035, + "grad_norm": 0.047677669674158096, + "learning_rate": 3.8184806574014025e-06, + "loss": 0.0018, + "step": 75580 + }, + { + "epoch": 1.2368485641822793, + "grad_norm": 0.034939464181661606, + "learning_rate": 3.817093239230737e-06, + "loss": 0.0011, + "step": 75590 + }, + { + "epoch": 1.237012190133355, + "grad_norm": 0.04398292303085327, + "learning_rate": 3.815705917533922e-06, + "loss": 0.0013, + "step": 75600 + }, + { + "epoch": 1.237175816084431, + "grad_norm": 0.0009588264510966837, + "learning_rate": 3.814318692424103e-06, + "loss": 0.0008, + "step": 75610 + }, + { + "epoch": 1.2373394420355068, + "grad_norm": 0.06344334036111832, + "learning_rate": 3.8129315640144183e-06, + "loss": 0.001, + "step": 75620 + }, + { + "epoch": 1.2375030679865826, + "grad_norm": 0.03541975095868111, + "learning_rate": 3.8115445324179955e-06, + "loss": 0.0013, + "step": 75630 + }, + { + "epoch": 1.2376666939376586, + "grad_norm": 0.018198177218437195, + "learning_rate": 3.810157597747958e-06, + "loss": 0.0008, + "step": 75640 + }, + { + "epoch": 1.2378303198887344, + "grad_norm": 0.036116812378168106, + "learning_rate": 3.808770760117418e-06, + "loss": 0.0013, + "step": 75650 + }, + { + "epoch": 1.2379939458398102, + "grad_norm": 0.2331334501504898, + "learning_rate": 3.807384019639482e-06, + "loss": 0.0032, + "step": 75660 + }, + { + "epoch": 1.238157571790886, + "grad_norm": 0.04998109117150307, + "learning_rate": 3.805997376427248e-06, + "loss": 0.0006, + "step": 75670 + }, + { + "epoch": 1.238321197741962, + "grad_norm": 0.026762284338474274, + "learning_rate": 3.8046108305938057e-06, + "loss": 0.001, + "step": 75680 + }, + { + "epoch": 1.2384848236930377, + "grad_norm": 0.08631635457277298, + "learning_rate": 3.8032243822522362e-06, + "loss": 0.0011, + "step": 75690 + }, + { + "epoch": 1.2386484496441135, + "grad_norm": 0.10220127552747726, + "learning_rate": 3.801838031515615e-06, + "loss": 0.0018, + "step": 75700 + }, + { + "epoch": 1.2388120755951895, + "grad_norm": 0.040783800184726715, + "learning_rate": 3.8004517784970072e-06, + "loss": 0.0013, + "step": 75710 + }, + { + "epoch": 1.2389757015462652, + "grad_norm": 0.03257445991039276, + "learning_rate": 3.7990656233094712e-06, + "loss": 0.0008, + "step": 75720 + }, + { + "epoch": 1.239139327497341, + "grad_norm": 0.06778103858232498, + "learning_rate": 3.7976795660660574e-06, + "loss": 0.0014, + "step": 75730 + }, + { + "epoch": 1.239302953448417, + "grad_norm": 0.07347385585308075, + "learning_rate": 3.7962936068798075e-06, + "loss": 0.0012, + "step": 75740 + }, + { + "epoch": 1.2394665793994928, + "grad_norm": 0.012228734791278839, + "learning_rate": 3.794907745863755e-06, + "loss": 0.001, + "step": 75750 + }, + { + "epoch": 1.2396302053505686, + "grad_norm": 0.07704233378171921, + "learning_rate": 3.793521983130927e-06, + "loss": 0.0016, + "step": 75760 + }, + { + "epoch": 1.2397938313016446, + "grad_norm": 0.05287954583764076, + "learning_rate": 3.7921363187943427e-06, + "loss": 0.0017, + "step": 75770 + }, + { + "epoch": 1.2399574572527203, + "grad_norm": 0.042877621948719025, + "learning_rate": 3.7907507529670095e-06, + "loss": 0.0017, + "step": 75780 + }, + { + "epoch": 1.240121083203796, + "grad_norm": 0.1314309537410736, + "learning_rate": 3.78936528576193e-06, + "loss": 0.0016, + "step": 75790 + }, + { + "epoch": 1.2402847091548719, + "grad_norm": 0.06001598387956619, + "learning_rate": 3.7879799172920996e-06, + "loss": 0.0008, + "step": 75800 + }, + { + "epoch": 1.2404483351059479, + "grad_norm": 0.165829598903656, + "learning_rate": 3.7865946476705033e-06, + "loss": 0.0016, + "step": 75810 + }, + { + "epoch": 1.2406119610570236, + "grad_norm": 0.049714647233486176, + "learning_rate": 3.785209477010119e-06, + "loss": 0.0008, + "step": 75820 + }, + { + "epoch": 1.2407755870080994, + "grad_norm": 0.049130432307720184, + "learning_rate": 3.783824405423917e-06, + "loss": 0.0014, + "step": 75830 + }, + { + "epoch": 1.2409392129591754, + "grad_norm": 0.05671097710728645, + "learning_rate": 3.782439433024859e-06, + "loss": 0.001, + "step": 75840 + }, + { + "epoch": 1.2411028389102512, + "grad_norm": 0.03538745269179344, + "learning_rate": 3.7810545599258973e-06, + "loss": 0.0011, + "step": 75850 + }, + { + "epoch": 1.241266464861327, + "grad_norm": 0.07837069034576416, + "learning_rate": 3.779669786239979e-06, + "loss": 0.0009, + "step": 75860 + }, + { + "epoch": 1.2414300908124027, + "grad_norm": 0.07940542697906494, + "learning_rate": 3.7782851120800413e-06, + "loss": 0.0012, + "step": 75870 + }, + { + "epoch": 1.2415937167634787, + "grad_norm": 0.10000597685575485, + "learning_rate": 3.776900537559013e-06, + "loss": 0.0039, + "step": 75880 + }, + { + "epoch": 1.2417573427145545, + "grad_norm": 0.07211638242006302, + "learning_rate": 3.7755160627898156e-06, + "loss": 0.0017, + "step": 75890 + }, + { + "epoch": 1.2419209686656303, + "grad_norm": 0.07236737757921219, + "learning_rate": 3.774131687885362e-06, + "loss": 0.0027, + "step": 75900 + }, + { + "epoch": 1.2420845946167063, + "grad_norm": 0.18286970257759094, + "learning_rate": 3.7727474129585575e-06, + "loss": 0.0016, + "step": 75910 + }, + { + "epoch": 1.242248220567782, + "grad_norm": 0.05893344804644585, + "learning_rate": 3.7713632381222974e-06, + "loss": 0.0014, + "step": 75920 + }, + { + "epoch": 1.2424118465188578, + "grad_norm": 0.05910464748740196, + "learning_rate": 3.769979163489472e-06, + "loss": 0.0011, + "step": 75930 + }, + { + "epoch": 1.2425754724699338, + "grad_norm": 0.027881009504199028, + "learning_rate": 3.7685951891729616e-06, + "loss": 0.0014, + "step": 75940 + }, + { + "epoch": 1.2427390984210096, + "grad_norm": 0.02975933998823166, + "learning_rate": 3.7672113152856383e-06, + "loss": 0.0054, + "step": 75950 + }, + { + "epoch": 1.2429027243720854, + "grad_norm": 0.026923248544335365, + "learning_rate": 3.7658275419403644e-06, + "loss": 0.001, + "step": 75960 + }, + { + "epoch": 1.2430663503231614, + "grad_norm": 0.04315062239766121, + "learning_rate": 3.7644438692499985e-06, + "loss": 0.001, + "step": 75970 + }, + { + "epoch": 1.2432299762742371, + "grad_norm": 0.044196173548698425, + "learning_rate": 3.763060297327386e-06, + "loss": 0.0022, + "step": 75980 + }, + { + "epoch": 1.243393602225313, + "grad_norm": 0.1643618941307068, + "learning_rate": 3.7616768262853674e-06, + "loss": 0.0017, + "step": 75990 + }, + { + "epoch": 1.2435572281763887, + "grad_norm": 0.2178935706615448, + "learning_rate": 3.7602934562367743e-06, + "loss": 0.0011, + "step": 76000 + }, + { + "epoch": 1.2437208541274647, + "grad_norm": 0.00882326066493988, + "learning_rate": 3.758910187294429e-06, + "loss": 0.0011, + "step": 76010 + }, + { + "epoch": 1.2438844800785405, + "grad_norm": 0.0069951810874044895, + "learning_rate": 3.7575270195711453e-06, + "loss": 0.0013, + "step": 76020 + }, + { + "epoch": 1.2440481060296162, + "grad_norm": 0.014804904349148273, + "learning_rate": 3.7561439531797315e-06, + "loss": 0.0011, + "step": 76030 + }, + { + "epoch": 1.244211731980692, + "grad_norm": 0.010005536489188671, + "learning_rate": 3.754760988232985e-06, + "loss": 0.0015, + "step": 76040 + }, + { + "epoch": 1.244375357931768, + "grad_norm": 0.03244250267744064, + "learning_rate": 3.753378124843695e-06, + "loss": 0.0018, + "step": 76050 + }, + { + "epoch": 1.2445389838828438, + "grad_norm": 0.04032203182578087, + "learning_rate": 3.7519953631246443e-06, + "loss": 0.0012, + "step": 76060 + }, + { + "epoch": 1.2447026098339196, + "grad_norm": 0.02284880168735981, + "learning_rate": 3.750612703188605e-06, + "loss": 0.0009, + "step": 76070 + }, + { + "epoch": 1.2448662357849956, + "grad_norm": 0.0598733015358448, + "learning_rate": 3.749230145148344e-06, + "loss": 0.0013, + "step": 76080 + }, + { + "epoch": 1.2450298617360713, + "grad_norm": 0.10431163012981415, + "learning_rate": 3.747847689116617e-06, + "loss": 0.0011, + "step": 76090 + }, + { + "epoch": 1.245193487687147, + "grad_norm": 0.0969981849193573, + "learning_rate": 3.7464653352061714e-06, + "loss": 0.0013, + "step": 76100 + }, + { + "epoch": 1.245357113638223, + "grad_norm": 0.07172807306051254, + "learning_rate": 3.745083083529748e-06, + "loss": 0.0019, + "step": 76110 + }, + { + "epoch": 1.2455207395892989, + "grad_norm": 0.0021600837353616953, + "learning_rate": 3.743700934200079e-06, + "loss": 0.001, + "step": 76120 + }, + { + "epoch": 1.2456843655403746, + "grad_norm": 0.06893586367368698, + "learning_rate": 3.7423188873298877e-06, + "loss": 0.0009, + "step": 76130 + }, + { + "epoch": 1.2458479914914506, + "grad_norm": 0.03157917037606239, + "learning_rate": 3.7409369430318886e-06, + "loss": 0.0008, + "step": 76140 + }, + { + "epoch": 1.2460116174425264, + "grad_norm": 0.046790771186351776, + "learning_rate": 3.739555101418789e-06, + "loss": 0.0009, + "step": 76150 + }, + { + "epoch": 1.2461752433936022, + "grad_norm": 0.03875652700662613, + "learning_rate": 3.7381733626032867e-06, + "loss": 0.0006, + "step": 76160 + }, + { + "epoch": 1.2463388693446782, + "grad_norm": 0.076677106320858, + "learning_rate": 3.736791726698072e-06, + "loss": 0.0015, + "step": 76170 + }, + { + "epoch": 1.246502495295754, + "grad_norm": 0.05655670166015625, + "learning_rate": 3.735410193815826e-06, + "loss": 0.0018, + "step": 76180 + }, + { + "epoch": 1.2466661212468297, + "grad_norm": 0.08222516626119614, + "learning_rate": 3.7340287640692224e-06, + "loss": 0.0024, + "step": 76190 + }, + { + "epoch": 1.2468297471979055, + "grad_norm": 0.09467457234859467, + "learning_rate": 3.7326474375709253e-06, + "loss": 0.0015, + "step": 76200 + }, + { + "epoch": 1.2469933731489815, + "grad_norm": 0.06342025846242905, + "learning_rate": 3.7312662144335913e-06, + "loss": 0.0011, + "step": 76210 + }, + { + "epoch": 1.2471569991000573, + "grad_norm": 0.06348420679569244, + "learning_rate": 3.7298850947698682e-06, + "loss": 0.0009, + "step": 76220 + }, + { + "epoch": 1.247320625051133, + "grad_norm": 0.08698870986700058, + "learning_rate": 3.728504078692396e-06, + "loss": 0.001, + "step": 76230 + }, + { + "epoch": 1.2474842510022088, + "grad_norm": 0.029028309509158134, + "learning_rate": 3.727123166313805e-06, + "loss": 0.0006, + "step": 76240 + }, + { + "epoch": 1.2476478769532848, + "grad_norm": 0.15747520327568054, + "learning_rate": 3.725742357746718e-06, + "loss": 0.0022, + "step": 76250 + }, + { + "epoch": 1.2478115029043606, + "grad_norm": 0.09941805154085159, + "learning_rate": 3.724361653103748e-06, + "loss": 0.0013, + "step": 76260 + }, + { + "epoch": 1.2479751288554364, + "grad_norm": 0.006171914748847485, + "learning_rate": 3.722981052497502e-06, + "loss": 0.0021, + "step": 76270 + }, + { + "epoch": 1.2481387548065124, + "grad_norm": 0.05656810477375984, + "learning_rate": 3.7216005560405767e-06, + "loss": 0.0009, + "step": 76280 + }, + { + "epoch": 1.2483023807575881, + "grad_norm": 0.030218174681067467, + "learning_rate": 3.7202201638455603e-06, + "loss": 0.0013, + "step": 76290 + }, + { + "epoch": 1.248466006708664, + "grad_norm": 0.028989270329475403, + "learning_rate": 3.7188398760250333e-06, + "loss": 0.0016, + "step": 76300 + }, + { + "epoch": 1.24862963265974, + "grad_norm": 0.06538151949644089, + "learning_rate": 3.7174596926915676e-06, + "loss": 0.0018, + "step": 76310 + }, + { + "epoch": 1.2487932586108157, + "grad_norm": 0.029546581208705902, + "learning_rate": 3.716079613957725e-06, + "loss": 0.0021, + "step": 76320 + }, + { + "epoch": 1.2489568845618915, + "grad_norm": 0.031979162245988846, + "learning_rate": 3.7146996399360615e-06, + "loss": 0.0013, + "step": 76330 + }, + { + "epoch": 1.2491205105129675, + "grad_norm": 0.049099151045084, + "learning_rate": 3.713319770739122e-06, + "loss": 0.0013, + "step": 76340 + }, + { + "epoch": 1.2492841364640432, + "grad_norm": 0.022830074653029442, + "learning_rate": 3.711940006479444e-06, + "loss": 0.0013, + "step": 76350 + }, + { + "epoch": 1.249447762415119, + "grad_norm": 0.03473230451345444, + "learning_rate": 3.710560347269557e-06, + "loss": 0.0011, + "step": 76360 + }, + { + "epoch": 1.2496113883661948, + "grad_norm": 0.022678585723042488, + "learning_rate": 3.7091807932219816e-06, + "loss": 0.0013, + "step": 76370 + }, + { + "epoch": 1.2497750143172708, + "grad_norm": 0.12722554802894592, + "learning_rate": 3.707801344449228e-06, + "loss": 0.0029, + "step": 76380 + }, + { + "epoch": 1.2499386402683466, + "grad_norm": 0.0551285557448864, + "learning_rate": 3.706422001063801e-06, + "loss": 0.001, + "step": 76390 + }, + { + "epoch": 1.2501022662194223, + "grad_norm": 0.10762631893157959, + "learning_rate": 3.7050427631781936e-06, + "loss": 0.0011, + "step": 76400 + }, + { + "epoch": 1.250265892170498, + "grad_norm": 0.12900607287883759, + "learning_rate": 3.7036636309048944e-06, + "loss": 0.0009, + "step": 76410 + }, + { + "epoch": 1.250429518121574, + "grad_norm": 0.11554361134767532, + "learning_rate": 3.7022846043563774e-06, + "loss": 0.0007, + "step": 76420 + }, + { + "epoch": 1.2505931440726499, + "grad_norm": 0.038834348320961, + "learning_rate": 3.7009056836451142e-06, + "loss": 0.0008, + "step": 76430 + }, + { + "epoch": 1.2507567700237257, + "grad_norm": 0.051226451992988586, + "learning_rate": 3.6995268688835625e-06, + "loss": 0.0019, + "step": 76440 + }, + { + "epoch": 1.2509203959748016, + "grad_norm": 0.030855748802423477, + "learning_rate": 3.6981481601841746e-06, + "loss": 0.0011, + "step": 76450 + }, + { + "epoch": 1.2510840219258774, + "grad_norm": 0.045061372220516205, + "learning_rate": 3.696769557659394e-06, + "loss": 0.0006, + "step": 76460 + }, + { + "epoch": 1.2512476478769532, + "grad_norm": 0.04351581633090973, + "learning_rate": 3.6953910614216537e-06, + "loss": 0.0008, + "step": 76470 + }, + { + "epoch": 1.2514112738280292, + "grad_norm": 0.10761336982250214, + "learning_rate": 3.69401267158338e-06, + "loss": 0.0013, + "step": 76480 + }, + { + "epoch": 1.251574899779105, + "grad_norm": 0.048454973846673965, + "learning_rate": 3.692634388256989e-06, + "loss": 0.0013, + "step": 76490 + }, + { + "epoch": 1.2517385257301807, + "grad_norm": 0.026084132492542267, + "learning_rate": 3.6912562115548906e-06, + "loss": 0.0008, + "step": 76500 + }, + { + "epoch": 1.2519021516812567, + "grad_norm": 0.0019574235193431377, + "learning_rate": 3.689878141589482e-06, + "loss": 0.0006, + "step": 76510 + }, + { + "epoch": 1.2520657776323325, + "grad_norm": 0.054881978780031204, + "learning_rate": 3.6885001784731547e-06, + "loss": 0.001, + "step": 76520 + }, + { + "epoch": 1.2522294035834083, + "grad_norm": 0.15914474427700043, + "learning_rate": 3.6871223223182905e-06, + "loss": 0.0016, + "step": 76530 + }, + { + "epoch": 1.2523930295344843, + "grad_norm": 0.07124340534210205, + "learning_rate": 3.685744573237263e-06, + "loss": 0.0015, + "step": 76540 + }, + { + "epoch": 1.25255665548556, + "grad_norm": 0.08131648600101471, + "learning_rate": 3.6843669313424375e-06, + "loss": 0.0011, + "step": 76550 + }, + { + "epoch": 1.2527202814366358, + "grad_norm": 0.08639384061098099, + "learning_rate": 3.6829893967461685e-06, + "loss": 0.0012, + "step": 76560 + }, + { + "epoch": 1.2528839073877118, + "grad_norm": 0.05704488977789879, + "learning_rate": 3.6816119695608033e-06, + "loss": 0.0014, + "step": 76570 + }, + { + "epoch": 1.2530475333387876, + "grad_norm": 0.06507814675569534, + "learning_rate": 3.6802346498986797e-06, + "loss": 0.0009, + "step": 76580 + }, + { + "epoch": 1.2532111592898634, + "grad_norm": 0.12731674313545227, + "learning_rate": 3.6788574378721285e-06, + "loss": 0.0011, + "step": 76590 + }, + { + "epoch": 1.2533747852409391, + "grad_norm": 0.14688092470169067, + "learning_rate": 3.6774803335934695e-06, + "loss": 0.0016, + "step": 76600 + }, + { + "epoch": 1.253538411192015, + "grad_norm": 0.10999850183725357, + "learning_rate": 3.6761033371750156e-06, + "loss": 0.0013, + "step": 76610 + }, + { + "epoch": 1.253702037143091, + "grad_norm": 0.08178862929344177, + "learning_rate": 3.674726448729068e-06, + "loss": 0.0018, + "step": 76620 + }, + { + "epoch": 1.2538656630941667, + "grad_norm": 0.09021533280611038, + "learning_rate": 3.673349668367923e-06, + "loss": 0.0013, + "step": 76630 + }, + { + "epoch": 1.2540292890452425, + "grad_norm": 0.11633835732936859, + "learning_rate": 3.671972996203865e-06, + "loss": 0.0019, + "step": 76640 + }, + { + "epoch": 1.2541929149963185, + "grad_norm": 0.06238851323723793, + "learning_rate": 3.670596432349172e-06, + "loss": 0.0014, + "step": 76650 + }, + { + "epoch": 1.2543565409473942, + "grad_norm": 0.008073466829955578, + "learning_rate": 3.66921997691611e-06, + "loss": 0.0012, + "step": 76660 + }, + { + "epoch": 1.25452016689847, + "grad_norm": 0.06268932670354843, + "learning_rate": 3.6678436300169397e-06, + "loss": 0.0014, + "step": 76670 + }, + { + "epoch": 1.254683792849546, + "grad_norm": 0.1938357949256897, + "learning_rate": 3.6664673917639106e-06, + "loss": 0.0019, + "step": 76680 + }, + { + "epoch": 1.2548474188006218, + "grad_norm": 0.04001988098025322, + "learning_rate": 3.665091262269263e-06, + "loss": 0.001, + "step": 76690 + }, + { + "epoch": 1.2550110447516976, + "grad_norm": 0.12498009949922562, + "learning_rate": 3.6637152416452306e-06, + "loss": 0.0008, + "step": 76700 + }, + { + "epoch": 1.2551746707027736, + "grad_norm": 0.028389887884259224, + "learning_rate": 3.662339330004037e-06, + "loss": 0.001, + "step": 76710 + }, + { + "epoch": 1.2553382966538493, + "grad_norm": 0.03279648721218109, + "learning_rate": 3.6609635274578964e-06, + "loss": 0.0026, + "step": 76720 + }, + { + "epoch": 1.255501922604925, + "grad_norm": 0.06896865367889404, + "learning_rate": 3.659587834119016e-06, + "loss": 0.0015, + "step": 76730 + }, + { + "epoch": 1.255665548556001, + "grad_norm": 1.2143607139587402, + "learning_rate": 3.6582122500995896e-06, + "loss": 0.0013, + "step": 76740 + }, + { + "epoch": 1.2558291745070769, + "grad_norm": 0.04025647044181824, + "learning_rate": 3.6568367755118078e-06, + "loss": 0.0018, + "step": 76750 + }, + { + "epoch": 1.2559928004581526, + "grad_norm": 0.0472763255238533, + "learning_rate": 3.655461410467849e-06, + "loss": 0.0014, + "step": 76760 + }, + { + "epoch": 1.2561564264092284, + "grad_norm": 0.03411184623837471, + "learning_rate": 3.6540861550798817e-06, + "loss": 0.0019, + "step": 76770 + }, + { + "epoch": 1.2563200523603044, + "grad_norm": 0.0291668139398098, + "learning_rate": 3.652711009460069e-06, + "loss": 0.0008, + "step": 76780 + }, + { + "epoch": 1.2564836783113802, + "grad_norm": 0.0225231796503067, + "learning_rate": 3.651335973720563e-06, + "loss": 0.0013, + "step": 76790 + }, + { + "epoch": 1.256647304262456, + "grad_norm": 0.06507483869791031, + "learning_rate": 3.649961047973506e-06, + "loss": 0.0014, + "step": 76800 + }, + { + "epoch": 1.2568109302135317, + "grad_norm": 0.005823655519634485, + "learning_rate": 3.648586232331033e-06, + "loss": 0.0012, + "step": 76810 + }, + { + "epoch": 1.2569745561646077, + "grad_norm": 0.03773728013038635, + "learning_rate": 3.6472115269052687e-06, + "loss": 0.0008, + "step": 76820 + }, + { + "epoch": 1.2571381821156835, + "grad_norm": 0.020107142627239227, + "learning_rate": 3.6458369318083296e-06, + "loss": 0.0008, + "step": 76830 + }, + { + "epoch": 1.2573018080667593, + "grad_norm": 0.02851218543946743, + "learning_rate": 3.644462447152323e-06, + "loss": 0.0015, + "step": 76840 + }, + { + "epoch": 1.2574654340178353, + "grad_norm": 0.024750562384724617, + "learning_rate": 3.643088073049348e-06, + "loss": 0.0017, + "step": 76850 + }, + { + "epoch": 1.257629059968911, + "grad_norm": 0.06721001863479614, + "learning_rate": 3.6417138096114924e-06, + "loss": 0.0028, + "step": 76860 + }, + { + "epoch": 1.2577926859199868, + "grad_norm": 0.030058342963457108, + "learning_rate": 3.6403396569508377e-06, + "loss": 0.0011, + "step": 76870 + }, + { + "epoch": 1.2579563118710628, + "grad_norm": 0.049991823732852936, + "learning_rate": 3.6389656151794538e-06, + "loss": 0.0015, + "step": 76880 + }, + { + "epoch": 1.2581199378221386, + "grad_norm": 0.03598054498434067, + "learning_rate": 3.637591684409404e-06, + "loss": 0.0016, + "step": 76890 + }, + { + "epoch": 1.2582835637732144, + "grad_norm": 0.08404979854822159, + "learning_rate": 3.6362178647527413e-06, + "loss": 0.0018, + "step": 76900 + }, + { + "epoch": 1.2584471897242904, + "grad_norm": 0.09314731508493423, + "learning_rate": 3.63484415632151e-06, + "loss": 0.0012, + "step": 76910 + }, + { + "epoch": 1.2586108156753661, + "grad_norm": 0.017280584201216698, + "learning_rate": 3.6334705592277443e-06, + "loss": 0.0007, + "step": 76920 + }, + { + "epoch": 1.258774441626442, + "grad_norm": 0.059732090681791306, + "learning_rate": 3.6320970735834704e-06, + "loss": 0.0017, + "step": 76930 + }, + { + "epoch": 1.258938067577518, + "grad_norm": 0.049863774329423904, + "learning_rate": 3.6307236995007052e-06, + "loss": 0.0015, + "step": 76940 + }, + { + "epoch": 1.2591016935285937, + "grad_norm": 0.052244868129491806, + "learning_rate": 3.6293504370914563e-06, + "loss": 0.0018, + "step": 76950 + }, + { + "epoch": 1.2592653194796695, + "grad_norm": 0.026911437511444092, + "learning_rate": 3.627977286467722e-06, + "loss": 0.0008, + "step": 76960 + }, + { + "epoch": 1.2594289454307452, + "grad_norm": 0.06201561912894249, + "learning_rate": 3.626604247741493e-06, + "loss": 0.0013, + "step": 76970 + }, + { + "epoch": 1.2595925713818212, + "grad_norm": 0.12437136471271515, + "learning_rate": 3.6252313210247482e-06, + "loss": 0.001, + "step": 76980 + }, + { + "epoch": 1.259756197332897, + "grad_norm": 0.03702268376946449, + "learning_rate": 3.6238585064294603e-06, + "loss": 0.0032, + "step": 76990 + }, + { + "epoch": 1.2599198232839728, + "grad_norm": 0.07538307458162308, + "learning_rate": 3.6224858040675896e-06, + "loss": 0.0022, + "step": 77000 + }, + { + "epoch": 1.2600834492350486, + "grad_norm": 0.04808036610484123, + "learning_rate": 3.6211132140510907e-06, + "loss": 0.0009, + "step": 77010 + }, + { + "epoch": 1.2602470751861246, + "grad_norm": 0.133243590593338, + "learning_rate": 3.619740736491907e-06, + "loss": 0.0014, + "step": 77020 + }, + { + "epoch": 1.2604107011372003, + "grad_norm": 0.08440541476011276, + "learning_rate": 3.6183683715019717e-06, + "loss": 0.0015, + "step": 77030 + }, + { + "epoch": 1.260574327088276, + "grad_norm": 0.026749959215521812, + "learning_rate": 3.616996119193212e-06, + "loss": 0.0012, + "step": 77040 + }, + { + "epoch": 1.260737953039352, + "grad_norm": 0.042579926550388336, + "learning_rate": 3.615623979677543e-06, + "loss": 0.0016, + "step": 77050 + }, + { + "epoch": 1.2609015789904279, + "grad_norm": 0.03619217872619629, + "learning_rate": 3.614251953066874e-06, + "loss": 0.0018, + "step": 77060 + }, + { + "epoch": 1.2610652049415036, + "grad_norm": 0.05326395109295845, + "learning_rate": 3.612880039473099e-06, + "loss": 0.0011, + "step": 77070 + }, + { + "epoch": 1.2612288308925796, + "grad_norm": 0.03684147074818611, + "learning_rate": 3.611508239008109e-06, + "loss": 0.0012, + "step": 77080 + }, + { + "epoch": 1.2613924568436554, + "grad_norm": 0.0037807838525623083, + "learning_rate": 3.6101365517837826e-06, + "loss": 0.0009, + "step": 77090 + }, + { + "epoch": 1.2615560827947312, + "grad_norm": 0.018600158393383026, + "learning_rate": 3.6087649779119915e-06, + "loss": 0.0007, + "step": 77100 + }, + { + "epoch": 1.2617197087458072, + "grad_norm": 0.13016009330749512, + "learning_rate": 3.6073935175045945e-06, + "loss": 0.002, + "step": 77110 + }, + { + "epoch": 1.261883334696883, + "grad_norm": 0.09645784646272659, + "learning_rate": 3.6060221706734446e-06, + "loss": 0.0014, + "step": 77120 + }, + { + "epoch": 1.2620469606479587, + "grad_norm": 0.03739799186587334, + "learning_rate": 3.604650937530383e-06, + "loss": 0.0017, + "step": 77130 + }, + { + "epoch": 1.2622105865990347, + "grad_norm": 0.03929201886057854, + "learning_rate": 3.603279818187244e-06, + "loss": 0.0018, + "step": 77140 + }, + { + "epoch": 1.2623742125501105, + "grad_norm": 0.034221045672893524, + "learning_rate": 3.601908812755851e-06, + "loss": 0.0013, + "step": 77150 + }, + { + "epoch": 1.2625378385011863, + "grad_norm": 0.16852647066116333, + "learning_rate": 3.6005379213480186e-06, + "loss": 0.0015, + "step": 77160 + }, + { + "epoch": 1.262701464452262, + "grad_norm": 0.027231376618146896, + "learning_rate": 3.5991671440755516e-06, + "loss": 0.0009, + "step": 77170 + }, + { + "epoch": 1.2628650904033378, + "grad_norm": 0.03230728209018707, + "learning_rate": 3.5977964810502463e-06, + "loss": 0.0014, + "step": 77180 + }, + { + "epoch": 1.2630287163544138, + "grad_norm": 0.04916096478700638, + "learning_rate": 3.5964259323838898e-06, + "loss": 0.0008, + "step": 77190 + }, + { + "epoch": 1.2631923423054896, + "grad_norm": 0.08450891822576523, + "learning_rate": 3.5950554981882584e-06, + "loss": 0.0029, + "step": 77200 + }, + { + "epoch": 1.2633559682565654, + "grad_norm": 0.10119549930095673, + "learning_rate": 3.5936851785751204e-06, + "loss": 0.0013, + "step": 77210 + }, + { + "epoch": 1.2635195942076414, + "grad_norm": 0.03982265666127205, + "learning_rate": 3.5923149736562353e-06, + "loss": 0.0015, + "step": 77220 + }, + { + "epoch": 1.2636832201587171, + "grad_norm": 0.05707027018070221, + "learning_rate": 3.5909448835433514e-06, + "loss": 0.0023, + "step": 77230 + }, + { + "epoch": 1.263846846109793, + "grad_norm": 0.028453057631850243, + "learning_rate": 3.589574908348208e-06, + "loss": 0.0015, + "step": 77240 + }, + { + "epoch": 1.264010472060869, + "grad_norm": 0.05292585864663124, + "learning_rate": 3.588205048182537e-06, + "loss": 0.0008, + "step": 77250 + }, + { + "epoch": 1.2641740980119447, + "grad_norm": 0.15501683950424194, + "learning_rate": 3.586835303158059e-06, + "loss": 0.0024, + "step": 77260 + }, + { + "epoch": 1.2643377239630205, + "grad_norm": 0.05271110683679581, + "learning_rate": 3.5854656733864856e-06, + "loss": 0.0015, + "step": 77270 + }, + { + "epoch": 1.2645013499140965, + "grad_norm": 0.02560976892709732, + "learning_rate": 3.5840961589795186e-06, + "loss": 0.0006, + "step": 77280 + }, + { + "epoch": 1.2646649758651722, + "grad_norm": 0.028053337708115578, + "learning_rate": 3.5827267600488523e-06, + "loss": 0.0008, + "step": 77290 + }, + { + "epoch": 1.264828601816248, + "grad_norm": 0.17916865646839142, + "learning_rate": 3.5813574767061693e-06, + "loss": 0.0059, + "step": 77300 + }, + { + "epoch": 1.264992227767324, + "grad_norm": 0.04611418768763542, + "learning_rate": 3.5799883090631436e-06, + "loss": 0.001, + "step": 77310 + }, + { + "epoch": 1.2651558537183998, + "grad_norm": 0.005622072611004114, + "learning_rate": 3.5786192572314405e-06, + "loss": 0.0012, + "step": 77320 + }, + { + "epoch": 1.2653194796694756, + "grad_norm": 0.11270934343338013, + "learning_rate": 3.577250321322715e-06, + "loss": 0.001, + "step": 77330 + }, + { + "epoch": 1.2654831056205516, + "grad_norm": 0.08596876263618469, + "learning_rate": 3.575881501448612e-06, + "loss": 0.0009, + "step": 77340 + }, + { + "epoch": 1.2656467315716273, + "grad_norm": 0.03830437362194061, + "learning_rate": 3.5745127977207687e-06, + "loss": 0.0009, + "step": 77350 + }, + { + "epoch": 1.265810357522703, + "grad_norm": 0.05910711735486984, + "learning_rate": 3.5731442102508118e-06, + "loss": 0.0012, + "step": 77360 + }, + { + "epoch": 1.2659739834737789, + "grad_norm": 0.047395166009664536, + "learning_rate": 3.571775739150359e-06, + "loss": 0.0019, + "step": 77370 + }, + { + "epoch": 1.2661376094248546, + "grad_norm": 0.013198910281062126, + "learning_rate": 3.5704073845310183e-06, + "loss": 0.0027, + "step": 77380 + }, + { + "epoch": 1.2663012353759306, + "grad_norm": 0.004581432323902845, + "learning_rate": 3.569039146504386e-06, + "loss": 0.0015, + "step": 77390 + }, + { + "epoch": 1.2664648613270064, + "grad_norm": 0.04696310684084892, + "learning_rate": 3.5676710251820524e-06, + "loss": 0.0013, + "step": 77400 + }, + { + "epoch": 1.2666284872780822, + "grad_norm": 0.1709856241941452, + "learning_rate": 3.5663030206755973e-06, + "loss": 0.0017, + "step": 77410 + }, + { + "epoch": 1.2667921132291582, + "grad_norm": 0.04741336405277252, + "learning_rate": 3.56493513309659e-06, + "loss": 0.0005, + "step": 77420 + }, + { + "epoch": 1.266955739180234, + "grad_norm": 0.025012029334902763, + "learning_rate": 3.5635673625565905e-06, + "loss": 0.0015, + "step": 77430 + }, + { + "epoch": 1.2671193651313097, + "grad_norm": 0.060076531022787094, + "learning_rate": 3.5621997091671493e-06, + "loss": 0.0012, + "step": 77440 + }, + { + "epoch": 1.2672829910823857, + "grad_norm": 0.039286285638809204, + "learning_rate": 3.560832173039809e-06, + "loss": 0.0015, + "step": 77450 + }, + { + "epoch": 1.2674466170334615, + "grad_norm": 0.038472436368465424, + "learning_rate": 3.5594647542860996e-06, + "loss": 0.0009, + "step": 77460 + }, + { + "epoch": 1.2676102429845373, + "grad_norm": 0.1349521428346634, + "learning_rate": 3.558097453017543e-06, + "loss": 0.0008, + "step": 77470 + }, + { + "epoch": 1.2677738689356133, + "grad_norm": 0.05331789329648018, + "learning_rate": 3.5567302693456536e-06, + "loss": 0.0031, + "step": 77480 + }, + { + "epoch": 1.267937494886689, + "grad_norm": 0.05710127204656601, + "learning_rate": 3.5553632033819326e-06, + "loss": 0.0017, + "step": 77490 + }, + { + "epoch": 1.2681011208377648, + "grad_norm": 0.04289095476269722, + "learning_rate": 3.5539962552378734e-06, + "loss": 0.0023, + "step": 77500 + }, + { + "epoch": 1.2682647467888408, + "grad_norm": 0.06787627935409546, + "learning_rate": 3.55262942502496e-06, + "loss": 0.0012, + "step": 77510 + }, + { + "epoch": 1.2684283727399166, + "grad_norm": 0.03199947625398636, + "learning_rate": 3.551262712854666e-06, + "loss": 0.0006, + "step": 77520 + }, + { + "epoch": 1.2685919986909924, + "grad_norm": 0.09511681646108627, + "learning_rate": 3.549896118838457e-06, + "loss": 0.004, + "step": 77530 + }, + { + "epoch": 1.2687556246420684, + "grad_norm": 0.03120916709303856, + "learning_rate": 3.5485296430877857e-06, + "loss": 0.0012, + "step": 77540 + }, + { + "epoch": 1.2689192505931441, + "grad_norm": 0.061602529138326645, + "learning_rate": 3.547163285714099e-06, + "loss": 0.001, + "step": 77550 + }, + { + "epoch": 1.26908287654422, + "grad_norm": 0.12144287675619125, + "learning_rate": 3.545797046828832e-06, + "loss": 0.0023, + "step": 77560 + }, + { + "epoch": 1.2692465024952957, + "grad_norm": 0.10730836540460587, + "learning_rate": 3.5444309265434102e-06, + "loss": 0.001, + "step": 77570 + }, + { + "epoch": 1.2694101284463715, + "grad_norm": 0.014221975579857826, + "learning_rate": 3.5430649249692495e-06, + "loss": 0.0011, + "step": 77580 + }, + { + "epoch": 1.2695737543974475, + "grad_norm": 0.06656935811042786, + "learning_rate": 3.5416990422177565e-06, + "loss": 0.0008, + "step": 77590 + }, + { + "epoch": 1.2697373803485232, + "grad_norm": 0.03711162880063057, + "learning_rate": 3.5403332784003285e-06, + "loss": 0.0014, + "step": 77600 + }, + { + "epoch": 1.269901006299599, + "grad_norm": 0.031001949682831764, + "learning_rate": 3.5389676336283516e-06, + "loss": 0.0007, + "step": 77610 + }, + { + "epoch": 1.270064632250675, + "grad_norm": 0.01777828484773636, + "learning_rate": 3.537602108013204e-06, + "loss": 0.0006, + "step": 77620 + }, + { + "epoch": 1.2702282582017508, + "grad_norm": 0.04528407007455826, + "learning_rate": 3.5362367016662526e-06, + "loss": 0.0014, + "step": 77630 + }, + { + "epoch": 1.2703918841528266, + "grad_norm": 0.03722751513123512, + "learning_rate": 3.534871414698856e-06, + "loss": 0.0006, + "step": 77640 + }, + { + "epoch": 1.2705555101039026, + "grad_norm": 0.10068801790475845, + "learning_rate": 3.5335062472223614e-06, + "loss": 0.0011, + "step": 77650 + }, + { + "epoch": 1.2707191360549783, + "grad_norm": 0.009970646351575851, + "learning_rate": 3.5321411993481086e-06, + "loss": 0.0012, + "step": 77660 + }, + { + "epoch": 1.270882762006054, + "grad_norm": 0.024275509640574455, + "learning_rate": 3.530776271187425e-06, + "loss": 0.0009, + "step": 77670 + }, + { + "epoch": 1.27104638795713, + "grad_norm": 0.004173334687948227, + "learning_rate": 3.5294114628516305e-06, + "loss": 0.0006, + "step": 77680 + }, + { + "epoch": 1.2712100139082059, + "grad_norm": 0.06451655179262161, + "learning_rate": 3.528046774452033e-06, + "loss": 0.0012, + "step": 77690 + }, + { + "epoch": 1.2713736398592816, + "grad_norm": 0.07585131376981735, + "learning_rate": 3.5266822060999343e-06, + "loss": 0.002, + "step": 77700 + }, + { + "epoch": 1.2715372658103576, + "grad_norm": 0.057093046605587006, + "learning_rate": 3.525317757906621e-06, + "loss": 0.0006, + "step": 77710 + }, + { + "epoch": 1.2717008917614334, + "grad_norm": 0.03583693876862526, + "learning_rate": 3.523953429983374e-06, + "loss": 0.0015, + "step": 77720 + }, + { + "epoch": 1.2718645177125092, + "grad_norm": 0.0703345388174057, + "learning_rate": 3.522589222441463e-06, + "loss": 0.0013, + "step": 77730 + }, + { + "epoch": 1.272028143663585, + "grad_norm": 0.030982453376054764, + "learning_rate": 3.521225135392149e-06, + "loss": 0.001, + "step": 77740 + }, + { + "epoch": 1.272191769614661, + "grad_norm": 0.0423283651471138, + "learning_rate": 3.519861168946681e-06, + "loss": 0.001, + "step": 77750 + }, + { + "epoch": 1.2723553955657367, + "grad_norm": 0.08313736319541931, + "learning_rate": 3.5184973232163013e-06, + "loss": 0.0024, + "step": 77760 + }, + { + "epoch": 1.2725190215168125, + "grad_norm": 0.04455333203077316, + "learning_rate": 3.517133598312238e-06, + "loss": 0.0006, + "step": 77770 + }, + { + "epoch": 1.2726826474678883, + "grad_norm": 0.07627793401479721, + "learning_rate": 3.5157699943457145e-06, + "loss": 0.0007, + "step": 77780 + }, + { + "epoch": 1.2728462734189643, + "grad_norm": 0.07501964271068573, + "learning_rate": 3.5144065114279394e-06, + "loss": 0.001, + "step": 77790 + }, + { + "epoch": 1.27300989937004, + "grad_norm": 0.12820790708065033, + "learning_rate": 3.5130431496701155e-06, + "loss": 0.0014, + "step": 77800 + }, + { + "epoch": 1.2731735253211158, + "grad_norm": 0.020283862948417664, + "learning_rate": 3.511679909183433e-06, + "loss": 0.0017, + "step": 77810 + }, + { + "epoch": 1.2733371512721918, + "grad_norm": 0.03896934166550636, + "learning_rate": 3.5103167900790737e-06, + "loss": 0.0008, + "step": 77820 + }, + { + "epoch": 1.2735007772232676, + "grad_norm": 0.01519716065376997, + "learning_rate": 3.5089537924682083e-06, + "loss": 0.0008, + "step": 77830 + }, + { + "epoch": 1.2736644031743434, + "grad_norm": 0.025452304631471634, + "learning_rate": 3.507590916461999e-06, + "loss": 0.0021, + "step": 77840 + }, + { + "epoch": 1.2738280291254194, + "grad_norm": 0.03845059499144554, + "learning_rate": 3.5062281621715965e-06, + "loss": 0.0009, + "step": 77850 + }, + { + "epoch": 1.2739916550764951, + "grad_norm": 0.029472118243575096, + "learning_rate": 3.5048655297081436e-06, + "loss": 0.001, + "step": 77860 + }, + { + "epoch": 1.274155281027571, + "grad_norm": 0.03153502568602562, + "learning_rate": 3.5035030191827703e-06, + "loss": 0.0007, + "step": 77870 + }, + { + "epoch": 1.274318906978647, + "grad_norm": 0.04291800409555435, + "learning_rate": 3.5021406307065996e-06, + "loss": 0.0007, + "step": 77880 + }, + { + "epoch": 1.2744825329297227, + "grad_norm": 0.0939502865076065, + "learning_rate": 3.5007783643907434e-06, + "loss": 0.0019, + "step": 77890 + }, + { + "epoch": 1.2746461588807985, + "grad_norm": 0.02433999441564083, + "learning_rate": 3.499416220346302e-06, + "loss": 0.0017, + "step": 77900 + }, + { + "epoch": 1.2748097848318745, + "grad_norm": 0.0496753565967083, + "learning_rate": 3.4980541986843684e-06, + "loss": 0.001, + "step": 77910 + }, + { + "epoch": 1.2749734107829502, + "grad_norm": 0.0019044345244765282, + "learning_rate": 3.496692299516025e-06, + "loss": 0.0008, + "step": 77920 + }, + { + "epoch": 1.275137036734026, + "grad_norm": 0.02745947055518627, + "learning_rate": 3.4953305229523426e-06, + "loss": 0.0008, + "step": 77930 + }, + { + "epoch": 1.2753006626851018, + "grad_norm": 0.030799519270658493, + "learning_rate": 3.4939688691043836e-06, + "loss": 0.0011, + "step": 77940 + }, + { + "epoch": 1.2754642886361778, + "grad_norm": 0.13371559977531433, + "learning_rate": 3.4926073380831994e-06, + "loss": 0.0016, + "step": 77950 + }, + { + "epoch": 1.2756279145872536, + "grad_norm": 0.02949315495789051, + "learning_rate": 3.4912459299998324e-06, + "loss": 0.0016, + "step": 77960 + }, + { + "epoch": 1.2757915405383293, + "grad_norm": 0.03466854616999626, + "learning_rate": 3.489884644965314e-06, + "loss": 0.0017, + "step": 77970 + }, + { + "epoch": 1.275955166489405, + "grad_norm": 0.047134753316640854, + "learning_rate": 3.4885234830906663e-06, + "loss": 0.0007, + "step": 77980 + }, + { + "epoch": 1.276118792440481, + "grad_norm": 0.08462857455015182, + "learning_rate": 3.4871624444869e-06, + "loss": 0.0018, + "step": 77990 + }, + { + "epoch": 1.2762824183915569, + "grad_norm": 0.013326887041330338, + "learning_rate": 3.485801529265018e-06, + "loss": 0.0008, + "step": 78000 + }, + { + "epoch": 1.2764460443426326, + "grad_norm": 0.028164181858301163, + "learning_rate": 3.484440737536012e-06, + "loss": 0.001, + "step": 78010 + }, + { + "epoch": 1.2766096702937086, + "grad_norm": 0.05022374913096428, + "learning_rate": 3.483080069410864e-06, + "loss": 0.0038, + "step": 78020 + }, + { + "epoch": 1.2767732962447844, + "grad_norm": 0.02585049904882908, + "learning_rate": 3.481719525000543e-06, + "loss": 0.0009, + "step": 78030 + }, + { + "epoch": 1.2769369221958602, + "grad_norm": 0.029151415452361107, + "learning_rate": 3.4803591044160122e-06, + "loss": 0.0011, + "step": 78040 + }, + { + "epoch": 1.2771005481469362, + "grad_norm": 0.04458334669470787, + "learning_rate": 3.478998807768223e-06, + "loss": 0.0013, + "step": 78050 + }, + { + "epoch": 1.277264174098012, + "grad_norm": 0.06082809716463089, + "learning_rate": 3.4776386351681157e-06, + "loss": 0.0009, + "step": 78060 + }, + { + "epoch": 1.2774278000490877, + "grad_norm": 0.09732098877429962, + "learning_rate": 3.476278586726622e-06, + "loss": 0.0013, + "step": 78070 + }, + { + "epoch": 1.2775914260001637, + "grad_norm": 0.08182229101657867, + "learning_rate": 3.4749186625546627e-06, + "loss": 0.0011, + "step": 78080 + }, + { + "epoch": 1.2777550519512395, + "grad_norm": 0.007163154426962137, + "learning_rate": 3.4735588627631476e-06, + "loss": 0.001, + "step": 78090 + }, + { + "epoch": 1.2779186779023153, + "grad_norm": 0.07749473303556442, + "learning_rate": 3.472199187462979e-06, + "loss": 0.0012, + "step": 78100 + }, + { + "epoch": 1.2780823038533913, + "grad_norm": 0.07042301446199417, + "learning_rate": 3.4708396367650466e-06, + "loss": 0.0016, + "step": 78110 + }, + { + "epoch": 1.278245929804467, + "grad_norm": 0.06348545849323273, + "learning_rate": 3.4694802107802304e-06, + "loss": 0.0024, + "step": 78120 + }, + { + "epoch": 1.2784095557555428, + "grad_norm": 0.04068297892808914, + "learning_rate": 3.468120909619402e-06, + "loss": 0.0015, + "step": 78130 + }, + { + "epoch": 1.2785731817066186, + "grad_norm": 0.009029011242091656, + "learning_rate": 3.466761733393419e-06, + "loss": 0.001, + "step": 78140 + }, + { + "epoch": 1.2787368076576944, + "grad_norm": 0.05771932378411293, + "learning_rate": 3.4654026822131335e-06, + "loss": 0.001, + "step": 78150 + }, + { + "epoch": 1.2789004336087704, + "grad_norm": 0.029097730293869972, + "learning_rate": 3.4640437561893836e-06, + "loss": 0.0012, + "step": 78160 + }, + { + "epoch": 1.2790640595598461, + "grad_norm": 0.091275155544281, + "learning_rate": 3.462684955432999e-06, + "loss": 0.0015, + "step": 78170 + }, + { + "epoch": 1.279227685510922, + "grad_norm": 0.060457684099674225, + "learning_rate": 3.4613262800548e-06, + "loss": 0.0008, + "step": 78180 + }, + { + "epoch": 1.279391311461998, + "grad_norm": 0.07914213091135025, + "learning_rate": 3.459967730165593e-06, + "loss": 0.0009, + "step": 78190 + }, + { + "epoch": 1.2795549374130737, + "grad_norm": 0.025470426306128502, + "learning_rate": 3.4586093058761793e-06, + "loss": 0.0008, + "step": 78200 + }, + { + "epoch": 1.2797185633641495, + "grad_norm": 0.010108712129294872, + "learning_rate": 3.4572510072973464e-06, + "loss": 0.0006, + "step": 78210 + }, + { + "epoch": 1.2798821893152255, + "grad_norm": 0.040509022772312164, + "learning_rate": 3.4558928345398724e-06, + "loss": 0.0008, + "step": 78220 + }, + { + "epoch": 1.2800458152663012, + "grad_norm": 0.11198396235704422, + "learning_rate": 3.454534787714525e-06, + "loss": 0.0015, + "step": 78230 + }, + { + "epoch": 1.280209441217377, + "grad_norm": 0.021078722551465034, + "learning_rate": 3.453176866932062e-06, + "loss": 0.0008, + "step": 78240 + }, + { + "epoch": 1.280373067168453, + "grad_norm": 0.03657149150967598, + "learning_rate": 3.4518190723032317e-06, + "loss": 0.0012, + "step": 78250 + }, + { + "epoch": 1.2805366931195288, + "grad_norm": 0.013931353576481342, + "learning_rate": 3.4504614039387697e-06, + "loss": 0.0007, + "step": 78260 + }, + { + "epoch": 1.2807003190706046, + "grad_norm": 0.12084774672985077, + "learning_rate": 3.4491038619494046e-06, + "loss": 0.0012, + "step": 78270 + }, + { + "epoch": 1.2808639450216805, + "grad_norm": 0.033430326730012894, + "learning_rate": 3.447746446445851e-06, + "loss": 0.0013, + "step": 78280 + }, + { + "epoch": 1.2810275709727563, + "grad_norm": 0.05016624927520752, + "learning_rate": 3.4463891575388164e-06, + "loss": 0.0019, + "step": 78290 + }, + { + "epoch": 1.281191196923832, + "grad_norm": 0.06589030474424362, + "learning_rate": 3.4450319953389967e-06, + "loss": 0.0011, + "step": 78300 + }, + { + "epoch": 1.281354822874908, + "grad_norm": 0.07793193310499191, + "learning_rate": 3.443674959957076e-06, + "loss": 0.0063, + "step": 78310 + }, + { + "epoch": 1.2815184488259839, + "grad_norm": 0.02572542615234852, + "learning_rate": 3.4423180515037313e-06, + "loss": 0.0013, + "step": 78320 + }, + { + "epoch": 1.2816820747770596, + "grad_norm": 0.08930584043264389, + "learning_rate": 3.440961270089626e-06, + "loss": 0.0022, + "step": 78330 + }, + { + "epoch": 1.2818457007281354, + "grad_norm": 0.04198780283331871, + "learning_rate": 3.439604615825416e-06, + "loss": 0.0012, + "step": 78340 + }, + { + "epoch": 1.2820093266792112, + "grad_norm": 0.04921184852719307, + "learning_rate": 3.438248088821745e-06, + "loss": 0.0016, + "step": 78350 + }, + { + "epoch": 1.2821729526302872, + "grad_norm": 0.04974973574280739, + "learning_rate": 3.4368916891892457e-06, + "loss": 0.001, + "step": 78360 + }, + { + "epoch": 1.282336578581363, + "grad_norm": 0.024867065250873566, + "learning_rate": 3.4355354170385413e-06, + "loss": 0.0016, + "step": 78370 + }, + { + "epoch": 1.2825002045324387, + "grad_norm": 0.005292154848575592, + "learning_rate": 3.434179272480246e-06, + "loss": 0.0007, + "step": 78380 + }, + { + "epoch": 1.2826638304835147, + "grad_norm": 0.01609170250594616, + "learning_rate": 3.432823255624962e-06, + "loss": 0.0008, + "step": 78390 + }, + { + "epoch": 1.2828274564345905, + "grad_norm": 0.009458620101213455, + "learning_rate": 3.4314673665832814e-06, + "loss": 0.0008, + "step": 78400 + }, + { + "epoch": 1.2829910823856663, + "grad_norm": 0.02687659114599228, + "learning_rate": 3.4301116054657856e-06, + "loss": 0.0011, + "step": 78410 + }, + { + "epoch": 1.2831547083367423, + "grad_norm": 0.09582116454839706, + "learning_rate": 3.428755972383046e-06, + "loss": 0.0007, + "step": 78420 + }, + { + "epoch": 1.283318334287818, + "grad_norm": 0.05896228179335594, + "learning_rate": 3.4274004674456233e-06, + "loss": 0.0008, + "step": 78430 + }, + { + "epoch": 1.2834819602388938, + "grad_norm": 0.013279764913022518, + "learning_rate": 3.4260450907640684e-06, + "loss": 0.001, + "step": 78440 + }, + { + "epoch": 1.2836455861899698, + "grad_norm": 0.06440384685993195, + "learning_rate": 3.4246898424489205e-06, + "loss": 0.0016, + "step": 78450 + }, + { + "epoch": 1.2838092121410456, + "grad_norm": 0.017549477517604828, + "learning_rate": 3.4233347226107093e-06, + "loss": 0.0013, + "step": 78460 + }, + { + "epoch": 1.2839728380921214, + "grad_norm": 0.027202928438782692, + "learning_rate": 3.4219797313599546e-06, + "loss": 0.0024, + "step": 78470 + }, + { + "epoch": 1.2841364640431974, + "grad_norm": 0.06153430417180061, + "learning_rate": 3.420624868807163e-06, + "loss": 0.0015, + "step": 78480 + }, + { + "epoch": 1.2843000899942731, + "grad_norm": 0.04982449859380722, + "learning_rate": 3.4192701350628344e-06, + "loss": 0.001, + "step": 78490 + }, + { + "epoch": 1.284463715945349, + "grad_norm": 0.05699589475989342, + "learning_rate": 3.4179155302374557e-06, + "loss": 0.0015, + "step": 78500 + }, + { + "epoch": 1.2846273418964247, + "grad_norm": 0.030946504324674606, + "learning_rate": 3.416561054441503e-06, + "loss": 0.0008, + "step": 78510 + }, + { + "epoch": 1.2847909678475007, + "grad_norm": 0.0365414060652256, + "learning_rate": 3.4152067077854433e-06, + "loss": 0.0026, + "step": 78520 + }, + { + "epoch": 1.2849545937985765, + "grad_norm": 0.03943750262260437, + "learning_rate": 3.413852490379733e-06, + "loss": 0.0008, + "step": 78530 + }, + { + "epoch": 1.2851182197496522, + "grad_norm": 0.03144293278455734, + "learning_rate": 3.412498402334817e-06, + "loss": 0.0016, + "step": 78540 + }, + { + "epoch": 1.285281845700728, + "grad_norm": 0.05824761092662811, + "learning_rate": 3.41114444376113e-06, + "loss": 0.0012, + "step": 78550 + }, + { + "epoch": 1.285445471651804, + "grad_norm": 0.04617036506533623, + "learning_rate": 3.409790614769096e-06, + "loss": 0.0008, + "step": 78560 + }, + { + "epoch": 1.2856090976028798, + "grad_norm": 0.09552693367004395, + "learning_rate": 3.40843691546913e-06, + "loss": 0.0009, + "step": 78570 + }, + { + "epoch": 1.2857727235539556, + "grad_norm": 0.05639279633760452, + "learning_rate": 3.407083345971634e-06, + "loss": 0.0014, + "step": 78580 + }, + { + "epoch": 1.2859363495050316, + "grad_norm": 0.020176507532596588, + "learning_rate": 3.405729906387001e-06, + "loss": 0.0011, + "step": 78590 + }, + { + "epoch": 1.2860999754561073, + "grad_norm": 0.052018191665410995, + "learning_rate": 3.4043765968256117e-06, + "loss": 0.0014, + "step": 78600 + }, + { + "epoch": 1.286263601407183, + "grad_norm": 0.019295357167720795, + "learning_rate": 3.4030234173978387e-06, + "loss": 0.0014, + "step": 78610 + }, + { + "epoch": 1.286427227358259, + "grad_norm": 0.041312236338853836, + "learning_rate": 3.4016703682140427e-06, + "loss": 0.0013, + "step": 78620 + }, + { + "epoch": 1.2865908533093349, + "grad_norm": 0.02839718572795391, + "learning_rate": 3.4003174493845737e-06, + "loss": 0.0012, + "step": 78630 + }, + { + "epoch": 1.2867544792604106, + "grad_norm": 0.04943479597568512, + "learning_rate": 3.39896466101977e-06, + "loss": 0.0016, + "step": 78640 + }, + { + "epoch": 1.2869181052114866, + "grad_norm": 0.0356144942343235, + "learning_rate": 3.397612003229962e-06, + "loss": 0.0009, + "step": 78650 + }, + { + "epoch": 1.2870817311625624, + "grad_norm": 0.03254229575395584, + "learning_rate": 3.3962594761254676e-06, + "loss": 0.0015, + "step": 78660 + }, + { + "epoch": 1.2872453571136382, + "grad_norm": 0.06893648952245712, + "learning_rate": 3.394907079816594e-06, + "loss": 0.0014, + "step": 78670 + }, + { + "epoch": 1.2874089830647142, + "grad_norm": 0.015596817247569561, + "learning_rate": 3.3935548144136377e-06, + "loss": 0.0024, + "step": 78680 + }, + { + "epoch": 1.28757260901579, + "grad_norm": 0.07231036573648453, + "learning_rate": 3.3922026800268847e-06, + "loss": 0.002, + "step": 78690 + }, + { + "epoch": 1.2877362349668657, + "grad_norm": 0.07444033771753311, + "learning_rate": 3.3908506767666116e-06, + "loss": 0.0014, + "step": 78700 + }, + { + "epoch": 1.2878998609179415, + "grad_norm": 0.06506069004535675, + "learning_rate": 3.389498804743083e-06, + "loss": 0.0017, + "step": 78710 + }, + { + "epoch": 1.2880634868690175, + "grad_norm": 0.28216955065727234, + "learning_rate": 3.3881470640665516e-06, + "loss": 0.0023, + "step": 78720 + }, + { + "epoch": 1.2882271128200933, + "grad_norm": 0.1464124172925949, + "learning_rate": 3.3867954548472614e-06, + "loss": 0.0032, + "step": 78730 + }, + { + "epoch": 1.288390738771169, + "grad_norm": 0.03206486627459526, + "learning_rate": 3.3854439771954452e-06, + "loss": 0.0015, + "step": 78740 + }, + { + "epoch": 1.2885543647222448, + "grad_norm": 0.009534304961562157, + "learning_rate": 3.384092631221325e-06, + "loss": 0.0008, + "step": 78750 + }, + { + "epoch": 1.2887179906733208, + "grad_norm": 0.03360547870397568, + "learning_rate": 3.3827414170351125e-06, + "loss": 0.0013, + "step": 78760 + }, + { + "epoch": 1.2888816166243966, + "grad_norm": 0.01117360033094883, + "learning_rate": 3.381390334747008e-06, + "loss": 0.0008, + "step": 78770 + }, + { + "epoch": 1.2890452425754724, + "grad_norm": 0.060954499989748, + "learning_rate": 3.3800393844671996e-06, + "loss": 0.0011, + "step": 78780 + }, + { + "epoch": 1.2892088685265484, + "grad_norm": 0.0011138300178572536, + "learning_rate": 3.378688566305869e-06, + "loss": 0.0012, + "step": 78790 + }, + { + "epoch": 1.2893724944776241, + "grad_norm": 0.09522448480129242, + "learning_rate": 3.3773378803731816e-06, + "loss": 0.0017, + "step": 78800 + }, + { + "epoch": 1.2895361204287, + "grad_norm": 0.07202402502298355, + "learning_rate": 3.375987326779297e-06, + "loss": 0.0009, + "step": 78810 + }, + { + "epoch": 1.289699746379776, + "grad_norm": 0.009073911234736443, + "learning_rate": 3.3746369056343597e-06, + "loss": 0.0015, + "step": 78820 + }, + { + "epoch": 1.2898633723308517, + "grad_norm": 0.022142861038446426, + "learning_rate": 3.3732866170485066e-06, + "loss": 0.0018, + "step": 78830 + }, + { + "epoch": 1.2900269982819275, + "grad_norm": 0.019666818901896477, + "learning_rate": 3.371936461131862e-06, + "loss": 0.0038, + "step": 78840 + }, + { + "epoch": 1.2901906242330035, + "grad_norm": 0.105963334441185, + "learning_rate": 3.370586437994541e-06, + "loss": 0.0009, + "step": 78850 + }, + { + "epoch": 1.2903542501840792, + "grad_norm": 0.0553797110915184, + "learning_rate": 3.369236547746646e-06, + "loss": 0.0013, + "step": 78860 + }, + { + "epoch": 1.290517876135155, + "grad_norm": 0.061036694794893265, + "learning_rate": 3.3678867904982702e-06, + "loss": 0.0013, + "step": 78870 + }, + { + "epoch": 1.290681502086231, + "grad_norm": 0.09497766196727753, + "learning_rate": 3.366537166359495e-06, + "loss": 0.0018, + "step": 78880 + }, + { + "epoch": 1.2908451280373068, + "grad_norm": 0.09542693942785263, + "learning_rate": 3.3651876754403905e-06, + "loss": 0.0013, + "step": 78890 + }, + { + "epoch": 1.2910087539883826, + "grad_norm": 0.08166547864675522, + "learning_rate": 3.363838317851017e-06, + "loss": 0.0013, + "step": 78900 + }, + { + "epoch": 1.2911723799394583, + "grad_norm": 0.009052703157067299, + "learning_rate": 3.3624890937014243e-06, + "loss": 0.0009, + "step": 78910 + }, + { + "epoch": 1.2913360058905343, + "grad_norm": 0.03741474077105522, + "learning_rate": 3.3611400031016493e-06, + "loss": 0.001, + "step": 78920 + }, + { + "epoch": 1.29149963184161, + "grad_norm": 0.12131674587726593, + "learning_rate": 3.35979104616172e-06, + "loss": 0.0013, + "step": 78930 + }, + { + "epoch": 1.2916632577926859, + "grad_norm": 0.049281589686870575, + "learning_rate": 3.358442222991652e-06, + "loss": 0.0015, + "step": 78940 + }, + { + "epoch": 1.2918268837437616, + "grad_norm": 0.11477705836296082, + "learning_rate": 3.3570935337014514e-06, + "loss": 0.0016, + "step": 78950 + }, + { + "epoch": 1.2919905096948376, + "grad_norm": 0.019706519320607185, + "learning_rate": 3.355744978401113e-06, + "loss": 0.0008, + "step": 78960 + }, + { + "epoch": 1.2921541356459134, + "grad_norm": 0.0271159578114748, + "learning_rate": 3.35439655720062e-06, + "loss": 0.0009, + "step": 78970 + }, + { + "epoch": 1.2923177615969892, + "grad_norm": 0.11278549581766129, + "learning_rate": 3.353048270209944e-06, + "loss": 0.0019, + "step": 78980 + }, + { + "epoch": 1.2924813875480652, + "grad_norm": 0.04301435127854347, + "learning_rate": 3.3517001175390483e-06, + "loss": 0.0013, + "step": 78990 + }, + { + "epoch": 1.292645013499141, + "grad_norm": 0.03350311517715454, + "learning_rate": 3.3503520992978834e-06, + "loss": 0.0008, + "step": 79000 + }, + { + "epoch": 1.2928086394502167, + "grad_norm": 0.10418645292520523, + "learning_rate": 3.3490042155963876e-06, + "loss": 0.0012, + "step": 79010 + }, + { + "epoch": 1.2929722654012927, + "grad_norm": 0.07796432822942734, + "learning_rate": 3.3476564665444923e-06, + "loss": 0.0008, + "step": 79020 + }, + { + "epoch": 1.2931358913523685, + "grad_norm": 0.1125660315155983, + "learning_rate": 3.3463088522521126e-06, + "loss": 0.0019, + "step": 79030 + }, + { + "epoch": 1.2932995173034443, + "grad_norm": 0.05894038453698158, + "learning_rate": 3.344961372829157e-06, + "loss": 0.0015, + "step": 79040 + }, + { + "epoch": 1.2934631432545203, + "grad_norm": 0.011720987036824226, + "learning_rate": 3.3436140283855212e-06, + "loss": 0.0018, + "step": 79050 + }, + { + "epoch": 1.293626769205596, + "grad_norm": 0.1287417858839035, + "learning_rate": 3.3422668190310893e-06, + "loss": 0.001, + "step": 79060 + }, + { + "epoch": 1.2937903951566718, + "grad_norm": 0.12819166481494904, + "learning_rate": 3.3409197448757356e-06, + "loss": 0.0011, + "step": 79070 + }, + { + "epoch": 1.2939540211077478, + "grad_norm": 0.1163904070854187, + "learning_rate": 3.3395728060293226e-06, + "loss": 0.0014, + "step": 79080 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.06882356107234955, + "learning_rate": 3.3382260026017027e-06, + "loss": 0.0007, + "step": 79090 + }, + { + "epoch": 1.2942812730098994, + "grad_norm": 0.04898802936077118, + "learning_rate": 3.3368793347027163e-06, + "loss": 0.0013, + "step": 79100 + }, + { + "epoch": 1.2944448989609751, + "grad_norm": 0.0616704523563385, + "learning_rate": 3.3355328024421926e-06, + "loss": 0.0009, + "step": 79110 + }, + { + "epoch": 1.294608524912051, + "grad_norm": 0.012394711375236511, + "learning_rate": 3.33418640592995e-06, + "loss": 0.0016, + "step": 79120 + }, + { + "epoch": 1.294772150863127, + "grad_norm": 0.05662516877055168, + "learning_rate": 3.3328401452757985e-06, + "loss": 0.001, + "step": 79130 + }, + { + "epoch": 1.2949357768142027, + "grad_norm": 0.10913223028182983, + "learning_rate": 3.3314940205895312e-06, + "loss": 0.0018, + "step": 79140 + }, + { + "epoch": 1.2950994027652785, + "grad_norm": 0.1399538218975067, + "learning_rate": 3.3301480319809353e-06, + "loss": 0.0023, + "step": 79150 + }, + { + "epoch": 1.2952630287163545, + "grad_norm": 0.05488545447587967, + "learning_rate": 3.3288021795597835e-06, + "loss": 0.0012, + "step": 79160 + }, + { + "epoch": 1.2954266546674302, + "grad_norm": 0.010304092429578304, + "learning_rate": 3.327456463435841e-06, + "loss": 0.0007, + "step": 79170 + }, + { + "epoch": 1.295590280618506, + "grad_norm": 0.04390418156981468, + "learning_rate": 3.3261108837188582e-06, + "loss": 0.0022, + "step": 79180 + }, + { + "epoch": 1.295753906569582, + "grad_norm": 0.05978436768054962, + "learning_rate": 3.3247654405185776e-06, + "loss": 0.0016, + "step": 79190 + }, + { + "epoch": 1.2959175325206578, + "grad_norm": 0.017376508563756943, + "learning_rate": 3.323420133944727e-06, + "loss": 0.0011, + "step": 79200 + }, + { + "epoch": 1.2960811584717336, + "grad_norm": 0.06467296183109283, + "learning_rate": 3.322074964107027e-06, + "loss": 0.0016, + "step": 79210 + }, + { + "epoch": 1.2962447844228095, + "grad_norm": 0.06061068922281265, + "learning_rate": 3.320729931115183e-06, + "loss": 0.0007, + "step": 79220 + }, + { + "epoch": 1.2964084103738853, + "grad_norm": 0.02549733594059944, + "learning_rate": 3.3193850350788935e-06, + "loss": 0.0026, + "step": 79230 + }, + { + "epoch": 1.296572036324961, + "grad_norm": 0.06459949165582657, + "learning_rate": 3.3180402761078424e-06, + "loss": 0.0011, + "step": 79240 + }, + { + "epoch": 1.296735662276037, + "grad_norm": 0.039040639996528625, + "learning_rate": 3.3166956543117035e-06, + "loss": 0.0008, + "step": 79250 + }, + { + "epoch": 1.2968992882271129, + "grad_norm": 0.12272796034812927, + "learning_rate": 3.31535116980014e-06, + "loss": 0.0026, + "step": 79260 + }, + { + "epoch": 1.2970629141781886, + "grad_norm": 0.03812261298298836, + "learning_rate": 3.3140068226828038e-06, + "loss": 0.0008, + "step": 79270 + }, + { + "epoch": 1.2972265401292646, + "grad_norm": 0.016286397352814674, + "learning_rate": 3.312662613069334e-06, + "loss": 0.0016, + "step": 79280 + }, + { + "epoch": 1.2973901660803404, + "grad_norm": 0.07783732563257217, + "learning_rate": 3.3113185410693616e-06, + "loss": 0.0007, + "step": 79290 + }, + { + "epoch": 1.2975537920314162, + "grad_norm": 0.02862142026424408, + "learning_rate": 3.309974606792503e-06, + "loss": 0.0011, + "step": 79300 + }, + { + "epoch": 1.297717417982492, + "grad_norm": 0.1427776962518692, + "learning_rate": 3.3086308103483657e-06, + "loss": 0.0016, + "step": 79310 + }, + { + "epoch": 1.2978810439335677, + "grad_norm": 0.02684645913541317, + "learning_rate": 3.3072871518465456e-06, + "loss": 0.002, + "step": 79320 + }, + { + "epoch": 1.2980446698846437, + "grad_norm": 0.0626491904258728, + "learning_rate": 3.3059436313966253e-06, + "loss": 0.001, + "step": 79330 + }, + { + "epoch": 1.2982082958357195, + "grad_norm": 0.05992826074361801, + "learning_rate": 3.3046002491081793e-06, + "loss": 0.0011, + "step": 79340 + }, + { + "epoch": 1.2983719217867953, + "grad_norm": 0.028699785470962524, + "learning_rate": 3.3032570050907685e-06, + "loss": 0.0023, + "step": 79350 + }, + { + "epoch": 1.2985355477378713, + "grad_norm": 0.04988691955804825, + "learning_rate": 3.3019138994539434e-06, + "loss": 0.001, + "step": 79360 + }, + { + "epoch": 1.298699173688947, + "grad_norm": 0.09028506278991699, + "learning_rate": 3.3005709323072436e-06, + "loss": 0.0013, + "step": 79370 + }, + { + "epoch": 1.2988627996400228, + "grad_norm": 0.1481347680091858, + "learning_rate": 3.299228103760197e-06, + "loss": 0.0017, + "step": 79380 + }, + { + "epoch": 1.2990264255910988, + "grad_norm": 0.07268628478050232, + "learning_rate": 3.2978854139223186e-06, + "loss": 0.0014, + "step": 79390 + }, + { + "epoch": 1.2991900515421746, + "grad_norm": 0.034383244812488556, + "learning_rate": 3.296542862903116e-06, + "loss": 0.0011, + "step": 79400 + }, + { + "epoch": 1.2993536774932504, + "grad_norm": 0.19446270167827606, + "learning_rate": 3.2952004508120805e-06, + "loss": 0.0025, + "step": 79410 + }, + { + "epoch": 1.2995173034443264, + "grad_norm": 0.060037847608327866, + "learning_rate": 3.2938581777586977e-06, + "loss": 0.0016, + "step": 79420 + }, + { + "epoch": 1.2996809293954021, + "grad_norm": 0.029453665018081665, + "learning_rate": 3.2925160438524354e-06, + "loss": 0.0012, + "step": 79430 + }, + { + "epoch": 1.299844555346478, + "grad_norm": 0.025623057037591934, + "learning_rate": 3.291174049202756e-06, + "loss": 0.0009, + "step": 79440 + }, + { + "epoch": 1.300008181297554, + "grad_norm": 0.03462930768728256, + "learning_rate": 3.289832193919108e-06, + "loss": 0.0006, + "step": 79450 + }, + { + "epoch": 1.3001718072486297, + "grad_norm": 0.006365691777318716, + "learning_rate": 3.2884904781109273e-06, + "loss": 0.0011, + "step": 79460 + }, + { + "epoch": 1.3003354331997055, + "grad_norm": 0.10619057714939117, + "learning_rate": 3.2871489018876402e-06, + "loss": 0.0014, + "step": 79470 + }, + { + "epoch": 1.3004990591507812, + "grad_norm": 0.11574172228574753, + "learning_rate": 3.2858074653586602e-06, + "loss": 0.001, + "step": 79480 + }, + { + "epoch": 1.3006626851018572, + "grad_norm": 0.09558505564928055, + "learning_rate": 3.284466168633392e-06, + "loss": 0.0027, + "step": 79490 + }, + { + "epoch": 1.300826311052933, + "grad_norm": 0.025329547002911568, + "learning_rate": 3.2831250118212253e-06, + "loss": 0.0016, + "step": 79500 + }, + { + "epoch": 1.3009899370040088, + "grad_norm": 0.04660091549158096, + "learning_rate": 3.2817839950315424e-06, + "loss": 0.0008, + "step": 79510 + }, + { + "epoch": 1.3011535629550846, + "grad_norm": 0.07537785172462463, + "learning_rate": 3.280443118373711e-06, + "loss": 0.0012, + "step": 79520 + }, + { + "epoch": 1.3013171889061605, + "grad_norm": 0.09352541714906693, + "learning_rate": 3.2791023819570877e-06, + "loss": 0.0015, + "step": 79530 + }, + { + "epoch": 1.3014808148572363, + "grad_norm": 0.021450523287057877, + "learning_rate": 3.2777617858910193e-06, + "loss": 0.0012, + "step": 79540 + }, + { + "epoch": 1.301644440808312, + "grad_norm": 0.08590084314346313, + "learning_rate": 3.2764213302848404e-06, + "loss": 0.0013, + "step": 79550 + }, + { + "epoch": 1.301808066759388, + "grad_norm": 0.03559460490942001, + "learning_rate": 3.275081015247873e-06, + "loss": 0.0004, + "step": 79560 + }, + { + "epoch": 1.3019716927104639, + "grad_norm": 0.03023878112435341, + "learning_rate": 3.27374084088943e-06, + "loss": 0.0074, + "step": 79570 + }, + { + "epoch": 1.3021353186615396, + "grad_norm": 0.0804395005106926, + "learning_rate": 3.2724008073188104e-06, + "loss": 0.0007, + "step": 79580 + }, + { + "epoch": 1.3022989446126156, + "grad_norm": 0.013239249587059021, + "learning_rate": 3.2710609146453034e-06, + "loss": 0.0015, + "step": 79590 + }, + { + "epoch": 1.3024625705636914, + "grad_norm": 0.13531453907489777, + "learning_rate": 3.2697211629781855e-06, + "loss": 0.0019, + "step": 79600 + }, + { + "epoch": 1.3026261965147672, + "grad_norm": 0.029673507437109947, + "learning_rate": 3.268381552426723e-06, + "loss": 0.001, + "step": 79610 + }, + { + "epoch": 1.3027898224658432, + "grad_norm": 0.028425974771380424, + "learning_rate": 3.2670420831001694e-06, + "loss": 0.0015, + "step": 79620 + }, + { + "epoch": 1.302953448416919, + "grad_norm": 0.1271386593580246, + "learning_rate": 3.2657027551077674e-06, + "loss": 0.0011, + "step": 79630 + }, + { + "epoch": 1.3031170743679947, + "grad_norm": 0.0557423010468483, + "learning_rate": 3.2643635685587483e-06, + "loss": 0.0009, + "step": 79640 + }, + { + "epoch": 1.3032807003190707, + "grad_norm": 0.013881120830774307, + "learning_rate": 3.2630245235623313e-06, + "loss": 0.0009, + "step": 79650 + }, + { + "epoch": 1.3034443262701465, + "grad_norm": 0.19356095790863037, + "learning_rate": 3.2616856202277248e-06, + "loss": 0.0017, + "step": 79660 + }, + { + "epoch": 1.3036079522212223, + "grad_norm": 0.27862197160720825, + "learning_rate": 3.260346858664124e-06, + "loss": 0.002, + "step": 79670 + }, + { + "epoch": 1.303771578172298, + "grad_norm": 0.0032116922084242105, + "learning_rate": 3.2590082389807156e-06, + "loss": 0.001, + "step": 79680 + }, + { + "epoch": 1.303935204123374, + "grad_norm": 0.04921858012676239, + "learning_rate": 3.2576697612866716e-06, + "loss": 0.0008, + "step": 79690 + }, + { + "epoch": 1.3040988300744498, + "grad_norm": 0.08543482422828674, + "learning_rate": 3.256331425691154e-06, + "loss": 0.0034, + "step": 79700 + }, + { + "epoch": 1.3042624560255256, + "grad_norm": 0.05653172358870506, + "learning_rate": 3.2549932323033117e-06, + "loss": 0.0018, + "step": 79710 + }, + { + "epoch": 1.3044260819766014, + "grad_norm": 0.024717243388295174, + "learning_rate": 3.253655181232286e-06, + "loss": 0.0007, + "step": 79720 + }, + { + "epoch": 1.3045897079276774, + "grad_norm": 0.09563183784484863, + "learning_rate": 3.252317272587201e-06, + "loss": 0.0014, + "step": 79730 + }, + { + "epoch": 1.3047533338787531, + "grad_norm": 0.03906206786632538, + "learning_rate": 3.2509795064771738e-06, + "loss": 0.0012, + "step": 79740 + }, + { + "epoch": 1.304916959829829, + "grad_norm": 0.06482381373643875, + "learning_rate": 3.249641883011307e-06, + "loss": 0.0014, + "step": 79750 + }, + { + "epoch": 1.305080585780905, + "grad_norm": 0.04216672107577324, + "learning_rate": 3.248304402298693e-06, + "loss": 0.0012, + "step": 79760 + }, + { + "epoch": 1.3052442117319807, + "grad_norm": 0.045597292482852936, + "learning_rate": 3.246967064448413e-06, + "loss": 0.0018, + "step": 79770 + }, + { + "epoch": 1.3054078376830565, + "grad_norm": 0.044614698737859726, + "learning_rate": 3.2456298695695345e-06, + "loss": 0.0017, + "step": 79780 + }, + { + "epoch": 1.3055714636341325, + "grad_norm": 0.07309107482433319, + "learning_rate": 3.2442928177711148e-06, + "loss": 0.0015, + "step": 79790 + }, + { + "epoch": 1.3057350895852082, + "grad_norm": 0.02369355596601963, + "learning_rate": 3.2429559091621985e-06, + "loss": 0.0015, + "step": 79800 + }, + { + "epoch": 1.305898715536284, + "grad_norm": 0.05398254469037056, + "learning_rate": 3.2416191438518218e-06, + "loss": 0.0008, + "step": 79810 + }, + { + "epoch": 1.30606234148736, + "grad_norm": 0.07313282787799835, + "learning_rate": 3.240282521949004e-06, + "loss": 0.0013, + "step": 79820 + }, + { + "epoch": 1.3062259674384358, + "grad_norm": 0.03335161134600639, + "learning_rate": 3.238946043562757e-06, + "loss": 0.0009, + "step": 79830 + }, + { + "epoch": 1.3063895933895115, + "grad_norm": 0.044305142015218735, + "learning_rate": 3.23760970880208e-06, + "loss": 0.0015, + "step": 79840 + }, + { + "epoch": 1.3065532193405875, + "grad_norm": 0.018014226108789444, + "learning_rate": 3.2362735177759574e-06, + "loss": 0.0006, + "step": 79850 + }, + { + "epoch": 1.3067168452916633, + "grad_norm": 0.18183022737503052, + "learning_rate": 3.234937470593367e-06, + "loss": 0.001, + "step": 79860 + }, + { + "epoch": 1.306880471242739, + "grad_norm": 0.02412210963666439, + "learning_rate": 3.2336015673632705e-06, + "loss": 0.0019, + "step": 79870 + }, + { + "epoch": 1.3070440971938149, + "grad_norm": 0.0383760929107666, + "learning_rate": 3.232265808194621e-06, + "loss": 0.0014, + "step": 79880 + }, + { + "epoch": 1.3072077231448906, + "grad_norm": 0.030720427632331848, + "learning_rate": 3.230930193196358e-06, + "loss": 0.0022, + "step": 79890 + }, + { + "epoch": 1.3073713490959666, + "grad_norm": 0.022045457735657692, + "learning_rate": 3.2295947224774086e-06, + "loss": 0.0012, + "step": 79900 + }, + { + "epoch": 1.3075349750470424, + "grad_norm": 0.0737752765417099, + "learning_rate": 3.2282593961466915e-06, + "loss": 0.001, + "step": 79910 + }, + { + "epoch": 1.3076986009981182, + "grad_norm": 0.11369406431913376, + "learning_rate": 3.2269242143131086e-06, + "loss": 0.0015, + "step": 79920 + }, + { + "epoch": 1.3078622269491942, + "grad_norm": 0.1049065962433815, + "learning_rate": 3.225589177085555e-06, + "loss": 0.0009, + "step": 79930 + }, + { + "epoch": 1.30802585290027, + "grad_norm": 0.030235277488827705, + "learning_rate": 3.2242542845729106e-06, + "loss": 0.0011, + "step": 79940 + }, + { + "epoch": 1.3081894788513457, + "grad_norm": 0.007540002930909395, + "learning_rate": 3.222919536884046e-06, + "loss": 0.0012, + "step": 79950 + }, + { + "epoch": 1.3083531048024217, + "grad_norm": 0.06803745776414871, + "learning_rate": 3.2215849341278172e-06, + "loss": 0.001, + "step": 79960 + }, + { + "epoch": 1.3085167307534975, + "grad_norm": 0.018689248710870743, + "learning_rate": 3.220250476413071e-06, + "loss": 0.0008, + "step": 79970 + }, + { + "epoch": 1.3086803567045733, + "grad_norm": 0.04351019114255905, + "learning_rate": 3.21891616384864e-06, + "loss": 0.0017, + "step": 79980 + }, + { + "epoch": 1.3088439826556493, + "grad_norm": 0.03786642476916313, + "learning_rate": 3.2175819965433473e-06, + "loss": 0.0021, + "step": 79990 + }, + { + "epoch": 1.309007608606725, + "grad_norm": 0.02765335887670517, + "learning_rate": 3.2162479746060025e-06, + "loss": 0.0014, + "step": 80000 + }, + { + "epoch": 1.3091712345578008, + "grad_norm": 0.04041989520192146, + "learning_rate": 3.2149140981454047e-06, + "loss": 0.0033, + "step": 80010 + }, + { + "epoch": 1.3093348605088768, + "grad_norm": 0.03959774971008301, + "learning_rate": 3.213580367270339e-06, + "loss": 0.0014, + "step": 80020 + }, + { + "epoch": 1.3094984864599526, + "grad_norm": 0.0767705962061882, + "learning_rate": 3.212246782089581e-06, + "loss": 0.0011, + "step": 80030 + }, + { + "epoch": 1.3096621124110284, + "grad_norm": 0.04189818352460861, + "learning_rate": 3.2109133427118926e-06, + "loss": 0.0017, + "step": 80040 + }, + { + "epoch": 1.3098257383621044, + "grad_norm": 0.04493827000260353, + "learning_rate": 3.209580049246025e-06, + "loss": 0.0012, + "step": 80050 + }, + { + "epoch": 1.3099893643131801, + "grad_norm": 0.04886334761977196, + "learning_rate": 3.2082469018007174e-06, + "loss": 0.0019, + "step": 80060 + }, + { + "epoch": 1.310152990264256, + "grad_norm": 0.0646354928612709, + "learning_rate": 3.206913900484696e-06, + "loss": 0.0012, + "step": 80070 + }, + { + "epoch": 1.3103166162153317, + "grad_norm": 0.13245901465415955, + "learning_rate": 3.2055810454066766e-06, + "loss": 0.0015, + "step": 80080 + }, + { + "epoch": 1.3104802421664075, + "grad_norm": 0.07928046584129333, + "learning_rate": 3.204248336675362e-06, + "loss": 0.0011, + "step": 80090 + }, + { + "epoch": 1.3106438681174835, + "grad_norm": 0.06279926747083664, + "learning_rate": 3.2029157743994443e-06, + "loss": 0.0009, + "step": 80100 + }, + { + "epoch": 1.3108074940685592, + "grad_norm": 0.04123770445585251, + "learning_rate": 3.2015833586876006e-06, + "loss": 0.001, + "step": 80110 + }, + { + "epoch": 1.310971120019635, + "grad_norm": 0.1268790364265442, + "learning_rate": 3.2002510896484994e-06, + "loss": 0.0011, + "step": 80120 + }, + { + "epoch": 1.311134745970711, + "grad_norm": 0.031708262860774994, + "learning_rate": 3.1989189673907963e-06, + "loss": 0.0008, + "step": 80130 + }, + { + "epoch": 1.3112983719217868, + "grad_norm": 0.07571687549352646, + "learning_rate": 3.1975869920231345e-06, + "loss": 0.0012, + "step": 80140 + }, + { + "epoch": 1.3114619978728626, + "grad_norm": 0.0022230877075344324, + "learning_rate": 3.1962551636541452e-06, + "loss": 0.0007, + "step": 80150 + }, + { + "epoch": 1.3116256238239385, + "grad_norm": 0.024761075153946877, + "learning_rate": 3.194923482392448e-06, + "loss": 0.0016, + "step": 80160 + }, + { + "epoch": 1.3117892497750143, + "grad_norm": 0.07486965507268906, + "learning_rate": 3.1935919483466497e-06, + "loss": 0.0011, + "step": 80170 + }, + { + "epoch": 1.31195287572609, + "grad_norm": 0.0521760955452919, + "learning_rate": 3.1922605616253465e-06, + "loss": 0.0009, + "step": 80180 + }, + { + "epoch": 1.312116501677166, + "grad_norm": 0.009890205226838589, + "learning_rate": 3.1909293223371217e-06, + "loss": 0.0011, + "step": 80190 + }, + { + "epoch": 1.3122801276282419, + "grad_norm": 0.027179060503840446, + "learning_rate": 3.189598230590546e-06, + "loss": 0.0017, + "step": 80200 + }, + { + "epoch": 1.3124437535793176, + "grad_norm": 0.004196762572973967, + "learning_rate": 3.1882672864941787e-06, + "loss": 0.0011, + "step": 80210 + }, + { + "epoch": 1.3126073795303936, + "grad_norm": 0.07651763409376144, + "learning_rate": 3.186936490156568e-06, + "loss": 0.0014, + "step": 80220 + }, + { + "epoch": 1.3127710054814694, + "grad_norm": 0.0073889028280973434, + "learning_rate": 3.185605841686249e-06, + "loss": 0.0015, + "step": 80230 + }, + { + "epoch": 1.3129346314325452, + "grad_norm": 0.06646845489740372, + "learning_rate": 3.1842753411917437e-06, + "loss": 0.0011, + "step": 80240 + }, + { + "epoch": 1.313098257383621, + "grad_norm": 0.05607403442263603, + "learning_rate": 3.1829449887815643e-06, + "loss": 0.0017, + "step": 80250 + }, + { + "epoch": 1.313261883334697, + "grad_norm": 0.0464121513068676, + "learning_rate": 3.18161478456421e-06, + "loss": 0.001, + "step": 80260 + }, + { + "epoch": 1.3134255092857727, + "grad_norm": 0.022048866376280785, + "learning_rate": 3.1802847286481665e-06, + "loss": 0.0006, + "step": 80270 + }, + { + "epoch": 1.3135891352368485, + "grad_norm": 0.05973271280527115, + "learning_rate": 3.1789548211419092e-06, + "loss": 0.0009, + "step": 80280 + }, + { + "epoch": 1.3137527611879243, + "grad_norm": 0.0234842449426651, + "learning_rate": 3.1776250621539007e-06, + "loss": 0.0007, + "step": 80290 + }, + { + "epoch": 1.3139163871390003, + "grad_norm": 0.06460271030664444, + "learning_rate": 3.1762954517925925e-06, + "loss": 0.0012, + "step": 80300 + }, + { + "epoch": 1.314080013090076, + "grad_norm": 0.12546637654304504, + "learning_rate": 3.1749659901664227e-06, + "loss": 0.0012, + "step": 80310 + }, + { + "epoch": 1.3142436390411518, + "grad_norm": 0.01690412312746048, + "learning_rate": 3.173636677383817e-06, + "loss": 0.0012, + "step": 80320 + }, + { + "epoch": 1.3144072649922278, + "grad_norm": 0.04322877153754234, + "learning_rate": 3.17230751355319e-06, + "loss": 0.0011, + "step": 80330 + }, + { + "epoch": 1.3145708909433036, + "grad_norm": 0.0616116039454937, + "learning_rate": 3.170978498782944e-06, + "loss": 0.0013, + "step": 80340 + }, + { + "epoch": 1.3147345168943794, + "grad_norm": 0.008478286676108837, + "learning_rate": 3.1696496331814684e-06, + "loss": 0.0017, + "step": 80350 + }, + { + "epoch": 1.3148981428454554, + "grad_norm": 0.04688458517193794, + "learning_rate": 3.1683209168571417e-06, + "loss": 0.0011, + "step": 80360 + }, + { + "epoch": 1.3150617687965311, + "grad_norm": 0.10005426406860352, + "learning_rate": 3.16699234991833e-06, + "loss": 0.0017, + "step": 80370 + }, + { + "epoch": 1.315225394747607, + "grad_norm": 0.028535056859254837, + "learning_rate": 3.165663932473384e-06, + "loss": 0.0018, + "step": 80380 + }, + { + "epoch": 1.315389020698683, + "grad_norm": 0.05501551553606987, + "learning_rate": 3.164335664630648e-06, + "loss": 0.0017, + "step": 80390 + }, + { + "epoch": 1.3155526466497587, + "grad_norm": 0.029538555070757866, + "learning_rate": 3.163007546498449e-06, + "loss": 0.0015, + "step": 80400 + }, + { + "epoch": 1.3157162726008345, + "grad_norm": 0.061508920043706894, + "learning_rate": 3.161679578185105e-06, + "loss": 0.0021, + "step": 80410 + }, + { + "epoch": 1.3158798985519105, + "grad_norm": 0.011630282737314701, + "learning_rate": 3.160351759798921e-06, + "loss": 0.0019, + "step": 80420 + }, + { + "epoch": 1.3160435245029862, + "grad_norm": 0.06952890753746033, + "learning_rate": 3.159024091448187e-06, + "loss": 0.0007, + "step": 80430 + }, + { + "epoch": 1.316207150454062, + "grad_norm": 0.04616463929414749, + "learning_rate": 3.157696573241186e-06, + "loss": 0.0011, + "step": 80440 + }, + { + "epoch": 1.3163707764051378, + "grad_norm": 0.021005893126130104, + "learning_rate": 3.1563692052861827e-06, + "loss": 0.0011, + "step": 80450 + }, + { + "epoch": 1.3165344023562138, + "grad_norm": 0.04831234738230705, + "learning_rate": 3.155041987691435e-06, + "loss": 0.0007, + "step": 80460 + }, + { + "epoch": 1.3166980283072895, + "grad_norm": 0.005501890555024147, + "learning_rate": 3.1537149205651864e-06, + "loss": 0.0013, + "step": 80470 + }, + { + "epoch": 1.3168616542583653, + "grad_norm": 0.06707277148962021, + "learning_rate": 3.152388004015666e-06, + "loss": 0.0011, + "step": 80480 + }, + { + "epoch": 1.317025280209441, + "grad_norm": 0.061561886221170425, + "learning_rate": 3.1510612381510954e-06, + "loss": 0.0011, + "step": 80490 + }, + { + "epoch": 1.317188906160517, + "grad_norm": 0.0730748325586319, + "learning_rate": 3.1497346230796783e-06, + "loss": 0.0016, + "step": 80500 + }, + { + "epoch": 1.3173525321115929, + "grad_norm": 0.09601116925477982, + "learning_rate": 3.1484081589096104e-06, + "loss": 0.0014, + "step": 80510 + }, + { + "epoch": 1.3175161580626686, + "grad_norm": 0.03992994502186775, + "learning_rate": 3.1470818457490736e-06, + "loss": 0.0006, + "step": 80520 + }, + { + "epoch": 1.3176797840137446, + "grad_norm": 0.08607742190361023, + "learning_rate": 3.1457556837062373e-06, + "loss": 0.001, + "step": 80530 + }, + { + "epoch": 1.3178434099648204, + "grad_norm": 0.039825137704610825, + "learning_rate": 3.1444296728892588e-06, + "loss": 0.0008, + "step": 80540 + }, + { + "epoch": 1.3180070359158962, + "grad_norm": 0.0654398500919342, + "learning_rate": 3.143103813406283e-06, + "loss": 0.0012, + "step": 80550 + }, + { + "epoch": 1.3181706618669722, + "grad_norm": 0.07151825726032257, + "learning_rate": 3.141778105365443e-06, + "loss": 0.0022, + "step": 80560 + }, + { + "epoch": 1.318334287818048, + "grad_norm": 0.031865235418081284, + "learning_rate": 3.1404525488748576e-06, + "loss": 0.0008, + "step": 80570 + }, + { + "epoch": 1.3184979137691237, + "grad_norm": 0.039745211601257324, + "learning_rate": 3.139127144042636e-06, + "loss": 0.0015, + "step": 80580 + }, + { + "epoch": 1.3186615397201997, + "grad_norm": 0.049659788608551025, + "learning_rate": 3.137801890976873e-06, + "loss": 0.0018, + "step": 80590 + }, + { + "epoch": 1.3188251656712755, + "grad_norm": 0.05730728432536125, + "learning_rate": 3.1364767897856522e-06, + "loss": 0.0008, + "step": 80600 + }, + { + "epoch": 1.3189887916223513, + "grad_norm": 0.03901224955916405, + "learning_rate": 3.1351518405770444e-06, + "loss": 0.0005, + "step": 80610 + }, + { + "epoch": 1.3191524175734273, + "grad_norm": 0.0531182698905468, + "learning_rate": 3.133827043459108e-06, + "loss": 0.0014, + "step": 80620 + }, + { + "epoch": 1.319316043524503, + "grad_norm": 0.009435409680008888, + "learning_rate": 3.1325023985398883e-06, + "loss": 0.0007, + "step": 80630 + }, + { + "epoch": 1.3194796694755788, + "grad_norm": 0.10078240185976028, + "learning_rate": 3.131177905927419e-06, + "loss": 0.0012, + "step": 80640 + }, + { + "epoch": 1.3196432954266546, + "grad_norm": 0.04385104402899742, + "learning_rate": 3.1298535657297213e-06, + "loss": 0.001, + "step": 80650 + }, + { + "epoch": 1.3198069213777306, + "grad_norm": 0.19951170682907104, + "learning_rate": 3.128529378054804e-06, + "loss": 0.0021, + "step": 80660 + }, + { + "epoch": 1.3199705473288064, + "grad_norm": 0.03995213657617569, + "learning_rate": 3.127205343010664e-06, + "loss": 0.0018, + "step": 80670 + }, + { + "epoch": 1.3201341732798821, + "grad_norm": 0.029720941558480263, + "learning_rate": 3.1258814607052833e-06, + "loss": 0.0018, + "step": 80680 + }, + { + "epoch": 1.320297799230958, + "grad_norm": 0.07610039412975311, + "learning_rate": 3.124557731246636e-06, + "loss": 0.0014, + "step": 80690 + }, + { + "epoch": 1.320461425182034, + "grad_norm": 0.018221447244286537, + "learning_rate": 3.123234154742678e-06, + "loss": 0.0016, + "step": 80700 + }, + { + "epoch": 1.3206250511331097, + "grad_norm": 0.04896111413836479, + "learning_rate": 3.1219107313013576e-06, + "loss": 0.0009, + "step": 80710 + }, + { + "epoch": 1.3207886770841855, + "grad_norm": 0.049727123230695724, + "learning_rate": 3.1205874610306075e-06, + "loss": 0.0012, + "step": 80720 + }, + { + "epoch": 1.3209523030352615, + "grad_norm": 0.006221754476428032, + "learning_rate": 3.119264344038351e-06, + "loss": 0.0023, + "step": 80730 + }, + { + "epoch": 1.3211159289863372, + "grad_norm": 0.08756019920110703, + "learning_rate": 3.1179413804324955e-06, + "loss": 0.0008, + "step": 80740 + }, + { + "epoch": 1.321279554937413, + "grad_norm": 0.16489684581756592, + "learning_rate": 3.116618570320937e-06, + "loss": 0.0019, + "step": 80750 + }, + { + "epoch": 1.321443180888489, + "grad_norm": 0.13592910766601562, + "learning_rate": 3.11529591381156e-06, + "loss": 0.003, + "step": 80760 + }, + { + "epoch": 1.3216068068395648, + "grad_norm": 0.05379120260477066, + "learning_rate": 3.1139734110122366e-06, + "loss": 0.0006, + "step": 80770 + }, + { + "epoch": 1.3217704327906405, + "grad_norm": 0.08801086992025375, + "learning_rate": 3.112651062030825e-06, + "loss": 0.0013, + "step": 80780 + }, + { + "epoch": 1.3219340587417165, + "grad_norm": 0.08735466748476028, + "learning_rate": 3.1113288669751705e-06, + "loss": 0.0009, + "step": 80790 + }, + { + "epoch": 1.3220976846927923, + "grad_norm": 0.06594033539295197, + "learning_rate": 3.110006825953109e-06, + "loss": 0.001, + "step": 80800 + }, + { + "epoch": 1.322261310643868, + "grad_norm": 0.04945126920938492, + "learning_rate": 3.1086849390724595e-06, + "loss": 0.0011, + "step": 80810 + }, + { + "epoch": 1.322424936594944, + "grad_norm": 0.032571833580732346, + "learning_rate": 3.107363206441032e-06, + "loss": 0.0015, + "step": 80820 + }, + { + "epoch": 1.3225885625460199, + "grad_norm": 0.014670531265437603, + "learning_rate": 3.1060416281666218e-06, + "loss": 0.0011, + "step": 80830 + }, + { + "epoch": 1.3227521884970956, + "grad_norm": 0.033089883625507355, + "learning_rate": 3.104720204357013e-06, + "loss": 0.0015, + "step": 80840 + }, + { + "epoch": 1.3229158144481714, + "grad_norm": 0.014068972319364548, + "learning_rate": 3.1033989351199757e-06, + "loss": 0.0011, + "step": 80850 + }, + { + "epoch": 1.3230794403992472, + "grad_norm": 0.09019894152879715, + "learning_rate": 3.1020778205632684e-06, + "loss": 0.0015, + "step": 80860 + }, + { + "epoch": 1.3232430663503232, + "grad_norm": 0.0930638238787651, + "learning_rate": 3.100756860794637e-06, + "loss": 0.0007, + "step": 80870 + }, + { + "epoch": 1.323406692301399, + "grad_norm": 0.04035981744527817, + "learning_rate": 3.099436055921814e-06, + "loss": 0.001, + "step": 80880 + }, + { + "epoch": 1.3235703182524747, + "grad_norm": 0.057395558804273605, + "learning_rate": 3.098115406052521e-06, + "loss": 0.0013, + "step": 80890 + }, + { + "epoch": 1.3237339442035507, + "grad_norm": 0.013174457475543022, + "learning_rate": 3.0967949112944634e-06, + "loss": 0.0009, + "step": 80900 + }, + { + "epoch": 1.3238975701546265, + "grad_norm": 0.03129865229129791, + "learning_rate": 3.095474571755338e-06, + "loss": 0.0011, + "step": 80910 + }, + { + "epoch": 1.3240611961057023, + "grad_norm": 0.13162872195243835, + "learning_rate": 3.094154387542827e-06, + "loss": 0.0007, + "step": 80920 + }, + { + "epoch": 1.3242248220567783, + "grad_norm": 0.1469489187002182, + "learning_rate": 3.0928343587646e-06, + "loss": 0.001, + "step": 80930 + }, + { + "epoch": 1.324388448007854, + "grad_norm": 0.019256900995969772, + "learning_rate": 3.0915144855283134e-06, + "loss": 0.0014, + "step": 80940 + }, + { + "epoch": 1.3245520739589298, + "grad_norm": 0.09133001416921616, + "learning_rate": 3.090194767941612e-06, + "loss": 0.001, + "step": 80950 + }, + { + "epoch": 1.3247156999100058, + "grad_norm": 0.08863049000501633, + "learning_rate": 3.0888752061121284e-06, + "loss": 0.0019, + "step": 80960 + }, + { + "epoch": 1.3248793258610816, + "grad_norm": 0.04386939853429794, + "learning_rate": 3.0875558001474804e-06, + "loss": 0.0006, + "step": 80970 + }, + { + "epoch": 1.3250429518121574, + "grad_norm": 0.055358823388814926, + "learning_rate": 3.0862365501552747e-06, + "loss": 0.0015, + "step": 80980 + }, + { + "epoch": 1.3252065777632334, + "grad_norm": 0.07711359113454819, + "learning_rate": 3.084917456243105e-06, + "loss": 0.0009, + "step": 80990 + }, + { + "epoch": 1.3253702037143091, + "grad_norm": 0.024266939610242844, + "learning_rate": 3.0835985185185514e-06, + "loss": 0.0011, + "step": 81000 + }, + { + "epoch": 1.325533829665385, + "grad_norm": 0.03206505626440048, + "learning_rate": 3.0822797370891835e-06, + "loss": 0.0012, + "step": 81010 + }, + { + "epoch": 1.325697455616461, + "grad_norm": 0.05650395154953003, + "learning_rate": 3.0809611120625544e-06, + "loss": 0.0026, + "step": 81020 + }, + { + "epoch": 1.3258610815675367, + "grad_norm": 0.03850245475769043, + "learning_rate": 3.079642643546208e-06, + "loss": 0.0017, + "step": 81030 + }, + { + "epoch": 1.3260247075186125, + "grad_norm": 0.12974824011325836, + "learning_rate": 3.078324331647674e-06, + "loss": 0.0031, + "step": 81040 + }, + { + "epoch": 1.3261883334696882, + "grad_norm": 0.45154961943626404, + "learning_rate": 3.07700617647447e-06, + "loss": 0.0015, + "step": 81050 + }, + { + "epoch": 1.326351959420764, + "grad_norm": 0.011137156747281551, + "learning_rate": 3.0756881781341007e-06, + "loss": 0.0007, + "step": 81060 + }, + { + "epoch": 1.32651558537184, + "grad_norm": 0.006569497287273407, + "learning_rate": 3.074370336734055e-06, + "loss": 0.001, + "step": 81070 + }, + { + "epoch": 1.3266792113229158, + "grad_norm": 0.1845833957195282, + "learning_rate": 3.0730526523818133e-06, + "loss": 0.0009, + "step": 81080 + }, + { + "epoch": 1.3268428372739915, + "grad_norm": 0.0885968804359436, + "learning_rate": 3.071735125184841e-06, + "loss": 0.0018, + "step": 81090 + }, + { + "epoch": 1.3270064632250675, + "grad_norm": 0.0576612763106823, + "learning_rate": 3.0704177552505913e-06, + "loss": 0.0024, + "step": 81100 + }, + { + "epoch": 1.3271700891761433, + "grad_norm": 0.04037720337510109, + "learning_rate": 3.069100542686505e-06, + "loss": 0.001, + "step": 81110 + }, + { + "epoch": 1.327333715127219, + "grad_norm": 0.018154360353946686, + "learning_rate": 3.0677834876000085e-06, + "loss": 0.0006, + "step": 81120 + }, + { + "epoch": 1.327497341078295, + "grad_norm": 0.13168008625507355, + "learning_rate": 3.066466590098517e-06, + "loss": 0.001, + "step": 81130 + }, + { + "epoch": 1.3276609670293709, + "grad_norm": 0.14789900183677673, + "learning_rate": 3.065149850289432e-06, + "loss": 0.0013, + "step": 81140 + }, + { + "epoch": 1.3278245929804466, + "grad_norm": 0.05491626262664795, + "learning_rate": 3.063833268280142e-06, + "loss": 0.0006, + "step": 81150 + }, + { + "epoch": 1.3279882189315226, + "grad_norm": 0.05457132309675217, + "learning_rate": 3.0625168441780235e-06, + "loss": 0.0009, + "step": 81160 + }, + { + "epoch": 1.3281518448825984, + "grad_norm": 0.01319088600575924, + "learning_rate": 3.061200578090439e-06, + "loss": 0.0016, + "step": 81170 + }, + { + "epoch": 1.3283154708336742, + "grad_norm": 0.11185579746961594, + "learning_rate": 3.0598844701247395e-06, + "loss": 0.0013, + "step": 81180 + }, + { + "epoch": 1.3284790967847502, + "grad_norm": 0.0015857135877013206, + "learning_rate": 3.0585685203882617e-06, + "loss": 0.0011, + "step": 81190 + }, + { + "epoch": 1.328642722735826, + "grad_norm": 0.052072227001190186, + "learning_rate": 3.057252728988329e-06, + "loss": 0.0009, + "step": 81200 + }, + { + "epoch": 1.3288063486869017, + "grad_norm": 0.015007534995675087, + "learning_rate": 3.0559370960322556e-06, + "loss": 0.0013, + "step": 81210 + }, + { + "epoch": 1.3289699746379775, + "grad_norm": 0.0428168885409832, + "learning_rate": 3.054621621627337e-06, + "loss": 0.001, + "step": 81220 + }, + { + "epoch": 1.3291336005890535, + "grad_norm": 0.0176068302243948, + "learning_rate": 3.053306305880861e-06, + "loss": 0.0008, + "step": 81230 + }, + { + "epoch": 1.3292972265401293, + "grad_norm": 0.03973715379834175, + "learning_rate": 3.0519911489000985e-06, + "loss": 0.0014, + "step": 81240 + }, + { + "epoch": 1.329460852491205, + "grad_norm": 0.09750473499298096, + "learning_rate": 3.050676150792311e-06, + "loss": 0.0013, + "step": 81250 + }, + { + "epoch": 1.3296244784422808, + "grad_norm": 0.006578003987669945, + "learning_rate": 3.0493613116647446e-06, + "loss": 0.0015, + "step": 81260 + }, + { + "epoch": 1.3297881043933568, + "grad_norm": 0.019946426153182983, + "learning_rate": 3.0480466316246327e-06, + "loss": 0.001, + "step": 81270 + }, + { + "epoch": 1.3299517303444326, + "grad_norm": 0.2481301724910736, + "learning_rate": 3.046732110779197e-06, + "loss": 0.0022, + "step": 81280 + }, + { + "epoch": 1.3301153562955084, + "grad_norm": 0.07669579982757568, + "learning_rate": 3.045417749235644e-06, + "loss": 0.0026, + "step": 81290 + }, + { + "epoch": 1.3302789822465844, + "grad_norm": 0.05859427899122238, + "learning_rate": 3.0441035471011692e-06, + "loss": 0.0012, + "step": 81300 + }, + { + "epoch": 1.3304426081976601, + "grad_norm": 0.05415242165327072, + "learning_rate": 3.042789504482955e-06, + "loss": 0.001, + "step": 81310 + }, + { + "epoch": 1.330606234148736, + "grad_norm": 0.04631884768605232, + "learning_rate": 3.0414756214881696e-06, + "loss": 0.0012, + "step": 81320 + }, + { + "epoch": 1.330769860099812, + "grad_norm": 0.011107529513537884, + "learning_rate": 3.040161898223969e-06, + "loss": 0.0011, + "step": 81330 + }, + { + "epoch": 1.3309334860508877, + "grad_norm": 0.015022678300738335, + "learning_rate": 3.038848334797496e-06, + "loss": 0.0009, + "step": 81340 + }, + { + "epoch": 1.3310971120019635, + "grad_norm": 0.022628238424658775, + "learning_rate": 3.037534931315881e-06, + "loss": 0.0011, + "step": 81350 + }, + { + "epoch": 1.3312607379530395, + "grad_norm": 0.0808190256357193, + "learning_rate": 3.0362216878862394e-06, + "loss": 0.0012, + "step": 81360 + }, + { + "epoch": 1.3314243639041152, + "grad_norm": 0.04676836356520653, + "learning_rate": 3.0349086046156763e-06, + "loss": 0.0014, + "step": 81370 + }, + { + "epoch": 1.331587989855191, + "grad_norm": 0.02933577261865139, + "learning_rate": 3.033595681611281e-06, + "loss": 0.0009, + "step": 81380 + }, + { + "epoch": 1.331751615806267, + "grad_norm": 0.07370854169130325, + "learning_rate": 3.032282918980133e-06, + "loss": 0.0014, + "step": 81390 + }, + { + "epoch": 1.3319152417573428, + "grad_norm": 0.06115110591053963, + "learning_rate": 3.0309703168292937e-06, + "loss": 0.0011, + "step": 81400 + }, + { + "epoch": 1.3320788677084185, + "grad_norm": 0.03955576568841934, + "learning_rate": 3.029657875265817e-06, + "loss": 0.0015, + "step": 81410 + }, + { + "epoch": 1.3322424936594943, + "grad_norm": 0.03216688707470894, + "learning_rate": 3.0283455943967395e-06, + "loss": 0.0009, + "step": 81420 + }, + { + "epoch": 1.3324061196105703, + "grad_norm": 0.06082973629236221, + "learning_rate": 3.0270334743290876e-06, + "loss": 0.0007, + "step": 81430 + }, + { + "epoch": 1.332569745561646, + "grad_norm": 0.03188791498541832, + "learning_rate": 3.0257215151698728e-06, + "loss": 0.0007, + "step": 81440 + }, + { + "epoch": 1.3327333715127219, + "grad_norm": 0.0540880486369133, + "learning_rate": 3.0244097170260943e-06, + "loss": 0.0013, + "step": 81450 + }, + { + "epoch": 1.3328969974637976, + "grad_norm": 0.01986372098326683, + "learning_rate": 3.023098080004737e-06, + "loss": 0.0008, + "step": 81460 + }, + { + "epoch": 1.3330606234148736, + "grad_norm": 0.06811761856079102, + "learning_rate": 3.0217866042127752e-06, + "loss": 0.0014, + "step": 81470 + }, + { + "epoch": 1.3332242493659494, + "grad_norm": 0.10229456424713135, + "learning_rate": 3.0204752897571676e-06, + "loss": 0.0015, + "step": 81480 + }, + { + "epoch": 1.3333878753170252, + "grad_norm": 0.01309707760810852, + "learning_rate": 3.0191641367448598e-06, + "loss": 0.0009, + "step": 81490 + }, + { + "epoch": 1.3335515012681012, + "grad_norm": 0.011736278422176838, + "learning_rate": 3.0178531452827863e-06, + "loss": 0.0014, + "step": 81500 + }, + { + "epoch": 1.333715127219177, + "grad_norm": 0.08110702037811279, + "learning_rate": 3.016542315477866e-06, + "loss": 0.0012, + "step": 81510 + }, + { + "epoch": 1.3338787531702527, + "grad_norm": 0.03364388272166252, + "learning_rate": 3.015231647437006e-06, + "loss": 0.0024, + "step": 81520 + }, + { + "epoch": 1.3340423791213287, + "grad_norm": 0.022201761603355408, + "learning_rate": 3.0139211412671e-06, + "loss": 0.0015, + "step": 81530 + }, + { + "epoch": 1.3342060050724045, + "grad_norm": 0.0016203754348680377, + "learning_rate": 3.012610797075029e-06, + "loss": 0.0013, + "step": 81540 + }, + { + "epoch": 1.3343696310234803, + "grad_norm": 0.02517550066113472, + "learning_rate": 3.0113006149676595e-06, + "loss": 0.0013, + "step": 81550 + }, + { + "epoch": 1.3345332569745563, + "grad_norm": 0.028854593634605408, + "learning_rate": 3.009990595051846e-06, + "loss": 0.0011, + "step": 81560 + }, + { + "epoch": 1.334696882925632, + "grad_norm": 0.031180864199995995, + "learning_rate": 3.0086807374344284e-06, + "loss": 0.0019, + "step": 81570 + }, + { + "epoch": 1.3348605088767078, + "grad_norm": 0.03380880504846573, + "learning_rate": 3.0073710422222347e-06, + "loss": 0.0012, + "step": 81580 + }, + { + "epoch": 1.3350241348277838, + "grad_norm": 0.0834617167711258, + "learning_rate": 3.00606150952208e-06, + "loss": 0.0008, + "step": 81590 + }, + { + "epoch": 1.3351877607788596, + "grad_norm": 0.10590919107198715, + "learning_rate": 3.0047521394407643e-06, + "loss": 0.0018, + "step": 81600 + }, + { + "epoch": 1.3353513867299354, + "grad_norm": 0.09272348880767822, + "learning_rate": 3.0034429320850754e-06, + "loss": 0.0017, + "step": 81610 + }, + { + "epoch": 1.3355150126810111, + "grad_norm": 0.032404445111751556, + "learning_rate": 3.0021338875617885e-06, + "loss": 0.0011, + "step": 81620 + }, + { + "epoch": 1.335678638632087, + "grad_norm": 0.18106071650981903, + "learning_rate": 3.0008250059776646e-06, + "loss": 0.0017, + "step": 81630 + }, + { + "epoch": 1.335842264583163, + "grad_norm": 0.023514172062277794, + "learning_rate": 2.99951628743945e-06, + "loss": 0.001, + "step": 81640 + }, + { + "epoch": 1.3360058905342387, + "grad_norm": 0.05618051812052727, + "learning_rate": 2.9982077320538828e-06, + "loss": 0.0011, + "step": 81650 + }, + { + "epoch": 1.3361695164853145, + "grad_norm": 0.04267505183815956, + "learning_rate": 2.996899339927681e-06, + "loss": 0.0025, + "step": 81660 + }, + { + "epoch": 1.3363331424363905, + "grad_norm": 0.018733562901616096, + "learning_rate": 2.995591111167554e-06, + "loss": 0.001, + "step": 81670 + }, + { + "epoch": 1.3364967683874662, + "grad_norm": 0.0518205501139164, + "learning_rate": 2.9942830458801965e-06, + "loss": 0.0007, + "step": 81680 + }, + { + "epoch": 1.336660394338542, + "grad_norm": 0.042825303971767426, + "learning_rate": 2.99297514417229e-06, + "loss": 0.0018, + "step": 81690 + }, + { + "epoch": 1.336824020289618, + "grad_norm": 0.026424622163176537, + "learning_rate": 2.991667406150502e-06, + "loss": 0.0008, + "step": 81700 + }, + { + "epoch": 1.3369876462406938, + "grad_norm": 0.07906363904476166, + "learning_rate": 2.9903598319214887e-06, + "loss": 0.0013, + "step": 81710 + }, + { + "epoch": 1.3371512721917695, + "grad_norm": 0.11585626751184464, + "learning_rate": 2.989052421591889e-06, + "loss": 0.0015, + "step": 81720 + }, + { + "epoch": 1.3373148981428455, + "grad_norm": 0.09931634366512299, + "learning_rate": 2.987745175268332e-06, + "loss": 0.0012, + "step": 81730 + }, + { + "epoch": 1.3374785240939213, + "grad_norm": 0.01809009164571762, + "learning_rate": 2.9864380930574317e-06, + "loss": 0.0023, + "step": 81740 + }, + { + "epoch": 1.337642150044997, + "grad_norm": 0.0542956218123436, + "learning_rate": 2.98513117506579e-06, + "loss": 0.0012, + "step": 81750 + }, + { + "epoch": 1.337805775996073, + "grad_norm": 0.07651602476835251, + "learning_rate": 2.9838244213999945e-06, + "loss": 0.0011, + "step": 81760 + }, + { + "epoch": 1.3379694019471489, + "grad_norm": 0.03743157535791397, + "learning_rate": 2.9825178321666204e-06, + "loss": 0.0015, + "step": 81770 + }, + { + "epoch": 1.3381330278982246, + "grad_norm": 0.006976444739848375, + "learning_rate": 2.981211407472227e-06, + "loss": 0.0009, + "step": 81780 + }, + { + "epoch": 1.3382966538493006, + "grad_norm": 0.019666900858283043, + "learning_rate": 2.979905147423363e-06, + "loss": 0.0008, + "step": 81790 + }, + { + "epoch": 1.3384602798003764, + "grad_norm": 0.010168484412133694, + "learning_rate": 2.9785990521265617e-06, + "loss": 0.0009, + "step": 81800 + }, + { + "epoch": 1.3386239057514522, + "grad_norm": 0.08481744676828384, + "learning_rate": 2.977293121688345e-06, + "loss": 0.0007, + "step": 81810 + }, + { + "epoch": 1.338787531702528, + "grad_norm": 0.07608356326818466, + "learning_rate": 2.975987356215219e-06, + "loss": 0.0021, + "step": 81820 + }, + { + "epoch": 1.3389511576536037, + "grad_norm": 0.0253651924431324, + "learning_rate": 2.974681755813678e-06, + "loss": 0.0013, + "step": 81830 + }, + { + "epoch": 1.3391147836046797, + "grad_norm": 0.05312076583504677, + "learning_rate": 2.9733763205902022e-06, + "loss": 0.0011, + "step": 81840 + }, + { + "epoch": 1.3392784095557555, + "grad_norm": 0.03739145025610924, + "learning_rate": 2.972071050651259e-06, + "loss": 0.0004, + "step": 81850 + }, + { + "epoch": 1.3394420355068313, + "grad_norm": 0.05163298174738884, + "learning_rate": 2.970765946103301e-06, + "loss": 0.0007, + "step": 81860 + }, + { + "epoch": 1.3396056614579073, + "grad_norm": 0.007754177786409855, + "learning_rate": 2.9694610070527687e-06, + "loss": 0.0013, + "step": 81870 + }, + { + "epoch": 1.339769287408983, + "grad_norm": 0.13393636047840118, + "learning_rate": 2.968156233606088e-06, + "loss": 0.001, + "step": 81880 + }, + { + "epoch": 1.3399329133600588, + "grad_norm": 0.01707519218325615, + "learning_rate": 2.9668516258696713e-06, + "loss": 0.0015, + "step": 81890 + }, + { + "epoch": 1.3400965393111348, + "grad_norm": 0.013989202678203583, + "learning_rate": 2.9655471839499195e-06, + "loss": 0.0006, + "step": 81900 + }, + { + "epoch": 1.3402601652622106, + "grad_norm": 0.03790288418531418, + "learning_rate": 2.964242907953217e-06, + "loss": 0.0007, + "step": 81910 + }, + { + "epoch": 1.3404237912132864, + "grad_norm": 0.0683358907699585, + "learning_rate": 2.962938797985937e-06, + "loss": 0.002, + "step": 81920 + }, + { + "epoch": 1.3405874171643624, + "grad_norm": 0.2492551952600479, + "learning_rate": 2.961634854154438e-06, + "loss": 0.0016, + "step": 81930 + }, + { + "epoch": 1.3407510431154381, + "grad_norm": 0.04444606229662895, + "learning_rate": 2.960331076565065e-06, + "loss": 0.0022, + "step": 81940 + }, + { + "epoch": 1.340914669066514, + "grad_norm": 0.02913873828947544, + "learning_rate": 2.9590274653241497e-06, + "loss": 0.0015, + "step": 81950 + }, + { + "epoch": 1.34107829501759, + "grad_norm": 0.037651486694812775, + "learning_rate": 2.9577240205380107e-06, + "loss": 0.0022, + "step": 81960 + }, + { + "epoch": 1.3412419209686657, + "grad_norm": 0.17404231429100037, + "learning_rate": 2.9564207423129522e-06, + "loss": 0.0019, + "step": 81970 + }, + { + "epoch": 1.3414055469197415, + "grad_norm": 0.08907478302717209, + "learning_rate": 2.9551176307552642e-06, + "loss": 0.0009, + "step": 81980 + }, + { + "epoch": 1.3415691728708175, + "grad_norm": 0.006502117030322552, + "learning_rate": 2.953814685971226e-06, + "loss": 0.0016, + "step": 81990 + }, + { + "epoch": 1.3417327988218932, + "grad_norm": 0.02640322409570217, + "learning_rate": 2.9525119080670995e-06, + "loss": 0.0018, + "step": 82000 + }, + { + "epoch": 1.341896424772969, + "grad_norm": 0.09930183738470078, + "learning_rate": 2.951209297149136e-06, + "loss": 0.0012, + "step": 82010 + }, + { + "epoch": 1.3420600507240448, + "grad_norm": 0.12355904281139374, + "learning_rate": 2.9499068533235718e-06, + "loss": 0.0017, + "step": 82020 + }, + { + "epoch": 1.3422236766751205, + "grad_norm": 0.06289374828338623, + "learning_rate": 2.948604576696631e-06, + "loss": 0.0006, + "step": 82030 + }, + { + "epoch": 1.3423873026261965, + "grad_norm": 0.08590333163738251, + "learning_rate": 2.9473024673745197e-06, + "loss": 0.002, + "step": 82040 + }, + { + "epoch": 1.3425509285772723, + "grad_norm": 0.06022851541638374, + "learning_rate": 2.9460005254634354e-06, + "loss": 0.0018, + "step": 82050 + }, + { + "epoch": 1.342714554528348, + "grad_norm": 0.011406956240534782, + "learning_rate": 2.9446987510695602e-06, + "loss": 0.0016, + "step": 82060 + }, + { + "epoch": 1.342878180479424, + "grad_norm": 0.08705837279558182, + "learning_rate": 2.9433971442990618e-06, + "loss": 0.0013, + "step": 82070 + }, + { + "epoch": 1.3430418064304999, + "grad_norm": 0.05391626060009003, + "learning_rate": 2.9420957052580958e-06, + "loss": 0.0009, + "step": 82080 + }, + { + "epoch": 1.3432054323815756, + "grad_norm": 0.07751340419054031, + "learning_rate": 2.9407944340528017e-06, + "loss": 0.0034, + "step": 82090 + }, + { + "epoch": 1.3433690583326516, + "grad_norm": 0.07951226830482483, + "learning_rate": 2.939493330789308e-06, + "loss": 0.0011, + "step": 82100 + }, + { + "epoch": 1.3435326842837274, + "grad_norm": 0.04669647663831711, + "learning_rate": 2.9381923955737277e-06, + "loss": 0.0012, + "step": 82110 + }, + { + "epoch": 1.3436963102348032, + "grad_norm": 0.012807745486497879, + "learning_rate": 2.9368916285121605e-06, + "loss": 0.0014, + "step": 82120 + }, + { + "epoch": 1.3438599361858792, + "grad_norm": 0.05920378118753433, + "learning_rate": 2.935591029710693e-06, + "loss": 0.0017, + "step": 82130 + }, + { + "epoch": 1.344023562136955, + "grad_norm": 0.03783365711569786, + "learning_rate": 2.934290599275398e-06, + "loss": 0.0007, + "step": 82140 + }, + { + "epoch": 1.3441871880880307, + "grad_norm": 0.1284850537776947, + "learning_rate": 2.9329903373123325e-06, + "loss": 0.0023, + "step": 82150 + }, + { + "epoch": 1.3443508140391067, + "grad_norm": 0.0661517083644867, + "learning_rate": 2.931690243927543e-06, + "loss": 0.0009, + "step": 82160 + }, + { + "epoch": 1.3445144399901825, + "grad_norm": 0.06416383385658264, + "learning_rate": 2.93039031922706e-06, + "loss": 0.0016, + "step": 82170 + }, + { + "epoch": 1.3446780659412583, + "grad_norm": 0.02821514382958412, + "learning_rate": 2.9290905633169015e-06, + "loss": 0.0009, + "step": 82180 + }, + { + "epoch": 1.344841691892334, + "grad_norm": 0.016270047053694725, + "learning_rate": 2.9277909763030705e-06, + "loss": 0.0021, + "step": 82190 + }, + { + "epoch": 1.34500531784341, + "grad_norm": 0.0017019853694364429, + "learning_rate": 2.9264915582915576e-06, + "loss": 0.0013, + "step": 82200 + }, + { + "epoch": 1.3451689437944858, + "grad_norm": 0.006392058916389942, + "learning_rate": 2.9251923093883383e-06, + "loss": 0.001, + "step": 82210 + }, + { + "epoch": 1.3453325697455616, + "grad_norm": 0.11702040582895279, + "learning_rate": 2.9238932296993747e-06, + "loss": 0.0012, + "step": 82220 + }, + { + "epoch": 1.3454961956966374, + "grad_norm": 0.1232714056968689, + "learning_rate": 2.922594319330616e-06, + "loss": 0.0056, + "step": 82230 + }, + { + "epoch": 1.3456598216477134, + "grad_norm": 0.08905120939016342, + "learning_rate": 2.921295578387997e-06, + "loss": 0.0014, + "step": 82240 + }, + { + "epoch": 1.3458234475987891, + "grad_norm": 0.029471050947904587, + "learning_rate": 2.919997006977438e-06, + "loss": 0.0017, + "step": 82250 + }, + { + "epoch": 1.345987073549865, + "grad_norm": 0.09401378780603409, + "learning_rate": 2.9186986052048473e-06, + "loss": 0.0012, + "step": 82260 + }, + { + "epoch": 1.346150699500941, + "grad_norm": 0.05352969467639923, + "learning_rate": 2.9174003731761157e-06, + "loss": 0.0012, + "step": 82270 + }, + { + "epoch": 1.3463143254520167, + "grad_norm": 0.06326586753129959, + "learning_rate": 2.9161023109971255e-06, + "loss": 0.0011, + "step": 82280 + }, + { + "epoch": 1.3464779514030925, + "grad_norm": 0.05645281821489334, + "learning_rate": 2.914804418773739e-06, + "loss": 0.0007, + "step": 82290 + }, + { + "epoch": 1.3466415773541685, + "grad_norm": 0.010373152792453766, + "learning_rate": 2.9135066966118113e-06, + "loss": 0.0017, + "step": 82300 + }, + { + "epoch": 1.3468052033052442, + "grad_norm": 0.021385610103607178, + "learning_rate": 2.912209144617177e-06, + "loss": 0.0022, + "step": 82310 + }, + { + "epoch": 1.34696882925632, + "grad_norm": 0.05037151277065277, + "learning_rate": 2.9109117628956633e-06, + "loss": 0.0015, + "step": 82320 + }, + { + "epoch": 1.347132455207396, + "grad_norm": 0.05088755488395691, + "learning_rate": 2.9096145515530765e-06, + "loss": 0.0022, + "step": 82330 + }, + { + "epoch": 1.3472960811584718, + "grad_norm": 0.060416366904973984, + "learning_rate": 2.908317510695217e-06, + "loss": 0.0014, + "step": 82340 + }, + { + "epoch": 1.3474597071095475, + "grad_norm": 0.0354192741215229, + "learning_rate": 2.907020640427863e-06, + "loss": 0.0015, + "step": 82350 + }, + { + "epoch": 1.3476233330606235, + "grad_norm": 0.04739399254322052, + "learning_rate": 2.905723940856786e-06, + "loss": 0.0009, + "step": 82360 + }, + { + "epoch": 1.3477869590116993, + "grad_norm": 0.0759781152009964, + "learning_rate": 2.9044274120877374e-06, + "loss": 0.0014, + "step": 82370 + }, + { + "epoch": 1.347950584962775, + "grad_norm": 0.04068463295698166, + "learning_rate": 2.9031310542264614e-06, + "loss": 0.001, + "step": 82380 + }, + { + "epoch": 1.3481142109138509, + "grad_norm": 0.09307326376438141, + "learning_rate": 2.9018348673786808e-06, + "loss": 0.0021, + "step": 82390 + }, + { + "epoch": 1.3482778368649269, + "grad_norm": 0.052611157298088074, + "learning_rate": 2.900538851650111e-06, + "loss": 0.0014, + "step": 82400 + }, + { + "epoch": 1.3484414628160026, + "grad_norm": 0.037139199674129486, + "learning_rate": 2.899243007146448e-06, + "loss": 0.0009, + "step": 82410 + }, + { + "epoch": 1.3486050887670784, + "grad_norm": 0.04838687554001808, + "learning_rate": 2.89794733397338e-06, + "loss": 0.001, + "step": 82420 + }, + { + "epoch": 1.3487687147181542, + "grad_norm": 0.1465974897146225, + "learning_rate": 2.8966518322365732e-06, + "loss": 0.0018, + "step": 82430 + }, + { + "epoch": 1.3489323406692302, + "grad_norm": 0.021042266860604286, + "learning_rate": 2.895356502041689e-06, + "loss": 0.0009, + "step": 82440 + }, + { + "epoch": 1.349095966620306, + "grad_norm": 0.05505353957414627, + "learning_rate": 2.894061343494366e-06, + "loss": 0.0011, + "step": 82450 + }, + { + "epoch": 1.3492595925713817, + "grad_norm": 0.15444590151309967, + "learning_rate": 2.8927663567002366e-06, + "loss": 0.002, + "step": 82460 + }, + { + "epoch": 1.3494232185224577, + "grad_norm": 0.07559553533792496, + "learning_rate": 2.891471541764912e-06, + "loss": 0.0011, + "step": 82470 + }, + { + "epoch": 1.3495868444735335, + "grad_norm": 0.04304005578160286, + "learning_rate": 2.890176898793996e-06, + "loss": 0.0008, + "step": 82480 + }, + { + "epoch": 1.3497504704246093, + "grad_norm": 0.028131136670708656, + "learning_rate": 2.888882427893073e-06, + "loss": 0.0009, + "step": 82490 + }, + { + "epoch": 1.3499140963756853, + "grad_norm": 0.043105218559503555, + "learning_rate": 2.8875881291677177e-06, + "loss": 0.0018, + "step": 82500 + }, + { + "epoch": 1.350077722326761, + "grad_norm": 0.019842512905597687, + "learning_rate": 2.8862940027234854e-06, + "loss": 0.001, + "step": 82510 + }, + { + "epoch": 1.3502413482778368, + "grad_norm": 0.05363038182258606, + "learning_rate": 2.885000048665925e-06, + "loss": 0.0016, + "step": 82520 + }, + { + "epoch": 1.3504049742289128, + "grad_norm": 0.033374592661857605, + "learning_rate": 2.8837062671005657e-06, + "loss": 0.0022, + "step": 82530 + }, + { + "epoch": 1.3505686001799886, + "grad_norm": 0.32421237230300903, + "learning_rate": 2.882412658132919e-06, + "loss": 0.0027, + "step": 82540 + }, + { + "epoch": 1.3507322261310644, + "grad_norm": 0.03278248384594917, + "learning_rate": 2.8811192218684946e-06, + "loss": 0.001, + "step": 82550 + }, + { + "epoch": 1.3508958520821404, + "grad_norm": 0.06711427867412567, + "learning_rate": 2.879825958412776e-06, + "loss": 0.0009, + "step": 82560 + }, + { + "epoch": 1.3510594780332161, + "grad_norm": 0.017694778740406036, + "learning_rate": 2.87853286787124e-06, + "loss": 0.0015, + "step": 82570 + }, + { + "epoch": 1.351223103984292, + "grad_norm": 0.023505406454205513, + "learning_rate": 2.877239950349343e-06, + "loss": 0.0016, + "step": 82580 + }, + { + "epoch": 1.3513867299353677, + "grad_norm": 0.07301666587591171, + "learning_rate": 2.8759472059525363e-06, + "loss": 0.0011, + "step": 82590 + }, + { + "epoch": 1.3515503558864435, + "grad_norm": 0.004277330823242664, + "learning_rate": 2.874654634786246e-06, + "loss": 0.0014, + "step": 82600 + }, + { + "epoch": 1.3517139818375195, + "grad_norm": 0.03963760286569595, + "learning_rate": 2.8733622369558954e-06, + "loss": 0.0011, + "step": 82610 + }, + { + "epoch": 1.3518776077885952, + "grad_norm": 0.006173059809952974, + "learning_rate": 2.8720700125668828e-06, + "loss": 0.0021, + "step": 82620 + }, + { + "epoch": 1.352041233739671, + "grad_norm": 0.025558434426784515, + "learning_rate": 2.8707779617246025e-06, + "loss": 0.0006, + "step": 82630 + }, + { + "epoch": 1.352204859690747, + "grad_norm": 0.025679195299744606, + "learning_rate": 2.8694860845344254e-06, + "loss": 0.0016, + "step": 82640 + }, + { + "epoch": 1.3523684856418228, + "grad_norm": 0.08831024169921875, + "learning_rate": 2.8681943811017153e-06, + "loss": 0.0017, + "step": 82650 + }, + { + "epoch": 1.3525321115928985, + "grad_norm": 0.06250100582838058, + "learning_rate": 2.866902851531818e-06, + "loss": 0.0013, + "step": 82660 + }, + { + "epoch": 1.3526957375439745, + "grad_norm": 0.038387566804885864, + "learning_rate": 2.8656114959300685e-06, + "loss": 0.001, + "step": 82670 + }, + { + "epoch": 1.3528593634950503, + "grad_norm": 0.10904587805271149, + "learning_rate": 2.8643203144017812e-06, + "loss": 0.0012, + "step": 82680 + }, + { + "epoch": 1.353022989446126, + "grad_norm": 0.045857395976781845, + "learning_rate": 2.8630293070522654e-06, + "loss": 0.0036, + "step": 82690 + }, + { + "epoch": 1.353186615397202, + "grad_norm": 0.0243227481842041, + "learning_rate": 2.861738473986806e-06, + "loss": 0.0027, + "step": 82700 + }, + { + "epoch": 1.3533502413482779, + "grad_norm": 0.10378051549196243, + "learning_rate": 2.860447815310684e-06, + "loss": 0.0011, + "step": 82710 + }, + { + "epoch": 1.3535138672993536, + "grad_norm": 0.01445746049284935, + "learning_rate": 2.8591573311291564e-06, + "loss": 0.0012, + "step": 82720 + }, + { + "epoch": 1.3536774932504296, + "grad_norm": 0.028539516031742096, + "learning_rate": 2.857867021547476e-06, + "loss": 0.0023, + "step": 82730 + }, + { + "epoch": 1.3538411192015054, + "grad_norm": 0.04224391281604767, + "learning_rate": 2.8565768866708697e-06, + "loss": 0.0011, + "step": 82740 + }, + { + "epoch": 1.3540047451525812, + "grad_norm": 0.06839609891176224, + "learning_rate": 2.8552869266045636e-06, + "loss": 0.0016, + "step": 82750 + }, + { + "epoch": 1.3541683711036572, + "grad_norm": 0.007490002084523439, + "learning_rate": 2.8539971414537558e-06, + "loss": 0.001, + "step": 82760 + }, + { + "epoch": 1.354331997054733, + "grad_norm": 0.028109358623623848, + "learning_rate": 2.8527075313236417e-06, + "loss": 0.0008, + "step": 82770 + }, + { + "epoch": 1.3544956230058087, + "grad_norm": 0.11394378542900085, + "learning_rate": 2.8514180963193944e-06, + "loss": 0.0011, + "step": 82780 + }, + { + "epoch": 1.3546592489568845, + "grad_norm": 0.04138392582535744, + "learning_rate": 2.8501288365461787e-06, + "loss": 0.0008, + "step": 82790 + }, + { + "epoch": 1.3548228749079603, + "grad_norm": 0.004836897365748882, + "learning_rate": 2.848839752109138e-06, + "loss": 0.0007, + "step": 82800 + }, + { + "epoch": 1.3549865008590363, + "grad_norm": 0.04805408790707588, + "learning_rate": 2.8475508431134115e-06, + "loss": 0.0023, + "step": 82810 + }, + { + "epoch": 1.355150126810112, + "grad_norm": 0.038444846868515015, + "learning_rate": 2.8462621096641118e-06, + "loss": 0.0015, + "step": 82820 + }, + { + "epoch": 1.3553137527611878, + "grad_norm": 0.01575840823352337, + "learning_rate": 2.8449735518663497e-06, + "loss": 0.0023, + "step": 82830 + }, + { + "epoch": 1.3554773787122638, + "grad_norm": 0.044413212686777115, + "learning_rate": 2.843685169825212e-06, + "loss": 0.0009, + "step": 82840 + }, + { + "epoch": 1.3556410046633396, + "grad_norm": 0.004706390667706728, + "learning_rate": 2.842396963645774e-06, + "loss": 0.0011, + "step": 82850 + }, + { + "epoch": 1.3558046306144154, + "grad_norm": 0.04555555433034897, + "learning_rate": 2.8411089334330997e-06, + "loss": 0.0011, + "step": 82860 + }, + { + "epoch": 1.3559682565654914, + "grad_norm": 0.027418551966547966, + "learning_rate": 2.8398210792922355e-06, + "loss": 0.0008, + "step": 82870 + }, + { + "epoch": 1.3561318825165671, + "grad_norm": 0.05459018051624298, + "learning_rate": 2.838533401328215e-06, + "loss": 0.0009, + "step": 82880 + }, + { + "epoch": 1.356295508467643, + "grad_norm": 0.0671369731426239, + "learning_rate": 2.837245899646055e-06, + "loss": 0.0012, + "step": 82890 + }, + { + "epoch": 1.356459134418719, + "grad_norm": 0.06007509306073189, + "learning_rate": 2.835958574350762e-06, + "loss": 0.0016, + "step": 82900 + }, + { + "epoch": 1.3566227603697947, + "grad_norm": 0.04670794680714607, + "learning_rate": 2.8346714255473245e-06, + "loss": 0.002, + "step": 82910 + }, + { + "epoch": 1.3567863863208705, + "grad_norm": 0.03598122298717499, + "learning_rate": 2.8333844533407195e-06, + "loss": 0.0011, + "step": 82920 + }, + { + "epoch": 1.3569500122719464, + "grad_norm": 0.053446996957063675, + "learning_rate": 2.8320976578359057e-06, + "loss": 0.0011, + "step": 82930 + }, + { + "epoch": 1.3571136382230222, + "grad_norm": 0.04230347275733948, + "learning_rate": 2.830811039137833e-06, + "loss": 0.0012, + "step": 82940 + }, + { + "epoch": 1.357277264174098, + "grad_norm": 0.041250068694353104, + "learning_rate": 2.829524597351429e-06, + "loss": 0.001, + "step": 82950 + }, + { + "epoch": 1.3574408901251738, + "grad_norm": 0.04978470876812935, + "learning_rate": 2.8282383325816165e-06, + "loss": 0.0013, + "step": 82960 + }, + { + "epoch": 1.3576045160762498, + "grad_norm": 0.15342646837234497, + "learning_rate": 2.8269522449332954e-06, + "loss": 0.0012, + "step": 82970 + }, + { + "epoch": 1.3577681420273255, + "grad_norm": 0.22620487213134766, + "learning_rate": 2.8256663345113567e-06, + "loss": 0.0012, + "step": 82980 + }, + { + "epoch": 1.3579317679784013, + "grad_norm": 0.06654641777276993, + "learning_rate": 2.8243806014206732e-06, + "loss": 0.0016, + "step": 82990 + }, + { + "epoch": 1.358095393929477, + "grad_norm": 0.16387392580509186, + "learning_rate": 2.8230950457661064e-06, + "loss": 0.003, + "step": 83000 + }, + { + "epoch": 1.358259019880553, + "grad_norm": 0.026688173413276672, + "learning_rate": 2.8218096676525002e-06, + "loss": 0.0009, + "step": 83010 + }, + { + "epoch": 1.3584226458316289, + "grad_norm": 0.028287360444664955, + "learning_rate": 2.8205244671846886e-06, + "loss": 0.0015, + "step": 83020 + }, + { + "epoch": 1.3585862717827046, + "grad_norm": 0.08470385521650314, + "learning_rate": 2.8192394444674842e-06, + "loss": 0.0007, + "step": 83030 + }, + { + "epoch": 1.3587498977337806, + "grad_norm": 0.007314716000109911, + "learning_rate": 2.817954599605693e-06, + "loss": 0.001, + "step": 83040 + }, + { + "epoch": 1.3589135236848564, + "grad_norm": 0.04179174825549126, + "learning_rate": 2.816669932704098e-06, + "loss": 0.0014, + "step": 83050 + }, + { + "epoch": 1.3590771496359322, + "grad_norm": 0.04235430434346199, + "learning_rate": 2.8153854438674778e-06, + "loss": 0.0011, + "step": 83060 + }, + { + "epoch": 1.3592407755870082, + "grad_norm": 0.04849351570010185, + "learning_rate": 2.8141011332005864e-06, + "loss": 0.0023, + "step": 83070 + }, + { + "epoch": 1.359404401538084, + "grad_norm": 0.07125011086463928, + "learning_rate": 2.8128170008081705e-06, + "loss": 0.0012, + "step": 83080 + }, + { + "epoch": 1.3595680274891597, + "grad_norm": 0.024520372971892357, + "learning_rate": 2.8115330467949577e-06, + "loss": 0.001, + "step": 83090 + }, + { + "epoch": 1.3597316534402357, + "grad_norm": 0.07137442380189896, + "learning_rate": 2.810249271265665e-06, + "loss": 0.001, + "step": 83100 + }, + { + "epoch": 1.3598952793913115, + "grad_norm": 0.11436280608177185, + "learning_rate": 2.8089656743249895e-06, + "loss": 0.0012, + "step": 83110 + }, + { + "epoch": 1.3600589053423873, + "grad_norm": 0.03243430703878403, + "learning_rate": 2.807682256077622e-06, + "loss": 0.0016, + "step": 83120 + }, + { + "epoch": 1.3602225312934633, + "grad_norm": 0.027852749451994896, + "learning_rate": 2.806399016628227e-06, + "loss": 0.0014, + "step": 83130 + }, + { + "epoch": 1.360386157244539, + "grad_norm": 0.08546815067529678, + "learning_rate": 2.805115956081469e-06, + "loss": 0.0017, + "step": 83140 + }, + { + "epoch": 1.3605497831956148, + "grad_norm": 0.08746512979269028, + "learning_rate": 2.803833074541983e-06, + "loss": 0.0009, + "step": 83150 + }, + { + "epoch": 1.3607134091466906, + "grad_norm": 0.0853603258728981, + "learning_rate": 2.802550372114401e-06, + "loss": 0.0017, + "step": 83160 + }, + { + "epoch": 1.3608770350977666, + "grad_norm": 0.031326375901699066, + "learning_rate": 2.8012678489033352e-06, + "loss": 0.0006, + "step": 83170 + }, + { + "epoch": 1.3610406610488424, + "grad_norm": 0.012908640317618847, + "learning_rate": 2.79998550501338e-06, + "loss": 0.0012, + "step": 83180 + }, + { + "epoch": 1.3612042869999181, + "grad_norm": 0.006007511168718338, + "learning_rate": 2.798703340549124e-06, + "loss": 0.001, + "step": 83190 + }, + { + "epoch": 1.361367912950994, + "grad_norm": 0.0022917778696864843, + "learning_rate": 2.797421355615133e-06, + "loss": 0.0011, + "step": 83200 + }, + { + "epoch": 1.36153153890207, + "grad_norm": 0.027564259245991707, + "learning_rate": 2.796139550315964e-06, + "loss": 0.002, + "step": 83210 + }, + { + "epoch": 1.3616951648531457, + "grad_norm": 0.012744871899485588, + "learning_rate": 2.7948579247561526e-06, + "loss": 0.0017, + "step": 83220 + }, + { + "epoch": 1.3618587908042215, + "grad_norm": 0.01554274931550026, + "learning_rate": 2.7935764790402287e-06, + "loss": 0.0007, + "step": 83230 + }, + { + "epoch": 1.3620224167552974, + "grad_norm": 0.07746699452400208, + "learning_rate": 2.7922952132726983e-06, + "loss": 0.001, + "step": 83240 + }, + { + "epoch": 1.3621860427063732, + "grad_norm": 0.10139969736337662, + "learning_rate": 2.791014127558061e-06, + "loss": 0.0019, + "step": 83250 + }, + { + "epoch": 1.362349668657449, + "grad_norm": 0.09875639528036118, + "learning_rate": 2.789733222000794e-06, + "loss": 0.004, + "step": 83260 + }, + { + "epoch": 1.362513294608525, + "grad_norm": 0.014498264528810978, + "learning_rate": 2.7884524967053672e-06, + "loss": 0.0014, + "step": 83270 + }, + { + "epoch": 1.3626769205596008, + "grad_norm": 0.009169657714664936, + "learning_rate": 2.7871719517762284e-06, + "loss": 0.0011, + "step": 83280 + }, + { + "epoch": 1.3628405465106765, + "grad_norm": 0.011819208972156048, + "learning_rate": 2.785891587317818e-06, + "loss": 0.001, + "step": 83290 + }, + { + "epoch": 1.3630041724617525, + "grad_norm": 0.04689594730734825, + "learning_rate": 2.7846114034345562e-06, + "loss": 0.0014, + "step": 83300 + }, + { + "epoch": 1.3631677984128283, + "grad_norm": 0.044827960431575775, + "learning_rate": 2.783331400230852e-06, + "loss": 0.0015, + "step": 83310 + }, + { + "epoch": 1.363331424363904, + "grad_norm": 0.07038811594247818, + "learning_rate": 2.7820515778110946e-06, + "loss": 0.0011, + "step": 83320 + }, + { + "epoch": 1.36349505031498, + "grad_norm": 0.0676996037364006, + "learning_rate": 2.780771936279667e-06, + "loss": 0.0021, + "step": 83330 + }, + { + "epoch": 1.3636586762660559, + "grad_norm": 0.06889384984970093, + "learning_rate": 2.7794924757409282e-06, + "loss": 0.0015, + "step": 83340 + }, + { + "epoch": 1.3638223022171316, + "grad_norm": 0.05268380045890808, + "learning_rate": 2.778213196299229e-06, + "loss": 0.0024, + "step": 83350 + }, + { + "epoch": 1.3639859281682074, + "grad_norm": 0.0043445006012916565, + "learning_rate": 2.776934098058901e-06, + "loss": 0.0009, + "step": 83360 + }, + { + "epoch": 1.3641495541192832, + "grad_norm": 0.09102828800678253, + "learning_rate": 2.7756551811242673e-06, + "loss": 0.0009, + "step": 83370 + }, + { + "epoch": 1.3643131800703592, + "grad_norm": 0.03233233466744423, + "learning_rate": 2.7743764455996257e-06, + "loss": 0.001, + "step": 83380 + }, + { + "epoch": 1.364476806021435, + "grad_norm": 0.08766317367553711, + "learning_rate": 2.773097891589272e-06, + "loss": 0.0008, + "step": 83390 + }, + { + "epoch": 1.3646404319725107, + "grad_norm": 0.015489128418266773, + "learning_rate": 2.771819519197475e-06, + "loss": 0.001, + "step": 83400 + }, + { + "epoch": 1.3648040579235867, + "grad_norm": 0.04057205468416214, + "learning_rate": 2.7705413285285e-06, + "loss": 0.0015, + "step": 83410 + }, + { + "epoch": 1.3649676838746625, + "grad_norm": 0.02744865044951439, + "learning_rate": 2.7692633196865858e-06, + "loss": 0.0007, + "step": 83420 + }, + { + "epoch": 1.3651313098257383, + "grad_norm": 0.0014298424357548356, + "learning_rate": 2.7679854927759685e-06, + "loss": 0.0014, + "step": 83430 + }, + { + "epoch": 1.3652949357768143, + "grad_norm": 0.08127010613679886, + "learning_rate": 2.766707847900858e-06, + "loss": 0.0011, + "step": 83440 + }, + { + "epoch": 1.36545856172789, + "grad_norm": 0.00600480567663908, + "learning_rate": 2.7654303851654597e-06, + "loss": 0.001, + "step": 83450 + }, + { + "epoch": 1.3656221876789658, + "grad_norm": 0.009183033369481564, + "learning_rate": 2.764153104673954e-06, + "loss": 0.0012, + "step": 83460 + }, + { + "epoch": 1.3657858136300418, + "grad_norm": 0.1788499504327774, + "learning_rate": 2.762876006530516e-06, + "loss": 0.0009, + "step": 83470 + }, + { + "epoch": 1.3659494395811176, + "grad_norm": 0.03937594220042229, + "learning_rate": 2.7615990908393e-06, + "loss": 0.0014, + "step": 83480 + }, + { + "epoch": 1.3661130655321934, + "grad_norm": 0.07301946729421616, + "learning_rate": 2.7603223577044436e-06, + "loss": 0.0011, + "step": 83490 + }, + { + "epoch": 1.3662766914832694, + "grad_norm": 0.0333046093583107, + "learning_rate": 2.7590458072300787e-06, + "loss": 0.0009, + "step": 83500 + }, + { + "epoch": 1.3664403174343451, + "grad_norm": 0.03378937765955925, + "learning_rate": 2.757769439520311e-06, + "loss": 0.0013, + "step": 83510 + }, + { + "epoch": 1.366603943385421, + "grad_norm": 0.02556074783205986, + "learning_rate": 2.7564932546792413e-06, + "loss": 0.0016, + "step": 83520 + }, + { + "epoch": 1.366767569336497, + "grad_norm": 0.03481189161539078, + "learning_rate": 2.7552172528109458e-06, + "loss": 0.0014, + "step": 83530 + }, + { + "epoch": 1.3669311952875727, + "grad_norm": 0.030714863911271095, + "learning_rate": 2.7539414340194963e-06, + "loss": 0.0017, + "step": 83540 + }, + { + "epoch": 1.3670948212386484, + "grad_norm": 0.1253112554550171, + "learning_rate": 2.75266579840894e-06, + "loss": 0.0015, + "step": 83550 + }, + { + "epoch": 1.3672584471897242, + "grad_norm": 0.07900375872850418, + "learning_rate": 2.751390346083317e-06, + "loss": 0.002, + "step": 83560 + }, + { + "epoch": 1.3674220731408, + "grad_norm": 0.048464708030223846, + "learning_rate": 2.7501150771466444e-06, + "loss": 0.0011, + "step": 83570 + }, + { + "epoch": 1.367585699091876, + "grad_norm": 0.02625541016459465, + "learning_rate": 2.748839991702934e-06, + "loss": 0.0011, + "step": 83580 + }, + { + "epoch": 1.3677493250429518, + "grad_norm": 0.0070399693213403225, + "learning_rate": 2.7475650898561725e-06, + "loss": 0.0006, + "step": 83590 + }, + { + "epoch": 1.3679129509940275, + "grad_norm": 0.13729895651340485, + "learning_rate": 2.7462903717103407e-06, + "loss": 0.0017, + "step": 83600 + }, + { + "epoch": 1.3680765769451035, + "grad_norm": 0.03516119718551636, + "learning_rate": 2.7450158373693963e-06, + "loss": 0.001, + "step": 83610 + }, + { + "epoch": 1.3682402028961793, + "grad_norm": 0.025441166013479233, + "learning_rate": 2.7437414869372903e-06, + "loss": 0.0006, + "step": 83620 + }, + { + "epoch": 1.368403828847255, + "grad_norm": 0.01052811462432146, + "learning_rate": 2.7424673205179497e-06, + "loss": 0.0011, + "step": 83630 + }, + { + "epoch": 1.368567454798331, + "grad_norm": 0.0376594215631485, + "learning_rate": 2.7411933382152966e-06, + "loss": 0.002, + "step": 83640 + }, + { + "epoch": 1.3687310807494069, + "grad_norm": 0.04426310956478119, + "learning_rate": 2.739919540133227e-06, + "loss": 0.0011, + "step": 83650 + }, + { + "epoch": 1.3688947067004826, + "grad_norm": 0.04762754216790199, + "learning_rate": 2.7386459263756327e-06, + "loss": 0.0008, + "step": 83660 + }, + { + "epoch": 1.3690583326515586, + "grad_norm": 0.13647077977657318, + "learning_rate": 2.7373724970463805e-06, + "loss": 0.0012, + "step": 83670 + }, + { + "epoch": 1.3692219586026344, + "grad_norm": 0.017382225021719933, + "learning_rate": 2.736099252249331e-06, + "loss": 0.0014, + "step": 83680 + }, + { + "epoch": 1.3693855845537102, + "grad_norm": 0.014112511649727821, + "learning_rate": 2.734826192088323e-06, + "loss": 0.0012, + "step": 83690 + }, + { + "epoch": 1.3695492105047862, + "grad_norm": 0.13683795928955078, + "learning_rate": 2.733553316667186e-06, + "loss": 0.0012, + "step": 83700 + }, + { + "epoch": 1.369712836455862, + "grad_norm": 0.06252920627593994, + "learning_rate": 2.732280626089726e-06, + "loss": 0.0008, + "step": 83710 + }, + { + "epoch": 1.3698764624069377, + "grad_norm": 0.015655100345611572, + "learning_rate": 2.731008120459746e-06, + "loss": 0.0014, + "step": 83720 + }, + { + "epoch": 1.3700400883580137, + "grad_norm": 0.021501947194337845, + "learning_rate": 2.7297357998810212e-06, + "loss": 0.0013, + "step": 83730 + }, + { + "epoch": 1.3702037143090895, + "grad_norm": 0.04292135313153267, + "learning_rate": 2.7284636644573225e-06, + "loss": 0.001, + "step": 83740 + }, + { + "epoch": 1.3703673402601653, + "grad_norm": 0.022099150344729424, + "learning_rate": 2.7271917142923976e-06, + "loss": 0.0017, + "step": 83750 + }, + { + "epoch": 1.370530966211241, + "grad_norm": 0.031449537724256516, + "learning_rate": 2.7259199494899847e-06, + "loss": 0.0009, + "step": 83760 + }, + { + "epoch": 1.3706945921623168, + "grad_norm": 0.07820586860179901, + "learning_rate": 2.724648370153802e-06, + "loss": 0.002, + "step": 83770 + }, + { + "epoch": 1.3708582181133928, + "grad_norm": 0.08235494792461395, + "learning_rate": 2.7233769763875595e-06, + "loss": 0.0021, + "step": 83780 + }, + { + "epoch": 1.3710218440644686, + "grad_norm": 0.026190664619207382, + "learning_rate": 2.7221057682949427e-06, + "loss": 0.0021, + "step": 83790 + }, + { + "epoch": 1.3711854700155444, + "grad_norm": 0.04493686929345131, + "learning_rate": 2.7208347459796304e-06, + "loss": 0.0016, + "step": 83800 + }, + { + "epoch": 1.3713490959666204, + "grad_norm": 0.16385577619075775, + "learning_rate": 2.719563909545283e-06, + "loss": 0.0033, + "step": 83810 + }, + { + "epoch": 1.3715127219176961, + "grad_norm": 0.0782640352845192, + "learning_rate": 2.718293259095542e-06, + "loss": 0.0011, + "step": 83820 + }, + { + "epoch": 1.371676347868772, + "grad_norm": 0.02138345316052437, + "learning_rate": 2.7170227947340416e-06, + "loss": 0.001, + "step": 83830 + }, + { + "epoch": 1.371839973819848, + "grad_norm": 0.04858659952878952, + "learning_rate": 2.715752516564393e-06, + "loss": 0.0012, + "step": 83840 + }, + { + "epoch": 1.3720035997709237, + "grad_norm": 0.054707422852516174, + "learning_rate": 2.714482424690199e-06, + "loss": 0.0007, + "step": 83850 + }, + { + "epoch": 1.3721672257219995, + "grad_norm": 0.06697214394807816, + "learning_rate": 2.713212519215041e-06, + "loss": 0.0013, + "step": 83860 + }, + { + "epoch": 1.3723308516730754, + "grad_norm": 0.021314896643161774, + "learning_rate": 2.711942800242492e-06, + "loss": 0.0006, + "step": 83870 + }, + { + "epoch": 1.3724944776241512, + "grad_norm": 0.13214902579784393, + "learning_rate": 2.7106732678761004e-06, + "loss": 0.0012, + "step": 83880 + }, + { + "epoch": 1.372658103575227, + "grad_norm": 0.04471101239323616, + "learning_rate": 2.7094039222194106e-06, + "loss": 0.0017, + "step": 83890 + }, + { + "epoch": 1.372821729526303, + "grad_norm": 0.05017504096031189, + "learning_rate": 2.708134763375942e-06, + "loss": 0.001, + "step": 83900 + }, + { + "epoch": 1.3729853554773788, + "grad_norm": 0.018335238099098206, + "learning_rate": 2.7068657914492057e-06, + "loss": 0.001, + "step": 83910 + }, + { + "epoch": 1.3731489814284545, + "grad_norm": 0.05811738595366478, + "learning_rate": 2.7055970065426922e-06, + "loss": 0.0023, + "step": 83920 + }, + { + "epoch": 1.3733126073795303, + "grad_norm": 0.03068411909043789, + "learning_rate": 2.7043284087598824e-06, + "loss": 0.0015, + "step": 83930 + }, + { + "epoch": 1.3734762333306063, + "grad_norm": 0.031612154096364975, + "learning_rate": 2.7030599982042357e-06, + "loss": 0.003, + "step": 83940 + }, + { + "epoch": 1.373639859281682, + "grad_norm": 0.05159648135304451, + "learning_rate": 2.7017917749792016e-06, + "loss": 0.001, + "step": 83950 + }, + { + "epoch": 1.3738034852327579, + "grad_norm": 0.03812577575445175, + "learning_rate": 2.70052373918821e-06, + "loss": 0.0015, + "step": 83960 + }, + { + "epoch": 1.3739671111838336, + "grad_norm": 0.03210095688700676, + "learning_rate": 2.69925589093468e-06, + "loss": 0.0011, + "step": 83970 + }, + { + "epoch": 1.3741307371349096, + "grad_norm": 0.0571649968624115, + "learning_rate": 2.69798823032201e-06, + "loss": 0.0008, + "step": 83980 + }, + { + "epoch": 1.3742943630859854, + "grad_norm": 0.10776115208864212, + "learning_rate": 2.69672075745359e-06, + "loss": 0.0012, + "step": 83990 + }, + { + "epoch": 1.3744579890370612, + "grad_norm": 0.06793531775474548, + "learning_rate": 2.6954534724327865e-06, + "loss": 0.0013, + "step": 84000 + }, + { + "epoch": 1.3746216149881372, + "grad_norm": 0.02788708172738552, + "learning_rate": 2.6941863753629593e-06, + "loss": 0.0013, + "step": 84010 + }, + { + "epoch": 1.374785240939213, + "grad_norm": 0.08497747033834457, + "learning_rate": 2.6929194663474438e-06, + "loss": 0.0006, + "step": 84020 + }, + { + "epoch": 1.3749488668902887, + "grad_norm": 0.027750637382268906, + "learning_rate": 2.691652745489569e-06, + "loss": 0.0008, + "step": 84030 + }, + { + "epoch": 1.3751124928413647, + "grad_norm": 0.07567192614078522, + "learning_rate": 2.6903862128926415e-06, + "loss": 0.0028, + "step": 84040 + }, + { + "epoch": 1.3752761187924405, + "grad_norm": 0.004019399639219046, + "learning_rate": 2.6891198686599584e-06, + "loss": 0.0019, + "step": 84050 + }, + { + "epoch": 1.3754397447435163, + "grad_norm": 0.04956422001123428, + "learning_rate": 2.687853712894794e-06, + "loss": 0.0014, + "step": 84060 + }, + { + "epoch": 1.3756033706945923, + "grad_norm": 0.09158650785684586, + "learning_rate": 2.6865877457004164e-06, + "loss": 0.001, + "step": 84070 + }, + { + "epoch": 1.375766996645668, + "grad_norm": 0.12129585444927216, + "learning_rate": 2.685321967180069e-06, + "loss": 0.0016, + "step": 84080 + }, + { + "epoch": 1.3759306225967438, + "grad_norm": 0.08762133121490479, + "learning_rate": 2.684056377436989e-06, + "loss": 0.0013, + "step": 84090 + }, + { + "epoch": 1.3760942485478198, + "grad_norm": 0.07286391407251358, + "learning_rate": 2.68279097657439e-06, + "loss": 0.0023, + "step": 84100 + }, + { + "epoch": 1.3762578744988956, + "grad_norm": 0.05570758134126663, + "learning_rate": 2.6815257646954762e-06, + "loss": 0.0011, + "step": 84110 + }, + { + "epoch": 1.3764215004499714, + "grad_norm": 0.039847783744335175, + "learning_rate": 2.680260741903431e-06, + "loss": 0.001, + "step": 84120 + }, + { + "epoch": 1.3765851264010471, + "grad_norm": 0.16149753332138062, + "learning_rate": 2.6789959083014295e-06, + "loss": 0.0018, + "step": 84130 + }, + { + "epoch": 1.3767487523521231, + "grad_norm": 0.12277252227067947, + "learning_rate": 2.677731263992625e-06, + "loss": 0.0013, + "step": 84140 + }, + { + "epoch": 1.376912378303199, + "grad_norm": 0.043828777968883514, + "learning_rate": 2.6764668090801555e-06, + "loss": 0.0016, + "step": 84150 + }, + { + "epoch": 1.3770760042542747, + "grad_norm": 0.18554279208183289, + "learning_rate": 2.6752025436671497e-06, + "loss": 0.0028, + "step": 84160 + }, + { + "epoch": 1.3772396302053505, + "grad_norm": 0.09850435703992844, + "learning_rate": 2.6739384678567127e-06, + "loss": 0.0014, + "step": 84170 + }, + { + "epoch": 1.3774032561564264, + "grad_norm": 0.04618208110332489, + "learning_rate": 2.6726745817519428e-06, + "loss": 0.0014, + "step": 84180 + }, + { + "epoch": 1.3775668821075022, + "grad_norm": 0.028582289814949036, + "learning_rate": 2.671410885455913e-06, + "loss": 0.0007, + "step": 84190 + }, + { + "epoch": 1.377730508058578, + "grad_norm": 0.06658206135034561, + "learning_rate": 2.6701473790716913e-06, + "loss": 0.0012, + "step": 84200 + }, + { + "epoch": 1.377894134009654, + "grad_norm": 0.010035544633865356, + "learning_rate": 2.6688840627023203e-06, + "loss": 0.0012, + "step": 84210 + }, + { + "epoch": 1.3780577599607298, + "grad_norm": 0.1793397217988968, + "learning_rate": 2.6676209364508365e-06, + "loss": 0.0023, + "step": 84220 + }, + { + "epoch": 1.3782213859118055, + "grad_norm": 0.05178650841116905, + "learning_rate": 2.6663580004202517e-06, + "loss": 0.0008, + "step": 84230 + }, + { + "epoch": 1.3783850118628815, + "grad_norm": 0.02752397023141384, + "learning_rate": 2.66509525471357e-06, + "loss": 0.0007, + "step": 84240 + }, + { + "epoch": 1.3785486378139573, + "grad_norm": 0.007300386670976877, + "learning_rate": 2.6638326994337736e-06, + "loss": 0.0005, + "step": 84250 + }, + { + "epoch": 1.378712263765033, + "grad_norm": 0.09950890392065048, + "learning_rate": 2.6625703346838363e-06, + "loss": 0.0012, + "step": 84260 + }, + { + "epoch": 1.378875889716109, + "grad_norm": 0.03508450463414192, + "learning_rate": 2.6613081605667068e-06, + "loss": 0.0006, + "step": 84270 + }, + { + "epoch": 1.3790395156671849, + "grad_norm": 0.034476395696401596, + "learning_rate": 2.660046177185329e-06, + "loss": 0.0011, + "step": 84280 + }, + { + "epoch": 1.3792031416182606, + "grad_norm": 0.05864248052239418, + "learning_rate": 2.658784384642623e-06, + "loss": 0.0011, + "step": 84290 + }, + { + "epoch": 1.3793667675693366, + "grad_norm": 0.05984555557370186, + "learning_rate": 2.6575227830414984e-06, + "loss": 0.001, + "step": 84300 + }, + { + "epoch": 1.3795303935204124, + "grad_norm": 0.02164645865559578, + "learning_rate": 2.6562613724848437e-06, + "loss": 0.0012, + "step": 84310 + }, + { + "epoch": 1.3796940194714882, + "grad_norm": 0.04665246233344078, + "learning_rate": 2.6550001530755393e-06, + "loss": 0.0009, + "step": 84320 + }, + { + "epoch": 1.379857645422564, + "grad_norm": 0.04419153928756714, + "learning_rate": 2.6537391249164424e-06, + "loss": 0.0009, + "step": 84330 + }, + { + "epoch": 1.3800212713736397, + "grad_norm": 0.049217589199543, + "learning_rate": 2.6524782881104015e-06, + "loss": 0.0015, + "step": 84340 + }, + { + "epoch": 1.3801848973247157, + "grad_norm": 0.07900845259428024, + "learning_rate": 2.651217642760242e-06, + "loss": 0.002, + "step": 84350 + }, + { + "epoch": 1.3803485232757915, + "grad_norm": 0.026565400883555412, + "learning_rate": 2.6499571889687835e-06, + "loss": 0.0009, + "step": 84360 + }, + { + "epoch": 1.3805121492268673, + "grad_norm": 0.005271803587675095, + "learning_rate": 2.6486969268388186e-06, + "loss": 0.0008, + "step": 84370 + }, + { + "epoch": 1.3806757751779433, + "grad_norm": 0.07838542014360428, + "learning_rate": 2.647436856473134e-06, + "loss": 0.0012, + "step": 84380 + }, + { + "epoch": 1.380839401129019, + "grad_norm": 0.07470891624689102, + "learning_rate": 2.646176977974494e-06, + "loss": 0.001, + "step": 84390 + }, + { + "epoch": 1.3810030270800948, + "grad_norm": 0.01837877556681633, + "learning_rate": 2.6449172914456524e-06, + "loss": 0.001, + "step": 84400 + }, + { + "epoch": 1.3811666530311708, + "grad_norm": 0.04809350147843361, + "learning_rate": 2.643657796989343e-06, + "loss": 0.0013, + "step": 84410 + }, + { + "epoch": 1.3813302789822466, + "grad_norm": 0.06591973453760147, + "learning_rate": 2.6423984947082874e-06, + "loss": 0.0021, + "step": 84420 + }, + { + "epoch": 1.3814939049333224, + "grad_norm": 0.08016178011894226, + "learning_rate": 2.641139384705188e-06, + "loss": 0.0029, + "step": 84430 + }, + { + "epoch": 1.3816575308843984, + "grad_norm": 0.14264336228370667, + "learning_rate": 2.6398804670827372e-06, + "loss": 0.0018, + "step": 84440 + }, + { + "epoch": 1.3818211568354741, + "grad_norm": 0.03798792511224747, + "learning_rate": 2.6386217419436057e-06, + "loss": 0.0015, + "step": 84450 + }, + { + "epoch": 1.38198478278655, + "grad_norm": 0.05401057004928589, + "learning_rate": 2.6373632093904477e-06, + "loss": 0.0012, + "step": 84460 + }, + { + "epoch": 1.382148408737626, + "grad_norm": 0.054623693227767944, + "learning_rate": 2.636104869525911e-06, + "loss": 0.002, + "step": 84470 + }, + { + "epoch": 1.3823120346887017, + "grad_norm": 0.04498565196990967, + "learning_rate": 2.6348467224526163e-06, + "loss": 0.0007, + "step": 84480 + }, + { + "epoch": 1.3824756606397774, + "grad_norm": 0.010171068832278252, + "learning_rate": 2.6335887682731774e-06, + "loss": 0.001, + "step": 84490 + }, + { + "epoch": 1.3826392865908534, + "grad_norm": 0.04460660740733147, + "learning_rate": 2.6323310070901863e-06, + "loss": 0.0008, + "step": 84500 + }, + { + "epoch": 1.3828029125419292, + "grad_norm": 0.024205192923545837, + "learning_rate": 2.6310734390062244e-06, + "loss": 0.001, + "step": 84510 + }, + { + "epoch": 1.382966538493005, + "grad_norm": 0.0299739558249712, + "learning_rate": 2.6298160641238507e-06, + "loss": 0.0009, + "step": 84520 + }, + { + "epoch": 1.3831301644440808, + "grad_norm": 0.06419893354177475, + "learning_rate": 2.6285588825456165e-06, + "loss": 0.0013, + "step": 84530 + }, + { + "epoch": 1.3832937903951565, + "grad_norm": 0.060932599008083344, + "learning_rate": 2.6273018943740503e-06, + "loss": 0.0014, + "step": 84540 + }, + { + "epoch": 1.3834574163462325, + "grad_norm": 0.037657011300325394, + "learning_rate": 2.6260450997116705e-06, + "loss": 0.0014, + "step": 84550 + }, + { + "epoch": 1.3836210422973083, + "grad_norm": 0.02954825758934021, + "learning_rate": 2.624788498660974e-06, + "loss": 0.0013, + "step": 84560 + }, + { + "epoch": 1.383784668248384, + "grad_norm": 0.1386798918247223, + "learning_rate": 2.623532091324448e-06, + "loss": 0.0012, + "step": 84570 + }, + { + "epoch": 1.38394829419946, + "grad_norm": 0.008061363361775875, + "learning_rate": 2.622275877804557e-06, + "loss": 0.0007, + "step": 84580 + }, + { + "epoch": 1.3841119201505359, + "grad_norm": 0.008589118719100952, + "learning_rate": 2.621019858203758e-06, + "loss": 0.0007, + "step": 84590 + }, + { + "epoch": 1.3842755461016116, + "grad_norm": 0.0730748102068901, + "learning_rate": 2.6197640326244834e-06, + "loss": 0.0007, + "step": 84600 + }, + { + "epoch": 1.3844391720526876, + "grad_norm": 0.030864287167787552, + "learning_rate": 2.6185084011691574e-06, + "loss": 0.001, + "step": 84610 + }, + { + "epoch": 1.3846027980037634, + "grad_norm": 0.0010963203385472298, + "learning_rate": 2.617252963940182e-06, + "loss": 0.0013, + "step": 84620 + }, + { + "epoch": 1.3847664239548392, + "grad_norm": 0.025900447741150856, + "learning_rate": 2.61599772103995e-06, + "loss": 0.0011, + "step": 84630 + }, + { + "epoch": 1.3849300499059152, + "grad_norm": 0.04189382120966911, + "learning_rate": 2.6147426725708315e-06, + "loss": 0.0012, + "step": 84640 + }, + { + "epoch": 1.385093675856991, + "grad_norm": 0.10469403862953186, + "learning_rate": 2.6134878186351863e-06, + "loss": 0.002, + "step": 84650 + }, + { + "epoch": 1.3852573018080667, + "grad_norm": 0.013182587921619415, + "learning_rate": 2.6122331593353537e-06, + "loss": 0.0009, + "step": 84660 + }, + { + "epoch": 1.3854209277591427, + "grad_norm": 0.05186162889003754, + "learning_rate": 2.6109786947736626e-06, + "loss": 0.0007, + "step": 84670 + }, + { + "epoch": 1.3855845537102185, + "grad_norm": 0.07378797233104706, + "learning_rate": 2.609724425052419e-06, + "loss": 0.0014, + "step": 84680 + }, + { + "epoch": 1.3857481796612943, + "grad_norm": 0.027884868904948235, + "learning_rate": 2.6084703502739206e-06, + "loss": 0.0008, + "step": 84690 + }, + { + "epoch": 1.38591180561237, + "grad_norm": 0.054000888019800186, + "learning_rate": 2.607216470540442e-06, + "loss": 0.0033, + "step": 84700 + }, + { + "epoch": 1.386075431563446, + "grad_norm": 0.07974854111671448, + "learning_rate": 2.6059627859542492e-06, + "loss": 0.0016, + "step": 84710 + }, + { + "epoch": 1.3862390575145218, + "grad_norm": 0.12347791343927383, + "learning_rate": 2.604709296617584e-06, + "loss": 0.0017, + "step": 84720 + }, + { + "epoch": 1.3864026834655976, + "grad_norm": 0.18000462651252747, + "learning_rate": 2.6034560026326815e-06, + "loss": 0.001, + "step": 84730 + }, + { + "epoch": 1.3865663094166734, + "grad_norm": 0.0015927114291116595, + "learning_rate": 2.6022029041017517e-06, + "loss": 0.0008, + "step": 84740 + }, + { + "epoch": 1.3867299353677494, + "grad_norm": 0.17558711767196655, + "learning_rate": 2.600950001126996e-06, + "loss": 0.0013, + "step": 84750 + }, + { + "epoch": 1.3868935613188251, + "grad_norm": 0.045367199927568436, + "learning_rate": 2.5996972938105945e-06, + "loss": 0.0009, + "step": 84760 + }, + { + "epoch": 1.387057187269901, + "grad_norm": 0.01811496540904045, + "learning_rate": 2.5984447822547176e-06, + "loss": 0.0008, + "step": 84770 + }, + { + "epoch": 1.387220813220977, + "grad_norm": 0.11393507570028305, + "learning_rate": 2.5971924665615133e-06, + "loss": 0.0021, + "step": 84780 + }, + { + "epoch": 1.3873844391720527, + "grad_norm": 0.06782524287700653, + "learning_rate": 2.5959403468331145e-06, + "loss": 0.0008, + "step": 84790 + }, + { + "epoch": 1.3875480651231284, + "grad_norm": 0.16156239807605743, + "learning_rate": 2.5946884231716435e-06, + "loss": 0.0011, + "step": 84800 + }, + { + "epoch": 1.3877116910742044, + "grad_norm": 0.06091412901878357, + "learning_rate": 2.593436695679199e-06, + "loss": 0.0007, + "step": 84810 + }, + { + "epoch": 1.3878753170252802, + "grad_norm": 0.05284532532095909, + "learning_rate": 2.592185164457872e-06, + "loss": 0.0011, + "step": 84820 + }, + { + "epoch": 1.388038942976356, + "grad_norm": 0.006466014310717583, + "learning_rate": 2.5909338296097285e-06, + "loss": 0.0008, + "step": 84830 + }, + { + "epoch": 1.388202568927432, + "grad_norm": 0.008024314418435097, + "learning_rate": 2.5896826912368282e-06, + "loss": 0.002, + "step": 84840 + }, + { + "epoch": 1.3883661948785078, + "grad_norm": 0.019224250689148903, + "learning_rate": 2.588431749441205e-06, + "loss": 0.0005, + "step": 84850 + }, + { + "epoch": 1.3885298208295835, + "grad_norm": 0.003075191518291831, + "learning_rate": 2.5871810043248847e-06, + "loss": 0.0007, + "step": 84860 + }, + { + "epoch": 1.3886934467806595, + "grad_norm": 0.05049090087413788, + "learning_rate": 2.5859304559898712e-06, + "loss": 0.0029, + "step": 84870 + }, + { + "epoch": 1.3888570727317353, + "grad_norm": 0.01819625310599804, + "learning_rate": 2.5846801045381576e-06, + "loss": 0.0007, + "step": 84880 + }, + { + "epoch": 1.389020698682811, + "grad_norm": 0.024031417444348335, + "learning_rate": 2.583429950071716e-06, + "loss": 0.0014, + "step": 84890 + }, + { + "epoch": 1.3891843246338869, + "grad_norm": 0.05024215951561928, + "learning_rate": 2.5821799926925073e-06, + "loss": 0.0009, + "step": 84900 + }, + { + "epoch": 1.3893479505849629, + "grad_norm": 0.06953619420528412, + "learning_rate": 2.58093023250247e-06, + "loss": 0.0009, + "step": 84910 + }, + { + "epoch": 1.3895115765360386, + "grad_norm": 0.04103608801960945, + "learning_rate": 2.579680669603535e-06, + "loss": 0.0007, + "step": 84920 + }, + { + "epoch": 1.3896752024871144, + "grad_norm": 0.010032951831817627, + "learning_rate": 2.5784313040976076e-06, + "loss": 0.001, + "step": 84930 + }, + { + "epoch": 1.3898388284381902, + "grad_norm": 0.01916610635817051, + "learning_rate": 2.5771821360865867e-06, + "loss": 0.0006, + "step": 84940 + }, + { + "epoch": 1.3900024543892662, + "grad_norm": 0.03895369544625282, + "learning_rate": 2.5759331656723452e-06, + "loss": 0.0013, + "step": 84950 + }, + { + "epoch": 1.390166080340342, + "grad_norm": 0.0250666756182909, + "learning_rate": 2.574684392956749e-06, + "loss": 0.0009, + "step": 84960 + }, + { + "epoch": 1.3903297062914177, + "grad_norm": 0.04116955026984215, + "learning_rate": 2.573435818041641e-06, + "loss": 0.001, + "step": 84970 + }, + { + "epoch": 1.3904933322424937, + "grad_norm": 0.06238459050655365, + "learning_rate": 2.5721874410288527e-06, + "loss": 0.001, + "step": 84980 + }, + { + "epoch": 1.3906569581935695, + "grad_norm": 0.0622844360768795, + "learning_rate": 2.5709392620201955e-06, + "loss": 0.0012, + "step": 84990 + }, + { + "epoch": 1.3908205841446453, + "grad_norm": 0.030324935913085938, + "learning_rate": 2.569691281117469e-06, + "loss": 0.0011, + "step": 85000 + }, + { + "epoch": 1.3909842100957213, + "grad_norm": 0.06983377784490585, + "learning_rate": 2.5684434984224516e-06, + "loss": 0.002, + "step": 85010 + }, + { + "epoch": 1.391147836046797, + "grad_norm": 0.011028064414858818, + "learning_rate": 2.567195914036911e-06, + "loss": 0.0007, + "step": 85020 + }, + { + "epoch": 1.3913114619978728, + "grad_norm": 0.012050114572048187, + "learning_rate": 2.565948528062593e-06, + "loss": 0.0006, + "step": 85030 + }, + { + "epoch": 1.3914750879489488, + "grad_norm": 0.20281486213207245, + "learning_rate": 2.5647013406012333e-06, + "loss": 0.002, + "step": 85040 + }, + { + "epoch": 1.3916387139000246, + "grad_norm": 0.07458246499300003, + "learning_rate": 2.563454351754545e-06, + "loss": 0.0008, + "step": 85050 + }, + { + "epoch": 1.3918023398511004, + "grad_norm": 0.02601015754044056, + "learning_rate": 2.562207561624232e-06, + "loss": 0.0009, + "step": 85060 + }, + { + "epoch": 1.3919659658021764, + "grad_norm": 0.03659119829535484, + "learning_rate": 2.5609609703119743e-06, + "loss": 0.0007, + "step": 85070 + }, + { + "epoch": 1.3921295917532521, + "grad_norm": 0.04428410157561302, + "learning_rate": 2.5597145779194433e-06, + "loss": 0.0007, + "step": 85080 + }, + { + "epoch": 1.392293217704328, + "grad_norm": 0.057671818882226944, + "learning_rate": 2.5584683845482868e-06, + "loss": 0.0013, + "step": 85090 + }, + { + "epoch": 1.3924568436554037, + "grad_norm": 0.05824888497591019, + "learning_rate": 2.5572223903001435e-06, + "loss": 0.0008, + "step": 85100 + }, + { + "epoch": 1.3926204696064794, + "grad_norm": 0.008045400492846966, + "learning_rate": 2.5559765952766313e-06, + "loss": 0.0011, + "step": 85110 + }, + { + "epoch": 1.3927840955575554, + "grad_norm": 0.05307982489466667, + "learning_rate": 2.5547309995793512e-06, + "loss": 0.0007, + "step": 85120 + }, + { + "epoch": 1.3929477215086312, + "grad_norm": 0.04576743394136429, + "learning_rate": 2.553485603309893e-06, + "loss": 0.0015, + "step": 85130 + }, + { + "epoch": 1.393111347459707, + "grad_norm": 0.05727805569767952, + "learning_rate": 2.552240406569823e-06, + "loss": 0.0007, + "step": 85140 + }, + { + "epoch": 1.393274973410783, + "grad_norm": 0.057195328176021576, + "learning_rate": 2.550995409460699e-06, + "loss": 0.0011, + "step": 85150 + }, + { + "epoch": 1.3934385993618588, + "grad_norm": 0.14627176523208618, + "learning_rate": 2.5497506120840563e-06, + "loss": 0.0013, + "step": 85160 + }, + { + "epoch": 1.3936022253129345, + "grad_norm": 0.0019530359422788024, + "learning_rate": 2.5485060145414186e-06, + "loss": 0.0005, + "step": 85170 + }, + { + "epoch": 1.3937658512640105, + "grad_norm": 0.05330021306872368, + "learning_rate": 2.5472616169342878e-06, + "loss": 0.0012, + "step": 85180 + }, + { + "epoch": 1.3939294772150863, + "grad_norm": 0.004152234178036451, + "learning_rate": 2.5460174193641564e-06, + "loss": 0.0014, + "step": 85190 + }, + { + "epoch": 1.394093103166162, + "grad_norm": 0.017515040934085846, + "learning_rate": 2.5447734219324937e-06, + "loss": 0.0013, + "step": 85200 + }, + { + "epoch": 1.394256729117238, + "grad_norm": 0.003851355519145727, + "learning_rate": 2.543529624740758e-06, + "loss": 0.0008, + "step": 85210 + }, + { + "epoch": 1.3944203550683139, + "grad_norm": 0.056642767041921616, + "learning_rate": 2.5422860278903884e-06, + "loss": 0.0019, + "step": 85220 + }, + { + "epoch": 1.3945839810193896, + "grad_norm": 0.07989409565925598, + "learning_rate": 2.5410426314828084e-06, + "loss": 0.0009, + "step": 85230 + }, + { + "epoch": 1.3947476069704656, + "grad_norm": 0.0885382816195488, + "learning_rate": 2.5397994356194246e-06, + "loss": 0.0008, + "step": 85240 + }, + { + "epoch": 1.3949112329215414, + "grad_norm": 0.020241836085915565, + "learning_rate": 2.5385564404016305e-06, + "loss": 0.0005, + "step": 85250 + }, + { + "epoch": 1.3950748588726172, + "grad_norm": 0.056566961109638214, + "learning_rate": 2.537313645930796e-06, + "loss": 0.0008, + "step": 85260 + }, + { + "epoch": 1.3952384848236932, + "grad_norm": 0.013217160478234291, + "learning_rate": 2.536071052308284e-06, + "loss": 0.0007, + "step": 85270 + }, + { + "epoch": 1.395402110774769, + "grad_norm": 0.006033493205904961, + "learning_rate": 2.534828659635432e-06, + "loss": 0.001, + "step": 85280 + }, + { + "epoch": 1.3955657367258447, + "grad_norm": 0.0031652438919991255, + "learning_rate": 2.5335864680135693e-06, + "loss": 0.0009, + "step": 85290 + }, + { + "epoch": 1.3957293626769205, + "grad_norm": 0.0006656902260147035, + "learning_rate": 2.5323444775440007e-06, + "loss": 0.001, + "step": 85300 + }, + { + "epoch": 1.3958929886279963, + "grad_norm": 0.061206698417663574, + "learning_rate": 2.531102688328023e-06, + "loss": 0.0009, + "step": 85310 + }, + { + "epoch": 1.3960566145790723, + "grad_norm": 0.00743982195854187, + "learning_rate": 2.5298611004669083e-06, + "loss": 0.001, + "step": 85320 + }, + { + "epoch": 1.396220240530148, + "grad_norm": 0.03856433182954788, + "learning_rate": 2.5286197140619193e-06, + "loss": 0.001, + "step": 85330 + }, + { + "epoch": 1.3963838664812238, + "grad_norm": 0.03750927746295929, + "learning_rate": 2.5273785292142973e-06, + "loss": 0.001, + "step": 85340 + }, + { + "epoch": 1.3965474924322998, + "grad_norm": 0.009348989464342594, + "learning_rate": 2.5261375460252707e-06, + "loss": 0.0012, + "step": 85350 + }, + { + "epoch": 1.3967111183833756, + "grad_norm": 0.018941443413496017, + "learning_rate": 2.5248967645960476e-06, + "loss": 0.0011, + "step": 85360 + }, + { + "epoch": 1.3968747443344514, + "grad_norm": 0.0721622183918953, + "learning_rate": 2.5236561850278253e-06, + "loss": 0.0029, + "step": 85370 + }, + { + "epoch": 1.3970383702855274, + "grad_norm": 0.04899194464087486, + "learning_rate": 2.522415807421777e-06, + "loss": 0.001, + "step": 85380 + }, + { + "epoch": 1.3972019962366031, + "grad_norm": 0.043110739439725876, + "learning_rate": 2.521175631879068e-06, + "loss": 0.0007, + "step": 85390 + }, + { + "epoch": 1.397365622187679, + "grad_norm": 0.03885679319500923, + "learning_rate": 2.5199356585008393e-06, + "loss": 0.0011, + "step": 85400 + }, + { + "epoch": 1.397529248138755, + "grad_norm": 0.03648418188095093, + "learning_rate": 2.5186958873882216e-06, + "loss": 0.0005, + "step": 85410 + }, + { + "epoch": 1.3976928740898307, + "grad_norm": 0.03852356970310211, + "learning_rate": 2.517456318642326e-06, + "loss": 0.0009, + "step": 85420 + }, + { + "epoch": 1.3978565000409064, + "grad_norm": 0.032116807997226715, + "learning_rate": 2.5162169523642443e-06, + "loss": 0.0008, + "step": 85430 + }, + { + "epoch": 1.3980201259919824, + "grad_norm": 0.01025864202529192, + "learning_rate": 2.5149777886550597e-06, + "loss": 0.0006, + "step": 85440 + }, + { + "epoch": 1.3981837519430582, + "grad_norm": 0.018044443801045418, + "learning_rate": 2.513738827615829e-06, + "loss": 0.0016, + "step": 85450 + }, + { + "epoch": 1.398347377894134, + "grad_norm": 0.04339112341403961, + "learning_rate": 2.512500069347604e-06, + "loss": 0.0009, + "step": 85460 + }, + { + "epoch": 1.39851100384521, + "grad_norm": 0.02578670345246792, + "learning_rate": 2.5112615139514073e-06, + "loss": 0.0006, + "step": 85470 + }, + { + "epoch": 1.3986746297962858, + "grad_norm": 0.02312774583697319, + "learning_rate": 2.5100231615282556e-06, + "loss": 0.001, + "step": 85480 + }, + { + "epoch": 1.3988382557473615, + "grad_norm": 0.11079282313585281, + "learning_rate": 2.5087850121791422e-06, + "loss": 0.0018, + "step": 85490 + }, + { + "epoch": 1.3990018816984373, + "grad_norm": 0.18890371918678284, + "learning_rate": 2.5075470660050487e-06, + "loss": 0.0014, + "step": 85500 + }, + { + "epoch": 1.399165507649513, + "grad_norm": 0.0513865128159523, + "learning_rate": 2.5063093231069346e-06, + "loss": 0.0009, + "step": 85510 + }, + { + "epoch": 1.399329133600589, + "grad_norm": 0.09298305958509445, + "learning_rate": 2.5050717835857497e-06, + "loss": 0.0011, + "step": 85520 + }, + { + "epoch": 1.3994927595516649, + "grad_norm": 0.02303282916545868, + "learning_rate": 2.50383444754242e-06, + "loss": 0.0011, + "step": 85530 + }, + { + "epoch": 1.3996563855027406, + "grad_norm": 0.10980135202407837, + "learning_rate": 2.502597315077862e-06, + "loss": 0.0014, + "step": 85540 + }, + { + "epoch": 1.3998200114538166, + "grad_norm": 0.020838702097535133, + "learning_rate": 2.501360386292968e-06, + "loss": 0.0012, + "step": 85550 + }, + { + "epoch": 1.3999836374048924, + "grad_norm": 0.014282509684562683, + "learning_rate": 2.5001236612886216e-06, + "loss": 0.0006, + "step": 85560 + }, + { + "epoch": 1.4001472633559682, + "grad_norm": 0.017213759943842888, + "learning_rate": 2.498887140165683e-06, + "loss": 0.001, + "step": 85570 + }, + { + "epoch": 1.4003108893070442, + "grad_norm": 0.060502659529447556, + "learning_rate": 2.497650823025001e-06, + "loss": 0.0012, + "step": 85580 + }, + { + "epoch": 1.40047451525812, + "grad_norm": 0.05678383260965347, + "learning_rate": 2.496414709967402e-06, + "loss": 0.0011, + "step": 85590 + }, + { + "epoch": 1.4006381412091957, + "grad_norm": 0.06617666035890579, + "learning_rate": 2.495178801093703e-06, + "loss": 0.0019, + "step": 85600 + }, + { + "epoch": 1.4008017671602717, + "grad_norm": 0.05336611717939377, + "learning_rate": 2.4939430965046974e-06, + "loss": 0.0006, + "step": 85610 + }, + { + "epoch": 1.4009653931113475, + "grad_norm": 0.07783443480730057, + "learning_rate": 2.4927075963011684e-06, + "loss": 0.0017, + "step": 85620 + }, + { + "epoch": 1.4011290190624233, + "grad_norm": 0.09148307144641876, + "learning_rate": 2.491472300583875e-06, + "loss": 0.0009, + "step": 85630 + }, + { + "epoch": 1.4012926450134993, + "grad_norm": 0.013781259767711163, + "learning_rate": 2.490237209453569e-06, + "loss": 0.0007, + "step": 85640 + }, + { + "epoch": 1.401456270964575, + "grad_norm": 0.09493014216423035, + "learning_rate": 2.4890023230109744e-06, + "loss": 0.0018, + "step": 85650 + }, + { + "epoch": 1.4016198969156508, + "grad_norm": 0.03304464742541313, + "learning_rate": 2.487767641356809e-06, + "loss": 0.0007, + "step": 85660 + }, + { + "epoch": 1.4017835228667266, + "grad_norm": 0.1106976643204689, + "learning_rate": 2.4865331645917663e-06, + "loss": 0.0011, + "step": 85670 + }, + { + "epoch": 1.4019471488178026, + "grad_norm": 0.037574999034404755, + "learning_rate": 2.4852988928165283e-06, + "loss": 0.0009, + "step": 85680 + }, + { + "epoch": 1.4021107747688784, + "grad_norm": 0.12366770952939987, + "learning_rate": 2.4840648261317553e-06, + "loss": 0.0033, + "step": 85690 + }, + { + "epoch": 1.4022744007199541, + "grad_norm": 0.029934987425804138, + "learning_rate": 2.4828309646380964e-06, + "loss": 0.0009, + "step": 85700 + }, + { + "epoch": 1.40243802667103, + "grad_norm": 0.04335254430770874, + "learning_rate": 2.481597308436179e-06, + "loss": 0.0031, + "step": 85710 + }, + { + "epoch": 1.402601652622106, + "grad_norm": 0.07084373384714127, + "learning_rate": 2.480363857626618e-06, + "loss": 0.0014, + "step": 85720 + }, + { + "epoch": 1.4027652785731817, + "grad_norm": 0.006869551260024309, + "learning_rate": 2.4791306123100066e-06, + "loss": 0.0016, + "step": 85730 + }, + { + "epoch": 1.4029289045242574, + "grad_norm": 0.033188678324222565, + "learning_rate": 2.4778975725869275e-06, + "loss": 0.0011, + "step": 85740 + }, + { + "epoch": 1.4030925304753334, + "grad_norm": 0.05597894638776779, + "learning_rate": 2.4766647385579416e-06, + "loss": 0.0008, + "step": 85750 + }, + { + "epoch": 1.4032561564264092, + "grad_norm": 0.08012067526578903, + "learning_rate": 2.4754321103235923e-06, + "loss": 0.0013, + "step": 85760 + }, + { + "epoch": 1.403419782377485, + "grad_norm": 0.03605680912733078, + "learning_rate": 2.4741996879844134e-06, + "loss": 0.0007, + "step": 85770 + }, + { + "epoch": 1.403583408328561, + "grad_norm": 0.032011374831199646, + "learning_rate": 2.4729674716409114e-06, + "loss": 0.0018, + "step": 85780 + }, + { + "epoch": 1.4037470342796368, + "grad_norm": 0.05075502023100853, + "learning_rate": 2.471735461393587e-06, + "loss": 0.0013, + "step": 85790 + }, + { + "epoch": 1.4039106602307125, + "grad_norm": 0.025442451238632202, + "learning_rate": 2.4705036573429153e-06, + "loss": 0.0008, + "step": 85800 + }, + { + "epoch": 1.4040742861817885, + "grad_norm": 0.01608380116522312, + "learning_rate": 2.4692720595893606e-06, + "loss": 0.0023, + "step": 85810 + }, + { + "epoch": 1.4042379121328643, + "grad_norm": 0.05921518802642822, + "learning_rate": 2.468040668233364e-06, + "loss": 0.0012, + "step": 85820 + }, + { + "epoch": 1.40440153808394, + "grad_norm": 0.012568363919854164, + "learning_rate": 2.4668094833753584e-06, + "loss": 0.0007, + "step": 85830 + }, + { + "epoch": 1.404565164035016, + "grad_norm": 0.0022954826708883047, + "learning_rate": 2.465578505115751e-06, + "loss": 0.0011, + "step": 85840 + }, + { + "epoch": 1.4047287899860919, + "grad_norm": 0.1080038920044899, + "learning_rate": 2.4643477335549397e-06, + "loss": 0.0015, + "step": 85850 + }, + { + "epoch": 1.4048924159371676, + "grad_norm": 0.011237172409892082, + "learning_rate": 2.4631171687932982e-06, + "loss": 0.001, + "step": 85860 + }, + { + "epoch": 1.4050560418882434, + "grad_norm": 0.10426940768957138, + "learning_rate": 2.461886810931191e-06, + "loss": 0.0017, + "step": 85870 + }, + { + "epoch": 1.4052196678393194, + "grad_norm": 0.0911511778831482, + "learning_rate": 2.4606566600689586e-06, + "loss": 0.0015, + "step": 85880 + }, + { + "epoch": 1.4053832937903952, + "grad_norm": 0.0014476514188572764, + "learning_rate": 2.459426716306931e-06, + "loss": 0.0007, + "step": 85890 + }, + { + "epoch": 1.405546919741471, + "grad_norm": 0.0435417965054512, + "learning_rate": 2.4581969797454146e-06, + "loss": 0.0014, + "step": 85900 + }, + { + "epoch": 1.4057105456925467, + "grad_norm": 0.05716950446367264, + "learning_rate": 2.456967450484707e-06, + "loss": 0.0009, + "step": 85910 + }, + { + "epoch": 1.4058741716436227, + "grad_norm": 0.007711363025009632, + "learning_rate": 2.4557381286250793e-06, + "loss": 0.0008, + "step": 85920 + }, + { + "epoch": 1.4060377975946985, + "grad_norm": 0.054224394261837006, + "learning_rate": 2.454509014266796e-06, + "loss": 0.0009, + "step": 85930 + }, + { + "epoch": 1.4062014235457743, + "grad_norm": 0.01425554696470499, + "learning_rate": 2.453280107510095e-06, + "loss": 0.0011, + "step": 85940 + }, + { + "epoch": 1.4063650494968503, + "grad_norm": 0.05185844376683235, + "learning_rate": 2.4520514084552054e-06, + "loss": 0.001, + "step": 85950 + }, + { + "epoch": 1.406528675447926, + "grad_norm": 0.040036190301179886, + "learning_rate": 2.4508229172023324e-06, + "loss": 0.0018, + "step": 85960 + }, + { + "epoch": 1.4066923013990018, + "grad_norm": 0.026100879535079002, + "learning_rate": 2.4495946338516706e-06, + "loss": 0.0014, + "step": 85970 + }, + { + "epoch": 1.4068559273500778, + "grad_norm": 0.04681260883808136, + "learning_rate": 2.4483665585033923e-06, + "loss": 0.0031, + "step": 85980 + }, + { + "epoch": 1.4070195533011536, + "grad_norm": 0.07585979998111725, + "learning_rate": 2.4471386912576574e-06, + "loss": 0.0007, + "step": 85990 + }, + { + "epoch": 1.4071831792522294, + "grad_norm": 0.09745154529809952, + "learning_rate": 2.4459110322146034e-06, + "loss": 0.0017, + "step": 86000 + }, + { + "epoch": 1.4073468052033054, + "grad_norm": 0.06328898668289185, + "learning_rate": 2.4446835814743576e-06, + "loss": 0.0016, + "step": 86010 + }, + { + "epoch": 1.4075104311543811, + "grad_norm": 0.03818634897470474, + "learning_rate": 2.443456339137023e-06, + "loss": 0.0009, + "step": 86020 + }, + { + "epoch": 1.407674057105457, + "grad_norm": 0.0605267770588398, + "learning_rate": 2.4422293053026936e-06, + "loss": 0.0011, + "step": 86030 + }, + { + "epoch": 1.407837683056533, + "grad_norm": 0.04194982722401619, + "learning_rate": 2.4410024800714376e-06, + "loss": 0.0014, + "step": 86040 + }, + { + "epoch": 1.4080013090076087, + "grad_norm": 0.18008437752723694, + "learning_rate": 2.439775863543315e-06, + "loss": 0.0027, + "step": 86050 + }, + { + "epoch": 1.4081649349586844, + "grad_norm": 0.050394292920827866, + "learning_rate": 2.438549455818362e-06, + "loss": 0.0011, + "step": 86060 + }, + { + "epoch": 1.4083285609097602, + "grad_norm": 0.0009599836193956435, + "learning_rate": 2.4373232569965987e-06, + "loss": 0.001, + "step": 86070 + }, + { + "epoch": 1.408492186860836, + "grad_norm": 0.011497218161821365, + "learning_rate": 2.436097267178034e-06, + "loss": 0.0006, + "step": 86080 + }, + { + "epoch": 1.408655812811912, + "grad_norm": 0.026954131200909615, + "learning_rate": 2.4348714864626505e-06, + "loss": 0.0008, + "step": 86090 + }, + { + "epoch": 1.4088194387629878, + "grad_norm": 0.040744245052337646, + "learning_rate": 2.4336459149504233e-06, + "loss": 0.0007, + "step": 86100 + }, + { + "epoch": 1.4089830647140635, + "grad_norm": 0.09333061426877975, + "learning_rate": 2.432420552741302e-06, + "loss": 0.0013, + "step": 86110 + }, + { + "epoch": 1.4091466906651395, + "grad_norm": 0.033017560839653015, + "learning_rate": 2.4311953999352265e-06, + "loss": 0.0008, + "step": 86120 + }, + { + "epoch": 1.4093103166162153, + "grad_norm": 0.03325832262635231, + "learning_rate": 2.4299704566321123e-06, + "loss": 0.0012, + "step": 86130 + }, + { + "epoch": 1.409473942567291, + "grad_norm": 0.023710599169135094, + "learning_rate": 2.428745722931866e-06, + "loss": 0.0013, + "step": 86140 + }, + { + "epoch": 1.409637568518367, + "grad_norm": 0.1300840526819229, + "learning_rate": 2.427521198934369e-06, + "loss": 0.0014, + "step": 86150 + }, + { + "epoch": 1.4098011944694429, + "grad_norm": 0.1166706532239914, + "learning_rate": 2.426296884739491e-06, + "loss": 0.0012, + "step": 86160 + }, + { + "epoch": 1.4099648204205186, + "grad_norm": 0.06065169721841812, + "learning_rate": 2.425072780447082e-06, + "loss": 0.0007, + "step": 86170 + }, + { + "epoch": 1.4101284463715946, + "grad_norm": 0.003930584527552128, + "learning_rate": 2.423848886156978e-06, + "loss": 0.0012, + "step": 86180 + }, + { + "epoch": 1.4102920723226704, + "grad_norm": 0.036340244114398956, + "learning_rate": 2.4226252019689923e-06, + "loss": 0.0007, + "step": 86190 + }, + { + "epoch": 1.4104556982737462, + "grad_norm": 0.05645986273884773, + "learning_rate": 2.4214017279829273e-06, + "loss": 0.0006, + "step": 86200 + }, + { + "epoch": 1.4106193242248222, + "grad_norm": 0.029943697154521942, + "learning_rate": 2.4201784642985625e-06, + "loss": 0.0021, + "step": 86210 + }, + { + "epoch": 1.410782950175898, + "grad_norm": 0.07911103218793869, + "learning_rate": 2.418955411015667e-06, + "loss": 0.0006, + "step": 86220 + }, + { + "epoch": 1.4109465761269737, + "grad_norm": 0.036517757922410965, + "learning_rate": 2.4177325682339854e-06, + "loss": 0.0016, + "step": 86230 + }, + { + "epoch": 1.4111102020780497, + "grad_norm": 0.007980478927493095, + "learning_rate": 2.4165099360532505e-06, + "loss": 0.0011, + "step": 86240 + }, + { + "epoch": 1.4112738280291255, + "grad_norm": 0.04216919094324112, + "learning_rate": 2.4152875145731747e-06, + "loss": 0.0007, + "step": 86250 + }, + { + "epoch": 1.4114374539802013, + "grad_norm": 0.033767424523830414, + "learning_rate": 2.414065303893456e-06, + "loss": 0.0015, + "step": 86260 + }, + { + "epoch": 1.411601079931277, + "grad_norm": 0.019887957721948624, + "learning_rate": 2.412843304113772e-06, + "loss": 0.0011, + "step": 86270 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.03932947292923927, + "learning_rate": 2.411621515333788e-06, + "loss": 0.0009, + "step": 86280 + }, + { + "epoch": 1.4119283318334288, + "grad_norm": 0.054890550673007965, + "learning_rate": 2.4103999376531445e-06, + "loss": 0.0013, + "step": 86290 + }, + { + "epoch": 1.4120919577845046, + "grad_norm": 0.025379173457622528, + "learning_rate": 2.4091785711714734e-06, + "loss": 0.0009, + "step": 86300 + }, + { + "epoch": 1.4122555837355804, + "grad_norm": 0.022573180496692657, + "learning_rate": 2.407957415988381e-06, + "loss": 0.0015, + "step": 86310 + }, + { + "epoch": 1.4124192096866564, + "grad_norm": 0.030358633026480675, + "learning_rate": 2.4067364722034653e-06, + "loss": 0.001, + "step": 86320 + }, + { + "epoch": 1.4125828356377321, + "grad_norm": 0.05885162204504013, + "learning_rate": 2.4055157399162975e-06, + "loss": 0.0008, + "step": 86330 + }, + { + "epoch": 1.412746461588808, + "grad_norm": 0.06898149102926254, + "learning_rate": 2.4042952192264413e-06, + "loss": 0.0012, + "step": 86340 + }, + { + "epoch": 1.412910087539884, + "grad_norm": 0.023307858034968376, + "learning_rate": 2.403074910233433e-06, + "loss": 0.003, + "step": 86350 + }, + { + "epoch": 1.4130737134909597, + "grad_norm": 0.03626040369272232, + "learning_rate": 2.401854813036802e-06, + "loss": 0.0023, + "step": 86360 + }, + { + "epoch": 1.4132373394420354, + "grad_norm": 0.0758916512131691, + "learning_rate": 2.400634927736051e-06, + "loss": 0.0012, + "step": 86370 + }, + { + "epoch": 1.4134009653931114, + "grad_norm": 0.04178851842880249, + "learning_rate": 2.3994152544306725e-06, + "loss": 0.0014, + "step": 86380 + }, + { + "epoch": 1.4135645913441872, + "grad_norm": 0.04437820985913277, + "learning_rate": 2.398195793220139e-06, + "loss": 0.0014, + "step": 86390 + }, + { + "epoch": 1.413728217295263, + "grad_norm": 0.03609314188361168, + "learning_rate": 2.3969765442039017e-06, + "loss": 0.001, + "step": 86400 + }, + { + "epoch": 1.413891843246339, + "grad_norm": 0.02759036235511303, + "learning_rate": 2.3957575074814037e-06, + "loss": 0.0007, + "step": 86410 + }, + { + "epoch": 1.4140554691974148, + "grad_norm": 0.019759638234972954, + "learning_rate": 2.3945386831520606e-06, + "loss": 0.0012, + "step": 86420 + }, + { + "epoch": 1.4142190951484905, + "grad_norm": 0.074102982878685, + "learning_rate": 2.3933200713152806e-06, + "loss": 0.0012, + "step": 86430 + }, + { + "epoch": 1.4143827210995663, + "grad_norm": 0.041933875530958176, + "learning_rate": 2.3921016720704444e-06, + "loss": 0.001, + "step": 86440 + }, + { + "epoch": 1.4145463470506423, + "grad_norm": 0.050445713102817535, + "learning_rate": 2.390883485516925e-06, + "loss": 0.0026, + "step": 86450 + }, + { + "epoch": 1.414709973001718, + "grad_norm": 0.044565193355083466, + "learning_rate": 2.3896655117540695e-06, + "loss": 0.0006, + "step": 86460 + }, + { + "epoch": 1.4148735989527939, + "grad_norm": 0.07884562760591507, + "learning_rate": 2.388447750881216e-06, + "loss": 0.0009, + "step": 86470 + }, + { + "epoch": 1.4150372249038696, + "grad_norm": 0.058861423283815384, + "learning_rate": 2.3872302029976763e-06, + "loss": 0.0012, + "step": 86480 + }, + { + "epoch": 1.4152008508549456, + "grad_norm": 0.011417794972658157, + "learning_rate": 2.386012868202754e-06, + "loss": 0.0016, + "step": 86490 + }, + { + "epoch": 1.4153644768060214, + "grad_norm": 0.060272276401519775, + "learning_rate": 2.384795746595727e-06, + "loss": 0.0016, + "step": 86500 + }, + { + "epoch": 1.4155281027570972, + "grad_norm": 0.06133784353733063, + "learning_rate": 2.383578838275863e-06, + "loss": 0.0007, + "step": 86510 + }, + { + "epoch": 1.4156917287081732, + "grad_norm": 0.04298656806349754, + "learning_rate": 2.382362143342405e-06, + "loss": 0.0008, + "step": 86520 + }, + { + "epoch": 1.415855354659249, + "grad_norm": 0.06629273295402527, + "learning_rate": 2.381145661894586e-06, + "loss": 0.0012, + "step": 86530 + }, + { + "epoch": 1.4160189806103247, + "grad_norm": 0.050635725259780884, + "learning_rate": 2.379929394031616e-06, + "loss": 0.0014, + "step": 86540 + }, + { + "epoch": 1.4161826065614007, + "grad_norm": 0.02372150681912899, + "learning_rate": 2.378713339852691e-06, + "loss": 0.0011, + "step": 86550 + }, + { + "epoch": 1.4163462325124765, + "grad_norm": 0.07492823898792267, + "learning_rate": 2.3774974994569866e-06, + "loss": 0.0021, + "step": 86560 + }, + { + "epoch": 1.4165098584635523, + "grad_norm": 0.06238546967506409, + "learning_rate": 2.3762818729436653e-06, + "loss": 0.0013, + "step": 86570 + }, + { + "epoch": 1.4166734844146283, + "grad_norm": 0.08350105583667755, + "learning_rate": 2.375066460411866e-06, + "loss": 0.0019, + "step": 86580 + }, + { + "epoch": 1.416837110365704, + "grad_norm": 0.12237215042114258, + "learning_rate": 2.373851261960717e-06, + "loss": 0.0018, + "step": 86590 + }, + { + "epoch": 1.4170007363167798, + "grad_norm": 0.025111624971032143, + "learning_rate": 2.3726362776893223e-06, + "loss": 0.0021, + "step": 86600 + }, + { + "epoch": 1.4171643622678558, + "grad_norm": 0.02818678505718708, + "learning_rate": 2.371421507696776e-06, + "loss": 0.0008, + "step": 86610 + }, + { + "epoch": 1.4173279882189316, + "grad_norm": 0.05422881245613098, + "learning_rate": 2.3702069520821482e-06, + "loss": 0.001, + "step": 86620 + }, + { + "epoch": 1.4174916141700074, + "grad_norm": 0.04695925489068031, + "learning_rate": 2.368992610944494e-06, + "loss": 0.0007, + "step": 86630 + }, + { + "epoch": 1.4176552401210831, + "grad_norm": 0.009760403074324131, + "learning_rate": 2.367778484382849e-06, + "loss": 0.0012, + "step": 86640 + }, + { + "epoch": 1.4178188660721591, + "grad_norm": 0.020390138030052185, + "learning_rate": 2.3665645724962365e-06, + "loss": 0.0011, + "step": 86650 + }, + { + "epoch": 1.417982492023235, + "grad_norm": 0.08425728976726532, + "learning_rate": 2.365350875383657e-06, + "loss": 0.0013, + "step": 86660 + }, + { + "epoch": 1.4181461179743107, + "grad_norm": 0.013156197965145111, + "learning_rate": 2.3641373931440976e-06, + "loss": 0.0012, + "step": 86670 + }, + { + "epoch": 1.4183097439253864, + "grad_norm": 0.014625060372054577, + "learning_rate": 2.3629241258765224e-06, + "loss": 0.0011, + "step": 86680 + }, + { + "epoch": 1.4184733698764624, + "grad_norm": 0.15262331068515778, + "learning_rate": 2.3617110736798854e-06, + "loss": 0.0026, + "step": 86690 + }, + { + "epoch": 1.4186369958275382, + "grad_norm": 0.018499836325645447, + "learning_rate": 2.3604982366531154e-06, + "loss": 0.0007, + "step": 86700 + }, + { + "epoch": 1.418800621778614, + "grad_norm": 0.03466907516121864, + "learning_rate": 2.3592856148951306e-06, + "loss": 0.0015, + "step": 86710 + }, + { + "epoch": 1.41896424772969, + "grad_norm": 0.025678353384137154, + "learning_rate": 2.358073208504824e-06, + "loss": 0.0011, + "step": 86720 + }, + { + "epoch": 1.4191278736807658, + "grad_norm": 0.029115119948983192, + "learning_rate": 2.356861017581081e-06, + "loss": 0.0013, + "step": 86730 + }, + { + "epoch": 1.4192914996318415, + "grad_norm": 0.006678566336631775, + "learning_rate": 2.355649042222758e-06, + "loss": 0.0009, + "step": 86740 + }, + { + "epoch": 1.4194551255829175, + "grad_norm": 0.08240357786417007, + "learning_rate": 2.354437282528704e-06, + "loss": 0.0013, + "step": 86750 + }, + { + "epoch": 1.4196187515339933, + "grad_norm": 0.0028957827016711235, + "learning_rate": 2.3532257385977433e-06, + "loss": 0.0013, + "step": 86760 + }, + { + "epoch": 1.419782377485069, + "grad_norm": 0.04983939230442047, + "learning_rate": 2.3520144105286867e-06, + "loss": 0.0007, + "step": 86770 + }, + { + "epoch": 1.419946003436145, + "grad_norm": 0.042318642139434814, + "learning_rate": 2.3508032984203248e-06, + "loss": 0.0008, + "step": 86780 + }, + { + "epoch": 1.4201096293872209, + "grad_norm": 0.01818975619971752, + "learning_rate": 2.3495924023714338e-06, + "loss": 0.001, + "step": 86790 + }, + { + "epoch": 1.4202732553382966, + "grad_norm": 0.015192101709544659, + "learning_rate": 2.3483817224807677e-06, + "loss": 0.0014, + "step": 86800 + }, + { + "epoch": 1.4204368812893726, + "grad_norm": 0.031194766983389854, + "learning_rate": 2.3471712588470684e-06, + "loss": 0.0013, + "step": 86810 + }, + { + "epoch": 1.4206005072404484, + "grad_norm": 0.10979337245225906, + "learning_rate": 2.3459610115690534e-06, + "loss": 0.0018, + "step": 86820 + }, + { + "epoch": 1.4207641331915242, + "grad_norm": 0.09447083622217178, + "learning_rate": 2.3447509807454305e-06, + "loss": 0.0013, + "step": 86830 + }, + { + "epoch": 1.4209277591426, + "grad_norm": 0.06405788660049438, + "learning_rate": 2.343541166474882e-06, + "loss": 0.001, + "step": 86840 + }, + { + "epoch": 1.4210913850936757, + "grad_norm": 0.07527408748865128, + "learning_rate": 2.3423315688560794e-06, + "loss": 0.0017, + "step": 86850 + }, + { + "epoch": 1.4212550110447517, + "grad_norm": 0.0402020663022995, + "learning_rate": 2.3411221879876722e-06, + "loss": 0.0022, + "step": 86860 + }, + { + "epoch": 1.4214186369958275, + "grad_norm": 0.006235907785594463, + "learning_rate": 2.339913023968291e-06, + "loss": 0.0008, + "step": 86870 + }, + { + "epoch": 1.4215822629469033, + "grad_norm": 0.003380571026355028, + "learning_rate": 2.3387040768965553e-06, + "loss": 0.0012, + "step": 86880 + }, + { + "epoch": 1.4217458888979793, + "grad_norm": 0.10519898682832718, + "learning_rate": 2.337495346871058e-06, + "loss": 0.0013, + "step": 86890 + }, + { + "epoch": 1.421909514849055, + "grad_norm": 0.010904007591307163, + "learning_rate": 2.3362868339903843e-06, + "loss": 0.001, + "step": 86900 + }, + { + "epoch": 1.4220731408001308, + "grad_norm": 0.02424856647849083, + "learning_rate": 2.3350785383530918e-06, + "loss": 0.0011, + "step": 86910 + }, + { + "epoch": 1.4222367667512068, + "grad_norm": 0.03161376342177391, + "learning_rate": 2.3338704600577285e-06, + "loss": 0.0012, + "step": 86920 + }, + { + "epoch": 1.4224003927022826, + "grad_norm": 0.07663678377866745, + "learning_rate": 2.3326625992028175e-06, + "loss": 0.0009, + "step": 86930 + }, + { + "epoch": 1.4225640186533584, + "grad_norm": 0.005474789533764124, + "learning_rate": 2.3314549558868717e-06, + "loss": 0.0018, + "step": 86940 + }, + { + "epoch": 1.4227276446044343, + "grad_norm": 0.023548053577542305, + "learning_rate": 2.330247530208379e-06, + "loss": 0.001, + "step": 86950 + }, + { + "epoch": 1.4228912705555101, + "grad_norm": 0.002747412072494626, + "learning_rate": 2.329040322265816e-06, + "loss": 0.0012, + "step": 86960 + }, + { + "epoch": 1.423054896506586, + "grad_norm": 0.04941778630018234, + "learning_rate": 2.3278333321576347e-06, + "loss": 0.0008, + "step": 86970 + }, + { + "epoch": 1.423218522457662, + "grad_norm": 0.07427161931991577, + "learning_rate": 2.326626559982278e-06, + "loss": 0.0008, + "step": 86980 + }, + { + "epoch": 1.4233821484087377, + "grad_norm": 0.15990613400936127, + "learning_rate": 2.325420005838161e-06, + "loss": 0.0009, + "step": 86990 + }, + { + "epoch": 1.4235457743598134, + "grad_norm": 0.06716874241828918, + "learning_rate": 2.3242136698236905e-06, + "loss": 0.0011, + "step": 87000 + }, + { + "epoch": 1.4237094003108894, + "grad_norm": 0.025656558573246002, + "learning_rate": 2.3230075520372473e-06, + "loss": 0.0015, + "step": 87010 + }, + { + "epoch": 1.4238730262619652, + "grad_norm": 0.10640022903680801, + "learning_rate": 2.321801652577203e-06, + "loss": 0.0015, + "step": 87020 + }, + { + "epoch": 1.424036652213041, + "grad_norm": 0.09093756228685379, + "learning_rate": 2.3205959715419014e-06, + "loss": 0.0015, + "step": 87030 + }, + { + "epoch": 1.4242002781641168, + "grad_norm": 0.03602614626288414, + "learning_rate": 2.3193905090296783e-06, + "loss": 0.0021, + "step": 87040 + }, + { + "epoch": 1.4243639041151925, + "grad_norm": 0.04593077674508095, + "learning_rate": 2.3181852651388426e-06, + "loss": 0.0011, + "step": 87050 + }, + { + "epoch": 1.4245275300662685, + "grad_norm": 0.03151620179414749, + "learning_rate": 2.316980239967695e-06, + "loss": 0.001, + "step": 87060 + }, + { + "epoch": 1.4246911560173443, + "grad_norm": 0.06314272433519363, + "learning_rate": 2.3157754336145085e-06, + "loss": 0.0008, + "step": 87070 + }, + { + "epoch": 1.42485478196842, + "grad_norm": 0.03376390039920807, + "learning_rate": 2.3145708461775467e-06, + "loss": 0.0006, + "step": 87080 + }, + { + "epoch": 1.425018407919496, + "grad_norm": 0.06842463463544846, + "learning_rate": 2.3133664777550486e-06, + "loss": 0.0017, + "step": 87090 + }, + { + "epoch": 1.4251820338705719, + "grad_norm": 0.007555176038295031, + "learning_rate": 2.3121623284452414e-06, + "loss": 0.0008, + "step": 87100 + }, + { + "epoch": 1.4253456598216476, + "grad_norm": 0.0578356608748436, + "learning_rate": 2.310958398346328e-06, + "loss": 0.0005, + "step": 87110 + }, + { + "epoch": 1.4255092857727236, + "grad_norm": 0.03102637641131878, + "learning_rate": 2.3097546875565006e-06, + "loss": 0.0013, + "step": 87120 + }, + { + "epoch": 1.4256729117237994, + "grad_norm": 0.032348643988370895, + "learning_rate": 2.3085511961739266e-06, + "loss": 0.0009, + "step": 87130 + }, + { + "epoch": 1.4258365376748752, + "grad_norm": 0.16237132251262665, + "learning_rate": 2.307347924296761e-06, + "loss": 0.0011, + "step": 87140 + }, + { + "epoch": 1.4260001636259512, + "grad_norm": 0.03619246557354927, + "learning_rate": 2.3061448720231362e-06, + "loss": 0.0008, + "step": 87150 + }, + { + "epoch": 1.426163789577027, + "grad_norm": 0.2018430531024933, + "learning_rate": 2.304942039451172e-06, + "loss": 0.0005, + "step": 87160 + }, + { + "epoch": 1.4263274155281027, + "grad_norm": 0.024624843150377274, + "learning_rate": 2.3037394266789652e-06, + "loss": 0.0009, + "step": 87170 + }, + { + "epoch": 1.4264910414791787, + "grad_norm": 0.01159820705652237, + "learning_rate": 2.302537033804596e-06, + "loss": 0.0004, + "step": 87180 + }, + { + "epoch": 1.4266546674302545, + "grad_norm": 0.013265382498502731, + "learning_rate": 2.3013348609261303e-06, + "loss": 0.0006, + "step": 87190 + }, + { + "epoch": 1.4268182933813303, + "grad_norm": 0.011792325414717197, + "learning_rate": 2.3001329081416105e-06, + "loss": 0.0015, + "step": 87200 + }, + { + "epoch": 1.4269819193324063, + "grad_norm": 0.031673651188611984, + "learning_rate": 2.298931175549066e-06, + "loss": 0.0013, + "step": 87210 + }, + { + "epoch": 1.427145545283482, + "grad_norm": 0.02618395909667015, + "learning_rate": 2.297729663246504e-06, + "loss": 0.0008, + "step": 87220 + }, + { + "epoch": 1.4273091712345578, + "grad_norm": 0.08764577656984329, + "learning_rate": 2.296528371331918e-06, + "loss": 0.0014, + "step": 87230 + }, + { + "epoch": 1.4274727971856336, + "grad_norm": 0.18212631344795227, + "learning_rate": 2.2953272999032784e-06, + "loss": 0.0014, + "step": 87240 + }, + { + "epoch": 1.4276364231367094, + "grad_norm": 0.06772510707378387, + "learning_rate": 2.2941264490585434e-06, + "loss": 0.0011, + "step": 87250 + }, + { + "epoch": 1.4278000490877854, + "grad_norm": 0.013120506890118122, + "learning_rate": 2.2929258188956484e-06, + "loss": 0.0012, + "step": 87260 + }, + { + "epoch": 1.4279636750388611, + "grad_norm": 0.011625261045992374, + "learning_rate": 2.291725409512514e-06, + "loss": 0.0014, + "step": 87270 + }, + { + "epoch": 1.428127300989937, + "grad_norm": 0.021880803629755974, + "learning_rate": 2.2905252210070395e-06, + "loss": 0.0017, + "step": 87280 + }, + { + "epoch": 1.428290926941013, + "grad_norm": 0.06603413820266724, + "learning_rate": 2.2893252534771115e-06, + "loss": 0.0009, + "step": 87290 + }, + { + "epoch": 1.4284545528920887, + "grad_norm": 0.01816193014383316, + "learning_rate": 2.288125507020591e-06, + "loss": 0.001, + "step": 87300 + }, + { + "epoch": 1.4286181788431644, + "grad_norm": 0.025098584592342377, + "learning_rate": 2.2869259817353302e-06, + "loss": 0.0013, + "step": 87310 + }, + { + "epoch": 1.4287818047942404, + "grad_norm": 0.037802111357450485, + "learning_rate": 2.2857266777191535e-06, + "loss": 0.0008, + "step": 87320 + }, + { + "epoch": 1.4289454307453162, + "grad_norm": 0.04814010486006737, + "learning_rate": 2.2845275950698753e-06, + "loss": 0.0011, + "step": 87330 + }, + { + "epoch": 1.429109056696392, + "grad_norm": 0.0005579679273068905, + "learning_rate": 2.283328733885287e-06, + "loss": 0.0005, + "step": 87340 + }, + { + "epoch": 1.429272682647468, + "grad_norm": 0.06661123037338257, + "learning_rate": 2.2821300942631653e-06, + "loss": 0.0012, + "step": 87350 + }, + { + "epoch": 1.4294363085985438, + "grad_norm": 0.07266577333211899, + "learning_rate": 2.280931676301264e-06, + "loss": 0.002, + "step": 87360 + }, + { + "epoch": 1.4295999345496195, + "grad_norm": 0.015039430931210518, + "learning_rate": 2.2797334800973265e-06, + "loss": 0.0012, + "step": 87370 + }, + { + "epoch": 1.4297635605006955, + "grad_norm": 0.032793078571558, + "learning_rate": 2.278535505749069e-06, + "loss": 0.0014, + "step": 87380 + }, + { + "epoch": 1.4299271864517713, + "grad_norm": 0.012454529292881489, + "learning_rate": 2.277337753354199e-06, + "loss": 0.0016, + "step": 87390 + }, + { + "epoch": 1.430090812402847, + "grad_norm": 0.030338380485773087, + "learning_rate": 2.276140223010396e-06, + "loss": 0.0008, + "step": 87400 + }, + { + "epoch": 1.4302544383539229, + "grad_norm": 0.004007387440651655, + "learning_rate": 2.274942914815331e-06, + "loss": 0.0007, + "step": 87410 + }, + { + "epoch": 1.4304180643049988, + "grad_norm": 0.009625481441617012, + "learning_rate": 2.2737458288666487e-06, + "loss": 0.0007, + "step": 87420 + }, + { + "epoch": 1.4305816902560746, + "grad_norm": 0.17188963294029236, + "learning_rate": 2.272548965261983e-06, + "loss": 0.0016, + "step": 87430 + }, + { + "epoch": 1.4307453162071504, + "grad_norm": 0.029408516362309456, + "learning_rate": 2.271352324098942e-06, + "loss": 0.0007, + "step": 87440 + }, + { + "epoch": 1.4309089421582262, + "grad_norm": 0.01963796466588974, + "learning_rate": 2.2701559054751244e-06, + "loss": 0.0016, + "step": 87450 + }, + { + "epoch": 1.4310725681093022, + "grad_norm": 0.0038243767339736223, + "learning_rate": 2.2689597094881007e-06, + "loss": 0.001, + "step": 87460 + }, + { + "epoch": 1.431236194060378, + "grad_norm": 0.025113839656114578, + "learning_rate": 2.267763736235433e-06, + "loss": 0.0009, + "step": 87470 + }, + { + "epoch": 1.4313998200114537, + "grad_norm": 0.4092933237552643, + "learning_rate": 2.266567985814658e-06, + "loss": 0.001, + "step": 87480 + }, + { + "epoch": 1.4315634459625297, + "grad_norm": 0.0572710819542408, + "learning_rate": 2.2653724583232995e-06, + "loss": 0.001, + "step": 87490 + }, + { + "epoch": 1.4317270719136055, + "grad_norm": 0.03292551636695862, + "learning_rate": 2.2641771538588595e-06, + "loss": 0.0009, + "step": 87500 + }, + { + "epoch": 1.4318906978646813, + "grad_norm": 0.03163839131593704, + "learning_rate": 2.262982072518821e-06, + "loss": 0.0009, + "step": 87510 + }, + { + "epoch": 1.4320543238157573, + "grad_norm": 0.04434162750840187, + "learning_rate": 2.2617872144006543e-06, + "loss": 0.0006, + "step": 87520 + }, + { + "epoch": 1.432217949766833, + "grad_norm": 0.02326543629169464, + "learning_rate": 2.260592579601804e-06, + "loss": 0.0007, + "step": 87530 + }, + { + "epoch": 1.4323815757179088, + "grad_norm": 0.0011645479826256633, + "learning_rate": 2.2593981682197047e-06, + "loss": 0.0009, + "step": 87540 + }, + { + "epoch": 1.4325452016689848, + "grad_norm": 0.0031618080101907253, + "learning_rate": 2.2582039803517646e-06, + "loss": 0.0008, + "step": 87550 + }, + { + "epoch": 1.4327088276200606, + "grad_norm": 0.22771863639354706, + "learning_rate": 2.257010016095382e-06, + "loss": 0.0007, + "step": 87560 + }, + { + "epoch": 1.4328724535711364, + "grad_norm": 0.018685875460505486, + "learning_rate": 2.2558162755479273e-06, + "loss": 0.0007, + "step": 87570 + }, + { + "epoch": 1.4330360795222123, + "grad_norm": 0.018462343141436577, + "learning_rate": 2.2546227588067626e-06, + "loss": 0.0017, + "step": 87580 + }, + { + "epoch": 1.4331997054732881, + "grad_norm": 0.060862280428409576, + "learning_rate": 2.2534294659692233e-06, + "loss": 0.0009, + "step": 87590 + }, + { + "epoch": 1.433363331424364, + "grad_norm": 0.002800151938572526, + "learning_rate": 2.2522363971326344e-06, + "loss": 0.0007, + "step": 87600 + }, + { + "epoch": 1.4335269573754397, + "grad_norm": 0.19052881002426147, + "learning_rate": 2.251043552394294e-06, + "loss": 0.0013, + "step": 87610 + }, + { + "epoch": 1.4336905833265157, + "grad_norm": 0.05560789257287979, + "learning_rate": 2.2498509318514907e-06, + "loss": 0.0008, + "step": 87620 + }, + { + "epoch": 1.4338542092775914, + "grad_norm": 0.01368323341012001, + "learning_rate": 2.2486585356014868e-06, + "loss": 0.0009, + "step": 87630 + }, + { + "epoch": 1.4340178352286672, + "grad_norm": 0.044441115111112595, + "learning_rate": 2.2474663637415334e-06, + "loss": 0.002, + "step": 87640 + }, + { + "epoch": 1.434181461179743, + "grad_norm": 0.04331725835800171, + "learning_rate": 2.246274416368857e-06, + "loss": 0.0005, + "step": 87650 + }, + { + "epoch": 1.434345087130819, + "grad_norm": 0.00791282206773758, + "learning_rate": 2.2450826935806717e-06, + "loss": 0.001, + "step": 87660 + }, + { + "epoch": 1.4345087130818948, + "grad_norm": 0.02606225572526455, + "learning_rate": 2.243891195474167e-06, + "loss": 0.0007, + "step": 87670 + }, + { + "epoch": 1.4346723390329705, + "grad_norm": 0.05236673355102539, + "learning_rate": 2.242699922146522e-06, + "loss": 0.0006, + "step": 87680 + }, + { + "epoch": 1.4348359649840465, + "grad_norm": 0.054539211094379425, + "learning_rate": 2.2415088736948876e-06, + "loss": 0.0015, + "step": 87690 + }, + { + "epoch": 1.4349995909351223, + "grad_norm": 0.023323310539126396, + "learning_rate": 2.2403180502164063e-06, + "loss": 0.0012, + "step": 87700 + }, + { + "epoch": 1.435163216886198, + "grad_norm": 0.12575753033161163, + "learning_rate": 2.2391274518081936e-06, + "loss": 0.0014, + "step": 87710 + }, + { + "epoch": 1.435326842837274, + "grad_norm": 0.005651365965604782, + "learning_rate": 2.2379370785673543e-06, + "loss": 0.0009, + "step": 87720 + }, + { + "epoch": 1.4354904687883498, + "grad_norm": 0.06766383349895477, + "learning_rate": 2.236746930590968e-06, + "loss": 0.0019, + "step": 87730 + }, + { + "epoch": 1.4356540947394256, + "grad_norm": 0.10399864614009857, + "learning_rate": 2.2355570079761024e-06, + "loss": 0.0024, + "step": 87740 + }, + { + "epoch": 1.4358177206905016, + "grad_norm": 0.03292897716164589, + "learning_rate": 2.2343673108198e-06, + "loss": 0.0009, + "step": 87750 + }, + { + "epoch": 1.4359813466415774, + "grad_norm": 0.05341261997818947, + "learning_rate": 2.2331778392190917e-06, + "loss": 0.0019, + "step": 87760 + }, + { + "epoch": 1.4361449725926532, + "grad_norm": 0.001011172542348504, + "learning_rate": 2.231988593270984e-06, + "loss": 0.0015, + "step": 87770 + }, + { + "epoch": 1.4363085985437292, + "grad_norm": 0.06599834561347961, + "learning_rate": 2.2307995730724704e-06, + "loss": 0.0011, + "step": 87780 + }, + { + "epoch": 1.436472224494805, + "grad_norm": 0.031680092215538025, + "learning_rate": 2.2296107787205202e-06, + "loss": 0.0015, + "step": 87790 + }, + { + "epoch": 1.4366358504458807, + "grad_norm": 0.10382623970508575, + "learning_rate": 2.228422210312091e-06, + "loss": 0.001, + "step": 87800 + }, + { + "epoch": 1.4367994763969565, + "grad_norm": 0.043182507157325745, + "learning_rate": 2.2272338679441175e-06, + "loss": 0.0007, + "step": 87810 + }, + { + "epoch": 1.4369631023480323, + "grad_norm": 0.031067989766597748, + "learning_rate": 2.226045751713513e-06, + "loss": 0.0009, + "step": 87820 + }, + { + "epoch": 1.4371267282991083, + "grad_norm": 0.012914981693029404, + "learning_rate": 2.224857861717182e-06, + "loss": 0.0007, + "step": 87830 + }, + { + "epoch": 1.437290354250184, + "grad_norm": 0.032071635127067566, + "learning_rate": 2.2236701980519994e-06, + "loss": 0.0019, + "step": 87840 + }, + { + "epoch": 1.4374539802012598, + "grad_norm": 0.057056866586208344, + "learning_rate": 2.2224827608148313e-06, + "loss": 0.0013, + "step": 87850 + }, + { + "epoch": 1.4376176061523358, + "grad_norm": 0.04092451557517052, + "learning_rate": 2.221295550102518e-06, + "loss": 0.0013, + "step": 87860 + }, + { + "epoch": 1.4377812321034116, + "grad_norm": 0.07200897485017776, + "learning_rate": 2.220108566011887e-06, + "loss": 0.0007, + "step": 87870 + }, + { + "epoch": 1.4379448580544874, + "grad_norm": 0.011565347202122211, + "learning_rate": 2.218921808639742e-06, + "loss": 0.0008, + "step": 87880 + }, + { + "epoch": 1.4381084840055633, + "grad_norm": 0.02137884497642517, + "learning_rate": 2.2177352780828735e-06, + "loss": 0.0007, + "step": 87890 + }, + { + "epoch": 1.4382721099566391, + "grad_norm": 0.009534936398267746, + "learning_rate": 2.2165489744380485e-06, + "loss": 0.0006, + "step": 87900 + }, + { + "epoch": 1.438435735907715, + "grad_norm": 0.12051648646593094, + "learning_rate": 2.2153628978020203e-06, + "loss": 0.0016, + "step": 87910 + }, + { + "epoch": 1.438599361858791, + "grad_norm": 0.07292230427265167, + "learning_rate": 2.2141770482715186e-06, + "loss": 0.0011, + "step": 87920 + }, + { + "epoch": 1.4387629878098667, + "grad_norm": 0.04109984636306763, + "learning_rate": 2.21299142594326e-06, + "loss": 0.001, + "step": 87930 + }, + { + "epoch": 1.4389266137609424, + "grad_norm": 0.07503499835729599, + "learning_rate": 2.2118060309139373e-06, + "loss": 0.0012, + "step": 87940 + }, + { + "epoch": 1.4390902397120184, + "grad_norm": 0.07943184673786163, + "learning_rate": 2.2106208632802295e-06, + "loss": 0.0016, + "step": 87950 + }, + { + "epoch": 1.4392538656630942, + "grad_norm": 0.01143516506999731, + "learning_rate": 2.2094359231387922e-06, + "loss": 0.0014, + "step": 87960 + }, + { + "epoch": 1.43941749161417, + "grad_norm": 0.030652880668640137, + "learning_rate": 2.208251210586269e-06, + "loss": 0.0012, + "step": 87970 + }, + { + "epoch": 1.439581117565246, + "grad_norm": 0.044436436146497726, + "learning_rate": 2.207066725719277e-06, + "loss": 0.0003, + "step": 87980 + }, + { + "epoch": 1.4397447435163218, + "grad_norm": 0.09317509084939957, + "learning_rate": 2.2058824686344216e-06, + "loss": 0.0012, + "step": 87990 + }, + { + "epoch": 1.4399083694673975, + "grad_norm": 0.0381840243935585, + "learning_rate": 2.2046984394282838e-06, + "loss": 0.0012, + "step": 88000 + }, + { + "epoch": 1.4400719954184733, + "grad_norm": 0.04159509390592575, + "learning_rate": 2.2035146381974333e-06, + "loss": 0.0013, + "step": 88010 + }, + { + "epoch": 1.440235621369549, + "grad_norm": 0.003345638746395707, + "learning_rate": 2.202331065038412e-06, + "loss": 0.0013, + "step": 88020 + }, + { + "epoch": 1.440399247320625, + "grad_norm": 0.17782385647296906, + "learning_rate": 2.2011477200477526e-06, + "loss": 0.0008, + "step": 88030 + }, + { + "epoch": 1.4405628732717008, + "grad_norm": 0.13986875116825104, + "learning_rate": 2.199964603321961e-06, + "loss": 0.0015, + "step": 88040 + }, + { + "epoch": 1.4407264992227766, + "grad_norm": 0.028128251433372498, + "learning_rate": 2.1987817149575313e-06, + "loss": 0.0008, + "step": 88050 + }, + { + "epoch": 1.4408901251738526, + "grad_norm": 0.011632639914751053, + "learning_rate": 2.197599055050933e-06, + "loss": 0.0013, + "step": 88060 + }, + { + "epoch": 1.4410537511249284, + "grad_norm": 0.047060687094926834, + "learning_rate": 2.1964166236986227e-06, + "loss": 0.002, + "step": 88070 + }, + { + "epoch": 1.4412173770760042, + "grad_norm": 0.012457093223929405, + "learning_rate": 2.1952344209970324e-06, + "loss": 0.001, + "step": 88080 + }, + { + "epoch": 1.4413810030270802, + "grad_norm": 0.12541504204273224, + "learning_rate": 2.194052447042582e-06, + "loss": 0.0011, + "step": 88090 + }, + { + "epoch": 1.441544628978156, + "grad_norm": 0.04264990985393524, + "learning_rate": 2.192870701931665e-06, + "loss": 0.0008, + "step": 88100 + }, + { + "epoch": 1.4417082549292317, + "grad_norm": 0.03025136888027191, + "learning_rate": 2.191689185760665e-06, + "loss": 0.0013, + "step": 88110 + }, + { + "epoch": 1.4418718808803077, + "grad_norm": 0.02718346379697323, + "learning_rate": 2.190507898625938e-06, + "loss": 0.0015, + "step": 88120 + }, + { + "epoch": 1.4420355068313835, + "grad_norm": 0.05797566846013069, + "learning_rate": 2.1893268406238303e-06, + "loss": 0.0016, + "step": 88130 + }, + { + "epoch": 1.4421991327824593, + "grad_norm": 0.039778679609298706, + "learning_rate": 2.1881460118506625e-06, + "loss": 0.0014, + "step": 88140 + }, + { + "epoch": 1.4423627587335353, + "grad_norm": 0.045133478939533234, + "learning_rate": 2.1869654124027378e-06, + "loss": 0.0013, + "step": 88150 + }, + { + "epoch": 1.442526384684611, + "grad_norm": 0.04567984491586685, + "learning_rate": 2.185785042376345e-06, + "loss": 0.001, + "step": 88160 + }, + { + "epoch": 1.4426900106356868, + "grad_norm": 0.08447040617465973, + "learning_rate": 2.1846049018677472e-06, + "loss": 0.0009, + "step": 88170 + }, + { + "epoch": 1.4428536365867626, + "grad_norm": 0.04492397606372833, + "learning_rate": 2.1834249909731976e-06, + "loss": 0.0034, + "step": 88180 + }, + { + "epoch": 1.4430172625378386, + "grad_norm": 0.10677868872880936, + "learning_rate": 2.18224530978892e-06, + "loss": 0.0018, + "step": 88190 + }, + { + "epoch": 1.4431808884889143, + "grad_norm": 0.014056211337447166, + "learning_rate": 2.1810658584111305e-06, + "loss": 0.0008, + "step": 88200 + }, + { + "epoch": 1.4433445144399901, + "grad_norm": 0.014277152717113495, + "learning_rate": 2.1798866369360166e-06, + "loss": 0.0005, + "step": 88210 + }, + { + "epoch": 1.443508140391066, + "grad_norm": 0.07201260328292847, + "learning_rate": 2.1787076454597556e-06, + "loss": 0.0012, + "step": 88220 + }, + { + "epoch": 1.443671766342142, + "grad_norm": 0.10153778642416, + "learning_rate": 2.1775288840784984e-06, + "loss": 0.001, + "step": 88230 + }, + { + "epoch": 1.4438353922932177, + "grad_norm": 0.11902565509080887, + "learning_rate": 2.1763503528883846e-06, + "loss": 0.0025, + "step": 88240 + }, + { + "epoch": 1.4439990182442934, + "grad_norm": 0.02921682596206665, + "learning_rate": 2.1751720519855274e-06, + "loss": 0.0016, + "step": 88250 + }, + { + "epoch": 1.4441626441953694, + "grad_norm": 0.058205220848321915, + "learning_rate": 2.173993981466028e-06, + "loss": 0.0009, + "step": 88260 + }, + { + "epoch": 1.4443262701464452, + "grad_norm": 0.06497303396463394, + "learning_rate": 2.172816141425963e-06, + "loss": 0.0012, + "step": 88270 + }, + { + "epoch": 1.444489896097521, + "grad_norm": 0.11249633878469467, + "learning_rate": 2.1716385319613966e-06, + "loss": 0.0011, + "step": 88280 + }, + { + "epoch": 1.444653522048597, + "grad_norm": 0.11125393956899643, + "learning_rate": 2.1704611531683667e-06, + "loss": 0.0017, + "step": 88290 + }, + { + "epoch": 1.4448171479996728, + "grad_norm": 0.10672051459550858, + "learning_rate": 2.1692840051429e-06, + "loss": 0.002, + "step": 88300 + }, + { + "epoch": 1.4449807739507485, + "grad_norm": 0.04128502309322357, + "learning_rate": 2.1681070879809967e-06, + "loss": 0.0005, + "step": 88310 + }, + { + "epoch": 1.4451443999018245, + "grad_norm": 0.10260944068431854, + "learning_rate": 2.166930401778647e-06, + "loss": 0.0019, + "step": 88320 + }, + { + "epoch": 1.4453080258529003, + "grad_norm": 0.003928142134100199, + "learning_rate": 2.165753946631812e-06, + "loss": 0.0006, + "step": 88330 + }, + { + "epoch": 1.445471651803976, + "grad_norm": 0.08012647926807404, + "learning_rate": 2.164577722636445e-06, + "loss": 0.0007, + "step": 88340 + }, + { + "epoch": 1.445635277755052, + "grad_norm": 0.051866721361875534, + "learning_rate": 2.1634017298884695e-06, + "loss": 0.0011, + "step": 88350 + }, + { + "epoch": 1.4457989037061278, + "grad_norm": 0.021905949339270592, + "learning_rate": 2.1622259684838002e-06, + "loss": 0.0014, + "step": 88360 + }, + { + "epoch": 1.4459625296572036, + "grad_norm": 0.03850788623094559, + "learning_rate": 2.161050438518324e-06, + "loss": 0.001, + "step": 88370 + }, + { + "epoch": 1.4461261556082794, + "grad_norm": 0.14281781017780304, + "learning_rate": 2.159875140087917e-06, + "loss": 0.0016, + "step": 88380 + }, + { + "epoch": 1.4462897815593554, + "grad_norm": 0.05758731812238693, + "learning_rate": 2.158700073288429e-06, + "loss": 0.0014, + "step": 88390 + }, + { + "epoch": 1.4464534075104312, + "grad_norm": 0.019825385883450508, + "learning_rate": 2.1575252382156973e-06, + "loss": 0.001, + "step": 88400 + }, + { + "epoch": 1.446617033461507, + "grad_norm": 0.028340870514512062, + "learning_rate": 2.156350634965535e-06, + "loss": 0.0014, + "step": 88410 + }, + { + "epoch": 1.4467806594125827, + "grad_norm": 0.012340308167040348, + "learning_rate": 2.1551762636337413e-06, + "loss": 0.0013, + "step": 88420 + }, + { + "epoch": 1.4469442853636587, + "grad_norm": 0.03235310688614845, + "learning_rate": 2.1540021243160912e-06, + "loss": 0.0013, + "step": 88430 + }, + { + "epoch": 1.4471079113147345, + "grad_norm": 0.17577512562274933, + "learning_rate": 2.1528282171083462e-06, + "loss": 0.0024, + "step": 88440 + }, + { + "epoch": 1.4472715372658103, + "grad_norm": 0.09183309972286224, + "learning_rate": 2.1516545421062435e-06, + "loss": 0.0012, + "step": 88450 + }, + { + "epoch": 1.4474351632168863, + "grad_norm": 0.03897007182240486, + "learning_rate": 2.1504810994055074e-06, + "loss": 0.0012, + "step": 88460 + }, + { + "epoch": 1.447598789167962, + "grad_norm": 0.03891608119010925, + "learning_rate": 2.1493078891018375e-06, + "loss": 0.0015, + "step": 88470 + }, + { + "epoch": 1.4477624151190378, + "grad_norm": 0.02291872538626194, + "learning_rate": 2.148134911290916e-06, + "loss": 0.0017, + "step": 88480 + }, + { + "epoch": 1.4479260410701138, + "grad_norm": 0.019518906250596046, + "learning_rate": 2.14696216606841e-06, + "loss": 0.0012, + "step": 88490 + }, + { + "epoch": 1.4480896670211896, + "grad_norm": 0.06341265887022018, + "learning_rate": 2.1457896535299605e-06, + "loss": 0.0012, + "step": 88500 + }, + { + "epoch": 1.4482532929722653, + "grad_norm": 0.03984321281313896, + "learning_rate": 2.1446173737711985e-06, + "loss": 0.0011, + "step": 88510 + }, + { + "epoch": 1.4484169189233413, + "grad_norm": 0.10460025072097778, + "learning_rate": 2.143445326887726e-06, + "loss": 0.0016, + "step": 88520 + }, + { + "epoch": 1.4485805448744171, + "grad_norm": 0.007973943836987019, + "learning_rate": 2.142273512975136e-06, + "loss": 0.002, + "step": 88530 + }, + { + "epoch": 1.448744170825493, + "grad_norm": 0.02535553090274334, + "learning_rate": 2.1411019321289937e-06, + "loss": 0.001, + "step": 88540 + }, + { + "epoch": 1.448907796776569, + "grad_norm": 0.08888189494609833, + "learning_rate": 2.1399305844448533e-06, + "loss": 0.001, + "step": 88550 + }, + { + "epoch": 1.4490714227276447, + "grad_norm": 0.052672844380140305, + "learning_rate": 2.138759470018241e-06, + "loss": 0.0008, + "step": 88560 + }, + { + "epoch": 1.4492350486787204, + "grad_norm": 0.016947384923696518, + "learning_rate": 2.1375885889446736e-06, + "loss": 0.0013, + "step": 88570 + }, + { + "epoch": 1.4493986746297962, + "grad_norm": 0.2120617926120758, + "learning_rate": 2.1364179413196408e-06, + "loss": 0.0011, + "step": 88580 + }, + { + "epoch": 1.4495623005808722, + "grad_norm": 0.03426506742835045, + "learning_rate": 2.135247527238619e-06, + "loss": 0.0008, + "step": 88590 + }, + { + "epoch": 1.449725926531948, + "grad_norm": 0.002394516486674547, + "learning_rate": 2.134077346797061e-06, + "loss": 0.0007, + "step": 88600 + }, + { + "epoch": 1.4498895524830238, + "grad_norm": 0.032250385731458664, + "learning_rate": 2.1329074000904056e-06, + "loss": 0.0017, + "step": 88610 + }, + { + "epoch": 1.4500531784340995, + "grad_norm": 0.06429596990346909, + "learning_rate": 2.1317376872140654e-06, + "loss": 0.0007, + "step": 88620 + }, + { + "epoch": 1.4502168043851755, + "grad_norm": 0.09414245188236237, + "learning_rate": 2.130568208263443e-06, + "loss": 0.0041, + "step": 88630 + }, + { + "epoch": 1.4503804303362513, + "grad_norm": 0.1963680386543274, + "learning_rate": 2.1293989633339136e-06, + "loss": 0.0015, + "step": 88640 + }, + { + "epoch": 1.450544056287327, + "grad_norm": 0.09079791605472565, + "learning_rate": 2.1282299525208393e-06, + "loss": 0.0015, + "step": 88650 + }, + { + "epoch": 1.450707682238403, + "grad_norm": 0.043481383472681046, + "learning_rate": 2.1270611759195583e-06, + "loss": 0.0012, + "step": 88660 + }, + { + "epoch": 1.4508713081894788, + "grad_norm": 0.048390574753284454, + "learning_rate": 2.125892633625394e-06, + "loss": 0.0011, + "step": 88670 + }, + { + "epoch": 1.4510349341405546, + "grad_norm": 0.03354208916425705, + "learning_rate": 2.124724325733647e-06, + "loss": 0.0012, + "step": 88680 + }, + { + "epoch": 1.4511985600916306, + "grad_norm": 0.03864798694849014, + "learning_rate": 2.123556252339603e-06, + "loss": 0.0008, + "step": 88690 + }, + { + "epoch": 1.4513621860427064, + "grad_norm": 0.04838123917579651, + "learning_rate": 2.1223884135385232e-06, + "loss": 0.0016, + "step": 88700 + }, + { + "epoch": 1.4515258119937822, + "grad_norm": 0.02041090466082096, + "learning_rate": 2.121220809425655e-06, + "loss": 0.0006, + "step": 88710 + }, + { + "epoch": 1.4516894379448582, + "grad_norm": 0.045102670788764954, + "learning_rate": 2.1200534400962216e-06, + "loss": 0.0008, + "step": 88720 + }, + { + "epoch": 1.451853063895934, + "grad_norm": 0.08513277024030685, + "learning_rate": 2.1188863056454327e-06, + "loss": 0.0015, + "step": 88730 + }, + { + "epoch": 1.4520166898470097, + "grad_norm": 0.047499481588602066, + "learning_rate": 2.117719406168473e-06, + "loss": 0.0013, + "step": 88740 + }, + { + "epoch": 1.4521803157980857, + "grad_norm": 0.05745978280901909, + "learning_rate": 2.116552741760514e-06, + "loss": 0.0012, + "step": 88750 + }, + { + "epoch": 1.4523439417491615, + "grad_norm": 0.015010926872491837, + "learning_rate": 2.1153863125167007e-06, + "loss": 0.0016, + "step": 88760 + }, + { + "epoch": 1.4525075677002373, + "grad_norm": 0.022948946803808212, + "learning_rate": 2.1142201185321674e-06, + "loss": 0.0007, + "step": 88770 + }, + { + "epoch": 1.452671193651313, + "grad_norm": 0.01724308170378208, + "learning_rate": 2.1130541599020236e-06, + "loss": 0.0007, + "step": 88780 + }, + { + "epoch": 1.4528348196023888, + "grad_norm": 0.006474709138274193, + "learning_rate": 2.1118884367213583e-06, + "loss": 0.0008, + "step": 88790 + }, + { + "epoch": 1.4529984455534648, + "grad_norm": 0.08029213547706604, + "learning_rate": 2.110722949085247e-06, + "loss": 0.0011, + "step": 88800 + }, + { + "epoch": 1.4531620715045406, + "grad_norm": 0.044175248593091965, + "learning_rate": 2.1095576970887405e-06, + "loss": 0.0004, + "step": 88810 + }, + { + "epoch": 1.4533256974556163, + "grad_norm": 0.0312805250287056, + "learning_rate": 2.108392680826876e-06, + "loss": 0.0011, + "step": 88820 + }, + { + "epoch": 1.4534893234066923, + "grad_norm": 0.021895818412303925, + "learning_rate": 2.1072279003946643e-06, + "loss": 0.0014, + "step": 88830 + }, + { + "epoch": 1.4536529493577681, + "grad_norm": 0.009866050444543362, + "learning_rate": 2.1060633558871046e-06, + "loss": 0.0016, + "step": 88840 + }, + { + "epoch": 1.453816575308844, + "grad_norm": 0.03385284170508385, + "learning_rate": 2.10489904739917e-06, + "loss": 0.0014, + "step": 88850 + }, + { + "epoch": 1.45398020125992, + "grad_norm": 0.027375230565667152, + "learning_rate": 2.103734975025821e-06, + "loss": 0.0013, + "step": 88860 + }, + { + "epoch": 1.4541438272109957, + "grad_norm": 0.03915165737271309, + "learning_rate": 2.1025711388619917e-06, + "loss": 0.0009, + "step": 88870 + }, + { + "epoch": 1.4543074531620714, + "grad_norm": 0.06073424965143204, + "learning_rate": 2.101407539002604e-06, + "loss": 0.0012, + "step": 88880 + }, + { + "epoch": 1.4544710791131474, + "grad_norm": 0.08118850737810135, + "learning_rate": 2.100244175542554e-06, + "loss": 0.002, + "step": 88890 + }, + { + "epoch": 1.4546347050642232, + "grad_norm": 0.026000458747148514, + "learning_rate": 2.099081048576725e-06, + "loss": 0.0012, + "step": 88900 + }, + { + "epoch": 1.454798331015299, + "grad_norm": 0.08672899752855301, + "learning_rate": 2.097918158199974e-06, + "loss": 0.0012, + "step": 88910 + }, + { + "epoch": 1.454961956966375, + "grad_norm": 0.053741566836833954, + "learning_rate": 2.0967555045071465e-06, + "loss": 0.001, + "step": 88920 + }, + { + "epoch": 1.4551255829174508, + "grad_norm": 0.06015285104513168, + "learning_rate": 2.0955930875930598e-06, + "loss": 0.0013, + "step": 88930 + }, + { + "epoch": 1.4552892088685265, + "grad_norm": 0.01598251424729824, + "learning_rate": 2.0944309075525215e-06, + "loss": 0.0014, + "step": 88940 + }, + { + "epoch": 1.4554528348196025, + "grad_norm": 0.11105940490961075, + "learning_rate": 2.0932689644803105e-06, + "loss": 0.0015, + "step": 88950 + }, + { + "epoch": 1.4556164607706783, + "grad_norm": 0.055214956402778625, + "learning_rate": 2.0921072584711952e-06, + "loss": 0.001, + "step": 88960 + }, + { + "epoch": 1.455780086721754, + "grad_norm": 0.05880969017744064, + "learning_rate": 2.090945789619917e-06, + "loss": 0.0011, + "step": 88970 + }, + { + "epoch": 1.4559437126728298, + "grad_norm": 0.06411729007959366, + "learning_rate": 2.0897845580212046e-06, + "loss": 0.0015, + "step": 88980 + }, + { + "epoch": 1.4561073386239056, + "grad_norm": 0.02467980608344078, + "learning_rate": 2.08862356376976e-06, + "loss": 0.0015, + "step": 88990 + }, + { + "epoch": 1.4562709645749816, + "grad_norm": 0.05376598238945007, + "learning_rate": 2.0874628069602744e-06, + "loss": 0.001, + "step": 89000 + }, + { + "epoch": 1.4564345905260574, + "grad_norm": 0.021031277254223824, + "learning_rate": 2.0863022876874106e-06, + "loss": 0.0005, + "step": 89010 + }, + { + "epoch": 1.4565982164771332, + "grad_norm": 0.1225455030798912, + "learning_rate": 2.085142006045821e-06, + "loss": 0.001, + "step": 89020 + }, + { + "epoch": 1.4567618424282092, + "grad_norm": 0.026533327996730804, + "learning_rate": 2.0839819621301304e-06, + "loss": 0.0006, + "step": 89030 + }, + { + "epoch": 1.456925468379285, + "grad_norm": 0.0300903283059597, + "learning_rate": 2.082822156034952e-06, + "loss": 0.0017, + "step": 89040 + }, + { + "epoch": 1.4570890943303607, + "grad_norm": 0.05557401105761528, + "learning_rate": 2.0816625878548712e-06, + "loss": 0.0007, + "step": 89050 + }, + { + "epoch": 1.4572527202814367, + "grad_norm": 0.08851264417171478, + "learning_rate": 2.0805032576844624e-06, + "loss": 0.0046, + "step": 89060 + }, + { + "epoch": 1.4574163462325125, + "grad_norm": 0.05466889962553978, + "learning_rate": 2.0793441656182732e-06, + "loss": 0.0009, + "step": 89070 + }, + { + "epoch": 1.4575799721835883, + "grad_norm": 0.04374682903289795, + "learning_rate": 2.078185311750839e-06, + "loss": 0.0013, + "step": 89080 + }, + { + "epoch": 1.4577435981346643, + "grad_norm": 0.07083718478679657, + "learning_rate": 2.077026696176668e-06, + "loss": 0.0012, + "step": 89090 + }, + { + "epoch": 1.45790722408574, + "grad_norm": 0.03942270949482918, + "learning_rate": 2.075868318990257e-06, + "loss": 0.001, + "step": 89100 + }, + { + "epoch": 1.4580708500368158, + "grad_norm": 0.23095588386058807, + "learning_rate": 2.0747101802860764e-06, + "loss": 0.0013, + "step": 89110 + }, + { + "epoch": 1.4582344759878918, + "grad_norm": 0.08636157959699631, + "learning_rate": 2.0735522801585794e-06, + "loss": 0.0016, + "step": 89120 + }, + { + "epoch": 1.4583981019389676, + "grad_norm": 0.009325859136879444, + "learning_rate": 2.072394618702203e-06, + "loss": 0.0007, + "step": 89130 + }, + { + "epoch": 1.4585617278900433, + "grad_norm": 0.1484357863664627, + "learning_rate": 2.0712371960113604e-06, + "loss": 0.0008, + "step": 89140 + }, + { + "epoch": 1.4587253538411191, + "grad_norm": 0.01525159738957882, + "learning_rate": 2.0700800121804483e-06, + "loss": 0.0008, + "step": 89150 + }, + { + "epoch": 1.4588889797921951, + "grad_norm": 0.052751071751117706, + "learning_rate": 2.0689230673038406e-06, + "loss": 0.0016, + "step": 89160 + }, + { + "epoch": 1.459052605743271, + "grad_norm": 0.09104198962450027, + "learning_rate": 2.0677663614758968e-06, + "loss": 0.0022, + "step": 89170 + }, + { + "epoch": 1.4592162316943467, + "grad_norm": 0.03054741583764553, + "learning_rate": 2.0666098947909504e-06, + "loss": 0.001, + "step": 89180 + }, + { + "epoch": 1.4593798576454224, + "grad_norm": 0.10792868584394455, + "learning_rate": 2.065453667343323e-06, + "loss": 0.0018, + "step": 89190 + }, + { + "epoch": 1.4595434835964984, + "grad_norm": 0.06750574707984924, + "learning_rate": 2.0642976792273077e-06, + "loss": 0.0008, + "step": 89200 + }, + { + "epoch": 1.4597071095475742, + "grad_norm": 0.05620532110333443, + "learning_rate": 2.063141930537188e-06, + "loss": 0.0009, + "step": 89210 + }, + { + "epoch": 1.45987073549865, + "grad_norm": 0.01012419443577528, + "learning_rate": 2.061986421367218e-06, + "loss": 0.0017, + "step": 89220 + }, + { + "epoch": 1.460034361449726, + "grad_norm": 0.043559737503528595, + "learning_rate": 2.060831151811642e-06, + "loss": 0.0007, + "step": 89230 + }, + { + "epoch": 1.4601979874008018, + "grad_norm": 0.03996361792087555, + "learning_rate": 2.059676121964675e-06, + "loss": 0.0005, + "step": 89240 + }, + { + "epoch": 1.4603616133518775, + "grad_norm": 0.07819166034460068, + "learning_rate": 2.0585213319205218e-06, + "loss": 0.0014, + "step": 89250 + }, + { + "epoch": 1.4605252393029535, + "grad_norm": 0.007949229329824448, + "learning_rate": 2.057366781773359e-06, + "loss": 0.0011, + "step": 89260 + }, + { + "epoch": 1.4606888652540293, + "grad_norm": 0.028248926624655724, + "learning_rate": 2.0562124716173514e-06, + "loss": 0.0015, + "step": 89270 + }, + { + "epoch": 1.460852491205105, + "grad_norm": 0.16604283452033997, + "learning_rate": 2.0550584015466376e-06, + "loss": 0.001, + "step": 89280 + }, + { + "epoch": 1.461016117156181, + "grad_norm": 0.04688572883605957, + "learning_rate": 2.0539045716553423e-06, + "loss": 0.0017, + "step": 89290 + }, + { + "epoch": 1.4611797431072568, + "grad_norm": 0.039852436631917953, + "learning_rate": 2.052750982037565e-06, + "loss": 0.0009, + "step": 89300 + }, + { + "epoch": 1.4613433690583326, + "grad_norm": 0.037024062126874924, + "learning_rate": 2.051597632787393e-06, + "loss": 0.0015, + "step": 89310 + }, + { + "epoch": 1.4615069950094086, + "grad_norm": 0.041955363005399704, + "learning_rate": 2.0504445239988836e-06, + "loss": 0.001, + "step": 89320 + }, + { + "epoch": 1.4616706209604844, + "grad_norm": 0.06854652613401413, + "learning_rate": 2.0492916557660857e-06, + "loss": 0.0005, + "step": 89330 + }, + { + "epoch": 1.4618342469115602, + "grad_norm": 0.009871524758636951, + "learning_rate": 2.0481390281830193e-06, + "loss": 0.0007, + "step": 89340 + }, + { + "epoch": 1.461997872862636, + "grad_norm": 0.07811180502176285, + "learning_rate": 2.0469866413436927e-06, + "loss": 0.0015, + "step": 89350 + }, + { + "epoch": 1.462161498813712, + "grad_norm": 0.02097170054912567, + "learning_rate": 2.0458344953420872e-06, + "loss": 0.0015, + "step": 89360 + }, + { + "epoch": 1.4623251247647877, + "grad_norm": 0.017110660672187805, + "learning_rate": 2.0446825902721706e-06, + "loss": 0.0006, + "step": 89370 + }, + { + "epoch": 1.4624887507158635, + "grad_norm": 0.04992340877652168, + "learning_rate": 2.043530926227886e-06, + "loss": 0.0008, + "step": 89380 + }, + { + "epoch": 1.4626523766669393, + "grad_norm": 0.03482833504676819, + "learning_rate": 2.0423795033031607e-06, + "loss": 0.0022, + "step": 89390 + }, + { + "epoch": 1.4628160026180153, + "grad_norm": 0.07968378812074661, + "learning_rate": 2.0412283215919e-06, + "loss": 0.0008, + "step": 89400 + }, + { + "epoch": 1.462979628569091, + "grad_norm": 0.15413126349449158, + "learning_rate": 2.0400773811879915e-06, + "loss": 0.0013, + "step": 89410 + }, + { + "epoch": 1.4631432545201668, + "grad_norm": 0.07705624401569366, + "learning_rate": 2.0389266821853e-06, + "loss": 0.0011, + "step": 89420 + }, + { + "epoch": 1.4633068804712428, + "grad_norm": 0.06214204430580139, + "learning_rate": 2.0377762246776754e-06, + "loss": 0.0014, + "step": 89430 + }, + { + "epoch": 1.4634705064223186, + "grad_norm": 0.0140265803784132, + "learning_rate": 2.036626008758944e-06, + "loss": 0.0014, + "step": 89440 + }, + { + "epoch": 1.4636341323733943, + "grad_norm": 0.0016831799875944853, + "learning_rate": 2.0354760345229115e-06, + "loss": 0.0016, + "step": 89450 + }, + { + "epoch": 1.4637977583244703, + "grad_norm": 0.1347316950559616, + "learning_rate": 2.0343263020633687e-06, + "loss": 0.0011, + "step": 89460 + }, + { + "epoch": 1.4639613842755461, + "grad_norm": 0.1418701410293579, + "learning_rate": 2.0331768114740807e-06, + "loss": 0.0018, + "step": 89470 + }, + { + "epoch": 1.464125010226622, + "grad_norm": 0.005771338008344173, + "learning_rate": 2.0320275628487994e-06, + "loss": 0.0018, + "step": 89480 + }, + { + "epoch": 1.4642886361776979, + "grad_norm": 0.08620180189609528, + "learning_rate": 2.030878556281251e-06, + "loss": 0.0014, + "step": 89490 + }, + { + "epoch": 1.4644522621287737, + "grad_norm": 0.003427609335631132, + "learning_rate": 2.0297297918651476e-06, + "loss": 0.0005, + "step": 89500 + }, + { + "epoch": 1.4646158880798494, + "grad_norm": 0.01985606551170349, + "learning_rate": 2.028581269694175e-06, + "loss": 0.0007, + "step": 89510 + }, + { + "epoch": 1.4647795140309254, + "grad_norm": 0.04124069586396217, + "learning_rate": 2.0274329898620055e-06, + "loss": 0.0007, + "step": 89520 + }, + { + "epoch": 1.4649431399820012, + "grad_norm": 0.029155099764466286, + "learning_rate": 2.0262849524622865e-06, + "loss": 0.0007, + "step": 89530 + }, + { + "epoch": 1.465106765933077, + "grad_norm": 0.09577871114015579, + "learning_rate": 2.025137157588651e-06, + "loss": 0.0008, + "step": 89540 + }, + { + "epoch": 1.4652703918841528, + "grad_norm": 0.04181332141160965, + "learning_rate": 2.023989605334706e-06, + "loss": 0.0006, + "step": 89550 + }, + { + "epoch": 1.4654340178352285, + "grad_norm": 0.07948426157236099, + "learning_rate": 2.022842295794045e-06, + "loss": 0.0012, + "step": 89560 + }, + { + "epoch": 1.4655976437863045, + "grad_norm": 0.04818016663193703, + "learning_rate": 2.0216952290602355e-06, + "loss": 0.0006, + "step": 89570 + }, + { + "epoch": 1.4657612697373803, + "grad_norm": 0.02989812009036541, + "learning_rate": 2.0205484052268313e-06, + "loss": 0.0006, + "step": 89580 + }, + { + "epoch": 1.465924895688456, + "grad_norm": 0.025143815204501152, + "learning_rate": 2.0194018243873612e-06, + "loss": 0.0008, + "step": 89590 + }, + { + "epoch": 1.466088521639532, + "grad_norm": 0.014957061968743801, + "learning_rate": 2.0182554866353394e-06, + "loss": 0.0006, + "step": 89600 + }, + { + "epoch": 1.4662521475906078, + "grad_norm": 0.03021889552474022, + "learning_rate": 2.0171093920642524e-06, + "loss": 0.0012, + "step": 89610 + }, + { + "epoch": 1.4664157735416836, + "grad_norm": 0.03508780524134636, + "learning_rate": 2.0159635407675776e-06, + "loss": 0.0008, + "step": 89620 + }, + { + "epoch": 1.4665793994927596, + "grad_norm": 0.12080138921737671, + "learning_rate": 2.0148179328387617e-06, + "loss": 0.0012, + "step": 89630 + }, + { + "epoch": 1.4667430254438354, + "grad_norm": 0.1634090095758438, + "learning_rate": 2.0136725683712405e-06, + "loss": 0.0008, + "step": 89640 + }, + { + "epoch": 1.4669066513949112, + "grad_norm": 0.005211548879742622, + "learning_rate": 2.012527447458422e-06, + "loss": 0.0005, + "step": 89650 + }, + { + "epoch": 1.4670702773459872, + "grad_norm": 0.08439070731401443, + "learning_rate": 2.0113825701937033e-06, + "loss": 0.0012, + "step": 89660 + }, + { + "epoch": 1.467233903297063, + "grad_norm": 0.025132741779088974, + "learning_rate": 2.0102379366704518e-06, + "loss": 0.0011, + "step": 89670 + }, + { + "epoch": 1.4673975292481387, + "grad_norm": 0.030115440487861633, + "learning_rate": 2.009093546982024e-06, + "loss": 0.0019, + "step": 89680 + }, + { + "epoch": 1.4675611551992147, + "grad_norm": 0.02979099191725254, + "learning_rate": 2.007949401221749e-06, + "loss": 0.0008, + "step": 89690 + }, + { + "epoch": 1.4677247811502905, + "grad_norm": 0.0384887233376503, + "learning_rate": 2.0068054994829423e-06, + "loss": 0.0012, + "step": 89700 + }, + { + "epoch": 1.4678884071013663, + "grad_norm": 0.05716992914676666, + "learning_rate": 2.0056618418588936e-06, + "loss": 0.0015, + "step": 89710 + }, + { + "epoch": 1.4680520330524423, + "grad_norm": 0.046387363225221634, + "learning_rate": 2.004518428442879e-06, + "loss": 0.001, + "step": 89720 + }, + { + "epoch": 1.468215659003518, + "grad_norm": 0.026477502658963203, + "learning_rate": 2.0033752593281485e-06, + "loss": 0.0008, + "step": 89730 + }, + { + "epoch": 1.4683792849545938, + "grad_norm": 0.0986756682395935, + "learning_rate": 2.0022323346079387e-06, + "loss": 0.0012, + "step": 89740 + }, + { + "epoch": 1.4685429109056696, + "grad_norm": 0.09719856828451157, + "learning_rate": 2.00108965437546e-06, + "loss": 0.0012, + "step": 89750 + }, + { + "epoch": 1.4687065368567453, + "grad_norm": 0.032134849578142166, + "learning_rate": 1.9999472187239037e-06, + "loss": 0.001, + "step": 89760 + }, + { + "epoch": 1.4688701628078213, + "grad_norm": 0.06840632855892181, + "learning_rate": 1.9988050277464473e-06, + "loss": 0.0011, + "step": 89770 + }, + { + "epoch": 1.4690337887588971, + "grad_norm": 0.05544612184166908, + "learning_rate": 1.9976630815362402e-06, + "loss": 0.0019, + "step": 89780 + }, + { + "epoch": 1.469197414709973, + "grad_norm": 0.053231481462717056, + "learning_rate": 1.996521380186419e-06, + "loss": 0.0014, + "step": 89790 + }, + { + "epoch": 1.469361040661049, + "grad_norm": 0.15656037628650665, + "learning_rate": 1.9953799237900934e-06, + "loss": 0.0049, + "step": 89800 + }, + { + "epoch": 1.4695246666121247, + "grad_norm": 0.11974067240953445, + "learning_rate": 1.9942387124403607e-06, + "loss": 0.0009, + "step": 89810 + }, + { + "epoch": 1.4696882925632004, + "grad_norm": 0.08129055052995682, + "learning_rate": 1.99309774623029e-06, + "loss": 0.0009, + "step": 89820 + }, + { + "epoch": 1.4698519185142764, + "grad_norm": 0.03509863466024399, + "learning_rate": 1.9919570252529393e-06, + "loss": 0.0006, + "step": 89830 + }, + { + "epoch": 1.4700155444653522, + "grad_norm": 0.11475770175457001, + "learning_rate": 1.9908165496013365e-06, + "loss": 0.0008, + "step": 89840 + }, + { + "epoch": 1.470179170416428, + "grad_norm": 0.03813806548714638, + "learning_rate": 1.9896763193685004e-06, + "loss": 0.0006, + "step": 89850 + }, + { + "epoch": 1.470342796367504, + "grad_norm": 0.026975499466061592, + "learning_rate": 1.9885363346474194e-06, + "loss": 0.0018, + "step": 89860 + }, + { + "epoch": 1.4705064223185798, + "grad_norm": 0.014688033610582352, + "learning_rate": 1.987396595531071e-06, + "loss": 0.001, + "step": 89870 + }, + { + "epoch": 1.4706700482696555, + "grad_norm": 0.14553506672382355, + "learning_rate": 1.986257102112405e-06, + "loss": 0.0015, + "step": 89880 + }, + { + "epoch": 1.4708336742207315, + "grad_norm": 0.0654304027557373, + "learning_rate": 1.9851178544843576e-06, + "loss": 0.0011, + "step": 89890 + }, + { + "epoch": 1.4709973001718073, + "grad_norm": 0.002384435385465622, + "learning_rate": 1.983978852739839e-06, + "loss": 0.0013, + "step": 89900 + }, + { + "epoch": 1.471160926122883, + "grad_norm": 0.01792038604617119, + "learning_rate": 1.9828400969717446e-06, + "loss": 0.0009, + "step": 89910 + }, + { + "epoch": 1.4713245520739588, + "grad_norm": 0.08039540797472, + "learning_rate": 1.981701587272946e-06, + "loss": 0.001, + "step": 89920 + }, + { + "epoch": 1.4714881780250348, + "grad_norm": 0.015828078612685204, + "learning_rate": 1.980563323736298e-06, + "loss": 0.0006, + "step": 89930 + }, + { + "epoch": 1.4716518039761106, + "grad_norm": 0.02373330667614937, + "learning_rate": 1.979425306454631e-06, + "loss": 0.0022, + "step": 89940 + }, + { + "epoch": 1.4718154299271864, + "grad_norm": 0.039735570549964905, + "learning_rate": 1.9782875355207605e-06, + "loss": 0.0016, + "step": 89950 + }, + { + "epoch": 1.4719790558782622, + "grad_norm": 0.07227891683578491, + "learning_rate": 1.977150011027476e-06, + "loss": 0.0005, + "step": 89960 + }, + { + "epoch": 1.4721426818293382, + "grad_norm": 0.04523607715964317, + "learning_rate": 1.976012733067554e-06, + "loss": 0.0014, + "step": 89970 + }, + { + "epoch": 1.472306307780414, + "grad_norm": 0.04044328257441521, + "learning_rate": 1.9748757017337433e-06, + "loss": 0.0008, + "step": 89980 + }, + { + "epoch": 1.4724699337314897, + "grad_norm": 0.020450161769986153, + "learning_rate": 1.9737389171187793e-06, + "loss": 0.0009, + "step": 89990 + }, + { + "epoch": 1.4726335596825657, + "grad_norm": 0.05939488485455513, + "learning_rate": 1.9726023793153714e-06, + "loss": 0.0007, + "step": 90000 + }, + { + "epoch": 1.4727971856336415, + "grad_norm": 0.030129818245768547, + "learning_rate": 1.971466088416216e-06, + "loss": 0.0018, + "step": 90010 + }, + { + "epoch": 1.4729608115847173, + "grad_norm": 0.07449831813573837, + "learning_rate": 1.9703300445139793e-06, + "loss": 0.0013, + "step": 90020 + }, + { + "epoch": 1.4731244375357933, + "grad_norm": 0.03683117404580116, + "learning_rate": 1.969194247701319e-06, + "loss": 0.0013, + "step": 90030 + }, + { + "epoch": 1.473288063486869, + "grad_norm": 0.03523913770914078, + "learning_rate": 1.968058698070862e-06, + "loss": 0.0007, + "step": 90040 + }, + { + "epoch": 1.4734516894379448, + "grad_norm": 0.0219389908015728, + "learning_rate": 1.966923395715224e-06, + "loss": 0.0008, + "step": 90050 + }, + { + "epoch": 1.4736153153890208, + "grad_norm": 0.05452648550271988, + "learning_rate": 1.965788340726993e-06, + "loss": 0.0013, + "step": 90060 + }, + { + "epoch": 1.4737789413400966, + "grad_norm": 0.08179844170808792, + "learning_rate": 1.9646535331987426e-06, + "loss": 0.0028, + "step": 90070 + }, + { + "epoch": 1.4739425672911723, + "grad_norm": 0.053484898060560226, + "learning_rate": 1.963518973223024e-06, + "loss": 0.0008, + "step": 90080 + }, + { + "epoch": 1.4741061932422483, + "grad_norm": 0.04054323211312294, + "learning_rate": 1.962384660892364e-06, + "loss": 0.0015, + "step": 90090 + }, + { + "epoch": 1.4742698191933241, + "grad_norm": 0.035754457116127014, + "learning_rate": 1.9612505962992785e-06, + "loss": 0.002, + "step": 90100 + }, + { + "epoch": 1.4744334451444, + "grad_norm": 0.021980365738272667, + "learning_rate": 1.9601167795362537e-06, + "loss": 0.0006, + "step": 90110 + }, + { + "epoch": 1.4745970710954757, + "grad_norm": 0.05765829607844353, + "learning_rate": 1.9589832106957635e-06, + "loss": 0.0013, + "step": 90120 + }, + { + "epoch": 1.4747606970465517, + "grad_norm": 0.025607964023947716, + "learning_rate": 1.9578498898702545e-06, + "loss": 0.0008, + "step": 90130 + }, + { + "epoch": 1.4749243229976274, + "grad_norm": 0.02451637014746666, + "learning_rate": 1.9567168171521607e-06, + "loss": 0.0015, + "step": 90140 + }, + { + "epoch": 1.4750879489487032, + "grad_norm": 0.07796599715948105, + "learning_rate": 1.9555839926338863e-06, + "loss": 0.0013, + "step": 90150 + }, + { + "epoch": 1.475251574899779, + "grad_norm": 0.14252610504627228, + "learning_rate": 1.954451416407826e-06, + "loss": 0.0009, + "step": 90160 + }, + { + "epoch": 1.475415200850855, + "grad_norm": 0.016709337010979652, + "learning_rate": 1.953319088566344e-06, + "loss": 0.0008, + "step": 90170 + }, + { + "epoch": 1.4755788268019308, + "grad_norm": 0.0331064909696579, + "learning_rate": 1.9521870092017937e-06, + "loss": 0.0012, + "step": 90180 + }, + { + "epoch": 1.4757424527530065, + "grad_norm": 0.0621693953871727, + "learning_rate": 1.9510551784064992e-06, + "loss": 0.0007, + "step": 90190 + }, + { + "epoch": 1.4759060787040825, + "grad_norm": 0.03573783114552498, + "learning_rate": 1.949923596272773e-06, + "loss": 0.0009, + "step": 90200 + }, + { + "epoch": 1.4760697046551583, + "grad_norm": 0.052000194787979126, + "learning_rate": 1.948792262892899e-06, + "loss": 0.001, + "step": 90210 + }, + { + "epoch": 1.476233330606234, + "grad_norm": 0.04357015714049339, + "learning_rate": 1.9476611783591487e-06, + "loss": 0.0013, + "step": 90220 + }, + { + "epoch": 1.47639695655731, + "grad_norm": 0.08963147550821304, + "learning_rate": 1.946530342763766e-06, + "loss": 0.0012, + "step": 90230 + }, + { + "epoch": 1.4765605825083858, + "grad_norm": 0.10553299635648727, + "learning_rate": 1.9453997561989816e-06, + "loss": 0.0016, + "step": 90240 + }, + { + "epoch": 1.4767242084594616, + "grad_norm": 0.10988305509090424, + "learning_rate": 1.944269418756999e-06, + "loss": 0.0015, + "step": 90250 + }, + { + "epoch": 1.4768878344105376, + "grad_norm": 0.06891978532075882, + "learning_rate": 1.943139330530008e-06, + "loss": 0.0015, + "step": 90260 + }, + { + "epoch": 1.4770514603616134, + "grad_norm": 0.019649969413876534, + "learning_rate": 1.94200949161017e-06, + "loss": 0.0021, + "step": 90270 + }, + { + "epoch": 1.4772150863126892, + "grad_norm": 0.016747767105698586, + "learning_rate": 1.9408799020896365e-06, + "loss": 0.0021, + "step": 90280 + }, + { + "epoch": 1.4773787122637652, + "grad_norm": 0.02739819325506687, + "learning_rate": 1.9397505620605278e-06, + "loss": 0.0011, + "step": 90290 + }, + { + "epoch": 1.477542338214841, + "grad_norm": 0.07685421407222748, + "learning_rate": 1.9386214716149536e-06, + "loss": 0.0009, + "step": 90300 + }, + { + "epoch": 1.4777059641659167, + "grad_norm": 0.0713479295372963, + "learning_rate": 1.9374926308449944e-06, + "loss": 0.001, + "step": 90310 + }, + { + "epoch": 1.4778695901169925, + "grad_norm": 0.008855665102601051, + "learning_rate": 1.9363640398427185e-06, + "loss": 0.0007, + "step": 90320 + }, + { + "epoch": 1.4780332160680685, + "grad_norm": 0.05054214596748352, + "learning_rate": 1.9352356987001666e-06, + "loss": 0.0011, + "step": 90330 + }, + { + "epoch": 1.4781968420191443, + "grad_norm": 0.05083496496081352, + "learning_rate": 1.934107607509365e-06, + "loss": 0.0016, + "step": 90340 + }, + { + "epoch": 1.47836046797022, + "grad_norm": 0.046345412731170654, + "learning_rate": 1.9329797663623146e-06, + "loss": 0.0011, + "step": 90350 + }, + { + "epoch": 1.4785240939212958, + "grad_norm": 0.04118720814585686, + "learning_rate": 1.931852175351001e-06, + "loss": 0.0005, + "step": 90360 + }, + { + "epoch": 1.4786877198723718, + "grad_norm": 0.015313656069338322, + "learning_rate": 1.930724834567384e-06, + "loss": 0.0032, + "step": 90370 + }, + { + "epoch": 1.4788513458234476, + "grad_norm": 0.061858244240283966, + "learning_rate": 1.9295977441034087e-06, + "loss": 0.0008, + "step": 90380 + }, + { + "epoch": 1.4790149717745233, + "grad_norm": 0.005056298803538084, + "learning_rate": 1.928470904050994e-06, + "loss": 0.0004, + "step": 90390 + }, + { + "epoch": 1.4791785977255993, + "grad_norm": 0.006792113184928894, + "learning_rate": 1.9273443145020417e-06, + "loss": 0.0014, + "step": 90400 + }, + { + "epoch": 1.4793422236766751, + "grad_norm": 0.0516655333340168, + "learning_rate": 1.926217975548434e-06, + "loss": 0.001, + "step": 90410 + }, + { + "epoch": 1.479505849627751, + "grad_norm": 0.09765911847352982, + "learning_rate": 1.9250918872820284e-06, + "loss": 0.0018, + "step": 90420 + }, + { + "epoch": 1.4796694755788269, + "grad_norm": 0.06497883796691895, + "learning_rate": 1.9239660497946686e-06, + "loss": 0.0016, + "step": 90430 + }, + { + "epoch": 1.4798331015299027, + "grad_norm": 0.04156436026096344, + "learning_rate": 1.9228404631781712e-06, + "loss": 0.0008, + "step": 90440 + }, + { + "epoch": 1.4799967274809784, + "grad_norm": 0.02495254948735237, + "learning_rate": 1.9217151275243368e-06, + "loss": 0.0016, + "step": 90450 + }, + { + "epoch": 1.4801603534320544, + "grad_norm": 0.055173467844724655, + "learning_rate": 1.9205900429249426e-06, + "loss": 0.001, + "step": 90460 + }, + { + "epoch": 1.4803239793831302, + "grad_norm": 0.0012991520343348384, + "learning_rate": 1.9194652094717485e-06, + "loss": 0.0007, + "step": 90470 + }, + { + "epoch": 1.480487605334206, + "grad_norm": 0.03873857110738754, + "learning_rate": 1.9183406272564896e-06, + "loss": 0.0013, + "step": 90480 + }, + { + "epoch": 1.480651231285282, + "grad_norm": 0.0435195155441761, + "learning_rate": 1.9172162963708864e-06, + "loss": 0.0024, + "step": 90490 + }, + { + "epoch": 1.4808148572363578, + "grad_norm": 0.0818653479218483, + "learning_rate": 1.9160922169066313e-06, + "loss": 0.0012, + "step": 90500 + }, + { + "epoch": 1.4809784831874335, + "grad_norm": 0.07184914499521255, + "learning_rate": 1.914968388955405e-06, + "loss": 0.0005, + "step": 90510 + }, + { + "epoch": 1.4811421091385093, + "grad_norm": 0.008615637198090553, + "learning_rate": 1.9138448126088584e-06, + "loss": 0.0004, + "step": 90520 + }, + { + "epoch": 1.481305735089585, + "grad_norm": 0.02594771608710289, + "learning_rate": 1.912721487958631e-06, + "loss": 0.0011, + "step": 90530 + }, + { + "epoch": 1.481469361040661, + "grad_norm": 0.03695163503289223, + "learning_rate": 1.911598415096333e-06, + "loss": 0.0009, + "step": 90540 + }, + { + "epoch": 1.4816329869917368, + "grad_norm": 0.13411551713943481, + "learning_rate": 1.910475594113563e-06, + "loss": 0.0017, + "step": 90550 + }, + { + "epoch": 1.4817966129428126, + "grad_norm": 0.037620969116687775, + "learning_rate": 1.9093530251018892e-06, + "loss": 0.0005, + "step": 90560 + }, + { + "epoch": 1.4819602388938886, + "grad_norm": 0.06277339905500412, + "learning_rate": 1.90823070815287e-06, + "loss": 0.0013, + "step": 90570 + }, + { + "epoch": 1.4821238648449644, + "grad_norm": 0.04689271003007889, + "learning_rate": 1.9071086433580332e-06, + "loss": 0.0023, + "step": 90580 + }, + { + "epoch": 1.4822874907960402, + "grad_norm": 0.04348761960864067, + "learning_rate": 1.9059868308088948e-06, + "loss": 0.0008, + "step": 90590 + }, + { + "epoch": 1.4824511167471162, + "grad_norm": 0.13316167891025543, + "learning_rate": 1.9048652705969412e-06, + "loss": 0.0012, + "step": 90600 + }, + { + "epoch": 1.482614742698192, + "grad_norm": 0.047682132571935654, + "learning_rate": 1.9037439628136479e-06, + "loss": 0.0007, + "step": 90610 + }, + { + "epoch": 1.4827783686492677, + "grad_norm": 0.03104541078209877, + "learning_rate": 1.9026229075504604e-06, + "loss": 0.0011, + "step": 90620 + }, + { + "epoch": 1.4829419946003437, + "grad_norm": 0.047130435705184937, + "learning_rate": 1.9015021048988124e-06, + "loss": 0.0015, + "step": 90630 + }, + { + "epoch": 1.4831056205514195, + "grad_norm": 0.036360953003168106, + "learning_rate": 1.9003815549501093e-06, + "loss": 0.0019, + "step": 90640 + }, + { + "epoch": 1.4832692465024953, + "grad_norm": 0.04357534646987915, + "learning_rate": 1.899261257795742e-06, + "loss": 0.0008, + "step": 90650 + }, + { + "epoch": 1.4834328724535712, + "grad_norm": 0.05648239701986313, + "learning_rate": 1.898141213527075e-06, + "loss": 0.0011, + "step": 90660 + }, + { + "epoch": 1.483596498404647, + "grad_norm": 0.023698559030890465, + "learning_rate": 1.8970214222354593e-06, + "loss": 0.0005, + "step": 90670 + }, + { + "epoch": 1.4837601243557228, + "grad_norm": 0.03344593197107315, + "learning_rate": 1.8959018840122174e-06, + "loss": 0.0009, + "step": 90680 + }, + { + "epoch": 1.4839237503067988, + "grad_norm": 0.0575055293738842, + "learning_rate": 1.8947825989486585e-06, + "loss": 0.0008, + "step": 90690 + }, + { + "epoch": 1.4840873762578746, + "grad_norm": 0.015905845910310745, + "learning_rate": 1.8936635671360642e-06, + "loss": 0.0005, + "step": 90700 + }, + { + "epoch": 1.4842510022089503, + "grad_norm": 0.03286319226026535, + "learning_rate": 1.8925447886657022e-06, + "loss": 0.0022, + "step": 90710 + }, + { + "epoch": 1.4844146281600261, + "grad_norm": 0.03279919549822807, + "learning_rate": 1.891426263628815e-06, + "loss": 0.0007, + "step": 90720 + }, + { + "epoch": 1.484578254111102, + "grad_norm": 0.04007800295948982, + "learning_rate": 1.890307992116624e-06, + "loss": 0.0009, + "step": 90730 + }, + { + "epoch": 1.4847418800621779, + "grad_norm": 0.0338972844183445, + "learning_rate": 1.8891899742203346e-06, + "loss": 0.0011, + "step": 90740 + }, + { + "epoch": 1.4849055060132537, + "grad_norm": 0.006547426339238882, + "learning_rate": 1.888072210031125e-06, + "loss": 0.0007, + "step": 90750 + }, + { + "epoch": 1.4850691319643294, + "grad_norm": 0.04488673061132431, + "learning_rate": 1.88695469964016e-06, + "loss": 0.0012, + "step": 90760 + }, + { + "epoch": 1.4852327579154054, + "grad_norm": 0.013546115718781948, + "learning_rate": 1.8858374431385767e-06, + "loss": 0.0012, + "step": 90770 + }, + { + "epoch": 1.4853963838664812, + "grad_norm": 0.06485360860824585, + "learning_rate": 1.884720440617498e-06, + "loss": 0.0011, + "step": 90780 + }, + { + "epoch": 1.485560009817557, + "grad_norm": 0.009377405047416687, + "learning_rate": 1.883603692168019e-06, + "loss": 0.0021, + "step": 90790 + }, + { + "epoch": 1.485723635768633, + "grad_norm": 0.03613147512078285, + "learning_rate": 1.882487197881222e-06, + "loss": 0.0023, + "step": 90800 + }, + { + "epoch": 1.4858872617197088, + "grad_norm": 0.09895659238100052, + "learning_rate": 1.8813709578481609e-06, + "loss": 0.0015, + "step": 90810 + }, + { + "epoch": 1.4860508876707845, + "grad_norm": 0.017641883343458176, + "learning_rate": 1.8802549721598757e-06, + "loss": 0.0014, + "step": 90820 + }, + { + "epoch": 1.4862145136218605, + "grad_norm": 0.0377693772315979, + "learning_rate": 1.8791392409073783e-06, + "loss": 0.0005, + "step": 90830 + }, + { + "epoch": 1.4863781395729363, + "grad_norm": 0.014671116136014462, + "learning_rate": 1.878023764181669e-06, + "loss": 0.0015, + "step": 90840 + }, + { + "epoch": 1.486541765524012, + "grad_norm": 0.029847491532564163, + "learning_rate": 1.8769085420737171e-06, + "loss": 0.0006, + "step": 90850 + }, + { + "epoch": 1.486705391475088, + "grad_norm": 0.08111515641212463, + "learning_rate": 1.875793574674481e-06, + "loss": 0.0012, + "step": 90860 + }, + { + "epoch": 1.4868690174261638, + "grad_norm": 0.020809266716241837, + "learning_rate": 1.8746788620748896e-06, + "loss": 0.0007, + "step": 90870 + }, + { + "epoch": 1.4870326433772396, + "grad_norm": 0.01035325601696968, + "learning_rate": 1.8735644043658585e-06, + "loss": 0.0007, + "step": 90880 + }, + { + "epoch": 1.4871962693283154, + "grad_norm": 0.06289991736412048, + "learning_rate": 1.8724502016382761e-06, + "loss": 0.0012, + "step": 90890 + }, + { + "epoch": 1.4873598952793914, + "grad_norm": 0.025515517219901085, + "learning_rate": 1.871336253983016e-06, + "loss": 0.0007, + "step": 90900 + }, + { + "epoch": 1.4875235212304672, + "grad_norm": 0.04921700060367584, + "learning_rate": 1.8702225614909247e-06, + "loss": 0.0008, + "step": 90910 + }, + { + "epoch": 1.487687147181543, + "grad_norm": 0.10310881584882736, + "learning_rate": 1.8691091242528341e-06, + "loss": 0.0018, + "step": 90920 + }, + { + "epoch": 1.4878507731326187, + "grad_norm": 0.02996991015970707, + "learning_rate": 1.86799594235955e-06, + "loss": 0.001, + "step": 90930 + }, + { + "epoch": 1.4880143990836947, + "grad_norm": 0.15167322754859924, + "learning_rate": 1.8668830159018624e-06, + "loss": 0.0008, + "step": 90940 + }, + { + "epoch": 1.4881780250347705, + "grad_norm": 0.08759970217943192, + "learning_rate": 1.865770344970535e-06, + "loss": 0.0009, + "step": 90950 + }, + { + "epoch": 1.4883416509858463, + "grad_norm": 0.08242107182741165, + "learning_rate": 1.8646579296563155e-06, + "loss": 0.0009, + "step": 90960 + }, + { + "epoch": 1.4885052769369223, + "grad_norm": 0.01976623386144638, + "learning_rate": 1.8635457700499271e-06, + "loss": 0.001, + "step": 90970 + }, + { + "epoch": 1.488668902887998, + "grad_norm": 0.0412474200129509, + "learning_rate": 1.8624338662420754e-06, + "loss": 0.0005, + "step": 90980 + }, + { + "epoch": 1.4888325288390738, + "grad_norm": 0.022464029490947723, + "learning_rate": 1.8613222183234414e-06, + "loss": 0.0008, + "step": 90990 + }, + { + "epoch": 1.4889961547901498, + "grad_norm": 0.07071196287870407, + "learning_rate": 1.8602108263846903e-06, + "loss": 0.0007, + "step": 91000 + }, + { + "epoch": 1.4891597807412256, + "grad_norm": 0.06409469246864319, + "learning_rate": 1.85909969051646e-06, + "loss": 0.0006, + "step": 91010 + }, + { + "epoch": 1.4893234066923013, + "grad_norm": 0.08950179815292358, + "learning_rate": 1.8579888108093742e-06, + "loss": 0.0012, + "step": 91020 + }, + { + "epoch": 1.4894870326433773, + "grad_norm": 0.00314501510001719, + "learning_rate": 1.8568781873540292e-06, + "loss": 0.001, + "step": 91030 + }, + { + "epoch": 1.4896506585944531, + "grad_norm": 0.12697337567806244, + "learning_rate": 1.8557678202410074e-06, + "loss": 0.0014, + "step": 91040 + }, + { + "epoch": 1.4898142845455289, + "grad_norm": 0.12479368597269058, + "learning_rate": 1.8546577095608648e-06, + "loss": 0.0007, + "step": 91050 + }, + { + "epoch": 1.4899779104966049, + "grad_norm": 0.013313405215740204, + "learning_rate": 1.853547855404136e-06, + "loss": 0.001, + "step": 91060 + }, + { + "epoch": 1.4901415364476807, + "grad_norm": 0.06134331598877907, + "learning_rate": 1.8524382578613404e-06, + "loss": 0.0014, + "step": 91070 + }, + { + "epoch": 1.4903051623987564, + "grad_norm": 0.037143003195524216, + "learning_rate": 1.8513289170229704e-06, + "loss": 0.0007, + "step": 91080 + }, + { + "epoch": 1.4904687883498322, + "grad_norm": 0.09902860969305038, + "learning_rate": 1.8502198329795024e-06, + "loss": 0.0012, + "step": 91090 + }, + { + "epoch": 1.4906324143009082, + "grad_norm": 0.07716687768697739, + "learning_rate": 1.8491110058213867e-06, + "loss": 0.0004, + "step": 91100 + }, + { + "epoch": 1.490796040251984, + "grad_norm": 0.014123808592557907, + "learning_rate": 1.8480024356390592e-06, + "loss": 0.0011, + "step": 91110 + }, + { + "epoch": 1.4909596662030598, + "grad_norm": 0.0366465225815773, + "learning_rate": 1.846894122522927e-06, + "loss": 0.0006, + "step": 91120 + }, + { + "epoch": 1.4911232921541355, + "grad_norm": 0.049732182174921036, + "learning_rate": 1.8457860665633843e-06, + "loss": 0.0011, + "step": 91130 + }, + { + "epoch": 1.4912869181052115, + "grad_norm": 0.05516284331679344, + "learning_rate": 1.8446782678507962e-06, + "loss": 0.0018, + "step": 91140 + }, + { + "epoch": 1.4914505440562873, + "grad_norm": 0.052313219755887985, + "learning_rate": 1.8435707264755153e-06, + "loss": 0.0026, + "step": 91150 + }, + { + "epoch": 1.491614170007363, + "grad_norm": 0.08951608836650848, + "learning_rate": 1.8424634425278653e-06, + "loss": 0.0015, + "step": 91160 + }, + { + "epoch": 1.491777795958439, + "grad_norm": 0.30258411169052124, + "learning_rate": 1.8413564160981562e-06, + "loss": 0.0022, + "step": 91170 + }, + { + "epoch": 1.4919414219095148, + "grad_norm": 0.03965074196457863, + "learning_rate": 1.8402496472766685e-06, + "loss": 0.0012, + "step": 91180 + }, + { + "epoch": 1.4921050478605906, + "grad_norm": 0.08147792518138885, + "learning_rate": 1.8391431361536716e-06, + "loss": 0.001, + "step": 91190 + }, + { + "epoch": 1.4922686738116666, + "grad_norm": 0.038175102323293686, + "learning_rate": 1.8380368828194044e-06, + "loss": 0.001, + "step": 91200 + }, + { + "epoch": 1.4924322997627424, + "grad_norm": 0.002345207380130887, + "learning_rate": 1.836930887364093e-06, + "loss": 0.001, + "step": 91210 + }, + { + "epoch": 1.4925959257138182, + "grad_norm": 0.046156398952007294, + "learning_rate": 1.8358251498779345e-06, + "loss": 0.001, + "step": 91220 + }, + { + "epoch": 1.4927595516648942, + "grad_norm": 0.06583865731954575, + "learning_rate": 1.8347196704511134e-06, + "loss": 0.0014, + "step": 91230 + }, + { + "epoch": 1.49292317761597, + "grad_norm": 0.027767017483711243, + "learning_rate": 1.833614449173785e-06, + "loss": 0.0012, + "step": 91240 + }, + { + "epoch": 1.4930868035670457, + "grad_norm": 0.0442766398191452, + "learning_rate": 1.832509486136091e-06, + "loss": 0.0007, + "step": 91250 + }, + { + "epoch": 1.4932504295181217, + "grad_norm": 0.06051894649863243, + "learning_rate": 1.8314047814281443e-06, + "loss": 0.001, + "step": 91260 + }, + { + "epoch": 1.4934140554691975, + "grad_norm": 0.07792415469884872, + "learning_rate": 1.8303003351400456e-06, + "loss": 0.0016, + "step": 91270 + }, + { + "epoch": 1.4935776814202733, + "grad_norm": 0.02484029345214367, + "learning_rate": 1.8291961473618653e-06, + "loss": 0.0014, + "step": 91280 + }, + { + "epoch": 1.493741307371349, + "grad_norm": 0.025423802435398102, + "learning_rate": 1.8280922181836607e-06, + "loss": 0.0011, + "step": 91290 + }, + { + "epoch": 1.4939049333224248, + "grad_norm": 0.07797304540872574, + "learning_rate": 1.8269885476954614e-06, + "loss": 0.001, + "step": 91300 + }, + { + "epoch": 1.4940685592735008, + "grad_norm": 0.023377392441034317, + "learning_rate": 1.8258851359872825e-06, + "loss": 0.0011, + "step": 91310 + }, + { + "epoch": 1.4942321852245766, + "grad_norm": 0.02874838560819626, + "learning_rate": 1.8247819831491109e-06, + "loss": 0.0005, + "step": 91320 + }, + { + "epoch": 1.4943958111756523, + "grad_norm": 0.06698146462440491, + "learning_rate": 1.8236790892709193e-06, + "loss": 0.0011, + "step": 91330 + }, + { + "epoch": 1.4945594371267283, + "grad_norm": 0.09435980767011642, + "learning_rate": 1.8225764544426533e-06, + "loss": 0.0016, + "step": 91340 + }, + { + "epoch": 1.4947230630778041, + "grad_norm": 0.06800531595945358, + "learning_rate": 1.8214740787542422e-06, + "loss": 0.0006, + "step": 91350 + }, + { + "epoch": 1.49488668902888, + "grad_norm": 0.058543041348457336, + "learning_rate": 1.8203719622955911e-06, + "loss": 0.0007, + "step": 91360 + }, + { + "epoch": 1.4950503149799559, + "grad_norm": 0.125640869140625, + "learning_rate": 1.8192701051565831e-06, + "loss": 0.0006, + "step": 91370 + }, + { + "epoch": 1.4952139409310317, + "grad_norm": 0.11544572561979294, + "learning_rate": 1.818168507427085e-06, + "loss": 0.0011, + "step": 91380 + }, + { + "epoch": 1.4953775668821074, + "grad_norm": 0.03796697407960892, + "learning_rate": 1.8170671691969366e-06, + "loss": 0.0009, + "step": 91390 + }, + { + "epoch": 1.4955411928331834, + "grad_norm": 0.01932319439947605, + "learning_rate": 1.8159660905559622e-06, + "loss": 0.0013, + "step": 91400 + }, + { + "epoch": 1.4957048187842592, + "grad_norm": 0.050985969603061676, + "learning_rate": 1.8148652715939585e-06, + "loss": 0.0013, + "step": 91410 + }, + { + "epoch": 1.495868444735335, + "grad_norm": 0.06053866818547249, + "learning_rate": 1.8137647124007084e-06, + "loss": 0.0008, + "step": 91420 + }, + { + "epoch": 1.496032070686411, + "grad_norm": 0.06278298050165176, + "learning_rate": 1.8126644130659659e-06, + "loss": 0.0011, + "step": 91430 + }, + { + "epoch": 1.4961956966374867, + "grad_norm": 0.041511520743370056, + "learning_rate": 1.8115643736794714e-06, + "loss": 0.0018, + "step": 91440 + }, + { + "epoch": 1.4963593225885625, + "grad_norm": 0.03669146075844765, + "learning_rate": 1.8104645943309374e-06, + "loss": 0.0009, + "step": 91450 + }, + { + "epoch": 1.4965229485396385, + "grad_norm": 0.008665479719638824, + "learning_rate": 1.8093650751100605e-06, + "loss": 0.0013, + "step": 91460 + }, + { + "epoch": 1.4966865744907143, + "grad_norm": 0.05738081783056259, + "learning_rate": 1.8082658161065108e-06, + "loss": 0.0008, + "step": 91470 + }, + { + "epoch": 1.49685020044179, + "grad_norm": 0.008138302713632584, + "learning_rate": 1.8071668174099439e-06, + "loss": 0.0006, + "step": 91480 + }, + { + "epoch": 1.4970138263928658, + "grad_norm": 0.07915917783975601, + "learning_rate": 1.8060680791099867e-06, + "loss": 0.0011, + "step": 91490 + }, + { + "epoch": 1.4971774523439416, + "grad_norm": 0.09052205085754395, + "learning_rate": 1.804969601296252e-06, + "loss": 0.002, + "step": 91500 + }, + { + "epoch": 1.4973410782950176, + "grad_norm": 0.051362503319978714, + "learning_rate": 1.8038713840583243e-06, + "loss": 0.0009, + "step": 91510 + }, + { + "epoch": 1.4975047042460934, + "grad_norm": 0.07138554006814957, + "learning_rate": 1.8027734274857744e-06, + "loss": 0.0013, + "step": 91520 + }, + { + "epoch": 1.4976683301971692, + "grad_norm": 0.04177491366863251, + "learning_rate": 1.8016757316681438e-06, + "loss": 0.0009, + "step": 91530 + }, + { + "epoch": 1.4978319561482452, + "grad_norm": 0.005156593397259712, + "learning_rate": 1.8005782966949609e-06, + "loss": 0.0012, + "step": 91540 + }, + { + "epoch": 1.497995582099321, + "grad_norm": 0.06645479053258896, + "learning_rate": 1.7994811226557246e-06, + "loss": 0.0015, + "step": 91550 + }, + { + "epoch": 1.4981592080503967, + "grad_norm": 0.08459978550672531, + "learning_rate": 1.7983842096399211e-06, + "loss": 0.0009, + "step": 91560 + }, + { + "epoch": 1.4983228340014727, + "grad_norm": 0.07986968755722046, + "learning_rate": 1.7972875577370063e-06, + "loss": 0.001, + "step": 91570 + }, + { + "epoch": 1.4984864599525485, + "grad_norm": 0.04276851937174797, + "learning_rate": 1.796191167036424e-06, + "loss": 0.0011, + "step": 91580 + }, + { + "epoch": 1.4986500859036243, + "grad_norm": 0.10228171944618225, + "learning_rate": 1.7950950376275878e-06, + "loss": 0.0014, + "step": 91590 + }, + { + "epoch": 1.4988137118547002, + "grad_norm": 0.027184845879673958, + "learning_rate": 1.7939991695998981e-06, + "loss": 0.0007, + "step": 91600 + }, + { + "epoch": 1.498977337805776, + "grad_norm": 0.022818472236394882, + "learning_rate": 1.7929035630427266e-06, + "loss": 0.0012, + "step": 91610 + }, + { + "epoch": 1.4991409637568518, + "grad_norm": 0.06028661131858826, + "learning_rate": 1.7918082180454305e-06, + "loss": 0.0011, + "step": 91620 + }, + { + "epoch": 1.4993045897079278, + "grad_norm": 0.05397883430123329, + "learning_rate": 1.7907131346973394e-06, + "loss": 0.0006, + "step": 91630 + }, + { + "epoch": 1.4994682156590036, + "grad_norm": 0.06542099267244339, + "learning_rate": 1.789618313087768e-06, + "loss": 0.001, + "step": 91640 + }, + { + "epoch": 1.4996318416100793, + "grad_norm": 0.0512886568903923, + "learning_rate": 1.7885237533060023e-06, + "loss": 0.0008, + "step": 91650 + }, + { + "epoch": 1.4997954675611553, + "grad_norm": 0.04877230152487755, + "learning_rate": 1.7874294554413152e-06, + "loss": 0.001, + "step": 91660 + }, + { + "epoch": 1.4999590935122311, + "grad_norm": 0.032953184098005295, + "learning_rate": 1.7863354195829497e-06, + "loss": 0.0008, + "step": 91670 + }, + { + "epoch": 1.5001227194633069, + "grad_norm": 0.08210153132677078, + "learning_rate": 1.7852416458201348e-06, + "loss": 0.001, + "step": 91680 + }, + { + "epoch": 1.5002863454143829, + "grad_norm": 0.05134357511997223, + "learning_rate": 1.784148134242074e-06, + "loss": 0.0009, + "step": 91690 + }, + { + "epoch": 1.5004499713654584, + "grad_norm": 0.017645183950662613, + "learning_rate": 1.7830548849379481e-06, + "loss": 0.0006, + "step": 91700 + }, + { + "epoch": 1.5006135973165344, + "grad_norm": 0.010288752615451813, + "learning_rate": 1.7819618979969228e-06, + "loss": 0.0011, + "step": 91710 + }, + { + "epoch": 1.5007772232676102, + "grad_norm": 0.015275281853973866, + "learning_rate": 1.7808691735081347e-06, + "loss": 0.0006, + "step": 91720 + }, + { + "epoch": 1.500940849218686, + "grad_norm": 0.08501256257295609, + "learning_rate": 1.7797767115607062e-06, + "loss": 0.0005, + "step": 91730 + }, + { + "epoch": 1.501104475169762, + "grad_norm": 0.03945817053318024, + "learning_rate": 1.7786845122437307e-06, + "loss": 0.0009, + "step": 91740 + }, + { + "epoch": 1.5012681011208378, + "grad_norm": 0.05070319399237633, + "learning_rate": 1.7775925756462887e-06, + "loss": 0.0014, + "step": 91750 + }, + { + "epoch": 1.5014317270719135, + "grad_norm": 0.0801292136311531, + "learning_rate": 1.7765009018574307e-06, + "loss": 0.0014, + "step": 91760 + }, + { + "epoch": 1.5015953530229895, + "grad_norm": 0.006221620365977287, + "learning_rate": 1.775409490966194e-06, + "loss": 0.0025, + "step": 91770 + }, + { + "epoch": 1.5017589789740653, + "grad_norm": 0.0658557116985321, + "learning_rate": 1.7743183430615863e-06, + "loss": 0.0009, + "step": 91780 + }, + { + "epoch": 1.501922604925141, + "grad_norm": 0.018184788525104523, + "learning_rate": 1.7732274582326015e-06, + "loss": 0.0016, + "step": 91790 + }, + { + "epoch": 1.502086230876217, + "grad_norm": 0.03207259625196457, + "learning_rate": 1.7721368365682045e-06, + "loss": 0.0005, + "step": 91800 + }, + { + "epoch": 1.5022498568272928, + "grad_norm": 0.0893886610865593, + "learning_rate": 1.7710464781573473e-06, + "loss": 0.0007, + "step": 91810 + }, + { + "epoch": 1.5024134827783686, + "grad_norm": 0.07772957533597946, + "learning_rate": 1.7699563830889517e-06, + "loss": 0.0019, + "step": 91820 + }, + { + "epoch": 1.5025771087294446, + "grad_norm": 0.024866385385394096, + "learning_rate": 1.7688665514519253e-06, + "loss": 0.001, + "step": 91830 + }, + { + "epoch": 1.5027407346805204, + "grad_norm": 0.043451279401779175, + "learning_rate": 1.7677769833351488e-06, + "loss": 0.0012, + "step": 91840 + }, + { + "epoch": 1.5029043606315962, + "grad_norm": 0.05371924489736557, + "learning_rate": 1.7666876788274857e-06, + "loss": 0.001, + "step": 91850 + }, + { + "epoch": 1.5030679865826722, + "grad_norm": 0.0037235706113278866, + "learning_rate": 1.765598638017773e-06, + "loss": 0.001, + "step": 91860 + }, + { + "epoch": 1.5032316125337477, + "grad_norm": 0.02677789144217968, + "learning_rate": 1.764509860994833e-06, + "loss": 0.0007, + "step": 91870 + }, + { + "epoch": 1.5033952384848237, + "grad_norm": 0.09653905779123306, + "learning_rate": 1.7634213478474588e-06, + "loss": 0.001, + "step": 91880 + }, + { + "epoch": 1.5035588644358997, + "grad_norm": 0.029980888590216637, + "learning_rate": 1.7623330986644294e-06, + "loss": 0.0005, + "step": 91890 + }, + { + "epoch": 1.5037224903869753, + "grad_norm": 0.052308861166238785, + "learning_rate": 1.7612451135344954e-06, + "loss": 0.001, + "step": 91900 + }, + { + "epoch": 1.5038861163380512, + "grad_norm": 0.025419622659683228, + "learning_rate": 1.760157392546392e-06, + "loss": 0.0006, + "step": 91910 + }, + { + "epoch": 1.504049742289127, + "grad_norm": 0.10279243439435959, + "learning_rate": 1.759069935788828e-06, + "loss": 0.0012, + "step": 91920 + }, + { + "epoch": 1.5042133682402028, + "grad_norm": 0.03728652000427246, + "learning_rate": 1.7579827433504943e-06, + "loss": 0.0009, + "step": 91930 + }, + { + "epoch": 1.5043769941912788, + "grad_norm": 0.0016000282485038042, + "learning_rate": 1.7568958153200565e-06, + "loss": 0.001, + "step": 91940 + }, + { + "epoch": 1.5045406201423546, + "grad_norm": 0.04359014332294464, + "learning_rate": 1.7558091517861637e-06, + "loss": 0.001, + "step": 91950 + }, + { + "epoch": 1.5047042460934303, + "grad_norm": 0.02580469846725464, + "learning_rate": 1.7547227528374365e-06, + "loss": 0.0005, + "step": 91960 + }, + { + "epoch": 1.5048678720445063, + "grad_norm": 0.05423831194639206, + "learning_rate": 1.7536366185624825e-06, + "loss": 0.0008, + "step": 91970 + }, + { + "epoch": 1.5050314979955821, + "grad_norm": 0.11060641705989838, + "learning_rate": 1.752550749049879e-06, + "loss": 0.0009, + "step": 91980 + }, + { + "epoch": 1.5051951239466579, + "grad_norm": 0.04595125839114189, + "learning_rate": 1.7514651443881891e-06, + "loss": 0.0009, + "step": 91990 + }, + { + "epoch": 1.5053587498977339, + "grad_norm": 0.05706120282411575, + "learning_rate": 1.7503798046659481e-06, + "loss": 0.0015, + "step": 92000 + }, + { + "epoch": 1.5055223758488097, + "grad_norm": 0.04266009479761124, + "learning_rate": 1.7492947299716755e-06, + "loss": 0.0012, + "step": 92010 + }, + { + "epoch": 1.5056860017998854, + "grad_norm": 0.0051775239408016205, + "learning_rate": 1.748209920393865e-06, + "loss": 0.0008, + "step": 92020 + }, + { + "epoch": 1.5058496277509614, + "grad_norm": 0.02514449506998062, + "learning_rate": 1.7471253760209883e-06, + "loss": 0.0009, + "step": 92030 + }, + { + "epoch": 1.506013253702037, + "grad_norm": 0.01568145677447319, + "learning_rate": 1.7460410969415003e-06, + "loss": 0.0022, + "step": 92040 + }, + { + "epoch": 1.506176879653113, + "grad_norm": 0.009448518045246601, + "learning_rate": 1.7449570832438278e-06, + "loss": 0.0005, + "step": 92050 + }, + { + "epoch": 1.506340505604189, + "grad_norm": 0.04503456503152847, + "learning_rate": 1.7438733350163828e-06, + "loss": 0.0013, + "step": 92060 + }, + { + "epoch": 1.5065041315552645, + "grad_norm": 0.005923328921198845, + "learning_rate": 1.7427898523475483e-06, + "loss": 0.0006, + "step": 92070 + }, + { + "epoch": 1.5066677575063405, + "grad_norm": 0.026780812069773674, + "learning_rate": 1.7417066353256933e-06, + "loss": 0.0011, + "step": 92080 + }, + { + "epoch": 1.5068313834574163, + "grad_norm": 0.06798375397920609, + "learning_rate": 1.7406236840391578e-06, + "loss": 0.0023, + "step": 92090 + }, + { + "epoch": 1.506995009408492, + "grad_norm": 0.01833983324468136, + "learning_rate": 1.7395409985762673e-06, + "loss": 0.0019, + "step": 92100 + }, + { + "epoch": 1.507158635359568, + "grad_norm": 0.037574201822280884, + "learning_rate": 1.7384585790253178e-06, + "loss": 0.0009, + "step": 92110 + }, + { + "epoch": 1.5073222613106438, + "grad_norm": 0.004330089315772057, + "learning_rate": 1.7373764254745917e-06, + "loss": 0.0003, + "step": 92120 + }, + { + "epoch": 1.5074858872617196, + "grad_norm": 0.0919056385755539, + "learning_rate": 1.736294538012342e-06, + "loss": 0.0012, + "step": 92130 + }, + { + "epoch": 1.5076495132127956, + "grad_norm": 0.043939974159002304, + "learning_rate": 1.7352129167268078e-06, + "loss": 0.0012, + "step": 92140 + }, + { + "epoch": 1.5078131391638714, + "grad_norm": 0.03530528396368027, + "learning_rate": 1.7341315617061981e-06, + "loss": 0.0011, + "step": 92150 + }, + { + "epoch": 1.5079767651149472, + "grad_norm": 0.040463365614414215, + "learning_rate": 1.7330504730387088e-06, + "loss": 0.0007, + "step": 92160 + }, + { + "epoch": 1.5081403910660232, + "grad_norm": 0.014003302901983261, + "learning_rate": 1.731969650812506e-06, + "loss": 0.001, + "step": 92170 + }, + { + "epoch": 1.508304017017099, + "grad_norm": 0.03766884654760361, + "learning_rate": 1.7308890951157415e-06, + "loss": 0.0012, + "step": 92180 + }, + { + "epoch": 1.5084676429681747, + "grad_norm": 0.054084427654743195, + "learning_rate": 1.7298088060365382e-06, + "loss": 0.0009, + "step": 92190 + }, + { + "epoch": 1.5086312689192507, + "grad_norm": 0.001971858786419034, + "learning_rate": 1.7287287836630046e-06, + "loss": 0.0007, + "step": 92200 + }, + { + "epoch": 1.5087948948703265, + "grad_norm": 0.03660333529114723, + "learning_rate": 1.7276490280832192e-06, + "loss": 0.0011, + "step": 92210 + }, + { + "epoch": 1.5089585208214022, + "grad_norm": 0.14026103913784027, + "learning_rate": 1.7265695393852478e-06, + "loss": 0.0013, + "step": 92220 + }, + { + "epoch": 1.5091221467724782, + "grad_norm": 0.10266295820474625, + "learning_rate": 1.7254903176571258e-06, + "loss": 0.0011, + "step": 92230 + }, + { + "epoch": 1.5092857727235538, + "grad_norm": 0.019358551129698753, + "learning_rate": 1.7244113629868742e-06, + "loss": 0.0007, + "step": 92240 + }, + { + "epoch": 1.5094493986746298, + "grad_norm": 0.05355290323495865, + "learning_rate": 1.7233326754624852e-06, + "loss": 0.0023, + "step": 92250 + }, + { + "epoch": 1.5096130246257058, + "grad_norm": 0.10064847767353058, + "learning_rate": 1.7222542551719373e-06, + "loss": 0.0013, + "step": 92260 + }, + { + "epoch": 1.5097766505767813, + "grad_norm": 0.07960085570812225, + "learning_rate": 1.7211761022031787e-06, + "loss": 0.002, + "step": 92270 + }, + { + "epoch": 1.5099402765278573, + "grad_norm": 0.012889365665614605, + "learning_rate": 1.7200982166441433e-06, + "loss": 0.0011, + "step": 92280 + }, + { + "epoch": 1.5101039024789331, + "grad_norm": 0.080244280397892, + "learning_rate": 1.7190205985827364e-06, + "loss": 0.0009, + "step": 92290 + }, + { + "epoch": 1.5102675284300089, + "grad_norm": 0.03683922067284584, + "learning_rate": 1.7179432481068486e-06, + "loss": 0.0007, + "step": 92300 + }, + { + "epoch": 1.5104311543810849, + "grad_norm": 0.11845352500677109, + "learning_rate": 1.716866165304341e-06, + "loss": 0.0019, + "step": 92310 + }, + { + "epoch": 1.5105947803321607, + "grad_norm": 0.01868821308016777, + "learning_rate": 1.7157893502630608e-06, + "loss": 0.0018, + "step": 92320 + }, + { + "epoch": 1.5107584062832364, + "grad_norm": 0.040416914969682693, + "learning_rate": 1.7147128030708266e-06, + "loss": 0.0009, + "step": 92330 + }, + { + "epoch": 1.5109220322343124, + "grad_norm": 0.10213281214237213, + "learning_rate": 1.7136365238154378e-06, + "loss": 0.0011, + "step": 92340 + }, + { + "epoch": 1.5110856581853882, + "grad_norm": 0.07440522313117981, + "learning_rate": 1.7125605125846738e-06, + "loss": 0.0015, + "step": 92350 + }, + { + "epoch": 1.511249284136464, + "grad_norm": 0.021405886858701706, + "learning_rate": 1.7114847694662888e-06, + "loss": 0.0006, + "step": 92360 + }, + { + "epoch": 1.51141291008754, + "grad_norm": 0.037976738065481186, + "learning_rate": 1.7104092945480189e-06, + "loss": 0.001, + "step": 92370 + }, + { + "epoch": 1.5115765360386157, + "grad_norm": 0.04305002838373184, + "learning_rate": 1.709334087917573e-06, + "loss": 0.001, + "step": 92380 + }, + { + "epoch": 1.5117401619896915, + "grad_norm": 0.03911440819501877, + "learning_rate": 1.7082591496626444e-06, + "loss": 0.0007, + "step": 92390 + }, + { + "epoch": 1.5119037879407675, + "grad_norm": 0.07353448122739792, + "learning_rate": 1.7071844798708986e-06, + "loss": 0.0009, + "step": 92400 + }, + { + "epoch": 1.5120674138918433, + "grad_norm": 0.05609318986535072, + "learning_rate": 1.7061100786299856e-06, + "loss": 0.0015, + "step": 92410 + }, + { + "epoch": 1.512231039842919, + "grad_norm": 0.01326186116784811, + "learning_rate": 1.7050359460275257e-06, + "loss": 0.0005, + "step": 92420 + }, + { + "epoch": 1.512394665793995, + "grad_norm": 0.058928415179252625, + "learning_rate": 1.7039620821511255e-06, + "loss": 0.0011, + "step": 92430 + }, + { + "epoch": 1.5125582917450706, + "grad_norm": 0.032803524285554886, + "learning_rate": 1.7028884870883617e-06, + "loss": 0.0021, + "step": 92440 + }, + { + "epoch": 1.5127219176961466, + "grad_norm": 0.03961893543601036, + "learning_rate": 1.7018151609267975e-06, + "loss": 0.0007, + "step": 92450 + }, + { + "epoch": 1.5128855436472226, + "grad_norm": 0.043346889317035675, + "learning_rate": 1.700742103753965e-06, + "loss": 0.0005, + "step": 92460 + }, + { + "epoch": 1.5130491695982982, + "grad_norm": 0.0646902546286583, + "learning_rate": 1.6996693156573835e-06, + "loss": 0.0011, + "step": 92470 + }, + { + "epoch": 1.5132127955493742, + "grad_norm": 0.09837473928928375, + "learning_rate": 1.6985967967245426e-06, + "loss": 0.001, + "step": 92480 + }, + { + "epoch": 1.51337642150045, + "grad_norm": 0.02862619049847126, + "learning_rate": 1.6975245470429158e-06, + "loss": 0.0011, + "step": 92490 + }, + { + "epoch": 1.5135400474515257, + "grad_norm": 0.015240843407809734, + "learning_rate": 1.6964525666999493e-06, + "loss": 0.0021, + "step": 92500 + }, + { + "epoch": 1.5137036734026017, + "grad_norm": 0.06871083378791809, + "learning_rate": 1.6953808557830737e-06, + "loss": 0.0011, + "step": 92510 + }, + { + "epoch": 1.5138672993536775, + "grad_norm": 0.08516515046358109, + "learning_rate": 1.6943094143796906e-06, + "loss": 0.0015, + "step": 92520 + }, + { + "epoch": 1.5140309253047533, + "grad_norm": 0.061271414160728455, + "learning_rate": 1.693238242577186e-06, + "loss": 0.0006, + "step": 92530 + }, + { + "epoch": 1.5141945512558292, + "grad_norm": 0.06548325717449188, + "learning_rate": 1.6921673404629185e-06, + "loss": 0.0044, + "step": 92540 + }, + { + "epoch": 1.514358177206905, + "grad_norm": 0.07551167905330658, + "learning_rate": 1.6910967081242296e-06, + "loss": 0.0005, + "step": 92550 + }, + { + "epoch": 1.5145218031579808, + "grad_norm": 0.032453831285238266, + "learning_rate": 1.6900263456484345e-06, + "loss": 0.0033, + "step": 92560 + }, + { + "epoch": 1.5146854291090568, + "grad_norm": 0.02420182339847088, + "learning_rate": 1.6889562531228304e-06, + "loss": 0.0011, + "step": 92570 + }, + { + "epoch": 1.5148490550601326, + "grad_norm": 0.03445042297244072, + "learning_rate": 1.687886430634687e-06, + "loss": 0.002, + "step": 92580 + }, + { + "epoch": 1.5150126810112083, + "grad_norm": 0.08497074246406555, + "learning_rate": 1.68681687827126e-06, + "loss": 0.0008, + "step": 92590 + }, + { + "epoch": 1.5151763069622843, + "grad_norm": 0.03298862650990486, + "learning_rate": 1.685747596119774e-06, + "loss": 0.0007, + "step": 92600 + }, + { + "epoch": 1.51533993291336, + "grad_norm": 0.028297094628214836, + "learning_rate": 1.6846785842674396e-06, + "loss": 0.0015, + "step": 92610 + }, + { + "epoch": 1.5155035588644359, + "grad_norm": 0.049940288066864014, + "learning_rate": 1.6836098428014386e-06, + "loss": 0.0011, + "step": 92620 + }, + { + "epoch": 1.5156671848155119, + "grad_norm": 0.039305780082941055, + "learning_rate": 1.6825413718089372e-06, + "loss": 0.0007, + "step": 92630 + }, + { + "epoch": 1.5158308107665874, + "grad_norm": 0.11006177961826324, + "learning_rate": 1.6814731713770727e-06, + "loss": 0.0012, + "step": 92640 + }, + { + "epoch": 1.5159944367176634, + "grad_norm": 0.015549235977232456, + "learning_rate": 1.6804052415929672e-06, + "loss": 0.0008, + "step": 92650 + }, + { + "epoch": 1.5161580626687394, + "grad_norm": 0.041310086846351624, + "learning_rate": 1.6793375825437163e-06, + "loss": 0.0011, + "step": 92660 + }, + { + "epoch": 1.516321688619815, + "grad_norm": 0.09205932170152664, + "learning_rate": 1.6782701943163926e-06, + "loss": 0.0014, + "step": 92670 + }, + { + "epoch": 1.516485314570891, + "grad_norm": 0.043636806309223175, + "learning_rate": 1.6772030769980519e-06, + "loss": 0.0007, + "step": 92680 + }, + { + "epoch": 1.5166489405219667, + "grad_norm": 0.10762016475200653, + "learning_rate": 1.6761362306757217e-06, + "loss": 0.001, + "step": 92690 + }, + { + "epoch": 1.5168125664730425, + "grad_norm": 0.033088572323322296, + "learning_rate": 1.6750696554364132e-06, + "loss": 0.0007, + "step": 92700 + }, + { + "epoch": 1.5169761924241185, + "grad_norm": 0.0982365757226944, + "learning_rate": 1.67400335136711e-06, + "loss": 0.0015, + "step": 92710 + }, + { + "epoch": 1.5171398183751943, + "grad_norm": 0.062267448753118515, + "learning_rate": 1.6729373185547788e-06, + "loss": 0.0007, + "step": 92720 + }, + { + "epoch": 1.51730344432627, + "grad_norm": 0.15004102885723114, + "learning_rate": 1.671871557086358e-06, + "loss": 0.0019, + "step": 92730 + }, + { + "epoch": 1.517467070277346, + "grad_norm": 0.05737435445189476, + "learning_rate": 1.670806067048772e-06, + "loss": 0.001, + "step": 92740 + }, + { + "epoch": 1.5176306962284218, + "grad_norm": 0.013875706121325493, + "learning_rate": 1.6697408485289145e-06, + "loss": 0.0007, + "step": 92750 + }, + { + "epoch": 1.5177943221794976, + "grad_norm": 0.008382689207792282, + "learning_rate": 1.6686759016136645e-06, + "loss": 0.0012, + "step": 92760 + }, + { + "epoch": 1.5179579481305736, + "grad_norm": 0.011063765734434128, + "learning_rate": 1.6676112263898725e-06, + "loss": 0.0009, + "step": 92770 + }, + { + "epoch": 1.5181215740816494, + "grad_norm": 0.014943902380764484, + "learning_rate": 1.6665468229443721e-06, + "loss": 0.0008, + "step": 92780 + }, + { + "epoch": 1.5182852000327252, + "grad_norm": 0.034328170120716095, + "learning_rate": 1.6654826913639705e-06, + "loss": 0.0005, + "step": 92790 + }, + { + "epoch": 1.5184488259838012, + "grad_norm": 0.012002581730484962, + "learning_rate": 1.664418831735457e-06, + "loss": 0.0012, + "step": 92800 + }, + { + "epoch": 1.5186124519348767, + "grad_norm": 0.03546871617436409, + "learning_rate": 1.663355244145593e-06, + "loss": 0.0009, + "step": 92810 + }, + { + "epoch": 1.5187760778859527, + "grad_norm": 0.041267212480306625, + "learning_rate": 1.6622919286811251e-06, + "loss": 0.0009, + "step": 92820 + }, + { + "epoch": 1.5189397038370287, + "grad_norm": 0.031755756586790085, + "learning_rate": 1.6612288854287695e-06, + "loss": 0.0007, + "step": 92830 + }, + { + "epoch": 1.5191033297881043, + "grad_norm": 0.15127228200435638, + "learning_rate": 1.6601661144752284e-06, + "loss": 0.0015, + "step": 92840 + }, + { + "epoch": 1.5192669557391802, + "grad_norm": 0.07316996902227402, + "learning_rate": 1.6591036159071744e-06, + "loss": 0.0014, + "step": 92850 + }, + { + "epoch": 1.519430581690256, + "grad_norm": 0.0188047606498003, + "learning_rate": 1.6580413898112646e-06, + "loss": 0.0017, + "step": 92860 + }, + { + "epoch": 1.5195942076413318, + "grad_norm": 0.020672926679253578, + "learning_rate": 1.6569794362741265e-06, + "loss": 0.0004, + "step": 92870 + }, + { + "epoch": 1.5197578335924078, + "grad_norm": 0.08664774894714355, + "learning_rate": 1.6559177553823736e-06, + "loss": 0.0006, + "step": 92880 + }, + { + "epoch": 1.5199214595434836, + "grad_norm": 0.056326452642679214, + "learning_rate": 1.6548563472225892e-06, + "loss": 0.0012, + "step": 92890 + }, + { + "epoch": 1.5200850854945593, + "grad_norm": 0.041529614478349686, + "learning_rate": 1.6537952118813423e-06, + "loss": 0.0008, + "step": 92900 + }, + { + "epoch": 1.5202487114456353, + "grad_norm": 0.009428061544895172, + "learning_rate": 1.652734349445171e-06, + "loss": 0.0007, + "step": 92910 + }, + { + "epoch": 1.5204123373967111, + "grad_norm": 0.03109990619122982, + "learning_rate": 1.6516737600005994e-06, + "loss": 0.0011, + "step": 92920 + }, + { + "epoch": 1.5205759633477869, + "grad_norm": 0.013769006356596947, + "learning_rate": 1.650613443634122e-06, + "loss": 0.0011, + "step": 92930 + }, + { + "epoch": 1.5207395892988629, + "grad_norm": 0.010103962384164333, + "learning_rate": 1.6495534004322183e-06, + "loss": 0.0012, + "step": 92940 + }, + { + "epoch": 1.5209032152499387, + "grad_norm": 0.001887293765321374, + "learning_rate": 1.6484936304813387e-06, + "loss": 0.0014, + "step": 92950 + }, + { + "epoch": 1.5210668412010144, + "grad_norm": 0.1247955784201622, + "learning_rate": 1.6474341338679173e-06, + "loss": 0.002, + "step": 92960 + }, + { + "epoch": 1.5212304671520904, + "grad_norm": 0.05327566713094711, + "learning_rate": 1.64637491067836e-06, + "loss": 0.0007, + "step": 92970 + }, + { + "epoch": 1.5213940931031662, + "grad_norm": 0.11890830844640732, + "learning_rate": 1.6453159609990565e-06, + "loss": 0.0013, + "step": 92980 + }, + { + "epoch": 1.521557719054242, + "grad_norm": 0.061663951724767685, + "learning_rate": 1.6442572849163695e-06, + "loss": 0.0011, + "step": 92990 + }, + { + "epoch": 1.521721345005318, + "grad_norm": 0.0016993772005662322, + "learning_rate": 1.64319888251664e-06, + "loss": 0.0006, + "step": 93000 + }, + { + "epoch": 1.5218849709563935, + "grad_norm": 0.15409444272518158, + "learning_rate": 1.64214075388619e-06, + "loss": 0.0018, + "step": 93010 + }, + { + "epoch": 1.5220485969074695, + "grad_norm": 0.046371039003133774, + "learning_rate": 1.6410828991113143e-06, + "loss": 0.0011, + "step": 93020 + }, + { + "epoch": 1.5222122228585455, + "grad_norm": 0.06476078927516937, + "learning_rate": 1.6400253182782905e-06, + "loss": 0.0016, + "step": 93030 + }, + { + "epoch": 1.522375848809621, + "grad_norm": 0.0310774315148592, + "learning_rate": 1.6389680114733686e-06, + "loss": 0.0015, + "step": 93040 + }, + { + "epoch": 1.522539474760697, + "grad_norm": 0.14070840179920197, + "learning_rate": 1.6379109787827824e-06, + "loss": 0.0016, + "step": 93050 + }, + { + "epoch": 1.5227031007117728, + "grad_norm": 0.062336936593055725, + "learning_rate": 1.6368542202927362e-06, + "loss": 0.0014, + "step": 93060 + }, + { + "epoch": 1.5228667266628486, + "grad_norm": 0.026353567838668823, + "learning_rate": 1.6357977360894188e-06, + "loss": 0.0008, + "step": 93070 + }, + { + "epoch": 1.5230303526139246, + "grad_norm": 0.06551194190979004, + "learning_rate": 1.6347415262589906e-06, + "loss": 0.0016, + "step": 93080 + }, + { + "epoch": 1.5231939785650004, + "grad_norm": 0.11125332862138748, + "learning_rate": 1.6336855908875958e-06, + "loss": 0.001, + "step": 93090 + }, + { + "epoch": 1.5233576045160762, + "grad_norm": 0.06439101696014404, + "learning_rate": 1.632629930061349e-06, + "loss": 0.0014, + "step": 93100 + }, + { + "epoch": 1.5235212304671522, + "grad_norm": 0.03578665480017662, + "learning_rate": 1.63157454386635e-06, + "loss": 0.001, + "step": 93110 + }, + { + "epoch": 1.523684856418228, + "grad_norm": 0.057246576994657516, + "learning_rate": 1.6305194323886698e-06, + "loss": 0.0011, + "step": 93120 + }, + { + "epoch": 1.5238484823693037, + "grad_norm": 0.10694283992052078, + "learning_rate": 1.6294645957143618e-06, + "loss": 0.0008, + "step": 93130 + }, + { + "epoch": 1.5240121083203797, + "grad_norm": 0.011965298093855381, + "learning_rate": 1.6284100339294518e-06, + "loss": 0.0005, + "step": 93140 + }, + { + "epoch": 1.5241757342714555, + "grad_norm": 0.020136523991823196, + "learning_rate": 1.6273557471199508e-06, + "loss": 0.0021, + "step": 93150 + }, + { + "epoch": 1.5243393602225312, + "grad_norm": 0.005472683813422918, + "learning_rate": 1.626301735371838e-06, + "loss": 0.001, + "step": 93160 + }, + { + "epoch": 1.5245029861736072, + "grad_norm": 0.12966491281986237, + "learning_rate": 1.6252479987710795e-06, + "loss": 0.0013, + "step": 93170 + }, + { + "epoch": 1.524666612124683, + "grad_norm": 0.07363984733819962, + "learning_rate": 1.62419453740361e-06, + "loss": 0.001, + "step": 93180 + }, + { + "epoch": 1.5248302380757588, + "grad_norm": 0.006952513940632343, + "learning_rate": 1.6231413513553506e-06, + "loss": 0.0008, + "step": 93190 + }, + { + "epoch": 1.5249938640268348, + "grad_norm": 0.022590002045035362, + "learning_rate": 1.6220884407121924e-06, + "loss": 0.0013, + "step": 93200 + }, + { + "epoch": 1.5251574899779103, + "grad_norm": 0.14063996076583862, + "learning_rate": 1.6210358055600095e-06, + "loss": 0.0014, + "step": 93210 + }, + { + "epoch": 1.5253211159289863, + "grad_norm": 0.043043188750743866, + "learning_rate": 1.619983445984648e-06, + "loss": 0.0012, + "step": 93220 + }, + { + "epoch": 1.5254847418800623, + "grad_norm": 0.021678877994418144, + "learning_rate": 1.6189313620719388e-06, + "loss": 0.0009, + "step": 93230 + }, + { + "epoch": 1.5256483678311379, + "grad_norm": 0.031118957325816154, + "learning_rate": 1.617879553907683e-06, + "loss": 0.0017, + "step": 93240 + }, + { + "epoch": 1.5258119937822139, + "grad_norm": 0.048221733421087265, + "learning_rate": 1.6168280215776649e-06, + "loss": 0.0007, + "step": 93250 + }, + { + "epoch": 1.5259756197332897, + "grad_norm": 0.06168932095170021, + "learning_rate": 1.615776765167642e-06, + "loss": 0.0009, + "step": 93260 + }, + { + "epoch": 1.5261392456843654, + "grad_norm": 0.010130315087735653, + "learning_rate": 1.614725784763353e-06, + "loss": 0.0005, + "step": 93270 + }, + { + "epoch": 1.5263028716354414, + "grad_norm": 0.09475027024745941, + "learning_rate": 1.6136750804505096e-06, + "loss": 0.0007, + "step": 93280 + }, + { + "epoch": 1.5264664975865172, + "grad_norm": 0.02594883181154728, + "learning_rate": 1.6126246523148075e-06, + "loss": 0.0007, + "step": 93290 + }, + { + "epoch": 1.526630123537593, + "grad_norm": 0.05157390981912613, + "learning_rate": 1.6115745004419136e-06, + "loss": 0.001, + "step": 93300 + }, + { + "epoch": 1.526793749488669, + "grad_norm": 0.18255183100700378, + "learning_rate": 1.6105246249174733e-06, + "loss": 0.0022, + "step": 93310 + }, + { + "epoch": 1.5269573754397447, + "grad_norm": 0.007249930407851934, + "learning_rate": 1.609475025827114e-06, + "loss": 0.0022, + "step": 93320 + }, + { + "epoch": 1.5271210013908205, + "grad_norm": 0.04296432062983513, + "learning_rate": 1.6084257032564349e-06, + "loss": 0.0068, + "step": 93330 + }, + { + "epoch": 1.5272846273418965, + "grad_norm": 0.03299983590841293, + "learning_rate": 1.6073766572910183e-06, + "loss": 0.001, + "step": 93340 + }, + { + "epoch": 1.5274482532929723, + "grad_norm": 0.05760448798537254, + "learning_rate": 1.6063278880164163e-06, + "loss": 0.0013, + "step": 93350 + }, + { + "epoch": 1.527611879244048, + "grad_norm": 0.01109104324132204, + "learning_rate": 1.6052793955181673e-06, + "loss": 0.0008, + "step": 93360 + }, + { + "epoch": 1.527775505195124, + "grad_norm": 0.025729773566126823, + "learning_rate": 1.60423117988178e-06, + "loss": 0.0011, + "step": 93370 + }, + { + "epoch": 1.5279391311461998, + "grad_norm": 0.007150289602577686, + "learning_rate": 1.6031832411927451e-06, + "loss": 0.0009, + "step": 93380 + }, + { + "epoch": 1.5281027570972756, + "grad_norm": 0.030729493126273155, + "learning_rate": 1.6021355795365268e-06, + "loss": 0.0017, + "step": 93390 + }, + { + "epoch": 1.5282663830483516, + "grad_norm": 0.032178524881601334, + "learning_rate": 1.6010881949985718e-06, + "loss": 0.0008, + "step": 93400 + }, + { + "epoch": 1.5284300089994272, + "grad_norm": 0.08488941192626953, + "learning_rate": 1.600041087664298e-06, + "loss": 0.0013, + "step": 93410 + }, + { + "epoch": 1.5285936349505032, + "grad_norm": 0.06927945464849472, + "learning_rate": 1.5989942576191064e-06, + "loss": 0.0012, + "step": 93420 + }, + { + "epoch": 1.5287572609015792, + "grad_norm": 0.025662565603852272, + "learning_rate": 1.597947704948371e-06, + "loss": 0.0005, + "step": 93430 + }, + { + "epoch": 1.5289208868526547, + "grad_norm": 0.03501514717936516, + "learning_rate": 1.5969014297374475e-06, + "loss": 0.001, + "step": 93440 + }, + { + "epoch": 1.5290845128037307, + "grad_norm": 0.07361352443695068, + "learning_rate": 1.595855432071663e-06, + "loss": 0.0014, + "step": 93450 + }, + { + "epoch": 1.5292481387548065, + "grad_norm": 0.09956180304288864, + "learning_rate": 1.59480971203633e-06, + "loss": 0.0018, + "step": 93460 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.024720311164855957, + "learning_rate": 1.5937642697167288e-06, + "loss": 0.0008, + "step": 93470 + }, + { + "epoch": 1.5295753906569582, + "grad_norm": 0.0025174375623464584, + "learning_rate": 1.5927191051981273e-06, + "loss": 0.0009, + "step": 93480 + }, + { + "epoch": 1.529739016608034, + "grad_norm": 0.0181483905762434, + "learning_rate": 1.5916742185657608e-06, + "loss": 0.0016, + "step": 93490 + }, + { + "epoch": 1.5299026425591098, + "grad_norm": 0.10036621987819672, + "learning_rate": 1.5906296099048508e-06, + "loss": 0.0014, + "step": 93500 + }, + { + "epoch": 1.5300662685101858, + "grad_norm": 0.03790244832634926, + "learning_rate": 1.5895852793005883e-06, + "loss": 0.0006, + "step": 93510 + }, + { + "epoch": 1.5302298944612616, + "grad_norm": 0.07828021049499512, + "learning_rate": 1.588541226838149e-06, + "loss": 0.0008, + "step": 93520 + }, + { + "epoch": 1.5303935204123373, + "grad_norm": 0.0891806110739708, + "learning_rate": 1.5874974526026783e-06, + "loss": 0.0014, + "step": 93530 + }, + { + "epoch": 1.5305571463634133, + "grad_norm": 0.020414000377058983, + "learning_rate": 1.5864539566793075e-06, + "loss": 0.0007, + "step": 93540 + }, + { + "epoch": 1.530720772314489, + "grad_norm": 0.08390213549137115, + "learning_rate": 1.585410739153136e-06, + "loss": 0.0017, + "step": 93550 + }, + { + "epoch": 1.5308843982655649, + "grad_norm": 0.13077345490455627, + "learning_rate": 1.584367800109249e-06, + "loss": 0.0023, + "step": 93560 + }, + { + "epoch": 1.5310480242166409, + "grad_norm": 0.1087348535656929, + "learning_rate": 1.583325139632701e-06, + "loss": 0.0011, + "step": 93570 + }, + { + "epoch": 1.5312116501677167, + "grad_norm": 0.03183962404727936, + "learning_rate": 1.5822827578085327e-06, + "loss": 0.0006, + "step": 93580 + }, + { + "epoch": 1.5313752761187924, + "grad_norm": 0.13251346349716187, + "learning_rate": 1.5812406547217524e-06, + "loss": 0.0008, + "step": 93590 + }, + { + "epoch": 1.5315389020698684, + "grad_norm": 0.11284696310758591, + "learning_rate": 1.580198830457354e-06, + "loss": 0.0016, + "step": 93600 + }, + { + "epoch": 1.531702528020944, + "grad_norm": 0.005216659512370825, + "learning_rate": 1.579157285100303e-06, + "loss": 0.0018, + "step": 93610 + }, + { + "epoch": 1.53186615397202, + "grad_norm": 0.06284871697425842, + "learning_rate": 1.578116018735546e-06, + "loss": 0.0008, + "step": 93620 + }, + { + "epoch": 1.532029779923096, + "grad_norm": 0.05726700276136398, + "learning_rate": 1.5770750314480043e-06, + "loss": 0.0014, + "step": 93630 + }, + { + "epoch": 1.5321934058741715, + "grad_norm": 0.03996344655752182, + "learning_rate": 1.5760343233225761e-06, + "loss": 0.0012, + "step": 93640 + }, + { + "epoch": 1.5323570318252475, + "grad_norm": 0.05469159409403801, + "learning_rate": 1.5749938944441407e-06, + "loss": 0.0008, + "step": 93650 + }, + { + "epoch": 1.5325206577763233, + "grad_norm": 0.07408096641302109, + "learning_rate": 1.5739537448975483e-06, + "loss": 0.0011, + "step": 93660 + }, + { + "epoch": 1.532684283727399, + "grad_norm": 0.011616322211921215, + "learning_rate": 1.5729138747676343e-06, + "loss": 0.0016, + "step": 93670 + }, + { + "epoch": 1.532847909678475, + "grad_norm": 0.0322139598429203, + "learning_rate": 1.5718742841392027e-06, + "loss": 0.0006, + "step": 93680 + }, + { + "epoch": 1.5330115356295508, + "grad_norm": 0.0314924418926239, + "learning_rate": 1.5708349730970424e-06, + "loss": 0.0008, + "step": 93690 + }, + { + "epoch": 1.5331751615806266, + "grad_norm": 0.002176120178773999, + "learning_rate": 1.5697959417259134e-06, + "loss": 0.0013, + "step": 93700 + }, + { + "epoch": 1.5333387875317026, + "grad_norm": 0.11143457144498825, + "learning_rate": 1.5687571901105586e-06, + "loss": 0.0008, + "step": 93710 + }, + { + "epoch": 1.5335024134827784, + "grad_norm": 0.05008303374052048, + "learning_rate": 1.5677187183356912e-06, + "loss": 0.0012, + "step": 93720 + }, + { + "epoch": 1.5336660394338542, + "grad_norm": 0.04798712953925133, + "learning_rate": 1.5666805264860096e-06, + "loss": 0.0014, + "step": 93730 + }, + { + "epoch": 1.5338296653849302, + "grad_norm": 0.019730838015675545, + "learning_rate": 1.5656426146461812e-06, + "loss": 0.0008, + "step": 93740 + }, + { + "epoch": 1.533993291336006, + "grad_norm": 0.12769195437431335, + "learning_rate": 1.564604982900858e-06, + "loss": 0.0012, + "step": 93750 + }, + { + "epoch": 1.5341569172870817, + "grad_norm": 0.09792885929346085, + "learning_rate": 1.5635676313346627e-06, + "loss": 0.0018, + "step": 93760 + }, + { + "epoch": 1.5343205432381577, + "grad_norm": 0.055695660412311554, + "learning_rate": 1.5625305600322016e-06, + "loss": 0.0008, + "step": 93770 + }, + { + "epoch": 1.5344841691892332, + "grad_norm": 0.06768687069416046, + "learning_rate": 1.5614937690780506e-06, + "loss": 0.0008, + "step": 93780 + }, + { + "epoch": 1.5346477951403092, + "grad_norm": 0.08766273409128189, + "learning_rate": 1.5604572585567707e-06, + "loss": 0.0011, + "step": 93790 + }, + { + "epoch": 1.5348114210913852, + "grad_norm": 0.042774517089128494, + "learning_rate": 1.5594210285528932e-06, + "loss": 0.0007, + "step": 93800 + }, + { + "epoch": 1.5349750470424608, + "grad_norm": 0.05352846905589104, + "learning_rate": 1.558385079150932e-06, + "loss": 0.0016, + "step": 93810 + }, + { + "epoch": 1.5351386729935368, + "grad_norm": 0.04533109441399574, + "learning_rate": 1.5573494104353726e-06, + "loss": 0.0013, + "step": 93820 + }, + { + "epoch": 1.5353022989446126, + "grad_norm": 0.05043867230415344, + "learning_rate": 1.5563140224906841e-06, + "loss": 0.0011, + "step": 93830 + }, + { + "epoch": 1.5354659248956883, + "grad_norm": 0.03465856984257698, + "learning_rate": 1.555278915401306e-06, + "loss": 0.001, + "step": 93840 + }, + { + "epoch": 1.5356295508467643, + "grad_norm": 0.01542655285447836, + "learning_rate": 1.5542440892516614e-06, + "loss": 0.0008, + "step": 93850 + }, + { + "epoch": 1.53579317679784, + "grad_norm": 0.052481234073638916, + "learning_rate": 1.5532095441261437e-06, + "loss": 0.0019, + "step": 93860 + }, + { + "epoch": 1.5359568027489159, + "grad_norm": 0.014215247705578804, + "learning_rate": 1.55217528010913e-06, + "loss": 0.0007, + "step": 93870 + }, + { + "epoch": 1.5361204286999919, + "grad_norm": 0.03126933053135872, + "learning_rate": 1.5511412972849682e-06, + "loss": 0.0006, + "step": 93880 + }, + { + "epoch": 1.5362840546510677, + "grad_norm": 0.01727559231221676, + "learning_rate": 1.5501075957379903e-06, + "loss": 0.0018, + "step": 93890 + }, + { + "epoch": 1.5364476806021434, + "grad_norm": 0.008257654495537281, + "learning_rate": 1.5490741755524973e-06, + "loss": 0.0008, + "step": 93900 + }, + { + "epoch": 1.5366113065532194, + "grad_norm": 0.01606535166501999, + "learning_rate": 1.5480410368127752e-06, + "loss": 0.0012, + "step": 93910 + }, + { + "epoch": 1.5367749325042952, + "grad_norm": 0.014916467480361462, + "learning_rate": 1.54700817960308e-06, + "loss": 0.0006, + "step": 93920 + }, + { + "epoch": 1.536938558455371, + "grad_norm": 0.06592939049005508, + "learning_rate": 1.5459756040076512e-06, + "loss": 0.0104, + "step": 93930 + }, + { + "epoch": 1.537102184406447, + "grad_norm": 0.019060326740145683, + "learning_rate": 1.5449433101107002e-06, + "loss": 0.0005, + "step": 93940 + }, + { + "epoch": 1.5372658103575227, + "grad_norm": 0.0022325606551021338, + "learning_rate": 1.5439112979964167e-06, + "loss": 0.0009, + "step": 93950 + }, + { + "epoch": 1.5374294363085985, + "grad_norm": 0.032140132039785385, + "learning_rate": 1.5428795677489704e-06, + "loss": 0.0006, + "step": 93960 + }, + { + "epoch": 1.5375930622596745, + "grad_norm": 0.021835198625922203, + "learning_rate": 1.5418481194525032e-06, + "loss": 0.0007, + "step": 93970 + }, + { + "epoch": 1.53775668821075, + "grad_norm": 0.02313242293894291, + "learning_rate": 1.5408169531911388e-06, + "loss": 0.0012, + "step": 93980 + }, + { + "epoch": 1.537920314161826, + "grad_norm": 0.04488152638077736, + "learning_rate": 1.539786069048973e-06, + "loss": 0.0013, + "step": 93990 + }, + { + "epoch": 1.538083940112902, + "grad_norm": 0.23308402299880981, + "learning_rate": 1.5387554671100841e-06, + "loss": 0.002, + "step": 94000 + }, + { + "epoch": 1.5382475660639776, + "grad_norm": 0.11276835948228836, + "learning_rate": 1.537725147458522e-06, + "loss": 0.0014, + "step": 94010 + }, + { + "epoch": 1.5384111920150536, + "grad_norm": 0.04927190765738487, + "learning_rate": 1.536695110178319e-06, + "loss": 0.002, + "step": 94020 + }, + { + "epoch": 1.5385748179661294, + "grad_norm": 0.02559305541217327, + "learning_rate": 1.5356653553534772e-06, + "loss": 0.0017, + "step": 94030 + }, + { + "epoch": 1.5387384439172052, + "grad_norm": 0.039904557168483734, + "learning_rate": 1.5346358830679847e-06, + "loss": 0.0011, + "step": 94040 + }, + { + "epoch": 1.5389020698682812, + "grad_norm": 0.00763119338080287, + "learning_rate": 1.5336066934057974e-06, + "loss": 0.0008, + "step": 94050 + }, + { + "epoch": 1.539065695819357, + "grad_norm": 0.09840935468673706, + "learning_rate": 1.5325777864508562e-06, + "loss": 0.002, + "step": 94060 + }, + { + "epoch": 1.5392293217704327, + "grad_norm": 0.048158399760723114, + "learning_rate": 1.531549162287072e-06, + "loss": 0.0011, + "step": 94070 + }, + { + "epoch": 1.5393929477215087, + "grad_norm": 0.03175771236419678, + "learning_rate": 1.530520820998339e-06, + "loss": 0.0012, + "step": 94080 + }, + { + "epoch": 1.5395565736725845, + "grad_norm": 0.04326491802930832, + "learning_rate": 1.529492762668522e-06, + "loss": 0.0015, + "step": 94090 + }, + { + "epoch": 1.5397201996236602, + "grad_norm": 0.05250867083668709, + "learning_rate": 1.5284649873814693e-06, + "loss": 0.0011, + "step": 94100 + }, + { + "epoch": 1.5398838255747362, + "grad_norm": 0.06214847415685654, + "learning_rate": 1.5274374952209997e-06, + "loss": 0.0012, + "step": 94110 + }, + { + "epoch": 1.540047451525812, + "grad_norm": 0.018423575907945633, + "learning_rate": 1.5264102862709145e-06, + "loss": 0.001, + "step": 94120 + }, + { + "epoch": 1.5402110774768878, + "grad_norm": 0.04551853984594345, + "learning_rate": 1.5253833606149864e-06, + "loss": 0.0009, + "step": 94130 + }, + { + "epoch": 1.5403747034279638, + "grad_norm": 0.1042928695678711, + "learning_rate": 1.5243567183369718e-06, + "loss": 0.0015, + "step": 94140 + }, + { + "epoch": 1.5405383293790396, + "grad_norm": 0.03132132068276405, + "learning_rate": 1.5233303595205962e-06, + "loss": 0.0007, + "step": 94150 + }, + { + "epoch": 1.5407019553301153, + "grad_norm": 0.04126354679465294, + "learning_rate": 1.5223042842495695e-06, + "loss": 0.0018, + "step": 94160 + }, + { + "epoch": 1.5408655812811913, + "grad_norm": 0.06351091712713242, + "learning_rate": 1.521278492607572e-06, + "loss": 0.0009, + "step": 94170 + }, + { + "epoch": 1.5410292072322669, + "grad_norm": 0.0328158438205719, + "learning_rate": 1.520252984678266e-06, + "loss": 0.0015, + "step": 94180 + }, + { + "epoch": 1.5411928331833429, + "grad_norm": 0.04032239690423012, + "learning_rate": 1.519227760545286e-06, + "loss": 0.0016, + "step": 94190 + }, + { + "epoch": 1.5413564591344189, + "grad_norm": 0.03198780491948128, + "learning_rate": 1.5182028202922494e-06, + "loss": 0.0008, + "step": 94200 + }, + { + "epoch": 1.5415200850854944, + "grad_norm": 0.04413412883877754, + "learning_rate": 1.517178164002742e-06, + "loss": 0.0009, + "step": 94210 + }, + { + "epoch": 1.5416837110365704, + "grad_norm": 0.01389741525053978, + "learning_rate": 1.5161537917603363e-06, + "loss": 0.0009, + "step": 94220 + }, + { + "epoch": 1.5418473369876462, + "grad_norm": 0.01119904313236475, + "learning_rate": 1.5151297036485729e-06, + "loss": 0.0008, + "step": 94230 + }, + { + "epoch": 1.542010962938722, + "grad_norm": 0.0933723971247673, + "learning_rate": 1.5141058997509755e-06, + "loss": 0.001, + "step": 94240 + }, + { + "epoch": 1.542174588889798, + "grad_norm": 0.03460763022303581, + "learning_rate": 1.5130823801510397e-06, + "loss": 0.0007, + "step": 94250 + }, + { + "epoch": 1.5423382148408737, + "grad_norm": 0.05208886042237282, + "learning_rate": 1.5120591449322425e-06, + "loss": 0.0013, + "step": 94260 + }, + { + "epoch": 1.5425018407919495, + "grad_norm": 0.03544263169169426, + "learning_rate": 1.511036194178035e-06, + "loss": 0.0009, + "step": 94270 + }, + { + "epoch": 1.5426654667430255, + "grad_norm": 0.04742789641022682, + "learning_rate": 1.510013527971843e-06, + "loss": 0.0012, + "step": 94280 + }, + { + "epoch": 1.5428290926941013, + "grad_norm": 0.02192148193717003, + "learning_rate": 1.5089911463970752e-06, + "loss": 0.001, + "step": 94290 + }, + { + "epoch": 1.542992718645177, + "grad_norm": 0.03941531851887703, + "learning_rate": 1.5079690495371109e-06, + "loss": 0.0011, + "step": 94300 + }, + { + "epoch": 1.543156344596253, + "grad_norm": 0.037867944687604904, + "learning_rate": 1.5069472374753113e-06, + "loss": 0.0008, + "step": 94310 + }, + { + "epoch": 1.5433199705473288, + "grad_norm": 0.09288782626390457, + "learning_rate": 1.5059257102950092e-06, + "loss": 0.0009, + "step": 94320 + }, + { + "epoch": 1.5434835964984046, + "grad_norm": 0.08688194304704666, + "learning_rate": 1.5049044680795193e-06, + "loss": 0.0011, + "step": 94330 + }, + { + "epoch": 1.5436472224494806, + "grad_norm": 0.022290518507361412, + "learning_rate": 1.5038835109121285e-06, + "loss": 0.001, + "step": 94340 + }, + { + "epoch": 1.5438108484005564, + "grad_norm": 0.04387383908033371, + "learning_rate": 1.5028628388761058e-06, + "loss": 0.0007, + "step": 94350 + }, + { + "epoch": 1.5439744743516322, + "grad_norm": 0.0038051707670092583, + "learning_rate": 1.5018424520546893e-06, + "loss": 0.0009, + "step": 94360 + }, + { + "epoch": 1.5441381003027081, + "grad_norm": 0.011049297638237476, + "learning_rate": 1.5008223505311026e-06, + "loss": 0.0012, + "step": 94370 + }, + { + "epoch": 1.5443017262537837, + "grad_norm": 0.057600632309913635, + "learning_rate": 1.4998025343885386e-06, + "loss": 0.0013, + "step": 94380 + }, + { + "epoch": 1.5444653522048597, + "grad_norm": 0.0033113150857388973, + "learning_rate": 1.498783003710172e-06, + "loss": 0.0006, + "step": 94390 + }, + { + "epoch": 1.5446289781559357, + "grad_norm": 0.05279870331287384, + "learning_rate": 1.4977637585791504e-06, + "loss": 0.0013, + "step": 94400 + }, + { + "epoch": 1.5447926041070112, + "grad_norm": 0.036784250289201736, + "learning_rate": 1.496744799078602e-06, + "loss": 0.002, + "step": 94410 + }, + { + "epoch": 1.5449562300580872, + "grad_norm": 0.03191140294075012, + "learning_rate": 1.4957261252916277e-06, + "loss": 0.0006, + "step": 94420 + }, + { + "epoch": 1.545119856009163, + "grad_norm": 0.055791281163692474, + "learning_rate": 1.4947077373013096e-06, + "loss": 0.0015, + "step": 94430 + }, + { + "epoch": 1.5452834819602388, + "grad_norm": 0.00501165771856904, + "learning_rate": 1.4936896351907003e-06, + "loss": 0.0015, + "step": 94440 + }, + { + "epoch": 1.5454471079113148, + "grad_norm": 0.027386831119656563, + "learning_rate": 1.4926718190428369e-06, + "loss": 0.0015, + "step": 94450 + }, + { + "epoch": 1.5456107338623906, + "grad_norm": 0.03638581931591034, + "learning_rate": 1.4916542889407254e-06, + "loss": 0.0008, + "step": 94460 + }, + { + "epoch": 1.5457743598134663, + "grad_norm": 0.14036647975444794, + "learning_rate": 1.4906370449673551e-06, + "loss": 0.0014, + "step": 94470 + }, + { + "epoch": 1.5459379857645423, + "grad_norm": 0.08946513384580612, + "learning_rate": 1.4896200872056859e-06, + "loss": 0.0015, + "step": 94480 + }, + { + "epoch": 1.546101611715618, + "grad_norm": 0.2316887229681015, + "learning_rate": 1.4886034157386608e-06, + "loss": 0.0014, + "step": 94490 + }, + { + "epoch": 1.5462652376666939, + "grad_norm": 0.07644547522068024, + "learning_rate": 1.4875870306491929e-06, + "loss": 0.0015, + "step": 94500 + }, + { + "epoch": 1.5464288636177699, + "grad_norm": 0.04918934404850006, + "learning_rate": 1.4865709320201777e-06, + "loss": 0.0015, + "step": 94510 + }, + { + "epoch": 1.5465924895688457, + "grad_norm": 0.047421399503946304, + "learning_rate": 1.485555119934482e-06, + "loss": 0.0007, + "step": 94520 + }, + { + "epoch": 1.5467561155199214, + "grad_norm": 0.038735903799533844, + "learning_rate": 1.4845395944749553e-06, + "loss": 0.0005, + "step": 94530 + }, + { + "epoch": 1.5469197414709974, + "grad_norm": 0.05673230439424515, + "learning_rate": 1.4835243557244188e-06, + "loss": 0.0007, + "step": 94540 + }, + { + "epoch": 1.547083367422073, + "grad_norm": 0.06433865427970886, + "learning_rate": 1.4825094037656717e-06, + "loss": 0.0009, + "step": 94550 + }, + { + "epoch": 1.547246993373149, + "grad_norm": 0.052542008459568024, + "learning_rate": 1.481494738681488e-06, + "loss": 0.0011, + "step": 94560 + }, + { + "epoch": 1.547410619324225, + "grad_norm": 0.018018361181020737, + "learning_rate": 1.4804803605546246e-06, + "loss": 0.0015, + "step": 94570 + }, + { + "epoch": 1.5475742452753005, + "grad_norm": 0.025132155045866966, + "learning_rate": 1.4794662694678069e-06, + "loss": 0.001, + "step": 94580 + }, + { + "epoch": 1.5477378712263765, + "grad_norm": 0.0924421176314354, + "learning_rate": 1.4784524655037436e-06, + "loss": 0.0009, + "step": 94590 + }, + { + "epoch": 1.5479014971774523, + "grad_norm": 0.0738072469830513, + "learning_rate": 1.4774389487451146e-06, + "loss": 0.0008, + "step": 94600 + }, + { + "epoch": 1.548065123128528, + "grad_norm": 0.08844118565320969, + "learning_rate": 1.4764257192745818e-06, + "loss": 0.0012, + "step": 94610 + }, + { + "epoch": 1.548228749079604, + "grad_norm": 0.03137395903468132, + "learning_rate": 1.4754127771747772e-06, + "loss": 0.0011, + "step": 94620 + }, + { + "epoch": 1.5483923750306798, + "grad_norm": 0.12428475171327591, + "learning_rate": 1.4744001225283167e-06, + "loss": 0.0026, + "step": 94630 + }, + { + "epoch": 1.5485560009817556, + "grad_norm": 0.02913321554660797, + "learning_rate": 1.473387755417785e-06, + "loss": 0.0008, + "step": 94640 + }, + { + "epoch": 1.5487196269328316, + "grad_norm": 0.030589599162340164, + "learning_rate": 1.4723756759257514e-06, + "loss": 0.0008, + "step": 94650 + }, + { + "epoch": 1.5488832528839074, + "grad_norm": 0.09706137329339981, + "learning_rate": 1.471363884134754e-06, + "loss": 0.0012, + "step": 94660 + }, + { + "epoch": 1.5490468788349832, + "grad_norm": 0.04359102621674538, + "learning_rate": 1.4703523801273145e-06, + "loss": 0.0004, + "step": 94670 + }, + { + "epoch": 1.5492105047860592, + "grad_norm": 0.00965199340134859, + "learning_rate": 1.4693411639859239e-06, + "loss": 0.0007, + "step": 94680 + }, + { + "epoch": 1.549374130737135, + "grad_norm": 0.011208704672753811, + "learning_rate": 1.468330235793058e-06, + "loss": 0.0011, + "step": 94690 + }, + { + "epoch": 1.5495377566882107, + "grad_norm": 0.04968635365366936, + "learning_rate": 1.4673195956311598e-06, + "loss": 0.001, + "step": 94700 + }, + { + "epoch": 1.5497013826392867, + "grad_norm": 0.03595474734902382, + "learning_rate": 1.4663092435826587e-06, + "loss": 0.0006, + "step": 94710 + }, + { + "epoch": 1.5498650085903625, + "grad_norm": 0.047162409871816635, + "learning_rate": 1.465299179729951e-06, + "loss": 0.0007, + "step": 94720 + }, + { + "epoch": 1.5500286345414382, + "grad_norm": 0.008742806501686573, + "learning_rate": 1.4642894041554174e-06, + "loss": 0.001, + "step": 94730 + }, + { + "epoch": 1.5501922604925142, + "grad_norm": 0.04749373346567154, + "learning_rate": 1.4632799169414109e-06, + "loss": 0.0006, + "step": 94740 + }, + { + "epoch": 1.5503558864435898, + "grad_norm": 0.05303960293531418, + "learning_rate": 1.4622707181702594e-06, + "loss": 0.001, + "step": 94750 + }, + { + "epoch": 1.5505195123946658, + "grad_norm": 0.059382062405347824, + "learning_rate": 1.4612618079242736e-06, + "loss": 0.0007, + "step": 94760 + }, + { + "epoch": 1.5506831383457418, + "grad_norm": 0.04280172288417816, + "learning_rate": 1.4602531862857333e-06, + "loss": 0.0007, + "step": 94770 + }, + { + "epoch": 1.5508467642968173, + "grad_norm": 0.01855129934847355, + "learning_rate": 1.4592448533369013e-06, + "loss": 0.0016, + "step": 94780 + }, + { + "epoch": 1.5510103902478933, + "grad_norm": 0.061799775809049606, + "learning_rate": 1.4582368091600107e-06, + "loss": 0.001, + "step": 94790 + }, + { + "epoch": 1.551174016198969, + "grad_norm": 0.08428354561328888, + "learning_rate": 1.457229053837278e-06, + "loss": 0.0015, + "step": 94800 + }, + { + "epoch": 1.5513376421500449, + "grad_norm": 0.2039078027009964, + "learning_rate": 1.4562215874508883e-06, + "loss": 0.001, + "step": 94810 + }, + { + "epoch": 1.5515012681011209, + "grad_norm": 0.04823237657546997, + "learning_rate": 1.45521441008301e-06, + "loss": 0.0008, + "step": 94820 + }, + { + "epoch": 1.5516648940521967, + "grad_norm": 0.03977847471833229, + "learning_rate": 1.454207521815783e-06, + "loss": 0.0011, + "step": 94830 + }, + { + "epoch": 1.5518285200032724, + "grad_norm": 0.003271718043833971, + "learning_rate": 1.4532009227313281e-06, + "loss": 0.001, + "step": 94840 + }, + { + "epoch": 1.5519921459543484, + "grad_norm": 0.04756302013993263, + "learning_rate": 1.452194612911737e-06, + "loss": 0.0012, + "step": 94850 + }, + { + "epoch": 1.5521557719054242, + "grad_norm": 0.03337939456105232, + "learning_rate": 1.4511885924390844e-06, + "loss": 0.0009, + "step": 94860 + }, + { + "epoch": 1.5523193978565, + "grad_norm": 0.0635802373290062, + "learning_rate": 1.4501828613954143e-06, + "loss": 0.0015, + "step": 94870 + }, + { + "epoch": 1.552483023807576, + "grad_norm": 0.03897978365421295, + "learning_rate": 1.4491774198627545e-06, + "loss": 0.0013, + "step": 94880 + }, + { + "epoch": 1.5526466497586517, + "grad_norm": 0.06139339134097099, + "learning_rate": 1.4481722679231014e-06, + "loss": 0.0007, + "step": 94890 + }, + { + "epoch": 1.5528102757097275, + "grad_norm": 0.05021706968545914, + "learning_rate": 1.4471674056584357e-06, + "loss": 0.0019, + "step": 94900 + }, + { + "epoch": 1.5529739016608035, + "grad_norm": 0.03962188959121704, + "learning_rate": 1.4461628331507066e-06, + "loss": 0.0014, + "step": 94910 + }, + { + "epoch": 1.5531375276118793, + "grad_norm": 0.048101626336574554, + "learning_rate": 1.4451585504818478e-06, + "loss": 0.0011, + "step": 94920 + }, + { + "epoch": 1.553301153562955, + "grad_norm": 0.0283831637352705, + "learning_rate": 1.4441545577337606e-06, + "loss": 0.0007, + "step": 94930 + }, + { + "epoch": 1.553464779514031, + "grad_norm": 0.07085319608449936, + "learning_rate": 1.4431508549883316e-06, + "loss": 0.0005, + "step": 94940 + }, + { + "epoch": 1.5536284054651066, + "grad_norm": 0.01940939761698246, + "learning_rate": 1.4421474423274162e-06, + "loss": 0.0011, + "step": 94950 + }, + { + "epoch": 1.5537920314161826, + "grad_norm": 0.04680795595049858, + "learning_rate": 1.4411443198328517e-06, + "loss": 0.0007, + "step": 94960 + }, + { + "epoch": 1.5539556573672586, + "grad_norm": 0.12658245861530304, + "learning_rate": 1.4401414875864467e-06, + "loss": 0.0009, + "step": 94970 + }, + { + "epoch": 1.5541192833183342, + "grad_norm": 0.011278039775788784, + "learning_rate": 1.4391389456699923e-06, + "loss": 0.0008, + "step": 94980 + }, + { + "epoch": 1.5542829092694102, + "grad_norm": 0.036382418125867844, + "learning_rate": 1.438136694165248e-06, + "loss": 0.0019, + "step": 94990 + }, + { + "epoch": 1.554446535220486, + "grad_norm": 0.12231285870075226, + "learning_rate": 1.4371347331539587e-06, + "loss": 0.0008, + "step": 95000 + }, + { + "epoch": 1.5546101611715617, + "grad_norm": 0.01714707911014557, + "learning_rate": 1.4361330627178367e-06, + "loss": 0.0011, + "step": 95010 + }, + { + "epoch": 1.5547737871226377, + "grad_norm": 0.061439938843250275, + "learning_rate": 1.4351316829385785e-06, + "loss": 0.001, + "step": 95020 + }, + { + "epoch": 1.5549374130737135, + "grad_norm": 0.04495241865515709, + "learning_rate": 1.4341305938978501e-06, + "loss": 0.0013, + "step": 95030 + }, + { + "epoch": 1.5551010390247892, + "grad_norm": 0.05146379396319389, + "learning_rate": 1.4331297956772995e-06, + "loss": 0.0007, + "step": 95040 + }, + { + "epoch": 1.5552646649758652, + "grad_norm": 0.020773475989699364, + "learning_rate": 1.4321292883585475e-06, + "loss": 0.0011, + "step": 95050 + }, + { + "epoch": 1.555428290926941, + "grad_norm": 0.01674269698560238, + "learning_rate": 1.4311290720231902e-06, + "loss": 0.0008, + "step": 95060 + }, + { + "epoch": 1.5555919168780168, + "grad_norm": 0.029754532501101494, + "learning_rate": 1.430129146752805e-06, + "loss": 0.0006, + "step": 95070 + }, + { + "epoch": 1.5557555428290928, + "grad_norm": 0.18505147099494934, + "learning_rate": 1.429129512628939e-06, + "loss": 0.0022, + "step": 95080 + }, + { + "epoch": 1.5559191687801686, + "grad_norm": 0.0631747916340828, + "learning_rate": 1.4281301697331223e-06, + "loss": 0.0008, + "step": 95090 + }, + { + "epoch": 1.5560827947312443, + "grad_norm": 0.04979654774069786, + "learning_rate": 1.4271311181468555e-06, + "loss": 0.001, + "step": 95100 + }, + { + "epoch": 1.5562464206823203, + "grad_norm": 0.06759896874427795, + "learning_rate": 1.42613235795162e-06, + "loss": 0.001, + "step": 95110 + }, + { + "epoch": 1.556410046633396, + "grad_norm": 0.045523688197135925, + "learning_rate": 1.4251338892288684e-06, + "loss": 0.0012, + "step": 95120 + }, + { + "epoch": 1.5565736725844719, + "grad_norm": 0.0628371313214302, + "learning_rate": 1.4241357120600358e-06, + "loss": 0.0007, + "step": 95130 + }, + { + "epoch": 1.5567372985355479, + "grad_norm": 0.0038243625313043594, + "learning_rate": 1.423137826526526e-06, + "loss": 0.0003, + "step": 95140 + }, + { + "epoch": 1.5569009244866234, + "grad_norm": 0.0647701621055603, + "learning_rate": 1.4221402327097284e-06, + "loss": 0.0007, + "step": 95150 + }, + { + "epoch": 1.5570645504376994, + "grad_norm": 0.05577058717608452, + "learning_rate": 1.421142930690998e-06, + "loss": 0.001, + "step": 95160 + }, + { + "epoch": 1.5572281763887754, + "grad_norm": 0.04225318878889084, + "learning_rate": 1.4201459205516754e-06, + "loss": 0.0012, + "step": 95170 + }, + { + "epoch": 1.557391802339851, + "grad_norm": 0.010365060530602932, + "learning_rate": 1.4191492023730706e-06, + "loss": 0.0009, + "step": 95180 + }, + { + "epoch": 1.557555428290927, + "grad_norm": 0.06492631137371063, + "learning_rate": 1.4181527762364755e-06, + "loss": 0.0007, + "step": 95190 + }, + { + "epoch": 1.5577190542420027, + "grad_norm": 0.10063893347978592, + "learning_rate": 1.4171566422231515e-06, + "loss": 0.0007, + "step": 95200 + }, + { + "epoch": 1.5578826801930785, + "grad_norm": 0.06483485549688339, + "learning_rate": 1.4161608004143435e-06, + "loss": 0.0012, + "step": 95210 + }, + { + "epoch": 1.5580463061441545, + "grad_norm": 0.06489508599042892, + "learning_rate": 1.415165250891266e-06, + "loss": 0.0011, + "step": 95220 + }, + { + "epoch": 1.5582099320952303, + "grad_norm": 0.041939400136470795, + "learning_rate": 1.4141699937351156e-06, + "loss": 0.0009, + "step": 95230 + }, + { + "epoch": 1.558373558046306, + "grad_norm": 0.04188789799809456, + "learning_rate": 1.4131750290270585e-06, + "loss": 0.0008, + "step": 95240 + }, + { + "epoch": 1.558537183997382, + "grad_norm": 0.01635364629328251, + "learning_rate": 1.4121803568482445e-06, + "loss": 0.0006, + "step": 95250 + }, + { + "epoch": 1.5587008099484578, + "grad_norm": 0.09771008044481277, + "learning_rate": 1.411185977279792e-06, + "loss": 0.0009, + "step": 95260 + }, + { + "epoch": 1.5588644358995336, + "grad_norm": 0.026777304708957672, + "learning_rate": 1.4101918904028028e-06, + "loss": 0.0013, + "step": 95270 + }, + { + "epoch": 1.5590280618506096, + "grad_norm": 0.044010795652866364, + "learning_rate": 1.4091980962983475e-06, + "loss": 0.0007, + "step": 95280 + }, + { + "epoch": 1.5591916878016854, + "grad_norm": 0.045072562992572784, + "learning_rate": 1.4082045950474804e-06, + "loss": 0.0007, + "step": 95290 + }, + { + "epoch": 1.5593553137527612, + "grad_norm": 0.031395263969898224, + "learning_rate": 1.4072113867312243e-06, + "loss": 0.001, + "step": 95300 + }, + { + "epoch": 1.5595189397038371, + "grad_norm": 0.04804563894867897, + "learning_rate": 1.406218471430586e-06, + "loss": 0.0009, + "step": 95310 + }, + { + "epoch": 1.559682565654913, + "grad_norm": 0.040642041712999344, + "learning_rate": 1.4052258492265398e-06, + "loss": 0.0009, + "step": 95320 + }, + { + "epoch": 1.5598461916059887, + "grad_norm": 0.06850877404212952, + "learning_rate": 1.4042335202000445e-06, + "loss": 0.0011, + "step": 95330 + }, + { + "epoch": 1.5600098175570647, + "grad_norm": 0.05043303593993187, + "learning_rate": 1.403241484432028e-06, + "loss": 0.0014, + "step": 95340 + }, + { + "epoch": 1.5601734435081402, + "grad_norm": 0.07418040186166763, + "learning_rate": 1.402249742003401e-06, + "loss": 0.0019, + "step": 95350 + }, + { + "epoch": 1.5603370694592162, + "grad_norm": 0.1309330016374588, + "learning_rate": 1.401258292995042e-06, + "loss": 0.001, + "step": 95360 + }, + { + "epoch": 1.5605006954102922, + "grad_norm": 0.004077328834682703, + "learning_rate": 1.4002671374878146e-06, + "loss": 0.0011, + "step": 95370 + }, + { + "epoch": 1.5606643213613678, + "grad_norm": 0.055469583719968796, + "learning_rate": 1.3992762755625515e-06, + "loss": 0.0013, + "step": 95380 + }, + { + "epoch": 1.5608279473124438, + "grad_norm": 0.020231179893016815, + "learning_rate": 1.398285707300064e-06, + "loss": 0.0003, + "step": 95390 + }, + { + "epoch": 1.5609915732635196, + "grad_norm": 0.02965341880917549, + "learning_rate": 1.397295432781141e-06, + "loss": 0.0005, + "step": 95400 + }, + { + "epoch": 1.5611551992145953, + "grad_norm": 0.007162534631788731, + "learning_rate": 1.3963054520865439e-06, + "loss": 0.0015, + "step": 95410 + }, + { + "epoch": 1.5613188251656713, + "grad_norm": 0.14514701068401337, + "learning_rate": 1.395315765297015e-06, + "loss": 0.0011, + "step": 95420 + }, + { + "epoch": 1.561482451116747, + "grad_norm": 0.09444594383239746, + "learning_rate": 1.394326372493266e-06, + "loss": 0.0012, + "step": 95430 + }, + { + "epoch": 1.5616460770678229, + "grad_norm": 0.19214867055416107, + "learning_rate": 1.3933372737559924e-06, + "loss": 0.0013, + "step": 95440 + }, + { + "epoch": 1.5618097030188989, + "grad_norm": 0.0075097642838954926, + "learning_rate": 1.3923484691658583e-06, + "loss": 0.0009, + "step": 95450 + }, + { + "epoch": 1.5619733289699747, + "grad_norm": 0.011223035864531994, + "learning_rate": 1.3913599588035098e-06, + "loss": 0.0007, + "step": 95460 + }, + { + "epoch": 1.5621369549210504, + "grad_norm": 0.019771186634898186, + "learning_rate": 1.3903717427495645e-06, + "loss": 0.0008, + "step": 95470 + }, + { + "epoch": 1.5623005808721264, + "grad_norm": 0.06696370244026184, + "learning_rate": 1.38938382108462e-06, + "loss": 0.0008, + "step": 95480 + }, + { + "epoch": 1.5624642068232022, + "grad_norm": 0.027209773659706116, + "learning_rate": 1.3883961938892454e-06, + "loss": 0.0013, + "step": 95490 + }, + { + "epoch": 1.562627832774278, + "grad_norm": 0.016985386610031128, + "learning_rate": 1.3874088612439911e-06, + "loss": 0.0012, + "step": 95500 + }, + { + "epoch": 1.562791458725354, + "grad_norm": 0.05902815982699394, + "learning_rate": 1.386421823229377e-06, + "loss": 0.0014, + "step": 95510 + }, + { + "epoch": 1.5629550846764295, + "grad_norm": 0.0026475745253264904, + "learning_rate": 1.3854350799259065e-06, + "loss": 0.0007, + "step": 95520 + }, + { + "epoch": 1.5631187106275055, + "grad_norm": 0.011088100261986256, + "learning_rate": 1.3844486314140515e-06, + "loss": 0.0029, + "step": 95530 + }, + { + "epoch": 1.5632823365785815, + "grad_norm": 0.03482355177402496, + "learning_rate": 1.3834624777742662e-06, + "loss": 0.0011, + "step": 95540 + }, + { + "epoch": 1.563445962529657, + "grad_norm": 0.018787400797009468, + "learning_rate": 1.3824766190869753e-06, + "loss": 0.0006, + "step": 95550 + }, + { + "epoch": 1.563609588480733, + "grad_norm": 0.040789831429719925, + "learning_rate": 1.381491055432585e-06, + "loss": 0.001, + "step": 95560 + }, + { + "epoch": 1.5637732144318088, + "grad_norm": 0.014312353916466236, + "learning_rate": 1.3805057868914712e-06, + "loss": 0.0005, + "step": 95570 + }, + { + "epoch": 1.5639368403828846, + "grad_norm": 0.03847702592611313, + "learning_rate": 1.379520813543993e-06, + "loss": 0.0006, + "step": 95580 + }, + { + "epoch": 1.5641004663339606, + "grad_norm": 0.0028449888341128826, + "learning_rate": 1.3785361354704774e-06, + "loss": 0.0008, + "step": 95590 + }, + { + "epoch": 1.5642640922850364, + "grad_norm": 0.018051045015454292, + "learning_rate": 1.377551752751235e-06, + "loss": 0.0015, + "step": 95600 + }, + { + "epoch": 1.5644277182361122, + "grad_norm": 0.02913535386323929, + "learning_rate": 1.376567665466545e-06, + "loss": 0.0005, + "step": 95610 + }, + { + "epoch": 1.5645913441871881, + "grad_norm": 0.043097734451293945, + "learning_rate": 1.3755838736966703e-06, + "loss": 0.0008, + "step": 95620 + }, + { + "epoch": 1.564754970138264, + "grad_norm": 0.011552899144589901, + "learning_rate": 1.3746003775218415e-06, + "loss": 0.001, + "step": 95630 + }, + { + "epoch": 1.5649185960893397, + "grad_norm": 0.0521833561360836, + "learning_rate": 1.3736171770222734e-06, + "loss": 0.0009, + "step": 95640 + }, + { + "epoch": 1.5650822220404157, + "grad_norm": 0.04688671603798866, + "learning_rate": 1.3726342722781483e-06, + "loss": 0.0013, + "step": 95650 + }, + { + "epoch": 1.5652458479914915, + "grad_norm": 0.012001353316009045, + "learning_rate": 1.3716516633696319e-06, + "loss": 0.0013, + "step": 95660 + }, + { + "epoch": 1.5654094739425672, + "grad_norm": 0.04682119935750961, + "learning_rate": 1.3706693503768603e-06, + "loss": 0.0009, + "step": 95670 + }, + { + "epoch": 1.5655730998936432, + "grad_norm": 0.007234207820147276, + "learning_rate": 1.3696873333799499e-06, + "loss": 0.0011, + "step": 95680 + }, + { + "epoch": 1.565736725844719, + "grad_norm": 0.05061132460832596, + "learning_rate": 1.3687056124589887e-06, + "loss": 0.0017, + "step": 95690 + }, + { + "epoch": 1.5659003517957948, + "grad_norm": 0.08397451788187027, + "learning_rate": 1.3677241876940417e-06, + "loss": 0.001, + "step": 95700 + }, + { + "epoch": 1.5660639777468708, + "grad_norm": 0.07349889725446701, + "learning_rate": 1.3667430591651532e-06, + "loss": 0.0009, + "step": 95710 + }, + { + "epoch": 1.5662276036979463, + "grad_norm": 0.022956345230340958, + "learning_rate": 1.3657622269523385e-06, + "loss": 0.0014, + "step": 95720 + }, + { + "epoch": 1.5663912296490223, + "grad_norm": 0.0800904855132103, + "learning_rate": 1.3647816911355926e-06, + "loss": 0.0014, + "step": 95730 + }, + { + "epoch": 1.5665548556000983, + "grad_norm": 0.044382184743881226, + "learning_rate": 1.3638014517948829e-06, + "loss": 0.0007, + "step": 95740 + }, + { + "epoch": 1.5667184815511739, + "grad_norm": 0.03818973898887634, + "learning_rate": 1.3628215090101564e-06, + "loss": 0.0009, + "step": 95750 + }, + { + "epoch": 1.5668821075022499, + "grad_norm": 0.044001106172800064, + "learning_rate": 1.3618418628613318e-06, + "loss": 0.0008, + "step": 95760 + }, + { + "epoch": 1.5670457334533257, + "grad_norm": 0.005132677964866161, + "learning_rate": 1.3608625134283082e-06, + "loss": 0.0009, + "step": 95770 + }, + { + "epoch": 1.5672093594044014, + "grad_norm": 0.07564137130975723, + "learning_rate": 1.359883460790955e-06, + "loss": 0.0016, + "step": 95780 + }, + { + "epoch": 1.5673729853554774, + "grad_norm": 0.055294036865234375, + "learning_rate": 1.3589047050291238e-06, + "loss": 0.0013, + "step": 95790 + }, + { + "epoch": 1.5675366113065532, + "grad_norm": 0.022641586139798164, + "learning_rate": 1.357926246222635e-06, + "loss": 0.0007, + "step": 95800 + }, + { + "epoch": 1.567700237257629, + "grad_norm": 0.008204330690205097, + "learning_rate": 1.3569480844512918e-06, + "loss": 0.0011, + "step": 95810 + }, + { + "epoch": 1.567863863208705, + "grad_norm": 0.08772075921297073, + "learning_rate": 1.355970219794866e-06, + "loss": 0.001, + "step": 95820 + }, + { + "epoch": 1.5680274891597807, + "grad_norm": 0.005941058974713087, + "learning_rate": 1.354992652333113e-06, + "loss": 0.0013, + "step": 95830 + }, + { + "epoch": 1.5681911151108565, + "grad_norm": 0.044869646430015564, + "learning_rate": 1.3540153821457563e-06, + "loss": 0.0009, + "step": 95840 + }, + { + "epoch": 1.5683547410619325, + "grad_norm": 0.03574323654174805, + "learning_rate": 1.3530384093125021e-06, + "loss": 0.0022, + "step": 95850 + }, + { + "epoch": 1.5685183670130083, + "grad_norm": 0.04942172020673752, + "learning_rate": 1.3520617339130254e-06, + "loss": 0.0015, + "step": 95860 + }, + { + "epoch": 1.568681992964084, + "grad_norm": 0.05153922364115715, + "learning_rate": 1.3510853560269838e-06, + "loss": 0.0008, + "step": 95870 + }, + { + "epoch": 1.56884561891516, + "grad_norm": 0.0160396546125412, + "learning_rate": 1.3501092757340045e-06, + "loss": 0.0012, + "step": 95880 + }, + { + "epoch": 1.5690092448662358, + "grad_norm": 0.035457707941532135, + "learning_rate": 1.3491334931136962e-06, + "loss": 0.0005, + "step": 95890 + }, + { + "epoch": 1.5691728708173116, + "grad_norm": 0.09537525475025177, + "learning_rate": 1.3481580082456374e-06, + "loss": 0.0012, + "step": 95900 + }, + { + "epoch": 1.5693364967683876, + "grad_norm": 0.048981811851263046, + "learning_rate": 1.3471828212093885e-06, + "loss": 0.0011, + "step": 95910 + }, + { + "epoch": 1.5695001227194632, + "grad_norm": 0.07334478199481964, + "learning_rate": 1.346207932084479e-06, + "loss": 0.0008, + "step": 95920 + }, + { + "epoch": 1.5696637486705391, + "grad_norm": 0.051195401698350906, + "learning_rate": 1.3452333409504204e-06, + "loss": 0.0014, + "step": 95930 + }, + { + "epoch": 1.5698273746216151, + "grad_norm": 0.025683952495455742, + "learning_rate": 1.3442590478866952e-06, + "loss": 0.0012, + "step": 95940 + }, + { + "epoch": 1.5699910005726907, + "grad_norm": 0.008572527207434177, + "learning_rate": 1.3432850529727647e-06, + "loss": 0.0006, + "step": 95950 + }, + { + "epoch": 1.5701546265237667, + "grad_norm": 0.10819711536169052, + "learning_rate": 1.3423113562880636e-06, + "loss": 0.0013, + "step": 95960 + }, + { + "epoch": 1.5703182524748425, + "grad_norm": 0.0014823760138824582, + "learning_rate": 1.3413379579120045e-06, + "loss": 0.0016, + "step": 95970 + }, + { + "epoch": 1.5704818784259182, + "grad_norm": 0.042564619332551956, + "learning_rate": 1.3403648579239726e-06, + "loss": 0.001, + "step": 95980 + }, + { + "epoch": 1.5706455043769942, + "grad_norm": 0.04699201136827469, + "learning_rate": 1.339392056403333e-06, + "loss": 0.0004, + "step": 95990 + }, + { + "epoch": 1.57080913032807, + "grad_norm": 0.07357124239206314, + "learning_rate": 1.3384195534294214e-06, + "loss": 0.0009, + "step": 96000 + }, + { + "epoch": 1.5709727562791458, + "grad_norm": 0.0437801368534565, + "learning_rate": 1.3374473490815549e-06, + "loss": 0.0017, + "step": 96010 + }, + { + "epoch": 1.5711363822302218, + "grad_norm": 0.07364551723003387, + "learning_rate": 1.3364754434390214e-06, + "loss": 0.0016, + "step": 96020 + }, + { + "epoch": 1.5713000081812976, + "grad_norm": 0.015064758248627186, + "learning_rate": 1.3355038365810846e-06, + "loss": 0.0022, + "step": 96030 + }, + { + "epoch": 1.5714636341323733, + "grad_norm": 0.007456564344465733, + "learning_rate": 1.3345325285869893e-06, + "loss": 0.0011, + "step": 96040 + }, + { + "epoch": 1.5716272600834493, + "grad_norm": 0.03210440278053284, + "learning_rate": 1.3335615195359481e-06, + "loss": 0.0007, + "step": 96050 + }, + { + "epoch": 1.571790886034525, + "grad_norm": 0.029707500711083412, + "learning_rate": 1.3325908095071565e-06, + "loss": 0.0005, + "step": 96060 + }, + { + "epoch": 1.5719545119856009, + "grad_norm": 0.008204796351492405, + "learning_rate": 1.3316203985797794e-06, + "loss": 0.001, + "step": 96070 + }, + { + "epoch": 1.5721181379366769, + "grad_norm": 0.1412242203950882, + "learning_rate": 1.3306502868329634e-06, + "loss": 0.0021, + "step": 96080 + }, + { + "epoch": 1.5722817638877526, + "grad_norm": 0.04805987328290939, + "learning_rate": 1.3296804743458241e-06, + "loss": 0.0009, + "step": 96090 + }, + { + "epoch": 1.5724453898388284, + "grad_norm": 0.0050872149877250195, + "learning_rate": 1.3287109611974592e-06, + "loss": 0.0003, + "step": 96100 + }, + { + "epoch": 1.5726090157899044, + "grad_norm": 0.0342746302485466, + "learning_rate": 1.3277417474669364e-06, + "loss": 0.001, + "step": 96110 + }, + { + "epoch": 1.57277264174098, + "grad_norm": 0.05099215731024742, + "learning_rate": 1.3267728332333035e-06, + "loss": 0.0005, + "step": 96120 + }, + { + "epoch": 1.572936267692056, + "grad_norm": 0.07273607701063156, + "learning_rate": 1.3258042185755798e-06, + "loss": 0.0015, + "step": 96130 + }, + { + "epoch": 1.573099893643132, + "grad_norm": 0.04864228516817093, + "learning_rate": 1.3248359035727649e-06, + "loss": 0.001, + "step": 96140 + }, + { + "epoch": 1.5732635195942075, + "grad_norm": 0.00710119167342782, + "learning_rate": 1.323867888303828e-06, + "loss": 0.0005, + "step": 96150 + }, + { + "epoch": 1.5734271455452835, + "grad_norm": 0.051200784742832184, + "learning_rate": 1.3229001728477204e-06, + "loss": 0.0017, + "step": 96160 + }, + { + "epoch": 1.5735907714963593, + "grad_norm": 0.04332570731639862, + "learning_rate": 1.321932757283363e-06, + "loss": 0.0009, + "step": 96170 + }, + { + "epoch": 1.573754397447435, + "grad_norm": 0.10078773647546768, + "learning_rate": 1.320965641689657e-06, + "loss": 0.0012, + "step": 96180 + }, + { + "epoch": 1.573918023398511, + "grad_norm": 0.01057474035769701, + "learning_rate": 1.319998826145475e-06, + "loss": 0.001, + "step": 96190 + }, + { + "epoch": 1.5740816493495868, + "grad_norm": 0.052918098866939545, + "learning_rate": 1.3190323107296692e-06, + "loss": 0.0019, + "step": 96200 + }, + { + "epoch": 1.5742452753006626, + "grad_norm": 0.0329354852437973, + "learning_rate": 1.3180660955210638e-06, + "loss": 0.0011, + "step": 96210 + }, + { + "epoch": 1.5744089012517386, + "grad_norm": 0.02556995116174221, + "learning_rate": 1.3171001805984613e-06, + "loss": 0.0011, + "step": 96220 + }, + { + "epoch": 1.5745725272028144, + "grad_norm": 0.03863323852419853, + "learning_rate": 1.3161345660406366e-06, + "loss": 0.0011, + "step": 96230 + }, + { + "epoch": 1.5747361531538902, + "grad_norm": 0.032791465520858765, + "learning_rate": 1.3151692519263447e-06, + "loss": 0.0012, + "step": 96240 + }, + { + "epoch": 1.5748997791049661, + "grad_norm": 0.03509007394313812, + "learning_rate": 1.3142042383343101e-06, + "loss": 0.0008, + "step": 96250 + }, + { + "epoch": 1.575063405056042, + "grad_norm": 0.014239206910133362, + "learning_rate": 1.3132395253432395e-06, + "loss": 0.0012, + "step": 96260 + }, + { + "epoch": 1.5752270310071177, + "grad_norm": 0.051493994891643524, + "learning_rate": 1.312275113031808e-06, + "loss": 0.0008, + "step": 96270 + }, + { + "epoch": 1.5753906569581937, + "grad_norm": 0.03945068269968033, + "learning_rate": 1.3113110014786728e-06, + "loss": 0.0011, + "step": 96280 + }, + { + "epoch": 1.5755542829092692, + "grad_norm": 0.0906679630279541, + "learning_rate": 1.310347190762461e-06, + "loss": 0.0015, + "step": 96290 + }, + { + "epoch": 1.5757179088603452, + "grad_norm": 0.05404764041304588, + "learning_rate": 1.3093836809617811e-06, + "loss": 0.0013, + "step": 96300 + }, + { + "epoch": 1.5758815348114212, + "grad_norm": 0.026969052851200104, + "learning_rate": 1.3084204721552097e-06, + "loss": 0.0009, + "step": 96310 + }, + { + "epoch": 1.5760451607624968, + "grad_norm": 0.03980583697557449, + "learning_rate": 1.3074575644213061e-06, + "loss": 0.0027, + "step": 96320 + }, + { + "epoch": 1.5762087867135728, + "grad_norm": 0.035800158977508545, + "learning_rate": 1.3064949578385988e-06, + "loss": 0.0004, + "step": 96330 + }, + { + "epoch": 1.5763724126646486, + "grad_norm": 0.034289948642253876, + "learning_rate": 1.3055326524855982e-06, + "loss": 0.0008, + "step": 96340 + }, + { + "epoch": 1.5765360386157243, + "grad_norm": 0.025562861934304237, + "learning_rate": 1.3045706484407843e-06, + "loss": 0.001, + "step": 96350 + }, + { + "epoch": 1.5766996645668003, + "grad_norm": 0.030591795220971107, + "learning_rate": 1.3036089457826144e-06, + "loss": 0.0008, + "step": 96360 + }, + { + "epoch": 1.576863290517876, + "grad_norm": 0.013254762627184391, + "learning_rate": 1.3026475445895243e-06, + "loss": 0.0017, + "step": 96370 + }, + { + "epoch": 1.5770269164689519, + "grad_norm": 0.06676498055458069, + "learning_rate": 1.3016864449399191e-06, + "loss": 0.0018, + "step": 96380 + }, + { + "epoch": 1.5771905424200279, + "grad_norm": 0.04152214527130127, + "learning_rate": 1.300725646912186e-06, + "loss": 0.0006, + "step": 96390 + }, + { + "epoch": 1.5773541683711036, + "grad_norm": 0.06061022728681564, + "learning_rate": 1.299765150584682e-06, + "loss": 0.0013, + "step": 96400 + }, + { + "epoch": 1.5775177943221794, + "grad_norm": 0.08126376569271088, + "learning_rate": 1.2988049560357447e-06, + "loss": 0.0012, + "step": 96410 + }, + { + "epoch": 1.5776814202732554, + "grad_norm": 0.07870087027549744, + "learning_rate": 1.2978450633436806e-06, + "loss": 0.0015, + "step": 96420 + }, + { + "epoch": 1.5778450462243312, + "grad_norm": 0.02038705348968506, + "learning_rate": 1.2968854725867792e-06, + "loss": 0.0036, + "step": 96430 + }, + { + "epoch": 1.578008672175407, + "grad_norm": 0.035232771188020706, + "learning_rate": 1.295926183843298e-06, + "loss": 0.001, + "step": 96440 + }, + { + "epoch": 1.578172298126483, + "grad_norm": 0.005245399195700884, + "learning_rate": 1.2949671971914762e-06, + "loss": 0.0006, + "step": 96450 + }, + { + "epoch": 1.5783359240775587, + "grad_norm": 0.09929900616407394, + "learning_rate": 1.294008512709523e-06, + "loss": 0.001, + "step": 96460 + }, + { + "epoch": 1.5784995500286345, + "grad_norm": 0.02546124905347824, + "learning_rate": 1.2930501304756277e-06, + "loss": 0.0014, + "step": 96470 + }, + { + "epoch": 1.5786631759797105, + "grad_norm": 0.045356739312410355, + "learning_rate": 1.2920920505679507e-06, + "loss": 0.0009, + "step": 96480 + }, + { + "epoch": 1.578826801930786, + "grad_norm": 0.08393066376447678, + "learning_rate": 1.291134273064632e-06, + "loss": 0.0015, + "step": 96490 + }, + { + "epoch": 1.578990427881862, + "grad_norm": 0.03120879828929901, + "learning_rate": 1.2901767980437813e-06, + "loss": 0.0006, + "step": 96500 + }, + { + "epoch": 1.579154053832938, + "grad_norm": 0.03244754672050476, + "learning_rate": 1.289219625583491e-06, + "loss": 0.0008, + "step": 96510 + }, + { + "epoch": 1.5793176797840136, + "grad_norm": 0.03129557892680168, + "learning_rate": 1.2882627557618215e-06, + "loss": 0.001, + "step": 96520 + }, + { + "epoch": 1.5794813057350896, + "grad_norm": 0.04987405985593796, + "learning_rate": 1.2873061886568145e-06, + "loss": 0.001, + "step": 96530 + }, + { + "epoch": 1.5796449316861654, + "grad_norm": 0.04315370321273804, + "learning_rate": 1.2863499243464817e-06, + "loss": 0.0016, + "step": 96540 + }, + { + "epoch": 1.5798085576372412, + "grad_norm": 0.05131954699754715, + "learning_rate": 1.2853939629088152e-06, + "loss": 0.0009, + "step": 96550 + }, + { + "epoch": 1.5799721835883171, + "grad_norm": 0.049112603068351746, + "learning_rate": 1.284438304421778e-06, + "loss": 0.0005, + "step": 96560 + }, + { + "epoch": 1.580135809539393, + "grad_norm": 0.1093360036611557, + "learning_rate": 1.2834829489633126e-06, + "loss": 0.0008, + "step": 96570 + }, + { + "epoch": 1.5802994354904687, + "grad_norm": 0.0360153429210186, + "learning_rate": 1.2825278966113314e-06, + "loss": 0.0006, + "step": 96580 + }, + { + "epoch": 1.5804630614415447, + "grad_norm": 0.05481434240937233, + "learning_rate": 1.2815731474437288e-06, + "loss": 0.0012, + "step": 96590 + }, + { + "epoch": 1.5806266873926205, + "grad_norm": 0.07745036482810974, + "learning_rate": 1.2806187015383676e-06, + "loss": 0.0004, + "step": 96600 + }, + { + "epoch": 1.5807903133436962, + "grad_norm": 0.10974860936403275, + "learning_rate": 1.2796645589730922e-06, + "loss": 0.001, + "step": 96610 + }, + { + "epoch": 1.5809539392947722, + "grad_norm": 0.041746024042367935, + "learning_rate": 1.2787107198257158e-06, + "loss": 0.0008, + "step": 96620 + }, + { + "epoch": 1.581117565245848, + "grad_norm": 0.034387823194265366, + "learning_rate": 1.2777571841740343e-06, + "loss": 0.0007, + "step": 96630 + }, + { + "epoch": 1.5812811911969238, + "grad_norm": 0.06622235476970673, + "learning_rate": 1.2768039520958108e-06, + "loss": 0.0008, + "step": 96640 + }, + { + "epoch": 1.5814448171479998, + "grad_norm": 0.008857461623847485, + "learning_rate": 1.275851023668791e-06, + "loss": 0.0007, + "step": 96650 + }, + { + "epoch": 1.5816084430990756, + "grad_norm": 0.026604799553751945, + "learning_rate": 1.2748983989706915e-06, + "loss": 0.0007, + "step": 96660 + }, + { + "epoch": 1.5817720690501513, + "grad_norm": 0.09414137899875641, + "learning_rate": 1.273946078079203e-06, + "loss": 0.001, + "step": 96670 + }, + { + "epoch": 1.5819356950012273, + "grad_norm": 0.015447148121893406, + "learning_rate": 1.2729940610719966e-06, + "loss": 0.0015, + "step": 96680 + }, + { + "epoch": 1.5820993209523029, + "grad_norm": 0.04328058287501335, + "learning_rate": 1.2720423480267124e-06, + "loss": 0.0022, + "step": 96690 + }, + { + "epoch": 1.5822629469033789, + "grad_norm": 0.06319353729486465, + "learning_rate": 1.271090939020973e-06, + "loss": 0.001, + "step": 96700 + }, + { + "epoch": 1.5824265728544549, + "grad_norm": 0.03200283646583557, + "learning_rate": 1.270139834132368e-06, + "loss": 0.0007, + "step": 96710 + }, + { + "epoch": 1.5825901988055304, + "grad_norm": 0.06690200418233871, + "learning_rate": 1.2691890334384687e-06, + "loss": 0.0021, + "step": 96720 + }, + { + "epoch": 1.5827538247566064, + "grad_norm": 0.09653203189373016, + "learning_rate": 1.2682385370168182e-06, + "loss": 0.0007, + "step": 96730 + }, + { + "epoch": 1.5829174507076822, + "grad_norm": 0.12001989781856537, + "learning_rate": 1.2672883449449368e-06, + "loss": 0.0011, + "step": 96740 + }, + { + "epoch": 1.583081076658758, + "grad_norm": 0.09170501679182053, + "learning_rate": 1.2663384573003167e-06, + "loss": 0.0005, + "step": 96750 + }, + { + "epoch": 1.583244702609834, + "grad_norm": 0.017824146896600723, + "learning_rate": 1.2653888741604309e-06, + "loss": 0.0009, + "step": 96760 + }, + { + "epoch": 1.5834083285609097, + "grad_norm": 0.07801946252584457, + "learning_rate": 1.2644395956027206e-06, + "loss": 0.0007, + "step": 96770 + }, + { + "epoch": 1.5835719545119855, + "grad_norm": 0.05510590597987175, + "learning_rate": 1.2634906217046088e-06, + "loss": 0.0006, + "step": 96780 + }, + { + "epoch": 1.5837355804630615, + "grad_norm": 0.004306759685277939, + "learning_rate": 1.2625419525434878e-06, + "loss": 0.0007, + "step": 96790 + }, + { + "epoch": 1.5838992064141373, + "grad_norm": 0.04925316572189331, + "learning_rate": 1.2615935881967312e-06, + "loss": 0.0008, + "step": 96800 + }, + { + "epoch": 1.584062832365213, + "grad_norm": 0.0014622610760852695, + "learning_rate": 1.2606455287416803e-06, + "loss": 0.0009, + "step": 96810 + }, + { + "epoch": 1.584226458316289, + "grad_norm": 0.02716750092804432, + "learning_rate": 1.2596977742556593e-06, + "loss": 0.0011, + "step": 96820 + }, + { + "epoch": 1.5843900842673648, + "grad_norm": 0.04361797496676445, + "learning_rate": 1.258750324815961e-06, + "loss": 0.0005, + "step": 96830 + }, + { + "epoch": 1.5845537102184406, + "grad_norm": 0.025874778628349304, + "learning_rate": 1.2578031804998586e-06, + "loss": 0.0011, + "step": 96840 + }, + { + "epoch": 1.5847173361695166, + "grad_norm": 0.014456517063081264, + "learning_rate": 1.2568563413845952e-06, + "loss": 0.0008, + "step": 96850 + }, + { + "epoch": 1.5848809621205924, + "grad_norm": 0.04577498510479927, + "learning_rate": 1.2559098075473946e-06, + "loss": 0.0005, + "step": 96860 + }, + { + "epoch": 1.5850445880716681, + "grad_norm": 0.08066362887620926, + "learning_rate": 1.2549635790654508e-06, + "loss": 0.001, + "step": 96870 + }, + { + "epoch": 1.5852082140227441, + "grad_norm": 0.08461767435073853, + "learning_rate": 1.2540176560159368e-06, + "loss": 0.0008, + "step": 96880 + }, + { + "epoch": 1.5853718399738197, + "grad_norm": 0.02184385247528553, + "learning_rate": 1.2530720384759964e-06, + "loss": 0.0009, + "step": 96890 + }, + { + "epoch": 1.5855354659248957, + "grad_norm": 0.04110664501786232, + "learning_rate": 1.2521267265227538e-06, + "loss": 0.0006, + "step": 96900 + }, + { + "epoch": 1.5856990918759717, + "grad_norm": 0.0821894183754921, + "learning_rate": 1.2511817202333027e-06, + "loss": 0.0008, + "step": 96910 + }, + { + "epoch": 1.5858627178270472, + "grad_norm": 0.022542444989085197, + "learning_rate": 1.2502370196847175e-06, + "loss": 0.0005, + "step": 96920 + }, + { + "epoch": 1.5860263437781232, + "grad_norm": 0.050582028925418854, + "learning_rate": 1.2492926249540416e-06, + "loss": 0.0014, + "step": 96930 + }, + { + "epoch": 1.586189969729199, + "grad_norm": 0.05709616467356682, + "learning_rate": 1.2483485361182994e-06, + "loss": 0.0007, + "step": 96940 + }, + { + "epoch": 1.5863535956802748, + "grad_norm": 0.06084844842553139, + "learning_rate": 1.2474047532544852e-06, + "loss": 0.0006, + "step": 96950 + }, + { + "epoch": 1.5865172216313508, + "grad_norm": 0.03821864724159241, + "learning_rate": 1.2464612764395734e-06, + "loss": 0.0005, + "step": 96960 + }, + { + "epoch": 1.5866808475824266, + "grad_norm": 0.02189842239022255, + "learning_rate": 1.2455181057505077e-06, + "loss": 0.0011, + "step": 96970 + }, + { + "epoch": 1.5868444735335023, + "grad_norm": 0.0485713854432106, + "learning_rate": 1.2445752412642126e-06, + "loss": 0.0013, + "step": 96980 + }, + { + "epoch": 1.5870080994845783, + "grad_norm": 0.19116142392158508, + "learning_rate": 1.243632683057584e-06, + "loss": 0.0035, + "step": 96990 + }, + { + "epoch": 1.587171725435654, + "grad_norm": 0.07462039589881897, + "learning_rate": 1.2426904312074923e-06, + "loss": 0.0012, + "step": 97000 + }, + { + "epoch": 1.5873353513867299, + "grad_norm": 0.062257200479507446, + "learning_rate": 1.241748485790787e-06, + "loss": 0.0007, + "step": 97010 + }, + { + "epoch": 1.5874989773378059, + "grad_norm": 0.034018050879240036, + "learning_rate": 1.2408068468842866e-06, + "loss": 0.0011, + "step": 97020 + }, + { + "epoch": 1.5876626032888816, + "grad_norm": 0.07112865895032883, + "learning_rate": 1.239865514564792e-06, + "loss": 0.001, + "step": 97030 + }, + { + "epoch": 1.5878262292399574, + "grad_norm": 0.08996989578008652, + "learning_rate": 1.2389244889090707e-06, + "loss": 0.0008, + "step": 97040 + }, + { + "epoch": 1.5879898551910334, + "grad_norm": 0.020867759361863136, + "learning_rate": 1.237983769993874e-06, + "loss": 0.001, + "step": 97050 + }, + { + "epoch": 1.5881534811421092, + "grad_norm": 0.01002001203596592, + "learning_rate": 1.2370433578959195e-06, + "loss": 0.0012, + "step": 97060 + }, + { + "epoch": 1.588317107093185, + "grad_norm": 0.0489850752055645, + "learning_rate": 1.2361032526919076e-06, + "loss": 0.001, + "step": 97070 + }, + { + "epoch": 1.588480733044261, + "grad_norm": 0.009347106330096722, + "learning_rate": 1.2351634544585072e-06, + "loss": 0.0012, + "step": 97080 + }, + { + "epoch": 1.5886443589953365, + "grad_norm": 0.04090132564306259, + "learning_rate": 1.2342239632723679e-06, + "loss": 0.0018, + "step": 97090 + }, + { + "epoch": 1.5888079849464125, + "grad_norm": 0.03768495097756386, + "learning_rate": 1.2332847792101087e-06, + "loss": 0.0007, + "step": 97100 + }, + { + "epoch": 1.5889716108974885, + "grad_norm": 0.1349332630634308, + "learning_rate": 1.2323459023483286e-06, + "loss": 0.002, + "step": 97110 + }, + { + "epoch": 1.589135236848564, + "grad_norm": 0.06317277997732162, + "learning_rate": 1.231407332763596e-06, + "loss": 0.0007, + "step": 97120 + }, + { + "epoch": 1.58929886279964, + "grad_norm": 0.10471262037754059, + "learning_rate": 1.2304690705324618e-06, + "loss": 0.001, + "step": 97130 + }, + { + "epoch": 1.5894624887507158, + "grad_norm": 0.09951486438512802, + "learning_rate": 1.2295311157314432e-06, + "loss": 0.0014, + "step": 97140 + }, + { + "epoch": 1.5896261147017916, + "grad_norm": 0.03760366514325142, + "learning_rate": 1.2285934684370404e-06, + "loss": 0.0006, + "step": 97150 + }, + { + "epoch": 1.5897897406528676, + "grad_norm": 0.06821509450674057, + "learning_rate": 1.2276561287257211e-06, + "loss": 0.0008, + "step": 97160 + }, + { + "epoch": 1.5899533666039434, + "grad_norm": 0.023200569674372673, + "learning_rate": 1.226719096673935e-06, + "loss": 0.0013, + "step": 97170 + }, + { + "epoch": 1.5901169925550191, + "grad_norm": 0.022301355376839638, + "learning_rate": 1.2257823723581003e-06, + "loss": 0.001, + "step": 97180 + }, + { + "epoch": 1.5902806185060951, + "grad_norm": 0.07437782734632492, + "learning_rate": 1.2248459558546154e-06, + "loss": 0.001, + "step": 97190 + }, + { + "epoch": 1.590444244457171, + "grad_norm": 0.012176129035651684, + "learning_rate": 1.2239098472398487e-06, + "loss": 0.001, + "step": 97200 + }, + { + "epoch": 1.5906078704082467, + "grad_norm": 0.04271703585982323, + "learning_rate": 1.222974046590149e-06, + "loss": 0.0008, + "step": 97210 + }, + { + "epoch": 1.5907714963593227, + "grad_norm": 0.00456777960062027, + "learning_rate": 1.2220385539818341e-06, + "loss": 0.0006, + "step": 97220 + }, + { + "epoch": 1.5909351223103985, + "grad_norm": 0.04272052273154259, + "learning_rate": 1.2211033694912021e-06, + "loss": 0.0044, + "step": 97230 + }, + { + "epoch": 1.5910987482614742, + "grad_norm": 0.016012227162718773, + "learning_rate": 1.2201684931945213e-06, + "loss": 0.0007, + "step": 97240 + }, + { + "epoch": 1.5912623742125502, + "grad_norm": 0.08335350453853607, + "learning_rate": 1.2192339251680395e-06, + "loss": 0.0007, + "step": 97250 + }, + { + "epoch": 1.5914260001636258, + "grad_norm": 0.030794652178883553, + "learning_rate": 1.2182996654879742e-06, + "loss": 0.0009, + "step": 97260 + }, + { + "epoch": 1.5915896261147018, + "grad_norm": 0.058969493955373764, + "learning_rate": 1.2173657142305229e-06, + "loss": 0.0018, + "step": 97270 + }, + { + "epoch": 1.5917532520657778, + "grad_norm": 0.022355305030941963, + "learning_rate": 1.2164320714718531e-06, + "loss": 0.0005, + "step": 97280 + }, + { + "epoch": 1.5919168780168533, + "grad_norm": 0.056606002151966095, + "learning_rate": 1.2154987372881121e-06, + "loss": 0.0005, + "step": 97290 + }, + { + "epoch": 1.5920805039679293, + "grad_norm": 0.06301253288984299, + "learning_rate": 1.2145657117554166e-06, + "loss": 0.0009, + "step": 97300 + }, + { + "epoch": 1.592244129919005, + "grad_norm": 0.05343060567975044, + "learning_rate": 1.2136329949498638e-06, + "loss": 0.0011, + "step": 97310 + }, + { + "epoch": 1.5924077558700809, + "grad_norm": 0.004658139310777187, + "learning_rate": 1.2127005869475216e-06, + "loss": 0.0013, + "step": 97320 + }, + { + "epoch": 1.5925713818211569, + "grad_norm": 0.027589093893766403, + "learning_rate": 1.211768487824433e-06, + "loss": 0.0008, + "step": 97330 + }, + { + "epoch": 1.5927350077722326, + "grad_norm": 0.08682414144277573, + "learning_rate": 1.2108366976566188e-06, + "loss": 0.0016, + "step": 97340 + }, + { + "epoch": 1.5928986337233084, + "grad_norm": 0.04159991815686226, + "learning_rate": 1.209905216520071e-06, + "loss": 0.0008, + "step": 97350 + }, + { + "epoch": 1.5930622596743844, + "grad_norm": 0.002184477401897311, + "learning_rate": 1.2089740444907593e-06, + "loss": 0.0008, + "step": 97360 + }, + { + "epoch": 1.5932258856254602, + "grad_norm": 0.008638736791908741, + "learning_rate": 1.2080431816446253e-06, + "loss": 0.0003, + "step": 97370 + }, + { + "epoch": 1.593389511576536, + "grad_norm": 0.051601581275463104, + "learning_rate": 1.2071126280575895e-06, + "loss": 0.0007, + "step": 97380 + }, + { + "epoch": 1.593553137527612, + "grad_norm": 0.11896104365587234, + "learning_rate": 1.206182383805542e-06, + "loss": 0.0007, + "step": 97390 + }, + { + "epoch": 1.5937167634786877, + "grad_norm": 0.02824978157877922, + "learning_rate": 1.2052524489643525e-06, + "loss": 0.0011, + "step": 97400 + }, + { + "epoch": 1.5938803894297635, + "grad_norm": 0.05780195817351341, + "learning_rate": 1.2043228236098613e-06, + "loss": 0.0012, + "step": 97410 + }, + { + "epoch": 1.5940440153808395, + "grad_norm": 0.081435427069664, + "learning_rate": 1.2033935078178876e-06, + "loss": 0.0004, + "step": 97420 + }, + { + "epoch": 1.5942076413319153, + "grad_norm": 0.08824429661035538, + "learning_rate": 1.2024645016642216e-06, + "loss": 0.001, + "step": 97430 + }, + { + "epoch": 1.594371267282991, + "grad_norm": 0.013098395429551601, + "learning_rate": 1.2015358052246313e-06, + "loss": 0.0031, + "step": 97440 + }, + { + "epoch": 1.594534893234067, + "grad_norm": 0.0372081995010376, + "learning_rate": 1.2006074185748563e-06, + "loss": 0.0006, + "step": 97450 + }, + { + "epoch": 1.5946985191851426, + "grad_norm": 0.013786882162094116, + "learning_rate": 1.1996793417906145e-06, + "loss": 0.0009, + "step": 97460 + }, + { + "epoch": 1.5948621451362186, + "grad_norm": 0.18417958915233612, + "learning_rate": 1.1987515749475948e-06, + "loss": 0.0018, + "step": 97470 + }, + { + "epoch": 1.5950257710872946, + "grad_norm": 0.02820058912038803, + "learning_rate": 1.197824118121465e-06, + "loss": 0.0013, + "step": 97480 + }, + { + "epoch": 1.5951893970383701, + "grad_norm": 0.12244687974452972, + "learning_rate": 1.1968969713878626e-06, + "loss": 0.0007, + "step": 97490 + }, + { + "epoch": 1.5953530229894461, + "grad_norm": 0.027667613700032234, + "learning_rate": 1.1959701348224052e-06, + "loss": 0.0008, + "step": 97500 + }, + { + "epoch": 1.595516648940522, + "grad_norm": 0.03789461776614189, + "learning_rate": 1.1950436085006795e-06, + "loss": 0.0011, + "step": 97510 + }, + { + "epoch": 1.5956802748915977, + "grad_norm": 0.054429762065410614, + "learning_rate": 1.1941173924982535e-06, + "loss": 0.0009, + "step": 97520 + }, + { + "epoch": 1.5958439008426737, + "grad_norm": 0.03577103465795517, + "learning_rate": 1.1931914868906624e-06, + "loss": 0.0022, + "step": 97530 + }, + { + "epoch": 1.5960075267937495, + "grad_norm": 0.04397017881274223, + "learning_rate": 1.1922658917534236e-06, + "loss": 0.001, + "step": 97540 + }, + { + "epoch": 1.5961711527448252, + "grad_norm": 0.04712901636958122, + "learning_rate": 1.1913406071620215e-06, + "loss": 0.0019, + "step": 97550 + }, + { + "epoch": 1.5963347786959012, + "grad_norm": 0.07704362273216248, + "learning_rate": 1.1904156331919232e-06, + "loss": 0.0012, + "step": 97560 + }, + { + "epoch": 1.596498404646977, + "grad_norm": 0.10225943475961685, + "learning_rate": 1.1894909699185631e-06, + "loss": 0.0007, + "step": 97570 + }, + { + "epoch": 1.5966620305980528, + "grad_norm": 0.14104241132736206, + "learning_rate": 1.1885666174173565e-06, + "loss": 0.0011, + "step": 97580 + }, + { + "epoch": 1.5968256565491288, + "grad_norm": 0.02985711768269539, + "learning_rate": 1.1876425757636877e-06, + "loss": 0.0013, + "step": 97590 + }, + { + "epoch": 1.5969892825002046, + "grad_norm": 0.09078379720449448, + "learning_rate": 1.1867188450329209e-06, + "loss": 0.001, + "step": 97600 + }, + { + "epoch": 1.5971529084512803, + "grad_norm": 0.05256466194987297, + "learning_rate": 1.1857954253003894e-06, + "loss": 0.0018, + "step": 97610 + }, + { + "epoch": 1.5973165344023563, + "grad_norm": 0.06322584301233292, + "learning_rate": 1.1848723166414077e-06, + "loss": 0.0024, + "step": 97620 + }, + { + "epoch": 1.597480160353432, + "grad_norm": 0.06846736371517181, + "learning_rate": 1.18394951913126e-06, + "loss": 0.0009, + "step": 97630 + }, + { + "epoch": 1.5976437863045079, + "grad_norm": 0.04839754104614258, + "learning_rate": 1.1830270328452042e-06, + "loss": 0.0011, + "step": 97640 + }, + { + "epoch": 1.5978074122555839, + "grad_norm": 0.06665943562984467, + "learning_rate": 1.1821048578584787e-06, + "loss": 0.0013, + "step": 97650 + }, + { + "epoch": 1.5979710382066594, + "grad_norm": 0.0429532490670681, + "learning_rate": 1.1811829942462904e-06, + "loss": 0.0014, + "step": 97660 + }, + { + "epoch": 1.5981346641577354, + "grad_norm": 0.03213972970843315, + "learning_rate": 1.1802614420838254e-06, + "loss": 0.0009, + "step": 97670 + }, + { + "epoch": 1.5982982901088114, + "grad_norm": 0.03794856742024422, + "learning_rate": 1.1793402014462395e-06, + "loss": 0.0011, + "step": 97680 + }, + { + "epoch": 1.598461916059887, + "grad_norm": 0.012777460739016533, + "learning_rate": 1.1784192724086701e-06, + "loss": 0.0011, + "step": 97690 + }, + { + "epoch": 1.598625542010963, + "grad_norm": 0.006780116818845272, + "learning_rate": 1.177498655046221e-06, + "loss": 0.0007, + "step": 97700 + }, + { + "epoch": 1.5987891679620387, + "grad_norm": 0.06594547629356384, + "learning_rate": 1.1765783494339771e-06, + "loss": 0.0016, + "step": 97710 + }, + { + "epoch": 1.5989527939131145, + "grad_norm": 0.016113415360450745, + "learning_rate": 1.1756583556469937e-06, + "loss": 0.0004, + "step": 97720 + }, + { + "epoch": 1.5991164198641905, + "grad_norm": 0.06291721016168594, + "learning_rate": 1.174738673760305e-06, + "loss": 0.0006, + "step": 97730 + }, + { + "epoch": 1.5992800458152663, + "grad_norm": 0.02736041694879532, + "learning_rate": 1.1738193038489133e-06, + "loss": 0.0009, + "step": 97740 + }, + { + "epoch": 1.599443671766342, + "grad_norm": 0.03677744418382645, + "learning_rate": 1.172900245987803e-06, + "loss": 0.0012, + "step": 97750 + }, + { + "epoch": 1.599607297717418, + "grad_norm": 0.05457146838307381, + "learning_rate": 1.1719815002519268e-06, + "loss": 0.001, + "step": 97760 + }, + { + "epoch": 1.5997709236684938, + "grad_norm": 0.15523658692836761, + "learning_rate": 1.1710630667162165e-06, + "loss": 0.0013, + "step": 97770 + }, + { + "epoch": 1.5999345496195696, + "grad_norm": 0.03381544351577759, + "learning_rate": 1.1701449454555736e-06, + "loss": 0.0015, + "step": 97780 + }, + { + "epoch": 1.6000981755706456, + "grad_norm": 0.13160747289657593, + "learning_rate": 1.1692271365448803e-06, + "loss": 0.0009, + "step": 97790 + }, + { + "epoch": 1.6002618015217214, + "grad_norm": 0.02754007652401924, + "learning_rate": 1.168309640058987e-06, + "loss": 0.0017, + "step": 97800 + }, + { + "epoch": 1.6004254274727971, + "grad_norm": 0.08474765717983246, + "learning_rate": 1.1673924560727245e-06, + "loss": 0.0012, + "step": 97810 + }, + { + "epoch": 1.6005890534238731, + "grad_norm": 0.09033343195915222, + "learning_rate": 1.1664755846608917e-06, + "loss": 0.0009, + "step": 97820 + }, + { + "epoch": 1.600752679374949, + "grad_norm": 0.040223345160484314, + "learning_rate": 1.1655590258982691e-06, + "loss": 0.0004, + "step": 97830 + }, + { + "epoch": 1.6009163053260247, + "grad_norm": 0.06419704854488373, + "learning_rate": 1.164642779859605e-06, + "loss": 0.0011, + "step": 97840 + }, + { + "epoch": 1.6010799312771007, + "grad_norm": 0.001630942104384303, + "learning_rate": 1.1637268466196283e-06, + "loss": 0.0011, + "step": 97850 + }, + { + "epoch": 1.6012435572281762, + "grad_norm": 0.03925986588001251, + "learning_rate": 1.1628112262530367e-06, + "loss": 0.0012, + "step": 97860 + }, + { + "epoch": 1.6014071831792522, + "grad_norm": 0.04779164493083954, + "learning_rate": 1.1618959188345069e-06, + "loss": 0.001, + "step": 97870 + }, + { + "epoch": 1.6015708091303282, + "grad_norm": 0.0023507389705628157, + "learning_rate": 1.160980924438687e-06, + "loss": 0.0009, + "step": 97880 + }, + { + "epoch": 1.6017344350814038, + "grad_norm": 0.2227538377046585, + "learning_rate": 1.1600662431402028e-06, + "loss": 0.0011, + "step": 97890 + }, + { + "epoch": 1.6018980610324798, + "grad_norm": 0.01190287247300148, + "learning_rate": 1.1591518750136494e-06, + "loss": 0.0017, + "step": 97900 + }, + { + "epoch": 1.6020616869835556, + "grad_norm": 0.03589490428566933, + "learning_rate": 1.1582378201336025e-06, + "loss": 0.0008, + "step": 97910 + }, + { + "epoch": 1.6022253129346313, + "grad_norm": 0.02147943153977394, + "learning_rate": 1.157324078574607e-06, + "loss": 0.0009, + "step": 97920 + }, + { + "epoch": 1.6023889388857073, + "grad_norm": 0.08050645887851715, + "learning_rate": 1.1564106504111876e-06, + "loss": 0.0017, + "step": 97930 + }, + { + "epoch": 1.602552564836783, + "grad_norm": 0.03861328214406967, + "learning_rate": 1.1554975357178365e-06, + "loss": 0.001, + "step": 97940 + }, + { + "epoch": 1.6027161907878589, + "grad_norm": 0.02591322734951973, + "learning_rate": 1.154584734569028e-06, + "loss": 0.0007, + "step": 97950 + }, + { + "epoch": 1.6028798167389349, + "grad_norm": 0.03869979828596115, + "learning_rate": 1.1536722470392059e-06, + "loss": 0.0011, + "step": 97960 + }, + { + "epoch": 1.6030434426900106, + "grad_norm": 0.054462533444166183, + "learning_rate": 1.1527600732027866e-06, + "loss": 0.0007, + "step": 97970 + }, + { + "epoch": 1.6032070686410864, + "grad_norm": 0.08680848032236099, + "learning_rate": 1.1518482131341685e-06, + "loss": 0.0009, + "step": 97980 + }, + { + "epoch": 1.6033706945921624, + "grad_norm": 0.04948004335165024, + "learning_rate": 1.1509366669077155e-06, + "loss": 0.0021, + "step": 97990 + }, + { + "epoch": 1.6035343205432382, + "grad_norm": 0.11467420309782028, + "learning_rate": 1.150025434597774e-06, + "loss": 0.0011, + "step": 98000 + }, + { + "epoch": 1.603697946494314, + "grad_norm": 0.023892199620604515, + "learning_rate": 1.1491145162786582e-06, + "loss": 0.0007, + "step": 98010 + }, + { + "epoch": 1.60386157244539, + "grad_norm": 0.05300400033593178, + "learning_rate": 1.1482039120246612e-06, + "loss": 0.0021, + "step": 98020 + }, + { + "epoch": 1.6040251983964657, + "grad_norm": 0.03293321654200554, + "learning_rate": 1.1472936219100478e-06, + "loss": 0.0005, + "step": 98030 + }, + { + "epoch": 1.6041888243475415, + "grad_norm": 0.05904914438724518, + "learning_rate": 1.1463836460090594e-06, + "loss": 0.0012, + "step": 98040 + }, + { + "epoch": 1.6043524502986175, + "grad_norm": 0.02522321231663227, + "learning_rate": 1.1454739843959084e-06, + "loss": 0.0006, + "step": 98050 + }, + { + "epoch": 1.604516076249693, + "grad_norm": 0.0607261024415493, + "learning_rate": 1.144564637144786e-06, + "loss": 0.0006, + "step": 98060 + }, + { + "epoch": 1.604679702200769, + "grad_norm": 0.07843679189682007, + "learning_rate": 1.1436556043298535e-06, + "loss": 0.0014, + "step": 98070 + }, + { + "epoch": 1.6048433281518448, + "grad_norm": 0.020113442093133926, + "learning_rate": 1.142746886025251e-06, + "loss": 0.0007, + "step": 98080 + }, + { + "epoch": 1.6050069541029206, + "grad_norm": 0.04199831560254097, + "learning_rate": 1.1418384823050871e-06, + "loss": 0.0013, + "step": 98090 + }, + { + "epoch": 1.6051705800539966, + "grad_norm": 0.00481654005125165, + "learning_rate": 1.1409303932434519e-06, + "loss": 0.001, + "step": 98100 + }, + { + "epoch": 1.6053342060050724, + "grad_norm": 0.02471465989947319, + "learning_rate": 1.1400226189144025e-06, + "loss": 0.0014, + "step": 98110 + }, + { + "epoch": 1.6054978319561481, + "grad_norm": 0.056144922971725464, + "learning_rate": 1.1391151593919768e-06, + "loss": 0.001, + "step": 98120 + }, + { + "epoch": 1.6056614579072241, + "grad_norm": 0.020133955404162407, + "learning_rate": 1.1382080147501817e-06, + "loss": 0.0013, + "step": 98130 + }, + { + "epoch": 1.6058250838583, + "grad_norm": 0.009174136444926262, + "learning_rate": 1.1373011850630034e-06, + "loss": 0.001, + "step": 98140 + }, + { + "epoch": 1.6059887098093757, + "grad_norm": 0.02165425568819046, + "learning_rate": 1.1363946704043966e-06, + "loss": 0.0015, + "step": 98150 + }, + { + "epoch": 1.6061523357604517, + "grad_norm": 0.03630847483873367, + "learning_rate": 1.1354884708482972e-06, + "loss": 0.0012, + "step": 98160 + }, + { + "epoch": 1.6063159617115275, + "grad_norm": 0.05408246070146561, + "learning_rate": 1.1345825864686083e-06, + "loss": 0.0011, + "step": 98170 + }, + { + "epoch": 1.6064795876626032, + "grad_norm": 0.03296400234103203, + "learning_rate": 1.1336770173392136e-06, + "loss": 0.0009, + "step": 98180 + }, + { + "epoch": 1.6066432136136792, + "grad_norm": 0.06882283836603165, + "learning_rate": 1.1327717635339657e-06, + "loss": 0.0008, + "step": 98190 + }, + { + "epoch": 1.606806839564755, + "grad_norm": 0.015450521372258663, + "learning_rate": 1.1318668251266972e-06, + "loss": 0.0008, + "step": 98200 + }, + { + "epoch": 1.6069704655158308, + "grad_norm": 0.14532433450222015, + "learning_rate": 1.1309622021912081e-06, + "loss": 0.0019, + "step": 98210 + }, + { + "epoch": 1.6071340914669068, + "grad_norm": 0.04961596801877022, + "learning_rate": 1.1300578948012798e-06, + "loss": 0.0003, + "step": 98220 + }, + { + "epoch": 1.6072977174179823, + "grad_norm": 0.060220684856176376, + "learning_rate": 1.1291539030306608e-06, + "loss": 0.0008, + "step": 98230 + }, + { + "epoch": 1.6074613433690583, + "grad_norm": 0.06398151069879532, + "learning_rate": 1.128250226953082e-06, + "loss": 0.0007, + "step": 98240 + }, + { + "epoch": 1.6076249693201343, + "grad_norm": 0.09978048503398895, + "learning_rate": 1.1273468666422398e-06, + "loss": 0.0011, + "step": 98250 + }, + { + "epoch": 1.6077885952712099, + "grad_norm": 0.048443857580423355, + "learning_rate": 1.1264438221718133e-06, + "loss": 0.0015, + "step": 98260 + }, + { + "epoch": 1.6079522212222859, + "grad_norm": 0.01663028448820114, + "learning_rate": 1.1255410936154492e-06, + "loss": 0.0008, + "step": 98270 + }, + { + "epoch": 1.6081158471733616, + "grad_norm": 0.02853105030953884, + "learning_rate": 1.1246386810467702e-06, + "loss": 0.0027, + "step": 98280 + }, + { + "epoch": 1.6082794731244374, + "grad_norm": 0.01624145917594433, + "learning_rate": 1.1237365845393767e-06, + "loss": 0.0006, + "step": 98290 + }, + { + "epoch": 1.6084430990755134, + "grad_norm": 0.01780993863940239, + "learning_rate": 1.122834804166838e-06, + "loss": 0.001, + "step": 98300 + }, + { + "epoch": 1.6086067250265892, + "grad_norm": 0.04460993781685829, + "learning_rate": 1.121933340002702e-06, + "loss": 0.001, + "step": 98310 + }, + { + "epoch": 1.608770350977665, + "grad_norm": 0.05030401423573494, + "learning_rate": 1.1210321921204881e-06, + "loss": 0.0009, + "step": 98320 + }, + { + "epoch": 1.608933976928741, + "grad_norm": 0.074622243642807, + "learning_rate": 1.120131360593692e-06, + "loss": 0.0006, + "step": 98330 + }, + { + "epoch": 1.6090976028798167, + "grad_norm": 0.041184570640325546, + "learning_rate": 1.11923084549578e-06, + "loss": 0.0014, + "step": 98340 + }, + { + "epoch": 1.6092612288308925, + "grad_norm": 0.0906820222735405, + "learning_rate": 1.1183306469001986e-06, + "loss": 0.0008, + "step": 98350 + }, + { + "epoch": 1.6094248547819685, + "grad_norm": 0.024200350046157837, + "learning_rate": 1.1174307648803617e-06, + "loss": 0.002, + "step": 98360 + }, + { + "epoch": 1.6095884807330443, + "grad_norm": 0.03529239445924759, + "learning_rate": 1.1165311995096633e-06, + "loss": 0.0009, + "step": 98370 + }, + { + "epoch": 1.60975210668412, + "grad_norm": 0.058973196893930435, + "learning_rate": 1.1156319508614666e-06, + "loss": 0.0006, + "step": 98380 + }, + { + "epoch": 1.609915732635196, + "grad_norm": 0.004053601995110512, + "learning_rate": 1.1147330190091127e-06, + "loss": 0.0009, + "step": 98390 + }, + { + "epoch": 1.6100793585862718, + "grad_norm": 0.06835632771253586, + "learning_rate": 1.1138344040259142e-06, + "loss": 0.0014, + "step": 98400 + }, + { + "epoch": 1.6102429845373476, + "grad_norm": 0.055411599576473236, + "learning_rate": 1.1129361059851611e-06, + "loss": 0.0014, + "step": 98410 + }, + { + "epoch": 1.6104066104884236, + "grad_norm": 0.10034821927547455, + "learning_rate": 1.1120381249601126e-06, + "loss": 0.0006, + "step": 98420 + }, + { + "epoch": 1.6105702364394991, + "grad_norm": 0.018842680379748344, + "learning_rate": 1.1111404610240085e-06, + "loss": 0.0008, + "step": 98430 + }, + { + "epoch": 1.6107338623905751, + "grad_norm": 0.09404448419809341, + "learning_rate": 1.1102431142500559e-06, + "loss": 0.0016, + "step": 98440 + }, + { + "epoch": 1.6108974883416511, + "grad_norm": 0.005391061305999756, + "learning_rate": 1.1093460847114418e-06, + "loss": 0.0005, + "step": 98450 + }, + { + "epoch": 1.6110611142927267, + "grad_norm": 0.03553809970617294, + "learning_rate": 1.1084493724813229e-06, + "loss": 0.0008, + "step": 98460 + }, + { + "epoch": 1.6112247402438027, + "grad_norm": 0.01727294921875, + "learning_rate": 1.1075529776328335e-06, + "loss": 0.0008, + "step": 98470 + }, + { + "epoch": 1.6113883661948785, + "grad_norm": 0.04744759574532509, + "learning_rate": 1.1066569002390798e-06, + "loss": 0.0013, + "step": 98480 + }, + { + "epoch": 1.6115519921459542, + "grad_norm": 0.09186459332704544, + "learning_rate": 1.1057611403731438e-06, + "loss": 0.0011, + "step": 98490 + }, + { + "epoch": 1.6117156180970302, + "grad_norm": 0.05243953689932823, + "learning_rate": 1.1048656981080785e-06, + "loss": 0.0009, + "step": 98500 + }, + { + "epoch": 1.611879244048106, + "grad_norm": 0.05669968202710152, + "learning_rate": 1.1039705735169159e-06, + "loss": 0.0018, + "step": 98510 + }, + { + "epoch": 1.6120428699991818, + "grad_norm": 0.17999234795570374, + "learning_rate": 1.1030757666726567e-06, + "loss": 0.0009, + "step": 98520 + }, + { + "epoch": 1.6122064959502578, + "grad_norm": 0.05805385485291481, + "learning_rate": 1.1021812776482804e-06, + "loss": 0.0008, + "step": 98530 + }, + { + "epoch": 1.6123701219013336, + "grad_norm": 0.0329163633286953, + "learning_rate": 1.1012871065167362e-06, + "loss": 0.001, + "step": 98540 + }, + { + "epoch": 1.6125337478524093, + "grad_norm": 0.011239580810070038, + "learning_rate": 1.1003932533509531e-06, + "loss": 0.0004, + "step": 98550 + }, + { + "epoch": 1.6126973738034853, + "grad_norm": 0.039462849497795105, + "learning_rate": 1.0994997182238266e-06, + "loss": 0.0008, + "step": 98560 + }, + { + "epoch": 1.612860999754561, + "grad_norm": 0.08043213188648224, + "learning_rate": 1.0986065012082343e-06, + "loss": 0.0011, + "step": 98570 + }, + { + "epoch": 1.6130246257056369, + "grad_norm": 0.03515748679637909, + "learning_rate": 1.0977136023770202e-06, + "loss": 0.0007, + "step": 98580 + }, + { + "epoch": 1.6131882516567129, + "grad_norm": 0.024439388886094093, + "learning_rate": 1.0968210218030096e-06, + "loss": 0.0006, + "step": 98590 + }, + { + "epoch": 1.6133518776077886, + "grad_norm": 0.05537058785557747, + "learning_rate": 1.0959287595589968e-06, + "loss": 0.0009, + "step": 98600 + }, + { + "epoch": 1.6135155035588644, + "grad_norm": 0.017928028479218483, + "learning_rate": 1.0950368157177498e-06, + "loss": 0.0009, + "step": 98610 + }, + { + "epoch": 1.6136791295099404, + "grad_norm": 0.04345674812793732, + "learning_rate": 1.0941451903520162e-06, + "loss": 0.0011, + "step": 98620 + }, + { + "epoch": 1.613842755461016, + "grad_norm": 0.01877068541944027, + "learning_rate": 1.0932538835345103e-06, + "loss": 0.0008, + "step": 98630 + }, + { + "epoch": 1.614006381412092, + "grad_norm": 0.04336192458868027, + "learning_rate": 1.0923628953379273e-06, + "loss": 0.0008, + "step": 98640 + }, + { + "epoch": 1.614170007363168, + "grad_norm": 0.0044768378138542175, + "learning_rate": 1.0914722258349303e-06, + "loss": 0.0013, + "step": 98650 + }, + { + "epoch": 1.6143336333142435, + "grad_norm": 0.03292006626725197, + "learning_rate": 1.0905818750981618e-06, + "loss": 0.0012, + "step": 98660 + }, + { + "epoch": 1.6144972592653195, + "grad_norm": 0.11813094466924667, + "learning_rate": 1.0896918432002334e-06, + "loss": 0.001, + "step": 98670 + }, + { + "epoch": 1.6146608852163953, + "grad_norm": 0.06800729781389236, + "learning_rate": 1.0888021302137352e-06, + "loss": 0.001, + "step": 98680 + }, + { + "epoch": 1.614824511167471, + "grad_norm": 0.009437814354896545, + "learning_rate": 1.0879127362112274e-06, + "loss": 0.0009, + "step": 98690 + }, + { + "epoch": 1.614988137118547, + "grad_norm": 0.0062317312695086, + "learning_rate": 1.0870236612652474e-06, + "loss": 0.0019, + "step": 98700 + }, + { + "epoch": 1.6151517630696228, + "grad_norm": 0.011862434446811676, + "learning_rate": 1.086134905448304e-06, + "loss": 0.0007, + "step": 98710 + }, + { + "epoch": 1.6153153890206986, + "grad_norm": 0.05355127155780792, + "learning_rate": 1.0852464688328824e-06, + "loss": 0.0018, + "step": 98720 + }, + { + "epoch": 1.6154790149717746, + "grad_norm": 0.022363979369401932, + "learning_rate": 1.0843583514914385e-06, + "loss": 0.0008, + "step": 98730 + }, + { + "epoch": 1.6156426409228504, + "grad_norm": 0.23203960061073303, + "learning_rate": 1.0834705534964063e-06, + "loss": 0.0017, + "step": 98740 + }, + { + "epoch": 1.6158062668739261, + "grad_norm": 0.05894729495048523, + "learning_rate": 1.0825830749201893e-06, + "loss": 0.0011, + "step": 98750 + }, + { + "epoch": 1.6159698928250021, + "grad_norm": 0.08380956947803497, + "learning_rate": 1.0816959158351703e-06, + "loss": 0.0012, + "step": 98760 + }, + { + "epoch": 1.616133518776078, + "grad_norm": 0.011260194703936577, + "learning_rate": 1.0808090763136997e-06, + "loss": 0.0007, + "step": 98770 + }, + { + "epoch": 1.6162971447271537, + "grad_norm": 0.029289012774825096, + "learning_rate": 1.079922556428108e-06, + "loss": 0.0006, + "step": 98780 + }, + { + "epoch": 1.6164607706782297, + "grad_norm": 0.03287067636847496, + "learning_rate": 1.0790363562506933e-06, + "loss": 0.0008, + "step": 98790 + }, + { + "epoch": 1.6166243966293055, + "grad_norm": 0.057073742151260376, + "learning_rate": 1.0781504758537352e-06, + "loss": 0.0011, + "step": 98800 + }, + { + "epoch": 1.6167880225803812, + "grad_norm": 0.003922164905816317, + "learning_rate": 1.0772649153094793e-06, + "loss": 0.0004, + "step": 98810 + }, + { + "epoch": 1.6169516485314572, + "grad_norm": 0.04353697970509529, + "learning_rate": 1.0763796746901522e-06, + "loss": 0.0013, + "step": 98820 + }, + { + "epoch": 1.6171152744825328, + "grad_norm": 0.10883992910385132, + "learning_rate": 1.0754947540679483e-06, + "loss": 0.0008, + "step": 98830 + }, + { + "epoch": 1.6172789004336088, + "grad_norm": 0.08834085613489151, + "learning_rate": 1.074610153515041e-06, + "loss": 0.0008, + "step": 98840 + }, + { + "epoch": 1.6174425263846848, + "grad_norm": 0.0750710740685463, + "learning_rate": 1.0737258731035728e-06, + "loss": 0.0009, + "step": 98850 + }, + { + "epoch": 1.6176061523357603, + "grad_norm": 0.036313727498054504, + "learning_rate": 1.0728419129056655e-06, + "loss": 0.0014, + "step": 98860 + }, + { + "epoch": 1.6177697782868363, + "grad_norm": 0.02966841496527195, + "learning_rate": 1.0719582729934092e-06, + "loss": 0.002, + "step": 98870 + }, + { + "epoch": 1.617933404237912, + "grad_norm": 0.0079027796164155, + "learning_rate": 1.071074953438873e-06, + "loss": 0.0005, + "step": 98880 + }, + { + "epoch": 1.6180970301889879, + "grad_norm": 0.0825885757803917, + "learning_rate": 1.0701919543140955e-06, + "loss": 0.0051, + "step": 98890 + }, + { + "epoch": 1.6182606561400639, + "grad_norm": 0.03835207596421242, + "learning_rate": 1.0693092756910922e-06, + "loss": 0.001, + "step": 98900 + }, + { + "epoch": 1.6184242820911396, + "grad_norm": 0.03957780823111534, + "learning_rate": 1.0684269176418504e-06, + "loss": 0.0009, + "step": 98910 + }, + { + "epoch": 1.6185879080422154, + "grad_norm": 0.08176249265670776, + "learning_rate": 1.0675448802383336e-06, + "loss": 0.0012, + "step": 98920 + }, + { + "epoch": 1.6187515339932914, + "grad_norm": 0.0636419877409935, + "learning_rate": 1.0666631635524776e-06, + "loss": 0.0012, + "step": 98930 + }, + { + "epoch": 1.6189151599443672, + "grad_norm": 0.027811024338006973, + "learning_rate": 1.06578176765619e-06, + "loss": 0.0005, + "step": 98940 + }, + { + "epoch": 1.619078785895443, + "grad_norm": 0.0021214059088379145, + "learning_rate": 1.0649006926213573e-06, + "loss": 0.0007, + "step": 98950 + }, + { + "epoch": 1.619242411846519, + "grad_norm": 0.01910286210477352, + "learning_rate": 1.0640199385198347e-06, + "loss": 0.001, + "step": 98960 + }, + { + "epoch": 1.6194060377975947, + "grad_norm": 0.10791837424039841, + "learning_rate": 1.0631395054234556e-06, + "loss": 0.0009, + "step": 98970 + }, + { + "epoch": 1.6195696637486705, + "grad_norm": 0.04680231958627701, + "learning_rate": 1.0622593934040226e-06, + "loss": 0.0008, + "step": 98980 + }, + { + "epoch": 1.6197332896997465, + "grad_norm": 0.11000838130712509, + "learning_rate": 1.0613796025333178e-06, + "loss": 0.0008, + "step": 98990 + }, + { + "epoch": 1.619896915650822, + "grad_norm": 0.07441253960132599, + "learning_rate": 1.060500132883091e-06, + "loss": 0.0018, + "step": 99000 + }, + { + "epoch": 1.620060541601898, + "grad_norm": 0.05682549998164177, + "learning_rate": 1.059620984525071e-06, + "loss": 0.0004, + "step": 99010 + }, + { + "epoch": 1.620224167552974, + "grad_norm": 0.05946957319974899, + "learning_rate": 1.0587421575309559e-06, + "loss": 0.002, + "step": 99020 + }, + { + "epoch": 1.6203877935040496, + "grad_norm": 0.09401339292526245, + "learning_rate": 1.0578636519724229e-06, + "loss": 0.0013, + "step": 99030 + }, + { + "epoch": 1.6205514194551256, + "grad_norm": 0.016316894441843033, + "learning_rate": 1.056985467921116e-06, + "loss": 0.0009, + "step": 99040 + }, + { + "epoch": 1.6207150454062014, + "grad_norm": 0.03270907327532768, + "learning_rate": 1.0561076054486614e-06, + "loss": 0.0014, + "step": 99050 + }, + { + "epoch": 1.6208786713572771, + "grad_norm": 0.11566010862588882, + "learning_rate": 1.05523006462665e-06, + "loss": 0.0017, + "step": 99060 + }, + { + "epoch": 1.6210422973083531, + "grad_norm": 0.01913970150053501, + "learning_rate": 1.054352845526655e-06, + "loss": 0.0005, + "step": 99070 + }, + { + "epoch": 1.621205923259429, + "grad_norm": 0.06576588749885559, + "learning_rate": 1.0534759482202162e-06, + "loss": 0.0009, + "step": 99080 + }, + { + "epoch": 1.6213695492105047, + "grad_norm": 0.0531679131090641, + "learning_rate": 1.0525993727788535e-06, + "loss": 0.001, + "step": 99090 + }, + { + "epoch": 1.6215331751615807, + "grad_norm": 0.026732036843895912, + "learning_rate": 1.051723119274054e-06, + "loss": 0.0009, + "step": 99100 + }, + { + "epoch": 1.6216968011126565, + "grad_norm": 0.04521513730287552, + "learning_rate": 1.0508471877772852e-06, + "loss": 0.001, + "step": 99110 + }, + { + "epoch": 1.6218604270637322, + "grad_norm": 0.09972858428955078, + "learning_rate": 1.0499715783599824e-06, + "loss": 0.0012, + "step": 99120 + }, + { + "epoch": 1.6220240530148082, + "grad_norm": 0.041070688515901566, + "learning_rate": 1.04909629109356e-06, + "loss": 0.0008, + "step": 99130 + }, + { + "epoch": 1.622187678965884, + "grad_norm": 0.06659958511590958, + "learning_rate": 1.0482213260494e-06, + "loss": 0.001, + "step": 99140 + }, + { + "epoch": 1.6223513049169598, + "grad_norm": 0.0031932243146002293, + "learning_rate": 1.047346683298865e-06, + "loss": 0.0005, + "step": 99150 + }, + { + "epoch": 1.6225149308680358, + "grad_norm": 0.01182716153562069, + "learning_rate": 1.046472362913285e-06, + "loss": 0.0008, + "step": 99160 + }, + { + "epoch": 1.6226785568191116, + "grad_norm": 0.12931151688098907, + "learning_rate": 1.04559836496397e-06, + "loss": 0.0009, + "step": 99170 + }, + { + "epoch": 1.6228421827701873, + "grad_norm": 0.1108604148030281, + "learning_rate": 1.0447246895221963e-06, + "loss": 0.0013, + "step": 99180 + }, + { + "epoch": 1.6230058087212633, + "grad_norm": 0.0400497131049633, + "learning_rate": 1.0438513366592218e-06, + "loss": 0.0011, + "step": 99190 + }, + { + "epoch": 1.6231694346723389, + "grad_norm": 0.0018103771144524217, + "learning_rate": 1.0429783064462706e-06, + "loss": 0.0007, + "step": 99200 + }, + { + "epoch": 1.6233330606234149, + "grad_norm": 0.007359854876995087, + "learning_rate": 1.0421055989545476e-06, + "loss": 0.0007, + "step": 99210 + }, + { + "epoch": 1.6234966865744909, + "grad_norm": 0.06547354906797409, + "learning_rate": 1.041233214255224e-06, + "loss": 0.0009, + "step": 99220 + }, + { + "epoch": 1.6236603125255664, + "grad_norm": 0.027127254754304886, + "learning_rate": 1.0403611524194523e-06, + "loss": 0.001, + "step": 99230 + }, + { + "epoch": 1.6238239384766424, + "grad_norm": 0.0034067730884999037, + "learning_rate": 1.039489413518353e-06, + "loss": 0.0004, + "step": 99240 + }, + { + "epoch": 1.6239875644277182, + "grad_norm": 0.01794705167412758, + "learning_rate": 1.038617997623021e-06, + "loss": 0.0006, + "step": 99250 + }, + { + "epoch": 1.624151190378794, + "grad_norm": 0.021203018724918365, + "learning_rate": 1.0377469048045285e-06, + "loss": 0.0014, + "step": 99260 + }, + { + "epoch": 1.62431481632987, + "grad_norm": 0.0713459774851799, + "learning_rate": 1.0368761351339169e-06, + "loss": 0.0008, + "step": 99270 + }, + { + "epoch": 1.6244784422809457, + "grad_norm": 0.07734189927577972, + "learning_rate": 1.036005688682205e-06, + "loss": 0.0018, + "step": 99280 + }, + { + "epoch": 1.6246420682320215, + "grad_norm": 0.016635021194815636, + "learning_rate": 1.035135565520381e-06, + "loss": 0.0008, + "step": 99290 + }, + { + "epoch": 1.6248056941830975, + "grad_norm": 0.030284764245152473, + "learning_rate": 1.0342657657194122e-06, + "loss": 0.0004, + "step": 99300 + }, + { + "epoch": 1.6249693201341733, + "grad_norm": 0.03503705561161041, + "learning_rate": 1.0333962893502336e-06, + "loss": 0.0008, + "step": 99310 + }, + { + "epoch": 1.625132946085249, + "grad_norm": 0.050276078283786774, + "learning_rate": 1.0325271364837592e-06, + "loss": 0.0008, + "step": 99320 + }, + { + "epoch": 1.625296572036325, + "grad_norm": 0.0807928740978241, + "learning_rate": 1.031658307190872e-06, + "loss": 0.0009, + "step": 99330 + }, + { + "epoch": 1.6254601979874008, + "grad_norm": 0.03641953691840172, + "learning_rate": 1.0307898015424328e-06, + "loss": 0.0004, + "step": 99340 + }, + { + "epoch": 1.6256238239384766, + "grad_norm": 0.2234962284564972, + "learning_rate": 1.0299216196092721e-06, + "loss": 0.0015, + "step": 99350 + }, + { + "epoch": 1.6257874498895526, + "grad_norm": 0.04085862636566162, + "learning_rate": 1.0290537614621976e-06, + "loss": 0.0008, + "step": 99360 + }, + { + "epoch": 1.6259510758406284, + "grad_norm": 0.007594408467411995, + "learning_rate": 1.0281862271719867e-06, + "loss": 0.0008, + "step": 99370 + }, + { + "epoch": 1.6261147017917041, + "grad_norm": 0.0083228824660182, + "learning_rate": 1.0273190168093955e-06, + "loss": 0.0006, + "step": 99380 + }, + { + "epoch": 1.6262783277427801, + "grad_norm": 0.1275256872177124, + "learning_rate": 1.0264521304451474e-06, + "loss": 0.0006, + "step": 99390 + }, + { + "epoch": 1.6264419536938557, + "grad_norm": 0.005033647175878286, + "learning_rate": 1.025585568149946e-06, + "loss": 0.0014, + "step": 99400 + }, + { + "epoch": 1.6266055796449317, + "grad_norm": 0.08270313590765, + "learning_rate": 1.0247193299944618e-06, + "loss": 0.0007, + "step": 99410 + }, + { + "epoch": 1.6267692055960077, + "grad_norm": 0.030194906517863274, + "learning_rate": 1.0238534160493457e-06, + "loss": 0.0016, + "step": 99420 + }, + { + "epoch": 1.6269328315470832, + "grad_norm": 0.1548902839422226, + "learning_rate": 1.0229878263852156e-06, + "loss": 0.0011, + "step": 99430 + }, + { + "epoch": 1.6270964574981592, + "grad_norm": 0.09061524271965027, + "learning_rate": 1.0221225610726688e-06, + "loss": 0.0009, + "step": 99440 + }, + { + "epoch": 1.627260083449235, + "grad_norm": 0.08053354918956757, + "learning_rate": 1.02125762018227e-06, + "loss": 0.0019, + "step": 99450 + }, + { + "epoch": 1.6274237094003108, + "grad_norm": 0.020897533744573593, + "learning_rate": 1.020393003784565e-06, + "loss": 0.0012, + "step": 99460 + }, + { + "epoch": 1.6275873353513868, + "grad_norm": 0.017086738720536232, + "learning_rate": 1.0195287119500652e-06, + "loss": 0.0007, + "step": 99470 + }, + { + "epoch": 1.6277509613024626, + "grad_norm": 0.05146488547325134, + "learning_rate": 1.018664744749262e-06, + "loss": 0.0011, + "step": 99480 + }, + { + "epoch": 1.6279145872535383, + "grad_norm": 0.03859664499759674, + "learning_rate": 1.0178011022526157e-06, + "loss": 0.0009, + "step": 99490 + }, + { + "epoch": 1.6280782132046143, + "grad_norm": 0.07819285988807678, + "learning_rate": 1.0169377845305639e-06, + "loss": 0.001, + "step": 99500 + }, + { + "epoch": 1.62824183915569, + "grad_norm": 0.03664161264896393, + "learning_rate": 1.0160747916535135e-06, + "loss": 0.0012, + "step": 99510 + }, + { + "epoch": 1.6284054651067659, + "grad_norm": 0.10607807338237762, + "learning_rate": 1.01521212369185e-06, + "loss": 0.001, + "step": 99520 + }, + { + "epoch": 1.6285690910578419, + "grad_norm": 0.014757922850549221, + "learning_rate": 1.014349780715927e-06, + "loss": 0.0006, + "step": 99530 + }, + { + "epoch": 1.6287327170089176, + "grad_norm": 0.058034446090459824, + "learning_rate": 1.0134877627960765e-06, + "loss": 0.0025, + "step": 99540 + }, + { + "epoch": 1.6288963429599934, + "grad_norm": 0.038525357842445374, + "learning_rate": 1.0126260700025997e-06, + "loss": 0.0012, + "step": 99550 + }, + { + "epoch": 1.6290599689110694, + "grad_norm": 0.0739142894744873, + "learning_rate": 1.0117647024057759e-06, + "loss": 0.0009, + "step": 99560 + }, + { + "epoch": 1.6292235948621452, + "grad_norm": 0.006620502565056086, + "learning_rate": 1.0109036600758538e-06, + "loss": 0.0006, + "step": 99570 + }, + { + "epoch": 1.629387220813221, + "grad_norm": 0.005684231407940388, + "learning_rate": 1.0100429430830561e-06, + "loss": 0.0004, + "step": 99580 + }, + { + "epoch": 1.629550846764297, + "grad_norm": 0.05469898506999016, + "learning_rate": 1.0091825514975818e-06, + "loss": 0.0009, + "step": 99590 + }, + { + "epoch": 1.6297144727153725, + "grad_norm": 0.01911795139312744, + "learning_rate": 1.008322485389599e-06, + "loss": 0.0012, + "step": 99600 + }, + { + "epoch": 1.6298780986664485, + "grad_norm": 0.05313611030578613, + "learning_rate": 1.0074627448292557e-06, + "loss": 0.0013, + "step": 99610 + }, + { + "epoch": 1.6300417246175245, + "grad_norm": 0.03985777497291565, + "learning_rate": 1.0066033298866651e-06, + "loss": 0.001, + "step": 99620 + }, + { + "epoch": 1.6302053505686, + "grad_norm": 0.006057702004909515, + "learning_rate": 1.0057442406319219e-06, + "loss": 0.0009, + "step": 99630 + }, + { + "epoch": 1.630368976519676, + "grad_norm": 0.06132945418357849, + "learning_rate": 1.004885477135087e-06, + "loss": 0.0015, + "step": 99640 + }, + { + "epoch": 1.6305326024707518, + "grad_norm": 0.18481603264808655, + "learning_rate": 1.0040270394662016e-06, + "loss": 0.0009, + "step": 99650 + }, + { + "epoch": 1.6306962284218276, + "grad_norm": 0.16483499109745026, + "learning_rate": 1.0031689276952743e-06, + "loss": 0.0033, + "step": 99660 + }, + { + "epoch": 1.6308598543729036, + "grad_norm": 0.055334024131298065, + "learning_rate": 1.002311141892292e-06, + "loss": 0.0008, + "step": 99670 + }, + { + "epoch": 1.6310234803239794, + "grad_norm": 0.03972171992063522, + "learning_rate": 1.0014536821272103e-06, + "loss": 0.0027, + "step": 99680 + }, + { + "epoch": 1.6311871062750551, + "grad_norm": 0.01582186669111252, + "learning_rate": 1.0005965484699637e-06, + "loss": 0.0009, + "step": 99690 + }, + { + "epoch": 1.6313507322261311, + "grad_norm": 0.047537438571453094, + "learning_rate": 9.997397409904535e-07, + "loss": 0.0004, + "step": 99700 + }, + { + "epoch": 1.631514358177207, + "grad_norm": 0.02975868247449398, + "learning_rate": 9.988832597585619e-07, + "loss": 0.001, + "step": 99710 + }, + { + "epoch": 1.6316779841282827, + "grad_norm": 0.03457397222518921, + "learning_rate": 9.98027104844137e-07, + "loss": 0.0008, + "step": 99720 + }, + { + "epoch": 1.6318416100793587, + "grad_norm": 0.036272987723350525, + "learning_rate": 9.971712763170071e-07, + "loss": 0.001, + "step": 99730 + }, + { + "epoch": 1.6320052360304345, + "grad_norm": 0.07712455838918686, + "learning_rate": 9.963157742469676e-07, + "loss": 0.0016, + "step": 99740 + }, + { + "epoch": 1.6321688619815102, + "grad_norm": 0.0029385199304670095, + "learning_rate": 9.95460598703793e-07, + "loss": 0.0011, + "step": 99750 + }, + { + "epoch": 1.6323324879325862, + "grad_norm": 0.049355171620845795, + "learning_rate": 9.946057497572265e-07, + "loss": 0.0015, + "step": 99760 + }, + { + "epoch": 1.632496113883662, + "grad_norm": 0.02615751326084137, + "learning_rate": 9.937512274769883e-07, + "loss": 0.0015, + "step": 99770 + }, + { + "epoch": 1.6326597398347378, + "grad_norm": 0.04165137931704521, + "learning_rate": 9.928970319327685e-07, + "loss": 0.0006, + "step": 99780 + }, + { + "epoch": 1.6328233657858138, + "grad_norm": 0.07442167401313782, + "learning_rate": 9.92043163194235e-07, + "loss": 0.0006, + "step": 99790 + }, + { + "epoch": 1.6329869917368893, + "grad_norm": 0.03294137120246887, + "learning_rate": 9.911896213310235e-07, + "loss": 0.0011, + "step": 99800 + }, + { + "epoch": 1.6331506176879653, + "grad_norm": 0.021243317052721977, + "learning_rate": 9.90336406412748e-07, + "loss": 0.0013, + "step": 99810 + }, + { + "epoch": 1.633314243639041, + "grad_norm": 0.027108022943139076, + "learning_rate": 9.894835185089924e-07, + "loss": 0.0007, + "step": 99820 + }, + { + "epoch": 1.6334778695901169, + "grad_norm": 0.14141453802585602, + "learning_rate": 9.886309576893172e-07, + "loss": 0.0012, + "step": 99830 + }, + { + "epoch": 1.6336414955411929, + "grad_norm": 0.04646940529346466, + "learning_rate": 9.877787240232523e-07, + "loss": 0.0013, + "step": 99840 + }, + { + "epoch": 1.6338051214922686, + "grad_norm": 0.03067663684487343, + "learning_rate": 9.869268175803048e-07, + "loss": 0.0008, + "step": 99850 + }, + { + "epoch": 1.6339687474433444, + "grad_norm": 0.05898122861981392, + "learning_rate": 9.86075238429951e-07, + "loss": 0.0011, + "step": 99860 + }, + { + "epoch": 1.6341323733944204, + "grad_norm": 0.0041008577682077885, + "learning_rate": 9.852239866416458e-07, + "loss": 0.0009, + "step": 99870 + }, + { + "epoch": 1.6342959993454962, + "grad_norm": 0.049132224172353745, + "learning_rate": 9.843730622848114e-07, + "loss": 0.001, + "step": 99880 + }, + { + "epoch": 1.634459625296572, + "grad_norm": 0.02173960953950882, + "learning_rate": 9.835224654288494e-07, + "loss": 0.0005, + "step": 99890 + }, + { + "epoch": 1.634623251247648, + "grad_norm": 0.07617775350809097, + "learning_rate": 9.82672196143129e-07, + "loss": 0.0011, + "step": 99900 + }, + { + "epoch": 1.6347868771987237, + "grad_norm": 0.04964715614914894, + "learning_rate": 9.818222544969959e-07, + "loss": 0.001, + "step": 99910 + }, + { + "epoch": 1.6349505031497995, + "grad_norm": 0.1036185622215271, + "learning_rate": 9.809726405597692e-07, + "loss": 0.001, + "step": 99920 + }, + { + "epoch": 1.6351141291008755, + "grad_norm": 0.08874023705720901, + "learning_rate": 9.801233544007389e-07, + "loss": 0.001, + "step": 99930 + }, + { + "epoch": 1.6352777550519513, + "grad_norm": 0.020390445366501808, + "learning_rate": 9.792743960891726e-07, + "loss": 0.0008, + "step": 99940 + }, + { + "epoch": 1.635441381003027, + "grad_norm": 0.28065553307533264, + "learning_rate": 9.784257656943053e-07, + "loss": 0.0023, + "step": 99950 + }, + { + "epoch": 1.635605006954103, + "grad_norm": 0.0179117601364851, + "learning_rate": 9.775774632853513e-07, + "loss": 0.0004, + "step": 99960 + }, + { + "epoch": 1.6357686329051786, + "grad_norm": 0.017978258430957794, + "learning_rate": 9.767294889314926e-07, + "loss": 0.0026, + "step": 99970 + }, + { + "epoch": 1.6359322588562546, + "grad_norm": 0.08978407084941864, + "learning_rate": 9.758818427018897e-07, + "loss": 0.0011, + "step": 99980 + }, + { + "epoch": 1.6360958848073306, + "grad_norm": 0.04969654604792595, + "learning_rate": 9.750345246656712e-07, + "loss": 0.0011, + "step": 99990 + }, + { + "epoch": 1.6362595107584061, + "grad_norm": 0.030709777027368546, + "learning_rate": 9.741875348919443e-07, + "loss": 0.0004, + "step": 100000 + }, + { + "epoch": 1.6364231367094821, + "grad_norm": 0.011034307070076466, + "learning_rate": 9.733408734497834e-07, + "loss": 0.0008, + "step": 100010 + }, + { + "epoch": 1.636586762660558, + "grad_norm": 0.06286372244358063, + "learning_rate": 9.724945404082425e-07, + "loss": 0.0006, + "step": 100020 + }, + { + "epoch": 1.6367503886116337, + "grad_norm": 0.028471173718571663, + "learning_rate": 9.71648535836343e-07, + "loss": 0.0008, + "step": 100030 + }, + { + "epoch": 1.6369140145627097, + "grad_norm": 0.0518898069858551, + "learning_rate": 9.708028598030844e-07, + "loss": 0.0007, + "step": 100040 + }, + { + "epoch": 1.6370776405137855, + "grad_norm": 0.12626807391643524, + "learning_rate": 9.699575123774346e-07, + "loss": 0.0011, + "step": 100050 + }, + { + "epoch": 1.6372412664648612, + "grad_norm": 0.05185278505086899, + "learning_rate": 9.691124936283403e-07, + "loss": 0.0012, + "step": 100060 + }, + { + "epoch": 1.6374048924159372, + "grad_norm": 0.0829366147518158, + "learning_rate": 9.68267803624715e-07, + "loss": 0.0011, + "step": 100070 + }, + { + "epoch": 1.637568518367013, + "grad_norm": 0.06289747357368469, + "learning_rate": 9.674234424354522e-07, + "loss": 0.0008, + "step": 100080 + }, + { + "epoch": 1.6377321443180888, + "grad_norm": 0.07616087794303894, + "learning_rate": 9.665794101294119e-07, + "loss": 0.0007, + "step": 100090 + }, + { + "epoch": 1.6378957702691648, + "grad_norm": 0.014592207036912441, + "learning_rate": 9.657357067754335e-07, + "loss": 0.001, + "step": 100100 + }, + { + "epoch": 1.6380593962202405, + "grad_norm": 0.053025439381599426, + "learning_rate": 9.648923324423238e-07, + "loss": 0.0017, + "step": 100110 + }, + { + "epoch": 1.6382230221713163, + "grad_norm": 0.04782039672136307, + "learning_rate": 9.640492871988683e-07, + "loss": 0.0023, + "step": 100120 + }, + { + "epoch": 1.6383866481223923, + "grad_norm": 0.012251359410583973, + "learning_rate": 9.632065711138194e-07, + "loss": 0.0012, + "step": 100130 + }, + { + "epoch": 1.638550274073468, + "grad_norm": 0.013355978764593601, + "learning_rate": 9.6236418425591e-07, + "loss": 0.0007, + "step": 100140 + }, + { + "epoch": 1.6387139000245439, + "grad_norm": 0.24423478543758392, + "learning_rate": 9.615221266938397e-07, + "loss": 0.0022, + "step": 100150 + }, + { + "epoch": 1.6388775259756199, + "grad_norm": 0.016653427854180336, + "learning_rate": 9.606803984962853e-07, + "loss": 0.0007, + "step": 100160 + }, + { + "epoch": 1.6390411519266954, + "grad_norm": 0.03255431726574898, + "learning_rate": 9.598389997318934e-07, + "loss": 0.001, + "step": 100170 + }, + { + "epoch": 1.6392047778777714, + "grad_norm": 0.03150234743952751, + "learning_rate": 9.589979304692882e-07, + "loss": 0.0006, + "step": 100180 + }, + { + "epoch": 1.6393684038288474, + "grad_norm": 0.006022465415298939, + "learning_rate": 9.58157190777062e-07, + "loss": 0.0014, + "step": 100190 + }, + { + "epoch": 1.639532029779923, + "grad_norm": 0.04250042513012886, + "learning_rate": 9.573167807237848e-07, + "loss": 0.0014, + "step": 100200 + }, + { + "epoch": 1.639695655730999, + "grad_norm": 0.012929433025419712, + "learning_rate": 9.564767003779974e-07, + "loss": 0.0006, + "step": 100210 + }, + { + "epoch": 1.6398592816820747, + "grad_norm": 0.08184831589460373, + "learning_rate": 9.556369498082108e-07, + "loss": 0.001, + "step": 100220 + }, + { + "epoch": 1.6400229076331505, + "grad_norm": 0.058843448758125305, + "learning_rate": 9.547975290829158e-07, + "loss": 0.001, + "step": 100230 + }, + { + "epoch": 1.6401865335842265, + "grad_norm": 0.0030664834193885326, + "learning_rate": 9.539584382705708e-07, + "loss": 0.0009, + "step": 100240 + }, + { + "epoch": 1.6403501595353023, + "grad_norm": 0.050919629633426666, + "learning_rate": 9.531196774396107e-07, + "loss": 0.001, + "step": 100250 + }, + { + "epoch": 1.640513785486378, + "grad_norm": 0.007630038075149059, + "learning_rate": 9.522812466584392e-07, + "loss": 0.0008, + "step": 100260 + }, + { + "epoch": 1.640677411437454, + "grad_norm": 0.0943123921751976, + "learning_rate": 9.514431459954398e-07, + "loss": 0.0011, + "step": 100270 + }, + { + "epoch": 1.6408410373885298, + "grad_norm": 0.03166643902659416, + "learning_rate": 9.50605375518961e-07, + "loss": 0.0011, + "step": 100280 + }, + { + "epoch": 1.6410046633396056, + "grad_norm": 0.021564824506640434, + "learning_rate": 9.497679352973321e-07, + "loss": 0.0007, + "step": 100290 + }, + { + "epoch": 1.6411682892906816, + "grad_norm": 0.10713506489992142, + "learning_rate": 9.489308253988494e-07, + "loss": 0.0016, + "step": 100300 + }, + { + "epoch": 1.6413319152417574, + "grad_norm": 0.04868478327989578, + "learning_rate": 9.480940458917864e-07, + "loss": 0.0013, + "step": 100310 + }, + { + "epoch": 1.6414955411928331, + "grad_norm": 0.010210786014795303, + "learning_rate": 9.472575968443864e-07, + "loss": 0.0009, + "step": 100320 + }, + { + "epoch": 1.6416591671439091, + "grad_norm": 0.023481430485844612, + "learning_rate": 9.464214783248693e-07, + "loss": 0.0006, + "step": 100330 + }, + { + "epoch": 1.641822793094985, + "grad_norm": 0.028715720400214195, + "learning_rate": 9.455856904014238e-07, + "loss": 0.0011, + "step": 100340 + }, + { + "epoch": 1.6419864190460607, + "grad_norm": 0.09974674135446548, + "learning_rate": 9.447502331422159e-07, + "loss": 0.0018, + "step": 100350 + }, + { + "epoch": 1.6421500449971367, + "grad_norm": 0.09422918409109116, + "learning_rate": 9.439151066153812e-07, + "loss": 0.0015, + "step": 100360 + }, + { + "epoch": 1.6423136709482122, + "grad_norm": 0.019053339958190918, + "learning_rate": 9.430803108890313e-07, + "loss": 0.0007, + "step": 100370 + }, + { + "epoch": 1.6424772968992882, + "grad_norm": 0.04648305103182793, + "learning_rate": 9.422458460312473e-07, + "loss": 0.001, + "step": 100380 + }, + { + "epoch": 1.6426409228503642, + "grad_norm": 0.01793096214532852, + "learning_rate": 9.414117121100874e-07, + "loss": 0.0005, + "step": 100390 + }, + { + "epoch": 1.6428045488014398, + "grad_norm": 0.037642136216163635, + "learning_rate": 9.405779091935791e-07, + "loss": 0.0012, + "step": 100400 + }, + { + "epoch": 1.6429681747525158, + "grad_norm": 0.005648561753332615, + "learning_rate": 9.397444373497255e-07, + "loss": 0.0015, + "step": 100410 + }, + { + "epoch": 1.6431318007035915, + "grad_norm": 0.0291750431060791, + "learning_rate": 9.389112966465008e-07, + "loss": 0.0013, + "step": 100420 + }, + { + "epoch": 1.6432954266546673, + "grad_norm": 0.11103682219982147, + "learning_rate": 9.38078487151855e-07, + "loss": 0.0013, + "step": 100430 + }, + { + "epoch": 1.6434590526057433, + "grad_norm": 0.012171434238553047, + "learning_rate": 9.372460089337066e-07, + "loss": 0.0014, + "step": 100440 + }, + { + "epoch": 1.643622678556819, + "grad_norm": 0.024769367650151253, + "learning_rate": 9.364138620599522e-07, + "loss": 0.0017, + "step": 100450 + }, + { + "epoch": 1.6437863045078949, + "grad_norm": 0.04039948433637619, + "learning_rate": 9.355820465984566e-07, + "loss": 0.0006, + "step": 100460 + }, + { + "epoch": 1.6439499304589709, + "grad_norm": 0.06217660754919052, + "learning_rate": 9.347505626170622e-07, + "loss": 0.0011, + "step": 100470 + }, + { + "epoch": 1.6441135564100466, + "grad_norm": 0.044760867953300476, + "learning_rate": 9.339194101835791e-07, + "loss": 0.0012, + "step": 100480 + }, + { + "epoch": 1.6442771823611224, + "grad_norm": 0.02198943682014942, + "learning_rate": 9.330885893657965e-07, + "loss": 0.0008, + "step": 100490 + }, + { + "epoch": 1.6444408083121984, + "grad_norm": 0.03514161705970764, + "learning_rate": 9.322581002314701e-07, + "loss": 0.0012, + "step": 100500 + }, + { + "epoch": 1.6446044342632742, + "grad_norm": 0.08054398745298386, + "learning_rate": 9.314279428483353e-07, + "loss": 0.001, + "step": 100510 + }, + { + "epoch": 1.64476806021435, + "grad_norm": 0.01564517803490162, + "learning_rate": 9.305981172840933e-07, + "loss": 0.0006, + "step": 100520 + }, + { + "epoch": 1.644931686165426, + "grad_norm": 0.08523036539554596, + "learning_rate": 9.297686236064246e-07, + "loss": 0.001, + "step": 100530 + }, + { + "epoch": 1.6450953121165017, + "grad_norm": 0.08250916749238968, + "learning_rate": 9.289394618829794e-07, + "loss": 0.0012, + "step": 100540 + }, + { + "epoch": 1.6452589380675775, + "grad_norm": 0.0643003061413765, + "learning_rate": 9.281106321813793e-07, + "loss": 0.0018, + "step": 100550 + }, + { + "epoch": 1.6454225640186535, + "grad_norm": 0.02502853237092495, + "learning_rate": 9.272821345692235e-07, + "loss": 0.0006, + "step": 100560 + }, + { + "epoch": 1.645586189969729, + "grad_norm": 0.04937305301427841, + "learning_rate": 9.26453969114079e-07, + "loss": 0.0007, + "step": 100570 + }, + { + "epoch": 1.645749815920805, + "grad_norm": 0.02921679988503456, + "learning_rate": 9.2562613588349e-07, + "loss": 0.0003, + "step": 100580 + }, + { + "epoch": 1.645913441871881, + "grad_norm": 0.047889672219753265, + "learning_rate": 9.247986349449705e-07, + "loss": 0.0016, + "step": 100590 + }, + { + "epoch": 1.6460770678229566, + "grad_norm": 0.06035612151026726, + "learning_rate": 9.239714663660105e-07, + "loss": 0.0008, + "step": 100600 + }, + { + "epoch": 1.6462406937740326, + "grad_norm": 0.03663825988769531, + "learning_rate": 9.231446302140684e-07, + "loss": 0.0009, + "step": 100610 + }, + { + "epoch": 1.6464043197251084, + "grad_norm": 0.009869643487036228, + "learning_rate": 9.223181265565812e-07, + "loss": 0.0008, + "step": 100620 + }, + { + "epoch": 1.6465679456761841, + "grad_norm": 0.0018252196023240685, + "learning_rate": 9.21491955460953e-07, + "loss": 0.0008, + "step": 100630 + }, + { + "epoch": 1.6467315716272601, + "grad_norm": 0.026634203270077705, + "learning_rate": 9.206661169945652e-07, + "loss": 0.001, + "step": 100640 + }, + { + "epoch": 1.646895197578336, + "grad_norm": 0.0012128757080063224, + "learning_rate": 9.198406112247693e-07, + "loss": 0.0008, + "step": 100650 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.028131825849413872, + "learning_rate": 9.190154382188921e-07, + "loss": 0.0005, + "step": 100660 + }, + { + "epoch": 1.6472224494804877, + "grad_norm": 0.00040771570638753474, + "learning_rate": 9.181905980442296e-07, + "loss": 0.0011, + "step": 100670 + }, + { + "epoch": 1.6473860754315635, + "grad_norm": 0.06667282432317734, + "learning_rate": 9.173660907680559e-07, + "loss": 0.0021, + "step": 100680 + }, + { + "epoch": 1.6475497013826392, + "grad_norm": 0.03825167194008827, + "learning_rate": 9.165419164576122e-07, + "loss": 0.0004, + "step": 100690 + }, + { + "epoch": 1.6477133273337152, + "grad_norm": 0.0063306232914328575, + "learning_rate": 9.157180751801182e-07, + "loss": 0.0016, + "step": 100700 + }, + { + "epoch": 1.647876953284791, + "grad_norm": 0.0033302123192697763, + "learning_rate": 9.148945670027604e-07, + "loss": 0.001, + "step": 100710 + }, + { + "epoch": 1.6480405792358668, + "grad_norm": 0.1574021577835083, + "learning_rate": 9.140713919927047e-07, + "loss": 0.0015, + "step": 100720 + }, + { + "epoch": 1.6482042051869428, + "grad_norm": 0.06321648508310318, + "learning_rate": 9.132485502170829e-07, + "loss": 0.0014, + "step": 100730 + }, + { + "epoch": 1.6483678311380183, + "grad_norm": 0.033277928829193115, + "learning_rate": 9.12426041743007e-07, + "loss": 0.0017, + "step": 100740 + }, + { + "epoch": 1.6485314570890943, + "grad_norm": 0.04327253997325897, + "learning_rate": 9.116038666375543e-07, + "loss": 0.0004, + "step": 100750 + }, + { + "epoch": 1.6486950830401703, + "grad_norm": 0.03747379779815674, + "learning_rate": 9.107820249677812e-07, + "loss": 0.0015, + "step": 100760 + }, + { + "epoch": 1.6488587089912459, + "grad_norm": 0.045122239738702774, + "learning_rate": 9.09960516800713e-07, + "loss": 0.0008, + "step": 100770 + }, + { + "epoch": 1.6490223349423219, + "grad_norm": 0.03921327367424965, + "learning_rate": 9.091393422033501e-07, + "loss": 0.0007, + "step": 100780 + }, + { + "epoch": 1.6491859608933976, + "grad_norm": 0.018943939357995987, + "learning_rate": 9.083185012426632e-07, + "loss": 0.0006, + "step": 100790 + }, + { + "epoch": 1.6493495868444734, + "grad_norm": 0.05014105513691902, + "learning_rate": 9.074979939855993e-07, + "loss": 0.0004, + "step": 100800 + }, + { + "epoch": 1.6495132127955494, + "grad_norm": 0.0029739083256572485, + "learning_rate": 9.066778204990745e-07, + "loss": 0.0009, + "step": 100810 + }, + { + "epoch": 1.6496768387466252, + "grad_norm": 0.12536069750785828, + "learning_rate": 9.058579808499806e-07, + "loss": 0.0012, + "step": 100820 + }, + { + "epoch": 1.649840464697701, + "grad_norm": 0.018265223130583763, + "learning_rate": 9.050384751051789e-07, + "loss": 0.0011, + "step": 100830 + }, + { + "epoch": 1.650004090648777, + "grad_norm": 0.005306945648044348, + "learning_rate": 9.042193033315084e-07, + "loss": 0.0008, + "step": 100840 + }, + { + "epoch": 1.6501677165998527, + "grad_norm": 0.050336480140686035, + "learning_rate": 9.034004655957767e-07, + "loss": 0.0008, + "step": 100850 + }, + { + "epoch": 1.6503313425509285, + "grad_norm": 0.001689878641627729, + "learning_rate": 9.025819619647636e-07, + "loss": 0.0006, + "step": 100860 + }, + { + "epoch": 1.6504949685020045, + "grad_norm": 0.06841377168893814, + "learning_rate": 9.017637925052264e-07, + "loss": 0.001, + "step": 100870 + }, + { + "epoch": 1.6506585944530803, + "grad_norm": 0.10851097106933594, + "learning_rate": 9.009459572838896e-07, + "loss": 0.0011, + "step": 100880 + }, + { + "epoch": 1.650822220404156, + "grad_norm": 0.11003616452217102, + "learning_rate": 9.001284563674562e-07, + "loss": 0.0009, + "step": 100890 + }, + { + "epoch": 1.650985846355232, + "grad_norm": 0.042798299342393875, + "learning_rate": 8.993112898225948e-07, + "loss": 0.0014, + "step": 100900 + }, + { + "epoch": 1.6511494723063078, + "grad_norm": 0.031225567683577538, + "learning_rate": 8.984944577159549e-07, + "loss": 0.0015, + "step": 100910 + }, + { + "epoch": 1.6513130982573836, + "grad_norm": 0.13762202858924866, + "learning_rate": 8.976779601141505e-07, + "loss": 0.0017, + "step": 100920 + }, + { + "epoch": 1.6514767242084596, + "grad_norm": 0.05078810080885887, + "learning_rate": 8.968617970837762e-07, + "loss": 0.0007, + "step": 100930 + }, + { + "epoch": 1.6516403501595351, + "grad_norm": 0.12267827242612839, + "learning_rate": 8.960459686913919e-07, + "loss": 0.0021, + "step": 100940 + }, + { + "epoch": 1.6518039761106111, + "grad_norm": 0.055990397930145264, + "learning_rate": 8.952304750035373e-07, + "loss": 0.0008, + "step": 100950 + }, + { + "epoch": 1.6519676020616871, + "grad_norm": 0.032464973628520966, + "learning_rate": 8.94415316086718e-07, + "loss": 0.001, + "step": 100960 + }, + { + "epoch": 1.6521312280127627, + "grad_norm": 0.03776313364505768, + "learning_rate": 8.936004920074182e-07, + "loss": 0.0009, + "step": 100970 + }, + { + "epoch": 1.6522948539638387, + "grad_norm": 0.05661027878522873, + "learning_rate": 8.9278600283209e-07, + "loss": 0.0012, + "step": 100980 + }, + { + "epoch": 1.6524584799149145, + "grad_norm": 0.01972813531756401, + "learning_rate": 8.919718486271628e-07, + "loss": 0.0011, + "step": 100990 + }, + { + "epoch": 1.6526221058659902, + "grad_norm": 0.039506617933511734, + "learning_rate": 8.911580294590332e-07, + "loss": 0.0005, + "step": 101000 + }, + { + "epoch": 1.6527857318170662, + "grad_norm": 0.0016242810525000095, + "learning_rate": 8.903445453940773e-07, + "loss": 0.0007, + "step": 101010 + }, + { + "epoch": 1.652949357768142, + "grad_norm": 0.02541460655629635, + "learning_rate": 8.895313964986363e-07, + "loss": 0.0013, + "step": 101020 + }, + { + "epoch": 1.6531129837192178, + "grad_norm": 0.02735931985080242, + "learning_rate": 8.887185828390304e-07, + "loss": 0.001, + "step": 101030 + }, + { + "epoch": 1.6532766096702938, + "grad_norm": 0.03217087686061859, + "learning_rate": 8.879061044815484e-07, + "loss": 0.0013, + "step": 101040 + }, + { + "epoch": 1.6534402356213695, + "grad_norm": 0.006479606498032808, + "learning_rate": 8.870939614924551e-07, + "loss": 0.0009, + "step": 101050 + }, + { + "epoch": 1.6536038615724453, + "grad_norm": 0.04930521175265312, + "learning_rate": 8.86282153937984e-07, + "loss": 0.0008, + "step": 101060 + }, + { + "epoch": 1.6537674875235213, + "grad_norm": 0.016630342230200768, + "learning_rate": 8.854706818843456e-07, + "loss": 0.0009, + "step": 101070 + }, + { + "epoch": 1.653931113474597, + "grad_norm": 0.0023141137789934874, + "learning_rate": 8.84659545397718e-07, + "loss": 0.0002, + "step": 101080 + }, + { + "epoch": 1.6540947394256729, + "grad_norm": 0.0774788111448288, + "learning_rate": 8.838487445442573e-07, + "loss": 0.0005, + "step": 101090 + }, + { + "epoch": 1.6542583653767489, + "grad_norm": 0.027807947248220444, + "learning_rate": 8.830382793900877e-07, + "loss": 0.0012, + "step": 101100 + }, + { + "epoch": 1.6544219913278246, + "grad_norm": 0.009336289949715137, + "learning_rate": 8.822281500013102e-07, + "loss": 0.0007, + "step": 101110 + }, + { + "epoch": 1.6545856172789004, + "grad_norm": 0.008764766156673431, + "learning_rate": 8.814183564439937e-07, + "loss": 0.0015, + "step": 101120 + }, + { + "epoch": 1.6547492432299764, + "grad_norm": 0.022864574566483498, + "learning_rate": 8.806088987841843e-07, + "loss": 0.0011, + "step": 101130 + }, + { + "epoch": 1.654912869181052, + "grad_norm": 0.03213363140821457, + "learning_rate": 8.797997770878969e-07, + "loss": 0.0014, + "step": 101140 + }, + { + "epoch": 1.655076495132128, + "grad_norm": 0.17430591583251953, + "learning_rate": 8.789909914211226e-07, + "loss": 0.0009, + "step": 101150 + }, + { + "epoch": 1.655240121083204, + "grad_norm": 0.00819132849574089, + "learning_rate": 8.781825418498213e-07, + "loss": 0.0007, + "step": 101160 + }, + { + "epoch": 1.6554037470342795, + "grad_norm": 0.05609424039721489, + "learning_rate": 8.773744284399288e-07, + "loss": 0.0016, + "step": 101170 + }, + { + "epoch": 1.6555673729853555, + "grad_norm": 0.015797214582562447, + "learning_rate": 8.765666512573523e-07, + "loss": 0.0008, + "step": 101180 + }, + { + "epoch": 1.6557309989364313, + "grad_norm": 0.0739670991897583, + "learning_rate": 8.757592103679685e-07, + "loss": 0.0011, + "step": 101190 + }, + { + "epoch": 1.655894624887507, + "grad_norm": 0.03178118169307709, + "learning_rate": 8.749521058376336e-07, + "loss": 0.0007, + "step": 101200 + }, + { + "epoch": 1.656058250838583, + "grad_norm": 0.006787098944187164, + "learning_rate": 8.741453377321685e-07, + "loss": 0.0005, + "step": 101210 + }, + { + "epoch": 1.6562218767896588, + "grad_norm": 0.11626537144184113, + "learning_rate": 8.733389061173742e-07, + "loss": 0.0006, + "step": 101220 + }, + { + "epoch": 1.6563855027407346, + "grad_norm": 0.08527819812297821, + "learning_rate": 8.72532811059017e-07, + "loss": 0.0009, + "step": 101230 + }, + { + "epoch": 1.6565491286918106, + "grad_norm": 0.01779862307012081, + "learning_rate": 8.717270526228422e-07, + "loss": 0.001, + "step": 101240 + }, + { + "epoch": 1.6567127546428864, + "grad_norm": 0.048301730304956436, + "learning_rate": 8.709216308745616e-07, + "loss": 0.0008, + "step": 101250 + }, + { + "epoch": 1.6568763805939621, + "grad_norm": 0.009677115827798843, + "learning_rate": 8.701165458798666e-07, + "loss": 0.0008, + "step": 101260 + }, + { + "epoch": 1.6570400065450381, + "grad_norm": 0.045163266360759735, + "learning_rate": 8.69311797704413e-07, + "loss": 0.0008, + "step": 101270 + }, + { + "epoch": 1.657203632496114, + "grad_norm": 0.05419962853193283, + "learning_rate": 8.685073864138371e-07, + "loss": 0.0015, + "step": 101280 + }, + { + "epoch": 1.6573672584471897, + "grad_norm": 0.07545183598995209, + "learning_rate": 8.677033120737405e-07, + "loss": 0.0016, + "step": 101290 + }, + { + "epoch": 1.6575308843982657, + "grad_norm": 0.047006651759147644, + "learning_rate": 8.668995747497045e-07, + "loss": 0.0006, + "step": 101300 + }, + { + "epoch": 1.6576945103493415, + "grad_norm": 0.00753290019929409, + "learning_rate": 8.660961745072755e-07, + "loss": 0.0005, + "step": 101310 + }, + { + "epoch": 1.6578581363004172, + "grad_norm": 0.08932722359895706, + "learning_rate": 8.652931114119789e-07, + "loss": 0.0004, + "step": 101320 + }, + { + "epoch": 1.6580217622514932, + "grad_norm": 0.052556682378053665, + "learning_rate": 8.644903855293074e-07, + "loss": 0.0021, + "step": 101330 + }, + { + "epoch": 1.6581853882025688, + "grad_norm": 0.0639825388789177, + "learning_rate": 8.636879969247308e-07, + "loss": 0.0009, + "step": 101340 + }, + { + "epoch": 1.6583490141536448, + "grad_norm": 0.0941416472196579, + "learning_rate": 8.628859456636873e-07, + "loss": 0.0012, + "step": 101350 + }, + { + "epoch": 1.6585126401047208, + "grad_norm": 0.043087173253297806, + "learning_rate": 8.620842318115919e-07, + "loss": 0.0008, + "step": 101360 + }, + { + "epoch": 1.6586762660557963, + "grad_norm": 0.02111460268497467, + "learning_rate": 8.612828554338265e-07, + "loss": 0.0008, + "step": 101370 + }, + { + "epoch": 1.6588398920068723, + "grad_norm": 0.06270020455121994, + "learning_rate": 8.604818165957518e-07, + "loss": 0.0012, + "step": 101380 + }, + { + "epoch": 1.659003517957948, + "grad_norm": 0.043625134974718094, + "learning_rate": 8.59681115362695e-07, + "loss": 0.001, + "step": 101390 + }, + { + "epoch": 1.6591671439090239, + "grad_norm": 0.11834300309419632, + "learning_rate": 8.588807517999609e-07, + "loss": 0.0012, + "step": 101400 + }, + { + "epoch": 1.6593307698600999, + "grad_norm": 0.08660273998975754, + "learning_rate": 8.580807259728225e-07, + "loss": 0.0008, + "step": 101410 + }, + { + "epoch": 1.6594943958111756, + "grad_norm": 0.03922102227807045, + "learning_rate": 8.572810379465291e-07, + "loss": 0.0011, + "step": 101420 + }, + { + "epoch": 1.6596580217622514, + "grad_norm": 0.04858485981822014, + "learning_rate": 8.564816877862981e-07, + "loss": 0.0012, + "step": 101430 + }, + { + "epoch": 1.6598216477133274, + "grad_norm": 0.009295348078012466, + "learning_rate": 8.556826755573245e-07, + "loss": 0.0013, + "step": 101440 + }, + { + "epoch": 1.6599852736644032, + "grad_norm": 0.0582355335354805, + "learning_rate": 8.548840013247706e-07, + "loss": 0.0067, + "step": 101450 + }, + { + "epoch": 1.660148899615479, + "grad_norm": 0.03318160027265549, + "learning_rate": 8.540856651537754e-07, + "loss": 0.0009, + "step": 101460 + }, + { + "epoch": 1.660312525566555, + "grad_norm": 0.04215823486447334, + "learning_rate": 8.53287667109447e-07, + "loss": 0.0006, + "step": 101470 + }, + { + "epoch": 1.6604761515176307, + "grad_norm": 0.008919753134250641, + "learning_rate": 8.524900072568693e-07, + "loss": 0.0007, + "step": 101480 + }, + { + "epoch": 1.6606397774687065, + "grad_norm": 0.055964186787605286, + "learning_rate": 8.51692685661094e-07, + "loss": 0.0011, + "step": 101490 + }, + { + "epoch": 1.6608034034197825, + "grad_norm": 0.07094421982765198, + "learning_rate": 8.508957023871512e-07, + "loss": 0.0013, + "step": 101500 + }, + { + "epoch": 1.6609670293708583, + "grad_norm": 0.011620515026152134, + "learning_rate": 8.500990575000384e-07, + "loss": 0.0005, + "step": 101510 + }, + { + "epoch": 1.661130655321934, + "grad_norm": 0.04202982038259506, + "learning_rate": 8.493027510647256e-07, + "loss": 0.0006, + "step": 101520 + }, + { + "epoch": 1.66129428127301, + "grad_norm": 0.07425212115049362, + "learning_rate": 8.485067831461602e-07, + "loss": 0.001, + "step": 101530 + }, + { + "epoch": 1.6614579072240856, + "grad_norm": 0.053521011024713516, + "learning_rate": 8.477111538092552e-07, + "loss": 0.0009, + "step": 101540 + }, + { + "epoch": 1.6616215331751616, + "grad_norm": 0.009066687896847725, + "learning_rate": 8.469158631189028e-07, + "loss": 0.0011, + "step": 101550 + }, + { + "epoch": 1.6617851591262376, + "grad_norm": 0.0825028344988823, + "learning_rate": 8.461209111399615e-07, + "loss": 0.0014, + "step": 101560 + }, + { + "epoch": 1.6619487850773131, + "grad_norm": 0.15130744874477386, + "learning_rate": 8.453262979372672e-07, + "loss": 0.0011, + "step": 101570 + }, + { + "epoch": 1.6621124110283891, + "grad_norm": 0.01658623106777668, + "learning_rate": 8.445320235756233e-07, + "loss": 0.0009, + "step": 101580 + }, + { + "epoch": 1.662276036979465, + "grad_norm": 0.01609295792877674, + "learning_rate": 8.437380881198104e-07, + "loss": 0.0006, + "step": 101590 + }, + { + "epoch": 1.6624396629305407, + "grad_norm": 0.10220176726579666, + "learning_rate": 8.429444916345774e-07, + "loss": 0.001, + "step": 101600 + }, + { + "epoch": 1.6626032888816167, + "grad_norm": 0.02159099653363228, + "learning_rate": 8.421512341846494e-07, + "loss": 0.0008, + "step": 101610 + }, + { + "epoch": 1.6627669148326925, + "grad_norm": 0.03925452381372452, + "learning_rate": 8.413583158347189e-07, + "loss": 0.0008, + "step": 101620 + }, + { + "epoch": 1.6629305407837682, + "grad_norm": 0.03249557316303253, + "learning_rate": 8.405657366494563e-07, + "loss": 0.0008, + "step": 101630 + }, + { + "epoch": 1.6630941667348442, + "grad_norm": 0.02010810747742653, + "learning_rate": 8.397734966935001e-07, + "loss": 0.0009, + "step": 101640 + }, + { + "epoch": 1.66325779268592, + "grad_norm": 0.14067593216896057, + "learning_rate": 8.38981596031464e-07, + "loss": 0.001, + "step": 101650 + }, + { + "epoch": 1.6634214186369958, + "grad_norm": 0.06923440098762512, + "learning_rate": 8.381900347279304e-07, + "loss": 0.001, + "step": 101660 + }, + { + "epoch": 1.6635850445880718, + "grad_norm": 0.034058842808008194, + "learning_rate": 8.373988128474597e-07, + "loss": 0.0007, + "step": 101670 + }, + { + "epoch": 1.6637486705391475, + "grad_norm": 0.008833528496325016, + "learning_rate": 8.366079304545776e-07, + "loss": 0.0009, + "step": 101680 + }, + { + "epoch": 1.6639122964902233, + "grad_norm": 0.08756767958402634, + "learning_rate": 8.358173876137893e-07, + "loss": 0.0015, + "step": 101690 + }, + { + "epoch": 1.6640759224412993, + "grad_norm": 0.011731510981917381, + "learning_rate": 8.350271843895658e-07, + "loss": 0.0005, + "step": 101700 + }, + { + "epoch": 1.6642395483923749, + "grad_norm": 0.08542631566524506, + "learning_rate": 8.342373208463556e-07, + "loss": 0.0009, + "step": 101710 + }, + { + "epoch": 1.6644031743434509, + "grad_norm": 0.02307390235364437, + "learning_rate": 8.334477970485755e-07, + "loss": 0.0018, + "step": 101720 + }, + { + "epoch": 1.6645668002945269, + "grad_norm": 0.05497150868177414, + "learning_rate": 8.326586130606184e-07, + "loss": 0.0017, + "step": 101730 + }, + { + "epoch": 1.6647304262456024, + "grad_norm": 0.04673496261239052, + "learning_rate": 8.318697689468447e-07, + "loss": 0.0005, + "step": 101740 + }, + { + "epoch": 1.6648940521966784, + "grad_norm": 0.026257064193487167, + "learning_rate": 8.310812647715932e-07, + "loss": 0.0009, + "step": 101750 + }, + { + "epoch": 1.6650576781477542, + "grad_norm": 0.003084322903305292, + "learning_rate": 8.302931005991682e-07, + "loss": 0.0005, + "step": 101760 + }, + { + "epoch": 1.66522130409883, + "grad_norm": 0.07489932328462601, + "learning_rate": 8.295052764938528e-07, + "loss": 0.0014, + "step": 101770 + }, + { + "epoch": 1.665384930049906, + "grad_norm": 0.00158429064322263, + "learning_rate": 8.287177925198969e-07, + "loss": 0.001, + "step": 101780 + }, + { + "epoch": 1.6655485560009817, + "grad_norm": 0.014105689711868763, + "learning_rate": 8.279306487415267e-07, + "loss": 0.0013, + "step": 101790 + }, + { + "epoch": 1.6657121819520575, + "grad_norm": 0.04114721715450287, + "learning_rate": 8.271438452229369e-07, + "loss": 0.0008, + "step": 101800 + }, + { + "epoch": 1.6658758079031335, + "grad_norm": 0.11233578622341156, + "learning_rate": 8.263573820282994e-07, + "loss": 0.0011, + "step": 101810 + }, + { + "epoch": 1.6660394338542093, + "grad_norm": 0.06644126027822495, + "learning_rate": 8.255712592217536e-07, + "loss": 0.0008, + "step": 101820 + }, + { + "epoch": 1.666203059805285, + "grad_norm": 0.0266830176115036, + "learning_rate": 8.247854768674119e-07, + "loss": 0.0008, + "step": 101830 + }, + { + "epoch": 1.666366685756361, + "grad_norm": 0.061021775007247925, + "learning_rate": 8.240000350293625e-07, + "loss": 0.0008, + "step": 101840 + }, + { + "epoch": 1.6665303117074368, + "grad_norm": 0.03792758658528328, + "learning_rate": 8.232149337716616e-07, + "loss": 0.0007, + "step": 101850 + }, + { + "epoch": 1.6666939376585126, + "grad_norm": 0.024493524804711342, + "learning_rate": 8.224301731583412e-07, + "loss": 0.0007, + "step": 101860 + }, + { + "epoch": 1.6668575636095886, + "grad_norm": 0.03626318648457527, + "learning_rate": 8.216457532534011e-07, + "loss": 0.0004, + "step": 101870 + }, + { + "epoch": 1.6670211895606644, + "grad_norm": 0.10387244075536728, + "learning_rate": 8.208616741208186e-07, + "loss": 0.0012, + "step": 101880 + }, + { + "epoch": 1.6671848155117401, + "grad_norm": 0.03235061094164848, + "learning_rate": 8.200779358245381e-07, + "loss": 0.0007, + "step": 101890 + }, + { + "epoch": 1.6673484414628161, + "grad_norm": 0.03363650664687157, + "learning_rate": 8.192945384284812e-07, + "loss": 0.0009, + "step": 101900 + }, + { + "epoch": 1.6675120674138917, + "grad_norm": 0.13841702044010162, + "learning_rate": 8.185114819965362e-07, + "loss": 0.001, + "step": 101910 + }, + { + "epoch": 1.6676756933649677, + "grad_norm": 0.058803677558898926, + "learning_rate": 8.177287665925693e-07, + "loss": 0.0005, + "step": 101920 + }, + { + "epoch": 1.6678393193160437, + "grad_norm": 0.032006945461034775, + "learning_rate": 8.169463922804139e-07, + "loss": 0.0006, + "step": 101930 + }, + { + "epoch": 1.6680029452671192, + "grad_norm": 0.06506620347499847, + "learning_rate": 8.1616435912388e-07, + "loss": 0.0007, + "step": 101940 + }, + { + "epoch": 1.6681665712181952, + "grad_norm": 0.13441327214241028, + "learning_rate": 8.153826671867454e-07, + "loss": 0.002, + "step": 101950 + }, + { + "epoch": 1.668330197169271, + "grad_norm": 0.014130309224128723, + "learning_rate": 8.146013165327637e-07, + "loss": 0.0007, + "step": 101960 + }, + { + "epoch": 1.6684938231203468, + "grad_norm": 0.017316646873950958, + "learning_rate": 8.138203072256579e-07, + "loss": 0.0008, + "step": 101970 + }, + { + "epoch": 1.6686574490714228, + "grad_norm": 0.05530500039458275, + "learning_rate": 8.130396393291262e-07, + "loss": 0.0009, + "step": 101980 + }, + { + "epoch": 1.6688210750224985, + "grad_norm": 0.1263323724269867, + "learning_rate": 8.122593129068351e-07, + "loss": 0.0014, + "step": 101990 + }, + { + "epoch": 1.6689847009735743, + "grad_norm": 0.00629509286954999, + "learning_rate": 8.114793280224281e-07, + "loss": 0.0006, + "step": 102000 + }, + { + "epoch": 1.6691483269246503, + "grad_norm": 0.012647460214793682, + "learning_rate": 8.106996847395149e-07, + "loss": 0.0008, + "step": 102010 + }, + { + "epoch": 1.669311952875726, + "grad_norm": 0.005564653780311346, + "learning_rate": 8.099203831216834e-07, + "loss": 0.0016, + "step": 102020 + }, + { + "epoch": 1.6694755788268019, + "grad_norm": 0.03609232231974602, + "learning_rate": 8.091414232324884e-07, + "loss": 0.0013, + "step": 102030 + }, + { + "epoch": 1.6696392047778779, + "grad_norm": 0.020126482471823692, + "learning_rate": 8.08362805135462e-07, + "loss": 0.0004, + "step": 102040 + }, + { + "epoch": 1.6698028307289536, + "grad_norm": 0.14946407079696655, + "learning_rate": 8.075845288941026e-07, + "loss": 0.0015, + "step": 102050 + }, + { + "epoch": 1.6699664566800294, + "grad_norm": 0.05827036127448082, + "learning_rate": 8.068065945718862e-07, + "loss": 0.0009, + "step": 102060 + }, + { + "epoch": 1.6701300826311054, + "grad_norm": 0.026677848771214485, + "learning_rate": 8.060290022322565e-07, + "loss": 0.0005, + "step": 102070 + }, + { + "epoch": 1.6702937085821812, + "grad_norm": 0.049987468868494034, + "learning_rate": 8.052517519386332e-07, + "loss": 0.0009, + "step": 102080 + }, + { + "epoch": 1.670457334533257, + "grad_norm": 0.0814337506890297, + "learning_rate": 8.044748437544042e-07, + "loss": 0.0011, + "step": 102090 + }, + { + "epoch": 1.670620960484333, + "grad_norm": 0.01272518653422594, + "learning_rate": 8.036982777429336e-07, + "loss": 0.0008, + "step": 102100 + }, + { + "epoch": 1.6707845864354085, + "grad_norm": 0.0295509472489357, + "learning_rate": 8.029220539675531e-07, + "loss": 0.0011, + "step": 102110 + }, + { + "epoch": 1.6709482123864845, + "grad_norm": 0.037884145975112915, + "learning_rate": 8.021461724915719e-07, + "loss": 0.0007, + "step": 102120 + }, + { + "epoch": 1.6711118383375605, + "grad_norm": 0.07349441200494766, + "learning_rate": 8.013706333782651e-07, + "loss": 0.0011, + "step": 102130 + }, + { + "epoch": 1.671275464288636, + "grad_norm": 0.009841103106737137, + "learning_rate": 8.005954366908853e-07, + "loss": 0.0007, + "step": 102140 + }, + { + "epoch": 1.671439090239712, + "grad_norm": 0.02861565724015236, + "learning_rate": 7.998205824926547e-07, + "loss": 0.0009, + "step": 102150 + }, + { + "epoch": 1.6716027161907878, + "grad_norm": 0.15918786823749542, + "learning_rate": 7.990460708467656e-07, + "loss": 0.0013, + "step": 102160 + }, + { + "epoch": 1.6717663421418636, + "grad_norm": 0.045576311647892, + "learning_rate": 7.982719018163871e-07, + "loss": 0.0009, + "step": 102170 + }, + { + "epoch": 1.6719299680929396, + "grad_norm": 0.010951322503387928, + "learning_rate": 7.974980754646561e-07, + "loss": 0.0008, + "step": 102180 + }, + { + "epoch": 1.6720935940440154, + "grad_norm": 0.0369180329144001, + "learning_rate": 7.967245918546845e-07, + "loss": 0.0007, + "step": 102190 + }, + { + "epoch": 1.6722572199950911, + "grad_norm": 0.00865790992975235, + "learning_rate": 7.959514510495536e-07, + "loss": 0.0007, + "step": 102200 + }, + { + "epoch": 1.6724208459461671, + "grad_norm": 0.018436286598443985, + "learning_rate": 7.951786531123202e-07, + "loss": 0.001, + "step": 102210 + }, + { + "epoch": 1.672584471897243, + "grad_norm": 0.0849384069442749, + "learning_rate": 7.944061981060086e-07, + "loss": 0.0013, + "step": 102220 + }, + { + "epoch": 1.6727480978483187, + "grad_norm": 0.020758312195539474, + "learning_rate": 7.936340860936198e-07, + "loss": 0.0007, + "step": 102230 + }, + { + "epoch": 1.6729117237993947, + "grad_norm": 0.09301984310150146, + "learning_rate": 7.928623171381222e-07, + "loss": 0.0012, + "step": 102240 + }, + { + "epoch": 1.6730753497504705, + "grad_norm": 0.07311088591814041, + "learning_rate": 7.920908913024616e-07, + "loss": 0.0005, + "step": 102250 + }, + { + "epoch": 1.6732389757015462, + "grad_norm": 0.032767944037914276, + "learning_rate": 7.913198086495505e-07, + "loss": 0.0005, + "step": 102260 + }, + { + "epoch": 1.6734026016526222, + "grad_norm": 0.10399367660284042, + "learning_rate": 7.905490692422774e-07, + "loss": 0.0004, + "step": 102270 + }, + { + "epoch": 1.673566227603698, + "grad_norm": 0.09880376607179642, + "learning_rate": 7.897786731434992e-07, + "loss": 0.001, + "step": 102280 + }, + { + "epoch": 1.6737298535547738, + "grad_norm": 0.029697954654693604, + "learning_rate": 7.890086204160497e-07, + "loss": 0.0009, + "step": 102290 + }, + { + "epoch": 1.6738934795058498, + "grad_norm": 0.21316224336624146, + "learning_rate": 7.882389111227284e-07, + "loss": 0.0007, + "step": 102300 + }, + { + "epoch": 1.6740571054569253, + "grad_norm": 0.08886222541332245, + "learning_rate": 7.874695453263131e-07, + "loss": 0.001, + "step": 102310 + }, + { + "epoch": 1.6742207314080013, + "grad_norm": 0.041089996695518494, + "learning_rate": 7.867005230895485e-07, + "loss": 0.001, + "step": 102320 + }, + { + "epoch": 1.6743843573590773, + "grad_norm": 0.05217922851443291, + "learning_rate": 7.859318444751557e-07, + "loss": 0.0008, + "step": 102330 + }, + { + "epoch": 1.6745479833101529, + "grad_norm": 0.051311932504177094, + "learning_rate": 7.851635095458226e-07, + "loss": 0.0006, + "step": 102340 + }, + { + "epoch": 1.6747116092612289, + "grad_norm": 0.06850788742303848, + "learning_rate": 7.843955183642143e-07, + "loss": 0.0007, + "step": 102350 + }, + { + "epoch": 1.6748752352123046, + "grad_norm": 0.0644073411822319, + "learning_rate": 7.836278709929645e-07, + "loss": 0.001, + "step": 102360 + }, + { + "epoch": 1.6750388611633804, + "grad_norm": 0.01286712009459734, + "learning_rate": 7.82860567494681e-07, + "loss": 0.0017, + "step": 102370 + }, + { + "epoch": 1.6752024871144564, + "grad_norm": 0.022207358852028847, + "learning_rate": 7.820936079319403e-07, + "loss": 0.0009, + "step": 102380 + }, + { + "epoch": 1.6753661130655322, + "grad_norm": 0.05233767628669739, + "learning_rate": 7.813269923672956e-07, + "loss": 0.0013, + "step": 102390 + }, + { + "epoch": 1.675529739016608, + "grad_norm": 0.06963980942964554, + "learning_rate": 7.805607208632676e-07, + "loss": 0.0009, + "step": 102400 + }, + { + "epoch": 1.675693364967684, + "grad_norm": 0.005263608414679766, + "learning_rate": 7.797947934823519e-07, + "loss": 0.0004, + "step": 102410 + }, + { + "epoch": 1.6758569909187597, + "grad_norm": 0.01303224265575409, + "learning_rate": 7.79029210287014e-07, + "loss": 0.001, + "step": 102420 + }, + { + "epoch": 1.6760206168698355, + "grad_norm": 0.040707413107156754, + "learning_rate": 7.782639713396933e-07, + "loss": 0.0008, + "step": 102430 + }, + { + "epoch": 1.6761842428209115, + "grad_norm": 0.002935064723715186, + "learning_rate": 7.774990767027985e-07, + "loss": 0.0005, + "step": 102440 + }, + { + "epoch": 1.6763478687719873, + "grad_norm": 0.022606706246733665, + "learning_rate": 7.767345264387155e-07, + "loss": 0.0005, + "step": 102450 + }, + { + "epoch": 1.676511494723063, + "grad_norm": 0.057300008833408356, + "learning_rate": 7.75970320609793e-07, + "loss": 0.001, + "step": 102460 + }, + { + "epoch": 1.676675120674139, + "grad_norm": 0.06762918829917908, + "learning_rate": 7.752064592783609e-07, + "loss": 0.0009, + "step": 102470 + }, + { + "epoch": 1.6768387466252146, + "grad_norm": 0.05211671069264412, + "learning_rate": 7.744429425067152e-07, + "loss": 0.0008, + "step": 102480 + }, + { + "epoch": 1.6770023725762906, + "grad_norm": 0.008036550134420395, + "learning_rate": 7.73679770357127e-07, + "loss": 0.0017, + "step": 102490 + }, + { + "epoch": 1.6771659985273666, + "grad_norm": 0.055241260677576065, + "learning_rate": 7.729169428918371e-07, + "loss": 0.0011, + "step": 102500 + }, + { + "epoch": 1.6773296244784421, + "grad_norm": 0.04189576581120491, + "learning_rate": 7.721544601730601e-07, + "loss": 0.0004, + "step": 102510 + }, + { + "epoch": 1.6774932504295181, + "grad_norm": 0.03463882580399513, + "learning_rate": 7.713923222629805e-07, + "loss": 0.0007, + "step": 102520 + }, + { + "epoch": 1.677656876380594, + "grad_norm": 0.05555075779557228, + "learning_rate": 7.706305292237564e-07, + "loss": 0.001, + "step": 102530 + }, + { + "epoch": 1.6778205023316697, + "grad_norm": 0.009825214743614197, + "learning_rate": 7.698690811175164e-07, + "loss": 0.0004, + "step": 102540 + }, + { + "epoch": 1.6779841282827457, + "grad_norm": 0.05704585835337639, + "learning_rate": 7.691079780063631e-07, + "loss": 0.0007, + "step": 102550 + }, + { + "epoch": 1.6781477542338215, + "grad_norm": 0.03767823427915573, + "learning_rate": 7.683472199523667e-07, + "loss": 0.0006, + "step": 102560 + }, + { + "epoch": 1.6783113801848972, + "grad_norm": 0.00850795116275549, + "learning_rate": 7.675868070175752e-07, + "loss": 0.0011, + "step": 102570 + }, + { + "epoch": 1.6784750061359732, + "grad_norm": 0.04947146400809288, + "learning_rate": 7.668267392640027e-07, + "loss": 0.0007, + "step": 102580 + }, + { + "epoch": 1.678638632087049, + "grad_norm": 0.11861610412597656, + "learning_rate": 7.660670167536399e-07, + "loss": 0.0016, + "step": 102590 + }, + { + "epoch": 1.6788022580381248, + "grad_norm": 0.019659509882330894, + "learning_rate": 7.653076395484454e-07, + "loss": 0.0007, + "step": 102600 + }, + { + "epoch": 1.6789658839892008, + "grad_norm": 0.11795555800199509, + "learning_rate": 7.645486077103525e-07, + "loss": 0.0065, + "step": 102610 + }, + { + "epoch": 1.6791295099402765, + "grad_norm": 0.09073202311992645, + "learning_rate": 7.637899213012657e-07, + "loss": 0.001, + "step": 102620 + }, + { + "epoch": 1.6792931358913523, + "grad_norm": 0.0036755509208887815, + "learning_rate": 7.630315803830585e-07, + "loss": 0.0009, + "step": 102630 + }, + { + "epoch": 1.6794567618424283, + "grad_norm": 0.03757796809077263, + "learning_rate": 7.622735850175811e-07, + "loss": 0.0007, + "step": 102640 + }, + { + "epoch": 1.679620387793504, + "grad_norm": 0.04312099516391754, + "learning_rate": 7.615159352666512e-07, + "loss": 0.001, + "step": 102650 + }, + { + "epoch": 1.6797840137445799, + "grad_norm": 0.061894431710243225, + "learning_rate": 7.607586311920622e-07, + "loss": 0.0008, + "step": 102660 + }, + { + "epoch": 1.6799476396956559, + "grad_norm": 0.05961727350950241, + "learning_rate": 7.600016728555753e-07, + "loss": 0.0007, + "step": 102670 + }, + { + "epoch": 1.6801112656467314, + "grad_norm": 0.00849273893982172, + "learning_rate": 7.592450603189272e-07, + "loss": 0.0017, + "step": 102680 + }, + { + "epoch": 1.6802748915978074, + "grad_norm": 0.031189408153295517, + "learning_rate": 7.584887936438224e-07, + "loss": 0.0007, + "step": 102690 + }, + { + "epoch": 1.6804385175488834, + "grad_norm": 0.010038296692073345, + "learning_rate": 7.57732872891942e-07, + "loss": 0.0007, + "step": 102700 + }, + { + "epoch": 1.680602143499959, + "grad_norm": 0.04147586598992348, + "learning_rate": 7.569772981249335e-07, + "loss": 0.0008, + "step": 102710 + }, + { + "epoch": 1.680765769451035, + "grad_norm": 0.008245404809713364, + "learning_rate": 7.56222069404422e-07, + "loss": 0.0014, + "step": 102720 + }, + { + "epoch": 1.6809293954021107, + "grad_norm": 0.054408587515354156, + "learning_rate": 7.554671867919989e-07, + "loss": 0.0009, + "step": 102730 + }, + { + "epoch": 1.6810930213531865, + "grad_norm": 0.042227186262607574, + "learning_rate": 7.547126503492319e-07, + "loss": 0.0011, + "step": 102740 + }, + { + "epoch": 1.6812566473042625, + "grad_norm": 0.08288440853357315, + "learning_rate": 7.539584601376565e-07, + "loss": 0.0013, + "step": 102750 + }, + { + "epoch": 1.6814202732553383, + "grad_norm": 0.20849573612213135, + "learning_rate": 7.532046162187839e-07, + "loss": 0.0007, + "step": 102760 + }, + { + "epoch": 1.681583899206414, + "grad_norm": 0.05855753645300865, + "learning_rate": 7.524511186540928e-07, + "loss": 0.0007, + "step": 102770 + }, + { + "epoch": 1.68174752515749, + "grad_norm": 0.04733557254076004, + "learning_rate": 7.516979675050379e-07, + "loss": 0.001, + "step": 102780 + }, + { + "epoch": 1.6819111511085658, + "grad_norm": 0.058088548481464386, + "learning_rate": 7.509451628330417e-07, + "loss": 0.001, + "step": 102790 + }, + { + "epoch": 1.6820747770596416, + "grad_norm": 0.026194332167506218, + "learning_rate": 7.501927046995028e-07, + "loss": 0.0011, + "step": 102800 + }, + { + "epoch": 1.6822384030107176, + "grad_norm": 0.05519293621182442, + "learning_rate": 7.494405931657866e-07, + "loss": 0.0007, + "step": 102810 + }, + { + "epoch": 1.6824020289617934, + "grad_norm": 0.02002830244600773, + "learning_rate": 7.486888282932353e-07, + "loss": 0.0008, + "step": 102820 + }, + { + "epoch": 1.6825656549128691, + "grad_norm": 0.05044343322515488, + "learning_rate": 7.479374101431575e-07, + "loss": 0.0011, + "step": 102830 + }, + { + "epoch": 1.6827292808639451, + "grad_norm": 0.007632914464920759, + "learning_rate": 7.471863387768391e-07, + "loss": 0.0006, + "step": 102840 + }, + { + "epoch": 1.682892906815021, + "grad_norm": 0.09232573956251144, + "learning_rate": 7.464356142555323e-07, + "loss": 0.0007, + "step": 102850 + }, + { + "epoch": 1.6830565327660967, + "grad_norm": 0.04856385663151741, + "learning_rate": 7.456852366404665e-07, + "loss": 0.0006, + "step": 102860 + }, + { + "epoch": 1.6832201587171727, + "grad_norm": 0.01687176339328289, + "learning_rate": 7.449352059928366e-07, + "loss": 0.0004, + "step": 102870 + }, + { + "epoch": 1.6833837846682482, + "grad_norm": 0.11527708172798157, + "learning_rate": 7.441855223738159e-07, + "loss": 0.0009, + "step": 102880 + }, + { + "epoch": 1.6835474106193242, + "grad_norm": 0.017740240320563316, + "learning_rate": 7.434361858445432e-07, + "loss": 0.0012, + "step": 102890 + }, + { + "epoch": 1.6837110365704002, + "grad_norm": 0.008882102556526661, + "learning_rate": 7.426871964661342e-07, + "loss": 0.0009, + "step": 102900 + }, + { + "epoch": 1.6838746625214758, + "grad_norm": 0.18059316277503967, + "learning_rate": 7.419385542996721e-07, + "loss": 0.001, + "step": 102910 + }, + { + "epoch": 1.6840382884725518, + "grad_norm": 0.023213600739836693, + "learning_rate": 7.411902594062148e-07, + "loss": 0.0006, + "step": 102920 + }, + { + "epoch": 1.6842019144236275, + "grad_norm": 0.09939147531986237, + "learning_rate": 7.404423118467907e-07, + "loss": 0.001, + "step": 102930 + }, + { + "epoch": 1.6843655403747033, + "grad_norm": 0.024556171149015427, + "learning_rate": 7.396947116823983e-07, + "loss": 0.005, + "step": 102940 + }, + { + "epoch": 1.6845291663257793, + "grad_norm": 0.0024782780092209578, + "learning_rate": 7.389474589740108e-07, + "loss": 0.0011, + "step": 102950 + }, + { + "epoch": 1.684692792276855, + "grad_norm": 0.03002997487783432, + "learning_rate": 7.38200553782571e-07, + "loss": 0.001, + "step": 102960 + }, + { + "epoch": 1.6848564182279309, + "grad_norm": 0.02018268220126629, + "learning_rate": 7.374539961689947e-07, + "loss": 0.0008, + "step": 102970 + }, + { + "epoch": 1.6850200441790069, + "grad_norm": 0.1748068630695343, + "learning_rate": 7.367077861941668e-07, + "loss": 0.0016, + "step": 102980 + }, + { + "epoch": 1.6851836701300826, + "grad_norm": 0.03704323619604111, + "learning_rate": 7.359619239189486e-07, + "loss": 0.0016, + "step": 102990 + }, + { + "epoch": 1.6853472960811584, + "grad_norm": 0.04993622004985809, + "learning_rate": 7.352164094041669e-07, + "loss": 0.0009, + "step": 103000 + }, + { + "epoch": 1.6855109220322344, + "grad_norm": 0.025849327445030212, + "learning_rate": 7.34471242710626e-07, + "loss": 0.0007, + "step": 103010 + }, + { + "epoch": 1.6856745479833102, + "grad_norm": 0.03651169314980507, + "learning_rate": 7.337264238990966e-07, + "loss": 0.0014, + "step": 103020 + }, + { + "epoch": 1.685838173934386, + "grad_norm": 0.04672649875283241, + "learning_rate": 7.329819530303262e-07, + "loss": 0.0011, + "step": 103030 + }, + { + "epoch": 1.686001799885462, + "grad_norm": 0.23591391742229462, + "learning_rate": 7.32237830165029e-07, + "loss": 0.0017, + "step": 103040 + }, + { + "epoch": 1.6861654258365377, + "grad_norm": 0.033353328704833984, + "learning_rate": 7.314940553638956e-07, + "loss": 0.0008, + "step": 103050 + }, + { + "epoch": 1.6863290517876135, + "grad_norm": 0.03773863613605499, + "learning_rate": 7.307506286875826e-07, + "loss": 0.0015, + "step": 103060 + }, + { + "epoch": 1.6864926777386895, + "grad_norm": 0.060078006237745285, + "learning_rate": 7.300075501967246e-07, + "loss": 0.001, + "step": 103070 + }, + { + "epoch": 1.686656303689765, + "grad_norm": 0.002214373555034399, + "learning_rate": 7.292648199519215e-07, + "loss": 0.0015, + "step": 103080 + }, + { + "epoch": 1.686819929640841, + "grad_norm": 0.06906892359256744, + "learning_rate": 7.285224380137512e-07, + "loss": 0.0017, + "step": 103090 + }, + { + "epoch": 1.686983555591917, + "grad_norm": 0.00794029701501131, + "learning_rate": 7.277804044427561e-07, + "loss": 0.0005, + "step": 103100 + }, + { + "epoch": 1.6871471815429926, + "grad_norm": 0.040784694254398346, + "learning_rate": 7.270387192994576e-07, + "loss": 0.0012, + "step": 103110 + }, + { + "epoch": 1.6873108074940686, + "grad_norm": 0.032550711184740067, + "learning_rate": 7.262973826443421e-07, + "loss": 0.0011, + "step": 103120 + }, + { + "epoch": 1.6874744334451444, + "grad_norm": 0.03277921304106712, + "learning_rate": 7.25556394537873e-07, + "loss": 0.001, + "step": 103130 + }, + { + "epoch": 1.6876380593962201, + "grad_norm": 0.044698312878608704, + "learning_rate": 7.248157550404799e-07, + "loss": 0.001, + "step": 103140 + }, + { + "epoch": 1.6878016853472961, + "grad_norm": 0.02356862835586071, + "learning_rate": 7.240754642125697e-07, + "loss": 0.0006, + "step": 103150 + }, + { + "epoch": 1.687965311298372, + "grad_norm": 0.030401326715946198, + "learning_rate": 7.233355221145155e-07, + "loss": 0.0008, + "step": 103160 + }, + { + "epoch": 1.6881289372494477, + "grad_norm": 0.03275863081216812, + "learning_rate": 7.225959288066665e-07, + "loss": 0.0014, + "step": 103170 + }, + { + "epoch": 1.6882925632005237, + "grad_norm": 0.05609789118170738, + "learning_rate": 7.218566843493396e-07, + "loss": 0.0009, + "step": 103180 + }, + { + "epoch": 1.6884561891515995, + "grad_norm": 0.00938730500638485, + "learning_rate": 7.211177888028276e-07, + "loss": 0.0014, + "step": 103190 + }, + { + "epoch": 1.6886198151026752, + "grad_norm": 0.0029103972483426332, + "learning_rate": 7.203792422273892e-07, + "loss": 0.0006, + "step": 103200 + }, + { + "epoch": 1.6887834410537512, + "grad_norm": 0.05422814562916756, + "learning_rate": 7.196410446832603e-07, + "loss": 0.0009, + "step": 103210 + }, + { + "epoch": 1.688947067004827, + "grad_norm": 0.031813185662031174, + "learning_rate": 7.189031962306436e-07, + "loss": 0.0016, + "step": 103220 + }, + { + "epoch": 1.6891106929559028, + "grad_norm": 0.033441491425037384, + "learning_rate": 7.181656969297185e-07, + "loss": 0.0008, + "step": 103230 + }, + { + "epoch": 1.6892743189069788, + "grad_norm": 0.029408013448119164, + "learning_rate": 7.174285468406289e-07, + "loss": 0.0011, + "step": 103240 + }, + { + "epoch": 1.6894379448580545, + "grad_norm": 0.048699889332056046, + "learning_rate": 7.166917460234985e-07, + "loss": 0.001, + "step": 103250 + }, + { + "epoch": 1.6896015708091303, + "grad_norm": 0.03850254416465759, + "learning_rate": 7.159552945384157e-07, + "loss": 0.0007, + "step": 103260 + }, + { + "epoch": 1.6897651967602063, + "grad_norm": 0.008482254110276699, + "learning_rate": 7.152191924454427e-07, + "loss": 0.0006, + "step": 103270 + }, + { + "epoch": 1.6899288227112819, + "grad_norm": 0.04625152051448822, + "learning_rate": 7.144834398046152e-07, + "loss": 0.0006, + "step": 103280 + }, + { + "epoch": 1.6900924486623579, + "grad_norm": 0.04332206770777702, + "learning_rate": 7.137480366759364e-07, + "loss": 0.0015, + "step": 103290 + }, + { + "epoch": 1.6902560746134339, + "grad_norm": 0.029152631759643555, + "learning_rate": 7.130129831193861e-07, + "loss": 0.0011, + "step": 103300 + }, + { + "epoch": 1.6904197005645094, + "grad_norm": 0.040577176958322525, + "learning_rate": 7.122782791949101e-07, + "loss": 0.0005, + "step": 103310 + }, + { + "epoch": 1.6905833265155854, + "grad_norm": 0.036893464624881744, + "learning_rate": 7.115439249624312e-07, + "loss": 0.0007, + "step": 103320 + }, + { + "epoch": 1.6907469524666612, + "grad_norm": 0.02258090116083622, + "learning_rate": 7.108099204818375e-07, + "loss": 0.0006, + "step": 103330 + }, + { + "epoch": 1.690910578417737, + "grad_norm": 0.16975213587284088, + "learning_rate": 7.100762658129945e-07, + "loss": 0.0016, + "step": 103340 + }, + { + "epoch": 1.691074204368813, + "grad_norm": 0.0500812865793705, + "learning_rate": 7.093429610157348e-07, + "loss": 0.0007, + "step": 103350 + }, + { + "epoch": 1.6912378303198887, + "grad_norm": 0.012306282296776772, + "learning_rate": 7.086100061498663e-07, + "loss": 0.0005, + "step": 103360 + }, + { + "epoch": 1.6914014562709645, + "grad_norm": 0.024904262274503708, + "learning_rate": 7.078774012751644e-07, + "loss": 0.0013, + "step": 103370 + }, + { + "epoch": 1.6915650822220405, + "grad_norm": 0.12684333324432373, + "learning_rate": 7.07145146451379e-07, + "loss": 0.0012, + "step": 103380 + }, + { + "epoch": 1.6917287081731163, + "grad_norm": 0.028593583032488823, + "learning_rate": 7.064132417382291e-07, + "loss": 0.0009, + "step": 103390 + }, + { + "epoch": 1.691892334124192, + "grad_norm": 0.06034789979457855, + "learning_rate": 7.056816871954086e-07, + "loss": 0.0008, + "step": 103400 + }, + { + "epoch": 1.692055960075268, + "grad_norm": 0.05239923670887947, + "learning_rate": 7.049504828825776e-07, + "loss": 0.0006, + "step": 103410 + }, + { + "epoch": 1.6922195860263438, + "grad_norm": 0.057671964168548584, + "learning_rate": 7.042196288593734e-07, + "loss": 0.0015, + "step": 103420 + }, + { + "epoch": 1.6923832119774196, + "grad_norm": 0.018480369821190834, + "learning_rate": 7.034891251853998e-07, + "loss": 0.0023, + "step": 103430 + }, + { + "epoch": 1.6925468379284956, + "grad_norm": 0.0787099152803421, + "learning_rate": 7.02758971920236e-07, + "loss": 0.0007, + "step": 103440 + }, + { + "epoch": 1.6927104638795711, + "grad_norm": 0.0029936376959085464, + "learning_rate": 7.020291691234287e-07, + "loss": 0.0006, + "step": 103450 + }, + { + "epoch": 1.6928740898306471, + "grad_norm": 0.03102288767695427, + "learning_rate": 7.01299716854501e-07, + "loss": 0.0006, + "step": 103460 + }, + { + "epoch": 1.6930377157817231, + "grad_norm": 0.015558217652142048, + "learning_rate": 7.005706151729419e-07, + "loss": 0.0008, + "step": 103470 + }, + { + "epoch": 1.6932013417327987, + "grad_norm": 0.036682598292827606, + "learning_rate": 6.998418641382165e-07, + "loss": 0.0012, + "step": 103480 + }, + { + "epoch": 1.6933649676838747, + "grad_norm": 0.051463935524225235, + "learning_rate": 6.991134638097569e-07, + "loss": 0.0014, + "step": 103490 + }, + { + "epoch": 1.6935285936349505, + "grad_norm": 0.003980196081101894, + "learning_rate": 6.983854142469715e-07, + "loss": 0.0006, + "step": 103500 + }, + { + "epoch": 1.6936922195860262, + "grad_norm": 0.07924409955739975, + "learning_rate": 6.976577155092351e-07, + "loss": 0.0009, + "step": 103510 + }, + { + "epoch": 1.6938558455371022, + "grad_norm": 0.12490904331207275, + "learning_rate": 6.969303676558992e-07, + "loss": 0.0006, + "step": 103520 + }, + { + "epoch": 1.694019471488178, + "grad_norm": 0.0019140464719384909, + "learning_rate": 6.962033707462807e-07, + "loss": 0.0004, + "step": 103530 + }, + { + "epoch": 1.6941830974392538, + "grad_norm": 0.007023755926638842, + "learning_rate": 6.954767248396737e-07, + "loss": 0.0007, + "step": 103540 + }, + { + "epoch": 1.6943467233903298, + "grad_norm": 0.031072556972503662, + "learning_rate": 6.94750429995339e-07, + "loss": 0.0006, + "step": 103550 + }, + { + "epoch": 1.6945103493414055, + "grad_norm": 0.005813030991703272, + "learning_rate": 6.94024486272512e-07, + "loss": 0.0006, + "step": 103560 + }, + { + "epoch": 1.6946739752924813, + "grad_norm": 0.04161210358142853, + "learning_rate": 6.932988937303981e-07, + "loss": 0.0013, + "step": 103570 + }, + { + "epoch": 1.6948376012435573, + "grad_norm": 0.027132993564009666, + "learning_rate": 6.925736524281723e-07, + "loss": 0.0016, + "step": 103580 + }, + { + "epoch": 1.695001227194633, + "grad_norm": 0.0214987900108099, + "learning_rate": 6.918487624249859e-07, + "loss": 0.0015, + "step": 103590 + }, + { + "epoch": 1.6951648531457089, + "grad_norm": 0.05383111163973808, + "learning_rate": 6.911242237799554e-07, + "loss": 0.0012, + "step": 103600 + }, + { + "epoch": 1.6953284790967849, + "grad_norm": 0.007977821864187717, + "learning_rate": 6.90400036552174e-07, + "loss": 0.0016, + "step": 103610 + }, + { + "epoch": 1.6954921050478606, + "grad_norm": 0.001280922326259315, + "learning_rate": 6.896762008007024e-07, + "loss": 0.0003, + "step": 103620 + }, + { + "epoch": 1.6956557309989364, + "grad_norm": 0.03987139090895653, + "learning_rate": 6.889527165845761e-07, + "loss": 0.0011, + "step": 103630 + }, + { + "epoch": 1.6958193569500124, + "grad_norm": 0.04433751478791237, + "learning_rate": 6.882295839627972e-07, + "loss": 0.0006, + "step": 103640 + }, + { + "epoch": 1.695982982901088, + "grad_norm": 0.0042603472247719765, + "learning_rate": 6.875068029943449e-07, + "loss": 0.001, + "step": 103650 + }, + { + "epoch": 1.696146608852164, + "grad_norm": 0.07267089933156967, + "learning_rate": 6.867843737381646e-07, + "loss": 0.0014, + "step": 103660 + }, + { + "epoch": 1.69631023480324, + "grad_norm": 0.000646169122774154, + "learning_rate": 6.860622962531766e-07, + "loss": 0.0006, + "step": 103670 + }, + { + "epoch": 1.6964738607543155, + "grad_norm": 0.13197201490402222, + "learning_rate": 6.853405705982697e-07, + "loss": 0.0013, + "step": 103680 + }, + { + "epoch": 1.6966374867053915, + "grad_norm": 0.015983087942004204, + "learning_rate": 6.846191968323068e-07, + "loss": 0.0008, + "step": 103690 + }, + { + "epoch": 1.6968011126564673, + "grad_norm": 0.042224958539009094, + "learning_rate": 6.838981750141188e-07, + "loss": 0.0012, + "step": 103700 + }, + { + "epoch": 1.696964738607543, + "grad_norm": 0.030647391453385353, + "learning_rate": 6.831775052025125e-07, + "loss": 0.0013, + "step": 103710 + }, + { + "epoch": 1.697128364558619, + "grad_norm": 0.009895850904285908, + "learning_rate": 6.824571874562608e-07, + "loss": 0.0012, + "step": 103720 + }, + { + "epoch": 1.6972919905096948, + "grad_norm": 0.03890618309378624, + "learning_rate": 6.817372218341123e-07, + "loss": 0.0013, + "step": 103730 + }, + { + "epoch": 1.6974556164607706, + "grad_norm": 0.05881976708769798, + "learning_rate": 6.810176083947828e-07, + "loss": 0.0022, + "step": 103740 + }, + { + "epoch": 1.6976192424118466, + "grad_norm": 0.033913180232048035, + "learning_rate": 6.802983471969638e-07, + "loss": 0.0012, + "step": 103750 + }, + { + "epoch": 1.6977828683629224, + "grad_norm": 0.051612187176942825, + "learning_rate": 6.795794382993137e-07, + "loss": 0.0009, + "step": 103760 + }, + { + "epoch": 1.6979464943139981, + "grad_norm": 0.06663978844881058, + "learning_rate": 6.788608817604658e-07, + "loss": 0.0008, + "step": 103770 + }, + { + "epoch": 1.6981101202650741, + "grad_norm": 0.00887993536889553, + "learning_rate": 6.781426776390221e-07, + "loss": 0.001, + "step": 103780 + }, + { + "epoch": 1.69827374621615, + "grad_norm": 0.0070457784458994865, + "learning_rate": 6.774248259935584e-07, + "loss": 0.0004, + "step": 103790 + }, + { + "epoch": 1.6984373721672257, + "grad_norm": 0.06422353535890579, + "learning_rate": 6.767073268826185e-07, + "loss": 0.0017, + "step": 103800 + }, + { + "epoch": 1.6986009981183017, + "grad_norm": 0.10052681714296341, + "learning_rate": 6.759901803647207e-07, + "loss": 0.0015, + "step": 103810 + }, + { + "epoch": 1.6987646240693774, + "grad_norm": 0.02152802050113678, + "learning_rate": 6.752733864983507e-07, + "loss": 0.0012, + "step": 103820 + }, + { + "epoch": 1.6989282500204532, + "grad_norm": 0.019163277000188828, + "learning_rate": 6.745569453419709e-07, + "loss": 0.0005, + "step": 103830 + }, + { + "epoch": 1.6990918759715292, + "grad_norm": 0.060903165489435196, + "learning_rate": 6.73840856954009e-07, + "loss": 0.0009, + "step": 103840 + }, + { + "epoch": 1.6992555019226048, + "grad_norm": 0.134161576628685, + "learning_rate": 6.731251213928685e-07, + "loss": 0.001, + "step": 103850 + }, + { + "epoch": 1.6994191278736808, + "grad_norm": 0.03672676905989647, + "learning_rate": 6.724097387169215e-07, + "loss": 0.0006, + "step": 103860 + }, + { + "epoch": 1.6995827538247568, + "grad_norm": 0.06439151614904404, + "learning_rate": 6.716947089845133e-07, + "loss": 0.0012, + "step": 103870 + }, + { + "epoch": 1.6997463797758323, + "grad_norm": 0.06246178597211838, + "learning_rate": 6.709800322539572e-07, + "loss": 0.0006, + "step": 103880 + }, + { + "epoch": 1.6999100057269083, + "grad_norm": 0.026958303526043892, + "learning_rate": 6.702657085835424e-07, + "loss": 0.0012, + "step": 103890 + }, + { + "epoch": 1.700073631677984, + "grad_norm": 0.0357089564204216, + "learning_rate": 6.695517380315253e-07, + "loss": 0.0005, + "step": 103900 + }, + { + "epoch": 1.7002372576290599, + "grad_norm": 0.032679539173841476, + "learning_rate": 6.688381206561339e-07, + "loss": 0.0013, + "step": 103910 + }, + { + "epoch": 1.7004008835801359, + "grad_norm": 0.02173207700252533, + "learning_rate": 6.681248565155707e-07, + "loss": 0.0003, + "step": 103920 + }, + { + "epoch": 1.7005645095312116, + "grad_norm": 0.034323230385780334, + "learning_rate": 6.674119456680045e-07, + "loss": 0.0009, + "step": 103930 + }, + { + "epoch": 1.7007281354822874, + "grad_norm": 0.020370975136756897, + "learning_rate": 6.666993881715805e-07, + "loss": 0.0006, + "step": 103940 + }, + { + "epoch": 1.7008917614333634, + "grad_norm": 0.12958072125911713, + "learning_rate": 6.659871840844101e-07, + "loss": 0.0031, + "step": 103950 + }, + { + "epoch": 1.7010553873844392, + "grad_norm": 0.06326296925544739, + "learning_rate": 6.652753334645806e-07, + "loss": 0.0011, + "step": 103960 + }, + { + "epoch": 1.701219013335515, + "grad_norm": 0.030958905816078186, + "learning_rate": 6.645638363701463e-07, + "loss": 0.0014, + "step": 103970 + }, + { + "epoch": 1.701382639286591, + "grad_norm": 0.0015847939066588879, + "learning_rate": 6.638526928591355e-07, + "loss": 0.0008, + "step": 103980 + }, + { + "epoch": 1.7015462652376667, + "grad_norm": 0.060073256492614746, + "learning_rate": 6.631419029895453e-07, + "loss": 0.001, + "step": 103990 + }, + { + "epoch": 1.7017098911887425, + "grad_norm": 0.02088100090622902, + "learning_rate": 6.624314668193472e-07, + "loss": 0.0005, + "step": 104000 + }, + { + "epoch": 1.7018735171398185, + "grad_norm": 0.025589486584067345, + "learning_rate": 6.617213844064796e-07, + "loss": 0.0025, + "step": 104010 + }, + { + "epoch": 1.7020371430908943, + "grad_norm": 0.0009547353256493807, + "learning_rate": 6.610116558088575e-07, + "loss": 0.0011, + "step": 104020 + }, + { + "epoch": 1.70220076904197, + "grad_norm": 0.0042994339019060135, + "learning_rate": 6.603022810843606e-07, + "loss": 0.0005, + "step": 104030 + }, + { + "epoch": 1.702364394993046, + "grad_norm": 0.023244405165314674, + "learning_rate": 6.595932602908461e-07, + "loss": 0.0006, + "step": 104040 + }, + { + "epoch": 1.7025280209441216, + "grad_norm": 0.01974932849407196, + "learning_rate": 6.588845934861371e-07, + "loss": 0.001, + "step": 104050 + }, + { + "epoch": 1.7026916468951976, + "grad_norm": 0.04217154532670975, + "learning_rate": 6.581762807280317e-07, + "loss": 0.0009, + "step": 104060 + }, + { + "epoch": 1.7028552728462736, + "grad_norm": 0.029763035476207733, + "learning_rate": 6.574683220742956e-07, + "loss": 0.0002, + "step": 104070 + }, + { + "epoch": 1.7030188987973491, + "grad_norm": 0.07592908293008804, + "learning_rate": 6.567607175826702e-07, + "loss": 0.0014, + "step": 104080 + }, + { + "epoch": 1.7031825247484251, + "grad_norm": 0.0875341147184372, + "learning_rate": 6.560534673108621e-07, + "loss": 0.0009, + "step": 104090 + }, + { + "epoch": 1.703346150699501, + "grad_norm": 0.005179502069950104, + "learning_rate": 6.553465713165547e-07, + "loss": 0.0006, + "step": 104100 + }, + { + "epoch": 1.7035097766505767, + "grad_norm": 0.021889645606279373, + "learning_rate": 6.546400296573985e-07, + "loss": 0.0007, + "step": 104110 + }, + { + "epoch": 1.7036734026016527, + "grad_norm": 0.17082317173480988, + "learning_rate": 6.53933842391018e-07, + "loss": 0.0011, + "step": 104120 + }, + { + "epoch": 1.7038370285527284, + "grad_norm": 0.025410214439034462, + "learning_rate": 6.53228009575006e-07, + "loss": 0.0006, + "step": 104130 + }, + { + "epoch": 1.7040006545038042, + "grad_norm": 0.016821803525090218, + "learning_rate": 6.525225312669292e-07, + "loss": 0.0008, + "step": 104140 + }, + { + "epoch": 1.7041642804548802, + "grad_norm": 0.0037400545552372932, + "learning_rate": 6.518174075243228e-07, + "loss": 0.0006, + "step": 104150 + }, + { + "epoch": 1.704327906405956, + "grad_norm": 0.018201595172286034, + "learning_rate": 6.511126384046956e-07, + "loss": 0.0015, + "step": 104160 + }, + { + "epoch": 1.7044915323570318, + "grad_norm": 0.016506675630807877, + "learning_rate": 6.504082239655246e-07, + "loss": 0.001, + "step": 104170 + }, + { + "epoch": 1.7046551583081078, + "grad_norm": 0.005652797874063253, + "learning_rate": 6.49704164264261e-07, + "loss": 0.0006, + "step": 104180 + }, + { + "epoch": 1.7048187842591835, + "grad_norm": 0.08837218582630157, + "learning_rate": 6.490004593583238e-07, + "loss": 0.0006, + "step": 104190 + }, + { + "epoch": 1.7049824102102593, + "grad_norm": 0.022922301664948463, + "learning_rate": 6.482971093051072e-07, + "loss": 0.0009, + "step": 104200 + }, + { + "epoch": 1.7051460361613353, + "grad_norm": 0.006444236263632774, + "learning_rate": 6.475941141619707e-07, + "loss": 0.0009, + "step": 104210 + }, + { + "epoch": 1.7053096621124109, + "grad_norm": 0.028774041682481766, + "learning_rate": 6.46891473986252e-07, + "loss": 0.001, + "step": 104220 + }, + { + "epoch": 1.7054732880634869, + "grad_norm": 0.03530431166291237, + "learning_rate": 6.461891888352539e-07, + "loss": 0.0003, + "step": 104230 + }, + { + "epoch": 1.7056369140145629, + "grad_norm": 0.032492201775312424, + "learning_rate": 6.454872587662514e-07, + "loss": 0.0008, + "step": 104240 + }, + { + "epoch": 1.7058005399656384, + "grad_norm": 0.025364955887198448, + "learning_rate": 6.447856838364935e-07, + "loss": 0.0015, + "step": 104250 + }, + { + "epoch": 1.7059641659167144, + "grad_norm": 0.04326396435499191, + "learning_rate": 6.440844641031969e-07, + "loss": 0.0011, + "step": 104260 + }, + { + "epoch": 1.7061277918677902, + "grad_norm": 0.018982864916324615, + "learning_rate": 6.433835996235522e-07, + "loss": 0.0008, + "step": 104270 + }, + { + "epoch": 1.706291417818866, + "grad_norm": 0.036128368228673935, + "learning_rate": 6.426830904547176e-07, + "loss": 0.0008, + "step": 104280 + }, + { + "epoch": 1.706455043769942, + "grad_norm": 0.02752913162112236, + "learning_rate": 6.419829366538266e-07, + "loss": 0.0011, + "step": 104290 + }, + { + "epoch": 1.7066186697210177, + "grad_norm": 0.0069655366241931915, + "learning_rate": 6.412831382779789e-07, + "loss": 0.0012, + "step": 104300 + }, + { + "epoch": 1.7067822956720935, + "grad_norm": 0.05464395508170128, + "learning_rate": 6.405836953842493e-07, + "loss": 0.0012, + "step": 104310 + }, + { + "epoch": 1.7069459216231695, + "grad_norm": 0.0283599104732275, + "learning_rate": 6.398846080296811e-07, + "loss": 0.0014, + "step": 104320 + }, + { + "epoch": 1.7071095475742453, + "grad_norm": 0.05556324124336243, + "learning_rate": 6.391858762712904e-07, + "loss": 0.0006, + "step": 104330 + }, + { + "epoch": 1.707273173525321, + "grad_norm": 0.011731604114174843, + "learning_rate": 6.384875001660623e-07, + "loss": 0.0007, + "step": 104340 + }, + { + "epoch": 1.707436799476397, + "grad_norm": 0.052520766854286194, + "learning_rate": 6.37789479770955e-07, + "loss": 0.0006, + "step": 104350 + }, + { + "epoch": 1.7076004254274728, + "grad_norm": 0.028871996328234673, + "learning_rate": 6.370918151428957e-07, + "loss": 0.0007, + "step": 104360 + }, + { + "epoch": 1.7077640513785486, + "grad_norm": 0.008424033410847187, + "learning_rate": 6.363945063387849e-07, + "loss": 0.0005, + "step": 104370 + }, + { + "epoch": 1.7079276773296246, + "grad_norm": 0.01747305691242218, + "learning_rate": 6.356975534154908e-07, + "loss": 0.0013, + "step": 104380 + }, + { + "epoch": 1.7080913032807004, + "grad_norm": 0.0701238289475441, + "learning_rate": 6.350009564298564e-07, + "loss": 0.0008, + "step": 104390 + }, + { + "epoch": 1.7082549292317761, + "grad_norm": 0.1218705028295517, + "learning_rate": 6.343047154386927e-07, + "loss": 0.0016, + "step": 104400 + }, + { + "epoch": 1.7084185551828521, + "grad_norm": 0.03191279247403145, + "learning_rate": 6.336088304987836e-07, + "loss": 0.0006, + "step": 104410 + }, + { + "epoch": 1.7085821811339277, + "grad_norm": 0.029151448979973793, + "learning_rate": 6.329133016668815e-07, + "loss": 0.0008, + "step": 104420 + }, + { + "epoch": 1.7087458070850037, + "grad_norm": 0.03438221663236618, + "learning_rate": 6.322181289997131e-07, + "loss": 0.001, + "step": 104430 + }, + { + "epoch": 1.7089094330360797, + "grad_norm": 0.014078960753977299, + "learning_rate": 6.315233125539733e-07, + "loss": 0.0006, + "step": 104440 + }, + { + "epoch": 1.7090730589871552, + "grad_norm": 0.06922315806150436, + "learning_rate": 6.308288523863298e-07, + "loss": 0.0011, + "step": 104450 + }, + { + "epoch": 1.7092366849382312, + "grad_norm": 0.0694236233830452, + "learning_rate": 6.301347485534193e-07, + "loss": 0.0005, + "step": 104460 + }, + { + "epoch": 1.709400310889307, + "grad_norm": 0.015566708520054817, + "learning_rate": 6.294410011118518e-07, + "loss": 0.0008, + "step": 104470 + }, + { + "epoch": 1.7095639368403828, + "grad_norm": 0.02127688191831112, + "learning_rate": 6.287476101182055e-07, + "loss": 0.0012, + "step": 104480 + }, + { + "epoch": 1.7097275627914588, + "grad_norm": 0.0193184781819582, + "learning_rate": 6.280545756290323e-07, + "loss": 0.0015, + "step": 104490 + }, + { + "epoch": 1.7098911887425345, + "grad_norm": 0.009278331883251667, + "learning_rate": 6.273618977008527e-07, + "loss": 0.0004, + "step": 104500 + }, + { + "epoch": 1.7100548146936103, + "grad_norm": 0.031821269541978836, + "learning_rate": 6.266695763901609e-07, + "loss": 0.0005, + "step": 104510 + }, + { + "epoch": 1.7102184406446863, + "grad_norm": 0.014671229757368565, + "learning_rate": 6.259776117534177e-07, + "loss": 0.0008, + "step": 104520 + }, + { + "epoch": 1.710382066595762, + "grad_norm": 0.07001157850027084, + "learning_rate": 6.252860038470604e-07, + "loss": 0.001, + "step": 104530 + }, + { + "epoch": 1.7105456925468379, + "grad_norm": 0.017514387145638466, + "learning_rate": 6.245947527274921e-07, + "loss": 0.0006, + "step": 104540 + }, + { + "epoch": 1.7107093184979139, + "grad_norm": 0.014102383516728878, + "learning_rate": 6.239038584510887e-07, + "loss": 0.0005, + "step": 104550 + }, + { + "epoch": 1.7108729444489896, + "grad_norm": 0.028875062242150307, + "learning_rate": 6.232133210741987e-07, + "loss": 0.0008, + "step": 104560 + }, + { + "epoch": 1.7110365704000654, + "grad_norm": 0.10573095083236694, + "learning_rate": 6.22523140653138e-07, + "loss": 0.0011, + "step": 104570 + }, + { + "epoch": 1.7112001963511414, + "grad_norm": 0.02643732912838459, + "learning_rate": 6.21833317244197e-07, + "loss": 0.0006, + "step": 104580 + }, + { + "epoch": 1.7113638223022172, + "grad_norm": 0.040627654641866684, + "learning_rate": 6.211438509036343e-07, + "loss": 0.0007, + "step": 104590 + }, + { + "epoch": 1.711527448253293, + "grad_norm": 0.010557485744357109, + "learning_rate": 6.204547416876822e-07, + "loss": 0.001, + "step": 104600 + }, + { + "epoch": 1.711691074204369, + "grad_norm": 0.06961768120527267, + "learning_rate": 6.197659896525393e-07, + "loss": 0.001, + "step": 104610 + }, + { + "epoch": 1.7118547001554445, + "grad_norm": 0.09079907089471817, + "learning_rate": 6.190775948543809e-07, + "loss": 0.0009, + "step": 104620 + }, + { + "epoch": 1.7120183261065205, + "grad_norm": 0.06503836065530777, + "learning_rate": 6.183895573493476e-07, + "loss": 0.0014, + "step": 104630 + }, + { + "epoch": 1.7121819520575965, + "grad_norm": 0.021787263453006744, + "learning_rate": 6.177018771935556e-07, + "loss": 0.0004, + "step": 104640 + }, + { + "epoch": 1.712345578008672, + "grad_norm": 0.02542039193212986, + "learning_rate": 6.170145544430872e-07, + "loss": 0.0014, + "step": 104650 + }, + { + "epoch": 1.712509203959748, + "grad_norm": 0.0482574887573719, + "learning_rate": 6.16327589154001e-07, + "loss": 0.0009, + "step": 104660 + }, + { + "epoch": 1.7126728299108238, + "grad_norm": 0.045166414231061935, + "learning_rate": 6.156409813823211e-07, + "loss": 0.0013, + "step": 104670 + }, + { + "epoch": 1.7128364558618996, + "grad_norm": 0.019430572167038918, + "learning_rate": 6.149547311840464e-07, + "loss": 0.0011, + "step": 104680 + }, + { + "epoch": 1.7130000818129756, + "grad_norm": 0.051017872989177704, + "learning_rate": 6.14268838615144e-07, + "loss": 0.0013, + "step": 104690 + }, + { + "epoch": 1.7131637077640514, + "grad_norm": 0.05045241490006447, + "learning_rate": 6.135833037315541e-07, + "loss": 0.0006, + "step": 104700 + }, + { + "epoch": 1.7133273337151271, + "grad_norm": 0.08285973221063614, + "learning_rate": 6.128981265891859e-07, + "loss": 0.0011, + "step": 104710 + }, + { + "epoch": 1.7134909596662031, + "grad_norm": 0.013021004386246204, + "learning_rate": 6.122133072439207e-07, + "loss": 0.0011, + "step": 104720 + }, + { + "epoch": 1.713654585617279, + "grad_norm": 0.011671613901853561, + "learning_rate": 6.115288457516089e-07, + "loss": 0.0006, + "step": 104730 + }, + { + "epoch": 1.7138182115683547, + "grad_norm": 0.06137021631002426, + "learning_rate": 6.108447421680741e-07, + "loss": 0.0006, + "step": 104740 + }, + { + "epoch": 1.7139818375194307, + "grad_norm": 0.0050993310287594795, + "learning_rate": 6.101609965491084e-07, + "loss": 0.001, + "step": 104750 + }, + { + "epoch": 1.7141454634705064, + "grad_norm": 0.06072403863072395, + "learning_rate": 6.094776089504773e-07, + "loss": 0.0007, + "step": 104760 + }, + { + "epoch": 1.7143090894215822, + "grad_norm": 0.09018073230981827, + "learning_rate": 6.087945794279132e-07, + "loss": 0.0009, + "step": 104770 + }, + { + "epoch": 1.7144727153726582, + "grad_norm": 0.01661476120352745, + "learning_rate": 6.081119080371239e-07, + "loss": 0.0012, + "step": 104780 + }, + { + "epoch": 1.714636341323734, + "grad_norm": 0.048093412071466446, + "learning_rate": 6.074295948337844e-07, + "loss": 0.0006, + "step": 104790 + }, + { + "epoch": 1.7147999672748098, + "grad_norm": 0.038393646478652954, + "learning_rate": 6.067476398735433e-07, + "loss": 0.0007, + "step": 104800 + }, + { + "epoch": 1.7149635932258858, + "grad_norm": 0.03215879946947098, + "learning_rate": 6.060660432120163e-07, + "loss": 0.0012, + "step": 104810 + }, + { + "epoch": 1.7151272191769613, + "grad_norm": 0.04612722620368004, + "learning_rate": 6.053848049047944e-07, + "loss": 0.0008, + "step": 104820 + }, + { + "epoch": 1.7152908451280373, + "grad_norm": 0.042219240218400955, + "learning_rate": 6.047039250074354e-07, + "loss": 0.0005, + "step": 104830 + }, + { + "epoch": 1.7154544710791133, + "grad_norm": 0.01066769752651453, + "learning_rate": 6.040234035754705e-07, + "loss": 0.0034, + "step": 104840 + }, + { + "epoch": 1.7156180970301889, + "grad_norm": 0.010555176995694637, + "learning_rate": 6.033432406644002e-07, + "loss": 0.0011, + "step": 104850 + }, + { + "epoch": 1.7157817229812649, + "grad_norm": 0.025802545249462128, + "learning_rate": 6.026634363296968e-07, + "loss": 0.0008, + "step": 104860 + }, + { + "epoch": 1.7159453489323406, + "grad_norm": 0.030181996524333954, + "learning_rate": 6.019839906268026e-07, + "loss": 0.0008, + "step": 104870 + }, + { + "epoch": 1.7161089748834164, + "grad_norm": 0.012528599239885807, + "learning_rate": 6.013049036111301e-07, + "loss": 0.0007, + "step": 104880 + }, + { + "epoch": 1.7162726008344924, + "grad_norm": 0.0035704607143998146, + "learning_rate": 6.006261753380649e-07, + "loss": 0.0009, + "step": 104890 + }, + { + "epoch": 1.7164362267855682, + "grad_norm": 0.03126341477036476, + "learning_rate": 5.999478058629599e-07, + "loss": 0.0015, + "step": 104900 + }, + { + "epoch": 1.716599852736644, + "grad_norm": 0.035922903567552567, + "learning_rate": 5.992697952411425e-07, + "loss": 0.0007, + "step": 104910 + }, + { + "epoch": 1.71676347868772, + "grad_norm": 0.012180100195109844, + "learning_rate": 5.985921435279069e-07, + "loss": 0.0008, + "step": 104920 + }, + { + "epoch": 1.7169271046387957, + "grad_norm": 0.025365808978676796, + "learning_rate": 5.979148507785226e-07, + "loss": 0.0005, + "step": 104930 + }, + { + "epoch": 1.7170907305898715, + "grad_norm": 0.13826243579387665, + "learning_rate": 5.972379170482245e-07, + "loss": 0.0014, + "step": 104940 + }, + { + "epoch": 1.7172543565409475, + "grad_norm": 0.020765230059623718, + "learning_rate": 5.965613423922234e-07, + "loss": 0.0003, + "step": 104950 + }, + { + "epoch": 1.7174179824920233, + "grad_norm": 0.041690435260534286, + "learning_rate": 5.958851268656968e-07, + "loss": 0.0008, + "step": 104960 + }, + { + "epoch": 1.717581608443099, + "grad_norm": 0.11864360421895981, + "learning_rate": 5.952092705237961e-07, + "loss": 0.0008, + "step": 104970 + }, + { + "epoch": 1.717745234394175, + "grad_norm": 0.03102138265967369, + "learning_rate": 5.945337734216394e-07, + "loss": 0.0006, + "step": 104980 + }, + { + "epoch": 1.7179088603452508, + "grad_norm": 0.04831431061029434, + "learning_rate": 5.938586356143205e-07, + "loss": 0.0006, + "step": 104990 + }, + { + "epoch": 1.7180724862963266, + "grad_norm": 0.019998712465167046, + "learning_rate": 5.931838571568998e-07, + "loss": 0.0015, + "step": 105000 + }, + { + "epoch": 1.7182361122474026, + "grad_norm": 0.016343412920832634, + "learning_rate": 5.925094381044111e-07, + "loss": 0.001, + "step": 105010 + }, + { + "epoch": 1.7183997381984781, + "grad_norm": 0.11839848011732101, + "learning_rate": 5.918353785118564e-07, + "loss": 0.0013, + "step": 105020 + }, + { + "epoch": 1.7185633641495541, + "grad_norm": 0.02918507345020771, + "learning_rate": 5.911616784342112e-07, + "loss": 0.0007, + "step": 105030 + }, + { + "epoch": 1.7187269901006301, + "grad_norm": 0.03054187446832657, + "learning_rate": 5.904883379264181e-07, + "loss": 0.0007, + "step": 105040 + }, + { + "epoch": 1.7188906160517057, + "grad_norm": 0.12949982285499573, + "learning_rate": 5.898153570433945e-07, + "loss": 0.001, + "step": 105050 + }, + { + "epoch": 1.7190542420027817, + "grad_norm": 0.012905827723443508, + "learning_rate": 5.891427358400248e-07, + "loss": 0.0009, + "step": 105060 + }, + { + "epoch": 1.7192178679538574, + "grad_norm": 0.05726579576730728, + "learning_rate": 5.884704743711678e-07, + "loss": 0.0012, + "step": 105070 + }, + { + "epoch": 1.7193814939049332, + "grad_norm": 0.09188417345285416, + "learning_rate": 5.877985726916485e-07, + "loss": 0.0016, + "step": 105080 + }, + { + "epoch": 1.7195451198560092, + "grad_norm": 0.06933452934026718, + "learning_rate": 5.87127030856267e-07, + "loss": 0.0009, + "step": 105090 + }, + { + "epoch": 1.719708745807085, + "grad_norm": 0.02457580342888832, + "learning_rate": 5.864558489197903e-07, + "loss": 0.002, + "step": 105100 + }, + { + "epoch": 1.7198723717581608, + "grad_norm": 0.04385444149374962, + "learning_rate": 5.857850269369586e-07, + "loss": 0.0015, + "step": 105110 + }, + { + "epoch": 1.7200359977092368, + "grad_norm": 0.012341826222836971, + "learning_rate": 5.851145649624812e-07, + "loss": 0.0009, + "step": 105120 + }, + { + "epoch": 1.7201996236603125, + "grad_norm": 0.0032532778568565845, + "learning_rate": 5.844444630510404e-07, + "loss": 0.0009, + "step": 105130 + }, + { + "epoch": 1.7203632496113883, + "grad_norm": 0.1358296126127243, + "learning_rate": 5.837747212572847e-07, + "loss": 0.0016, + "step": 105140 + }, + { + "epoch": 1.7205268755624643, + "grad_norm": 0.2498563677072525, + "learning_rate": 5.831053396358383e-07, + "loss": 0.0019, + "step": 105150 + }, + { + "epoch": 1.72069050151354, + "grad_norm": 0.021418867632746696, + "learning_rate": 5.824363182412923e-07, + "loss": 0.0009, + "step": 105160 + }, + { + "epoch": 1.7208541274646159, + "grad_norm": 0.009927124716341496, + "learning_rate": 5.81767657128211e-07, + "loss": 0.0008, + "step": 105170 + }, + { + "epoch": 1.7210177534156919, + "grad_norm": 0.030217165127396584, + "learning_rate": 5.810993563511263e-07, + "loss": 0.001, + "step": 105180 + }, + { + "epoch": 1.7211813793667674, + "grad_norm": 0.061702560633420944, + "learning_rate": 5.804314159645452e-07, + "loss": 0.0005, + "step": 105190 + }, + { + "epoch": 1.7213450053178434, + "grad_norm": 0.03343638777732849, + "learning_rate": 5.797638360229413e-07, + "loss": 0.0013, + "step": 105200 + }, + { + "epoch": 1.7215086312689194, + "grad_norm": 0.018403608351945877, + "learning_rate": 5.790966165807582e-07, + "loss": 0.0013, + "step": 105210 + }, + { + "epoch": 1.721672257219995, + "grad_norm": 0.019773226231336594, + "learning_rate": 5.784297576924152e-07, + "loss": 0.0023, + "step": 105220 + }, + { + "epoch": 1.721835883171071, + "grad_norm": 0.04786520078778267, + "learning_rate": 5.777632594122967e-07, + "loss": 0.0013, + "step": 105230 + }, + { + "epoch": 1.7219995091221467, + "grad_norm": 0.018433310091495514, + "learning_rate": 5.770971217947624e-07, + "loss": 0.0013, + "step": 105240 + }, + { + "epoch": 1.7221631350732225, + "grad_norm": 0.04635376110672951, + "learning_rate": 5.764313448941372e-07, + "loss": 0.0005, + "step": 105250 + }, + { + "epoch": 1.7223267610242985, + "grad_norm": 0.030670208856463432, + "learning_rate": 5.757659287647227e-07, + "loss": 0.001, + "step": 105260 + }, + { + "epoch": 1.7224903869753743, + "grad_norm": 0.06548034399747849, + "learning_rate": 5.751008734607855e-07, + "loss": 0.0011, + "step": 105270 + }, + { + "epoch": 1.72265401292645, + "grad_norm": 0.057641465216875076, + "learning_rate": 5.744361790365665e-07, + "loss": 0.0006, + "step": 105280 + }, + { + "epoch": 1.722817638877526, + "grad_norm": 0.021024368703365326, + "learning_rate": 5.737718455462754e-07, + "loss": 0.0008, + "step": 105290 + }, + { + "epoch": 1.7229812648286018, + "grad_norm": 0.026323791593313217, + "learning_rate": 5.731078730440942e-07, + "loss": 0.0007, + "step": 105300 + }, + { + "epoch": 1.7231448907796776, + "grad_norm": 0.03075254335999489, + "learning_rate": 5.724442615841719e-07, + "loss": 0.0006, + "step": 105310 + }, + { + "epoch": 1.7233085167307536, + "grad_norm": 0.09230765700340271, + "learning_rate": 5.717810112206335e-07, + "loss": 0.0014, + "step": 105320 + }, + { + "epoch": 1.7234721426818294, + "grad_norm": 0.013636085204780102, + "learning_rate": 5.711181220075679e-07, + "loss": 0.0011, + "step": 105330 + }, + { + "epoch": 1.7236357686329051, + "grad_norm": 0.03024132549762726, + "learning_rate": 5.704555939990414e-07, + "loss": 0.0023, + "step": 105340 + }, + { + "epoch": 1.7237993945839811, + "grad_norm": 0.034921981394290924, + "learning_rate": 5.697934272490852e-07, + "loss": 0.0013, + "step": 105350 + }, + { + "epoch": 1.723963020535057, + "grad_norm": 0.058230914175510406, + "learning_rate": 5.691316218117049e-07, + "loss": 0.0007, + "step": 105360 + }, + { + "epoch": 1.7241266464861327, + "grad_norm": 0.03640428185462952, + "learning_rate": 5.68470177740873e-07, + "loss": 0.0005, + "step": 105370 + }, + { + "epoch": 1.7242902724372087, + "grad_norm": 0.0640496164560318, + "learning_rate": 5.678090950905379e-07, + "loss": 0.0012, + "step": 105380 + }, + { + "epoch": 1.7244538983882842, + "grad_norm": 0.035792384296655655, + "learning_rate": 5.671483739146116e-07, + "loss": 0.0008, + "step": 105390 + }, + { + "epoch": 1.7246175243393602, + "grad_norm": 0.005545841529965401, + "learning_rate": 5.664880142669838e-07, + "loss": 0.0008, + "step": 105400 + }, + { + "epoch": 1.7247811502904362, + "grad_norm": 0.011500423774123192, + "learning_rate": 5.658280162015078e-07, + "loss": 0.001, + "step": 105410 + }, + { + "epoch": 1.7249447762415118, + "grad_norm": 0.04240519180893898, + "learning_rate": 5.651683797720136e-07, + "loss": 0.0012, + "step": 105420 + }, + { + "epoch": 1.7251084021925878, + "grad_norm": 0.06422310322523117, + "learning_rate": 5.645091050322971e-07, + "loss": 0.0005, + "step": 105430 + }, + { + "epoch": 1.7252720281436635, + "grad_norm": 0.009497438557446003, + "learning_rate": 5.638501920361278e-07, + "loss": 0.0009, + "step": 105440 + }, + { + "epoch": 1.7254356540947393, + "grad_norm": 0.12026920169591904, + "learning_rate": 5.631916408372429e-07, + "loss": 0.0007, + "step": 105450 + }, + { + "epoch": 1.7255992800458153, + "grad_norm": 0.09134354442358017, + "learning_rate": 5.625334514893532e-07, + "loss": 0.0004, + "step": 105460 + }, + { + "epoch": 1.725762905996891, + "grad_norm": 0.028013909235596657, + "learning_rate": 5.618756240461371e-07, + "loss": 0.0007, + "step": 105470 + }, + { + "epoch": 1.7259265319479669, + "grad_norm": 0.09132256358861923, + "learning_rate": 5.612181585612459e-07, + "loss": 0.0013, + "step": 105480 + }, + { + "epoch": 1.7260901578990429, + "grad_norm": 0.15687842667102814, + "learning_rate": 5.605610550882989e-07, + "loss": 0.0016, + "step": 105490 + }, + { + "epoch": 1.7262537838501186, + "grad_norm": 0.02945963852107525, + "learning_rate": 5.59904313680889e-07, + "loss": 0.0006, + "step": 105500 + }, + { + "epoch": 1.7264174098011944, + "grad_norm": 0.05370291694998741, + "learning_rate": 5.592479343925766e-07, + "loss": 0.0012, + "step": 105510 + }, + { + "epoch": 1.7265810357522704, + "grad_norm": 0.09495176374912262, + "learning_rate": 5.585919172768928e-07, + "loss": 0.0007, + "step": 105520 + }, + { + "epoch": 1.7267446617033462, + "grad_norm": 0.025100622326135635, + "learning_rate": 5.579362623873425e-07, + "loss": 0.001, + "step": 105530 + }, + { + "epoch": 1.726908287654422, + "grad_norm": 0.018807893618941307, + "learning_rate": 5.572809697773962e-07, + "loss": 0.0007, + "step": 105540 + }, + { + "epoch": 1.727071913605498, + "grad_norm": 0.0693151205778122, + "learning_rate": 5.566260395004996e-07, + "loss": 0.0012, + "step": 105550 + }, + { + "epoch": 1.7272355395565737, + "grad_norm": 0.011685054749250412, + "learning_rate": 5.559714716100645e-07, + "loss": 0.0004, + "step": 105560 + }, + { + "epoch": 1.7273991655076495, + "grad_norm": 0.17558389902114868, + "learning_rate": 5.553172661594774e-07, + "loss": 0.0012, + "step": 105570 + }, + { + "epoch": 1.7275627914587255, + "grad_norm": 0.22072334587574005, + "learning_rate": 5.546634232020914e-07, + "loss": 0.0011, + "step": 105580 + }, + { + "epoch": 1.727726417409801, + "grad_norm": 0.05899371579289436, + "learning_rate": 5.540099427912326e-07, + "loss": 0.0009, + "step": 105590 + }, + { + "epoch": 1.727890043360877, + "grad_norm": 0.05088042467832565, + "learning_rate": 5.533568249801957e-07, + "loss": 0.0008, + "step": 105600 + }, + { + "epoch": 1.728053669311953, + "grad_norm": 0.06190170720219612, + "learning_rate": 5.52704069822248e-07, + "loss": 0.0013, + "step": 105610 + }, + { + "epoch": 1.7282172952630286, + "grad_norm": 0.0814126506447792, + "learning_rate": 5.520516773706247e-07, + "loss": 0.0012, + "step": 105620 + }, + { + "epoch": 1.7283809212141046, + "grad_norm": 0.028053781017661095, + "learning_rate": 5.513996476785344e-07, + "loss": 0.0007, + "step": 105630 + }, + { + "epoch": 1.7285445471651804, + "grad_norm": 0.025459809228777885, + "learning_rate": 5.507479807991522e-07, + "loss": 0.001, + "step": 105640 + }, + { + "epoch": 1.7287081731162561, + "grad_norm": 0.04505978524684906, + "learning_rate": 5.500966767856281e-07, + "loss": 0.0006, + "step": 105650 + }, + { + "epoch": 1.7288717990673321, + "grad_norm": 0.08860975503921509, + "learning_rate": 5.494457356910782e-07, + "loss": 0.0005, + "step": 105660 + }, + { + "epoch": 1.729035425018408, + "grad_norm": 0.04567332938313484, + "learning_rate": 5.487951575685929e-07, + "loss": 0.0011, + "step": 105670 + }, + { + "epoch": 1.7291990509694837, + "grad_norm": 0.07471456378698349, + "learning_rate": 5.481449424712293e-07, + "loss": 0.0012, + "step": 105680 + }, + { + "epoch": 1.7293626769205597, + "grad_norm": 0.09998209029436111, + "learning_rate": 5.474950904520187e-07, + "loss": 0.0012, + "step": 105690 + }, + { + "epoch": 1.7295263028716354, + "grad_norm": 0.05922834202647209, + "learning_rate": 5.468456015639584e-07, + "loss": 0.0006, + "step": 105700 + }, + { + "epoch": 1.7296899288227112, + "grad_norm": 0.048251040279865265, + "learning_rate": 5.461964758600213e-07, + "loss": 0.0009, + "step": 105710 + }, + { + "epoch": 1.7298535547737872, + "grad_norm": 0.021195555105805397, + "learning_rate": 5.455477133931459e-07, + "loss": 0.0009, + "step": 105720 + }, + { + "epoch": 1.730017180724863, + "grad_norm": 0.05589756742119789, + "learning_rate": 5.44899314216244e-07, + "loss": 0.0008, + "step": 105730 + }, + { + "epoch": 1.7301808066759388, + "grad_norm": 0.04453357309103012, + "learning_rate": 5.442512783821957e-07, + "loss": 0.0005, + "step": 105740 + }, + { + "epoch": 1.7303444326270148, + "grad_norm": 0.0006272813188843429, + "learning_rate": 5.436036059438543e-07, + "loss": 0.0006, + "step": 105750 + }, + { + "epoch": 1.7305080585780905, + "grad_norm": 0.011169848963618279, + "learning_rate": 5.429562969540398e-07, + "loss": 0.0012, + "step": 105760 + }, + { + "epoch": 1.7306716845291663, + "grad_norm": 0.00949709489941597, + "learning_rate": 5.423093514655464e-07, + "loss": 0.0007, + "step": 105770 + }, + { + "epoch": 1.7308353104802423, + "grad_norm": 0.06974199414253235, + "learning_rate": 5.41662769531135e-07, + "loss": 0.0009, + "step": 105780 + }, + { + "epoch": 1.7309989364313179, + "grad_norm": 0.014135818928480148, + "learning_rate": 5.41016551203541e-07, + "loss": 0.0003, + "step": 105790 + }, + { + "epoch": 1.7311625623823939, + "grad_norm": 0.013617103919386864, + "learning_rate": 5.40370696535465e-07, + "loss": 0.0006, + "step": 105800 + }, + { + "epoch": 1.7313261883334699, + "grad_norm": 0.02486949786543846, + "learning_rate": 5.397252055795827e-07, + "loss": 0.0008, + "step": 105810 + }, + { + "epoch": 1.7314898142845454, + "grad_norm": 0.053359244018793106, + "learning_rate": 5.39080078388537e-07, + "loss": 0.0009, + "step": 105820 + }, + { + "epoch": 1.7316534402356214, + "grad_norm": 0.02739722840487957, + "learning_rate": 5.384353150149429e-07, + "loss": 0.0011, + "step": 105830 + }, + { + "epoch": 1.7318170661866972, + "grad_norm": 0.075003482401371, + "learning_rate": 5.377909155113858e-07, + "loss": 0.0008, + "step": 105840 + }, + { + "epoch": 1.731980692137773, + "grad_norm": 0.022716032341122627, + "learning_rate": 5.371468799304186e-07, + "loss": 0.0011, + "step": 105850 + }, + { + "epoch": 1.732144318088849, + "grad_norm": 0.06845776736736298, + "learning_rate": 5.365032083245691e-07, + "loss": 0.0007, + "step": 105860 + }, + { + "epoch": 1.7323079440399247, + "grad_norm": 0.030856722965836525, + "learning_rate": 5.358599007463305e-07, + "loss": 0.0007, + "step": 105870 + }, + { + "epoch": 1.7324715699910005, + "grad_norm": 0.043110672384500504, + "learning_rate": 5.352169572481713e-07, + "loss": 0.0008, + "step": 105880 + }, + { + "epoch": 1.7326351959420765, + "grad_norm": 0.024211639538407326, + "learning_rate": 5.345743778825258e-07, + "loss": 0.001, + "step": 105890 + }, + { + "epoch": 1.7327988218931523, + "grad_norm": 0.01669558510184288, + "learning_rate": 5.339321627018018e-07, + "loss": 0.0011, + "step": 105900 + }, + { + "epoch": 1.732962447844228, + "grad_norm": 0.04783722013235092, + "learning_rate": 5.332903117583749e-07, + "loss": 0.0006, + "step": 105910 + }, + { + "epoch": 1.733126073795304, + "grad_norm": 0.1694510579109192, + "learning_rate": 5.326488251045941e-07, + "loss": 0.0006, + "step": 105920 + }, + { + "epoch": 1.7332896997463798, + "grad_norm": 0.05942774564027786, + "learning_rate": 5.32007702792775e-07, + "loss": 0.001, + "step": 105930 + }, + { + "epoch": 1.7334533256974556, + "grad_norm": 0.010538343340158463, + "learning_rate": 5.313669448752073e-07, + "loss": 0.0006, + "step": 105940 + }, + { + "epoch": 1.7336169516485316, + "grad_norm": 0.08307705074548721, + "learning_rate": 5.307265514041471e-07, + "loss": 0.0006, + "step": 105950 + }, + { + "epoch": 1.7337805775996074, + "grad_norm": 0.012879393063485622, + "learning_rate": 5.300865224318241e-07, + "loss": 0.0009, + "step": 105960 + }, + { + "epoch": 1.7339442035506831, + "grad_norm": 0.05131203308701515, + "learning_rate": 5.294468580104356e-07, + "loss": 0.0011, + "step": 105970 + }, + { + "epoch": 1.7341078295017591, + "grad_norm": 0.03458397835493088, + "learning_rate": 5.288075581921526e-07, + "loss": 0.0006, + "step": 105980 + }, + { + "epoch": 1.7342714554528347, + "grad_norm": 0.06781768798828125, + "learning_rate": 5.281686230291111e-07, + "loss": 0.0005, + "step": 105990 + }, + { + "epoch": 1.7344350814039107, + "grad_norm": 0.012050077319145203, + "learning_rate": 5.275300525734239e-07, + "loss": 0.0004, + "step": 106000 + }, + { + "epoch": 1.7345987073549864, + "grad_norm": 0.021382614970207214, + "learning_rate": 5.268918468771678e-07, + "loss": 0.0009, + "step": 106010 + }, + { + "epoch": 1.7347623333060622, + "grad_norm": 0.06707541644573212, + "learning_rate": 5.262540059923949e-07, + "loss": 0.001, + "step": 106020 + }, + { + "epoch": 1.7349259592571382, + "grad_norm": 0.002782185561954975, + "learning_rate": 5.25616529971123e-07, + "loss": 0.0011, + "step": 106030 + }, + { + "epoch": 1.735089585208214, + "grad_norm": 0.05399351194500923, + "learning_rate": 5.249794188653446e-07, + "loss": 0.0005, + "step": 106040 + }, + { + "epoch": 1.7352532111592898, + "grad_norm": 0.03972519561648369, + "learning_rate": 5.24342672727019e-07, + "loss": 0.0013, + "step": 106050 + }, + { + "epoch": 1.7354168371103658, + "grad_norm": 0.008709500543773174, + "learning_rate": 5.237062916080782e-07, + "loss": 0.0005, + "step": 106060 + }, + { + "epoch": 1.7355804630614415, + "grad_norm": 0.0478438064455986, + "learning_rate": 5.230702755604216e-07, + "loss": 0.0007, + "step": 106070 + }, + { + "epoch": 1.7357440890125173, + "grad_norm": 0.028151944279670715, + "learning_rate": 5.224346246359225e-07, + "loss": 0.0005, + "step": 106080 + }, + { + "epoch": 1.7359077149635933, + "grad_norm": 0.050489380955696106, + "learning_rate": 5.2179933888642e-07, + "loss": 0.0007, + "step": 106090 + }, + { + "epoch": 1.736071340914669, + "grad_norm": 0.028123734518885612, + "learning_rate": 5.211644183637287e-07, + "loss": 0.0003, + "step": 106100 + }, + { + "epoch": 1.7362349668657449, + "grad_norm": 0.00917387381196022, + "learning_rate": 5.205298631196287e-07, + "loss": 0.0006, + "step": 106110 + }, + { + "epoch": 1.7363985928168209, + "grad_norm": 0.04121044650673866, + "learning_rate": 5.19895673205873e-07, + "loss": 0.0008, + "step": 106120 + }, + { + "epoch": 1.7365622187678966, + "grad_norm": 0.014919224195182323, + "learning_rate": 5.192618486741824e-07, + "loss": 0.0019, + "step": 106130 + }, + { + "epoch": 1.7367258447189724, + "grad_norm": 0.04376554116606712, + "learning_rate": 5.186283895762517e-07, + "loss": 0.0008, + "step": 106140 + }, + { + "epoch": 1.7368894706700484, + "grad_norm": 0.0658923089504242, + "learning_rate": 5.17995295963743e-07, + "loss": 0.0008, + "step": 106150 + }, + { + "epoch": 1.737053096621124, + "grad_norm": 0.009980090893805027, + "learning_rate": 5.173625678882871e-07, + "loss": 0.0012, + "step": 106160 + }, + { + "epoch": 1.7372167225722, + "grad_norm": 0.003842758946120739, + "learning_rate": 5.167302054014906e-07, + "loss": 0.0009, + "step": 106170 + }, + { + "epoch": 1.737380348523276, + "grad_norm": 0.04095356911420822, + "learning_rate": 5.160982085549238e-07, + "loss": 0.0006, + "step": 106180 + }, + { + "epoch": 1.7375439744743515, + "grad_norm": 0.14131717383861542, + "learning_rate": 5.154665774001327e-07, + "loss": 0.001, + "step": 106190 + }, + { + "epoch": 1.7377076004254275, + "grad_norm": 0.01662352867424488, + "learning_rate": 5.14835311988629e-07, + "loss": 0.0009, + "step": 106200 + }, + { + "epoch": 1.7378712263765033, + "grad_norm": 0.05708159878849983, + "learning_rate": 5.142044123718981e-07, + "loss": 0.0003, + "step": 106210 + }, + { + "epoch": 1.738034852327579, + "grad_norm": 0.12500154972076416, + "learning_rate": 5.135738786013927e-07, + "loss": 0.0017, + "step": 106220 + }, + { + "epoch": 1.738198478278655, + "grad_norm": 0.02598184160888195, + "learning_rate": 5.129437107285385e-07, + "loss": 0.0006, + "step": 106230 + }, + { + "epoch": 1.7383621042297308, + "grad_norm": 0.14675581455230713, + "learning_rate": 5.12313908804728e-07, + "loss": 0.002, + "step": 106240 + }, + { + "epoch": 1.7385257301808066, + "grad_norm": 0.19847004115581512, + "learning_rate": 5.116844728813275e-07, + "loss": 0.001, + "step": 106250 + }, + { + "epoch": 1.7386893561318826, + "grad_norm": 0.04709034785628319, + "learning_rate": 5.110554030096698e-07, + "loss": 0.001, + "step": 106260 + }, + { + "epoch": 1.7388529820829584, + "grad_norm": 0.07444377988576889, + "learning_rate": 5.104266992410616e-07, + "loss": 0.0009, + "step": 106270 + }, + { + "epoch": 1.7390166080340341, + "grad_norm": 0.02570188045501709, + "learning_rate": 5.09798361626776e-07, + "loss": 0.0008, + "step": 106280 + }, + { + "epoch": 1.7391802339851101, + "grad_norm": 0.10372823476791382, + "learning_rate": 5.091703902180606e-07, + "loss": 0.0007, + "step": 106290 + }, + { + "epoch": 1.739343859936186, + "grad_norm": 0.11849724501371384, + "learning_rate": 5.085427850661273e-07, + "loss": 0.0014, + "step": 106300 + }, + { + "epoch": 1.7395074858872617, + "grad_norm": 0.030570365488529205, + "learning_rate": 5.079155462221647e-07, + "loss": 0.0005, + "step": 106310 + }, + { + "epoch": 1.7396711118383377, + "grad_norm": 0.014777791686356068, + "learning_rate": 5.072886737373255e-07, + "loss": 0.0006, + "step": 106320 + }, + { + "epoch": 1.7398347377894134, + "grad_norm": 0.03123929537832737, + "learning_rate": 5.066621676627376e-07, + "loss": 0.0008, + "step": 106330 + }, + { + "epoch": 1.7399983637404892, + "grad_norm": 0.024036722257733345, + "learning_rate": 5.060360280494947e-07, + "loss": 0.0008, + "step": 106340 + }, + { + "epoch": 1.7401619896915652, + "grad_norm": 0.03935510665178299, + "learning_rate": 5.054102549486645e-07, + "loss": 0.0009, + "step": 106350 + }, + { + "epoch": 1.7403256156426408, + "grad_norm": 0.03374578058719635, + "learning_rate": 5.047848484112811e-07, + "loss": 0.0006, + "step": 106360 + }, + { + "epoch": 1.7404892415937168, + "grad_norm": 0.07095539569854736, + "learning_rate": 5.041598084883526e-07, + "loss": 0.0002, + "step": 106370 + }, + { + "epoch": 1.7406528675447928, + "grad_norm": 0.031577467918395996, + "learning_rate": 5.035351352308526e-07, + "loss": 0.0009, + "step": 106380 + }, + { + "epoch": 1.7408164934958683, + "grad_norm": 0.06595504283905029, + "learning_rate": 5.029108286897299e-07, + "loss": 0.0012, + "step": 106390 + }, + { + "epoch": 1.7409801194469443, + "grad_norm": 0.020363593474030495, + "learning_rate": 5.022868889158982e-07, + "loss": 0.0006, + "step": 106400 + }, + { + "epoch": 1.74114374539802, + "grad_norm": 0.0649137869477272, + "learning_rate": 5.016633159602469e-07, + "loss": 0.0011, + "step": 106410 + }, + { + "epoch": 1.7413073713490959, + "grad_norm": 0.03311983123421669, + "learning_rate": 5.010401098736295e-07, + "loss": 0.0013, + "step": 106420 + }, + { + "epoch": 1.7414709973001719, + "grad_norm": 0.022958895191550255, + "learning_rate": 5.004172707068749e-07, + "loss": 0.001, + "step": 106430 + }, + { + "epoch": 1.7416346232512476, + "grad_norm": 0.06251645088195801, + "learning_rate": 4.997947985107782e-07, + "loss": 0.0011, + "step": 106440 + }, + { + "epoch": 1.7417982492023234, + "grad_norm": 0.018483279272913933, + "learning_rate": 4.991726933361074e-07, + "loss": 0.0013, + "step": 106450 + }, + { + "epoch": 1.7419618751533994, + "grad_norm": 0.11677878350019455, + "learning_rate": 4.985509552335977e-07, + "loss": 0.0011, + "step": 106460 + }, + { + "epoch": 1.7421255011044752, + "grad_norm": 0.058015260845422745, + "learning_rate": 4.979295842539577e-07, + "loss": 0.0011, + "step": 106470 + }, + { + "epoch": 1.742289127055551, + "grad_norm": 0.09229929745197296, + "learning_rate": 4.973085804478634e-07, + "loss": 0.0009, + "step": 106480 + }, + { + "epoch": 1.742452753006627, + "grad_norm": 0.018100852146744728, + "learning_rate": 4.966879438659611e-07, + "loss": 0.0007, + "step": 106490 + }, + { + "epoch": 1.7426163789577027, + "grad_norm": 0.11842692643404007, + "learning_rate": 4.960676745588694e-07, + "loss": 0.0014, + "step": 106500 + }, + { + "epoch": 1.7427800049087785, + "grad_norm": 0.03797077387571335, + "learning_rate": 4.954477725771734e-07, + "loss": 0.0005, + "step": 106510 + }, + { + "epoch": 1.7429436308598545, + "grad_norm": 0.009182128123939037, + "learning_rate": 4.94828237971432e-07, + "loss": 0.0005, + "step": 106520 + }, + { + "epoch": 1.7431072568109303, + "grad_norm": 0.008334016427397728, + "learning_rate": 4.94209070792171e-07, + "loss": 0.0009, + "step": 106530 + }, + { + "epoch": 1.743270882762006, + "grad_norm": 0.036363158375024796, + "learning_rate": 4.935902710898888e-07, + "loss": 0.0011, + "step": 106540 + }, + { + "epoch": 1.743434508713082, + "grad_norm": 0.023352665826678276, + "learning_rate": 4.929718389150512e-07, + "loss": 0.0006, + "step": 106550 + }, + { + "epoch": 1.7435981346641576, + "grad_norm": 0.028563102707266808, + "learning_rate": 4.923537743180973e-07, + "loss": 0.0007, + "step": 106560 + }, + { + "epoch": 1.7437617606152336, + "grad_norm": 0.03631766512989998, + "learning_rate": 4.917360773494318e-07, + "loss": 0.0008, + "step": 106570 + }, + { + "epoch": 1.7439253865663096, + "grad_norm": 0.22517696022987366, + "learning_rate": 4.911187480594348e-07, + "loss": 0.0015, + "step": 106580 + }, + { + "epoch": 1.7440890125173851, + "grad_norm": 0.011045229621231556, + "learning_rate": 4.905017864984513e-07, + "loss": 0.0034, + "step": 106590 + }, + { + "epoch": 1.7442526384684611, + "grad_norm": 0.009721971116960049, + "learning_rate": 4.898851927167997e-07, + "loss": 0.0005, + "step": 106600 + }, + { + "epoch": 1.744416264419537, + "grad_norm": 0.06691710650920868, + "learning_rate": 4.892689667647666e-07, + "loss": 0.001, + "step": 106610 + }, + { + "epoch": 1.7445798903706127, + "grad_norm": 0.005570719949901104, + "learning_rate": 4.88653108692611e-07, + "loss": 0.0008, + "step": 106620 + }, + { + "epoch": 1.7447435163216887, + "grad_norm": 0.009508315473794937, + "learning_rate": 4.880376185505576e-07, + "loss": 0.0007, + "step": 106630 + }, + { + "epoch": 1.7449071422727644, + "grad_norm": 0.06472483277320862, + "learning_rate": 4.874224963888058e-07, + "loss": 0.0007, + "step": 106640 + }, + { + "epoch": 1.7450707682238402, + "grad_norm": 0.048835352063179016, + "learning_rate": 4.868077422575213e-07, + "loss": 0.0014, + "step": 106650 + }, + { + "epoch": 1.7452343941749162, + "grad_norm": 0.04501650854945183, + "learning_rate": 4.861933562068433e-07, + "loss": 0.001, + "step": 106660 + }, + { + "epoch": 1.745398020125992, + "grad_norm": 0.01128881610929966, + "learning_rate": 4.855793382868768e-07, + "loss": 0.0014, + "step": 106670 + }, + { + "epoch": 1.7455616460770678, + "grad_norm": 0.03785865753889084, + "learning_rate": 4.849656885477006e-07, + "loss": 0.0012, + "step": 106680 + }, + { + "epoch": 1.7457252720281438, + "grad_norm": 0.08019950985908508, + "learning_rate": 4.843524070393607e-07, + "loss": 0.0011, + "step": 106690 + }, + { + "epoch": 1.7458888979792195, + "grad_norm": 0.02137044258415699, + "learning_rate": 4.837394938118761e-07, + "loss": 0.0004, + "step": 106700 + }, + { + "epoch": 1.7460525239302953, + "grad_norm": 0.0762724056839943, + "learning_rate": 4.831269489152318e-07, + "loss": 0.001, + "step": 106710 + }, + { + "epoch": 1.7462161498813713, + "grad_norm": 0.038665663450956345, + "learning_rate": 4.825147723993867e-07, + "loss": 0.0007, + "step": 106720 + }, + { + "epoch": 1.746379775832447, + "grad_norm": 0.03914206847548485, + "learning_rate": 4.819029643142659e-07, + "loss": 0.001, + "step": 106730 + }, + { + "epoch": 1.7465434017835229, + "grad_norm": 0.03710709884762764, + "learning_rate": 4.812915247097683e-07, + "loss": 0.002, + "step": 106740 + }, + { + "epoch": 1.7467070277345988, + "grad_norm": 0.054355598986148834, + "learning_rate": 4.80680453635759e-07, + "loss": 0.0009, + "step": 106750 + }, + { + "epoch": 1.7468706536856744, + "grad_norm": 0.09634795039892197, + "learning_rate": 4.800697511420771e-07, + "loss": 0.0007, + "step": 106760 + }, + { + "epoch": 1.7470342796367504, + "grad_norm": 0.035127975046634674, + "learning_rate": 4.794594172785267e-07, + "loss": 0.0007, + "step": 106770 + }, + { + "epoch": 1.7471979055878264, + "grad_norm": 0.03766074404120445, + "learning_rate": 4.788494520948872e-07, + "loss": 0.001, + "step": 106780 + }, + { + "epoch": 1.747361531538902, + "grad_norm": 0.005453648045659065, + "learning_rate": 4.782398556409034e-07, + "loss": 0.001, + "step": 106790 + }, + { + "epoch": 1.747525157489978, + "grad_norm": 0.050250910222530365, + "learning_rate": 4.776306279662934e-07, + "loss": 0.0009, + "step": 106800 + }, + { + "epoch": 1.7476887834410537, + "grad_norm": 0.020884646102786064, + "learning_rate": 4.77021769120743e-07, + "loss": 0.0006, + "step": 106810 + }, + { + "epoch": 1.7478524093921295, + "grad_norm": 0.02867044508457184, + "learning_rate": 4.764132791539078e-07, + "loss": 0.0006, + "step": 106820 + }, + { + "epoch": 1.7480160353432055, + "grad_norm": 0.021044151857495308, + "learning_rate": 4.758051581154155e-07, + "loss": 0.0006, + "step": 106830 + }, + { + "epoch": 1.7481796612942813, + "grad_norm": 0.0913689136505127, + "learning_rate": 4.751974060548614e-07, + "loss": 0.0015, + "step": 106840 + }, + { + "epoch": 1.748343287245357, + "grad_norm": 0.051156170666217804, + "learning_rate": 4.745900230218126e-07, + "loss": 0.0011, + "step": 106850 + }, + { + "epoch": 1.748506913196433, + "grad_norm": 0.06647111475467682, + "learning_rate": 4.739830090658043e-07, + "loss": 0.0015, + "step": 106860 + }, + { + "epoch": 1.7486705391475088, + "grad_norm": 0.03345661237835884, + "learning_rate": 4.733763642363437e-07, + "loss": 0.0008, + "step": 106870 + }, + { + "epoch": 1.7488341650985846, + "grad_norm": 0.017066478729248047, + "learning_rate": 4.7277008858290506e-07, + "loss": 0.001, + "step": 106880 + }, + { + "epoch": 1.7489977910496606, + "grad_norm": 0.015604664571583271, + "learning_rate": 4.7216418215493555e-07, + "loss": 0.0005, + "step": 106890 + }, + { + "epoch": 1.7491614170007364, + "grad_norm": 0.026232518255710602, + "learning_rate": 4.7155864500185e-07, + "loss": 0.0002, + "step": 106900 + }, + { + "epoch": 1.7493250429518121, + "grad_norm": 0.03570883348584175, + "learning_rate": 4.709534771730345e-07, + "loss": 0.0013, + "step": 106910 + }, + { + "epoch": 1.7494886689028881, + "grad_norm": 0.037644702941179276, + "learning_rate": 4.7034867871784406e-07, + "loss": 0.0005, + "step": 106920 + }, + { + "epoch": 1.7496522948539637, + "grad_norm": 0.11534906923770905, + "learning_rate": 4.6974424968560475e-07, + "loss": 0.0009, + "step": 106930 + }, + { + "epoch": 1.7498159208050397, + "grad_norm": 0.06984814256429672, + "learning_rate": 4.691401901256104e-07, + "loss": 0.001, + "step": 106940 + }, + { + "epoch": 1.7499795467561157, + "grad_norm": 0.014700477011501789, + "learning_rate": 4.6853650008712723e-07, + "loss": 0.0011, + "step": 106950 + }, + { + "epoch": 1.7501431727071912, + "grad_norm": 0.04776924103498459, + "learning_rate": 4.679331796193892e-07, + "loss": 0.001, + "step": 106960 + }, + { + "epoch": 1.7503067986582672, + "grad_norm": 0.06454283744096756, + "learning_rate": 4.673302287716025e-07, + "loss": 0.0009, + "step": 106970 + }, + { + "epoch": 1.750470424609343, + "grad_norm": 0.03165587782859802, + "learning_rate": 4.6672764759294e-07, + "loss": 0.0009, + "step": 106980 + }, + { + "epoch": 1.7506340505604188, + "grad_norm": 0.025406716391444206, + "learning_rate": 4.661254361325479e-07, + "loss": 0.0003, + "step": 106990 + }, + { + "epoch": 1.7507976765114948, + "grad_norm": 0.02245040237903595, + "learning_rate": 4.6552359443953923e-07, + "loss": 0.001, + "step": 107000 + }, + { + "epoch": 1.7509613024625705, + "grad_norm": 0.003539147786796093, + "learning_rate": 4.6492212256299906e-07, + "loss": 0.0009, + "step": 107010 + }, + { + "epoch": 1.7511249284136463, + "grad_norm": 0.0016249327454715967, + "learning_rate": 4.6432102055197994e-07, + "loss": 0.0007, + "step": 107020 + }, + { + "epoch": 1.7512885543647223, + "grad_norm": 0.05109637230634689, + "learning_rate": 4.6372028845550754e-07, + "loss": 0.0007, + "step": 107030 + }, + { + "epoch": 1.751452180315798, + "grad_norm": 0.02515731193125248, + "learning_rate": 4.6311992632257384e-07, + "loss": 0.0009, + "step": 107040 + }, + { + "epoch": 1.7516158062668739, + "grad_norm": 0.0056059337221086025, + "learning_rate": 4.6251993420214416e-07, + "loss": 0.0004, + "step": 107050 + }, + { + "epoch": 1.7517794322179499, + "grad_norm": 0.001545050647109747, + "learning_rate": 4.619203121431498e-07, + "loss": 0.0014, + "step": 107060 + }, + { + "epoch": 1.7519430581690256, + "grad_norm": 0.017759894952178, + "learning_rate": 4.6132106019449564e-07, + "loss": 0.0007, + "step": 107070 + }, + { + "epoch": 1.7521066841201014, + "grad_norm": 0.03263549506664276, + "learning_rate": 4.607221784050531e-07, + "loss": 0.0011, + "step": 107080 + }, + { + "epoch": 1.7522703100711774, + "grad_norm": 0.12587085366249084, + "learning_rate": 4.6012366682366636e-07, + "loss": 0.0009, + "step": 107090 + }, + { + "epoch": 1.7524339360222532, + "grad_norm": 0.0250666756182909, + "learning_rate": 4.595255254991471e-07, + "loss": 0.0008, + "step": 107100 + }, + { + "epoch": 1.752597561973329, + "grad_norm": 0.020386964082717896, + "learning_rate": 4.589277544802784e-07, + "loss": 0.0006, + "step": 107110 + }, + { + "epoch": 1.752761187924405, + "grad_norm": 0.041317541152238846, + "learning_rate": 4.583303538158124e-07, + "loss": 0.0011, + "step": 107120 + }, + { + "epoch": 1.7529248138754805, + "grad_norm": 0.022674575448036194, + "learning_rate": 4.577333235544695e-07, + "loss": 0.0012, + "step": 107130 + }, + { + "epoch": 1.7530884398265565, + "grad_norm": 0.05014091730117798, + "learning_rate": 4.5713666374494313e-07, + "loss": 0.0033, + "step": 107140 + }, + { + "epoch": 1.7532520657776325, + "grad_norm": 0.024792658165097237, + "learning_rate": 4.565403744358937e-07, + "loss": 0.0011, + "step": 107150 + }, + { + "epoch": 1.753415691728708, + "grad_norm": 0.04676548019051552, + "learning_rate": 4.55944455675954e-07, + "loss": 0.0008, + "step": 107160 + }, + { + "epoch": 1.753579317679784, + "grad_norm": 0.1387605369091034, + "learning_rate": 4.5534890751372354e-07, + "loss": 0.0009, + "step": 107170 + }, + { + "epoch": 1.7537429436308598, + "grad_norm": 0.05712341144680977, + "learning_rate": 4.54753729997775e-07, + "loss": 0.0007, + "step": 107180 + }, + { + "epoch": 1.7539065695819356, + "grad_norm": 0.023189526051282883, + "learning_rate": 4.541589231766469e-07, + "loss": 0.0014, + "step": 107190 + }, + { + "epoch": 1.7540701955330116, + "grad_norm": 0.0023703663609921932, + "learning_rate": 4.5356448709885146e-07, + "loss": 0.0004, + "step": 107200 + }, + { + "epoch": 1.7542338214840874, + "grad_norm": 0.021885264664888382, + "learning_rate": 4.529704218128672e-07, + "loss": 0.0016, + "step": 107210 + }, + { + "epoch": 1.7543974474351631, + "grad_norm": 0.11768580973148346, + "learning_rate": 4.5237672736714645e-07, + "loss": 0.0016, + "step": 107220 + }, + { + "epoch": 1.7545610733862391, + "grad_norm": 0.05352596938610077, + "learning_rate": 4.51783403810106e-07, + "loss": 0.0012, + "step": 107230 + }, + { + "epoch": 1.754724699337315, + "grad_norm": 0.05097443610429764, + "learning_rate": 4.511904511901377e-07, + "loss": 0.001, + "step": 107240 + }, + { + "epoch": 1.7548883252883907, + "grad_norm": 0.0020308836828917265, + "learning_rate": 4.5059786955559906e-07, + "loss": 0.0006, + "step": 107250 + }, + { + "epoch": 1.7550519512394667, + "grad_norm": 0.031343474984169006, + "learning_rate": 4.500056589548207e-07, + "loss": 0.0016, + "step": 107260 + }, + { + "epoch": 1.7552155771905424, + "grad_norm": 0.014045115560293198, + "learning_rate": 4.4941381943609977e-07, + "loss": 0.0007, + "step": 107270 + }, + { + "epoch": 1.7553792031416182, + "grad_norm": 0.07368356734514236, + "learning_rate": 4.4882235104770577e-07, + "loss": 0.0011, + "step": 107280 + }, + { + "epoch": 1.7555428290926942, + "grad_norm": 0.05008498206734657, + "learning_rate": 4.482312538378758e-07, + "loss": 0.001, + "step": 107290 + }, + { + "epoch": 1.75570645504377, + "grad_norm": 0.07638438791036606, + "learning_rate": 4.4764052785481904e-07, + "loss": 0.0006, + "step": 107300 + }, + { + "epoch": 1.7558700809948458, + "grad_norm": 0.03753861412405968, + "learning_rate": 4.470501731467114e-07, + "loss": 0.0009, + "step": 107310 + }, + { + "epoch": 1.7560337069459218, + "grad_norm": 0.15613305568695068, + "learning_rate": 4.4646018976170215e-07, + "loss": 0.0014, + "step": 107320 + }, + { + "epoch": 1.7561973328969973, + "grad_norm": 0.024668697267770767, + "learning_rate": 4.458705777479061e-07, + "loss": 0.0007, + "step": 107330 + }, + { + "epoch": 1.7563609588480733, + "grad_norm": 0.10352221876382828, + "learning_rate": 4.452813371534126e-07, + "loss": 0.001, + "step": 107340 + }, + { + "epoch": 1.7565245847991493, + "grad_norm": 0.023088568821549416, + "learning_rate": 4.4469246802627544e-07, + "loss": 0.0004, + "step": 107350 + }, + { + "epoch": 1.7566882107502249, + "grad_norm": 0.036313802003860474, + "learning_rate": 4.441039704145234e-07, + "loss": 0.0006, + "step": 107360 + }, + { + "epoch": 1.7568518367013009, + "grad_norm": 0.008619832806289196, + "learning_rate": 4.4351584436615027e-07, + "loss": 0.0005, + "step": 107370 + }, + { + "epoch": 1.7570154626523766, + "grad_norm": 0.021129556000232697, + "learning_rate": 4.4292808992912273e-07, + "loss": 0.0014, + "step": 107380 + }, + { + "epoch": 1.7571790886034524, + "grad_norm": 0.012276737950742245, + "learning_rate": 4.423407071513752e-07, + "loss": 0.0006, + "step": 107390 + }, + { + "epoch": 1.7573427145545284, + "grad_norm": 0.10430112481117249, + "learning_rate": 4.417536960808139e-07, + "loss": 0.0006, + "step": 107400 + }, + { + "epoch": 1.7575063405056042, + "grad_norm": 0.18361087143421173, + "learning_rate": 4.4116705676531146e-07, + "loss": 0.001, + "step": 107410 + }, + { + "epoch": 1.75766996645668, + "grad_norm": 0.004871009849011898, + "learning_rate": 4.4058078925271476e-07, + "loss": 0.0007, + "step": 107420 + }, + { + "epoch": 1.757833592407756, + "grad_norm": 0.020443012937903404, + "learning_rate": 4.3999489359083494e-07, + "loss": 0.0004, + "step": 107430 + }, + { + "epoch": 1.7579972183588317, + "grad_norm": 0.2269364893436432, + "learning_rate": 4.394093698274582e-07, + "loss": 0.0013, + "step": 107440 + }, + { + "epoch": 1.7581608443099075, + "grad_norm": 0.009524994529783726, + "learning_rate": 4.3882421801033693e-07, + "loss": 0.0007, + "step": 107450 + }, + { + "epoch": 1.7583244702609835, + "grad_norm": 0.07494428753852844, + "learning_rate": 4.382394381871935e-07, + "loss": 0.0012, + "step": 107460 + }, + { + "epoch": 1.7584880962120593, + "grad_norm": 0.03661913424730301, + "learning_rate": 4.3765503040572146e-07, + "loss": 0.0003, + "step": 107470 + }, + { + "epoch": 1.758651722163135, + "grad_norm": 0.08206179738044739, + "learning_rate": 4.370709947135815e-07, + "loss": 0.0018, + "step": 107480 + }, + { + "epoch": 1.758815348114211, + "grad_norm": 0.001558939111419022, + "learning_rate": 4.3648733115840836e-07, + "loss": 0.0007, + "step": 107490 + }, + { + "epoch": 1.7589789740652868, + "grad_norm": 0.08039776980876923, + "learning_rate": 4.359040397878006e-07, + "loss": 0.0015, + "step": 107500 + }, + { + "epoch": 1.7591426000163626, + "grad_norm": 0.03547074645757675, + "learning_rate": 4.353211206493324e-07, + "loss": 0.0005, + "step": 107510 + }, + { + "epoch": 1.7593062259674386, + "grad_norm": 0.1029684916138649, + "learning_rate": 4.3473857379054183e-07, + "loss": 0.001, + "step": 107520 + }, + { + "epoch": 1.7594698519185141, + "grad_norm": 0.00187123310752213, + "learning_rate": 4.3415639925894203e-07, + "loss": 0.0009, + "step": 107530 + }, + { + "epoch": 1.7596334778695901, + "grad_norm": 0.11042116582393646, + "learning_rate": 4.335745971020111e-07, + "loss": 0.001, + "step": 107540 + }, + { + "epoch": 1.7597971038206661, + "grad_norm": 0.0460456945002079, + "learning_rate": 4.3299316736719997e-07, + "loss": 0.0012, + "step": 107550 + }, + { + "epoch": 1.7599607297717417, + "grad_norm": 0.03791511431336403, + "learning_rate": 4.324121101019274e-07, + "loss": 0.001, + "step": 107560 + }, + { + "epoch": 1.7601243557228177, + "grad_norm": 0.029190704226493835, + "learning_rate": 4.318314253535838e-07, + "loss": 0.0008, + "step": 107570 + }, + { + "epoch": 1.7602879816738934, + "grad_norm": 0.0013233766658231616, + "learning_rate": 4.312511131695263e-07, + "loss": 0.0009, + "step": 107580 + }, + { + "epoch": 1.7604516076249692, + "grad_norm": 0.058730173856019974, + "learning_rate": 4.306711735970842e-07, + "loss": 0.0007, + "step": 107590 + }, + { + "epoch": 1.7606152335760452, + "grad_norm": 0.1165233626961708, + "learning_rate": 4.300916066835542e-07, + "loss": 0.0013, + "step": 107600 + }, + { + "epoch": 1.760778859527121, + "grad_norm": 0.05455328896641731, + "learning_rate": 4.295124124762057e-07, + "loss": 0.0007, + "step": 107610 + }, + { + "epoch": 1.7609424854781968, + "grad_norm": 0.03936642408370972, + "learning_rate": 4.289335910222736e-07, + "loss": 0.0008, + "step": 107620 + }, + { + "epoch": 1.7611061114292728, + "grad_norm": 0.0059580253437161446, + "learning_rate": 4.2835514236896635e-07, + "loss": 0.0007, + "step": 107630 + }, + { + "epoch": 1.7612697373803485, + "grad_norm": 0.09812729805707932, + "learning_rate": 4.2777706656345895e-07, + "loss": 0.0009, + "step": 107640 + }, + { + "epoch": 1.7614333633314243, + "grad_norm": 0.17266543209552765, + "learning_rate": 4.271993636528993e-07, + "loss": 0.0018, + "step": 107650 + }, + { + "epoch": 1.7615969892825003, + "grad_norm": 0.06077754870057106, + "learning_rate": 4.2662203368440014e-07, + "loss": 0.0011, + "step": 107660 + }, + { + "epoch": 1.761760615233576, + "grad_norm": 0.1097792237997055, + "learning_rate": 4.260450767050489e-07, + "loss": 0.0011, + "step": 107670 + }, + { + "epoch": 1.7619242411846519, + "grad_norm": 0.029920652508735657, + "learning_rate": 4.254684927618985e-07, + "loss": 0.0013, + "step": 107680 + }, + { + "epoch": 1.7620878671357278, + "grad_norm": 0.059843726456165314, + "learning_rate": 4.2489228190197463e-07, + "loss": 0.0005, + "step": 107690 + }, + { + "epoch": 1.7622514930868036, + "grad_norm": 0.03530824929475784, + "learning_rate": 4.2431644417226913e-07, + "loss": 0.0008, + "step": 107700 + }, + { + "epoch": 1.7624151190378794, + "grad_norm": 0.03113321028649807, + "learning_rate": 4.2374097961974834e-07, + "loss": 0.0009, + "step": 107710 + }, + { + "epoch": 1.7625787449889554, + "grad_norm": 0.20169009268283844, + "learning_rate": 4.2316588829134197e-07, + "loss": 0.0008, + "step": 107720 + }, + { + "epoch": 1.762742370940031, + "grad_norm": 0.049306392669677734, + "learning_rate": 4.2259117023395525e-07, + "loss": 0.0012, + "step": 107730 + }, + { + "epoch": 1.762905996891107, + "grad_norm": 0.005846387706696987, + "learning_rate": 4.220168254944579e-07, + "loss": 0.0011, + "step": 107740 + }, + { + "epoch": 1.7630696228421827, + "grad_norm": 0.02031685598194599, + "learning_rate": 4.2144285411969356e-07, + "loss": 0.0008, + "step": 107750 + }, + { + "epoch": 1.7632332487932585, + "grad_norm": 0.061661701649427414, + "learning_rate": 4.2086925615647143e-07, + "loss": 0.0005, + "step": 107760 + }, + { + "epoch": 1.7633968747443345, + "grad_norm": 0.015344727784395218, + "learning_rate": 4.202960316515747e-07, + "loss": 0.0008, + "step": 107770 + }, + { + "epoch": 1.7635605006954103, + "grad_norm": 0.042269229888916016, + "learning_rate": 4.1972318065175156e-07, + "loss": 0.0007, + "step": 107780 + }, + { + "epoch": 1.763724126646486, + "grad_norm": 0.041248686611652374, + "learning_rate": 4.1915070320372175e-07, + "loss": 0.0011, + "step": 107790 + }, + { + "epoch": 1.763887752597562, + "grad_norm": 0.022548416629433632, + "learning_rate": 4.185785993541758e-07, + "loss": 0.0005, + "step": 107800 + }, + { + "epoch": 1.7640513785486378, + "grad_norm": 0.04156475141644478, + "learning_rate": 4.180068691497713e-07, + "loss": 0.0008, + "step": 107810 + }, + { + "epoch": 1.7642150044997136, + "grad_norm": 0.023491984233260155, + "learning_rate": 4.174355126371382e-07, + "loss": 0.001, + "step": 107820 + }, + { + "epoch": 1.7643786304507896, + "grad_norm": 0.04882371053099632, + "learning_rate": 4.1686452986287205e-07, + "loss": 0.0006, + "step": 107830 + }, + { + "epoch": 1.7645422564018653, + "grad_norm": 0.0026152748614549637, + "learning_rate": 4.162939208735428e-07, + "loss": 0.0023, + "step": 107840 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.12709873914718628, + "learning_rate": 4.15723685715686e-07, + "loss": 0.0022, + "step": 107850 + }, + { + "epoch": 1.7648695083040171, + "grad_norm": 0.042144399136304855, + "learning_rate": 4.151538244358083e-07, + "loss": 0.0021, + "step": 107860 + }, + { + "epoch": 1.765033134255093, + "grad_norm": 0.0195899847894907, + "learning_rate": 4.145843370803854e-07, + "loss": 0.0004, + "step": 107870 + }, + { + "epoch": 1.7651967602061687, + "grad_norm": 0.03930019959807396, + "learning_rate": 4.140152236958639e-07, + "loss": 0.0007, + "step": 107880 + }, + { + "epoch": 1.7653603861572447, + "grad_norm": 0.05209214612841606, + "learning_rate": 4.1344648432865677e-07, + "loss": 0.0009, + "step": 107890 + }, + { + "epoch": 1.7655240121083202, + "grad_norm": 0.08244826644659042, + "learning_rate": 4.128781190251502e-07, + "loss": 0.0009, + "step": 107900 + }, + { + "epoch": 1.7656876380593962, + "grad_norm": 0.07820073515176773, + "learning_rate": 4.123101278316971e-07, + "loss": 0.001, + "step": 107910 + }, + { + "epoch": 1.7658512640104722, + "grad_norm": 0.06501936167478561, + "learning_rate": 4.1174251079462215e-07, + "loss": 0.002, + "step": 107920 + }, + { + "epoch": 1.7660148899615478, + "grad_norm": 0.051150836050510406, + "learning_rate": 4.1117526796021655e-07, + "loss": 0.001, + "step": 107930 + }, + { + "epoch": 1.7661785159126238, + "grad_norm": 0.07909323275089264, + "learning_rate": 4.1060839937474396e-07, + "loss": 0.0012, + "step": 107940 + }, + { + "epoch": 1.7663421418636995, + "grad_norm": 0.0015398211544379592, + "learning_rate": 4.1004190508443564e-07, + "loss": 0.0011, + "step": 107950 + }, + { + "epoch": 1.7665057678147753, + "grad_norm": 0.012376747094094753, + "learning_rate": 4.0947578513549356e-07, + "loss": 0.0007, + "step": 107960 + }, + { + "epoch": 1.7666693937658513, + "grad_norm": 0.002680887235328555, + "learning_rate": 4.089100395740875e-07, + "loss": 0.0033, + "step": 107970 + }, + { + "epoch": 1.766833019716927, + "grad_norm": 0.1572302281856537, + "learning_rate": 4.083446684463593e-07, + "loss": 0.0013, + "step": 107980 + }, + { + "epoch": 1.7669966456680029, + "grad_norm": 0.02062288112938404, + "learning_rate": 4.077796717984167e-07, + "loss": 0.0008, + "step": 107990 + }, + { + "epoch": 1.7671602716190788, + "grad_norm": 0.028081782162189484, + "learning_rate": 4.072150496763411e-07, + "loss": 0.0005, + "step": 108000 + }, + { + "epoch": 1.7673238975701546, + "grad_norm": 0.05993391573429108, + "learning_rate": 4.066508021261789e-07, + "loss": 0.0016, + "step": 108010 + }, + { + "epoch": 1.7674875235212304, + "grad_norm": 0.07842147350311279, + "learning_rate": 4.060869291939501e-07, + "loss": 0.0007, + "step": 108020 + }, + { + "epoch": 1.7676511494723064, + "grad_norm": 0.0626661628484726, + "learning_rate": 4.0552343092564115e-07, + "loss": 0.0012, + "step": 108030 + }, + { + "epoch": 1.7678147754233822, + "grad_norm": 0.06272612512111664, + "learning_rate": 4.049603073672098e-07, + "loss": 0.0012, + "step": 108040 + }, + { + "epoch": 1.767978401374458, + "grad_norm": 0.03335786983370781, + "learning_rate": 4.0439755856458153e-07, + "loss": 0.0007, + "step": 108050 + }, + { + "epoch": 1.768142027325534, + "grad_norm": 0.004727168940007687, + "learning_rate": 4.0383518456365346e-07, + "loss": 0.0006, + "step": 108060 + }, + { + "epoch": 1.7683056532766097, + "grad_norm": 0.035718612372875214, + "learning_rate": 4.0327318541028947e-07, + "loss": 0.0008, + "step": 108070 + }, + { + "epoch": 1.7684692792276855, + "grad_norm": 0.03150029107928276, + "learning_rate": 4.0271156115032625e-07, + "loss": 0.0009, + "step": 108080 + }, + { + "epoch": 1.7686329051787615, + "grad_norm": 0.016425782814621925, + "learning_rate": 4.0215031182956665e-07, + "loss": 0.0004, + "step": 108090 + }, + { + "epoch": 1.768796531129837, + "grad_norm": 0.09022302180528641, + "learning_rate": 4.015894374937829e-07, + "loss": 0.0007, + "step": 108100 + }, + { + "epoch": 1.768960157080913, + "grad_norm": 0.048652973026037216, + "learning_rate": 4.010289381887211e-07, + "loss": 0.0011, + "step": 108110 + }, + { + "epoch": 1.769123783031989, + "grad_norm": 0.023149846121668816, + "learning_rate": 4.0046881396009096e-07, + "loss": 0.0009, + "step": 108120 + }, + { + "epoch": 1.7692874089830646, + "grad_norm": 0.03632233291864395, + "learning_rate": 3.9990906485357583e-07, + "loss": 0.0011, + "step": 108130 + }, + { + "epoch": 1.7694510349341406, + "grad_norm": 0.012505215592682362, + "learning_rate": 3.9934969091482647e-07, + "loss": 0.0003, + "step": 108140 + }, + { + "epoch": 1.7696146608852164, + "grad_norm": 0.09661774337291718, + "learning_rate": 3.9879069218946354e-07, + "loss": 0.0011, + "step": 108150 + }, + { + "epoch": 1.7697782868362921, + "grad_norm": 0.05754555016756058, + "learning_rate": 3.9823206872307674e-07, + "loss": 0.0013, + "step": 108160 + }, + { + "epoch": 1.7699419127873681, + "grad_norm": 0.10947311669588089, + "learning_rate": 3.9767382056122685e-07, + "loss": 0.001, + "step": 108170 + }, + { + "epoch": 1.770105538738444, + "grad_norm": 0.0036237684544175863, + "learning_rate": 3.9711594774944015e-07, + "loss": 0.0008, + "step": 108180 + }, + { + "epoch": 1.7702691646895197, + "grad_norm": 0.06420178711414337, + "learning_rate": 3.9655845033321814e-07, + "loss": 0.0006, + "step": 108190 + }, + { + "epoch": 1.7704327906405957, + "grad_norm": 0.023750467225909233, + "learning_rate": 3.9600132835802494e-07, + "loss": 0.0007, + "step": 108200 + }, + { + "epoch": 1.7705964165916714, + "grad_norm": 0.054547131061553955, + "learning_rate": 3.954445818693009e-07, + "loss": 0.0006, + "step": 108210 + }, + { + "epoch": 1.7707600425427472, + "grad_norm": 0.020808063447475433, + "learning_rate": 3.948882109124491e-07, + "loss": 0.0007, + "step": 108220 + }, + { + "epoch": 1.7709236684938232, + "grad_norm": 0.10417719930410385, + "learning_rate": 3.943322155328483e-07, + "loss": 0.0016, + "step": 108230 + }, + { + "epoch": 1.771087294444899, + "grad_norm": 0.005488272290676832, + "learning_rate": 3.9377659577584105e-07, + "loss": 0.0011, + "step": 108240 + }, + { + "epoch": 1.7712509203959748, + "grad_norm": 0.10023872554302216, + "learning_rate": 3.9322135168674393e-07, + "loss": 0.0014, + "step": 108250 + }, + { + "epoch": 1.7714145463470508, + "grad_norm": 0.027351336553692818, + "learning_rate": 3.9266648331083847e-07, + "loss": 0.0006, + "step": 108260 + }, + { + "epoch": 1.7715781722981265, + "grad_norm": 0.015320983715355396, + "learning_rate": 3.921119906933807e-07, + "loss": 0.0008, + "step": 108270 + }, + { + "epoch": 1.7717417982492023, + "grad_norm": 0.02882968820631504, + "learning_rate": 3.9155787387959e-07, + "loss": 0.0006, + "step": 108280 + }, + { + "epoch": 1.7719054242002783, + "grad_norm": 0.026351315900683403, + "learning_rate": 3.9100413291466133e-07, + "loss": 0.0014, + "step": 108290 + }, + { + "epoch": 1.7720690501513539, + "grad_norm": 0.01935686729848385, + "learning_rate": 3.90450767843753e-07, + "loss": 0.0011, + "step": 108300 + }, + { + "epoch": 1.7722326761024298, + "grad_norm": 0.04308044910430908, + "learning_rate": 3.8989777871199843e-07, + "loss": 0.0011, + "step": 108310 + }, + { + "epoch": 1.7723963020535058, + "grad_norm": 0.009379858151078224, + "learning_rate": 3.8934516556449474e-07, + "loss": 0.0011, + "step": 108320 + }, + { + "epoch": 1.7725599280045814, + "grad_norm": 0.013123693875968456, + "learning_rate": 3.887929284463138e-07, + "loss": 0.0007, + "step": 108330 + }, + { + "epoch": 1.7727235539556574, + "grad_norm": 0.06673294305801392, + "learning_rate": 3.882410674024917e-07, + "loss": 0.0017, + "step": 108340 + }, + { + "epoch": 1.7728871799067332, + "grad_norm": 0.03813819959759712, + "learning_rate": 3.8768958247803857e-07, + "loss": 0.0009, + "step": 108350 + }, + { + "epoch": 1.773050805857809, + "grad_norm": 0.05809563398361206, + "learning_rate": 3.8713847371792954e-07, + "loss": 0.001, + "step": 108360 + }, + { + "epoch": 1.773214431808885, + "grad_norm": 0.012311948463320732, + "learning_rate": 3.865877411671137e-07, + "loss": 0.0005, + "step": 108370 + }, + { + "epoch": 1.7733780577599607, + "grad_norm": 0.039004478603601456, + "learning_rate": 3.8603738487050457e-07, + "loss": 0.0016, + "step": 108380 + }, + { + "epoch": 1.7735416837110365, + "grad_norm": 0.03414362296462059, + "learning_rate": 3.8548740487298895e-07, + "loss": 0.0006, + "step": 108390 + }, + { + "epoch": 1.7737053096621125, + "grad_norm": 0.038908738642930984, + "learning_rate": 3.8493780121941935e-07, + "loss": 0.0009, + "step": 108400 + }, + { + "epoch": 1.7738689356131883, + "grad_norm": 0.005892162211239338, + "learning_rate": 3.843885739546227e-07, + "loss": 0.0027, + "step": 108410 + }, + { + "epoch": 1.774032561564264, + "grad_norm": 0.026326606050133705, + "learning_rate": 3.838397231233898e-07, + "loss": 0.0009, + "step": 108420 + }, + { + "epoch": 1.77419618751534, + "grad_norm": 0.03375169634819031, + "learning_rate": 3.832912487704826e-07, + "loss": 0.0008, + "step": 108430 + }, + { + "epoch": 1.7743598134664158, + "grad_norm": 0.08160717040300369, + "learning_rate": 3.827431509406343e-07, + "loss": 0.0011, + "step": 108440 + }, + { + "epoch": 1.7745234394174916, + "grad_norm": 0.04383422061800957, + "learning_rate": 3.821954296785452e-07, + "loss": 0.0007, + "step": 108450 + }, + { + "epoch": 1.7746870653685676, + "grad_norm": 0.04501236602663994, + "learning_rate": 3.816480850288862e-07, + "loss": 0.0009, + "step": 108460 + }, + { + "epoch": 1.7748506913196433, + "grad_norm": 0.060540322214365005, + "learning_rate": 3.811011170362955e-07, + "loss": 0.0009, + "step": 108470 + }, + { + "epoch": 1.7750143172707191, + "grad_norm": 0.006911097094416618, + "learning_rate": 3.805545257453841e-07, + "loss": 0.0007, + "step": 108480 + }, + { + "epoch": 1.7751779432217951, + "grad_norm": 0.042887214571237564, + "learning_rate": 3.8000831120072747e-07, + "loss": 0.0008, + "step": 108490 + }, + { + "epoch": 1.7753415691728707, + "grad_norm": 0.0329434871673584, + "learning_rate": 3.794624734468755e-07, + "loss": 0.0003, + "step": 108500 + }, + { + "epoch": 1.7755051951239467, + "grad_norm": 0.02230953238904476, + "learning_rate": 3.789170125283431e-07, + "loss": 0.0024, + "step": 108510 + }, + { + "epoch": 1.7756688210750227, + "grad_norm": 0.03865766152739525, + "learning_rate": 3.7837192848961757e-07, + "loss": 0.0014, + "step": 108520 + }, + { + "epoch": 1.7758324470260982, + "grad_norm": 0.11782630532979965, + "learning_rate": 3.7782722137515213e-07, + "loss": 0.0013, + "step": 108530 + }, + { + "epoch": 1.7759960729771742, + "grad_norm": 0.10710638761520386, + "learning_rate": 3.772828912293741e-07, + "loss": 0.0008, + "step": 108540 + }, + { + "epoch": 1.77615969892825, + "grad_norm": 0.006776588037610054, + "learning_rate": 3.767389380966746e-07, + "loss": 0.0003, + "step": 108550 + }, + { + "epoch": 1.7763233248793258, + "grad_norm": 0.1646786779165268, + "learning_rate": 3.7619536202141813e-07, + "loss": 0.0009, + "step": 108560 + }, + { + "epoch": 1.7764869508304018, + "grad_norm": 0.004421981517225504, + "learning_rate": 3.7565216304793594e-07, + "loss": 0.0007, + "step": 108570 + }, + { + "epoch": 1.7766505767814775, + "grad_norm": 0.04801374301314354, + "learning_rate": 3.7510934122053034e-07, + "loss": 0.0008, + "step": 108580 + }, + { + "epoch": 1.7768142027325533, + "grad_norm": 0.003219426842406392, + "learning_rate": 3.745668965834709e-07, + "loss": 0.001, + "step": 108590 + }, + { + "epoch": 1.7769778286836293, + "grad_norm": 0.03201757371425629, + "learning_rate": 3.7402482918099946e-07, + "loss": 0.0007, + "step": 108600 + }, + { + "epoch": 1.777141454634705, + "grad_norm": 0.034481413662433624, + "learning_rate": 3.7348313905732294e-07, + "loss": 0.0004, + "step": 108610 + }, + { + "epoch": 1.7773050805857808, + "grad_norm": 0.10792404413223267, + "learning_rate": 3.7294182625662145e-07, + "loss": 0.001, + "step": 108620 + }, + { + "epoch": 1.7774687065368568, + "grad_norm": 0.054812539368867874, + "learning_rate": 3.7240089082304145e-07, + "loss": 0.0005, + "step": 108630 + }, + { + "epoch": 1.7776323324879326, + "grad_norm": 0.02432039938867092, + "learning_rate": 3.7186033280070145e-07, + "loss": 0.0006, + "step": 108640 + }, + { + "epoch": 1.7777959584390084, + "grad_norm": 0.036705147475004196, + "learning_rate": 3.713201522336851e-07, + "loss": 0.0009, + "step": 108650 + }, + { + "epoch": 1.7779595843900844, + "grad_norm": 0.041651029139757156, + "learning_rate": 3.7078034916605e-07, + "loss": 0.0006, + "step": 108660 + }, + { + "epoch": 1.77812321034116, + "grad_norm": 0.032106123864650726, + "learning_rate": 3.702409236418192e-07, + "loss": 0.0004, + "step": 108670 + }, + { + "epoch": 1.778286836292236, + "grad_norm": 0.041012149304151535, + "learning_rate": 3.6970187570498696e-07, + "loss": 0.0008, + "step": 108680 + }, + { + "epoch": 1.778450462243312, + "grad_norm": 0.015991734340786934, + "learning_rate": 3.691632053995159e-07, + "loss": 0.0007, + "step": 108690 + }, + { + "epoch": 1.7786140881943875, + "grad_norm": 0.06304717808961868, + "learning_rate": 3.6862491276933864e-07, + "loss": 0.0007, + "step": 108700 + }, + { + "epoch": 1.7787777141454635, + "grad_norm": 0.016339173540472984, + "learning_rate": 3.680869978583557e-07, + "loss": 0.0012, + "step": 108710 + }, + { + "epoch": 1.7789413400965393, + "grad_norm": 0.01635793410241604, + "learning_rate": 3.6754946071043907e-07, + "loss": 0.001, + "step": 108720 + }, + { + "epoch": 1.779104966047615, + "grad_norm": 0.032469525933265686, + "learning_rate": 3.670123013694271e-07, + "loss": 0.0009, + "step": 108730 + }, + { + "epoch": 1.779268591998691, + "grad_norm": 0.04310190677642822, + "learning_rate": 3.6647551987912865e-07, + "loss": 0.0008, + "step": 108740 + }, + { + "epoch": 1.7794322179497668, + "grad_norm": 0.026811139658093452, + "learning_rate": 3.6593911628332254e-07, + "loss": 0.0006, + "step": 108750 + }, + { + "epoch": 1.7795958439008426, + "grad_norm": 0.00654851458966732, + "learning_rate": 3.65403090625755e-07, + "loss": 0.0024, + "step": 108760 + }, + { + "epoch": 1.7797594698519186, + "grad_norm": 0.029045140370726585, + "learning_rate": 3.648674429501442e-07, + "loss": 0.0007, + "step": 108770 + }, + { + "epoch": 1.7799230958029943, + "grad_norm": 0.0021667282562702894, + "learning_rate": 3.643321733001737e-07, + "loss": 0.0009, + "step": 108780 + }, + { + "epoch": 1.7800867217540701, + "grad_norm": 0.06168778985738754, + "learning_rate": 3.637972817195001e-07, + "loss": 0.0008, + "step": 108790 + }, + { + "epoch": 1.7802503477051461, + "grad_norm": 0.03460283949971199, + "learning_rate": 3.6326276825174536e-07, + "loss": 0.0007, + "step": 108800 + }, + { + "epoch": 1.780413973656222, + "grad_norm": 0.027735942974686623, + "learning_rate": 3.6272863294050497e-07, + "loss": 0.0009, + "step": 108810 + }, + { + "epoch": 1.7805775996072977, + "grad_norm": 0.00878941174596548, + "learning_rate": 3.6219487582933863e-07, + "loss": 0.0005, + "step": 108820 + }, + { + "epoch": 1.7807412255583737, + "grad_norm": 0.07693719863891602, + "learning_rate": 3.6166149696178045e-07, + "loss": 0.0008, + "step": 108830 + }, + { + "epoch": 1.7809048515094494, + "grad_norm": 0.07743791490793228, + "learning_rate": 3.611284963813283e-07, + "loss": 0.0007, + "step": 108840 + }, + { + "epoch": 1.7810684774605252, + "grad_norm": 0.08219505846500397, + "learning_rate": 3.605958741314541e-07, + "loss": 0.0012, + "step": 108850 + }, + { + "epoch": 1.7812321034116012, + "grad_norm": 0.05495278909802437, + "learning_rate": 3.600636302555949e-07, + "loss": 0.0005, + "step": 108860 + }, + { + "epoch": 1.7813957293626768, + "grad_norm": 0.01769072376191616, + "learning_rate": 3.595317647971602e-07, + "loss": 0.0008, + "step": 108870 + }, + { + "epoch": 1.7815593553137528, + "grad_norm": 0.08900812268257141, + "learning_rate": 3.5900027779952607e-07, + "loss": 0.001, + "step": 108880 + }, + { + "epoch": 1.7817229812648288, + "grad_norm": 0.04917926713824272, + "learning_rate": 3.584691693060399e-07, + "loss": 0.001, + "step": 108890 + }, + { + "epoch": 1.7818866072159043, + "grad_norm": 0.04451920464634895, + "learning_rate": 3.5793843936001604e-07, + "loss": 0.0033, + "step": 108900 + }, + { + "epoch": 1.7820502331669803, + "grad_norm": 0.039458729326725006, + "learning_rate": 3.574080880047398e-07, + "loss": 0.0007, + "step": 108910 + }, + { + "epoch": 1.782213859118056, + "grad_norm": 0.0532478466629982, + "learning_rate": 3.5687811528346386e-07, + "loss": 0.0009, + "step": 108920 + }, + { + "epoch": 1.7823774850691319, + "grad_norm": 0.030456246808171272, + "learning_rate": 3.5634852123941187e-07, + "loss": 0.0008, + "step": 108930 + }, + { + "epoch": 1.7825411110202078, + "grad_norm": 0.03679100424051285, + "learning_rate": 3.5581930591577495e-07, + "loss": 0.0012, + "step": 108940 + }, + { + "epoch": 1.7827047369712836, + "grad_norm": 0.09483564645051956, + "learning_rate": 3.552904693557158e-07, + "loss": 0.0012, + "step": 108950 + }, + { + "epoch": 1.7828683629223594, + "grad_norm": 0.07398977875709534, + "learning_rate": 3.54762011602362e-07, + "loss": 0.001, + "step": 108960 + }, + { + "epoch": 1.7830319888734354, + "grad_norm": 0.04821565002202988, + "learning_rate": 3.5423393269881477e-07, + "loss": 0.0007, + "step": 108970 + }, + { + "epoch": 1.7831956148245112, + "grad_norm": 0.04015131667256355, + "learning_rate": 3.537062326881413e-07, + "loss": 0.0006, + "step": 108980 + }, + { + "epoch": 1.783359240775587, + "grad_norm": 0.02346006967127323, + "learning_rate": 3.531789116133799e-07, + "loss": 0.0011, + "step": 108990 + }, + { + "epoch": 1.783522866726663, + "grad_norm": 0.04381433129310608, + "learning_rate": 3.526519695175362e-07, + "loss": 0.0012, + "step": 109000 + }, + { + "epoch": 1.7836864926777387, + "grad_norm": 0.0223515834659338, + "learning_rate": 3.5212540644358695e-07, + "loss": 0.0004, + "step": 109010 + }, + { + "epoch": 1.7838501186288145, + "grad_norm": 0.04557507485151291, + "learning_rate": 3.51599222434475e-07, + "loss": 0.0039, + "step": 109020 + }, + { + "epoch": 1.7840137445798905, + "grad_norm": 0.010546478442847729, + "learning_rate": 3.5107341753311596e-07, + "loss": 0.0005, + "step": 109030 + }, + { + "epoch": 1.7841773705309663, + "grad_norm": 0.08547072112560272, + "learning_rate": 3.505479917823917e-07, + "loss": 0.0013, + "step": 109040 + }, + { + "epoch": 1.784340996482042, + "grad_norm": 0.16562238335609436, + "learning_rate": 3.500229452251547e-07, + "loss": 0.0011, + "step": 109050 + }, + { + "epoch": 1.784504622433118, + "grad_norm": 0.023884305730462074, + "learning_rate": 3.494982779042261e-07, + "loss": 0.0005, + "step": 109060 + }, + { + "epoch": 1.7846682483841936, + "grad_norm": 0.01914985477924347, + "learning_rate": 3.489739898623945e-07, + "loss": 0.0003, + "step": 109070 + }, + { + "epoch": 1.7848318743352696, + "grad_norm": 0.12570561468601227, + "learning_rate": 3.484500811424213e-07, + "loss": 0.001, + "step": 109080 + }, + { + "epoch": 1.7849955002863456, + "grad_norm": 0.05569448322057724, + "learning_rate": 3.4792655178703226e-07, + "loss": 0.0008, + "step": 109090 + }, + { + "epoch": 1.7851591262374211, + "grad_norm": 0.01894867606461048, + "learning_rate": 3.4740340183892716e-07, + "loss": 0.0006, + "step": 109100 + }, + { + "epoch": 1.7853227521884971, + "grad_norm": 0.2176194041967392, + "learning_rate": 3.468806313407702e-07, + "loss": 0.0014, + "step": 109110 + }, + { + "epoch": 1.785486378139573, + "grad_norm": 0.07903044670820236, + "learning_rate": 3.4635824033519837e-07, + "loss": 0.0013, + "step": 109120 + }, + { + "epoch": 1.7856500040906487, + "grad_norm": 0.031136836856603622, + "learning_rate": 3.458362288648148e-07, + "loss": 0.0005, + "step": 109130 + }, + { + "epoch": 1.7858136300417247, + "grad_norm": 0.23112307488918304, + "learning_rate": 3.4531459697219383e-07, + "loss": 0.0009, + "step": 109140 + }, + { + "epoch": 1.7859772559928004, + "grad_norm": 0.03424979746341705, + "learning_rate": 3.4479334469987745e-07, + "loss": 0.0007, + "step": 109150 + }, + { + "epoch": 1.7861408819438762, + "grad_norm": 0.02788633108139038, + "learning_rate": 3.442724720903784e-07, + "loss": 0.0012, + "step": 109160 + }, + { + "epoch": 1.7863045078949522, + "grad_norm": 0.04473122954368591, + "learning_rate": 3.4375197918617485e-07, + "loss": 0.0007, + "step": 109170 + }, + { + "epoch": 1.786468133846028, + "grad_norm": 0.044374048709869385, + "learning_rate": 3.432318660297196e-07, + "loss": 0.0007, + "step": 109180 + }, + { + "epoch": 1.7866317597971038, + "grad_norm": 0.07242981344461441, + "learning_rate": 3.4271213266342806e-07, + "loss": 0.0014, + "step": 109190 + }, + { + "epoch": 1.7867953857481798, + "grad_norm": 0.039236944168806076, + "learning_rate": 3.4219277912969085e-07, + "loss": 0.0008, + "step": 109200 + }, + { + "epoch": 1.7869590116992555, + "grad_norm": 0.028940899297595024, + "learning_rate": 3.416738054708624e-07, + "loss": 0.0004, + "step": 109210 + }, + { + "epoch": 1.7871226376503313, + "grad_norm": 0.02621063031256199, + "learning_rate": 3.4115521172927044e-07, + "loss": 0.0022, + "step": 109220 + }, + { + "epoch": 1.7872862636014073, + "grad_norm": 0.07201523333787918, + "learning_rate": 3.406369979472074e-07, + "loss": 0.0009, + "step": 109230 + }, + { + "epoch": 1.787449889552483, + "grad_norm": 0.010051997378468513, + "learning_rate": 3.4011916416693933e-07, + "loss": 0.0011, + "step": 109240 + }, + { + "epoch": 1.7876135155035588, + "grad_norm": 0.054128870368003845, + "learning_rate": 3.39601710430697e-07, + "loss": 0.0008, + "step": 109250 + }, + { + "epoch": 1.7877771414546348, + "grad_norm": 0.04184652864933014, + "learning_rate": 3.3908463678068435e-07, + "loss": 0.0004, + "step": 109260 + }, + { + "epoch": 1.7879407674057104, + "grad_norm": 0.03830265998840332, + "learning_rate": 3.3856794325906995e-07, + "loss": 0.0017, + "step": 109270 + }, + { + "epoch": 1.7881043933567864, + "grad_norm": 0.022045837715268135, + "learning_rate": 3.38051629907995e-07, + "loss": 0.0005, + "step": 109280 + }, + { + "epoch": 1.7882680193078624, + "grad_norm": 0.09758727252483368, + "learning_rate": 3.37535696769567e-07, + "loss": 0.0018, + "step": 109290 + }, + { + "epoch": 1.788431645258938, + "grad_norm": 0.0681813657283783, + "learning_rate": 3.3702014388586557e-07, + "loss": 0.0008, + "step": 109300 + }, + { + "epoch": 1.788595271210014, + "grad_norm": 0.00808724109083414, + "learning_rate": 3.3650497129893546e-07, + "loss": 0.0013, + "step": 109310 + }, + { + "epoch": 1.7887588971610897, + "grad_norm": 0.02012191154062748, + "learning_rate": 3.359901790507941e-07, + "loss": 0.001, + "step": 109320 + }, + { + "epoch": 1.7889225231121655, + "grad_norm": 0.03943663090467453, + "learning_rate": 3.3547576718342465e-07, + "loss": 0.0008, + "step": 109330 + }, + { + "epoch": 1.7890861490632415, + "grad_norm": 0.049337275326251984, + "learning_rate": 3.349617357387824e-07, + "loss": 0.0012, + "step": 109340 + }, + { + "epoch": 1.7892497750143173, + "grad_norm": 0.12241708487272263, + "learning_rate": 3.3444808475878833e-07, + "loss": 0.0008, + "step": 109350 + }, + { + "epoch": 1.789413400965393, + "grad_norm": 0.0004889406845904887, + "learning_rate": 3.3393481428533604e-07, + "loss": 0.0006, + "step": 109360 + }, + { + "epoch": 1.789577026916469, + "grad_norm": 0.14237074553966522, + "learning_rate": 3.3342192436028374e-07, + "loss": 0.0008, + "step": 109370 + }, + { + "epoch": 1.7897406528675448, + "grad_norm": 0.0270346961915493, + "learning_rate": 3.3290941502546306e-07, + "loss": 0.0008, + "step": 109380 + }, + { + "epoch": 1.7899042788186206, + "grad_norm": 0.017784483730793, + "learning_rate": 3.323972863226715e-07, + "loss": 0.0008, + "step": 109390 + }, + { + "epoch": 1.7900679047696966, + "grad_norm": 0.034558188170194626, + "learning_rate": 3.3188553829367644e-07, + "loss": 0.0014, + "step": 109400 + }, + { + "epoch": 1.7902315307207723, + "grad_norm": 0.08992783725261688, + "learning_rate": 3.3137417098021487e-07, + "loss": 0.0012, + "step": 109410 + }, + { + "epoch": 1.7903951566718481, + "grad_norm": 0.0024785336572676897, + "learning_rate": 3.3086318442399123e-07, + "loss": 0.0011, + "step": 109420 + }, + { + "epoch": 1.7905587826229241, + "grad_norm": 0.0588332936167717, + "learning_rate": 3.3035257866668113e-07, + "loss": 0.0015, + "step": 109430 + }, + { + "epoch": 1.790722408574, + "grad_norm": 0.048809777945280075, + "learning_rate": 3.2984235374992677e-07, + "loss": 0.0007, + "step": 109440 + }, + { + "epoch": 1.7908860345250757, + "grad_norm": 0.09338638186454773, + "learning_rate": 3.293325097153416e-07, + "loss": 0.0009, + "step": 109450 + }, + { + "epoch": 1.7910496604761517, + "grad_norm": 0.0046027353964746, + "learning_rate": 3.288230466045045e-07, + "loss": 0.0008, + "step": 109460 + }, + { + "epoch": 1.7912132864272272, + "grad_norm": 0.03807989880442619, + "learning_rate": 3.2831396445896834e-07, + "loss": 0.0007, + "step": 109470 + }, + { + "epoch": 1.7913769123783032, + "grad_norm": 0.12097148597240448, + "learning_rate": 3.2780526332024997e-07, + "loss": 0.0022, + "step": 109480 + }, + { + "epoch": 1.791540538329379, + "grad_norm": 0.01567874476313591, + "learning_rate": 3.2729694322983895e-07, + "loss": 0.0003, + "step": 109490 + }, + { + "epoch": 1.7917041642804548, + "grad_norm": 0.02814096212387085, + "learning_rate": 3.2678900422919046e-07, + "loss": 0.001, + "step": 109500 + }, + { + "epoch": 1.7918677902315308, + "grad_norm": 0.05652020871639252, + "learning_rate": 3.262814463597319e-07, + "loss": 0.001, + "step": 109510 + }, + { + "epoch": 1.7920314161826065, + "grad_norm": 0.025702757760882378, + "learning_rate": 3.2577426966285687e-07, + "loss": 0.0011, + "step": 109520 + }, + { + "epoch": 1.7921950421336823, + "grad_norm": 0.11110571771860123, + "learning_rate": 3.2526747417992943e-07, + "loss": 0.0011, + "step": 109530 + }, + { + "epoch": 1.7923586680847583, + "grad_norm": 0.016633115708827972, + "learning_rate": 3.247610599522816e-07, + "loss": 0.0015, + "step": 109540 + }, + { + "epoch": 1.792522294035834, + "grad_norm": 0.03265874460339546, + "learning_rate": 3.242550270212158e-07, + "loss": 0.0013, + "step": 109550 + }, + { + "epoch": 1.7926859199869098, + "grad_norm": 0.025700142607092857, + "learning_rate": 3.2374937542800075e-07, + "loss": 0.0037, + "step": 109560 + }, + { + "epoch": 1.7928495459379858, + "grad_norm": 0.051331449300050735, + "learning_rate": 3.232441052138779e-07, + "loss": 0.0006, + "step": 109570 + }, + { + "epoch": 1.7930131718890616, + "grad_norm": 0.043259989470243454, + "learning_rate": 3.227392164200532e-07, + "loss": 0.0011, + "step": 109580 + }, + { + "epoch": 1.7931767978401374, + "grad_norm": 0.012173153460025787, + "learning_rate": 3.2223470908770537e-07, + "loss": 0.0007, + "step": 109590 + }, + { + "epoch": 1.7933404237912134, + "grad_norm": 0.15781977772712708, + "learning_rate": 3.2173058325797866e-07, + "loss": 0.0023, + "step": 109600 + }, + { + "epoch": 1.7935040497422892, + "grad_norm": 0.040818627923727036, + "learning_rate": 3.2122683897198967e-07, + "loss": 0.0013, + "step": 109610 + }, + { + "epoch": 1.793667675693365, + "grad_norm": 0.06910662353038788, + "learning_rate": 3.2072347627082003e-07, + "loss": 0.0007, + "step": 109620 + }, + { + "epoch": 1.793831301644441, + "grad_norm": 0.06196121126413345, + "learning_rate": 3.2022049519552457e-07, + "loss": 0.0011, + "step": 109630 + }, + { + "epoch": 1.7939949275955165, + "grad_norm": 0.1300002485513687, + "learning_rate": 3.197178957871222e-07, + "loss": 0.0009, + "step": 109640 + }, + { + "epoch": 1.7941585535465925, + "grad_norm": 0.016693996265530586, + "learning_rate": 3.192156780866057e-07, + "loss": 0.0009, + "step": 109650 + }, + { + "epoch": 1.7943221794976685, + "grad_norm": 0.007119476795196533, + "learning_rate": 3.187138421349323e-07, + "loss": 0.0029, + "step": 109660 + }, + { + "epoch": 1.794485805448744, + "grad_norm": 0.01714773289859295, + "learning_rate": 3.18212387973032e-07, + "loss": 0.0011, + "step": 109670 + }, + { + "epoch": 1.79464943139982, + "grad_norm": 0.023635078221559525, + "learning_rate": 3.1771131564179936e-07, + "loss": 0.0008, + "step": 109680 + }, + { + "epoch": 1.7948130573508958, + "grad_norm": 0.006420428398996592, + "learning_rate": 3.1721062518210166e-07, + "loss": 0.0007, + "step": 109690 + }, + { + "epoch": 1.7949766833019716, + "grad_norm": 0.029131490737199783, + "learning_rate": 3.167103166347735e-07, + "loss": 0.0006, + "step": 109700 + }, + { + "epoch": 1.7951403092530476, + "grad_norm": 0.06370867043733597, + "learning_rate": 3.162103900406177e-07, + "loss": 0.0009, + "step": 109710 + }, + { + "epoch": 1.7953039352041233, + "grad_norm": 0.019433518871665, + "learning_rate": 3.1571084544040674e-07, + "loss": 0.0006, + "step": 109720 + }, + { + "epoch": 1.7954675611551991, + "grad_norm": 0.023268291726708412, + "learning_rate": 3.152116828748819e-07, + "loss": 0.0013, + "step": 109730 + }, + { + "epoch": 1.7956311871062751, + "grad_norm": 0.013202893547713757, + "learning_rate": 3.1471290238475337e-07, + "loss": 0.0007, + "step": 109740 + }, + { + "epoch": 1.795794813057351, + "grad_norm": 0.030107159167528152, + "learning_rate": 3.1421450401069976e-07, + "loss": 0.0007, + "step": 109750 + }, + { + "epoch": 1.7959584390084267, + "grad_norm": 0.05286666750907898, + "learning_rate": 3.1371648779336906e-07, + "loss": 0.0005, + "step": 109760 + }, + { + "epoch": 1.7961220649595027, + "grad_norm": 0.030747853219509125, + "learning_rate": 3.1321885377337657e-07, + "loss": 0.0009, + "step": 109770 + }, + { + "epoch": 1.7962856909105784, + "grad_norm": 0.000842454785015434, + "learning_rate": 3.127216019913093e-07, + "loss": 0.0008, + "step": 109780 + }, + { + "epoch": 1.7964493168616542, + "grad_norm": 0.018881119787693024, + "learning_rate": 3.1222473248772034e-07, + "loss": 0.0008, + "step": 109790 + }, + { + "epoch": 1.7966129428127302, + "grad_norm": 0.03469909355044365, + "learning_rate": 3.117282453031334e-07, + "loss": 0.0008, + "step": 109800 + }, + { + "epoch": 1.796776568763806, + "grad_norm": 0.042177844792604446, + "learning_rate": 3.112321404780394e-07, + "loss": 0.0007, + "step": 109810 + }, + { + "epoch": 1.7969401947148818, + "grad_norm": 0.042193010449409485, + "learning_rate": 3.1073641805289987e-07, + "loss": 0.0018, + "step": 109820 + }, + { + "epoch": 1.7971038206659578, + "grad_norm": 0.16278859972953796, + "learning_rate": 3.1024107806814364e-07, + "loss": 0.0034, + "step": 109830 + }, + { + "epoch": 1.7972674466170333, + "grad_norm": 0.02990959957242012, + "learning_rate": 3.097461205641694e-07, + "loss": 0.0019, + "step": 109840 + }, + { + "epoch": 1.7974310725681093, + "grad_norm": 0.02863064780831337, + "learning_rate": 3.092515455813433e-07, + "loss": 0.001, + "step": 109850 + }, + { + "epoch": 1.7975946985191853, + "grad_norm": 0.039100535213947296, + "learning_rate": 3.08757353160003e-07, + "loss": 0.0007, + "step": 109860 + }, + { + "epoch": 1.7977583244702608, + "grad_norm": 0.03215475007891655, + "learning_rate": 3.0826354334045073e-07, + "loss": 0.0004, + "step": 109870 + }, + { + "epoch": 1.7979219504213368, + "grad_norm": 0.0635056346654892, + "learning_rate": 3.0777011616296205e-07, + "loss": 0.0008, + "step": 109880 + }, + { + "epoch": 1.7980855763724126, + "grad_norm": 0.025397958233952522, + "learning_rate": 3.0727707166777753e-07, + "loss": 0.0004, + "step": 109890 + }, + { + "epoch": 1.7982492023234884, + "grad_norm": 0.08179663121700287, + "learning_rate": 3.0678440989511006e-07, + "loss": 0.0016, + "step": 109900 + }, + { + "epoch": 1.7984128282745644, + "grad_norm": 0.03371858596801758, + "learning_rate": 3.0629213088513743e-07, + "loss": 0.001, + "step": 109910 + }, + { + "epoch": 1.7985764542256402, + "grad_norm": 0.018032874912023544, + "learning_rate": 3.0580023467801033e-07, + "loss": 0.0009, + "step": 109920 + }, + { + "epoch": 1.798740080176716, + "grad_norm": 0.015191777609288692, + "learning_rate": 3.053087213138439e-07, + "loss": 0.0009, + "step": 109930 + }, + { + "epoch": 1.798903706127792, + "grad_norm": 0.10504893213510513, + "learning_rate": 3.0481759083272613e-07, + "loss": 0.001, + "step": 109940 + }, + { + "epoch": 1.7990673320788677, + "grad_norm": 0.06394493579864502, + "learning_rate": 3.0432684327471095e-07, + "loss": 0.001, + "step": 109950 + }, + { + "epoch": 1.7992309580299435, + "grad_norm": 0.02950209006667137, + "learning_rate": 3.038364786798226e-07, + "loss": 0.0011, + "step": 109960 + }, + { + "epoch": 1.7993945839810195, + "grad_norm": 0.051735080778598785, + "learning_rate": 3.0334649708805286e-07, + "loss": 0.0011, + "step": 109970 + }, + { + "epoch": 1.7995582099320953, + "grad_norm": 0.02537471055984497, + "learning_rate": 3.028568985393643e-07, + "loss": 0.0011, + "step": 109980 + }, + { + "epoch": 1.799721835883171, + "grad_norm": 0.052202340215444565, + "learning_rate": 3.023676830736849e-07, + "loss": 0.0012, + "step": 109990 + }, + { + "epoch": 1.799885461834247, + "grad_norm": 0.06691936403512955, + "learning_rate": 3.018788507309156e-07, + "loss": 0.0004, + "step": 110000 + }, + { + "epoch": 1.8000490877853228, + "grad_norm": 0.16745823621749878, + "learning_rate": 3.0139040155092225e-07, + "loss": 0.0007, + "step": 110010 + }, + { + "epoch": 1.8002127137363986, + "grad_norm": 0.0382610559463501, + "learning_rate": 3.009023355735419e-07, + "loss": 0.0011, + "step": 110020 + }, + { + "epoch": 1.8003763396874746, + "grad_norm": 0.034382786601781845, + "learning_rate": 3.0041465283857984e-07, + "loss": 0.0004, + "step": 110030 + }, + { + "epoch": 1.8005399656385501, + "grad_norm": 0.02954678237438202, + "learning_rate": 2.9992735338580825e-07, + "loss": 0.0009, + "step": 110040 + }, + { + "epoch": 1.8007035915896261, + "grad_norm": 0.07690322399139404, + "learning_rate": 2.9944043725497194e-07, + "loss": 0.0023, + "step": 110050 + }, + { + "epoch": 1.8008672175407021, + "grad_norm": 0.061491891741752625, + "learning_rate": 2.989539044857798e-07, + "loss": 0.0021, + "step": 110060 + }, + { + "epoch": 1.8010308434917777, + "grad_norm": 0.036923281848430634, + "learning_rate": 2.9846775511791324e-07, + "loss": 0.001, + "step": 110070 + }, + { + "epoch": 1.8011944694428537, + "grad_norm": 0.03511584922671318, + "learning_rate": 2.9798198919102016e-07, + "loss": 0.0005, + "step": 110080 + }, + { + "epoch": 1.8013580953939294, + "grad_norm": 0.041213735938072205, + "learning_rate": 2.9749660674471934e-07, + "loss": 0.0007, + "step": 110090 + }, + { + "epoch": 1.8015217213450052, + "grad_norm": 0.068928562104702, + "learning_rate": 2.970116078185953e-07, + "loss": 0.0006, + "step": 110100 + }, + { + "epoch": 1.8016853472960812, + "grad_norm": 0.06118115782737732, + "learning_rate": 2.9652699245220407e-07, + "loss": 0.0009, + "step": 110110 + }, + { + "epoch": 1.801848973247157, + "grad_norm": 0.0021085136104375124, + "learning_rate": 2.9604276068506797e-07, + "loss": 0.0009, + "step": 110120 + }, + { + "epoch": 1.8020125991982328, + "grad_norm": 0.07775899022817612, + "learning_rate": 2.95558912556681e-07, + "loss": 0.0008, + "step": 110130 + }, + { + "epoch": 1.8021762251493088, + "grad_norm": 0.020561179146170616, + "learning_rate": 2.9507544810650214e-07, + "loss": 0.0008, + "step": 110140 + }, + { + "epoch": 1.8023398511003845, + "grad_norm": 0.013287893496453762, + "learning_rate": 2.9459236737396314e-07, + "loss": 0.0007, + "step": 110150 + }, + { + "epoch": 1.8025034770514603, + "grad_norm": 0.06170758977532387, + "learning_rate": 2.941096703984608e-07, + "loss": 0.006, + "step": 110160 + }, + { + "epoch": 1.8026671030025363, + "grad_norm": 0.06692039221525192, + "learning_rate": 2.9362735721936376e-07, + "loss": 0.0006, + "step": 110170 + }, + { + "epoch": 1.802830728953612, + "grad_norm": 0.025893183425068855, + "learning_rate": 2.9314542787600596e-07, + "loss": 0.0006, + "step": 110180 + }, + { + "epoch": 1.8029943549046878, + "grad_norm": 0.01852772757411003, + "learning_rate": 2.926638824076938e-07, + "loss": 0.0008, + "step": 110190 + }, + { + "epoch": 1.8031579808557638, + "grad_norm": 0.007035739719867706, + "learning_rate": 2.921827208536987e-07, + "loss": 0.0014, + "step": 110200 + }, + { + "epoch": 1.8033216068068396, + "grad_norm": 0.017723141238093376, + "learning_rate": 2.917019432532647e-07, + "loss": 0.0006, + "step": 110210 + }, + { + "epoch": 1.8034852327579154, + "grad_norm": 0.1276751309633255, + "learning_rate": 2.9122154964560003e-07, + "loss": 0.0006, + "step": 110220 + }, + { + "epoch": 1.8036488587089914, + "grad_norm": 0.03449685126543045, + "learning_rate": 2.907415400698865e-07, + "loss": 0.001, + "step": 110230 + }, + { + "epoch": 1.803812484660067, + "grad_norm": 0.006220964249223471, + "learning_rate": 2.902619145652691e-07, + "loss": 0.0013, + "step": 110240 + }, + { + "epoch": 1.803976110611143, + "grad_norm": 0.019480634480714798, + "learning_rate": 2.897826731708675e-07, + "loss": 0.0011, + "step": 110250 + }, + { + "epoch": 1.804139736562219, + "grad_norm": 0.036189641803503036, + "learning_rate": 2.893038159257644e-07, + "loss": 0.0006, + "step": 110260 + }, + { + "epoch": 1.8043033625132945, + "grad_norm": 0.026509476825594902, + "learning_rate": 2.8882534286901575e-07, + "loss": 0.0005, + "step": 110270 + }, + { + "epoch": 1.8044669884643705, + "grad_norm": 0.0023668569047003984, + "learning_rate": 2.883472540396426e-07, + "loss": 0.0009, + "step": 110280 + }, + { + "epoch": 1.8046306144154463, + "grad_norm": 0.042108867317438126, + "learning_rate": 2.8786954947663756e-07, + "loss": 0.0015, + "step": 110290 + }, + { + "epoch": 1.804794240366522, + "grad_norm": 0.00629700580611825, + "learning_rate": 2.8739222921895957e-07, + "loss": 0.0005, + "step": 110300 + }, + { + "epoch": 1.804957866317598, + "grad_norm": 0.004674853757023811, + "learning_rate": 2.86915293305538e-07, + "loss": 0.0017, + "step": 110310 + }, + { + "epoch": 1.8051214922686738, + "grad_norm": 0.003186131129041314, + "learning_rate": 2.864387417752695e-07, + "loss": 0.0017, + "step": 110320 + }, + { + "epoch": 1.8052851182197496, + "grad_norm": 0.04131060466170311, + "learning_rate": 2.8596257466702084e-07, + "loss": 0.0007, + "step": 110330 + }, + { + "epoch": 1.8054487441708256, + "grad_norm": 0.043626222759485245, + "learning_rate": 2.854867920196253e-07, + "loss": 0.0006, + "step": 110340 + }, + { + "epoch": 1.8056123701219013, + "grad_norm": 0.054487768560647964, + "learning_rate": 2.850113938718879e-07, + "loss": 0.0006, + "step": 110350 + }, + { + "epoch": 1.8057759960729771, + "grad_norm": 0.06478989124298096, + "learning_rate": 2.845363802625789e-07, + "loss": 0.0007, + "step": 110360 + }, + { + "epoch": 1.8059396220240531, + "grad_norm": 0.12410090118646622, + "learning_rate": 2.8406175123044e-07, + "loss": 0.0007, + "step": 110370 + }, + { + "epoch": 1.806103247975129, + "grad_norm": 0.009459729306399822, + "learning_rate": 2.8358750681417857e-07, + "loss": 0.0006, + "step": 110380 + }, + { + "epoch": 1.8062668739262047, + "grad_norm": 0.037657782435417175, + "learning_rate": 2.831136470524742e-07, + "loss": 0.0015, + "step": 110390 + }, + { + "epoch": 1.8064304998772807, + "grad_norm": 0.06314370781183243, + "learning_rate": 2.8264017198397163e-07, + "loss": 0.001, + "step": 110400 + }, + { + "epoch": 1.8065941258283562, + "grad_norm": 0.018982969224452972, + "learning_rate": 2.821670816472877e-07, + "loss": 0.0006, + "step": 110410 + }, + { + "epoch": 1.8067577517794322, + "grad_norm": 0.02616678550839424, + "learning_rate": 2.8169437608100427e-07, + "loss": 0.0005, + "step": 110420 + }, + { + "epoch": 1.8069213777305082, + "grad_norm": 0.04556853696703911, + "learning_rate": 2.812220553236755e-07, + "loss": 0.0009, + "step": 110430 + }, + { + "epoch": 1.8070850036815838, + "grad_norm": 0.031112229451537132, + "learning_rate": 2.8075011941382015e-07, + "loss": 0.0013, + "step": 110440 + }, + { + "epoch": 1.8072486296326598, + "grad_norm": 0.006761751137673855, + "learning_rate": 2.802785683899295e-07, + "loss": 0.001, + "step": 110450 + }, + { + "epoch": 1.8074122555837355, + "grad_norm": 0.20393942296504974, + "learning_rate": 2.7980740229046064e-07, + "loss": 0.001, + "step": 110460 + }, + { + "epoch": 1.8075758815348113, + "grad_norm": 0.04542384669184685, + "learning_rate": 2.7933662115384065e-07, + "loss": 0.0006, + "step": 110470 + }, + { + "epoch": 1.8077395074858873, + "grad_norm": 0.012509548105299473, + "learning_rate": 2.788662250184648e-07, + "loss": 0.0007, + "step": 110480 + }, + { + "epoch": 1.807903133436963, + "grad_norm": 0.04757794737815857, + "learning_rate": 2.78396213922697e-07, + "loss": 0.0007, + "step": 110490 + }, + { + "epoch": 1.8080667593880388, + "grad_norm": 0.03990526497364044, + "learning_rate": 2.779265879048698e-07, + "loss": 0.0012, + "step": 110500 + }, + { + "epoch": 1.8082303853391148, + "grad_norm": 0.020959237590432167, + "learning_rate": 2.7745734700328374e-07, + "loss": 0.0009, + "step": 110510 + }, + { + "epoch": 1.8083940112901906, + "grad_norm": 0.03837386518716812, + "learning_rate": 2.7698849125620986e-07, + "loss": 0.0005, + "step": 110520 + }, + { + "epoch": 1.8085576372412664, + "grad_norm": 0.02678568661212921, + "learning_rate": 2.7652002070188476e-07, + "loss": 0.0007, + "step": 110530 + }, + { + "epoch": 1.8087212631923424, + "grad_norm": 0.02614133246243, + "learning_rate": 2.760519353785163e-07, + "loss": 0.001, + "step": 110540 + }, + { + "epoch": 1.8088848891434182, + "grad_norm": 0.017137326300144196, + "learning_rate": 2.7558423532427936e-07, + "loss": 0.0005, + "step": 110550 + }, + { + "epoch": 1.809048515094494, + "grad_norm": 0.04557979106903076, + "learning_rate": 2.751169205773191e-07, + "loss": 0.0009, + "step": 110560 + }, + { + "epoch": 1.80921214104557, + "grad_norm": 0.07694178074598312, + "learning_rate": 2.7464999117574664e-07, + "loss": 0.0011, + "step": 110570 + }, + { + "epoch": 1.8093757669966457, + "grad_norm": 0.033573735505342484, + "learning_rate": 2.741834471576449e-07, + "loss": 0.0007, + "step": 110580 + }, + { + "epoch": 1.8095393929477215, + "grad_norm": 0.01824546419084072, + "learning_rate": 2.737172885610612e-07, + "loss": 0.0021, + "step": 110590 + }, + { + "epoch": 1.8097030188987975, + "grad_norm": 0.04702093079686165, + "learning_rate": 2.7325151542401673e-07, + "loss": 0.0008, + "step": 110600 + }, + { + "epoch": 1.809866644849873, + "grad_norm": 0.0018065616022795439, + "learning_rate": 2.7278612778449563e-07, + "loss": 0.0015, + "step": 110610 + }, + { + "epoch": 1.810030270800949, + "grad_norm": 0.05801538750529289, + "learning_rate": 2.7232112568045534e-07, + "loss": 0.0009, + "step": 110620 + }, + { + "epoch": 1.810193896752025, + "grad_norm": 0.059487905353307724, + "learning_rate": 2.718565091498182e-07, + "loss": 0.0012, + "step": 110630 + }, + { + "epoch": 1.8103575227031006, + "grad_norm": 0.03374604880809784, + "learning_rate": 2.713922782304784e-07, + "loss": 0.0006, + "step": 110640 + }, + { + "epoch": 1.8105211486541766, + "grad_norm": 0.026172664016485214, + "learning_rate": 2.7092843296029627e-07, + "loss": 0.0007, + "step": 110650 + }, + { + "epoch": 1.8106847746052523, + "grad_norm": 0.00165732076857239, + "learning_rate": 2.7046497337710144e-07, + "loss": 0.0008, + "step": 110660 + }, + { + "epoch": 1.8108484005563281, + "grad_norm": 0.04420202597975731, + "learning_rate": 2.700018995186915e-07, + "loss": 0.0013, + "step": 110670 + }, + { + "epoch": 1.8110120265074041, + "grad_norm": 0.04795202612876892, + "learning_rate": 2.6953921142283457e-07, + "loss": 0.0013, + "step": 110680 + }, + { + "epoch": 1.81117565245848, + "grad_norm": 0.015184788033366203, + "learning_rate": 2.6907690912726435e-07, + "loss": 0.0008, + "step": 110690 + }, + { + "epoch": 1.8113392784095557, + "grad_norm": 0.04024822264909744, + "learning_rate": 2.686149926696863e-07, + "loss": 0.001, + "step": 110700 + }, + { + "epoch": 1.8115029043606317, + "grad_norm": 0.039022479206323624, + "learning_rate": 2.681534620877713e-07, + "loss": 0.0016, + "step": 110710 + }, + { + "epoch": 1.8116665303117074, + "grad_norm": 0.10446411371231079, + "learning_rate": 2.676923174191615e-07, + "loss": 0.0006, + "step": 110720 + }, + { + "epoch": 1.8118301562627832, + "grad_norm": 0.031347062438726425, + "learning_rate": 2.6723155870146454e-07, + "loss": 0.0007, + "step": 110730 + }, + { + "epoch": 1.8119937822138592, + "grad_norm": 0.06212379038333893, + "learning_rate": 2.667711859722605e-07, + "loss": 0.0014, + "step": 110740 + }, + { + "epoch": 1.812157408164935, + "grad_norm": 0.06820494681596756, + "learning_rate": 2.6631119926909365e-07, + "loss": 0.0015, + "step": 110750 + }, + { + "epoch": 1.8123210341160108, + "grad_norm": 0.04215957224369049, + "learning_rate": 2.6585159862948073e-07, + "loss": 0.002, + "step": 110760 + }, + { + "epoch": 1.8124846600670868, + "grad_norm": 0.048620592802762985, + "learning_rate": 2.6539238409090393e-07, + "loss": 0.0006, + "step": 110770 + }, + { + "epoch": 1.8126482860181625, + "grad_norm": 0.03688211739063263, + "learning_rate": 2.6493355569081615e-07, + "loss": 0.001, + "step": 110780 + }, + { + "epoch": 1.8128119119692383, + "grad_norm": 0.02151479385793209, + "learning_rate": 2.6447511346663734e-07, + "loss": 0.0005, + "step": 110790 + }, + { + "epoch": 1.8129755379203143, + "grad_norm": 0.008797711692750454, + "learning_rate": 2.640170574557566e-07, + "loss": 0.0005, + "step": 110800 + }, + { + "epoch": 1.8131391638713898, + "grad_norm": 0.036530692130327225, + "learning_rate": 2.635593876955322e-07, + "loss": 0.0011, + "step": 110810 + }, + { + "epoch": 1.8133027898224658, + "grad_norm": 0.020792389288544655, + "learning_rate": 2.6310210422328787e-07, + "loss": 0.001, + "step": 110820 + }, + { + "epoch": 1.8134664157735418, + "grad_norm": 0.0023267720825970173, + "learning_rate": 2.626452070763208e-07, + "loss": 0.0006, + "step": 110830 + }, + { + "epoch": 1.8136300417246174, + "grad_norm": 0.011442271061241627, + "learning_rate": 2.6218869629189235e-07, + "loss": 0.0003, + "step": 110840 + }, + { + "epoch": 1.8137936676756934, + "grad_norm": 0.14230522513389587, + "learning_rate": 2.6173257190723445e-07, + "loss": 0.0007, + "step": 110850 + }, + { + "epoch": 1.8139572936267692, + "grad_norm": 0.0642613023519516, + "learning_rate": 2.6127683395954674e-07, + "loss": 0.0008, + "step": 110860 + }, + { + "epoch": 1.814120919577845, + "grad_norm": 0.021983062848448753, + "learning_rate": 2.608214824859984e-07, + "loss": 0.0017, + "step": 110870 + }, + { + "epoch": 1.814284545528921, + "grad_norm": 0.09344790875911713, + "learning_rate": 2.603665175237252e-07, + "loss": 0.0009, + "step": 110880 + }, + { + "epoch": 1.8144481714799967, + "grad_norm": 0.01867722161114216, + "learning_rate": 2.599119391098343e-07, + "loss": 0.0008, + "step": 110890 + }, + { + "epoch": 1.8146117974310725, + "grad_norm": 0.020024236291646957, + "learning_rate": 2.594577472813975e-07, + "loss": 0.0004, + "step": 110900 + }, + { + "epoch": 1.8147754233821485, + "grad_norm": 0.00954466499388218, + "learning_rate": 2.5900394207545865e-07, + "loss": 0.0005, + "step": 110910 + }, + { + "epoch": 1.8149390493332243, + "grad_norm": 0.05353563278913498, + "learning_rate": 2.5855052352902754e-07, + "loss": 0.0011, + "step": 110920 + }, + { + "epoch": 1.8151026752843, + "grad_norm": 0.15290431678295135, + "learning_rate": 2.580974916790846e-07, + "loss": 0.0014, + "step": 110930 + }, + { + "epoch": 1.815266301235376, + "grad_norm": 0.011975144036114216, + "learning_rate": 2.576448465625758e-07, + "loss": 0.0006, + "step": 110940 + }, + { + "epoch": 1.8154299271864518, + "grad_norm": 0.049102842807769775, + "learning_rate": 2.5719258821641947e-07, + "loss": 0.0009, + "step": 110950 + }, + { + "epoch": 1.8155935531375276, + "grad_norm": 0.032433703541755676, + "learning_rate": 2.567407166774988e-07, + "loss": 0.0005, + "step": 110960 + }, + { + "epoch": 1.8157571790886036, + "grad_norm": 0.0659307911992073, + "learning_rate": 2.5628923198266764e-07, + "loss": 0.0013, + "step": 110970 + }, + { + "epoch": 1.8159208050396793, + "grad_norm": 0.01950692944228649, + "learning_rate": 2.558381341687466e-07, + "loss": 0.0007, + "step": 110980 + }, + { + "epoch": 1.8160844309907551, + "grad_norm": 0.03755845129489899, + "learning_rate": 2.5538742327252674e-07, + "loss": 0.001, + "step": 110990 + }, + { + "epoch": 1.8162480569418311, + "grad_norm": 0.04799890145659447, + "learning_rate": 2.5493709933076593e-07, + "loss": 0.0008, + "step": 111000 + }, + { + "epoch": 1.8164116828929067, + "grad_norm": 0.045996349304914474, + "learning_rate": 2.5448716238019145e-07, + "loss": 0.0009, + "step": 111010 + }, + { + "epoch": 1.8165753088439827, + "grad_norm": 0.016494890674948692, + "learning_rate": 2.5403761245749783e-07, + "loss": 0.0007, + "step": 111020 + }, + { + "epoch": 1.8167389347950587, + "grad_norm": 0.1004316583275795, + "learning_rate": 2.5358844959935016e-07, + "loss": 0.0007, + "step": 111030 + }, + { + "epoch": 1.8169025607461342, + "grad_norm": 0.03347358480095863, + "learning_rate": 2.531396738423791e-07, + "loss": 0.001, + "step": 111040 + }, + { + "epoch": 1.8170661866972102, + "grad_norm": 0.024299656972289085, + "learning_rate": 2.5269128522318664e-07, + "loss": 0.0009, + "step": 111050 + }, + { + "epoch": 1.817229812648286, + "grad_norm": 0.03240986913442612, + "learning_rate": 2.522432837783406e-07, + "loss": 0.0006, + "step": 111060 + }, + { + "epoch": 1.8173934385993618, + "grad_norm": 0.15089312195777893, + "learning_rate": 2.5179566954437964e-07, + "loss": 0.001, + "step": 111070 + }, + { + "epoch": 1.8175570645504378, + "grad_norm": 0.024585865437984467, + "learning_rate": 2.513484425578083e-07, + "loss": 0.0005, + "step": 111080 + }, + { + "epoch": 1.8177206905015135, + "grad_norm": 0.005272882059216499, + "learning_rate": 2.5090160285510257e-07, + "loss": 0.0008, + "step": 111090 + }, + { + "epoch": 1.8178843164525893, + "grad_norm": 0.036688633263111115, + "learning_rate": 2.5045515047270375e-07, + "loss": 0.0006, + "step": 111100 + }, + { + "epoch": 1.8180479424036653, + "grad_norm": 0.017682114616036415, + "learning_rate": 2.500090854470238e-07, + "loss": 0.0014, + "step": 111110 + }, + { + "epoch": 1.818211568354741, + "grad_norm": 0.03978768736124039, + "learning_rate": 2.4956340781444154e-07, + "loss": 0.001, + "step": 111120 + }, + { + "epoch": 1.8183751943058168, + "grad_norm": 0.02109573222696781, + "learning_rate": 2.491181176113061e-07, + "loss": 0.0006, + "step": 111130 + }, + { + "epoch": 1.8185388202568928, + "grad_norm": 0.03167502209544182, + "learning_rate": 2.4867321487393294e-07, + "loss": 0.0009, + "step": 111140 + }, + { + "epoch": 1.8187024462079686, + "grad_norm": 0.06268315017223358, + "learning_rate": 2.482286996386063e-07, + "loss": 0.0007, + "step": 111150 + }, + { + "epoch": 1.8188660721590444, + "grad_norm": 0.025574948638677597, + "learning_rate": 2.477845719415811e-07, + "loss": 0.0006, + "step": 111160 + }, + { + "epoch": 1.8190296981101204, + "grad_norm": 0.12344178557395935, + "learning_rate": 2.4734083181907677e-07, + "loss": 0.0006, + "step": 111170 + }, + { + "epoch": 1.8191933240611962, + "grad_norm": 0.03598754480481148, + "learning_rate": 2.468974793072848e-07, + "loss": 0.002, + "step": 111180 + }, + { + "epoch": 1.819356950012272, + "grad_norm": 0.06256358325481415, + "learning_rate": 2.464545144423625e-07, + "loss": 0.0014, + "step": 111190 + }, + { + "epoch": 1.819520575963348, + "grad_norm": 0.018764719367027283, + "learning_rate": 2.460119372604375e-07, + "loss": 0.0013, + "step": 111200 + }, + { + "epoch": 1.8196842019144235, + "grad_norm": 0.07342077791690826, + "learning_rate": 2.455697477976038e-07, + "loss": 0.001, + "step": 111210 + }, + { + "epoch": 1.8198478278654995, + "grad_norm": 0.1283794790506363, + "learning_rate": 2.451279460899264e-07, + "loss": 0.0012, + "step": 111220 + }, + { + "epoch": 1.8200114538165755, + "grad_norm": 0.01647566445171833, + "learning_rate": 2.446865321734354e-07, + "loss": 0.0009, + "step": 111230 + }, + { + "epoch": 1.820175079767651, + "grad_norm": 0.03955389931797981, + "learning_rate": 2.4424550608413254e-07, + "loss": 0.0007, + "step": 111240 + }, + { + "epoch": 1.820338705718727, + "grad_norm": 0.07133174687623978, + "learning_rate": 2.4380486785798516e-07, + "loss": 0.0007, + "step": 111250 + }, + { + "epoch": 1.8205023316698028, + "grad_norm": 0.09807589650154114, + "learning_rate": 2.433646175309312e-07, + "loss": 0.0016, + "step": 111260 + }, + { + "epoch": 1.8206659576208786, + "grad_norm": 0.06567030400037766, + "learning_rate": 2.4292475513887523e-07, + "loss": 0.0012, + "step": 111270 + }, + { + "epoch": 1.8208295835719546, + "grad_norm": 0.0040667993016541, + "learning_rate": 2.424852807176914e-07, + "loss": 0.0007, + "step": 111280 + }, + { + "epoch": 1.8209932095230303, + "grad_norm": 0.01584528759121895, + "learning_rate": 2.4204619430322153e-07, + "loss": 0.0008, + "step": 111290 + }, + { + "epoch": 1.8211568354741061, + "grad_norm": 0.025917090475559235, + "learning_rate": 2.4160749593127643e-07, + "loss": 0.0008, + "step": 111300 + }, + { + "epoch": 1.8213204614251821, + "grad_norm": 0.05672488361597061, + "learning_rate": 2.4116918563763416e-07, + "loss": 0.0012, + "step": 111310 + }, + { + "epoch": 1.8214840873762579, + "grad_norm": 0.043300505727529526, + "learning_rate": 2.407312634580422e-07, + "loss": 0.0013, + "step": 111320 + }, + { + "epoch": 1.8216477133273337, + "grad_norm": 0.05439889803528786, + "learning_rate": 2.4029372942821594e-07, + "loss": 0.0009, + "step": 111330 + }, + { + "epoch": 1.8218113392784097, + "grad_norm": 0.010111581534147263, + "learning_rate": 2.3985658358383956e-07, + "loss": 0.0012, + "step": 111340 + }, + { + "epoch": 1.8219749652294854, + "grad_norm": 0.03371474891901016, + "learning_rate": 2.39419825960564e-07, + "loss": 0.0015, + "step": 111350 + }, + { + "epoch": 1.8221385911805612, + "grad_norm": 0.2506891191005707, + "learning_rate": 2.389834565940119e-07, + "loss": 0.0009, + "step": 111360 + }, + { + "epoch": 1.8223022171316372, + "grad_norm": 0.08747326582670212, + "learning_rate": 2.385474755197692e-07, + "loss": 0.0005, + "step": 111370 + }, + { + "epoch": 1.8224658430827128, + "grad_norm": 0.024355821311473846, + "learning_rate": 2.381118827733958e-07, + "loss": 0.0019, + "step": 111380 + }, + { + "epoch": 1.8226294690337888, + "grad_norm": 0.09600850194692612, + "learning_rate": 2.3767667839041497e-07, + "loss": 0.001, + "step": 111390 + }, + { + "epoch": 1.8227930949848647, + "grad_norm": 0.02988213486969471, + "learning_rate": 2.3724186240632218e-07, + "loss": 0.0017, + "step": 111400 + }, + { + "epoch": 1.8229567209359403, + "grad_norm": 0.03001558594405651, + "learning_rate": 2.3680743485657797e-07, + "loss": 0.0003, + "step": 111410 + }, + { + "epoch": 1.8231203468870163, + "grad_norm": 0.155255526304245, + "learning_rate": 2.3637339577661454e-07, + "loss": 0.0011, + "step": 111420 + }, + { + "epoch": 1.823283972838092, + "grad_norm": 0.05806509032845497, + "learning_rate": 2.359397452018286e-07, + "loss": 0.0011, + "step": 111430 + }, + { + "epoch": 1.8234475987891678, + "grad_norm": 0.05453617498278618, + "learning_rate": 2.3550648316758906e-07, + "loss": 0.0012, + "step": 111440 + }, + { + "epoch": 1.8236112247402438, + "grad_norm": 0.024343177676200867, + "learning_rate": 2.35073609709231e-07, + "loss": 0.001, + "step": 111450 + }, + { + "epoch": 1.8237748506913196, + "grad_norm": 0.0034968750551342964, + "learning_rate": 2.346411248620567e-07, + "loss": 0.0015, + "step": 111460 + }, + { + "epoch": 1.8239384766423954, + "grad_norm": 0.07651791721582413, + "learning_rate": 2.3420902866133966e-07, + "loss": 0.0005, + "step": 111470 + }, + { + "epoch": 1.8241021025934714, + "grad_norm": 0.10037540644407272, + "learning_rate": 2.337773211423189e-07, + "loss": 0.0015, + "step": 111480 + }, + { + "epoch": 1.8242657285445472, + "grad_norm": 0.036474306136369705, + "learning_rate": 2.3334600234020455e-07, + "loss": 0.0008, + "step": 111490 + }, + { + "epoch": 1.824429354495623, + "grad_norm": 0.021350465714931488, + "learning_rate": 2.329150722901724e-07, + "loss": 0.0005, + "step": 111500 + }, + { + "epoch": 1.824592980446699, + "grad_norm": 0.002814835635945201, + "learning_rate": 2.3248453102736824e-07, + "loss": 0.0008, + "step": 111510 + }, + { + "epoch": 1.8247566063977747, + "grad_norm": 0.004110607318580151, + "learning_rate": 2.3205437858690449e-07, + "loss": 0.0007, + "step": 111520 + }, + { + "epoch": 1.8249202323488505, + "grad_norm": 0.047404851764440536, + "learning_rate": 2.3162461500386423e-07, + "loss": 0.0007, + "step": 111530 + }, + { + "epoch": 1.8250838582999265, + "grad_norm": 0.07846037298440933, + "learning_rate": 2.3119524031329665e-07, + "loss": 0.0007, + "step": 111540 + }, + { + "epoch": 1.8252474842510023, + "grad_norm": 0.03799647465348244, + "learning_rate": 2.3076625455022094e-07, + "loss": 0.0008, + "step": 111550 + }, + { + "epoch": 1.825411110202078, + "grad_norm": 0.04996400326490402, + "learning_rate": 2.3033765774962247e-07, + "loss": 0.0008, + "step": 111560 + }, + { + "epoch": 1.825574736153154, + "grad_norm": 0.010807087644934654, + "learning_rate": 2.2990944994645713e-07, + "loss": 0.0006, + "step": 111570 + }, + { + "epoch": 1.8257383621042296, + "grad_norm": 0.04443098604679108, + "learning_rate": 2.2948163117564758e-07, + "loss": 0.0009, + "step": 111580 + }, + { + "epoch": 1.8259019880553056, + "grad_norm": 0.03280111774802208, + "learning_rate": 2.2905420147208645e-07, + "loss": 0.0008, + "step": 111590 + }, + { + "epoch": 1.8260656140063816, + "grad_norm": 0.033179961144924164, + "learning_rate": 2.2862716087063142e-07, + "loss": 0.0013, + "step": 111600 + }, + { + "epoch": 1.8262292399574571, + "grad_norm": 0.09884998947381973, + "learning_rate": 2.2820050940611184e-07, + "loss": 0.0009, + "step": 111610 + }, + { + "epoch": 1.8263928659085331, + "grad_norm": 0.00635836785659194, + "learning_rate": 2.2777424711332375e-07, + "loss": 0.0004, + "step": 111620 + }, + { + "epoch": 1.8265564918596089, + "grad_norm": 0.030929584056138992, + "learning_rate": 2.2734837402703159e-07, + "loss": 0.0015, + "step": 111630 + }, + { + "epoch": 1.8267201178106847, + "grad_norm": 0.05123552680015564, + "learning_rate": 2.2692289018196756e-07, + "loss": 0.0009, + "step": 111640 + }, + { + "epoch": 1.8268837437617607, + "grad_norm": 0.029979132115840912, + "learning_rate": 2.2649779561283391e-07, + "loss": 0.001, + "step": 111650 + }, + { + "epoch": 1.8270473697128364, + "grad_norm": 0.06885220855474472, + "learning_rate": 2.2607309035429847e-07, + "loss": 0.0012, + "step": 111660 + }, + { + "epoch": 1.8272109956639122, + "grad_norm": 0.1243332028388977, + "learning_rate": 2.2564877444100075e-07, + "loss": 0.0012, + "step": 111670 + }, + { + "epoch": 1.8273746216149882, + "grad_norm": 0.10198543965816498, + "learning_rate": 2.252248479075436e-07, + "loss": 0.002, + "step": 111680 + }, + { + "epoch": 1.827538247566064, + "grad_norm": 0.08848313987255096, + "learning_rate": 2.2480131078850386e-07, + "loss": 0.0006, + "step": 111690 + }, + { + "epoch": 1.8277018735171398, + "grad_norm": 0.03245293349027634, + "learning_rate": 2.243781631184222e-07, + "loss": 0.0005, + "step": 111700 + }, + { + "epoch": 1.8278654994682157, + "grad_norm": 0.07604245096445084, + "learning_rate": 2.2395540493180989e-07, + "loss": 0.0007, + "step": 111710 + }, + { + "epoch": 1.8280291254192915, + "grad_norm": 0.040595319122076035, + "learning_rate": 2.2353303626314438e-07, + "loss": 0.0007, + "step": 111720 + }, + { + "epoch": 1.8281927513703673, + "grad_norm": 0.03918663412332535, + "learning_rate": 2.231110571468742e-07, + "loss": 0.0008, + "step": 111730 + }, + { + "epoch": 1.8283563773214433, + "grad_norm": 0.03864748403429985, + "learning_rate": 2.226894676174135e-07, + "loss": 0.0009, + "step": 111740 + }, + { + "epoch": 1.828520003272519, + "grad_norm": 0.03836618736386299, + "learning_rate": 2.222682677091459e-07, + "loss": 0.0007, + "step": 111750 + }, + { + "epoch": 1.8286836292235948, + "grad_norm": 0.16263221204280853, + "learning_rate": 2.2184745745642334e-07, + "loss": 0.0016, + "step": 111760 + }, + { + "epoch": 1.8288472551746708, + "grad_norm": 0.19651682674884796, + "learning_rate": 2.214270368935656e-07, + "loss": 0.0009, + "step": 111770 + }, + { + "epoch": 1.8290108811257464, + "grad_norm": 0.030727343633770943, + "learning_rate": 2.210070060548608e-07, + "loss": 0.0011, + "step": 111780 + }, + { + "epoch": 1.8291745070768224, + "grad_norm": 0.052766796201467514, + "learning_rate": 2.2058736497456436e-07, + "loss": 0.0011, + "step": 111790 + }, + { + "epoch": 1.8293381330278984, + "grad_norm": 0.09465035796165466, + "learning_rate": 2.2016811368690217e-07, + "loss": 0.0006, + "step": 111800 + }, + { + "epoch": 1.829501758978974, + "grad_norm": 0.03092537820339203, + "learning_rate": 2.197492522260658e-07, + "loss": 0.0008, + "step": 111810 + }, + { + "epoch": 1.82966538493005, + "grad_norm": 0.02302798442542553, + "learning_rate": 2.1933078062621682e-07, + "loss": 0.0008, + "step": 111820 + }, + { + "epoch": 1.8298290108811257, + "grad_norm": 0.10573314875364304, + "learning_rate": 2.1891269892148348e-07, + "loss": 0.001, + "step": 111830 + }, + { + "epoch": 1.8299926368322015, + "grad_norm": 0.02152593806385994, + "learning_rate": 2.184950071459646e-07, + "loss": 0.0012, + "step": 111840 + }, + { + "epoch": 1.8301562627832775, + "grad_norm": 0.03256351500749588, + "learning_rate": 2.1807770533372462e-07, + "loss": 0.0007, + "step": 111850 + }, + { + "epoch": 1.8303198887343533, + "grad_norm": 0.009935869835317135, + "learning_rate": 2.1766079351879743e-07, + "loss": 0.0015, + "step": 111860 + }, + { + "epoch": 1.830483514685429, + "grad_norm": 0.09688275307416916, + "learning_rate": 2.172442717351847e-07, + "loss": 0.0008, + "step": 111870 + }, + { + "epoch": 1.830647140636505, + "grad_norm": 0.012640787288546562, + "learning_rate": 2.168281400168576e-07, + "loss": 0.0008, + "step": 111880 + }, + { + "epoch": 1.8308107665875808, + "grad_norm": 0.033625803887844086, + "learning_rate": 2.1641239839775286e-07, + "loss": 0.0008, + "step": 111890 + }, + { + "epoch": 1.8309743925386566, + "grad_norm": 0.018425360321998596, + "learning_rate": 2.159970469117778e-07, + "loss": 0.0014, + "step": 111900 + }, + { + "epoch": 1.8311380184897326, + "grad_norm": 0.12633606791496277, + "learning_rate": 2.1558208559280701e-07, + "loss": 0.001, + "step": 111910 + }, + { + "epoch": 1.8313016444408083, + "grad_norm": 0.04302993416786194, + "learning_rate": 2.1516751447468398e-07, + "loss": 0.0007, + "step": 111920 + }, + { + "epoch": 1.8314652703918841, + "grad_norm": 0.023784073069691658, + "learning_rate": 2.147533335912183e-07, + "loss": 0.0008, + "step": 111930 + }, + { + "epoch": 1.8316288963429601, + "grad_norm": 0.009703692980110645, + "learning_rate": 2.1433954297619076e-07, + "loss": 0.0007, + "step": 111940 + }, + { + "epoch": 1.8317925222940359, + "grad_norm": 0.07528490573167801, + "learning_rate": 2.139261426633471e-07, + "loss": 0.0015, + "step": 111950 + }, + { + "epoch": 1.8319561482451117, + "grad_norm": 0.004625246860086918, + "learning_rate": 2.1351313268640373e-07, + "loss": 0.0009, + "step": 111960 + }, + { + "epoch": 1.8321197741961877, + "grad_norm": 0.1257781982421875, + "learning_rate": 2.1310051307904422e-07, + "loss": 0.0012, + "step": 111970 + }, + { + "epoch": 1.8322834001472632, + "grad_norm": 0.12225418537855148, + "learning_rate": 2.1268828387492114e-07, + "loss": 0.0013, + "step": 111980 + }, + { + "epoch": 1.8324470260983392, + "grad_norm": 0.019057348370552063, + "learning_rate": 2.1227644510765255e-07, + "loss": 0.001, + "step": 111990 + }, + { + "epoch": 1.8326106520494152, + "grad_norm": 0.03979181498289108, + "learning_rate": 2.1186499681082884e-07, + "loss": 0.0006, + "step": 112000 + }, + { + "epoch": 1.8327742780004908, + "grad_norm": 0.04923700913786888, + "learning_rate": 2.1145393901800482e-07, + "loss": 0.0018, + "step": 112010 + }, + { + "epoch": 1.8329379039515667, + "grad_norm": 0.028687067329883575, + "learning_rate": 2.110432717627059e-07, + "loss": 0.0008, + "step": 112020 + }, + { + "epoch": 1.8331015299026425, + "grad_norm": 0.07212336361408234, + "learning_rate": 2.1063299507842362e-07, + "loss": 0.0012, + "step": 112030 + }, + { + "epoch": 1.8332651558537183, + "grad_norm": 0.04509720206260681, + "learning_rate": 2.1022310899862065e-07, + "loss": 0.0011, + "step": 112040 + }, + { + "epoch": 1.8334287818047943, + "grad_norm": 0.018241552636027336, + "learning_rate": 2.0981361355672358e-07, + "loss": 0.0007, + "step": 112050 + }, + { + "epoch": 1.83359240775587, + "grad_norm": 0.02044142596423626, + "learning_rate": 2.0940450878613073e-07, + "loss": 0.0006, + "step": 112060 + }, + { + "epoch": 1.8337560337069458, + "grad_norm": 0.04604267328977585, + "learning_rate": 2.0899579472020704e-07, + "loss": 0.0012, + "step": 112070 + }, + { + "epoch": 1.8339196596580218, + "grad_norm": 0.10461519658565521, + "learning_rate": 2.0858747139228698e-07, + "loss": 0.0009, + "step": 112080 + }, + { + "epoch": 1.8340832856090976, + "grad_norm": 0.048821017146110535, + "learning_rate": 2.0817953883566944e-07, + "loss": 0.0008, + "step": 112090 + }, + { + "epoch": 1.8342469115601734, + "grad_norm": 0.07163149863481522, + "learning_rate": 2.0777199708362672e-07, + "loss": 0.0007, + "step": 112100 + }, + { + "epoch": 1.8344105375112494, + "grad_norm": 0.06433001905679703, + "learning_rate": 2.07364846169395e-07, + "loss": 0.0009, + "step": 112110 + }, + { + "epoch": 1.8345741634623252, + "grad_norm": 0.10736479610204697, + "learning_rate": 2.069580861261805e-07, + "loss": 0.0019, + "step": 112120 + }, + { + "epoch": 1.834737789413401, + "grad_norm": 0.16886258125305176, + "learning_rate": 2.0655171698715725e-07, + "loss": 0.0011, + "step": 112130 + }, + { + "epoch": 1.834901415364477, + "grad_norm": 0.014089349657297134, + "learning_rate": 2.0614573878546652e-07, + "loss": 0.0006, + "step": 112140 + }, + { + "epoch": 1.8350650413155525, + "grad_norm": 0.059346482157707214, + "learning_rate": 2.0574015155422012e-07, + "loss": 0.001, + "step": 112150 + }, + { + "epoch": 1.8352286672666285, + "grad_norm": 0.02900175005197525, + "learning_rate": 2.0533495532649494e-07, + "loss": 0.0006, + "step": 112160 + }, + { + "epoch": 1.8353922932177045, + "grad_norm": 0.03774087876081467, + "learning_rate": 2.0493015013533846e-07, + "loss": 0.0012, + "step": 112170 + }, + { + "epoch": 1.83555591916878, + "grad_norm": 0.004515171051025391, + "learning_rate": 2.0452573601376369e-07, + "loss": 0.001, + "step": 112180 + }, + { + "epoch": 1.835719545119856, + "grad_norm": 0.08888539671897888, + "learning_rate": 2.0412171299475535e-07, + "loss": 0.0009, + "step": 112190 + }, + { + "epoch": 1.8358831710709318, + "grad_norm": 0.0273519828915596, + "learning_rate": 2.0371808111126212e-07, + "loss": 0.0005, + "step": 112200 + }, + { + "epoch": 1.8360467970220076, + "grad_norm": 0.07537045329809189, + "learning_rate": 2.0331484039620487e-07, + "loss": 0.001, + "step": 112210 + }, + { + "epoch": 1.8362104229730836, + "grad_norm": 0.0029988503083586693, + "learning_rate": 2.0291199088246894e-07, + "loss": 0.0005, + "step": 112220 + }, + { + "epoch": 1.8363740489241593, + "grad_norm": 0.006366785615682602, + "learning_rate": 2.025095326029103e-07, + "loss": 0.0005, + "step": 112230 + }, + { + "epoch": 1.8365376748752351, + "grad_norm": 0.03925633803009987, + "learning_rate": 2.0210746559035099e-07, + "loss": 0.0009, + "step": 112240 + }, + { + "epoch": 1.8367013008263111, + "grad_norm": 0.043429989367723465, + "learning_rate": 2.017057898775837e-07, + "loss": 0.0007, + "step": 112250 + }, + { + "epoch": 1.8368649267773869, + "grad_norm": 0.022378094494342804, + "learning_rate": 2.0130450549736668e-07, + "loss": 0.0009, + "step": 112260 + }, + { + "epoch": 1.8370285527284627, + "grad_norm": 0.07322119176387787, + "learning_rate": 2.0090361248242762e-07, + "loss": 0.0004, + "step": 112270 + }, + { + "epoch": 1.8371921786795387, + "grad_norm": 0.021937793120741844, + "learning_rate": 2.0050311086546148e-07, + "loss": 0.0012, + "step": 112280 + }, + { + "epoch": 1.8373558046306144, + "grad_norm": 0.009467998519539833, + "learning_rate": 2.0010300067913323e-07, + "loss": 0.0008, + "step": 112290 + }, + { + "epoch": 1.8375194305816902, + "grad_norm": 0.006907562725245953, + "learning_rate": 1.997032819560729e-07, + "loss": 0.0006, + "step": 112300 + }, + { + "epoch": 1.8376830565327662, + "grad_norm": 0.6138444542884827, + "learning_rate": 1.9930395472888163e-07, + "loss": 0.0009, + "step": 112310 + }, + { + "epoch": 1.837846682483842, + "grad_norm": 0.04467267543077469, + "learning_rate": 1.989050190301256e-07, + "loss": 0.0009, + "step": 112320 + }, + { + "epoch": 1.8380103084349178, + "grad_norm": 0.04853655397891998, + "learning_rate": 1.985064748923421e-07, + "loss": 0.0008, + "step": 112330 + }, + { + "epoch": 1.8381739343859937, + "grad_norm": 0.061252862215042114, + "learning_rate": 1.98108322348034e-07, + "loss": 0.0007, + "step": 112340 + }, + { + "epoch": 1.8383375603370693, + "grad_norm": 0.03716979920864105, + "learning_rate": 1.977105614296737e-07, + "loss": 0.0004, + "step": 112350 + }, + { + "epoch": 1.8385011862881453, + "grad_norm": 0.0739227905869484, + "learning_rate": 1.9731319216970135e-07, + "loss": 0.0011, + "step": 112360 + }, + { + "epoch": 1.8386648122392213, + "grad_norm": 0.03022587299346924, + "learning_rate": 1.969162146005249e-07, + "loss": 0.0006, + "step": 112370 + }, + { + "epoch": 1.8388284381902968, + "grad_norm": 0.026162147521972656, + "learning_rate": 1.9651962875452069e-07, + "loss": 0.0007, + "step": 112380 + }, + { + "epoch": 1.8389920641413728, + "grad_norm": 0.027955437079072, + "learning_rate": 1.9612343466403227e-07, + "loss": 0.0005, + "step": 112390 + }, + { + "epoch": 1.8391556900924486, + "grad_norm": 0.13000430166721344, + "learning_rate": 1.9572763236137272e-07, + "loss": 0.0011, + "step": 112400 + }, + { + "epoch": 1.8393193160435244, + "grad_norm": 0.01655576564371586, + "learning_rate": 1.9533222187882172e-07, + "loss": 0.0006, + "step": 112410 + }, + { + "epoch": 1.8394829419946004, + "grad_norm": 0.025672340765595436, + "learning_rate": 1.9493720324862854e-07, + "loss": 0.0007, + "step": 112420 + }, + { + "epoch": 1.8396465679456762, + "grad_norm": 0.002498106099665165, + "learning_rate": 1.9454257650300733e-07, + "loss": 0.0011, + "step": 112430 + }, + { + "epoch": 1.839810193896752, + "grad_norm": 0.0408705435693264, + "learning_rate": 1.941483416741452e-07, + "loss": 0.0012, + "step": 112440 + }, + { + "epoch": 1.839973819847828, + "grad_norm": 0.03590615466237068, + "learning_rate": 1.9375449879419306e-07, + "loss": 0.0009, + "step": 112450 + }, + { + "epoch": 1.8401374457989037, + "grad_norm": 0.004171170759946108, + "learning_rate": 1.933610478952719e-07, + "loss": 0.0007, + "step": 112460 + }, + { + "epoch": 1.8403010717499795, + "grad_norm": 0.014536440372467041, + "learning_rate": 1.9296798900946935e-07, + "loss": 0.0003, + "step": 112470 + }, + { + "epoch": 1.8404646977010555, + "grad_norm": 0.08026809245347977, + "learning_rate": 1.9257532216884368e-07, + "loss": 0.0009, + "step": 112480 + }, + { + "epoch": 1.8406283236521312, + "grad_norm": 0.02323959954082966, + "learning_rate": 1.921830474054176e-07, + "loss": 0.001, + "step": 112490 + }, + { + "epoch": 1.840791949603207, + "grad_norm": 0.03691117838025093, + "learning_rate": 1.91791164751185e-07, + "loss": 0.0006, + "step": 112500 + }, + { + "epoch": 1.840955575554283, + "grad_norm": 0.016201648861169815, + "learning_rate": 1.913996742381058e-07, + "loss": 0.0004, + "step": 112510 + }, + { + "epoch": 1.8411192015053588, + "grad_norm": 0.040413998067379, + "learning_rate": 1.9100857589810952e-07, + "loss": 0.0011, + "step": 112520 + }, + { + "epoch": 1.8412828274564346, + "grad_norm": 0.03945711627602577, + "learning_rate": 1.9061786976309116e-07, + "loss": 0.0007, + "step": 112530 + }, + { + "epoch": 1.8414464534075106, + "grad_norm": 0.013471710495650768, + "learning_rate": 1.9022755586491747e-07, + "loss": 0.0018, + "step": 112540 + }, + { + "epoch": 1.8416100793585861, + "grad_norm": 0.006731478031724691, + "learning_rate": 1.8983763423541913e-07, + "loss": 0.0008, + "step": 112550 + }, + { + "epoch": 1.8417737053096621, + "grad_norm": 0.04281238839030266, + "learning_rate": 1.8944810490639897e-07, + "loss": 0.0009, + "step": 112560 + }, + { + "epoch": 1.841937331260738, + "grad_norm": 0.07378807663917542, + "learning_rate": 1.890589679096233e-07, + "loss": 0.0007, + "step": 112570 + }, + { + "epoch": 1.8421009572118137, + "grad_norm": 0.09281789511442184, + "learning_rate": 1.886702232768306e-07, + "loss": 0.0005, + "step": 112580 + }, + { + "epoch": 1.8422645831628897, + "grad_norm": 0.02334589883685112, + "learning_rate": 1.882818710397244e-07, + "loss": 0.0015, + "step": 112590 + }, + { + "epoch": 1.8424282091139654, + "grad_norm": 0.02348475158214569, + "learning_rate": 1.8789391122997879e-07, + "loss": 0.0009, + "step": 112600 + }, + { + "epoch": 1.8425918350650412, + "grad_norm": 0.09792564064264297, + "learning_rate": 1.875063438792324e-07, + "loss": 0.0015, + "step": 112610 + }, + { + "epoch": 1.8427554610161172, + "grad_norm": 0.12522895634174347, + "learning_rate": 1.87119169019096e-07, + "loss": 0.001, + "step": 112620 + }, + { + "epoch": 1.842919086967193, + "grad_norm": 0.05113855004310608, + "learning_rate": 1.867323866811449e-07, + "loss": 0.0009, + "step": 112630 + }, + { + "epoch": 1.8430827129182688, + "grad_norm": 0.12167287617921829, + "learning_rate": 1.8634599689692502e-07, + "loss": 0.0006, + "step": 112640 + }, + { + "epoch": 1.8432463388693447, + "grad_norm": 0.02742084674537182, + "learning_rate": 1.859599996979472e-07, + "loss": 0.0008, + "step": 112650 + }, + { + "epoch": 1.8434099648204205, + "grad_norm": 0.010081775486469269, + "learning_rate": 1.8557439511569353e-07, + "loss": 0.001, + "step": 112660 + }, + { + "epoch": 1.8435735907714963, + "grad_norm": 0.306482195854187, + "learning_rate": 1.851891831816116e-07, + "loss": 0.0008, + "step": 112670 + }, + { + "epoch": 1.8437372167225723, + "grad_norm": 0.044563211500644684, + "learning_rate": 1.848043639271191e-07, + "loss": 0.0015, + "step": 112680 + }, + { + "epoch": 1.843900842673648, + "grad_norm": 0.005889760795980692, + "learning_rate": 1.844199373835992e-07, + "loss": 0.0008, + "step": 112690 + }, + { + "epoch": 1.8440644686247238, + "grad_norm": 0.05091425031423569, + "learning_rate": 1.8403590358240576e-07, + "loss": 0.0016, + "step": 112700 + }, + { + "epoch": 1.8442280945757998, + "grad_norm": 0.03527555987238884, + "learning_rate": 1.8365226255485868e-07, + "loss": 0.0006, + "step": 112710 + }, + { + "epoch": 1.8443917205268756, + "grad_norm": 0.034672170877456665, + "learning_rate": 1.8326901433224631e-07, + "loss": 0.0004, + "step": 112720 + }, + { + "epoch": 1.8445553464779514, + "grad_norm": 0.011133643798530102, + "learning_rate": 1.828861589458253e-07, + "loss": 0.0009, + "step": 112730 + }, + { + "epoch": 1.8447189724290274, + "grad_norm": 0.027846060693264008, + "learning_rate": 1.8250369642682007e-07, + "loss": 0.0012, + "step": 112740 + }, + { + "epoch": 1.844882598380103, + "grad_norm": 0.04411618411540985, + "learning_rate": 1.8212162680642297e-07, + "loss": 0.0005, + "step": 112750 + }, + { + "epoch": 1.845046224331179, + "grad_norm": 0.08452106267213821, + "learning_rate": 1.817399501157935e-07, + "loss": 0.0008, + "step": 112760 + }, + { + "epoch": 1.845209850282255, + "grad_norm": 0.0366785041987896, + "learning_rate": 1.8135866638606114e-07, + "loss": 0.0009, + "step": 112770 + }, + { + "epoch": 1.8453734762333305, + "grad_norm": 0.15057390928268433, + "learning_rate": 1.8097777564832053e-07, + "loss": 0.0005, + "step": 112780 + }, + { + "epoch": 1.8455371021844065, + "grad_norm": 0.15456070005893707, + "learning_rate": 1.805972779336379e-07, + "loss": 0.0009, + "step": 112790 + }, + { + "epoch": 1.8457007281354822, + "grad_norm": 0.054340820759534836, + "learning_rate": 1.80217173273044e-07, + "loss": 0.0015, + "step": 112800 + }, + { + "epoch": 1.845864354086558, + "grad_norm": 0.091983363032341, + "learning_rate": 1.7983746169753902e-07, + "loss": 0.0005, + "step": 112810 + }, + { + "epoch": 1.846027980037634, + "grad_norm": 0.02596302516758442, + "learning_rate": 1.7945814323809096e-07, + "loss": 0.0006, + "step": 112820 + }, + { + "epoch": 1.8461916059887098, + "grad_norm": 0.026148518547415733, + "learning_rate": 1.790792179256362e-07, + "loss": 0.0006, + "step": 112830 + }, + { + "epoch": 1.8463552319397856, + "grad_norm": 0.0032644644379615784, + "learning_rate": 1.7870068579107724e-07, + "loss": 0.0014, + "step": 112840 + }, + { + "epoch": 1.8465188578908616, + "grad_norm": 0.03951422870159149, + "learning_rate": 1.7832254686528828e-07, + "loss": 0.0007, + "step": 112850 + }, + { + "epoch": 1.8466824838419373, + "grad_norm": 0.011702458374202251, + "learning_rate": 1.779448011791063e-07, + "loss": 0.0005, + "step": 112860 + }, + { + "epoch": 1.8468461097930131, + "grad_norm": 0.08894037455320358, + "learning_rate": 1.7756744876334165e-07, + "loss": 0.0016, + "step": 112870 + }, + { + "epoch": 1.847009735744089, + "grad_norm": 0.07927866280078888, + "learning_rate": 1.771904896487675e-07, + "loss": 0.0007, + "step": 112880 + }, + { + "epoch": 1.8471733616951649, + "grad_norm": 0.05027158930897713, + "learning_rate": 1.7681392386612872e-07, + "loss": 0.0008, + "step": 112890 + }, + { + "epoch": 1.8473369876462407, + "grad_norm": 0.04608063027262688, + "learning_rate": 1.7643775144613628e-07, + "loss": 0.0011, + "step": 112900 + }, + { + "epoch": 1.8475006135973167, + "grad_norm": 0.05112637206912041, + "learning_rate": 1.760619724194701e-07, + "loss": 0.0013, + "step": 112910 + }, + { + "epoch": 1.8476642395483924, + "grad_norm": 0.03093136101961136, + "learning_rate": 1.756865868167762e-07, + "loss": 0.0008, + "step": 112920 + }, + { + "epoch": 1.8478278654994682, + "grad_norm": 0.01358999777585268, + "learning_rate": 1.753115946686712e-07, + "loss": 0.001, + "step": 112930 + }, + { + "epoch": 1.8479914914505442, + "grad_norm": 0.05658520385622978, + "learning_rate": 1.7493699600573734e-07, + "loss": 0.0008, + "step": 112940 + }, + { + "epoch": 1.8481551174016198, + "grad_norm": 0.00327929574996233, + "learning_rate": 1.7456279085852624e-07, + "loss": 0.0003, + "step": 112950 + }, + { + "epoch": 1.8483187433526957, + "grad_norm": 0.017410041764378548, + "learning_rate": 1.741889792575552e-07, + "loss": 0.0009, + "step": 112960 + }, + { + "epoch": 1.8484823693037717, + "grad_norm": 0.06320864707231522, + "learning_rate": 1.7381556123331366e-07, + "loss": 0.0006, + "step": 112970 + }, + { + "epoch": 1.8486459952548473, + "grad_norm": 0.10688390582799911, + "learning_rate": 1.7344253681625345e-07, + "loss": 0.0012, + "step": 112980 + }, + { + "epoch": 1.8488096212059233, + "grad_norm": 0.1625424474477768, + "learning_rate": 1.7306990603679964e-07, + "loss": 0.0006, + "step": 112990 + }, + { + "epoch": 1.848973247156999, + "grad_norm": 0.03767475113272667, + "learning_rate": 1.7269766892534124e-07, + "loss": 0.0006, + "step": 113000 + }, + { + "epoch": 1.8491368731080748, + "grad_norm": 0.06444856524467468, + "learning_rate": 1.723258255122373e-07, + "loss": 0.0016, + "step": 113010 + }, + { + "epoch": 1.8493004990591508, + "grad_norm": 0.012578886933624744, + "learning_rate": 1.7195437582781304e-07, + "loss": 0.0006, + "step": 113020 + }, + { + "epoch": 1.8494641250102266, + "grad_norm": 0.012308084405958652, + "learning_rate": 1.715833199023642e-07, + "loss": 0.0006, + "step": 113030 + }, + { + "epoch": 1.8496277509613024, + "grad_norm": 0.019648972898721695, + "learning_rate": 1.7121265776615214e-07, + "loss": 0.0004, + "step": 113040 + }, + { + "epoch": 1.8497913769123784, + "grad_norm": 0.04874153062701225, + "learning_rate": 1.7084238944940656e-07, + "loss": 0.0008, + "step": 113050 + }, + { + "epoch": 1.8499550028634542, + "grad_norm": 0.02440950646996498, + "learning_rate": 1.7047251498232553e-07, + "loss": 0.0009, + "step": 113060 + }, + { + "epoch": 1.85011862881453, + "grad_norm": 0.029458576813340187, + "learning_rate": 1.7010303439507492e-07, + "loss": 0.0007, + "step": 113070 + }, + { + "epoch": 1.850282254765606, + "grad_norm": 0.0833294466137886, + "learning_rate": 1.6973394771778784e-07, + "loss": 0.0008, + "step": 113080 + }, + { + "epoch": 1.8504458807166817, + "grad_norm": 0.07105235755443573, + "learning_rate": 1.6936525498056576e-07, + "loss": 0.0012, + "step": 113090 + }, + { + "epoch": 1.8506095066677575, + "grad_norm": 0.008253823965787888, + "learning_rate": 1.689969562134791e-07, + "loss": 0.0012, + "step": 113100 + }, + { + "epoch": 1.8507731326188335, + "grad_norm": 0.08622777462005615, + "learning_rate": 1.6862905144656328e-07, + "loss": 0.0011, + "step": 113110 + }, + { + "epoch": 1.850936758569909, + "grad_norm": 0.06377329677343369, + "learning_rate": 1.682615407098248e-07, + "loss": 0.0011, + "step": 113120 + }, + { + "epoch": 1.851100384520985, + "grad_norm": 0.07756470888853073, + "learning_rate": 1.678944240332353e-07, + "loss": 0.0008, + "step": 113130 + }, + { + "epoch": 1.851264010472061, + "grad_norm": 0.0046451096422970295, + "learning_rate": 1.6752770144673748e-07, + "loss": 0.0008, + "step": 113140 + }, + { + "epoch": 1.8514276364231366, + "grad_norm": 0.016653910279273987, + "learning_rate": 1.6716137298023793e-07, + "loss": 0.0006, + "step": 113150 + }, + { + "epoch": 1.8515912623742126, + "grad_norm": 0.011719155125319958, + "learning_rate": 1.6679543866361448e-07, + "loss": 0.0004, + "step": 113160 + }, + { + "epoch": 1.8517548883252883, + "grad_norm": 0.04671384394168854, + "learning_rate": 1.66429898526711e-07, + "loss": 0.001, + "step": 113170 + }, + { + "epoch": 1.8519185142763641, + "grad_norm": 0.012840651907026768, + "learning_rate": 1.6606475259933973e-07, + "loss": 0.0013, + "step": 113180 + }, + { + "epoch": 1.85208214022744, + "grad_norm": 0.02137523889541626, + "learning_rate": 1.657000009112808e-07, + "loss": 0.0003, + "step": 113190 + }, + { + "epoch": 1.8522457661785159, + "grad_norm": 0.08299165219068527, + "learning_rate": 1.6533564349228204e-07, + "loss": 0.0007, + "step": 113200 + }, + { + "epoch": 1.8524093921295917, + "grad_norm": 0.039249010384082794, + "learning_rate": 1.6497168037205912e-07, + "loss": 0.0007, + "step": 113210 + }, + { + "epoch": 1.8525730180806677, + "grad_norm": 0.09357035160064697, + "learning_rate": 1.646081115802961e-07, + "loss": 0.0007, + "step": 113220 + }, + { + "epoch": 1.8527366440317434, + "grad_norm": 0.15707960724830627, + "learning_rate": 1.642449371466437e-07, + "loss": 0.0013, + "step": 113230 + }, + { + "epoch": 1.8529002699828192, + "grad_norm": 0.004865937866270542, + "learning_rate": 1.638821571007221e-07, + "loss": 0.0013, + "step": 113240 + }, + { + "epoch": 1.8530638959338952, + "grad_norm": 0.0739060640335083, + "learning_rate": 1.6351977147211707e-07, + "loss": 0.001, + "step": 113250 + }, + { + "epoch": 1.853227521884971, + "grad_norm": 0.043929535895586014, + "learning_rate": 1.6315778029038498e-07, + "loss": 0.0007, + "step": 113260 + }, + { + "epoch": 1.8533911478360467, + "grad_norm": 0.024959269911050797, + "learning_rate": 1.6279618358504777e-07, + "loss": 0.0008, + "step": 113270 + }, + { + "epoch": 1.8535547737871227, + "grad_norm": 0.09419511258602142, + "learning_rate": 1.624349813855963e-07, + "loss": 0.0012, + "step": 113280 + }, + { + "epoch": 1.8537183997381985, + "grad_norm": 0.0299326553940773, + "learning_rate": 1.620741737214887e-07, + "loss": 0.001, + "step": 113290 + }, + { + "epoch": 1.8538820256892743, + "grad_norm": 0.23981264233589172, + "learning_rate": 1.6171376062215194e-07, + "loss": 0.0008, + "step": 113300 + }, + { + "epoch": 1.8540456516403503, + "grad_norm": 0.21001379191875458, + "learning_rate": 1.613537421169792e-07, + "loss": 0.001, + "step": 113310 + }, + { + "epoch": 1.8542092775914258, + "grad_norm": 0.0021135404240339994, + "learning_rate": 1.6099411823533307e-07, + "loss": 0.0007, + "step": 113320 + }, + { + "epoch": 1.8543729035425018, + "grad_norm": 0.029162239283323288, + "learning_rate": 1.6063488900654233e-07, + "loss": 0.001, + "step": 113330 + }, + { + "epoch": 1.8545365294935778, + "grad_norm": 0.06761214882135391, + "learning_rate": 1.6027605445990579e-07, + "loss": 0.0008, + "step": 113340 + }, + { + "epoch": 1.8547001554446534, + "grad_norm": 0.04418284446001053, + "learning_rate": 1.599176146246878e-07, + "loss": 0.0011, + "step": 113350 + }, + { + "epoch": 1.8548637813957294, + "grad_norm": 0.12860621511936188, + "learning_rate": 1.595595695301222e-07, + "loss": 0.0008, + "step": 113360 + }, + { + "epoch": 1.8550274073468052, + "grad_norm": 0.05377534031867981, + "learning_rate": 1.592019192054084e-07, + "loss": 0.0007, + "step": 113370 + }, + { + "epoch": 1.855191033297881, + "grad_norm": 0.011231674812734127, + "learning_rate": 1.588446636797175e-07, + "loss": 0.0004, + "step": 113380 + }, + { + "epoch": 1.855354659248957, + "grad_norm": 0.0036793563049286604, + "learning_rate": 1.5848780298218458e-07, + "loss": 0.0009, + "step": 113390 + }, + { + "epoch": 1.8555182852000327, + "grad_norm": 0.03789239376783371, + "learning_rate": 1.5813133714191352e-07, + "loss": 0.0003, + "step": 113400 + }, + { + "epoch": 1.8556819111511085, + "grad_norm": 0.03873893618583679, + "learning_rate": 1.577752661879778e-07, + "loss": 0.0004, + "step": 113410 + }, + { + "epoch": 1.8558455371021845, + "grad_norm": 0.043267808854579926, + "learning_rate": 1.5741959014941578e-07, + "loss": 0.0016, + "step": 113420 + }, + { + "epoch": 1.8560091630532602, + "grad_norm": 0.009245689027011395, + "learning_rate": 1.5706430905523708e-07, + "loss": 0.0008, + "step": 113430 + }, + { + "epoch": 1.856172789004336, + "grad_norm": 0.03791176527738571, + "learning_rate": 1.5670942293441572e-07, + "loss": 0.0011, + "step": 113440 + }, + { + "epoch": 1.856336414955412, + "grad_norm": 0.07527469098567963, + "learning_rate": 1.563549318158958e-07, + "loss": 0.0011, + "step": 113450 + }, + { + "epoch": 1.8565000409064878, + "grad_norm": 0.03289582580327988, + "learning_rate": 1.5600083572858805e-07, + "loss": 0.0004, + "step": 113460 + }, + { + "epoch": 1.8566636668575636, + "grad_norm": 0.08498571068048477, + "learning_rate": 1.556471347013716e-07, + "loss": 0.0008, + "step": 113470 + }, + { + "epoch": 1.8568272928086396, + "grad_norm": 0.026511432603001595, + "learning_rate": 1.552938287630923e-07, + "loss": 0.0011, + "step": 113480 + }, + { + "epoch": 1.8569909187597153, + "grad_norm": 0.018651818856596947, + "learning_rate": 1.5494091794256593e-07, + "loss": 0.001, + "step": 113490 + }, + { + "epoch": 1.857154544710791, + "grad_norm": 0.084585040807724, + "learning_rate": 1.545884022685734e-07, + "loss": 0.001, + "step": 113500 + }, + { + "epoch": 1.857318170661867, + "grad_norm": 0.01497381180524826, + "learning_rate": 1.542362817698656e-07, + "loss": 0.0006, + "step": 113510 + }, + { + "epoch": 1.8574817966129427, + "grad_norm": 0.005593966227024794, + "learning_rate": 1.538845564751601e-07, + "loss": 0.0009, + "step": 113520 + }, + { + "epoch": 1.8576454225640187, + "grad_norm": 0.037745069712400436, + "learning_rate": 1.5353322641314228e-07, + "loss": 0.0006, + "step": 113530 + }, + { + "epoch": 1.8578090485150947, + "grad_norm": 0.04144706949591637, + "learning_rate": 1.5318229161246477e-07, + "loss": 0.0005, + "step": 113540 + }, + { + "epoch": 1.8579726744661702, + "grad_norm": 0.01671553961932659, + "learning_rate": 1.5283175210174973e-07, + "loss": 0.001, + "step": 113550 + }, + { + "epoch": 1.8581363004172462, + "grad_norm": 0.01007680781185627, + "learning_rate": 1.5248160790958533e-07, + "loss": 0.0012, + "step": 113560 + }, + { + "epoch": 1.858299926368322, + "grad_norm": 0.04720287770032883, + "learning_rate": 1.5213185906452876e-07, + "loss": 0.0007, + "step": 113570 + }, + { + "epoch": 1.8584635523193977, + "grad_norm": 0.002298968145623803, + "learning_rate": 1.5178250559510333e-07, + "loss": 0.001, + "step": 113580 + }, + { + "epoch": 1.8586271782704737, + "grad_norm": 0.059410396963357925, + "learning_rate": 1.5143354752980233e-07, + "loss": 0.0004, + "step": 113590 + }, + { + "epoch": 1.8587908042215495, + "grad_norm": 0.061401356011629105, + "learning_rate": 1.5108498489708412e-07, + "loss": 0.001, + "step": 113600 + }, + { + "epoch": 1.8589544301726253, + "grad_norm": 0.028765061870217323, + "learning_rate": 1.5073681772537763e-07, + "loss": 0.0009, + "step": 113610 + }, + { + "epoch": 1.8591180561237013, + "grad_norm": 0.023047300055623055, + "learning_rate": 1.5038904604307736e-07, + "loss": 0.001, + "step": 113620 + }, + { + "epoch": 1.859281682074777, + "grad_norm": 0.046529725193977356, + "learning_rate": 1.500416698785473e-07, + "loss": 0.0007, + "step": 113630 + }, + { + "epoch": 1.8594453080258528, + "grad_norm": 0.018830517306923866, + "learning_rate": 1.49694689260117e-07, + "loss": 0.0007, + "step": 113640 + }, + { + "epoch": 1.8596089339769288, + "grad_norm": 0.028549816459417343, + "learning_rate": 1.4934810421608602e-07, + "loss": 0.0005, + "step": 113650 + }, + { + "epoch": 1.8597725599280046, + "grad_norm": 0.11096290498971939, + "learning_rate": 1.4900191477472014e-07, + "loss": 0.0012, + "step": 113660 + }, + { + "epoch": 1.8599361858790804, + "grad_norm": 0.09251801669597626, + "learning_rate": 1.486561209642534e-07, + "loss": 0.0006, + "step": 113670 + }, + { + "epoch": 1.8600998118301564, + "grad_norm": 0.02776389569044113, + "learning_rate": 1.483107228128877e-07, + "loss": 0.0008, + "step": 113680 + }, + { + "epoch": 1.8602634377812322, + "grad_norm": 0.03326091915369034, + "learning_rate": 1.479657203487933e-07, + "loss": 0.0008, + "step": 113690 + }, + { + "epoch": 1.860427063732308, + "grad_norm": 0.008021118119359016, + "learning_rate": 1.47621113600106e-07, + "loss": 0.0008, + "step": 113700 + }, + { + "epoch": 1.860590689683384, + "grad_norm": 0.011117805726826191, + "learning_rate": 1.4727690259493222e-07, + "loss": 0.0007, + "step": 113710 + }, + { + "epoch": 1.8607543156344595, + "grad_norm": 0.01644272170960903, + "learning_rate": 1.4693308736134336e-07, + "loss": 0.001, + "step": 113720 + }, + { + "epoch": 1.8609179415855355, + "grad_norm": 0.0393107607960701, + "learning_rate": 1.4658966792738038e-07, + "loss": 0.0009, + "step": 113730 + }, + { + "epoch": 1.8610815675366115, + "grad_norm": 0.03911082074046135, + "learning_rate": 1.462466443210514e-07, + "loss": 0.0009, + "step": 113740 + }, + { + "epoch": 1.861245193487687, + "grad_norm": 0.026783360168337822, + "learning_rate": 1.459040165703318e-07, + "loss": 0.001, + "step": 113750 + }, + { + "epoch": 1.861408819438763, + "grad_norm": 0.04758188873529434, + "learning_rate": 1.4556178470316652e-07, + "loss": 0.0006, + "step": 113760 + }, + { + "epoch": 1.8615724453898388, + "grad_norm": 0.07427937537431717, + "learning_rate": 1.452199487474648e-07, + "loss": 0.0014, + "step": 113770 + }, + { + "epoch": 1.8617360713409146, + "grad_norm": 0.06645882874727249, + "learning_rate": 1.448785087311072e-07, + "loss": 0.0007, + "step": 113780 + }, + { + "epoch": 1.8618996972919906, + "grad_norm": 0.03985673934221268, + "learning_rate": 1.4453746468193975e-07, + "loss": 0.0012, + "step": 113790 + }, + { + "epoch": 1.8620633232430663, + "grad_norm": 0.04946216195821762, + "learning_rate": 1.4419681662777685e-07, + "loss": 0.0019, + "step": 113800 + }, + { + "epoch": 1.8622269491941421, + "grad_norm": 0.0020157562103122473, + "learning_rate": 1.4385656459640074e-07, + "loss": 0.0007, + "step": 113810 + }, + { + "epoch": 1.862390575145218, + "grad_norm": 0.04747512564063072, + "learning_rate": 1.4351670861556145e-07, + "loss": 0.0007, + "step": 113820 + }, + { + "epoch": 1.8625542010962939, + "grad_norm": 0.037427473813295364, + "learning_rate": 1.4317724871297623e-07, + "loss": 0.0005, + "step": 113830 + }, + { + "epoch": 1.8627178270473697, + "grad_norm": 0.13379958271980286, + "learning_rate": 1.4283818491633017e-07, + "loss": 0.0009, + "step": 113840 + }, + { + "epoch": 1.8628814529984457, + "grad_norm": 0.035843439400196075, + "learning_rate": 1.4249951725327615e-07, + "loss": 0.0009, + "step": 113850 + }, + { + "epoch": 1.8630450789495214, + "grad_norm": 0.048810385167598724, + "learning_rate": 1.4216124575143486e-07, + "loss": 0.0004, + "step": 113860 + }, + { + "epoch": 1.8632087049005972, + "grad_norm": 0.02390442043542862, + "learning_rate": 1.4182337043839477e-07, + "loss": 0.0007, + "step": 113870 + }, + { + "epoch": 1.8633723308516732, + "grad_norm": 0.0021288383286446333, + "learning_rate": 1.4148589134171165e-07, + "loss": 0.0005, + "step": 113880 + }, + { + "epoch": 1.8635359568027487, + "grad_norm": 0.06789467483758926, + "learning_rate": 1.4114880848890843e-07, + "loss": 0.0007, + "step": 113890 + }, + { + "epoch": 1.8636995827538247, + "grad_norm": 0.028271473944187164, + "learning_rate": 1.4081212190747816e-07, + "loss": 0.0011, + "step": 113900 + }, + { + "epoch": 1.8638632087049007, + "grad_norm": 0.03492870554327965, + "learning_rate": 1.4047583162487832e-07, + "loss": 0.0006, + "step": 113910 + }, + { + "epoch": 1.8640268346559763, + "grad_norm": 0.003272128524258733, + "learning_rate": 1.401399376685364e-07, + "loss": 0.0016, + "step": 113920 + }, + { + "epoch": 1.8641904606070523, + "grad_norm": 0.08852490037679672, + "learning_rate": 1.3980444006584605e-07, + "loss": 0.0014, + "step": 113930 + }, + { + "epoch": 1.864354086558128, + "grad_norm": 0.027692250907421112, + "learning_rate": 1.3946933884417034e-07, + "loss": 0.0009, + "step": 113940 + }, + { + "epoch": 1.8645177125092038, + "grad_norm": 0.010999278165400028, + "learning_rate": 1.3913463403083748e-07, + "loss": 0.0009, + "step": 113950 + }, + { + "epoch": 1.8646813384602798, + "grad_norm": 0.03792864456772804, + "learning_rate": 1.3880032565314672e-07, + "loss": 0.0006, + "step": 113960 + }, + { + "epoch": 1.8648449644113556, + "grad_norm": 0.047340016812086105, + "learning_rate": 1.3846641373836122e-07, + "loss": 0.0007, + "step": 113970 + }, + { + "epoch": 1.8650085903624314, + "grad_norm": 0.09782253950834274, + "learning_rate": 1.381328983137159e-07, + "loss": 0.0008, + "step": 113980 + }, + { + "epoch": 1.8651722163135074, + "grad_norm": 0.0017703217454254627, + "learning_rate": 1.37799779406409e-07, + "loss": 0.0008, + "step": 113990 + }, + { + "epoch": 1.8653358422645832, + "grad_norm": 0.06902812421321869, + "learning_rate": 1.3746705704360985e-07, + "loss": 0.0009, + "step": 114000 + }, + { + "epoch": 1.865499468215659, + "grad_norm": 0.04585520923137665, + "learning_rate": 1.3713473125245348e-07, + "loss": 0.0012, + "step": 114010 + }, + { + "epoch": 1.865663094166735, + "grad_norm": 0.033851347863674164, + "learning_rate": 1.368028020600437e-07, + "loss": 0.0006, + "step": 114020 + }, + { + "epoch": 1.8658267201178107, + "grad_norm": 0.007088218349963427, + "learning_rate": 1.364712694934517e-07, + "loss": 0.0014, + "step": 114030 + }, + { + "epoch": 1.8659903460688865, + "grad_norm": 0.036072053015232086, + "learning_rate": 1.3614013357971523e-07, + "loss": 0.0008, + "step": 114040 + }, + { + "epoch": 1.8661539720199625, + "grad_norm": 0.06582151353359222, + "learning_rate": 1.3580939434584162e-07, + "loss": 0.0007, + "step": 114050 + }, + { + "epoch": 1.8663175979710382, + "grad_norm": 0.004526420496404171, + "learning_rate": 1.354790518188037e-07, + "loss": 0.001, + "step": 114060 + }, + { + "epoch": 1.866481223922114, + "grad_norm": 0.023701583966612816, + "learning_rate": 1.3514910602554442e-07, + "loss": 0.0008, + "step": 114070 + }, + { + "epoch": 1.86664484987319, + "grad_norm": 0.06362594664096832, + "learning_rate": 1.3481955699297221e-07, + "loss": 0.0008, + "step": 114080 + }, + { + "epoch": 1.8668084758242656, + "grad_norm": 0.03933916240930557, + "learning_rate": 1.3449040474796394e-07, + "loss": 0.0011, + "step": 114090 + }, + { + "epoch": 1.8669721017753416, + "grad_norm": 0.016460837796330452, + "learning_rate": 1.3416164931736475e-07, + "loss": 0.0013, + "step": 114100 + }, + { + "epoch": 1.8671357277264176, + "grad_norm": 0.04689618945121765, + "learning_rate": 1.3383329072798657e-07, + "loss": 0.0007, + "step": 114110 + }, + { + "epoch": 1.8672993536774931, + "grad_norm": 0.033416748046875, + "learning_rate": 1.335053290066085e-07, + "loss": 0.0005, + "step": 114120 + }, + { + "epoch": 1.867462979628569, + "grad_norm": 0.07599564641714096, + "learning_rate": 1.3317776417997864e-07, + "loss": 0.0009, + "step": 114130 + }, + { + "epoch": 1.8676266055796449, + "grad_norm": 0.03028174303472042, + "learning_rate": 1.3285059627481224e-07, + "loss": 0.0014, + "step": 114140 + }, + { + "epoch": 1.8677902315307207, + "grad_norm": 0.035100746899843216, + "learning_rate": 1.3252382531779184e-07, + "loss": 0.0007, + "step": 114150 + }, + { + "epoch": 1.8679538574817967, + "grad_norm": 0.011115369386970997, + "learning_rate": 1.3219745133556727e-07, + "loss": 0.0007, + "step": 114160 + }, + { + "epoch": 1.8681174834328724, + "grad_norm": 0.024122070521116257, + "learning_rate": 1.3187147435475723e-07, + "loss": 0.0015, + "step": 114170 + }, + { + "epoch": 1.8682811093839482, + "grad_norm": 0.025685006752610207, + "learning_rate": 1.3154589440194653e-07, + "loss": 0.0007, + "step": 114180 + }, + { + "epoch": 1.8684447353350242, + "grad_norm": 0.046651050448417664, + "learning_rate": 1.3122071150368898e-07, + "loss": 0.0009, + "step": 114190 + }, + { + "epoch": 1.8686083612861, + "grad_norm": 0.014678443782031536, + "learning_rate": 1.3089592568650499e-07, + "loss": 0.0009, + "step": 114200 + }, + { + "epoch": 1.8687719872371757, + "grad_norm": 0.014545261859893799, + "learning_rate": 1.3057153697688397e-07, + "loss": 0.0007, + "step": 114210 + }, + { + "epoch": 1.8689356131882517, + "grad_norm": 0.037526004016399384, + "learning_rate": 1.3024754540128025e-07, + "loss": 0.0011, + "step": 114220 + }, + { + "epoch": 1.8690992391393275, + "grad_norm": 0.053323958069086075, + "learning_rate": 1.2992395098611888e-07, + "loss": 0.0008, + "step": 114230 + }, + { + "epoch": 1.8692628650904033, + "grad_norm": 0.07826829701662064, + "learning_rate": 1.2960075375779035e-07, + "loss": 0.0012, + "step": 114240 + }, + { + "epoch": 1.8694264910414793, + "grad_norm": 0.021955395117402077, + "learning_rate": 1.2927795374265418e-07, + "loss": 0.0007, + "step": 114250 + }, + { + "epoch": 1.869590116992555, + "grad_norm": 0.043120529502630234, + "learning_rate": 1.2895555096703593e-07, + "loss": 0.0007, + "step": 114260 + }, + { + "epoch": 1.8697537429436308, + "grad_norm": 0.10827703028917313, + "learning_rate": 1.2863354545723072e-07, + "loss": 0.0015, + "step": 114270 + }, + { + "epoch": 1.8699173688947068, + "grad_norm": 0.0020883185788989067, + "learning_rate": 1.2831193723949976e-07, + "loss": 0.0004, + "step": 114280 + }, + { + "epoch": 1.8700809948457824, + "grad_norm": 0.038506872951984406, + "learning_rate": 1.2799072634007204e-07, + "loss": 0.0017, + "step": 114290 + }, + { + "epoch": 1.8702446207968584, + "grad_norm": 0.0020213236566632986, + "learning_rate": 1.276699127851444e-07, + "loss": 0.0009, + "step": 114300 + }, + { + "epoch": 1.8704082467479344, + "grad_norm": 0.00257762148976326, + "learning_rate": 1.273494966008826e-07, + "loss": 0.0007, + "step": 114310 + }, + { + "epoch": 1.87057187269901, + "grad_norm": 0.0471498966217041, + "learning_rate": 1.2702947781341622e-07, + "loss": 0.0023, + "step": 114320 + }, + { + "epoch": 1.870735498650086, + "grad_norm": 0.045030951499938965, + "learning_rate": 1.2670985644884782e-07, + "loss": 0.0007, + "step": 114330 + }, + { + "epoch": 1.8708991246011617, + "grad_norm": 0.0077773695811629295, + "learning_rate": 1.2639063253324202e-07, + "loss": 0.0009, + "step": 114340 + }, + { + "epoch": 1.8710627505522375, + "grad_norm": 0.07448321580886841, + "learning_rate": 1.2607180609263526e-07, + "loss": 0.0007, + "step": 114350 + }, + { + "epoch": 1.8712263765033135, + "grad_norm": 0.044390615075826645, + "learning_rate": 1.257533771530295e-07, + "loss": 0.0005, + "step": 114360 + }, + { + "epoch": 1.8713900024543892, + "grad_norm": 0.04984167963266373, + "learning_rate": 1.254353457403945e-07, + "loss": 0.0009, + "step": 114370 + }, + { + "epoch": 1.871553628405465, + "grad_norm": 0.050430797040462494, + "learning_rate": 1.2511771188066845e-07, + "loss": 0.0009, + "step": 114380 + }, + { + "epoch": 1.871717254356541, + "grad_norm": 0.10092493146657944, + "learning_rate": 1.248004755997556e-07, + "loss": 0.0013, + "step": 114390 + }, + { + "epoch": 1.8718808803076168, + "grad_norm": 0.03787825629115105, + "learning_rate": 1.2448363692352915e-07, + "loss": 0.0006, + "step": 114400 + }, + { + "epoch": 1.8720445062586926, + "grad_norm": 0.24682794511318207, + "learning_rate": 1.24167195877829e-07, + "loss": 0.0011, + "step": 114410 + }, + { + "epoch": 1.8722081322097686, + "grad_norm": 0.08855167031288147, + "learning_rate": 1.238511524884639e-07, + "loss": 0.0013, + "step": 114420 + }, + { + "epoch": 1.8723717581608443, + "grad_norm": 0.08572986721992493, + "learning_rate": 1.2353550678120828e-07, + "loss": 0.0011, + "step": 114430 + }, + { + "epoch": 1.87253538411192, + "grad_norm": 0.003400080371648073, + "learning_rate": 1.2322025878180598e-07, + "loss": 0.0003, + "step": 114440 + }, + { + "epoch": 1.872699010062996, + "grad_norm": 0.03428833559155464, + "learning_rate": 1.22905408515967e-07, + "loss": 0.0024, + "step": 114450 + }, + { + "epoch": 1.8728626360140719, + "grad_norm": 0.04005441442131996, + "learning_rate": 1.2259095600936966e-07, + "loss": 0.0008, + "step": 114460 + }, + { + "epoch": 1.8730262619651477, + "grad_norm": 0.03419559821486473, + "learning_rate": 1.2227690128765902e-07, + "loss": 0.0008, + "step": 114470 + }, + { + "epoch": 1.8731898879162237, + "grad_norm": 0.03828272223472595, + "learning_rate": 1.2196324437644958e-07, + "loss": 0.0019, + "step": 114480 + }, + { + "epoch": 1.8733535138672992, + "grad_norm": 0.08481821417808533, + "learning_rate": 1.2164998530132089e-07, + "loss": 0.001, + "step": 114490 + }, + { + "epoch": 1.8735171398183752, + "grad_norm": 0.06506198644638062, + "learning_rate": 1.213371240878225e-07, + "loss": 0.0043, + "step": 114500 + }, + { + "epoch": 1.8736807657694512, + "grad_norm": 0.03218720108270645, + "learning_rate": 1.21024660761469e-07, + "loss": 0.0004, + "step": 114510 + }, + { + "epoch": 1.8738443917205267, + "grad_norm": 0.020112505182623863, + "learning_rate": 1.207125953477445e-07, + "loss": 0.0006, + "step": 114520 + }, + { + "epoch": 1.8740080176716027, + "grad_norm": 0.005319379270076752, + "learning_rate": 1.2040092787210022e-07, + "loss": 0.0007, + "step": 114530 + }, + { + "epoch": 1.8741716436226785, + "grad_norm": 0.11431462317705154, + "learning_rate": 1.2008965835995424e-07, + "loss": 0.0024, + "step": 114540 + }, + { + "epoch": 1.8743352695737543, + "grad_norm": 0.010235359892249107, + "learning_rate": 1.1977878683669287e-07, + "loss": 0.0016, + "step": 114550 + }, + { + "epoch": 1.8744988955248303, + "grad_norm": 0.03882668539881706, + "learning_rate": 1.1946831332766974e-07, + "loss": 0.0008, + "step": 114560 + }, + { + "epoch": 1.874662521475906, + "grad_norm": 0.02070467360317707, + "learning_rate": 1.1915823785820568e-07, + "loss": 0.0007, + "step": 114570 + }, + { + "epoch": 1.8748261474269818, + "grad_norm": 0.010153586976230145, + "learning_rate": 1.188485604535905e-07, + "loss": 0.0008, + "step": 114580 + }, + { + "epoch": 1.8749897733780578, + "grad_norm": 0.03179711848497391, + "learning_rate": 1.1853928113907842e-07, + "loss": 0.0008, + "step": 114590 + }, + { + "epoch": 1.8751533993291336, + "grad_norm": 0.06430523842573166, + "learning_rate": 1.1823039993989538e-07, + "loss": 0.0008, + "step": 114600 + }, + { + "epoch": 1.8753170252802094, + "grad_norm": 0.059512339532375336, + "learning_rate": 1.1792191688123123e-07, + "loss": 0.001, + "step": 114610 + }, + { + "epoch": 1.8754806512312854, + "grad_norm": 0.0408201701939106, + "learning_rate": 1.1761383198824583e-07, + "loss": 0.001, + "step": 114620 + }, + { + "epoch": 1.8756442771823612, + "grad_norm": 0.002308931201696396, + "learning_rate": 1.173061452860641e-07, + "loss": 0.001, + "step": 114630 + }, + { + "epoch": 1.875807903133437, + "grad_norm": 0.06227466091513634, + "learning_rate": 1.1699885679978151e-07, + "loss": 0.0006, + "step": 114640 + }, + { + "epoch": 1.875971529084513, + "grad_norm": 0.0017069252207875252, + "learning_rate": 1.1669196655445858e-07, + "loss": 0.0005, + "step": 114650 + }, + { + "epoch": 1.8761351550355887, + "grad_norm": 0.009412968531250954, + "learning_rate": 1.1638547457512417e-07, + "loss": 0.0004, + "step": 114660 + }, + { + "epoch": 1.8762987809866645, + "grad_norm": 0.055103521794080734, + "learning_rate": 1.1607938088677494e-07, + "loss": 0.0004, + "step": 114670 + }, + { + "epoch": 1.8764624069377405, + "grad_norm": 0.0183147881180048, + "learning_rate": 1.157736855143754e-07, + "loss": 0.0008, + "step": 114680 + }, + { + "epoch": 1.876626032888816, + "grad_norm": 0.03809869661927223, + "learning_rate": 1.1546838848285668e-07, + "loss": 0.0009, + "step": 114690 + }, + { + "epoch": 1.876789658839892, + "grad_norm": 0.06347623467445374, + "learning_rate": 1.1516348981711667e-07, + "loss": 0.0011, + "step": 114700 + }, + { + "epoch": 1.876953284790968, + "grad_norm": 0.07838743180036545, + "learning_rate": 1.1485898954202323e-07, + "loss": 0.001, + "step": 114710 + }, + { + "epoch": 1.8771169107420436, + "grad_norm": 0.02584925852715969, + "learning_rate": 1.1455488768240985e-07, + "loss": 0.0007, + "step": 114720 + }, + { + "epoch": 1.8772805366931196, + "grad_norm": 0.034247543662786484, + "learning_rate": 1.1425118426307835e-07, + "loss": 0.0006, + "step": 114730 + }, + { + "epoch": 1.8774441626441953, + "grad_norm": 0.01696961000561714, + "learning_rate": 1.1394787930879725e-07, + "loss": 0.0007, + "step": 114740 + }, + { + "epoch": 1.877607788595271, + "grad_norm": 0.027281509712338448, + "learning_rate": 1.1364497284430342e-07, + "loss": 0.0008, + "step": 114750 + }, + { + "epoch": 1.877771414546347, + "grad_norm": 0.019620731472969055, + "learning_rate": 1.1334246489430046e-07, + "loss": 0.0004, + "step": 114760 + }, + { + "epoch": 1.8779350404974229, + "grad_norm": 0.16902285814285278, + "learning_rate": 1.1304035548346081e-07, + "loss": 0.0015, + "step": 114770 + }, + { + "epoch": 1.8780986664484987, + "grad_norm": 0.028478844091296196, + "learning_rate": 1.1273864463642259e-07, + "loss": 0.0009, + "step": 114780 + }, + { + "epoch": 1.8782622923995747, + "grad_norm": 0.04589986428618431, + "learning_rate": 1.1243733237779275e-07, + "loss": 0.0008, + "step": 114790 + }, + { + "epoch": 1.8784259183506504, + "grad_norm": 0.08585745096206665, + "learning_rate": 1.1213641873214443e-07, + "loss": 0.001, + "step": 114800 + }, + { + "epoch": 1.8785895443017262, + "grad_norm": 0.20149028301239014, + "learning_rate": 1.1183590372402075e-07, + "loss": 0.0013, + "step": 114810 + }, + { + "epoch": 1.8787531702528022, + "grad_norm": 0.01713499426841736, + "learning_rate": 1.1153578737792937e-07, + "loss": 0.0007, + "step": 114820 + }, + { + "epoch": 1.878916796203878, + "grad_norm": 0.05463026463985443, + "learning_rate": 1.1123606971834732e-07, + "loss": 0.0006, + "step": 114830 + }, + { + "epoch": 1.8790804221549537, + "grad_norm": 0.014157851226627827, + "learning_rate": 1.1093675076971844e-07, + "loss": 0.0009, + "step": 114840 + }, + { + "epoch": 1.8792440481060297, + "grad_norm": 0.057343583554029465, + "learning_rate": 1.1063783055645428e-07, + "loss": 0.0011, + "step": 114850 + }, + { + "epoch": 1.8794076740571053, + "grad_norm": 0.025464218109846115, + "learning_rate": 1.1033930910293311e-07, + "loss": 0.0008, + "step": 114860 + }, + { + "epoch": 1.8795713000081813, + "grad_norm": 0.25258785486221313, + "learning_rate": 1.1004118643350214e-07, + "loss": 0.0017, + "step": 114870 + }, + { + "epoch": 1.8797349259592573, + "grad_norm": 0.01316542737185955, + "learning_rate": 1.097434625724747e-07, + "loss": 0.0011, + "step": 114880 + }, + { + "epoch": 1.8798985519103328, + "grad_norm": 0.01584048941731453, + "learning_rate": 1.0944613754413302e-07, + "loss": 0.0006, + "step": 114890 + }, + { + "epoch": 1.8800621778614088, + "grad_norm": 0.0012790379114449024, + "learning_rate": 1.0914921137272439e-07, + "loss": 0.0006, + "step": 114900 + }, + { + "epoch": 1.8802258038124846, + "grad_norm": 0.003282808233052492, + "learning_rate": 1.0885268408246663e-07, + "loss": 0.0005, + "step": 114910 + }, + { + "epoch": 1.8803894297635604, + "grad_norm": 0.017884593456983566, + "learning_rate": 1.0855655569754264e-07, + "loss": 0.0004, + "step": 114920 + }, + { + "epoch": 1.8805530557146364, + "grad_norm": 0.10842547565698624, + "learning_rate": 1.0826082624210421e-07, + "loss": 0.0007, + "step": 114930 + }, + { + "epoch": 1.8807166816657122, + "grad_norm": 0.08682433515787125, + "learning_rate": 1.0796549574026872e-07, + "loss": 0.0015, + "step": 114940 + }, + { + "epoch": 1.880880307616788, + "grad_norm": 0.02029728889465332, + "learning_rate": 1.076705642161241e-07, + "loss": 0.0007, + "step": 114950 + }, + { + "epoch": 1.881043933567864, + "grad_norm": 0.0072782267816364765, + "learning_rate": 1.0737603169372279e-07, + "loss": 0.0003, + "step": 114960 + }, + { + "epoch": 1.8812075595189397, + "grad_norm": 0.06822889298200607, + "learning_rate": 1.0708189819708614e-07, + "loss": 0.0006, + "step": 114970 + }, + { + "epoch": 1.8813711854700155, + "grad_norm": 0.010652842000126839, + "learning_rate": 1.0678816375020273e-07, + "loss": 0.0009, + "step": 114980 + }, + { + "epoch": 1.8815348114210915, + "grad_norm": 0.02239980548620224, + "learning_rate": 1.0649482837702896e-07, + "loss": 0.0008, + "step": 114990 + }, + { + "epoch": 1.8816984373721672, + "grad_norm": 0.05502698943018913, + "learning_rate": 1.0620189210148735e-07, + "loss": 0.0006, + "step": 115000 + }, + { + "epoch": 1.881862063323243, + "grad_norm": 0.011067809537053108, + "learning_rate": 1.0590935494746935e-07, + "loss": 0.0006, + "step": 115010 + }, + { + "epoch": 1.882025689274319, + "grad_norm": 0.008618976920843124, + "learning_rate": 1.0561721693883364e-07, + "loss": 0.0014, + "step": 115020 + }, + { + "epoch": 1.8821893152253948, + "grad_norm": 0.023923030123114586, + "learning_rate": 1.0532547809940507e-07, + "loss": 0.0008, + "step": 115030 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.006676288787275553, + "learning_rate": 1.0503413845297739e-07, + "loss": 0.001, + "step": 115040 + }, + { + "epoch": 1.8825165671275466, + "grad_norm": 0.017794566228985786, + "learning_rate": 1.0474319802331157e-07, + "loss": 0.0006, + "step": 115050 + }, + { + "epoch": 1.882680193078622, + "grad_norm": 0.0058709969744086266, + "learning_rate": 1.0445265683413531e-07, + "loss": 0.0012, + "step": 115060 + }, + { + "epoch": 1.882843819029698, + "grad_norm": 0.08871816098690033, + "learning_rate": 1.0416251490914409e-07, + "loss": 0.0023, + "step": 115070 + }, + { + "epoch": 1.883007444980774, + "grad_norm": 0.0012904698960483074, + "learning_rate": 1.0387277227200176e-07, + "loss": 0.0003, + "step": 115080 + }, + { + "epoch": 1.8831710709318497, + "grad_norm": 0.06076951324939728, + "learning_rate": 1.0358342894633722e-07, + "loss": 0.0005, + "step": 115090 + }, + { + "epoch": 1.8833346968829257, + "grad_norm": 0.04893820360302925, + "learning_rate": 1.0329448495574934e-07, + "loss": 0.0013, + "step": 115100 + }, + { + "epoch": 1.8834983228340014, + "grad_norm": 0.07313743978738785, + "learning_rate": 1.0300594032380318e-07, + "loss": 0.0015, + "step": 115110 + }, + { + "epoch": 1.8836619487850772, + "grad_norm": 0.017988761886954308, + "learning_rate": 1.0271779507403213e-07, + "loss": 0.001, + "step": 115120 + }, + { + "epoch": 1.8838255747361532, + "grad_norm": 0.008396394550800323, + "learning_rate": 1.0243004922993516e-07, + "loss": 0.0008, + "step": 115130 + }, + { + "epoch": 1.883989200687229, + "grad_norm": 0.04588119313120842, + "learning_rate": 1.0214270281498074e-07, + "loss": 0.0014, + "step": 115140 + }, + { + "epoch": 1.8841528266383047, + "grad_norm": 0.0762842521071434, + "learning_rate": 1.0185575585260288e-07, + "loss": 0.001, + "step": 115150 + }, + { + "epoch": 1.8843164525893807, + "grad_norm": 0.012233969755470753, + "learning_rate": 1.0156920836620566e-07, + "loss": 0.0007, + "step": 115160 + }, + { + "epoch": 1.8844800785404565, + "grad_norm": 0.04828006029129028, + "learning_rate": 1.0128306037915703e-07, + "loss": 0.001, + "step": 115170 + }, + { + "epoch": 1.8846437044915323, + "grad_norm": 0.013860705308616161, + "learning_rate": 1.0099731191479556e-07, + "loss": 0.0003, + "step": 115180 + }, + { + "epoch": 1.8848073304426083, + "grad_norm": 0.046628449112176895, + "learning_rate": 1.0071196299642483e-07, + "loss": 0.0005, + "step": 115190 + }, + { + "epoch": 1.884970956393684, + "grad_norm": 0.033884722739458084, + "learning_rate": 1.0042701364731844e-07, + "loss": 0.0008, + "step": 115200 + }, + { + "epoch": 1.8851345823447598, + "grad_norm": 0.028205636888742447, + "learning_rate": 1.0014246389071447e-07, + "loss": 0.0013, + "step": 115210 + }, + { + "epoch": 1.8852982082958358, + "grad_norm": 0.024718027561903, + "learning_rate": 9.985831374982102e-08, + "loss": 0.0017, + "step": 115220 + }, + { + "epoch": 1.8854618342469116, + "grad_norm": 0.10282206535339355, + "learning_rate": 9.957456324781123e-08, + "loss": 0.0013, + "step": 115230 + }, + { + "epoch": 1.8856254601979874, + "grad_norm": 0.05735069140791893, + "learning_rate": 9.929121240782769e-08, + "loss": 0.0012, + "step": 115240 + }, + { + "epoch": 1.8857890861490634, + "grad_norm": 0.07045166194438934, + "learning_rate": 9.900826125297858e-08, + "loss": 0.001, + "step": 115250 + }, + { + "epoch": 1.885952712100139, + "grad_norm": 0.012375889346003532, + "learning_rate": 9.87257098063421e-08, + "loss": 0.0012, + "step": 115260 + }, + { + "epoch": 1.886116338051215, + "grad_norm": 0.056508224457502365, + "learning_rate": 9.84435580909604e-08, + "loss": 0.001, + "step": 115270 + }, + { + "epoch": 1.886279964002291, + "grad_norm": 0.02302614040672779, + "learning_rate": 9.816180612984561e-08, + "loss": 0.0006, + "step": 115280 + }, + { + "epoch": 1.8864435899533665, + "grad_norm": 0.027453452348709106, + "learning_rate": 9.788045394597656e-08, + "loss": 0.0019, + "step": 115290 + }, + { + "epoch": 1.8866072159044425, + "grad_norm": 0.00518826674669981, + "learning_rate": 9.759950156229936e-08, + "loss": 0.0007, + "step": 115300 + }, + { + "epoch": 1.8867708418555182, + "grad_norm": 0.02175845205783844, + "learning_rate": 9.731894900172734e-08, + "loss": 0.0007, + "step": 115310 + }, + { + "epoch": 1.886934467806594, + "grad_norm": 0.03555656969547272, + "learning_rate": 9.703879628714163e-08, + "loss": 0.0009, + "step": 115320 + }, + { + "epoch": 1.88709809375767, + "grad_norm": 0.011384014040231705, + "learning_rate": 9.675904344139008e-08, + "loss": 0.0009, + "step": 115330 + }, + { + "epoch": 1.8872617197087458, + "grad_norm": 0.017740968614816666, + "learning_rate": 9.647969048728888e-08, + "loss": 0.0005, + "step": 115340 + }, + { + "epoch": 1.8874253456598216, + "grad_norm": 0.019332095980644226, + "learning_rate": 9.620073744762093e-08, + "loss": 0.0009, + "step": 115350 + }, + { + "epoch": 1.8875889716108976, + "grad_norm": 0.004902615677565336, + "learning_rate": 9.592218434513634e-08, + "loss": 0.0005, + "step": 115360 + }, + { + "epoch": 1.8877525975619733, + "grad_norm": 0.03447529673576355, + "learning_rate": 9.564403120255361e-08, + "loss": 0.0007, + "step": 115370 + }, + { + "epoch": 1.887916223513049, + "grad_norm": 0.06368916481733322, + "learning_rate": 9.536627804255738e-08, + "loss": 0.0006, + "step": 115380 + }, + { + "epoch": 1.888079849464125, + "grad_norm": 0.01854928396642208, + "learning_rate": 9.508892488780064e-08, + "loss": 0.0007, + "step": 115390 + }, + { + "epoch": 1.8882434754152009, + "grad_norm": 0.09362687170505524, + "learning_rate": 9.481197176090307e-08, + "loss": 0.0012, + "step": 115400 + }, + { + "epoch": 1.8884071013662767, + "grad_norm": 0.01557767204940319, + "learning_rate": 9.45354186844527e-08, + "loss": 0.0008, + "step": 115410 + }, + { + "epoch": 1.8885707273173526, + "grad_norm": 0.01166441012173891, + "learning_rate": 9.425926568100319e-08, + "loss": 0.0004, + "step": 115420 + }, + { + "epoch": 1.8887343532684284, + "grad_norm": 0.10664310306310654, + "learning_rate": 9.398351277307761e-08, + "loss": 0.001, + "step": 115430 + }, + { + "epoch": 1.8888979792195042, + "grad_norm": 0.045520734041929245, + "learning_rate": 9.370815998316463e-08, + "loss": 0.0012, + "step": 115440 + }, + { + "epoch": 1.8890616051705802, + "grad_norm": 0.003025609999895096, + "learning_rate": 9.343320733372186e-08, + "loss": 0.0004, + "step": 115450 + }, + { + "epoch": 1.8892252311216557, + "grad_norm": 0.0662052258849144, + "learning_rate": 9.315865484717245e-08, + "loss": 0.0008, + "step": 115460 + }, + { + "epoch": 1.8893888570727317, + "grad_norm": 0.05222895368933678, + "learning_rate": 9.288450254590909e-08, + "loss": 0.0004, + "step": 115470 + }, + { + "epoch": 1.8895524830238077, + "grad_norm": 0.03883333504199982, + "learning_rate": 9.261075045229051e-08, + "loss": 0.0011, + "step": 115480 + }, + { + "epoch": 1.8897161089748833, + "grad_norm": 0.05243481695652008, + "learning_rate": 9.233739858864277e-08, + "loss": 0.0017, + "step": 115490 + }, + { + "epoch": 1.8898797349259593, + "grad_norm": 0.029051413759589195, + "learning_rate": 9.206444697725858e-08, + "loss": 0.0007, + "step": 115500 + }, + { + "epoch": 1.890043360877035, + "grad_norm": 0.0033195551950484514, + "learning_rate": 9.17918956404007e-08, + "loss": 0.0006, + "step": 115510 + }, + { + "epoch": 1.8902069868281108, + "grad_norm": 0.03224264830350876, + "learning_rate": 9.151974460029634e-08, + "loss": 0.0006, + "step": 115520 + }, + { + "epoch": 1.8903706127791868, + "grad_norm": 0.014638724736869335, + "learning_rate": 9.124799387914162e-08, + "loss": 0.0006, + "step": 115530 + }, + { + "epoch": 1.8905342387302626, + "grad_norm": 0.0022072468418627977, + "learning_rate": 9.09766434990994e-08, + "loss": 0.0007, + "step": 115540 + }, + { + "epoch": 1.8906978646813384, + "grad_norm": 0.03740597516298294, + "learning_rate": 9.070569348230085e-08, + "loss": 0.0007, + "step": 115550 + }, + { + "epoch": 1.8908614906324144, + "grad_norm": 0.07786998152732849, + "learning_rate": 9.043514385084218e-08, + "loss": 0.0013, + "step": 115560 + }, + { + "epoch": 1.8910251165834902, + "grad_norm": 0.009632066823542118, + "learning_rate": 9.016499462679019e-08, + "loss": 0.0006, + "step": 115570 + }, + { + "epoch": 1.891188742534566, + "grad_norm": 0.010564597323536873, + "learning_rate": 8.989524583217557e-08, + "loss": 0.0005, + "step": 115580 + }, + { + "epoch": 1.891352368485642, + "grad_norm": 0.02836195006966591, + "learning_rate": 8.962589748899963e-08, + "loss": 0.0018, + "step": 115590 + }, + { + "epoch": 1.8915159944367177, + "grad_norm": 0.07581872493028641, + "learning_rate": 8.935694961922869e-08, + "loss": 0.0011, + "step": 115600 + }, + { + "epoch": 1.8916796203877935, + "grad_norm": 0.023821106180548668, + "learning_rate": 8.908840224479797e-08, + "loss": 0.0007, + "step": 115610 + }, + { + "epoch": 1.8918432463388695, + "grad_norm": 0.07170823216438293, + "learning_rate": 8.88202553876083e-08, + "loss": 0.0013, + "step": 115620 + }, + { + "epoch": 1.8920068722899452, + "grad_norm": 0.08178595453500748, + "learning_rate": 8.855250906952995e-08, + "loss": 0.0007, + "step": 115630 + }, + { + "epoch": 1.892170498241021, + "grad_norm": 0.12656652927398682, + "learning_rate": 8.828516331239823e-08, + "loss": 0.0014, + "step": 115640 + }, + { + "epoch": 1.892334124192097, + "grad_norm": 0.1051178127527237, + "learning_rate": 8.801821813801792e-08, + "loss": 0.0009, + "step": 115650 + }, + { + "epoch": 1.8924977501431726, + "grad_norm": 0.06637538969516754, + "learning_rate": 8.775167356815939e-08, + "loss": 0.0011, + "step": 115660 + }, + { + "epoch": 1.8926613760942486, + "grad_norm": 0.01222996599972248, + "learning_rate": 8.748552962456136e-08, + "loss": 0.0004, + "step": 115670 + }, + { + "epoch": 1.8928250020453243, + "grad_norm": 0.02322150580585003, + "learning_rate": 8.721978632893036e-08, + "loss": 0.0009, + "step": 115680 + }, + { + "epoch": 1.8929886279964, + "grad_norm": 0.0872635468840599, + "learning_rate": 8.695444370293793e-08, + "loss": 0.0007, + "step": 115690 + }, + { + "epoch": 1.893152253947476, + "grad_norm": 0.03089163452386856, + "learning_rate": 8.668950176822621e-08, + "loss": 0.0005, + "step": 115700 + }, + { + "epoch": 1.8933158798985519, + "grad_norm": 0.001216677948832512, + "learning_rate": 8.642496054640181e-08, + "loss": 0.0007, + "step": 115710 + }, + { + "epoch": 1.8934795058496277, + "grad_norm": 0.009266712702810764, + "learning_rate": 8.616082005904026e-08, + "loss": 0.0013, + "step": 115720 + }, + { + "epoch": 1.8936431318007036, + "grad_norm": 0.30547383427619934, + "learning_rate": 8.589708032768374e-08, + "loss": 0.0012, + "step": 115730 + }, + { + "epoch": 1.8938067577517794, + "grad_norm": 0.02439533919095993, + "learning_rate": 8.563374137384283e-08, + "loss": 0.0011, + "step": 115740 + }, + { + "epoch": 1.8939703837028552, + "grad_norm": 0.034946080297231674, + "learning_rate": 8.537080321899316e-08, + "loss": 0.0005, + "step": 115750 + }, + { + "epoch": 1.8941340096539312, + "grad_norm": 0.03289042040705681, + "learning_rate": 8.510826588458032e-08, + "loss": 0.0006, + "step": 115760 + }, + { + "epoch": 1.894297635605007, + "grad_norm": 0.06040625274181366, + "learning_rate": 8.484612939201441e-08, + "loss": 0.0009, + "step": 115770 + }, + { + "epoch": 1.8944612615560827, + "grad_norm": 0.23672960698604584, + "learning_rate": 8.458439376267669e-08, + "loss": 0.0005, + "step": 115780 + }, + { + "epoch": 1.8946248875071587, + "grad_norm": 0.055471859872341156, + "learning_rate": 8.432305901791116e-08, + "loss": 0.0011, + "step": 115790 + }, + { + "epoch": 1.8947885134582345, + "grad_norm": 0.06190602108836174, + "learning_rate": 8.406212517903245e-08, + "loss": 0.0008, + "step": 115800 + }, + { + "epoch": 1.8949521394093103, + "grad_norm": 0.012343844398856163, + "learning_rate": 8.380159226732132e-08, + "loss": 0.0006, + "step": 115810 + }, + { + "epoch": 1.8951157653603863, + "grad_norm": 0.014734986238181591, + "learning_rate": 8.354146030402688e-08, + "loss": 0.0008, + "step": 115820 + }, + { + "epoch": 1.8952793913114618, + "grad_norm": 0.08109033852815628, + "learning_rate": 8.328172931036272e-08, + "loss": 0.0005, + "step": 115830 + }, + { + "epoch": 1.8954430172625378, + "grad_norm": 0.01399065088480711, + "learning_rate": 8.302239930751299e-08, + "loss": 0.0007, + "step": 115840 + }, + { + "epoch": 1.8956066432136138, + "grad_norm": 0.056841157376766205, + "learning_rate": 8.276347031662691e-08, + "loss": 0.0008, + "step": 115850 + }, + { + "epoch": 1.8957702691646894, + "grad_norm": 0.056089065968990326, + "learning_rate": 8.250494235882311e-08, + "loss": 0.0012, + "step": 115860 + }, + { + "epoch": 1.8959338951157654, + "grad_norm": 0.0776858851313591, + "learning_rate": 8.224681545518476e-08, + "loss": 0.0014, + "step": 115870 + }, + { + "epoch": 1.8960975210668412, + "grad_norm": 0.10620084404945374, + "learning_rate": 8.198908962676499e-08, + "loss": 0.0006, + "step": 115880 + }, + { + "epoch": 1.896261147017917, + "grad_norm": 0.0258939191699028, + "learning_rate": 8.173176489458201e-08, + "loss": 0.0007, + "step": 115890 + }, + { + "epoch": 1.896424772968993, + "grad_norm": 0.05787687748670578, + "learning_rate": 8.147484127962347e-08, + "loss": 0.0011, + "step": 115900 + }, + { + "epoch": 1.8965883989200687, + "grad_norm": 0.012510452419519424, + "learning_rate": 8.121831880284203e-08, + "loss": 0.0009, + "step": 115910 + }, + { + "epoch": 1.8967520248711445, + "grad_norm": 0.1753036379814148, + "learning_rate": 8.096219748515987e-08, + "loss": 0.0022, + "step": 115920 + }, + { + "epoch": 1.8969156508222205, + "grad_norm": 0.027009334415197372, + "learning_rate": 8.070647734746473e-08, + "loss": 0.0008, + "step": 115930 + }, + { + "epoch": 1.8970792767732962, + "grad_norm": 0.022676708176732063, + "learning_rate": 8.045115841061268e-08, + "loss": 0.0007, + "step": 115940 + }, + { + "epoch": 1.897242902724372, + "grad_norm": 0.0455305278301239, + "learning_rate": 8.019624069542541e-08, + "loss": 0.0006, + "step": 115950 + }, + { + "epoch": 1.897406528675448, + "grad_norm": 0.10886897891759872, + "learning_rate": 7.99417242226952e-08, + "loss": 0.0007, + "step": 115960 + }, + { + "epoch": 1.8975701546265238, + "grad_norm": 0.009297381155192852, + "learning_rate": 7.968760901317818e-08, + "loss": 0.0003, + "step": 115970 + }, + { + "epoch": 1.8977337805775996, + "grad_norm": 0.009364566765725613, + "learning_rate": 7.943389508759947e-08, + "loss": 0.0007, + "step": 115980 + }, + { + "epoch": 1.8978974065286756, + "grad_norm": 0.03216440603137016, + "learning_rate": 7.918058246665084e-08, + "loss": 0.0007, + "step": 115990 + }, + { + "epoch": 1.8980610324797513, + "grad_norm": 0.045093510299921036, + "learning_rate": 7.892767117099187e-08, + "loss": 0.0013, + "step": 116000 + }, + { + "epoch": 1.898224658430827, + "grad_norm": 0.01548362523317337, + "learning_rate": 7.867516122124941e-08, + "loss": 0.0028, + "step": 116010 + }, + { + "epoch": 1.898388284381903, + "grad_norm": 0.05727113410830498, + "learning_rate": 7.842305263801641e-08, + "loss": 0.0012, + "step": 116020 + }, + { + "epoch": 1.8985519103329787, + "grad_norm": 0.02318890206515789, + "learning_rate": 7.817134544185535e-08, + "loss": 0.0008, + "step": 116030 + }, + { + "epoch": 1.8987155362840547, + "grad_norm": 0.05266227200627327, + "learning_rate": 7.79200396532931e-08, + "loss": 0.001, + "step": 116040 + }, + { + "epoch": 1.8988791622351306, + "grad_norm": 0.018220210447907448, + "learning_rate": 7.76691352928266e-08, + "loss": 0.0026, + "step": 116050 + }, + { + "epoch": 1.8990427881862062, + "grad_norm": 0.050774186849594116, + "learning_rate": 7.741863238091785e-08, + "loss": 0.0022, + "step": 116060 + }, + { + "epoch": 1.8992064141372822, + "grad_norm": 0.09759583324193954, + "learning_rate": 7.71685309379977e-08, + "loss": 0.0014, + "step": 116070 + }, + { + "epoch": 1.899370040088358, + "grad_norm": 0.014021795243024826, + "learning_rate": 7.691883098446262e-08, + "loss": 0.0023, + "step": 116080 + }, + { + "epoch": 1.8995336660394337, + "grad_norm": 0.11141176521778107, + "learning_rate": 7.666953254067855e-08, + "loss": 0.0012, + "step": 116090 + }, + { + "epoch": 1.8996972919905097, + "grad_norm": 0.04700498282909393, + "learning_rate": 7.642063562697644e-08, + "loss": 0.001, + "step": 116100 + }, + { + "epoch": 1.8998609179415855, + "grad_norm": 0.04161045327782631, + "learning_rate": 7.617214026365616e-08, + "loss": 0.0008, + "step": 116110 + }, + { + "epoch": 1.9000245438926613, + "grad_norm": 0.051022887229919434, + "learning_rate": 7.592404647098317e-08, + "loss": 0.0009, + "step": 116120 + }, + { + "epoch": 1.9001881698437373, + "grad_norm": 0.05725853517651558, + "learning_rate": 7.56763542691924e-08, + "loss": 0.0012, + "step": 116130 + }, + { + "epoch": 1.900351795794813, + "grad_norm": 0.010440354235470295, + "learning_rate": 7.542906367848435e-08, + "loss": 0.0006, + "step": 116140 + }, + { + "epoch": 1.9005154217458888, + "grad_norm": 0.0035140812397003174, + "learning_rate": 7.518217471902677e-08, + "loss": 0.0012, + "step": 116150 + }, + { + "epoch": 1.9006790476969648, + "grad_norm": 0.0815066322684288, + "learning_rate": 7.493568741095469e-08, + "loss": 0.0013, + "step": 116160 + }, + { + "epoch": 1.9008426736480406, + "grad_norm": 0.01767807826399803, + "learning_rate": 7.468960177437257e-08, + "loss": 0.0007, + "step": 116170 + }, + { + "epoch": 1.9010062995991164, + "grad_norm": 0.03358037769794464, + "learning_rate": 7.444391782934823e-08, + "loss": 0.0008, + "step": 116180 + }, + { + "epoch": 1.9011699255501924, + "grad_norm": 0.09311171621084213, + "learning_rate": 7.419863559592066e-08, + "loss": 0.0007, + "step": 116190 + }, + { + "epoch": 1.9013335515012681, + "grad_norm": 0.0928061455488205, + "learning_rate": 7.395375509409219e-08, + "loss": 0.0005, + "step": 116200 + }, + { + "epoch": 1.901497177452344, + "grad_norm": 0.03519140183925629, + "learning_rate": 7.370927634383685e-08, + "loss": 0.0017, + "step": 116210 + }, + { + "epoch": 1.90166080340342, + "grad_norm": 0.010303204879164696, + "learning_rate": 7.346519936509145e-08, + "loss": 0.0005, + "step": 116220 + }, + { + "epoch": 1.9018244293544955, + "grad_norm": 0.13487018644809723, + "learning_rate": 7.322152417776285e-08, + "loss": 0.0011, + "step": 116230 + }, + { + "epoch": 1.9019880553055715, + "grad_norm": 0.023089276626706123, + "learning_rate": 7.29782508017246e-08, + "loss": 0.005, + "step": 116240 + }, + { + "epoch": 1.9021516812566475, + "grad_norm": 0.017940377816557884, + "learning_rate": 7.273537925681751e-08, + "loss": 0.0005, + "step": 116250 + }, + { + "epoch": 1.902315307207723, + "grad_norm": 0.029363514855504036, + "learning_rate": 7.249290956284794e-08, + "loss": 0.001, + "step": 116260 + }, + { + "epoch": 1.902478933158799, + "grad_norm": 0.014394384808838367, + "learning_rate": 7.225084173959285e-08, + "loss": 0.0006, + "step": 116270 + }, + { + "epoch": 1.9026425591098748, + "grad_norm": 0.0295296348631382, + "learning_rate": 7.200917580679256e-08, + "loss": 0.0006, + "step": 116280 + }, + { + "epoch": 1.9028061850609506, + "grad_norm": 0.08148827403783798, + "learning_rate": 7.176791178415799e-08, + "loss": 0.0012, + "step": 116290 + }, + { + "epoch": 1.9029698110120266, + "grad_norm": 0.0415823757648468, + "learning_rate": 7.152704969136504e-08, + "loss": 0.0005, + "step": 116300 + }, + { + "epoch": 1.9031334369631023, + "grad_norm": 0.05801570415496826, + "learning_rate": 7.128658954805745e-08, + "loss": 0.0011, + "step": 116310 + }, + { + "epoch": 1.903297062914178, + "grad_norm": 0.021948300302028656, + "learning_rate": 7.104653137384732e-08, + "loss": 0.0007, + "step": 116320 + }, + { + "epoch": 1.903460688865254, + "grad_norm": 0.1137373223900795, + "learning_rate": 7.080687518831175e-08, + "loss": 0.0021, + "step": 116330 + }, + { + "epoch": 1.9036243148163299, + "grad_norm": 0.007349936757236719, + "learning_rate": 7.05676210109979e-08, + "loss": 0.0008, + "step": 116340 + }, + { + "epoch": 1.9037879407674057, + "grad_norm": 0.06870221346616745, + "learning_rate": 7.032876886141627e-08, + "loss": 0.0006, + "step": 116350 + }, + { + "epoch": 1.9039515667184816, + "grad_norm": 0.0719907134771347, + "learning_rate": 7.009031875904903e-08, + "loss": 0.0007, + "step": 116360 + }, + { + "epoch": 1.9041151926695574, + "grad_norm": 0.02348886802792549, + "learning_rate": 6.985227072334178e-08, + "loss": 0.0004, + "step": 116370 + }, + { + "epoch": 1.9042788186206332, + "grad_norm": 0.06328196078538895, + "learning_rate": 6.96146247737095e-08, + "loss": 0.0005, + "step": 116380 + }, + { + "epoch": 1.9044424445717092, + "grad_norm": 0.0172183346003294, + "learning_rate": 6.937738092953394e-08, + "loss": 0.0006, + "step": 116390 + }, + { + "epoch": 1.904606070522785, + "grad_norm": 0.02095283940434456, + "learning_rate": 6.914053921016406e-08, + "loss": 0.0011, + "step": 116400 + }, + { + "epoch": 1.9047696964738607, + "grad_norm": 0.15059605240821838, + "learning_rate": 6.890409963491495e-08, + "loss": 0.0005, + "step": 116410 + }, + { + "epoch": 1.9049333224249367, + "grad_norm": 0.00268458086065948, + "learning_rate": 6.866806222307121e-08, + "loss": 0.0006, + "step": 116420 + }, + { + "epoch": 1.9050969483760123, + "grad_norm": 0.0715903788805008, + "learning_rate": 6.843242699388187e-08, + "loss": 0.001, + "step": 116430 + }, + { + "epoch": 1.9052605743270883, + "grad_norm": 0.004708403721451759, + "learning_rate": 6.819719396656544e-08, + "loss": 0.0007, + "step": 116440 + }, + { + "epoch": 1.9054242002781643, + "grad_norm": 0.04708794876933098, + "learning_rate": 6.796236316030602e-08, + "loss": 0.0011, + "step": 116450 + }, + { + "epoch": 1.9055878262292398, + "grad_norm": 0.060547955334186554, + "learning_rate": 6.772793459425665e-08, + "loss": 0.0013, + "step": 116460 + }, + { + "epoch": 1.9057514521803158, + "grad_norm": 0.06558769941329956, + "learning_rate": 6.749390828753533e-08, + "loss": 0.0011, + "step": 116470 + }, + { + "epoch": 1.9059150781313916, + "grad_norm": 0.1840386539697647, + "learning_rate": 6.72602842592296e-08, + "loss": 0.002, + "step": 116480 + }, + { + "epoch": 1.9060787040824674, + "grad_norm": 0.02503153681755066, + "learning_rate": 6.702706252839197e-08, + "loss": 0.0009, + "step": 116490 + }, + { + "epoch": 1.9062423300335434, + "grad_norm": 0.10345010459423065, + "learning_rate": 6.679424311404392e-08, + "loss": 0.0012, + "step": 116500 + }, + { + "epoch": 1.9064059559846191, + "grad_norm": 0.03735567256808281, + "learning_rate": 6.656182603517303e-08, + "loss": 0.0008, + "step": 116510 + }, + { + "epoch": 1.906569581935695, + "grad_norm": 0.02678655833005905, + "learning_rate": 6.632981131073469e-08, + "loss": 0.0006, + "step": 116520 + }, + { + "epoch": 1.906733207886771, + "grad_norm": 0.08449308574199677, + "learning_rate": 6.609819895965097e-08, + "loss": 0.0009, + "step": 116530 + }, + { + "epoch": 1.9068968338378467, + "grad_norm": 0.021715274080634117, + "learning_rate": 6.586698900081179e-08, + "loss": 0.0005, + "step": 116540 + }, + { + "epoch": 1.9070604597889225, + "grad_norm": 0.03355414420366287, + "learning_rate": 6.563618145307316e-08, + "loss": 0.0013, + "step": 116550 + }, + { + "epoch": 1.9072240857399985, + "grad_norm": 0.007211328484117985, + "learning_rate": 6.540577633526001e-08, + "loss": 0.001, + "step": 116560 + }, + { + "epoch": 1.9073877116910742, + "grad_norm": 0.02551354095339775, + "learning_rate": 6.51757736661629e-08, + "loss": 0.0003, + "step": 116570 + }, + { + "epoch": 1.90755133764215, + "grad_norm": 0.005793520715087652, + "learning_rate": 6.494617346453957e-08, + "loss": 0.0004, + "step": 116580 + }, + { + "epoch": 1.907714963593226, + "grad_norm": 0.004928684793412685, + "learning_rate": 6.471697574911562e-08, + "loss": 0.0009, + "step": 116590 + }, + { + "epoch": 1.9078785895443016, + "grad_norm": 0.039309728890657425, + "learning_rate": 6.448818053858441e-08, + "loss": 0.0009, + "step": 116600 + }, + { + "epoch": 1.9080422154953776, + "grad_norm": 0.05481138825416565, + "learning_rate": 6.425978785160492e-08, + "loss": 0.0011, + "step": 116610 + }, + { + "epoch": 1.9082058414464536, + "grad_norm": 0.06257200241088867, + "learning_rate": 6.403179770680445e-08, + "loss": 0.0006, + "step": 116620 + }, + { + "epoch": 1.908369467397529, + "grad_norm": 0.11583060026168823, + "learning_rate": 6.380421012277649e-08, + "loss": 0.0015, + "step": 116630 + }, + { + "epoch": 1.908533093348605, + "grad_norm": 0.05019020289182663, + "learning_rate": 6.357702511808284e-08, + "loss": 0.0009, + "step": 116640 + }, + { + "epoch": 1.9086967192996809, + "grad_norm": 0.10041210055351257, + "learning_rate": 6.335024271125256e-08, + "loss": 0.0007, + "step": 116650 + }, + { + "epoch": 1.9088603452507567, + "grad_norm": 0.04091503843665123, + "learning_rate": 6.312386292077977e-08, + "loss": 0.0009, + "step": 116660 + }, + { + "epoch": 1.9090239712018326, + "grad_norm": 0.07322060316801071, + "learning_rate": 6.289788576512856e-08, + "loss": 0.0009, + "step": 116670 + }, + { + "epoch": 1.9091875971529084, + "grad_norm": 0.06976725906133652, + "learning_rate": 6.267231126272754e-08, + "loss": 0.0007, + "step": 116680 + }, + { + "epoch": 1.9093512231039842, + "grad_norm": 0.051630690693855286, + "learning_rate": 6.244713943197533e-08, + "loss": 0.0012, + "step": 116690 + }, + { + "epoch": 1.9095148490550602, + "grad_norm": 0.010435717180371284, + "learning_rate": 6.222237029123501e-08, + "loss": 0.0005, + "step": 116700 + }, + { + "epoch": 1.909678475006136, + "grad_norm": 0.043589841574430466, + "learning_rate": 6.199800385883858e-08, + "loss": 0.0007, + "step": 116710 + }, + { + "epoch": 1.9098421009572117, + "grad_norm": 0.306078165769577, + "learning_rate": 6.177404015308419e-08, + "loss": 0.0016, + "step": 116720 + }, + { + "epoch": 1.9100057269082877, + "grad_norm": 0.03573667258024216, + "learning_rate": 6.155047919223833e-08, + "loss": 0.0006, + "step": 116730 + }, + { + "epoch": 1.9101693528593635, + "grad_norm": 0.05123012140393257, + "learning_rate": 6.132732099453254e-08, + "loss": 0.0019, + "step": 116740 + }, + { + "epoch": 1.9103329788104393, + "grad_norm": 0.03394199162721634, + "learning_rate": 6.110456557816835e-08, + "loss": 0.0004, + "step": 116750 + }, + { + "epoch": 1.9104966047615153, + "grad_norm": 0.023436540737748146, + "learning_rate": 6.08822129613118e-08, + "loss": 0.0005, + "step": 116760 + }, + { + "epoch": 1.910660230712591, + "grad_norm": 0.06775376945734024, + "learning_rate": 6.066026316209784e-08, + "loss": 0.0008, + "step": 116770 + }, + { + "epoch": 1.9108238566636668, + "grad_norm": 0.026028968393802643, + "learning_rate": 6.043871619862751e-08, + "loss": 0.0007, + "step": 116780 + }, + { + "epoch": 1.9109874826147428, + "grad_norm": 0.030310045927762985, + "learning_rate": 6.021757208897028e-08, + "loss": 0.0009, + "step": 116790 + }, + { + "epoch": 1.9111511085658184, + "grad_norm": 0.03099539689719677, + "learning_rate": 5.999683085116059e-08, + "loss": 0.001, + "step": 116800 + }, + { + "epoch": 1.9113147345168944, + "grad_norm": 0.028056008741259575, + "learning_rate": 5.977649250320239e-08, + "loss": 0.0011, + "step": 116810 + }, + { + "epoch": 1.9114783604679704, + "grad_norm": 0.030993202701210976, + "learning_rate": 5.955655706306518e-08, + "loss": 0.0004, + "step": 116820 + }, + { + "epoch": 1.911641986419046, + "grad_norm": 0.01232182513922453, + "learning_rate": 5.933702454868628e-08, + "loss": 0.001, + "step": 116830 + }, + { + "epoch": 1.911805612370122, + "grad_norm": 0.07155122607946396, + "learning_rate": 5.911789497797027e-08, + "loss": 0.001, + "step": 116840 + }, + { + "epoch": 1.9119692383211977, + "grad_norm": 0.021993393078446388, + "learning_rate": 5.8899168368788395e-08, + "loss": 0.0005, + "step": 116850 + }, + { + "epoch": 1.9121328642722735, + "grad_norm": 0.011465046554803848, + "learning_rate": 5.868084473897917e-08, + "loss": 0.0013, + "step": 116860 + }, + { + "epoch": 1.9122964902233495, + "grad_norm": 0.03232261538505554, + "learning_rate": 5.846292410634835e-08, + "loss": 0.001, + "step": 116870 + }, + { + "epoch": 1.9124601161744252, + "grad_norm": 0.015318089164793491, + "learning_rate": 5.8245406488668944e-08, + "loss": 0.0006, + "step": 116880 + }, + { + "epoch": 1.912623742125501, + "grad_norm": 0.01868932694196701, + "learning_rate": 5.80282919036812e-08, + "loss": 0.0011, + "step": 116890 + }, + { + "epoch": 1.912787368076577, + "grad_norm": 0.026175817474722862, + "learning_rate": 5.781158036909096e-08, + "loss": 0.0007, + "step": 116900 + }, + { + "epoch": 1.9129509940276528, + "grad_norm": 0.04268673434853554, + "learning_rate": 5.7595271902574636e-08, + "loss": 0.0009, + "step": 116910 + }, + { + "epoch": 1.9131146199787286, + "grad_norm": 0.04154679924249649, + "learning_rate": 5.737936652177145e-08, + "loss": 0.0009, + "step": 116920 + }, + { + "epoch": 1.9132782459298046, + "grad_norm": 0.0030126285273581743, + "learning_rate": 5.7163864244291764e-08, + "loss": 0.0015, + "step": 116930 + }, + { + "epoch": 1.9134418718808803, + "grad_norm": 0.0038906957488507032, + "learning_rate": 5.6948765087709855e-08, + "loss": 0.0004, + "step": 116940 + }, + { + "epoch": 1.913605497831956, + "grad_norm": 0.05314008891582489, + "learning_rate": 5.673406906956891e-08, + "loss": 0.0005, + "step": 116950 + }, + { + "epoch": 1.913769123783032, + "grad_norm": 0.030740106478333473, + "learning_rate": 5.651977620737936e-08, + "loss": 0.0005, + "step": 116960 + }, + { + "epoch": 1.9139327497341079, + "grad_norm": 0.04226497933268547, + "learning_rate": 5.6305886518617235e-08, + "loss": 0.0006, + "step": 116970 + }, + { + "epoch": 1.9140963756851836, + "grad_norm": 0.10254088789224625, + "learning_rate": 5.609240002072691e-08, + "loss": 0.0011, + "step": 116980 + }, + { + "epoch": 1.9142600016362596, + "grad_norm": 0.04294828698039055, + "learning_rate": 5.587931673112002e-08, + "loss": 0.0007, + "step": 116990 + }, + { + "epoch": 1.9144236275873352, + "grad_norm": 0.019339660182595253, + "learning_rate": 5.5666636667174866e-08, + "loss": 0.0005, + "step": 117000 + }, + { + "epoch": 1.9145872535384112, + "grad_norm": 0.062423329800367355, + "learning_rate": 5.5454359846236485e-08, + "loss": 0.0009, + "step": 117010 + }, + { + "epoch": 1.9147508794894872, + "grad_norm": 0.04364512488245964, + "learning_rate": 5.5242486285618235e-08, + "loss": 0.001, + "step": 117020 + }, + { + "epoch": 1.9149145054405627, + "grad_norm": 0.04420114681124687, + "learning_rate": 5.503101600259853e-08, + "loss": 0.0007, + "step": 117030 + }, + { + "epoch": 1.9150781313916387, + "grad_norm": 0.04976295679807663, + "learning_rate": 5.4819949014425776e-08, + "loss": 0.0009, + "step": 117040 + }, + { + "epoch": 1.9152417573427145, + "grad_norm": 0.03043130412697792, + "learning_rate": 5.4609285338312335e-08, + "loss": 0.0014, + "step": 117050 + }, + { + "epoch": 1.9154053832937903, + "grad_norm": 0.05502431467175484, + "learning_rate": 5.4399024991440005e-08, + "loss": 0.001, + "step": 117060 + }, + { + "epoch": 1.9155690092448663, + "grad_norm": 0.08185819536447525, + "learning_rate": 5.418916799095675e-08, + "loss": 0.0023, + "step": 117070 + }, + { + "epoch": 1.915732635195942, + "grad_norm": 0.059280503541231155, + "learning_rate": 5.397971435397831e-08, + "loss": 0.0016, + "step": 117080 + }, + { + "epoch": 1.9158962611470178, + "grad_norm": 0.04656801000237465, + "learning_rate": 5.377066409758602e-08, + "loss": 0.0014, + "step": 117090 + }, + { + "epoch": 1.9160598870980938, + "grad_norm": 0.006370526272803545, + "learning_rate": 5.3562017238830147e-08, + "loss": 0.0008, + "step": 117100 + }, + { + "epoch": 1.9162235130491696, + "grad_norm": 0.04435443505644798, + "learning_rate": 5.335377379472706e-08, + "loss": 0.0009, + "step": 117110 + }, + { + "epoch": 1.9163871390002454, + "grad_norm": 0.03789312765002251, + "learning_rate": 5.314593378225985e-08, + "loss": 0.0006, + "step": 117120 + }, + { + "epoch": 1.9165507649513214, + "grad_norm": 0.04919374734163284, + "learning_rate": 5.293849721837996e-08, + "loss": 0.0005, + "step": 117130 + }, + { + "epoch": 1.9167143909023971, + "grad_norm": 0.0709136500954628, + "learning_rate": 5.273146412000496e-08, + "loss": 0.0007, + "step": 117140 + }, + { + "epoch": 1.916878016853473, + "grad_norm": 0.04750726372003555, + "learning_rate": 5.252483450401913e-08, + "loss": 0.0004, + "step": 117150 + }, + { + "epoch": 1.917041642804549, + "grad_norm": 0.039837710559368134, + "learning_rate": 5.231860838727565e-08, + "loss": 0.0006, + "step": 117160 + }, + { + "epoch": 1.9172052687556247, + "grad_norm": 0.06660003960132599, + "learning_rate": 5.211278578659273e-08, + "loss": 0.0012, + "step": 117170 + }, + { + "epoch": 1.9173688947067005, + "grad_norm": 0.051756616681814194, + "learning_rate": 5.1907366718756955e-08, + "loss": 0.0005, + "step": 117180 + }, + { + "epoch": 1.9175325206577765, + "grad_norm": 0.011971354484558105, + "learning_rate": 5.1702351200521026e-08, + "loss": 0.0009, + "step": 117190 + }, + { + "epoch": 1.917696146608852, + "grad_norm": 0.029503384605050087, + "learning_rate": 5.149773924860602e-08, + "loss": 0.0007, + "step": 117200 + }, + { + "epoch": 1.917859772559928, + "grad_norm": 0.02789662778377533, + "learning_rate": 5.129353087969913e-08, + "loss": 0.0009, + "step": 117210 + }, + { + "epoch": 1.918023398511004, + "grad_norm": 0.00774666341021657, + "learning_rate": 5.1089726110455374e-08, + "loss": 0.0005, + "step": 117220 + }, + { + "epoch": 1.9181870244620796, + "grad_norm": 0.05077791213989258, + "learning_rate": 5.088632495749535e-08, + "loss": 0.0007, + "step": 117230 + }, + { + "epoch": 1.9183506504131556, + "grad_norm": 0.058956168591976166, + "learning_rate": 5.068332743740856e-08, + "loss": 0.0009, + "step": 117240 + }, + { + "epoch": 1.9185142763642313, + "grad_norm": 0.07228808104991913, + "learning_rate": 5.0480733566750094e-08, + "loss": 0.0014, + "step": 117250 + }, + { + "epoch": 1.918677902315307, + "grad_norm": 0.03618868812918663, + "learning_rate": 5.0278543362043964e-08, + "loss": 0.0006, + "step": 117260 + }, + { + "epoch": 1.918841528266383, + "grad_norm": 0.0344681590795517, + "learning_rate": 5.0076756839779196e-08, + "loss": 0.0007, + "step": 117270 + }, + { + "epoch": 1.9190051542174589, + "grad_norm": 0.0023111009504646063, + "learning_rate": 4.987537401641318e-08, + "loss": 0.0012, + "step": 117280 + }, + { + "epoch": 1.9191687801685346, + "grad_norm": 0.07324729859828949, + "learning_rate": 4.967439490837e-08, + "loss": 0.0013, + "step": 117290 + }, + { + "epoch": 1.9193324061196106, + "grad_norm": 0.027084648609161377, + "learning_rate": 4.947381953203989e-08, + "loss": 0.0005, + "step": 117300 + }, + { + "epoch": 1.9194960320706864, + "grad_norm": 0.054264459758996964, + "learning_rate": 4.927364790378253e-08, + "loss": 0.0007, + "step": 117310 + }, + { + "epoch": 1.9196596580217622, + "grad_norm": 0.015776021406054497, + "learning_rate": 4.9073880039922636e-08, + "loss": 0.0009, + "step": 117320 + }, + { + "epoch": 1.9198232839728382, + "grad_norm": 0.009941987693309784, + "learning_rate": 4.8874515956752745e-08, + "loss": 0.0008, + "step": 117330 + }, + { + "epoch": 1.919986909923914, + "grad_norm": 0.04222123697400093, + "learning_rate": 4.86755556705315e-08, + "loss": 0.0013, + "step": 117340 + }, + { + "epoch": 1.9201505358749897, + "grad_norm": 0.005467007867991924, + "learning_rate": 4.8476999197486494e-08, + "loss": 0.0004, + "step": 117350 + }, + { + "epoch": 1.9203141618260657, + "grad_norm": 0.014334475621581078, + "learning_rate": 4.8278846553810876e-08, + "loss": 0.0004, + "step": 117360 + }, + { + "epoch": 1.9204777877771415, + "grad_norm": 0.005753939971327782, + "learning_rate": 4.808109775566561e-08, + "loss": 0.001, + "step": 117370 + }, + { + "epoch": 1.9206414137282173, + "grad_norm": 0.2799023687839508, + "learning_rate": 4.7883752819177787e-08, + "loss": 0.0053, + "step": 117380 + }, + { + "epoch": 1.9208050396792933, + "grad_norm": 0.014019107446074486, + "learning_rate": 4.768681176044232e-08, + "loss": 0.0003, + "step": 117390 + }, + { + "epoch": 1.9209686656303688, + "grad_norm": 0.028729360550642014, + "learning_rate": 4.7490274595521355e-08, + "loss": 0.0008, + "step": 117400 + }, + { + "epoch": 1.9211322915814448, + "grad_norm": 0.08716363459825516, + "learning_rate": 4.72941413404443e-08, + "loss": 0.001, + "step": 117410 + }, + { + "epoch": 1.9212959175325206, + "grad_norm": 0.042327962815761566, + "learning_rate": 4.709841201120557e-08, + "loss": 0.0009, + "step": 117420 + }, + { + "epoch": 1.9214595434835964, + "grad_norm": 0.05567514896392822, + "learning_rate": 4.690308662377019e-08, + "loss": 0.0004, + "step": 117430 + }, + { + "epoch": 1.9216231694346724, + "grad_norm": 0.0731797143816948, + "learning_rate": 4.670816519406651e-08, + "loss": 0.0009, + "step": 117440 + }, + { + "epoch": 1.9217867953857481, + "grad_norm": 0.09268610179424286, + "learning_rate": 4.651364773799238e-08, + "loss": 0.0015, + "step": 117450 + }, + { + "epoch": 1.921950421336824, + "grad_norm": 0.15806390345096588, + "learning_rate": 4.631953427141178e-08, + "loss": 0.0011, + "step": 117460 + }, + { + "epoch": 1.9221140472879, + "grad_norm": 0.0193981621414423, + "learning_rate": 4.6125824810156484e-08, + "loss": 0.0008, + "step": 117470 + }, + { + "epoch": 1.9222776732389757, + "grad_norm": 0.0775398313999176, + "learning_rate": 4.5932519370023855e-08, + "loss": 0.0009, + "step": 117480 + }, + { + "epoch": 1.9224412991900515, + "grad_norm": 0.05478496849536896, + "learning_rate": 4.573961796678017e-08, + "loss": 0.0008, + "step": 117490 + }, + { + "epoch": 1.9226049251411275, + "grad_norm": 0.010411275550723076, + "learning_rate": 4.554712061615729e-08, + "loss": 0.001, + "step": 117500 + }, + { + "epoch": 1.9227685510922032, + "grad_norm": 0.06554795056581497, + "learning_rate": 4.535502733385488e-08, + "loss": 0.0009, + "step": 117510 + }, + { + "epoch": 1.922932177043279, + "grad_norm": 0.0036133560352027416, + "learning_rate": 4.5163338135538745e-08, + "loss": 0.0005, + "step": 117520 + }, + { + "epoch": 1.923095802994355, + "grad_norm": 0.06528127193450928, + "learning_rate": 4.49720530368436e-08, + "loss": 0.0007, + "step": 117530 + }, + { + "epoch": 1.9232594289454308, + "grad_norm": 0.003624382195994258, + "learning_rate": 4.478117205336918e-08, + "loss": 0.0004, + "step": 117540 + }, + { + "epoch": 1.9234230548965066, + "grad_norm": 0.0884091779589653, + "learning_rate": 4.459069520068304e-08, + "loss": 0.0013, + "step": 117550 + }, + { + "epoch": 1.9235866808475826, + "grad_norm": 0.044825129210948944, + "learning_rate": 4.440062249431998e-08, + "loss": 0.001, + "step": 117560 + }, + { + "epoch": 1.923750306798658, + "grad_norm": 0.04207734391093254, + "learning_rate": 4.421095394978259e-08, + "loss": 0.0008, + "step": 117570 + }, + { + "epoch": 1.923913932749734, + "grad_norm": 0.024763138964772224, + "learning_rate": 4.402168958253794e-08, + "loss": 0.001, + "step": 117580 + }, + { + "epoch": 1.92407755870081, + "grad_norm": 0.002714958507567644, + "learning_rate": 4.383282940802258e-08, + "loss": 0.0008, + "step": 117590 + }, + { + "epoch": 1.9242411846518856, + "grad_norm": 0.024393411353230476, + "learning_rate": 4.364437344163974e-08, + "loss": 0.0006, + "step": 117600 + }, + { + "epoch": 1.9244048106029616, + "grad_norm": 0.0037096389569342136, + "learning_rate": 4.3456321698758244e-08, + "loss": 0.0007, + "step": 117610 + }, + { + "epoch": 1.9245684365540374, + "grad_norm": 0.004397556185722351, + "learning_rate": 4.326867419471637e-08, + "loss": 0.0015, + "step": 117620 + }, + { + "epoch": 1.9247320625051132, + "grad_norm": 0.042645443230867386, + "learning_rate": 4.3081430944816315e-08, + "loss": 0.0017, + "step": 117630 + }, + { + "epoch": 1.9248956884561892, + "grad_norm": 0.06267805397510529, + "learning_rate": 4.289459196433032e-08, + "loss": 0.0008, + "step": 117640 + }, + { + "epoch": 1.925059314407265, + "grad_norm": 0.09096043556928635, + "learning_rate": 4.270815726849564e-08, + "loss": 0.0007, + "step": 117650 + }, + { + "epoch": 1.9252229403583407, + "grad_norm": 0.024136090651154518, + "learning_rate": 4.252212687251789e-08, + "loss": 0.0006, + "step": 117660 + }, + { + "epoch": 1.9253865663094167, + "grad_norm": 0.011028232052922249, + "learning_rate": 4.233650079156826e-08, + "loss": 0.001, + "step": 117670 + }, + { + "epoch": 1.9255501922604925, + "grad_norm": 0.08266288787126541, + "learning_rate": 4.215127904078631e-08, + "loss": 0.0011, + "step": 117680 + }, + { + "epoch": 1.9257138182115683, + "grad_norm": 0.01891559734940529, + "learning_rate": 4.196646163527773e-08, + "loss": 0.0007, + "step": 117690 + }, + { + "epoch": 1.9258774441626443, + "grad_norm": 0.03202177584171295, + "learning_rate": 4.1782048590116583e-08, + "loss": 0.0005, + "step": 117700 + }, + { + "epoch": 1.92604107011372, + "grad_norm": 0.007185592316091061, + "learning_rate": 4.1598039920341394e-08, + "loss": 0.0007, + "step": 117710 + }, + { + "epoch": 1.9262046960647958, + "grad_norm": 0.015683166682720184, + "learning_rate": 4.14144356409607e-08, + "loss": 0.0026, + "step": 117720 + }, + { + "epoch": 1.9263683220158718, + "grad_norm": 0.014360490255057812, + "learning_rate": 4.123123576694754e-08, + "loss": 0.001, + "step": 117730 + }, + { + "epoch": 1.9265319479669476, + "grad_norm": 0.0372314453125, + "learning_rate": 4.10484403132444e-08, + "loss": 0.0008, + "step": 117740 + }, + { + "epoch": 1.9266955739180234, + "grad_norm": 0.07619918882846832, + "learning_rate": 4.086604929475824e-08, + "loss": 0.002, + "step": 117750 + }, + { + "epoch": 1.9268591998690994, + "grad_norm": 0.05482002720236778, + "learning_rate": 4.0684062726364384e-08, + "loss": 0.0008, + "step": 117760 + }, + { + "epoch": 1.927022825820175, + "grad_norm": 0.07788547873497009, + "learning_rate": 4.0502480622905404e-08, + "loss": 0.0005, + "step": 117770 + }, + { + "epoch": 1.927186451771251, + "grad_norm": 0.032083842903375626, + "learning_rate": 4.0321302999191125e-08, + "loss": 0.0008, + "step": 117780 + }, + { + "epoch": 1.927350077722327, + "grad_norm": 0.04234011843800545, + "learning_rate": 4.0140529869996395e-08, + "loss": 0.0011, + "step": 117790 + }, + { + "epoch": 1.9275137036734025, + "grad_norm": 0.04810163006186485, + "learning_rate": 3.9960161250065524e-08, + "loss": 0.001, + "step": 117800 + }, + { + "epoch": 1.9276773296244785, + "grad_norm": 0.01350861880928278, + "learning_rate": 3.978019715410786e-08, + "loss": 0.0008, + "step": 117810 + }, + { + "epoch": 1.9278409555755542, + "grad_norm": 0.03149794042110443, + "learning_rate": 3.960063759680166e-08, + "loss": 0.001, + "step": 117820 + }, + { + "epoch": 1.92800458152663, + "grad_norm": 0.0029188902117311954, + "learning_rate": 3.942148259279022e-08, + "loss": 0.0004, + "step": 117830 + }, + { + "epoch": 1.928168207477706, + "grad_norm": 0.09763001650571823, + "learning_rate": 3.924273215668628e-08, + "loss": 0.0011, + "step": 117840 + }, + { + "epoch": 1.9283318334287818, + "grad_norm": 0.08357292413711548, + "learning_rate": 3.906438630306597e-08, + "loss": 0.0011, + "step": 117850 + }, + { + "epoch": 1.9284954593798576, + "grad_norm": 0.1580624133348465, + "learning_rate": 3.888644504647654e-08, + "loss": 0.001, + "step": 117860 + }, + { + "epoch": 1.9286590853309336, + "grad_norm": 0.033751845359802246, + "learning_rate": 3.870890840142916e-08, + "loss": 0.0003, + "step": 117870 + }, + { + "epoch": 1.9288227112820093, + "grad_norm": 0.029373381286859512, + "learning_rate": 3.853177638240391e-08, + "loss": 0.001, + "step": 117880 + }, + { + "epoch": 1.928986337233085, + "grad_norm": 0.04505932703614235, + "learning_rate": 3.83550490038459e-08, + "loss": 0.0011, + "step": 117890 + }, + { + "epoch": 1.929149963184161, + "grad_norm": 0.04801009222865105, + "learning_rate": 3.817872628016972e-08, + "loss": 0.0013, + "step": 117900 + }, + { + "epoch": 1.9293135891352369, + "grad_norm": 0.0026336137671023607, + "learning_rate": 3.800280822575497e-08, + "loss": 0.0002, + "step": 117910 + }, + { + "epoch": 1.9294772150863126, + "grad_norm": 0.028031054884195328, + "learning_rate": 3.7827294854948516e-08, + "loss": 0.0007, + "step": 117920 + }, + { + "epoch": 1.9296408410373886, + "grad_norm": 0.05538899824023247, + "learning_rate": 3.765218618206556e-08, + "loss": 0.0005, + "step": 117930 + }, + { + "epoch": 1.9298044669884644, + "grad_norm": 0.009470922872424126, + "learning_rate": 3.747748222138692e-08, + "loss": 0.0007, + "step": 117940 + }, + { + "epoch": 1.9299680929395402, + "grad_norm": 0.06905197352170944, + "learning_rate": 3.7303182987160624e-08, + "loss": 0.0016, + "step": 117950 + }, + { + "epoch": 1.9301317188906162, + "grad_norm": 0.05365915223956108, + "learning_rate": 3.7129288493602534e-08, + "loss": 0.0011, + "step": 117960 + }, + { + "epoch": 1.9302953448416917, + "grad_norm": 0.09884864091873169, + "learning_rate": 3.695579875489408e-08, + "loss": 0.0008, + "step": 117970 + }, + { + "epoch": 1.9304589707927677, + "grad_norm": 0.12790483236312866, + "learning_rate": 3.6782713785185054e-08, + "loss": 0.0006, + "step": 117980 + }, + { + "epoch": 1.9306225967438437, + "grad_norm": 0.04073750972747803, + "learning_rate": 3.6610033598591946e-08, + "loss": 0.0007, + "step": 117990 + }, + { + "epoch": 1.9307862226949193, + "grad_norm": 0.284094899892807, + "learning_rate": 3.643775820919737e-08, + "loss": 0.0009, + "step": 118000 + }, + { + "epoch": 1.9309498486459953, + "grad_norm": 0.02556479349732399, + "learning_rate": 3.6265887631051764e-08, + "loss": 0.0012, + "step": 118010 + }, + { + "epoch": 1.931113474597071, + "grad_norm": 0.1338217407464981, + "learning_rate": 3.6094421878172245e-08, + "loss": 0.0008, + "step": 118020 + }, + { + "epoch": 1.9312771005481468, + "grad_norm": 0.02789692021906376, + "learning_rate": 3.5923360964543184e-08, + "loss": 0.001, + "step": 118030 + }, + { + "epoch": 1.9314407264992228, + "grad_norm": 0.033490389585494995, + "learning_rate": 3.575270490411509e-08, + "loss": 0.0009, + "step": 118040 + }, + { + "epoch": 1.9316043524502986, + "grad_norm": 0.005060158669948578, + "learning_rate": 3.558245371080682e-08, + "loss": 0.0005, + "step": 118050 + }, + { + "epoch": 1.9317679784013744, + "grad_norm": 0.0402166023850441, + "learning_rate": 3.5412607398503384e-08, + "loss": 0.0006, + "step": 118060 + }, + { + "epoch": 1.9319316043524504, + "grad_norm": 0.04928470030426979, + "learning_rate": 3.524316598105648e-08, + "loss": 0.0007, + "step": 118070 + }, + { + "epoch": 1.9320952303035261, + "grad_norm": 0.044275086373090744, + "learning_rate": 3.5074129472285056e-08, + "loss": 0.0011, + "step": 118080 + }, + { + "epoch": 1.932258856254602, + "grad_norm": 0.024314627051353455, + "learning_rate": 3.4905497885976415e-08, + "loss": 0.0013, + "step": 118090 + }, + { + "epoch": 1.932422482205678, + "grad_norm": 0.052798792719841, + "learning_rate": 3.4737271235881774e-08, + "loss": 0.0035, + "step": 118100 + }, + { + "epoch": 1.9325861081567537, + "grad_norm": 0.019403163343667984, + "learning_rate": 3.456944953572239e-08, + "loss": 0.0018, + "step": 118110 + }, + { + "epoch": 1.9327497341078295, + "grad_norm": 0.034056615084409714, + "learning_rate": 3.440203279918508e-08, + "loss": 0.0005, + "step": 118120 + }, + { + "epoch": 1.9329133600589055, + "grad_norm": 0.006692399736493826, + "learning_rate": 3.4235021039923375e-08, + "loss": 0.0005, + "step": 118130 + }, + { + "epoch": 1.9330769860099812, + "grad_norm": 0.053489863872528076, + "learning_rate": 3.4068414271558046e-08, + "loss": 0.001, + "step": 118140 + }, + { + "epoch": 1.933240611961057, + "grad_norm": 0.04069867357611656, + "learning_rate": 3.390221250767767e-08, + "loss": 0.0014, + "step": 118150 + }, + { + "epoch": 1.933404237912133, + "grad_norm": 0.06978915631771088, + "learning_rate": 3.3736415761836397e-08, + "loss": 0.001, + "step": 118160 + }, + { + "epoch": 1.9335678638632086, + "grad_norm": 0.014980925247073174, + "learning_rate": 3.357102404755674e-08, + "loss": 0.0008, + "step": 118170 + }, + { + "epoch": 1.9337314898142846, + "grad_norm": 0.038956403732299805, + "learning_rate": 3.340603737832682e-08, + "loss": 0.001, + "step": 118180 + }, + { + "epoch": 1.9338951157653606, + "grad_norm": 0.036051664501428604, + "learning_rate": 3.324145576760307e-08, + "loss": 0.0009, + "step": 118190 + }, + { + "epoch": 1.934058741716436, + "grad_norm": 0.05190818011760712, + "learning_rate": 3.307727922880699e-08, + "loss": 0.0004, + "step": 118200 + }, + { + "epoch": 1.934222367667512, + "grad_norm": 0.009585447609424591, + "learning_rate": 3.291350777533009e-08, + "loss": 0.0006, + "step": 118210 + }, + { + "epoch": 1.9343859936185879, + "grad_norm": 0.04707228019833565, + "learning_rate": 3.2750141420527236e-08, + "loss": 0.001, + "step": 118220 + }, + { + "epoch": 1.9345496195696636, + "grad_norm": 0.03924688324332237, + "learning_rate": 3.258718017772333e-08, + "loss": 0.0005, + "step": 118230 + }, + { + "epoch": 1.9347132455207396, + "grad_norm": 0.0471479557454586, + "learning_rate": 3.24246240602083e-08, + "loss": 0.0004, + "step": 118240 + }, + { + "epoch": 1.9348768714718154, + "grad_norm": 0.06175505369901657, + "learning_rate": 3.226247308123931e-08, + "loss": 0.001, + "step": 118250 + }, + { + "epoch": 1.9350404974228912, + "grad_norm": 0.010532890446484089, + "learning_rate": 3.2100727254041896e-08, + "loss": 0.001, + "step": 118260 + }, + { + "epoch": 1.9352041233739672, + "grad_norm": 0.031367652118206024, + "learning_rate": 3.1939386591806624e-08, + "loss": 0.001, + "step": 118270 + }, + { + "epoch": 1.935367749325043, + "grad_norm": 0.062123656272888184, + "learning_rate": 3.17784511076924e-08, + "loss": 0.0006, + "step": 118280 + }, + { + "epoch": 1.9355313752761187, + "grad_norm": 0.048285409808158875, + "learning_rate": 3.161792081482429e-08, + "loss": 0.0021, + "step": 118290 + }, + { + "epoch": 1.9356950012271947, + "grad_norm": 0.028201134875416756, + "learning_rate": 3.1457795726294595e-08, + "loss": 0.0013, + "step": 118300 + }, + { + "epoch": 1.9358586271782705, + "grad_norm": 0.002594185061752796, + "learning_rate": 3.1298075855162314e-08, + "loss": 0.0006, + "step": 118310 + }, + { + "epoch": 1.9360222531293463, + "grad_norm": 0.016251588240265846, + "learning_rate": 3.113876121445425e-08, + "loss": 0.0012, + "step": 118320 + }, + { + "epoch": 1.9361858790804223, + "grad_norm": 0.147062286734581, + "learning_rate": 3.097985181716334e-08, + "loss": 0.0011, + "step": 118330 + }, + { + "epoch": 1.9363495050314978, + "grad_norm": 0.011207268573343754, + "learning_rate": 3.082134767624978e-08, + "loss": 0.0019, + "step": 118340 + }, + { + "epoch": 1.9365131309825738, + "grad_norm": 0.02507847547531128, + "learning_rate": 3.0663248804640445e-08, + "loss": 0.0008, + "step": 118350 + }, + { + "epoch": 1.9366767569336498, + "grad_norm": 0.024048246443271637, + "learning_rate": 3.0505555215229466e-08, + "loss": 0.0009, + "step": 118360 + }, + { + "epoch": 1.9368403828847254, + "grad_norm": 0.004993805196136236, + "learning_rate": 3.034826692087767e-08, + "loss": 0.0009, + "step": 118370 + }, + { + "epoch": 1.9370040088358014, + "grad_norm": 0.03710927441716194, + "learning_rate": 3.0191383934412563e-08, + "loss": 0.0008, + "step": 118380 + }, + { + "epoch": 1.9371676347868771, + "grad_norm": 0.05473821610212326, + "learning_rate": 3.0034906268630034e-08, + "loss": 0.0008, + "step": 118390 + }, + { + "epoch": 1.937331260737953, + "grad_norm": 0.018258212134242058, + "learning_rate": 2.9878833936290985e-08, + "loss": 0.0005, + "step": 118400 + }, + { + "epoch": 1.937494886689029, + "grad_norm": 0.03875812888145447, + "learning_rate": 2.9723166950124117e-08, + "loss": 0.0011, + "step": 118410 + }, + { + "epoch": 1.9376585126401047, + "grad_norm": 0.014572301879525185, + "learning_rate": 2.956790532282594e-08, + "loss": 0.0006, + "step": 118420 + }, + { + "epoch": 1.9378221385911805, + "grad_norm": 0.0769246518611908, + "learning_rate": 2.9413049067058552e-08, + "loss": 0.0012, + "step": 118430 + }, + { + "epoch": 1.9379857645422565, + "grad_norm": 0.045898791402578354, + "learning_rate": 2.9258598195451293e-08, + "loss": 0.0014, + "step": 118440 + }, + { + "epoch": 1.9381493904933322, + "grad_norm": 0.034800853580236435, + "learning_rate": 2.9104552720600754e-08, + "loss": 0.0021, + "step": 118450 + }, + { + "epoch": 1.938313016444408, + "grad_norm": 0.05612555146217346, + "learning_rate": 2.895091265507022e-08, + "loss": 0.0008, + "step": 118460 + }, + { + "epoch": 1.938476642395484, + "grad_norm": 0.02328142151236534, + "learning_rate": 2.8797678011390774e-08, + "loss": 0.0009, + "step": 118470 + }, + { + "epoch": 1.9386402683465598, + "grad_norm": 0.1035553514957428, + "learning_rate": 2.8644848802059088e-08, + "loss": 0.0009, + "step": 118480 + }, + { + "epoch": 1.9388038942976356, + "grad_norm": 0.11448800563812256, + "learning_rate": 2.8492425039539085e-08, + "loss": 0.0007, + "step": 118490 + }, + { + "epoch": 1.9389675202487116, + "grad_norm": 0.040437307208776474, + "learning_rate": 2.834040673626304e-08, + "loss": 0.0009, + "step": 118500 + }, + { + "epoch": 1.9391311461997873, + "grad_norm": 0.013946594670414925, + "learning_rate": 2.81887939046277e-08, + "loss": 0.001, + "step": 118510 + }, + { + "epoch": 1.939294772150863, + "grad_norm": 0.0034135030582547188, + "learning_rate": 2.8037586556999287e-08, + "loss": 0.0008, + "step": 118520 + }, + { + "epoch": 1.939458398101939, + "grad_norm": 0.0016365470364689827, + "learning_rate": 2.7886784705709048e-08, + "loss": 0.0008, + "step": 118530 + }, + { + "epoch": 1.9396220240530146, + "grad_norm": 0.029756847769021988, + "learning_rate": 2.7736388363056588e-08, + "loss": 0.0005, + "step": 118540 + }, + { + "epoch": 1.9397856500040906, + "grad_norm": 0.04337065666913986, + "learning_rate": 2.7586397541306543e-08, + "loss": 0.0008, + "step": 118550 + }, + { + "epoch": 1.9399492759551666, + "grad_norm": 0.024112407118082047, + "learning_rate": 2.7436812252692456e-08, + "loss": 0.0014, + "step": 118560 + }, + { + "epoch": 1.9401129019062422, + "grad_norm": 0.004146270453929901, + "learning_rate": 2.7287632509413464e-08, + "loss": 0.0009, + "step": 118570 + }, + { + "epoch": 1.9402765278573182, + "grad_norm": 0.10304737091064453, + "learning_rate": 2.7138858323637052e-08, + "loss": 0.0006, + "step": 118580 + }, + { + "epoch": 1.940440153808394, + "grad_norm": 0.024668464437127113, + "learning_rate": 2.6990489707496292e-08, + "loss": 0.0006, + "step": 118590 + }, + { + "epoch": 1.9406037797594697, + "grad_norm": 0.08798343688249588, + "learning_rate": 2.6842526673091508e-08, + "loss": 0.0007, + "step": 118600 + }, + { + "epoch": 1.9407674057105457, + "grad_norm": 0.013673658482730389, + "learning_rate": 2.6694969232489708e-08, + "loss": 0.0021, + "step": 118610 + }, + { + "epoch": 1.9409310316616215, + "grad_norm": 0.017358314245939255, + "learning_rate": 2.654781739772572e-08, + "loss": 0.0006, + "step": 118620 + }, + { + "epoch": 1.9410946576126973, + "grad_norm": 0.08628585934638977, + "learning_rate": 2.6401071180800485e-08, + "loss": 0.0012, + "step": 118630 + }, + { + "epoch": 1.9412582835637733, + "grad_norm": 0.045581232756376266, + "learning_rate": 2.6254730593682775e-08, + "loss": 0.0012, + "step": 118640 + }, + { + "epoch": 1.941421909514849, + "grad_norm": 0.013516368344426155, + "learning_rate": 2.6108795648306372e-08, + "loss": 0.0009, + "step": 118650 + }, + { + "epoch": 1.9415855354659248, + "grad_norm": 0.05546526610851288, + "learning_rate": 2.5963266356574534e-08, + "loss": 0.0011, + "step": 118660 + }, + { + "epoch": 1.9417491614170008, + "grad_norm": 0.07094526290893555, + "learning_rate": 2.5818142730355544e-08, + "loss": 0.0012, + "step": 118670 + }, + { + "epoch": 1.9419127873680766, + "grad_norm": 0.038748107850551605, + "learning_rate": 2.5673424781484936e-08, + "loss": 0.001, + "step": 118680 + }, + { + "epoch": 1.9420764133191524, + "grad_norm": 0.02977623976767063, + "learning_rate": 2.552911252176604e-08, + "loss": 0.0006, + "step": 118690 + }, + { + "epoch": 1.9422400392702284, + "grad_norm": 0.05881514400243759, + "learning_rate": 2.5385205962967784e-08, + "loss": 0.0014, + "step": 118700 + }, + { + "epoch": 1.9424036652213041, + "grad_norm": 0.03750619292259216, + "learning_rate": 2.5241705116826887e-08, + "loss": 0.0013, + "step": 118710 + }, + { + "epoch": 1.94256729117238, + "grad_norm": 0.02984338253736496, + "learning_rate": 2.509860999504732e-08, + "loss": 0.0008, + "step": 118720 + }, + { + "epoch": 1.942730917123456, + "grad_norm": 0.014807168394327164, + "learning_rate": 2.4955920609298635e-08, + "loss": 0.0004, + "step": 118730 + }, + { + "epoch": 1.9428945430745315, + "grad_norm": 0.05217082053422928, + "learning_rate": 2.4813636971218747e-08, + "loss": 0.0006, + "step": 118740 + }, + { + "epoch": 1.9430581690256075, + "grad_norm": 0.033538561314344406, + "learning_rate": 2.4671759092411708e-08, + "loss": 0.0006, + "step": 118750 + }, + { + "epoch": 1.9432217949766835, + "grad_norm": 0.057510845363140106, + "learning_rate": 2.453028698444826e-08, + "loss": 0.0007, + "step": 118760 + }, + { + "epoch": 1.943385420927759, + "grad_norm": 0.03222835808992386, + "learning_rate": 2.4389220658866952e-08, + "loss": 0.0009, + "step": 118770 + }, + { + "epoch": 1.943549046878835, + "grad_norm": 0.00914396345615387, + "learning_rate": 2.4248560127171915e-08, + "loss": 0.0008, + "step": 118780 + }, + { + "epoch": 1.9437126728299108, + "grad_norm": 0.017285292968153954, + "learning_rate": 2.4108305400835086e-08, + "loss": 0.0005, + "step": 118790 + }, + { + "epoch": 1.9438762987809866, + "grad_norm": 0.04153028875589371, + "learning_rate": 2.3968456491295643e-08, + "loss": 0.0006, + "step": 118800 + }, + { + "epoch": 1.9440399247320626, + "grad_norm": 0.0309784933924675, + "learning_rate": 2.382901340995947e-08, + "loss": 0.0007, + "step": 118810 + }, + { + "epoch": 1.9442035506831383, + "grad_norm": 0.04131908714771271, + "learning_rate": 2.3689976168198014e-08, + "loss": 0.0006, + "step": 118820 + }, + { + "epoch": 1.944367176634214, + "grad_norm": 0.005361310672014952, + "learning_rate": 2.355134477735166e-08, + "loss": 0.0008, + "step": 118830 + }, + { + "epoch": 1.94453080258529, + "grad_norm": 0.06579025089740753, + "learning_rate": 2.3413119248725803e-08, + "loss": 0.0009, + "step": 118840 + }, + { + "epoch": 1.9446944285363659, + "grad_norm": 0.004515460692346096, + "learning_rate": 2.3275299593594758e-08, + "loss": 0.0005, + "step": 118850 + }, + { + "epoch": 1.9448580544874416, + "grad_norm": 0.011067098937928677, + "learning_rate": 2.3137885823197316e-08, + "loss": 0.001, + "step": 118860 + }, + { + "epoch": 1.9450216804385176, + "grad_norm": 0.03738608956336975, + "learning_rate": 2.3000877948741728e-08, + "loss": 0.0017, + "step": 118870 + }, + { + "epoch": 1.9451853063895934, + "grad_norm": 0.1172296404838562, + "learning_rate": 2.286427598140073e-08, + "loss": 0.0018, + "step": 118880 + }, + { + "epoch": 1.9453489323406692, + "grad_norm": 0.013883393257856369, + "learning_rate": 2.2728079932316515e-08, + "loss": 0.0006, + "step": 118890 + }, + { + "epoch": 1.9455125582917452, + "grad_norm": 0.020110009238123894, + "learning_rate": 2.2592289812595202e-08, + "loss": 0.001, + "step": 118900 + }, + { + "epoch": 1.945676184242821, + "grad_norm": 0.12202437967061996, + "learning_rate": 2.2456905633312377e-08, + "loss": 0.001, + "step": 118910 + }, + { + "epoch": 1.9458398101938967, + "grad_norm": 0.0923973098397255, + "learning_rate": 2.23219274055092e-08, + "loss": 0.0013, + "step": 118920 + }, + { + "epoch": 1.9460034361449727, + "grad_norm": 0.05844296142458916, + "learning_rate": 2.21873551401941e-08, + "loss": 0.0009, + "step": 118930 + }, + { + "epoch": 1.9461670620960483, + "grad_norm": 0.3888278305530548, + "learning_rate": 2.205318884834273e-08, + "loss": 0.001, + "step": 118940 + }, + { + "epoch": 1.9463306880471243, + "grad_norm": 0.04085360839962959, + "learning_rate": 2.1919428540896347e-08, + "loss": 0.0007, + "step": 118950 + }, + { + "epoch": 1.9464943139982003, + "grad_norm": 0.0014444985426962376, + "learning_rate": 2.1786074228764552e-08, + "loss": 0.0007, + "step": 118960 + }, + { + "epoch": 1.9466579399492758, + "grad_norm": 0.042099274694919586, + "learning_rate": 2.1653125922823648e-08, + "loss": 0.0008, + "step": 118970 + }, + { + "epoch": 1.9468215659003518, + "grad_norm": 0.05133488029241562, + "learning_rate": 2.1520583633915514e-08, + "loss": 0.0006, + "step": 118980 + }, + { + "epoch": 1.9469851918514276, + "grad_norm": 0.026576509699225426, + "learning_rate": 2.1388447372850397e-08, + "loss": 0.0008, + "step": 118990 + }, + { + "epoch": 1.9471488178025034, + "grad_norm": 0.0730036050081253, + "learning_rate": 2.125671715040467e-08, + "loss": 0.0011, + "step": 119000 + }, + { + "epoch": 1.9473124437535794, + "grad_norm": 0.09758596867322922, + "learning_rate": 2.1125392977322524e-08, + "loss": 0.0027, + "step": 119010 + }, + { + "epoch": 1.9474760697046551, + "grad_norm": 0.03471507132053375, + "learning_rate": 2.0994474864313718e-08, + "loss": 0.0004, + "step": 119020 + }, + { + "epoch": 1.947639695655731, + "grad_norm": 0.01150267943739891, + "learning_rate": 2.086396282205472e-08, + "loss": 0.0003, + "step": 119030 + }, + { + "epoch": 1.947803321606807, + "grad_norm": 0.0543401874601841, + "learning_rate": 2.0733856861190894e-08, + "loss": 0.0018, + "step": 119040 + }, + { + "epoch": 1.9479669475578827, + "grad_norm": 0.02571258693933487, + "learning_rate": 2.0604156992332645e-08, + "loss": 0.0009, + "step": 119050 + }, + { + "epoch": 1.9481305735089585, + "grad_norm": 0.026605796068906784, + "learning_rate": 2.0474863226058184e-08, + "loss": 0.0006, + "step": 119060 + }, + { + "epoch": 1.9482941994600345, + "grad_norm": 0.06606029719114304, + "learning_rate": 2.0345975572911848e-08, + "loss": 0.0009, + "step": 119070 + }, + { + "epoch": 1.9484578254111102, + "grad_norm": 0.01350910123437643, + "learning_rate": 2.0217494043405783e-08, + "loss": 0.001, + "step": 119080 + }, + { + "epoch": 1.948621451362186, + "grad_norm": 0.0011455549392849207, + "learning_rate": 2.0089418648018278e-08, + "loss": 0.0007, + "step": 119090 + }, + { + "epoch": 1.948785077313262, + "grad_norm": 0.0849999263882637, + "learning_rate": 1.996174939719431e-08, + "loss": 0.0017, + "step": 119100 + }, + { + "epoch": 1.9489487032643378, + "grad_norm": 0.01061960682272911, + "learning_rate": 1.9834486301346655e-08, + "loss": 0.0018, + "step": 119110 + }, + { + "epoch": 1.9491123292154136, + "grad_norm": 0.07178749144077301, + "learning_rate": 1.9707629370854243e-08, + "loss": 0.0007, + "step": 119120 + }, + { + "epoch": 1.9492759551664895, + "grad_norm": 0.02332564815878868, + "learning_rate": 1.9581178616063235e-08, + "loss": 0.0006, + "step": 119130 + }, + { + "epoch": 1.949439581117565, + "grad_norm": 0.059939831495285034, + "learning_rate": 1.9455134047286495e-08, + "loss": 0.0012, + "step": 119140 + }, + { + "epoch": 1.949603207068641, + "grad_norm": 0.009971341118216515, + "learning_rate": 1.9329495674803577e-08, + "loss": 0.0003, + "step": 119150 + }, + { + "epoch": 1.9497668330197169, + "grad_norm": 0.024140900000929832, + "learning_rate": 1.920426350886129e-08, + "loss": 0.0019, + "step": 119160 + }, + { + "epoch": 1.9499304589707926, + "grad_norm": 0.04765401780605316, + "learning_rate": 1.9079437559673673e-08, + "loss": 0.0008, + "step": 119170 + }, + { + "epoch": 1.9500940849218686, + "grad_norm": 0.018852246925234795, + "learning_rate": 1.895501783741982e-08, + "loss": 0.0007, + "step": 119180 + }, + { + "epoch": 1.9502577108729444, + "grad_norm": 0.026595139876008034, + "learning_rate": 1.8831004352248273e-08, + "loss": 0.001, + "step": 119190 + }, + { + "epoch": 1.9504213368240202, + "grad_norm": 0.02010231651365757, + "learning_rate": 1.8707397114272607e-08, + "loss": 0.0021, + "step": 119200 + }, + { + "epoch": 1.9505849627750962, + "grad_norm": 0.09063665568828583, + "learning_rate": 1.8584196133573095e-08, + "loss": 0.0008, + "step": 119210 + }, + { + "epoch": 1.950748588726172, + "grad_norm": 0.04249636456370354, + "learning_rate": 1.846140142019892e-08, + "loss": 0.0008, + "step": 119220 + }, + { + "epoch": 1.9509122146772477, + "grad_norm": 0.08367861807346344, + "learning_rate": 1.8339012984164296e-08, + "loss": 0.0021, + "step": 119230 + }, + { + "epoch": 1.9510758406283237, + "grad_norm": 0.03771822154521942, + "learning_rate": 1.8217030835450678e-08, + "loss": 0.0016, + "step": 119240 + }, + { + "epoch": 1.9512394665793995, + "grad_norm": 0.03222620114684105, + "learning_rate": 1.8095454984006222e-08, + "loss": 0.0008, + "step": 119250 + }, + { + "epoch": 1.9514030925304753, + "grad_norm": 0.01875535026192665, + "learning_rate": 1.797428543974633e-08, + "loss": 0.0009, + "step": 119260 + }, + { + "epoch": 1.9515667184815513, + "grad_norm": 0.021393340080976486, + "learning_rate": 1.7853522212553652e-08, + "loss": 0.001, + "step": 119270 + }, + { + "epoch": 1.951730344432627, + "grad_norm": 0.09135446697473526, + "learning_rate": 1.7733165312277533e-08, + "loss": 0.0008, + "step": 119280 + }, + { + "epoch": 1.9518939703837028, + "grad_norm": 0.004633698146790266, + "learning_rate": 1.7613214748732344e-08, + "loss": 0.0005, + "step": 119290 + }, + { + "epoch": 1.9520575963347788, + "grad_norm": 0.029996832832694054, + "learning_rate": 1.7493670531702478e-08, + "loss": 0.0009, + "step": 119300 + }, + { + "epoch": 1.9522212222858544, + "grad_norm": 0.05384202301502228, + "learning_rate": 1.7374532670936804e-08, + "loss": 0.0006, + "step": 119310 + }, + { + "epoch": 1.9523848482369304, + "grad_norm": 0.023778783157467842, + "learning_rate": 1.7255801176151998e-08, + "loss": 0.0003, + "step": 119320 + }, + { + "epoch": 1.9525484741880064, + "grad_norm": 0.05338103696703911, + "learning_rate": 1.7137476057031423e-08, + "loss": 0.0006, + "step": 119330 + }, + { + "epoch": 1.952712100139082, + "grad_norm": 0.03128186613321304, + "learning_rate": 1.7019557323224577e-08, + "loss": 0.0032, + "step": 119340 + }, + { + "epoch": 1.952875726090158, + "grad_norm": 0.07055889070034027, + "learning_rate": 1.6902044984349332e-08, + "loss": 0.0011, + "step": 119350 + }, + { + "epoch": 1.9530393520412337, + "grad_norm": 0.03257028013467789, + "learning_rate": 1.6784939049989123e-08, + "loss": 0.0005, + "step": 119360 + }, + { + "epoch": 1.9532029779923095, + "grad_norm": 0.0605674609541893, + "learning_rate": 1.6668239529695208e-08, + "loss": 0.0005, + "step": 119370 + }, + { + "epoch": 1.9533666039433855, + "grad_norm": 0.06829767674207687, + "learning_rate": 1.6551946432984966e-08, + "loss": 0.0069, + "step": 119380 + }, + { + "epoch": 1.9535302298944612, + "grad_norm": 0.037804704159498215, + "learning_rate": 1.643605976934248e-08, + "loss": 0.001, + "step": 119390 + }, + { + "epoch": 1.953693855845537, + "grad_norm": 0.013051602058112621, + "learning_rate": 1.6320579548219638e-08, + "loss": 0.0006, + "step": 119400 + }, + { + "epoch": 1.953857481796613, + "grad_norm": 0.0068069882690906525, + "learning_rate": 1.620550577903446e-08, + "loss": 0.0009, + "step": 119410 + }, + { + "epoch": 1.9540211077476888, + "grad_norm": 0.006176062393933535, + "learning_rate": 1.6090838471171655e-08, + "loss": 0.0006, + "step": 119420 + }, + { + "epoch": 1.9541847336987646, + "grad_norm": 0.0808325782418251, + "learning_rate": 1.597657763398375e-08, + "loss": 0.0015, + "step": 119430 + }, + { + "epoch": 1.9543483596498405, + "grad_norm": 0.06635306775569916, + "learning_rate": 1.586272327678884e-08, + "loss": 0.0007, + "step": 119440 + }, + { + "epoch": 1.9545119856009163, + "grad_norm": 0.054228853434324265, + "learning_rate": 1.574927540887283e-08, + "loss": 0.0007, + "step": 119450 + }, + { + "epoch": 1.954675611551992, + "grad_norm": 0.07508311420679092, + "learning_rate": 1.5636234039487773e-08, + "loss": 0.0006, + "step": 119460 + }, + { + "epoch": 1.954839237503068, + "grad_norm": 0.026145469397306442, + "learning_rate": 1.552359917785351e-08, + "loss": 0.0016, + "step": 119470 + }, + { + "epoch": 1.9550028634541439, + "grad_norm": 0.004232988227158785, + "learning_rate": 1.541137083315547e-08, + "loss": 0.0008, + "step": 119480 + }, + { + "epoch": 1.9551664894052196, + "grad_norm": 0.03720800578594208, + "learning_rate": 1.5299549014546887e-08, + "loss": 0.0005, + "step": 119490 + }, + { + "epoch": 1.9553301153562956, + "grad_norm": 0.023317476734519005, + "learning_rate": 1.5188133731148248e-08, + "loss": 0.0012, + "step": 119500 + }, + { + "epoch": 1.9554937413073712, + "grad_norm": 0.07464942336082458, + "learning_rate": 1.5077124992045057e-08, + "loss": 0.0009, + "step": 119510 + }, + { + "epoch": 1.9556573672584472, + "grad_norm": 0.04751353710889816, + "learning_rate": 1.496652280629174e-08, + "loss": 0.0014, + "step": 119520 + }, + { + "epoch": 1.9558209932095232, + "grad_norm": 0.011533886194229126, + "learning_rate": 1.4856327182908304e-08, + "loss": 0.0012, + "step": 119530 + }, + { + "epoch": 1.9559846191605987, + "grad_norm": 0.04342764988541603, + "learning_rate": 1.474653813088145e-08, + "loss": 0.0009, + "step": 119540 + }, + { + "epoch": 1.9561482451116747, + "grad_norm": 0.026417843997478485, + "learning_rate": 1.4637155659166236e-08, + "loss": 0.0013, + "step": 119550 + }, + { + "epoch": 1.9563118710627505, + "grad_norm": 0.16043344140052795, + "learning_rate": 1.4528179776682749e-08, + "loss": 0.0009, + "step": 119560 + }, + { + "epoch": 1.9564754970138263, + "grad_norm": 0.03958956152200699, + "learning_rate": 1.4419610492318881e-08, + "loss": 0.0011, + "step": 119570 + }, + { + "epoch": 1.9566391229649023, + "grad_norm": 0.05950172618031502, + "learning_rate": 1.4311447814928658e-08, + "loss": 0.0004, + "step": 119580 + }, + { + "epoch": 1.956802748915978, + "grad_norm": 0.043503303080797195, + "learning_rate": 1.4203691753335025e-08, + "loss": 0.0007, + "step": 119590 + }, + { + "epoch": 1.9569663748670538, + "grad_norm": 0.1272813230752945, + "learning_rate": 1.4096342316324285e-08, + "loss": 0.0008, + "step": 119600 + }, + { + "epoch": 1.9571300008181298, + "grad_norm": 0.07593804597854614, + "learning_rate": 1.3989399512652768e-08, + "loss": 0.0017, + "step": 119610 + }, + { + "epoch": 1.9572936267692056, + "grad_norm": 0.04772377386689186, + "learning_rate": 1.3882863351041276e-08, + "loss": 0.0009, + "step": 119620 + }, + { + "epoch": 1.9574572527202814, + "grad_norm": 0.0030029609333723783, + "learning_rate": 1.377673384018008e-08, + "loss": 0.0004, + "step": 119630 + }, + { + "epoch": 1.9576208786713574, + "grad_norm": 0.03763430193066597, + "learning_rate": 1.3671010988723365e-08, + "loss": 0.0005, + "step": 119640 + }, + { + "epoch": 1.9577845046224331, + "grad_norm": 0.04394448921084404, + "learning_rate": 1.3565694805294238e-08, + "loss": 0.0014, + "step": 119650 + }, + { + "epoch": 1.957948130573509, + "grad_norm": 0.013911603949964046, + "learning_rate": 1.3460785298481938e-08, + "loss": 0.0008, + "step": 119660 + }, + { + "epoch": 1.958111756524585, + "grad_norm": 0.00789094902575016, + "learning_rate": 1.3356282476841843e-08, + "loss": 0.0029, + "step": 119670 + }, + { + "epoch": 1.9582753824756607, + "grad_norm": 0.0644741877913475, + "learning_rate": 1.3252186348897688e-08, + "loss": 0.0008, + "step": 119680 + }, + { + "epoch": 1.9584390084267365, + "grad_norm": 0.01335859950631857, + "learning_rate": 1.3148496923138799e-08, + "loss": 0.0003, + "step": 119690 + }, + { + "epoch": 1.9586026343778125, + "grad_norm": 0.05094321444630623, + "learning_rate": 1.3045214208021739e-08, + "loss": 0.0009, + "step": 119700 + }, + { + "epoch": 1.958766260328888, + "grad_norm": 0.04956056922674179, + "learning_rate": 1.2942338211969775e-08, + "loss": 0.0009, + "step": 119710 + }, + { + "epoch": 1.958929886279964, + "grad_norm": 0.04690057411789894, + "learning_rate": 1.2839868943373413e-08, + "loss": 0.0007, + "step": 119720 + }, + { + "epoch": 1.95909351223104, + "grad_norm": 0.0027065288741141558, + "learning_rate": 1.2737806410589304e-08, + "loss": 0.0006, + "step": 119730 + }, + { + "epoch": 1.9592571381821156, + "grad_norm": 0.021053733304142952, + "learning_rate": 1.26361506219419e-08, + "loss": 0.0006, + "step": 119740 + }, + { + "epoch": 1.9594207641331916, + "grad_norm": 0.023141274228692055, + "learning_rate": 1.2534901585721238e-08, + "loss": 0.001, + "step": 119750 + }, + { + "epoch": 1.9595843900842673, + "grad_norm": 0.09062420576810837, + "learning_rate": 1.2434059310185708e-08, + "loss": 0.0008, + "step": 119760 + }, + { + "epoch": 1.959748016035343, + "grad_norm": 0.011048682034015656, + "learning_rate": 1.2333623803558737e-08, + "loss": 0.0004, + "step": 119770 + }, + { + "epoch": 1.959911641986419, + "grad_norm": 0.01725071668624878, + "learning_rate": 1.2233595074031545e-08, + "loss": 0.0008, + "step": 119780 + }, + { + "epoch": 1.9600752679374949, + "grad_norm": 0.026094339787960052, + "learning_rate": 1.2133973129763165e-08, + "loss": 0.0007, + "step": 119790 + }, + { + "epoch": 1.9602388938885706, + "grad_norm": 0.021458825096488, + "learning_rate": 1.2034757978877654e-08, + "loss": 0.0009, + "step": 119800 + }, + { + "epoch": 1.9604025198396466, + "grad_norm": 0.04918216913938522, + "learning_rate": 1.1935949629466315e-08, + "loss": 0.0011, + "step": 119810 + }, + { + "epoch": 1.9605661457907224, + "grad_norm": 0.07723425328731537, + "learning_rate": 1.1837548089588258e-08, + "loss": 0.0006, + "step": 119820 + }, + { + "epoch": 1.9607297717417982, + "grad_norm": 0.012126307003200054, + "learning_rate": 1.173955336726873e-08, + "loss": 0.0005, + "step": 119830 + }, + { + "epoch": 1.9608933976928742, + "grad_norm": 0.13935650885105133, + "learning_rate": 1.1641965470499672e-08, + "loss": 0.0015, + "step": 119840 + }, + { + "epoch": 1.96105702364395, + "grad_norm": 0.07405298203229904, + "learning_rate": 1.1544784407240273e-08, + "loss": 0.0004, + "step": 119850 + }, + { + "epoch": 1.9612206495950257, + "grad_norm": 0.11607611924409866, + "learning_rate": 1.1448010185415859e-08, + "loss": 0.001, + "step": 119860 + }, + { + "epoch": 1.9613842755461017, + "grad_norm": 0.008421391248703003, + "learning_rate": 1.1351642812919006e-08, + "loss": 0.0008, + "step": 119870 + }, + { + "epoch": 1.9615479014971775, + "grad_norm": 0.00933240819722414, + "learning_rate": 1.1255682297609539e-08, + "loss": 0.0008, + "step": 119880 + }, + { + "epoch": 1.9617115274482533, + "grad_norm": 0.060162998735904694, + "learning_rate": 1.1160128647313417e-08, + "loss": 0.0007, + "step": 119890 + }, + { + "epoch": 1.9618751533993293, + "grad_norm": 0.02956484630703926, + "learning_rate": 1.1064981869823855e-08, + "loss": 0.0013, + "step": 119900 + }, + { + "epoch": 1.9620387793504048, + "grad_norm": 0.07528740912675858, + "learning_rate": 1.0970241972900198e-08, + "loss": 0.001, + "step": 119910 + }, + { + "epoch": 1.9622024053014808, + "grad_norm": 0.02967989258468151, + "learning_rate": 1.08759089642696e-08, + "loss": 0.0005, + "step": 119920 + }, + { + "epoch": 1.9623660312525568, + "grad_norm": 0.11306829750537872, + "learning_rate": 1.078198285162535e-08, + "loss": 0.0004, + "step": 119930 + }, + { + "epoch": 1.9625296572036324, + "grad_norm": 0.028859227895736694, + "learning_rate": 1.0688463642627989e-08, + "loss": 0.0013, + "step": 119940 + }, + { + "epoch": 1.9626932831547084, + "grad_norm": 0.055813319981098175, + "learning_rate": 1.0595351344904192e-08, + "loss": 0.0013, + "step": 119950 + }, + { + "epoch": 1.9628569091057841, + "grad_norm": 0.006267130374908447, + "learning_rate": 1.050264596604844e-08, + "loss": 0.0009, + "step": 119960 + }, + { + "epoch": 1.96302053505686, + "grad_norm": 0.019412953406572342, + "learning_rate": 1.04103475136208e-08, + "loss": 0.0007, + "step": 119970 + }, + { + "epoch": 1.963184161007936, + "grad_norm": 0.013167980127036572, + "learning_rate": 1.0318455995149689e-08, + "loss": 0.0008, + "step": 119980 + }, + { + "epoch": 1.9633477869590117, + "grad_norm": 0.06747866421937943, + "learning_rate": 1.0226971418128562e-08, + "loss": 0.001, + "step": 119990 + }, + { + "epoch": 1.9635114129100875, + "grad_norm": 0.013039028272032738, + "learning_rate": 1.013589379001867e-08, + "loss": 0.0009, + "step": 120000 + }, + { + "epoch": 1.9636750388611635, + "grad_norm": 0.006112654227763414, + "learning_rate": 1.0045223118248514e-08, + "loss": 0.0004, + "step": 120010 + }, + { + "epoch": 1.9638386648122392, + "grad_norm": 0.10876543074846268, + "learning_rate": 9.954959410212738e-09, + "loss": 0.0016, + "step": 120020 + }, + { + "epoch": 1.964002290763315, + "grad_norm": 0.011344965547323227, + "learning_rate": 9.865102673273231e-09, + "loss": 0.0004, + "step": 120030 + }, + { + "epoch": 1.964165916714391, + "grad_norm": 0.012625058181583881, + "learning_rate": 9.77565291475746e-09, + "loss": 0.0003, + "step": 120040 + }, + { + "epoch": 1.9643295426654668, + "grad_norm": 0.044346991926431656, + "learning_rate": 9.686610141961816e-09, + "loss": 0.001, + "step": 120050 + }, + { + "epoch": 1.9644931686165426, + "grad_norm": 0.0013447734527289867, + "learning_rate": 9.597974362147156e-09, + "loss": 0.0005, + "step": 120060 + }, + { + "epoch": 1.9646567945676185, + "grad_norm": 0.019491171464323997, + "learning_rate": 9.50974558254325e-09, + "loss": 0.0007, + "step": 120070 + }, + { + "epoch": 1.964820420518694, + "grad_norm": 0.014783798716962337, + "learning_rate": 9.421923810345457e-09, + "loss": 0.0008, + "step": 120080 + }, + { + "epoch": 1.96498404646977, + "grad_norm": 0.01749807596206665, + "learning_rate": 9.334509052715823e-09, + "loss": 0.0019, + "step": 120090 + }, + { + "epoch": 1.965147672420846, + "grad_norm": 0.21147726476192474, + "learning_rate": 9.247501316784202e-09, + "loss": 0.0006, + "step": 120100 + }, + { + "epoch": 1.9653112983719216, + "grad_norm": 0.07809248566627502, + "learning_rate": 9.16090060964603e-09, + "loss": 0.0009, + "step": 120110 + }, + { + "epoch": 1.9654749243229976, + "grad_norm": 0.19455690681934357, + "learning_rate": 9.074706938364542e-09, + "loss": 0.0012, + "step": 120120 + }, + { + "epoch": 1.9656385502740734, + "grad_norm": 0.04191499948501587, + "learning_rate": 8.988920309969673e-09, + "loss": 0.0011, + "step": 120130 + }, + { + "epoch": 1.9658021762251492, + "grad_norm": 0.12729717791080475, + "learning_rate": 8.903540731457494e-09, + "loss": 0.0008, + "step": 120140 + }, + { + "epoch": 1.9659658021762252, + "grad_norm": 0.018768733367323875, + "learning_rate": 8.818568209791323e-09, + "loss": 0.0013, + "step": 120150 + }, + { + "epoch": 1.966129428127301, + "grad_norm": 0.0029396044556051493, + "learning_rate": 8.734002751901171e-09, + "loss": 0.001, + "step": 120160 + }, + { + "epoch": 1.9662930540783767, + "grad_norm": 0.06560855358839035, + "learning_rate": 8.649844364684301e-09, + "loss": 0.0007, + "step": 120170 + }, + { + "epoch": 1.9664566800294527, + "grad_norm": 0.004699551500380039, + "learning_rate": 8.566093055003555e-09, + "loss": 0.0005, + "step": 120180 + }, + { + "epoch": 1.9666203059805285, + "grad_norm": 0.026043064892292023, + "learning_rate": 8.482748829690134e-09, + "loss": 0.0004, + "step": 120190 + }, + { + "epoch": 1.9667839319316043, + "grad_norm": 0.04580653831362724, + "learning_rate": 8.399811695541383e-09, + "loss": 0.0011, + "step": 120200 + }, + { + "epoch": 1.9669475578826803, + "grad_norm": 0.08058738708496094, + "learning_rate": 8.317281659320774e-09, + "loss": 0.0014, + "step": 120210 + }, + { + "epoch": 1.967111183833756, + "grad_norm": 0.07367295771837234, + "learning_rate": 8.235158727759596e-09, + "loss": 0.0008, + "step": 120220 + }, + { + "epoch": 1.9672748097848318, + "grad_norm": 0.003173008793964982, + "learning_rate": 8.15344290755582e-09, + "loss": 0.0006, + "step": 120230 + }, + { + "epoch": 1.9674384357359078, + "grad_norm": 0.020336421206593513, + "learning_rate": 8.072134205373005e-09, + "loss": 0.0009, + "step": 120240 + }, + { + "epoch": 1.9676020616869836, + "grad_norm": 0.02688850834965706, + "learning_rate": 7.991232627843071e-09, + "loss": 0.0004, + "step": 120250 + }, + { + "epoch": 1.9677656876380594, + "grad_norm": 0.18842047452926636, + "learning_rate": 7.910738181563515e-09, + "loss": 0.0017, + "step": 120260 + }, + { + "epoch": 1.9679293135891354, + "grad_norm": 0.0487213209271431, + "learning_rate": 7.830650873100198e-09, + "loss": 0.0008, + "step": 120270 + }, + { + "epoch": 1.968092939540211, + "grad_norm": 0.012311895377933979, + "learning_rate": 7.750970708984007e-09, + "loss": 0.0004, + "step": 120280 + }, + { + "epoch": 1.968256565491287, + "grad_norm": 0.013489848002791405, + "learning_rate": 7.671697695713632e-09, + "loss": 0.0008, + "step": 120290 + }, + { + "epoch": 1.968420191442363, + "grad_norm": 0.08853311091661453, + "learning_rate": 7.5928318397539e-09, + "loss": 0.0004, + "step": 120300 + }, + { + "epoch": 1.9685838173934385, + "grad_norm": 0.01457985956221819, + "learning_rate": 7.514373147537446e-09, + "loss": 0.0008, + "step": 120310 + }, + { + "epoch": 1.9687474433445145, + "grad_norm": 0.05608929693698883, + "learning_rate": 7.436321625463039e-09, + "loss": 0.0005, + "step": 120320 + }, + { + "epoch": 1.9689110692955902, + "grad_norm": 0.013787472620606422, + "learning_rate": 7.3586772798955874e-09, + "loss": 0.0006, + "step": 120330 + }, + { + "epoch": 1.969074695246666, + "grad_norm": 0.008914037607610226, + "learning_rate": 7.281440117168359e-09, + "loss": 0.0005, + "step": 120340 + }, + { + "epoch": 1.969238321197742, + "grad_norm": 0.015239895321428776, + "learning_rate": 7.204610143579649e-09, + "loss": 0.0013, + "step": 120350 + }, + { + "epoch": 1.9694019471488178, + "grad_norm": 0.13767629861831665, + "learning_rate": 7.128187365396666e-09, + "loss": 0.0013, + "step": 120360 + }, + { + "epoch": 1.9695655730998936, + "grad_norm": 0.03583362326025963, + "learning_rate": 7.052171788851092e-09, + "loss": 0.0008, + "step": 120370 + }, + { + "epoch": 1.9697291990509695, + "grad_norm": 0.009943093173205853, + "learning_rate": 6.976563420142968e-09, + "loss": 0.0021, + "step": 120380 + }, + { + "epoch": 1.9698928250020453, + "grad_norm": 0.10988359898328781, + "learning_rate": 6.901362265438471e-09, + "loss": 0.0008, + "step": 120390 + }, + { + "epoch": 1.970056450953121, + "grad_norm": 0.004041541367769241, + "learning_rate": 6.826568330871031e-09, + "loss": 0.0012, + "step": 120400 + }, + { + "epoch": 1.970220076904197, + "grad_norm": 0.060312848538160324, + "learning_rate": 6.75218162254021e-09, + "loss": 0.0008, + "step": 120410 + }, + { + "epoch": 1.9703837028552729, + "grad_norm": 0.0006657483172602952, + "learning_rate": 6.678202146513379e-09, + "loss": 0.0006, + "step": 120420 + }, + { + "epoch": 1.9705473288063486, + "grad_norm": 0.037375181913375854, + "learning_rate": 6.60462990882349e-09, + "loss": 0.0008, + "step": 120430 + }, + { + "epoch": 1.9707109547574246, + "grad_norm": 0.09979595243930817, + "learning_rate": 6.5314649154707425e-09, + "loss": 0.001, + "step": 120440 + }, + { + "epoch": 1.9708745807085004, + "grad_norm": 0.038665417581796646, + "learning_rate": 6.4587071724225845e-09, + "loss": 0.0011, + "step": 120450 + }, + { + "epoch": 1.9710382066595762, + "grad_norm": 0.07766951620578766, + "learning_rate": 6.38635668561316e-09, + "loss": 0.0008, + "step": 120460 + }, + { + "epoch": 1.9712018326106522, + "grad_norm": 0.11505376547574997, + "learning_rate": 6.314413460942192e-09, + "loss": 0.001, + "step": 120470 + }, + { + "epoch": 1.9713654585617277, + "grad_norm": 0.019717954099178314, + "learning_rate": 6.242877504278322e-09, + "loss": 0.0004, + "step": 120480 + }, + { + "epoch": 1.9715290845128037, + "grad_norm": 0.00963100790977478, + "learning_rate": 6.17174882145466e-09, + "loss": 0.0006, + "step": 120490 + }, + { + "epoch": 1.9716927104638797, + "grad_norm": 0.018108205869793892, + "learning_rate": 6.101027418272676e-09, + "loss": 0.0013, + "step": 120500 + }, + { + "epoch": 1.9718563364149553, + "grad_norm": 0.04570852592587471, + "learning_rate": 6.030713300499979e-09, + "loss": 0.0012, + "step": 120510 + }, + { + "epoch": 1.9720199623660313, + "grad_norm": 0.09894517809152603, + "learning_rate": 5.960806473871983e-09, + "loss": 0.001, + "step": 120520 + }, + { + "epoch": 1.972183588317107, + "grad_norm": 0.05172663927078247, + "learning_rate": 5.891306944088571e-09, + "loss": 0.001, + "step": 120530 + }, + { + "epoch": 1.9723472142681828, + "grad_norm": 0.03273118659853935, + "learning_rate": 5.822214716819652e-09, + "loss": 0.0006, + "step": 120540 + }, + { + "epoch": 1.9725108402192588, + "grad_norm": 0.06041044369339943, + "learning_rate": 5.753529797698499e-09, + "loss": 0.0007, + "step": 120550 + }, + { + "epoch": 1.9726744661703346, + "grad_norm": 0.002393483417108655, + "learning_rate": 5.6852521923278505e-09, + "loss": 0.0009, + "step": 120560 + }, + { + "epoch": 1.9728380921214104, + "grad_norm": 0.128173366189003, + "learning_rate": 5.617381906276031e-09, + "loss": 0.0012, + "step": 120570 + }, + { + "epoch": 1.9730017180724864, + "grad_norm": 0.03880898281931877, + "learning_rate": 5.5499189450780585e-09, + "loss": 0.0007, + "step": 120580 + }, + { + "epoch": 1.9731653440235621, + "grad_norm": 0.16140474379062653, + "learning_rate": 5.482863314236197e-09, + "loss": 0.001, + "step": 120590 + }, + { + "epoch": 1.973328969974638, + "grad_norm": 0.06171557679772377, + "learning_rate": 5.416215019219406e-09, + "loss": 0.0017, + "step": 120600 + }, + { + "epoch": 1.973492595925714, + "grad_norm": 0.045106757432222366, + "learning_rate": 5.349974065462782e-09, + "loss": 0.0009, + "step": 120610 + }, + { + "epoch": 1.9736562218767897, + "grad_norm": 0.019132466986775398, + "learning_rate": 5.2841404583692245e-09, + "loss": 0.0003, + "step": 120620 + }, + { + "epoch": 1.9738198478278655, + "grad_norm": 0.27232876420021057, + "learning_rate": 5.218714203307218e-09, + "loss": 0.0016, + "step": 120630 + }, + { + "epoch": 1.9739834737789415, + "grad_norm": 0.05358761548995972, + "learning_rate": 5.153695305613604e-09, + "loss": 0.0007, + "step": 120640 + }, + { + "epoch": 1.9741470997300172, + "grad_norm": 0.08669502288103104, + "learning_rate": 5.089083770590253e-09, + "loss": 0.0011, + "step": 120650 + }, + { + "epoch": 1.974310725681093, + "grad_norm": 0.022912029176950455, + "learning_rate": 5.024879603507393e-09, + "loss": 0.0009, + "step": 120660 + }, + { + "epoch": 1.974474351632169, + "grad_norm": 0.025878889486193657, + "learning_rate": 4.961082809600837e-09, + "loss": 0.0007, + "step": 120670 + }, + { + "epoch": 1.9746379775832446, + "grad_norm": 0.028424885123968124, + "learning_rate": 4.897693394074199e-09, + "loss": 0.0018, + "step": 120680 + }, + { + "epoch": 1.9748016035343205, + "grad_norm": 0.05567040666937828, + "learning_rate": 4.834711362096678e-09, + "loss": 0.0015, + "step": 120690 + }, + { + "epoch": 1.9749652294853965, + "grad_norm": 0.057673078030347824, + "learning_rate": 4.772136718804721e-09, + "loss": 0.0007, + "step": 120700 + }, + { + "epoch": 1.975128855436472, + "grad_norm": 0.04457862302660942, + "learning_rate": 4.709969469302023e-09, + "loss": 0.0007, + "step": 120710 + }, + { + "epoch": 1.975292481387548, + "grad_norm": 0.012798531912267208, + "learning_rate": 4.648209618658972e-09, + "loss": 0.0017, + "step": 120720 + }, + { + "epoch": 1.9754561073386239, + "grad_norm": 0.019399434328079224, + "learning_rate": 4.586857171912651e-09, + "loss": 0.0004, + "step": 120730 + }, + { + "epoch": 1.9756197332896996, + "grad_norm": 0.015058878809213638, + "learning_rate": 4.525912134066279e-09, + "loss": 0.0008, + "step": 120740 + }, + { + "epoch": 1.9757833592407756, + "grad_norm": 0.0758814588189125, + "learning_rate": 4.4653745100903255e-09, + "loss": 0.001, + "step": 120750 + }, + { + "epoch": 1.9759469851918514, + "grad_norm": 0.04663432389497757, + "learning_rate": 4.405244304921952e-09, + "loss": 0.0006, + "step": 120760 + }, + { + "epoch": 1.9761106111429272, + "grad_norm": 0.05779373273253441, + "learning_rate": 4.345521523465568e-09, + "loss": 0.0005, + "step": 120770 + }, + { + "epoch": 1.9762742370940032, + "grad_norm": 0.04826616495847702, + "learning_rate": 4.286206170591167e-09, + "loss": 0.0006, + "step": 120780 + }, + { + "epoch": 1.976437863045079, + "grad_norm": 0.060472290962934494, + "learning_rate": 4.227298251137657e-09, + "loss": 0.0005, + "step": 120790 + }, + { + "epoch": 1.9766014889961547, + "grad_norm": 0.017789114266633987, + "learning_rate": 4.168797769908417e-09, + "loss": 0.0008, + "step": 120800 + }, + { + "epoch": 1.9767651149472307, + "grad_norm": 0.03495703265070915, + "learning_rate": 4.1107047316746305e-09, + "loss": 0.0017, + "step": 120810 + }, + { + "epoch": 1.9769287408983065, + "grad_norm": 0.039546020328998566, + "learning_rate": 4.05301914117473e-09, + "loss": 0.0006, + "step": 120820 + }, + { + "epoch": 1.9770923668493823, + "grad_norm": 0.075041763484478, + "learning_rate": 3.9957410031121744e-09, + "loss": 0.0016, + "step": 120830 + }, + { + "epoch": 1.9772559928004583, + "grad_norm": 0.07128296047449112, + "learning_rate": 3.938870322159339e-09, + "loss": 0.0012, + "step": 120840 + }, + { + "epoch": 1.977419618751534, + "grad_norm": 0.022848211228847504, + "learning_rate": 3.8824071029547325e-09, + "loss": 0.0008, + "step": 120850 + }, + { + "epoch": 1.9775832447026098, + "grad_norm": 0.0770878791809082, + "learning_rate": 3.826351350102453e-09, + "loss": 0.0023, + "step": 120860 + }, + { + "epoch": 1.9777468706536858, + "grad_norm": 0.04800952970981598, + "learning_rate": 3.770703068174397e-09, + "loss": 0.0013, + "step": 120870 + }, + { + "epoch": 1.9779104966047614, + "grad_norm": 0.16103595495224, + "learning_rate": 3.7154622617086024e-09, + "loss": 0.0007, + "step": 120880 + }, + { + "epoch": 1.9780741225558374, + "grad_norm": 0.053905077278614044, + "learning_rate": 3.6606289352114633e-09, + "loss": 0.0012, + "step": 120890 + }, + { + "epoch": 1.9782377485069134, + "grad_norm": 0.05038908123970032, + "learning_rate": 3.6062030931544034e-09, + "loss": 0.0005, + "step": 120900 + }, + { + "epoch": 1.978401374457989, + "grad_norm": 0.08015959709882736, + "learning_rate": 3.5521847399766496e-09, + "loss": 0.0009, + "step": 120910 + }, + { + "epoch": 1.978565000409065, + "grad_norm": 0.024472760036587715, + "learning_rate": 3.4985738800824564e-09, + "loss": 0.0006, + "step": 120920 + }, + { + "epoch": 1.9787286263601407, + "grad_norm": 0.04162255674600601, + "learning_rate": 3.4453705178455476e-09, + "loss": 0.0011, + "step": 120930 + }, + { + "epoch": 1.9788922523112165, + "grad_norm": 0.014462755061686039, + "learning_rate": 3.39257465760412e-09, + "loss": 0.0008, + "step": 120940 + }, + { + "epoch": 1.9790558782622925, + "grad_norm": 0.07161793112754822, + "learning_rate": 3.3401863036647276e-09, + "loss": 0.0005, + "step": 120950 + }, + { + "epoch": 1.9792195042133682, + "grad_norm": 0.02279045060276985, + "learning_rate": 3.2882054602995095e-09, + "loss": 0.0013, + "step": 120960 + }, + { + "epoch": 1.979383130164444, + "grad_norm": 0.056222815066576004, + "learning_rate": 3.236632131747852e-09, + "loss": 0.0013, + "step": 120970 + }, + { + "epoch": 1.97954675611552, + "grad_norm": 0.047857679426670074, + "learning_rate": 3.1854663222163907e-09, + "loss": 0.0012, + "step": 120980 + }, + { + "epoch": 1.9797103820665958, + "grad_norm": 0.03302791342139244, + "learning_rate": 3.1347080358773428e-09, + "loss": 0.0005, + "step": 120990 + }, + { + "epoch": 1.9798740080176715, + "grad_norm": 0.02408667467534542, + "learning_rate": 3.084357276870731e-09, + "loss": 0.0017, + "step": 121000 + }, + { + "epoch": 1.9800376339687475, + "grad_norm": 0.040168460458517075, + "learning_rate": 3.034414049303269e-09, + "loss": 0.0012, + "step": 121010 + }, + { + "epoch": 1.9802012599198233, + "grad_norm": 0.1759580373764038, + "learning_rate": 2.984878357247256e-09, + "loss": 0.0019, + "step": 121020 + }, + { + "epoch": 1.980364885870899, + "grad_norm": 0.022691862657666206, + "learning_rate": 2.9357502047439035e-09, + "loss": 0.0005, + "step": 121030 + }, + { + "epoch": 1.980528511821975, + "grad_norm": 0.04220419377088547, + "learning_rate": 2.887029595798896e-09, + "loss": 0.0005, + "step": 121040 + }, + { + "epoch": 1.9806921377730506, + "grad_norm": 0.020097140222787857, + "learning_rate": 2.8387165343862765e-09, + "loss": 0.0007, + "step": 121050 + }, + { + "epoch": 1.9808557637241266, + "grad_norm": 0.03585117682814598, + "learning_rate": 2.790811024445672e-09, + "loss": 0.0015, + "step": 121060 + }, + { + "epoch": 1.9810193896752026, + "grad_norm": 0.06970857828855515, + "learning_rate": 2.7433130698850673e-09, + "loss": 0.0009, + "step": 121070 + }, + { + "epoch": 1.9811830156262782, + "grad_norm": 0.05048614367842674, + "learning_rate": 2.6962226745774754e-09, + "loss": 0.0005, + "step": 121080 + }, + { + "epoch": 1.9813466415773542, + "grad_norm": 0.03118452988564968, + "learning_rate": 2.649539842363713e-09, + "loss": 0.0012, + "step": 121090 + }, + { + "epoch": 1.98151026752843, + "grad_norm": 0.04376658797264099, + "learning_rate": 2.60326457705129e-09, + "loss": 0.0006, + "step": 121100 + }, + { + "epoch": 1.9816738934795057, + "grad_norm": 0.016238750889897346, + "learning_rate": 2.557396882413299e-09, + "loss": 0.0006, + "step": 121110 + }, + { + "epoch": 1.9818375194305817, + "grad_norm": 0.012554849497973919, + "learning_rate": 2.511936762191747e-09, + "loss": 0.0006, + "step": 121120 + }, + { + "epoch": 1.9820011453816575, + "grad_norm": 0.0611734576523304, + "learning_rate": 2.466884220093668e-09, + "loss": 0.0007, + "step": 121130 + }, + { + "epoch": 1.9821647713327333, + "grad_norm": 0.02710803970694542, + "learning_rate": 2.4222392597933464e-09, + "loss": 0.0006, + "step": 121140 + }, + { + "epoch": 1.9823283972838093, + "grad_norm": 0.01143574994057417, + "learning_rate": 2.378001884932313e-09, + "loss": 0.0007, + "step": 121150 + }, + { + "epoch": 1.982492023234885, + "grad_norm": 0.08319120109081268, + "learning_rate": 2.334172099117682e-09, + "loss": 0.0007, + "step": 121160 + }, + { + "epoch": 1.9826556491859608, + "grad_norm": 0.0866430252790451, + "learning_rate": 2.290749905924372e-09, + "loss": 0.0012, + "step": 121170 + }, + { + "epoch": 1.9828192751370368, + "grad_norm": 0.0026253052055835724, + "learning_rate": 2.2477353088945496e-09, + "loss": 0.0005, + "step": 121180 + }, + { + "epoch": 1.9829829010881126, + "grad_norm": 0.08984646946191788, + "learning_rate": 2.2051283115348542e-09, + "loss": 0.0014, + "step": 121190 + }, + { + "epoch": 1.9831465270391884, + "grad_norm": 0.04879742115736008, + "learning_rate": 2.1629289173213942e-09, + "loss": 0.0015, + "step": 121200 + }, + { + "epoch": 1.9833101529902644, + "grad_norm": 0.04284178465604782, + "learning_rate": 2.1211371296947503e-09, + "loss": 0.0011, + "step": 121210 + }, + { + "epoch": 1.9834737789413401, + "grad_norm": 0.043293219059705734, + "learning_rate": 2.079752952064418e-09, + "loss": 0.0009, + "step": 121220 + }, + { + "epoch": 1.983637404892416, + "grad_norm": 0.001439051702618599, + "learning_rate": 2.038776387804919e-09, + "loss": 0.0015, + "step": 121230 + }, + { + "epoch": 1.983801030843492, + "grad_norm": 0.036187853664159775, + "learning_rate": 1.9982074402580266e-09, + "loss": 0.0009, + "step": 121240 + }, + { + "epoch": 1.9839646567945675, + "grad_norm": 0.08715284615755081, + "learning_rate": 1.9580461127327588e-09, + "loss": 0.0007, + "step": 121250 + }, + { + "epoch": 1.9841282827456435, + "grad_norm": 0.007016774732619524, + "learning_rate": 1.9182924085048295e-09, + "loss": 0.0009, + "step": 121260 + }, + { + "epoch": 1.9842919086967195, + "grad_norm": 0.2651830017566681, + "learning_rate": 1.8789463308160894e-09, + "loss": 0.0017, + "step": 121270 + }, + { + "epoch": 1.984455534647795, + "grad_norm": 0.0751025453209877, + "learning_rate": 1.8400078828745283e-09, + "loss": 0.0021, + "step": 121280 + }, + { + "epoch": 1.984619160598871, + "grad_norm": 0.02467581443488598, + "learning_rate": 1.8014770678576043e-09, + "loss": 0.0008, + "step": 121290 + }, + { + "epoch": 1.9847827865499468, + "grad_norm": 0.03527678921818733, + "learning_rate": 1.7633538889066937e-09, + "loss": 0.0017, + "step": 121300 + }, + { + "epoch": 1.9849464125010226, + "grad_norm": 0.010351304896175861, + "learning_rate": 1.7256383491309758e-09, + "loss": 0.0004, + "step": 121310 + }, + { + "epoch": 1.9851100384520985, + "grad_norm": 0.10141579806804657, + "learning_rate": 1.6883304516063238e-09, + "loss": 0.001, + "step": 121320 + }, + { + "epoch": 1.9852736644031743, + "grad_norm": 0.044918958097696304, + "learning_rate": 1.6514301993764138e-09, + "loss": 0.0018, + "step": 121330 + }, + { + "epoch": 1.98543729035425, + "grad_norm": 0.041537243872880936, + "learning_rate": 1.6149375954493952e-09, + "loss": 0.0007, + "step": 121340 + }, + { + "epoch": 1.985600916305326, + "grad_norm": 0.04633361101150513, + "learning_rate": 1.578852642802331e-09, + "loss": 0.0007, + "step": 121350 + }, + { + "epoch": 1.9857645422564019, + "grad_norm": 0.11641356348991394, + "learning_rate": 1.5431753443778674e-09, + "loss": 0.0014, + "step": 121360 + }, + { + "epoch": 1.9859281682074776, + "grad_norm": 0.030262496322393417, + "learning_rate": 1.507905703085899e-09, + "loss": 0.0005, + "step": 121370 + }, + { + "epoch": 1.9860917941585536, + "grad_norm": 0.06622287631034851, + "learning_rate": 1.4730437218030136e-09, + "loss": 0.0009, + "step": 121380 + }, + { + "epoch": 1.9862554201096294, + "grad_norm": 0.010505574755370617, + "learning_rate": 1.4385894033719372e-09, + "loss": 0.0009, + "step": 121390 + }, + { + "epoch": 1.9864190460607052, + "grad_norm": 0.035860270261764526, + "learning_rate": 1.4045427506026443e-09, + "loss": 0.0008, + "step": 121400 + }, + { + "epoch": 1.9865826720117812, + "grad_norm": 0.012280511669814587, + "learning_rate": 1.3709037662729129e-09, + "loss": 0.0006, + "step": 121410 + }, + { + "epoch": 1.986746297962857, + "grad_norm": 0.02275555208325386, + "learning_rate": 1.337672453124994e-09, + "loss": 0.001, + "step": 121420 + }, + { + "epoch": 1.9869099239139327, + "grad_norm": 0.06456863880157471, + "learning_rate": 1.3048488138694971e-09, + "loss": 0.0012, + "step": 121430 + }, + { + "epoch": 1.9870735498650087, + "grad_norm": 0.0017447107238695025, + "learning_rate": 1.2724328511837248e-09, + "loss": 0.0009, + "step": 121440 + }, + { + "epoch": 1.9872371758160843, + "grad_norm": 0.0033917995169758797, + "learning_rate": 1.2404245677111183e-09, + "loss": 0.0006, + "step": 121450 + }, + { + "epoch": 1.9874008017671603, + "grad_norm": 0.03676427900791168, + "learning_rate": 1.2088239660623668e-09, + "loss": 0.0007, + "step": 121460 + }, + { + "epoch": 1.9875644277182363, + "grad_norm": 0.03885538876056671, + "learning_rate": 1.1776310488142983e-09, + "loss": 0.0005, + "step": 121470 + }, + { + "epoch": 1.9877280536693118, + "grad_norm": 0.047191012650728226, + "learning_rate": 1.146845818511544e-09, + "loss": 0.001, + "step": 121480 + }, + { + "epoch": 1.9878916796203878, + "grad_norm": 0.0196328517049551, + "learning_rate": 1.1164682776637625e-09, + "loss": 0.0009, + "step": 121490 + }, + { + "epoch": 1.9880553055714636, + "grad_norm": 0.00553932087495923, + "learning_rate": 1.0864984287500823e-09, + "loss": 0.0007, + "step": 121500 + }, + { + "epoch": 1.9882189315225394, + "grad_norm": 0.07081619650125504, + "learning_rate": 1.056936274212994e-09, + "loss": 0.0009, + "step": 121510 + }, + { + "epoch": 1.9883825574736154, + "grad_norm": 0.058189865201711655, + "learning_rate": 1.0277818164650122e-09, + "loss": 0.0007, + "step": 121520 + }, + { + "epoch": 1.9885461834246911, + "grad_norm": 0.030802302062511444, + "learning_rate": 9.990350578825692e-10, + "loss": 0.0005, + "step": 121530 + }, + { + "epoch": 1.988709809375767, + "grad_norm": 0.05505533888936043, + "learning_rate": 9.70696000811011e-10, + "loss": 0.0008, + "step": 121540 + }, + { + "epoch": 1.988873435326843, + "grad_norm": 0.04216821864247322, + "learning_rate": 9.427646475618225e-10, + "loss": 0.0005, + "step": 121550 + }, + { + "epoch": 1.9890370612779187, + "grad_norm": 0.03344396501779556, + "learning_rate": 9.152410004120704e-10, + "loss": 0.001, + "step": 121560 + }, + { + "epoch": 1.9892006872289945, + "grad_norm": 0.032580576837062836, + "learning_rate": 8.881250616066261e-10, + "loss": 0.0015, + "step": 121570 + }, + { + "epoch": 1.9893643131800705, + "grad_norm": 0.11990205198526382, + "learning_rate": 8.61416833357609e-10, + "loss": 0.0007, + "step": 121580 + }, + { + "epoch": 1.9895279391311462, + "grad_norm": 0.04535841569304466, + "learning_rate": 8.351163178427213e-10, + "loss": 0.0007, + "step": 121590 + }, + { + "epoch": 1.989691565082222, + "grad_norm": 0.04809006676077843, + "learning_rate": 8.09223517206914e-10, + "loss": 0.001, + "step": 121600 + }, + { + "epoch": 1.989855191033298, + "grad_norm": 0.02720620110630989, + "learning_rate": 7.837384335623865e-10, + "loss": 0.0009, + "step": 121610 + }, + { + "epoch": 1.9900188169843738, + "grad_norm": 0.041904766112565994, + "learning_rate": 7.586610689874763e-10, + "loss": 0.0008, + "step": 121620 + }, + { + "epoch": 1.9901824429354495, + "grad_norm": 0.025538332760334015, + "learning_rate": 7.33991425527214e-10, + "loss": 0.0013, + "step": 121630 + }, + { + "epoch": 1.9903460688865255, + "grad_norm": 0.0030932335648685694, + "learning_rate": 7.09729505193324e-10, + "loss": 0.0013, + "step": 121640 + }, + { + "epoch": 1.990509694837601, + "grad_norm": 0.019069423899054527, + "learning_rate": 6.858753099653337e-10, + "loss": 0.0006, + "step": 121650 + }, + { + "epoch": 1.990673320788677, + "grad_norm": 0.061187829822301865, + "learning_rate": 6.624288417877989e-10, + "loss": 0.001, + "step": 121660 + }, + { + "epoch": 1.990836946739753, + "grad_norm": 0.033669210970401764, + "learning_rate": 6.393901025736337e-10, + "loss": 0.0005, + "step": 121670 + }, + { + "epoch": 1.9910005726908286, + "grad_norm": 0.02763231098651886, + "learning_rate": 6.167590942013358e-10, + "loss": 0.0009, + "step": 121680 + }, + { + "epoch": 1.9911641986419046, + "grad_norm": 0.12069132924079895, + "learning_rate": 5.945358185172056e-10, + "loss": 0.0013, + "step": 121690 + }, + { + "epoch": 1.9913278245929804, + "grad_norm": 0.024630241096019745, + "learning_rate": 5.727202773325724e-10, + "loss": 0.0005, + "step": 121700 + }, + { + "epoch": 1.9914914505440562, + "grad_norm": 0.028935732319951057, + "learning_rate": 5.513124724276786e-10, + "loss": 0.0006, + "step": 121710 + }, + { + "epoch": 1.9916550764951322, + "grad_norm": 0.02910882607102394, + "learning_rate": 5.3031240554835e-10, + "loss": 0.001, + "step": 121720 + }, + { + "epoch": 1.991818702446208, + "grad_norm": 0.04302552714943886, + "learning_rate": 5.097200784071054e-10, + "loss": 0.0008, + "step": 121730 + }, + { + "epoch": 1.9919823283972837, + "grad_norm": 0.028214646503329277, + "learning_rate": 4.895354926831575e-10, + "loss": 0.0006, + "step": 121740 + }, + { + "epoch": 1.9921459543483597, + "grad_norm": 0.051813915371894836, + "learning_rate": 4.697586500229667e-10, + "loss": 0.0015, + "step": 121750 + }, + { + "epoch": 1.9923095802994355, + "grad_norm": 0.050833575427532196, + "learning_rate": 4.5038955203913214e-10, + "loss": 0.0007, + "step": 121760 + }, + { + "epoch": 1.9924732062505113, + "grad_norm": 0.031718213111162186, + "learning_rate": 4.3142820031205624e-10, + "loss": 0.0009, + "step": 121770 + }, + { + "epoch": 1.9926368322015873, + "grad_norm": 0.05440567433834076, + "learning_rate": 4.128745963871694e-10, + "loss": 0.0011, + "step": 121780 + }, + { + "epoch": 1.992800458152663, + "grad_norm": 0.006071154028177261, + "learning_rate": 3.9472874177881594e-10, + "loss": 0.0009, + "step": 121790 + }, + { + "epoch": 1.9929640841037388, + "grad_norm": 0.0027376804500818253, + "learning_rate": 3.769906379658128e-10, + "loss": 0.0005, + "step": 121800 + }, + { + "epoch": 1.9931277100548148, + "grad_norm": 0.00888826884329319, + "learning_rate": 3.5966028639533577e-10, + "loss": 0.001, + "step": 121810 + }, + { + "epoch": 1.9932913360058904, + "grad_norm": 0.02660565823316574, + "learning_rate": 3.4273768848069875e-10, + "loss": 0.0007, + "step": 121820 + }, + { + "epoch": 1.9934549619569664, + "grad_norm": 0.06732051074504852, + "learning_rate": 3.26222845601909e-10, + "loss": 0.0007, + "step": 121830 + }, + { + "epoch": 1.9936185879080424, + "grad_norm": 0.017321888357400894, + "learning_rate": 3.1011575910622204e-10, + "loss": 0.0004, + "step": 121840 + }, + { + "epoch": 1.993782213859118, + "grad_norm": 0.016387946903705597, + "learning_rate": 2.9441643030703184e-10, + "loss": 0.0016, + "step": 121850 + }, + { + "epoch": 1.993945839810194, + "grad_norm": 0.019673125818371773, + "learning_rate": 2.7912486048498054e-10, + "loss": 0.0007, + "step": 121860 + }, + { + "epoch": 1.9941094657612697, + "grad_norm": 0.06270117312669754, + "learning_rate": 2.642410508868487e-10, + "loss": 0.0009, + "step": 121870 + }, + { + "epoch": 1.9942730917123455, + "grad_norm": 0.03190852329134941, + "learning_rate": 2.4976500272611005e-10, + "loss": 0.0008, + "step": 121880 + }, + { + "epoch": 1.9944367176634215, + "grad_norm": 0.07426242530345917, + "learning_rate": 2.3569671718459694e-10, + "loss": 0.0012, + "step": 121890 + }, + { + "epoch": 1.9946003436144972, + "grad_norm": 0.06926099956035614, + "learning_rate": 2.2203619540861476e-10, + "loss": 0.001, + "step": 121900 + }, + { + "epoch": 1.994763969565573, + "grad_norm": 0.06824562698602676, + "learning_rate": 2.0878343851282733e-10, + "loss": 0.001, + "step": 121910 + }, + { + "epoch": 1.994927595516649, + "grad_norm": 0.03840915486216545, + "learning_rate": 1.9593844757803682e-10, + "loss": 0.001, + "step": 121920 + }, + { + "epoch": 1.9950912214677248, + "grad_norm": 0.06597849726676941, + "learning_rate": 1.835012236517386e-10, + "loss": 0.0009, + "step": 121930 + }, + { + "epoch": 1.9952548474188005, + "grad_norm": 0.08199784904718399, + "learning_rate": 1.7147176774812148e-10, + "loss": 0.0009, + "step": 121940 + }, + { + "epoch": 1.9954184733698765, + "grad_norm": 0.030702393501996994, + "learning_rate": 1.598500808480674e-10, + "loss": 0.0009, + "step": 121950 + }, + { + "epoch": 1.9955820993209523, + "grad_norm": 0.060015805065631866, + "learning_rate": 1.4863616390026202e-10, + "loss": 0.0012, + "step": 121960 + }, + { + "epoch": 1.995745725272028, + "grad_norm": 0.019695112481713295, + "learning_rate": 1.3783001781841886e-10, + "loss": 0.0004, + "step": 121970 + }, + { + "epoch": 1.995909351223104, + "grad_norm": 0.035306788980960846, + "learning_rate": 1.2743164348405502e-10, + "loss": 0.0006, + "step": 121980 + }, + { + "epoch": 1.9960729771741799, + "grad_norm": 0.02539607509970665, + "learning_rate": 1.1744104174538084e-10, + "loss": 0.0005, + "step": 121990 + }, + { + "epoch": 1.9962366031252556, + "grad_norm": 0.10506034642457962, + "learning_rate": 1.0785821341730007e-10, + "loss": 0.0008, + "step": 122000 + }, + { + "epoch": 1.9962366031252556, + "eval_loss": 0.0007702079601585865, + "eval_runtime": 5.3816, + "eval_samples_per_second": 37.164, + "eval_steps_per_second": 9.291, + "step": 122000 + }, + { + "epoch": 1.9964002290763316, + "grad_norm": 0.09506328403949738, + "learning_rate": 9.86831592814097e-11, + "loss": 0.0008, + "step": 122010 + }, + { + "epoch": 1.9965638550274072, + "grad_norm": 0.04165778309106827, + "learning_rate": 8.991588008544494e-11, + "loss": 0.0009, + "step": 122020 + }, + { + "epoch": 1.9967274809784832, + "grad_norm": 0.09607928991317749, + "learning_rate": 8.155637654494452e-11, + "loss": 0.0012, + "step": 122030 + }, + { + "epoch": 1.9968911069295592, + "grad_norm": 0.0512206107378006, + "learning_rate": 7.360464934103029e-11, + "loss": 0.0012, + "step": 122040 + }, + { + "epoch": 1.9970547328806347, + "grad_norm": 0.10398688912391663, + "learning_rate": 6.606069912318269e-11, + "loss": 0.0012, + "step": 122050 + }, + { + "epoch": 1.9972183588317107, + "grad_norm": 0.001678274478763342, + "learning_rate": 5.892452650591018e-11, + "loss": 0.0005, + "step": 122060 + }, + { + "epoch": 1.9973819847827865, + "grad_norm": 0.043181102722883224, + "learning_rate": 5.2196132072079854e-11, + "loss": 0.0018, + "step": 122070 + }, + { + "epoch": 1.9975456107338623, + "grad_norm": 0.013596700504422188, + "learning_rate": 4.587551636903165e-11, + "loss": 0.0011, + "step": 122080 + }, + { + "epoch": 1.9977092366849383, + "grad_norm": 0.024047480896115303, + "learning_rate": 3.9962679913574386e-11, + "loss": 0.0005, + "step": 122090 + }, + { + "epoch": 1.997872862636014, + "grad_norm": 0.1744498759508133, + "learning_rate": 3.4457623187544865e-11, + "loss": 0.0008, + "step": 122100 + }, + { + "epoch": 1.9980364885870898, + "grad_norm": 0.028417835012078285, + "learning_rate": 2.936034663947318e-11, + "loss": 0.0005, + "step": 122110 + }, + { + "epoch": 1.9982001145381658, + "grad_norm": 0.02932523936033249, + "learning_rate": 2.467085068513786e-11, + "loss": 0.0005, + "step": 122120 + }, + { + "epoch": 1.9983637404892416, + "grad_norm": 0.03043956682085991, + "learning_rate": 2.0389135708120954e-11, + "loss": 0.0008, + "step": 122130 + }, + { + "epoch": 1.9985273664403174, + "grad_norm": 0.015334525145590305, + "learning_rate": 1.6515202055922273e-11, + "loss": 0.0008, + "step": 122140 + }, + { + "epoch": 1.9986909923913934, + "grad_norm": 0.021789342164993286, + "learning_rate": 1.3049050045510492e-11, + "loss": 0.0008, + "step": 122150 + }, + { + "epoch": 1.9988546183424691, + "grad_norm": 0.008600656874477863, + "learning_rate": 9.990679959992478e-12, + "loss": 0.001, + "step": 122160 + }, + { + "epoch": 1.999018244293545, + "grad_norm": 0.01498556137084961, + "learning_rate": 7.340092047503078e-12, + "loss": 0.001, + "step": 122170 + }, + { + "epoch": 1.999181870244621, + "grad_norm": 0.029551725834608078, + "learning_rate": 5.097286525090894e-12, + "loss": 0.0016, + "step": 122180 + }, + { + "epoch": 1.9993454961956967, + "grad_norm": 0.036482520401477814, + "learning_rate": 3.2622635759427257e-12, + "loss": 0.0006, + "step": 122190 + }, + { + "epoch": 1.9995091221467725, + "grad_norm": 0.06275158375501633, + "learning_rate": 1.8350233488284573e-12, + "loss": 0.0007, + "step": 122200 + }, + { + "epoch": 1.9996727480978485, + "grad_norm": 0.028780221939086914, + "learning_rate": 8.155659603215072e-13, + "loss": 0.0007, + "step": 122210 + }, + { + "epoch": 1.999836374048924, + "grad_norm": 0.015836335718631744, + "learning_rate": 2.0389149424371314e-13, + "loss": 0.0003, + "step": 122220 + }, + { + "epoch": 2.0, + "grad_norm": 0.006197645328938961, + "learning_rate": 0.0, + "loss": 0.0004, + "step": 122230 + }, + { + "epoch": 2.0, + "step": 122230, + "total_flos": 1.424923653115478e+19, + "train_loss": 0.003896685916660938, + "train_runtime": 121618.4873, + "train_samples_per_second": 8.04, + "train_steps_per_second": 1.005 + } + ], + "logging_steps": 10, + "max_steps": 122230, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 62000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.424923653115478e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}