diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13893 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999747481124214, + "eval_steps": 500, + "global_step": 19800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00050503775157193, + "grad_norm": 13.943564139476521, + "learning_rate": 9.996464289322155e-06, + "loss": 1.7553, + "step": 10 + }, + { + "epoch": 0.00101007550314386, + "grad_norm": 5.789553200928907, + "learning_rate": 9.991413274068087e-06, + "loss": 0.8176, + "step": 20 + }, + { + "epoch": 0.00151511325471579, + "grad_norm": 7.735975598755883, + "learning_rate": 9.986362258814021e-06, + "loss": 0.7018, + "step": 30 + }, + { + "epoch": 0.00202015100628772, + "grad_norm": 3.5851607839184143, + "learning_rate": 9.981311243559957e-06, + "loss": 0.6599, + "step": 40 + }, + { + "epoch": 0.00252518875785965, + "grad_norm": 6.237435656827707, + "learning_rate": 9.976260228305891e-06, + "loss": 0.6038, + "step": 50 + }, + { + "epoch": 0.00303022650943158, + "grad_norm": 13.568356180772243, + "learning_rate": 9.971209213051824e-06, + "loss": 0.6027, + "step": 60 + }, + { + "epoch": 0.00353526426100351, + "grad_norm": 3.235105055773784, + "learning_rate": 9.966158197797758e-06, + "loss": 0.5893, + "step": 70 + }, + { + "epoch": 0.00404030201257544, + "grad_norm": 5.862991115468846, + "learning_rate": 9.961107182543692e-06, + "loss": 0.5863, + "step": 80 + }, + { + "epoch": 0.00454533976414737, + "grad_norm": 3.3977361363891334, + "learning_rate": 9.956056167289626e-06, + "loss": 0.5736, + "step": 90 + }, + { + "epoch": 0.0050503775157193, + "grad_norm": 4.281339414007124, + "learning_rate": 9.95100515203556e-06, + "loss": 0.5508, + "step": 100 + }, + { + "epoch": 0.00555541526729123, + "grad_norm": 3.144694292761742, + "learning_rate": 9.945954136781494e-06, + "loss": 0.5493, + "step": 110 + }, + { + "epoch": 0.00606045301886316, + "grad_norm": 3.0898861803222566, + "learning_rate": 9.940903121527427e-06, + "loss": 0.5417, + "step": 120 + }, + { + "epoch": 0.00656549077043509, + "grad_norm": 14.79188542424222, + "learning_rate": 9.935852106273361e-06, + "loss": 0.5444, + "step": 130 + }, + { + "epoch": 0.00707052852200702, + "grad_norm": 3.493197693686729, + "learning_rate": 9.930801091019295e-06, + "loss": 0.5284, + "step": 140 + }, + { + "epoch": 0.00757556627357895, + "grad_norm": 10.7485482232531, + "learning_rate": 9.92575007576523e-06, + "loss": 0.53, + "step": 150 + }, + { + "epoch": 0.00808060402515088, + "grad_norm": 4.095456564556807, + "learning_rate": 9.920699060511163e-06, + "loss": 0.5171, + "step": 160 + }, + { + "epoch": 0.00858564177672281, + "grad_norm": 18.07561627235401, + "learning_rate": 9.915648045257097e-06, + "loss": 0.5293, + "step": 170 + }, + { + "epoch": 0.00909067952829474, + "grad_norm": 14.881912561895916, + "learning_rate": 9.910597030003032e-06, + "loss": 0.525, + "step": 180 + }, + { + "epoch": 0.00959571727986667, + "grad_norm": 3.077548490509973, + "learning_rate": 9.905546014748966e-06, + "loss": 0.4952, + "step": 190 + }, + { + "epoch": 0.0101007550314386, + "grad_norm": 5.40203767482295, + "learning_rate": 9.9004949994949e-06, + "loss": 0.5172, + "step": 200 + }, + { + "epoch": 0.01060579278301053, + "grad_norm": 12.22425699537504, + "learning_rate": 9.895443984240834e-06, + "loss": 0.5358, + "step": 210 + }, + { + "epoch": 0.01111083053458246, + "grad_norm": 7.830646098334567, + "learning_rate": 9.890392968986768e-06, + "loss": 0.5102, + "step": 220 + }, + { + "epoch": 0.01161586828615439, + "grad_norm": 3.814940763160932, + "learning_rate": 9.8853419537327e-06, + "loss": 0.4948, + "step": 230 + }, + { + "epoch": 0.01212090603772632, + "grad_norm": 9.822466847144137, + "learning_rate": 9.880290938478635e-06, + "loss": 0.5217, + "step": 240 + }, + { + "epoch": 0.01262594378929825, + "grad_norm": 46.6756643508386, + "learning_rate": 9.875239923224569e-06, + "loss": 0.5024, + "step": 250 + }, + { + "epoch": 0.01313098154087018, + "grad_norm": 2.706287487769574, + "learning_rate": 9.870188907970503e-06, + "loss": 0.5161, + "step": 260 + }, + { + "epoch": 0.01363601929244211, + "grad_norm": 2.211316891350979, + "learning_rate": 9.865137892716437e-06, + "loss": 0.5112, + "step": 270 + }, + { + "epoch": 0.01414105704401404, + "grad_norm": 2.9091819978240663, + "learning_rate": 9.860086877462371e-06, + "loss": 0.5166, + "step": 280 + }, + { + "epoch": 0.01464609479558597, + "grad_norm": 3.745541781549167, + "learning_rate": 9.855035862208304e-06, + "loss": 0.5051, + "step": 290 + }, + { + "epoch": 0.0151511325471579, + "grad_norm": 2.8340353132271394, + "learning_rate": 9.849984846954238e-06, + "loss": 0.5075, + "step": 300 + }, + { + "epoch": 0.01565617029872983, + "grad_norm": 3.869953185267734, + "learning_rate": 9.844933831700174e-06, + "loss": 0.5216, + "step": 310 + }, + { + "epoch": 0.01616120805030176, + "grad_norm": 2.211016455052966, + "learning_rate": 9.839882816446108e-06, + "loss": 0.4973, + "step": 320 + }, + { + "epoch": 0.01666624580187369, + "grad_norm": 3.3086106494157606, + "learning_rate": 9.83483180119204e-06, + "loss": 0.5023, + "step": 330 + }, + { + "epoch": 0.01717128355344562, + "grad_norm": 3.680801950407282, + "learning_rate": 9.829780785937974e-06, + "loss": 0.4933, + "step": 340 + }, + { + "epoch": 0.01767632130501755, + "grad_norm": 2.7546156381855935, + "learning_rate": 9.824729770683908e-06, + "loss": 0.5176, + "step": 350 + }, + { + "epoch": 0.01818135905658948, + "grad_norm": 5.87533624997546, + "learning_rate": 9.819678755429842e-06, + "loss": 0.5014, + "step": 360 + }, + { + "epoch": 0.01868639680816141, + "grad_norm": 5.806811873538911, + "learning_rate": 9.814627740175777e-06, + "loss": 0.5103, + "step": 370 + }, + { + "epoch": 0.01919143455973334, + "grad_norm": 2.3849830922620137, + "learning_rate": 9.80957672492171e-06, + "loss": 0.5133, + "step": 380 + }, + { + "epoch": 0.01969647231130527, + "grad_norm": 7.2668450331901715, + "learning_rate": 9.804525709667643e-06, + "loss": 0.5049, + "step": 390 + }, + { + "epoch": 0.0202015100628772, + "grad_norm": 2.5520502646771264, + "learning_rate": 9.799474694413577e-06, + "loss": 0.5102, + "step": 400 + }, + { + "epoch": 0.02070654781444913, + "grad_norm": 2.473700390480294, + "learning_rate": 9.794423679159511e-06, + "loss": 0.4825, + "step": 410 + }, + { + "epoch": 0.02121158556602106, + "grad_norm": 3.648682107473895, + "learning_rate": 9.789372663905445e-06, + "loss": 0.4983, + "step": 420 + }, + { + "epoch": 0.02171662331759299, + "grad_norm": 3.458879333685983, + "learning_rate": 9.78432164865138e-06, + "loss": 0.4912, + "step": 430 + }, + { + "epoch": 0.02222166106916492, + "grad_norm": 3.0729192822937503, + "learning_rate": 9.779270633397314e-06, + "loss": 0.4905, + "step": 440 + }, + { + "epoch": 0.02272669882073685, + "grad_norm": 17.505451267102025, + "learning_rate": 9.774219618143248e-06, + "loss": 0.4988, + "step": 450 + }, + { + "epoch": 0.02323173657230878, + "grad_norm": 11.347644323036132, + "learning_rate": 9.769168602889182e-06, + "loss": 0.527, + "step": 460 + }, + { + "epoch": 0.02373677432388071, + "grad_norm": 3.745152344082931, + "learning_rate": 9.764117587635116e-06, + "loss": 0.4751, + "step": 470 + }, + { + "epoch": 0.02424181207545264, + "grad_norm": 3.430436888183069, + "learning_rate": 9.75906657238105e-06, + "loss": 0.4925, + "step": 480 + }, + { + "epoch": 0.02474684982702457, + "grad_norm": 2.5106033133425316, + "learning_rate": 9.754015557126983e-06, + "loss": 0.4909, + "step": 490 + }, + { + "epoch": 0.0252518875785965, + "grad_norm": 2.9444873201490345, + "learning_rate": 9.748964541872917e-06, + "loss": 0.5046, + "step": 500 + }, + { + "epoch": 0.02575692533016843, + "grad_norm": 2.2226598370186355, + "learning_rate": 9.743913526618851e-06, + "loss": 0.5019, + "step": 510 + }, + { + "epoch": 0.02626196308174036, + "grad_norm": 2.1431364606222143, + "learning_rate": 9.738862511364785e-06, + "loss": 0.4978, + "step": 520 + }, + { + "epoch": 0.02676700083331229, + "grad_norm": 1.9687420001979, + "learning_rate": 9.733811496110719e-06, + "loss": 0.5168, + "step": 530 + }, + { + "epoch": 0.02727203858488422, + "grad_norm": 1.790704686386335, + "learning_rate": 9.728760480856653e-06, + "loss": 0.4918, + "step": 540 + }, + { + "epoch": 0.02777707633645615, + "grad_norm": 1.5122456879041994, + "learning_rate": 9.723709465602587e-06, + "loss": 0.5015, + "step": 550 + }, + { + "epoch": 0.02828211408802808, + "grad_norm": 2.155745175549288, + "learning_rate": 9.71865845034852e-06, + "loss": 0.4958, + "step": 560 + }, + { + "epoch": 0.02878715183960001, + "grad_norm": 2.2999303399374753, + "learning_rate": 9.713607435094454e-06, + "loss": 0.494, + "step": 570 + }, + { + "epoch": 0.02929218959117194, + "grad_norm": 1.9957939818387918, + "learning_rate": 9.70855641984039e-06, + "loss": 0.4772, + "step": 580 + }, + { + "epoch": 0.02979722734274387, + "grad_norm": 2.7790545722653137, + "learning_rate": 9.703505404586324e-06, + "loss": 0.4958, + "step": 590 + }, + { + "epoch": 0.0303022650943158, + "grad_norm": 1.8902194621075885, + "learning_rate": 9.698454389332256e-06, + "loss": 0.4811, + "step": 600 + }, + { + "epoch": 0.03080730284588773, + "grad_norm": 2.330090449709308, + "learning_rate": 9.69340337407819e-06, + "loss": 0.4955, + "step": 610 + }, + { + "epoch": 0.03131234059745966, + "grad_norm": 2.63971178528007, + "learning_rate": 9.688352358824125e-06, + "loss": 0.481, + "step": 620 + }, + { + "epoch": 0.03181737834903159, + "grad_norm": 3.9470995700087093, + "learning_rate": 9.683301343570059e-06, + "loss": 0.4911, + "step": 630 + }, + { + "epoch": 0.03232241610060352, + "grad_norm": 3.705578063273332, + "learning_rate": 9.678250328315993e-06, + "loss": 0.4707, + "step": 640 + }, + { + "epoch": 0.03282745385217545, + "grad_norm": 2.583661159495941, + "learning_rate": 9.673199313061927e-06, + "loss": 0.4599, + "step": 650 + }, + { + "epoch": 0.03333249160374738, + "grad_norm": 2.0174541627393743, + "learning_rate": 9.66814829780786e-06, + "loss": 0.4941, + "step": 660 + }, + { + "epoch": 0.03383752935531931, + "grad_norm": 4.576484208609804, + "learning_rate": 9.663097282553793e-06, + "loss": 0.4839, + "step": 670 + }, + { + "epoch": 0.03434256710689124, + "grad_norm": 5.516602752547856, + "learning_rate": 9.658046267299728e-06, + "loss": 0.4652, + "step": 680 + }, + { + "epoch": 0.03484760485846317, + "grad_norm": 1.9835012286897338, + "learning_rate": 9.652995252045662e-06, + "loss": 0.4573, + "step": 690 + }, + { + "epoch": 0.0353526426100351, + "grad_norm": 2.3068456370975596, + "learning_rate": 9.647944236791596e-06, + "loss": 0.4939, + "step": 700 + }, + { + "epoch": 0.03585768036160703, + "grad_norm": 2.504718982689086, + "learning_rate": 9.64289322153753e-06, + "loss": 0.479, + "step": 710 + }, + { + "epoch": 0.03636271811317896, + "grad_norm": 3.0659241109345885, + "learning_rate": 9.637842206283464e-06, + "loss": 0.4668, + "step": 720 + }, + { + "epoch": 0.03686775586475089, + "grad_norm": 5.822499608800306, + "learning_rate": 9.632791191029398e-06, + "loss": 0.4776, + "step": 730 + }, + { + "epoch": 0.03737279361632282, + "grad_norm": 2.282902491093137, + "learning_rate": 9.627740175775332e-06, + "loss": 0.4809, + "step": 740 + }, + { + "epoch": 0.03787783136789475, + "grad_norm": 3.6160296320382623, + "learning_rate": 9.622689160521266e-06, + "loss": 0.4772, + "step": 750 + }, + { + "epoch": 0.03838286911946668, + "grad_norm": 2.168648546785388, + "learning_rate": 9.617638145267199e-06, + "loss": 0.4727, + "step": 760 + }, + { + "epoch": 0.03888790687103861, + "grad_norm": 2.3243624027787746, + "learning_rate": 9.612587130013133e-06, + "loss": 0.4725, + "step": 770 + }, + { + "epoch": 0.03939294462261054, + "grad_norm": 2.302262458969543, + "learning_rate": 9.607536114759067e-06, + "loss": 0.4866, + "step": 780 + }, + { + "epoch": 0.03989798237418247, + "grad_norm": 8.258585402548862, + "learning_rate": 9.602485099505001e-06, + "loss": 0.4732, + "step": 790 + }, + { + "epoch": 0.0404030201257544, + "grad_norm": 2.1689541135562007, + "learning_rate": 9.597434084250935e-06, + "loss": 0.4761, + "step": 800 + }, + { + "epoch": 0.04090805787732633, + "grad_norm": 2.087988553230154, + "learning_rate": 9.59238306899687e-06, + "loss": 0.4755, + "step": 810 + }, + { + "epoch": 0.04141309562889826, + "grad_norm": 2.2958375828670965, + "learning_rate": 9.587332053742802e-06, + "loss": 0.4688, + "step": 820 + }, + { + "epoch": 0.04191813338047019, + "grad_norm": 2.3117097400641446, + "learning_rate": 9.582281038488736e-06, + "loss": 0.4635, + "step": 830 + }, + { + "epoch": 0.04242317113204212, + "grad_norm": 8.93438709495605, + "learning_rate": 9.57723002323467e-06, + "loss": 0.4615, + "step": 840 + }, + { + "epoch": 0.04292820888361405, + "grad_norm": 2.1041537246773134, + "learning_rate": 9.572179007980606e-06, + "loss": 0.493, + "step": 850 + }, + { + "epoch": 0.04343324663518598, + "grad_norm": 1.73155627574332, + "learning_rate": 9.567127992726538e-06, + "loss": 0.4531, + "step": 860 + }, + { + "epoch": 0.04393828438675791, + "grad_norm": 1.9302062705761438, + "learning_rate": 9.562076977472473e-06, + "loss": 0.4644, + "step": 870 + }, + { + "epoch": 0.04444332213832984, + "grad_norm": 1.9821994690328757, + "learning_rate": 9.557025962218407e-06, + "loss": 0.4418, + "step": 880 + }, + { + "epoch": 0.04494835988990177, + "grad_norm": 2.5134313739910716, + "learning_rate": 9.55197494696434e-06, + "loss": 0.4556, + "step": 890 + }, + { + "epoch": 0.0454533976414737, + "grad_norm": 2.448160093260678, + "learning_rate": 9.546923931710275e-06, + "loss": 0.4867, + "step": 900 + }, + { + "epoch": 0.04595843539304563, + "grad_norm": 3.6293868003589456, + "learning_rate": 9.541872916456209e-06, + "loss": 0.4575, + "step": 910 + }, + { + "epoch": 0.04646347314461756, + "grad_norm": 5.740958333853467, + "learning_rate": 9.536821901202143e-06, + "loss": 0.4602, + "step": 920 + }, + { + "epoch": 0.04696851089618949, + "grad_norm": 2.6003779275117282, + "learning_rate": 9.531770885948076e-06, + "loss": 0.4631, + "step": 930 + }, + { + "epoch": 0.04747354864776142, + "grad_norm": 2.412862217111322, + "learning_rate": 9.52671987069401e-06, + "loss": 0.4765, + "step": 940 + }, + { + "epoch": 0.04797858639933335, + "grad_norm": 6.6681363062446035, + "learning_rate": 9.521668855439944e-06, + "loss": 0.4658, + "step": 950 + }, + { + "epoch": 0.04848362415090528, + "grad_norm": 5.764214860916012, + "learning_rate": 9.516617840185878e-06, + "loss": 0.4615, + "step": 960 + }, + { + "epoch": 0.04898866190247721, + "grad_norm": 2.464119785527778, + "learning_rate": 9.511566824931812e-06, + "loss": 0.4496, + "step": 970 + }, + { + "epoch": 0.04949369965404914, + "grad_norm": 6.238087195845154, + "learning_rate": 9.506515809677746e-06, + "loss": 0.4431, + "step": 980 + }, + { + "epoch": 0.04999873740562107, + "grad_norm": 5.932767508185511, + "learning_rate": 9.50146479442368e-06, + "loss": 0.4574, + "step": 990 + }, + { + "epoch": 0.050503775157193, + "grad_norm": 9.775572888274276, + "learning_rate": 9.496413779169614e-06, + "loss": 0.4596, + "step": 1000 + }, + { + "epoch": 0.05100881290876493, + "grad_norm": 6.307739027249706, + "learning_rate": 9.491362763915549e-06, + "loss": 0.4515, + "step": 1010 + }, + { + "epoch": 0.05151385066033686, + "grad_norm": 3.187729331697054, + "learning_rate": 9.486311748661483e-06, + "loss": 0.4713, + "step": 1020 + }, + { + "epoch": 0.05201888841190879, + "grad_norm": 5.592090620849129, + "learning_rate": 9.481260733407415e-06, + "loss": 0.454, + "step": 1030 + }, + { + "epoch": 0.05252392616348072, + "grad_norm": 5.281749256921326, + "learning_rate": 9.47620971815335e-06, + "loss": 0.4513, + "step": 1040 + }, + { + "epoch": 0.05302896391505265, + "grad_norm": 3.415542870576186, + "learning_rate": 9.471158702899283e-06, + "loss": 0.4584, + "step": 1050 + }, + { + "epoch": 0.05353400166662458, + "grad_norm": 3.4909280444023034, + "learning_rate": 9.466107687645218e-06, + "loss": 0.4491, + "step": 1060 + }, + { + "epoch": 0.05403903941819651, + "grad_norm": 4.87148376874038, + "learning_rate": 9.461056672391152e-06, + "loss": 0.4672, + "step": 1070 + }, + { + "epoch": 0.05454407716976844, + "grad_norm": 5.691263676353072, + "learning_rate": 9.456005657137086e-06, + "loss": 0.4398, + "step": 1080 + }, + { + "epoch": 0.05504911492134037, + "grad_norm": 7.086742552442662, + "learning_rate": 9.450954641883018e-06, + "loss": 0.4657, + "step": 1090 + }, + { + "epoch": 0.0555541526729123, + "grad_norm": 3.941589115392049, + "learning_rate": 9.445903626628952e-06, + "loss": 0.4559, + "step": 1100 + }, + { + "epoch": 0.05605919042448423, + "grad_norm": 11.794328327356654, + "learning_rate": 9.440852611374886e-06, + "loss": 0.4436, + "step": 1110 + }, + { + "epoch": 0.05656422817605616, + "grad_norm": 9.586300666497886, + "learning_rate": 9.435801596120822e-06, + "loss": 0.4581, + "step": 1120 + }, + { + "epoch": 0.05706926592762809, + "grad_norm": 3.5060354246476484, + "learning_rate": 9.430750580866755e-06, + "loss": 0.4414, + "step": 1130 + }, + { + "epoch": 0.05757430367920002, + "grad_norm": 2.9623447600620136, + "learning_rate": 9.425699565612689e-06, + "loss": 0.4475, + "step": 1140 + }, + { + "epoch": 0.05807934143077195, + "grad_norm": 10.925832762761535, + "learning_rate": 9.420648550358623e-06, + "loss": 0.4454, + "step": 1150 + }, + { + "epoch": 0.05858437918234388, + "grad_norm": 3.275733802006869, + "learning_rate": 9.415597535104557e-06, + "loss": 0.4349, + "step": 1160 + }, + { + "epoch": 0.05908941693391581, + "grad_norm": 4.379989620996673, + "learning_rate": 9.410546519850491e-06, + "loss": 0.455, + "step": 1170 + }, + { + "epoch": 0.05959445468548774, + "grad_norm": 2.1574051089026893, + "learning_rate": 9.405495504596425e-06, + "loss": 0.4551, + "step": 1180 + }, + { + "epoch": 0.06009949243705967, + "grad_norm": 2.5566027220209344, + "learning_rate": 9.400444489342358e-06, + "loss": 0.4496, + "step": 1190 + }, + { + "epoch": 0.0606045301886316, + "grad_norm": 3.3866623066830153, + "learning_rate": 9.395393474088292e-06, + "loss": 0.4705, + "step": 1200 + }, + { + "epoch": 0.06110956794020353, + "grad_norm": 3.166133789377687, + "learning_rate": 9.390342458834226e-06, + "loss": 0.4556, + "step": 1210 + }, + { + "epoch": 0.06161460569177546, + "grad_norm": 3.91861184581623, + "learning_rate": 9.38529144358016e-06, + "loss": 0.4673, + "step": 1220 + }, + { + "epoch": 0.06211964344334739, + "grad_norm": 3.380841845875404, + "learning_rate": 9.380240428326094e-06, + "loss": 0.4551, + "step": 1230 + }, + { + "epoch": 0.06262468119491932, + "grad_norm": 8.214890466142487, + "learning_rate": 9.375189413072028e-06, + "loss": 0.4436, + "step": 1240 + }, + { + "epoch": 0.06312971894649125, + "grad_norm": 3.5096622602562904, + "learning_rate": 9.37013839781796e-06, + "loss": 0.4455, + "step": 1250 + }, + { + "epoch": 0.06363475669806318, + "grad_norm": 3.773235111611304, + "learning_rate": 9.365087382563897e-06, + "loss": 0.4622, + "step": 1260 + }, + { + "epoch": 0.0641397944496351, + "grad_norm": 6.295753580294585, + "learning_rate": 9.36003636730983e-06, + "loss": 0.4602, + "step": 1270 + }, + { + "epoch": 0.06464483220120704, + "grad_norm": 2.725162214074834, + "learning_rate": 9.354985352055765e-06, + "loss": 0.4597, + "step": 1280 + }, + { + "epoch": 0.06514986995277897, + "grad_norm": 2.52312004043461, + "learning_rate": 9.349934336801699e-06, + "loss": 0.4617, + "step": 1290 + }, + { + "epoch": 0.0656549077043509, + "grad_norm": 2.789794392473564, + "learning_rate": 9.344883321547631e-06, + "loss": 0.4505, + "step": 1300 + }, + { + "epoch": 0.06615994545592283, + "grad_norm": 3.952687566204272, + "learning_rate": 9.339832306293566e-06, + "loss": 0.4488, + "step": 1310 + }, + { + "epoch": 0.06666498320749475, + "grad_norm": 2.4979475288074284, + "learning_rate": 9.3347812910395e-06, + "loss": 0.436, + "step": 1320 + }, + { + "epoch": 0.06717002095906668, + "grad_norm": 3.3362793627660685, + "learning_rate": 9.329730275785434e-06, + "loss": 0.4739, + "step": 1330 + }, + { + "epoch": 0.06767505871063861, + "grad_norm": 2.4147254145567834, + "learning_rate": 9.324679260531368e-06, + "loss": 0.4627, + "step": 1340 + }, + { + "epoch": 0.06818009646221056, + "grad_norm": 5.672214495427731, + "learning_rate": 9.319628245277302e-06, + "loss": 0.451, + "step": 1350 + }, + { + "epoch": 0.06868513421378249, + "grad_norm": 1.8726502280944801, + "learning_rate": 9.314577230023234e-06, + "loss": 0.4417, + "step": 1360 + }, + { + "epoch": 0.06919017196535442, + "grad_norm": 3.973145301581678, + "learning_rate": 9.309526214769169e-06, + "loss": 0.4253, + "step": 1370 + }, + { + "epoch": 0.06969520971692635, + "grad_norm": 5.073872511256849, + "learning_rate": 9.304475199515103e-06, + "loss": 0.4408, + "step": 1380 + }, + { + "epoch": 0.07020024746849828, + "grad_norm": 7.525570163399166, + "learning_rate": 9.299424184261039e-06, + "loss": 0.4307, + "step": 1390 + }, + { + "epoch": 0.0707052852200702, + "grad_norm": 2.781161378212436, + "learning_rate": 9.294373169006971e-06, + "loss": 0.4536, + "step": 1400 + }, + { + "epoch": 0.07121032297164213, + "grad_norm": 2.090988491048501, + "learning_rate": 9.289322153752905e-06, + "loss": 0.4583, + "step": 1410 + }, + { + "epoch": 0.07171536072321406, + "grad_norm": 2.4437267649640715, + "learning_rate": 9.28427113849884e-06, + "loss": 0.4579, + "step": 1420 + }, + { + "epoch": 0.072220398474786, + "grad_norm": 2.2076648371457033, + "learning_rate": 9.279220123244773e-06, + "loss": 0.4343, + "step": 1430 + }, + { + "epoch": 0.07272543622635792, + "grad_norm": 1.8617491045028525, + "learning_rate": 9.274169107990707e-06, + "loss": 0.4509, + "step": 1440 + }, + { + "epoch": 0.07323047397792985, + "grad_norm": 2.222879920477197, + "learning_rate": 9.269118092736642e-06, + "loss": 0.4577, + "step": 1450 + }, + { + "epoch": 0.07373551172950178, + "grad_norm": 2.5321185321413626, + "learning_rate": 9.264067077482574e-06, + "loss": 0.4391, + "step": 1460 + }, + { + "epoch": 0.07424054948107371, + "grad_norm": 1.8760025526162245, + "learning_rate": 9.259016062228508e-06, + "loss": 0.474, + "step": 1470 + }, + { + "epoch": 0.07474558723264564, + "grad_norm": 4.085966137442041, + "learning_rate": 9.253965046974442e-06, + "loss": 0.4444, + "step": 1480 + }, + { + "epoch": 0.07525062498421757, + "grad_norm": 3.6849288920231547, + "learning_rate": 9.248914031720376e-06, + "loss": 0.4631, + "step": 1490 + }, + { + "epoch": 0.0757556627357895, + "grad_norm": 2.0812547682300395, + "learning_rate": 9.24386301646631e-06, + "loss": 0.4429, + "step": 1500 + }, + { + "epoch": 0.07626070048736143, + "grad_norm": 2.1885414591057923, + "learning_rate": 9.238812001212245e-06, + "loss": 0.4453, + "step": 1510 + }, + { + "epoch": 0.07676573823893336, + "grad_norm": 2.3972513453985234, + "learning_rate": 9.233760985958177e-06, + "loss": 0.4541, + "step": 1520 + }, + { + "epoch": 0.07727077599050529, + "grad_norm": 1.715103982949988, + "learning_rate": 9.228709970704113e-06, + "loss": 0.4218, + "step": 1530 + }, + { + "epoch": 0.07777581374207722, + "grad_norm": 2.825431193796437, + "learning_rate": 9.223658955450047e-06, + "loss": 0.4552, + "step": 1540 + }, + { + "epoch": 0.07828085149364915, + "grad_norm": 4.126140659488343, + "learning_rate": 9.218607940195981e-06, + "loss": 0.4379, + "step": 1550 + }, + { + "epoch": 0.07878588924522108, + "grad_norm": 4.979196187347629, + "learning_rate": 9.213556924941914e-06, + "loss": 0.4649, + "step": 1560 + }, + { + "epoch": 0.079290926996793, + "grad_norm": 5.8119930612941735, + "learning_rate": 9.208505909687848e-06, + "loss": 0.4474, + "step": 1570 + }, + { + "epoch": 0.07979596474836494, + "grad_norm": 2.5598139471551864, + "learning_rate": 9.203454894433782e-06, + "loss": 0.4645, + "step": 1580 + }, + { + "epoch": 0.08030100249993687, + "grad_norm": 2.9414494655541112, + "learning_rate": 9.198403879179716e-06, + "loss": 0.4431, + "step": 1590 + }, + { + "epoch": 0.0808060402515088, + "grad_norm": 2.518508877017779, + "learning_rate": 9.19335286392565e-06, + "loss": 0.4513, + "step": 1600 + }, + { + "epoch": 0.08131107800308073, + "grad_norm": 4.2677352209415425, + "learning_rate": 9.188301848671584e-06, + "loss": 0.4362, + "step": 1610 + }, + { + "epoch": 0.08181611575465265, + "grad_norm": 2.376190662511493, + "learning_rate": 9.183250833417518e-06, + "loss": 0.4419, + "step": 1620 + }, + { + "epoch": 0.08232115350622458, + "grad_norm": 2.311824528121457, + "learning_rate": 9.17819981816345e-06, + "loss": 0.4543, + "step": 1630 + }, + { + "epoch": 0.08282619125779651, + "grad_norm": 2.8591324038418207, + "learning_rate": 9.173148802909385e-06, + "loss": 0.4449, + "step": 1640 + }, + { + "epoch": 0.08333122900936846, + "grad_norm": 2.0096705761911737, + "learning_rate": 9.168097787655319e-06, + "loss": 0.4413, + "step": 1650 + }, + { + "epoch": 0.08383626676094039, + "grad_norm": 2.536437573585815, + "learning_rate": 9.163046772401255e-06, + "loss": 0.4527, + "step": 1660 + }, + { + "epoch": 0.08434130451251232, + "grad_norm": 1.974773461430328, + "learning_rate": 9.157995757147187e-06, + "loss": 0.4471, + "step": 1670 + }, + { + "epoch": 0.08484634226408425, + "grad_norm": 3.0369504080501755, + "learning_rate": 9.152944741893121e-06, + "loss": 0.4523, + "step": 1680 + }, + { + "epoch": 0.08535138001565618, + "grad_norm": 2.3986445472475455, + "learning_rate": 9.147893726639055e-06, + "loss": 0.4367, + "step": 1690 + }, + { + "epoch": 0.0858564177672281, + "grad_norm": 2.808593335583457, + "learning_rate": 9.14284271138499e-06, + "loss": 0.4465, + "step": 1700 + }, + { + "epoch": 0.08636145551880003, + "grad_norm": 2.516839276650598, + "learning_rate": 9.137791696130924e-06, + "loss": 0.4492, + "step": 1710 + }, + { + "epoch": 0.08686649327037196, + "grad_norm": 2.458679996734928, + "learning_rate": 9.132740680876858e-06, + "loss": 0.4387, + "step": 1720 + }, + { + "epoch": 0.0873715310219439, + "grad_norm": 2.691340501537917, + "learning_rate": 9.12768966562279e-06, + "loss": 0.4591, + "step": 1730 + }, + { + "epoch": 0.08787656877351582, + "grad_norm": 2.2677007816655244, + "learning_rate": 9.122638650368724e-06, + "loss": 0.4463, + "step": 1740 + }, + { + "epoch": 0.08838160652508775, + "grad_norm": 2.0671070241912797, + "learning_rate": 9.117587635114659e-06, + "loss": 0.4307, + "step": 1750 + }, + { + "epoch": 0.08888664427665968, + "grad_norm": 2.7605610064361596, + "learning_rate": 9.112536619860593e-06, + "loss": 0.4391, + "step": 1760 + }, + { + "epoch": 0.08939168202823161, + "grad_norm": 2.437909002135904, + "learning_rate": 9.107485604606527e-06, + "loss": 0.4526, + "step": 1770 + }, + { + "epoch": 0.08989671977980354, + "grad_norm": 2.99196689672201, + "learning_rate": 9.102434589352461e-06, + "loss": 0.4593, + "step": 1780 + }, + { + "epoch": 0.09040175753137547, + "grad_norm": 1.8141907580964987, + "learning_rate": 9.097383574098393e-06, + "loss": 0.4611, + "step": 1790 + }, + { + "epoch": 0.0909067952829474, + "grad_norm": 2.018729489961064, + "learning_rate": 9.092332558844329e-06, + "loss": 0.4594, + "step": 1800 + }, + { + "epoch": 0.09141183303451933, + "grad_norm": 1.9703582862843025, + "learning_rate": 9.087281543590263e-06, + "loss": 0.4382, + "step": 1810 + }, + { + "epoch": 0.09191687078609126, + "grad_norm": 1.6921508201701456, + "learning_rate": 9.082230528336197e-06, + "loss": 0.4561, + "step": 1820 + }, + { + "epoch": 0.09242190853766319, + "grad_norm": 3.029420346852276, + "learning_rate": 9.07717951308213e-06, + "loss": 0.4441, + "step": 1830 + }, + { + "epoch": 0.09292694628923512, + "grad_norm": 1.9103397185841393, + "learning_rate": 9.072128497828064e-06, + "loss": 0.4345, + "step": 1840 + }, + { + "epoch": 0.09343198404080705, + "grad_norm": 1.6003619276106154, + "learning_rate": 9.067077482573998e-06, + "loss": 0.454, + "step": 1850 + }, + { + "epoch": 0.09393702179237898, + "grad_norm": 2.039612541605303, + "learning_rate": 9.062026467319932e-06, + "loss": 0.4472, + "step": 1860 + }, + { + "epoch": 0.09444205954395091, + "grad_norm": 1.9475921005254484, + "learning_rate": 9.056975452065866e-06, + "loss": 0.4568, + "step": 1870 + }, + { + "epoch": 0.09494709729552284, + "grad_norm": 1.556252720211933, + "learning_rate": 9.0519244368118e-06, + "loss": 0.4462, + "step": 1880 + }, + { + "epoch": 0.09545213504709477, + "grad_norm": 2.041709870164745, + "learning_rate": 9.046873421557733e-06, + "loss": 0.4522, + "step": 1890 + }, + { + "epoch": 0.0959571727986667, + "grad_norm": 1.6042104819290137, + "learning_rate": 9.041822406303667e-06, + "loss": 0.4358, + "step": 1900 + }, + { + "epoch": 0.09646221055023863, + "grad_norm": 1.7827733522561613, + "learning_rate": 9.036771391049601e-06, + "loss": 0.4541, + "step": 1910 + }, + { + "epoch": 0.09696724830181055, + "grad_norm": 1.5335434779957613, + "learning_rate": 9.031720375795535e-06, + "loss": 0.462, + "step": 1920 + }, + { + "epoch": 0.09747228605338248, + "grad_norm": 1.5961467635458402, + "learning_rate": 9.02666936054147e-06, + "loss": 0.444, + "step": 1930 + }, + { + "epoch": 0.09797732380495441, + "grad_norm": 2.6325242433089637, + "learning_rate": 9.021618345287403e-06, + "loss": 0.4413, + "step": 1940 + }, + { + "epoch": 0.09848236155652636, + "grad_norm": 2.8583722906916056, + "learning_rate": 9.016567330033338e-06, + "loss": 0.4444, + "step": 1950 + }, + { + "epoch": 0.09898739930809829, + "grad_norm": 2.3144131776880763, + "learning_rate": 9.011516314779272e-06, + "loss": 0.4444, + "step": 1960 + }, + { + "epoch": 0.09949243705967022, + "grad_norm": 2.1581296348642938, + "learning_rate": 9.006465299525206e-06, + "loss": 0.4335, + "step": 1970 + }, + { + "epoch": 0.09999747481124215, + "grad_norm": 17.52409284022625, + "learning_rate": 9.00141428427114e-06, + "loss": 0.4315, + "step": 1980 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 4.993014635080338, + "learning_rate": 8.996363269017074e-06, + "loss": 0.4397, + "step": 1990 + }, + { + "epoch": 0.101007550314386, + "grad_norm": 2.877086759384337, + "learning_rate": 8.991312253763007e-06, + "loss": 0.4399, + "step": 2000 + }, + { + "epoch": 0.10151258806595793, + "grad_norm": 1.8400304810419659, + "learning_rate": 8.98626123850894e-06, + "loss": 0.4362, + "step": 2010 + }, + { + "epoch": 0.10201762581752986, + "grad_norm": 2.9024549851445993, + "learning_rate": 8.981210223254875e-06, + "loss": 0.4408, + "step": 2020 + }, + { + "epoch": 0.1025226635691018, + "grad_norm": 2.0461267797053067, + "learning_rate": 8.976159208000809e-06, + "loss": 0.432, + "step": 2030 + }, + { + "epoch": 0.10302770132067372, + "grad_norm": 3.503082468120821, + "learning_rate": 8.971108192746743e-06, + "loss": 0.4325, + "step": 2040 + }, + { + "epoch": 0.10353273907224565, + "grad_norm": 2.388506460672893, + "learning_rate": 8.966057177492677e-06, + "loss": 0.4351, + "step": 2050 + }, + { + "epoch": 0.10403777682381758, + "grad_norm": 2.1254282185067646, + "learning_rate": 8.96100616223861e-06, + "loss": 0.4471, + "step": 2060 + }, + { + "epoch": 0.10454281457538951, + "grad_norm": 7.57345126491693, + "learning_rate": 8.955955146984545e-06, + "loss": 0.4397, + "step": 2070 + }, + { + "epoch": 0.10504785232696144, + "grad_norm": 1.6413054443187884, + "learning_rate": 8.95090413173048e-06, + "loss": 0.4322, + "step": 2080 + }, + { + "epoch": 0.10555289007853337, + "grad_norm": 1.8503237766357117, + "learning_rate": 8.945853116476414e-06, + "loss": 0.4599, + "step": 2090 + }, + { + "epoch": 0.1060579278301053, + "grad_norm": 2.122766063633365, + "learning_rate": 8.940802101222346e-06, + "loss": 0.4297, + "step": 2100 + }, + { + "epoch": 0.10656296558167723, + "grad_norm": 1.859081726279874, + "learning_rate": 8.93575108596828e-06, + "loss": 0.4448, + "step": 2110 + }, + { + "epoch": 0.10706800333324916, + "grad_norm": 2.477456287176706, + "learning_rate": 8.930700070714214e-06, + "loss": 0.4358, + "step": 2120 + }, + { + "epoch": 0.10757304108482109, + "grad_norm": 1.642087049827079, + "learning_rate": 8.925649055460148e-06, + "loss": 0.4549, + "step": 2130 + }, + { + "epoch": 0.10807807883639302, + "grad_norm": 3.1351745453532818, + "learning_rate": 8.920598040206083e-06, + "loss": 0.4482, + "step": 2140 + }, + { + "epoch": 0.10858311658796495, + "grad_norm": 3.2350948921462805, + "learning_rate": 8.915547024952017e-06, + "loss": 0.4426, + "step": 2150 + }, + { + "epoch": 0.10908815433953688, + "grad_norm": 2.4276194333582986, + "learning_rate": 8.910496009697949e-06, + "loss": 0.4523, + "step": 2160 + }, + { + "epoch": 0.10959319209110881, + "grad_norm": 2.425429246229103, + "learning_rate": 8.905444994443883e-06, + "loss": 0.4381, + "step": 2170 + }, + { + "epoch": 0.11009822984268074, + "grad_norm": 2.8737600406248682, + "learning_rate": 8.900393979189817e-06, + "loss": 0.4355, + "step": 2180 + }, + { + "epoch": 0.11060326759425267, + "grad_norm": 1.868046294608527, + "learning_rate": 8.895342963935751e-06, + "loss": 0.4445, + "step": 2190 + }, + { + "epoch": 0.1111083053458246, + "grad_norm": 7.059764277040628, + "learning_rate": 8.890291948681686e-06, + "loss": 0.451, + "step": 2200 + }, + { + "epoch": 0.11161334309739653, + "grad_norm": 2.155018266737317, + "learning_rate": 8.88524093342762e-06, + "loss": 0.4421, + "step": 2210 + }, + { + "epoch": 0.11211838084896845, + "grad_norm": 9.177888788340095, + "learning_rate": 8.880189918173554e-06, + "loss": 0.4398, + "step": 2220 + }, + { + "epoch": 0.11262341860054038, + "grad_norm": 3.143167600541576, + "learning_rate": 8.875138902919488e-06, + "loss": 0.4297, + "step": 2230 + }, + { + "epoch": 0.11312845635211231, + "grad_norm": 2.152834452055092, + "learning_rate": 8.870087887665422e-06, + "loss": 0.4529, + "step": 2240 + }, + { + "epoch": 0.11363349410368426, + "grad_norm": 14.793812918224992, + "learning_rate": 8.865036872411356e-06, + "loss": 0.4383, + "step": 2250 + }, + { + "epoch": 0.11413853185525619, + "grad_norm": 3.45745902653208, + "learning_rate": 8.859985857157289e-06, + "loss": 0.4302, + "step": 2260 + }, + { + "epoch": 0.11464356960682812, + "grad_norm": 2.7114403885624605, + "learning_rate": 8.854934841903223e-06, + "loss": 0.4251, + "step": 2270 + }, + { + "epoch": 0.11514860735840005, + "grad_norm": 2.723960534334651, + "learning_rate": 8.849883826649157e-06, + "loss": 0.4286, + "step": 2280 + }, + { + "epoch": 0.11565364510997198, + "grad_norm": 2.4905833599831415, + "learning_rate": 8.844832811395091e-06, + "loss": 0.4508, + "step": 2290 + }, + { + "epoch": 0.1161586828615439, + "grad_norm": 20.83563549184929, + "learning_rate": 8.839781796141025e-06, + "loss": 0.4474, + "step": 2300 + }, + { + "epoch": 0.11666372061311583, + "grad_norm": 3.9724073335359336, + "learning_rate": 8.83473078088696e-06, + "loss": 0.4476, + "step": 2310 + }, + { + "epoch": 0.11716875836468776, + "grad_norm": 2.1463739260782106, + "learning_rate": 8.829679765632892e-06, + "loss": 0.4189, + "step": 2320 + }, + { + "epoch": 0.1176737961162597, + "grad_norm": 3.592083818788885, + "learning_rate": 8.824628750378826e-06, + "loss": 0.4242, + "step": 2330 + }, + { + "epoch": 0.11817883386783162, + "grad_norm": 2.548758001659721, + "learning_rate": 8.819577735124762e-06, + "loss": 0.4112, + "step": 2340 + }, + { + "epoch": 0.11868387161940355, + "grad_norm": 2.713449956929624, + "learning_rate": 8.814526719870696e-06, + "loss": 0.4173, + "step": 2350 + }, + { + "epoch": 0.11918890937097548, + "grad_norm": 3.2378838703432016, + "learning_rate": 8.80947570461663e-06, + "loss": 0.4256, + "step": 2360 + }, + { + "epoch": 0.11969394712254741, + "grad_norm": 3.8665508352583013, + "learning_rate": 8.804424689362562e-06, + "loss": 0.4241, + "step": 2370 + }, + { + "epoch": 0.12019898487411934, + "grad_norm": 4.556177393216747, + "learning_rate": 8.799373674108496e-06, + "loss": 0.4282, + "step": 2380 + }, + { + "epoch": 0.12070402262569127, + "grad_norm": 3.687650501163452, + "learning_rate": 8.79432265885443e-06, + "loss": 0.4242, + "step": 2390 + }, + { + "epoch": 0.1212090603772632, + "grad_norm": 2.9530727834279595, + "learning_rate": 8.789271643600365e-06, + "loss": 0.4301, + "step": 2400 + }, + { + "epoch": 0.12171409812883513, + "grad_norm": 2.916996176159921, + "learning_rate": 8.784220628346299e-06, + "loss": 0.4284, + "step": 2410 + }, + { + "epoch": 0.12221913588040706, + "grad_norm": 2.2417118662412734, + "learning_rate": 8.779169613092233e-06, + "loss": 0.4186, + "step": 2420 + }, + { + "epoch": 0.12272417363197899, + "grad_norm": 2.947908220653578, + "learning_rate": 8.774118597838165e-06, + "loss": 0.4223, + "step": 2430 + }, + { + "epoch": 0.12322921138355092, + "grad_norm": 2.071616410723938, + "learning_rate": 8.7690675825841e-06, + "loss": 0.4375, + "step": 2440 + }, + { + "epoch": 0.12373424913512285, + "grad_norm": 2.8361443264190296, + "learning_rate": 8.764016567330034e-06, + "loss": 0.426, + "step": 2450 + }, + { + "epoch": 0.12423928688669478, + "grad_norm": 2.820818503243756, + "learning_rate": 8.758965552075968e-06, + "loss": 0.4341, + "step": 2460 + }, + { + "epoch": 0.12474432463826671, + "grad_norm": 3.136410856304739, + "learning_rate": 8.753914536821902e-06, + "loss": 0.4445, + "step": 2470 + }, + { + "epoch": 0.12524936238983864, + "grad_norm": 8.047547235777753, + "learning_rate": 8.748863521567836e-06, + "loss": 0.4267, + "step": 2480 + }, + { + "epoch": 0.12575440014141057, + "grad_norm": 3.892449775126266, + "learning_rate": 8.74381250631377e-06, + "loss": 0.419, + "step": 2490 + }, + { + "epoch": 0.1262594378929825, + "grad_norm": 2.354897593217238, + "learning_rate": 8.738761491059704e-06, + "loss": 0.4272, + "step": 2500 + }, + { + "epoch": 0.12676447564455443, + "grad_norm": 2.5811852973740566, + "learning_rate": 8.733710475805638e-06, + "loss": 0.4253, + "step": 2510 + }, + { + "epoch": 0.12726951339612635, + "grad_norm": 6.105044708207583, + "learning_rate": 8.728659460551572e-06, + "loss": 0.4214, + "step": 2520 + }, + { + "epoch": 0.12777455114769828, + "grad_norm": 6.168633495733829, + "learning_rate": 8.723608445297505e-06, + "loss": 0.4374, + "step": 2530 + }, + { + "epoch": 0.1282795888992702, + "grad_norm": 3.3331523293740677, + "learning_rate": 8.718557430043439e-06, + "loss": 0.4221, + "step": 2540 + }, + { + "epoch": 0.12878462665084214, + "grad_norm": 3.1244184035542895, + "learning_rate": 8.713506414789373e-06, + "loss": 0.4362, + "step": 2550 + }, + { + "epoch": 0.12928966440241407, + "grad_norm": 3.438277611759734, + "learning_rate": 8.708455399535307e-06, + "loss": 0.4193, + "step": 2560 + }, + { + "epoch": 0.129794702153986, + "grad_norm": 5.520539408202482, + "learning_rate": 8.703404384281241e-06, + "loss": 0.4057, + "step": 2570 + }, + { + "epoch": 0.13029973990555793, + "grad_norm": 5.422130415998936, + "learning_rate": 8.698353369027176e-06, + "loss": 0.4384, + "step": 2580 + }, + { + "epoch": 0.13080477765712986, + "grad_norm": 3.341989913378195, + "learning_rate": 8.693302353773108e-06, + "loss": 0.4162, + "step": 2590 + }, + { + "epoch": 0.1313098154087018, + "grad_norm": 2.475515489410939, + "learning_rate": 8.688251338519042e-06, + "loss": 0.4348, + "step": 2600 + }, + { + "epoch": 0.13181485316027372, + "grad_norm": 2.550493025405498, + "learning_rate": 8.683200323264976e-06, + "loss": 0.4173, + "step": 2610 + }, + { + "epoch": 0.13231989091184565, + "grad_norm": 2.0333378304030956, + "learning_rate": 8.678149308010912e-06, + "loss": 0.423, + "step": 2620 + }, + { + "epoch": 0.13282492866341758, + "grad_norm": 2.2902374319536176, + "learning_rate": 8.673098292756844e-06, + "loss": 0.443, + "step": 2630 + }, + { + "epoch": 0.1333299664149895, + "grad_norm": 2.774814046594427, + "learning_rate": 8.668047277502779e-06, + "loss": 0.4174, + "step": 2640 + }, + { + "epoch": 0.13383500416656144, + "grad_norm": 3.8730252519897013, + "learning_rate": 8.662996262248713e-06, + "loss": 0.4248, + "step": 2650 + }, + { + "epoch": 0.13434004191813337, + "grad_norm": 2.6952038500341127, + "learning_rate": 8.657945246994647e-06, + "loss": 0.4117, + "step": 2660 + }, + { + "epoch": 0.1348450796697053, + "grad_norm": 2.7361849988399634, + "learning_rate": 8.652894231740581e-06, + "loss": 0.4249, + "step": 2670 + }, + { + "epoch": 0.13535011742127723, + "grad_norm": 2.9758725333203535, + "learning_rate": 8.647843216486515e-06, + "loss": 0.4381, + "step": 2680 + }, + { + "epoch": 0.13585515517284916, + "grad_norm": 2.551067170047961, + "learning_rate": 8.642792201232447e-06, + "loss": 0.4214, + "step": 2690 + }, + { + "epoch": 0.13636019292442111, + "grad_norm": 2.6334795048127724, + "learning_rate": 8.637741185978382e-06, + "loss": 0.427, + "step": 2700 + }, + { + "epoch": 0.13686523067599304, + "grad_norm": 3.9869976953943866, + "learning_rate": 8.632690170724316e-06, + "loss": 0.4101, + "step": 2710 + }, + { + "epoch": 0.13737026842756497, + "grad_norm": 3.313933800574665, + "learning_rate": 8.62763915547025e-06, + "loss": 0.4138, + "step": 2720 + }, + { + "epoch": 0.1378753061791369, + "grad_norm": 3.641595792417612, + "learning_rate": 8.622588140216184e-06, + "loss": 0.4256, + "step": 2730 + }, + { + "epoch": 0.13838034393070883, + "grad_norm": 5.36751356787116, + "learning_rate": 8.617537124962118e-06, + "loss": 0.4314, + "step": 2740 + }, + { + "epoch": 0.13888538168228076, + "grad_norm": 2.5991069721105977, + "learning_rate": 8.612486109708052e-06, + "loss": 0.4157, + "step": 2750 + }, + { + "epoch": 0.1393904194338527, + "grad_norm": 3.1641348139616463, + "learning_rate": 8.607435094453986e-06, + "loss": 0.4023, + "step": 2760 + }, + { + "epoch": 0.13989545718542462, + "grad_norm": 3.021572554543311, + "learning_rate": 8.60238407919992e-06, + "loss": 0.4138, + "step": 2770 + }, + { + "epoch": 0.14040049493699655, + "grad_norm": 3.0773716536091063, + "learning_rate": 8.597333063945855e-06, + "loss": 0.4347, + "step": 2780 + }, + { + "epoch": 0.14090553268856848, + "grad_norm": 3.1572917500612236, + "learning_rate": 8.592282048691789e-06, + "loss": 0.425, + "step": 2790 + }, + { + "epoch": 0.1414105704401404, + "grad_norm": 3.065547472806378, + "learning_rate": 8.587231033437721e-06, + "loss": 0.4227, + "step": 2800 + }, + { + "epoch": 0.14191560819171234, + "grad_norm": 3.0283835796400593, + "learning_rate": 8.582180018183655e-06, + "loss": 0.4126, + "step": 2810 + }, + { + "epoch": 0.14242064594328427, + "grad_norm": 3.173481302073063, + "learning_rate": 8.57712900292959e-06, + "loss": 0.434, + "step": 2820 + }, + { + "epoch": 0.1429256836948562, + "grad_norm": 3.765724472944031, + "learning_rate": 8.572077987675524e-06, + "loss": 0.4463, + "step": 2830 + }, + { + "epoch": 0.14343072144642813, + "grad_norm": 5.419243180691885, + "learning_rate": 8.567026972421458e-06, + "loss": 0.4213, + "step": 2840 + }, + { + "epoch": 0.14393575919800006, + "grad_norm": 7.3295747683680865, + "learning_rate": 8.561975957167392e-06, + "loss": 0.4202, + "step": 2850 + }, + { + "epoch": 0.144440796949572, + "grad_norm": 6.887688386053364, + "learning_rate": 8.556924941913324e-06, + "loss": 0.4238, + "step": 2860 + }, + { + "epoch": 0.14494583470114392, + "grad_norm": 2.786632234737629, + "learning_rate": 8.551873926659258e-06, + "loss": 0.431, + "step": 2870 + }, + { + "epoch": 0.14545087245271585, + "grad_norm": 2.60607889031576, + "learning_rate": 8.546822911405192e-06, + "loss": 0.4306, + "step": 2880 + }, + { + "epoch": 0.14595591020428778, + "grad_norm": 8.477010546599903, + "learning_rate": 8.541771896151128e-06, + "loss": 0.4419, + "step": 2890 + }, + { + "epoch": 0.1464609479558597, + "grad_norm": 2.27833113141815, + "learning_rate": 8.53672088089706e-06, + "loss": 0.4226, + "step": 2900 + }, + { + "epoch": 0.14696598570743163, + "grad_norm": 4.423453943948206, + "learning_rate": 8.531669865642995e-06, + "loss": 0.439, + "step": 2910 + }, + { + "epoch": 0.14747102345900356, + "grad_norm": 3.6944661359694013, + "learning_rate": 8.526618850388929e-06, + "loss": 0.4221, + "step": 2920 + }, + { + "epoch": 0.1479760612105755, + "grad_norm": 5.078989868876925, + "learning_rate": 8.521567835134863e-06, + "loss": 0.4317, + "step": 2930 + }, + { + "epoch": 0.14848109896214742, + "grad_norm": 3.37535562195142, + "learning_rate": 8.516516819880797e-06, + "loss": 0.4365, + "step": 2940 + }, + { + "epoch": 0.14898613671371935, + "grad_norm": 4.387702072726384, + "learning_rate": 8.511465804626731e-06, + "loss": 0.4133, + "step": 2950 + }, + { + "epoch": 0.14949117446529128, + "grad_norm": 4.377549617088979, + "learning_rate": 8.506414789372664e-06, + "loss": 0.4332, + "step": 2960 + }, + { + "epoch": 0.1499962122168632, + "grad_norm": 3.6071548234377033, + "learning_rate": 8.501363774118598e-06, + "loss": 0.424, + "step": 2970 + }, + { + "epoch": 0.15050124996843514, + "grad_norm": 3.2776129282491953, + "learning_rate": 8.496312758864532e-06, + "loss": 0.431, + "step": 2980 + }, + { + "epoch": 0.15100628772000707, + "grad_norm": 4.474289915888015, + "learning_rate": 8.491261743610466e-06, + "loss": 0.4289, + "step": 2990 + }, + { + "epoch": 0.151511325471579, + "grad_norm": 2.562841849224475, + "learning_rate": 8.4862107283564e-06, + "loss": 0.4291, + "step": 3000 + }, + { + "epoch": 0.15201636322315093, + "grad_norm": 2.9179612414883325, + "learning_rate": 8.481159713102334e-06, + "loss": 0.4187, + "step": 3010 + }, + { + "epoch": 0.15252140097472286, + "grad_norm": 3.2632959036982703, + "learning_rate": 8.476108697848268e-06, + "loss": 0.4212, + "step": 3020 + }, + { + "epoch": 0.1530264387262948, + "grad_norm": 3.962692563420196, + "learning_rate": 8.471057682594203e-06, + "loss": 0.4228, + "step": 3030 + }, + { + "epoch": 0.15353147647786672, + "grad_norm": 3.2213155600732803, + "learning_rate": 8.466006667340137e-06, + "loss": 0.4271, + "step": 3040 + }, + { + "epoch": 0.15403651422943865, + "grad_norm": 2.3479787665404555, + "learning_rate": 8.460955652086071e-06, + "loss": 0.4329, + "step": 3050 + }, + { + "epoch": 0.15454155198101058, + "grad_norm": 2.7693374782815035, + "learning_rate": 8.455904636832003e-06, + "loss": 0.4168, + "step": 3060 + }, + { + "epoch": 0.1550465897325825, + "grad_norm": 2.7643943115933314, + "learning_rate": 8.450853621577937e-06, + "loss": 0.4145, + "step": 3070 + }, + { + "epoch": 0.15555162748415444, + "grad_norm": 2.6062282259447422, + "learning_rate": 8.445802606323872e-06, + "loss": 0.4191, + "step": 3080 + }, + { + "epoch": 0.15605666523572637, + "grad_norm": 2.60763373572543, + "learning_rate": 8.440751591069806e-06, + "loss": 0.4161, + "step": 3090 + }, + { + "epoch": 0.1565617029872983, + "grad_norm": 2.4475412139703, + "learning_rate": 8.43570057581574e-06, + "loss": 0.4273, + "step": 3100 + }, + { + "epoch": 0.15706674073887023, + "grad_norm": 2.9309156363483084, + "learning_rate": 8.430649560561674e-06, + "loss": 0.4042, + "step": 3110 + }, + { + "epoch": 0.15757177849044215, + "grad_norm": 2.2191638521741845, + "learning_rate": 8.425598545307608e-06, + "loss": 0.4306, + "step": 3120 + }, + { + "epoch": 0.15807681624201408, + "grad_norm": 2.0790541165617893, + "learning_rate": 8.42054753005354e-06, + "loss": 0.4219, + "step": 3130 + }, + { + "epoch": 0.158581853993586, + "grad_norm": 2.344144785550955, + "learning_rate": 8.415496514799475e-06, + "loss": 0.44, + "step": 3140 + }, + { + "epoch": 0.15908689174515794, + "grad_norm": 3.896107585478935, + "learning_rate": 8.410445499545409e-06, + "loss": 0.4196, + "step": 3150 + }, + { + "epoch": 0.15959192949672987, + "grad_norm": 3.511331268768427, + "learning_rate": 8.405394484291345e-06, + "loss": 0.4138, + "step": 3160 + }, + { + "epoch": 0.1600969672483018, + "grad_norm": 4.918842428536537, + "learning_rate": 8.400343469037277e-06, + "loss": 0.426, + "step": 3170 + }, + { + "epoch": 0.16060200499987373, + "grad_norm": 2.670823674907055, + "learning_rate": 8.395292453783211e-06, + "loss": 0.4099, + "step": 3180 + }, + { + "epoch": 0.16110704275144566, + "grad_norm": 2.125448197180906, + "learning_rate": 8.390241438529145e-06, + "loss": 0.4186, + "step": 3190 + }, + { + "epoch": 0.1616120805030176, + "grad_norm": 1.971202039415502, + "learning_rate": 8.38519042327508e-06, + "loss": 0.4307, + "step": 3200 + }, + { + "epoch": 0.16211711825458952, + "grad_norm": 3.637785251447948, + "learning_rate": 8.380139408021013e-06, + "loss": 0.4315, + "step": 3210 + }, + { + "epoch": 0.16262215600616145, + "grad_norm": 3.1909501990683027, + "learning_rate": 8.375088392766948e-06, + "loss": 0.4103, + "step": 3220 + }, + { + "epoch": 0.16312719375773338, + "grad_norm": 3.426808590014172, + "learning_rate": 8.37003737751288e-06, + "loss": 0.4178, + "step": 3230 + }, + { + "epoch": 0.1636322315093053, + "grad_norm": 2.794301511590198, + "learning_rate": 8.364986362258814e-06, + "loss": 0.421, + "step": 3240 + }, + { + "epoch": 0.16413726926087724, + "grad_norm": 2.6521369002493196, + "learning_rate": 8.359935347004748e-06, + "loss": 0.4279, + "step": 3250 + }, + { + "epoch": 0.16464230701244917, + "grad_norm": 3.065473735931392, + "learning_rate": 8.354884331750682e-06, + "loss": 0.4242, + "step": 3260 + }, + { + "epoch": 0.1651473447640211, + "grad_norm": 2.3676634296220826, + "learning_rate": 8.349833316496616e-06, + "loss": 0.4004, + "step": 3270 + }, + { + "epoch": 0.16565238251559303, + "grad_norm": 3.0123313610182407, + "learning_rate": 8.34478230124255e-06, + "loss": 0.4168, + "step": 3280 + }, + { + "epoch": 0.16615742026716496, + "grad_norm": 2.613652117431061, + "learning_rate": 8.339731285988485e-06, + "loss": 0.4032, + "step": 3290 + }, + { + "epoch": 0.16666245801873691, + "grad_norm": 2.5495103609008383, + "learning_rate": 8.334680270734419e-06, + "loss": 0.4258, + "step": 3300 + }, + { + "epoch": 0.16716749577030884, + "grad_norm": 1.944110652634474, + "learning_rate": 8.329629255480353e-06, + "loss": 0.4047, + "step": 3310 + }, + { + "epoch": 0.16767253352188077, + "grad_norm": 3.996554861787568, + "learning_rate": 8.324578240226287e-06, + "loss": 0.4266, + "step": 3320 + }, + { + "epoch": 0.1681775712734527, + "grad_norm": 3.311463894609568, + "learning_rate": 8.31952722497222e-06, + "loss": 0.4142, + "step": 3330 + }, + { + "epoch": 0.16868260902502463, + "grad_norm": 2.6266078214610418, + "learning_rate": 8.314476209718154e-06, + "loss": 0.4106, + "step": 3340 + }, + { + "epoch": 0.16918764677659656, + "grad_norm": 3.3843962429509533, + "learning_rate": 8.309425194464088e-06, + "loss": 0.4023, + "step": 3350 + }, + { + "epoch": 0.1696926845281685, + "grad_norm": 2.755113613993984, + "learning_rate": 8.304374179210022e-06, + "loss": 0.3974, + "step": 3360 + }, + { + "epoch": 0.17019772227974042, + "grad_norm": 3.915262353619256, + "learning_rate": 8.299323163955956e-06, + "loss": 0.4233, + "step": 3370 + }, + { + "epoch": 0.17070276003131235, + "grad_norm": 6.128222421561234, + "learning_rate": 8.29427214870189e-06, + "loss": 0.3987, + "step": 3380 + }, + { + "epoch": 0.17120779778288428, + "grad_norm": 3.244731601327553, + "learning_rate": 8.289221133447823e-06, + "loss": 0.408, + "step": 3390 + }, + { + "epoch": 0.1717128355344562, + "grad_norm": 3.415058459654747, + "learning_rate": 8.284170118193757e-06, + "loss": 0.4264, + "step": 3400 + }, + { + "epoch": 0.17221787328602814, + "grad_norm": 4.870959665296901, + "learning_rate": 8.27911910293969e-06, + "loss": 0.4068, + "step": 3410 + }, + { + "epoch": 0.17272291103760007, + "grad_norm": 3.1738996571406837, + "learning_rate": 8.274068087685625e-06, + "loss": 0.4238, + "step": 3420 + }, + { + "epoch": 0.173227948789172, + "grad_norm": 5.015822248252729, + "learning_rate": 8.269017072431559e-06, + "loss": 0.4258, + "step": 3430 + }, + { + "epoch": 0.17373298654074393, + "grad_norm": 5.826443535049107, + "learning_rate": 8.263966057177493e-06, + "loss": 0.389, + "step": 3440 + }, + { + "epoch": 0.17423802429231586, + "grad_norm": 3.9268362077276366, + "learning_rate": 8.258915041923427e-06, + "loss": 0.4302, + "step": 3450 + }, + { + "epoch": 0.1747430620438878, + "grad_norm": 2.7948709929923155, + "learning_rate": 8.253864026669361e-06, + "loss": 0.4179, + "step": 3460 + }, + { + "epoch": 0.17524809979545972, + "grad_norm": 7.055728348112125, + "learning_rate": 8.248813011415296e-06, + "loss": 0.4249, + "step": 3470 + }, + { + "epoch": 0.17575313754703165, + "grad_norm": 11.223550772273683, + "learning_rate": 8.24376199616123e-06, + "loss": 0.4198, + "step": 3480 + }, + { + "epoch": 0.17625817529860358, + "grad_norm": 3.272872614673025, + "learning_rate": 8.238710980907164e-06, + "loss": 0.412, + "step": 3490 + }, + { + "epoch": 0.1767632130501755, + "grad_norm": 4.687786747360582, + "learning_rate": 8.233659965653096e-06, + "loss": 0.4143, + "step": 3500 + }, + { + "epoch": 0.17726825080174743, + "grad_norm": 4.2131701830416315, + "learning_rate": 8.22860895039903e-06, + "loss": 0.407, + "step": 3510 + }, + { + "epoch": 0.17777328855331936, + "grad_norm": 2.7350467089835058, + "learning_rate": 8.223557935144965e-06, + "loss": 0.4065, + "step": 3520 + }, + { + "epoch": 0.1782783263048913, + "grad_norm": 15.963906126522279, + "learning_rate": 8.218506919890899e-06, + "loss": 0.4053, + "step": 3530 + }, + { + "epoch": 0.17878336405646322, + "grad_norm": 6.283377242483946, + "learning_rate": 8.213455904636833e-06, + "loss": 0.4398, + "step": 3540 + }, + { + "epoch": 0.17928840180803515, + "grad_norm": 4.211980280469181, + "learning_rate": 8.208404889382767e-06, + "loss": 0.4279, + "step": 3550 + }, + { + "epoch": 0.17979343955960708, + "grad_norm": 4.888324462942457, + "learning_rate": 8.2033538741287e-06, + "loss": 0.4279, + "step": 3560 + }, + { + "epoch": 0.180298477311179, + "grad_norm": 4.806718275911088, + "learning_rate": 8.198302858874635e-06, + "loss": 0.4074, + "step": 3570 + }, + { + "epoch": 0.18080351506275094, + "grad_norm": 14.618909757426387, + "learning_rate": 8.19325184362057e-06, + "loss": 0.4377, + "step": 3580 + }, + { + "epoch": 0.18130855281432287, + "grad_norm": 3.0145598452408935, + "learning_rate": 8.188200828366503e-06, + "loss": 0.4244, + "step": 3590 + }, + { + "epoch": 0.1818135905658948, + "grad_norm": 3.537451616849706, + "learning_rate": 8.183149813112436e-06, + "loss": 0.4115, + "step": 3600 + }, + { + "epoch": 0.18231862831746673, + "grad_norm": 6.760898974505609, + "learning_rate": 8.17809879785837e-06, + "loss": 0.4183, + "step": 3610 + }, + { + "epoch": 0.18282366606903866, + "grad_norm": 3.1669599365914927, + "learning_rate": 8.173047782604304e-06, + "loss": 0.4116, + "step": 3620 + }, + { + "epoch": 0.1833287038206106, + "grad_norm": 15.662389012118272, + "learning_rate": 8.167996767350238e-06, + "loss": 0.4208, + "step": 3630 + }, + { + "epoch": 0.18383374157218252, + "grad_norm": 7.978949404174967, + "learning_rate": 8.162945752096172e-06, + "loss": 0.4355, + "step": 3640 + }, + { + "epoch": 0.18433877932375445, + "grad_norm": 3.0917950299227375, + "learning_rate": 8.157894736842106e-06, + "loss": 0.4161, + "step": 3650 + }, + { + "epoch": 0.18484381707532638, + "grad_norm": 5.585272554418732, + "learning_rate": 8.152843721588039e-06, + "loss": 0.4206, + "step": 3660 + }, + { + "epoch": 0.1853488548268983, + "grad_norm": 2.7026089034564875, + "learning_rate": 8.147792706333973e-06, + "loss": 0.4092, + "step": 3670 + }, + { + "epoch": 0.18585389257847024, + "grad_norm": 4.859931210791958, + "learning_rate": 8.142741691079907e-06, + "loss": 0.4064, + "step": 3680 + }, + { + "epoch": 0.18635893033004217, + "grad_norm": 3.403058240228029, + "learning_rate": 8.137690675825841e-06, + "loss": 0.4082, + "step": 3690 + }, + { + "epoch": 0.1868639680816141, + "grad_norm": 3.2005619654009076, + "learning_rate": 8.132639660571775e-06, + "loss": 0.4042, + "step": 3700 + }, + { + "epoch": 0.18736900583318603, + "grad_norm": 2.6974451416766443, + "learning_rate": 8.12758864531771e-06, + "loss": 0.4291, + "step": 3710 + }, + { + "epoch": 0.18787404358475795, + "grad_norm": 3.226834318122475, + "learning_rate": 8.122537630063644e-06, + "loss": 0.4173, + "step": 3720 + }, + { + "epoch": 0.18837908133632988, + "grad_norm": 2.6103166184994895, + "learning_rate": 8.117486614809578e-06, + "loss": 0.4112, + "step": 3730 + }, + { + "epoch": 0.18888411908790181, + "grad_norm": 2.691120701877213, + "learning_rate": 8.112435599555512e-06, + "loss": 0.4083, + "step": 3740 + }, + { + "epoch": 0.18938915683947374, + "grad_norm": 3.323955083237021, + "learning_rate": 8.107384584301446e-06, + "loss": 0.4096, + "step": 3750 + }, + { + "epoch": 0.18989419459104567, + "grad_norm": 6.965415675773204, + "learning_rate": 8.102333569047378e-06, + "loss": 0.3977, + "step": 3760 + }, + { + "epoch": 0.1903992323426176, + "grad_norm": 2.451525045869991, + "learning_rate": 8.097282553793313e-06, + "loss": 0.4195, + "step": 3770 + }, + { + "epoch": 0.19090427009418953, + "grad_norm": 2.6787347183394243, + "learning_rate": 8.092231538539247e-06, + "loss": 0.4121, + "step": 3780 + }, + { + "epoch": 0.19140930784576146, + "grad_norm": 6.742330893738729, + "learning_rate": 8.08718052328518e-06, + "loss": 0.4102, + "step": 3790 + }, + { + "epoch": 0.1919143455973334, + "grad_norm": 2.9458958646737083, + "learning_rate": 8.082129508031115e-06, + "loss": 0.4206, + "step": 3800 + }, + { + "epoch": 0.19241938334890532, + "grad_norm": 4.018744503984466, + "learning_rate": 8.077078492777049e-06, + "loss": 0.4367, + "step": 3810 + }, + { + "epoch": 0.19292442110047725, + "grad_norm": 4.594358535206507, + "learning_rate": 8.072027477522983e-06, + "loss": 0.4123, + "step": 3820 + }, + { + "epoch": 0.19342945885204918, + "grad_norm": 7.765118457446688, + "learning_rate": 8.066976462268916e-06, + "loss": 0.4203, + "step": 3830 + }, + { + "epoch": 0.1939344966036211, + "grad_norm": 4.571772892446487, + "learning_rate": 8.061925447014851e-06, + "loss": 0.4093, + "step": 3840 + }, + { + "epoch": 0.19443953435519304, + "grad_norm": 3.0704420404585995, + "learning_rate": 8.056874431760786e-06, + "loss": 0.4195, + "step": 3850 + }, + { + "epoch": 0.19494457210676497, + "grad_norm": 2.8235987578400334, + "learning_rate": 8.05182341650672e-06, + "loss": 0.4131, + "step": 3860 + }, + { + "epoch": 0.1954496098583369, + "grad_norm": 4.664925999855343, + "learning_rate": 8.046772401252652e-06, + "loss": 0.4052, + "step": 3870 + }, + { + "epoch": 0.19595464760990883, + "grad_norm": 3.0583937314253, + "learning_rate": 8.041721385998586e-06, + "loss": 0.4198, + "step": 3880 + }, + { + "epoch": 0.19645968536148076, + "grad_norm": 2.973956981634855, + "learning_rate": 8.03667037074452e-06, + "loss": 0.4066, + "step": 3890 + }, + { + "epoch": 0.19696472311305271, + "grad_norm": 2.803835666144933, + "learning_rate": 8.031619355490454e-06, + "loss": 0.4158, + "step": 3900 + }, + { + "epoch": 0.19746976086462464, + "grad_norm": 4.834280824568942, + "learning_rate": 8.026568340236389e-06, + "loss": 0.4178, + "step": 3910 + }, + { + "epoch": 0.19797479861619657, + "grad_norm": 10.038307243068243, + "learning_rate": 8.021517324982323e-06, + "loss": 0.4111, + "step": 3920 + }, + { + "epoch": 0.1984798363677685, + "grad_norm": 7.610030059675689, + "learning_rate": 8.016466309728255e-06, + "loss": 0.4096, + "step": 3930 + }, + { + "epoch": 0.19898487411934043, + "grad_norm": 3.6050764165551556, + "learning_rate": 8.01141529447419e-06, + "loss": 0.4233, + "step": 3940 + }, + { + "epoch": 0.19948991187091236, + "grad_norm": 6.736237771102318, + "learning_rate": 8.006364279220123e-06, + "loss": 0.4153, + "step": 3950 + }, + { + "epoch": 0.1999949496224843, + "grad_norm": 2.9583460730423443, + "learning_rate": 8.001313263966057e-06, + "loss": 0.4211, + "step": 3960 + }, + { + "epoch": 0.20049998737405622, + "grad_norm": 2.2968014860075576, + "learning_rate": 7.996262248711992e-06, + "loss": 0.4289, + "step": 3970 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 2.474011765446813, + "learning_rate": 7.991211233457926e-06, + "loss": 0.4233, + "step": 3980 + }, + { + "epoch": 0.20151006287720008, + "grad_norm": 3.0063110556198778, + "learning_rate": 7.98616021820386e-06, + "loss": 0.428, + "step": 3990 + }, + { + "epoch": 0.202015100628772, + "grad_norm": 4.355361962575098, + "learning_rate": 7.981109202949794e-06, + "loss": 0.4226, + "step": 4000 + }, + { + "epoch": 0.20252013838034394, + "grad_norm": 2.6560259873816747, + "learning_rate": 7.976058187695728e-06, + "loss": 0.4175, + "step": 4010 + }, + { + "epoch": 0.20302517613191587, + "grad_norm": 2.9619585376031647, + "learning_rate": 7.971007172441662e-06, + "loss": 0.4143, + "step": 4020 + }, + { + "epoch": 0.2035302138834878, + "grad_norm": 2.066998306222369, + "learning_rate": 7.965956157187595e-06, + "loss": 0.4001, + "step": 4030 + }, + { + "epoch": 0.20403525163505973, + "grad_norm": 4.632842633228617, + "learning_rate": 7.960905141933529e-06, + "loss": 0.4133, + "step": 4040 + }, + { + "epoch": 0.20454028938663166, + "grad_norm": 6.512090967359411, + "learning_rate": 7.955854126679463e-06, + "loss": 0.4271, + "step": 4050 + }, + { + "epoch": 0.2050453271382036, + "grad_norm": 3.4613990642774777, + "learning_rate": 7.950803111425397e-06, + "loss": 0.4035, + "step": 4060 + }, + { + "epoch": 0.20555036488977552, + "grad_norm": 6.082524106916885, + "learning_rate": 7.945752096171331e-06, + "loss": 0.4186, + "step": 4070 + }, + { + "epoch": 0.20605540264134745, + "grad_norm": 9.959451587245269, + "learning_rate": 7.940701080917265e-06, + "loss": 0.4136, + "step": 4080 + }, + { + "epoch": 0.20656044039291938, + "grad_norm": 3.1580403384223694, + "learning_rate": 7.935650065663198e-06, + "loss": 0.4166, + "step": 4090 + }, + { + "epoch": 0.2070654781444913, + "grad_norm": 43.253412774792665, + "learning_rate": 7.930599050409132e-06, + "loss": 0.4237, + "step": 4100 + }, + { + "epoch": 0.20757051589606323, + "grad_norm": 4.161811983252057, + "learning_rate": 7.925548035155068e-06, + "loss": 0.426, + "step": 4110 + }, + { + "epoch": 0.20807555364763516, + "grad_norm": 4.07148032877867, + "learning_rate": 7.920497019901002e-06, + "loss": 0.4046, + "step": 4120 + }, + { + "epoch": 0.2085805913992071, + "grad_norm": 2.980398901316733, + "learning_rate": 7.915446004646934e-06, + "loss": 0.4018, + "step": 4130 + }, + { + "epoch": 0.20908562915077902, + "grad_norm": 12.570414912224734, + "learning_rate": 7.910394989392868e-06, + "loss": 0.4249, + "step": 4140 + }, + { + "epoch": 0.20959066690235095, + "grad_norm": 4.296760517181418, + "learning_rate": 7.905343974138802e-06, + "loss": 0.4134, + "step": 4150 + }, + { + "epoch": 0.21009570465392288, + "grad_norm": 3.8924058247405795, + "learning_rate": 7.900292958884737e-06, + "loss": 0.4184, + "step": 4160 + }, + { + "epoch": 0.2106007424054948, + "grad_norm": 5.074708812193615, + "learning_rate": 7.89524194363067e-06, + "loss": 0.4326, + "step": 4170 + }, + { + "epoch": 0.21110578015706674, + "grad_norm": 4.094894279147386, + "learning_rate": 7.890190928376605e-06, + "loss": 0.427, + "step": 4180 + }, + { + "epoch": 0.21161081790863867, + "grad_norm": 7.0180188428328245, + "learning_rate": 7.885139913122539e-06, + "loss": 0.4434, + "step": 4190 + }, + { + "epoch": 0.2121158556602106, + "grad_norm": 3.1522461269078854, + "learning_rate": 7.880088897868471e-06, + "loss": 0.4118, + "step": 4200 + }, + { + "epoch": 0.21262089341178253, + "grad_norm": 3.1387724966942048, + "learning_rate": 7.875037882614405e-06, + "loss": 0.4188, + "step": 4210 + }, + { + "epoch": 0.21312593116335446, + "grad_norm": 2.6776667573826005, + "learning_rate": 7.86998686736034e-06, + "loss": 0.432, + "step": 4220 + }, + { + "epoch": 0.2136309689149264, + "grad_norm": 3.6637419491007046, + "learning_rate": 7.864935852106274e-06, + "loss": 0.435, + "step": 4230 + }, + { + "epoch": 0.21413600666649832, + "grad_norm": 5.882360941042325, + "learning_rate": 7.859884836852208e-06, + "loss": 0.4308, + "step": 4240 + }, + { + "epoch": 0.21464104441807025, + "grad_norm": 8.010164762562015, + "learning_rate": 7.854833821598142e-06, + "loss": 0.4035, + "step": 4250 + }, + { + "epoch": 0.21514608216964218, + "grad_norm": 2.940352809589755, + "learning_rate": 7.849782806344076e-06, + "loss": 0.4045, + "step": 4260 + }, + { + "epoch": 0.2156511199212141, + "grad_norm": 5.768043762782633, + "learning_rate": 7.84473179109001e-06, + "loss": 0.4106, + "step": 4270 + }, + { + "epoch": 0.21615615767278604, + "grad_norm": 4.169886235036406, + "learning_rate": 7.839680775835944e-06, + "loss": 0.4238, + "step": 4280 + }, + { + "epoch": 0.21666119542435797, + "grad_norm": 3.249720384522653, + "learning_rate": 7.834629760581878e-06, + "loss": 0.3969, + "step": 4290 + }, + { + "epoch": 0.2171662331759299, + "grad_norm": 5.065976709307023, + "learning_rate": 7.829578745327811e-06, + "loss": 0.4069, + "step": 4300 + }, + { + "epoch": 0.21767127092750183, + "grad_norm": 3.9668832894751107, + "learning_rate": 7.824527730073745e-06, + "loss": 0.4125, + "step": 4310 + }, + { + "epoch": 0.21817630867907375, + "grad_norm": 3.4619845168591796, + "learning_rate": 7.819476714819679e-06, + "loss": 0.4105, + "step": 4320 + }, + { + "epoch": 0.21868134643064568, + "grad_norm": 3.886033142263623, + "learning_rate": 7.814425699565613e-06, + "loss": 0.4151, + "step": 4330 + }, + { + "epoch": 0.21918638418221761, + "grad_norm": 3.5781821573741994, + "learning_rate": 7.809374684311547e-06, + "loss": 0.3923, + "step": 4340 + }, + { + "epoch": 0.21969142193378954, + "grad_norm": 2.9767702411423325, + "learning_rate": 7.804323669057482e-06, + "loss": 0.3827, + "step": 4350 + }, + { + "epoch": 0.22019645968536147, + "grad_norm": 3.3595860629684577, + "learning_rate": 7.799272653803414e-06, + "loss": 0.4103, + "step": 4360 + }, + { + "epoch": 0.2207014974369334, + "grad_norm": 2.0952276159055154, + "learning_rate": 7.794221638549348e-06, + "loss": 0.4171, + "step": 4370 + }, + { + "epoch": 0.22120653518850533, + "grad_norm": 3.1915802413191376, + "learning_rate": 7.789170623295284e-06, + "loss": 0.4273, + "step": 4380 + }, + { + "epoch": 0.22171157294007726, + "grad_norm": 1.8629666469771935, + "learning_rate": 7.784119608041218e-06, + "loss": 0.4226, + "step": 4390 + }, + { + "epoch": 0.2222166106916492, + "grad_norm": 4.627481223873417, + "learning_rate": 7.77906859278715e-06, + "loss": 0.3979, + "step": 4400 + }, + { + "epoch": 0.22272164844322112, + "grad_norm": 3.4768855391600084, + "learning_rate": 7.774017577533085e-06, + "loss": 0.413, + "step": 4410 + }, + { + "epoch": 0.22322668619479305, + "grad_norm": 2.8907573807850673, + "learning_rate": 7.768966562279019e-06, + "loss": 0.4201, + "step": 4420 + }, + { + "epoch": 0.22373172394636498, + "grad_norm": 5.5052911721090805, + "learning_rate": 7.763915547024953e-06, + "loss": 0.4322, + "step": 4430 + }, + { + "epoch": 0.2242367616979369, + "grad_norm": 2.171757782442608, + "learning_rate": 7.758864531770887e-06, + "loss": 0.4341, + "step": 4440 + }, + { + "epoch": 0.22474179944950884, + "grad_norm": 2.092006999366198, + "learning_rate": 7.753813516516821e-06, + "loss": 0.422, + "step": 4450 + }, + { + "epoch": 0.22524683720108077, + "grad_norm": 6.02493579048235, + "learning_rate": 7.748762501262753e-06, + "loss": 0.4105, + "step": 4460 + }, + { + "epoch": 0.2257518749526527, + "grad_norm": 3.705631561875418, + "learning_rate": 7.743711486008688e-06, + "loss": 0.4081, + "step": 4470 + }, + { + "epoch": 0.22625691270422463, + "grad_norm": 4.191020612596744, + "learning_rate": 7.738660470754622e-06, + "loss": 0.402, + "step": 4480 + }, + { + "epoch": 0.22676195045579656, + "grad_norm": 2.9926161263513484, + "learning_rate": 7.733609455500556e-06, + "loss": 0.4105, + "step": 4490 + }, + { + "epoch": 0.22726698820736851, + "grad_norm": 2.7804371886771304, + "learning_rate": 7.72855844024649e-06, + "loss": 0.4004, + "step": 4500 + }, + { + "epoch": 0.22777202595894044, + "grad_norm": 3.3532410934862007, + "learning_rate": 7.723507424992424e-06, + "loss": 0.3982, + "step": 4510 + }, + { + "epoch": 0.22827706371051237, + "grad_norm": 5.67085098197258, + "learning_rate": 7.718456409738358e-06, + "loss": 0.4244, + "step": 4520 + }, + { + "epoch": 0.2287821014620843, + "grad_norm": 6.390172943127481, + "learning_rate": 7.713405394484292e-06, + "loss": 0.405, + "step": 4530 + }, + { + "epoch": 0.22928713921365623, + "grad_norm": 2.0493376647118535, + "learning_rate": 7.708354379230226e-06, + "loss": 0.4091, + "step": 4540 + }, + { + "epoch": 0.22979217696522816, + "grad_norm": 3.2998640142236373, + "learning_rate": 7.70330336397616e-06, + "loss": 0.4116, + "step": 4550 + }, + { + "epoch": 0.2302972147168001, + "grad_norm": 9.39006495464318, + "learning_rate": 7.698252348722095e-06, + "loss": 0.4047, + "step": 4560 + }, + { + "epoch": 0.23080225246837202, + "grad_norm": 5.551570211137529, + "learning_rate": 7.693201333468027e-06, + "loss": 0.408, + "step": 4570 + }, + { + "epoch": 0.23130729021994395, + "grad_norm": 4.557076032338838, + "learning_rate": 7.688150318213961e-06, + "loss": 0.3971, + "step": 4580 + }, + { + "epoch": 0.23181232797151588, + "grad_norm": 2.606206409240519, + "learning_rate": 7.683099302959895e-06, + "loss": 0.4178, + "step": 4590 + }, + { + "epoch": 0.2323173657230878, + "grad_norm": 4.447142719835132, + "learning_rate": 7.67804828770583e-06, + "loss": 0.394, + "step": 4600 + }, + { + "epoch": 0.23282240347465974, + "grad_norm": 3.0541458976753204, + "learning_rate": 7.672997272451764e-06, + "loss": 0.425, + "step": 4610 + }, + { + "epoch": 0.23332744122623167, + "grad_norm": 2.176856844484498, + "learning_rate": 7.667946257197698e-06, + "loss": 0.4028, + "step": 4620 + }, + { + "epoch": 0.2338324789778036, + "grad_norm": 2.308455454963439, + "learning_rate": 7.66289524194363e-06, + "loss": 0.4141, + "step": 4630 + }, + { + "epoch": 0.23433751672937553, + "grad_norm": 2.112712712332131, + "learning_rate": 7.657844226689564e-06, + "loss": 0.419, + "step": 4640 + }, + { + "epoch": 0.23484255448094746, + "grad_norm": 3.2748057827058634, + "learning_rate": 7.6527932114355e-06, + "loss": 0.4122, + "step": 4650 + }, + { + "epoch": 0.2353475922325194, + "grad_norm": 4.614165718449686, + "learning_rate": 7.647742196181434e-06, + "loss": 0.4181, + "step": 4660 + }, + { + "epoch": 0.23585262998409132, + "grad_norm": 3.4633789585985038, + "learning_rate": 7.642691180927367e-06, + "loss": 0.4134, + "step": 4670 + }, + { + "epoch": 0.23635766773566325, + "grad_norm": 2.459474386028591, + "learning_rate": 7.6376401656733e-06, + "loss": 0.4045, + "step": 4680 + }, + { + "epoch": 0.23686270548723518, + "grad_norm": 4.467572890592401, + "learning_rate": 7.632589150419235e-06, + "loss": 0.4108, + "step": 4690 + }, + { + "epoch": 0.2373677432388071, + "grad_norm": 3.2733674833799085, + "learning_rate": 7.627538135165169e-06, + "loss": 0.4247, + "step": 4700 + }, + { + "epoch": 0.23787278099037903, + "grad_norm": 2.5012350340127583, + "learning_rate": 7.622487119911103e-06, + "loss": 0.4027, + "step": 4710 + }, + { + "epoch": 0.23837781874195096, + "grad_norm": 2.918885002659697, + "learning_rate": 7.6174361046570365e-06, + "loss": 0.3961, + "step": 4720 + }, + { + "epoch": 0.2388828564935229, + "grad_norm": 2.151448240091896, + "learning_rate": 7.612385089402971e-06, + "loss": 0.397, + "step": 4730 + }, + { + "epoch": 0.23938789424509482, + "grad_norm": 3.645676075183417, + "learning_rate": 7.607334074148905e-06, + "loss": 0.4042, + "step": 4740 + }, + { + "epoch": 0.23989293199666675, + "grad_norm": 4.7934119111079365, + "learning_rate": 7.602283058894838e-06, + "loss": 0.4199, + "step": 4750 + }, + { + "epoch": 0.24039796974823868, + "grad_norm": 3.024756725193816, + "learning_rate": 7.597232043640772e-06, + "loss": 0.4214, + "step": 4760 + }, + { + "epoch": 0.2409030074998106, + "grad_norm": 3.8298473971048552, + "learning_rate": 7.592181028386706e-06, + "loss": 0.3971, + "step": 4770 + }, + { + "epoch": 0.24140804525138254, + "grad_norm": 3.6045455173768897, + "learning_rate": 7.5871300131326395e-06, + "loss": 0.41, + "step": 4780 + }, + { + "epoch": 0.24191308300295447, + "grad_norm": 2.4793807629242686, + "learning_rate": 7.5820789978785745e-06, + "loss": 0.4027, + "step": 4790 + }, + { + "epoch": 0.2424181207545264, + "grad_norm": 2.7349340937466837, + "learning_rate": 7.577027982624509e-06, + "loss": 0.4354, + "step": 4800 + }, + { + "epoch": 0.24292315850609833, + "grad_norm": 3.2738776137303436, + "learning_rate": 7.571976967370443e-06, + "loss": 0.4472, + "step": 4810 + }, + { + "epoch": 0.24342819625767026, + "grad_norm": 13.785461532124051, + "learning_rate": 7.566925952116376e-06, + "loss": 0.3969, + "step": 4820 + }, + { + "epoch": 0.2439332340092422, + "grad_norm": 3.313726734189114, + "learning_rate": 7.56187493686231e-06, + "loss": 0.4007, + "step": 4830 + }, + { + "epoch": 0.24443827176081412, + "grad_norm": 4.099815087006132, + "learning_rate": 7.556823921608244e-06, + "loss": 0.4099, + "step": 4840 + }, + { + "epoch": 0.24494330951238605, + "grad_norm": 4.723031766050033, + "learning_rate": 7.5517729063541775e-06, + "loss": 0.4204, + "step": 4850 + }, + { + "epoch": 0.24544834726395798, + "grad_norm": 3.7013488115087085, + "learning_rate": 7.546721891100112e-06, + "loss": 0.3928, + "step": 4860 + }, + { + "epoch": 0.2459533850155299, + "grad_norm": 7.683672740752602, + "learning_rate": 7.541670875846046e-06, + "loss": 0.4028, + "step": 4870 + }, + { + "epoch": 0.24645842276710184, + "grad_norm": 5.671010001370461, + "learning_rate": 7.536619860591979e-06, + "loss": 0.433, + "step": 4880 + }, + { + "epoch": 0.24696346051867377, + "grad_norm": 9.869009780517741, + "learning_rate": 7.531568845337913e-06, + "loss": 0.3989, + "step": 4890 + }, + { + "epoch": 0.2474684982702457, + "grad_norm": 5.709197913641078, + "learning_rate": 7.526517830083847e-06, + "loss": 0.4031, + "step": 4900 + }, + { + "epoch": 0.24797353602181763, + "grad_norm": 4.33540159587482, + "learning_rate": 7.521466814829781e-06, + "loss": 0.4062, + "step": 4910 + }, + { + "epoch": 0.24847857377338955, + "grad_norm": 8.109361669653662, + "learning_rate": 7.5164157995757156e-06, + "loss": 0.3972, + "step": 4920 + }, + { + "epoch": 0.24898361152496148, + "grad_norm": 13.57377976771627, + "learning_rate": 7.51136478432165e-06, + "loss": 0.4067, + "step": 4930 + }, + { + "epoch": 0.24948864927653341, + "grad_norm": 4.077593295992246, + "learning_rate": 7.506313769067584e-06, + "loss": 0.3992, + "step": 4940 + }, + { + "epoch": 0.24999368702810534, + "grad_norm": 6.370437009913983, + "learning_rate": 7.501262753813517e-06, + "loss": 0.41, + "step": 4950 + }, + { + "epoch": 0.2504987247796773, + "grad_norm": 4.917853551887297, + "learning_rate": 7.496211738559451e-06, + "loss": 0.3967, + "step": 4960 + }, + { + "epoch": 0.25100376253124923, + "grad_norm": 8.018747776559712, + "learning_rate": 7.491160723305385e-06, + "loss": 0.4174, + "step": 4970 + }, + { + "epoch": 0.25150880028282113, + "grad_norm": 5.56863245620068, + "learning_rate": 7.486109708051319e-06, + "loss": 0.4161, + "step": 4980 + }, + { + "epoch": 0.2520138380343931, + "grad_norm": 3.5246503817814405, + "learning_rate": 7.481058692797253e-06, + "loss": 0.4161, + "step": 4990 + }, + { + "epoch": 0.252518875785965, + "grad_norm": 8.805921139909751, + "learning_rate": 7.476007677543187e-06, + "loss": 0.4045, + "step": 5000 + }, + { + "epoch": 0.25302391353753695, + "grad_norm": 4.998987522917178, + "learning_rate": 7.47095666228912e-06, + "loss": 0.3859, + "step": 5010 + }, + { + "epoch": 0.25352895128910885, + "grad_norm": 4.00150580245438, + "learning_rate": 7.465905647035054e-06, + "loss": 0.4164, + "step": 5020 + }, + { + "epoch": 0.2540339890406808, + "grad_norm": 3.706340372471242, + "learning_rate": 7.460854631780988e-06, + "loss": 0.4176, + "step": 5030 + }, + { + "epoch": 0.2545390267922527, + "grad_norm": 4.998624600906844, + "learning_rate": 7.4558036165269225e-06, + "loss": 0.41, + "step": 5040 + }, + { + "epoch": 0.25504406454382467, + "grad_norm": 8.31884957399422, + "learning_rate": 7.450752601272856e-06, + "loss": 0.3998, + "step": 5050 + }, + { + "epoch": 0.25554910229539657, + "grad_norm": 3.4901102806237314, + "learning_rate": 7.445701586018791e-06, + "loss": 0.3997, + "step": 5060 + }, + { + "epoch": 0.2560541400469685, + "grad_norm": 4.645395770689467, + "learning_rate": 7.440650570764725e-06, + "loss": 0.4083, + "step": 5070 + }, + { + "epoch": 0.2565591777985404, + "grad_norm": 6.384417075428531, + "learning_rate": 7.435599555510659e-06, + "loss": 0.4092, + "step": 5080 + }, + { + "epoch": 0.2570642155501124, + "grad_norm": 2.4454349649277694, + "learning_rate": 7.430548540256592e-06, + "loss": 0.3848, + "step": 5090 + }, + { + "epoch": 0.2575692533016843, + "grad_norm": 3.558542444645485, + "learning_rate": 7.425497525002526e-06, + "loss": 0.4088, + "step": 5100 + }, + { + "epoch": 0.25807429105325624, + "grad_norm": 3.0244806698758713, + "learning_rate": 7.4204465097484605e-06, + "loss": 0.4039, + "step": 5110 + }, + { + "epoch": 0.25857932880482815, + "grad_norm": 3.8966266585093767, + "learning_rate": 7.415395494494394e-06, + "loss": 0.4235, + "step": 5120 + }, + { + "epoch": 0.2590843665564001, + "grad_norm": 5.6318294513336555, + "learning_rate": 7.410344479240328e-06, + "loss": 0.4037, + "step": 5130 + }, + { + "epoch": 0.259589404307972, + "grad_norm": 3.0998441234985714, + "learning_rate": 7.405293463986262e-06, + "loss": 0.3978, + "step": 5140 + }, + { + "epoch": 0.26009444205954396, + "grad_norm": 2.21396747445095, + "learning_rate": 7.400242448732195e-06, + "loss": 0.4145, + "step": 5150 + }, + { + "epoch": 0.26059947981111586, + "grad_norm": 4.1662815020101585, + "learning_rate": 7.3951914334781294e-06, + "loss": 0.3868, + "step": 5160 + }, + { + "epoch": 0.2611045175626878, + "grad_norm": 12.395354056627705, + "learning_rate": 7.3901404182240636e-06, + "loss": 0.4154, + "step": 5170 + }, + { + "epoch": 0.2616095553142597, + "grad_norm": 3.1142309464667712, + "learning_rate": 7.385089402969997e-06, + "loss": 0.413, + "step": 5180 + }, + { + "epoch": 0.2621145930658317, + "grad_norm": 4.513387929199787, + "learning_rate": 7.380038387715931e-06, + "loss": 0.3921, + "step": 5190 + }, + { + "epoch": 0.2626196308174036, + "grad_norm": 4.7697729405684735, + "learning_rate": 7.374987372461866e-06, + "loss": 0.4012, + "step": 5200 + }, + { + "epoch": 0.26312466856897554, + "grad_norm": 4.893393941433008, + "learning_rate": 7.3699363572078e-06, + "loss": 0.4186, + "step": 5210 + }, + { + "epoch": 0.26362970632054744, + "grad_norm": 3.831293276797052, + "learning_rate": 7.364885341953733e-06, + "loss": 0.4226, + "step": 5220 + }, + { + "epoch": 0.2641347440721194, + "grad_norm": 9.39587893739541, + "learning_rate": 7.3598343266996675e-06, + "loss": 0.4223, + "step": 5230 + }, + { + "epoch": 0.2646397818236913, + "grad_norm": 2.5508906516086913, + "learning_rate": 7.354783311445602e-06, + "loss": 0.4204, + "step": 5240 + }, + { + "epoch": 0.26514481957526326, + "grad_norm": 1.9529316193965132, + "learning_rate": 7.349732296191535e-06, + "loss": 0.4137, + "step": 5250 + }, + { + "epoch": 0.26564985732683516, + "grad_norm": 2.2836631555730147, + "learning_rate": 7.344681280937469e-06, + "loss": 0.4272, + "step": 5260 + }, + { + "epoch": 0.2661548950784071, + "grad_norm": 2.740870228644112, + "learning_rate": 7.339630265683403e-06, + "loss": 0.4042, + "step": 5270 + }, + { + "epoch": 0.266659932829979, + "grad_norm": 2.8335765045366057, + "learning_rate": 7.334579250429336e-06, + "loss": 0.4042, + "step": 5280 + }, + { + "epoch": 0.267164970581551, + "grad_norm": 6.021567811733276, + "learning_rate": 7.3295282351752705e-06, + "loss": 0.4154, + "step": 5290 + }, + { + "epoch": 0.2676700083331229, + "grad_norm": 3.8900529078388475, + "learning_rate": 7.324477219921205e-06, + "loss": 0.4066, + "step": 5300 + }, + { + "epoch": 0.26817504608469483, + "grad_norm": 4.107751019837596, + "learning_rate": 7.319426204667138e-06, + "loss": 0.4314, + "step": 5310 + }, + { + "epoch": 0.26868008383626674, + "grad_norm": 3.83489471259606, + "learning_rate": 7.314375189413072e-06, + "loss": 0.4289, + "step": 5320 + }, + { + "epoch": 0.2691851215878387, + "grad_norm": 4.1927101461017955, + "learning_rate": 7.309324174159007e-06, + "loss": 0.4104, + "step": 5330 + }, + { + "epoch": 0.2696901593394106, + "grad_norm": 4.40200271099351, + "learning_rate": 7.304273158904941e-06, + "loss": 0.4117, + "step": 5340 + }, + { + "epoch": 0.27019519709098255, + "grad_norm": 2.5719446012019347, + "learning_rate": 7.299222143650874e-06, + "loss": 0.4172, + "step": 5350 + }, + { + "epoch": 0.27070023484255445, + "grad_norm": 3.550169709846441, + "learning_rate": 7.2941711283968085e-06, + "loss": 0.3961, + "step": 5360 + }, + { + "epoch": 0.2712052725941264, + "grad_norm": 5.9366850534425994, + "learning_rate": 7.289120113142743e-06, + "loss": 0.4092, + "step": 5370 + }, + { + "epoch": 0.2717103103456983, + "grad_norm": 2.2429091940544756, + "learning_rate": 7.284069097888676e-06, + "loss": 0.3911, + "step": 5380 + }, + { + "epoch": 0.27221534809727027, + "grad_norm": 3.417557725862848, + "learning_rate": 7.27901808263461e-06, + "loss": 0.404, + "step": 5390 + }, + { + "epoch": 0.27272038584884223, + "grad_norm": 2.724964100715556, + "learning_rate": 7.273967067380544e-06, + "loss": 0.3954, + "step": 5400 + }, + { + "epoch": 0.27322542360041413, + "grad_norm": 2.856161247602667, + "learning_rate": 7.268916052126478e-06, + "loss": 0.4166, + "step": 5410 + }, + { + "epoch": 0.2737304613519861, + "grad_norm": 3.6154589918708724, + "learning_rate": 7.2638650368724116e-06, + "loss": 0.4194, + "step": 5420 + }, + { + "epoch": 0.274235499103558, + "grad_norm": 4.796628055616583, + "learning_rate": 7.258814021618346e-06, + "loss": 0.4047, + "step": 5430 + }, + { + "epoch": 0.27474053685512995, + "grad_norm": 1.8189441498839218, + "learning_rate": 7.25376300636428e-06, + "loss": 0.427, + "step": 5440 + }, + { + "epoch": 0.27524557460670185, + "grad_norm": 7.024511830423675, + "learning_rate": 7.248711991110213e-06, + "loss": 0.4046, + "step": 5450 + }, + { + "epoch": 0.2757506123582738, + "grad_norm": 3.5667517176775423, + "learning_rate": 7.243660975856147e-06, + "loss": 0.4104, + "step": 5460 + }, + { + "epoch": 0.2762556501098457, + "grad_norm": 2.9842087732377456, + "learning_rate": 7.238609960602082e-06, + "loss": 0.3865, + "step": 5470 + }, + { + "epoch": 0.27676068786141766, + "grad_norm": 3.138211814542453, + "learning_rate": 7.233558945348016e-06, + "loss": 0.421, + "step": 5480 + }, + { + "epoch": 0.27726572561298957, + "grad_norm": 3.7899763794613603, + "learning_rate": 7.22850793009395e-06, + "loss": 0.4211, + "step": 5490 + }, + { + "epoch": 0.2777707633645615, + "grad_norm": 6.302295488602417, + "learning_rate": 7.223456914839884e-06, + "loss": 0.414, + "step": 5500 + }, + { + "epoch": 0.2782758011161334, + "grad_norm": 10.35490362594059, + "learning_rate": 7.218405899585818e-06, + "loss": 0.416, + "step": 5510 + }, + { + "epoch": 0.2787808388677054, + "grad_norm": 6.310469266434071, + "learning_rate": 7.213354884331751e-06, + "loss": 0.4064, + "step": 5520 + }, + { + "epoch": 0.2792858766192773, + "grad_norm": 16.515977602019493, + "learning_rate": 7.208303869077685e-06, + "loss": 0.4079, + "step": 5530 + }, + { + "epoch": 0.27979091437084924, + "grad_norm": 4.823535989192853, + "learning_rate": 7.203252853823619e-06, + "loss": 0.4057, + "step": 5540 + }, + { + "epoch": 0.28029595212242114, + "grad_norm": 5.510784263870791, + "learning_rate": 7.198201838569553e-06, + "loss": 0.4037, + "step": 5550 + }, + { + "epoch": 0.2808009898739931, + "grad_norm": 6.358367429720183, + "learning_rate": 7.193150823315487e-06, + "loss": 0.4229, + "step": 5560 + }, + { + "epoch": 0.281306027625565, + "grad_norm": 6.411947776097414, + "learning_rate": 7.188099808061421e-06, + "loss": 0.4067, + "step": 5570 + }, + { + "epoch": 0.28181106537713696, + "grad_norm": 7.711569114064788, + "learning_rate": 7.183048792807354e-06, + "loss": 0.4033, + "step": 5580 + }, + { + "epoch": 0.28231610312870886, + "grad_norm": 7.577084197751842, + "learning_rate": 7.177997777553288e-06, + "loss": 0.4193, + "step": 5590 + }, + { + "epoch": 0.2828211408802808, + "grad_norm": 8.359668853331197, + "learning_rate": 7.172946762299223e-06, + "loss": 0.4075, + "step": 5600 + }, + { + "epoch": 0.2833261786318527, + "grad_norm": 3.151405334238333, + "learning_rate": 7.167895747045157e-06, + "loss": 0.3968, + "step": 5610 + }, + { + "epoch": 0.2838312163834247, + "grad_norm": 5.148673338465559, + "learning_rate": 7.162844731791091e-06, + "loss": 0.4084, + "step": 5620 + }, + { + "epoch": 0.2843362541349966, + "grad_norm": 4.221476129767122, + "learning_rate": 7.157793716537025e-06, + "loss": 0.4171, + "step": 5630 + }, + { + "epoch": 0.28484129188656854, + "grad_norm": 13.100880512181337, + "learning_rate": 7.152742701282959e-06, + "loss": 0.4153, + "step": 5640 + }, + { + "epoch": 0.28534632963814044, + "grad_norm": 4.5479841682511735, + "learning_rate": 7.147691686028892e-06, + "loss": 0.4008, + "step": 5650 + }, + { + "epoch": 0.2858513673897124, + "grad_norm": 12.774978173115935, + "learning_rate": 7.142640670774826e-06, + "loss": 0.4081, + "step": 5660 + }, + { + "epoch": 0.2863564051412843, + "grad_norm": 4.74811206992399, + "learning_rate": 7.1375896555207604e-06, + "loss": 0.4111, + "step": 5670 + }, + { + "epoch": 0.28686144289285626, + "grad_norm": 6.322992937888857, + "learning_rate": 7.132538640266694e-06, + "loss": 0.4147, + "step": 5680 + }, + { + "epoch": 0.28736648064442816, + "grad_norm": 2.3879730307069913, + "learning_rate": 7.127487625012628e-06, + "loss": 0.413, + "step": 5690 + }, + { + "epoch": 0.2878715183960001, + "grad_norm": 9.78190300595279, + "learning_rate": 7.122436609758562e-06, + "loss": 0.4082, + "step": 5700 + }, + { + "epoch": 0.288376556147572, + "grad_norm": 2.9229120102524546, + "learning_rate": 7.117385594504495e-06, + "loss": 0.3991, + "step": 5710 + }, + { + "epoch": 0.288881593899144, + "grad_norm": 3.0629100552949367, + "learning_rate": 7.112334579250429e-06, + "loss": 0.4219, + "step": 5720 + }, + { + "epoch": 0.2893866316507159, + "grad_norm": 4.461418623396228, + "learning_rate": 7.1072835639963635e-06, + "loss": 0.3935, + "step": 5730 + }, + { + "epoch": 0.28989166940228783, + "grad_norm": 4.158346788895872, + "learning_rate": 7.1022325487422985e-06, + "loss": 0.4055, + "step": 5740 + }, + { + "epoch": 0.29039670715385973, + "grad_norm": 4.423297022005039, + "learning_rate": 7.097181533488232e-06, + "loss": 0.4056, + "step": 5750 + }, + { + "epoch": 0.2909017449054317, + "grad_norm": 9.748691220694795, + "learning_rate": 7.092130518234166e-06, + "loss": 0.4163, + "step": 5760 + }, + { + "epoch": 0.2914067826570036, + "grad_norm": 5.583094724720717, + "learning_rate": 7.0870795029801e-06, + "loss": 0.404, + "step": 5770 + }, + { + "epoch": 0.29191182040857555, + "grad_norm": 5.4361132596130375, + "learning_rate": 7.082028487726034e-06, + "loss": 0.4077, + "step": 5780 + }, + { + "epoch": 0.29241685816014745, + "grad_norm": 5.4795736369075225, + "learning_rate": 7.076977472471967e-06, + "loss": 0.385, + "step": 5790 + }, + { + "epoch": 0.2929218959117194, + "grad_norm": 6.295238304978109, + "learning_rate": 7.0719264572179015e-06, + "loss": 0.3865, + "step": 5800 + }, + { + "epoch": 0.2934269336632913, + "grad_norm": 6.992767843962523, + "learning_rate": 7.066875441963836e-06, + "loss": 0.4016, + "step": 5810 + }, + { + "epoch": 0.29393197141486327, + "grad_norm": 2.2081931951077762, + "learning_rate": 7.061824426709769e-06, + "loss": 0.4177, + "step": 5820 + }, + { + "epoch": 0.29443700916643517, + "grad_norm": 2.6034285969774755, + "learning_rate": 7.056773411455703e-06, + "loss": 0.4259, + "step": 5830 + }, + { + "epoch": 0.29494204691800713, + "grad_norm": 3.409449485576021, + "learning_rate": 7.051722396201637e-06, + "loss": 0.4033, + "step": 5840 + }, + { + "epoch": 0.29544708466957903, + "grad_norm": 4.0667424482713415, + "learning_rate": 7.04667138094757e-06, + "loss": 0.3996, + "step": 5850 + }, + { + "epoch": 0.295952122421151, + "grad_norm": 10.124949019281052, + "learning_rate": 7.0416203656935045e-06, + "loss": 0.4006, + "step": 5860 + }, + { + "epoch": 0.2964571601727229, + "grad_norm": 14.286058521059143, + "learning_rate": 7.0365693504394395e-06, + "loss": 0.4095, + "step": 5870 + }, + { + "epoch": 0.29696219792429485, + "grad_norm": 6.367565010187276, + "learning_rate": 7.031518335185374e-06, + "loss": 0.3838, + "step": 5880 + }, + { + "epoch": 0.29746723567586675, + "grad_norm": 5.583399625565326, + "learning_rate": 7.026467319931307e-06, + "loss": 0.4117, + "step": 5890 + }, + { + "epoch": 0.2979722734274387, + "grad_norm": 3.0731137084379534, + "learning_rate": 7.021416304677241e-06, + "loss": 0.4236, + "step": 5900 + }, + { + "epoch": 0.2984773111790106, + "grad_norm": 3.0663411110006646, + "learning_rate": 7.016365289423175e-06, + "loss": 0.4053, + "step": 5910 + }, + { + "epoch": 0.29898234893058256, + "grad_norm": 2.750574161212685, + "learning_rate": 7.0113142741691084e-06, + "loss": 0.4068, + "step": 5920 + }, + { + "epoch": 0.29948738668215447, + "grad_norm": 6.009688968198948, + "learning_rate": 7.0062632589150426e-06, + "loss": 0.3836, + "step": 5930 + }, + { + "epoch": 0.2999924244337264, + "grad_norm": 2.9753632251149047, + "learning_rate": 7.001212243660977e-06, + "loss": 0.4004, + "step": 5940 + }, + { + "epoch": 0.3004974621852983, + "grad_norm": 4.208610201729938, + "learning_rate": 6.99616122840691e-06, + "loss": 0.4076, + "step": 5950 + }, + { + "epoch": 0.3010024999368703, + "grad_norm": 4.7205159244344435, + "learning_rate": 6.991110213152844e-06, + "loss": 0.4169, + "step": 5960 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 5.1888963188216914, + "learning_rate": 6.986059197898778e-06, + "loss": 0.4131, + "step": 5970 + }, + { + "epoch": 0.30201257544001414, + "grad_norm": 5.94229402465264, + "learning_rate": 6.9810081826447115e-06, + "loss": 0.3973, + "step": 5980 + }, + { + "epoch": 0.30251761319158604, + "grad_norm": 5.525705257458754, + "learning_rate": 6.975957167390646e-06, + "loss": 0.4055, + "step": 5990 + }, + { + "epoch": 0.303022650943158, + "grad_norm": 12.459125129727365, + "learning_rate": 6.97090615213658e-06, + "loss": 0.3971, + "step": 6000 + }, + { + "epoch": 0.30352768869472996, + "grad_norm": 31.656872346789477, + "learning_rate": 6.965855136882515e-06, + "loss": 0.407, + "step": 6010 + }, + { + "epoch": 0.30403272644630186, + "grad_norm": 9.639606695045442, + "learning_rate": 6.960804121628448e-06, + "loss": 0.3863, + "step": 6020 + }, + { + "epoch": 0.3045377641978738, + "grad_norm": 7.035557345160716, + "learning_rate": 6.955753106374382e-06, + "loss": 0.3869, + "step": 6030 + }, + { + "epoch": 0.3050428019494457, + "grad_norm": 4.9371125965383165, + "learning_rate": 6.950702091120316e-06, + "loss": 0.3976, + "step": 6040 + }, + { + "epoch": 0.3055478397010177, + "grad_norm": 3.933897990218482, + "learning_rate": 6.9456510758662495e-06, + "loss": 0.3897, + "step": 6050 + }, + { + "epoch": 0.3060528774525896, + "grad_norm": 3.1641621883370337, + "learning_rate": 6.940600060612184e-06, + "loss": 0.3907, + "step": 6060 + }, + { + "epoch": 0.30655791520416154, + "grad_norm": 8.063219971247296, + "learning_rate": 6.935549045358118e-06, + "loss": 0.4031, + "step": 6070 + }, + { + "epoch": 0.30706295295573344, + "grad_norm": 5.068712111699617, + "learning_rate": 6.930498030104051e-06, + "loss": 0.3669, + "step": 6080 + }, + { + "epoch": 0.3075679907073054, + "grad_norm": 2.659761406451919, + "learning_rate": 6.925447014849985e-06, + "loss": 0.4083, + "step": 6090 + }, + { + "epoch": 0.3080730284588773, + "grad_norm": 2.711900853734696, + "learning_rate": 6.920395999595919e-06, + "loss": 0.3857, + "step": 6100 + }, + { + "epoch": 0.30857806621044925, + "grad_norm": 3.5691023416952015, + "learning_rate": 6.9153449843418526e-06, + "loss": 0.4075, + "step": 6110 + }, + { + "epoch": 0.30908310396202116, + "grad_norm": 9.44528498007685, + "learning_rate": 6.910293969087787e-06, + "loss": 0.3907, + "step": 6120 + }, + { + "epoch": 0.3095881417135931, + "grad_norm": 6.630952946299743, + "learning_rate": 6.905242953833721e-06, + "loss": 0.4166, + "step": 6130 + }, + { + "epoch": 0.310093179465165, + "grad_norm": 2.240750009167658, + "learning_rate": 6.900191938579655e-06, + "loss": 0.4103, + "step": 6140 + }, + { + "epoch": 0.31059821721673697, + "grad_norm": 3.4596094945674865, + "learning_rate": 6.89514092332559e-06, + "loss": 0.3979, + "step": 6150 + }, + { + "epoch": 0.3111032549683089, + "grad_norm": 3.4198773372663984, + "learning_rate": 6.890089908071523e-06, + "loss": 0.4131, + "step": 6160 + }, + { + "epoch": 0.31160829271988083, + "grad_norm": 2.6133323586176163, + "learning_rate": 6.885038892817457e-06, + "loss": 0.3966, + "step": 6170 + }, + { + "epoch": 0.31211333047145273, + "grad_norm": 2.7898977693699667, + "learning_rate": 6.879987877563391e-06, + "loss": 0.4005, + "step": 6180 + }, + { + "epoch": 0.3126183682230247, + "grad_norm": 15.014756094180136, + "learning_rate": 6.874936862309325e-06, + "loss": 0.417, + "step": 6190 + }, + { + "epoch": 0.3131234059745966, + "grad_norm": 3.877839859697261, + "learning_rate": 6.869885847055259e-06, + "loss": 0.3967, + "step": 6200 + }, + { + "epoch": 0.31362844372616855, + "grad_norm": 3.642380487638742, + "learning_rate": 6.864834831801193e-06, + "loss": 0.3872, + "step": 6210 + }, + { + "epoch": 0.31413348147774045, + "grad_norm": 6.1392539662766055, + "learning_rate": 6.859783816547126e-06, + "loss": 0.3969, + "step": 6220 + }, + { + "epoch": 0.3146385192293124, + "grad_norm": 2.8146690736323206, + "learning_rate": 6.85473280129306e-06, + "loss": 0.4129, + "step": 6230 + }, + { + "epoch": 0.3151435569808843, + "grad_norm": 4.058997537956916, + "learning_rate": 6.8496817860389945e-06, + "loss": 0.4015, + "step": 6240 + }, + { + "epoch": 0.31564859473245627, + "grad_norm": 5.742551004803781, + "learning_rate": 6.844630770784928e-06, + "loss": 0.3953, + "step": 6250 + }, + { + "epoch": 0.31615363248402817, + "grad_norm": 15.598787015706185, + "learning_rate": 6.839579755530862e-06, + "loss": 0.4031, + "step": 6260 + }, + { + "epoch": 0.3166586702356001, + "grad_norm": 5.020417282923178, + "learning_rate": 6.834528740276796e-06, + "loss": 0.4035, + "step": 6270 + }, + { + "epoch": 0.317163707987172, + "grad_norm": 4.667007823216816, + "learning_rate": 6.829477725022731e-06, + "loss": 0.406, + "step": 6280 + }, + { + "epoch": 0.317668745738744, + "grad_norm": 7.463699952057543, + "learning_rate": 6.824426709768664e-06, + "loss": 0.4108, + "step": 6290 + }, + { + "epoch": 0.3181737834903159, + "grad_norm": 4.262657782964267, + "learning_rate": 6.819375694514598e-06, + "loss": 0.3974, + "step": 6300 + }, + { + "epoch": 0.31867882124188784, + "grad_norm": 3.3264387869397862, + "learning_rate": 6.8143246792605325e-06, + "loss": 0.3952, + "step": 6310 + }, + { + "epoch": 0.31918385899345975, + "grad_norm": 12.90071142768197, + "learning_rate": 6.809273664006466e-06, + "loss": 0.3821, + "step": 6320 + }, + { + "epoch": 0.3196888967450317, + "grad_norm": 6.103897285160824, + "learning_rate": 6.8042226487524e-06, + "loss": 0.387, + "step": 6330 + }, + { + "epoch": 0.3201939344966036, + "grad_norm": 5.664255551154975, + "learning_rate": 6.799171633498334e-06, + "loss": 0.3939, + "step": 6340 + }, + { + "epoch": 0.32069897224817556, + "grad_norm": 3.9983006490372746, + "learning_rate": 6.794120618244267e-06, + "loss": 0.4026, + "step": 6350 + }, + { + "epoch": 0.32120400999974746, + "grad_norm": 4.502616368454671, + "learning_rate": 6.789069602990201e-06, + "loss": 0.3992, + "step": 6360 + }, + { + "epoch": 0.3217090477513194, + "grad_norm": 6.834315364764452, + "learning_rate": 6.7840185877361355e-06, + "loss": 0.4131, + "step": 6370 + }, + { + "epoch": 0.3222140855028913, + "grad_norm": 6.26718953681543, + "learning_rate": 6.778967572482069e-06, + "loss": 0.4066, + "step": 6380 + }, + { + "epoch": 0.3227191232544633, + "grad_norm": 7.341432352989431, + "learning_rate": 6.773916557228003e-06, + "loss": 0.3883, + "step": 6390 + }, + { + "epoch": 0.3232241610060352, + "grad_norm": 11.668272167662552, + "learning_rate": 6.768865541973937e-06, + "loss": 0.415, + "step": 6400 + }, + { + "epoch": 0.32372919875760714, + "grad_norm": 18.632730593364624, + "learning_rate": 6.76381452671987e-06, + "loss": 0.4084, + "step": 6410 + }, + { + "epoch": 0.32423423650917904, + "grad_norm": 9.46350511552533, + "learning_rate": 6.758763511465805e-06, + "loss": 0.3807, + "step": 6420 + }, + { + "epoch": 0.324739274260751, + "grad_norm": 4.410820137502103, + "learning_rate": 6.7537124962117394e-06, + "loss": 0.3844, + "step": 6430 + }, + { + "epoch": 0.3252443120123229, + "grad_norm": 9.07873183264923, + "learning_rate": 6.7486614809576736e-06, + "loss": 0.3902, + "step": 6440 + }, + { + "epoch": 0.32574934976389486, + "grad_norm": 17.65425817976251, + "learning_rate": 6.743610465703607e-06, + "loss": 0.3902, + "step": 6450 + }, + { + "epoch": 0.32625438751546676, + "grad_norm": 4.109418188768749, + "learning_rate": 6.738559450449541e-06, + "loss": 0.391, + "step": 6460 + }, + { + "epoch": 0.3267594252670387, + "grad_norm": 4.071871347874285, + "learning_rate": 6.733508435195475e-06, + "loss": 0.4067, + "step": 6470 + }, + { + "epoch": 0.3272644630186106, + "grad_norm": 4.833923669112235, + "learning_rate": 6.728457419941408e-06, + "loss": 0.3869, + "step": 6480 + }, + { + "epoch": 0.3277695007701826, + "grad_norm": 5.226542205736505, + "learning_rate": 6.7234064046873425e-06, + "loss": 0.3983, + "step": 6490 + }, + { + "epoch": 0.3282745385217545, + "grad_norm": 14.955066893117705, + "learning_rate": 6.718355389433277e-06, + "loss": 0.3892, + "step": 6500 + }, + { + "epoch": 0.32877957627332643, + "grad_norm": 9.826109172231558, + "learning_rate": 6.713304374179211e-06, + "loss": 0.3849, + "step": 6510 + }, + { + "epoch": 0.32928461402489834, + "grad_norm": 10.365177617347678, + "learning_rate": 6.708253358925144e-06, + "loss": 0.3909, + "step": 6520 + }, + { + "epoch": 0.3297896517764703, + "grad_norm": 3.902511753305759, + "learning_rate": 6.703202343671078e-06, + "loss": 0.3894, + "step": 6530 + }, + { + "epoch": 0.3302946895280422, + "grad_norm": 2.5389813606137, + "learning_rate": 6.698151328417012e-06, + "loss": 0.3947, + "step": 6540 + }, + { + "epoch": 0.33079972727961415, + "grad_norm": 3.076985557483421, + "learning_rate": 6.693100313162947e-06, + "loss": 0.385, + "step": 6550 + }, + { + "epoch": 0.33130476503118605, + "grad_norm": 3.9649750433361164, + "learning_rate": 6.6880492979088805e-06, + "loss": 0.3904, + "step": 6560 + }, + { + "epoch": 0.331809802782758, + "grad_norm": 11.994320837772984, + "learning_rate": 6.682998282654815e-06, + "loss": 0.3983, + "step": 6570 + }, + { + "epoch": 0.3323148405343299, + "grad_norm": 7.214021116152274, + "learning_rate": 6.677947267400749e-06, + "loss": 0.3978, + "step": 6580 + }, + { + "epoch": 0.33281987828590187, + "grad_norm": 2.6026929269364727, + "learning_rate": 6.672896252146682e-06, + "loss": 0.3775, + "step": 6590 + }, + { + "epoch": 0.33332491603747383, + "grad_norm": 13.22492686543516, + "learning_rate": 6.667845236892616e-06, + "loss": 0.3872, + "step": 6600 + }, + { + "epoch": 0.33382995378904573, + "grad_norm": 3.4887621572574443, + "learning_rate": 6.66279422163855e-06, + "loss": 0.3961, + "step": 6610 + }, + { + "epoch": 0.3343349915406177, + "grad_norm": 4.75022907961459, + "learning_rate": 6.6577432063844835e-06, + "loss": 0.4064, + "step": 6620 + }, + { + "epoch": 0.3348400292921896, + "grad_norm": 3.0134556897490876, + "learning_rate": 6.652692191130418e-06, + "loss": 0.4082, + "step": 6630 + }, + { + "epoch": 0.33534506704376155, + "grad_norm": 6.134839516437559, + "learning_rate": 6.647641175876352e-06, + "loss": 0.3807, + "step": 6640 + }, + { + "epoch": 0.33585010479533345, + "grad_norm": 2.967962385112729, + "learning_rate": 6.642590160622285e-06, + "loss": 0.389, + "step": 6650 + }, + { + "epoch": 0.3363551425469054, + "grad_norm": 2.530280284171402, + "learning_rate": 6.637539145368219e-06, + "loss": 0.3966, + "step": 6660 + }, + { + "epoch": 0.3368601802984773, + "grad_norm": 5.8276867252853455, + "learning_rate": 6.632488130114153e-06, + "loss": 0.3899, + "step": 6670 + }, + { + "epoch": 0.33736521805004926, + "grad_norm": 11.32729799857443, + "learning_rate": 6.627437114860087e-06, + "loss": 0.3977, + "step": 6680 + }, + { + "epoch": 0.33787025580162117, + "grad_norm": 4.636057747530336, + "learning_rate": 6.6223860996060216e-06, + "loss": 0.3879, + "step": 6690 + }, + { + "epoch": 0.3383752935531931, + "grad_norm": 3.644150608144144, + "learning_rate": 6.617335084351956e-06, + "loss": 0.4062, + "step": 6700 + }, + { + "epoch": 0.338880331304765, + "grad_norm": 9.355537955116619, + "learning_rate": 6.61228406909789e-06, + "loss": 0.3866, + "step": 6710 + }, + { + "epoch": 0.339385369056337, + "grad_norm": 2.4262554363736517, + "learning_rate": 6.607233053843823e-06, + "loss": 0.4244, + "step": 6720 + }, + { + "epoch": 0.3398904068079089, + "grad_norm": 4.581445110742895, + "learning_rate": 6.602182038589757e-06, + "loss": 0.3843, + "step": 6730 + }, + { + "epoch": 0.34039544455948084, + "grad_norm": 2.4362996437893902, + "learning_rate": 6.597131023335691e-06, + "loss": 0.3948, + "step": 6740 + }, + { + "epoch": 0.34090048231105274, + "grad_norm": 4.360969619561423, + "learning_rate": 6.592080008081625e-06, + "loss": 0.3861, + "step": 6750 + }, + { + "epoch": 0.3414055200626247, + "grad_norm": 3.2345867404100423, + "learning_rate": 6.587028992827559e-06, + "loss": 0.422, + "step": 6760 + }, + { + "epoch": 0.3419105578141966, + "grad_norm": 5.310935386991515, + "learning_rate": 6.581977977573493e-06, + "loss": 0.4137, + "step": 6770 + }, + { + "epoch": 0.34241559556576856, + "grad_norm": 10.95834322260054, + "learning_rate": 6.576926962319426e-06, + "loss": 0.4002, + "step": 6780 + }, + { + "epoch": 0.34292063331734046, + "grad_norm": 7.178923820806059, + "learning_rate": 6.57187594706536e-06, + "loss": 0.3771, + "step": 6790 + }, + { + "epoch": 0.3434256710689124, + "grad_norm": 16.529256071560802, + "learning_rate": 6.566824931811294e-06, + "loss": 0.4129, + "step": 6800 + }, + { + "epoch": 0.3439307088204843, + "grad_norm": 3.673940050614789, + "learning_rate": 6.561773916557228e-06, + "loss": 0.4034, + "step": 6810 + }, + { + "epoch": 0.3444357465720563, + "grad_norm": 2.9280965882270404, + "learning_rate": 6.556722901303163e-06, + "loss": 0.3892, + "step": 6820 + }, + { + "epoch": 0.3449407843236282, + "grad_norm": 93.05414347131021, + "learning_rate": 6.551671886049097e-06, + "loss": 0.4061, + "step": 6830 + }, + { + "epoch": 0.34544582207520014, + "grad_norm": 4.709615772022943, + "learning_rate": 6.546620870795031e-06, + "loss": 0.3948, + "step": 6840 + }, + { + "epoch": 0.34595085982677204, + "grad_norm": 4.852654627656968, + "learning_rate": 6.541569855540965e-06, + "loss": 0.3938, + "step": 6850 + }, + { + "epoch": 0.346455897578344, + "grad_norm": 4.816390805815548, + "learning_rate": 6.536518840286898e-06, + "loss": 0.3792, + "step": 6860 + }, + { + "epoch": 0.3469609353299159, + "grad_norm": 3.6013276651698374, + "learning_rate": 6.531467825032832e-06, + "loss": 0.4012, + "step": 6870 + }, + { + "epoch": 0.34746597308148786, + "grad_norm": 3.4921009820598576, + "learning_rate": 6.5264168097787665e-06, + "loss": 0.4083, + "step": 6880 + }, + { + "epoch": 0.34797101083305976, + "grad_norm": 6.61814343879125, + "learning_rate": 6.5213657945247e-06, + "loss": 0.4049, + "step": 6890 + }, + { + "epoch": 0.3484760485846317, + "grad_norm": 4.052623448201924, + "learning_rate": 6.516314779270634e-06, + "loss": 0.388, + "step": 6900 + }, + { + "epoch": 0.3489810863362036, + "grad_norm": 3.63955593340056, + "learning_rate": 6.511263764016568e-06, + "loss": 0.4061, + "step": 6910 + }, + { + "epoch": 0.3494861240877756, + "grad_norm": 4.150263872578294, + "learning_rate": 6.506212748762501e-06, + "loss": 0.4054, + "step": 6920 + }, + { + "epoch": 0.3499911618393475, + "grad_norm": 7.266970150410544, + "learning_rate": 6.5011617335084354e-06, + "loss": 0.3793, + "step": 6930 + }, + { + "epoch": 0.35049619959091943, + "grad_norm": 7.712101909778269, + "learning_rate": 6.4961107182543696e-06, + "loss": 0.3994, + "step": 6940 + }, + { + "epoch": 0.35100123734249133, + "grad_norm": 5.509148276281905, + "learning_rate": 6.491059703000303e-06, + "loss": 0.3781, + "step": 6950 + }, + { + "epoch": 0.3515062750940633, + "grad_norm": 2.589095673859192, + "learning_rate": 6.486008687746238e-06, + "loss": 0.3978, + "step": 6960 + }, + { + "epoch": 0.3520113128456352, + "grad_norm": 4.122842581672093, + "learning_rate": 6.480957672492172e-06, + "loss": 0.4009, + "step": 6970 + }, + { + "epoch": 0.35251635059720715, + "grad_norm": 4.1842800217172265, + "learning_rate": 6.475906657238106e-06, + "loss": 0.4029, + "step": 6980 + }, + { + "epoch": 0.35302138834877905, + "grad_norm": 7.398626485810365, + "learning_rate": 6.470855641984039e-06, + "loss": 0.4078, + "step": 6990 + }, + { + "epoch": 0.353526426100351, + "grad_norm": 5.396774144943605, + "learning_rate": 6.4658046267299735e-06, + "loss": 0.4059, + "step": 7000 + }, + { + "epoch": 0.3540314638519229, + "grad_norm": 13.281878929001232, + "learning_rate": 6.460753611475908e-06, + "loss": 0.4024, + "step": 7010 + }, + { + "epoch": 0.35453650160349487, + "grad_norm": 4.283305275547664, + "learning_rate": 6.455702596221841e-06, + "loss": 0.4108, + "step": 7020 + }, + { + "epoch": 0.35504153935506677, + "grad_norm": 3.8392385244226794, + "learning_rate": 6.450651580967775e-06, + "loss": 0.4227, + "step": 7030 + }, + { + "epoch": 0.35554657710663873, + "grad_norm": 5.012629063346216, + "learning_rate": 6.445600565713709e-06, + "loss": 0.4071, + "step": 7040 + }, + { + "epoch": 0.35605161485821063, + "grad_norm": 13.322782620701876, + "learning_rate": 6.440549550459642e-06, + "loss": 0.3926, + "step": 7050 + }, + { + "epoch": 0.3565566526097826, + "grad_norm": 5.149647425656023, + "learning_rate": 6.4354985352055765e-06, + "loss": 0.4017, + "step": 7060 + }, + { + "epoch": 0.3570616903613545, + "grad_norm": 7.5875779832186065, + "learning_rate": 6.430447519951511e-06, + "loss": 0.3867, + "step": 7070 + }, + { + "epoch": 0.35756672811292645, + "grad_norm": 3.433655269776608, + "learning_rate": 6.425396504697444e-06, + "loss": 0.3788, + "step": 7080 + }, + { + "epoch": 0.35807176586449835, + "grad_norm": 4.332715392495331, + "learning_rate": 6.420345489443378e-06, + "loss": 0.3672, + "step": 7090 + }, + { + "epoch": 0.3585768036160703, + "grad_norm": 8.825136973366781, + "learning_rate": 6.415294474189313e-06, + "loss": 0.391, + "step": 7100 + }, + { + "epoch": 0.3590818413676422, + "grad_norm": 4.896756858317604, + "learning_rate": 6.410243458935247e-06, + "loss": 0.3981, + "step": 7110 + }, + { + "epoch": 0.35958687911921416, + "grad_norm": 3.0181643910861595, + "learning_rate": 6.40519244368118e-06, + "loss": 0.3998, + "step": 7120 + }, + { + "epoch": 0.36009191687078607, + "grad_norm": 2.5429084560847346, + "learning_rate": 6.4001414284271145e-06, + "loss": 0.3973, + "step": 7130 + }, + { + "epoch": 0.360596954622358, + "grad_norm": 4.518411589704886, + "learning_rate": 6.395090413173049e-06, + "loss": 0.3921, + "step": 7140 + }, + { + "epoch": 0.3611019923739299, + "grad_norm": 2.520647494438761, + "learning_rate": 6.390039397918982e-06, + "loss": 0.3838, + "step": 7150 + }, + { + "epoch": 0.3616070301255019, + "grad_norm": 5.45309379468731, + "learning_rate": 6.384988382664916e-06, + "loss": 0.3966, + "step": 7160 + }, + { + "epoch": 0.3621120678770738, + "grad_norm": 5.822220380763848, + "learning_rate": 6.37993736741085e-06, + "loss": 0.4041, + "step": 7170 + }, + { + "epoch": 0.36261710562864574, + "grad_norm": 4.920376454170573, + "learning_rate": 6.3748863521567835e-06, + "loss": 0.3994, + "step": 7180 + }, + { + "epoch": 0.3631221433802177, + "grad_norm": 2.5736259947121707, + "learning_rate": 6.369835336902718e-06, + "loss": 0.3944, + "step": 7190 + }, + { + "epoch": 0.3636271811317896, + "grad_norm": 2.749872482951078, + "learning_rate": 6.364784321648652e-06, + "loss": 0.3985, + "step": 7200 + }, + { + "epoch": 0.36413221888336156, + "grad_norm": 2.5194888325972267, + "learning_rate": 6.359733306394586e-06, + "loss": 0.3939, + "step": 7210 + }, + { + "epoch": 0.36463725663493346, + "grad_norm": 3.574666468822497, + "learning_rate": 6.354682291140519e-06, + "loss": 0.4234, + "step": 7220 + }, + { + "epoch": 0.3651422943865054, + "grad_norm": 3.0608012516572325, + "learning_rate": 6.349631275886454e-06, + "loss": 0.3894, + "step": 7230 + }, + { + "epoch": 0.3656473321380773, + "grad_norm": 2.888576087403178, + "learning_rate": 6.344580260632388e-06, + "loss": 0.4149, + "step": 7240 + }, + { + "epoch": 0.3661523698896493, + "grad_norm": 2.6426553837219378, + "learning_rate": 6.339529245378322e-06, + "loss": 0.3788, + "step": 7250 + }, + { + "epoch": 0.3666574076412212, + "grad_norm": 8.704787479012706, + "learning_rate": 6.334478230124256e-06, + "loss": 0.4047, + "step": 7260 + }, + { + "epoch": 0.36716244539279314, + "grad_norm": 2.497691163861801, + "learning_rate": 6.32942721487019e-06, + "loss": 0.3959, + "step": 7270 + }, + { + "epoch": 0.36766748314436504, + "grad_norm": 2.8835395006733306, + "learning_rate": 6.324376199616124e-06, + "loss": 0.3964, + "step": 7280 + }, + { + "epoch": 0.368172520895937, + "grad_norm": 4.011809451507155, + "learning_rate": 6.319325184362057e-06, + "loss": 0.4196, + "step": 7290 + }, + { + "epoch": 0.3686775586475089, + "grad_norm": 9.138568921825629, + "learning_rate": 6.314274169107991e-06, + "loss": 0.4075, + "step": 7300 + }, + { + "epoch": 0.36918259639908085, + "grad_norm": 2.6137236844527676, + "learning_rate": 6.309223153853925e-06, + "loss": 0.3967, + "step": 7310 + }, + { + "epoch": 0.36968763415065276, + "grad_norm": 5.521296938399183, + "learning_rate": 6.304172138599859e-06, + "loss": 0.4005, + "step": 7320 + }, + { + "epoch": 0.3701926719022247, + "grad_norm": 7.608318289570054, + "learning_rate": 6.299121123345793e-06, + "loss": 0.4103, + "step": 7330 + }, + { + "epoch": 0.3706977096537966, + "grad_norm": 3.952351490468356, + "learning_rate": 6.294070108091727e-06, + "loss": 0.3955, + "step": 7340 + }, + { + "epoch": 0.37120274740536857, + "grad_norm": 3.2275756395096367, + "learning_rate": 6.28901909283766e-06, + "loss": 0.3691, + "step": 7350 + }, + { + "epoch": 0.3717077851569405, + "grad_norm": 5.1929124366289185, + "learning_rate": 6.283968077583594e-06, + "loss": 0.4109, + "step": 7360 + }, + { + "epoch": 0.37221282290851243, + "grad_norm": 3.7102445761239147, + "learning_rate": 6.278917062329529e-06, + "loss": 0.3855, + "step": 7370 + }, + { + "epoch": 0.37271786066008433, + "grad_norm": 4.102719494934126, + "learning_rate": 6.273866047075463e-06, + "loss": 0.4085, + "step": 7380 + }, + { + "epoch": 0.3732228984116563, + "grad_norm": 2.882775662800612, + "learning_rate": 6.268815031821397e-06, + "loss": 0.3949, + "step": 7390 + }, + { + "epoch": 0.3737279361632282, + "grad_norm": 3.847808178042115, + "learning_rate": 6.263764016567331e-06, + "loss": 0.3893, + "step": 7400 + }, + { + "epoch": 0.37423297391480015, + "grad_norm": 11.984324118193372, + "learning_rate": 6.258713001313265e-06, + "loss": 0.376, + "step": 7410 + }, + { + "epoch": 0.37473801166637205, + "grad_norm": 3.3001953512413897, + "learning_rate": 6.253661986059198e-06, + "loss": 0.3823, + "step": 7420 + }, + { + "epoch": 0.375243049417944, + "grad_norm": 5.373513774406177, + "learning_rate": 6.248610970805132e-06, + "loss": 0.4099, + "step": 7430 + }, + { + "epoch": 0.3757480871695159, + "grad_norm": 2.1585330518152492, + "learning_rate": 6.2435599555510664e-06, + "loss": 0.3977, + "step": 7440 + }, + { + "epoch": 0.37625312492108787, + "grad_norm": 5.400683846446516, + "learning_rate": 6.238508940297e-06, + "loss": 0.3917, + "step": 7450 + }, + { + "epoch": 0.37675816267265977, + "grad_norm": 11.58546846012599, + "learning_rate": 6.233457925042934e-06, + "loss": 0.4104, + "step": 7460 + }, + { + "epoch": 0.3772632004242317, + "grad_norm": 6.158451538704526, + "learning_rate": 6.228406909788868e-06, + "loss": 0.3817, + "step": 7470 + }, + { + "epoch": 0.37776823817580363, + "grad_norm": 3.220415459261749, + "learning_rate": 6.223355894534801e-06, + "loss": 0.4143, + "step": 7480 + }, + { + "epoch": 0.3782732759273756, + "grad_norm": 6.483927128344251, + "learning_rate": 6.218304879280735e-06, + "loss": 0.4009, + "step": 7490 + }, + { + "epoch": 0.3787783136789475, + "grad_norm": 3.1374527379369628, + "learning_rate": 6.21325386402667e-06, + "loss": 0.3825, + "step": 7500 + }, + { + "epoch": 0.37928335143051944, + "grad_norm": 1.9293391844723216, + "learning_rate": 6.2082028487726045e-06, + "loss": 0.3986, + "step": 7510 + }, + { + "epoch": 0.37978838918209135, + "grad_norm": 2.8612302362457704, + "learning_rate": 6.203151833518538e-06, + "loss": 0.4093, + "step": 7520 + }, + { + "epoch": 0.3802934269336633, + "grad_norm": 27.52073154576333, + "learning_rate": 6.198100818264472e-06, + "loss": 0.41, + "step": 7530 + }, + { + "epoch": 0.3807984646852352, + "grad_norm": 18.495429795467157, + "learning_rate": 6.193049803010406e-06, + "loss": 0.3918, + "step": 7540 + }, + { + "epoch": 0.38130350243680716, + "grad_norm": 1.9149364107527063, + "learning_rate": 6.187998787756339e-06, + "loss": 0.4059, + "step": 7550 + }, + { + "epoch": 0.38180854018837906, + "grad_norm": 3.265575132967205, + "learning_rate": 6.182947772502273e-06, + "loss": 0.3761, + "step": 7560 + }, + { + "epoch": 0.382313577939951, + "grad_norm": 2.9358960635943485, + "learning_rate": 6.1778967572482075e-06, + "loss": 0.3868, + "step": 7570 + }, + { + "epoch": 0.3828186156915229, + "grad_norm": 2.7564424046901324, + "learning_rate": 6.172845741994142e-06, + "loss": 0.4058, + "step": 7580 + }, + { + "epoch": 0.3833236534430949, + "grad_norm": 2.2542661022048374, + "learning_rate": 6.167794726740075e-06, + "loss": 0.3878, + "step": 7590 + }, + { + "epoch": 0.3838286911946668, + "grad_norm": 3.185426579737794, + "learning_rate": 6.162743711486009e-06, + "loss": 0.399, + "step": 7600 + }, + { + "epoch": 0.38433372894623874, + "grad_norm": 3.3739552822928665, + "learning_rate": 6.157692696231943e-06, + "loss": 0.3915, + "step": 7610 + }, + { + "epoch": 0.38483876669781064, + "grad_norm": 6.412123283698192, + "learning_rate": 6.1526416809778764e-06, + "loss": 0.3815, + "step": 7620 + }, + { + "epoch": 0.3853438044493826, + "grad_norm": 5.5629296211963215, + "learning_rate": 6.1475906657238105e-06, + "loss": 0.3794, + "step": 7630 + }, + { + "epoch": 0.3858488422009545, + "grad_norm": 3.130873953540236, + "learning_rate": 6.1425396504697455e-06, + "loss": 0.3971, + "step": 7640 + }, + { + "epoch": 0.38635387995252646, + "grad_norm": 2.92441715259481, + "learning_rate": 6.13748863521568e-06, + "loss": 0.396, + "step": 7650 + }, + { + "epoch": 0.38685891770409836, + "grad_norm": 4.577909010601426, + "learning_rate": 6.132437619961613e-06, + "loss": 0.4041, + "step": 7660 + }, + { + "epoch": 0.3873639554556703, + "grad_norm": 6.930598411493494, + "learning_rate": 6.127386604707547e-06, + "loss": 0.4031, + "step": 7670 + }, + { + "epoch": 0.3878689932072422, + "grad_norm": 3.4417274152302513, + "learning_rate": 6.122335589453481e-06, + "loss": 0.3806, + "step": 7680 + }, + { + "epoch": 0.3883740309588142, + "grad_norm": 8.637005031113738, + "learning_rate": 6.1172845741994144e-06, + "loss": 0.3847, + "step": 7690 + }, + { + "epoch": 0.3888790687103861, + "grad_norm": 5.039168163562101, + "learning_rate": 6.1122335589453486e-06, + "loss": 0.3927, + "step": 7700 + }, + { + "epoch": 0.38938410646195803, + "grad_norm": 2.3470164311901307, + "learning_rate": 6.107182543691283e-06, + "loss": 0.38, + "step": 7710 + }, + { + "epoch": 0.38988914421352994, + "grad_norm": 10.576642650070218, + "learning_rate": 6.102131528437216e-06, + "loss": 0.3792, + "step": 7720 + }, + { + "epoch": 0.3903941819651019, + "grad_norm": 4.420199483186855, + "learning_rate": 6.09708051318315e-06, + "loss": 0.4023, + "step": 7730 + }, + { + "epoch": 0.3908992197166738, + "grad_norm": 4.194934818506613, + "learning_rate": 6.092029497929084e-06, + "loss": 0.3889, + "step": 7740 + }, + { + "epoch": 0.39140425746824575, + "grad_norm": 4.111966133004914, + "learning_rate": 6.0869784826750175e-06, + "loss": 0.4035, + "step": 7750 + }, + { + "epoch": 0.39190929521981765, + "grad_norm": 2.451588743941197, + "learning_rate": 6.081927467420952e-06, + "loss": 0.3713, + "step": 7760 + }, + { + "epoch": 0.3924143329713896, + "grad_norm": 4.679648209446588, + "learning_rate": 6.076876452166886e-06, + "loss": 0.3841, + "step": 7770 + }, + { + "epoch": 0.3929193707229615, + "grad_norm": 5.530044736355814, + "learning_rate": 6.071825436912821e-06, + "loss": 0.3775, + "step": 7780 + }, + { + "epoch": 0.39342440847453347, + "grad_norm": 8.103071297854475, + "learning_rate": 6.066774421658754e-06, + "loss": 0.3977, + "step": 7790 + }, + { + "epoch": 0.39392944622610543, + "grad_norm": 3.4887947505414787, + "learning_rate": 6.061723406404688e-06, + "loss": 0.3855, + "step": 7800 + }, + { + "epoch": 0.39443448397767733, + "grad_norm": 8.590868528737689, + "learning_rate": 6.056672391150622e-06, + "loss": 0.4039, + "step": 7810 + }, + { + "epoch": 0.3949395217292493, + "grad_norm": 6.459621411276858, + "learning_rate": 6.0516213758965555e-06, + "loss": 0.3877, + "step": 7820 + }, + { + "epoch": 0.3954445594808212, + "grad_norm": 5.585759607891893, + "learning_rate": 6.04657036064249e-06, + "loss": 0.382, + "step": 7830 + }, + { + "epoch": 0.39594959723239315, + "grad_norm": 13.909241772822636, + "learning_rate": 6.041519345388424e-06, + "loss": 0.3604, + "step": 7840 + }, + { + "epoch": 0.39645463498396505, + "grad_norm": 8.283993010778675, + "learning_rate": 6.036468330134357e-06, + "loss": 0.3892, + "step": 7850 + }, + { + "epoch": 0.396959672735537, + "grad_norm": 5.670024832457869, + "learning_rate": 6.031417314880291e-06, + "loss": 0.3758, + "step": 7860 + }, + { + "epoch": 0.3974647104871089, + "grad_norm": 6.589956544879753, + "learning_rate": 6.026366299626225e-06, + "loss": 0.3788, + "step": 7870 + }, + { + "epoch": 0.39796974823868086, + "grad_norm": 38.93534106041679, + "learning_rate": 6.0213152843721586e-06, + "loss": 0.3959, + "step": 7880 + }, + { + "epoch": 0.39847478599025277, + "grad_norm": 9.349039394830367, + "learning_rate": 6.016264269118093e-06, + "loss": 0.4026, + "step": 7890 + }, + { + "epoch": 0.3989798237418247, + "grad_norm": 12.651279564480014, + "learning_rate": 6.011213253864027e-06, + "loss": 0.3953, + "step": 7900 + }, + { + "epoch": 0.3994848614933966, + "grad_norm": 12.369720860667183, + "learning_rate": 6.006162238609962e-06, + "loss": 0.3834, + "step": 7910 + }, + { + "epoch": 0.3999898992449686, + "grad_norm": 6.935402086209793, + "learning_rate": 6.001111223355895e-06, + "loss": 0.3855, + "step": 7920 + }, + { + "epoch": 0.4004949369965405, + "grad_norm": 14.485519276092187, + "learning_rate": 5.996060208101829e-06, + "loss": 0.3794, + "step": 7930 + }, + { + "epoch": 0.40099997474811244, + "grad_norm": 7.8583919362498795, + "learning_rate": 5.991009192847763e-06, + "loss": 0.3922, + "step": 7940 + }, + { + "epoch": 0.40150501249968434, + "grad_norm": 11.298566008159142, + "learning_rate": 5.985958177593697e-06, + "loss": 0.4002, + "step": 7950 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 28.291667710287033, + "learning_rate": 5.980907162339631e-06, + "loss": 0.3621, + "step": 7960 + }, + { + "epoch": 0.4025150880028282, + "grad_norm": 17.877462639004314, + "learning_rate": 5.975856147085565e-06, + "loss": 0.3968, + "step": 7970 + }, + { + "epoch": 0.40302012575440016, + "grad_norm": 8.068693573421475, + "learning_rate": 5.970805131831499e-06, + "loss": 0.3795, + "step": 7980 + }, + { + "epoch": 0.40352516350597206, + "grad_norm": 6.199946065309128, + "learning_rate": 5.965754116577432e-06, + "loss": 0.3909, + "step": 7990 + }, + { + "epoch": 0.404030201257544, + "grad_norm": 13.7274262284846, + "learning_rate": 5.960703101323366e-06, + "loss": 0.3728, + "step": 8000 + }, + { + "epoch": 0.4045352390091159, + "grad_norm": 15.37764496794447, + "learning_rate": 5.9556520860693005e-06, + "loss": 0.3868, + "step": 8010 + }, + { + "epoch": 0.4050402767606879, + "grad_norm": 7.104894136071469, + "learning_rate": 5.950601070815234e-06, + "loss": 0.3869, + "step": 8020 + }, + { + "epoch": 0.4055453145122598, + "grad_norm": 22.085557636681344, + "learning_rate": 5.945550055561168e-06, + "loss": 0.3842, + "step": 8030 + }, + { + "epoch": 0.40605035226383174, + "grad_norm": 8.557783879131767, + "learning_rate": 5.940499040307102e-06, + "loss": 0.3823, + "step": 8040 + }, + { + "epoch": 0.40655539001540364, + "grad_norm": 19.231211160885596, + "learning_rate": 5.935448025053037e-06, + "loss": 0.3819, + "step": 8050 + }, + { + "epoch": 0.4070604277669756, + "grad_norm": 7.452519753759336, + "learning_rate": 5.93039700979897e-06, + "loss": 0.3789, + "step": 8060 + }, + { + "epoch": 0.4075654655185475, + "grad_norm": 25.150570679742312, + "learning_rate": 5.925345994544904e-06, + "loss": 0.3829, + "step": 8070 + }, + { + "epoch": 0.40807050327011946, + "grad_norm": 12.1446176511635, + "learning_rate": 5.9202949792908385e-06, + "loss": 0.3925, + "step": 8080 + }, + { + "epoch": 0.40857554102169136, + "grad_norm": 8.119469151619079, + "learning_rate": 5.915243964036772e-06, + "loss": 0.3857, + "step": 8090 + }, + { + "epoch": 0.4090805787732633, + "grad_norm": 8.948591507408596, + "learning_rate": 5.910192948782706e-06, + "loss": 0.3757, + "step": 8100 + }, + { + "epoch": 0.4095856165248352, + "grad_norm": 4.621733850994045, + "learning_rate": 5.90514193352864e-06, + "loss": 0.4093, + "step": 8110 + }, + { + "epoch": 0.4100906542764072, + "grad_norm": 4.153858890970974, + "learning_rate": 5.900090918274573e-06, + "loss": 0.3834, + "step": 8120 + }, + { + "epoch": 0.4105956920279791, + "grad_norm": 7.250782877265542, + "learning_rate": 5.895039903020507e-06, + "loss": 0.3914, + "step": 8130 + }, + { + "epoch": 0.41110072977955103, + "grad_norm": 7.214997449416598, + "learning_rate": 5.8899888877664415e-06, + "loss": 0.3763, + "step": 8140 + }, + { + "epoch": 0.41160576753112293, + "grad_norm": 7.292312503512012, + "learning_rate": 5.884937872512375e-06, + "loss": 0.3696, + "step": 8150 + }, + { + "epoch": 0.4121108052826949, + "grad_norm": 8.564034765070186, + "learning_rate": 5.879886857258309e-06, + "loss": 0.36, + "step": 8160 + }, + { + "epoch": 0.4126158430342668, + "grad_norm": 5.03381059488658, + "learning_rate": 5.874835842004243e-06, + "loss": 0.3915, + "step": 8170 + }, + { + "epoch": 0.41312088078583875, + "grad_norm": 7.1010748327814275, + "learning_rate": 5.869784826750178e-06, + "loss": 0.3831, + "step": 8180 + }, + { + "epoch": 0.41362591853741065, + "grad_norm": 3.8185356541714177, + "learning_rate": 5.864733811496111e-06, + "loss": 0.3821, + "step": 8190 + }, + { + "epoch": 0.4141309562889826, + "grad_norm": 15.840372845269206, + "learning_rate": 5.8596827962420454e-06, + "loss": 0.3703, + "step": 8200 + }, + { + "epoch": 0.4146359940405545, + "grad_norm": 8.469089513751111, + "learning_rate": 5.8546317809879796e-06, + "loss": 0.3672, + "step": 8210 + }, + { + "epoch": 0.41514103179212647, + "grad_norm": 5.590802338147382, + "learning_rate": 5.849580765733913e-06, + "loss": 0.3893, + "step": 8220 + }, + { + "epoch": 0.41564606954369837, + "grad_norm": 8.673788237261087, + "learning_rate": 5.844529750479847e-06, + "loss": 0.3802, + "step": 8230 + }, + { + "epoch": 0.41615110729527033, + "grad_norm": 4.9263977139934, + "learning_rate": 5.839478735225781e-06, + "loss": 0.3699, + "step": 8240 + }, + { + "epoch": 0.41665614504684223, + "grad_norm": 2.3618077978561134, + "learning_rate": 5.834427719971714e-06, + "loss": 0.4169, + "step": 8250 + }, + { + "epoch": 0.4171611827984142, + "grad_norm": 3.4421082504609215, + "learning_rate": 5.8293767047176485e-06, + "loss": 0.3895, + "step": 8260 + }, + { + "epoch": 0.4176662205499861, + "grad_norm": 9.036063807272322, + "learning_rate": 5.824325689463583e-06, + "loss": 0.3911, + "step": 8270 + }, + { + "epoch": 0.41817125830155805, + "grad_norm": 2.726905279515538, + "learning_rate": 5.819274674209517e-06, + "loss": 0.3906, + "step": 8280 + }, + { + "epoch": 0.41867629605312995, + "grad_norm": 3.1988219571320173, + "learning_rate": 5.81422365895545e-06, + "loss": 0.3864, + "step": 8290 + }, + { + "epoch": 0.4191813338047019, + "grad_norm": 3.45185619640904, + "learning_rate": 5.809172643701384e-06, + "loss": 0.3985, + "step": 8300 + }, + { + "epoch": 0.4196863715562738, + "grad_norm": 5.02491265508067, + "learning_rate": 5.804121628447318e-06, + "loss": 0.3907, + "step": 8310 + }, + { + "epoch": 0.42019140930784576, + "grad_norm": 3.5658556661908443, + "learning_rate": 5.799070613193253e-06, + "loss": 0.3923, + "step": 8320 + }, + { + "epoch": 0.42069644705941767, + "grad_norm": 5.978581813826559, + "learning_rate": 5.7940195979391865e-06, + "loss": 0.3844, + "step": 8330 + }, + { + "epoch": 0.4212014848109896, + "grad_norm": 8.14240158938725, + "learning_rate": 5.788968582685121e-06, + "loss": 0.3978, + "step": 8340 + }, + { + "epoch": 0.4217065225625615, + "grad_norm": 3.181846621853647, + "learning_rate": 5.783917567431055e-06, + "loss": 0.3992, + "step": 8350 + }, + { + "epoch": 0.4222115603141335, + "grad_norm": 8.322284252076804, + "learning_rate": 5.778866552176988e-06, + "loss": 0.3836, + "step": 8360 + }, + { + "epoch": 0.4227165980657054, + "grad_norm": 4.76585783468973, + "learning_rate": 5.773815536922922e-06, + "loss": 0.3904, + "step": 8370 + }, + { + "epoch": 0.42322163581727734, + "grad_norm": 3.857044578534056, + "learning_rate": 5.768764521668856e-06, + "loss": 0.3851, + "step": 8380 + }, + { + "epoch": 0.4237266735688493, + "grad_norm": 5.058592219879147, + "learning_rate": 5.7637135064147895e-06, + "loss": 0.3867, + "step": 8390 + }, + { + "epoch": 0.4242317113204212, + "grad_norm": 15.788479310500968, + "learning_rate": 5.758662491160724e-06, + "loss": 0.3876, + "step": 8400 + }, + { + "epoch": 0.42473674907199316, + "grad_norm": 14.796150435899415, + "learning_rate": 5.753611475906658e-06, + "loss": 0.3713, + "step": 8410 + }, + { + "epoch": 0.42524178682356506, + "grad_norm": 6.198372860755761, + "learning_rate": 5.748560460652591e-06, + "loss": 0.3959, + "step": 8420 + }, + { + "epoch": 0.425746824575137, + "grad_norm": 4.004395808383159, + "learning_rate": 5.743509445398525e-06, + "loss": 0.3879, + "step": 8430 + }, + { + "epoch": 0.4262518623267089, + "grad_norm": 24.753829770708524, + "learning_rate": 5.738458430144459e-06, + "loss": 0.3907, + "step": 8440 + }, + { + "epoch": 0.4267569000782809, + "grad_norm": 3.79416616415335, + "learning_rate": 5.733407414890394e-06, + "loss": 0.3716, + "step": 8450 + }, + { + "epoch": 0.4272619378298528, + "grad_norm": 16.793962044251266, + "learning_rate": 5.7283563996363276e-06, + "loss": 0.38, + "step": 8460 + }, + { + "epoch": 0.42776697558142474, + "grad_norm": 6.987194101161238, + "learning_rate": 5.723305384382262e-06, + "loss": 0.3741, + "step": 8470 + }, + { + "epoch": 0.42827201333299664, + "grad_norm": 6.1003803215690295, + "learning_rate": 5.718254369128196e-06, + "loss": 0.3836, + "step": 8480 + }, + { + "epoch": 0.4287770510845686, + "grad_norm": 11.39191383765078, + "learning_rate": 5.713203353874129e-06, + "loss": 0.3785, + "step": 8490 + }, + { + "epoch": 0.4292820888361405, + "grad_norm": 4.216231578608531, + "learning_rate": 5.708152338620063e-06, + "loss": 0.3849, + "step": 8500 + }, + { + "epoch": 0.42978712658771245, + "grad_norm": 26.748832800632403, + "learning_rate": 5.703101323365997e-06, + "loss": 0.3877, + "step": 8510 + }, + { + "epoch": 0.43029216433928436, + "grad_norm": 5.3822939480121414, + "learning_rate": 5.698050308111931e-06, + "loss": 0.3692, + "step": 8520 + }, + { + "epoch": 0.4307972020908563, + "grad_norm": 4.249521852101111, + "learning_rate": 5.692999292857865e-06, + "loss": 0.3817, + "step": 8530 + }, + { + "epoch": 0.4313022398424282, + "grad_norm": 6.698259673877979, + "learning_rate": 5.687948277603799e-06, + "loss": 0.3881, + "step": 8540 + }, + { + "epoch": 0.43180727759400017, + "grad_norm": 6.493588412853115, + "learning_rate": 5.682897262349732e-06, + "loss": 0.3977, + "step": 8550 + }, + { + "epoch": 0.4323123153455721, + "grad_norm": 3.733362569921758, + "learning_rate": 5.677846247095666e-06, + "loss": 0.3982, + "step": 8560 + }, + { + "epoch": 0.43281735309714403, + "grad_norm": 4.847622020933204, + "learning_rate": 5.6727952318416e-06, + "loss": 0.3769, + "step": 8570 + }, + { + "epoch": 0.43332239084871593, + "grad_norm": 7.5594667479754785, + "learning_rate": 5.667744216587534e-06, + "loss": 0.3907, + "step": 8580 + }, + { + "epoch": 0.4338274286002879, + "grad_norm": 3.201682379163367, + "learning_rate": 5.662693201333469e-06, + "loss": 0.3729, + "step": 8590 + }, + { + "epoch": 0.4343324663518598, + "grad_norm": 2.5878076554319898, + "learning_rate": 5.657642186079403e-06, + "loss": 0.3956, + "step": 8600 + }, + { + "epoch": 0.43483750410343175, + "grad_norm": 17.488965325778878, + "learning_rate": 5.652591170825337e-06, + "loss": 0.3712, + "step": 8610 + }, + { + "epoch": 0.43534254185500365, + "grad_norm": 4.116982393940921, + "learning_rate": 5.64754015557127e-06, + "loss": 0.3893, + "step": 8620 + }, + { + "epoch": 0.4358475796065756, + "grad_norm": 3.9148601756933554, + "learning_rate": 5.642489140317204e-06, + "loss": 0.3887, + "step": 8630 + }, + { + "epoch": 0.4363526173581475, + "grad_norm": 7.764634135356721, + "learning_rate": 5.637438125063138e-06, + "loss": 0.3776, + "step": 8640 + }, + { + "epoch": 0.43685765510971947, + "grad_norm": 9.506827681627305, + "learning_rate": 5.6323871098090725e-06, + "loss": 0.3989, + "step": 8650 + }, + { + "epoch": 0.43736269286129137, + "grad_norm": 3.2832190526050242, + "learning_rate": 5.627336094555006e-06, + "loss": 0.3982, + "step": 8660 + }, + { + "epoch": 0.4378677306128633, + "grad_norm": 4.082534248878624, + "learning_rate": 5.62228507930094e-06, + "loss": 0.3901, + "step": 8670 + }, + { + "epoch": 0.43837276836443523, + "grad_norm": 3.077297177968144, + "learning_rate": 5.617234064046874e-06, + "loss": 0.3669, + "step": 8680 + }, + { + "epoch": 0.4388778061160072, + "grad_norm": 2.9330241633656167, + "learning_rate": 5.612183048792807e-06, + "loss": 0.3934, + "step": 8690 + }, + { + "epoch": 0.4393828438675791, + "grad_norm": 3.3633206477485222, + "learning_rate": 5.6071320335387414e-06, + "loss": 0.3949, + "step": 8700 + }, + { + "epoch": 0.43988788161915104, + "grad_norm": 4.3033520111069, + "learning_rate": 5.6020810182846756e-06, + "loss": 0.3989, + "step": 8710 + }, + { + "epoch": 0.44039291937072295, + "grad_norm": 3.106225393468076, + "learning_rate": 5.597030003030609e-06, + "loss": 0.3827, + "step": 8720 + }, + { + "epoch": 0.4408979571222949, + "grad_norm": 3.245685369854172, + "learning_rate": 5.591978987776544e-06, + "loss": 0.3892, + "step": 8730 + }, + { + "epoch": 0.4414029948738668, + "grad_norm": 3.381863989802099, + "learning_rate": 5.586927972522478e-06, + "loss": 0.3945, + "step": 8740 + }, + { + "epoch": 0.44190803262543876, + "grad_norm": 5.237035603972838, + "learning_rate": 5.581876957268412e-06, + "loss": 0.3852, + "step": 8750 + }, + { + "epoch": 0.44241307037701066, + "grad_norm": 3.927311917890188, + "learning_rate": 5.576825942014345e-06, + "loss": 0.396, + "step": 8760 + }, + { + "epoch": 0.4429181081285826, + "grad_norm": 3.9833756210512634, + "learning_rate": 5.5717749267602795e-06, + "loss": 0.3823, + "step": 8770 + }, + { + "epoch": 0.4434231458801545, + "grad_norm": 4.82672597736031, + "learning_rate": 5.566723911506214e-06, + "loss": 0.3837, + "step": 8780 + }, + { + "epoch": 0.4439281836317265, + "grad_norm": 6.6207510733703, + "learning_rate": 5.561672896252147e-06, + "loss": 0.3846, + "step": 8790 + }, + { + "epoch": 0.4444332213832984, + "grad_norm": 4.761029989854409, + "learning_rate": 5.556621880998081e-06, + "loss": 0.3851, + "step": 8800 + }, + { + "epoch": 0.44493825913487034, + "grad_norm": 7.583147217262274, + "learning_rate": 5.551570865744015e-06, + "loss": 0.3897, + "step": 8810 + }, + { + "epoch": 0.44544329688644224, + "grad_norm": 8.267756155680992, + "learning_rate": 5.546519850489948e-06, + "loss": 0.3924, + "step": 8820 + }, + { + "epoch": 0.4459483346380142, + "grad_norm": 10.81298502824958, + "learning_rate": 5.5414688352358825e-06, + "loss": 0.3768, + "step": 8830 + }, + { + "epoch": 0.4464533723895861, + "grad_norm": 5.44259386052073, + "learning_rate": 5.536417819981817e-06, + "loss": 0.3766, + "step": 8840 + }, + { + "epoch": 0.44695841014115806, + "grad_norm": 19.848594487097966, + "learning_rate": 5.53136680472775e-06, + "loss": 0.386, + "step": 8850 + }, + { + "epoch": 0.44746344789272996, + "grad_norm": 9.264967854622206, + "learning_rate": 5.526315789473685e-06, + "loss": 0.3916, + "step": 8860 + }, + { + "epoch": 0.4479684856443019, + "grad_norm": 11.970151482632554, + "learning_rate": 5.521264774219619e-06, + "loss": 0.3878, + "step": 8870 + }, + { + "epoch": 0.4484735233958738, + "grad_norm": 4.562463656013562, + "learning_rate": 5.516213758965553e-06, + "loss": 0.3827, + "step": 8880 + }, + { + "epoch": 0.4489785611474458, + "grad_norm": 8.92273565377152, + "learning_rate": 5.511162743711486e-06, + "loss": 0.3691, + "step": 8890 + }, + { + "epoch": 0.4494835988990177, + "grad_norm": 6.168748754085141, + "learning_rate": 5.5061117284574205e-06, + "loss": 0.3762, + "step": 8900 + }, + { + "epoch": 0.44998863665058964, + "grad_norm": 9.45298676374925, + "learning_rate": 5.501060713203355e-06, + "loss": 0.39, + "step": 8910 + }, + { + "epoch": 0.45049367440216154, + "grad_norm": 8.9825721118566, + "learning_rate": 5.496009697949288e-06, + "loss": 0.3946, + "step": 8920 + }, + { + "epoch": 0.4509987121537335, + "grad_norm": 3.970319391221007, + "learning_rate": 5.490958682695222e-06, + "loss": 0.3766, + "step": 8930 + }, + { + "epoch": 0.4515037499053054, + "grad_norm": 10.786278133368974, + "learning_rate": 5.485907667441156e-06, + "loss": 0.3609, + "step": 8940 + }, + { + "epoch": 0.45200878765687735, + "grad_norm": 10.415656893523584, + "learning_rate": 5.4808566521870895e-06, + "loss": 0.381, + "step": 8950 + }, + { + "epoch": 0.45251382540844926, + "grad_norm": 6.642215363932757, + "learning_rate": 5.475805636933024e-06, + "loss": 0.3958, + "step": 8960 + }, + { + "epoch": 0.4530188631600212, + "grad_norm": 23.15078550027285, + "learning_rate": 5.470754621678958e-06, + "loss": 0.3765, + "step": 8970 + }, + { + "epoch": 0.4535239009115931, + "grad_norm": 12.962039757228773, + "learning_rate": 5.465703606424891e-06, + "loss": 0.3796, + "step": 8980 + }, + { + "epoch": 0.45402893866316507, + "grad_norm": 10.834056396167355, + "learning_rate": 5.460652591170825e-06, + "loss": 0.3825, + "step": 8990 + }, + { + "epoch": 0.45453397641473703, + "grad_norm": 31.830300939594174, + "learning_rate": 5.45560157591676e-06, + "loss": 0.3799, + "step": 9000 + }, + { + "epoch": 0.45503901416630893, + "grad_norm": 7.317578019104587, + "learning_rate": 5.450550560662694e-06, + "loss": 0.3617, + "step": 9010 + }, + { + "epoch": 0.4555440519178809, + "grad_norm": 18.51705064335606, + "learning_rate": 5.445499545408628e-06, + "loss": 0.3843, + "step": 9020 + }, + { + "epoch": 0.4560490896694528, + "grad_norm": 12.73637775470279, + "learning_rate": 5.440448530154562e-06, + "loss": 0.3533, + "step": 9030 + }, + { + "epoch": 0.45655412742102475, + "grad_norm": 13.089783277969067, + "learning_rate": 5.435397514900496e-06, + "loss": 0.3844, + "step": 9040 + }, + { + "epoch": 0.45705916517259665, + "grad_norm": 38.50162494773606, + "learning_rate": 5.43034649964643e-06, + "loss": 0.3839, + "step": 9050 + }, + { + "epoch": 0.4575642029241686, + "grad_norm": 5.6808406753592715, + "learning_rate": 5.425295484392363e-06, + "loss": 0.3938, + "step": 9060 + }, + { + "epoch": 0.4580692406757405, + "grad_norm": 9.303424412707237, + "learning_rate": 5.420244469138297e-06, + "loss": 0.3864, + "step": 9070 + }, + { + "epoch": 0.45857427842731247, + "grad_norm": 6.58855670966557, + "learning_rate": 5.415193453884231e-06, + "loss": 0.3802, + "step": 9080 + }, + { + "epoch": 0.45907931617888437, + "grad_norm": 7.626250897081635, + "learning_rate": 5.410142438630165e-06, + "loss": 0.3779, + "step": 9090 + }, + { + "epoch": 0.4595843539304563, + "grad_norm": 7.20955512967183, + "learning_rate": 5.405091423376099e-06, + "loss": 0.3852, + "step": 9100 + }, + { + "epoch": 0.4600893916820282, + "grad_norm": 3.600490312288401, + "learning_rate": 5.400040408122033e-06, + "loss": 0.3734, + "step": 9110 + }, + { + "epoch": 0.4605944294336002, + "grad_norm": 8.58899517905405, + "learning_rate": 5.394989392867966e-06, + "loss": 0.3789, + "step": 9120 + }, + { + "epoch": 0.4610994671851721, + "grad_norm": 6.98087393231808, + "learning_rate": 5.389938377613901e-06, + "loss": 0.3833, + "step": 9130 + }, + { + "epoch": 0.46160450493674404, + "grad_norm": 4.3141382007799765, + "learning_rate": 5.384887362359835e-06, + "loss": 0.3772, + "step": 9140 + }, + { + "epoch": 0.46210954268831594, + "grad_norm": 4.9903497470267615, + "learning_rate": 5.379836347105769e-06, + "loss": 0.3777, + "step": 9150 + }, + { + "epoch": 0.4626145804398879, + "grad_norm": 5.928389544490131, + "learning_rate": 5.374785331851703e-06, + "loss": 0.3862, + "step": 9160 + }, + { + "epoch": 0.4631196181914598, + "grad_norm": 4.797470596868366, + "learning_rate": 5.369734316597637e-06, + "loss": 0.3997, + "step": 9170 + }, + { + "epoch": 0.46362465594303176, + "grad_norm": 8.670905903895585, + "learning_rate": 5.364683301343571e-06, + "loss": 0.3678, + "step": 9180 + }, + { + "epoch": 0.46412969369460366, + "grad_norm": 6.464493728216097, + "learning_rate": 5.359632286089504e-06, + "loss": 0.3815, + "step": 9190 + }, + { + "epoch": 0.4646347314461756, + "grad_norm": 12.803983058209836, + "learning_rate": 5.354581270835438e-06, + "loss": 0.38, + "step": 9200 + }, + { + "epoch": 0.4651397691977475, + "grad_norm": 9.24913498804073, + "learning_rate": 5.3495302555813724e-06, + "loss": 0.3818, + "step": 9210 + }, + { + "epoch": 0.4656448069493195, + "grad_norm": 6.124104683054, + "learning_rate": 5.344479240327306e-06, + "loss": 0.3857, + "step": 9220 + }, + { + "epoch": 0.4661498447008914, + "grad_norm": 19.982398897538843, + "learning_rate": 5.33942822507324e-06, + "loss": 0.3759, + "step": 9230 + }, + { + "epoch": 0.46665488245246334, + "grad_norm": 6.143149352411427, + "learning_rate": 5.334377209819174e-06, + "loss": 0.3901, + "step": 9240 + }, + { + "epoch": 0.46715992020403524, + "grad_norm": 11.010467552273573, + "learning_rate": 5.329326194565107e-06, + "loss": 0.3798, + "step": 9250 + }, + { + "epoch": 0.4676649579556072, + "grad_norm": 5.615638251464779, + "learning_rate": 5.324275179311041e-06, + "loss": 0.3855, + "step": 9260 + }, + { + "epoch": 0.4681699957071791, + "grad_norm": 5.10238166508234, + "learning_rate": 5.319224164056976e-06, + "loss": 0.3793, + "step": 9270 + }, + { + "epoch": 0.46867503345875106, + "grad_norm": 11.264038165848222, + "learning_rate": 5.3141731488029105e-06, + "loss": 0.3912, + "step": 9280 + }, + { + "epoch": 0.46918007121032296, + "grad_norm": 4.638158847573904, + "learning_rate": 5.309122133548844e-06, + "loss": 0.3874, + "step": 9290 + }, + { + "epoch": 0.4696851089618949, + "grad_norm": 5.40563230905546, + "learning_rate": 5.304071118294778e-06, + "loss": 0.3688, + "step": 9300 + }, + { + "epoch": 0.4701901467134668, + "grad_norm": 7.336455357223306, + "learning_rate": 5.299020103040712e-06, + "loss": 0.3595, + "step": 9310 + }, + { + "epoch": 0.4706951844650388, + "grad_norm": 6.604255767738522, + "learning_rate": 5.293969087786645e-06, + "loss": 0.3851, + "step": 9320 + }, + { + "epoch": 0.4712002222166107, + "grad_norm": 8.434342768902814, + "learning_rate": 5.288918072532579e-06, + "loss": 0.3752, + "step": 9330 + }, + { + "epoch": 0.47170525996818263, + "grad_norm": 7.991264568372306, + "learning_rate": 5.2838670572785135e-06, + "loss": 0.3847, + "step": 9340 + }, + { + "epoch": 0.47221029771975453, + "grad_norm": 7.689424953061599, + "learning_rate": 5.278816042024447e-06, + "loss": 0.381, + "step": 9350 + }, + { + "epoch": 0.4727153354713265, + "grad_norm": 8.92289056823504, + "learning_rate": 5.273765026770381e-06, + "loss": 0.3766, + "step": 9360 + }, + { + "epoch": 0.4732203732228984, + "grad_norm": 6.798146146074497, + "learning_rate": 5.268714011516315e-06, + "loss": 0.3973, + "step": 9370 + }, + { + "epoch": 0.47372541097447035, + "grad_norm": 4.0943212163341105, + "learning_rate": 5.263662996262249e-06, + "loss": 0.3852, + "step": 9380 + }, + { + "epoch": 0.47423044872604225, + "grad_norm": 7.923369250973975, + "learning_rate": 5.2586119810081824e-06, + "loss": 0.3759, + "step": 9390 + }, + { + "epoch": 0.4747354864776142, + "grad_norm": 6.958844060305958, + "learning_rate": 5.253560965754117e-06, + "loss": 0.3814, + "step": 9400 + }, + { + "epoch": 0.4752405242291861, + "grad_norm": 5.637050541114169, + "learning_rate": 5.2485099505000515e-06, + "loss": 0.3665, + "step": 9410 + }, + { + "epoch": 0.47574556198075807, + "grad_norm": 7.801257727546048, + "learning_rate": 5.243458935245986e-06, + "loss": 0.3744, + "step": 9420 + }, + { + "epoch": 0.47625059973232997, + "grad_norm": 8.37919084606113, + "learning_rate": 5.238407919991919e-06, + "loss": 0.394, + "step": 9430 + }, + { + "epoch": 0.47675563748390193, + "grad_norm": 4.463876576531971, + "learning_rate": 5.233356904737853e-06, + "loss": 0.3712, + "step": 9440 + }, + { + "epoch": 0.47726067523547383, + "grad_norm": 3.5151581874917253, + "learning_rate": 5.228305889483787e-06, + "loss": 0.3733, + "step": 9450 + }, + { + "epoch": 0.4777657129870458, + "grad_norm": 6.525584094885654, + "learning_rate": 5.2232548742297204e-06, + "loss": 0.3769, + "step": 9460 + }, + { + "epoch": 0.4782707507386177, + "grad_norm": 3.5568609941598344, + "learning_rate": 5.2182038589756546e-06, + "loss": 0.3902, + "step": 9470 + }, + { + "epoch": 0.47877578849018965, + "grad_norm": 4.842870672953731, + "learning_rate": 5.213152843721589e-06, + "loss": 0.3794, + "step": 9480 + }, + { + "epoch": 0.47928082624176155, + "grad_norm": 6.265357359911366, + "learning_rate": 5.208101828467522e-06, + "loss": 0.3803, + "step": 9490 + }, + { + "epoch": 0.4797858639933335, + "grad_norm": 8.175660778042406, + "learning_rate": 5.203050813213456e-06, + "loss": 0.3835, + "step": 9500 + }, + { + "epoch": 0.4802909017449054, + "grad_norm": 2.9195122634550112, + "learning_rate": 5.19799979795939e-06, + "loss": 0.3813, + "step": 9510 + }, + { + "epoch": 0.48079593949647736, + "grad_norm": 3.632448992408266, + "learning_rate": 5.1929487827053235e-06, + "loss": 0.3684, + "step": 9520 + }, + { + "epoch": 0.48130097724804927, + "grad_norm": 4.0935522105259095, + "learning_rate": 5.187897767451258e-06, + "loss": 0.4025, + "step": 9530 + }, + { + "epoch": 0.4818060149996212, + "grad_norm": 2.3275681028787005, + "learning_rate": 5.182846752197193e-06, + "loss": 0.388, + "step": 9540 + }, + { + "epoch": 0.4823110527511931, + "grad_norm": 2.0956548268446458, + "learning_rate": 5.177795736943127e-06, + "loss": 0.4031, + "step": 9550 + }, + { + "epoch": 0.4828160905027651, + "grad_norm": 2.173446663356269, + "learning_rate": 5.17274472168906e-06, + "loss": 0.3984, + "step": 9560 + }, + { + "epoch": 0.483321128254337, + "grad_norm": 3.2812553226056713, + "learning_rate": 5.167693706434994e-06, + "loss": 0.384, + "step": 9570 + }, + { + "epoch": 0.48382616600590894, + "grad_norm": 3.590239737946557, + "learning_rate": 5.162642691180928e-06, + "loss": 0.3877, + "step": 9580 + }, + { + "epoch": 0.4843312037574809, + "grad_norm": 2.83308197947474, + "learning_rate": 5.1575916759268615e-06, + "loss": 0.3934, + "step": 9590 + }, + { + "epoch": 0.4848362415090528, + "grad_norm": 3.847792584135844, + "learning_rate": 5.152540660672796e-06, + "loss": 0.3735, + "step": 9600 + }, + { + "epoch": 0.48534127926062476, + "grad_norm": 20.75585092309958, + "learning_rate": 5.14748964541873e-06, + "loss": 0.3788, + "step": 9610 + }, + { + "epoch": 0.48584631701219666, + "grad_norm": 4.31507443188382, + "learning_rate": 5.142438630164663e-06, + "loss": 0.4073, + "step": 9620 + }, + { + "epoch": 0.4863513547637686, + "grad_norm": 2.675763121628948, + "learning_rate": 5.137387614910597e-06, + "loss": 0.3853, + "step": 9630 + }, + { + "epoch": 0.4868563925153405, + "grad_norm": 3.180499977172799, + "learning_rate": 5.132336599656531e-06, + "loss": 0.3933, + "step": 9640 + }, + { + "epoch": 0.4873614302669125, + "grad_norm": 3.2780129127069633, + "learning_rate": 5.1272855844024646e-06, + "loss": 0.3862, + "step": 9650 + }, + { + "epoch": 0.4878664680184844, + "grad_norm": 2.650097724922774, + "learning_rate": 5.122234569148399e-06, + "loss": 0.3769, + "step": 9660 + }, + { + "epoch": 0.48837150577005634, + "grad_norm": 19.664449194387533, + "learning_rate": 5.117183553894333e-06, + "loss": 0.3909, + "step": 9670 + }, + { + "epoch": 0.48887654352162824, + "grad_norm": 3.055574678383631, + "learning_rate": 5.112132538640268e-06, + "loss": 0.368, + "step": 9680 + }, + { + "epoch": 0.4893815812732002, + "grad_norm": 4.1275772825798285, + "learning_rate": 5.107081523386201e-06, + "loss": 0.3807, + "step": 9690 + }, + { + "epoch": 0.4898866190247721, + "grad_norm": 2.680370704403155, + "learning_rate": 5.102030508132135e-06, + "loss": 0.3852, + "step": 9700 + }, + { + "epoch": 0.49039165677634405, + "grad_norm": 5.918612399643231, + "learning_rate": 5.096979492878069e-06, + "loss": 0.3838, + "step": 9710 + }, + { + "epoch": 0.49089669452791596, + "grad_norm": 3.682371592006304, + "learning_rate": 5.091928477624003e-06, + "loss": 0.4054, + "step": 9720 + }, + { + "epoch": 0.4914017322794879, + "grad_norm": 14.462676090252211, + "learning_rate": 5.086877462369937e-06, + "loss": 0.3992, + "step": 9730 + }, + { + "epoch": 0.4919067700310598, + "grad_norm": 8.666562939588516, + "learning_rate": 5.081826447115871e-06, + "loss": 0.3942, + "step": 9740 + }, + { + "epoch": 0.49241180778263177, + "grad_norm": 8.758850115026672, + "learning_rate": 5.076775431861805e-06, + "loss": 0.3895, + "step": 9750 + }, + { + "epoch": 0.4929168455342037, + "grad_norm": 4.418838276589228, + "learning_rate": 5.071724416607738e-06, + "loss": 0.3696, + "step": 9760 + }, + { + "epoch": 0.49342188328577563, + "grad_norm": 3.2634112325138016, + "learning_rate": 5.066673401353672e-06, + "loss": 0.392, + "step": 9770 + }, + { + "epoch": 0.49392692103734753, + "grad_norm": 9.038919625118139, + "learning_rate": 5.0616223860996065e-06, + "loss": 0.3763, + "step": 9780 + }, + { + "epoch": 0.4944319587889195, + "grad_norm": 9.874288914817955, + "learning_rate": 5.05657137084554e-06, + "loss": 0.3749, + "step": 9790 + }, + { + "epoch": 0.4949369965404914, + "grad_norm": 4.177942828246587, + "learning_rate": 5.051520355591474e-06, + "loss": 0.3803, + "step": 9800 + }, + { + "epoch": 0.49544203429206335, + "grad_norm": 5.209048638874338, + "learning_rate": 5.046469340337409e-06, + "loss": 0.377, + "step": 9810 + }, + { + "epoch": 0.49594707204363525, + "grad_norm": 6.559103955892935, + "learning_rate": 5.041418325083343e-06, + "loss": 0.3779, + "step": 9820 + }, + { + "epoch": 0.4964521097952072, + "grad_norm": 4.815427953732501, + "learning_rate": 5.036367309829276e-06, + "loss": 0.3832, + "step": 9830 + }, + { + "epoch": 0.4969571475467791, + "grad_norm": 4.801884846686052, + "learning_rate": 5.03131629457521e-06, + "loss": 0.3648, + "step": 9840 + }, + { + "epoch": 0.49746218529835107, + "grad_norm": 4.707895499699569, + "learning_rate": 5.0262652793211445e-06, + "loss": 0.3974, + "step": 9850 + }, + { + "epoch": 0.49796722304992297, + "grad_norm": 3.1632503133911176, + "learning_rate": 5.021214264067078e-06, + "loss": 0.3645, + "step": 9860 + }, + { + "epoch": 0.4984722608014949, + "grad_norm": 3.187410732261863, + "learning_rate": 5.016163248813012e-06, + "loss": 0.3964, + "step": 9870 + }, + { + "epoch": 0.49897729855306683, + "grad_norm": 18.87501682400254, + "learning_rate": 5.011112233558946e-06, + "loss": 0.394, + "step": 9880 + }, + { + "epoch": 0.4994823363046388, + "grad_norm": 2.727776305600166, + "learning_rate": 5.006061218304879e-06, + "loss": 0.384, + "step": 9890 + }, + { + "epoch": 0.4999873740562107, + "grad_norm": 2.835732786228447, + "learning_rate": 5.001010203050813e-06, + "loss": 0.3826, + "step": 9900 + }, + { + "epoch": 0.5004924118077826, + "grad_norm": 12.721805859040408, + "learning_rate": 4.9959591877967475e-06, + "loss": 0.3965, + "step": 9910 + }, + { + "epoch": 0.5009974495593545, + "grad_norm": 4.309473567917906, + "learning_rate": 4.990908172542682e-06, + "loss": 0.3867, + "step": 9920 + }, + { + "epoch": 0.5015024873109265, + "grad_norm": 4.385559429438772, + "learning_rate": 4.985857157288616e-06, + "loss": 0.3819, + "step": 9930 + }, + { + "epoch": 0.5020075250624985, + "grad_norm": 3.1179751000194917, + "learning_rate": 4.980806142034549e-06, + "loss": 0.3678, + "step": 9940 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 6.661473319925103, + "learning_rate": 4.975755126780483e-06, + "loss": 0.3803, + "step": 9950 + }, + { + "epoch": 0.5030176005656423, + "grad_norm": 5.422867145087551, + "learning_rate": 4.970704111526417e-06, + "loss": 0.3724, + "step": 9960 + }, + { + "epoch": 0.5035226383172142, + "grad_norm": 5.116363457400816, + "learning_rate": 4.965653096272351e-06, + "loss": 0.3796, + "step": 9970 + }, + { + "epoch": 0.5040276760687862, + "grad_norm": 4.36781162605322, + "learning_rate": 4.9606020810182856e-06, + "loss": 0.3758, + "step": 9980 + }, + { + "epoch": 0.504532713820358, + "grad_norm": 4.280820474874582, + "learning_rate": 4.955551065764219e-06, + "loss": 0.3747, + "step": 9990 + }, + { + "epoch": 0.50503775157193, + "grad_norm": 57.44106262644066, + "learning_rate": 4.950500050510153e-06, + "loss": 0.371, + "step": 10000 + }, + { + "epoch": 0.5055427893235019, + "grad_norm": 4.841001034679017, + "learning_rate": 4.945449035256087e-06, + "loss": 0.3779, + "step": 10010 + }, + { + "epoch": 0.5060478270750739, + "grad_norm": 11.797410635025113, + "learning_rate": 4.94039802000202e-06, + "loss": 0.3683, + "step": 10020 + }, + { + "epoch": 0.5065528648266457, + "grad_norm": 4.75455698304815, + "learning_rate": 4.9353470047479545e-06, + "loss": 0.3705, + "step": 10030 + }, + { + "epoch": 0.5070579025782177, + "grad_norm": 4.360031532937485, + "learning_rate": 4.930295989493889e-06, + "loss": 0.398, + "step": 10040 + }, + { + "epoch": 0.5075629403297897, + "grad_norm": 5.827246838677423, + "learning_rate": 4.925244974239823e-06, + "loss": 0.3841, + "step": 10050 + }, + { + "epoch": 0.5080679780813616, + "grad_norm": 7.5891846258257765, + "learning_rate": 4.920193958985757e-06, + "loss": 0.3744, + "step": 10060 + }, + { + "epoch": 0.5085730158329335, + "grad_norm": 5.9635081800997805, + "learning_rate": 4.91514294373169e-06, + "loss": 0.3986, + "step": 10070 + }, + { + "epoch": 0.5090780535845054, + "grad_norm": 5.484063460889767, + "learning_rate": 4.910091928477624e-06, + "loss": 0.3839, + "step": 10080 + }, + { + "epoch": 0.5095830913360774, + "grad_norm": 5.174164528404758, + "learning_rate": 4.905040913223558e-06, + "loss": 0.3619, + "step": 10090 + }, + { + "epoch": 0.5100881290876493, + "grad_norm": 2.90936959880962, + "learning_rate": 4.8999898979694925e-06, + "loss": 0.3756, + "step": 10100 + }, + { + "epoch": 0.5105931668392212, + "grad_norm": 4.3229203248018715, + "learning_rate": 4.894938882715426e-06, + "loss": 0.3832, + "step": 10110 + }, + { + "epoch": 0.5110982045907931, + "grad_norm": 2.500902066389639, + "learning_rate": 4.889887867461361e-06, + "loss": 0.3826, + "step": 10120 + }, + { + "epoch": 0.5116032423423651, + "grad_norm": 8.290016106139367, + "learning_rate": 4.884836852207294e-06, + "loss": 0.386, + "step": 10130 + }, + { + "epoch": 0.512108280093937, + "grad_norm": 2.6008214460714894, + "learning_rate": 4.879785836953228e-06, + "loss": 0.3676, + "step": 10140 + }, + { + "epoch": 0.5126133178455089, + "grad_norm": 2.1333825641801902, + "learning_rate": 4.874734821699162e-06, + "loss": 0.3898, + "step": 10150 + }, + { + "epoch": 0.5131183555970809, + "grad_norm": 3.013718964658717, + "learning_rate": 4.8696838064450955e-06, + "loss": 0.3887, + "step": 10160 + }, + { + "epoch": 0.5136233933486528, + "grad_norm": 3.6467351317483816, + "learning_rate": 4.86463279119103e-06, + "loss": 0.3782, + "step": 10170 + }, + { + "epoch": 0.5141284311002248, + "grad_norm": 2.950522126672218, + "learning_rate": 4.859581775936964e-06, + "loss": 0.377, + "step": 10180 + }, + { + "epoch": 0.5146334688517967, + "grad_norm": 2.252426318052208, + "learning_rate": 4.854530760682898e-06, + "loss": 0.3758, + "step": 10190 + }, + { + "epoch": 0.5151385066033686, + "grad_norm": 4.755817616459457, + "learning_rate": 4.849479745428832e-06, + "loss": 0.3807, + "step": 10200 + }, + { + "epoch": 0.5156435443549405, + "grad_norm": 4.046138186931513, + "learning_rate": 4.844428730174765e-06, + "loss": 0.391, + "step": 10210 + }, + { + "epoch": 0.5161485821065125, + "grad_norm": 19.99100500952114, + "learning_rate": 4.8393777149206994e-06, + "loss": 0.3781, + "step": 10220 + }, + { + "epoch": 0.5166536198580844, + "grad_norm": 5.479232805622522, + "learning_rate": 4.8343266996666336e-06, + "loss": 0.3708, + "step": 10230 + }, + { + "epoch": 0.5171586576096563, + "grad_norm": 8.844870574151285, + "learning_rate": 4.829275684412567e-06, + "loss": 0.3711, + "step": 10240 + }, + { + "epoch": 0.5176636953612282, + "grad_norm": 7.57710951045164, + "learning_rate": 4.824224669158502e-06, + "loss": 0.3854, + "step": 10250 + }, + { + "epoch": 0.5181687331128002, + "grad_norm": 7.850442473809185, + "learning_rate": 4.819173653904435e-06, + "loss": 0.3786, + "step": 10260 + }, + { + "epoch": 0.5186737708643722, + "grad_norm": 6.316522286246354, + "learning_rate": 4.814122638650369e-06, + "loss": 0.3838, + "step": 10270 + }, + { + "epoch": 0.519178808615944, + "grad_norm": 11.182819849962994, + "learning_rate": 4.809071623396303e-06, + "loss": 0.385, + "step": 10280 + }, + { + "epoch": 0.519683846367516, + "grad_norm": 9.46133568238808, + "learning_rate": 4.804020608142237e-06, + "loss": 0.3703, + "step": 10290 + }, + { + "epoch": 0.5201888841190879, + "grad_norm": 7.691807268341042, + "learning_rate": 4.798969592888171e-06, + "loss": 0.3682, + "step": 10300 + }, + { + "epoch": 0.5206939218706599, + "grad_norm": 9.93078233486807, + "learning_rate": 4.793918577634105e-06, + "loss": 0.3802, + "step": 10310 + }, + { + "epoch": 0.5211989596222317, + "grad_norm": 7.510658702823232, + "learning_rate": 4.788867562380039e-06, + "loss": 0.3886, + "step": 10320 + }, + { + "epoch": 0.5217039973738037, + "grad_norm": 22.882646270297737, + "learning_rate": 4.783816547125973e-06, + "loss": 0.393, + "step": 10330 + }, + { + "epoch": 0.5222090351253756, + "grad_norm": 14.517964204994225, + "learning_rate": 4.778765531871906e-06, + "loss": 0.3803, + "step": 10340 + }, + { + "epoch": 0.5227140728769476, + "grad_norm": 6.854534356382802, + "learning_rate": 4.7737145166178405e-06, + "loss": 0.3931, + "step": 10350 + }, + { + "epoch": 0.5232191106285194, + "grad_norm": 11.309739741500756, + "learning_rate": 4.768663501363775e-06, + "loss": 0.3876, + "step": 10360 + }, + { + "epoch": 0.5237241483800914, + "grad_norm": 8.35728833434225, + "learning_rate": 4.763612486109708e-06, + "loss": 0.3789, + "step": 10370 + }, + { + "epoch": 0.5242291861316634, + "grad_norm": 8.871743836416062, + "learning_rate": 4.758561470855642e-06, + "loss": 0.3981, + "step": 10380 + }, + { + "epoch": 0.5247342238832353, + "grad_norm": 15.15117759530781, + "learning_rate": 4.753510455601576e-06, + "loss": 0.3577, + "step": 10390 + }, + { + "epoch": 0.5252392616348072, + "grad_norm": 10.766312797504908, + "learning_rate": 4.74845944034751e-06, + "loss": 0.3744, + "step": 10400 + }, + { + "epoch": 0.5257442993863791, + "grad_norm": 44.757871998844294, + "learning_rate": 4.743408425093444e-06, + "loss": 0.3697, + "step": 10410 + }, + { + "epoch": 0.5262493371379511, + "grad_norm": 16.586045391427703, + "learning_rate": 4.738357409839378e-06, + "loss": 0.3746, + "step": 10420 + }, + { + "epoch": 0.526754374889523, + "grad_norm": 50.773287263268436, + "learning_rate": 4.733306394585312e-06, + "loss": 0.373, + "step": 10430 + }, + { + "epoch": 0.5272594126410949, + "grad_norm": 10.502847442362183, + "learning_rate": 4.728255379331246e-06, + "loss": 0.387, + "step": 10440 + }, + { + "epoch": 0.5277644503926668, + "grad_norm": 16.528099312938107, + "learning_rate": 4.72320436407718e-06, + "loss": 0.383, + "step": 10450 + }, + { + "epoch": 0.5282694881442388, + "grad_norm": 17.8799111844064, + "learning_rate": 4.718153348823114e-06, + "loss": 0.381, + "step": 10460 + }, + { + "epoch": 0.5287745258958108, + "grad_norm": 14.63753428467757, + "learning_rate": 4.713102333569048e-06, + "loss": 0.3837, + "step": 10470 + }, + { + "epoch": 0.5292795636473826, + "grad_norm": 13.42251301006677, + "learning_rate": 4.7080513183149816e-06, + "loss": 0.3595, + "step": 10480 + }, + { + "epoch": 0.5297846013989546, + "grad_norm": 30.157056141735794, + "learning_rate": 4.703000303060916e-06, + "loss": 0.3648, + "step": 10490 + }, + { + "epoch": 0.5302896391505265, + "grad_norm": 28.508594444052306, + "learning_rate": 4.69794928780685e-06, + "loss": 0.3793, + "step": 10500 + }, + { + "epoch": 0.5307946769020985, + "grad_norm": 28.922781791221336, + "learning_rate": 4.692898272552783e-06, + "loss": 0.3745, + "step": 10510 + }, + { + "epoch": 0.5312997146536703, + "grad_norm": 13.721901669008831, + "learning_rate": 4.687847257298718e-06, + "loss": 0.375, + "step": 10520 + }, + { + "epoch": 0.5318047524052423, + "grad_norm": 65.55837391568407, + "learning_rate": 4.682796242044651e-06, + "loss": 0.3801, + "step": 10530 + }, + { + "epoch": 0.5323097901568142, + "grad_norm": 16.351443187824426, + "learning_rate": 4.6777452267905855e-06, + "loss": 0.3727, + "step": 10540 + }, + { + "epoch": 0.5328148279083862, + "grad_norm": 24.727000388948138, + "learning_rate": 4.67269421153652e-06, + "loss": 0.4, + "step": 10550 + }, + { + "epoch": 0.533319865659958, + "grad_norm": 16.92930292363904, + "learning_rate": 4.667643196282453e-06, + "loss": 0.3818, + "step": 10560 + }, + { + "epoch": 0.53382490341153, + "grad_norm": 8.293298262780807, + "learning_rate": 4.662592181028387e-06, + "loss": 0.3914, + "step": 10570 + }, + { + "epoch": 0.534329941163102, + "grad_norm": 7.923596292500085, + "learning_rate": 4.657541165774321e-06, + "loss": 0.3835, + "step": 10580 + }, + { + "epoch": 0.5348349789146739, + "grad_norm": 21.525089803513207, + "learning_rate": 4.652490150520255e-06, + "loss": 0.3621, + "step": 10590 + }, + { + "epoch": 0.5353400166662458, + "grad_norm": 5.744657346480749, + "learning_rate": 4.647439135266189e-06, + "loss": 0.3674, + "step": 10600 + }, + { + "epoch": 0.5358450544178177, + "grad_norm": 4.54613701131844, + "learning_rate": 4.642388120012123e-06, + "loss": 0.3804, + "step": 10610 + }, + { + "epoch": 0.5363500921693897, + "grad_norm": 14.97113129892423, + "learning_rate": 4.637337104758057e-06, + "loss": 0.3555, + "step": 10620 + }, + { + "epoch": 0.5368551299209616, + "grad_norm": 11.639674507484868, + "learning_rate": 4.632286089503991e-06, + "loss": 0.3632, + "step": 10630 + }, + { + "epoch": 0.5373601676725335, + "grad_norm": 19.897945155280457, + "learning_rate": 4.627235074249924e-06, + "loss": 0.3747, + "step": 10640 + }, + { + "epoch": 0.5378652054241054, + "grad_norm": 6.364339652707885, + "learning_rate": 4.622184058995858e-06, + "loss": 0.3786, + "step": 10650 + }, + { + "epoch": 0.5383702431756774, + "grad_norm": 9.457676241588533, + "learning_rate": 4.617133043741792e-06, + "loss": 0.3756, + "step": 10660 + }, + { + "epoch": 0.5388752809272493, + "grad_norm": 44.15827262329639, + "learning_rate": 4.6120820284877265e-06, + "loss": 0.3741, + "step": 10670 + }, + { + "epoch": 0.5393803186788212, + "grad_norm": 17.038339271477003, + "learning_rate": 4.607031013233661e-06, + "loss": 0.3736, + "step": 10680 + }, + { + "epoch": 0.5398853564303931, + "grad_norm": 7.117484724149919, + "learning_rate": 4.601979997979594e-06, + "loss": 0.3723, + "step": 10690 + }, + { + "epoch": 0.5403903941819651, + "grad_norm": 42.36231217164134, + "learning_rate": 4.596928982725528e-06, + "loss": 0.3729, + "step": 10700 + }, + { + "epoch": 0.5408954319335371, + "grad_norm": 8.564465700855026, + "learning_rate": 4.591877967471462e-06, + "loss": 0.3789, + "step": 10710 + }, + { + "epoch": 0.5414004696851089, + "grad_norm": 8.67428139976207, + "learning_rate": 4.5868269522173955e-06, + "loss": 0.3713, + "step": 10720 + }, + { + "epoch": 0.5419055074366809, + "grad_norm": 3.39326548400569, + "learning_rate": 4.5817759369633304e-06, + "loss": 0.385, + "step": 10730 + }, + { + "epoch": 0.5424105451882528, + "grad_norm": 8.005255858360202, + "learning_rate": 4.576724921709264e-06, + "loss": 0.3707, + "step": 10740 + }, + { + "epoch": 0.5429155829398248, + "grad_norm": 5.061235443799371, + "learning_rate": 4.571673906455198e-06, + "loss": 0.3685, + "step": 10750 + }, + { + "epoch": 0.5434206206913966, + "grad_norm": 3.7186572910401816, + "learning_rate": 4.566622891201132e-06, + "loss": 0.3793, + "step": 10760 + }, + { + "epoch": 0.5439256584429686, + "grad_norm": 3.941979325788329, + "learning_rate": 4.561571875947065e-06, + "loss": 0.3723, + "step": 10770 + }, + { + "epoch": 0.5444306961945405, + "grad_norm": 6.68200128954829, + "learning_rate": 4.556520860692999e-06, + "loss": 0.3699, + "step": 10780 + }, + { + "epoch": 0.5449357339461125, + "grad_norm": 3.8742413460943204, + "learning_rate": 4.5514698454389335e-06, + "loss": 0.3712, + "step": 10790 + }, + { + "epoch": 0.5454407716976845, + "grad_norm": 3.5527678739232327, + "learning_rate": 4.546418830184868e-06, + "loss": 0.3797, + "step": 10800 + }, + { + "epoch": 0.5459458094492563, + "grad_norm": 3.293145624427697, + "learning_rate": 4.541367814930802e-06, + "loss": 0.392, + "step": 10810 + }, + { + "epoch": 0.5464508472008283, + "grad_norm": 43.93745381251672, + "learning_rate": 4.536316799676736e-06, + "loss": 0.3848, + "step": 10820 + }, + { + "epoch": 0.5469558849524002, + "grad_norm": 3.1532560256164466, + "learning_rate": 4.531265784422669e-06, + "loss": 0.3833, + "step": 10830 + }, + { + "epoch": 0.5474609227039722, + "grad_norm": 6.167488921508967, + "learning_rate": 4.526214769168603e-06, + "loss": 0.4006, + "step": 10840 + }, + { + "epoch": 0.547965960455544, + "grad_norm": 12.229490845691188, + "learning_rate": 4.521163753914537e-06, + "loss": 0.3803, + "step": 10850 + }, + { + "epoch": 0.548470998207116, + "grad_norm": 4.1950075492065615, + "learning_rate": 4.5161127386604715e-06, + "loss": 0.3842, + "step": 10860 + }, + { + "epoch": 0.5489760359586879, + "grad_norm": 3.654034176301022, + "learning_rate": 4.511061723406406e-06, + "loss": 0.396, + "step": 10870 + }, + { + "epoch": 0.5494810737102599, + "grad_norm": 12.04048410661582, + "learning_rate": 4.506010708152339e-06, + "loss": 0.3902, + "step": 10880 + }, + { + "epoch": 0.5499861114618317, + "grad_norm": 5.763873510396362, + "learning_rate": 4.500959692898273e-06, + "loss": 0.3803, + "step": 10890 + }, + { + "epoch": 0.5504911492134037, + "grad_norm": 5.97333873812127, + "learning_rate": 4.495908677644207e-06, + "loss": 0.3787, + "step": 10900 + }, + { + "epoch": 0.5509961869649757, + "grad_norm": 4.51021628683305, + "learning_rate": 4.49085766239014e-06, + "loss": 0.3835, + "step": 10910 + }, + { + "epoch": 0.5515012247165476, + "grad_norm": 3.902274506866627, + "learning_rate": 4.4858066471360745e-06, + "loss": 0.3762, + "step": 10920 + }, + { + "epoch": 0.5520062624681195, + "grad_norm": 3.0980828622359335, + "learning_rate": 4.480755631882009e-06, + "loss": 0.3873, + "step": 10930 + }, + { + "epoch": 0.5525113002196914, + "grad_norm": 2.5033456579398674, + "learning_rate": 4.475704616627943e-06, + "loss": 0.4027, + "step": 10940 + }, + { + "epoch": 0.5530163379712634, + "grad_norm": 2.888693890103198, + "learning_rate": 4.470653601373877e-06, + "loss": 0.3828, + "step": 10950 + }, + { + "epoch": 0.5535213757228353, + "grad_norm": 3.5149007219506205, + "learning_rate": 4.46560258611981e-06, + "loss": 0.3797, + "step": 10960 + }, + { + "epoch": 0.5540264134744072, + "grad_norm": 2.3262448954191894, + "learning_rate": 4.460551570865744e-06, + "loss": 0.385, + "step": 10970 + }, + { + "epoch": 0.5545314512259791, + "grad_norm": 2.021941174461837, + "learning_rate": 4.4555005556116784e-06, + "loss": 0.3902, + "step": 10980 + }, + { + "epoch": 0.5550364889775511, + "grad_norm": 3.2136340880143694, + "learning_rate": 4.450449540357612e-06, + "loss": 0.3767, + "step": 10990 + }, + { + "epoch": 0.555541526729123, + "grad_norm": 6.405644243339184, + "learning_rate": 4.445398525103547e-06, + "loss": 0.394, + "step": 11000 + }, + { + "epoch": 0.5560465644806949, + "grad_norm": 3.698639548037572, + "learning_rate": 4.44034750984948e-06, + "loss": 0.3733, + "step": 11010 + }, + { + "epoch": 0.5565516022322669, + "grad_norm": 1.8438864220839366, + "learning_rate": 4.435296494595414e-06, + "loss": 0.387, + "step": 11020 + }, + { + "epoch": 0.5570566399838388, + "grad_norm": 2.9672016536550654, + "learning_rate": 4.430245479341348e-06, + "loss": 0.3767, + "step": 11030 + }, + { + "epoch": 0.5575616777354108, + "grad_norm": 9.95951552056093, + "learning_rate": 4.4251944640872815e-06, + "loss": 0.3796, + "step": 11040 + }, + { + "epoch": 0.5580667154869826, + "grad_norm": 3.844799275005631, + "learning_rate": 4.420143448833216e-06, + "loss": 0.3721, + "step": 11050 + }, + { + "epoch": 0.5585717532385546, + "grad_norm": 3.606527812089754, + "learning_rate": 4.41509243357915e-06, + "loss": 0.3726, + "step": 11060 + }, + { + "epoch": 0.5590767909901265, + "grad_norm": 3.99898244357326, + "learning_rate": 4.410041418325084e-06, + "loss": 0.3822, + "step": 11070 + }, + { + "epoch": 0.5595818287416985, + "grad_norm": 2.365878798228634, + "learning_rate": 4.404990403071018e-06, + "loss": 0.3858, + "step": 11080 + }, + { + "epoch": 0.5600868664932703, + "grad_norm": 2.0114937465222105, + "learning_rate": 4.399939387816951e-06, + "loss": 0.3619, + "step": 11090 + }, + { + "epoch": 0.5605919042448423, + "grad_norm": 4.267506041644642, + "learning_rate": 4.394888372562885e-06, + "loss": 0.3959, + "step": 11100 + }, + { + "epoch": 0.5610969419964142, + "grad_norm": 2.3885028725522663, + "learning_rate": 4.3898373573088195e-06, + "loss": 0.3689, + "step": 11110 + }, + { + "epoch": 0.5616019797479862, + "grad_norm": 8.297216447663791, + "learning_rate": 4.384786342054753e-06, + "loss": 0.3775, + "step": 11120 + }, + { + "epoch": 0.562107017499558, + "grad_norm": 2.8469924568130516, + "learning_rate": 4.379735326800687e-06, + "loss": 0.3898, + "step": 11130 + }, + { + "epoch": 0.56261205525113, + "grad_norm": 6.231958717493552, + "learning_rate": 4.374684311546621e-06, + "loss": 0.3837, + "step": 11140 + }, + { + "epoch": 0.563117093002702, + "grad_norm": 4.309341792972094, + "learning_rate": 4.369633296292555e-06, + "loss": 0.3828, + "step": 11150 + }, + { + "epoch": 0.5636221307542739, + "grad_norm": 3.203446767568462, + "learning_rate": 4.364582281038489e-06, + "loss": 0.3903, + "step": 11160 + }, + { + "epoch": 0.5641271685058458, + "grad_norm": 4.843549458393687, + "learning_rate": 4.359531265784423e-06, + "loss": 0.3855, + "step": 11170 + }, + { + "epoch": 0.5646322062574177, + "grad_norm": 2.525777084557924, + "learning_rate": 4.354480250530357e-06, + "loss": 0.3898, + "step": 11180 + }, + { + "epoch": 0.5651372440089897, + "grad_norm": 1.9829345093280084, + "learning_rate": 4.349429235276291e-06, + "loss": 0.3981, + "step": 11190 + }, + { + "epoch": 0.5656422817605616, + "grad_norm": 2.2195349808444162, + "learning_rate": 4.344378220022225e-06, + "loss": 0.3627, + "step": 11200 + }, + { + "epoch": 0.5661473195121335, + "grad_norm": 1.8120887020230037, + "learning_rate": 4.339327204768159e-06, + "loss": 0.3785, + "step": 11210 + }, + { + "epoch": 0.5666523572637054, + "grad_norm": 3.909050155051535, + "learning_rate": 4.334276189514093e-06, + "loss": 0.3893, + "step": 11220 + }, + { + "epoch": 0.5671573950152774, + "grad_norm": 110.40544319977835, + "learning_rate": 4.3292251742600264e-06, + "loss": 0.3835, + "step": 11230 + }, + { + "epoch": 0.5676624327668494, + "grad_norm": 2.734463901696715, + "learning_rate": 4.3241741590059606e-06, + "loss": 0.3996, + "step": 11240 + }, + { + "epoch": 0.5681674705184212, + "grad_norm": 2.044621346060538, + "learning_rate": 4.319123143751895e-06, + "loss": 0.3963, + "step": 11250 + }, + { + "epoch": 0.5686725082699932, + "grad_norm": 2.6336975727846346, + "learning_rate": 4.314072128497828e-06, + "loss": 0.3862, + "step": 11260 + }, + { + "epoch": 0.5691775460215651, + "grad_norm": 2.817059832465792, + "learning_rate": 4.309021113243763e-06, + "loss": 0.3944, + "step": 11270 + }, + { + "epoch": 0.5696825837731371, + "grad_norm": 3.118909901449835, + "learning_rate": 4.303970097989696e-06, + "loss": 0.3746, + "step": 11280 + }, + { + "epoch": 0.5701876215247089, + "grad_norm": 2.419014264509884, + "learning_rate": 4.29891908273563e-06, + "loss": 0.3756, + "step": 11290 + }, + { + "epoch": 0.5706926592762809, + "grad_norm": 2.009124011568514, + "learning_rate": 4.2938680674815645e-06, + "loss": 0.381, + "step": 11300 + }, + { + "epoch": 0.5711976970278528, + "grad_norm": 1.7912588198575379, + "learning_rate": 4.288817052227498e-06, + "loss": 0.3854, + "step": 11310 + }, + { + "epoch": 0.5717027347794248, + "grad_norm": 2.188948676804761, + "learning_rate": 4.283766036973432e-06, + "loss": 0.4011, + "step": 11320 + }, + { + "epoch": 0.5722077725309966, + "grad_norm": 5.787714109914013, + "learning_rate": 4.278715021719366e-06, + "loss": 0.3953, + "step": 11330 + }, + { + "epoch": 0.5727128102825686, + "grad_norm": 2.895115220305218, + "learning_rate": 4.2736640064653e-06, + "loss": 0.3835, + "step": 11340 + }, + { + "epoch": 0.5732178480341406, + "grad_norm": 3.0581001395729013, + "learning_rate": 4.268612991211234e-06, + "loss": 0.3765, + "step": 11350 + }, + { + "epoch": 0.5737228857857125, + "grad_norm": 2.8763642234703015, + "learning_rate": 4.2635619759571675e-06, + "loss": 0.3844, + "step": 11360 + }, + { + "epoch": 0.5742279235372844, + "grad_norm": 2.73350988345317, + "learning_rate": 4.258510960703102e-06, + "loss": 0.3838, + "step": 11370 + }, + { + "epoch": 0.5747329612888563, + "grad_norm": 2.187379524359724, + "learning_rate": 4.253459945449036e-06, + "loss": 0.3819, + "step": 11380 + }, + { + "epoch": 0.5752379990404283, + "grad_norm": 2.5046900898952518, + "learning_rate": 4.248408930194969e-06, + "loss": 0.3959, + "step": 11390 + }, + { + "epoch": 0.5757430367920002, + "grad_norm": 2.058073939600919, + "learning_rate": 4.243357914940903e-06, + "loss": 0.3931, + "step": 11400 + }, + { + "epoch": 0.5762480745435722, + "grad_norm": 2.15231990076239, + "learning_rate": 4.238306899686837e-06, + "loss": 0.3755, + "step": 11410 + }, + { + "epoch": 0.576753112295144, + "grad_norm": 2.050683275677417, + "learning_rate": 4.233255884432771e-06, + "loss": 0.3736, + "step": 11420 + }, + { + "epoch": 0.577258150046716, + "grad_norm": 2.6748692786354624, + "learning_rate": 4.2282048691787055e-06, + "loss": 0.3766, + "step": 11430 + }, + { + "epoch": 0.577763187798288, + "grad_norm": 3.596459258619974, + "learning_rate": 4.223153853924639e-06, + "loss": 0.3798, + "step": 11440 + }, + { + "epoch": 0.5782682255498599, + "grad_norm": 3.0261933816595987, + "learning_rate": 4.218102838670573e-06, + "loss": 0.3958, + "step": 11450 + }, + { + "epoch": 0.5787732633014318, + "grad_norm": 5.146945239564686, + "learning_rate": 4.213051823416507e-06, + "loss": 0.3725, + "step": 11460 + }, + { + "epoch": 0.5792783010530037, + "grad_norm": 2.4698008594522896, + "learning_rate": 4.20800080816244e-06, + "loss": 0.376, + "step": 11470 + }, + { + "epoch": 0.5797833388045757, + "grad_norm": 2.107316320060359, + "learning_rate": 4.202949792908375e-06, + "loss": 0.3629, + "step": 11480 + }, + { + "epoch": 0.5802883765561476, + "grad_norm": 2.0980153420904575, + "learning_rate": 4.197898777654309e-06, + "loss": 0.4014, + "step": 11490 + }, + { + "epoch": 0.5807934143077195, + "grad_norm": 3.2058729016983136, + "learning_rate": 4.192847762400243e-06, + "loss": 0.3828, + "step": 11500 + }, + { + "epoch": 0.5812984520592914, + "grad_norm": 2.71859111643498, + "learning_rate": 4.187796747146177e-06, + "loss": 0.3819, + "step": 11510 + }, + { + "epoch": 0.5818034898108634, + "grad_norm": 2.4534281139850793, + "learning_rate": 4.182745731892111e-06, + "loss": 0.3928, + "step": 11520 + }, + { + "epoch": 0.5823085275624353, + "grad_norm": 1.8301553116699396, + "learning_rate": 4.177694716638044e-06, + "loss": 0.3654, + "step": 11530 + }, + { + "epoch": 0.5828135653140072, + "grad_norm": 4.477875878428584, + "learning_rate": 4.172643701383979e-06, + "loss": 0.3748, + "step": 11540 + }, + { + "epoch": 0.5833186030655791, + "grad_norm": 13.967741045162365, + "learning_rate": 4.1675926861299125e-06, + "loss": 0.3756, + "step": 11550 + }, + { + "epoch": 0.5838236408171511, + "grad_norm": 3.872741784787074, + "learning_rate": 4.162541670875847e-06, + "loss": 0.3771, + "step": 11560 + }, + { + "epoch": 0.5843286785687231, + "grad_norm": 4.469317092124396, + "learning_rate": 4.157490655621781e-06, + "loss": 0.374, + "step": 11570 + }, + { + "epoch": 0.5848337163202949, + "grad_norm": 4.299796050846851, + "learning_rate": 4.152439640367714e-06, + "loss": 0.3721, + "step": 11580 + }, + { + "epoch": 0.5853387540718669, + "grad_norm": 2.72784485464612, + "learning_rate": 4.147388625113648e-06, + "loss": 0.3752, + "step": 11590 + }, + { + "epoch": 0.5858437918234388, + "grad_norm": 2.4059258940294574, + "learning_rate": 4.142337609859582e-06, + "loss": 0.3725, + "step": 11600 + }, + { + "epoch": 0.5863488295750108, + "grad_norm": 3.311734782523416, + "learning_rate": 4.137286594605516e-06, + "loss": 0.3798, + "step": 11610 + }, + { + "epoch": 0.5868538673265826, + "grad_norm": 3.792159108792209, + "learning_rate": 4.1322355793514505e-06, + "loss": 0.3611, + "step": 11620 + }, + { + "epoch": 0.5873589050781546, + "grad_norm": 3.7603434596067684, + "learning_rate": 4.127184564097384e-06, + "loss": 0.3807, + "step": 11630 + }, + { + "epoch": 0.5878639428297265, + "grad_norm": 2.082805585556261, + "learning_rate": 4.122133548843318e-06, + "loss": 0.3713, + "step": 11640 + }, + { + "epoch": 0.5883689805812985, + "grad_norm": 3.7335861923532847, + "learning_rate": 4.117082533589252e-06, + "loss": 0.37, + "step": 11650 + }, + { + "epoch": 0.5888740183328703, + "grad_norm": 2.56465711212247, + "learning_rate": 4.112031518335185e-06, + "loss": 0.3902, + "step": 11660 + }, + { + "epoch": 0.5893790560844423, + "grad_norm": 2.1557448667197843, + "learning_rate": 4.106980503081119e-06, + "loss": 0.3943, + "step": 11670 + }, + { + "epoch": 0.5898840938360143, + "grad_norm": 4.763988402275346, + "learning_rate": 4.1019294878270535e-06, + "loss": 0.3775, + "step": 11680 + }, + { + "epoch": 0.5903891315875862, + "grad_norm": 5.278109696777853, + "learning_rate": 4.096878472572988e-06, + "loss": 0.3893, + "step": 11690 + }, + { + "epoch": 0.5908941693391581, + "grad_norm": 2.6015113581109466, + "learning_rate": 4.091827457318922e-06, + "loss": 0.3746, + "step": 11700 + }, + { + "epoch": 0.59139920709073, + "grad_norm": 2.851515770660457, + "learning_rate": 4.086776442064855e-06, + "loss": 0.3819, + "step": 11710 + }, + { + "epoch": 0.591904244842302, + "grad_norm": 2.5969899871366215, + "learning_rate": 4.081725426810789e-06, + "loss": 0.3728, + "step": 11720 + }, + { + "epoch": 0.5924092825938739, + "grad_norm": 1.9757415738509565, + "learning_rate": 4.076674411556723e-06, + "loss": 0.3724, + "step": 11730 + }, + { + "epoch": 0.5929143203454458, + "grad_norm": 3.277581750970118, + "learning_rate": 4.071623396302657e-06, + "loss": 0.3688, + "step": 11740 + }, + { + "epoch": 0.5934193580970177, + "grad_norm": 4.8469992620214155, + "learning_rate": 4.0665723810485916e-06, + "loss": 0.3562, + "step": 11750 + }, + { + "epoch": 0.5939243958485897, + "grad_norm": 2.323267430569943, + "learning_rate": 4.061521365794525e-06, + "loss": 0.3907, + "step": 11760 + }, + { + "epoch": 0.5944294336001616, + "grad_norm": 1.9799231963200372, + "learning_rate": 4.056470350540459e-06, + "loss": 0.3894, + "step": 11770 + }, + { + "epoch": 0.5949344713517335, + "grad_norm": 2.576381259976751, + "learning_rate": 4.051419335286393e-06, + "loss": 0.3709, + "step": 11780 + }, + { + "epoch": 0.5954395091033055, + "grad_norm": 3.283926951217961, + "learning_rate": 4.046368320032326e-06, + "loss": 0.3938, + "step": 11790 + }, + { + "epoch": 0.5959445468548774, + "grad_norm": 3.29273998482733, + "learning_rate": 4.0413173047782605e-06, + "loss": 0.3896, + "step": 11800 + }, + { + "epoch": 0.5964495846064494, + "grad_norm": 2.332643126609973, + "learning_rate": 4.036266289524195e-06, + "loss": 0.3719, + "step": 11810 + }, + { + "epoch": 0.5969546223580212, + "grad_norm": 3.2558173835297963, + "learning_rate": 4.031215274270129e-06, + "loss": 0.3741, + "step": 11820 + }, + { + "epoch": 0.5974596601095932, + "grad_norm": 6.835841939224201, + "learning_rate": 4.026164259016063e-06, + "loss": 0.363, + "step": 11830 + }, + { + "epoch": 0.5979646978611651, + "grad_norm": 2.1914586837180687, + "learning_rate": 4.021113243761996e-06, + "loss": 0.3618, + "step": 11840 + }, + { + "epoch": 0.5984697356127371, + "grad_norm": 2.360393516962277, + "learning_rate": 4.01606222850793e-06, + "loss": 0.3726, + "step": 11850 + }, + { + "epoch": 0.5989747733643089, + "grad_norm": 2.46922458613829, + "learning_rate": 4.011011213253864e-06, + "loss": 0.3649, + "step": 11860 + }, + { + "epoch": 0.5994798111158809, + "grad_norm": 1.886510326818929, + "learning_rate": 4.005960197999798e-06, + "loss": 0.371, + "step": 11870 + }, + { + "epoch": 0.5999848488674528, + "grad_norm": 4.097224486709572, + "learning_rate": 4.000909182745733e-06, + "loss": 0.3944, + "step": 11880 + }, + { + "epoch": 0.6004898866190248, + "grad_norm": 7.100119835211903, + "learning_rate": 3.995858167491667e-06, + "loss": 0.3665, + "step": 11890 + }, + { + "epoch": 0.6009949243705967, + "grad_norm": 1.7863799930508097, + "learning_rate": 3.9908071522376e-06, + "loss": 0.3748, + "step": 11900 + }, + { + "epoch": 0.6014999621221686, + "grad_norm": 3.0047977958425487, + "learning_rate": 3.985756136983534e-06, + "loss": 0.3869, + "step": 11910 + }, + { + "epoch": 0.6020049998737406, + "grad_norm": 3.077659231433922, + "learning_rate": 3.980705121729468e-06, + "loss": 0.3618, + "step": 11920 + }, + { + "epoch": 0.6025100376253125, + "grad_norm": 5.808559896098692, + "learning_rate": 3.9756541064754015e-06, + "loss": 0.3735, + "step": 11930 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 2.1197853890011285, + "learning_rate": 3.970603091221336e-06, + "loss": 0.3901, + "step": 11940 + }, + { + "epoch": 0.6035201131284563, + "grad_norm": 2.0357339213647054, + "learning_rate": 3.96555207596727e-06, + "loss": 0.3768, + "step": 11950 + }, + { + "epoch": 0.6040251508800283, + "grad_norm": 1.9386614623259042, + "learning_rate": 3.960501060713204e-06, + "loss": 0.371, + "step": 11960 + }, + { + "epoch": 0.6045301886316002, + "grad_norm": 3.573185030358563, + "learning_rate": 3.955450045459138e-06, + "loss": 0.3748, + "step": 11970 + }, + { + "epoch": 0.6050352263831721, + "grad_norm": 2.6490276092763225, + "learning_rate": 3.950399030205071e-06, + "loss": 0.3907, + "step": 11980 + }, + { + "epoch": 0.605540264134744, + "grad_norm": 1.9765526236319297, + "learning_rate": 3.9453480149510054e-06, + "loss": 0.3851, + "step": 11990 + }, + { + "epoch": 0.606045301886316, + "grad_norm": 2.6148181435511866, + "learning_rate": 3.9402969996969396e-06, + "loss": 0.3829, + "step": 12000 + }, + { + "epoch": 0.606550339637888, + "grad_norm": 2.0137917601843194, + "learning_rate": 3.935245984442873e-06, + "loss": 0.3876, + "step": 12010 + }, + { + "epoch": 0.6070553773894599, + "grad_norm": 2.183286213214996, + "learning_rate": 3.930194969188808e-06, + "loss": 0.3852, + "step": 12020 + }, + { + "epoch": 0.6075604151410318, + "grad_norm": 1.8996262451525328, + "learning_rate": 3.925143953934741e-06, + "loss": 0.3737, + "step": 12030 + }, + { + "epoch": 0.6080654528926037, + "grad_norm": 2.1458215777612293, + "learning_rate": 3.920092938680675e-06, + "loss": 0.3705, + "step": 12040 + }, + { + "epoch": 0.6085704906441757, + "grad_norm": 1.611205289782356, + "learning_rate": 3.915041923426609e-06, + "loss": 0.3774, + "step": 12050 + }, + { + "epoch": 0.6090755283957476, + "grad_norm": 1.8784320811926083, + "learning_rate": 3.909990908172543e-06, + "loss": 0.3705, + "step": 12060 + }, + { + "epoch": 0.6095805661473195, + "grad_norm": 2.0214114750705283, + "learning_rate": 3.904939892918477e-06, + "loss": 0.3579, + "step": 12070 + }, + { + "epoch": 0.6100856038988914, + "grad_norm": 2.52455375915045, + "learning_rate": 3.899888877664411e-06, + "loss": 0.3757, + "step": 12080 + }, + { + "epoch": 0.6105906416504634, + "grad_norm": 2.7488427549157963, + "learning_rate": 3.894837862410345e-06, + "loss": 0.3753, + "step": 12090 + }, + { + "epoch": 0.6110956794020354, + "grad_norm": 2.366101694404488, + "learning_rate": 3.889786847156279e-06, + "loss": 0.3583, + "step": 12100 + }, + { + "epoch": 0.6116007171536072, + "grad_norm": 4.526955084775627, + "learning_rate": 3.884735831902212e-06, + "loss": 0.373, + "step": 12110 + }, + { + "epoch": 0.6121057549051792, + "grad_norm": 1.6632942025151354, + "learning_rate": 3.8796848166481465e-06, + "loss": 0.3797, + "step": 12120 + }, + { + "epoch": 0.6126107926567511, + "grad_norm": 2.122544662038796, + "learning_rate": 3.874633801394081e-06, + "loss": 0.3763, + "step": 12130 + }, + { + "epoch": 0.6131158304083231, + "grad_norm": 2.136310843503549, + "learning_rate": 3.869582786140014e-06, + "loss": 0.3822, + "step": 12140 + }, + { + "epoch": 0.6136208681598949, + "grad_norm": 1.701221883208955, + "learning_rate": 3.864531770885949e-06, + "loss": 0.3636, + "step": 12150 + }, + { + "epoch": 0.6141259059114669, + "grad_norm": 1.9524548661829169, + "learning_rate": 3.859480755631882e-06, + "loss": 0.3913, + "step": 12160 + }, + { + "epoch": 0.6146309436630388, + "grad_norm": 2.58084193291891, + "learning_rate": 3.854429740377816e-06, + "loss": 0.3647, + "step": 12170 + }, + { + "epoch": 0.6151359814146108, + "grad_norm": 2.6683083927745552, + "learning_rate": 3.84937872512375e-06, + "loss": 0.3834, + "step": 12180 + }, + { + "epoch": 0.6156410191661826, + "grad_norm": 5.415821035201854, + "learning_rate": 3.844327709869684e-06, + "loss": 0.3679, + "step": 12190 + }, + { + "epoch": 0.6161460569177546, + "grad_norm": 1.7693494201780369, + "learning_rate": 3.839276694615618e-06, + "loss": 0.3675, + "step": 12200 + }, + { + "epoch": 0.6166510946693265, + "grad_norm": 2.025826626656601, + "learning_rate": 3.834225679361552e-06, + "loss": 0.3807, + "step": 12210 + }, + { + "epoch": 0.6171561324208985, + "grad_norm": 2.409768770452221, + "learning_rate": 3.829174664107486e-06, + "loss": 0.3718, + "step": 12220 + }, + { + "epoch": 0.6176611701724704, + "grad_norm": 2.81111834040087, + "learning_rate": 3.82412364885342e-06, + "loss": 0.3809, + "step": 12230 + }, + { + "epoch": 0.6181662079240423, + "grad_norm": 2.261752998915838, + "learning_rate": 3.8190726335993535e-06, + "loss": 0.392, + "step": 12240 + }, + { + "epoch": 0.6186712456756143, + "grad_norm": 4.1776939292020066, + "learning_rate": 3.8140216183452876e-06, + "loss": 0.3738, + "step": 12250 + }, + { + "epoch": 0.6191762834271862, + "grad_norm": 2.1437198828433197, + "learning_rate": 3.8089706030912217e-06, + "loss": 0.3719, + "step": 12260 + }, + { + "epoch": 0.6196813211787581, + "grad_norm": 3.075348229616783, + "learning_rate": 3.8039195878371554e-06, + "loss": 0.3706, + "step": 12270 + }, + { + "epoch": 0.62018635893033, + "grad_norm": 2.0146213511322566, + "learning_rate": 3.798868572583089e-06, + "loss": 0.3897, + "step": 12280 + }, + { + "epoch": 0.620691396681902, + "grad_norm": 3.0389012573946625, + "learning_rate": 3.7938175573290236e-06, + "loss": 0.3744, + "step": 12290 + }, + { + "epoch": 0.6211964344334739, + "grad_norm": 3.500847066826284, + "learning_rate": 3.7887665420749573e-06, + "loss": 0.3645, + "step": 12300 + }, + { + "epoch": 0.6217014721850458, + "grad_norm": 4.562441605553007, + "learning_rate": 3.7837155268208915e-06, + "loss": 0.3738, + "step": 12310 + }, + { + "epoch": 0.6222065099366177, + "grad_norm": 2.8281857847564957, + "learning_rate": 3.778664511566825e-06, + "loss": 0.3645, + "step": 12320 + }, + { + "epoch": 0.6227115476881897, + "grad_norm": 4.569461603791169, + "learning_rate": 3.7736134963127593e-06, + "loss": 0.3525, + "step": 12330 + }, + { + "epoch": 0.6232165854397617, + "grad_norm": 3.3932139149292198, + "learning_rate": 3.768562481058693e-06, + "loss": 0.3686, + "step": 12340 + }, + { + "epoch": 0.6237216231913335, + "grad_norm": 3.208108830968236, + "learning_rate": 3.7635114658046267e-06, + "loss": 0.3675, + "step": 12350 + }, + { + "epoch": 0.6242266609429055, + "grad_norm": 3.325030777041727, + "learning_rate": 3.7584604505505612e-06, + "loss": 0.3827, + "step": 12360 + }, + { + "epoch": 0.6247316986944774, + "grad_norm": 3.001621471243721, + "learning_rate": 3.753409435296495e-06, + "loss": 0.3673, + "step": 12370 + }, + { + "epoch": 0.6252367364460494, + "grad_norm": 2.7314438383701263, + "learning_rate": 3.748358420042429e-06, + "loss": 0.3831, + "step": 12380 + }, + { + "epoch": 0.6257417741976212, + "grad_norm": 4.37429312079951, + "learning_rate": 3.7433074047883628e-06, + "loss": 0.3665, + "step": 12390 + }, + { + "epoch": 0.6262468119491932, + "grad_norm": 3.9619988373008077, + "learning_rate": 3.7382563895342965e-06, + "loss": 0.3747, + "step": 12400 + }, + { + "epoch": 0.6267518497007651, + "grad_norm": 4.80089629937948, + "learning_rate": 3.7332053742802306e-06, + "loss": 0.3777, + "step": 12410 + }, + { + "epoch": 0.6272568874523371, + "grad_norm": 3.8675160835398485, + "learning_rate": 3.7281543590261643e-06, + "loss": 0.3644, + "step": 12420 + }, + { + "epoch": 0.6277619252039089, + "grad_norm": 2.235819960025121, + "learning_rate": 3.723103343772099e-06, + "loss": 0.37, + "step": 12430 + }, + { + "epoch": 0.6282669629554809, + "grad_norm": 2.1802012458708164, + "learning_rate": 3.7180523285180325e-06, + "loss": 0.3783, + "step": 12440 + }, + { + "epoch": 0.6287720007070529, + "grad_norm": 5.254753451223298, + "learning_rate": 3.7130013132639662e-06, + "loss": 0.3778, + "step": 12450 + }, + { + "epoch": 0.6292770384586248, + "grad_norm": 3.0569298328433794, + "learning_rate": 3.7079502980099004e-06, + "loss": 0.3708, + "step": 12460 + }, + { + "epoch": 0.6297820762101967, + "grad_norm": 3.453756701658818, + "learning_rate": 3.702899282755834e-06, + "loss": 0.3614, + "step": 12470 + }, + { + "epoch": 0.6302871139617686, + "grad_norm": 3.0406418341037105, + "learning_rate": 3.6978482675017678e-06, + "loss": 0.3605, + "step": 12480 + }, + { + "epoch": 0.6307921517133406, + "grad_norm": 5.948695461444471, + "learning_rate": 3.6927972522477023e-06, + "loss": 0.3769, + "step": 12490 + }, + { + "epoch": 0.6312971894649125, + "grad_norm": 3.6565401647116986, + "learning_rate": 3.687746236993636e-06, + "loss": 0.3598, + "step": 12500 + }, + { + "epoch": 0.6318022272164844, + "grad_norm": 2.386153355802283, + "learning_rate": 3.68269522173957e-06, + "loss": 0.3668, + "step": 12510 + }, + { + "epoch": 0.6323072649680563, + "grad_norm": 5.317495105141212, + "learning_rate": 3.677644206485504e-06, + "loss": 0.3678, + "step": 12520 + }, + { + "epoch": 0.6328123027196283, + "grad_norm": 6.576082498743726, + "learning_rate": 3.672593191231438e-06, + "loss": 0.3668, + "step": 12530 + }, + { + "epoch": 0.6333173404712003, + "grad_norm": 6.268613985781669, + "learning_rate": 3.6675421759773717e-06, + "loss": 0.3635, + "step": 12540 + }, + { + "epoch": 0.6338223782227721, + "grad_norm": 5.931534215833603, + "learning_rate": 3.6624911607233054e-06, + "loss": 0.3836, + "step": 12550 + }, + { + "epoch": 0.634327415974344, + "grad_norm": 3.414474702810524, + "learning_rate": 3.65744014546924e-06, + "loss": 0.3638, + "step": 12560 + }, + { + "epoch": 0.634832453725916, + "grad_norm": 2.5786921065802906, + "learning_rate": 3.6523891302151736e-06, + "loss": 0.3792, + "step": 12570 + }, + { + "epoch": 0.635337491477488, + "grad_norm": 3.3296465725491973, + "learning_rate": 3.6473381149611077e-06, + "loss": 0.3726, + "step": 12580 + }, + { + "epoch": 0.6358425292290599, + "grad_norm": 5.112120958135022, + "learning_rate": 3.6422870997070414e-06, + "loss": 0.3608, + "step": 12590 + }, + { + "epoch": 0.6363475669806318, + "grad_norm": 4.712824755681933, + "learning_rate": 3.637236084452975e-06, + "loss": 0.361, + "step": 12600 + }, + { + "epoch": 0.6368526047322037, + "grad_norm": 2.557395610295011, + "learning_rate": 3.6321850691989093e-06, + "loss": 0.3646, + "step": 12610 + }, + { + "epoch": 0.6373576424837757, + "grad_norm": 2.4265764249895962, + "learning_rate": 3.627134053944843e-06, + "loss": 0.3673, + "step": 12620 + }, + { + "epoch": 0.6378626802353476, + "grad_norm": 9.111738338010397, + "learning_rate": 3.6220830386907775e-06, + "loss": 0.3748, + "step": 12630 + }, + { + "epoch": 0.6383677179869195, + "grad_norm": 13.21927265302919, + "learning_rate": 3.617032023436711e-06, + "loss": 0.3736, + "step": 12640 + }, + { + "epoch": 0.6388727557384914, + "grad_norm": 3.433344571738311, + "learning_rate": 3.611981008182645e-06, + "loss": 0.3804, + "step": 12650 + }, + { + "epoch": 0.6393777934900634, + "grad_norm": 3.3546640582148393, + "learning_rate": 3.606929992928579e-06, + "loss": 0.3512, + "step": 12660 + }, + { + "epoch": 0.6398828312416354, + "grad_norm": 2.665395597695038, + "learning_rate": 3.6018789776745127e-06, + "loss": 0.3777, + "step": 12670 + }, + { + "epoch": 0.6403878689932072, + "grad_norm": 3.7554388445793343, + "learning_rate": 3.5968279624204464e-06, + "loss": 0.3745, + "step": 12680 + }, + { + "epoch": 0.6408929067447792, + "grad_norm": 5.3470856792032695, + "learning_rate": 3.5917769471663805e-06, + "loss": 0.3712, + "step": 12690 + }, + { + "epoch": 0.6413979444963511, + "grad_norm": 3.090410688880816, + "learning_rate": 3.586725931912315e-06, + "loss": 0.3621, + "step": 12700 + }, + { + "epoch": 0.6419029822479231, + "grad_norm": 2.664925787957406, + "learning_rate": 3.581674916658249e-06, + "loss": 0.3543, + "step": 12710 + }, + { + "epoch": 0.6424080199994949, + "grad_norm": 2.449259146215423, + "learning_rate": 3.5766239014041825e-06, + "loss": 0.3637, + "step": 12720 + }, + { + "epoch": 0.6429130577510669, + "grad_norm": 3.159318328572562, + "learning_rate": 3.5715728861501166e-06, + "loss": 0.3793, + "step": 12730 + }, + { + "epoch": 0.6434180955026388, + "grad_norm": 2.7133150428468835, + "learning_rate": 3.5665218708960503e-06, + "loss": 0.3798, + "step": 12740 + }, + { + "epoch": 0.6439231332542108, + "grad_norm": 3.374200452364885, + "learning_rate": 3.561470855641984e-06, + "loss": 0.3624, + "step": 12750 + }, + { + "epoch": 0.6444281710057826, + "grad_norm": 2.9622292324304365, + "learning_rate": 3.556419840387918e-06, + "loss": 0.3553, + "step": 12760 + }, + { + "epoch": 0.6449332087573546, + "grad_norm": 2.609047480998997, + "learning_rate": 3.5513688251338523e-06, + "loss": 0.3721, + "step": 12770 + }, + { + "epoch": 0.6454382465089266, + "grad_norm": 5.058720798344808, + "learning_rate": 3.5463178098797864e-06, + "loss": 0.3486, + "step": 12780 + }, + { + "epoch": 0.6459432842604985, + "grad_norm": 2.2596797012742558, + "learning_rate": 3.54126679462572e-06, + "loss": 0.3696, + "step": 12790 + }, + { + "epoch": 0.6464483220120704, + "grad_norm": 2.5737345357996815, + "learning_rate": 3.536215779371654e-06, + "loss": 0.3534, + "step": 12800 + }, + { + "epoch": 0.6469533597636423, + "grad_norm": 3.9591498502643305, + "learning_rate": 3.531164764117588e-06, + "loss": 0.3635, + "step": 12810 + }, + { + "epoch": 0.6474583975152143, + "grad_norm": 4.5793714339332805, + "learning_rate": 3.5261137488635216e-06, + "loss": 0.3633, + "step": 12820 + }, + { + "epoch": 0.6479634352667862, + "grad_norm": 2.4696926794812173, + "learning_rate": 3.521062733609456e-06, + "loss": 0.3681, + "step": 12830 + }, + { + "epoch": 0.6484684730183581, + "grad_norm": 10.025961417450299, + "learning_rate": 3.51601171835539e-06, + "loss": 0.3645, + "step": 12840 + }, + { + "epoch": 0.64897351076993, + "grad_norm": 2.2863708800017934, + "learning_rate": 3.5109607031013236e-06, + "loss": 0.3686, + "step": 12850 + }, + { + "epoch": 0.649478548521502, + "grad_norm": 1.9177686162549965, + "learning_rate": 3.5059096878472577e-06, + "loss": 0.3673, + "step": 12860 + }, + { + "epoch": 0.649983586273074, + "grad_norm": 5.445065512861327, + "learning_rate": 3.5008586725931914e-06, + "loss": 0.3801, + "step": 12870 + }, + { + "epoch": 0.6504886240246458, + "grad_norm": 2.6676713316438336, + "learning_rate": 3.4958076573391255e-06, + "loss": 0.3779, + "step": 12880 + }, + { + "epoch": 0.6509936617762178, + "grad_norm": 3.8611942525569374, + "learning_rate": 3.490756642085059e-06, + "loss": 0.375, + "step": 12890 + }, + { + "epoch": 0.6514986995277897, + "grad_norm": 3.774007520734188, + "learning_rate": 3.4857056268309938e-06, + "loss": 0.3768, + "step": 12900 + }, + { + "epoch": 0.6520037372793617, + "grad_norm": 3.200552688232275, + "learning_rate": 3.4806546115769275e-06, + "loss": 0.3831, + "step": 12910 + }, + { + "epoch": 0.6525087750309335, + "grad_norm": 1.9098229490724261, + "learning_rate": 3.475603596322861e-06, + "loss": 0.379, + "step": 12920 + }, + { + "epoch": 0.6530138127825055, + "grad_norm": 3.943477029717605, + "learning_rate": 3.4705525810687953e-06, + "loss": 0.3712, + "step": 12930 + }, + { + "epoch": 0.6535188505340774, + "grad_norm": 4.9375810935188165, + "learning_rate": 3.465501565814729e-06, + "loss": 0.3727, + "step": 12940 + }, + { + "epoch": 0.6540238882856494, + "grad_norm": 5.033670107006043, + "learning_rate": 3.4604505505606627e-06, + "loss": 0.3663, + "step": 12950 + }, + { + "epoch": 0.6545289260372212, + "grad_norm": 3.064561646751532, + "learning_rate": 3.455399535306597e-06, + "loss": 0.3777, + "step": 12960 + }, + { + "epoch": 0.6550339637887932, + "grad_norm": 2.3278833688543594, + "learning_rate": 3.450348520052531e-06, + "loss": 0.3539, + "step": 12970 + }, + { + "epoch": 0.6555390015403652, + "grad_norm": 2.194839952891626, + "learning_rate": 3.445297504798465e-06, + "loss": 0.3623, + "step": 12980 + }, + { + "epoch": 0.6560440392919371, + "grad_norm": 4.478106964413342, + "learning_rate": 3.4402464895443987e-06, + "loss": 0.3717, + "step": 12990 + }, + { + "epoch": 0.656549077043509, + "grad_norm": 3.862763827323303, + "learning_rate": 3.4351954742903324e-06, + "loss": 0.3497, + "step": 13000 + }, + { + "epoch": 0.6570541147950809, + "grad_norm": 6.994733245951842, + "learning_rate": 3.4301444590362666e-06, + "loss": 0.3814, + "step": 13010 + }, + { + "epoch": 0.6575591525466529, + "grad_norm": 5.798821632935566, + "learning_rate": 3.4250934437822003e-06, + "loss": 0.3632, + "step": 13020 + }, + { + "epoch": 0.6580641902982248, + "grad_norm": 4.04056214450326, + "learning_rate": 3.420042428528134e-06, + "loss": 0.3848, + "step": 13030 + }, + { + "epoch": 0.6585692280497967, + "grad_norm": 2.567673036911413, + "learning_rate": 3.4149914132740685e-06, + "loss": 0.3613, + "step": 13040 + }, + { + "epoch": 0.6590742658013686, + "grad_norm": 4.569693836912058, + "learning_rate": 3.4099403980200022e-06, + "loss": 0.3689, + "step": 13050 + }, + { + "epoch": 0.6595793035529406, + "grad_norm": 7.024167809375034, + "learning_rate": 3.4048893827659363e-06, + "loss": 0.3752, + "step": 13060 + }, + { + "epoch": 0.6600843413045125, + "grad_norm": 15.307342736787076, + "learning_rate": 3.39983836751187e-06, + "loss": 0.3563, + "step": 13070 + }, + { + "epoch": 0.6605893790560844, + "grad_norm": 5.154751774553765, + "learning_rate": 3.394787352257804e-06, + "loss": 0.374, + "step": 13080 + }, + { + "epoch": 0.6610944168076563, + "grad_norm": 6.4670558523466894, + "learning_rate": 3.389736337003738e-06, + "loss": 0.3662, + "step": 13090 + }, + { + "epoch": 0.6615994545592283, + "grad_norm": 4.631202509755095, + "learning_rate": 3.3846853217496724e-06, + "loss": 0.3719, + "step": 13100 + }, + { + "epoch": 0.6621044923108003, + "grad_norm": 2.638649824542641, + "learning_rate": 3.379634306495606e-06, + "loss": 0.3649, + "step": 13110 + }, + { + "epoch": 0.6626095300623721, + "grad_norm": 4.93421678581268, + "learning_rate": 3.37458329124154e-06, + "loss": 0.3583, + "step": 13120 + }, + { + "epoch": 0.6631145678139441, + "grad_norm": 8.596147858750863, + "learning_rate": 3.369532275987474e-06, + "loss": 0.3757, + "step": 13130 + }, + { + "epoch": 0.663619605565516, + "grad_norm": 2.389686125334587, + "learning_rate": 3.3644812607334076e-06, + "loss": 0.3665, + "step": 13140 + }, + { + "epoch": 0.664124643317088, + "grad_norm": 7.517201643559354, + "learning_rate": 3.3594302454793413e-06, + "loss": 0.3772, + "step": 13150 + }, + { + "epoch": 0.6646296810686598, + "grad_norm": 3.2962729985486843, + "learning_rate": 3.3543792302252755e-06, + "loss": 0.3662, + "step": 13160 + }, + { + "epoch": 0.6651347188202318, + "grad_norm": 2.696587145771749, + "learning_rate": 3.3493282149712096e-06, + "loss": 0.3784, + "step": 13170 + }, + { + "epoch": 0.6656397565718037, + "grad_norm": 9.007208510801052, + "learning_rate": 3.3442771997171437e-06, + "loss": 0.3754, + "step": 13180 + }, + { + "epoch": 0.6661447943233757, + "grad_norm": 3.6378171433319384, + "learning_rate": 3.3392261844630774e-06, + "loss": 0.36, + "step": 13190 + }, + { + "epoch": 0.6666498320749477, + "grad_norm": 5.542336432011706, + "learning_rate": 3.334175169209011e-06, + "loss": 0.3594, + "step": 13200 + }, + { + "epoch": 0.6671548698265195, + "grad_norm": 3.865761953639028, + "learning_rate": 3.3291241539549452e-06, + "loss": 0.3627, + "step": 13210 + }, + { + "epoch": 0.6676599075780915, + "grad_norm": 8.688331722475619, + "learning_rate": 3.324073138700879e-06, + "loss": 0.3583, + "step": 13220 + }, + { + "epoch": 0.6681649453296634, + "grad_norm": 7.506416675568487, + "learning_rate": 3.319022123446813e-06, + "loss": 0.389, + "step": 13230 + }, + { + "epoch": 0.6686699830812354, + "grad_norm": 5.431932592827668, + "learning_rate": 3.313971108192747e-06, + "loss": 0.3544, + "step": 13240 + }, + { + "epoch": 0.6691750208328072, + "grad_norm": 5.709981208456748, + "learning_rate": 3.3089200929386813e-06, + "loss": 0.3872, + "step": 13250 + }, + { + "epoch": 0.6696800585843792, + "grad_norm": 3.72615004588395, + "learning_rate": 3.303869077684615e-06, + "loss": 0.3694, + "step": 13260 + }, + { + "epoch": 0.6701850963359511, + "grad_norm": 4.672631711696132, + "learning_rate": 3.2988180624305487e-06, + "loss": 0.3552, + "step": 13270 + }, + { + "epoch": 0.6706901340875231, + "grad_norm": 4.166097239004073, + "learning_rate": 3.293767047176483e-06, + "loss": 0.3705, + "step": 13280 + }, + { + "epoch": 0.6711951718390949, + "grad_norm": 9.195919275508842, + "learning_rate": 3.2887160319224165e-06, + "loss": 0.3688, + "step": 13290 + }, + { + "epoch": 0.6717002095906669, + "grad_norm": 6.6636791033610185, + "learning_rate": 3.2836650166683502e-06, + "loss": 0.3621, + "step": 13300 + }, + { + "epoch": 0.6722052473422389, + "grad_norm": 5.256124870002979, + "learning_rate": 3.2786140014142848e-06, + "loss": 0.3611, + "step": 13310 + }, + { + "epoch": 0.6727102850938108, + "grad_norm": 7.691842196761616, + "learning_rate": 3.2735629861602185e-06, + "loss": 0.3634, + "step": 13320 + }, + { + "epoch": 0.6732153228453827, + "grad_norm": 4.350658776477639, + "learning_rate": 3.2685119709061526e-06, + "loss": 0.3588, + "step": 13330 + }, + { + "epoch": 0.6737203605969546, + "grad_norm": 11.623446284645299, + "learning_rate": 3.2634609556520863e-06, + "loss": 0.371, + "step": 13340 + }, + { + "epoch": 0.6742253983485266, + "grad_norm": 3.656024443945701, + "learning_rate": 3.25840994039802e-06, + "loss": 0.3496, + "step": 13350 + }, + { + "epoch": 0.6747304361000985, + "grad_norm": 4.747809479838541, + "learning_rate": 3.253358925143954e-06, + "loss": 0.369, + "step": 13360 + }, + { + "epoch": 0.6752354738516704, + "grad_norm": 3.045504794009347, + "learning_rate": 3.248307909889888e-06, + "loss": 0.3675, + "step": 13370 + }, + { + "epoch": 0.6757405116032423, + "grad_norm": 4.5225509793672405, + "learning_rate": 3.2432568946358224e-06, + "loss": 0.364, + "step": 13380 + }, + { + "epoch": 0.6762455493548143, + "grad_norm": 6.376859379432204, + "learning_rate": 3.238205879381756e-06, + "loss": 0.3538, + "step": 13390 + }, + { + "epoch": 0.6767505871063862, + "grad_norm": 4.459340178610681, + "learning_rate": 3.2331548641276898e-06, + "loss": 0.3549, + "step": 13400 + }, + { + "epoch": 0.6772556248579581, + "grad_norm": 4.009290064517063, + "learning_rate": 3.228103848873624e-06, + "loss": 0.3635, + "step": 13410 + }, + { + "epoch": 0.67776066260953, + "grad_norm": 4.352335627454303, + "learning_rate": 3.2230528336195576e-06, + "loss": 0.3642, + "step": 13420 + }, + { + "epoch": 0.678265700361102, + "grad_norm": 8.538415021418777, + "learning_rate": 3.2180018183654917e-06, + "loss": 0.3638, + "step": 13430 + }, + { + "epoch": 0.678770738112674, + "grad_norm": 9.929796560384965, + "learning_rate": 3.212950803111426e-06, + "loss": 0.3733, + "step": 13440 + }, + { + "epoch": 0.6792757758642458, + "grad_norm": 10.24812538481589, + "learning_rate": 3.20789978785736e-06, + "loss": 0.3733, + "step": 13450 + }, + { + "epoch": 0.6797808136158178, + "grad_norm": 4.911901994444236, + "learning_rate": 3.2028487726032937e-06, + "loss": 0.3546, + "step": 13460 + }, + { + "epoch": 0.6802858513673897, + "grad_norm": 9.459161115684884, + "learning_rate": 3.1977977573492274e-06, + "loss": 0.3604, + "step": 13470 + }, + { + "epoch": 0.6807908891189617, + "grad_norm": 58.44069759388177, + "learning_rate": 3.1927467420951615e-06, + "loss": 0.3682, + "step": 13480 + }, + { + "epoch": 0.6812959268705335, + "grad_norm": 5.773542468847822, + "learning_rate": 3.187695726841095e-06, + "loss": 0.3654, + "step": 13490 + }, + { + "epoch": 0.6818009646221055, + "grad_norm": 7.236039686643189, + "learning_rate": 3.182644711587029e-06, + "loss": 0.3581, + "step": 13500 + }, + { + "epoch": 0.6823060023736774, + "grad_norm": 8.164527538084936, + "learning_rate": 3.1775936963329634e-06, + "loss": 0.3719, + "step": 13510 + }, + { + "epoch": 0.6828110401252494, + "grad_norm": 8.750460822533052, + "learning_rate": 3.172542681078897e-06, + "loss": 0.3685, + "step": 13520 + }, + { + "epoch": 0.6833160778768212, + "grad_norm": 8.31780664281577, + "learning_rate": 3.1674916658248313e-06, + "loss": 0.3738, + "step": 13530 + }, + { + "epoch": 0.6838211156283932, + "grad_norm": 39.92814930883877, + "learning_rate": 3.162440650570765e-06, + "loss": 0.3561, + "step": 13540 + }, + { + "epoch": 0.6843261533799652, + "grad_norm": 7.250183885140758, + "learning_rate": 3.1573896353166987e-06, + "loss": 0.3763, + "step": 13550 + }, + { + "epoch": 0.6848311911315371, + "grad_norm": 7.545512249578723, + "learning_rate": 3.1523386200626328e-06, + "loss": 0.3619, + "step": 13560 + }, + { + "epoch": 0.685336228883109, + "grad_norm": 13.136078405966021, + "learning_rate": 3.1472876048085665e-06, + "loss": 0.3506, + "step": 13570 + }, + { + "epoch": 0.6858412666346809, + "grad_norm": 12.847273364738184, + "learning_rate": 3.142236589554501e-06, + "loss": 0.3718, + "step": 13580 + }, + { + "epoch": 0.6863463043862529, + "grad_norm": 5.131472126489263, + "learning_rate": 3.1371855743004347e-06, + "loss": 0.3573, + "step": 13590 + }, + { + "epoch": 0.6868513421378248, + "grad_norm": 8.846171149150827, + "learning_rate": 3.132134559046369e-06, + "loss": 0.3645, + "step": 13600 + }, + { + "epoch": 0.6873563798893967, + "grad_norm": 7.08666632645302, + "learning_rate": 3.1270835437923026e-06, + "loss": 0.3863, + "step": 13610 + }, + { + "epoch": 0.6878614176409686, + "grad_norm": 9.125152763348956, + "learning_rate": 3.1220325285382363e-06, + "loss": 0.3612, + "step": 13620 + }, + { + "epoch": 0.6883664553925406, + "grad_norm": 22.795792171268904, + "learning_rate": 3.1169815132841704e-06, + "loss": 0.3514, + "step": 13630 + }, + { + "epoch": 0.6888714931441126, + "grad_norm": 13.972423486929486, + "learning_rate": 3.111930498030104e-06, + "loss": 0.343, + "step": 13640 + }, + { + "epoch": 0.6893765308956844, + "grad_norm": 7.146064054239514, + "learning_rate": 3.1068794827760386e-06, + "loss": 0.3634, + "step": 13650 + }, + { + "epoch": 0.6898815686472564, + "grad_norm": 12.633883151844028, + "learning_rate": 3.1018284675219723e-06, + "loss": 0.3622, + "step": 13660 + }, + { + "epoch": 0.6903866063988283, + "grad_norm": 10.567266022971053, + "learning_rate": 3.096777452267906e-06, + "loss": 0.3775, + "step": 13670 + }, + { + "epoch": 0.6908916441504003, + "grad_norm": 10.665161639920084, + "learning_rate": 3.09172643701384e-06, + "loss": 0.3473, + "step": 13680 + }, + { + "epoch": 0.6913966819019721, + "grad_norm": 13.421618433035983, + "learning_rate": 3.086675421759774e-06, + "loss": 0.3566, + "step": 13690 + }, + { + "epoch": 0.6919017196535441, + "grad_norm": 19.50453127590542, + "learning_rate": 3.0816244065057076e-06, + "loss": 0.3524, + "step": 13700 + }, + { + "epoch": 0.692406757405116, + "grad_norm": 11.56456458693372, + "learning_rate": 3.0765733912516417e-06, + "loss": 0.3489, + "step": 13710 + }, + { + "epoch": 0.692911795156688, + "grad_norm": 9.379959657537876, + "learning_rate": 3.071522375997576e-06, + "loss": 0.3736, + "step": 13720 + }, + { + "epoch": 0.6934168329082598, + "grad_norm": 22.486832278224945, + "learning_rate": 3.06647136074351e-06, + "loss": 0.3586, + "step": 13730 + }, + { + "epoch": 0.6939218706598318, + "grad_norm": 10.367615306255258, + "learning_rate": 3.0614203454894436e-06, + "loss": 0.3598, + "step": 13740 + }, + { + "epoch": 0.6944269084114038, + "grad_norm": 13.659141357148844, + "learning_rate": 3.0563693302353773e-06, + "loss": 0.3559, + "step": 13750 + }, + { + "epoch": 0.6949319461629757, + "grad_norm": 28.090360883092867, + "learning_rate": 3.0513183149813114e-06, + "loss": 0.3572, + "step": 13760 + }, + { + "epoch": 0.6954369839145476, + "grad_norm": 18.453465716800892, + "learning_rate": 3.046267299727245e-06, + "loss": 0.3724, + "step": 13770 + }, + { + "epoch": 0.6959420216661195, + "grad_norm": 29.69383075969026, + "learning_rate": 3.0412162844731797e-06, + "loss": 0.3715, + "step": 13780 + }, + { + "epoch": 0.6964470594176915, + "grad_norm": 5.487134314962381, + "learning_rate": 3.0361652692191134e-06, + "loss": 0.3498, + "step": 13790 + }, + { + "epoch": 0.6969520971692634, + "grad_norm": 5.672011269458868, + "learning_rate": 3.0311142539650475e-06, + "loss": 0.3681, + "step": 13800 + }, + { + "epoch": 0.6974571349208354, + "grad_norm": 4.839559671681653, + "learning_rate": 3.0260632387109812e-06, + "loss": 0.3623, + "step": 13810 + }, + { + "epoch": 0.6979621726724072, + "grad_norm": 4.235787248204053, + "learning_rate": 3.021012223456915e-06, + "loss": 0.3607, + "step": 13820 + }, + { + "epoch": 0.6984672104239792, + "grad_norm": 4.608668652433986, + "learning_rate": 3.015961208202849e-06, + "loss": 0.3477, + "step": 13830 + }, + { + "epoch": 0.6989722481755511, + "grad_norm": 4.704716321355951, + "learning_rate": 3.0109101929487827e-06, + "loss": 0.3685, + "step": 13840 + }, + { + "epoch": 0.6994772859271231, + "grad_norm": 6.503590629238218, + "learning_rate": 3.0058591776947173e-06, + "loss": 0.3721, + "step": 13850 + }, + { + "epoch": 0.699982323678695, + "grad_norm": 4.428750198778166, + "learning_rate": 3.000808162440651e-06, + "loss": 0.3712, + "step": 13860 + }, + { + "epoch": 0.7004873614302669, + "grad_norm": 8.223361489268525, + "learning_rate": 2.9957571471865847e-06, + "loss": 0.3615, + "step": 13870 + }, + { + "epoch": 0.7009923991818389, + "grad_norm": 3.8707110375759908, + "learning_rate": 2.990706131932519e-06, + "loss": 0.368, + "step": 13880 + }, + { + "epoch": 0.7014974369334108, + "grad_norm": 3.3033672047102853, + "learning_rate": 2.9856551166784525e-06, + "loss": 0.3662, + "step": 13890 + }, + { + "epoch": 0.7020024746849827, + "grad_norm": 5.8378570543101045, + "learning_rate": 2.9806041014243862e-06, + "loss": 0.3766, + "step": 13900 + }, + { + "epoch": 0.7025075124365546, + "grad_norm": 3.6651748652891425, + "learning_rate": 2.9755530861703203e-06, + "loss": 0.3661, + "step": 13910 + }, + { + "epoch": 0.7030125501881266, + "grad_norm": 5.784855966194933, + "learning_rate": 2.9705020709162545e-06, + "loss": 0.367, + "step": 13920 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 7.312016198568584, + "learning_rate": 2.9654510556621886e-06, + "loss": 0.3672, + "step": 13930 + }, + { + "epoch": 0.7040226256912704, + "grad_norm": 4.875104979534643, + "learning_rate": 2.9604000404081223e-06, + "loss": 0.3557, + "step": 13940 + }, + { + "epoch": 0.7045276634428423, + "grad_norm": 64.55120292569038, + "learning_rate": 2.955349025154056e-06, + "loss": 0.357, + "step": 13950 + }, + { + "epoch": 0.7050327011944143, + "grad_norm": 4.765695435475721, + "learning_rate": 2.95029800989999e-06, + "loss": 0.3625, + "step": 13960 + }, + { + "epoch": 0.7055377389459863, + "grad_norm": 9.706426400727505, + "learning_rate": 2.945246994645924e-06, + "loss": 0.3692, + "step": 13970 + }, + { + "epoch": 0.7060427766975581, + "grad_norm": 5.7703156609987785, + "learning_rate": 2.940195979391858e-06, + "loss": 0.3496, + "step": 13980 + }, + { + "epoch": 0.7065478144491301, + "grad_norm": 10.413216240197633, + "learning_rate": 2.935144964137792e-06, + "loss": 0.3435, + "step": 13990 + }, + { + "epoch": 0.707052852200702, + "grad_norm": 7.078239510812093, + "learning_rate": 2.930093948883726e-06, + "loss": 0.3661, + "step": 14000 + }, + { + "epoch": 0.707557889952274, + "grad_norm": 10.541563423283328, + "learning_rate": 2.92504293362966e-06, + "loss": 0.3569, + "step": 14010 + }, + { + "epoch": 0.7080629277038458, + "grad_norm": 4.886876661534467, + "learning_rate": 2.9199919183755936e-06, + "loss": 0.368, + "step": 14020 + }, + { + "epoch": 0.7085679654554178, + "grad_norm": 15.262848195897316, + "learning_rate": 2.9149409031215277e-06, + "loss": 0.3682, + "step": 14030 + }, + { + "epoch": 0.7090730032069897, + "grad_norm": 12.794932541476035, + "learning_rate": 2.9098898878674614e-06, + "loss": 0.3592, + "step": 14040 + }, + { + "epoch": 0.7095780409585617, + "grad_norm": 10.61866026419293, + "learning_rate": 2.904838872613395e-06, + "loss": 0.3604, + "step": 14050 + }, + { + "epoch": 0.7100830787101335, + "grad_norm": 25.4476962457588, + "learning_rate": 2.8997878573593296e-06, + "loss": 0.3824, + "step": 14060 + }, + { + "epoch": 0.7105881164617055, + "grad_norm": 7.669338040232453, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.3595, + "step": 14070 + }, + { + "epoch": 0.7110931542132775, + "grad_norm": 11.74527716840071, + "learning_rate": 2.8896858268511975e-06, + "loss": 0.3543, + "step": 14080 + }, + { + "epoch": 0.7115981919648494, + "grad_norm": 6.646635907284065, + "learning_rate": 2.884634811597131e-06, + "loss": 0.372, + "step": 14090 + }, + { + "epoch": 0.7121032297164213, + "grad_norm": 7.097527430156463, + "learning_rate": 2.879583796343065e-06, + "loss": 0.3551, + "step": 14100 + }, + { + "epoch": 0.7126082674679932, + "grad_norm": 13.591824996252463, + "learning_rate": 2.874532781088999e-06, + "loss": 0.3681, + "step": 14110 + }, + { + "epoch": 0.7131133052195652, + "grad_norm": 6.164884039122544, + "learning_rate": 2.869481765834933e-06, + "loss": 0.3587, + "step": 14120 + }, + { + "epoch": 0.7136183429711371, + "grad_norm": 4.715084112453213, + "learning_rate": 2.8644307505808672e-06, + "loss": 0.3573, + "step": 14130 + }, + { + "epoch": 0.714123380722709, + "grad_norm": 6.717756492187504, + "learning_rate": 2.859379735326801e-06, + "loss": 0.3686, + "step": 14140 + }, + { + "epoch": 0.7146284184742809, + "grad_norm": 3.2667847864627855, + "learning_rate": 2.854328720072735e-06, + "loss": 0.3623, + "step": 14150 + }, + { + "epoch": 0.7151334562258529, + "grad_norm": 4.553377349461034, + "learning_rate": 2.8492777048186688e-06, + "loss": 0.3696, + "step": 14160 + }, + { + "epoch": 0.7156384939774248, + "grad_norm": 6.765157372727796, + "learning_rate": 2.8442266895646025e-06, + "loss": 0.3828, + "step": 14170 + }, + { + "epoch": 0.7161435317289967, + "grad_norm": 11.922453637732414, + "learning_rate": 2.8391756743105366e-06, + "loss": 0.362, + "step": 14180 + }, + { + "epoch": 0.7166485694805687, + "grad_norm": 14.304651599986355, + "learning_rate": 2.8341246590564707e-06, + "loss": 0.3609, + "step": 14190 + }, + { + "epoch": 0.7171536072321406, + "grad_norm": 6.623357145482479, + "learning_rate": 2.829073643802405e-06, + "loss": 0.3697, + "step": 14200 + }, + { + "epoch": 0.7176586449837126, + "grad_norm": 8.462478930205787, + "learning_rate": 2.8240226285483385e-06, + "loss": 0.3909, + "step": 14210 + }, + { + "epoch": 0.7181636827352844, + "grad_norm": 4.464396808912767, + "learning_rate": 2.8189716132942722e-06, + "loss": 0.3653, + "step": 14220 + }, + { + "epoch": 0.7186687204868564, + "grad_norm": 5.427275414848616, + "learning_rate": 2.8139205980402064e-06, + "loss": 0.3615, + "step": 14230 + }, + { + "epoch": 0.7191737582384283, + "grad_norm": 6.5602036666267285, + "learning_rate": 2.80886958278614e-06, + "loss": 0.3793, + "step": 14240 + }, + { + "epoch": 0.7196787959900003, + "grad_norm": 8.382640709296943, + "learning_rate": 2.8038185675320738e-06, + "loss": 0.3839, + "step": 14250 + }, + { + "epoch": 0.7201838337415721, + "grad_norm": 8.38849977200125, + "learning_rate": 2.7987675522780083e-06, + "loss": 0.3632, + "step": 14260 + }, + { + "epoch": 0.7206888714931441, + "grad_norm": 4.838800155468274, + "learning_rate": 2.793716537023942e-06, + "loss": 0.3654, + "step": 14270 + }, + { + "epoch": 0.721193909244716, + "grad_norm": 3.202140029458944, + "learning_rate": 2.788665521769876e-06, + "loss": 0.3457, + "step": 14280 + }, + { + "epoch": 0.721698946996288, + "grad_norm": 3.096801851946618, + "learning_rate": 2.78361450651581e-06, + "loss": 0.3624, + "step": 14290 + }, + { + "epoch": 0.7222039847478599, + "grad_norm": 5.2594336987978885, + "learning_rate": 2.7785634912617435e-06, + "loss": 0.3595, + "step": 14300 + }, + { + "epoch": 0.7227090224994318, + "grad_norm": 5.738417846740695, + "learning_rate": 2.7735124760076777e-06, + "loss": 0.3471, + "step": 14310 + }, + { + "epoch": 0.7232140602510038, + "grad_norm": 3.3615923332479283, + "learning_rate": 2.7684614607536114e-06, + "loss": 0.3781, + "step": 14320 + }, + { + "epoch": 0.7237190980025757, + "grad_norm": 13.026933882432127, + "learning_rate": 2.763410445499546e-06, + "loss": 0.3765, + "step": 14330 + }, + { + "epoch": 0.7242241357541476, + "grad_norm": 2.4861597750527884, + "learning_rate": 2.7583594302454796e-06, + "loss": 0.3528, + "step": 14340 + }, + { + "epoch": 0.7247291735057195, + "grad_norm": 6.6503607629591, + "learning_rate": 2.7533084149914137e-06, + "loss": 0.3618, + "step": 14350 + }, + { + "epoch": 0.7252342112572915, + "grad_norm": 2.8124337865494464, + "learning_rate": 2.7482573997373474e-06, + "loss": 0.3791, + "step": 14360 + }, + { + "epoch": 0.7257392490088634, + "grad_norm": 5.334310989331839, + "learning_rate": 2.743206384483281e-06, + "loss": 0.3702, + "step": 14370 + }, + { + "epoch": 0.7262442867604354, + "grad_norm": 4.197944825898265, + "learning_rate": 2.7381553692292153e-06, + "loss": 0.3644, + "step": 14380 + }, + { + "epoch": 0.7267493245120072, + "grad_norm": 3.4074066682001076, + "learning_rate": 2.7331043539751494e-06, + "loss": 0.3694, + "step": 14390 + }, + { + "epoch": 0.7272543622635792, + "grad_norm": 2.5186168503160817, + "learning_rate": 2.7280533387210835e-06, + "loss": 0.3675, + "step": 14400 + }, + { + "epoch": 0.7277594000151512, + "grad_norm": 2.7527199189922134, + "learning_rate": 2.723002323467017e-06, + "loss": 0.3474, + "step": 14410 + }, + { + "epoch": 0.7282644377667231, + "grad_norm": 10.29943264391808, + "learning_rate": 2.717951308212951e-06, + "loss": 0.3627, + "step": 14420 + }, + { + "epoch": 0.728769475518295, + "grad_norm": 3.012910231072848, + "learning_rate": 2.712900292958885e-06, + "loss": 0.359, + "step": 14430 + }, + { + "epoch": 0.7292745132698669, + "grad_norm": 4.291318577581795, + "learning_rate": 2.7078492777048187e-06, + "loss": 0.3657, + "step": 14440 + }, + { + "epoch": 0.7297795510214389, + "grad_norm": 3.161513732407597, + "learning_rate": 2.7027982624507524e-06, + "loss": 0.3774, + "step": 14450 + }, + { + "epoch": 0.7302845887730108, + "grad_norm": 6.231424663495901, + "learning_rate": 2.697747247196687e-06, + "loss": 0.3496, + "step": 14460 + }, + { + "epoch": 0.7307896265245827, + "grad_norm": 9.93800800520396, + "learning_rate": 2.6926962319426207e-06, + "loss": 0.3574, + "step": 14470 + }, + { + "epoch": 0.7312946642761546, + "grad_norm": 5.792304139685358, + "learning_rate": 2.687645216688555e-06, + "loss": 0.3641, + "step": 14480 + }, + { + "epoch": 0.7317997020277266, + "grad_norm": 6.779155670529348, + "learning_rate": 2.6825942014344885e-06, + "loss": 0.3545, + "step": 14490 + }, + { + "epoch": 0.7323047397792986, + "grad_norm": 3.8135773883209527, + "learning_rate": 2.6775431861804226e-06, + "loss": 0.3722, + "step": 14500 + }, + { + "epoch": 0.7328097775308704, + "grad_norm": 3.3544383619935805, + "learning_rate": 2.6724921709263563e-06, + "loss": 0.3603, + "step": 14510 + }, + { + "epoch": 0.7333148152824424, + "grad_norm": 4.268409877609442, + "learning_rate": 2.66744115567229e-06, + "loss": 0.3682, + "step": 14520 + }, + { + "epoch": 0.7338198530340143, + "grad_norm": 6.571521428968219, + "learning_rate": 2.6623901404182246e-06, + "loss": 0.376, + "step": 14530 + }, + { + "epoch": 0.7343248907855863, + "grad_norm": 2.9026381605364935, + "learning_rate": 2.6573391251641583e-06, + "loss": 0.3705, + "step": 14540 + }, + { + "epoch": 0.7348299285371581, + "grad_norm": 3.6365813772686635, + "learning_rate": 2.6522881099100924e-06, + "loss": 0.3556, + "step": 14550 + }, + { + "epoch": 0.7353349662887301, + "grad_norm": 4.744776659753617, + "learning_rate": 2.647237094656026e-06, + "loss": 0.381, + "step": 14560 + }, + { + "epoch": 0.735840004040302, + "grad_norm": 3.8265663489625172, + "learning_rate": 2.64218607940196e-06, + "loss": 0.3675, + "step": 14570 + }, + { + "epoch": 0.736345041791874, + "grad_norm": 4.046535114955657, + "learning_rate": 2.637135064147894e-06, + "loss": 0.354, + "step": 14580 + }, + { + "epoch": 0.7368500795434458, + "grad_norm": 2.011858995754094, + "learning_rate": 2.6320840488938276e-06, + "loss": 0.3674, + "step": 14590 + }, + { + "epoch": 0.7373551172950178, + "grad_norm": 8.920805773112406, + "learning_rate": 2.627033033639762e-06, + "loss": 0.3479, + "step": 14600 + }, + { + "epoch": 0.7378601550465897, + "grad_norm": 3.136158587893057, + "learning_rate": 2.621982018385696e-06, + "loss": 0.3626, + "step": 14610 + }, + { + "epoch": 0.7383651927981617, + "grad_norm": 3.2282462284034175, + "learning_rate": 2.6169310031316296e-06, + "loss": 0.3612, + "step": 14620 + }, + { + "epoch": 0.7388702305497336, + "grad_norm": 2.7063221534655018, + "learning_rate": 2.6118799878775637e-06, + "loss": 0.3658, + "step": 14630 + }, + { + "epoch": 0.7393752683013055, + "grad_norm": 2.0705371802094663, + "learning_rate": 2.6068289726234974e-06, + "loss": 0.3619, + "step": 14640 + }, + { + "epoch": 0.7398803060528775, + "grad_norm": 2.452888217750629, + "learning_rate": 2.601777957369431e-06, + "loss": 0.3745, + "step": 14650 + }, + { + "epoch": 0.7403853438044494, + "grad_norm": 3.5210431082573987, + "learning_rate": 2.596726942115365e-06, + "loss": 0.3621, + "step": 14660 + }, + { + "epoch": 0.7408903815560213, + "grad_norm": 2.705820608986338, + "learning_rate": 2.5916759268612993e-06, + "loss": 0.3722, + "step": 14670 + }, + { + "epoch": 0.7413954193075932, + "grad_norm": 3.8301828984326005, + "learning_rate": 2.5866249116072335e-06, + "loss": 0.3475, + "step": 14680 + }, + { + "epoch": 0.7419004570591652, + "grad_norm": 2.506550997138522, + "learning_rate": 2.581573896353167e-06, + "loss": 0.379, + "step": 14690 + }, + { + "epoch": 0.7424054948107371, + "grad_norm": 4.732673442087478, + "learning_rate": 2.5765228810991013e-06, + "loss": 0.3646, + "step": 14700 + }, + { + "epoch": 0.742910532562309, + "grad_norm": 3.5697728957238297, + "learning_rate": 2.571471865845035e-06, + "loss": 0.3749, + "step": 14710 + }, + { + "epoch": 0.743415570313881, + "grad_norm": 3.408786643280019, + "learning_rate": 2.5664208505909687e-06, + "loss": 0.3729, + "step": 14720 + }, + { + "epoch": 0.7439206080654529, + "grad_norm": 3.2680972265488584, + "learning_rate": 2.5613698353369032e-06, + "loss": 0.3709, + "step": 14730 + }, + { + "epoch": 0.7444256458170249, + "grad_norm": 2.830354639883803, + "learning_rate": 2.556318820082837e-06, + "loss": 0.3575, + "step": 14740 + }, + { + "epoch": 0.7449306835685967, + "grad_norm": 2.1934046044941216, + "learning_rate": 2.551267804828771e-06, + "loss": 0.3615, + "step": 14750 + }, + { + "epoch": 0.7454357213201687, + "grad_norm": 3.7209556097533363, + "learning_rate": 2.5462167895747048e-06, + "loss": 0.3759, + "step": 14760 + }, + { + "epoch": 0.7459407590717406, + "grad_norm": 2.171529108489969, + "learning_rate": 2.5411657743206385e-06, + "loss": 0.3736, + "step": 14770 + }, + { + "epoch": 0.7464457968233126, + "grad_norm": 3.2881080411249903, + "learning_rate": 2.5361147590665726e-06, + "loss": 0.3856, + "step": 14780 + }, + { + "epoch": 0.7469508345748844, + "grad_norm": 2.1841022217022004, + "learning_rate": 2.5310637438125063e-06, + "loss": 0.3594, + "step": 14790 + }, + { + "epoch": 0.7474558723264564, + "grad_norm": 11.74984061570617, + "learning_rate": 2.526012728558441e-06, + "loss": 0.3691, + "step": 14800 + }, + { + "epoch": 0.7479609100780283, + "grad_norm": 2.64035990962851, + "learning_rate": 2.5209617133043745e-06, + "loss": 0.3426, + "step": 14810 + }, + { + "epoch": 0.7484659478296003, + "grad_norm": 1.7507289069126455, + "learning_rate": 2.5159106980503082e-06, + "loss": 0.3652, + "step": 14820 + }, + { + "epoch": 0.7489709855811721, + "grad_norm": 2.720025461643553, + "learning_rate": 2.5108596827962423e-06, + "loss": 0.3757, + "step": 14830 + }, + { + "epoch": 0.7494760233327441, + "grad_norm": 2.595489021549687, + "learning_rate": 2.505808667542176e-06, + "loss": 0.3659, + "step": 14840 + }, + { + "epoch": 0.7499810610843161, + "grad_norm": 3.4867867255116884, + "learning_rate": 2.50075765228811e-06, + "loss": 0.3733, + "step": 14850 + }, + { + "epoch": 0.750486098835888, + "grad_norm": 2.716151183546756, + "learning_rate": 2.4957066370340443e-06, + "loss": 0.3609, + "step": 14860 + }, + { + "epoch": 0.7509911365874599, + "grad_norm": 2.0400103144299737, + "learning_rate": 2.490655621779978e-06, + "loss": 0.3639, + "step": 14870 + }, + { + "epoch": 0.7514961743390318, + "grad_norm": 9.410235472131538, + "learning_rate": 2.4856046065259117e-06, + "loss": 0.3483, + "step": 14880 + }, + { + "epoch": 0.7520012120906038, + "grad_norm": 2.4049800584504584, + "learning_rate": 2.480553591271846e-06, + "loss": 0.3569, + "step": 14890 + }, + { + "epoch": 0.7525062498421757, + "grad_norm": 3.7384168124978565, + "learning_rate": 2.47550257601778e-06, + "loss": 0.3542, + "step": 14900 + }, + { + "epoch": 0.7530112875937476, + "grad_norm": 2.589659210465427, + "learning_rate": 2.4704515607637136e-06, + "loss": 0.368, + "step": 14910 + }, + { + "epoch": 0.7535163253453195, + "grad_norm": 2.2792717912641383, + "learning_rate": 2.4654005455096478e-06, + "loss": 0.3575, + "step": 14920 + }, + { + "epoch": 0.7540213630968915, + "grad_norm": 5.546504545797571, + "learning_rate": 2.4603495302555815e-06, + "loss": 0.361, + "step": 14930 + }, + { + "epoch": 0.7545264008484635, + "grad_norm": 4.5236138626508415, + "learning_rate": 2.4552985150015156e-06, + "loss": 0.3652, + "step": 14940 + }, + { + "epoch": 0.7550314386000353, + "grad_norm": 5.458941587498548, + "learning_rate": 2.4502474997474497e-06, + "loss": 0.3565, + "step": 14950 + }, + { + "epoch": 0.7555364763516073, + "grad_norm": 2.182667398028913, + "learning_rate": 2.4451964844933834e-06, + "loss": 0.3574, + "step": 14960 + }, + { + "epoch": 0.7560415141031792, + "grad_norm": 2.3501955437712563, + "learning_rate": 2.440145469239317e-06, + "loss": 0.3409, + "step": 14970 + }, + { + "epoch": 0.7565465518547512, + "grad_norm": 2.8564528610222473, + "learning_rate": 2.4350944539852512e-06, + "loss": 0.3629, + "step": 14980 + }, + { + "epoch": 0.7570515896063231, + "grad_norm": 3.888970347389462, + "learning_rate": 2.4300434387311854e-06, + "loss": 0.361, + "step": 14990 + }, + { + "epoch": 0.757556627357895, + "grad_norm": 7.312481133609781, + "learning_rate": 2.424992423477119e-06, + "loss": 0.3614, + "step": 15000 + }, + { + "epoch": 0.7580616651094669, + "grad_norm": 2.1317079758826813, + "learning_rate": 2.4199414082230528e-06, + "loss": 0.3738, + "step": 15010 + }, + { + "epoch": 0.7585667028610389, + "grad_norm": 3.6516684684220366, + "learning_rate": 2.414890392968987e-06, + "loss": 0.3648, + "step": 15020 + }, + { + "epoch": 0.7590717406126108, + "grad_norm": 2.6072438507069555, + "learning_rate": 2.409839377714921e-06, + "loss": 0.3621, + "step": 15030 + }, + { + "epoch": 0.7595767783641827, + "grad_norm": 6.351203220775499, + "learning_rate": 2.4047883624608547e-06, + "loss": 0.3558, + "step": 15040 + }, + { + "epoch": 0.7600818161157546, + "grad_norm": 4.130236652741164, + "learning_rate": 2.399737347206789e-06, + "loss": 0.369, + "step": 15050 + }, + { + "epoch": 0.7605868538673266, + "grad_norm": 4.6672030495120564, + "learning_rate": 2.394686331952723e-06, + "loss": 0.3622, + "step": 15060 + }, + { + "epoch": 0.7610918916188986, + "grad_norm": 12.886235429525053, + "learning_rate": 2.3896353166986567e-06, + "loss": 0.3622, + "step": 15070 + }, + { + "epoch": 0.7615969293704704, + "grad_norm": 15.156964415943117, + "learning_rate": 2.3845843014445904e-06, + "loss": 0.3467, + "step": 15080 + }, + { + "epoch": 0.7621019671220424, + "grad_norm": 2.87845766689377, + "learning_rate": 2.3795332861905245e-06, + "loss": 0.3505, + "step": 15090 + }, + { + "epoch": 0.7626070048736143, + "grad_norm": 15.116534480941452, + "learning_rate": 2.3744822709364586e-06, + "loss": 0.3483, + "step": 15100 + }, + { + "epoch": 0.7631120426251863, + "grad_norm": 7.257991090958527, + "learning_rate": 2.3694312556823923e-06, + "loss": 0.3822, + "step": 15110 + }, + { + "epoch": 0.7636170803767581, + "grad_norm": 3.4000262016509697, + "learning_rate": 2.3643802404283264e-06, + "loss": 0.3609, + "step": 15120 + }, + { + "epoch": 0.7641221181283301, + "grad_norm": 5.8302722893046734, + "learning_rate": 2.35932922517426e-06, + "loss": 0.3532, + "step": 15130 + }, + { + "epoch": 0.764627155879902, + "grad_norm": 2.980190177193747, + "learning_rate": 2.3542782099201943e-06, + "loss": 0.3532, + "step": 15140 + }, + { + "epoch": 0.765132193631474, + "grad_norm": 2.886158227100219, + "learning_rate": 2.349227194666128e-06, + "loss": 0.3747, + "step": 15150 + }, + { + "epoch": 0.7656372313830458, + "grad_norm": 3.5525222031207635, + "learning_rate": 2.344176179412062e-06, + "loss": 0.3627, + "step": 15160 + }, + { + "epoch": 0.7661422691346178, + "grad_norm": 5.160435812895103, + "learning_rate": 2.3391251641579958e-06, + "loss": 0.3638, + "step": 15170 + }, + { + "epoch": 0.7666473068861898, + "grad_norm": 11.69613721594715, + "learning_rate": 2.33407414890393e-06, + "loss": 0.354, + "step": 15180 + }, + { + "epoch": 0.7671523446377617, + "grad_norm": 2.5755050921925604, + "learning_rate": 2.329023133649864e-06, + "loss": 0.3571, + "step": 15190 + }, + { + "epoch": 0.7676573823893336, + "grad_norm": 6.875578222537554, + "learning_rate": 2.3239721183957977e-06, + "loss": 0.3494, + "step": 15200 + }, + { + "epoch": 0.7681624201409055, + "grad_norm": 3.951283263659072, + "learning_rate": 2.3189211031417314e-06, + "loss": 0.3642, + "step": 15210 + }, + { + "epoch": 0.7686674578924775, + "grad_norm": 6.490351712773479, + "learning_rate": 2.3138700878876655e-06, + "loss": 0.3625, + "step": 15220 + }, + { + "epoch": 0.7691724956440494, + "grad_norm": 6.723804099342641, + "learning_rate": 2.3088190726335997e-06, + "loss": 0.3542, + "step": 15230 + }, + { + "epoch": 0.7696775333956213, + "grad_norm": 3.266407086981815, + "learning_rate": 2.3037680573795334e-06, + "loss": 0.3593, + "step": 15240 + }, + { + "epoch": 0.7701825711471932, + "grad_norm": 3.9284527391422093, + "learning_rate": 2.2987170421254675e-06, + "loss": 0.3614, + "step": 15250 + }, + { + "epoch": 0.7706876088987652, + "grad_norm": 15.223624137338168, + "learning_rate": 2.2936660268714016e-06, + "loss": 0.3662, + "step": 15260 + }, + { + "epoch": 0.7711926466503372, + "grad_norm": 4.028959949491209, + "learning_rate": 2.2886150116173353e-06, + "loss": 0.3499, + "step": 15270 + }, + { + "epoch": 0.771697684401909, + "grad_norm": 6.41110230244689, + "learning_rate": 2.283563996363269e-06, + "loss": 0.3753, + "step": 15280 + }, + { + "epoch": 0.772202722153481, + "grad_norm": 4.076986580941685, + "learning_rate": 2.278512981109203e-06, + "loss": 0.334, + "step": 15290 + }, + { + "epoch": 0.7727077599050529, + "grad_norm": 3.74862169818398, + "learning_rate": 2.2734619658551373e-06, + "loss": 0.3491, + "step": 15300 + }, + { + "epoch": 0.7732127976566249, + "grad_norm": 3.385686236758005, + "learning_rate": 2.268410950601071e-06, + "loss": 0.3505, + "step": 15310 + }, + { + "epoch": 0.7737178354081967, + "grad_norm": 4.72702362279656, + "learning_rate": 2.2633599353470047e-06, + "loss": 0.362, + "step": 15320 + }, + { + "epoch": 0.7742228731597687, + "grad_norm": 15.01423003858265, + "learning_rate": 2.2583089200929388e-06, + "loss": 0.3557, + "step": 15330 + }, + { + "epoch": 0.7747279109113406, + "grad_norm": 7.0698788662071275, + "learning_rate": 2.253257904838873e-06, + "loss": 0.3414, + "step": 15340 + }, + { + "epoch": 0.7752329486629126, + "grad_norm": 3.524087806752161, + "learning_rate": 2.2482068895848066e-06, + "loss": 0.372, + "step": 15350 + }, + { + "epoch": 0.7757379864144844, + "grad_norm": 3.595099576572771, + "learning_rate": 2.2431558743307407e-06, + "loss": 0.3621, + "step": 15360 + }, + { + "epoch": 0.7762430241660564, + "grad_norm": 3.3403877064629786, + "learning_rate": 2.2381048590766744e-06, + "loss": 0.3565, + "step": 15370 + }, + { + "epoch": 0.7767480619176284, + "grad_norm": 5.0138361529572, + "learning_rate": 2.2330538438226086e-06, + "loss": 0.374, + "step": 15380 + }, + { + "epoch": 0.7772530996692003, + "grad_norm": 6.076755730180162, + "learning_rate": 2.2280028285685423e-06, + "loss": 0.3548, + "step": 15390 + }, + { + "epoch": 0.7777581374207722, + "grad_norm": 7.698809050653406, + "learning_rate": 2.2229518133144764e-06, + "loss": 0.3667, + "step": 15400 + }, + { + "epoch": 0.7782631751723441, + "grad_norm": 3.4467924224973046, + "learning_rate": 2.2179007980604105e-06, + "loss": 0.3628, + "step": 15410 + }, + { + "epoch": 0.7787682129239161, + "grad_norm": 4.893559485834325, + "learning_rate": 2.212849782806344e-06, + "loss": 0.3644, + "step": 15420 + }, + { + "epoch": 0.779273250675488, + "grad_norm": 4.646808808665608, + "learning_rate": 2.2077987675522783e-06, + "loss": 0.3615, + "step": 15430 + }, + { + "epoch": 0.7797782884270599, + "grad_norm": 5.1159354751441395, + "learning_rate": 2.202747752298212e-06, + "loss": 0.3719, + "step": 15440 + }, + { + "epoch": 0.7802833261786318, + "grad_norm": 3.6881998697625784, + "learning_rate": 2.197696737044146e-06, + "loss": 0.3607, + "step": 15450 + }, + { + "epoch": 0.7807883639302038, + "grad_norm": 2.337125255187875, + "learning_rate": 2.1926457217900803e-06, + "loss": 0.3757, + "step": 15460 + }, + { + "epoch": 0.7812934016817757, + "grad_norm": 4.428433133367473, + "learning_rate": 2.187594706536014e-06, + "loss": 0.3588, + "step": 15470 + }, + { + "epoch": 0.7817984394333476, + "grad_norm": 3.458785718908454, + "learning_rate": 2.1825436912819477e-06, + "loss": 0.3705, + "step": 15480 + }, + { + "epoch": 0.7823034771849195, + "grad_norm": 3.5942204693831936, + "learning_rate": 2.177492676027882e-06, + "loss": 0.3683, + "step": 15490 + }, + { + "epoch": 0.7828085149364915, + "grad_norm": 5.889169421367107, + "learning_rate": 2.172441660773816e-06, + "loss": 0.3573, + "step": 15500 + }, + { + "epoch": 0.7833135526880635, + "grad_norm": 3.261189892920889, + "learning_rate": 2.1673906455197496e-06, + "loss": 0.3512, + "step": 15510 + }, + { + "epoch": 0.7838185904396353, + "grad_norm": 8.352842752027808, + "learning_rate": 2.1623396302656833e-06, + "loss": 0.3629, + "step": 15520 + }, + { + "epoch": 0.7843236281912073, + "grad_norm": 2.96386077067302, + "learning_rate": 2.1572886150116174e-06, + "loss": 0.3636, + "step": 15530 + }, + { + "epoch": 0.7848286659427792, + "grad_norm": 5.283581255544922, + "learning_rate": 2.1522375997575516e-06, + "loss": 0.3676, + "step": 15540 + }, + { + "epoch": 0.7853337036943512, + "grad_norm": 2.6179107461634796, + "learning_rate": 2.1471865845034853e-06, + "loss": 0.354, + "step": 15550 + }, + { + "epoch": 0.785838741445923, + "grad_norm": 7.9036355130392755, + "learning_rate": 2.142135569249419e-06, + "loss": 0.3621, + "step": 15560 + }, + { + "epoch": 0.786343779197495, + "grad_norm": 4.522869551719609, + "learning_rate": 2.137084553995353e-06, + "loss": 0.3718, + "step": 15570 + }, + { + "epoch": 0.7868488169490669, + "grad_norm": 4.684014168319032, + "learning_rate": 2.1320335387412872e-06, + "loss": 0.3543, + "step": 15580 + }, + { + "epoch": 0.7873538547006389, + "grad_norm": 3.8914673062545697, + "learning_rate": 2.126982523487221e-06, + "loss": 0.3535, + "step": 15590 + }, + { + "epoch": 0.7878588924522109, + "grad_norm": 5.961270787701582, + "learning_rate": 2.121931508233155e-06, + "loss": 0.3716, + "step": 15600 + }, + { + "epoch": 0.7883639302037827, + "grad_norm": 30.183292311995285, + "learning_rate": 2.116880492979089e-06, + "loss": 0.3613, + "step": 15610 + }, + { + "epoch": 0.7888689679553547, + "grad_norm": 6.697417465261187, + "learning_rate": 2.111829477725023e-06, + "loss": 0.3507, + "step": 15620 + }, + { + "epoch": 0.7893740057069266, + "grad_norm": 3.0372920526034783, + "learning_rate": 2.106778462470957e-06, + "loss": 0.3556, + "step": 15630 + }, + { + "epoch": 0.7898790434584986, + "grad_norm": 2.8257416604165764, + "learning_rate": 2.1017274472168907e-06, + "loss": 0.3558, + "step": 15640 + }, + { + "epoch": 0.7903840812100704, + "grad_norm": 4.371382594369755, + "learning_rate": 2.096676431962825e-06, + "loss": 0.3708, + "step": 15650 + }, + { + "epoch": 0.7908891189616424, + "grad_norm": 2.2916826306015388, + "learning_rate": 2.0916254167087585e-06, + "loss": 0.3575, + "step": 15660 + }, + { + "epoch": 0.7913941567132143, + "grad_norm": 4.127089263253249, + "learning_rate": 2.0865744014546926e-06, + "loss": 0.3727, + "step": 15670 + }, + { + "epoch": 0.7918991944647863, + "grad_norm": 4.5838298238269095, + "learning_rate": 2.0815233862006263e-06, + "loss": 0.3631, + "step": 15680 + }, + { + "epoch": 0.7924042322163581, + "grad_norm": 2.13347121181111, + "learning_rate": 2.0764723709465605e-06, + "loss": 0.3577, + "step": 15690 + }, + { + "epoch": 0.7929092699679301, + "grad_norm": 2.865333714554876, + "learning_rate": 2.0714213556924946e-06, + "loss": 0.3646, + "step": 15700 + }, + { + "epoch": 0.793414307719502, + "grad_norm": 3.3712098781575968, + "learning_rate": 2.0663703404384283e-06, + "loss": 0.3665, + "step": 15710 + }, + { + "epoch": 0.793919345471074, + "grad_norm": 6.140656487678135, + "learning_rate": 2.061319325184362e-06, + "loss": 0.3562, + "step": 15720 + }, + { + "epoch": 0.7944243832226459, + "grad_norm": 6.1891047783662785, + "learning_rate": 2.056268309930296e-06, + "loss": 0.3724, + "step": 15730 + }, + { + "epoch": 0.7949294209742178, + "grad_norm": 9.134267646827375, + "learning_rate": 2.0512172946762302e-06, + "loss": 0.3733, + "step": 15740 + }, + { + "epoch": 0.7954344587257898, + "grad_norm": 3.939505362182306, + "learning_rate": 2.046166279422164e-06, + "loss": 0.356, + "step": 15750 + }, + { + "epoch": 0.7959394964773617, + "grad_norm": 5.797259273365989, + "learning_rate": 2.041115264168098e-06, + "loss": 0.3657, + "step": 15760 + }, + { + "epoch": 0.7964445342289336, + "grad_norm": 4.772342944803025, + "learning_rate": 2.036064248914032e-06, + "loss": 0.3565, + "step": 15770 + }, + { + "epoch": 0.7969495719805055, + "grad_norm": 5.611273163023503, + "learning_rate": 2.031013233659966e-06, + "loss": 0.342, + "step": 15780 + }, + { + "epoch": 0.7974546097320775, + "grad_norm": 4.35699870135289, + "learning_rate": 2.0259622184058996e-06, + "loss": 0.3564, + "step": 15790 + }, + { + "epoch": 0.7979596474836494, + "grad_norm": 4.981366634684693, + "learning_rate": 2.0209112031518337e-06, + "loss": 0.374, + "step": 15800 + }, + { + "epoch": 0.7984646852352213, + "grad_norm": 5.47051690828411, + "learning_rate": 2.015860187897768e-06, + "loss": 0.3648, + "step": 15810 + }, + { + "epoch": 0.7989697229867933, + "grad_norm": 2.7267511419608192, + "learning_rate": 2.0108091726437015e-06, + "loss": 0.3633, + "step": 15820 + }, + { + "epoch": 0.7994747607383652, + "grad_norm": 5.550689226148647, + "learning_rate": 2.0057581573896352e-06, + "loss": 0.3729, + "step": 15830 + }, + { + "epoch": 0.7999797984899372, + "grad_norm": 3.302912522583322, + "learning_rate": 2.0007071421355694e-06, + "loss": 0.3594, + "step": 15840 + }, + { + "epoch": 0.800484836241509, + "grad_norm": 5.487680267201285, + "learning_rate": 1.9956561268815035e-06, + "loss": 0.3529, + "step": 15850 + }, + { + "epoch": 0.800989873993081, + "grad_norm": 2.2707759816513016, + "learning_rate": 1.990605111627437e-06, + "loss": 0.3608, + "step": 15860 + }, + { + "epoch": 0.8014949117446529, + "grad_norm": 7.6358774792742805, + "learning_rate": 1.9855540963733713e-06, + "loss": 0.3664, + "step": 15870 + }, + { + "epoch": 0.8019999494962249, + "grad_norm": 4.568057200617405, + "learning_rate": 1.980503081119305e-06, + "loss": 0.3625, + "step": 15880 + }, + { + "epoch": 0.8025049872477967, + "grad_norm": 15.31945090537045, + "learning_rate": 1.975452065865239e-06, + "loss": 0.3489, + "step": 15890 + }, + { + "epoch": 0.8030100249993687, + "grad_norm": 8.04419756034429, + "learning_rate": 1.970401050611173e-06, + "loss": 0.3658, + "step": 15900 + }, + { + "epoch": 0.8035150627509406, + "grad_norm": 2.473274324466601, + "learning_rate": 1.965350035357107e-06, + "loss": 0.375, + "step": 15910 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 2.6342811148775076, + "learning_rate": 1.9602990201030406e-06, + "loss": 0.3636, + "step": 15920 + }, + { + "epoch": 0.8045251382540844, + "grad_norm": 2.6430322700309214, + "learning_rate": 1.9552480048489748e-06, + "loss": 0.3533, + "step": 15930 + }, + { + "epoch": 0.8050301760056564, + "grad_norm": 4.757917337464147, + "learning_rate": 1.950196989594909e-06, + "loss": 0.3727, + "step": 15940 + }, + { + "epoch": 0.8055352137572284, + "grad_norm": 4.511805579600671, + "learning_rate": 1.9451459743408426e-06, + "loss": 0.337, + "step": 15950 + }, + { + "epoch": 0.8060402515088003, + "grad_norm": 4.838220936104523, + "learning_rate": 1.9400949590867767e-06, + "loss": 0.3577, + "step": 15960 + }, + { + "epoch": 0.8065452892603722, + "grad_norm": 16.008417651939627, + "learning_rate": 1.935043943832711e-06, + "loss": 0.3475, + "step": 15970 + }, + { + "epoch": 0.8070503270119441, + "grad_norm": 6.303632411143941, + "learning_rate": 1.9299929285786445e-06, + "loss": 0.3659, + "step": 15980 + }, + { + "epoch": 0.8075553647635161, + "grad_norm": 5.190471720901927, + "learning_rate": 1.9249419133245782e-06, + "loss": 0.3546, + "step": 15990 + }, + { + "epoch": 0.808060402515088, + "grad_norm": 3.242336430162645, + "learning_rate": 1.9198908980705124e-06, + "loss": 0.3509, + "step": 16000 + }, + { + "epoch": 0.8085654402666599, + "grad_norm": 109.07082323352427, + "learning_rate": 1.9148398828164465e-06, + "loss": 0.3576, + "step": 16010 + }, + { + "epoch": 0.8090704780182318, + "grad_norm": 4.924092447335461, + "learning_rate": 1.90978886756238e-06, + "loss": 0.3513, + "step": 16020 + }, + { + "epoch": 0.8095755157698038, + "grad_norm": 4.008651128754475, + "learning_rate": 1.904737852308314e-06, + "loss": 0.3632, + "step": 16030 + }, + { + "epoch": 0.8100805535213758, + "grad_norm": 2.704598654274715, + "learning_rate": 1.8996868370542482e-06, + "loss": 0.3789, + "step": 16040 + }, + { + "epoch": 0.8105855912729476, + "grad_norm": 3.9535618478053838, + "learning_rate": 1.894635821800182e-06, + "loss": 0.366, + "step": 16050 + }, + { + "epoch": 0.8110906290245196, + "grad_norm": 2.647502432727123, + "learning_rate": 1.8895848065461158e-06, + "loss": 0.3591, + "step": 16060 + }, + { + "epoch": 0.8115956667760915, + "grad_norm": 3.1410244190511296, + "learning_rate": 1.8845337912920497e-06, + "loss": 0.371, + "step": 16070 + }, + { + "epoch": 0.8121007045276635, + "grad_norm": 2.681292853976926, + "learning_rate": 1.8794827760379839e-06, + "loss": 0.3625, + "step": 16080 + }, + { + "epoch": 0.8126057422792353, + "grad_norm": 3.2981049374910203, + "learning_rate": 1.8744317607839178e-06, + "loss": 0.3524, + "step": 16090 + }, + { + "epoch": 0.8131107800308073, + "grad_norm": 3.021432379433984, + "learning_rate": 1.8693807455298515e-06, + "loss": 0.3787, + "step": 16100 + }, + { + "epoch": 0.8136158177823792, + "grad_norm": 4.129295733061501, + "learning_rate": 1.8643297302757856e-06, + "loss": 0.3565, + "step": 16110 + }, + { + "epoch": 0.8141208555339512, + "grad_norm": 2.7563655099935587, + "learning_rate": 1.8592787150217195e-06, + "loss": 0.3594, + "step": 16120 + }, + { + "epoch": 0.814625893285523, + "grad_norm": 17.34216993322677, + "learning_rate": 1.8542276997676534e-06, + "loss": 0.364, + "step": 16130 + }, + { + "epoch": 0.815130931037095, + "grad_norm": 9.964020505556881, + "learning_rate": 1.8491766845135876e-06, + "loss": 0.365, + "step": 16140 + }, + { + "epoch": 0.815635968788667, + "grad_norm": 3.075416765785676, + "learning_rate": 1.8441256692595213e-06, + "loss": 0.3635, + "step": 16150 + }, + { + "epoch": 0.8161410065402389, + "grad_norm": 5.243244426066463, + "learning_rate": 1.8390746540054552e-06, + "loss": 0.3684, + "step": 16160 + }, + { + "epoch": 0.8166460442918108, + "grad_norm": 12.518246409079454, + "learning_rate": 1.834023638751389e-06, + "loss": 0.3622, + "step": 16170 + }, + { + "epoch": 0.8171510820433827, + "grad_norm": 7.312613998018914, + "learning_rate": 1.8289726234973232e-06, + "loss": 0.3657, + "step": 16180 + }, + { + "epoch": 0.8176561197949547, + "grad_norm": 3.588424798034187, + "learning_rate": 1.8239216082432571e-06, + "loss": 0.3704, + "step": 16190 + }, + { + "epoch": 0.8181611575465266, + "grad_norm": 5.675252481804342, + "learning_rate": 1.8188705929891908e-06, + "loss": 0.3769, + "step": 16200 + }, + { + "epoch": 0.8186661952980986, + "grad_norm": 4.439591443663197, + "learning_rate": 1.813819577735125e-06, + "loss": 0.3613, + "step": 16210 + }, + { + "epoch": 0.8191712330496704, + "grad_norm": 3.8799599946003904, + "learning_rate": 1.8087685624810589e-06, + "loss": 0.3507, + "step": 16220 + }, + { + "epoch": 0.8196762708012424, + "grad_norm": 6.706337065885557, + "learning_rate": 1.8037175472269928e-06, + "loss": 0.3524, + "step": 16230 + }, + { + "epoch": 0.8201813085528143, + "grad_norm": 9.215432805836578, + "learning_rate": 1.7986665319729269e-06, + "loss": 0.3503, + "step": 16240 + }, + { + "epoch": 0.8206863463043863, + "grad_norm": 14.076599744963326, + "learning_rate": 1.7936155167188608e-06, + "loss": 0.3628, + "step": 16250 + }, + { + "epoch": 0.8211913840559582, + "grad_norm": 3.743352271180992, + "learning_rate": 1.7885645014647945e-06, + "loss": 0.3661, + "step": 16260 + }, + { + "epoch": 0.8216964218075301, + "grad_norm": 5.7001568185616085, + "learning_rate": 1.7835134862107284e-06, + "loss": 0.3549, + "step": 16270 + }, + { + "epoch": 0.8222014595591021, + "grad_norm": 7.558296626118487, + "learning_rate": 1.7784624709566625e-06, + "loss": 0.3595, + "step": 16280 + }, + { + "epoch": 0.822706497310674, + "grad_norm": 4.177362518260731, + "learning_rate": 1.7734114557025964e-06, + "loss": 0.3649, + "step": 16290 + }, + { + "epoch": 0.8232115350622459, + "grad_norm": 3.640941034887769, + "learning_rate": 1.7683604404485301e-06, + "loss": 0.3545, + "step": 16300 + }, + { + "epoch": 0.8237165728138178, + "grad_norm": 3.349967752869647, + "learning_rate": 1.7633094251944643e-06, + "loss": 0.3489, + "step": 16310 + }, + { + "epoch": 0.8242216105653898, + "grad_norm": 8.902832535316834, + "learning_rate": 1.7582584099403982e-06, + "loss": 0.3613, + "step": 16320 + }, + { + "epoch": 0.8247266483169617, + "grad_norm": 4.87569549754752, + "learning_rate": 1.753207394686332e-06, + "loss": 0.3466, + "step": 16330 + }, + { + "epoch": 0.8252316860685336, + "grad_norm": 3.113899384550637, + "learning_rate": 1.748156379432266e-06, + "loss": 0.356, + "step": 16340 + }, + { + "epoch": 0.8257367238201055, + "grad_norm": 3.908258669462424, + "learning_rate": 1.7431053641782001e-06, + "loss": 0.3542, + "step": 16350 + }, + { + "epoch": 0.8262417615716775, + "grad_norm": 4.63530589205007, + "learning_rate": 1.7380543489241338e-06, + "loss": 0.3627, + "step": 16360 + }, + { + "epoch": 0.8267467993232495, + "grad_norm": 8.784657750253414, + "learning_rate": 1.7330033336700677e-06, + "loss": 0.3578, + "step": 16370 + }, + { + "epoch": 0.8272518370748213, + "grad_norm": 6.545444728348479, + "learning_rate": 1.7279523184160019e-06, + "loss": 0.3539, + "step": 16380 + }, + { + "epoch": 0.8277568748263933, + "grad_norm": 12.650331415485839, + "learning_rate": 1.7229013031619358e-06, + "loss": 0.3477, + "step": 16390 + }, + { + "epoch": 0.8282619125779652, + "grad_norm": 6.830225414304928, + "learning_rate": 1.7178502879078695e-06, + "loss": 0.3503, + "step": 16400 + }, + { + "epoch": 0.8287669503295372, + "grad_norm": 4.071173355925771, + "learning_rate": 1.7127992726538036e-06, + "loss": 0.3556, + "step": 16410 + }, + { + "epoch": 0.829271988081109, + "grad_norm": 4.66654000796166, + "learning_rate": 1.7077482573997375e-06, + "loss": 0.3639, + "step": 16420 + }, + { + "epoch": 0.829777025832681, + "grad_norm": 3.6997372517957183, + "learning_rate": 1.7026972421456714e-06, + "loss": 0.3624, + "step": 16430 + }, + { + "epoch": 0.8302820635842529, + "grad_norm": 2.8900578489503537, + "learning_rate": 1.6976462268916053e-06, + "loss": 0.352, + "step": 16440 + }, + { + "epoch": 0.8307871013358249, + "grad_norm": 7.911446253848937, + "learning_rate": 1.6925952116375395e-06, + "loss": 0.3439, + "step": 16450 + }, + { + "epoch": 0.8312921390873967, + "grad_norm": 4.387449984988524, + "learning_rate": 1.6875441963834732e-06, + "loss": 0.3556, + "step": 16460 + }, + { + "epoch": 0.8317971768389687, + "grad_norm": 3.423134745091436, + "learning_rate": 1.682493181129407e-06, + "loss": 0.3534, + "step": 16470 + }, + { + "epoch": 0.8323022145905407, + "grad_norm": 6.067996848489734, + "learning_rate": 1.6774421658753412e-06, + "loss": 0.3579, + "step": 16480 + }, + { + "epoch": 0.8328072523421126, + "grad_norm": 4.7407053967222, + "learning_rate": 1.6723911506212751e-06, + "loss": 0.3479, + "step": 16490 + }, + { + "epoch": 0.8333122900936845, + "grad_norm": 11.91783824543377, + "learning_rate": 1.6673401353672088e-06, + "loss": 0.3457, + "step": 16500 + }, + { + "epoch": 0.8338173278452564, + "grad_norm": 4.2675850830619115, + "learning_rate": 1.6622891201131427e-06, + "loss": 0.3652, + "step": 16510 + }, + { + "epoch": 0.8343223655968284, + "grad_norm": 6.818897096480064, + "learning_rate": 1.6572381048590768e-06, + "loss": 0.3758, + "step": 16520 + }, + { + "epoch": 0.8348274033484003, + "grad_norm": 4.98946971011121, + "learning_rate": 1.6521870896050108e-06, + "loss": 0.3701, + "step": 16530 + }, + { + "epoch": 0.8353324410999722, + "grad_norm": 9.186502501610118, + "learning_rate": 1.6471360743509447e-06, + "loss": 0.3621, + "step": 16540 + }, + { + "epoch": 0.8358374788515441, + "grad_norm": 4.424346513819785, + "learning_rate": 1.6420850590968788e-06, + "loss": 0.355, + "step": 16550 + }, + { + "epoch": 0.8363425166031161, + "grad_norm": 6.493343915545971, + "learning_rate": 1.6370340438428125e-06, + "loss": 0.3606, + "step": 16560 + }, + { + "epoch": 0.836847554354688, + "grad_norm": 9.200578558866452, + "learning_rate": 1.6319830285887464e-06, + "loss": 0.3586, + "step": 16570 + }, + { + "epoch": 0.8373525921062599, + "grad_norm": 3.604785576320196, + "learning_rate": 1.6269320133346805e-06, + "loss": 0.3659, + "step": 16580 + }, + { + "epoch": 0.8378576298578319, + "grad_norm": 6.963237973002329, + "learning_rate": 1.6218809980806144e-06, + "loss": 0.3585, + "step": 16590 + }, + { + "epoch": 0.8383626676094038, + "grad_norm": 2.3248924867977525, + "learning_rate": 1.6168299828265481e-06, + "loss": 0.361, + "step": 16600 + }, + { + "epoch": 0.8388677053609758, + "grad_norm": 2.6056154769998776, + "learning_rate": 1.611778967572482e-06, + "loss": 0.3551, + "step": 16610 + }, + { + "epoch": 0.8393727431125476, + "grad_norm": 5.347914332983081, + "learning_rate": 1.6067279523184162e-06, + "loss": 0.3626, + "step": 16620 + }, + { + "epoch": 0.8398777808641196, + "grad_norm": 4.2282210080509275, + "learning_rate": 1.60167693706435e-06, + "loss": 0.3408, + "step": 16630 + }, + { + "epoch": 0.8403828186156915, + "grad_norm": 64.68616959956208, + "learning_rate": 1.596625921810284e-06, + "loss": 0.361, + "step": 16640 + }, + { + "epoch": 0.8408878563672635, + "grad_norm": 4.552604163372721, + "learning_rate": 1.5915749065562181e-06, + "loss": 0.3583, + "step": 16650 + }, + { + "epoch": 0.8413928941188353, + "grad_norm": 5.37530591780756, + "learning_rate": 1.5865238913021518e-06, + "loss": 0.3677, + "step": 16660 + }, + { + "epoch": 0.8418979318704073, + "grad_norm": 5.851389647033064, + "learning_rate": 1.5814728760480857e-06, + "loss": 0.3496, + "step": 16670 + }, + { + "epoch": 0.8424029696219792, + "grad_norm": 12.669284701519452, + "learning_rate": 1.5764218607940196e-06, + "loss": 0.3537, + "step": 16680 + }, + { + "epoch": 0.8429080073735512, + "grad_norm": 4.126283598831685, + "learning_rate": 1.5713708455399538e-06, + "loss": 0.3508, + "step": 16690 + }, + { + "epoch": 0.843413045125123, + "grad_norm": 5.925620788856901, + "learning_rate": 1.5663198302858877e-06, + "loss": 0.3575, + "step": 16700 + }, + { + "epoch": 0.843918082876695, + "grad_norm": 4.664307589807468, + "learning_rate": 1.5612688150318214e-06, + "loss": 0.3649, + "step": 16710 + }, + { + "epoch": 0.844423120628267, + "grad_norm": 11.693975175758714, + "learning_rate": 1.5562177997777555e-06, + "loss": 0.3524, + "step": 16720 + }, + { + "epoch": 0.8449281583798389, + "grad_norm": 4.379171141956925, + "learning_rate": 1.5511667845236894e-06, + "loss": 0.3636, + "step": 16730 + }, + { + "epoch": 0.8454331961314108, + "grad_norm": 4.966223200480838, + "learning_rate": 1.5461157692696233e-06, + "loss": 0.3754, + "step": 16740 + }, + { + "epoch": 0.8459382338829827, + "grad_norm": 6.214250837875937, + "learning_rate": 1.5410647540155575e-06, + "loss": 0.3591, + "step": 16750 + }, + { + "epoch": 0.8464432716345547, + "grad_norm": 3.0308071228407876, + "learning_rate": 1.5360137387614912e-06, + "loss": 0.3542, + "step": 16760 + }, + { + "epoch": 0.8469483093861266, + "grad_norm": 2.6208353352508613, + "learning_rate": 1.530962723507425e-06, + "loss": 0.3685, + "step": 16770 + }, + { + "epoch": 0.8474533471376986, + "grad_norm": 6.849704130112083, + "learning_rate": 1.525911708253359e-06, + "loss": 0.3656, + "step": 16780 + }, + { + "epoch": 0.8479583848892704, + "grad_norm": 4.032506964164272, + "learning_rate": 1.520860692999293e-06, + "loss": 0.3432, + "step": 16790 + }, + { + "epoch": 0.8484634226408424, + "grad_norm": 3.5062245447178464, + "learning_rate": 1.515809677745227e-06, + "loss": 0.3635, + "step": 16800 + }, + { + "epoch": 0.8489684603924144, + "grad_norm": 3.098328874271106, + "learning_rate": 1.5107586624911607e-06, + "loss": 0.3625, + "step": 16810 + }, + { + "epoch": 0.8494734981439863, + "grad_norm": 4.466142042227375, + "learning_rate": 1.5057076472370948e-06, + "loss": 0.3613, + "step": 16820 + }, + { + "epoch": 0.8499785358955582, + "grad_norm": 5.448485215579026, + "learning_rate": 1.5006566319830287e-06, + "loss": 0.3677, + "step": 16830 + }, + { + "epoch": 0.8504835736471301, + "grad_norm": 3.7693651118255795, + "learning_rate": 1.4956056167289627e-06, + "loss": 0.3597, + "step": 16840 + }, + { + "epoch": 0.8509886113987021, + "grad_norm": 3.0048683020938265, + "learning_rate": 1.4905546014748964e-06, + "loss": 0.3458, + "step": 16850 + }, + { + "epoch": 0.851493649150274, + "grad_norm": 3.6083434998949278, + "learning_rate": 1.4855035862208305e-06, + "loss": 0.3524, + "step": 16860 + }, + { + "epoch": 0.8519986869018459, + "grad_norm": 3.8301361676049375, + "learning_rate": 1.4804525709667644e-06, + "loss": 0.354, + "step": 16870 + }, + { + "epoch": 0.8525037246534178, + "grad_norm": 9.70263976797423, + "learning_rate": 1.4754015557126983e-06, + "loss": 0.335, + "step": 16880 + }, + { + "epoch": 0.8530087624049898, + "grad_norm": 2.9237142901845368, + "learning_rate": 1.4703505404586324e-06, + "loss": 0.3569, + "step": 16890 + }, + { + "epoch": 0.8535138001565618, + "grad_norm": 9.647980801949249, + "learning_rate": 1.4652995252045663e-06, + "loss": 0.3514, + "step": 16900 + }, + { + "epoch": 0.8540188379081336, + "grad_norm": 10.070888822677043, + "learning_rate": 1.4602485099505e-06, + "loss": 0.3576, + "step": 16910 + }, + { + "epoch": 0.8545238756597056, + "grad_norm": 3.273605158275998, + "learning_rate": 1.4551974946964342e-06, + "loss": 0.3707, + "step": 16920 + }, + { + "epoch": 0.8550289134112775, + "grad_norm": 4.9136034513233575, + "learning_rate": 1.450146479442368e-06, + "loss": 0.3606, + "step": 16930 + }, + { + "epoch": 0.8555339511628495, + "grad_norm": 2.4213339644373426, + "learning_rate": 1.445095464188302e-06, + "loss": 0.3511, + "step": 16940 + }, + { + "epoch": 0.8560389889144213, + "grad_norm": 5.947198902514986, + "learning_rate": 1.4400444489342357e-06, + "loss": 0.3441, + "step": 16950 + }, + { + "epoch": 0.8565440266659933, + "grad_norm": 4.128142354654801, + "learning_rate": 1.4349934336801698e-06, + "loss": 0.3421, + "step": 16960 + }, + { + "epoch": 0.8570490644175652, + "grad_norm": 3.699537501488419, + "learning_rate": 1.4299424184261037e-06, + "loss": 0.3517, + "step": 16970 + }, + { + "epoch": 0.8575541021691372, + "grad_norm": 2.4855687780449074, + "learning_rate": 1.4248914031720376e-06, + "loss": 0.3571, + "step": 16980 + }, + { + "epoch": 0.858059139920709, + "grad_norm": 4.059592927833855, + "learning_rate": 1.4198403879179718e-06, + "loss": 0.3515, + "step": 16990 + }, + { + "epoch": 0.858564177672281, + "grad_norm": 10.362480051840812, + "learning_rate": 1.4147893726639057e-06, + "loss": 0.3599, + "step": 17000 + }, + { + "epoch": 0.859069215423853, + "grad_norm": 4.66559692240262, + "learning_rate": 1.4097383574098394e-06, + "loss": 0.3659, + "step": 17010 + }, + { + "epoch": 0.8595742531754249, + "grad_norm": 3.0262644076926244, + "learning_rate": 1.4046873421557733e-06, + "loss": 0.3627, + "step": 17020 + }, + { + "epoch": 0.8600792909269968, + "grad_norm": 6.4805418875740335, + "learning_rate": 1.3996363269017074e-06, + "loss": 0.3552, + "step": 17030 + }, + { + "epoch": 0.8605843286785687, + "grad_norm": 6.48974005858549, + "learning_rate": 1.3945853116476413e-06, + "loss": 0.3742, + "step": 17040 + }, + { + "epoch": 0.8610893664301407, + "grad_norm": 12.933277403296861, + "learning_rate": 1.389534296393575e-06, + "loss": 0.3688, + "step": 17050 + }, + { + "epoch": 0.8615944041817126, + "grad_norm": 5.0135946224215076, + "learning_rate": 1.3844832811395094e-06, + "loss": 0.3495, + "step": 17060 + }, + { + "epoch": 0.8620994419332845, + "grad_norm": 4.664374173808275, + "learning_rate": 1.379432265885443e-06, + "loss": 0.3479, + "step": 17070 + }, + { + "epoch": 0.8626044796848564, + "grad_norm": 3.413827294110899, + "learning_rate": 1.374381250631377e-06, + "loss": 0.363, + "step": 17080 + }, + { + "epoch": 0.8631095174364284, + "grad_norm": 4.110151494898355, + "learning_rate": 1.369330235377311e-06, + "loss": 0.3601, + "step": 17090 + }, + { + "epoch": 0.8636145551880003, + "grad_norm": 4.057869159667351, + "learning_rate": 1.364279220123245e-06, + "loss": 0.3524, + "step": 17100 + }, + { + "epoch": 0.8641195929395722, + "grad_norm": 3.345052422207178, + "learning_rate": 1.3592282048691787e-06, + "loss": 0.3464, + "step": 17110 + }, + { + "epoch": 0.8646246306911441, + "grad_norm": 17.60499598627659, + "learning_rate": 1.3541771896151126e-06, + "loss": 0.3457, + "step": 17120 + }, + { + "epoch": 0.8651296684427161, + "grad_norm": 3.1318074334341395, + "learning_rate": 1.3491261743610467e-06, + "loss": 0.3519, + "step": 17130 + }, + { + "epoch": 0.8656347061942881, + "grad_norm": 8.924077465063698, + "learning_rate": 1.3440751591069806e-06, + "loss": 0.3536, + "step": 17140 + }, + { + "epoch": 0.8661397439458599, + "grad_norm": 5.697646524473763, + "learning_rate": 1.3390241438529146e-06, + "loss": 0.3528, + "step": 17150 + }, + { + "epoch": 0.8666447816974319, + "grad_norm": 2.759497306215723, + "learning_rate": 1.3339731285988487e-06, + "loss": 0.3619, + "step": 17160 + }, + { + "epoch": 0.8671498194490038, + "grad_norm": 7.1692765874676905, + "learning_rate": 1.3289221133447824e-06, + "loss": 0.3719, + "step": 17170 + }, + { + "epoch": 0.8676548572005758, + "grad_norm": 7.494321248279156, + "learning_rate": 1.3238710980907163e-06, + "loss": 0.345, + "step": 17180 + }, + { + "epoch": 0.8681598949521476, + "grad_norm": 3.809737865656533, + "learning_rate": 1.3188200828366502e-06, + "loss": 0.3482, + "step": 17190 + }, + { + "epoch": 0.8686649327037196, + "grad_norm": 2.566570715695477, + "learning_rate": 1.3137690675825843e-06, + "loss": 0.3606, + "step": 17200 + }, + { + "epoch": 0.8691699704552915, + "grad_norm": 5.642929958428042, + "learning_rate": 1.308718052328518e-06, + "loss": 0.3563, + "step": 17210 + }, + { + "epoch": 0.8696750082068635, + "grad_norm": 7.32713592653821, + "learning_rate": 1.303667037074452e-06, + "loss": 0.3546, + "step": 17220 + }, + { + "epoch": 0.8701800459584353, + "grad_norm": 3.958850560726077, + "learning_rate": 1.298616021820386e-06, + "loss": 0.3455, + "step": 17230 + }, + { + "epoch": 0.8706850837100073, + "grad_norm": 6.811151879320974, + "learning_rate": 1.29356500656632e-06, + "loss": 0.3549, + "step": 17240 + }, + { + "epoch": 0.8711901214615793, + "grad_norm": 7.69433841766783, + "learning_rate": 1.2885139913122539e-06, + "loss": 0.3521, + "step": 17250 + }, + { + "epoch": 0.8716951592131512, + "grad_norm": 2.703944056955305, + "learning_rate": 1.283462976058188e-06, + "loss": 0.364, + "step": 17260 + }, + { + "epoch": 0.8722001969647231, + "grad_norm": 2.6933087195097296, + "learning_rate": 1.2784119608041217e-06, + "loss": 0.3611, + "step": 17270 + }, + { + "epoch": 0.872705234716295, + "grad_norm": 5.320181701389919, + "learning_rate": 1.2733609455500556e-06, + "loss": 0.3645, + "step": 17280 + }, + { + "epoch": 0.873210272467867, + "grad_norm": 3.361877630451869, + "learning_rate": 1.2683099302959895e-06, + "loss": 0.3519, + "step": 17290 + }, + { + "epoch": 0.8737153102194389, + "grad_norm": 2.951104064745123, + "learning_rate": 1.2632589150419237e-06, + "loss": 0.3587, + "step": 17300 + }, + { + "epoch": 0.8742203479710108, + "grad_norm": 6.829151598791138, + "learning_rate": 1.2582078997878574e-06, + "loss": 0.3449, + "step": 17310 + }, + { + "epoch": 0.8747253857225827, + "grad_norm": 2.350852229769939, + "learning_rate": 1.2531568845337913e-06, + "loss": 0.3478, + "step": 17320 + }, + { + "epoch": 0.8752304234741547, + "grad_norm": 3.348683817462015, + "learning_rate": 1.2481058692797254e-06, + "loss": 0.3542, + "step": 17330 + }, + { + "epoch": 0.8757354612257267, + "grad_norm": 13.330436598568463, + "learning_rate": 1.2430548540256593e-06, + "loss": 0.3606, + "step": 17340 + }, + { + "epoch": 0.8762404989772985, + "grad_norm": 2.9324631629214233, + "learning_rate": 1.2380038387715932e-06, + "loss": 0.3403, + "step": 17350 + }, + { + "epoch": 0.8767455367288705, + "grad_norm": 12.462910905007668, + "learning_rate": 1.2329528235175271e-06, + "loss": 0.36, + "step": 17360 + }, + { + "epoch": 0.8772505744804424, + "grad_norm": 6.774737202335634, + "learning_rate": 1.227901808263461e-06, + "loss": 0.3529, + "step": 17370 + }, + { + "epoch": 0.8777556122320144, + "grad_norm": 6.599803354341647, + "learning_rate": 1.222850793009395e-06, + "loss": 0.349, + "step": 17380 + }, + { + "epoch": 0.8782606499835863, + "grad_norm": 4.087712138749244, + "learning_rate": 1.2177997777553289e-06, + "loss": 0.3585, + "step": 17390 + }, + { + "epoch": 0.8787656877351582, + "grad_norm": 5.90307508489925, + "learning_rate": 1.2127487625012628e-06, + "loss": 0.3779, + "step": 17400 + }, + { + "epoch": 0.8792707254867301, + "grad_norm": 2.9671035952158404, + "learning_rate": 1.2076977472471967e-06, + "loss": 0.3581, + "step": 17410 + }, + { + "epoch": 0.8797757632383021, + "grad_norm": 3.797739053184357, + "learning_rate": 1.2026467319931308e-06, + "loss": 0.3523, + "step": 17420 + }, + { + "epoch": 0.880280800989874, + "grad_norm": 2.753560543414613, + "learning_rate": 1.1975957167390647e-06, + "loss": 0.3525, + "step": 17430 + }, + { + "epoch": 0.8807858387414459, + "grad_norm": 9.720571030998466, + "learning_rate": 1.1925447014849986e-06, + "loss": 0.3541, + "step": 17440 + }, + { + "epoch": 0.8812908764930178, + "grad_norm": 3.0251393418296226, + "learning_rate": 1.1874936862309326e-06, + "loss": 0.346, + "step": 17450 + }, + { + "epoch": 0.8817959142445898, + "grad_norm": 3.841487482571985, + "learning_rate": 1.1824426709768665e-06, + "loss": 0.3667, + "step": 17460 + }, + { + "epoch": 0.8823009519961618, + "grad_norm": 3.3891055128420655, + "learning_rate": 1.1773916557228004e-06, + "loss": 0.34, + "step": 17470 + }, + { + "epoch": 0.8828059897477336, + "grad_norm": 6.131210762114432, + "learning_rate": 1.1723406404687343e-06, + "loss": 0.3631, + "step": 17480 + }, + { + "epoch": 0.8833110274993056, + "grad_norm": 3.2369764099154117, + "learning_rate": 1.1672896252146682e-06, + "loss": 0.3588, + "step": 17490 + }, + { + "epoch": 0.8838160652508775, + "grad_norm": 5.076564547768568, + "learning_rate": 1.1622386099606021e-06, + "loss": 0.3556, + "step": 17500 + }, + { + "epoch": 0.8843211030024495, + "grad_norm": 4.679877306058496, + "learning_rate": 1.1571875947065362e-06, + "loss": 0.344, + "step": 17510 + }, + { + "epoch": 0.8848261407540213, + "grad_norm": 11.330975476714967, + "learning_rate": 1.15213657945247e-06, + "loss": 0.3602, + "step": 17520 + }, + { + "epoch": 0.8853311785055933, + "grad_norm": 3.0144585760929576, + "learning_rate": 1.147085564198404e-06, + "loss": 0.3489, + "step": 17530 + }, + { + "epoch": 0.8858362162571652, + "grad_norm": 6.287423740993079, + "learning_rate": 1.142034548944338e-06, + "loss": 0.3558, + "step": 17540 + }, + { + "epoch": 0.8863412540087372, + "grad_norm": 3.489576000710248, + "learning_rate": 1.1369835336902719e-06, + "loss": 0.3531, + "step": 17550 + }, + { + "epoch": 0.886846291760309, + "grad_norm": 3.733608437865078, + "learning_rate": 1.1319325184362058e-06, + "loss": 0.375, + "step": 17560 + }, + { + "epoch": 0.887351329511881, + "grad_norm": 5.217817871919478, + "learning_rate": 1.1268815031821397e-06, + "loss": 0.3542, + "step": 17570 + }, + { + "epoch": 0.887856367263453, + "grad_norm": 2.799146326616282, + "learning_rate": 1.1218304879280736e-06, + "loss": 0.3483, + "step": 17580 + }, + { + "epoch": 0.8883614050150249, + "grad_norm": 2.6219946573680275, + "learning_rate": 1.1167794726740075e-06, + "loss": 0.3553, + "step": 17590 + }, + { + "epoch": 0.8888664427665968, + "grad_norm": 3.3194236691159764, + "learning_rate": 1.1117284574199414e-06, + "loss": 0.3714, + "step": 17600 + }, + { + "epoch": 0.8893714805181687, + "grad_norm": 7.465096104263228, + "learning_rate": 1.1066774421658756e-06, + "loss": 0.358, + "step": 17610 + }, + { + "epoch": 0.8898765182697407, + "grad_norm": 7.584088341048675, + "learning_rate": 1.1016264269118093e-06, + "loss": 0.3474, + "step": 17620 + }, + { + "epoch": 0.8903815560213126, + "grad_norm": 4.658333080329891, + "learning_rate": 1.0965754116577434e-06, + "loss": 0.3491, + "step": 17630 + }, + { + "epoch": 0.8908865937728845, + "grad_norm": 3.8001771174540377, + "learning_rate": 1.091524396403677e-06, + "loss": 0.354, + "step": 17640 + }, + { + "epoch": 0.8913916315244564, + "grad_norm": 2.5239942090163834, + "learning_rate": 1.0864733811496112e-06, + "loss": 0.3526, + "step": 17650 + }, + { + "epoch": 0.8918966692760284, + "grad_norm": 3.192080136278505, + "learning_rate": 1.0814223658955451e-06, + "loss": 0.3674, + "step": 17660 + }, + { + "epoch": 0.8924017070276004, + "grad_norm": 5.19946883439612, + "learning_rate": 1.076371350641479e-06, + "loss": 0.3575, + "step": 17670 + }, + { + "epoch": 0.8929067447791722, + "grad_norm": 4.214472251420109, + "learning_rate": 1.071320335387413e-06, + "loss": 0.3389, + "step": 17680 + }, + { + "epoch": 0.8934117825307442, + "grad_norm": 10.692110937064982, + "learning_rate": 1.0662693201333469e-06, + "loss": 0.3731, + "step": 17690 + }, + { + "epoch": 0.8939168202823161, + "grad_norm": 6.139646910075302, + "learning_rate": 1.0612183048792808e-06, + "loss": 0.3472, + "step": 17700 + }, + { + "epoch": 0.8944218580338881, + "grad_norm": 3.5048760100552157, + "learning_rate": 1.056167289625215e-06, + "loss": 0.3479, + "step": 17710 + }, + { + "epoch": 0.8949268957854599, + "grad_norm": 3.4931757105634254, + "learning_rate": 1.0511162743711486e-06, + "loss": 0.3546, + "step": 17720 + }, + { + "epoch": 0.8954319335370319, + "grad_norm": 7.742107901473751, + "learning_rate": 1.0460652591170827e-06, + "loss": 0.3501, + "step": 17730 + }, + { + "epoch": 0.8959369712886038, + "grad_norm": 2.8474052373770107, + "learning_rate": 1.0410142438630164e-06, + "loss": 0.3509, + "step": 17740 + }, + { + "epoch": 0.8964420090401758, + "grad_norm": 3.2173387588789186, + "learning_rate": 1.0359632286089505e-06, + "loss": 0.3513, + "step": 17750 + }, + { + "epoch": 0.8969470467917476, + "grad_norm": 6.294405766765221, + "learning_rate": 1.0309122133548845e-06, + "loss": 0.3566, + "step": 17760 + }, + { + "epoch": 0.8974520845433196, + "grad_norm": 2.4812196184280344, + "learning_rate": 1.0258611981008184e-06, + "loss": 0.3517, + "step": 17770 + }, + { + "epoch": 0.8979571222948916, + "grad_norm": 34.26237777325275, + "learning_rate": 1.0208101828467523e-06, + "loss": 0.3448, + "step": 17780 + }, + { + "epoch": 0.8984621600464635, + "grad_norm": 62.559391948683924, + "learning_rate": 1.0157591675926862e-06, + "loss": 0.361, + "step": 17790 + }, + { + "epoch": 0.8989671977980354, + "grad_norm": 3.0126489507596106, + "learning_rate": 1.01070815233862e-06, + "loss": 0.3594, + "step": 17800 + }, + { + "epoch": 0.8994722355496073, + "grad_norm": 4.4144846462481935, + "learning_rate": 1.0056571370845542e-06, + "loss": 0.3569, + "step": 17810 + }, + { + "epoch": 0.8999772733011793, + "grad_norm": 6.230988179874581, + "learning_rate": 1.000606121830488e-06, + "loss": 0.3383, + "step": 17820 + }, + { + "epoch": 0.9004823110527512, + "grad_norm": 2.294744478892904, + "learning_rate": 9.95555106576422e-07, + "loss": 0.3443, + "step": 17830 + }, + { + "epoch": 0.9009873488043231, + "grad_norm": 5.478102588014345, + "learning_rate": 9.905040913223558e-07, + "loss": 0.3469, + "step": 17840 + }, + { + "epoch": 0.901492386555895, + "grad_norm": 2.756012421267679, + "learning_rate": 9.854530760682899e-07, + "loss": 0.3629, + "step": 17850 + }, + { + "epoch": 0.901997424307467, + "grad_norm": 2.9051333748854504, + "learning_rate": 9.804020608142236e-07, + "loss": 0.3493, + "step": 17860 + }, + { + "epoch": 0.902502462059039, + "grad_norm": 2.942294785390223, + "learning_rate": 9.753510455601577e-07, + "loss": 0.3497, + "step": 17870 + }, + { + "epoch": 0.9030074998106108, + "grad_norm": 3.2373052706846135, + "learning_rate": 9.703000303060916e-07, + "loss": 0.3423, + "step": 17880 + }, + { + "epoch": 0.9035125375621827, + "grad_norm": 3.2500158251075266, + "learning_rate": 9.652490150520255e-07, + "loss": 0.3539, + "step": 17890 + }, + { + "epoch": 0.9040175753137547, + "grad_norm": 3.5580015551516104, + "learning_rate": 9.601979997979594e-07, + "loss": 0.3332, + "step": 17900 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 2.9893729880240203, + "learning_rate": 9.551469845438933e-07, + "loss": 0.3421, + "step": 17910 + }, + { + "epoch": 0.9050276508168985, + "grad_norm": 3.1028575176827182, + "learning_rate": 9.500959692898274e-07, + "loss": 0.3408, + "step": 17920 + }, + { + "epoch": 0.9055326885684705, + "grad_norm": 3.018466429936209, + "learning_rate": 9.450449540357613e-07, + "loss": 0.3444, + "step": 17930 + }, + { + "epoch": 0.9060377263200424, + "grad_norm": 5.330235007532234, + "learning_rate": 9.399939387816952e-07, + "loss": 0.3635, + "step": 17940 + }, + { + "epoch": 0.9065427640716144, + "grad_norm": 8.530098317004866, + "learning_rate": 9.349429235276292e-07, + "loss": 0.3499, + "step": 17950 + }, + { + "epoch": 0.9070478018231862, + "grad_norm": 5.312690518311376, + "learning_rate": 9.29891908273563e-07, + "loss": 0.3542, + "step": 17960 + }, + { + "epoch": 0.9075528395747582, + "grad_norm": 4.197543188110089, + "learning_rate": 9.24840893019497e-07, + "loss": 0.3545, + "step": 17970 + }, + { + "epoch": 0.9080578773263301, + "grad_norm": 8.064198853556876, + "learning_rate": 9.197898777654309e-07, + "loss": 0.3642, + "step": 17980 + }, + { + "epoch": 0.9085629150779021, + "grad_norm": 2.6318426760910656, + "learning_rate": 9.147388625113649e-07, + "loss": 0.3671, + "step": 17990 + }, + { + "epoch": 0.9090679528294741, + "grad_norm": 5.88110776094459, + "learning_rate": 9.096878472572989e-07, + "loss": 0.3278, + "step": 18000 + }, + { + "epoch": 0.9095729905810459, + "grad_norm": 7.327649689814793, + "learning_rate": 9.046368320032327e-07, + "loss": 0.3612, + "step": 18010 + }, + { + "epoch": 0.9100780283326179, + "grad_norm": 5.1865153794476235, + "learning_rate": 8.995858167491667e-07, + "loss": 0.3382, + "step": 18020 + }, + { + "epoch": 0.9105830660841898, + "grad_norm": 3.0447013025219696, + "learning_rate": 8.945348014951005e-07, + "loss": 0.3543, + "step": 18030 + }, + { + "epoch": 0.9110881038357618, + "grad_norm": 3.6031825002335753, + "learning_rate": 8.894837862410345e-07, + "loss": 0.3447, + "step": 18040 + }, + { + "epoch": 0.9115931415873336, + "grad_norm": 3.724192694212046, + "learning_rate": 8.844327709869685e-07, + "loss": 0.3435, + "step": 18050 + }, + { + "epoch": 0.9120981793389056, + "grad_norm": 3.9945852787473015, + "learning_rate": 8.793817557329023e-07, + "loss": 0.3618, + "step": 18060 + }, + { + "epoch": 0.9126032170904775, + "grad_norm": 4.943722371461341, + "learning_rate": 8.743307404788364e-07, + "loss": 0.3546, + "step": 18070 + }, + { + "epoch": 0.9131082548420495, + "grad_norm": 3.3003892909027313, + "learning_rate": 8.692797252247702e-07, + "loss": 0.3341, + "step": 18080 + }, + { + "epoch": 0.9136132925936213, + "grad_norm": 2.5050619983833826, + "learning_rate": 8.642287099707042e-07, + "loss": 0.3434, + "step": 18090 + }, + { + "epoch": 0.9141183303451933, + "grad_norm": 3.7372168096985408, + "learning_rate": 8.591776947166382e-07, + "loss": 0.3456, + "step": 18100 + }, + { + "epoch": 0.9146233680967653, + "grad_norm": 2.587657901087928, + "learning_rate": 8.54126679462572e-07, + "loss": 0.3442, + "step": 18110 + }, + { + "epoch": 0.9151284058483372, + "grad_norm": 2.9656578110467207, + "learning_rate": 8.49075664208506e-07, + "loss": 0.3588, + "step": 18120 + }, + { + "epoch": 0.9156334435999091, + "grad_norm": 3.3573402419593186, + "learning_rate": 8.440246489544398e-07, + "loss": 0.3277, + "step": 18130 + }, + { + "epoch": 0.916138481351481, + "grad_norm": 2.7211678821317205, + "learning_rate": 8.389736337003738e-07, + "loss": 0.3531, + "step": 18140 + }, + { + "epoch": 0.916643519103053, + "grad_norm": 5.733816461438319, + "learning_rate": 8.339226184463079e-07, + "loss": 0.3554, + "step": 18150 + }, + { + "epoch": 0.9171485568546249, + "grad_norm": 2.9296826900284536, + "learning_rate": 8.288716031922417e-07, + "loss": 0.3484, + "step": 18160 + }, + { + "epoch": 0.9176535946061968, + "grad_norm": 2.660140218702211, + "learning_rate": 8.238205879381757e-07, + "loss": 0.3456, + "step": 18170 + }, + { + "epoch": 0.9181586323577687, + "grad_norm": 2.5446985764311996, + "learning_rate": 8.187695726841095e-07, + "loss": 0.3609, + "step": 18180 + }, + { + "epoch": 0.9186636701093407, + "grad_norm": 4.659560104675861, + "learning_rate": 8.137185574300435e-07, + "loss": 0.3438, + "step": 18190 + }, + { + "epoch": 0.9191687078609126, + "grad_norm": 2.727989773393059, + "learning_rate": 8.086675421759773e-07, + "loss": 0.3573, + "step": 18200 + }, + { + "epoch": 0.9196737456124845, + "grad_norm": 2.415446839777949, + "learning_rate": 8.036165269219113e-07, + "loss": 0.353, + "step": 18210 + }, + { + "epoch": 0.9201787833640565, + "grad_norm": 4.842714491783439, + "learning_rate": 7.985655116678454e-07, + "loss": 0.3616, + "step": 18220 + }, + { + "epoch": 0.9206838211156284, + "grad_norm": 4.320254893741863, + "learning_rate": 7.935144964137792e-07, + "loss": 0.3441, + "step": 18230 + }, + { + "epoch": 0.9211888588672004, + "grad_norm": 7.676940904287274, + "learning_rate": 7.884634811597132e-07, + "loss": 0.3469, + "step": 18240 + }, + { + "epoch": 0.9216938966187722, + "grad_norm": 4.236663818861765, + "learning_rate": 7.834124659056471e-07, + "loss": 0.3415, + "step": 18250 + }, + { + "epoch": 0.9221989343703442, + "grad_norm": 2.5909282600390977, + "learning_rate": 7.78361450651581e-07, + "loss": 0.3735, + "step": 18260 + }, + { + "epoch": 0.9227039721219161, + "grad_norm": 3.095824958631971, + "learning_rate": 7.73310435397515e-07, + "loss": 0.3448, + "step": 18270 + }, + { + "epoch": 0.9232090098734881, + "grad_norm": 2.915058703898021, + "learning_rate": 7.682594201434488e-07, + "loss": 0.3605, + "step": 18280 + }, + { + "epoch": 0.9237140476250599, + "grad_norm": 4.0628438496453985, + "learning_rate": 7.632084048893828e-07, + "loss": 0.3522, + "step": 18290 + }, + { + "epoch": 0.9242190853766319, + "grad_norm": 5.339121668100427, + "learning_rate": 7.581573896353168e-07, + "loss": 0.348, + "step": 18300 + }, + { + "epoch": 0.9247241231282038, + "grad_norm": 2.9403014748581366, + "learning_rate": 7.531063743812507e-07, + "loss": 0.3436, + "step": 18310 + }, + { + "epoch": 0.9252291608797758, + "grad_norm": 4.4297759422952065, + "learning_rate": 7.480553591271847e-07, + "loss": 0.3688, + "step": 18320 + }, + { + "epoch": 0.9257341986313476, + "grad_norm": 8.718344498904465, + "learning_rate": 7.430043438731185e-07, + "loss": 0.3493, + "step": 18330 + }, + { + "epoch": 0.9262392363829196, + "grad_norm": 17.16627380229125, + "learning_rate": 7.379533286190525e-07, + "loss": 0.3635, + "step": 18340 + }, + { + "epoch": 0.9267442741344916, + "grad_norm": 6.9347616228201785, + "learning_rate": 7.329023133649864e-07, + "loss": 0.3443, + "step": 18350 + }, + { + "epoch": 0.9272493118860635, + "grad_norm": 4.584801866934316, + "learning_rate": 7.278512981109203e-07, + "loss": 0.3381, + "step": 18360 + }, + { + "epoch": 0.9277543496376354, + "grad_norm": 6.526194404732607, + "learning_rate": 7.228002828568542e-07, + "loss": 0.3795, + "step": 18370 + }, + { + "epoch": 0.9282593873892073, + "grad_norm": 2.8075224191637536, + "learning_rate": 7.177492676027882e-07, + "loss": 0.3473, + "step": 18380 + }, + { + "epoch": 0.9287644251407793, + "grad_norm": 8.151912813354414, + "learning_rate": 7.126982523487222e-07, + "loss": 0.3358, + "step": 18390 + }, + { + "epoch": 0.9292694628923512, + "grad_norm": 4.7898553816163245, + "learning_rate": 7.076472370946561e-07, + "loss": 0.3662, + "step": 18400 + }, + { + "epoch": 0.9297745006439231, + "grad_norm": 4.443345676540152, + "learning_rate": 7.0259622184059e-07, + "loss": 0.3576, + "step": 18410 + }, + { + "epoch": 0.930279538395495, + "grad_norm": 2.1359533523104464, + "learning_rate": 6.975452065865239e-07, + "loss": 0.3469, + "step": 18420 + }, + { + "epoch": 0.930784576147067, + "grad_norm": 37.17325982324839, + "learning_rate": 6.924941913324579e-07, + "loss": 0.3581, + "step": 18430 + }, + { + "epoch": 0.931289613898639, + "grad_norm": 9.673960641714096, + "learning_rate": 6.874431760783918e-07, + "loss": 0.3522, + "step": 18440 + }, + { + "epoch": 0.9317946516502108, + "grad_norm": 3.176528472152659, + "learning_rate": 6.823921608243258e-07, + "loss": 0.3464, + "step": 18450 + }, + { + "epoch": 0.9322996894017828, + "grad_norm": 6.300715383126615, + "learning_rate": 6.773411455702597e-07, + "loss": 0.3497, + "step": 18460 + }, + { + "epoch": 0.9328047271533547, + "grad_norm": 8.137473676823792, + "learning_rate": 6.722901303161936e-07, + "loss": 0.3542, + "step": 18470 + }, + { + "epoch": 0.9333097649049267, + "grad_norm": 3.7878098971135756, + "learning_rate": 6.672391150621276e-07, + "loss": 0.331, + "step": 18480 + }, + { + "epoch": 0.9338148026564985, + "grad_norm": 3.4537119018992883, + "learning_rate": 6.621880998080615e-07, + "loss": 0.349, + "step": 18490 + }, + { + "epoch": 0.9343198404080705, + "grad_norm": 5.7332136754175425, + "learning_rate": 6.571370845539954e-07, + "loss": 0.3686, + "step": 18500 + }, + { + "epoch": 0.9348248781596424, + "grad_norm": 2.40088922999737, + "learning_rate": 6.520860692999293e-07, + "loss": 0.3283, + "step": 18510 + }, + { + "epoch": 0.9353299159112144, + "grad_norm": 3.363962036008761, + "learning_rate": 6.470350540458632e-07, + "loss": 0.3438, + "step": 18520 + }, + { + "epoch": 0.9358349536627862, + "grad_norm": 4.168660835516454, + "learning_rate": 6.419840387917973e-07, + "loss": 0.3454, + "step": 18530 + }, + { + "epoch": 0.9363399914143582, + "grad_norm": 31.57273809211181, + "learning_rate": 6.369330235377312e-07, + "loss": 0.3525, + "step": 18540 + }, + { + "epoch": 0.9368450291659302, + "grad_norm": 2.6132048866458453, + "learning_rate": 6.318820082836651e-07, + "loss": 0.3445, + "step": 18550 + }, + { + "epoch": 0.9373500669175021, + "grad_norm": 3.121982815268045, + "learning_rate": 6.26830993029599e-07, + "loss": 0.3548, + "step": 18560 + }, + { + "epoch": 0.937855104669074, + "grad_norm": 3.878175783357159, + "learning_rate": 6.21779977775533e-07, + "loss": 0.3551, + "step": 18570 + }, + { + "epoch": 0.9383601424206459, + "grad_norm": 4.9399268768023035, + "learning_rate": 6.167289625214669e-07, + "loss": 0.3406, + "step": 18580 + }, + { + "epoch": 0.9388651801722179, + "grad_norm": 4.102118329658351, + "learning_rate": 6.116779472674008e-07, + "loss": 0.3415, + "step": 18590 + }, + { + "epoch": 0.9393702179237898, + "grad_norm": 7.576480207566005, + "learning_rate": 6.066269320133347e-07, + "loss": 0.3474, + "step": 18600 + }, + { + "epoch": 0.9398752556753618, + "grad_norm": 5.205648741417787, + "learning_rate": 6.015759167592687e-07, + "loss": 0.3701, + "step": 18610 + }, + { + "epoch": 0.9403802934269336, + "grad_norm": 2.574703645382877, + "learning_rate": 5.965249015052026e-07, + "loss": 0.3711, + "step": 18620 + }, + { + "epoch": 0.9408853311785056, + "grad_norm": 2.1710201302013608, + "learning_rate": 5.914738862511366e-07, + "loss": 0.3474, + "step": 18630 + }, + { + "epoch": 0.9413903689300775, + "grad_norm": 4.238488989950128, + "learning_rate": 5.864228709970705e-07, + "loss": 0.3458, + "step": 18640 + }, + { + "epoch": 0.9418954066816495, + "grad_norm": 3.9298261035180815, + "learning_rate": 5.813718557430044e-07, + "loss": 0.3517, + "step": 18650 + }, + { + "epoch": 0.9424004444332214, + "grad_norm": 4.183286579494331, + "learning_rate": 5.763208404889383e-07, + "loss": 0.3535, + "step": 18660 + }, + { + "epoch": 0.9429054821847933, + "grad_norm": 3.787432945522304, + "learning_rate": 5.712698252348722e-07, + "loss": 0.3438, + "step": 18670 + }, + { + "epoch": 0.9434105199363653, + "grad_norm": 42.44616395667764, + "learning_rate": 5.662188099808063e-07, + "loss": 0.3487, + "step": 18680 + }, + { + "epoch": 0.9439155576879372, + "grad_norm": 2.467350994396614, + "learning_rate": 5.611677947267402e-07, + "loss": 0.3495, + "step": 18690 + }, + { + "epoch": 0.9444205954395091, + "grad_norm": 3.4763055801021197, + "learning_rate": 5.561167794726741e-07, + "loss": 0.3471, + "step": 18700 + }, + { + "epoch": 0.944925633191081, + "grad_norm": 2.5579533898925604, + "learning_rate": 5.51065764218608e-07, + "loss": 0.3534, + "step": 18710 + }, + { + "epoch": 0.945430670942653, + "grad_norm": 4.548010779885983, + "learning_rate": 5.460147489645419e-07, + "loss": 0.3471, + "step": 18720 + }, + { + "epoch": 0.9459357086942249, + "grad_norm": 6.469724337112102, + "learning_rate": 5.409637337104758e-07, + "loss": 0.3519, + "step": 18730 + }, + { + "epoch": 0.9464407464457968, + "grad_norm": 3.2909861731073007, + "learning_rate": 5.359127184564098e-07, + "loss": 0.3455, + "step": 18740 + }, + { + "epoch": 0.9469457841973687, + "grad_norm": 3.9954669422093674, + "learning_rate": 5.308617032023437e-07, + "loss": 0.3484, + "step": 18750 + }, + { + "epoch": 0.9474508219489407, + "grad_norm": 4.918272865216922, + "learning_rate": 5.258106879482777e-07, + "loss": 0.3543, + "step": 18760 + }, + { + "epoch": 0.9479558597005127, + "grad_norm": 7.001698550237223, + "learning_rate": 5.207596726942116e-07, + "loss": 0.35, + "step": 18770 + }, + { + "epoch": 0.9484608974520845, + "grad_norm": 5.052929072371654, + "learning_rate": 5.157086574401455e-07, + "loss": 0.3442, + "step": 18780 + }, + { + "epoch": 0.9489659352036565, + "grad_norm": 4.671072953558266, + "learning_rate": 5.106576421860794e-07, + "loss": 0.3413, + "step": 18790 + }, + { + "epoch": 0.9494709729552284, + "grad_norm": 2.2274459931275943, + "learning_rate": 5.056066269320134e-07, + "loss": 0.3579, + "step": 18800 + }, + { + "epoch": 0.9499760107068004, + "grad_norm": 3.679235828488556, + "learning_rate": 5.005556116779473e-07, + "loss": 0.3482, + "step": 18810 + }, + { + "epoch": 0.9504810484583722, + "grad_norm": 5.113998461225806, + "learning_rate": 4.955045964238812e-07, + "loss": 0.3492, + "step": 18820 + }, + { + "epoch": 0.9509860862099442, + "grad_norm": 2.313685561005164, + "learning_rate": 4.904535811698151e-07, + "loss": 0.3498, + "step": 18830 + }, + { + "epoch": 0.9514911239615161, + "grad_norm": 4.548823482232416, + "learning_rate": 4.854025659157491e-07, + "loss": 0.3381, + "step": 18840 + }, + { + "epoch": 0.9519961617130881, + "grad_norm": 5.44596462224921, + "learning_rate": 4.803515506616831e-07, + "loss": 0.3499, + "step": 18850 + }, + { + "epoch": 0.9525011994646599, + "grad_norm": 5.653637436781184, + "learning_rate": 4.75300535407617e-07, + "loss": 0.3606, + "step": 18860 + }, + { + "epoch": 0.9530062372162319, + "grad_norm": 17.20756057482154, + "learning_rate": 4.702495201535509e-07, + "loss": 0.3513, + "step": 18870 + }, + { + "epoch": 0.9535112749678039, + "grad_norm": 2.6415906411811094, + "learning_rate": 4.651985048994848e-07, + "loss": 0.3584, + "step": 18880 + }, + { + "epoch": 0.9540163127193758, + "grad_norm": 3.3045378472333926, + "learning_rate": 4.601474896454188e-07, + "loss": 0.3672, + "step": 18890 + }, + { + "epoch": 0.9545213504709477, + "grad_norm": 2.494761338609364, + "learning_rate": 4.550964743913527e-07, + "loss": 0.3479, + "step": 18900 + }, + { + "epoch": 0.9550263882225196, + "grad_norm": 3.7846428264832346, + "learning_rate": 4.5004545913728665e-07, + "loss": 0.3594, + "step": 18910 + }, + { + "epoch": 0.9555314259740916, + "grad_norm": 11.41402543242729, + "learning_rate": 4.4499444388322056e-07, + "loss": 0.3391, + "step": 18920 + }, + { + "epoch": 0.9560364637256635, + "grad_norm": 23.109764846483394, + "learning_rate": 4.399434286291545e-07, + "loss": 0.3657, + "step": 18930 + }, + { + "epoch": 0.9565415014772354, + "grad_norm": 2.8683015901455313, + "learning_rate": 4.3489241337508844e-07, + "loss": 0.365, + "step": 18940 + }, + { + "epoch": 0.9570465392288073, + "grad_norm": 3.310696390265973, + "learning_rate": 4.2984139812102235e-07, + "loss": 0.3692, + "step": 18950 + }, + { + "epoch": 0.9575515769803793, + "grad_norm": 3.223732467814084, + "learning_rate": 4.247903828669563e-07, + "loss": 0.3268, + "step": 18960 + }, + { + "epoch": 0.9580566147319513, + "grad_norm": 3.067086167535032, + "learning_rate": 4.1973936761289023e-07, + "loss": 0.353, + "step": 18970 + }, + { + "epoch": 0.9585616524835231, + "grad_norm": 4.735815421841188, + "learning_rate": 4.1468835235882414e-07, + "loss": 0.3444, + "step": 18980 + }, + { + "epoch": 0.959066690235095, + "grad_norm": 3.425925550819413, + "learning_rate": 4.096373371047581e-07, + "loss": 0.3385, + "step": 18990 + }, + { + "epoch": 0.959571727986667, + "grad_norm": 4.169456686292674, + "learning_rate": 4.04586321850692e-07, + "loss": 0.3635, + "step": 19000 + }, + { + "epoch": 0.960076765738239, + "grad_norm": 2.7883327435971967, + "learning_rate": 3.9953530659662593e-07, + "loss": 0.3496, + "step": 19010 + }, + { + "epoch": 0.9605818034898108, + "grad_norm": 6.631719541255136, + "learning_rate": 3.944842913425599e-07, + "loss": 0.3536, + "step": 19020 + }, + { + "epoch": 0.9610868412413828, + "grad_norm": 6.199402518918171, + "learning_rate": 3.8943327608849386e-07, + "loss": 0.3492, + "step": 19030 + }, + { + "epoch": 0.9615918789929547, + "grad_norm": 3.832122639682839, + "learning_rate": 3.8438226083442777e-07, + "loss": 0.3559, + "step": 19040 + }, + { + "epoch": 0.9620969167445267, + "grad_norm": 5.054933419698706, + "learning_rate": 3.793312455803617e-07, + "loss": 0.3396, + "step": 19050 + }, + { + "epoch": 0.9626019544960985, + "grad_norm": 4.931557429835319, + "learning_rate": 3.742802303262956e-07, + "loss": 0.361, + "step": 19060 + }, + { + "epoch": 0.9631069922476705, + "grad_norm": 2.8243354412545663, + "learning_rate": 3.692292150722295e-07, + "loss": 0.3519, + "step": 19070 + }, + { + "epoch": 0.9636120299992424, + "grad_norm": 2.9047769826763843, + "learning_rate": 3.641781998181635e-07, + "loss": 0.352, + "step": 19080 + }, + { + "epoch": 0.9641170677508144, + "grad_norm": 3.5750399190488586, + "learning_rate": 3.5912718456409744e-07, + "loss": 0.3515, + "step": 19090 + }, + { + "epoch": 0.9646221055023863, + "grad_norm": 4.083793717766302, + "learning_rate": 3.5407616931003135e-07, + "loss": 0.3364, + "step": 19100 + }, + { + "epoch": 0.9651271432539582, + "grad_norm": 2.237959887146547, + "learning_rate": 3.4902515405596526e-07, + "loss": 0.3451, + "step": 19110 + }, + { + "epoch": 0.9656321810055302, + "grad_norm": 3.1768900583422597, + "learning_rate": 3.4397413880189917e-07, + "loss": 0.3468, + "step": 19120 + }, + { + "epoch": 0.9661372187571021, + "grad_norm": 4.21068826694376, + "learning_rate": 3.389231235478332e-07, + "loss": 0.3412, + "step": 19130 + }, + { + "epoch": 0.966642256508674, + "grad_norm": 5.657422019205803, + "learning_rate": 3.338721082937671e-07, + "loss": 0.3439, + "step": 19140 + }, + { + "epoch": 0.9671472942602459, + "grad_norm": 6.306482550657807, + "learning_rate": 3.28821093039701e-07, + "loss": 0.3668, + "step": 19150 + }, + { + "epoch": 0.9676523320118179, + "grad_norm": 3.949528139283546, + "learning_rate": 3.237700777856349e-07, + "loss": 0.3521, + "step": 19160 + }, + { + "epoch": 0.9681573697633898, + "grad_norm": 3.8577102555565457, + "learning_rate": 3.1871906253156884e-07, + "loss": 0.3432, + "step": 19170 + }, + { + "epoch": 0.9686624075149618, + "grad_norm": 10.01075740910154, + "learning_rate": 3.1366804727750275e-07, + "loss": 0.349, + "step": 19180 + }, + { + "epoch": 0.9691674452665336, + "grad_norm": 5.366835858110643, + "learning_rate": 3.086170320234367e-07, + "loss": 0.3347, + "step": 19190 + }, + { + "epoch": 0.9696724830181056, + "grad_norm": 3.0600066927692553, + "learning_rate": 3.035660167693707e-07, + "loss": 0.3484, + "step": 19200 + }, + { + "epoch": 0.9701775207696776, + "grad_norm": 3.3199457968898933, + "learning_rate": 2.985150015153046e-07, + "loss": 0.3662, + "step": 19210 + }, + { + "epoch": 0.9706825585212495, + "grad_norm": 8.305484409899465, + "learning_rate": 2.934639862612385e-07, + "loss": 0.3401, + "step": 19220 + }, + { + "epoch": 0.9711875962728214, + "grad_norm": 4.5596702016391575, + "learning_rate": 2.8841297100717247e-07, + "loss": 0.3453, + "step": 19230 + }, + { + "epoch": 0.9716926340243933, + "grad_norm": 3.1404063614755056, + "learning_rate": 2.833619557531064e-07, + "loss": 0.3568, + "step": 19240 + }, + { + "epoch": 0.9721976717759653, + "grad_norm": 2.1279962723109542, + "learning_rate": 2.7831094049904034e-07, + "loss": 0.3652, + "step": 19250 + }, + { + "epoch": 0.9727027095275372, + "grad_norm": 8.103709676163785, + "learning_rate": 2.7325992524497426e-07, + "loss": 0.3599, + "step": 19260 + }, + { + "epoch": 0.9732077472791091, + "grad_norm": 3.9079606526402695, + "learning_rate": 2.6820890999090817e-07, + "loss": 0.3332, + "step": 19270 + }, + { + "epoch": 0.973712785030681, + "grad_norm": 2.834072428824953, + "learning_rate": 2.6315789473684213e-07, + "loss": 0.3587, + "step": 19280 + }, + { + "epoch": 0.974217822782253, + "grad_norm": 3.331437469521485, + "learning_rate": 2.5810687948277604e-07, + "loss": 0.3532, + "step": 19290 + }, + { + "epoch": 0.974722860533825, + "grad_norm": 4.5771090753301955, + "learning_rate": 2.5305586422871e-07, + "loss": 0.3619, + "step": 19300 + }, + { + "epoch": 0.9752278982853968, + "grad_norm": 2.1169984635243897, + "learning_rate": 2.480048489746439e-07, + "loss": 0.3338, + "step": 19310 + }, + { + "epoch": 0.9757329360369688, + "grad_norm": 2.4727042377727075, + "learning_rate": 2.429538337205779e-07, + "loss": 0.348, + "step": 19320 + }, + { + "epoch": 0.9762379737885407, + "grad_norm": 4.382104299041976, + "learning_rate": 2.3790281846651177e-07, + "loss": 0.3608, + "step": 19330 + }, + { + "epoch": 0.9767430115401127, + "grad_norm": 24.576345877505307, + "learning_rate": 2.3285180321244574e-07, + "loss": 0.3344, + "step": 19340 + }, + { + "epoch": 0.9772480492916845, + "grad_norm": 5.068003336801587, + "learning_rate": 2.2780078795837965e-07, + "loss": 0.34, + "step": 19350 + }, + { + "epoch": 0.9777530870432565, + "grad_norm": 5.036323992558192, + "learning_rate": 2.2274977270431359e-07, + "loss": 0.3534, + "step": 19360 + }, + { + "epoch": 0.9782581247948284, + "grad_norm": 8.0901350202218, + "learning_rate": 2.1769875745024752e-07, + "loss": 0.3563, + "step": 19370 + }, + { + "epoch": 0.9787631625464004, + "grad_norm": 3.2338907871203575, + "learning_rate": 2.1264774219618146e-07, + "loss": 0.342, + "step": 19380 + }, + { + "epoch": 0.9792682002979722, + "grad_norm": 7.152607151035173, + "learning_rate": 2.0759672694211537e-07, + "loss": 0.3585, + "step": 19390 + }, + { + "epoch": 0.9797732380495442, + "grad_norm": 2.8408399191335234, + "learning_rate": 2.025457116880493e-07, + "loss": 0.3408, + "step": 19400 + }, + { + "epoch": 0.9802782758011162, + "grad_norm": 3.335420575405322, + "learning_rate": 1.9749469643398325e-07, + "loss": 0.3626, + "step": 19410 + }, + { + "epoch": 0.9807833135526881, + "grad_norm": 4.427107484896195, + "learning_rate": 1.9244368117991716e-07, + "loss": 0.3385, + "step": 19420 + }, + { + "epoch": 0.98128835130426, + "grad_norm": 5.431153322507852, + "learning_rate": 1.8739266592585113e-07, + "loss": 0.3513, + "step": 19430 + }, + { + "epoch": 0.9817933890558319, + "grad_norm": 3.813640172056062, + "learning_rate": 1.8234165067178504e-07, + "loss": 0.3393, + "step": 19440 + }, + { + "epoch": 0.9822984268074039, + "grad_norm": 3.898589096762485, + "learning_rate": 1.7729063541771895e-07, + "loss": 0.3405, + "step": 19450 + }, + { + "epoch": 0.9828034645589758, + "grad_norm": 3.037916644567326, + "learning_rate": 1.7223962016365292e-07, + "loss": 0.3519, + "step": 19460 + }, + { + "epoch": 0.9833085023105477, + "grad_norm": 3.190273896432863, + "learning_rate": 1.6718860490958683e-07, + "loss": 0.3533, + "step": 19470 + }, + { + "epoch": 0.9838135400621196, + "grad_norm": 5.858585403848123, + "learning_rate": 1.621375896555208e-07, + "loss": 0.3443, + "step": 19480 + }, + { + "epoch": 0.9843185778136916, + "grad_norm": 3.441915956155067, + "learning_rate": 1.570865744014547e-07, + "loss": 0.3554, + "step": 19490 + }, + { + "epoch": 0.9848236155652635, + "grad_norm": 3.7424368716852348, + "learning_rate": 1.5203555914738864e-07, + "loss": 0.3587, + "step": 19500 + }, + { + "epoch": 0.9853286533168354, + "grad_norm": 7.071299158156384, + "learning_rate": 1.4698454389332256e-07, + "loss": 0.3621, + "step": 19510 + }, + { + "epoch": 0.9858336910684073, + "grad_norm": 3.8198683754171587, + "learning_rate": 1.419335286392565e-07, + "loss": 0.3724, + "step": 19520 + }, + { + "epoch": 0.9863387288199793, + "grad_norm": 2.731071788924555, + "learning_rate": 1.3688251338519043e-07, + "loss": 0.3416, + "step": 19530 + }, + { + "epoch": 0.9868437665715513, + "grad_norm": 3.7212383700962652, + "learning_rate": 1.3183149813112437e-07, + "loss": 0.3473, + "step": 19540 + }, + { + "epoch": 0.9873488043231231, + "grad_norm": 3.302454861812917, + "learning_rate": 1.267804828770583e-07, + "loss": 0.3567, + "step": 19550 + }, + { + "epoch": 0.9878538420746951, + "grad_norm": 13.424373803274289, + "learning_rate": 1.2172946762299225e-07, + "loss": 0.3462, + "step": 19560 + }, + { + "epoch": 0.988358879826267, + "grad_norm": 2.5094569211373683, + "learning_rate": 1.1667845236892617e-07, + "loss": 0.3506, + "step": 19570 + }, + { + "epoch": 0.988863917577839, + "grad_norm": 8.676691313011856, + "learning_rate": 1.116274371148601e-07, + "loss": 0.3399, + "step": 19580 + }, + { + "epoch": 0.9893689553294108, + "grad_norm": 5.111065406394146, + "learning_rate": 1.0657642186079402e-07, + "loss": 0.3486, + "step": 19590 + }, + { + "epoch": 0.9898739930809828, + "grad_norm": 3.1643666611399976, + "learning_rate": 1.0152540660672796e-07, + "loss": 0.3463, + "step": 19600 + }, + { + "epoch": 0.9903790308325547, + "grad_norm": 3.7401113905250845, + "learning_rate": 9.647439135266189e-08, + "loss": 0.3462, + "step": 19610 + }, + { + "epoch": 0.9908840685841267, + "grad_norm": 3.1161613433451696, + "learning_rate": 9.142337609859582e-08, + "loss": 0.3595, + "step": 19620 + }, + { + "epoch": 0.9913891063356985, + "grad_norm": 7.461525216502298, + "learning_rate": 8.637236084452976e-08, + "loss": 0.3478, + "step": 19630 + }, + { + "epoch": 0.9918941440872705, + "grad_norm": 3.278360748256368, + "learning_rate": 8.13213455904637e-08, + "loss": 0.3468, + "step": 19640 + }, + { + "epoch": 0.9923991818388425, + "grad_norm": 2.3662646040461026, + "learning_rate": 7.627033033639761e-08, + "loss": 0.3334, + "step": 19650 + }, + { + "epoch": 0.9929042195904144, + "grad_norm": 4.061819314850138, + "learning_rate": 7.121931508233155e-08, + "loss": 0.339, + "step": 19660 + }, + { + "epoch": 0.9934092573419863, + "grad_norm": 3.0268165058919387, + "learning_rate": 6.616829982826549e-08, + "loss": 0.3358, + "step": 19670 + }, + { + "epoch": 0.9939142950935582, + "grad_norm": 4.547975258460046, + "learning_rate": 6.111728457419941e-08, + "loss": 0.3338, + "step": 19680 + }, + { + "epoch": 0.9944193328451302, + "grad_norm": 5.770374809427592, + "learning_rate": 5.606626932013335e-08, + "loss": 0.3448, + "step": 19690 + }, + { + "epoch": 0.9949243705967021, + "grad_norm": 2.864633009769063, + "learning_rate": 5.1015254066067285e-08, + "loss": 0.3305, + "step": 19700 + }, + { + "epoch": 0.995429408348274, + "grad_norm": 2.15308465344697, + "learning_rate": 4.5964238812001217e-08, + "loss": 0.3756, + "step": 19710 + }, + { + "epoch": 0.9959344460998459, + "grad_norm": 4.787969766327734, + "learning_rate": 4.091322355793514e-08, + "loss": 0.343, + "step": 19720 + }, + { + "epoch": 0.9964394838514179, + "grad_norm": 9.985119863463977, + "learning_rate": 3.586220830386908e-08, + "loss": 0.3436, + "step": 19730 + }, + { + "epoch": 0.9969445216029899, + "grad_norm": 2.6886443082611673, + "learning_rate": 3.081119304980301e-08, + "loss": 0.3517, + "step": 19740 + }, + { + "epoch": 0.9974495593545617, + "grad_norm": 7.198367620435292, + "learning_rate": 2.5760177795736944e-08, + "loss": 0.3608, + "step": 19750 + }, + { + "epoch": 0.9979545971061337, + "grad_norm": 2.4433957061834106, + "learning_rate": 2.0709162541670875e-08, + "loss": 0.3587, + "step": 19760 + }, + { + "epoch": 0.9984596348577056, + "grad_norm": 43.92952696515746, + "learning_rate": 1.565814728760481e-08, + "loss": 0.352, + "step": 19770 + }, + { + "epoch": 0.9989646726092776, + "grad_norm": 5.395974747202963, + "learning_rate": 1.0607132033538742e-08, + "loss": 0.3339, + "step": 19780 + }, + { + "epoch": 0.9994697103608495, + "grad_norm": 2.2549607655291393, + "learning_rate": 5.556116779472674e-09, + "loss": 0.3437, + "step": 19790 + }, + { + "epoch": 0.9999747481124214, + "grad_norm": 4.084387999304972, + "learning_rate": 5.051015254066067e-10, + "loss": 0.3312, + "step": 19800 + } + ], + "logging_steps": 10, + "max_steps": 19800, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.475990740795392e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}