diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12157 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999711408040172, + "eval_steps": 500, + "global_step": 17325, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005771839196559984, + "grad_norm": 15.207763525645962, + "learning_rate": 9.995959129481038e-06, + "loss": 2.6, + "step": 10 + }, + { + "epoch": 0.0011543678393119967, + "grad_norm": 12.615616581147215, + "learning_rate": 9.990186457311089e-06, + "loss": 0.7554, + "step": 20 + }, + { + "epoch": 0.0017315517589679952, + "grad_norm": 38.969453309289854, + "learning_rate": 9.984413785141142e-06, + "loss": 0.6704, + "step": 30 + }, + { + "epoch": 0.0023087356786239935, + "grad_norm": 22.432338112146017, + "learning_rate": 9.978641112971195e-06, + "loss": 0.6122, + "step": 40 + }, + { + "epoch": 0.0028859195982799918, + "grad_norm": 6.366544502174357, + "learning_rate": 9.972868440801248e-06, + "loss": 0.5704, + "step": 50 + }, + { + "epoch": 0.0034631035179359905, + "grad_norm": 5.991950420382146, + "learning_rate": 9.967095768631301e-06, + "loss": 0.5725, + "step": 60 + }, + { + "epoch": 0.004040287437591989, + "grad_norm": 27.360506966186097, + "learning_rate": 9.961323096461352e-06, + "loss": 0.5616, + "step": 70 + }, + { + "epoch": 0.004617471357247987, + "grad_norm": 7.6551554884885835, + "learning_rate": 9.955550424291405e-06, + "loss": 0.5581, + "step": 80 + }, + { + "epoch": 0.005194655276903985, + "grad_norm": 5.380738579039347, + "learning_rate": 9.949777752121458e-06, + "loss": 0.5386, + "step": 90 + }, + { + "epoch": 0.0057718391965599835, + "grad_norm": 14.701566820920394, + "learning_rate": 9.944005079951511e-06, + "loss": 0.5277, + "step": 100 + }, + { + "epoch": 0.006349023116215982, + "grad_norm": 24.2926776932226, + "learning_rate": 9.938232407781563e-06, + "loss": 0.534, + "step": 110 + }, + { + "epoch": 0.006926207035871981, + "grad_norm": 18.333541125312422, + "learning_rate": 9.932459735611616e-06, + "loss": 0.5357, + "step": 120 + }, + { + "epoch": 0.007503390955527979, + "grad_norm": 4.413124779621301, + "learning_rate": 9.926687063441667e-06, + "loss": 0.5189, + "step": 130 + }, + { + "epoch": 0.008080574875183977, + "grad_norm": 6.736449877624926, + "learning_rate": 9.920914391271722e-06, + "loss": 0.5089, + "step": 140 + }, + { + "epoch": 0.008657758794839977, + "grad_norm": 7.499085213610092, + "learning_rate": 9.915141719101773e-06, + "loss": 0.515, + "step": 150 + }, + { + "epoch": 0.009234942714495974, + "grad_norm": 7.213309423254868, + "learning_rate": 9.909369046931826e-06, + "loss": 0.516, + "step": 160 + }, + { + "epoch": 0.009812126634151973, + "grad_norm": 6.224375486360128, + "learning_rate": 9.903596374761877e-06, + "loss": 0.4787, + "step": 170 + }, + { + "epoch": 0.01038931055380797, + "grad_norm": 4.666329365397574, + "learning_rate": 9.89782370259193e-06, + "loss": 0.5457, + "step": 180 + }, + { + "epoch": 0.01096649447346397, + "grad_norm": 7.269438501572431, + "learning_rate": 9.892051030421983e-06, + "loss": 0.4999, + "step": 190 + }, + { + "epoch": 0.011543678393119967, + "grad_norm": 3.4282676515510633, + "learning_rate": 9.886278358252037e-06, + "loss": 0.4979, + "step": 200 + }, + { + "epoch": 0.012120862312775966, + "grad_norm": 6.398684245798025, + "learning_rate": 9.880505686082088e-06, + "loss": 0.5097, + "step": 210 + }, + { + "epoch": 0.012698046232431964, + "grad_norm": 9.308090815837401, + "learning_rate": 9.874733013912141e-06, + "loss": 0.5003, + "step": 220 + }, + { + "epoch": 0.013275230152087963, + "grad_norm": 6.572806422530607, + "learning_rate": 9.868960341742194e-06, + "loss": 0.5151, + "step": 230 + }, + { + "epoch": 0.013852414071743962, + "grad_norm": 3.9790215956458206, + "learning_rate": 9.863187669572247e-06, + "loss": 0.5036, + "step": 240 + }, + { + "epoch": 0.01442959799139996, + "grad_norm": 9.129499916264713, + "learning_rate": 9.857414997402298e-06, + "loss": 0.4993, + "step": 250 + }, + { + "epoch": 0.015006781911055958, + "grad_norm": 5.341270073182352, + "learning_rate": 9.851642325232351e-06, + "loss": 0.5068, + "step": 260 + }, + { + "epoch": 0.015583965830711956, + "grad_norm": 8.88171344021306, + "learning_rate": 9.845869653062403e-06, + "loss": 0.5106, + "step": 270 + }, + { + "epoch": 0.016161149750367955, + "grad_norm": 6.256421713727477, + "learning_rate": 9.840096980892456e-06, + "loss": 0.5012, + "step": 280 + }, + { + "epoch": 0.016738333670023954, + "grad_norm": 14.828284045011356, + "learning_rate": 9.834324308722509e-06, + "loss": 0.4916, + "step": 290 + }, + { + "epoch": 0.017315517589679953, + "grad_norm": 13.134769047818303, + "learning_rate": 9.828551636552562e-06, + "loss": 0.5023, + "step": 300 + }, + { + "epoch": 0.01789270150933595, + "grad_norm": 13.132579195124707, + "learning_rate": 9.822778964382613e-06, + "loss": 0.4979, + "step": 310 + }, + { + "epoch": 0.018469885428991948, + "grad_norm": 7.799223760078651, + "learning_rate": 9.817006292212666e-06, + "loss": 0.5047, + "step": 320 + }, + { + "epoch": 0.019047069348647947, + "grad_norm": 9.15638603603503, + "learning_rate": 9.811233620042719e-06, + "loss": 0.5089, + "step": 330 + }, + { + "epoch": 0.019624253268303946, + "grad_norm": 4.959585555459444, + "learning_rate": 9.805460947872772e-06, + "loss": 0.4952, + "step": 340 + }, + { + "epoch": 0.020201437187959942, + "grad_norm": 11.68075308396199, + "learning_rate": 9.799688275702823e-06, + "loss": 0.4947, + "step": 350 + }, + { + "epoch": 0.02077862110761594, + "grad_norm": 5.031742433330389, + "learning_rate": 9.793915603532876e-06, + "loss": 0.4722, + "step": 360 + }, + { + "epoch": 0.02135580502727194, + "grad_norm": 7.396670940592179, + "learning_rate": 9.788142931362928e-06, + "loss": 0.4784, + "step": 370 + }, + { + "epoch": 0.02193298894692794, + "grad_norm": 12.81464819318774, + "learning_rate": 9.78237025919298e-06, + "loss": 0.5016, + "step": 380 + }, + { + "epoch": 0.02251017286658394, + "grad_norm": 6.952612493218752, + "learning_rate": 9.776597587023034e-06, + "loss": 0.486, + "step": 390 + }, + { + "epoch": 0.023087356786239934, + "grad_norm": 3.8994319647851134, + "learning_rate": 9.770824914853087e-06, + "loss": 0.5066, + "step": 400 + }, + { + "epoch": 0.023664540705895933, + "grad_norm": 8.754480087072261, + "learning_rate": 9.765052242683138e-06, + "loss": 0.477, + "step": 410 + }, + { + "epoch": 0.024241724625551932, + "grad_norm": 14.00837557848952, + "learning_rate": 9.759279570513191e-06, + "loss": 0.4698, + "step": 420 + }, + { + "epoch": 0.02481890854520793, + "grad_norm": 17.668700193812345, + "learning_rate": 9.753506898343244e-06, + "loss": 0.4809, + "step": 430 + }, + { + "epoch": 0.025396092464863927, + "grad_norm": 7.562292913968297, + "learning_rate": 9.747734226173297e-06, + "loss": 0.5051, + "step": 440 + }, + { + "epoch": 0.025973276384519926, + "grad_norm": 10.300306620163619, + "learning_rate": 9.741961554003348e-06, + "loss": 0.5036, + "step": 450 + }, + { + "epoch": 0.026550460304175925, + "grad_norm": 10.310969163346487, + "learning_rate": 9.736188881833401e-06, + "loss": 0.5029, + "step": 460 + }, + { + "epoch": 0.027127644223831925, + "grad_norm": 5.858481709305026, + "learning_rate": 9.730416209663453e-06, + "loss": 0.5021, + "step": 470 + }, + { + "epoch": 0.027704828143487924, + "grad_norm": 92.82221880785818, + "learning_rate": 9.724643537493506e-06, + "loss": 0.487, + "step": 480 + }, + { + "epoch": 0.02828201206314392, + "grad_norm": 5.060105328004147, + "learning_rate": 9.718870865323559e-06, + "loss": 0.5053, + "step": 490 + }, + { + "epoch": 0.02885919598279992, + "grad_norm": 5.269210555899195, + "learning_rate": 9.713098193153612e-06, + "loss": 0.5028, + "step": 500 + }, + { + "epoch": 0.029436379902455918, + "grad_norm": 2.809713279481296, + "learning_rate": 9.707325520983663e-06, + "loss": 0.4789, + "step": 510 + }, + { + "epoch": 0.030013563822111917, + "grad_norm": 5.364421325217877, + "learning_rate": 9.701552848813716e-06, + "loss": 0.488, + "step": 520 + }, + { + "epoch": 0.030590747741767916, + "grad_norm": 81.42225421545997, + "learning_rate": 9.69578017664377e-06, + "loss": 0.4923, + "step": 530 + }, + { + "epoch": 0.03116793166142391, + "grad_norm": 5.472841855533951, + "learning_rate": 9.690007504473822e-06, + "loss": 0.4801, + "step": 540 + }, + { + "epoch": 0.03174511558107991, + "grad_norm": 9.314853911550161, + "learning_rate": 9.684234832303874e-06, + "loss": 0.4889, + "step": 550 + }, + { + "epoch": 0.03232229950073591, + "grad_norm": 5.657736368331141, + "learning_rate": 9.678462160133927e-06, + "loss": 0.468, + "step": 560 + }, + { + "epoch": 0.03289948342039191, + "grad_norm": 5.316343227859894, + "learning_rate": 9.672689487963978e-06, + "loss": 0.4636, + "step": 570 + }, + { + "epoch": 0.03347666734004791, + "grad_norm": 6.818051213532348, + "learning_rate": 9.666916815794033e-06, + "loss": 0.4785, + "step": 580 + }, + { + "epoch": 0.03405385125970391, + "grad_norm": 5.515733833721638, + "learning_rate": 9.661144143624086e-06, + "loss": 0.4676, + "step": 590 + }, + { + "epoch": 0.034631035179359906, + "grad_norm": 4.414342893400196, + "learning_rate": 9.655371471454137e-06, + "loss": 0.4536, + "step": 600 + }, + { + "epoch": 0.0352082190990159, + "grad_norm": 5.5253782212489355, + "learning_rate": 9.64959879928419e-06, + "loss": 0.4831, + "step": 610 + }, + { + "epoch": 0.0357854030186719, + "grad_norm": 3.47774426972913, + "learning_rate": 9.643826127114241e-06, + "loss": 0.4877, + "step": 620 + }, + { + "epoch": 0.0363625869383279, + "grad_norm": 7.150841155138223, + "learning_rate": 9.638053454944294e-06, + "loss": 0.4629, + "step": 630 + }, + { + "epoch": 0.036939770857983896, + "grad_norm": 3.7894713504219357, + "learning_rate": 9.632280782774347e-06, + "loss": 0.4816, + "step": 640 + }, + { + "epoch": 0.037516954777639895, + "grad_norm": 5.346386715612294, + "learning_rate": 9.6265081106044e-06, + "loss": 0.4861, + "step": 650 + }, + { + "epoch": 0.038094138697295894, + "grad_norm": 4.822544974681145, + "learning_rate": 9.620735438434452e-06, + "loss": 0.4756, + "step": 660 + }, + { + "epoch": 0.03867132261695189, + "grad_norm": 5.810598113453792, + "learning_rate": 9.614962766264505e-06, + "loss": 0.4752, + "step": 670 + }, + { + "epoch": 0.03924850653660789, + "grad_norm": 19.319543215067025, + "learning_rate": 9.609190094094558e-06, + "loss": 0.4664, + "step": 680 + }, + { + "epoch": 0.03982569045626389, + "grad_norm": 3.6009359491010864, + "learning_rate": 9.60341742192461e-06, + "loss": 0.481, + "step": 690 + }, + { + "epoch": 0.040402874375919884, + "grad_norm": 5.102687170049628, + "learning_rate": 9.597644749754662e-06, + "loss": 0.4751, + "step": 700 + }, + { + "epoch": 0.04098005829557588, + "grad_norm": 6.202922207392348, + "learning_rate": 9.591872077584715e-06, + "loss": 0.4849, + "step": 710 + }, + { + "epoch": 0.04155724221523188, + "grad_norm": 7.8448485511355965, + "learning_rate": 9.586099405414766e-06, + "loss": 0.4947, + "step": 720 + }, + { + "epoch": 0.04213442613488788, + "grad_norm": 3.9340219935160863, + "learning_rate": 9.58032673324482e-06, + "loss": 0.4948, + "step": 730 + }, + { + "epoch": 0.04271161005454388, + "grad_norm": 4.596617570306747, + "learning_rate": 9.574554061074873e-06, + "loss": 0.4743, + "step": 740 + }, + { + "epoch": 0.04328879397419988, + "grad_norm": 2.1114969175976923, + "learning_rate": 9.568781388904926e-06, + "loss": 0.4717, + "step": 750 + }, + { + "epoch": 0.04386597789385588, + "grad_norm": 5.18107474670299, + "learning_rate": 9.563008716734977e-06, + "loss": 0.468, + "step": 760 + }, + { + "epoch": 0.04444316181351188, + "grad_norm": 4.705370463352637, + "learning_rate": 9.55723604456503e-06, + "loss": 0.4594, + "step": 770 + }, + { + "epoch": 0.04502034573316788, + "grad_norm": 4.477204626343746, + "learning_rate": 9.551463372395083e-06, + "loss": 0.4726, + "step": 780 + }, + { + "epoch": 0.04559752965282387, + "grad_norm": 5.11055150918499, + "learning_rate": 9.545690700225136e-06, + "loss": 0.4878, + "step": 790 + }, + { + "epoch": 0.04617471357247987, + "grad_norm": 5.92970242815697, + "learning_rate": 9.539918028055187e-06, + "loss": 0.4562, + "step": 800 + }, + { + "epoch": 0.04675189749213587, + "grad_norm": 32.04814941479638, + "learning_rate": 9.53414535588524e-06, + "loss": 0.4668, + "step": 810 + }, + { + "epoch": 0.047329081411791867, + "grad_norm": 7.483269872312162, + "learning_rate": 9.528372683715292e-06, + "loss": 0.4593, + "step": 820 + }, + { + "epoch": 0.047906265331447866, + "grad_norm": 3.0548203710383026, + "learning_rate": 9.522600011545345e-06, + "loss": 0.4734, + "step": 830 + }, + { + "epoch": 0.048483449251103865, + "grad_norm": 2.846647873568613, + "learning_rate": 9.516827339375398e-06, + "loss": 0.4583, + "step": 840 + }, + { + "epoch": 0.049060633170759864, + "grad_norm": 2.9958931469528753, + "learning_rate": 9.51105466720545e-06, + "loss": 0.4503, + "step": 850 + }, + { + "epoch": 0.04963781709041586, + "grad_norm": 3.669267079128399, + "learning_rate": 9.505281995035502e-06, + "loss": 0.4543, + "step": 860 + }, + { + "epoch": 0.05021500101007186, + "grad_norm": 4.959841634234083, + "learning_rate": 9.499509322865555e-06, + "loss": 0.4638, + "step": 870 + }, + { + "epoch": 0.050792184929727854, + "grad_norm": 6.2400754071583355, + "learning_rate": 9.493736650695608e-06, + "loss": 0.4675, + "step": 880 + }, + { + "epoch": 0.051369368849383854, + "grad_norm": 4.8904554144848325, + "learning_rate": 9.487963978525661e-06, + "loss": 0.4795, + "step": 890 + }, + { + "epoch": 0.05194655276903985, + "grad_norm": 5.718936473226969, + "learning_rate": 9.482191306355712e-06, + "loss": 0.475, + "step": 900 + }, + { + "epoch": 0.05252373668869585, + "grad_norm": 2.813275923923208, + "learning_rate": 9.476418634185765e-06, + "loss": 0.4588, + "step": 910 + }, + { + "epoch": 0.05310092060835185, + "grad_norm": 3.7622872130635825, + "learning_rate": 9.470645962015817e-06, + "loss": 0.4738, + "step": 920 + }, + { + "epoch": 0.05367810452800785, + "grad_norm": 5.251759558774021, + "learning_rate": 9.464873289845871e-06, + "loss": 0.4548, + "step": 930 + }, + { + "epoch": 0.05425528844766385, + "grad_norm": 4.650175199113373, + "learning_rate": 9.459100617675923e-06, + "loss": 0.469, + "step": 940 + }, + { + "epoch": 0.05483247236731985, + "grad_norm": 19.07121359753558, + "learning_rate": 9.453327945505976e-06, + "loss": 0.4593, + "step": 950 + }, + { + "epoch": 0.05540965628697585, + "grad_norm": 4.899239522938927, + "learning_rate": 9.447555273336027e-06, + "loss": 0.4693, + "step": 960 + }, + { + "epoch": 0.05598684020663185, + "grad_norm": 11.615659586114845, + "learning_rate": 9.44178260116608e-06, + "loss": 0.4447, + "step": 970 + }, + { + "epoch": 0.05656402412628784, + "grad_norm": 10.04941518596728, + "learning_rate": 9.436009928996133e-06, + "loss": 0.4617, + "step": 980 + }, + { + "epoch": 0.05714120804594384, + "grad_norm": 8.509458377026059, + "learning_rate": 9.430237256826186e-06, + "loss": 0.4449, + "step": 990 + }, + { + "epoch": 0.05771839196559984, + "grad_norm": 7.280354559581083, + "learning_rate": 9.424464584656238e-06, + "loss": 0.4473, + "step": 1000 + }, + { + "epoch": 0.058295575885255836, + "grad_norm": 6.178315104553298, + "learning_rate": 9.41869191248629e-06, + "loss": 0.4489, + "step": 1010 + }, + { + "epoch": 0.058872759804911835, + "grad_norm": 4.326714170134293, + "learning_rate": 9.412919240316344e-06, + "loss": 0.4421, + "step": 1020 + }, + { + "epoch": 0.059449943724567834, + "grad_norm": 5.121692072940591, + "learning_rate": 9.407146568146397e-06, + "loss": 0.4474, + "step": 1030 + }, + { + "epoch": 0.060027127644223834, + "grad_norm": 9.081808249358685, + "learning_rate": 9.401373895976448e-06, + "loss": 0.4425, + "step": 1040 + }, + { + "epoch": 0.06060431156387983, + "grad_norm": 3.1699426939861644, + "learning_rate": 9.395601223806501e-06, + "loss": 0.4527, + "step": 1050 + }, + { + "epoch": 0.06118149548353583, + "grad_norm": 4.867955044244513, + "learning_rate": 9.389828551636552e-06, + "loss": 0.4364, + "step": 1060 + }, + { + "epoch": 0.061758679403191824, + "grad_norm": 4.757339575426131, + "learning_rate": 9.384055879466605e-06, + "loss": 0.4744, + "step": 1070 + }, + { + "epoch": 0.06233586332284782, + "grad_norm": 7.0039435743401235, + "learning_rate": 9.378283207296658e-06, + "loss": 0.4542, + "step": 1080 + }, + { + "epoch": 0.06291304724250382, + "grad_norm": 3.3067436767056964, + "learning_rate": 9.372510535126711e-06, + "loss": 0.4434, + "step": 1090 + }, + { + "epoch": 0.06349023116215982, + "grad_norm": 5.215962094810896, + "learning_rate": 9.366737862956763e-06, + "loss": 0.4413, + "step": 1100 + }, + { + "epoch": 0.06406741508181582, + "grad_norm": 5.30554157155974, + "learning_rate": 9.360965190786816e-06, + "loss": 0.4492, + "step": 1110 + }, + { + "epoch": 0.06464459900147182, + "grad_norm": 9.282132535227714, + "learning_rate": 9.355192518616869e-06, + "loss": 0.4552, + "step": 1120 + }, + { + "epoch": 0.06522178292112782, + "grad_norm": 4.436227742317887, + "learning_rate": 9.349419846446922e-06, + "loss": 0.4629, + "step": 1130 + }, + { + "epoch": 0.06579896684078382, + "grad_norm": 5.072582901364672, + "learning_rate": 9.343647174276975e-06, + "loss": 0.4394, + "step": 1140 + }, + { + "epoch": 0.06637615076043982, + "grad_norm": 3.6484490415265287, + "learning_rate": 9.337874502107026e-06, + "loss": 0.4404, + "step": 1150 + }, + { + "epoch": 0.06695333468009582, + "grad_norm": 5.179047870445265, + "learning_rate": 9.332101829937079e-06, + "loss": 0.4401, + "step": 1160 + }, + { + "epoch": 0.06753051859975182, + "grad_norm": 5.1113945913805345, + "learning_rate": 9.32632915776713e-06, + "loss": 0.4642, + "step": 1170 + }, + { + "epoch": 0.06810770251940781, + "grad_norm": 13.536539735261888, + "learning_rate": 9.320556485597183e-06, + "loss": 0.44, + "step": 1180 + }, + { + "epoch": 0.06868488643906381, + "grad_norm": 39.38178075944957, + "learning_rate": 9.314783813427236e-06, + "loss": 0.4401, + "step": 1190 + }, + { + "epoch": 0.06926207035871981, + "grad_norm": 5.347846132280397, + "learning_rate": 9.30901114125729e-06, + "loss": 0.4279, + "step": 1200 + }, + { + "epoch": 0.0698392542783758, + "grad_norm": 17.998657342305947, + "learning_rate": 9.30323846908734e-06, + "loss": 0.4289, + "step": 1210 + }, + { + "epoch": 0.0704164381980318, + "grad_norm": 3.352558494444376, + "learning_rate": 9.297465796917394e-06, + "loss": 0.4345, + "step": 1220 + }, + { + "epoch": 0.0709936221176878, + "grad_norm": 5.425024450686575, + "learning_rate": 9.291693124747447e-06, + "loss": 0.4551, + "step": 1230 + }, + { + "epoch": 0.0715708060373438, + "grad_norm": 5.053276208988115, + "learning_rate": 9.2859204525775e-06, + "loss": 0.455, + "step": 1240 + }, + { + "epoch": 0.0721479899569998, + "grad_norm": 4.369475575649534, + "learning_rate": 9.280147780407551e-06, + "loss": 0.4359, + "step": 1250 + }, + { + "epoch": 0.0727251738766558, + "grad_norm": 7.966064775548766, + "learning_rate": 9.274375108237604e-06, + "loss": 0.4408, + "step": 1260 + }, + { + "epoch": 0.07330235779631179, + "grad_norm": 2.9840558262546626, + "learning_rate": 9.268602436067656e-06, + "loss": 0.445, + "step": 1270 + }, + { + "epoch": 0.07387954171596779, + "grad_norm": 6.923266339159757, + "learning_rate": 9.26282976389771e-06, + "loss": 0.4319, + "step": 1280 + }, + { + "epoch": 0.07445672563562379, + "grad_norm": 5.445323456459659, + "learning_rate": 9.257057091727762e-06, + "loss": 0.4649, + "step": 1290 + }, + { + "epoch": 0.07503390955527979, + "grad_norm": 2.258392109239514, + "learning_rate": 9.251284419557815e-06, + "loss": 0.4435, + "step": 1300 + }, + { + "epoch": 0.07561109347493579, + "grad_norm": 6.963934239231957, + "learning_rate": 9.245511747387866e-06, + "loss": 0.4516, + "step": 1310 + }, + { + "epoch": 0.07618827739459179, + "grad_norm": 3.741922936601378, + "learning_rate": 9.239739075217919e-06, + "loss": 0.4457, + "step": 1320 + }, + { + "epoch": 0.07676546131424779, + "grad_norm": 4.697657485962023, + "learning_rate": 9.233966403047972e-06, + "loss": 0.4585, + "step": 1330 + }, + { + "epoch": 0.07734264523390379, + "grad_norm": 2.7800726567106886, + "learning_rate": 9.228193730878025e-06, + "loss": 0.4303, + "step": 1340 + }, + { + "epoch": 0.07791982915355979, + "grad_norm": 2.374574795518818, + "learning_rate": 9.222421058708076e-06, + "loss": 0.4468, + "step": 1350 + }, + { + "epoch": 0.07849701307321579, + "grad_norm": 3.379706889838213, + "learning_rate": 9.21664838653813e-06, + "loss": 0.4468, + "step": 1360 + }, + { + "epoch": 0.07907419699287178, + "grad_norm": 5.02793266796178, + "learning_rate": 9.210875714368182e-06, + "loss": 0.4541, + "step": 1370 + }, + { + "epoch": 0.07965138091252778, + "grad_norm": 5.377287835326619, + "learning_rate": 9.205103042198235e-06, + "loss": 0.4577, + "step": 1380 + }, + { + "epoch": 0.08022856483218377, + "grad_norm": 4.118238939930749, + "learning_rate": 9.199330370028287e-06, + "loss": 0.4535, + "step": 1390 + }, + { + "epoch": 0.08080574875183977, + "grad_norm": 3.6462071059200785, + "learning_rate": 9.19355769785834e-06, + "loss": 0.452, + "step": 1400 + }, + { + "epoch": 0.08138293267149577, + "grad_norm": 2.023709004387077, + "learning_rate": 9.187785025688391e-06, + "loss": 0.4385, + "step": 1410 + }, + { + "epoch": 0.08196011659115177, + "grad_norm": 2.718469268180074, + "learning_rate": 9.182012353518444e-06, + "loss": 0.4565, + "step": 1420 + }, + { + "epoch": 0.08253730051080777, + "grad_norm": 4.494982724398743, + "learning_rate": 9.176239681348497e-06, + "loss": 0.455, + "step": 1430 + }, + { + "epoch": 0.08311448443046376, + "grad_norm": 3.008004247279657, + "learning_rate": 9.17046700917855e-06, + "loss": 0.4338, + "step": 1440 + }, + { + "epoch": 0.08369166835011976, + "grad_norm": 4.471511152653035, + "learning_rate": 9.164694337008601e-06, + "loss": 0.4498, + "step": 1450 + }, + { + "epoch": 0.08426885226977576, + "grad_norm": 41.046308308564996, + "learning_rate": 9.158921664838654e-06, + "loss": 0.444, + "step": 1460 + }, + { + "epoch": 0.08484603618943176, + "grad_norm": 2.2817565591543087, + "learning_rate": 9.153148992668707e-06, + "loss": 0.4524, + "step": 1470 + }, + { + "epoch": 0.08542322010908776, + "grad_norm": 2.7552178530343974, + "learning_rate": 9.14737632049876e-06, + "loss": 0.4395, + "step": 1480 + }, + { + "epoch": 0.08600040402874376, + "grad_norm": 7.350119226751549, + "learning_rate": 9.141603648328812e-06, + "loss": 0.4439, + "step": 1490 + }, + { + "epoch": 0.08657758794839976, + "grad_norm": 4.927329771744563, + "learning_rate": 9.135830976158865e-06, + "loss": 0.4435, + "step": 1500 + }, + { + "epoch": 0.08715477186805576, + "grad_norm": 3.2061592885446433, + "learning_rate": 9.130058303988916e-06, + "loss": 0.4551, + "step": 1510 + }, + { + "epoch": 0.08773195578771176, + "grad_norm": 2.81815402264405, + "learning_rate": 9.12428563181897e-06, + "loss": 0.4541, + "step": 1520 + }, + { + "epoch": 0.08830913970736776, + "grad_norm": 1.8995196764329627, + "learning_rate": 9.118512959649022e-06, + "loss": 0.426, + "step": 1530 + }, + { + "epoch": 0.08888632362702376, + "grad_norm": 4.57597268577496, + "learning_rate": 9.112740287479075e-06, + "loss": 0.4388, + "step": 1540 + }, + { + "epoch": 0.08946350754667975, + "grad_norm": 4.888253541319005, + "learning_rate": 9.106967615309127e-06, + "loss": 0.4557, + "step": 1550 + }, + { + "epoch": 0.09004069146633575, + "grad_norm": 2.7824915648882853, + "learning_rate": 9.10119494313918e-06, + "loss": 0.4544, + "step": 1560 + }, + { + "epoch": 0.09061787538599175, + "grad_norm": 4.3535000068694645, + "learning_rate": 9.095422270969233e-06, + "loss": 0.4545, + "step": 1570 + }, + { + "epoch": 0.09119505930564774, + "grad_norm": 3.465323902631204, + "learning_rate": 9.089649598799286e-06, + "loss": 0.4343, + "step": 1580 + }, + { + "epoch": 0.09177224322530374, + "grad_norm": 3.685921584283666, + "learning_rate": 9.083876926629337e-06, + "loss": 0.4571, + "step": 1590 + }, + { + "epoch": 0.09234942714495974, + "grad_norm": 3.7270367548310457, + "learning_rate": 9.07810425445939e-06, + "loss": 0.4368, + "step": 1600 + }, + { + "epoch": 0.09292661106461574, + "grad_norm": 4.635316351567143, + "learning_rate": 9.072331582289441e-06, + "loss": 0.4333, + "step": 1610 + }, + { + "epoch": 0.09350379498427173, + "grad_norm": 7.835128472709014, + "learning_rate": 9.066558910119494e-06, + "loss": 0.4371, + "step": 1620 + }, + { + "epoch": 0.09408097890392773, + "grad_norm": 3.497453269659607, + "learning_rate": 9.060786237949547e-06, + "loss": 0.441, + "step": 1630 + }, + { + "epoch": 0.09465816282358373, + "grad_norm": 8.942601742638523, + "learning_rate": 9.0550135657796e-06, + "loss": 0.4381, + "step": 1640 + }, + { + "epoch": 0.09523534674323973, + "grad_norm": 4.589054052495082, + "learning_rate": 9.049240893609653e-06, + "loss": 0.4382, + "step": 1650 + }, + { + "epoch": 0.09581253066289573, + "grad_norm": 5.380659756568862, + "learning_rate": 9.043468221439705e-06, + "loss": 0.4409, + "step": 1660 + }, + { + "epoch": 0.09638971458255173, + "grad_norm": 2.868711750692323, + "learning_rate": 9.037695549269758e-06, + "loss": 0.4456, + "step": 1670 + }, + { + "epoch": 0.09696689850220773, + "grad_norm": 2.5446684388172884, + "learning_rate": 9.03192287709981e-06, + "loss": 0.4491, + "step": 1680 + }, + { + "epoch": 0.09754408242186373, + "grad_norm": 2.258241750087974, + "learning_rate": 9.026150204929864e-06, + "loss": 0.4366, + "step": 1690 + }, + { + "epoch": 0.09812126634151973, + "grad_norm": 2.602005403010541, + "learning_rate": 9.020377532759915e-06, + "loss": 0.4336, + "step": 1700 + }, + { + "epoch": 0.09869845026117573, + "grad_norm": 2.209187153695394, + "learning_rate": 9.014604860589968e-06, + "loss": 0.4438, + "step": 1710 + }, + { + "epoch": 0.09927563418083173, + "grad_norm": 3.0657261742371205, + "learning_rate": 9.008832188420021e-06, + "loss": 0.4438, + "step": 1720 + }, + { + "epoch": 0.09985281810048773, + "grad_norm": 3.199466707600606, + "learning_rate": 9.003059516250074e-06, + "loss": 0.4315, + "step": 1730 + }, + { + "epoch": 0.10043000202014372, + "grad_norm": 2.6315666639919657, + "learning_rate": 8.997286844080125e-06, + "loss": 0.4309, + "step": 1740 + }, + { + "epoch": 0.10100718593979972, + "grad_norm": 2.438482770459306, + "learning_rate": 8.991514171910178e-06, + "loss": 0.4385, + "step": 1750 + }, + { + "epoch": 0.10158436985945571, + "grad_norm": 17.29321542355523, + "learning_rate": 8.98574149974023e-06, + "loss": 0.4408, + "step": 1760 + }, + { + "epoch": 0.10216155377911171, + "grad_norm": 4.982684643457552, + "learning_rate": 8.979968827570283e-06, + "loss": 0.4298, + "step": 1770 + }, + { + "epoch": 0.10273873769876771, + "grad_norm": 6.799624435849738, + "learning_rate": 8.974196155400336e-06, + "loss": 0.4334, + "step": 1780 + }, + { + "epoch": 0.1033159216184237, + "grad_norm": 3.774388255492694, + "learning_rate": 8.968423483230389e-06, + "loss": 0.4308, + "step": 1790 + }, + { + "epoch": 0.1038931055380797, + "grad_norm": 10.033128674066695, + "learning_rate": 8.96265081106044e-06, + "loss": 0.4341, + "step": 1800 + }, + { + "epoch": 0.1044702894577357, + "grad_norm": 2.5219113776584123, + "learning_rate": 8.956878138890493e-06, + "loss": 0.4362, + "step": 1810 + }, + { + "epoch": 0.1050474733773917, + "grad_norm": 2.419786699423789, + "learning_rate": 8.951105466720546e-06, + "loss": 0.4242, + "step": 1820 + }, + { + "epoch": 0.1056246572970477, + "grad_norm": 60.08295959626865, + "learning_rate": 8.9453327945506e-06, + "loss": 0.4385, + "step": 1830 + }, + { + "epoch": 0.1062018412167037, + "grad_norm": 3.096217392102352, + "learning_rate": 8.93956012238065e-06, + "loss": 0.4241, + "step": 1840 + }, + { + "epoch": 0.1067790251363597, + "grad_norm": 2.5832709944291503, + "learning_rate": 8.933787450210704e-06, + "loss": 0.4361, + "step": 1850 + }, + { + "epoch": 0.1073562090560157, + "grad_norm": 2.3859955361814302, + "learning_rate": 8.928014778040755e-06, + "loss": 0.4213, + "step": 1860 + }, + { + "epoch": 0.1079333929756717, + "grad_norm": 3.223331997025027, + "learning_rate": 8.922242105870808e-06, + "loss": 0.4489, + "step": 1870 + }, + { + "epoch": 0.1085105768953277, + "grad_norm": 4.411834387897919, + "learning_rate": 8.916469433700861e-06, + "loss": 0.4336, + "step": 1880 + }, + { + "epoch": 0.1090877608149837, + "grad_norm": 9.832585313890915, + "learning_rate": 8.910696761530914e-06, + "loss": 0.4441, + "step": 1890 + }, + { + "epoch": 0.1096649447346397, + "grad_norm": 5.365037688190915, + "learning_rate": 8.904924089360965e-06, + "loss": 0.4367, + "step": 1900 + }, + { + "epoch": 0.1102421286542957, + "grad_norm": 3.819456675382363, + "learning_rate": 8.899151417191018e-06, + "loss": 0.4272, + "step": 1910 + }, + { + "epoch": 0.1108193125739517, + "grad_norm": 3.036048623046518, + "learning_rate": 8.893378745021071e-06, + "loss": 0.4338, + "step": 1920 + }, + { + "epoch": 0.1113964964936077, + "grad_norm": 3.2508150432730196, + "learning_rate": 8.887606072851124e-06, + "loss": 0.4469, + "step": 1930 + }, + { + "epoch": 0.1119736804132637, + "grad_norm": 2.9078235731412305, + "learning_rate": 8.881833400681176e-06, + "loss": 0.4452, + "step": 1940 + }, + { + "epoch": 0.11255086433291968, + "grad_norm": 2.4501912449235648, + "learning_rate": 8.876060728511229e-06, + "loss": 0.4246, + "step": 1950 + }, + { + "epoch": 0.11312804825257568, + "grad_norm": 6.242529869364432, + "learning_rate": 8.87028805634128e-06, + "loss": 0.4459, + "step": 1960 + }, + { + "epoch": 0.11370523217223168, + "grad_norm": 25.285733450369438, + "learning_rate": 8.864515384171333e-06, + "loss": 0.4377, + "step": 1970 + }, + { + "epoch": 0.11428241609188768, + "grad_norm": 3.177411911875137, + "learning_rate": 8.858742712001386e-06, + "loss": 0.4197, + "step": 1980 + }, + { + "epoch": 0.11485960001154367, + "grad_norm": 3.117021809901308, + "learning_rate": 8.852970039831439e-06, + "loss": 0.4147, + "step": 1990 + }, + { + "epoch": 0.11543678393119967, + "grad_norm": 3.919723512318409, + "learning_rate": 8.84719736766149e-06, + "loss": 0.4414, + "step": 2000 + }, + { + "epoch": 0.11601396785085567, + "grad_norm": 2.9342199285272796, + "learning_rate": 8.841424695491543e-06, + "loss": 0.427, + "step": 2010 + }, + { + "epoch": 0.11659115177051167, + "grad_norm": 1.918258053028722, + "learning_rate": 8.835652023321596e-06, + "loss": 0.4381, + "step": 2020 + }, + { + "epoch": 0.11716833569016767, + "grad_norm": 1.7751748211927454, + "learning_rate": 8.82987935115165e-06, + "loss": 0.4228, + "step": 2030 + }, + { + "epoch": 0.11774551960982367, + "grad_norm": 2.2012455830534385, + "learning_rate": 8.8241066789817e-06, + "loss": 0.4209, + "step": 2040 + }, + { + "epoch": 0.11832270352947967, + "grad_norm": 2.8237417214584544, + "learning_rate": 8.818334006811754e-06, + "loss": 0.4177, + "step": 2050 + }, + { + "epoch": 0.11889988744913567, + "grad_norm": 4.6392306678295805, + "learning_rate": 8.812561334641805e-06, + "loss": 0.4087, + "step": 2060 + }, + { + "epoch": 0.11947707136879167, + "grad_norm": 2.735063304359889, + "learning_rate": 8.80678866247186e-06, + "loss": 0.4272, + "step": 2070 + }, + { + "epoch": 0.12005425528844767, + "grad_norm": 3.0299802933466657, + "learning_rate": 8.801015990301911e-06, + "loss": 0.4371, + "step": 2080 + }, + { + "epoch": 0.12063143920810367, + "grad_norm": 5.124807390829953, + "learning_rate": 8.795243318131964e-06, + "loss": 0.4165, + "step": 2090 + }, + { + "epoch": 0.12120862312775967, + "grad_norm": 5.1154449505887385, + "learning_rate": 8.789470645962016e-06, + "loss": 0.4346, + "step": 2100 + }, + { + "epoch": 0.12178580704741566, + "grad_norm": 2.203947517060104, + "learning_rate": 8.783697973792069e-06, + "loss": 0.4225, + "step": 2110 + }, + { + "epoch": 0.12236299096707166, + "grad_norm": 2.1610133845450683, + "learning_rate": 8.777925301622122e-06, + "loss": 0.4121, + "step": 2120 + }, + { + "epoch": 0.12294017488672765, + "grad_norm": 13.953365114471524, + "learning_rate": 8.772152629452175e-06, + "loss": 0.4304, + "step": 2130 + }, + { + "epoch": 0.12351735880638365, + "grad_norm": 8.125499458140256, + "learning_rate": 8.766379957282226e-06, + "loss": 0.4351, + "step": 2140 + }, + { + "epoch": 0.12409454272603965, + "grad_norm": 10.554186301118584, + "learning_rate": 8.760607285112279e-06, + "loss": 0.4266, + "step": 2150 + }, + { + "epoch": 0.12467172664569565, + "grad_norm": 4.270037728186706, + "learning_rate": 8.754834612942332e-06, + "loss": 0.438, + "step": 2160 + }, + { + "epoch": 0.12524891056535165, + "grad_norm": 8.246386750659413, + "learning_rate": 8.749061940772385e-06, + "loss": 0.4283, + "step": 2170 + }, + { + "epoch": 0.12582609448500764, + "grad_norm": 3.506946209841679, + "learning_rate": 8.743289268602438e-06, + "loss": 0.4106, + "step": 2180 + }, + { + "epoch": 0.12640327840466364, + "grad_norm": 6.297671982703174, + "learning_rate": 8.73751659643249e-06, + "loss": 0.4329, + "step": 2190 + }, + { + "epoch": 0.12698046232431964, + "grad_norm": 3.18530097096491, + "learning_rate": 8.731743924262542e-06, + "loss": 0.4185, + "step": 2200 + }, + { + "epoch": 0.12755764624397564, + "grad_norm": 8.542498824615011, + "learning_rate": 8.725971252092594e-06, + "loss": 0.4255, + "step": 2210 + }, + { + "epoch": 0.12813483016363164, + "grad_norm": 4.533535099713348, + "learning_rate": 8.720198579922647e-06, + "loss": 0.423, + "step": 2220 + }, + { + "epoch": 0.12871201408328764, + "grad_norm": 3.889530684586912, + "learning_rate": 8.7144259077527e-06, + "loss": 0.4235, + "step": 2230 + }, + { + "epoch": 0.12928919800294364, + "grad_norm": 3.6486853451512777, + "learning_rate": 8.708653235582753e-06, + "loss": 0.4211, + "step": 2240 + }, + { + "epoch": 0.12986638192259964, + "grad_norm": 4.615918028596828, + "learning_rate": 8.702880563412804e-06, + "loss": 0.4088, + "step": 2250 + }, + { + "epoch": 0.13044356584225564, + "grad_norm": 3.850082033721987, + "learning_rate": 8.697107891242857e-06, + "loss": 0.4282, + "step": 2260 + }, + { + "epoch": 0.13102074976191164, + "grad_norm": 14.518871261878948, + "learning_rate": 8.69133521907291e-06, + "loss": 0.431, + "step": 2270 + }, + { + "epoch": 0.13159793368156764, + "grad_norm": 3.5577521941063797, + "learning_rate": 8.685562546902963e-06, + "loss": 0.4269, + "step": 2280 + }, + { + "epoch": 0.13217511760122364, + "grad_norm": 4.319284164265623, + "learning_rate": 8.679789874733014e-06, + "loss": 0.4151, + "step": 2290 + }, + { + "epoch": 0.13275230152087963, + "grad_norm": 5.95977091224079, + "learning_rate": 8.674017202563068e-06, + "loss": 0.434, + "step": 2300 + }, + { + "epoch": 0.13332948544053563, + "grad_norm": 5.717322004077494, + "learning_rate": 8.668244530393119e-06, + "loss": 0.412, + "step": 2310 + }, + { + "epoch": 0.13390666936019163, + "grad_norm": 4.1757793177667315, + "learning_rate": 8.662471858223172e-06, + "loss": 0.4154, + "step": 2320 + }, + { + "epoch": 0.13448385327984763, + "grad_norm": 2.4222311786523822, + "learning_rate": 8.656699186053225e-06, + "loss": 0.4075, + "step": 2330 + }, + { + "epoch": 0.13506103719950363, + "grad_norm": 4.366879694604865, + "learning_rate": 8.650926513883278e-06, + "loss": 0.4212, + "step": 2340 + }, + { + "epoch": 0.13563822111915963, + "grad_norm": 4.573113527280892, + "learning_rate": 8.64515384171333e-06, + "loss": 0.4235, + "step": 2350 + }, + { + "epoch": 0.13621540503881563, + "grad_norm": 2.7183246240824612, + "learning_rate": 8.639381169543382e-06, + "loss": 0.4067, + "step": 2360 + }, + { + "epoch": 0.13679258895847163, + "grad_norm": 2.858161612405957, + "learning_rate": 8.633608497373435e-06, + "loss": 0.404, + "step": 2370 + }, + { + "epoch": 0.13736977287812763, + "grad_norm": 4.083715288704242, + "learning_rate": 8.627835825203488e-06, + "loss": 0.4063, + "step": 2380 + }, + { + "epoch": 0.13794695679778363, + "grad_norm": 5.0777931108973595, + "learning_rate": 8.62206315303354e-06, + "loss": 0.422, + "step": 2390 + }, + { + "epoch": 0.13852414071743963, + "grad_norm": 7.507110618573909, + "learning_rate": 8.616290480863593e-06, + "loss": 0.421, + "step": 2400 + }, + { + "epoch": 0.1391013246370956, + "grad_norm": 5.833761997884338, + "learning_rate": 8.610517808693644e-06, + "loss": 0.4149, + "step": 2410 + }, + { + "epoch": 0.1396785085567516, + "grad_norm": 2.9291582156343523, + "learning_rate": 8.604745136523697e-06, + "loss": 0.4223, + "step": 2420 + }, + { + "epoch": 0.1402556924764076, + "grad_norm": 3.8527608900197245, + "learning_rate": 8.59897246435375e-06, + "loss": 0.4381, + "step": 2430 + }, + { + "epoch": 0.1408328763960636, + "grad_norm": 5.009308450797531, + "learning_rate": 8.593199792183803e-06, + "loss": 0.4265, + "step": 2440 + }, + { + "epoch": 0.1414100603157196, + "grad_norm": 4.305682778167611, + "learning_rate": 8.587427120013854e-06, + "loss": 0.422, + "step": 2450 + }, + { + "epoch": 0.1419872442353756, + "grad_norm": 4.119426729774013, + "learning_rate": 8.581654447843907e-06, + "loss": 0.4079, + "step": 2460 + }, + { + "epoch": 0.1425644281550316, + "grad_norm": 6.546817474930748, + "learning_rate": 8.57588177567396e-06, + "loss": 0.42, + "step": 2470 + }, + { + "epoch": 0.1431416120746876, + "grad_norm": 3.668016204374388, + "learning_rate": 8.570109103504013e-06, + "loss": 0.4323, + "step": 2480 + }, + { + "epoch": 0.1437187959943436, + "grad_norm": 4.577088984604785, + "learning_rate": 8.564336431334065e-06, + "loss": 0.412, + "step": 2490 + }, + { + "epoch": 0.1442959799139996, + "grad_norm": 5.57584596543327, + "learning_rate": 8.558563759164118e-06, + "loss": 0.4263, + "step": 2500 + }, + { + "epoch": 0.1448731638336556, + "grad_norm": 2.695769154250156, + "learning_rate": 8.55279108699417e-06, + "loss": 0.4107, + "step": 2510 + }, + { + "epoch": 0.1454503477533116, + "grad_norm": 13.95443843109925, + "learning_rate": 8.547018414824224e-06, + "loss": 0.4213, + "step": 2520 + }, + { + "epoch": 0.1460275316729676, + "grad_norm": 7.136056655011844, + "learning_rate": 8.541245742654275e-06, + "loss": 0.4274, + "step": 2530 + }, + { + "epoch": 0.14660471559262359, + "grad_norm": 6.544753723165297, + "learning_rate": 8.535473070484328e-06, + "loss": 0.414, + "step": 2540 + }, + { + "epoch": 0.14718189951227958, + "grad_norm": 5.397598920832979, + "learning_rate": 8.52970039831438e-06, + "loss": 0.4204, + "step": 2550 + }, + { + "epoch": 0.14775908343193558, + "grad_norm": 4.756636071432368, + "learning_rate": 8.523927726144432e-06, + "loss": 0.4124, + "step": 2560 + }, + { + "epoch": 0.14833626735159158, + "grad_norm": 14.516917457987418, + "learning_rate": 8.518155053974486e-06, + "loss": 0.4287, + "step": 2570 + }, + { + "epoch": 0.14891345127124758, + "grad_norm": 8.042684749735274, + "learning_rate": 8.512382381804539e-06, + "loss": 0.399, + "step": 2580 + }, + { + "epoch": 0.14949063519090358, + "grad_norm": 7.4003473379136775, + "learning_rate": 8.50660970963459e-06, + "loss": 0.4113, + "step": 2590 + }, + { + "epoch": 0.15006781911055958, + "grad_norm": 3.771759855408406, + "learning_rate": 8.500837037464643e-06, + "loss": 0.4056, + "step": 2600 + }, + { + "epoch": 0.15064500303021558, + "grad_norm": 3.6286193403799682, + "learning_rate": 8.495064365294696e-06, + "loss": 0.4246, + "step": 2610 + }, + { + "epoch": 0.15122218694987158, + "grad_norm": 3.863418428458885, + "learning_rate": 8.489291693124749e-06, + "loss": 0.4085, + "step": 2620 + }, + { + "epoch": 0.15179937086952758, + "grad_norm": 4.263367210964064, + "learning_rate": 8.4835190209548e-06, + "loss": 0.4223, + "step": 2630 + }, + { + "epoch": 0.15237655478918358, + "grad_norm": 3.337620340539389, + "learning_rate": 8.477746348784853e-06, + "loss": 0.4175, + "step": 2640 + }, + { + "epoch": 0.15295373870883958, + "grad_norm": 3.0267657042390788, + "learning_rate": 8.471973676614905e-06, + "loss": 0.4113, + "step": 2650 + }, + { + "epoch": 0.15353092262849558, + "grad_norm": 2.6106010567447893, + "learning_rate": 8.466201004444958e-06, + "loss": 0.4157, + "step": 2660 + }, + { + "epoch": 0.15410810654815157, + "grad_norm": 2.264152062153991, + "learning_rate": 8.46042833227501e-06, + "loss": 0.4338, + "step": 2670 + }, + { + "epoch": 0.15468529046780757, + "grad_norm": 1.9215310401757064, + "learning_rate": 8.454655660105064e-06, + "loss": 0.4114, + "step": 2680 + }, + { + "epoch": 0.15526247438746357, + "grad_norm": 2.615154051211967, + "learning_rate": 8.448882987935117e-06, + "loss": 0.4216, + "step": 2690 + }, + { + "epoch": 0.15583965830711957, + "grad_norm": 2.491915544143726, + "learning_rate": 8.443110315765168e-06, + "loss": 0.411, + "step": 2700 + }, + { + "epoch": 0.15641684222677557, + "grad_norm": 10.894040933901527, + "learning_rate": 8.437337643595221e-06, + "loss": 0.43, + "step": 2710 + }, + { + "epoch": 0.15699402614643157, + "grad_norm": 7.186343591522965, + "learning_rate": 8.431564971425274e-06, + "loss": 0.4119, + "step": 2720 + }, + { + "epoch": 0.15757121006608757, + "grad_norm": 11.15516113836552, + "learning_rate": 8.425792299255327e-06, + "loss": 0.4206, + "step": 2730 + }, + { + "epoch": 0.15814839398574357, + "grad_norm": 3.3849670049000626, + "learning_rate": 8.420019627085378e-06, + "loss": 0.4098, + "step": 2740 + }, + { + "epoch": 0.15872557790539957, + "grad_norm": 2.3478329302680176, + "learning_rate": 8.414246954915431e-06, + "loss": 0.4358, + "step": 2750 + }, + { + "epoch": 0.15930276182505557, + "grad_norm": 5.368826342998146, + "learning_rate": 8.408474282745483e-06, + "loss": 0.4114, + "step": 2760 + }, + { + "epoch": 0.15987994574471157, + "grad_norm": 2.778388386877281, + "learning_rate": 8.402701610575536e-06, + "loss": 0.4149, + "step": 2770 + }, + { + "epoch": 0.16045712966436754, + "grad_norm": 6.269723327157733, + "learning_rate": 8.396928938405589e-06, + "loss": 0.4059, + "step": 2780 + }, + { + "epoch": 0.16103431358402354, + "grad_norm": 3.51009855789622, + "learning_rate": 8.391156266235642e-06, + "loss": 0.4073, + "step": 2790 + }, + { + "epoch": 0.16161149750367954, + "grad_norm": 2.84216423707538, + "learning_rate": 8.385383594065693e-06, + "loss": 0.4249, + "step": 2800 + }, + { + "epoch": 0.16218868142333553, + "grad_norm": 4.458775881028344, + "learning_rate": 8.379610921895746e-06, + "loss": 0.4248, + "step": 2810 + }, + { + "epoch": 0.16276586534299153, + "grad_norm": 4.7717011271745875, + "learning_rate": 8.3738382497258e-06, + "loss": 0.4031, + "step": 2820 + }, + { + "epoch": 0.16334304926264753, + "grad_norm": 2.339306133151702, + "learning_rate": 8.368065577555852e-06, + "loss": 0.4089, + "step": 2830 + }, + { + "epoch": 0.16392023318230353, + "grad_norm": 2.242140863987944, + "learning_rate": 8.362292905385904e-06, + "loss": 0.4183, + "step": 2840 + }, + { + "epoch": 0.16449741710195953, + "grad_norm": 2.50085114867505, + "learning_rate": 8.356520233215957e-06, + "loss": 0.4209, + "step": 2850 + }, + { + "epoch": 0.16507460102161553, + "grad_norm": 2.426762382327375, + "learning_rate": 8.350747561046008e-06, + "loss": 0.4009, + "step": 2860 + }, + { + "epoch": 0.16565178494127153, + "grad_norm": 3.535990085619889, + "learning_rate": 8.344974888876063e-06, + "loss": 0.4145, + "step": 2870 + }, + { + "epoch": 0.16622896886092753, + "grad_norm": 1.9434196504603485, + "learning_rate": 8.339202216706114e-06, + "loss": 0.4029, + "step": 2880 + }, + { + "epoch": 0.16680615278058353, + "grad_norm": 3.647575465250371, + "learning_rate": 8.333429544536167e-06, + "loss": 0.4165, + "step": 2890 + }, + { + "epoch": 0.16738333670023953, + "grad_norm": 3.21796325121302, + "learning_rate": 8.327656872366218e-06, + "loss": 0.419, + "step": 2900 + }, + { + "epoch": 0.16796052061989553, + "grad_norm": 2.49775247695972, + "learning_rate": 8.321884200196271e-06, + "loss": 0.4179, + "step": 2910 + }, + { + "epoch": 0.16853770453955152, + "grad_norm": 1.992198339323461, + "learning_rate": 8.316111528026324e-06, + "loss": 0.4062, + "step": 2920 + }, + { + "epoch": 0.16911488845920752, + "grad_norm": 2.82923364634973, + "learning_rate": 8.310338855856377e-06, + "loss": 0.3997, + "step": 2930 + }, + { + "epoch": 0.16969207237886352, + "grad_norm": 2.414622342801528, + "learning_rate": 8.304566183686429e-06, + "loss": 0.3978, + "step": 2940 + }, + { + "epoch": 0.17026925629851952, + "grad_norm": 3.500929543134566, + "learning_rate": 8.298793511516482e-06, + "loss": 0.4214, + "step": 2950 + }, + { + "epoch": 0.17084644021817552, + "grad_norm": 4.533770301559384, + "learning_rate": 8.293020839346535e-06, + "loss": 0.4003, + "step": 2960 + }, + { + "epoch": 0.17142362413783152, + "grad_norm": 3.463538477874354, + "learning_rate": 8.287248167176588e-06, + "loss": 0.4049, + "step": 2970 + }, + { + "epoch": 0.17200080805748752, + "grad_norm": 2.6643523986040023, + "learning_rate": 8.281475495006639e-06, + "loss": 0.4157, + "step": 2980 + }, + { + "epoch": 0.17257799197714352, + "grad_norm": 5.1202436242415175, + "learning_rate": 8.275702822836692e-06, + "loss": 0.4119, + "step": 2990 + }, + { + "epoch": 0.17315517589679952, + "grad_norm": 4.375642688680107, + "learning_rate": 8.269930150666743e-06, + "loss": 0.4241, + "step": 3000 + }, + { + "epoch": 0.17373235981645552, + "grad_norm": 13.184415266130015, + "learning_rate": 8.264157478496796e-06, + "loss": 0.3931, + "step": 3010 + }, + { + "epoch": 0.17430954373611152, + "grad_norm": 3.5119329436961872, + "learning_rate": 8.25838480632685e-06, + "loss": 0.4332, + "step": 3020 + }, + { + "epoch": 0.17488672765576752, + "grad_norm": 2.057788212146213, + "learning_rate": 8.252612134156902e-06, + "loss": 0.4026, + "step": 3030 + }, + { + "epoch": 0.17546391157542351, + "grad_norm": 2.4025914280540293, + "learning_rate": 8.246839461986954e-06, + "loss": 0.4297, + "step": 3040 + }, + { + "epoch": 0.1760410954950795, + "grad_norm": 2.3273476646183187, + "learning_rate": 8.241066789817007e-06, + "loss": 0.4256, + "step": 3050 + }, + { + "epoch": 0.1766182794147355, + "grad_norm": 3.968034409940849, + "learning_rate": 8.23529411764706e-06, + "loss": 0.3911, + "step": 3060 + }, + { + "epoch": 0.1771954633343915, + "grad_norm": 3.5987187085577648, + "learning_rate": 8.229521445477113e-06, + "loss": 0.4137, + "step": 3070 + }, + { + "epoch": 0.1777726472540475, + "grad_norm": 1.6685524653169845, + "learning_rate": 8.223748773307164e-06, + "loss": 0.4115, + "step": 3080 + }, + { + "epoch": 0.1783498311737035, + "grad_norm": 15.206856747451255, + "learning_rate": 8.217976101137217e-06, + "loss": 0.3993, + "step": 3090 + }, + { + "epoch": 0.1789270150933595, + "grad_norm": 3.2248196017828685, + "learning_rate": 8.212203428967268e-06, + "loss": 0.4263, + "step": 3100 + }, + { + "epoch": 0.1795041990130155, + "grad_norm": 5.06230398614513, + "learning_rate": 8.206430756797322e-06, + "loss": 0.4241, + "step": 3110 + }, + { + "epoch": 0.1800813829326715, + "grad_norm": 24.83780832952265, + "learning_rate": 8.200658084627375e-06, + "loss": 0.4139, + "step": 3120 + }, + { + "epoch": 0.1806585668523275, + "grad_norm": 2.8321722632943094, + "learning_rate": 8.194885412457428e-06, + "loss": 0.4233, + "step": 3130 + }, + { + "epoch": 0.1812357507719835, + "grad_norm": 3.0260534676681727, + "learning_rate": 8.189112740287479e-06, + "loss": 0.4142, + "step": 3140 + }, + { + "epoch": 0.18181293469163948, + "grad_norm": 4.68099176672326, + "learning_rate": 8.183340068117532e-06, + "loss": 0.4118, + "step": 3150 + }, + { + "epoch": 0.18239011861129548, + "grad_norm": 11.310308197618612, + "learning_rate": 8.177567395947585e-06, + "loss": 0.415, + "step": 3160 + }, + { + "epoch": 0.18296730253095148, + "grad_norm": 4.572818458115979, + "learning_rate": 8.171794723777638e-06, + "loss": 0.4015, + "step": 3170 + }, + { + "epoch": 0.18354448645060747, + "grad_norm": 6.450421868687323, + "learning_rate": 8.16602205160769e-06, + "loss": 0.4263, + "step": 3180 + }, + { + "epoch": 0.18412167037026347, + "grad_norm": 3.4296934325228263, + "learning_rate": 8.160249379437742e-06, + "loss": 0.4124, + "step": 3190 + }, + { + "epoch": 0.18469885428991947, + "grad_norm": 9.959813392971029, + "learning_rate": 8.154476707267794e-06, + "loss": 0.4015, + "step": 3200 + }, + { + "epoch": 0.18527603820957547, + "grad_norm": 3.3029855110594695, + "learning_rate": 8.148704035097847e-06, + "loss": 0.4009, + "step": 3210 + }, + { + "epoch": 0.18585322212923147, + "grad_norm": 2.8097447076161273, + "learning_rate": 8.142931362927901e-06, + "loss": 0.4002, + "step": 3220 + }, + { + "epoch": 0.18643040604888747, + "grad_norm": 34.98571611934199, + "learning_rate": 8.137158690757953e-06, + "loss": 0.405, + "step": 3230 + }, + { + "epoch": 0.18700758996854347, + "grad_norm": 9.148426684066074, + "learning_rate": 8.131386018588006e-06, + "loss": 0.4044, + "step": 3240 + }, + { + "epoch": 0.18758477388819947, + "grad_norm": 4.173548086962396, + "learning_rate": 8.125613346418057e-06, + "loss": 0.4217, + "step": 3250 + }, + { + "epoch": 0.18816195780785547, + "grad_norm": 53.37411486094812, + "learning_rate": 8.11984067424811e-06, + "loss": 0.4138, + "step": 3260 + }, + { + "epoch": 0.18873914172751147, + "grad_norm": 2.3480379802050733, + "learning_rate": 8.114068002078163e-06, + "loss": 0.4033, + "step": 3270 + }, + { + "epoch": 0.18931632564716747, + "grad_norm": 2.221291345204337, + "learning_rate": 8.108295329908216e-06, + "loss": 0.3914, + "step": 3280 + }, + { + "epoch": 0.18989350956682347, + "grad_norm": 4.990540206022043, + "learning_rate": 8.102522657738267e-06, + "loss": 0.3985, + "step": 3290 + }, + { + "epoch": 0.19047069348647946, + "grad_norm": 4.025051683089965, + "learning_rate": 8.09674998556832e-06, + "loss": 0.415, + "step": 3300 + }, + { + "epoch": 0.19104787740613546, + "grad_norm": 2.5251837331957607, + "learning_rate": 8.090977313398373e-06, + "loss": 0.4133, + "step": 3310 + }, + { + "epoch": 0.19162506132579146, + "grad_norm": 2.380391015882718, + "learning_rate": 8.085204641228426e-06, + "loss": 0.3926, + "step": 3320 + }, + { + "epoch": 0.19220224524544746, + "grad_norm": 4.41017165770354, + "learning_rate": 8.079431969058478e-06, + "loss": 0.4211, + "step": 3330 + }, + { + "epoch": 0.19277942916510346, + "grad_norm": 2.3063890276035846, + "learning_rate": 8.07365929688853e-06, + "loss": 0.4138, + "step": 3340 + }, + { + "epoch": 0.19335661308475946, + "grad_norm": 4.271620366325266, + "learning_rate": 8.067886624718582e-06, + "loss": 0.4093, + "step": 3350 + }, + { + "epoch": 0.19393379700441546, + "grad_norm": 32.70807377417423, + "learning_rate": 8.062113952548635e-06, + "loss": 0.4035, + "step": 3360 + }, + { + "epoch": 0.19451098092407146, + "grad_norm": 2.4018179935448276, + "learning_rate": 8.056341280378688e-06, + "loss": 0.4126, + "step": 3370 + }, + { + "epoch": 0.19508816484372746, + "grad_norm": 3.1209658711505552, + "learning_rate": 8.050568608208741e-06, + "loss": 0.4073, + "step": 3380 + }, + { + "epoch": 0.19566534876338346, + "grad_norm": 3.6351629831980024, + "learning_rate": 8.044795936038793e-06, + "loss": 0.4041, + "step": 3390 + }, + { + "epoch": 0.19624253268303946, + "grad_norm": 3.652368336229329, + "learning_rate": 8.039023263868846e-06, + "loss": 0.4112, + "step": 3400 + }, + { + "epoch": 0.19681971660269545, + "grad_norm": 2.984404633116569, + "learning_rate": 8.033250591698899e-06, + "loss": 0.4134, + "step": 3410 + }, + { + "epoch": 0.19739690052235145, + "grad_norm": 2.4526774932392024, + "learning_rate": 8.027477919528952e-06, + "loss": 0.4011, + "step": 3420 + }, + { + "epoch": 0.19797408444200745, + "grad_norm": 4.571072305725846, + "learning_rate": 8.021705247359003e-06, + "loss": 0.4097, + "step": 3430 + }, + { + "epoch": 0.19855126836166345, + "grad_norm": 2.6003306321869895, + "learning_rate": 8.015932575189056e-06, + "loss": 0.406, + "step": 3440 + }, + { + "epoch": 0.19912845228131945, + "grad_norm": 4.171122512202554, + "learning_rate": 8.010159903019107e-06, + "loss": 0.4083, + "step": 3450 + }, + { + "epoch": 0.19970563620097545, + "grad_norm": 3.2250910237655006, + "learning_rate": 8.00438723084916e-06, + "loss": 0.415, + "step": 3460 + }, + { + "epoch": 0.20028282012063145, + "grad_norm": 4.871724300699273, + "learning_rate": 7.998614558679213e-06, + "loss": 0.4296, + "step": 3470 + }, + { + "epoch": 0.20086000404028745, + "grad_norm": 2.9738579217002887, + "learning_rate": 7.992841886509266e-06, + "loss": 0.4109, + "step": 3480 + }, + { + "epoch": 0.20143718795994345, + "grad_norm": 5.488557624180365, + "learning_rate": 7.987069214339318e-06, + "loss": 0.4203, + "step": 3490 + }, + { + "epoch": 0.20201437187959945, + "grad_norm": 6.30148557707432, + "learning_rate": 7.98129654216937e-06, + "loss": 0.4166, + "step": 3500 + }, + { + "epoch": 0.20259155579925545, + "grad_norm": 4.159946548536443, + "learning_rate": 7.975523869999424e-06, + "loss": 0.4126, + "step": 3510 + }, + { + "epoch": 0.20316873971891142, + "grad_norm": 15.334889223997754, + "learning_rate": 7.969751197829477e-06, + "loss": 0.3947, + "step": 3520 + }, + { + "epoch": 0.20374592363856742, + "grad_norm": 3.025285897677573, + "learning_rate": 7.963978525659528e-06, + "loss": 0.403, + "step": 3530 + }, + { + "epoch": 0.20432310755822342, + "grad_norm": 4.262956488693212, + "learning_rate": 7.958205853489581e-06, + "loss": 0.4066, + "step": 3540 + }, + { + "epoch": 0.20490029147787941, + "grad_norm": 2.5780055567960143, + "learning_rate": 7.952433181319632e-06, + "loss": 0.4032, + "step": 3550 + }, + { + "epoch": 0.20547747539753541, + "grad_norm": 4.956342345823864, + "learning_rate": 7.946660509149685e-06, + "loss": 0.4022, + "step": 3560 + }, + { + "epoch": 0.2060546593171914, + "grad_norm": 3.89966793053596, + "learning_rate": 7.940887836979738e-06, + "loss": 0.4059, + "step": 3570 + }, + { + "epoch": 0.2066318432368474, + "grad_norm": 2.598383914711163, + "learning_rate": 7.935115164809791e-06, + "loss": 0.3988, + "step": 3580 + }, + { + "epoch": 0.2072090271565034, + "grad_norm": 3.673402619499888, + "learning_rate": 7.929342492639843e-06, + "loss": 0.4117, + "step": 3590 + }, + { + "epoch": 0.2077862110761594, + "grad_norm": 5.7308049694271235, + "learning_rate": 7.923569820469896e-06, + "loss": 0.4008, + "step": 3600 + }, + { + "epoch": 0.2083633949958154, + "grad_norm": 7.935773567013425, + "learning_rate": 7.917797148299949e-06, + "loss": 0.3877, + "step": 3610 + }, + { + "epoch": 0.2089405789154714, + "grad_norm": 6.009048915243287, + "learning_rate": 7.912024476130002e-06, + "loss": 0.4065, + "step": 3620 + }, + { + "epoch": 0.2095177628351274, + "grad_norm": 2.95995624359623, + "learning_rate": 7.906251803960053e-06, + "loss": 0.401, + "step": 3630 + }, + { + "epoch": 0.2100949467547834, + "grad_norm": 3.1037746592103255, + "learning_rate": 7.900479131790106e-06, + "loss": 0.4013, + "step": 3640 + }, + { + "epoch": 0.2106721306744394, + "grad_norm": 6.029705929267264, + "learning_rate": 7.894706459620158e-06, + "loss": 0.4085, + "step": 3650 + }, + { + "epoch": 0.2112493145940954, + "grad_norm": 5.81081103172759, + "learning_rate": 7.888933787450212e-06, + "loss": 0.4062, + "step": 3660 + }, + { + "epoch": 0.2118264985137514, + "grad_norm": 6.6915581684773935, + "learning_rate": 7.883161115280264e-06, + "loss": 0.4146, + "step": 3670 + }, + { + "epoch": 0.2124036824334074, + "grad_norm": 4.483451936143915, + "learning_rate": 7.877388443110317e-06, + "loss": 0.3964, + "step": 3680 + }, + { + "epoch": 0.2129808663530634, + "grad_norm": 4.847478023953505, + "learning_rate": 7.871615770940368e-06, + "loss": 0.3968, + "step": 3690 + }, + { + "epoch": 0.2135580502727194, + "grad_norm": 4.734123259756348, + "learning_rate": 7.865843098770421e-06, + "loss": 0.411, + "step": 3700 + }, + { + "epoch": 0.2141352341923754, + "grad_norm": 67.93482554004888, + "learning_rate": 7.860070426600474e-06, + "loss": 0.4171, + "step": 3710 + }, + { + "epoch": 0.2147124181120314, + "grad_norm": 4.427723281232474, + "learning_rate": 7.854297754430527e-06, + "loss": 0.3901, + "step": 3720 + }, + { + "epoch": 0.2152896020316874, + "grad_norm": 5.970970375519135, + "learning_rate": 7.848525082260578e-06, + "loss": 0.392, + "step": 3730 + }, + { + "epoch": 0.2158667859513434, + "grad_norm": 3.364616367304386, + "learning_rate": 7.842752410090631e-06, + "loss": 0.4262, + "step": 3740 + }, + { + "epoch": 0.2164439698709994, + "grad_norm": 14.709875574791262, + "learning_rate": 7.836979737920684e-06, + "loss": 0.4189, + "step": 3750 + }, + { + "epoch": 0.2170211537906554, + "grad_norm": 34.85728037006575, + "learning_rate": 7.831207065750737e-06, + "loss": 0.4281, + "step": 3760 + }, + { + "epoch": 0.2175983377103114, + "grad_norm": 3.8349995808560835, + "learning_rate": 7.82543439358079e-06, + "loss": 0.4197, + "step": 3770 + }, + { + "epoch": 0.2181755216299674, + "grad_norm": 9.826012532840847, + "learning_rate": 7.819661721410842e-06, + "loss": 0.411, + "step": 3780 + }, + { + "epoch": 0.2187527055496234, + "grad_norm": 5.148779185873613, + "learning_rate": 7.813889049240895e-06, + "loss": 0.408, + "step": 3790 + }, + { + "epoch": 0.2193298894692794, + "grad_norm": 6.034690169401536, + "learning_rate": 7.808116377070946e-06, + "loss": 0.389, + "step": 3800 + }, + { + "epoch": 0.2199070733889354, + "grad_norm": 7.98093273312519, + "learning_rate": 7.802343704900999e-06, + "loss": 0.4087, + "step": 3810 + }, + { + "epoch": 0.2204842573085914, + "grad_norm": 20.795705430196968, + "learning_rate": 7.796571032731052e-06, + "loss": 0.4164, + "step": 3820 + }, + { + "epoch": 0.2210614412282474, + "grad_norm": 4.546394166434191, + "learning_rate": 7.790798360561105e-06, + "loss": 0.4198, + "step": 3830 + }, + { + "epoch": 0.2216386251479034, + "grad_norm": 2.8073314189395457, + "learning_rate": 7.785025688391156e-06, + "loss": 0.4201, + "step": 3840 + }, + { + "epoch": 0.2222158090675594, + "grad_norm": 24.430024193972454, + "learning_rate": 7.77925301622121e-06, + "loss": 0.3916, + "step": 3850 + }, + { + "epoch": 0.2227929929872154, + "grad_norm": 3.7314691801815525, + "learning_rate": 7.773480344051262e-06, + "loss": 0.4034, + "step": 3860 + }, + { + "epoch": 0.2233701769068714, + "grad_norm": 4.463974565740604, + "learning_rate": 7.767707671881316e-06, + "loss": 0.4177, + "step": 3870 + }, + { + "epoch": 0.2239473608265274, + "grad_norm": 3.935819483400635, + "learning_rate": 7.761934999711367e-06, + "loss": 0.4186, + "step": 3880 + }, + { + "epoch": 0.22452454474618336, + "grad_norm": 2.7465902804726343, + "learning_rate": 7.75616232754142e-06, + "loss": 0.4355, + "step": 3890 + }, + { + "epoch": 0.22510172866583936, + "grad_norm": 2.59329896630652, + "learning_rate": 7.750389655371471e-06, + "loss": 0.4139, + "step": 3900 + }, + { + "epoch": 0.22567891258549536, + "grad_norm": 2.8540002686300214, + "learning_rate": 7.744616983201524e-06, + "loss": 0.4062, + "step": 3910 + }, + { + "epoch": 0.22625609650515136, + "grad_norm": 6.48349060337823, + "learning_rate": 7.738844311031577e-06, + "loss": 0.4002, + "step": 3920 + }, + { + "epoch": 0.22683328042480735, + "grad_norm": 4.668616924928805, + "learning_rate": 7.73307163886163e-06, + "loss": 0.4041, + "step": 3930 + }, + { + "epoch": 0.22741046434446335, + "grad_norm": 6.539752215401261, + "learning_rate": 7.727298966691682e-06, + "loss": 0.3994, + "step": 3940 + }, + { + "epoch": 0.22798764826411935, + "grad_norm": 3.3595541678536156, + "learning_rate": 7.721526294521735e-06, + "loss": 0.3995, + "step": 3950 + }, + { + "epoch": 0.22856483218377535, + "grad_norm": 2.8441727594832886, + "learning_rate": 7.715753622351788e-06, + "loss": 0.4, + "step": 3960 + }, + { + "epoch": 0.22914201610343135, + "grad_norm": 2.849353128300574, + "learning_rate": 7.70998095018184e-06, + "loss": 0.4048, + "step": 3970 + }, + { + "epoch": 0.22971920002308735, + "grad_norm": 4.881214366450045, + "learning_rate": 7.704208278011892e-06, + "loss": 0.4054, + "step": 3980 + }, + { + "epoch": 0.23029638394274335, + "grad_norm": 15.46486836243805, + "learning_rate": 7.698435605841945e-06, + "loss": 0.3923, + "step": 3990 + }, + { + "epoch": 0.23087356786239935, + "grad_norm": 3.847585477525563, + "learning_rate": 7.692662933671996e-06, + "loss": 0.393, + "step": 4000 + }, + { + "epoch": 0.23145075178205535, + "grad_norm": 38.3399434241166, + "learning_rate": 7.686890261502051e-06, + "loss": 0.3926, + "step": 4010 + }, + { + "epoch": 0.23202793570171135, + "grad_norm": 2.6580858390650466, + "learning_rate": 7.681117589332102e-06, + "loss": 0.3948, + "step": 4020 + }, + { + "epoch": 0.23260511962136735, + "grad_norm": 4.466615319467433, + "learning_rate": 7.675344917162155e-06, + "loss": 0.402, + "step": 4030 + }, + { + "epoch": 0.23318230354102334, + "grad_norm": 6.7414838396917505, + "learning_rate": 7.669572244992207e-06, + "loss": 0.3904, + "step": 4040 + }, + { + "epoch": 0.23375948746067934, + "grad_norm": 4.03997703346681, + "learning_rate": 7.66379957282226e-06, + "loss": 0.3938, + "step": 4050 + }, + { + "epoch": 0.23433667138033534, + "grad_norm": 4.258742526842458, + "learning_rate": 7.658026900652313e-06, + "loss": 0.3998, + "step": 4060 + }, + { + "epoch": 0.23491385529999134, + "grad_norm": 5.100923170543839, + "learning_rate": 7.652254228482366e-06, + "loss": 0.3968, + "step": 4070 + }, + { + "epoch": 0.23549103921964734, + "grad_norm": 8.332021271422962, + "learning_rate": 7.646481556312417e-06, + "loss": 0.4017, + "step": 4080 + }, + { + "epoch": 0.23606822313930334, + "grad_norm": 5.001487618559788, + "learning_rate": 7.64070888414247e-06, + "loss": 0.3929, + "step": 4090 + }, + { + "epoch": 0.23664540705895934, + "grad_norm": 6.605470741420995, + "learning_rate": 7.634936211972523e-06, + "loss": 0.3995, + "step": 4100 + }, + { + "epoch": 0.23722259097861534, + "grad_norm": 4.352594377363156, + "learning_rate": 7.629163539802575e-06, + "loss": 0.4008, + "step": 4110 + }, + { + "epoch": 0.23779977489827134, + "grad_norm": 8.143604964743357, + "learning_rate": 7.6233908676326275e-06, + "loss": 0.3987, + "step": 4120 + }, + { + "epoch": 0.23837695881792734, + "grad_norm": 3.9869800783007427, + "learning_rate": 7.6176181954626805e-06, + "loss": 0.3874, + "step": 4130 + }, + { + "epoch": 0.23895414273758334, + "grad_norm": 3.4272782347037207, + "learning_rate": 7.611845523292733e-06, + "loss": 0.3869, + "step": 4140 + }, + { + "epoch": 0.23953132665723934, + "grad_norm": 2.964033904850173, + "learning_rate": 7.606072851122786e-06, + "loss": 0.3962, + "step": 4150 + }, + { + "epoch": 0.24010851057689533, + "grad_norm": 55.68370603421082, + "learning_rate": 7.600300178952838e-06, + "loss": 0.3981, + "step": 4160 + }, + { + "epoch": 0.24068569449655133, + "grad_norm": 4.402682573412378, + "learning_rate": 7.594527506782891e-06, + "loss": 0.4152, + "step": 4170 + }, + { + "epoch": 0.24126287841620733, + "grad_norm": 2.8760794787596997, + "learning_rate": 7.588754834612942e-06, + "loss": 0.3881, + "step": 4180 + }, + { + "epoch": 0.24184006233586333, + "grad_norm": 2.0239240283122575, + "learning_rate": 7.582982162442995e-06, + "loss": 0.4023, + "step": 4190 + }, + { + "epoch": 0.24241724625551933, + "grad_norm": 2.5930742840295986, + "learning_rate": 7.577209490273047e-06, + "loss": 0.4146, + "step": 4200 + }, + { + "epoch": 0.24299443017517533, + "grad_norm": 12.25806910583576, + "learning_rate": 7.5714368181031e-06, + "loss": 0.4268, + "step": 4210 + }, + { + "epoch": 0.24357161409483133, + "grad_norm": 9.121606156996025, + "learning_rate": 7.565664145933153e-06, + "loss": 0.3926, + "step": 4220 + }, + { + "epoch": 0.24414879801448733, + "grad_norm": 9.12741140098973, + "learning_rate": 7.559891473763206e-06, + "loss": 0.3908, + "step": 4230 + }, + { + "epoch": 0.24472598193414333, + "grad_norm": 2.988966863298224, + "learning_rate": 7.554118801593258e-06, + "loss": 0.4077, + "step": 4240 + }, + { + "epoch": 0.24530316585379933, + "grad_norm": 4.755614024652895, + "learning_rate": 7.548346129423311e-06, + "loss": 0.3923, + "step": 4250 + }, + { + "epoch": 0.2458803497734553, + "grad_norm": 3.286123151497483, + "learning_rate": 7.542573457253363e-06, + "loss": 0.3842, + "step": 4260 + }, + { + "epoch": 0.2464575336931113, + "grad_norm": 24.75179268960897, + "learning_rate": 7.536800785083416e-06, + "loss": 0.4348, + "step": 4270 + }, + { + "epoch": 0.2470347176127673, + "grad_norm": 2.1639276020923064, + "learning_rate": 7.531028112913469e-06, + "loss": 0.4064, + "step": 4280 + }, + { + "epoch": 0.2476119015324233, + "grad_norm": 1.9508786971501029, + "learning_rate": 7.525255440743521e-06, + "loss": 0.4085, + "step": 4290 + }, + { + "epoch": 0.2481890854520793, + "grad_norm": 1.8580731020746826, + "learning_rate": 7.519482768573574e-06, + "loss": 0.4111, + "step": 4300 + }, + { + "epoch": 0.2487662693717353, + "grad_norm": 2.6928452549457407, + "learning_rate": 7.5137100964036255e-06, + "loss": 0.4115, + "step": 4310 + }, + { + "epoch": 0.2493434532913913, + "grad_norm": 2.054083128359146, + "learning_rate": 7.5079374242336786e-06, + "loss": 0.4121, + "step": 4320 + }, + { + "epoch": 0.2499206372110473, + "grad_norm": 2.6316000209561663, + "learning_rate": 7.502164752063731e-06, + "loss": 0.4186, + "step": 4330 + }, + { + "epoch": 0.2504978211307033, + "grad_norm": 1.8771833797620388, + "learning_rate": 7.496392079893784e-06, + "loss": 0.4001, + "step": 4340 + }, + { + "epoch": 0.2510750050503593, + "grad_norm": 9.221358255158016, + "learning_rate": 7.490619407723836e-06, + "loss": 0.4177, + "step": 4350 + }, + { + "epoch": 0.2516521889700153, + "grad_norm": 1.8539444504213582, + "learning_rate": 7.484846735553889e-06, + "loss": 0.4055, + "step": 4360 + }, + { + "epoch": 0.2522293728896713, + "grad_norm": 2.284022523834839, + "learning_rate": 7.479074063383941e-06, + "loss": 0.4228, + "step": 4370 + }, + { + "epoch": 0.2528065568093273, + "grad_norm": 1.8613795057409426, + "learning_rate": 7.473301391213994e-06, + "loss": 0.3886, + "step": 4380 + }, + { + "epoch": 0.2533837407289833, + "grad_norm": 3.7779942998503855, + "learning_rate": 7.467528719044046e-06, + "loss": 0.4158, + "step": 4390 + }, + { + "epoch": 0.2539609246486393, + "grad_norm": 3.7840065627156365, + "learning_rate": 7.461756046874099e-06, + "loss": 0.4187, + "step": 4400 + }, + { + "epoch": 0.2545381085682953, + "grad_norm": 5.099586666363089, + "learning_rate": 7.455983374704151e-06, + "loss": 0.4098, + "step": 4410 + }, + { + "epoch": 0.2551152924879513, + "grad_norm": 2.6870891278248337, + "learning_rate": 7.450210702534204e-06, + "loss": 0.3986, + "step": 4420 + }, + { + "epoch": 0.2556924764076073, + "grad_norm": 2.7248868412027583, + "learning_rate": 7.444438030364256e-06, + "loss": 0.4038, + "step": 4430 + }, + { + "epoch": 0.2562696603272633, + "grad_norm": 3.714403433710303, + "learning_rate": 7.438665358194309e-06, + "loss": 0.4045, + "step": 4440 + }, + { + "epoch": 0.2568468442469193, + "grad_norm": 2.9240340589059644, + "learning_rate": 7.432892686024361e-06, + "loss": 0.3911, + "step": 4450 + }, + { + "epoch": 0.2574240281665753, + "grad_norm": 4.331854988527969, + "learning_rate": 7.427120013854414e-06, + "loss": 0.394, + "step": 4460 + }, + { + "epoch": 0.2580012120862313, + "grad_norm": 6.84340943547103, + "learning_rate": 7.421347341684466e-06, + "loss": 0.3884, + "step": 4470 + }, + { + "epoch": 0.2585783960058873, + "grad_norm": 26.71610826157837, + "learning_rate": 7.415574669514519e-06, + "loss": 0.4126, + "step": 4480 + }, + { + "epoch": 0.2591555799255433, + "grad_norm": 3.691538028091923, + "learning_rate": 7.4098019973445714e-06, + "loss": 0.3997, + "step": 4490 + }, + { + "epoch": 0.2597327638451993, + "grad_norm": 3.6466304992174527, + "learning_rate": 7.4040293251746245e-06, + "loss": 0.387, + "step": 4500 + }, + { + "epoch": 0.2603099477648553, + "grad_norm": 3.069337367045407, + "learning_rate": 7.398256653004677e-06, + "loss": 0.3925, + "step": 4510 + }, + { + "epoch": 0.2608871316845113, + "grad_norm": 17.7941278304272, + "learning_rate": 7.39248398083473e-06, + "loss": 0.3904, + "step": 4520 + }, + { + "epoch": 0.2614643156041673, + "grad_norm": 4.010302222583594, + "learning_rate": 7.386711308664781e-06, + "loss": 0.4, + "step": 4530 + }, + { + "epoch": 0.2620414995238233, + "grad_norm": 2.891669458141575, + "learning_rate": 7.380938636494834e-06, + "loss": 0.3916, + "step": 4540 + }, + { + "epoch": 0.2626186834434793, + "grad_norm": 3.5781925847929736, + "learning_rate": 7.375165964324886e-06, + "loss": 0.3901, + "step": 4550 + }, + { + "epoch": 0.26319586736313527, + "grad_norm": 2.8217409668695814, + "learning_rate": 7.369393292154939e-06, + "loss": 0.41, + "step": 4560 + }, + { + "epoch": 0.26377305128279127, + "grad_norm": 4.326569507185014, + "learning_rate": 7.363620619984991e-06, + "loss": 0.4091, + "step": 4570 + }, + { + "epoch": 0.26435023520244727, + "grad_norm": 5.515247686980751, + "learning_rate": 7.357847947815044e-06, + "loss": 0.4203, + "step": 4580 + }, + { + "epoch": 0.26492741912210327, + "grad_norm": 6.561612435080219, + "learning_rate": 7.3520752756450966e-06, + "loss": 0.3951, + "step": 4590 + }, + { + "epoch": 0.26550460304175927, + "grad_norm": 2.241772546310698, + "learning_rate": 7.34630260347515e-06, + "loss": 0.3985, + "step": 4600 + }, + { + "epoch": 0.26608178696141527, + "grad_norm": 2.524827575292416, + "learning_rate": 7.340529931305202e-06, + "loss": 0.3981, + "step": 4610 + }, + { + "epoch": 0.26665897088107127, + "grad_norm": 2.4686565154848106, + "learning_rate": 7.334757259135255e-06, + "loss": 0.3948, + "step": 4620 + }, + { + "epoch": 0.26723615480072727, + "grad_norm": 5.055286394352697, + "learning_rate": 7.328984586965306e-06, + "loss": 0.3966, + "step": 4630 + }, + { + "epoch": 0.26781333872038326, + "grad_norm": 2.4105713306719023, + "learning_rate": 7.323211914795359e-06, + "loss": 0.3896, + "step": 4640 + }, + { + "epoch": 0.26839052264003926, + "grad_norm": 8.884358381031186, + "learning_rate": 7.317439242625411e-06, + "loss": 0.4043, + "step": 4650 + }, + { + "epoch": 0.26896770655969526, + "grad_norm": 3.3786577911171465, + "learning_rate": 7.311666570455464e-06, + "loss": 0.4087, + "step": 4660 + }, + { + "epoch": 0.26954489047935126, + "grad_norm": 2.241166004757874, + "learning_rate": 7.3058938982855165e-06, + "loss": 0.3961, + "step": 4670 + }, + { + "epoch": 0.27012207439900726, + "grad_norm": 3.7398767727731195, + "learning_rate": 7.3001212261155695e-06, + "loss": 0.4025, + "step": 4680 + }, + { + "epoch": 0.27069925831866326, + "grad_norm": 2.4627601331961024, + "learning_rate": 7.294348553945622e-06, + "loss": 0.3967, + "step": 4690 + }, + { + "epoch": 0.27127644223831926, + "grad_norm": 2.9400014965222243, + "learning_rate": 7.288575881775675e-06, + "loss": 0.398, + "step": 4700 + }, + { + "epoch": 0.27185362615797526, + "grad_norm": 2.371642161881622, + "learning_rate": 7.282803209605727e-06, + "loss": 0.3878, + "step": 4710 + }, + { + "epoch": 0.27243081007763126, + "grad_norm": 2.1217448647861943, + "learning_rate": 7.27703053743578e-06, + "loss": 0.3961, + "step": 4720 + }, + { + "epoch": 0.27300799399728726, + "grad_norm": 6.480519075871927, + "learning_rate": 7.271257865265832e-06, + "loss": 0.3904, + "step": 4730 + }, + { + "epoch": 0.27358517791694326, + "grad_norm": 3.7782562668503292, + "learning_rate": 7.265485193095885e-06, + "loss": 0.4015, + "step": 4740 + }, + { + "epoch": 0.27416236183659926, + "grad_norm": 27.34719190524476, + "learning_rate": 7.259712520925936e-06, + "loss": 0.3995, + "step": 4750 + }, + { + "epoch": 0.27473954575625525, + "grad_norm": 2.5984426690848044, + "learning_rate": 7.2539398487559894e-06, + "loss": 0.4091, + "step": 4760 + }, + { + "epoch": 0.27531672967591125, + "grad_norm": 2.269931164816007, + "learning_rate": 7.248167176586042e-06, + "loss": 0.391, + "step": 4770 + }, + { + "epoch": 0.27589391359556725, + "grad_norm": 2.0949435472109443, + "learning_rate": 7.242394504416095e-06, + "loss": 0.3867, + "step": 4780 + }, + { + "epoch": 0.27647109751522325, + "grad_norm": 2.1688865736563794, + "learning_rate": 7.236621832246147e-06, + "loss": 0.3819, + "step": 4790 + }, + { + "epoch": 0.27704828143487925, + "grad_norm": 3.6275115123885744, + "learning_rate": 7.2308491600762e-06, + "loss": 0.403, + "step": 4800 + }, + { + "epoch": 0.27762546535453525, + "grad_norm": 2.7044630613298204, + "learning_rate": 7.225076487906253e-06, + "loss": 0.401, + "step": 4810 + }, + { + "epoch": 0.2782026492741912, + "grad_norm": 3.6256795573786853, + "learning_rate": 7.219303815736305e-06, + "loss": 0.4008, + "step": 4820 + }, + { + "epoch": 0.2787798331938472, + "grad_norm": 2.3560879474365595, + "learning_rate": 7.213531143566358e-06, + "loss": 0.382, + "step": 4830 + }, + { + "epoch": 0.2793570171135032, + "grad_norm": 6.363609259389832, + "learning_rate": 7.20775847139641e-06, + "loss": 0.3889, + "step": 4840 + }, + { + "epoch": 0.2799342010331592, + "grad_norm": 2.447343796783594, + "learning_rate": 7.201985799226463e-06, + "loss": 0.3819, + "step": 4850 + }, + { + "epoch": 0.2805113849528152, + "grad_norm": 4.454942195776334, + "learning_rate": 7.1962131270565146e-06, + "loss": 0.4029, + "step": 4860 + }, + { + "epoch": 0.2810885688724712, + "grad_norm": 4.7341175135353675, + "learning_rate": 7.1904404548865684e-06, + "loss": 0.3962, + "step": 4870 + }, + { + "epoch": 0.2816657527921272, + "grad_norm": 1.8389081163765115, + "learning_rate": 7.18466778271662e-06, + "loss": 0.3962, + "step": 4880 + }, + { + "epoch": 0.2822429367117832, + "grad_norm": 3.2296499793817612, + "learning_rate": 7.178895110546673e-06, + "loss": 0.3942, + "step": 4890 + }, + { + "epoch": 0.2828201206314392, + "grad_norm": 3.947947823306894, + "learning_rate": 7.173122438376725e-06, + "loss": 0.3976, + "step": 4900 + }, + { + "epoch": 0.2833973045510952, + "grad_norm": 3.562193655967395, + "learning_rate": 7.167349766206778e-06, + "loss": 0.3876, + "step": 4910 + }, + { + "epoch": 0.2839744884707512, + "grad_norm": 2.0441973593521086, + "learning_rate": 7.16157709403683e-06, + "loss": 0.404, + "step": 4920 + }, + { + "epoch": 0.2845516723904072, + "grad_norm": 2.3917393046670075, + "learning_rate": 7.155804421866883e-06, + "loss": 0.4169, + "step": 4930 + }, + { + "epoch": 0.2851288563100632, + "grad_norm": 2.328855705516118, + "learning_rate": 7.150031749696935e-06, + "loss": 0.3886, + "step": 4940 + }, + { + "epoch": 0.2857060402297192, + "grad_norm": 5.150276479313902, + "learning_rate": 7.144259077526988e-06, + "loss": 0.401, + "step": 4950 + }, + { + "epoch": 0.2862832241493752, + "grad_norm": 2.4042125393360907, + "learning_rate": 7.1384864053570405e-06, + "loss": 0.4003, + "step": 4960 + }, + { + "epoch": 0.2868604080690312, + "grad_norm": 2.8838052574781257, + "learning_rate": 7.1327137331870936e-06, + "loss": 0.3992, + "step": 4970 + }, + { + "epoch": 0.2874375919886872, + "grad_norm": 3.3196900990562646, + "learning_rate": 7.126941061017145e-06, + "loss": 0.4025, + "step": 4980 + }, + { + "epoch": 0.2880147759083432, + "grad_norm": 5.299768426314854, + "learning_rate": 7.121168388847198e-06, + "loss": 0.3994, + "step": 4990 + }, + { + "epoch": 0.2885919598279992, + "grad_norm": 27.899807155688983, + "learning_rate": 7.11539571667725e-06, + "loss": 0.402, + "step": 5000 + }, + { + "epoch": 0.2891691437476552, + "grad_norm": 2.2901385928116484, + "learning_rate": 7.109623044507303e-06, + "loss": 0.3936, + "step": 5010 + }, + { + "epoch": 0.2897463276673112, + "grad_norm": 5.293699045227353, + "learning_rate": 7.103850372337355e-06, + "loss": 0.3977, + "step": 5020 + }, + { + "epoch": 0.2903235115869672, + "grad_norm": 2.2638394900698624, + "learning_rate": 7.098077700167408e-06, + "loss": 0.3932, + "step": 5030 + }, + { + "epoch": 0.2909006955066232, + "grad_norm": 2.2021953017040317, + "learning_rate": 7.0923050279974605e-06, + "loss": 0.4063, + "step": 5040 + }, + { + "epoch": 0.2914778794262792, + "grad_norm": 3.0086020205487363, + "learning_rate": 7.0865323558275135e-06, + "loss": 0.3948, + "step": 5050 + }, + { + "epoch": 0.2920550633459352, + "grad_norm": 2.1426519571708615, + "learning_rate": 7.080759683657566e-06, + "loss": 0.3967, + "step": 5060 + }, + { + "epoch": 0.29263224726559117, + "grad_norm": 2.396350457849943, + "learning_rate": 7.074987011487619e-06, + "loss": 0.3699, + "step": 5070 + }, + { + "epoch": 0.29320943118524717, + "grad_norm": 48.09761207732283, + "learning_rate": 7.06921433931767e-06, + "loss": 0.3783, + "step": 5080 + }, + { + "epoch": 0.29378661510490317, + "grad_norm": 1.9858233601346664, + "learning_rate": 7.063441667147724e-06, + "loss": 0.3885, + "step": 5090 + }, + { + "epoch": 0.29436379902455917, + "grad_norm": 1.8720378976655367, + "learning_rate": 7.057668994977775e-06, + "loss": 0.408, + "step": 5100 + }, + { + "epoch": 0.29494098294421517, + "grad_norm": 1.9939149039664568, + "learning_rate": 7.051896322807828e-06, + "loss": 0.3954, + "step": 5110 + }, + { + "epoch": 0.29551816686387117, + "grad_norm": 1.654582540560213, + "learning_rate": 7.04612365063788e-06, + "loss": 0.3914, + "step": 5120 + }, + { + "epoch": 0.29609535078352717, + "grad_norm": 3.06484715205326, + "learning_rate": 7.040350978467933e-06, + "loss": 0.3941, + "step": 5130 + }, + { + "epoch": 0.29667253470318317, + "grad_norm": 4.663889722159032, + "learning_rate": 7.034578306297986e-06, + "loss": 0.3864, + "step": 5140 + }, + { + "epoch": 0.29724971862283917, + "grad_norm": 7.729000917336516, + "learning_rate": 7.028805634128039e-06, + "loss": 0.383, + "step": 5150 + }, + { + "epoch": 0.29782690254249516, + "grad_norm": 3.2930128906393357, + "learning_rate": 7.023032961958091e-06, + "loss": 0.4141, + "step": 5160 + }, + { + "epoch": 0.29840408646215116, + "grad_norm": 2.410906648107205, + "learning_rate": 7.017260289788144e-06, + "loss": 0.3954, + "step": 5170 + }, + { + "epoch": 0.29898127038180716, + "grad_norm": 2.334082494973446, + "learning_rate": 7.011487617618196e-06, + "loss": 0.3946, + "step": 5180 + }, + { + "epoch": 0.29955845430146316, + "grad_norm": 2.5351433587502568, + "learning_rate": 7.005714945448249e-06, + "loss": 0.3793, + "step": 5190 + }, + { + "epoch": 0.30013563822111916, + "grad_norm": 4.293591510370919, + "learning_rate": 6.9999422732783e-06, + "loss": 0.3932, + "step": 5200 + }, + { + "epoch": 0.30071282214077516, + "grad_norm": 2.7812977150432032, + "learning_rate": 6.994169601108353e-06, + "loss": 0.4031, + "step": 5210 + }, + { + "epoch": 0.30129000606043116, + "grad_norm": 2.6445327211143392, + "learning_rate": 6.9883969289384055e-06, + "loss": 0.3971, + "step": 5220 + }, + { + "epoch": 0.30186718998008716, + "grad_norm": 3.667055707656093, + "learning_rate": 6.9826242567684585e-06, + "loss": 0.3863, + "step": 5230 + }, + { + "epoch": 0.30244437389974316, + "grad_norm": 7.251793531417223, + "learning_rate": 6.976851584598511e-06, + "loss": 0.3957, + "step": 5240 + }, + { + "epoch": 0.30302155781939916, + "grad_norm": 2.3512832022952415, + "learning_rate": 6.971078912428564e-06, + "loss": 0.3795, + "step": 5250 + }, + { + "epoch": 0.30359874173905516, + "grad_norm": 3.8467854833379103, + "learning_rate": 6.965306240258616e-06, + "loss": 0.4022, + "step": 5260 + }, + { + "epoch": 0.30417592565871115, + "grad_norm": 3.452869510119873, + "learning_rate": 6.959533568088669e-06, + "loss": 0.367, + "step": 5270 + }, + { + "epoch": 0.30475310957836715, + "grad_norm": 1.7928454544936765, + "learning_rate": 6.953760895918721e-06, + "loss": 0.3835, + "step": 5280 + }, + { + "epoch": 0.30533029349802315, + "grad_norm": 3.3141188092724057, + "learning_rate": 6.947988223748774e-06, + "loss": 0.3841, + "step": 5290 + }, + { + "epoch": 0.30590747741767915, + "grad_norm": 7.1200801655892905, + "learning_rate": 6.942215551578826e-06, + "loss": 0.3851, + "step": 5300 + }, + { + "epoch": 0.30648466133733515, + "grad_norm": 2.6078657985349563, + "learning_rate": 6.936442879408879e-06, + "loss": 0.3851, + "step": 5310 + }, + { + "epoch": 0.30706184525699115, + "grad_norm": 3.8742908966217873, + "learning_rate": 6.930670207238931e-06, + "loss": 0.3657, + "step": 5320 + }, + { + "epoch": 0.30763902917664715, + "grad_norm": 2.0698861633639885, + "learning_rate": 6.924897535068984e-06, + "loss": 0.3911, + "step": 5330 + }, + { + "epoch": 0.30821621309630315, + "grad_norm": 2.5491051422292412, + "learning_rate": 6.919124862899037e-06, + "loss": 0.3765, + "step": 5340 + }, + { + "epoch": 0.30879339701595915, + "grad_norm": 3.734949482545124, + "learning_rate": 6.913352190729089e-06, + "loss": 0.3895, + "step": 5350 + }, + { + "epoch": 0.30937058093561515, + "grad_norm": 3.6071556886180356, + "learning_rate": 6.907579518559142e-06, + "loss": 0.3855, + "step": 5360 + }, + { + "epoch": 0.30994776485527115, + "grad_norm": 2.048967073465003, + "learning_rate": 6.901806846389194e-06, + "loss": 0.3955, + "step": 5370 + }, + { + "epoch": 0.31052494877492715, + "grad_norm": 3.6739243647918016, + "learning_rate": 6.896034174219247e-06, + "loss": 0.3886, + "step": 5380 + }, + { + "epoch": 0.31110213269458314, + "grad_norm": 6.86781835267949, + "learning_rate": 6.890261502049299e-06, + "loss": 0.3865, + "step": 5390 + }, + { + "epoch": 0.31167931661423914, + "grad_norm": 2.3848433309003445, + "learning_rate": 6.884488829879352e-06, + "loss": 0.382, + "step": 5400 + }, + { + "epoch": 0.31225650053389514, + "grad_norm": 4.973233732358959, + "learning_rate": 6.8787161577094044e-06, + "loss": 0.3902, + "step": 5410 + }, + { + "epoch": 0.31283368445355114, + "grad_norm": 4.047034417439155, + "learning_rate": 6.8729434855394575e-06, + "loss": 0.3848, + "step": 5420 + }, + { + "epoch": 0.31341086837320714, + "grad_norm": 2.0502326573281464, + "learning_rate": 6.867170813369509e-06, + "loss": 0.3775, + "step": 5430 + }, + { + "epoch": 0.31398805229286314, + "grad_norm": 2.3429460144586747, + "learning_rate": 6.861398141199563e-06, + "loss": 0.3854, + "step": 5440 + }, + { + "epoch": 0.31456523621251914, + "grad_norm": 2.366729120218485, + "learning_rate": 6.855625469029614e-06, + "loss": 0.392, + "step": 5450 + }, + { + "epoch": 0.31514242013217514, + "grad_norm": 2.846824703607735, + "learning_rate": 6.849852796859667e-06, + "loss": 0.3899, + "step": 5460 + }, + { + "epoch": 0.31571960405183114, + "grad_norm": 4.059470208038008, + "learning_rate": 6.844080124689719e-06, + "loss": 0.3824, + "step": 5470 + }, + { + "epoch": 0.31629678797148714, + "grad_norm": 3.060450364883157, + "learning_rate": 6.838307452519772e-06, + "loss": 0.386, + "step": 5480 + }, + { + "epoch": 0.31687397189114314, + "grad_norm": 6.0419702046843735, + "learning_rate": 6.832534780349824e-06, + "loss": 0.3989, + "step": 5490 + }, + { + "epoch": 0.31745115581079913, + "grad_norm": 148.73139965795136, + "learning_rate": 6.826762108179877e-06, + "loss": 0.3972, + "step": 5500 + }, + { + "epoch": 0.31802833973045513, + "grad_norm": 4.457525994355414, + "learning_rate": 6.8209894360099296e-06, + "loss": 0.3917, + "step": 5510 + }, + { + "epoch": 0.31860552365011113, + "grad_norm": 2.172441867365423, + "learning_rate": 6.815216763839983e-06, + "loss": 0.3877, + "step": 5520 + }, + { + "epoch": 0.31918270756976713, + "grad_norm": 3.287076044270403, + "learning_rate": 6.809444091670035e-06, + "loss": 0.3697, + "step": 5530 + }, + { + "epoch": 0.31975989148942313, + "grad_norm": 2.0889311701748747, + "learning_rate": 6.803671419500088e-06, + "loss": 0.3744, + "step": 5540 + }, + { + "epoch": 0.32033707540907913, + "grad_norm": 3.2176355512083616, + "learning_rate": 6.797898747330139e-06, + "loss": 0.3936, + "step": 5550 + }, + { + "epoch": 0.3209142593287351, + "grad_norm": 2.16842859391982, + "learning_rate": 6.792126075160192e-06, + "loss": 0.3945, + "step": 5560 + }, + { + "epoch": 0.3214914432483911, + "grad_norm": 3.8670603387604134, + "learning_rate": 6.786353402990244e-06, + "loss": 0.3883, + "step": 5570 + }, + { + "epoch": 0.3220686271680471, + "grad_norm": 16.096137517550215, + "learning_rate": 6.780580730820297e-06, + "loss": 0.3982, + "step": 5580 + }, + { + "epoch": 0.32264581108770307, + "grad_norm": 3.253563357226522, + "learning_rate": 6.7748080586503495e-06, + "loss": 0.3971, + "step": 5590 + }, + { + "epoch": 0.32322299500735907, + "grad_norm": 5.849409027392066, + "learning_rate": 6.7690353864804025e-06, + "loss": 0.3966, + "step": 5600 + }, + { + "epoch": 0.32380017892701507, + "grad_norm": 4.446139033898797, + "learning_rate": 6.763262714310455e-06, + "loss": 0.3871, + "step": 5610 + }, + { + "epoch": 0.32437736284667107, + "grad_norm": 2.4319593332542953, + "learning_rate": 6.757490042140508e-06, + "loss": 0.381, + "step": 5620 + }, + { + "epoch": 0.32495454676632707, + "grad_norm": 4.734141112466146, + "learning_rate": 6.75171736997056e-06, + "loss": 0.3688, + "step": 5630 + }, + { + "epoch": 0.32553173068598307, + "grad_norm": 3.0227214340364386, + "learning_rate": 6.745944697800613e-06, + "loss": 0.3939, + "step": 5640 + }, + { + "epoch": 0.32610891460563907, + "grad_norm": 6.9528804097069274, + "learning_rate": 6.740172025630664e-06, + "loss": 0.3859, + "step": 5650 + }, + { + "epoch": 0.32668609852529507, + "grad_norm": 2.4438240264660527, + "learning_rate": 6.734399353460718e-06, + "loss": 0.3929, + "step": 5660 + }, + { + "epoch": 0.32726328244495106, + "grad_norm": 7.001401722934106, + "learning_rate": 6.728626681290769e-06, + "loss": 0.3878, + "step": 5670 + }, + { + "epoch": 0.32784046636460706, + "grad_norm": 4.631134068889566, + "learning_rate": 6.7228540091208224e-06, + "loss": 0.3886, + "step": 5680 + }, + { + "epoch": 0.32841765028426306, + "grad_norm": 3.0416584434332274, + "learning_rate": 6.717081336950875e-06, + "loss": 0.3732, + "step": 5690 + }, + { + "epoch": 0.32899483420391906, + "grad_norm": 3.189118603447828, + "learning_rate": 6.711308664780928e-06, + "loss": 0.3788, + "step": 5700 + }, + { + "epoch": 0.32957201812357506, + "grad_norm": 4.412271751435158, + "learning_rate": 6.70553599261098e-06, + "loss": 0.3903, + "step": 5710 + }, + { + "epoch": 0.33014920204323106, + "grad_norm": 5.781124605717443, + "learning_rate": 6.699763320441033e-06, + "loss": 0.3742, + "step": 5720 + }, + { + "epoch": 0.33072638596288706, + "grad_norm": 2.38240681303682, + "learning_rate": 6.693990648271085e-06, + "loss": 0.3846, + "step": 5730 + }, + { + "epoch": 0.33130356988254306, + "grad_norm": 5.627940078972001, + "learning_rate": 6.688217976101138e-06, + "loss": 0.383, + "step": 5740 + }, + { + "epoch": 0.33188075380219906, + "grad_norm": 2.5562400913295695, + "learning_rate": 6.68244530393119e-06, + "loss": 0.388, + "step": 5750 + }, + { + "epoch": 0.33245793772185506, + "grad_norm": 2.009018555010131, + "learning_rate": 6.676672631761243e-06, + "loss": 0.3863, + "step": 5760 + }, + { + "epoch": 0.33303512164151106, + "grad_norm": 2.6584190178994223, + "learning_rate": 6.6708999595912945e-06, + "loss": 0.382, + "step": 5770 + }, + { + "epoch": 0.33361230556116706, + "grad_norm": 1.6637756869209672, + "learning_rate": 6.6651272874213476e-06, + "loss": 0.384, + "step": 5780 + }, + { + "epoch": 0.33418948948082305, + "grad_norm": 2.3195781804624174, + "learning_rate": 6.6593546152514e-06, + "loss": 0.3766, + "step": 5790 + }, + { + "epoch": 0.33476667340047905, + "grad_norm": 3.760084084073311, + "learning_rate": 6.653581943081453e-06, + "loss": 0.4035, + "step": 5800 + }, + { + "epoch": 0.33534385732013505, + "grad_norm": 2.1527295119213607, + "learning_rate": 6.647809270911505e-06, + "loss": 0.3746, + "step": 5810 + }, + { + "epoch": 0.33592104123979105, + "grad_norm": 2.50518271064483, + "learning_rate": 6.642036598741558e-06, + "loss": 0.3788, + "step": 5820 + }, + { + "epoch": 0.33649822515944705, + "grad_norm": 3.4692947058918895, + "learning_rate": 6.63626392657161e-06, + "loss": 0.3893, + "step": 5830 + }, + { + "epoch": 0.33707540907910305, + "grad_norm": 2.512775623033029, + "learning_rate": 6.630491254401663e-06, + "loss": 0.3719, + "step": 5840 + }, + { + "epoch": 0.33765259299875905, + "grad_norm": 1.9666671304858914, + "learning_rate": 6.624718582231716e-06, + "loss": 0.3785, + "step": 5850 + }, + { + "epoch": 0.33822977691841505, + "grad_norm": 2.724605715859374, + "learning_rate": 6.618945910061768e-06, + "loss": 0.4004, + "step": 5860 + }, + { + "epoch": 0.33880696083807105, + "grad_norm": 2.7489712656132044, + "learning_rate": 6.613173237891821e-06, + "loss": 0.3771, + "step": 5870 + }, + { + "epoch": 0.33938414475772705, + "grad_norm": 2.599485813176942, + "learning_rate": 6.6074005657218735e-06, + "loss": 0.4003, + "step": 5880 + }, + { + "epoch": 0.33996132867738305, + "grad_norm": 2.746831225729797, + "learning_rate": 6.6016278935519266e-06, + "loss": 0.3699, + "step": 5890 + }, + { + "epoch": 0.34053851259703904, + "grad_norm": 10.192368895005096, + "learning_rate": 6.595855221381978e-06, + "loss": 0.3825, + "step": 5900 + }, + { + "epoch": 0.34111569651669504, + "grad_norm": 2.3362074486231785, + "learning_rate": 6.590082549212031e-06, + "loss": 0.3816, + "step": 5910 + }, + { + "epoch": 0.34169288043635104, + "grad_norm": 3.2505345689597194, + "learning_rate": 6.584309877042083e-06, + "loss": 0.4147, + "step": 5920 + }, + { + "epoch": 0.34227006435600704, + "grad_norm": 2.771813125028045, + "learning_rate": 6.578537204872136e-06, + "loss": 0.3822, + "step": 5930 + }, + { + "epoch": 0.34284724827566304, + "grad_norm": 2.600143172725041, + "learning_rate": 6.572764532702188e-06, + "loss": 0.3718, + "step": 5940 + }, + { + "epoch": 0.34342443219531904, + "grad_norm": 3.904138501463535, + "learning_rate": 6.566991860532241e-06, + "loss": 0.3886, + "step": 5950 + }, + { + "epoch": 0.34400161611497504, + "grad_norm": 5.0319871188012515, + "learning_rate": 6.5612191883622935e-06, + "loss": 0.3877, + "step": 5960 + }, + { + "epoch": 0.34457880003463104, + "grad_norm": 13.625359588311552, + "learning_rate": 6.5554465161923465e-06, + "loss": 0.3687, + "step": 5970 + }, + { + "epoch": 0.34515598395428704, + "grad_norm": 2.920042053925227, + "learning_rate": 6.549673844022399e-06, + "loss": 0.3859, + "step": 5980 + }, + { + "epoch": 0.34573316787394304, + "grad_norm": 3.5277767201369223, + "learning_rate": 6.543901171852452e-06, + "loss": 0.3789, + "step": 5990 + }, + { + "epoch": 0.34631035179359904, + "grad_norm": 2.7899995578571426, + "learning_rate": 6.538128499682503e-06, + "loss": 0.3668, + "step": 6000 + }, + { + "epoch": 0.34688753571325504, + "grad_norm": 5.64453785605591, + "learning_rate": 6.532355827512557e-06, + "loss": 0.3824, + "step": 6010 + }, + { + "epoch": 0.34746471963291103, + "grad_norm": 3.440174338768187, + "learning_rate": 6.526583155342608e-06, + "loss": 0.3844, + "step": 6020 + }, + { + "epoch": 0.34804190355256703, + "grad_norm": 3.9486486049020635, + "learning_rate": 6.520810483172661e-06, + "loss": 0.3897, + "step": 6030 + }, + { + "epoch": 0.34861908747222303, + "grad_norm": 7.253991235141298, + "learning_rate": 6.515037811002713e-06, + "loss": 0.3677, + "step": 6040 + }, + { + "epoch": 0.34919627139187903, + "grad_norm": 7.656685618930045, + "learning_rate": 6.509265138832766e-06, + "loss": 0.388, + "step": 6050 + }, + { + "epoch": 0.34977345531153503, + "grad_norm": 2.213875264653562, + "learning_rate": 6.503492466662819e-06, + "loss": 0.3851, + "step": 6060 + }, + { + "epoch": 0.35035063923119103, + "grad_norm": 5.474678610165808, + "learning_rate": 6.497719794492872e-06, + "loss": 0.3777, + "step": 6070 + }, + { + "epoch": 0.35092782315084703, + "grad_norm": 4.538506198098333, + "learning_rate": 6.491947122322924e-06, + "loss": 0.3606, + "step": 6080 + }, + { + "epoch": 0.35150500707050303, + "grad_norm": 2.7367624612828627, + "learning_rate": 6.486174450152977e-06, + "loss": 0.3824, + "step": 6090 + }, + { + "epoch": 0.352082190990159, + "grad_norm": 3.9600996597048432, + "learning_rate": 6.480401777983029e-06, + "loss": 0.3865, + "step": 6100 + }, + { + "epoch": 0.352659374909815, + "grad_norm": 6.138903729575434, + "learning_rate": 6.474629105813082e-06, + "loss": 0.3836, + "step": 6110 + }, + { + "epoch": 0.353236558829471, + "grad_norm": 3.1110082739843676, + "learning_rate": 6.468856433643133e-06, + "loss": 0.3941, + "step": 6120 + }, + { + "epoch": 0.353813742749127, + "grad_norm": 3.0125957250660997, + "learning_rate": 6.463083761473186e-06, + "loss": 0.3907, + "step": 6130 + }, + { + "epoch": 0.354390926668783, + "grad_norm": 5.777332948819984, + "learning_rate": 6.4573110893032385e-06, + "loss": 0.3926, + "step": 6140 + }, + { + "epoch": 0.354968110588439, + "grad_norm": 10.69646287431515, + "learning_rate": 6.4515384171332915e-06, + "loss": 0.3892, + "step": 6150 + }, + { + "epoch": 0.355545294508095, + "grad_norm": 5.389336302495197, + "learning_rate": 6.445765744963344e-06, + "loss": 0.3957, + "step": 6160 + }, + { + "epoch": 0.356122478427751, + "grad_norm": 12.802235491479053, + "learning_rate": 6.439993072793397e-06, + "loss": 0.3742, + "step": 6170 + }, + { + "epoch": 0.356699662347407, + "grad_norm": 4.196338750242119, + "learning_rate": 6.434220400623449e-06, + "loss": 0.3878, + "step": 6180 + }, + { + "epoch": 0.357276846267063, + "grad_norm": 3.7684375534000276, + "learning_rate": 6.428447728453502e-06, + "loss": 0.3724, + "step": 6190 + }, + { + "epoch": 0.357854030186719, + "grad_norm": 2.4825477710744446, + "learning_rate": 6.422675056283554e-06, + "loss": 0.3665, + "step": 6200 + }, + { + "epoch": 0.358431214106375, + "grad_norm": 2.5273547043428244, + "learning_rate": 6.416902384113607e-06, + "loss": 0.3682, + "step": 6210 + }, + { + "epoch": 0.359008398026031, + "grad_norm": 3.3691141387535453, + "learning_rate": 6.4111297119436584e-06, + "loss": 0.3884, + "step": 6220 + }, + { + "epoch": 0.359585581945687, + "grad_norm": 3.986041227799815, + "learning_rate": 6.405357039773712e-06, + "loss": 0.3793, + "step": 6230 + }, + { + "epoch": 0.360162765865343, + "grad_norm": 4.388692532796717, + "learning_rate": 6.399584367603764e-06, + "loss": 0.3826, + "step": 6240 + }, + { + "epoch": 0.360739949784999, + "grad_norm": 5.61124433419293, + "learning_rate": 6.393811695433817e-06, + "loss": 0.3725, + "step": 6250 + }, + { + "epoch": 0.361317133704655, + "grad_norm": 5.79217310796387, + "learning_rate": 6.388039023263869e-06, + "loss": 0.3803, + "step": 6260 + }, + { + "epoch": 0.361894317624311, + "grad_norm": 3.3092133128777017, + "learning_rate": 6.382266351093922e-06, + "loss": 0.3727, + "step": 6270 + }, + { + "epoch": 0.362471501543967, + "grad_norm": 2.6436967571480308, + "learning_rate": 6.376493678923974e-06, + "loss": 0.3869, + "step": 6280 + }, + { + "epoch": 0.363048685463623, + "grad_norm": 4.870192599092706, + "learning_rate": 6.370721006754027e-06, + "loss": 0.3711, + "step": 6290 + }, + { + "epoch": 0.36362586938327895, + "grad_norm": 6.412850489358521, + "learning_rate": 6.364948334584079e-06, + "loss": 0.3778, + "step": 6300 + }, + { + "epoch": 0.36420305330293495, + "grad_norm": 6.723734658950526, + "learning_rate": 6.359175662414132e-06, + "loss": 0.375, + "step": 6310 + }, + { + "epoch": 0.36478023722259095, + "grad_norm": 2.9855811704461916, + "learning_rate": 6.353402990244184e-06, + "loss": 0.3968, + "step": 6320 + }, + { + "epoch": 0.36535742114224695, + "grad_norm": 4.253318369577758, + "learning_rate": 6.3476303180742374e-06, + "loss": 0.3835, + "step": 6330 + }, + { + "epoch": 0.36593460506190295, + "grad_norm": 3.350507256204714, + "learning_rate": 6.341857645904289e-06, + "loss": 0.3934, + "step": 6340 + }, + { + "epoch": 0.36651178898155895, + "grad_norm": 7.6596745260829024, + "learning_rate": 6.336084973734342e-06, + "loss": 0.3695, + "step": 6350 + }, + { + "epoch": 0.36708897290121495, + "grad_norm": 5.170501025053992, + "learning_rate": 6.330312301564394e-06, + "loss": 0.3822, + "step": 6360 + }, + { + "epoch": 0.36766615682087095, + "grad_norm": 3.3572394366722342, + "learning_rate": 6.324539629394447e-06, + "loss": 0.3756, + "step": 6370 + }, + { + "epoch": 0.36824334074052695, + "grad_norm": 4.222113472184697, + "learning_rate": 6.3187669572245e-06, + "loss": 0.3934, + "step": 6380 + }, + { + "epoch": 0.36882052466018295, + "grad_norm": 4.239041230078147, + "learning_rate": 6.312994285054552e-06, + "loss": 0.3841, + "step": 6390 + }, + { + "epoch": 0.36939770857983895, + "grad_norm": 3.8859530952931425, + "learning_rate": 6.307221612884605e-06, + "loss": 0.3747, + "step": 6400 + }, + { + "epoch": 0.36997489249949495, + "grad_norm": 5.0107950962438315, + "learning_rate": 6.301448940714657e-06, + "loss": 0.3781, + "step": 6410 + }, + { + "epoch": 0.37055207641915094, + "grad_norm": 19.339302178387896, + "learning_rate": 6.29567626854471e-06, + "loss": 0.3804, + "step": 6420 + }, + { + "epoch": 0.37112926033880694, + "grad_norm": 103.6902739739912, + "learning_rate": 6.2899035963747626e-06, + "loss": 0.3614, + "step": 6430 + }, + { + "epoch": 0.37170644425846294, + "grad_norm": 9.795833830078546, + "learning_rate": 6.284130924204816e-06, + "loss": 0.3806, + "step": 6440 + }, + { + "epoch": 0.37228362817811894, + "grad_norm": 2.835184422290757, + "learning_rate": 6.278358252034868e-06, + "loss": 0.3635, + "step": 6450 + }, + { + "epoch": 0.37286081209777494, + "grad_norm": 17.867434742010555, + "learning_rate": 6.272585579864921e-06, + "loss": 0.3825, + "step": 6460 + }, + { + "epoch": 0.37343799601743094, + "grad_norm": 3.5064803159499554, + "learning_rate": 6.266812907694972e-06, + "loss": 0.3777, + "step": 6470 + }, + { + "epoch": 0.37401517993708694, + "grad_norm": 6.18556265562049, + "learning_rate": 6.261040235525025e-06, + "loss": 0.354, + "step": 6480 + }, + { + "epoch": 0.37459236385674294, + "grad_norm": 5.3643329344286, + "learning_rate": 6.255267563355077e-06, + "loss": 0.3653, + "step": 6490 + }, + { + "epoch": 0.37516954777639894, + "grad_norm": 4.370066633666132, + "learning_rate": 6.24949489118513e-06, + "loss": 0.3799, + "step": 6500 + }, + { + "epoch": 0.37574673169605494, + "grad_norm": 3.802802160469247, + "learning_rate": 6.2437222190151825e-06, + "loss": 0.3771, + "step": 6510 + }, + { + "epoch": 0.37632391561571094, + "grad_norm": 2.999312565631662, + "learning_rate": 6.2379495468452355e-06, + "loss": 0.3761, + "step": 6520 + }, + { + "epoch": 0.37690109953536693, + "grad_norm": 7.852310497644898, + "learning_rate": 6.232176874675288e-06, + "loss": 0.3823, + "step": 6530 + }, + { + "epoch": 0.37747828345502293, + "grad_norm": 4.876630434197547, + "learning_rate": 6.226404202505341e-06, + "loss": 0.3704, + "step": 6540 + }, + { + "epoch": 0.37805546737467893, + "grad_norm": 4.989568751322678, + "learning_rate": 6.220631530335393e-06, + "loss": 0.3836, + "step": 6550 + }, + { + "epoch": 0.37863265129433493, + "grad_norm": 3.7548796873491135, + "learning_rate": 6.214858858165446e-06, + "loss": 0.3759, + "step": 6560 + }, + { + "epoch": 0.37920983521399093, + "grad_norm": 2.7376006597130513, + "learning_rate": 6.209086185995497e-06, + "loss": 0.3825, + "step": 6570 + }, + { + "epoch": 0.37978701913364693, + "grad_norm": 3.835955921955649, + "learning_rate": 6.203313513825551e-06, + "loss": 0.3892, + "step": 6580 + }, + { + "epoch": 0.38036420305330293, + "grad_norm": 2.728138816573778, + "learning_rate": 6.197540841655602e-06, + "loss": 0.3806, + "step": 6590 + }, + { + "epoch": 0.38094138697295893, + "grad_norm": 6.331854012865428, + "learning_rate": 6.1917681694856554e-06, + "loss": 0.3849, + "step": 6600 + }, + { + "epoch": 0.3815185708926149, + "grad_norm": 2.7712160688455394, + "learning_rate": 6.185995497315708e-06, + "loss": 0.366, + "step": 6610 + }, + { + "epoch": 0.3820957548122709, + "grad_norm": 2.2697691277132406, + "learning_rate": 6.180222825145761e-06, + "loss": 0.3625, + "step": 6620 + }, + { + "epoch": 0.3826729387319269, + "grad_norm": 4.51763189851551, + "learning_rate": 6.174450152975813e-06, + "loss": 0.3745, + "step": 6630 + }, + { + "epoch": 0.3832501226515829, + "grad_norm": 22.00920716007038, + "learning_rate": 6.168677480805866e-06, + "loss": 0.379, + "step": 6640 + }, + { + "epoch": 0.3838273065712389, + "grad_norm": 8.50487988964264, + "learning_rate": 6.162904808635918e-06, + "loss": 0.3798, + "step": 6650 + }, + { + "epoch": 0.3844044904908949, + "grad_norm": 2.756542308650349, + "learning_rate": 6.157132136465971e-06, + "loss": 0.3777, + "step": 6660 + }, + { + "epoch": 0.3849816744105509, + "grad_norm": 1.8967629666152492, + "learning_rate": 6.151359464296023e-06, + "loss": 0.3536, + "step": 6670 + }, + { + "epoch": 0.3855588583302069, + "grad_norm": 2.208647530669507, + "learning_rate": 6.145586792126076e-06, + "loss": 0.3757, + "step": 6680 + }, + { + "epoch": 0.3861360422498629, + "grad_norm": 3.18500818944882, + "learning_rate": 6.1398141199561275e-06, + "loss": 0.381, + "step": 6690 + }, + { + "epoch": 0.3867132261695189, + "grad_norm": 3.7319272107204267, + "learning_rate": 6.1340414477861806e-06, + "loss": 0.3895, + "step": 6700 + }, + { + "epoch": 0.3872904100891749, + "grad_norm": 6.233879379077169, + "learning_rate": 6.128268775616233e-06, + "loss": 0.3931, + "step": 6710 + }, + { + "epoch": 0.3878675940088309, + "grad_norm": 2.5172058960090147, + "learning_rate": 6.122496103446286e-06, + "loss": 0.3696, + "step": 6720 + }, + { + "epoch": 0.3884447779284869, + "grad_norm": 2.4821687545852544, + "learning_rate": 6.116723431276338e-06, + "loss": 0.3783, + "step": 6730 + }, + { + "epoch": 0.3890219618481429, + "grad_norm": 2.5811379708324984, + "learning_rate": 6.110950759106391e-06, + "loss": 0.3883, + "step": 6740 + }, + { + "epoch": 0.3895991457677989, + "grad_norm": 4.606721510393016, + "learning_rate": 6.105178086936443e-06, + "loss": 0.364, + "step": 6750 + }, + { + "epoch": 0.3901763296874549, + "grad_norm": 5.353229433626119, + "learning_rate": 6.099405414766496e-06, + "loss": 0.3882, + "step": 6760 + }, + { + "epoch": 0.3907535136071109, + "grad_norm": 2.3516345262109617, + "learning_rate": 6.093632742596548e-06, + "loss": 0.3788, + "step": 6770 + }, + { + "epoch": 0.3913306975267669, + "grad_norm": 11.487680253286674, + "learning_rate": 6.087860070426601e-06, + "loss": 0.3889, + "step": 6780 + }, + { + "epoch": 0.3919078814464229, + "grad_norm": 3.18290202413646, + "learning_rate": 6.082087398256653e-06, + "loss": 0.3607, + "step": 6790 + }, + { + "epoch": 0.3924850653660789, + "grad_norm": 2.7380986355865917, + "learning_rate": 6.0763147260867065e-06, + "loss": 0.3809, + "step": 6800 + }, + { + "epoch": 0.3930622492857349, + "grad_norm": 2.985403565819371, + "learning_rate": 6.070542053916758e-06, + "loss": 0.3785, + "step": 6810 + }, + { + "epoch": 0.3936394332053909, + "grad_norm": 7.257850197480963, + "learning_rate": 6.064769381746811e-06, + "loss": 0.3813, + "step": 6820 + }, + { + "epoch": 0.3942166171250469, + "grad_norm": 2.651981631840983, + "learning_rate": 6.058996709576863e-06, + "loss": 0.3911, + "step": 6830 + }, + { + "epoch": 0.3947938010447029, + "grad_norm": 3.007540853480136, + "learning_rate": 6.053224037406916e-06, + "loss": 0.3787, + "step": 6840 + }, + { + "epoch": 0.3953709849643589, + "grad_norm": 4.967113215124695, + "learning_rate": 6.047451365236968e-06, + "loss": 0.3729, + "step": 6850 + }, + { + "epoch": 0.3959481688840149, + "grad_norm": 2.4113519734571627, + "learning_rate": 6.041678693067021e-06, + "loss": 0.3576, + "step": 6860 + }, + { + "epoch": 0.3965253528036709, + "grad_norm": 1.5215990778439656, + "learning_rate": 6.0359060208970734e-06, + "loss": 0.3813, + "step": 6870 + }, + { + "epoch": 0.3971025367233269, + "grad_norm": 1.9980571139407164, + "learning_rate": 6.0301333487271265e-06, + "loss": 0.3764, + "step": 6880 + }, + { + "epoch": 0.3976797206429829, + "grad_norm": 3.851850368869639, + "learning_rate": 6.024360676557179e-06, + "loss": 0.3793, + "step": 6890 + }, + { + "epoch": 0.3982569045626389, + "grad_norm": 2.819413612633571, + "learning_rate": 6.018588004387232e-06, + "loss": 0.3915, + "step": 6900 + }, + { + "epoch": 0.3988340884822949, + "grad_norm": 2.2801532893497733, + "learning_rate": 6.012815332217285e-06, + "loss": 0.3927, + "step": 6910 + }, + { + "epoch": 0.3994112724019509, + "grad_norm": 3.0396536780138734, + "learning_rate": 6.007042660047336e-06, + "loss": 0.3787, + "step": 6920 + }, + { + "epoch": 0.3999884563216069, + "grad_norm": 1.9908365824878806, + "learning_rate": 6.00126998787739e-06, + "loss": 0.3751, + "step": 6930 + }, + { + "epoch": 0.4005656402412629, + "grad_norm": 2.5287281468598817, + "learning_rate": 5.995497315707441e-06, + "loss": 0.3739, + "step": 6940 + }, + { + "epoch": 0.4011428241609189, + "grad_norm": 2.3877649628077404, + "learning_rate": 5.989724643537494e-06, + "loss": 0.3931, + "step": 6950 + }, + { + "epoch": 0.4017200080805749, + "grad_norm": 1.751274625854989, + "learning_rate": 5.983951971367546e-06, + "loss": 0.3725, + "step": 6960 + }, + { + "epoch": 0.4022971920002309, + "grad_norm": 7.097265852789204, + "learning_rate": 5.978179299197599e-06, + "loss": 0.3771, + "step": 6970 + }, + { + "epoch": 0.4028743759198869, + "grad_norm": 1.5751898011207688, + "learning_rate": 5.972406627027652e-06, + "loss": 0.3804, + "step": 6980 + }, + { + "epoch": 0.4034515598395429, + "grad_norm": 3.487612778253862, + "learning_rate": 5.966633954857705e-06, + "loss": 0.3677, + "step": 6990 + }, + { + "epoch": 0.4040287437591989, + "grad_norm": 2.622057972311098, + "learning_rate": 5.960861282687757e-06, + "loss": 0.3778, + "step": 7000 + }, + { + "epoch": 0.4046059276788549, + "grad_norm": 2.7368469799858532, + "learning_rate": 5.95508861051781e-06, + "loss": 0.3768, + "step": 7010 + }, + { + "epoch": 0.4051831115985109, + "grad_norm": 1.6133398127083427, + "learning_rate": 5.949315938347862e-06, + "loss": 0.3799, + "step": 7020 + }, + { + "epoch": 0.4057602955181669, + "grad_norm": 3.191334805976918, + "learning_rate": 5.943543266177915e-06, + "loss": 0.3813, + "step": 7030 + }, + { + "epoch": 0.40633747943782283, + "grad_norm": 2.8991810624406784, + "learning_rate": 5.937770594007966e-06, + "loss": 0.376, + "step": 7040 + }, + { + "epoch": 0.40691466335747883, + "grad_norm": 2.0785390805202684, + "learning_rate": 5.931997921838019e-06, + "loss": 0.3729, + "step": 7050 + }, + { + "epoch": 0.40749184727713483, + "grad_norm": 1.9512094562324862, + "learning_rate": 5.9262252496680715e-06, + "loss": 0.3732, + "step": 7060 + }, + { + "epoch": 0.40806903119679083, + "grad_norm": 3.3176725840902206, + "learning_rate": 5.9204525774981245e-06, + "loss": 0.3874, + "step": 7070 + }, + { + "epoch": 0.40864621511644683, + "grad_norm": 3.000837724994079, + "learning_rate": 5.914679905328177e-06, + "loss": 0.3745, + "step": 7080 + }, + { + "epoch": 0.40922339903610283, + "grad_norm": 1.8158962267665133, + "learning_rate": 5.90890723315823e-06, + "loss": 0.3756, + "step": 7090 + }, + { + "epoch": 0.40980058295575883, + "grad_norm": 2.324389501252935, + "learning_rate": 5.903134560988282e-06, + "loss": 0.3886, + "step": 7100 + }, + { + "epoch": 0.41037776687541483, + "grad_norm": 2.894571332845524, + "learning_rate": 5.897361888818335e-06, + "loss": 0.3869, + "step": 7110 + }, + { + "epoch": 0.41095495079507083, + "grad_norm": 2.629677485680801, + "learning_rate": 5.891589216648387e-06, + "loss": 0.3615, + "step": 7120 + }, + { + "epoch": 0.4115321347147268, + "grad_norm": 13.759434005163566, + "learning_rate": 5.88581654447844e-06, + "loss": 0.3678, + "step": 7130 + }, + { + "epoch": 0.4121093186343828, + "grad_norm": 3.187249272214218, + "learning_rate": 5.8800438723084915e-06, + "loss": 0.3496, + "step": 7140 + }, + { + "epoch": 0.4126865025540388, + "grad_norm": 25.01828326406148, + "learning_rate": 5.874271200138545e-06, + "loss": 0.3831, + "step": 7150 + }, + { + "epoch": 0.4132636864736948, + "grad_norm": 4.28899556920541, + "learning_rate": 5.868498527968597e-06, + "loss": 0.3847, + "step": 7160 + }, + { + "epoch": 0.4138408703933508, + "grad_norm": 5.017592395582479, + "learning_rate": 5.86272585579865e-06, + "loss": 0.3528, + "step": 7170 + }, + { + "epoch": 0.4144180543130068, + "grad_norm": 3.6138133944499686, + "learning_rate": 5.856953183628702e-06, + "loss": 0.3615, + "step": 7180 + }, + { + "epoch": 0.4149952382326628, + "grad_norm": 11.345281193048963, + "learning_rate": 5.851180511458755e-06, + "loss": 0.36, + "step": 7190 + }, + { + "epoch": 0.4155724221523188, + "grad_norm": 4.1575514029124525, + "learning_rate": 5.845407839288807e-06, + "loss": 0.3707, + "step": 7200 + }, + { + "epoch": 0.4161496060719748, + "grad_norm": 5.184879687211155, + "learning_rate": 5.83963516711886e-06, + "loss": 0.3584, + "step": 7210 + }, + { + "epoch": 0.4167267899916308, + "grad_norm": 3.6353294525038256, + "learning_rate": 5.833862494948912e-06, + "loss": 0.3922, + "step": 7220 + }, + { + "epoch": 0.4173039739112868, + "grad_norm": 10.083912587939164, + "learning_rate": 5.828089822778965e-06, + "loss": 0.358, + "step": 7230 + }, + { + "epoch": 0.4178811578309428, + "grad_norm": 3.795430776940293, + "learning_rate": 5.822317150609017e-06, + "loss": 0.3584, + "step": 7240 + }, + { + "epoch": 0.4184583417505988, + "grad_norm": 2.735432424886805, + "learning_rate": 5.8165444784390704e-06, + "loss": 0.3628, + "step": 7250 + }, + { + "epoch": 0.4190355256702548, + "grad_norm": 3.7538394849350034, + "learning_rate": 5.810771806269122e-06, + "loss": 0.3808, + "step": 7260 + }, + { + "epoch": 0.4196127095899108, + "grad_norm": 3.486146744000872, + "learning_rate": 5.804999134099175e-06, + "loss": 0.3677, + "step": 7270 + }, + { + "epoch": 0.4201898935095668, + "grad_norm": 6.482596192596544, + "learning_rate": 5.799226461929227e-06, + "loss": 0.3715, + "step": 7280 + }, + { + "epoch": 0.4207670774292228, + "grad_norm": 15.287481532081374, + "learning_rate": 5.79345378975928e-06, + "loss": 0.3607, + "step": 7290 + }, + { + "epoch": 0.4213442613488788, + "grad_norm": 5.756011268210783, + "learning_rate": 5.787681117589332e-06, + "loss": 0.3829, + "step": 7300 + }, + { + "epoch": 0.4219214452685348, + "grad_norm": 5.238271188240731, + "learning_rate": 5.781908445419385e-06, + "loss": 0.3685, + "step": 7310 + }, + { + "epoch": 0.4224986291881908, + "grad_norm": 5.072523904302979, + "learning_rate": 5.776135773249437e-06, + "loss": 0.3785, + "step": 7320 + }, + { + "epoch": 0.4230758131078468, + "grad_norm": 2.7230926250144494, + "learning_rate": 5.77036310107949e-06, + "loss": 0.3586, + "step": 7330 + }, + { + "epoch": 0.4236529970275028, + "grad_norm": 3.1651643016202438, + "learning_rate": 5.7645904289095425e-06, + "loss": 0.3675, + "step": 7340 + }, + { + "epoch": 0.4242301809471588, + "grad_norm": 5.575569273909336, + "learning_rate": 5.7588177567395956e-06, + "loss": 0.3659, + "step": 7350 + }, + { + "epoch": 0.4248073648668148, + "grad_norm": 3.4372405530276686, + "learning_rate": 5.753045084569647e-06, + "loss": 0.3562, + "step": 7360 + }, + { + "epoch": 0.4253845487864708, + "grad_norm": 3.1380962366302203, + "learning_rate": 5.747272412399701e-06, + "loss": 0.3665, + "step": 7370 + }, + { + "epoch": 0.4259617327061268, + "grad_norm": 4.195020514299469, + "learning_rate": 5.741499740229752e-06, + "loss": 0.3834, + "step": 7380 + }, + { + "epoch": 0.4265389166257828, + "grad_norm": 2.5201571788814103, + "learning_rate": 5.735727068059805e-06, + "loss": 0.3598, + "step": 7390 + }, + { + "epoch": 0.4271161005454388, + "grad_norm": 7.757366621212017, + "learning_rate": 5.729954395889857e-06, + "loss": 0.365, + "step": 7400 + }, + { + "epoch": 0.4276932844650948, + "grad_norm": 3.6863947123217438, + "learning_rate": 5.72418172371991e-06, + "loss": 0.3605, + "step": 7410 + }, + { + "epoch": 0.4282704683847508, + "grad_norm": 2.713386138832286, + "learning_rate": 5.7184090515499625e-06, + "loss": 0.3692, + "step": 7420 + }, + { + "epoch": 0.4288476523044068, + "grad_norm": 4.061486235134526, + "learning_rate": 5.7126363793800155e-06, + "loss": 0.3615, + "step": 7430 + }, + { + "epoch": 0.4294248362240628, + "grad_norm": 3.171386095616653, + "learning_rate": 5.7068637072100685e-06, + "loss": 0.3742, + "step": 7440 + }, + { + "epoch": 0.4300020201437188, + "grad_norm": 3.0632786743675173, + "learning_rate": 5.701091035040121e-06, + "loss": 0.3669, + "step": 7450 + }, + { + "epoch": 0.4305792040633748, + "grad_norm": 5.9823682538619884, + "learning_rate": 5.695318362870174e-06, + "loss": 0.3625, + "step": 7460 + }, + { + "epoch": 0.4311563879830308, + "grad_norm": 2.587936253733615, + "learning_rate": 5.689545690700226e-06, + "loss": 0.3654, + "step": 7470 + }, + { + "epoch": 0.4317335719026868, + "grad_norm": 3.3311960193145507, + "learning_rate": 5.683773018530279e-06, + "loss": 0.3849, + "step": 7480 + }, + { + "epoch": 0.4323107558223428, + "grad_norm": 7.2504266943512885, + "learning_rate": 5.67800034636033e-06, + "loss": 0.383, + "step": 7490 + }, + { + "epoch": 0.4328879397419988, + "grad_norm": 2.565927845188403, + "learning_rate": 5.672227674190383e-06, + "loss": 0.3689, + "step": 7500 + }, + { + "epoch": 0.4334651236616548, + "grad_norm": 2.596436540237007, + "learning_rate": 5.6664550020204354e-06, + "loss": 0.3783, + "step": 7510 + }, + { + "epoch": 0.4340423075813108, + "grad_norm": 3.2568921956880397, + "learning_rate": 5.6606823298504884e-06, + "loss": 0.3622, + "step": 7520 + }, + { + "epoch": 0.4346194915009668, + "grad_norm": 3.534180092969136, + "learning_rate": 5.654909657680541e-06, + "loss": 0.374, + "step": 7530 + }, + { + "epoch": 0.4351966754206228, + "grad_norm": 2.320956209280894, + "learning_rate": 5.649136985510594e-06, + "loss": 0.3618, + "step": 7540 + }, + { + "epoch": 0.4357738593402788, + "grad_norm": 2.7986565358175732, + "learning_rate": 5.643364313340646e-06, + "loss": 0.376, + "step": 7550 + }, + { + "epoch": 0.4363510432599348, + "grad_norm": 4.3640055595975955, + "learning_rate": 5.637591641170699e-06, + "loss": 0.359, + "step": 7560 + }, + { + "epoch": 0.4369282271795908, + "grad_norm": 2.4295878318318893, + "learning_rate": 5.631818969000751e-06, + "loss": 0.3837, + "step": 7570 + }, + { + "epoch": 0.4375054110992468, + "grad_norm": 2.5358015892610304, + "learning_rate": 5.626046296830804e-06, + "loss": 0.3824, + "step": 7580 + }, + { + "epoch": 0.4380825950189028, + "grad_norm": 2.732560193932699, + "learning_rate": 5.620273624660856e-06, + "loss": 0.3657, + "step": 7590 + }, + { + "epoch": 0.4386597789385588, + "grad_norm": 4.150107259821488, + "learning_rate": 5.614500952490909e-06, + "loss": 0.3805, + "step": 7600 + }, + { + "epoch": 0.4392369628582148, + "grad_norm": 6.027002919837396, + "learning_rate": 5.6087282803209605e-06, + "loss": 0.3733, + "step": 7610 + }, + { + "epoch": 0.4398141467778708, + "grad_norm": 4.383047686001244, + "learning_rate": 5.6029556081510136e-06, + "loss": 0.3798, + "step": 7620 + }, + { + "epoch": 0.4403913306975268, + "grad_norm": 3.183548428631444, + "learning_rate": 5.597182935981066e-06, + "loss": 0.3704, + "step": 7630 + }, + { + "epoch": 0.4409685146171828, + "grad_norm": 3.2995502847867364, + "learning_rate": 5.591410263811119e-06, + "loss": 0.3868, + "step": 7640 + }, + { + "epoch": 0.4415456985368388, + "grad_norm": 2.9302522543070384, + "learning_rate": 5.585637591641171e-06, + "loss": 0.3719, + "step": 7650 + }, + { + "epoch": 0.4421228824564948, + "grad_norm": 3.699827330713927, + "learning_rate": 5.579864919471224e-06, + "loss": 0.3843, + "step": 7660 + }, + { + "epoch": 0.4427000663761508, + "grad_norm": 10.131740140492866, + "learning_rate": 5.574092247301276e-06, + "loss": 0.3741, + "step": 7670 + }, + { + "epoch": 0.4432772502958068, + "grad_norm": 2.53184828015941, + "learning_rate": 5.568319575131329e-06, + "loss": 0.3679, + "step": 7680 + }, + { + "epoch": 0.4438544342154628, + "grad_norm": 2.735336367396379, + "learning_rate": 5.562546902961381e-06, + "loss": 0.3794, + "step": 7690 + }, + { + "epoch": 0.4444316181351188, + "grad_norm": 3.118950089045635, + "learning_rate": 5.556774230791434e-06, + "loss": 0.3749, + "step": 7700 + }, + { + "epoch": 0.4450088020547748, + "grad_norm": 2.0345152708541736, + "learning_rate": 5.551001558621486e-06, + "loss": 0.3797, + "step": 7710 + }, + { + "epoch": 0.4455859859744308, + "grad_norm": 2.216729946357023, + "learning_rate": 5.5452288864515395e-06, + "loss": 0.3704, + "step": 7720 + }, + { + "epoch": 0.4461631698940868, + "grad_norm": 11.605317744790039, + "learning_rate": 5.539456214281591e-06, + "loss": 0.3826, + "step": 7730 + }, + { + "epoch": 0.4467403538137428, + "grad_norm": 5.555060033849291, + "learning_rate": 5.533683542111644e-06, + "loss": 0.3768, + "step": 7740 + }, + { + "epoch": 0.4473175377333988, + "grad_norm": 2.5005756710793507, + "learning_rate": 5.527910869941696e-06, + "loss": 0.3667, + "step": 7750 + }, + { + "epoch": 0.4478947216530548, + "grad_norm": 4.5009505264076655, + "learning_rate": 5.522138197771749e-06, + "loss": 0.3773, + "step": 7760 + }, + { + "epoch": 0.44847190557271077, + "grad_norm": 2.6271424623008945, + "learning_rate": 5.516365525601801e-06, + "loss": 0.3737, + "step": 7770 + }, + { + "epoch": 0.4490490894923667, + "grad_norm": 12.12839502479119, + "learning_rate": 5.510592853431854e-06, + "loss": 0.3573, + "step": 7780 + }, + { + "epoch": 0.4496262734120227, + "grad_norm": 4.194839597547066, + "learning_rate": 5.5048201812619064e-06, + "loss": 0.3774, + "step": 7790 + }, + { + "epoch": 0.4502034573316787, + "grad_norm": 2.1887367164733016, + "learning_rate": 5.4990475090919595e-06, + "loss": 0.381, + "step": 7800 + }, + { + "epoch": 0.4507806412513347, + "grad_norm": 3.1886528624855925, + "learning_rate": 5.493274836922012e-06, + "loss": 0.37, + "step": 7810 + }, + { + "epoch": 0.4513578251709907, + "grad_norm": 2.4110545743480527, + "learning_rate": 5.487502164752065e-06, + "loss": 0.3604, + "step": 7820 + }, + { + "epoch": 0.4519350090906467, + "grad_norm": 2.9847050808092166, + "learning_rate": 5.481729492582116e-06, + "loss": 0.3675, + "step": 7830 + }, + { + "epoch": 0.4525121930103027, + "grad_norm": 5.885118240316819, + "learning_rate": 5.475956820412169e-06, + "loss": 0.3856, + "step": 7840 + }, + { + "epoch": 0.4530893769299587, + "grad_norm": 5.303575867358966, + "learning_rate": 5.470184148242221e-06, + "loss": 0.3607, + "step": 7850 + }, + { + "epoch": 0.4536665608496147, + "grad_norm": 7.1549036006295035, + "learning_rate": 5.464411476072274e-06, + "loss": 0.3699, + "step": 7860 + }, + { + "epoch": 0.4542437447692707, + "grad_norm": 1.8926865310221554, + "learning_rate": 5.458638803902326e-06, + "loss": 0.3676, + "step": 7870 + }, + { + "epoch": 0.4548209286889267, + "grad_norm": 4.130403133399794, + "learning_rate": 5.452866131732379e-06, + "loss": 0.3624, + "step": 7880 + }, + { + "epoch": 0.4553981126085827, + "grad_norm": 3.4241484954051677, + "learning_rate": 5.4470934595624316e-06, + "loss": 0.3652, + "step": 7890 + }, + { + "epoch": 0.4559752965282387, + "grad_norm": 7.8305369558715725, + "learning_rate": 5.441320787392485e-06, + "loss": 0.35, + "step": 7900 + }, + { + "epoch": 0.4565524804478947, + "grad_norm": 7.1372161394964575, + "learning_rate": 5.435548115222537e-06, + "loss": 0.3742, + "step": 7910 + }, + { + "epoch": 0.4571296643675507, + "grad_norm": 20.273585832785447, + "learning_rate": 5.42977544305259e-06, + "loss": 0.3769, + "step": 7920 + }, + { + "epoch": 0.4577068482872067, + "grad_norm": 2.794498898766565, + "learning_rate": 5.424002770882641e-06, + "loss": 0.384, + "step": 7930 + }, + { + "epoch": 0.4582840322068627, + "grad_norm": 7.066469891261649, + "learning_rate": 5.418230098712695e-06, + "loss": 0.3795, + "step": 7940 + }, + { + "epoch": 0.4588612161265187, + "grad_norm": 2.8353456413911737, + "learning_rate": 5.412457426542746e-06, + "loss": 0.3643, + "step": 7950 + }, + { + "epoch": 0.4594384000461747, + "grad_norm": 3.0383570429357345, + "learning_rate": 5.406684754372799e-06, + "loss": 0.3718, + "step": 7960 + }, + { + "epoch": 0.4600155839658307, + "grad_norm": 3.6164323938018734, + "learning_rate": 5.400912082202852e-06, + "loss": 0.3849, + "step": 7970 + }, + { + "epoch": 0.4605927678854867, + "grad_norm": 2.7123845726783262, + "learning_rate": 5.3951394100329045e-06, + "loss": 0.3683, + "step": 7980 + }, + { + "epoch": 0.4611699518051427, + "grad_norm": 2.312599361300853, + "learning_rate": 5.3893667378629575e-06, + "loss": 0.3798, + "step": 7990 + }, + { + "epoch": 0.4617471357247987, + "grad_norm": 3.251330933463336, + "learning_rate": 5.38359406569301e-06, + "loss": 0.3587, + "step": 8000 + }, + { + "epoch": 0.4623243196444547, + "grad_norm": 6.596375932856641, + "learning_rate": 5.377821393523063e-06, + "loss": 0.3802, + "step": 8010 + }, + { + "epoch": 0.4629015035641107, + "grad_norm": 1.8050467998180781, + "learning_rate": 5.372048721353115e-06, + "loss": 0.3712, + "step": 8020 + }, + { + "epoch": 0.4634786874837667, + "grad_norm": 5.472845808317412, + "learning_rate": 5.366276049183168e-06, + "loss": 0.377, + "step": 8030 + }, + { + "epoch": 0.4640558714034227, + "grad_norm": 21.192833022577837, + "learning_rate": 5.36050337701322e-06, + "loss": 0.3745, + "step": 8040 + }, + { + "epoch": 0.4646330553230787, + "grad_norm": 2.3954727021255677, + "learning_rate": 5.354730704843273e-06, + "loss": 0.368, + "step": 8050 + }, + { + "epoch": 0.4652102392427347, + "grad_norm": 3.6751726559767652, + "learning_rate": 5.3489580326733245e-06, + "loss": 0.3803, + "step": 8060 + }, + { + "epoch": 0.4657874231623907, + "grad_norm": 2.6299220528922316, + "learning_rate": 5.3431853605033775e-06, + "loss": 0.3655, + "step": 8070 + }, + { + "epoch": 0.4663646070820467, + "grad_norm": 4.147182980327485, + "learning_rate": 5.33741268833343e-06, + "loss": 0.376, + "step": 8080 + }, + { + "epoch": 0.4669417910017027, + "grad_norm": 2.3318408925884526, + "learning_rate": 5.331640016163483e-06, + "loss": 0.3793, + "step": 8090 + }, + { + "epoch": 0.4675189749213587, + "grad_norm": 2.5630808791681106, + "learning_rate": 5.325867343993535e-06, + "loss": 0.3841, + "step": 8100 + }, + { + "epoch": 0.4680961588410147, + "grad_norm": 16.561602524939726, + "learning_rate": 5.320094671823588e-06, + "loss": 0.3628, + "step": 8110 + }, + { + "epoch": 0.4686733427606707, + "grad_norm": 2.655435817667697, + "learning_rate": 5.31432199965364e-06, + "loss": 0.3827, + "step": 8120 + }, + { + "epoch": 0.4692505266803267, + "grad_norm": 2.082608101455672, + "learning_rate": 5.308549327483693e-06, + "loss": 0.3726, + "step": 8130 + }, + { + "epoch": 0.4698277105999827, + "grad_norm": 7.450725164048278, + "learning_rate": 5.302776655313745e-06, + "loss": 0.3607, + "step": 8140 + }, + { + "epoch": 0.4704048945196387, + "grad_norm": 3.727260702005544, + "learning_rate": 5.297003983143798e-06, + "loss": 0.3666, + "step": 8150 + }, + { + "epoch": 0.4709820784392947, + "grad_norm": 2.9372607445086816, + "learning_rate": 5.29123131097385e-06, + "loss": 0.3638, + "step": 8160 + }, + { + "epoch": 0.4715592623589507, + "grad_norm": 3.011083825137573, + "learning_rate": 5.2854586388039034e-06, + "loss": 0.3711, + "step": 8170 + }, + { + "epoch": 0.4721364462786067, + "grad_norm": 5.58881805540413, + "learning_rate": 5.279685966633955e-06, + "loss": 0.3737, + "step": 8180 + }, + { + "epoch": 0.4727136301982627, + "grad_norm": 1.9307643066917193, + "learning_rate": 5.273913294464008e-06, + "loss": 0.3633, + "step": 8190 + }, + { + "epoch": 0.4732908141179187, + "grad_norm": 2.246158235550298, + "learning_rate": 5.26814062229406e-06, + "loss": 0.3811, + "step": 8200 + }, + { + "epoch": 0.4738679980375747, + "grad_norm": 5.378095626076901, + "learning_rate": 5.262367950124113e-06, + "loss": 0.3772, + "step": 8210 + }, + { + "epoch": 0.4744451819572307, + "grad_norm": 3.594146830036725, + "learning_rate": 5.256595277954165e-06, + "loss": 0.3689, + "step": 8220 + }, + { + "epoch": 0.4750223658768867, + "grad_norm": 1.880161955339062, + "learning_rate": 5.250822605784218e-06, + "loss": 0.3682, + "step": 8230 + }, + { + "epoch": 0.4755995497965427, + "grad_norm": 3.655512057019781, + "learning_rate": 5.24504993361427e-06, + "loss": 0.3601, + "step": 8240 + }, + { + "epoch": 0.4761767337161987, + "grad_norm": 4.5822643890603345, + "learning_rate": 5.239277261444323e-06, + "loss": 0.3799, + "step": 8250 + }, + { + "epoch": 0.4767539176358547, + "grad_norm": 10.062266012222976, + "learning_rate": 5.2335045892743755e-06, + "loss": 0.3651, + "step": 8260 + }, + { + "epoch": 0.4773311015555107, + "grad_norm": 7.820363388846345, + "learning_rate": 5.2277319171044286e-06, + "loss": 0.3629, + "step": 8270 + }, + { + "epoch": 0.47790828547516667, + "grad_norm": 3.04998159803599, + "learning_rate": 5.22195924493448e-06, + "loss": 0.3693, + "step": 8280 + }, + { + "epoch": 0.47848546939482267, + "grad_norm": 4.5635676614995475, + "learning_rate": 5.216186572764533e-06, + "loss": 0.384, + "step": 8290 + }, + { + "epoch": 0.47906265331447867, + "grad_norm": 2.1736842754902708, + "learning_rate": 5.210413900594585e-06, + "loss": 0.363, + "step": 8300 + }, + { + "epoch": 0.47963983723413467, + "grad_norm": 2.5418326594021887, + "learning_rate": 5.204641228424638e-06, + "loss": 0.3697, + "step": 8310 + }, + { + "epoch": 0.48021702115379067, + "grad_norm": 2.692093940326692, + "learning_rate": 5.19886855625469e-06, + "loss": 0.3691, + "step": 8320 + }, + { + "epoch": 0.48079420507344667, + "grad_norm": 2.4052933532982816, + "learning_rate": 5.193095884084743e-06, + "loss": 0.3506, + "step": 8330 + }, + { + "epoch": 0.48137138899310267, + "grad_norm": 6.620567354797733, + "learning_rate": 5.1873232119147955e-06, + "loss": 0.3838, + "step": 8340 + }, + { + "epoch": 0.48194857291275867, + "grad_norm": 2.9255268074292555, + "learning_rate": 5.1815505397448485e-06, + "loss": 0.3694, + "step": 8350 + }, + { + "epoch": 0.48252575683241467, + "grad_norm": 13.37602989555681, + "learning_rate": 5.175777867574901e-06, + "loss": 0.3752, + "step": 8360 + }, + { + "epoch": 0.48310294075207066, + "grad_norm": 3.511794563236054, + "learning_rate": 5.170005195404954e-06, + "loss": 0.3741, + "step": 8370 + }, + { + "epoch": 0.48368012467172666, + "grad_norm": 4.7758963928181375, + "learning_rate": 5.164232523235006e-06, + "loss": 0.369, + "step": 8380 + }, + { + "epoch": 0.48425730859138266, + "grad_norm": 3.8240515141083002, + "learning_rate": 5.158459851065059e-06, + "loss": 0.374, + "step": 8390 + }, + { + "epoch": 0.48483449251103866, + "grad_norm": 8.100590461673363, + "learning_rate": 5.15268717889511e-06, + "loss": 0.3563, + "step": 8400 + }, + { + "epoch": 0.48541167643069466, + "grad_norm": 4.117224392256427, + "learning_rate": 5.146914506725163e-06, + "loss": 0.3626, + "step": 8410 + }, + { + "epoch": 0.48598886035035066, + "grad_norm": 3.4662871354919904, + "learning_rate": 5.141141834555215e-06, + "loss": 0.3848, + "step": 8420 + }, + { + "epoch": 0.48656604427000666, + "grad_norm": 4.959524985987204, + "learning_rate": 5.1353691623852684e-06, + "loss": 0.3649, + "step": 8430 + }, + { + "epoch": 0.48714322818966266, + "grad_norm": 5.172300477902163, + "learning_rate": 5.129596490215321e-06, + "loss": 0.3732, + "step": 8440 + }, + { + "epoch": 0.48772041210931866, + "grad_norm": 2.996587870660032, + "learning_rate": 5.123823818045374e-06, + "loss": 0.3544, + "step": 8450 + }, + { + "epoch": 0.48829759602897466, + "grad_norm": 5.228719957469869, + "learning_rate": 5.118051145875426e-06, + "loss": 0.3623, + "step": 8460 + }, + { + "epoch": 0.48887477994863066, + "grad_norm": 7.078111368668544, + "learning_rate": 5.112278473705479e-06, + "loss": 0.3521, + "step": 8470 + }, + { + "epoch": 0.48945196386828665, + "grad_norm": 4.585249570356133, + "learning_rate": 5.106505801535531e-06, + "loss": 0.3602, + "step": 8480 + }, + { + "epoch": 0.49002914778794265, + "grad_norm": 3.769909535272591, + "learning_rate": 5.100733129365584e-06, + "loss": 0.3642, + "step": 8490 + }, + { + "epoch": 0.49060633170759865, + "grad_norm": 5.843171007267111, + "learning_rate": 5.094960457195637e-06, + "loss": 0.3844, + "step": 8500 + }, + { + "epoch": 0.49118351562725465, + "grad_norm": 5.011620359523228, + "learning_rate": 5.089187785025688e-06, + "loss": 0.3748, + "step": 8510 + }, + { + "epoch": 0.4917606995469106, + "grad_norm": 3.1629027771513667, + "learning_rate": 5.083415112855742e-06, + "loss": 0.3756, + "step": 8520 + }, + { + "epoch": 0.4923378834665666, + "grad_norm": 3.4287778879420583, + "learning_rate": 5.0776424406857936e-06, + "loss": 0.3654, + "step": 8530 + }, + { + "epoch": 0.4929150673862226, + "grad_norm": 2.3995913819961077, + "learning_rate": 5.0718697685158466e-06, + "loss": 0.3497, + "step": 8540 + }, + { + "epoch": 0.4934922513058786, + "grad_norm": 7.033069082305426, + "learning_rate": 5.066097096345899e-06, + "loss": 0.3679, + "step": 8550 + }, + { + "epoch": 0.4940694352255346, + "grad_norm": 9.94984806585782, + "learning_rate": 5.060324424175952e-06, + "loss": 0.3528, + "step": 8560 + }, + { + "epoch": 0.4946466191451906, + "grad_norm": 3.544058073278801, + "learning_rate": 5.054551752006004e-06, + "loss": 0.3578, + "step": 8570 + }, + { + "epoch": 0.4952238030648466, + "grad_norm": 2.5120291222524176, + "learning_rate": 5.048779079836057e-06, + "loss": 0.3548, + "step": 8580 + }, + { + "epoch": 0.4958009869845026, + "grad_norm": 3.3522580875814887, + "learning_rate": 5.043006407666109e-06, + "loss": 0.3606, + "step": 8590 + }, + { + "epoch": 0.4963781709041586, + "grad_norm": 2.412854685811211, + "learning_rate": 5.037233735496162e-06, + "loss": 0.3605, + "step": 8600 + }, + { + "epoch": 0.4969553548238146, + "grad_norm": 2.9209835829832613, + "learning_rate": 5.031461063326214e-06, + "loss": 0.3566, + "step": 8610 + }, + { + "epoch": 0.4975325387434706, + "grad_norm": 2.6580624391224568, + "learning_rate": 5.025688391156267e-06, + "loss": 0.3806, + "step": 8620 + }, + { + "epoch": 0.4981097226631266, + "grad_norm": 2.9252114550534567, + "learning_rate": 5.019915718986319e-06, + "loss": 0.3596, + "step": 8630 + }, + { + "epoch": 0.4986869065827826, + "grad_norm": 2.432042802537537, + "learning_rate": 5.014143046816372e-06, + "loss": 0.3849, + "step": 8640 + }, + { + "epoch": 0.4992640905024386, + "grad_norm": 14.195133545125286, + "learning_rate": 5.008370374646424e-06, + "loss": 0.3602, + "step": 8650 + }, + { + "epoch": 0.4998412744220946, + "grad_norm": 88.84290269765289, + "learning_rate": 5.002597702476477e-06, + "loss": 0.3642, + "step": 8660 + }, + { + "epoch": 0.5004184583417506, + "grad_norm": 2.935701463863382, + "learning_rate": 4.996825030306529e-06, + "loss": 0.3792, + "step": 8670 + }, + { + "epoch": 0.5009956422614066, + "grad_norm": 3.1279829631060396, + "learning_rate": 4.991052358136581e-06, + "loss": 0.3693, + "step": 8680 + }, + { + "epoch": 0.5015728261810626, + "grad_norm": 10.193114728366995, + "learning_rate": 4.985279685966634e-06, + "loss": 0.3597, + "step": 8690 + }, + { + "epoch": 0.5021500101007186, + "grad_norm": 2.4687234809350107, + "learning_rate": 4.9795070137966864e-06, + "loss": 0.3574, + "step": 8700 + }, + { + "epoch": 0.5027271940203746, + "grad_norm": 5.072249220362736, + "learning_rate": 4.9737343416267395e-06, + "loss": 0.3494, + "step": 8710 + }, + { + "epoch": 0.5033043779400306, + "grad_norm": 2.491352620261712, + "learning_rate": 4.967961669456792e-06, + "loss": 0.3567, + "step": 8720 + }, + { + "epoch": 0.5038815618596866, + "grad_norm": 4.883169609666169, + "learning_rate": 4.962188997286845e-06, + "loss": 0.3563, + "step": 8730 + }, + { + "epoch": 0.5044587457793426, + "grad_norm": 3.9469009793644623, + "learning_rate": 4.956416325116897e-06, + "loss": 0.3669, + "step": 8740 + }, + { + "epoch": 0.5050359296989986, + "grad_norm": 5.325339903718785, + "learning_rate": 4.95064365294695e-06, + "loss": 0.3479, + "step": 8750 + }, + { + "epoch": 0.5056131136186546, + "grad_norm": 2.924190713741744, + "learning_rate": 4.944870980777002e-06, + "loss": 0.3596, + "step": 8760 + }, + { + "epoch": 0.5061902975383106, + "grad_norm": 29.94785140992617, + "learning_rate": 4.939098308607055e-06, + "loss": 0.3482, + "step": 8770 + }, + { + "epoch": 0.5067674814579666, + "grad_norm": 4.600461351076627, + "learning_rate": 4.933325636437107e-06, + "loss": 0.3676, + "step": 8780 + }, + { + "epoch": 0.5073446653776226, + "grad_norm": 3.298079688278539, + "learning_rate": 4.92755296426716e-06, + "loss": 0.3849, + "step": 8790 + }, + { + "epoch": 0.5079218492972786, + "grad_norm": 3.3537368346024903, + "learning_rate": 4.921780292097212e-06, + "loss": 0.354, + "step": 8800 + }, + { + "epoch": 0.5084990332169346, + "grad_norm": 4.562710088882002, + "learning_rate": 4.9160076199272646e-06, + "loss": 0.3762, + "step": 8810 + }, + { + "epoch": 0.5090762171365906, + "grad_norm": 3.600922103480094, + "learning_rate": 4.910234947757318e-06, + "loss": 0.3709, + "step": 8820 + }, + { + "epoch": 0.5096534010562466, + "grad_norm": 5.507110655352795, + "learning_rate": 4.90446227558737e-06, + "loss": 0.35, + "step": 8830 + }, + { + "epoch": 0.5102305849759026, + "grad_norm": 14.876481920624023, + "learning_rate": 4.898689603417423e-06, + "loss": 0.3655, + "step": 8840 + }, + { + "epoch": 0.5108077688955586, + "grad_norm": 6.040943909655418, + "learning_rate": 4.892916931247475e-06, + "loss": 0.3537, + "step": 8850 + }, + { + "epoch": 0.5113849528152146, + "grad_norm": 4.784882455474531, + "learning_rate": 4.887144259077527e-06, + "loss": 0.365, + "step": 8860 + }, + { + "epoch": 0.5119621367348706, + "grad_norm": 3.5957944832471864, + "learning_rate": 4.88137158690758e-06, + "loss": 0.3649, + "step": 8870 + }, + { + "epoch": 0.5125393206545266, + "grad_norm": 7.656707006499249, + "learning_rate": 4.875598914737632e-06, + "loss": 0.3678, + "step": 8880 + }, + { + "epoch": 0.5131165045741826, + "grad_norm": 3.6610216347500666, + "learning_rate": 4.869826242567685e-06, + "loss": 0.3659, + "step": 8890 + }, + { + "epoch": 0.5136936884938386, + "grad_norm": 4.181649664719206, + "learning_rate": 4.8640535703977375e-06, + "loss": 0.358, + "step": 8900 + }, + { + "epoch": 0.5142708724134946, + "grad_norm": 1.8612925513884986, + "learning_rate": 4.8582808982277905e-06, + "loss": 0.3508, + "step": 8910 + }, + { + "epoch": 0.5148480563331506, + "grad_norm": 5.0292268546846195, + "learning_rate": 4.852508226057843e-06, + "loss": 0.3567, + "step": 8920 + }, + { + "epoch": 0.5154252402528066, + "grad_norm": 5.77083593828813, + "learning_rate": 4.846735553887895e-06, + "loss": 0.3706, + "step": 8930 + }, + { + "epoch": 0.5160024241724626, + "grad_norm": 3.1575366683166264, + "learning_rate": 4.840962881717948e-06, + "loss": 0.3603, + "step": 8940 + }, + { + "epoch": 0.5165796080921186, + "grad_norm": 3.0092615460602357, + "learning_rate": 4.835190209548e-06, + "loss": 0.3567, + "step": 8950 + }, + { + "epoch": 0.5171567920117746, + "grad_norm": 5.424773899652038, + "learning_rate": 4.829417537378053e-06, + "loss": 0.3569, + "step": 8960 + }, + { + "epoch": 0.5177339759314306, + "grad_norm": 4.0199024195308075, + "learning_rate": 4.823644865208105e-06, + "loss": 0.376, + "step": 8970 + }, + { + "epoch": 0.5183111598510866, + "grad_norm": 8.92239001104145, + "learning_rate": 4.8178721930381575e-06, + "loss": 0.3572, + "step": 8980 + }, + { + "epoch": 0.5188883437707426, + "grad_norm": 10.702196683213096, + "learning_rate": 4.8120995208682105e-06, + "loss": 0.3651, + "step": 8990 + }, + { + "epoch": 0.5194655276903986, + "grad_norm": 4.470162551927128, + "learning_rate": 4.806326848698263e-06, + "loss": 0.3654, + "step": 9000 + }, + { + "epoch": 0.5200427116100546, + "grad_norm": 2.1821708334970737, + "learning_rate": 4.800554176528316e-06, + "loss": 0.3516, + "step": 9010 + }, + { + "epoch": 0.5206198955297106, + "grad_norm": 5.347973377285275, + "learning_rate": 4.794781504358368e-06, + "loss": 0.3623, + "step": 9020 + }, + { + "epoch": 0.5211970794493666, + "grad_norm": 4.623834142826691, + "learning_rate": 4.78900883218842e-06, + "loss": 0.3727, + "step": 9030 + }, + { + "epoch": 0.5217742633690226, + "grad_norm": 5.255808460960779, + "learning_rate": 4.783236160018473e-06, + "loss": 0.3729, + "step": 9040 + }, + { + "epoch": 0.5223514472886785, + "grad_norm": 3.1254534476864215, + "learning_rate": 4.777463487848525e-06, + "loss": 0.3679, + "step": 9050 + }, + { + "epoch": 0.5229286312083345, + "grad_norm": 9.646051621259671, + "learning_rate": 4.771690815678578e-06, + "loss": 0.3834, + "step": 9060 + }, + { + "epoch": 0.5235058151279905, + "grad_norm": 4.490473334084667, + "learning_rate": 4.76591814350863e-06, + "loss": 0.3656, + "step": 9070 + }, + { + "epoch": 0.5240829990476465, + "grad_norm": 2.607385547296513, + "learning_rate": 4.760145471338683e-06, + "loss": 0.3861, + "step": 9080 + }, + { + "epoch": 0.5246601829673025, + "grad_norm": 3.7754579602738136, + "learning_rate": 4.754372799168736e-06, + "loss": 0.3506, + "step": 9090 + }, + { + "epoch": 0.5252373668869585, + "grad_norm": 2.8550942295775896, + "learning_rate": 4.748600126998788e-06, + "loss": 0.3567, + "step": 9100 + }, + { + "epoch": 0.5258145508066145, + "grad_norm": 1.7112590538881849, + "learning_rate": 4.742827454828841e-06, + "loss": 0.3604, + "step": 9110 + }, + { + "epoch": 0.5263917347262705, + "grad_norm": 2.7066239753400585, + "learning_rate": 4.737054782658893e-06, + "loss": 0.36, + "step": 9120 + }, + { + "epoch": 0.5269689186459265, + "grad_norm": 5.758530747558061, + "learning_rate": 4.731282110488946e-06, + "loss": 0.3634, + "step": 9130 + }, + { + "epoch": 0.5275461025655825, + "grad_norm": 210.77018196053547, + "learning_rate": 4.725509438318998e-06, + "loss": 0.3722, + "step": 9140 + }, + { + "epoch": 0.5281232864852385, + "grad_norm": 2.300368854843036, + "learning_rate": 4.71973676614905e-06, + "loss": 0.3729, + "step": 9150 + }, + { + "epoch": 0.5287004704048945, + "grad_norm": 2.210540304841504, + "learning_rate": 4.713964093979103e-06, + "loss": 0.3796, + "step": 9160 + }, + { + "epoch": 0.5292776543245505, + "grad_norm": 2.2634685628485145, + "learning_rate": 4.7081914218091555e-06, + "loss": 0.356, + "step": 9170 + }, + { + "epoch": 0.5298548382442065, + "grad_norm": 3.1055746156661614, + "learning_rate": 4.7024187496392085e-06, + "loss": 0.3582, + "step": 9180 + }, + { + "epoch": 0.5304320221638625, + "grad_norm": 3.654168505440511, + "learning_rate": 4.696646077469261e-06, + "loss": 0.3607, + "step": 9190 + }, + { + "epoch": 0.5310092060835185, + "grad_norm": 3.0537549885812347, + "learning_rate": 4.690873405299313e-06, + "loss": 0.3767, + "step": 9200 + }, + { + "epoch": 0.5315863900031745, + "grad_norm": 1.9675795559729068, + "learning_rate": 4.685100733129366e-06, + "loss": 0.3657, + "step": 9210 + }, + { + "epoch": 0.5321635739228305, + "grad_norm": 2.0115039115011606, + "learning_rate": 4.679328060959418e-06, + "loss": 0.3601, + "step": 9220 + }, + { + "epoch": 0.5327407578424865, + "grad_norm": 5.152045183089219, + "learning_rate": 4.673555388789471e-06, + "loss": 0.3874, + "step": 9230 + }, + { + "epoch": 0.5333179417621425, + "grad_norm": 3.394195501156254, + "learning_rate": 4.667782716619523e-06, + "loss": 0.3687, + "step": 9240 + }, + { + "epoch": 0.5338951256817985, + "grad_norm": 3.733950998673983, + "learning_rate": 4.6620100444495755e-06, + "loss": 0.3812, + "step": 9250 + }, + { + "epoch": 0.5344723096014545, + "grad_norm": 2.172335452644072, + "learning_rate": 4.6562373722796285e-06, + "loss": 0.3719, + "step": 9260 + }, + { + "epoch": 0.5350494935211105, + "grad_norm": 1.8466321499683245, + "learning_rate": 4.650464700109681e-06, + "loss": 0.3563, + "step": 9270 + }, + { + "epoch": 0.5356266774407665, + "grad_norm": 2.411259261337378, + "learning_rate": 4.644692027939734e-06, + "loss": 0.3652, + "step": 9280 + }, + { + "epoch": 0.5362038613604225, + "grad_norm": 2.525799052658632, + "learning_rate": 4.638919355769787e-06, + "loss": 0.357, + "step": 9290 + }, + { + "epoch": 0.5367810452800785, + "grad_norm": 4.322100164387683, + "learning_rate": 4.633146683599839e-06, + "loss": 0.344, + "step": 9300 + }, + { + "epoch": 0.5373582291997345, + "grad_norm": 2.9720413934133267, + "learning_rate": 4.627374011429892e-06, + "loss": 0.3591, + "step": 9310 + }, + { + "epoch": 0.5379354131193905, + "grad_norm": 3.236889177938413, + "learning_rate": 4.621601339259944e-06, + "loss": 0.3698, + "step": 9320 + }, + { + "epoch": 0.5385125970390465, + "grad_norm": 4.173966589558208, + "learning_rate": 4.615828667089996e-06, + "loss": 0.3687, + "step": 9330 + }, + { + "epoch": 0.5390897809587025, + "grad_norm": 3.3050072649704387, + "learning_rate": 4.610055994920049e-06, + "loss": 0.3559, + "step": 9340 + }, + { + "epoch": 0.5396669648783585, + "grad_norm": 2.2463012820732904, + "learning_rate": 4.6042833227501014e-06, + "loss": 0.3585, + "step": 9350 + }, + { + "epoch": 0.5402441487980145, + "grad_norm": 2.3061353512132357, + "learning_rate": 4.5985106505801544e-06, + "loss": 0.3666, + "step": 9360 + }, + { + "epoch": 0.5408213327176705, + "grad_norm": 2.8767755309606393, + "learning_rate": 4.592737978410207e-06, + "loss": 0.3656, + "step": 9370 + }, + { + "epoch": 0.5413985166373265, + "grad_norm": 3.5885590608959603, + "learning_rate": 4.586965306240259e-06, + "loss": 0.3619, + "step": 9380 + }, + { + "epoch": 0.5419757005569825, + "grad_norm": 3.012510637436092, + "learning_rate": 4.581192634070312e-06, + "loss": 0.3679, + "step": 9390 + }, + { + "epoch": 0.5425528844766385, + "grad_norm": 2.074304355176205, + "learning_rate": 4.575419961900364e-06, + "loss": 0.3558, + "step": 9400 + }, + { + "epoch": 0.5431300683962945, + "grad_norm": 2.410649696384616, + "learning_rate": 4.569647289730417e-06, + "loss": 0.3667, + "step": 9410 + }, + { + "epoch": 0.5437072523159505, + "grad_norm": 6.990964309593162, + "learning_rate": 4.563874617560469e-06, + "loss": 0.3544, + "step": 9420 + }, + { + "epoch": 0.5442844362356065, + "grad_norm": 1.6679890672242221, + "learning_rate": 4.558101945390521e-06, + "loss": 0.3635, + "step": 9430 + }, + { + "epoch": 0.5448616201552625, + "grad_norm": 3.5900096135866177, + "learning_rate": 4.552329273220574e-06, + "loss": 0.3548, + "step": 9440 + }, + { + "epoch": 0.5454388040749185, + "grad_norm": 2.8054899052225655, + "learning_rate": 4.5465566010506266e-06, + "loss": 0.3573, + "step": 9450 + }, + { + "epoch": 0.5460159879945745, + "grad_norm": 2.7792687957236315, + "learning_rate": 4.5407839288806796e-06, + "loss": 0.3677, + "step": 9460 + }, + { + "epoch": 0.5465931719142305, + "grad_norm": 2.2614901204437636, + "learning_rate": 4.535011256710732e-06, + "loss": 0.3643, + "step": 9470 + }, + { + "epoch": 0.5471703558338865, + "grad_norm": 2.062841207589413, + "learning_rate": 4.529238584540785e-06, + "loss": 0.3592, + "step": 9480 + }, + { + "epoch": 0.5477475397535425, + "grad_norm": 8.866919074207425, + "learning_rate": 4.523465912370837e-06, + "loss": 0.3552, + "step": 9490 + }, + { + "epoch": 0.5483247236731985, + "grad_norm": 2.5645835670129618, + "learning_rate": 4.517693240200889e-06, + "loss": 0.3498, + "step": 9500 + }, + { + "epoch": 0.5489019075928545, + "grad_norm": 3.1741970680335716, + "learning_rate": 4.511920568030942e-06, + "loss": 0.3682, + "step": 9510 + }, + { + "epoch": 0.5494790915125105, + "grad_norm": 2.0747984580427765, + "learning_rate": 4.506147895860994e-06, + "loss": 0.3646, + "step": 9520 + }, + { + "epoch": 0.5500562754321665, + "grad_norm": 2.3586125635742654, + "learning_rate": 4.500375223691047e-06, + "loss": 0.355, + "step": 9530 + }, + { + "epoch": 0.5506334593518225, + "grad_norm": 3.8182790115085927, + "learning_rate": 4.4946025515210995e-06, + "loss": 0.3528, + "step": 9540 + }, + { + "epoch": 0.5512106432714785, + "grad_norm": 2.623243257421812, + "learning_rate": 4.488829879351152e-06, + "loss": 0.3551, + "step": 9550 + }, + { + "epoch": 0.5517878271911345, + "grad_norm": 2.775469371839904, + "learning_rate": 4.483057207181205e-06, + "loss": 0.3556, + "step": 9560 + }, + { + "epoch": 0.5523650111107905, + "grad_norm": 2.345529859871698, + "learning_rate": 4.477284535011257e-06, + "loss": 0.3702, + "step": 9570 + }, + { + "epoch": 0.5529421950304465, + "grad_norm": 5.265133567547254, + "learning_rate": 4.47151186284131e-06, + "loss": 0.3617, + "step": 9580 + }, + { + "epoch": 0.5535193789501025, + "grad_norm": 3.1602517841252897, + "learning_rate": 4.465739190671362e-06, + "loss": 0.3524, + "step": 9590 + }, + { + "epoch": 0.5540965628697585, + "grad_norm": 1.8906279954560556, + "learning_rate": 4.459966518501414e-06, + "loss": 0.3584, + "step": 9600 + }, + { + "epoch": 0.5546737467894145, + "grad_norm": 3.4743194872868117, + "learning_rate": 4.454193846331467e-06, + "loss": 0.3531, + "step": 9610 + }, + { + "epoch": 0.5552509307090705, + "grad_norm": 2.9951619722989578, + "learning_rate": 4.4484211741615194e-06, + "loss": 0.3525, + "step": 9620 + }, + { + "epoch": 0.5558281146287264, + "grad_norm": 5.694532155563467, + "learning_rate": 4.4426485019915725e-06, + "loss": 0.3568, + "step": 9630 + }, + { + "epoch": 0.5564052985483824, + "grad_norm": 2.0095159465987398, + "learning_rate": 4.436875829821625e-06, + "loss": 0.3586, + "step": 9640 + }, + { + "epoch": 0.5569824824680384, + "grad_norm": 2.703106746299796, + "learning_rate": 4.431103157651677e-06, + "loss": 0.3586, + "step": 9650 + }, + { + "epoch": 0.5575596663876944, + "grad_norm": 4.366070322479832, + "learning_rate": 4.42533048548173e-06, + "loss": 0.3628, + "step": 9660 + }, + { + "epoch": 0.5581368503073504, + "grad_norm": 5.959735845454689, + "learning_rate": 4.419557813311782e-06, + "loss": 0.3453, + "step": 9670 + }, + { + "epoch": 0.5587140342270064, + "grad_norm": 3.887262505871754, + "learning_rate": 4.413785141141835e-06, + "loss": 0.3579, + "step": 9680 + }, + { + "epoch": 0.5592912181466624, + "grad_norm": 3.5847585820260273, + "learning_rate": 4.408012468971887e-06, + "loss": 0.353, + "step": 9690 + }, + { + "epoch": 0.5598684020663184, + "grad_norm": 11.183076093226834, + "learning_rate": 4.40223979680194e-06, + "loss": 0.354, + "step": 9700 + }, + { + "epoch": 0.5604455859859744, + "grad_norm": 8.751616032322445, + "learning_rate": 4.396467124631992e-06, + "loss": 0.358, + "step": 9710 + }, + { + "epoch": 0.5610227699056304, + "grad_norm": 8.759188394381018, + "learning_rate": 4.3906944524620446e-06, + "loss": 0.3477, + "step": 9720 + }, + { + "epoch": 0.5615999538252864, + "grad_norm": 3.185742989121124, + "learning_rate": 4.3849217802920976e-06, + "loss": 0.3527, + "step": 9730 + }, + { + "epoch": 0.5621771377449424, + "grad_norm": 2.6034839783878847, + "learning_rate": 4.37914910812215e-06, + "loss": 0.3614, + "step": 9740 + }, + { + "epoch": 0.5627543216645984, + "grad_norm": 3.8176039303217943, + "learning_rate": 4.373376435952203e-06, + "loss": 0.3629, + "step": 9750 + }, + { + "epoch": 0.5633315055842544, + "grad_norm": 2.892391389713988, + "learning_rate": 4.367603763782255e-06, + "loss": 0.3543, + "step": 9760 + }, + { + "epoch": 0.5639086895039104, + "grad_norm": 3.756297203577958, + "learning_rate": 4.361831091612307e-06, + "loss": 0.3618, + "step": 9770 + }, + { + "epoch": 0.5644858734235664, + "grad_norm": 5.678261576873807, + "learning_rate": 4.35605841944236e-06, + "loss": 0.3568, + "step": 9780 + }, + { + "epoch": 0.5650630573432224, + "grad_norm": 8.441638312518547, + "learning_rate": 4.350285747272412e-06, + "loss": 0.3729, + "step": 9790 + }, + { + "epoch": 0.5656402412628784, + "grad_norm": 11.963283236920255, + "learning_rate": 4.344513075102465e-06, + "loss": 0.3515, + "step": 9800 + }, + { + "epoch": 0.5662174251825344, + "grad_norm": 5.191809552146641, + "learning_rate": 4.338740402932518e-06, + "loss": 0.3555, + "step": 9810 + }, + { + "epoch": 0.5667946091021904, + "grad_norm": 4.402544511071244, + "learning_rate": 4.3329677307625705e-06, + "loss": 0.3593, + "step": 9820 + }, + { + "epoch": 0.5673717930218464, + "grad_norm": 3.877667010250195, + "learning_rate": 4.327195058592623e-06, + "loss": 0.3671, + "step": 9830 + }, + { + "epoch": 0.5679489769415024, + "grad_norm": 3.6190457236390907, + "learning_rate": 4.321422386422676e-06, + "loss": 0.3821, + "step": 9840 + }, + { + "epoch": 0.5685261608611584, + "grad_norm": 6.877296061159591, + "learning_rate": 4.315649714252728e-06, + "loss": 0.3603, + "step": 9850 + }, + { + "epoch": 0.5691033447808144, + "grad_norm": 4.590765318427339, + "learning_rate": 4.309877042082781e-06, + "loss": 0.3713, + "step": 9860 + }, + { + "epoch": 0.5696805287004704, + "grad_norm": 4.474831184892119, + "learning_rate": 4.304104369912833e-06, + "loss": 0.3475, + "step": 9870 + }, + { + "epoch": 0.5702577126201264, + "grad_norm": 6.348172468881283, + "learning_rate": 4.298331697742886e-06, + "loss": 0.3498, + "step": 9880 + }, + { + "epoch": 0.5708348965397824, + "grad_norm": 3.2161248922526906, + "learning_rate": 4.292559025572938e-06, + "loss": 0.3548, + "step": 9890 + }, + { + "epoch": 0.5714120804594384, + "grad_norm": 3.6712025528595476, + "learning_rate": 4.2867863534029905e-06, + "loss": 0.3698, + "step": 9900 + }, + { + "epoch": 0.5719892643790944, + "grad_norm": 6.372020331564792, + "learning_rate": 4.2810136812330435e-06, + "loss": 0.3774, + "step": 9910 + }, + { + "epoch": 0.5725664482987504, + "grad_norm": 7.0760072190976055, + "learning_rate": 4.275241009063096e-06, + "loss": 0.366, + "step": 9920 + }, + { + "epoch": 0.5731436322184064, + "grad_norm": 4.330191932801956, + "learning_rate": 4.269468336893149e-06, + "loss": 0.3659, + "step": 9930 + }, + { + "epoch": 0.5737208161380624, + "grad_norm": 3.7388442828506183, + "learning_rate": 4.263695664723201e-06, + "loss": 0.3585, + "step": 9940 + }, + { + "epoch": 0.5742980000577184, + "grad_norm": 3.788666515258982, + "learning_rate": 4.257922992553253e-06, + "loss": 0.3624, + "step": 9950 + }, + { + "epoch": 0.5748751839773744, + "grad_norm": 3.882574363808373, + "learning_rate": 4.252150320383306e-06, + "loss": 0.359, + "step": 9960 + }, + { + "epoch": 0.5754523678970304, + "grad_norm": 3.3860959596594764, + "learning_rate": 4.246377648213358e-06, + "loss": 0.3753, + "step": 9970 + }, + { + "epoch": 0.5760295518166864, + "grad_norm": 3.0843751033026936, + "learning_rate": 4.240604976043411e-06, + "loss": 0.3623, + "step": 9980 + }, + { + "epoch": 0.5766067357363424, + "grad_norm": 4.2670578544960165, + "learning_rate": 4.234832303873463e-06, + "loss": 0.3583, + "step": 9990 + }, + { + "epoch": 0.5771839196559984, + "grad_norm": 4.252817625823887, + "learning_rate": 4.229059631703516e-06, + "loss": 0.3576, + "step": 10000 + }, + { + "epoch": 0.5777611035756544, + "grad_norm": 3.407625305823021, + "learning_rate": 4.223286959533569e-06, + "loss": 0.3552, + "step": 10010 + }, + { + "epoch": 0.5783382874953104, + "grad_norm": 3.1679099104052484, + "learning_rate": 4.217514287363621e-06, + "loss": 0.3738, + "step": 10020 + }, + { + "epoch": 0.5789154714149664, + "grad_norm": 3.6787102703539443, + "learning_rate": 4.211741615193674e-06, + "loss": 0.3578, + "step": 10030 + }, + { + "epoch": 0.5794926553346224, + "grad_norm": 5.851925555703329, + "learning_rate": 4.205968943023726e-06, + "loss": 0.3419, + "step": 10040 + }, + { + "epoch": 0.5800698392542784, + "grad_norm": 4.329132073061233, + "learning_rate": 4.200196270853779e-06, + "loss": 0.3643, + "step": 10050 + }, + { + "epoch": 0.5806470231739344, + "grad_norm": 5.997643376925449, + "learning_rate": 4.194423598683831e-06, + "loss": 0.377, + "step": 10060 + }, + { + "epoch": 0.5812242070935904, + "grad_norm": 2.847934197964713, + "learning_rate": 4.188650926513883e-06, + "loss": 0.3637, + "step": 10070 + }, + { + "epoch": 0.5818013910132463, + "grad_norm": 5.914457578502053, + "learning_rate": 4.182878254343936e-06, + "loss": 0.3674, + "step": 10080 + }, + { + "epoch": 0.5823785749329023, + "grad_norm": 2.717688952876076, + "learning_rate": 4.1771055821739885e-06, + "loss": 0.3445, + "step": 10090 + }, + { + "epoch": 0.5829557588525583, + "grad_norm": 4.506246357458738, + "learning_rate": 4.1713329100040415e-06, + "loss": 0.3596, + "step": 10100 + }, + { + "epoch": 0.5835329427722143, + "grad_norm": 4.390097302175211, + "learning_rate": 4.165560237834094e-06, + "loss": 0.3502, + "step": 10110 + }, + { + "epoch": 0.5841101266918703, + "grad_norm": 61.470288646964526, + "learning_rate": 4.159787565664146e-06, + "loss": 0.364, + "step": 10120 + }, + { + "epoch": 0.5846873106115263, + "grad_norm": 4.249154244474576, + "learning_rate": 4.154014893494199e-06, + "loss": 0.3494, + "step": 10130 + }, + { + "epoch": 0.5852644945311823, + "grad_norm": 2.6108964866695956, + "learning_rate": 4.148242221324251e-06, + "loss": 0.3417, + "step": 10140 + }, + { + "epoch": 0.5858416784508383, + "grad_norm": 3.655089863468255, + "learning_rate": 4.142469549154304e-06, + "loss": 0.3576, + "step": 10150 + }, + { + "epoch": 0.5864188623704943, + "grad_norm": 19.653637032520724, + "learning_rate": 4.136696876984356e-06, + "loss": 0.3603, + "step": 10160 + }, + { + "epoch": 0.5869960462901503, + "grad_norm": 4.623567506823282, + "learning_rate": 4.1309242048144085e-06, + "loss": 0.3486, + "step": 10170 + }, + { + "epoch": 0.5875732302098063, + "grad_norm": 5.14547645262892, + "learning_rate": 4.1251515326444615e-06, + "loss": 0.3591, + "step": 10180 + }, + { + "epoch": 0.5881504141294623, + "grad_norm": 20.59966690800242, + "learning_rate": 4.119378860474514e-06, + "loss": 0.3473, + "step": 10190 + }, + { + "epoch": 0.5887275980491183, + "grad_norm": 3.3637726076066796, + "learning_rate": 4.113606188304567e-06, + "loss": 0.3706, + "step": 10200 + }, + { + "epoch": 0.5893047819687743, + "grad_norm": 4.061688985881421, + "learning_rate": 4.107833516134619e-06, + "loss": 0.3694, + "step": 10210 + }, + { + "epoch": 0.5898819658884303, + "grad_norm": 4.941411464322626, + "learning_rate": 4.102060843964671e-06, + "loss": 0.3551, + "step": 10220 + }, + { + "epoch": 0.5904591498080863, + "grad_norm": 4.631050291252514, + "learning_rate": 4.096288171794724e-06, + "loss": 0.3669, + "step": 10230 + }, + { + "epoch": 0.5910363337277423, + "grad_norm": 5.833635533863073, + "learning_rate": 4.090515499624776e-06, + "loss": 0.354, + "step": 10240 + }, + { + "epoch": 0.5916135176473983, + "grad_norm": 17.060908158433886, + "learning_rate": 4.084742827454829e-06, + "loss": 0.355, + "step": 10250 + }, + { + "epoch": 0.5921907015670543, + "grad_norm": 4.438473043902829, + "learning_rate": 4.078970155284881e-06, + "loss": 0.3559, + "step": 10260 + }, + { + "epoch": 0.5927678854867103, + "grad_norm": 4.029441903808598, + "learning_rate": 4.0731974831149344e-06, + "loss": 0.3456, + "step": 10270 + }, + { + "epoch": 0.5933450694063663, + "grad_norm": 4.894018752470052, + "learning_rate": 4.067424810944987e-06, + "loss": 0.342, + "step": 10280 + }, + { + "epoch": 0.5939222533260223, + "grad_norm": 2.403763740670601, + "learning_rate": 4.061652138775039e-06, + "loss": 0.3653, + "step": 10290 + }, + { + "epoch": 0.5944994372456783, + "grad_norm": 4.283731997785048, + "learning_rate": 4.055879466605092e-06, + "loss": 0.3668, + "step": 10300 + }, + { + "epoch": 0.5950766211653343, + "grad_norm": 3.021670467218982, + "learning_rate": 4.050106794435144e-06, + "loss": 0.3554, + "step": 10310 + }, + { + "epoch": 0.5956538050849903, + "grad_norm": 39.77509257463927, + "learning_rate": 4.044334122265197e-06, + "loss": 0.373, + "step": 10320 + }, + { + "epoch": 0.5962309890046463, + "grad_norm": 7.734517576459877, + "learning_rate": 4.03856145009525e-06, + "loss": 0.3545, + "step": 10330 + }, + { + "epoch": 0.5968081729243023, + "grad_norm": 12.164334264143207, + "learning_rate": 4.032788777925302e-06, + "loss": 0.3593, + "step": 10340 + }, + { + "epoch": 0.5973853568439583, + "grad_norm": 6.962481288991228, + "learning_rate": 4.027016105755354e-06, + "loss": 0.3424, + "step": 10350 + }, + { + "epoch": 0.5979625407636143, + "grad_norm": 3.265508104782265, + "learning_rate": 4.021243433585407e-06, + "loss": 0.3401, + "step": 10360 + }, + { + "epoch": 0.5985397246832703, + "grad_norm": 3.169522843364254, + "learning_rate": 4.0154707614154596e-06, + "loss": 0.3534, + "step": 10370 + }, + { + "epoch": 0.5991169086029263, + "grad_norm": 3.2365860904520662, + "learning_rate": 4.0096980892455126e-06, + "loss": 0.3476, + "step": 10380 + }, + { + "epoch": 0.5996940925225823, + "grad_norm": 22.34179305900222, + "learning_rate": 4.003925417075565e-06, + "loss": 0.3615, + "step": 10390 + }, + { + "epoch": 0.6002712764422383, + "grad_norm": 3.6824711768745235, + "learning_rate": 3.998152744905617e-06, + "loss": 0.3598, + "step": 10400 + }, + { + "epoch": 0.6008484603618943, + "grad_norm": 2.244270658998301, + "learning_rate": 3.99238007273567e-06, + "loss": 0.3602, + "step": 10410 + }, + { + "epoch": 0.6014256442815503, + "grad_norm": 2.944264669766013, + "learning_rate": 3.986607400565722e-06, + "loss": 0.3557, + "step": 10420 + }, + { + "epoch": 0.6020028282012063, + "grad_norm": 3.6813564246612893, + "learning_rate": 3.980834728395775e-06, + "loss": 0.3426, + "step": 10430 + }, + { + "epoch": 0.6025800121208623, + "grad_norm": 2.4642472523118193, + "learning_rate": 3.975062056225827e-06, + "loss": 0.3498, + "step": 10440 + }, + { + "epoch": 0.6031571960405183, + "grad_norm": 2.612111770933025, + "learning_rate": 3.96928938405588e-06, + "loss": 0.3738, + "step": 10450 + }, + { + "epoch": 0.6037343799601743, + "grad_norm": 4.989290119921459, + "learning_rate": 3.9635167118859325e-06, + "loss": 0.3437, + "step": 10460 + }, + { + "epoch": 0.6043115638798303, + "grad_norm": 3.191111659552641, + "learning_rate": 3.957744039715985e-06, + "loss": 0.3513, + "step": 10470 + }, + { + "epoch": 0.6048887477994863, + "grad_norm": 6.289514020679802, + "learning_rate": 3.951971367546038e-06, + "loss": 0.3617, + "step": 10480 + }, + { + "epoch": 0.6054659317191423, + "grad_norm": 4.369159045847553, + "learning_rate": 3.94619869537609e-06, + "loss": 0.3622, + "step": 10490 + }, + { + "epoch": 0.6060431156387983, + "grad_norm": 4.272747590300094, + "learning_rate": 3.940426023206143e-06, + "loss": 0.3679, + "step": 10500 + }, + { + "epoch": 0.6066202995584543, + "grad_norm": 6.298266009612924, + "learning_rate": 3.934653351036195e-06, + "loss": 0.3605, + "step": 10510 + }, + { + "epoch": 0.6071974834781103, + "grad_norm": 2.853912711053667, + "learning_rate": 3.928880678866247e-06, + "loss": 0.3643, + "step": 10520 + }, + { + "epoch": 0.6077746673977663, + "grad_norm": 4.905191894605176, + "learning_rate": 3.9231080066963e-06, + "loss": 0.3566, + "step": 10530 + }, + { + "epoch": 0.6083518513174223, + "grad_norm": 3.7179571361092307, + "learning_rate": 3.9173353345263524e-06, + "loss": 0.3534, + "step": 10540 + }, + { + "epoch": 0.6089290352370783, + "grad_norm": 4.720816418264325, + "learning_rate": 3.9115626623564055e-06, + "loss": 0.3445, + "step": 10550 + }, + { + "epoch": 0.6095062191567343, + "grad_norm": 6.0994643547541045, + "learning_rate": 3.905789990186458e-06, + "loss": 0.3447, + "step": 10560 + }, + { + "epoch": 0.6100834030763903, + "grad_norm": 7.143461522640564, + "learning_rate": 3.90001731801651e-06, + "loss": 0.3486, + "step": 10570 + }, + { + "epoch": 0.6106605869960463, + "grad_norm": 3.5865672738484515, + "learning_rate": 3.894244645846563e-06, + "loss": 0.3543, + "step": 10580 + }, + { + "epoch": 0.6112377709157023, + "grad_norm": 3.44671753994167, + "learning_rate": 3.888471973676615e-06, + "loss": 0.339, + "step": 10590 + }, + { + "epoch": 0.6118149548353583, + "grad_norm": 4.037111129069171, + "learning_rate": 3.882699301506668e-06, + "loss": 0.3542, + "step": 10600 + }, + { + "epoch": 0.6123921387550143, + "grad_norm": 2.5068462700876752, + "learning_rate": 3.87692662933672e-06, + "loss": 0.3612, + "step": 10610 + }, + { + "epoch": 0.6129693226746703, + "grad_norm": 2.69916896955261, + "learning_rate": 3.871153957166772e-06, + "loss": 0.3552, + "step": 10620 + }, + { + "epoch": 0.6135465065943263, + "grad_norm": 2.12828690128291, + "learning_rate": 3.865381284996825e-06, + "loss": 0.3464, + "step": 10630 + }, + { + "epoch": 0.6141236905139823, + "grad_norm": 2.4651478648163754, + "learning_rate": 3.8596086128268776e-06, + "loss": 0.3714, + "step": 10640 + }, + { + "epoch": 0.6147008744336383, + "grad_norm": 4.709377859928187, + "learning_rate": 3.853835940656931e-06, + "loss": 0.3462, + "step": 10650 + }, + { + "epoch": 0.6152780583532943, + "grad_norm": 14.878118519317356, + "learning_rate": 3.848063268486983e-06, + "loss": 0.355, + "step": 10660 + }, + { + "epoch": 0.6158552422729503, + "grad_norm": 7.0013585156765314, + "learning_rate": 3.842290596317036e-06, + "loss": 0.3548, + "step": 10670 + }, + { + "epoch": 0.6164324261926063, + "grad_norm": 4.781645383408167, + "learning_rate": 3.836517924147088e-06, + "loss": 0.3544, + "step": 10680 + }, + { + "epoch": 0.6170096101122623, + "grad_norm": 2.5404756093298695, + "learning_rate": 3.83074525197714e-06, + "loss": 0.3534, + "step": 10690 + }, + { + "epoch": 0.6175867940319183, + "grad_norm": 8.95780042415011, + "learning_rate": 3.824972579807193e-06, + "loss": 0.3636, + "step": 10700 + }, + { + "epoch": 0.6181639779515743, + "grad_norm": 4.989641662422552, + "learning_rate": 3.819199907637245e-06, + "loss": 0.3673, + "step": 10710 + }, + { + "epoch": 0.6187411618712303, + "grad_norm": 3.0860360298250096, + "learning_rate": 3.813427235467298e-06, + "loss": 0.3454, + "step": 10720 + }, + { + "epoch": 0.6193183457908863, + "grad_norm": 5.75583484430528, + "learning_rate": 3.8076545632973505e-06, + "loss": 0.3598, + "step": 10730 + }, + { + "epoch": 0.6198955297105423, + "grad_norm": 2.949832246732552, + "learning_rate": 3.801881891127403e-06, + "loss": 0.3553, + "step": 10740 + }, + { + "epoch": 0.6204727136301983, + "grad_norm": 5.670166660495844, + "learning_rate": 3.7961092189574557e-06, + "loss": 0.3626, + "step": 10750 + }, + { + "epoch": 0.6210498975498543, + "grad_norm": 3.972108943307402, + "learning_rate": 3.790336546787508e-06, + "loss": 0.3473, + "step": 10760 + }, + { + "epoch": 0.6216270814695103, + "grad_norm": 6.588272622863319, + "learning_rate": 3.7845638746175605e-06, + "loss": 0.3504, + "step": 10770 + }, + { + "epoch": 0.6222042653891663, + "grad_norm": 10.062313562042537, + "learning_rate": 3.778791202447613e-06, + "loss": 0.348, + "step": 10780 + }, + { + "epoch": 0.6227814493088223, + "grad_norm": 2.393822027910724, + "learning_rate": 3.7730185302776657e-06, + "loss": 0.332, + "step": 10790 + }, + { + "epoch": 0.6233586332284783, + "grad_norm": 3.726451963447983, + "learning_rate": 3.7672458581077183e-06, + "loss": 0.3544, + "step": 10800 + }, + { + "epoch": 0.6239358171481343, + "grad_norm": 3.0713208436951405, + "learning_rate": 3.761473185937771e-06, + "loss": 0.3569, + "step": 10810 + }, + { + "epoch": 0.6245130010677903, + "grad_norm": 8.506793083475245, + "learning_rate": 3.755700513767823e-06, + "loss": 0.3498, + "step": 10820 + }, + { + "epoch": 0.6250901849874463, + "grad_norm": 5.0436471202946205, + "learning_rate": 3.7499278415978756e-06, + "loss": 0.3539, + "step": 10830 + }, + { + "epoch": 0.6256673689071023, + "grad_norm": 3.0241635545445957, + "learning_rate": 3.7441551694279282e-06, + "loss": 0.3623, + "step": 10840 + }, + { + "epoch": 0.6262445528267583, + "grad_norm": 2.4939286036772703, + "learning_rate": 3.7383824972579812e-06, + "loss": 0.3578, + "step": 10850 + }, + { + "epoch": 0.6268217367464143, + "grad_norm": 3.3678230813106373, + "learning_rate": 3.732609825088034e-06, + "loss": 0.3568, + "step": 10860 + }, + { + "epoch": 0.6273989206660703, + "grad_norm": 3.1630251340425795, + "learning_rate": 3.7268371529180864e-06, + "loss": 0.3488, + "step": 10870 + }, + { + "epoch": 0.6279761045857263, + "grad_norm": 4.715668356612278, + "learning_rate": 3.721064480748139e-06, + "loss": 0.3487, + "step": 10880 + }, + { + "epoch": 0.6285532885053823, + "grad_norm": 3.430089287433023, + "learning_rate": 3.7152918085781912e-06, + "loss": 0.3676, + "step": 10890 + }, + { + "epoch": 0.6291304724250383, + "grad_norm": 2.9339747157831546, + "learning_rate": 3.709519136408244e-06, + "loss": 0.3507, + "step": 10900 + }, + { + "epoch": 0.6297076563446943, + "grad_norm": 4.2204605393920485, + "learning_rate": 3.7037464642382964e-06, + "loss": 0.3446, + "step": 10910 + }, + { + "epoch": 0.6302848402643503, + "grad_norm": 2.5323029057405764, + "learning_rate": 3.697973792068349e-06, + "loss": 0.3507, + "step": 10920 + }, + { + "epoch": 0.6308620241840063, + "grad_norm": 1.959685524861653, + "learning_rate": 3.6922011198984016e-06, + "loss": 0.3551, + "step": 10930 + }, + { + "epoch": 0.6314392081036623, + "grad_norm": 2.2201928530131085, + "learning_rate": 3.6864284477284538e-06, + "loss": 0.3476, + "step": 10940 + }, + { + "epoch": 0.6320163920233183, + "grad_norm": 2.5294366254069645, + "learning_rate": 3.6806557755585064e-06, + "loss": 0.3502, + "step": 10950 + }, + { + "epoch": 0.6325935759429743, + "grad_norm": 2.5929823326561103, + "learning_rate": 3.674883103388559e-06, + "loss": 0.3477, + "step": 10960 + }, + { + "epoch": 0.6331707598626303, + "grad_norm": 3.0643397308226903, + "learning_rate": 3.6691104312186116e-06, + "loss": 0.3511, + "step": 10970 + }, + { + "epoch": 0.6337479437822863, + "grad_norm": 3.725143554468828, + "learning_rate": 3.663337759048664e-06, + "loss": 0.3597, + "step": 10980 + }, + { + "epoch": 0.6343251277019423, + "grad_norm": 2.332988363561149, + "learning_rate": 3.6575650868787168e-06, + "loss": 0.3547, + "step": 10990 + }, + { + "epoch": 0.6349023116215983, + "grad_norm": 4.338506151135665, + "learning_rate": 3.651792414708769e-06, + "loss": 0.3621, + "step": 11000 + }, + { + "epoch": 0.6354794955412543, + "grad_norm": 5.853920773449472, + "learning_rate": 3.6460197425388215e-06, + "loss": 0.3467, + "step": 11010 + }, + { + "epoch": 0.6360566794609103, + "grad_norm": 2.9801395957721217, + "learning_rate": 3.640247070368874e-06, + "loss": 0.3533, + "step": 11020 + }, + { + "epoch": 0.6366338633805663, + "grad_norm": 5.428993197624115, + "learning_rate": 3.6344743981989267e-06, + "loss": 0.3477, + "step": 11030 + }, + { + "epoch": 0.6372110473002223, + "grad_norm": 2.1575911965605914, + "learning_rate": 3.6287017260289793e-06, + "loss": 0.3463, + "step": 11040 + }, + { + "epoch": 0.6377882312198783, + "grad_norm": 3.3210877709918982, + "learning_rate": 3.622929053859032e-06, + "loss": 0.3546, + "step": 11050 + }, + { + "epoch": 0.6383654151395343, + "grad_norm": 2.686843207231148, + "learning_rate": 3.617156381689084e-06, + "loss": 0.3518, + "step": 11060 + }, + { + "epoch": 0.6389425990591903, + "grad_norm": 5.280345153851947, + "learning_rate": 3.6113837095191367e-06, + "loss": 0.3579, + "step": 11070 + }, + { + "epoch": 0.6395197829788463, + "grad_norm": 5.403871542937742, + "learning_rate": 3.6056110373491893e-06, + "loss": 0.3489, + "step": 11080 + }, + { + "epoch": 0.6400969668985023, + "grad_norm": 2.9735701887326833, + "learning_rate": 3.599838365179242e-06, + "loss": 0.3502, + "step": 11090 + }, + { + "epoch": 0.6406741508181583, + "grad_norm": 2.6891252705595368, + "learning_rate": 3.5940656930092945e-06, + "loss": 0.3612, + "step": 11100 + }, + { + "epoch": 0.6412513347378141, + "grad_norm": 4.660072834904341, + "learning_rate": 3.5882930208393467e-06, + "loss": 0.3408, + "step": 11110 + }, + { + "epoch": 0.6418285186574701, + "grad_norm": 8.181225664129359, + "learning_rate": 3.5825203486693993e-06, + "loss": 0.3466, + "step": 11120 + }, + { + "epoch": 0.6424057025771261, + "grad_norm": 3.8250794191372943, + "learning_rate": 3.576747676499452e-06, + "loss": 0.3428, + "step": 11130 + }, + { + "epoch": 0.6429828864967821, + "grad_norm": 2.5770691997974975, + "learning_rate": 3.5709750043295044e-06, + "loss": 0.3617, + "step": 11140 + }, + { + "epoch": 0.6435600704164381, + "grad_norm": 3.518147076569533, + "learning_rate": 3.565202332159557e-06, + "loss": 0.3598, + "step": 11150 + }, + { + "epoch": 0.6441372543360941, + "grad_norm": 2.3978173360258332, + "learning_rate": 3.5594296599896096e-06, + "loss": 0.3486, + "step": 11160 + }, + { + "epoch": 0.6447144382557501, + "grad_norm": 4.963521259349147, + "learning_rate": 3.553656987819662e-06, + "loss": 0.3465, + "step": 11170 + }, + { + "epoch": 0.6452916221754061, + "grad_norm": 3.7768523495827, + "learning_rate": 3.5478843156497144e-06, + "loss": 0.3507, + "step": 11180 + }, + { + "epoch": 0.6458688060950621, + "grad_norm": 1.9990854220435814, + "learning_rate": 3.542111643479767e-06, + "loss": 0.3485, + "step": 11190 + }, + { + "epoch": 0.6464459900147181, + "grad_norm": 2.434785484655442, + "learning_rate": 3.5363389713098196e-06, + "loss": 0.3414, + "step": 11200 + }, + { + "epoch": 0.6470231739343741, + "grad_norm": 3.764273559187499, + "learning_rate": 3.530566299139872e-06, + "loss": 0.3475, + "step": 11210 + }, + { + "epoch": 0.6476003578540301, + "grad_norm": 3.063611287505477, + "learning_rate": 3.5247936269699244e-06, + "loss": 0.3487, + "step": 11220 + }, + { + "epoch": 0.6481775417736861, + "grad_norm": 2.7961532657594357, + "learning_rate": 3.519020954799977e-06, + "loss": 0.3591, + "step": 11230 + }, + { + "epoch": 0.6487547256933421, + "grad_norm": 5.675631273424128, + "learning_rate": 3.5132482826300296e-06, + "loss": 0.3387, + "step": 11240 + }, + { + "epoch": 0.6493319096129981, + "grad_norm": 4.038281786465871, + "learning_rate": 3.507475610460082e-06, + "loss": 0.3544, + "step": 11250 + }, + { + "epoch": 0.6499090935326541, + "grad_norm": 19.461560362822837, + "learning_rate": 3.5017029382901348e-06, + "loss": 0.3612, + "step": 11260 + }, + { + "epoch": 0.6504862774523101, + "grad_norm": 2.9170938007747838, + "learning_rate": 3.4959302661201874e-06, + "loss": 0.3633, + "step": 11270 + }, + { + "epoch": 0.6510634613719661, + "grad_norm": 2.100392402638713, + "learning_rate": 3.4901575939502395e-06, + "loss": 0.3628, + "step": 11280 + }, + { + "epoch": 0.6516406452916221, + "grad_norm": 7.466900360838518, + "learning_rate": 3.484384921780292e-06, + "loss": 0.3544, + "step": 11290 + }, + { + "epoch": 0.6522178292112781, + "grad_norm": 2.3522582138412984, + "learning_rate": 3.4786122496103447e-06, + "loss": 0.3768, + "step": 11300 + }, + { + "epoch": 0.6527950131309341, + "grad_norm": 2.0677131586462556, + "learning_rate": 3.4728395774403973e-06, + "loss": 0.3514, + "step": 11310 + }, + { + "epoch": 0.6533721970505901, + "grad_norm": 3.1804108497752943, + "learning_rate": 3.46706690527045e-06, + "loss": 0.3506, + "step": 11320 + }, + { + "epoch": 0.6539493809702461, + "grad_norm": 5.396390003664786, + "learning_rate": 3.461294233100502e-06, + "loss": 0.3539, + "step": 11330 + }, + { + "epoch": 0.6545265648899021, + "grad_norm": 5.1304776342645235, + "learning_rate": 3.4555215609305547e-06, + "loss": 0.3578, + "step": 11340 + }, + { + "epoch": 0.6551037488095581, + "grad_norm": 2.625555244563686, + "learning_rate": 3.4497488887606073e-06, + "loss": 0.3344, + "step": 11350 + }, + { + "epoch": 0.6556809327292141, + "grad_norm": 2.765166389820208, + "learning_rate": 3.44397621659066e-06, + "loss": 0.3467, + "step": 11360 + }, + { + "epoch": 0.6562581166488701, + "grad_norm": 3.1348291760774556, + "learning_rate": 3.4382035444207125e-06, + "loss": 0.348, + "step": 11370 + }, + { + "epoch": 0.6568353005685261, + "grad_norm": 2.6149817919704486, + "learning_rate": 3.4324308722507655e-06, + "loss": 0.3581, + "step": 11380 + }, + { + "epoch": 0.6574124844881821, + "grad_norm": 2.4071247325320084, + "learning_rate": 3.426658200080818e-06, + "loss": 0.3425, + "step": 11390 + }, + { + "epoch": 0.6579896684078381, + "grad_norm": 6.966272376463285, + "learning_rate": 3.4208855279108703e-06, + "loss": 0.3648, + "step": 11400 + }, + { + "epoch": 0.6585668523274941, + "grad_norm": 2.410341785080001, + "learning_rate": 3.415112855740923e-06, + "loss": 0.3518, + "step": 11410 + }, + { + "epoch": 0.6591440362471501, + "grad_norm": 4.052333811947672, + "learning_rate": 3.4093401835709755e-06, + "loss": 0.3538, + "step": 11420 + }, + { + "epoch": 0.6597212201668061, + "grad_norm": 5.112675962153542, + "learning_rate": 3.403567511401028e-06, + "loss": 0.3462, + "step": 11430 + }, + { + "epoch": 0.6602984040864621, + "grad_norm": 2.0711228360250873, + "learning_rate": 3.3977948392310807e-06, + "loss": 0.3523, + "step": 11440 + }, + { + "epoch": 0.6608755880061181, + "grad_norm": 3.0723705312379677, + "learning_rate": 3.3920221670611333e-06, + "loss": 0.3538, + "step": 11450 + }, + { + "epoch": 0.6614527719257741, + "grad_norm": 7.22672591716136, + "learning_rate": 3.3862494948911854e-06, + "loss": 0.3508, + "step": 11460 + }, + { + "epoch": 0.6620299558454301, + "grad_norm": 3.4674616674239447, + "learning_rate": 3.380476822721238e-06, + "loss": 0.3488, + "step": 11470 + }, + { + "epoch": 0.6626071397650861, + "grad_norm": 3.0526763692239602, + "learning_rate": 3.3747041505512906e-06, + "loss": 0.3413, + "step": 11480 + }, + { + "epoch": 0.6631843236847421, + "grad_norm": 8.92552529404141, + "learning_rate": 3.3689314783813432e-06, + "loss": 0.3559, + "step": 11490 + }, + { + "epoch": 0.6637615076043981, + "grad_norm": 7.126184068548845, + "learning_rate": 3.363158806211396e-06, + "loss": 0.3431, + "step": 11500 + }, + { + "epoch": 0.6643386915240541, + "grad_norm": 3.6795312021204993, + "learning_rate": 3.357386134041448e-06, + "loss": 0.3644, + "step": 11510 + }, + { + "epoch": 0.6649158754437101, + "grad_norm": 3.1394487426454765, + "learning_rate": 3.3516134618715006e-06, + "loss": 0.355, + "step": 11520 + }, + { + "epoch": 0.6654930593633661, + "grad_norm": 3.115800444710574, + "learning_rate": 3.345840789701553e-06, + "loss": 0.3595, + "step": 11530 + }, + { + "epoch": 0.6660702432830221, + "grad_norm": 3.04464377321414, + "learning_rate": 3.3400681175316058e-06, + "loss": 0.3474, + "step": 11540 + }, + { + "epoch": 0.6666474272026781, + "grad_norm": 11.414003410738056, + "learning_rate": 3.3342954453616584e-06, + "loss": 0.344, + "step": 11550 + }, + { + "epoch": 0.6672246111223341, + "grad_norm": 7.794881460371124, + "learning_rate": 3.328522773191711e-06, + "loss": 0.3489, + "step": 11560 + }, + { + "epoch": 0.6678017950419901, + "grad_norm": 3.606879802040075, + "learning_rate": 3.322750101021763e-06, + "loss": 0.3542, + "step": 11570 + }, + { + "epoch": 0.6683789789616461, + "grad_norm": 3.271233323948874, + "learning_rate": 3.3169774288518158e-06, + "loss": 0.3572, + "step": 11580 + }, + { + "epoch": 0.6689561628813021, + "grad_norm": 5.312528784803595, + "learning_rate": 3.3112047566818683e-06, + "loss": 0.344, + "step": 11590 + }, + { + "epoch": 0.6695333468009581, + "grad_norm": 4.414037045732359, + "learning_rate": 3.305432084511921e-06, + "loss": 0.3757, + "step": 11600 + }, + { + "epoch": 0.6701105307206141, + "grad_norm": 4.420990727422642, + "learning_rate": 3.2996594123419735e-06, + "loss": 0.3361, + "step": 11610 + }, + { + "epoch": 0.6706877146402701, + "grad_norm": 28.79897728988663, + "learning_rate": 3.2938867401720257e-06, + "loss": 0.3549, + "step": 11620 + }, + { + "epoch": 0.6712648985599261, + "grad_norm": 5.938341510395738, + "learning_rate": 3.2881140680020783e-06, + "loss": 0.3528, + "step": 11630 + }, + { + "epoch": 0.6718420824795821, + "grad_norm": 7.625014010774144, + "learning_rate": 3.282341395832131e-06, + "loss": 0.3569, + "step": 11640 + }, + { + "epoch": 0.6724192663992381, + "grad_norm": 2.627772129382934, + "learning_rate": 3.2765687236621835e-06, + "loss": 0.341, + "step": 11650 + }, + { + "epoch": 0.6729964503188941, + "grad_norm": 6.6462109495436765, + "learning_rate": 3.270796051492236e-06, + "loss": 0.3432, + "step": 11660 + }, + { + "epoch": 0.6735736342385501, + "grad_norm": 4.140894146799749, + "learning_rate": 3.2650233793222887e-06, + "loss": 0.35, + "step": 11670 + }, + { + "epoch": 0.6741508181582061, + "grad_norm": 7.023320528238819, + "learning_rate": 3.259250707152341e-06, + "loss": 0.3483, + "step": 11680 + }, + { + "epoch": 0.6747280020778621, + "grad_norm": 3.56371765942958, + "learning_rate": 3.2534780349823935e-06, + "loss": 0.3456, + "step": 11690 + }, + { + "epoch": 0.6753051859975181, + "grad_norm": 4.693701665628699, + "learning_rate": 3.247705362812446e-06, + "loss": 0.3556, + "step": 11700 + }, + { + "epoch": 0.6758823699171741, + "grad_norm": 4.8769232133317955, + "learning_rate": 3.2419326906424987e-06, + "loss": 0.3406, + "step": 11710 + }, + { + "epoch": 0.6764595538368301, + "grad_norm": 14.213756604351863, + "learning_rate": 3.2361600184725513e-06, + "loss": 0.3422, + "step": 11720 + }, + { + "epoch": 0.6770367377564861, + "grad_norm": 4.009649954155962, + "learning_rate": 3.2303873463026034e-06, + "loss": 0.3449, + "step": 11730 + }, + { + "epoch": 0.6776139216761421, + "grad_norm": 3.538273145119479, + "learning_rate": 3.224614674132656e-06, + "loss": 0.3457, + "step": 11740 + }, + { + "epoch": 0.6781911055957981, + "grad_norm": 3.0329790960952026, + "learning_rate": 3.2188420019627086e-06, + "loss": 0.3479, + "step": 11750 + }, + { + "epoch": 0.6787682895154541, + "grad_norm": 8.21491132526687, + "learning_rate": 3.2130693297927612e-06, + "loss": 0.3577, + "step": 11760 + }, + { + "epoch": 0.6793454734351101, + "grad_norm": 4.981836103874383, + "learning_rate": 3.207296657622814e-06, + "loss": 0.355, + "step": 11770 + }, + { + "epoch": 0.6799226573547661, + "grad_norm": 5.1547852515451975, + "learning_rate": 3.2015239854528664e-06, + "loss": 0.3392, + "step": 11780 + }, + { + "epoch": 0.6804998412744221, + "grad_norm": 9.811331834930291, + "learning_rate": 3.1957513132829186e-06, + "loss": 0.3545, + "step": 11790 + }, + { + "epoch": 0.6810770251940781, + "grad_norm": 4.083480395202693, + "learning_rate": 3.189978641112971e-06, + "loss": 0.3575, + "step": 11800 + }, + { + "epoch": 0.6816542091137341, + "grad_norm": 4.764988431769556, + "learning_rate": 3.184205968943024e-06, + "loss": 0.3372, + "step": 11810 + }, + { + "epoch": 0.6822313930333901, + "grad_norm": 6.794798971465098, + "learning_rate": 3.1784332967730764e-06, + "loss": 0.3539, + "step": 11820 + }, + { + "epoch": 0.6828085769530461, + "grad_norm": 7.111672589883507, + "learning_rate": 3.172660624603129e-06, + "loss": 0.3544, + "step": 11830 + }, + { + "epoch": 0.6833857608727021, + "grad_norm": 24.124344459801147, + "learning_rate": 3.166887952433181e-06, + "loss": 0.3602, + "step": 11840 + }, + { + "epoch": 0.6839629447923581, + "grad_norm": 12.424128663469016, + "learning_rate": 3.1611152802632338e-06, + "loss": 0.3441, + "step": 11850 + }, + { + "epoch": 0.6845401287120141, + "grad_norm": 3.903963843774075, + "learning_rate": 3.1553426080932864e-06, + "loss": 0.3572, + "step": 11860 + }, + { + "epoch": 0.6851173126316701, + "grad_norm": 7.5954443349430525, + "learning_rate": 3.149569935923339e-06, + "loss": 0.3459, + "step": 11870 + }, + { + "epoch": 0.6856944965513261, + "grad_norm": 5.147769651660261, + "learning_rate": 3.1437972637533915e-06, + "loss": 0.3443, + "step": 11880 + }, + { + "epoch": 0.6862716804709821, + "grad_norm": 8.531813464808828, + "learning_rate": 3.138024591583444e-06, + "loss": 0.3427, + "step": 11890 + }, + { + "epoch": 0.6868488643906381, + "grad_norm": 9.203737344622347, + "learning_rate": 3.1322519194134963e-06, + "loss": 0.3466, + "step": 11900 + }, + { + "epoch": 0.6874260483102941, + "grad_norm": 4.1913039016792055, + "learning_rate": 3.1264792472435498e-06, + "loss": 0.3667, + "step": 11910 + }, + { + "epoch": 0.6880032322299501, + "grad_norm": 4.984073528450747, + "learning_rate": 3.120706575073602e-06, + "loss": 0.3417, + "step": 11920 + }, + { + "epoch": 0.6885804161496061, + "grad_norm": 4.3895825229927725, + "learning_rate": 3.1149339029036545e-06, + "loss": 0.3341, + "step": 11930 + }, + { + "epoch": 0.6891576000692621, + "grad_norm": 5.323675472371107, + "learning_rate": 3.109161230733707e-06, + "loss": 0.3341, + "step": 11940 + }, + { + "epoch": 0.6897347839889181, + "grad_norm": 4.263152619263457, + "learning_rate": 3.1033885585637597e-06, + "loss": 0.3463, + "step": 11950 + }, + { + "epoch": 0.6903119679085741, + "grad_norm": 6.116151443830828, + "learning_rate": 3.0976158863938123e-06, + "loss": 0.3603, + "step": 11960 + }, + { + "epoch": 0.6908891518282301, + "grad_norm": 10.96676225065857, + "learning_rate": 3.0918432142238645e-06, + "loss": 0.3317, + "step": 11970 + }, + { + "epoch": 0.6914663357478861, + "grad_norm": 5.204338558399774, + "learning_rate": 3.086070542053917e-06, + "loss": 0.3432, + "step": 11980 + }, + { + "epoch": 0.6920435196675421, + "grad_norm": 3.481643749818502, + "learning_rate": 3.0802978698839697e-06, + "loss": 0.3343, + "step": 11990 + }, + { + "epoch": 0.6926207035871981, + "grad_norm": 5.422983882620972, + "learning_rate": 3.0745251977140223e-06, + "loss": 0.3444, + "step": 12000 + }, + { + "epoch": 0.6931978875068541, + "grad_norm": 3.293726670681602, + "learning_rate": 3.068752525544075e-06, + "loss": 0.3536, + "step": 12010 + }, + { + "epoch": 0.6937750714265101, + "grad_norm": 11.162281611641948, + "learning_rate": 3.0629798533741275e-06, + "loss": 0.3461, + "step": 12020 + }, + { + "epoch": 0.6943522553461661, + "grad_norm": 4.538803769431588, + "learning_rate": 3.0572071812041797e-06, + "loss": 0.3404, + "step": 12030 + }, + { + "epoch": 0.6949294392658221, + "grad_norm": 7.297266576912264, + "learning_rate": 3.0514345090342323e-06, + "loss": 0.3399, + "step": 12040 + }, + { + "epoch": 0.6955066231854781, + "grad_norm": 4.498082885030529, + "learning_rate": 3.045661836864285e-06, + "loss": 0.3572, + "step": 12050 + }, + { + "epoch": 0.6960838071051341, + "grad_norm": 6.71445000313715, + "learning_rate": 3.0398891646943374e-06, + "loss": 0.3456, + "step": 12060 + }, + { + "epoch": 0.6966609910247901, + "grad_norm": 4.130838744263147, + "learning_rate": 3.03411649252439e-06, + "loss": 0.3382, + "step": 12070 + }, + { + "epoch": 0.6972381749444461, + "grad_norm": 2.959998512168581, + "learning_rate": 3.0283438203544422e-06, + "loss": 0.3441, + "step": 12080 + }, + { + "epoch": 0.6978153588641021, + "grad_norm": 8.68519096842326, + "learning_rate": 3.022571148184495e-06, + "loss": 0.3536, + "step": 12090 + }, + { + "epoch": 0.6983925427837581, + "grad_norm": 6.068123748807202, + "learning_rate": 3.0167984760145474e-06, + "loss": 0.3336, + "step": 12100 + }, + { + "epoch": 0.6989697267034141, + "grad_norm": 13.720643945389472, + "learning_rate": 3.0110258038446e-06, + "loss": 0.3453, + "step": 12110 + }, + { + "epoch": 0.6995469106230701, + "grad_norm": 5.2345993949285115, + "learning_rate": 3.0052531316746526e-06, + "loss": 0.3514, + "step": 12120 + }, + { + "epoch": 0.7001240945427261, + "grad_norm": 5.995140869193482, + "learning_rate": 2.999480459504705e-06, + "loss": 0.3465, + "step": 12130 + }, + { + "epoch": 0.7007012784623821, + "grad_norm": 5.534508344244959, + "learning_rate": 2.9937077873347574e-06, + "loss": 0.3436, + "step": 12140 + }, + { + "epoch": 0.7012784623820381, + "grad_norm": 3.837652212059965, + "learning_rate": 2.98793511516481e-06, + "loss": 0.347, + "step": 12150 + }, + { + "epoch": 0.7018556463016941, + "grad_norm": 32.35178311186503, + "learning_rate": 2.9821624429948626e-06, + "loss": 0.3543, + "step": 12160 + }, + { + "epoch": 0.7024328302213501, + "grad_norm": 4.1065566840261125, + "learning_rate": 2.976389770824915e-06, + "loss": 0.3421, + "step": 12170 + }, + { + "epoch": 0.7030100141410061, + "grad_norm": 6.144592670774153, + "learning_rate": 2.9706170986549678e-06, + "loss": 0.3466, + "step": 12180 + }, + { + "epoch": 0.7035871980606621, + "grad_norm": 5.684740566371751, + "learning_rate": 2.96484442648502e-06, + "loss": 0.349, + "step": 12190 + }, + { + "epoch": 0.704164381980318, + "grad_norm": 5.234589933221641, + "learning_rate": 2.9590717543150725e-06, + "loss": 0.3221, + "step": 12200 + }, + { + "epoch": 0.704741565899974, + "grad_norm": 5.9090591405034205, + "learning_rate": 2.953299082145125e-06, + "loss": 0.3461, + "step": 12210 + }, + { + "epoch": 0.70531874981963, + "grad_norm": 5.161621785333446, + "learning_rate": 2.9475264099751777e-06, + "loss": 0.3343, + "step": 12220 + }, + { + "epoch": 0.705895933739286, + "grad_norm": 7.454875045097898, + "learning_rate": 2.9417537378052303e-06, + "loss": 0.339, + "step": 12230 + }, + { + "epoch": 0.706473117658942, + "grad_norm": 3.3533355785311936, + "learning_rate": 2.935981065635283e-06, + "loss": 0.3226, + "step": 12240 + }, + { + "epoch": 0.707050301578598, + "grad_norm": 3.3526244501016507, + "learning_rate": 2.930208393465335e-06, + "loss": 0.3525, + "step": 12250 + }, + { + "epoch": 0.707627485498254, + "grad_norm": 46.63609588889749, + "learning_rate": 2.9244357212953877e-06, + "loss": 0.3388, + "step": 12260 + }, + { + "epoch": 0.70820466941791, + "grad_norm": 6.343222491694745, + "learning_rate": 2.9186630491254403e-06, + "loss": 0.3458, + "step": 12270 + }, + { + "epoch": 0.708781853337566, + "grad_norm": 7.406012410848603, + "learning_rate": 2.912890376955493e-06, + "loss": 0.3523, + "step": 12280 + }, + { + "epoch": 0.709359037257222, + "grad_norm": 4.391956311756113, + "learning_rate": 2.9071177047855455e-06, + "loss": 0.3462, + "step": 12290 + }, + { + "epoch": 0.709936221176878, + "grad_norm": 7.365023031476813, + "learning_rate": 2.9013450326155977e-06, + "loss": 0.3507, + "step": 12300 + }, + { + "epoch": 0.710513405096534, + "grad_norm": 8.355149496371373, + "learning_rate": 2.8955723604456503e-06, + "loss": 0.3516, + "step": 12310 + }, + { + "epoch": 0.71109058901619, + "grad_norm": 4.12597154889129, + "learning_rate": 2.889799688275703e-06, + "loss": 0.3387, + "step": 12320 + }, + { + "epoch": 0.711667772935846, + "grad_norm": 22.783055812157006, + "learning_rate": 2.8840270161057555e-06, + "loss": 0.3505, + "step": 12330 + }, + { + "epoch": 0.712244956855502, + "grad_norm": 6.598877289364409, + "learning_rate": 2.878254343935808e-06, + "loss": 0.3511, + "step": 12340 + }, + { + "epoch": 0.712822140775158, + "grad_norm": 6.658331270547365, + "learning_rate": 2.8724816717658606e-06, + "loss": 0.3472, + "step": 12350 + }, + { + "epoch": 0.713399324694814, + "grad_norm": 4.249593904236529, + "learning_rate": 2.866708999595913e-06, + "loss": 0.3462, + "step": 12360 + }, + { + "epoch": 0.71397650861447, + "grad_norm": 4.820541412534286, + "learning_rate": 2.8609363274259654e-06, + "loss": 0.3327, + "step": 12370 + }, + { + "epoch": 0.714553692534126, + "grad_norm": 29.19756399575411, + "learning_rate": 2.855163655256018e-06, + "loss": 0.3423, + "step": 12380 + }, + { + "epoch": 0.715130876453782, + "grad_norm": 2.5170693039148695, + "learning_rate": 2.8493909830860706e-06, + "loss": 0.3507, + "step": 12390 + }, + { + "epoch": 0.715708060373438, + "grad_norm": 4.848573397975011, + "learning_rate": 2.843618310916123e-06, + "loss": 0.3614, + "step": 12400 + }, + { + "epoch": 0.716285244293094, + "grad_norm": 8.083798553592858, + "learning_rate": 2.8378456387461754e-06, + "loss": 0.3444, + "step": 12410 + }, + { + "epoch": 0.71686242821275, + "grad_norm": 6.013516986737268, + "learning_rate": 2.832072966576228e-06, + "loss": 0.3411, + "step": 12420 + }, + { + "epoch": 0.717439612132406, + "grad_norm": 58.40825846546123, + "learning_rate": 2.826300294406281e-06, + "loss": 0.3639, + "step": 12430 + }, + { + "epoch": 0.718016796052062, + "grad_norm": 3.486306193338465, + "learning_rate": 2.8205276222363336e-06, + "loss": 0.3508, + "step": 12440 + }, + { + "epoch": 0.718593979971718, + "grad_norm": 11.117203715725822, + "learning_rate": 2.814754950066386e-06, + "loss": 0.3441, + "step": 12450 + }, + { + "epoch": 0.719171163891374, + "grad_norm": 6.959568782940119, + "learning_rate": 2.808982277896439e-06, + "loss": 0.355, + "step": 12460 + }, + { + "epoch": 0.71974834781103, + "grad_norm": 4.77575740578489, + "learning_rate": 2.8032096057264914e-06, + "loss": 0.3634, + "step": 12470 + }, + { + "epoch": 0.720325531730686, + "grad_norm": 12.450518772632652, + "learning_rate": 2.797436933556544e-06, + "loss": 0.3454, + "step": 12480 + }, + { + "epoch": 0.720902715650342, + "grad_norm": 4.606989122142115, + "learning_rate": 2.791664261386596e-06, + "loss": 0.338, + "step": 12490 + }, + { + "epoch": 0.721479899569998, + "grad_norm": 4.456459932783003, + "learning_rate": 2.7858915892166488e-06, + "loss": 0.3474, + "step": 12500 + }, + { + "epoch": 0.722057083489654, + "grad_norm": 4.348978440263518, + "learning_rate": 2.7801189170467014e-06, + "loss": 0.3488, + "step": 12510 + }, + { + "epoch": 0.72263426740931, + "grad_norm": 3.220553815408838, + "learning_rate": 2.774346244876754e-06, + "loss": 0.3372, + "step": 12520 + }, + { + "epoch": 0.723211451328966, + "grad_norm": 6.662149536726665, + "learning_rate": 2.7685735727068065e-06, + "loss": 0.3487, + "step": 12530 + }, + { + "epoch": 0.723788635248622, + "grad_norm": 4.489888078931735, + "learning_rate": 2.7628009005368587e-06, + "loss": 0.3589, + "step": 12540 + }, + { + "epoch": 0.724365819168278, + "grad_norm": 3.911221624507832, + "learning_rate": 2.7570282283669113e-06, + "loss": 0.3364, + "step": 12550 + }, + { + "epoch": 0.724943003087934, + "grad_norm": 4.397535770990173, + "learning_rate": 2.751255556196964e-06, + "loss": 0.3468, + "step": 12560 + }, + { + "epoch": 0.72552018700759, + "grad_norm": 4.138162758687126, + "learning_rate": 2.7454828840270165e-06, + "loss": 0.351, + "step": 12570 + }, + { + "epoch": 0.726097370927246, + "grad_norm": 3.6393806674007396, + "learning_rate": 2.739710211857069e-06, + "loss": 0.3439, + "step": 12580 + }, + { + "epoch": 0.7266745548469019, + "grad_norm": 10.959014317841367, + "learning_rate": 2.7339375396871217e-06, + "loss": 0.3466, + "step": 12590 + }, + { + "epoch": 0.7272517387665579, + "grad_norm": 8.5956598139555, + "learning_rate": 2.728164867517174e-06, + "loss": 0.3474, + "step": 12600 + }, + { + "epoch": 0.7278289226862139, + "grad_norm": 8.199922135307672, + "learning_rate": 2.7223921953472265e-06, + "loss": 0.3334, + "step": 12610 + }, + { + "epoch": 0.7284061066058699, + "grad_norm": 3.4555204304735563, + "learning_rate": 2.716619523177279e-06, + "loss": 0.3437, + "step": 12620 + }, + { + "epoch": 0.7289832905255259, + "grad_norm": 4.838169983017387, + "learning_rate": 2.7108468510073317e-06, + "loss": 0.3444, + "step": 12630 + }, + { + "epoch": 0.7295604744451819, + "grad_norm": 3.294894158340646, + "learning_rate": 2.7050741788373843e-06, + "loss": 0.345, + "step": 12640 + }, + { + "epoch": 0.7301376583648379, + "grad_norm": 3.886142779724859, + "learning_rate": 2.6993015066674364e-06, + "loss": 0.34, + "step": 12650 + }, + { + "epoch": 0.7307148422844939, + "grad_norm": 3.1827555908179814, + "learning_rate": 2.693528834497489e-06, + "loss": 0.3383, + "step": 12660 + }, + { + "epoch": 0.7312920262041499, + "grad_norm": 20.97526772421675, + "learning_rate": 2.6877561623275416e-06, + "loss": 0.3416, + "step": 12670 + }, + { + "epoch": 0.7318692101238059, + "grad_norm": 3.474528368008189, + "learning_rate": 2.6819834901575942e-06, + "loss": 0.3359, + "step": 12680 + }, + { + "epoch": 0.7324463940434619, + "grad_norm": 11.34967364860296, + "learning_rate": 2.676210817987647e-06, + "loss": 0.3581, + "step": 12690 + }, + { + "epoch": 0.7330235779631179, + "grad_norm": 3.495369176137086, + "learning_rate": 2.6704381458176994e-06, + "loss": 0.3369, + "step": 12700 + }, + { + "epoch": 0.7336007618827739, + "grad_norm": 4.744362874719428, + "learning_rate": 2.6646654736477516e-06, + "loss": 0.3502, + "step": 12710 + }, + { + "epoch": 0.7341779458024299, + "grad_norm": 2.554299735229823, + "learning_rate": 2.658892801477804e-06, + "loss": 0.3547, + "step": 12720 + }, + { + "epoch": 0.7347551297220859, + "grad_norm": 4.114324789692039, + "learning_rate": 2.653120129307857e-06, + "loss": 0.3406, + "step": 12730 + }, + { + "epoch": 0.7353323136417419, + "grad_norm": 5.99465461001967, + "learning_rate": 2.6473474571379094e-06, + "loss": 0.3568, + "step": 12740 + }, + { + "epoch": 0.7359094975613979, + "grad_norm": 5.08850145360636, + "learning_rate": 2.641574784967962e-06, + "loss": 0.3437, + "step": 12750 + }, + { + "epoch": 0.7364866814810539, + "grad_norm": 2.901320860603511, + "learning_rate": 2.635802112798014e-06, + "loss": 0.334, + "step": 12760 + }, + { + "epoch": 0.7370638654007099, + "grad_norm": 7.740697658957906, + "learning_rate": 2.6300294406280668e-06, + "loss": 0.3361, + "step": 12770 + }, + { + "epoch": 0.7376410493203659, + "grad_norm": 6.245600892762995, + "learning_rate": 2.6242567684581194e-06, + "loss": 0.3308, + "step": 12780 + }, + { + "epoch": 0.7382182332400219, + "grad_norm": 5.308126429822944, + "learning_rate": 2.618484096288172e-06, + "loss": 0.3527, + "step": 12790 + }, + { + "epoch": 0.7387954171596779, + "grad_norm": 4.212291401570202, + "learning_rate": 2.6127114241182245e-06, + "loss": 0.3409, + "step": 12800 + }, + { + "epoch": 0.7393726010793339, + "grad_norm": 3.7060833092802556, + "learning_rate": 2.606938751948277e-06, + "loss": 0.337, + "step": 12810 + }, + { + "epoch": 0.7399497849989899, + "grad_norm": 13.303775807909831, + "learning_rate": 2.6011660797783293e-06, + "loss": 0.3521, + "step": 12820 + }, + { + "epoch": 0.7405269689186459, + "grad_norm": 5.855236173414753, + "learning_rate": 2.595393407608382e-06, + "loss": 0.344, + "step": 12830 + }, + { + "epoch": 0.7411041528383019, + "grad_norm": 2.862607162292994, + "learning_rate": 2.5896207354384345e-06, + "loss": 0.3386, + "step": 12840 + }, + { + "epoch": 0.7416813367579579, + "grad_norm": 3.190707127525178, + "learning_rate": 2.583848063268487e-06, + "loss": 0.3451, + "step": 12850 + }, + { + "epoch": 0.7422585206776139, + "grad_norm": 7.479465788278306, + "learning_rate": 2.5780753910985397e-06, + "loss": 0.3418, + "step": 12860 + }, + { + "epoch": 0.7428357045972699, + "grad_norm": 3.9314452580240795, + "learning_rate": 2.572302718928592e-06, + "loss": 0.3558, + "step": 12870 + }, + { + "epoch": 0.7434128885169259, + "grad_norm": 2.586918485101635, + "learning_rate": 2.5665300467586445e-06, + "loss": 0.3521, + "step": 12880 + }, + { + "epoch": 0.7439900724365819, + "grad_norm": 5.285658124727487, + "learning_rate": 2.560757374588697e-06, + "loss": 0.3467, + "step": 12890 + }, + { + "epoch": 0.7445672563562379, + "grad_norm": 16.29975276837285, + "learning_rate": 2.5549847024187497e-06, + "loss": 0.338, + "step": 12900 + }, + { + "epoch": 0.7451444402758939, + "grad_norm": 5.18800369782506, + "learning_rate": 2.5492120302488023e-06, + "loss": 0.3423, + "step": 12910 + }, + { + "epoch": 0.7457216241955499, + "grad_norm": 2.7621346083831737, + "learning_rate": 2.543439358078855e-06, + "loss": 0.3474, + "step": 12920 + }, + { + "epoch": 0.7462988081152059, + "grad_norm": 9.136730427994907, + "learning_rate": 2.537666685908907e-06, + "loss": 0.3599, + "step": 12930 + }, + { + "epoch": 0.7468759920348619, + "grad_norm": 4.494380679642752, + "learning_rate": 2.5318940137389596e-06, + "loss": 0.3509, + "step": 12940 + }, + { + "epoch": 0.7474531759545179, + "grad_norm": 3.8278742483492554, + "learning_rate": 2.5261213415690122e-06, + "loss": 0.3419, + "step": 12950 + }, + { + "epoch": 0.7480303598741739, + "grad_norm": 13.52933963754357, + "learning_rate": 2.5203486693990653e-06, + "loss": 0.3255, + "step": 12960 + }, + { + "epoch": 0.7486075437938299, + "grad_norm": 4.13788172504123, + "learning_rate": 2.514575997229118e-06, + "loss": 0.3446, + "step": 12970 + }, + { + "epoch": 0.7491847277134859, + "grad_norm": 6.859750860335564, + "learning_rate": 2.5088033250591704e-06, + "loss": 0.3586, + "step": 12980 + }, + { + "epoch": 0.7497619116331419, + "grad_norm": 9.235304443739427, + "learning_rate": 2.503030652889223e-06, + "loss": 0.3393, + "step": 12990 + }, + { + "epoch": 0.7503390955527979, + "grad_norm": 5.829087177972612, + "learning_rate": 2.4972579807192752e-06, + "loss": 0.3518, + "step": 13000 + }, + { + "epoch": 0.7509162794724539, + "grad_norm": 4.209497926424704, + "learning_rate": 2.491485308549328e-06, + "loss": 0.3392, + "step": 13010 + }, + { + "epoch": 0.7514934633921099, + "grad_norm": 7.292629657573057, + "learning_rate": 2.48571263637938e-06, + "loss": 0.3253, + "step": 13020 + }, + { + "epoch": 0.7520706473117659, + "grad_norm": 2.647834968655611, + "learning_rate": 2.4799399642094326e-06, + "loss": 0.3304, + "step": 13030 + }, + { + "epoch": 0.7526478312314219, + "grad_norm": 3.2625519939417638, + "learning_rate": 2.474167292039485e-06, + "loss": 0.3389, + "step": 13040 + }, + { + "epoch": 0.7532250151510779, + "grad_norm": 11.316624355429083, + "learning_rate": 2.4683946198695378e-06, + "loss": 0.3369, + "step": 13050 + }, + { + "epoch": 0.7538021990707339, + "grad_norm": 9.673257149513491, + "learning_rate": 2.4626219476995904e-06, + "loss": 0.3334, + "step": 13060 + }, + { + "epoch": 0.7543793829903899, + "grad_norm": 4.342384289505803, + "learning_rate": 2.4568492755296426e-06, + "loss": 0.3464, + "step": 13070 + }, + { + "epoch": 0.7549565669100459, + "grad_norm": 11.18476020427808, + "learning_rate": 2.451076603359695e-06, + "loss": 0.3427, + "step": 13080 + }, + { + "epoch": 0.7555337508297019, + "grad_norm": 3.7726393590227274, + "learning_rate": 2.445303931189748e-06, + "loss": 0.334, + "step": 13090 + }, + { + "epoch": 0.7561109347493579, + "grad_norm": 12.714136611436194, + "learning_rate": 2.4395312590198008e-06, + "loss": 0.3268, + "step": 13100 + }, + { + "epoch": 0.7566881186690139, + "grad_norm": 2.1269070192971307, + "learning_rate": 2.433758586849853e-06, + "loss": 0.3441, + "step": 13110 + }, + { + "epoch": 0.7572653025886699, + "grad_norm": 6.054989937046523, + "learning_rate": 2.4279859146799055e-06, + "loss": 0.3428, + "step": 13120 + }, + { + "epoch": 0.7578424865083259, + "grad_norm": 4.270071372155915, + "learning_rate": 2.422213242509958e-06, + "loss": 0.3539, + "step": 13130 + }, + { + "epoch": 0.7584196704279819, + "grad_norm": 3.1183499805433814, + "learning_rate": 2.4164405703400107e-06, + "loss": 0.3347, + "step": 13140 + }, + { + "epoch": 0.7589968543476379, + "grad_norm": 8.120134607992474, + "learning_rate": 2.4106678981700633e-06, + "loss": 0.3513, + "step": 13150 + }, + { + "epoch": 0.7595740382672939, + "grad_norm": 4.275672737070938, + "learning_rate": 2.4048952260001155e-06, + "loss": 0.3333, + "step": 13160 + }, + { + "epoch": 0.7601512221869499, + "grad_norm": 4.309409121092216, + "learning_rate": 2.399122553830168e-06, + "loss": 0.342, + "step": 13170 + }, + { + "epoch": 0.7607284061066059, + "grad_norm": 14.975672326288917, + "learning_rate": 2.3933498816602207e-06, + "loss": 0.3491, + "step": 13180 + }, + { + "epoch": 0.7613055900262619, + "grad_norm": 3.3391093670284984, + "learning_rate": 2.3875772094902733e-06, + "loss": 0.3377, + "step": 13190 + }, + { + "epoch": 0.7618827739459179, + "grad_norm": 9.934238745041949, + "learning_rate": 2.381804537320326e-06, + "loss": 0.3316, + "step": 13200 + }, + { + "epoch": 0.7624599578655739, + "grad_norm": 3.323343149796346, + "learning_rate": 2.3760318651503785e-06, + "loss": 0.3217, + "step": 13210 + }, + { + "epoch": 0.7630371417852299, + "grad_norm": 5.084030929299541, + "learning_rate": 2.3702591929804307e-06, + "loss": 0.3582, + "step": 13220 + }, + { + "epoch": 0.7636143257048859, + "grad_norm": 7.012741879467996, + "learning_rate": 2.3644865208104833e-06, + "loss": 0.3433, + "step": 13230 + }, + { + "epoch": 0.7641915096245419, + "grad_norm": 4.5407340579502025, + "learning_rate": 2.358713848640536e-06, + "loss": 0.3388, + "step": 13240 + }, + { + "epoch": 0.7647686935441979, + "grad_norm": 4.194705290127997, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.3312, + "step": 13250 + }, + { + "epoch": 0.7653458774638539, + "grad_norm": 3.7140413786909203, + "learning_rate": 2.347168504300641e-06, + "loss": 0.3496, + "step": 13260 + }, + { + "epoch": 0.7659230613835099, + "grad_norm": 3.257080159852974, + "learning_rate": 2.3413958321306936e-06, + "loss": 0.3366, + "step": 13270 + }, + { + "epoch": 0.7665002453031659, + "grad_norm": 5.234732296881502, + "learning_rate": 2.335623159960746e-06, + "loss": 0.3427, + "step": 13280 + }, + { + "epoch": 0.7670774292228218, + "grad_norm": 6.870037441960529, + "learning_rate": 2.3298504877907984e-06, + "loss": 0.3354, + "step": 13290 + }, + { + "epoch": 0.7676546131424778, + "grad_norm": 9.876776601961089, + "learning_rate": 2.324077815620851e-06, + "loss": 0.3315, + "step": 13300 + }, + { + "epoch": 0.7682317970621338, + "grad_norm": 5.912982961911957, + "learning_rate": 2.3183051434509036e-06, + "loss": 0.3485, + "step": 13310 + }, + { + "epoch": 0.7688089809817898, + "grad_norm": 4.832716437104076, + "learning_rate": 2.312532471280956e-06, + "loss": 0.3348, + "step": 13320 + }, + { + "epoch": 0.7693861649014458, + "grad_norm": 5.098375721379664, + "learning_rate": 2.3067597991110084e-06, + "loss": 0.3484, + "step": 13330 + }, + { + "epoch": 0.7699633488211018, + "grad_norm": 4.63656346157866, + "learning_rate": 2.300987126941061e-06, + "loss": 0.3366, + "step": 13340 + }, + { + "epoch": 0.7705405327407578, + "grad_norm": 6.588466951994697, + "learning_rate": 2.2952144547711136e-06, + "loss": 0.3457, + "step": 13350 + }, + { + "epoch": 0.7711177166604138, + "grad_norm": 4.824094255832894, + "learning_rate": 2.2894417826011666e-06, + "loss": 0.341, + "step": 13360 + }, + { + "epoch": 0.7716949005800698, + "grad_norm": 4.637581573877111, + "learning_rate": 2.2836691104312188e-06, + "loss": 0.3554, + "step": 13370 + }, + { + "epoch": 0.7722720844997258, + "grad_norm": 6.056831078560241, + "learning_rate": 2.2778964382612714e-06, + "loss": 0.3186, + "step": 13380 + }, + { + "epoch": 0.7728492684193818, + "grad_norm": 45.47481090136826, + "learning_rate": 2.272123766091324e-06, + "loss": 0.3299, + "step": 13390 + }, + { + "epoch": 0.7734264523390378, + "grad_norm": 22.884051964592864, + "learning_rate": 2.2663510939213766e-06, + "loss": 0.3392, + "step": 13400 + }, + { + "epoch": 0.7740036362586938, + "grad_norm": 6.492791637464672, + "learning_rate": 2.260578421751429e-06, + "loss": 0.3436, + "step": 13410 + }, + { + "epoch": 0.7745808201783498, + "grad_norm": 5.820865415832969, + "learning_rate": 2.2548057495814813e-06, + "loss": 0.3347, + "step": 13420 + }, + { + "epoch": 0.7751580040980058, + "grad_norm": 3.6093618399609406, + "learning_rate": 2.249033077411534e-06, + "loss": 0.3457, + "step": 13430 + }, + { + "epoch": 0.7757351880176618, + "grad_norm": 3.8630308240200977, + "learning_rate": 2.2432604052415865e-06, + "loss": 0.345, + "step": 13440 + }, + { + "epoch": 0.7763123719373178, + "grad_norm": 5.534539731109275, + "learning_rate": 2.237487733071639e-06, + "loss": 0.3383, + "step": 13450 + }, + { + "epoch": 0.7768895558569738, + "grad_norm": 9.50423287415909, + "learning_rate": 2.2317150609016917e-06, + "loss": 0.3528, + "step": 13460 + }, + { + "epoch": 0.7774667397766298, + "grad_norm": 11.597704923923128, + "learning_rate": 2.2259423887317443e-06, + "loss": 0.3482, + "step": 13470 + }, + { + "epoch": 0.7780439236962858, + "grad_norm": 4.724944252593318, + "learning_rate": 2.2201697165617965e-06, + "loss": 0.341, + "step": 13480 + }, + { + "epoch": 0.7786211076159418, + "grad_norm": 4.741588520697521, + "learning_rate": 2.214397044391849e-06, + "loss": 0.3443, + "step": 13490 + }, + { + "epoch": 0.7791982915355978, + "grad_norm": 4.723745867093743, + "learning_rate": 2.2086243722219017e-06, + "loss": 0.3468, + "step": 13500 + }, + { + "epoch": 0.7797754754552538, + "grad_norm": 4.302802386626385, + "learning_rate": 2.2028517000519543e-06, + "loss": 0.3534, + "step": 13510 + }, + { + "epoch": 0.7803526593749098, + "grad_norm": 3.6957459272751385, + "learning_rate": 2.197079027882007e-06, + "loss": 0.3438, + "step": 13520 + }, + { + "epoch": 0.7809298432945658, + "grad_norm": 9.02964602805584, + "learning_rate": 2.191306355712059e-06, + "loss": 0.3507, + "step": 13530 + }, + { + "epoch": 0.7815070272142218, + "grad_norm": 4.584120438677978, + "learning_rate": 2.1855336835421117e-06, + "loss": 0.3392, + "step": 13540 + }, + { + "epoch": 0.7820842111338778, + "grad_norm": 3.634370316749477, + "learning_rate": 2.1797610113721642e-06, + "loss": 0.3434, + "step": 13550 + }, + { + "epoch": 0.7826613950535338, + "grad_norm": 7.974537610574205, + "learning_rate": 2.173988339202217e-06, + "loss": 0.3488, + "step": 13560 + }, + { + "epoch": 0.7832385789731898, + "grad_norm": 15.500270051358006, + "learning_rate": 2.1682156670322694e-06, + "loss": 0.3319, + "step": 13570 + }, + { + "epoch": 0.7838157628928458, + "grad_norm": 3.6214865336598288, + "learning_rate": 2.162442994862322e-06, + "loss": 0.3431, + "step": 13580 + }, + { + "epoch": 0.7843929468125018, + "grad_norm": 4.251663962722991, + "learning_rate": 2.1566703226923742e-06, + "loss": 0.339, + "step": 13590 + }, + { + "epoch": 0.7849701307321578, + "grad_norm": 5.8477391728665875, + "learning_rate": 2.150897650522427e-06, + "loss": 0.3425, + "step": 13600 + }, + { + "epoch": 0.7855473146518138, + "grad_norm": 12.661348588140084, + "learning_rate": 2.1451249783524794e-06, + "loss": 0.3385, + "step": 13610 + }, + { + "epoch": 0.7861244985714698, + "grad_norm": 3.153875030872274, + "learning_rate": 2.139352306182532e-06, + "loss": 0.3429, + "step": 13620 + }, + { + "epoch": 0.7867016824911258, + "grad_norm": 7.53800643811605, + "learning_rate": 2.1335796340125846e-06, + "loss": 0.3492, + "step": 13630 + }, + { + "epoch": 0.7872788664107818, + "grad_norm": 4.143985301762935, + "learning_rate": 2.127806961842637e-06, + "loss": 0.3361, + "step": 13640 + }, + { + "epoch": 0.7878560503304378, + "grad_norm": 3.005269949918135, + "learning_rate": 2.12203428967269e-06, + "loss": 0.3405, + "step": 13650 + }, + { + "epoch": 0.7884332342500938, + "grad_norm": 3.018587955955484, + "learning_rate": 2.1162616175027424e-06, + "loss": 0.3447, + "step": 13660 + }, + { + "epoch": 0.7890104181697498, + "grad_norm": 3.002589774112856, + "learning_rate": 2.110488945332795e-06, + "loss": 0.3253, + "step": 13670 + }, + { + "epoch": 0.7895876020894058, + "grad_norm": 3.85103590680487, + "learning_rate": 2.104716273162847e-06, + "loss": 0.3395, + "step": 13680 + }, + { + "epoch": 0.7901647860090618, + "grad_norm": 4.040110821545416, + "learning_rate": 2.0989436009928998e-06, + "loss": 0.3439, + "step": 13690 + }, + { + "epoch": 0.7907419699287178, + "grad_norm": 8.395833969271274, + "learning_rate": 2.0931709288229524e-06, + "loss": 0.3334, + "step": 13700 + }, + { + "epoch": 0.7913191538483738, + "grad_norm": 8.808629294549078, + "learning_rate": 2.087398256653005e-06, + "loss": 0.3516, + "step": 13710 + }, + { + "epoch": 0.7918963377680298, + "grad_norm": 5.183013748264493, + "learning_rate": 2.0816255844830576e-06, + "loss": 0.3468, + "step": 13720 + }, + { + "epoch": 0.7924735216876858, + "grad_norm": 4.651858039147579, + "learning_rate": 2.0758529123131097e-06, + "loss": 0.3383, + "step": 13730 + }, + { + "epoch": 0.7930507056073418, + "grad_norm": 5.147274649825693, + "learning_rate": 2.0700802401431623e-06, + "loss": 0.3435, + "step": 13740 + }, + { + "epoch": 0.7936278895269978, + "grad_norm": 3.3157064357282646, + "learning_rate": 2.064307567973215e-06, + "loss": 0.3374, + "step": 13750 + }, + { + "epoch": 0.7942050734466538, + "grad_norm": 5.359738899671234, + "learning_rate": 2.0585348958032675e-06, + "loss": 0.3519, + "step": 13760 + }, + { + "epoch": 0.7947822573663098, + "grad_norm": 3.232416306077255, + "learning_rate": 2.05276222363332e-06, + "loss": 0.3466, + "step": 13770 + }, + { + "epoch": 0.7953594412859658, + "grad_norm": 5.634484175482586, + "learning_rate": 2.0469895514633727e-06, + "loss": 0.3435, + "step": 13780 + }, + { + "epoch": 0.7959366252056218, + "grad_norm": 4.812177943413768, + "learning_rate": 2.041216879293425e-06, + "loss": 0.3394, + "step": 13790 + }, + { + "epoch": 0.7965138091252778, + "grad_norm": 4.18690284048581, + "learning_rate": 2.0354442071234775e-06, + "loss": 0.339, + "step": 13800 + }, + { + "epoch": 0.7970909930449338, + "grad_norm": 2.6302278323472024, + "learning_rate": 2.02967153495353e-06, + "loss": 0.3211, + "step": 13810 + }, + { + "epoch": 0.7976681769645898, + "grad_norm": 3.7677609162356567, + "learning_rate": 2.0238988627835827e-06, + "loss": 0.3459, + "step": 13820 + }, + { + "epoch": 0.7982453608842458, + "grad_norm": 3.877790721238228, + "learning_rate": 2.0181261906136353e-06, + "loss": 0.3527, + "step": 13830 + }, + { + "epoch": 0.7988225448039018, + "grad_norm": 3.3882391753576897, + "learning_rate": 2.0123535184436874e-06, + "loss": 0.3418, + "step": 13840 + }, + { + "epoch": 0.7993997287235578, + "grad_norm": 8.275704148522843, + "learning_rate": 2.00658084627374e-06, + "loss": 0.3584, + "step": 13850 + }, + { + "epoch": 0.7999769126432138, + "grad_norm": 4.07120977222577, + "learning_rate": 2.0008081741037926e-06, + "loss": 0.3369, + "step": 13860 + }, + { + "epoch": 0.8005540965628698, + "grad_norm": 6.796016189984939, + "learning_rate": 1.9950355019338452e-06, + "loss": 0.3359, + "step": 13870 + }, + { + "epoch": 0.8011312804825258, + "grad_norm": 4.70618096154267, + "learning_rate": 1.989262829763898e-06, + "loss": 0.3383, + "step": 13880 + }, + { + "epoch": 0.8017084644021818, + "grad_norm": 9.622472004428674, + "learning_rate": 1.9834901575939504e-06, + "loss": 0.3492, + "step": 13890 + }, + { + "epoch": 0.8022856483218378, + "grad_norm": 20.930166870041855, + "learning_rate": 1.977717485424003e-06, + "loss": 0.3343, + "step": 13900 + }, + { + "epoch": 0.8028628322414938, + "grad_norm": 10.85418413095689, + "learning_rate": 1.9719448132540556e-06, + "loss": 0.3399, + "step": 13910 + }, + { + "epoch": 0.8034400161611498, + "grad_norm": 3.9293112562677353, + "learning_rate": 1.9661721410841082e-06, + "loss": 0.363, + "step": 13920 + }, + { + "epoch": 0.8040172000808058, + "grad_norm": 10.773909045381199, + "learning_rate": 1.960399468914161e-06, + "loss": 0.3427, + "step": 13930 + }, + { + "epoch": 0.8045943840004618, + "grad_norm": 4.932078250676689, + "learning_rate": 1.954626796744213e-06, + "loss": 0.3353, + "step": 13940 + }, + { + "epoch": 0.8051715679201178, + "grad_norm": 5.376321864962683, + "learning_rate": 1.9488541245742656e-06, + "loss": 0.3502, + "step": 13950 + }, + { + "epoch": 0.8057487518397738, + "grad_norm": 4.014977881261309, + "learning_rate": 1.943081452404318e-06, + "loss": 0.3169, + "step": 13960 + }, + { + "epoch": 0.8063259357594298, + "grad_norm": 8.651359182721874, + "learning_rate": 1.9373087802343708e-06, + "loss": 0.3349, + "step": 13970 + }, + { + "epoch": 0.8069031196790858, + "grad_norm": 4.683539873403375, + "learning_rate": 1.9315361080644234e-06, + "loss": 0.3366, + "step": 13980 + }, + { + "epoch": 0.8074803035987418, + "grad_norm": 7.755677266741849, + "learning_rate": 1.9257634358944756e-06, + "loss": 0.3377, + "step": 13990 + }, + { + "epoch": 0.8080574875183978, + "grad_norm": 14.533972050557356, + "learning_rate": 1.919990763724528e-06, + "loss": 0.3337, + "step": 14000 + }, + { + "epoch": 0.8086346714380538, + "grad_norm": 3.5287179981983052, + "learning_rate": 1.9142180915545807e-06, + "loss": 0.3371, + "step": 14010 + }, + { + "epoch": 0.8092118553577098, + "grad_norm": 3.6897961566375588, + "learning_rate": 1.9084454193846333e-06, + "loss": 0.3437, + "step": 14020 + }, + { + "epoch": 0.8097890392773658, + "grad_norm": 4.275916003879041, + "learning_rate": 1.9026727472146857e-06, + "loss": 0.3505, + "step": 14030 + }, + { + "epoch": 0.8103662231970218, + "grad_norm": 5.853581087992128, + "learning_rate": 1.8969000750447383e-06, + "loss": 0.3447, + "step": 14040 + }, + { + "epoch": 0.8109434071166778, + "grad_norm": 4.456219664367194, + "learning_rate": 1.891127402874791e-06, + "loss": 0.3405, + "step": 14050 + }, + { + "epoch": 0.8115205910363338, + "grad_norm": 3.609665854590679, + "learning_rate": 1.8853547307048433e-06, + "loss": 0.3539, + "step": 14060 + }, + { + "epoch": 0.8120977749559897, + "grad_norm": 8.81894657017933, + "learning_rate": 1.879582058534896e-06, + "loss": 0.3434, + "step": 14070 + }, + { + "epoch": 0.8126749588756457, + "grad_norm": 73.62257234022623, + "learning_rate": 1.8738093863649485e-06, + "loss": 0.3362, + "step": 14080 + }, + { + "epoch": 0.8132521427953017, + "grad_norm": 4.843976388543906, + "learning_rate": 1.8680367141950009e-06, + "loss": 0.3522, + "step": 14090 + }, + { + "epoch": 0.8138293267149577, + "grad_norm": 6.3647153836539925, + "learning_rate": 1.8622640420250535e-06, + "loss": 0.3351, + "step": 14100 + }, + { + "epoch": 0.8144065106346137, + "grad_norm": 15.067604508734842, + "learning_rate": 1.8564913698551059e-06, + "loss": 0.341, + "step": 14110 + }, + { + "epoch": 0.8149836945542697, + "grad_norm": 2.837118277081176, + "learning_rate": 1.8507186976851585e-06, + "loss": 0.3433, + "step": 14120 + }, + { + "epoch": 0.8155608784739257, + "grad_norm": 3.8243460158503337, + "learning_rate": 1.844946025515211e-06, + "loss": 0.3433, + "step": 14130 + }, + { + "epoch": 0.8161380623935817, + "grad_norm": 5.476403512789154, + "learning_rate": 1.8391733533452635e-06, + "loss": 0.3507, + "step": 14140 + }, + { + "epoch": 0.8167152463132377, + "grad_norm": 7.503101973214223, + "learning_rate": 1.8334006811753163e-06, + "loss": 0.3426, + "step": 14150 + }, + { + "epoch": 0.8172924302328937, + "grad_norm": 13.62010751923973, + "learning_rate": 1.8276280090053689e-06, + "loss": 0.3467, + "step": 14160 + }, + { + "epoch": 0.8178696141525497, + "grad_norm": 5.792511670187818, + "learning_rate": 1.8218553368354215e-06, + "loss": 0.3618, + "step": 14170 + }, + { + "epoch": 0.8184467980722057, + "grad_norm": 3.268214743983259, + "learning_rate": 1.8160826646654738e-06, + "loss": 0.3383, + "step": 14180 + }, + { + "epoch": 0.8190239819918617, + "grad_norm": 4.2120443389536, + "learning_rate": 1.8103099924955264e-06, + "loss": 0.3444, + "step": 14190 + }, + { + "epoch": 0.8196011659115177, + "grad_norm": 5.497732760033444, + "learning_rate": 1.804537320325579e-06, + "loss": 0.3321, + "step": 14200 + }, + { + "epoch": 0.8201783498311737, + "grad_norm": 3.752608960335712, + "learning_rate": 1.7987646481556314e-06, + "loss": 0.34, + "step": 14210 + }, + { + "epoch": 0.8207555337508297, + "grad_norm": 5.61044709148666, + "learning_rate": 1.792991975985684e-06, + "loss": 0.3477, + "step": 14220 + }, + { + "epoch": 0.8213327176704857, + "grad_norm": 3.2289347743222, + "learning_rate": 1.7872193038157364e-06, + "loss": 0.3354, + "step": 14230 + }, + { + "epoch": 0.8219099015901417, + "grad_norm": 4.132295269856051, + "learning_rate": 1.781446631645789e-06, + "loss": 0.3454, + "step": 14240 + }, + { + "epoch": 0.8224870855097977, + "grad_norm": 8.94256483462556, + "learning_rate": 1.7756739594758416e-06, + "loss": 0.3437, + "step": 14250 + }, + { + "epoch": 0.8230642694294537, + "grad_norm": 6.100086954024239, + "learning_rate": 1.769901287305894e-06, + "loss": 0.3358, + "step": 14260 + }, + { + "epoch": 0.8236414533491097, + "grad_norm": 4.7378243458253255, + "learning_rate": 1.7641286151359466e-06, + "loss": 0.3357, + "step": 14270 + }, + { + "epoch": 0.8242186372687657, + "grad_norm": 4.402684079414389, + "learning_rate": 1.7583559429659992e-06, + "loss": 0.346, + "step": 14280 + }, + { + "epoch": 0.8247958211884217, + "grad_norm": 7.94048495060501, + "learning_rate": 1.7525832707960516e-06, + "loss": 0.3334, + "step": 14290 + }, + { + "epoch": 0.8253730051080777, + "grad_norm": 4.248624676291536, + "learning_rate": 1.7468105986261042e-06, + "loss": 0.3348, + "step": 14300 + }, + { + "epoch": 0.8259501890277336, + "grad_norm": 5.863500294171999, + "learning_rate": 1.7410379264561568e-06, + "loss": 0.3545, + "step": 14310 + }, + { + "epoch": 0.8265273729473896, + "grad_norm": 5.462151225452732, + "learning_rate": 1.7352652542862091e-06, + "loss": 0.3324, + "step": 14320 + }, + { + "epoch": 0.8271045568670456, + "grad_norm": 2.757724626485063, + "learning_rate": 1.7294925821162617e-06, + "loss": 0.3411, + "step": 14330 + }, + { + "epoch": 0.8276817407867016, + "grad_norm": 2.5548734181876456, + "learning_rate": 1.7237199099463141e-06, + "loss": 0.3278, + "step": 14340 + }, + { + "epoch": 0.8282589247063576, + "grad_norm": 4.871936623866052, + "learning_rate": 1.7179472377763667e-06, + "loss": 0.3306, + "step": 14350 + }, + { + "epoch": 0.8288361086260136, + "grad_norm": 3.10101382610628, + "learning_rate": 1.7121745656064193e-06, + "loss": 0.3387, + "step": 14360 + }, + { + "epoch": 0.8294132925456696, + "grad_norm": 10.575051742858964, + "learning_rate": 1.7064018934364717e-06, + "loss": 0.3428, + "step": 14370 + }, + { + "epoch": 0.8299904764653256, + "grad_norm": 4.367916255458309, + "learning_rate": 1.7006292212665243e-06, + "loss": 0.3397, + "step": 14380 + }, + { + "epoch": 0.8305676603849816, + "grad_norm": 6.616840660969775, + "learning_rate": 1.694856549096577e-06, + "loss": 0.331, + "step": 14390 + }, + { + "epoch": 0.8311448443046376, + "grad_norm": 12.71189460753895, + "learning_rate": 1.6890838769266293e-06, + "loss": 0.3352, + "step": 14400 + }, + { + "epoch": 0.8317220282242936, + "grad_norm": 8.817998813310439, + "learning_rate": 1.683311204756682e-06, + "loss": 0.3425, + "step": 14410 + }, + { + "epoch": 0.8322992121439496, + "grad_norm": 4.766835462814113, + "learning_rate": 1.6775385325867347e-06, + "loss": 0.3398, + "step": 14420 + }, + { + "epoch": 0.8328763960636056, + "grad_norm": 4.169057753174133, + "learning_rate": 1.6717658604167873e-06, + "loss": 0.3294, + "step": 14430 + }, + { + "epoch": 0.8334535799832616, + "grad_norm": 5.150260088289917, + "learning_rate": 1.6659931882468397e-06, + "loss": 0.3273, + "step": 14440 + }, + { + "epoch": 0.8340307639029176, + "grad_norm": 5.053684619580007, + "learning_rate": 1.6602205160768923e-06, + "loss": 0.3642, + "step": 14450 + }, + { + "epoch": 0.8346079478225736, + "grad_norm": 4.068134996065944, + "learning_rate": 1.6544478439069447e-06, + "loss": 0.3441, + "step": 14460 + }, + { + "epoch": 0.8351851317422296, + "grad_norm": 4.004964659229183, + "learning_rate": 1.6486751717369972e-06, + "loss": 0.3408, + "step": 14470 + }, + { + "epoch": 0.8357623156618856, + "grad_norm": 4.02559591240971, + "learning_rate": 1.6429024995670498e-06, + "loss": 0.3385, + "step": 14480 + }, + { + "epoch": 0.8363394995815416, + "grad_norm": 5.168203382157519, + "learning_rate": 1.6371298273971022e-06, + "loss": 0.3424, + "step": 14490 + }, + { + "epoch": 0.8369166835011976, + "grad_norm": 6.3562693056901285, + "learning_rate": 1.6313571552271548e-06, + "loss": 0.3432, + "step": 14500 + }, + { + "epoch": 0.8374938674208536, + "grad_norm": 7.105132387517162, + "learning_rate": 1.6255844830572074e-06, + "loss": 0.3385, + "step": 14510 + }, + { + "epoch": 0.8380710513405096, + "grad_norm": 5.167657366774912, + "learning_rate": 1.6198118108872598e-06, + "loss": 0.3357, + "step": 14520 + }, + { + "epoch": 0.8386482352601656, + "grad_norm": 5.322779082569645, + "learning_rate": 1.6140391387173124e-06, + "loss": 0.351, + "step": 14530 + }, + { + "epoch": 0.8392254191798216, + "grad_norm": 5.555960049234855, + "learning_rate": 1.608266466547365e-06, + "loss": 0.3427, + "step": 14540 + }, + { + "epoch": 0.8398026030994776, + "grad_norm": 14.30097637805443, + "learning_rate": 1.6024937943774174e-06, + "loss": 0.3268, + "step": 14550 + }, + { + "epoch": 0.8403797870191336, + "grad_norm": 5.930941981679822, + "learning_rate": 1.59672112220747e-06, + "loss": 0.3378, + "step": 14560 + }, + { + "epoch": 0.8409569709387896, + "grad_norm": 6.41989158034055, + "learning_rate": 1.5909484500375224e-06, + "loss": 0.3425, + "step": 14570 + }, + { + "epoch": 0.8415341548584456, + "grad_norm": 5.309996017042821, + "learning_rate": 1.585175777867575e-06, + "loss": 0.347, + "step": 14580 + }, + { + "epoch": 0.8421113387781016, + "grad_norm": 5.483533161522683, + "learning_rate": 1.5794031056976276e-06, + "loss": 0.3292, + "step": 14590 + }, + { + "epoch": 0.8426885226977576, + "grad_norm": 5.692527937195376, + "learning_rate": 1.57363043352768e-06, + "loss": 0.3377, + "step": 14600 + }, + { + "epoch": 0.8432657066174136, + "grad_norm": 4.521114906145438, + "learning_rate": 1.5678577613577325e-06, + "loss": 0.33, + "step": 14610 + }, + { + "epoch": 0.8438428905370696, + "grad_norm": 5.387502803416387, + "learning_rate": 1.5620850891877851e-06, + "loss": 0.3418, + "step": 14620 + }, + { + "epoch": 0.8444200744567256, + "grad_norm": 3.5934061953432783, + "learning_rate": 1.5563124170178375e-06, + "loss": 0.3386, + "step": 14630 + }, + { + "epoch": 0.8449972583763816, + "grad_norm": 11.194143114734654, + "learning_rate": 1.5505397448478901e-06, + "loss": 0.345, + "step": 14640 + }, + { + "epoch": 0.8455744422960376, + "grad_norm": 10.094088344395155, + "learning_rate": 1.5447670726779427e-06, + "loss": 0.3526, + "step": 14650 + }, + { + "epoch": 0.8461516262156936, + "grad_norm": 3.4477998903369538, + "learning_rate": 1.5389944005079951e-06, + "loss": 0.3373, + "step": 14660 + }, + { + "epoch": 0.8467288101353496, + "grad_norm": 7.047707090908251, + "learning_rate": 1.533221728338048e-06, + "loss": 0.3403, + "step": 14670 + }, + { + "epoch": 0.8473059940550056, + "grad_norm": 2.8576436370742893, + "learning_rate": 1.5274490561681005e-06, + "loss": 0.3453, + "step": 14680 + }, + { + "epoch": 0.8478831779746616, + "grad_norm": 12.938468126614902, + "learning_rate": 1.521676383998153e-06, + "loss": 0.336, + "step": 14690 + }, + { + "epoch": 0.8484603618943176, + "grad_norm": 5.011512190053106, + "learning_rate": 1.5159037118282055e-06, + "loss": 0.3424, + "step": 14700 + }, + { + "epoch": 0.8490375458139736, + "grad_norm": 5.063829942230378, + "learning_rate": 1.510131039658258e-06, + "loss": 0.346, + "step": 14710 + }, + { + "epoch": 0.8496147297336296, + "grad_norm": 6.679043646426254, + "learning_rate": 1.5043583674883105e-06, + "loss": 0.3422, + "step": 14720 + }, + { + "epoch": 0.8501919136532856, + "grad_norm": 5.800788583586723, + "learning_rate": 1.498585695318363e-06, + "loss": 0.3421, + "step": 14730 + }, + { + "epoch": 0.8507690975729416, + "grad_norm": 3.968739599812534, + "learning_rate": 1.4928130231484157e-06, + "loss": 0.3395, + "step": 14740 + }, + { + "epoch": 0.8513462814925976, + "grad_norm": 7.880056504602316, + "learning_rate": 1.487040350978468e-06, + "loss": 0.3326, + "step": 14750 + }, + { + "epoch": 0.8519234654122536, + "grad_norm": 5.685089944033708, + "learning_rate": 1.4812676788085207e-06, + "loss": 0.3356, + "step": 14760 + }, + { + "epoch": 0.8525006493319096, + "grad_norm": 4.795808126193561, + "learning_rate": 1.475495006638573e-06, + "loss": 0.316, + "step": 14770 + }, + { + "epoch": 0.8530778332515656, + "grad_norm": 6.693555623774071, + "learning_rate": 1.4697223344686256e-06, + "loss": 0.3416, + "step": 14780 + }, + { + "epoch": 0.8536550171712216, + "grad_norm": 5.752237694272574, + "learning_rate": 1.4639496622986782e-06, + "loss": 0.3348, + "step": 14790 + }, + { + "epoch": 0.8542322010908776, + "grad_norm": 7.135726397784308, + "learning_rate": 1.4581769901287306e-06, + "loss": 0.3378, + "step": 14800 + }, + { + "epoch": 0.8548093850105336, + "grad_norm": 3.2833774570611234, + "learning_rate": 1.4524043179587832e-06, + "loss": 0.3504, + "step": 14810 + }, + { + "epoch": 0.8553865689301896, + "grad_norm": 6.256462721047408, + "learning_rate": 1.4466316457888358e-06, + "loss": 0.349, + "step": 14820 + }, + { + "epoch": 0.8559637528498456, + "grad_norm": 6.040295806596955, + "learning_rate": 1.4408589736188882e-06, + "loss": 0.3208, + "step": 14830 + }, + { + "epoch": 0.8565409367695016, + "grad_norm": 4.729301211824621, + "learning_rate": 1.4350863014489408e-06, + "loss": 0.3277, + "step": 14840 + }, + { + "epoch": 0.8571181206891576, + "grad_norm": 4.313779706679082, + "learning_rate": 1.4293136292789934e-06, + "loss": 0.3306, + "step": 14850 + }, + { + "epoch": 0.8576953046088136, + "grad_norm": 11.790043476255672, + "learning_rate": 1.4235409571090458e-06, + "loss": 0.3391, + "step": 14860 + }, + { + "epoch": 0.8582724885284696, + "grad_norm": 7.642639050872643, + "learning_rate": 1.4177682849390984e-06, + "loss": 0.3388, + "step": 14870 + }, + { + "epoch": 0.8588496724481256, + "grad_norm": 7.260077362208394, + "learning_rate": 1.4119956127691508e-06, + "loss": 0.3422, + "step": 14880 + }, + { + "epoch": 0.8594268563677816, + "grad_norm": 4.754130590048299, + "learning_rate": 1.4062229405992034e-06, + "loss": 0.3436, + "step": 14890 + }, + { + "epoch": 0.8600040402874376, + "grad_norm": 6.225554657816755, + "learning_rate": 1.400450268429256e-06, + "loss": 0.3352, + "step": 14900 + }, + { + "epoch": 0.8605812242070936, + "grad_norm": 9.369073827925245, + "learning_rate": 1.3946775962593083e-06, + "loss": 0.3552, + "step": 14910 + }, + { + "epoch": 0.8611584081267496, + "grad_norm": 22.876915271022913, + "learning_rate": 1.388904924089361e-06, + "loss": 0.3492, + "step": 14920 + }, + { + "epoch": 0.8617355920464056, + "grad_norm": 3.2981136451706132, + "learning_rate": 1.3831322519194135e-06, + "loss": 0.3328, + "step": 14930 + }, + { + "epoch": 0.8623127759660616, + "grad_norm": 15.61626227774467, + "learning_rate": 1.3773595797494663e-06, + "loss": 0.337, + "step": 14940 + }, + { + "epoch": 0.8628899598857176, + "grad_norm": 3.605130100716397, + "learning_rate": 1.3715869075795187e-06, + "loss": 0.3385, + "step": 14950 + }, + { + "epoch": 0.8634671438053736, + "grad_norm": 12.284387392936685, + "learning_rate": 1.3658142354095713e-06, + "loss": 0.338, + "step": 14960 + }, + { + "epoch": 0.8640443277250296, + "grad_norm": 3.278014138295641, + "learning_rate": 1.360041563239624e-06, + "loss": 0.3337, + "step": 14970 + }, + { + "epoch": 0.8646215116446856, + "grad_norm": 6.104850658183799, + "learning_rate": 1.3542688910696763e-06, + "loss": 0.3276, + "step": 14980 + }, + { + "epoch": 0.8651986955643416, + "grad_norm": 5.279235884437097, + "learning_rate": 1.348496218899729e-06, + "loss": 0.3289, + "step": 14990 + }, + { + "epoch": 0.8657758794839976, + "grad_norm": 8.449467479597608, + "learning_rate": 1.3427235467297813e-06, + "loss": 0.3314, + "step": 15000 + }, + { + "epoch": 0.8663530634036536, + "grad_norm": 5.665683889458834, + "learning_rate": 1.3369508745598339e-06, + "loss": 0.3416, + "step": 15010 + }, + { + "epoch": 0.8669302473233096, + "grad_norm": 3.9335607560335735, + "learning_rate": 1.3311782023898865e-06, + "loss": 0.3525, + "step": 15020 + }, + { + "epoch": 0.8675074312429656, + "grad_norm": 2.894014459167942, + "learning_rate": 1.3254055302199389e-06, + "loss": 0.3328, + "step": 15030 + }, + { + "epoch": 0.8680846151626216, + "grad_norm": 65.75688988910291, + "learning_rate": 1.3196328580499915e-06, + "loss": 0.331, + "step": 15040 + }, + { + "epoch": 0.8686617990822776, + "grad_norm": 61.00701606633841, + "learning_rate": 1.313860185880044e-06, + "loss": 0.3407, + "step": 15050 + }, + { + "epoch": 0.8692389830019336, + "grad_norm": 4.364812051967769, + "learning_rate": 1.3080875137100965e-06, + "loss": 0.335, + "step": 15060 + }, + { + "epoch": 0.8698161669215896, + "grad_norm": 24.662511548298095, + "learning_rate": 1.302314841540149e-06, + "loss": 0.3348, + "step": 15070 + }, + { + "epoch": 0.8703933508412456, + "grad_norm": 4.160466497716753, + "learning_rate": 1.2965421693702016e-06, + "loss": 0.3252, + "step": 15080 + }, + { + "epoch": 0.8709705347609016, + "grad_norm": 6.134539334165056, + "learning_rate": 1.290769497200254e-06, + "loss": 0.343, + "step": 15090 + }, + { + "epoch": 0.8715477186805576, + "grad_norm": 7.228713295937482, + "learning_rate": 1.2849968250303066e-06, + "loss": 0.3338, + "step": 15100 + }, + { + "epoch": 0.8721249026002136, + "grad_norm": 4.918201123965152, + "learning_rate": 1.279224152860359e-06, + "loss": 0.3527, + "step": 15110 + }, + { + "epoch": 0.8727020865198696, + "grad_norm": 11.079488033226095, + "learning_rate": 1.2734514806904116e-06, + "loss": 0.3485, + "step": 15120 + }, + { + "epoch": 0.8732792704395256, + "grad_norm": 4.928609183215457, + "learning_rate": 1.2676788085204642e-06, + "loss": 0.3354, + "step": 15130 + }, + { + "epoch": 0.8738564543591816, + "grad_norm": 6.740385950730952, + "learning_rate": 1.2619061363505166e-06, + "loss": 0.333, + "step": 15140 + }, + { + "epoch": 0.8744336382788376, + "grad_norm": 4.67490770106929, + "learning_rate": 1.2561334641805692e-06, + "loss": 0.331, + "step": 15150 + }, + { + "epoch": 0.8750108221984936, + "grad_norm": 13.018881009854521, + "learning_rate": 1.2503607920106218e-06, + "loss": 0.3418, + "step": 15160 + }, + { + "epoch": 0.8755880061181496, + "grad_norm": 5.447550954730696, + "learning_rate": 1.2445881198406744e-06, + "loss": 0.3372, + "step": 15170 + }, + { + "epoch": 0.8761651900378056, + "grad_norm": 3.273410656701021, + "learning_rate": 1.238815447670727e-06, + "loss": 0.3199, + "step": 15180 + }, + { + "epoch": 0.8767423739574616, + "grad_norm": 4.277523732895546, + "learning_rate": 1.2330427755007794e-06, + "loss": 0.341, + "step": 15190 + }, + { + "epoch": 0.8773195578771176, + "grad_norm": 15.742782882783079, + "learning_rate": 1.227270103330832e-06, + "loss": 0.341, + "step": 15200 + }, + { + "epoch": 0.8778967417967736, + "grad_norm": 5.131013300289669, + "learning_rate": 1.2214974311608844e-06, + "loss": 0.3303, + "step": 15210 + }, + { + "epoch": 0.8784739257164296, + "grad_norm": 12.866336037106171, + "learning_rate": 1.215724758990937e-06, + "loss": 0.3442, + "step": 15220 + }, + { + "epoch": 0.8790511096360856, + "grad_norm": 5.127710193300859, + "learning_rate": 1.2099520868209895e-06, + "loss": 0.352, + "step": 15230 + }, + { + "epoch": 0.8796282935557416, + "grad_norm": 7.005612361267027, + "learning_rate": 1.204179414651042e-06, + "loss": 0.3376, + "step": 15240 + }, + { + "epoch": 0.8802054774753976, + "grad_norm": 19.994629507428854, + "learning_rate": 1.1984067424810945e-06, + "loss": 0.3366, + "step": 15250 + }, + { + "epoch": 0.8807826613950536, + "grad_norm": 19.289162587657003, + "learning_rate": 1.1926340703111471e-06, + "loss": 0.3339, + "step": 15260 + }, + { + "epoch": 0.8813598453147096, + "grad_norm": 5.780176197085242, + "learning_rate": 1.1868613981411997e-06, + "loss": 0.3328, + "step": 15270 + }, + { + "epoch": 0.8819370292343656, + "grad_norm": 5.322625710215796, + "learning_rate": 1.1810887259712523e-06, + "loss": 0.3364, + "step": 15280 + }, + { + "epoch": 0.8825142131540216, + "grad_norm": 4.160665784312188, + "learning_rate": 1.1753160538013047e-06, + "loss": 0.3298, + "step": 15290 + }, + { + "epoch": 0.8830913970736776, + "grad_norm": 39.76071118055878, + "learning_rate": 1.1695433816313573e-06, + "loss": 0.348, + "step": 15300 + }, + { + "epoch": 0.8836685809933336, + "grad_norm": 3.5659508030627958, + "learning_rate": 1.16377070946141e-06, + "loss": 0.3271, + "step": 15310 + }, + { + "epoch": 0.8842457649129896, + "grad_norm": 4.436695102429374, + "learning_rate": 1.1579980372914623e-06, + "loss": 0.3407, + "step": 15320 + }, + { + "epoch": 0.8848229488326456, + "grad_norm": 11.403167564876071, + "learning_rate": 1.1522253651215149e-06, + "loss": 0.3383, + "step": 15330 + }, + { + "epoch": 0.8854001327523016, + "grad_norm": 4.535641723893359, + "learning_rate": 1.1464526929515673e-06, + "loss": 0.3328, + "step": 15340 + }, + { + "epoch": 0.8859773166719576, + "grad_norm": 7.663265877666311, + "learning_rate": 1.1406800207816199e-06, + "loss": 0.3365, + "step": 15350 + }, + { + "epoch": 0.8865545005916136, + "grad_norm": 7.820505381715719, + "learning_rate": 1.1349073486116725e-06, + "loss": 0.3432, + "step": 15360 + }, + { + "epoch": 0.8871316845112696, + "grad_norm": 6.006599271526383, + "learning_rate": 1.1291346764417248e-06, + "loss": 0.3487, + "step": 15370 + }, + { + "epoch": 0.8877088684309256, + "grad_norm": 11.881435307105928, + "learning_rate": 1.1233620042717774e-06, + "loss": 0.3332, + "step": 15380 + }, + { + "epoch": 0.8882860523505816, + "grad_norm": 3.4819561819318103, + "learning_rate": 1.11758933210183e-06, + "loss": 0.335, + "step": 15390 + }, + { + "epoch": 0.8888632362702376, + "grad_norm": 6.893650513052578, + "learning_rate": 1.1118166599318826e-06, + "loss": 0.3493, + "step": 15400 + }, + { + "epoch": 0.8894404201898936, + "grad_norm": 6.925493159604081, + "learning_rate": 1.1060439877619352e-06, + "loss": 0.3448, + "step": 15410 + }, + { + "epoch": 0.8900176041095496, + "grad_norm": 5.5200268397262775, + "learning_rate": 1.1002713155919876e-06, + "loss": 0.3256, + "step": 15420 + }, + { + "epoch": 0.8905947880292056, + "grad_norm": 7.638890582405007, + "learning_rate": 1.0944986434220402e-06, + "loss": 0.3327, + "step": 15430 + }, + { + "epoch": 0.8911719719488616, + "grad_norm": 4.585636430659814, + "learning_rate": 1.0887259712520926e-06, + "loss": 0.3386, + "step": 15440 + }, + { + "epoch": 0.8917491558685176, + "grad_norm": 7.147252901738225, + "learning_rate": 1.0829532990821452e-06, + "loss": 0.3402, + "step": 15450 + }, + { + "epoch": 0.8923263397881736, + "grad_norm": 5.40597402568476, + "learning_rate": 1.0771806269121978e-06, + "loss": 0.3385, + "step": 15460 + }, + { + "epoch": 0.8929035237078295, + "grad_norm": 4.260919590930795, + "learning_rate": 1.0714079547422502e-06, + "loss": 0.3232, + "step": 15470 + }, + { + "epoch": 0.8934807076274855, + "grad_norm": 9.083532528696407, + "learning_rate": 1.0656352825723028e-06, + "loss": 0.354, + "step": 15480 + }, + { + "epoch": 0.8940578915471415, + "grad_norm": 4.996043459346209, + "learning_rate": 1.0598626104023554e-06, + "loss": 0.329, + "step": 15490 + }, + { + "epoch": 0.8946350754667975, + "grad_norm": 4.117500728065101, + "learning_rate": 1.0540899382324078e-06, + "loss": 0.3269, + "step": 15500 + }, + { + "epoch": 0.8952122593864535, + "grad_norm": 4.9248570764155435, + "learning_rate": 1.0483172660624604e-06, + "loss": 0.3321, + "step": 15510 + }, + { + "epoch": 0.8957894433061095, + "grad_norm": 4.696024737813827, + "learning_rate": 1.042544593892513e-06, + "loss": 0.3309, + "step": 15520 + }, + { + "epoch": 0.8963666272257655, + "grad_norm": 7.733578705149999, + "learning_rate": 1.0367719217225656e-06, + "loss": 0.3305, + "step": 15530 + }, + { + "epoch": 0.8969438111454215, + "grad_norm": 5.394346330597671, + "learning_rate": 1.0309992495526181e-06, + "loss": 0.3388, + "step": 15540 + }, + { + "epoch": 0.8975209950650774, + "grad_norm": 5.311667852016155, + "learning_rate": 1.0252265773826705e-06, + "loss": 0.3376, + "step": 15550 + }, + { + "epoch": 0.8980981789847334, + "grad_norm": 25.908831946398365, + "learning_rate": 1.0194539052127231e-06, + "loss": 0.3265, + "step": 15560 + }, + { + "epoch": 0.8986753629043894, + "grad_norm": 33.665974796607145, + "learning_rate": 1.0136812330427755e-06, + "loss": 0.3348, + "step": 15570 + }, + { + "epoch": 0.8992525468240454, + "grad_norm": 6.825259095726871, + "learning_rate": 1.0079085608728281e-06, + "loss": 0.3453, + "step": 15580 + }, + { + "epoch": 0.8998297307437014, + "grad_norm": 3.1735048623731092, + "learning_rate": 1.0021358887028807e-06, + "loss": 0.3218, + "step": 15590 + }, + { + "epoch": 0.9004069146633574, + "grad_norm": 9.114656814093948, + "learning_rate": 9.96363216532933e-07, + "loss": 0.3228, + "step": 15600 + }, + { + "epoch": 0.9009840985830134, + "grad_norm": 5.87155385500696, + "learning_rate": 9.905905443629857e-07, + "loss": 0.3309, + "step": 15610 + }, + { + "epoch": 0.9015612825026694, + "grad_norm": 14.717991716657272, + "learning_rate": 9.848178721930383e-07, + "loss": 0.3486, + "step": 15620 + }, + { + "epoch": 0.9021384664223254, + "grad_norm": 3.7767948462311067, + "learning_rate": 9.790452000230907e-07, + "loss": 0.3182, + "step": 15630 + }, + { + "epoch": 0.9027156503419814, + "grad_norm": 3.6787944993169006, + "learning_rate": 9.732725278531433e-07, + "loss": 0.3336, + "step": 15640 + }, + { + "epoch": 0.9032928342616374, + "grad_norm": 11.812395475228488, + "learning_rate": 9.674998556831959e-07, + "loss": 0.3286, + "step": 15650 + }, + { + "epoch": 0.9038700181812934, + "grad_norm": 5.186436534605756, + "learning_rate": 9.617271835132485e-07, + "loss": 0.3275, + "step": 15660 + }, + { + "epoch": 0.9044472021009494, + "grad_norm": 4.424154650673984, + "learning_rate": 9.559545113433009e-07, + "loss": 0.3185, + "step": 15670 + }, + { + "epoch": 0.9050243860206054, + "grad_norm": 6.3580022358578105, + "learning_rate": 9.501818391733534e-07, + "loss": 0.3226, + "step": 15680 + }, + { + "epoch": 0.9056015699402614, + "grad_norm": 6.310999676892304, + "learning_rate": 9.444091670034059e-07, + "loss": 0.3304, + "step": 15690 + }, + { + "epoch": 0.9061787538599174, + "grad_norm": 3.7913010315172633, + "learning_rate": 9.386364948334585e-07, + "loss": 0.3412, + "step": 15700 + }, + { + "epoch": 0.9067559377795734, + "grad_norm": 3.67407928861924, + "learning_rate": 9.32863822663511e-07, + "loss": 0.341, + "step": 15710 + }, + { + "epoch": 0.9073331216992294, + "grad_norm": 7.334570042771433, + "learning_rate": 9.270911504935635e-07, + "loss": 0.3265, + "step": 15720 + }, + { + "epoch": 0.9079103056188854, + "grad_norm": 6.456404235720165, + "learning_rate": 9.213184783236161e-07, + "loss": 0.3444, + "step": 15730 + }, + { + "epoch": 0.9084874895385414, + "grad_norm": 4.201612752804453, + "learning_rate": 9.155458061536686e-07, + "loss": 0.348, + "step": 15740 + }, + { + "epoch": 0.9090646734581974, + "grad_norm": 5.071365974662106, + "learning_rate": 9.097731339837211e-07, + "loss": 0.3153, + "step": 15750 + }, + { + "epoch": 0.9096418573778534, + "grad_norm": 5.035641882967374, + "learning_rate": 9.040004618137736e-07, + "loss": 0.3359, + "step": 15760 + }, + { + "epoch": 0.9102190412975094, + "grad_norm": 12.03037859002001, + "learning_rate": 8.982277896438262e-07, + "loss": 0.3242, + "step": 15770 + }, + { + "epoch": 0.9107962252171654, + "grad_norm": 7.140437657447056, + "learning_rate": 8.924551174738787e-07, + "loss": 0.3333, + "step": 15780 + }, + { + "epoch": 0.9113734091368214, + "grad_norm": 8.294728901327792, + "learning_rate": 8.866824453039313e-07, + "loss": 0.3267, + "step": 15790 + }, + { + "epoch": 0.9119505930564774, + "grad_norm": 5.258892430294796, + "learning_rate": 8.809097731339839e-07, + "loss": 0.3306, + "step": 15800 + }, + { + "epoch": 0.9125277769761334, + "grad_norm": 4.56809810586393, + "learning_rate": 8.751371009640364e-07, + "loss": 0.3354, + "step": 15810 + }, + { + "epoch": 0.9131049608957894, + "grad_norm": 8.754714473224658, + "learning_rate": 8.693644287940889e-07, + "loss": 0.3258, + "step": 15820 + }, + { + "epoch": 0.9136821448154454, + "grad_norm": 10.722318127648052, + "learning_rate": 8.635917566241415e-07, + "loss": 0.3251, + "step": 15830 + }, + { + "epoch": 0.9142593287351014, + "grad_norm": 17.100240147200765, + "learning_rate": 8.578190844541939e-07, + "loss": 0.3243, + "step": 15840 + }, + { + "epoch": 0.9148365126547574, + "grad_norm": 6.487613210408211, + "learning_rate": 8.520464122842464e-07, + "loss": 0.3299, + "step": 15850 + }, + { + "epoch": 0.9154136965744134, + "grad_norm": 3.047470063867609, + "learning_rate": 8.462737401142989e-07, + "loss": 0.3277, + "step": 15860 + }, + { + "epoch": 0.9159908804940694, + "grad_norm": 3.3099261534656823, + "learning_rate": 8.405010679443515e-07, + "loss": 0.3225, + "step": 15870 + }, + { + "epoch": 0.9165680644137254, + "grad_norm": 12.904829402744845, + "learning_rate": 8.34728395774404e-07, + "loss": 0.3456, + "step": 15880 + }, + { + "epoch": 0.9171452483333814, + "grad_norm": 3.874918781355711, + "learning_rate": 8.289557236044565e-07, + "loss": 0.324, + "step": 15890 + }, + { + "epoch": 0.9177224322530374, + "grad_norm": 8.856329412411933, + "learning_rate": 8.231830514345091e-07, + "loss": 0.3336, + "step": 15900 + }, + { + "epoch": 0.9182996161726934, + "grad_norm": 8.142861706815804, + "learning_rate": 8.174103792645616e-07, + "loss": 0.3346, + "step": 15910 + }, + { + "epoch": 0.9188768000923494, + "grad_norm": 6.512456297032329, + "learning_rate": 8.116377070946142e-07, + "loss": 0.3356, + "step": 15920 + }, + { + "epoch": 0.9194539840120054, + "grad_norm": 5.20827680094837, + "learning_rate": 8.058650349246668e-07, + "loss": 0.327, + "step": 15930 + }, + { + "epoch": 0.9200311679316614, + "grad_norm": 3.6566931406552166, + "learning_rate": 8.000923627547193e-07, + "loss": 0.3393, + "step": 15940 + }, + { + "epoch": 0.9206083518513174, + "grad_norm": 5.430549059652793, + "learning_rate": 7.943196905847718e-07, + "loss": 0.336, + "step": 15950 + }, + { + "epoch": 0.9211855357709734, + "grad_norm": 11.48641695737308, + "learning_rate": 7.885470184148243e-07, + "loss": 0.3239, + "step": 15960 + }, + { + "epoch": 0.9217627196906294, + "grad_norm": 5.74247518929047, + "learning_rate": 7.827743462448769e-07, + "loss": 0.3272, + "step": 15970 + }, + { + "epoch": 0.9223399036102854, + "grad_norm": 2.9972434937325954, + "learning_rate": 7.770016740749293e-07, + "loss": 0.3444, + "step": 15980 + }, + { + "epoch": 0.9229170875299414, + "grad_norm": 4.3485773330395405, + "learning_rate": 7.712290019049818e-07, + "loss": 0.3343, + "step": 15990 + }, + { + "epoch": 0.9234942714495974, + "grad_norm": 5.8420315281490725, + "learning_rate": 7.654563297350344e-07, + "loss": 0.3418, + "step": 16000 + }, + { + "epoch": 0.9240714553692534, + "grad_norm": 3.3727101894208924, + "learning_rate": 7.596836575650869e-07, + "loss": 0.3313, + "step": 16010 + }, + { + "epoch": 0.9246486392889094, + "grad_norm": 4.3271834892078305, + "learning_rate": 7.539109853951394e-07, + "loss": 0.3263, + "step": 16020 + }, + { + "epoch": 0.9252258232085654, + "grad_norm": 4.114539060448778, + "learning_rate": 7.481383132251919e-07, + "loss": 0.3494, + "step": 16030 + }, + { + "epoch": 0.9258030071282214, + "grad_norm": 3.6454496106451946, + "learning_rate": 7.423656410552445e-07, + "loss": 0.3287, + "step": 16040 + }, + { + "epoch": 0.9263801910478774, + "grad_norm": 3.738978996136776, + "learning_rate": 7.365929688852971e-07, + "loss": 0.3463, + "step": 16050 + }, + { + "epoch": 0.9269573749675334, + "grad_norm": 4.789282815458411, + "learning_rate": 7.308202967153497e-07, + "loss": 0.3237, + "step": 16060 + }, + { + "epoch": 0.9275345588871894, + "grad_norm": 9.761730219338645, + "learning_rate": 7.250476245454022e-07, + "loss": 0.3469, + "step": 16070 + }, + { + "epoch": 0.9281117428068454, + "grad_norm": 3.028208359321862, + "learning_rate": 7.192749523754547e-07, + "loss": 0.3355, + "step": 16080 + }, + { + "epoch": 0.9286889267265014, + "grad_norm": 18.45944225954803, + "learning_rate": 7.135022802055072e-07, + "loss": 0.3177, + "step": 16090 + }, + { + "epoch": 0.9292661106461574, + "grad_norm": 7.253195406338047, + "learning_rate": 7.077296080355598e-07, + "loss": 0.3439, + "step": 16100 + }, + { + "epoch": 0.9298432945658134, + "grad_norm": 4.739735623729803, + "learning_rate": 7.019569358656123e-07, + "loss": 0.3357, + "step": 16110 + }, + { + "epoch": 0.9304204784854694, + "grad_norm": 7.262658023729907, + "learning_rate": 6.961842636956648e-07, + "loss": 0.3307, + "step": 16120 + }, + { + "epoch": 0.9309976624051254, + "grad_norm": 4.3276336459159275, + "learning_rate": 6.904115915257172e-07, + "loss": 0.3397, + "step": 16130 + }, + { + "epoch": 0.9315748463247814, + "grad_norm": 4.773618213493451, + "learning_rate": 6.846389193557698e-07, + "loss": 0.3265, + "step": 16140 + }, + { + "epoch": 0.9321520302444374, + "grad_norm": 3.469713298213091, + "learning_rate": 6.788662471858223e-07, + "loss": 0.3356, + "step": 16150 + }, + { + "epoch": 0.9327292141640934, + "grad_norm": 2.680891067948385, + "learning_rate": 6.730935750158748e-07, + "loss": 0.3351, + "step": 16160 + }, + { + "epoch": 0.9333063980837494, + "grad_norm": 4.795647821005584, + "learning_rate": 6.673209028459274e-07, + "loss": 0.316, + "step": 16170 + }, + { + "epoch": 0.9338835820034054, + "grad_norm": 4.072329465945383, + "learning_rate": 6.615482306759799e-07, + "loss": 0.3323, + "step": 16180 + }, + { + "epoch": 0.9344607659230614, + "grad_norm": 6.139834777308556, + "learning_rate": 6.557755585060325e-07, + "loss": 0.3389, + "step": 16190 + }, + { + "epoch": 0.9350379498427174, + "grad_norm": 6.679020147433282, + "learning_rate": 6.500028863360851e-07, + "loss": 0.3213, + "step": 16200 + }, + { + "epoch": 0.9356151337623734, + "grad_norm": 3.931169778597499, + "learning_rate": 6.442302141661376e-07, + "loss": 0.3202, + "step": 16210 + }, + { + "epoch": 0.9361923176820294, + "grad_norm": 9.793848920416451, + "learning_rate": 6.384575419961901e-07, + "loss": 0.3312, + "step": 16220 + }, + { + "epoch": 0.9367695016016854, + "grad_norm": 3.165076810544466, + "learning_rate": 6.326848698262427e-07, + "loss": 0.3268, + "step": 16230 + }, + { + "epoch": 0.9373466855213414, + "grad_norm": 11.613653214904037, + "learning_rate": 6.269121976562952e-07, + "loss": 0.3395, + "step": 16240 + }, + { + "epoch": 0.9379238694409974, + "grad_norm": 8.287793663837986, + "learning_rate": 6.211395254863477e-07, + "loss": 0.337, + "step": 16250 + }, + { + "epoch": 0.9385010533606534, + "grad_norm": 4.919246593010951, + "learning_rate": 6.153668533164002e-07, + "loss": 0.3183, + "step": 16260 + }, + { + "epoch": 0.9390782372803094, + "grad_norm": 12.710584575769794, + "learning_rate": 6.095941811464528e-07, + "loss": 0.3371, + "step": 16270 + }, + { + "epoch": 0.9396554211999654, + "grad_norm": 13.642759112617313, + "learning_rate": 6.038215089765054e-07, + "loss": 0.3364, + "step": 16280 + }, + { + "epoch": 0.9402326051196214, + "grad_norm": 3.2163528904964567, + "learning_rate": 5.980488368065578e-07, + "loss": 0.3482, + "step": 16290 + }, + { + "epoch": 0.9408097890392774, + "grad_norm": 6.3234687340970845, + "learning_rate": 5.922761646366103e-07, + "loss": 0.3389, + "step": 16300 + }, + { + "epoch": 0.9413869729589334, + "grad_norm": 4.810430576924547, + "learning_rate": 5.865034924666628e-07, + "loss": 0.3273, + "step": 16310 + }, + { + "epoch": 0.9419641568785894, + "grad_norm": 16.667420292327467, + "learning_rate": 5.807308202967154e-07, + "loss": 0.3333, + "step": 16320 + }, + { + "epoch": 0.9425413407982454, + "grad_norm": 4.476842181947663, + "learning_rate": 5.749581481267679e-07, + "loss": 0.3319, + "step": 16330 + }, + { + "epoch": 0.9431185247179014, + "grad_norm": 2.753605309820116, + "learning_rate": 5.691854759568204e-07, + "loss": 0.324, + "step": 16340 + }, + { + "epoch": 0.9436957086375574, + "grad_norm": 7.258998836239923, + "learning_rate": 5.63412803786873e-07, + "loss": 0.3317, + "step": 16350 + }, + { + "epoch": 0.9442728925572134, + "grad_norm": 3.3810984203362513, + "learning_rate": 5.576401316169255e-07, + "loss": 0.3308, + "step": 16360 + }, + { + "epoch": 0.9448500764768694, + "grad_norm": 10.448881769543355, + "learning_rate": 5.518674594469781e-07, + "loss": 0.3389, + "step": 16370 + }, + { + "epoch": 0.9454272603965254, + "grad_norm": 7.485384669846898, + "learning_rate": 5.460947872770306e-07, + "loss": 0.3225, + "step": 16380 + }, + { + "epoch": 0.9460044443161814, + "grad_norm": 7.048976390316521, + "learning_rate": 5.403221151070831e-07, + "loss": 0.3316, + "step": 16390 + }, + { + "epoch": 0.9465816282358374, + "grad_norm": 5.343786757617583, + "learning_rate": 5.345494429371357e-07, + "loss": 0.324, + "step": 16400 + }, + { + "epoch": 0.9471588121554934, + "grad_norm": 8.606885533079547, + "learning_rate": 5.287767707671882e-07, + "loss": 0.339, + "step": 16410 + }, + { + "epoch": 0.9477359960751494, + "grad_norm": 5.1476348800105205, + "learning_rate": 5.230040985972408e-07, + "loss": 0.3316, + "step": 16420 + }, + { + "epoch": 0.9483131799948054, + "grad_norm": 3.9518622190419386, + "learning_rate": 5.172314264272933e-07, + "loss": 0.3257, + "step": 16430 + }, + { + "epoch": 0.9488903639144614, + "grad_norm": 4.759115152912856, + "learning_rate": 5.114587542573457e-07, + "loss": 0.3252, + "step": 16440 + }, + { + "epoch": 0.9494675478341174, + "grad_norm": 2.3577377378728936, + "learning_rate": 5.056860820873983e-07, + "loss": 0.3354, + "step": 16450 + }, + { + "epoch": 0.9500447317537734, + "grad_norm": 8.233491658698778, + "learning_rate": 4.999134099174508e-07, + "loss": 0.3269, + "step": 16460 + }, + { + "epoch": 0.9506219156734294, + "grad_norm": 5.830593517325124, + "learning_rate": 4.941407377475033e-07, + "loss": 0.3303, + "step": 16470 + }, + { + "epoch": 0.9511990995930854, + "grad_norm": 4.5976658225857205, + "learning_rate": 4.883680655775559e-07, + "loss": 0.3164, + "step": 16480 + }, + { + "epoch": 0.9517762835127414, + "grad_norm": 2.683143832655395, + "learning_rate": 4.825953934076084e-07, + "loss": 0.3389, + "step": 16490 + }, + { + "epoch": 0.9523534674323973, + "grad_norm": 6.256568384332184, + "learning_rate": 4.7682272123766096e-07, + "loss": 0.3379, + "step": 16500 + }, + { + "epoch": 0.9529306513520533, + "grad_norm": 5.947037852710701, + "learning_rate": 4.710500490677135e-07, + "loss": 0.3336, + "step": 16510 + }, + { + "epoch": 0.9535078352717093, + "grad_norm": 3.9206339656766183, + "learning_rate": 4.65277376897766e-07, + "loss": 0.3416, + "step": 16520 + }, + { + "epoch": 0.9540850191913653, + "grad_norm": 5.305934878449426, + "learning_rate": 4.5950470472781854e-07, + "loss": 0.343, + "step": 16530 + }, + { + "epoch": 0.9546622031110213, + "grad_norm": 4.905648955862364, + "learning_rate": 4.53732032557871e-07, + "loss": 0.3326, + "step": 16540 + }, + { + "epoch": 0.9552393870306773, + "grad_norm": 6.934144679851784, + "learning_rate": 4.479593603879236e-07, + "loss": 0.3315, + "step": 16550 + }, + { + "epoch": 0.9558165709503333, + "grad_norm": 6.121333752853476, + "learning_rate": 4.4218668821797617e-07, + "loss": 0.3337, + "step": 16560 + }, + { + "epoch": 0.9563937548699893, + "grad_norm": 4.161869077945622, + "learning_rate": 4.3641401604802866e-07, + "loss": 0.354, + "step": 16570 + }, + { + "epoch": 0.9569709387896453, + "grad_norm": 4.792938959925312, + "learning_rate": 4.306413438780812e-07, + "loss": 0.3385, + "step": 16580 + }, + { + "epoch": 0.9575481227093013, + "grad_norm": 13.85786954380734, + "learning_rate": 4.248686717081337e-07, + "loss": 0.3206, + "step": 16590 + }, + { + "epoch": 0.9581253066289573, + "grad_norm": 21.263443082950594, + "learning_rate": 4.1909599953818624e-07, + "loss": 0.3325, + "step": 16600 + }, + { + "epoch": 0.9587024905486133, + "grad_norm": 3.660403999109124, + "learning_rate": 4.1332332736823884e-07, + "loss": 0.3227, + "step": 16610 + }, + { + "epoch": 0.9592796744682693, + "grad_norm": 3.9235176913649994, + "learning_rate": 4.075506551982913e-07, + "loss": 0.3283, + "step": 16620 + }, + { + "epoch": 0.9598568583879253, + "grad_norm": 5.6449372673837965, + "learning_rate": 4.0177798302834387e-07, + "loss": 0.3427, + "step": 16630 + }, + { + "epoch": 0.9604340423075813, + "grad_norm": 5.248416354277083, + "learning_rate": 3.9600531085839636e-07, + "loss": 0.3288, + "step": 16640 + }, + { + "epoch": 0.9610112262272373, + "grad_norm": 8.246345220378487, + "learning_rate": 3.902326386884489e-07, + "loss": 0.3354, + "step": 16650 + }, + { + "epoch": 0.9615884101468933, + "grad_norm": 13.274950590494653, + "learning_rate": 3.8445996651850145e-07, + "loss": 0.3312, + "step": 16660 + }, + { + "epoch": 0.9621655940665493, + "grad_norm": 15.537361667999631, + "learning_rate": 3.7868729434855394e-07, + "loss": 0.3237, + "step": 16670 + }, + { + "epoch": 0.9627427779862053, + "grad_norm": 17.30099668558429, + "learning_rate": 3.7291462217860654e-07, + "loss": 0.3447, + "step": 16680 + }, + { + "epoch": 0.9633199619058613, + "grad_norm": 6.00682814280853, + "learning_rate": 3.671419500086591e-07, + "loss": 0.3285, + "step": 16690 + }, + { + "epoch": 0.9638971458255173, + "grad_norm": 4.337349097771177, + "learning_rate": 3.613692778387116e-07, + "loss": 0.3406, + "step": 16700 + }, + { + "epoch": 0.9644743297451733, + "grad_norm": 5.112346889090425, + "learning_rate": 3.555966056687641e-07, + "loss": 0.3245, + "step": 16710 + }, + { + "epoch": 0.9650515136648293, + "grad_norm": 3.333315383479396, + "learning_rate": 3.498239334988166e-07, + "loss": 0.323, + "step": 16720 + }, + { + "epoch": 0.9656286975844853, + "grad_norm": 6.363838641104665, + "learning_rate": 3.4405126132886915e-07, + "loss": 0.3335, + "step": 16730 + }, + { + "epoch": 0.9662058815041413, + "grad_norm": 4.4727853159969095, + "learning_rate": 3.3827858915892164e-07, + "loss": 0.3195, + "step": 16740 + }, + { + "epoch": 0.9667830654237973, + "grad_norm": 2.9245486184268525, + "learning_rate": 3.3250591698897424e-07, + "loss": 0.334, + "step": 16750 + }, + { + "epoch": 0.9673602493434533, + "grad_norm": 8.292114221205217, + "learning_rate": 3.267332448190268e-07, + "loss": 0.3417, + "step": 16760 + }, + { + "epoch": 0.9679374332631093, + "grad_norm": 4.467404105277962, + "learning_rate": 3.209605726490793e-07, + "loss": 0.3273, + "step": 16770 + }, + { + "epoch": 0.9685146171827653, + "grad_norm": 3.555136063724782, + "learning_rate": 3.151879004791318e-07, + "loss": 0.3319, + "step": 16780 + }, + { + "epoch": 0.9690918011024213, + "grad_norm": 3.5864439964386206, + "learning_rate": 3.0941522830918436e-07, + "loss": 0.3151, + "step": 16790 + }, + { + "epoch": 0.9696689850220773, + "grad_norm": 4.286014953806982, + "learning_rate": 3.036425561392369e-07, + "loss": 0.3348, + "step": 16800 + }, + { + "epoch": 0.9702461689417333, + "grad_norm": 4.937397445129751, + "learning_rate": 2.978698839692894e-07, + "loss": 0.3447, + "step": 16810 + }, + { + "epoch": 0.9708233528613893, + "grad_norm": 4.053983936820117, + "learning_rate": 2.9209721179934194e-07, + "loss": 0.3234, + "step": 16820 + }, + { + "epoch": 0.9714005367810453, + "grad_norm": 4.668613443745286, + "learning_rate": 2.863245396293945e-07, + "loss": 0.3318, + "step": 16830 + }, + { + "epoch": 0.9719777207007013, + "grad_norm": 4.726308017445137, + "learning_rate": 2.80551867459447e-07, + "loss": 0.3384, + "step": 16840 + }, + { + "epoch": 0.9725549046203573, + "grad_norm": 5.20234640635383, + "learning_rate": 2.747791952894995e-07, + "loss": 0.3414, + "step": 16850 + }, + { + "epoch": 0.9731320885400133, + "grad_norm": 5.471268202808402, + "learning_rate": 2.6900652311955207e-07, + "loss": 0.3158, + "step": 16860 + }, + { + "epoch": 0.9737092724596693, + "grad_norm": 4.041775287210815, + "learning_rate": 2.632338509496046e-07, + "loss": 0.3353, + "step": 16870 + }, + { + "epoch": 0.9742864563793253, + "grad_norm": 6.9340722075810515, + "learning_rate": 2.574611787796571e-07, + "loss": 0.3292, + "step": 16880 + }, + { + "epoch": 0.9748636402989813, + "grad_norm": 5.462231359128078, + "learning_rate": 2.5168850660970965e-07, + "loss": 0.3382, + "step": 16890 + }, + { + "epoch": 0.9754408242186373, + "grad_norm": 5.016835747194534, + "learning_rate": 2.459158344397622e-07, + "loss": 0.3264, + "step": 16900 + }, + { + "epoch": 0.9760180081382933, + "grad_norm": 6.59783102359862, + "learning_rate": 2.4014316226981474e-07, + "loss": 0.3303, + "step": 16910 + }, + { + "epoch": 0.9765951920579493, + "grad_norm": 11.129353025607179, + "learning_rate": 2.3437049009986723e-07, + "loss": 0.3196, + "step": 16920 + }, + { + "epoch": 0.9771723759776053, + "grad_norm": 5.828386789897742, + "learning_rate": 2.285978179299198e-07, + "loss": 0.3319, + "step": 16930 + }, + { + "epoch": 0.9777495598972613, + "grad_norm": 2.710691717608737, + "learning_rate": 2.2282514575997232e-07, + "loss": 0.3319, + "step": 16940 + }, + { + "epoch": 0.9783267438169173, + "grad_norm": 5.1520597373996715, + "learning_rate": 2.1705247359002483e-07, + "loss": 0.3314, + "step": 16950 + }, + { + "epoch": 0.9789039277365733, + "grad_norm": 3.185806720570308, + "learning_rate": 2.1127980142007738e-07, + "loss": 0.3296, + "step": 16960 + }, + { + "epoch": 0.9794811116562293, + "grad_norm": 6.515634970555692, + "learning_rate": 2.055071292501299e-07, + "loss": 0.3312, + "step": 16970 + }, + { + "epoch": 0.9800582955758853, + "grad_norm": 10.485461655446002, + "learning_rate": 1.997344570801824e-07, + "loss": 0.3422, + "step": 16980 + }, + { + "epoch": 0.9806354794955413, + "grad_norm": 3.8847690688300727, + "learning_rate": 1.9396178491023498e-07, + "loss": 0.3283, + "step": 16990 + }, + { + "epoch": 0.9812126634151973, + "grad_norm": 9.994920110996672, + "learning_rate": 1.881891127402875e-07, + "loss": 0.3327, + "step": 17000 + }, + { + "epoch": 0.9817898473348533, + "grad_norm": 5.025058096183087, + "learning_rate": 1.8241644057034002e-07, + "loss": 0.3207, + "step": 17010 + }, + { + "epoch": 0.9823670312545093, + "grad_norm": 11.136774459380124, + "learning_rate": 1.7664376840039256e-07, + "loss": 0.3233, + "step": 17020 + }, + { + "epoch": 0.9829442151741652, + "grad_norm": 2.906968928719714, + "learning_rate": 1.708710962304451e-07, + "loss": 0.335, + "step": 17030 + }, + { + "epoch": 0.9835213990938212, + "grad_norm": 47.22379036340986, + "learning_rate": 1.6509842406049762e-07, + "loss": 0.326, + "step": 17040 + }, + { + "epoch": 0.9840985830134772, + "grad_norm": 3.5068334227447537, + "learning_rate": 1.5932575189055014e-07, + "loss": 0.3308, + "step": 17050 + }, + { + "epoch": 0.9846757669331332, + "grad_norm": 6.058207244664307, + "learning_rate": 1.5355307972060266e-07, + "loss": 0.336, + "step": 17060 + }, + { + "epoch": 0.9852529508527892, + "grad_norm": 14.19694786309551, + "learning_rate": 1.477804075506552e-07, + "loss": 0.3482, + "step": 17070 + }, + { + "epoch": 0.9858301347724452, + "grad_norm": 4.351680537742745, + "learning_rate": 1.4200773538070775e-07, + "loss": 0.3533, + "step": 17080 + }, + { + "epoch": 0.9864073186921012, + "grad_norm": 11.029248449585278, + "learning_rate": 1.3623506321076027e-07, + "loss": 0.3196, + "step": 17090 + }, + { + "epoch": 0.9869845026117572, + "grad_norm": 5.081238129188481, + "learning_rate": 1.304623910408128e-07, + "loss": 0.333, + "step": 17100 + }, + { + "epoch": 0.9875616865314132, + "grad_norm": 23.011375672208313, + "learning_rate": 1.2468971887086533e-07, + "loss": 0.3353, + "step": 17110 + }, + { + "epoch": 0.9881388704510692, + "grad_norm": 3.615170936112003, + "learning_rate": 1.1891704670091786e-07, + "loss": 0.3293, + "step": 17120 + }, + { + "epoch": 0.9887160543707252, + "grad_norm": 6.0241909835288645, + "learning_rate": 1.131443745309704e-07, + "loss": 0.3305, + "step": 17130 + }, + { + "epoch": 0.9892932382903812, + "grad_norm": 6.4242440293309, + "learning_rate": 1.0737170236102292e-07, + "loss": 0.3229, + "step": 17140 + }, + { + "epoch": 0.9898704222100372, + "grad_norm": 4.8207248692315465, + "learning_rate": 1.0159903019107546e-07, + "loss": 0.3317, + "step": 17150 + }, + { + "epoch": 0.9904476061296932, + "grad_norm": 4.674342685671797, + "learning_rate": 9.5826358021128e-08, + "loss": 0.328, + "step": 17160 + }, + { + "epoch": 0.9910247900493492, + "grad_norm": 5.524320830604144, + "learning_rate": 9.005368585118051e-08, + "loss": 0.3297, + "step": 17170 + }, + { + "epoch": 0.9916019739690052, + "grad_norm": 6.1310872624369175, + "learning_rate": 8.428101368123306e-08, + "loss": 0.3342, + "step": 17180 + }, + { + "epoch": 0.9921791578886612, + "grad_norm": 4.736837397124582, + "learning_rate": 7.850834151128557e-08, + "loss": 0.3261, + "step": 17190 + }, + { + "epoch": 0.9927563418083172, + "grad_norm": 3.3135028507038498, + "learning_rate": 7.273566934133812e-08, + "loss": 0.3125, + "step": 17200 + }, + { + "epoch": 0.9933335257279732, + "grad_norm": 4.80914101916905, + "learning_rate": 6.696299717139064e-08, + "loss": 0.3233, + "step": 17210 + }, + { + "epoch": 0.9939107096476292, + "grad_norm": 4.178676432109751, + "learning_rate": 6.119032500144317e-08, + "loss": 0.316, + "step": 17220 + }, + { + "epoch": 0.9944878935672852, + "grad_norm": 4.648278510632473, + "learning_rate": 5.5417652831495705e-08, + "loss": 0.3247, + "step": 17230 + }, + { + "epoch": 0.9950650774869412, + "grad_norm": 6.4221153929916515, + "learning_rate": 4.9644980661548236e-08, + "loss": 0.3252, + "step": 17240 + }, + { + "epoch": 0.9956422614065972, + "grad_norm": 4.57910460292981, + "learning_rate": 4.3872308491600766e-08, + "loss": 0.346, + "step": 17250 + }, + { + "epoch": 0.9962194453262532, + "grad_norm": 6.013404774431674, + "learning_rate": 3.80996363216533e-08, + "loss": 0.3286, + "step": 17260 + }, + { + "epoch": 0.9967966292459092, + "grad_norm": 4.660240371593313, + "learning_rate": 3.232696415170583e-08, + "loss": 0.3238, + "step": 17270 + }, + { + "epoch": 0.9973738131655652, + "grad_norm": 3.0170050218995734, + "learning_rate": 2.6554291981758356e-08, + "loss": 0.3399, + "step": 17280 + }, + { + "epoch": 0.9979509970852212, + "grad_norm": 3.235155684245671, + "learning_rate": 2.078161981181089e-08, + "loss": 0.343, + "step": 17290 + }, + { + "epoch": 0.9985281810048772, + "grad_norm": 3.668276198786728, + "learning_rate": 1.500894764186342e-08, + "loss": 0.3299, + "step": 17300 + }, + { + "epoch": 0.9991053649245332, + "grad_norm": 5.359004024139474, + "learning_rate": 9.23627547191595e-09, + "loss": 0.3202, + "step": 17310 + }, + { + "epoch": 0.9996825488441892, + "grad_norm": 5.70686001442703, + "learning_rate": 3.4636033019684815e-09, + "loss": 0.3161, + "step": 17320 + } + ], + "logging_steps": 10, + "max_steps": 17325, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9935909437571072.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}