{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999030350043634, "eval_steps": 500, "global_step": 2578, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003878599825463008, "grad_norm": 48.88943947289956, "learning_rate": 3.875968992248062e-08, "loss": 3.5013, "step": 1 }, { "epoch": 0.0007757199650926016, "grad_norm": 45.81863572981332, "learning_rate": 7.751937984496124e-08, "loss": 3.5358, "step": 2 }, { "epoch": 0.0011635799476389023, "grad_norm": 48.78508596173693, "learning_rate": 1.1627906976744187e-07, "loss": 3.5737, "step": 3 }, { "epoch": 0.0015514399301852031, "grad_norm": 49.403992966606346, "learning_rate": 1.5503875968992249e-07, "loss": 3.7577, "step": 4 }, { "epoch": 0.0019392999127315039, "grad_norm": 48.298573056808266, "learning_rate": 1.9379844961240311e-07, "loss": 3.6249, "step": 5 }, { "epoch": 0.0023271598952778047, "grad_norm": 48.31932116761711, "learning_rate": 2.3255813953488374e-07, "loss": 3.5806, "step": 6 }, { "epoch": 0.0027150198778241054, "grad_norm": 47.37433022034094, "learning_rate": 2.7131782945736437e-07, "loss": 3.4885, "step": 7 }, { "epoch": 0.0031028798603704062, "grad_norm": 47.08608025839516, "learning_rate": 3.1007751937984497e-07, "loss": 3.4017, "step": 8 }, { "epoch": 0.003490739842916707, "grad_norm": 47.6986792380859, "learning_rate": 3.488372093023256e-07, "loss": 3.4608, "step": 9 }, { "epoch": 0.0038785998254630078, "grad_norm": 45.34880107243838, "learning_rate": 3.8759689922480623e-07, "loss": 3.4218, "step": 10 }, { "epoch": 0.004266459808009309, "grad_norm": 44.81041994270127, "learning_rate": 4.2635658914728683e-07, "loss": 3.4144, "step": 11 }, { "epoch": 0.004654319790555609, "grad_norm": 48.79405571252639, "learning_rate": 4.651162790697675e-07, "loss": 3.4638, "step": 12 }, { "epoch": 0.0050421797731019105, "grad_norm": 46.65773969304416, "learning_rate": 5.038759689922481e-07, "loss": 3.4682, "step": 13 }, { "epoch": 0.005430039755648211, "grad_norm": 43.61470508283465, "learning_rate": 5.426356589147287e-07, "loss": 3.2096, "step": 14 }, { "epoch": 0.005817899738194512, "grad_norm": 43.17168837132768, "learning_rate": 5.813953488372094e-07, "loss": 3.3099, "step": 15 }, { "epoch": 0.0062057597207408124, "grad_norm": 42.90478056070991, "learning_rate": 6.201550387596899e-07, "loss": 3.2888, "step": 16 }, { "epoch": 0.006593619703287114, "grad_norm": 41.147268517298926, "learning_rate": 6.589147286821707e-07, "loss": 3.3341, "step": 17 }, { "epoch": 0.006981479685833414, "grad_norm": 39.04412788309508, "learning_rate": 6.976744186046513e-07, "loss": 3.1116, "step": 18 }, { "epoch": 0.007369339668379715, "grad_norm": 39.18952211982552, "learning_rate": 7.364341085271319e-07, "loss": 3.1998, "step": 19 }, { "epoch": 0.0077571996509260156, "grad_norm": 36.113021123094846, "learning_rate": 7.751937984496125e-07, "loss": 3.0373, "step": 20 }, { "epoch": 0.008145059633472316, "grad_norm": 37.64123438986915, "learning_rate": 8.139534883720931e-07, "loss": 3.0867, "step": 21 }, { "epoch": 0.008532919616018618, "grad_norm": 26.21332880548061, "learning_rate": 8.527131782945737e-07, "loss": 2.5384, "step": 22 }, { "epoch": 0.008920779598564918, "grad_norm": 24.389687896409562, "learning_rate": 8.914728682170544e-07, "loss": 2.58, "step": 23 }, { "epoch": 0.009308639581111219, "grad_norm": 22.4320654614167, "learning_rate": 9.30232558139535e-07, "loss": 2.474, "step": 24 }, { "epoch": 0.009696499563657519, "grad_norm": 23.809030725855873, "learning_rate": 9.689922480620157e-07, "loss": 2.5039, "step": 25 }, { "epoch": 0.010084359546203821, "grad_norm": 22.446406796271678, "learning_rate": 1.0077519379844962e-06, "loss": 2.3223, "step": 26 }, { "epoch": 0.010472219528750121, "grad_norm": 21.601096527586577, "learning_rate": 1.0465116279069768e-06, "loss": 2.3712, "step": 27 }, { "epoch": 0.010860079511296422, "grad_norm": 20.742556398708874, "learning_rate": 1.0852713178294575e-06, "loss": 2.2954, "step": 28 }, { "epoch": 0.011247939493842722, "grad_norm": 17.914718791216337, "learning_rate": 1.1240310077519381e-06, "loss": 2.2004, "step": 29 }, { "epoch": 0.011635799476389024, "grad_norm": 14.43325642768343, "learning_rate": 1.1627906976744188e-06, "loss": 1.8277, "step": 30 }, { "epoch": 0.012023659458935325, "grad_norm": 18.457930504624493, "learning_rate": 1.2015503875968994e-06, "loss": 1.9471, "step": 31 }, { "epoch": 0.012411519441481625, "grad_norm": 17.611937890190028, "learning_rate": 1.2403100775193799e-06, "loss": 1.9523, "step": 32 }, { "epoch": 0.012799379424027925, "grad_norm": 15.393170984310329, "learning_rate": 1.2790697674418605e-06, "loss": 1.7249, "step": 33 }, { "epoch": 0.013187239406574227, "grad_norm": 11.08788650736289, "learning_rate": 1.3178294573643414e-06, "loss": 1.6342, "step": 34 }, { "epoch": 0.013575099389120528, "grad_norm": 9.027789214580796, "learning_rate": 1.3565891472868216e-06, "loss": 1.6919, "step": 35 }, { "epoch": 0.013962959371666828, "grad_norm": 10.03341197332856, "learning_rate": 1.3953488372093025e-06, "loss": 1.729, "step": 36 }, { "epoch": 0.014350819354213128, "grad_norm": 11.649383865048092, "learning_rate": 1.4341085271317832e-06, "loss": 1.6397, "step": 37 }, { "epoch": 0.01473867933675943, "grad_norm": 7.434380109063409, "learning_rate": 1.4728682170542638e-06, "loss": 1.6428, "step": 38 }, { "epoch": 0.01512653931930573, "grad_norm": 7.298976539108495, "learning_rate": 1.5116279069767443e-06, "loss": 1.6535, "step": 39 }, { "epoch": 0.015514399301852031, "grad_norm": 5.768265083255346, "learning_rate": 1.550387596899225e-06, "loss": 1.5893, "step": 40 }, { "epoch": 0.015902259284398333, "grad_norm": 5.375784888185106, "learning_rate": 1.5891472868217056e-06, "loss": 1.4264, "step": 41 }, { "epoch": 0.016290119266944632, "grad_norm": 6.910959836622738, "learning_rate": 1.6279069767441862e-06, "loss": 1.4312, "step": 42 }, { "epoch": 0.016677979249490934, "grad_norm": 7.2229608487012085, "learning_rate": 1.6666666666666667e-06, "loss": 1.4506, "step": 43 }, { "epoch": 0.017065839232037236, "grad_norm": 6.156221939585921, "learning_rate": 1.7054263565891473e-06, "loss": 1.4043, "step": 44 }, { "epoch": 0.017453699214583535, "grad_norm": 5.188581238679549, "learning_rate": 1.7441860465116282e-06, "loss": 1.2257, "step": 45 }, { "epoch": 0.017841559197129837, "grad_norm": 6.014648141255009, "learning_rate": 1.7829457364341088e-06, "loss": 1.4653, "step": 46 }, { "epoch": 0.018229419179676135, "grad_norm": 5.35901635373087, "learning_rate": 1.8217054263565893e-06, "loss": 1.408, "step": 47 }, { "epoch": 0.018617279162222437, "grad_norm": 5.578535035462798, "learning_rate": 1.86046511627907e-06, "loss": 1.2509, "step": 48 }, { "epoch": 0.01900513914476874, "grad_norm": 4.697874392415059, "learning_rate": 1.8992248062015506e-06, "loss": 1.3253, "step": 49 }, { "epoch": 0.019392999127315038, "grad_norm": 4.8941735453374156, "learning_rate": 1.9379844961240315e-06, "loss": 1.2853, "step": 50 }, { "epoch": 0.01978085910986134, "grad_norm": 6.2070413275233545, "learning_rate": 1.976744186046512e-06, "loss": 1.2231, "step": 51 }, { "epoch": 0.020168719092407642, "grad_norm": 5.375956024103411, "learning_rate": 2.0155038759689923e-06, "loss": 1.247, "step": 52 }, { "epoch": 0.02055657907495394, "grad_norm": 4.191970287927133, "learning_rate": 2.054263565891473e-06, "loss": 1.2462, "step": 53 }, { "epoch": 0.020944439057500243, "grad_norm": 5.0080034285945025, "learning_rate": 2.0930232558139536e-06, "loss": 1.2025, "step": 54 }, { "epoch": 0.02133229904004654, "grad_norm": 13.99746969491131, "learning_rate": 2.131782945736434e-06, "loss": 1.2079, "step": 55 }, { "epoch": 0.021720159022592844, "grad_norm": 4.82944690793504, "learning_rate": 2.170542635658915e-06, "loss": 1.2603, "step": 56 }, { "epoch": 0.022108019005139146, "grad_norm": 8.416110186842875, "learning_rate": 2.2093023255813954e-06, "loss": 1.1938, "step": 57 }, { "epoch": 0.022495878987685444, "grad_norm": 5.519092312840474, "learning_rate": 2.2480620155038763e-06, "loss": 1.3156, "step": 58 }, { "epoch": 0.022883738970231746, "grad_norm": 5.046239687733281, "learning_rate": 2.2868217054263567e-06, "loss": 1.2567, "step": 59 }, { "epoch": 0.02327159895277805, "grad_norm": 4.571388716634533, "learning_rate": 2.3255813953488376e-06, "loss": 1.1804, "step": 60 }, { "epoch": 0.023659458935324347, "grad_norm": 4.0111349965859056, "learning_rate": 2.364341085271318e-06, "loss": 1.2007, "step": 61 }, { "epoch": 0.02404731891787065, "grad_norm": 4.354046077299887, "learning_rate": 2.403100775193799e-06, "loss": 1.2135, "step": 62 }, { "epoch": 0.02443517890041695, "grad_norm": 4.501205247758884, "learning_rate": 2.4418604651162793e-06, "loss": 1.1272, "step": 63 }, { "epoch": 0.02482303888296325, "grad_norm": 3.2179505812119427, "learning_rate": 2.4806201550387598e-06, "loss": 1.1378, "step": 64 }, { "epoch": 0.025210898865509552, "grad_norm": 5.1840294241299345, "learning_rate": 2.5193798449612406e-06, "loss": 1.2148, "step": 65 }, { "epoch": 0.02559875884805585, "grad_norm": 3.469040619375295, "learning_rate": 2.558139534883721e-06, "loss": 1.1266, "step": 66 }, { "epoch": 0.025986618830602153, "grad_norm": 3.7891716175869172, "learning_rate": 2.596899224806202e-06, "loss": 1.1255, "step": 67 }, { "epoch": 0.026374478813148455, "grad_norm": 3.3677145681013227, "learning_rate": 2.635658914728683e-06, "loss": 0.9812, "step": 68 }, { "epoch": 0.026762338795694753, "grad_norm": 4.174782601939158, "learning_rate": 2.674418604651163e-06, "loss": 1.1679, "step": 69 }, { "epoch": 0.027150198778241055, "grad_norm": 3.527620377916958, "learning_rate": 2.7131782945736433e-06, "loss": 1.1039, "step": 70 }, { "epoch": 0.027538058760787357, "grad_norm": 3.4419851700057276, "learning_rate": 2.751937984496124e-06, "loss": 1.0299, "step": 71 }, { "epoch": 0.027925918743333656, "grad_norm": 3.279950241460187, "learning_rate": 2.790697674418605e-06, "loss": 1.0535, "step": 72 }, { "epoch": 0.028313778725879958, "grad_norm": 4.139193851535083, "learning_rate": 2.8294573643410855e-06, "loss": 1.0801, "step": 73 }, { "epoch": 0.028701638708426257, "grad_norm": 2.945183899471885, "learning_rate": 2.8682170542635663e-06, "loss": 0.955, "step": 74 }, { "epoch": 0.02908949869097256, "grad_norm": 4.576199015980327, "learning_rate": 2.9069767441860468e-06, "loss": 1.1457, "step": 75 }, { "epoch": 0.02947735867351886, "grad_norm": 2.9314395496310315, "learning_rate": 2.9457364341085276e-06, "loss": 0.9998, "step": 76 }, { "epoch": 0.02986521865606516, "grad_norm": 3.5848861371779432, "learning_rate": 2.9844961240310076e-06, "loss": 1.06, "step": 77 }, { "epoch": 0.03025307863861146, "grad_norm": 3.0214121239389833, "learning_rate": 3.0232558139534885e-06, "loss": 0.9485, "step": 78 }, { "epoch": 0.030640938621157764, "grad_norm": 3.6219702190977316, "learning_rate": 3.062015503875969e-06, "loss": 1.1465, "step": 79 }, { "epoch": 0.031028798603704062, "grad_norm": 3.955687496657216, "learning_rate": 3.10077519379845e-06, "loss": 1.0771, "step": 80 }, { "epoch": 0.03141665858625036, "grad_norm": 3.756435935139605, "learning_rate": 3.1395348837209307e-06, "loss": 0.9902, "step": 81 }, { "epoch": 0.031804518568796666, "grad_norm": 3.603148863332301, "learning_rate": 3.178294573643411e-06, "loss": 1.1158, "step": 82 }, { "epoch": 0.032192378551342965, "grad_norm": 2.7717909367368008, "learning_rate": 3.217054263565892e-06, "loss": 0.9069, "step": 83 }, { "epoch": 0.032580238533889264, "grad_norm": 3.436970876209558, "learning_rate": 3.2558139534883724e-06, "loss": 1.0199, "step": 84 }, { "epoch": 0.03296809851643557, "grad_norm": 3.65869199588643, "learning_rate": 3.294573643410853e-06, "loss": 1.0186, "step": 85 }, { "epoch": 0.03335595849898187, "grad_norm": 2.8468047298675785, "learning_rate": 3.3333333333333333e-06, "loss": 0.9646, "step": 86 }, { "epoch": 0.033743818481528166, "grad_norm": 3.16225692694364, "learning_rate": 3.372093023255814e-06, "loss": 0.9297, "step": 87 }, { "epoch": 0.03413167846407447, "grad_norm": 3.975118981220515, "learning_rate": 3.4108527131782946e-06, "loss": 1.0008, "step": 88 }, { "epoch": 0.03451953844662077, "grad_norm": 3.650734420084849, "learning_rate": 3.4496124031007755e-06, "loss": 1.1267, "step": 89 }, { "epoch": 0.03490739842916707, "grad_norm": 2.5767756615357476, "learning_rate": 3.4883720930232564e-06, "loss": 0.9761, "step": 90 }, { "epoch": 0.035295258411713375, "grad_norm": 2.9225966296681807, "learning_rate": 3.527131782945737e-06, "loss": 0.9268, "step": 91 }, { "epoch": 0.03568311839425967, "grad_norm": 3.1408592642036286, "learning_rate": 3.5658914728682177e-06, "loss": 0.9239, "step": 92 }, { "epoch": 0.03607097837680597, "grad_norm": 4.3314047270215985, "learning_rate": 3.6046511627906977e-06, "loss": 0.9174, "step": 93 }, { "epoch": 0.03645883835935227, "grad_norm": 4.159091086794285, "learning_rate": 3.6434108527131786e-06, "loss": 1.0274, "step": 94 }, { "epoch": 0.036846698341898576, "grad_norm": 3.2084080149283043, "learning_rate": 3.682170542635659e-06, "loss": 0.9795, "step": 95 }, { "epoch": 0.037234558324444875, "grad_norm": 3.366756340861441, "learning_rate": 3.72093023255814e-06, "loss": 0.9565, "step": 96 }, { "epoch": 0.03762241830699117, "grad_norm": 3.4141015859032295, "learning_rate": 3.7596899224806203e-06, "loss": 0.9684, "step": 97 }, { "epoch": 0.03801027828953748, "grad_norm": 3.3748204280853367, "learning_rate": 3.798449612403101e-06, "loss": 0.9987, "step": 98 }, { "epoch": 0.03839813827208378, "grad_norm": 2.779738645673484, "learning_rate": 3.837209302325582e-06, "loss": 1.0069, "step": 99 }, { "epoch": 0.038785998254630076, "grad_norm": 3.185931890772274, "learning_rate": 3.875968992248063e-06, "loss": 0.9159, "step": 100 }, { "epoch": 0.03917385823717638, "grad_norm": 3.4685423443109236, "learning_rate": 3.914728682170543e-06, "loss": 1.0587, "step": 101 }, { "epoch": 0.03956171821972268, "grad_norm": 3.0284444916016398, "learning_rate": 3.953488372093024e-06, "loss": 0.9473, "step": 102 }, { "epoch": 0.03994957820226898, "grad_norm": 3.3879727180300963, "learning_rate": 3.992248062015504e-06, "loss": 0.9659, "step": 103 }, { "epoch": 0.040337438184815284, "grad_norm": 3.648904478243314, "learning_rate": 4.031007751937985e-06, "loss": 0.9468, "step": 104 }, { "epoch": 0.04072529816736158, "grad_norm": 2.920249306527412, "learning_rate": 4.0697674418604655e-06, "loss": 0.9266, "step": 105 }, { "epoch": 0.04111315814990788, "grad_norm": 4.318084676091223, "learning_rate": 4.108527131782946e-06, "loss": 0.9565, "step": 106 }, { "epoch": 0.04150101813245419, "grad_norm": 3.7980880955093337, "learning_rate": 4.1472868217054264e-06, "loss": 0.9468, "step": 107 }, { "epoch": 0.041888878115000486, "grad_norm": 4.644260827506593, "learning_rate": 4.186046511627907e-06, "loss": 0.9306, "step": 108 }, { "epoch": 0.042276738097546784, "grad_norm": 3.4104262024786083, "learning_rate": 4.224806201550387e-06, "loss": 1.0658, "step": 109 }, { "epoch": 0.04266459808009308, "grad_norm": 3.8327733083278064, "learning_rate": 4.263565891472868e-06, "loss": 1.0066, "step": 110 }, { "epoch": 0.04305245806263939, "grad_norm": 2.305293995324799, "learning_rate": 4.302325581395349e-06, "loss": 0.9473, "step": 111 }, { "epoch": 0.04344031804518569, "grad_norm": 3.3885467371866533, "learning_rate": 4.34108527131783e-06, "loss": 0.9304, "step": 112 }, { "epoch": 0.043828178027731986, "grad_norm": 3.119927693067487, "learning_rate": 4.379844961240311e-06, "loss": 0.8883, "step": 113 }, { "epoch": 0.04421603801027829, "grad_norm": 3.166308648047779, "learning_rate": 4.418604651162791e-06, "loss": 0.9372, "step": 114 }, { "epoch": 0.04460389799282459, "grad_norm": 3.0835397541645575, "learning_rate": 4.457364341085272e-06, "loss": 0.8743, "step": 115 }, { "epoch": 0.04499175797537089, "grad_norm": 2.900617474621125, "learning_rate": 4.4961240310077525e-06, "loss": 0.9665, "step": 116 }, { "epoch": 0.045379617957917194, "grad_norm": 3.1941554441911033, "learning_rate": 4.5348837209302326e-06, "loss": 0.9434, "step": 117 }, { "epoch": 0.04576747794046349, "grad_norm": 2.2597469689375247, "learning_rate": 4.573643410852713e-06, "loss": 0.8971, "step": 118 }, { "epoch": 0.04615533792300979, "grad_norm": 2.701709684606568, "learning_rate": 4.612403100775194e-06, "loss": 0.8307, "step": 119 }, { "epoch": 0.0465431979055561, "grad_norm": 4.34723047306032, "learning_rate": 4.651162790697675e-06, "loss": 0.9888, "step": 120 }, { "epoch": 0.046931057888102395, "grad_norm": 3.5012065207316803, "learning_rate": 4.689922480620155e-06, "loss": 1.0627, "step": 121 }, { "epoch": 0.047318917870648694, "grad_norm": 3.568591690473587, "learning_rate": 4.728682170542636e-06, "loss": 0.958, "step": 122 }, { "epoch": 0.047706777853195, "grad_norm": 3.1315742760378376, "learning_rate": 4.767441860465117e-06, "loss": 0.9471, "step": 123 }, { "epoch": 0.0480946378357413, "grad_norm": 3.433195805014218, "learning_rate": 4.806201550387598e-06, "loss": 0.91, "step": 124 }, { "epoch": 0.0484824978182876, "grad_norm": 3.60399117306288, "learning_rate": 4.844961240310078e-06, "loss": 0.9285, "step": 125 }, { "epoch": 0.0488703578008339, "grad_norm": 2.8621395385700334, "learning_rate": 4.883720930232559e-06, "loss": 0.9399, "step": 126 }, { "epoch": 0.0492582177833802, "grad_norm": 3.1125637840222544, "learning_rate": 4.922480620155039e-06, "loss": 0.9022, "step": 127 }, { "epoch": 0.0496460777659265, "grad_norm": 3.0025950943309847, "learning_rate": 4.9612403100775195e-06, "loss": 0.9142, "step": 128 }, { "epoch": 0.0500339377484728, "grad_norm": 2.570934687163324, "learning_rate": 5e-06, "loss": 0.9549, "step": 129 }, { "epoch": 0.050421797731019104, "grad_norm": 3.1740094414586166, "learning_rate": 5.038759689922481e-06, "loss": 0.7563, "step": 130 }, { "epoch": 0.0508096577135654, "grad_norm": 4.37691101722052, "learning_rate": 5.077519379844962e-06, "loss": 0.9456, "step": 131 }, { "epoch": 0.0511975176961117, "grad_norm": 3.0287262666563928, "learning_rate": 5.116279069767442e-06, "loss": 0.8776, "step": 132 }, { "epoch": 0.051585377678658006, "grad_norm": 3.469172340765002, "learning_rate": 5.155038759689923e-06, "loss": 0.8742, "step": 133 }, { "epoch": 0.051973237661204305, "grad_norm": 3.946826483591162, "learning_rate": 5.193798449612404e-06, "loss": 0.965, "step": 134 }, { "epoch": 0.052361097643750604, "grad_norm": 3.9433370129245895, "learning_rate": 5.232558139534885e-06, "loss": 0.9017, "step": 135 }, { "epoch": 0.05274895762629691, "grad_norm": 3.007341587904893, "learning_rate": 5.271317829457366e-06, "loss": 0.8915, "step": 136 }, { "epoch": 0.05313681760884321, "grad_norm": 3.9688933608320904, "learning_rate": 5.310077519379846e-06, "loss": 0.9031, "step": 137 }, { "epoch": 0.053524677591389506, "grad_norm": 4.3998307960783976, "learning_rate": 5.348837209302326e-06, "loss": 0.9269, "step": 138 }, { "epoch": 0.05391253757393581, "grad_norm": 3.850244500413126, "learning_rate": 5.3875968992248065e-06, "loss": 0.9103, "step": 139 }, { "epoch": 0.05430039755648211, "grad_norm": 3.6118714347026537, "learning_rate": 5.4263565891472865e-06, "loss": 0.869, "step": 140 }, { "epoch": 0.05468825753902841, "grad_norm": 4.165863778683913, "learning_rate": 5.465116279069767e-06, "loss": 0.9492, "step": 141 }, { "epoch": 0.055076117521574715, "grad_norm": 2.5251481365464516, "learning_rate": 5.503875968992248e-06, "loss": 0.8428, "step": 142 }, { "epoch": 0.05546397750412101, "grad_norm": 3.8847154323415816, "learning_rate": 5.542635658914729e-06, "loss": 0.9577, "step": 143 }, { "epoch": 0.05585183748666731, "grad_norm": 3.5944063080396607, "learning_rate": 5.58139534883721e-06, "loss": 0.8702, "step": 144 }, { "epoch": 0.05623969746921361, "grad_norm": 2.866254850316071, "learning_rate": 5.62015503875969e-06, "loss": 0.919, "step": 145 }, { "epoch": 0.056627557451759916, "grad_norm": 3.421322682461167, "learning_rate": 5.658914728682171e-06, "loss": 0.9275, "step": 146 }, { "epoch": 0.057015417434306215, "grad_norm": 2.9328950238132605, "learning_rate": 5.697674418604652e-06, "loss": 0.9141, "step": 147 }, { "epoch": 0.05740327741685251, "grad_norm": 3.5747701801939202, "learning_rate": 5.736434108527133e-06, "loss": 0.8544, "step": 148 }, { "epoch": 0.05779113739939882, "grad_norm": 2.965581128155004, "learning_rate": 5.7751937984496135e-06, "loss": 0.8514, "step": 149 }, { "epoch": 0.05817899738194512, "grad_norm": 3.352335187708362, "learning_rate": 5.8139534883720935e-06, "loss": 0.8739, "step": 150 }, { "epoch": 0.058566857364491416, "grad_norm": 2.7713745486478856, "learning_rate": 5.852713178294574e-06, "loss": 0.8063, "step": 151 }, { "epoch": 0.05895471734703772, "grad_norm": 2.7979403712889437, "learning_rate": 5.891472868217055e-06, "loss": 0.8552, "step": 152 }, { "epoch": 0.05934257732958402, "grad_norm": 2.538393528962019, "learning_rate": 5.930232558139536e-06, "loss": 0.8764, "step": 153 }, { "epoch": 0.05973043731213032, "grad_norm": 4.295864302625053, "learning_rate": 5.968992248062015e-06, "loss": 0.9535, "step": 154 }, { "epoch": 0.060118297294676624, "grad_norm": 3.543053890444346, "learning_rate": 6.007751937984496e-06, "loss": 0.8829, "step": 155 }, { "epoch": 0.06050615727722292, "grad_norm": 3.298821603007628, "learning_rate": 6.046511627906977e-06, "loss": 0.8349, "step": 156 }, { "epoch": 0.06089401725976922, "grad_norm": 3.7174812024664696, "learning_rate": 6.085271317829458e-06, "loss": 0.9473, "step": 157 }, { "epoch": 0.06128187724231553, "grad_norm": 3.8089648612913933, "learning_rate": 6.124031007751938e-06, "loss": 0.955, "step": 158 }, { "epoch": 0.061669737224861826, "grad_norm": 2.76037920406306, "learning_rate": 6.162790697674419e-06, "loss": 0.8581, "step": 159 }, { "epoch": 0.062057597207408124, "grad_norm": 3.7196884493817004, "learning_rate": 6.2015503875969e-06, "loss": 0.9557, "step": 160 }, { "epoch": 0.06244545718995442, "grad_norm": 3.3719219302893513, "learning_rate": 6.2403100775193805e-06, "loss": 0.9625, "step": 161 }, { "epoch": 0.06283331717250072, "grad_norm": 3.8750940810472754, "learning_rate": 6.279069767441861e-06, "loss": 0.952, "step": 162 }, { "epoch": 0.06322117715504703, "grad_norm": 4.056562855786851, "learning_rate": 6.317829457364341e-06, "loss": 0.9195, "step": 163 }, { "epoch": 0.06360903713759333, "grad_norm": 2.4893091625901547, "learning_rate": 6.356589147286822e-06, "loss": 0.8272, "step": 164 }, { "epoch": 0.06399689712013963, "grad_norm": 3.7001860419510386, "learning_rate": 6.395348837209303e-06, "loss": 0.8456, "step": 165 }, { "epoch": 0.06438475710268593, "grad_norm": 3.2274330238517193, "learning_rate": 6.434108527131784e-06, "loss": 0.8284, "step": 166 }, { "epoch": 0.06477261708523223, "grad_norm": 2.5847379097422905, "learning_rate": 6.472868217054265e-06, "loss": 0.815, "step": 167 }, { "epoch": 0.06516047706777853, "grad_norm": 3.666320620665086, "learning_rate": 6.511627906976745e-06, "loss": 0.8759, "step": 168 }, { "epoch": 0.06554833705032483, "grad_norm": 2.7504482232151672, "learning_rate": 6.550387596899226e-06, "loss": 0.8582, "step": 169 }, { "epoch": 0.06593619703287114, "grad_norm": 2.7825477606989537, "learning_rate": 6.589147286821706e-06, "loss": 0.8502, "step": 170 }, { "epoch": 0.06632405701541744, "grad_norm": 2.6805877165401504, "learning_rate": 6.627906976744186e-06, "loss": 0.8592, "step": 171 }, { "epoch": 0.06671191699796374, "grad_norm": 3.6662451427768827, "learning_rate": 6.666666666666667e-06, "loss": 0.8566, "step": 172 }, { "epoch": 0.06709977698051003, "grad_norm": 3.8870874623628273, "learning_rate": 6.7054263565891475e-06, "loss": 0.8847, "step": 173 }, { "epoch": 0.06748763696305633, "grad_norm": 3.150672367733985, "learning_rate": 6.744186046511628e-06, "loss": 0.8998, "step": 174 }, { "epoch": 0.06787549694560263, "grad_norm": 3.1568806254590953, "learning_rate": 6.782945736434109e-06, "loss": 0.9259, "step": 175 }, { "epoch": 0.06826335692814894, "grad_norm": 3.7932470401479317, "learning_rate": 6.821705426356589e-06, "loss": 0.9149, "step": 176 }, { "epoch": 0.06865121691069524, "grad_norm": 2.980997672177329, "learning_rate": 6.86046511627907e-06, "loss": 0.8016, "step": 177 }, { "epoch": 0.06903907689324154, "grad_norm": 3.444971914282816, "learning_rate": 6.899224806201551e-06, "loss": 0.845, "step": 178 }, { "epoch": 0.06942693687578784, "grad_norm": 3.1547084951135376, "learning_rate": 6.937984496124032e-06, "loss": 0.8859, "step": 179 }, { "epoch": 0.06981479685833414, "grad_norm": 2.7196398244182594, "learning_rate": 6.976744186046513e-06, "loss": 0.8299, "step": 180 }, { "epoch": 0.07020265684088044, "grad_norm": 3.4471894686416267, "learning_rate": 7.015503875968993e-06, "loss": 0.8588, "step": 181 }, { "epoch": 0.07059051682342675, "grad_norm": 3.0862359597589686, "learning_rate": 7.054263565891474e-06, "loss": 0.8498, "step": 182 }, { "epoch": 0.07097837680597305, "grad_norm": 2.4982307952913794, "learning_rate": 7.0930232558139545e-06, "loss": 0.8098, "step": 183 }, { "epoch": 0.07136623678851935, "grad_norm": 3.473428626278634, "learning_rate": 7.131782945736435e-06, "loss": 0.8612, "step": 184 }, { "epoch": 0.07175409677106565, "grad_norm": 3.515772143073951, "learning_rate": 7.170542635658916e-06, "loss": 0.9123, "step": 185 }, { "epoch": 0.07214195675361194, "grad_norm": 3.374538613097237, "learning_rate": 7.209302325581395e-06, "loss": 0.9276, "step": 186 }, { "epoch": 0.07252981673615824, "grad_norm": 4.968111698005178, "learning_rate": 7.248062015503876e-06, "loss": 0.8577, "step": 187 }, { "epoch": 0.07291767671870454, "grad_norm": 3.469116368330431, "learning_rate": 7.286821705426357e-06, "loss": 0.8881, "step": 188 }, { "epoch": 0.07330553670125085, "grad_norm": 3.0099266331387278, "learning_rate": 7.325581395348837e-06, "loss": 0.8376, "step": 189 }, { "epoch": 0.07369339668379715, "grad_norm": 3.0883646257088344, "learning_rate": 7.364341085271318e-06, "loss": 0.8204, "step": 190 }, { "epoch": 0.07408125666634345, "grad_norm": 4.688699157150738, "learning_rate": 7.403100775193799e-06, "loss": 0.8821, "step": 191 }, { "epoch": 0.07446911664888975, "grad_norm": 3.93320177835261, "learning_rate": 7.44186046511628e-06, "loss": 0.8867, "step": 192 }, { "epoch": 0.07485697663143605, "grad_norm": 3.0604033426613158, "learning_rate": 7.480620155038761e-06, "loss": 0.8598, "step": 193 }, { "epoch": 0.07524483661398235, "grad_norm": 2.465767396919283, "learning_rate": 7.519379844961241e-06, "loss": 0.7921, "step": 194 }, { "epoch": 0.07563269659652866, "grad_norm": 3.2606525976488276, "learning_rate": 7.5581395348837215e-06, "loss": 0.9079, "step": 195 }, { "epoch": 0.07602055657907496, "grad_norm": 2.57115813079418, "learning_rate": 7.596899224806202e-06, "loss": 0.8965, "step": 196 }, { "epoch": 0.07640841656162126, "grad_norm": 3.306840959910772, "learning_rate": 7.635658914728683e-06, "loss": 0.8777, "step": 197 }, { "epoch": 0.07679627654416755, "grad_norm": 3.2488461498256536, "learning_rate": 7.674418604651164e-06, "loss": 0.8898, "step": 198 }, { "epoch": 0.07718413652671385, "grad_norm": 2.9763749848492296, "learning_rate": 7.713178294573645e-06, "loss": 0.8152, "step": 199 }, { "epoch": 0.07757199650926015, "grad_norm": 2.7367282289559225, "learning_rate": 7.751937984496126e-06, "loss": 0.8602, "step": 200 }, { "epoch": 0.07795985649180646, "grad_norm": 3.0622567783790515, "learning_rate": 7.790697674418605e-06, "loss": 0.8936, "step": 201 }, { "epoch": 0.07834771647435276, "grad_norm": 3.070493699316106, "learning_rate": 7.829457364341086e-06, "loss": 0.831, "step": 202 }, { "epoch": 0.07873557645689906, "grad_norm": 2.791392152200963, "learning_rate": 7.868217054263567e-06, "loss": 0.8397, "step": 203 }, { "epoch": 0.07912343643944536, "grad_norm": 3.4052807756798305, "learning_rate": 7.906976744186048e-06, "loss": 0.8566, "step": 204 }, { "epoch": 0.07951129642199166, "grad_norm": 3.56109028066334, "learning_rate": 7.945736434108527e-06, "loss": 0.8556, "step": 205 }, { "epoch": 0.07989915640453796, "grad_norm": 2.77715731343832, "learning_rate": 7.984496124031008e-06, "loss": 0.7827, "step": 206 }, { "epoch": 0.08028701638708426, "grad_norm": 2.793497132330215, "learning_rate": 8.023255813953488e-06, "loss": 0.8117, "step": 207 }, { "epoch": 0.08067487636963057, "grad_norm": 2.9006702330954015, "learning_rate": 8.06201550387597e-06, "loss": 0.8659, "step": 208 }, { "epoch": 0.08106273635217687, "grad_norm": 2.242176698795504, "learning_rate": 8.10077519379845e-06, "loss": 0.7589, "step": 209 }, { "epoch": 0.08145059633472317, "grad_norm": 3.0736315182334235, "learning_rate": 8.139534883720931e-06, "loss": 0.8069, "step": 210 }, { "epoch": 0.08183845631726946, "grad_norm": 2.134465470835696, "learning_rate": 8.178294573643412e-06, "loss": 0.9265, "step": 211 }, { "epoch": 0.08222631629981576, "grad_norm": 2.8518780278582376, "learning_rate": 8.217054263565893e-06, "loss": 0.888, "step": 212 }, { "epoch": 0.08261417628236206, "grad_norm": 2.9107006116408773, "learning_rate": 8.255813953488374e-06, "loss": 0.8597, "step": 213 }, { "epoch": 0.08300203626490837, "grad_norm": 2.794963301006787, "learning_rate": 8.294573643410853e-06, "loss": 0.8358, "step": 214 }, { "epoch": 0.08338989624745467, "grad_norm": 2.755661153148097, "learning_rate": 8.333333333333334e-06, "loss": 0.7868, "step": 215 }, { "epoch": 0.08377775623000097, "grad_norm": 3.4421919619328425, "learning_rate": 8.372093023255815e-06, "loss": 0.8673, "step": 216 }, { "epoch": 0.08416561621254727, "grad_norm": 3.1003470033584684, "learning_rate": 8.410852713178295e-06, "loss": 0.9002, "step": 217 }, { "epoch": 0.08455347619509357, "grad_norm": 2.9863604481412915, "learning_rate": 8.449612403100775e-06, "loss": 0.8312, "step": 218 }, { "epoch": 0.08494133617763987, "grad_norm": 3.0781897355911316, "learning_rate": 8.488372093023256e-06, "loss": 0.8465, "step": 219 }, { "epoch": 0.08532919616018617, "grad_norm": 3.1176136838330994, "learning_rate": 8.527131782945736e-06, "loss": 0.8406, "step": 220 }, { "epoch": 0.08571705614273248, "grad_norm": 2.959414885702379, "learning_rate": 8.565891472868217e-06, "loss": 0.8222, "step": 221 }, { "epoch": 0.08610491612527878, "grad_norm": 3.754678219809697, "learning_rate": 8.604651162790698e-06, "loss": 0.9229, "step": 222 }, { "epoch": 0.08649277610782508, "grad_norm": 3.0292934410940897, "learning_rate": 8.643410852713179e-06, "loss": 0.8014, "step": 223 }, { "epoch": 0.08688063609037137, "grad_norm": 4.11133738091474, "learning_rate": 8.68217054263566e-06, "loss": 0.7657, "step": 224 }, { "epoch": 0.08726849607291767, "grad_norm": 4.098262305135462, "learning_rate": 8.72093023255814e-06, "loss": 0.9385, "step": 225 }, { "epoch": 0.08765635605546397, "grad_norm": 3.0031586523040112, "learning_rate": 8.759689922480622e-06, "loss": 0.8134, "step": 226 }, { "epoch": 0.08804421603801028, "grad_norm": 2.860017867952654, "learning_rate": 8.7984496124031e-06, "loss": 0.8345, "step": 227 }, { "epoch": 0.08843207602055658, "grad_norm": 2.7149325573047767, "learning_rate": 8.837209302325582e-06, "loss": 0.7323, "step": 228 }, { "epoch": 0.08881993600310288, "grad_norm": 2.761480521010341, "learning_rate": 8.875968992248062e-06, "loss": 0.8638, "step": 229 }, { "epoch": 0.08920779598564918, "grad_norm": 2.7368512401945284, "learning_rate": 8.914728682170543e-06, "loss": 0.8194, "step": 230 }, { "epoch": 0.08959565596819548, "grad_norm": 2.7932970436234865, "learning_rate": 8.953488372093024e-06, "loss": 0.8777, "step": 231 }, { "epoch": 0.08998351595074178, "grad_norm": 2.8394515639916644, "learning_rate": 8.992248062015505e-06, "loss": 0.7894, "step": 232 }, { "epoch": 0.09037137593328809, "grad_norm": 3.9650311699720664, "learning_rate": 9.031007751937986e-06, "loss": 0.9794, "step": 233 }, { "epoch": 0.09075923591583439, "grad_norm": 2.644109766558029, "learning_rate": 9.069767441860465e-06, "loss": 0.7911, "step": 234 }, { "epoch": 0.09114709589838069, "grad_norm": 3.2670999715191025, "learning_rate": 9.108527131782946e-06, "loss": 0.8453, "step": 235 }, { "epoch": 0.09153495588092699, "grad_norm": 3.5309405187859992, "learning_rate": 9.147286821705427e-06, "loss": 0.8747, "step": 236 }, { "epoch": 0.09192281586347328, "grad_norm": 2.625271970321024, "learning_rate": 9.186046511627908e-06, "loss": 0.818, "step": 237 }, { "epoch": 0.09231067584601958, "grad_norm": 3.0569637464915576, "learning_rate": 9.224806201550389e-06, "loss": 0.8054, "step": 238 }, { "epoch": 0.09269853582856588, "grad_norm": 3.00113667340435, "learning_rate": 9.26356589147287e-06, "loss": 0.8418, "step": 239 }, { "epoch": 0.0930863958111122, "grad_norm": 4.58403945963242, "learning_rate": 9.30232558139535e-06, "loss": 0.9628, "step": 240 }, { "epoch": 0.09347425579365849, "grad_norm": 2.8026292090333738, "learning_rate": 9.34108527131783e-06, "loss": 0.8399, "step": 241 }, { "epoch": 0.09386211577620479, "grad_norm": 3.020134011501288, "learning_rate": 9.37984496124031e-06, "loss": 0.8899, "step": 242 }, { "epoch": 0.09424997575875109, "grad_norm": 3.4222231069911864, "learning_rate": 9.418604651162791e-06, "loss": 0.8312, "step": 243 }, { "epoch": 0.09463783574129739, "grad_norm": 3.4412182448770987, "learning_rate": 9.457364341085272e-06, "loss": 0.867, "step": 244 }, { "epoch": 0.09502569572384369, "grad_norm": 3.3855597709423604, "learning_rate": 9.496124031007753e-06, "loss": 0.8351, "step": 245 }, { "epoch": 0.09541355570639, "grad_norm": 3.056754952837644, "learning_rate": 9.534883720930234e-06, "loss": 0.8561, "step": 246 }, { "epoch": 0.0958014156889363, "grad_norm": 2.603432914245702, "learning_rate": 9.573643410852715e-06, "loss": 0.7851, "step": 247 }, { "epoch": 0.0961892756714826, "grad_norm": 2.9804756345370267, "learning_rate": 9.612403100775196e-06, "loss": 0.8615, "step": 248 }, { "epoch": 0.0965771356540289, "grad_norm": 3.7648482549852176, "learning_rate": 9.651162790697676e-06, "loss": 0.9411, "step": 249 }, { "epoch": 0.0969649956365752, "grad_norm": 2.209650753563841, "learning_rate": 9.689922480620156e-06, "loss": 0.8211, "step": 250 }, { "epoch": 0.09735285561912149, "grad_norm": 2.2819644820646796, "learning_rate": 9.728682170542636e-06, "loss": 0.7976, "step": 251 }, { "epoch": 0.0977407156016678, "grad_norm": 3.8734539213279384, "learning_rate": 9.767441860465117e-06, "loss": 0.8034, "step": 252 }, { "epoch": 0.0981285755842141, "grad_norm": 3.4365643573490465, "learning_rate": 9.806201550387598e-06, "loss": 0.8372, "step": 253 }, { "epoch": 0.0985164355667604, "grad_norm": 2.471148286322736, "learning_rate": 9.844961240310077e-06, "loss": 0.8538, "step": 254 }, { "epoch": 0.0989042955493067, "grad_norm": 3.0904263186552727, "learning_rate": 9.883720930232558e-06, "loss": 0.8362, "step": 255 }, { "epoch": 0.099292155531853, "grad_norm": 3.302318288694796, "learning_rate": 9.922480620155039e-06, "loss": 0.8169, "step": 256 }, { "epoch": 0.0996800155143993, "grad_norm": 2.9738352906463947, "learning_rate": 9.96124031007752e-06, "loss": 0.8624, "step": 257 }, { "epoch": 0.1000678754969456, "grad_norm": 3.111208299928909, "learning_rate": 1e-05, "loss": 0.8074, "step": 258 }, { "epoch": 0.10045573547949191, "grad_norm": 3.927769806360254, "learning_rate": 9.9999954157983e-06, "loss": 0.8768, "step": 259 }, { "epoch": 0.10084359546203821, "grad_norm": 3.3448095365924755, "learning_rate": 9.999981663201606e-06, "loss": 0.8294, "step": 260 }, { "epoch": 0.1012314554445845, "grad_norm": 3.0877919346101006, "learning_rate": 9.999958742235133e-06, "loss": 0.8102, "step": 261 }, { "epoch": 0.1016193154271308, "grad_norm": 3.166260410738481, "learning_rate": 9.999926652940914e-06, "loss": 0.8142, "step": 262 }, { "epoch": 0.1020071754096771, "grad_norm": 2.6805912289414606, "learning_rate": 9.999885395377788e-06, "loss": 0.8171, "step": 263 }, { "epoch": 0.1023950353922234, "grad_norm": 3.4456660343041277, "learning_rate": 9.999834969621408e-06, "loss": 0.8728, "step": 264 }, { "epoch": 0.10278289537476971, "grad_norm": 2.933670807036517, "learning_rate": 9.999775375764244e-06, "loss": 0.8259, "step": 265 }, { "epoch": 0.10317075535731601, "grad_norm": 3.463819062644638, "learning_rate": 9.999706613915567e-06, "loss": 0.9086, "step": 266 }, { "epoch": 0.10355861533986231, "grad_norm": 4.07194419753517, "learning_rate": 9.999628684201464e-06, "loss": 0.843, "step": 267 }, { "epoch": 0.10394647532240861, "grad_norm": 2.7253413786139755, "learning_rate": 9.999541586764836e-06, "loss": 0.8036, "step": 268 }, { "epoch": 0.10433433530495491, "grad_norm": 2.757598361684006, "learning_rate": 9.999445321765392e-06, "loss": 0.8402, "step": 269 }, { "epoch": 0.10472219528750121, "grad_norm": 2.812519133288797, "learning_rate": 9.999339889379647e-06, "loss": 0.8428, "step": 270 }, { "epoch": 0.1051100552700475, "grad_norm": 2.6071861764356306, "learning_rate": 9.999225289800935e-06, "loss": 0.8614, "step": 271 }, { "epoch": 0.10549791525259382, "grad_norm": 3.2809273338472367, "learning_rate": 9.999101523239392e-06, "loss": 0.8036, "step": 272 }, { "epoch": 0.10588577523514012, "grad_norm": 2.8140778507224544, "learning_rate": 9.998968589921969e-06, "loss": 0.8045, "step": 273 }, { "epoch": 0.10627363521768642, "grad_norm": 3.6907306841183978, "learning_rate": 9.99882649009242e-06, "loss": 0.872, "step": 274 }, { "epoch": 0.10666149520023271, "grad_norm": 3.771266746843587, "learning_rate": 9.998675224011317e-06, "loss": 0.902, "step": 275 }, { "epoch": 0.10704935518277901, "grad_norm": 2.4860689358265167, "learning_rate": 9.998514791956025e-06, "loss": 0.8096, "step": 276 }, { "epoch": 0.10743721516532531, "grad_norm": 3.1946443106021265, "learning_rate": 9.998345194220732e-06, "loss": 0.847, "step": 277 }, { "epoch": 0.10782507514787162, "grad_norm": 2.727604313040296, "learning_rate": 9.998166431116421e-06, "loss": 0.7907, "step": 278 }, { "epoch": 0.10821293513041792, "grad_norm": 2.5175559064386546, "learning_rate": 9.99797850297089e-06, "loss": 0.8757, "step": 279 }, { "epoch": 0.10860079511296422, "grad_norm": 3.093086169357272, "learning_rate": 9.997781410128737e-06, "loss": 0.8664, "step": 280 }, { "epoch": 0.10898865509551052, "grad_norm": 3.4261889550044216, "learning_rate": 9.99757515295137e-06, "loss": 0.9033, "step": 281 }, { "epoch": 0.10937651507805682, "grad_norm": 2.7749521059914994, "learning_rate": 9.997359731816998e-06, "loss": 0.7469, "step": 282 }, { "epoch": 0.10976437506060312, "grad_norm": 2.5706818993448195, "learning_rate": 9.997135147120633e-06, "loss": 0.8018, "step": 283 }, { "epoch": 0.11015223504314943, "grad_norm": 4.005700277637291, "learning_rate": 9.996901399274093e-06, "loss": 0.7999, "step": 284 }, { "epoch": 0.11054009502569573, "grad_norm": 2.8370322276887485, "learning_rate": 9.996658488705997e-06, "loss": 0.7973, "step": 285 }, { "epoch": 0.11092795500824203, "grad_norm": 2.8493914885044713, "learning_rate": 9.996406415861763e-06, "loss": 0.838, "step": 286 }, { "epoch": 0.11131581499078833, "grad_norm": 2.284828513503112, "learning_rate": 9.996145181203616e-06, "loss": 0.8392, "step": 287 }, { "epoch": 0.11170367497333462, "grad_norm": 2.836737143238613, "learning_rate": 9.995874785210573e-06, "loss": 0.8319, "step": 288 }, { "epoch": 0.11209153495588092, "grad_norm": 2.867603457555486, "learning_rate": 9.995595228378456e-06, "loss": 0.8652, "step": 289 }, { "epoch": 0.11247939493842722, "grad_norm": 2.7074763699047533, "learning_rate": 9.995306511219885e-06, "loss": 0.8315, "step": 290 }, { "epoch": 0.11286725492097353, "grad_norm": 3.135445990793484, "learning_rate": 9.995008634264272e-06, "loss": 0.8904, "step": 291 }, { "epoch": 0.11325511490351983, "grad_norm": 3.356956277499456, "learning_rate": 9.994701598057828e-06, "loss": 0.8342, "step": 292 }, { "epoch": 0.11364297488606613, "grad_norm": 2.5961772805070438, "learning_rate": 9.99438540316356e-06, "loss": 0.7836, "step": 293 }, { "epoch": 0.11403083486861243, "grad_norm": 2.4694141126501488, "learning_rate": 9.99406005016127e-06, "loss": 0.7739, "step": 294 }, { "epoch": 0.11441869485115873, "grad_norm": 2.560536769552751, "learning_rate": 9.99372553964755e-06, "loss": 0.7874, "step": 295 }, { "epoch": 0.11480655483370503, "grad_norm": 3.850775329016022, "learning_rate": 9.993381872235785e-06, "loss": 0.9542, "step": 296 }, { "epoch": 0.11519441481625134, "grad_norm": 4.104644891145374, "learning_rate": 9.993029048556154e-06, "loss": 0.85, "step": 297 }, { "epoch": 0.11558227479879764, "grad_norm": 2.906181487771157, "learning_rate": 9.99266706925562e-06, "loss": 0.7718, "step": 298 }, { "epoch": 0.11597013478134394, "grad_norm": 2.4138958015516616, "learning_rate": 9.99229593499794e-06, "loss": 0.8026, "step": 299 }, { "epoch": 0.11635799476389024, "grad_norm": 3.1068191907229608, "learning_rate": 9.991915646463652e-06, "loss": 0.8376, "step": 300 }, { "epoch": 0.11674585474643653, "grad_norm": 3.9078135710380977, "learning_rate": 9.991526204350087e-06, "loss": 0.7815, "step": 301 }, { "epoch": 0.11713371472898283, "grad_norm": 3.9344813790238136, "learning_rate": 9.991127609371357e-06, "loss": 0.8267, "step": 302 }, { "epoch": 0.11752157471152914, "grad_norm": 2.3208959112191567, "learning_rate": 9.990719862258357e-06, "loss": 0.7553, "step": 303 }, { "epoch": 0.11790943469407544, "grad_norm": 2.9698084307566766, "learning_rate": 9.990302963758765e-06, "loss": 0.7728, "step": 304 }, { "epoch": 0.11829729467662174, "grad_norm": 5.081037231944413, "learning_rate": 9.989876914637042e-06, "loss": 0.8684, "step": 305 }, { "epoch": 0.11868515465916804, "grad_norm": 3.0660264980812824, "learning_rate": 9.989441715674422e-06, "loss": 0.8563, "step": 306 }, { "epoch": 0.11907301464171434, "grad_norm": 2.561937006661983, "learning_rate": 9.988997367668924e-06, "loss": 0.8395, "step": 307 }, { "epoch": 0.11946087462426064, "grad_norm": 2.965848209237417, "learning_rate": 9.988543871435342e-06, "loss": 0.8006, "step": 308 }, { "epoch": 0.11984873460680694, "grad_norm": 2.92718961577099, "learning_rate": 9.988081227805237e-06, "loss": 0.7821, "step": 309 }, { "epoch": 0.12023659458935325, "grad_norm": 3.4454614976210047, "learning_rate": 9.987609437626955e-06, "loss": 0.8088, "step": 310 }, { "epoch": 0.12062445457189955, "grad_norm": 3.1248053296695453, "learning_rate": 9.987128501765606e-06, "loss": 0.7979, "step": 311 }, { "epoch": 0.12101231455444585, "grad_norm": 2.9833816598844836, "learning_rate": 9.986638421103074e-06, "loss": 0.7557, "step": 312 }, { "epoch": 0.12140017453699214, "grad_norm": 3.2096853137499575, "learning_rate": 9.986139196538011e-06, "loss": 0.9, "step": 313 }, { "epoch": 0.12178803451953844, "grad_norm": 3.3709323033748593, "learning_rate": 9.985630828985835e-06, "loss": 0.8236, "step": 314 }, { "epoch": 0.12217589450208474, "grad_norm": 2.670104818939523, "learning_rate": 9.98511331937873e-06, "loss": 0.7314, "step": 315 }, { "epoch": 0.12256375448463105, "grad_norm": 2.8280166673400515, "learning_rate": 9.984586668665641e-06, "loss": 0.8715, "step": 316 }, { "epoch": 0.12295161446717735, "grad_norm": 3.297346453393531, "learning_rate": 9.98405087781228e-06, "loss": 0.844, "step": 317 }, { "epoch": 0.12333947444972365, "grad_norm": 2.873417504902571, "learning_rate": 9.983505947801115e-06, "loss": 0.8201, "step": 318 }, { "epoch": 0.12372733443226995, "grad_norm": 3.3618953016024595, "learning_rate": 9.982951879631373e-06, "loss": 0.8222, "step": 319 }, { "epoch": 0.12411519441481625, "grad_norm": 3.3319182153953886, "learning_rate": 9.982388674319041e-06, "loss": 0.807, "step": 320 }, { "epoch": 0.12450305439736255, "grad_norm": 2.557999550479368, "learning_rate": 9.981816332896854e-06, "loss": 0.7968, "step": 321 }, { "epoch": 0.12489091437990885, "grad_norm": 3.17580629033354, "learning_rate": 9.981234856414306e-06, "loss": 0.8137, "step": 322 }, { "epoch": 0.12527877436245516, "grad_norm": 3.2897669142127217, "learning_rate": 9.98064424593764e-06, "loss": 0.7946, "step": 323 }, { "epoch": 0.12566663434500144, "grad_norm": 2.361937859884439, "learning_rate": 9.980044502549843e-06, "loss": 0.7323, "step": 324 }, { "epoch": 0.12605449432754776, "grad_norm": 3.129424848158955, "learning_rate": 9.979435627350658e-06, "loss": 0.8415, "step": 325 }, { "epoch": 0.12644235431009407, "grad_norm": 2.7958621899362157, "learning_rate": 9.978817621456562e-06, "loss": 0.855, "step": 326 }, { "epoch": 0.12683021429264035, "grad_norm": 3.2979946720644895, "learning_rate": 9.978190486000784e-06, "loss": 0.7853, "step": 327 }, { "epoch": 0.12721807427518667, "grad_norm": 2.6730403295116045, "learning_rate": 9.977554222133293e-06, "loss": 0.8439, "step": 328 }, { "epoch": 0.12760593425773295, "grad_norm": 2.4319078740364275, "learning_rate": 9.976908831020787e-06, "loss": 0.7412, "step": 329 }, { "epoch": 0.12799379424027926, "grad_norm": 3.9544414507441448, "learning_rate": 9.97625431384671e-06, "loss": 0.8615, "step": 330 }, { "epoch": 0.12838165422282555, "grad_norm": 3.3482049739446627, "learning_rate": 9.975590671811239e-06, "loss": 0.8169, "step": 331 }, { "epoch": 0.12876951420537186, "grad_norm": 3.206488385462, "learning_rate": 9.974917906131283e-06, "loss": 0.923, "step": 332 }, { "epoch": 0.12915737418791817, "grad_norm": 2.9633294142021325, "learning_rate": 9.974236018040476e-06, "loss": 0.7248, "step": 333 }, { "epoch": 0.12954523417046446, "grad_norm": 2.7149396553475222, "learning_rate": 9.973545008789182e-06, "loss": 0.8146, "step": 334 }, { "epoch": 0.12993309415301077, "grad_norm": 2.473905558202285, "learning_rate": 9.972844879644494e-06, "loss": 0.7691, "step": 335 }, { "epoch": 0.13032095413555705, "grad_norm": 3.1195498125666337, "learning_rate": 9.972135631890226e-06, "loss": 0.8158, "step": 336 }, { "epoch": 0.13070881411810337, "grad_norm": 2.977860534045371, "learning_rate": 9.97141726682691e-06, "loss": 0.8475, "step": 337 }, { "epoch": 0.13109667410064965, "grad_norm": 3.641242043787349, "learning_rate": 9.970689785771798e-06, "loss": 0.7993, "step": 338 }, { "epoch": 0.13148453408319596, "grad_norm": 3.9278900602162956, "learning_rate": 9.969953190058861e-06, "loss": 0.9272, "step": 339 }, { "epoch": 0.13187239406574228, "grad_norm": 3.042180497132298, "learning_rate": 9.969207481038776e-06, "loss": 0.851, "step": 340 }, { "epoch": 0.13226025404828856, "grad_norm": 3.706903072199103, "learning_rate": 9.968452660078939e-06, "loss": 0.8284, "step": 341 }, { "epoch": 0.13264811403083487, "grad_norm": 2.708527231730924, "learning_rate": 9.967688728563446e-06, "loss": 0.7017, "step": 342 }, { "epoch": 0.13303597401338116, "grad_norm": 2.499675064486875, "learning_rate": 9.966915687893109e-06, "loss": 0.8185, "step": 343 }, { "epoch": 0.13342383399592747, "grad_norm": 2.3520941032452947, "learning_rate": 9.966133539485435e-06, "loss": 0.8778, "step": 344 }, { "epoch": 0.13381169397847378, "grad_norm": 3.063900811997485, "learning_rate": 9.965342284774633e-06, "loss": 0.7783, "step": 345 }, { "epoch": 0.13419955396102007, "grad_norm": 2.623906530003471, "learning_rate": 9.964541925211613e-06, "loss": 0.7607, "step": 346 }, { "epoch": 0.13458741394356638, "grad_norm": 2.4456772754338454, "learning_rate": 9.963732462263979e-06, "loss": 0.753, "step": 347 }, { "epoch": 0.13497527392611267, "grad_norm": 3.0485256637763025, "learning_rate": 9.962913897416029e-06, "loss": 0.8364, "step": 348 }, { "epoch": 0.13536313390865898, "grad_norm": 3.3849519932401244, "learning_rate": 9.962086232168747e-06, "loss": 0.7889, "step": 349 }, { "epoch": 0.13575099389120526, "grad_norm": 2.460358159824383, "learning_rate": 9.961249468039806e-06, "loss": 0.788, "step": 350 }, { "epoch": 0.13613885387375158, "grad_norm": 3.848699844345297, "learning_rate": 9.960403606563568e-06, "loss": 0.8582, "step": 351 }, { "epoch": 0.1365267138562979, "grad_norm": 3.2163582489547906, "learning_rate": 9.959548649291071e-06, "loss": 0.8123, "step": 352 }, { "epoch": 0.13691457383884417, "grad_norm": 3.2795423473747896, "learning_rate": 9.958684597790031e-06, "loss": 0.7966, "step": 353 }, { "epoch": 0.13730243382139048, "grad_norm": 2.2485169283355217, "learning_rate": 9.957811453644848e-06, "loss": 0.8213, "step": 354 }, { "epoch": 0.13769029380393677, "grad_norm": 3.2088834381939653, "learning_rate": 9.956929218456586e-06, "loss": 0.7805, "step": 355 }, { "epoch": 0.13807815378648308, "grad_norm": 2.534977633110964, "learning_rate": 9.956037893842982e-06, "loss": 0.7754, "step": 356 }, { "epoch": 0.13846601376902937, "grad_norm": 3.203656087906015, "learning_rate": 9.955137481438442e-06, "loss": 0.842, "step": 357 }, { "epoch": 0.13885387375157568, "grad_norm": 2.5191329867807783, "learning_rate": 9.954227982894034e-06, "loss": 0.7797, "step": 358 }, { "epoch": 0.139241733734122, "grad_norm": 4.200018518038717, "learning_rate": 9.953309399877491e-06, "loss": 0.747, "step": 359 }, { "epoch": 0.13962959371666828, "grad_norm": 3.2021791772122232, "learning_rate": 9.952381734073197e-06, "loss": 0.8356, "step": 360 }, { "epoch": 0.1400174536992146, "grad_norm": 2.4445184671829705, "learning_rate": 9.951444987182195e-06, "loss": 0.8279, "step": 361 }, { "epoch": 0.14040531368176087, "grad_norm": 3.3353261921211055, "learning_rate": 9.950499160922184e-06, "loss": 0.7939, "step": 362 }, { "epoch": 0.1407931736643072, "grad_norm": 3.4035742852011626, "learning_rate": 9.949544257027503e-06, "loss": 0.9011, "step": 363 }, { "epoch": 0.1411810336468535, "grad_norm": 3.232056778862916, "learning_rate": 9.948580277249142e-06, "loss": 0.739, "step": 364 }, { "epoch": 0.14156889362939978, "grad_norm": 2.112557476127656, "learning_rate": 9.947607223354731e-06, "loss": 0.7718, "step": 365 }, { "epoch": 0.1419567536119461, "grad_norm": 2.9040032582829896, "learning_rate": 9.946625097128544e-06, "loss": 0.8565, "step": 366 }, { "epoch": 0.14234461359449238, "grad_norm": 2.952662191401493, "learning_rate": 9.945633900371483e-06, "loss": 0.804, "step": 367 }, { "epoch": 0.1427324735770387, "grad_norm": 2.8959105770036397, "learning_rate": 9.94463363490109e-06, "loss": 0.7811, "step": 368 }, { "epoch": 0.14312033355958498, "grad_norm": 3.0681184208396837, "learning_rate": 9.943624302551527e-06, "loss": 0.781, "step": 369 }, { "epoch": 0.1435081935421313, "grad_norm": 3.7360711131144364, "learning_rate": 9.942605905173593e-06, "loss": 0.8217, "step": 370 }, { "epoch": 0.1438960535246776, "grad_norm": 2.2962579054210357, "learning_rate": 9.941578444634699e-06, "loss": 0.7779, "step": 371 }, { "epoch": 0.1442839135072239, "grad_norm": 3.1384337390064587, "learning_rate": 9.940541922818882e-06, "loss": 0.8624, "step": 372 }, { "epoch": 0.1446717734897702, "grad_norm": 2.842361136457307, "learning_rate": 9.939496341626791e-06, "loss": 0.7922, "step": 373 }, { "epoch": 0.14505963347231648, "grad_norm": 3.216124090885531, "learning_rate": 9.938441702975689e-06, "loss": 0.7383, "step": 374 }, { "epoch": 0.1454474934548628, "grad_norm": 2.7735527484560487, "learning_rate": 9.937378008799448e-06, "loss": 0.7047, "step": 375 }, { "epoch": 0.14583535343740908, "grad_norm": 3.1429388030376377, "learning_rate": 9.93630526104854e-06, "loss": 0.8228, "step": 376 }, { "epoch": 0.1462232134199554, "grad_norm": 2.908311835495271, "learning_rate": 9.935223461690042e-06, "loss": 0.749, "step": 377 }, { "epoch": 0.1466110734025017, "grad_norm": 2.925237918252306, "learning_rate": 9.934132612707631e-06, "loss": 0.7438, "step": 378 }, { "epoch": 0.146998933385048, "grad_norm": 3.3280507080496426, "learning_rate": 9.933032716101576e-06, "loss": 0.8206, "step": 379 }, { "epoch": 0.1473867933675943, "grad_norm": 3.7345544794875996, "learning_rate": 9.931923773888734e-06, "loss": 0.8122, "step": 380 }, { "epoch": 0.1477746533501406, "grad_norm": 3.041501090198261, "learning_rate": 9.930805788102551e-06, "loss": 0.7942, "step": 381 }, { "epoch": 0.1481625133326869, "grad_norm": 2.4352654497106423, "learning_rate": 9.929678760793057e-06, "loss": 0.8013, "step": 382 }, { "epoch": 0.14855037331523321, "grad_norm": 2.234605055566757, "learning_rate": 9.928542694026862e-06, "loss": 0.7679, "step": 383 }, { "epoch": 0.1489382332977795, "grad_norm": 2.7190308139657358, "learning_rate": 9.927397589887144e-06, "loss": 0.7517, "step": 384 }, { "epoch": 0.1493260932803258, "grad_norm": 3.333124177241881, "learning_rate": 9.926243450473664e-06, "loss": 0.7714, "step": 385 }, { "epoch": 0.1497139532628721, "grad_norm": 2.4788457083670234, "learning_rate": 9.925080277902743e-06, "loss": 0.7846, "step": 386 }, { "epoch": 0.1501018132454184, "grad_norm": 3.226191534572386, "learning_rate": 9.923908074307267e-06, "loss": 0.7402, "step": 387 }, { "epoch": 0.1504896732279647, "grad_norm": 3.2618400223458126, "learning_rate": 9.922726841836685e-06, "loss": 0.8035, "step": 388 }, { "epoch": 0.150877533210511, "grad_norm": 2.852886905975956, "learning_rate": 9.921536582657002e-06, "loss": 0.7868, "step": 389 }, { "epoch": 0.15126539319305732, "grad_norm": 2.4364146969454006, "learning_rate": 9.920337298950767e-06, "loss": 0.7823, "step": 390 }, { "epoch": 0.1516532531756036, "grad_norm": 3.52799380320175, "learning_rate": 9.919128992917086e-06, "loss": 0.9203, "step": 391 }, { "epoch": 0.15204111315814992, "grad_norm": 2.9448988720362905, "learning_rate": 9.91791166677161e-06, "loss": 0.8004, "step": 392 }, { "epoch": 0.1524289731406962, "grad_norm": 2.7794412520727336, "learning_rate": 9.916685322746524e-06, "loss": 0.8044, "step": 393 }, { "epoch": 0.1528168331232425, "grad_norm": 3.6931422946095274, "learning_rate": 9.915449963090551e-06, "loss": 0.8396, "step": 394 }, { "epoch": 0.1532046931057888, "grad_norm": 2.3367036331481215, "learning_rate": 9.914205590068946e-06, "loss": 0.7191, "step": 395 }, { "epoch": 0.1535925530883351, "grad_norm": 3.7302333221311343, "learning_rate": 9.912952205963491e-06, "loss": 0.806, "step": 396 }, { "epoch": 0.15398041307088142, "grad_norm": 2.375465611705481, "learning_rate": 9.911689813072495e-06, "loss": 0.7512, "step": 397 }, { "epoch": 0.1543682730534277, "grad_norm": 3.0896177942564402, "learning_rate": 9.91041841371078e-06, "loss": 0.7691, "step": 398 }, { "epoch": 0.15475613303597402, "grad_norm": 2.9120052948037243, "learning_rate": 9.90913801020969e-06, "loss": 0.8648, "step": 399 }, { "epoch": 0.1551439930185203, "grad_norm": 3.057703075543722, "learning_rate": 9.907848604917075e-06, "loss": 0.7406, "step": 400 }, { "epoch": 0.15553185300106662, "grad_norm": 3.3266645722024695, "learning_rate": 9.906550200197288e-06, "loss": 0.8086, "step": 401 }, { "epoch": 0.15591971298361293, "grad_norm": 3.475818184215848, "learning_rate": 9.905242798431196e-06, "loss": 0.7494, "step": 402 }, { "epoch": 0.15630757296615921, "grad_norm": 2.7594118498383122, "learning_rate": 9.903926402016153e-06, "loss": 0.7878, "step": 403 }, { "epoch": 0.15669543294870553, "grad_norm": 2.7604113638639682, "learning_rate": 9.902601013366009e-06, "loss": 0.8403, "step": 404 }, { "epoch": 0.1570832929312518, "grad_norm": 3.1165930672495548, "learning_rate": 9.901266634911104e-06, "loss": 0.8135, "step": 405 }, { "epoch": 0.15747115291379812, "grad_norm": 6.135665453817984, "learning_rate": 9.899923269098262e-06, "loss": 0.9229, "step": 406 }, { "epoch": 0.1578590128963444, "grad_norm": 2.1115656905859814, "learning_rate": 9.898570918390789e-06, "loss": 0.7136, "step": 407 }, { "epoch": 0.15824687287889072, "grad_norm": 2.3813987234548737, "learning_rate": 9.897209585268459e-06, "loss": 0.8176, "step": 408 }, { "epoch": 0.15863473286143703, "grad_norm": 2.2999242081598954, "learning_rate": 9.895839272227529e-06, "loss": 0.7446, "step": 409 }, { "epoch": 0.15902259284398332, "grad_norm": 4.015239620945287, "learning_rate": 9.894459981780711e-06, "loss": 0.8724, "step": 410 }, { "epoch": 0.15941045282652963, "grad_norm": 3.8409052893425204, "learning_rate": 9.893071716457183e-06, "loss": 0.8395, "step": 411 }, { "epoch": 0.15979831280907592, "grad_norm": 3.1605687858157965, "learning_rate": 9.891674478802585e-06, "loss": 0.8265, "step": 412 }, { "epoch": 0.16018617279162223, "grad_norm": 3.891560949950753, "learning_rate": 9.890268271379e-06, "loss": 0.7886, "step": 413 }, { "epoch": 0.1605740327741685, "grad_norm": 3.0460429128601514, "learning_rate": 9.888853096764963e-06, "loss": 0.7711, "step": 414 }, { "epoch": 0.16096189275671482, "grad_norm": 3.095967323142966, "learning_rate": 9.887428957555457e-06, "loss": 0.7678, "step": 415 }, { "epoch": 0.16134975273926114, "grad_norm": 2.7045003243997403, "learning_rate": 9.885995856361895e-06, "loss": 0.8045, "step": 416 }, { "epoch": 0.16173761272180742, "grad_norm": 3.6023838114986027, "learning_rate": 9.884553795812128e-06, "loss": 0.844, "step": 417 }, { "epoch": 0.16212547270435373, "grad_norm": 2.9402147953999105, "learning_rate": 9.883102778550434e-06, "loss": 0.7977, "step": 418 }, { "epoch": 0.16251333268690002, "grad_norm": 3.640124173317137, "learning_rate": 9.881642807237515e-06, "loss": 0.811, "step": 419 }, { "epoch": 0.16290119266944633, "grad_norm": 3.4997486843632237, "learning_rate": 9.880173884550495e-06, "loss": 0.7502, "step": 420 }, { "epoch": 0.16328905265199264, "grad_norm": 2.4719563575983434, "learning_rate": 9.878696013182906e-06, "loss": 0.8418, "step": 421 }, { "epoch": 0.16367691263453893, "grad_norm": 3.369494652517918, "learning_rate": 9.877209195844692e-06, "loss": 0.7854, "step": 422 }, { "epoch": 0.16406477261708524, "grad_norm": 3.4270313551139457, "learning_rate": 9.875713435262205e-06, "loss": 0.7519, "step": 423 }, { "epoch": 0.16445263259963153, "grad_norm": 3.2077250336375314, "learning_rate": 9.874208734178187e-06, "loss": 0.8681, "step": 424 }, { "epoch": 0.16484049258217784, "grad_norm": 2.7275737520718266, "learning_rate": 9.872695095351784e-06, "loss": 0.8, "step": 425 }, { "epoch": 0.16522835256472412, "grad_norm": 2.6580391358505695, "learning_rate": 9.871172521558523e-06, "loss": 0.7794, "step": 426 }, { "epoch": 0.16561621254727044, "grad_norm": 2.5724819016580045, "learning_rate": 9.869641015590319e-06, "loss": 0.7766, "step": 427 }, { "epoch": 0.16600407252981675, "grad_norm": 2.7499253203447935, "learning_rate": 9.868100580255466e-06, "loss": 0.8301, "step": 428 }, { "epoch": 0.16639193251236303, "grad_norm": 4.121314150213681, "learning_rate": 9.86655121837863e-06, "loss": 0.8062, "step": 429 }, { "epoch": 0.16677979249490935, "grad_norm": 3.10516736828974, "learning_rate": 9.864992932800845e-06, "loss": 0.8118, "step": 430 }, { "epoch": 0.16716765247745563, "grad_norm": 2.7953274415881166, "learning_rate": 9.863425726379512e-06, "loss": 0.7168, "step": 431 }, { "epoch": 0.16755551246000194, "grad_norm": 2.117630493019634, "learning_rate": 9.861849601988384e-06, "loss": 0.7783, "step": 432 }, { "epoch": 0.16794337244254823, "grad_norm": 4.295035492764651, "learning_rate": 9.860264562517571e-06, "loss": 0.8371, "step": 433 }, { "epoch": 0.16833123242509454, "grad_norm": 3.4873302159777486, "learning_rate": 9.858670610873528e-06, "loss": 0.7602, "step": 434 }, { "epoch": 0.16871909240764085, "grad_norm": 2.268073789719776, "learning_rate": 9.857067749979057e-06, "loss": 0.7862, "step": 435 }, { "epoch": 0.16910695239018714, "grad_norm": 2.9765786452898544, "learning_rate": 9.855455982773288e-06, "loss": 0.7924, "step": 436 }, { "epoch": 0.16949481237273345, "grad_norm": 4.455061135650614, "learning_rate": 9.853835312211692e-06, "loss": 0.8528, "step": 437 }, { "epoch": 0.16988267235527973, "grad_norm": 3.082516106412517, "learning_rate": 9.852205741266058e-06, "loss": 0.7868, "step": 438 }, { "epoch": 0.17027053233782605, "grad_norm": 2.3837939842661706, "learning_rate": 9.8505672729245e-06, "loss": 0.7703, "step": 439 }, { "epoch": 0.17065839232037233, "grad_norm": 5.155661602614887, "learning_rate": 9.848919910191446e-06, "loss": 0.7872, "step": 440 }, { "epoch": 0.17104625230291864, "grad_norm": 3.3147046873098227, "learning_rate": 9.847263656087633e-06, "loss": 0.8031, "step": 441 }, { "epoch": 0.17143411228546496, "grad_norm": 2.767499731309263, "learning_rate": 9.845598513650104e-06, "loss": 0.7969, "step": 442 }, { "epoch": 0.17182197226801124, "grad_norm": 2.802333574787054, "learning_rate": 9.843924485932195e-06, "loss": 0.7358, "step": 443 }, { "epoch": 0.17220983225055755, "grad_norm": 2.55005168049186, "learning_rate": 9.84224157600354e-06, "loss": 0.6943, "step": 444 }, { "epoch": 0.17259769223310384, "grad_norm": 3.4986216468297955, "learning_rate": 9.840549786950058e-06, "loss": 0.8512, "step": 445 }, { "epoch": 0.17298555221565015, "grad_norm": 2.2936991633519974, "learning_rate": 9.83884912187395e-06, "loss": 0.8126, "step": 446 }, { "epoch": 0.17337341219819646, "grad_norm": 2.5371750387377223, "learning_rate": 9.837139583893693e-06, "loss": 0.8288, "step": 447 }, { "epoch": 0.17376127218074275, "grad_norm": 2.7826436919349793, "learning_rate": 9.835421176144035e-06, "loss": 0.7768, "step": 448 }, { "epoch": 0.17414913216328906, "grad_norm": 2.701454915642984, "learning_rate": 9.833693901775985e-06, "loss": 0.7587, "step": 449 }, { "epoch": 0.17453699214583535, "grad_norm": 4.784726490360127, "learning_rate": 9.831957763956814e-06, "loss": 0.8017, "step": 450 }, { "epoch": 0.17492485212838166, "grad_norm": 3.9167162924725427, "learning_rate": 9.830212765870043e-06, "loss": 0.88, "step": 451 }, { "epoch": 0.17531271211092794, "grad_norm": 2.2925638824245262, "learning_rate": 9.828458910715442e-06, "loss": 0.7526, "step": 452 }, { "epoch": 0.17570057209347426, "grad_norm": 2.6041739726935145, "learning_rate": 9.826696201709022e-06, "loss": 0.7633, "step": 453 }, { "epoch": 0.17608843207602057, "grad_norm": 2.6589461190900194, "learning_rate": 9.824924642083026e-06, "loss": 0.7816, "step": 454 }, { "epoch": 0.17647629205856685, "grad_norm": 2.6695669249566496, "learning_rate": 9.823144235085934e-06, "loss": 0.8082, "step": 455 }, { "epoch": 0.17686415204111317, "grad_norm": 2.9846947659602057, "learning_rate": 9.821354983982438e-06, "loss": 0.7069, "step": 456 }, { "epoch": 0.17725201202365945, "grad_norm": 2.7903399192646905, "learning_rate": 9.819556892053456e-06, "loss": 0.7951, "step": 457 }, { "epoch": 0.17763987200620576, "grad_norm": 3.674682546898267, "learning_rate": 9.817749962596115e-06, "loss": 0.8073, "step": 458 }, { "epoch": 0.17802773198875205, "grad_norm": 2.2292750125069745, "learning_rate": 9.815934198923746e-06, "loss": 0.7479, "step": 459 }, { "epoch": 0.17841559197129836, "grad_norm": 3.016494990625781, "learning_rate": 9.814109604365878e-06, "loss": 0.8134, "step": 460 }, { "epoch": 0.17880345195384467, "grad_norm": 2.122357779786006, "learning_rate": 9.812276182268236e-06, "loss": 0.7661, "step": 461 }, { "epoch": 0.17919131193639096, "grad_norm": 2.6073798077738544, "learning_rate": 9.810433935992734e-06, "loss": 0.7326, "step": 462 }, { "epoch": 0.17957917191893727, "grad_norm": 2.28598578853254, "learning_rate": 9.808582868917458e-06, "loss": 0.7814, "step": 463 }, { "epoch": 0.17996703190148355, "grad_norm": 3.389907129381922, "learning_rate": 9.806722984436676e-06, "loss": 0.74, "step": 464 }, { "epoch": 0.18035489188402987, "grad_norm": 2.6990252786744877, "learning_rate": 9.804854285960823e-06, "loss": 0.7941, "step": 465 }, { "epoch": 0.18074275186657618, "grad_norm": 3.19629746005018, "learning_rate": 9.802976776916493e-06, "loss": 0.7414, "step": 466 }, { "epoch": 0.18113061184912246, "grad_norm": 3.1470677143625987, "learning_rate": 9.801090460746442e-06, "loss": 0.7975, "step": 467 }, { "epoch": 0.18151847183166878, "grad_norm": 3.792113189504254, "learning_rate": 9.799195340909569e-06, "loss": 0.9043, "step": 468 }, { "epoch": 0.18190633181421506, "grad_norm": 2.983729194637099, "learning_rate": 9.79729142088092e-06, "loss": 0.8273, "step": 469 }, { "epoch": 0.18229419179676137, "grad_norm": 3.3508554668169084, "learning_rate": 9.795378704151675e-06, "loss": 0.7848, "step": 470 }, { "epoch": 0.18268205177930766, "grad_norm": 2.612490330980325, "learning_rate": 9.793457194229145e-06, "loss": 0.7272, "step": 471 }, { "epoch": 0.18306991176185397, "grad_norm": 2.4247100449322714, "learning_rate": 9.791526894636767e-06, "loss": 0.7776, "step": 472 }, { "epoch": 0.18345777174440028, "grad_norm": 2.83875374877576, "learning_rate": 9.789587808914094e-06, "loss": 0.7442, "step": 473 }, { "epoch": 0.18384563172694657, "grad_norm": 2.545942598993907, "learning_rate": 9.787639940616789e-06, "loss": 0.7352, "step": 474 }, { "epoch": 0.18423349170949288, "grad_norm": 2.6723953952589343, "learning_rate": 9.785683293316622e-06, "loss": 0.752, "step": 475 }, { "epoch": 0.18462135169203917, "grad_norm": 2.2343129425943946, "learning_rate": 9.783717870601458e-06, "loss": 0.7542, "step": 476 }, { "epoch": 0.18500921167458548, "grad_norm": 2.085641753702471, "learning_rate": 9.781743676075257e-06, "loss": 0.7617, "step": 477 }, { "epoch": 0.18539707165713176, "grad_norm": 2.2191221947182704, "learning_rate": 9.77976071335806e-06, "loss": 0.7428, "step": 478 }, { "epoch": 0.18578493163967807, "grad_norm": 3.0011293877549763, "learning_rate": 9.777768986085985e-06, "loss": 0.7541, "step": 479 }, { "epoch": 0.1861727916222244, "grad_norm": 3.8163688203055988, "learning_rate": 9.775768497911226e-06, "loss": 0.8672, "step": 480 }, { "epoch": 0.18656065160477067, "grad_norm": 2.419870419595455, "learning_rate": 9.77375925250204e-06, "loss": 0.704, "step": 481 }, { "epoch": 0.18694851158731698, "grad_norm": 2.931793385233233, "learning_rate": 9.771741253542742e-06, "loss": 0.7418, "step": 482 }, { "epoch": 0.18733637156986327, "grad_norm": 2.5382645238216983, "learning_rate": 9.769714504733695e-06, "loss": 0.6944, "step": 483 }, { "epoch": 0.18772423155240958, "grad_norm": 2.5892907008357255, "learning_rate": 9.767679009791312e-06, "loss": 0.8106, "step": 484 }, { "epoch": 0.1881120915349559, "grad_norm": 2.700729187822575, "learning_rate": 9.765634772448039e-06, "loss": 0.7786, "step": 485 }, { "epoch": 0.18849995151750218, "grad_norm": 2.2524260550641295, "learning_rate": 9.763581796452353e-06, "loss": 0.7425, "step": 486 }, { "epoch": 0.1888878115000485, "grad_norm": 2.868195830241581, "learning_rate": 9.76152008556876e-06, "loss": 0.8275, "step": 487 }, { "epoch": 0.18927567148259478, "grad_norm": 3.548288324481627, "learning_rate": 9.759449643577779e-06, "loss": 0.7777, "step": 488 }, { "epoch": 0.1896635314651411, "grad_norm": 2.768018300111607, "learning_rate": 9.757370474275938e-06, "loss": 0.7154, "step": 489 }, { "epoch": 0.19005139144768737, "grad_norm": 3.2095782563737267, "learning_rate": 9.755282581475769e-06, "loss": 0.788, "step": 490 }, { "epoch": 0.19043925143023369, "grad_norm": 2.578292972257673, "learning_rate": 9.753185969005802e-06, "loss": 0.7605, "step": 491 }, { "epoch": 0.19082711141278, "grad_norm": 2.7730825295216457, "learning_rate": 9.751080640710554e-06, "loss": 0.7106, "step": 492 }, { "epoch": 0.19121497139532628, "grad_norm": 3.098149233773937, "learning_rate": 9.748966600450526e-06, "loss": 0.7558, "step": 493 }, { "epoch": 0.1916028313778726, "grad_norm": 2.6909453652563693, "learning_rate": 9.746843852102191e-06, "loss": 0.7569, "step": 494 }, { "epoch": 0.19199069136041888, "grad_norm": 1.9975651679800268, "learning_rate": 9.744712399557992e-06, "loss": 0.7516, "step": 495 }, { "epoch": 0.1923785513429652, "grad_norm": 2.981544039397887, "learning_rate": 9.742572246726336e-06, "loss": 0.7416, "step": 496 }, { "epoch": 0.19276641132551148, "grad_norm": 2.9198826818698542, "learning_rate": 9.740423397531573e-06, "loss": 0.8024, "step": 497 }, { "epoch": 0.1931542713080578, "grad_norm": 2.8423457752570847, "learning_rate": 9.738265855914014e-06, "loss": 0.8143, "step": 498 }, { "epoch": 0.1935421312906041, "grad_norm": 2.6132563604368304, "learning_rate": 9.736099625829894e-06, "loss": 0.8213, "step": 499 }, { "epoch": 0.1939299912731504, "grad_norm": 3.477008872730743, "learning_rate": 9.733924711251393e-06, "loss": 0.8458, "step": 500 }, { "epoch": 0.1939299912731504, "eval_loss": 1.384662389755249, "eval_runtime": 6.4543, "eval_samples_per_second": 0.155, "eval_steps_per_second": 0.155, "step": 500 }, { "epoch": 0.1943178512556967, "grad_norm": 2.1287968766602523, "learning_rate": 9.731741116166607e-06, "loss": 0.7386, "step": 501 }, { "epoch": 0.19470571123824298, "grad_norm": 3.2215084063964463, "learning_rate": 9.729548844579552e-06, "loss": 0.8519, "step": 502 }, { "epoch": 0.1950935712207893, "grad_norm": 2.7792052963730676, "learning_rate": 9.727347900510155e-06, "loss": 0.7238, "step": 503 }, { "epoch": 0.1954814312033356, "grad_norm": 2.725788450894146, "learning_rate": 9.725138287994246e-06, "loss": 0.7587, "step": 504 }, { "epoch": 0.1958692911858819, "grad_norm": 2.5295427267493786, "learning_rate": 9.722920011083546e-06, "loss": 0.7518, "step": 505 }, { "epoch": 0.1962571511684282, "grad_norm": 2.7025973508277596, "learning_rate": 9.720693073845668e-06, "loss": 0.7357, "step": 506 }, { "epoch": 0.1966450111509745, "grad_norm": 2.1716713550004094, "learning_rate": 9.718457480364103e-06, "loss": 0.6888, "step": 507 }, { "epoch": 0.1970328711335208, "grad_norm": 2.3573372777875505, "learning_rate": 9.716213234738216e-06, "loss": 0.7491, "step": 508 }, { "epoch": 0.1974207311160671, "grad_norm": 3.032533975295906, "learning_rate": 9.713960341083237e-06, "loss": 0.7256, "step": 509 }, { "epoch": 0.1978085910986134, "grad_norm": 2.578296506396952, "learning_rate": 9.711698803530253e-06, "loss": 0.7394, "step": 510 }, { "epoch": 0.1981964510811597, "grad_norm": 2.835390140372231, "learning_rate": 9.709428626226204e-06, "loss": 0.8307, "step": 511 }, { "epoch": 0.198584311063706, "grad_norm": 3.5480757424907585, "learning_rate": 9.707149813333866e-06, "loss": 0.8162, "step": 512 }, { "epoch": 0.1989721710462523, "grad_norm": 2.758509972023256, "learning_rate": 9.704862369031857e-06, "loss": 0.8237, "step": 513 }, { "epoch": 0.1993600310287986, "grad_norm": 3.7088487312488803, "learning_rate": 9.70256629751462e-06, "loss": 0.7658, "step": 514 }, { "epoch": 0.1997478910113449, "grad_norm": 2.288583199323312, "learning_rate": 9.700261602992417e-06, "loss": 0.7172, "step": 515 }, { "epoch": 0.2001357509938912, "grad_norm": 2.634736139476763, "learning_rate": 9.69794828969132e-06, "loss": 0.771, "step": 516 }, { "epoch": 0.2005236109764375, "grad_norm": 2.873200454412611, "learning_rate": 9.695626361853207e-06, "loss": 0.7592, "step": 517 }, { "epoch": 0.20091147095898382, "grad_norm": 3.2139265846075427, "learning_rate": 9.693295823735754e-06, "loss": 0.7838, "step": 518 }, { "epoch": 0.2012993309415301, "grad_norm": 2.2994321795675234, "learning_rate": 9.690956679612422e-06, "loss": 0.7473, "step": 519 }, { "epoch": 0.20168719092407641, "grad_norm": 2.9733479859578384, "learning_rate": 9.688608933772454e-06, "loss": 0.8171, "step": 520 }, { "epoch": 0.2020750509066227, "grad_norm": 2.628807154587621, "learning_rate": 9.686252590520869e-06, "loss": 0.7775, "step": 521 }, { "epoch": 0.202462910889169, "grad_norm": 2.457048938585112, "learning_rate": 9.683887654178446e-06, "loss": 0.7574, "step": 522 }, { "epoch": 0.20285077087171532, "grad_norm": 2.7668571771765342, "learning_rate": 9.681514129081725e-06, "loss": 0.7633, "step": 523 }, { "epoch": 0.2032386308542616, "grad_norm": 2.5698839383565284, "learning_rate": 9.679132019582988e-06, "loss": 0.7496, "step": 524 }, { "epoch": 0.20362649083680792, "grad_norm": 2.8454466380235317, "learning_rate": 9.67674133005027e-06, "loss": 0.6795, "step": 525 }, { "epoch": 0.2040143508193542, "grad_norm": 2.4728449330506286, "learning_rate": 9.674342064867326e-06, "loss": 0.7444, "step": 526 }, { "epoch": 0.20440221080190052, "grad_norm": 3.0592971492531804, "learning_rate": 9.671934228433647e-06, "loss": 0.7393, "step": 527 }, { "epoch": 0.2047900707844468, "grad_norm": 3.1524973925396633, "learning_rate": 9.669517825164435e-06, "loss": 0.7749, "step": 528 }, { "epoch": 0.20517793076699312, "grad_norm": 3.677484330983854, "learning_rate": 9.667092859490599e-06, "loss": 0.8387, "step": 529 }, { "epoch": 0.20556579074953943, "grad_norm": 2.762679950829215, "learning_rate": 9.664659335858755e-06, "loss": 0.7584, "step": 530 }, { "epoch": 0.2059536507320857, "grad_norm": 2.4101437666212115, "learning_rate": 9.662217258731208e-06, "loss": 0.7409, "step": 531 }, { "epoch": 0.20634151071463203, "grad_norm": 2.7134583004575097, "learning_rate": 9.659766632585946e-06, "loss": 0.7301, "step": 532 }, { "epoch": 0.2067293706971783, "grad_norm": 2.088313529544365, "learning_rate": 9.657307461916637e-06, "loss": 0.7736, "step": 533 }, { "epoch": 0.20711723067972462, "grad_norm": 2.7308227331106067, "learning_rate": 9.654839751232612e-06, "loss": 0.7174, "step": 534 }, { "epoch": 0.2075050906622709, "grad_norm": 2.379507713766772, "learning_rate": 9.652363505058866e-06, "loss": 0.7824, "step": 535 }, { "epoch": 0.20789295064481722, "grad_norm": 2.57743059632076, "learning_rate": 9.649878727936044e-06, "loss": 0.7301, "step": 536 }, { "epoch": 0.20828081062736353, "grad_norm": 2.9105677913451986, "learning_rate": 9.647385424420435e-06, "loss": 0.74, "step": 537 }, { "epoch": 0.20866867060990982, "grad_norm": 2.632475236788382, "learning_rate": 9.644883599083959e-06, "loss": 0.7609, "step": 538 }, { "epoch": 0.20905653059245613, "grad_norm": 3.4279018455650103, "learning_rate": 9.642373256514164e-06, "loss": 0.8086, "step": 539 }, { "epoch": 0.20944439057500241, "grad_norm": 2.589464924865872, "learning_rate": 9.639854401314219e-06, "loss": 0.7494, "step": 540 }, { "epoch": 0.20983225055754873, "grad_norm": 3.1447054253152174, "learning_rate": 9.637327038102902e-06, "loss": 0.7841, "step": 541 }, { "epoch": 0.210220110540095, "grad_norm": 3.035634414923027, "learning_rate": 9.634791171514585e-06, "loss": 0.8213, "step": 542 }, { "epoch": 0.21060797052264132, "grad_norm": 2.100399692320294, "learning_rate": 9.632246806199242e-06, "loss": 0.7184, "step": 543 }, { "epoch": 0.21099583050518764, "grad_norm": 3.0063722073750774, "learning_rate": 9.629693946822423e-06, "loss": 0.8276, "step": 544 }, { "epoch": 0.21138369048773392, "grad_norm": 2.986586026393103, "learning_rate": 9.627132598065258e-06, "loss": 0.7889, "step": 545 }, { "epoch": 0.21177155047028023, "grad_norm": 2.905747484901666, "learning_rate": 9.624562764624445e-06, "loss": 0.7776, "step": 546 }, { "epoch": 0.21215941045282652, "grad_norm": 3.57181914872069, "learning_rate": 9.621984451212237e-06, "loss": 0.7913, "step": 547 }, { "epoch": 0.21254727043537283, "grad_norm": 2.7418904475043635, "learning_rate": 9.619397662556434e-06, "loss": 0.6921, "step": 548 }, { "epoch": 0.21293513041791914, "grad_norm": 2.8132229595967404, "learning_rate": 9.616802403400384e-06, "loss": 0.6729, "step": 549 }, { "epoch": 0.21332299040046543, "grad_norm": 3.406687232693481, "learning_rate": 9.614198678502965e-06, "loss": 0.8999, "step": 550 }, { "epoch": 0.21371085038301174, "grad_norm": 2.6699029031990635, "learning_rate": 9.611586492638573e-06, "loss": 0.7753, "step": 551 }, { "epoch": 0.21409871036555803, "grad_norm": 2.185182582764279, "learning_rate": 9.608965850597125e-06, "loss": 0.7274, "step": 552 }, { "epoch": 0.21448657034810434, "grad_norm": 2.8074349497280147, "learning_rate": 9.606336757184041e-06, "loss": 0.8163, "step": 553 }, { "epoch": 0.21487443033065062, "grad_norm": 2.73348883474593, "learning_rate": 9.603699217220239e-06, "loss": 0.7914, "step": 554 }, { "epoch": 0.21526229031319694, "grad_norm": 2.9327901537740098, "learning_rate": 9.601053235542124e-06, "loss": 0.7899, "step": 555 }, { "epoch": 0.21565015029574325, "grad_norm": 2.7204159479392724, "learning_rate": 9.598398817001585e-06, "loss": 0.7387, "step": 556 }, { "epoch": 0.21603801027828953, "grad_norm": 2.5856214246258267, "learning_rate": 9.595735966465973e-06, "loss": 0.6925, "step": 557 }, { "epoch": 0.21642587026083585, "grad_norm": 3.0241139376760677, "learning_rate": 9.59306468881811e-06, "loss": 0.7564, "step": 558 }, { "epoch": 0.21681373024338213, "grad_norm": 2.6337776986976364, "learning_rate": 9.590384988956264e-06, "loss": 0.7556, "step": 559 }, { "epoch": 0.21720159022592844, "grad_norm": 3.3448342450203117, "learning_rate": 9.587696871794148e-06, "loss": 0.784, "step": 560 }, { "epoch": 0.21758945020847473, "grad_norm": 3.2978532224392882, "learning_rate": 9.585000342260914e-06, "loss": 0.8251, "step": 561 }, { "epoch": 0.21797731019102104, "grad_norm": 2.432681183631936, "learning_rate": 9.582295405301131e-06, "loss": 0.7192, "step": 562 }, { "epoch": 0.21836517017356735, "grad_norm": 3.415412860943942, "learning_rate": 9.579582065874794e-06, "loss": 0.8013, "step": 563 }, { "epoch": 0.21875303015611364, "grad_norm": 2.3195414563444516, "learning_rate": 9.576860328957299e-06, "loss": 0.7227, "step": 564 }, { "epoch": 0.21914089013865995, "grad_norm": 2.8678533258952426, "learning_rate": 9.574130199539443e-06, "loss": 0.8556, "step": 565 }, { "epoch": 0.21952875012120623, "grad_norm": 2.090487922939531, "learning_rate": 9.571391682627413e-06, "loss": 0.7271, "step": 566 }, { "epoch": 0.21991661010375255, "grad_norm": 2.6987719685498393, "learning_rate": 9.568644783242771e-06, "loss": 0.83, "step": 567 }, { "epoch": 0.22030447008629886, "grad_norm": 2.558686362345426, "learning_rate": 9.565889506422457e-06, "loss": 0.7775, "step": 568 }, { "epoch": 0.22069233006884514, "grad_norm": 2.6988582804573094, "learning_rate": 9.563125857218766e-06, "loss": 0.8072, "step": 569 }, { "epoch": 0.22108019005139146, "grad_norm": 2.975201183044991, "learning_rate": 9.56035384069935e-06, "loss": 0.7927, "step": 570 }, { "epoch": 0.22146805003393774, "grad_norm": 2.2608274694760837, "learning_rate": 9.557573461947201e-06, "loss": 0.7531, "step": 571 }, { "epoch": 0.22185591001648405, "grad_norm": 2.738486323793019, "learning_rate": 9.554784726060647e-06, "loss": 0.7394, "step": 572 }, { "epoch": 0.22224376999903034, "grad_norm": 2.5923417767172077, "learning_rate": 9.551987638153339e-06, "loss": 0.8085, "step": 573 }, { "epoch": 0.22263162998157665, "grad_norm": 2.0165146090852897, "learning_rate": 9.549182203354241e-06, "loss": 0.7494, "step": 574 }, { "epoch": 0.22301948996412296, "grad_norm": 2.0530986871593706, "learning_rate": 9.546368426807628e-06, "loss": 0.753, "step": 575 }, { "epoch": 0.22340734994666925, "grad_norm": 3.1148955577920137, "learning_rate": 9.543546313673065e-06, "loss": 0.7848, "step": 576 }, { "epoch": 0.22379520992921556, "grad_norm": 2.6602007871097713, "learning_rate": 9.540715869125407e-06, "loss": 0.8047, "step": 577 }, { "epoch": 0.22418306991176185, "grad_norm": 3.044615760086635, "learning_rate": 9.537877098354787e-06, "loss": 0.8119, "step": 578 }, { "epoch": 0.22457092989430816, "grad_norm": 1.8879754777736502, "learning_rate": 9.5350300065666e-06, "loss": 0.7028, "step": 579 }, { "epoch": 0.22495878987685444, "grad_norm": 2.187785243926301, "learning_rate": 9.532174598981507e-06, "loss": 0.7053, "step": 580 }, { "epoch": 0.22534664985940075, "grad_norm": 3.1320636331363993, "learning_rate": 9.529310880835414e-06, "loss": 0.7666, "step": 581 }, { "epoch": 0.22573450984194707, "grad_norm": 2.476363796193543, "learning_rate": 9.526438857379463e-06, "loss": 0.7912, "step": 582 }, { "epoch": 0.22612236982449335, "grad_norm": 2.741739695610701, "learning_rate": 9.52355853388003e-06, "loss": 0.7687, "step": 583 }, { "epoch": 0.22651022980703966, "grad_norm": 3.7653230554415664, "learning_rate": 9.520669915618708e-06, "loss": 0.8398, "step": 584 }, { "epoch": 0.22689808978958595, "grad_norm": 2.839217284025819, "learning_rate": 9.5177730078923e-06, "loss": 0.8034, "step": 585 }, { "epoch": 0.22728594977213226, "grad_norm": 2.69181149056099, "learning_rate": 9.514867816012809e-06, "loss": 0.762, "step": 586 }, { "epoch": 0.22767380975467857, "grad_norm": 2.749572239304292, "learning_rate": 9.511954345307432e-06, "loss": 0.7351, "step": 587 }, { "epoch": 0.22806166973722486, "grad_norm": 1.9418825359987257, "learning_rate": 9.509032601118541e-06, "loss": 0.7106, "step": 588 }, { "epoch": 0.22844952971977117, "grad_norm": 2.3293023625044778, "learning_rate": 9.506102588803683e-06, "loss": 0.6808, "step": 589 }, { "epoch": 0.22883738970231746, "grad_norm": 3.151208174837305, "learning_rate": 9.503164313735566e-06, "loss": 0.7463, "step": 590 }, { "epoch": 0.22922524968486377, "grad_norm": 2.3732557728909587, "learning_rate": 9.500217781302048e-06, "loss": 0.7808, "step": 591 }, { "epoch": 0.22961310966741005, "grad_norm": 2.3386243550541503, "learning_rate": 9.497262996906126e-06, "loss": 0.7134, "step": 592 }, { "epoch": 0.23000096964995637, "grad_norm": 2.2931769854037993, "learning_rate": 9.494299965965935e-06, "loss": 0.7447, "step": 593 }, { "epoch": 0.23038882963250268, "grad_norm": 2.375253906028915, "learning_rate": 9.491328693914723e-06, "loss": 0.7618, "step": 594 }, { "epoch": 0.23077668961504896, "grad_norm": 2.399922114681762, "learning_rate": 9.488349186200858e-06, "loss": 0.7745, "step": 595 }, { "epoch": 0.23116454959759528, "grad_norm": 2.7957324327228403, "learning_rate": 9.485361448287804e-06, "loss": 0.7564, "step": 596 }, { "epoch": 0.23155240958014156, "grad_norm": 3.033209560077148, "learning_rate": 9.482365485654118e-06, "loss": 0.8268, "step": 597 }, { "epoch": 0.23194026956268787, "grad_norm": 2.2040555109237285, "learning_rate": 9.479361303793441e-06, "loss": 0.7761, "step": 598 }, { "epoch": 0.23232812954523416, "grad_norm": 2.0514282154327756, "learning_rate": 9.476348908214482e-06, "loss": 0.7209, "step": 599 }, { "epoch": 0.23271598952778047, "grad_norm": 1.9973768920607764, "learning_rate": 9.47332830444101e-06, "loss": 0.7108, "step": 600 }, { "epoch": 0.23310384951032678, "grad_norm": 2.8321382849587757, "learning_rate": 9.470299498011851e-06, "loss": 0.7711, "step": 601 }, { "epoch": 0.23349170949287307, "grad_norm": 2.815646273667529, "learning_rate": 9.46726249448087e-06, "loss": 0.7828, "step": 602 }, { "epoch": 0.23387956947541938, "grad_norm": 4.10517843592682, "learning_rate": 9.464217299416956e-06, "loss": 0.7385, "step": 603 }, { "epoch": 0.23426742945796566, "grad_norm": 3.415533652036187, "learning_rate": 9.46116391840403e-06, "loss": 0.7401, "step": 604 }, { "epoch": 0.23465528944051198, "grad_norm": 2.519305435550141, "learning_rate": 9.458102357041017e-06, "loss": 0.7613, "step": 605 }, { "epoch": 0.2350431494230583, "grad_norm": 2.8512000314057526, "learning_rate": 9.45503262094184e-06, "loss": 0.7033, "step": 606 }, { "epoch": 0.23543100940560457, "grad_norm": 2.4935269021523014, "learning_rate": 9.451954715735416e-06, "loss": 0.7955, "step": 607 }, { "epoch": 0.2358188693881509, "grad_norm": 3.017576558056701, "learning_rate": 9.448868647065644e-06, "loss": 0.8237, "step": 608 }, { "epoch": 0.23620672937069717, "grad_norm": 3.0005546727655945, "learning_rate": 9.445774420591382e-06, "loss": 0.7967, "step": 609 }, { "epoch": 0.23659458935324348, "grad_norm": 2.8046457065073698, "learning_rate": 9.442672041986456e-06, "loss": 0.8166, "step": 610 }, { "epoch": 0.23698244933578977, "grad_norm": 3.1076367811163452, "learning_rate": 9.43956151693964e-06, "loss": 0.7963, "step": 611 }, { "epoch": 0.23737030931833608, "grad_norm": 2.245434271541541, "learning_rate": 9.436442851154642e-06, "loss": 0.705, "step": 612 }, { "epoch": 0.2377581693008824, "grad_norm": 3.080504284958297, "learning_rate": 9.433316050350099e-06, "loss": 0.7606, "step": 613 }, { "epoch": 0.23814602928342868, "grad_norm": 2.9322440916561674, "learning_rate": 9.430181120259566e-06, "loss": 0.8175, "step": 614 }, { "epoch": 0.238533889265975, "grad_norm": 2.645912496622248, "learning_rate": 9.427038066631502e-06, "loss": 0.7935, "step": 615 }, { "epoch": 0.23892174924852128, "grad_norm": 2.434556208085347, "learning_rate": 9.423886895229266e-06, "loss": 0.7537, "step": 616 }, { "epoch": 0.2393096092310676, "grad_norm": 3.0943200261791346, "learning_rate": 9.420727611831098e-06, "loss": 0.8041, "step": 617 }, { "epoch": 0.23969746921361387, "grad_norm": 2.959185379212764, "learning_rate": 9.417560222230115e-06, "loss": 0.7929, "step": 618 }, { "epoch": 0.24008532919616019, "grad_norm": 2.614591690738128, "learning_rate": 9.414384732234301e-06, "loss": 0.7453, "step": 619 }, { "epoch": 0.2404731891787065, "grad_norm": 2.8182936782850705, "learning_rate": 9.411201147666486e-06, "loss": 0.7813, "step": 620 }, { "epoch": 0.24086104916125278, "grad_norm": 2.556776916308077, "learning_rate": 9.408009474364353e-06, "loss": 0.7802, "step": 621 }, { "epoch": 0.2412489091437991, "grad_norm": 2.9316268703795734, "learning_rate": 9.404809718180408e-06, "loss": 0.8046, "step": 622 }, { "epoch": 0.24163676912634538, "grad_norm": 2.7345396432494056, "learning_rate": 9.401601884981983e-06, "loss": 0.7216, "step": 623 }, { "epoch": 0.2420246291088917, "grad_norm": 3.0456277623285546, "learning_rate": 9.39838598065122e-06, "loss": 0.6935, "step": 624 }, { "epoch": 0.24241248909143798, "grad_norm": 2.938150174208275, "learning_rate": 9.39516201108506e-06, "loss": 0.7372, "step": 625 }, { "epoch": 0.2428003490739843, "grad_norm": 2.4337200328817263, "learning_rate": 9.391929982195233e-06, "loss": 0.6739, "step": 626 }, { "epoch": 0.2431882090565306, "grad_norm": 2.143758511445213, "learning_rate": 9.38868989990825e-06, "loss": 0.7033, "step": 627 }, { "epoch": 0.2435760690390769, "grad_norm": 3.5121509223015375, "learning_rate": 9.385441770165385e-06, "loss": 0.7937, "step": 628 }, { "epoch": 0.2439639290216232, "grad_norm": 3.9044528925748585, "learning_rate": 9.382185598922674e-06, "loss": 0.8034, "step": 629 }, { "epoch": 0.24435178900416948, "grad_norm": 2.5938491705830753, "learning_rate": 9.378921392150893e-06, "loss": 0.7743, "step": 630 }, { "epoch": 0.2447396489867158, "grad_norm": 2.990108787341999, "learning_rate": 9.375649155835554e-06, "loss": 0.7647, "step": 631 }, { "epoch": 0.2451275089692621, "grad_norm": 2.891124825554601, "learning_rate": 9.372368895976896e-06, "loss": 0.6766, "step": 632 }, { "epoch": 0.2455153689518084, "grad_norm": 2.299692240142004, "learning_rate": 9.369080618589866e-06, "loss": 0.7083, "step": 633 }, { "epoch": 0.2459032289343547, "grad_norm": 3.318145480252322, "learning_rate": 9.365784329704114e-06, "loss": 0.8101, "step": 634 }, { "epoch": 0.246291088916901, "grad_norm": 4.246702913705579, "learning_rate": 9.362480035363987e-06, "loss": 0.9138, "step": 635 }, { "epoch": 0.2466789488994473, "grad_norm": 2.248245165756438, "learning_rate": 9.3591677416285e-06, "loss": 0.7516, "step": 636 }, { "epoch": 0.2470668088819936, "grad_norm": 2.8552770261419034, "learning_rate": 9.35584745457134e-06, "loss": 0.8157, "step": 637 }, { "epoch": 0.2474546688645399, "grad_norm": 3.7721185299393944, "learning_rate": 9.352519180280862e-06, "loss": 0.7527, "step": 638 }, { "epoch": 0.2478425288470862, "grad_norm": 3.145973395002673, "learning_rate": 9.34918292486005e-06, "loss": 0.8037, "step": 639 }, { "epoch": 0.2482303888296325, "grad_norm": 3.0831422426031168, "learning_rate": 9.345838694426535e-06, "loss": 0.746, "step": 640 }, { "epoch": 0.2486182488121788, "grad_norm": 3.442889380458862, "learning_rate": 9.342486495112566e-06, "loss": 0.7879, "step": 641 }, { "epoch": 0.2490061087947251, "grad_norm": 2.0811721348636127, "learning_rate": 9.339126333065008e-06, "loss": 0.8014, "step": 642 }, { "epoch": 0.2493939687772714, "grad_norm": 2.3299598134966053, "learning_rate": 9.335758214445323e-06, "loss": 0.7037, "step": 643 }, { "epoch": 0.2497818287598177, "grad_norm": 2.5114823729735125, "learning_rate": 9.332382145429568e-06, "loss": 0.744, "step": 644 }, { "epoch": 0.250169688742364, "grad_norm": 2.9997760183848627, "learning_rate": 9.328998132208373e-06, "loss": 0.7956, "step": 645 }, { "epoch": 0.2505575487249103, "grad_norm": 2.703085043633713, "learning_rate": 9.325606180986938e-06, "loss": 0.8042, "step": 646 }, { "epoch": 0.25094540870745663, "grad_norm": 2.3668935214073223, "learning_rate": 9.32220629798502e-06, "loss": 0.7422, "step": 647 }, { "epoch": 0.2513332686900029, "grad_norm": 2.7964810243539846, "learning_rate": 9.318798489436917e-06, "loss": 0.6978, "step": 648 }, { "epoch": 0.2517211286725492, "grad_norm": 2.3605863504226208, "learning_rate": 9.315382761591463e-06, "loss": 0.7511, "step": 649 }, { "epoch": 0.2521089886550955, "grad_norm": 2.331607735378988, "learning_rate": 9.311959120712012e-06, "loss": 0.7606, "step": 650 }, { "epoch": 0.2524968486376418, "grad_norm": 2.4810589733129738, "learning_rate": 9.308527573076425e-06, "loss": 0.7394, "step": 651 }, { "epoch": 0.25288470862018814, "grad_norm": 3.294486649865825, "learning_rate": 9.30508812497707e-06, "loss": 0.7931, "step": 652 }, { "epoch": 0.2532725686027344, "grad_norm": 3.1513995462861706, "learning_rate": 9.301640782720792e-06, "loss": 0.823, "step": 653 }, { "epoch": 0.2536604285852807, "grad_norm": 3.0721983250698806, "learning_rate": 9.298185552628917e-06, "loss": 0.7126, "step": 654 }, { "epoch": 0.254048288567827, "grad_norm": 2.657954255455216, "learning_rate": 9.294722441037238e-06, "loss": 0.7412, "step": 655 }, { "epoch": 0.25443614855037333, "grad_norm": 2.980732675436887, "learning_rate": 9.291251454295989e-06, "loss": 0.7216, "step": 656 }, { "epoch": 0.25482400853291964, "grad_norm": 2.478541316132283, "learning_rate": 9.287772598769855e-06, "loss": 0.7463, "step": 657 }, { "epoch": 0.2552118685154659, "grad_norm": 2.0477473384062406, "learning_rate": 9.284285880837947e-06, "loss": 0.7472, "step": 658 }, { "epoch": 0.2555997284980122, "grad_norm": 2.710001918930043, "learning_rate": 9.28079130689379e-06, "loss": 0.7319, "step": 659 }, { "epoch": 0.2559875884805585, "grad_norm": 2.736296131954191, "learning_rate": 9.277288883345318e-06, "loss": 0.7603, "step": 660 }, { "epoch": 0.25637544846310484, "grad_norm": 2.8888069318062333, "learning_rate": 9.273778616614857e-06, "loss": 0.7199, "step": 661 }, { "epoch": 0.2567633084456511, "grad_norm": 2.2828570231262786, "learning_rate": 9.270260513139116e-06, "loss": 0.7211, "step": 662 }, { "epoch": 0.2571511684281974, "grad_norm": 2.993972236374614, "learning_rate": 9.266734579369172e-06, "loss": 0.8034, "step": 663 }, { "epoch": 0.2575390284107437, "grad_norm": 2.7768213673314865, "learning_rate": 9.263200821770462e-06, "loss": 0.7879, "step": 664 }, { "epoch": 0.25792688839329003, "grad_norm": 3.1285650712089104, "learning_rate": 9.25965924682277e-06, "loss": 0.8451, "step": 665 }, { "epoch": 0.25831474837583634, "grad_norm": 1.9504377475955978, "learning_rate": 9.256109861020213e-06, "loss": 0.6404, "step": 666 }, { "epoch": 0.2587026083583826, "grad_norm": 2.36812861265726, "learning_rate": 9.252552670871232e-06, "loss": 0.7857, "step": 667 }, { "epoch": 0.2590904683409289, "grad_norm": 2.2022205718383794, "learning_rate": 9.248987682898576e-06, "loss": 0.7401, "step": 668 }, { "epoch": 0.2594783283234752, "grad_norm": 3.5671761490077656, "learning_rate": 9.245414903639295e-06, "loss": 0.7685, "step": 669 }, { "epoch": 0.25986618830602154, "grad_norm": 2.2921568110776076, "learning_rate": 9.241834339644726e-06, "loss": 0.7475, "step": 670 }, { "epoch": 0.26025404828856785, "grad_norm": 3.702781278202578, "learning_rate": 9.23824599748048e-06, "loss": 0.8697, "step": 671 }, { "epoch": 0.2606419082711141, "grad_norm": 2.5422481660091374, "learning_rate": 9.234649883726432e-06, "loss": 0.784, "step": 672 }, { "epoch": 0.2610297682536604, "grad_norm": 2.3696818971501203, "learning_rate": 9.231046004976704e-06, "loss": 0.7878, "step": 673 }, { "epoch": 0.26141762823620673, "grad_norm": 3.2733510019717684, "learning_rate": 9.22743436783966e-06, "loss": 0.7528, "step": 674 }, { "epoch": 0.26180548821875305, "grad_norm": 2.670476361862656, "learning_rate": 9.223814978937888e-06, "loss": 0.7139, "step": 675 }, { "epoch": 0.2621933482012993, "grad_norm": 3.657147489288099, "learning_rate": 9.220187844908194e-06, "loss": 0.7927, "step": 676 }, { "epoch": 0.2625812081838456, "grad_norm": 3.292094707953219, "learning_rate": 9.216552972401582e-06, "loss": 0.7873, "step": 677 }, { "epoch": 0.26296906816639193, "grad_norm": 3.4061759263390243, "learning_rate": 9.212910368083246e-06, "loss": 0.7807, "step": 678 }, { "epoch": 0.26335692814893824, "grad_norm": 2.2416025360685743, "learning_rate": 9.209260038632562e-06, "loss": 0.7017, "step": 679 }, { "epoch": 0.26374478813148455, "grad_norm": 2.84160623944316, "learning_rate": 9.205601990743068e-06, "loss": 0.7571, "step": 680 }, { "epoch": 0.2641326481140308, "grad_norm": 3.084165878185236, "learning_rate": 9.201936231122453e-06, "loss": 0.8692, "step": 681 }, { "epoch": 0.2645205080965771, "grad_norm": 2.9492657343019797, "learning_rate": 9.198262766492554e-06, "loss": 0.7209, "step": 682 }, { "epoch": 0.26490836807912344, "grad_norm": 1.866381016451374, "learning_rate": 9.194581603589327e-06, "loss": 0.7006, "step": 683 }, { "epoch": 0.26529622806166975, "grad_norm": 2.70198714170136, "learning_rate": 9.190892749162854e-06, "loss": 0.7344, "step": 684 }, { "epoch": 0.26568408804421606, "grad_norm": 2.443308833860274, "learning_rate": 9.187196209977314e-06, "loss": 0.7273, "step": 685 }, { "epoch": 0.2660719480267623, "grad_norm": 3.381971481339725, "learning_rate": 9.18349199281098e-06, "loss": 0.7182, "step": 686 }, { "epoch": 0.26645980800930863, "grad_norm": 2.117740822525642, "learning_rate": 9.179780104456205e-06, "loss": 0.7761, "step": 687 }, { "epoch": 0.26684766799185494, "grad_norm": 2.1258926638766247, "learning_rate": 9.176060551719402e-06, "loss": 0.8037, "step": 688 }, { "epoch": 0.26723552797440125, "grad_norm": 2.9002654528671266, "learning_rate": 9.172333341421046e-06, "loss": 0.7356, "step": 689 }, { "epoch": 0.26762338795694757, "grad_norm": 3.712063484281102, "learning_rate": 9.168598480395653e-06, "loss": 0.8461, "step": 690 }, { "epoch": 0.2680112479394938, "grad_norm": 3.0548166053969177, "learning_rate": 9.16485597549176e-06, "loss": 0.8468, "step": 691 }, { "epoch": 0.26839910792204014, "grad_norm": 3.127918883883917, "learning_rate": 9.16110583357193e-06, "loss": 0.7983, "step": 692 }, { "epoch": 0.26878696790458645, "grad_norm": 3.18651372173249, "learning_rate": 9.157348061512728e-06, "loss": 0.7257, "step": 693 }, { "epoch": 0.26917482788713276, "grad_norm": 2.2827407035003313, "learning_rate": 9.153582666204702e-06, "loss": 0.6967, "step": 694 }, { "epoch": 0.269562687869679, "grad_norm": 2.2148657068990936, "learning_rate": 9.149809654552387e-06, "loss": 0.7701, "step": 695 }, { "epoch": 0.26995054785222533, "grad_norm": 2.2466375554986575, "learning_rate": 9.146029033474284e-06, "loss": 0.6855, "step": 696 }, { "epoch": 0.27033840783477164, "grad_norm": 2.6590754518003528, "learning_rate": 9.142240809902841e-06, "loss": 0.6635, "step": 697 }, { "epoch": 0.27072626781731796, "grad_norm": 2.380172042795461, "learning_rate": 9.138444990784455e-06, "loss": 0.7443, "step": 698 }, { "epoch": 0.27111412779986427, "grad_norm": 1.9819093519138362, "learning_rate": 9.13464158307944e-06, "loss": 0.7025, "step": 699 }, { "epoch": 0.2715019877824105, "grad_norm": 2.3861213497425697, "learning_rate": 9.130830593762037e-06, "loss": 0.7284, "step": 700 }, { "epoch": 0.27188984776495684, "grad_norm": 2.592690654881493, "learning_rate": 9.12701202982038e-06, "loss": 0.7448, "step": 701 }, { "epoch": 0.27227770774750315, "grad_norm": 2.5381469412392628, "learning_rate": 9.123185898256497e-06, "loss": 0.696, "step": 702 }, { "epoch": 0.27266556773004946, "grad_norm": 2.7141893426209514, "learning_rate": 9.119352206086292e-06, "loss": 0.7605, "step": 703 }, { "epoch": 0.2730534277125958, "grad_norm": 3.228351320238316, "learning_rate": 9.115510960339533e-06, "loss": 0.7209, "step": 704 }, { "epoch": 0.27344128769514203, "grad_norm": 2.2387650144152693, "learning_rate": 9.111662168059836e-06, "loss": 0.7766, "step": 705 }, { "epoch": 0.27382914767768834, "grad_norm": 2.3429838369806704, "learning_rate": 9.107805836304658e-06, "loss": 0.7152, "step": 706 }, { "epoch": 0.27421700766023466, "grad_norm": 2.851733747383336, "learning_rate": 9.10394197214528e-06, "loss": 0.7412, "step": 707 }, { "epoch": 0.27460486764278097, "grad_norm": 2.600732846112595, "learning_rate": 9.100070582666796e-06, "loss": 0.7418, "step": 708 }, { "epoch": 0.2749927276253273, "grad_norm": 2.2318878632962758, "learning_rate": 9.096191674968095e-06, "loss": 0.7039, "step": 709 }, { "epoch": 0.27538058760787354, "grad_norm": 3.9442652688274933, "learning_rate": 9.09230525616186e-06, "loss": 0.8429, "step": 710 }, { "epoch": 0.27576844759041985, "grad_norm": 2.8140667293159214, "learning_rate": 9.088411333374539e-06, "loss": 0.7651, "step": 711 }, { "epoch": 0.27615630757296616, "grad_norm": 2.506108066025076, "learning_rate": 9.084509913746342e-06, "loss": 0.7405, "step": 712 }, { "epoch": 0.2765441675555125, "grad_norm": 2.4047967704157514, "learning_rate": 9.08060100443123e-06, "loss": 0.7533, "step": 713 }, { "epoch": 0.27693202753805873, "grad_norm": 3.210697233153602, "learning_rate": 9.076684612596891e-06, "loss": 0.8181, "step": 714 }, { "epoch": 0.27731988752060505, "grad_norm": 2.1055314607703157, "learning_rate": 9.07276074542474e-06, "loss": 0.7186, "step": 715 }, { "epoch": 0.27770774750315136, "grad_norm": 2.700784721456371, "learning_rate": 9.068829410109893e-06, "loss": 0.6987, "step": 716 }, { "epoch": 0.27809560748569767, "grad_norm": 2.4060669942318977, "learning_rate": 9.064890613861168e-06, "loss": 0.7041, "step": 717 }, { "epoch": 0.278483467468244, "grad_norm": 2.1621939853458354, "learning_rate": 9.060944363901057e-06, "loss": 0.698, "step": 718 }, { "epoch": 0.27887132745079024, "grad_norm": 2.828682060151449, "learning_rate": 9.05699066746572e-06, "loss": 0.7843, "step": 719 }, { "epoch": 0.27925918743333655, "grad_norm": 1.9960805618730986, "learning_rate": 9.05302953180498e-06, "loss": 0.748, "step": 720 }, { "epoch": 0.27964704741588287, "grad_norm": 2.630472085151921, "learning_rate": 9.04906096418229e-06, "loss": 0.7468, "step": 721 }, { "epoch": 0.2800349073984292, "grad_norm": 2.533220806264575, "learning_rate": 9.045084971874738e-06, "loss": 0.7762, "step": 722 }, { "epoch": 0.2804227673809755, "grad_norm": 2.2161561947812882, "learning_rate": 9.041101562173023e-06, "loss": 0.7536, "step": 723 }, { "epoch": 0.28081062736352175, "grad_norm": 2.5440660359946263, "learning_rate": 9.037110742381445e-06, "loss": 0.765, "step": 724 }, { "epoch": 0.28119848734606806, "grad_norm": 2.1895424258254796, "learning_rate": 9.033112519817897e-06, "loss": 0.7036, "step": 725 }, { "epoch": 0.2815863473286144, "grad_norm": 2.1811609283343723, "learning_rate": 9.02910690181384e-06, "loss": 0.7013, "step": 726 }, { "epoch": 0.2819742073111607, "grad_norm": 2.384621731908707, "learning_rate": 9.0250938957143e-06, "loss": 0.7659, "step": 727 }, { "epoch": 0.282362067293707, "grad_norm": 2.2106617222916918, "learning_rate": 9.021073508877845e-06, "loss": 0.7219, "step": 728 }, { "epoch": 0.28274992727625325, "grad_norm": 2.079490190842833, "learning_rate": 9.017045748676584e-06, "loss": 0.7564, "step": 729 }, { "epoch": 0.28313778725879957, "grad_norm": 3.170963787560117, "learning_rate": 9.013010622496145e-06, "loss": 0.7527, "step": 730 }, { "epoch": 0.2835256472413459, "grad_norm": 2.418097780035215, "learning_rate": 9.008968137735655e-06, "loss": 0.7529, "step": 731 }, { "epoch": 0.2839135072238922, "grad_norm": 3.1508067112086153, "learning_rate": 9.004918301807746e-06, "loss": 0.8076, "step": 732 }, { "epoch": 0.28430136720643845, "grad_norm": 2.819192717498469, "learning_rate": 9.000861122138518e-06, "loss": 0.7298, "step": 733 }, { "epoch": 0.28468922718898476, "grad_norm": 2.6668112187897353, "learning_rate": 8.996796606167549e-06, "loss": 0.6593, "step": 734 }, { "epoch": 0.2850770871715311, "grad_norm": 3.052988013239498, "learning_rate": 8.99272476134786e-06, "loss": 0.7642, "step": 735 }, { "epoch": 0.2854649471540774, "grad_norm": 2.6338715745056307, "learning_rate": 8.988645595145913e-06, "loss": 0.7416, "step": 736 }, { "epoch": 0.2858528071366237, "grad_norm": 2.2984198255838946, "learning_rate": 8.9845591150416e-06, "loss": 0.7788, "step": 737 }, { "epoch": 0.28624066711916996, "grad_norm": 2.732956482464638, "learning_rate": 8.98046532852822e-06, "loss": 0.7606, "step": 738 }, { "epoch": 0.28662852710171627, "grad_norm": 3.1057354143268907, "learning_rate": 8.976364243112468e-06, "loss": 0.7298, "step": 739 }, { "epoch": 0.2870163870842626, "grad_norm": 1.7640201592020717, "learning_rate": 8.972255866314425e-06, "loss": 0.7183, "step": 740 }, { "epoch": 0.2874042470668089, "grad_norm": 2.6098143077145837, "learning_rate": 8.968140205667544e-06, "loss": 0.6516, "step": 741 }, { "epoch": 0.2877921070493552, "grad_norm": 2.6748832060644716, "learning_rate": 8.964017268718632e-06, "loss": 0.7623, "step": 742 }, { "epoch": 0.28817996703190146, "grad_norm": 3.280904882406831, "learning_rate": 8.959887063027837e-06, "loss": 0.8322, "step": 743 }, { "epoch": 0.2885678270144478, "grad_norm": 3.3387174121467265, "learning_rate": 8.95574959616864e-06, "loss": 0.8525, "step": 744 }, { "epoch": 0.2889556869969941, "grad_norm": 2.447597842855936, "learning_rate": 8.951604875727833e-06, "loss": 0.6984, "step": 745 }, { "epoch": 0.2893435469795404, "grad_norm": 2.393747247185669, "learning_rate": 8.94745290930551e-06, "loss": 0.7563, "step": 746 }, { "epoch": 0.2897314069620867, "grad_norm": 2.4959310178316287, "learning_rate": 8.94329370451505e-06, "loss": 0.7178, "step": 747 }, { "epoch": 0.29011926694463297, "grad_norm": 2.355416389407571, "learning_rate": 8.93912726898311e-06, "loss": 0.671, "step": 748 }, { "epoch": 0.2905071269271793, "grad_norm": 2.3891201245696507, "learning_rate": 8.934953610349599e-06, "loss": 0.6944, "step": 749 }, { "epoch": 0.2908949869097256, "grad_norm": 2.411712657639014, "learning_rate": 8.930772736267675e-06, "loss": 0.7506, "step": 750 }, { "epoch": 0.2912828468922719, "grad_norm": 2.069148757105507, "learning_rate": 8.926584654403725e-06, "loss": 0.6855, "step": 751 }, { "epoch": 0.29167070687481816, "grad_norm": 2.7733321479040938, "learning_rate": 8.922389372437357e-06, "loss": 0.7357, "step": 752 }, { "epoch": 0.2920585668573645, "grad_norm": 2.649457955986044, "learning_rate": 8.918186898061377e-06, "loss": 0.699, "step": 753 }, { "epoch": 0.2924464268399108, "grad_norm": 2.238085445858793, "learning_rate": 8.91397723898178e-06, "loss": 0.7085, "step": 754 }, { "epoch": 0.2928342868224571, "grad_norm": 2.134811261114108, "learning_rate": 8.909760402917738e-06, "loss": 0.7604, "step": 755 }, { "epoch": 0.2932221468050034, "grad_norm": 2.777900634322574, "learning_rate": 8.90553639760158e-06, "loss": 0.8455, "step": 756 }, { "epoch": 0.29361000678754967, "grad_norm": 2.2259490217031033, "learning_rate": 8.901305230778783e-06, "loss": 0.7527, "step": 757 }, { "epoch": 0.293997866770096, "grad_norm": 2.297921340374574, "learning_rate": 8.897066910207958e-06, "loss": 0.7022, "step": 758 }, { "epoch": 0.2943857267526423, "grad_norm": 2.011592028257856, "learning_rate": 8.892821443660831e-06, "loss": 0.628, "step": 759 }, { "epoch": 0.2947735867351886, "grad_norm": 2.7645958744446557, "learning_rate": 8.888568838922231e-06, "loss": 0.7521, "step": 760 }, { "epoch": 0.2951614467177349, "grad_norm": 2.6199154878673356, "learning_rate": 8.884309103790078e-06, "loss": 0.6891, "step": 761 }, { "epoch": 0.2955493067002812, "grad_norm": 2.355356673458496, "learning_rate": 8.880042246075366e-06, "loss": 0.732, "step": 762 }, { "epoch": 0.2959371666828275, "grad_norm": 2.2085556524891614, "learning_rate": 8.875768273602148e-06, "loss": 0.6935, "step": 763 }, { "epoch": 0.2963250266653738, "grad_norm": 2.0420643435735015, "learning_rate": 8.871487194207527e-06, "loss": 0.5968, "step": 764 }, { "epoch": 0.2967128866479201, "grad_norm": 1.9713293556494578, "learning_rate": 8.867199015741632e-06, "loss": 0.7599, "step": 765 }, { "epoch": 0.29710074663046643, "grad_norm": 2.3309859417389625, "learning_rate": 8.862903746067619e-06, "loss": 0.6798, "step": 766 }, { "epoch": 0.2974886066130127, "grad_norm": 3.354657108022967, "learning_rate": 8.858601393061634e-06, "loss": 0.7714, "step": 767 }, { "epoch": 0.297876466595559, "grad_norm": 2.0809597534880733, "learning_rate": 8.854291964612824e-06, "loss": 0.6704, "step": 768 }, { "epoch": 0.2982643265781053, "grad_norm": 2.088841115935275, "learning_rate": 8.849975468623302e-06, "loss": 0.7034, "step": 769 }, { "epoch": 0.2986521865606516, "grad_norm": 2.2243728792993864, "learning_rate": 8.845651913008145e-06, "loss": 0.7594, "step": 770 }, { "epoch": 0.2990400465431979, "grad_norm": 2.455372344224646, "learning_rate": 8.841321305695372e-06, "loss": 0.7121, "step": 771 }, { "epoch": 0.2994279065257442, "grad_norm": 2.8638247470803155, "learning_rate": 8.836983654625934e-06, "loss": 0.7517, "step": 772 }, { "epoch": 0.2998157665082905, "grad_norm": 2.269210700806868, "learning_rate": 8.832638967753699e-06, "loss": 0.7508, "step": 773 }, { "epoch": 0.3002036264908368, "grad_norm": 2.5618085564786335, "learning_rate": 8.828287253045436e-06, "loss": 0.7596, "step": 774 }, { "epoch": 0.30059148647338313, "grad_norm": 3.5098808640575085, "learning_rate": 8.823928518480797e-06, "loss": 0.8315, "step": 775 }, { "epoch": 0.3009793464559294, "grad_norm": 2.0899247590357266, "learning_rate": 8.819562772052312e-06, "loss": 0.7073, "step": 776 }, { "epoch": 0.3013672064384757, "grad_norm": 2.124815438805027, "learning_rate": 8.815190021765365e-06, "loss": 0.6975, "step": 777 }, { "epoch": 0.301755066421022, "grad_norm": 2.9037526301202448, "learning_rate": 8.810810275638183e-06, "loss": 0.8041, "step": 778 }, { "epoch": 0.3021429264035683, "grad_norm": 2.1120995995476575, "learning_rate": 8.806423541701824e-06, "loss": 0.7168, "step": 779 }, { "epoch": 0.30253078638611464, "grad_norm": 3.0017760291830697, "learning_rate": 8.802029828000157e-06, "loss": 0.7987, "step": 780 }, { "epoch": 0.3029186463686609, "grad_norm": 2.158392707427063, "learning_rate": 8.797629142589846e-06, "loss": 0.7133, "step": 781 }, { "epoch": 0.3033065063512072, "grad_norm": 2.7654613383341227, "learning_rate": 8.793221493540347e-06, "loss": 0.778, "step": 782 }, { "epoch": 0.3036943663337535, "grad_norm": 1.7907805275124964, "learning_rate": 8.788806888933881e-06, "loss": 0.7118, "step": 783 }, { "epoch": 0.30408222631629983, "grad_norm": 2.5964387715319415, "learning_rate": 8.784385336865419e-06, "loss": 0.6707, "step": 784 }, { "epoch": 0.30447008629884614, "grad_norm": 3.216011998455061, "learning_rate": 8.779956845442682e-06, "loss": 0.728, "step": 785 }, { "epoch": 0.3048579462813924, "grad_norm": 2.7089844379260066, "learning_rate": 8.775521422786104e-06, "loss": 0.7423, "step": 786 }, { "epoch": 0.3052458062639387, "grad_norm": 2.7169875907529035, "learning_rate": 8.771079077028836e-06, "loss": 0.7302, "step": 787 }, { "epoch": 0.305633666246485, "grad_norm": 2.301159326907894, "learning_rate": 8.766629816316722e-06, "loss": 0.7855, "step": 788 }, { "epoch": 0.30602152622903134, "grad_norm": 2.5835087823535567, "learning_rate": 8.762173648808283e-06, "loss": 0.7118, "step": 789 }, { "epoch": 0.3064093862115776, "grad_norm": 2.18639013633494, "learning_rate": 8.757710582674708e-06, "loss": 0.7116, "step": 790 }, { "epoch": 0.3067972461941239, "grad_norm": 2.012854137709971, "learning_rate": 8.753240626099836e-06, "loss": 0.7185, "step": 791 }, { "epoch": 0.3071851061766702, "grad_norm": 2.997448286400618, "learning_rate": 8.748763787280142e-06, "loss": 0.7615, "step": 792 }, { "epoch": 0.30757296615921653, "grad_norm": 3.952006505627358, "learning_rate": 8.744280074424713e-06, "loss": 0.7771, "step": 793 }, { "epoch": 0.30796082614176284, "grad_norm": 2.341512569700289, "learning_rate": 8.739789495755254e-06, "loss": 0.6395, "step": 794 }, { "epoch": 0.3083486861243091, "grad_norm": 2.4991646661835323, "learning_rate": 8.735292059506047e-06, "loss": 0.7266, "step": 795 }, { "epoch": 0.3087365461068554, "grad_norm": 2.5413169891109013, "learning_rate": 8.730787773923957e-06, "loss": 0.7377, "step": 796 }, { "epoch": 0.3091244060894017, "grad_norm": 3.153357764605714, "learning_rate": 8.726276647268403e-06, "loss": 0.71, "step": 797 }, { "epoch": 0.30951226607194804, "grad_norm": 2.5361443645383623, "learning_rate": 8.721758687811353e-06, "loss": 0.7061, "step": 798 }, { "epoch": 0.30990012605449435, "grad_norm": 2.8640089557596755, "learning_rate": 8.717233903837298e-06, "loss": 0.7191, "step": 799 }, { "epoch": 0.3102879860370406, "grad_norm": 2.0621809271947473, "learning_rate": 8.712702303643254e-06, "loss": 0.7809, "step": 800 }, { "epoch": 0.3106758460195869, "grad_norm": 2.937528856086167, "learning_rate": 8.708163895538722e-06, "loss": 0.7958, "step": 801 }, { "epoch": 0.31106370600213323, "grad_norm": 2.58108008546307, "learning_rate": 8.703618687845697e-06, "loss": 0.7306, "step": 802 }, { "epoch": 0.31145156598467955, "grad_norm": 2.5435042907829772, "learning_rate": 8.699066688898636e-06, "loss": 0.7805, "step": 803 }, { "epoch": 0.31183942596722586, "grad_norm": 2.6349462051365653, "learning_rate": 8.694507907044454e-06, "loss": 0.7197, "step": 804 }, { "epoch": 0.3122272859497721, "grad_norm": 2.3966718638926023, "learning_rate": 8.6899423506425e-06, "loss": 0.7416, "step": 805 }, { "epoch": 0.31261514593231843, "grad_norm": 3.43671510605343, "learning_rate": 8.685370028064546e-06, "loss": 0.8625, "step": 806 }, { "epoch": 0.31300300591486474, "grad_norm": 2.891666360875281, "learning_rate": 8.680790947694772e-06, "loss": 0.7645, "step": 807 }, { "epoch": 0.31339086589741105, "grad_norm": 2.2454230316103603, "learning_rate": 8.676205117929752e-06, "loss": 0.6821, "step": 808 }, { "epoch": 0.3137787258799573, "grad_norm": 2.8920749771293703, "learning_rate": 8.671612547178428e-06, "loss": 0.7193, "step": 809 }, { "epoch": 0.3141665858625036, "grad_norm": 2.8008319021443127, "learning_rate": 8.667013243862113e-06, "loss": 0.7925, "step": 810 }, { "epoch": 0.31455444584504993, "grad_norm": 2.45661303426054, "learning_rate": 8.66240721641446e-06, "loss": 0.7293, "step": 811 }, { "epoch": 0.31494230582759625, "grad_norm": 2.5286031526869697, "learning_rate": 8.657794473281447e-06, "loss": 0.7276, "step": 812 }, { "epoch": 0.31533016581014256, "grad_norm": 1.9694033958559791, "learning_rate": 8.65317502292138e-06, "loss": 0.6832, "step": 813 }, { "epoch": 0.3157180257926888, "grad_norm": 2.994659935350695, "learning_rate": 8.64854887380485e-06, "loss": 0.7828, "step": 814 }, { "epoch": 0.31610588577523513, "grad_norm": 1.9418165825283746, "learning_rate": 8.643916034414741e-06, "loss": 0.6981, "step": 815 }, { "epoch": 0.31649374575778144, "grad_norm": 2.3075969035234074, "learning_rate": 8.639276513246199e-06, "loss": 0.6588, "step": 816 }, { "epoch": 0.31688160574032775, "grad_norm": 2.5568759545286106, "learning_rate": 8.634630318806626e-06, "loss": 0.7363, "step": 817 }, { "epoch": 0.31726946572287407, "grad_norm": 2.446888765183053, "learning_rate": 8.629977459615655e-06, "loss": 0.7253, "step": 818 }, { "epoch": 0.3176573257054203, "grad_norm": 2.571939628353754, "learning_rate": 8.62531794420515e-06, "loss": 0.7253, "step": 819 }, { "epoch": 0.31804518568796664, "grad_norm": 2.4471344816118594, "learning_rate": 8.620651781119169e-06, "loss": 0.7218, "step": 820 }, { "epoch": 0.31843304567051295, "grad_norm": 2.221319748088087, "learning_rate": 8.615978978913968e-06, "loss": 0.6639, "step": 821 }, { "epoch": 0.31882090565305926, "grad_norm": 2.5508261573391726, "learning_rate": 8.611299546157973e-06, "loss": 0.7518, "step": 822 }, { "epoch": 0.3192087656356056, "grad_norm": 2.904545331982959, "learning_rate": 8.60661349143177e-06, "loss": 0.6858, "step": 823 }, { "epoch": 0.31959662561815183, "grad_norm": 2.0741156194953194, "learning_rate": 8.601920823328088e-06, "loss": 0.6987, "step": 824 }, { "epoch": 0.31998448560069814, "grad_norm": 3.6180949841686663, "learning_rate": 8.59722155045178e-06, "loss": 0.7145, "step": 825 }, { "epoch": 0.32037234558324446, "grad_norm": 3.0876970463030253, "learning_rate": 8.592515681419812e-06, "loss": 0.7794, "step": 826 }, { "epoch": 0.32076020556579077, "grad_norm": 2.7506269113813286, "learning_rate": 8.587803224861248e-06, "loss": 0.7586, "step": 827 }, { "epoch": 0.321148065548337, "grad_norm": 2.0514364240876914, "learning_rate": 8.583084189417225e-06, "loss": 0.692, "step": 828 }, { "epoch": 0.32153592553088334, "grad_norm": 2.7032289596649117, "learning_rate": 8.578358583740947e-06, "loss": 0.807, "step": 829 }, { "epoch": 0.32192378551342965, "grad_norm": 2.906703050120578, "learning_rate": 8.573626416497669e-06, "loss": 0.7811, "step": 830 }, { "epoch": 0.32231164549597596, "grad_norm": 2.7979902024500194, "learning_rate": 8.568887696364673e-06, "loss": 0.7143, "step": 831 }, { "epoch": 0.3226995054785223, "grad_norm": 2.872716932031853, "learning_rate": 8.564142432031257e-06, "loss": 0.747, "step": 832 }, { "epoch": 0.32308736546106853, "grad_norm": 2.4520151274780786, "learning_rate": 8.559390632198723e-06, "loss": 0.7498, "step": 833 }, { "epoch": 0.32347522544361484, "grad_norm": 2.936734208805827, "learning_rate": 8.554632305580355e-06, "loss": 0.7285, "step": 834 }, { "epoch": 0.32386308542616116, "grad_norm": 2.7923310350884694, "learning_rate": 8.549867460901402e-06, "loss": 0.701, "step": 835 }, { "epoch": 0.32425094540870747, "grad_norm": 2.831339412177122, "learning_rate": 8.545096106899068e-06, "loss": 0.707, "step": 836 }, { "epoch": 0.3246388053912538, "grad_norm": 3.3125035002181855, "learning_rate": 8.540318252322493e-06, "loss": 0.807, "step": 837 }, { "epoch": 0.32502666537380004, "grad_norm": 3.6362328321722805, "learning_rate": 8.535533905932739e-06, "loss": 0.7417, "step": 838 }, { "epoch": 0.32541452535634635, "grad_norm": 2.3341353751718854, "learning_rate": 8.530743076502766e-06, "loss": 0.7601, "step": 839 }, { "epoch": 0.32580238533889266, "grad_norm": 2.7278367446945495, "learning_rate": 8.525945772817427e-06, "loss": 0.7695, "step": 840 }, { "epoch": 0.326190245321439, "grad_norm": 2.2409520732725774, "learning_rate": 8.521142003673447e-06, "loss": 0.7085, "step": 841 }, { "epoch": 0.3265781053039853, "grad_norm": 3.2629946016916804, "learning_rate": 8.5163317778794e-06, "loss": 0.7402, "step": 842 }, { "epoch": 0.32696596528653155, "grad_norm": 2.5593016993955384, "learning_rate": 8.51151510425571e-06, "loss": 0.7105, "step": 843 }, { "epoch": 0.32735382526907786, "grad_norm": 1.9762941351330703, "learning_rate": 8.506691991634612e-06, "loss": 0.6186, "step": 844 }, { "epoch": 0.32774168525162417, "grad_norm": 2.2394925896227615, "learning_rate": 8.501862448860159e-06, "loss": 0.759, "step": 845 }, { "epoch": 0.3281295452341705, "grad_norm": 2.82608661351995, "learning_rate": 8.497026484788189e-06, "loss": 0.7971, "step": 846 }, { "epoch": 0.32851740521671674, "grad_norm": 2.6163095656114748, "learning_rate": 8.492184108286316e-06, "loss": 0.7332, "step": 847 }, { "epoch": 0.32890526519926305, "grad_norm": 1.8459281083601498, "learning_rate": 8.487335328233912e-06, "loss": 0.7189, "step": 848 }, { "epoch": 0.32929312518180937, "grad_norm": 2.5238231637295985, "learning_rate": 8.48248015352209e-06, "loss": 0.7087, "step": 849 }, { "epoch": 0.3296809851643557, "grad_norm": 2.724186473324235, "learning_rate": 8.477618593053693e-06, "loss": 0.7497, "step": 850 }, { "epoch": 0.330068845146902, "grad_norm": 2.0323366589006655, "learning_rate": 8.47275065574327e-06, "loss": 0.6773, "step": 851 }, { "epoch": 0.33045670512944825, "grad_norm": 2.3294244747072588, "learning_rate": 8.46787635051706e-06, "loss": 0.7122, "step": 852 }, { "epoch": 0.33084456511199456, "grad_norm": 2.4641870769682352, "learning_rate": 8.462995686312985e-06, "loss": 0.6859, "step": 853 }, { "epoch": 0.33123242509454087, "grad_norm": 2.356786133552249, "learning_rate": 8.458108672080624e-06, "loss": 0.7001, "step": 854 }, { "epoch": 0.3316202850770872, "grad_norm": 2.358797516028863, "learning_rate": 8.453215316781205e-06, "loss": 0.7711, "step": 855 }, { "epoch": 0.3320081450596335, "grad_norm": 2.8323488190775548, "learning_rate": 8.448315629387572e-06, "loss": 0.7484, "step": 856 }, { "epoch": 0.33239600504217975, "grad_norm": 2.999908651981758, "learning_rate": 8.44340961888419e-06, "loss": 0.6947, "step": 857 }, { "epoch": 0.33278386502472607, "grad_norm": 2.403760536750912, "learning_rate": 8.438497294267117e-06, "loss": 0.729, "step": 858 }, { "epoch": 0.3331717250072724, "grad_norm": 1.9556041586454642, "learning_rate": 8.433578664543986e-06, "loss": 0.6846, "step": 859 }, { "epoch": 0.3335595849898187, "grad_norm": 2.919124526041628, "learning_rate": 8.428653738733996e-06, "loss": 0.7415, "step": 860 }, { "epoch": 0.33394744497236495, "grad_norm": 2.2396298560604064, "learning_rate": 8.423722525867883e-06, "loss": 0.7293, "step": 861 }, { "epoch": 0.33433530495491126, "grad_norm": 2.24108520007232, "learning_rate": 8.418785034987921e-06, "loss": 0.6349, "step": 862 }, { "epoch": 0.3347231649374576, "grad_norm": 2.821139830829961, "learning_rate": 8.413841275147893e-06, "loss": 0.7192, "step": 863 }, { "epoch": 0.3351110249200039, "grad_norm": 2.634011341536266, "learning_rate": 8.408891255413072e-06, "loss": 0.7387, "step": 864 }, { "epoch": 0.3354988849025502, "grad_norm": 2.9548472219598505, "learning_rate": 8.403934984860216e-06, "loss": 0.8246, "step": 865 }, { "epoch": 0.33588674488509646, "grad_norm": 1.8780455525819741, "learning_rate": 8.39897247257754e-06, "loss": 0.6637, "step": 866 }, { "epoch": 0.33627460486764277, "grad_norm": 2.1928157204942362, "learning_rate": 8.39400372766471e-06, "loss": 0.6748, "step": 867 }, { "epoch": 0.3366624648501891, "grad_norm": 2.134152592610771, "learning_rate": 8.389028759232816e-06, "loss": 0.6581, "step": 868 }, { "epoch": 0.3370503248327354, "grad_norm": 2.2346363214780802, "learning_rate": 8.38404757640436e-06, "loss": 0.7076, "step": 869 }, { "epoch": 0.3374381848152817, "grad_norm": 2.800741228290852, "learning_rate": 8.379060188313244e-06, "loss": 0.7378, "step": 870 }, { "epoch": 0.33782604479782796, "grad_norm": 2.6755611006312603, "learning_rate": 8.374066604104742e-06, "loss": 0.7052, "step": 871 }, { "epoch": 0.3382139047803743, "grad_norm": 2.8680328165822564, "learning_rate": 8.369066832935498e-06, "loss": 0.7767, "step": 872 }, { "epoch": 0.3386017647629206, "grad_norm": 2.8306528638771518, "learning_rate": 8.364060883973488e-06, "loss": 0.7428, "step": 873 }, { "epoch": 0.3389896247454669, "grad_norm": 2.0943956824884675, "learning_rate": 8.359048766398032e-06, "loss": 0.6688, "step": 874 }, { "epoch": 0.3393774847280132, "grad_norm": 3.7150858294662217, "learning_rate": 8.354030489399747e-06, "loss": 0.819, "step": 875 }, { "epoch": 0.33976534471055947, "grad_norm": 2.5966862978384944, "learning_rate": 8.349006062180552e-06, "loss": 0.6643, "step": 876 }, { "epoch": 0.3401532046931058, "grad_norm": 2.944486283619639, "learning_rate": 8.343975493953645e-06, "loss": 0.7964, "step": 877 }, { "epoch": 0.3405410646756521, "grad_norm": 2.395852399458155, "learning_rate": 8.338938793943478e-06, "loss": 0.6857, "step": 878 }, { "epoch": 0.3409289246581984, "grad_norm": 2.5806729677475753, "learning_rate": 8.333895971385754e-06, "loss": 0.73, "step": 879 }, { "epoch": 0.34131678464074466, "grad_norm": 2.407947748163297, "learning_rate": 8.328847035527397e-06, "loss": 0.6915, "step": 880 }, { "epoch": 0.341704644623291, "grad_norm": 2.1491285738453385, "learning_rate": 8.323791995626543e-06, "loss": 0.7135, "step": 881 }, { "epoch": 0.3420925046058373, "grad_norm": 2.4162456250280218, "learning_rate": 8.318730860952523e-06, "loss": 0.6897, "step": 882 }, { "epoch": 0.3424803645883836, "grad_norm": 2.530247089925128, "learning_rate": 8.313663640785839e-06, "loss": 0.7581, "step": 883 }, { "epoch": 0.3428682245709299, "grad_norm": 2.5772135035227994, "learning_rate": 8.308590344418158e-06, "loss": 0.7624, "step": 884 }, { "epoch": 0.34325608455347617, "grad_norm": 2.9557444307760083, "learning_rate": 8.303510981152283e-06, "loss": 0.7399, "step": 885 }, { "epoch": 0.3436439445360225, "grad_norm": 2.791404850814444, "learning_rate": 8.298425560302146e-06, "loss": 0.7106, "step": 886 }, { "epoch": 0.3440318045185688, "grad_norm": 2.3151684845600156, "learning_rate": 8.293334091192782e-06, "loss": 0.732, "step": 887 }, { "epoch": 0.3444196645011151, "grad_norm": 2.4390884799877552, "learning_rate": 8.288236583160322e-06, "loss": 0.6931, "step": 888 }, { "epoch": 0.3448075244836614, "grad_norm": 2.0858786723394727, "learning_rate": 8.28313304555197e-06, "loss": 0.687, "step": 889 }, { "epoch": 0.3451953844662077, "grad_norm": 3.081759083767533, "learning_rate": 8.278023487725981e-06, "loss": 0.815, "step": 890 }, { "epoch": 0.345583244448754, "grad_norm": 2.304950690918371, "learning_rate": 8.272907919051653e-06, "loss": 0.6955, "step": 891 }, { "epoch": 0.3459711044313003, "grad_norm": 2.5668889468564347, "learning_rate": 8.267786348909306e-06, "loss": 0.6963, "step": 892 }, { "epoch": 0.3463589644138466, "grad_norm": 2.7614585444827338, "learning_rate": 8.262658786690262e-06, "loss": 0.6943, "step": 893 }, { "epoch": 0.3467468243963929, "grad_norm": 2.674332668430997, "learning_rate": 8.257525241796837e-06, "loss": 0.7137, "step": 894 }, { "epoch": 0.3471346843789392, "grad_norm": 2.724346663128011, "learning_rate": 8.252385723642312e-06, "loss": 0.7917, "step": 895 }, { "epoch": 0.3475225443614855, "grad_norm": 3.2800827491387765, "learning_rate": 8.247240241650918e-06, "loss": 0.7668, "step": 896 }, { "epoch": 0.3479104043440318, "grad_norm": 3.2084731279381034, "learning_rate": 8.242088805257832e-06, "loss": 0.8054, "step": 897 }, { "epoch": 0.3482982643265781, "grad_norm": 3.2675109277496106, "learning_rate": 8.23693142390914e-06, "loss": 0.7361, "step": 898 }, { "epoch": 0.3486861243091244, "grad_norm": 2.4016083343972645, "learning_rate": 8.231768107061831e-06, "loss": 0.6408, "step": 899 }, { "epoch": 0.3490739842916707, "grad_norm": 3.3795265081142207, "learning_rate": 8.226598864183782e-06, "loss": 0.7571, "step": 900 }, { "epoch": 0.349461844274217, "grad_norm": 2.366539797747541, "learning_rate": 8.221423704753733e-06, "loss": 0.7053, "step": 901 }, { "epoch": 0.3498497042567633, "grad_norm": 1.849827454871597, "learning_rate": 8.216242638261277e-06, "loss": 0.6594, "step": 902 }, { "epoch": 0.35023756423930963, "grad_norm": 2.796687275507461, "learning_rate": 8.211055674206828e-06, "loss": 0.7494, "step": 903 }, { "epoch": 0.3506254242218559, "grad_norm": 2.403456651129227, "learning_rate": 8.205862822101628e-06, "loss": 0.7621, "step": 904 }, { "epoch": 0.3510132842044022, "grad_norm": 2.773977284326876, "learning_rate": 8.200664091467707e-06, "loss": 0.709, "step": 905 }, { "epoch": 0.3514011441869485, "grad_norm": 2.620227095527354, "learning_rate": 8.195459491837881e-06, "loss": 0.6876, "step": 906 }, { "epoch": 0.3517890041694948, "grad_norm": 2.554821115052626, "learning_rate": 8.190249032755717e-06, "loss": 0.7461, "step": 907 }, { "epoch": 0.35217686415204114, "grad_norm": 2.725434101412938, "learning_rate": 8.18503272377554e-06, "loss": 0.7854, "step": 908 }, { "epoch": 0.3525647241345874, "grad_norm": 2.328743900768879, "learning_rate": 8.179810574462388e-06, "loss": 0.7371, "step": 909 }, { "epoch": 0.3529525841171337, "grad_norm": 2.4426535611495304, "learning_rate": 8.17458259439202e-06, "loss": 0.692, "step": 910 }, { "epoch": 0.35334044409968, "grad_norm": 2.2794976087718766, "learning_rate": 8.169348793150884e-06, "loss": 0.6382, "step": 911 }, { "epoch": 0.35372830408222633, "grad_norm": 2.5897584097874864, "learning_rate": 8.164109180336094e-06, "loss": 0.7627, "step": 912 }, { "epoch": 0.35411616406477264, "grad_norm": 2.1441848115363973, "learning_rate": 8.15886376555543e-06, "loss": 0.7584, "step": 913 }, { "epoch": 0.3545040240473189, "grad_norm": 2.3848263966235317, "learning_rate": 8.153612558427311e-06, "loss": 0.6897, "step": 914 }, { "epoch": 0.3548918840298652, "grad_norm": 2.6717635986918236, "learning_rate": 8.148355568580768e-06, "loss": 0.7953, "step": 915 }, { "epoch": 0.3552797440124115, "grad_norm": 2.93803625316853, "learning_rate": 8.143092805655445e-06, "loss": 0.7091, "step": 916 }, { "epoch": 0.35566760399495784, "grad_norm": 2.9353170496599845, "learning_rate": 8.13782427930157e-06, "loss": 0.6785, "step": 917 }, { "epoch": 0.3560554639775041, "grad_norm": 2.558665907329508, "learning_rate": 8.132549999179934e-06, "loss": 0.7287, "step": 918 }, { "epoch": 0.3564433239600504, "grad_norm": 2.5368294337589923, "learning_rate": 8.127269974961886e-06, "loss": 0.7034, "step": 919 }, { "epoch": 0.3568311839425967, "grad_norm": 3.202749213667715, "learning_rate": 8.121984216329303e-06, "loss": 0.6831, "step": 920 }, { "epoch": 0.35721904392514303, "grad_norm": 2.245396408717645, "learning_rate": 8.116692732974578e-06, "loss": 0.679, "step": 921 }, { "epoch": 0.35760690390768934, "grad_norm": 2.1613982675413195, "learning_rate": 8.111395534600604e-06, "loss": 0.6958, "step": 922 }, { "epoch": 0.3579947638902356, "grad_norm": 2.2444815905280704, "learning_rate": 8.10609263092075e-06, "loss": 0.7004, "step": 923 }, { "epoch": 0.3583826238727819, "grad_norm": 2.8149085435203194, "learning_rate": 8.100784031658846e-06, "loss": 0.727, "step": 924 }, { "epoch": 0.3587704838553282, "grad_norm": 2.265726332199662, "learning_rate": 8.095469746549172e-06, "loss": 0.7517, "step": 925 }, { "epoch": 0.35915834383787454, "grad_norm": 2.4785153320689917, "learning_rate": 8.090149785336426e-06, "loss": 0.7399, "step": 926 }, { "epoch": 0.35954620382042085, "grad_norm": 1.7953247969121509, "learning_rate": 8.084824157775719e-06, "loss": 0.6481, "step": 927 }, { "epoch": 0.3599340638029671, "grad_norm": 3.031997870194104, "learning_rate": 8.079492873632554e-06, "loss": 0.7271, "step": 928 }, { "epoch": 0.3603219237855134, "grad_norm": 3.596885423809487, "learning_rate": 8.074155942682803e-06, "loss": 0.7415, "step": 929 }, { "epoch": 0.36070978376805973, "grad_norm": 3.4034364722281856, "learning_rate": 8.068813374712689e-06, "loss": 0.7739, "step": 930 }, { "epoch": 0.36109764375060605, "grad_norm": 2.0900915422552133, "learning_rate": 8.06346517951878e-06, "loss": 0.7232, "step": 931 }, { "epoch": 0.36148550373315236, "grad_norm": 2.1573261911949984, "learning_rate": 8.058111366907957e-06, "loss": 0.7247, "step": 932 }, { "epoch": 0.3618733637156986, "grad_norm": 2.5023779209037214, "learning_rate": 8.052751946697403e-06, "loss": 0.7249, "step": 933 }, { "epoch": 0.3622612236982449, "grad_norm": 3.314159135037512, "learning_rate": 8.047386928714583e-06, "loss": 0.7242, "step": 934 }, { "epoch": 0.36264908368079124, "grad_norm": 3.083728142353819, "learning_rate": 8.042016322797227e-06, "loss": 0.8031, "step": 935 }, { "epoch": 0.36303694366333755, "grad_norm": 2.429124077471505, "learning_rate": 8.03664013879331e-06, "loss": 0.6999, "step": 936 }, { "epoch": 0.3634248036458838, "grad_norm": 2.5938817395124265, "learning_rate": 8.031258386561038e-06, "loss": 0.7757, "step": 937 }, { "epoch": 0.3638126636284301, "grad_norm": 3.239606224618326, "learning_rate": 8.025871075968828e-06, "loss": 0.7535, "step": 938 }, { "epoch": 0.36420052361097643, "grad_norm": 3.1007212769165395, "learning_rate": 8.020478216895282e-06, "loss": 0.7469, "step": 939 }, { "epoch": 0.36458838359352275, "grad_norm": 2.9211168167872428, "learning_rate": 8.015079819229187e-06, "loss": 0.7188, "step": 940 }, { "epoch": 0.36497624357606906, "grad_norm": 3.075034766856069, "learning_rate": 8.009675892869478e-06, "loss": 0.7177, "step": 941 }, { "epoch": 0.3653641035586153, "grad_norm": 1.8833904818906972, "learning_rate": 8.00426644772523e-06, "loss": 0.7058, "step": 942 }, { "epoch": 0.36575196354116163, "grad_norm": 6.4800431966036385, "learning_rate": 7.99885149371564e-06, "loss": 0.7906, "step": 943 }, { "epoch": 0.36613982352370794, "grad_norm": 1.9317393214818406, "learning_rate": 7.993431040770002e-06, "loss": 0.7163, "step": 944 }, { "epoch": 0.36652768350625425, "grad_norm": 2.77126977031065, "learning_rate": 7.988005098827699e-06, "loss": 0.706, "step": 945 }, { "epoch": 0.36691554348880057, "grad_norm": 2.890040330620129, "learning_rate": 7.982573677838172e-06, "loss": 0.6675, "step": 946 }, { "epoch": 0.3673034034713468, "grad_norm": 2.4042236527195042, "learning_rate": 7.977136787760916e-06, "loss": 0.7428, "step": 947 }, { "epoch": 0.36769126345389314, "grad_norm": 2.4991646725283996, "learning_rate": 7.97169443856545e-06, "loss": 0.7468, "step": 948 }, { "epoch": 0.36807912343643945, "grad_norm": 2.6150340094025726, "learning_rate": 7.966246640231303e-06, "loss": 0.7852, "step": 949 }, { "epoch": 0.36846698341898576, "grad_norm": 1.9848677349717676, "learning_rate": 7.960793402748001e-06, "loss": 0.6421, "step": 950 }, { "epoch": 0.3688548434015321, "grad_norm": 2.306729907156486, "learning_rate": 7.955334736115038e-06, "loss": 0.6727, "step": 951 }, { "epoch": 0.36924270338407833, "grad_norm": 2.405020514603092, "learning_rate": 7.949870650341864e-06, "loss": 0.7363, "step": 952 }, { "epoch": 0.36963056336662464, "grad_norm": 2.12642870052406, "learning_rate": 7.944401155447872e-06, "loss": 0.6371, "step": 953 }, { "epoch": 0.37001842334917096, "grad_norm": 1.8398314657870725, "learning_rate": 7.938926261462366e-06, "loss": 0.6665, "step": 954 }, { "epoch": 0.37040628333171727, "grad_norm": 2.484777652575459, "learning_rate": 7.933445978424555e-06, "loss": 0.7261, "step": 955 }, { "epoch": 0.3707941433142635, "grad_norm": 2.215087558959814, "learning_rate": 7.927960316383524e-06, "loss": 0.6175, "step": 956 }, { "epoch": 0.37118200329680984, "grad_norm": 2.220060133910055, "learning_rate": 7.92246928539823e-06, "loss": 0.7729, "step": 957 }, { "epoch": 0.37156986327935615, "grad_norm": 2.424759863775155, "learning_rate": 7.916972895537471e-06, "loss": 0.792, "step": 958 }, { "epoch": 0.37195772326190246, "grad_norm": 2.946015356376679, "learning_rate": 7.911471156879866e-06, "loss": 0.7349, "step": 959 }, { "epoch": 0.3723455832444488, "grad_norm": 2.7031084738145776, "learning_rate": 7.905964079513851e-06, "loss": 0.801, "step": 960 }, { "epoch": 0.37273344322699503, "grad_norm": 2.010355688943565, "learning_rate": 7.900451673537646e-06, "loss": 0.7013, "step": 961 }, { "epoch": 0.37312130320954134, "grad_norm": 1.8229719175829042, "learning_rate": 7.894933949059245e-06, "loss": 0.7327, "step": 962 }, { "epoch": 0.37350916319208766, "grad_norm": 2.3366568912818284, "learning_rate": 7.88941091619639e-06, "loss": 0.7162, "step": 963 }, { "epoch": 0.37389702317463397, "grad_norm": 2.2549392228849214, "learning_rate": 7.883882585076558e-06, "loss": 0.6855, "step": 964 }, { "epoch": 0.3742848831571803, "grad_norm": 2.3263626230588645, "learning_rate": 7.87834896583695e-06, "loss": 0.7244, "step": 965 }, { "epoch": 0.37467274313972654, "grad_norm": 2.7922208531822315, "learning_rate": 7.872810068624452e-06, "loss": 0.7517, "step": 966 }, { "epoch": 0.37506060312227285, "grad_norm": 2.751227149380739, "learning_rate": 7.867265903595632e-06, "loss": 0.7149, "step": 967 }, { "epoch": 0.37544846310481916, "grad_norm": 2.931367779954032, "learning_rate": 7.86171648091672e-06, "loss": 0.7691, "step": 968 }, { "epoch": 0.3758363230873655, "grad_norm": 1.6450932393063649, "learning_rate": 7.856161810763584e-06, "loss": 0.6593, "step": 969 }, { "epoch": 0.3762241830699118, "grad_norm": 1.9373505151878565, "learning_rate": 7.850601903321717e-06, "loss": 0.7275, "step": 970 }, { "epoch": 0.37661204305245805, "grad_norm": 3.0599477392312746, "learning_rate": 7.845036768786214e-06, "loss": 0.7421, "step": 971 }, { "epoch": 0.37699990303500436, "grad_norm": 1.95292236432813, "learning_rate": 7.839466417361753e-06, "loss": 0.7222, "step": 972 }, { "epoch": 0.37738776301755067, "grad_norm": 2.5418366941642625, "learning_rate": 7.833890859262579e-06, "loss": 0.6928, "step": 973 }, { "epoch": 0.377775623000097, "grad_norm": 3.405357575796972, "learning_rate": 7.828310104712488e-06, "loss": 0.7557, "step": 974 }, { "epoch": 0.37816348298264324, "grad_norm": 2.3908183979471542, "learning_rate": 7.822724163944802e-06, "loss": 0.7168, "step": 975 }, { "epoch": 0.37855134296518955, "grad_norm": 3.190616169648695, "learning_rate": 7.81713304720235e-06, "loss": 0.7467, "step": 976 }, { "epoch": 0.37893920294773586, "grad_norm": 2.0989087868138103, "learning_rate": 7.811536764737454e-06, "loss": 0.7583, "step": 977 }, { "epoch": 0.3793270629302822, "grad_norm": 3.2883682111962633, "learning_rate": 7.805935326811913e-06, "loss": 0.7345, "step": 978 }, { "epoch": 0.3797149229128285, "grad_norm": 2.4297637953584887, "learning_rate": 7.800328743696973e-06, "loss": 0.7153, "step": 979 }, { "epoch": 0.38010278289537475, "grad_norm": 3.220866458431438, "learning_rate": 7.794717025673318e-06, "loss": 0.7666, "step": 980 }, { "epoch": 0.38049064287792106, "grad_norm": 2.5711594982540573, "learning_rate": 7.789100183031045e-06, "loss": 0.6861, "step": 981 }, { "epoch": 0.38087850286046737, "grad_norm": 4.149554585667563, "learning_rate": 7.783478226069652e-06, "loss": 0.8155, "step": 982 }, { "epoch": 0.3812663628430137, "grad_norm": 2.2460023679227064, "learning_rate": 7.777851165098012e-06, "loss": 0.6934, "step": 983 }, { "epoch": 0.38165422282556, "grad_norm": 3.0986451983908565, "learning_rate": 7.772219010434359e-06, "loss": 0.7471, "step": 984 }, { "epoch": 0.38204208280810625, "grad_norm": 2.0310976119119046, "learning_rate": 7.766581772406266e-06, "loss": 0.7165, "step": 985 }, { "epoch": 0.38242994279065257, "grad_norm": 1.8648249647175086, "learning_rate": 7.760939461350622e-06, "loss": 0.6797, "step": 986 }, { "epoch": 0.3828178027731989, "grad_norm": 1.9401447305400175, "learning_rate": 7.755292087613635e-06, "loss": 0.6838, "step": 987 }, { "epoch": 0.3832056627557452, "grad_norm": 2.9424521609748306, "learning_rate": 7.749639661550776e-06, "loss": 0.7558, "step": 988 }, { "epoch": 0.3835935227382915, "grad_norm": 2.4349394678890675, "learning_rate": 7.743982193526791e-06, "loss": 0.7304, "step": 989 }, { "epoch": 0.38398138272083776, "grad_norm": 2.652184879784862, "learning_rate": 7.738319693915673e-06, "loss": 0.6744, "step": 990 }, { "epoch": 0.3843692427033841, "grad_norm": 2.418386599400231, "learning_rate": 7.732652173100634e-06, "loss": 0.7244, "step": 991 }, { "epoch": 0.3847571026859304, "grad_norm": 2.403701373786357, "learning_rate": 7.726979641474102e-06, "loss": 0.736, "step": 992 }, { "epoch": 0.3851449626684767, "grad_norm": 1.9248589975349801, "learning_rate": 7.721302109437686e-06, "loss": 0.7149, "step": 993 }, { "epoch": 0.38553282265102296, "grad_norm": 2.7092021657743524, "learning_rate": 7.715619587402165e-06, "loss": 0.725, "step": 994 }, { "epoch": 0.38592068263356927, "grad_norm": 3.2077094391581595, "learning_rate": 7.709932085787473e-06, "loss": 0.8103, "step": 995 }, { "epoch": 0.3863085426161156, "grad_norm": 2.4906181421408284, "learning_rate": 7.704239615022671e-06, "loss": 0.6967, "step": 996 }, { "epoch": 0.3866964025986619, "grad_norm": 2.3440438790098406, "learning_rate": 7.698542185545932e-06, "loss": 0.719, "step": 997 }, { "epoch": 0.3870842625812082, "grad_norm": 2.163752892199128, "learning_rate": 7.692839807804522e-06, "loss": 0.7305, "step": 998 }, { "epoch": 0.38747212256375446, "grad_norm": 1.9297756205108587, "learning_rate": 7.687132492254783e-06, "loss": 0.7233, "step": 999 }, { "epoch": 0.3878599825463008, "grad_norm": 2.6137139079172833, "learning_rate": 7.681420249362107e-06, "loss": 0.729, "step": 1000 }, { "epoch": 0.3878599825463008, "eval_loss": 1.3839614391326904, "eval_runtime": 6.7053, "eval_samples_per_second": 0.149, "eval_steps_per_second": 0.149, "step": 1000 }, { "epoch": 0.3882478425288471, "grad_norm": 2.6380809557551017, "learning_rate": 7.675703089600926e-06, "loss": 0.7375, "step": 1001 }, { "epoch": 0.3886357025113934, "grad_norm": 3.3153055584677595, "learning_rate": 7.669981023454682e-06, "loss": 0.7563, "step": 1002 }, { "epoch": 0.3890235624939397, "grad_norm": 1.8450075172922809, "learning_rate": 7.664254061415818e-06, "loss": 0.7111, "step": 1003 }, { "epoch": 0.38941142247648597, "grad_norm": 2.9827522118137364, "learning_rate": 7.658522213985757e-06, "loss": 0.7213, "step": 1004 }, { "epoch": 0.3897992824590323, "grad_norm": 2.5355368323228893, "learning_rate": 7.652785491674872e-06, "loss": 0.7465, "step": 1005 }, { "epoch": 0.3901871424415786, "grad_norm": 1.9451035644373962, "learning_rate": 7.647043905002485e-06, "loss": 0.6721, "step": 1006 }, { "epoch": 0.3905750024241249, "grad_norm": 1.9755965592902782, "learning_rate": 7.641297464496828e-06, "loss": 0.7181, "step": 1007 }, { "epoch": 0.3909628624066712, "grad_norm": 2.1097498625452293, "learning_rate": 7.635546180695039e-06, "loss": 0.71, "step": 1008 }, { "epoch": 0.3913507223892175, "grad_norm": 2.624666462418083, "learning_rate": 7.629790064143139e-06, "loss": 0.6911, "step": 1009 }, { "epoch": 0.3917385823717638, "grad_norm": 2.5465555097570998, "learning_rate": 7.624029125396004e-06, "loss": 0.7018, "step": 1010 }, { "epoch": 0.3921264423543101, "grad_norm": 2.291929209482124, "learning_rate": 7.618263375017358e-06, "loss": 0.7172, "step": 1011 }, { "epoch": 0.3925143023368564, "grad_norm": 2.3035332908558, "learning_rate": 7.612492823579744e-06, "loss": 0.7092, "step": 1012 }, { "epoch": 0.39290216231940267, "grad_norm": 2.165195911076904, "learning_rate": 7.606717481664515e-06, "loss": 0.7081, "step": 1013 }, { "epoch": 0.393290022301949, "grad_norm": 2.200275424225907, "learning_rate": 7.600937359861799e-06, "loss": 0.747, "step": 1014 }, { "epoch": 0.3936778822844953, "grad_norm": 2.416569873466146, "learning_rate": 7.595152468770497e-06, "loss": 0.6999, "step": 1015 }, { "epoch": 0.3940657422670416, "grad_norm": 2.684821734252302, "learning_rate": 7.589362818998251e-06, "loss": 0.7304, "step": 1016 }, { "epoch": 0.3944536022495879, "grad_norm": 3.1025451499677206, "learning_rate": 7.58356842116143e-06, "loss": 0.7299, "step": 1017 }, { "epoch": 0.3948414622321342, "grad_norm": 2.030409907701176, "learning_rate": 7.57776928588511e-06, "loss": 0.6755, "step": 1018 }, { "epoch": 0.3952293222146805, "grad_norm": 2.4540316507586475, "learning_rate": 7.571965423803052e-06, "loss": 0.6848, "step": 1019 }, { "epoch": 0.3956171821972268, "grad_norm": 3.7884647900287978, "learning_rate": 7.566156845557684e-06, "loss": 0.824, "step": 1020 }, { "epoch": 0.3960050421797731, "grad_norm": 2.6801744187584924, "learning_rate": 7.560343561800087e-06, "loss": 0.854, "step": 1021 }, { "epoch": 0.3963929021623194, "grad_norm": 2.5608105725488834, "learning_rate": 7.554525583189969e-06, "loss": 0.6911, "step": 1022 }, { "epoch": 0.3967807621448657, "grad_norm": 3.0860651867048685, "learning_rate": 7.548702920395639e-06, "loss": 0.6758, "step": 1023 }, { "epoch": 0.397168622127412, "grad_norm": 2.9498957475008987, "learning_rate": 7.542875584094006e-06, "loss": 0.7203, "step": 1024 }, { "epoch": 0.3975564821099583, "grad_norm": 2.649624976660026, "learning_rate": 7.537043584970543e-06, "loss": 0.7404, "step": 1025 }, { "epoch": 0.3979443420925046, "grad_norm": 2.331525441549279, "learning_rate": 7.53120693371927e-06, "loss": 0.635, "step": 1026 }, { "epoch": 0.39833220207505093, "grad_norm": 2.776297962928585, "learning_rate": 7.525365641042749e-06, "loss": 0.7452, "step": 1027 }, { "epoch": 0.3987200620575972, "grad_norm": 2.840462636244803, "learning_rate": 7.519519717652039e-06, "loss": 0.6922, "step": 1028 }, { "epoch": 0.3991079220401435, "grad_norm": 2.515387959358309, "learning_rate": 7.5136691742667e-06, "loss": 0.6717, "step": 1029 }, { "epoch": 0.3994957820226898, "grad_norm": 2.9686022954056863, "learning_rate": 7.507814021614761e-06, "loss": 0.7304, "step": 1030 }, { "epoch": 0.39988364200523613, "grad_norm": 2.5516527674134095, "learning_rate": 7.501954270432701e-06, "loss": 0.7162, "step": 1031 }, { "epoch": 0.4002715019877824, "grad_norm": 2.4457604368429315, "learning_rate": 7.496089931465432e-06, "loss": 0.768, "step": 1032 }, { "epoch": 0.4006593619703287, "grad_norm": 2.056848823597585, "learning_rate": 7.490221015466279e-06, "loss": 0.7184, "step": 1033 }, { "epoch": 0.401047221952875, "grad_norm": 2.2508811400382127, "learning_rate": 7.4843475331969614e-06, "loss": 0.7414, "step": 1034 }, { "epoch": 0.4014350819354213, "grad_norm": 1.921486664498037, "learning_rate": 7.478469495427569e-06, "loss": 0.6931, "step": 1035 }, { "epoch": 0.40182294191796764, "grad_norm": 2.262092600976307, "learning_rate": 7.4725869129365484e-06, "loss": 0.6665, "step": 1036 }, { "epoch": 0.4022108019005139, "grad_norm": 2.7619829190938354, "learning_rate": 7.4666997965106725e-06, "loss": 0.701, "step": 1037 }, { "epoch": 0.4025986618830602, "grad_norm": 3.2060860389043793, "learning_rate": 7.4608081569450365e-06, "loss": 0.7692, "step": 1038 }, { "epoch": 0.4029865218656065, "grad_norm": 2.293228129781651, "learning_rate": 7.4549120050430265e-06, "loss": 0.7109, "step": 1039 }, { "epoch": 0.40337438184815283, "grad_norm": 2.2326721671229914, "learning_rate": 7.449011351616302e-06, "loss": 0.6667, "step": 1040 }, { "epoch": 0.40376224183069914, "grad_norm": 2.3712169141567387, "learning_rate": 7.443106207484776e-06, "loss": 0.6794, "step": 1041 }, { "epoch": 0.4041501018132454, "grad_norm": 2.278919955747105, "learning_rate": 7.437196583476597e-06, "loss": 0.719, "step": 1042 }, { "epoch": 0.4045379617957917, "grad_norm": 2.80357290525502, "learning_rate": 7.43128249042813e-06, "loss": 0.7613, "step": 1043 }, { "epoch": 0.404925821778338, "grad_norm": 2.196546964206239, "learning_rate": 7.425363939183931e-06, "loss": 0.6592, "step": 1044 }, { "epoch": 0.40531368176088434, "grad_norm": 2.3276494199046183, "learning_rate": 7.419440940596735e-06, "loss": 0.753, "step": 1045 }, { "epoch": 0.40570154174343065, "grad_norm": 2.9985395534899086, "learning_rate": 7.41351350552743e-06, "loss": 0.7594, "step": 1046 }, { "epoch": 0.4060894017259769, "grad_norm": 2.2608261140983643, "learning_rate": 7.407581644845038e-06, "loss": 0.6803, "step": 1047 }, { "epoch": 0.4064772617085232, "grad_norm": 2.3240770122759287, "learning_rate": 7.401645369426697e-06, "loss": 0.7284, "step": 1048 }, { "epoch": 0.40686512169106953, "grad_norm": 2.1799087732026154, "learning_rate": 7.395704690157644e-06, "loss": 0.7124, "step": 1049 }, { "epoch": 0.40725298167361584, "grad_norm": 4.505737603795921, "learning_rate": 7.389759617931183e-06, "loss": 0.7985, "step": 1050 }, { "epoch": 0.4076408416561621, "grad_norm": 2.382055368445747, "learning_rate": 7.383810163648682e-06, "loss": 0.7357, "step": 1051 }, { "epoch": 0.4080287016387084, "grad_norm": 3.33903043853245, "learning_rate": 7.3778563382195365e-06, "loss": 0.7674, "step": 1052 }, { "epoch": 0.4084165616212547, "grad_norm": 2.2468353698470027, "learning_rate": 7.371898152561166e-06, "loss": 0.6735, "step": 1053 }, { "epoch": 0.40880442160380104, "grad_norm": 2.512428800421771, "learning_rate": 7.365935617598975e-06, "loss": 0.7529, "step": 1054 }, { "epoch": 0.40919228158634735, "grad_norm": 2.0109122814886735, "learning_rate": 7.359968744266353e-06, "loss": 0.6422, "step": 1055 }, { "epoch": 0.4095801415688936, "grad_norm": 2.094190861193328, "learning_rate": 7.35399754350464e-06, "loss": 0.722, "step": 1056 }, { "epoch": 0.4099680015514399, "grad_norm": 2.5191179751050843, "learning_rate": 7.3480220262631095e-06, "loss": 0.6671, "step": 1057 }, { "epoch": 0.41035586153398623, "grad_norm": 2.0908660735330193, "learning_rate": 7.342042203498952e-06, "loss": 0.7013, "step": 1058 }, { "epoch": 0.41074372151653255, "grad_norm": 3.0333292914914276, "learning_rate": 7.336058086177253e-06, "loss": 0.7197, "step": 1059 }, { "epoch": 0.41113158149907886, "grad_norm": 2.241982572762841, "learning_rate": 7.330069685270976e-06, "loss": 0.6846, "step": 1060 }, { "epoch": 0.4115194414816251, "grad_norm": 2.0520004419561992, "learning_rate": 7.3240770117609325e-06, "loss": 0.7371, "step": 1061 }, { "epoch": 0.4119073014641714, "grad_norm": 2.2055884117829225, "learning_rate": 7.318080076635773e-06, "loss": 0.7174, "step": 1062 }, { "epoch": 0.41229516144671774, "grad_norm": 2.143001349245801, "learning_rate": 7.312078890891962e-06, "loss": 0.7194, "step": 1063 }, { "epoch": 0.41268302142926405, "grad_norm": 2.0886190674692697, "learning_rate": 7.306073465533759e-06, "loss": 0.7263, "step": 1064 }, { "epoch": 0.4130708814118103, "grad_norm": 1.8048706919830966, "learning_rate": 7.300063811573194e-06, "loss": 0.6721, "step": 1065 }, { "epoch": 0.4134587413943566, "grad_norm": 3.005665719695474, "learning_rate": 7.294049940030055e-06, "loss": 0.7712, "step": 1066 }, { "epoch": 0.41384660137690293, "grad_norm": 2.636467305922151, "learning_rate": 7.2880318619318605e-06, "loss": 0.6981, "step": 1067 }, { "epoch": 0.41423446135944925, "grad_norm": 2.8483925862958617, "learning_rate": 7.2820095883138456e-06, "loss": 0.7501, "step": 1068 }, { "epoch": 0.41462232134199556, "grad_norm": 2.859685735316769, "learning_rate": 7.2759831302189376e-06, "loss": 0.7521, "step": 1069 }, { "epoch": 0.4150101813245418, "grad_norm": 3.191122319930222, "learning_rate": 7.269952498697734e-06, "loss": 0.7537, "step": 1070 }, { "epoch": 0.41539804130708813, "grad_norm": 1.9896201667301765, "learning_rate": 7.2639177048084894e-06, "loss": 0.649, "step": 1071 }, { "epoch": 0.41578590128963444, "grad_norm": 2.2620110971106535, "learning_rate": 7.25787875961709e-06, "loss": 0.7204, "step": 1072 }, { "epoch": 0.41617376127218075, "grad_norm": 2.426011096208415, "learning_rate": 7.2518356741970285e-06, "loss": 0.69, "step": 1073 }, { "epoch": 0.41656162125472707, "grad_norm": 2.154926326311973, "learning_rate": 7.245788459629397e-06, "loss": 0.6831, "step": 1074 }, { "epoch": 0.4169494812372733, "grad_norm": 2.5670497092218674, "learning_rate": 7.239737127002854e-06, "loss": 0.7103, "step": 1075 }, { "epoch": 0.41733734121981964, "grad_norm": 2.293007235065033, "learning_rate": 7.233681687413614e-06, "loss": 0.7502, "step": 1076 }, { "epoch": 0.41772520120236595, "grad_norm": 2.0743476530835534, "learning_rate": 7.227622151965418e-06, "loss": 0.6791, "step": 1077 }, { "epoch": 0.41811306118491226, "grad_norm": 2.4363250362394124, "learning_rate": 7.221558531769519e-06, "loss": 0.6703, "step": 1078 }, { "epoch": 0.4185009211674586, "grad_norm": 2.641758966896306, "learning_rate": 7.21549083794466e-06, "loss": 0.689, "step": 1079 }, { "epoch": 0.41888878115000483, "grad_norm": 2.37231961432997, "learning_rate": 7.209419081617055e-06, "loss": 0.7006, "step": 1080 }, { "epoch": 0.41927664113255114, "grad_norm": 2.816695019702235, "learning_rate": 7.203343273920365e-06, "loss": 0.7543, "step": 1081 }, { "epoch": 0.41966450111509745, "grad_norm": 2.563442620501416, "learning_rate": 7.197263425995682e-06, "loss": 0.7413, "step": 1082 }, { "epoch": 0.42005236109764377, "grad_norm": 3.0425292702699283, "learning_rate": 7.191179548991507e-06, "loss": 0.8253, "step": 1083 }, { "epoch": 0.42044022108019, "grad_norm": 2.37673040429887, "learning_rate": 7.185091654063724e-06, "loss": 0.7078, "step": 1084 }, { "epoch": 0.42082808106273634, "grad_norm": 2.838538029978413, "learning_rate": 7.1789997523755915e-06, "loss": 0.7205, "step": 1085 }, { "epoch": 0.42121594104528265, "grad_norm": 2.061378517253737, "learning_rate": 7.172903855097712e-06, "loss": 0.6584, "step": 1086 }, { "epoch": 0.42160380102782896, "grad_norm": 1.9353867283442512, "learning_rate": 7.166803973408012e-06, "loss": 0.6645, "step": 1087 }, { "epoch": 0.4219916610103753, "grad_norm": 2.7107782273940058, "learning_rate": 7.160700118491729e-06, "loss": 0.7108, "step": 1088 }, { "epoch": 0.42237952099292153, "grad_norm": 2.6191838182335387, "learning_rate": 7.154592301541383e-06, "loss": 0.727, "step": 1089 }, { "epoch": 0.42276738097546784, "grad_norm": 2.898758287719087, "learning_rate": 7.148480533756759e-06, "loss": 0.7377, "step": 1090 }, { "epoch": 0.42315524095801416, "grad_norm": 2.5622905729405643, "learning_rate": 7.142364826344891e-06, "loss": 0.7141, "step": 1091 }, { "epoch": 0.42354310094056047, "grad_norm": 2.964411448345593, "learning_rate": 7.1362451905200285e-06, "loss": 0.6921, "step": 1092 }, { "epoch": 0.4239309609231068, "grad_norm": 1.9480692085947269, "learning_rate": 7.130121637503633e-06, "loss": 0.7308, "step": 1093 }, { "epoch": 0.42431882090565304, "grad_norm": 3.4634192115290134, "learning_rate": 7.123994178524345e-06, "loss": 0.711, "step": 1094 }, { "epoch": 0.42470668088819935, "grad_norm": 2.605027011500429, "learning_rate": 7.117862824817966e-06, "loss": 0.6843, "step": 1095 }, { "epoch": 0.42509454087074566, "grad_norm": 2.4533228381882908, "learning_rate": 7.1117275876274425e-06, "loss": 0.6402, "step": 1096 }, { "epoch": 0.425482400853292, "grad_norm": 2.0160450189613313, "learning_rate": 7.105588478202838e-06, "loss": 0.7037, "step": 1097 }, { "epoch": 0.4258702608358383, "grad_norm": 2.3153867304993008, "learning_rate": 7.099445507801324e-06, "loss": 0.7243, "step": 1098 }, { "epoch": 0.42625812081838454, "grad_norm": 2.3027937466993924, "learning_rate": 7.093298687687141e-06, "loss": 0.7026, "step": 1099 }, { "epoch": 0.42664598080093086, "grad_norm": 2.6100107518574567, "learning_rate": 7.0871480291315975e-06, "loss": 0.7338, "step": 1100 }, { "epoch": 0.42703384078347717, "grad_norm": 2.1810132299809153, "learning_rate": 7.080993543413035e-06, "loss": 0.7157, "step": 1101 }, { "epoch": 0.4274217007660235, "grad_norm": 2.5369649380659567, "learning_rate": 7.0748352418168174e-06, "loss": 0.6911, "step": 1102 }, { "epoch": 0.42780956074856974, "grad_norm": 2.7201665890930276, "learning_rate": 7.068673135635302e-06, "loss": 0.7149, "step": 1103 }, { "epoch": 0.42819742073111605, "grad_norm": 2.451656954881025, "learning_rate": 7.062507236167826e-06, "loss": 0.6909, "step": 1104 }, { "epoch": 0.42858528071366236, "grad_norm": 3.010632199817493, "learning_rate": 7.056337554720676e-06, "loss": 0.7189, "step": 1105 }, { "epoch": 0.4289731406962087, "grad_norm": 2.910901872573296, "learning_rate": 7.050164102607081e-06, "loss": 0.7209, "step": 1106 }, { "epoch": 0.429361000678755, "grad_norm": 2.59165074560226, "learning_rate": 7.043986891147179e-06, "loss": 0.6972, "step": 1107 }, { "epoch": 0.42974886066130125, "grad_norm": 2.456413862179595, "learning_rate": 7.037805931668006e-06, "loss": 0.6695, "step": 1108 }, { "epoch": 0.43013672064384756, "grad_norm": 2.8752158983921743, "learning_rate": 7.031621235503464e-06, "loss": 0.7148, "step": 1109 }, { "epoch": 0.43052458062639387, "grad_norm": 1.7252876449301413, "learning_rate": 7.025432813994315e-06, "loss": 0.6324, "step": 1110 }, { "epoch": 0.4309124406089402, "grad_norm": 2.0162512019816536, "learning_rate": 7.019240678488145e-06, "loss": 0.6433, "step": 1111 }, { "epoch": 0.4313003005914865, "grad_norm": 1.8270212225141147, "learning_rate": 7.013044840339353e-06, "loss": 0.6626, "step": 1112 }, { "epoch": 0.43168816057403275, "grad_norm": 2.5985658883530887, "learning_rate": 7.006845310909131e-06, "loss": 0.6602, "step": 1113 }, { "epoch": 0.43207602055657907, "grad_norm": 3.0561173347689796, "learning_rate": 7.000642101565434e-06, "loss": 0.7108, "step": 1114 }, { "epoch": 0.4324638805391254, "grad_norm": 2.3803120449907533, "learning_rate": 6.994435223682966e-06, "loss": 0.749, "step": 1115 }, { "epoch": 0.4328517405216717, "grad_norm": 2.4395638581562826, "learning_rate": 6.9882246886431615e-06, "loss": 0.6905, "step": 1116 }, { "epoch": 0.433239600504218, "grad_norm": 2.336951041201528, "learning_rate": 6.982010507834158e-06, "loss": 0.6583, "step": 1117 }, { "epoch": 0.43362746048676426, "grad_norm": 2.283415893033479, "learning_rate": 6.975792692650778e-06, "loss": 0.668, "step": 1118 }, { "epoch": 0.4340153204693106, "grad_norm": 2.427346348535036, "learning_rate": 6.969571254494509e-06, "loss": 0.7378, "step": 1119 }, { "epoch": 0.4344031804518569, "grad_norm": 2.103095533289945, "learning_rate": 6.963346204773483e-06, "loss": 0.7309, "step": 1120 }, { "epoch": 0.4347910404344032, "grad_norm": 2.803618292527894, "learning_rate": 6.957117554902452e-06, "loss": 0.7842, "step": 1121 }, { "epoch": 0.43517890041694945, "grad_norm": 2.4102590232650796, "learning_rate": 6.950885316302773e-06, "loss": 0.6459, "step": 1122 }, { "epoch": 0.43556676039949577, "grad_norm": 2.969215507680134, "learning_rate": 6.94464950040238e-06, "loss": 0.7672, "step": 1123 }, { "epoch": 0.4359546203820421, "grad_norm": 2.725236956387263, "learning_rate": 6.938410118635768e-06, "loss": 0.7006, "step": 1124 }, { "epoch": 0.4363424803645884, "grad_norm": 2.137211387280891, "learning_rate": 6.9321671824439715e-06, "loss": 0.7311, "step": 1125 }, { "epoch": 0.4367303403471347, "grad_norm": 2.5167561037071007, "learning_rate": 6.9259207032745415e-06, "loss": 0.6962, "step": 1126 }, { "epoch": 0.43711820032968096, "grad_norm": 2.0108561585112, "learning_rate": 6.919670692581526e-06, "loss": 0.695, "step": 1127 }, { "epoch": 0.4375060603122273, "grad_norm": 2.7385750574880032, "learning_rate": 6.913417161825449e-06, "loss": 0.7154, "step": 1128 }, { "epoch": 0.4378939202947736, "grad_norm": 1.9909381963314752, "learning_rate": 6.907160122473291e-06, "loss": 0.6243, "step": 1129 }, { "epoch": 0.4382817802773199, "grad_norm": 2.5375907858403504, "learning_rate": 6.90089958599846e-06, "loss": 0.6978, "step": 1130 }, { "epoch": 0.4386696402598662, "grad_norm": 2.3718276230693327, "learning_rate": 6.894635563880785e-06, "loss": 0.722, "step": 1131 }, { "epoch": 0.43905750024241247, "grad_norm": 2.102437810462144, "learning_rate": 6.88836806760648e-06, "loss": 0.6871, "step": 1132 }, { "epoch": 0.4394453602249588, "grad_norm": 2.314129295729442, "learning_rate": 6.882097108668132e-06, "loss": 0.7726, "step": 1133 }, { "epoch": 0.4398332202075051, "grad_norm": 2.161365631578299, "learning_rate": 6.875822698564678e-06, "loss": 0.6932, "step": 1134 }, { "epoch": 0.4402210801900514, "grad_norm": 2.4862903285710414, "learning_rate": 6.869544848801383e-06, "loss": 0.7115, "step": 1135 }, { "epoch": 0.4406089401725977, "grad_norm": 2.5194267400762276, "learning_rate": 6.863263570889818e-06, "loss": 0.7134, "step": 1136 }, { "epoch": 0.440996800155144, "grad_norm": 2.833757443218321, "learning_rate": 6.85697887634784e-06, "loss": 0.666, "step": 1137 }, { "epoch": 0.4413846601376903, "grad_norm": 2.7713656698882665, "learning_rate": 6.850690776699574e-06, "loss": 0.6797, "step": 1138 }, { "epoch": 0.4417725201202366, "grad_norm": 2.948313529993076, "learning_rate": 6.844399283475384e-06, "loss": 0.7583, "step": 1139 }, { "epoch": 0.4421603801027829, "grad_norm": 2.024826396726979, "learning_rate": 6.838104408211862e-06, "loss": 0.6944, "step": 1140 }, { "epoch": 0.44254824008532917, "grad_norm": 2.58283976743478, "learning_rate": 6.831806162451799e-06, "loss": 0.73, "step": 1141 }, { "epoch": 0.4429361000678755, "grad_norm": 2.273405737454057, "learning_rate": 6.825504557744167e-06, "loss": 0.7576, "step": 1142 }, { "epoch": 0.4433239600504218, "grad_norm": 3.7775919540166725, "learning_rate": 6.819199605644093e-06, "loss": 0.7204, "step": 1143 }, { "epoch": 0.4437118200329681, "grad_norm": 3.265935120391749, "learning_rate": 6.812891317712851e-06, "loss": 0.8026, "step": 1144 }, { "epoch": 0.4440996800155144, "grad_norm": 2.357636367185338, "learning_rate": 6.806579705517824e-06, "loss": 0.6969, "step": 1145 }, { "epoch": 0.4444875399980607, "grad_norm": 2.0275302981585104, "learning_rate": 6.800264780632495e-06, "loss": 0.6867, "step": 1146 }, { "epoch": 0.444875399980607, "grad_norm": 2.4223864400609187, "learning_rate": 6.793946554636417e-06, "loss": 0.716, "step": 1147 }, { "epoch": 0.4452632599631533, "grad_norm": 2.6669684820552697, "learning_rate": 6.7876250391152e-06, "loss": 0.7539, "step": 1148 }, { "epoch": 0.4456511199456996, "grad_norm": 2.688456712189796, "learning_rate": 6.781300245660487e-06, "loss": 0.7205, "step": 1149 }, { "epoch": 0.4460389799282459, "grad_norm": 2.2042671248363495, "learning_rate": 6.774972185869928e-06, "loss": 0.6816, "step": 1150 }, { "epoch": 0.4464268399107922, "grad_norm": 2.3245713800579733, "learning_rate": 6.768640871347163e-06, "loss": 0.6666, "step": 1151 }, { "epoch": 0.4468146998933385, "grad_norm": 3.8797838222699275, "learning_rate": 6.762306313701803e-06, "loss": 0.6899, "step": 1152 }, { "epoch": 0.4472025598758848, "grad_norm": 2.6273961287379883, "learning_rate": 6.7559685245494025e-06, "loss": 0.7421, "step": 1153 }, { "epoch": 0.4475904198584311, "grad_norm": 2.3714396533649302, "learning_rate": 6.749627515511443e-06, "loss": 0.6931, "step": 1154 }, { "epoch": 0.44797827984097743, "grad_norm": 2.856567736376076, "learning_rate": 6.743283298215312e-06, "loss": 0.7261, "step": 1155 }, { "epoch": 0.4483661398235237, "grad_norm": 2.8661807115196005, "learning_rate": 6.736935884294275e-06, "loss": 0.7378, "step": 1156 }, { "epoch": 0.44875399980607, "grad_norm": 3.7212826640294407, "learning_rate": 6.730585285387465e-06, "loss": 0.76, "step": 1157 }, { "epoch": 0.4491418597886163, "grad_norm": 2.3668829953961525, "learning_rate": 6.724231513139853e-06, "loss": 0.6834, "step": 1158 }, { "epoch": 0.44952971977116263, "grad_norm": 2.7059432404377763, "learning_rate": 6.717874579202227e-06, "loss": 0.6607, "step": 1159 }, { "epoch": 0.4499175797537089, "grad_norm": 2.9724232650541644, "learning_rate": 6.711514495231173e-06, "loss": 0.7374, "step": 1160 }, { "epoch": 0.4503054397362552, "grad_norm": 2.3330108452086025, "learning_rate": 6.705151272889055e-06, "loss": 0.6969, "step": 1161 }, { "epoch": 0.4506932997188015, "grad_norm": 2.2511335823702248, "learning_rate": 6.698784923843993e-06, "loss": 0.7326, "step": 1162 }, { "epoch": 0.4510811597013478, "grad_norm": 2.524031354039617, "learning_rate": 6.692415459769835e-06, "loss": 0.6426, "step": 1163 }, { "epoch": 0.45146901968389413, "grad_norm": 2.393299326922149, "learning_rate": 6.686042892346147e-06, "loss": 0.658, "step": 1164 }, { "epoch": 0.4518568796664404, "grad_norm": 2.8904209626489004, "learning_rate": 6.679667233258179e-06, "loss": 0.7202, "step": 1165 }, { "epoch": 0.4522447396489867, "grad_norm": 3.1392311188688544, "learning_rate": 6.673288494196858e-06, "loss": 0.7103, "step": 1166 }, { "epoch": 0.452632599631533, "grad_norm": 1.8315235029304031, "learning_rate": 6.666906686858753e-06, "loss": 0.6739, "step": 1167 }, { "epoch": 0.45302045961407933, "grad_norm": 2.7450846359284133, "learning_rate": 6.66052182294606e-06, "loss": 0.7515, "step": 1168 }, { "epoch": 0.45340831959662564, "grad_norm": 2.206431238461818, "learning_rate": 6.654133914166582e-06, "loss": 0.6813, "step": 1169 }, { "epoch": 0.4537961795791719, "grad_norm": 2.5835910946866294, "learning_rate": 6.647742972233703e-06, "loss": 0.6914, "step": 1170 }, { "epoch": 0.4541840395617182, "grad_norm": 2.3927315292227607, "learning_rate": 6.641349008866369e-06, "loss": 0.7009, "step": 1171 }, { "epoch": 0.4545718995442645, "grad_norm": 3.0049880327597034, "learning_rate": 6.634952035789069e-06, "loss": 0.7706, "step": 1172 }, { "epoch": 0.45495975952681084, "grad_norm": 2.9235205762718794, "learning_rate": 6.628552064731807e-06, "loss": 0.7356, "step": 1173 }, { "epoch": 0.45534761950935715, "grad_norm": 2.53358841632219, "learning_rate": 6.622149107430088e-06, "loss": 0.6682, "step": 1174 }, { "epoch": 0.4557354794919034, "grad_norm": 3.5010847418649855, "learning_rate": 6.6157431756248906e-06, "loss": 0.7305, "step": 1175 }, { "epoch": 0.4561233394744497, "grad_norm": 3.242189301342513, "learning_rate": 6.609334281062647e-06, "loss": 0.7697, "step": 1176 }, { "epoch": 0.45651119945699603, "grad_norm": 2.2525458804405902, "learning_rate": 6.602922435495225e-06, "loss": 0.7355, "step": 1177 }, { "epoch": 0.45689905943954234, "grad_norm": 2.0828026671022615, "learning_rate": 6.5965076506799e-06, "loss": 0.6735, "step": 1178 }, { "epoch": 0.4572869194220886, "grad_norm": 2.4661949086068935, "learning_rate": 6.5900899383793415e-06, "loss": 0.6944, "step": 1179 }, { "epoch": 0.4576747794046349, "grad_norm": 3.17820171136171, "learning_rate": 6.583669310361583e-06, "loss": 0.667, "step": 1180 }, { "epoch": 0.4580626393871812, "grad_norm": 2.7076952555132383, "learning_rate": 6.577245778400006e-06, "loss": 0.6632, "step": 1181 }, { "epoch": 0.45845049936972754, "grad_norm": 2.897484172291898, "learning_rate": 6.570819354273317e-06, "loss": 0.6922, "step": 1182 }, { "epoch": 0.45883835935227385, "grad_norm": 2.29824211822659, "learning_rate": 6.564390049765528e-06, "loss": 0.6708, "step": 1183 }, { "epoch": 0.4592262193348201, "grad_norm": 2.718595562732594, "learning_rate": 6.557957876665926e-06, "loss": 0.7218, "step": 1184 }, { "epoch": 0.4596140793173664, "grad_norm": 2.3188099826255333, "learning_rate": 6.551522846769067e-06, "loss": 0.6736, "step": 1185 }, { "epoch": 0.46000193929991273, "grad_norm": 2.376947169151022, "learning_rate": 6.545084971874738e-06, "loss": 0.7293, "step": 1186 }, { "epoch": 0.46038979928245904, "grad_norm": 1.7892404241886621, "learning_rate": 6.538644263787948e-06, "loss": 0.6639, "step": 1187 }, { "epoch": 0.46077765926500536, "grad_norm": 5.718988713152486, "learning_rate": 6.532200734318896e-06, "loss": 0.6861, "step": 1188 }, { "epoch": 0.4611655192475516, "grad_norm": 2.5231622301644117, "learning_rate": 6.525754395282961e-06, "loss": 0.7053, "step": 1189 }, { "epoch": 0.4615533792300979, "grad_norm": 2.5518122670378647, "learning_rate": 6.5193052585006666e-06, "loss": 0.7555, "step": 1190 }, { "epoch": 0.46194123921264424, "grad_norm": 2.6267376253203167, "learning_rate": 6.512853335797673e-06, "loss": 0.716, "step": 1191 }, { "epoch": 0.46232909919519055, "grad_norm": 2.068925667156557, "learning_rate": 6.5063986390047475e-06, "loss": 0.7215, "step": 1192 }, { "epoch": 0.46271695917773686, "grad_norm": 2.1466617197488636, "learning_rate": 6.499941179957739e-06, "loss": 0.672, "step": 1193 }, { "epoch": 0.4631048191602831, "grad_norm": 2.9226349621038303, "learning_rate": 6.493480970497569e-06, "loss": 0.7194, "step": 1194 }, { "epoch": 0.46349267914282943, "grad_norm": 2.2441626951684888, "learning_rate": 6.487018022470195e-06, "loss": 0.7102, "step": 1195 }, { "epoch": 0.46388053912537575, "grad_norm": 2.523324997531834, "learning_rate": 6.480552347726604e-06, "loss": 0.6899, "step": 1196 }, { "epoch": 0.46426839910792206, "grad_norm": 1.8742458820428864, "learning_rate": 6.474083958122777e-06, "loss": 0.6476, "step": 1197 }, { "epoch": 0.4646562590904683, "grad_norm": 2.3214909757143922, "learning_rate": 6.467612865519674e-06, "loss": 0.7285, "step": 1198 }, { "epoch": 0.46504411907301463, "grad_norm": 2.4056563915664637, "learning_rate": 6.461139081783215e-06, "loss": 0.6846, "step": 1199 }, { "epoch": 0.46543197905556094, "grad_norm": 2.917562500774395, "learning_rate": 6.454662618784249e-06, "loss": 0.7109, "step": 1200 }, { "epoch": 0.46581983903810725, "grad_norm": 2.0903834920101225, "learning_rate": 6.448183488398545e-06, "loss": 0.721, "step": 1201 }, { "epoch": 0.46620769902065357, "grad_norm": 2.7518406403547178, "learning_rate": 6.441701702506755e-06, "loss": 0.6871, "step": 1202 }, { "epoch": 0.4665955590031998, "grad_norm": 2.119918194024841, "learning_rate": 6.435217272994406e-06, "loss": 0.6494, "step": 1203 }, { "epoch": 0.46698341898574613, "grad_norm": 2.515994596687336, "learning_rate": 6.428730211751873e-06, "loss": 0.7196, "step": 1204 }, { "epoch": 0.46737127896829245, "grad_norm": 2.3942554756594903, "learning_rate": 6.422240530674354e-06, "loss": 0.6886, "step": 1205 }, { "epoch": 0.46775913895083876, "grad_norm": 2.5579097729467177, "learning_rate": 6.4157482416618514e-06, "loss": 0.6623, "step": 1206 }, { "epoch": 0.4681469989333851, "grad_norm": 2.0560588490523006, "learning_rate": 6.409253356619148e-06, "loss": 0.7141, "step": 1207 }, { "epoch": 0.46853485891593133, "grad_norm": 2.032541311521788, "learning_rate": 6.402755887455792e-06, "loss": 0.7179, "step": 1208 }, { "epoch": 0.46892271889847764, "grad_norm": 2.3633738627042145, "learning_rate": 6.396255846086067e-06, "loss": 0.7435, "step": 1209 }, { "epoch": 0.46931057888102395, "grad_norm": 2.400639398530451, "learning_rate": 6.389753244428973e-06, "loss": 0.7644, "step": 1210 }, { "epoch": 0.46969843886357027, "grad_norm": 2.128632841820839, "learning_rate": 6.383248094408203e-06, "loss": 0.6068, "step": 1211 }, { "epoch": 0.4700862988461166, "grad_norm": 2.2413629757512963, "learning_rate": 6.376740407952127e-06, "loss": 0.6416, "step": 1212 }, { "epoch": 0.47047415882866284, "grad_norm": 2.1069300285895736, "learning_rate": 6.370230196993763e-06, "loss": 0.6833, "step": 1213 }, { "epoch": 0.47086201881120915, "grad_norm": 2.248321639089909, "learning_rate": 6.36371747347076e-06, "loss": 0.6886, "step": 1214 }, { "epoch": 0.47124987879375546, "grad_norm": 2.142911495616338, "learning_rate": 6.3572022493253715e-06, "loss": 0.6706, "step": 1215 }, { "epoch": 0.4716377387763018, "grad_norm": 3.470144065671372, "learning_rate": 6.350684536504441e-06, "loss": 0.7487, "step": 1216 }, { "epoch": 0.47202559875884803, "grad_norm": 1.9294155136357178, "learning_rate": 6.344164346959371e-06, "loss": 0.6876, "step": 1217 }, { "epoch": 0.47241345874139434, "grad_norm": 2.7853056478525557, "learning_rate": 6.337641692646106e-06, "loss": 0.7426, "step": 1218 }, { "epoch": 0.47280131872394066, "grad_norm": 1.989910479507593, "learning_rate": 6.331116585525112e-06, "loss": 0.6163, "step": 1219 }, { "epoch": 0.47318917870648697, "grad_norm": 1.9665367895230628, "learning_rate": 6.324589037561352e-06, "loss": 0.6964, "step": 1220 }, { "epoch": 0.4735770386890333, "grad_norm": 2.5389335736020966, "learning_rate": 6.318059060724264e-06, "loss": 0.6823, "step": 1221 }, { "epoch": 0.47396489867157954, "grad_norm": 2.124250355332738, "learning_rate": 6.3115266669877425e-06, "loss": 0.6733, "step": 1222 }, { "epoch": 0.47435275865412585, "grad_norm": 2.049242135850901, "learning_rate": 6.30499186833011e-06, "loss": 0.6264, "step": 1223 }, { "epoch": 0.47474061863667216, "grad_norm": 2.1672015842206793, "learning_rate": 6.2984546767341e-06, "loss": 0.6516, "step": 1224 }, { "epoch": 0.4751284786192185, "grad_norm": 2.232679782174178, "learning_rate": 6.291915104186836e-06, "loss": 0.7434, "step": 1225 }, { "epoch": 0.4755163386017648, "grad_norm": 3.0335039417764342, "learning_rate": 6.285373162679804e-06, "loss": 0.7384, "step": 1226 }, { "epoch": 0.47590419858431104, "grad_norm": 2.2365275451106688, "learning_rate": 6.278828864208839e-06, "loss": 0.6671, "step": 1227 }, { "epoch": 0.47629205856685736, "grad_norm": 1.9944669876142356, "learning_rate": 6.272282220774091e-06, "loss": 0.7089, "step": 1228 }, { "epoch": 0.47667991854940367, "grad_norm": 1.957758772917136, "learning_rate": 6.265733244380014e-06, "loss": 0.6342, "step": 1229 }, { "epoch": 0.47706777853195, "grad_norm": 2.27157698140445, "learning_rate": 6.2591819470353424e-06, "loss": 0.7154, "step": 1230 }, { "epoch": 0.4774556385144963, "grad_norm": 2.6459792072186525, "learning_rate": 6.25262834075306e-06, "loss": 0.7504, "step": 1231 }, { "epoch": 0.47784349849704255, "grad_norm": 2.417386283488738, "learning_rate": 6.246072437550391e-06, "loss": 0.7455, "step": 1232 }, { "epoch": 0.47823135847958886, "grad_norm": 2.4728089522250225, "learning_rate": 6.239514249448767e-06, "loss": 0.6806, "step": 1233 }, { "epoch": 0.4786192184621352, "grad_norm": 2.7343398302419106, "learning_rate": 6.2329537884738115e-06, "loss": 0.6995, "step": 1234 }, { "epoch": 0.4790070784446815, "grad_norm": 2.2711723756315463, "learning_rate": 6.226391066655313e-06, "loss": 0.7091, "step": 1235 }, { "epoch": 0.47939493842722775, "grad_norm": 2.088350289750981, "learning_rate": 6.219826096027211e-06, "loss": 0.698, "step": 1236 }, { "epoch": 0.47978279840977406, "grad_norm": 2.1140522427856725, "learning_rate": 6.213258888627561e-06, "loss": 0.6705, "step": 1237 }, { "epoch": 0.48017065839232037, "grad_norm": 2.434709169339579, "learning_rate": 6.206689456498529e-06, "loss": 0.7627, "step": 1238 }, { "epoch": 0.4805585183748667, "grad_norm": 2.23620954337942, "learning_rate": 6.200117811686354e-06, "loss": 0.711, "step": 1239 }, { "epoch": 0.480946378357413, "grad_norm": 2.4342785458507006, "learning_rate": 6.193543966241332e-06, "loss": 0.7202, "step": 1240 }, { "epoch": 0.48133423833995925, "grad_norm": 2.660702641492823, "learning_rate": 6.1869679322178e-06, "loss": 0.6927, "step": 1241 }, { "epoch": 0.48172209832250557, "grad_norm": 2.1432407365063533, "learning_rate": 6.180389721674101e-06, "loss": 0.6765, "step": 1242 }, { "epoch": 0.4821099583050519, "grad_norm": 2.5738497451654436, "learning_rate": 6.1738093466725745e-06, "loss": 0.7477, "step": 1243 }, { "epoch": 0.4824978182875982, "grad_norm": 2.859521270877098, "learning_rate": 6.1672268192795285e-06, "loss": 0.7285, "step": 1244 }, { "epoch": 0.4828856782701445, "grad_norm": 2.744012493749768, "learning_rate": 6.1606421515652124e-06, "loss": 0.6953, "step": 1245 }, { "epoch": 0.48327353825269076, "grad_norm": 2.5023738206726867, "learning_rate": 6.1540553556038075e-06, "loss": 0.6611, "step": 1246 }, { "epoch": 0.48366139823523707, "grad_norm": 2.726640173978508, "learning_rate": 6.1474664434733935e-06, "loss": 0.6624, "step": 1247 }, { "epoch": 0.4840492582177834, "grad_norm": 2.586791197474196, "learning_rate": 6.14087542725593e-06, "loss": 0.7137, "step": 1248 }, { "epoch": 0.4844371182003297, "grad_norm": 2.149868285979903, "learning_rate": 6.134282319037238e-06, "loss": 0.7024, "step": 1249 }, { "epoch": 0.48482497818287595, "grad_norm": 3.1200650417013027, "learning_rate": 6.127687130906972e-06, "loss": 0.759, "step": 1250 }, { "epoch": 0.48521283816542227, "grad_norm": 3.0385006548730535, "learning_rate": 6.1210898749586e-06, "loss": 0.7335, "step": 1251 }, { "epoch": 0.4856006981479686, "grad_norm": 1.7744197074175725, "learning_rate": 6.114490563289384e-06, "loss": 0.7523, "step": 1252 }, { "epoch": 0.4859885581305149, "grad_norm": 2.4533599402379207, "learning_rate": 6.1078892080003535e-06, "loss": 0.6817, "step": 1253 }, { "epoch": 0.4863764181130612, "grad_norm": 2.8501968777980493, "learning_rate": 6.101285821196285e-06, "loss": 0.7508, "step": 1254 }, { "epoch": 0.48676427809560746, "grad_norm": 2.9387892827633193, "learning_rate": 6.094680414985685e-06, "loss": 0.7009, "step": 1255 }, { "epoch": 0.4871521380781538, "grad_norm": 3.7246911482454874, "learning_rate": 6.088073001480757e-06, "loss": 0.796, "step": 1256 }, { "epoch": 0.4875399980607001, "grad_norm": 2.6342997585153256, "learning_rate": 6.081463592797388e-06, "loss": 0.7054, "step": 1257 }, { "epoch": 0.4879278580432464, "grad_norm": 2.7325734855488886, "learning_rate": 6.074852201055121e-06, "loss": 0.6896, "step": 1258 }, { "epoch": 0.4883157180257927, "grad_norm": 2.6755583857919465, "learning_rate": 6.06823883837714e-06, "loss": 0.7077, "step": 1259 }, { "epoch": 0.48870357800833897, "grad_norm": 1.9730663747648456, "learning_rate": 6.061623516890238e-06, "loss": 0.6811, "step": 1260 }, { "epoch": 0.4890914379908853, "grad_norm": 2.3430414739175927, "learning_rate": 6.0550062487248055e-06, "loss": 0.6565, "step": 1261 }, { "epoch": 0.4894792979734316, "grad_norm": 2.8107623588483874, "learning_rate": 6.048387046014795e-06, "loss": 0.725, "step": 1262 }, { "epoch": 0.4898671579559779, "grad_norm": 2.3112258777673986, "learning_rate": 6.041765920897713e-06, "loss": 0.712, "step": 1263 }, { "epoch": 0.4902550179385242, "grad_norm": 2.4928628099898402, "learning_rate": 6.03514288551459e-06, "loss": 0.7801, "step": 1264 }, { "epoch": 0.4906428779210705, "grad_norm": 2.7433209813469905, "learning_rate": 6.028517952009957e-06, "loss": 0.7303, "step": 1265 }, { "epoch": 0.4910307379036168, "grad_norm": 2.637689479365937, "learning_rate": 6.021891132531825e-06, "loss": 0.7015, "step": 1266 }, { "epoch": 0.4914185978861631, "grad_norm": 3.1191167867689007, "learning_rate": 6.015262439231666e-06, "loss": 0.6878, "step": 1267 }, { "epoch": 0.4918064578687094, "grad_norm": 2.7831035141530465, "learning_rate": 6.008631884264387e-06, "loss": 0.7018, "step": 1268 }, { "epoch": 0.49219431785125567, "grad_norm": 2.3976243824723933, "learning_rate": 6.00199947978831e-06, "loss": 0.6655, "step": 1269 }, { "epoch": 0.492582177833802, "grad_norm": 2.5780823950509957, "learning_rate": 5.995365237965144e-06, "loss": 0.6672, "step": 1270 }, { "epoch": 0.4929700378163483, "grad_norm": 2.0292089837980165, "learning_rate": 5.98872917095997e-06, "loss": 0.6961, "step": 1271 }, { "epoch": 0.4933578977988946, "grad_norm": 1.9630637163462052, "learning_rate": 5.98209129094122e-06, "loss": 0.6692, "step": 1272 }, { "epoch": 0.4937457577814409, "grad_norm": 3.200236519678224, "learning_rate": 5.975451610080643e-06, "loss": 0.7428, "step": 1273 }, { "epoch": 0.4941336177639872, "grad_norm": 2.02786767774801, "learning_rate": 5.968810140553292e-06, "loss": 0.6851, "step": 1274 }, { "epoch": 0.4945214777465335, "grad_norm": 2.272052892601218, "learning_rate": 5.962166894537507e-06, "loss": 0.6298, "step": 1275 }, { "epoch": 0.4949093377290798, "grad_norm": 1.9366796385503493, "learning_rate": 5.955521884214872e-06, "loss": 0.6717, "step": 1276 }, { "epoch": 0.4952971977116261, "grad_norm": 2.079983090043288, "learning_rate": 5.948875121770221e-06, "loss": 0.6638, "step": 1277 }, { "epoch": 0.4956850576941724, "grad_norm": 3.2812461414087135, "learning_rate": 5.942226619391592e-06, "loss": 0.7627, "step": 1278 }, { "epoch": 0.4960729176767187, "grad_norm": 2.3837068581996017, "learning_rate": 5.935576389270215e-06, "loss": 0.7325, "step": 1279 }, { "epoch": 0.496460777659265, "grad_norm": 2.3117711192635033, "learning_rate": 5.928924443600487e-06, "loss": 0.6932, "step": 1280 }, { "epoch": 0.4968486376418113, "grad_norm": 2.8719193445274143, "learning_rate": 5.922270794579953e-06, "loss": 0.7845, "step": 1281 }, { "epoch": 0.4972364976243576, "grad_norm": 2.8439419936559647, "learning_rate": 5.915615454409281e-06, "loss": 0.7559, "step": 1282 }, { "epoch": 0.49762435760690393, "grad_norm": 2.9559357719633725, "learning_rate": 5.908958435292241e-06, "loss": 0.738, "step": 1283 }, { "epoch": 0.4980122175894502, "grad_norm": 2.488807663110869, "learning_rate": 5.902299749435678e-06, "loss": 0.6956, "step": 1284 }, { "epoch": 0.4984000775719965, "grad_norm": 2.1626319339146347, "learning_rate": 5.895639409049497e-06, "loss": 0.7202, "step": 1285 }, { "epoch": 0.4987879375545428, "grad_norm": 2.9340875070621744, "learning_rate": 5.888977426346636e-06, "loss": 0.7958, "step": 1286 }, { "epoch": 0.4991757975370891, "grad_norm": 2.591934498474498, "learning_rate": 5.882313813543043e-06, "loss": 0.785, "step": 1287 }, { "epoch": 0.4995636575196354, "grad_norm": 2.96302617414892, "learning_rate": 5.875648582857655e-06, "loss": 0.7079, "step": 1288 }, { "epoch": 0.4999515175021817, "grad_norm": 2.2728941833596754, "learning_rate": 5.868981746512379e-06, "loss": 0.7111, "step": 1289 }, { "epoch": 0.500339377484728, "grad_norm": 2.633025661311163, "learning_rate": 5.862313316732064e-06, "loss": 0.7338, "step": 1290 }, { "epoch": 0.5007272374672743, "grad_norm": 2.3650556981023567, "learning_rate": 5.855643305744479e-06, "loss": 0.6483, "step": 1291 }, { "epoch": 0.5011150974498206, "grad_norm": 2.643270973244449, "learning_rate": 5.848971725780294e-06, "loss": 0.6611, "step": 1292 }, { "epoch": 0.501502957432367, "grad_norm": 2.9894060460580953, "learning_rate": 5.842298589073058e-06, "loss": 0.7221, "step": 1293 }, { "epoch": 0.5018908174149133, "grad_norm": 2.965192298296221, "learning_rate": 5.835623907859173e-06, "loss": 0.7299, "step": 1294 }, { "epoch": 0.5022786773974596, "grad_norm": 2.307154638156463, "learning_rate": 5.828947694377871e-06, "loss": 0.654, "step": 1295 }, { "epoch": 0.5026665373800058, "grad_norm": 1.7965807175606123, "learning_rate": 5.822269960871198e-06, "loss": 0.6625, "step": 1296 }, { "epoch": 0.5030543973625521, "grad_norm": 1.7622340423619112, "learning_rate": 5.815590719583984e-06, "loss": 0.7039, "step": 1297 }, { "epoch": 0.5034422573450984, "grad_norm": 2.5308037911047725, "learning_rate": 5.808909982763825e-06, "loss": 0.7048, "step": 1298 }, { "epoch": 0.5038301173276447, "grad_norm": 2.4698615694293844, "learning_rate": 5.802227762661058e-06, "loss": 0.7445, "step": 1299 }, { "epoch": 0.504217977310191, "grad_norm": 2.081033237550896, "learning_rate": 5.795544071528742e-06, "loss": 0.6334, "step": 1300 }, { "epoch": 0.5046058372927373, "grad_norm": 3.1215450760644643, "learning_rate": 5.7888589216226295e-06, "loss": 0.7324, "step": 1301 }, { "epoch": 0.5049936972752836, "grad_norm": 2.134490448619941, "learning_rate": 5.782172325201155e-06, "loss": 0.6446, "step": 1302 }, { "epoch": 0.50538155725783, "grad_norm": 2.7732598525433474, "learning_rate": 5.775484294525399e-06, "loss": 0.717, "step": 1303 }, { "epoch": 0.5057694172403763, "grad_norm": 2.5543366147822093, "learning_rate": 5.768794841859074e-06, "loss": 0.7434, "step": 1304 }, { "epoch": 0.5061572772229225, "grad_norm": 2.923152721123601, "learning_rate": 5.762103979468501e-06, "loss": 0.7435, "step": 1305 }, { "epoch": 0.5065451372054688, "grad_norm": 2.004149263900733, "learning_rate": 5.755411719622584e-06, "loss": 0.6678, "step": 1306 }, { "epoch": 0.5069329971880151, "grad_norm": 2.3210331897022636, "learning_rate": 5.748718074592792e-06, "loss": 0.7358, "step": 1307 }, { "epoch": 0.5073208571705614, "grad_norm": 2.719387919596997, "learning_rate": 5.742023056653131e-06, "loss": 0.7279, "step": 1308 }, { "epoch": 0.5077087171531077, "grad_norm": 2.6580849999005274, "learning_rate": 5.735326678080127e-06, "loss": 0.7009, "step": 1309 }, { "epoch": 0.508096577135654, "grad_norm": 1.9219288050266448, "learning_rate": 5.728628951152799e-06, "loss": 0.6697, "step": 1310 }, { "epoch": 0.5084844371182003, "grad_norm": 1.9995184008059135, "learning_rate": 5.721929888152642e-06, "loss": 0.7295, "step": 1311 }, { "epoch": 0.5088722971007467, "grad_norm": 2.5581644677407747, "learning_rate": 5.715229501363595e-06, "loss": 0.7032, "step": 1312 }, { "epoch": 0.509260157083293, "grad_norm": 2.2610163744144884, "learning_rate": 5.708527803072031e-06, "loss": 0.6608, "step": 1313 }, { "epoch": 0.5096480170658393, "grad_norm": 2.0980118439670776, "learning_rate": 5.701824805566722e-06, "loss": 0.6784, "step": 1314 }, { "epoch": 0.5100358770483855, "grad_norm": 2.3393852155870145, "learning_rate": 5.695120521138827e-06, "loss": 0.6233, "step": 1315 }, { "epoch": 0.5104237370309318, "grad_norm": 2.763184653709563, "learning_rate": 5.688414962081862e-06, "loss": 0.7165, "step": 1316 }, { "epoch": 0.5108115970134781, "grad_norm": 2.9122165931313497, "learning_rate": 5.681708140691681e-06, "loss": 0.6969, "step": 1317 }, { "epoch": 0.5111994569960244, "grad_norm": 2.3721080660418323, "learning_rate": 5.675000069266451e-06, "loss": 0.6897, "step": 1318 }, { "epoch": 0.5115873169785707, "grad_norm": 2.4623102738024283, "learning_rate": 5.668290760106636e-06, "loss": 0.6853, "step": 1319 }, { "epoch": 0.511975176961117, "grad_norm": 2.359847978851047, "learning_rate": 5.661580225514966e-06, "loss": 0.7512, "step": 1320 }, { "epoch": 0.5123630369436634, "grad_norm": 2.0348378048115516, "learning_rate": 5.654868477796418e-06, "loss": 0.6445, "step": 1321 }, { "epoch": 0.5127508969262097, "grad_norm": 2.2554060756974796, "learning_rate": 5.648155529258195e-06, "loss": 0.7242, "step": 1322 }, { "epoch": 0.513138756908756, "grad_norm": 1.9581374653539718, "learning_rate": 5.641441392209699e-06, "loss": 0.6628, "step": 1323 }, { "epoch": 0.5135266168913022, "grad_norm": 2.344784118428859, "learning_rate": 5.634726078962514e-06, "loss": 0.6954, "step": 1324 }, { "epoch": 0.5139144768738485, "grad_norm": 2.3922435260153723, "learning_rate": 5.628009601830382e-06, "loss": 0.6799, "step": 1325 }, { "epoch": 0.5143023368563948, "grad_norm": 3.0286773683898893, "learning_rate": 5.621291973129177e-06, "loss": 0.6763, "step": 1326 }, { "epoch": 0.5146901968389411, "grad_norm": 3.348849256560575, "learning_rate": 5.614573205176882e-06, "loss": 0.7791, "step": 1327 }, { "epoch": 0.5150780568214874, "grad_norm": 2.7558242177610177, "learning_rate": 5.607853310293575e-06, "loss": 0.6891, "step": 1328 }, { "epoch": 0.5154659168040338, "grad_norm": 2.825802759095705, "learning_rate": 5.601132300801398e-06, "loss": 0.652, "step": 1329 }, { "epoch": 0.5158537767865801, "grad_norm": 2.6479413995817245, "learning_rate": 5.594410189024533e-06, "loss": 0.6572, "step": 1330 }, { "epoch": 0.5162416367691264, "grad_norm": 2.249789705823667, "learning_rate": 5.587686987289189e-06, "loss": 0.6749, "step": 1331 }, { "epoch": 0.5166294967516727, "grad_norm": 2.2272524537595713, "learning_rate": 5.580962707923571e-06, "loss": 0.6535, "step": 1332 }, { "epoch": 0.517017356734219, "grad_norm": 2.2434340578616867, "learning_rate": 5.574237363257858e-06, "loss": 0.6978, "step": 1333 }, { "epoch": 0.5174052167167652, "grad_norm": 2.4745395427658337, "learning_rate": 5.567510965624187e-06, "loss": 0.694, "step": 1334 }, { "epoch": 0.5177930766993115, "grad_norm": 2.278817785202304, "learning_rate": 5.560783527356622e-06, "loss": 0.6986, "step": 1335 }, { "epoch": 0.5181809366818578, "grad_norm": 1.601930340983911, "learning_rate": 5.554055060791138e-06, "loss": 0.5956, "step": 1336 }, { "epoch": 0.5185687966644041, "grad_norm": 3.3880113362078443, "learning_rate": 5.547325578265594e-06, "loss": 0.6904, "step": 1337 }, { "epoch": 0.5189566566469505, "grad_norm": 2.0069456803390553, "learning_rate": 5.540595092119709e-06, "loss": 0.7102, "step": 1338 }, { "epoch": 0.5193445166294968, "grad_norm": 2.0238780123148596, "learning_rate": 5.53386361469505e-06, "loss": 0.6886, "step": 1339 }, { "epoch": 0.5197323766120431, "grad_norm": 3.080598625334105, "learning_rate": 5.527131158334993e-06, "loss": 0.6889, "step": 1340 }, { "epoch": 0.5201202365945894, "grad_norm": 2.1747479399808087, "learning_rate": 5.520397735384716e-06, "loss": 0.7127, "step": 1341 }, { "epoch": 0.5205080965771357, "grad_norm": 2.345936421566585, "learning_rate": 5.513663358191166e-06, "loss": 0.6917, "step": 1342 }, { "epoch": 0.5208959565596819, "grad_norm": 2.2501169481930856, "learning_rate": 5.50692803910304e-06, "loss": 0.6614, "step": 1343 }, { "epoch": 0.5212838165422282, "grad_norm": 2.791155470716508, "learning_rate": 5.500191790470761e-06, "loss": 0.7049, "step": 1344 }, { "epoch": 0.5216716765247745, "grad_norm": 2.940974342022708, "learning_rate": 5.493454624646461e-06, "loss": 0.7683, "step": 1345 }, { "epoch": 0.5220595365073208, "grad_norm": 2.064205865437221, "learning_rate": 5.4867165539839505e-06, "loss": 0.6961, "step": 1346 }, { "epoch": 0.5224473964898672, "grad_norm": 2.5904016750154004, "learning_rate": 5.479977590838697e-06, "loss": 0.7188, "step": 1347 }, { "epoch": 0.5228352564724135, "grad_norm": 1.8236839883680598, "learning_rate": 5.473237747567805e-06, "loss": 0.6923, "step": 1348 }, { "epoch": 0.5232231164549598, "grad_norm": 2.759509381224824, "learning_rate": 5.466497036530002e-06, "loss": 0.682, "step": 1349 }, { "epoch": 0.5236109764375061, "grad_norm": 2.9009448775571074, "learning_rate": 5.459755470085595e-06, "loss": 0.6915, "step": 1350 }, { "epoch": 0.5239988364200524, "grad_norm": 2.3394798023492274, "learning_rate": 5.453013060596465e-06, "loss": 0.7663, "step": 1351 }, { "epoch": 0.5243866964025986, "grad_norm": 2.611855407608093, "learning_rate": 5.4462698204260365e-06, "loss": 0.7139, "step": 1352 }, { "epoch": 0.5247745563851449, "grad_norm": 2.348158119782249, "learning_rate": 5.439525761939261e-06, "loss": 0.6192, "step": 1353 }, { "epoch": 0.5251624163676912, "grad_norm": 2.7384801581005878, "learning_rate": 5.432780897502588e-06, "loss": 0.7639, "step": 1354 }, { "epoch": 0.5255502763502375, "grad_norm": 2.7076505877824175, "learning_rate": 5.4260352394839445e-06, "loss": 0.6681, "step": 1355 }, { "epoch": 0.5259381363327839, "grad_norm": 2.9251147872116703, "learning_rate": 5.419288800252713e-06, "loss": 0.7148, "step": 1356 }, { "epoch": 0.5263259963153302, "grad_norm": 1.9601181894219555, "learning_rate": 5.412541592179708e-06, "loss": 0.6827, "step": 1357 }, { "epoch": 0.5267138562978765, "grad_norm": 3.4102097310302777, "learning_rate": 5.405793627637157e-06, "loss": 0.6955, "step": 1358 }, { "epoch": 0.5271017162804228, "grad_norm": 2.1162433361476007, "learning_rate": 5.3990449189986705e-06, "loss": 0.6588, "step": 1359 }, { "epoch": 0.5274895762629691, "grad_norm": 3.3787204762191045, "learning_rate": 5.392295478639226e-06, "loss": 0.7436, "step": 1360 }, { "epoch": 0.5278774362455154, "grad_norm": 2.319079857312065, "learning_rate": 5.38554531893514e-06, "loss": 0.6879, "step": 1361 }, { "epoch": 0.5282652962280616, "grad_norm": 3.492343882748287, "learning_rate": 5.378794452264053e-06, "loss": 0.7376, "step": 1362 }, { "epoch": 0.5286531562106079, "grad_norm": 2.1335754314243043, "learning_rate": 5.372042891004896e-06, "loss": 0.6274, "step": 1363 }, { "epoch": 0.5290410161931542, "grad_norm": 2.305033079460489, "learning_rate": 5.365290647537878e-06, "loss": 0.6532, "step": 1364 }, { "epoch": 0.5294288761757006, "grad_norm": 2.6360279044396004, "learning_rate": 5.3585377342444566e-06, "loss": 0.7441, "step": 1365 }, { "epoch": 0.5298167361582469, "grad_norm": 2.1373177958219394, "learning_rate": 5.351784163507319e-06, "loss": 0.6931, "step": 1366 }, { "epoch": 0.5302045961407932, "grad_norm": 2.7557493158423645, "learning_rate": 5.345029947710357e-06, "loss": 0.7409, "step": 1367 }, { "epoch": 0.5305924561233395, "grad_norm": 2.5007986613988096, "learning_rate": 5.338275099238647e-06, "loss": 0.6592, "step": 1368 }, { "epoch": 0.5309803161058858, "grad_norm": 2.654836252482071, "learning_rate": 5.331519630478421e-06, "loss": 0.7917, "step": 1369 }, { "epoch": 0.5313681760884321, "grad_norm": 2.1150002003356905, "learning_rate": 5.3247635538170536e-06, "loss": 0.583, "step": 1370 }, { "epoch": 0.5317560360709783, "grad_norm": 2.955891356109282, "learning_rate": 5.318006881643034e-06, "loss": 0.7658, "step": 1371 }, { "epoch": 0.5321438960535246, "grad_norm": 3.357132645924024, "learning_rate": 5.311249626345938e-06, "loss": 0.7162, "step": 1372 }, { "epoch": 0.532531756036071, "grad_norm": 1.9017060825506893, "learning_rate": 5.304491800316416e-06, "loss": 0.6478, "step": 1373 }, { "epoch": 0.5329196160186173, "grad_norm": 2.9952591059921096, "learning_rate": 5.297733415946161e-06, "loss": 0.7969, "step": 1374 }, { "epoch": 0.5333074760011636, "grad_norm": 2.6090579337720277, "learning_rate": 5.290974485627894e-06, "loss": 0.7137, "step": 1375 }, { "epoch": 0.5336953359837099, "grad_norm": 2.518317027415565, "learning_rate": 5.284215021755336e-06, "loss": 0.6878, "step": 1376 }, { "epoch": 0.5340831959662562, "grad_norm": 3.620625171592069, "learning_rate": 5.277455036723182e-06, "loss": 0.7426, "step": 1377 }, { "epoch": 0.5344710559488025, "grad_norm": 2.4070723555267293, "learning_rate": 5.270694542927089e-06, "loss": 0.7173, "step": 1378 }, { "epoch": 0.5348589159313488, "grad_norm": 3.05876432755538, "learning_rate": 5.263933552763641e-06, "loss": 0.6377, "step": 1379 }, { "epoch": 0.5352467759138951, "grad_norm": 2.4813008559502867, "learning_rate": 5.257172078630337e-06, "loss": 0.7136, "step": 1380 }, { "epoch": 0.5356346358964413, "grad_norm": 2.3987809853830813, "learning_rate": 5.250410132925561e-06, "loss": 0.6631, "step": 1381 }, { "epoch": 0.5360224958789876, "grad_norm": 2.399443403667994, "learning_rate": 5.243647728048561e-06, "loss": 0.7139, "step": 1382 }, { "epoch": 0.536410355861534, "grad_norm": 2.603399893902565, "learning_rate": 5.23688487639943e-06, "loss": 0.7182, "step": 1383 }, { "epoch": 0.5367982158440803, "grad_norm": 2.623561960384915, "learning_rate": 5.2301215903790785e-06, "loss": 0.7391, "step": 1384 }, { "epoch": 0.5371860758266266, "grad_norm": 2.4784116609848845, "learning_rate": 5.223357882389212e-06, "loss": 0.724, "step": 1385 }, { "epoch": 0.5375739358091729, "grad_norm": 2.3811285967869265, "learning_rate": 5.2165937648323115e-06, "loss": 0.7342, "step": 1386 }, { "epoch": 0.5379617957917192, "grad_norm": 2.7977296153363094, "learning_rate": 5.209829250111609e-06, "loss": 0.7187, "step": 1387 }, { "epoch": 0.5383496557742655, "grad_norm": 2.0402256353216544, "learning_rate": 5.203064350631064e-06, "loss": 0.6837, "step": 1388 }, { "epoch": 0.5387375157568118, "grad_norm": 1.8589359160419467, "learning_rate": 5.1962990787953436e-06, "loss": 0.6617, "step": 1389 }, { "epoch": 0.539125375739358, "grad_norm": 3.686601060613512, "learning_rate": 5.189533447009795e-06, "loss": 0.7214, "step": 1390 }, { "epoch": 0.5395132357219043, "grad_norm": 2.5855761336675926, "learning_rate": 5.182767467680425e-06, "loss": 0.7077, "step": 1391 }, { "epoch": 0.5399010957044507, "grad_norm": 2.724862146720742, "learning_rate": 5.176001153213881e-06, "loss": 0.7198, "step": 1392 }, { "epoch": 0.540288955686997, "grad_norm": 2.0665141776028655, "learning_rate": 5.1692345160174225e-06, "loss": 0.6852, "step": 1393 }, { "epoch": 0.5406768156695433, "grad_norm": 2.452057541157922, "learning_rate": 5.1624675684989035e-06, "loss": 0.6817, "step": 1394 }, { "epoch": 0.5410646756520896, "grad_norm": 2.038165812387352, "learning_rate": 5.155700323066741e-06, "loss": 0.6878, "step": 1395 }, { "epoch": 0.5414525356346359, "grad_norm": 2.565935423740852, "learning_rate": 5.148932792129905e-06, "loss": 0.6539, "step": 1396 }, { "epoch": 0.5418403956171822, "grad_norm": 2.645476566126778, "learning_rate": 5.142164988097885e-06, "loss": 0.7074, "step": 1397 }, { "epoch": 0.5422282555997285, "grad_norm": 2.0190424752006746, "learning_rate": 5.1353969233806735e-06, "loss": 0.6537, "step": 1398 }, { "epoch": 0.5426161155822748, "grad_norm": 2.2371223314018045, "learning_rate": 5.128628610388739e-06, "loss": 0.7012, "step": 1399 }, { "epoch": 0.543003975564821, "grad_norm": 2.4052455866205706, "learning_rate": 5.121860061533006e-06, "loss": 0.6532, "step": 1400 }, { "epoch": 0.5433918355473674, "grad_norm": 2.1143090982344095, "learning_rate": 5.1150912892248335e-06, "loss": 0.6782, "step": 1401 }, { "epoch": 0.5437796955299137, "grad_norm": 2.818241910049625, "learning_rate": 5.108322305875988e-06, "loss": 0.6957, "step": 1402 }, { "epoch": 0.54416755551246, "grad_norm": 2.1082717830575404, "learning_rate": 5.101553123898621e-06, "loss": 0.7153, "step": 1403 }, { "epoch": 0.5445554154950063, "grad_norm": 1.94033467843712, "learning_rate": 5.0947837557052536e-06, "loss": 0.628, "step": 1404 }, { "epoch": 0.5449432754775526, "grad_norm": 2.6697507839202967, "learning_rate": 5.0880142137087455e-06, "loss": 0.7618, "step": 1405 }, { "epoch": 0.5453311354600989, "grad_norm": 2.4340840892107107, "learning_rate": 5.0812445103222745e-06, "loss": 0.7031, "step": 1406 }, { "epoch": 0.5457189954426452, "grad_norm": 2.0323977630092815, "learning_rate": 5.074474657959313e-06, "loss": 0.682, "step": 1407 }, { "epoch": 0.5461068554251916, "grad_norm": 3.12277093764224, "learning_rate": 5.06770466903361e-06, "loss": 0.7357, "step": 1408 }, { "epoch": 0.5464947154077378, "grad_norm": 1.935480186052403, "learning_rate": 5.060934555959164e-06, "loss": 0.6941, "step": 1409 }, { "epoch": 0.5468825753902841, "grad_norm": 2.951351939043668, "learning_rate": 5.054164331150199e-06, "loss": 0.7315, "step": 1410 }, { "epoch": 0.5472704353728304, "grad_norm": 2.4371581353990424, "learning_rate": 5.047394007021149e-06, "loss": 0.6207, "step": 1411 }, { "epoch": 0.5476582953553767, "grad_norm": 2.307116163769734, "learning_rate": 5.040623595986622e-06, "loss": 0.6883, "step": 1412 }, { "epoch": 0.548046155337923, "grad_norm": 3.0362479922929118, "learning_rate": 5.033853110461393e-06, "loss": 0.7478, "step": 1413 }, { "epoch": 0.5484340153204693, "grad_norm": 1.9736435010359663, "learning_rate": 5.027082562860368e-06, "loss": 0.6349, "step": 1414 }, { "epoch": 0.5488218753030156, "grad_norm": 2.1970774967106994, "learning_rate": 5.020311965598572e-06, "loss": 0.6412, "step": 1415 }, { "epoch": 0.5492097352855619, "grad_norm": 2.725160272356875, "learning_rate": 5.013541331091117e-06, "loss": 0.6665, "step": 1416 }, { "epoch": 0.5495975952681083, "grad_norm": 2.568707190861362, "learning_rate": 5.006770671753183e-06, "loss": 0.6788, "step": 1417 }, { "epoch": 0.5499854552506546, "grad_norm": 2.516097758281976, "learning_rate": 5e-06, "loss": 0.6312, "step": 1418 }, { "epoch": 0.5503733152332008, "grad_norm": 2.3650320239433973, "learning_rate": 4.993229328246818e-06, "loss": 0.7004, "step": 1419 }, { "epoch": 0.5507611752157471, "grad_norm": 2.73222096416183, "learning_rate": 4.986458668908886e-06, "loss": 0.7246, "step": 1420 }, { "epoch": 0.5511490351982934, "grad_norm": 3.019388721384817, "learning_rate": 4.9796880344014305e-06, "loss": 0.6773, "step": 1421 }, { "epoch": 0.5515368951808397, "grad_norm": 2.5883687207945947, "learning_rate": 4.972917437139634e-06, "loss": 0.6745, "step": 1422 }, { "epoch": 0.551924755163386, "grad_norm": 2.251043655695189, "learning_rate": 4.966146889538608e-06, "loss": 0.6891, "step": 1423 }, { "epoch": 0.5523126151459323, "grad_norm": 2.0934184312535, "learning_rate": 4.959376404013378e-06, "loss": 0.7063, "step": 1424 }, { "epoch": 0.5527004751284786, "grad_norm": 2.020525798016942, "learning_rate": 4.952605992978853e-06, "loss": 0.6583, "step": 1425 }, { "epoch": 0.553088335111025, "grad_norm": 2.2047383672322356, "learning_rate": 4.945835668849801e-06, "loss": 0.7143, "step": 1426 }, { "epoch": 0.5534761950935713, "grad_norm": 1.796428186253084, "learning_rate": 4.9390654440408374e-06, "loss": 0.6454, "step": 1427 }, { "epoch": 0.5538640550761175, "grad_norm": 2.9815499779394976, "learning_rate": 4.932295330966392e-06, "loss": 0.7751, "step": 1428 }, { "epoch": 0.5542519150586638, "grad_norm": 2.3412836692499406, "learning_rate": 4.925525342040689e-06, "loss": 0.6871, "step": 1429 }, { "epoch": 0.5546397750412101, "grad_norm": 2.8047303490858972, "learning_rate": 4.918755489677729e-06, "loss": 0.6839, "step": 1430 }, { "epoch": 0.5550276350237564, "grad_norm": 2.2645800428348792, "learning_rate": 4.9119857862912544e-06, "loss": 0.6958, "step": 1431 }, { "epoch": 0.5554154950063027, "grad_norm": 2.020466801121728, "learning_rate": 4.905216244294746e-06, "loss": 0.6062, "step": 1432 }, { "epoch": 0.555803354988849, "grad_norm": 3.0306311809672555, "learning_rate": 4.8984468761013794e-06, "loss": 0.7574, "step": 1433 }, { "epoch": 0.5561912149713953, "grad_norm": 2.1161115933531445, "learning_rate": 4.891677694124013e-06, "loss": 0.6491, "step": 1434 }, { "epoch": 0.5565790749539417, "grad_norm": 2.635091044387135, "learning_rate": 4.884908710775167e-06, "loss": 0.6905, "step": 1435 }, { "epoch": 0.556966934936488, "grad_norm": 2.7513289360385103, "learning_rate": 4.878139938466995e-06, "loss": 0.6847, "step": 1436 }, { "epoch": 0.5573547949190343, "grad_norm": 2.1895226686294667, "learning_rate": 4.871371389611263e-06, "loss": 0.6448, "step": 1437 }, { "epoch": 0.5577426549015805, "grad_norm": 2.6260431325572844, "learning_rate": 4.864603076619329e-06, "loss": 0.7021, "step": 1438 }, { "epoch": 0.5581305148841268, "grad_norm": 2.6664695023903104, "learning_rate": 4.8578350119021176e-06, "loss": 0.7371, "step": 1439 }, { "epoch": 0.5585183748666731, "grad_norm": 2.3489803503796116, "learning_rate": 4.851067207870096e-06, "loss": 0.7342, "step": 1440 }, { "epoch": 0.5589062348492194, "grad_norm": 2.068110611174421, "learning_rate": 4.8442996769332605e-06, "loss": 0.6663, "step": 1441 }, { "epoch": 0.5592940948317657, "grad_norm": 2.4071078576961242, "learning_rate": 4.837532431501098e-06, "loss": 0.7215, "step": 1442 }, { "epoch": 0.559681954814312, "grad_norm": 1.8671877353991384, "learning_rate": 4.830765483982578e-06, "loss": 0.6056, "step": 1443 }, { "epoch": 0.5600698147968584, "grad_norm": 2.97163024272937, "learning_rate": 4.82399884678612e-06, "loss": 0.731, "step": 1444 }, { "epoch": 0.5604576747794047, "grad_norm": 2.6323569615819356, "learning_rate": 4.817232532319577e-06, "loss": 0.634, "step": 1445 }, { "epoch": 0.560845534761951, "grad_norm": 1.946988094034313, "learning_rate": 4.8104665529902075e-06, "loss": 0.6419, "step": 1446 }, { "epoch": 0.5612333947444972, "grad_norm": 2.174627220697791, "learning_rate": 4.803700921204659e-06, "loss": 0.638, "step": 1447 }, { "epoch": 0.5616212547270435, "grad_norm": 2.6492637357428164, "learning_rate": 4.796935649368936e-06, "loss": 0.7464, "step": 1448 }, { "epoch": 0.5620091147095898, "grad_norm": 2.3311207262320472, "learning_rate": 4.790170749888392e-06, "loss": 0.6614, "step": 1449 }, { "epoch": 0.5623969746921361, "grad_norm": 2.7714059475506505, "learning_rate": 4.783406235167689e-06, "loss": 0.6835, "step": 1450 }, { "epoch": 0.5627848346746824, "grad_norm": 3.2258133202087875, "learning_rate": 4.77664211761079e-06, "loss": 0.723, "step": 1451 }, { "epoch": 0.5631726946572287, "grad_norm": 2.3992893677534703, "learning_rate": 4.769878409620923e-06, "loss": 0.7342, "step": 1452 }, { "epoch": 0.5635605546397751, "grad_norm": 2.8216782365587987, "learning_rate": 4.763115123600571e-06, "loss": 0.6868, "step": 1453 }, { "epoch": 0.5639484146223214, "grad_norm": 2.3798661383216118, "learning_rate": 4.756352271951441e-06, "loss": 0.6843, "step": 1454 }, { "epoch": 0.5643362746048677, "grad_norm": 3.04798366234236, "learning_rate": 4.7495898670744415e-06, "loss": 0.7299, "step": 1455 }, { "epoch": 0.564724134587414, "grad_norm": 2.887622011362595, "learning_rate": 4.742827921369665e-06, "loss": 0.7235, "step": 1456 }, { "epoch": 0.5651119945699602, "grad_norm": 2.3272844149286542, "learning_rate": 4.7360664472363605e-06, "loss": 0.7385, "step": 1457 }, { "epoch": 0.5654998545525065, "grad_norm": 2.4579992941322453, "learning_rate": 4.729305457072913e-06, "loss": 0.6846, "step": 1458 }, { "epoch": 0.5658877145350528, "grad_norm": 2.0940130979193436, "learning_rate": 4.722544963276819e-06, "loss": 0.6732, "step": 1459 }, { "epoch": 0.5662755745175991, "grad_norm": 2.385747736492191, "learning_rate": 4.715784978244666e-06, "loss": 0.7015, "step": 1460 }, { "epoch": 0.5666634345001454, "grad_norm": 3.0407845230750494, "learning_rate": 4.709025514372107e-06, "loss": 0.6429, "step": 1461 }, { "epoch": 0.5670512944826918, "grad_norm": 2.112725589908354, "learning_rate": 4.70226658405384e-06, "loss": 0.6826, "step": 1462 }, { "epoch": 0.5674391544652381, "grad_norm": 3.4095366572263024, "learning_rate": 4.695508199683587e-06, "loss": 0.6845, "step": 1463 }, { "epoch": 0.5678270144477844, "grad_norm": 2.409550830553999, "learning_rate": 4.688750373654065e-06, "loss": 0.7499, "step": 1464 }, { "epoch": 0.5682148744303307, "grad_norm": 1.978640970514898, "learning_rate": 4.681993118356967e-06, "loss": 0.6745, "step": 1465 }, { "epoch": 0.5686027344128769, "grad_norm": 2.096149649351765, "learning_rate": 4.6752364461829456e-06, "loss": 0.6355, "step": 1466 }, { "epoch": 0.5689905943954232, "grad_norm": 2.1065350220341292, "learning_rate": 4.66848036952158e-06, "loss": 0.7129, "step": 1467 }, { "epoch": 0.5693784543779695, "grad_norm": 2.788554117623923, "learning_rate": 4.661724900761355e-06, "loss": 0.775, "step": 1468 }, { "epoch": 0.5697663143605158, "grad_norm": 2.7132240525160154, "learning_rate": 4.654970052289644e-06, "loss": 0.723, "step": 1469 }, { "epoch": 0.5701541743430621, "grad_norm": 2.3635322440289643, "learning_rate": 4.648215836492682e-06, "loss": 0.704, "step": 1470 }, { "epoch": 0.5705420343256085, "grad_norm": 2.390237772752502, "learning_rate": 4.641462265755545e-06, "loss": 0.6481, "step": 1471 }, { "epoch": 0.5709298943081548, "grad_norm": 3.2720418410549814, "learning_rate": 4.634709352462124e-06, "loss": 0.7456, "step": 1472 }, { "epoch": 0.5713177542907011, "grad_norm": 2.840584595084039, "learning_rate": 4.6279571089951056e-06, "loss": 0.6789, "step": 1473 }, { "epoch": 0.5717056142732474, "grad_norm": 2.131980831994724, "learning_rate": 4.621205547735949e-06, "loss": 0.6825, "step": 1474 }, { "epoch": 0.5720934742557937, "grad_norm": 2.8769999466019316, "learning_rate": 4.614454681064861e-06, "loss": 0.7033, "step": 1475 }, { "epoch": 0.5724813342383399, "grad_norm": 2.3878992605571403, "learning_rate": 4.6077045213607765e-06, "loss": 0.7055, "step": 1476 }, { "epoch": 0.5728691942208862, "grad_norm": 2.35511614861779, "learning_rate": 4.600955081001331e-06, "loss": 0.6831, "step": 1477 }, { "epoch": 0.5732570542034325, "grad_norm": 2.767028462087258, "learning_rate": 4.594206372362845e-06, "loss": 0.7091, "step": 1478 }, { "epoch": 0.5736449141859788, "grad_norm": 2.029748878332817, "learning_rate": 4.587458407820293e-06, "loss": 0.6603, "step": 1479 }, { "epoch": 0.5740327741685252, "grad_norm": 2.7813877838299716, "learning_rate": 4.580711199747289e-06, "loss": 0.6993, "step": 1480 }, { "epoch": 0.5744206341510715, "grad_norm": 1.719001194239928, "learning_rate": 4.573964760516058e-06, "loss": 0.6408, "step": 1481 }, { "epoch": 0.5748084941336178, "grad_norm": 2.008463117804807, "learning_rate": 4.567219102497413e-06, "loss": 0.6085, "step": 1482 }, { "epoch": 0.5751963541161641, "grad_norm": 2.7628970490805997, "learning_rate": 4.56047423806074e-06, "loss": 0.7301, "step": 1483 }, { "epoch": 0.5755842140987104, "grad_norm": 2.577003299808045, "learning_rate": 4.5537301795739635e-06, "loss": 0.6638, "step": 1484 }, { "epoch": 0.5759720740812566, "grad_norm": 2.754708924510402, "learning_rate": 4.546986939403537e-06, "loss": 0.6539, "step": 1485 }, { "epoch": 0.5763599340638029, "grad_norm": 2.8773219620227906, "learning_rate": 4.540244529914406e-06, "loss": 0.7057, "step": 1486 }, { "epoch": 0.5767477940463492, "grad_norm": 2.296921237306315, "learning_rate": 4.533502963469999e-06, "loss": 0.6841, "step": 1487 }, { "epoch": 0.5771356540288956, "grad_norm": 2.63244918753409, "learning_rate": 4.5267622524321955e-06, "loss": 0.6668, "step": 1488 }, { "epoch": 0.5775235140114419, "grad_norm": 2.6276553585928224, "learning_rate": 4.520022409161307e-06, "loss": 0.6361, "step": 1489 }, { "epoch": 0.5779113739939882, "grad_norm": 2.333560455855473, "learning_rate": 4.513283446016052e-06, "loss": 0.6395, "step": 1490 }, { "epoch": 0.5782992339765345, "grad_norm": 2.458999529089588, "learning_rate": 4.50654537535354e-06, "loss": 0.7136, "step": 1491 }, { "epoch": 0.5786870939590808, "grad_norm": 2.972309266657022, "learning_rate": 4.499808209529239e-06, "loss": 0.6969, "step": 1492 }, { "epoch": 0.5790749539416271, "grad_norm": 2.4141050039859797, "learning_rate": 4.4930719608969615e-06, "loss": 0.7078, "step": 1493 }, { "epoch": 0.5794628139241734, "grad_norm": 2.274053728681932, "learning_rate": 4.486336641808835e-06, "loss": 0.6371, "step": 1494 }, { "epoch": 0.5798506739067196, "grad_norm": 2.456429778795526, "learning_rate": 4.479602264615285e-06, "loss": 0.7134, "step": 1495 }, { "epoch": 0.5802385338892659, "grad_norm": 2.680603295681133, "learning_rate": 4.472868841665008e-06, "loss": 0.6773, "step": 1496 }, { "epoch": 0.5806263938718123, "grad_norm": 2.877067383106337, "learning_rate": 4.466136385304952e-06, "loss": 0.6605, "step": 1497 }, { "epoch": 0.5810142538543586, "grad_norm": 1.9639530695649463, "learning_rate": 4.459404907880293e-06, "loss": 0.6912, "step": 1498 }, { "epoch": 0.5814021138369049, "grad_norm": 2.9073705058449106, "learning_rate": 4.452674421734409e-06, "loss": 0.6494, "step": 1499 }, { "epoch": 0.5817899738194512, "grad_norm": 2.294406263533163, "learning_rate": 4.445944939208862e-06, "loss": 0.625, "step": 1500 }, { "epoch": 0.5817899738194512, "eval_loss": 1.347784399986267, "eval_runtime": 6.0698, "eval_samples_per_second": 0.165, "eval_steps_per_second": 0.165, "step": 1500 }, { "epoch": 0.5821778338019975, "grad_norm": 2.54026490320808, "learning_rate": 4.439216472643378e-06, "loss": 0.6635, "step": 1501 }, { "epoch": 0.5825656937845438, "grad_norm": 2.930273206880845, "learning_rate": 4.4324890343758134e-06, "loss": 0.7161, "step": 1502 }, { "epoch": 0.5829535537670901, "grad_norm": 2.682567688313872, "learning_rate": 4.425762636742143e-06, "loss": 0.7529, "step": 1503 }, { "epoch": 0.5833414137496363, "grad_norm": 2.172182797621054, "learning_rate": 4.419037292076431e-06, "loss": 0.6849, "step": 1504 }, { "epoch": 0.5837292737321826, "grad_norm": 1.779750848305935, "learning_rate": 4.4123130127108125e-06, "loss": 0.6525, "step": 1505 }, { "epoch": 0.584117133714729, "grad_norm": 3.1400630604900632, "learning_rate": 4.4055898109754684e-06, "loss": 0.6592, "step": 1506 }, { "epoch": 0.5845049936972753, "grad_norm": 2.7286250226472553, "learning_rate": 4.398867699198604e-06, "loss": 0.7321, "step": 1507 }, { "epoch": 0.5848928536798216, "grad_norm": 2.4608599301013667, "learning_rate": 4.392146689706426e-06, "loss": 0.7374, "step": 1508 }, { "epoch": 0.5852807136623679, "grad_norm": 1.993581015215571, "learning_rate": 4.385426794823119e-06, "loss": 0.6318, "step": 1509 }, { "epoch": 0.5856685736449142, "grad_norm": 1.8215345154190532, "learning_rate": 4.378708026870825e-06, "loss": 0.6854, "step": 1510 }, { "epoch": 0.5860564336274605, "grad_norm": 2.8492616679049183, "learning_rate": 4.371990398169619e-06, "loss": 0.6744, "step": 1511 }, { "epoch": 0.5864442936100068, "grad_norm": 1.6848676364777961, "learning_rate": 4.365273921037486e-06, "loss": 0.6424, "step": 1512 }, { "epoch": 0.5868321535925531, "grad_norm": 2.0840189598099714, "learning_rate": 4.358558607790303e-06, "loss": 0.7283, "step": 1513 }, { "epoch": 0.5872200135750993, "grad_norm": 2.3163783131587636, "learning_rate": 4.351844470741808e-06, "loss": 0.7056, "step": 1514 }, { "epoch": 0.5876078735576457, "grad_norm": 2.1662338347531658, "learning_rate": 4.345131522203584e-06, "loss": 0.6837, "step": 1515 }, { "epoch": 0.587995733540192, "grad_norm": 2.976873131869032, "learning_rate": 4.338419774485036e-06, "loss": 0.6421, "step": 1516 }, { "epoch": 0.5883835935227383, "grad_norm": 2.201543182683474, "learning_rate": 4.331709239893364e-06, "loss": 0.662, "step": 1517 }, { "epoch": 0.5887714535052846, "grad_norm": 3.0389743843976293, "learning_rate": 4.32499993073355e-06, "loss": 0.7168, "step": 1518 }, { "epoch": 0.5891593134878309, "grad_norm": 2.676125597377214, "learning_rate": 4.318291859308321e-06, "loss": 0.7162, "step": 1519 }, { "epoch": 0.5895471734703772, "grad_norm": 1.9904854020992586, "learning_rate": 4.31158503791814e-06, "loss": 0.6352, "step": 1520 }, { "epoch": 0.5899350334529235, "grad_norm": 1.7662305366155213, "learning_rate": 4.3048794788611745e-06, "loss": 0.63, "step": 1521 }, { "epoch": 0.5903228934354698, "grad_norm": 3.3095547079696406, "learning_rate": 4.298175194433279e-06, "loss": 0.6952, "step": 1522 }, { "epoch": 0.590710753418016, "grad_norm": 2.857288561260501, "learning_rate": 4.29147219692797e-06, "loss": 0.7045, "step": 1523 }, { "epoch": 0.5910986134005624, "grad_norm": 2.813959711608309, "learning_rate": 4.284770498636406e-06, "loss": 0.6402, "step": 1524 }, { "epoch": 0.5914864733831087, "grad_norm": 2.150936982390981, "learning_rate": 4.2780701118473585e-06, "loss": 0.6746, "step": 1525 }, { "epoch": 0.591874333365655, "grad_norm": 2.2509049077578487, "learning_rate": 4.271371048847201e-06, "loss": 0.6493, "step": 1526 }, { "epoch": 0.5922621933482013, "grad_norm": 2.9078927612027154, "learning_rate": 4.264673321919874e-06, "loss": 0.6771, "step": 1527 }, { "epoch": 0.5926500533307476, "grad_norm": 2.6105871377848304, "learning_rate": 4.25797694334687e-06, "loss": 0.6516, "step": 1528 }, { "epoch": 0.5930379133132939, "grad_norm": 3.049645346822957, "learning_rate": 4.251281925407209e-06, "loss": 0.6951, "step": 1529 }, { "epoch": 0.5934257732958402, "grad_norm": 2.2614642765882293, "learning_rate": 4.244588280377417e-06, "loss": 0.6339, "step": 1530 }, { "epoch": 0.5938136332783865, "grad_norm": 3.0191049090222575, "learning_rate": 4.2378960205315005e-06, "loss": 0.65, "step": 1531 }, { "epoch": 0.5942014932609329, "grad_norm": 2.7086707562970993, "learning_rate": 4.231205158140927e-06, "loss": 0.6243, "step": 1532 }, { "epoch": 0.5945893532434791, "grad_norm": 2.786599755295974, "learning_rate": 4.224515705474603e-06, "loss": 0.6962, "step": 1533 }, { "epoch": 0.5949772132260254, "grad_norm": 2.4746979620829004, "learning_rate": 4.217827674798845e-06, "loss": 0.6711, "step": 1534 }, { "epoch": 0.5953650732085717, "grad_norm": 2.0428697512053, "learning_rate": 4.211141078377371e-06, "loss": 0.6382, "step": 1535 }, { "epoch": 0.595752933191118, "grad_norm": 2.3738688736414835, "learning_rate": 4.20445592847126e-06, "loss": 0.6591, "step": 1536 }, { "epoch": 0.5961407931736643, "grad_norm": 2.907645585259133, "learning_rate": 4.1977722373389435e-06, "loss": 0.7449, "step": 1537 }, { "epoch": 0.5965286531562106, "grad_norm": 1.9641115477869804, "learning_rate": 4.191090017236177e-06, "loss": 0.6229, "step": 1538 }, { "epoch": 0.5969165131387569, "grad_norm": 2.547778790393003, "learning_rate": 4.184409280416018e-06, "loss": 0.7123, "step": 1539 }, { "epoch": 0.5973043731213032, "grad_norm": 2.120651284551054, "learning_rate": 4.177730039128803e-06, "loss": 0.7375, "step": 1540 }, { "epoch": 0.5976922331038496, "grad_norm": 2.150864571533602, "learning_rate": 4.17105230562213e-06, "loss": 0.6175, "step": 1541 }, { "epoch": 0.5980800930863958, "grad_norm": 1.8914192963720162, "learning_rate": 4.164376092140828e-06, "loss": 0.6692, "step": 1542 }, { "epoch": 0.5984679530689421, "grad_norm": 3.0540376329991616, "learning_rate": 4.157701410926943e-06, "loss": 0.7225, "step": 1543 }, { "epoch": 0.5988558130514884, "grad_norm": 2.938852096532211, "learning_rate": 4.151028274219707e-06, "loss": 0.7002, "step": 1544 }, { "epoch": 0.5992436730340347, "grad_norm": 1.8391955960980417, "learning_rate": 4.144356694255524e-06, "loss": 0.6547, "step": 1545 }, { "epoch": 0.599631533016581, "grad_norm": 1.9600787759232543, "learning_rate": 4.137686683267939e-06, "loss": 0.6563, "step": 1546 }, { "epoch": 0.6000193929991273, "grad_norm": 1.9636115865323793, "learning_rate": 4.1310182534876224e-06, "loss": 0.609, "step": 1547 }, { "epoch": 0.6004072529816736, "grad_norm": 1.9697209474752353, "learning_rate": 4.1243514171423465e-06, "loss": 0.6291, "step": 1548 }, { "epoch": 0.60079511296422, "grad_norm": 3.0992151741673046, "learning_rate": 4.117686186456959e-06, "loss": 0.748, "step": 1549 }, { "epoch": 0.6011829729467663, "grad_norm": 3.1799389657092045, "learning_rate": 4.111022573653366e-06, "loss": 0.7513, "step": 1550 }, { "epoch": 0.6015708329293126, "grad_norm": 2.1947419784412183, "learning_rate": 4.104360590950503e-06, "loss": 0.7177, "step": 1551 }, { "epoch": 0.6019586929118588, "grad_norm": 2.4133385070401077, "learning_rate": 4.097700250564323e-06, "loss": 0.7353, "step": 1552 }, { "epoch": 0.6023465528944051, "grad_norm": 3.1881184859968563, "learning_rate": 4.09104156470776e-06, "loss": 0.7479, "step": 1553 }, { "epoch": 0.6027344128769514, "grad_norm": 1.8723791134008665, "learning_rate": 4.0843845455907195e-06, "loss": 0.6927, "step": 1554 }, { "epoch": 0.6031222728594977, "grad_norm": 2.091764251802231, "learning_rate": 4.077729205420049e-06, "loss": 0.7396, "step": 1555 }, { "epoch": 0.603510132842044, "grad_norm": 2.5669977039649736, "learning_rate": 4.0710755563995155e-06, "loss": 0.6387, "step": 1556 }, { "epoch": 0.6038979928245903, "grad_norm": 2.4777555261409425, "learning_rate": 4.064423610729789e-06, "loss": 0.6733, "step": 1557 }, { "epoch": 0.6042858528071366, "grad_norm": 2.5154849903182512, "learning_rate": 4.057773380608411e-06, "loss": 0.7214, "step": 1558 }, { "epoch": 0.604673712789683, "grad_norm": 2.501280031277874, "learning_rate": 4.051124878229779e-06, "loss": 0.6474, "step": 1559 }, { "epoch": 0.6050615727722293, "grad_norm": 2.4787746740591556, "learning_rate": 4.044478115785128e-06, "loss": 0.6819, "step": 1560 }, { "epoch": 0.6054494327547755, "grad_norm": 2.4650688484503798, "learning_rate": 4.037833105462495e-06, "loss": 0.7195, "step": 1561 }, { "epoch": 0.6058372927373218, "grad_norm": 2.52704244011379, "learning_rate": 4.0311898594467084e-06, "loss": 0.6485, "step": 1562 }, { "epoch": 0.6062251527198681, "grad_norm": 3.0560230207085737, "learning_rate": 4.02454838991936e-06, "loss": 0.7083, "step": 1563 }, { "epoch": 0.6066130127024144, "grad_norm": 2.4470696868300266, "learning_rate": 4.017908709058782e-06, "loss": 0.6852, "step": 1564 }, { "epoch": 0.6070008726849607, "grad_norm": 2.6575364434098057, "learning_rate": 4.011270829040031e-06, "loss": 0.6749, "step": 1565 }, { "epoch": 0.607388732667507, "grad_norm": 2.178021586349511, "learning_rate": 4.004634762034858e-06, "loss": 0.6741, "step": 1566 }, { "epoch": 0.6077765926500533, "grad_norm": 2.4549011689308595, "learning_rate": 3.998000520211693e-06, "loss": 0.6191, "step": 1567 }, { "epoch": 0.6081644526325997, "grad_norm": 2.809295823127617, "learning_rate": 3.991368115735612e-06, "loss": 0.6513, "step": 1568 }, { "epoch": 0.608552312615146, "grad_norm": 2.2083603703964267, "learning_rate": 3.9847375607683335e-06, "loss": 0.6985, "step": 1569 }, { "epoch": 0.6089401725976923, "grad_norm": 2.117616811047947, "learning_rate": 3.9781088674681764e-06, "loss": 0.6626, "step": 1570 }, { "epoch": 0.6093280325802385, "grad_norm": 2.7207124591292864, "learning_rate": 3.971482047990045e-06, "loss": 0.76, "step": 1571 }, { "epoch": 0.6097158925627848, "grad_norm": 3.1342895393501564, "learning_rate": 3.964857114485412e-06, "loss": 0.7239, "step": 1572 }, { "epoch": 0.6101037525453311, "grad_norm": 2.1987085680546277, "learning_rate": 3.958234079102288e-06, "loss": 0.682, "step": 1573 }, { "epoch": 0.6104916125278774, "grad_norm": 2.176335028073533, "learning_rate": 3.951612953985207e-06, "loss": 0.64, "step": 1574 }, { "epoch": 0.6108794725104237, "grad_norm": 1.8090395130266996, "learning_rate": 3.944993751275198e-06, "loss": 0.6415, "step": 1575 }, { "epoch": 0.61126733249297, "grad_norm": 3.2024502068779848, "learning_rate": 3.938376483109762e-06, "loss": 0.7193, "step": 1576 }, { "epoch": 0.6116551924755164, "grad_norm": 2.373008283332793, "learning_rate": 3.931761161622861e-06, "loss": 0.7067, "step": 1577 }, { "epoch": 0.6120430524580627, "grad_norm": 2.621643677458823, "learning_rate": 3.92514779894488e-06, "loss": 0.7266, "step": 1578 }, { "epoch": 0.612430912440609, "grad_norm": 2.1145783402787024, "learning_rate": 3.918536407202614e-06, "loss": 0.6685, "step": 1579 }, { "epoch": 0.6128187724231552, "grad_norm": 1.773912337775789, "learning_rate": 3.911926998519244e-06, "loss": 0.632, "step": 1580 }, { "epoch": 0.6132066324057015, "grad_norm": 2.901303620391854, "learning_rate": 3.905319585014316e-06, "loss": 0.6842, "step": 1581 }, { "epoch": 0.6135944923882478, "grad_norm": 2.578042312518183, "learning_rate": 3.898714178803716e-06, "loss": 0.6682, "step": 1582 }, { "epoch": 0.6139823523707941, "grad_norm": 2.6485714588977034, "learning_rate": 3.892110791999649e-06, "loss": 0.6444, "step": 1583 }, { "epoch": 0.6143702123533404, "grad_norm": 2.4665016974028857, "learning_rate": 3.8855094367106185e-06, "loss": 0.6521, "step": 1584 }, { "epoch": 0.6147580723358868, "grad_norm": 2.04125898642337, "learning_rate": 3.878910125041401e-06, "loss": 0.6576, "step": 1585 }, { "epoch": 0.6151459323184331, "grad_norm": 2.2574694868390317, "learning_rate": 3.87231286909303e-06, "loss": 0.6261, "step": 1586 }, { "epoch": 0.6155337923009794, "grad_norm": 2.424531338299485, "learning_rate": 3.865717680962763e-06, "loss": 0.6902, "step": 1587 }, { "epoch": 0.6159216522835257, "grad_norm": 3.30829636108697, "learning_rate": 3.859124572744072e-06, "loss": 0.6921, "step": 1588 }, { "epoch": 0.616309512266072, "grad_norm": 2.376260037883521, "learning_rate": 3.852533556526609e-06, "loss": 0.6179, "step": 1589 }, { "epoch": 0.6166973722486182, "grad_norm": 2.942873644677855, "learning_rate": 3.845944644396194e-06, "loss": 0.7875, "step": 1590 }, { "epoch": 0.6170852322311645, "grad_norm": 3.4431215611350163, "learning_rate": 3.839357848434789e-06, "loss": 0.7496, "step": 1591 }, { "epoch": 0.6174730922137108, "grad_norm": 1.812410289403563, "learning_rate": 3.832773180720475e-06, "loss": 0.6759, "step": 1592 }, { "epoch": 0.6178609521962571, "grad_norm": 2.5139884784298094, "learning_rate": 3.8261906533274254e-06, "loss": 0.6706, "step": 1593 }, { "epoch": 0.6182488121788035, "grad_norm": 2.800762856100008, "learning_rate": 3.8196102783259e-06, "loss": 0.6811, "step": 1594 }, { "epoch": 0.6186366721613498, "grad_norm": 1.7712845152790089, "learning_rate": 3.813032067782202e-06, "loss": 0.6385, "step": 1595 }, { "epoch": 0.6190245321438961, "grad_norm": 2.327658476980756, "learning_rate": 3.806456033758669e-06, "loss": 0.6141, "step": 1596 }, { "epoch": 0.6194123921264424, "grad_norm": 2.151302652620802, "learning_rate": 3.7998821883136483e-06, "loss": 0.6383, "step": 1597 }, { "epoch": 0.6198002521089887, "grad_norm": 1.829805900686165, "learning_rate": 3.7933105435014727e-06, "loss": 0.6702, "step": 1598 }, { "epoch": 0.6201881120915349, "grad_norm": 2.5040916484652387, "learning_rate": 3.7867411113724402e-06, "loss": 0.6872, "step": 1599 }, { "epoch": 0.6205759720740812, "grad_norm": 2.802323219045996, "learning_rate": 3.780173903972792e-06, "loss": 0.6869, "step": 1600 }, { "epoch": 0.6209638320566275, "grad_norm": 2.0785339955209943, "learning_rate": 3.773608933344689e-06, "loss": 0.6653, "step": 1601 }, { "epoch": 0.6213516920391738, "grad_norm": 1.7508660913898402, "learning_rate": 3.767046211526191e-06, "loss": 0.5789, "step": 1602 }, { "epoch": 0.6217395520217202, "grad_norm": 2.400959618911096, "learning_rate": 3.7604857505512342e-06, "loss": 0.6385, "step": 1603 }, { "epoch": 0.6221274120042665, "grad_norm": 1.9357264900811444, "learning_rate": 3.75392756244961e-06, "loss": 0.6262, "step": 1604 }, { "epoch": 0.6225152719868128, "grad_norm": 2.8810712181390388, "learning_rate": 3.747371659246941e-06, "loss": 0.723, "step": 1605 }, { "epoch": 0.6229031319693591, "grad_norm": 2.560416986270056, "learning_rate": 3.7408180529646597e-06, "loss": 0.6762, "step": 1606 }, { "epoch": 0.6232909919519054, "grad_norm": 1.8493281363116516, "learning_rate": 3.7342667556199872e-06, "loss": 0.6357, "step": 1607 }, { "epoch": 0.6236788519344517, "grad_norm": 2.0116575272202195, "learning_rate": 3.727717779225912e-06, "loss": 0.6241, "step": 1608 }, { "epoch": 0.6240667119169979, "grad_norm": 2.1567215521071765, "learning_rate": 3.721171135791164e-06, "loss": 0.6581, "step": 1609 }, { "epoch": 0.6244545718995442, "grad_norm": 2.10297950663754, "learning_rate": 3.7146268373201956e-06, "loss": 0.6349, "step": 1610 }, { "epoch": 0.6248424318820905, "grad_norm": 1.9477940574351518, "learning_rate": 3.7080848958131644e-06, "loss": 0.6484, "step": 1611 }, { "epoch": 0.6252302918646369, "grad_norm": 2.300206097416494, "learning_rate": 3.7015453232659004e-06, "loss": 0.6986, "step": 1612 }, { "epoch": 0.6256181518471832, "grad_norm": 4.282355363017226, "learning_rate": 3.695008131669891e-06, "loss": 0.6439, "step": 1613 }, { "epoch": 0.6260060118297295, "grad_norm": 2.4797488011176143, "learning_rate": 3.6884733330122583e-06, "loss": 0.6415, "step": 1614 }, { "epoch": 0.6263938718122758, "grad_norm": 2.225913301107766, "learning_rate": 3.6819409392757366e-06, "loss": 0.6647, "step": 1615 }, { "epoch": 0.6267817317948221, "grad_norm": 3.4220179928869747, "learning_rate": 3.67541096243865e-06, "loss": 0.6991, "step": 1616 }, { "epoch": 0.6271695917773684, "grad_norm": 2.3681448193437507, "learning_rate": 3.6688834144748906e-06, "loss": 0.65, "step": 1617 }, { "epoch": 0.6275574517599146, "grad_norm": 2.770136918326645, "learning_rate": 3.662358307353897e-06, "loss": 0.6903, "step": 1618 }, { "epoch": 0.6279453117424609, "grad_norm": 2.142655401324396, "learning_rate": 3.655835653040631e-06, "loss": 0.6477, "step": 1619 }, { "epoch": 0.6283331717250072, "grad_norm": 2.9918927665227124, "learning_rate": 3.6493154634955607e-06, "loss": 0.6915, "step": 1620 }, { "epoch": 0.6287210317075536, "grad_norm": 2.5692395191019313, "learning_rate": 3.6427977506746293e-06, "loss": 0.6308, "step": 1621 }, { "epoch": 0.6291088916900999, "grad_norm": 1.745020639564015, "learning_rate": 3.6362825265292424e-06, "loss": 0.6137, "step": 1622 }, { "epoch": 0.6294967516726462, "grad_norm": 2.6882588510953873, "learning_rate": 3.629769803006239e-06, "loss": 0.6589, "step": 1623 }, { "epoch": 0.6298846116551925, "grad_norm": 2.2786953338149707, "learning_rate": 3.623259592047875e-06, "loss": 0.7496, "step": 1624 }, { "epoch": 0.6302724716377388, "grad_norm": 2.3972961686075975, "learning_rate": 3.6167519055917992e-06, "loss": 0.6327, "step": 1625 }, { "epoch": 0.6306603316202851, "grad_norm": 2.67211182451978, "learning_rate": 3.61024675557103e-06, "loss": 0.6823, "step": 1626 }, { "epoch": 0.6310481916028314, "grad_norm": 2.0357988765480943, "learning_rate": 3.6037441539139328e-06, "loss": 0.6322, "step": 1627 }, { "epoch": 0.6314360515853776, "grad_norm": 2.662924274339704, "learning_rate": 3.597244112544208e-06, "loss": 0.6775, "step": 1628 }, { "epoch": 0.631823911567924, "grad_norm": 2.434876311612606, "learning_rate": 3.5907466433808524e-06, "loss": 0.6738, "step": 1629 }, { "epoch": 0.6322117715504703, "grad_norm": 2.1690078592429116, "learning_rate": 3.584251758338151e-06, "loss": 0.6431, "step": 1630 }, { "epoch": 0.6325996315330166, "grad_norm": 1.8050777615531444, "learning_rate": 3.5777594693256474e-06, "loss": 0.6149, "step": 1631 }, { "epoch": 0.6329874915155629, "grad_norm": 2.7518597492826657, "learning_rate": 3.571269788248128e-06, "loss": 0.6622, "step": 1632 }, { "epoch": 0.6333753514981092, "grad_norm": 2.20217140028595, "learning_rate": 3.5647827270055945e-06, "loss": 0.6874, "step": 1633 }, { "epoch": 0.6337632114806555, "grad_norm": 2.352280362525975, "learning_rate": 3.5582982974932467e-06, "loss": 0.7039, "step": 1634 }, { "epoch": 0.6341510714632018, "grad_norm": 2.3681988431282033, "learning_rate": 3.551816511601458e-06, "loss": 0.7138, "step": 1635 }, { "epoch": 0.6345389314457481, "grad_norm": 2.1548451446209675, "learning_rate": 3.5453373812157517e-06, "loss": 0.6349, "step": 1636 }, { "epoch": 0.6349267914282943, "grad_norm": 2.9081962777024746, "learning_rate": 3.5388609182167867e-06, "loss": 0.7519, "step": 1637 }, { "epoch": 0.6353146514108406, "grad_norm": 2.5580398211214272, "learning_rate": 3.532387134480327e-06, "loss": 0.6622, "step": 1638 }, { "epoch": 0.635702511393387, "grad_norm": 2.7379819301362667, "learning_rate": 3.5259160418772242e-06, "loss": 0.7286, "step": 1639 }, { "epoch": 0.6360903713759333, "grad_norm": 2.329603712684695, "learning_rate": 3.5194476522733974e-06, "loss": 0.662, "step": 1640 }, { "epoch": 0.6364782313584796, "grad_norm": 2.3741039768444754, "learning_rate": 3.512981977529806e-06, "loss": 0.6924, "step": 1641 }, { "epoch": 0.6368660913410259, "grad_norm": 2.9107280300082596, "learning_rate": 3.5065190295024334e-06, "loss": 0.7089, "step": 1642 }, { "epoch": 0.6372539513235722, "grad_norm": 1.7010725827726512, "learning_rate": 3.500058820042263e-06, "loss": 0.6509, "step": 1643 }, { "epoch": 0.6376418113061185, "grad_norm": 3.0899727472295315, "learning_rate": 3.493601360995256e-06, "loss": 0.6574, "step": 1644 }, { "epoch": 0.6380296712886648, "grad_norm": 2.439968087819851, "learning_rate": 3.4871466642023264e-06, "loss": 0.6758, "step": 1645 }, { "epoch": 0.6384175312712111, "grad_norm": 3.0096941972849884, "learning_rate": 3.4806947414993342e-06, "loss": 0.6832, "step": 1646 }, { "epoch": 0.6388053912537573, "grad_norm": 2.8529350389135772, "learning_rate": 3.4742456047170413e-06, "loss": 0.728, "step": 1647 }, { "epoch": 0.6391932512363037, "grad_norm": 3.2782395001917073, "learning_rate": 3.4677992656811054e-06, "loss": 0.6318, "step": 1648 }, { "epoch": 0.63958111121885, "grad_norm": 2.1764950580061653, "learning_rate": 3.4613557362120542e-06, "loss": 0.657, "step": 1649 }, { "epoch": 0.6399689712013963, "grad_norm": 2.045468091705192, "learning_rate": 3.4549150281252635e-06, "loss": 0.6425, "step": 1650 }, { "epoch": 0.6403568311839426, "grad_norm": 2.3840831061622705, "learning_rate": 3.4484771532309348e-06, "loss": 0.6577, "step": 1651 }, { "epoch": 0.6407446911664889, "grad_norm": 1.7602550350620552, "learning_rate": 3.442042123334075e-06, "loss": 0.6258, "step": 1652 }, { "epoch": 0.6411325511490352, "grad_norm": 2.5589811991454487, "learning_rate": 3.435609950234473e-06, "loss": 0.7036, "step": 1653 }, { "epoch": 0.6415204111315815, "grad_norm": 3.367176920253775, "learning_rate": 3.429180645726683e-06, "loss": 0.6729, "step": 1654 }, { "epoch": 0.6419082711141278, "grad_norm": 2.493008628201023, "learning_rate": 3.422754221599995e-06, "loss": 0.7035, "step": 1655 }, { "epoch": 0.642296131096674, "grad_norm": 2.4590308538007526, "learning_rate": 3.4163306896384185e-06, "loss": 0.6442, "step": 1656 }, { "epoch": 0.6426839910792204, "grad_norm": 1.845381850643011, "learning_rate": 3.4099100616206597e-06, "loss": 0.6692, "step": 1657 }, { "epoch": 0.6430718510617667, "grad_norm": 2.9775126698352783, "learning_rate": 3.403492349320101e-06, "loss": 0.6811, "step": 1658 }, { "epoch": 0.643459711044313, "grad_norm": 1.8993614599312474, "learning_rate": 3.397077564504777e-06, "loss": 0.6582, "step": 1659 }, { "epoch": 0.6438475710268593, "grad_norm": 3.6454229024869704, "learning_rate": 3.390665718937355e-06, "loss": 0.7151, "step": 1660 }, { "epoch": 0.6442354310094056, "grad_norm": 1.937748013773768, "learning_rate": 3.3842568243751124e-06, "loss": 0.6577, "step": 1661 }, { "epoch": 0.6446232909919519, "grad_norm": 2.6707311454468496, "learning_rate": 3.3778508925699126e-06, "loss": 0.7295, "step": 1662 }, { "epoch": 0.6450111509744982, "grad_norm": 1.824043457500484, "learning_rate": 3.371447935268194e-06, "loss": 0.6393, "step": 1663 }, { "epoch": 0.6453990109570445, "grad_norm": 1.9281360928967337, "learning_rate": 3.3650479642109323e-06, "loss": 0.6202, "step": 1664 }, { "epoch": 0.6457868709395909, "grad_norm": 2.2533876021356254, "learning_rate": 3.3586509911336316e-06, "loss": 0.5834, "step": 1665 }, { "epoch": 0.6461747309221371, "grad_norm": 2.8565961028922042, "learning_rate": 3.3522570277662986e-06, "loss": 0.6709, "step": 1666 }, { "epoch": 0.6465625909046834, "grad_norm": 2.463142764745774, "learning_rate": 3.345866085833419e-06, "loss": 0.6539, "step": 1667 }, { "epoch": 0.6469504508872297, "grad_norm": 2.649781701945277, "learning_rate": 3.3394781770539406e-06, "loss": 0.7055, "step": 1668 }, { "epoch": 0.647338310869776, "grad_norm": 2.2958355672790707, "learning_rate": 3.3330933131412484e-06, "loss": 0.703, "step": 1669 }, { "epoch": 0.6477261708523223, "grad_norm": 3.571687219975531, "learning_rate": 3.3267115058031418e-06, "loss": 0.6782, "step": 1670 }, { "epoch": 0.6481140308348686, "grad_norm": 2.380470576076459, "learning_rate": 3.3203327667418207e-06, "loss": 0.6793, "step": 1671 }, { "epoch": 0.6485018908174149, "grad_norm": 2.3937509417832445, "learning_rate": 3.3139571076538547e-06, "loss": 0.6003, "step": 1672 }, { "epoch": 0.6488897507999613, "grad_norm": 2.4116636510838676, "learning_rate": 3.3075845402301652e-06, "loss": 0.6739, "step": 1673 }, { "epoch": 0.6492776107825076, "grad_norm": 2.743532711527093, "learning_rate": 3.3012150761560085e-06, "loss": 0.6612, "step": 1674 }, { "epoch": 0.6496654707650538, "grad_norm": 2.4875492649125324, "learning_rate": 3.2948487271109453e-06, "loss": 0.6641, "step": 1675 }, { "epoch": 0.6500533307476001, "grad_norm": 2.8901294025828492, "learning_rate": 3.2884855047688292e-06, "loss": 0.6427, "step": 1676 }, { "epoch": 0.6504411907301464, "grad_norm": 2.4493870524661863, "learning_rate": 3.282125420797776e-06, "loss": 0.6737, "step": 1677 }, { "epoch": 0.6508290507126927, "grad_norm": 1.9941545813127117, "learning_rate": 3.275768486860149e-06, "loss": 0.6433, "step": 1678 }, { "epoch": 0.651216910695239, "grad_norm": 3.08602163084545, "learning_rate": 3.269414714612534e-06, "loss": 0.6523, "step": 1679 }, { "epoch": 0.6516047706777853, "grad_norm": 2.4287951893063866, "learning_rate": 3.263064115705725e-06, "loss": 0.6758, "step": 1680 }, { "epoch": 0.6519926306603316, "grad_norm": 2.3170134306855514, "learning_rate": 3.25671670178469e-06, "loss": 0.6306, "step": 1681 }, { "epoch": 0.652380490642878, "grad_norm": 2.2759269914108953, "learning_rate": 3.250372484488558e-06, "loss": 0.6534, "step": 1682 }, { "epoch": 0.6527683506254243, "grad_norm": 2.6328915432816284, "learning_rate": 3.244031475450599e-06, "loss": 0.6822, "step": 1683 }, { "epoch": 0.6531562106079706, "grad_norm": 2.774104894022001, "learning_rate": 3.237693686298199e-06, "loss": 0.7164, "step": 1684 }, { "epoch": 0.6535440705905168, "grad_norm": 2.6561437855776604, "learning_rate": 3.2313591286528384e-06, "loss": 0.6863, "step": 1685 }, { "epoch": 0.6539319305730631, "grad_norm": 2.5845513792646178, "learning_rate": 3.225027814130074e-06, "loss": 0.6805, "step": 1686 }, { "epoch": 0.6543197905556094, "grad_norm": 2.99278127297457, "learning_rate": 3.218699754339513e-06, "loss": 0.6515, "step": 1687 }, { "epoch": 0.6547076505381557, "grad_norm": 2.22831539388589, "learning_rate": 3.2123749608847998e-06, "loss": 0.6774, "step": 1688 }, { "epoch": 0.655095510520702, "grad_norm": 2.0085275024664098, "learning_rate": 3.206053445363584e-06, "loss": 0.5801, "step": 1689 }, { "epoch": 0.6554833705032483, "grad_norm": 2.3051104703331564, "learning_rate": 3.199735219367507e-06, "loss": 0.6509, "step": 1690 }, { "epoch": 0.6558712304857947, "grad_norm": 1.6484342880462437, "learning_rate": 3.193420294482177e-06, "loss": 0.647, "step": 1691 }, { "epoch": 0.656259090468341, "grad_norm": 2.412253141074147, "learning_rate": 3.18710868228715e-06, "loss": 0.7158, "step": 1692 }, { "epoch": 0.6566469504508873, "grad_norm": 2.6583020209243555, "learning_rate": 3.180800394355908e-06, "loss": 0.6169, "step": 1693 }, { "epoch": 0.6570348104334335, "grad_norm": 2.7393645886135745, "learning_rate": 3.174495442255836e-06, "loss": 0.7217, "step": 1694 }, { "epoch": 0.6574226704159798, "grad_norm": 2.0111599274493295, "learning_rate": 3.1681938375482035e-06, "loss": 0.6858, "step": 1695 }, { "epoch": 0.6578105303985261, "grad_norm": 2.641425675969063, "learning_rate": 3.1618955917881383e-06, "loss": 0.7347, "step": 1696 }, { "epoch": 0.6581983903810724, "grad_norm": 1.7320954319994943, "learning_rate": 3.155600716524617e-06, "loss": 0.6196, "step": 1697 }, { "epoch": 0.6585862503636187, "grad_norm": 1.8397255254262115, "learning_rate": 3.149309223300428e-06, "loss": 0.6398, "step": 1698 }, { "epoch": 0.658974110346165, "grad_norm": 2.541416442766392, "learning_rate": 3.1430211236521615e-06, "loss": 0.6218, "step": 1699 }, { "epoch": 0.6593619703287114, "grad_norm": 2.0440623615762505, "learning_rate": 3.1367364291101845e-06, "loss": 0.619, "step": 1700 }, { "epoch": 0.6597498303112577, "grad_norm": 2.7569671931059174, "learning_rate": 3.130455151198618e-06, "loss": 0.7198, "step": 1701 }, { "epoch": 0.660137690293804, "grad_norm": 2.071926308518714, "learning_rate": 3.124177301435324e-06, "loss": 0.6178, "step": 1702 }, { "epoch": 0.6605255502763503, "grad_norm": 2.4746245884861686, "learning_rate": 3.11790289133187e-06, "loss": 0.7175, "step": 1703 }, { "epoch": 0.6609134102588965, "grad_norm": 1.9863983348223095, "learning_rate": 3.1116319323935207e-06, "loss": 0.692, "step": 1704 }, { "epoch": 0.6613012702414428, "grad_norm": 3.0407514954632378, "learning_rate": 3.1053644361192158e-06, "loss": 0.7523, "step": 1705 }, { "epoch": 0.6616891302239891, "grad_norm": 1.8337471477190863, "learning_rate": 3.09910041400154e-06, "loss": 0.6316, "step": 1706 }, { "epoch": 0.6620769902065354, "grad_norm": 2.2802636894986392, "learning_rate": 3.092839877526711e-06, "loss": 0.6228, "step": 1707 }, { "epoch": 0.6624648501890817, "grad_norm": 2.300653441392046, "learning_rate": 3.0865828381745515e-06, "loss": 0.6295, "step": 1708 }, { "epoch": 0.6628527101716281, "grad_norm": 3.1713643634642907, "learning_rate": 3.0803293074184754e-06, "loss": 0.6641, "step": 1709 }, { "epoch": 0.6632405701541744, "grad_norm": 1.9198142560109925, "learning_rate": 3.0740792967254606e-06, "loss": 0.6434, "step": 1710 }, { "epoch": 0.6636284301367207, "grad_norm": 2.3566365635836974, "learning_rate": 3.0678328175560306e-06, "loss": 0.6859, "step": 1711 }, { "epoch": 0.664016290119267, "grad_norm": 2.5484222683949236, "learning_rate": 3.061589881364234e-06, "loss": 0.6671, "step": 1712 }, { "epoch": 0.6644041501018132, "grad_norm": 2.6282283077238677, "learning_rate": 3.0553504995976204e-06, "loss": 0.6868, "step": 1713 }, { "epoch": 0.6647920100843595, "grad_norm": 2.719167645857244, "learning_rate": 3.0491146836972273e-06, "loss": 0.6612, "step": 1714 }, { "epoch": 0.6651798700669058, "grad_norm": 2.233548074188411, "learning_rate": 3.0428824450975484e-06, "loss": 0.6782, "step": 1715 }, { "epoch": 0.6655677300494521, "grad_norm": 2.547695574497603, "learning_rate": 3.0366537952265185e-06, "loss": 0.6774, "step": 1716 }, { "epoch": 0.6659555900319984, "grad_norm": 2.367635276486932, "learning_rate": 3.0304287455054925e-06, "loss": 0.7027, "step": 1717 }, { "epoch": 0.6663434500145448, "grad_norm": 3.5621853797159178, "learning_rate": 3.0242073073492238e-06, "loss": 0.7565, "step": 1718 }, { "epoch": 0.6667313099970911, "grad_norm": 2.414608247421731, "learning_rate": 3.017989492165844e-06, "loss": 0.6979, "step": 1719 }, { "epoch": 0.6671191699796374, "grad_norm": 2.5038425620457403, "learning_rate": 3.0117753113568406e-06, "loss": 0.6901, "step": 1720 }, { "epoch": 0.6675070299621837, "grad_norm": 2.8329111525774686, "learning_rate": 3.0055647763170336e-06, "loss": 0.641, "step": 1721 }, { "epoch": 0.6678948899447299, "grad_norm": 2.639515686639601, "learning_rate": 2.9993578984345673e-06, "loss": 0.7126, "step": 1722 }, { "epoch": 0.6682827499272762, "grad_norm": 2.2997411707197553, "learning_rate": 2.9931546890908695e-06, "loss": 0.6542, "step": 1723 }, { "epoch": 0.6686706099098225, "grad_norm": 2.1061839165726375, "learning_rate": 2.986955159660647e-06, "loss": 0.6171, "step": 1724 }, { "epoch": 0.6690584698923688, "grad_norm": 1.8683605455431285, "learning_rate": 2.980759321511857e-06, "loss": 0.6583, "step": 1725 }, { "epoch": 0.6694463298749151, "grad_norm": 2.286182796930516, "learning_rate": 2.974567186005687e-06, "loss": 0.7069, "step": 1726 }, { "epoch": 0.6698341898574615, "grad_norm": 2.58200347747173, "learning_rate": 2.968378764496537e-06, "loss": 0.6962, "step": 1727 }, { "epoch": 0.6702220498400078, "grad_norm": 1.8200615710476649, "learning_rate": 2.962194068331996e-06, "loss": 0.5961, "step": 1728 }, { "epoch": 0.6706099098225541, "grad_norm": 2.4455381658631787, "learning_rate": 2.9560131088528223e-06, "loss": 0.6262, "step": 1729 }, { "epoch": 0.6709977698051004, "grad_norm": 3.0679117028113363, "learning_rate": 2.9498358973929197e-06, "loss": 0.6912, "step": 1730 }, { "epoch": 0.6713856297876467, "grad_norm": 3.013955580881088, "learning_rate": 2.943662445279325e-06, "loss": 0.6524, "step": 1731 }, { "epoch": 0.6717734897701929, "grad_norm": 2.8446338746777045, "learning_rate": 2.937492763832176e-06, "loss": 0.586, "step": 1732 }, { "epoch": 0.6721613497527392, "grad_norm": 2.2674690731951057, "learning_rate": 2.9313268643646988e-06, "loss": 0.7027, "step": 1733 }, { "epoch": 0.6725492097352855, "grad_norm": 2.475983965308025, "learning_rate": 2.925164758183184e-06, "loss": 0.7094, "step": 1734 }, { "epoch": 0.6729370697178318, "grad_norm": 3.0952860344465964, "learning_rate": 2.9190064565869663e-06, "loss": 0.699, "step": 1735 }, { "epoch": 0.6733249297003782, "grad_norm": 2.484519700451649, "learning_rate": 2.912851970868405e-06, "loss": 0.6238, "step": 1736 }, { "epoch": 0.6737127896829245, "grad_norm": 2.0558472087277972, "learning_rate": 2.906701312312861e-06, "loss": 0.6628, "step": 1737 }, { "epoch": 0.6741006496654708, "grad_norm": 2.7931948586861854, "learning_rate": 2.9005544921986774e-06, "loss": 0.6764, "step": 1738 }, { "epoch": 0.6744885096480171, "grad_norm": 2.12928046165345, "learning_rate": 2.8944115217971613e-06, "loss": 0.5956, "step": 1739 }, { "epoch": 0.6748763696305634, "grad_norm": 2.3617512157765526, "learning_rate": 2.888272412372559e-06, "loss": 0.6696, "step": 1740 }, { "epoch": 0.6752642296131096, "grad_norm": 2.5627632570221452, "learning_rate": 2.8821371751820348e-06, "loss": 0.6645, "step": 1741 }, { "epoch": 0.6756520895956559, "grad_norm": 3.7964077657766797, "learning_rate": 2.876005821475657e-06, "loss": 0.6881, "step": 1742 }, { "epoch": 0.6760399495782022, "grad_norm": 2.76594009973686, "learning_rate": 2.8698783624963684e-06, "loss": 0.634, "step": 1743 }, { "epoch": 0.6764278095607485, "grad_norm": 2.0057090757904468, "learning_rate": 2.8637548094799728e-06, "loss": 0.603, "step": 1744 }, { "epoch": 0.6768156695432949, "grad_norm": 2.2451713702134026, "learning_rate": 2.8576351736551118e-06, "loss": 0.6998, "step": 1745 }, { "epoch": 0.6772035295258412, "grad_norm": 2.637519700077643, "learning_rate": 2.8515194662432423e-06, "loss": 0.6908, "step": 1746 }, { "epoch": 0.6775913895083875, "grad_norm": 2.3965957096183472, "learning_rate": 2.8454076984586176e-06, "loss": 0.6993, "step": 1747 }, { "epoch": 0.6779792494909338, "grad_norm": 2.8469831704094086, "learning_rate": 2.839299881508272e-06, "loss": 0.6717, "step": 1748 }, { "epoch": 0.6783671094734801, "grad_norm": 3.5142840599682885, "learning_rate": 2.833196026591989e-06, "loss": 0.7126, "step": 1749 }, { "epoch": 0.6787549694560264, "grad_norm": 1.8023298823742129, "learning_rate": 2.827096144902289e-06, "loss": 0.5785, "step": 1750 }, { "epoch": 0.6791428294385726, "grad_norm": 2.933861759214739, "learning_rate": 2.8210002476244093e-06, "loss": 0.6821, "step": 1751 }, { "epoch": 0.6795306894211189, "grad_norm": 2.3279363419607106, "learning_rate": 2.814908345936277e-06, "loss": 0.6309, "step": 1752 }, { "epoch": 0.6799185494036653, "grad_norm": 3.1646207689787094, "learning_rate": 2.8088204510084948e-06, "loss": 0.6632, "step": 1753 }, { "epoch": 0.6803064093862116, "grad_norm": 2.6845738695652344, "learning_rate": 2.8027365740043188e-06, "loss": 0.6526, "step": 1754 }, { "epoch": 0.6806942693687579, "grad_norm": 1.9957983841387614, "learning_rate": 2.796656726079636e-06, "loss": 0.6442, "step": 1755 }, { "epoch": 0.6810821293513042, "grad_norm": 2.2560692081248956, "learning_rate": 2.790580918382947e-06, "loss": 0.7002, "step": 1756 }, { "epoch": 0.6814699893338505, "grad_norm": 2.8937423590111773, "learning_rate": 2.7845091620553423e-06, "loss": 0.6336, "step": 1757 }, { "epoch": 0.6818578493163968, "grad_norm": 2.4286555301247694, "learning_rate": 2.778441468230483e-06, "loss": 0.5834, "step": 1758 }, { "epoch": 0.6822457092989431, "grad_norm": 2.9055824163216393, "learning_rate": 2.7723778480345844e-06, "loss": 0.6159, "step": 1759 }, { "epoch": 0.6826335692814893, "grad_norm": 2.9171827585343957, "learning_rate": 2.7663183125863887e-06, "loss": 0.6745, "step": 1760 }, { "epoch": 0.6830214292640356, "grad_norm": 1.8801443517580896, "learning_rate": 2.760262872997148e-06, "loss": 0.6973, "step": 1761 }, { "epoch": 0.683409289246582, "grad_norm": 3.064088226950648, "learning_rate": 2.7542115403706067e-06, "loss": 0.6737, "step": 1762 }, { "epoch": 0.6837971492291283, "grad_norm": 1.9259661963903063, "learning_rate": 2.748164325802975e-06, "loss": 0.6098, "step": 1763 }, { "epoch": 0.6841850092116746, "grad_norm": 1.9083260059936562, "learning_rate": 2.742121240382912e-06, "loss": 0.6022, "step": 1764 }, { "epoch": 0.6845728691942209, "grad_norm": 2.335669176320754, "learning_rate": 2.736082295191511e-06, "loss": 0.664, "step": 1765 }, { "epoch": 0.6849607291767672, "grad_norm": 2.0835194703503963, "learning_rate": 2.7300475013022666e-06, "loss": 0.5898, "step": 1766 }, { "epoch": 0.6853485891593135, "grad_norm": 2.3431081763399155, "learning_rate": 2.724016869781064e-06, "loss": 0.6128, "step": 1767 }, { "epoch": 0.6857364491418598, "grad_norm": 2.5987110165457503, "learning_rate": 2.7179904116861557e-06, "loss": 0.7016, "step": 1768 }, { "epoch": 0.6861243091244061, "grad_norm": 1.7473337085485012, "learning_rate": 2.711968138068141e-06, "loss": 0.6126, "step": 1769 }, { "epoch": 0.6865121691069523, "grad_norm": 2.6507578020097156, "learning_rate": 2.705950059969948e-06, "loss": 0.7392, "step": 1770 }, { "epoch": 0.6869000290894987, "grad_norm": 3.2601667818981985, "learning_rate": 2.6999361884268086e-06, "loss": 0.7038, "step": 1771 }, { "epoch": 0.687287889072045, "grad_norm": 2.3729371875670173, "learning_rate": 2.6939265344662426e-06, "loss": 0.6333, "step": 1772 }, { "epoch": 0.6876757490545913, "grad_norm": 2.2267161362823407, "learning_rate": 2.687921109108038e-06, "loss": 0.6855, "step": 1773 }, { "epoch": 0.6880636090371376, "grad_norm": 1.7475772184787397, "learning_rate": 2.681919923364228e-06, "loss": 0.5513, "step": 1774 }, { "epoch": 0.6884514690196839, "grad_norm": 2.4511887303339903, "learning_rate": 2.675922988239069e-06, "loss": 0.6555, "step": 1775 }, { "epoch": 0.6888393290022302, "grad_norm": 2.60784285297847, "learning_rate": 2.6699303147290257e-06, "loss": 0.6358, "step": 1776 }, { "epoch": 0.6892271889847765, "grad_norm": 1.8278872934879646, "learning_rate": 2.663941913822747e-06, "loss": 0.5967, "step": 1777 }, { "epoch": 0.6896150489673228, "grad_norm": 2.4955399459941883, "learning_rate": 2.65795779650105e-06, "loss": 0.5816, "step": 1778 }, { "epoch": 0.690002908949869, "grad_norm": 1.8676740941295586, "learning_rate": 2.6519779737368935e-06, "loss": 0.6512, "step": 1779 }, { "epoch": 0.6903907689324154, "grad_norm": 2.202958857860599, "learning_rate": 2.6460024564953624e-06, "loss": 0.6189, "step": 1780 }, { "epoch": 0.6907786289149617, "grad_norm": 2.64378018629096, "learning_rate": 2.640031255733646e-06, "loss": 0.6258, "step": 1781 }, { "epoch": 0.691166488897508, "grad_norm": 2.584015807863157, "learning_rate": 2.634064382401025e-06, "loss": 0.7224, "step": 1782 }, { "epoch": 0.6915543488800543, "grad_norm": 2.211624602396288, "learning_rate": 2.6281018474388354e-06, "loss": 0.6804, "step": 1783 }, { "epoch": 0.6919422088626006, "grad_norm": 2.1269506251624875, "learning_rate": 2.6221436617804635e-06, "loss": 0.7122, "step": 1784 }, { "epoch": 0.6923300688451469, "grad_norm": 2.0645291248065654, "learning_rate": 2.6161898363513192e-06, "loss": 0.6307, "step": 1785 }, { "epoch": 0.6927179288276932, "grad_norm": 1.7915648644497735, "learning_rate": 2.610240382068818e-06, "loss": 0.5993, "step": 1786 }, { "epoch": 0.6931057888102395, "grad_norm": 1.9337344796866494, "learning_rate": 2.6042953098423573e-06, "loss": 0.6941, "step": 1787 }, { "epoch": 0.6934936487927859, "grad_norm": 1.9901911314107152, "learning_rate": 2.598354630573303e-06, "loss": 0.6785, "step": 1788 }, { "epoch": 0.6938815087753321, "grad_norm": 2.6484147156221147, "learning_rate": 2.592418355154963e-06, "loss": 0.6855, "step": 1789 }, { "epoch": 0.6942693687578784, "grad_norm": 2.1050056197302913, "learning_rate": 2.586486494472572e-06, "loss": 0.5901, "step": 1790 }, { "epoch": 0.6946572287404247, "grad_norm": 2.5593283172306154, "learning_rate": 2.5805590594032666e-06, "loss": 0.6686, "step": 1791 }, { "epoch": 0.695045088722971, "grad_norm": 2.209984408200109, "learning_rate": 2.5746360608160703e-06, "loss": 0.5966, "step": 1792 }, { "epoch": 0.6954329487055173, "grad_norm": 3.439413121633658, "learning_rate": 2.5687175095718726e-06, "loss": 0.7104, "step": 1793 }, { "epoch": 0.6958208086880636, "grad_norm": 2.9226044935936444, "learning_rate": 2.562803416523405e-06, "loss": 0.681, "step": 1794 }, { "epoch": 0.6962086686706099, "grad_norm": 3.2822424349548966, "learning_rate": 2.5568937925152272e-06, "loss": 0.7039, "step": 1795 }, { "epoch": 0.6965965286531562, "grad_norm": 2.2522971524215016, "learning_rate": 2.550988648383701e-06, "loss": 0.6842, "step": 1796 }, { "epoch": 0.6969843886357026, "grad_norm": 2.6112068518521503, "learning_rate": 2.545087994956975e-06, "loss": 0.6647, "step": 1797 }, { "epoch": 0.6973722486182488, "grad_norm": 1.8165550580572987, "learning_rate": 2.5391918430549635e-06, "loss": 0.6149, "step": 1798 }, { "epoch": 0.6977601086007951, "grad_norm": 2.5695956787588194, "learning_rate": 2.5333002034893283e-06, "loss": 0.682, "step": 1799 }, { "epoch": 0.6981479685833414, "grad_norm": 2.8081694718253383, "learning_rate": 2.527413087063454e-06, "loss": 0.661, "step": 1800 }, { "epoch": 0.6985358285658877, "grad_norm": 2.454277838525377, "learning_rate": 2.521530504572432e-06, "loss": 0.6077, "step": 1801 }, { "epoch": 0.698923688548434, "grad_norm": 2.715233058078684, "learning_rate": 2.5156524668030402e-06, "loss": 0.72, "step": 1802 }, { "epoch": 0.6993115485309803, "grad_norm": 3.264045171913405, "learning_rate": 2.5097789845337223e-06, "loss": 0.6733, "step": 1803 }, { "epoch": 0.6996994085135266, "grad_norm": 1.786838244157091, "learning_rate": 2.50391006853457e-06, "loss": 0.6258, "step": 1804 }, { "epoch": 0.700087268496073, "grad_norm": 2.0536427897050844, "learning_rate": 2.498045729567302e-06, "loss": 0.6525, "step": 1805 }, { "epoch": 0.7004751284786193, "grad_norm": 2.5243314666558683, "learning_rate": 2.492185978385241e-06, "loss": 0.7258, "step": 1806 }, { "epoch": 0.7008629884611656, "grad_norm": 2.5320722972459877, "learning_rate": 2.4863308257333e-06, "loss": 0.6923, "step": 1807 }, { "epoch": 0.7012508484437118, "grad_norm": 2.2648086816144506, "learning_rate": 2.480480282347961e-06, "loss": 0.7052, "step": 1808 }, { "epoch": 0.7016387084262581, "grad_norm": 2.9757970440160415, "learning_rate": 2.4746343589572526e-06, "loss": 0.6757, "step": 1809 }, { "epoch": 0.7020265684088044, "grad_norm": 2.213761752430787, "learning_rate": 2.46879306628073e-06, "loss": 0.6582, "step": 1810 }, { "epoch": 0.7024144283913507, "grad_norm": 2.904135453403461, "learning_rate": 2.4629564150294593e-06, "loss": 0.7032, "step": 1811 }, { "epoch": 0.702802288373897, "grad_norm": 2.6108870718888832, "learning_rate": 2.4571244159059952e-06, "loss": 0.6549, "step": 1812 }, { "epoch": 0.7031901483564433, "grad_norm": 2.3228370771780202, "learning_rate": 2.4512970796043616e-06, "loss": 0.718, "step": 1813 }, { "epoch": 0.7035780083389896, "grad_norm": 2.518248782399687, "learning_rate": 2.445474416810033e-06, "loss": 0.6623, "step": 1814 }, { "epoch": 0.703965868321536, "grad_norm": 2.3916032306431667, "learning_rate": 2.439656438199911e-06, "loss": 0.7185, "step": 1815 }, { "epoch": 0.7043537283040823, "grad_norm": 2.7380002492099837, "learning_rate": 2.433843154442315e-06, "loss": 0.6473, "step": 1816 }, { "epoch": 0.7047415882866285, "grad_norm": 2.551055167758434, "learning_rate": 2.428034576196949e-06, "loss": 0.6271, "step": 1817 }, { "epoch": 0.7051294482691748, "grad_norm": 1.8215834031216438, "learning_rate": 2.422230714114891e-06, "loss": 0.6339, "step": 1818 }, { "epoch": 0.7055173082517211, "grad_norm": 2.153948897765474, "learning_rate": 2.41643157883857e-06, "loss": 0.6796, "step": 1819 }, { "epoch": 0.7059051682342674, "grad_norm": 2.6261189552367266, "learning_rate": 2.4106371810017486e-06, "loss": 0.6013, "step": 1820 }, { "epoch": 0.7062930282168137, "grad_norm": 2.131647066929162, "learning_rate": 2.4048475312295027e-06, "loss": 0.6196, "step": 1821 }, { "epoch": 0.70668088819936, "grad_norm": 3.3543408113635484, "learning_rate": 2.399062640138201e-06, "loss": 0.6725, "step": 1822 }, { "epoch": 0.7070687481819063, "grad_norm": 2.2043502816640443, "learning_rate": 2.3932825183354864e-06, "loss": 0.6795, "step": 1823 }, { "epoch": 0.7074566081644527, "grad_norm": 2.679521966665471, "learning_rate": 2.387507176420256e-06, "loss": 0.6711, "step": 1824 }, { "epoch": 0.707844468146999, "grad_norm": 2.5659089091353238, "learning_rate": 2.381736624982644e-06, "loss": 0.6531, "step": 1825 }, { "epoch": 0.7082323281295453, "grad_norm": 2.1683257137372434, "learning_rate": 2.375970874603998e-06, "loss": 0.5949, "step": 1826 }, { "epoch": 0.7086201881120915, "grad_norm": 1.8396210389731897, "learning_rate": 2.3702099358568635e-06, "loss": 0.6223, "step": 1827 }, { "epoch": 0.7090080480946378, "grad_norm": 2.2126193242670515, "learning_rate": 2.3644538193049626e-06, "loss": 0.6567, "step": 1828 }, { "epoch": 0.7093959080771841, "grad_norm": 2.0159875789586543, "learning_rate": 2.3587025355031744e-06, "loss": 0.6114, "step": 1829 }, { "epoch": 0.7097837680597304, "grad_norm": 2.6951064842499166, "learning_rate": 2.3529560949975184e-06, "loss": 0.6237, "step": 1830 }, { "epoch": 0.7101716280422767, "grad_norm": 2.1160403826356577, "learning_rate": 2.3472145083251296e-06, "loss": 0.6139, "step": 1831 }, { "epoch": 0.710559488024823, "grad_norm": 2.123401449905035, "learning_rate": 2.3414777860142446e-06, "loss": 0.6959, "step": 1832 }, { "epoch": 0.7109473480073694, "grad_norm": 2.8631056168303535, "learning_rate": 2.3357459385841824e-06, "loss": 0.6989, "step": 1833 }, { "epoch": 0.7113352079899157, "grad_norm": 2.398495735930579, "learning_rate": 2.3300189765453198e-06, "loss": 0.6247, "step": 1834 }, { "epoch": 0.711723067972462, "grad_norm": 2.5262759632050162, "learning_rate": 2.3242969103990765e-06, "loss": 0.7041, "step": 1835 }, { "epoch": 0.7121109279550082, "grad_norm": 3.1310613068982653, "learning_rate": 2.3185797506378943e-06, "loss": 0.6637, "step": 1836 }, { "epoch": 0.7124987879375545, "grad_norm": 2.0299530234246874, "learning_rate": 2.312867507745219e-06, "loss": 0.5848, "step": 1837 }, { "epoch": 0.7128866479201008, "grad_norm": 2.458856814284235, "learning_rate": 2.3071601921954797e-06, "loss": 0.6623, "step": 1838 }, { "epoch": 0.7132745079026471, "grad_norm": 2.459903413278072, "learning_rate": 2.3014578144540706e-06, "loss": 0.6784, "step": 1839 }, { "epoch": 0.7136623678851934, "grad_norm": 2.078645391550461, "learning_rate": 2.295760384977331e-06, "loss": 0.6553, "step": 1840 }, { "epoch": 0.7140502278677398, "grad_norm": 3.097122441893936, "learning_rate": 2.2900679142125275e-06, "loss": 0.7511, "step": 1841 }, { "epoch": 0.7144380878502861, "grad_norm": 3.2300068781333917, "learning_rate": 2.2843804125978356e-06, "loss": 0.7119, "step": 1842 }, { "epoch": 0.7148259478328324, "grad_norm": 4.007373434229678, "learning_rate": 2.278697890562316e-06, "loss": 0.7149, "step": 1843 }, { "epoch": 0.7152138078153787, "grad_norm": 2.163237592114809, "learning_rate": 2.273020358525899e-06, "loss": 0.7216, "step": 1844 }, { "epoch": 0.715601667797925, "grad_norm": 2.3200439572585196, "learning_rate": 2.267347826899366e-06, "loss": 0.6766, "step": 1845 }, { "epoch": 0.7159895277804712, "grad_norm": 2.475273372987423, "learning_rate": 2.2616803060843283e-06, "loss": 0.6799, "step": 1846 }, { "epoch": 0.7163773877630175, "grad_norm": 4.34295191638, "learning_rate": 2.2560178064732103e-06, "loss": 0.6772, "step": 1847 }, { "epoch": 0.7167652477455638, "grad_norm": 2.378739909088117, "learning_rate": 2.250360338449226e-06, "loss": 0.6682, "step": 1848 }, { "epoch": 0.7171531077281101, "grad_norm": 2.8163988023154722, "learning_rate": 2.244707912386366e-06, "loss": 0.7519, "step": 1849 }, { "epoch": 0.7175409677106565, "grad_norm": 2.0408090328518482, "learning_rate": 2.2390605386493758e-06, "loss": 0.6452, "step": 1850 }, { "epoch": 0.7179288276932028, "grad_norm": 2.406791983348518, "learning_rate": 2.233418227593736e-06, "loss": 0.7297, "step": 1851 }, { "epoch": 0.7183166876757491, "grad_norm": 2.3561106636662807, "learning_rate": 2.2277809895656415e-06, "loss": 0.6325, "step": 1852 }, { "epoch": 0.7187045476582954, "grad_norm": 2.4946167514845516, "learning_rate": 2.2221488349019903e-06, "loss": 0.7201, "step": 1853 }, { "epoch": 0.7190924076408417, "grad_norm": 2.8010370815317556, "learning_rate": 2.216521773930351e-06, "loss": 0.6024, "step": 1854 }, { "epoch": 0.7194802676233879, "grad_norm": 2.323291354884368, "learning_rate": 2.2108998169689583e-06, "loss": 0.6714, "step": 1855 }, { "epoch": 0.7198681276059342, "grad_norm": 2.1885420703637175, "learning_rate": 2.2052829743266864e-06, "loss": 0.654, "step": 1856 }, { "epoch": 0.7202559875884805, "grad_norm": 2.100146814191206, "learning_rate": 2.1996712563030305e-06, "loss": 0.6515, "step": 1857 }, { "epoch": 0.7206438475710268, "grad_norm": 2.378345850525839, "learning_rate": 2.1940646731880887e-06, "loss": 0.6974, "step": 1858 }, { "epoch": 0.7210317075535732, "grad_norm": 2.497166127383455, "learning_rate": 2.1884632352625468e-06, "loss": 0.7111, "step": 1859 }, { "epoch": 0.7214195675361195, "grad_norm": 2.2393368514897025, "learning_rate": 2.1828669527976525e-06, "loss": 0.6914, "step": 1860 }, { "epoch": 0.7218074275186658, "grad_norm": 2.7110893751817318, "learning_rate": 2.1772758360552006e-06, "loss": 0.7314, "step": 1861 }, { "epoch": 0.7221952875012121, "grad_norm": 2.235319096341446, "learning_rate": 2.1716898952875132e-06, "loss": 0.6979, "step": 1862 }, { "epoch": 0.7225831474837584, "grad_norm": 2.4199157542131693, "learning_rate": 2.166109140737422e-06, "loss": 0.6075, "step": 1863 }, { "epoch": 0.7229710074663047, "grad_norm": 2.4655671699743795, "learning_rate": 2.1605335826382494e-06, "loss": 0.6621, "step": 1864 }, { "epoch": 0.7233588674488509, "grad_norm": 1.8834715384024856, "learning_rate": 2.1549632312137884e-06, "loss": 0.5931, "step": 1865 }, { "epoch": 0.7237467274313972, "grad_norm": 2.5580095398526086, "learning_rate": 2.149398096678283e-06, "loss": 0.6564, "step": 1866 }, { "epoch": 0.7241345874139435, "grad_norm": 2.4341997315459323, "learning_rate": 2.1438381892364163e-06, "loss": 0.6841, "step": 1867 }, { "epoch": 0.7245224473964899, "grad_norm": 1.7783151154777468, "learning_rate": 2.138283519083281e-06, "loss": 0.6276, "step": 1868 }, { "epoch": 0.7249103073790362, "grad_norm": 3.2490285920714133, "learning_rate": 2.1327340964043697e-06, "loss": 0.6698, "step": 1869 }, { "epoch": 0.7252981673615825, "grad_norm": 2.54055738222133, "learning_rate": 2.12718993137555e-06, "loss": 0.6646, "step": 1870 }, { "epoch": 0.7256860273441288, "grad_norm": 2.6383956435165405, "learning_rate": 2.1216510341630513e-06, "loss": 0.6527, "step": 1871 }, { "epoch": 0.7260738873266751, "grad_norm": 1.9718450895186461, "learning_rate": 2.116117414923442e-06, "loss": 0.659, "step": 1872 }, { "epoch": 0.7264617473092214, "grad_norm": 2.6835344711699, "learning_rate": 2.1105890838036133e-06, "loss": 0.7137, "step": 1873 }, { "epoch": 0.7268496072917676, "grad_norm": 2.2461868850847018, "learning_rate": 2.105066050940758e-06, "loss": 0.6092, "step": 1874 }, { "epoch": 0.7272374672743139, "grad_norm": 2.105220574213315, "learning_rate": 2.0995483264623535e-06, "loss": 0.6271, "step": 1875 }, { "epoch": 0.7276253272568602, "grad_norm": 2.6893701022932928, "learning_rate": 2.0940359204861487e-06, "loss": 0.6585, "step": 1876 }, { "epoch": 0.7280131872394066, "grad_norm": 2.628942528761367, "learning_rate": 2.088528843120134e-06, "loss": 0.661, "step": 1877 }, { "epoch": 0.7284010472219529, "grad_norm": 2.6164184736739555, "learning_rate": 2.08302710446253e-06, "loss": 0.6521, "step": 1878 }, { "epoch": 0.7287889072044992, "grad_norm": 2.699707477483017, "learning_rate": 2.0775307146017697e-06, "loss": 0.6693, "step": 1879 }, { "epoch": 0.7291767671870455, "grad_norm": 2.3116742392479503, "learning_rate": 2.0720396836164764e-06, "loss": 0.6146, "step": 1880 }, { "epoch": 0.7295646271695918, "grad_norm": 2.3876387010616975, "learning_rate": 2.066554021575447e-06, "loss": 0.7055, "step": 1881 }, { "epoch": 0.7299524871521381, "grad_norm": 2.560171901781739, "learning_rate": 2.061073738537635e-06, "loss": 0.6895, "step": 1882 }, { "epoch": 0.7303403471346844, "grad_norm": 2.8420056223496024, "learning_rate": 2.055598844552129e-06, "loss": 0.76, "step": 1883 }, { "epoch": 0.7307282071172306, "grad_norm": 2.2481345839271776, "learning_rate": 2.0501293496581367e-06, "loss": 0.6983, "step": 1884 }, { "epoch": 0.731116067099777, "grad_norm": 2.3198948962249584, "learning_rate": 2.044665263884964e-06, "loss": 0.671, "step": 1885 }, { "epoch": 0.7315039270823233, "grad_norm": 2.6580683859081073, "learning_rate": 2.0392065972520008e-06, "loss": 0.6651, "step": 1886 }, { "epoch": 0.7318917870648696, "grad_norm": 2.3144009245461516, "learning_rate": 2.0337533597686987e-06, "loss": 0.6535, "step": 1887 }, { "epoch": 0.7322796470474159, "grad_norm": 2.166121587091818, "learning_rate": 2.028305561434553e-06, "loss": 0.6689, "step": 1888 }, { "epoch": 0.7326675070299622, "grad_norm": 2.9479899472118105, "learning_rate": 2.0228632122390866e-06, "loss": 0.6827, "step": 1889 }, { "epoch": 0.7330553670125085, "grad_norm": 1.9491686484833641, "learning_rate": 2.0174263221618307e-06, "loss": 0.6664, "step": 1890 }, { "epoch": 0.7334432269950548, "grad_norm": 3.0440118791154576, "learning_rate": 2.0119949011723043e-06, "loss": 0.7343, "step": 1891 }, { "epoch": 0.7338310869776011, "grad_norm": 2.474052404208215, "learning_rate": 2.006568959229999e-06, "loss": 0.6232, "step": 1892 }, { "epoch": 0.7342189469601473, "grad_norm": 2.1567398483117346, "learning_rate": 2.001148506284361e-06, "loss": 0.6536, "step": 1893 }, { "epoch": 0.7346068069426936, "grad_norm": 2.568096481918768, "learning_rate": 1.9957335522747707e-06, "loss": 0.6472, "step": 1894 }, { "epoch": 0.73499466692524, "grad_norm": 2.103563789651271, "learning_rate": 1.9903241071305237e-06, "loss": 0.5899, "step": 1895 }, { "epoch": 0.7353825269077863, "grad_norm": 2.161275953967434, "learning_rate": 1.9849201807708146e-06, "loss": 0.6221, "step": 1896 }, { "epoch": 0.7357703868903326, "grad_norm": 2.814770482042378, "learning_rate": 1.9795217831047193e-06, "loss": 0.6145, "step": 1897 }, { "epoch": 0.7361582468728789, "grad_norm": 2.885879034109543, "learning_rate": 1.9741289240311757e-06, "loss": 0.7077, "step": 1898 }, { "epoch": 0.7365461068554252, "grad_norm": 2.2565493152267635, "learning_rate": 1.968741613438964e-06, "loss": 0.6088, "step": 1899 }, { "epoch": 0.7369339668379715, "grad_norm": 2.2019952678591634, "learning_rate": 1.9633598612066914e-06, "loss": 0.6228, "step": 1900 }, { "epoch": 0.7373218268205178, "grad_norm": 2.863544761262813, "learning_rate": 1.957983677202775e-06, "loss": 0.6653, "step": 1901 }, { "epoch": 0.7377096868030641, "grad_norm": 2.6097030620888386, "learning_rate": 1.9526130712854186e-06, "loss": 0.6549, "step": 1902 }, { "epoch": 0.7380975467856103, "grad_norm": 2.416544839498245, "learning_rate": 1.947248053302598e-06, "loss": 0.6553, "step": 1903 }, { "epoch": 0.7384854067681567, "grad_norm": 2.3582188089922367, "learning_rate": 1.9418886330920443e-06, "loss": 0.6607, "step": 1904 }, { "epoch": 0.738873266750703, "grad_norm": 3.3959880084515626, "learning_rate": 1.936534820481222e-06, "loss": 0.7463, "step": 1905 }, { "epoch": 0.7392611267332493, "grad_norm": 1.989519602259875, "learning_rate": 1.931186625287313e-06, "loss": 0.6433, "step": 1906 }, { "epoch": 0.7396489867157956, "grad_norm": 3.3485969200854315, "learning_rate": 1.9258440573172006e-06, "loss": 0.6734, "step": 1907 }, { "epoch": 0.7400368466983419, "grad_norm": 2.5511672236562055, "learning_rate": 1.920507126367448e-06, "loss": 0.7565, "step": 1908 }, { "epoch": 0.7404247066808882, "grad_norm": 2.911070110003065, "learning_rate": 1.9151758422242805e-06, "loss": 0.7098, "step": 1909 }, { "epoch": 0.7408125666634345, "grad_norm": 2.3929504203632694, "learning_rate": 1.909850214663575e-06, "loss": 0.6595, "step": 1910 }, { "epoch": 0.7412004266459808, "grad_norm": 2.170256582390011, "learning_rate": 1.9045302534508298e-06, "loss": 0.5773, "step": 1911 }, { "epoch": 0.741588286628527, "grad_norm": 2.8549001179861464, "learning_rate": 1.8992159683411549e-06, "loss": 0.6378, "step": 1912 }, { "epoch": 0.7419761466110734, "grad_norm": 1.8182029470193923, "learning_rate": 1.893907369079252e-06, "loss": 0.6672, "step": 1913 }, { "epoch": 0.7423640065936197, "grad_norm": 1.976741501598243, "learning_rate": 1.8886044653993968e-06, "loss": 0.65, "step": 1914 }, { "epoch": 0.742751866576166, "grad_norm": 1.9484003714578086, "learning_rate": 1.8833072670254227e-06, "loss": 0.6207, "step": 1915 }, { "epoch": 0.7431397265587123, "grad_norm": 2.3116791095314877, "learning_rate": 1.8780157836706985e-06, "loss": 0.6764, "step": 1916 }, { "epoch": 0.7435275865412586, "grad_norm": 2.0244309346074143, "learning_rate": 1.8727300250381153e-06, "loss": 0.6048, "step": 1917 }, { "epoch": 0.7439154465238049, "grad_norm": 2.0302964124178233, "learning_rate": 1.8674500008200675e-06, "loss": 0.6599, "step": 1918 }, { "epoch": 0.7443033065063512, "grad_norm": 2.0428952116614787, "learning_rate": 1.8621757206984326e-06, "loss": 0.6381, "step": 1919 }, { "epoch": 0.7446911664888975, "grad_norm": 1.7168125054521992, "learning_rate": 1.8569071943445565e-06, "loss": 0.6213, "step": 1920 }, { "epoch": 0.7450790264714439, "grad_norm": 2.279707127027056, "learning_rate": 1.851644431419234e-06, "loss": 0.6498, "step": 1921 }, { "epoch": 0.7454668864539901, "grad_norm": 2.0315566896252846, "learning_rate": 1.8463874415726918e-06, "loss": 0.7108, "step": 1922 }, { "epoch": 0.7458547464365364, "grad_norm": 2.917351506519411, "learning_rate": 1.841136234444571e-06, "loss": 0.6497, "step": 1923 }, { "epoch": 0.7462426064190827, "grad_norm": 2.0424408846182733, "learning_rate": 1.8358908196639086e-06, "loss": 0.6298, "step": 1924 }, { "epoch": 0.746630466401629, "grad_norm": 2.553456180908527, "learning_rate": 1.8306512068491195e-06, "loss": 0.6401, "step": 1925 }, { "epoch": 0.7470183263841753, "grad_norm": 2.0407947554144763, "learning_rate": 1.8254174056079798e-06, "loss": 0.6304, "step": 1926 }, { "epoch": 0.7474061863667216, "grad_norm": 2.1957766012250723, "learning_rate": 1.820189425537613e-06, "loss": 0.5868, "step": 1927 }, { "epoch": 0.7477940463492679, "grad_norm": 2.319150680597082, "learning_rate": 1.8149672762244625e-06, "loss": 0.6289, "step": 1928 }, { "epoch": 0.7481819063318143, "grad_norm": 2.0553852386723968, "learning_rate": 1.8097509672442836e-06, "loss": 0.6148, "step": 1929 }, { "epoch": 0.7485697663143606, "grad_norm": 2.79635587649315, "learning_rate": 1.8045405081621215e-06, "loss": 0.6546, "step": 1930 }, { "epoch": 0.7489576262969068, "grad_norm": 2.2408575294403326, "learning_rate": 1.7993359085322932e-06, "loss": 0.6013, "step": 1931 }, { "epoch": 0.7493454862794531, "grad_norm": 2.4264141911720882, "learning_rate": 1.7941371778983735e-06, "loss": 0.6494, "step": 1932 }, { "epoch": 0.7497333462619994, "grad_norm": 3.2245375117130437, "learning_rate": 1.7889443257931738e-06, "loss": 0.738, "step": 1933 }, { "epoch": 0.7501212062445457, "grad_norm": 2.0017597050565747, "learning_rate": 1.7837573617387266e-06, "loss": 0.6633, "step": 1934 }, { "epoch": 0.750509066227092, "grad_norm": 2.262108095463556, "learning_rate": 1.7785762952462665e-06, "loss": 0.6242, "step": 1935 }, { "epoch": 0.7508969262096383, "grad_norm": 3.343327443492633, "learning_rate": 1.7734011358162183e-06, "loss": 0.706, "step": 1936 }, { "epoch": 0.7512847861921846, "grad_norm": 2.2230449874609715, "learning_rate": 1.7682318929381698e-06, "loss": 0.6389, "step": 1937 }, { "epoch": 0.751672646174731, "grad_norm": 2.4438829662787107, "learning_rate": 1.7630685760908623e-06, "loss": 0.6232, "step": 1938 }, { "epoch": 0.7520605061572773, "grad_norm": 2.593756299074596, "learning_rate": 1.7579111947421695e-06, "loss": 0.6332, "step": 1939 }, { "epoch": 0.7524483661398236, "grad_norm": 2.3121028175000546, "learning_rate": 1.7527597583490825e-06, "loss": 0.6319, "step": 1940 }, { "epoch": 0.7528362261223698, "grad_norm": 2.45038428531278, "learning_rate": 1.7476142763576903e-06, "loss": 0.6445, "step": 1941 }, { "epoch": 0.7532240861049161, "grad_norm": 2.566787152508212, "learning_rate": 1.7424747582031638e-06, "loss": 0.6835, "step": 1942 }, { "epoch": 0.7536119460874624, "grad_norm": 2.687658678555817, "learning_rate": 1.7373412133097373e-06, "loss": 0.6591, "step": 1943 }, { "epoch": 0.7539998060700087, "grad_norm": 2.8167956925382067, "learning_rate": 1.732213651090695e-06, "loss": 0.68, "step": 1944 }, { "epoch": 0.754387666052555, "grad_norm": 2.1621087867394055, "learning_rate": 1.7270920809483476e-06, "loss": 0.6957, "step": 1945 }, { "epoch": 0.7547755260351013, "grad_norm": 2.3677765106347453, "learning_rate": 1.7219765122740202e-06, "loss": 0.5957, "step": 1946 }, { "epoch": 0.7551633860176477, "grad_norm": 2.4976279271133324, "learning_rate": 1.7168669544480305e-06, "loss": 0.6277, "step": 1947 }, { "epoch": 0.755551246000194, "grad_norm": 1.7142072959555192, "learning_rate": 1.7117634168396774e-06, "loss": 0.5938, "step": 1948 }, { "epoch": 0.7559391059827403, "grad_norm": 1.8110831848837754, "learning_rate": 1.7066659088072185e-06, "loss": 0.5787, "step": 1949 }, { "epoch": 0.7563269659652865, "grad_norm": 2.748566109874698, "learning_rate": 1.7015744396978557e-06, "loss": 0.6606, "step": 1950 }, { "epoch": 0.7567148259478328, "grad_norm": 3.206023170141328, "learning_rate": 1.696489018847718e-06, "loss": 0.6904, "step": 1951 }, { "epoch": 0.7571026859303791, "grad_norm": 2.7123213571064544, "learning_rate": 1.6914096555818432e-06, "loss": 0.6296, "step": 1952 }, { "epoch": 0.7574905459129254, "grad_norm": 2.0570912892704483, "learning_rate": 1.6863363592141618e-06, "loss": 0.6036, "step": 1953 }, { "epoch": 0.7578784058954717, "grad_norm": 2.4708090764542, "learning_rate": 1.6812691390474788e-06, "loss": 0.6566, "step": 1954 }, { "epoch": 0.758266265878018, "grad_norm": 2.771865999675987, "learning_rate": 1.676208004373458e-06, "loss": 0.6646, "step": 1955 }, { "epoch": 0.7586541258605644, "grad_norm": 2.362036574363041, "learning_rate": 1.6711529644726048e-06, "loss": 0.6761, "step": 1956 }, { "epoch": 0.7590419858431107, "grad_norm": 2.373656591576874, "learning_rate": 1.6661040286142478e-06, "loss": 0.6213, "step": 1957 }, { "epoch": 0.759429845825657, "grad_norm": 2.3651025442453, "learning_rate": 1.6610612060565235e-06, "loss": 0.5953, "step": 1958 }, { "epoch": 0.7598177058082033, "grad_norm": 2.0662828084287925, "learning_rate": 1.6560245060463575e-06, "loss": 0.6556, "step": 1959 }, { "epoch": 0.7602055657907495, "grad_norm": 2.508986690898162, "learning_rate": 1.6509939378194483e-06, "loss": 0.6096, "step": 1960 }, { "epoch": 0.7605934257732958, "grad_norm": 2.8043536747696733, "learning_rate": 1.645969510600255e-06, "loss": 0.6871, "step": 1961 }, { "epoch": 0.7609812857558421, "grad_norm": 2.94647337033163, "learning_rate": 1.64095123360197e-06, "loss": 0.6659, "step": 1962 }, { "epoch": 0.7613691457383884, "grad_norm": 2.597925744096279, "learning_rate": 1.6359391160265127e-06, "loss": 0.6876, "step": 1963 }, { "epoch": 0.7617570057209347, "grad_norm": 2.4853776182399887, "learning_rate": 1.6309331670645046e-06, "loss": 0.6659, "step": 1964 }, { "epoch": 0.7621448657034811, "grad_norm": 2.2461281160171733, "learning_rate": 1.6259333958952584e-06, "loss": 0.6779, "step": 1965 }, { "epoch": 0.7625327256860274, "grad_norm": 2.6171969922631657, "learning_rate": 1.6209398116867575e-06, "loss": 0.6257, "step": 1966 }, { "epoch": 0.7629205856685737, "grad_norm": 1.8541282938968289, "learning_rate": 1.6159524235956414e-06, "loss": 0.6251, "step": 1967 }, { "epoch": 0.76330844565112, "grad_norm": 2.4215393129792853, "learning_rate": 1.6109712407671867e-06, "loss": 0.6473, "step": 1968 }, { "epoch": 0.7636963056336662, "grad_norm": 2.642685595662524, "learning_rate": 1.6059962723352912e-06, "loss": 0.673, "step": 1969 }, { "epoch": 0.7640841656162125, "grad_norm": 2.450476051284388, "learning_rate": 1.6010275274224607e-06, "loss": 0.6404, "step": 1970 }, { "epoch": 0.7644720255987588, "grad_norm": 2.3232429230620935, "learning_rate": 1.5960650151397855e-06, "loss": 0.5927, "step": 1971 }, { "epoch": 0.7648598855813051, "grad_norm": 3.24196074846951, "learning_rate": 1.5911087445869289e-06, "loss": 0.6724, "step": 1972 }, { "epoch": 0.7652477455638514, "grad_norm": 2.0457015730087056, "learning_rate": 1.5861587248521083e-06, "loss": 0.672, "step": 1973 }, { "epoch": 0.7656356055463978, "grad_norm": 2.125997223801364, "learning_rate": 1.5812149650120784e-06, "loss": 0.6229, "step": 1974 }, { "epoch": 0.7660234655289441, "grad_norm": 2.104586496359949, "learning_rate": 1.5762774741321173e-06, "loss": 0.6725, "step": 1975 }, { "epoch": 0.7664113255114904, "grad_norm": 2.6134780416162355, "learning_rate": 1.5713462612660063e-06, "loss": 0.6976, "step": 1976 }, { "epoch": 0.7667991854940367, "grad_norm": 3.208969526199088, "learning_rate": 1.5664213354560133e-06, "loss": 0.6531, "step": 1977 }, { "epoch": 0.767187045476583, "grad_norm": 2.596801247090851, "learning_rate": 1.561502705732883e-06, "loss": 0.6518, "step": 1978 }, { "epoch": 0.7675749054591292, "grad_norm": 2.188923332498176, "learning_rate": 1.5565903811158095e-06, "loss": 0.6838, "step": 1979 }, { "epoch": 0.7679627654416755, "grad_norm": 2.7396313119856672, "learning_rate": 1.5516843706124285e-06, "loss": 0.6435, "step": 1980 }, { "epoch": 0.7683506254242218, "grad_norm": 2.1576604549340557, "learning_rate": 1.546784683218796e-06, "loss": 0.6664, "step": 1981 }, { "epoch": 0.7687384854067681, "grad_norm": 2.2443647794399855, "learning_rate": 1.5418913279193748e-06, "loss": 0.6413, "step": 1982 }, { "epoch": 0.7691263453893145, "grad_norm": 2.824500738158625, "learning_rate": 1.537004313687015e-06, "loss": 0.7274, "step": 1983 }, { "epoch": 0.7695142053718608, "grad_norm": 2.767606133522679, "learning_rate": 1.5321236494829412e-06, "loss": 0.6852, "step": 1984 }, { "epoch": 0.7699020653544071, "grad_norm": 2.6555837922926333, "learning_rate": 1.5272493442567321e-06, "loss": 0.7224, "step": 1985 }, { "epoch": 0.7702899253369534, "grad_norm": 2.1756272200991598, "learning_rate": 1.5223814069463077e-06, "loss": 0.6114, "step": 1986 }, { "epoch": 0.7706777853194997, "grad_norm": 2.9515937442323663, "learning_rate": 1.5175198464779107e-06, "loss": 0.7132, "step": 1987 }, { "epoch": 0.7710656453020459, "grad_norm": 3.220764365533064, "learning_rate": 1.5126646717660898e-06, "loss": 0.6858, "step": 1988 }, { "epoch": 0.7714535052845922, "grad_norm": 2.6512752031092552, "learning_rate": 1.507815891713686e-06, "loss": 0.7085, "step": 1989 }, { "epoch": 0.7718413652671385, "grad_norm": 2.8639169812039618, "learning_rate": 1.5029735152118125e-06, "loss": 0.7069, "step": 1990 }, { "epoch": 0.7722292252496848, "grad_norm": 2.940829581464311, "learning_rate": 1.4981375511398427e-06, "loss": 0.6405, "step": 1991 }, { "epoch": 0.7726170852322312, "grad_norm": 2.6478006715397027, "learning_rate": 1.49330800836539e-06, "loss": 0.6608, "step": 1992 }, { "epoch": 0.7730049452147775, "grad_norm": 2.316988749316277, "learning_rate": 1.4884848957442933e-06, "loss": 0.6531, "step": 1993 }, { "epoch": 0.7733928051973238, "grad_norm": 2.660543920431166, "learning_rate": 1.4836682221206e-06, "loss": 0.6783, "step": 1994 }, { "epoch": 0.7737806651798701, "grad_norm": 2.350934409793109, "learning_rate": 1.4788579963265547e-06, "loss": 0.6736, "step": 1995 }, { "epoch": 0.7741685251624164, "grad_norm": 1.7875474321887188, "learning_rate": 1.4740542271825736e-06, "loss": 0.6461, "step": 1996 }, { "epoch": 0.7745563851449627, "grad_norm": 2.8455654259673935, "learning_rate": 1.4692569234972348e-06, "loss": 0.6856, "step": 1997 }, { "epoch": 0.7749442451275089, "grad_norm": 1.681184384659447, "learning_rate": 1.4644660940672628e-06, "loss": 0.586, "step": 1998 }, { "epoch": 0.7753321051100552, "grad_norm": 1.9220627163448492, "learning_rate": 1.4596817476775077e-06, "loss": 0.6216, "step": 1999 }, { "epoch": 0.7757199650926015, "grad_norm": 2.80299349779109, "learning_rate": 1.454903893100934e-06, "loss": 0.7061, "step": 2000 }, { "epoch": 0.7757199650926015, "eval_loss": 1.2540079355239868, "eval_runtime": 6.0644, "eval_samples_per_second": 0.165, "eval_steps_per_second": 0.165, "step": 2000 }, { "epoch": 0.7761078250751479, "grad_norm": 2.6302731791642024, "learning_rate": 1.4501325390986004e-06, "loss": 0.6613, "step": 2001 }, { "epoch": 0.7764956850576942, "grad_norm": 2.131767782351463, "learning_rate": 1.4453676944196477e-06, "loss": 0.6535, "step": 2002 }, { "epoch": 0.7768835450402405, "grad_norm": 2.9381673898871563, "learning_rate": 1.4406093678012767e-06, "loss": 0.6744, "step": 2003 }, { "epoch": 0.7772714050227868, "grad_norm": 2.657202644344991, "learning_rate": 1.4358575679687425e-06, "loss": 0.6931, "step": 2004 }, { "epoch": 0.7776592650053331, "grad_norm": 3.261945032738493, "learning_rate": 1.431112303635328e-06, "loss": 0.7129, "step": 2005 }, { "epoch": 0.7780471249878794, "grad_norm": 1.9138155045766194, "learning_rate": 1.4263735835023318e-06, "loss": 0.6724, "step": 2006 }, { "epoch": 0.7784349849704256, "grad_norm": 2.37309789946728, "learning_rate": 1.4216414162590531e-06, "loss": 0.6067, "step": 2007 }, { "epoch": 0.7788228449529719, "grad_norm": 2.1784534209640065, "learning_rate": 1.4169158105827768e-06, "loss": 0.6761, "step": 2008 }, { "epoch": 0.7792107049355183, "grad_norm": 2.400707390884429, "learning_rate": 1.4121967751387538e-06, "loss": 0.657, "step": 2009 }, { "epoch": 0.7795985649180646, "grad_norm": 1.6974205984085624, "learning_rate": 1.4074843185801885e-06, "loss": 0.6235, "step": 2010 }, { "epoch": 0.7799864249006109, "grad_norm": 2.397056081440427, "learning_rate": 1.4027784495482215e-06, "loss": 0.6252, "step": 2011 }, { "epoch": 0.7803742848831572, "grad_norm": 2.931173778706403, "learning_rate": 1.3980791766719138e-06, "loss": 0.6766, "step": 2012 }, { "epoch": 0.7807621448657035, "grad_norm": 2.1448225148887716, "learning_rate": 1.3933865085682313e-06, "loss": 0.6392, "step": 2013 }, { "epoch": 0.7811500048482498, "grad_norm": 2.747228145829711, "learning_rate": 1.388700453842029e-06, "loss": 0.6544, "step": 2014 }, { "epoch": 0.7815378648307961, "grad_norm": 2.3556369108335278, "learning_rate": 1.3840210210860343e-06, "loss": 0.6646, "step": 2015 }, { "epoch": 0.7819257248133424, "grad_norm": 2.6947867763625974, "learning_rate": 1.3793482188808339e-06, "loss": 0.6844, "step": 2016 }, { "epoch": 0.7823135847958886, "grad_norm": 2.9551665500382955, "learning_rate": 1.3746820557948538e-06, "loss": 0.6419, "step": 2017 }, { "epoch": 0.782701444778435, "grad_norm": 3.8204594393778644, "learning_rate": 1.370022540384347e-06, "loss": 0.7426, "step": 2018 }, { "epoch": 0.7830893047609813, "grad_norm": 2.319177712634172, "learning_rate": 1.3653696811933782e-06, "loss": 0.6714, "step": 2019 }, { "epoch": 0.7834771647435276, "grad_norm": 2.0523675022969443, "learning_rate": 1.3607234867538028e-06, "loss": 0.6321, "step": 2020 }, { "epoch": 0.7838650247260739, "grad_norm": 2.4822518524270385, "learning_rate": 1.3560839655852604e-06, "loss": 0.6207, "step": 2021 }, { "epoch": 0.7842528847086202, "grad_norm": 2.8349861407056025, "learning_rate": 1.3514511261951514e-06, "loss": 0.626, "step": 2022 }, { "epoch": 0.7846407446911665, "grad_norm": 2.0864377287609743, "learning_rate": 1.3468249770786223e-06, "loss": 0.6305, "step": 2023 }, { "epoch": 0.7850286046737128, "grad_norm": 2.437502590911481, "learning_rate": 1.3422055267185541e-06, "loss": 0.6239, "step": 2024 }, { "epoch": 0.7854164646562591, "grad_norm": 2.264331027135223, "learning_rate": 1.337592783585544e-06, "loss": 0.6421, "step": 2025 }, { "epoch": 0.7858043246388053, "grad_norm": 2.337181934714096, "learning_rate": 1.332986756137889e-06, "loss": 0.6296, "step": 2026 }, { "epoch": 0.7861921846213517, "grad_norm": 2.908598480517351, "learning_rate": 1.3283874528215735e-06, "loss": 0.6609, "step": 2027 }, { "epoch": 0.786580044603898, "grad_norm": 1.9206659817281797, "learning_rate": 1.3237948820702495e-06, "loss": 0.6622, "step": 2028 }, { "epoch": 0.7869679045864443, "grad_norm": 1.7768530424402864, "learning_rate": 1.3192090523052275e-06, "loss": 0.5997, "step": 2029 }, { "epoch": 0.7873557645689906, "grad_norm": 2.146108613493023, "learning_rate": 1.3146299719354544e-06, "loss": 0.6316, "step": 2030 }, { "epoch": 0.7877436245515369, "grad_norm": 1.9093794169924316, "learning_rate": 1.3100576493575012e-06, "loss": 0.6055, "step": 2031 }, { "epoch": 0.7881314845340832, "grad_norm": 2.5257000363549444, "learning_rate": 1.3054920929555471e-06, "loss": 0.6459, "step": 2032 }, { "epoch": 0.7885193445166295, "grad_norm": 2.19278500702785, "learning_rate": 1.300933311101365e-06, "loss": 0.6198, "step": 2033 }, { "epoch": 0.7889072044991758, "grad_norm": 2.1569825655519024, "learning_rate": 1.296381312154305e-06, "loss": 0.6648, "step": 2034 }, { "epoch": 0.7892950644817222, "grad_norm": 2.716631521602286, "learning_rate": 1.29183610446128e-06, "loss": 0.6261, "step": 2035 }, { "epoch": 0.7896829244642684, "grad_norm": 2.014825722207554, "learning_rate": 1.2872976963567485e-06, "loss": 0.6238, "step": 2036 }, { "epoch": 0.7900707844468147, "grad_norm": 2.3194286359045067, "learning_rate": 1.282766096162701e-06, "loss": 0.6863, "step": 2037 }, { "epoch": 0.790458644429361, "grad_norm": 2.0838849277100326, "learning_rate": 1.2782413121886483e-06, "loss": 0.6488, "step": 2038 }, { "epoch": 0.7908465044119073, "grad_norm": 2.660011902677586, "learning_rate": 1.2737233527315978e-06, "loss": 0.6322, "step": 2039 }, { "epoch": 0.7912343643944536, "grad_norm": 1.796985426393936, "learning_rate": 1.2692122260760442e-06, "loss": 0.5707, "step": 2040 }, { "epoch": 0.7916222243769999, "grad_norm": 1.8236293781761266, "learning_rate": 1.2647079404939533e-06, "loss": 0.6066, "step": 2041 }, { "epoch": 0.7920100843595462, "grad_norm": 2.1055492935306517, "learning_rate": 1.2602105042447472e-06, "loss": 0.6426, "step": 2042 }, { "epoch": 0.7923979443420925, "grad_norm": 1.9859435935769503, "learning_rate": 1.2557199255752866e-06, "loss": 0.6518, "step": 2043 }, { "epoch": 0.7927858043246389, "grad_norm": 3.394080051181626, "learning_rate": 1.25123621271986e-06, "loss": 0.6845, "step": 2044 }, { "epoch": 0.7931736643071851, "grad_norm": 2.415300714050472, "learning_rate": 1.246759373900165e-06, "loss": 0.6494, "step": 2045 }, { "epoch": 0.7935615242897314, "grad_norm": 2.3117367509494593, "learning_rate": 1.2422894173252937e-06, "loss": 0.6559, "step": 2046 }, { "epoch": 0.7939493842722777, "grad_norm": 4.148162957723833, "learning_rate": 1.23782635119172e-06, "loss": 0.6898, "step": 2047 }, { "epoch": 0.794337244254824, "grad_norm": 2.3175590276297866, "learning_rate": 1.2333701836832812e-06, "loss": 0.6611, "step": 2048 }, { "epoch": 0.7947251042373703, "grad_norm": 3.013816397614327, "learning_rate": 1.2289209229711657e-06, "loss": 0.6199, "step": 2049 }, { "epoch": 0.7951129642199166, "grad_norm": 1.9666181447602096, "learning_rate": 1.2244785772138972e-06, "loss": 0.7044, "step": 2050 }, { "epoch": 0.7955008242024629, "grad_norm": 2.130502060126987, "learning_rate": 1.22004315455732e-06, "loss": 0.6159, "step": 2051 }, { "epoch": 0.7958886841850092, "grad_norm": 2.3837744866560304, "learning_rate": 1.2156146631345817e-06, "loss": 0.6639, "step": 2052 }, { "epoch": 0.7962765441675556, "grad_norm": 2.133497414595062, "learning_rate": 1.2111931110661213e-06, "loss": 0.6717, "step": 2053 }, { "epoch": 0.7966644041501019, "grad_norm": 2.6427930354488978, "learning_rate": 1.2067785064596532e-06, "loss": 0.6104, "step": 2054 }, { "epoch": 0.7970522641326481, "grad_norm": 2.166200508060219, "learning_rate": 1.202370857410155e-06, "loss": 0.5986, "step": 2055 }, { "epoch": 0.7974401241151944, "grad_norm": 2.1412602487724244, "learning_rate": 1.1979701719998454e-06, "loss": 0.636, "step": 2056 }, { "epoch": 0.7978279840977407, "grad_norm": 2.575471960467516, "learning_rate": 1.1935764582981774e-06, "loss": 0.7003, "step": 2057 }, { "epoch": 0.798215844080287, "grad_norm": 2.6683135513895095, "learning_rate": 1.1891897243618184e-06, "loss": 0.6504, "step": 2058 }, { "epoch": 0.7986037040628333, "grad_norm": 2.0531491988977733, "learning_rate": 1.1848099782346373e-06, "loss": 0.6293, "step": 2059 }, { "epoch": 0.7989915640453796, "grad_norm": 2.9485326072705704, "learning_rate": 1.1804372279476905e-06, "loss": 0.7091, "step": 2060 }, { "epoch": 0.7993794240279259, "grad_norm": 2.572115489337828, "learning_rate": 1.1760714815192054e-06, "loss": 0.7057, "step": 2061 }, { "epoch": 0.7997672840104723, "grad_norm": 2.3362919733609573, "learning_rate": 1.171712746954566e-06, "loss": 0.6439, "step": 2062 }, { "epoch": 0.8001551439930186, "grad_norm": 2.594612895116914, "learning_rate": 1.1673610322463014e-06, "loss": 0.6331, "step": 2063 }, { "epoch": 0.8005430039755648, "grad_norm": 3.084881586594678, "learning_rate": 1.163016345374066e-06, "loss": 0.7032, "step": 2064 }, { "epoch": 0.8009308639581111, "grad_norm": 2.5772610834657366, "learning_rate": 1.1586786943046284e-06, "loss": 0.6674, "step": 2065 }, { "epoch": 0.8013187239406574, "grad_norm": 2.300101279012236, "learning_rate": 1.1543480869918555e-06, "loss": 0.6348, "step": 2066 }, { "epoch": 0.8017065839232037, "grad_norm": 2.4068204667693407, "learning_rate": 1.1500245313766984e-06, "loss": 0.6843, "step": 2067 }, { "epoch": 0.80209444390575, "grad_norm": 2.418305502894438, "learning_rate": 1.145708035387177e-06, "loss": 0.6937, "step": 2068 }, { "epoch": 0.8024823038882963, "grad_norm": 1.855465766452271, "learning_rate": 1.141398606938367e-06, "loss": 0.634, "step": 2069 }, { "epoch": 0.8028701638708426, "grad_norm": 3.0617960294610658, "learning_rate": 1.1370962539323837e-06, "loss": 0.6653, "step": 2070 }, { "epoch": 0.803258023853389, "grad_norm": 3.342564981912469, "learning_rate": 1.1328009842583677e-06, "loss": 0.7079, "step": 2071 }, { "epoch": 0.8036458838359353, "grad_norm": 2.7061697704522616, "learning_rate": 1.1285128057924743e-06, "loss": 0.6301, "step": 2072 }, { "epoch": 0.8040337438184816, "grad_norm": 1.8485763310411856, "learning_rate": 1.1242317263978525e-06, "loss": 0.6337, "step": 2073 }, { "epoch": 0.8044216038010278, "grad_norm": 1.9901928296077553, "learning_rate": 1.1199577539246348e-06, "loss": 0.6148, "step": 2074 }, { "epoch": 0.8048094637835741, "grad_norm": 2.466619493007625, "learning_rate": 1.1156908962099223e-06, "loss": 0.6042, "step": 2075 }, { "epoch": 0.8051973237661204, "grad_norm": 2.4627225768269714, "learning_rate": 1.111431161077769e-06, "loss": 0.6431, "step": 2076 }, { "epoch": 0.8055851837486667, "grad_norm": 2.045199254053524, "learning_rate": 1.1071785563391697e-06, "loss": 0.7109, "step": 2077 }, { "epoch": 0.805973043731213, "grad_norm": 2.8439103306887668, "learning_rate": 1.102933089792042e-06, "loss": 0.7045, "step": 2078 }, { "epoch": 0.8063609037137593, "grad_norm": 2.0764772613416027, "learning_rate": 1.0986947692212174e-06, "loss": 0.603, "step": 2079 }, { "epoch": 0.8067487636963057, "grad_norm": 2.2853463399229494, "learning_rate": 1.0944636023984222e-06, "loss": 0.6327, "step": 2080 }, { "epoch": 0.807136623678852, "grad_norm": 2.7497092174718025, "learning_rate": 1.0902395970822648e-06, "loss": 0.6438, "step": 2081 }, { "epoch": 0.8075244836613983, "grad_norm": 2.526216223068647, "learning_rate": 1.0860227610182222e-06, "loss": 0.648, "step": 2082 }, { "epoch": 0.8079123436439445, "grad_norm": 2.1781223626902864, "learning_rate": 1.081813101938625e-06, "loss": 0.6244, "step": 2083 }, { "epoch": 0.8083002036264908, "grad_norm": 2.3241475546902404, "learning_rate": 1.0776106275626446e-06, "loss": 0.6813, "step": 2084 }, { "epoch": 0.8086880636090371, "grad_norm": 2.1224478535984335, "learning_rate": 1.0734153455962765e-06, "loss": 0.6481, "step": 2085 }, { "epoch": 0.8090759235915834, "grad_norm": 2.6706950979152877, "learning_rate": 1.0692272637323281e-06, "loss": 0.7134, "step": 2086 }, { "epoch": 0.8094637835741297, "grad_norm": 3.3827971589926897, "learning_rate": 1.0650463896504042e-06, "loss": 0.7351, "step": 2087 }, { "epoch": 0.809851643556676, "grad_norm": 2.0062307024555563, "learning_rate": 1.0608727310168921e-06, "loss": 0.6308, "step": 2088 }, { "epoch": 0.8102395035392224, "grad_norm": 2.0522010029573052, "learning_rate": 1.0567062954849506e-06, "loss": 0.6065, "step": 2089 }, { "epoch": 0.8106273635217687, "grad_norm": 2.2215606875418112, "learning_rate": 1.0525470906944919e-06, "loss": 0.6648, "step": 2090 }, { "epoch": 0.811015223504315, "grad_norm": 2.1960536543209024, "learning_rate": 1.0483951242721685e-06, "loss": 0.6938, "step": 2091 }, { "epoch": 0.8114030834868613, "grad_norm": 2.8412445944851465, "learning_rate": 1.044250403831361e-06, "loss": 0.6269, "step": 2092 }, { "epoch": 0.8117909434694075, "grad_norm": 1.9668630274584717, "learning_rate": 1.040112936972164e-06, "loss": 0.6289, "step": 2093 }, { "epoch": 0.8121788034519538, "grad_norm": 2.379846457558885, "learning_rate": 1.0359827312813702e-06, "loss": 0.6561, "step": 2094 }, { "epoch": 0.8125666634345001, "grad_norm": 2.170710802627724, "learning_rate": 1.0318597943324582e-06, "loss": 0.6971, "step": 2095 }, { "epoch": 0.8129545234170464, "grad_norm": 1.6419801064577062, "learning_rate": 1.027744133685577e-06, "loss": 0.584, "step": 2096 }, { "epoch": 0.8133423833995927, "grad_norm": 1.9427713990987603, "learning_rate": 1.0236357568875333e-06, "loss": 0.6099, "step": 2097 }, { "epoch": 0.8137302433821391, "grad_norm": 2.0971668855437295, "learning_rate": 1.0195346714717813e-06, "loss": 0.6472, "step": 2098 }, { "epoch": 0.8141181033646854, "grad_norm": 2.847543462706242, "learning_rate": 1.0154408849583997e-06, "loss": 0.6699, "step": 2099 }, { "epoch": 0.8145059633472317, "grad_norm": 2.1299591162167704, "learning_rate": 1.0113544048540868e-06, "loss": 0.6148, "step": 2100 }, { "epoch": 0.814893823329778, "grad_norm": 2.464436487800701, "learning_rate": 1.0072752386521417e-06, "loss": 0.6637, "step": 2101 }, { "epoch": 0.8152816833123242, "grad_norm": 1.9674661962386286, "learning_rate": 1.0032033938324527e-06, "loss": 0.6256, "step": 2102 }, { "epoch": 0.8156695432948705, "grad_norm": 2.324090302569577, "learning_rate": 9.991388778614825e-07, "loss": 0.6909, "step": 2103 }, { "epoch": 0.8160574032774168, "grad_norm": 2.238819832898849, "learning_rate": 9.950816981922567e-07, "loss": 0.6472, "step": 2104 }, { "epoch": 0.8164452632599631, "grad_norm": 2.0975725137595465, "learning_rate": 9.91031862264345e-07, "loss": 0.7091, "step": 2105 }, { "epoch": 0.8168331232425095, "grad_norm": 2.4809581524869824, "learning_rate": 9.869893775038558e-07, "loss": 0.7241, "step": 2106 }, { "epoch": 0.8172209832250558, "grad_norm": 2.545649904547363, "learning_rate": 9.829542513234153e-07, "loss": 0.6718, "step": 2107 }, { "epoch": 0.8176088432076021, "grad_norm": 2.45626201708099, "learning_rate": 9.789264911221546e-07, "loss": 0.668, "step": 2108 }, { "epoch": 0.8179967031901484, "grad_norm": 1.743927946285204, "learning_rate": 9.749061042857011e-07, "loss": 0.5756, "step": 2109 }, { "epoch": 0.8183845631726947, "grad_norm": 2.807867148089103, "learning_rate": 9.708930981861603e-07, "loss": 0.6816, "step": 2110 }, { "epoch": 0.8187724231552409, "grad_norm": 2.0986648535128416, "learning_rate": 9.668874801821033e-07, "loss": 0.6368, "step": 2111 }, { "epoch": 0.8191602831377872, "grad_norm": 3.013464810724254, "learning_rate": 9.62889257618555e-07, "loss": 0.7242, "step": 2112 }, { "epoch": 0.8195481431203335, "grad_norm": 3.090345208440008, "learning_rate": 9.588984378269784e-07, "loss": 0.7461, "step": 2113 }, { "epoch": 0.8199360031028798, "grad_norm": 2.432876439489122, "learning_rate": 9.549150281252633e-07, "loss": 0.6551, "step": 2114 }, { "epoch": 0.8203238630854262, "grad_norm": 2.4140404335036174, "learning_rate": 9.509390358177106e-07, "loss": 0.671, "step": 2115 }, { "epoch": 0.8207117230679725, "grad_norm": 2.188299648955462, "learning_rate": 9.469704681950209e-07, "loss": 0.6498, "step": 2116 }, { "epoch": 0.8210995830505188, "grad_norm": 2.838867952282522, "learning_rate": 9.430093325342799e-07, "loss": 0.6181, "step": 2117 }, { "epoch": 0.8214874430330651, "grad_norm": 2.3192545175293158, "learning_rate": 9.39055636098945e-07, "loss": 0.644, "step": 2118 }, { "epoch": 0.8218753030156114, "grad_norm": 2.5767914959139286, "learning_rate": 9.351093861388338e-07, "loss": 0.668, "step": 2119 }, { "epoch": 0.8222631629981577, "grad_norm": 2.0934486150431715, "learning_rate": 9.311705898901086e-07, "loss": 0.6537, "step": 2120 }, { "epoch": 0.8226510229807039, "grad_norm": 2.757679414364006, "learning_rate": 9.272392545752628e-07, "loss": 0.6565, "step": 2121 }, { "epoch": 0.8230388829632502, "grad_norm": 2.646483898867697, "learning_rate": 9.233153874031103e-07, "loss": 0.6308, "step": 2122 }, { "epoch": 0.8234267429457965, "grad_norm": 2.6412169423442347, "learning_rate": 9.193989955687715e-07, "loss": 0.6824, "step": 2123 }, { "epoch": 0.8238146029283429, "grad_norm": 2.9690020786819065, "learning_rate": 9.154900862536586e-07, "loss": 0.701, "step": 2124 }, { "epoch": 0.8242024629108892, "grad_norm": 2.6558108103384463, "learning_rate": 9.115886666254625e-07, "loss": 0.6436, "step": 2125 }, { "epoch": 0.8245903228934355, "grad_norm": 1.8592289810079174, "learning_rate": 9.076947438381411e-07, "loss": 0.6636, "step": 2126 }, { "epoch": 0.8249781828759818, "grad_norm": 2.3106415841120866, "learning_rate": 9.038083250319051e-07, "loss": 0.5844, "step": 2127 }, { "epoch": 0.8253660428585281, "grad_norm": 2.0838655560148514, "learning_rate": 8.999294173332058e-07, "loss": 0.6557, "step": 2128 }, { "epoch": 0.8257539028410744, "grad_norm": 1.8749061792548793, "learning_rate": 8.960580278547216e-07, "loss": 0.6365, "step": 2129 }, { "epoch": 0.8261417628236206, "grad_norm": 1.9668510298362996, "learning_rate": 8.921941636953435e-07, "loss": 0.6302, "step": 2130 }, { "epoch": 0.8265296228061669, "grad_norm": 2.762521354644459, "learning_rate": 8.883378319401648e-07, "loss": 0.6334, "step": 2131 }, { "epoch": 0.8269174827887132, "grad_norm": 2.364745974296835, "learning_rate": 8.844890396604677e-07, "loss": 0.6368, "step": 2132 }, { "epoch": 0.8273053427712596, "grad_norm": 2.919796060612584, "learning_rate": 8.806477939137081e-07, "loss": 0.6677, "step": 2133 }, { "epoch": 0.8276932027538059, "grad_norm": 2.3126123557027234, "learning_rate": 8.768141017435033e-07, "loss": 0.6761, "step": 2134 }, { "epoch": 0.8280810627363522, "grad_norm": 3.621973778713474, "learning_rate": 8.729879701796207e-07, "loss": 0.744, "step": 2135 }, { "epoch": 0.8284689227188985, "grad_norm": 1.7821003096764223, "learning_rate": 8.691694062379647e-07, "loss": 0.6631, "step": 2136 }, { "epoch": 0.8288567827014448, "grad_norm": 1.980653768081549, "learning_rate": 8.653584169205608e-07, "loss": 0.6789, "step": 2137 }, { "epoch": 0.8292446426839911, "grad_norm": 2.377821474425067, "learning_rate": 8.615550092155478e-07, "loss": 0.6478, "step": 2138 }, { "epoch": 0.8296325026665374, "grad_norm": 2.085174572483987, "learning_rate": 8.577591900971588e-07, "loss": 0.6504, "step": 2139 }, { "epoch": 0.8300203626490836, "grad_norm": 1.92803725371804, "learning_rate": 8.539709665257167e-07, "loss": 0.6064, "step": 2140 }, { "epoch": 0.8304082226316299, "grad_norm": 2.150683931197967, "learning_rate": 8.501903454476129e-07, "loss": 0.6246, "step": 2141 }, { "epoch": 0.8307960826141763, "grad_norm": 2.596062924114304, "learning_rate": 8.464173337952991e-07, "loss": 0.5566, "step": 2142 }, { "epoch": 0.8311839425967226, "grad_norm": 2.253183980380244, "learning_rate": 8.426519384872733e-07, "loss": 0.6512, "step": 2143 }, { "epoch": 0.8315718025792689, "grad_norm": 2.08013727872381, "learning_rate": 8.388941664280703e-07, "loss": 0.6567, "step": 2144 }, { "epoch": 0.8319596625618152, "grad_norm": 2.7100359808016603, "learning_rate": 8.351440245082415e-07, "loss": 0.7323, "step": 2145 }, { "epoch": 0.8323475225443615, "grad_norm": 2.8363399476353246, "learning_rate": 8.314015196043501e-07, "loss": 0.6748, "step": 2146 }, { "epoch": 0.8327353825269078, "grad_norm": 2.2691278276729663, "learning_rate": 8.276666585789561e-07, "loss": 0.6454, "step": 2147 }, { "epoch": 0.8331232425094541, "grad_norm": 2.36006959597878, "learning_rate": 8.239394482805996e-07, "loss": 0.6192, "step": 2148 }, { "epoch": 0.8335111024920003, "grad_norm": 2.8744338911207135, "learning_rate": 8.202198955437979e-07, "loss": 0.639, "step": 2149 }, { "epoch": 0.8338989624745466, "grad_norm": 2.83907221377365, "learning_rate": 8.165080071890208e-07, "loss": 0.6973, "step": 2150 }, { "epoch": 0.834286822457093, "grad_norm": 2.287917535241092, "learning_rate": 8.128037900226865e-07, "loss": 0.6999, "step": 2151 }, { "epoch": 0.8346746824396393, "grad_norm": 3.2046900791703705, "learning_rate": 8.091072508371466e-07, "loss": 0.6964, "step": 2152 }, { "epoch": 0.8350625424221856, "grad_norm": 2.3130440618932155, "learning_rate": 8.054183964106737e-07, "loss": 0.673, "step": 2153 }, { "epoch": 0.8354504024047319, "grad_norm": 1.9451759953305983, "learning_rate": 8.017372335074486e-07, "loss": 0.6015, "step": 2154 }, { "epoch": 0.8358382623872782, "grad_norm": 2.7382106442535186, "learning_rate": 7.980637688775484e-07, "loss": 0.6779, "step": 2155 }, { "epoch": 0.8362261223698245, "grad_norm": 2.0890522119733586, "learning_rate": 7.943980092569336e-07, "loss": 0.6495, "step": 2156 }, { "epoch": 0.8366139823523708, "grad_norm": 2.085874951132959, "learning_rate": 7.907399613674388e-07, "loss": 0.5951, "step": 2157 }, { "epoch": 0.8370018423349171, "grad_norm": 2.171350376060195, "learning_rate": 7.870896319167548e-07, "loss": 0.654, "step": 2158 }, { "epoch": 0.8373897023174633, "grad_norm": 2.4978715729077923, "learning_rate": 7.834470275984196e-07, "loss": 0.6658, "step": 2159 }, { "epoch": 0.8377775623000097, "grad_norm": 2.3211073326644867, "learning_rate": 7.79812155091807e-07, "loss": 0.6637, "step": 2160 }, { "epoch": 0.838165422282556, "grad_norm": 2.69886494258317, "learning_rate": 7.761850210621125e-07, "loss": 0.6764, "step": 2161 }, { "epoch": 0.8385532822651023, "grad_norm": 1.8993953836733461, "learning_rate": 7.725656321603414e-07, "loss": 0.6365, "step": 2162 }, { "epoch": 0.8389411422476486, "grad_norm": 2.55463382249981, "learning_rate": 7.689539950232977e-07, "loss": 0.719, "step": 2163 }, { "epoch": 0.8393290022301949, "grad_norm": 1.8797975060072207, "learning_rate": 7.653501162735694e-07, "loss": 0.6176, "step": 2164 }, { "epoch": 0.8397168622127412, "grad_norm": 2.6412661191759974, "learning_rate": 7.617540025195197e-07, "loss": 0.6732, "step": 2165 }, { "epoch": 0.8401047221952875, "grad_norm": 2.0032511564499917, "learning_rate": 7.581656603552745e-07, "loss": 0.6417, "step": 2166 }, { "epoch": 0.8404925821778338, "grad_norm": 3.5761146238801387, "learning_rate": 7.54585096360706e-07, "loss": 0.6755, "step": 2167 }, { "epoch": 0.84088044216038, "grad_norm": 2.707222712356204, "learning_rate": 7.510123171014255e-07, "loss": 0.6214, "step": 2168 }, { "epoch": 0.8412683021429264, "grad_norm": 2.177312938220555, "learning_rate": 7.474473291287699e-07, "loss": 0.6993, "step": 2169 }, { "epoch": 0.8416561621254727, "grad_norm": 2.570666857229838, "learning_rate": 7.438901389797881e-07, "loss": 0.6478, "step": 2170 }, { "epoch": 0.842044022108019, "grad_norm": 2.252956983129279, "learning_rate": 7.403407531772311e-07, "loss": 0.6289, "step": 2171 }, { "epoch": 0.8424318820905653, "grad_norm": 2.0535938139926655, "learning_rate": 7.367991782295392e-07, "loss": 0.6846, "step": 2172 }, { "epoch": 0.8428197420731116, "grad_norm": 3.3408924643740883, "learning_rate": 7.332654206308299e-07, "loss": 0.6942, "step": 2173 }, { "epoch": 0.8432076020556579, "grad_norm": 2.505998953411515, "learning_rate": 7.297394868608859e-07, "loss": 0.6018, "step": 2174 }, { "epoch": 0.8435954620382042, "grad_norm": 2.0229925678230174, "learning_rate": 7.262213833851445e-07, "loss": 0.6102, "step": 2175 }, { "epoch": 0.8439833220207505, "grad_norm": 2.307880044210264, "learning_rate": 7.227111166546835e-07, "loss": 0.6771, "step": 2176 }, { "epoch": 0.8443711820032969, "grad_norm": 2.7076278737581023, "learning_rate": 7.192086931062115e-07, "loss": 0.6965, "step": 2177 }, { "epoch": 0.8447590419858431, "grad_norm": 2.9164605187831856, "learning_rate": 7.157141191620548e-07, "loss": 0.7329, "step": 2178 }, { "epoch": 0.8451469019683894, "grad_norm": 1.8847459244661247, "learning_rate": 7.122274012301461e-07, "loss": 0.6175, "step": 2179 }, { "epoch": 0.8455347619509357, "grad_norm": 2.6469514667121827, "learning_rate": 7.087485457040127e-07, "loss": 0.645, "step": 2180 }, { "epoch": 0.845922621933482, "grad_norm": 2.8179523515895464, "learning_rate": 7.052775589627647e-07, "loss": 0.7033, "step": 2181 }, { "epoch": 0.8463104819160283, "grad_norm": 2.1697161159274923, "learning_rate": 7.018144473710825e-07, "loss": 0.667, "step": 2182 }, { "epoch": 0.8466983418985746, "grad_norm": 2.1040611515943985, "learning_rate": 6.983592172792087e-07, "loss": 0.7188, "step": 2183 }, { "epoch": 0.8470862018811209, "grad_norm": 2.834854808502793, "learning_rate": 6.949118750229317e-07, "loss": 0.7026, "step": 2184 }, { "epoch": 0.8474740618636672, "grad_norm": 2.4477654570105414, "learning_rate": 6.914724269235756e-07, "loss": 0.6522, "step": 2185 }, { "epoch": 0.8478619218462136, "grad_norm": 2.60979800402215, "learning_rate": 6.880408792879905e-07, "loss": 0.6344, "step": 2186 }, { "epoch": 0.8482497818287598, "grad_norm": 2.213128933984646, "learning_rate": 6.846172384085386e-07, "loss": 0.6201, "step": 2187 }, { "epoch": 0.8486376418113061, "grad_norm": 2.0221972100576706, "learning_rate": 6.812015105630842e-07, "loss": 0.6162, "step": 2188 }, { "epoch": 0.8490255017938524, "grad_norm": 2.6574316465729853, "learning_rate": 6.777937020149816e-07, "loss": 0.6301, "step": 2189 }, { "epoch": 0.8494133617763987, "grad_norm": 2.465240465503381, "learning_rate": 6.743938190130616e-07, "loss": 0.7072, "step": 2190 }, { "epoch": 0.849801221758945, "grad_norm": 2.3390664796686, "learning_rate": 6.710018677916275e-07, "loss": 0.6683, "step": 2191 }, { "epoch": 0.8501890817414913, "grad_norm": 2.3899796829513167, "learning_rate": 6.676178545704326e-07, "loss": 0.6377, "step": 2192 }, { "epoch": 0.8505769417240376, "grad_norm": 2.704703846503299, "learning_rate": 6.642417855546768e-07, "loss": 0.7024, "step": 2193 }, { "epoch": 0.850964801706584, "grad_norm": 2.883978155344731, "learning_rate": 6.60873666934993e-07, "loss": 0.6386, "step": 2194 }, { "epoch": 0.8513526616891303, "grad_norm": 2.106697167810104, "learning_rate": 6.575135048874349e-07, "loss": 0.6301, "step": 2195 }, { "epoch": 0.8517405216716766, "grad_norm": 2.5484249424256107, "learning_rate": 6.541613055734669e-07, "loss": 0.685, "step": 2196 }, { "epoch": 0.8521283816542228, "grad_norm": 2.4485507700593288, "learning_rate": 6.508170751399517e-07, "loss": 0.7082, "step": 2197 }, { "epoch": 0.8525162416367691, "grad_norm": 3.2934045667681744, "learning_rate": 6.474808197191401e-07, "loss": 0.7024, "step": 2198 }, { "epoch": 0.8529041016193154, "grad_norm": 2.3215315213961043, "learning_rate": 6.44152545428659e-07, "loss": 0.6296, "step": 2199 }, { "epoch": 0.8532919616018617, "grad_norm": 2.14571250441849, "learning_rate": 6.408322583715021e-07, "loss": 0.618, "step": 2200 }, { "epoch": 0.853679821584408, "grad_norm": 2.4735587286016956, "learning_rate": 6.375199646360142e-07, "loss": 0.6763, "step": 2201 }, { "epoch": 0.8540676815669543, "grad_norm": 2.8742577782614562, "learning_rate": 6.342156702958851e-07, "loss": 0.6751, "step": 2202 }, { "epoch": 0.8544555415495007, "grad_norm": 2.5899022471170783, "learning_rate": 6.30919381410135e-07, "loss": 0.6441, "step": 2203 }, { "epoch": 0.854843401532047, "grad_norm": 2.37179272974293, "learning_rate": 6.276311040231054e-07, "loss": 0.6267, "step": 2204 }, { "epoch": 0.8552312615145933, "grad_norm": 2.46828961677753, "learning_rate": 6.243508441644469e-07, "loss": 0.6462, "step": 2205 }, { "epoch": 0.8556191214971395, "grad_norm": 2.1955695717421397, "learning_rate": 6.210786078491088e-07, "loss": 0.6308, "step": 2206 }, { "epoch": 0.8560069814796858, "grad_norm": 2.6532033428918465, "learning_rate": 6.178144010773274e-07, "loss": 0.5833, "step": 2207 }, { "epoch": 0.8563948414622321, "grad_norm": 2.671871663144469, "learning_rate": 6.145582298346153e-07, "loss": 0.6355, "step": 2208 }, { "epoch": 0.8567827014447784, "grad_norm": 2.6309305170914334, "learning_rate": 6.113101000917515e-07, "loss": 0.6728, "step": 2209 }, { "epoch": 0.8571705614273247, "grad_norm": 2.9955258007727585, "learning_rate": 6.080700178047688e-07, "loss": 0.7076, "step": 2210 }, { "epoch": 0.857558421409871, "grad_norm": 2.197259446501087, "learning_rate": 6.048379889149425e-07, "loss": 0.6044, "step": 2211 }, { "epoch": 0.8579462813924174, "grad_norm": 3.1956789901300233, "learning_rate": 6.016140193487824e-07, "loss": 0.712, "step": 2212 }, { "epoch": 0.8583341413749637, "grad_norm": 2.171248147276586, "learning_rate": 5.98398115018019e-07, "loss": 0.6242, "step": 2213 }, { "epoch": 0.85872200135751, "grad_norm": 2.324442121038707, "learning_rate": 5.951902818195937e-07, "loss": 0.6667, "step": 2214 }, { "epoch": 0.8591098613400563, "grad_norm": 2.170510478630677, "learning_rate": 5.919905256356484e-07, "loss": 0.6515, "step": 2215 }, { "epoch": 0.8594977213226025, "grad_norm": 1.7213214536099182, "learning_rate": 5.887988523335137e-07, "loss": 0.6216, "step": 2216 }, { "epoch": 0.8598855813051488, "grad_norm": 2.185350857519862, "learning_rate": 5.856152677657007e-07, "loss": 0.6027, "step": 2217 }, { "epoch": 0.8602734412876951, "grad_norm": 2.4332411799487947, "learning_rate": 5.824397777698859e-07, "loss": 0.6404, "step": 2218 }, { "epoch": 0.8606613012702414, "grad_norm": 2.526724259496125, "learning_rate": 5.792723881689039e-07, "loss": 0.6543, "step": 2219 }, { "epoch": 0.8610491612527877, "grad_norm": 2.807151526937488, "learning_rate": 5.761131047707363e-07, "loss": 0.6784, "step": 2220 }, { "epoch": 0.861437021235334, "grad_norm": 2.5713097217605436, "learning_rate": 5.729619333684994e-07, "loss": 0.6479, "step": 2221 }, { "epoch": 0.8618248812178804, "grad_norm": 2.7313299832888336, "learning_rate": 5.698188797404358e-07, "loss": 0.6742, "step": 2222 }, { "epoch": 0.8622127412004267, "grad_norm": 2.0379435713987992, "learning_rate": 5.666839496499021e-07, "loss": 0.6531, "step": 2223 }, { "epoch": 0.862600601182973, "grad_norm": 2.1769365747497247, "learning_rate": 5.63557148845359e-07, "loss": 0.6344, "step": 2224 }, { "epoch": 0.8629884611655192, "grad_norm": 2.8179702547650463, "learning_rate": 5.604384830603599e-07, "loss": 0.676, "step": 2225 }, { "epoch": 0.8633763211480655, "grad_norm": 2.493078539949569, "learning_rate": 5.573279580135438e-07, "loss": 0.651, "step": 2226 }, { "epoch": 0.8637641811306118, "grad_norm": 2.5279414787093266, "learning_rate": 5.542255794086193e-07, "loss": 0.7178, "step": 2227 }, { "epoch": 0.8641520411131581, "grad_norm": 2.3087240019268425, "learning_rate": 5.511313529343581e-07, "loss": 0.6476, "step": 2228 }, { "epoch": 0.8645399010957044, "grad_norm": 2.3398774973108742, "learning_rate": 5.480452842645839e-07, "loss": 0.6843, "step": 2229 }, { "epoch": 0.8649277610782508, "grad_norm": 2.0981265669670717, "learning_rate": 5.449673790581611e-07, "loss": 0.6382, "step": 2230 }, { "epoch": 0.8653156210607971, "grad_norm": 2.028643260356592, "learning_rate": 5.418976429589845e-07, "loss": 0.6112, "step": 2231 }, { "epoch": 0.8657034810433434, "grad_norm": 2.453139308961058, "learning_rate": 5.388360815959703e-07, "loss": 0.627, "step": 2232 }, { "epoch": 0.8660913410258897, "grad_norm": 2.5764552538346264, "learning_rate": 5.357827005830435e-07, "loss": 0.6158, "step": 2233 }, { "epoch": 0.866479201008436, "grad_norm": 2.188133899345056, "learning_rate": 5.327375055191313e-07, "loss": 0.6403, "step": 2234 }, { "epoch": 0.8668670609909822, "grad_norm": 2.1026540303918506, "learning_rate": 5.297005019881491e-07, "loss": 0.6763, "step": 2235 }, { "epoch": 0.8672549209735285, "grad_norm": 2.087376519986405, "learning_rate": 5.266716955589907e-07, "loss": 0.6593, "step": 2236 }, { "epoch": 0.8676427809560748, "grad_norm": 2.7633001548721263, "learning_rate": 5.236510917855197e-07, "loss": 0.6335, "step": 2237 }, { "epoch": 0.8680306409386211, "grad_norm": 2.7121099004139015, "learning_rate": 5.206386962065601e-07, "loss": 0.6925, "step": 2238 }, { "epoch": 0.8684185009211675, "grad_norm": 2.5090507241457805, "learning_rate": 5.176345143458827e-07, "loss": 0.6684, "step": 2239 }, { "epoch": 0.8688063609037138, "grad_norm": 2.3740112774126354, "learning_rate": 5.146385517121977e-07, "loss": 0.6259, "step": 2240 }, { "epoch": 0.8691942208862601, "grad_norm": 2.3575534505137954, "learning_rate": 5.116508137991438e-07, "loss": 0.6816, "step": 2241 }, { "epoch": 0.8695820808688064, "grad_norm": 2.6999368119556397, "learning_rate": 5.086713060852788e-07, "loss": 0.6931, "step": 2242 }, { "epoch": 0.8699699408513527, "grad_norm": 2.18154734439112, "learning_rate": 5.057000340340679e-07, "loss": 0.6816, "step": 2243 }, { "epoch": 0.8703578008338989, "grad_norm": 3.001200050423835, "learning_rate": 5.027370030938755e-07, "loss": 0.7629, "step": 2244 }, { "epoch": 0.8707456608164452, "grad_norm": 2.427448338834021, "learning_rate": 4.997822186979539e-07, "loss": 0.6894, "step": 2245 }, { "epoch": 0.8711335207989915, "grad_norm": 2.030349185810588, "learning_rate": 4.968356862644352e-07, "loss": 0.6553, "step": 2246 }, { "epoch": 0.8715213807815378, "grad_norm": 1.8827514722870906, "learning_rate": 4.938974111963174e-07, "loss": 0.6539, "step": 2247 }, { "epoch": 0.8719092407640842, "grad_norm": 3.022781309490935, "learning_rate": 4.9096739888146e-07, "loss": 0.6592, "step": 2248 }, { "epoch": 0.8722971007466305, "grad_norm": 1.9458677969842768, "learning_rate": 4.880456546925693e-07, "loss": 0.6139, "step": 2249 }, { "epoch": 0.8726849607291768, "grad_norm": 1.856629665147042, "learning_rate": 4.851321839871908e-07, "loss": 0.6273, "step": 2250 }, { "epoch": 0.8730728207117231, "grad_norm": 2.423159037895344, "learning_rate": 4.822269921077011e-07, "loss": 0.6576, "step": 2251 }, { "epoch": 0.8734606806942694, "grad_norm": 2.0026969728322634, "learning_rate": 4.793300843812926e-07, "loss": 0.6132, "step": 2252 }, { "epoch": 0.8738485406768157, "grad_norm": 2.767459973739007, "learning_rate": 4.7644146611997064e-07, "loss": 0.6124, "step": 2253 }, { "epoch": 0.8742364006593619, "grad_norm": 3.616543573577782, "learning_rate": 4.735611426205372e-07, "loss": 0.6276, "step": 2254 }, { "epoch": 0.8746242606419082, "grad_norm": 1.8858086216915417, "learning_rate": 4.7068911916458683e-07, "loss": 0.579, "step": 2255 }, { "epoch": 0.8750121206244545, "grad_norm": 2.1710974318713, "learning_rate": 4.678254010184929e-07, "loss": 0.5976, "step": 2256 }, { "epoch": 0.8753999806070009, "grad_norm": 2.0725445812098156, "learning_rate": 4.6496999343340065e-07, "loss": 0.574, "step": 2257 }, { "epoch": 0.8757878405895472, "grad_norm": 2.116373221886325, "learning_rate": 4.6212290164521554e-07, "loss": 0.6593, "step": 2258 }, { "epoch": 0.8761757005720935, "grad_norm": 2.528114851697363, "learning_rate": 4.5928413087459325e-07, "loss": 0.639, "step": 2259 }, { "epoch": 0.8765635605546398, "grad_norm": 2.0726175183861244, "learning_rate": 4.564536863269353e-07, "loss": 0.6241, "step": 2260 }, { "epoch": 0.8769514205371861, "grad_norm": 2.3692070873176676, "learning_rate": 4.536315731923724e-07, "loss": 0.6212, "step": 2261 }, { "epoch": 0.8773392805197324, "grad_norm": 2.52427955198282, "learning_rate": 4.5081779664575887e-07, "loss": 0.6536, "step": 2262 }, { "epoch": 0.8777271405022786, "grad_norm": 1.8700765181663164, "learning_rate": 4.48012361846662e-07, "loss": 0.6169, "step": 2263 }, { "epoch": 0.8781150004848249, "grad_norm": 1.8230874339902725, "learning_rate": 4.4521527393935336e-07, "loss": 0.6676, "step": 2264 }, { "epoch": 0.8785028604673712, "grad_norm": 2.210200081745132, "learning_rate": 4.4242653805279923e-07, "loss": 0.6272, "step": 2265 }, { "epoch": 0.8788907204499176, "grad_norm": 2.2344025033692514, "learning_rate": 4.3964615930065126e-07, "loss": 0.6478, "step": 2266 }, { "epoch": 0.8792785804324639, "grad_norm": 2.622452857758712, "learning_rate": 4.3687414278123454e-07, "loss": 0.6529, "step": 2267 }, { "epoch": 0.8796664404150102, "grad_norm": 2.182032380938603, "learning_rate": 4.341104935775442e-07, "loss": 0.6061, "step": 2268 }, { "epoch": 0.8800543003975565, "grad_norm": 2.5450440675606996, "learning_rate": 4.313552167572294e-07, "loss": 0.6465, "step": 2269 }, { "epoch": 0.8804421603801028, "grad_norm": 2.2835006681673473, "learning_rate": 4.2860831737258857e-07, "loss": 0.7001, "step": 2270 }, { "epoch": 0.8808300203626491, "grad_norm": 2.8736795282271705, "learning_rate": 4.258698004605571e-07, "loss": 0.6728, "step": 2271 }, { "epoch": 0.8812178803451954, "grad_norm": 2.056586070383055, "learning_rate": 4.231396710427016e-07, "loss": 0.606, "step": 2272 }, { "epoch": 0.8816057403277416, "grad_norm": 2.508570272652325, "learning_rate": 4.204179341252074e-07, "loss": 0.6744, "step": 2273 }, { "epoch": 0.881993600310288, "grad_norm": 3.1813587217506623, "learning_rate": 4.1770459469887003e-07, "loss": 0.7428, "step": 2274 }, { "epoch": 0.8823814602928343, "grad_norm": 2.4629925413685085, "learning_rate": 4.149996577390886e-07, "loss": 0.6022, "step": 2275 }, { "epoch": 0.8827693202753806, "grad_norm": 1.8537719871559126, "learning_rate": 4.1230312820585317e-07, "loss": 0.6261, "step": 2276 }, { "epoch": 0.8831571802579269, "grad_norm": 3.2280572573448123, "learning_rate": 4.09615011043738e-07, "loss": 0.6821, "step": 2277 }, { "epoch": 0.8835450402404732, "grad_norm": 2.458670198021006, "learning_rate": 4.069353111818913e-07, "loss": 0.7163, "step": 2278 }, { "epoch": 0.8839329002230195, "grad_norm": 2.1360531919712966, "learning_rate": 4.042640335340281e-07, "loss": 0.6293, "step": 2279 }, { "epoch": 0.8843207602055658, "grad_norm": 2.196037793166695, "learning_rate": 4.016011829984168e-07, "loss": 0.5947, "step": 2280 }, { "epoch": 0.8847086201881121, "grad_norm": 2.786971186101355, "learning_rate": 3.989467644578765e-07, "loss": 0.6457, "step": 2281 }, { "epoch": 0.8850964801706583, "grad_norm": 2.963351743689799, "learning_rate": 3.963007827797627e-07, "loss": 0.6882, "step": 2282 }, { "epoch": 0.8854843401532047, "grad_norm": 2.040818220727546, "learning_rate": 3.936632428159609e-07, "loss": 0.6066, "step": 2283 }, { "epoch": 0.885872200135751, "grad_norm": 2.069987000391076, "learning_rate": 3.9103414940287575e-07, "loss": 0.6198, "step": 2284 }, { "epoch": 0.8862600601182973, "grad_norm": 2.35696640578444, "learning_rate": 3.8841350736142757e-07, "loss": 0.6502, "step": 2285 }, { "epoch": 0.8866479201008436, "grad_norm": 2.033199953978161, "learning_rate": 3.858013214970363e-07, "loss": 0.6536, "step": 2286 }, { "epoch": 0.8870357800833899, "grad_norm": 1.9034484666557314, "learning_rate": 3.831975965996154e-07, "loss": 0.6181, "step": 2287 }, { "epoch": 0.8874236400659362, "grad_norm": 2.4570099655114648, "learning_rate": 3.8060233744356634e-07, "loss": 0.6505, "step": 2288 }, { "epoch": 0.8878115000484825, "grad_norm": 2.439505318621445, "learning_rate": 3.7801554878776514e-07, "loss": 0.6536, "step": 2289 }, { "epoch": 0.8881993600310288, "grad_norm": 2.068008490337173, "learning_rate": 3.754372353755559e-07, "loss": 0.6443, "step": 2290 }, { "epoch": 0.8885872200135752, "grad_norm": 2.140648851478044, "learning_rate": 3.728674019347428e-07, "loss": 0.6062, "step": 2291 }, { "epoch": 0.8889750799961214, "grad_norm": 3.0037902057580634, "learning_rate": 3.703060531775787e-07, "loss": 0.6919, "step": 2292 }, { "epoch": 0.8893629399786677, "grad_norm": 2.5809543362776814, "learning_rate": 3.6775319380076e-07, "loss": 0.6304, "step": 2293 }, { "epoch": 0.889750799961214, "grad_norm": 2.857067630685723, "learning_rate": 3.6520882848541606e-07, "loss": 0.6723, "step": 2294 }, { "epoch": 0.8901386599437603, "grad_norm": 2.420507191162015, "learning_rate": 3.626729618970998e-07, "loss": 0.6536, "step": 2295 }, { "epoch": 0.8905265199263066, "grad_norm": 1.6966678133025377, "learning_rate": 3.6014559868578103e-07, "loss": 0.5932, "step": 2296 }, { "epoch": 0.8909143799088529, "grad_norm": 2.4199374292289355, "learning_rate": 3.576267434858366e-07, "loss": 0.664, "step": 2297 }, { "epoch": 0.8913022398913992, "grad_norm": 2.392540713317348, "learning_rate": 3.5511640091604293e-07, "loss": 0.6517, "step": 2298 }, { "epoch": 0.8916900998739455, "grad_norm": 1.8406584655039469, "learning_rate": 3.5261457557956626e-07, "loss": 0.6304, "step": 2299 }, { "epoch": 0.8920779598564919, "grad_norm": 3.1896464518361465, "learning_rate": 3.501212720639563e-07, "loss": 0.7059, "step": 2300 }, { "epoch": 0.892465819839038, "grad_norm": 2.954185339610949, "learning_rate": 3.476364949411343e-07, "loss": 0.7049, "step": 2301 }, { "epoch": 0.8928536798215844, "grad_norm": 2.358365525618258, "learning_rate": 3.451602487673889e-07, "loss": 0.6325, "step": 2302 }, { "epoch": 0.8932415398041307, "grad_norm": 3.2574795790861883, "learning_rate": 3.4269253808336456e-07, "loss": 0.7582, "step": 2303 }, { "epoch": 0.893629399786677, "grad_norm": 2.563584491879276, "learning_rate": 3.402333674140551e-07, "loss": 0.6228, "step": 2304 }, { "epoch": 0.8940172597692233, "grad_norm": 2.282906184044103, "learning_rate": 3.377827412687934e-07, "loss": 0.6056, "step": 2305 }, { "epoch": 0.8944051197517696, "grad_norm": 2.3401557917020326, "learning_rate": 3.35340664141246e-07, "loss": 0.7004, "step": 2306 }, { "epoch": 0.8947929797343159, "grad_norm": 1.9364309195206466, "learning_rate": 3.32907140509402e-07, "loss": 0.649, "step": 2307 }, { "epoch": 0.8951808397168622, "grad_norm": 3.2651180611341677, "learning_rate": 3.3048217483556743e-07, "loss": 0.659, "step": 2308 }, { "epoch": 0.8955686996994086, "grad_norm": 1.8877054507372766, "learning_rate": 3.2806577156635435e-07, "loss": 0.5768, "step": 2309 }, { "epoch": 0.8959565596819549, "grad_norm": 1.9760127315591527, "learning_rate": 3.256579351326744e-07, "loss": 0.6017, "step": 2310 }, { "epoch": 0.8963444196645011, "grad_norm": 2.4417738366868043, "learning_rate": 3.2325866994973197e-07, "loss": 0.6464, "step": 2311 }, { "epoch": 0.8967322796470474, "grad_norm": 2.5327560193418397, "learning_rate": 3.208679804170128e-07, "loss": 0.6444, "step": 2312 }, { "epoch": 0.8971201396295937, "grad_norm": 2.588019977070708, "learning_rate": 3.1848587091827757e-07, "loss": 0.7021, "step": 2313 }, { "epoch": 0.89750799961214, "grad_norm": 2.6832316300047436, "learning_rate": 3.161123458215554e-07, "loss": 0.7133, "step": 2314 }, { "epoch": 0.8978958595946863, "grad_norm": 2.1504588795202646, "learning_rate": 3.1374740947913206e-07, "loss": 0.5773, "step": 2315 }, { "epoch": 0.8982837195772326, "grad_norm": 1.8376099183143009, "learning_rate": 3.1139106622754655e-07, "loss": 0.5977, "step": 2316 }, { "epoch": 0.8986715795597789, "grad_norm": 2.8220501122424335, "learning_rate": 3.0904332038757977e-07, "loss": 0.6455, "step": 2317 }, { "epoch": 0.8990594395423253, "grad_norm": 2.637248549355356, "learning_rate": 3.067041762642475e-07, "loss": 0.6777, "step": 2318 }, { "epoch": 0.8994472995248716, "grad_norm": 2.5697187951106684, "learning_rate": 3.0437363814679375e-07, "loss": 0.6474, "step": 2319 }, { "epoch": 0.8998351595074178, "grad_norm": 2.1324585070784856, "learning_rate": 3.020517103086812e-07, "loss": 0.6501, "step": 2320 }, { "epoch": 0.9002230194899641, "grad_norm": 2.097775785806987, "learning_rate": 2.99738397007584e-07, "loss": 0.5966, "step": 2321 }, { "epoch": 0.9006108794725104, "grad_norm": 2.6294737086916453, "learning_rate": 2.974337024853802e-07, "loss": 0.7005, "step": 2322 }, { "epoch": 0.9009987394550567, "grad_norm": 1.9807078871688286, "learning_rate": 2.9513763096814305e-07, "loss": 0.6515, "step": 2323 }, { "epoch": 0.901386599437603, "grad_norm": 1.7430709090897525, "learning_rate": 2.9285018666613484e-07, "loss": 0.6664, "step": 2324 }, { "epoch": 0.9017744594201493, "grad_norm": 2.2331137689177183, "learning_rate": 2.9057137377379805e-07, "loss": 0.6644, "step": 2325 }, { "epoch": 0.9021623194026956, "grad_norm": 2.3142736170259703, "learning_rate": 2.8830119646974796e-07, "loss": 0.626, "step": 2326 }, { "epoch": 0.902550179385242, "grad_norm": 3.1152446110062133, "learning_rate": 2.860396589167641e-07, "loss": 0.6318, "step": 2327 }, { "epoch": 0.9029380393677883, "grad_norm": 2.7836161681081157, "learning_rate": 2.8378676526178484e-07, "loss": 0.6762, "step": 2328 }, { "epoch": 0.9033258993503346, "grad_norm": 2.704536440071435, "learning_rate": 2.815425196358984e-07, "loss": 0.6568, "step": 2329 }, { "epoch": 0.9037137593328808, "grad_norm": 2.661597073586334, "learning_rate": 2.7930692615433353e-07, "loss": 0.6827, "step": 2330 }, { "epoch": 0.9041016193154271, "grad_norm": 2.651913643949107, "learning_rate": 2.770799889164549e-07, "loss": 0.6543, "step": 2331 }, { "epoch": 0.9044894792979734, "grad_norm": 2.885365322858343, "learning_rate": 2.748617120057551e-07, "loss": 0.6668, "step": 2332 }, { "epoch": 0.9048773392805197, "grad_norm": 2.5909346100662405, "learning_rate": 2.726520994898452e-07, "loss": 0.6282, "step": 2333 }, { "epoch": 0.905265199263066, "grad_norm": 2.3810299056498962, "learning_rate": 2.704511554204486e-07, "loss": 0.6497, "step": 2334 }, { "epoch": 0.9056530592456123, "grad_norm": 2.515399074799305, "learning_rate": 2.6825888383339436e-07, "loss": 0.6512, "step": 2335 }, { "epoch": 0.9060409192281587, "grad_norm": 2.2161070997443697, "learning_rate": 2.660752887486084e-07, "loss": 0.6817, "step": 2336 }, { "epoch": 0.906428779210705, "grad_norm": 2.321385260506164, "learning_rate": 2.6390037417010683e-07, "loss": 0.6083, "step": 2337 }, { "epoch": 0.9068166391932513, "grad_norm": 2.0474095518331095, "learning_rate": 2.617341440859883e-07, "loss": 0.665, "step": 2338 }, { "epoch": 0.9072044991757975, "grad_norm": 2.5580880322248127, "learning_rate": 2.5957660246842707e-07, "loss": 0.7017, "step": 2339 }, { "epoch": 0.9075923591583438, "grad_norm": 2.759845942404686, "learning_rate": 2.5742775327366634e-07, "loss": 0.7089, "step": 2340 }, { "epoch": 0.9079802191408901, "grad_norm": 2.1117505075767715, "learning_rate": 2.552876004420085e-07, "loss": 0.6296, "step": 2341 }, { "epoch": 0.9083680791234364, "grad_norm": 2.249137197367064, "learning_rate": 2.5315614789781064e-07, "loss": 0.6751, "step": 2342 }, { "epoch": 0.9087559391059827, "grad_norm": 2.9124871694004106, "learning_rate": 2.5103339954947624e-07, "loss": 0.7395, "step": 2343 }, { "epoch": 0.909143799088529, "grad_norm": 2.460750603705051, "learning_rate": 2.4891935928944676e-07, "loss": 0.6751, "step": 2344 }, { "epoch": 0.9095316590710754, "grad_norm": 2.6627777809302855, "learning_rate": 2.468140309941991e-07, "loss": 0.5821, "step": 2345 }, { "epoch": 0.9099195190536217, "grad_norm": 2.1013533198240553, "learning_rate": 2.447174185242324e-07, "loss": 0.6575, "step": 2346 }, { "epoch": 0.910307379036168, "grad_norm": 2.06730270342888, "learning_rate": 2.4262952572406353e-07, "loss": 0.6818, "step": 2347 }, { "epoch": 0.9106952390187143, "grad_norm": 2.395594651063534, "learning_rate": 2.4055035642222225e-07, "loss": 0.6967, "step": 2348 }, { "epoch": 0.9110830990012605, "grad_norm": 1.9746631985267984, "learning_rate": 2.384799144312405e-07, "loss": 0.5837, "step": 2349 }, { "epoch": 0.9114709589838068, "grad_norm": 2.487456983615836, "learning_rate": 2.3641820354764755e-07, "loss": 0.6742, "step": 2350 }, { "epoch": 0.9118588189663531, "grad_norm": 1.9731179032558328, "learning_rate": 2.3436522755196367e-07, "loss": 0.6033, "step": 2351 }, { "epoch": 0.9122466789488994, "grad_norm": 2.212231319141506, "learning_rate": 2.323209902086898e-07, "loss": 0.6567, "step": 2352 }, { "epoch": 0.9126345389314457, "grad_norm": 2.575636221526045, "learning_rate": 2.3028549526630583e-07, "loss": 0.6132, "step": 2353 }, { "epoch": 0.9130223989139921, "grad_norm": 1.9287836478565714, "learning_rate": 2.2825874645725942e-07, "loss": 0.6021, "step": 2354 }, { "epoch": 0.9134102588965384, "grad_norm": 2.4359430531613504, "learning_rate": 2.2624074749796053e-07, "loss": 0.6608, "step": 2355 }, { "epoch": 0.9137981188790847, "grad_norm": 2.3629534885860823, "learning_rate": 2.2423150208877476e-07, "loss": 0.6181, "step": 2356 }, { "epoch": 0.914185978861631, "grad_norm": 2.177919140510809, "learning_rate": 2.2223101391401657e-07, "loss": 0.6361, "step": 2357 }, { "epoch": 0.9145738388441772, "grad_norm": 2.8521277211409526, "learning_rate": 2.2023928664194229e-07, "loss": 0.6874, "step": 2358 }, { "epoch": 0.9149616988267235, "grad_norm": 3.0393548409699718, "learning_rate": 2.1825632392474372e-07, "loss": 0.6684, "step": 2359 }, { "epoch": 0.9153495588092698, "grad_norm": 2.2223146732224435, "learning_rate": 2.1628212939854176e-07, "loss": 0.67, "step": 2360 }, { "epoch": 0.9157374187918161, "grad_norm": 2.4230151466605587, "learning_rate": 2.143167066833779e-07, "loss": 0.6832, "step": 2361 }, { "epoch": 0.9161252787743625, "grad_norm": 2.25222619817154, "learning_rate": 2.1236005938321092e-07, "loss": 0.6538, "step": 2362 }, { "epoch": 0.9165131387569088, "grad_norm": 1.9553869212096515, "learning_rate": 2.1041219108590692e-07, "loss": 0.6539, "step": 2363 }, { "epoch": 0.9169009987394551, "grad_norm": 2.2031913597118677, "learning_rate": 2.0847310536323385e-07, "loss": 0.6716, "step": 2364 }, { "epoch": 0.9172888587220014, "grad_norm": 3.290026958407634, "learning_rate": 2.065428057708563e-07, "loss": 0.6988, "step": 2365 }, { "epoch": 0.9176767187045477, "grad_norm": 2.5554485059150895, "learning_rate": 2.046212958483268e-07, "loss": 0.6367, "step": 2366 }, { "epoch": 0.918064578687094, "grad_norm": 2.5356844008411543, "learning_rate": 2.0270857911908137e-07, "loss": 0.6564, "step": 2367 }, { "epoch": 0.9184524386696402, "grad_norm": 2.161301587641161, "learning_rate": 2.0080465909043113e-07, "loss": 0.6253, "step": 2368 }, { "epoch": 0.9188402986521865, "grad_norm": 2.074659957538084, "learning_rate": 1.9890953925355838e-07, "loss": 0.6461, "step": 2369 }, { "epoch": 0.9192281586347328, "grad_norm": 1.7247583201776917, "learning_rate": 1.9702322308350675e-07, "loss": 0.6594, "step": 2370 }, { "epoch": 0.9196160186172792, "grad_norm": 1.85077506888067, "learning_rate": 1.951457140391788e-07, "loss": 0.6143, "step": 2371 }, { "epoch": 0.9200038785998255, "grad_norm": 2.8970088245289563, "learning_rate": 1.9327701556332569e-07, "loss": 0.7008, "step": 2372 }, { "epoch": 0.9203917385823718, "grad_norm": 2.171668828309959, "learning_rate": 1.9141713108254413e-07, "loss": 0.6645, "step": 2373 }, { "epoch": 0.9207795985649181, "grad_norm": 3.2193586982513906, "learning_rate": 1.895660640072683e-07, "loss": 0.6794, "step": 2374 }, { "epoch": 0.9211674585474644, "grad_norm": 2.5827977047109885, "learning_rate": 1.8772381773176417e-07, "loss": 0.6706, "step": 2375 }, { "epoch": 0.9215553185300107, "grad_norm": 2.5838997504100307, "learning_rate": 1.8589039563412291e-07, "loss": 0.6569, "step": 2376 }, { "epoch": 0.9219431785125569, "grad_norm": 1.9339648100139342, "learning_rate": 1.8406580107625583e-07, "loss": 0.6097, "step": 2377 }, { "epoch": 0.9223310384951032, "grad_norm": 1.7590255429086168, "learning_rate": 1.8225003740388546e-07, "loss": 0.6286, "step": 2378 }, { "epoch": 0.9227188984776495, "grad_norm": 2.524949588429483, "learning_rate": 1.804431079465435e-07, "loss": 0.6167, "step": 2379 }, { "epoch": 0.9231067584601959, "grad_norm": 2.185539469073554, "learning_rate": 1.7864501601756236e-07, "loss": 0.6005, "step": 2380 }, { "epoch": 0.9234946184427422, "grad_norm": 2.600188259549176, "learning_rate": 1.7685576491406676e-07, "loss": 0.6595, "step": 2381 }, { "epoch": 0.9238824784252885, "grad_norm": 2.0098473131236325, "learning_rate": 1.7507535791697338e-07, "loss": 0.6341, "step": 2382 }, { "epoch": 0.9242703384078348, "grad_norm": 2.5834865332788572, "learning_rate": 1.733037982909791e-07, "loss": 0.6164, "step": 2383 }, { "epoch": 0.9246581983903811, "grad_norm": 2.250483975942135, "learning_rate": 1.7154108928455926e-07, "loss": 0.6481, "step": 2384 }, { "epoch": 0.9250460583729274, "grad_norm": 3.162571557225861, "learning_rate": 1.697872341299589e-07, "loss": 0.7014, "step": 2385 }, { "epoch": 0.9254339183554737, "grad_norm": 2.4965773701250455, "learning_rate": 1.6804223604318825e-07, "loss": 0.6251, "step": 2386 }, { "epoch": 0.9258217783380199, "grad_norm": 3.0501945476952885, "learning_rate": 1.6630609822401612e-07, "loss": 0.675, "step": 2387 }, { "epoch": 0.9262096383205662, "grad_norm": 2.182343806792488, "learning_rate": 1.6457882385596647e-07, "loss": 0.6869, "step": 2388 }, { "epoch": 0.9265974983031126, "grad_norm": 2.0869003521409164, "learning_rate": 1.6286041610630742e-07, "loss": 0.6347, "step": 2389 }, { "epoch": 0.9269853582856589, "grad_norm": 3.0812023652216096, "learning_rate": 1.6115087812605123e-07, "loss": 0.6362, "step": 2390 }, { "epoch": 0.9273732182682052, "grad_norm": 2.2874157105304036, "learning_rate": 1.5945021304994368e-07, "loss": 0.6971, "step": 2391 }, { "epoch": 0.9277610782507515, "grad_norm": 2.0368295950991797, "learning_rate": 1.577584239964619e-07, "loss": 0.6229, "step": 2392 }, { "epoch": 0.9281489382332978, "grad_norm": 2.7723927500215586, "learning_rate": 1.560755140678072e-07, "loss": 0.781, "step": 2393 }, { "epoch": 0.9285367982158441, "grad_norm": 2.530772449050802, "learning_rate": 1.5440148634989827e-07, "loss": 0.6092, "step": 2394 }, { "epoch": 0.9289246581983904, "grad_norm": 2.0913784445332704, "learning_rate": 1.527363439123669e-07, "loss": 0.642, "step": 2395 }, { "epoch": 0.9293125181809366, "grad_norm": 2.3178617941024524, "learning_rate": 1.5108008980855405e-07, "loss": 0.7168, "step": 2396 }, { "epoch": 0.9297003781634829, "grad_norm": 2.170130344252857, "learning_rate": 1.4943272707550028e-07, "loss": 0.6285, "step": 2397 }, { "epoch": 0.9300882381460293, "grad_norm": 2.1148438257789373, "learning_rate": 1.477942587339426e-07, "loss": 0.6479, "step": 2398 }, { "epoch": 0.9304760981285756, "grad_norm": 2.607455112896733, "learning_rate": 1.4616468778830939e-07, "loss": 0.6111, "step": 2399 }, { "epoch": 0.9308639581111219, "grad_norm": 2.283167334322051, "learning_rate": 1.4454401722671264e-07, "loss": 0.6372, "step": 2400 }, { "epoch": 0.9312518180936682, "grad_norm": 2.009700771238338, "learning_rate": 1.4293225002094456e-07, "loss": 0.6271, "step": 2401 }, { "epoch": 0.9316396780762145, "grad_norm": 2.24647727068137, "learning_rate": 1.413293891264722e-07, "loss": 0.6097, "step": 2402 }, { "epoch": 0.9320275380587608, "grad_norm": 3.097090979657334, "learning_rate": 1.3973543748243002e-07, "loss": 0.6988, "step": 2403 }, { "epoch": 0.9324153980413071, "grad_norm": 2.613407845086683, "learning_rate": 1.3815039801161723e-07, "loss": 0.6796, "step": 2404 }, { "epoch": 0.9328032580238534, "grad_norm": 3.192497618106652, "learning_rate": 1.3657427362048893e-07, "loss": 0.6647, "step": 2405 }, { "epoch": 0.9331911180063996, "grad_norm": 1.8489041847765637, "learning_rate": 1.350070671991549e-07, "loss": 0.5944, "step": 2406 }, { "epoch": 0.933578977988946, "grad_norm": 1.8040724139703312, "learning_rate": 1.3344878162137087e-07, "loss": 0.5541, "step": 2407 }, { "epoch": 0.9339668379714923, "grad_norm": 2.456267266863812, "learning_rate": 1.3189941974453502e-07, "loss": 0.6955, "step": 2408 }, { "epoch": 0.9343546979540386, "grad_norm": 1.8189522697564051, "learning_rate": 1.3035898440968197e-07, "loss": 0.6383, "step": 2409 }, { "epoch": 0.9347425579365849, "grad_norm": 2.359471688134553, "learning_rate": 1.2882747844147893e-07, "loss": 0.7014, "step": 2410 }, { "epoch": 0.9351304179191312, "grad_norm": 2.7189983324171263, "learning_rate": 1.273049046482183e-07, "loss": 0.6829, "step": 2411 }, { "epoch": 0.9355182779016775, "grad_norm": 2.6811137207812332, "learning_rate": 1.257912658218141e-07, "loss": 0.6368, "step": 2412 }, { "epoch": 0.9359061378842238, "grad_norm": 2.7652208252736896, "learning_rate": 1.242865647377972e-07, "loss": 0.699, "step": 2413 }, { "epoch": 0.9362939978667701, "grad_norm": 2.4574970541889614, "learning_rate": 1.2279080415530832e-07, "loss": 0.6382, "step": 2414 }, { "epoch": 0.9366818578493163, "grad_norm": 2.918390843067682, "learning_rate": 1.2130398681709564e-07, "loss": 0.7415, "step": 2415 }, { "epoch": 0.9370697178318627, "grad_norm": 2.074395064039638, "learning_rate": 1.1982611544950617e-07, "loss": 0.6264, "step": 2416 }, { "epoch": 0.937457577814409, "grad_norm": 2.4866277099924043, "learning_rate": 1.1835719276248491e-07, "loss": 0.6399, "step": 2417 }, { "epoch": 0.9378454377969553, "grad_norm": 2.805307095608044, "learning_rate": 1.1689722144956672e-07, "loss": 0.6392, "step": 2418 }, { "epoch": 0.9382332977795016, "grad_norm": 1.829861872142614, "learning_rate": 1.1544620418787289e-07, "loss": 0.641, "step": 2419 }, { "epoch": 0.9386211577620479, "grad_norm": 2.1388877053127384, "learning_rate": 1.1400414363810564e-07, "loss": 0.6445, "step": 2420 }, { "epoch": 0.9390090177445942, "grad_norm": 2.916098793119781, "learning_rate": 1.1257104244454309e-07, "loss": 0.6139, "step": 2421 }, { "epoch": 0.9393968777271405, "grad_norm": 2.150762687874022, "learning_rate": 1.1114690323503652e-07, "loss": 0.5837, "step": 2422 }, { "epoch": 0.9397847377096868, "grad_norm": 2.098172689835228, "learning_rate": 1.0973172862100145e-07, "loss": 0.6438, "step": 2423 }, { "epoch": 0.9401725976922332, "grad_norm": 2.1110102455717996, "learning_rate": 1.0832552119741658e-07, "loss": 0.6729, "step": 2424 }, { "epoch": 0.9405604576747794, "grad_norm": 2.2624375403921966, "learning_rate": 1.0692828354281704e-07, "loss": 0.6389, "step": 2425 }, { "epoch": 0.9409483176573257, "grad_norm": 2.5419751313098393, "learning_rate": 1.0554001821929061e-07, "loss": 0.6748, "step": 2426 }, { "epoch": 0.941336177639872, "grad_norm": 2.0636172970728746, "learning_rate": 1.0416072777247266e-07, "loss": 0.5912, "step": 2427 }, { "epoch": 0.9417240376224183, "grad_norm": 2.6488370208385628, "learning_rate": 1.0279041473154117e-07, "loss": 0.6672, "step": 2428 }, { "epoch": 0.9421118976049646, "grad_norm": 2.367170209259219, "learning_rate": 1.0142908160921283e-07, "loss": 0.6399, "step": 2429 }, { "epoch": 0.9424997575875109, "grad_norm": 2.061417749780888, "learning_rate": 1.0007673090173808e-07, "loss": 0.5977, "step": 2430 }, { "epoch": 0.9428876175700572, "grad_norm": 2.0487603636464216, "learning_rate": 9.873336508889664e-08, "loss": 0.6523, "step": 2431 }, { "epoch": 0.9432754775526035, "grad_norm": 2.0296483522351907, "learning_rate": 9.739898663399194e-08, "loss": 0.6352, "step": 2432 }, { "epoch": 0.9436633375351499, "grad_norm": 2.2508880910612925, "learning_rate": 9.607359798384785e-08, "loss": 0.6824, "step": 2433 }, { "epoch": 0.9440511975176961, "grad_norm": 2.115579600511906, "learning_rate": 9.475720156880419e-08, "loss": 0.6026, "step": 2434 }, { "epoch": 0.9444390575002424, "grad_norm": 2.616382262988463, "learning_rate": 9.344979980271174e-08, "loss": 0.672, "step": 2435 }, { "epoch": 0.9448269174827887, "grad_norm": 2.386650666131432, "learning_rate": 9.215139508292726e-08, "loss": 0.6461, "step": 2436 }, { "epoch": 0.945214777465335, "grad_norm": 2.1868094661127717, "learning_rate": 9.086198979031125e-08, "loss": 0.6634, "step": 2437 }, { "epoch": 0.9456026374478813, "grad_norm": 2.3603694998767977, "learning_rate": 8.95815862892202e-08, "loss": 0.6539, "step": 2438 }, { "epoch": 0.9459904974304276, "grad_norm": 2.8425054747050384, "learning_rate": 8.831018692750604e-08, "loss": 0.6617, "step": 2439 }, { "epoch": 0.9463783574129739, "grad_norm": 2.4673145323335013, "learning_rate": 8.704779403650943e-08, "loss": 0.6468, "step": 2440 }, { "epoch": 0.9467662173955202, "grad_norm": 2.253921205559015, "learning_rate": 8.579440993105537e-08, "loss": 0.7164, "step": 2441 }, { "epoch": 0.9471540773780666, "grad_norm": 2.5293856467890032, "learning_rate": 8.45500369094504e-08, "loss": 0.6872, "step": 2442 }, { "epoch": 0.9475419373606129, "grad_norm": 2.4755028373581687, "learning_rate": 8.331467725347708e-08, "loss": 0.6326, "step": 2443 }, { "epoch": 0.9479297973431591, "grad_norm": 2.1855585103286805, "learning_rate": 8.20883332283906e-08, "loss": 0.6353, "step": 2444 }, { "epoch": 0.9483176573257054, "grad_norm": 1.8007124928786098, "learning_rate": 8.087100708291384e-08, "loss": 0.6432, "step": 2445 }, { "epoch": 0.9487055173082517, "grad_norm": 2.1363656642134967, "learning_rate": 7.966270104923457e-08, "loss": 0.6333, "step": 2446 }, { "epoch": 0.949093377290798, "grad_norm": 2.98814553896565, "learning_rate": 7.846341734300044e-08, "loss": 0.7199, "step": 2447 }, { "epoch": 0.9494812372733443, "grad_norm": 2.078396757675189, "learning_rate": 7.727315816331515e-08, "loss": 0.6068, "step": 2448 }, { "epoch": 0.9498690972558906, "grad_norm": 2.9674753503203743, "learning_rate": 7.609192569273339e-08, "loss": 0.6845, "step": 2449 }, { "epoch": 0.950256957238437, "grad_norm": 1.786008496656797, "learning_rate": 7.491972209725807e-08, "loss": 0.6331, "step": 2450 }, { "epoch": 0.9506448172209833, "grad_norm": 1.7666706484373182, "learning_rate": 7.375654952633704e-08, "loss": 0.6366, "step": 2451 }, { "epoch": 0.9510326772035296, "grad_norm": 2.0322761473646533, "learning_rate": 7.26024101128564e-08, "loss": 0.7065, "step": 2452 }, { "epoch": 0.9514205371860758, "grad_norm": 2.7565347008053216, "learning_rate": 7.14573059731405e-08, "loss": 0.6887, "step": 2453 }, { "epoch": 0.9518083971686221, "grad_norm": 2.190776383243706, "learning_rate": 7.032123920694356e-08, "loss": 0.63, "step": 2454 }, { "epoch": 0.9521962571511684, "grad_norm": 2.790852853634247, "learning_rate": 6.919421189744979e-08, "loss": 0.6354, "step": 2455 }, { "epoch": 0.9525841171337147, "grad_norm": 2.2654359326808855, "learning_rate": 6.807622611126718e-08, "loss": 0.6039, "step": 2456 }, { "epoch": 0.952971977116261, "grad_norm": 2.5502181463682017, "learning_rate": 6.696728389842532e-08, "loss": 0.6621, "step": 2457 }, { "epoch": 0.9533598370988073, "grad_norm": 2.3646609282380697, "learning_rate": 6.58673872923693e-08, "loss": 0.6949, "step": 2458 }, { "epoch": 0.9537476970813537, "grad_norm": 2.7745518143726704, "learning_rate": 6.477653830995855e-08, "loss": 0.6184, "step": 2459 }, { "epoch": 0.9541355570639, "grad_norm": 1.8801177005702965, "learning_rate": 6.369473895146194e-08, "loss": 0.6395, "step": 2460 }, { "epoch": 0.9545234170464463, "grad_norm": 2.864328500936459, "learning_rate": 6.262199120055379e-08, "loss": 0.6468, "step": 2461 }, { "epoch": 0.9549112770289926, "grad_norm": 2.2794710604271713, "learning_rate": 6.15582970243117e-08, "loss": 0.6092, "step": 2462 }, { "epoch": 0.9552991370115388, "grad_norm": 2.397439388063724, "learning_rate": 6.050365837320993e-08, "loss": 0.6334, "step": 2463 }, { "epoch": 0.9556869969940851, "grad_norm": 2.717742883897927, "learning_rate": 5.945807718111929e-08, "loss": 0.697, "step": 2464 }, { "epoch": 0.9560748569766314, "grad_norm": 3.401965217326661, "learning_rate": 5.842155536530281e-08, "loss": 0.6613, "step": 2465 }, { "epoch": 0.9564627169591777, "grad_norm": 2.3174978010600085, "learning_rate": 5.739409482640956e-08, "loss": 0.591, "step": 2466 }, { "epoch": 0.956850576941724, "grad_norm": 2.2105079860849575, "learning_rate": 5.6375697448474155e-08, "loss": 0.6611, "step": 2467 }, { "epoch": 0.9572384369242704, "grad_norm": 2.203495604653787, "learning_rate": 5.536636509891225e-08, "loss": 0.6654, "step": 2468 }, { "epoch": 0.9576262969068167, "grad_norm": 2.2254163747844595, "learning_rate": 5.436609962851724e-08, "loss": 0.6482, "step": 2469 }, { "epoch": 0.958014156889363, "grad_norm": 2.4135896104391534, "learning_rate": 5.3374902871456965e-08, "loss": 0.6226, "step": 2470 }, { "epoch": 0.9584020168719093, "grad_norm": 2.1133778310435196, "learning_rate": 5.23927766452692e-08, "loss": 0.5506, "step": 2471 }, { "epoch": 0.9587898768544555, "grad_norm": 2.016628439242416, "learning_rate": 5.1419722750859494e-08, "loss": 0.6406, "step": 2472 }, { "epoch": 0.9591777368370018, "grad_norm": 2.650929730802385, "learning_rate": 5.0455742972498335e-08, "loss": 0.6406, "step": 2473 }, { "epoch": 0.9595655968195481, "grad_norm": 2.3270732853484133, "learning_rate": 4.950083907781733e-08, "loss": 0.6354, "step": 2474 }, { "epoch": 0.9599534568020944, "grad_norm": 2.2179284774406285, "learning_rate": 4.855501281780528e-08, "loss": 0.663, "step": 2475 }, { "epoch": 0.9603413167846407, "grad_norm": 2.176204391167904, "learning_rate": 4.7618265926804854e-08, "loss": 0.7075, "step": 2476 }, { "epoch": 0.960729176767187, "grad_norm": 2.676065257047124, "learning_rate": 4.6690600122510924e-08, "loss": 0.6088, "step": 2477 }, { "epoch": 0.9611170367497334, "grad_norm": 2.142716331501586, "learning_rate": 4.577201710596613e-08, "loss": 0.5752, "step": 2478 }, { "epoch": 0.9615048967322797, "grad_norm": 2.4427919258393853, "learning_rate": 4.486251856155921e-08, "loss": 0.6777, "step": 2479 }, { "epoch": 0.961892756714826, "grad_norm": 2.0741321753794586, "learning_rate": 4.3962106157019455e-08, "loss": 0.6687, "step": 2480 }, { "epoch": 0.9622806166973723, "grad_norm": 3.059233040046209, "learning_rate": 4.30707815434156e-08, "loss": 0.7104, "step": 2481 }, { "epoch": 0.9626684766799185, "grad_norm": 2.2810294036921244, "learning_rate": 4.2188546355153016e-08, "loss": 0.623, "step": 2482 }, { "epoch": 0.9630563366624648, "grad_norm": 2.6452251757071332, "learning_rate": 4.1315402209968766e-08, "loss": 0.696, "step": 2483 }, { "epoch": 0.9634441966450111, "grad_norm": 2.218087165798752, "learning_rate": 4.045135070893047e-08, "loss": 0.6084, "step": 2484 }, { "epoch": 0.9638320566275574, "grad_norm": 2.8952074811047703, "learning_rate": 3.9596393436432954e-08, "loss": 0.6848, "step": 2485 }, { "epoch": 0.9642199166101038, "grad_norm": 2.314693177218842, "learning_rate": 3.8750531960194405e-08, "loss": 0.6374, "step": 2486 }, { "epoch": 0.9646077765926501, "grad_norm": 2.424110728648644, "learning_rate": 3.791376783125467e-08, "loss": 0.6247, "step": 2487 }, { "epoch": 0.9649956365751964, "grad_norm": 2.6439096801205744, "learning_rate": 3.70861025839725e-08, "loss": 0.6934, "step": 2488 }, { "epoch": 0.9653834965577427, "grad_norm": 2.6316882226093883, "learning_rate": 3.62675377360211e-08, "loss": 0.6497, "step": 2489 }, { "epoch": 0.965771356540289, "grad_norm": 1.9986266243853295, "learning_rate": 3.5458074788387585e-08, "loss": 0.6144, "step": 2490 }, { "epoch": 0.9661592165228352, "grad_norm": 1.9167894468632798, "learning_rate": 3.465771522536854e-08, "loss": 0.679, "step": 2491 }, { "epoch": 0.9665470765053815, "grad_norm": 2.0570589195255717, "learning_rate": 3.386646051456721e-08, "loss": 0.6224, "step": 2492 }, { "epoch": 0.9669349364879278, "grad_norm": 1.8041163943162748, "learning_rate": 3.308431210689245e-08, "loss": 0.6201, "step": 2493 }, { "epoch": 0.9673227964704741, "grad_norm": 3.5593383028758137, "learning_rate": 3.231127143655422e-08, "loss": 0.6986, "step": 2494 }, { "epoch": 0.9677106564530205, "grad_norm": 2.6430017115781017, "learning_rate": 3.154733992106307e-08, "loss": 0.6208, "step": 2495 }, { "epoch": 0.9680985164355668, "grad_norm": 1.8495208810850283, "learning_rate": 3.0792518961225147e-08, "loss": 0.672, "step": 2496 }, { "epoch": 0.9684863764181131, "grad_norm": 2.6194608318855823, "learning_rate": 3.004680994114051e-08, "loss": 0.6917, "step": 2497 }, { "epoch": 0.9688742364006594, "grad_norm": 2.2943434567684102, "learning_rate": 2.9310214228202016e-08, "loss": 0.6257, "step": 2498 }, { "epoch": 0.9692620963832057, "grad_norm": 2.3045091281708667, "learning_rate": 2.8582733173090904e-08, "loss": 0.6453, "step": 2499 }, { "epoch": 0.9696499563657519, "grad_norm": 2.3919767805487666, "learning_rate": 2.7864368109775108e-08, "loss": 0.6051, "step": 2500 }, { "epoch": 0.9696499563657519, "eval_loss": 1.2684818506240845, "eval_runtime": 5.9104, "eval_samples_per_second": 0.169, "eval_steps_per_second": 0.169, "step": 2500 }, { "epoch": 0.9700378163482982, "grad_norm": 2.3631635289573927, "learning_rate": 2.7155120355506493e-08, "loss": 0.6201, "step": 2501 }, { "epoch": 0.9704256763308445, "grad_norm": 2.0008238020020044, "learning_rate": 2.645499121081918e-08, "loss": 0.6539, "step": 2502 }, { "epoch": 0.9708135363133908, "grad_norm": 2.550318550532624, "learning_rate": 2.5763981959526786e-08, "loss": 0.6934, "step": 2503 }, { "epoch": 0.9712013962959372, "grad_norm": 2.1751098550998313, "learning_rate": 2.5082093868718515e-08, "loss": 0.708, "step": 2504 }, { "epoch": 0.9715892562784835, "grad_norm": 2.246404853638812, "learning_rate": 2.440932818876085e-08, "loss": 0.6374, "step": 2505 }, { "epoch": 0.9719771162610298, "grad_norm": 2.439263617921088, "learning_rate": 2.3745686153290314e-08, "loss": 0.6513, "step": 2506 }, { "epoch": 0.9723649762435761, "grad_norm": 2.439845133346424, "learning_rate": 2.3091168979214595e-08, "loss": 0.6434, "step": 2507 }, { "epoch": 0.9727528362261224, "grad_norm": 2.1425731609454033, "learning_rate": 2.2445777866709208e-08, "loss": 0.646, "step": 2508 }, { "epoch": 0.9731406962086687, "grad_norm": 2.3318882410085955, "learning_rate": 2.1809513999215825e-08, "loss": 0.6923, "step": 2509 }, { "epoch": 0.9735285561912149, "grad_norm": 2.084184019102012, "learning_rate": 2.1182378543438408e-08, "loss": 0.6551, "step": 2510 }, { "epoch": 0.9739164161737612, "grad_norm": 2.4161555754209867, "learning_rate": 2.0564372649343743e-08, "loss": 0.5976, "step": 2511 }, { "epoch": 0.9743042761563075, "grad_norm": 2.105183966175184, "learning_rate": 1.9955497450157012e-08, "loss": 0.5811, "step": 2512 }, { "epoch": 0.9746921361388539, "grad_norm": 2.9308933962965864, "learning_rate": 1.935575406236123e-08, "loss": 0.6148, "step": 2513 }, { "epoch": 0.9750799961214002, "grad_norm": 2.3021720991557113, "learning_rate": 1.8765143585693924e-08, "loss": 0.6795, "step": 2514 }, { "epoch": 0.9754678561039465, "grad_norm": 2.4322749637338914, "learning_rate": 1.8183667103146007e-08, "loss": 0.6611, "step": 2515 }, { "epoch": 0.9758557160864928, "grad_norm": 2.4318750647665484, "learning_rate": 1.7611325680960133e-08, "loss": 0.6292, "step": 2516 }, { "epoch": 0.9762435760690391, "grad_norm": 2.670781818992549, "learning_rate": 1.7048120368627352e-08, "loss": 0.7234, "step": 2517 }, { "epoch": 0.9766314360515854, "grad_norm": 2.0741340150258862, "learning_rate": 1.6494052198886557e-08, "loss": 0.6145, "step": 2518 }, { "epoch": 0.9770192960341316, "grad_norm": 2.1946649785809367, "learning_rate": 1.5949122187721154e-08, "loss": 0.6158, "step": 2519 }, { "epoch": 0.9774071560166779, "grad_norm": 2.2804720930680005, "learning_rate": 1.541333133436018e-08, "loss": 0.6627, "step": 2520 }, { "epoch": 0.9777950159992242, "grad_norm": 2.9887041745552327, "learning_rate": 1.4886680621271631e-08, "loss": 0.688, "step": 2521 }, { "epoch": 0.9781828759817706, "grad_norm": 1.9748586153402536, "learning_rate": 1.4369171014165795e-08, "loss": 0.6098, "step": 2522 }, { "epoch": 0.9785707359643169, "grad_norm": 2.711123104796451, "learning_rate": 1.3860803461989148e-08, "loss": 0.6656, "step": 2523 }, { "epoch": 0.9789585959468632, "grad_norm": 1.8025326256897136, "learning_rate": 1.336157889692602e-08, "loss": 0.6763, "step": 2524 }, { "epoch": 0.9793464559294095, "grad_norm": 2.481618080616605, "learning_rate": 1.2871498234394707e-08, "loss": 0.633, "step": 2525 }, { "epoch": 0.9797343159119558, "grad_norm": 3.044412672062187, "learning_rate": 1.2390562373046367e-08, "loss": 0.6515, "step": 2526 }, { "epoch": 0.9801221758945021, "grad_norm": 2.186035849116284, "learning_rate": 1.1918772194764449e-08, "loss": 0.6691, "step": 2527 }, { "epoch": 0.9805100358770484, "grad_norm": 2.688713492956192, "learning_rate": 1.1456128564660273e-08, "loss": 0.6412, "step": 2528 }, { "epoch": 0.9808978958595946, "grad_norm": 2.8502252031107496, "learning_rate": 1.1002632331076346e-08, "loss": 0.7316, "step": 2529 }, { "epoch": 0.981285755842141, "grad_norm": 1.9399812226359125, "learning_rate": 1.0558284325578038e-08, "loss": 0.5962, "step": 2530 }, { "epoch": 0.9816736158246873, "grad_norm": 2.221852649815676, "learning_rate": 1.012308536295914e-08, "loss": 0.6301, "step": 2531 }, { "epoch": 0.9820614758072336, "grad_norm": 2.6273258711987526, "learning_rate": 9.69703624123519e-09, "loss": 0.6622, "step": 2532 }, { "epoch": 0.9824493357897799, "grad_norm": 1.854873330824356, "learning_rate": 9.280137741643492e-09, "loss": 0.6406, "step": 2533 }, { "epoch": 0.9828371957723262, "grad_norm": 3.4360975739294375, "learning_rate": 8.872390628643645e-09, "loss": 0.7435, "step": 2534 }, { "epoch": 0.9832250557548725, "grad_norm": 2.658456592477646, "learning_rate": 8.473795649913686e-09, "loss": 0.6485, "step": 2535 }, { "epoch": 0.9836129157374188, "grad_norm": 2.3704643735115143, "learning_rate": 8.084353536348955e-09, "loss": 0.7343, "step": 2536 }, { "epoch": 0.9840007757199651, "grad_norm": 3.0252234083900995, "learning_rate": 7.704065002062112e-09, "loss": 0.6138, "step": 2537 }, { "epoch": 0.9843886357025113, "grad_norm": 2.4047630187551903, "learning_rate": 7.332930744380906e-09, "loss": 0.6528, "step": 2538 }, { "epoch": 0.9847764956850577, "grad_norm": 2.634364111232844, "learning_rate": 6.9709514438470735e-09, "loss": 0.664, "step": 2539 }, { "epoch": 0.985164355667604, "grad_norm": 2.8237563410777358, "learning_rate": 6.618127764215221e-09, "loss": 0.6441, "step": 2540 }, { "epoch": 0.9855522156501503, "grad_norm": 2.646779617430198, "learning_rate": 6.274460352451162e-09, "loss": 0.6648, "step": 2541 }, { "epoch": 0.9859400756326966, "grad_norm": 1.918714965529199, "learning_rate": 5.939949838731363e-09, "loss": 0.6371, "step": 2542 }, { "epoch": 0.9863279356152429, "grad_norm": 2.4256591328901216, "learning_rate": 5.614596836440722e-09, "loss": 0.7236, "step": 2543 }, { "epoch": 0.9867157955977892, "grad_norm": 2.931235658570634, "learning_rate": 5.298401942173681e-09, "loss": 0.6397, "step": 2544 }, { "epoch": 0.9871036555803355, "grad_norm": 1.6685498421082987, "learning_rate": 4.991365735730336e-09, "loss": 0.5959, "step": 2545 }, { "epoch": 0.9874915155628818, "grad_norm": 2.3119081622094098, "learning_rate": 4.69348878011644e-09, "loss": 0.6564, "step": 2546 }, { "epoch": 0.9878793755454282, "grad_norm": 2.640701320017966, "learning_rate": 4.4047716215439575e-09, "loss": 0.6609, "step": 2547 }, { "epoch": 0.9882672355279744, "grad_norm": 2.246165758245353, "learning_rate": 4.125214789427734e-09, "loss": 0.6643, "step": 2548 }, { "epoch": 0.9886550955105207, "grad_norm": 2.5350503487600697, "learning_rate": 3.854818796385495e-09, "loss": 0.6477, "step": 2549 }, { "epoch": 0.989042955493067, "grad_norm": 2.167458888606388, "learning_rate": 3.593584138237294e-09, "loss": 0.6557, "step": 2550 }, { "epoch": 0.9894308154756133, "grad_norm": 2.4627439231538775, "learning_rate": 3.341511294004396e-09, "loss": 0.7128, "step": 2551 }, { "epoch": 0.9898186754581596, "grad_norm": 2.8837926326184173, "learning_rate": 3.098600725907619e-09, "loss": 0.6595, "step": 2552 }, { "epoch": 0.9902065354407059, "grad_norm": 2.0827473590861527, "learning_rate": 2.86485287936733e-09, "loss": 0.6731, "step": 2553 }, { "epoch": 0.9905943954232522, "grad_norm": 2.263875384836565, "learning_rate": 2.640268183002337e-09, "loss": 0.6521, "step": 2554 }, { "epoch": 0.9909822554057985, "grad_norm": 2.3750120481435846, "learning_rate": 2.424847048629886e-09, "loss": 0.6879, "step": 2555 }, { "epoch": 0.9913701153883449, "grad_norm": 1.8347703651233918, "learning_rate": 2.2185898712628884e-09, "loss": 0.5991, "step": 2556 }, { "epoch": 0.991757975370891, "grad_norm": 2.080962096443903, "learning_rate": 2.02149702911103e-09, "loss": 0.6611, "step": 2557 }, { "epoch": 0.9921458353534374, "grad_norm": 2.3155878367180276, "learning_rate": 1.8335688835802169e-09, "loss": 0.6847, "step": 2558 }, { "epoch": 0.9925336953359837, "grad_norm": 2.3273156210239963, "learning_rate": 1.654805779270352e-09, "loss": 0.6268, "step": 2559 }, { "epoch": 0.99292155531853, "grad_norm": 1.8308157340286888, "learning_rate": 1.4852080439758943e-09, "loss": 0.6314, "step": 2560 }, { "epoch": 0.9933094153010763, "grad_norm": 2.149726457555682, "learning_rate": 1.3247759886853006e-09, "loss": 0.6011, "step": 2561 }, { "epoch": 0.9936972752836226, "grad_norm": 3.0890850206072997, "learning_rate": 1.173509907579362e-09, "loss": 0.6955, "step": 2562 }, { "epoch": 0.9940851352661689, "grad_norm": 3.4625380010649724, "learning_rate": 1.0314100780317583e-09, "loss": 0.6694, "step": 2563 }, { "epoch": 0.9944729952487152, "grad_norm": 2.318556952119189, "learning_rate": 8.984767606085021e-10, "loss": 0.6962, "step": 2564 }, { "epoch": 0.9948608552312616, "grad_norm": 2.936117130521223, "learning_rate": 7.747101990662753e-10, "loss": 0.6497, "step": 2565 }, { "epoch": 0.9952487152138079, "grad_norm": 2.472879132294535, "learning_rate": 6.601106203535379e-10, "loss": 0.6837, "step": 2566 }, { "epoch": 0.9956365751963541, "grad_norm": 2.1141337311579482, "learning_rate": 5.546782346099733e-10, "loss": 0.6611, "step": 2567 }, { "epoch": 0.9960244351789004, "grad_norm": 2.9483369978486245, "learning_rate": 4.5841323516426784e-10, "loss": 0.6483, "step": 2568 }, { "epoch": 0.9964122951614467, "grad_norm": 1.7025080692624572, "learning_rate": 3.713157985363314e-10, "loss": 0.5972, "step": 2569 }, { "epoch": 0.996800155143993, "grad_norm": 2.287641182152908, "learning_rate": 2.9338608443452154e-10, "loss": 0.6228, "step": 2570 }, { "epoch": 0.9971880151265393, "grad_norm": 2.2310565863299576, "learning_rate": 2.2462423575675408e-10, "loss": 0.7, "step": 2571 }, { "epoch": 0.9975758751090856, "grad_norm": 2.530357513229175, "learning_rate": 1.6503037859105787e-10, "loss": 0.6652, "step": 2572 }, { "epoch": 0.9979637350916319, "grad_norm": 2.2379556315224773, "learning_rate": 1.1460462221279944e-10, "loss": 0.6165, "step": 2573 }, { "epoch": 0.9983515950741783, "grad_norm": 1.9656916374685878, "learning_rate": 7.334705908745854e-11, "loss": 0.6052, "step": 2574 }, { "epoch": 0.9987394550567246, "grad_norm": 2.4482138096366395, "learning_rate": 4.125776486785249e-11, "loss": 0.6994, "step": 2575 }, { "epoch": 0.9991273150392708, "grad_norm": 1.942125342111473, "learning_rate": 1.8336798395801604e-11, "loss": 0.6217, "step": 2576 }, { "epoch": 0.9995151750218171, "grad_norm": 1.948033361320386, "learning_rate": 4.584201700463808e-12, "loss": 0.6392, "step": 2577 }, { "epoch": 0.9999030350043634, "grad_norm": 1.859673516310116, "learning_rate": 0.0, "loss": 0.5821, "step": 2578 }, { "epoch": 0.9999030350043634, "step": 2578, "total_flos": 7.318297512614298e+16, "train_loss": 0.753098239239285, "train_runtime": 80645.5593, "train_samples_per_second": 4.092, "train_steps_per_second": 0.032 } ], "logging_steps": 1.0, "max_steps": 2578, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.318297512614298e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }