{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999030350043634, "eval_steps": 500, "global_step": 2578, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003878599825463008, "grad_norm": 58.88456148983927, "learning_rate": 3.875968992248062e-08, "loss": 3.3661, "step": 1 }, { "epoch": 0.0007757199650926016, "grad_norm": 48.79420317190484, "learning_rate": 7.751937984496124e-08, "loss": 3.3932, "step": 2 }, { "epoch": 0.0011635799476389023, "grad_norm": 58.954661617539344, "learning_rate": 1.1627906976744187e-07, "loss": 3.4041, "step": 3 }, { "epoch": 0.0015514399301852031, "grad_norm": 56.80570931728183, "learning_rate": 1.5503875968992249e-07, "loss": 3.5895, "step": 4 }, { "epoch": 0.0019392999127315039, "grad_norm": 54.318595731253936, "learning_rate": 1.9379844961240311e-07, "loss": 3.511, "step": 5 }, { "epoch": 0.0023271598952778047, "grad_norm": 54.42041068270547, "learning_rate": 2.3255813953488374e-07, "loss": 3.3892, "step": 6 }, { "epoch": 0.0027150198778241054, "grad_norm": 55.023302867713596, "learning_rate": 2.7131782945736437e-07, "loss": 3.3395, "step": 7 }, { "epoch": 0.0031028798603704062, "grad_norm": 65.61344426850057, "learning_rate": 3.1007751937984497e-07, "loss": 3.2531, "step": 8 }, { "epoch": 0.003490739842916707, "grad_norm": 50.19131902289769, "learning_rate": 3.488372093023256e-07, "loss": 3.2936, "step": 9 }, { "epoch": 0.0038785998254630078, "grad_norm": 52.42246572008746, "learning_rate": 3.8759689922480623e-07, "loss": 3.3049, "step": 10 }, { "epoch": 0.004266459808009309, "grad_norm": 45.20639495558827, "learning_rate": 4.2635658914728683e-07, "loss": 3.291, "step": 11 }, { "epoch": 0.004654319790555609, "grad_norm": 47.45045072154142, "learning_rate": 4.651162790697675e-07, "loss": 3.2851, "step": 12 }, { "epoch": 0.0050421797731019105, "grad_norm": 46.6039287186182, "learning_rate": 5.038759689922481e-07, "loss": 3.3033, "step": 13 }, { "epoch": 0.005430039755648211, "grad_norm": 41.6216878379476, "learning_rate": 5.426356589147287e-07, "loss": 3.0617, "step": 14 }, { "epoch": 0.005817899738194512, "grad_norm": 42.34912819319958, "learning_rate": 5.813953488372094e-07, "loss": 3.1743, "step": 15 }, { "epoch": 0.0062057597207408124, "grad_norm": 41.028134119137725, "learning_rate": 6.201550387596899e-07, "loss": 3.1087, "step": 16 }, { "epoch": 0.006593619703287114, "grad_norm": 38.5489352254613, "learning_rate": 6.589147286821707e-07, "loss": 3.1329, "step": 17 }, { "epoch": 0.006981479685833414, "grad_norm": 36.46323171977231, "learning_rate": 6.976744186046513e-07, "loss": 2.9568, "step": 18 }, { "epoch": 0.007369339668379715, "grad_norm": 35.063669459927176, "learning_rate": 7.364341085271319e-07, "loss": 3.0095, "step": 19 }, { "epoch": 0.0077571996509260156, "grad_norm": 34.747322226171754, "learning_rate": 7.751937984496125e-07, "loss": 2.887, "step": 20 }, { "epoch": 0.008145059633472316, "grad_norm": 35.42880988369524, "learning_rate": 8.139534883720931e-07, "loss": 2.8756, "step": 21 }, { "epoch": 0.008532919616018618, "grad_norm": 23.98104635540677, "learning_rate": 8.527131782945737e-07, "loss": 2.354, "step": 22 }, { "epoch": 0.008920779598564918, "grad_norm": 23.061747313573495, "learning_rate": 8.914728682170544e-07, "loss": 2.4368, "step": 23 }, { "epoch": 0.009308639581111219, "grad_norm": 20.786661774933837, "learning_rate": 9.30232558139535e-07, "loss": 2.3424, "step": 24 }, { "epoch": 0.009696499563657519, "grad_norm": 22.35533604075333, "learning_rate": 9.689922480620157e-07, "loss": 2.3974, "step": 25 }, { "epoch": 0.010084359546203821, "grad_norm": 19.727904225298435, "learning_rate": 1.0077519379844962e-06, "loss": 2.18, "step": 26 }, { "epoch": 0.010472219528750121, "grad_norm": 19.022933712801716, "learning_rate": 1.0465116279069768e-06, "loss": 2.2499, "step": 27 }, { "epoch": 0.010860079511296422, "grad_norm": 18.032867308401627, "learning_rate": 1.0852713178294575e-06, "loss": 2.2128, "step": 28 }, { "epoch": 0.011247939493842722, "grad_norm": 13.710182046003846, "learning_rate": 1.1240310077519381e-06, "loss": 2.0547, "step": 29 }, { "epoch": 0.011635799476389024, "grad_norm": 10.820048496387537, "learning_rate": 1.1627906976744188e-06, "loss": 1.7315, "step": 30 }, { "epoch": 0.012023659458935325, "grad_norm": 13.638537811728025, "learning_rate": 1.2015503875968994e-06, "loss": 1.8748, "step": 31 }, { "epoch": 0.012411519441481625, "grad_norm": 13.581533067015044, "learning_rate": 1.2403100775193799e-06, "loss": 1.8964, "step": 32 }, { "epoch": 0.012799379424027925, "grad_norm": 11.6687204363301, "learning_rate": 1.2790697674418605e-06, "loss": 1.6855, "step": 33 }, { "epoch": 0.013187239406574227, "grad_norm": 8.924971882484117, "learning_rate": 1.3178294573643414e-06, "loss": 1.5853, "step": 34 }, { "epoch": 0.013575099389120528, "grad_norm": 8.109272634651157, "learning_rate": 1.3565891472868216e-06, "loss": 1.641, "step": 35 }, { "epoch": 0.013962959371666828, "grad_norm": 9.12757920125405, "learning_rate": 1.3953488372093025e-06, "loss": 1.6873, "step": 36 }, { "epoch": 0.014350819354213128, "grad_norm": 9.605065684627974, "learning_rate": 1.4341085271317832e-06, "loss": 1.6037, "step": 37 }, { "epoch": 0.01473867933675943, "grad_norm": 6.726567840077582, "learning_rate": 1.4728682170542638e-06, "loss": 1.6291, "step": 38 }, { "epoch": 0.01512653931930573, "grad_norm": 6.439352228120319, "learning_rate": 1.5116279069767443e-06, "loss": 1.6308, "step": 39 }, { "epoch": 0.015514399301852031, "grad_norm": 5.795143592185025, "learning_rate": 1.550387596899225e-06, "loss": 1.5617, "step": 40 }, { "epoch": 0.015902259284398333, "grad_norm": 5.7249021626397045, "learning_rate": 1.5891472868217056e-06, "loss": 1.4107, "step": 41 }, { "epoch": 0.016290119266944632, "grad_norm": 6.919473178422623, "learning_rate": 1.6279069767441862e-06, "loss": 1.4276, "step": 42 }, { "epoch": 0.016677979249490934, "grad_norm": 7.051632021467427, "learning_rate": 1.6666666666666667e-06, "loss": 1.4437, "step": 43 }, { "epoch": 0.017065839232037236, "grad_norm": 7.329899775823277, "learning_rate": 1.7054263565891473e-06, "loss": 1.3768, "step": 44 }, { "epoch": 0.017453699214583535, "grad_norm": 4.792166260364399, "learning_rate": 1.7441860465116282e-06, "loss": 1.1996, "step": 45 }, { "epoch": 0.017841559197129837, "grad_norm": 5.185656171722984, "learning_rate": 1.7829457364341088e-06, "loss": 1.4373, "step": 46 }, { "epoch": 0.018229419179676135, "grad_norm": 5.3314959026557505, "learning_rate": 1.8217054263565893e-06, "loss": 1.3756, "step": 47 }, { "epoch": 0.018617279162222437, "grad_norm": 5.183093444453678, "learning_rate": 1.86046511627907e-06, "loss": 1.2133, "step": 48 }, { "epoch": 0.01900513914476874, "grad_norm": 4.560850945305639, "learning_rate": 1.8992248062015506e-06, "loss": 1.3043, "step": 49 }, { "epoch": 0.019392999127315038, "grad_norm": 4.754059159833188, "learning_rate": 1.9379844961240315e-06, "loss": 1.2664, "step": 50 }, { "epoch": 0.01978085910986134, "grad_norm": 5.86816845019291, "learning_rate": 1.976744186046512e-06, "loss": 1.2045, "step": 51 }, { "epoch": 0.020168719092407642, "grad_norm": 5.109705717837876, "learning_rate": 2.0155038759689923e-06, "loss": 1.2059, "step": 52 }, { "epoch": 0.02055657907495394, "grad_norm": 3.9219993147941343, "learning_rate": 2.054263565891473e-06, "loss": 1.2188, "step": 53 }, { "epoch": 0.020944439057500243, "grad_norm": 4.079732503594257, "learning_rate": 2.0930232558139536e-06, "loss": 1.1666, "step": 54 }, { "epoch": 0.02133229904004654, "grad_norm": 4.2692431863213685, "learning_rate": 2.131782945736434e-06, "loss": 1.1824, "step": 55 }, { "epoch": 0.021720159022592844, "grad_norm": 4.863048499147377, "learning_rate": 2.170542635658915e-06, "loss": 1.2346, "step": 56 }, { "epoch": 0.022108019005139146, "grad_norm": 4.020251403567047, "learning_rate": 2.2093023255813954e-06, "loss": 1.1705, "step": 57 }, { "epoch": 0.022495878987685444, "grad_norm": 5.504230146655512, "learning_rate": 2.2480620155038763e-06, "loss": 1.2837, "step": 58 }, { "epoch": 0.022883738970231746, "grad_norm": 5.3368103057614, "learning_rate": 2.2868217054263567e-06, "loss": 1.2414, "step": 59 }, { "epoch": 0.02327159895277805, "grad_norm": 4.509839793857036, "learning_rate": 2.3255813953488376e-06, "loss": 1.1608, "step": 60 }, { "epoch": 0.023659458935324347, "grad_norm": 3.8739766035852883, "learning_rate": 2.364341085271318e-06, "loss": 1.1814, "step": 61 }, { "epoch": 0.02404731891787065, "grad_norm": 4.230786334913567, "learning_rate": 2.403100775193799e-06, "loss": 1.1996, "step": 62 }, { "epoch": 0.02443517890041695, "grad_norm": 4.5956171594214394, "learning_rate": 2.4418604651162793e-06, "loss": 1.1161, "step": 63 }, { "epoch": 0.02482303888296325, "grad_norm": 3.082143430124541, "learning_rate": 2.4806201550387598e-06, "loss": 1.1155, "step": 64 }, { "epoch": 0.025210898865509552, "grad_norm": 5.463609789603021, "learning_rate": 2.5193798449612406e-06, "loss": 1.2168, "step": 65 }, { "epoch": 0.02559875884805585, "grad_norm": 3.0630896724128904, "learning_rate": 2.558139534883721e-06, "loss": 1.0964, "step": 66 }, { "epoch": 0.025986618830602153, "grad_norm": 3.8403908160581968, "learning_rate": 2.596899224806202e-06, "loss": 1.1052, "step": 67 }, { "epoch": 0.026374478813148455, "grad_norm": 3.572947267066657, "learning_rate": 2.635658914728683e-06, "loss": 0.9751, "step": 68 }, { "epoch": 0.026762338795694753, "grad_norm": 3.4869225269101674, "learning_rate": 2.674418604651163e-06, "loss": 1.1368, "step": 69 }, { "epoch": 0.027150198778241055, "grad_norm": 2.968152999763861, "learning_rate": 2.7131782945736433e-06, "loss": 1.0828, "step": 70 }, { "epoch": 0.027538058760787357, "grad_norm": 3.3383935464293417, "learning_rate": 2.751937984496124e-06, "loss": 1.0196, "step": 71 }, { "epoch": 0.027925918743333656, "grad_norm": 3.05584505506979, "learning_rate": 2.790697674418605e-06, "loss": 1.0359, "step": 72 }, { "epoch": 0.028313778725879958, "grad_norm": 3.981068505018593, "learning_rate": 2.8294573643410855e-06, "loss": 1.0565, "step": 73 }, { "epoch": 0.028701638708426257, "grad_norm": 2.806725979762414, "learning_rate": 2.8682170542635663e-06, "loss": 0.9427, "step": 74 }, { "epoch": 0.02908949869097256, "grad_norm": 4.32975583484898, "learning_rate": 2.9069767441860468e-06, "loss": 1.127, "step": 75 }, { "epoch": 0.02947735867351886, "grad_norm": 2.6617420698142498, "learning_rate": 2.9457364341085276e-06, "loss": 0.9855, "step": 76 }, { "epoch": 0.02986521865606516, "grad_norm": 3.3694041984852996, "learning_rate": 2.9844961240310076e-06, "loss": 1.0466, "step": 77 }, { "epoch": 0.03025307863861146, "grad_norm": 2.7431651849763887, "learning_rate": 3.0232558139534885e-06, "loss": 0.9354, "step": 78 }, { "epoch": 0.030640938621157764, "grad_norm": 3.487444011850848, "learning_rate": 3.062015503875969e-06, "loss": 1.1286, "step": 79 }, { "epoch": 0.031028798603704062, "grad_norm": 3.640817446591573, "learning_rate": 3.10077519379845e-06, "loss": 1.0643, "step": 80 }, { "epoch": 0.03141665858625036, "grad_norm": 3.5391963451035635, "learning_rate": 3.1395348837209307e-06, "loss": 0.9638, "step": 81 }, { "epoch": 0.031804518568796666, "grad_norm": 3.3145896907096426, "learning_rate": 3.178294573643411e-06, "loss": 1.0984, "step": 82 }, { "epoch": 0.032192378551342965, "grad_norm": 2.7450986186255597, "learning_rate": 3.217054263565892e-06, "loss": 0.902, "step": 83 }, { "epoch": 0.032580238533889264, "grad_norm": 3.6054757605101058, "learning_rate": 3.2558139534883724e-06, "loss": 1.0168, "step": 84 }, { "epoch": 0.03296809851643557, "grad_norm": 3.6982392763111887, "learning_rate": 3.294573643410853e-06, "loss": 1.0114, "step": 85 }, { "epoch": 0.03335595849898187, "grad_norm": 2.943041521457906, "learning_rate": 3.3333333333333333e-06, "loss": 0.9555, "step": 86 }, { "epoch": 0.033743818481528166, "grad_norm": 2.7902648803063204, "learning_rate": 3.372093023255814e-06, "loss": 0.9119, "step": 87 }, { "epoch": 0.03413167846407447, "grad_norm": 3.7845807510089635, "learning_rate": 3.4108527131782946e-06, "loss": 0.9879, "step": 88 }, { "epoch": 0.03451953844662077, "grad_norm": 3.6550617999537933, "learning_rate": 3.4496124031007755e-06, "loss": 1.1136, "step": 89 }, { "epoch": 0.03490739842916707, "grad_norm": 2.33579344454401, "learning_rate": 3.4883720930232564e-06, "loss": 0.9685, "step": 90 }, { "epoch": 0.035295258411713375, "grad_norm": 2.6702532144509163, "learning_rate": 3.527131782945737e-06, "loss": 0.9135, "step": 91 }, { "epoch": 0.03568311839425967, "grad_norm": 3.0866039518203916, "learning_rate": 3.5658914728682177e-06, "loss": 0.9217, "step": 92 }, { "epoch": 0.03607097837680597, "grad_norm": 4.205629859116154, "learning_rate": 3.6046511627906977e-06, "loss": 0.9041, "step": 93 }, { "epoch": 0.03645883835935227, "grad_norm": 3.945882815254455, "learning_rate": 3.6434108527131786e-06, "loss": 1.0159, "step": 94 }, { "epoch": 0.036846698341898576, "grad_norm": 3.209148575186051, "learning_rate": 3.682170542635659e-06, "loss": 0.9727, "step": 95 }, { "epoch": 0.037234558324444875, "grad_norm": 3.174357373797049, "learning_rate": 3.72093023255814e-06, "loss": 0.9463, "step": 96 }, { "epoch": 0.03762241830699117, "grad_norm": 3.279199329080416, "learning_rate": 3.7596899224806203e-06, "loss": 0.9609, "step": 97 }, { "epoch": 0.03801027828953748, "grad_norm": 3.110832500976669, "learning_rate": 3.798449612403101e-06, "loss": 0.9841, "step": 98 }, { "epoch": 0.03839813827208378, "grad_norm": 2.906214521281683, "learning_rate": 3.837209302325582e-06, "loss": 1.0081, "step": 99 }, { "epoch": 0.038785998254630076, "grad_norm": 3.3911366195010464, "learning_rate": 3.875968992248063e-06, "loss": 0.908, "step": 100 }, { "epoch": 0.03917385823717638, "grad_norm": 3.715818337847017, "learning_rate": 3.914728682170543e-06, "loss": 1.0519, "step": 101 }, { "epoch": 0.03956171821972268, "grad_norm": 2.908123747732032, "learning_rate": 3.953488372093024e-06, "loss": 0.9388, "step": 102 }, { "epoch": 0.03994957820226898, "grad_norm": 3.2670559679381235, "learning_rate": 3.992248062015504e-06, "loss": 0.9509, "step": 103 }, { "epoch": 0.040337438184815284, "grad_norm": 3.572953180635382, "learning_rate": 4.031007751937985e-06, "loss": 0.938, "step": 104 }, { "epoch": 0.04072529816736158, "grad_norm": 2.846099043631193, "learning_rate": 4.0697674418604655e-06, "loss": 0.9224, "step": 105 }, { "epoch": 0.04111315814990788, "grad_norm": 4.014396788584271, "learning_rate": 4.108527131782946e-06, "loss": 0.9418, "step": 106 }, { "epoch": 0.04150101813245419, "grad_norm": 3.8531118629571264, "learning_rate": 4.1472868217054264e-06, "loss": 0.9441, "step": 107 }, { "epoch": 0.041888878115000486, "grad_norm": 4.658955777479399, "learning_rate": 4.186046511627907e-06, "loss": 0.9242, "step": 108 }, { "epoch": 0.042276738097546784, "grad_norm": 3.3995045973108184, "learning_rate": 4.224806201550387e-06, "loss": 1.0573, "step": 109 }, { "epoch": 0.04266459808009308, "grad_norm": 3.9492442082598385, "learning_rate": 4.263565891472868e-06, "loss": 1.0014, "step": 110 }, { "epoch": 0.04305245806263939, "grad_norm": 2.336592276156654, "learning_rate": 4.302325581395349e-06, "loss": 0.9456, "step": 111 }, { "epoch": 0.04344031804518569, "grad_norm": 3.2465852440295278, "learning_rate": 4.34108527131783e-06, "loss": 0.9262, "step": 112 }, { "epoch": 0.043828178027731986, "grad_norm": 3.1580406901093876, "learning_rate": 4.379844961240311e-06, "loss": 0.8847, "step": 113 }, { "epoch": 0.04421603801027829, "grad_norm": 3.1953282737127537, "learning_rate": 4.418604651162791e-06, "loss": 0.9265, "step": 114 }, { "epoch": 0.04460389799282459, "grad_norm": 2.9365116077224154, "learning_rate": 4.457364341085272e-06, "loss": 0.8671, "step": 115 }, { "epoch": 0.04499175797537089, "grad_norm": 2.826987671798214, "learning_rate": 4.4961240310077525e-06, "loss": 0.9599, "step": 116 }, { "epoch": 0.045379617957917194, "grad_norm": 3.0850799621638982, "learning_rate": 4.5348837209302326e-06, "loss": 0.941, "step": 117 }, { "epoch": 0.04576747794046349, "grad_norm": 2.2143642471145224, "learning_rate": 4.573643410852713e-06, "loss": 0.8935, "step": 118 }, { "epoch": 0.04615533792300979, "grad_norm": 2.466342487649227, "learning_rate": 4.612403100775194e-06, "loss": 0.8251, "step": 119 }, { "epoch": 0.0465431979055561, "grad_norm": 4.143536184076209, "learning_rate": 4.651162790697675e-06, "loss": 0.9799, "step": 120 }, { "epoch": 0.046931057888102395, "grad_norm": 3.5181764479941133, "learning_rate": 4.689922480620155e-06, "loss": 1.0621, "step": 121 }, { "epoch": 0.047318917870648694, "grad_norm": 3.458125226046411, "learning_rate": 4.728682170542636e-06, "loss": 0.9517, "step": 122 }, { "epoch": 0.047706777853195, "grad_norm": 3.063470368703218, "learning_rate": 4.767441860465117e-06, "loss": 0.9447, "step": 123 }, { "epoch": 0.0480946378357413, "grad_norm": 3.4983232667859765, "learning_rate": 4.806201550387598e-06, "loss": 0.9007, "step": 124 }, { "epoch": 0.0484824978182876, "grad_norm": 3.4548493265972984, "learning_rate": 4.844961240310078e-06, "loss": 0.9217, "step": 125 }, { "epoch": 0.0488703578008339, "grad_norm": 2.8317887155676305, "learning_rate": 4.883720930232559e-06, "loss": 0.9352, "step": 126 }, { "epoch": 0.0492582177833802, "grad_norm": 2.9585004546911597, "learning_rate": 4.922480620155039e-06, "loss": 0.8992, "step": 127 }, { "epoch": 0.0496460777659265, "grad_norm": 2.952733904169613, "learning_rate": 4.9612403100775195e-06, "loss": 0.9096, "step": 128 }, { "epoch": 0.0500339377484728, "grad_norm": 2.5752544542886153, "learning_rate": 5e-06, "loss": 0.95, "step": 129 }, { "epoch": 0.050421797731019104, "grad_norm": 3.0479620142404795, "learning_rate": 5.038759689922481e-06, "loss": 0.7495, "step": 130 }, { "epoch": 0.0508096577135654, "grad_norm": 3.5600399467714987, "learning_rate": 5.077519379844962e-06, "loss": 0.9288, "step": 131 }, { "epoch": 0.0511975176961117, "grad_norm": 3.042296151768515, "learning_rate": 5.116279069767442e-06, "loss": 0.872, "step": 132 }, { "epoch": 0.051585377678658006, "grad_norm": 3.4542010470451725, "learning_rate": 5.155038759689923e-06, "loss": 0.8755, "step": 133 }, { "epoch": 0.051973237661204305, "grad_norm": 3.829614886770577, "learning_rate": 5.193798449612404e-06, "loss": 0.9601, "step": 134 }, { "epoch": 0.052361097643750604, "grad_norm": 3.6262812419680213, "learning_rate": 5.232558139534885e-06, "loss": 0.8977, "step": 135 }, { "epoch": 0.05274895762629691, "grad_norm": 2.997470133055335, "learning_rate": 5.271317829457366e-06, "loss": 0.8857, "step": 136 }, { "epoch": 0.05313681760884321, "grad_norm": 3.9006839383899146, "learning_rate": 5.310077519379846e-06, "loss": 0.8983, "step": 137 }, { "epoch": 0.053524677591389506, "grad_norm": 5.994067330419478, "learning_rate": 5.348837209302326e-06, "loss": 0.9226, "step": 138 }, { "epoch": 0.05391253757393581, "grad_norm": 3.4735317332405033, "learning_rate": 5.3875968992248065e-06, "loss": 0.9045, "step": 139 }, { "epoch": 0.05430039755648211, "grad_norm": 3.4421282009662355, "learning_rate": 5.4263565891472865e-06, "loss": 0.8623, "step": 140 }, { "epoch": 0.05468825753902841, "grad_norm": 3.9920569799892247, "learning_rate": 5.465116279069767e-06, "loss": 0.9429, "step": 141 }, { "epoch": 0.055076117521574715, "grad_norm": 2.4748361233968006, "learning_rate": 5.503875968992248e-06, "loss": 0.8395, "step": 142 }, { "epoch": 0.05546397750412101, "grad_norm": 3.745083448418481, "learning_rate": 5.542635658914729e-06, "loss": 0.9535, "step": 143 }, { "epoch": 0.05585183748666731, "grad_norm": 3.4656076611012137, "learning_rate": 5.58139534883721e-06, "loss": 0.8642, "step": 144 }, { "epoch": 0.05623969746921361, "grad_norm": 2.9344157568019416, "learning_rate": 5.62015503875969e-06, "loss": 0.9124, "step": 145 }, { "epoch": 0.056627557451759916, "grad_norm": 3.2758047156627366, "learning_rate": 5.658914728682171e-06, "loss": 0.9237, "step": 146 }, { "epoch": 0.057015417434306215, "grad_norm": 2.835662057768622, "learning_rate": 5.697674418604652e-06, "loss": 0.9108, "step": 147 }, { "epoch": 0.05740327741685251, "grad_norm": 3.596055143151447, "learning_rate": 5.736434108527133e-06, "loss": 0.8522, "step": 148 }, { "epoch": 0.05779113739939882, "grad_norm": 2.9293927884593227, "learning_rate": 5.7751937984496135e-06, "loss": 0.8433, "step": 149 }, { "epoch": 0.05817899738194512, "grad_norm": 3.3663494964749794, "learning_rate": 5.8139534883720935e-06, "loss": 0.8559, "step": 150 }, { "epoch": 0.058566857364491416, "grad_norm": 2.61567420952768, "learning_rate": 5.852713178294574e-06, "loss": 0.8027, "step": 151 }, { "epoch": 0.05895471734703772, "grad_norm": 2.7899636656736315, "learning_rate": 5.891472868217055e-06, "loss": 0.8515, "step": 152 }, { "epoch": 0.05934257732958402, "grad_norm": 2.4044547231288234, "learning_rate": 5.930232558139536e-06, "loss": 0.8713, "step": 153 }, { "epoch": 0.05973043731213032, "grad_norm": 4.009670291390418, "learning_rate": 5.968992248062015e-06, "loss": 0.9421, "step": 154 }, { "epoch": 0.060118297294676624, "grad_norm": 3.493347747326293, "learning_rate": 6.007751937984496e-06, "loss": 0.8798, "step": 155 }, { "epoch": 0.06050615727722292, "grad_norm": 3.328695072549328, "learning_rate": 6.046511627906977e-06, "loss": 0.8305, "step": 156 }, { "epoch": 0.06089401725976922, "grad_norm": 3.677740137416495, "learning_rate": 6.085271317829458e-06, "loss": 0.9392, "step": 157 }, { "epoch": 0.06128187724231553, "grad_norm": 3.783479164385208, "learning_rate": 6.124031007751938e-06, "loss": 0.9552, "step": 158 }, { "epoch": 0.061669737224861826, "grad_norm": 2.7855693394581267, "learning_rate": 6.162790697674419e-06, "loss": 0.8532, "step": 159 }, { "epoch": 0.062057597207408124, "grad_norm": 3.772848143120936, "learning_rate": 6.2015503875969e-06, "loss": 0.9514, "step": 160 }, { "epoch": 0.06244545718995442, "grad_norm": 3.3526409329156412, "learning_rate": 6.2403100775193805e-06, "loss": 0.9589, "step": 161 }, { "epoch": 0.06283331717250072, "grad_norm": 3.9556533232278763, "learning_rate": 6.279069767441861e-06, "loss": 0.9546, "step": 162 }, { "epoch": 0.06322117715504703, "grad_norm": 3.941709830256763, "learning_rate": 6.317829457364341e-06, "loss": 0.9143, "step": 163 }, { "epoch": 0.06360903713759333, "grad_norm": 2.593940266512582, "learning_rate": 6.356589147286822e-06, "loss": 0.8237, "step": 164 }, { "epoch": 0.06399689712013963, "grad_norm": 3.600398118269131, "learning_rate": 6.395348837209303e-06, "loss": 0.8394, "step": 165 }, { "epoch": 0.06438475710268593, "grad_norm": 3.1196156475339047, "learning_rate": 6.434108527131784e-06, "loss": 0.8202, "step": 166 }, { "epoch": 0.06477261708523223, "grad_norm": 2.697369342081127, "learning_rate": 6.472868217054265e-06, "loss": 0.813, "step": 167 }, { "epoch": 0.06516047706777853, "grad_norm": 3.615937018239469, "learning_rate": 6.511627906976745e-06, "loss": 0.8824, "step": 168 }, { "epoch": 0.06554833705032483, "grad_norm": 2.5823274689670046, "learning_rate": 6.550387596899226e-06, "loss": 0.8524, "step": 169 }, { "epoch": 0.06593619703287114, "grad_norm": 2.7548801560061604, "learning_rate": 6.589147286821706e-06, "loss": 0.8481, "step": 170 }, { "epoch": 0.06632405701541744, "grad_norm": 19.47644808948743, "learning_rate": 6.627906976744186e-06, "loss": 0.8502, "step": 171 }, { "epoch": 0.06671191699796374, "grad_norm": 3.700415392435054, "learning_rate": 6.666666666666667e-06, "loss": 0.8511, "step": 172 }, { "epoch": 0.06709977698051003, "grad_norm": 3.9392592867410414, "learning_rate": 6.7054263565891475e-06, "loss": 0.8947, "step": 173 }, { "epoch": 0.06748763696305633, "grad_norm": 3.221892215806657, "learning_rate": 6.744186046511628e-06, "loss": 0.8999, "step": 174 }, { "epoch": 0.06787549694560263, "grad_norm": 3.0440899204325116, "learning_rate": 6.782945736434109e-06, "loss": 0.9254, "step": 175 }, { "epoch": 0.06826335692814894, "grad_norm": 4.669086561341695, "learning_rate": 6.821705426356589e-06, "loss": 0.9288, "step": 176 }, { "epoch": 0.06865121691069524, "grad_norm": 2.947710041370068, "learning_rate": 6.86046511627907e-06, "loss": 0.8091, "step": 177 }, { "epoch": 0.06903907689324154, "grad_norm": 3.7476539905861124, "learning_rate": 6.899224806201551e-06, "loss": 0.8398, "step": 178 }, { "epoch": 0.06942693687578784, "grad_norm": 3.1116809872799114, "learning_rate": 6.937984496124032e-06, "loss": 0.8777, "step": 179 }, { "epoch": 0.06981479685833414, "grad_norm": 2.898386053200124, "learning_rate": 6.976744186046513e-06, "loss": 0.8347, "step": 180 }, { "epoch": 0.07020265684088044, "grad_norm": 3.5203191059449788, "learning_rate": 7.015503875968993e-06, "loss": 0.8629, "step": 181 }, { "epoch": 0.07059051682342675, "grad_norm": 2.9566687376259813, "learning_rate": 7.054263565891474e-06, "loss": 0.8471, "step": 182 }, { "epoch": 0.07097837680597305, "grad_norm": 2.584943517663247, "learning_rate": 7.0930232558139545e-06, "loss": 0.8076, "step": 183 }, { "epoch": 0.07136623678851935, "grad_norm": 16.246810099691302, "learning_rate": 7.131782945736435e-06, "loss": 0.8642, "step": 184 }, { "epoch": 0.07175409677106565, "grad_norm": 3.67388834460793, "learning_rate": 7.170542635658916e-06, "loss": 0.9061, "step": 185 }, { "epoch": 0.07214195675361194, "grad_norm": 3.221079071973996, "learning_rate": 7.209302325581395e-06, "loss": 0.9302, "step": 186 }, { "epoch": 0.07252981673615824, "grad_norm": 3.8328146953057085, "learning_rate": 7.248062015503876e-06, "loss": 0.8701, "step": 187 }, { "epoch": 0.07291767671870454, "grad_norm": 3.2705560050477516, "learning_rate": 7.286821705426357e-06, "loss": 0.8823, "step": 188 }, { "epoch": 0.07330553670125085, "grad_norm": 3.013187557771827, "learning_rate": 7.325581395348837e-06, "loss": 0.8337, "step": 189 }, { "epoch": 0.07369339668379715, "grad_norm": 3.026649776606826, "learning_rate": 7.364341085271318e-06, "loss": 0.8206, "step": 190 }, { "epoch": 0.07408125666634345, "grad_norm": 4.821673888846116, "learning_rate": 7.403100775193799e-06, "loss": 0.8462, "step": 191 }, { "epoch": 0.07446911664888975, "grad_norm": 10.57622162380368, "learning_rate": 7.44186046511628e-06, "loss": 0.8846, "step": 192 }, { "epoch": 0.07485697663143605, "grad_norm": 191.0836860179865, "learning_rate": 7.480620155038761e-06, "loss": 0.9005, "step": 193 }, { "epoch": 0.07524483661398235, "grad_norm": 13.958535370307747, "learning_rate": 7.519379844961241e-06, "loss": 0.8072, "step": 194 }, { "epoch": 0.07563269659652866, "grad_norm": 27.131812645039584, "learning_rate": 7.5581395348837215e-06, "loss": 0.9499, "step": 195 }, { "epoch": 0.07602055657907496, "grad_norm": 11.705696558583531, "learning_rate": 7.596899224806202e-06, "loss": 0.9094, "step": 196 }, { "epoch": 0.07640841656162126, "grad_norm": 7.238970648214874, "learning_rate": 7.635658914728683e-06, "loss": 0.8858, "step": 197 }, { "epoch": 0.07679627654416755, "grad_norm": 7.591036159891361, "learning_rate": 7.674418604651164e-06, "loss": 0.901, "step": 198 }, { "epoch": 0.07718413652671385, "grad_norm": 3.195167061701358, "learning_rate": 7.713178294573645e-06, "loss": 0.8195, "step": 199 }, { "epoch": 0.07757199650926015, "grad_norm": 10.899126141303494, "learning_rate": 7.751937984496126e-06, "loss": 0.881, "step": 200 }, { "epoch": 0.07795985649180646, "grad_norm": 3.1306834863329156, "learning_rate": 7.790697674418605e-06, "loss": 0.9007, "step": 201 }, { "epoch": 0.07834771647435276, "grad_norm": 3.476517869851485, "learning_rate": 7.829457364341086e-06, "loss": 0.8416, "step": 202 }, { "epoch": 0.07873557645689906, "grad_norm": 2.5712554086158095, "learning_rate": 7.868217054263567e-06, "loss": 0.8392, "step": 203 }, { "epoch": 0.07912343643944536, "grad_norm": 3.033304200680951, "learning_rate": 7.906976744186048e-06, "loss": 0.8452, "step": 204 }, { "epoch": 0.07951129642199166, "grad_norm": 3.6162497650172147, "learning_rate": 7.945736434108527e-06, "loss": 0.85, "step": 205 }, { "epoch": 0.07989915640453796, "grad_norm": 2.8514472753255626, "learning_rate": 7.984496124031008e-06, "loss": 0.7805, "step": 206 }, { "epoch": 0.08028701638708426, "grad_norm": 3.042926505172788, "learning_rate": 8.023255813953488e-06, "loss": 0.8133, "step": 207 }, { "epoch": 0.08067487636963057, "grad_norm": 3.8866148164953356, "learning_rate": 8.06201550387597e-06, "loss": 0.8715, "step": 208 }, { "epoch": 0.08106273635217687, "grad_norm": 18.208468413769744, "learning_rate": 8.10077519379845e-06, "loss": 0.7671, "step": 209 }, { "epoch": 0.08145059633472317, "grad_norm": 3.195911265474544, "learning_rate": 8.139534883720931e-06, "loss": 0.8095, "step": 210 }, { "epoch": 0.08183845631726946, "grad_norm": 2.3212628812937024, "learning_rate": 8.178294573643412e-06, "loss": 0.9249, "step": 211 }, { "epoch": 0.08222631629981576, "grad_norm": 3.0743609368381217, "learning_rate": 8.217054263565893e-06, "loss": 0.8896, "step": 212 }, { "epoch": 0.08261417628236206, "grad_norm": 2.6247517451322997, "learning_rate": 8.255813953488374e-06, "loss": 0.8507, "step": 213 }, { "epoch": 0.08300203626490837, "grad_norm": 3.035563424906701, "learning_rate": 8.294573643410853e-06, "loss": 0.8419, "step": 214 }, { "epoch": 0.08338989624745467, "grad_norm": 2.7001720535751867, "learning_rate": 8.333333333333334e-06, "loss": 0.7869, "step": 215 }, { "epoch": 0.08377775623000097, "grad_norm": 3.160146434980059, "learning_rate": 8.372093023255815e-06, "loss": 0.8542, "step": 216 }, { "epoch": 0.08416561621254727, "grad_norm": 2.879110848817128, "learning_rate": 8.410852713178295e-06, "loss": 0.8941, "step": 217 }, { "epoch": 0.08455347619509357, "grad_norm": 2.9405451879810838, "learning_rate": 8.449612403100775e-06, "loss": 0.8271, "step": 218 }, { "epoch": 0.08494133617763987, "grad_norm": 3.0863478229360037, "learning_rate": 8.488372093023256e-06, "loss": 0.8426, "step": 219 }, { "epoch": 0.08532919616018617, "grad_norm": 3.0899648742951484, "learning_rate": 8.527131782945736e-06, "loss": 0.8356, "step": 220 }, { "epoch": 0.08571705614273248, "grad_norm": 3.040225343938285, "learning_rate": 8.565891472868217e-06, "loss": 0.822, "step": 221 }, { "epoch": 0.08610491612527878, "grad_norm": 3.742215771347257, "learning_rate": 8.604651162790698e-06, "loss": 0.9189, "step": 222 }, { "epoch": 0.08649277610782508, "grad_norm": 3.1381643224984854, "learning_rate": 8.643410852713179e-06, "loss": 0.8024, "step": 223 }, { "epoch": 0.08688063609037137, "grad_norm": 3.8387026389392958, "learning_rate": 8.68217054263566e-06, "loss": 0.7683, "step": 224 }, { "epoch": 0.08726849607291767, "grad_norm": 4.095555229558274, "learning_rate": 8.72093023255814e-06, "loss": 0.9345, "step": 225 }, { "epoch": 0.08765635605546397, "grad_norm": 3.5093874341939437, "learning_rate": 8.759689922480622e-06, "loss": 0.8283, "step": 226 }, { "epoch": 0.08804421603801028, "grad_norm": 3.1465531301820238, "learning_rate": 8.7984496124031e-06, "loss": 0.8382, "step": 227 }, { "epoch": 0.08843207602055658, "grad_norm": 2.8483101999082767, "learning_rate": 8.837209302325582e-06, "loss": 0.728, "step": 228 }, { "epoch": 0.08881993600310288, "grad_norm": 2.602844760059589, "learning_rate": 8.875968992248062e-06, "loss": 0.8606, "step": 229 }, { "epoch": 0.08920779598564918, "grad_norm": 2.613341885075325, "learning_rate": 8.914728682170543e-06, "loss": 0.8159, "step": 230 }, { "epoch": 0.08959565596819548, "grad_norm": 2.723411684206085, "learning_rate": 8.953488372093024e-06, "loss": 0.8749, "step": 231 }, { "epoch": 0.08998351595074178, "grad_norm": 2.8458455319976834, "learning_rate": 8.992248062015505e-06, "loss": 0.7899, "step": 232 }, { "epoch": 0.09037137593328809, "grad_norm": 4.663003042004078, "learning_rate": 9.031007751937986e-06, "loss": 0.9877, "step": 233 }, { "epoch": 0.09075923591583439, "grad_norm": 2.4982862920449405, "learning_rate": 9.069767441860465e-06, "loss": 0.7942, "step": 234 }, { "epoch": 0.09114709589838069, "grad_norm": 3.555691615996338, "learning_rate": 9.108527131782946e-06, "loss": 0.8528, "step": 235 }, { "epoch": 0.09153495588092699, "grad_norm": 3.8010089752843053, "learning_rate": 9.147286821705427e-06, "loss": 0.8881, "step": 236 }, { "epoch": 0.09192281586347328, "grad_norm": 2.679850284483196, "learning_rate": 9.186046511627908e-06, "loss": 0.8233, "step": 237 }, { "epoch": 0.09231067584601958, "grad_norm": 3.0559618018524684, "learning_rate": 9.224806201550389e-06, "loss": 0.8113, "step": 238 }, { "epoch": 0.09269853582856588, "grad_norm": 3.106516543195227, "learning_rate": 9.26356589147287e-06, "loss": 0.8424, "step": 239 }, { "epoch": 0.0930863958111122, "grad_norm": 4.547101037364033, "learning_rate": 9.30232558139535e-06, "loss": 0.96, "step": 240 }, { "epoch": 0.09347425579365849, "grad_norm": 2.758783405225598, "learning_rate": 9.34108527131783e-06, "loss": 0.8389, "step": 241 }, { "epoch": 0.09386211577620479, "grad_norm": 2.816579259220179, "learning_rate": 9.37984496124031e-06, "loss": 0.8817, "step": 242 }, { "epoch": 0.09424997575875109, "grad_norm": 3.4331487083524563, "learning_rate": 9.418604651162791e-06, "loss": 0.8355, "step": 243 }, { "epoch": 0.09463783574129739, "grad_norm": 3.0559521902164564, "learning_rate": 9.457364341085272e-06, "loss": 0.8647, "step": 244 }, { "epoch": 0.09502569572384369, "grad_norm": 3.4400737538753563, "learning_rate": 9.496124031007753e-06, "loss": 0.8388, "step": 245 }, { "epoch": 0.09541355570639, "grad_norm": 3.0571906640825035, "learning_rate": 9.534883720930234e-06, "loss": 0.8502, "step": 246 }, { "epoch": 0.0958014156889363, "grad_norm": 2.3910728042207023, "learning_rate": 9.573643410852715e-06, "loss": 0.7857, "step": 247 }, { "epoch": 0.0961892756714826, "grad_norm": 2.9122505260765297, "learning_rate": 9.612403100775196e-06, "loss": 0.8581, "step": 248 }, { "epoch": 0.0965771356540289, "grad_norm": 3.7350618487690905, "learning_rate": 9.651162790697676e-06, "loss": 0.9508, "step": 249 }, { "epoch": 0.0969649956365752, "grad_norm": 2.2657163895391066, "learning_rate": 9.689922480620156e-06, "loss": 0.8197, "step": 250 }, { "epoch": 0.09735285561912149, "grad_norm": 2.2122440524420157, "learning_rate": 9.728682170542636e-06, "loss": 0.7959, "step": 251 }, { "epoch": 0.0977407156016678, "grad_norm": 4.207012417968533, "learning_rate": 9.767441860465117e-06, "loss": 0.8165, "step": 252 }, { "epoch": 0.0981285755842141, "grad_norm": 3.5741099722854295, "learning_rate": 9.806201550387598e-06, "loss": 0.8415, "step": 253 }, { "epoch": 0.0985164355667604, "grad_norm": 2.4612902225627784, "learning_rate": 9.844961240310077e-06, "loss": 0.8537, "step": 254 }, { "epoch": 0.0989042955493067, "grad_norm": 3.072883524496467, "learning_rate": 9.883720930232558e-06, "loss": 0.8433, "step": 255 }, { "epoch": 0.099292155531853, "grad_norm": 3.454499685805015, "learning_rate": 9.922480620155039e-06, "loss": 0.8241, "step": 256 }, { "epoch": 0.0996800155143993, "grad_norm": 3.0128605421150687, "learning_rate": 9.96124031007752e-06, "loss": 0.8616, "step": 257 }, { "epoch": 0.1000678754969456, "grad_norm": 3.0688578595031455, "learning_rate": 1e-05, "loss": 0.801, "step": 258 }, { "epoch": 0.10045573547949191, "grad_norm": 3.7752008418426826, "learning_rate": 9.9999954157983e-06, "loss": 0.8804, "step": 259 }, { "epoch": 0.10084359546203821, "grad_norm": 3.386776962690488, "learning_rate": 9.999981663201606e-06, "loss": 0.831, "step": 260 }, { "epoch": 0.1012314554445845, "grad_norm": 3.021206458198746, "learning_rate": 9.999958742235133e-06, "loss": 0.8065, "step": 261 }, { "epoch": 0.1016193154271308, "grad_norm": 3.2267862774764016, "learning_rate": 9.999926652940914e-06, "loss": 0.8176, "step": 262 }, { "epoch": 0.1020071754096771, "grad_norm": 2.7079188793090765, "learning_rate": 9.999885395377788e-06, "loss": 0.8183, "step": 263 }, { "epoch": 0.1023950353922234, "grad_norm": 3.4703145752541706, "learning_rate": 9.999834969621408e-06, "loss": 0.8708, "step": 264 }, { "epoch": 0.10278289537476971, "grad_norm": 2.957931138143859, "learning_rate": 9.999775375764244e-06, "loss": 0.8262, "step": 265 }, { "epoch": 0.10317075535731601, "grad_norm": 3.283386218538667, "learning_rate": 9.999706613915567e-06, "loss": 0.909, "step": 266 }, { "epoch": 0.10355861533986231, "grad_norm": 3.036389834018093, "learning_rate": 9.999628684201464e-06, "loss": 0.8247, "step": 267 }, { "epoch": 0.10394647532240861, "grad_norm": 2.664582370677724, "learning_rate": 9.999541586764836e-06, "loss": 0.7993, "step": 268 }, { "epoch": 0.10433433530495491, "grad_norm": 2.7232919319307256, "learning_rate": 9.999445321765392e-06, "loss": 0.8425, "step": 269 }, { "epoch": 0.10472219528750121, "grad_norm": 2.8606447509539636, "learning_rate": 9.999339889379647e-06, "loss": 0.8469, "step": 270 }, { "epoch": 0.1051100552700475, "grad_norm": 2.592925927999754, "learning_rate": 9.999225289800935e-06, "loss": 0.8584, "step": 271 }, { "epoch": 0.10549791525259382, "grad_norm": 3.3203913938673186, "learning_rate": 9.999101523239392e-06, "loss": 0.8041, "step": 272 }, { "epoch": 0.10588577523514012, "grad_norm": 3.0224589298539115, "learning_rate": 9.998968589921969e-06, "loss": 0.8149, "step": 273 }, { "epoch": 0.10627363521768642, "grad_norm": 4.384269779744289, "learning_rate": 9.99882649009242e-06, "loss": 0.8797, "step": 274 }, { "epoch": 0.10666149520023271, "grad_norm": 4.471192995621748, "learning_rate": 9.998675224011317e-06, "loss": 0.9, "step": 275 }, { "epoch": 0.10704935518277901, "grad_norm": 2.5556416691297787, "learning_rate": 9.998514791956025e-06, "loss": 0.8109, "step": 276 }, { "epoch": 0.10743721516532531, "grad_norm": 3.203495957467087, "learning_rate": 9.998345194220732e-06, "loss": 0.8421, "step": 277 }, { "epoch": 0.10782507514787162, "grad_norm": 2.7852811186024264, "learning_rate": 9.998166431116421e-06, "loss": 0.7952, "step": 278 }, { "epoch": 0.10821293513041792, "grad_norm": 2.5710188982329343, "learning_rate": 9.99797850297089e-06, "loss": 0.8799, "step": 279 }, { "epoch": 0.10860079511296422, "grad_norm": 3.1123029157138866, "learning_rate": 9.997781410128737e-06, "loss": 0.8649, "step": 280 }, { "epoch": 0.10898865509551052, "grad_norm": 3.501269408537823, "learning_rate": 9.99757515295137e-06, "loss": 0.9014, "step": 281 }, { "epoch": 0.10937651507805682, "grad_norm": 2.7607358699905546, "learning_rate": 9.997359731816998e-06, "loss": 0.7446, "step": 282 }, { "epoch": 0.10976437506060312, "grad_norm": 2.6402580687512165, "learning_rate": 9.997135147120633e-06, "loss": 0.8051, "step": 283 }, { "epoch": 0.11015223504314943, "grad_norm": 3.6956921272199765, "learning_rate": 9.996901399274093e-06, "loss": 0.7928, "step": 284 }, { "epoch": 0.11054009502569573, "grad_norm": 2.4723303501101785, "learning_rate": 9.996658488705997e-06, "loss": 0.7906, "step": 285 }, { "epoch": 0.11092795500824203, "grad_norm": 3.209802161997143, "learning_rate": 9.996406415861763e-06, "loss": 0.8373, "step": 286 }, { "epoch": 0.11131581499078833, "grad_norm": 2.3747818295662277, "learning_rate": 9.996145181203616e-06, "loss": 0.8377, "step": 287 }, { "epoch": 0.11170367497333462, "grad_norm": 2.923845506701223, "learning_rate": 9.995874785210573e-06, "loss": 0.8299, "step": 288 }, { "epoch": 0.11209153495588092, "grad_norm": 3.467726101527391, "learning_rate": 9.995595228378456e-06, "loss": 0.8841, "step": 289 }, { "epoch": 0.11247939493842722, "grad_norm": 2.787823064192437, "learning_rate": 9.995306511219885e-06, "loss": 0.8317, "step": 290 }, { "epoch": 0.11286725492097353, "grad_norm": 2.9547775024934317, "learning_rate": 9.995008634264272e-06, "loss": 0.8884, "step": 291 }, { "epoch": 0.11325511490351983, "grad_norm": 3.464322652791451, "learning_rate": 9.994701598057828e-06, "loss": 0.8526, "step": 292 }, { "epoch": 0.11364297488606613, "grad_norm": 2.5058855752086924, "learning_rate": 9.99438540316356e-06, "loss": 0.7799, "step": 293 }, { "epoch": 0.11403083486861243, "grad_norm": 2.4257326408727424, "learning_rate": 9.99406005016127e-06, "loss": 0.7701, "step": 294 }, { "epoch": 0.11441869485115873, "grad_norm": 2.52785031797941, "learning_rate": 9.99372553964755e-06, "loss": 0.7847, "step": 295 }, { "epoch": 0.11480655483370503, "grad_norm": 3.9434110257544184, "learning_rate": 9.993381872235785e-06, "loss": 0.9495, "step": 296 }, { "epoch": 0.11519441481625134, "grad_norm": 4.163557393456395, "learning_rate": 9.993029048556154e-06, "loss": 0.8432, "step": 297 }, { "epoch": 0.11558227479879764, "grad_norm": 2.5478706674147937, "learning_rate": 9.99266706925562e-06, "loss": 0.7498, "step": 298 }, { "epoch": 0.11597013478134394, "grad_norm": 2.346998786311895, "learning_rate": 9.99229593499794e-06, "loss": 0.8033, "step": 299 }, { "epoch": 0.11635799476389024, "grad_norm": 3.1346999793152803, "learning_rate": 9.991915646463652e-06, "loss": 0.8347, "step": 300 }, { "epoch": 0.11674585474643653, "grad_norm": 2.7523335463312266, "learning_rate": 9.991526204350087e-06, "loss": 0.7732, "step": 301 }, { "epoch": 0.11713371472898283, "grad_norm": 3.5513284774879503, "learning_rate": 9.991127609371357e-06, "loss": 0.8207, "step": 302 }, { "epoch": 0.11752157471152914, "grad_norm": 2.318546228829256, "learning_rate": 9.990719862258357e-06, "loss": 0.7533, "step": 303 }, { "epoch": 0.11790943469407544, "grad_norm": 3.036055883237319, "learning_rate": 9.990302963758765e-06, "loss": 0.7785, "step": 304 }, { "epoch": 0.11829729467662174, "grad_norm": 5.400230121404138, "learning_rate": 9.989876914637042e-06, "loss": 0.8834, "step": 305 }, { "epoch": 0.11868515465916804, "grad_norm": 2.807076875392905, "learning_rate": 9.989441715674422e-06, "loss": 0.8524, "step": 306 }, { "epoch": 0.11907301464171434, "grad_norm": 2.615577954207834, "learning_rate": 9.988997367668924e-06, "loss": 0.8437, "step": 307 }, { "epoch": 0.11946087462426064, "grad_norm": 2.968687717555105, "learning_rate": 9.988543871435342e-06, "loss": 0.8041, "step": 308 }, { "epoch": 0.11984873460680694, "grad_norm": 2.6027932307602106, "learning_rate": 9.988081227805237e-06, "loss": 0.7785, "step": 309 }, { "epoch": 0.12023659458935325, "grad_norm": 3.6358693162469686, "learning_rate": 9.987609437626955e-06, "loss": 0.812, "step": 310 }, { "epoch": 0.12062445457189955, "grad_norm": 3.211995622821219, "learning_rate": 9.987128501765606e-06, "loss": 0.7934, "step": 311 }, { "epoch": 0.12101231455444585, "grad_norm": 3.2459994089363664, "learning_rate": 9.986638421103074e-06, "loss": 0.7583, "step": 312 }, { "epoch": 0.12140017453699214, "grad_norm": 3.153083415745224, "learning_rate": 9.986139196538011e-06, "loss": 0.8925, "step": 313 }, { "epoch": 0.12178803451953844, "grad_norm": 3.318555642299724, "learning_rate": 9.985630828985835e-06, "loss": 0.8268, "step": 314 }, { "epoch": 0.12217589450208474, "grad_norm": 2.5847131833247614, "learning_rate": 9.98511331937873e-06, "loss": 0.7251, "step": 315 }, { "epoch": 0.12256375448463105, "grad_norm": 2.7501839196852726, "learning_rate": 9.984586668665641e-06, "loss": 0.8736, "step": 316 }, { "epoch": 0.12295161446717735, "grad_norm": 3.271902360267546, "learning_rate": 9.98405087781228e-06, "loss": 0.8363, "step": 317 }, { "epoch": 0.12333947444972365, "grad_norm": 3.0203463677413454, "learning_rate": 9.983505947801115e-06, "loss": 0.8289, "step": 318 }, { "epoch": 0.12372733443226995, "grad_norm": 3.354884791786397, "learning_rate": 9.982951879631373e-06, "loss": 0.8201, "step": 319 }, { "epoch": 0.12411519441481625, "grad_norm": 3.604086656105164, "learning_rate": 9.982388674319041e-06, "loss": 0.8184, "step": 320 }, { "epoch": 0.12450305439736255, "grad_norm": 2.33483577231792, "learning_rate": 9.981816332896854e-06, "loss": 0.79, "step": 321 }, { "epoch": 0.12489091437990885, "grad_norm": 3.1446636981325278, "learning_rate": 9.981234856414306e-06, "loss": 0.8103, "step": 322 }, { "epoch": 0.12527877436245516, "grad_norm": 3.5254385731945006, "learning_rate": 9.98064424593764e-06, "loss": 0.7961, "step": 323 }, { "epoch": 0.12566663434500144, "grad_norm": 2.294611017769507, "learning_rate": 9.980044502549843e-06, "loss": 0.7272, "step": 324 }, { "epoch": 0.12605449432754776, "grad_norm": 3.122537245304204, "learning_rate": 9.979435627350658e-06, "loss": 0.8388, "step": 325 }, { "epoch": 0.12644235431009407, "grad_norm": 2.6164679080133166, "learning_rate": 9.978817621456562e-06, "loss": 0.8454, "step": 326 }, { "epoch": 0.12683021429264035, "grad_norm": 4.125849518430192, "learning_rate": 9.978190486000784e-06, "loss": 0.7927, "step": 327 }, { "epoch": 0.12721807427518667, "grad_norm": 2.8251144261804764, "learning_rate": 9.977554222133293e-06, "loss": 0.8498, "step": 328 }, { "epoch": 0.12760593425773295, "grad_norm": 2.6440250941017136, "learning_rate": 9.976908831020787e-06, "loss": 0.7425, "step": 329 }, { "epoch": 0.12799379424027926, "grad_norm": 3.8605228460984202, "learning_rate": 9.97625431384671e-06, "loss": 0.8519, "step": 330 }, { "epoch": 0.12838165422282555, "grad_norm": 3.126708147459336, "learning_rate": 9.975590671811239e-06, "loss": 0.8041, "step": 331 }, { "epoch": 0.12876951420537186, "grad_norm": 3.4839341858008797, "learning_rate": 9.974917906131283e-06, "loss": 0.9275, "step": 332 }, { "epoch": 0.12915737418791817, "grad_norm": 3.293122101772993, "learning_rate": 9.974236018040476e-06, "loss": 0.7354, "step": 333 }, { "epoch": 0.12954523417046446, "grad_norm": 2.9733424657976038, "learning_rate": 9.973545008789182e-06, "loss": 0.8187, "step": 334 }, { "epoch": 0.12993309415301077, "grad_norm": 2.476143417146213, "learning_rate": 9.972844879644494e-06, "loss": 0.7725, "step": 335 }, { "epoch": 0.13032095413555705, "grad_norm": 3.3960080087675326, "learning_rate": 9.972135631890226e-06, "loss": 0.8265, "step": 336 }, { "epoch": 0.13070881411810337, "grad_norm": 2.9476615470047984, "learning_rate": 9.97141726682691e-06, "loss": 0.8446, "step": 337 }, { "epoch": 0.13109667410064965, "grad_norm": 3.955922861279826, "learning_rate": 9.970689785771798e-06, "loss": 0.8005, "step": 338 }, { "epoch": 0.13148453408319596, "grad_norm": 3.831668197452571, "learning_rate": 9.969953190058861e-06, "loss": 0.9246, "step": 339 }, { "epoch": 0.13187239406574228, "grad_norm": 2.9881350018017336, "learning_rate": 9.969207481038776e-06, "loss": 0.842, "step": 340 }, { "epoch": 0.13226025404828856, "grad_norm": 3.5018876943937642, "learning_rate": 9.968452660078939e-06, "loss": 0.8333, "step": 341 }, { "epoch": 0.13264811403083487, "grad_norm": 2.7320148191300664, "learning_rate": 9.967688728563446e-06, "loss": 0.7018, "step": 342 }, { "epoch": 0.13303597401338116, "grad_norm": 2.604456297713614, "learning_rate": 9.966915687893109e-06, "loss": 0.8216, "step": 343 }, { "epoch": 0.13342383399592747, "grad_norm": 2.2756610567638993, "learning_rate": 9.966133539485435e-06, "loss": 0.8775, "step": 344 }, { "epoch": 0.13381169397847378, "grad_norm": 3.061786001524778, "learning_rate": 9.965342284774633e-06, "loss": 0.7819, "step": 345 }, { "epoch": 0.13419955396102007, "grad_norm": 2.562487737148423, "learning_rate": 9.964541925211613e-06, "loss": 0.7607, "step": 346 }, { "epoch": 0.13458741394356638, "grad_norm": 2.3762839182052526, "learning_rate": 9.963732462263979e-06, "loss": 0.748, "step": 347 }, { "epoch": 0.13497527392611267, "grad_norm": 3.1544416257485515, "learning_rate": 9.962913897416029e-06, "loss": 0.8356, "step": 348 }, { "epoch": 0.13536313390865898, "grad_norm": 3.450708653545331, "learning_rate": 9.962086232168747e-06, "loss": 0.7879, "step": 349 }, { "epoch": 0.13575099389120526, "grad_norm": 2.804324878548136, "learning_rate": 9.961249468039806e-06, "loss": 0.7902, "step": 350 }, { "epoch": 0.13613885387375158, "grad_norm": 3.799225447773009, "learning_rate": 9.960403606563568e-06, "loss": 0.8645, "step": 351 }, { "epoch": 0.1365267138562979, "grad_norm": 3.248926083886978, "learning_rate": 9.959548649291071e-06, "loss": 0.8121, "step": 352 }, { "epoch": 0.13691457383884417, "grad_norm": 3.2840844204682185, "learning_rate": 9.958684597790031e-06, "loss": 0.7987, "step": 353 }, { "epoch": 0.13730243382139048, "grad_norm": 2.1860346574177547, "learning_rate": 9.957811453644848e-06, "loss": 0.8175, "step": 354 }, { "epoch": 0.13769029380393677, "grad_norm": 3.284799389020322, "learning_rate": 9.956929218456586e-06, "loss": 0.7779, "step": 355 }, { "epoch": 0.13807815378648308, "grad_norm": 2.6052572781902845, "learning_rate": 9.956037893842982e-06, "loss": 0.7771, "step": 356 }, { "epoch": 0.13846601376902937, "grad_norm": 3.0979472963862347, "learning_rate": 9.955137481438442e-06, "loss": 0.8373, "step": 357 }, { "epoch": 0.13885387375157568, "grad_norm": 2.5388164338255006, "learning_rate": 9.954227982894034e-06, "loss": 0.7777, "step": 358 }, { "epoch": 0.139241733734122, "grad_norm": 4.016540784122367, "learning_rate": 9.953309399877491e-06, "loss": 0.7386, "step": 359 }, { "epoch": 0.13962959371666828, "grad_norm": 3.222065733429362, "learning_rate": 9.952381734073197e-06, "loss": 0.8313, "step": 360 }, { "epoch": 0.1400174536992146, "grad_norm": 2.3695907552028173, "learning_rate": 9.951444987182195e-06, "loss": 0.8215, "step": 361 }, { "epoch": 0.14040531368176087, "grad_norm": 3.34606170898925, "learning_rate": 9.950499160922184e-06, "loss": 0.7899, "step": 362 }, { "epoch": 0.1407931736643072, "grad_norm": 3.4138992530229593, "learning_rate": 9.949544257027503e-06, "loss": 0.9067, "step": 363 }, { "epoch": 0.1411810336468535, "grad_norm": 3.348305509467363, "learning_rate": 9.948580277249142e-06, "loss": 0.7307, "step": 364 }, { "epoch": 0.14156889362939978, "grad_norm": 2.0982636885186134, "learning_rate": 9.947607223354731e-06, "loss": 0.7728, "step": 365 }, { "epoch": 0.1419567536119461, "grad_norm": 3.0249841067128056, "learning_rate": 9.946625097128544e-06, "loss": 0.8538, "step": 366 }, { "epoch": 0.14234461359449238, "grad_norm": 2.9520116308857602, "learning_rate": 9.945633900371483e-06, "loss": 0.808, "step": 367 }, { "epoch": 0.1427324735770387, "grad_norm": 2.721541738461367, "learning_rate": 9.94463363490109e-06, "loss": 0.7818, "step": 368 }, { "epoch": 0.14312033355958498, "grad_norm": 3.03669915084012, "learning_rate": 9.943624302551527e-06, "loss": 0.7807, "step": 369 }, { "epoch": 0.1435081935421313, "grad_norm": 3.344676336753662, "learning_rate": 9.942605905173593e-06, "loss": 0.8241, "step": 370 }, { "epoch": 0.1438960535246776, "grad_norm": 2.2407309535884568, "learning_rate": 9.941578444634699e-06, "loss": 0.7657, "step": 371 }, { "epoch": 0.1442839135072239, "grad_norm": 3.146642822591765, "learning_rate": 9.940541922818882e-06, "loss": 0.8796, "step": 372 }, { "epoch": 0.1446717734897702, "grad_norm": 2.8113710504768896, "learning_rate": 9.939496341626791e-06, "loss": 0.7917, "step": 373 }, { "epoch": 0.14505963347231648, "grad_norm": 2.9388108816483474, "learning_rate": 9.938441702975689e-06, "loss": 0.7376, "step": 374 }, { "epoch": 0.1454474934548628, "grad_norm": 2.866544841745575, "learning_rate": 9.937378008799448e-06, "loss": 0.6982, "step": 375 }, { "epoch": 0.14583535343740908, "grad_norm": 3.2231226290359802, "learning_rate": 9.93630526104854e-06, "loss": 0.822, "step": 376 }, { "epoch": 0.1462232134199554, "grad_norm": 2.841752825654073, "learning_rate": 9.935223461690042e-06, "loss": 0.7412, "step": 377 }, { "epoch": 0.1466110734025017, "grad_norm": 3.1458274588464, "learning_rate": 9.934132612707631e-06, "loss": 0.7444, "step": 378 }, { "epoch": 0.146998933385048, "grad_norm": 3.2317382447829304, "learning_rate": 9.933032716101576e-06, "loss": 0.8149, "step": 379 }, { "epoch": 0.1473867933675943, "grad_norm": 3.7167482977288753, "learning_rate": 9.931923773888734e-06, "loss": 0.8172, "step": 380 }, { "epoch": 0.1477746533501406, "grad_norm": 2.9876286150494518, "learning_rate": 9.930805788102551e-06, "loss": 0.7928, "step": 381 }, { "epoch": 0.1481625133326869, "grad_norm": 2.381191578334231, "learning_rate": 9.929678760793057e-06, "loss": 0.7973, "step": 382 }, { "epoch": 0.14855037331523321, "grad_norm": 2.145501440766939, "learning_rate": 9.928542694026862e-06, "loss": 0.7637, "step": 383 }, { "epoch": 0.1489382332977795, "grad_norm": 2.3128230354198998, "learning_rate": 9.927397589887144e-06, "loss": 0.742, "step": 384 }, { "epoch": 0.1493260932803258, "grad_norm": 3.2181404165372873, "learning_rate": 9.926243450473664e-06, "loss": 0.7696, "step": 385 }, { "epoch": 0.1497139532628721, "grad_norm": 2.5970835878849616, "learning_rate": 9.925080277902743e-06, "loss": 0.7861, "step": 386 }, { "epoch": 0.1501018132454184, "grad_norm": 3.00664082566871, "learning_rate": 9.923908074307267e-06, "loss": 0.7388, "step": 387 }, { "epoch": 0.1504896732279647, "grad_norm": 2.9923204106594814, "learning_rate": 9.922726841836685e-06, "loss": 0.8047, "step": 388 }, { "epoch": 0.150877533210511, "grad_norm": 3.0832392194174836, "learning_rate": 9.921536582657002e-06, "loss": 0.7972, "step": 389 }, { "epoch": 0.15126539319305732, "grad_norm": 2.4063929556535357, "learning_rate": 9.920337298950767e-06, "loss": 0.7776, "step": 390 }, { "epoch": 0.1516532531756036, "grad_norm": 3.581417460937023, "learning_rate": 9.919128992917086e-06, "loss": 0.9302, "step": 391 }, { "epoch": 0.15204111315814992, "grad_norm": 3.051444218768337, "learning_rate": 9.91791166677161e-06, "loss": 0.8051, "step": 392 }, { "epoch": 0.1524289731406962, "grad_norm": 2.84536693248292, "learning_rate": 9.916685322746524e-06, "loss": 0.8057, "step": 393 }, { "epoch": 0.1528168331232425, "grad_norm": 3.795726615432535, "learning_rate": 9.915449963090551e-06, "loss": 0.8376, "step": 394 }, { "epoch": 0.1532046931057888, "grad_norm": 2.430098707327249, "learning_rate": 9.914205590068946e-06, "loss": 0.7192, "step": 395 }, { "epoch": 0.1535925530883351, "grad_norm": 3.629733485892195, "learning_rate": 9.912952205963491e-06, "loss": 0.8023, "step": 396 }, { "epoch": 0.15398041307088142, "grad_norm": 2.1899812333661903, "learning_rate": 9.911689813072495e-06, "loss": 0.7442, "step": 397 }, { "epoch": 0.1543682730534277, "grad_norm": 2.8261792221572524, "learning_rate": 9.91041841371078e-06, "loss": 0.7692, "step": 398 }, { "epoch": 0.15475613303597402, "grad_norm": 2.897055365725212, "learning_rate": 9.90913801020969e-06, "loss": 0.862, "step": 399 }, { "epoch": 0.1551439930185203, "grad_norm": 3.5413705604910204, "learning_rate": 9.907848604917075e-06, "loss": 0.7582, "step": 400 }, { "epoch": 0.15553185300106662, "grad_norm": 3.199581126589522, "learning_rate": 9.906550200197288e-06, "loss": 0.7975, "step": 401 }, { "epoch": 0.15591971298361293, "grad_norm": 3.366562005850396, "learning_rate": 9.905242798431196e-06, "loss": 0.7392, "step": 402 }, { "epoch": 0.15630757296615921, "grad_norm": 2.7802225019169557, "learning_rate": 9.903926402016153e-06, "loss": 0.7917, "step": 403 }, { "epoch": 0.15669543294870553, "grad_norm": 2.668714276969088, "learning_rate": 9.902601013366009e-06, "loss": 0.8357, "step": 404 }, { "epoch": 0.1570832929312518, "grad_norm": 2.9070643063132557, "learning_rate": 9.901266634911104e-06, "loss": 0.814, "step": 405 }, { "epoch": 0.15747115291379812, "grad_norm": 4.412809290356206, "learning_rate": 9.899923269098262e-06, "loss": 0.8785, "step": 406 }, { "epoch": 0.1578590128963444, "grad_norm": 1.981537797919318, "learning_rate": 9.898570918390789e-06, "loss": 0.7152, "step": 407 }, { "epoch": 0.15824687287889072, "grad_norm": 2.3295500469997137, "learning_rate": 9.897209585268459e-06, "loss": 0.8151, "step": 408 }, { "epoch": 0.15863473286143703, "grad_norm": 2.1819709266906946, "learning_rate": 9.895839272227529e-06, "loss": 0.7397, "step": 409 }, { "epoch": 0.15902259284398332, "grad_norm": 3.743133546263443, "learning_rate": 9.894459981780711e-06, "loss": 0.8563, "step": 410 }, { "epoch": 0.15941045282652963, "grad_norm": 3.552778433206387, "learning_rate": 9.893071716457183e-06, "loss": 0.8267, "step": 411 }, { "epoch": 0.15979831280907592, "grad_norm": 3.2345356503243057, "learning_rate": 9.891674478802585e-06, "loss": 0.8202, "step": 412 }, { "epoch": 0.16018617279162223, "grad_norm": 3.681695892752571, "learning_rate": 9.890268271379e-06, "loss": 0.7883, "step": 413 }, { "epoch": 0.1605740327741685, "grad_norm": 3.0114874770138553, "learning_rate": 9.888853096764963e-06, "loss": 0.7697, "step": 414 }, { "epoch": 0.16096189275671482, "grad_norm": 3.1058816089878687, "learning_rate": 9.887428957555457e-06, "loss": 0.7674, "step": 415 }, { "epoch": 0.16134975273926114, "grad_norm": 2.757591034958815, "learning_rate": 9.885995856361895e-06, "loss": 0.805, "step": 416 }, { "epoch": 0.16173761272180742, "grad_norm": 3.4031696785135925, "learning_rate": 9.884553795812128e-06, "loss": 0.8429, "step": 417 }, { "epoch": 0.16212547270435373, "grad_norm": 2.7937653033352845, "learning_rate": 9.883102778550434e-06, "loss": 0.7945, "step": 418 }, { "epoch": 0.16251333268690002, "grad_norm": 3.847243272636992, "learning_rate": 9.881642807237515e-06, "loss": 0.8119, "step": 419 }, { "epoch": 0.16290119266944633, "grad_norm": 3.5275489898786345, "learning_rate": 9.880173884550495e-06, "loss": 0.7439, "step": 420 }, { "epoch": 0.16328905265199264, "grad_norm": 2.477234400614848, "learning_rate": 9.878696013182906e-06, "loss": 0.8387, "step": 421 }, { "epoch": 0.16367691263453893, "grad_norm": 3.6585803339707588, "learning_rate": 9.877209195844692e-06, "loss": 0.7874, "step": 422 }, { "epoch": 0.16406477261708524, "grad_norm": 3.739492036638926, "learning_rate": 9.875713435262205e-06, "loss": 0.7563, "step": 423 }, { "epoch": 0.16445263259963153, "grad_norm": 3.2580580096486873, "learning_rate": 9.874208734178187e-06, "loss": 0.8678, "step": 424 }, { "epoch": 0.16484049258217784, "grad_norm": 2.8242917428439136, "learning_rate": 9.872695095351784e-06, "loss": 0.798, "step": 425 }, { "epoch": 0.16522835256472412, "grad_norm": 2.4727529369020265, "learning_rate": 9.871172521558523e-06, "loss": 0.7741, "step": 426 }, { "epoch": 0.16561621254727044, "grad_norm": 2.6711232489618704, "learning_rate": 9.869641015590319e-06, "loss": 0.7783, "step": 427 }, { "epoch": 0.16600407252981675, "grad_norm": 2.7966966902490236, "learning_rate": 9.868100580255466e-06, "loss": 0.8326, "step": 428 }, { "epoch": 0.16639193251236303, "grad_norm": 3.3875780136844003, "learning_rate": 9.86655121837863e-06, "loss": 0.8025, "step": 429 }, { "epoch": 0.16677979249490935, "grad_norm": 3.0616751296905123, "learning_rate": 9.864992932800845e-06, "loss": 0.8214, "step": 430 }, { "epoch": 0.16716765247745563, "grad_norm": 2.7947124070123017, "learning_rate": 9.863425726379512e-06, "loss": 0.7193, "step": 431 }, { "epoch": 0.16755551246000194, "grad_norm": 2.159045605232211, "learning_rate": 9.861849601988384e-06, "loss": 0.7788, "step": 432 }, { "epoch": 0.16794337244254823, "grad_norm": 3.8217489049177056, "learning_rate": 9.860264562517571e-06, "loss": 0.8177, "step": 433 }, { "epoch": 0.16833123242509454, "grad_norm": 3.3710006996867494, "learning_rate": 9.858670610873528e-06, "loss": 0.7527, "step": 434 }, { "epoch": 0.16871909240764085, "grad_norm": 2.1242835600673673, "learning_rate": 9.857067749979057e-06, "loss": 0.7886, "step": 435 }, { "epoch": 0.16910695239018714, "grad_norm": 2.924608296188396, "learning_rate": 9.855455982773288e-06, "loss": 0.7884, "step": 436 }, { "epoch": 0.16949481237273345, "grad_norm": 3.418339032840037, "learning_rate": 9.853835312211692e-06, "loss": 0.8157, "step": 437 }, { "epoch": 0.16988267235527973, "grad_norm": 2.9484836108543737, "learning_rate": 9.852205741266058e-06, "loss": 0.7838, "step": 438 }, { "epoch": 0.17027053233782605, "grad_norm": 2.471741361864527, "learning_rate": 9.8505672729245e-06, "loss": 0.7684, "step": 439 }, { "epoch": 0.17065839232037233, "grad_norm": 3.298350875334692, "learning_rate": 9.848919910191446e-06, "loss": 0.7931, "step": 440 }, { "epoch": 0.17104625230291864, "grad_norm": 3.181789096202697, "learning_rate": 9.847263656087633e-06, "loss": 0.7936, "step": 441 }, { "epoch": 0.17143411228546496, "grad_norm": 2.712400747565228, "learning_rate": 9.845598513650104e-06, "loss": 0.7964, "step": 442 }, { "epoch": 0.17182197226801124, "grad_norm": 3.0757760468000526, "learning_rate": 9.843924485932195e-06, "loss": 0.7456, "step": 443 }, { "epoch": 0.17220983225055755, "grad_norm": 2.587270970017516, "learning_rate": 9.84224157600354e-06, "loss": 0.6945, "step": 444 }, { "epoch": 0.17259769223310384, "grad_norm": 3.2687443867764605, "learning_rate": 9.840549786950058e-06, "loss": 0.8373, "step": 445 }, { "epoch": 0.17298555221565015, "grad_norm": 2.2710480050929194, "learning_rate": 9.83884912187395e-06, "loss": 0.807, "step": 446 }, { "epoch": 0.17337341219819646, "grad_norm": 2.6365906289956036, "learning_rate": 9.837139583893693e-06, "loss": 0.8291, "step": 447 }, { "epoch": 0.17376127218074275, "grad_norm": 2.799450705843874, "learning_rate": 9.835421176144035e-06, "loss": 0.7734, "step": 448 }, { "epoch": 0.17414913216328906, "grad_norm": 2.426875679507541, "learning_rate": 9.833693901775985e-06, "loss": 0.7531, "step": 449 }, { "epoch": 0.17453699214583535, "grad_norm": 3.183161260882943, "learning_rate": 9.831957763956814e-06, "loss": 0.7749, "step": 450 }, { "epoch": 0.17492485212838166, "grad_norm": 3.80887715018293, "learning_rate": 9.830212765870043e-06, "loss": 0.8822, "step": 451 }, { "epoch": 0.17531271211092794, "grad_norm": 2.3289750378790135, "learning_rate": 9.828458910715442e-06, "loss": 0.7485, "step": 452 }, { "epoch": 0.17570057209347426, "grad_norm": 2.56740550488947, "learning_rate": 9.826696201709022e-06, "loss": 0.7583, "step": 453 }, { "epoch": 0.17608843207602057, "grad_norm": 2.6879765397914293, "learning_rate": 9.824924642083026e-06, "loss": 0.7829, "step": 454 }, { "epoch": 0.17647629205856685, "grad_norm": 2.655405683988066, "learning_rate": 9.823144235085934e-06, "loss": 0.8051, "step": 455 }, { "epoch": 0.17686415204111317, "grad_norm": 2.2355919428223614, "learning_rate": 9.821354983982438e-06, "loss": 0.6996, "step": 456 }, { "epoch": 0.17725201202365945, "grad_norm": 2.444369233722097, "learning_rate": 9.819556892053456e-06, "loss": 0.7835, "step": 457 }, { "epoch": 0.17763987200620576, "grad_norm": 3.8782808094180767, "learning_rate": 9.817749962596115e-06, "loss": 0.817, "step": 458 }, { "epoch": 0.17802773198875205, "grad_norm": 2.8004610690284424, "learning_rate": 9.815934198923746e-06, "loss": 0.7573, "step": 459 }, { "epoch": 0.17841559197129836, "grad_norm": 3.0902271840961175, "learning_rate": 9.814109604365878e-06, "loss": 0.8161, "step": 460 }, { "epoch": 0.17880345195384467, "grad_norm": 2.020399870134435, "learning_rate": 9.812276182268236e-06, "loss": 0.76, "step": 461 }, { "epoch": 0.17919131193639096, "grad_norm": 2.535935207557818, "learning_rate": 9.810433935992734e-06, "loss": 0.732, "step": 462 }, { "epoch": 0.17957917191893727, "grad_norm": 2.216972062067977, "learning_rate": 9.808582868917458e-06, "loss": 0.7765, "step": 463 }, { "epoch": 0.17996703190148355, "grad_norm": 3.4433643937725296, "learning_rate": 9.806722984436676e-06, "loss": 0.7361, "step": 464 }, { "epoch": 0.18035489188402987, "grad_norm": 2.694858676243239, "learning_rate": 9.804854285960823e-06, "loss": 0.7925, "step": 465 }, { "epoch": 0.18074275186657618, "grad_norm": 3.365549546556892, "learning_rate": 9.802976776916493e-06, "loss": 0.7503, "step": 466 }, { "epoch": 0.18113061184912246, "grad_norm": 3.004038282353784, "learning_rate": 9.801090460746442e-06, "loss": 0.7993, "step": 467 }, { "epoch": 0.18151847183166878, "grad_norm": 3.7992966874686105, "learning_rate": 9.799195340909569e-06, "loss": 0.9088, "step": 468 }, { "epoch": 0.18190633181421506, "grad_norm": 3.0340109467419643, "learning_rate": 9.79729142088092e-06, "loss": 0.8306, "step": 469 }, { "epoch": 0.18229419179676137, "grad_norm": 3.4496982937358496, "learning_rate": 9.795378704151675e-06, "loss": 0.7883, "step": 470 }, { "epoch": 0.18268205177930766, "grad_norm": 2.6930149969363195, "learning_rate": 9.793457194229145e-06, "loss": 0.7291, "step": 471 }, { "epoch": 0.18306991176185397, "grad_norm": 2.6086031507046745, "learning_rate": 9.791526894636767e-06, "loss": 0.7794, "step": 472 }, { "epoch": 0.18345777174440028, "grad_norm": 2.6414463529073084, "learning_rate": 9.789587808914094e-06, "loss": 0.728, "step": 473 }, { "epoch": 0.18384563172694657, "grad_norm": 2.697215139609785, "learning_rate": 9.787639940616789e-06, "loss": 0.7386, "step": 474 }, { "epoch": 0.18423349170949288, "grad_norm": 2.7278247508932827, "learning_rate": 9.785683293316622e-06, "loss": 0.7574, "step": 475 }, { "epoch": 0.18462135169203917, "grad_norm": 2.276867284656239, "learning_rate": 9.783717870601458e-06, "loss": 0.7514, "step": 476 }, { "epoch": 0.18500921167458548, "grad_norm": 2.297231434026899, "learning_rate": 9.781743676075257e-06, "loss": 0.7662, "step": 477 }, { "epoch": 0.18539707165713176, "grad_norm": 2.1641374046224717, "learning_rate": 9.77976071335806e-06, "loss": 0.7418, "step": 478 }, { "epoch": 0.18578493163967807, "grad_norm": 2.066581272630839, "learning_rate": 9.777768986085985e-06, "loss": 0.7505, "step": 479 }, { "epoch": 0.1861727916222244, "grad_norm": 3.6809537505971295, "learning_rate": 9.775768497911226e-06, "loss": 0.854, "step": 480 }, { "epoch": 0.18656065160477067, "grad_norm": 2.44602376536494, "learning_rate": 9.77375925250204e-06, "loss": 0.7022, "step": 481 }, { "epoch": 0.18694851158731698, "grad_norm": 2.042349705760075, "learning_rate": 9.771741253542742e-06, "loss": 0.7249, "step": 482 }, { "epoch": 0.18733637156986327, "grad_norm": 2.30729499172734, "learning_rate": 9.769714504733695e-06, "loss": 0.6897, "step": 483 }, { "epoch": 0.18772423155240958, "grad_norm": 2.5349610366530775, "learning_rate": 9.767679009791312e-06, "loss": 0.8036, "step": 484 }, { "epoch": 0.1881120915349559, "grad_norm": 2.6923281283156553, "learning_rate": 9.765634772448039e-06, "loss": 0.7748, "step": 485 }, { "epoch": 0.18849995151750218, "grad_norm": 2.299940333065983, "learning_rate": 9.763581796452353e-06, "loss": 0.7377, "step": 486 }, { "epoch": 0.1888878115000485, "grad_norm": 2.853560696099374, "learning_rate": 9.76152008556876e-06, "loss": 0.8299, "step": 487 }, { "epoch": 0.18927567148259478, "grad_norm": 3.4923748457170727, "learning_rate": 9.759449643577779e-06, "loss": 0.7531, "step": 488 }, { "epoch": 0.1896635314651411, "grad_norm": 2.905474883710851, "learning_rate": 9.757370474275938e-06, "loss": 0.7333, "step": 489 }, { "epoch": 0.19005139144768737, "grad_norm": 3.353546010174638, "learning_rate": 9.755282581475769e-06, "loss": 0.7757, "step": 490 }, { "epoch": 0.19043925143023369, "grad_norm": 2.678723073918764, "learning_rate": 9.753185969005802e-06, "loss": 0.7612, "step": 491 }, { "epoch": 0.19082711141278, "grad_norm": 2.490593783860922, "learning_rate": 9.751080640710554e-06, "loss": 0.7031, "step": 492 }, { "epoch": 0.19121497139532628, "grad_norm": 3.09473907808382, "learning_rate": 9.748966600450526e-06, "loss": 0.7475, "step": 493 }, { "epoch": 0.1916028313778726, "grad_norm": 2.6867812010343237, "learning_rate": 9.746843852102191e-06, "loss": 0.7473, "step": 494 }, { "epoch": 0.19199069136041888, "grad_norm": 2.070093417633459, "learning_rate": 9.744712399557992e-06, "loss": 0.7524, "step": 495 }, { "epoch": 0.1923785513429652, "grad_norm": 2.8968840731694905, "learning_rate": 9.742572246726336e-06, "loss": 0.7476, "step": 496 }, { "epoch": 0.19276641132551148, "grad_norm": 2.9684635581030343, "learning_rate": 9.740423397531573e-06, "loss": 0.8073, "step": 497 }, { "epoch": 0.1931542713080578, "grad_norm": 2.9170372054572846, "learning_rate": 9.738265855914014e-06, "loss": 0.8175, "step": 498 }, { "epoch": 0.1935421312906041, "grad_norm": 2.6572112686268876, "learning_rate": 9.736099625829894e-06, "loss": 0.8167, "step": 499 }, { "epoch": 0.1939299912731504, "grad_norm": 3.4994013442866927, "learning_rate": 9.733924711251393e-06, "loss": 0.8447, "step": 500 }, { "epoch": 0.1939299912731504, "eval_loss": 1.3724623918533325, "eval_runtime": 6.3978, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 500 }, { "epoch": 0.1943178512556967, "grad_norm": 2.362851685961732, "learning_rate": 9.731741116166607e-06, "loss": 0.7409, "step": 501 }, { "epoch": 0.19470571123824298, "grad_norm": 3.0177150875955827, "learning_rate": 9.729548844579552e-06, "loss": 0.8506, "step": 502 }, { "epoch": 0.1950935712207893, "grad_norm": 2.718639748320381, "learning_rate": 9.727347900510155e-06, "loss": 0.7177, "step": 503 }, { "epoch": 0.1954814312033356, "grad_norm": 2.82811413211528, "learning_rate": 9.725138287994246e-06, "loss": 0.7558, "step": 504 }, { "epoch": 0.1958692911858819, "grad_norm": 2.5652154970562844, "learning_rate": 9.722920011083546e-06, "loss": 0.7482, "step": 505 }, { "epoch": 0.1962571511684282, "grad_norm": 2.6807983890132827, "learning_rate": 9.720693073845668e-06, "loss": 0.7386, "step": 506 }, { "epoch": 0.1966450111509745, "grad_norm": 2.1615090436748794, "learning_rate": 9.718457480364103e-06, "loss": 0.6835, "step": 507 }, { "epoch": 0.1970328711335208, "grad_norm": 2.285032855328376, "learning_rate": 9.716213234738216e-06, "loss": 0.7472, "step": 508 }, { "epoch": 0.1974207311160671, "grad_norm": 2.916927342146237, "learning_rate": 9.713960341083237e-06, "loss": 0.7284, "step": 509 }, { "epoch": 0.1978085910986134, "grad_norm": 2.8942247286944625, "learning_rate": 9.711698803530253e-06, "loss": 0.7466, "step": 510 }, { "epoch": 0.1981964510811597, "grad_norm": 2.7035564451474476, "learning_rate": 9.709428626226204e-06, "loss": 0.8244, "step": 511 }, { "epoch": 0.198584311063706, "grad_norm": 3.261796616103498, "learning_rate": 9.707149813333866e-06, "loss": 0.8228, "step": 512 }, { "epoch": 0.1989721710462523, "grad_norm": 2.7460801434508433, "learning_rate": 9.704862369031857e-06, "loss": 0.8209, "step": 513 }, { "epoch": 0.1993600310287986, "grad_norm": 3.3499131113735774, "learning_rate": 9.70256629751462e-06, "loss": 0.7571, "step": 514 }, { "epoch": 0.1997478910113449, "grad_norm": 2.366290027491634, "learning_rate": 9.700261602992417e-06, "loss": 0.7181, "step": 515 }, { "epoch": 0.2001357509938912, "grad_norm": 2.682738300022844, "learning_rate": 9.69794828969132e-06, "loss": 0.7657, "step": 516 }, { "epoch": 0.2005236109764375, "grad_norm": 2.8008771847944534, "learning_rate": 9.695626361853207e-06, "loss": 0.7579, "step": 517 }, { "epoch": 0.20091147095898382, "grad_norm": 2.99107767289299, "learning_rate": 9.693295823735754e-06, "loss": 0.7763, "step": 518 }, { "epoch": 0.2012993309415301, "grad_norm": 2.3817628914846845, "learning_rate": 9.690956679612422e-06, "loss": 0.748, "step": 519 }, { "epoch": 0.20168719092407641, "grad_norm": 3.028477704502779, "learning_rate": 9.688608933772454e-06, "loss": 0.8071, "step": 520 }, { "epoch": 0.2020750509066227, "grad_norm": 2.6161038250782305, "learning_rate": 9.686252590520869e-06, "loss": 0.7704, "step": 521 }, { "epoch": 0.202462910889169, "grad_norm": 2.436589092153252, "learning_rate": 9.683887654178446e-06, "loss": 0.7489, "step": 522 }, { "epoch": 0.20285077087171532, "grad_norm": 2.7620758483549914, "learning_rate": 9.681514129081725e-06, "loss": 0.7595, "step": 523 }, { "epoch": 0.2032386308542616, "grad_norm": 2.5264577535167194, "learning_rate": 9.679132019582988e-06, "loss": 0.7487, "step": 524 }, { "epoch": 0.20362649083680792, "grad_norm": 2.973378152509531, "learning_rate": 9.67674133005027e-06, "loss": 0.6818, "step": 525 }, { "epoch": 0.2040143508193542, "grad_norm": 2.4646410807637524, "learning_rate": 9.674342064867326e-06, "loss": 0.7446, "step": 526 }, { "epoch": 0.20440221080190052, "grad_norm": 3.0000176631819477, "learning_rate": 9.671934228433647e-06, "loss": 0.7288, "step": 527 }, { "epoch": 0.2047900707844468, "grad_norm": 3.0066329572478154, "learning_rate": 9.669517825164435e-06, "loss": 0.7427, "step": 528 }, { "epoch": 0.20517793076699312, "grad_norm": 3.8376406884926255, "learning_rate": 9.667092859490599e-06, "loss": 0.8373, "step": 529 }, { "epoch": 0.20556579074953943, "grad_norm": 2.521133079125993, "learning_rate": 9.664659335858755e-06, "loss": 0.7507, "step": 530 }, { "epoch": 0.2059536507320857, "grad_norm": 2.506461580508803, "learning_rate": 9.662217258731208e-06, "loss": 0.7386, "step": 531 }, { "epoch": 0.20634151071463203, "grad_norm": 2.485924672061414, "learning_rate": 9.659766632585946e-06, "loss": 0.7269, "step": 532 }, { "epoch": 0.2067293706971783, "grad_norm": 2.235499382495175, "learning_rate": 9.657307461916637e-06, "loss": 0.7777, "step": 533 }, { "epoch": 0.20711723067972462, "grad_norm": 2.6828717617086255, "learning_rate": 9.654839751232612e-06, "loss": 0.7119, "step": 534 }, { "epoch": 0.2075050906622709, "grad_norm": 2.389764063035867, "learning_rate": 9.652363505058866e-06, "loss": 0.7792, "step": 535 }, { "epoch": 0.20789295064481722, "grad_norm": 2.515950855059916, "learning_rate": 9.649878727936044e-06, "loss": 0.7275, "step": 536 }, { "epoch": 0.20828081062736353, "grad_norm": 2.8557231561602285, "learning_rate": 9.647385424420435e-06, "loss": 0.7351, "step": 537 }, { "epoch": 0.20866867060990982, "grad_norm": 2.6244718504286952, "learning_rate": 9.644883599083959e-06, "loss": 0.7525, "step": 538 }, { "epoch": 0.20905653059245613, "grad_norm": 3.4436183165920706, "learning_rate": 9.642373256514164e-06, "loss": 0.8124, "step": 539 }, { "epoch": 0.20944439057500241, "grad_norm": 2.5377484317977346, "learning_rate": 9.639854401314219e-06, "loss": 0.7439, "step": 540 }, { "epoch": 0.20983225055754873, "grad_norm": 3.3582680198477894, "learning_rate": 9.637327038102902e-06, "loss": 0.8005, "step": 541 }, { "epoch": 0.210220110540095, "grad_norm": 2.7316042927427855, "learning_rate": 9.634791171514585e-06, "loss": 0.8019, "step": 542 }, { "epoch": 0.21060797052264132, "grad_norm": 2.0597050972437603, "learning_rate": 9.632246806199242e-06, "loss": 0.7153, "step": 543 }, { "epoch": 0.21099583050518764, "grad_norm": 2.9065545506868133, "learning_rate": 9.629693946822423e-06, "loss": 0.8139, "step": 544 }, { "epoch": 0.21138369048773392, "grad_norm": 2.751381197120846, "learning_rate": 9.627132598065258e-06, "loss": 0.7836, "step": 545 }, { "epoch": 0.21177155047028023, "grad_norm": 2.8811177091529223, "learning_rate": 9.624562764624445e-06, "loss": 0.7668, "step": 546 }, { "epoch": 0.21215941045282652, "grad_norm": 3.553420701940901, "learning_rate": 9.621984451212237e-06, "loss": 0.7946, "step": 547 }, { "epoch": 0.21254727043537283, "grad_norm": 2.0747790902474375, "learning_rate": 9.619397662556434e-06, "loss": 0.6792, "step": 548 }, { "epoch": 0.21293513041791914, "grad_norm": 2.7186264805552973, "learning_rate": 9.616802403400384e-06, "loss": 0.6774, "step": 549 }, { "epoch": 0.21332299040046543, "grad_norm": 3.3927969902693067, "learning_rate": 9.614198678502965e-06, "loss": 0.9023, "step": 550 }, { "epoch": 0.21371085038301174, "grad_norm": 2.7652734209754826, "learning_rate": 9.611586492638573e-06, "loss": 0.7757, "step": 551 }, { "epoch": 0.21409871036555803, "grad_norm": 3.2547187739055867, "learning_rate": 9.608965850597125e-06, "loss": 0.7407, "step": 552 }, { "epoch": 0.21448657034810434, "grad_norm": 2.812378489630404, "learning_rate": 9.606336757184041e-06, "loss": 0.8103, "step": 553 }, { "epoch": 0.21487443033065062, "grad_norm": 2.650417276497206, "learning_rate": 9.603699217220239e-06, "loss": 0.7827, "step": 554 }, { "epoch": 0.21526229031319694, "grad_norm": 2.979930714042519, "learning_rate": 9.601053235542124e-06, "loss": 0.7934, "step": 555 }, { "epoch": 0.21565015029574325, "grad_norm": 2.772342253363463, "learning_rate": 9.598398817001585e-06, "loss": 0.7394, "step": 556 }, { "epoch": 0.21603801027828953, "grad_norm": 2.6859882582826025, "learning_rate": 9.595735966465973e-06, "loss": 0.698, "step": 557 }, { "epoch": 0.21642587026083585, "grad_norm": 2.9723286331381855, "learning_rate": 9.59306468881811e-06, "loss": 0.7453, "step": 558 }, { "epoch": 0.21681373024338213, "grad_norm": 2.7534585960779934, "learning_rate": 9.590384988956264e-06, "loss": 0.7571, "step": 559 }, { "epoch": 0.21720159022592844, "grad_norm": 3.2093041089968004, "learning_rate": 9.587696871794148e-06, "loss": 0.7717, "step": 560 }, { "epoch": 0.21758945020847473, "grad_norm": 3.2434072352565275, "learning_rate": 9.585000342260914e-06, "loss": 0.8192, "step": 561 }, { "epoch": 0.21797731019102104, "grad_norm": 2.3969866705106964, "learning_rate": 9.582295405301131e-06, "loss": 0.7198, "step": 562 }, { "epoch": 0.21836517017356735, "grad_norm": 3.3179541297257042, "learning_rate": 9.579582065874794e-06, "loss": 0.7954, "step": 563 }, { "epoch": 0.21875303015611364, "grad_norm": 2.595304868098701, "learning_rate": 9.576860328957299e-06, "loss": 0.7286, "step": 564 }, { "epoch": 0.21914089013865995, "grad_norm": 2.888907317985086, "learning_rate": 9.574130199539443e-06, "loss": 0.8566, "step": 565 }, { "epoch": 0.21952875012120623, "grad_norm": 2.0952316835080493, "learning_rate": 9.571391682627413e-06, "loss": 0.7208, "step": 566 }, { "epoch": 0.21991661010375255, "grad_norm": 2.6796849942458105, "learning_rate": 9.568644783242771e-06, "loss": 0.8278, "step": 567 }, { "epoch": 0.22030447008629886, "grad_norm": 2.5895167659616587, "learning_rate": 9.565889506422457e-06, "loss": 0.7731, "step": 568 }, { "epoch": 0.22069233006884514, "grad_norm": 2.5841238373183457, "learning_rate": 9.563125857218766e-06, "loss": 0.8049, "step": 569 }, { "epoch": 0.22108019005139146, "grad_norm": 2.89686430070329, "learning_rate": 9.56035384069935e-06, "loss": 0.7812, "step": 570 }, { "epoch": 0.22146805003393774, "grad_norm": 2.2843332566341172, "learning_rate": 9.557573461947201e-06, "loss": 0.7528, "step": 571 }, { "epoch": 0.22185591001648405, "grad_norm": 2.869086353893181, "learning_rate": 9.554784726060647e-06, "loss": 0.7532, "step": 572 }, { "epoch": 0.22224376999903034, "grad_norm": 2.6339452931394853, "learning_rate": 9.551987638153339e-06, "loss": 0.8067, "step": 573 }, { "epoch": 0.22263162998157665, "grad_norm": 2.070789436965751, "learning_rate": 9.549182203354241e-06, "loss": 0.7451, "step": 574 }, { "epoch": 0.22301948996412296, "grad_norm": 2.1876456325731106, "learning_rate": 9.546368426807628e-06, "loss": 0.7585, "step": 575 }, { "epoch": 0.22340734994666925, "grad_norm": 2.93123315117713, "learning_rate": 9.543546313673065e-06, "loss": 0.7845, "step": 576 }, { "epoch": 0.22379520992921556, "grad_norm": 2.668370068839478, "learning_rate": 9.540715869125407e-06, "loss": 0.8056, "step": 577 }, { "epoch": 0.22418306991176185, "grad_norm": 3.0528388323604423, "learning_rate": 9.537877098354787e-06, "loss": 0.8078, "step": 578 }, { "epoch": 0.22457092989430816, "grad_norm": 1.9179620097820342, "learning_rate": 9.5350300065666e-06, "loss": 0.6997, "step": 579 }, { "epoch": 0.22495878987685444, "grad_norm": 2.17230220127342, "learning_rate": 9.532174598981507e-06, "loss": 0.7021, "step": 580 }, { "epoch": 0.22534664985940075, "grad_norm": 3.0872398221844746, "learning_rate": 9.529310880835414e-06, "loss": 0.7651, "step": 581 }, { "epoch": 0.22573450984194707, "grad_norm": 2.594630528307425, "learning_rate": 9.526438857379463e-06, "loss": 0.7934, "step": 582 }, { "epoch": 0.22612236982449335, "grad_norm": 2.9930291914978877, "learning_rate": 9.52355853388003e-06, "loss": 0.7772, "step": 583 }, { "epoch": 0.22651022980703966, "grad_norm": 3.913494303430372, "learning_rate": 9.520669915618708e-06, "loss": 0.8389, "step": 584 }, { "epoch": 0.22689808978958595, "grad_norm": 3.2743874029905657, "learning_rate": 9.5177730078923e-06, "loss": 0.8008, "step": 585 }, { "epoch": 0.22728594977213226, "grad_norm": 2.3831635659779833, "learning_rate": 9.514867816012809e-06, "loss": 0.7522, "step": 586 }, { "epoch": 0.22767380975467857, "grad_norm": 2.8840782832358407, "learning_rate": 9.511954345307432e-06, "loss": 0.7454, "step": 587 }, { "epoch": 0.22806166973722486, "grad_norm": 1.8997641147145607, "learning_rate": 9.509032601118541e-06, "loss": 0.7077, "step": 588 }, { "epoch": 0.22844952971977117, "grad_norm": 2.295430634914496, "learning_rate": 9.506102588803683e-06, "loss": 0.6751, "step": 589 }, { "epoch": 0.22883738970231746, "grad_norm": 3.0568116345994754, "learning_rate": 9.503164313735566e-06, "loss": 0.7415, "step": 590 }, { "epoch": 0.22922524968486377, "grad_norm": 2.3542961136299847, "learning_rate": 9.500217781302048e-06, "loss": 0.7813, "step": 591 }, { "epoch": 0.22961310966741005, "grad_norm": 2.4386472295301123, "learning_rate": 9.497262996906126e-06, "loss": 0.7122, "step": 592 }, { "epoch": 0.23000096964995637, "grad_norm": 2.2907954974513984, "learning_rate": 9.494299965965935e-06, "loss": 0.7494, "step": 593 }, { "epoch": 0.23038882963250268, "grad_norm": 2.4377272648731565, "learning_rate": 9.491328693914723e-06, "loss": 0.7656, "step": 594 }, { "epoch": 0.23077668961504896, "grad_norm": 2.418661650728181, "learning_rate": 9.488349186200858e-06, "loss": 0.7691, "step": 595 }, { "epoch": 0.23116454959759528, "grad_norm": 2.749376171816596, "learning_rate": 9.485361448287804e-06, "loss": 0.7537, "step": 596 }, { "epoch": 0.23155240958014156, "grad_norm": 3.0610217042984753, "learning_rate": 9.482365485654118e-06, "loss": 0.8264, "step": 597 }, { "epoch": 0.23194026956268787, "grad_norm": 2.3391320616425597, "learning_rate": 9.479361303793441e-06, "loss": 0.7792, "step": 598 }, { "epoch": 0.23232812954523416, "grad_norm": 2.0113890102500607, "learning_rate": 9.476348908214482e-06, "loss": 0.7199, "step": 599 }, { "epoch": 0.23271598952778047, "grad_norm": 2.0390399421387575, "learning_rate": 9.47332830444101e-06, "loss": 0.7134, "step": 600 }, { "epoch": 0.23310384951032678, "grad_norm": 2.7569717740111686, "learning_rate": 9.470299498011851e-06, "loss": 0.7647, "step": 601 }, { "epoch": 0.23349170949287307, "grad_norm": 2.4708907452153763, "learning_rate": 9.46726249448087e-06, "loss": 0.7719, "step": 602 }, { "epoch": 0.23387956947541938, "grad_norm": 3.371121974270879, "learning_rate": 9.464217299416956e-06, "loss": 0.723, "step": 603 }, { "epoch": 0.23426742945796566, "grad_norm": 3.057778697838886, "learning_rate": 9.46116391840403e-06, "loss": 0.7405, "step": 604 }, { "epoch": 0.23465528944051198, "grad_norm": 2.509330061221599, "learning_rate": 9.458102357041017e-06, "loss": 0.7565, "step": 605 }, { "epoch": 0.2350431494230583, "grad_norm": 3.1016139244926326, "learning_rate": 9.45503262094184e-06, "loss": 0.7142, "step": 606 }, { "epoch": 0.23543100940560457, "grad_norm": 2.534676412516658, "learning_rate": 9.451954715735416e-06, "loss": 0.7961, "step": 607 }, { "epoch": 0.2358188693881509, "grad_norm": 2.9749316974930338, "learning_rate": 9.448868647065644e-06, "loss": 0.8166, "step": 608 }, { "epoch": 0.23620672937069717, "grad_norm": 3.1274651672024083, "learning_rate": 9.445774420591382e-06, "loss": 0.7896, "step": 609 }, { "epoch": 0.23659458935324348, "grad_norm": 2.7422887479403366, "learning_rate": 9.442672041986456e-06, "loss": 0.8147, "step": 610 }, { "epoch": 0.23698244933578977, "grad_norm": 2.822042455439801, "learning_rate": 9.43956151693964e-06, "loss": 0.7877, "step": 611 }, { "epoch": 0.23737030931833608, "grad_norm": 2.1894608486707776, "learning_rate": 9.436442851154642e-06, "loss": 0.6992, "step": 612 }, { "epoch": 0.2377581693008824, "grad_norm": 3.1845398827562303, "learning_rate": 9.433316050350099e-06, "loss": 0.7641, "step": 613 }, { "epoch": 0.23814602928342868, "grad_norm": 2.908591786859172, "learning_rate": 9.430181120259566e-06, "loss": 0.818, "step": 614 }, { "epoch": 0.238533889265975, "grad_norm": 2.675783963839706, "learning_rate": 9.427038066631502e-06, "loss": 0.7841, "step": 615 }, { "epoch": 0.23892174924852128, "grad_norm": 2.438515067777562, "learning_rate": 9.423886895229266e-06, "loss": 0.7515, "step": 616 }, { "epoch": 0.2393096092310676, "grad_norm": 2.9147036775252935, "learning_rate": 9.420727611831098e-06, "loss": 0.7937, "step": 617 }, { "epoch": 0.23969746921361387, "grad_norm": 3.0015481030620195, "learning_rate": 9.417560222230115e-06, "loss": 0.7907, "step": 618 }, { "epoch": 0.24008532919616019, "grad_norm": 2.590763718109756, "learning_rate": 9.414384732234301e-06, "loss": 0.7378, "step": 619 }, { "epoch": 0.2404731891787065, "grad_norm": 2.780906876282198, "learning_rate": 9.411201147666486e-06, "loss": 0.7799, "step": 620 }, { "epoch": 0.24086104916125278, "grad_norm": 2.83506702311981, "learning_rate": 9.408009474364353e-06, "loss": 0.7808, "step": 621 }, { "epoch": 0.2412489091437991, "grad_norm": 2.966329197033137, "learning_rate": 9.404809718180408e-06, "loss": 0.804, "step": 622 }, { "epoch": 0.24163676912634538, "grad_norm": 2.823455489213936, "learning_rate": 9.401601884981983e-06, "loss": 0.7159, "step": 623 }, { "epoch": 0.2420246291088917, "grad_norm": 3.022131987573353, "learning_rate": 9.39838598065122e-06, "loss": 0.6995, "step": 624 }, { "epoch": 0.24241248909143798, "grad_norm": 2.9023727000034882, "learning_rate": 9.39516201108506e-06, "loss": 0.7218, "step": 625 }, { "epoch": 0.2428003490739843, "grad_norm": 2.279336216122714, "learning_rate": 9.391929982195233e-06, "loss": 0.6711, "step": 626 }, { "epoch": 0.2431882090565306, "grad_norm": 2.1665102170305763, "learning_rate": 9.38868989990825e-06, "loss": 0.7034, "step": 627 }, { "epoch": 0.2435760690390769, "grad_norm": 3.616798045240433, "learning_rate": 9.385441770165385e-06, "loss": 0.7933, "step": 628 }, { "epoch": 0.2439639290216232, "grad_norm": 3.6156144358456688, "learning_rate": 9.382185598922674e-06, "loss": 0.7995, "step": 629 }, { "epoch": 0.24435178900416948, "grad_norm": 2.638953330234751, "learning_rate": 9.378921392150893e-06, "loss": 0.7721, "step": 630 }, { "epoch": 0.2447396489867158, "grad_norm": 3.059165113361662, "learning_rate": 9.375649155835554e-06, "loss": 0.7608, "step": 631 }, { "epoch": 0.2451275089692621, "grad_norm": 2.6663715567799344, "learning_rate": 9.372368895976896e-06, "loss": 0.6673, "step": 632 }, { "epoch": 0.2455153689518084, "grad_norm": 2.3683233006769058, "learning_rate": 9.369080618589866e-06, "loss": 0.7153, "step": 633 }, { "epoch": 0.2459032289343547, "grad_norm": 3.204312563438914, "learning_rate": 9.365784329704114e-06, "loss": 0.809, "step": 634 }, { "epoch": 0.246291088916901, "grad_norm": 4.928639200998714, "learning_rate": 9.362480035363987e-06, "loss": 0.9288, "step": 635 }, { "epoch": 0.2466789488994473, "grad_norm": 2.257994224914641, "learning_rate": 9.3591677416285e-06, "loss": 0.7475, "step": 636 }, { "epoch": 0.2470668088819936, "grad_norm": 2.853348641841805, "learning_rate": 9.35584745457134e-06, "loss": 0.8101, "step": 637 }, { "epoch": 0.2474546688645399, "grad_norm": 3.6839577845827667, "learning_rate": 9.352519180280862e-06, "loss": 0.7545, "step": 638 }, { "epoch": 0.2478425288470862, "grad_norm": 3.336915535608388, "learning_rate": 9.34918292486005e-06, "loss": 0.8133, "step": 639 }, { "epoch": 0.2482303888296325, "grad_norm": 3.0432812150089097, "learning_rate": 9.345838694426535e-06, "loss": 0.7401, "step": 640 }, { "epoch": 0.2486182488121788, "grad_norm": 3.631914405190385, "learning_rate": 9.342486495112566e-06, "loss": 0.7868, "step": 641 }, { "epoch": 0.2490061087947251, "grad_norm": 2.098322054281513, "learning_rate": 9.339126333065008e-06, "loss": 0.8003, "step": 642 }, { "epoch": 0.2493939687772714, "grad_norm": 2.2778303820722927, "learning_rate": 9.335758214445323e-06, "loss": 0.6993, "step": 643 }, { "epoch": 0.2497818287598177, "grad_norm": 2.43262341243365, "learning_rate": 9.332382145429568e-06, "loss": 0.7416, "step": 644 }, { "epoch": 0.250169688742364, "grad_norm": 2.9286544289229073, "learning_rate": 9.328998132208373e-06, "loss": 0.7942, "step": 645 }, { "epoch": 0.2505575487249103, "grad_norm": 2.807868905735795, "learning_rate": 9.325606180986938e-06, "loss": 0.8021, "step": 646 }, { "epoch": 0.25094540870745663, "grad_norm": 2.5857727795359633, "learning_rate": 9.32220629798502e-06, "loss": 0.7485, "step": 647 }, { "epoch": 0.2513332686900029, "grad_norm": 2.7385663518214014, "learning_rate": 9.318798489436917e-06, "loss": 0.6908, "step": 648 }, { "epoch": 0.2517211286725492, "grad_norm": 2.3185698486113644, "learning_rate": 9.315382761591463e-06, "loss": 0.7512, "step": 649 }, { "epoch": 0.2521089886550955, "grad_norm": 2.315252809110992, "learning_rate": 9.311959120712012e-06, "loss": 0.7596, "step": 650 }, { "epoch": 0.2524968486376418, "grad_norm": 2.4479698450904555, "learning_rate": 9.308527573076425e-06, "loss": 0.7349, "step": 651 }, { "epoch": 0.25288470862018814, "grad_norm": 3.3196647560576498, "learning_rate": 9.30508812497707e-06, "loss": 0.789, "step": 652 }, { "epoch": 0.2532725686027344, "grad_norm": 3.306838083837848, "learning_rate": 9.301640782720792e-06, "loss": 0.8202, "step": 653 }, { "epoch": 0.2536604285852807, "grad_norm": 3.1795546725076593, "learning_rate": 9.298185552628917e-06, "loss": 0.7174, "step": 654 }, { "epoch": 0.254048288567827, "grad_norm": 2.6852200584353394, "learning_rate": 9.294722441037238e-06, "loss": 0.7395, "step": 655 }, { "epoch": 0.25443614855037333, "grad_norm": 2.9827017265964106, "learning_rate": 9.291251454295989e-06, "loss": 0.7231, "step": 656 }, { "epoch": 0.25482400853291964, "grad_norm": 2.6467130334876616, "learning_rate": 9.287772598769855e-06, "loss": 0.7476, "step": 657 }, { "epoch": 0.2552118685154659, "grad_norm": 1.993762631975412, "learning_rate": 9.284285880837947e-06, "loss": 0.7434, "step": 658 }, { "epoch": 0.2555997284980122, "grad_norm": 2.7025107050739208, "learning_rate": 9.28079130689379e-06, "loss": 0.7359, "step": 659 }, { "epoch": 0.2559875884805585, "grad_norm": 2.7185413118597874, "learning_rate": 9.277288883345318e-06, "loss": 0.7561, "step": 660 }, { "epoch": 0.25637544846310484, "grad_norm": 3.1475098355044993, "learning_rate": 9.273778616614857e-06, "loss": 0.7199, "step": 661 }, { "epoch": 0.2567633084456511, "grad_norm": 2.2062647625616507, "learning_rate": 9.270260513139116e-06, "loss": 0.7155, "step": 662 }, { "epoch": 0.2571511684281974, "grad_norm": 2.959976606156566, "learning_rate": 9.266734579369172e-06, "loss": 0.7944, "step": 663 }, { "epoch": 0.2575390284107437, "grad_norm": 2.6975561727329413, "learning_rate": 9.263200821770462e-06, "loss": 0.7862, "step": 664 }, { "epoch": 0.25792688839329003, "grad_norm": 2.9877679671413753, "learning_rate": 9.25965924682277e-06, "loss": 0.8391, "step": 665 }, { "epoch": 0.25831474837583634, "grad_norm": 1.93861970345144, "learning_rate": 9.256109861020213e-06, "loss": 0.6419, "step": 666 }, { "epoch": 0.2587026083583826, "grad_norm": 2.6295653187029115, "learning_rate": 9.252552670871232e-06, "loss": 0.7883, "step": 667 }, { "epoch": 0.2590904683409289, "grad_norm": 2.221548884579689, "learning_rate": 9.248987682898576e-06, "loss": 0.7416, "step": 668 }, { "epoch": 0.2594783283234752, "grad_norm": 2.5198415645984835, "learning_rate": 9.245414903639295e-06, "loss": 0.7419, "step": 669 }, { "epoch": 0.25986618830602154, "grad_norm": 2.2449464657014855, "learning_rate": 9.241834339644726e-06, "loss": 0.7443, "step": 670 }, { "epoch": 0.26025404828856785, "grad_norm": 3.673804909435071, "learning_rate": 9.23824599748048e-06, "loss": 0.8635, "step": 671 }, { "epoch": 0.2606419082711141, "grad_norm": 2.6592132659450143, "learning_rate": 9.234649883726432e-06, "loss": 0.7801, "step": 672 }, { "epoch": 0.2610297682536604, "grad_norm": 2.633877704510565, "learning_rate": 9.231046004976704e-06, "loss": 0.8006, "step": 673 }, { "epoch": 0.26141762823620673, "grad_norm": 3.33510083948498, "learning_rate": 9.22743436783966e-06, "loss": 0.7581, "step": 674 }, { "epoch": 0.26180548821875305, "grad_norm": 2.549186957450707, "learning_rate": 9.223814978937888e-06, "loss": 0.7109, "step": 675 }, { "epoch": 0.2621933482012993, "grad_norm": 3.8089020037032584, "learning_rate": 9.220187844908194e-06, "loss": 0.7894, "step": 676 }, { "epoch": 0.2625812081838456, "grad_norm": 3.2825119414060957, "learning_rate": 9.216552972401582e-06, "loss": 0.7857, "step": 677 }, { "epoch": 0.26296906816639193, "grad_norm": 2.9409998346952926, "learning_rate": 9.212910368083246e-06, "loss": 0.7666, "step": 678 }, { "epoch": 0.26335692814893824, "grad_norm": 2.5405797875642206, "learning_rate": 9.209260038632562e-06, "loss": 0.7097, "step": 679 }, { "epoch": 0.26374478813148455, "grad_norm": 2.511185745349081, "learning_rate": 9.205601990743068e-06, "loss": 0.7502, "step": 680 }, { "epoch": 0.2641326481140308, "grad_norm": 3.161865250912108, "learning_rate": 9.201936231122453e-06, "loss": 0.8719, "step": 681 }, { "epoch": 0.2645205080965771, "grad_norm": 3.070437508698134, "learning_rate": 9.198262766492554e-06, "loss": 0.7255, "step": 682 }, { "epoch": 0.26490836807912344, "grad_norm": 2.0470751524184108, "learning_rate": 9.194581603589327e-06, "loss": 0.7005, "step": 683 }, { "epoch": 0.26529622806166975, "grad_norm": 2.736142864141928, "learning_rate": 9.190892749162854e-06, "loss": 0.7304, "step": 684 }, { "epoch": 0.26568408804421606, "grad_norm": 2.436194573016057, "learning_rate": 9.187196209977314e-06, "loss": 0.7211, "step": 685 }, { "epoch": 0.2660719480267623, "grad_norm": 3.369382504809839, "learning_rate": 9.18349199281098e-06, "loss": 0.7244, "step": 686 }, { "epoch": 0.26645980800930863, "grad_norm": 2.176940704670179, "learning_rate": 9.179780104456205e-06, "loss": 0.7755, "step": 687 }, { "epoch": 0.26684766799185494, "grad_norm": 2.1060300508494096, "learning_rate": 9.176060551719402e-06, "loss": 0.7992, "step": 688 }, { "epoch": 0.26723552797440125, "grad_norm": 2.7428084427158557, "learning_rate": 9.172333341421046e-06, "loss": 0.7216, "step": 689 }, { "epoch": 0.26762338795694757, "grad_norm": 3.8720989585713808, "learning_rate": 9.168598480395653e-06, "loss": 0.8541, "step": 690 }, { "epoch": 0.2680112479394938, "grad_norm": 3.014755341300966, "learning_rate": 9.16485597549176e-06, "loss": 0.8433, "step": 691 }, { "epoch": 0.26839910792204014, "grad_norm": 3.196564522442207, "learning_rate": 9.16110583357193e-06, "loss": 0.805, "step": 692 }, { "epoch": 0.26878696790458645, "grad_norm": 3.604622065697069, "learning_rate": 9.157348061512728e-06, "loss": 0.7311, "step": 693 }, { "epoch": 0.26917482788713276, "grad_norm": 2.336906645108881, "learning_rate": 9.153582666204702e-06, "loss": 0.6974, "step": 694 }, { "epoch": 0.269562687869679, "grad_norm": 2.218171775163523, "learning_rate": 9.149809654552387e-06, "loss": 0.7637, "step": 695 }, { "epoch": 0.26995054785222533, "grad_norm": 2.3329241685568074, "learning_rate": 9.146029033474284e-06, "loss": 0.6845, "step": 696 }, { "epoch": 0.27033840783477164, "grad_norm": 2.6436343246678224, "learning_rate": 9.142240809902841e-06, "loss": 0.6607, "step": 697 }, { "epoch": 0.27072626781731796, "grad_norm": 2.3349008188103237, "learning_rate": 9.138444990784455e-06, "loss": 0.7386, "step": 698 }, { "epoch": 0.27111412779986427, "grad_norm": 1.9959922399613235, "learning_rate": 9.13464158307944e-06, "loss": 0.7026, "step": 699 }, { "epoch": 0.2715019877824105, "grad_norm": 2.457989637295468, "learning_rate": 9.130830593762037e-06, "loss": 0.7288, "step": 700 }, { "epoch": 0.27188984776495684, "grad_norm": 2.971727172482458, "learning_rate": 9.12701202982038e-06, "loss": 0.7525, "step": 701 }, { "epoch": 0.27227770774750315, "grad_norm": 2.399041037991305, "learning_rate": 9.123185898256497e-06, "loss": 0.6894, "step": 702 }, { "epoch": 0.27266556773004946, "grad_norm": 2.7559650298525904, "learning_rate": 9.119352206086292e-06, "loss": 0.7587, "step": 703 }, { "epoch": 0.2730534277125958, "grad_norm": 3.0617519673239366, "learning_rate": 9.115510960339533e-06, "loss": 0.7265, "step": 704 }, { "epoch": 0.27344128769514203, "grad_norm": 2.321892778799493, "learning_rate": 9.111662168059836e-06, "loss": 0.7765, "step": 705 }, { "epoch": 0.27382914767768834, "grad_norm": 2.3345603691414007, "learning_rate": 9.107805836304658e-06, "loss": 0.7116, "step": 706 }, { "epoch": 0.27421700766023466, "grad_norm": 3.309174518833764, "learning_rate": 9.10394197214528e-06, "loss": 0.739, "step": 707 }, { "epoch": 0.27460486764278097, "grad_norm": 2.600192083868772, "learning_rate": 9.100070582666796e-06, "loss": 0.7417, "step": 708 }, { "epoch": 0.2749927276253273, "grad_norm": 2.2394025136475544, "learning_rate": 9.096191674968095e-06, "loss": 0.6998, "step": 709 }, { "epoch": 0.27538058760787354, "grad_norm": 3.8848664479555715, "learning_rate": 9.09230525616186e-06, "loss": 0.8352, "step": 710 }, { "epoch": 0.27576844759041985, "grad_norm": 2.775313049519924, "learning_rate": 9.088411333374539e-06, "loss": 0.762, "step": 711 }, { "epoch": 0.27615630757296616, "grad_norm": 2.558864837626251, "learning_rate": 9.084509913746342e-06, "loss": 0.7434, "step": 712 }, { "epoch": 0.2765441675555125, "grad_norm": 2.5895538319044182, "learning_rate": 9.08060100443123e-06, "loss": 0.7497, "step": 713 }, { "epoch": 0.27693202753805873, "grad_norm": 3.3489462822895137, "learning_rate": 9.076684612596891e-06, "loss": 0.8176, "step": 714 }, { "epoch": 0.27731988752060505, "grad_norm": 2.086026735854734, "learning_rate": 9.07276074542474e-06, "loss": 0.7173, "step": 715 }, { "epoch": 0.27770774750315136, "grad_norm": 2.64825308042557, "learning_rate": 9.068829410109893e-06, "loss": 0.7029, "step": 716 }, { "epoch": 0.27809560748569767, "grad_norm": 2.6561980937138983, "learning_rate": 9.064890613861168e-06, "loss": 0.7058, "step": 717 }, { "epoch": 0.278483467468244, "grad_norm": 2.1013220386734743, "learning_rate": 9.060944363901057e-06, "loss": 0.6971, "step": 718 }, { "epoch": 0.27887132745079024, "grad_norm": 2.854427842577432, "learning_rate": 9.05699066746572e-06, "loss": 0.78, "step": 719 }, { "epoch": 0.27925918743333655, "grad_norm": 1.958471713828364, "learning_rate": 9.05302953180498e-06, "loss": 0.7421, "step": 720 }, { "epoch": 0.27964704741588287, "grad_norm": 2.6816089097902283, "learning_rate": 9.04906096418229e-06, "loss": 0.748, "step": 721 }, { "epoch": 0.2800349073984292, "grad_norm": 2.6013847801154086, "learning_rate": 9.045084971874738e-06, "loss": 0.7831, "step": 722 }, { "epoch": 0.2804227673809755, "grad_norm": 2.1825829412159785, "learning_rate": 9.041101562173023e-06, "loss": 0.7532, "step": 723 }, { "epoch": 0.28081062736352175, "grad_norm": 2.530815907571049, "learning_rate": 9.037110742381445e-06, "loss": 0.7671, "step": 724 }, { "epoch": 0.28119848734606806, "grad_norm": 2.3038904546096473, "learning_rate": 9.033112519817897e-06, "loss": 0.7035, "step": 725 }, { "epoch": 0.2815863473286144, "grad_norm": 2.184750141796278, "learning_rate": 9.02910690181384e-06, "loss": 0.7008, "step": 726 }, { "epoch": 0.2819742073111607, "grad_norm": 2.3923386250173797, "learning_rate": 9.0250938957143e-06, "loss": 0.7636, "step": 727 }, { "epoch": 0.282362067293707, "grad_norm": 2.1683768269838506, "learning_rate": 9.021073508877845e-06, "loss": 0.7208, "step": 728 }, { "epoch": 0.28274992727625325, "grad_norm": 3.7946628246565917, "learning_rate": 9.017045748676584e-06, "loss": 0.7566, "step": 729 }, { "epoch": 0.28313778725879957, "grad_norm": 3.193737570063567, "learning_rate": 9.013010622496145e-06, "loss": 0.7473, "step": 730 }, { "epoch": 0.2835256472413459, "grad_norm": 2.5161580340283134, "learning_rate": 9.008968137735655e-06, "loss": 0.7513, "step": 731 }, { "epoch": 0.2839135072238922, "grad_norm": 5.048807568738143, "learning_rate": 9.004918301807746e-06, "loss": 0.7805, "step": 732 }, { "epoch": 0.28430136720643845, "grad_norm": 2.88381141970992, "learning_rate": 9.000861122138518e-06, "loss": 0.7251, "step": 733 }, { "epoch": 0.28468922718898476, "grad_norm": 2.6084060853973803, "learning_rate": 8.996796606167549e-06, "loss": 0.6564, "step": 734 }, { "epoch": 0.2850770871715311, "grad_norm": 3.0349357282287386, "learning_rate": 8.99272476134786e-06, "loss": 0.7695, "step": 735 }, { "epoch": 0.2854649471540774, "grad_norm": 2.537994211532318, "learning_rate": 8.988645595145913e-06, "loss": 0.7359, "step": 736 }, { "epoch": 0.2858528071366237, "grad_norm": 2.2245401326224137, "learning_rate": 8.9845591150416e-06, "loss": 0.7734, "step": 737 }, { "epoch": 0.28624066711916996, "grad_norm": 2.7014640179436946, "learning_rate": 8.98046532852822e-06, "loss": 0.752, "step": 738 }, { "epoch": 0.28662852710171627, "grad_norm": 2.7016045699269786, "learning_rate": 8.976364243112468e-06, "loss": 0.7211, "step": 739 }, { "epoch": 0.2870163870842626, "grad_norm": 1.7448022653625974, "learning_rate": 8.972255866314425e-06, "loss": 0.7161, "step": 740 }, { "epoch": 0.2874042470668089, "grad_norm": 2.534233623882544, "learning_rate": 8.968140205667544e-06, "loss": 0.6422, "step": 741 }, { "epoch": 0.2877921070493552, "grad_norm": 2.5351445525118033, "learning_rate": 8.964017268718632e-06, "loss": 0.7556, "step": 742 }, { "epoch": 0.28817996703190146, "grad_norm": 3.457593732459978, "learning_rate": 8.959887063027837e-06, "loss": 0.8428, "step": 743 }, { "epoch": 0.2885678270144478, "grad_norm": 3.2345573485989165, "learning_rate": 8.95574959616864e-06, "loss": 0.8437, "step": 744 }, { "epoch": 0.2889556869969941, "grad_norm": 2.928835280406535, "learning_rate": 8.951604875727833e-06, "loss": 0.7117, "step": 745 }, { "epoch": 0.2893435469795404, "grad_norm": 2.6809132677773087, "learning_rate": 8.94745290930551e-06, "loss": 0.7601, "step": 746 }, { "epoch": 0.2897314069620867, "grad_norm": 2.5777431564301905, "learning_rate": 8.94329370451505e-06, "loss": 0.7167, "step": 747 }, { "epoch": 0.29011926694463297, "grad_norm": 2.415968782544957, "learning_rate": 8.93912726898311e-06, "loss": 0.6626, "step": 748 }, { "epoch": 0.2905071269271793, "grad_norm": 2.3271501925724603, "learning_rate": 8.934953610349599e-06, "loss": 0.6933, "step": 749 }, { "epoch": 0.2908949869097256, "grad_norm": 2.3915518002889367, "learning_rate": 8.930772736267675e-06, "loss": 0.7474, "step": 750 }, { "epoch": 0.2912828468922719, "grad_norm": 2.067544025683592, "learning_rate": 8.926584654403725e-06, "loss": 0.6895, "step": 751 }, { "epoch": 0.29167070687481816, "grad_norm": 2.863236715909018, "learning_rate": 8.922389372437357e-06, "loss": 0.7341, "step": 752 }, { "epoch": 0.2920585668573645, "grad_norm": 2.7549687578962927, "learning_rate": 8.918186898061377e-06, "loss": 0.6991, "step": 753 }, { "epoch": 0.2924464268399108, "grad_norm": 2.263846853230215, "learning_rate": 8.91397723898178e-06, "loss": 0.7073, "step": 754 }, { "epoch": 0.2928342868224571, "grad_norm": 2.151652016383664, "learning_rate": 8.909760402917738e-06, "loss": 0.7568, "step": 755 }, { "epoch": 0.2932221468050034, "grad_norm": 2.8008060674980317, "learning_rate": 8.90553639760158e-06, "loss": 0.8454, "step": 756 }, { "epoch": 0.29361000678754967, "grad_norm": 2.3461305216973978, "learning_rate": 8.901305230778783e-06, "loss": 0.7523, "step": 757 }, { "epoch": 0.293997866770096, "grad_norm": 2.3907862247485734, "learning_rate": 8.897066910207958e-06, "loss": 0.7055, "step": 758 }, { "epoch": 0.2943857267526423, "grad_norm": 1.9816647322253584, "learning_rate": 8.892821443660831e-06, "loss": 0.6255, "step": 759 }, { "epoch": 0.2947735867351886, "grad_norm": 3.2916835114156426, "learning_rate": 8.888568838922231e-06, "loss": 0.7634, "step": 760 }, { "epoch": 0.2951614467177349, "grad_norm": 2.801135280943176, "learning_rate": 8.884309103790078e-06, "loss": 0.7006, "step": 761 }, { "epoch": 0.2955493067002812, "grad_norm": 2.2715019527095195, "learning_rate": 8.880042246075366e-06, "loss": 0.7285, "step": 762 }, { "epoch": 0.2959371666828275, "grad_norm": 2.176356170383367, "learning_rate": 8.875768273602148e-06, "loss": 0.6913, "step": 763 }, { "epoch": 0.2963250266653738, "grad_norm": 2.0466214615938494, "learning_rate": 8.871487194207527e-06, "loss": 0.5967, "step": 764 }, { "epoch": 0.2967128866479201, "grad_norm": 1.9609065445996567, "learning_rate": 8.867199015741632e-06, "loss": 0.7562, "step": 765 }, { "epoch": 0.29710074663046643, "grad_norm": 2.323489770827756, "learning_rate": 8.862903746067619e-06, "loss": 0.6787, "step": 766 }, { "epoch": 0.2974886066130127, "grad_norm": 3.2374781221921753, "learning_rate": 8.858601393061634e-06, "loss": 0.7692, "step": 767 }, { "epoch": 0.297876466595559, "grad_norm": 2.0419045705398493, "learning_rate": 8.854291964612824e-06, "loss": 0.6715, "step": 768 }, { "epoch": 0.2982643265781053, "grad_norm": 2.032832505220214, "learning_rate": 8.849975468623302e-06, "loss": 0.7058, "step": 769 }, { "epoch": 0.2986521865606516, "grad_norm": 2.2163703164224824, "learning_rate": 8.845651913008145e-06, "loss": 0.7577, "step": 770 }, { "epoch": 0.2990400465431979, "grad_norm": 2.4913190114881867, "learning_rate": 8.841321305695372e-06, "loss": 0.7129, "step": 771 }, { "epoch": 0.2994279065257442, "grad_norm": 2.750440861061979, "learning_rate": 8.836983654625934e-06, "loss": 0.7457, "step": 772 }, { "epoch": 0.2998157665082905, "grad_norm": 2.231616020605991, "learning_rate": 8.832638967753699e-06, "loss": 0.746, "step": 773 }, { "epoch": 0.3002036264908368, "grad_norm": 2.344866647749361, "learning_rate": 8.828287253045436e-06, "loss": 0.7518, "step": 774 }, { "epoch": 0.30059148647338313, "grad_norm": 3.4045761983756564, "learning_rate": 8.823928518480797e-06, "loss": 0.8256, "step": 775 }, { "epoch": 0.3009793464559294, "grad_norm": 2.0303829994325455, "learning_rate": 8.819562772052312e-06, "loss": 0.704, "step": 776 }, { "epoch": 0.3013672064384757, "grad_norm": 2.7956705358217593, "learning_rate": 8.815190021765365e-06, "loss": 0.7129, "step": 777 }, { "epoch": 0.301755066421022, "grad_norm": 3.064810181515184, "learning_rate": 8.810810275638183e-06, "loss": 0.8076, "step": 778 }, { "epoch": 0.3021429264035683, "grad_norm": 2.2019951121068333, "learning_rate": 8.806423541701824e-06, "loss": 0.7108, "step": 779 }, { "epoch": 0.30253078638611464, "grad_norm": 3.1977244083984617, "learning_rate": 8.802029828000157e-06, "loss": 0.8017, "step": 780 }, { "epoch": 0.3029186463686609, "grad_norm": 2.556808695551486, "learning_rate": 8.797629142589846e-06, "loss": 0.7244, "step": 781 }, { "epoch": 0.3033065063512072, "grad_norm": 2.726299168394344, "learning_rate": 8.793221493540347e-06, "loss": 0.7756, "step": 782 }, { "epoch": 0.3036943663337535, "grad_norm": 1.8738342832595527, "learning_rate": 8.788806888933881e-06, "loss": 0.7109, "step": 783 }, { "epoch": 0.30408222631629983, "grad_norm": 2.53557692975913, "learning_rate": 8.784385336865419e-06, "loss": 0.6634, "step": 784 }, { "epoch": 0.30447008629884614, "grad_norm": 2.5775962680624436, "learning_rate": 8.779956845442682e-06, "loss": 0.7287, "step": 785 }, { "epoch": 0.3048579462813924, "grad_norm": 2.6469996459205114, "learning_rate": 8.775521422786104e-06, "loss": 0.7447, "step": 786 }, { "epoch": 0.3052458062639387, "grad_norm": 2.7703846670503216, "learning_rate": 8.771079077028836e-06, "loss": 0.7321, "step": 787 }, { "epoch": 0.305633666246485, "grad_norm": 2.376611143098289, "learning_rate": 8.766629816316722e-06, "loss": 0.7857, "step": 788 }, { "epoch": 0.30602152622903134, "grad_norm": 2.7386938598723636, "learning_rate": 8.762173648808283e-06, "loss": 0.718, "step": 789 }, { "epoch": 0.3064093862115776, "grad_norm": 2.3140607630751386, "learning_rate": 8.757710582674708e-06, "loss": 0.7136, "step": 790 }, { "epoch": 0.3067972461941239, "grad_norm": 1.9706507839134786, "learning_rate": 8.753240626099836e-06, "loss": 0.7174, "step": 791 }, { "epoch": 0.3071851061766702, "grad_norm": 3.1682089308764185, "learning_rate": 8.748763787280142e-06, "loss": 0.7645, "step": 792 }, { "epoch": 0.30757296615921653, "grad_norm": 3.7685504106779546, "learning_rate": 8.744280074424713e-06, "loss": 0.7548, "step": 793 }, { "epoch": 0.30796082614176284, "grad_norm": 2.399211825076734, "learning_rate": 8.739789495755254e-06, "loss": 0.6446, "step": 794 }, { "epoch": 0.3083486861243091, "grad_norm": 2.5182764790410084, "learning_rate": 8.735292059506047e-06, "loss": 0.7217, "step": 795 }, { "epoch": 0.3087365461068554, "grad_norm": 2.6469519601787344, "learning_rate": 8.730787773923957e-06, "loss": 0.7451, "step": 796 }, { "epoch": 0.3091244060894017, "grad_norm": 3.1869535376830536, "learning_rate": 8.726276647268403e-06, "loss": 0.7058, "step": 797 }, { "epoch": 0.30951226607194804, "grad_norm": 2.5805701533851964, "learning_rate": 8.721758687811353e-06, "loss": 0.7074, "step": 798 }, { "epoch": 0.30990012605449435, "grad_norm": 2.732193985068239, "learning_rate": 8.717233903837298e-06, "loss": 0.7113, "step": 799 }, { "epoch": 0.3102879860370406, "grad_norm": 2.1122453582191545, "learning_rate": 8.712702303643254e-06, "loss": 0.7828, "step": 800 }, { "epoch": 0.3106758460195869, "grad_norm": 2.976383108666941, "learning_rate": 8.708163895538722e-06, "loss": 0.7963, "step": 801 }, { "epoch": 0.31106370600213323, "grad_norm": 2.3939634968796533, "learning_rate": 8.703618687845697e-06, "loss": 0.7251, "step": 802 }, { "epoch": 0.31145156598467955, "grad_norm": 2.5031384725035815, "learning_rate": 8.699066688898636e-06, "loss": 0.7873, "step": 803 }, { "epoch": 0.31183942596722586, "grad_norm": 2.198068979425093, "learning_rate": 8.694507907044454e-06, "loss": 0.7056, "step": 804 }, { "epoch": 0.3122272859497721, "grad_norm": 2.3521534061134215, "learning_rate": 8.6899423506425e-06, "loss": 0.7423, "step": 805 }, { "epoch": 0.31261514593231843, "grad_norm": 3.435707833142974, "learning_rate": 8.685370028064546e-06, "loss": 0.826, "step": 806 }, { "epoch": 0.31300300591486474, "grad_norm": 2.95479350290203, "learning_rate": 8.680790947694772e-06, "loss": 0.7655, "step": 807 }, { "epoch": 0.31339086589741105, "grad_norm": 2.133319060821998, "learning_rate": 8.676205117929752e-06, "loss": 0.6814, "step": 808 }, { "epoch": 0.3137787258799573, "grad_norm": 2.861729332806412, "learning_rate": 8.671612547178428e-06, "loss": 0.7116, "step": 809 }, { "epoch": 0.3141665858625036, "grad_norm": 2.6699721156043705, "learning_rate": 8.667013243862113e-06, "loss": 0.7842, "step": 810 }, { "epoch": 0.31455444584504993, "grad_norm": 2.4656166261774204, "learning_rate": 8.66240721641446e-06, "loss": 0.7303, "step": 811 }, { "epoch": 0.31494230582759625, "grad_norm": 2.4508744643033946, "learning_rate": 8.657794473281447e-06, "loss": 0.7213, "step": 812 }, { "epoch": 0.31533016581014256, "grad_norm": 2.016421573069749, "learning_rate": 8.65317502292138e-06, "loss": 0.684, "step": 813 }, { "epoch": 0.3157180257926888, "grad_norm": 3.297733963046924, "learning_rate": 8.64854887380485e-06, "loss": 0.7911, "step": 814 }, { "epoch": 0.31610588577523513, "grad_norm": 1.919512117516477, "learning_rate": 8.643916034414741e-06, "loss": 0.6959, "step": 815 }, { "epoch": 0.31649374575778144, "grad_norm": 2.970312427544644, "learning_rate": 8.639276513246199e-06, "loss": 0.6738, "step": 816 }, { "epoch": 0.31688160574032775, "grad_norm": 2.5792268183792, "learning_rate": 8.634630318806626e-06, "loss": 0.7383, "step": 817 }, { "epoch": 0.31726946572287407, "grad_norm": 2.6569441088146823, "learning_rate": 8.629977459615655e-06, "loss": 0.7316, "step": 818 }, { "epoch": 0.3176573257054203, "grad_norm": 2.582555145446032, "learning_rate": 8.62531794420515e-06, "loss": 0.7292, "step": 819 }, { "epoch": 0.31804518568796664, "grad_norm": 2.3462963853335377, "learning_rate": 8.620651781119169e-06, "loss": 0.7201, "step": 820 }, { "epoch": 0.31843304567051295, "grad_norm": 2.225439115243207, "learning_rate": 8.615978978913968e-06, "loss": 0.6673, "step": 821 }, { "epoch": 0.31882090565305926, "grad_norm": 2.543733446029756, "learning_rate": 8.611299546157973e-06, "loss": 0.7524, "step": 822 }, { "epoch": 0.3192087656356056, "grad_norm": 3.074184773622074, "learning_rate": 8.60661349143177e-06, "loss": 0.6997, "step": 823 }, { "epoch": 0.31959662561815183, "grad_norm": 2.1330492720365646, "learning_rate": 8.601920823328088e-06, "loss": 0.7002, "step": 824 }, { "epoch": 0.31998448560069814, "grad_norm": 2.696563598095336, "learning_rate": 8.59722155045178e-06, "loss": 0.7221, "step": 825 }, { "epoch": 0.32037234558324446, "grad_norm": 3.3217161855029413, "learning_rate": 8.592515681419812e-06, "loss": 0.786, "step": 826 }, { "epoch": 0.32076020556579077, "grad_norm": 2.6265766912330086, "learning_rate": 8.587803224861248e-06, "loss": 0.7529, "step": 827 }, { "epoch": 0.321148065548337, "grad_norm": 1.9773264296113338, "learning_rate": 8.583084189417225e-06, "loss": 0.6865, "step": 828 }, { "epoch": 0.32153592553088334, "grad_norm": 2.6909190688766746, "learning_rate": 8.578358583740947e-06, "loss": 0.8116, "step": 829 }, { "epoch": 0.32192378551342965, "grad_norm": 2.817589522612703, "learning_rate": 8.573626416497669e-06, "loss": 0.7777, "step": 830 }, { "epoch": 0.32231164549597596, "grad_norm": 2.864623834752134, "learning_rate": 8.568887696364673e-06, "loss": 0.7174, "step": 831 }, { "epoch": 0.3226995054785223, "grad_norm": 2.685909989421781, "learning_rate": 8.564142432031257e-06, "loss": 0.7428, "step": 832 }, { "epoch": 0.32308736546106853, "grad_norm": 2.387231923870693, "learning_rate": 8.559390632198723e-06, "loss": 0.7501, "step": 833 }, { "epoch": 0.32347522544361484, "grad_norm": 2.8593289274153313, "learning_rate": 8.554632305580355e-06, "loss": 0.7234, "step": 834 }, { "epoch": 0.32386308542616116, "grad_norm": 2.847104889955985, "learning_rate": 8.549867460901402e-06, "loss": 0.7039, "step": 835 }, { "epoch": 0.32425094540870747, "grad_norm": 2.9378500713627607, "learning_rate": 8.545096106899068e-06, "loss": 0.7028, "step": 836 }, { "epoch": 0.3246388053912538, "grad_norm": 3.3248570431737487, "learning_rate": 8.540318252322493e-06, "loss": 0.8112, "step": 837 }, { "epoch": 0.32502666537380004, "grad_norm": 2.9037606739725668, "learning_rate": 8.535533905932739e-06, "loss": 0.7328, "step": 838 }, { "epoch": 0.32541452535634635, "grad_norm": 2.385722182232819, "learning_rate": 8.530743076502766e-06, "loss": 0.7541, "step": 839 }, { "epoch": 0.32580238533889266, "grad_norm": 2.6844790387631696, "learning_rate": 8.525945772817427e-06, "loss": 0.7701, "step": 840 }, { "epoch": 0.326190245321439, "grad_norm": 2.2003861218767025, "learning_rate": 8.521142003673447e-06, "loss": 0.7031, "step": 841 }, { "epoch": 0.3265781053039853, "grad_norm": 3.1427679016581975, "learning_rate": 8.5163317778794e-06, "loss": 0.7426, "step": 842 }, { "epoch": 0.32696596528653155, "grad_norm": 2.5472642463649633, "learning_rate": 8.51151510425571e-06, "loss": 0.7108, "step": 843 }, { "epoch": 0.32735382526907786, "grad_norm": 2.030392272023427, "learning_rate": 8.506691991634612e-06, "loss": 0.6197, "step": 844 }, { "epoch": 0.32774168525162417, "grad_norm": 2.189728767382852, "learning_rate": 8.501862448860159e-06, "loss": 0.7567, "step": 845 }, { "epoch": 0.3281295452341705, "grad_norm": 2.842551148915621, "learning_rate": 8.497026484788189e-06, "loss": 0.7992, "step": 846 }, { "epoch": 0.32851740521671674, "grad_norm": 2.577756479701135, "learning_rate": 8.492184108286316e-06, "loss": 0.7318, "step": 847 }, { "epoch": 0.32890526519926305, "grad_norm": 1.8876421178303318, "learning_rate": 8.487335328233912e-06, "loss": 0.7225, "step": 848 }, { "epoch": 0.32929312518180937, "grad_norm": 2.5126724837484993, "learning_rate": 8.48248015352209e-06, "loss": 0.7088, "step": 849 }, { "epoch": 0.3296809851643557, "grad_norm": 2.429044644428857, "learning_rate": 8.477618593053693e-06, "loss": 0.7459, "step": 850 }, { "epoch": 0.330068845146902, "grad_norm": 2.040254015591146, "learning_rate": 8.47275065574327e-06, "loss": 0.6775, "step": 851 }, { "epoch": 0.33045670512944825, "grad_norm": 2.2950927559693413, "learning_rate": 8.46787635051706e-06, "loss": 0.7139, "step": 852 }, { "epoch": 0.33084456511199456, "grad_norm": 2.4120721281442044, "learning_rate": 8.462995686312985e-06, "loss": 0.6806, "step": 853 }, { "epoch": 0.33123242509454087, "grad_norm": 2.0596452846704554, "learning_rate": 8.458108672080624e-06, "loss": 0.6915, "step": 854 }, { "epoch": 0.3316202850770872, "grad_norm": 2.3849596031195457, "learning_rate": 8.453215316781205e-06, "loss": 0.7711, "step": 855 }, { "epoch": 0.3320081450596335, "grad_norm": 2.812848053708224, "learning_rate": 8.448315629387572e-06, "loss": 0.7395, "step": 856 }, { "epoch": 0.33239600504217975, "grad_norm": 2.472032295440724, "learning_rate": 8.44340961888419e-06, "loss": 0.6872, "step": 857 }, { "epoch": 0.33278386502472607, "grad_norm": 2.425807699838393, "learning_rate": 8.438497294267117e-06, "loss": 0.7242, "step": 858 }, { "epoch": 0.3331717250072724, "grad_norm": 1.9873578931093163, "learning_rate": 8.433578664543986e-06, "loss": 0.6838, "step": 859 }, { "epoch": 0.3335595849898187, "grad_norm": 2.8047233404463245, "learning_rate": 8.428653738733996e-06, "loss": 0.7399, "step": 860 }, { "epoch": 0.33394744497236495, "grad_norm": 2.2739386706452187, "learning_rate": 8.423722525867883e-06, "loss": 0.7264, "step": 861 }, { "epoch": 0.33433530495491126, "grad_norm": 2.3908376810801957, "learning_rate": 8.418785034987921e-06, "loss": 0.6349, "step": 862 }, { "epoch": 0.3347231649374576, "grad_norm": 2.9899371457476764, "learning_rate": 8.413841275147893e-06, "loss": 0.7279, "step": 863 }, { "epoch": 0.3351110249200039, "grad_norm": 2.7048899548750556, "learning_rate": 8.408891255413072e-06, "loss": 0.7465, "step": 864 }, { "epoch": 0.3354988849025502, "grad_norm": 2.942558969563079, "learning_rate": 8.403934984860216e-06, "loss": 0.8195, "step": 865 }, { "epoch": 0.33588674488509646, "grad_norm": 1.878172002294058, "learning_rate": 8.39897247257754e-06, "loss": 0.6647, "step": 866 }, { "epoch": 0.33627460486764277, "grad_norm": 2.121885160937187, "learning_rate": 8.39400372766471e-06, "loss": 0.6703, "step": 867 }, { "epoch": 0.3366624648501891, "grad_norm": 2.1789481310834082, "learning_rate": 8.389028759232816e-06, "loss": 0.6592, "step": 868 }, { "epoch": 0.3370503248327354, "grad_norm": 2.2020983435218158, "learning_rate": 8.38404757640436e-06, "loss": 0.7027, "step": 869 }, { "epoch": 0.3374381848152817, "grad_norm": 2.759261347590789, "learning_rate": 8.379060188313244e-06, "loss": 0.7284, "step": 870 }, { "epoch": 0.33782604479782796, "grad_norm": 2.86538243720504, "learning_rate": 8.374066604104742e-06, "loss": 0.6939, "step": 871 }, { "epoch": 0.3382139047803743, "grad_norm": 2.945369476021069, "learning_rate": 8.369066832935498e-06, "loss": 0.7721, "step": 872 }, { "epoch": 0.3386017647629206, "grad_norm": 2.9598860836821475, "learning_rate": 8.364060883973488e-06, "loss": 0.7442, "step": 873 }, { "epoch": 0.3389896247454669, "grad_norm": 2.045352429695778, "learning_rate": 8.359048766398032e-06, "loss": 0.665, "step": 874 }, { "epoch": 0.3393774847280132, "grad_norm": 3.8307130006810723, "learning_rate": 8.354030489399747e-06, "loss": 0.8232, "step": 875 }, { "epoch": 0.33976534471055947, "grad_norm": 2.6563929187830744, "learning_rate": 8.349006062180552e-06, "loss": 0.6645, "step": 876 }, { "epoch": 0.3401532046931058, "grad_norm": 3.02093234507686, "learning_rate": 8.343975493953645e-06, "loss": 0.8079, "step": 877 }, { "epoch": 0.3405410646756521, "grad_norm": 2.6547562406928407, "learning_rate": 8.338938793943478e-06, "loss": 0.6929, "step": 878 }, { "epoch": 0.3409289246581984, "grad_norm": 2.5696853375589117, "learning_rate": 8.333895971385754e-06, "loss": 0.7321, "step": 879 }, { "epoch": 0.34131678464074466, "grad_norm": 2.487211834372955, "learning_rate": 8.328847035527397e-06, "loss": 0.6929, "step": 880 }, { "epoch": 0.341704644623291, "grad_norm": 2.137954260912244, "learning_rate": 8.323791995626543e-06, "loss": 0.7156, "step": 881 }, { "epoch": 0.3420925046058373, "grad_norm": 2.405682307006779, "learning_rate": 8.318730860952523e-06, "loss": 0.6906, "step": 882 }, { "epoch": 0.3424803645883836, "grad_norm": 2.660740458227899, "learning_rate": 8.313663640785839e-06, "loss": 0.7552, "step": 883 }, { "epoch": 0.3428682245709299, "grad_norm": 2.519305478568743, "learning_rate": 8.308590344418158e-06, "loss": 0.7546, "step": 884 }, { "epoch": 0.34325608455347617, "grad_norm": 2.8512044923044857, "learning_rate": 8.303510981152283e-06, "loss": 0.7378, "step": 885 }, { "epoch": 0.3436439445360225, "grad_norm": 2.8482578682034334, "learning_rate": 8.298425560302146e-06, "loss": 0.7253, "step": 886 }, { "epoch": 0.3440318045185688, "grad_norm": 2.2804892643812202, "learning_rate": 8.293334091192782e-06, "loss": 0.7295, "step": 887 }, { "epoch": 0.3444196645011151, "grad_norm": 2.4172982374702485, "learning_rate": 8.288236583160322e-06, "loss": 0.6896, "step": 888 }, { "epoch": 0.3448075244836614, "grad_norm": 2.2108550855525673, "learning_rate": 8.28313304555197e-06, "loss": 0.6882, "step": 889 }, { "epoch": 0.3451953844662077, "grad_norm": 3.151159373084786, "learning_rate": 8.278023487725981e-06, "loss": 0.8186, "step": 890 }, { "epoch": 0.345583244448754, "grad_norm": 2.7706802045857963, "learning_rate": 8.272907919051653e-06, "loss": 0.6994, "step": 891 }, { "epoch": 0.3459711044313003, "grad_norm": 2.6076381735626124, "learning_rate": 8.267786348909306e-06, "loss": 0.6939, "step": 892 }, { "epoch": 0.3463589644138466, "grad_norm": 2.8346822330911987, "learning_rate": 8.262658786690262e-06, "loss": 0.6971, "step": 893 }, { "epoch": 0.3467468243963929, "grad_norm": 3.5591015326703874, "learning_rate": 8.257525241796837e-06, "loss": 0.7242, "step": 894 }, { "epoch": 0.3471346843789392, "grad_norm": 2.780329200078068, "learning_rate": 8.252385723642312e-06, "loss": 0.7951, "step": 895 }, { "epoch": 0.3475225443614855, "grad_norm": 3.304581763587232, "learning_rate": 8.247240241650918e-06, "loss": 0.7651, "step": 896 }, { "epoch": 0.3479104043440318, "grad_norm": 3.0154239686945257, "learning_rate": 8.242088805257832e-06, "loss": 0.8051, "step": 897 }, { "epoch": 0.3482982643265781, "grad_norm": 3.2636621242218853, "learning_rate": 8.23693142390914e-06, "loss": 0.7276, "step": 898 }, { "epoch": 0.3486861243091244, "grad_norm": 2.3224154325527295, "learning_rate": 8.231768107061831e-06, "loss": 0.6387, "step": 899 }, { "epoch": 0.3490739842916707, "grad_norm": 3.403412155886512, "learning_rate": 8.226598864183782e-06, "loss": 0.7513, "step": 900 }, { "epoch": 0.349461844274217, "grad_norm": 2.282797467693676, "learning_rate": 8.221423704753733e-06, "loss": 0.7041, "step": 901 }, { "epoch": 0.3498497042567633, "grad_norm": 1.812421656583064, "learning_rate": 8.216242638261277e-06, "loss": 0.6544, "step": 902 }, { "epoch": 0.35023756423930963, "grad_norm": 2.7881963136504817, "learning_rate": 8.211055674206828e-06, "loss": 0.7503, "step": 903 }, { "epoch": 0.3506254242218559, "grad_norm": 2.4695367524974774, "learning_rate": 8.205862822101628e-06, "loss": 0.7632, "step": 904 }, { "epoch": 0.3510132842044022, "grad_norm": 2.8068680549798195, "learning_rate": 8.200664091467707e-06, "loss": 0.7094, "step": 905 }, { "epoch": 0.3514011441869485, "grad_norm": 2.345649554276779, "learning_rate": 8.195459491837881e-06, "loss": 0.6804, "step": 906 }, { "epoch": 0.3517890041694948, "grad_norm": 2.452729578160967, "learning_rate": 8.190249032755717e-06, "loss": 0.7404, "step": 907 }, { "epoch": 0.35217686415204114, "grad_norm": 2.7701572163897326, "learning_rate": 8.18503272377554e-06, "loss": 0.7848, "step": 908 }, { "epoch": 0.3525647241345874, "grad_norm": 2.5205739054284195, "learning_rate": 8.179810574462388e-06, "loss": 0.7448, "step": 909 }, { "epoch": 0.3529525841171337, "grad_norm": 2.396914956712164, "learning_rate": 8.17458259439202e-06, "loss": 0.6878, "step": 910 }, { "epoch": 0.35334044409968, "grad_norm": 2.101334536061199, "learning_rate": 8.169348793150884e-06, "loss": 0.6337, "step": 911 }, { "epoch": 0.35372830408222633, "grad_norm": 2.6291980903925802, "learning_rate": 8.164109180336094e-06, "loss": 0.7611, "step": 912 }, { "epoch": 0.35411616406477264, "grad_norm": 2.38786435276684, "learning_rate": 8.15886376555543e-06, "loss": 0.7594, "step": 913 }, { "epoch": 0.3545040240473189, "grad_norm": 2.3770884080206405, "learning_rate": 8.153612558427311e-06, "loss": 0.6857, "step": 914 }, { "epoch": 0.3548918840298652, "grad_norm": 3.0558470271083977, "learning_rate": 8.148355568580768e-06, "loss": 0.7991, "step": 915 }, { "epoch": 0.3552797440124115, "grad_norm": 3.0018011878573927, "learning_rate": 8.143092805655445e-06, "loss": 0.7103, "step": 916 }, { "epoch": 0.35566760399495784, "grad_norm": 2.687757742646694, "learning_rate": 8.13782427930157e-06, "loss": 0.6646, "step": 917 }, { "epoch": 0.3560554639775041, "grad_norm": 2.9377905796624852, "learning_rate": 8.132549999179934e-06, "loss": 0.7405, "step": 918 }, { "epoch": 0.3564433239600504, "grad_norm": 2.5594743469302714, "learning_rate": 8.127269974961886e-06, "loss": 0.6999, "step": 919 }, { "epoch": 0.3568311839425967, "grad_norm": 3.220724066161192, "learning_rate": 8.121984216329303e-06, "loss": 0.6817, "step": 920 }, { "epoch": 0.35721904392514303, "grad_norm": 2.280391302525875, "learning_rate": 8.116692732974578e-06, "loss": 0.6799, "step": 921 }, { "epoch": 0.35760690390768934, "grad_norm": 2.104915890089456, "learning_rate": 8.111395534600604e-06, "loss": 0.6956, "step": 922 }, { "epoch": 0.3579947638902356, "grad_norm": 2.254068021223425, "learning_rate": 8.10609263092075e-06, "loss": 0.6975, "step": 923 }, { "epoch": 0.3583826238727819, "grad_norm": 2.856644942475502, "learning_rate": 8.100784031658846e-06, "loss": 0.7243, "step": 924 }, { "epoch": 0.3587704838553282, "grad_norm": 2.4917430587863514, "learning_rate": 8.095469746549172e-06, "loss": 0.7572, "step": 925 }, { "epoch": 0.35915834383787454, "grad_norm": 2.4255586726999456, "learning_rate": 8.090149785336426e-06, "loss": 0.735, "step": 926 }, { "epoch": 0.35954620382042085, "grad_norm": 1.7689195852315163, "learning_rate": 8.084824157775719e-06, "loss": 0.6452, "step": 927 }, { "epoch": 0.3599340638029671, "grad_norm": 2.8607439075286365, "learning_rate": 8.079492873632554e-06, "loss": 0.7257, "step": 928 }, { "epoch": 0.3603219237855134, "grad_norm": 3.4224848903908747, "learning_rate": 8.074155942682803e-06, "loss": 0.7337, "step": 929 }, { "epoch": 0.36070978376805973, "grad_norm": 3.5232261278663914, "learning_rate": 8.068813374712689e-06, "loss": 0.7749, "step": 930 }, { "epoch": 0.36109764375060605, "grad_norm": 2.0414540326704227, "learning_rate": 8.06346517951878e-06, "loss": 0.7244, "step": 931 }, { "epoch": 0.36148550373315236, "grad_norm": 2.1558679836582977, "learning_rate": 8.058111366907957e-06, "loss": 0.7222, "step": 932 }, { "epoch": 0.3618733637156986, "grad_norm": 2.401301730769135, "learning_rate": 8.052751946697403e-06, "loss": 0.7181, "step": 933 }, { "epoch": 0.3622612236982449, "grad_norm": 3.2924333586268504, "learning_rate": 8.047386928714583e-06, "loss": 0.715, "step": 934 }, { "epoch": 0.36264908368079124, "grad_norm": 3.057913698394937, "learning_rate": 8.042016322797227e-06, "loss": 0.8033, "step": 935 }, { "epoch": 0.36303694366333755, "grad_norm": 2.752948797394045, "learning_rate": 8.03664013879331e-06, "loss": 0.7038, "step": 936 }, { "epoch": 0.3634248036458838, "grad_norm": 2.6229296403637665, "learning_rate": 8.031258386561038e-06, "loss": 0.7775, "step": 937 }, { "epoch": 0.3638126636284301, "grad_norm": 3.00479640998773, "learning_rate": 8.025871075968828e-06, "loss": 0.747, "step": 938 }, { "epoch": 0.36420052361097643, "grad_norm": 3.0960758174747554, "learning_rate": 8.020478216895282e-06, "loss": 0.7485, "step": 939 }, { "epoch": 0.36458838359352275, "grad_norm": 2.908311961382021, "learning_rate": 8.015079819229187e-06, "loss": 0.7149, "step": 940 }, { "epoch": 0.36497624357606906, "grad_norm": 2.884139935417321, "learning_rate": 8.009675892869478e-06, "loss": 0.703, "step": 941 }, { "epoch": 0.3653641035586153, "grad_norm": 1.99534938744625, "learning_rate": 8.00426644772523e-06, "loss": 0.7065, "step": 942 }, { "epoch": 0.36575196354116163, "grad_norm": 2.588138026408731, "learning_rate": 7.99885149371564e-06, "loss": 0.7934, "step": 943 }, { "epoch": 0.36613982352370794, "grad_norm": 1.8198315055212329, "learning_rate": 7.993431040770002e-06, "loss": 0.7159, "step": 944 }, { "epoch": 0.36652768350625425, "grad_norm": 2.828487347441194, "learning_rate": 7.988005098827699e-06, "loss": 0.7018, "step": 945 }, { "epoch": 0.36691554348880057, "grad_norm": 2.957647008778818, "learning_rate": 7.982573677838172e-06, "loss": 0.6706, "step": 946 }, { "epoch": 0.3673034034713468, "grad_norm": 2.479654579877714, "learning_rate": 7.977136787760916e-06, "loss": 0.7409, "step": 947 }, { "epoch": 0.36769126345389314, "grad_norm": 2.504737256309277, "learning_rate": 7.97169443856545e-06, "loss": 0.7457, "step": 948 }, { "epoch": 0.36807912343643945, "grad_norm": 2.6675895693920264, "learning_rate": 7.966246640231303e-06, "loss": 0.7796, "step": 949 }, { "epoch": 0.36846698341898576, "grad_norm": 2.000502875177774, "learning_rate": 7.960793402748001e-06, "loss": 0.6435, "step": 950 }, { "epoch": 0.3688548434015321, "grad_norm": 2.3048720247718886, "learning_rate": 7.955334736115038e-06, "loss": 0.6708, "step": 951 }, { "epoch": 0.36924270338407833, "grad_norm": 2.3548197007923575, "learning_rate": 7.949870650341864e-06, "loss": 0.7338, "step": 952 }, { "epoch": 0.36963056336662464, "grad_norm": 2.150253154799868, "learning_rate": 7.944401155447872e-06, "loss": 0.6338, "step": 953 }, { "epoch": 0.37001842334917096, "grad_norm": 1.8295621601537033, "learning_rate": 7.938926261462366e-06, "loss": 0.6623, "step": 954 }, { "epoch": 0.37040628333171727, "grad_norm": 2.667788835073008, "learning_rate": 7.933445978424555e-06, "loss": 0.7372, "step": 955 }, { "epoch": 0.3707941433142635, "grad_norm": 1.856719147056243, "learning_rate": 7.927960316383524e-06, "loss": 0.6117, "step": 956 }, { "epoch": 0.37118200329680984, "grad_norm": 2.252439723343179, "learning_rate": 7.92246928539823e-06, "loss": 0.7707, "step": 957 }, { "epoch": 0.37156986327935615, "grad_norm": 2.3687167802510163, "learning_rate": 7.916972895537471e-06, "loss": 0.7866, "step": 958 }, { "epoch": 0.37195772326190246, "grad_norm": 3.063573800969174, "learning_rate": 7.911471156879866e-06, "loss": 0.7382, "step": 959 }, { "epoch": 0.3723455832444488, "grad_norm": 2.7889299591531236, "learning_rate": 7.905964079513851e-06, "loss": 0.7976, "step": 960 }, { "epoch": 0.37273344322699503, "grad_norm": 2.018414238810879, "learning_rate": 7.900451673537646e-06, "loss": 0.6959, "step": 961 }, { "epoch": 0.37312130320954134, "grad_norm": 1.8554220245141526, "learning_rate": 7.894933949059245e-06, "loss": 0.7254, "step": 962 }, { "epoch": 0.37350916319208766, "grad_norm": 2.467420576554491, "learning_rate": 7.88941091619639e-06, "loss": 0.7179, "step": 963 }, { "epoch": 0.37389702317463397, "grad_norm": 2.2383438727799048, "learning_rate": 7.883882585076558e-06, "loss": 0.6819, "step": 964 }, { "epoch": 0.3742848831571803, "grad_norm": 2.3566580816532765, "learning_rate": 7.87834896583695e-06, "loss": 0.7242, "step": 965 }, { "epoch": 0.37467274313972654, "grad_norm": 2.800095931630177, "learning_rate": 7.872810068624452e-06, "loss": 0.7548, "step": 966 }, { "epoch": 0.37506060312227285, "grad_norm": 2.758635576589722, "learning_rate": 7.867265903595632e-06, "loss": 0.7071, "step": 967 }, { "epoch": 0.37544846310481916, "grad_norm": 2.8804478394138995, "learning_rate": 7.86171648091672e-06, "loss": 0.7607, "step": 968 }, { "epoch": 0.3758363230873655, "grad_norm": 1.7082427914994027, "learning_rate": 7.856161810763584e-06, "loss": 0.6584, "step": 969 }, { "epoch": 0.3762241830699118, "grad_norm": 1.9529323346403569, "learning_rate": 7.850601903321717e-06, "loss": 0.7242, "step": 970 }, { "epoch": 0.37661204305245805, "grad_norm": 3.1241012897710974, "learning_rate": 7.845036768786214e-06, "loss": 0.7422, "step": 971 }, { "epoch": 0.37699990303500436, "grad_norm": 1.9682091441774319, "learning_rate": 7.839466417361753e-06, "loss": 0.7243, "step": 972 }, { "epoch": 0.37738776301755067, "grad_norm": 2.5175537138980397, "learning_rate": 7.833890859262579e-06, "loss": 0.6824, "step": 973 }, { "epoch": 0.377775623000097, "grad_norm": 3.4339292023831103, "learning_rate": 7.828310104712488e-06, "loss": 0.7551, "step": 974 }, { "epoch": 0.37816348298264324, "grad_norm": 2.4540250192800257, "learning_rate": 7.822724163944802e-06, "loss": 0.7157, "step": 975 }, { "epoch": 0.37855134296518955, "grad_norm": 3.2063829742623295, "learning_rate": 7.81713304720235e-06, "loss": 0.7479, "step": 976 }, { "epoch": 0.37893920294773586, "grad_norm": 2.0983045699362224, "learning_rate": 7.811536764737454e-06, "loss": 0.7575, "step": 977 }, { "epoch": 0.3793270629302822, "grad_norm": 3.311557064078919, "learning_rate": 7.805935326811913e-06, "loss": 0.7383, "step": 978 }, { "epoch": 0.3797149229128285, "grad_norm": 2.4190886333937995, "learning_rate": 7.800328743696973e-06, "loss": 0.7155, "step": 979 }, { "epoch": 0.38010278289537475, "grad_norm": 3.191969988088667, "learning_rate": 7.794717025673318e-06, "loss": 0.7699, "step": 980 }, { "epoch": 0.38049064287792106, "grad_norm": 2.575737553058701, "learning_rate": 7.789100183031045e-06, "loss": 0.6863, "step": 981 }, { "epoch": 0.38087850286046737, "grad_norm": 4.214401130828242, "learning_rate": 7.783478226069652e-06, "loss": 0.8149, "step": 982 }, { "epoch": 0.3812663628430137, "grad_norm": 2.2307480035503064, "learning_rate": 7.777851165098012e-06, "loss": 0.6876, "step": 983 }, { "epoch": 0.38165422282556, "grad_norm": 3.0122699229597583, "learning_rate": 7.772219010434359e-06, "loss": 0.7405, "step": 984 }, { "epoch": 0.38204208280810625, "grad_norm": 2.0549845502053614, "learning_rate": 7.766581772406266e-06, "loss": 0.7178, "step": 985 }, { "epoch": 0.38242994279065257, "grad_norm": 1.8656339653882852, "learning_rate": 7.760939461350622e-06, "loss": 0.676, "step": 986 }, { "epoch": 0.3828178027731989, "grad_norm": 1.9590834360090448, "learning_rate": 7.755292087613635e-06, "loss": 0.6791, "step": 987 }, { "epoch": 0.3832056627557452, "grad_norm": 2.8838465413752883, "learning_rate": 7.749639661550776e-06, "loss": 0.7573, "step": 988 }, { "epoch": 0.3835935227382915, "grad_norm": 2.499053241113577, "learning_rate": 7.743982193526791e-06, "loss": 0.7327, "step": 989 }, { "epoch": 0.38398138272083776, "grad_norm": 2.6587922236397126, "learning_rate": 7.738319693915673e-06, "loss": 0.6775, "step": 990 }, { "epoch": 0.3843692427033841, "grad_norm": 2.3884742019255967, "learning_rate": 7.732652173100634e-06, "loss": 0.724, "step": 991 }, { "epoch": 0.3847571026859304, "grad_norm": 2.3907974828964154, "learning_rate": 7.726979641474102e-06, "loss": 0.7319, "step": 992 }, { "epoch": 0.3851449626684767, "grad_norm": 1.8496024591885087, "learning_rate": 7.721302109437686e-06, "loss": 0.7091, "step": 993 }, { "epoch": 0.38553282265102296, "grad_norm": 2.6436757508986615, "learning_rate": 7.715619587402165e-06, "loss": 0.7214, "step": 994 }, { "epoch": 0.38592068263356927, "grad_norm": 3.0390136729744976, "learning_rate": 7.709932085787473e-06, "loss": 0.7975, "step": 995 }, { "epoch": 0.3863085426161156, "grad_norm": 2.2911607288944635, "learning_rate": 7.704239615022671e-06, "loss": 0.6893, "step": 996 }, { "epoch": 0.3866964025986619, "grad_norm": 2.234983090566906, "learning_rate": 7.698542185545932e-06, "loss": 0.7145, "step": 997 }, { "epoch": 0.3870842625812082, "grad_norm": 2.214533536513175, "learning_rate": 7.692839807804522e-06, "loss": 0.7353, "step": 998 }, { "epoch": 0.38747212256375446, "grad_norm": 1.9544830987785515, "learning_rate": 7.687132492254783e-06, "loss": 0.722, "step": 999 }, { "epoch": 0.3878599825463008, "grad_norm": 2.718185650997253, "learning_rate": 7.681420249362107e-06, "loss": 0.734, "step": 1000 }, { "epoch": 0.3878599825463008, "eval_loss": 1.367966651916504, "eval_runtime": 6.6246, "eval_samples_per_second": 0.151, "eval_steps_per_second": 0.151, "step": 1000 }, { "epoch": 0.3882478425288471, "grad_norm": 2.619166766833621, "learning_rate": 7.675703089600926e-06, "loss": 0.7427, "step": 1001 }, { "epoch": 0.3886357025113934, "grad_norm": 2.616291660418023, "learning_rate": 7.669981023454682e-06, "loss": 0.7483, "step": 1002 }, { "epoch": 0.3890235624939397, "grad_norm": 1.872006032867962, "learning_rate": 7.664254061415818e-06, "loss": 0.7111, "step": 1003 }, { "epoch": 0.38941142247648597, "grad_norm": 2.9576014668635104, "learning_rate": 7.658522213985757e-06, "loss": 0.7131, "step": 1004 }, { "epoch": 0.3897992824590323, "grad_norm": 2.607599680808191, "learning_rate": 7.652785491674872e-06, "loss": 0.7488, "step": 1005 }, { "epoch": 0.3901871424415786, "grad_norm": 2.0381612755872647, "learning_rate": 7.647043905002485e-06, "loss": 0.6733, "step": 1006 }, { "epoch": 0.3905750024241249, "grad_norm": 1.9441795351252467, "learning_rate": 7.641297464496828e-06, "loss": 0.7141, "step": 1007 }, { "epoch": 0.3909628624066712, "grad_norm": 2.103337542171184, "learning_rate": 7.635546180695039e-06, "loss": 0.7071, "step": 1008 }, { "epoch": 0.3913507223892175, "grad_norm": 2.6365127990730253, "learning_rate": 7.629790064143139e-06, "loss": 0.6904, "step": 1009 }, { "epoch": 0.3917385823717638, "grad_norm": 2.659770186027065, "learning_rate": 7.624029125396004e-06, "loss": 0.7035, "step": 1010 }, { "epoch": 0.3921264423543101, "grad_norm": 2.3641890138021537, "learning_rate": 7.618263375017358e-06, "loss": 0.7151, "step": 1011 }, { "epoch": 0.3925143023368564, "grad_norm": 2.2454311676937935, "learning_rate": 7.612492823579744e-06, "loss": 0.7106, "step": 1012 }, { "epoch": 0.39290216231940267, "grad_norm": 2.1306130908174117, "learning_rate": 7.606717481664515e-06, "loss": 0.7011, "step": 1013 }, { "epoch": 0.393290022301949, "grad_norm": 2.1810989923148068, "learning_rate": 7.600937359861799e-06, "loss": 0.747, "step": 1014 }, { "epoch": 0.3936778822844953, "grad_norm": 2.4475718575118726, "learning_rate": 7.595152468770497e-06, "loss": 0.7077, "step": 1015 }, { "epoch": 0.3940657422670416, "grad_norm": 2.7163217343911406, "learning_rate": 7.589362818998251e-06, "loss": 0.7306, "step": 1016 }, { "epoch": 0.3944536022495879, "grad_norm": 3.1109366882916554, "learning_rate": 7.58356842116143e-06, "loss": 0.7346, "step": 1017 }, { "epoch": 0.3948414622321342, "grad_norm": 2.061876985926165, "learning_rate": 7.57776928588511e-06, "loss": 0.6719, "step": 1018 }, { "epoch": 0.3952293222146805, "grad_norm": 2.4624850876859163, "learning_rate": 7.571965423803052e-06, "loss": 0.6848, "step": 1019 }, { "epoch": 0.3956171821972268, "grad_norm": 3.706814077566213, "learning_rate": 7.566156845557684e-06, "loss": 0.8318, "step": 1020 }, { "epoch": 0.3960050421797731, "grad_norm": 2.6727541587436736, "learning_rate": 7.560343561800087e-06, "loss": 0.8528, "step": 1021 }, { "epoch": 0.3963929021623194, "grad_norm": 2.6787539281083936, "learning_rate": 7.554525583189969e-06, "loss": 0.6866, "step": 1022 }, { "epoch": 0.3967807621448657, "grad_norm": 3.133840132831606, "learning_rate": 7.548702920395639e-06, "loss": 0.6763, "step": 1023 }, { "epoch": 0.397168622127412, "grad_norm": 2.892999557788481, "learning_rate": 7.542875584094006e-06, "loss": 0.7169, "step": 1024 }, { "epoch": 0.3975564821099583, "grad_norm": 2.6180129938559586, "learning_rate": 7.537043584970543e-06, "loss": 0.7358, "step": 1025 }, { "epoch": 0.3979443420925046, "grad_norm": 2.323742154264389, "learning_rate": 7.53120693371927e-06, "loss": 0.6326, "step": 1026 }, { "epoch": 0.39833220207505093, "grad_norm": 2.728619106795736, "learning_rate": 7.525365641042749e-06, "loss": 0.743, "step": 1027 }, { "epoch": 0.3987200620575972, "grad_norm": 2.7389004348965966, "learning_rate": 7.519519717652039e-06, "loss": 0.6784, "step": 1028 }, { "epoch": 0.3991079220401435, "grad_norm": 2.5298841721232175, "learning_rate": 7.5136691742667e-06, "loss": 0.6724, "step": 1029 }, { "epoch": 0.3994957820226898, "grad_norm": 2.859340968153736, "learning_rate": 7.507814021614761e-06, "loss": 0.7237, "step": 1030 }, { "epoch": 0.39988364200523613, "grad_norm": 2.5835533060806606, "learning_rate": 7.501954270432701e-06, "loss": 0.7187, "step": 1031 }, { "epoch": 0.4002715019877824, "grad_norm": 2.511375481933769, "learning_rate": 7.496089931465432e-06, "loss": 0.7699, "step": 1032 }, { "epoch": 0.4006593619703287, "grad_norm": 2.0567793889647588, "learning_rate": 7.490221015466279e-06, "loss": 0.7197, "step": 1033 }, { "epoch": 0.401047221952875, "grad_norm": 2.246910933105281, "learning_rate": 7.4843475331969614e-06, "loss": 0.7376, "step": 1034 }, { "epoch": 0.4014350819354213, "grad_norm": 1.9913314850249764, "learning_rate": 7.478469495427569e-06, "loss": 0.6911, "step": 1035 }, { "epoch": 0.40182294191796764, "grad_norm": 2.252389262844585, "learning_rate": 7.4725869129365484e-06, "loss": 0.666, "step": 1036 }, { "epoch": 0.4022108019005139, "grad_norm": 2.8714120211282195, "learning_rate": 7.4666997965106725e-06, "loss": 0.7018, "step": 1037 }, { "epoch": 0.4025986618830602, "grad_norm": 3.154400469687015, "learning_rate": 7.4608081569450365e-06, "loss": 0.7662, "step": 1038 }, { "epoch": 0.4029865218656065, "grad_norm": 2.522966920181243, "learning_rate": 7.4549120050430265e-06, "loss": 0.7086, "step": 1039 }, { "epoch": 0.40337438184815283, "grad_norm": 2.131396793338579, "learning_rate": 7.449011351616302e-06, "loss": 0.6648, "step": 1040 }, { "epoch": 0.40376224183069914, "grad_norm": 2.2860983354435223, "learning_rate": 7.443106207484776e-06, "loss": 0.6767, "step": 1041 }, { "epoch": 0.4041501018132454, "grad_norm": 2.2720208235603234, "learning_rate": 7.437196583476597e-06, "loss": 0.7162, "step": 1042 }, { "epoch": 0.4045379617957917, "grad_norm": 2.750215655535136, "learning_rate": 7.43128249042813e-06, "loss": 0.7562, "step": 1043 }, { "epoch": 0.404925821778338, "grad_norm": 2.0415622199962566, "learning_rate": 7.425363939183931e-06, "loss": 0.6585, "step": 1044 }, { "epoch": 0.40531368176088434, "grad_norm": 2.399135531033268, "learning_rate": 7.419440940596735e-06, "loss": 0.754, "step": 1045 }, { "epoch": 0.40570154174343065, "grad_norm": 2.9398971221697834, "learning_rate": 7.41351350552743e-06, "loss": 0.7579, "step": 1046 }, { "epoch": 0.4060894017259769, "grad_norm": 2.2789959479241855, "learning_rate": 7.407581644845038e-06, "loss": 0.6783, "step": 1047 }, { "epoch": 0.4064772617085232, "grad_norm": 2.276183285738163, "learning_rate": 7.401645369426697e-06, "loss": 0.723, "step": 1048 }, { "epoch": 0.40686512169106953, "grad_norm": 2.1731229442218214, "learning_rate": 7.395704690157644e-06, "loss": 0.7036, "step": 1049 }, { "epoch": 0.40725298167361584, "grad_norm": 3.983913069384336, "learning_rate": 7.389759617931183e-06, "loss": 0.8, "step": 1050 }, { "epoch": 0.4076408416561621, "grad_norm": 2.3942205463273, "learning_rate": 7.383810163648682e-06, "loss": 0.7353, "step": 1051 }, { "epoch": 0.4080287016387084, "grad_norm": 3.3561499787569775, "learning_rate": 7.3778563382195365e-06, "loss": 0.7694, "step": 1052 }, { "epoch": 0.4084165616212547, "grad_norm": 2.2947779741569074, "learning_rate": 7.371898152561166e-06, "loss": 0.6741, "step": 1053 }, { "epoch": 0.40880442160380104, "grad_norm": 2.5170658178433873, "learning_rate": 7.365935617598975e-06, "loss": 0.7504, "step": 1054 }, { "epoch": 0.40919228158634735, "grad_norm": 2.0579662144658704, "learning_rate": 7.359968744266353e-06, "loss": 0.6459, "step": 1055 }, { "epoch": 0.4095801415688936, "grad_norm": 2.0477980204006236, "learning_rate": 7.35399754350464e-06, "loss": 0.7176, "step": 1056 }, { "epoch": 0.4099680015514399, "grad_norm": 2.4978945460103414, "learning_rate": 7.3480220262631095e-06, "loss": 0.6663, "step": 1057 }, { "epoch": 0.41035586153398623, "grad_norm": 2.151363250751284, "learning_rate": 7.342042203498952e-06, "loss": 0.7009, "step": 1058 }, { "epoch": 0.41074372151653255, "grad_norm": 3.302983097444228, "learning_rate": 7.336058086177253e-06, "loss": 0.7257, "step": 1059 }, { "epoch": 0.41113158149907886, "grad_norm": 2.1562841612057846, "learning_rate": 7.330069685270976e-06, "loss": 0.6851, "step": 1060 }, { "epoch": 0.4115194414816251, "grad_norm": 2.061050230190722, "learning_rate": 7.3240770117609325e-06, "loss": 0.7372, "step": 1061 }, { "epoch": 0.4119073014641714, "grad_norm": 2.14816683956562, "learning_rate": 7.318080076635773e-06, "loss": 0.7133, "step": 1062 }, { "epoch": 0.41229516144671774, "grad_norm": 2.156117431500482, "learning_rate": 7.312078890891962e-06, "loss": 0.7207, "step": 1063 }, { "epoch": 0.41268302142926405, "grad_norm": 2.102390586110967, "learning_rate": 7.306073465533759e-06, "loss": 0.7212, "step": 1064 }, { "epoch": 0.4130708814118103, "grad_norm": 1.8141878348971954, "learning_rate": 7.300063811573194e-06, "loss": 0.6681, "step": 1065 }, { "epoch": 0.4134587413943566, "grad_norm": 3.1485966410041786, "learning_rate": 7.294049940030055e-06, "loss": 0.7734, "step": 1066 }, { "epoch": 0.41384660137690293, "grad_norm": 2.547736223471862, "learning_rate": 7.2880318619318605e-06, "loss": 0.6938, "step": 1067 }, { "epoch": 0.41423446135944925, "grad_norm": 2.9901035409330556, "learning_rate": 7.2820095883138456e-06, "loss": 0.7494, "step": 1068 }, { "epoch": 0.41462232134199556, "grad_norm": 2.8493572610826803, "learning_rate": 7.2759831302189376e-06, "loss": 0.7489, "step": 1069 }, { "epoch": 0.4150101813245418, "grad_norm": 3.1452730671891516, "learning_rate": 7.269952498697734e-06, "loss": 0.7464, "step": 1070 }, { "epoch": 0.41539804130708813, "grad_norm": 2.0092931790245343, "learning_rate": 7.2639177048084894e-06, "loss": 0.6499, "step": 1071 }, { "epoch": 0.41578590128963444, "grad_norm": 2.271171625588576, "learning_rate": 7.25787875961709e-06, "loss": 0.7171, "step": 1072 }, { "epoch": 0.41617376127218075, "grad_norm": 2.5202367199190943, "learning_rate": 7.2518356741970285e-06, "loss": 0.6913, "step": 1073 }, { "epoch": 0.41656162125472707, "grad_norm": 2.127675085233399, "learning_rate": 7.245788459629397e-06, "loss": 0.6799, "step": 1074 }, { "epoch": 0.4169494812372733, "grad_norm": 2.5891339494769974, "learning_rate": 7.239737127002854e-06, "loss": 0.709, "step": 1075 }, { "epoch": 0.41733734121981964, "grad_norm": 2.2741320585115052, "learning_rate": 7.233681687413614e-06, "loss": 0.7491, "step": 1076 }, { "epoch": 0.41772520120236595, "grad_norm": 2.09765827298815, "learning_rate": 7.227622151965418e-06, "loss": 0.6795, "step": 1077 }, { "epoch": 0.41811306118491226, "grad_norm": 2.2506750881216298, "learning_rate": 7.221558531769519e-06, "loss": 0.667, "step": 1078 }, { "epoch": 0.4185009211674586, "grad_norm": 2.7394230392050254, "learning_rate": 7.21549083794466e-06, "loss": 0.6917, "step": 1079 }, { "epoch": 0.41888878115000483, "grad_norm": 2.44728301000445, "learning_rate": 7.209419081617055e-06, "loss": 0.7033, "step": 1080 }, { "epoch": 0.41927664113255114, "grad_norm": 2.9145868269271267, "learning_rate": 7.203343273920365e-06, "loss": 0.7505, "step": 1081 }, { "epoch": 0.41966450111509745, "grad_norm": 2.5588363170583217, "learning_rate": 7.197263425995682e-06, "loss": 0.7387, "step": 1082 }, { "epoch": 0.42005236109764377, "grad_norm": 3.035497031423682, "learning_rate": 7.191179548991507e-06, "loss": 0.8222, "step": 1083 }, { "epoch": 0.42044022108019, "grad_norm": 2.4053907993072836, "learning_rate": 7.185091654063724e-06, "loss": 0.7096, "step": 1084 }, { "epoch": 0.42082808106273634, "grad_norm": 3.041747110180431, "learning_rate": 7.1789997523755915e-06, "loss": 0.7218, "step": 1085 }, { "epoch": 0.42121594104528265, "grad_norm": 2.0400132728511835, "learning_rate": 7.172903855097712e-06, "loss": 0.6554, "step": 1086 }, { "epoch": 0.42160380102782896, "grad_norm": 1.8949456026540594, "learning_rate": 7.166803973408012e-06, "loss": 0.6607, "step": 1087 }, { "epoch": 0.4219916610103753, "grad_norm": 2.0951268909798286, "learning_rate": 7.160700118491729e-06, "loss": 0.709, "step": 1088 }, { "epoch": 0.42237952099292153, "grad_norm": 2.5947221541900087, "learning_rate": 7.154592301541383e-06, "loss": 0.7276, "step": 1089 }, { "epoch": 0.42276738097546784, "grad_norm": 2.59562900088286, "learning_rate": 7.148480533756759e-06, "loss": 0.7271, "step": 1090 }, { "epoch": 0.42315524095801416, "grad_norm": 2.5375501514341265, "learning_rate": 7.142364826344891e-06, "loss": 0.716, "step": 1091 }, { "epoch": 0.42354310094056047, "grad_norm": 2.787433658203666, "learning_rate": 7.1362451905200285e-06, "loss": 0.6865, "step": 1092 }, { "epoch": 0.4239309609231068, "grad_norm": 1.94700503784988, "learning_rate": 7.130121637503633e-06, "loss": 0.7292, "step": 1093 }, { "epoch": 0.42431882090565304, "grad_norm": 2.333275131368888, "learning_rate": 7.123994178524345e-06, "loss": 0.7011, "step": 1094 }, { "epoch": 0.42470668088819935, "grad_norm": 2.7720026283221633, "learning_rate": 7.117862824817966e-06, "loss": 0.6841, "step": 1095 }, { "epoch": 0.42509454087074566, "grad_norm": 2.521998136870531, "learning_rate": 7.1117275876274425e-06, "loss": 0.6356, "step": 1096 }, { "epoch": 0.425482400853292, "grad_norm": 1.9919151186634003, "learning_rate": 7.105588478202838e-06, "loss": 0.6983, "step": 1097 }, { "epoch": 0.4258702608358383, "grad_norm": 2.2902501415450525, "learning_rate": 7.099445507801324e-06, "loss": 0.7197, "step": 1098 }, { "epoch": 0.42625812081838454, "grad_norm": 2.2380690183087206, "learning_rate": 7.093298687687141e-06, "loss": 0.6972, "step": 1099 }, { "epoch": 0.42664598080093086, "grad_norm": 2.5611263263401263, "learning_rate": 7.0871480291315975e-06, "loss": 0.732, "step": 1100 }, { "epoch": 0.42703384078347717, "grad_norm": 2.1692921715486295, "learning_rate": 7.080993543413035e-06, "loss": 0.7158, "step": 1101 }, { "epoch": 0.4274217007660235, "grad_norm": 2.6112921070938757, "learning_rate": 7.0748352418168174e-06, "loss": 0.6922, "step": 1102 }, { "epoch": 0.42780956074856974, "grad_norm": 2.6767577670377656, "learning_rate": 7.068673135635302e-06, "loss": 0.7123, "step": 1103 }, { "epoch": 0.42819742073111605, "grad_norm": 2.263021126060051, "learning_rate": 7.062507236167826e-06, "loss": 0.6855, "step": 1104 }, { "epoch": 0.42858528071366236, "grad_norm": 2.9771977805185, "learning_rate": 7.056337554720676e-06, "loss": 0.7187, "step": 1105 }, { "epoch": 0.4289731406962087, "grad_norm": 2.9920392029487566, "learning_rate": 7.050164102607081e-06, "loss": 0.7181, "step": 1106 }, { "epoch": 0.429361000678755, "grad_norm": 2.550900742587274, "learning_rate": 7.043986891147179e-06, "loss": 0.6901, "step": 1107 }, { "epoch": 0.42974886066130125, "grad_norm": 2.4940556801230507, "learning_rate": 7.037805931668006e-06, "loss": 0.6755, "step": 1108 }, { "epoch": 0.43013672064384756, "grad_norm": 3.235561989757126, "learning_rate": 7.031621235503464e-06, "loss": 0.7083, "step": 1109 }, { "epoch": 0.43052458062639387, "grad_norm": 1.7194861927453622, "learning_rate": 7.025432813994315e-06, "loss": 0.6307, "step": 1110 }, { "epoch": 0.4309124406089402, "grad_norm": 2.01811155154844, "learning_rate": 7.019240678488145e-06, "loss": 0.6443, "step": 1111 }, { "epoch": 0.4313003005914865, "grad_norm": 1.8616538454449747, "learning_rate": 7.013044840339353e-06, "loss": 0.6587, "step": 1112 }, { "epoch": 0.43168816057403275, "grad_norm": 2.609535190903, "learning_rate": 7.006845310909131e-06, "loss": 0.6592, "step": 1113 }, { "epoch": 0.43207602055657907, "grad_norm": 3.146718082087228, "learning_rate": 7.000642101565434e-06, "loss": 0.7137, "step": 1114 }, { "epoch": 0.4324638805391254, "grad_norm": 2.427653072399018, "learning_rate": 6.994435223682966e-06, "loss": 0.7486, "step": 1115 }, { "epoch": 0.4328517405216717, "grad_norm": 2.6708607233502586, "learning_rate": 6.9882246886431615e-06, "loss": 0.6982, "step": 1116 }, { "epoch": 0.433239600504218, "grad_norm": 2.393513721556069, "learning_rate": 6.982010507834158e-06, "loss": 0.6586, "step": 1117 }, { "epoch": 0.43362746048676426, "grad_norm": 2.247890679400462, "learning_rate": 6.975792692650778e-06, "loss": 0.6665, "step": 1118 }, { "epoch": 0.4340153204693106, "grad_norm": 2.418500719485879, "learning_rate": 6.969571254494509e-06, "loss": 0.7378, "step": 1119 }, { "epoch": 0.4344031804518569, "grad_norm": 2.08236184302825, "learning_rate": 6.963346204773483e-06, "loss": 0.7316, "step": 1120 }, { "epoch": 0.4347910404344032, "grad_norm": 2.9559171643314657, "learning_rate": 6.957117554902452e-06, "loss": 0.7842, "step": 1121 }, { "epoch": 0.43517890041694945, "grad_norm": 2.3607201585059525, "learning_rate": 6.950885316302773e-06, "loss": 0.6427, "step": 1122 }, { "epoch": 0.43556676039949577, "grad_norm": 2.9166173547365215, "learning_rate": 6.94464950040238e-06, "loss": 0.7689, "step": 1123 }, { "epoch": 0.4359546203820421, "grad_norm": 2.7412007963013787, "learning_rate": 6.938410118635768e-06, "loss": 0.6974, "step": 1124 }, { "epoch": 0.4363424803645884, "grad_norm": 2.0453909240615586, "learning_rate": 6.9321671824439715e-06, "loss": 0.7294, "step": 1125 }, { "epoch": 0.4367303403471347, "grad_norm": 2.490421014985969, "learning_rate": 6.9259207032745415e-06, "loss": 0.6977, "step": 1126 }, { "epoch": 0.43711820032968096, "grad_norm": 2.0580941503828627, "learning_rate": 6.919670692581526e-06, "loss": 0.7001, "step": 1127 }, { "epoch": 0.4375060603122273, "grad_norm": 2.731630490915455, "learning_rate": 6.913417161825449e-06, "loss": 0.7154, "step": 1128 }, { "epoch": 0.4378939202947736, "grad_norm": 1.9700842883632363, "learning_rate": 6.907160122473291e-06, "loss": 0.6226, "step": 1129 }, { "epoch": 0.4382817802773199, "grad_norm": 2.5557694648058624, "learning_rate": 6.90089958599846e-06, "loss": 0.6959, "step": 1130 }, { "epoch": 0.4386696402598662, "grad_norm": 3.02746911334377, "learning_rate": 6.894635563880785e-06, "loss": 0.7257, "step": 1131 }, { "epoch": 0.43905750024241247, "grad_norm": 2.102038875073131, "learning_rate": 6.88836806760648e-06, "loss": 0.6859, "step": 1132 }, { "epoch": 0.4394453602249588, "grad_norm": 2.2056521491672445, "learning_rate": 6.882097108668132e-06, "loss": 0.7582, "step": 1133 }, { "epoch": 0.4398332202075051, "grad_norm": 2.1739692572142704, "learning_rate": 6.875822698564678e-06, "loss": 0.6948, "step": 1134 }, { "epoch": 0.4402210801900514, "grad_norm": 2.5446912856090678, "learning_rate": 6.869544848801383e-06, "loss": 0.7163, "step": 1135 }, { "epoch": 0.4406089401725977, "grad_norm": 2.531641105935224, "learning_rate": 6.863263570889818e-06, "loss": 0.7173, "step": 1136 }, { "epoch": 0.440996800155144, "grad_norm": 2.9570227063988344, "learning_rate": 6.85697887634784e-06, "loss": 0.6789, "step": 1137 }, { "epoch": 0.4413846601376903, "grad_norm": 2.673217342460435, "learning_rate": 6.850690776699574e-06, "loss": 0.6779, "step": 1138 }, { "epoch": 0.4417725201202366, "grad_norm": 3.050739143251941, "learning_rate": 6.844399283475384e-06, "loss": 0.7698, "step": 1139 }, { "epoch": 0.4421603801027829, "grad_norm": 1.953968913671597, "learning_rate": 6.838104408211862e-06, "loss": 0.6934, "step": 1140 }, { "epoch": 0.44254824008532917, "grad_norm": 2.528885657770975, "learning_rate": 6.831806162451799e-06, "loss": 0.7307, "step": 1141 }, { "epoch": 0.4429361000678755, "grad_norm": 2.2215165574055296, "learning_rate": 6.825504557744167e-06, "loss": 0.7581, "step": 1142 }, { "epoch": 0.4433239600504218, "grad_norm": 3.1255414727933566, "learning_rate": 6.819199605644093e-06, "loss": 0.7184, "step": 1143 }, { "epoch": 0.4437118200329681, "grad_norm": 3.237615608470384, "learning_rate": 6.812891317712851e-06, "loss": 0.8028, "step": 1144 }, { "epoch": 0.4440996800155144, "grad_norm": 2.2975653374057328, "learning_rate": 6.806579705517824e-06, "loss": 0.6941, "step": 1145 }, { "epoch": 0.4444875399980607, "grad_norm": 1.9673779905697817, "learning_rate": 6.800264780632495e-06, "loss": 0.6849, "step": 1146 }, { "epoch": 0.444875399980607, "grad_norm": 2.573205579125811, "learning_rate": 6.793946554636417e-06, "loss": 0.7251, "step": 1147 }, { "epoch": 0.4452632599631533, "grad_norm": 2.584009384136493, "learning_rate": 6.7876250391152e-06, "loss": 0.7463, "step": 1148 }, { "epoch": 0.4456511199456996, "grad_norm": 2.774307746698678, "learning_rate": 6.781300245660487e-06, "loss": 0.7245, "step": 1149 }, { "epoch": 0.4460389799282459, "grad_norm": 2.3037649793173633, "learning_rate": 6.774972185869928e-06, "loss": 0.6835, "step": 1150 }, { "epoch": 0.4464268399107922, "grad_norm": 2.3625407627814177, "learning_rate": 6.768640871347163e-06, "loss": 0.6688, "step": 1151 }, { "epoch": 0.4468146998933385, "grad_norm": 3.651464258491016, "learning_rate": 6.762306313701803e-06, "loss": 0.6911, "step": 1152 }, { "epoch": 0.4472025598758848, "grad_norm": 2.5898235526883853, "learning_rate": 6.7559685245494025e-06, "loss": 0.7483, "step": 1153 }, { "epoch": 0.4475904198584311, "grad_norm": 2.3930328171711044, "learning_rate": 6.749627515511443e-06, "loss": 0.6936, "step": 1154 }, { "epoch": 0.44797827984097743, "grad_norm": 2.8299420278609646, "learning_rate": 6.743283298215312e-06, "loss": 0.7213, "step": 1155 }, { "epoch": 0.4483661398235237, "grad_norm": 2.5457379319226026, "learning_rate": 6.736935884294275e-06, "loss": 0.731, "step": 1156 }, { "epoch": 0.44875399980607, "grad_norm": 2.88233255854639, "learning_rate": 6.730585285387465e-06, "loss": 0.7603, "step": 1157 }, { "epoch": 0.4491418597886163, "grad_norm": 2.316082539464162, "learning_rate": 6.724231513139853e-06, "loss": 0.686, "step": 1158 }, { "epoch": 0.44952971977116263, "grad_norm": 2.7296485282300096, "learning_rate": 6.717874579202227e-06, "loss": 0.6659, "step": 1159 }, { "epoch": 0.4499175797537089, "grad_norm": 2.9330214548713673, "learning_rate": 6.711514495231173e-06, "loss": 0.7357, "step": 1160 }, { "epoch": 0.4503054397362552, "grad_norm": 2.5162971469604396, "learning_rate": 6.705151272889055e-06, "loss": 0.6972, "step": 1161 }, { "epoch": 0.4506932997188015, "grad_norm": 2.2892970595671587, "learning_rate": 6.698784923843993e-06, "loss": 0.732, "step": 1162 }, { "epoch": 0.4510811597013478, "grad_norm": 2.312817839501261, "learning_rate": 6.692415459769835e-06, "loss": 0.6366, "step": 1163 }, { "epoch": 0.45146901968389413, "grad_norm": 2.3026119697065104, "learning_rate": 6.686042892346147e-06, "loss": 0.6557, "step": 1164 }, { "epoch": 0.4518568796664404, "grad_norm": 3.1986112211572735, "learning_rate": 6.679667233258179e-06, "loss": 0.7165, "step": 1165 }, { "epoch": 0.4522447396489867, "grad_norm": 3.06069619093926, "learning_rate": 6.673288494196858e-06, "loss": 0.7141, "step": 1166 }, { "epoch": 0.452632599631533, "grad_norm": 1.8160023450177534, "learning_rate": 6.666906686858753e-06, "loss": 0.6735, "step": 1167 }, { "epoch": 0.45302045961407933, "grad_norm": 2.268235175474956, "learning_rate": 6.66052182294606e-06, "loss": 0.7419, "step": 1168 }, { "epoch": 0.45340831959662564, "grad_norm": 2.3448492710301285, "learning_rate": 6.654133914166582e-06, "loss": 0.6804, "step": 1169 }, { "epoch": 0.4537961795791719, "grad_norm": 2.4676201821036527, "learning_rate": 6.647742972233703e-06, "loss": 0.6876, "step": 1170 }, { "epoch": 0.4541840395617182, "grad_norm": 2.3461934277135432, "learning_rate": 6.641349008866369e-06, "loss": 0.6995, "step": 1171 }, { "epoch": 0.4545718995442645, "grad_norm": 2.9922552479632505, "learning_rate": 6.634952035789069e-06, "loss": 0.7721, "step": 1172 }, { "epoch": 0.45495975952681084, "grad_norm": 2.8800918406708713, "learning_rate": 6.628552064731807e-06, "loss": 0.7358, "step": 1173 }, { "epoch": 0.45534761950935715, "grad_norm": 2.670901041785104, "learning_rate": 6.622149107430088e-06, "loss": 0.6714, "step": 1174 }, { "epoch": 0.4557354794919034, "grad_norm": 3.5126626892299373, "learning_rate": 6.6157431756248906e-06, "loss": 0.7355, "step": 1175 }, { "epoch": 0.4561233394744497, "grad_norm": 3.190446252668891, "learning_rate": 6.609334281062647e-06, "loss": 0.767, "step": 1176 }, { "epoch": 0.45651119945699603, "grad_norm": 2.275681360387093, "learning_rate": 6.602922435495225e-06, "loss": 0.7354, "step": 1177 }, { "epoch": 0.45689905943954234, "grad_norm": 2.0414464984060423, "learning_rate": 6.5965076506799e-06, "loss": 0.6768, "step": 1178 }, { "epoch": 0.4572869194220886, "grad_norm": 2.5149890570512734, "learning_rate": 6.5900899383793415e-06, "loss": 0.6988, "step": 1179 }, { "epoch": 0.4576747794046349, "grad_norm": 2.467242589675749, "learning_rate": 6.583669310361583e-06, "loss": 0.6538, "step": 1180 }, { "epoch": 0.4580626393871812, "grad_norm": 2.7310252879330243, "learning_rate": 6.577245778400006e-06, "loss": 0.6679, "step": 1181 }, { "epoch": 0.45845049936972754, "grad_norm": 2.941997399875281, "learning_rate": 6.570819354273317e-06, "loss": 0.7, "step": 1182 }, { "epoch": 0.45883835935227385, "grad_norm": 2.3517685724077078, "learning_rate": 6.564390049765528e-06, "loss": 0.6704, "step": 1183 }, { "epoch": 0.4592262193348201, "grad_norm": 2.6563478348339933, "learning_rate": 6.557957876665926e-06, "loss": 0.7106, "step": 1184 }, { "epoch": 0.4596140793173664, "grad_norm": 2.513045674988935, "learning_rate": 6.551522846769067e-06, "loss": 0.677, "step": 1185 }, { "epoch": 0.46000193929991273, "grad_norm": 2.401548858222256, "learning_rate": 6.545084971874738e-06, "loss": 0.7313, "step": 1186 }, { "epoch": 0.46038979928245904, "grad_norm": 1.8138737221405339, "learning_rate": 6.538644263787948e-06, "loss": 0.6643, "step": 1187 }, { "epoch": 0.46077765926500536, "grad_norm": 2.3295241036737036, "learning_rate": 6.532200734318896e-06, "loss": 0.6767, "step": 1188 }, { "epoch": 0.4611655192475516, "grad_norm": 2.4374015491036953, "learning_rate": 6.525754395282961e-06, "loss": 0.7029, "step": 1189 }, { "epoch": 0.4615533792300979, "grad_norm": 2.5276614482203184, "learning_rate": 6.5193052585006666e-06, "loss": 0.754, "step": 1190 }, { "epoch": 0.46194123921264424, "grad_norm": 2.569223024816501, "learning_rate": 6.512853335797673e-06, "loss": 0.7149, "step": 1191 }, { "epoch": 0.46232909919519055, "grad_norm": 2.068999603532362, "learning_rate": 6.5063986390047475e-06, "loss": 0.7168, "step": 1192 }, { "epoch": 0.46271695917773686, "grad_norm": 2.1523204852309967, "learning_rate": 6.499941179957739e-06, "loss": 0.6734, "step": 1193 }, { "epoch": 0.4631048191602831, "grad_norm": 2.9824084732889213, "learning_rate": 6.493480970497569e-06, "loss": 0.7159, "step": 1194 }, { "epoch": 0.46349267914282943, "grad_norm": 2.2420954025638467, "learning_rate": 6.487018022470195e-06, "loss": 0.7107, "step": 1195 }, { "epoch": 0.46388053912537575, "grad_norm": 2.227071049893546, "learning_rate": 6.480552347726604e-06, "loss": 0.6874, "step": 1196 }, { "epoch": 0.46426839910792206, "grad_norm": 2.0012952502723964, "learning_rate": 6.474083958122777e-06, "loss": 0.6483, "step": 1197 }, { "epoch": 0.4646562590904683, "grad_norm": 2.310054540523407, "learning_rate": 6.467612865519674e-06, "loss": 0.7242, "step": 1198 }, { "epoch": 0.46504411907301463, "grad_norm": 2.4108588578193033, "learning_rate": 6.461139081783215e-06, "loss": 0.6837, "step": 1199 }, { "epoch": 0.46543197905556094, "grad_norm": 2.8571264873418807, "learning_rate": 6.454662618784249e-06, "loss": 0.6893, "step": 1200 }, { "epoch": 0.46581983903810725, "grad_norm": 2.0674647793209244, "learning_rate": 6.448183488398545e-06, "loss": 0.7228, "step": 1201 }, { "epoch": 0.46620769902065357, "grad_norm": 2.651318297162342, "learning_rate": 6.441701702506755e-06, "loss": 0.695, "step": 1202 }, { "epoch": 0.4665955590031998, "grad_norm": 2.1191622460860864, "learning_rate": 6.435217272994406e-06, "loss": 0.6496, "step": 1203 }, { "epoch": 0.46698341898574613, "grad_norm": 2.5072688324545234, "learning_rate": 6.428730211751873e-06, "loss": 0.7146, "step": 1204 }, { "epoch": 0.46737127896829245, "grad_norm": 2.190311688339955, "learning_rate": 6.422240530674354e-06, "loss": 0.6863, "step": 1205 }, { "epoch": 0.46775913895083876, "grad_norm": 2.5018424635503775, "learning_rate": 6.4157482416618514e-06, "loss": 0.6645, "step": 1206 }, { "epoch": 0.4681469989333851, "grad_norm": 2.0323104674629637, "learning_rate": 6.409253356619148e-06, "loss": 0.7094, "step": 1207 }, { "epoch": 0.46853485891593133, "grad_norm": 2.0477813983927313, "learning_rate": 6.402755887455792e-06, "loss": 0.7151, "step": 1208 }, { "epoch": 0.46892271889847764, "grad_norm": 2.260942029714187, "learning_rate": 6.396255846086067e-06, "loss": 0.7286, "step": 1209 }, { "epoch": 0.46931057888102395, "grad_norm": 2.383450916045315, "learning_rate": 6.389753244428973e-06, "loss": 0.7624, "step": 1210 }, { "epoch": 0.46969843886357027, "grad_norm": 2.047491044025192, "learning_rate": 6.383248094408203e-06, "loss": 0.6056, "step": 1211 }, { "epoch": 0.4700862988461166, "grad_norm": 2.1851832701292015, "learning_rate": 6.376740407952127e-06, "loss": 0.6378, "step": 1212 }, { "epoch": 0.47047415882866284, "grad_norm": 2.1110778370537826, "learning_rate": 6.370230196993763e-06, "loss": 0.6853, "step": 1213 }, { "epoch": 0.47086201881120915, "grad_norm": 2.3099197105094333, "learning_rate": 6.36371747347076e-06, "loss": 0.6834, "step": 1214 }, { "epoch": 0.47124987879375546, "grad_norm": 2.189480673499418, "learning_rate": 6.3572022493253715e-06, "loss": 0.674, "step": 1215 }, { "epoch": 0.4716377387763018, "grad_norm": 3.51470072633223, "learning_rate": 6.350684536504441e-06, "loss": 0.7549, "step": 1216 }, { "epoch": 0.47202559875884803, "grad_norm": 1.9376930476503031, "learning_rate": 6.344164346959371e-06, "loss": 0.6826, "step": 1217 }, { "epoch": 0.47241345874139434, "grad_norm": 2.6506329553145314, "learning_rate": 6.337641692646106e-06, "loss": 0.743, "step": 1218 }, { "epoch": 0.47280131872394066, "grad_norm": 2.010838939242567, "learning_rate": 6.331116585525112e-06, "loss": 0.6153, "step": 1219 }, { "epoch": 0.47318917870648697, "grad_norm": 1.9328704451597094, "learning_rate": 6.324589037561352e-06, "loss": 0.6936, "step": 1220 }, { "epoch": 0.4735770386890333, "grad_norm": 2.564225923753749, "learning_rate": 6.318059060724264e-06, "loss": 0.6859, "step": 1221 }, { "epoch": 0.47396489867157954, "grad_norm": 2.0436303158866713, "learning_rate": 6.3115266669877425e-06, "loss": 0.6714, "step": 1222 }, { "epoch": 0.47435275865412585, "grad_norm": 1.9940902182555145, "learning_rate": 6.30499186833011e-06, "loss": 0.6271, "step": 1223 }, { "epoch": 0.47474061863667216, "grad_norm": 2.300648085964488, "learning_rate": 6.2984546767341e-06, "loss": 0.6527, "step": 1224 }, { "epoch": 0.4751284786192185, "grad_norm": 2.175472694913287, "learning_rate": 6.291915104186836e-06, "loss": 0.7431, "step": 1225 }, { "epoch": 0.4755163386017648, "grad_norm": 2.773328278020023, "learning_rate": 6.285373162679804e-06, "loss": 0.7339, "step": 1226 }, { "epoch": 0.47590419858431104, "grad_norm": 2.2825320133266924, "learning_rate": 6.278828864208839e-06, "loss": 0.6642, "step": 1227 }, { "epoch": 0.47629205856685736, "grad_norm": 1.9453775194029113, "learning_rate": 6.272282220774091e-06, "loss": 0.706, "step": 1228 }, { "epoch": 0.47667991854940367, "grad_norm": 2.0657576240396116, "learning_rate": 6.265733244380014e-06, "loss": 0.6332, "step": 1229 }, { "epoch": 0.47706777853195, "grad_norm": 2.325584712903702, "learning_rate": 6.2591819470353424e-06, "loss": 0.7188, "step": 1230 }, { "epoch": 0.4774556385144963, "grad_norm": 2.595421894780154, "learning_rate": 6.25262834075306e-06, "loss": 0.7454, "step": 1231 }, { "epoch": 0.47784349849704255, "grad_norm": 2.3724320739150837, "learning_rate": 6.246072437550391e-06, "loss": 0.7395, "step": 1232 }, { "epoch": 0.47823135847958886, "grad_norm": 2.536200005267173, "learning_rate": 6.239514249448767e-06, "loss": 0.6752, "step": 1233 }, { "epoch": 0.4786192184621352, "grad_norm": 2.6265387026874727, "learning_rate": 6.2329537884738115e-06, "loss": 0.6975, "step": 1234 }, { "epoch": 0.4790070784446815, "grad_norm": 2.2935184806005537, "learning_rate": 6.226391066655313e-06, "loss": 0.7081, "step": 1235 }, { "epoch": 0.47939493842722775, "grad_norm": 2.2558228910633398, "learning_rate": 6.219826096027211e-06, "loss": 0.6997, "step": 1236 }, { "epoch": 0.47978279840977406, "grad_norm": 2.1866640909679536, "learning_rate": 6.213258888627561e-06, "loss": 0.6727, "step": 1237 }, { "epoch": 0.48017065839232037, "grad_norm": 2.467877501359811, "learning_rate": 6.206689456498529e-06, "loss": 0.7625, "step": 1238 }, { "epoch": 0.4805585183748667, "grad_norm": 2.2534806784131356, "learning_rate": 6.200117811686354e-06, "loss": 0.7094, "step": 1239 }, { "epoch": 0.480946378357413, "grad_norm": 2.439084864907586, "learning_rate": 6.193543966241332e-06, "loss": 0.7167, "step": 1240 }, { "epoch": 0.48133423833995925, "grad_norm": 2.6407703438226826, "learning_rate": 6.1869679322178e-06, "loss": 0.6936, "step": 1241 }, { "epoch": 0.48172209832250557, "grad_norm": 2.0825369526023323, "learning_rate": 6.180389721674101e-06, "loss": 0.6797, "step": 1242 }, { "epoch": 0.4821099583050519, "grad_norm": 2.690236437833537, "learning_rate": 6.1738093466725745e-06, "loss": 0.7561, "step": 1243 }, { "epoch": 0.4824978182875982, "grad_norm": 2.7824512320375865, "learning_rate": 6.1672268192795285e-06, "loss": 0.7232, "step": 1244 }, { "epoch": 0.4828856782701445, "grad_norm": 2.812977298158211, "learning_rate": 6.1606421515652124e-06, "loss": 0.6992, "step": 1245 }, { "epoch": 0.48327353825269076, "grad_norm": 2.5647297684191415, "learning_rate": 6.1540553556038075e-06, "loss": 0.6647, "step": 1246 }, { "epoch": 0.48366139823523707, "grad_norm": 3.296275106455928, "learning_rate": 6.1474664434733935e-06, "loss": 0.6648, "step": 1247 }, { "epoch": 0.4840492582177834, "grad_norm": 2.5226466007079935, "learning_rate": 6.14087542725593e-06, "loss": 0.72, "step": 1248 }, { "epoch": 0.4844371182003297, "grad_norm": 2.1541471516687247, "learning_rate": 6.134282319037238e-06, "loss": 0.6973, "step": 1249 }, { "epoch": 0.48482497818287595, "grad_norm": 2.835839472517752, "learning_rate": 6.127687130906972e-06, "loss": 0.7504, "step": 1250 }, { "epoch": 0.48521283816542227, "grad_norm": 3.0722170596244394, "learning_rate": 6.1210898749586e-06, "loss": 0.7357, "step": 1251 }, { "epoch": 0.4856006981479686, "grad_norm": 1.7963592714184073, "learning_rate": 6.114490563289384e-06, "loss": 0.7518, "step": 1252 }, { "epoch": 0.4859885581305149, "grad_norm": 2.4649532458739656, "learning_rate": 6.1078892080003535e-06, "loss": 0.6775, "step": 1253 }, { "epoch": 0.4863764181130612, "grad_norm": 2.737238568981606, "learning_rate": 6.101285821196285e-06, "loss": 0.7425, "step": 1254 }, { "epoch": 0.48676427809560746, "grad_norm": 3.1001589366971642, "learning_rate": 6.094680414985685e-06, "loss": 0.7075, "step": 1255 }, { "epoch": 0.4871521380781538, "grad_norm": 3.6907077747261514, "learning_rate": 6.088073001480757e-06, "loss": 0.787, "step": 1256 }, { "epoch": 0.4875399980607001, "grad_norm": 2.7675049163886927, "learning_rate": 6.081463592797388e-06, "loss": 0.7068, "step": 1257 }, { "epoch": 0.4879278580432464, "grad_norm": 2.572315307873498, "learning_rate": 6.074852201055121e-06, "loss": 0.6797, "step": 1258 }, { "epoch": 0.4883157180257927, "grad_norm": 2.44171593090819, "learning_rate": 6.06823883837714e-06, "loss": 0.6991, "step": 1259 }, { "epoch": 0.48870357800833897, "grad_norm": 2.1791607196655223, "learning_rate": 6.061623516890238e-06, "loss": 0.6827, "step": 1260 }, { "epoch": 0.4890914379908853, "grad_norm": 2.3582783763157797, "learning_rate": 6.0550062487248055e-06, "loss": 0.6569, "step": 1261 }, { "epoch": 0.4894792979734316, "grad_norm": 2.74684496373786, "learning_rate": 6.048387046014795e-06, "loss": 0.7213, "step": 1262 }, { "epoch": 0.4898671579559779, "grad_norm": 2.3041746263659855, "learning_rate": 6.041765920897713e-06, "loss": 0.7114, "step": 1263 }, { "epoch": 0.4902550179385242, "grad_norm": 2.5055010343893622, "learning_rate": 6.03514288551459e-06, "loss": 0.7785, "step": 1264 }, { "epoch": 0.4906428779210705, "grad_norm": 2.7950179301857982, "learning_rate": 6.028517952009957e-06, "loss": 0.7284, "step": 1265 }, { "epoch": 0.4910307379036168, "grad_norm": 2.692278991668634, "learning_rate": 6.021891132531825e-06, "loss": 0.7061, "step": 1266 }, { "epoch": 0.4914185978861631, "grad_norm": 2.902977786296156, "learning_rate": 6.015262439231666e-06, "loss": 0.6763, "step": 1267 }, { "epoch": 0.4918064578687094, "grad_norm": 2.985177376292219, "learning_rate": 6.008631884264387e-06, "loss": 0.7206, "step": 1268 }, { "epoch": 0.49219431785125567, "grad_norm": 2.419947696729931, "learning_rate": 6.00199947978831e-06, "loss": 0.6634, "step": 1269 }, { "epoch": 0.492582177833802, "grad_norm": 2.621770051934068, "learning_rate": 5.995365237965144e-06, "loss": 0.6677, "step": 1270 }, { "epoch": 0.4929700378163483, "grad_norm": 2.1199354960766037, "learning_rate": 5.98872917095997e-06, "loss": 0.6926, "step": 1271 }, { "epoch": 0.4933578977988946, "grad_norm": 2.0275285260151117, "learning_rate": 5.98209129094122e-06, "loss": 0.6714, "step": 1272 }, { "epoch": 0.4937457577814409, "grad_norm": 3.4217122913640816, "learning_rate": 5.975451610080643e-06, "loss": 0.746, "step": 1273 }, { "epoch": 0.4941336177639872, "grad_norm": 1.9444379277178494, "learning_rate": 5.968810140553292e-06, "loss": 0.6779, "step": 1274 }, { "epoch": 0.4945214777465335, "grad_norm": 2.281110832049723, "learning_rate": 5.962166894537507e-06, "loss": 0.6285, "step": 1275 }, { "epoch": 0.4949093377290798, "grad_norm": 1.952044043667781, "learning_rate": 5.955521884214872e-06, "loss": 0.6723, "step": 1276 }, { "epoch": 0.4952971977116261, "grad_norm": 2.1257640080135176, "learning_rate": 5.948875121770221e-06, "loss": 0.6627, "step": 1277 }, { "epoch": 0.4956850576941724, "grad_norm": 3.0021669621849068, "learning_rate": 5.942226619391592e-06, "loss": 0.7484, "step": 1278 }, { "epoch": 0.4960729176767187, "grad_norm": 2.4140534018763464, "learning_rate": 5.935576389270215e-06, "loss": 0.7349, "step": 1279 }, { "epoch": 0.496460777659265, "grad_norm": 2.271897652876336, "learning_rate": 5.928924443600487e-06, "loss": 0.6888, "step": 1280 }, { "epoch": 0.4968486376418113, "grad_norm": 2.8499116040215364, "learning_rate": 5.922270794579953e-06, "loss": 0.7772, "step": 1281 }, { "epoch": 0.4972364976243576, "grad_norm": 2.9114241437694512, "learning_rate": 5.915615454409281e-06, "loss": 0.753, "step": 1282 }, { "epoch": 0.49762435760690393, "grad_norm": 2.977757797755179, "learning_rate": 5.908958435292241e-06, "loss": 0.7394, "step": 1283 }, { "epoch": 0.4980122175894502, "grad_norm": 2.5122158729190436, "learning_rate": 5.902299749435678e-06, "loss": 0.6952, "step": 1284 }, { "epoch": 0.4984000775719965, "grad_norm": 2.1617284565198576, "learning_rate": 5.895639409049497e-06, "loss": 0.7142, "step": 1285 }, { "epoch": 0.4987879375545428, "grad_norm": 3.067042429264119, "learning_rate": 5.888977426346636e-06, "loss": 0.7977, "step": 1286 }, { "epoch": 0.4991757975370891, "grad_norm": 2.53722260673096, "learning_rate": 5.882313813543043e-06, "loss": 0.7759, "step": 1287 }, { "epoch": 0.4995636575196354, "grad_norm": 2.5760743015495495, "learning_rate": 5.875648582857655e-06, "loss": 0.708, "step": 1288 }, { "epoch": 0.4999515175021817, "grad_norm": 2.2166206369703, "learning_rate": 5.868981746512379e-06, "loss": 0.7072, "step": 1289 }, { "epoch": 0.500339377484728, "grad_norm": 2.6841519887612537, "learning_rate": 5.862313316732064e-06, "loss": 0.7341, "step": 1290 }, { "epoch": 0.5007272374672743, "grad_norm": 2.3927931390311947, "learning_rate": 5.855643305744479e-06, "loss": 0.6511, "step": 1291 }, { "epoch": 0.5011150974498206, "grad_norm": 2.633590384199103, "learning_rate": 5.848971725780294e-06, "loss": 0.6492, "step": 1292 }, { "epoch": 0.501502957432367, "grad_norm": 2.9077829872345875, "learning_rate": 5.842298589073058e-06, "loss": 0.7177, "step": 1293 }, { "epoch": 0.5018908174149133, "grad_norm": 3.1710746555169407, "learning_rate": 5.835623907859173e-06, "loss": 0.7428, "step": 1294 }, { "epoch": 0.5022786773974596, "grad_norm": 2.221839091062329, "learning_rate": 5.828947694377871e-06, "loss": 0.6524, "step": 1295 }, { "epoch": 0.5026665373800058, "grad_norm": 1.8009104478415454, "learning_rate": 5.822269960871198e-06, "loss": 0.6605, "step": 1296 }, { "epoch": 0.5030543973625521, "grad_norm": 1.9885150498385615, "learning_rate": 5.815590719583984e-06, "loss": 0.7067, "step": 1297 }, { "epoch": 0.5034422573450984, "grad_norm": 2.54789296068977, "learning_rate": 5.808909982763825e-06, "loss": 0.7069, "step": 1298 }, { "epoch": 0.5038301173276447, "grad_norm": 2.4718808630606026, "learning_rate": 5.802227762661058e-06, "loss": 0.7398, "step": 1299 }, { "epoch": 0.504217977310191, "grad_norm": 2.2152374523531746, "learning_rate": 5.795544071528742e-06, "loss": 0.6348, "step": 1300 }, { "epoch": 0.5046058372927373, "grad_norm": 3.102379595539992, "learning_rate": 5.7888589216226295e-06, "loss": 0.7329, "step": 1301 }, { "epoch": 0.5049936972752836, "grad_norm": 2.151482917832596, "learning_rate": 5.782172325201155e-06, "loss": 0.6468, "step": 1302 }, { "epoch": 0.50538155725783, "grad_norm": 2.7663040681800948, "learning_rate": 5.775484294525399e-06, "loss": 0.7143, "step": 1303 }, { "epoch": 0.5057694172403763, "grad_norm": 2.732484164107616, "learning_rate": 5.768794841859074e-06, "loss": 0.7471, "step": 1304 }, { "epoch": 0.5061572772229225, "grad_norm": 2.7996951073030187, "learning_rate": 5.762103979468501e-06, "loss": 0.741, "step": 1305 }, { "epoch": 0.5065451372054688, "grad_norm": 1.9466157023453141, "learning_rate": 5.755411719622584e-06, "loss": 0.6665, "step": 1306 }, { "epoch": 0.5069329971880151, "grad_norm": 2.253351971349571, "learning_rate": 5.748718074592792e-06, "loss": 0.7334, "step": 1307 }, { "epoch": 0.5073208571705614, "grad_norm": 2.740112282829335, "learning_rate": 5.742023056653131e-06, "loss": 0.7262, "step": 1308 }, { "epoch": 0.5077087171531077, "grad_norm": 2.724494217932167, "learning_rate": 5.735326678080127e-06, "loss": 0.6982, "step": 1309 }, { "epoch": 0.508096577135654, "grad_norm": 1.8991733092693257, "learning_rate": 5.728628951152799e-06, "loss": 0.6675, "step": 1310 }, { "epoch": 0.5084844371182003, "grad_norm": 2.02322688793638, "learning_rate": 5.721929888152642e-06, "loss": 0.7297, "step": 1311 }, { "epoch": 0.5088722971007467, "grad_norm": 3.405791125858387, "learning_rate": 5.715229501363595e-06, "loss": 0.7098, "step": 1312 }, { "epoch": 0.509260157083293, "grad_norm": 2.284069526570624, "learning_rate": 5.708527803072031e-06, "loss": 0.6611, "step": 1313 }, { "epoch": 0.5096480170658393, "grad_norm": 1.9902319592256599, "learning_rate": 5.701824805566722e-06, "loss": 0.6778, "step": 1314 }, { "epoch": 0.5100358770483855, "grad_norm": 2.447442367176345, "learning_rate": 5.695120521138827e-06, "loss": 0.6242, "step": 1315 }, { "epoch": 0.5104237370309318, "grad_norm": 2.5526697052470775, "learning_rate": 5.688414962081862e-06, "loss": 0.7125, "step": 1316 }, { "epoch": 0.5108115970134781, "grad_norm": 2.9993904225055226, "learning_rate": 5.681708140691681e-06, "loss": 0.6937, "step": 1317 }, { "epoch": 0.5111994569960244, "grad_norm": 2.3440698928264885, "learning_rate": 5.675000069266451e-06, "loss": 0.69, "step": 1318 }, { "epoch": 0.5115873169785707, "grad_norm": 2.7701599674487674, "learning_rate": 5.668290760106636e-06, "loss": 0.6896, "step": 1319 }, { "epoch": 0.511975176961117, "grad_norm": 2.355338752120438, "learning_rate": 5.661580225514966e-06, "loss": 0.7504, "step": 1320 }, { "epoch": 0.5123630369436634, "grad_norm": 2.3327115727420944, "learning_rate": 5.654868477796418e-06, "loss": 0.6478, "step": 1321 }, { "epoch": 0.5127508969262097, "grad_norm": 2.2537723706175967, "learning_rate": 5.648155529258195e-06, "loss": 0.7238, "step": 1322 }, { "epoch": 0.513138756908756, "grad_norm": 1.915738691654421, "learning_rate": 5.641441392209699e-06, "loss": 0.6601, "step": 1323 }, { "epoch": 0.5135266168913022, "grad_norm": 2.324675436882344, "learning_rate": 5.634726078962514e-06, "loss": 0.6981, "step": 1324 }, { "epoch": 0.5139144768738485, "grad_norm": 2.301300400928471, "learning_rate": 5.628009601830382e-06, "loss": 0.675, "step": 1325 }, { "epoch": 0.5143023368563948, "grad_norm": 3.0323386959646497, "learning_rate": 5.621291973129177e-06, "loss": 0.6727, "step": 1326 }, { "epoch": 0.5146901968389411, "grad_norm": 3.3367595086749424, "learning_rate": 5.614573205176882e-06, "loss": 0.7775, "step": 1327 }, { "epoch": 0.5150780568214874, "grad_norm": 2.999572281871942, "learning_rate": 5.607853310293575e-06, "loss": 0.697, "step": 1328 }, { "epoch": 0.5154659168040338, "grad_norm": 2.7945141128619326, "learning_rate": 5.601132300801398e-06, "loss": 0.655, "step": 1329 }, { "epoch": 0.5158537767865801, "grad_norm": 2.833496282978474, "learning_rate": 5.594410189024533e-06, "loss": 0.6603, "step": 1330 }, { "epoch": 0.5162416367691264, "grad_norm": 2.2421169198829527, "learning_rate": 5.587686987289189e-06, "loss": 0.6759, "step": 1331 }, { "epoch": 0.5166294967516727, "grad_norm": 2.196451076053958, "learning_rate": 5.580962707923571e-06, "loss": 0.6541, "step": 1332 }, { "epoch": 0.517017356734219, "grad_norm": 2.248285764653769, "learning_rate": 5.574237363257858e-06, "loss": 0.7005, "step": 1333 }, { "epoch": 0.5174052167167652, "grad_norm": 2.492089603346059, "learning_rate": 5.567510965624187e-06, "loss": 0.6943, "step": 1334 }, { "epoch": 0.5177930766993115, "grad_norm": 2.5580508944336398, "learning_rate": 5.560783527356622e-06, "loss": 0.7076, "step": 1335 }, { "epoch": 0.5181809366818578, "grad_norm": 1.8513634093009363, "learning_rate": 5.554055060791138e-06, "loss": 0.5972, "step": 1336 }, { "epoch": 0.5185687966644041, "grad_norm": 3.326935994021286, "learning_rate": 5.547325578265594e-06, "loss": 0.6874, "step": 1337 }, { "epoch": 0.5189566566469505, "grad_norm": 2.0322982092083106, "learning_rate": 5.540595092119709e-06, "loss": 0.7092, "step": 1338 }, { "epoch": 0.5193445166294968, "grad_norm": 2.0820900794685837, "learning_rate": 5.53386361469505e-06, "loss": 0.6873, "step": 1339 }, { "epoch": 0.5197323766120431, "grad_norm": 2.426090547806607, "learning_rate": 5.527131158334993e-06, "loss": 0.6757, "step": 1340 }, { "epoch": 0.5201202365945894, "grad_norm": 2.145805958877921, "learning_rate": 5.520397735384716e-06, "loss": 0.7112, "step": 1341 }, { "epoch": 0.5205080965771357, "grad_norm": 2.2833541816200658, "learning_rate": 5.513663358191166e-06, "loss": 0.6813, "step": 1342 }, { "epoch": 0.5208959565596819, "grad_norm": 2.2162817324423645, "learning_rate": 5.50692803910304e-06, "loss": 0.6605, "step": 1343 }, { "epoch": 0.5212838165422282, "grad_norm": 2.688476273514431, "learning_rate": 5.500191790470761e-06, "loss": 0.7044, "step": 1344 }, { "epoch": 0.5216716765247745, "grad_norm": 2.971660516310938, "learning_rate": 5.493454624646461e-06, "loss": 0.7763, "step": 1345 }, { "epoch": 0.5220595365073208, "grad_norm": 2.0808744646304884, "learning_rate": 5.4867165539839505e-06, "loss": 0.6906, "step": 1346 }, { "epoch": 0.5224473964898672, "grad_norm": 2.5719581497801545, "learning_rate": 5.479977590838697e-06, "loss": 0.717, "step": 1347 }, { "epoch": 0.5228352564724135, "grad_norm": 1.8378409786168817, "learning_rate": 5.473237747567805e-06, "loss": 0.6929, "step": 1348 }, { "epoch": 0.5232231164549598, "grad_norm": 2.613107708362667, "learning_rate": 5.466497036530002e-06, "loss": 0.6784, "step": 1349 }, { "epoch": 0.5236109764375061, "grad_norm": 3.062728113971853, "learning_rate": 5.459755470085595e-06, "loss": 0.6868, "step": 1350 }, { "epoch": 0.5239988364200524, "grad_norm": 2.3266557772898917, "learning_rate": 5.453013060596465e-06, "loss": 0.7658, "step": 1351 }, { "epoch": 0.5243866964025986, "grad_norm": 2.5935972721415577, "learning_rate": 5.4462698204260365e-06, "loss": 0.7112, "step": 1352 }, { "epoch": 0.5247745563851449, "grad_norm": 2.302797931927242, "learning_rate": 5.439525761939261e-06, "loss": 0.6185, "step": 1353 }, { "epoch": 0.5251624163676912, "grad_norm": 2.7822657317116715, "learning_rate": 5.432780897502588e-06, "loss": 0.764, "step": 1354 }, { "epoch": 0.5255502763502375, "grad_norm": 2.4465202175550655, "learning_rate": 5.4260352394839445e-06, "loss": 0.658, "step": 1355 }, { "epoch": 0.5259381363327839, "grad_norm": 2.85520995029526, "learning_rate": 5.419288800252713e-06, "loss": 0.7039, "step": 1356 }, { "epoch": 0.5263259963153302, "grad_norm": 1.936502255340335, "learning_rate": 5.412541592179708e-06, "loss": 0.6796, "step": 1357 }, { "epoch": 0.5267138562978765, "grad_norm": 3.4516463591460265, "learning_rate": 5.405793627637157e-06, "loss": 0.7021, "step": 1358 }, { "epoch": 0.5271017162804228, "grad_norm": 2.112369557476709, "learning_rate": 5.3990449189986705e-06, "loss": 0.6586, "step": 1359 }, { "epoch": 0.5274895762629691, "grad_norm": 3.371250010682158, "learning_rate": 5.392295478639226e-06, "loss": 0.7397, "step": 1360 }, { "epoch": 0.5278774362455154, "grad_norm": 2.2398663183553253, "learning_rate": 5.38554531893514e-06, "loss": 0.6826, "step": 1361 }, { "epoch": 0.5282652962280616, "grad_norm": 3.877045370982858, "learning_rate": 5.378794452264053e-06, "loss": 0.7496, "step": 1362 }, { "epoch": 0.5286531562106079, "grad_norm": 2.2798984863547926, "learning_rate": 5.372042891004896e-06, "loss": 0.637, "step": 1363 }, { "epoch": 0.5290410161931542, "grad_norm": 2.328258126444413, "learning_rate": 5.365290647537878e-06, "loss": 0.6556, "step": 1364 }, { "epoch": 0.5294288761757006, "grad_norm": 2.5876698963881024, "learning_rate": 5.3585377342444566e-06, "loss": 0.738, "step": 1365 }, { "epoch": 0.5298167361582469, "grad_norm": 2.0713333428049636, "learning_rate": 5.351784163507319e-06, "loss": 0.6897, "step": 1366 }, { "epoch": 0.5302045961407932, "grad_norm": 2.8755862753715355, "learning_rate": 5.345029947710357e-06, "loss": 0.7464, "step": 1367 }, { "epoch": 0.5305924561233395, "grad_norm": 2.4801751801028256, "learning_rate": 5.338275099238647e-06, "loss": 0.6639, "step": 1368 }, { "epoch": 0.5309803161058858, "grad_norm": 2.7227634780115, "learning_rate": 5.331519630478421e-06, "loss": 0.7902, "step": 1369 }, { "epoch": 0.5313681760884321, "grad_norm": 2.355346766059658, "learning_rate": 5.3247635538170536e-06, "loss": 0.5882, "step": 1370 }, { "epoch": 0.5317560360709783, "grad_norm": 2.921106700330558, "learning_rate": 5.318006881643034e-06, "loss": 0.7655, "step": 1371 }, { "epoch": 0.5321438960535246, "grad_norm": 3.3373871347389388, "learning_rate": 5.311249626345938e-06, "loss": 0.7148, "step": 1372 }, { "epoch": 0.532531756036071, "grad_norm": 1.9764317676495966, "learning_rate": 5.304491800316416e-06, "loss": 0.6524, "step": 1373 }, { "epoch": 0.5329196160186173, "grad_norm": 3.112241300842561, "learning_rate": 5.297733415946161e-06, "loss": 0.8009, "step": 1374 }, { "epoch": 0.5333074760011636, "grad_norm": 2.651911899713928, "learning_rate": 5.290974485627894e-06, "loss": 0.7153, "step": 1375 }, { "epoch": 0.5336953359837099, "grad_norm": 2.6700236736347684, "learning_rate": 5.284215021755336e-06, "loss": 0.6867, "step": 1376 }, { "epoch": 0.5340831959662562, "grad_norm": 3.778235281198076, "learning_rate": 5.277455036723182e-06, "loss": 0.7476, "step": 1377 }, { "epoch": 0.5344710559488025, "grad_norm": 2.596850607389826, "learning_rate": 5.270694542927089e-06, "loss": 0.7177, "step": 1378 }, { "epoch": 0.5348589159313488, "grad_norm": 3.0270664860929624, "learning_rate": 5.263933552763641e-06, "loss": 0.6321, "step": 1379 }, { "epoch": 0.5352467759138951, "grad_norm": 2.5561748667411086, "learning_rate": 5.257172078630337e-06, "loss": 0.713, "step": 1380 }, { "epoch": 0.5356346358964413, "grad_norm": 2.3603224553238236, "learning_rate": 5.250410132925561e-06, "loss": 0.661, "step": 1381 }, { "epoch": 0.5360224958789876, "grad_norm": 2.379895048874697, "learning_rate": 5.243647728048561e-06, "loss": 0.7104, "step": 1382 }, { "epoch": 0.536410355861534, "grad_norm": 2.5824162923484866, "learning_rate": 5.23688487639943e-06, "loss": 0.7178, "step": 1383 }, { "epoch": 0.5367982158440803, "grad_norm": 2.471216946659117, "learning_rate": 5.2301215903790785e-06, "loss": 0.7351, "step": 1384 }, { "epoch": 0.5371860758266266, "grad_norm": 2.4304355558212927, "learning_rate": 5.223357882389212e-06, "loss": 0.7165, "step": 1385 }, { "epoch": 0.5375739358091729, "grad_norm": 2.3683982897547735, "learning_rate": 5.2165937648323115e-06, "loss": 0.7313, "step": 1386 }, { "epoch": 0.5379617957917192, "grad_norm": 2.7305988539173978, "learning_rate": 5.209829250111609e-06, "loss": 0.7174, "step": 1387 }, { "epoch": 0.5383496557742655, "grad_norm": 2.0545163789185947, "learning_rate": 5.203064350631064e-06, "loss": 0.6837, "step": 1388 }, { "epoch": 0.5387375157568118, "grad_norm": 1.9669546989153932, "learning_rate": 5.1962990787953436e-06, "loss": 0.6665, "step": 1389 }, { "epoch": 0.539125375739358, "grad_norm": 3.5326884114349815, "learning_rate": 5.189533447009795e-06, "loss": 0.7181, "step": 1390 }, { "epoch": 0.5395132357219043, "grad_norm": 2.5643601955501945, "learning_rate": 5.182767467680425e-06, "loss": 0.7066, "step": 1391 }, { "epoch": 0.5399010957044507, "grad_norm": 2.7702547600432443, "learning_rate": 5.176001153213881e-06, "loss": 0.7244, "step": 1392 }, { "epoch": 0.540288955686997, "grad_norm": 2.086367082913311, "learning_rate": 5.1692345160174225e-06, "loss": 0.6831, "step": 1393 }, { "epoch": 0.5406768156695433, "grad_norm": 2.4350420083490416, "learning_rate": 5.1624675684989035e-06, "loss": 0.6848, "step": 1394 }, { "epoch": 0.5410646756520896, "grad_norm": 2.064134281219293, "learning_rate": 5.155700323066741e-06, "loss": 0.6888, "step": 1395 }, { "epoch": 0.5414525356346359, "grad_norm": 2.6201875581295013, "learning_rate": 5.148932792129905e-06, "loss": 0.6502, "step": 1396 }, { "epoch": 0.5418403956171822, "grad_norm": 2.6803628989282293, "learning_rate": 5.142164988097885e-06, "loss": 0.7054, "step": 1397 }, { "epoch": 0.5422282555997285, "grad_norm": 2.0051137168830326, "learning_rate": 5.1353969233806735e-06, "loss": 0.6539, "step": 1398 }, { "epoch": 0.5426161155822748, "grad_norm": 2.277836363968983, "learning_rate": 5.128628610388739e-06, "loss": 0.7001, "step": 1399 }, { "epoch": 0.543003975564821, "grad_norm": 2.45770150869687, "learning_rate": 5.121860061533006e-06, "loss": 0.6486, "step": 1400 }, { "epoch": 0.5433918355473674, "grad_norm": 2.2403641077361045, "learning_rate": 5.1150912892248335e-06, "loss": 0.6811, "step": 1401 }, { "epoch": 0.5437796955299137, "grad_norm": 2.853619600461689, "learning_rate": 5.108322305875988e-06, "loss": 0.6939, "step": 1402 }, { "epoch": 0.54416755551246, "grad_norm": 2.1759024398322637, "learning_rate": 5.101553123898621e-06, "loss": 0.7164, "step": 1403 }, { "epoch": 0.5445554154950063, "grad_norm": 1.960863483904538, "learning_rate": 5.0947837557052536e-06, "loss": 0.6261, "step": 1404 }, { "epoch": 0.5449432754775526, "grad_norm": 2.7137930507189743, "learning_rate": 5.0880142137087455e-06, "loss": 0.765, "step": 1405 }, { "epoch": 0.5453311354600989, "grad_norm": 2.430204281039372, "learning_rate": 5.0812445103222745e-06, "loss": 0.7029, "step": 1406 }, { "epoch": 0.5457189954426452, "grad_norm": 2.077227113159899, "learning_rate": 5.074474657959313e-06, "loss": 0.6841, "step": 1407 }, { "epoch": 0.5461068554251916, "grad_norm": 3.274591900016193, "learning_rate": 5.06770466903361e-06, "loss": 0.7355, "step": 1408 }, { "epoch": 0.5464947154077378, "grad_norm": 1.9236927579181267, "learning_rate": 5.060934555959164e-06, "loss": 0.6945, "step": 1409 }, { "epoch": 0.5468825753902841, "grad_norm": 2.9420198108412543, "learning_rate": 5.054164331150199e-06, "loss": 0.7349, "step": 1410 }, { "epoch": 0.5472704353728304, "grad_norm": 2.3801534697840876, "learning_rate": 5.047394007021149e-06, "loss": 0.6189, "step": 1411 }, { "epoch": 0.5476582953553767, "grad_norm": 2.3215646416908187, "learning_rate": 5.040623595986622e-06, "loss": 0.6872, "step": 1412 }, { "epoch": 0.548046155337923, "grad_norm": 3.10838591997273, "learning_rate": 5.033853110461393e-06, "loss": 0.7439, "step": 1413 }, { "epoch": 0.5484340153204693, "grad_norm": 1.9757899988433187, "learning_rate": 5.027082562860368e-06, "loss": 0.6351, "step": 1414 }, { "epoch": 0.5488218753030156, "grad_norm": 2.168242796578048, "learning_rate": 5.020311965598572e-06, "loss": 0.6377, "step": 1415 }, { "epoch": 0.5492097352855619, "grad_norm": 2.7531011110152463, "learning_rate": 5.013541331091117e-06, "loss": 0.663, "step": 1416 }, { "epoch": 0.5495975952681083, "grad_norm": 2.4708346289366534, "learning_rate": 5.006770671753183e-06, "loss": 0.6773, "step": 1417 }, { "epoch": 0.5499854552506546, "grad_norm": 2.428584885714815, "learning_rate": 5e-06, "loss": 0.6251, "step": 1418 }, { "epoch": 0.5503733152332008, "grad_norm": 2.3729712221496473, "learning_rate": 4.993229328246818e-06, "loss": 0.6979, "step": 1419 }, { "epoch": 0.5507611752157471, "grad_norm": 2.674165697363652, "learning_rate": 4.986458668908886e-06, "loss": 0.7159, "step": 1420 }, { "epoch": 0.5511490351982934, "grad_norm": 3.0288504994339296, "learning_rate": 4.9796880344014305e-06, "loss": 0.6797, "step": 1421 }, { "epoch": 0.5515368951808397, "grad_norm": 2.4901878054217987, "learning_rate": 4.972917437139634e-06, "loss": 0.6778, "step": 1422 }, { "epoch": 0.551924755163386, "grad_norm": 2.267469507243416, "learning_rate": 4.966146889538608e-06, "loss": 0.6895, "step": 1423 }, { "epoch": 0.5523126151459323, "grad_norm": 2.1954132760098735, "learning_rate": 4.959376404013378e-06, "loss": 0.7072, "step": 1424 }, { "epoch": 0.5527004751284786, "grad_norm": 2.1085496017273826, "learning_rate": 4.952605992978853e-06, "loss": 0.66, "step": 1425 }, { "epoch": 0.553088335111025, "grad_norm": 2.207573857657043, "learning_rate": 4.945835668849801e-06, "loss": 0.7122, "step": 1426 }, { "epoch": 0.5534761950935713, "grad_norm": 1.8139489550549492, "learning_rate": 4.9390654440408374e-06, "loss": 0.6457, "step": 1427 }, { "epoch": 0.5538640550761175, "grad_norm": 2.939579541467068, "learning_rate": 4.932295330966392e-06, "loss": 0.7745, "step": 1428 }, { "epoch": 0.5542519150586638, "grad_norm": 2.27655789834531, "learning_rate": 4.925525342040689e-06, "loss": 0.6842, "step": 1429 }, { "epoch": 0.5546397750412101, "grad_norm": 2.5391122851231276, "learning_rate": 4.918755489677729e-06, "loss": 0.6848, "step": 1430 }, { "epoch": 0.5550276350237564, "grad_norm": 2.287723047265105, "learning_rate": 4.9119857862912544e-06, "loss": 0.6974, "step": 1431 }, { "epoch": 0.5554154950063027, "grad_norm": 1.9924090805045678, "learning_rate": 4.905216244294746e-06, "loss": 0.6043, "step": 1432 }, { "epoch": 0.555803354988849, "grad_norm": 3.0440664881333976, "learning_rate": 4.8984468761013794e-06, "loss": 0.7551, "step": 1433 }, { "epoch": 0.5561912149713953, "grad_norm": 2.120031904164007, "learning_rate": 4.891677694124013e-06, "loss": 0.6454, "step": 1434 }, { "epoch": 0.5565790749539417, "grad_norm": 2.5307676384413123, "learning_rate": 4.884908710775167e-06, "loss": 0.6897, "step": 1435 }, { "epoch": 0.556966934936488, "grad_norm": 2.821576890013646, "learning_rate": 4.878139938466995e-06, "loss": 0.6889, "step": 1436 }, { "epoch": 0.5573547949190343, "grad_norm": 2.18924651669471, "learning_rate": 4.871371389611263e-06, "loss": 0.6436, "step": 1437 }, { "epoch": 0.5577426549015805, "grad_norm": 2.6659194601428364, "learning_rate": 4.864603076619329e-06, "loss": 0.7002, "step": 1438 }, { "epoch": 0.5581305148841268, "grad_norm": 2.6541865312596835, "learning_rate": 4.8578350119021176e-06, "loss": 0.7387, "step": 1439 }, { "epoch": 0.5585183748666731, "grad_norm": 2.3081827000486443, "learning_rate": 4.851067207870096e-06, "loss": 0.7296, "step": 1440 }, { "epoch": 0.5589062348492194, "grad_norm": 2.0731290644604137, "learning_rate": 4.8442996769332605e-06, "loss": 0.6635, "step": 1441 }, { "epoch": 0.5592940948317657, "grad_norm": 2.4149066847976877, "learning_rate": 4.837532431501098e-06, "loss": 0.7179, "step": 1442 }, { "epoch": 0.559681954814312, "grad_norm": 2.0342147835957127, "learning_rate": 4.830765483982578e-06, "loss": 0.6041, "step": 1443 }, { "epoch": 0.5600698147968584, "grad_norm": 3.0281470192374553, "learning_rate": 4.82399884678612e-06, "loss": 0.7318, "step": 1444 }, { "epoch": 0.5604576747794047, "grad_norm": 2.722364428143249, "learning_rate": 4.817232532319577e-06, "loss": 0.6399, "step": 1445 }, { "epoch": 0.560845534761951, "grad_norm": 1.9389174847412152, "learning_rate": 4.8104665529902075e-06, "loss": 0.6422, "step": 1446 }, { "epoch": 0.5612333947444972, "grad_norm": 2.1593830480614478, "learning_rate": 4.803700921204659e-06, "loss": 0.6356, "step": 1447 }, { "epoch": 0.5616212547270435, "grad_norm": 2.6711909420129087, "learning_rate": 4.796935649368936e-06, "loss": 0.746, "step": 1448 }, { "epoch": 0.5620091147095898, "grad_norm": 2.324008710285125, "learning_rate": 4.790170749888392e-06, "loss": 0.6637, "step": 1449 }, { "epoch": 0.5623969746921361, "grad_norm": 2.7964399994450613, "learning_rate": 4.783406235167689e-06, "loss": 0.6812, "step": 1450 }, { "epoch": 0.5627848346746824, "grad_norm": 3.1932371391676337, "learning_rate": 4.77664211761079e-06, "loss": 0.7236, "step": 1451 }, { "epoch": 0.5631726946572287, "grad_norm": 2.432957968266681, "learning_rate": 4.769878409620923e-06, "loss": 0.7385, "step": 1452 }, { "epoch": 0.5635605546397751, "grad_norm": 2.898037614188649, "learning_rate": 4.763115123600571e-06, "loss": 0.6836, "step": 1453 }, { "epoch": 0.5639484146223214, "grad_norm": 2.2786589584318673, "learning_rate": 4.756352271951441e-06, "loss": 0.6847, "step": 1454 }, { "epoch": 0.5643362746048677, "grad_norm": 3.10813517253798, "learning_rate": 4.7495898670744415e-06, "loss": 0.7299, "step": 1455 }, { "epoch": 0.564724134587414, "grad_norm": 3.231559386463882, "learning_rate": 4.742827921369665e-06, "loss": 0.7279, "step": 1456 }, { "epoch": 0.5651119945699602, "grad_norm": 2.322589228478006, "learning_rate": 4.7360664472363605e-06, "loss": 0.7396, "step": 1457 }, { "epoch": 0.5654998545525065, "grad_norm": 2.414793630243324, "learning_rate": 4.729305457072913e-06, "loss": 0.6787, "step": 1458 }, { "epoch": 0.5658877145350528, "grad_norm": 2.0710623196763276, "learning_rate": 4.722544963276819e-06, "loss": 0.6739, "step": 1459 }, { "epoch": 0.5662755745175991, "grad_norm": 2.3816781530879694, "learning_rate": 4.715784978244666e-06, "loss": 0.7051, "step": 1460 }, { "epoch": 0.5666634345001454, "grad_norm": 3.064338614781548, "learning_rate": 4.709025514372107e-06, "loss": 0.6411, "step": 1461 }, { "epoch": 0.5670512944826918, "grad_norm": 2.1377919393647407, "learning_rate": 4.70226658405384e-06, "loss": 0.6793, "step": 1462 }, { "epoch": 0.5674391544652381, "grad_norm": 3.4607245339009403, "learning_rate": 4.695508199683587e-06, "loss": 0.6826, "step": 1463 }, { "epoch": 0.5678270144477844, "grad_norm": 2.35491103658332, "learning_rate": 4.688750373654065e-06, "loss": 0.7483, "step": 1464 }, { "epoch": 0.5682148744303307, "grad_norm": 1.965885244513761, "learning_rate": 4.681993118356967e-06, "loss": 0.6713, "step": 1465 }, { "epoch": 0.5686027344128769, "grad_norm": 2.116527410664051, "learning_rate": 4.6752364461829456e-06, "loss": 0.6376, "step": 1466 }, { "epoch": 0.5689905943954232, "grad_norm": 2.0822347377477297, "learning_rate": 4.66848036952158e-06, "loss": 0.7115, "step": 1467 }, { "epoch": 0.5693784543779695, "grad_norm": 2.8087305149134076, "learning_rate": 4.661724900761355e-06, "loss": 0.7723, "step": 1468 }, { "epoch": 0.5697663143605158, "grad_norm": 2.7349163047804286, "learning_rate": 4.654970052289644e-06, "loss": 0.7198, "step": 1469 }, { "epoch": 0.5701541743430621, "grad_norm": 2.3149516893315276, "learning_rate": 4.648215836492682e-06, "loss": 0.7011, "step": 1470 }, { "epoch": 0.5705420343256085, "grad_norm": 2.301627525302107, "learning_rate": 4.641462265755545e-06, "loss": 0.6462, "step": 1471 }, { "epoch": 0.5709298943081548, "grad_norm": 3.2300097334774653, "learning_rate": 4.634709352462124e-06, "loss": 0.7394, "step": 1472 }, { "epoch": 0.5713177542907011, "grad_norm": 2.809377645438014, "learning_rate": 4.6279571089951056e-06, "loss": 0.6771, "step": 1473 }, { "epoch": 0.5717056142732474, "grad_norm": 2.1455253942866097, "learning_rate": 4.621205547735949e-06, "loss": 0.6808, "step": 1474 }, { "epoch": 0.5720934742557937, "grad_norm": 2.83647891793575, "learning_rate": 4.614454681064861e-06, "loss": 0.6978, "step": 1475 }, { "epoch": 0.5724813342383399, "grad_norm": 2.438188422451454, "learning_rate": 4.6077045213607765e-06, "loss": 0.7067, "step": 1476 }, { "epoch": 0.5728691942208862, "grad_norm": 2.4108952270155, "learning_rate": 4.600955081001331e-06, "loss": 0.6809, "step": 1477 }, { "epoch": 0.5732570542034325, "grad_norm": 2.748256854529697, "learning_rate": 4.594206372362845e-06, "loss": 0.7046, "step": 1478 }, { "epoch": 0.5736449141859788, "grad_norm": 2.001091293689959, "learning_rate": 4.587458407820293e-06, "loss": 0.6581, "step": 1479 }, { "epoch": 0.5740327741685252, "grad_norm": 2.792283110720085, "learning_rate": 4.580711199747289e-06, "loss": 0.6977, "step": 1480 }, { "epoch": 0.5744206341510715, "grad_norm": 1.7141733285987955, "learning_rate": 4.573964760516058e-06, "loss": 0.6387, "step": 1481 }, { "epoch": 0.5748084941336178, "grad_norm": 2.0214874742132336, "learning_rate": 4.567219102497413e-06, "loss": 0.6038, "step": 1482 }, { "epoch": 0.5751963541161641, "grad_norm": 2.767657498655234, "learning_rate": 4.56047423806074e-06, "loss": 0.73, "step": 1483 }, { "epoch": 0.5755842140987104, "grad_norm": 2.548884650801424, "learning_rate": 4.5537301795739635e-06, "loss": 0.6631, "step": 1484 }, { "epoch": 0.5759720740812566, "grad_norm": 2.7393423932141694, "learning_rate": 4.546986939403537e-06, "loss": 0.653, "step": 1485 }, { "epoch": 0.5763599340638029, "grad_norm": 2.366479125779377, "learning_rate": 4.540244529914406e-06, "loss": 0.7027, "step": 1486 }, { "epoch": 0.5767477940463492, "grad_norm": 2.2692281719616774, "learning_rate": 4.533502963469999e-06, "loss": 0.6839, "step": 1487 }, { "epoch": 0.5771356540288956, "grad_norm": 2.8381102536357767, "learning_rate": 4.5267622524321955e-06, "loss": 0.6679, "step": 1488 }, { "epoch": 0.5775235140114419, "grad_norm": 2.5680248140657462, "learning_rate": 4.520022409161307e-06, "loss": 0.6361, "step": 1489 }, { "epoch": 0.5779113739939882, "grad_norm": 2.3408783212969175, "learning_rate": 4.513283446016052e-06, "loss": 0.6365, "step": 1490 }, { "epoch": 0.5782992339765345, "grad_norm": 2.2342598741293367, "learning_rate": 4.50654537535354e-06, "loss": 0.7117, "step": 1491 }, { "epoch": 0.5786870939590808, "grad_norm": 2.819405706198201, "learning_rate": 4.499808209529239e-06, "loss": 0.6985, "step": 1492 }, { "epoch": 0.5790749539416271, "grad_norm": 2.4258135106856344, "learning_rate": 4.4930719608969615e-06, "loss": 0.7105, "step": 1493 }, { "epoch": 0.5794628139241734, "grad_norm": 2.3198759138453857, "learning_rate": 4.486336641808835e-06, "loss": 0.6388, "step": 1494 }, { "epoch": 0.5798506739067196, "grad_norm": 2.4538378992638226, "learning_rate": 4.479602264615285e-06, "loss": 0.7096, "step": 1495 }, { "epoch": 0.5802385338892659, "grad_norm": 2.7087757481001344, "learning_rate": 4.472868841665008e-06, "loss": 0.6793, "step": 1496 }, { "epoch": 0.5806263938718123, "grad_norm": 3.1309584572938975, "learning_rate": 4.466136385304952e-06, "loss": 0.6717, "step": 1497 }, { "epoch": 0.5810142538543586, "grad_norm": 2.051902772912677, "learning_rate": 4.459404907880293e-06, "loss": 0.6883, "step": 1498 }, { "epoch": 0.5814021138369049, "grad_norm": 2.828332987561901, "learning_rate": 4.452674421734409e-06, "loss": 0.6452, "step": 1499 }, { "epoch": 0.5817899738194512, "grad_norm": 2.40750264788923, "learning_rate": 4.445944939208862e-06, "loss": 0.6246, "step": 1500 }, { "epoch": 0.5817899738194512, "eval_loss": 1.3614202737808228, "eval_runtime": 6.0459, "eval_samples_per_second": 0.165, "eval_steps_per_second": 0.165, "step": 1500 }, { "epoch": 0.5821778338019975, "grad_norm": 2.6211611030892117, "learning_rate": 4.439216472643378e-06, "loss": 0.6673, "step": 1501 }, { "epoch": 0.5825656937845438, "grad_norm": 2.9508285399242373, "learning_rate": 4.4324890343758134e-06, "loss": 0.719, "step": 1502 }, { "epoch": 0.5829535537670901, "grad_norm": 2.6869263011057734, "learning_rate": 4.425762636742143e-06, "loss": 0.7511, "step": 1503 }, { "epoch": 0.5833414137496363, "grad_norm": 2.178732362470906, "learning_rate": 4.419037292076431e-06, "loss": 0.6824, "step": 1504 }, { "epoch": 0.5837292737321826, "grad_norm": 1.781497513197294, "learning_rate": 4.4123130127108125e-06, "loss": 0.651, "step": 1505 }, { "epoch": 0.584117133714729, "grad_norm": 2.789426448882254, "learning_rate": 4.4055898109754684e-06, "loss": 0.6569, "step": 1506 }, { "epoch": 0.5845049936972753, "grad_norm": 2.7955744126730027, "learning_rate": 4.398867699198604e-06, "loss": 0.7306, "step": 1507 }, { "epoch": 0.5848928536798216, "grad_norm": 2.4372234931341406, "learning_rate": 4.392146689706426e-06, "loss": 0.7372, "step": 1508 }, { "epoch": 0.5852807136623679, "grad_norm": 2.0585365124654875, "learning_rate": 4.385426794823119e-06, "loss": 0.6365, "step": 1509 }, { "epoch": 0.5856685736449142, "grad_norm": 1.8208721276749247, "learning_rate": 4.378708026870825e-06, "loss": 0.6861, "step": 1510 }, { "epoch": 0.5860564336274605, "grad_norm": 2.8270075619502513, "learning_rate": 4.371990398169619e-06, "loss": 0.6727, "step": 1511 }, { "epoch": 0.5864442936100068, "grad_norm": 1.689743759557478, "learning_rate": 4.365273921037486e-06, "loss": 0.6418, "step": 1512 }, { "epoch": 0.5868321535925531, "grad_norm": 2.0355766315276544, "learning_rate": 4.358558607790303e-06, "loss": 0.7269, "step": 1513 }, { "epoch": 0.5872200135750993, "grad_norm": 2.1306341920412133, "learning_rate": 4.351844470741808e-06, "loss": 0.7005, "step": 1514 }, { "epoch": 0.5876078735576457, "grad_norm": 2.1775436052945962, "learning_rate": 4.345131522203584e-06, "loss": 0.6833, "step": 1515 }, { "epoch": 0.587995733540192, "grad_norm": 2.3770429633703527, "learning_rate": 4.338419774485036e-06, "loss": 0.631, "step": 1516 }, { "epoch": 0.5883835935227383, "grad_norm": 2.16372520542803, "learning_rate": 4.331709239893364e-06, "loss": 0.6601, "step": 1517 }, { "epoch": 0.5887714535052846, "grad_norm": 2.9852678551370166, "learning_rate": 4.32499993073355e-06, "loss": 0.7153, "step": 1518 }, { "epoch": 0.5891593134878309, "grad_norm": 2.6115193764297717, "learning_rate": 4.318291859308321e-06, "loss": 0.7132, "step": 1519 }, { "epoch": 0.5895471734703772, "grad_norm": 2.049170515471118, "learning_rate": 4.31158503791814e-06, "loss": 0.6359, "step": 1520 }, { "epoch": 0.5899350334529235, "grad_norm": 1.7677697645330466, "learning_rate": 4.3048794788611745e-06, "loss": 0.6331, "step": 1521 }, { "epoch": 0.5903228934354698, "grad_norm": 3.291953729389254, "learning_rate": 4.298175194433279e-06, "loss": 0.6973, "step": 1522 }, { "epoch": 0.590710753418016, "grad_norm": 2.8447215575759617, "learning_rate": 4.29147219692797e-06, "loss": 0.7033, "step": 1523 }, { "epoch": 0.5910986134005624, "grad_norm": 2.8056228045206693, "learning_rate": 4.284770498636406e-06, "loss": 0.6365, "step": 1524 }, { "epoch": 0.5914864733831087, "grad_norm": 2.1814713237824312, "learning_rate": 4.2780701118473585e-06, "loss": 0.6753, "step": 1525 }, { "epoch": 0.591874333365655, "grad_norm": 2.3094180031804665, "learning_rate": 4.271371048847201e-06, "loss": 0.6559, "step": 1526 }, { "epoch": 0.5922621933482013, "grad_norm": 2.9076586700654286, "learning_rate": 4.264673321919874e-06, "loss": 0.6741, "step": 1527 }, { "epoch": 0.5926500533307476, "grad_norm": 2.3658898524401786, "learning_rate": 4.25797694334687e-06, "loss": 0.6506, "step": 1528 }, { "epoch": 0.5930379133132939, "grad_norm": 2.923521318795054, "learning_rate": 4.251281925407209e-06, "loss": 0.6907, "step": 1529 }, { "epoch": 0.5934257732958402, "grad_norm": 2.2500910216441126, "learning_rate": 4.244588280377417e-06, "loss": 0.6344, "step": 1530 }, { "epoch": 0.5938136332783865, "grad_norm": 2.743316688185132, "learning_rate": 4.2378960205315005e-06, "loss": 0.6456, "step": 1531 }, { "epoch": 0.5942014932609329, "grad_norm": 2.887644498556077, "learning_rate": 4.231205158140927e-06, "loss": 0.6326, "step": 1532 }, { "epoch": 0.5945893532434791, "grad_norm": 2.8080354269004655, "learning_rate": 4.224515705474603e-06, "loss": 0.6988, "step": 1533 }, { "epoch": 0.5949772132260254, "grad_norm": 2.580906493034682, "learning_rate": 4.217827674798845e-06, "loss": 0.6738, "step": 1534 }, { "epoch": 0.5953650732085717, "grad_norm": 2.0486628611902327, "learning_rate": 4.211141078377371e-06, "loss": 0.6354, "step": 1535 }, { "epoch": 0.595752933191118, "grad_norm": 2.4983631859394033, "learning_rate": 4.20445592847126e-06, "loss": 0.661, "step": 1536 }, { "epoch": 0.5961407931736643, "grad_norm": 2.9154437415935632, "learning_rate": 4.1977722373389435e-06, "loss": 0.7433, "step": 1537 }, { "epoch": 0.5965286531562106, "grad_norm": 1.9633884293179145, "learning_rate": 4.191090017236177e-06, "loss": 0.6207, "step": 1538 }, { "epoch": 0.5969165131387569, "grad_norm": 2.5412581038148416, "learning_rate": 4.184409280416018e-06, "loss": 0.7157, "step": 1539 }, { "epoch": 0.5973043731213032, "grad_norm": 2.0672952903977784, "learning_rate": 4.177730039128803e-06, "loss": 0.7359, "step": 1540 }, { "epoch": 0.5976922331038496, "grad_norm": 2.1375445499771897, "learning_rate": 4.17105230562213e-06, "loss": 0.6142, "step": 1541 }, { "epoch": 0.5980800930863958, "grad_norm": 1.870650406986506, "learning_rate": 4.164376092140828e-06, "loss": 0.6665, "step": 1542 }, { "epoch": 0.5984679530689421, "grad_norm": 3.013341653566092, "learning_rate": 4.157701410926943e-06, "loss": 0.7235, "step": 1543 }, { "epoch": 0.5988558130514884, "grad_norm": 3.0675603695543137, "learning_rate": 4.151028274219707e-06, "loss": 0.7072, "step": 1544 }, { "epoch": 0.5992436730340347, "grad_norm": 1.8403944956093237, "learning_rate": 4.144356694255524e-06, "loss": 0.6529, "step": 1545 }, { "epoch": 0.599631533016581, "grad_norm": 1.9519971807080918, "learning_rate": 4.137686683267939e-06, "loss": 0.6568, "step": 1546 }, { "epoch": 0.6000193929991273, "grad_norm": 1.9635706076282193, "learning_rate": 4.1310182534876224e-06, "loss": 0.6082, "step": 1547 }, { "epoch": 0.6004072529816736, "grad_norm": 1.8531805920704663, "learning_rate": 4.1243514171423465e-06, "loss": 0.6284, "step": 1548 }, { "epoch": 0.60079511296422, "grad_norm": 3.110986078148104, "learning_rate": 4.117686186456959e-06, "loss": 0.7441, "step": 1549 }, { "epoch": 0.6011829729467663, "grad_norm": 3.1397819614418307, "learning_rate": 4.111022573653366e-06, "loss": 0.7486, "step": 1550 }, { "epoch": 0.6015708329293126, "grad_norm": 2.193250495467767, "learning_rate": 4.104360590950503e-06, "loss": 0.7161, "step": 1551 }, { "epoch": 0.6019586929118588, "grad_norm": 2.401706481101792, "learning_rate": 4.097700250564323e-06, "loss": 0.7327, "step": 1552 }, { "epoch": 0.6023465528944051, "grad_norm": 3.206251897831227, "learning_rate": 4.09104156470776e-06, "loss": 0.7453, "step": 1553 }, { "epoch": 0.6027344128769514, "grad_norm": 1.8504500107919302, "learning_rate": 4.0843845455907195e-06, "loss": 0.6906, "step": 1554 }, { "epoch": 0.6031222728594977, "grad_norm": 2.0962209469166977, "learning_rate": 4.077729205420049e-06, "loss": 0.7383, "step": 1555 }, { "epoch": 0.603510132842044, "grad_norm": 2.5729618005017687, "learning_rate": 4.0710755563995155e-06, "loss": 0.6363, "step": 1556 }, { "epoch": 0.6038979928245903, "grad_norm": 2.430827732385274, "learning_rate": 4.064423610729789e-06, "loss": 0.6678, "step": 1557 }, { "epoch": 0.6042858528071366, "grad_norm": 2.5070528837217667, "learning_rate": 4.057773380608411e-06, "loss": 0.7162, "step": 1558 }, { "epoch": 0.604673712789683, "grad_norm": 2.6590571161521526, "learning_rate": 4.051124878229779e-06, "loss": 0.6499, "step": 1559 }, { "epoch": 0.6050615727722293, "grad_norm": 2.470371795231045, "learning_rate": 4.044478115785128e-06, "loss": 0.6803, "step": 1560 }, { "epoch": 0.6054494327547755, "grad_norm": 2.6106636712910833, "learning_rate": 4.037833105462495e-06, "loss": 0.7214, "step": 1561 }, { "epoch": 0.6058372927373218, "grad_norm": 2.418309452049335, "learning_rate": 4.0311898594467084e-06, "loss": 0.6415, "step": 1562 }, { "epoch": 0.6062251527198681, "grad_norm": 3.034771923913275, "learning_rate": 4.02454838991936e-06, "loss": 0.7017, "step": 1563 }, { "epoch": 0.6066130127024144, "grad_norm": 2.4069764640726277, "learning_rate": 4.017908709058782e-06, "loss": 0.6821, "step": 1564 }, { "epoch": 0.6070008726849607, "grad_norm": 2.579551278503177, "learning_rate": 4.011270829040031e-06, "loss": 0.6726, "step": 1565 }, { "epoch": 0.607388732667507, "grad_norm": 2.1436157709441392, "learning_rate": 4.004634762034858e-06, "loss": 0.6731, "step": 1566 }, { "epoch": 0.6077765926500533, "grad_norm": 2.451220835673826, "learning_rate": 3.998000520211693e-06, "loss": 0.6202, "step": 1567 }, { "epoch": 0.6081644526325997, "grad_norm": 2.7774741694328373, "learning_rate": 3.991368115735612e-06, "loss": 0.6505, "step": 1568 }, { "epoch": 0.608552312615146, "grad_norm": 2.197982687163006, "learning_rate": 3.9847375607683335e-06, "loss": 0.7001, "step": 1569 }, { "epoch": 0.6089401725976923, "grad_norm": 2.1458630686493345, "learning_rate": 3.9781088674681764e-06, "loss": 0.6655, "step": 1570 }, { "epoch": 0.6093280325802385, "grad_norm": 2.7184335178590247, "learning_rate": 3.971482047990045e-06, "loss": 0.7619, "step": 1571 }, { "epoch": 0.6097158925627848, "grad_norm": 2.8962945971075325, "learning_rate": 3.964857114485412e-06, "loss": 0.7122, "step": 1572 }, { "epoch": 0.6101037525453311, "grad_norm": 2.038127239324106, "learning_rate": 3.958234079102288e-06, "loss": 0.6794, "step": 1573 }, { "epoch": 0.6104916125278774, "grad_norm": 2.1390505294836477, "learning_rate": 3.951612953985207e-06, "loss": 0.6375, "step": 1574 }, { "epoch": 0.6108794725104237, "grad_norm": 1.8127337860689428, "learning_rate": 3.944993751275198e-06, "loss": 0.6414, "step": 1575 }, { "epoch": 0.61126733249297, "grad_norm": 3.1672751081160526, "learning_rate": 3.938376483109762e-06, "loss": 0.7234, "step": 1576 }, { "epoch": 0.6116551924755164, "grad_norm": 2.3336109331422743, "learning_rate": 3.931761161622861e-06, "loss": 0.7055, "step": 1577 }, { "epoch": 0.6120430524580627, "grad_norm": 2.51861892216, "learning_rate": 3.92514779894488e-06, "loss": 0.7194, "step": 1578 }, { "epoch": 0.612430912440609, "grad_norm": 2.2779588128226678, "learning_rate": 3.918536407202614e-06, "loss": 0.6693, "step": 1579 }, { "epoch": 0.6128187724231552, "grad_norm": 1.7932487234130976, "learning_rate": 3.911926998519244e-06, "loss": 0.6324, "step": 1580 }, { "epoch": 0.6132066324057015, "grad_norm": 2.963439551573898, "learning_rate": 3.905319585014316e-06, "loss": 0.6809, "step": 1581 }, { "epoch": 0.6135944923882478, "grad_norm": 2.4164832873116824, "learning_rate": 3.898714178803716e-06, "loss": 0.6591, "step": 1582 }, { "epoch": 0.6139823523707941, "grad_norm": 2.743445880300928, "learning_rate": 3.892110791999649e-06, "loss": 0.645, "step": 1583 }, { "epoch": 0.6143702123533404, "grad_norm": 2.342967117222375, "learning_rate": 3.8855094367106185e-06, "loss": 0.6509, "step": 1584 }, { "epoch": 0.6147580723358868, "grad_norm": 2.0285776990165076, "learning_rate": 3.878910125041401e-06, "loss": 0.6561, "step": 1585 }, { "epoch": 0.6151459323184331, "grad_norm": 2.2490922372643163, "learning_rate": 3.87231286909303e-06, "loss": 0.6292, "step": 1586 }, { "epoch": 0.6155337923009794, "grad_norm": 2.439339348011258, "learning_rate": 3.865717680962763e-06, "loss": 0.6871, "step": 1587 }, { "epoch": 0.6159216522835257, "grad_norm": 3.242837189633756, "learning_rate": 3.859124572744072e-06, "loss": 0.6852, "step": 1588 }, { "epoch": 0.616309512266072, "grad_norm": 2.3507534892279653, "learning_rate": 3.852533556526609e-06, "loss": 0.6134, "step": 1589 }, { "epoch": 0.6166973722486182, "grad_norm": 3.018239735957644, "learning_rate": 3.845944644396194e-06, "loss": 0.7906, "step": 1590 }, { "epoch": 0.6170852322311645, "grad_norm": 3.5588768348632818, "learning_rate": 3.839357848434789e-06, "loss": 0.7565, "step": 1591 }, { "epoch": 0.6174730922137108, "grad_norm": 1.8353955680650131, "learning_rate": 3.832773180720475e-06, "loss": 0.6755, "step": 1592 }, { "epoch": 0.6178609521962571, "grad_norm": 2.55389381473284, "learning_rate": 3.8261906533274254e-06, "loss": 0.6689, "step": 1593 }, { "epoch": 0.6182488121788035, "grad_norm": 2.8341022646946823, "learning_rate": 3.8196102783259e-06, "loss": 0.6799, "step": 1594 }, { "epoch": 0.6186366721613498, "grad_norm": 1.7291691322819571, "learning_rate": 3.813032067782202e-06, "loss": 0.6365, "step": 1595 }, { "epoch": 0.6190245321438961, "grad_norm": 2.3950839446441643, "learning_rate": 3.806456033758669e-06, "loss": 0.6121, "step": 1596 }, { "epoch": 0.6194123921264424, "grad_norm": 2.1600429665590677, "learning_rate": 3.7998821883136483e-06, "loss": 0.6409, "step": 1597 }, { "epoch": 0.6198002521089887, "grad_norm": 1.8551850522175244, "learning_rate": 3.7933105435014727e-06, "loss": 0.6719, "step": 1598 }, { "epoch": 0.6201881120915349, "grad_norm": 2.4546228699627997, "learning_rate": 3.7867411113724402e-06, "loss": 0.6844, "step": 1599 }, { "epoch": 0.6205759720740812, "grad_norm": 2.7966036474580918, "learning_rate": 3.780173903972792e-06, "loss": 0.6866, "step": 1600 }, { "epoch": 0.6209638320566275, "grad_norm": 2.1275775592683117, "learning_rate": 3.773608933344689e-06, "loss": 0.6655, "step": 1601 }, { "epoch": 0.6213516920391738, "grad_norm": 1.7888521552200678, "learning_rate": 3.767046211526191e-06, "loss": 0.5791, "step": 1602 }, { "epoch": 0.6217395520217202, "grad_norm": 2.4355430925099193, "learning_rate": 3.7604857505512342e-06, "loss": 0.64, "step": 1603 }, { "epoch": 0.6221274120042665, "grad_norm": 1.8478190411509423, "learning_rate": 3.75392756244961e-06, "loss": 0.62, "step": 1604 }, { "epoch": 0.6225152719868128, "grad_norm": 2.6924873716235265, "learning_rate": 3.747371659246941e-06, "loss": 0.7134, "step": 1605 }, { "epoch": 0.6229031319693591, "grad_norm": 2.4641341971250013, "learning_rate": 3.7408180529646597e-06, "loss": 0.6756, "step": 1606 }, { "epoch": 0.6232909919519054, "grad_norm": 1.8741566324176209, "learning_rate": 3.7342667556199872e-06, "loss": 0.6384, "step": 1607 }, { "epoch": 0.6236788519344517, "grad_norm": 1.972891866360053, "learning_rate": 3.727717779225912e-06, "loss": 0.6208, "step": 1608 }, { "epoch": 0.6240667119169979, "grad_norm": 2.1576373272577714, "learning_rate": 3.721171135791164e-06, "loss": 0.6586, "step": 1609 }, { "epoch": 0.6244545718995442, "grad_norm": 2.282191799606226, "learning_rate": 3.7146268373201956e-06, "loss": 0.6375, "step": 1610 }, { "epoch": 0.6248424318820905, "grad_norm": 1.958906069246968, "learning_rate": 3.7080848958131644e-06, "loss": 0.6463, "step": 1611 }, { "epoch": 0.6252302918646369, "grad_norm": 2.298781552713599, "learning_rate": 3.7015453232659004e-06, "loss": 0.6978, "step": 1612 }, { "epoch": 0.6256181518471832, "grad_norm": 2.8030245795988584, "learning_rate": 3.695008131669891e-06, "loss": 0.6366, "step": 1613 }, { "epoch": 0.6260060118297295, "grad_norm": 2.584145367556981, "learning_rate": 3.6884733330122583e-06, "loss": 0.6438, "step": 1614 }, { "epoch": 0.6263938718122758, "grad_norm": 2.23324929265469, "learning_rate": 3.6819409392757366e-06, "loss": 0.6639, "step": 1615 }, { "epoch": 0.6267817317948221, "grad_norm": 3.45045006263283, "learning_rate": 3.67541096243865e-06, "loss": 0.6981, "step": 1616 }, { "epoch": 0.6271695917773684, "grad_norm": 2.4176315815480387, "learning_rate": 3.6688834144748906e-06, "loss": 0.6496, "step": 1617 }, { "epoch": 0.6275574517599146, "grad_norm": 2.702358690833787, "learning_rate": 3.662358307353897e-06, "loss": 0.6879, "step": 1618 }, { "epoch": 0.6279453117424609, "grad_norm": 2.2925155580581995, "learning_rate": 3.655835653040631e-06, "loss": 0.6485, "step": 1619 }, { "epoch": 0.6283331717250072, "grad_norm": 2.8946523849102936, "learning_rate": 3.6493154634955607e-06, "loss": 0.7064, "step": 1620 }, { "epoch": 0.6287210317075536, "grad_norm": 2.596166000552542, "learning_rate": 3.6427977506746293e-06, "loss": 0.6308, "step": 1621 }, { "epoch": 0.6291088916900999, "grad_norm": 1.7171262197079147, "learning_rate": 3.6362825265292424e-06, "loss": 0.614, "step": 1622 }, { "epoch": 0.6294967516726462, "grad_norm": 2.325715264494156, "learning_rate": 3.629769803006239e-06, "loss": 0.6434, "step": 1623 }, { "epoch": 0.6298846116551925, "grad_norm": 2.2363092743777546, "learning_rate": 3.623259592047875e-06, "loss": 0.7469, "step": 1624 }, { "epoch": 0.6302724716377388, "grad_norm": 2.9245043093337824, "learning_rate": 3.6167519055917992e-06, "loss": 0.6382, "step": 1625 }, { "epoch": 0.6306603316202851, "grad_norm": 2.8904426776336845, "learning_rate": 3.61024675557103e-06, "loss": 0.6862, "step": 1626 }, { "epoch": 0.6310481916028314, "grad_norm": 2.02434990907026, "learning_rate": 3.6037441539139328e-06, "loss": 0.6329, "step": 1627 }, { "epoch": 0.6314360515853776, "grad_norm": 2.585616365393776, "learning_rate": 3.597244112544208e-06, "loss": 0.6768, "step": 1628 }, { "epoch": 0.631823911567924, "grad_norm": 2.365124713436376, "learning_rate": 3.5907466433808524e-06, "loss": 0.6631, "step": 1629 }, { "epoch": 0.6322117715504703, "grad_norm": 2.3357154135006297, "learning_rate": 3.584251758338151e-06, "loss": 0.6462, "step": 1630 }, { "epoch": 0.6325996315330166, "grad_norm": 1.8755347283102712, "learning_rate": 3.5777594693256474e-06, "loss": 0.6154, "step": 1631 }, { "epoch": 0.6329874915155629, "grad_norm": 2.6822184851325273, "learning_rate": 3.571269788248128e-06, "loss": 0.6655, "step": 1632 }, { "epoch": 0.6333753514981092, "grad_norm": 2.1661059259563937, "learning_rate": 3.5647827270055945e-06, "loss": 0.6807, "step": 1633 }, { "epoch": 0.6337632114806555, "grad_norm": 2.328639721695957, "learning_rate": 3.5582982974932467e-06, "loss": 0.6995, "step": 1634 }, { "epoch": 0.6341510714632018, "grad_norm": 2.3021514841699764, "learning_rate": 3.551816511601458e-06, "loss": 0.7133, "step": 1635 }, { "epoch": 0.6345389314457481, "grad_norm": 2.1717346618252935, "learning_rate": 3.5453373812157517e-06, "loss": 0.6341, "step": 1636 }, { "epoch": 0.6349267914282943, "grad_norm": 2.8629218664183873, "learning_rate": 3.5388609182167867e-06, "loss": 0.758, "step": 1637 }, { "epoch": 0.6353146514108406, "grad_norm": 2.5147100697008304, "learning_rate": 3.532387134480327e-06, "loss": 0.6617, "step": 1638 }, { "epoch": 0.635702511393387, "grad_norm": 2.711615352650269, "learning_rate": 3.5259160418772242e-06, "loss": 0.729, "step": 1639 }, { "epoch": 0.6360903713759333, "grad_norm": 2.3501027805888635, "learning_rate": 3.5194476522733974e-06, "loss": 0.6602, "step": 1640 }, { "epoch": 0.6364782313584796, "grad_norm": 2.3787919317505137, "learning_rate": 3.512981977529806e-06, "loss": 0.6946, "step": 1641 }, { "epoch": 0.6368660913410259, "grad_norm": 3.0024432080364276, "learning_rate": 3.5065190295024334e-06, "loss": 0.7097, "step": 1642 }, { "epoch": 0.6372539513235722, "grad_norm": 1.715285396010998, "learning_rate": 3.500058820042263e-06, "loss": 0.6495, "step": 1643 }, { "epoch": 0.6376418113061185, "grad_norm": 2.954695405998854, "learning_rate": 3.493601360995256e-06, "loss": 0.6554, "step": 1644 }, { "epoch": 0.6380296712886648, "grad_norm": 2.4948065463716182, "learning_rate": 3.4871466642023264e-06, "loss": 0.6788, "step": 1645 }, { "epoch": 0.6384175312712111, "grad_norm": 3.061713511178477, "learning_rate": 3.4806947414993342e-06, "loss": 0.687, "step": 1646 }, { "epoch": 0.6388053912537573, "grad_norm": 2.621457155608336, "learning_rate": 3.4742456047170413e-06, "loss": 0.7255, "step": 1647 }, { "epoch": 0.6391932512363037, "grad_norm": 3.2419076322354643, "learning_rate": 3.4677992656811054e-06, "loss": 0.63, "step": 1648 }, { "epoch": 0.63958111121885, "grad_norm": 2.1843474185408462, "learning_rate": 3.4613557362120542e-06, "loss": 0.6567, "step": 1649 }, { "epoch": 0.6399689712013963, "grad_norm": 1.9304591173045524, "learning_rate": 3.4549150281252635e-06, "loss": 0.6428, "step": 1650 }, { "epoch": 0.6403568311839426, "grad_norm": 2.381801303530764, "learning_rate": 3.4484771532309348e-06, "loss": 0.6559, "step": 1651 }, { "epoch": 0.6407446911664889, "grad_norm": 1.750522554481377, "learning_rate": 3.442042123334075e-06, "loss": 0.624, "step": 1652 }, { "epoch": 0.6411325511490352, "grad_norm": 2.8362261901809362, "learning_rate": 3.435609950234473e-06, "loss": 0.7085, "step": 1653 }, { "epoch": 0.6415204111315815, "grad_norm": 3.3460978199650113, "learning_rate": 3.429180645726683e-06, "loss": 0.6689, "step": 1654 }, { "epoch": 0.6419082711141278, "grad_norm": 2.6392049533198754, "learning_rate": 3.422754221599995e-06, "loss": 0.7096, "step": 1655 }, { "epoch": 0.642296131096674, "grad_norm": 2.4767015966686974, "learning_rate": 3.4163306896384185e-06, "loss": 0.6486, "step": 1656 }, { "epoch": 0.6426839910792204, "grad_norm": 1.86147163541491, "learning_rate": 3.4099100616206597e-06, "loss": 0.6684, "step": 1657 }, { "epoch": 0.6430718510617667, "grad_norm": 2.9468048083131952, "learning_rate": 3.403492349320101e-06, "loss": 0.6763, "step": 1658 }, { "epoch": 0.643459711044313, "grad_norm": 1.8895588620063253, "learning_rate": 3.397077564504777e-06, "loss": 0.6586, "step": 1659 }, { "epoch": 0.6438475710268593, "grad_norm": 3.497683431375031, "learning_rate": 3.390665718937355e-06, "loss": 0.7027, "step": 1660 }, { "epoch": 0.6442354310094056, "grad_norm": 1.8729644036095547, "learning_rate": 3.3842568243751124e-06, "loss": 0.6551, "step": 1661 }, { "epoch": 0.6446232909919519, "grad_norm": 2.6156424989197506, "learning_rate": 3.3778508925699126e-06, "loss": 0.7259, "step": 1662 }, { "epoch": 0.6450111509744982, "grad_norm": 1.827342619381396, "learning_rate": 3.371447935268194e-06, "loss": 0.642, "step": 1663 }, { "epoch": 0.6453990109570445, "grad_norm": 1.909641058904357, "learning_rate": 3.3650479642109323e-06, "loss": 0.6186, "step": 1664 }, { "epoch": 0.6457868709395909, "grad_norm": 2.241562130745076, "learning_rate": 3.3586509911336316e-06, "loss": 0.5816, "step": 1665 }, { "epoch": 0.6461747309221371, "grad_norm": 2.6752206929551483, "learning_rate": 3.3522570277662986e-06, "loss": 0.6657, "step": 1666 }, { "epoch": 0.6465625909046834, "grad_norm": 2.9715899202430345, "learning_rate": 3.345866085833419e-06, "loss": 0.6688, "step": 1667 }, { "epoch": 0.6469504508872297, "grad_norm": 2.671076842077024, "learning_rate": 3.3394781770539406e-06, "loss": 0.7028, "step": 1668 }, { "epoch": 0.647338310869776, "grad_norm": 2.2702108841963664, "learning_rate": 3.3330933131412484e-06, "loss": 0.6997, "step": 1669 }, { "epoch": 0.6477261708523223, "grad_norm": 3.4984043250422445, "learning_rate": 3.3267115058031418e-06, "loss": 0.673, "step": 1670 }, { "epoch": 0.6481140308348686, "grad_norm": 2.8661296785244943, "learning_rate": 3.3203327667418207e-06, "loss": 0.6938, "step": 1671 }, { "epoch": 0.6485018908174149, "grad_norm": 2.5176665890846666, "learning_rate": 3.3139571076538547e-06, "loss": 0.6068, "step": 1672 }, { "epoch": 0.6488897507999613, "grad_norm": 2.297391751870999, "learning_rate": 3.3075845402301652e-06, "loss": 0.6703, "step": 1673 }, { "epoch": 0.6492776107825076, "grad_norm": 2.556267258504842, "learning_rate": 3.3012150761560085e-06, "loss": 0.6562, "step": 1674 }, { "epoch": 0.6496654707650538, "grad_norm": 2.288920362069363, "learning_rate": 3.2948487271109453e-06, "loss": 0.6585, "step": 1675 }, { "epoch": 0.6500533307476001, "grad_norm": 2.7617991910556268, "learning_rate": 3.2884855047688292e-06, "loss": 0.6367, "step": 1676 }, { "epoch": 0.6504411907301464, "grad_norm": 2.9684738028960806, "learning_rate": 3.282125420797776e-06, "loss": 0.6793, "step": 1677 }, { "epoch": 0.6508290507126927, "grad_norm": 1.964680663461783, "learning_rate": 3.275768486860149e-06, "loss": 0.6412, "step": 1678 }, { "epoch": 0.651216910695239, "grad_norm": 3.180536226614934, "learning_rate": 3.269414714612534e-06, "loss": 0.6507, "step": 1679 }, { "epoch": 0.6516047706777853, "grad_norm": 2.4081506738459635, "learning_rate": 3.263064115705725e-06, "loss": 0.6746, "step": 1680 }, { "epoch": 0.6519926306603316, "grad_norm": 2.3192708675394136, "learning_rate": 3.25671670178469e-06, "loss": 0.6314, "step": 1681 }, { "epoch": 0.652380490642878, "grad_norm": 2.235580045088617, "learning_rate": 3.250372484488558e-06, "loss": 0.6528, "step": 1682 }, { "epoch": 0.6527683506254243, "grad_norm": 2.6141275756483933, "learning_rate": 3.244031475450599e-06, "loss": 0.6799, "step": 1683 }, { "epoch": 0.6531562106079706, "grad_norm": 2.755618689197231, "learning_rate": 3.237693686298199e-06, "loss": 0.7223, "step": 1684 }, { "epoch": 0.6535440705905168, "grad_norm": 2.657243779655197, "learning_rate": 3.2313591286528384e-06, "loss": 0.687, "step": 1685 }, { "epoch": 0.6539319305730631, "grad_norm": 2.627673154821721, "learning_rate": 3.225027814130074e-06, "loss": 0.6808, "step": 1686 }, { "epoch": 0.6543197905556094, "grad_norm": 2.9393221045694244, "learning_rate": 3.218699754339513e-06, "loss": 0.6629, "step": 1687 }, { "epoch": 0.6547076505381557, "grad_norm": 2.21308436423035, "learning_rate": 3.2123749608847998e-06, "loss": 0.6766, "step": 1688 }, { "epoch": 0.655095510520702, "grad_norm": 2.146947646487609, "learning_rate": 3.206053445363584e-06, "loss": 0.5836, "step": 1689 }, { "epoch": 0.6554833705032483, "grad_norm": 2.3219044354451146, "learning_rate": 3.199735219367507e-06, "loss": 0.6458, "step": 1690 }, { "epoch": 0.6558712304857947, "grad_norm": 1.6533912898818457, "learning_rate": 3.193420294482177e-06, "loss": 0.6455, "step": 1691 }, { "epoch": 0.656259090468341, "grad_norm": 2.3929621469825335, "learning_rate": 3.18710868228715e-06, "loss": 0.7156, "step": 1692 }, { "epoch": 0.6566469504508873, "grad_norm": 2.525734211721821, "learning_rate": 3.180800394355908e-06, "loss": 0.6112, "step": 1693 }, { "epoch": 0.6570348104334335, "grad_norm": 2.6369475345406164, "learning_rate": 3.174495442255836e-06, "loss": 0.7192, "step": 1694 }, { "epoch": 0.6574226704159798, "grad_norm": 2.03281569305387, "learning_rate": 3.1681938375482035e-06, "loss": 0.6858, "step": 1695 }, { "epoch": 0.6578105303985261, "grad_norm": 2.6506502622026433, "learning_rate": 3.1618955917881383e-06, "loss": 0.7413, "step": 1696 }, { "epoch": 0.6581983903810724, "grad_norm": 1.7115341428389312, "learning_rate": 3.155600716524617e-06, "loss": 0.6162, "step": 1697 }, { "epoch": 0.6585862503636187, "grad_norm": 1.8672854601493158, "learning_rate": 3.149309223300428e-06, "loss": 0.6394, "step": 1698 }, { "epoch": 0.658974110346165, "grad_norm": 2.4807563590358, "learning_rate": 3.1430211236521615e-06, "loss": 0.6122, "step": 1699 }, { "epoch": 0.6593619703287114, "grad_norm": 2.015717087562288, "learning_rate": 3.1367364291101845e-06, "loss": 0.6157, "step": 1700 }, { "epoch": 0.6597498303112577, "grad_norm": 2.7098469881563356, "learning_rate": 3.130455151198618e-06, "loss": 0.7194, "step": 1701 }, { "epoch": 0.660137690293804, "grad_norm": 2.1264093090213723, "learning_rate": 3.124177301435324e-06, "loss": 0.6185, "step": 1702 }, { "epoch": 0.6605255502763503, "grad_norm": 2.473286473792266, "learning_rate": 3.11790289133187e-06, "loss": 0.7159, "step": 1703 }, { "epoch": 0.6609134102588965, "grad_norm": 2.0174582495391804, "learning_rate": 3.1116319323935207e-06, "loss": 0.6909, "step": 1704 }, { "epoch": 0.6613012702414428, "grad_norm": 3.0967480852762295, "learning_rate": 3.1053644361192158e-06, "loss": 0.7484, "step": 1705 }, { "epoch": 0.6616891302239891, "grad_norm": 1.7865693936748686, "learning_rate": 3.09910041400154e-06, "loss": 0.6274, "step": 1706 }, { "epoch": 0.6620769902065354, "grad_norm": 2.3321530703893916, "learning_rate": 3.092839877526711e-06, "loss": 0.6226, "step": 1707 }, { "epoch": 0.6624648501890817, "grad_norm": 2.2600793719146695, "learning_rate": 3.0865828381745515e-06, "loss": 0.6233, "step": 1708 }, { "epoch": 0.6628527101716281, "grad_norm": 3.156670964008596, "learning_rate": 3.0803293074184754e-06, "loss": 0.6643, "step": 1709 }, { "epoch": 0.6632405701541744, "grad_norm": 1.8827109226593057, "learning_rate": 3.0740792967254606e-06, "loss": 0.643, "step": 1710 }, { "epoch": 0.6636284301367207, "grad_norm": 2.401307516980964, "learning_rate": 3.0678328175560306e-06, "loss": 0.6862, "step": 1711 }, { "epoch": 0.664016290119267, "grad_norm": 2.5839480764967764, "learning_rate": 3.061589881364234e-06, "loss": 0.6631, "step": 1712 }, { "epoch": 0.6644041501018132, "grad_norm": 2.5828338749972772, "learning_rate": 3.0553504995976204e-06, "loss": 0.685, "step": 1713 }, { "epoch": 0.6647920100843595, "grad_norm": 2.7528235061282493, "learning_rate": 3.0491146836972273e-06, "loss": 0.6654, "step": 1714 }, { "epoch": 0.6651798700669058, "grad_norm": 2.349344378017545, "learning_rate": 3.0428824450975484e-06, "loss": 0.6769, "step": 1715 }, { "epoch": 0.6655677300494521, "grad_norm": 2.4823864339433226, "learning_rate": 3.0366537952265185e-06, "loss": 0.6737, "step": 1716 }, { "epoch": 0.6659555900319984, "grad_norm": 2.4345365872443026, "learning_rate": 3.0304287455054925e-06, "loss": 0.7056, "step": 1717 }, { "epoch": 0.6663434500145448, "grad_norm": 3.4301910694312006, "learning_rate": 3.0242073073492238e-06, "loss": 0.7497, "step": 1718 }, { "epoch": 0.6667313099970911, "grad_norm": 2.39629633822788, "learning_rate": 3.017989492165844e-06, "loss": 0.6996, "step": 1719 }, { "epoch": 0.6671191699796374, "grad_norm": 2.3447937830478724, "learning_rate": 3.0117753113568406e-06, "loss": 0.6935, "step": 1720 }, { "epoch": 0.6675070299621837, "grad_norm": 2.633921336559955, "learning_rate": 3.0055647763170336e-06, "loss": 0.6334, "step": 1721 }, { "epoch": 0.6678948899447299, "grad_norm": 2.8616685748171005, "learning_rate": 2.9993578984345673e-06, "loss": 0.7198, "step": 1722 }, { "epoch": 0.6682827499272762, "grad_norm": 2.1954989570696744, "learning_rate": 2.9931546890908695e-06, "loss": 0.6503, "step": 1723 }, { "epoch": 0.6686706099098225, "grad_norm": 2.153894623401768, "learning_rate": 2.986955159660647e-06, "loss": 0.6143, "step": 1724 }, { "epoch": 0.6690584698923688, "grad_norm": 1.8494711103067492, "learning_rate": 2.980759321511857e-06, "loss": 0.6543, "step": 1725 }, { "epoch": 0.6694463298749151, "grad_norm": 2.354660844608629, "learning_rate": 2.974567186005687e-06, "loss": 0.7063, "step": 1726 }, { "epoch": 0.6698341898574615, "grad_norm": 2.504979637699637, "learning_rate": 2.968378764496537e-06, "loss": 0.6969, "step": 1727 }, { "epoch": 0.6702220498400078, "grad_norm": 1.830986992701897, "learning_rate": 2.962194068331996e-06, "loss": 0.5963, "step": 1728 }, { "epoch": 0.6706099098225541, "grad_norm": 2.39769306988883, "learning_rate": 2.9560131088528223e-06, "loss": 0.6218, "step": 1729 }, { "epoch": 0.6709977698051004, "grad_norm": 2.782286351139065, "learning_rate": 2.9498358973929197e-06, "loss": 0.6904, "step": 1730 }, { "epoch": 0.6713856297876467, "grad_norm": 2.3955151888935826, "learning_rate": 2.943662445279325e-06, "loss": 0.6502, "step": 1731 }, { "epoch": 0.6717734897701929, "grad_norm": 2.7826373031072094, "learning_rate": 2.937492763832176e-06, "loss": 0.5778, "step": 1732 }, { "epoch": 0.6721613497527392, "grad_norm": 2.347925056721915, "learning_rate": 2.9313268643646988e-06, "loss": 0.7014, "step": 1733 }, { "epoch": 0.6725492097352855, "grad_norm": 2.4580320263264555, "learning_rate": 2.925164758183184e-06, "loss": 0.7133, "step": 1734 }, { "epoch": 0.6729370697178318, "grad_norm": 3.2021099734602525, "learning_rate": 2.9190064565869663e-06, "loss": 0.7257, "step": 1735 }, { "epoch": 0.6733249297003782, "grad_norm": 2.4938299306632765, "learning_rate": 2.912851970868405e-06, "loss": 0.6225, "step": 1736 }, { "epoch": 0.6737127896829245, "grad_norm": 2.0424472326635525, "learning_rate": 2.906701312312861e-06, "loss": 0.6599, "step": 1737 }, { "epoch": 0.6741006496654708, "grad_norm": 2.7533325191938727, "learning_rate": 2.9005544921986774e-06, "loss": 0.679, "step": 1738 }, { "epoch": 0.6744885096480171, "grad_norm": 2.118713138689999, "learning_rate": 2.8944115217971613e-06, "loss": 0.5946, "step": 1739 }, { "epoch": 0.6748763696305634, "grad_norm": 2.395258417919304, "learning_rate": 2.888272412372559e-06, "loss": 0.6699, "step": 1740 }, { "epoch": 0.6752642296131096, "grad_norm": 2.5280905964690326, "learning_rate": 2.8821371751820348e-06, "loss": 0.6631, "step": 1741 }, { "epoch": 0.6756520895956559, "grad_norm": 3.333218759776002, "learning_rate": 2.876005821475657e-06, "loss": 0.6803, "step": 1742 }, { "epoch": 0.6760399495782022, "grad_norm": 2.5899712600302474, "learning_rate": 2.8698783624963684e-06, "loss": 0.6302, "step": 1743 }, { "epoch": 0.6764278095607485, "grad_norm": 1.871687355042566, "learning_rate": 2.8637548094799728e-06, "loss": 0.5995, "step": 1744 }, { "epoch": 0.6768156695432949, "grad_norm": 2.243656880620021, "learning_rate": 2.8576351736551118e-06, "loss": 0.7006, "step": 1745 }, { "epoch": 0.6772035295258412, "grad_norm": 2.750746769523929, "learning_rate": 2.8515194662432423e-06, "loss": 0.692, "step": 1746 }, { "epoch": 0.6775913895083875, "grad_norm": 2.3431004361558343, "learning_rate": 2.8454076984586176e-06, "loss": 0.7007, "step": 1747 }, { "epoch": 0.6779792494909338, "grad_norm": 2.661393164184605, "learning_rate": 2.839299881508272e-06, "loss": 0.6673, "step": 1748 }, { "epoch": 0.6783671094734801, "grad_norm": 2.9890940014578056, "learning_rate": 2.833196026591989e-06, "loss": 0.7108, "step": 1749 }, { "epoch": 0.6787549694560264, "grad_norm": 1.7835423491410336, "learning_rate": 2.827096144902289e-06, "loss": 0.5753, "step": 1750 }, { "epoch": 0.6791428294385726, "grad_norm": 2.6603663694403967, "learning_rate": 2.8210002476244093e-06, "loss": 0.6854, "step": 1751 }, { "epoch": 0.6795306894211189, "grad_norm": 2.3312604182389767, "learning_rate": 2.814908345936277e-06, "loss": 0.6326, "step": 1752 }, { "epoch": 0.6799185494036653, "grad_norm": 2.717794729988797, "learning_rate": 2.8088204510084948e-06, "loss": 0.6548, "step": 1753 }, { "epoch": 0.6803064093862116, "grad_norm": 2.5878650159878043, "learning_rate": 2.8027365740043188e-06, "loss": 0.6468, "step": 1754 }, { "epoch": 0.6806942693687579, "grad_norm": 1.9854076603340918, "learning_rate": 2.796656726079636e-06, "loss": 0.6432, "step": 1755 }, { "epoch": 0.6810821293513042, "grad_norm": 2.211129633281632, "learning_rate": 2.790580918382947e-06, "loss": 0.697, "step": 1756 }, { "epoch": 0.6814699893338505, "grad_norm": 2.806611346992742, "learning_rate": 2.7845091620553423e-06, "loss": 0.6326, "step": 1757 }, { "epoch": 0.6818578493163968, "grad_norm": 2.3636665642556047, "learning_rate": 2.778441468230483e-06, "loss": 0.5815, "step": 1758 }, { "epoch": 0.6822457092989431, "grad_norm": 2.7458802808947387, "learning_rate": 2.7723778480345844e-06, "loss": 0.6076, "step": 1759 }, { "epoch": 0.6826335692814893, "grad_norm": 2.6729661494855446, "learning_rate": 2.7663183125863887e-06, "loss": 0.6684, "step": 1760 }, { "epoch": 0.6830214292640356, "grad_norm": 1.8691961809380775, "learning_rate": 2.760262872997148e-06, "loss": 0.6947, "step": 1761 }, { "epoch": 0.683409289246582, "grad_norm": 2.97658853483196, "learning_rate": 2.7542115403706067e-06, "loss": 0.6692, "step": 1762 }, { "epoch": 0.6837971492291283, "grad_norm": 1.8657480594829485, "learning_rate": 2.748164325802975e-06, "loss": 0.6078, "step": 1763 }, { "epoch": 0.6841850092116746, "grad_norm": 1.9294651153987994, "learning_rate": 2.742121240382912e-06, "loss": 0.6006, "step": 1764 }, { "epoch": 0.6845728691942209, "grad_norm": 2.3246804474072915, "learning_rate": 2.736082295191511e-06, "loss": 0.6643, "step": 1765 }, { "epoch": 0.6849607291767672, "grad_norm": 2.0567532899214833, "learning_rate": 2.7300475013022666e-06, "loss": 0.5877, "step": 1766 }, { "epoch": 0.6853485891593135, "grad_norm": 2.344063176416792, "learning_rate": 2.724016869781064e-06, "loss": 0.6115, "step": 1767 }, { "epoch": 0.6857364491418598, "grad_norm": 2.641373824930953, "learning_rate": 2.7179904116861557e-06, "loss": 0.7006, "step": 1768 }, { "epoch": 0.6861243091244061, "grad_norm": 1.8492778587225964, "learning_rate": 2.711968138068141e-06, "loss": 0.6115, "step": 1769 }, { "epoch": 0.6865121691069523, "grad_norm": 2.6745355688436834, "learning_rate": 2.705950059969948e-06, "loss": 0.7389, "step": 1770 }, { "epoch": 0.6869000290894987, "grad_norm": 3.3019223354751186, "learning_rate": 2.6999361884268086e-06, "loss": 0.7065, "step": 1771 }, { "epoch": 0.687287889072045, "grad_norm": 2.3917182282400327, "learning_rate": 2.6939265344662426e-06, "loss": 0.636, "step": 1772 }, { "epoch": 0.6876757490545913, "grad_norm": 2.2341061141953915, "learning_rate": 2.687921109108038e-06, "loss": 0.6849, "step": 1773 }, { "epoch": 0.6880636090371376, "grad_norm": 1.7384642106764594, "learning_rate": 2.681919923364228e-06, "loss": 0.5508, "step": 1774 }, { "epoch": 0.6884514690196839, "grad_norm": 2.1977655989134837, "learning_rate": 2.675922988239069e-06, "loss": 0.6487, "step": 1775 }, { "epoch": 0.6888393290022302, "grad_norm": 2.5929224669651165, "learning_rate": 2.6699303147290257e-06, "loss": 0.6335, "step": 1776 }, { "epoch": 0.6892271889847765, "grad_norm": 1.836688146694951, "learning_rate": 2.663941913822747e-06, "loss": 0.5986, "step": 1777 }, { "epoch": 0.6896150489673228, "grad_norm": 2.4806010829840264, "learning_rate": 2.65795779650105e-06, "loss": 0.5808, "step": 1778 }, { "epoch": 0.690002908949869, "grad_norm": 1.8406669070802397, "learning_rate": 2.6519779737368935e-06, "loss": 0.6488, "step": 1779 }, { "epoch": 0.6903907689324154, "grad_norm": 2.2436078088666, "learning_rate": 2.6460024564953624e-06, "loss": 0.6222, "step": 1780 }, { "epoch": 0.6907786289149617, "grad_norm": 2.5923047204374643, "learning_rate": 2.640031255733646e-06, "loss": 0.6268, "step": 1781 }, { "epoch": 0.691166488897508, "grad_norm": 2.5825449176280264, "learning_rate": 2.634064382401025e-06, "loss": 0.7227, "step": 1782 }, { "epoch": 0.6915543488800543, "grad_norm": 2.1691800403828805, "learning_rate": 2.6281018474388354e-06, "loss": 0.6774, "step": 1783 }, { "epoch": 0.6919422088626006, "grad_norm": 2.0749798114166738, "learning_rate": 2.6221436617804635e-06, "loss": 0.7068, "step": 1784 }, { "epoch": 0.6923300688451469, "grad_norm": 2.1322197999086008, "learning_rate": 2.6161898363513192e-06, "loss": 0.6317, "step": 1785 }, { "epoch": 0.6927179288276932, "grad_norm": 1.7796409907192516, "learning_rate": 2.610240382068818e-06, "loss": 0.5968, "step": 1786 }, { "epoch": 0.6931057888102395, "grad_norm": 1.9327779793656887, "learning_rate": 2.6042953098423573e-06, "loss": 0.696, "step": 1787 }, { "epoch": 0.6934936487927859, "grad_norm": 2.0707282021966043, "learning_rate": 2.598354630573303e-06, "loss": 0.6804, "step": 1788 }, { "epoch": 0.6938815087753321, "grad_norm": 2.6383145261530463, "learning_rate": 2.592418355154963e-06, "loss": 0.6825, "step": 1789 }, { "epoch": 0.6942693687578784, "grad_norm": 1.934057966926295, "learning_rate": 2.586486494472572e-06, "loss": 0.5859, "step": 1790 }, { "epoch": 0.6946572287404247, "grad_norm": 2.5494018792313926, "learning_rate": 2.5805590594032666e-06, "loss": 0.6624, "step": 1791 }, { "epoch": 0.695045088722971, "grad_norm": 2.2760364758977834, "learning_rate": 2.5746360608160703e-06, "loss": 0.5983, "step": 1792 }, { "epoch": 0.6954329487055173, "grad_norm": 2.982911876531326, "learning_rate": 2.5687175095718726e-06, "loss": 0.706, "step": 1793 }, { "epoch": 0.6958208086880636, "grad_norm": 2.8968820887645674, "learning_rate": 2.562803416523405e-06, "loss": 0.6792, "step": 1794 }, { "epoch": 0.6962086686706099, "grad_norm": 3.070793674716604, "learning_rate": 2.5568937925152272e-06, "loss": 0.6939, "step": 1795 }, { "epoch": 0.6965965286531562, "grad_norm": 2.2443145566906075, "learning_rate": 2.550988648383701e-06, "loss": 0.6822, "step": 1796 }, { "epoch": 0.6969843886357026, "grad_norm": 1.9767055784859635, "learning_rate": 2.545087994956975e-06, "loss": 0.6638, "step": 1797 }, { "epoch": 0.6973722486182488, "grad_norm": 1.7835161839794424, "learning_rate": 2.5391918430549635e-06, "loss": 0.616, "step": 1798 }, { "epoch": 0.6977601086007951, "grad_norm": 2.524305610143234, "learning_rate": 2.5333002034893283e-06, "loss": 0.6784, "step": 1799 }, { "epoch": 0.6981479685833414, "grad_norm": 2.8546336627823865, "learning_rate": 2.527413087063454e-06, "loss": 0.6637, "step": 1800 }, { "epoch": 0.6985358285658877, "grad_norm": 2.39390396286077, "learning_rate": 2.521530504572432e-06, "loss": 0.6036, "step": 1801 }, { "epoch": 0.698923688548434, "grad_norm": 2.6862313726875957, "learning_rate": 2.5156524668030402e-06, "loss": 0.7165, "step": 1802 }, { "epoch": 0.6993115485309803, "grad_norm": 3.25660507333057, "learning_rate": 2.5097789845337223e-06, "loss": 0.6785, "step": 1803 }, { "epoch": 0.6996994085135266, "grad_norm": 1.8611017232895033, "learning_rate": 2.50391006853457e-06, "loss": 0.6241, "step": 1804 }, { "epoch": 0.700087268496073, "grad_norm": 2.0356633779992146, "learning_rate": 2.498045729567302e-06, "loss": 0.6468, "step": 1805 }, { "epoch": 0.7004751284786193, "grad_norm": 2.504066361289267, "learning_rate": 2.492185978385241e-06, "loss": 0.721, "step": 1806 }, { "epoch": 0.7008629884611656, "grad_norm": 2.4158498442917704, "learning_rate": 2.4863308257333e-06, "loss": 0.6908, "step": 1807 }, { "epoch": 0.7012508484437118, "grad_norm": 2.24515327336019, "learning_rate": 2.480480282347961e-06, "loss": 0.7038, "step": 1808 }, { "epoch": 0.7016387084262581, "grad_norm": 2.917972962013133, "learning_rate": 2.4746343589572526e-06, "loss": 0.6786, "step": 1809 }, { "epoch": 0.7020265684088044, "grad_norm": 2.2654362924631455, "learning_rate": 2.46879306628073e-06, "loss": 0.6567, "step": 1810 }, { "epoch": 0.7024144283913507, "grad_norm": 2.869284118415104, "learning_rate": 2.4629564150294593e-06, "loss": 0.7002, "step": 1811 }, { "epoch": 0.702802288373897, "grad_norm": 2.6297535950682196, "learning_rate": 2.4571244159059952e-06, "loss": 0.6521, "step": 1812 }, { "epoch": 0.7031901483564433, "grad_norm": 2.3071319635393053, "learning_rate": 2.4512970796043616e-06, "loss": 0.7147, "step": 1813 }, { "epoch": 0.7035780083389896, "grad_norm": 2.437016171084647, "learning_rate": 2.445474416810033e-06, "loss": 0.6578, "step": 1814 }, { "epoch": 0.703965868321536, "grad_norm": 2.3285116310583063, "learning_rate": 2.439656438199911e-06, "loss": 0.7152, "step": 1815 }, { "epoch": 0.7043537283040823, "grad_norm": 1.968319821596651, "learning_rate": 2.433843154442315e-06, "loss": 0.6425, "step": 1816 }, { "epoch": 0.7047415882866285, "grad_norm": 2.4712904939026856, "learning_rate": 2.428034576196949e-06, "loss": 0.624, "step": 1817 }, { "epoch": 0.7051294482691748, "grad_norm": 1.825995669497734, "learning_rate": 2.422230714114891e-06, "loss": 0.6363, "step": 1818 }, { "epoch": 0.7055173082517211, "grad_norm": 2.1389054993139625, "learning_rate": 2.41643157883857e-06, "loss": 0.6774, "step": 1819 }, { "epoch": 0.7059051682342674, "grad_norm": 2.8038731267632215, "learning_rate": 2.4106371810017486e-06, "loss": 0.6102, "step": 1820 }, { "epoch": 0.7062930282168137, "grad_norm": 2.205340674899081, "learning_rate": 2.4048475312295027e-06, "loss": 0.6165, "step": 1821 }, { "epoch": 0.70668088819936, "grad_norm": 3.0950075232492638, "learning_rate": 2.399062640138201e-06, "loss": 0.6644, "step": 1822 }, { "epoch": 0.7070687481819063, "grad_norm": 2.2444241248190604, "learning_rate": 2.3932825183354864e-06, "loss": 0.6806, "step": 1823 }, { "epoch": 0.7074566081644527, "grad_norm": 2.516131838240547, "learning_rate": 2.387507176420256e-06, "loss": 0.6654, "step": 1824 }, { "epoch": 0.707844468146999, "grad_norm": 2.5160744334561995, "learning_rate": 2.381736624982644e-06, "loss": 0.6526, "step": 1825 }, { "epoch": 0.7082323281295453, "grad_norm": 2.182085038990395, "learning_rate": 2.375970874603998e-06, "loss": 0.5953, "step": 1826 }, { "epoch": 0.7086201881120915, "grad_norm": 1.8029619910512331, "learning_rate": 2.3702099358568635e-06, "loss": 0.6182, "step": 1827 }, { "epoch": 0.7090080480946378, "grad_norm": 2.152358937129284, "learning_rate": 2.3644538193049626e-06, "loss": 0.6548, "step": 1828 }, { "epoch": 0.7093959080771841, "grad_norm": 2.0508905560035893, "learning_rate": 2.3587025355031744e-06, "loss": 0.6144, "step": 1829 }, { "epoch": 0.7097837680597304, "grad_norm": 2.8532167633447774, "learning_rate": 2.3529560949975184e-06, "loss": 0.6267, "step": 1830 }, { "epoch": 0.7101716280422767, "grad_norm": 1.8370839358996596, "learning_rate": 2.3472145083251296e-06, "loss": 0.6065, "step": 1831 }, { "epoch": 0.710559488024823, "grad_norm": 2.0583216133117617, "learning_rate": 2.3414777860142446e-06, "loss": 0.6932, "step": 1832 }, { "epoch": 0.7109473480073694, "grad_norm": 2.9465626251863086, "learning_rate": 2.3357459385841824e-06, "loss": 0.7029, "step": 1833 }, { "epoch": 0.7113352079899157, "grad_norm": 2.405095879774373, "learning_rate": 2.3300189765453198e-06, "loss": 0.6204, "step": 1834 }, { "epoch": 0.711723067972462, "grad_norm": 2.504221325173376, "learning_rate": 2.3242969103990765e-06, "loss": 0.7076, "step": 1835 }, { "epoch": 0.7121109279550082, "grad_norm": 2.9119714642146586, "learning_rate": 2.3185797506378943e-06, "loss": 0.6638, "step": 1836 }, { "epoch": 0.7124987879375545, "grad_norm": 2.1594326769119467, "learning_rate": 2.312867507745219e-06, "loss": 0.5852, "step": 1837 }, { "epoch": 0.7128866479201008, "grad_norm": 2.549863631453841, "learning_rate": 2.3071601921954797e-06, "loss": 0.6664, "step": 1838 }, { "epoch": 0.7132745079026471, "grad_norm": 2.4657804108521844, "learning_rate": 2.3014578144540706e-06, "loss": 0.681, "step": 1839 }, { "epoch": 0.7136623678851934, "grad_norm": 2.0695739491004015, "learning_rate": 2.295760384977331e-06, "loss": 0.6554, "step": 1840 }, { "epoch": 0.7140502278677398, "grad_norm": 3.0873277113730424, "learning_rate": 2.2900679142125275e-06, "loss": 0.7498, "step": 1841 }, { "epoch": 0.7144380878502861, "grad_norm": 3.045009380915321, "learning_rate": 2.2843804125978356e-06, "loss": 0.7027, "step": 1842 }, { "epoch": 0.7148259478328324, "grad_norm": 3.6905812274636562, "learning_rate": 2.278697890562316e-06, "loss": 0.6902, "step": 1843 }, { "epoch": 0.7152138078153787, "grad_norm": 2.37916047898191, "learning_rate": 2.273020358525899e-06, "loss": 0.7261, "step": 1844 }, { "epoch": 0.715601667797925, "grad_norm": 2.51790863580229, "learning_rate": 2.267347826899366e-06, "loss": 0.6797, "step": 1845 }, { "epoch": 0.7159895277804712, "grad_norm": 2.47095644624716, "learning_rate": 2.2616803060843283e-06, "loss": 0.6776, "step": 1846 }, { "epoch": 0.7163773877630175, "grad_norm": 3.5213825111886594, "learning_rate": 2.2560178064732103e-06, "loss": 0.6816, "step": 1847 }, { "epoch": 0.7167652477455638, "grad_norm": 2.354745032659215, "learning_rate": 2.250360338449226e-06, "loss": 0.6669, "step": 1848 }, { "epoch": 0.7171531077281101, "grad_norm": 2.802907751183984, "learning_rate": 2.244707912386366e-06, "loss": 0.7475, "step": 1849 }, { "epoch": 0.7175409677106565, "grad_norm": 2.0099834920472612, "learning_rate": 2.2390605386493758e-06, "loss": 0.6428, "step": 1850 }, { "epoch": 0.7179288276932028, "grad_norm": 2.342224647558598, "learning_rate": 2.233418227593736e-06, "loss": 0.7257, "step": 1851 }, { "epoch": 0.7183166876757491, "grad_norm": 2.542382571372441, "learning_rate": 2.2277809895656415e-06, "loss": 0.6378, "step": 1852 }, { "epoch": 0.7187045476582954, "grad_norm": 2.4271838551100946, "learning_rate": 2.2221488349019903e-06, "loss": 0.7146, "step": 1853 }, { "epoch": 0.7190924076408417, "grad_norm": 2.779246577785969, "learning_rate": 2.216521773930351e-06, "loss": 0.6039, "step": 1854 }, { "epoch": 0.7194802676233879, "grad_norm": 2.4363300466584024, "learning_rate": 2.2108998169689583e-06, "loss": 0.6719, "step": 1855 }, { "epoch": 0.7198681276059342, "grad_norm": 2.158460242717752, "learning_rate": 2.2052829743266864e-06, "loss": 0.6549, "step": 1856 }, { "epoch": 0.7202559875884805, "grad_norm": 2.1878302719691933, "learning_rate": 2.1996712563030305e-06, "loss": 0.6537, "step": 1857 }, { "epoch": 0.7206438475710268, "grad_norm": 2.3955060707931395, "learning_rate": 2.1940646731880887e-06, "loss": 0.6963, "step": 1858 }, { "epoch": 0.7210317075535732, "grad_norm": 2.555382337078086, "learning_rate": 2.1884632352625468e-06, "loss": 0.7137, "step": 1859 }, { "epoch": 0.7214195675361195, "grad_norm": 2.175375334443656, "learning_rate": 2.1828669527976525e-06, "loss": 0.6854, "step": 1860 }, { "epoch": 0.7218074275186658, "grad_norm": 2.7459597180642015, "learning_rate": 2.1772758360552006e-06, "loss": 0.7324, "step": 1861 }, { "epoch": 0.7221952875012121, "grad_norm": 2.142348738321855, "learning_rate": 2.1716898952875132e-06, "loss": 0.6938, "step": 1862 }, { "epoch": 0.7225831474837584, "grad_norm": 2.4267269401823786, "learning_rate": 2.166109140737422e-06, "loss": 0.6054, "step": 1863 }, { "epoch": 0.7229710074663047, "grad_norm": 2.2186879571465505, "learning_rate": 2.1605335826382494e-06, "loss": 0.6571, "step": 1864 }, { "epoch": 0.7233588674488509, "grad_norm": 1.9323638906156964, "learning_rate": 2.1549632312137884e-06, "loss": 0.5947, "step": 1865 }, { "epoch": 0.7237467274313972, "grad_norm": 2.560328176074415, "learning_rate": 2.149398096678283e-06, "loss": 0.6515, "step": 1866 }, { "epoch": 0.7241345874139435, "grad_norm": 2.4304886696775676, "learning_rate": 2.1438381892364163e-06, "loss": 0.6864, "step": 1867 }, { "epoch": 0.7245224473964899, "grad_norm": 1.7622686731482884, "learning_rate": 2.138283519083281e-06, "loss": 0.6251, "step": 1868 }, { "epoch": 0.7249103073790362, "grad_norm": 3.311505627108053, "learning_rate": 2.1327340964043697e-06, "loss": 0.6689, "step": 1869 }, { "epoch": 0.7252981673615825, "grad_norm": 2.5809042084261455, "learning_rate": 2.12718993137555e-06, "loss": 0.6629, "step": 1870 }, { "epoch": 0.7256860273441288, "grad_norm": 2.5503056479284276, "learning_rate": 2.1216510341630513e-06, "loss": 0.6486, "step": 1871 }, { "epoch": 0.7260738873266751, "grad_norm": 1.9574560732699375, "learning_rate": 2.116117414923442e-06, "loss": 0.656, "step": 1872 }, { "epoch": 0.7264617473092214, "grad_norm": 2.688134280339677, "learning_rate": 2.1105890838036133e-06, "loss": 0.7125, "step": 1873 }, { "epoch": 0.7268496072917676, "grad_norm": 2.458354592137125, "learning_rate": 2.105066050940758e-06, "loss": 0.6113, "step": 1874 }, { "epoch": 0.7272374672743139, "grad_norm": 2.0657550332891574, "learning_rate": 2.0995483264623535e-06, "loss": 0.6256, "step": 1875 }, { "epoch": 0.7276253272568602, "grad_norm": 2.631854585506566, "learning_rate": 2.0940359204861487e-06, "loss": 0.6539, "step": 1876 }, { "epoch": 0.7280131872394066, "grad_norm": 2.6469396180827065, "learning_rate": 2.088528843120134e-06, "loss": 0.6631, "step": 1877 }, { "epoch": 0.7284010472219529, "grad_norm": 2.420530355640313, "learning_rate": 2.08302710446253e-06, "loss": 0.6482, "step": 1878 }, { "epoch": 0.7287889072044992, "grad_norm": 2.750724768380338, "learning_rate": 2.0775307146017697e-06, "loss": 0.6687, "step": 1879 }, { "epoch": 0.7291767671870455, "grad_norm": 2.353158531975416, "learning_rate": 2.0720396836164764e-06, "loss": 0.6169, "step": 1880 }, { "epoch": 0.7295646271695918, "grad_norm": 2.2891782832305925, "learning_rate": 2.066554021575447e-06, "loss": 0.7014, "step": 1881 }, { "epoch": 0.7299524871521381, "grad_norm": 2.530029470396148, "learning_rate": 2.061073738537635e-06, "loss": 0.682, "step": 1882 }, { "epoch": 0.7303403471346844, "grad_norm": 2.799076249827395, "learning_rate": 2.055598844552129e-06, "loss": 0.7556, "step": 1883 }, { "epoch": 0.7307282071172306, "grad_norm": 2.2565220183411916, "learning_rate": 2.0501293496581367e-06, "loss": 0.7024, "step": 1884 }, { "epoch": 0.731116067099777, "grad_norm": 2.346388256513677, "learning_rate": 2.044665263884964e-06, "loss": 0.6692, "step": 1885 }, { "epoch": 0.7315039270823233, "grad_norm": 2.7407686208308575, "learning_rate": 2.0392065972520008e-06, "loss": 0.6696, "step": 1886 }, { "epoch": 0.7318917870648696, "grad_norm": 2.1924830979786343, "learning_rate": 2.0337533597686987e-06, "loss": 0.6541, "step": 1887 }, { "epoch": 0.7322796470474159, "grad_norm": 2.3850064718568738, "learning_rate": 2.028305561434553e-06, "loss": 0.6736, "step": 1888 }, { "epoch": 0.7326675070299622, "grad_norm": 3.054285928224789, "learning_rate": 2.0228632122390866e-06, "loss": 0.6791, "step": 1889 }, { "epoch": 0.7330553670125085, "grad_norm": 1.9602484136079152, "learning_rate": 2.0174263221618307e-06, "loss": 0.6653, "step": 1890 }, { "epoch": 0.7334432269950548, "grad_norm": 3.0844333117285636, "learning_rate": 2.0119949011723043e-06, "loss": 0.7337, "step": 1891 }, { "epoch": 0.7338310869776011, "grad_norm": 2.764924806681105, "learning_rate": 2.006568959229999e-06, "loss": 0.6306, "step": 1892 }, { "epoch": 0.7342189469601473, "grad_norm": 2.126613625990851, "learning_rate": 2.001148506284361e-06, "loss": 0.6542, "step": 1893 }, { "epoch": 0.7346068069426936, "grad_norm": 2.457634548111474, "learning_rate": 1.9957335522747707e-06, "loss": 0.644, "step": 1894 }, { "epoch": 0.73499466692524, "grad_norm": 2.0880483983844127, "learning_rate": 1.9903241071305237e-06, "loss": 0.5906, "step": 1895 }, { "epoch": 0.7353825269077863, "grad_norm": 2.1803507832954585, "learning_rate": 1.9849201807708146e-06, "loss": 0.6176, "step": 1896 }, { "epoch": 0.7357703868903326, "grad_norm": 2.8412305907300137, "learning_rate": 1.9795217831047193e-06, "loss": 0.6177, "step": 1897 }, { "epoch": 0.7361582468728789, "grad_norm": 2.830808344462822, "learning_rate": 1.9741289240311757e-06, "loss": 0.7039, "step": 1898 }, { "epoch": 0.7365461068554252, "grad_norm": 2.170257432879622, "learning_rate": 1.968741613438964e-06, "loss": 0.5996, "step": 1899 }, { "epoch": 0.7369339668379715, "grad_norm": 2.2004000075433416, "learning_rate": 1.9633598612066914e-06, "loss": 0.6212, "step": 1900 }, { "epoch": 0.7373218268205178, "grad_norm": 2.8005360728311692, "learning_rate": 1.957983677202775e-06, "loss": 0.6636, "step": 1901 }, { "epoch": 0.7377096868030641, "grad_norm": 2.444898780674979, "learning_rate": 1.9526130712854186e-06, "loss": 0.6455, "step": 1902 }, { "epoch": 0.7380975467856103, "grad_norm": 2.5378935035129997, "learning_rate": 1.947248053302598e-06, "loss": 0.6608, "step": 1903 }, { "epoch": 0.7384854067681567, "grad_norm": 2.5085117991563486, "learning_rate": 1.9418886330920443e-06, "loss": 0.6635, "step": 1904 }, { "epoch": 0.738873266750703, "grad_norm": 3.1221652955220374, "learning_rate": 1.936534820481222e-06, "loss": 0.7395, "step": 1905 }, { "epoch": 0.7392611267332493, "grad_norm": 2.0025220878681496, "learning_rate": 1.931186625287313e-06, "loss": 0.6449, "step": 1906 }, { "epoch": 0.7396489867157956, "grad_norm": 3.486882377286913, "learning_rate": 1.9258440573172006e-06, "loss": 0.6744, "step": 1907 }, { "epoch": 0.7400368466983419, "grad_norm": 2.516157994384941, "learning_rate": 1.920507126367448e-06, "loss": 0.7535, "step": 1908 }, { "epoch": 0.7404247066808882, "grad_norm": 2.8601275011344587, "learning_rate": 1.9151758422242805e-06, "loss": 0.7035, "step": 1909 }, { "epoch": 0.7408125666634345, "grad_norm": 2.2718430916925514, "learning_rate": 1.909850214663575e-06, "loss": 0.6566, "step": 1910 }, { "epoch": 0.7412004266459808, "grad_norm": 2.192268581372496, "learning_rate": 1.9045302534508298e-06, "loss": 0.5786, "step": 1911 }, { "epoch": 0.741588286628527, "grad_norm": 2.756857741713851, "learning_rate": 1.8992159683411549e-06, "loss": 0.6427, "step": 1912 }, { "epoch": 0.7419761466110734, "grad_norm": 1.812671509508603, "learning_rate": 1.893907369079252e-06, "loss": 0.6676, "step": 1913 }, { "epoch": 0.7423640065936197, "grad_norm": 1.9990142765329206, "learning_rate": 1.8886044653993968e-06, "loss": 0.6495, "step": 1914 }, { "epoch": 0.742751866576166, "grad_norm": 1.9659809256007, "learning_rate": 1.8833072670254227e-06, "loss": 0.6218, "step": 1915 }, { "epoch": 0.7431397265587123, "grad_norm": 2.2504673751448148, "learning_rate": 1.8780157836706985e-06, "loss": 0.6696, "step": 1916 }, { "epoch": 0.7435275865412586, "grad_norm": 2.002396175971286, "learning_rate": 1.8727300250381153e-06, "loss": 0.6011, "step": 1917 }, { "epoch": 0.7439154465238049, "grad_norm": 1.9849386816250654, "learning_rate": 1.8674500008200675e-06, "loss": 0.6574, "step": 1918 }, { "epoch": 0.7443033065063512, "grad_norm": 2.130874602583693, "learning_rate": 1.8621757206984326e-06, "loss": 0.6376, "step": 1919 }, { "epoch": 0.7446911664888975, "grad_norm": 1.7233667555680423, "learning_rate": 1.8569071943445565e-06, "loss": 0.6195, "step": 1920 }, { "epoch": 0.7450790264714439, "grad_norm": 2.263814179582645, "learning_rate": 1.851644431419234e-06, "loss": 0.6488, "step": 1921 }, { "epoch": 0.7454668864539901, "grad_norm": 2.028431227406226, "learning_rate": 1.8463874415726918e-06, "loss": 0.7096, "step": 1922 }, { "epoch": 0.7458547464365364, "grad_norm": 2.8723983306035827, "learning_rate": 1.841136234444571e-06, "loss": 0.6514, "step": 1923 }, { "epoch": 0.7462426064190827, "grad_norm": 1.994516476105284, "learning_rate": 1.8358908196639086e-06, "loss": 0.6293, "step": 1924 }, { "epoch": 0.746630466401629, "grad_norm": 2.317295090340895, "learning_rate": 1.8306512068491195e-06, "loss": 0.6338, "step": 1925 }, { "epoch": 0.7470183263841753, "grad_norm": 2.084781349227569, "learning_rate": 1.8254174056079798e-06, "loss": 0.6332, "step": 1926 }, { "epoch": 0.7474061863667216, "grad_norm": 2.2019376040616896, "learning_rate": 1.820189425537613e-06, "loss": 0.5847, "step": 1927 }, { "epoch": 0.7477940463492679, "grad_norm": 2.306267008279495, "learning_rate": 1.8149672762244625e-06, "loss": 0.6286, "step": 1928 }, { "epoch": 0.7481819063318143, "grad_norm": 2.0302277089778387, "learning_rate": 1.8097509672442836e-06, "loss": 0.6153, "step": 1929 }, { "epoch": 0.7485697663143606, "grad_norm": 2.7963633520936906, "learning_rate": 1.8045405081621215e-06, "loss": 0.6534, "step": 1930 }, { "epoch": 0.7489576262969068, "grad_norm": 2.193883950792707, "learning_rate": 1.7993359085322932e-06, "loss": 0.6013, "step": 1931 }, { "epoch": 0.7493454862794531, "grad_norm": 2.3538017134567193, "learning_rate": 1.7941371778983735e-06, "loss": 0.6428, "step": 1932 }, { "epoch": 0.7497333462619994, "grad_norm": 3.2082293391279793, "learning_rate": 1.7889443257931738e-06, "loss": 0.7333, "step": 1933 }, { "epoch": 0.7501212062445457, "grad_norm": 1.9837244734807185, "learning_rate": 1.7837573617387266e-06, "loss": 0.6605, "step": 1934 }, { "epoch": 0.750509066227092, "grad_norm": 2.2891813823212, "learning_rate": 1.7785762952462665e-06, "loss": 0.6244, "step": 1935 }, { "epoch": 0.7508969262096383, "grad_norm": 3.231100219280733, "learning_rate": 1.7734011358162183e-06, "loss": 0.7011, "step": 1936 }, { "epoch": 0.7512847861921846, "grad_norm": 2.2711768498105767, "learning_rate": 1.7682318929381698e-06, "loss": 0.6397, "step": 1937 }, { "epoch": 0.751672646174731, "grad_norm": 2.479394882903708, "learning_rate": 1.7630685760908623e-06, "loss": 0.6219, "step": 1938 }, { "epoch": 0.7520605061572773, "grad_norm": 2.552679793003044, "learning_rate": 1.7579111947421695e-06, "loss": 0.6338, "step": 1939 }, { "epoch": 0.7524483661398236, "grad_norm": 2.352735061173876, "learning_rate": 1.7527597583490825e-06, "loss": 0.6295, "step": 1940 }, { "epoch": 0.7528362261223698, "grad_norm": 2.411443301933882, "learning_rate": 1.7476142763576903e-06, "loss": 0.6415, "step": 1941 }, { "epoch": 0.7532240861049161, "grad_norm": 2.5646472581531663, "learning_rate": 1.7424747582031638e-06, "loss": 0.6815, "step": 1942 }, { "epoch": 0.7536119460874624, "grad_norm": 2.4702563226197656, "learning_rate": 1.7373412133097373e-06, "loss": 0.6477, "step": 1943 }, { "epoch": 0.7539998060700087, "grad_norm": 2.752249522084967, "learning_rate": 1.732213651090695e-06, "loss": 0.6792, "step": 1944 }, { "epoch": 0.754387666052555, "grad_norm": 2.1583865124559463, "learning_rate": 1.7270920809483476e-06, "loss": 0.6906, "step": 1945 }, { "epoch": 0.7547755260351013, "grad_norm": 2.3541087667108522, "learning_rate": 1.7219765122740202e-06, "loss": 0.5924, "step": 1946 }, { "epoch": 0.7551633860176477, "grad_norm": 2.5057583768469027, "learning_rate": 1.7168669544480305e-06, "loss": 0.625, "step": 1947 }, { "epoch": 0.755551246000194, "grad_norm": 1.7251712994135007, "learning_rate": 1.7117634168396774e-06, "loss": 0.5964, "step": 1948 }, { "epoch": 0.7559391059827403, "grad_norm": 1.7725898484086937, "learning_rate": 1.7066659088072185e-06, "loss": 0.5776, "step": 1949 }, { "epoch": 0.7563269659652865, "grad_norm": 2.7672046967984496, "learning_rate": 1.7015744396978557e-06, "loss": 0.6525, "step": 1950 }, { "epoch": 0.7567148259478328, "grad_norm": 2.8037822735575797, "learning_rate": 1.696489018847718e-06, "loss": 0.6839, "step": 1951 }, { "epoch": 0.7571026859303791, "grad_norm": 2.6790463714846915, "learning_rate": 1.6914096555818432e-06, "loss": 0.6283, "step": 1952 }, { "epoch": 0.7574905459129254, "grad_norm": 2.05814713083302, "learning_rate": 1.6863363592141618e-06, "loss": 0.6027, "step": 1953 }, { "epoch": 0.7578784058954717, "grad_norm": 2.4652340683465335, "learning_rate": 1.6812691390474788e-06, "loss": 0.6562, "step": 1954 }, { "epoch": 0.758266265878018, "grad_norm": 2.732794585495742, "learning_rate": 1.676208004373458e-06, "loss": 0.6666, "step": 1955 }, { "epoch": 0.7586541258605644, "grad_norm": 2.4188239330823076, "learning_rate": 1.6711529644726048e-06, "loss": 0.6785, "step": 1956 }, { "epoch": 0.7590419858431107, "grad_norm": 2.431458841621992, "learning_rate": 1.6661040286142478e-06, "loss": 0.6207, "step": 1957 }, { "epoch": 0.759429845825657, "grad_norm": 2.4783187317516133, "learning_rate": 1.6610612060565235e-06, "loss": 0.5958, "step": 1958 }, { "epoch": 0.7598177058082033, "grad_norm": 2.0667880617023506, "learning_rate": 1.6560245060463575e-06, "loss": 0.653, "step": 1959 }, { "epoch": 0.7602055657907495, "grad_norm": 2.7884654766047543, "learning_rate": 1.6509939378194483e-06, "loss": 0.6212, "step": 1960 }, { "epoch": 0.7605934257732958, "grad_norm": 2.8496643049163812, "learning_rate": 1.645969510600255e-06, "loss": 0.6881, "step": 1961 }, { "epoch": 0.7609812857558421, "grad_norm": 2.868140364312367, "learning_rate": 1.64095123360197e-06, "loss": 0.6668, "step": 1962 }, { "epoch": 0.7613691457383884, "grad_norm": 2.5162975285393836, "learning_rate": 1.6359391160265127e-06, "loss": 0.6826, "step": 1963 }, { "epoch": 0.7617570057209347, "grad_norm": 2.4562349054496644, "learning_rate": 1.6309331670645046e-06, "loss": 0.6667, "step": 1964 }, { "epoch": 0.7621448657034811, "grad_norm": 2.2479256365221922, "learning_rate": 1.6259333958952584e-06, "loss": 0.6773, "step": 1965 }, { "epoch": 0.7625327256860274, "grad_norm": 2.600595483793298, "learning_rate": 1.6209398116867575e-06, "loss": 0.625, "step": 1966 }, { "epoch": 0.7629205856685737, "grad_norm": 1.8960908613851122, "learning_rate": 1.6159524235956414e-06, "loss": 0.6242, "step": 1967 }, { "epoch": 0.76330844565112, "grad_norm": 2.426204100465553, "learning_rate": 1.6109712407671867e-06, "loss": 0.64, "step": 1968 }, { "epoch": 0.7636963056336662, "grad_norm": 2.5934312627924556, "learning_rate": 1.6059962723352912e-06, "loss": 0.6748, "step": 1969 }, { "epoch": 0.7640841656162125, "grad_norm": 2.39164047586135, "learning_rate": 1.6010275274224607e-06, "loss": 0.6366, "step": 1970 }, { "epoch": 0.7644720255987588, "grad_norm": 2.27349646424037, "learning_rate": 1.5960650151397855e-06, "loss": 0.5954, "step": 1971 }, { "epoch": 0.7648598855813051, "grad_norm": 2.996198451848902, "learning_rate": 1.5911087445869289e-06, "loss": 0.6637, "step": 1972 }, { "epoch": 0.7652477455638514, "grad_norm": 2.04432183394412, "learning_rate": 1.5861587248521083e-06, "loss": 0.6743, "step": 1973 }, { "epoch": 0.7656356055463978, "grad_norm": 2.1429661293229976, "learning_rate": 1.5812149650120784e-06, "loss": 0.6187, "step": 1974 }, { "epoch": 0.7660234655289441, "grad_norm": 2.1879109693206846, "learning_rate": 1.5762774741321173e-06, "loss": 0.6715, "step": 1975 }, { "epoch": 0.7664113255114904, "grad_norm": 2.6871191195167627, "learning_rate": 1.5713462612660063e-06, "loss": 0.6957, "step": 1976 }, { "epoch": 0.7667991854940367, "grad_norm": 2.9759386795877605, "learning_rate": 1.5664213354560133e-06, "loss": 0.6442, "step": 1977 }, { "epoch": 0.767187045476583, "grad_norm": 2.572703993237556, "learning_rate": 1.561502705732883e-06, "loss": 0.6484, "step": 1978 }, { "epoch": 0.7675749054591292, "grad_norm": 2.195684575583811, "learning_rate": 1.5565903811158095e-06, "loss": 0.681, "step": 1979 }, { "epoch": 0.7679627654416755, "grad_norm": 2.7170899862493347, "learning_rate": 1.5516843706124285e-06, "loss": 0.641, "step": 1980 }, { "epoch": 0.7683506254242218, "grad_norm": 2.2072132314156914, "learning_rate": 1.546784683218796e-06, "loss": 0.6675, "step": 1981 }, { "epoch": 0.7687384854067681, "grad_norm": 2.142922263177276, "learning_rate": 1.5418913279193748e-06, "loss": 0.6392, "step": 1982 }, { "epoch": 0.7691263453893145, "grad_norm": 2.6817348888582146, "learning_rate": 1.537004313687015e-06, "loss": 0.7313, "step": 1983 }, { "epoch": 0.7695142053718608, "grad_norm": 3.0178485969960334, "learning_rate": 1.5321236494829412e-06, "loss": 0.6848, "step": 1984 }, { "epoch": 0.7699020653544071, "grad_norm": 2.777286691342203, "learning_rate": 1.5272493442567321e-06, "loss": 0.7237, "step": 1985 }, { "epoch": 0.7702899253369534, "grad_norm": 2.1087357799130153, "learning_rate": 1.5223814069463077e-06, "loss": 0.6142, "step": 1986 }, { "epoch": 0.7706777853194997, "grad_norm": 2.4578167139569644, "learning_rate": 1.5175198464779107e-06, "loss": 0.7079, "step": 1987 }, { "epoch": 0.7710656453020459, "grad_norm": 3.041928095212389, "learning_rate": 1.5126646717660898e-06, "loss": 0.6836, "step": 1988 }, { "epoch": 0.7714535052845922, "grad_norm": 2.5825121270790463, "learning_rate": 1.507815891713686e-06, "loss": 0.7103, "step": 1989 }, { "epoch": 0.7718413652671385, "grad_norm": 2.670503894699853, "learning_rate": 1.5029735152118125e-06, "loss": 0.6988, "step": 1990 }, { "epoch": 0.7722292252496848, "grad_norm": 2.986410716554129, "learning_rate": 1.4981375511398427e-06, "loss": 0.6465, "step": 1991 }, { "epoch": 0.7726170852322312, "grad_norm": 2.626832804007448, "learning_rate": 1.49330800836539e-06, "loss": 0.6584, "step": 1992 }, { "epoch": 0.7730049452147775, "grad_norm": 2.339702344015157, "learning_rate": 1.4884848957442933e-06, "loss": 0.6533, "step": 1993 }, { "epoch": 0.7733928051973238, "grad_norm": 2.609135782566214, "learning_rate": 1.4836682221206e-06, "loss": 0.6749, "step": 1994 }, { "epoch": 0.7737806651798701, "grad_norm": 2.327196352899715, "learning_rate": 1.4788579963265547e-06, "loss": 0.6735, "step": 1995 }, { "epoch": 0.7741685251624164, "grad_norm": 1.915726429562975, "learning_rate": 1.4740542271825736e-06, "loss": 0.6481, "step": 1996 }, { "epoch": 0.7745563851449627, "grad_norm": 2.7945116941779324, "learning_rate": 1.4692569234972348e-06, "loss": 0.6839, "step": 1997 }, { "epoch": 0.7749442451275089, "grad_norm": 1.6723387891446975, "learning_rate": 1.4644660940672628e-06, "loss": 0.5869, "step": 1998 }, { "epoch": 0.7753321051100552, "grad_norm": 1.9805670373513178, "learning_rate": 1.4596817476775077e-06, "loss": 0.6227, "step": 1999 }, { "epoch": 0.7757199650926015, "grad_norm": 2.7425132424079113, "learning_rate": 1.454903893100934e-06, "loss": 0.7059, "step": 2000 }, { "epoch": 0.7757199650926015, "eval_loss": 1.2946887016296387, "eval_runtime": 6.2011, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.161, "step": 2000 }, { "epoch": 0.7761078250751479, "grad_norm": 2.381904382026506, "learning_rate": 1.4501325390986004e-06, "loss": 0.659, "step": 2001 }, { "epoch": 0.7764956850576942, "grad_norm": 2.2044635081868265, "learning_rate": 1.4453676944196477e-06, "loss": 0.6558, "step": 2002 }, { "epoch": 0.7768835450402405, "grad_norm": 2.8556620529442402, "learning_rate": 1.4406093678012767e-06, "loss": 0.6643, "step": 2003 }, { "epoch": 0.7772714050227868, "grad_norm": 2.640221147278785, "learning_rate": 1.4358575679687425e-06, "loss": 0.6955, "step": 2004 }, { "epoch": 0.7776592650053331, "grad_norm": 3.1896467687518197, "learning_rate": 1.431112303635328e-06, "loss": 0.702, "step": 2005 }, { "epoch": 0.7780471249878794, "grad_norm": 1.8840874344637595, "learning_rate": 1.4263735835023318e-06, "loss": 0.6689, "step": 2006 }, { "epoch": 0.7784349849704256, "grad_norm": 2.3802454365935715, "learning_rate": 1.4216414162590531e-06, "loss": 0.6078, "step": 2007 }, { "epoch": 0.7788228449529719, "grad_norm": 2.089716662442757, "learning_rate": 1.4169158105827768e-06, "loss": 0.6769, "step": 2008 }, { "epoch": 0.7792107049355183, "grad_norm": 2.2869327578039838, "learning_rate": 1.4121967751387538e-06, "loss": 0.6559, "step": 2009 }, { "epoch": 0.7795985649180646, "grad_norm": 1.6929006049774804, "learning_rate": 1.4074843185801885e-06, "loss": 0.6245, "step": 2010 }, { "epoch": 0.7799864249006109, "grad_norm": 2.437080029891352, "learning_rate": 1.4027784495482215e-06, "loss": 0.6275, "step": 2011 }, { "epoch": 0.7803742848831572, "grad_norm": 2.865506003874525, "learning_rate": 1.3980791766719138e-06, "loss": 0.671, "step": 2012 }, { "epoch": 0.7807621448657035, "grad_norm": 2.161582655332493, "learning_rate": 1.3933865085682313e-06, "loss": 0.6387, "step": 2013 }, { "epoch": 0.7811500048482498, "grad_norm": 2.7117719588208256, "learning_rate": 1.388700453842029e-06, "loss": 0.6502, "step": 2014 }, { "epoch": 0.7815378648307961, "grad_norm": 2.374692937150257, "learning_rate": 1.3840210210860343e-06, "loss": 0.6636, "step": 2015 }, { "epoch": 0.7819257248133424, "grad_norm": 2.532144959727545, "learning_rate": 1.3793482188808339e-06, "loss": 0.6867, "step": 2016 }, { "epoch": 0.7823135847958886, "grad_norm": 2.9265440635599687, "learning_rate": 1.3746820557948538e-06, "loss": 0.6352, "step": 2017 }, { "epoch": 0.782701444778435, "grad_norm": 3.717354240236664, "learning_rate": 1.370022540384347e-06, "loss": 0.7409, "step": 2018 }, { "epoch": 0.7830893047609813, "grad_norm": 2.338228984981957, "learning_rate": 1.3653696811933782e-06, "loss": 0.6783, "step": 2019 }, { "epoch": 0.7834771647435276, "grad_norm": 2.023117074853429, "learning_rate": 1.3607234867538028e-06, "loss": 0.6333, "step": 2020 }, { "epoch": 0.7838650247260739, "grad_norm": 2.5194417509215232, "learning_rate": 1.3560839655852604e-06, "loss": 0.6225, "step": 2021 }, { "epoch": 0.7842528847086202, "grad_norm": 2.8821140987326355, "learning_rate": 1.3514511261951514e-06, "loss": 0.6235, "step": 2022 }, { "epoch": 0.7846407446911665, "grad_norm": 2.0757839072008477, "learning_rate": 1.3468249770786223e-06, "loss": 0.6297, "step": 2023 }, { "epoch": 0.7850286046737128, "grad_norm": 2.4195720372860077, "learning_rate": 1.3422055267185541e-06, "loss": 0.6235, "step": 2024 }, { "epoch": 0.7854164646562591, "grad_norm": 2.251293599085807, "learning_rate": 1.337592783585544e-06, "loss": 0.639, "step": 2025 }, { "epoch": 0.7858043246388053, "grad_norm": 2.3454547891851725, "learning_rate": 1.332986756137889e-06, "loss": 0.6297, "step": 2026 }, { "epoch": 0.7861921846213517, "grad_norm": 2.889430364521892, "learning_rate": 1.3283874528215735e-06, "loss": 0.6648, "step": 2027 }, { "epoch": 0.786580044603898, "grad_norm": 1.9126782571932657, "learning_rate": 1.3237948820702495e-06, "loss": 0.6637, "step": 2028 }, { "epoch": 0.7869679045864443, "grad_norm": 1.7614248304950493, "learning_rate": 1.3192090523052275e-06, "loss": 0.5991, "step": 2029 }, { "epoch": 0.7873557645689906, "grad_norm": 2.121679076360506, "learning_rate": 1.3146299719354544e-06, "loss": 0.6298, "step": 2030 }, { "epoch": 0.7877436245515369, "grad_norm": 1.8654244689444535, "learning_rate": 1.3100576493575012e-06, "loss": 0.6026, "step": 2031 }, { "epoch": 0.7881314845340832, "grad_norm": 2.665840108284171, "learning_rate": 1.3054920929555471e-06, "loss": 0.6451, "step": 2032 }, { "epoch": 0.7885193445166295, "grad_norm": 2.163057522548589, "learning_rate": 1.300933311101365e-06, "loss": 0.6206, "step": 2033 }, { "epoch": 0.7889072044991758, "grad_norm": 2.169477935342071, "learning_rate": 1.296381312154305e-06, "loss": 0.6638, "step": 2034 }, { "epoch": 0.7892950644817222, "grad_norm": 2.642878916134218, "learning_rate": 1.29183610446128e-06, "loss": 0.6259, "step": 2035 }, { "epoch": 0.7896829244642684, "grad_norm": 1.9913810323547931, "learning_rate": 1.2872976963567485e-06, "loss": 0.6194, "step": 2036 }, { "epoch": 0.7900707844468147, "grad_norm": 2.2707150629979815, "learning_rate": 1.282766096162701e-06, "loss": 0.6832, "step": 2037 }, { "epoch": 0.790458644429361, "grad_norm": 2.2029017758988276, "learning_rate": 1.2782413121886483e-06, "loss": 0.6485, "step": 2038 }, { "epoch": 0.7908465044119073, "grad_norm": 2.6318396997666698, "learning_rate": 1.2737233527315978e-06, "loss": 0.6347, "step": 2039 }, { "epoch": 0.7912343643944536, "grad_norm": 1.7919435135070145, "learning_rate": 1.2692122260760442e-06, "loss": 0.5691, "step": 2040 }, { "epoch": 0.7916222243769999, "grad_norm": 1.825793218006119, "learning_rate": 1.2647079404939533e-06, "loss": 0.6069, "step": 2041 }, { "epoch": 0.7920100843595462, "grad_norm": 2.089780024151066, "learning_rate": 1.2602105042447472e-06, "loss": 0.6424, "step": 2042 }, { "epoch": 0.7923979443420925, "grad_norm": 1.8917578961404196, "learning_rate": 1.2557199255752866e-06, "loss": 0.648, "step": 2043 }, { "epoch": 0.7927858043246389, "grad_norm": 3.286640815813561, "learning_rate": 1.25123621271986e-06, "loss": 0.6834, "step": 2044 }, { "epoch": 0.7931736643071851, "grad_norm": 2.3835624318238606, "learning_rate": 1.246759373900165e-06, "loss": 0.647, "step": 2045 }, { "epoch": 0.7935615242897314, "grad_norm": 2.2583487931493336, "learning_rate": 1.2422894173252937e-06, "loss": 0.6493, "step": 2046 }, { "epoch": 0.7939493842722777, "grad_norm": 3.9078004352252083, "learning_rate": 1.23782635119172e-06, "loss": 0.6809, "step": 2047 }, { "epoch": 0.794337244254824, "grad_norm": 2.3256848460211756, "learning_rate": 1.2333701836832812e-06, "loss": 0.6625, "step": 2048 }, { "epoch": 0.7947251042373703, "grad_norm": 2.9759738101980457, "learning_rate": 1.2289209229711657e-06, "loss": 0.619, "step": 2049 }, { "epoch": 0.7951129642199166, "grad_norm": 1.9478240974265253, "learning_rate": 1.2244785772138972e-06, "loss": 0.7035, "step": 2050 }, { "epoch": 0.7955008242024629, "grad_norm": 2.126168069851282, "learning_rate": 1.22004315455732e-06, "loss": 0.6136, "step": 2051 }, { "epoch": 0.7958886841850092, "grad_norm": 2.3770440354473363, "learning_rate": 1.2156146631345817e-06, "loss": 0.6605, "step": 2052 }, { "epoch": 0.7962765441675556, "grad_norm": 2.112258784306048, "learning_rate": 1.2111931110661213e-06, "loss": 0.6668, "step": 2053 }, { "epoch": 0.7966644041501019, "grad_norm": 2.616583638496431, "learning_rate": 1.2067785064596532e-06, "loss": 0.6044, "step": 2054 }, { "epoch": 0.7970522641326481, "grad_norm": 2.141406032807671, "learning_rate": 1.202370857410155e-06, "loss": 0.5976, "step": 2055 }, { "epoch": 0.7974401241151944, "grad_norm": 2.1370325757813395, "learning_rate": 1.1979701719998454e-06, "loss": 0.6372, "step": 2056 }, { "epoch": 0.7978279840977407, "grad_norm": 2.5667312086480965, "learning_rate": 1.1935764582981774e-06, "loss": 0.6995, "step": 2057 }, { "epoch": 0.798215844080287, "grad_norm": 2.6388769801178826, "learning_rate": 1.1891897243618184e-06, "loss": 0.6512, "step": 2058 }, { "epoch": 0.7986037040628333, "grad_norm": 2.0151755341899604, "learning_rate": 1.1848099782346373e-06, "loss": 0.6283, "step": 2059 }, { "epoch": 0.7989915640453796, "grad_norm": 2.896435393244018, "learning_rate": 1.1804372279476905e-06, "loss": 0.7081, "step": 2060 }, { "epoch": 0.7993794240279259, "grad_norm": 2.5665965477593438, "learning_rate": 1.1760714815192054e-06, "loss": 0.7035, "step": 2061 }, { "epoch": 0.7997672840104723, "grad_norm": 2.340242284252969, "learning_rate": 1.171712746954566e-06, "loss": 0.6445, "step": 2062 }, { "epoch": 0.8001551439930186, "grad_norm": 2.7253267602760944, "learning_rate": 1.1673610322463014e-06, "loss": 0.6331, "step": 2063 }, { "epoch": 0.8005430039755648, "grad_norm": 2.984959007733708, "learning_rate": 1.163016345374066e-06, "loss": 0.7007, "step": 2064 }, { "epoch": 0.8009308639581111, "grad_norm": 2.5353445679195676, "learning_rate": 1.1586786943046284e-06, "loss": 0.6666, "step": 2065 }, { "epoch": 0.8013187239406574, "grad_norm": 2.3612570799382233, "learning_rate": 1.1543480869918555e-06, "loss": 0.6353, "step": 2066 }, { "epoch": 0.8017065839232037, "grad_norm": 2.3697356216831573, "learning_rate": 1.1500245313766984e-06, "loss": 0.6774, "step": 2067 }, { "epoch": 0.80209444390575, "grad_norm": 2.318810447610721, "learning_rate": 1.145708035387177e-06, "loss": 0.6903, "step": 2068 }, { "epoch": 0.8024823038882963, "grad_norm": 1.8137929431106654, "learning_rate": 1.141398606938367e-06, "loss": 0.6317, "step": 2069 }, { "epoch": 0.8028701638708426, "grad_norm": 3.069295344699254, "learning_rate": 1.1370962539323837e-06, "loss": 0.6642, "step": 2070 }, { "epoch": 0.803258023853389, "grad_norm": 3.444023043869518, "learning_rate": 1.1328009842583677e-06, "loss": 0.7134, "step": 2071 }, { "epoch": 0.8036458838359353, "grad_norm": 2.700528370100708, "learning_rate": 1.1285128057924743e-06, "loss": 0.6268, "step": 2072 }, { "epoch": 0.8040337438184816, "grad_norm": 1.8543460490658479, "learning_rate": 1.1242317263978525e-06, "loss": 0.6311, "step": 2073 }, { "epoch": 0.8044216038010278, "grad_norm": 2.11081146748378, "learning_rate": 1.1199577539246348e-06, "loss": 0.615, "step": 2074 }, { "epoch": 0.8048094637835741, "grad_norm": 2.354112328639301, "learning_rate": 1.1156908962099223e-06, "loss": 0.6061, "step": 2075 }, { "epoch": 0.8051973237661204, "grad_norm": 2.430556325398603, "learning_rate": 1.111431161077769e-06, "loss": 0.6371, "step": 2076 }, { "epoch": 0.8055851837486667, "grad_norm": 2.089095120256514, "learning_rate": 1.1071785563391697e-06, "loss": 0.7115, "step": 2077 }, { "epoch": 0.805973043731213, "grad_norm": 2.804958175591617, "learning_rate": 1.102933089792042e-06, "loss": 0.7028, "step": 2078 }, { "epoch": 0.8063609037137593, "grad_norm": 2.0393259538063213, "learning_rate": 1.0986947692212174e-06, "loss": 0.6037, "step": 2079 }, { "epoch": 0.8067487636963057, "grad_norm": 2.2749213684312632, "learning_rate": 1.0944636023984222e-06, "loss": 0.6288, "step": 2080 }, { "epoch": 0.807136623678852, "grad_norm": 2.8738015301052156, "learning_rate": 1.0902395970822648e-06, "loss": 0.6555, "step": 2081 }, { "epoch": 0.8075244836613983, "grad_norm": 2.4789064403314724, "learning_rate": 1.0860227610182222e-06, "loss": 0.6455, "step": 2082 }, { "epoch": 0.8079123436439445, "grad_norm": 2.1376343857559528, "learning_rate": 1.081813101938625e-06, "loss": 0.6238, "step": 2083 }, { "epoch": 0.8083002036264908, "grad_norm": 2.304746222745813, "learning_rate": 1.0776106275626446e-06, "loss": 0.675, "step": 2084 }, { "epoch": 0.8086880636090371, "grad_norm": 2.1607848882060114, "learning_rate": 1.0734153455962765e-06, "loss": 0.649, "step": 2085 }, { "epoch": 0.8090759235915834, "grad_norm": 2.6346895185310535, "learning_rate": 1.0692272637323281e-06, "loss": 0.7129, "step": 2086 }, { "epoch": 0.8094637835741297, "grad_norm": 3.5211039710355374, "learning_rate": 1.0650463896504042e-06, "loss": 0.7295, "step": 2087 }, { "epoch": 0.809851643556676, "grad_norm": 1.939964500877891, "learning_rate": 1.0608727310168921e-06, "loss": 0.6305, "step": 2088 }, { "epoch": 0.8102395035392224, "grad_norm": 2.0624050774393403, "learning_rate": 1.0567062954849506e-06, "loss": 0.6036, "step": 2089 }, { "epoch": 0.8106273635217687, "grad_norm": 2.2801683255993583, "learning_rate": 1.0525470906944919e-06, "loss": 0.6648, "step": 2090 }, { "epoch": 0.811015223504315, "grad_norm": 2.1778505242024866, "learning_rate": 1.0483951242721685e-06, "loss": 0.69, "step": 2091 }, { "epoch": 0.8114030834868613, "grad_norm": 2.827758567595865, "learning_rate": 1.044250403831361e-06, "loss": 0.6249, "step": 2092 }, { "epoch": 0.8117909434694075, "grad_norm": 1.9574078688840388, "learning_rate": 1.040112936972164e-06, "loss": 0.6273, "step": 2093 }, { "epoch": 0.8121788034519538, "grad_norm": 2.374331012458886, "learning_rate": 1.0359827312813702e-06, "loss": 0.6559, "step": 2094 }, { "epoch": 0.8125666634345001, "grad_norm": 2.1903868169433967, "learning_rate": 1.0318597943324582e-06, "loss": 0.6942, "step": 2095 }, { "epoch": 0.8129545234170464, "grad_norm": 1.684979893755111, "learning_rate": 1.027744133685577e-06, "loss": 0.5831, "step": 2096 }, { "epoch": 0.8133423833995927, "grad_norm": 1.885977757572892, "learning_rate": 1.0236357568875333e-06, "loss": 0.608, "step": 2097 }, { "epoch": 0.8137302433821391, "grad_norm": 2.135572291980809, "learning_rate": 1.0195346714717813e-06, "loss": 0.6486, "step": 2098 }, { "epoch": 0.8141181033646854, "grad_norm": 2.7555137474668423, "learning_rate": 1.0154408849583997e-06, "loss": 0.6683, "step": 2099 }, { "epoch": 0.8145059633472317, "grad_norm": 2.1242157009235916, "learning_rate": 1.0113544048540868e-06, "loss": 0.6153, "step": 2100 }, { "epoch": 0.814893823329778, "grad_norm": 2.4573822947618513, "learning_rate": 1.0072752386521417e-06, "loss": 0.6684, "step": 2101 }, { "epoch": 0.8152816833123242, "grad_norm": 1.90744155944795, "learning_rate": 1.0032033938324527e-06, "loss": 0.6222, "step": 2102 }, { "epoch": 0.8156695432948705, "grad_norm": 2.604817164154957, "learning_rate": 9.991388778614825e-07, "loss": 0.6991, "step": 2103 }, { "epoch": 0.8160574032774168, "grad_norm": 2.463174762709331, "learning_rate": 9.950816981922567e-07, "loss": 0.6532, "step": 2104 }, { "epoch": 0.8164452632599631, "grad_norm": 2.136367801219671, "learning_rate": 9.91031862264345e-07, "loss": 0.7078, "step": 2105 }, { "epoch": 0.8168331232425095, "grad_norm": 2.535255266263822, "learning_rate": 9.869893775038558e-07, "loss": 0.7204, "step": 2106 }, { "epoch": 0.8172209832250558, "grad_norm": 2.521601908613316, "learning_rate": 9.829542513234153e-07, "loss": 0.6706, "step": 2107 }, { "epoch": 0.8176088432076021, "grad_norm": 2.481102919996633, "learning_rate": 9.789264911221546e-07, "loss": 0.6702, "step": 2108 }, { "epoch": 0.8179967031901484, "grad_norm": 1.7382945037554915, "learning_rate": 9.749061042857011e-07, "loss": 0.5742, "step": 2109 }, { "epoch": 0.8183845631726947, "grad_norm": 2.800253807364091, "learning_rate": 9.708930981861603e-07, "loss": 0.6798, "step": 2110 }, { "epoch": 0.8187724231552409, "grad_norm": 2.0527561084615664, "learning_rate": 9.668874801821033e-07, "loss": 0.6316, "step": 2111 }, { "epoch": 0.8191602831377872, "grad_norm": 3.0067593473023155, "learning_rate": 9.62889257618555e-07, "loss": 0.7217, "step": 2112 }, { "epoch": 0.8195481431203335, "grad_norm": 3.0124673515921705, "learning_rate": 9.588984378269784e-07, "loss": 0.7387, "step": 2113 }, { "epoch": 0.8199360031028798, "grad_norm": 2.433202815246098, "learning_rate": 9.549150281252633e-07, "loss": 0.6571, "step": 2114 }, { "epoch": 0.8203238630854262, "grad_norm": 2.4599435861693095, "learning_rate": 9.509390358177106e-07, "loss": 0.6696, "step": 2115 }, { "epoch": 0.8207117230679725, "grad_norm": 2.3902226829431523, "learning_rate": 9.469704681950209e-07, "loss": 0.6459, "step": 2116 }, { "epoch": 0.8210995830505188, "grad_norm": 2.754602246169835, "learning_rate": 9.430093325342799e-07, "loss": 0.6224, "step": 2117 }, { "epoch": 0.8214874430330651, "grad_norm": 2.273487299893077, "learning_rate": 9.39055636098945e-07, "loss": 0.6442, "step": 2118 }, { "epoch": 0.8218753030156114, "grad_norm": 2.067206136841442, "learning_rate": 9.351093861388338e-07, "loss": 0.6613, "step": 2119 }, { "epoch": 0.8222631629981577, "grad_norm": 2.0980636542063427, "learning_rate": 9.311705898901086e-07, "loss": 0.6539, "step": 2120 }, { "epoch": 0.8226510229807039, "grad_norm": 2.6696148706828433, "learning_rate": 9.272392545752628e-07, "loss": 0.6498, "step": 2121 }, { "epoch": 0.8230388829632502, "grad_norm": 2.576323074361704, "learning_rate": 9.233153874031103e-07, "loss": 0.6297, "step": 2122 }, { "epoch": 0.8234267429457965, "grad_norm": 2.5073242771328283, "learning_rate": 9.193989955687715e-07, "loss": 0.6749, "step": 2123 }, { "epoch": 0.8238146029283429, "grad_norm": 2.573882573231028, "learning_rate": 9.154900862536586e-07, "loss": 0.6926, "step": 2124 }, { "epoch": 0.8242024629108892, "grad_norm": 2.636316032677939, "learning_rate": 9.115886666254625e-07, "loss": 0.6413, "step": 2125 }, { "epoch": 0.8245903228934355, "grad_norm": 1.8627665345209892, "learning_rate": 9.076947438381411e-07, "loss": 0.6619, "step": 2126 }, { "epoch": 0.8249781828759818, "grad_norm": 2.306485340630249, "learning_rate": 9.038083250319051e-07, "loss": 0.5815, "step": 2127 }, { "epoch": 0.8253660428585281, "grad_norm": 2.025688055593741, "learning_rate": 8.999294173332058e-07, "loss": 0.6548, "step": 2128 }, { "epoch": 0.8257539028410744, "grad_norm": 1.8783749943046575, "learning_rate": 8.960580278547216e-07, "loss": 0.6371, "step": 2129 }, { "epoch": 0.8261417628236206, "grad_norm": 1.978219775507761, "learning_rate": 8.921941636953435e-07, "loss": 0.6284, "step": 2130 }, { "epoch": 0.8265296228061669, "grad_norm": 2.8034979498116064, "learning_rate": 8.883378319401648e-07, "loss": 0.6307, "step": 2131 }, { "epoch": 0.8269174827887132, "grad_norm": 2.2979073818998255, "learning_rate": 8.844890396604677e-07, "loss": 0.6324, "step": 2132 }, { "epoch": 0.8273053427712596, "grad_norm": 2.9112869267657318, "learning_rate": 8.806477939137081e-07, "loss": 0.6644, "step": 2133 }, { "epoch": 0.8276932027538059, "grad_norm": 2.496845960510061, "learning_rate": 8.768141017435033e-07, "loss": 0.6794, "step": 2134 }, { "epoch": 0.8280810627363522, "grad_norm": 3.7431521574195954, "learning_rate": 8.729879701796207e-07, "loss": 0.7438, "step": 2135 }, { "epoch": 0.8284689227188985, "grad_norm": 1.7322215388131736, "learning_rate": 8.691694062379647e-07, "loss": 0.6589, "step": 2136 }, { "epoch": 0.8288567827014448, "grad_norm": 1.9833777793881955, "learning_rate": 8.653584169205608e-07, "loss": 0.6763, "step": 2137 }, { "epoch": 0.8292446426839911, "grad_norm": 2.285859110518478, "learning_rate": 8.615550092155478e-07, "loss": 0.65, "step": 2138 }, { "epoch": 0.8296325026665374, "grad_norm": 2.075132151647272, "learning_rate": 8.577591900971588e-07, "loss": 0.6499, "step": 2139 }, { "epoch": 0.8300203626490836, "grad_norm": 1.9415810762495416, "learning_rate": 8.539709665257167e-07, "loss": 0.607, "step": 2140 }, { "epoch": 0.8304082226316299, "grad_norm": 2.147700428289965, "learning_rate": 8.501903454476129e-07, "loss": 0.6258, "step": 2141 }, { "epoch": 0.8307960826141763, "grad_norm": 2.565404713406526, "learning_rate": 8.464173337952991e-07, "loss": 0.5594, "step": 2142 }, { "epoch": 0.8311839425967226, "grad_norm": 2.292897402718634, "learning_rate": 8.426519384872733e-07, "loss": 0.6515, "step": 2143 }, { "epoch": 0.8315718025792689, "grad_norm": 2.1030973007401994, "learning_rate": 8.388941664280703e-07, "loss": 0.6571, "step": 2144 }, { "epoch": 0.8319596625618152, "grad_norm": 2.678742436641568, "learning_rate": 8.351440245082415e-07, "loss": 0.7358, "step": 2145 }, { "epoch": 0.8323475225443615, "grad_norm": 2.5085987283698965, "learning_rate": 8.314015196043501e-07, "loss": 0.667, "step": 2146 }, { "epoch": 0.8327353825269078, "grad_norm": 2.3310151300808957, "learning_rate": 8.276666585789561e-07, "loss": 0.6448, "step": 2147 }, { "epoch": 0.8331232425094541, "grad_norm": 2.374860976977045, "learning_rate": 8.239394482805996e-07, "loss": 0.6189, "step": 2148 }, { "epoch": 0.8335111024920003, "grad_norm": 2.82594225160682, "learning_rate": 8.202198955437979e-07, "loss": 0.6354, "step": 2149 }, { "epoch": 0.8338989624745466, "grad_norm": 2.8414609893584197, "learning_rate": 8.165080071890208e-07, "loss": 0.6995, "step": 2150 }, { "epoch": 0.834286822457093, "grad_norm": 2.2832003285846523, "learning_rate": 8.128037900226865e-07, "loss": 0.6971, "step": 2151 }, { "epoch": 0.8346746824396393, "grad_norm": 3.2929765099901958, "learning_rate": 8.091072508371466e-07, "loss": 0.6969, "step": 2152 }, { "epoch": 0.8350625424221856, "grad_norm": 2.3670936699740848, "learning_rate": 8.054183964106737e-07, "loss": 0.6724, "step": 2153 }, { "epoch": 0.8354504024047319, "grad_norm": 1.928312640888906, "learning_rate": 8.017372335074486e-07, "loss": 0.5988, "step": 2154 }, { "epoch": 0.8358382623872782, "grad_norm": 2.7434991853997674, "learning_rate": 7.980637688775484e-07, "loss": 0.6731, "step": 2155 }, { "epoch": 0.8362261223698245, "grad_norm": 2.073485547889316, "learning_rate": 7.943980092569336e-07, "loss": 0.6463, "step": 2156 }, { "epoch": 0.8366139823523708, "grad_norm": 2.1081781776615967, "learning_rate": 7.907399613674388e-07, "loss": 0.5968, "step": 2157 }, { "epoch": 0.8370018423349171, "grad_norm": 2.1840072192409274, "learning_rate": 7.870896319167548e-07, "loss": 0.6535, "step": 2158 }, { "epoch": 0.8373897023174633, "grad_norm": 2.4597010667746786, "learning_rate": 7.834470275984196e-07, "loss": 0.6627, "step": 2159 }, { "epoch": 0.8377775623000097, "grad_norm": 2.3186142133057563, "learning_rate": 7.79812155091807e-07, "loss": 0.664, "step": 2160 }, { "epoch": 0.838165422282556, "grad_norm": 2.6980256225060715, "learning_rate": 7.761850210621125e-07, "loss": 0.6792, "step": 2161 }, { "epoch": 0.8385532822651023, "grad_norm": 1.8749303225786422, "learning_rate": 7.725656321603414e-07, "loss": 0.6357, "step": 2162 }, { "epoch": 0.8389411422476486, "grad_norm": 4.02211846939898, "learning_rate": 7.689539950232977e-07, "loss": 0.7265, "step": 2163 }, { "epoch": 0.8393290022301949, "grad_norm": 1.8650261578529026, "learning_rate": 7.653501162735694e-07, "loss": 0.6177, "step": 2164 }, { "epoch": 0.8397168622127412, "grad_norm": 2.7242068947828826, "learning_rate": 7.617540025195197e-07, "loss": 0.6705, "step": 2165 }, { "epoch": 0.8401047221952875, "grad_norm": 1.9832674628425027, "learning_rate": 7.581656603552745e-07, "loss": 0.6434, "step": 2166 }, { "epoch": 0.8404925821778338, "grad_norm": 3.6897738657504258, "learning_rate": 7.54585096360706e-07, "loss": 0.6793, "step": 2167 }, { "epoch": 0.84088044216038, "grad_norm": 2.7361801099059346, "learning_rate": 7.510123171014255e-07, "loss": 0.6214, "step": 2168 }, { "epoch": 0.8412683021429264, "grad_norm": 2.201224102766937, "learning_rate": 7.474473291287699e-07, "loss": 0.6975, "step": 2169 }, { "epoch": 0.8416561621254727, "grad_norm": 2.5870122911453963, "learning_rate": 7.438901389797881e-07, "loss": 0.6462, "step": 2170 }, { "epoch": 0.842044022108019, "grad_norm": 2.1569735092561637, "learning_rate": 7.403407531772311e-07, "loss": 0.6252, "step": 2171 }, { "epoch": 0.8424318820905653, "grad_norm": 1.994314417713421, "learning_rate": 7.367991782295392e-07, "loss": 0.6838, "step": 2172 }, { "epoch": 0.8428197420731116, "grad_norm": 3.3763421949913375, "learning_rate": 7.332654206308299e-07, "loss": 0.697, "step": 2173 }, { "epoch": 0.8432076020556579, "grad_norm": 1.902042555425268, "learning_rate": 7.297394868608859e-07, "loss": 0.5999, "step": 2174 }, { "epoch": 0.8435954620382042, "grad_norm": 2.028175851140813, "learning_rate": 7.262213833851445e-07, "loss": 0.6085, "step": 2175 }, { "epoch": 0.8439833220207505, "grad_norm": 2.2943281085578997, "learning_rate": 7.227111166546835e-07, "loss": 0.6749, "step": 2176 }, { "epoch": 0.8443711820032969, "grad_norm": 2.7200677976553527, "learning_rate": 7.192086931062115e-07, "loss": 0.6939, "step": 2177 }, { "epoch": 0.8447590419858431, "grad_norm": 2.877741467404864, "learning_rate": 7.157141191620548e-07, "loss": 0.7333, "step": 2178 }, { "epoch": 0.8451469019683894, "grad_norm": 1.8777304457471171, "learning_rate": 7.122274012301461e-07, "loss": 0.6146, "step": 2179 }, { "epoch": 0.8455347619509357, "grad_norm": 2.556423706592391, "learning_rate": 7.087485457040127e-07, "loss": 0.6455, "step": 2180 }, { "epoch": 0.845922621933482, "grad_norm": 2.771544126381906, "learning_rate": 7.052775589627647e-07, "loss": 0.6997, "step": 2181 }, { "epoch": 0.8463104819160283, "grad_norm": 2.1484110307023543, "learning_rate": 7.018144473710825e-07, "loss": 0.6645, "step": 2182 }, { "epoch": 0.8466983418985746, "grad_norm": 2.101752025790129, "learning_rate": 6.983592172792087e-07, "loss": 0.7158, "step": 2183 }, { "epoch": 0.8470862018811209, "grad_norm": 2.9239632642835254, "learning_rate": 6.949118750229317e-07, "loss": 0.6999, "step": 2184 }, { "epoch": 0.8474740618636672, "grad_norm": 2.24568608022255, "learning_rate": 6.914724269235756e-07, "loss": 0.6444, "step": 2185 }, { "epoch": 0.8478619218462136, "grad_norm": 2.5536104324322535, "learning_rate": 6.880408792879905e-07, "loss": 0.6364, "step": 2186 }, { "epoch": 0.8482497818287598, "grad_norm": 2.2229830630018315, "learning_rate": 6.846172384085386e-07, "loss": 0.6183, "step": 2187 }, { "epoch": 0.8486376418113061, "grad_norm": 2.019203206232619, "learning_rate": 6.812015105630842e-07, "loss": 0.6185, "step": 2188 }, { "epoch": 0.8490255017938524, "grad_norm": 2.659704096748484, "learning_rate": 6.777937020149816e-07, "loss": 0.6301, "step": 2189 }, { "epoch": 0.8494133617763987, "grad_norm": 2.4577381545656563, "learning_rate": 6.743938190130616e-07, "loss": 0.7041, "step": 2190 }, { "epoch": 0.849801221758945, "grad_norm": 2.356912301038884, "learning_rate": 6.710018677916275e-07, "loss": 0.6735, "step": 2191 }, { "epoch": 0.8501890817414913, "grad_norm": 2.3989391702760186, "learning_rate": 6.676178545704326e-07, "loss": 0.6389, "step": 2192 }, { "epoch": 0.8505769417240376, "grad_norm": 2.6633192796945835, "learning_rate": 6.642417855546768e-07, "loss": 0.7003, "step": 2193 }, { "epoch": 0.850964801706584, "grad_norm": 2.7628075124478984, "learning_rate": 6.60873666934993e-07, "loss": 0.6324, "step": 2194 }, { "epoch": 0.8513526616891303, "grad_norm": 2.0945338689699056, "learning_rate": 6.575135048874349e-07, "loss": 0.6305, "step": 2195 }, { "epoch": 0.8517405216716766, "grad_norm": 2.5620136062925787, "learning_rate": 6.541613055734669e-07, "loss": 0.6823, "step": 2196 }, { "epoch": 0.8521283816542228, "grad_norm": 2.3966273932112188, "learning_rate": 6.508170751399517e-07, "loss": 0.7028, "step": 2197 }, { "epoch": 0.8525162416367691, "grad_norm": 3.2612047496043854, "learning_rate": 6.474808197191401e-07, "loss": 0.7048, "step": 2198 }, { "epoch": 0.8529041016193154, "grad_norm": 2.2990252258559836, "learning_rate": 6.44152545428659e-07, "loss": 0.6271, "step": 2199 }, { "epoch": 0.8532919616018617, "grad_norm": 2.1652835964917077, "learning_rate": 6.408322583715021e-07, "loss": 0.618, "step": 2200 }, { "epoch": 0.853679821584408, "grad_norm": 2.448417094920521, "learning_rate": 6.375199646360142e-07, "loss": 0.6764, "step": 2201 }, { "epoch": 0.8540676815669543, "grad_norm": 2.627135595004947, "learning_rate": 6.342156702958851e-07, "loss": 0.6637, "step": 2202 }, { "epoch": 0.8544555415495007, "grad_norm": 2.5320217125580635, "learning_rate": 6.30919381410135e-07, "loss": 0.6377, "step": 2203 }, { "epoch": 0.854843401532047, "grad_norm": 2.408457973039574, "learning_rate": 6.276311040231054e-07, "loss": 0.6271, "step": 2204 }, { "epoch": 0.8552312615145933, "grad_norm": 2.3124585784210123, "learning_rate": 6.243508441644469e-07, "loss": 0.6423, "step": 2205 }, { "epoch": 0.8556191214971395, "grad_norm": 2.261937520168477, "learning_rate": 6.210786078491088e-07, "loss": 0.6293, "step": 2206 }, { "epoch": 0.8560069814796858, "grad_norm": 2.608401794184418, "learning_rate": 6.178144010773274e-07, "loss": 0.5808, "step": 2207 }, { "epoch": 0.8563948414622321, "grad_norm": 2.53263443663922, "learning_rate": 6.145582298346153e-07, "loss": 0.6318, "step": 2208 }, { "epoch": 0.8567827014447784, "grad_norm": 2.3897769123931436, "learning_rate": 6.113101000917515e-07, "loss": 0.6668, "step": 2209 }, { "epoch": 0.8571705614273247, "grad_norm": 3.0103840262287607, "learning_rate": 6.080700178047688e-07, "loss": 0.71, "step": 2210 }, { "epoch": 0.857558421409871, "grad_norm": 2.2731787345626704, "learning_rate": 6.048379889149425e-07, "loss": 0.6049, "step": 2211 }, { "epoch": 0.8579462813924174, "grad_norm": 3.1939463595601505, "learning_rate": 6.016140193487824e-07, "loss": 0.7108, "step": 2212 }, { "epoch": 0.8583341413749637, "grad_norm": 2.183700612017053, "learning_rate": 5.98398115018019e-07, "loss": 0.625, "step": 2213 }, { "epoch": 0.85872200135751, "grad_norm": 2.3658120293896103, "learning_rate": 5.951902818195937e-07, "loss": 0.669, "step": 2214 }, { "epoch": 0.8591098613400563, "grad_norm": 2.15460002482815, "learning_rate": 5.919905256356484e-07, "loss": 0.6509, "step": 2215 }, { "epoch": 0.8594977213226025, "grad_norm": 1.7067271819532175, "learning_rate": 5.887988523335137e-07, "loss": 0.6173, "step": 2216 }, { "epoch": 0.8598855813051488, "grad_norm": 2.1879817179617205, "learning_rate": 5.856152677657007e-07, "loss": 0.6026, "step": 2217 }, { "epoch": 0.8602734412876951, "grad_norm": 2.5050244744214023, "learning_rate": 5.824397777698859e-07, "loss": 0.6421, "step": 2218 }, { "epoch": 0.8606613012702414, "grad_norm": 2.536159085289848, "learning_rate": 5.792723881689039e-07, "loss": 0.6558, "step": 2219 }, { "epoch": 0.8610491612527877, "grad_norm": 2.8951482922266596, "learning_rate": 5.761131047707363e-07, "loss": 0.6804, "step": 2220 }, { "epoch": 0.861437021235334, "grad_norm": 2.5181354367408066, "learning_rate": 5.729619333684994e-07, "loss": 0.6385, "step": 2221 }, { "epoch": 0.8618248812178804, "grad_norm": 2.7524952329285512, "learning_rate": 5.698188797404358e-07, "loss": 0.6711, "step": 2222 }, { "epoch": 0.8622127412004267, "grad_norm": 2.0075199612807126, "learning_rate": 5.666839496499021e-07, "loss": 0.6508, "step": 2223 }, { "epoch": 0.862600601182973, "grad_norm": 2.2066527125063162, "learning_rate": 5.63557148845359e-07, "loss": 0.6341, "step": 2224 }, { "epoch": 0.8629884611655192, "grad_norm": 2.720512329233808, "learning_rate": 5.604384830603599e-07, "loss": 0.669, "step": 2225 }, { "epoch": 0.8633763211480655, "grad_norm": 2.4899646410527754, "learning_rate": 5.573279580135438e-07, "loss": 0.6504, "step": 2226 }, { "epoch": 0.8637641811306118, "grad_norm": 2.504619223250893, "learning_rate": 5.542255794086193e-07, "loss": 0.7175, "step": 2227 }, { "epoch": 0.8641520411131581, "grad_norm": 2.3246390690059395, "learning_rate": 5.511313529343581e-07, "loss": 0.6461, "step": 2228 }, { "epoch": 0.8645399010957044, "grad_norm": 2.311142446528406, "learning_rate": 5.480452842645839e-07, "loss": 0.6806, "step": 2229 }, { "epoch": 0.8649277610782508, "grad_norm": 2.07744971598023, "learning_rate": 5.449673790581611e-07, "loss": 0.6381, "step": 2230 }, { "epoch": 0.8653156210607971, "grad_norm": 2.014915814582545, "learning_rate": 5.418976429589845e-07, "loss": 0.611, "step": 2231 }, { "epoch": 0.8657034810433434, "grad_norm": 2.532444481999695, "learning_rate": 5.388360815959703e-07, "loss": 0.6309, "step": 2232 }, { "epoch": 0.8660913410258897, "grad_norm": 2.606314848271102, "learning_rate": 5.357827005830435e-07, "loss": 0.6109, "step": 2233 }, { "epoch": 0.866479201008436, "grad_norm": 2.1653150900917812, "learning_rate": 5.327375055191313e-07, "loss": 0.6351, "step": 2234 }, { "epoch": 0.8668670609909822, "grad_norm": 2.120367664118541, "learning_rate": 5.297005019881491e-07, "loss": 0.6743, "step": 2235 }, { "epoch": 0.8672549209735285, "grad_norm": 2.0852853883207274, "learning_rate": 5.266716955589907e-07, "loss": 0.6581, "step": 2236 }, { "epoch": 0.8676427809560748, "grad_norm": 2.6534388002512657, "learning_rate": 5.236510917855197e-07, "loss": 0.6299, "step": 2237 }, { "epoch": 0.8680306409386211, "grad_norm": 2.6283206308901277, "learning_rate": 5.206386962065601e-07, "loss": 0.6902, "step": 2238 }, { "epoch": 0.8684185009211675, "grad_norm": 2.555184685860975, "learning_rate": 5.176345143458827e-07, "loss": 0.6695, "step": 2239 }, { "epoch": 0.8688063609037138, "grad_norm": 2.3425974488591996, "learning_rate": 5.146385517121977e-07, "loss": 0.628, "step": 2240 }, { "epoch": 0.8691942208862601, "grad_norm": 2.30760642165541, "learning_rate": 5.116508137991438e-07, "loss": 0.6778, "step": 2241 }, { "epoch": 0.8695820808688064, "grad_norm": 2.696778382740496, "learning_rate": 5.086713060852788e-07, "loss": 0.6921, "step": 2242 }, { "epoch": 0.8699699408513527, "grad_norm": 2.1785096530806625, "learning_rate": 5.057000340340679e-07, "loss": 0.6838, "step": 2243 }, { "epoch": 0.8703578008338989, "grad_norm": 3.116970810515075, "learning_rate": 5.027370030938755e-07, "loss": 0.7628, "step": 2244 }, { "epoch": 0.8707456608164452, "grad_norm": 2.4309333688235966, "learning_rate": 4.997822186979539e-07, "loss": 0.6901, "step": 2245 }, { "epoch": 0.8711335207989915, "grad_norm": 2.0732380612748087, "learning_rate": 4.968356862644352e-07, "loss": 0.6533, "step": 2246 }, { "epoch": 0.8715213807815378, "grad_norm": 1.8741904533138958, "learning_rate": 4.938974111963174e-07, "loss": 0.653, "step": 2247 }, { "epoch": 0.8719092407640842, "grad_norm": 2.997659214283092, "learning_rate": 4.9096739888146e-07, "loss": 0.6592, "step": 2248 }, { "epoch": 0.8722971007466305, "grad_norm": 1.8985485752393598, "learning_rate": 4.880456546925693e-07, "loss": 0.6105, "step": 2249 }, { "epoch": 0.8726849607291768, "grad_norm": 1.844797235295743, "learning_rate": 4.851321839871908e-07, "loss": 0.6247, "step": 2250 }, { "epoch": 0.8730728207117231, "grad_norm": 2.3563894579577394, "learning_rate": 4.822269921077011e-07, "loss": 0.6536, "step": 2251 }, { "epoch": 0.8734606806942694, "grad_norm": 1.9790494310893918, "learning_rate": 4.793300843812926e-07, "loss": 0.611, "step": 2252 }, { "epoch": 0.8738485406768157, "grad_norm": 2.3364491760380175, "learning_rate": 4.7644146611997064e-07, "loss": 0.6016, "step": 2253 }, { "epoch": 0.8742364006593619, "grad_norm": 3.546098880481106, "learning_rate": 4.735611426205372e-07, "loss": 0.6284, "step": 2254 }, { "epoch": 0.8746242606419082, "grad_norm": 1.874066155915751, "learning_rate": 4.7068911916458683e-07, "loss": 0.5771, "step": 2255 }, { "epoch": 0.8750121206244545, "grad_norm": 2.1661092717510795, "learning_rate": 4.678254010184929e-07, "loss": 0.5945, "step": 2256 }, { "epoch": 0.8753999806070009, "grad_norm": 2.0784087151975337, "learning_rate": 4.6496999343340065e-07, "loss": 0.5713, "step": 2257 }, { "epoch": 0.8757878405895472, "grad_norm": 2.0840104434681384, "learning_rate": 4.6212290164521554e-07, "loss": 0.6552, "step": 2258 }, { "epoch": 0.8761757005720935, "grad_norm": 2.4880087032275355, "learning_rate": 4.5928413087459325e-07, "loss": 0.6369, "step": 2259 }, { "epoch": 0.8765635605546398, "grad_norm": 2.07827442093364, "learning_rate": 4.564536863269353e-07, "loss": 0.6225, "step": 2260 }, { "epoch": 0.8769514205371861, "grad_norm": 2.408265255872751, "learning_rate": 4.536315731923724e-07, "loss": 0.6261, "step": 2261 }, { "epoch": 0.8773392805197324, "grad_norm": 2.4771616549606295, "learning_rate": 4.5081779664575887e-07, "loss": 0.6524, "step": 2262 }, { "epoch": 0.8777271405022786, "grad_norm": 1.9088825043319426, "learning_rate": 4.48012361846662e-07, "loss": 0.6135, "step": 2263 }, { "epoch": 0.8781150004848249, "grad_norm": 1.8231912413587372, "learning_rate": 4.4521527393935336e-07, "loss": 0.6654, "step": 2264 }, { "epoch": 0.8785028604673712, "grad_norm": 2.1788989108495813, "learning_rate": 4.4242653805279923e-07, "loss": 0.6257, "step": 2265 }, { "epoch": 0.8788907204499176, "grad_norm": 2.1773799721555838, "learning_rate": 4.3964615930065126e-07, "loss": 0.6428, "step": 2266 }, { "epoch": 0.8792785804324639, "grad_norm": 2.6576946945717297, "learning_rate": 4.3687414278123454e-07, "loss": 0.6537, "step": 2267 }, { "epoch": 0.8796664404150102, "grad_norm": 2.340665724817105, "learning_rate": 4.341104935775442e-07, "loss": 0.6099, "step": 2268 }, { "epoch": 0.8800543003975565, "grad_norm": 2.526490372879739, "learning_rate": 4.313552167572294e-07, "loss": 0.6453, "step": 2269 }, { "epoch": 0.8804421603801028, "grad_norm": 2.29243506680432, "learning_rate": 4.2860831737258857e-07, "loss": 0.6952, "step": 2270 }, { "epoch": 0.8808300203626491, "grad_norm": 2.770942231483744, "learning_rate": 4.258698004605571e-07, "loss": 0.6655, "step": 2271 }, { "epoch": 0.8812178803451954, "grad_norm": 2.0717577922827277, "learning_rate": 4.231396710427016e-07, "loss": 0.6057, "step": 2272 }, { "epoch": 0.8816057403277416, "grad_norm": 2.544760376308485, "learning_rate": 4.204179341252074e-07, "loss": 0.6791, "step": 2273 }, { "epoch": 0.881993600310288, "grad_norm": 3.1152760299750772, "learning_rate": 4.1770459469887003e-07, "loss": 0.7401, "step": 2274 }, { "epoch": 0.8823814602928343, "grad_norm": 2.397675050374366, "learning_rate": 4.149996577390886e-07, "loss": 0.5986, "step": 2275 }, { "epoch": 0.8827693202753806, "grad_norm": 1.860272238583415, "learning_rate": 4.1230312820585317e-07, "loss": 0.6252, "step": 2276 }, { "epoch": 0.8831571802579269, "grad_norm": 3.1970270612300724, "learning_rate": 4.09615011043738e-07, "loss": 0.6812, "step": 2277 }, { "epoch": 0.8835450402404732, "grad_norm": 2.45103346828556, "learning_rate": 4.069353111818913e-07, "loss": 0.7158, "step": 2278 }, { "epoch": 0.8839329002230195, "grad_norm": 2.1403610895616225, "learning_rate": 4.042640335340281e-07, "loss": 0.6281, "step": 2279 }, { "epoch": 0.8843207602055658, "grad_norm": 2.110038790212783, "learning_rate": 4.016011829984168e-07, "loss": 0.5936, "step": 2280 }, { "epoch": 0.8847086201881121, "grad_norm": 2.704344590800952, "learning_rate": 3.989467644578765e-07, "loss": 0.6442, "step": 2281 }, { "epoch": 0.8850964801706583, "grad_norm": 3.0211430980683454, "learning_rate": 3.963007827797627e-07, "loss": 0.6864, "step": 2282 }, { "epoch": 0.8854843401532047, "grad_norm": 1.990287155691176, "learning_rate": 3.936632428159609e-07, "loss": 0.6065, "step": 2283 }, { "epoch": 0.885872200135751, "grad_norm": 2.0437297027736463, "learning_rate": 3.9103414940287575e-07, "loss": 0.6171, "step": 2284 }, { "epoch": 0.8862600601182973, "grad_norm": 2.1572584735933336, "learning_rate": 3.8841350736142757e-07, "loss": 0.6463, "step": 2285 }, { "epoch": 0.8866479201008436, "grad_norm": 2.018565658925553, "learning_rate": 3.858013214970363e-07, "loss": 0.6507, "step": 2286 }, { "epoch": 0.8870357800833899, "grad_norm": 1.8969076532887093, "learning_rate": 3.831975965996154e-07, "loss": 0.6182, "step": 2287 }, { "epoch": 0.8874236400659362, "grad_norm": 2.3942665339521154, "learning_rate": 3.8060233744356634e-07, "loss": 0.6512, "step": 2288 }, { "epoch": 0.8878115000484825, "grad_norm": 2.461554167990943, "learning_rate": 3.7801554878776514e-07, "loss": 0.6542, "step": 2289 }, { "epoch": 0.8881993600310288, "grad_norm": 2.0515767854398623, "learning_rate": 3.754372353755559e-07, "loss": 0.6424, "step": 2290 }, { "epoch": 0.8885872200135752, "grad_norm": 2.3017742814222095, "learning_rate": 3.728674019347428e-07, "loss": 0.6057, "step": 2291 }, { "epoch": 0.8889750799961214, "grad_norm": 2.9287979238842423, "learning_rate": 3.703060531775787e-07, "loss": 0.6903, "step": 2292 }, { "epoch": 0.8893629399786677, "grad_norm": 2.549469282327486, "learning_rate": 3.6775319380076e-07, "loss": 0.6307, "step": 2293 }, { "epoch": 0.889750799961214, "grad_norm": 2.7980191523672664, "learning_rate": 3.6520882848541606e-07, "loss": 0.668, "step": 2294 }, { "epoch": 0.8901386599437603, "grad_norm": 2.440150068069646, "learning_rate": 3.626729618970998e-07, "loss": 0.6507, "step": 2295 }, { "epoch": 0.8905265199263066, "grad_norm": 1.7358246969039075, "learning_rate": 3.6014559868578103e-07, "loss": 0.5923, "step": 2296 }, { "epoch": 0.8909143799088529, "grad_norm": 2.4481220151281136, "learning_rate": 3.576267434858366e-07, "loss": 0.6664, "step": 2297 }, { "epoch": 0.8913022398913992, "grad_norm": 2.3790359210748178, "learning_rate": 3.5511640091604293e-07, "loss": 0.6509, "step": 2298 }, { "epoch": 0.8916900998739455, "grad_norm": 1.8171100751304667, "learning_rate": 3.5261457557956626e-07, "loss": 0.6282, "step": 2299 }, { "epoch": 0.8920779598564919, "grad_norm": 3.0853278066490253, "learning_rate": 3.501212720639563e-07, "loss": 0.7003, "step": 2300 }, { "epoch": 0.892465819839038, "grad_norm": 2.9438441171467584, "learning_rate": 3.476364949411343e-07, "loss": 0.7049, "step": 2301 }, { "epoch": 0.8928536798215844, "grad_norm": 2.3963535542307572, "learning_rate": 3.451602487673889e-07, "loss": 0.632, "step": 2302 }, { "epoch": 0.8932415398041307, "grad_norm": 3.3364874300188836, "learning_rate": 3.4269253808336456e-07, "loss": 0.7651, "step": 2303 }, { "epoch": 0.893629399786677, "grad_norm": 2.5814425528196994, "learning_rate": 3.402333674140551e-07, "loss": 0.6241, "step": 2304 }, { "epoch": 0.8940172597692233, "grad_norm": 2.11871001326057, "learning_rate": 3.377827412687934e-07, "loss": 0.603, "step": 2305 }, { "epoch": 0.8944051197517696, "grad_norm": 2.312836959831643, "learning_rate": 3.35340664141246e-07, "loss": 0.6976, "step": 2306 }, { "epoch": 0.8947929797343159, "grad_norm": 1.9110271287975158, "learning_rate": 3.32907140509402e-07, "loss": 0.6451, "step": 2307 }, { "epoch": 0.8951808397168622, "grad_norm": 3.328378234653525, "learning_rate": 3.3048217483556743e-07, "loss": 0.657, "step": 2308 }, { "epoch": 0.8955686996994086, "grad_norm": 1.8434806651217879, "learning_rate": 3.2806577156635435e-07, "loss": 0.5734, "step": 2309 }, { "epoch": 0.8959565596819549, "grad_norm": 2.0918034971960364, "learning_rate": 3.256579351326744e-07, "loss": 0.6021, "step": 2310 }, { "epoch": 0.8963444196645011, "grad_norm": 2.56964203014448, "learning_rate": 3.2325866994973197e-07, "loss": 0.6471, "step": 2311 }, { "epoch": 0.8967322796470474, "grad_norm": 2.451041109357463, "learning_rate": 3.208679804170128e-07, "loss": 0.643, "step": 2312 }, { "epoch": 0.8971201396295937, "grad_norm": 2.565317059721434, "learning_rate": 3.1848587091827757e-07, "loss": 0.698, "step": 2313 }, { "epoch": 0.89750799961214, "grad_norm": 2.6320146091688446, "learning_rate": 3.161123458215554e-07, "loss": 0.7103, "step": 2314 }, { "epoch": 0.8978958595946863, "grad_norm": 2.1485058207621357, "learning_rate": 3.1374740947913206e-07, "loss": 0.5779, "step": 2315 }, { "epoch": 0.8982837195772326, "grad_norm": 1.7993991994332263, "learning_rate": 3.1139106622754655e-07, "loss": 0.5976, "step": 2316 }, { "epoch": 0.8986715795597789, "grad_norm": 2.786630927440506, "learning_rate": 3.0904332038757977e-07, "loss": 0.6424, "step": 2317 }, { "epoch": 0.8990594395423253, "grad_norm": 2.406809470717278, "learning_rate": 3.067041762642475e-07, "loss": 0.6782, "step": 2318 }, { "epoch": 0.8994472995248716, "grad_norm": 2.572886724559831, "learning_rate": 3.0437363814679375e-07, "loss": 0.6427, "step": 2319 }, { "epoch": 0.8998351595074178, "grad_norm": 2.1859555257261434, "learning_rate": 3.020517103086812e-07, "loss": 0.6516, "step": 2320 }, { "epoch": 0.9002230194899641, "grad_norm": 2.092704661536328, "learning_rate": 2.99738397007584e-07, "loss": 0.595, "step": 2321 }, { "epoch": 0.9006108794725104, "grad_norm": 2.5052316745693033, "learning_rate": 2.974337024853802e-07, "loss": 0.6978, "step": 2322 }, { "epoch": 0.9009987394550567, "grad_norm": 1.9419304692973431, "learning_rate": 2.9513763096814305e-07, "loss": 0.6495, "step": 2323 }, { "epoch": 0.901386599437603, "grad_norm": 1.7225542735057005, "learning_rate": 2.9285018666613484e-07, "loss": 0.6625, "step": 2324 }, { "epoch": 0.9017744594201493, "grad_norm": 2.2706260788355825, "learning_rate": 2.9057137377379805e-07, "loss": 0.6651, "step": 2325 }, { "epoch": 0.9021623194026956, "grad_norm": 2.218890186752535, "learning_rate": 2.8830119646974796e-07, "loss": 0.6216, "step": 2326 }, { "epoch": 0.902550179385242, "grad_norm": 2.657701463796221, "learning_rate": 2.860396589167641e-07, "loss": 0.6228, "step": 2327 }, { "epoch": 0.9029380393677883, "grad_norm": 2.766940254022477, "learning_rate": 2.8378676526178484e-07, "loss": 0.6731, "step": 2328 }, { "epoch": 0.9033258993503346, "grad_norm": 2.753738015315766, "learning_rate": 2.815425196358984e-07, "loss": 0.6572, "step": 2329 }, { "epoch": 0.9037137593328808, "grad_norm": 2.711715946567715, "learning_rate": 2.7930692615433353e-07, "loss": 0.6824, "step": 2330 }, { "epoch": 0.9041016193154271, "grad_norm": 2.7385712375388382, "learning_rate": 2.770799889164549e-07, "loss": 0.6582, "step": 2331 }, { "epoch": 0.9044894792979734, "grad_norm": 3.090555636060696, "learning_rate": 2.748617120057551e-07, "loss": 0.6704, "step": 2332 }, { "epoch": 0.9048773392805197, "grad_norm": 2.666024732817055, "learning_rate": 2.726520994898452e-07, "loss": 0.6324, "step": 2333 }, { "epoch": 0.905265199263066, "grad_norm": 2.3150298299756855, "learning_rate": 2.704511554204486e-07, "loss": 0.6455, "step": 2334 }, { "epoch": 0.9056530592456123, "grad_norm": 2.511323318930818, "learning_rate": 2.6825888383339436e-07, "loss": 0.6511, "step": 2335 }, { "epoch": 0.9060409192281587, "grad_norm": 2.181862735500553, "learning_rate": 2.660752887486084e-07, "loss": 0.6765, "step": 2336 }, { "epoch": 0.906428779210705, "grad_norm": 2.2900642250436873, "learning_rate": 2.6390037417010683e-07, "loss": 0.6068, "step": 2337 }, { "epoch": 0.9068166391932513, "grad_norm": 2.034464261980882, "learning_rate": 2.617341440859883e-07, "loss": 0.6639, "step": 2338 }, { "epoch": 0.9072044991757975, "grad_norm": 2.5027263419110386, "learning_rate": 2.5957660246842707e-07, "loss": 0.6981, "step": 2339 }, { "epoch": 0.9075923591583438, "grad_norm": 2.8531232129957327, "learning_rate": 2.5742775327366634e-07, "loss": 0.7127, "step": 2340 }, { "epoch": 0.9079802191408901, "grad_norm": 2.045131262086616, "learning_rate": 2.552876004420085e-07, "loss": 0.6239, "step": 2341 }, { "epoch": 0.9083680791234364, "grad_norm": 1.9787203985852082, "learning_rate": 2.5315614789781064e-07, "loss": 0.6702, "step": 2342 }, { "epoch": 0.9087559391059827, "grad_norm": 2.8913603310680465, "learning_rate": 2.5103339954947624e-07, "loss": 0.7361, "step": 2343 }, { "epoch": 0.909143799088529, "grad_norm": 2.5894733661280176, "learning_rate": 2.4891935928944676e-07, "loss": 0.6775, "step": 2344 }, { "epoch": 0.9095316590710754, "grad_norm": 2.4940459049564407, "learning_rate": 2.468140309941991e-07, "loss": 0.5792, "step": 2345 }, { "epoch": 0.9099195190536217, "grad_norm": 2.0844351515417667, "learning_rate": 2.447174185242324e-07, "loss": 0.6553, "step": 2346 }, { "epoch": 0.910307379036168, "grad_norm": 2.135116259916173, "learning_rate": 2.4262952572406353e-07, "loss": 0.6819, "step": 2347 }, { "epoch": 0.9106952390187143, "grad_norm": 2.400142063728019, "learning_rate": 2.4055035642222225e-07, "loss": 0.6946, "step": 2348 }, { "epoch": 0.9110830990012605, "grad_norm": 2.5593745480230337, "learning_rate": 2.384799144312405e-07, "loss": 0.5874, "step": 2349 }, { "epoch": 0.9114709589838068, "grad_norm": 2.4470270786178165, "learning_rate": 2.3641820354764755e-07, "loss": 0.673, "step": 2350 }, { "epoch": 0.9118588189663531, "grad_norm": 1.9676315384417595, "learning_rate": 2.3436522755196367e-07, "loss": 0.6013, "step": 2351 }, { "epoch": 0.9122466789488994, "grad_norm": 2.171625439753764, "learning_rate": 2.323209902086898e-07, "loss": 0.654, "step": 2352 }, { "epoch": 0.9126345389314457, "grad_norm": 2.6058656884869555, "learning_rate": 2.3028549526630583e-07, "loss": 0.612, "step": 2353 }, { "epoch": 0.9130223989139921, "grad_norm": 1.9671456383325805, "learning_rate": 2.2825874645725942e-07, "loss": 0.6027, "step": 2354 }, { "epoch": 0.9134102588965384, "grad_norm": 2.24530611504996, "learning_rate": 2.2624074749796053e-07, "loss": 0.6561, "step": 2355 }, { "epoch": 0.9137981188790847, "grad_norm": 2.3773573092452716, "learning_rate": 2.2423150208877476e-07, "loss": 0.6156, "step": 2356 }, { "epoch": 0.914185978861631, "grad_norm": 2.18262860337035, "learning_rate": 2.2223101391401657e-07, "loss": 0.6374, "step": 2357 }, { "epoch": 0.9145738388441772, "grad_norm": 2.790234218440362, "learning_rate": 2.2023928664194229e-07, "loss": 0.687, "step": 2358 }, { "epoch": 0.9149616988267235, "grad_norm": 3.046055211663888, "learning_rate": 2.1825632392474372e-07, "loss": 0.6661, "step": 2359 }, { "epoch": 0.9153495588092698, "grad_norm": 2.214517412563203, "learning_rate": 2.1628212939854176e-07, "loss": 0.6702, "step": 2360 }, { "epoch": 0.9157374187918161, "grad_norm": 2.4634924380171883, "learning_rate": 2.143167066833779e-07, "loss": 0.6836, "step": 2361 }, { "epoch": 0.9161252787743625, "grad_norm": 1.9740766520570339, "learning_rate": 2.1236005938321092e-07, "loss": 0.6509, "step": 2362 }, { "epoch": 0.9165131387569088, "grad_norm": 1.975112157791904, "learning_rate": 2.1041219108590692e-07, "loss": 0.6547, "step": 2363 }, { "epoch": 0.9169009987394551, "grad_norm": 2.2505159787442777, "learning_rate": 2.0847310536323385e-07, "loss": 0.6706, "step": 2364 }, { "epoch": 0.9172888587220014, "grad_norm": 3.2027422816946713, "learning_rate": 2.065428057708563e-07, "loss": 0.6986, "step": 2365 }, { "epoch": 0.9176767187045477, "grad_norm": 2.6455396188643134, "learning_rate": 2.046212958483268e-07, "loss": 0.634, "step": 2366 }, { "epoch": 0.918064578687094, "grad_norm": 2.5404134293996226, "learning_rate": 2.0270857911908137e-07, "loss": 0.6584, "step": 2367 }, { "epoch": 0.9184524386696402, "grad_norm": 2.1706140189447107, "learning_rate": 2.0080465909043113e-07, "loss": 0.6261, "step": 2368 }, { "epoch": 0.9188402986521865, "grad_norm": 2.042592154126197, "learning_rate": 1.9890953925355838e-07, "loss": 0.6442, "step": 2369 }, { "epoch": 0.9192281586347328, "grad_norm": 1.7044160251634062, "learning_rate": 1.9702322308350675e-07, "loss": 0.6563, "step": 2370 }, { "epoch": 0.9196160186172792, "grad_norm": 1.8519890163076211, "learning_rate": 1.951457140391788e-07, "loss": 0.6128, "step": 2371 }, { "epoch": 0.9200038785998255, "grad_norm": 2.861460100881092, "learning_rate": 1.9327701556332569e-07, "loss": 0.701, "step": 2372 }, { "epoch": 0.9203917385823718, "grad_norm": 2.1572391935946644, "learning_rate": 1.9141713108254413e-07, "loss": 0.6639, "step": 2373 }, { "epoch": 0.9207795985649181, "grad_norm": 3.2304449655289202, "learning_rate": 1.895660640072683e-07, "loss": 0.6763, "step": 2374 }, { "epoch": 0.9211674585474644, "grad_norm": 2.5653999185630054, "learning_rate": 1.8772381773176417e-07, "loss": 0.668, "step": 2375 }, { "epoch": 0.9215553185300107, "grad_norm": 2.4484343994787032, "learning_rate": 1.8589039563412291e-07, "loss": 0.652, "step": 2376 }, { "epoch": 0.9219431785125569, "grad_norm": 1.9174334132953017, "learning_rate": 1.8406580107625583e-07, "loss": 0.6089, "step": 2377 }, { "epoch": 0.9223310384951032, "grad_norm": 1.727610691663483, "learning_rate": 1.8225003740388546e-07, "loss": 0.6284, "step": 2378 }, { "epoch": 0.9227188984776495, "grad_norm": 2.5432132131240253, "learning_rate": 1.804431079465435e-07, "loss": 0.6155, "step": 2379 }, { "epoch": 0.9231067584601959, "grad_norm": 2.0960396069125666, "learning_rate": 1.7864501601756236e-07, "loss": 0.5998, "step": 2380 }, { "epoch": 0.9234946184427422, "grad_norm": 2.583430388860488, "learning_rate": 1.7685576491406676e-07, "loss": 0.6583, "step": 2381 }, { "epoch": 0.9238824784252885, "grad_norm": 1.993935943787717, "learning_rate": 1.7507535791697338e-07, "loss": 0.6281, "step": 2382 }, { "epoch": 0.9242703384078348, "grad_norm": 2.8679239988368015, "learning_rate": 1.733037982909791e-07, "loss": 0.6156, "step": 2383 }, { "epoch": 0.9246581983903811, "grad_norm": 2.2303964324062013, "learning_rate": 1.7154108928455926e-07, "loss": 0.6496, "step": 2384 }, { "epoch": 0.9250460583729274, "grad_norm": 3.0183867683114927, "learning_rate": 1.697872341299589e-07, "loss": 0.6957, "step": 2385 }, { "epoch": 0.9254339183554737, "grad_norm": 2.4801824217640713, "learning_rate": 1.6804223604318825e-07, "loss": 0.622, "step": 2386 }, { "epoch": 0.9258217783380199, "grad_norm": 3.012627444299355, "learning_rate": 1.6630609822401612e-07, "loss": 0.6695, "step": 2387 }, { "epoch": 0.9262096383205662, "grad_norm": 2.1869026595163694, "learning_rate": 1.6457882385596647e-07, "loss": 0.6858, "step": 2388 }, { "epoch": 0.9265974983031126, "grad_norm": 1.9848941681332382, "learning_rate": 1.6286041610630742e-07, "loss": 0.6296, "step": 2389 }, { "epoch": 0.9269853582856589, "grad_norm": 3.269228353052787, "learning_rate": 1.6115087812605123e-07, "loss": 0.6392, "step": 2390 }, { "epoch": 0.9273732182682052, "grad_norm": 2.329330900691631, "learning_rate": 1.5945021304994368e-07, "loss": 0.6964, "step": 2391 }, { "epoch": 0.9277610782507515, "grad_norm": 2.0577310155180495, "learning_rate": 1.577584239964619e-07, "loss": 0.6181, "step": 2392 }, { "epoch": 0.9281489382332978, "grad_norm": 2.661663790003621, "learning_rate": 1.560755140678072e-07, "loss": 0.7832, "step": 2393 }, { "epoch": 0.9285367982158441, "grad_norm": 2.4973132132084417, "learning_rate": 1.5440148634989827e-07, "loss": 0.6111, "step": 2394 }, { "epoch": 0.9289246581983904, "grad_norm": 2.0360855579463073, "learning_rate": 1.527363439123669e-07, "loss": 0.6426, "step": 2395 }, { "epoch": 0.9293125181809366, "grad_norm": 2.334790596227784, "learning_rate": 1.5108008980855405e-07, "loss": 0.716, "step": 2396 }, { "epoch": 0.9297003781634829, "grad_norm": 2.151934194432014, "learning_rate": 1.4943272707550028e-07, "loss": 0.6273, "step": 2397 }, { "epoch": 0.9300882381460293, "grad_norm": 2.2278376381918332, "learning_rate": 1.477942587339426e-07, "loss": 0.6483, "step": 2398 }, { "epoch": 0.9304760981285756, "grad_norm": 2.6457143110659045, "learning_rate": 1.4616468778830939e-07, "loss": 0.6098, "step": 2399 }, { "epoch": 0.9308639581111219, "grad_norm": 2.27323098807673, "learning_rate": 1.4454401722671264e-07, "loss": 0.6323, "step": 2400 }, { "epoch": 0.9312518180936682, "grad_norm": 2.059102859359682, "learning_rate": 1.4293225002094456e-07, "loss": 0.6281, "step": 2401 }, { "epoch": 0.9316396780762145, "grad_norm": 2.2882273432592397, "learning_rate": 1.413293891264722e-07, "loss": 0.6083, "step": 2402 }, { "epoch": 0.9320275380587608, "grad_norm": 2.85171849582778, "learning_rate": 1.3973543748243002e-07, "loss": 0.6946, "step": 2403 }, { "epoch": 0.9324153980413071, "grad_norm": 2.685797480672074, "learning_rate": 1.3815039801161723e-07, "loss": 0.6822, "step": 2404 }, { "epoch": 0.9328032580238534, "grad_norm": 3.195169961609648, "learning_rate": 1.3657427362048893e-07, "loss": 0.6691, "step": 2405 }, { "epoch": 0.9331911180063996, "grad_norm": 1.8729941940701655, "learning_rate": 1.350070671991549e-07, "loss": 0.5942, "step": 2406 }, { "epoch": 0.933578977988946, "grad_norm": 1.8462615938261502, "learning_rate": 1.3344878162137087e-07, "loss": 0.5565, "step": 2407 }, { "epoch": 0.9339668379714923, "grad_norm": 2.840473585180799, "learning_rate": 1.3189941974453502e-07, "loss": 0.7075, "step": 2408 }, { "epoch": 0.9343546979540386, "grad_norm": 1.8341440925843129, "learning_rate": 1.3035898440968197e-07, "loss": 0.6386, "step": 2409 }, { "epoch": 0.9347425579365849, "grad_norm": 2.2851768897814595, "learning_rate": 1.2882747844147893e-07, "loss": 0.6995, "step": 2410 }, { "epoch": 0.9351304179191312, "grad_norm": 2.7490631931345013, "learning_rate": 1.273049046482183e-07, "loss": 0.6802, "step": 2411 }, { "epoch": 0.9355182779016775, "grad_norm": 2.719145044871563, "learning_rate": 1.257912658218141e-07, "loss": 0.6322, "step": 2412 }, { "epoch": 0.9359061378842238, "grad_norm": 2.7428104707552152, "learning_rate": 1.242865647377972e-07, "loss": 0.7028, "step": 2413 }, { "epoch": 0.9362939978667701, "grad_norm": 2.460694300718825, "learning_rate": 1.2279080415530832e-07, "loss": 0.6436, "step": 2414 }, { "epoch": 0.9366818578493163, "grad_norm": 2.8187748680433593, "learning_rate": 1.2130398681709564e-07, "loss": 0.7357, "step": 2415 }, { "epoch": 0.9370697178318627, "grad_norm": 2.0812686162349996, "learning_rate": 1.1982611544950617e-07, "loss": 0.6236, "step": 2416 }, { "epoch": 0.937457577814409, "grad_norm": 2.586955155625382, "learning_rate": 1.1835719276248491e-07, "loss": 0.6437, "step": 2417 }, { "epoch": 0.9378454377969553, "grad_norm": 2.696623260010667, "learning_rate": 1.1689722144956672e-07, "loss": 0.6361, "step": 2418 }, { "epoch": 0.9382332977795016, "grad_norm": 1.7995711237705823, "learning_rate": 1.1544620418787289e-07, "loss": 0.6399, "step": 2419 }, { "epoch": 0.9386211577620479, "grad_norm": 2.1507680600461505, "learning_rate": 1.1400414363810564e-07, "loss": 0.6442, "step": 2420 }, { "epoch": 0.9390090177445942, "grad_norm": 3.0771717576646633, "learning_rate": 1.1257104244454309e-07, "loss": 0.6179, "step": 2421 }, { "epoch": 0.9393968777271405, "grad_norm": 2.1253422455767095, "learning_rate": 1.1114690323503652e-07, "loss": 0.5816, "step": 2422 }, { "epoch": 0.9397847377096868, "grad_norm": 2.1302709424632433, "learning_rate": 1.0973172862100145e-07, "loss": 0.6426, "step": 2423 }, { "epoch": 0.9401725976922332, "grad_norm": 2.0832790761816042, "learning_rate": 1.0832552119741658e-07, "loss": 0.6754, "step": 2424 }, { "epoch": 0.9405604576747794, "grad_norm": 2.2703838940592207, "learning_rate": 1.0692828354281704e-07, "loss": 0.6359, "step": 2425 }, { "epoch": 0.9409483176573257, "grad_norm": 2.5181440903514556, "learning_rate": 1.0554001821929061e-07, "loss": 0.6701, "step": 2426 }, { "epoch": 0.941336177639872, "grad_norm": 2.0899183627889264, "learning_rate": 1.0416072777247266e-07, "loss": 0.5902, "step": 2427 }, { "epoch": 0.9417240376224183, "grad_norm": 2.628644463841617, "learning_rate": 1.0279041473154117e-07, "loss": 0.6646, "step": 2428 }, { "epoch": 0.9421118976049646, "grad_norm": 2.355108425246451, "learning_rate": 1.0142908160921283e-07, "loss": 0.6408, "step": 2429 }, { "epoch": 0.9424997575875109, "grad_norm": 2.0769342580520744, "learning_rate": 1.0007673090173808e-07, "loss": 0.5965, "step": 2430 }, { "epoch": 0.9428876175700572, "grad_norm": 2.045109595150016, "learning_rate": 9.873336508889664e-08, "loss": 0.6516, "step": 2431 }, { "epoch": 0.9432754775526035, "grad_norm": 2.0641184171030167, "learning_rate": 9.739898663399194e-08, "loss": 0.6357, "step": 2432 }, { "epoch": 0.9436633375351499, "grad_norm": 2.2502470638085055, "learning_rate": 9.607359798384785e-08, "loss": 0.679, "step": 2433 }, { "epoch": 0.9440511975176961, "grad_norm": 2.148478120793914, "learning_rate": 9.475720156880419e-08, "loss": 0.5904, "step": 2434 }, { "epoch": 0.9444390575002424, "grad_norm": 2.615223800023759, "learning_rate": 9.344979980271174e-08, "loss": 0.6748, "step": 2435 }, { "epoch": 0.9448269174827887, "grad_norm": 2.41872274587414, "learning_rate": 9.215139508292726e-08, "loss": 0.6453, "step": 2436 }, { "epoch": 0.945214777465335, "grad_norm": 2.184054369224218, "learning_rate": 9.086198979031125e-08, "loss": 0.6603, "step": 2437 }, { "epoch": 0.9456026374478813, "grad_norm": 2.343687644902258, "learning_rate": 8.95815862892202e-08, "loss": 0.6541, "step": 2438 }, { "epoch": 0.9459904974304276, "grad_norm": 2.724265815788407, "learning_rate": 8.831018692750604e-08, "loss": 0.6597, "step": 2439 }, { "epoch": 0.9463783574129739, "grad_norm": 2.3825017373144175, "learning_rate": 8.704779403650943e-08, "loss": 0.6444, "step": 2440 }, { "epoch": 0.9467662173955202, "grad_norm": 2.3939849729658613, "learning_rate": 8.579440993105537e-08, "loss": 0.7167, "step": 2441 }, { "epoch": 0.9471540773780666, "grad_norm": 2.6040588075625117, "learning_rate": 8.45500369094504e-08, "loss": 0.6886, "step": 2442 }, { "epoch": 0.9475419373606129, "grad_norm": 2.492642893973516, "learning_rate": 8.331467725347708e-08, "loss": 0.6352, "step": 2443 }, { "epoch": 0.9479297973431591, "grad_norm": 2.16532056045003, "learning_rate": 8.20883332283906e-08, "loss": 0.6294, "step": 2444 }, { "epoch": 0.9483176573257054, "grad_norm": 1.808922493595823, "learning_rate": 8.087100708291384e-08, "loss": 0.6419, "step": 2445 }, { "epoch": 0.9487055173082517, "grad_norm": 2.1378004340387293, "learning_rate": 7.966270104923457e-08, "loss": 0.6305, "step": 2446 }, { "epoch": 0.949093377290798, "grad_norm": 2.920906221259346, "learning_rate": 7.846341734300044e-08, "loss": 0.7163, "step": 2447 }, { "epoch": 0.9494812372733443, "grad_norm": 2.022037412802286, "learning_rate": 7.727315816331515e-08, "loss": 0.6053, "step": 2448 }, { "epoch": 0.9498690972558906, "grad_norm": 2.876999791151563, "learning_rate": 7.609192569273339e-08, "loss": 0.6818, "step": 2449 }, { "epoch": 0.950256957238437, "grad_norm": 1.7689209871632172, "learning_rate": 7.491972209725807e-08, "loss": 0.6317, "step": 2450 }, { "epoch": 0.9506448172209833, "grad_norm": 1.7984528622647873, "learning_rate": 7.375654952633704e-08, "loss": 0.6346, "step": 2451 }, { "epoch": 0.9510326772035296, "grad_norm": 2.0088601966692714, "learning_rate": 7.26024101128564e-08, "loss": 0.7052, "step": 2452 }, { "epoch": 0.9514205371860758, "grad_norm": 2.8145226300133217, "learning_rate": 7.14573059731405e-08, "loss": 0.6946, "step": 2453 }, { "epoch": 0.9518083971686221, "grad_norm": 2.251929277043298, "learning_rate": 7.032123920694356e-08, "loss": 0.629, "step": 2454 }, { "epoch": 0.9521962571511684, "grad_norm": 2.8030007549688403, "learning_rate": 6.919421189744979e-08, "loss": 0.636, "step": 2455 }, { "epoch": 0.9525841171337147, "grad_norm": 2.2152505404742753, "learning_rate": 6.807622611126718e-08, "loss": 0.6005, "step": 2456 }, { "epoch": 0.952971977116261, "grad_norm": 2.4650085607656345, "learning_rate": 6.696728389842532e-08, "loss": 0.6601, "step": 2457 }, { "epoch": 0.9533598370988073, "grad_norm": 2.350017806193688, "learning_rate": 6.58673872923693e-08, "loss": 0.69, "step": 2458 }, { "epoch": 0.9537476970813537, "grad_norm": 2.8111665207971916, "learning_rate": 6.477653830995855e-08, "loss": 0.6188, "step": 2459 }, { "epoch": 0.9541355570639, "grad_norm": 1.9833264095768304, "learning_rate": 6.369473895146194e-08, "loss": 0.6385, "step": 2460 }, { "epoch": 0.9545234170464463, "grad_norm": 2.96355192740182, "learning_rate": 6.262199120055379e-08, "loss": 0.6448, "step": 2461 }, { "epoch": 0.9549112770289926, "grad_norm": 2.260228076822843, "learning_rate": 6.15582970243117e-08, "loss": 0.6073, "step": 2462 }, { "epoch": 0.9552991370115388, "grad_norm": 2.3817864415621735, "learning_rate": 6.050365837320993e-08, "loss": 0.6339, "step": 2463 }, { "epoch": 0.9556869969940851, "grad_norm": 2.7051510455762204, "learning_rate": 5.945807718111929e-08, "loss": 0.6927, "step": 2464 }, { "epoch": 0.9560748569766314, "grad_norm": 3.236984520810184, "learning_rate": 5.842155536530281e-08, "loss": 0.6534, "step": 2465 }, { "epoch": 0.9564627169591777, "grad_norm": 2.3005127379878934, "learning_rate": 5.739409482640956e-08, "loss": 0.5924, "step": 2466 }, { "epoch": 0.956850576941724, "grad_norm": 2.2483541282041344, "learning_rate": 5.6375697448474155e-08, "loss": 0.6597, "step": 2467 }, { "epoch": 0.9572384369242704, "grad_norm": 2.193889048704225, "learning_rate": 5.536636509891225e-08, "loss": 0.6641, "step": 2468 }, { "epoch": 0.9576262969068167, "grad_norm": 2.2185860755626523, "learning_rate": 5.436609962851724e-08, "loss": 0.6495, "step": 2469 }, { "epoch": 0.958014156889363, "grad_norm": 2.3401178015395816, "learning_rate": 5.3374902871456965e-08, "loss": 0.6208, "step": 2470 }, { "epoch": 0.9584020168719093, "grad_norm": 2.084896169602109, "learning_rate": 5.23927766452692e-08, "loss": 0.5463, "step": 2471 }, { "epoch": 0.9587898768544555, "grad_norm": 1.9933799836093262, "learning_rate": 5.1419722750859494e-08, "loss": 0.6399, "step": 2472 }, { "epoch": 0.9591777368370018, "grad_norm": 2.7093007264069007, "learning_rate": 5.0455742972498335e-08, "loss": 0.6402, "step": 2473 }, { "epoch": 0.9595655968195481, "grad_norm": 2.376706471315945, "learning_rate": 4.950083907781733e-08, "loss": 0.6382, "step": 2474 }, { "epoch": 0.9599534568020944, "grad_norm": 2.296741906654527, "learning_rate": 4.855501281780528e-08, "loss": 0.6652, "step": 2475 }, { "epoch": 0.9603413167846407, "grad_norm": 2.217521217806659, "learning_rate": 4.7618265926804854e-08, "loss": 0.7085, "step": 2476 }, { "epoch": 0.960729176767187, "grad_norm": 2.8569808908263488, "learning_rate": 4.6690600122510924e-08, "loss": 0.6026, "step": 2477 }, { "epoch": 0.9611170367497334, "grad_norm": 2.2480078067777707, "learning_rate": 4.577201710596613e-08, "loss": 0.5729, "step": 2478 }, { "epoch": 0.9615048967322797, "grad_norm": 2.457166880310885, "learning_rate": 4.486251856155921e-08, "loss": 0.678, "step": 2479 }, { "epoch": 0.961892756714826, "grad_norm": 2.104723339190166, "learning_rate": 4.3962106157019455e-08, "loss": 0.671, "step": 2480 }, { "epoch": 0.9622806166973723, "grad_norm": 3.0232137970332498, "learning_rate": 4.30707815434156e-08, "loss": 0.7145, "step": 2481 }, { "epoch": 0.9626684766799185, "grad_norm": 2.216172536243986, "learning_rate": 4.2188546355153016e-08, "loss": 0.6227, "step": 2482 }, { "epoch": 0.9630563366624648, "grad_norm": 2.7047243164284405, "learning_rate": 4.1315402209968766e-08, "loss": 0.7002, "step": 2483 }, { "epoch": 0.9634441966450111, "grad_norm": 2.202776431580375, "learning_rate": 4.045135070893047e-08, "loss": 0.6075, "step": 2484 }, { "epoch": 0.9638320566275574, "grad_norm": 2.9093559134232168, "learning_rate": 3.9596393436432954e-08, "loss": 0.6845, "step": 2485 }, { "epoch": 0.9642199166101038, "grad_norm": 2.350731677482875, "learning_rate": 3.8750531960194405e-08, "loss": 0.6376, "step": 2486 }, { "epoch": 0.9646077765926501, "grad_norm": 2.425665300585113, "learning_rate": 3.791376783125467e-08, "loss": 0.6202, "step": 2487 }, { "epoch": 0.9649956365751964, "grad_norm": 2.5971620597901888, "learning_rate": 3.70861025839725e-08, "loss": 0.6894, "step": 2488 }, { "epoch": 0.9653834965577427, "grad_norm": 2.7622014552919136, "learning_rate": 3.62675377360211e-08, "loss": 0.6499, "step": 2489 }, { "epoch": 0.965771356540289, "grad_norm": 1.995933516754342, "learning_rate": 3.5458074788387585e-08, "loss": 0.6116, "step": 2490 }, { "epoch": 0.9661592165228352, "grad_norm": 1.8839932622872373, "learning_rate": 3.465771522536854e-08, "loss": 0.6781, "step": 2491 }, { "epoch": 0.9665470765053815, "grad_norm": 1.9941368099806747, "learning_rate": 3.386646051456721e-08, "loss": 0.6205, "step": 2492 }, { "epoch": 0.9669349364879278, "grad_norm": 1.7761901835862757, "learning_rate": 3.308431210689245e-08, "loss": 0.6153, "step": 2493 }, { "epoch": 0.9673227964704741, "grad_norm": 3.5197971919550133, "learning_rate": 3.231127143655422e-08, "loss": 0.7002, "step": 2494 }, { "epoch": 0.9677106564530205, "grad_norm": 2.642906802751097, "learning_rate": 3.154733992106307e-08, "loss": 0.6175, "step": 2495 }, { "epoch": 0.9680985164355668, "grad_norm": 1.7991470755931405, "learning_rate": 3.0792518961225147e-08, "loss": 0.6691, "step": 2496 }, { "epoch": 0.9684863764181131, "grad_norm": 2.533686029935981, "learning_rate": 3.004680994114051e-08, "loss": 0.6836, "step": 2497 }, { "epoch": 0.9688742364006594, "grad_norm": 2.278917474565878, "learning_rate": 2.9310214228202016e-08, "loss": 0.6249, "step": 2498 }, { "epoch": 0.9692620963832057, "grad_norm": 2.316602608635925, "learning_rate": 2.8582733173090904e-08, "loss": 0.6447, "step": 2499 }, { "epoch": 0.9696499563657519, "grad_norm": 2.584008487449923, "learning_rate": 2.7864368109775108e-08, "loss": 0.6048, "step": 2500 }, { "epoch": 0.9696499563657519, "eval_loss": 1.3041239976882935, "eval_runtime": 6.0304, "eval_samples_per_second": 0.166, "eval_steps_per_second": 0.166, "step": 2500 }, { "epoch": 0.9700378163482982, "grad_norm": 2.4494667074819314, "learning_rate": 2.7155120355506493e-08, "loss": 0.6227, "step": 2501 }, { "epoch": 0.9704256763308445, "grad_norm": 1.841962569320598, "learning_rate": 2.645499121081918e-08, "loss": 0.6502, "step": 2502 }, { "epoch": 0.9708135363133908, "grad_norm": 2.546420356659433, "learning_rate": 2.5763981959526786e-08, "loss": 0.6933, "step": 2503 }, { "epoch": 0.9712013962959372, "grad_norm": 2.1621659801954687, "learning_rate": 2.5082093868718515e-08, "loss": 0.7051, "step": 2504 }, { "epoch": 0.9715892562784835, "grad_norm": 2.247712292381037, "learning_rate": 2.440932818876085e-08, "loss": 0.637, "step": 2505 }, { "epoch": 0.9719771162610298, "grad_norm": 2.449929664313162, "learning_rate": 2.3745686153290314e-08, "loss": 0.6492, "step": 2506 }, { "epoch": 0.9723649762435761, "grad_norm": 2.420037625647971, "learning_rate": 2.3091168979214595e-08, "loss": 0.637, "step": 2507 }, { "epoch": 0.9727528362261224, "grad_norm": 2.082581603897288, "learning_rate": 2.2445777866709208e-08, "loss": 0.6456, "step": 2508 }, { "epoch": 0.9731406962086687, "grad_norm": 2.3128591235371827, "learning_rate": 2.1809513999215825e-08, "loss": 0.6916, "step": 2509 }, { "epoch": 0.9735285561912149, "grad_norm": 2.123225961721447, "learning_rate": 2.1182378543438408e-08, "loss": 0.655, "step": 2510 }, { "epoch": 0.9739164161737612, "grad_norm": 2.5482393401366332, "learning_rate": 2.0564372649343743e-08, "loss": 0.6025, "step": 2511 }, { "epoch": 0.9743042761563075, "grad_norm": 2.1036777194278917, "learning_rate": 1.9955497450157012e-08, "loss": 0.5799, "step": 2512 }, { "epoch": 0.9746921361388539, "grad_norm": 2.909545074065832, "learning_rate": 1.935575406236123e-08, "loss": 0.6164, "step": 2513 }, { "epoch": 0.9750799961214002, "grad_norm": 2.251720689514894, "learning_rate": 1.8765143585693924e-08, "loss": 0.6785, "step": 2514 }, { "epoch": 0.9754678561039465, "grad_norm": 2.4435492041836975, "learning_rate": 1.8183667103146007e-08, "loss": 0.6534, "step": 2515 }, { "epoch": 0.9758557160864928, "grad_norm": 2.482502861259122, "learning_rate": 1.7611325680960133e-08, "loss": 0.6304, "step": 2516 }, { "epoch": 0.9762435760690391, "grad_norm": 2.647892417343947, "learning_rate": 1.7048120368627352e-08, "loss": 0.7246, "step": 2517 }, { "epoch": 0.9766314360515854, "grad_norm": 1.9766137533856996, "learning_rate": 1.6494052198886557e-08, "loss": 0.6133, "step": 2518 }, { "epoch": 0.9770192960341316, "grad_norm": 2.049255340153611, "learning_rate": 1.5949122187721154e-08, "loss": 0.6126, "step": 2519 }, { "epoch": 0.9774071560166779, "grad_norm": 2.2870634127306433, "learning_rate": 1.541333133436018e-08, "loss": 0.6613, "step": 2520 }, { "epoch": 0.9777950159992242, "grad_norm": 3.016640696834442, "learning_rate": 1.4886680621271631e-08, "loss": 0.6863, "step": 2521 }, { "epoch": 0.9781828759817706, "grad_norm": 1.8978024863335048, "learning_rate": 1.4369171014165795e-08, "loss": 0.608, "step": 2522 }, { "epoch": 0.9785707359643169, "grad_norm": 2.8749243880680595, "learning_rate": 1.3860803461989148e-08, "loss": 0.6668, "step": 2523 }, { "epoch": 0.9789585959468632, "grad_norm": 1.8431870494800988, "learning_rate": 1.336157889692602e-08, "loss": 0.6779, "step": 2524 }, { "epoch": 0.9793464559294095, "grad_norm": 2.374067945081439, "learning_rate": 1.2871498234394707e-08, "loss": 0.6278, "step": 2525 }, { "epoch": 0.9797343159119558, "grad_norm": 3.004625517959071, "learning_rate": 1.2390562373046367e-08, "loss": 0.654, "step": 2526 }, { "epoch": 0.9801221758945021, "grad_norm": 2.2008355851127304, "learning_rate": 1.1918772194764449e-08, "loss": 0.6702, "step": 2527 }, { "epoch": 0.9805100358770484, "grad_norm": 2.7508346897483174, "learning_rate": 1.1456128564660273e-08, "loss": 0.6398, "step": 2528 }, { "epoch": 0.9808978958595946, "grad_norm": 2.9553487851211706, "learning_rate": 1.1002632331076346e-08, "loss": 0.7347, "step": 2529 }, { "epoch": 0.981285755842141, "grad_norm": 1.8969091557785236, "learning_rate": 1.0558284325578038e-08, "loss": 0.5963, "step": 2530 }, { "epoch": 0.9816736158246873, "grad_norm": 2.2083902780359965, "learning_rate": 1.012308536295914e-08, "loss": 0.6286, "step": 2531 }, { "epoch": 0.9820614758072336, "grad_norm": 2.849745631238844, "learning_rate": 9.69703624123519e-09, "loss": 0.6653, "step": 2532 }, { "epoch": 0.9824493357897799, "grad_norm": 1.7924366171848145, "learning_rate": 9.280137741643492e-09, "loss": 0.6371, "step": 2533 }, { "epoch": 0.9828371957723262, "grad_norm": 3.444362655935515, "learning_rate": 8.872390628643645e-09, "loss": 0.7456, "step": 2534 }, { "epoch": 0.9832250557548725, "grad_norm": 2.6455737464950895, "learning_rate": 8.473795649913686e-09, "loss": 0.6473, "step": 2535 }, { "epoch": 0.9836129157374188, "grad_norm": 2.382073161938767, "learning_rate": 8.084353536348955e-09, "loss": 0.7334, "step": 2536 }, { "epoch": 0.9840007757199651, "grad_norm": 3.0250171405886013, "learning_rate": 7.704065002062112e-09, "loss": 0.612, "step": 2537 }, { "epoch": 0.9843886357025113, "grad_norm": 2.367768245101841, "learning_rate": 7.332930744380906e-09, "loss": 0.6512, "step": 2538 }, { "epoch": 0.9847764956850577, "grad_norm": 2.756216902638447, "learning_rate": 6.9709514438470735e-09, "loss": 0.663, "step": 2539 }, { "epoch": 0.985164355667604, "grad_norm": 2.851098773641774, "learning_rate": 6.618127764215221e-09, "loss": 0.6433, "step": 2540 }, { "epoch": 0.9855522156501503, "grad_norm": 2.6317272696145917, "learning_rate": 6.274460352451162e-09, "loss": 0.6664, "step": 2541 }, { "epoch": 0.9859400756326966, "grad_norm": 1.911549304334903, "learning_rate": 5.939949838731363e-09, "loss": 0.6384, "step": 2542 }, { "epoch": 0.9863279356152429, "grad_norm": 2.423553811666132, "learning_rate": 5.614596836440722e-09, "loss": 0.726, "step": 2543 }, { "epoch": 0.9867157955977892, "grad_norm": 2.9835916360208556, "learning_rate": 5.298401942173681e-09, "loss": 0.6409, "step": 2544 }, { "epoch": 0.9871036555803355, "grad_norm": 1.6649552721676637, "learning_rate": 4.991365735730336e-09, "loss": 0.5967, "step": 2545 }, { "epoch": 0.9874915155628818, "grad_norm": 2.39844673737037, "learning_rate": 4.69348878011644e-09, "loss": 0.6569, "step": 2546 }, { "epoch": 0.9878793755454282, "grad_norm": 2.6403999220638292, "learning_rate": 4.4047716215439575e-09, "loss": 0.6563, "step": 2547 }, { "epoch": 0.9882672355279744, "grad_norm": 2.3109345371816934, "learning_rate": 4.125214789427734e-09, "loss": 0.6674, "step": 2548 }, { "epoch": 0.9886550955105207, "grad_norm": 2.749885881165021, "learning_rate": 3.854818796385495e-09, "loss": 0.6474, "step": 2549 }, { "epoch": 0.989042955493067, "grad_norm": 2.1763261130122906, "learning_rate": 3.593584138237294e-09, "loss": 0.6542, "step": 2550 }, { "epoch": 0.9894308154756133, "grad_norm": 2.482043334398038, "learning_rate": 3.341511294004396e-09, "loss": 0.7153, "step": 2551 }, { "epoch": 0.9898186754581596, "grad_norm": 2.8567334069556902, "learning_rate": 3.098600725907619e-09, "loss": 0.6525, "step": 2552 }, { "epoch": 0.9902065354407059, "grad_norm": 2.161431137209311, "learning_rate": 2.86485287936733e-09, "loss": 0.6725, "step": 2553 }, { "epoch": 0.9905943954232522, "grad_norm": 2.2830015639665913, "learning_rate": 2.640268183002337e-09, "loss": 0.6516, "step": 2554 }, { "epoch": 0.9909822554057985, "grad_norm": 2.3706480943306443, "learning_rate": 2.424847048629886e-09, "loss": 0.6895, "step": 2555 }, { "epoch": 0.9913701153883449, "grad_norm": 1.8004433131986881, "learning_rate": 2.2185898712628884e-09, "loss": 0.5967, "step": 2556 }, { "epoch": 0.991757975370891, "grad_norm": 2.111242238965235, "learning_rate": 2.02149702911103e-09, "loss": 0.6642, "step": 2557 }, { "epoch": 0.9921458353534374, "grad_norm": 2.290769599610399, "learning_rate": 1.8335688835802169e-09, "loss": 0.684, "step": 2558 }, { "epoch": 0.9925336953359837, "grad_norm": 2.353145293234086, "learning_rate": 1.654805779270352e-09, "loss": 0.6239, "step": 2559 }, { "epoch": 0.99292155531853, "grad_norm": 1.823337260775195, "learning_rate": 1.4852080439758943e-09, "loss": 0.6286, "step": 2560 }, { "epoch": 0.9933094153010763, "grad_norm": 2.086997745374766, "learning_rate": 1.3247759886853006e-09, "loss": 0.5983, "step": 2561 }, { "epoch": 0.9936972752836226, "grad_norm": 3.060136624701758, "learning_rate": 1.173509907579362e-09, "loss": 0.6906, "step": 2562 }, { "epoch": 0.9940851352661689, "grad_norm": 3.52301013974828, "learning_rate": 1.0314100780317583e-09, "loss": 0.678, "step": 2563 }, { "epoch": 0.9944729952487152, "grad_norm": 2.356969684195123, "learning_rate": 8.984767606085021e-10, "loss": 0.6953, "step": 2564 }, { "epoch": 0.9948608552312616, "grad_norm": 2.91639246642519, "learning_rate": 7.747101990662753e-10, "loss": 0.6488, "step": 2565 }, { "epoch": 0.9952487152138079, "grad_norm": 2.4741734871408507, "learning_rate": 6.601106203535379e-10, "loss": 0.6819, "step": 2566 }, { "epoch": 0.9956365751963541, "grad_norm": 2.0964206240720085, "learning_rate": 5.546782346099733e-10, "loss": 0.6556, "step": 2567 }, { "epoch": 0.9960244351789004, "grad_norm": 2.6657407570308567, "learning_rate": 4.5841323516426784e-10, "loss": 0.647, "step": 2568 }, { "epoch": 0.9964122951614467, "grad_norm": 1.6763316326458497, "learning_rate": 3.713157985363314e-10, "loss": 0.5961, "step": 2569 }, { "epoch": 0.996800155143993, "grad_norm": 2.3325263727253898, "learning_rate": 2.9338608443452154e-10, "loss": 0.6266, "step": 2570 }, { "epoch": 0.9971880151265393, "grad_norm": 2.235953601615239, "learning_rate": 2.2462423575675408e-10, "loss": 0.7006, "step": 2571 }, { "epoch": 0.9975758751090856, "grad_norm": 2.5678703544179333, "learning_rate": 1.6503037859105787e-10, "loss": 0.6645, "step": 2572 }, { "epoch": 0.9979637350916319, "grad_norm": 2.2903478941193796, "learning_rate": 1.1460462221279944e-10, "loss": 0.6135, "step": 2573 }, { "epoch": 0.9983515950741783, "grad_norm": 1.9488209578081999, "learning_rate": 7.334705908745854e-11, "loss": 0.6037, "step": 2574 }, { "epoch": 0.9987394550567246, "grad_norm": 2.4469657040291937, "learning_rate": 4.125776486785249e-11, "loss": 0.6947, "step": 2575 }, { "epoch": 0.9991273150392708, "grad_norm": 1.9568735881400912, "learning_rate": 1.8336798395801604e-11, "loss": 0.623, "step": 2576 }, { "epoch": 0.9995151750218171, "grad_norm": 1.9695005604891378, "learning_rate": 4.584201700463808e-12, "loss": 0.6359, "step": 2577 }, { "epoch": 0.9999030350043634, "grad_norm": 1.768432352967807, "learning_rate": 0.0, "loss": 0.5827, "step": 2578 }, { "epoch": 0.9999030350043634, "step": 2578, "total_flos": 7.318297512614298e+16, "train_loss": 0.7496252401426284, "train_runtime": 79930.219, "train_samples_per_second": 4.129, "train_steps_per_second": 0.032 } ], "logging_steps": 1.0, "max_steps": 2578, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.318297512614298e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }