diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,104833 @@
+{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 14970, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002004008016032064, + "grad_norm": 233.1182845475113, + "learning_rate": 0.0, + "loss": 5.021, + "step": 1 + }, + { + "epoch": 0.0004008016032064128, + "grad_norm": 116.25234351859154, + "learning_rate": 6.680026720106882e-09, + "loss": 4.9202, + "step": 2 + }, + { + "epoch": 0.0006012024048096192, + "grad_norm": 141.59136266518655, + "learning_rate": 1.3360053440213763e-08, + "loss": 4.8662, + "step": 3 + }, + { + "epoch": 0.0008016032064128256, + "grad_norm": 72.01338962191006, + "learning_rate": 2.004008016032064e-08, + "loss": 4.1307, + "step": 4 + }, + { + "epoch": 0.001002004008016032, + "grad_norm": 156.6306465557448, + "learning_rate": 2.6720106880427527e-08, + "loss": 4.8704, + "step": 5 + }, + { + "epoch": 0.0012024048096192384, + "grad_norm": 136.77163433768501, + "learning_rate": 3.340013360053441e-08, + "loss": 4.6019, + "step": 6 + }, + { + "epoch": 0.0014028056112224449, + "grad_norm": 76.45868540267129, + "learning_rate": 4.008016032064128e-08, + "loss": 4.297, + "step": 7 + }, + { + "epoch": 0.0016032064128256513, + "grad_norm": 114.00729255523113, + "learning_rate": 4.676018704074817e-08, + "loss": 5.2973, + "step": 8 + }, + { + "epoch": 0.0018036072144288577, + "grad_norm": 76.50370052306724, + "learning_rate": 5.3440213760855053e-08, + "loss": 4.4858, + "step": 9 + }, + { + "epoch": 0.002004008016032064, + "grad_norm": 74.06940338259653, + "learning_rate": 6.012024048096193e-08, + "loss": 4.1965, + "step": 10 + }, + { + "epoch": 0.0022044088176352704, + "grad_norm": 86.94918058668554, + "learning_rate": 6.680026720106882e-08, + "loss": 4.6047, + "step": 11 + }, + { + "epoch": 0.002404809619238477, + "grad_norm": 85.70416269460215, + "learning_rate": 7.348029392117568e-08, + "loss": 4.5391, + "step": 12 + }, + { + "epoch": 0.0026052104208416833, + "grad_norm": 52.4564254446488, + "learning_rate": 8.016032064128256e-08, + "loss": 3.8975, + "step": 13 + }, + { + "epoch": 0.0028056112224448897, + "grad_norm": 108.52774877895322, + "learning_rate": 8.684034736138946e-08, + "loss": 4.8908, + "step": 14 + }, + { + "epoch": 0.003006012024048096, + "grad_norm": 96.51998609555481, + "learning_rate": 9.352037408149634e-08, + "loss": 5.3394, + "step": 15 + }, + { + "epoch": 0.0032064128256513026, + "grad_norm": 108.82804052931347, + "learning_rate": 1.0020040080160321e-07, + "loss": 4.4127, + "step": 16 + }, + { + "epoch": 0.003406813627254509, + "grad_norm": 77.8194631218101, + "learning_rate": 1.0688042752171011e-07, + "loss": 3.8781, + "step": 17 + }, + { + "epoch": 0.0036072144288577155, + "grad_norm": 127.51560965319358, + "learning_rate": 1.1356045424181699e-07, + "loss": 4.6356, + "step": 18 + }, + { + "epoch": 0.003807615230460922, + "grad_norm": 156.5105277031196, + "learning_rate": 1.2024048096192385e-07, + "loss": 4.5185, + "step": 19 + }, + { + "epoch": 0.004008016032064128, + "grad_norm": 160.31459706045698, + "learning_rate": 1.2692050768203073e-07, + "loss": 4.5763, + "step": 20 + }, + { + "epoch": 0.004208416833667334, + "grad_norm": 144.27311258165204, + "learning_rate": 1.3360053440213764e-07, + "loss": 4.4223, + "step": 21 + }, + { + "epoch":
0.004408817635270541, + "grad_norm": 78.69424225549884, + "learning_rate": 1.402805611222445e-07, + "loss": 4.3829, + "step": 22 + }, + { + "epoch": 0.004609218436873747, + "grad_norm": 148.23862199565824, + "learning_rate": 1.4696058784235137e-07, + "loss": 5.4896, + "step": 23 + }, + { + "epoch": 0.004809619238476954, + "grad_norm": 176.3980652034837, + "learning_rate": 1.5364061456245825e-07, + "loss": 5.5475, + "step": 24 + }, + { + "epoch": 0.00501002004008016, + "grad_norm": 136.13789737604168, + "learning_rate": 1.6032064128256513e-07, + "loss": 4.2485, + "step": 25 + }, + { + "epoch": 0.0052104208416833666, + "grad_norm": 116.34762096355176, + "learning_rate": 1.6700066800267203e-07, + "loss": 5.0126, + "step": 26 + }, + { + "epoch": 0.005410821643286573, + "grad_norm": 76.33162799641016, + "learning_rate": 1.736806947227789e-07, + "loss": 4.5422, + "step": 27 + }, + { + "epoch": 0.0056112224448897794, + "grad_norm": 114.40190242615975, + "learning_rate": 1.803607214428858e-07, + "loss": 5.089, + "step": 28 + }, + { + "epoch": 0.005811623246492986, + "grad_norm": 119.57767005347809, + "learning_rate": 1.8704074816299267e-07, + "loss": 4.5376, + "step": 29 + }, + { + "epoch": 0.006012024048096192, + "grad_norm": 135.03179715957725, + "learning_rate": 1.9372077488309955e-07, + "loss": 5.0101, + "step": 30 + }, + { + "epoch": 0.006212424849699399, + "grad_norm": 77.94424350687707, + "learning_rate": 2.0040080160320643e-07, + "loss": 4.6793, + "step": 31 + }, + { + "epoch": 0.006412825651302605, + "grad_norm": 90.64721621076947, + "learning_rate": 2.070808283233133e-07, + "loss": 4.6797, + "step": 32 + }, + { + "epoch": 0.006613226452905812, + "grad_norm": 73.06657132308227, + "learning_rate": 2.1376085504342021e-07, + "loss": 4.3615, + "step": 33 + }, + { + "epoch": 0.006813627254509018, + "grad_norm": 243.81500654720776, + "learning_rate": 2.204408817635271e-07, + "loss": 4.6312, + "step": 34 + }, + { + "epoch": 0.0070140280561122245, + "grad_norm": 108.61420954221225, + "learning_rate": 2.2712090848363397e-07, + "loss": 3.8882, + "step": 35 + }, + { + "epoch": 0.007214428857715431, + "grad_norm": 80.50701627139055, + "learning_rate": 2.3380093520374082e-07, + "loss": 4.7108, + "step": 36 + }, + { + "epoch": 0.007414829659318637, + "grad_norm": 145.72353466810657, + "learning_rate": 2.404809619238477e-07, + "loss": 4.9356, + "step": 37 + }, + { + "epoch": 0.007615230460921844, + "grad_norm": 67.83563706496697, + "learning_rate": 2.471609886439546e-07, + "loss": 4.777, + "step": 38 + }, + { + "epoch": 0.00781563126252505, + "grad_norm": 63.40927179071586, + "learning_rate": 2.5384101536406146e-07, + "loss": 4.2472, + "step": 39 + }, + { + "epoch": 0.008016032064128256, + "grad_norm": 64.30057801386324, + "learning_rate": 2.605210420841683e-07, + "loss": 4.4143, + "step": 40 + }, + { + "epoch": 0.008216432865731463, + "grad_norm": 178.5547617021841, + "learning_rate": 2.6720106880427527e-07, + "loss": 5.1779, + "step": 41 + }, + { + "epoch": 0.008416833667334669, + "grad_norm": 143.0720621982713, + "learning_rate": 2.738810955243821e-07, + "loss": 5.0327, + "step": 42 + }, + { + "epoch": 0.008617234468937876, + "grad_norm": 70.60932172097976, + "learning_rate": 2.80561122244489e-07, + "loss": 4.5313, + "step": 43 + }, + { + "epoch": 0.008817635270541082, + "grad_norm": 84.43328526381814, + "learning_rate": 2.872411489645959e-07, + "loss": 4.7231, + "step": 44 + }, + { + "epoch": 0.009018036072144289, + "grad_norm": 77.318869068078, + "learning_rate": 2.9392117568470274e-07, + "loss": 
3.9638, + "step": 45 + }, + { + "epoch": 0.009218436873747494, + "grad_norm": 51.24029531674491, + "learning_rate": 3.0060120240480964e-07, + "loss": 4.097, + "step": 46 + }, + { + "epoch": 0.009418837675350702, + "grad_norm": 82.56838179767422, + "learning_rate": 3.072812291249165e-07, + "loss": 4.4833, + "step": 47 + }, + { + "epoch": 0.009619238476953907, + "grad_norm": 85.14333937463665, + "learning_rate": 3.139612558450234e-07, + "loss": 4.0704, + "step": 48 + }, + { + "epoch": 0.009819639278557115, + "grad_norm": 165.18559309870128, + "learning_rate": 3.2064128256513025e-07, + "loss": 4.8622, + "step": 49 + }, + { + "epoch": 0.01002004008016032, + "grad_norm": 120.51295471016691, + "learning_rate": 3.2732130928523716e-07, + "loss": 4.7884, + "step": 50 + }, + { + "epoch": 0.010220440881763528, + "grad_norm": 52.162918948288244, + "learning_rate": 3.3400133600534407e-07, + "loss": 4.0368, + "step": 51 + }, + { + "epoch": 0.010420841683366733, + "grad_norm": 130.29452189374882, + "learning_rate": 3.406813627254509e-07, + "loss": 4.9343, + "step": 52 + }, + { + "epoch": 0.01062124248496994, + "grad_norm": 73.13447694541131, + "learning_rate": 3.473613894455578e-07, + "loss": 4.4696, + "step": 53 + }, + { + "epoch": 0.010821643286573146, + "grad_norm": 130.62063631372382, + "learning_rate": 3.540414161656647e-07, + "loss": 5.0507, + "step": 54 + }, + { + "epoch": 0.011022044088176353, + "grad_norm": 105.37325897211323, + "learning_rate": 3.607214428857716e-07, + "loss": 4.5029, + "step": 55 + }, + { + "epoch": 0.011222444889779559, + "grad_norm": 131.26467067237476, + "learning_rate": 3.6740146960587843e-07, + "loss": 4.2229, + "step": 56 + }, + { + "epoch": 0.011422845691382766, + "grad_norm": 65.42313729101073, + "learning_rate": 3.7408149632598534e-07, + "loss": 4.4164, + "step": 57 + }, + { + "epoch": 0.011623246492985972, + "grad_norm": 94.20439249261153, + "learning_rate": 3.8076152304609225e-07, + "loss": 4.6002, + "step": 58 + }, + { + "epoch": 0.011823647294589179, + "grad_norm": 45.354870811038325, + "learning_rate": 3.874415497661991e-07, + "loss": 4.0277, + "step": 59 + }, + { + "epoch": 0.012024048096192385, + "grad_norm": 94.72456068089946, + "learning_rate": 3.94121576486306e-07, + "loss": 4.4145, + "step": 60 + }, + { + "epoch": 0.012224448897795592, + "grad_norm": 111.75133690844805, + "learning_rate": 4.0080160320641286e-07, + "loss": 5.0581, + "step": 61 + }, + { + "epoch": 0.012424849699398798, + "grad_norm": 118.27911426681854, + "learning_rate": 4.0748162992651976e-07, + "loss": 4.9045, + "step": 62 + }, + { + "epoch": 0.012625250501002005, + "grad_norm": 114.91724155220436, + "learning_rate": 4.141616566466266e-07, + "loss": 4.8945, + "step": 63 + }, + { + "epoch": 0.01282565130260521, + "grad_norm": 88.07758113452708, + "learning_rate": 4.208416833667335e-07, + "loss": 4.6527, + "step": 64 + }, + { + "epoch": 0.013026052104208416, + "grad_norm": 53.9519194799033, + "learning_rate": 4.2752171008684043e-07, + "loss": 3.7137, + "step": 65 + }, + { + "epoch": 0.013226452905811623, + "grad_norm": 101.7167851545079, + "learning_rate": 4.342017368069473e-07, + "loss": 4.6143, + "step": 66 + }, + { + "epoch": 0.013426853707414829, + "grad_norm": 58.79978433359711, + "learning_rate": 4.408817635270542e-07, + "loss": 4.3448, + "step": 67 + }, + { + "epoch": 0.013627254509018036, + "grad_norm": 62.810390489464766, + "learning_rate": 4.47561790247161e-07, + "loss": 3.949, + "step": 68 + }, + { + "epoch": 0.013827655310621242, + "grad_norm": 59.790291125744716, + 
"learning_rate": 4.5424181696726794e-07, + "loss": 4.2289, + "step": 69 + }, + { + "epoch": 0.014028056112224449, + "grad_norm": 82.71226330693653, + "learning_rate": 4.6092184368737474e-07, + "loss": 4.2901, + "step": 70 + }, + { + "epoch": 0.014228456913827655, + "grad_norm": 56.90871150067609, + "learning_rate": 4.6760187040748165e-07, + "loss": 4.1681, + "step": 71 + }, + { + "epoch": 0.014428857715430862, + "grad_norm": 56.502669979456684, + "learning_rate": 4.742818971275886e-07, + "loss": 4.339, + "step": 72 + }, + { + "epoch": 0.014629258517034067, + "grad_norm": 88.2386825674213, + "learning_rate": 4.809619238476954e-07, + "loss": 4.15, + "step": 73 + }, + { + "epoch": 0.014829659318637275, + "grad_norm": 82.9531703024562, + "learning_rate": 4.876419505678023e-07, + "loss": 4.7262, + "step": 74 + }, + { + "epoch": 0.01503006012024048, + "grad_norm": 96.05517345655596, + "learning_rate": 4.943219772879092e-07, + "loss": 3.9894, + "step": 75 + }, + { + "epoch": 0.015230460921843688, + "grad_norm": 106.56230375309212, + "learning_rate": 5.010020040080161e-07, + "loss": 4.8975, + "step": 76 + }, + { + "epoch": 0.015430861723446893, + "grad_norm": 62.95677202938884, + "learning_rate": 5.076820307281229e-07, + "loss": 3.9855, + "step": 77 + }, + { + "epoch": 0.0156312625250501, + "grad_norm": 96.07451556575995, + "learning_rate": 5.143620574482298e-07, + "loss": 4.5227, + "step": 78 + }, + { + "epoch": 0.015831663326653308, + "grad_norm": 45.91645847938717, + "learning_rate": 5.210420841683366e-07, + "loss": 4.1003, + "step": 79 + }, + { + "epoch": 0.01603206412825651, + "grad_norm": 87.94320216406048, + "learning_rate": 5.277221108884436e-07, + "loss": 4.1179, + "step": 80 + }, + { + "epoch": 0.01623246492985972, + "grad_norm": 61.61405005559732, + "learning_rate": 5.344021376085505e-07, + "loss": 4.1125, + "step": 81 + }, + { + "epoch": 0.016432865731462926, + "grad_norm": 80.58818235780882, + "learning_rate": 5.410821643286573e-07, + "loss": 4.6826, + "step": 82 + }, + { + "epoch": 0.016633266533066134, + "grad_norm": 69.48983185619107, + "learning_rate": 5.477621910487643e-07, + "loss": 3.8008, + "step": 83 + }, + { + "epoch": 0.016833667334669337, + "grad_norm": 58.094458036148296, + "learning_rate": 5.54442217768871e-07, + "loss": 3.8362, + "step": 84 + }, + { + "epoch": 0.017034068136272545, + "grad_norm": 98.17228067006174, + "learning_rate": 5.61122244488978e-07, + "loss": 4.0359, + "step": 85 + }, + { + "epoch": 0.017234468937875752, + "grad_norm": 88.39734719531887, + "learning_rate": 5.678022712090849e-07, + "loss": 4.4047, + "step": 86 + }, + { + "epoch": 0.01743486973947896, + "grad_norm": 74.06907913353581, + "learning_rate": 5.744822979291918e-07, + "loss": 4.4726, + "step": 87 + }, + { + "epoch": 0.017635270541082163, + "grad_norm": 114.30316078114426, + "learning_rate": 5.811623246492987e-07, + "loss": 4.2981, + "step": 88 + }, + { + "epoch": 0.01783567134268537, + "grad_norm": 59.15676311169365, + "learning_rate": 5.878423513694055e-07, + "loss": 4.1073, + "step": 89 + }, + { + "epoch": 0.018036072144288578, + "grad_norm": 147.06003478836683, + "learning_rate": 5.945223780895124e-07, + "loss": 4.4162, + "step": 90 + }, + { + "epoch": 0.018236472945891785, + "grad_norm": 80.90720382023162, + "learning_rate": 6.012024048096193e-07, + "loss": 4.6096, + "step": 91 + }, + { + "epoch": 0.01843687374749499, + "grad_norm": 43.21457056332791, + "learning_rate": 6.078824315297262e-07, + "loss": 4.2724, + "step": 92 + }, + { + "epoch": 0.018637274549098196, + "grad_norm": 
61.10932638198538, + "learning_rate": 6.14562458249833e-07, + "loss": 4.227, + "step": 93 + }, + { + "epoch": 0.018837675350701404, + "grad_norm": 60.466371229651344, + "learning_rate": 6.212424849699399e-07, + "loss": 4.0639, + "step": 94 + }, + { + "epoch": 0.01903807615230461, + "grad_norm": 56.69181191164283, + "learning_rate": 6.279225116900468e-07, + "loss": 4.3361, + "step": 95 + }, + { + "epoch": 0.019238476953907815, + "grad_norm": 94.90486406310912, + "learning_rate": 6.346025384101537e-07, + "loss": 4.0175, + "step": 96 + }, + { + "epoch": 0.019438877755511022, + "grad_norm": 128.6375513829131, + "learning_rate": 6.412825651302605e-07, + "loss": 4.3096, + "step": 97 + }, + { + "epoch": 0.01963927855711423, + "grad_norm": 53.19677187233393, + "learning_rate": 6.479625918503675e-07, + "loss": 4.2061, + "step": 98 + }, + { + "epoch": 0.019839679358717433, + "grad_norm": 80.50523437640472, + "learning_rate": 6.546426185704743e-07, + "loss": 4.255, + "step": 99 + }, + { + "epoch": 0.02004008016032064, + "grad_norm": 73.81675023464089, + "learning_rate": 6.613226452905812e-07, + "loss": 3.9587, + "step": 100 + }, + { + "epoch": 0.020240480961923848, + "grad_norm": 87.17495940348523, + "learning_rate": 6.680026720106881e-07, + "loss": 3.9015, + "step": 101 + }, + { + "epoch": 0.020440881763527055, + "grad_norm": 79.41282911965277, + "learning_rate": 6.74682698730795e-07, + "loss": 3.6279, + "step": 102 + }, + { + "epoch": 0.02064128256513026, + "grad_norm": 44.62106790056415, + "learning_rate": 6.813627254509018e-07, + "loss": 4.1445, + "step": 103 + }, + { + "epoch": 0.020841683366733466, + "grad_norm": 118.46242315274789, + "learning_rate": 6.880427521710086e-07, + "loss": 4.5732, + "step": 104 + }, + { + "epoch": 0.021042084168336674, + "grad_norm": 60.58323375056512, + "learning_rate": 6.947227788911156e-07, + "loss": 4.1109, + "step": 105 + }, + { + "epoch": 0.02124248496993988, + "grad_norm": 115.25001262389674, + "learning_rate": 7.014028056112226e-07, + "loss": 4.5549, + "step": 106 + }, + { + "epoch": 0.021442885771543085, + "grad_norm": 84.08228419146414, + "learning_rate": 7.080828323313294e-07, + "loss": 4.4807, + "step": 107 + }, + { + "epoch": 0.021643286573146292, + "grad_norm": 198.70449863594044, + "learning_rate": 7.147628590514364e-07, + "loss": 4.784, + "step": 108 + }, + { + "epoch": 0.0218436873747495, + "grad_norm": 49.09018337281011, + "learning_rate": 7.214428857715432e-07, + "loss": 3.837, + "step": 109 + }, + { + "epoch": 0.022044088176352707, + "grad_norm": 83.39241873512779, + "learning_rate": 7.2812291249165e-07, + "loss": 4.3615, + "step": 110 + }, + { + "epoch": 0.02224448897795591, + "grad_norm": 244.68314125793518, + "learning_rate": 7.348029392117569e-07, + "loss": 4.9516, + "step": 111 + }, + { + "epoch": 0.022444889779559118, + "grad_norm": 70.4993786934377, + "learning_rate": 7.414829659318639e-07, + "loss": 4.63, + "step": 112 + }, + { + "epoch": 0.022645290581162325, + "grad_norm": 73.05833354338584, + "learning_rate": 7.481629926519707e-07, + "loss": 4.5409, + "step": 113 + }, + { + "epoch": 0.022845691382765532, + "grad_norm": 58.509622214578116, + "learning_rate": 7.548430193720775e-07, + "loss": 4.1875, + "step": 114 + }, + { + "epoch": 0.023046092184368736, + "grad_norm": 37.426771980304714, + "learning_rate": 7.615230460921845e-07, + "loss": 3.6731, + "step": 115 + }, + { + "epoch": 0.023246492985971944, + "grad_norm": 74.57083859718176, + "learning_rate": 7.682030728122913e-07, + "loss": 4.3055, + "step": 116 + }, + { + "epoch": 
0.02344689378757515, + "grad_norm": 58.60471322379089, + "learning_rate": 7.748830995323982e-07, + "loss": 4.0264, + "step": 117 + }, + { + "epoch": 0.023647294589178358, + "grad_norm": 64.4817244010331, + "learning_rate": 7.81563126252505e-07, + "loss": 4.1025, + "step": 118 + }, + { + "epoch": 0.023847695390781562, + "grad_norm": 74.29331747128265, + "learning_rate": 7.88243152972612e-07, + "loss": 4.1792, + "step": 119 + }, + { + "epoch": 0.02404809619238477, + "grad_norm": 79.90880195872026, + "learning_rate": 7.949231796927188e-07, + "loss": 4.5107, + "step": 120 + }, + { + "epoch": 0.024248496993987977, + "grad_norm": 52.02786068734346, + "learning_rate": 8.016032064128257e-07, + "loss": 3.5498, + "step": 121 + }, + { + "epoch": 0.024448897795591184, + "grad_norm": 81.27314195194698, + "learning_rate": 8.082832331329326e-07, + "loss": 3.7797, + "step": 122 + }, + { + "epoch": 0.024649298597194388, + "grad_norm": 53.37787271658115, + "learning_rate": 8.149632598530395e-07, + "loss": 3.4299, + "step": 123 + }, + { + "epoch": 0.024849699398797595, + "grad_norm": 83.12601739941188, + "learning_rate": 8.216432865731463e-07, + "loss": 4.3913, + "step": 124 + }, + { + "epoch": 0.025050100200400802, + "grad_norm": 60.52875433089574, + "learning_rate": 8.283233132932532e-07, + "loss": 3.9027, + "step": 125 + }, + { + "epoch": 0.02525050100200401, + "grad_norm": 66.67795519562064, + "learning_rate": 8.350033400133601e-07, + "loss": 4.1337, + "step": 126 + }, + { + "epoch": 0.025450901803607213, + "grad_norm": 59.185902875804594, + "learning_rate": 8.41683366733467e-07, + "loss": 4.1095, + "step": 127 + }, + { + "epoch": 0.02565130260521042, + "grad_norm": 55.97954598701067, + "learning_rate": 8.483633934535738e-07, + "loss": 4.043, + "step": 128 + }, + { + "epoch": 0.025851703406813628, + "grad_norm": 120.87925756570307, + "learning_rate": 8.550434201736809e-07, + "loss": 4.3636, + "step": 129 + }, + { + "epoch": 0.026052104208416832, + "grad_norm": 106.7843348163236, + "learning_rate": 8.617234468937877e-07, + "loss": 4.6725, + "step": 130 + }, + { + "epoch": 0.02625250501002004, + "grad_norm": 54.61378704358549, + "learning_rate": 8.684034736138946e-07, + "loss": 4.2308, + "step": 131 + }, + { + "epoch": 0.026452905811623247, + "grad_norm": 92.2888830082676, + "learning_rate": 8.750835003340014e-07, + "loss": 3.5883, + "step": 132 + }, + { + "epoch": 0.026653306613226454, + "grad_norm": 75.22495810761832, + "learning_rate": 8.817635270541084e-07, + "loss": 3.5011, + "step": 133 + }, + { + "epoch": 0.026853707414829658, + "grad_norm": 113.15802907312874, + "learning_rate": 8.884435537742152e-07, + "loss": 4.1344, + "step": 134 + }, + { + "epoch": 0.027054108216432865, + "grad_norm": 92.02298277483824, + "learning_rate": 8.95123580494322e-07, + "loss": 3.7739, + "step": 135 + }, + { + "epoch": 0.027254509018036072, + "grad_norm": 47.994868929995995, + "learning_rate": 9.01803607214429e-07, + "loss": 3.7912, + "step": 136 + }, + { + "epoch": 0.02745490981963928, + "grad_norm": 66.5385843982007, + "learning_rate": 9.084836339345359e-07, + "loss": 4.6665, + "step": 137 + }, + { + "epoch": 0.027655310621242483, + "grad_norm": 175.30655712075315, + "learning_rate": 9.151636606546427e-07, + "loss": 4.2414, + "step": 138 + }, + { + "epoch": 0.02785571142284569, + "grad_norm": 113.59294055036699, + "learning_rate": 9.218436873747495e-07, + "loss": 4.6257, + "step": 139 + }, + { + "epoch": 0.028056112224448898, + "grad_norm": 51.85004424547012, + "learning_rate": 9.285237140948565e-07, + "loss": 
3.9288, + "step": 140 + }, + { + "epoch": 0.028256513026052105, + "grad_norm": 73.19897913676581, + "learning_rate": 9.352037408149633e-07, + "loss": 4.4479, + "step": 141 + }, + { + "epoch": 0.02845691382765531, + "grad_norm": 49.97981224940877, + "learning_rate": 9.418837675350702e-07, + "loss": 2.9083, + "step": 142 + }, + { + "epoch": 0.028657314629258517, + "grad_norm": 69.51563783997081, + "learning_rate": 9.485637942551772e-07, + "loss": 4.375, + "step": 143 + }, + { + "epoch": 0.028857715430861724, + "grad_norm": 50.996408181127855, + "learning_rate": 9.55243820975284e-07, + "loss": 4.0834, + "step": 144 + }, + { + "epoch": 0.02905811623246493, + "grad_norm": 88.5411136719378, + "learning_rate": 9.619238476953908e-07, + "loss": 3.8876, + "step": 145 + }, + { + "epoch": 0.029258517034068135, + "grad_norm": 94.61072294361017, + "learning_rate": 9.686038744154976e-07, + "loss": 4.0429, + "step": 146 + }, + { + "epoch": 0.029458917835671342, + "grad_norm": 130.74018651332045, + "learning_rate": 9.752839011356046e-07, + "loss": 3.6281, + "step": 147 + }, + { + "epoch": 0.02965931863727455, + "grad_norm": 56.38145721764789, + "learning_rate": 9.819639278557114e-07, + "loss": 3.9052, + "step": 148 + }, + { + "epoch": 0.029859719438877757, + "grad_norm": 52.022308295776604, + "learning_rate": 9.886439545758184e-07, + "loss": 3.8081, + "step": 149 + }, + { + "epoch": 0.03006012024048096, + "grad_norm": 75.68965381628891, + "learning_rate": 9.953239812959252e-07, + "loss": 4.1626, + "step": 150 + }, + { + "epoch": 0.030260521042084168, + "grad_norm": 48.40503028363651, + "learning_rate": 1.0020040080160322e-06, + "loss": 4.1482, + "step": 151 + }, + { + "epoch": 0.030460921843687375, + "grad_norm": 72.4308013758533, + "learning_rate": 1.008684034736139e-06, + "loss": 3.9401, + "step": 152 + }, + { + "epoch": 0.030661322645290583, + "grad_norm": 81.49869134832907, + "learning_rate": 1.0153640614562458e-06, + "loss": 3.7046, + "step": 153 + }, + { + "epoch": 0.030861723446893786, + "grad_norm": 93.50731478302438, + "learning_rate": 1.0220440881763529e-06, + "loss": 3.9536, + "step": 154 + }, + { + "epoch": 0.031062124248496994, + "grad_norm": 95.06581295515103, + "learning_rate": 1.0287241148964597e-06, + "loss": 4.4421, + "step": 155 + }, + { + "epoch": 0.0312625250501002, + "grad_norm": 117.52889562408885, + "learning_rate": 1.0354041416165665e-06, + "loss": 4.8463, + "step": 156 + }, + { + "epoch": 0.031462925851703405, + "grad_norm": 42.812419358834944, + "learning_rate": 1.0420841683366733e-06, + "loss": 3.8298, + "step": 157 + }, + { + "epoch": 0.031663326653306616, + "grad_norm": 70.30541806403379, + "learning_rate": 1.0487641950567803e-06, + "loss": 4.3588, + "step": 158 + }, + { + "epoch": 0.03186372745490982, + "grad_norm": 67.82418334836903, + "learning_rate": 1.0554442217768873e-06, + "loss": 3.7385, + "step": 159 + }, + { + "epoch": 0.03206412825651302, + "grad_norm": 69.01150834770962, + "learning_rate": 1.062124248496994e-06, + "loss": 3.6991, + "step": 160 + }, + { + "epoch": 0.032264529058116234, + "grad_norm": 73.65518235129407, + "learning_rate": 1.068804275217101e-06, + "loss": 4.1811, + "step": 161 + }, + { + "epoch": 0.03246492985971944, + "grad_norm": 74.2822521516228, + "learning_rate": 1.0754843019372079e-06, + "loss": 3.3786, + "step": 162 + }, + { + "epoch": 0.03266533066132264, + "grad_norm": 60.73305408250144, + "learning_rate": 1.0821643286573147e-06, + "loss": 3.6866, + "step": 163 + }, + { + "epoch": 0.03286573146292585, + "grad_norm": 96.97054962496365, + 
"learning_rate": 1.0888443553774215e-06, + "loss": 3.5976, + "step": 164 + }, + { + "epoch": 0.033066132264529056, + "grad_norm": 47.64943351717658, + "learning_rate": 1.0955243820975285e-06, + "loss": 3.7548, + "step": 165 + }, + { + "epoch": 0.03326653306613227, + "grad_norm": 80.57131618284414, + "learning_rate": 1.1022044088176353e-06, + "loss": 4.3468, + "step": 166 + }, + { + "epoch": 0.03346693386773547, + "grad_norm": 215.92070556025402, + "learning_rate": 1.108884435537742e-06, + "loss": 3.446, + "step": 167 + }, + { + "epoch": 0.033667334669338675, + "grad_norm": 62.22320578378476, + "learning_rate": 1.1155644622578491e-06, + "loss": 3.5355, + "step": 168 + }, + { + "epoch": 0.033867735470941886, + "grad_norm": 54.81408117980186, + "learning_rate": 1.122244488977956e-06, + "loss": 3.7005, + "step": 169 + }, + { + "epoch": 0.03406813627254509, + "grad_norm": 94.88576390729528, + "learning_rate": 1.128924515698063e-06, + "loss": 4.6451, + "step": 170 + }, + { + "epoch": 0.03426853707414829, + "grad_norm": 56.31668881376985, + "learning_rate": 1.1356045424181697e-06, + "loss": 3.7618, + "step": 171 + }, + { + "epoch": 0.034468937875751504, + "grad_norm": 54.84979006119315, + "learning_rate": 1.1422845691382767e-06, + "loss": 3.9207, + "step": 172 + }, + { + "epoch": 0.03466933867735471, + "grad_norm": 164.5177937195445, + "learning_rate": 1.1489645958583835e-06, + "loss": 3.8104, + "step": 173 + }, + { + "epoch": 0.03486973947895792, + "grad_norm": 44.34693104148756, + "learning_rate": 1.1556446225784903e-06, + "loss": 3.6964, + "step": 174 + }, + { + "epoch": 0.03507014028056112, + "grad_norm": 63.14879596679864, + "learning_rate": 1.1623246492985973e-06, + "loss": 4.2025, + "step": 175 + }, + { + "epoch": 0.035270541082164326, + "grad_norm": 49.04100445456969, + "learning_rate": 1.1690046760187041e-06, + "loss": 3.822, + "step": 176 + }, + { + "epoch": 0.03547094188376754, + "grad_norm": 67.94360061736685, + "learning_rate": 1.175684702738811e-06, + "loss": 3.4276, + "step": 177 + }, + { + "epoch": 0.03567134268537074, + "grad_norm": 62.34087166155569, + "learning_rate": 1.182364729458918e-06, + "loss": 3.8707, + "step": 178 + }, + { + "epoch": 0.035871743486973945, + "grad_norm": 86.83308740542886, + "learning_rate": 1.1890447561790248e-06, + "loss": 3.8846, + "step": 179 + }, + { + "epoch": 0.036072144288577156, + "grad_norm": 59.3312024669915, + "learning_rate": 1.1957247828991318e-06, + "loss": 4.0148, + "step": 180 + }, + { + "epoch": 0.03627254509018036, + "grad_norm": 89.7720811073352, + "learning_rate": 1.2024048096192386e-06, + "loss": 4.2944, + "step": 181 + }, + { + "epoch": 0.03647294589178357, + "grad_norm": 64.62568441036206, + "learning_rate": 1.2090848363393456e-06, + "loss": 3.696, + "step": 182 + }, + { + "epoch": 0.036673346693386774, + "grad_norm": 78.1809341058202, + "learning_rate": 1.2157648630594524e-06, + "loss": 3.6548, + "step": 183 + }, + { + "epoch": 0.03687374749498998, + "grad_norm": 76.68831360809888, + "learning_rate": 1.2224448897795592e-06, + "loss": 3.7094, + "step": 184 + }, + { + "epoch": 0.03707414829659319, + "grad_norm": 80.66791262671138, + "learning_rate": 1.229124916499666e-06, + "loss": 4.2097, + "step": 185 + }, + { + "epoch": 0.03727454909819639, + "grad_norm": 335.68562510392223, + "learning_rate": 1.235804943219773e-06, + "loss": 4.3172, + "step": 186 + }, + { + "epoch": 0.037474949899799596, + "grad_norm": 147.34506457949246, + "learning_rate": 1.2424849699398798e-06, + "loss": 3.7048, + "step": 187 + }, + { + "epoch": 
0.03767535070140281, + "grad_norm": 80.0006720000668, + "learning_rate": 1.2491649966599866e-06, + "loss": 4.2733, + "step": 188 + }, + { + "epoch": 0.03787575150300601, + "grad_norm": 74.27484376024044, + "learning_rate": 1.2558450233800936e-06, + "loss": 3.7613, + "step": 189 + }, + { + "epoch": 0.03807615230460922, + "grad_norm": 60.351114154392555, + "learning_rate": 1.2625250501002004e-06, + "loss": 3.8675, + "step": 190 + }, + { + "epoch": 0.038276553106212426, + "grad_norm": 37.951891972925175, + "learning_rate": 1.2692050768203074e-06, + "loss": 3.5295, + "step": 191 + }, + { + "epoch": 0.03847695390781563, + "grad_norm": 74.94955176011926, + "learning_rate": 1.2758851035404144e-06, + "loss": 3.4022, + "step": 192 + }, + { + "epoch": 0.03867735470941884, + "grad_norm": 48.36576424731231, + "learning_rate": 1.282565130260521e-06, + "loss": 3.8251, + "step": 193 + }, + { + "epoch": 0.038877755511022044, + "grad_norm": 85.25327489991805, + "learning_rate": 1.289245156980628e-06, + "loss": 4.4422, + "step": 194 + }, + { + "epoch": 0.03907815631262525, + "grad_norm": 91.06518784273273, + "learning_rate": 1.295925183700735e-06, + "loss": 4.3269, + "step": 195 + }, + { + "epoch": 0.03927855711422846, + "grad_norm": 62.148676094358954, + "learning_rate": 1.3026052104208416e-06, + "loss": 3.4612, + "step": 196 + }, + { + "epoch": 0.03947895791583166, + "grad_norm": 56.645245655807265, + "learning_rate": 1.3092852371409486e-06, + "loss": 3.7122, + "step": 197 + }, + { + "epoch": 0.039679358717434866, + "grad_norm": 137.28654534043974, + "learning_rate": 1.3159652638610557e-06, + "loss": 3.223, + "step": 198 + }, + { + "epoch": 0.03987975951903808, + "grad_norm": 80.17987669808389, + "learning_rate": 1.3226452905811624e-06, + "loss": 4.188, + "step": 199 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 179.3135411548064, + "learning_rate": 1.3293253173012692e-06, + "loss": 4.3718, + "step": 200 + }, + { + "epoch": 0.04028056112224449, + "grad_norm": 88.76248747555702, + "learning_rate": 1.3360053440213763e-06, + "loss": 3.7276, + "step": 201 + }, + { + "epoch": 0.040480961923847696, + "grad_norm": 171.2623483679073, + "learning_rate": 1.342685370741483e-06, + "loss": 3.9521, + "step": 202 + }, + { + "epoch": 0.0406813627254509, + "grad_norm": 66.14306196333686, + "learning_rate": 1.34936539746159e-06, + "loss": 4.2452, + "step": 203 + }, + { + "epoch": 0.04088176352705411, + "grad_norm": 60.89109186796276, + "learning_rate": 1.3560454241816967e-06, + "loss": 4.1155, + "step": 204 + }, + { + "epoch": 0.041082164328657314, + "grad_norm": 51.26761906167773, + "learning_rate": 1.3627254509018037e-06, + "loss": 3.8018, + "step": 205 + }, + { + "epoch": 0.04128256513026052, + "grad_norm": 64.43162472749887, + "learning_rate": 1.3694054776219107e-06, + "loss": 3.617, + "step": 206 + }, + { + "epoch": 0.04148296593186373, + "grad_norm": 58.01592453817006, + "learning_rate": 1.3760855043420173e-06, + "loss": 4.1165, + "step": 207 + }, + { + "epoch": 0.04168336673346693, + "grad_norm": 126.51837365547053, + "learning_rate": 1.3827655310621243e-06, + "loss": 3.7433, + "step": 208 + }, + { + "epoch": 0.04188376753507014, + "grad_norm": 59.72915363245001, + "learning_rate": 1.3894455577822313e-06, + "loss": 3.9898, + "step": 209 + }, + { + "epoch": 0.04208416833667335, + "grad_norm": 83.71013900291346, + "learning_rate": 1.396125584502338e-06, + "loss": 3.5693, + "step": 210 + }, + { + "epoch": 0.04228456913827655, + "grad_norm": 57.88266634545234, + "learning_rate": 1.4028056112224451e-06, + 
"loss": 4.0787, + "step": 211 + }, + { + "epoch": 0.04248496993987976, + "grad_norm": 49.06631732810578, + "learning_rate": 1.409485637942552e-06, + "loss": 3.5523, + "step": 212 + }, + { + "epoch": 0.042685370741482966, + "grad_norm": 78.36354780989343, + "learning_rate": 1.4161656646626587e-06, + "loss": 3.7993, + "step": 213 + }, + { + "epoch": 0.04288577154308617, + "grad_norm": 99.39784162650389, + "learning_rate": 1.4228456913827657e-06, + "loss": 4.312, + "step": 214 + }, + { + "epoch": 0.04308617234468938, + "grad_norm": 84.29729942547819, + "learning_rate": 1.4295257181028727e-06, + "loss": 4.0839, + "step": 215 + }, + { + "epoch": 0.043286573146292584, + "grad_norm": 83.93518510577644, + "learning_rate": 1.4362057448229793e-06, + "loss": 4.0039, + "step": 216 + }, + { + "epoch": 0.043486973947895795, + "grad_norm": 50.746114164949034, + "learning_rate": 1.4428857715430863e-06, + "loss": 4.2697, + "step": 217 + }, + { + "epoch": 0.043687374749499, + "grad_norm": 103.30585079244742, + "learning_rate": 1.4495657982631931e-06, + "loss": 4.06, + "step": 218 + }, + { + "epoch": 0.0438877755511022, + "grad_norm": 85.01114832356534, + "learning_rate": 1.4562458249833e-06, + "loss": 3.9176, + "step": 219 + }, + { + "epoch": 0.04408817635270541, + "grad_norm": 70.81673400573258, + "learning_rate": 1.462925851703407e-06, + "loss": 4.0868, + "step": 220 + }, + { + "epoch": 0.04428857715430862, + "grad_norm": 163.7105542628516, + "learning_rate": 1.4696058784235137e-06, + "loss": 3.7364, + "step": 221 + }, + { + "epoch": 0.04448897795591182, + "grad_norm": 86.57553909400875, + "learning_rate": 1.4762859051436208e-06, + "loss": 3.7218, + "step": 222 + }, + { + "epoch": 0.04468937875751503, + "grad_norm": 82.30666920796159, + "learning_rate": 1.4829659318637278e-06, + "loss": 4.078, + "step": 223 + }, + { + "epoch": 0.044889779559118236, + "grad_norm": 77.05797818281447, + "learning_rate": 1.4896459585838344e-06, + "loss": 4.2675, + "step": 224 + }, + { + "epoch": 0.045090180360721446, + "grad_norm": 63.42696397799475, + "learning_rate": 1.4963259853039414e-06, + "loss": 4.1174, + "step": 225 + }, + { + "epoch": 0.04529058116232465, + "grad_norm": 59.06457488455172, + "learning_rate": 1.5030060120240484e-06, + "loss": 4.3793, + "step": 226 + }, + { + "epoch": 0.045490981963927854, + "grad_norm": 75.23471524940949, + "learning_rate": 1.509686038744155e-06, + "loss": 4.1527, + "step": 227 + }, + { + "epoch": 0.045691382765531065, + "grad_norm": 67.44280088284634, + "learning_rate": 1.516366065464262e-06, + "loss": 4.2339, + "step": 228 + }, + { + "epoch": 0.04589178356713427, + "grad_norm": 159.24379904499057, + "learning_rate": 1.523046092184369e-06, + "loss": 4.6986, + "step": 229 + }, + { + "epoch": 0.04609218436873747, + "grad_norm": 65.627560860261, + "learning_rate": 1.5297261189044758e-06, + "loss": 4.0448, + "step": 230 + }, + { + "epoch": 0.04629258517034068, + "grad_norm": 89.0386854924914, + "learning_rate": 1.5364061456245826e-06, + "loss": 4.1695, + "step": 231 + }, + { + "epoch": 0.04649298597194389, + "grad_norm": 51.8959866705712, + "learning_rate": 1.5430861723446894e-06, + "loss": 3.5336, + "step": 232 + }, + { + "epoch": 0.04669338677354709, + "grad_norm": 42.00077355441906, + "learning_rate": 1.5497661990647964e-06, + "loss": 4.05, + "step": 233 + }, + { + "epoch": 0.0468937875751503, + "grad_norm": 87.60310559741576, + "learning_rate": 1.5564462257849034e-06, + "loss": 3.4892, + "step": 234 + }, + { + "epoch": 0.047094188376753505, + "grad_norm": 71.41360764076626, + 
"learning_rate": 1.56312625250501e-06, + "loss": 3.8545, + "step": 235 + }, + { + "epoch": 0.047294589178356716, + "grad_norm": 60.737909636333775, + "learning_rate": 1.569806279225117e-06, + "loss": 3.7618, + "step": 236 + }, + { + "epoch": 0.04749498997995992, + "grad_norm": 73.2555369474299, + "learning_rate": 1.576486305945224e-06, + "loss": 3.7154, + "step": 237 + }, + { + "epoch": 0.047695390781563124, + "grad_norm": 60.04623925615915, + "learning_rate": 1.5831663326653306e-06, + "loss": 4.0925, + "step": 238 + }, + { + "epoch": 0.047895791583166335, + "grad_norm": 58.425970018948384, + "learning_rate": 1.5898463593854376e-06, + "loss": 4.1526, + "step": 239 + }, + { + "epoch": 0.04809619238476954, + "grad_norm": 62.442589833528174, + "learning_rate": 1.5965263861055446e-06, + "loss": 4.1146, + "step": 240 + }, + { + "epoch": 0.04829659318637274, + "grad_norm": 122.68533138773194, + "learning_rate": 1.6032064128256514e-06, + "loss": 3.8496, + "step": 241 + }, + { + "epoch": 0.04849699398797595, + "grad_norm": 127.24793202549529, + "learning_rate": 1.6098864395457584e-06, + "loss": 4.8468, + "step": 242 + }, + { + "epoch": 0.04869739478957916, + "grad_norm": 64.60785864471976, + "learning_rate": 1.6165664662658652e-06, + "loss": 3.7697, + "step": 243 + }, + { + "epoch": 0.04889779559118237, + "grad_norm": 79.51006824185596, + "learning_rate": 1.623246492985972e-06, + "loss": 3.8793, + "step": 244 + }, + { + "epoch": 0.04909819639278557, + "grad_norm": 193.78591932037557, + "learning_rate": 1.629926519706079e-06, + "loss": 4.1879, + "step": 245 + }, + { + "epoch": 0.049298597194388775, + "grad_norm": 65.059940315542, + "learning_rate": 1.6366065464261856e-06, + "loss": 3.5266, + "step": 246 + }, + { + "epoch": 0.049498997995991986, + "grad_norm": 65.1400732802522, + "learning_rate": 1.6432865731462927e-06, + "loss": 4.0497, + "step": 247 + }, + { + "epoch": 0.04969939879759519, + "grad_norm": 55.99069627582577, + "learning_rate": 1.6499665998663997e-06, + "loss": 3.4766, + "step": 248 + }, + { + "epoch": 0.049899799599198394, + "grad_norm": 60.73712796242953, + "learning_rate": 1.6566466265865065e-06, + "loss": 4.1031, + "step": 249 + }, + { + "epoch": 0.050100200400801605, + "grad_norm": 63.449520342049446, + "learning_rate": 1.6633266533066133e-06, + "loss": 3.5152, + "step": 250 + }, + { + "epoch": 0.05030060120240481, + "grad_norm": 92.34033160650287, + "learning_rate": 1.6700066800267203e-06, + "loss": 4.2653, + "step": 251 + }, + { + "epoch": 0.05050100200400802, + "grad_norm": 60.38934847973838, + "learning_rate": 1.676686706746827e-06, + "loss": 3.81, + "step": 252 + }, + { + "epoch": 0.05070140280561122, + "grad_norm": 80.73412752062102, + "learning_rate": 1.683366733466934e-06, + "loss": 4.0991, + "step": 253 + }, + { + "epoch": 0.05090180360721443, + "grad_norm": 88.90920112147442, + "learning_rate": 1.690046760187041e-06, + "loss": 4.3639, + "step": 254 + }, + { + "epoch": 0.05110220440881764, + "grad_norm": 74.57698600149462, + "learning_rate": 1.6967267869071477e-06, + "loss": 4.138, + "step": 255 + }, + { + "epoch": 0.05130260521042084, + "grad_norm": 96.46105083299666, + "learning_rate": 1.7034068136272547e-06, + "loss": 4.7619, + "step": 256 + }, + { + "epoch": 0.051503006012024045, + "grad_norm": 80.57311028328085, + "learning_rate": 1.7100868403473617e-06, + "loss": 4.1069, + "step": 257 + }, + { + "epoch": 0.051703406813627256, + "grad_norm": 129.2023205237157, + "learning_rate": 1.7167668670674683e-06, + "loss": 3.8124, + "step": 258 + }, + { + "epoch": 
0.05190380761523046, + "grad_norm": 95.38042731465143, + "learning_rate": 1.7234468937875753e-06, + "loss": 3.9428, + "step": 259 + }, + { + "epoch": 0.052104208416833664, + "grad_norm": 104.2053034705093, + "learning_rate": 1.7301269205076821e-06, + "loss": 4.3752, + "step": 260 + }, + { + "epoch": 0.052304609218436875, + "grad_norm": 77.95701389834662, + "learning_rate": 1.7368069472277891e-06, + "loss": 4.1214, + "step": 261 + }, + { + "epoch": 0.05250501002004008, + "grad_norm": 203.42280558658504, + "learning_rate": 1.743486973947896e-06, + "loss": 3.8409, + "step": 262 + }, + { + "epoch": 0.05270541082164329, + "grad_norm": 65.10669005904107, + "learning_rate": 1.7501670006680027e-06, + "loss": 3.2614, + "step": 263 + }, + { + "epoch": 0.05290581162324649, + "grad_norm": 50.690322926895654, + "learning_rate": 1.7568470273881097e-06, + "loss": 3.7174, + "step": 264 + }, + { + "epoch": 0.0531062124248497, + "grad_norm": 74.32561672195109, + "learning_rate": 1.7635270541082167e-06, + "loss": 4.1856, + "step": 265 + }, + { + "epoch": 0.05330661322645291, + "grad_norm": 74.44804530930773, + "learning_rate": 1.7702070808283233e-06, + "loss": 4.2021, + "step": 266 + }, + { + "epoch": 0.05350701402805611, + "grad_norm": 55.124743724016724, + "learning_rate": 1.7768871075484303e-06, + "loss": 3.9715, + "step": 267 + }, + { + "epoch": 0.053707414829659315, + "grad_norm": 261.14745482579355, + "learning_rate": 1.7835671342685374e-06, + "loss": 3.7107, + "step": 268 + }, + { + "epoch": 0.053907815631262526, + "grad_norm": 134.67192214219264, + "learning_rate": 1.790247160988644e-06, + "loss": 3.9623, + "step": 269 + }, + { + "epoch": 0.05410821643286573, + "grad_norm": 59.88468508392641, + "learning_rate": 1.796927187708751e-06, + "loss": 4.4812, + "step": 270 + }, + { + "epoch": 0.05430861723446894, + "grad_norm": 84.49493616012587, + "learning_rate": 1.803607214428858e-06, + "loss": 4.2542, + "step": 271 + }, + { + "epoch": 0.054509018036072145, + "grad_norm": 196.01713912022947, + "learning_rate": 1.8102872411489648e-06, + "loss": 3.3998, + "step": 272 + }, + { + "epoch": 0.05470941883767535, + "grad_norm": 65.18262444057248, + "learning_rate": 1.8169672678690718e-06, + "loss": 3.5672, + "step": 273 + }, + { + "epoch": 0.05490981963927856, + "grad_norm": 54.262283063636815, + "learning_rate": 1.8236472945891784e-06, + "loss": 3.9953, + "step": 274 + }, + { + "epoch": 0.05511022044088176, + "grad_norm": 148.18842545454012, + "learning_rate": 1.8303273213092854e-06, + "loss": 4.1136, + "step": 275 + }, + { + "epoch": 0.05531062124248497, + "grad_norm": 129.72468656629076, + "learning_rate": 1.8370073480293924e-06, + "loss": 4.5995, + "step": 276 + }, + { + "epoch": 0.05551102204408818, + "grad_norm": 78.49835257585468, + "learning_rate": 1.843687374749499e-06, + "loss": 4.3939, + "step": 277 + }, + { + "epoch": 0.05571142284569138, + "grad_norm": 49.489588594557034, + "learning_rate": 1.850367401469606e-06, + "loss": 3.8003, + "step": 278 + }, + { + "epoch": 0.05591182364729459, + "grad_norm": 109.03989563983778, + "learning_rate": 1.857047428189713e-06, + "loss": 4.1134, + "step": 279 + }, + { + "epoch": 0.056112224448897796, + "grad_norm": 50.08481952038622, + "learning_rate": 1.8637274549098198e-06, + "loss": 3.8917, + "step": 280 + }, + { + "epoch": 0.056312625250501, + "grad_norm": 93.88826872897796, + "learning_rate": 1.8704074816299266e-06, + "loss": 4.2617, + "step": 281 + }, + { + "epoch": 0.05651302605210421, + "grad_norm": 78.72475197121199, + "learning_rate": 1.8770875083500336e-06, 
+ "loss": 3.9829, + "step": 282 + }, + { + "epoch": 0.056713426853707415, + "grad_norm": 57.24188186040767, + "learning_rate": 1.8837675350701404e-06, + "loss": 4.4959, + "step": 283 + }, + { + "epoch": 0.05691382765531062, + "grad_norm": 41.89043125804091, + "learning_rate": 1.8904475617902474e-06, + "loss": 3.3911, + "step": 284 + }, + { + "epoch": 0.05711422845691383, + "grad_norm": 69.77618993589998, + "learning_rate": 1.8971275885103544e-06, + "loss": 3.9698, + "step": 285 + }, + { + "epoch": 0.05731462925851703, + "grad_norm": 58.25688281150491, + "learning_rate": 1.903807615230461e-06, + "loss": 4.4098, + "step": 286 + }, + { + "epoch": 0.057515030060120244, + "grad_norm": 91.51763386598459, + "learning_rate": 1.910487641950568e-06, + "loss": 4.085, + "step": 287 + }, + { + "epoch": 0.05771543086172345, + "grad_norm": 64.08076950887911, + "learning_rate": 1.917167668670675e-06, + "loss": 3.9533, + "step": 288 + }, + { + "epoch": 0.05791583166332665, + "grad_norm": 65.73540827612361, + "learning_rate": 1.9238476953907816e-06, + "loss": 3.993, + "step": 289 + }, + { + "epoch": 0.05811623246492986, + "grad_norm": 65.85577402626004, + "learning_rate": 1.930527722110889e-06, + "loss": 4.5246, + "step": 290 + }, + { + "epoch": 0.058316633266533066, + "grad_norm": 50.84216402127399, + "learning_rate": 1.9372077488309952e-06, + "loss": 3.6822, + "step": 291 + }, + { + "epoch": 0.05851703406813627, + "grad_norm": 57.57811060251673, + "learning_rate": 1.9438877755511025e-06, + "loss": 3.6017, + "step": 292 + }, + { + "epoch": 0.05871743486973948, + "grad_norm": 55.9435578065231, + "learning_rate": 1.9505678022712093e-06, + "loss": 3.9163, + "step": 293 + }, + { + "epoch": 0.058917835671342685, + "grad_norm": 39.64392648659655, + "learning_rate": 1.957247828991316e-06, + "loss": 3.7763, + "step": 294 + }, + { + "epoch": 0.05911823647294589, + "grad_norm": 58.07985726420877, + "learning_rate": 1.963927855711423e-06, + "loss": 3.9132, + "step": 295 + }, + { + "epoch": 0.0593186372745491, + "grad_norm": 67.59401362237826, + "learning_rate": 1.97060788243153e-06, + "loss": 3.9728, + "step": 296 + }, + { + "epoch": 0.0595190380761523, + "grad_norm": 50.29123955987195, + "learning_rate": 1.977287909151637e-06, + "loss": 4.2808, + "step": 297 + }, + { + "epoch": 0.059719438877755514, + "grad_norm": 61.70671620165177, + "learning_rate": 1.9839679358717437e-06, + "loss": 3.4418, + "step": 298 + }, + { + "epoch": 0.05991983967935872, + "grad_norm": 61.540863362292086, + "learning_rate": 1.9906479625918505e-06, + "loss": 3.9666, + "step": 299 + }, + { + "epoch": 0.06012024048096192, + "grad_norm": 63.16310022008188, + "learning_rate": 1.9973279893119573e-06, + "loss": 3.7969, + "step": 300 + }, + { + "epoch": 0.06032064128256513, + "grad_norm": 272.1918492698633, + "learning_rate": 2.0040080160320645e-06, + "loss": 4.1162, + "step": 301 + }, + { + "epoch": 0.060521042084168336, + "grad_norm": 73.97258774528017, + "learning_rate": 2.010688042752171e-06, + "loss": 3.7717, + "step": 302 + }, + { + "epoch": 0.06072144288577154, + "grad_norm": 203.93892760906806, + "learning_rate": 2.017368069472278e-06, + "loss": 3.5752, + "step": 303 + }, + { + "epoch": 0.06092184368737475, + "grad_norm": 109.21373811478212, + "learning_rate": 2.024048096192385e-06, + "loss": 4.3866, + "step": 304 + }, + { + "epoch": 0.061122244488977955, + "grad_norm": 38.110049252317275, + "learning_rate": 2.0307281229124917e-06, + "loss": 3.4181, + "step": 305 + }, + { + "epoch": 0.061322645290581165, + "grad_norm": 121.09944104564782, + 
"learning_rate": 2.0374081496325985e-06, + "loss": 4.4905, + "step": 306 + }, + { + "epoch": 0.06152304609218437, + "grad_norm": 71.69205197768936, + "learning_rate": 2.0440881763527057e-06, + "loss": 3.7199, + "step": 307 + }, + { + "epoch": 0.06172344689378757, + "grad_norm": 53.58843825800108, + "learning_rate": 2.0507682030728125e-06, + "loss": 3.8124, + "step": 308 + }, + { + "epoch": 0.061923847695390784, + "grad_norm": 65.39237066543205, + "learning_rate": 2.0574482297929193e-06, + "loss": 4.1008, + "step": 309 + }, + { + "epoch": 0.06212424849699399, + "grad_norm": 53.099550229430626, + "learning_rate": 2.0641282565130265e-06, + "loss": 3.6053, + "step": 310 + }, + { + "epoch": 0.06232464929859719, + "grad_norm": 56.495670178894834, + "learning_rate": 2.070808283233133e-06, + "loss": 3.6951, + "step": 311 + }, + { + "epoch": 0.0625250501002004, + "grad_norm": 57.90306976006945, + "learning_rate": 2.07748830995324e-06, + "loss": 3.3195, + "step": 312 + }, + { + "epoch": 0.0627254509018036, + "grad_norm": 51.26844762806982, + "learning_rate": 2.0841683366733465e-06, + "loss": 4.1732, + "step": 313 + }, + { + "epoch": 0.06292585170340681, + "grad_norm": 61.6482108657367, + "learning_rate": 2.0908483633934537e-06, + "loss": 4.3676, + "step": 314 + }, + { + "epoch": 0.06312625250501001, + "grad_norm": 118.32414098774792, + "learning_rate": 2.0975283901135605e-06, + "loss": 3.4726, + "step": 315 + }, + { + "epoch": 0.06332665330661323, + "grad_norm": 74.15287961765146, + "learning_rate": 2.1042084168336673e-06, + "loss": 3.9423, + "step": 316 + }, + { + "epoch": 0.06352705410821644, + "grad_norm": 58.366817882151125, + "learning_rate": 2.1108884435537746e-06, + "loss": 3.5364, + "step": 317 + }, + { + "epoch": 0.06372745490981964, + "grad_norm": 68.70433745934199, + "learning_rate": 2.1175684702738814e-06, + "loss": 4.123, + "step": 318 + }, + { + "epoch": 0.06392785571142284, + "grad_norm": 66.74841038747056, + "learning_rate": 2.124248496993988e-06, + "loss": 3.8378, + "step": 319 + }, + { + "epoch": 0.06412825651302605, + "grad_norm": 70.29698760639766, + "learning_rate": 2.130928523714095e-06, + "loss": 3.492, + "step": 320 + }, + { + "epoch": 0.06432865731462926, + "grad_norm": 54.81435647671575, + "learning_rate": 2.137608550434202e-06, + "loss": 3.7312, + "step": 321 + }, + { + "epoch": 0.06452905811623247, + "grad_norm": 44.01938859104746, + "learning_rate": 2.1442885771543086e-06, + "loss": 3.6665, + "step": 322 + }, + { + "epoch": 0.06472945891783567, + "grad_norm": 60.984299012918655, + "learning_rate": 2.1509686038744158e-06, + "loss": 3.7394, + "step": 323 + }, + { + "epoch": 0.06492985971943888, + "grad_norm": 38.8439283286195, + "learning_rate": 2.1576486305945226e-06, + "loss": 4.1601, + "step": 324 + }, + { + "epoch": 0.06513026052104208, + "grad_norm": 76.96580337288566, + "learning_rate": 2.1643286573146294e-06, + "loss": 4.0208, + "step": 325 + }, + { + "epoch": 0.06533066132264528, + "grad_norm": 54.85205396375104, + "learning_rate": 2.171008684034736e-06, + "loss": 3.8221, + "step": 326 + }, + { + "epoch": 0.0655310621242485, + "grad_norm": 78.06172624546576, + "learning_rate": 2.177688710754843e-06, + "loss": 3.5886, + "step": 327 + }, + { + "epoch": 0.0657314629258517, + "grad_norm": 80.0123409386791, + "learning_rate": 2.18436873747495e-06, + "loss": 3.9485, + "step": 328 + }, + { + "epoch": 0.06593186372745491, + "grad_norm": 95.27881332201298, + "learning_rate": 2.191048764195057e-06, + "loss": 4.1834, + "step": 329 + }, + { + "epoch": 0.06613226452905811, + 
"grad_norm": 51.29416677262202, + "learning_rate": 2.197728790915164e-06, + "loss": 3.3592, + "step": 330 + }, + { + "epoch": 0.06633266533066132, + "grad_norm": 47.36312811666307, + "learning_rate": 2.2044088176352706e-06, + "loss": 4.1674, + "step": 331 + }, + { + "epoch": 0.06653306613226453, + "grad_norm": 60.4150825674889, + "learning_rate": 2.211088844355378e-06, + "loss": 4.2963, + "step": 332 + }, + { + "epoch": 0.06673346693386774, + "grad_norm": 157.6804670713469, + "learning_rate": 2.217768871075484e-06, + "loss": 4.2536, + "step": 333 + }, + { + "epoch": 0.06693386773547094, + "grad_norm": 108.77485757072812, + "learning_rate": 2.2244488977955914e-06, + "loss": 3.8004, + "step": 334 + }, + { + "epoch": 0.06713426853707415, + "grad_norm": 49.493296012274925, + "learning_rate": 2.2311289245156982e-06, + "loss": 3.9496, + "step": 335 + }, + { + "epoch": 0.06733466933867735, + "grad_norm": 68.63191283034413, + "learning_rate": 2.237808951235805e-06, + "loss": 3.4681, + "step": 336 + }, + { + "epoch": 0.06753507014028057, + "grad_norm": 53.80992112384704, + "learning_rate": 2.244488977955912e-06, + "loss": 3.8519, + "step": 337 + }, + { + "epoch": 0.06773547094188377, + "grad_norm": 69.9054204552534, + "learning_rate": 2.251169004676019e-06, + "loss": 3.4846, + "step": 338 + }, + { + "epoch": 0.06793587174348698, + "grad_norm": 199.70520419542035, + "learning_rate": 2.257849031396126e-06, + "loss": 3.9884, + "step": 339 + }, + { + "epoch": 0.06813627254509018, + "grad_norm": 76.02609150241389, + "learning_rate": 2.2645290581162327e-06, + "loss": 4.5056, + "step": 340 + }, + { + "epoch": 0.06833667334669338, + "grad_norm": 82.18346980060998, + "learning_rate": 2.2712090848363395e-06, + "loss": 3.5881, + "step": 341 + }, + { + "epoch": 0.06853707414829659, + "grad_norm": 61.43177840565582, + "learning_rate": 2.2778891115564463e-06, + "loss": 3.9374, + "step": 342 + }, + { + "epoch": 0.0687374749498998, + "grad_norm": 64.52974327349845, + "learning_rate": 2.2845691382765535e-06, + "loss": 3.5624, + "step": 343 + }, + { + "epoch": 0.06893787575150301, + "grad_norm": 50.80216753258444, + "learning_rate": 2.29124916499666e-06, + "loss": 3.7862, + "step": 344 + }, + { + "epoch": 0.06913827655310621, + "grad_norm": 76.19905464536704, + "learning_rate": 2.297929191716767e-06, + "loss": 3.7105, + "step": 345 + }, + { + "epoch": 0.06933867735470942, + "grad_norm": 40.50595540631092, + "learning_rate": 2.304609218436874e-06, + "loss": 3.6984, + "step": 346 + }, + { + "epoch": 0.06953907815631262, + "grad_norm": 63.09284178482287, + "learning_rate": 2.3112892451569807e-06, + "loss": 3.8022, + "step": 347 + }, + { + "epoch": 0.06973947895791584, + "grad_norm": 54.86820547167222, + "learning_rate": 2.317969271877088e-06, + "loss": 3.7966, + "step": 348 + }, + { + "epoch": 0.06993987975951904, + "grad_norm": 62.99845903372721, + "learning_rate": 2.3246492985971947e-06, + "loss": 3.9025, + "step": 349 + }, + { + "epoch": 0.07014028056112225, + "grad_norm": 97.96246948345359, + "learning_rate": 2.3313293253173015e-06, + "loss": 3.7944, + "step": 350 + }, + { + "epoch": 0.07034068136272545, + "grad_norm": 58.97697758806943, + "learning_rate": 2.3380093520374083e-06, + "loss": 3.4991, + "step": 351 + }, + { + "epoch": 0.07054108216432865, + "grad_norm": 61.58060421074276, + "learning_rate": 2.3446893787575155e-06, + "loss": 4.0079, + "step": 352 + }, + { + "epoch": 0.07074148296593187, + "grad_norm": 57.148570409218195, + "learning_rate": 2.351369405477622e-06, + "loss": 3.7953, + "step": 353 + }, + { 
+ "epoch": 0.07094188376753507, + "grad_norm": 109.80101202018982, + "learning_rate": 2.358049432197729e-06, + "loss": 3.8467, + "step": 354 + }, + { + "epoch": 0.07114228456913828, + "grad_norm": 181.70839297362951, + "learning_rate": 2.364729458917836e-06, + "loss": 4.1451, + "step": 355 + }, + { + "epoch": 0.07134268537074148, + "grad_norm": 164.98564607027413, + "learning_rate": 2.3714094856379427e-06, + "loss": 3.5713, + "step": 356 + }, + { + "epoch": 0.07154308617234469, + "grad_norm": 73.2863758800141, + "learning_rate": 2.3780895123580495e-06, + "loss": 4.0163, + "step": 357 + }, + { + "epoch": 0.07174348697394789, + "grad_norm": 86.1207606448672, + "learning_rate": 2.3847695390781563e-06, + "loss": 3.2783, + "step": 358 + }, + { + "epoch": 0.07194388777555111, + "grad_norm": 64.02601115919931, + "learning_rate": 2.3914495657982635e-06, + "loss": 3.859, + "step": 359 + }, + { + "epoch": 0.07214428857715431, + "grad_norm": 66.22874245626927, + "learning_rate": 2.3981295925183703e-06, + "loss": 3.9588, + "step": 360 + }, + { + "epoch": 0.07234468937875752, + "grad_norm": 60.97972789079834, + "learning_rate": 2.404809619238477e-06, + "loss": 3.9057, + "step": 361 + }, + { + "epoch": 0.07254509018036072, + "grad_norm": 71.04595560702029, + "learning_rate": 2.411489645958584e-06, + "loss": 4.5073, + "step": 362 + }, + { + "epoch": 0.07274549098196392, + "grad_norm": 36.33000160928221, + "learning_rate": 2.418169672678691e-06, + "loss": 3.8806, + "step": 363 + }, + { + "epoch": 0.07294589178356714, + "grad_norm": 88.58898166020342, + "learning_rate": 2.4248496993987975e-06, + "loss": 3.8601, + "step": 364 + }, + { + "epoch": 0.07314629258517034, + "grad_norm": 140.4168891143956, + "learning_rate": 2.4315297261189048e-06, + "loss": 3.9536, + "step": 365 + }, + { + "epoch": 0.07334669338677355, + "grad_norm": 47.99283764854366, + "learning_rate": 2.4382097528390116e-06, + "loss": 3.5463, + "step": 366 + }, + { + "epoch": 0.07354709418837675, + "grad_norm": 87.52911348431466, + "learning_rate": 2.4448897795591184e-06, + "loss": 3.836, + "step": 367 + }, + { + "epoch": 0.07374749498997996, + "grad_norm": 53.702948780533255, + "learning_rate": 2.451569806279225e-06, + "loss": 3.4406, + "step": 368 + }, + { + "epoch": 0.07394789579158316, + "grad_norm": 58.34857348087771, + "learning_rate": 2.458249832999332e-06, + "loss": 3.6574, + "step": 369 + }, + { + "epoch": 0.07414829659318638, + "grad_norm": 34.79152239918439, + "learning_rate": 2.464929859719439e-06, + "loss": 3.3618, + "step": 370 + }, + { + "epoch": 0.07434869739478958, + "grad_norm": 55.8057906236533, + "learning_rate": 2.471609886439546e-06, + "loss": 3.3389, + "step": 371 + }, + { + "epoch": 0.07454909819639279, + "grad_norm": 47.996819965551104, + "learning_rate": 2.4782899131596528e-06, + "loss": 3.707, + "step": 372 + }, + { + "epoch": 0.07474949899799599, + "grad_norm": 42.224722715622185, + "learning_rate": 2.4849699398797596e-06, + "loss": 4.2251, + "step": 373 + }, + { + "epoch": 0.07494989979959919, + "grad_norm": 70.52133246963963, + "learning_rate": 2.491649966599867e-06, + "loss": 4.862, + "step": 374 + }, + { + "epoch": 0.07515030060120241, + "grad_norm": 146.02133138899168, + "learning_rate": 2.498329993319973e-06, + "loss": 3.8675, + "step": 375 + }, + { + "epoch": 0.07535070140280561, + "grad_norm": 87.9105591090238, + "learning_rate": 2.50501002004008e-06, + "loss": 3.9602, + "step": 376 + }, + { + "epoch": 0.07555110220440882, + "grad_norm": 39.000410772780604, + "learning_rate": 2.511690046760187e-06, + "loss": 
3.548, + "step": 377 + }, + { + "epoch": 0.07575150300601202, + "grad_norm": 63.13553469109009, + "learning_rate": 2.518370073480294e-06, + "loss": 4.4603, + "step": 378 + }, + { + "epoch": 0.07595190380761523, + "grad_norm": 133.42006601043275, + "learning_rate": 2.525050100200401e-06, + "loss": 3.7183, + "step": 379 + }, + { + "epoch": 0.07615230460921844, + "grad_norm": 62.603896909898495, + "learning_rate": 2.531730126920508e-06, + "loss": 3.8314, + "step": 380 + }, + { + "epoch": 0.07635270541082165, + "grad_norm": 79.78863001800475, + "learning_rate": 2.538410153640615e-06, + "loss": 3.9749, + "step": 381 + }, + { + "epoch": 0.07655310621242485, + "grad_norm": 75.6542056204846, + "learning_rate": 2.545090180360721e-06, + "loss": 3.503, + "step": 382 + }, + { + "epoch": 0.07675350701402806, + "grad_norm": 67.50681086716446, + "learning_rate": 2.551770207080829e-06, + "loss": 4.103, + "step": 383 + }, + { + "epoch": 0.07695390781563126, + "grad_norm": 95.2920582972379, + "learning_rate": 2.5584502338009352e-06, + "loss": 3.7676, + "step": 384 + }, + { + "epoch": 0.07715430861723446, + "grad_norm": 110.60041936511308, + "learning_rate": 2.565130260521042e-06, + "loss": 4.3313, + "step": 385 + }, + { + "epoch": 0.07735470941883768, + "grad_norm": 82.52052610461152, + "learning_rate": 2.5718102872411493e-06, + "loss": 4.0981, + "step": 386 + }, + { + "epoch": 0.07755511022044088, + "grad_norm": 65.19219255299693, + "learning_rate": 2.578490313961256e-06, + "loss": 3.6106, + "step": 387 + }, + { + "epoch": 0.07775551102204409, + "grad_norm": 72.94006999168465, + "learning_rate": 2.585170340681363e-06, + "loss": 3.6226, + "step": 388 + }, + { + "epoch": 0.07795591182364729, + "grad_norm": 98.00794991331209, + "learning_rate": 2.59185036740147e-06, + "loss": 4.5364, + "step": 389 + }, + { + "epoch": 0.0781563126252505, + "grad_norm": 63.645322163840916, + "learning_rate": 2.598530394121577e-06, + "loss": 3.4689, + "step": 390 + }, + { + "epoch": 0.07835671342685371, + "grad_norm": 75.87331852870388, + "learning_rate": 2.6052104208416833e-06, + "loss": 3.8874, + "step": 391 + }, + { + "epoch": 0.07855711422845692, + "grad_norm": 78.27878014665968, + "learning_rate": 2.6118904475617905e-06, + "loss": 3.8226, + "step": 392 + }, + { + "epoch": 0.07875751503006012, + "grad_norm": 56.18109460736375, + "learning_rate": 2.6185704742818973e-06, + "loss": 3.6958, + "step": 393 + }, + { + "epoch": 0.07895791583166333, + "grad_norm": 48.47071531401958, + "learning_rate": 2.625250501002004e-06, + "loss": 3.8119, + "step": 394 + }, + { + "epoch": 0.07915831663326653, + "grad_norm": 91.55886649631378, + "learning_rate": 2.6319305277221113e-06, + "loss": 4.4622, + "step": 395 + }, + { + "epoch": 0.07935871743486973, + "grad_norm": 56.931090444924614, + "learning_rate": 2.638610554442218e-06, + "loss": 3.572, + "step": 396 + }, + { + "epoch": 0.07955911823647295, + "grad_norm": 62.84926124336124, + "learning_rate": 2.645290581162325e-06, + "loss": 4.5693, + "step": 397 + }, + { + "epoch": 0.07975951903807615, + "grad_norm": 148.6055905644588, + "learning_rate": 2.651970607882432e-06, + "loss": 4.1559, + "step": 398 + }, + { + "epoch": 0.07995991983967936, + "grad_norm": 84.31841293053974, + "learning_rate": 2.6586506346025385e-06, + "loss": 3.5626, + "step": 399 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 44.135829624613514, + "learning_rate": 2.6653306613226453e-06, + "loss": 3.676, + "step": 400 + }, + { + "epoch": 0.08036072144288577, + "grad_norm": 53.981245897993425, + "learning_rate": 
2.6720106880427525e-06, + "loss": 3.7976, + "step": 401 + }, + { + "epoch": 0.08056112224448898, + "grad_norm": 64.27445724913075, + "learning_rate": 2.6786907147628593e-06, + "loss": 3.9239, + "step": 402 + }, + { + "epoch": 0.08076152304609219, + "grad_norm": 58.95219868094442, + "learning_rate": 2.685370741482966e-06, + "loss": 3.9781, + "step": 403 + }, + { + "epoch": 0.08096192384769539, + "grad_norm": 70.23314023006778, + "learning_rate": 2.692050768203073e-06, + "loss": 4.0738, + "step": 404 + }, + { + "epoch": 0.0811623246492986, + "grad_norm": 63.216735096466444, + "learning_rate": 2.69873079492318e-06, + "loss": 3.9735, + "step": 405 + }, + { + "epoch": 0.0813627254509018, + "grad_norm": 58.15720228905888, + "learning_rate": 2.7054108216432865e-06, + "loss": 3.9843, + "step": 406 + }, + { + "epoch": 0.08156312625250502, + "grad_norm": 53.05267030460441, + "learning_rate": 2.7120908483633933e-06, + "loss": 3.5966, + "step": 407 + }, + { + "epoch": 0.08176352705410822, + "grad_norm": 94.41412365414827, + "learning_rate": 2.7187708750835005e-06, + "loss": 3.9587, + "step": 408 + }, + { + "epoch": 0.08196392785571142, + "grad_norm": 43.18850056288134, + "learning_rate": 2.7254509018036073e-06, + "loss": 4.1053, + "step": 409 + }, + { + "epoch": 0.08216432865731463, + "grad_norm": 46.16155210233893, + "learning_rate": 2.732130928523714e-06, + "loss": 3.9358, + "step": 410 + }, + { + "epoch": 0.08236472945891783, + "grad_norm": 54.89216302537754, + "learning_rate": 2.7388109552438214e-06, + "loss": 4.032, + "step": 411 + }, + { + "epoch": 0.08256513026052104, + "grad_norm": 58.533413673468964, + "learning_rate": 2.745490981963928e-06, + "loss": 3.4122, + "step": 412 + }, + { + "epoch": 0.08276553106212425, + "grad_norm": 85.59951442965914, + "learning_rate": 2.7521710086840345e-06, + "loss": 3.5326, + "step": 413 + }, + { + "epoch": 0.08296593186372746, + "grad_norm": 41.31650359247322, + "learning_rate": 2.758851035404142e-06, + "loss": 3.8348, + "step": 414 + }, + { + "epoch": 0.08316633266533066, + "grad_norm": 127.28547100588615, + "learning_rate": 2.7655310621242486e-06, + "loss": 3.5263, + "step": 415 + }, + { + "epoch": 0.08336673346693386, + "grad_norm": 65.97491596646955, + "learning_rate": 2.7722110888443554e-06, + "loss": 3.9349, + "step": 416 + }, + { + "epoch": 0.08356713426853707, + "grad_norm": 69.05877706829571, + "learning_rate": 2.7788911155644626e-06, + "loss": 3.6495, + "step": 417 + }, + { + "epoch": 0.08376753507014029, + "grad_norm": 74.59545792469312, + "learning_rate": 2.7855711422845694e-06, + "loss": 5.1216, + "step": 418 + }, + { + "epoch": 0.08396793587174349, + "grad_norm": 181.6478657087667, + "learning_rate": 2.792251169004676e-06, + "loss": 4.39, + "step": 419 + }, + { + "epoch": 0.0841683366733467, + "grad_norm": 47.80298218438976, + "learning_rate": 2.7989311957247834e-06, + "loss": 4.1247, + "step": 420 + }, + { + "epoch": 0.0843687374749499, + "grad_norm": 46.79459389296778, + "learning_rate": 2.8056112224448902e-06, + "loss": 4.2447, + "step": 421 + }, + { + "epoch": 0.0845691382765531, + "grad_norm": 48.739858766042865, + "learning_rate": 2.8122912491649966e-06, + "loss": 3.6674, + "step": 422 + }, + { + "epoch": 0.0847695390781563, + "grad_norm": 62.154243368026414, + "learning_rate": 2.818971275885104e-06, + "loss": 3.7772, + "step": 423 + }, + { + "epoch": 0.08496993987975952, + "grad_norm": 80.32179623682673, + "learning_rate": 2.8256513026052106e-06, + "loss": 3.9642, + "step": 424 + }, + { + "epoch": 0.08517034068136273, + "grad_norm": 
83.01830075806788, + "learning_rate": 2.8323313293253174e-06, + "loss": 3.6256, + "step": 425 + }, + { + "epoch": 0.08537074148296593, + "grad_norm": 47.69453807464047, + "learning_rate": 2.8390113560454246e-06, + "loss": 4.2607, + "step": 426 + }, + { + "epoch": 0.08557114228456913, + "grad_norm": 61.436518078468296, + "learning_rate": 2.8456913827655314e-06, + "loss": 3.9398, + "step": 427 + }, + { + "epoch": 0.08577154308617234, + "grad_norm": 105.80285948153542, + "learning_rate": 2.8523714094856382e-06, + "loss": 4.015, + "step": 428 + }, + { + "epoch": 0.08597194388777556, + "grad_norm": 186.5727839981967, + "learning_rate": 2.8590514362057455e-06, + "loss": 4.1423, + "step": 429 + }, + { + "epoch": 0.08617234468937876, + "grad_norm": 49.48475603768422, + "learning_rate": 2.865731462925852e-06, + "loss": 3.8097, + "step": 430 + }, + { + "epoch": 0.08637274549098196, + "grad_norm": 53.34806880307156, + "learning_rate": 2.8724114896459586e-06, + "loss": 3.5843, + "step": 431 + }, + { + "epoch": 0.08657314629258517, + "grad_norm": 88.7341039412237, + "learning_rate": 2.8790915163660654e-06, + "loss": 4.1627, + "step": 432 + }, + { + "epoch": 0.08677354709418837, + "grad_norm": 32.196104368618556, + "learning_rate": 2.8857715430861727e-06, + "loss": 3.5647, + "step": 433 + }, + { + "epoch": 0.08697394789579159, + "grad_norm": 43.007359857578294, + "learning_rate": 2.8924515698062795e-06, + "loss": 3.6433, + "step": 434 + }, + { + "epoch": 0.0871743486973948, + "grad_norm": 56.402567767326204, + "learning_rate": 2.8991315965263863e-06, + "loss": 3.8483, + "step": 435 + }, + { + "epoch": 0.087374749498998, + "grad_norm": 155.7589442064284, + "learning_rate": 2.9058116232464935e-06, + "loss": 3.9293, + "step": 436 + }, + { + "epoch": 0.0875751503006012, + "grad_norm": 50.23083888248077, + "learning_rate": 2.9124916499666e-06, + "loss": 3.6033, + "step": 437 + }, + { + "epoch": 0.0877755511022044, + "grad_norm": 52.966190802501806, + "learning_rate": 2.9191716766867067e-06, + "loss": 3.3935, + "step": 438 + }, + { + "epoch": 0.08797595190380761, + "grad_norm": 67.36237338431243, + "learning_rate": 2.925851703406814e-06, + "loss": 4.1916, + "step": 439 + }, + { + "epoch": 0.08817635270541083, + "grad_norm": 57.14734143232641, + "learning_rate": 2.9325317301269207e-06, + "loss": 3.8853, + "step": 440 + }, + { + "epoch": 0.08837675350701403, + "grad_norm": 67.39897870850514, + "learning_rate": 2.9392117568470275e-06, + "loss": 3.8857, + "step": 441 + }, + { + "epoch": 0.08857715430861723, + "grad_norm": 54.01282405521128, + "learning_rate": 2.9458917835671347e-06, + "loss": 4.0805, + "step": 442 + }, + { + "epoch": 0.08877755511022044, + "grad_norm": 50.267747771559215, + "learning_rate": 2.9525718102872415e-06, + "loss": 4.0456, + "step": 443 + }, + { + "epoch": 0.08897795591182364, + "grad_norm": 66.45044058684091, + "learning_rate": 2.959251837007348e-06, + "loss": 3.6291, + "step": 444 + }, + { + "epoch": 0.08917835671342686, + "grad_norm": 38.10754954563025, + "learning_rate": 2.9659318637274555e-06, + "loss": 4.2345, + "step": 445 + }, + { + "epoch": 0.08937875751503006, + "grad_norm": 97.55515845166464, + "learning_rate": 2.972611890447562e-06, + "loss": 4.3317, + "step": 446 + }, + { + "epoch": 0.08957915831663327, + "grad_norm": 66.00363541245277, + "learning_rate": 2.9792919171676687e-06, + "loss": 3.882, + "step": 447 + }, + { + "epoch": 0.08977955911823647, + "grad_norm": 90.78624063988431, + "learning_rate": 2.985971943887776e-06, + "loss": 3.7273, + "step": 448 + }, + { + "epoch": 
0.08997995991983967, + "grad_norm": 78.27148010638506, + "learning_rate": 2.9926519706078827e-06, + "loss": 4.0701, + "step": 449 + }, + { + "epoch": 0.09018036072144289, + "grad_norm": 49.51738575600719, + "learning_rate": 2.9993319973279895e-06, + "loss": 3.6871, + "step": 450 + }, + { + "epoch": 0.0903807615230461, + "grad_norm": 74.23396503333333, + "learning_rate": 3.0060120240480967e-06, + "loss": 3.9562, + "step": 451 + }, + { + "epoch": 0.0905811623246493, + "grad_norm": 61.21249770895908, + "learning_rate": 3.0126920507682035e-06, + "loss": 4.3823, + "step": 452 + }, + { + "epoch": 0.0907815631262525, + "grad_norm": 56.94123033327208, + "learning_rate": 3.01937207748831e-06, + "loss": 3.9717, + "step": 453 + }, + { + "epoch": 0.09098196392785571, + "grad_norm": 50.173687442841, + "learning_rate": 3.026052104208417e-06, + "loss": 4.2022, + "step": 454 + }, + { + "epoch": 0.09118236472945891, + "grad_norm": 71.16864713970972, + "learning_rate": 3.032732130928524e-06, + "loss": 3.7878, + "step": 455 + }, + { + "epoch": 0.09138276553106213, + "grad_norm": 64.51955211394717, + "learning_rate": 3.0394121576486307e-06, + "loss": 4.0136, + "step": 456 + }, + { + "epoch": 0.09158316633266533, + "grad_norm": 70.50663619422139, + "learning_rate": 3.046092184368738e-06, + "loss": 3.8474, + "step": 457 + }, + { + "epoch": 0.09178356713426854, + "grad_norm": 42.2891471370905, + "learning_rate": 3.0527722110888448e-06, + "loss": 3.6525, + "step": 458 + }, + { + "epoch": 0.09198396793587174, + "grad_norm": 52.52453759267007, + "learning_rate": 3.0594522378089516e-06, + "loss": 4.3304, + "step": 459 + }, + { + "epoch": 0.09218436873747494, + "grad_norm": 39.22745710304181, + "learning_rate": 3.066132264529058e-06, + "loss": 3.7961, + "step": 460 + }, + { + "epoch": 0.09238476953907816, + "grad_norm": 100.34336155192847, + "learning_rate": 3.072812291249165e-06, + "loss": 3.7686, + "step": 461 + }, + { + "epoch": 0.09258517034068137, + "grad_norm": 48.23982467180046, + "learning_rate": 3.079492317969272e-06, + "loss": 4.0486, + "step": 462 + }, + { + "epoch": 0.09278557114228457, + "grad_norm": 205.6152598070768, + "learning_rate": 3.0861723446893788e-06, + "loss": 3.8258, + "step": 463 + }, + { + "epoch": 0.09298597194388777, + "grad_norm": 57.77194191763979, + "learning_rate": 3.092852371409486e-06, + "loss": 3.7551, + "step": 464 + }, + { + "epoch": 0.09318637274549098, + "grad_norm": 57.598035360892354, + "learning_rate": 3.099532398129593e-06, + "loss": 4.1234, + "step": 465 + }, + { + "epoch": 0.09338677354709418, + "grad_norm": 46.129784830779585, + "learning_rate": 3.1062124248496996e-06, + "loss": 3.6059, + "step": 466 + }, + { + "epoch": 0.0935871743486974, + "grad_norm": 70.78908806343222, + "learning_rate": 3.112892451569807e-06, + "loss": 3.5257, + "step": 467 + }, + { + "epoch": 0.0937875751503006, + "grad_norm": 49.96963768807178, + "learning_rate": 3.119572478289913e-06, + "loss": 3.6862, + "step": 468 + }, + { + "epoch": 0.09398797595190381, + "grad_norm": 45.75385824560887, + "learning_rate": 3.12625250501002e-06, + "loss": 3.5827, + "step": 469 + }, + { + "epoch": 0.09418837675350701, + "grad_norm": 95.75042962138738, + "learning_rate": 3.1329325317301272e-06, + "loss": 4.6036, + "step": 470 + }, + { + "epoch": 0.09438877755511021, + "grad_norm": 49.27143897661616, + "learning_rate": 3.139612558450234e-06, + "loss": 3.5187, + "step": 471 + }, + { + "epoch": 0.09458917835671343, + "grad_norm": 63.3047883604663, + "learning_rate": 3.146292585170341e-06, + "loss": 4.1676, + "step": 
472 + }, + { + "epoch": 0.09478957915831664, + "grad_norm": 62.26668828400745, + "learning_rate": 3.152972611890448e-06, + "loss": 3.7888, + "step": 473 + }, + { + "epoch": 0.09498997995991984, + "grad_norm": 65.39435999786288, + "learning_rate": 3.159652638610555e-06, + "loss": 3.4888, + "step": 474 + }, + { + "epoch": 0.09519038076152304, + "grad_norm": 150.42453422108355, + "learning_rate": 3.166332665330661e-06, + "loss": 3.8454, + "step": 475 + }, + { + "epoch": 0.09539078156312625, + "grad_norm": 63.06293788127613, + "learning_rate": 3.173012692050769e-06, + "loss": 3.8558, + "step": 476 + }, + { + "epoch": 0.09559118236472947, + "grad_norm": 97.10804248267742, + "learning_rate": 3.1796927187708752e-06, + "loss": 4.1518, + "step": 477 + }, + { + "epoch": 0.09579158316633267, + "grad_norm": 32.029944235105475, + "learning_rate": 3.186372745490982e-06, + "loss": 3.4344, + "step": 478 + }, + { + "epoch": 0.09599198396793587, + "grad_norm": 154.11790224182712, + "learning_rate": 3.1930527722110893e-06, + "loss": 3.8648, + "step": 479 + }, + { + "epoch": 0.09619238476953908, + "grad_norm": 59.66201292708959, + "learning_rate": 3.199732798931196e-06, + "loss": 3.3992, + "step": 480 + }, + { + "epoch": 0.09639278557114228, + "grad_norm": 62.94376971456586, + "learning_rate": 3.206412825651303e-06, + "loss": 3.8074, + "step": 481 + }, + { + "epoch": 0.09659318637274548, + "grad_norm": 79.58859393299846, + "learning_rate": 3.21309285237141e-06, + "loss": 3.6848, + "step": 482 + }, + { + "epoch": 0.0967935871743487, + "grad_norm": 50.728024243646814, + "learning_rate": 3.219772879091517e-06, + "loss": 3.8241, + "step": 483 + }, + { + "epoch": 0.0969939879759519, + "grad_norm": 43.488944822234636, + "learning_rate": 3.2264529058116233e-06, + "loss": 3.635, + "step": 484 + }, + { + "epoch": 0.09719438877755511, + "grad_norm": 62.77259656883525, + "learning_rate": 3.2331329325317305e-06, + "loss": 4.1109, + "step": 485 + }, + { + "epoch": 0.09739478957915831, + "grad_norm": 71.3815719592916, + "learning_rate": 3.2398129592518373e-06, + "loss": 3.4817, + "step": 486 + }, + { + "epoch": 0.09759519038076152, + "grad_norm": 170.355456789187, + "learning_rate": 3.246492985971944e-06, + "loss": 4.1156, + "step": 487 + }, + { + "epoch": 0.09779559118236474, + "grad_norm": 54.30140318423542, + "learning_rate": 3.253173012692051e-06, + "loss": 3.9303, + "step": 488 + }, + { + "epoch": 0.09799599198396794, + "grad_norm": 46.38441490032428, + "learning_rate": 3.259853039412158e-06, + "loss": 3.3889, + "step": 489 + }, + { + "epoch": 0.09819639278557114, + "grad_norm": 91.13155795085329, + "learning_rate": 3.266533066132265e-06, + "loss": 3.9769, + "step": 490 + }, + { + "epoch": 0.09839679358717435, + "grad_norm": 81.29489648981051, + "learning_rate": 3.2732130928523713e-06, + "loss": 4.0554, + "step": 491 + }, + { + "epoch": 0.09859719438877755, + "grad_norm": 49.76477380189661, + "learning_rate": 3.2798931195724785e-06, + "loss": 3.7638, + "step": 492 + }, + { + "epoch": 0.09879759519038075, + "grad_norm": 48.55240895244637, + "learning_rate": 3.2865731462925853e-06, + "loss": 3.6094, + "step": 493 + }, + { + "epoch": 0.09899799599198397, + "grad_norm": 96.21841111871917, + "learning_rate": 3.293253173012692e-06, + "loss": 3.6503, + "step": 494 + }, + { + "epoch": 0.09919839679358718, + "grad_norm": 77.54055608957746, + "learning_rate": 3.2999331997327993e-06, + "loss": 4.1257, + "step": 495 + }, + { + "epoch": 0.09939879759519038, + "grad_norm": 49.315178742595, + "learning_rate": 3.306613226452906e-06, 
+ "loss": 3.7049, + "step": 496 + }, + { + "epoch": 0.09959919839679358, + "grad_norm": 79.58064199475503, + "learning_rate": 3.313293253173013e-06, + "loss": 4.0505, + "step": 497 + }, + { + "epoch": 0.09979959919839679, + "grad_norm": 62.86048677856288, + "learning_rate": 3.31997327989312e-06, + "loss": 3.6877, + "step": 498 + }, + { + "epoch": 0.1, + "grad_norm": 52.85181507268585, + "learning_rate": 3.3266533066132265e-06, + "loss": 3.6585, + "step": 499 + }, + { + "epoch": 0.10020040080160321, + "grad_norm": 116.05618863292823, + "learning_rate": 3.3333333333333333e-06, + "loss": 4.2428, + "step": 500 + }, + { + "epoch": 0.10040080160320641, + "grad_norm": 45.228716897190665, + "learning_rate": 3.3400133600534405e-06, + "loss": 3.7591, + "step": 501 + }, + { + "epoch": 0.10060120240480962, + "grad_norm": 113.23425798824745, + "learning_rate": 3.3466933867735473e-06, + "loss": 4.5316, + "step": 502 + }, + { + "epoch": 0.10080160320641282, + "grad_norm": 53.80910203695476, + "learning_rate": 3.353373413493654e-06, + "loss": 3.825, + "step": 503 + }, + { + "epoch": 0.10100200400801604, + "grad_norm": 52.78350457191134, + "learning_rate": 3.3600534402137614e-06, + "loss": 3.8488, + "step": 504 + }, + { + "epoch": 0.10120240480961924, + "grad_norm": 61.13638637055739, + "learning_rate": 3.366733466933868e-06, + "loss": 3.6459, + "step": 505 + }, + { + "epoch": 0.10140280561122245, + "grad_norm": 48.67420741454259, + "learning_rate": 3.3734134936539745e-06, + "loss": 3.7827, + "step": 506 + }, + { + "epoch": 0.10160320641282565, + "grad_norm": 49.68378413753463, + "learning_rate": 3.380093520374082e-06, + "loss": 4.0925, + "step": 507 + }, + { + "epoch": 0.10180360721442885, + "grad_norm": 59.95970849919852, + "learning_rate": 3.3867735470941886e-06, + "loss": 3.9281, + "step": 508 + }, + { + "epoch": 0.10200400801603206, + "grad_norm": 74.24993257568364, + "learning_rate": 3.3934535738142954e-06, + "loss": 3.8028, + "step": 509 + }, + { + "epoch": 0.10220440881763528, + "grad_norm": 146.36789181967686, + "learning_rate": 3.4001336005344026e-06, + "loss": 3.9952, + "step": 510 + }, + { + "epoch": 0.10240480961923848, + "grad_norm": 44.67166761621613, + "learning_rate": 3.4068136272545094e-06, + "loss": 3.7182, + "step": 511 + }, + { + "epoch": 0.10260521042084168, + "grad_norm": 59.405605512076406, + "learning_rate": 3.413493653974616e-06, + "loss": 3.5262, + "step": 512 + }, + { + "epoch": 0.10280561122244489, + "grad_norm": 58.09501128027652, + "learning_rate": 3.4201736806947234e-06, + "loss": 3.8116, + "step": 513 + }, + { + "epoch": 0.10300601202404809, + "grad_norm": 63.34053397625317, + "learning_rate": 3.4268537074148302e-06, + "loss": 3.4885, + "step": 514 + }, + { + "epoch": 0.10320641282565131, + "grad_norm": 59.26059854294819, + "learning_rate": 3.4335337341349366e-06, + "loss": 3.9009, + "step": 515 + }, + { + "epoch": 0.10340681362725451, + "grad_norm": 53.61929908441558, + "learning_rate": 3.4402137608550434e-06, + "loss": 3.3789, + "step": 516 + }, + { + "epoch": 0.10360721442885772, + "grad_norm": 69.52652865989423, + "learning_rate": 3.4468937875751506e-06, + "loss": 4.0193, + "step": 517 + }, + { + "epoch": 0.10380761523046092, + "grad_norm": 45.109706283725714, + "learning_rate": 3.4535738142952574e-06, + "loss": 3.5843, + "step": 518 + }, + { + "epoch": 0.10400801603206412, + "grad_norm": 58.53799553272585, + "learning_rate": 3.4602538410153642e-06, + "loss": 3.9724, + "step": 519 + }, + { + "epoch": 0.10420841683366733, + "grad_norm": 53.00366017955666, + 
"learning_rate": 3.4669338677354714e-06, + "loss": 4.3074, + "step": 520 + }, + { + "epoch": 0.10440881763527055, + "grad_norm": 59.59800117463795, + "learning_rate": 3.4736138944555782e-06, + "loss": 3.9593, + "step": 521 + }, + { + "epoch": 0.10460921843687375, + "grad_norm": 74.25383257179634, + "learning_rate": 3.4802939211756846e-06, + "loss": 4.2785, + "step": 522 + }, + { + "epoch": 0.10480961923847695, + "grad_norm": 53.309787100939694, + "learning_rate": 3.486973947895792e-06, + "loss": 3.629, + "step": 523 + }, + { + "epoch": 0.10501002004008016, + "grad_norm": 115.20565474620287, + "learning_rate": 3.4936539746158986e-06, + "loss": 3.9248, + "step": 524 + }, + { + "epoch": 0.10521042084168336, + "grad_norm": 131.8786995368123, + "learning_rate": 3.5003340013360054e-06, + "loss": 4.3645, + "step": 525 + }, + { + "epoch": 0.10541082164328658, + "grad_norm": 71.2037283102481, + "learning_rate": 3.5070140280561127e-06, + "loss": 3.9408, + "step": 526 + }, + { + "epoch": 0.10561122244488978, + "grad_norm": 72.20809938734173, + "learning_rate": 3.5136940547762195e-06, + "loss": 3.9518, + "step": 527 + }, + { + "epoch": 0.10581162324649299, + "grad_norm": 73.3995444976234, + "learning_rate": 3.5203740814963263e-06, + "loss": 4.1246, + "step": 528 + }, + { + "epoch": 0.10601202404809619, + "grad_norm": 66.37142136257137, + "learning_rate": 3.5270541082164335e-06, + "loss": 3.6668, + "step": 529 + }, + { + "epoch": 0.1062124248496994, + "grad_norm": 58.04910240328494, + "learning_rate": 3.53373413493654e-06, + "loss": 3.8583, + "step": 530 + }, + { + "epoch": 0.10641282565130261, + "grad_norm": 55.1847040747884, + "learning_rate": 3.5404141616566467e-06, + "loss": 3.9586, + "step": 531 + }, + { + "epoch": 0.10661322645290582, + "grad_norm": 72.63019989579288, + "learning_rate": 3.547094188376754e-06, + "loss": 4.2395, + "step": 532 + }, + { + "epoch": 0.10681362725450902, + "grad_norm": 32.13097390171639, + "learning_rate": 3.5537742150968607e-06, + "loss": 3.7118, + "step": 533 + }, + { + "epoch": 0.10701402805611222, + "grad_norm": 58.81218364862504, + "learning_rate": 3.5604542418169675e-06, + "loss": 4.2588, + "step": 534 + }, + { + "epoch": 0.10721442885771543, + "grad_norm": 58.0351605941419, + "learning_rate": 3.5671342685370747e-06, + "loss": 4.0773, + "step": 535 + }, + { + "epoch": 0.10741482965931863, + "grad_norm": 62.34056274743318, + "learning_rate": 3.5738142952571815e-06, + "loss": 3.8174, + "step": 536 + }, + { + "epoch": 0.10761523046092185, + "grad_norm": 57.17352396844232, + "learning_rate": 3.580494321977288e-06, + "loss": 3.4316, + "step": 537 + }, + { + "epoch": 0.10781563126252505, + "grad_norm": 60.0657537682618, + "learning_rate": 3.5871743486973955e-06, + "loss": 4.2243, + "step": 538 + }, + { + "epoch": 0.10801603206412826, + "grad_norm": 58.86443824271127, + "learning_rate": 3.593854375417502e-06, + "loss": 4.3559, + "step": 539 + }, + { + "epoch": 0.10821643286573146, + "grad_norm": 59.62336517833914, + "learning_rate": 3.6005344021376087e-06, + "loss": 3.3918, + "step": 540 + }, + { + "epoch": 0.10841683366733466, + "grad_norm": 66.79178342665719, + "learning_rate": 3.607214428857716e-06, + "loss": 3.9864, + "step": 541 + }, + { + "epoch": 0.10861723446893788, + "grad_norm": 51.12689328391152, + "learning_rate": 3.6138944555778227e-06, + "loss": 3.7952, + "step": 542 + }, + { + "epoch": 0.10881763527054109, + "grad_norm": 55.724004379354845, + "learning_rate": 3.6205744822979295e-06, + "loss": 4.0146, + "step": 543 + }, + { + "epoch": 0.10901803607214429, + 
"grad_norm": 45.542853105502545, + "learning_rate": 3.627254509018036e-06, + "loss": 3.1694, + "step": 544 + }, + { + "epoch": 0.1092184368737475, + "grad_norm": 46.09342559507713, + "learning_rate": 3.6339345357381435e-06, + "loss": 4.0555, + "step": 545 + }, + { + "epoch": 0.1094188376753507, + "grad_norm": 86.35352212960775, + "learning_rate": 3.64061456245825e-06, + "loss": 3.7819, + "step": 546 + }, + { + "epoch": 0.10961923847695391, + "grad_norm": 76.50655560425113, + "learning_rate": 3.6472945891783567e-06, + "loss": 4.3419, + "step": 547 + }, + { + "epoch": 0.10981963927855712, + "grad_norm": 80.90373124142151, + "learning_rate": 3.653974615898464e-06, + "loss": 4.1711, + "step": 548 + }, + { + "epoch": 0.11002004008016032, + "grad_norm": 65.30513085713604, + "learning_rate": 3.6606546426185707e-06, + "loss": 4.3793, + "step": 549 + }, + { + "epoch": 0.11022044088176353, + "grad_norm": 81.48842433347448, + "learning_rate": 3.6673346693386775e-06, + "loss": 4.7804, + "step": 550 + }, + { + "epoch": 0.11042084168336673, + "grad_norm": 40.54382376666547, + "learning_rate": 3.6740146960587848e-06, + "loss": 4.2874, + "step": 551 + }, + { + "epoch": 0.11062124248496993, + "grad_norm": 67.79560705066623, + "learning_rate": 3.6806947227788916e-06, + "loss": 4.0498, + "step": 552 + }, + { + "epoch": 0.11082164328657315, + "grad_norm": 46.50365660507714, + "learning_rate": 3.687374749498998e-06, + "loss": 3.3949, + "step": 553 + }, + { + "epoch": 0.11102204408817636, + "grad_norm": 89.195794716531, + "learning_rate": 3.694054776219105e-06, + "loss": 3.4426, + "step": 554 + }, + { + "epoch": 0.11122244488977956, + "grad_norm": 62.53180329298292, + "learning_rate": 3.700734802939212e-06, + "loss": 4.2942, + "step": 555 + }, + { + "epoch": 0.11142284569138276, + "grad_norm": 58.34656174619196, + "learning_rate": 3.7074148296593188e-06, + "loss": 4.043, + "step": 556 + }, + { + "epoch": 0.11162324649298597, + "grad_norm": 52.0287307695703, + "learning_rate": 3.714094856379426e-06, + "loss": 3.8509, + "step": 557 + }, + { + "epoch": 0.11182364729458918, + "grad_norm": 44.564873443499735, + "learning_rate": 3.720774883099533e-06, + "loss": 3.3197, + "step": 558 + }, + { + "epoch": 0.11202404809619239, + "grad_norm": 49.933917482999085, + "learning_rate": 3.7274549098196396e-06, + "loss": 3.6726, + "step": 559 + }, + { + "epoch": 0.11222444889779559, + "grad_norm": 63.338672387602365, + "learning_rate": 3.734134936539747e-06, + "loss": 4.201, + "step": 560 + }, + { + "epoch": 0.1124248496993988, + "grad_norm": 144.88053129015321, + "learning_rate": 3.740814963259853e-06, + "loss": 4.1175, + "step": 561 + }, + { + "epoch": 0.112625250501002, + "grad_norm": 111.81191918972574, + "learning_rate": 3.74749498997996e-06, + "loss": 4.0166, + "step": 562 + }, + { + "epoch": 0.1128256513026052, + "grad_norm": 56.67198384585171, + "learning_rate": 3.7541750167000672e-06, + "loss": 3.8192, + "step": 563 + }, + { + "epoch": 0.11302605210420842, + "grad_norm": 48.517987266578125, + "learning_rate": 3.760855043420174e-06, + "loss": 4.0578, + "step": 564 + }, + { + "epoch": 0.11322645290581163, + "grad_norm": 95.10340139091157, + "learning_rate": 3.767535070140281e-06, + "loss": 3.3177, + "step": 565 + }, + { + "epoch": 0.11342685370741483, + "grad_norm": 47.250894771344576, + "learning_rate": 3.774215096860388e-06, + "loss": 3.1219, + "step": 566 + }, + { + "epoch": 0.11362725450901803, + "grad_norm": 37.47826711433861, + "learning_rate": 3.780895123580495e-06, + "loss": 3.4393, + "step": 567 + }, + { + 
"epoch": 0.11382765531062124, + "grad_norm": 80.19551807359807, + "learning_rate": 3.7875751503006012e-06, + "loss": 4.3528, + "step": 568 + }, + { + "epoch": 0.11402805611222445, + "grad_norm": 59.58292145101431, + "learning_rate": 3.794255177020709e-06, + "loss": 3.5876, + "step": 569 + }, + { + "epoch": 0.11422845691382766, + "grad_norm": 50.689895724938694, + "learning_rate": 3.8009352037408152e-06, + "loss": 3.6868, + "step": 570 + }, + { + "epoch": 0.11442885771543086, + "grad_norm": 57.04420746295166, + "learning_rate": 3.807615230460922e-06, + "loss": 4.3083, + "step": 571 + }, + { + "epoch": 0.11462925851703407, + "grad_norm": 77.16188021377691, + "learning_rate": 3.814295257181029e-06, + "loss": 3.8321, + "step": 572 + }, + { + "epoch": 0.11482965931863727, + "grad_norm": 194.42719172765345, + "learning_rate": 3.820975283901136e-06, + "loss": 3.6738, + "step": 573 + }, + { + "epoch": 0.11503006012024049, + "grad_norm": 60.18989931468585, + "learning_rate": 3.8276553106212424e-06, + "loss": 3.6632, + "step": 574 + }, + { + "epoch": 0.11523046092184369, + "grad_norm": 71.57429343790812, + "learning_rate": 3.83433533734135e-06, + "loss": 4.6259, + "step": 575 + }, + { + "epoch": 0.1154308617234469, + "grad_norm": 42.76169581538113, + "learning_rate": 3.841015364061457e-06, + "loss": 3.382, + "step": 576 + }, + { + "epoch": 0.1156312625250501, + "grad_norm": 42.05015378536224, + "learning_rate": 3.847695390781563e-06, + "loss": 3.4261, + "step": 577 + }, + { + "epoch": 0.1158316633266533, + "grad_norm": 60.4818561306367, + "learning_rate": 3.8543754175016705e-06, + "loss": 4.4062, + "step": 578 + }, + { + "epoch": 0.1160320641282565, + "grad_norm": 49.02617746028979, + "learning_rate": 3.861055444221778e-06, + "loss": 4.4473, + "step": 579 + }, + { + "epoch": 0.11623246492985972, + "grad_norm": 59.299386423771246, + "learning_rate": 3.867735470941884e-06, + "loss": 4.6327, + "step": 580 + }, + { + "epoch": 0.11643286573146293, + "grad_norm": 79.04287263181764, + "learning_rate": 3.8744154976619905e-06, + "loss": 4.4793, + "step": 581 + }, + { + "epoch": 0.11663326653306613, + "grad_norm": 42.83220009021428, + "learning_rate": 3.881095524382098e-06, + "loss": 3.6305, + "step": 582 + }, + { + "epoch": 0.11683366733466934, + "grad_norm": 65.41210393267832, + "learning_rate": 3.887775551102205e-06, + "loss": 4.4809, + "step": 583 + }, + { + "epoch": 0.11703406813627254, + "grad_norm": 62.377435818844006, + "learning_rate": 3.894455577822311e-06, + "loss": 3.6588, + "step": 584 + }, + { + "epoch": 0.11723446893787576, + "grad_norm": 67.30611979779692, + "learning_rate": 3.9011356045424185e-06, + "loss": 4.2119, + "step": 585 + }, + { + "epoch": 0.11743486973947896, + "grad_norm": 50.039363933057345, + "learning_rate": 3.907815631262526e-06, + "loss": 3.7219, + "step": 586 + }, + { + "epoch": 0.11763527054108217, + "grad_norm": 209.1337990480528, + "learning_rate": 3.914495657982632e-06, + "loss": 4.5188, + "step": 587 + }, + { + "epoch": 0.11783567134268537, + "grad_norm": 51.24146890167293, + "learning_rate": 3.921175684702739e-06, + "loss": 3.2887, + "step": 588 + }, + { + "epoch": 0.11803607214428857, + "grad_norm": 95.48493179154428, + "learning_rate": 3.927855711422846e-06, + "loss": 3.8724, + "step": 589 + }, + { + "epoch": 0.11823647294589178, + "grad_norm": 47.60109974785956, + "learning_rate": 3.934535738142953e-06, + "loss": 3.7866, + "step": 590 + }, + { + "epoch": 0.118436873747495, + "grad_norm": 49.774088080008134, + "learning_rate": 3.94121576486306e-06, + "loss": 3.85, + 
"step": 591 + }, + { + "epoch": 0.1186372745490982, + "grad_norm": 84.28866398934437, + "learning_rate": 3.9478957915831665e-06, + "loss": 3.9565, + "step": 592 + }, + { + "epoch": 0.1188376753507014, + "grad_norm": 46.11113414314852, + "learning_rate": 3.954575818303274e-06, + "loss": 3.636, + "step": 593 + }, + { + "epoch": 0.1190380761523046, + "grad_norm": 56.264340246025995, + "learning_rate": 3.961255845023381e-06, + "loss": 4.0397, + "step": 594 + }, + { + "epoch": 0.11923847695390781, + "grad_norm": 37.85128535634514, + "learning_rate": 3.967935871743487e-06, + "loss": 3.798, + "step": 595 + }, + { + "epoch": 0.11943887775551103, + "grad_norm": 75.37938581912474, + "learning_rate": 3.974615898463594e-06, + "loss": 4.6361, + "step": 596 + }, + { + "epoch": 0.11963927855711423, + "grad_norm": 72.92780668416718, + "learning_rate": 3.981295925183701e-06, + "loss": 4.0842, + "step": 597 + }, + { + "epoch": 0.11983967935871744, + "grad_norm": 38.5598860614375, + "learning_rate": 3.987975951903808e-06, + "loss": 3.6625, + "step": 598 + }, + { + "epoch": 0.12004008016032064, + "grad_norm": 49.7139198432214, + "learning_rate": 3.9946559786239145e-06, + "loss": 3.824, + "step": 599 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 43.35243748575465, + "learning_rate": 4.001336005344022e-06, + "loss": 3.9247, + "step": 600 + }, + { + "epoch": 0.12044088176352706, + "grad_norm": 89.61906158673925, + "learning_rate": 4.008016032064129e-06, + "loss": 4.2888, + "step": 601 + }, + { + "epoch": 0.12064128256513026, + "grad_norm": 66.04770446994326, + "learning_rate": 4.014696058784235e-06, + "loss": 4.0754, + "step": 602 + }, + { + "epoch": 0.12084168336673347, + "grad_norm": 45.40266196600683, + "learning_rate": 4.021376085504342e-06, + "loss": 3.8084, + "step": 603 + }, + { + "epoch": 0.12104208416833667, + "grad_norm": 66.12615853680542, + "learning_rate": 4.028056112224449e-06, + "loss": 4.0624, + "step": 604 + }, + { + "epoch": 0.12124248496993988, + "grad_norm": 84.79024187667576, + "learning_rate": 4.034736138944556e-06, + "loss": 3.9579, + "step": 605 + }, + { + "epoch": 0.12144288577154308, + "grad_norm": 47.19506341591547, + "learning_rate": 4.0414161656646626e-06, + "loss": 3.9976, + "step": 606 + }, + { + "epoch": 0.1216432865731463, + "grad_norm": 77.9126388585683, + "learning_rate": 4.04809619238477e-06, + "loss": 5.1712, + "step": 607 + }, + { + "epoch": 0.1218436873747495, + "grad_norm": 59.18810507588276, + "learning_rate": 4.054776219104877e-06, + "loss": 4.0167, + "step": 608 + }, + { + "epoch": 0.1220440881763527, + "grad_norm": 39.77916182771285, + "learning_rate": 4.061456245824983e-06, + "loss": 3.7723, + "step": 609 + }, + { + "epoch": 0.12224448897795591, + "grad_norm": 96.28064862465021, + "learning_rate": 4.068136272545091e-06, + "loss": 3.6005, + "step": 610 + }, + { + "epoch": 0.12244488977955911, + "grad_norm": 50.204595049577065, + "learning_rate": 4.074816299265197e-06, + "loss": 3.7226, + "step": 611 + }, + { + "epoch": 0.12264529058116233, + "grad_norm": 44.71843376885064, + "learning_rate": 4.081496325985304e-06, + "loss": 3.966, + "step": 612 + }, + { + "epoch": 0.12284569138276553, + "grad_norm": 41.17514518783375, + "learning_rate": 4.0881763527054114e-06, + "loss": 4.081, + "step": 613 + }, + { + "epoch": 0.12304609218436874, + "grad_norm": 47.29845434126307, + "learning_rate": 4.094856379425518e-06, + "loss": 4.3331, + "step": 614 + }, + { + "epoch": 0.12324649298597194, + "grad_norm": 42.86238204038183, + "learning_rate": 4.101536406145625e-06, + 
"loss": 3.5787, + "step": 615 + }, + { + "epoch": 0.12344689378757515, + "grad_norm": 50.083569240018214, + "learning_rate": 4.108216432865732e-06, + "loss": 4.0176, + "step": 616 + }, + { + "epoch": 0.12364729458917835, + "grad_norm": 45.320050203149314, + "learning_rate": 4.114896459585839e-06, + "loss": 3.8439, + "step": 617 + }, + { + "epoch": 0.12384769539078157, + "grad_norm": 62.68048611933137, + "learning_rate": 4.121576486305945e-06, + "loss": 4.1247, + "step": 618 + }, + { + "epoch": 0.12404809619238477, + "grad_norm": 54.33758005885702, + "learning_rate": 4.128256513026053e-06, + "loss": 3.8537, + "step": 619 + }, + { + "epoch": 0.12424849699398798, + "grad_norm": 248.74827319494725, + "learning_rate": 4.1349365397461595e-06, + "loss": 4.0598, + "step": 620 + }, + { + "epoch": 0.12444889779559118, + "grad_norm": 47.6014057613948, + "learning_rate": 4.141616566466266e-06, + "loss": 4.1606, + "step": 621 + }, + { + "epoch": 0.12464929859719438, + "grad_norm": 80.94140897795008, + "learning_rate": 4.148296593186373e-06, + "loss": 4.3626, + "step": 622 + }, + { + "epoch": 0.1248496993987976, + "grad_norm": 74.57976857165899, + "learning_rate": 4.15497661990648e-06, + "loss": 3.8164, + "step": 623 + }, + { + "epoch": 0.1250501002004008, + "grad_norm": 43.83605631495027, + "learning_rate": 4.161656646626587e-06, + "loss": 4.0632, + "step": 624 + }, + { + "epoch": 0.125250501002004, + "grad_norm": 58.00937920640922, + "learning_rate": 4.168336673346693e-06, + "loss": 4.0338, + "step": 625 + }, + { + "epoch": 0.1254509018036072, + "grad_norm": 69.37018831062915, + "learning_rate": 4.175016700066801e-06, + "loss": 3.8725, + "step": 626 + }, + { + "epoch": 0.12565130260521043, + "grad_norm": 49.3804201817332, + "learning_rate": 4.1816967267869075e-06, + "loss": 3.7884, + "step": 627 + }, + { + "epoch": 0.12585170340681362, + "grad_norm": 53.64786896771885, + "learning_rate": 4.188376753507014e-06, + "loss": 3.6779, + "step": 628 + }, + { + "epoch": 0.12605210420841684, + "grad_norm": 64.04160202818595, + "learning_rate": 4.195056780227121e-06, + "loss": 4.2557, + "step": 629 + }, + { + "epoch": 0.12625250501002003, + "grad_norm": 54.76385014587677, + "learning_rate": 4.201736806947228e-06, + "loss": 3.8897, + "step": 630 + }, + { + "epoch": 0.12645290581162325, + "grad_norm": 60.67099657036268, + "learning_rate": 4.208416833667335e-06, + "loss": 3.5535, + "step": 631 + }, + { + "epoch": 0.12665330661322646, + "grad_norm": 59.78419351062487, + "learning_rate": 4.215096860387442e-06, + "loss": 3.8376, + "step": 632 + }, + { + "epoch": 0.12685370741482965, + "grad_norm": 57.56640164651356, + "learning_rate": 4.221776887107549e-06, + "loss": 4.2402, + "step": 633 + }, + { + "epoch": 0.12705410821643287, + "grad_norm": 56.173007736891115, + "learning_rate": 4.2284569138276555e-06, + "loss": 4.1552, + "step": 634 + }, + { + "epoch": 0.12725450901803606, + "grad_norm": 49.781547594709366, + "learning_rate": 4.235136940547763e-06, + "loss": 3.5016, + "step": 635 + }, + { + "epoch": 0.12745490981963928, + "grad_norm": 59.653658513461274, + "learning_rate": 4.241816967267869e-06, + "loss": 3.779, + "step": 636 + }, + { + "epoch": 0.1276553106212425, + "grad_norm": 67.38047180499919, + "learning_rate": 4.248496993987976e-06, + "loss": 4.1143, + "step": 637 + }, + { + "epoch": 0.12785571142284569, + "grad_norm": 44.90031484306784, + "learning_rate": 4.2551770207080836e-06, + "loss": 3.7529, + "step": 638 + }, + { + "epoch": 0.1280561122244489, + "grad_norm": 52.71160595812772, + "learning_rate": 
4.26185704742819e-06, + "loss": 4.1732, + "step": 639 + }, + { + "epoch": 0.1282565130260521, + "grad_norm": 124.1325894762764, + "learning_rate": 4.268537074148297e-06, + "loss": 4.1695, + "step": 640 + }, + { + "epoch": 0.1284569138276553, + "grad_norm": 39.16709075487667, + "learning_rate": 4.275217100868404e-06, + "loss": 3.8524, + "step": 641 + }, + { + "epoch": 0.12865731462925853, + "grad_norm": 38.706262603708026, + "learning_rate": 4.281897127588511e-06, + "loss": 3.9227, + "step": 642 + }, + { + "epoch": 0.12885771543086172, + "grad_norm": 33.982891077721085, + "learning_rate": 4.288577154308617e-06, + "loss": 3.8344, + "step": 643 + }, + { + "epoch": 0.12905811623246494, + "grad_norm": 49.29417998544682, + "learning_rate": 4.295257181028724e-06, + "loss": 4.1658, + "step": 644 + }, + { + "epoch": 0.12925851703406813, + "grad_norm": 40.85533496631252, + "learning_rate": 4.3019372077488316e-06, + "loss": 3.6326, + "step": 645 + }, + { + "epoch": 0.12945891783567134, + "grad_norm": 39.263899794125216, + "learning_rate": 4.308617234468938e-06, + "loss": 3.9206, + "step": 646 + }, + { + "epoch": 0.12965931863727456, + "grad_norm": 42.548210030069534, + "learning_rate": 4.315297261189045e-06, + "loss": 3.9989, + "step": 647 + }, + { + "epoch": 0.12985971943887775, + "grad_norm": 51.73917715818566, + "learning_rate": 4.321977287909152e-06, + "loss": 4.0153, + "step": 648 + }, + { + "epoch": 0.13006012024048097, + "grad_norm": 86.94216342711194, + "learning_rate": 4.328657314629259e-06, + "loss": 4.2299, + "step": 649 + }, + { + "epoch": 0.13026052104208416, + "grad_norm": 51.905171190231016, + "learning_rate": 4.335337341349366e-06, + "loss": 4.0337, + "step": 650 + }, + { + "epoch": 0.13046092184368738, + "grad_norm": 51.95851161978854, + "learning_rate": 4.342017368069472e-06, + "loss": 3.8795, + "step": 651 + }, + { + "epoch": 0.13066132264529057, + "grad_norm": 93.20312436314804, + "learning_rate": 4.34869739478958e-06, + "loss": 4.0381, + "step": 652 + }, + { + "epoch": 0.13086172344689379, + "grad_norm": 68.51356354474612, + "learning_rate": 4.355377421509686e-06, + "loss": 4.0083, + "step": 653 + }, + { + "epoch": 0.131062124248497, + "grad_norm": 92.15373324360453, + "learning_rate": 4.362057448229793e-06, + "loss": 4.6522, + "step": 654 + }, + { + "epoch": 0.1312625250501002, + "grad_norm": 47.29306576997128, + "learning_rate": 4.3687374749499e-06, + "loss": 3.8401, + "step": 655 + }, + { + "epoch": 0.1314629258517034, + "grad_norm": 67.69597647502613, + "learning_rate": 4.375417501670007e-06, + "loss": 3.9928, + "step": 656 + }, + { + "epoch": 0.1316633266533066, + "grad_norm": 53.11326321272894, + "learning_rate": 4.382097528390114e-06, + "loss": 3.9251, + "step": 657 + }, + { + "epoch": 0.13186372745490982, + "grad_norm": 98.8981263926031, + "learning_rate": 4.38877755511022e-06, + "loss": 4.4324, + "step": 658 + }, + { + "epoch": 0.13206412825651304, + "grad_norm": 41.51066594365577, + "learning_rate": 4.395457581830328e-06, + "loss": 3.7848, + "step": 659 + }, + { + "epoch": 0.13226452905811623, + "grad_norm": 212.85042998514496, + "learning_rate": 4.402137608550435e-06, + "loss": 3.262, + "step": 660 + }, + { + "epoch": 0.13246492985971944, + "grad_norm": 44.23702012626701, + "learning_rate": 4.408817635270541e-06, + "loss": 3.9987, + "step": 661 + }, + { + "epoch": 0.13266533066132263, + "grad_norm": 51.73552455919646, + "learning_rate": 4.4154976619906484e-06, + "loss": 4.2355, + "step": 662 + }, + { + "epoch": 0.13286573146292585, + "grad_norm": 35.73546186849993, + 
"learning_rate": 4.422177688710756e-06, + "loss": 3.2542, + "step": 663 + }, + { + "epoch": 0.13306613226452907, + "grad_norm": 55.14730533369874, + "learning_rate": 4.428857715430862e-06, + "loss": 4.3976, + "step": 664 + }, + { + "epoch": 0.13326653306613226, + "grad_norm": 54.830900986417106, + "learning_rate": 4.435537742150968e-06, + "loss": 3.8464, + "step": 665 + }, + { + "epoch": 0.13346693386773548, + "grad_norm": 51.91279062906336, + "learning_rate": 4.442217768871076e-06, + "loss": 3.5572, + "step": 666 + }, + { + "epoch": 0.13366733466933867, + "grad_norm": 41.44082403254595, + "learning_rate": 4.448897795591183e-06, + "loss": 3.8618, + "step": 667 + }, + { + "epoch": 0.13386773547094188, + "grad_norm": 55.39480715827816, + "learning_rate": 4.455577822311289e-06, + "loss": 3.9598, + "step": 668 + }, + { + "epoch": 0.1340681362725451, + "grad_norm": 53.17803317918304, + "learning_rate": 4.4622578490313965e-06, + "loss": 4.014, + "step": 669 + }, + { + "epoch": 0.1342685370741483, + "grad_norm": 55.562075438706835, + "learning_rate": 4.468937875751504e-06, + "loss": 3.1846, + "step": 670 + }, + { + "epoch": 0.1344689378757515, + "grad_norm": 63.05361444496449, + "learning_rate": 4.47561790247161e-06, + "loss": 4.5627, + "step": 671 + }, + { + "epoch": 0.1346693386773547, + "grad_norm": 52.22178148876271, + "learning_rate": 4.482297929191717e-06, + "loss": 4.0409, + "step": 672 + }, + { + "epoch": 0.13486973947895792, + "grad_norm": 158.20836592732692, + "learning_rate": 4.488977955911824e-06, + "loss": 4.1714, + "step": 673 + }, + { + "epoch": 0.13507014028056114, + "grad_norm": 59.28543544966722, + "learning_rate": 4.495657982631931e-06, + "loss": 3.963, + "step": 674 + }, + { + "epoch": 0.13527054108216433, + "grad_norm": 64.1800471367181, + "learning_rate": 4.502338009352038e-06, + "loss": 4.13, + "step": 675 + }, + { + "epoch": 0.13547094188376754, + "grad_norm": 41.6749374089899, + "learning_rate": 4.5090180360721445e-06, + "loss": 3.9863, + "step": 676 + }, + { + "epoch": 0.13567134268537073, + "grad_norm": 34.18212558288347, + "learning_rate": 4.515698062792252e-06, + "loss": 3.6886, + "step": 677 + }, + { + "epoch": 0.13587174348697395, + "grad_norm": 40.727552242008926, + "learning_rate": 4.522378089512359e-06, + "loss": 3.758, + "step": 678 + }, + { + "epoch": 0.13607214428857717, + "grad_norm": 50.311363445122474, + "learning_rate": 4.529058116232465e-06, + "loss": 3.9801, + "step": 679 + }, + { + "epoch": 0.13627254509018036, + "grad_norm": 40.17063979548693, + "learning_rate": 4.535738142952572e-06, + "loss": 3.8426, + "step": 680 + }, + { + "epoch": 0.13647294589178358, + "grad_norm": 65.79083768653608, + "learning_rate": 4.542418169672679e-06, + "loss": 4.25, + "step": 681 + }, + { + "epoch": 0.13667334669338677, + "grad_norm": 29.335039230836234, + "learning_rate": 4.549098196392786e-06, + "loss": 2.7113, + "step": 682 + }, + { + "epoch": 0.13687374749498998, + "grad_norm": 48.02397218647937, + "learning_rate": 4.5557782231128925e-06, + "loss": 3.6874, + "step": 683 + }, + { + "epoch": 0.13707414829659317, + "grad_norm": 42.12327342479628, + "learning_rate": 4.562458249833e-06, + "loss": 3.7774, + "step": 684 + }, + { + "epoch": 0.1372745490981964, + "grad_norm": 56.46105445530455, + "learning_rate": 4.569138276553107e-06, + "loss": 4.4094, + "step": 685 + }, + { + "epoch": 0.1374749498997996, + "grad_norm": 43.45522976109931, + "learning_rate": 4.575818303273213e-06, + "loss": 3.866, + "step": 686 + }, + { + "epoch": 0.1376753507014028, + "grad_norm": 
66.4206606094189, + "learning_rate": 4.58249832999332e-06, + "loss": 3.9831, + "step": 687 + }, + { + "epoch": 0.13787575150300602, + "grad_norm": 57.082058701528915, + "learning_rate": 4.589178356713428e-06, + "loss": 4.6091, + "step": 688 + }, + { + "epoch": 0.1380761523046092, + "grad_norm": 75.19642675320065, + "learning_rate": 4.595858383433534e-06, + "loss": 3.9393, + "step": 689 + }, + { + "epoch": 0.13827655310621242, + "grad_norm": 57.54502712760806, + "learning_rate": 4.6025384101536405e-06, + "loss": 4.2402, + "step": 690 + }, + { + "epoch": 0.13847695390781564, + "grad_norm": 45.36185300446708, + "learning_rate": 4.609218436873748e-06, + "loss": 3.7275, + "step": 691 + }, + { + "epoch": 0.13867735470941883, + "grad_norm": 52.746228564402394, + "learning_rate": 4.615898463593855e-06, + "loss": 4.0747, + "step": 692 + }, + { + "epoch": 0.13887775551102205, + "grad_norm": 61.7225255259265, + "learning_rate": 4.622578490313961e-06, + "loss": 4.4304, + "step": 693 + }, + { + "epoch": 0.13907815631262524, + "grad_norm": 47.68067792775192, + "learning_rate": 4.6292585170340686e-06, + "loss": 3.8992, + "step": 694 + }, + { + "epoch": 0.13927855711422846, + "grad_norm": 53.703833431572214, + "learning_rate": 4.635938543754176e-06, + "loss": 4.604, + "step": 695 + }, + { + "epoch": 0.13947895791583168, + "grad_norm": 44.66822678823244, + "learning_rate": 4.642618570474282e-06, + "loss": 4.1013, + "step": 696 + }, + { + "epoch": 0.13967935871743486, + "grad_norm": 40.21631637480262, + "learning_rate": 4.649298597194389e-06, + "loss": 4.0033, + "step": 697 + }, + { + "epoch": 0.13987975951903808, + "grad_norm": 57.39317728947689, + "learning_rate": 4.655978623914496e-06, + "loss": 3.8675, + "step": 698 + }, + { + "epoch": 0.14008016032064127, + "grad_norm": 41.03772207672275, + "learning_rate": 4.662658650634603e-06, + "loss": 4.1379, + "step": 699 + }, + { + "epoch": 0.1402805611222445, + "grad_norm": 51.80979104454015, + "learning_rate": 4.66933867735471e-06, + "loss": 4.5926, + "step": 700 + }, + { + "epoch": 0.1404809619238477, + "grad_norm": 105.52806021099909, + "learning_rate": 4.676018704074817e-06, + "loss": 3.901, + "step": 701 + }, + { + "epoch": 0.1406813627254509, + "grad_norm": 230.28083299295383, + "learning_rate": 4.682698730794924e-06, + "loss": 3.9438, + "step": 702 + }, + { + "epoch": 0.14088176352705412, + "grad_norm": 31.284853002040155, + "learning_rate": 4.689378757515031e-06, + "loss": 4.0685, + "step": 703 + }, + { + "epoch": 0.1410821643286573, + "grad_norm": 59.7194442749965, + "learning_rate": 4.696058784235137e-06, + "loss": 3.65, + "step": 704 + }, + { + "epoch": 0.14128256513026052, + "grad_norm": 51.91474316930896, + "learning_rate": 4.702738810955244e-06, + "loss": 3.4672, + "step": 705 + }, + { + "epoch": 0.14148296593186374, + "grad_norm": 87.25671409461019, + "learning_rate": 4.709418837675351e-06, + "loss": 4.1232, + "step": 706 + }, + { + "epoch": 0.14168336673346693, + "grad_norm": 56.63706438988102, + "learning_rate": 4.716098864395458e-06, + "loss": 4.0024, + "step": 707 + }, + { + "epoch": 0.14188376753507015, + "grad_norm": 42.402415657823894, + "learning_rate": 4.722778891115565e-06, + "loss": 3.9359, + "step": 708 + }, + { + "epoch": 0.14208416833667334, + "grad_norm": 73.64210099498163, + "learning_rate": 4.729458917835672e-06, + "loss": 3.82, + "step": 709 + }, + { + "epoch": 0.14228456913827656, + "grad_norm": 64.78034207133342, + "learning_rate": 4.736138944555779e-06, + "loss": 3.742, + "step": 710 + }, + { + "epoch": 0.14248496993987975, + 
"grad_norm": 58.788329762794774, + "learning_rate": 4.7428189712758854e-06, + "loss": 3.4859, + "step": 711 + }, + { + "epoch": 0.14268537074148296, + "grad_norm": 75.35406624938578, + "learning_rate": 4.749498997995992e-06, + "loss": 3.8788, + "step": 712 + }, + { + "epoch": 0.14288577154308618, + "grad_norm": 74.02278665332283, + "learning_rate": 4.756179024716099e-06, + "loss": 3.7531, + "step": 713 + }, + { + "epoch": 0.14308617234468937, + "grad_norm": 46.1423636515417, + "learning_rate": 4.762859051436206e-06, + "loss": 3.8236, + "step": 714 + }, + { + "epoch": 0.1432865731462926, + "grad_norm": 38.30606563769741, + "learning_rate": 4.769539078156313e-06, + "loss": 3.8058, + "step": 715 + }, + { + "epoch": 0.14348697394789578, + "grad_norm": 45.00852310852753, + "learning_rate": 4.77621910487642e-06, + "loss": 4.0299, + "step": 716 + }, + { + "epoch": 0.143687374749499, + "grad_norm": 71.27187725165837, + "learning_rate": 4.782899131596527e-06, + "loss": 4.1911, + "step": 717 + }, + { + "epoch": 0.14388777555110221, + "grad_norm": 43.37239763903805, + "learning_rate": 4.7895791583166335e-06, + "loss": 4.2136, + "step": 718 + }, + { + "epoch": 0.1440881763527054, + "grad_norm": 52.463765563480216, + "learning_rate": 4.796259185036741e-06, + "loss": 4.3319, + "step": 719 + }, + { + "epoch": 0.14428857715430862, + "grad_norm": 40.354731657778146, + "learning_rate": 4.802939211756847e-06, + "loss": 3.8877, + "step": 720 + }, + { + "epoch": 0.1444889779559118, + "grad_norm": 38.0102649670491, + "learning_rate": 4.809619238476954e-06, + "loss": 3.8423, + "step": 721 + }, + { + "epoch": 0.14468937875751503, + "grad_norm": 35.784412062250496, + "learning_rate": 4.8162992651970615e-06, + "loss": 3.4473, + "step": 722 + }, + { + "epoch": 0.14488977955911825, + "grad_norm": 51.86426299730704, + "learning_rate": 4.822979291917168e-06, + "loss": 3.3275, + "step": 723 + }, + { + "epoch": 0.14509018036072144, + "grad_norm": 59.21892401346073, + "learning_rate": 4.829659318637275e-06, + "loss": 4.1772, + "step": 724 + }, + { + "epoch": 0.14529058116232466, + "grad_norm": 63.528499360452486, + "learning_rate": 4.836339345357382e-06, + "loss": 4.2901, + "step": 725 + }, + { + "epoch": 0.14549098196392785, + "grad_norm": 41.58788254536124, + "learning_rate": 4.843019372077489e-06, + "loss": 3.8081, + "step": 726 + }, + { + "epoch": 0.14569138276553106, + "grad_norm": 34.852624794686065, + "learning_rate": 4.849699398797595e-06, + "loss": 4.0085, + "step": 727 + }, + { + "epoch": 0.14589178356713428, + "grad_norm": 41.14239272665592, + "learning_rate": 4.856379425517702e-06, + "loss": 4.4766, + "step": 728 + }, + { + "epoch": 0.14609218436873747, + "grad_norm": 71.65909937367644, + "learning_rate": 4.8630594522378095e-06, + "loss": 4.0314, + "step": 729 + }, + { + "epoch": 0.1462925851703407, + "grad_norm": 43.740089914360624, + "learning_rate": 4.869739478957916e-06, + "loss": 3.9844, + "step": 730 + }, + { + "epoch": 0.14649298597194388, + "grad_norm": 51.284247030141934, + "learning_rate": 4.876419505678023e-06, + "loss": 4.1764, + "step": 731 + }, + { + "epoch": 0.1466933867735471, + "grad_norm": 57.771517987762415, + "learning_rate": 4.88309953239813e-06, + "loss": 3.8236, + "step": 732 + }, + { + "epoch": 0.14689378757515031, + "grad_norm": 57.82558866175055, + "learning_rate": 4.889779559118237e-06, + "loss": 3.9808, + "step": 733 + }, + { + "epoch": 0.1470941883767535, + "grad_norm": 45.71713017931839, + "learning_rate": 4.896459585838344e-06, + "loss": 4.3144, + "step": 734 + }, + { + "epoch": 
0.14729458917835672, + "grad_norm": 47.97675336724236, + "learning_rate": 4.90313961255845e-06, + "loss": 4.0547, + "step": 735 + }, + { + "epoch": 0.1474949899799599, + "grad_norm": 34.64117832718427, + "learning_rate": 4.9098196392785576e-06, + "loss": 4.3204, + "step": 736 + }, + { + "epoch": 0.14769539078156313, + "grad_norm": 48.48163271239898, + "learning_rate": 4.916499665998664e-06, + "loss": 4.5012, + "step": 737 + }, + { + "epoch": 0.14789579158316632, + "grad_norm": 45.659078925174335, + "learning_rate": 4.923179692718771e-06, + "loss": 3.8643, + "step": 738 + }, + { + "epoch": 0.14809619238476954, + "grad_norm": 41.00304845940691, + "learning_rate": 4.929859719438878e-06, + "loss": 4.5705, + "step": 739 + }, + { + "epoch": 0.14829659318637275, + "grad_norm": 66.39080591461992, + "learning_rate": 4.936539746158985e-06, + "loss": 3.9889, + "step": 740 + }, + { + "epoch": 0.14849699398797594, + "grad_norm": 76.09403977345958, + "learning_rate": 4.943219772879092e-06, + "loss": 3.9061, + "step": 741 + }, + { + "epoch": 0.14869739478957916, + "grad_norm": 36.53685911319737, + "learning_rate": 4.949899799599198e-06, + "loss": 3.8148, + "step": 742 + }, + { + "epoch": 0.14889779559118235, + "grad_norm": 53.42764544994598, + "learning_rate": 4.9565798263193056e-06, + "loss": 3.7742, + "step": 743 + }, + { + "epoch": 0.14909819639278557, + "grad_norm": 54.346653723207176, + "learning_rate": 4.963259853039413e-06, + "loss": 3.7822, + "step": 744 + }, + { + "epoch": 0.1492985971943888, + "grad_norm": 57.52661988955763, + "learning_rate": 4.969939879759519e-06, + "loss": 3.6971, + "step": 745 + }, + { + "epoch": 0.14949899799599198, + "grad_norm": 71.98000377650338, + "learning_rate": 4.976619906479626e-06, + "loss": 3.731, + "step": 746 + }, + { + "epoch": 0.1496993987975952, + "grad_norm": 110.60195726972856, + "learning_rate": 4.983299933199734e-06, + "loss": 3.9852, + "step": 747 + }, + { + "epoch": 0.14989979959919839, + "grad_norm": 42.79969150063577, + "learning_rate": 4.98997995991984e-06, + "loss": 4.2848, + "step": 748 + }, + { + "epoch": 0.1501002004008016, + "grad_norm": 48.3974266118484, + "learning_rate": 4.996659986639946e-06, + "loss": 3.8456, + "step": 749 + }, + { + "epoch": 0.15030060120240482, + "grad_norm": 47.820494792027255, + "learning_rate": 5.0033400133600544e-06, + "loss": 4.0677, + "step": 750 + }, + { + "epoch": 0.150501002004008, + "grad_norm": 51.38534890656464, + "learning_rate": 5.01002004008016e-06, + "loss": 4.3264, + "step": 751 + }, + { + "epoch": 0.15070140280561123, + "grad_norm": 48.36119595942643, + "learning_rate": 5.016700066800267e-06, + "loss": 3.871, + "step": 752 + }, + { + "epoch": 0.15090180360721442, + "grad_norm": 70.89805334642034, + "learning_rate": 5.023380093520374e-06, + "loss": 3.7545, + "step": 753 + }, + { + "epoch": 0.15110220440881764, + "grad_norm": 77.2867335593919, + "learning_rate": 5.030060120240481e-06, + "loss": 3.64, + "step": 754 + }, + { + "epoch": 0.15130260521042085, + "grad_norm": 102.66051596899247, + "learning_rate": 5.036740146960588e-06, + "loss": 4.1, + "step": 755 + }, + { + "epoch": 0.15150300601202404, + "grad_norm": 49.85997519344628, + "learning_rate": 5.043420173680695e-06, + "loss": 3.8305, + "step": 756 + }, + { + "epoch": 0.15170340681362726, + "grad_norm": 43.482083744660244, + "learning_rate": 5.050100200400802e-06, + "loss": 4.2117, + "step": 757 + }, + { + "epoch": 0.15190380761523045, + "grad_norm": 51.164325543723585, + "learning_rate": 5.056780227120909e-06, + "loss": 4.2098, + "step": 758 + }, + 
{ + "epoch": 0.15210420841683367, + "grad_norm": 61.36555257631555, + "learning_rate": 5.063460253841016e-06, + "loss": 4.4425, + "step": 759 + }, + { + "epoch": 0.1523046092184369, + "grad_norm": 41.1662310222957, + "learning_rate": 5.0701402805611224e-06, + "loss": 3.8141, + "step": 760 + }, + { + "epoch": 0.15250501002004008, + "grad_norm": 53.24291353529339, + "learning_rate": 5.07682030728123e-06, + "loss": 3.8695, + "step": 761 + }, + { + "epoch": 0.1527054108216433, + "grad_norm": 72.93643013550798, + "learning_rate": 5.083500334001337e-06, + "loss": 4.1962, + "step": 762 + }, + { + "epoch": 0.15290581162324648, + "grad_norm": 37.55340188913551, + "learning_rate": 5.090180360721442e-06, + "loss": 3.7472, + "step": 763 + }, + { + "epoch": 0.1531062124248497, + "grad_norm": 60.51524591857403, + "learning_rate": 5.0968603874415505e-06, + "loss": 4.2838, + "step": 764 + }, + { + "epoch": 0.1533066132264529, + "grad_norm": 60.078603755859895, + "learning_rate": 5.103540414161658e-06, + "loss": 4.1085, + "step": 765 + }, + { + "epoch": 0.1535070140280561, + "grad_norm": 65.5315159920439, + "learning_rate": 5.110220440881763e-06, + "loss": 4.2972, + "step": 766 + }, + { + "epoch": 0.15370741482965933, + "grad_norm": 36.966578637990814, + "learning_rate": 5.1169004676018705e-06, + "loss": 3.6427, + "step": 767 + }, + { + "epoch": 0.15390781563126252, + "grad_norm": 70.43227195149352, + "learning_rate": 5.123580494321978e-06, + "loss": 4.1947, + "step": 768 + }, + { + "epoch": 0.15410821643286574, + "grad_norm": 90.94709686421928, + "learning_rate": 5.130260521042084e-06, + "loss": 4.3804, + "step": 769 + }, + { + "epoch": 0.15430861723446893, + "grad_norm": 32.338417585951554, + "learning_rate": 5.136940547762191e-06, + "loss": 3.6204, + "step": 770 + }, + { + "epoch": 0.15450901803607214, + "grad_norm": 48.14232029705206, + "learning_rate": 5.1436205744822985e-06, + "loss": 3.9296, + "step": 771 + }, + { + "epoch": 0.15470941883767536, + "grad_norm": 38.61527026864001, + "learning_rate": 5.150300601202405e-06, + "loss": 3.9415, + "step": 772 + }, + { + "epoch": 0.15490981963927855, + "grad_norm": 78.39461170666941, + "learning_rate": 5.156980627922512e-06, + "loss": 4.0813, + "step": 773 + }, + { + "epoch": 0.15511022044088177, + "grad_norm": 45.327724925883984, + "learning_rate": 5.163660654642619e-06, + "loss": 4.3589, + "step": 774 + }, + { + "epoch": 0.15531062124248496, + "grad_norm": 89.78143228846503, + "learning_rate": 5.170340681362726e-06, + "loss": 4.2235, + "step": 775 + }, + { + "epoch": 0.15551102204408818, + "grad_norm": 72.59864797833455, + "learning_rate": 5.177020708082833e-06, + "loss": 3.8186, + "step": 776 + }, + { + "epoch": 0.1557114228456914, + "grad_norm": 43.9705029794593, + "learning_rate": 5.18370073480294e-06, + "loss": 3.512, + "step": 777 + }, + { + "epoch": 0.15591182364729458, + "grad_norm": 43.01698537226014, + "learning_rate": 5.1903807615230465e-06, + "loss": 3.9387, + "step": 778 + }, + { + "epoch": 0.1561122244488978, + "grad_norm": 46.66942661339783, + "learning_rate": 5.197060788243154e-06, + "loss": 4.2046, + "step": 779 + }, + { + "epoch": 0.156312625250501, + "grad_norm": 32.47934387147877, + "learning_rate": 5.203740814963261e-06, + "loss": 3.5756, + "step": 780 + }, + { + "epoch": 0.1565130260521042, + "grad_norm": 27.79093454804305, + "learning_rate": 5.2104208416833665e-06, + "loss": 3.9088, + "step": 781 + }, + { + "epoch": 0.15671342685370743, + "grad_norm": 58.61674387037048, + "learning_rate": 5.217100868403474e-06, + "loss": 3.7356, + 
"step": 782 + }, + { + "epoch": 0.15691382765531062, + "grad_norm": 34.03907919534118, + "learning_rate": 5.223780895123581e-06, + "loss": 3.7187, + "step": 783 + }, + { + "epoch": 0.15711422845691383, + "grad_norm": 42.95262461392402, + "learning_rate": 5.230460921843687e-06, + "loss": 3.2992, + "step": 784 + }, + { + "epoch": 0.15731462925851702, + "grad_norm": 47.98574530696508, + "learning_rate": 5.2371409485637946e-06, + "loss": 3.7275, + "step": 785 + }, + { + "epoch": 0.15751503006012024, + "grad_norm": 39.00443993469154, + "learning_rate": 5.243820975283902e-06, + "loss": 3.9386, + "step": 786 + }, + { + "epoch": 0.15771543086172346, + "grad_norm": 92.14776608433039, + "learning_rate": 5.250501002004008e-06, + "loss": 3.9526, + "step": 787 + }, + { + "epoch": 0.15791583166332665, + "grad_norm": 52.29293513697859, + "learning_rate": 5.257181028724115e-06, + "loss": 4.7719, + "step": 788 + }, + { + "epoch": 0.15811623246492987, + "grad_norm": 57.686230763314285, + "learning_rate": 5.263861055444223e-06, + "loss": 4.3571, + "step": 789 + }, + { + "epoch": 0.15831663326653306, + "grad_norm": 46.983106238196434, + "learning_rate": 5.270541082164329e-06, + "loss": 3.5965, + "step": 790 + }, + { + "epoch": 0.15851703406813628, + "grad_norm": 46.746735973739284, + "learning_rate": 5.277221108884436e-06, + "loss": 3.7921, + "step": 791 + }, + { + "epoch": 0.15871743486973947, + "grad_norm": 44.914986056257604, + "learning_rate": 5.283901135604543e-06, + "loss": 4.032, + "step": 792 + }, + { + "epoch": 0.15891783567134268, + "grad_norm": 52.9979304182496, + "learning_rate": 5.29058116232465e-06, + "loss": 4.0455, + "step": 793 + }, + { + "epoch": 0.1591182364729459, + "grad_norm": 55.24928693695172, + "learning_rate": 5.297261189044757e-06, + "loss": 3.4322, + "step": 794 + }, + { + "epoch": 0.1593186372745491, + "grad_norm": 44.756623158155804, + "learning_rate": 5.303941215764864e-06, + "loss": 4.238, + "step": 795 + }, + { + "epoch": 0.1595190380761523, + "grad_norm": 43.95316656478097, + "learning_rate": 5.31062124248497e-06, + "loss": 3.9186, + "step": 796 + }, + { + "epoch": 0.1597194388777555, + "grad_norm": 44.15893935833208, + "learning_rate": 5.317301269205077e-06, + "loss": 4.1429, + "step": 797 + }, + { + "epoch": 0.15991983967935872, + "grad_norm": 50.911986329811455, + "learning_rate": 5.323981295925184e-06, + "loss": 3.7118, + "step": 798 + }, + { + "epoch": 0.16012024048096193, + "grad_norm": 54.56554767430819, + "learning_rate": 5.330661322645291e-06, + "loss": 3.3611, + "step": 799 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 42.098010014351644, + "learning_rate": 5.337341349365398e-06, + "loss": 3.9521, + "step": 800 + }, + { + "epoch": 0.16052104208416834, + "grad_norm": 38.01675899264353, + "learning_rate": 5.344021376085505e-06, + "loss": 3.6498, + "step": 801 + }, + { + "epoch": 0.16072144288577153, + "grad_norm": 44.058735949529506, + "learning_rate": 5.350701402805611e-06, + "loss": 3.9032, + "step": 802 + }, + { + "epoch": 0.16092184368737475, + "grad_norm": 49.81857890389535, + "learning_rate": 5.357381429525719e-06, + "loss": 4.1426, + "step": 803 + }, + { + "epoch": 0.16112224448897797, + "grad_norm": 57.212395517471414, + "learning_rate": 5.364061456245826e-06, + "loss": 3.9516, + "step": 804 + }, + { + "epoch": 0.16132264529058116, + "grad_norm": 40.66446027213756, + "learning_rate": 5.370741482965932e-06, + "loss": 3.6938, + "step": 805 + }, + { + "epoch": 0.16152304609218437, + "grad_norm": 40.35029343409612, + "learning_rate": 
5.3774215096860395e-06, + "loss": 3.5719, + "step": 806 + }, + { + "epoch": 0.16172344689378756, + "grad_norm": 71.62828125703311, + "learning_rate": 5.384101536406146e-06, + "loss": 4.0354, + "step": 807 + }, + { + "epoch": 0.16192384769539078, + "grad_norm": 167.69378038525036, + "learning_rate": 5.390781563126253e-06, + "loss": 4.4861, + "step": 808 + }, + { + "epoch": 0.162124248496994, + "grad_norm": 48.482260048451316, + "learning_rate": 5.39746158984636e-06, + "loss": 3.9517, + "step": 809 + }, + { + "epoch": 0.1623246492985972, + "grad_norm": 76.48565126964816, + "learning_rate": 5.404141616566466e-06, + "loss": 4.5071, + "step": 810 + }, + { + "epoch": 0.1625250501002004, + "grad_norm": 58.69849563337139, + "learning_rate": 5.410821643286573e-06, + "loss": 3.9445, + "step": 811 + }, + { + "epoch": 0.1627254509018036, + "grad_norm": 68.14389131415555, + "learning_rate": 5.417501670006681e-06, + "loss": 4.5134, + "step": 812 + }, + { + "epoch": 0.16292585170340682, + "grad_norm": 71.430401339029, + "learning_rate": 5.424181696726787e-06, + "loss": 4.1685, + "step": 813 + }, + { + "epoch": 0.16312625250501003, + "grad_norm": 45.956622830586554, + "learning_rate": 5.430861723446894e-06, + "loss": 3.7005, + "step": 814 + }, + { + "epoch": 0.16332665330661322, + "grad_norm": 53.87337802093682, + "learning_rate": 5.437541750167001e-06, + "loss": 4.3696, + "step": 815 + }, + { + "epoch": 0.16352705410821644, + "grad_norm": 36.44668564499246, + "learning_rate": 5.4442217768871075e-06, + "loss": 4.1247, + "step": 816 + }, + { + "epoch": 0.16372745490981963, + "grad_norm": 37.831289007955554, + "learning_rate": 5.450901803607215e-06, + "loss": 3.897, + "step": 817 + }, + { + "epoch": 0.16392785571142285, + "grad_norm": 109.82081454334455, + "learning_rate": 5.457581830327322e-06, + "loss": 4.3433, + "step": 818 + }, + { + "epoch": 0.16412825651302604, + "grad_norm": 50.49119733882816, + "learning_rate": 5.464261857047428e-06, + "loss": 4.254, + "step": 819 + }, + { + "epoch": 0.16432865731462926, + "grad_norm": 35.86346061815708, + "learning_rate": 5.4709418837675355e-06, + "loss": 3.5303, + "step": 820 + }, + { + "epoch": 0.16452905811623247, + "grad_norm": 38.24872009419366, + "learning_rate": 5.477621910487643e-06, + "loss": 4.0571, + "step": 821 + }, + { + "epoch": 0.16472945891783566, + "grad_norm": 41.00222747853926, + "learning_rate": 5.484301937207749e-06, + "loss": 3.9851, + "step": 822 + }, + { + "epoch": 0.16492985971943888, + "grad_norm": 44.37284924218675, + "learning_rate": 5.490981963927856e-06, + "loss": 4.2206, + "step": 823 + }, + { + "epoch": 0.16513026052104207, + "grad_norm": 41.69969791765935, + "learning_rate": 5.4976619906479636e-06, + "loss": 3.7158, + "step": 824 + }, + { + "epoch": 0.1653306613226453, + "grad_norm": 40.31764852301056, + "learning_rate": 5.504342017368069e-06, + "loss": 4.0391, + "step": 825 + }, + { + "epoch": 0.1655310621242485, + "grad_norm": 62.97080699383332, + "learning_rate": 5.511022044088177e-06, + "loss": 4.3683, + "step": 826 + }, + { + "epoch": 0.1657314629258517, + "grad_norm": 42.34652264469021, + "learning_rate": 5.517702070808284e-06, + "loss": 4.0944, + "step": 827 + }, + { + "epoch": 0.16593186372745491, + "grad_norm": 39.7156846794454, + "learning_rate": 5.52438209752839e-06, + "loss": 4.1274, + "step": 828 + }, + { + "epoch": 0.1661322645290581, + "grad_norm": 46.39194590522613, + "learning_rate": 5.531062124248497e-06, + "loss": 3.9688, + "step": 829 + }, + { + "epoch": 0.16633266533066132, + "grad_norm": 40.48131001126146, + 
"learning_rate": 5.537742150968604e-06, + "loss": 3.9597, + "step": 830 + }, + { + "epoch": 0.16653306613226454, + "grad_norm": 47.70667654021607, + "learning_rate": 5.544422177688711e-06, + "loss": 4.1469, + "step": 831 + }, + { + "epoch": 0.16673346693386773, + "grad_norm": 49.246860716786806, + "learning_rate": 5.551102204408818e-06, + "loss": 3.8884, + "step": 832 + }, + { + "epoch": 0.16693386773547095, + "grad_norm": 44.89965997207345, + "learning_rate": 5.557782231128925e-06, + "loss": 3.9376, + "step": 833 + }, + { + "epoch": 0.16713426853707414, + "grad_norm": 43.38480671217447, + "learning_rate": 5.5644622578490316e-06, + "loss": 4.5948, + "step": 834 + }, + { + "epoch": 0.16733466933867736, + "grad_norm": 56.00213394461325, + "learning_rate": 5.571142284569139e-06, + "loss": 4.2327, + "step": 835 + }, + { + "epoch": 0.16753507014028057, + "grad_norm": 45.57969340143748, + "learning_rate": 5.577822311289246e-06, + "loss": 3.9482, + "step": 836 + }, + { + "epoch": 0.16773547094188376, + "grad_norm": 53.100521218571394, + "learning_rate": 5.584502338009352e-06, + "loss": 3.9237, + "step": 837 + }, + { + "epoch": 0.16793587174348698, + "grad_norm": 44.0145255560251, + "learning_rate": 5.59118236472946e-06, + "loss": 4.3921, + "step": 838 + }, + { + "epoch": 0.16813627254509017, + "grad_norm": 34.320019083628424, + "learning_rate": 5.597862391449567e-06, + "loss": 3.8328, + "step": 839 + }, + { + "epoch": 0.1683366733466934, + "grad_norm": 61.77596139091738, + "learning_rate": 5.604542418169673e-06, + "loss": 4.6413, + "step": 840 + }, + { + "epoch": 0.1685370741482966, + "grad_norm": 39.01749173289854, + "learning_rate": 5.6112224448897804e-06, + "loss": 3.9529, + "step": 841 + }, + { + "epoch": 0.1687374749498998, + "grad_norm": 111.50413505776771, + "learning_rate": 5.617902471609888e-06, + "loss": 3.896, + "step": 842 + }, + { + "epoch": 0.168937875751503, + "grad_norm": 71.55622829466455, + "learning_rate": 5.624582498329993e-06, + "loss": 4.5812, + "step": 843 + }, + { + "epoch": 0.1691382765531062, + "grad_norm": 43.72843566185143, + "learning_rate": 5.6312625250501e-06, + "loss": 4.2552, + "step": 844 + }, + { + "epoch": 0.16933867735470942, + "grad_norm": 43.66771831055134, + "learning_rate": 5.637942551770208e-06, + "loss": 3.8128, + "step": 845 + }, + { + "epoch": 0.1695390781563126, + "grad_norm": 39.16928140261099, + "learning_rate": 5.644622578490314e-06, + "loss": 4.242, + "step": 846 + }, + { + "epoch": 0.16973947895791583, + "grad_norm": 42.046532931182476, + "learning_rate": 5.651302605210421e-06, + "loss": 3.9025, + "step": 847 + }, + { + "epoch": 0.16993987975951905, + "grad_norm": 34.8287811626661, + "learning_rate": 5.6579826319305284e-06, + "loss": 3.5105, + "step": 848 + }, + { + "epoch": 0.17014028056112224, + "grad_norm": 40.81525075370976, + "learning_rate": 5.664662658650635e-06, + "loss": 3.6763, + "step": 849 + }, + { + "epoch": 0.17034068136272545, + "grad_norm": 38.799203307840095, + "learning_rate": 5.671342685370742e-06, + "loss": 3.499, + "step": 850 + }, + { + "epoch": 0.17054108216432864, + "grad_norm": 74.22327710794924, + "learning_rate": 5.678022712090849e-06, + "loss": 4.2306, + "step": 851 + }, + { + "epoch": 0.17074148296593186, + "grad_norm": 76.41835445403355, + "learning_rate": 5.684702738810956e-06, + "loss": 4.1546, + "step": 852 + }, + { + "epoch": 0.17094188376753508, + "grad_norm": 63.54983225853726, + "learning_rate": 5.691382765531063e-06, + "loss": 3.8415, + "step": 853 + }, + { + "epoch": 0.17114228456913827, + "grad_norm": 
48.78034552269204, + "learning_rate": 5.69806279225117e-06, + "loss": 4.0836, + "step": 854 + }, + { + "epoch": 0.1713426853707415, + "grad_norm": 100.19365332836041, + "learning_rate": 5.7047428189712765e-06, + "loss": 4.0926, + "step": 855 + }, + { + "epoch": 0.17154308617234468, + "grad_norm": 51.33689312479, + "learning_rate": 5.711422845691384e-06, + "loss": 3.7593, + "step": 856 + }, + { + "epoch": 0.1717434869739479, + "grad_norm": 36.812853828434065, + "learning_rate": 5.718102872411491e-06, + "loss": 4.2604, + "step": 857 + }, + { + "epoch": 0.1719438877755511, + "grad_norm": 138.83668835699197, + "learning_rate": 5.7247828991315964e-06, + "loss": 4.561, + "step": 858 + }, + { + "epoch": 0.1721442885771543, + "grad_norm": 32.821257088274734, + "learning_rate": 5.731462925851704e-06, + "loss": 3.8463, + "step": 859 + }, + { + "epoch": 0.17234468937875752, + "grad_norm": 43.19507860667376, + "learning_rate": 5.738142952571811e-06, + "loss": 4.4926, + "step": 860 + }, + { + "epoch": 0.1725450901803607, + "grad_norm": 33.13224595679116, + "learning_rate": 5.744822979291917e-06, + "loss": 3.6563, + "step": 861 + }, + { + "epoch": 0.17274549098196393, + "grad_norm": 60.67791374933684, + "learning_rate": 5.7515030060120245e-06, + "loss": 4.2368, + "step": 862 + }, + { + "epoch": 0.17294589178356715, + "grad_norm": 33.08393416763506, + "learning_rate": 5.758183032732131e-06, + "loss": 3.7529, + "step": 863 + }, + { + "epoch": 0.17314629258517034, + "grad_norm": 30.763929700069156, + "learning_rate": 5.764863059452238e-06, + "loss": 3.6721, + "step": 864 + }, + { + "epoch": 0.17334669338677355, + "grad_norm": 101.36969445544914, + "learning_rate": 5.771543086172345e-06, + "loss": 4.1286, + "step": 865 + }, + { + "epoch": 0.17354709418837674, + "grad_norm": 41.97627384955911, + "learning_rate": 5.778223112892452e-06, + "loss": 3.6469, + "step": 866 + }, + { + "epoch": 0.17374749498997996, + "grad_norm": 47.11988360269636, + "learning_rate": 5.784903139612559e-06, + "loss": 3.9685, + "step": 867 + }, + { + "epoch": 0.17394789579158318, + "grad_norm": 30.80846678412594, + "learning_rate": 5.791583166332666e-06, + "loss": 3.0832, + "step": 868 + }, + { + "epoch": 0.17414829659318637, + "grad_norm": 66.60604022372058, + "learning_rate": 5.7982631930527725e-06, + "loss": 3.779, + "step": 869 + }, + { + "epoch": 0.1743486973947896, + "grad_norm": 74.49576106586511, + "learning_rate": 5.80494321977288e-06, + "loss": 4.0501, + "step": 870 + }, + { + "epoch": 0.17454909819639278, + "grad_norm": 39.875323315855006, + "learning_rate": 5.811623246492987e-06, + "loss": 3.6062, + "step": 871 + }, + { + "epoch": 0.174749498997996, + "grad_norm": 37.96664369810909, + "learning_rate": 5.8183032732130925e-06, + "loss": 4.0273, + "step": 872 + }, + { + "epoch": 0.1749498997995992, + "grad_norm": 35.58396550528713, + "learning_rate": 5.8249832999332e-06, + "loss": 3.9997, + "step": 873 + }, + { + "epoch": 0.1751503006012024, + "grad_norm": 48.701729842999214, + "learning_rate": 5.831663326653308e-06, + "loss": 4.102, + "step": 874 + }, + { + "epoch": 0.17535070140280562, + "grad_norm": 56.64663618274882, + "learning_rate": 5.838343353373413e-06, + "loss": 4.1978, + "step": 875 + }, + { + "epoch": 0.1755511022044088, + "grad_norm": 53.128199787810466, + "learning_rate": 5.8450233800935205e-06, + "loss": 3.7827, + "step": 876 + }, + { + "epoch": 0.17575150300601203, + "grad_norm": 102.60652471967975, + "learning_rate": 5.851703406813628e-06, + "loss": 4.6363, + "step": 877 + }, + { + "epoch": 
0.17595190380761522, + "grad_norm": 47.536824298930284, + "learning_rate": 5.858383433533734e-06, + "loss": 4.1499, + "step": 878 + }, + { + "epoch": 0.17615230460921844, + "grad_norm": 58.674017357072266, + "learning_rate": 5.865063460253841e-06, + "loss": 4.4062, + "step": 879 + }, + { + "epoch": 0.17635270541082165, + "grad_norm": 44.4869299082284, + "learning_rate": 5.871743486973949e-06, + "loss": 3.4945, + "step": 880 + }, + { + "epoch": 0.17655310621242484, + "grad_norm": 47.84615121525859, + "learning_rate": 5.878423513694055e-06, + "loss": 3.949, + "step": 881 + }, + { + "epoch": 0.17675350701402806, + "grad_norm": 44.02684090052903, + "learning_rate": 5.885103540414162e-06, + "loss": 4.319, + "step": 882 + }, + { + "epoch": 0.17695390781563125, + "grad_norm": 32.984297290583186, + "learning_rate": 5.891783567134269e-06, + "loss": 4.0232, + "step": 883 + }, + { + "epoch": 0.17715430861723447, + "grad_norm": 48.48348366441968, + "learning_rate": 5.898463593854376e-06, + "loss": 4.0508, + "step": 884 + }, + { + "epoch": 0.17735470941883769, + "grad_norm": 44.57930330364495, + "learning_rate": 5.905143620574483e-06, + "loss": 3.8104, + "step": 885 + }, + { + "epoch": 0.17755511022044088, + "grad_norm": 38.527754961529865, + "learning_rate": 5.91182364729459e-06, + "loss": 4.023, + "step": 886 + }, + { + "epoch": 0.1777555110220441, + "grad_norm": 34.98635454146358, + "learning_rate": 5.918503674014696e-06, + "loss": 4.3353, + "step": 887 + }, + { + "epoch": 0.17795591182364728, + "grad_norm": 62.10203163732273, + "learning_rate": 5.925183700734804e-06, + "loss": 3.8765, + "step": 888 + }, + { + "epoch": 0.1781563126252505, + "grad_norm": 38.38744737514561, + "learning_rate": 5.931863727454911e-06, + "loss": 3.9857, + "step": 889 + }, + { + "epoch": 0.17835671342685372, + "grad_norm": 41.295813460045196, + "learning_rate": 5.938543754175017e-06, + "loss": 3.4205, + "step": 890 + }, + { + "epoch": 0.1785571142284569, + "grad_norm": 43.64197961439921, + "learning_rate": 5.945223780895124e-06, + "loss": 3.6858, + "step": 891 + }, + { + "epoch": 0.17875751503006013, + "grad_norm": 60.65878396996693, + "learning_rate": 5.951903807615231e-06, + "loss": 4.1345, + "step": 892 + }, + { + "epoch": 0.17895791583166332, + "grad_norm": 34.82891089292426, + "learning_rate": 5.958583834335337e-06, + "loss": 3.7729, + "step": 893 + }, + { + "epoch": 0.17915831663326653, + "grad_norm": 38.84363613326236, + "learning_rate": 5.965263861055445e-06, + "loss": 4.351, + "step": 894 + }, + { + "epoch": 0.17935871743486975, + "grad_norm": 63.584290077013414, + "learning_rate": 5.971943887775552e-06, + "loss": 4.1209, + "step": 895 + }, + { + "epoch": 0.17955911823647294, + "grad_norm": 38.37884776165155, + "learning_rate": 5.978623914495658e-06, + "loss": 4.42, + "step": 896 + }, + { + "epoch": 0.17975951903807616, + "grad_norm": 57.98743313547058, + "learning_rate": 5.9853039412157654e-06, + "loss": 3.961, + "step": 897 + }, + { + "epoch": 0.17995991983967935, + "grad_norm": 52.440605195806945, + "learning_rate": 5.991983967935873e-06, + "loss": 4.0661, + "step": 898 + }, + { + "epoch": 0.18016032064128257, + "grad_norm": 45.15414918531459, + "learning_rate": 5.998663994655979e-06, + "loss": 4.0511, + "step": 899 + }, + { + "epoch": 0.18036072144288579, + "grad_norm": 30.862709308187565, + "learning_rate": 6.005344021376086e-06, + "loss": 3.771, + "step": 900 + }, + { + "epoch": 0.18056112224448898, + "grad_norm": 45.17914866005745, + "learning_rate": 6.0120240480961935e-06, + "loss": 4.0198, + "step": 901 + 
}, + { + "epoch": 0.1807615230460922, + "grad_norm": 50.72895198267893, + "learning_rate": 6.0187040748163e-06, + "loss": 4.6121, + "step": 902 + }, + { + "epoch": 0.18096192384769538, + "grad_norm": 42.96305567481802, + "learning_rate": 6.025384101536407e-06, + "loss": 3.7632, + "step": 903 + }, + { + "epoch": 0.1811623246492986, + "grad_norm": 42.65165064724406, + "learning_rate": 6.032064128256514e-06, + "loss": 3.6709, + "step": 904 + }, + { + "epoch": 0.1813627254509018, + "grad_norm": 59.29920165204637, + "learning_rate": 6.03874415497662e-06, + "loss": 4.1482, + "step": 905 + }, + { + "epoch": 0.181563126252505, + "grad_norm": 37.67071311642579, + "learning_rate": 6.045424181696727e-06, + "loss": 4.1003, + "step": 906 + }, + { + "epoch": 0.18176352705410823, + "grad_norm": 121.68931996371344, + "learning_rate": 6.052104208416834e-06, + "loss": 4.2119, + "step": 907 + }, + { + "epoch": 0.18196392785571142, + "grad_norm": 58.52946926542667, + "learning_rate": 6.058784235136941e-06, + "loss": 4.0013, + "step": 908 + }, + { + "epoch": 0.18216432865731463, + "grad_norm": 76.40905842266677, + "learning_rate": 6.065464261857048e-06, + "loss": 4.2348, + "step": 909 + }, + { + "epoch": 0.18236472945891782, + "grad_norm": 60.50028224457465, + "learning_rate": 6.072144288577155e-06, + "loss": 4.2532, + "step": 910 + }, + { + "epoch": 0.18256513026052104, + "grad_norm": 50.1570209423741, + "learning_rate": 6.0788243152972615e-06, + "loss": 3.9259, + "step": 911 + }, + { + "epoch": 0.18276553106212426, + "grad_norm": 50.58177962613364, + "learning_rate": 6.085504342017369e-06, + "loss": 3.9203, + "step": 912 + }, + { + "epoch": 0.18296593186372745, + "grad_norm": 53.789011533761915, + "learning_rate": 6.092184368737476e-06, + "loss": 4.156, + "step": 913 + }, + { + "epoch": 0.18316633266533067, + "grad_norm": 31.333626334795976, + "learning_rate": 6.098864395457582e-06, + "loss": 3.7424, + "step": 914 + }, + { + "epoch": 0.18336673346693386, + "grad_norm": 34.9484055699747, + "learning_rate": 6.1055444221776895e-06, + "loss": 3.4186, + "step": 915 + }, + { + "epoch": 0.18356713426853707, + "grad_norm": 65.56957095412925, + "learning_rate": 6.112224448897796e-06, + "loss": 4.0444, + "step": 916 + }, + { + "epoch": 0.1837675350701403, + "grad_norm": 106.66753590004821, + "learning_rate": 6.118904475617903e-06, + "loss": 3.949, + "step": 917 + }, + { + "epoch": 0.18396793587174348, + "grad_norm": 41.399322760389424, + "learning_rate": 6.12558450233801e-06, + "loss": 4.1142, + "step": 918 + }, + { + "epoch": 0.1841683366733467, + "grad_norm": 46.432546173167836, + "learning_rate": 6.132264529058116e-06, + "loss": 3.9655, + "step": 919 + }, + { + "epoch": 0.1843687374749499, + "grad_norm": 45.367577093541556, + "learning_rate": 6.138944555778223e-06, + "loss": 4.0823, + "step": 920 + }, + { + "epoch": 0.1845691382765531, + "grad_norm": 49.33427561914852, + "learning_rate": 6.14562458249833e-06, + "loss": 4.0855, + "step": 921 + }, + { + "epoch": 0.18476953907815633, + "grad_norm": 31.074529054946254, + "learning_rate": 6.152304609218437e-06, + "loss": 3.8583, + "step": 922 + }, + { + "epoch": 0.18496993987975952, + "grad_norm": 70.47317624133699, + "learning_rate": 6.158984635938544e-06, + "loss": 4.7384, + "step": 923 + }, + { + "epoch": 0.18517034068136273, + "grad_norm": 49.27940743725742, + "learning_rate": 6.165664662658651e-06, + "loss": 4.3181, + "step": 924 + }, + { + "epoch": 0.18537074148296592, + "grad_norm": 45.97916774018853, + "learning_rate": 6.1723446893787575e-06, + "loss": 3.8139, + 
"step": 925 + }, + { + "epoch": 0.18557114228456914, + "grad_norm": 39.08562666559791, + "learning_rate": 6.179024716098865e-06, + "loss": 4.3268, + "step": 926 + }, + { + "epoch": 0.18577154308617236, + "grad_norm": 48.267521642574835, + "learning_rate": 6.185704742818972e-06, + "loss": 4.1207, + "step": 927 + }, + { + "epoch": 0.18597194388777555, + "grad_norm": 68.48078553190008, + "learning_rate": 6.192384769539078e-06, + "loss": 3.8985, + "step": 928 + }, + { + "epoch": 0.18617234468937877, + "grad_norm": 68.67883372127282, + "learning_rate": 6.199064796259186e-06, + "loss": 4.0792, + "step": 929 + }, + { + "epoch": 0.18637274549098196, + "grad_norm": 30.86803857402922, + "learning_rate": 6.205744822979293e-06, + "loss": 4.2406, + "step": 930 + }, + { + "epoch": 0.18657314629258517, + "grad_norm": 45.63748466518368, + "learning_rate": 6.212424849699399e-06, + "loss": 3.617, + "step": 931 + }, + { + "epoch": 0.18677354709418836, + "grad_norm": 32.58948155754732, + "learning_rate": 6.219104876419506e-06, + "loss": 3.9412, + "step": 932 + }, + { + "epoch": 0.18697394789579158, + "grad_norm": 41.88748955487754, + "learning_rate": 6.225784903139614e-06, + "loss": 3.7892, + "step": 933 + }, + { + "epoch": 0.1871743486973948, + "grad_norm": 55.741553216305704, + "learning_rate": 6.232464929859719e-06, + "loss": 4.3662, + "step": 934 + }, + { + "epoch": 0.187374749498998, + "grad_norm": 38.34324418196921, + "learning_rate": 6.239144956579826e-06, + "loss": 3.7746, + "step": 935 + }, + { + "epoch": 0.1875751503006012, + "grad_norm": 43.305574542395995, + "learning_rate": 6.2458249832999344e-06, + "loss": 4.4813, + "step": 936 + }, + { + "epoch": 0.1877755511022044, + "grad_norm": 28.30697474017263, + "learning_rate": 6.25250501002004e-06, + "loss": 3.6928, + "step": 937 + }, + { + "epoch": 0.18797595190380761, + "grad_norm": 45.9684442858821, + "learning_rate": 6.259185036740147e-06, + "loss": 4.4684, + "step": 938 + }, + { + "epoch": 0.18817635270541083, + "grad_norm": 30.995900692549366, + "learning_rate": 6.2658650634602544e-06, + "loss": 4.0344, + "step": 939 + }, + { + "epoch": 0.18837675350701402, + "grad_norm": 23.412594004236006, + "learning_rate": 6.272545090180361e-06, + "loss": 3.7677, + "step": 940 + }, + { + "epoch": 0.18857715430861724, + "grad_norm": 44.21714038666888, + "learning_rate": 6.279225116900468e-06, + "loss": 4.0271, + "step": 941 + }, + { + "epoch": 0.18877755511022043, + "grad_norm": 78.97590159511483, + "learning_rate": 6.285905143620575e-06, + "loss": 4.0976, + "step": 942 + }, + { + "epoch": 0.18897795591182365, + "grad_norm": 33.357614949584764, + "learning_rate": 6.292585170340682e-06, + "loss": 3.7288, + "step": 943 + }, + { + "epoch": 0.18917835671342687, + "grad_norm": 36.62263943334311, + "learning_rate": 6.299265197060789e-06, + "loss": 4.0657, + "step": 944 + }, + { + "epoch": 0.18937875751503006, + "grad_norm": 49.12598427148571, + "learning_rate": 6.305945223780896e-06, + "loss": 4.2716, + "step": 945 + }, + { + "epoch": 0.18957915831663327, + "grad_norm": 56.91618559415642, + "learning_rate": 6.3126252505010024e-06, + "loss": 4.3992, + "step": 946 + }, + { + "epoch": 0.18977955911823646, + "grad_norm": 67.01958721360728, + "learning_rate": 6.31930527722111e-06, + "loss": 4.0209, + "step": 947 + }, + { + "epoch": 0.18997995991983968, + "grad_norm": 54.07991912889618, + "learning_rate": 6.325985303941217e-06, + "loss": 4.1775, + "step": 948 + }, + { + "epoch": 0.1901803607214429, + "grad_norm": 68.07798575636654, + "learning_rate": 6.332665330661322e-06, 
+ "loss": 4.2557, + "step": 949 + }, + { + "epoch": 0.1903807615230461, + "grad_norm": 36.58304264673822, + "learning_rate": 6.3393453573814305e-06, + "loss": 3.9681, + "step": 950 + }, + { + "epoch": 0.1905811623246493, + "grad_norm": 50.19037552884816, + "learning_rate": 6.346025384101538e-06, + "loss": 4.7873, + "step": 951 + }, + { + "epoch": 0.1907815631262525, + "grad_norm": 33.73416007451519, + "learning_rate": 6.352705410821643e-06, + "loss": 3.8056, + "step": 952 + }, + { + "epoch": 0.1909819639278557, + "grad_norm": 39.52269684951649, + "learning_rate": 6.3593854375417505e-06, + "loss": 4.1633, + "step": 953 + }, + { + "epoch": 0.19118236472945893, + "grad_norm": 61.28376046229104, + "learning_rate": 6.366065464261858e-06, + "loss": 4.462, + "step": 954 + }, + { + "epoch": 0.19138276553106212, + "grad_norm": 108.99990975760991, + "learning_rate": 6.372745490981964e-06, + "loss": 4.15, + "step": 955 + }, + { + "epoch": 0.19158316633266534, + "grad_norm": 53.997348881069854, + "learning_rate": 6.379425517702071e-06, + "loss": 4.4717, + "step": 956 + }, + { + "epoch": 0.19178356713426853, + "grad_norm": 35.09210206061957, + "learning_rate": 6.3861055444221785e-06, + "loss": 3.9019, + "step": 957 + }, + { + "epoch": 0.19198396793587175, + "grad_norm": 57.79986714996263, + "learning_rate": 6.392785571142285e-06, + "loss": 3.7831, + "step": 958 + }, + { + "epoch": 0.19218436873747494, + "grad_norm": 24.265384526529026, + "learning_rate": 6.399465597862392e-06, + "loss": 3.7009, + "step": 959 + }, + { + "epoch": 0.19238476953907815, + "grad_norm": 62.11760133845296, + "learning_rate": 6.406145624582499e-06, + "loss": 4.0837, + "step": 960 + }, + { + "epoch": 0.19258517034068137, + "grad_norm": 42.13017577819554, + "learning_rate": 6.412825651302606e-06, + "loss": 3.9153, + "step": 961 + }, + { + "epoch": 0.19278557114228456, + "grad_norm": 39.30501928582028, + "learning_rate": 6.419505678022713e-06, + "loss": 3.9489, + "step": 962 + }, + { + "epoch": 0.19298597194388778, + "grad_norm": 35.63024116621252, + "learning_rate": 6.42618570474282e-06, + "loss": 4.1788, + "step": 963 + }, + { + "epoch": 0.19318637274549097, + "grad_norm": 50.81053241282147, + "learning_rate": 6.4328657314629265e-06, + "loss": 4.0994, + "step": 964 + }, + { + "epoch": 0.1933867735470942, + "grad_norm": 39.24714482906844, + "learning_rate": 6.439545758183034e-06, + "loss": 4.245, + "step": 965 + }, + { + "epoch": 0.1935871743486974, + "grad_norm": 49.446492368385925, + "learning_rate": 6.446225784903141e-06, + "loss": 3.7904, + "step": 966 + }, + { + "epoch": 0.1937875751503006, + "grad_norm": 99.65931934328744, + "learning_rate": 6.4529058116232465e-06, + "loss": 4.6793, + "step": 967 + }, + { + "epoch": 0.1939879759519038, + "grad_norm": 44.3221544334407, + "learning_rate": 6.459585838343354e-06, + "loss": 3.8897, + "step": 968 + }, + { + "epoch": 0.194188376753507, + "grad_norm": 42.96804344929117, + "learning_rate": 6.466265865063461e-06, + "loss": 4.1953, + "step": 969 + }, + { + "epoch": 0.19438877755511022, + "grad_norm": 39.76957401232256, + "learning_rate": 6.472945891783567e-06, + "loss": 3.4631, + "step": 970 + }, + { + "epoch": 0.19458917835671344, + "grad_norm": 30.60880057232533, + "learning_rate": 6.4796259185036746e-06, + "loss": 3.6972, + "step": 971 + }, + { + "epoch": 0.19478957915831663, + "grad_norm": 54.87617602764357, + "learning_rate": 6.486305945223781e-06, + "loss": 4.2072, + "step": 972 + }, + { + "epoch": 0.19498997995991985, + "grad_norm": 73.37800252041613, + "learning_rate": 
6.492985971943888e-06, + "loss": 4.3285, + "step": 973 + }, + { + "epoch": 0.19519038076152304, + "grad_norm": 40.147022243948165, + "learning_rate": 6.499665998663995e-06, + "loss": 3.8455, + "step": 974 + }, + { + "epoch": 0.19539078156312625, + "grad_norm": 45.304811960825845, + "learning_rate": 6.506346025384102e-06, + "loss": 3.9276, + "step": 975 + }, + { + "epoch": 0.19559118236472947, + "grad_norm": 53.43443634976777, + "learning_rate": 6.513026052104209e-06, + "loss": 4.0035, + "step": 976 + }, + { + "epoch": 0.19579158316633266, + "grad_norm": 40.587877083376874, + "learning_rate": 6.519706078824316e-06, + "loss": 4.0409, + "step": 977 + }, + { + "epoch": 0.19599198396793588, + "grad_norm": 39.804057996066845, + "learning_rate": 6.526386105544423e-06, + "loss": 4.7201, + "step": 978 + }, + { + "epoch": 0.19619238476953907, + "grad_norm": 36.698064144563396, + "learning_rate": 6.53306613226453e-06, + "loss": 3.9046, + "step": 979 + }, + { + "epoch": 0.1963927855711423, + "grad_norm": 61.47590738085851, + "learning_rate": 6.539746158984637e-06, + "loss": 4.6309, + "step": 980 + }, + { + "epoch": 0.1965931863727455, + "grad_norm": 45.8213543638033, + "learning_rate": 6.5464261857047426e-06, + "loss": 4.9337, + "step": 981 + }, + { + "epoch": 0.1967935871743487, + "grad_norm": 41.13695924487937, + "learning_rate": 6.55310621242485e-06, + "loss": 4.614, + "step": 982 + }, + { + "epoch": 0.1969939879759519, + "grad_norm": 51.008812252870975, + "learning_rate": 6.559786239144957e-06, + "loss": 4.1691, + "step": 983 + }, + { + "epoch": 0.1971943887775551, + "grad_norm": 36.87809425759383, + "learning_rate": 6.566466265865063e-06, + "loss": 4.0343, + "step": 984 + }, + { + "epoch": 0.19739478957915832, + "grad_norm": 116.0181301751056, + "learning_rate": 6.573146292585171e-06, + "loss": 3.9457, + "step": 985 + }, + { + "epoch": 0.1975951903807615, + "grad_norm": 28.106583833802944, + "learning_rate": 6.579826319305278e-06, + "loss": 3.8525, + "step": 986 + }, + { + "epoch": 0.19779559118236473, + "grad_norm": 39.551132360095444, + "learning_rate": 6.586506346025384e-06, + "loss": 3.5848, + "step": 987 + }, + { + "epoch": 0.19799599198396794, + "grad_norm": 29.487077538519014, + "learning_rate": 6.5931863727454914e-06, + "loss": 3.9399, + "step": 988 + }, + { + "epoch": 0.19819639278557113, + "grad_norm": 31.241207419922187, + "learning_rate": 6.599866399465599e-06, + "loss": 3.6176, + "step": 989 + }, + { + "epoch": 0.19839679358717435, + "grad_norm": 45.437254574460404, + "learning_rate": 6.606546426185705e-06, + "loss": 3.982, + "step": 990 + }, + { + "epoch": 0.19859719438877754, + "grad_norm": 144.3059788159311, + "learning_rate": 6.613226452905812e-06, + "loss": 4.319, + "step": 991 + }, + { + "epoch": 0.19879759519038076, + "grad_norm": 45.498028897021804, + "learning_rate": 6.6199064796259195e-06, + "loss": 4.2375, + "step": 992 + }, + { + "epoch": 0.19899799599198398, + "grad_norm": 61.14485357105067, + "learning_rate": 6.626586506346026e-06, + "loss": 3.7169, + "step": 993 + }, + { + "epoch": 0.19919839679358717, + "grad_norm": 41.510932713644266, + "learning_rate": 6.633266533066133e-06, + "loss": 3.4188, + "step": 994 + }, + { + "epoch": 0.19939879759519039, + "grad_norm": 33.68719489290033, + "learning_rate": 6.63994655978624e-06, + "loss": 3.9141, + "step": 995 + }, + { + "epoch": 0.19959919839679358, + "grad_norm": 99.60585054578357, + "learning_rate": 6.646626586506346e-06, + "loss": 4.7944, + "step": 996 + }, + { + "epoch": 0.1997995991983968, + "grad_norm": 
37.28099351365737, + "learning_rate": 6.653306613226453e-06, + "loss": 4.5588, + "step": 997 + }, + { + "epoch": 0.2, + "grad_norm": 59.12684396489448, + "learning_rate": 6.65998663994656e-06, + "loss": 4.1078, + "step": 998 + }, + { + "epoch": 0.2002004008016032, + "grad_norm": 47.53370076348628, + "learning_rate": 6.666666666666667e-06, + "loss": 4.1387, + "step": 999 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 40.56350420758383, + "learning_rate": 6.673346693386774e-06, + "loss": 3.9491, + "step": 1000 + }, + { + "epoch": 0.2006012024048096, + "grad_norm": 50.94731305033708, + "learning_rate": 6.680026720106881e-06, + "loss": 4.2066, + "step": 1001 + }, + { + "epoch": 0.20080160320641283, + "grad_norm": 34.55206954318056, + "learning_rate": 6.6867067468269875e-06, + "loss": 3.8416, + "step": 1002 + }, + { + "epoch": 0.20100200400801604, + "grad_norm": 51.031493647220046, + "learning_rate": 6.693386773547095e-06, + "loss": 4.9277, + "step": 1003 + }, + { + "epoch": 0.20120240480961923, + "grad_norm": 31.16529253847505, + "learning_rate": 6.700066800267202e-06, + "loss": 3.683, + "step": 1004 + }, + { + "epoch": 0.20140280561122245, + "grad_norm": 66.37226670810037, + "learning_rate": 6.706746826987308e-06, + "loss": 3.3966, + "step": 1005 + }, + { + "epoch": 0.20160320641282564, + "grad_norm": 101.65950011509109, + "learning_rate": 6.7134268537074155e-06, + "loss": 4.1282, + "step": 1006 + }, + { + "epoch": 0.20180360721442886, + "grad_norm": 49.771699573597154, + "learning_rate": 6.720106880427523e-06, + "loss": 4.1711, + "step": 1007 + }, + { + "epoch": 0.20200400801603208, + "grad_norm": 42.93660059131203, + "learning_rate": 6.726786907147629e-06, + "loss": 3.9502, + "step": 1008 + }, + { + "epoch": 0.20220440881763527, + "grad_norm": 188.08886079633132, + "learning_rate": 6.733466933867736e-06, + "loss": 3.6282, + "step": 1009 + }, + { + "epoch": 0.20240480961923848, + "grad_norm": 33.7091132427258, + "learning_rate": 6.7401469605878436e-06, + "loss": 4.5402, + "step": 1010 + }, + { + "epoch": 0.20260521042084167, + "grad_norm": 81.90274101925041, + "learning_rate": 6.746826987307949e-06, + "loss": 4.4407, + "step": 1011 + }, + { + "epoch": 0.2028056112224449, + "grad_norm": 69.82050760048146, + "learning_rate": 6.753507014028057e-06, + "loss": 4.669, + "step": 1012 + }, + { + "epoch": 0.20300601202404808, + "grad_norm": 38.61243028825191, + "learning_rate": 6.760187040748164e-06, + "loss": 4.3207, + "step": 1013 + }, + { + "epoch": 0.2032064128256513, + "grad_norm": 42.234623858311785, + "learning_rate": 6.76686706746827e-06, + "loss": 4.1183, + "step": 1014 + }, + { + "epoch": 0.20340681362725452, + "grad_norm": 41.240154287831544, + "learning_rate": 6.773547094188377e-06, + "loss": 3.6383, + "step": 1015 + }, + { + "epoch": 0.2036072144288577, + "grad_norm": 46.39631330294213, + "learning_rate": 6.780227120908484e-06, + "loss": 3.8951, + "step": 1016 + }, + { + "epoch": 0.20380761523046093, + "grad_norm": 49.22159275259354, + "learning_rate": 6.786907147628591e-06, + "loss": 3.7922, + "step": 1017 + }, + { + "epoch": 0.20400801603206412, + "grad_norm": 50.62239354760117, + "learning_rate": 6.793587174348698e-06, + "loss": 3.6923, + "step": 1018 + }, + { + "epoch": 0.20420841683366733, + "grad_norm": 42.09527010635107, + "learning_rate": 6.800267201068805e-06, + "loss": 3.6601, + "step": 1019 + }, + { + "epoch": 0.20440881763527055, + "grad_norm": 47.561567320887285, + "learning_rate": 6.8069472277889116e-06, + "loss": 3.6463, + "step": 1020 + }, + { + "epoch": 
0.20460921843687374, + "grad_norm": 33.362882552132724, + "learning_rate": 6.813627254509019e-06, + "loss": 4.1022, + "step": 1021 + }, + { + "epoch": 0.20480961923847696, + "grad_norm": 63.360784157639536, + "learning_rate": 6.820307281229126e-06, + "loss": 3.7596, + "step": 1022 + }, + { + "epoch": 0.20501002004008015, + "grad_norm": 42.532982954634576, + "learning_rate": 6.826987307949232e-06, + "loss": 4.737, + "step": 1023 + }, + { + "epoch": 0.20521042084168337, + "grad_norm": 35.70104261590352, + "learning_rate": 6.83366733466934e-06, + "loss": 3.8359, + "step": 1024 + }, + { + "epoch": 0.20541082164328658, + "grad_norm": 42.33270557752073, + "learning_rate": 6.840347361389447e-06, + "loss": 4.2945, + "step": 1025 + }, + { + "epoch": 0.20561122244488977, + "grad_norm": 44.59732596567752, + "learning_rate": 6.847027388109553e-06, + "loss": 4.1957, + "step": 1026 + }, + { + "epoch": 0.205811623246493, + "grad_norm": 27.483418188545446, + "learning_rate": 6.8537074148296604e-06, + "loss": 3.9378, + "step": 1027 + }, + { + "epoch": 0.20601202404809618, + "grad_norm": 47.99399009161505, + "learning_rate": 6.860387441549766e-06, + "loss": 4.3767, + "step": 1028 + }, + { + "epoch": 0.2062124248496994, + "grad_norm": 37.22375211372046, + "learning_rate": 6.867067468269873e-06, + "loss": 4.002, + "step": 1029 + }, + { + "epoch": 0.20641282565130262, + "grad_norm": 47.03292171364008, + "learning_rate": 6.87374749498998e-06, + "loss": 3.7182, + "step": 1030 + }, + { + "epoch": 0.2066132264529058, + "grad_norm": 29.53711610079696, + "learning_rate": 6.880427521710087e-06, + "loss": 3.9423, + "step": 1031 + }, + { + "epoch": 0.20681362725450902, + "grad_norm": 30.747774944841918, + "learning_rate": 6.887107548430194e-06, + "loss": 4.085, + "step": 1032 + }, + { + "epoch": 0.20701402805611221, + "grad_norm": 54.97736930126935, + "learning_rate": 6.893787575150301e-06, + "loss": 3.6091, + "step": 1033 + }, + { + "epoch": 0.20721442885771543, + "grad_norm": 48.552275821195295, + "learning_rate": 6.900467601870408e-06, + "loss": 3.9163, + "step": 1034 + }, + { + "epoch": 0.20741482965931865, + "grad_norm": 35.23295491761421, + "learning_rate": 6.907147628590515e-06, + "loss": 4.3395, + "step": 1035 + }, + { + "epoch": 0.20761523046092184, + "grad_norm": 63.716695022002824, + "learning_rate": 6.913827655310622e-06, + "loss": 4.3049, + "step": 1036 + }, + { + "epoch": 0.20781563126252506, + "grad_norm": 36.79321313908303, + "learning_rate": 6.9205076820307284e-06, + "loss": 3.8331, + "step": 1037 + }, + { + "epoch": 0.20801603206412825, + "grad_norm": 86.84608693824875, + "learning_rate": 6.927187708750836e-06, + "loss": 5.1276, + "step": 1038 + }, + { + "epoch": 0.20821643286573147, + "grad_norm": 28.642581215930097, + "learning_rate": 6.933867735470943e-06, + "loss": 4.0411, + "step": 1039 + }, + { + "epoch": 0.20841683366733466, + "grad_norm": 35.11999628208468, + "learning_rate": 6.940547762191049e-06, + "loss": 4.1142, + "step": 1040 + }, + { + "epoch": 0.20861723446893787, + "grad_norm": 26.43274617803439, + "learning_rate": 6.9472277889111565e-06, + "loss": 3.799, + "step": 1041 + }, + { + "epoch": 0.2088176352705411, + "grad_norm": 41.50043376951691, + "learning_rate": 6.953907815631264e-06, + "loss": 3.8658, + "step": 1042 + }, + { + "epoch": 0.20901803607214428, + "grad_norm": 58.09042155932211, + "learning_rate": 6.960587842351369e-06, + "loss": 4.509, + "step": 1043 + }, + { + "epoch": 0.2092184368737475, + "grad_norm": 88.22478467437705, + "learning_rate": 6.9672678690714764e-06, + 
"loss": 4.0784, + "step": 1044 + }, + { + "epoch": 0.2094188376753507, + "grad_norm": 42.18316042672315, + "learning_rate": 6.973947895791584e-06, + "loss": 4.0621, + "step": 1045 + }, + { + "epoch": 0.2096192384769539, + "grad_norm": 84.93107834339317, + "learning_rate": 6.98062792251169e-06, + "loss": 3.9059, + "step": 1046 + }, + { + "epoch": 0.20981963927855712, + "grad_norm": 35.55576099271037, + "learning_rate": 6.987307949231797e-06, + "loss": 3.3255, + "step": 1047 + }, + { + "epoch": 0.21002004008016031, + "grad_norm": 56.12505679055811, + "learning_rate": 6.9939879759519045e-06, + "loss": 4.3428, + "step": 1048 + }, + { + "epoch": 0.21022044088176353, + "grad_norm": 45.427691398999826, + "learning_rate": 7.000668002672011e-06, + "loss": 3.4988, + "step": 1049 + }, + { + "epoch": 0.21042084168336672, + "grad_norm": 45.12392591299335, + "learning_rate": 7.007348029392118e-06, + "loss": 4.2249, + "step": 1050 + }, + { + "epoch": 0.21062124248496994, + "grad_norm": 102.53245129285874, + "learning_rate": 7.014028056112225e-06, + "loss": 4.7702, + "step": 1051 + }, + { + "epoch": 0.21082164328657316, + "grad_norm": 36.99809978728265, + "learning_rate": 7.020708082832332e-06, + "loss": 3.7134, + "step": 1052 + }, + { + "epoch": 0.21102204408817635, + "grad_norm": 33.27721102555493, + "learning_rate": 7.027388109552439e-06, + "loss": 4.1705, + "step": 1053 + }, + { + "epoch": 0.21122244488977956, + "grad_norm": 130.16573098203776, + "learning_rate": 7.034068136272546e-06, + "loss": 3.8713, + "step": 1054 + }, + { + "epoch": 0.21142284569138275, + "grad_norm": 34.20391900965179, + "learning_rate": 7.0407481629926525e-06, + "loss": 3.9465, + "step": 1055 + }, + { + "epoch": 0.21162324649298597, + "grad_norm": 54.883986336818175, + "learning_rate": 7.04742818971276e-06, + "loss": 3.6788, + "step": 1056 + }, + { + "epoch": 0.2118236472945892, + "grad_norm": 134.88289896558777, + "learning_rate": 7.054108216432867e-06, + "loss": 3.9083, + "step": 1057 + }, + { + "epoch": 0.21202404809619238, + "grad_norm": 42.61585237378702, + "learning_rate": 7.0607882431529725e-06, + "loss": 4.0958, + "step": 1058 + }, + { + "epoch": 0.2122244488977956, + "grad_norm": 32.14669874573267, + "learning_rate": 7.06746826987308e-06, + "loss": 3.7323, + "step": 1059 + }, + { + "epoch": 0.2124248496993988, + "grad_norm": 54.507966327029, + "learning_rate": 7.074148296593187e-06, + "loss": 3.6975, + "step": 1060 + }, + { + "epoch": 0.212625250501002, + "grad_norm": 55.7810782014183, + "learning_rate": 7.080828323313293e-06, + "loss": 4.0481, + "step": 1061 + }, + { + "epoch": 0.21282565130260522, + "grad_norm": 27.587524379385613, + "learning_rate": 7.0875083500334005e-06, + "loss": 3.4378, + "step": 1062 + }, + { + "epoch": 0.2130260521042084, + "grad_norm": 47.743304304149895, + "learning_rate": 7.094188376753508e-06, + "loss": 3.6873, + "step": 1063 + }, + { + "epoch": 0.21322645290581163, + "grad_norm": 41.47435693225965, + "learning_rate": 7.100868403473614e-06, + "loss": 4.0253, + "step": 1064 + }, + { + "epoch": 0.21342685370741482, + "grad_norm": 58.10907122200508, + "learning_rate": 7.107548430193721e-06, + "loss": 3.9146, + "step": 1065 + }, + { + "epoch": 0.21362725450901804, + "grad_norm": 51.42566905732435, + "learning_rate": 7.114228456913829e-06, + "loss": 4.6322, + "step": 1066 + }, + { + "epoch": 0.21382765531062126, + "grad_norm": 68.77748505795127, + "learning_rate": 7.120908483633935e-06, + "loss": 3.5942, + "step": 1067 + }, + { + "epoch": 0.21402805611222445, + "grad_norm": 52.67247000454378, + 
"learning_rate": 7.127588510354042e-06, + "loss": 4.6619, + "step": 1068 + }, + { + "epoch": 0.21422845691382766, + "grad_norm": 31.91430202239629, + "learning_rate": 7.134268537074149e-06, + "loss": 3.9421, + "step": 1069 + }, + { + "epoch": 0.21442885771543085, + "grad_norm": 42.85833786333902, + "learning_rate": 7.140948563794256e-06, + "loss": 4.2168, + "step": 1070 + }, + { + "epoch": 0.21462925851703407, + "grad_norm": 26.61227298380019, + "learning_rate": 7.147628590514363e-06, + "loss": 3.3024, + "step": 1071 + }, + { + "epoch": 0.21482965931863726, + "grad_norm": 69.44138560324836, + "learning_rate": 7.15430861723447e-06, + "loss": 3.7631, + "step": 1072 + }, + { + "epoch": 0.21503006012024048, + "grad_norm": 42.836683261960545, + "learning_rate": 7.160988643954576e-06, + "loss": 4.2396, + "step": 1073 + }, + { + "epoch": 0.2152304609218437, + "grad_norm": 47.07893353831095, + "learning_rate": 7.167668670674684e-06, + "loss": 4.2235, + "step": 1074 + }, + { + "epoch": 0.2154308617234469, + "grad_norm": 54.145397201514655, + "learning_rate": 7.174348697394791e-06, + "loss": 4.6721, + "step": 1075 + }, + { + "epoch": 0.2156312625250501, + "grad_norm": 31.71490010759202, + "learning_rate": 7.181028724114897e-06, + "loss": 4.4567, + "step": 1076 + }, + { + "epoch": 0.2158316633266533, + "grad_norm": 35.24283335150571, + "learning_rate": 7.187708750835004e-06, + "loss": 3.8908, + "step": 1077 + }, + { + "epoch": 0.2160320641282565, + "grad_norm": 31.438877031469172, + "learning_rate": 7.194388777555111e-06, + "loss": 3.8907, + "step": 1078 + }, + { + "epoch": 0.21623246492985973, + "grad_norm": 37.14079819391158, + "learning_rate": 7.201068804275217e-06, + "loss": 3.7069, + "step": 1079 + }, + { + "epoch": 0.21643286573146292, + "grad_norm": 43.75611196005151, + "learning_rate": 7.207748830995325e-06, + "loss": 4.5327, + "step": 1080 + }, + { + "epoch": 0.21663326653306614, + "grad_norm": 56.29417435819481, + "learning_rate": 7.214428857715432e-06, + "loss": 3.6049, + "step": 1081 + }, + { + "epoch": 0.21683366733466933, + "grad_norm": 41.80598410979844, + "learning_rate": 7.221108884435538e-06, + "loss": 3.5272, + "step": 1082 + }, + { + "epoch": 0.21703406813627255, + "grad_norm": 51.76757745034523, + "learning_rate": 7.2277889111556455e-06, + "loss": 3.9424, + "step": 1083 + }, + { + "epoch": 0.21723446893787576, + "grad_norm": 46.808641179031184, + "learning_rate": 7.234468937875752e-06, + "loss": 3.5369, + "step": 1084 + }, + { + "epoch": 0.21743486973947895, + "grad_norm": 33.42556922765298, + "learning_rate": 7.241148964595859e-06, + "loss": 4.2415, + "step": 1085 + }, + { + "epoch": 0.21763527054108217, + "grad_norm": 29.85568954343617, + "learning_rate": 7.247828991315966e-06, + "loss": 3.7829, + "step": 1086 + }, + { + "epoch": 0.21783567134268536, + "grad_norm": 33.78342850365639, + "learning_rate": 7.254509018036072e-06, + "loss": 3.944, + "step": 1087 + }, + { + "epoch": 0.21803607214428858, + "grad_norm": 42.63396597952269, + "learning_rate": 7.26118904475618e-06, + "loss": 4.0699, + "step": 1088 + }, + { + "epoch": 0.2182364729458918, + "grad_norm": 60.21935407436411, + "learning_rate": 7.267869071476287e-06, + "loss": 4.0449, + "step": 1089 + }, + { + "epoch": 0.218436873747495, + "grad_norm": 67.45904283821878, + "learning_rate": 7.274549098196393e-06, + "loss": 5.0495, + "step": 1090 + }, + { + "epoch": 0.2186372745490982, + "grad_norm": 33.77351326102813, + "learning_rate": 7.2812291249165e-06, + "loss": 4.3618, + "step": 1091 + }, + { + "epoch": 0.2188376753507014, 
+ "grad_norm": 41.38127158655977, + "learning_rate": 7.287909151636607e-06, + "loss": 4.0687, + "step": 1092 + }, + { + "epoch": 0.2190380761523046, + "grad_norm": 50.92936345596076, + "learning_rate": 7.2945891783567134e-06, + "loss": 4.5146, + "step": 1093 + }, + { + "epoch": 0.21923847695390783, + "grad_norm": 40.44334405304517, + "learning_rate": 7.301269205076821e-06, + "loss": 4.2482, + "step": 1094 + }, + { + "epoch": 0.21943887775551102, + "grad_norm": 29.15927827698101, + "learning_rate": 7.307949231796928e-06, + "loss": 3.8786, + "step": 1095 + }, + { + "epoch": 0.21963927855711424, + "grad_norm": 42.18332215812884, + "learning_rate": 7.314629258517034e-06, + "loss": 3.8263, + "step": 1096 + }, + { + "epoch": 0.21983967935871743, + "grad_norm": 43.55092737615091, + "learning_rate": 7.3213092852371415e-06, + "loss": 4.1636, + "step": 1097 + }, + { + "epoch": 0.22004008016032064, + "grad_norm": 42.38454134798736, + "learning_rate": 7.327989311957249e-06, + "loss": 4.5157, + "step": 1098 + }, + { + "epoch": 0.22024048096192383, + "grad_norm": 34.23422051211958, + "learning_rate": 7.334669338677355e-06, + "loss": 3.7497, + "step": 1099 + }, + { + "epoch": 0.22044088176352705, + "grad_norm": 100.78176007914018, + "learning_rate": 7.341349365397462e-06, + "loss": 3.8574, + "step": 1100 + }, + { + "epoch": 0.22064128256513027, + "grad_norm": 33.305636600685546, + "learning_rate": 7.3480293921175695e-06, + "loss": 3.9194, + "step": 1101 + }, + { + "epoch": 0.22084168336673346, + "grad_norm": 37.29099982093691, + "learning_rate": 7.354709418837676e-06, + "loss": 4.0037, + "step": 1102 + }, + { + "epoch": 0.22104208416833668, + "grad_norm": 41.62857831519957, + "learning_rate": 7.361389445557783e-06, + "loss": 4.4235, + "step": 1103 + }, + { + "epoch": 0.22124248496993987, + "grad_norm": 57.754198674027116, + "learning_rate": 7.36806947227789e-06, + "loss": 4.6899, + "step": 1104 + }, + { + "epoch": 0.22144288577154309, + "grad_norm": 43.101216724081816, + "learning_rate": 7.374749498997996e-06, + "loss": 3.3515, + "step": 1105 + }, + { + "epoch": 0.2216432865731463, + "grad_norm": 33.79370250304973, + "learning_rate": 7.381429525718103e-06, + "loss": 3.6448, + "step": 1106 + }, + { + "epoch": 0.2218436873747495, + "grad_norm": 42.37337457095862, + "learning_rate": 7.38810955243821e-06, + "loss": 3.8585, + "step": 1107 + }, + { + "epoch": 0.2220440881763527, + "grad_norm": 39.53528436941088, + "learning_rate": 7.394789579158317e-06, + "loss": 4.0725, + "step": 1108 + }, + { + "epoch": 0.2222444889779559, + "grad_norm": 33.94546428834019, + "learning_rate": 7.401469605878424e-06, + "loss": 3.7667, + "step": 1109 + }, + { + "epoch": 0.22244488977955912, + "grad_norm": 78.93023584863484, + "learning_rate": 7.408149632598531e-06, + "loss": 4.2003, + "step": 1110 + }, + { + "epoch": 0.22264529058116234, + "grad_norm": 36.285621601592275, + "learning_rate": 7.4148296593186375e-06, + "loss": 4.0786, + "step": 1111 + }, + { + "epoch": 0.22284569138276553, + "grad_norm": 29.47508398949864, + "learning_rate": 7.421509686038745e-06, + "loss": 3.7865, + "step": 1112 + }, + { + "epoch": 0.22304609218436874, + "grad_norm": 39.89841165741084, + "learning_rate": 7.428189712758852e-06, + "loss": 3.8905, + "step": 1113 + }, + { + "epoch": 0.22324649298597193, + "grad_norm": 38.39657490211506, + "learning_rate": 7.434869739478958e-06, + "loss": 3.9966, + "step": 1114 + }, + { + "epoch": 0.22344689378757515, + "grad_norm": 38.48164076544712, + "learning_rate": 7.441549766199066e-06, + "loss": 4.1137, + 
"step": 1115 + }, + { + "epoch": 0.22364729458917837, + "grad_norm": 32.91140995925423, + "learning_rate": 7.448229792919173e-06, + "loss": 3.9699, + "step": 1116 + }, + { + "epoch": 0.22384769539078156, + "grad_norm": 50.71802204051613, + "learning_rate": 7.454909819639279e-06, + "loss": 4.3384, + "step": 1117 + }, + { + "epoch": 0.22404809619238478, + "grad_norm": 61.04444762480463, + "learning_rate": 7.461589846359386e-06, + "loss": 4.0391, + "step": 1118 + }, + { + "epoch": 0.22424849699398797, + "grad_norm": 30.768682036781165, + "learning_rate": 7.468269873079494e-06, + "loss": 4.2545, + "step": 1119 + }, + { + "epoch": 0.22444889779559118, + "grad_norm": 28.96896347376514, + "learning_rate": 7.474949899799599e-06, + "loss": 3.871, + "step": 1120 + }, + { + "epoch": 0.2246492985971944, + "grad_norm": 50.21066450922108, + "learning_rate": 7.481629926519706e-06, + "loss": 4.0681, + "step": 1121 + }, + { + "epoch": 0.2248496993987976, + "grad_norm": 29.30341890337176, + "learning_rate": 7.488309953239814e-06, + "loss": 3.7735, + "step": 1122 + }, + { + "epoch": 0.2250501002004008, + "grad_norm": 32.02840553970113, + "learning_rate": 7.49498997995992e-06, + "loss": 3.9377, + "step": 1123 + }, + { + "epoch": 0.225250501002004, + "grad_norm": 51.623053680985315, + "learning_rate": 7.501670006680027e-06, + "loss": 4.0806, + "step": 1124 + }, + { + "epoch": 0.22545090180360722, + "grad_norm": 51.90459288673928, + "learning_rate": 7.5083500334001344e-06, + "loss": 4.448, + "step": 1125 + }, + { + "epoch": 0.2256513026052104, + "grad_norm": 46.56666762829547, + "learning_rate": 7.515030060120241e-06, + "loss": 3.5976, + "step": 1126 + }, + { + "epoch": 0.22585170340681363, + "grad_norm": 28.787519140631822, + "learning_rate": 7.521710086840348e-06, + "loss": 4.0912, + "step": 1127 + }, + { + "epoch": 0.22605210420841684, + "grad_norm": 49.06675372683693, + "learning_rate": 7.528390113560455e-06, + "loss": 4.4108, + "step": 1128 + }, + { + "epoch": 0.22625250501002003, + "grad_norm": 28.683717307943777, + "learning_rate": 7.535070140280562e-06, + "loss": 3.9785, + "step": 1129 + }, + { + "epoch": 0.22645290581162325, + "grad_norm": 47.44862014849367, + "learning_rate": 7.541750167000669e-06, + "loss": 4.2859, + "step": 1130 + }, + { + "epoch": 0.22665330661322644, + "grad_norm": 35.76007341427158, + "learning_rate": 7.548430193720776e-06, + "loss": 3.9511, + "step": 1131 + }, + { + "epoch": 0.22685370741482966, + "grad_norm": 35.4891306181429, + "learning_rate": 7.5551102204408825e-06, + "loss": 4.3919, + "step": 1132 + }, + { + "epoch": 0.22705410821643288, + "grad_norm": 54.58727255767654, + "learning_rate": 7.56179024716099e-06, + "loss": 4.2358, + "step": 1133 + }, + { + "epoch": 0.22725450901803607, + "grad_norm": 41.77365010188889, + "learning_rate": 7.568470273881097e-06, + "loss": 4.065, + "step": 1134 + }, + { + "epoch": 0.22745490981963928, + "grad_norm": 45.514389034414144, + "learning_rate": 7.5751503006012024e-06, + "loss": 3.9339, + "step": 1135 + }, + { + "epoch": 0.22765531062124247, + "grad_norm": 58.892434148821934, + "learning_rate": 7.5818303273213105e-06, + "loss": 4.5469, + "step": 1136 + }, + { + "epoch": 0.2278557114228457, + "grad_norm": 42.84027408658972, + "learning_rate": 7.588510354041418e-06, + "loss": 4.001, + "step": 1137 + }, + { + "epoch": 0.2280561122244489, + "grad_norm": 41.859151641346145, + "learning_rate": 7.595190380761523e-06, + "loss": 3.9538, + "step": 1138 + }, + { + "epoch": 0.2282565130260521, + "grad_norm": 48.43207212912739, + "learning_rate": 
7.6018704074816305e-06, + "loss": 4.3791, + "step": 1139 + }, + { + "epoch": 0.22845691382765532, + "grad_norm": 71.38207266029264, + "learning_rate": 7.608550434201737e-06, + "loss": 3.8941, + "step": 1140 + }, + { + "epoch": 0.2286573146292585, + "grad_norm": 42.34461456329479, + "learning_rate": 7.615230460921844e-06, + "loss": 4.3623, + "step": 1141 + }, + { + "epoch": 0.22885771543086172, + "grad_norm": 50.08640554824379, + "learning_rate": 7.621910487641951e-06, + "loss": 4.0964, + "step": 1142 + }, + { + "epoch": 0.22905811623246494, + "grad_norm": 25.53317686461734, + "learning_rate": 7.628590514362058e-06, + "loss": 3.7574, + "step": 1143 + }, + { + "epoch": 0.22925851703406813, + "grad_norm": 33.897008892379915, + "learning_rate": 7.635270541082164e-06, + "loss": 4.1739, + "step": 1144 + }, + { + "epoch": 0.22945891783567135, + "grad_norm": 42.58077321476748, + "learning_rate": 7.641950567802272e-06, + "loss": 4.0136, + "step": 1145 + }, + { + "epoch": 0.22965931863727454, + "grad_norm": 44.61464210049689, + "learning_rate": 7.648630594522378e-06, + "loss": 4.6871, + "step": 1146 + }, + { + "epoch": 0.22985971943887776, + "grad_norm": 30.74539613799849, + "learning_rate": 7.655310621242485e-06, + "loss": 3.7822, + "step": 1147 + }, + { + "epoch": 0.23006012024048098, + "grad_norm": 39.38619538541988, + "learning_rate": 7.661990647962593e-06, + "loss": 3.522, + "step": 1148 + }, + { + "epoch": 0.23026052104208417, + "grad_norm": 42.79519669749307, + "learning_rate": 7.6686706746827e-06, + "loss": 3.6934, + "step": 1149 + }, + { + "epoch": 0.23046092184368738, + "grad_norm": 46.45760947300551, + "learning_rate": 7.675350701402806e-06, + "loss": 4.1852, + "step": 1150 + }, + { + "epoch": 0.23066132264529057, + "grad_norm": 54.61016476619112, + "learning_rate": 7.682030728122914e-06, + "loss": 4.1882, + "step": 1151 + }, + { + "epoch": 0.2308617234468938, + "grad_norm": 36.679913499867034, + "learning_rate": 7.68871075484302e-06, + "loss": 3.9538, + "step": 1152 + }, + { + "epoch": 0.23106212424849698, + "grad_norm": 139.67421346846447, + "learning_rate": 7.695390781563127e-06, + "loss": 4.1504, + "step": 1153 + }, + { + "epoch": 0.2312625250501002, + "grad_norm": 49.61496330718998, + "learning_rate": 7.702070808283235e-06, + "loss": 3.9392, + "step": 1154 + }, + { + "epoch": 0.23146292585170342, + "grad_norm": 34.056255675721374, + "learning_rate": 7.708750835003341e-06, + "loss": 3.9863, + "step": 1155 + }, + { + "epoch": 0.2316633266533066, + "grad_norm": 47.46311673726404, + "learning_rate": 7.715430861723447e-06, + "loss": 4.0779, + "step": 1156 + }, + { + "epoch": 0.23186372745490982, + "grad_norm": 37.898293595938355, + "learning_rate": 7.722110888443555e-06, + "loss": 4.1266, + "step": 1157 + }, + { + "epoch": 0.232064128256513, + "grad_norm": 46.832894290665934, + "learning_rate": 7.72879091516366e-06, + "loss": 3.4836, + "step": 1158 + }, + { + "epoch": 0.23226452905811623, + "grad_norm": 44.26332694088612, + "learning_rate": 7.735470941883768e-06, + "loss": 4.4204, + "step": 1159 + }, + { + "epoch": 0.23246492985971945, + "grad_norm": 48.85521824478128, + "learning_rate": 7.742150968603875e-06, + "loss": 4.8037, + "step": 1160 + }, + { + "epoch": 0.23266533066132264, + "grad_norm": 39.77548494836003, + "learning_rate": 7.748830995323981e-06, + "loss": 4.0836, + "step": 1161 + }, + { + "epoch": 0.23286573146292586, + "grad_norm": 32.4132538082305, + "learning_rate": 7.755511022044089e-06, + "loss": 3.6607, + "step": 1162 + }, + { + "epoch": 0.23306613226452905, + 
"grad_norm": 41.498755601639, + "learning_rate": 7.762191048764195e-06, + "loss": 4.4675, + "step": 1163 + }, + { + "epoch": 0.23326653306613226, + "grad_norm": 58.70062381944353, + "learning_rate": 7.768871075484302e-06, + "loss": 4.6118, + "step": 1164 + }, + { + "epoch": 0.23346693386773548, + "grad_norm": 43.923671913367656, + "learning_rate": 7.77555110220441e-06, + "loss": 4.194, + "step": 1165 + }, + { + "epoch": 0.23366733466933867, + "grad_norm": 49.323943634439075, + "learning_rate": 7.782231128924516e-06, + "loss": 4.2441, + "step": 1166 + }, + { + "epoch": 0.2338677354709419, + "grad_norm": 25.144245920191413, + "learning_rate": 7.788911155644623e-06, + "loss": 4.1488, + "step": 1167 + }, + { + "epoch": 0.23406813627254508, + "grad_norm": 32.48863553272543, + "learning_rate": 7.79559118236473e-06, + "loss": 3.9687, + "step": 1168 + }, + { + "epoch": 0.2342685370741483, + "grad_norm": 37.02457066854885, + "learning_rate": 7.802271209084837e-06, + "loss": 4.4501, + "step": 1169 + }, + { + "epoch": 0.23446893787575152, + "grad_norm": 42.43750060116158, + "learning_rate": 7.808951235804943e-06, + "loss": 4.0315, + "step": 1170 + }, + { + "epoch": 0.2346693386773547, + "grad_norm": 39.53479824667412, + "learning_rate": 7.815631262525051e-06, + "loss": 4.1074, + "step": 1171 + }, + { + "epoch": 0.23486973947895792, + "grad_norm": 49.25357803226064, + "learning_rate": 7.822311289245158e-06, + "loss": 4.0663, + "step": 1172 + }, + { + "epoch": 0.2350701402805611, + "grad_norm": 37.23817792915836, + "learning_rate": 7.828991315965264e-06, + "loss": 4.5048, + "step": 1173 + }, + { + "epoch": 0.23527054108216433, + "grad_norm": 31.20170320156939, + "learning_rate": 7.835671342685372e-06, + "loss": 3.8763, + "step": 1174 + }, + { + "epoch": 0.23547094188376755, + "grad_norm": 53.81234884085264, + "learning_rate": 7.842351369405479e-06, + "loss": 4.6944, + "step": 1175 + }, + { + "epoch": 0.23567134268537074, + "grad_norm": 47.695097414716514, + "learning_rate": 7.849031396125585e-06, + "loss": 4.2595, + "step": 1176 + }, + { + "epoch": 0.23587174348697396, + "grad_norm": 36.61792529707904, + "learning_rate": 7.855711422845691e-06, + "loss": 3.8238, + "step": 1177 + }, + { + "epoch": 0.23607214428857715, + "grad_norm": 32.305655768710444, + "learning_rate": 7.8623914495658e-06, + "loss": 4.4038, + "step": 1178 + }, + { + "epoch": 0.23627254509018036, + "grad_norm": 47.482726399518285, + "learning_rate": 7.869071476285906e-06, + "loss": 4.234, + "step": 1179 + }, + { + "epoch": 0.23647294589178355, + "grad_norm": 29.98310363455451, + "learning_rate": 7.875751503006012e-06, + "loss": 3.8548, + "step": 1180 + }, + { + "epoch": 0.23667334669338677, + "grad_norm": 45.062334289854114, + "learning_rate": 7.88243152972612e-06, + "loss": 4.0354, + "step": 1181 + }, + { + "epoch": 0.23687374749499, + "grad_norm": 27.911049475852295, + "learning_rate": 7.889111556446227e-06, + "loss": 3.848, + "step": 1182 + }, + { + "epoch": 0.23707414829659318, + "grad_norm": 66.32553826823079, + "learning_rate": 7.895791583166333e-06, + "loss": 3.9747, + "step": 1183 + }, + { + "epoch": 0.2372745490981964, + "grad_norm": 24.10405660920101, + "learning_rate": 7.902471609886441e-06, + "loss": 3.8379, + "step": 1184 + }, + { + "epoch": 0.2374749498997996, + "grad_norm": 49.74784428924063, + "learning_rate": 7.909151636606547e-06, + "loss": 4.3299, + "step": 1185 + }, + { + "epoch": 0.2376753507014028, + "grad_norm": 31.533898084337704, + "learning_rate": 7.915831663326654e-06, + "loss": 3.304, + "step": 1186 + }, + { 
+ "epoch": 0.23787575150300602, + "grad_norm": 45.54796752417705, + "learning_rate": 7.922511690046762e-06, + "loss": 3.8386, + "step": 1187 + }, + { + "epoch": 0.2380761523046092, + "grad_norm": 34.335972775246276, + "learning_rate": 7.929191716766868e-06, + "loss": 4.2398, + "step": 1188 + }, + { + "epoch": 0.23827655310621243, + "grad_norm": 31.11075897592091, + "learning_rate": 7.935871743486975e-06, + "loss": 3.5907, + "step": 1189 + }, + { + "epoch": 0.23847695390781562, + "grad_norm": 38.73678291235831, + "learning_rate": 7.942551770207083e-06, + "loss": 4.112, + "step": 1190 + }, + { + "epoch": 0.23867735470941884, + "grad_norm": 30.093081745316027, + "learning_rate": 7.949231796927187e-06, + "loss": 4.5344, + "step": 1191 + }, + { + "epoch": 0.23887775551102206, + "grad_norm": 29.470112304719514, + "learning_rate": 7.955911823647296e-06, + "loss": 4.0424, + "step": 1192 + }, + { + "epoch": 0.23907815631262525, + "grad_norm": 44.292532929900254, + "learning_rate": 7.962591850367402e-06, + "loss": 4.9037, + "step": 1193 + }, + { + "epoch": 0.23927855711422846, + "grad_norm": 44.027331324855936, + "learning_rate": 7.969271877087508e-06, + "loss": 4.6112, + "step": 1194 + }, + { + "epoch": 0.23947895791583165, + "grad_norm": 52.57839833957864, + "learning_rate": 7.975951903807616e-06, + "loss": 4.0509, + "step": 1195 + }, + { + "epoch": 0.23967935871743487, + "grad_norm": 48.27071163108536, + "learning_rate": 7.982631930527723e-06, + "loss": 4.2444, + "step": 1196 + }, + { + "epoch": 0.2398797595190381, + "grad_norm": 39.59163836659977, + "learning_rate": 7.989311957247829e-06, + "loss": 4.7473, + "step": 1197 + }, + { + "epoch": 0.24008016032064128, + "grad_norm": 37.735399794155924, + "learning_rate": 7.995991983967937e-06, + "loss": 4.7964, + "step": 1198 + }, + { + "epoch": 0.2402805611222445, + "grad_norm": 44.50162998753866, + "learning_rate": 8.002672010688044e-06, + "loss": 5.0377, + "step": 1199 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 42.011676695596414, + "learning_rate": 8.00935203740815e-06, + "loss": 3.4699, + "step": 1200 + }, + { + "epoch": 0.2406813627254509, + "grad_norm": 28.592149547737836, + "learning_rate": 8.016032064128258e-06, + "loss": 4.6994, + "step": 1201 + }, + { + "epoch": 0.24088176352705412, + "grad_norm": 24.6378828427407, + "learning_rate": 8.022712090848364e-06, + "loss": 3.8218, + "step": 1202 + }, + { + "epoch": 0.2410821643286573, + "grad_norm": 29.413763912058663, + "learning_rate": 8.02939211756847e-06, + "loss": 3.8888, + "step": 1203 + }, + { + "epoch": 0.24128256513026053, + "grad_norm": 27.09339769449182, + "learning_rate": 8.036072144288579e-06, + "loss": 3.844, + "step": 1204 + }, + { + "epoch": 0.24148296593186372, + "grad_norm": 42.960694267921184, + "learning_rate": 8.042752171008683e-06, + "loss": 4.5729, + "step": 1205 + }, + { + "epoch": 0.24168336673346694, + "grad_norm": 29.061790871463486, + "learning_rate": 8.049432197728792e-06, + "loss": 3.9302, + "step": 1206 + }, + { + "epoch": 0.24188376753507013, + "grad_norm": 23.49128464841866, + "learning_rate": 8.056112224448898e-06, + "loss": 3.9444, + "step": 1207 + }, + { + "epoch": 0.24208416833667334, + "grad_norm": 36.28324699260823, + "learning_rate": 8.062792251169004e-06, + "loss": 3.8141, + "step": 1208 + }, + { + "epoch": 0.24228456913827656, + "grad_norm": 70.18739531876204, + "learning_rate": 8.069472277889112e-06, + "loss": 4.2735, + "step": 1209 + }, + { + "epoch": 0.24248496993987975, + "grad_norm": 58.7519151042029, + "learning_rate": 
8.076152304609219e-06, + "loss": 4.0599, + "step": 1210 + }, + { + "epoch": 0.24268537074148297, + "grad_norm": 54.33825512827317, + "learning_rate": 8.082832331329325e-06, + "loss": 4.9369, + "step": 1211 + }, + { + "epoch": 0.24288577154308616, + "grad_norm": 50.40891960359417, + "learning_rate": 8.089512358049433e-06, + "loss": 4.526, + "step": 1212 + }, + { + "epoch": 0.24308617234468938, + "grad_norm": 32.556171514915825, + "learning_rate": 8.09619238476954e-06, + "loss": 3.5156, + "step": 1213 + }, + { + "epoch": 0.2432865731462926, + "grad_norm": 36.250513666318156, + "learning_rate": 8.102872411489646e-06, + "loss": 3.7279, + "step": 1214 + }, + { + "epoch": 0.24348697394789579, + "grad_norm": 26.590438752427488, + "learning_rate": 8.109552438209754e-06, + "loss": 3.7018, + "step": 1215 + }, + { + "epoch": 0.243687374749499, + "grad_norm": 34.113483344691616, + "learning_rate": 8.11623246492986e-06, + "loss": 3.6765, + "step": 1216 + }, + { + "epoch": 0.2438877755511022, + "grad_norm": 37.78471981345742, + "learning_rate": 8.122912491649967e-06, + "loss": 4.4051, + "step": 1217 + }, + { + "epoch": 0.2440881763527054, + "grad_norm": 51.33814062998889, + "learning_rate": 8.129592518370075e-06, + "loss": 4.0972, + "step": 1218 + }, + { + "epoch": 0.24428857715430863, + "grad_norm": 29.909155699991004, + "learning_rate": 8.136272545090181e-06, + "loss": 4.1055, + "step": 1219 + }, + { + "epoch": 0.24448897795591182, + "grad_norm": 60.646721255031075, + "learning_rate": 8.142952571810288e-06, + "loss": 4.4554, + "step": 1220 + }, + { + "epoch": 0.24468937875751504, + "grad_norm": 33.66668819366149, + "learning_rate": 8.149632598530394e-06, + "loss": 4.2902, + "step": 1221 + }, + { + "epoch": 0.24488977955911823, + "grad_norm": 32.09498368828062, + "learning_rate": 8.156312625250502e-06, + "loss": 3.4193, + "step": 1222 + }, + { + "epoch": 0.24509018036072144, + "grad_norm": 86.08329872210827, + "learning_rate": 8.162992651970608e-06, + "loss": 3.666, + "step": 1223 + }, + { + "epoch": 0.24529058116232466, + "grad_norm": 40.59932266359836, + "learning_rate": 8.169672678690715e-06, + "loss": 3.8573, + "step": 1224 + }, + { + "epoch": 0.24549098196392785, + "grad_norm": 40.09788013520861, + "learning_rate": 8.176352705410823e-06, + "loss": 4.1097, + "step": 1225 + }, + { + "epoch": 0.24569138276553107, + "grad_norm": 39.84145752863726, + "learning_rate": 8.18303273213093e-06, + "loss": 3.7104, + "step": 1226 + }, + { + "epoch": 0.24589178356713426, + "grad_norm": 26.58172898014521, + "learning_rate": 8.189712758851036e-06, + "loss": 4.0939, + "step": 1227 + }, + { + "epoch": 0.24609218436873748, + "grad_norm": 26.375147843391368, + "learning_rate": 8.196392785571144e-06, + "loss": 3.8904, + "step": 1228 + }, + { + "epoch": 0.2462925851703407, + "grad_norm": 53.38292755413034, + "learning_rate": 8.20307281229125e-06, + "loss": 3.951, + "step": 1229 + }, + { + "epoch": 0.24649298597194388, + "grad_norm": 47.76941223600255, + "learning_rate": 8.209752839011356e-06, + "loss": 4.6069, + "step": 1230 + }, + { + "epoch": 0.2466933867735471, + "grad_norm": 28.08182991642957, + "learning_rate": 8.216432865731465e-06, + "loss": 3.7968, + "step": 1231 + }, + { + "epoch": 0.2468937875751503, + "grad_norm": 41.694765563772606, + "learning_rate": 8.223112892451571e-06, + "loss": 3.7516, + "step": 1232 + }, + { + "epoch": 0.2470941883767535, + "grad_norm": 45.530881018478155, + "learning_rate": 8.229792919171677e-06, + "loss": 3.8134, + "step": 1233 + }, + { + "epoch": 0.2472945891783567, + "grad_norm": 
35.94002366021184, + "learning_rate": 8.236472945891785e-06, + "loss": 4.1794, + "step": 1234 + }, + { + "epoch": 0.24749498997995992, + "grad_norm": 56.44499433446861, + "learning_rate": 8.24315297261189e-06, + "loss": 4.4144, + "step": 1235 + }, + { + "epoch": 0.24769539078156314, + "grad_norm": 39.232705027681085, + "learning_rate": 8.249832999331998e-06, + "loss": 4.2374, + "step": 1236 + }, + { + "epoch": 0.24789579158316633, + "grad_norm": 43.44324295168092, + "learning_rate": 8.256513026052106e-06, + "loss": 3.7706, + "step": 1237 + }, + { + "epoch": 0.24809619238476954, + "grad_norm": 30.195609633943242, + "learning_rate": 8.263193052772211e-06, + "loss": 3.8944, + "step": 1238 + }, + { + "epoch": 0.24829659318637273, + "grad_norm": 50.88144509041542, + "learning_rate": 8.269873079492319e-06, + "loss": 4.6289, + "step": 1239 + }, + { + "epoch": 0.24849699398797595, + "grad_norm": 45.16987817695608, + "learning_rate": 8.276553106212425e-06, + "loss": 4.3903, + "step": 1240 + }, + { + "epoch": 0.24869739478957917, + "grad_norm": 40.86954212077223, + "learning_rate": 8.283233132932532e-06, + "loss": 4.3937, + "step": 1241 + }, + { + "epoch": 0.24889779559118236, + "grad_norm": 59.725043701673236, + "learning_rate": 8.28991315965264e-06, + "loss": 3.8031, + "step": 1242 + }, + { + "epoch": 0.24909819639278558, + "grad_norm": 52.799610169310945, + "learning_rate": 8.296593186372746e-06, + "loss": 4.2264, + "step": 1243 + }, + { + "epoch": 0.24929859719438877, + "grad_norm": 34.01074052700417, + "learning_rate": 8.303273213092852e-06, + "loss": 4.0457, + "step": 1244 + }, + { + "epoch": 0.24949899799599198, + "grad_norm": 38.7627133619688, + "learning_rate": 8.30995323981296e-06, + "loss": 4.8298, + "step": 1245 + }, + { + "epoch": 0.2496993987975952, + "grad_norm": 44.29584462967087, + "learning_rate": 8.316633266533067e-06, + "loss": 3.9038, + "step": 1246 + }, + { + "epoch": 0.2498997995991984, + "grad_norm": 37.210345334429284, + "learning_rate": 8.323313293253173e-06, + "loss": 4.8056, + "step": 1247 + }, + { + "epoch": 0.2501002004008016, + "grad_norm": 32.92426097489949, + "learning_rate": 8.329993319973281e-06, + "loss": 4.2642, + "step": 1248 + }, + { + "epoch": 0.2503006012024048, + "grad_norm": 54.99935697452104, + "learning_rate": 8.336673346693386e-06, + "loss": 4.1339, + "step": 1249 + }, + { + "epoch": 0.250501002004008, + "grad_norm": 52.19290903810462, + "learning_rate": 8.343353373413494e-06, + "loss": 4.1094, + "step": 1250 + }, + { + "epoch": 0.2507014028056112, + "grad_norm": 41.693475853495585, + "learning_rate": 8.350033400133602e-06, + "loss": 3.6677, + "step": 1251 + }, + { + "epoch": 0.2509018036072144, + "grad_norm": 50.46131396783461, + "learning_rate": 8.356713426853707e-06, + "loss": 4.3108, + "step": 1252 + }, + { + "epoch": 0.25110220440881764, + "grad_norm": 22.16020017045775, + "learning_rate": 8.363393453573815e-06, + "loss": 3.4592, + "step": 1253 + }, + { + "epoch": 0.25130260521042086, + "grad_norm": 53.91453268712119, + "learning_rate": 8.370073480293921e-06, + "loss": 3.7609, + "step": 1254 + }, + { + "epoch": 0.251503006012024, + "grad_norm": 32.417836367924885, + "learning_rate": 8.376753507014028e-06, + "loss": 4.1911, + "step": 1255 + }, + { + "epoch": 0.25170340681362724, + "grad_norm": 38.96235847939599, + "learning_rate": 8.383433533734136e-06, + "loss": 4.4196, + "step": 1256 + }, + { + "epoch": 0.25190380761523046, + "grad_norm": 34.51271413297292, + "learning_rate": 8.390113560454242e-06, + "loss": 4.2149, + "step": 1257 + }, + { + 
"epoch": 0.2521042084168337, + "grad_norm": 41.67647848717363, + "learning_rate": 8.396793587174349e-06, + "loss": 3.7452, + "step": 1258 + }, + { + "epoch": 0.2523046092184369, + "grad_norm": 29.26833733503482, + "learning_rate": 8.403473613894457e-06, + "loss": 3.8538, + "step": 1259 + }, + { + "epoch": 0.25250501002004005, + "grad_norm": 90.48044625242602, + "learning_rate": 8.410153640614563e-06, + "loss": 4.1494, + "step": 1260 + }, + { + "epoch": 0.2527054108216433, + "grad_norm": 43.93230385633648, + "learning_rate": 8.41683366733467e-06, + "loss": 4.001, + "step": 1261 + }, + { + "epoch": 0.2529058116232465, + "grad_norm": 49.4875885781461, + "learning_rate": 8.423513694054777e-06, + "loss": 4.0327, + "step": 1262 + }, + { + "epoch": 0.2531062124248497, + "grad_norm": 44.6784493461215, + "learning_rate": 8.430193720774884e-06, + "loss": 4.7878, + "step": 1263 + }, + { + "epoch": 0.2533066132264529, + "grad_norm": 33.942819610058955, + "learning_rate": 8.43687374749499e-06, + "loss": 4.5943, + "step": 1264 + }, + { + "epoch": 0.2535070140280561, + "grad_norm": 25.095238328075887, + "learning_rate": 8.443553774215098e-06, + "loss": 3.7531, + "step": 1265 + }, + { + "epoch": 0.2537074148296593, + "grad_norm": 25.198276719174725, + "learning_rate": 8.450233800935205e-06, + "loss": 3.7198, + "step": 1266 + }, + { + "epoch": 0.2539078156312625, + "grad_norm": 27.12068860264348, + "learning_rate": 8.456913827655311e-06, + "loss": 4.0937, + "step": 1267 + }, + { + "epoch": 0.25410821643286574, + "grad_norm": 38.854893272780515, + "learning_rate": 8.463593854375417e-06, + "loss": 4.2237, + "step": 1268 + }, + { + "epoch": 0.25430861723446896, + "grad_norm": 53.141429550003835, + "learning_rate": 8.470273881095525e-06, + "loss": 3.9916, + "step": 1269 + }, + { + "epoch": 0.2545090180360721, + "grad_norm": 41.34491853716838, + "learning_rate": 8.476953907815632e-06, + "loss": 4.2477, + "step": 1270 + }, + { + "epoch": 0.25470941883767534, + "grad_norm": 36.02378497886609, + "learning_rate": 8.483633934535738e-06, + "loss": 3.4468, + "step": 1271 + }, + { + "epoch": 0.25490981963927856, + "grad_norm": 36.0668938265325, + "learning_rate": 8.490313961255846e-06, + "loss": 3.9662, + "step": 1272 + }, + { + "epoch": 0.2551102204408818, + "grad_norm": 36.86707854974696, + "learning_rate": 8.496993987975953e-06, + "loss": 4.0037, + "step": 1273 + }, + { + "epoch": 0.255310621242485, + "grad_norm": 51.145026154576655, + "learning_rate": 8.503674014696059e-06, + "loss": 4.366, + "step": 1274 + }, + { + "epoch": 0.25551102204408815, + "grad_norm": 55.13400747428729, + "learning_rate": 8.510354041416167e-06, + "loss": 4.0045, + "step": 1275 + }, + { + "epoch": 0.25571142284569137, + "grad_norm": 48.07195976889317, + "learning_rate": 8.517034068136273e-06, + "loss": 4.3559, + "step": 1276 + }, + { + "epoch": 0.2559118236472946, + "grad_norm": 27.623077050775194, + "learning_rate": 8.52371409485638e-06, + "loss": 3.8207, + "step": 1277 + }, + { + "epoch": 0.2561122244488978, + "grad_norm": 42.43457133086321, + "learning_rate": 8.530394121576488e-06, + "loss": 5.0097, + "step": 1278 + }, + { + "epoch": 0.256312625250501, + "grad_norm": 41.916677195108164, + "learning_rate": 8.537074148296594e-06, + "loss": 4.6787, + "step": 1279 + }, + { + "epoch": 0.2565130260521042, + "grad_norm": 31.247628176810053, + "learning_rate": 8.5437541750167e-06, + "loss": 3.6229, + "step": 1280 + }, + { + "epoch": 0.2567134268537074, + "grad_norm": 28.932560245316896, + "learning_rate": 8.550434201736809e-06, + "loss": 4.1018, 
+ "step": 1281 + }, + { + "epoch": 0.2569138276553106, + "grad_norm": 43.76306586994059, + "learning_rate": 8.557114228456913e-06, + "loss": 4.0252, + "step": 1282 + }, + { + "epoch": 0.25711422845691384, + "grad_norm": 34.60478186525862, + "learning_rate": 8.563794255177022e-06, + "loss": 4.1738, + "step": 1283 + }, + { + "epoch": 0.25731462925851706, + "grad_norm": 41.73692527138776, + "learning_rate": 8.570474281897128e-06, + "loss": 4.5019, + "step": 1284 + }, + { + "epoch": 0.2575150300601202, + "grad_norm": 26.56096173611716, + "learning_rate": 8.577154308617234e-06, + "loss": 3.8776, + "step": 1285 + }, + { + "epoch": 0.25771543086172344, + "grad_norm": 31.847741316863107, + "learning_rate": 8.583834335337342e-06, + "loss": 3.8637, + "step": 1286 + }, + { + "epoch": 0.25791583166332666, + "grad_norm": 29.607969591293802, + "learning_rate": 8.590514362057449e-06, + "loss": 4.1373, + "step": 1287 + }, + { + "epoch": 0.2581162324649299, + "grad_norm": 38.56743726519778, + "learning_rate": 8.597194388777555e-06, + "loss": 4.0309, + "step": 1288 + }, + { + "epoch": 0.2583166332665331, + "grad_norm": 40.418966101209335, + "learning_rate": 8.603874415497663e-06, + "loss": 4.3501, + "step": 1289 + }, + { + "epoch": 0.25851703406813625, + "grad_norm": 34.782062529842634, + "learning_rate": 8.61055444221777e-06, + "loss": 4.3222, + "step": 1290 + }, + { + "epoch": 0.25871743486973947, + "grad_norm": 39.01805872014274, + "learning_rate": 8.617234468937876e-06, + "loss": 3.9967, + "step": 1291 + }, + { + "epoch": 0.2589178356713427, + "grad_norm": 32.019602374694045, + "learning_rate": 8.623914495657984e-06, + "loss": 4.1944, + "step": 1292 + }, + { + "epoch": 0.2591182364729459, + "grad_norm": 34.853954479981084, + "learning_rate": 8.63059452237809e-06, + "loss": 4.2304, + "step": 1293 + }, + { + "epoch": 0.2593186372745491, + "grad_norm": 30.323214745862476, + "learning_rate": 8.637274549098197e-06, + "loss": 4.0818, + "step": 1294 + }, + { + "epoch": 0.2595190380761523, + "grad_norm": 99.2863001401623, + "learning_rate": 8.643954575818305e-06, + "loss": 4.6729, + "step": 1295 + }, + { + "epoch": 0.2597194388777555, + "grad_norm": 34.9391973218984, + "learning_rate": 8.650634602538411e-06, + "loss": 3.8444, + "step": 1296 + }, + { + "epoch": 0.2599198396793587, + "grad_norm": 49.121336659061704, + "learning_rate": 8.657314629258518e-06, + "loss": 3.9106, + "step": 1297 + }, + { + "epoch": 0.26012024048096194, + "grad_norm": 30.181596640580995, + "learning_rate": 8.663994655978624e-06, + "loss": 3.9812, + "step": 1298 + }, + { + "epoch": 0.26032064128256516, + "grad_norm": 27.64740732509629, + "learning_rate": 8.670674682698732e-06, + "loss": 4.2306, + "step": 1299 + }, + { + "epoch": 0.2605210420841683, + "grad_norm": 42.01764853704564, + "learning_rate": 8.677354709418838e-06, + "loss": 3.5074, + "step": 1300 + }, + { + "epoch": 0.26072144288577154, + "grad_norm": 43.34693847525127, + "learning_rate": 8.684034736138945e-06, + "loss": 4.2858, + "step": 1301 + }, + { + "epoch": 0.26092184368737475, + "grad_norm": 36.065904739980354, + "learning_rate": 8.690714762859053e-06, + "loss": 4.347, + "step": 1302 + }, + { + "epoch": 0.261122244488978, + "grad_norm": 21.036468531412186, + "learning_rate": 8.69739478957916e-06, + "loss": 3.8594, + "step": 1303 + }, + { + "epoch": 0.26132264529058113, + "grad_norm": 34.386105046690425, + "learning_rate": 8.704074816299266e-06, + "loss": 3.9859, + "step": 1304 + }, + { + "epoch": 0.26152304609218435, + "grad_norm": 28.44341808811352, + "learning_rate": 
8.710754843019372e-06, + "loss": 3.9361, + "step": 1305 + }, + { + "epoch": 0.26172344689378757, + "grad_norm": 38.687558671621254, + "learning_rate": 8.71743486973948e-06, + "loss": 4.1788, + "step": 1306 + }, + { + "epoch": 0.2619238476953908, + "grad_norm": 28.639966624245137, + "learning_rate": 8.724114896459586e-06, + "loss": 4.0852, + "step": 1307 + }, + { + "epoch": 0.262124248496994, + "grad_norm": 28.86162165183535, + "learning_rate": 8.730794923179693e-06, + "loss": 4.3472, + "step": 1308 + }, + { + "epoch": 0.26232464929859717, + "grad_norm": 35.164459208757606, + "learning_rate": 8.7374749498998e-06, + "loss": 4.365, + "step": 1309 + }, + { + "epoch": 0.2625250501002004, + "grad_norm": 26.81434193941182, + "learning_rate": 8.744154976619907e-06, + "loss": 3.9487, + "step": 1310 + }, + { + "epoch": 0.2627254509018036, + "grad_norm": 31.541091441087737, + "learning_rate": 8.750835003340014e-06, + "loss": 4.1216, + "step": 1311 + }, + { + "epoch": 0.2629258517034068, + "grad_norm": 33.8823870200703, + "learning_rate": 8.757515030060122e-06, + "loss": 4.0588, + "step": 1312 + }, + { + "epoch": 0.26312625250501004, + "grad_norm": 58.40178586424824, + "learning_rate": 8.764195056780228e-06, + "loss": 4.7793, + "step": 1313 + }, + { + "epoch": 0.2633266533066132, + "grad_norm": 24.60699618004312, + "learning_rate": 8.770875083500334e-06, + "loss": 3.8011, + "step": 1314 + }, + { + "epoch": 0.2635270541082164, + "grad_norm": 77.2939617961386, + "learning_rate": 8.77755511022044e-06, + "loss": 4.5172, + "step": 1315 + }, + { + "epoch": 0.26372745490981964, + "grad_norm": 23.899011554912423, + "learning_rate": 8.784235136940549e-06, + "loss": 3.7962, + "step": 1316 + }, + { + "epoch": 0.26392785571142285, + "grad_norm": 40.110717849007855, + "learning_rate": 8.790915163660655e-06, + "loss": 4.5875, + "step": 1317 + }, + { + "epoch": 0.26412825651302607, + "grad_norm": 44.41534437192513, + "learning_rate": 8.797595190380762e-06, + "loss": 4.3091, + "step": 1318 + }, + { + "epoch": 0.26432865731462923, + "grad_norm": 40.35609621363318, + "learning_rate": 8.80427521710087e-06, + "loss": 3.7862, + "step": 1319 + }, + { + "epoch": 0.26452905811623245, + "grad_norm": 44.02392538943282, + "learning_rate": 8.810955243820976e-06, + "loss": 4.2652, + "step": 1320 + }, + { + "epoch": 0.26472945891783567, + "grad_norm": 40.785587000360486, + "learning_rate": 8.817635270541082e-06, + "loss": 3.8939, + "step": 1321 + }, + { + "epoch": 0.2649298597194389, + "grad_norm": 36.64507999154061, + "learning_rate": 8.82431529726119e-06, + "loss": 4.2207, + "step": 1322 + }, + { + "epoch": 0.2651302605210421, + "grad_norm": 43.49238910856558, + "learning_rate": 8.830995323981297e-06, + "loss": 4.8647, + "step": 1323 + }, + { + "epoch": 0.26533066132264527, + "grad_norm": 31.20996370887813, + "learning_rate": 8.837675350701403e-06, + "loss": 4.2908, + "step": 1324 + }, + { + "epoch": 0.2655310621242485, + "grad_norm": 50.680667148098536, + "learning_rate": 8.844355377421511e-06, + "loss": 4.6351, + "step": 1325 + }, + { + "epoch": 0.2657314629258517, + "grad_norm": 35.197934072752346, + "learning_rate": 8.851035404141618e-06, + "loss": 3.8578, + "step": 1326 + }, + { + "epoch": 0.2659318637274549, + "grad_norm": 33.987782915095345, + "learning_rate": 8.857715430861724e-06, + "loss": 4.3718, + "step": 1327 + }, + { + "epoch": 0.26613226452905814, + "grad_norm": 47.97423206661277, + "learning_rate": 8.864395457581832e-06, + "loss": 4.1458, + "step": 1328 + }, + { + "epoch": 0.2663326653306613, + "grad_norm": 
52.00715460935246, + "learning_rate": 8.871075484301937e-06, + "loss": 4.3225, + "step": 1329 + }, + { + "epoch": 0.2665330661322645, + "grad_norm": 33.632973399635624, + "learning_rate": 8.877755511022045e-06, + "loss": 4.3539, + "step": 1330 + }, + { + "epoch": 0.26673346693386774, + "grad_norm": 27.533364393233434, + "learning_rate": 8.884435537742151e-06, + "loss": 4.2105, + "step": 1331 + }, + { + "epoch": 0.26693386773547095, + "grad_norm": 130.91562694983412, + "learning_rate": 8.891115564462258e-06, + "loss": 4.7095, + "step": 1332 + }, + { + "epoch": 0.26713426853707417, + "grad_norm": 40.220485025470616, + "learning_rate": 8.897795591182366e-06, + "loss": 4.6222, + "step": 1333 + }, + { + "epoch": 0.26733466933867733, + "grad_norm": 39.3886454956086, + "learning_rate": 8.904475617902472e-06, + "loss": 4.3553, + "step": 1334 + }, + { + "epoch": 0.26753507014028055, + "grad_norm": 39.655001405977195, + "learning_rate": 8.911155644622578e-06, + "loss": 4.7769, + "step": 1335 + }, + { + "epoch": 0.26773547094188377, + "grad_norm": 25.8466297581618, + "learning_rate": 8.917835671342687e-06, + "loss": 3.9793, + "step": 1336 + }, + { + "epoch": 0.267935871743487, + "grad_norm": 38.5924772954878, + "learning_rate": 8.924515698062793e-06, + "loss": 4.6141, + "step": 1337 + }, + { + "epoch": 0.2681362725450902, + "grad_norm": 48.15836806635997, + "learning_rate": 8.9311957247829e-06, + "loss": 3.9482, + "step": 1338 + }, + { + "epoch": 0.26833667334669337, + "grad_norm": 40.046141633083515, + "learning_rate": 8.937875751503007e-06, + "loss": 4.6302, + "step": 1339 + }, + { + "epoch": 0.2685370741482966, + "grad_norm": 33.091689897718716, + "learning_rate": 8.944555778223114e-06, + "loss": 3.5928, + "step": 1340 + }, + { + "epoch": 0.2687374749498998, + "grad_norm": 48.742410051222244, + "learning_rate": 8.95123580494322e-06, + "loss": 4.1809, + "step": 1341 + }, + { + "epoch": 0.268937875751503, + "grad_norm": 35.11529472160456, + "learning_rate": 8.957915831663328e-06, + "loss": 4.5502, + "step": 1342 + }, + { + "epoch": 0.26913827655310624, + "grad_norm": 46.41374809158483, + "learning_rate": 8.964595858383435e-06, + "loss": 3.9003, + "step": 1343 + }, + { + "epoch": 0.2693386773547094, + "grad_norm": 36.86165497902043, + "learning_rate": 8.971275885103541e-06, + "loss": 4.597, + "step": 1344 + }, + { + "epoch": 0.2695390781563126, + "grad_norm": 28.756256248445805, + "learning_rate": 8.977955911823647e-06, + "loss": 3.8819, + "step": 1345 + }, + { + "epoch": 0.26973947895791583, + "grad_norm": 32.64235051907961, + "learning_rate": 8.984635938543755e-06, + "loss": 4.7262, + "step": 1346 + }, + { + "epoch": 0.26993987975951905, + "grad_norm": 40.72103115898515, + "learning_rate": 8.991315965263862e-06, + "loss": 5.1531, + "step": 1347 + }, + { + "epoch": 0.27014028056112227, + "grad_norm": 30.48188193714281, + "learning_rate": 8.997995991983968e-06, + "loss": 3.8787, + "step": 1348 + }, + { + "epoch": 0.27034068136272543, + "grad_norm": 35.98151485576356, + "learning_rate": 9.004676018704076e-06, + "loss": 4.1552, + "step": 1349 + }, + { + "epoch": 0.27054108216432865, + "grad_norm": 34.451064214514965, + "learning_rate": 9.011356045424183e-06, + "loss": 4.6824, + "step": 1350 + }, + { + "epoch": 0.27074148296593187, + "grad_norm": 29.049207699007216, + "learning_rate": 9.018036072144289e-06, + "loss": 4.028, + "step": 1351 + }, + { + "epoch": 0.2709418837675351, + "grad_norm": 32.42096542439769, + "learning_rate": 9.024716098864397e-06, + "loss": 4.213, + "step": 1352 + }, + { + "epoch": 
0.2711422845691383, + "grad_norm": 48.17154156989782, + "learning_rate": 9.031396125584503e-06, + "loss": 4.6272, + "step": 1353 + }, + { + "epoch": 0.27134268537074147, + "grad_norm": 41.865920389746286, + "learning_rate": 9.03807615230461e-06, + "loss": 3.8131, + "step": 1354 + }, + { + "epoch": 0.2715430861723447, + "grad_norm": 33.895099169158435, + "learning_rate": 9.044756179024718e-06, + "loss": 4.3291, + "step": 1355 + }, + { + "epoch": 0.2717434869739479, + "grad_norm": 40.28864970522726, + "learning_rate": 9.051436205744824e-06, + "loss": 4.6851, + "step": 1356 + }, + { + "epoch": 0.2719438877755511, + "grad_norm": 25.27427354299928, + "learning_rate": 9.05811623246493e-06, + "loss": 3.885, + "step": 1357 + }, + { + "epoch": 0.27214428857715434, + "grad_norm": 37.90075188870943, + "learning_rate": 9.064796259185039e-06, + "loss": 4.116, + "step": 1358 + }, + { + "epoch": 0.2723446893787575, + "grad_norm": 33.077816521657915, + "learning_rate": 9.071476285905143e-06, + "loss": 3.7908, + "step": 1359 + }, + { + "epoch": 0.2725450901803607, + "grad_norm": 41.37269287613336, + "learning_rate": 9.078156312625251e-06, + "loss": 4.4428, + "step": 1360 + }, + { + "epoch": 0.27274549098196393, + "grad_norm": 49.54512865672194, + "learning_rate": 9.084836339345358e-06, + "loss": 4.4365, + "step": 1361 + }, + { + "epoch": 0.27294589178356715, + "grad_norm": 76.88445013356964, + "learning_rate": 9.091516366065464e-06, + "loss": 3.6192, + "step": 1362 + }, + { + "epoch": 0.2731462925851703, + "grad_norm": 53.742055442627716, + "learning_rate": 9.098196392785572e-06, + "loss": 4.974, + "step": 1363 + }, + { + "epoch": 0.27334669338677353, + "grad_norm": 38.788527014331116, + "learning_rate": 9.104876419505679e-06, + "loss": 4.754, + "step": 1364 + }, + { + "epoch": 0.27354709418837675, + "grad_norm": 28.87228069743021, + "learning_rate": 9.111556446225785e-06, + "loss": 3.9847, + "step": 1365 + }, + { + "epoch": 0.27374749498997997, + "grad_norm": 38.59792573142843, + "learning_rate": 9.118236472945893e-06, + "loss": 4.2713, + "step": 1366 + }, + { + "epoch": 0.2739478957915832, + "grad_norm": 63.93402322465201, + "learning_rate": 9.124916499666e-06, + "loss": 4.8426, + "step": 1367 + }, + { + "epoch": 0.27414829659318635, + "grad_norm": 34.728418867372945, + "learning_rate": 9.131596526386106e-06, + "loss": 3.9361, + "step": 1368 + }, + { + "epoch": 0.27434869739478956, + "grad_norm": 48.07848389009424, + "learning_rate": 9.138276553106214e-06, + "loss": 4.5364, + "step": 1369 + }, + { + "epoch": 0.2745490981963928, + "grad_norm": 34.19716489543941, + "learning_rate": 9.14495657982632e-06, + "loss": 4.4569, + "step": 1370 + }, + { + "epoch": 0.274749498997996, + "grad_norm": 31.741220542253004, + "learning_rate": 9.151636606546427e-06, + "loss": 4.12, + "step": 1371 + }, + { + "epoch": 0.2749498997995992, + "grad_norm": 52.72035911167065, + "learning_rate": 9.158316633266535e-06, + "loss": 4.2448, + "step": 1372 + }, + { + "epoch": 0.2751503006012024, + "grad_norm": 42.18220627266274, + "learning_rate": 9.16499665998664e-06, + "loss": 4.0285, + "step": 1373 + }, + { + "epoch": 0.2753507014028056, + "grad_norm": 53.26275358132648, + "learning_rate": 9.171676686706747e-06, + "loss": 3.7637, + "step": 1374 + }, + { + "epoch": 0.2755511022044088, + "grad_norm": 55.498016846281146, + "learning_rate": 9.178356713426856e-06, + "loss": 4.7269, + "step": 1375 + }, + { + "epoch": 0.27575150300601203, + "grad_norm": 33.222485179705586, + "learning_rate": 9.18503674014696e-06, + "loss": 4.0459, + "step": 
1376 + }, + { + "epoch": 0.27595190380761525, + "grad_norm": 36.32455362374125, + "learning_rate": 9.191716766867068e-06, + "loss": 4.4555, + "step": 1377 + }, + { + "epoch": 0.2761523046092184, + "grad_norm": 43.595239382346556, + "learning_rate": 9.198396793587175e-06, + "loss": 4.5352, + "step": 1378 + }, + { + "epoch": 0.27635270541082163, + "grad_norm": 38.09432729982777, + "learning_rate": 9.205076820307281e-06, + "loss": 4.1041, + "step": 1379 + }, + { + "epoch": 0.27655310621242485, + "grad_norm": 28.539875075847625, + "learning_rate": 9.211756847027389e-06, + "loss": 3.9706, + "step": 1380 + }, + { + "epoch": 0.27675350701402807, + "grad_norm": 28.171496141788236, + "learning_rate": 9.218436873747496e-06, + "loss": 3.944, + "step": 1381 + }, + { + "epoch": 0.2769539078156313, + "grad_norm": 31.088507125253237, + "learning_rate": 9.225116900467602e-06, + "loss": 4.7105, + "step": 1382 + }, + { + "epoch": 0.27715430861723445, + "grad_norm": 98.91055792964202, + "learning_rate": 9.23179692718771e-06, + "loss": 5.1169, + "step": 1383 + }, + { + "epoch": 0.27735470941883766, + "grad_norm": 26.25063699798774, + "learning_rate": 9.238476953907816e-06, + "loss": 3.8414, + "step": 1384 + }, + { + "epoch": 0.2775551102204409, + "grad_norm": 28.911591218623087, + "learning_rate": 9.245156980627923e-06, + "loss": 4.147, + "step": 1385 + }, + { + "epoch": 0.2777555110220441, + "grad_norm": 23.519588959134385, + "learning_rate": 9.25183700734803e-06, + "loss": 3.7649, + "step": 1386 + }, + { + "epoch": 0.2779559118236473, + "grad_norm": 30.34803682400738, + "learning_rate": 9.258517034068137e-06, + "loss": 3.906, + "step": 1387 + }, + { + "epoch": 0.2781563126252505, + "grad_norm": 34.86234769287122, + "learning_rate": 9.265197060788244e-06, + "loss": 4.2274, + "step": 1388 + }, + { + "epoch": 0.2783567134268537, + "grad_norm": 29.54026126082427, + "learning_rate": 9.271877087508352e-06, + "loss": 4.152, + "step": 1389 + }, + { + "epoch": 0.2785571142284569, + "grad_norm": 27.735076773063774, + "learning_rate": 9.278557114228458e-06, + "loss": 3.9697, + "step": 1390 + }, + { + "epoch": 0.27875751503006013, + "grad_norm": 72.16901659658188, + "learning_rate": 9.285237140948564e-06, + "loss": 3.6481, + "step": 1391 + }, + { + "epoch": 0.27895791583166335, + "grad_norm": 35.4948659593684, + "learning_rate": 9.29191716766867e-06, + "loss": 3.6872, + "step": 1392 + }, + { + "epoch": 0.2791583166332665, + "grad_norm": 33.18461644954401, + "learning_rate": 9.298597194388779e-06, + "loss": 4.3021, + "step": 1393 + }, + { + "epoch": 0.27935871743486973, + "grad_norm": 26.726885250192037, + "learning_rate": 9.305277221108885e-06, + "loss": 3.7301, + "step": 1394 + }, + { + "epoch": 0.27955911823647295, + "grad_norm": 31.336272570568262, + "learning_rate": 9.311957247828992e-06, + "loss": 4.0047, + "step": 1395 + }, + { + "epoch": 0.27975951903807617, + "grad_norm": 31.953416502192834, + "learning_rate": 9.3186372745491e-06, + "loss": 3.394, + "step": 1396 + }, + { + "epoch": 0.2799599198396794, + "grad_norm": 35.779866266643786, + "learning_rate": 9.325317301269206e-06, + "loss": 4.1114, + "step": 1397 + }, + { + "epoch": 0.28016032064128255, + "grad_norm": 30.663904155275542, + "learning_rate": 9.331997327989312e-06, + "loss": 3.994, + "step": 1398 + }, + { + "epoch": 0.28036072144288576, + "grad_norm": 77.54948351436677, + "learning_rate": 9.33867735470942e-06, + "loss": 4.3639, + "step": 1399 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 40.317714918235154, + "learning_rate": 
9.345357381429527e-06, + "loss": 4.1074, + "step": 1400 + }, + { + "epoch": 0.2807615230460922, + "grad_norm": 32.1971162651264, + "learning_rate": 9.352037408149633e-06, + "loss": 4.5378, + "step": 1401 + }, + { + "epoch": 0.2809619238476954, + "grad_norm": 33.08266986444899, + "learning_rate": 9.358717434869741e-06, + "loss": 4.3851, + "step": 1402 + }, + { + "epoch": 0.2811623246492986, + "grad_norm": 47.300739030283694, + "learning_rate": 9.365397461589848e-06, + "loss": 4.7432, + "step": 1403 + }, + { + "epoch": 0.2813627254509018, + "grad_norm": 46.669035509336744, + "learning_rate": 9.372077488309954e-06, + "loss": 4.7298, + "step": 1404 + }, + { + "epoch": 0.281563126252505, + "grad_norm": 28.221093834252056, + "learning_rate": 9.378757515030062e-06, + "loss": 3.9836, + "step": 1405 + }, + { + "epoch": 0.28176352705410823, + "grad_norm": 28.846982401514357, + "learning_rate": 9.385437541750167e-06, + "loss": 4.5468, + "step": 1406 + }, + { + "epoch": 0.28196392785571145, + "grad_norm": 32.49218395656471, + "learning_rate": 9.392117568470275e-06, + "loss": 5.0579, + "step": 1407 + }, + { + "epoch": 0.2821643286573146, + "grad_norm": 30.313413807099057, + "learning_rate": 9.398797595190381e-06, + "loss": 4.3743, + "step": 1408 + }, + { + "epoch": 0.28236472945891783, + "grad_norm": 31.201667431726296, + "learning_rate": 9.405477621910488e-06, + "loss": 4.6342, + "step": 1409 + }, + { + "epoch": 0.28256513026052105, + "grad_norm": 39.4976602447453, + "learning_rate": 9.412157648630596e-06, + "loss": 4.3323, + "step": 1410 + }, + { + "epoch": 0.28276553106212426, + "grad_norm": 28.17819080642687, + "learning_rate": 9.418837675350702e-06, + "loss": 3.6968, + "step": 1411 + }, + { + "epoch": 0.2829659318637275, + "grad_norm": 36.94373645759503, + "learning_rate": 9.425517702070808e-06, + "loss": 4.2627, + "step": 1412 + }, + { + "epoch": 0.28316633266533064, + "grad_norm": 24.05177773463666, + "learning_rate": 9.432197728790916e-06, + "loss": 3.7489, + "step": 1413 + }, + { + "epoch": 0.28336673346693386, + "grad_norm": 21.799486961734175, + "learning_rate": 9.438877755511023e-06, + "loss": 3.6544, + "step": 1414 + }, + { + "epoch": 0.2835671342685371, + "grad_norm": 22.222514897770342, + "learning_rate": 9.44555778223113e-06, + "loss": 4.1464, + "step": 1415 + }, + { + "epoch": 0.2837675350701403, + "grad_norm": 41.95046686335725, + "learning_rate": 9.452237808951237e-06, + "loss": 4.6776, + "step": 1416 + }, + { + "epoch": 0.28396793587174346, + "grad_norm": 42.122046181567875, + "learning_rate": 9.458917835671344e-06, + "loss": 3.9818, + "step": 1417 + }, + { + "epoch": 0.2841683366733467, + "grad_norm": 59.81610226637001, + "learning_rate": 9.46559786239145e-06, + "loss": 4.3613, + "step": 1418 + }, + { + "epoch": 0.2843687374749499, + "grad_norm": 44.79942814085427, + "learning_rate": 9.472277889111558e-06, + "loss": 4.2331, + "step": 1419 + }, + { + "epoch": 0.2845691382765531, + "grad_norm": 29.47606812841823, + "learning_rate": 9.478957915831663e-06, + "loss": 3.7383, + "step": 1420 + }, + { + "epoch": 0.28476953907815633, + "grad_norm": 43.40217877509996, + "learning_rate": 9.485637942551771e-06, + "loss": 4.3055, + "step": 1421 + }, + { + "epoch": 0.2849699398797595, + "grad_norm": 49.1821756359786, + "learning_rate": 9.492317969271877e-06, + "loss": 4.1222, + "step": 1422 + }, + { + "epoch": 0.2851703406813627, + "grad_norm": 51.154232246128544, + "learning_rate": 9.498997995991984e-06, + "loss": 4.5133, + "step": 1423 + }, + { + "epoch": 0.28537074148296593, + "grad_norm": 
29.952408927932726, + "learning_rate": 9.505678022712092e-06, + "loss": 4.5309, + "step": 1424 + }, + { + "epoch": 0.28557114228456915, + "grad_norm": 56.18297535845448, + "learning_rate": 9.512358049432198e-06, + "loss": 4.1381, + "step": 1425 + }, + { + "epoch": 0.28577154308617236, + "grad_norm": 26.441905711568722, + "learning_rate": 9.519038076152304e-06, + "loss": 4.1994, + "step": 1426 + }, + { + "epoch": 0.2859719438877755, + "grad_norm": 33.673358769310546, + "learning_rate": 9.525718102872413e-06, + "loss": 4.2753, + "step": 1427 + }, + { + "epoch": 0.28617234468937874, + "grad_norm": 25.194971257553846, + "learning_rate": 9.532398129592519e-06, + "loss": 3.979, + "step": 1428 + }, + { + "epoch": 0.28637274549098196, + "grad_norm": 24.491230243509694, + "learning_rate": 9.539078156312625e-06, + "loss": 3.5042, + "step": 1429 + }, + { + "epoch": 0.2865731462925852, + "grad_norm": 34.397937558227895, + "learning_rate": 9.545758183032733e-06, + "loss": 4.3484, + "step": 1430 + }, + { + "epoch": 0.2867735470941884, + "grad_norm": 33.62973119756023, + "learning_rate": 9.55243820975284e-06, + "loss": 4.5864, + "step": 1431 + }, + { + "epoch": 0.28697394789579156, + "grad_norm": 27.319358903008574, + "learning_rate": 9.559118236472946e-06, + "loss": 3.7219, + "step": 1432 + }, + { + "epoch": 0.2871743486973948, + "grad_norm": 24.392817914668317, + "learning_rate": 9.565798263193054e-06, + "loss": 3.5062, + "step": 1433 + }, + { + "epoch": 0.287374749498998, + "grad_norm": 144.08549057769116, + "learning_rate": 9.57247828991316e-06, + "loss": 4.111, + "step": 1434 + }, + { + "epoch": 0.2875751503006012, + "grad_norm": 60.52272544408397, + "learning_rate": 9.579158316633267e-06, + "loss": 4.3157, + "step": 1435 + }, + { + "epoch": 0.28777555110220443, + "grad_norm": 55.37283364087434, + "learning_rate": 9.585838343353375e-06, + "loss": 4.7302, + "step": 1436 + }, + { + "epoch": 0.2879759519038076, + "grad_norm": 31.10881020667003, + "learning_rate": 9.592518370073481e-06, + "loss": 4.4232, + "step": 1437 + }, + { + "epoch": 0.2881763527054108, + "grad_norm": 27.737411575954862, + "learning_rate": 9.599198396793588e-06, + "loss": 4.062, + "step": 1438 + }, + { + "epoch": 0.288376753507014, + "grad_norm": 30.567331485811632, + "learning_rate": 9.605878423513694e-06, + "loss": 4.1356, + "step": 1439 + }, + { + "epoch": 0.28857715430861725, + "grad_norm": 44.29688434118359, + "learning_rate": 9.612558450233802e-06, + "loss": 4.327, + "step": 1440 + }, + { + "epoch": 0.28877755511022046, + "grad_norm": 34.22058155281708, + "learning_rate": 9.619238476953909e-06, + "loss": 4.1476, + "step": 1441 + }, + { + "epoch": 0.2889779559118236, + "grad_norm": 83.34042031263553, + "learning_rate": 9.625918503674015e-06, + "loss": 4.5972, + "step": 1442 + }, + { + "epoch": 0.28917835671342684, + "grad_norm": 30.885740560127715, + "learning_rate": 9.632598530394123e-06, + "loss": 4.1949, + "step": 1443 + }, + { + "epoch": 0.28937875751503006, + "grad_norm": 37.106520256148166, + "learning_rate": 9.63927855711423e-06, + "loss": 4.5556, + "step": 1444 + }, + { + "epoch": 0.2895791583166333, + "grad_norm": 36.87257465591293, + "learning_rate": 9.645958583834336e-06, + "loss": 4.4115, + "step": 1445 + }, + { + "epoch": 0.2897795591182365, + "grad_norm": 33.02821445578738, + "learning_rate": 9.652638610554444e-06, + "loss": 3.9362, + "step": 1446 + }, + { + "epoch": 0.28997995991983966, + "grad_norm": 33.997532758986786, + "learning_rate": 9.65931863727455e-06, + "loss": 4.7011, + "step": 1447 + }, + { + 
"epoch": 0.2901803607214429, + "grad_norm": 26.783603436556433, + "learning_rate": 9.665998663994657e-06, + "loss": 3.9425, + "step": 1448 + }, + { + "epoch": 0.2903807615230461, + "grad_norm": 29.290603627787764, + "learning_rate": 9.672678690714765e-06, + "loss": 4.2503, + "step": 1449 + }, + { + "epoch": 0.2905811623246493, + "grad_norm": 27.313862172215458, + "learning_rate": 9.679358717434871e-06, + "loss": 4.0895, + "step": 1450 + }, + { + "epoch": 0.29078156312625253, + "grad_norm": 32.17860412904081, + "learning_rate": 9.686038744154977e-06, + "loss": 3.9315, + "step": 1451 + }, + { + "epoch": 0.2909819639278557, + "grad_norm": 28.831413078501217, + "learning_rate": 9.692718770875085e-06, + "loss": 4.0477, + "step": 1452 + }, + { + "epoch": 0.2911823647294589, + "grad_norm": 30.485832001994762, + "learning_rate": 9.69939879759519e-06, + "loss": 4.1653, + "step": 1453 + }, + { + "epoch": 0.2913827655310621, + "grad_norm": 52.108096029066125, + "learning_rate": 9.706078824315298e-06, + "loss": 4.1707, + "step": 1454 + }, + { + "epoch": 0.29158316633266534, + "grad_norm": 28.054981507288623, + "learning_rate": 9.712758851035405e-06, + "loss": 4.3883, + "step": 1455 + }, + { + "epoch": 0.29178356713426856, + "grad_norm": 28.330132843500788, + "learning_rate": 9.719438877755511e-06, + "loss": 4.2425, + "step": 1456 + }, + { + "epoch": 0.2919839679358717, + "grad_norm": 28.739788619731325, + "learning_rate": 9.726118904475619e-06, + "loss": 4.4855, + "step": 1457 + }, + { + "epoch": 0.29218436873747494, + "grad_norm": 31.93299322482225, + "learning_rate": 9.732798931195725e-06, + "loss": 4.3711, + "step": 1458 + }, + { + "epoch": 0.29238476953907816, + "grad_norm": 44.82678769875923, + "learning_rate": 9.739478957915832e-06, + "loss": 4.4577, + "step": 1459 + }, + { + "epoch": 0.2925851703406814, + "grad_norm": 34.99377048318665, + "learning_rate": 9.74615898463594e-06, + "loss": 4.1173, + "step": 1460 + }, + { + "epoch": 0.2927855711422846, + "grad_norm": 44.64927603848862, + "learning_rate": 9.752839011356046e-06, + "loss": 4.2664, + "step": 1461 + }, + { + "epoch": 0.29298597194388776, + "grad_norm": 45.34847235477171, + "learning_rate": 9.759519038076153e-06, + "loss": 5.3468, + "step": 1462 + }, + { + "epoch": 0.293186372745491, + "grad_norm": 37.16981818093948, + "learning_rate": 9.76619906479626e-06, + "loss": 4.2219, + "step": 1463 + }, + { + "epoch": 0.2933867735470942, + "grad_norm": 45.807690234944126, + "learning_rate": 9.772879091516367e-06, + "loss": 4.6853, + "step": 1464 + }, + { + "epoch": 0.2935871743486974, + "grad_norm": 37.57892770395602, + "learning_rate": 9.779559118236473e-06, + "loss": 4.4324, + "step": 1465 + }, + { + "epoch": 0.29378757515030063, + "grad_norm": 49.14831314803758, + "learning_rate": 9.786239144956582e-06, + "loss": 4.0341, + "step": 1466 + }, + { + "epoch": 0.2939879759519038, + "grad_norm": 30.305303462276754, + "learning_rate": 9.792919171676688e-06, + "loss": 3.7368, + "step": 1467 + }, + { + "epoch": 0.294188376753507, + "grad_norm": 26.6603932757528, + "learning_rate": 9.799599198396794e-06, + "loss": 3.899, + "step": 1468 + }, + { + "epoch": 0.2943887775551102, + "grad_norm": 29.053030450318126, + "learning_rate": 9.8062792251169e-06, + "loss": 4.621, + "step": 1469 + }, + { + "epoch": 0.29458917835671344, + "grad_norm": 60.010121938816724, + "learning_rate": 9.812959251837009e-06, + "loss": 4.7625, + "step": 1470 + }, + { + "epoch": 0.2947895791583166, + "grad_norm": 39.43898069160205, + "learning_rate": 9.819639278557115e-06, + "loss": 
4.3294, + "step": 1471 + }, + { + "epoch": 0.2949899799599198, + "grad_norm": 27.14930665896293, + "learning_rate": 9.826319305277221e-06, + "loss": 4.2542, + "step": 1472 + }, + { + "epoch": 0.29519038076152304, + "grad_norm": 29.803812558696727, + "learning_rate": 9.832999331997328e-06, + "loss": 4.3767, + "step": 1473 + }, + { + "epoch": 0.29539078156312626, + "grad_norm": 44.35894213836671, + "learning_rate": 9.839679358717436e-06, + "loss": 4.9349, + "step": 1474 + }, + { + "epoch": 0.2955911823647295, + "grad_norm": 56.360630643772225, + "learning_rate": 9.846359385437542e-06, + "loss": 3.8879, + "step": 1475 + }, + { + "epoch": 0.29579158316633264, + "grad_norm": 47.8618423948014, + "learning_rate": 9.853039412157649e-06, + "loss": 4.9644, + "step": 1476 + }, + { + "epoch": 0.29599198396793586, + "grad_norm": 25.040966463440625, + "learning_rate": 9.859719438877757e-06, + "loss": 3.469, + "step": 1477 + }, + { + "epoch": 0.2961923847695391, + "grad_norm": 38.858540036485316, + "learning_rate": 9.866399465597863e-06, + "loss": 3.6402, + "step": 1478 + }, + { + "epoch": 0.2963927855711423, + "grad_norm": 42.70194403728498, + "learning_rate": 9.87307949231797e-06, + "loss": 4.7337, + "step": 1479 + }, + { + "epoch": 0.2965931863727455, + "grad_norm": 31.348359084633504, + "learning_rate": 9.879759519038078e-06, + "loss": 4.176, + "step": 1480 + }, + { + "epoch": 0.29679358717434867, + "grad_norm": 35.02808708214957, + "learning_rate": 9.886439545758184e-06, + "loss": 3.7075, + "step": 1481 + }, + { + "epoch": 0.2969939879759519, + "grad_norm": 42.181641664965554, + "learning_rate": 9.89311957247829e-06, + "loss": 4.6429, + "step": 1482 + }, + { + "epoch": 0.2971943887775551, + "grad_norm": 35.79955263693198, + "learning_rate": 9.899799599198397e-06, + "loss": 4.2781, + "step": 1483 + }, + { + "epoch": 0.2973947895791583, + "grad_norm": 40.24712405938842, + "learning_rate": 9.906479625918505e-06, + "loss": 4.1038, + "step": 1484 + }, + { + "epoch": 0.29759519038076154, + "grad_norm": 35.582128045008915, + "learning_rate": 9.913159652638611e-06, + "loss": 4.3555, + "step": 1485 + }, + { + "epoch": 0.2977955911823647, + "grad_norm": 20.37345444682592, + "learning_rate": 9.919839679358718e-06, + "loss": 3.522, + "step": 1486 + }, + { + "epoch": 0.2979959919839679, + "grad_norm": 29.90282751621295, + "learning_rate": 9.926519706078826e-06, + "loss": 4.2275, + "step": 1487 + }, + { + "epoch": 0.29819639278557114, + "grad_norm": 36.237175347275524, + "learning_rate": 9.933199732798932e-06, + "loss": 4.5998, + "step": 1488 + }, + { + "epoch": 0.29839679358717436, + "grad_norm": 27.773670228885187, + "learning_rate": 9.939879759519038e-06, + "loss": 3.8715, + "step": 1489 + }, + { + "epoch": 0.2985971943887776, + "grad_norm": 39.47727762581271, + "learning_rate": 9.946559786239146e-06, + "loss": 3.9621, + "step": 1490 + }, + { + "epoch": 0.29879759519038074, + "grad_norm": 61.697480577107555, + "learning_rate": 9.953239812959253e-06, + "loss": 4.3365, + "step": 1491 + }, + { + "epoch": 0.29899799599198396, + "grad_norm": 33.22469336740814, + "learning_rate": 9.95991983967936e-06, + "loss": 4.0867, + "step": 1492 + }, + { + "epoch": 0.2991983967935872, + "grad_norm": 27.513977778230768, + "learning_rate": 9.966599866399467e-06, + "loss": 3.7164, + "step": 1493 + }, + { + "epoch": 0.2993987975951904, + "grad_norm": 43.78457817780047, + "learning_rate": 9.973279893119574e-06, + "loss": 4.471, + "step": 1494 + }, + { + "epoch": 0.2995991983967936, + "grad_norm": 47.138556301496436, + 
"learning_rate": 9.97995991983968e-06, + "loss": 4.4058, + "step": 1495 + }, + { + "epoch": 0.29979959919839677, + "grad_norm": 29.382465853243865, + "learning_rate": 9.986639946559788e-06, + "loss": 4.1681, + "step": 1496 + }, + { + "epoch": 0.3, + "grad_norm": 44.67166491880201, + "learning_rate": 9.993319973279893e-06, + "loss": 4.1158, + "step": 1497 + }, + { + "epoch": 0.3002004008016032, + "grad_norm": 44.16078007992641, + "learning_rate": 1e-05, + "loss": 4.5213, + "step": 1498 + }, + { + "epoch": 0.3004008016032064, + "grad_norm": 30.57300822705988, + "learning_rate": 9.99999986407131e-06, + "loss": 4.3062, + "step": 1499 + }, + { + "epoch": 0.30060120240480964, + "grad_norm": 32.470129276502384, + "learning_rate": 9.999999456285247e-06, + "loss": 4.2448, + "step": 1500 + }, + { + "epoch": 0.3008016032064128, + "grad_norm": 49.408582172746854, + "learning_rate": 9.999998776641833e-06, + "loss": 4.5758, + "step": 1501 + }, + { + "epoch": 0.301002004008016, + "grad_norm": 47.48423133885789, + "learning_rate": 9.999997825141104e-06, + "loss": 4.588, + "step": 1502 + }, + { + "epoch": 0.30120240480961924, + "grad_norm": 26.330726034006595, + "learning_rate": 9.999996601783112e-06, + "loss": 4.5815, + "step": 1503 + }, + { + "epoch": 0.30140280561122246, + "grad_norm": 42.06944630650507, + "learning_rate": 9.999995106567924e-06, + "loss": 4.8208, + "step": 1504 + }, + { + "epoch": 0.3016032064128257, + "grad_norm": 34.96679844855493, + "learning_rate": 9.999993339495623e-06, + "loss": 3.7733, + "step": 1505 + }, + { + "epoch": 0.30180360721442884, + "grad_norm": 34.53202467126884, + "learning_rate": 9.999991300566304e-06, + "loss": 3.7267, + "step": 1506 + }, + { + "epoch": 0.30200400801603206, + "grad_norm": 31.86885012381608, + "learning_rate": 9.999988989780074e-06, + "loss": 4.4345, + "step": 1507 + }, + { + "epoch": 0.3022044088176353, + "grad_norm": 44.31730172392624, + "learning_rate": 9.999986407137065e-06, + "loss": 4.1301, + "step": 1508 + }, + { + "epoch": 0.3024048096192385, + "grad_norm": 23.457984424938648, + "learning_rate": 9.999983552637413e-06, + "loss": 4.1818, + "step": 1509 + }, + { + "epoch": 0.3026052104208417, + "grad_norm": 33.59491972183703, + "learning_rate": 9.999980426281275e-06, + "loss": 4.4504, + "step": 1510 + }, + { + "epoch": 0.30280561122244487, + "grad_norm": 27.18390828877876, + "learning_rate": 9.99997702806882e-06, + "loss": 4.1871, + "step": 1511 + }, + { + "epoch": 0.3030060120240481, + "grad_norm": 31.159456901834517, + "learning_rate": 9.999973358000234e-06, + "loss": 4.121, + "step": 1512 + }, + { + "epoch": 0.3032064128256513, + "grad_norm": 42.35744722309082, + "learning_rate": 9.999969416075715e-06, + "loss": 4.2498, + "step": 1513 + }, + { + "epoch": 0.3034068136272545, + "grad_norm": 102.31139167965415, + "learning_rate": 9.999965202295478e-06, + "loss": 4.2086, + "step": 1514 + }, + { + "epoch": 0.30360721442885774, + "grad_norm": 21.032066348731465, + "learning_rate": 9.999960716659755e-06, + "loss": 3.6807, + "step": 1515 + }, + { + "epoch": 0.3038076152304609, + "grad_norm": 31.282476742166807, + "learning_rate": 9.999955959168786e-06, + "loss": 4.3503, + "step": 1516 + }, + { + "epoch": 0.3040080160320641, + "grad_norm": 51.71242632981615, + "learning_rate": 9.999950929822829e-06, + "loss": 4.7827, + "step": 1517 + }, + { + "epoch": 0.30420841683366734, + "grad_norm": 33.850867599832654, + "learning_rate": 9.999945628622161e-06, + "loss": 4.6213, + "step": 1518 + }, + { + "epoch": 0.30440881763527056, + "grad_norm": 
30.69829518790637, + "learning_rate": 9.999940055567067e-06, + "loss": 4.1845, + "step": 1519 + }, + { + "epoch": 0.3046092184368738, + "grad_norm": 33.07825368658083, + "learning_rate": 9.999934210657853e-06, + "loss": 4.1977, + "step": 1520 + }, + { + "epoch": 0.30480961923847694, + "grad_norm": 25.708813839960115, + "learning_rate": 9.999928093894836e-06, + "loss": 4.2793, + "step": 1521 + }, + { + "epoch": 0.30501002004008015, + "grad_norm": 27.265660335232198, + "learning_rate": 9.999921705278348e-06, + "loss": 4.1444, + "step": 1522 + }, + { + "epoch": 0.30521042084168337, + "grad_norm": 27.283009105301108, + "learning_rate": 9.999915044808734e-06, + "loss": 4.1457, + "step": 1523 + }, + { + "epoch": 0.3054108216432866, + "grad_norm": 27.075760140303824, + "learning_rate": 9.99990811248636e-06, + "loss": 4.1601, + "step": 1524 + }, + { + "epoch": 0.30561122244488975, + "grad_norm": 25.85607415135364, + "learning_rate": 9.999900908311602e-06, + "loss": 4.4969, + "step": 1525 + }, + { + "epoch": 0.30581162324649297, + "grad_norm": 39.01680900079098, + "learning_rate": 9.99989343228485e-06, + "loss": 4.7704, + "step": 1526 + }, + { + "epoch": 0.3060120240480962, + "grad_norm": 41.07344587931943, + "learning_rate": 9.999885684406512e-06, + "loss": 3.7434, + "step": 1527 + }, + { + "epoch": 0.3062124248496994, + "grad_norm": 92.6193745127869, + "learning_rate": 9.999877664677009e-06, + "loss": 4.0162, + "step": 1528 + }, + { + "epoch": 0.3064128256513026, + "grad_norm": 47.61698754027946, + "learning_rate": 9.999869373096777e-06, + "loss": 4.6186, + "step": 1529 + }, + { + "epoch": 0.3066132264529058, + "grad_norm": 38.1551447966739, + "learning_rate": 9.999860809666266e-06, + "loss": 4.2503, + "step": 1530 + }, + { + "epoch": 0.306813627254509, + "grad_norm": 42.99504410052648, + "learning_rate": 9.999851974385943e-06, + "loss": 4.1463, + "step": 1531 + }, + { + "epoch": 0.3070140280561122, + "grad_norm": 35.79484792146368, + "learning_rate": 9.99984286725629e-06, + "loss": 4.1029, + "step": 1532 + }, + { + "epoch": 0.30721442885771544, + "grad_norm": 32.233161771594155, + "learning_rate": 9.999833488277795e-06, + "loss": 4.4599, + "step": 1533 + }, + { + "epoch": 0.30741482965931866, + "grad_norm": 35.827316750477394, + "learning_rate": 9.999823837450975e-06, + "loss": 3.9985, + "step": 1534 + }, + { + "epoch": 0.3076152304609218, + "grad_norm": 68.78015796033624, + "learning_rate": 9.999813914776353e-06, + "loss": 4.6659, + "step": 1535 + }, + { + "epoch": 0.30781563126252504, + "grad_norm": 28.501013863063204, + "learning_rate": 9.999803720254467e-06, + "loss": 4.1934, + "step": 1536 + }, + { + "epoch": 0.30801603206412825, + "grad_norm": 80.32163505763349, + "learning_rate": 9.999793253885874e-06, + "loss": 4.3542, + "step": 1537 + }, + { + "epoch": 0.30821643286573147, + "grad_norm": 42.601176964463924, + "learning_rate": 9.99978251567114e-06, + "loss": 3.6452, + "step": 1538 + }, + { + "epoch": 0.3084168336673347, + "grad_norm": 30.870268309338826, + "learning_rate": 9.999771505610852e-06, + "loss": 3.9283, + "step": 1539 + }, + { + "epoch": 0.30861723446893785, + "grad_norm": 63.51101493017678, + "learning_rate": 9.999760223705605e-06, + "loss": 4.5227, + "step": 1540 + }, + { + "epoch": 0.30881763527054107, + "grad_norm": 37.68908743767659, + "learning_rate": 9.999748669956013e-06, + "loss": 3.9638, + "step": 1541 + }, + { + "epoch": 0.3090180360721443, + "grad_norm": 44.49938172103992, + "learning_rate": 9.999736844362708e-06, + "loss": 3.666, + "step": 1542 + }, + { + "epoch": 
0.3092184368737475, + "grad_norm": 26.06028640396469, + "learning_rate": 9.999724746926332e-06, + "loss": 3.9802, + "step": 1543 + }, + { + "epoch": 0.3094188376753507, + "grad_norm": 30.206772898712515, + "learning_rate": 9.999712377647539e-06, + "loss": 4.5508, + "step": 1544 + }, + { + "epoch": 0.3096192384769539, + "grad_norm": 44.339005224094684, + "learning_rate": 9.999699736527002e-06, + "loss": 4.4706, + "step": 1545 + }, + { + "epoch": 0.3098196392785571, + "grad_norm": 36.39762407311245, + "learning_rate": 9.999686823565413e-06, + "loss": 4.4811, + "step": 1546 + }, + { + "epoch": 0.3100200400801603, + "grad_norm": 28.49045731810747, + "learning_rate": 9.99967363876347e-06, + "loss": 4.1498, + "step": 1547 + }, + { + "epoch": 0.31022044088176354, + "grad_norm": 39.783358563861746, + "learning_rate": 9.999660182121891e-06, + "loss": 4.0123, + "step": 1548 + }, + { + "epoch": 0.31042084168336675, + "grad_norm": 37.57917035234525, + "learning_rate": 9.999646453641408e-06, + "loss": 4.4626, + "step": 1549 + }, + { + "epoch": 0.3106212424849699, + "grad_norm": 32.9904805654265, + "learning_rate": 9.999632453322768e-06, + "loss": 4.0184, + "step": 1550 + }, + { + "epoch": 0.31082164328657313, + "grad_norm": 30.628030331595138, + "learning_rate": 9.99961818116673e-06, + "loss": 3.971, + "step": 1551 + }, + { + "epoch": 0.31102204408817635, + "grad_norm": 22.029134759135726, + "learning_rate": 9.999603637174072e-06, + "loss": 3.6032, + "step": 1552 + }, + { + "epoch": 0.31122244488977957, + "grad_norm": 35.232603295685, + "learning_rate": 9.999588821345584e-06, + "loss": 4.3495, + "step": 1553 + }, + { + "epoch": 0.3114228456913828, + "grad_norm": 29.19372612553243, + "learning_rate": 9.999573733682073e-06, + "loss": 4.0597, + "step": 1554 + }, + { + "epoch": 0.31162324649298595, + "grad_norm": 54.26762395387426, + "learning_rate": 9.999558374184355e-06, + "loss": 3.7896, + "step": 1555 + }, + { + "epoch": 0.31182364729458917, + "grad_norm": 27.751965103436312, + "learning_rate": 9.99954274285327e-06, + "loss": 4.1262, + "step": 1556 + }, + { + "epoch": 0.3120240480961924, + "grad_norm": 35.8058960231863, + "learning_rate": 9.999526839689665e-06, + "loss": 4.244, + "step": 1557 + }, + { + "epoch": 0.3122244488977956, + "grad_norm": 31.23139024050887, + "learning_rate": 9.999510664694408e-06, + "loss": 3.8979, + "step": 1558 + }, + { + "epoch": 0.3124248496993988, + "grad_norm": 39.53862187582993, + "learning_rate": 9.999494217868375e-06, + "loss": 4.7707, + "step": 1559 + }, + { + "epoch": 0.312625250501002, + "grad_norm": 37.71189638910491, + "learning_rate": 9.999477499212462e-06, + "loss": 4.2195, + "step": 1560 + }, + { + "epoch": 0.3128256513026052, + "grad_norm": 33.221050823763264, + "learning_rate": 9.999460508727577e-06, + "loss": 4.4595, + "step": 1561 + }, + { + "epoch": 0.3130260521042084, + "grad_norm": 39.54207986854778, + "learning_rate": 9.999443246414645e-06, + "loss": 4.5964, + "step": 1562 + }, + { + "epoch": 0.31322645290581164, + "grad_norm": 44.48011104517783, + "learning_rate": 9.999425712274603e-06, + "loss": 4.1006, + "step": 1563 + }, + { + "epoch": 0.31342685370741485, + "grad_norm": 27.874052200262845, + "learning_rate": 9.999407906308405e-06, + "loss": 4.7567, + "step": 1564 + }, + { + "epoch": 0.313627254509018, + "grad_norm": 49.82993056945954, + "learning_rate": 9.999389828517018e-06, + "loss": 4.3619, + "step": 1565 + }, + { + "epoch": 0.31382765531062123, + "grad_norm": 42.637866285640634, + "learning_rate": 9.999371478901429e-06, + "loss": 3.8861, + 
"step": 1566 + }, + { + "epoch": 0.31402805611222445, + "grad_norm": 33.99954692536504, + "learning_rate": 9.999352857462632e-06, + "loss": 4.473, + "step": 1567 + }, + { + "epoch": 0.31422845691382767, + "grad_norm": 29.131775755881804, + "learning_rate": 9.999333964201639e-06, + "loss": 4.6344, + "step": 1568 + }, + { + "epoch": 0.3144288577154309, + "grad_norm": 31.973484810983862, + "learning_rate": 9.999314799119481e-06, + "loss": 4.4465, + "step": 1569 + }, + { + "epoch": 0.31462925851703405, + "grad_norm": 35.787014643956816, + "learning_rate": 9.999295362217196e-06, + "loss": 4.39, + "step": 1570 + }, + { + "epoch": 0.31482965931863727, + "grad_norm": 39.83228518520044, + "learning_rate": 9.999275653495844e-06, + "loss": 4.1648, + "step": 1571 + }, + { + "epoch": 0.3150300601202405, + "grad_norm": 53.07317016092632, + "learning_rate": 9.999255672956494e-06, + "loss": 4.2307, + "step": 1572 + }, + { + "epoch": 0.3152304609218437, + "grad_norm": 42.14019928906876, + "learning_rate": 9.999235420600232e-06, + "loss": 3.9696, + "step": 1573 + }, + { + "epoch": 0.3154308617234469, + "grad_norm": 46.62951250115141, + "learning_rate": 9.999214896428164e-06, + "loss": 4.4982, + "step": 1574 + }, + { + "epoch": 0.3156312625250501, + "grad_norm": 57.88327828863587, + "learning_rate": 9.9991941004414e-06, + "loss": 4.4408, + "step": 1575 + }, + { + "epoch": 0.3158316633266533, + "grad_norm": 38.32279202570548, + "learning_rate": 9.999173032641074e-06, + "loss": 4.0738, + "step": 1576 + }, + { + "epoch": 0.3160320641282565, + "grad_norm": 62.465029572570785, + "learning_rate": 9.99915169302833e-06, + "loss": 4.4333, + "step": 1577 + }, + { + "epoch": 0.31623246492985974, + "grad_norm": 29.932372171480537, + "learning_rate": 9.99913008160433e-06, + "loss": 3.943, + "step": 1578 + }, + { + "epoch": 0.31643286573146295, + "grad_norm": 36.56326979710196, + "learning_rate": 9.999108198370248e-06, + "loss": 4.3839, + "step": 1579 + }, + { + "epoch": 0.3166332665330661, + "grad_norm": 37.32038645707749, + "learning_rate": 9.999086043327276e-06, + "loss": 4.2592, + "step": 1580 + }, + { + "epoch": 0.31683366733466933, + "grad_norm": 42.19845298652882, + "learning_rate": 9.999063616476614e-06, + "loss": 4.0272, + "step": 1581 + }, + { + "epoch": 0.31703406813627255, + "grad_norm": 45.345249672846315, + "learning_rate": 9.999040917819484e-06, + "loss": 4.304, + "step": 1582 + }, + { + "epoch": 0.31723446893787577, + "grad_norm": 39.900035948535034, + "learning_rate": 9.999017947357119e-06, + "loss": 3.9337, + "step": 1583 + }, + { + "epoch": 0.31743486973947893, + "grad_norm": 28.134226948912723, + "learning_rate": 9.998994705090772e-06, + "loss": 4.1482, + "step": 1584 + }, + { + "epoch": 0.31763527054108215, + "grad_norm": 35.58710846813915, + "learning_rate": 9.9989711910217e-06, + "loss": 4.461, + "step": 1585 + }, + { + "epoch": 0.31783567134268537, + "grad_norm": 31.596969998115217, + "learning_rate": 9.998947405151189e-06, + "loss": 4.3179, + "step": 1586 + }, + { + "epoch": 0.3180360721442886, + "grad_norm": 45.69247605078572, + "learning_rate": 9.998923347480526e-06, + "loss": 3.83, + "step": 1587 + }, + { + "epoch": 0.3182364729458918, + "grad_norm": 22.464357162738292, + "learning_rate": 9.998899018011021e-06, + "loss": 3.6972, + "step": 1588 + }, + { + "epoch": 0.31843687374749496, + "grad_norm": 34.092287325177125, + "learning_rate": 9.998874416744e-06, + "loss": 4.2568, + "step": 1589 + }, + { + "epoch": 0.3186372745490982, + "grad_norm": 37.59997102393774, + "learning_rate": 
9.998849543680795e-06, + "loss": 4.4452, + "step": 1590 + }, + { + "epoch": 0.3188376753507014, + "grad_norm": 27.562624902953203, + "learning_rate": 9.998824398822763e-06, + "loss": 4.2775, + "step": 1591 + }, + { + "epoch": 0.3190380761523046, + "grad_norm": 40.19668564377868, + "learning_rate": 9.99879898217127e-06, + "loss": 4.2383, + "step": 1592 + }, + { + "epoch": 0.31923847695390783, + "grad_norm": 30.166229643552324, + "learning_rate": 9.998773293727696e-06, + "loss": 4.1953, + "step": 1593 + }, + { + "epoch": 0.319438877755511, + "grad_norm": 45.868820102117596, + "learning_rate": 9.998747333493441e-06, + "loss": 3.9924, + "step": 1594 + }, + { + "epoch": 0.3196392785571142, + "grad_norm": 45.47095943223161, + "learning_rate": 9.998721101469913e-06, + "loss": 4.2334, + "step": 1595 + }, + { + "epoch": 0.31983967935871743, + "grad_norm": 44.02385840298947, + "learning_rate": 9.99869459765854e-06, + "loss": 4.5277, + "step": 1596 + }, + { + "epoch": 0.32004008016032065, + "grad_norm": 25.629002474845286, + "learning_rate": 9.998667822060764e-06, + "loss": 3.553, + "step": 1597 + }, + { + "epoch": 0.32024048096192387, + "grad_norm": 36.45238884510709, + "learning_rate": 9.998640774678038e-06, + "loss": 4.5606, + "step": 1598 + }, + { + "epoch": 0.32044088176352703, + "grad_norm": 24.45953744219881, + "learning_rate": 9.998613455511837e-06, + "loss": 4.3753, + "step": 1599 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 33.267177329173116, + "learning_rate": 9.998585864563641e-06, + "loss": 4.8599, + "step": 1600 + }, + { + "epoch": 0.32084168336673347, + "grad_norm": 32.81722373029679, + "learning_rate": 9.998558001834954e-06, + "loss": 3.8208, + "step": 1601 + }, + { + "epoch": 0.3210420841683367, + "grad_norm": 27.708841217003396, + "learning_rate": 9.998529867327291e-06, + "loss": 4.0096, + "step": 1602 + }, + { + "epoch": 0.3212424849699399, + "grad_norm": 27.941192343675283, + "learning_rate": 9.99850146104218e-06, + "loss": 3.8824, + "step": 1603 + }, + { + "epoch": 0.32144288577154306, + "grad_norm": 33.1566196355965, + "learning_rate": 9.998472782981166e-06, + "loss": 4.1224, + "step": 1604 + }, + { + "epoch": 0.3216432865731463, + "grad_norm": 35.680081552290595, + "learning_rate": 9.998443833145808e-06, + "loss": 4.5626, + "step": 1605 + }, + { + "epoch": 0.3218436873747495, + "grad_norm": 32.1335611846054, + "learning_rate": 9.998414611537682e-06, + "loss": 4.1383, + "step": 1606 + }, + { + "epoch": 0.3220440881763527, + "grad_norm": 37.85536620809725, + "learning_rate": 9.998385118158373e-06, + "loss": 4.6256, + "step": 1607 + }, + { + "epoch": 0.32224448897795593, + "grad_norm": 27.030612847468586, + "learning_rate": 9.998355353009488e-06, + "loss": 4.0551, + "step": 1608 + }, + { + "epoch": 0.3224448897795591, + "grad_norm": 25.55020662215223, + "learning_rate": 9.998325316092644e-06, + "loss": 3.7121, + "step": 1609 + }, + { + "epoch": 0.3226452905811623, + "grad_norm": 46.903299916928816, + "learning_rate": 9.998295007409475e-06, + "loss": 5.1738, + "step": 1610 + }, + { + "epoch": 0.32284569138276553, + "grad_norm": 31.367703333510526, + "learning_rate": 9.998264426961626e-06, + "loss": 3.5527, + "step": 1611 + }, + { + "epoch": 0.32304609218436875, + "grad_norm": 34.14230946267689, + "learning_rate": 9.998233574750765e-06, + "loss": 4.9408, + "step": 1612 + }, + { + "epoch": 0.32324649298597197, + "grad_norm": 93.684075393082, + "learning_rate": 9.998202450778565e-06, + "loss": 3.829, + "step": 1613 + }, + { + "epoch": 0.32344689378757513, + "grad_norm": 
42.014229941121904, + "learning_rate": 9.998171055046719e-06, + "loss": 4.5932, + "step": 1614 + }, + { + "epoch": 0.32364729458917835, + "grad_norm": 31.78277474180362, + "learning_rate": 9.998139387556935e-06, + "loss": 4.4227, + "step": 1615 + }, + { + "epoch": 0.32384769539078156, + "grad_norm": 38.66672574735086, + "learning_rate": 9.998107448310936e-06, + "loss": 4.2249, + "step": 1616 + }, + { + "epoch": 0.3240480961923848, + "grad_norm": 29.548333785625957, + "learning_rate": 9.998075237310456e-06, + "loss": 4.3978, + "step": 1617 + }, + { + "epoch": 0.324248496993988, + "grad_norm": 28.070856737523503, + "learning_rate": 9.998042754557249e-06, + "loss": 3.7886, + "step": 1618 + }, + { + "epoch": 0.32444889779559116, + "grad_norm": 30.3620355560525, + "learning_rate": 9.998010000053077e-06, + "loss": 4.6665, + "step": 1619 + }, + { + "epoch": 0.3246492985971944, + "grad_norm": 35.080946045516654, + "learning_rate": 9.997976973799726e-06, + "loss": 3.9874, + "step": 1620 + }, + { + "epoch": 0.3248496993987976, + "grad_norm": 27.736581068287435, + "learning_rate": 9.997943675798988e-06, + "loss": 4.0769, + "step": 1621 + }, + { + "epoch": 0.3250501002004008, + "grad_norm": 32.89821134954861, + "learning_rate": 9.997910106052676e-06, + "loss": 4.1331, + "step": 1622 + }, + { + "epoch": 0.32525050100200403, + "grad_norm": 26.16093743968586, + "learning_rate": 9.997876264562612e-06, + "loss": 4.182, + "step": 1623 + }, + { + "epoch": 0.3254509018036072, + "grad_norm": 32.969298880236046, + "learning_rate": 9.99784215133064e-06, + "loss": 4.4299, + "step": 1624 + }, + { + "epoch": 0.3256513026052104, + "grad_norm": 31.022429535413913, + "learning_rate": 9.99780776635861e-06, + "loss": 3.2934, + "step": 1625 + }, + { + "epoch": 0.32585170340681363, + "grad_norm": 38.240224407808846, + "learning_rate": 9.997773109648397e-06, + "loss": 3.7716, + "step": 1626 + }, + { + "epoch": 0.32605210420841685, + "grad_norm": 32.534106813336464, + "learning_rate": 9.997738181201882e-06, + "loss": 4.8788, + "step": 1627 + }, + { + "epoch": 0.32625250501002007, + "grad_norm": 25.17783731584081, + "learning_rate": 9.997702981020962e-06, + "loss": 4.1509, + "step": 1628 + }, + { + "epoch": 0.32645290581162323, + "grad_norm": 90.0878649465684, + "learning_rate": 9.997667509107558e-06, + "loss": 4.353, + "step": 1629 + }, + { + "epoch": 0.32665330661322645, + "grad_norm": 23.461857235112443, + "learning_rate": 9.997631765463592e-06, + "loss": 3.885, + "step": 1630 + }, + { + "epoch": 0.32685370741482966, + "grad_norm": 29.206367377574008, + "learning_rate": 9.99759575009101e-06, + "loss": 4.092, + "step": 1631 + }, + { + "epoch": 0.3270541082164329, + "grad_norm": 26.854408227323862, + "learning_rate": 9.997559462991768e-06, + "loss": 4.0947, + "step": 1632 + }, + { + "epoch": 0.3272545090180361, + "grad_norm": 32.0658417317578, + "learning_rate": 9.997522904167844e-06, + "loss": 3.9428, + "step": 1633 + }, + { + "epoch": 0.32745490981963926, + "grad_norm": 70.60911854258829, + "learning_rate": 9.997486073621221e-06, + "loss": 4.2559, + "step": 1634 + }, + { + "epoch": 0.3276553106212425, + "grad_norm": 28.245262851917282, + "learning_rate": 9.997448971353904e-06, + "loss": 3.7404, + "step": 1635 + }, + { + "epoch": 0.3278557114228457, + "grad_norm": 35.18647616724965, + "learning_rate": 9.99741159736791e-06, + "loss": 3.6588, + "step": 1636 + }, + { + "epoch": 0.3280561122244489, + "grad_norm": 37.29304512587553, + "learning_rate": 9.997373951665268e-06, + "loss": 4.6726, + "step": 1637 + }, + { + "epoch": 
0.3282565130260521, + "grad_norm": 29.297786922760515, + "learning_rate": 9.99733603424803e-06, + "loss": 3.8244, + "step": 1638 + }, + { + "epoch": 0.3284569138276553, + "grad_norm": 36.52522889069633, + "learning_rate": 9.997297845118255e-06, + "loss": 4.5952, + "step": 1639 + }, + { + "epoch": 0.3286573146292585, + "grad_norm": 29.112307306716637, + "learning_rate": 9.997259384278019e-06, + "loss": 3.8801, + "step": 1640 + }, + { + "epoch": 0.32885771543086173, + "grad_norm": 35.26958403029214, + "learning_rate": 9.997220651729414e-06, + "loss": 4.5718, + "step": 1641 + }, + { + "epoch": 0.32905811623246495, + "grad_norm": 28.205638388094314, + "learning_rate": 9.997181647474544e-06, + "loss": 4.5408, + "step": 1642 + }, + { + "epoch": 0.3292585170340681, + "grad_norm": 68.32478704543063, + "learning_rate": 9.997142371515533e-06, + "loss": 4.2479, + "step": 1643 + }, + { + "epoch": 0.3294589178356713, + "grad_norm": 45.379252034295135, + "learning_rate": 9.997102823854514e-06, + "loss": 4.8637, + "step": 1644 + }, + { + "epoch": 0.32965931863727455, + "grad_norm": 33.508526569460415, + "learning_rate": 9.997063004493639e-06, + "loss": 4.3227, + "step": 1645 + }, + { + "epoch": 0.32985971943887776, + "grad_norm": 44.385916262743585, + "learning_rate": 9.99702291343507e-06, + "loss": 4.861, + "step": 1646 + }, + { + "epoch": 0.330060120240481, + "grad_norm": 36.354048264957854, + "learning_rate": 9.99698255068099e-06, + "loss": 3.8734, + "step": 1647 + }, + { + "epoch": 0.33026052104208414, + "grad_norm": 28.047161599386815, + "learning_rate": 9.996941916233594e-06, + "loss": 4.3378, + "step": 1648 + }, + { + "epoch": 0.33046092184368736, + "grad_norm": 52.48777156494219, + "learning_rate": 9.996901010095088e-06, + "loss": 5.0019, + "step": 1649 + }, + { + "epoch": 0.3306613226452906, + "grad_norm": 34.37055715443493, + "learning_rate": 9.996859832267698e-06, + "loss": 4.5083, + "step": 1650 + }, + { + "epoch": 0.3308617234468938, + "grad_norm": 44.38398944035477, + "learning_rate": 9.996818382753663e-06, + "loss": 4.1346, + "step": 1651 + }, + { + "epoch": 0.331062124248497, + "grad_norm": 49.2712330242756, + "learning_rate": 9.996776661555237e-06, + "loss": 4.5711, + "step": 1652 + }, + { + "epoch": 0.3312625250501002, + "grad_norm": 27.878242979083183, + "learning_rate": 9.996734668674688e-06, + "loss": 4.4844, + "step": 1653 + }, + { + "epoch": 0.3314629258517034, + "grad_norm": 23.801420365529406, + "learning_rate": 9.9966924041143e-06, + "loss": 3.9536, + "step": 1654 + }, + { + "epoch": 0.3316633266533066, + "grad_norm": 36.98311153165889, + "learning_rate": 9.996649867876369e-06, + "loss": 4.0279, + "step": 1655 + }, + { + "epoch": 0.33186372745490983, + "grad_norm": 41.25768914154459, + "learning_rate": 9.99660705996321e-06, + "loss": 4.8353, + "step": 1656 + }, + { + "epoch": 0.33206412825651305, + "grad_norm": 38.346599413497835, + "learning_rate": 9.996563980377149e-06, + "loss": 4.6055, + "step": 1657 + }, + { + "epoch": 0.3322645290581162, + "grad_norm": 44.31052394557685, + "learning_rate": 9.996520629120528e-06, + "loss": 4.0449, + "step": 1658 + }, + { + "epoch": 0.3324649298597194, + "grad_norm": 46.471475655964184, + "learning_rate": 9.996477006195706e-06, + "loss": 4.3402, + "step": 1659 + }, + { + "epoch": 0.33266533066132264, + "grad_norm": 36.06470836994193, + "learning_rate": 9.996433111605053e-06, + "loss": 4.403, + "step": 1660 + }, + { + "epoch": 0.33286573146292586, + "grad_norm": 34.260953632374914, + "learning_rate": 9.996388945350957e-06, + "loss": 4.7001, + 
"step": 1661 + }, + { + "epoch": 0.3330661322645291, + "grad_norm": 36.45380526589998, + "learning_rate": 9.996344507435817e-06, + "loss": 4.3494, + "step": 1662 + }, + { + "epoch": 0.33326653306613224, + "grad_norm": 37.01251621222658, + "learning_rate": 9.996299797862053e-06, + "loss": 4.3071, + "step": 1663 + }, + { + "epoch": 0.33346693386773546, + "grad_norm": 35.048909432638986, + "learning_rate": 9.996254816632091e-06, + "loss": 4.4191, + "step": 1664 + }, + { + "epoch": 0.3336673346693387, + "grad_norm": 21.451996701239125, + "learning_rate": 9.996209563748383e-06, + "loss": 4.0785, + "step": 1665 + }, + { + "epoch": 0.3338677354709419, + "grad_norm": 32.145376474233984, + "learning_rate": 9.996164039213384e-06, + "loss": 4.4156, + "step": 1666 + }, + { + "epoch": 0.3340681362725451, + "grad_norm": 27.060926355533855, + "learning_rate": 9.996118243029572e-06, + "loss": 4.2109, + "step": 1667 + }, + { + "epoch": 0.3342685370741483, + "grad_norm": 33.202921219030685, + "learning_rate": 9.996072175199435e-06, + "loss": 4.6982, + "step": 1668 + }, + { + "epoch": 0.3344689378757515, + "grad_norm": 34.09354184619776, + "learning_rate": 9.99602583572548e-06, + "loss": 4.7933, + "step": 1669 + }, + { + "epoch": 0.3346693386773547, + "grad_norm": 36.997940295652555, + "learning_rate": 9.995979224610223e-06, + "loss": 4.3156, + "step": 1670 + }, + { + "epoch": 0.33486973947895793, + "grad_norm": 29.29379398349952, + "learning_rate": 9.995932341856205e-06, + "loss": 4.3441, + "step": 1671 + }, + { + "epoch": 0.33507014028056115, + "grad_norm": 27.543449830289685, + "learning_rate": 9.995885187465968e-06, + "loss": 3.8127, + "step": 1672 + }, + { + "epoch": 0.3352705410821643, + "grad_norm": 35.64891667310218, + "learning_rate": 9.99583776144208e-06, + "loss": 4.2695, + "step": 1673 + }, + { + "epoch": 0.3354709418837675, + "grad_norm": 21.115632680746806, + "learning_rate": 9.995790063787118e-06, + "loss": 3.9419, + "step": 1674 + }, + { + "epoch": 0.33567134268537074, + "grad_norm": 42.044574692112015, + "learning_rate": 9.995742094503676e-06, + "loss": 4.6015, + "step": 1675 + }, + { + "epoch": 0.33587174348697396, + "grad_norm": 34.275183740342996, + "learning_rate": 9.99569385359436e-06, + "loss": 4.2801, + "step": 1676 + }, + { + "epoch": 0.3360721442885772, + "grad_norm": 43.7210541299795, + "learning_rate": 9.995645341061797e-06, + "loss": 4.0039, + "step": 1677 + }, + { + "epoch": 0.33627254509018034, + "grad_norm": 44.284418640207186, + "learning_rate": 9.995596556908622e-06, + "loss": 4.6027, + "step": 1678 + }, + { + "epoch": 0.33647294589178356, + "grad_norm": 19.936890453374435, + "learning_rate": 9.995547501137488e-06, + "loss": 3.4958, + "step": 1679 + }, + { + "epoch": 0.3366733466933868, + "grad_norm": 33.28067942792342, + "learning_rate": 9.995498173751061e-06, + "loss": 4.3054, + "step": 1680 + }, + { + "epoch": 0.33687374749499, + "grad_norm": 30.58447324666037, + "learning_rate": 9.995448574752027e-06, + "loss": 4.3431, + "step": 1681 + }, + { + "epoch": 0.3370741482965932, + "grad_norm": 23.10624120777385, + "learning_rate": 9.995398704143077e-06, + "loss": 3.6969, + "step": 1682 + }, + { + "epoch": 0.3372745490981964, + "grad_norm": 39.147777330102, + "learning_rate": 9.995348561926928e-06, + "loss": 4.745, + "step": 1683 + }, + { + "epoch": 0.3374749498997996, + "grad_norm": 31.975877182715845, + "learning_rate": 9.995298148106303e-06, + "loss": 4.4783, + "step": 1684 + }, + { + "epoch": 0.3376753507014028, + "grad_norm": 27.659717549853866, + "learning_rate": 
9.995247462683944e-06, + "loss": 4.4507, + "step": 1685 + }, + { + "epoch": 0.337875751503006, + "grad_norm": 22.44983793189279, + "learning_rate": 9.995196505662608e-06, + "loss": 4.0951, + "step": 1686 + }, + { + "epoch": 0.33807615230460925, + "grad_norm": 27.31154567293554, + "learning_rate": 9.995145277045061e-06, + "loss": 3.8386, + "step": 1687 + }, + { + "epoch": 0.3382765531062124, + "grad_norm": 28.64643257079629, + "learning_rate": 9.995093776834095e-06, + "loss": 4.474, + "step": 1688 + }, + { + "epoch": 0.3384769539078156, + "grad_norm": 32.45102849844069, + "learning_rate": 9.995042005032506e-06, + "loss": 4.0891, + "step": 1689 + }, + { + "epoch": 0.33867735470941884, + "grad_norm": 23.457944472028863, + "learning_rate": 9.994989961643108e-06, + "loss": 3.9105, + "step": 1690 + }, + { + "epoch": 0.33887775551102206, + "grad_norm": 29.960419306759363, + "learning_rate": 9.994937646668736e-06, + "loss": 3.8914, + "step": 1691 + }, + { + "epoch": 0.3390781563126252, + "grad_norm": 27.091924030401696, + "learning_rate": 9.99488506011223e-06, + "loss": 4.0022, + "step": 1692 + }, + { + "epoch": 0.33927855711422844, + "grad_norm": 27.213306817001264, + "learning_rate": 9.994832201976448e-06, + "loss": 4.144, + "step": 1693 + }, + { + "epoch": 0.33947895791583166, + "grad_norm": 22.47072521218331, + "learning_rate": 9.994779072264267e-06, + "loss": 3.9107, + "step": 1694 + }, + { + "epoch": 0.3396793587174349, + "grad_norm": 29.535306502252045, + "learning_rate": 9.994725670978575e-06, + "loss": 4.3005, + "step": 1695 + }, + { + "epoch": 0.3398797595190381, + "grad_norm": 25.70356257463102, + "learning_rate": 9.994671998122276e-06, + "loss": 4.0045, + "step": 1696 + }, + { + "epoch": 0.34008016032064126, + "grad_norm": 41.7986418313879, + "learning_rate": 9.994618053698289e-06, + "loss": 3.9776, + "step": 1697 + }, + { + "epoch": 0.3402805611222445, + "grad_norm": 28.102910530156475, + "learning_rate": 9.994563837709543e-06, + "loss": 4.2892, + "step": 1698 + }, + { + "epoch": 0.3404809619238477, + "grad_norm": 35.451962482033586, + "learning_rate": 9.994509350158989e-06, + "loss": 3.575, + "step": 1699 + }, + { + "epoch": 0.3406813627254509, + "grad_norm": 24.359741704492894, + "learning_rate": 9.994454591049589e-06, + "loss": 3.9704, + "step": 1700 + }, + { + "epoch": 0.3408817635270541, + "grad_norm": 42.271270189835285, + "learning_rate": 9.994399560384322e-06, + "loss": 4.5302, + "step": 1701 + }, + { + "epoch": 0.3410821643286573, + "grad_norm": 73.98921338694893, + "learning_rate": 9.994344258166178e-06, + "loss": 3.6487, + "step": 1702 + }, + { + "epoch": 0.3412825651302605, + "grad_norm": 34.47852528811214, + "learning_rate": 9.994288684398163e-06, + "loss": 4.1335, + "step": 1703 + }, + { + "epoch": 0.3414829659318637, + "grad_norm": 28.00207359484573, + "learning_rate": 9.994232839083302e-06, + "loss": 4.4549, + "step": 1704 + }, + { + "epoch": 0.34168336673346694, + "grad_norm": 63.9978830032524, + "learning_rate": 9.994176722224627e-06, + "loss": 4.8591, + "step": 1705 + }, + { + "epoch": 0.34188376753507016, + "grad_norm": 31.66921103525608, + "learning_rate": 9.994120333825193e-06, + "loss": 4.1511, + "step": 1706 + }, + { + "epoch": 0.3420841683366733, + "grad_norm": 34.49985538991611, + "learning_rate": 9.994063673888064e-06, + "loss": 4.1554, + "step": 1707 + }, + { + "epoch": 0.34228456913827654, + "grad_norm": 39.59405749087512, + "learning_rate": 9.99400674241632e-06, + "loss": 4.5626, + "step": 1708 + }, + { + "epoch": 0.34248496993987976, + "grad_norm": 
28.532270447148928, + "learning_rate": 9.993949539413061e-06, + "loss": 4.2024, + "step": 1709 + }, + { + "epoch": 0.342685370741483, + "grad_norm": 27.568759044800167, + "learning_rate": 9.993892064881391e-06, + "loss": 3.9668, + "step": 1710 + }, + { + "epoch": 0.3428857715430862, + "grad_norm": 25.24465996534829, + "learning_rate": 9.993834318824438e-06, + "loss": 3.8414, + "step": 1711 + }, + { + "epoch": 0.34308617234468936, + "grad_norm": 73.23170744086941, + "learning_rate": 9.993776301245342e-06, + "loss": 3.7449, + "step": 1712 + }, + { + "epoch": 0.3432865731462926, + "grad_norm": 35.35750427154929, + "learning_rate": 9.993718012147257e-06, + "loss": 4.3501, + "step": 1713 + }, + { + "epoch": 0.3434869739478958, + "grad_norm": 50.251306572476, + "learning_rate": 9.993659451533352e-06, + "loss": 4.0909, + "step": 1714 + }, + { + "epoch": 0.343687374749499, + "grad_norm": 41.20990546616036, + "learning_rate": 9.99360061940681e-06, + "loss": 4.3915, + "step": 1715 + }, + { + "epoch": 0.3438877755511022, + "grad_norm": 49.64235176481733, + "learning_rate": 9.993541515770836e-06, + "loss": 4.5653, + "step": 1716 + }, + { + "epoch": 0.3440881763527054, + "grad_norm": 30.197106682850475, + "learning_rate": 9.993482140628634e-06, + "loss": 4.2715, + "step": 1717 + }, + { + "epoch": 0.3442885771543086, + "grad_norm": 59.03561173122369, + "learning_rate": 9.993422493983438e-06, + "loss": 4.3076, + "step": 1718 + }, + { + "epoch": 0.3444889779559118, + "grad_norm": 48.93973168962881, + "learning_rate": 9.99336257583849e-06, + "loss": 4.3746, + "step": 1719 + }, + { + "epoch": 0.34468937875751504, + "grad_norm": 28.48568004215839, + "learning_rate": 9.993302386197047e-06, + "loss": 4.6562, + "step": 1720 + }, + { + "epoch": 0.34488977955911826, + "grad_norm": 84.277199714467, + "learning_rate": 9.993241925062385e-06, + "loss": 4.0294, + "step": 1721 + }, + { + "epoch": 0.3450901803607214, + "grad_norm": 29.1662307516602, + "learning_rate": 9.993181192437787e-06, + "loss": 4.0706, + "step": 1722 + }, + { + "epoch": 0.34529058116232464, + "grad_norm": 46.35485556106263, + "learning_rate": 9.993120188326557e-06, + "loss": 4.5556, + "step": 1723 + }, + { + "epoch": 0.34549098196392786, + "grad_norm": 24.741418368700653, + "learning_rate": 9.993058912732014e-06, + "loss": 4.5481, + "step": 1724 + }, + { + "epoch": 0.3456913827655311, + "grad_norm": 42.217047641664735, + "learning_rate": 9.992997365657483e-06, + "loss": 3.9596, + "step": 1725 + }, + { + "epoch": 0.3458917835671343, + "grad_norm": 39.88081126364379, + "learning_rate": 9.992935547106319e-06, + "loss": 4.1675, + "step": 1726 + }, + { + "epoch": 0.34609218436873745, + "grad_norm": 30.639996417352666, + "learning_rate": 9.992873457081877e-06, + "loss": 4.4564, + "step": 1727 + }, + { + "epoch": 0.34629258517034067, + "grad_norm": 24.044010137704785, + "learning_rate": 9.992811095587536e-06, + "loss": 3.9029, + "step": 1728 + }, + { + "epoch": 0.3464929859719439, + "grad_norm": 40.25610948014608, + "learning_rate": 9.992748462626687e-06, + "loss": 4.3162, + "step": 1729 + }, + { + "epoch": 0.3466933867735471, + "grad_norm": 32.74293101889326, + "learning_rate": 9.992685558202731e-06, + "loss": 4.4017, + "step": 1730 + }, + { + "epoch": 0.3468937875751503, + "grad_norm": 32.39682293861993, + "learning_rate": 9.992622382319093e-06, + "loss": 4.5953, + "step": 1731 + }, + { + "epoch": 0.3470941883767535, + "grad_norm": 24.688507230673643, + "learning_rate": 9.992558934979206e-06, + "loss": 4.2429, + "step": 1732 + }, + { + "epoch": 
0.3472945891783567, + "grad_norm": 33.533168115635284, + "learning_rate": 9.99249521618652e-06, + "loss": 4.3013, + "step": 1733 + }, + { + "epoch": 0.3474949899799599, + "grad_norm": 44.50465381628396, + "learning_rate": 9.9924312259445e-06, + "loss": 4.0828, + "step": 1734 + }, + { + "epoch": 0.34769539078156314, + "grad_norm": 31.01595710513024, + "learning_rate": 9.992366964256623e-06, + "loss": 4.8991, + "step": 1735 + }, + { + "epoch": 0.34789579158316636, + "grad_norm": 31.806576480287568, + "learning_rate": 9.992302431126387e-06, + "loss": 4.6595, + "step": 1736 + }, + { + "epoch": 0.3480961923847695, + "grad_norm": 27.08561617061518, + "learning_rate": 9.992237626557297e-06, + "loss": 4.458, + "step": 1737 + }, + { + "epoch": 0.34829659318637274, + "grad_norm": 27.829020859763084, + "learning_rate": 9.992172550552878e-06, + "loss": 4.7176, + "step": 1738 + }, + { + "epoch": 0.34849699398797596, + "grad_norm": 24.88726757480941, + "learning_rate": 9.99210720311667e-06, + "loss": 4.3595, + "step": 1739 + }, + { + "epoch": 0.3486973947895792, + "grad_norm": 25.538395933314803, + "learning_rate": 9.992041584252223e-06, + "loss": 4.3069, + "step": 1740 + }, + { + "epoch": 0.3488977955911824, + "grad_norm": 24.033789393771684, + "learning_rate": 9.991975693963108e-06, + "loss": 4.1718, + "step": 1741 + }, + { + "epoch": 0.34909819639278555, + "grad_norm": 29.492051461067494, + "learning_rate": 9.991909532252903e-06, + "loss": 4.2717, + "step": 1742 + }, + { + "epoch": 0.34929859719438877, + "grad_norm": 36.892049097254635, + "learning_rate": 9.991843099125209e-06, + "loss": 4.3173, + "step": 1743 + }, + { + "epoch": 0.349498997995992, + "grad_norm": 37.08424573934741, + "learning_rate": 9.991776394583637e-06, + "loss": 3.7599, + "step": 1744 + }, + { + "epoch": 0.3496993987975952, + "grad_norm": 68.49646913417321, + "learning_rate": 9.991709418631813e-06, + "loss": 4.3218, + "step": 1745 + }, + { + "epoch": 0.3498997995991984, + "grad_norm": 29.776231008859117, + "learning_rate": 9.991642171273382e-06, + "loss": 4.7203, + "step": 1746 + }, + { + "epoch": 0.3501002004008016, + "grad_norm": 34.55136696894816, + "learning_rate": 9.991574652511995e-06, + "loss": 4.3936, + "step": 1747 + }, + { + "epoch": 0.3503006012024048, + "grad_norm": 42.1838040886927, + "learning_rate": 9.991506862351329e-06, + "loss": 4.4325, + "step": 1748 + }, + { + "epoch": 0.350501002004008, + "grad_norm": 25.334854311234324, + "learning_rate": 9.991438800795063e-06, + "loss": 3.8663, + "step": 1749 + }, + { + "epoch": 0.35070140280561124, + "grad_norm": 26.932183526906574, + "learning_rate": 9.991370467846904e-06, + "loss": 4.2842, + "step": 1750 + }, + { + "epoch": 0.3509018036072144, + "grad_norm": 36.18057343433032, + "learning_rate": 9.991301863510564e-06, + "loss": 4.1233, + "step": 1751 + }, + { + "epoch": 0.3511022044088176, + "grad_norm": 30.14064621222119, + "learning_rate": 9.991232987789775e-06, + "loss": 4.7803, + "step": 1752 + }, + { + "epoch": 0.35130260521042084, + "grad_norm": 32.88728386061924, + "learning_rate": 9.991163840688278e-06, + "loss": 4.1871, + "step": 1753 + }, + { + "epoch": 0.35150300601202406, + "grad_norm": 37.64193252168309, + "learning_rate": 9.991094422209837e-06, + "loss": 4.267, + "step": 1754 + }, + { + "epoch": 0.3517034068136273, + "grad_norm": 23.451308035736346, + "learning_rate": 9.991024732358226e-06, + "loss": 3.9702, + "step": 1755 + }, + { + "epoch": 0.35190380761523044, + "grad_norm": 32.099249477239546, + "learning_rate": 9.99095477113723e-06, + "loss": 3.967, + 
"step": 1756 + }, + { + "epoch": 0.35210420841683365, + "grad_norm": 33.06388668858651, + "learning_rate": 9.990884538550659e-06, + "loss": 4.668, + "step": 1757 + }, + { + "epoch": 0.35230460921843687, + "grad_norm": 29.91001367534365, + "learning_rate": 9.990814034602325e-06, + "loss": 3.9755, + "step": 1758 + }, + { + "epoch": 0.3525050100200401, + "grad_norm": 42.547388819947734, + "learning_rate": 9.990743259296067e-06, + "loss": 4.0065, + "step": 1759 + }, + { + "epoch": 0.3527054108216433, + "grad_norm": 28.752368864631592, + "learning_rate": 9.99067221263573e-06, + "loss": 4.4021, + "step": 1760 + }, + { + "epoch": 0.35290581162324647, + "grad_norm": 57.24279928404908, + "learning_rate": 9.99060089462518e-06, + "loss": 4.8342, + "step": 1761 + }, + { + "epoch": 0.3531062124248497, + "grad_norm": 32.519424692594015, + "learning_rate": 9.99052930526829e-06, + "loss": 4.4448, + "step": 1762 + }, + { + "epoch": 0.3533066132264529, + "grad_norm": 34.84821193375178, + "learning_rate": 9.990457444568955e-06, + "loss": 4.9555, + "step": 1763 + }, + { + "epoch": 0.3535070140280561, + "grad_norm": 36.39209421874032, + "learning_rate": 9.990385312531084e-06, + "loss": 4.2916, + "step": 1764 + }, + { + "epoch": 0.35370741482965934, + "grad_norm": 22.807140434080278, + "learning_rate": 9.990312909158597e-06, + "loss": 3.433, + "step": 1765 + }, + { + "epoch": 0.3539078156312625, + "grad_norm": 39.51924535734253, + "learning_rate": 9.990240234455429e-06, + "loss": 4.4673, + "step": 1766 + }, + { + "epoch": 0.3541082164328657, + "grad_norm": 82.9690269083463, + "learning_rate": 9.990167288425535e-06, + "loss": 4.1175, + "step": 1767 + }, + { + "epoch": 0.35430861723446894, + "grad_norm": 22.121438666564433, + "learning_rate": 9.990094071072878e-06, + "loss": 3.7721, + "step": 1768 + }, + { + "epoch": 0.35450901803607215, + "grad_norm": 24.192147938647395, + "learning_rate": 9.99002058240144e-06, + "loss": 4.1862, + "step": 1769 + }, + { + "epoch": 0.35470941883767537, + "grad_norm": 25.362590578519082, + "learning_rate": 9.989946822415217e-06, + "loss": 4.3553, + "step": 1770 + }, + { + "epoch": 0.35490981963927853, + "grad_norm": 25.106758069175257, + "learning_rate": 9.98987279111822e-06, + "loss": 3.8672, + "step": 1771 + }, + { + "epoch": 0.35511022044088175, + "grad_norm": 70.45747911968313, + "learning_rate": 9.989798488514475e-06, + "loss": 3.5069, + "step": 1772 + }, + { + "epoch": 0.35531062124248497, + "grad_norm": 39.14352603357403, + "learning_rate": 9.98972391460802e-06, + "loss": 4.5682, + "step": 1773 + }, + { + "epoch": 0.3555110220440882, + "grad_norm": 48.86192816929149, + "learning_rate": 9.989649069402909e-06, + "loss": 3.8201, + "step": 1774 + }, + { + "epoch": 0.3557114228456914, + "grad_norm": 28.248927707527294, + "learning_rate": 9.989573952903214e-06, + "loss": 3.9043, + "step": 1775 + }, + { + "epoch": 0.35591182364729457, + "grad_norm": 31.69115492093987, + "learning_rate": 9.989498565113016e-06, + "loss": 4.3692, + "step": 1776 + }, + { + "epoch": 0.3561122244488978, + "grad_norm": 30.505546192875563, + "learning_rate": 9.989422906036417e-06, + "loss": 4.188, + "step": 1777 + }, + { + "epoch": 0.356312625250501, + "grad_norm": 31.07424622879295, + "learning_rate": 9.98934697567753e-06, + "loss": 4.5888, + "step": 1778 + }, + { + "epoch": 0.3565130260521042, + "grad_norm": 34.936393152748025, + "learning_rate": 9.989270774040483e-06, + "loss": 5.0312, + "step": 1779 + }, + { + "epoch": 0.35671342685370744, + "grad_norm": 24.999385187141815, + "learning_rate": 
9.98919430112942e-06, + "loss": 4.2455, + "step": 1780 + }, + { + "epoch": 0.3569138276553106, + "grad_norm": 21.873158223300933, + "learning_rate": 9.989117556948495e-06, + "loss": 4.1731, + "step": 1781 + }, + { + "epoch": 0.3571142284569138, + "grad_norm": 40.24397594626588, + "learning_rate": 9.989040541501886e-06, + "loss": 4.4414, + "step": 1782 + }, + { + "epoch": 0.35731462925851704, + "grad_norm": 32.2358333434664, + "learning_rate": 9.988963254793778e-06, + "loss": 4.5121, + "step": 1783 + }, + { + "epoch": 0.35751503006012025, + "grad_norm": 37.20038717335384, + "learning_rate": 9.988885696828374e-06, + "loss": 4.6132, + "step": 1784 + }, + { + "epoch": 0.35771543086172347, + "grad_norm": 40.79924264781999, + "learning_rate": 9.988807867609889e-06, + "loss": 3.9747, + "step": 1785 + }, + { + "epoch": 0.35791583166332663, + "grad_norm": 42.29975437615047, + "learning_rate": 9.988729767142557e-06, + "loss": 4.5074, + "step": 1786 + }, + { + "epoch": 0.35811623246492985, + "grad_norm": 40.613974952001655, + "learning_rate": 9.988651395430624e-06, + "loss": 4.4222, + "step": 1787 + }, + { + "epoch": 0.35831663326653307, + "grad_norm": 35.37051266304566, + "learning_rate": 9.988572752478349e-06, + "loss": 4.0116, + "step": 1788 + }, + { + "epoch": 0.3585170340681363, + "grad_norm": 31.901120612544474, + "learning_rate": 9.98849383829001e-06, + "loss": 4.771, + "step": 1789 + }, + { + "epoch": 0.3587174348697395, + "grad_norm": 20.539412136438386, + "learning_rate": 9.9884146528699e-06, + "loss": 3.8777, + "step": 1790 + }, + { + "epoch": 0.35891783567134267, + "grad_norm": 23.984086747366213, + "learning_rate": 9.988335196222319e-06, + "loss": 4.0929, + "step": 1791 + }, + { + "epoch": 0.3591182364729459, + "grad_norm": 31.022999399286984, + "learning_rate": 9.98825546835159e-06, + "loss": 4.6714, + "step": 1792 + }, + { + "epoch": 0.3593186372745491, + "grad_norm": 64.78739856799146, + "learning_rate": 9.988175469262048e-06, + "loss": 3.9682, + "step": 1793 + }, + { + "epoch": 0.3595190380761523, + "grad_norm": 25.856647896993508, + "learning_rate": 9.988095198958042e-06, + "loss": 3.9249, + "step": 1794 + }, + { + "epoch": 0.35971943887775554, + "grad_norm": 29.783884005666426, + "learning_rate": 9.988014657443941e-06, + "loss": 3.784, + "step": 1795 + }, + { + "epoch": 0.3599198396793587, + "grad_norm": 26.90109729559895, + "learning_rate": 9.987933844724117e-06, + "loss": 4.2513, + "step": 1796 + }, + { + "epoch": 0.3601202404809619, + "grad_norm": 41.16776840326063, + "learning_rate": 9.987852760802965e-06, + "loss": 4.9322, + "step": 1797 + }, + { + "epoch": 0.36032064128256514, + "grad_norm": 36.702311015684714, + "learning_rate": 9.987771405684899e-06, + "loss": 4.1803, + "step": 1798 + }, + { + "epoch": 0.36052104208416835, + "grad_norm": 53.76517498536305, + "learning_rate": 9.987689779374338e-06, + "loss": 4.3493, + "step": 1799 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 26.030862292481547, + "learning_rate": 9.98760788187572e-06, + "loss": 4.2453, + "step": 1800 + }, + { + "epoch": 0.36092184368737473, + "grad_norm": 85.33235638058392, + "learning_rate": 9.9875257131935e-06, + "loss": 4.9515, + "step": 1801 + }, + { + "epoch": 0.36112224448897795, + "grad_norm": 33.65695377136267, + "learning_rate": 9.987443273332146e-06, + "loss": 4.2247, + "step": 1802 + }, + { + "epoch": 0.36132264529058117, + "grad_norm": 60.21791332653529, + "learning_rate": 9.987360562296138e-06, + "loss": 4.4372, + "step": 1803 + }, + { + "epoch": 0.3615230460921844, + "grad_norm": 
24.720516015903325, + "learning_rate": 9.987277580089974e-06, + "loss": 4.0833, + "step": 1804 + }, + { + "epoch": 0.36172344689378755, + "grad_norm": 23.966142958643086, + "learning_rate": 9.987194326718165e-06, + "loss": 4.1076, + "step": 1805 + }, + { + "epoch": 0.36192384769539077, + "grad_norm": 32.60764576399976, + "learning_rate": 9.98711080218524e-06, + "loss": 4.5944, + "step": 1806 + }, + { + "epoch": 0.362124248496994, + "grad_norm": 24.37355145734943, + "learning_rate": 9.98702700649574e-06, + "loss": 4.1831, + "step": 1807 + }, + { + "epoch": 0.3623246492985972, + "grad_norm": 28.62272266843179, + "learning_rate": 9.986942939654219e-06, + "loss": 4.4051, + "step": 1808 + }, + { + "epoch": 0.3625250501002004, + "grad_norm": 46.751116714899084, + "learning_rate": 9.986858601665249e-06, + "loss": 4.2267, + "step": 1809 + }, + { + "epoch": 0.3627254509018036, + "grad_norm": 25.683949529492207, + "learning_rate": 9.986773992533414e-06, + "loss": 4.3889, + "step": 1810 + }, + { + "epoch": 0.3629258517034068, + "grad_norm": 35.67100989549219, + "learning_rate": 9.986689112263318e-06, + "loss": 4.318, + "step": 1811 + }, + { + "epoch": 0.36312625250501, + "grad_norm": 70.11339127483394, + "learning_rate": 9.986603960859573e-06, + "loss": 4.0771, + "step": 1812 + }, + { + "epoch": 0.36332665330661323, + "grad_norm": 32.32853226792746, + "learning_rate": 9.98651853832681e-06, + "loss": 4.2776, + "step": 1813 + }, + { + "epoch": 0.36352705410821645, + "grad_norm": 28.778138607123758, + "learning_rate": 9.986432844669673e-06, + "loss": 4.6176, + "step": 1814 + }, + { + "epoch": 0.3637274549098196, + "grad_norm": 30.886961849233252, + "learning_rate": 9.986346879892823e-06, + "loss": 4.4498, + "step": 1815 + }, + { + "epoch": 0.36392785571142283, + "grad_norm": 27.090227488628212, + "learning_rate": 9.986260644000932e-06, + "loss": 3.8584, + "step": 1816 + }, + { + "epoch": 0.36412825651302605, + "grad_norm": 25.77204391233318, + "learning_rate": 9.986174136998687e-06, + "loss": 3.9586, + "step": 1817 + }, + { + "epoch": 0.36432865731462927, + "grad_norm": 32.457405266808465, + "learning_rate": 9.986087358890798e-06, + "loss": 4.7123, + "step": 1818 + }, + { + "epoch": 0.3645290581162325, + "grad_norm": 20.79834974527047, + "learning_rate": 9.986000309681975e-06, + "loss": 4.2384, + "step": 1819 + }, + { + "epoch": 0.36472945891783565, + "grad_norm": 36.4684449383233, + "learning_rate": 9.985912989376958e-06, + "loss": 4.9314, + "step": 1820 + }, + { + "epoch": 0.36492985971943886, + "grad_norm": 27.865410156936303, + "learning_rate": 9.98582539798049e-06, + "loss": 4.3023, + "step": 1821 + }, + { + "epoch": 0.3651302605210421, + "grad_norm": 32.82810225205578, + "learning_rate": 9.985737535497337e-06, + "loss": 4.0612, + "step": 1822 + }, + { + "epoch": 0.3653306613226453, + "grad_norm": 27.981628765367258, + "learning_rate": 9.985649401932275e-06, + "loss": 4.1688, + "step": 1823 + }, + { + "epoch": 0.3655310621242485, + "grad_norm": 33.80267618553597, + "learning_rate": 9.985560997290094e-06, + "loss": 4.5351, + "step": 1824 + }, + { + "epoch": 0.3657314629258517, + "grad_norm": 33.89498992318836, + "learning_rate": 9.985472321575602e-06, + "loss": 4.2963, + "step": 1825 + }, + { + "epoch": 0.3659318637274549, + "grad_norm": 67.30258830243302, + "learning_rate": 9.985383374793622e-06, + "loss": 3.999, + "step": 1826 + }, + { + "epoch": 0.3661322645290581, + "grad_norm": 26.01720743556433, + "learning_rate": 9.985294156948987e-06, + "loss": 3.7892, + "step": 1827 + }, + { + "epoch": 
0.36633266533066133, + "grad_norm": 25.558595015377236, + "learning_rate": 9.985204668046554e-06, + "loss": 4.0313, + "step": 1828 + }, + { + "epoch": 0.36653306613226455, + "grad_norm": 23.852221256132143, + "learning_rate": 9.98511490809118e-06, + "loss": 3.7849, + "step": 1829 + }, + { + "epoch": 0.3667334669338677, + "grad_norm": 38.58810158747277, + "learning_rate": 9.98502487708775e-06, + "loss": 4.5666, + "step": 1830 + }, + { + "epoch": 0.36693386773547093, + "grad_norm": 29.255425829976822, + "learning_rate": 9.98493457504116e-06, + "loss": 3.8834, + "step": 1831 + }, + { + "epoch": 0.36713426853707415, + "grad_norm": 28.950637617326873, + "learning_rate": 9.98484400195632e-06, + "loss": 4.0724, + "step": 1832 + }, + { + "epoch": 0.36733466933867737, + "grad_norm": 27.990570849539356, + "learning_rate": 9.98475315783815e-06, + "loss": 4.0135, + "step": 1833 + }, + { + "epoch": 0.3675350701402806, + "grad_norm": 38.53785457986264, + "learning_rate": 9.984662042691594e-06, + "loss": 4.4785, + "step": 1834 + }, + { + "epoch": 0.36773547094188375, + "grad_norm": 23.05241547284947, + "learning_rate": 9.984570656521606e-06, + "loss": 3.5892, + "step": 1835 + }, + { + "epoch": 0.36793587174348696, + "grad_norm": 32.725674030922654, + "learning_rate": 9.984478999333153e-06, + "loss": 3.7089, + "step": 1836 + }, + { + "epoch": 0.3681362725450902, + "grad_norm": 30.61503910069366, + "learning_rate": 9.984387071131218e-06, + "loss": 4.0164, + "step": 1837 + }, + { + "epoch": 0.3683366733466934, + "grad_norm": 32.47220237826144, + "learning_rate": 9.984294871920802e-06, + "loss": 4.4828, + "step": 1838 + }, + { + "epoch": 0.3685370741482966, + "grad_norm": 25.903053465775404, + "learning_rate": 9.984202401706916e-06, + "loss": 4.4071, + "step": 1839 + }, + { + "epoch": 0.3687374749498998, + "grad_norm": 33.37911635842313, + "learning_rate": 9.984109660494587e-06, + "loss": 3.5807, + "step": 1840 + }, + { + "epoch": 0.368937875751503, + "grad_norm": 85.22869574652289, + "learning_rate": 9.984016648288858e-06, + "loss": 4.4918, + "step": 1841 + }, + { + "epoch": 0.3691382765531062, + "grad_norm": 22.309818093737743, + "learning_rate": 9.983923365094789e-06, + "loss": 3.6649, + "step": 1842 + }, + { + "epoch": 0.36933867735470943, + "grad_norm": 22.285180772837762, + "learning_rate": 9.983829810917448e-06, + "loss": 4.0077, + "step": 1843 + }, + { + "epoch": 0.36953907815631265, + "grad_norm": 36.61005005352822, + "learning_rate": 9.983735985761924e-06, + "loss": 4.2807, + "step": 1844 + }, + { + "epoch": 0.3697394789579158, + "grad_norm": 43.32815583194087, + "learning_rate": 9.983641889633318e-06, + "loss": 4.8648, + "step": 1845 + }, + { + "epoch": 0.36993987975951903, + "grad_norm": 20.457844575485417, + "learning_rate": 9.983547522536745e-06, + "loss": 3.4201, + "step": 1846 + }, + { + "epoch": 0.37014028056112225, + "grad_norm": 29.863050681090286, + "learning_rate": 9.983452884477339e-06, + "loss": 4.3851, + "step": 1847 + }, + { + "epoch": 0.37034068136272547, + "grad_norm": 30.308605656227456, + "learning_rate": 9.98335797546024e-06, + "loss": 4.519, + "step": 1848 + }, + { + "epoch": 0.3705410821643287, + "grad_norm": 36.457723781610994, + "learning_rate": 9.983262795490614e-06, + "loss": 3.9877, + "step": 1849 + }, + { + "epoch": 0.37074148296593185, + "grad_norm": 32.133163597366554, + "learning_rate": 9.983167344573632e-06, + "loss": 4.3062, + "step": 1850 + }, + { + "epoch": 0.37094188376753506, + "grad_norm": 21.768462712522567, + "learning_rate": 9.98307162271449e-06, + "loss": 
4.1178, + "step": 1851 + }, + { + "epoch": 0.3711422845691383, + "grad_norm": 31.074280743622833, + "learning_rate": 9.982975629918383e-06, + "loss": 4.8827, + "step": 1852 + }, + { + "epoch": 0.3713426853707415, + "grad_norm": 24.47419164157091, + "learning_rate": 9.982879366190537e-06, + "loss": 3.7721, + "step": 1853 + }, + { + "epoch": 0.3715430861723447, + "grad_norm": 33.575806703093576, + "learning_rate": 9.982782831536186e-06, + "loss": 4.7011, + "step": 1854 + }, + { + "epoch": 0.3717434869739479, + "grad_norm": 22.646548944953718, + "learning_rate": 9.982686025960576e-06, + "loss": 4.0847, + "step": 1855 + }, + { + "epoch": 0.3719438877755511, + "grad_norm": 28.46530007595797, + "learning_rate": 9.982588949468972e-06, + "loss": 4.0031, + "step": 1856 + }, + { + "epoch": 0.3721442885771543, + "grad_norm": 46.35809636108396, + "learning_rate": 9.982491602066652e-06, + "loss": 4.1916, + "step": 1857 + }, + { + "epoch": 0.37234468937875753, + "grad_norm": 40.44973164103229, + "learning_rate": 9.982393983758909e-06, + "loss": 5.2319, + "step": 1858 + }, + { + "epoch": 0.3725450901803607, + "grad_norm": 31.32571822863572, + "learning_rate": 9.982296094551049e-06, + "loss": 3.9892, + "step": 1859 + }, + { + "epoch": 0.3727454909819639, + "grad_norm": 45.72867114094556, + "learning_rate": 9.982197934448397e-06, + "loss": 4.2451, + "step": 1860 + }, + { + "epoch": 0.37294589178356713, + "grad_norm": 40.5238420751581, + "learning_rate": 9.982099503456288e-06, + "loss": 4.8001, + "step": 1861 + }, + { + "epoch": 0.37314629258517035, + "grad_norm": 59.59270770013551, + "learning_rate": 9.982000801580075e-06, + "loss": 4.3594, + "step": 1862 + }, + { + "epoch": 0.37334669338677356, + "grad_norm": 75.66139492343333, + "learning_rate": 9.981901828825127e-06, + "loss": 4.0492, + "step": 1863 + }, + { + "epoch": 0.3735470941883767, + "grad_norm": 21.086665253866133, + "learning_rate": 9.981802585196819e-06, + "loss": 3.7234, + "step": 1864 + }, + { + "epoch": 0.37374749498997994, + "grad_norm": 30.340318267602953, + "learning_rate": 9.981703070700552e-06, + "loss": 4.2527, + "step": 1865 + }, + { + "epoch": 0.37394789579158316, + "grad_norm": 27.20080029902389, + "learning_rate": 9.981603285341735e-06, + "loss": 4.245, + "step": 1866 + }, + { + "epoch": 0.3741482965931864, + "grad_norm": 22.25291528638188, + "learning_rate": 9.981503229125794e-06, + "loss": 3.7111, + "step": 1867 + }, + { + "epoch": 0.3743486973947896, + "grad_norm": 28.57688791945932, + "learning_rate": 9.98140290205817e-06, + "loss": 3.9151, + "step": 1868 + }, + { + "epoch": 0.37454909819639276, + "grad_norm": 25.76546823272907, + "learning_rate": 9.981302304144316e-06, + "loss": 4.0527, + "step": 1869 + }, + { + "epoch": 0.374749498997996, + "grad_norm": 30.43960030463618, + "learning_rate": 9.981201435389702e-06, + "loss": 4.7687, + "step": 1870 + }, + { + "epoch": 0.3749498997995992, + "grad_norm": 46.906614562630224, + "learning_rate": 9.981100295799814e-06, + "loss": 4.1767, + "step": 1871 + }, + { + "epoch": 0.3751503006012024, + "grad_norm": 38.50604005877605, + "learning_rate": 9.98099888538015e-06, + "loss": 3.735, + "step": 1872 + }, + { + "epoch": 0.37535070140280563, + "grad_norm": 57.294777301091365, + "learning_rate": 9.980897204136226e-06, + "loss": 4.5273, + "step": 1873 + }, + { + "epoch": 0.3755511022044088, + "grad_norm": 28.525692557969943, + "learning_rate": 9.980795252073565e-06, + "loss": 4.2726, + "step": 1874 + }, + { + "epoch": 0.375751503006012, + "grad_norm": 49.99569783880763, + "learning_rate": 
9.980693029197717e-06, + "loss": 5.2239, + "step": 1875 + }, + { + "epoch": 0.37595190380761523, + "grad_norm": 32.245615515027595, + "learning_rate": 9.980590535514234e-06, + "loss": 4.1111, + "step": 1876 + }, + { + "epoch": 0.37615230460921845, + "grad_norm": 27.900670439481388, + "learning_rate": 9.980487771028693e-06, + "loss": 4.5403, + "step": 1877 + }, + { + "epoch": 0.37635270541082166, + "grad_norm": 26.35683537675117, + "learning_rate": 9.98038473574668e-06, + "loss": 3.9897, + "step": 1878 + }, + { + "epoch": 0.3765531062124248, + "grad_norm": 32.04538745230117, + "learning_rate": 9.980281429673796e-06, + "loss": 4.1139, + "step": 1879 + }, + { + "epoch": 0.37675350701402804, + "grad_norm": 48.55096688078433, + "learning_rate": 9.98017785281566e-06, + "loss": 4.3255, + "step": 1880 + }, + { + "epoch": 0.37695390781563126, + "grad_norm": 31.42161223518995, + "learning_rate": 9.980074005177904e-06, + "loss": 4.2219, + "step": 1881 + }, + { + "epoch": 0.3771543086172345, + "grad_norm": 25.45283349723508, + "learning_rate": 9.979969886766171e-06, + "loss": 4.2843, + "step": 1882 + }, + { + "epoch": 0.3773547094188377, + "grad_norm": 29.456034243060692, + "learning_rate": 9.979865497586125e-06, + "loss": 4.5153, + "step": 1883 + }, + { + "epoch": 0.37755511022044086, + "grad_norm": 71.5362875484047, + "learning_rate": 9.979760837643443e-06, + "loss": 4.1022, + "step": 1884 + }, + { + "epoch": 0.3777555110220441, + "grad_norm": 18.79311299805307, + "learning_rate": 9.97965590694381e-06, + "loss": 3.4778, + "step": 1885 + }, + { + "epoch": 0.3779559118236473, + "grad_norm": 22.244615746209227, + "learning_rate": 9.979550705492935e-06, + "loss": 4.1328, + "step": 1886 + }, + { + "epoch": 0.3781563126252505, + "grad_norm": 24.55748537666317, + "learning_rate": 9.97944523329654e-06, + "loss": 4.2734, + "step": 1887 + }, + { + "epoch": 0.37835671342685373, + "grad_norm": 22.3560399563075, + "learning_rate": 9.979339490360356e-06, + "loss": 3.859, + "step": 1888 + }, + { + "epoch": 0.3785571142284569, + "grad_norm": 20.16502523814936, + "learning_rate": 9.979233476690134e-06, + "loss": 4.246, + "step": 1889 + }, + { + "epoch": 0.3787575150300601, + "grad_norm": 30.575202987780212, + "learning_rate": 9.979127192291637e-06, + "loss": 3.8712, + "step": 1890 + }, + { + "epoch": 0.37895791583166333, + "grad_norm": 21.75711615918879, + "learning_rate": 9.979020637170645e-06, + "loss": 4.0398, + "step": 1891 + }, + { + "epoch": 0.37915831663326655, + "grad_norm": 31.690123987335443, + "learning_rate": 9.978913811332952e-06, + "loss": 4.2641, + "step": 1892 + }, + { + "epoch": 0.37935871743486976, + "grad_norm": 38.13386656321749, + "learning_rate": 9.978806714784367e-06, + "loss": 4.53, + "step": 1893 + }, + { + "epoch": 0.3795591182364729, + "grad_norm": 30.889985768899255, + "learning_rate": 9.978699347530709e-06, + "loss": 4.0591, + "step": 1894 + }, + { + "epoch": 0.37975951903807614, + "grad_norm": 57.694881987034584, + "learning_rate": 9.978591709577818e-06, + "loss": 4.4821, + "step": 1895 + }, + { + "epoch": 0.37995991983967936, + "grad_norm": 41.45837600253992, + "learning_rate": 9.978483800931547e-06, + "loss": 3.9595, + "step": 1896 + }, + { + "epoch": 0.3801603206412826, + "grad_norm": 47.98351454354782, + "learning_rate": 9.978375621597765e-06, + "loss": 4.5299, + "step": 1897 + }, + { + "epoch": 0.3803607214428858, + "grad_norm": 41.072777195404726, + "learning_rate": 9.978267171582348e-06, + "loss": 4.7114, + "step": 1898 + }, + { + "epoch": 0.38056112224448896, + "grad_norm": 
23.753185338851946, + "learning_rate": 9.9781584508912e-06, + "loss": 4.1612, + "step": 1899 + }, + { + "epoch": 0.3807615230460922, + "grad_norm": 28.14065568322544, + "learning_rate": 9.978049459530226e-06, + "loss": 4.627, + "step": 1900 + }, + { + "epoch": 0.3809619238476954, + "grad_norm": 25.79842810845467, + "learning_rate": 9.977940197505357e-06, + "loss": 4.1752, + "step": 1901 + }, + { + "epoch": 0.3811623246492986, + "grad_norm": 33.06630279850805, + "learning_rate": 9.977830664822528e-06, + "loss": 4.3763, + "step": 1902 + }, + { + "epoch": 0.38136272545090183, + "grad_norm": 26.461606773224293, + "learning_rate": 9.9777208614877e-06, + "loss": 4.1939, + "step": 1903 + }, + { + "epoch": 0.381563126252505, + "grad_norm": 24.50783952978314, + "learning_rate": 9.977610787506841e-06, + "loss": 3.9378, + "step": 1904 + }, + { + "epoch": 0.3817635270541082, + "grad_norm": 30.82828251943764, + "learning_rate": 9.977500442885936e-06, + "loss": 4.0372, + "step": 1905 + }, + { + "epoch": 0.3819639278557114, + "grad_norm": 31.261690594224273, + "learning_rate": 9.977389827630984e-06, + "loss": 4.4721, + "step": 1906 + }, + { + "epoch": 0.38216432865731464, + "grad_norm": 25.78119508584258, + "learning_rate": 9.977278941748e-06, + "loss": 4.1993, + "step": 1907 + }, + { + "epoch": 0.38236472945891786, + "grad_norm": 28.761012405428975, + "learning_rate": 9.977167785243012e-06, + "loss": 4.1857, + "step": 1908 + }, + { + "epoch": 0.382565130260521, + "grad_norm": 47.94673943623902, + "learning_rate": 9.977056358122065e-06, + "loss": 4.6361, + "step": 1909 + }, + { + "epoch": 0.38276553106212424, + "grad_norm": 31.01681077758332, + "learning_rate": 9.976944660391217e-06, + "loss": 4.4672, + "step": 1910 + }, + { + "epoch": 0.38296593186372746, + "grad_norm": 34.40102089617197, + "learning_rate": 9.976832692056541e-06, + "loss": 4.2212, + "step": 1911 + }, + { + "epoch": 0.3831663326653307, + "grad_norm": 36.80611096098526, + "learning_rate": 9.976720453124126e-06, + "loss": 4.1197, + "step": 1912 + }, + { + "epoch": 0.38336673346693384, + "grad_norm": 25.132579316023765, + "learning_rate": 9.976607943600073e-06, + "loss": 4.1235, + "step": 1913 + }, + { + "epoch": 0.38356713426853706, + "grad_norm": 62.01622945316947, + "learning_rate": 9.976495163490501e-06, + "loss": 3.7253, + "step": 1914 + }, + { + "epoch": 0.3837675350701403, + "grad_norm": 42.27514348766147, + "learning_rate": 9.97638211280154e-06, + "loss": 4.1353, + "step": 1915 + }, + { + "epoch": 0.3839679358717435, + "grad_norm": 28.346131625226487, + "learning_rate": 9.976268791539337e-06, + "loss": 4.439, + "step": 1916 + }, + { + "epoch": 0.3841683366733467, + "grad_norm": 42.0125823717013, + "learning_rate": 9.976155199710053e-06, + "loss": 4.9756, + "step": 1917 + }, + { + "epoch": 0.3843687374749499, + "grad_norm": 29.30539578889457, + "learning_rate": 9.976041337319868e-06, + "loss": 3.9207, + "step": 1918 + }, + { + "epoch": 0.3845691382765531, + "grad_norm": 69.49008976377357, + "learning_rate": 9.97592720437497e-06, + "loss": 4.208, + "step": 1919 + }, + { + "epoch": 0.3847695390781563, + "grad_norm": 92.26424440228882, + "learning_rate": 9.975812800881564e-06, + "loss": 4.323, + "step": 1920 + }, + { + "epoch": 0.3849699398797595, + "grad_norm": 50.066357446890365, + "learning_rate": 9.97569812684587e-06, + "loss": 4.1722, + "step": 1921 + }, + { + "epoch": 0.38517034068136274, + "grad_norm": 27.76176434044297, + "learning_rate": 9.975583182274126e-06, + "loss": 4.3199, + "step": 1922 + }, + { + "epoch": 
0.3853707414829659, + "grad_norm": 28.765493505901205, + "learning_rate": 9.975467967172578e-06, + "loss": 4.5123, + "step": 1923 + }, + { + "epoch": 0.3855711422845691, + "grad_norm": 37.13546330985124, + "learning_rate": 9.975352481547493e-06, + "loss": 4.3219, + "step": 1924 + }, + { + "epoch": 0.38577154308617234, + "grad_norm": 31.6861486698387, + "learning_rate": 9.975236725405151e-06, + "loss": 4.2599, + "step": 1925 + }, + { + "epoch": 0.38597194388777556, + "grad_norm": 33.61300417976179, + "learning_rate": 9.97512069875184e-06, + "loss": 4.0048, + "step": 1926 + }, + { + "epoch": 0.3861723446893788, + "grad_norm": 42.816574817889354, + "learning_rate": 9.975004401593878e-06, + "loss": 5.0739, + "step": 1927 + }, + { + "epoch": 0.38637274549098194, + "grad_norm": 21.09102577907881, + "learning_rate": 9.974887833937578e-06, + "loss": 3.8869, + "step": 1928 + }, + { + "epoch": 0.38657314629258516, + "grad_norm": 23.879231004865115, + "learning_rate": 9.974770995789286e-06, + "loss": 4.0961, + "step": 1929 + }, + { + "epoch": 0.3867735470941884, + "grad_norm": 27.30098115732616, + "learning_rate": 9.97465388715535e-06, + "loss": 3.9981, + "step": 1930 + }, + { + "epoch": 0.3869739478957916, + "grad_norm": 37.37916773389435, + "learning_rate": 9.97453650804214e-06, + "loss": 4.8306, + "step": 1931 + }, + { + "epoch": 0.3871743486973948, + "grad_norm": 33.470219368030214, + "learning_rate": 9.974418858456037e-06, + "loss": 4.2614, + "step": 1932 + }, + { + "epoch": 0.387374749498998, + "grad_norm": 37.298734492854905, + "learning_rate": 9.974300938403437e-06, + "loss": 3.7436, + "step": 1933 + }, + { + "epoch": 0.3875751503006012, + "grad_norm": 36.4669585292097, + "learning_rate": 9.97418274789075e-06, + "loss": 4.4285, + "step": 1934 + }, + { + "epoch": 0.3877755511022044, + "grad_norm": 25.65578365380366, + "learning_rate": 9.974064286924408e-06, + "loss": 3.9138, + "step": 1935 + }, + { + "epoch": 0.3879759519038076, + "grad_norm": 25.833463825141777, + "learning_rate": 9.973945555510847e-06, + "loss": 4.1344, + "step": 1936 + }, + { + "epoch": 0.38817635270541084, + "grad_norm": 74.69630003473269, + "learning_rate": 9.973826553656524e-06, + "loss": 4.4083, + "step": 1937 + }, + { + "epoch": 0.388376753507014, + "grad_norm": 30.834939600609083, + "learning_rate": 9.973707281367907e-06, + "loss": 4.3416, + "step": 1938 + }, + { + "epoch": 0.3885771543086172, + "grad_norm": 22.047550862050752, + "learning_rate": 9.973587738651487e-06, + "loss": 3.9205, + "step": 1939 + }, + { + "epoch": 0.38877755511022044, + "grad_norm": 44.5550778180004, + "learning_rate": 9.973467925513758e-06, + "loss": 5.0485, + "step": 1940 + }, + { + "epoch": 0.38897795591182366, + "grad_norm": 24.838041915906032, + "learning_rate": 9.973347841961238e-06, + "loss": 3.9478, + "step": 1941 + }, + { + "epoch": 0.3891783567134269, + "grad_norm": 71.85707249429196, + "learning_rate": 9.973227488000452e-06, + "loss": 4.033, + "step": 1942 + }, + { + "epoch": 0.38937875751503004, + "grad_norm": 45.96573789540825, + "learning_rate": 9.973106863637948e-06, + "loss": 4.1807, + "step": 1943 + }, + { + "epoch": 0.38957915831663326, + "grad_norm": 46.52985775429259, + "learning_rate": 9.972985968880282e-06, + "loss": 5.0254, + "step": 1944 + }, + { + "epoch": 0.3897795591182365, + "grad_norm": 43.321559763845244, + "learning_rate": 9.97286480373403e-06, + "loss": 4.6561, + "step": 1945 + }, + { + "epoch": 0.3899799599198397, + "grad_norm": 32.63807270685037, + "learning_rate": 9.972743368205776e-06, + "loss": 4.1631, + 
"step": 1946 + }, + { + "epoch": 0.3901803607214429, + "grad_norm": 22.063770730761938, + "learning_rate": 9.972621662302125e-06, + "loss": 3.9952, + "step": 1947 + }, + { + "epoch": 0.39038076152304607, + "grad_norm": 76.62075394452788, + "learning_rate": 9.972499686029694e-06, + "loss": 4.5847, + "step": 1948 + }, + { + "epoch": 0.3905811623246493, + "grad_norm": 31.5649465239072, + "learning_rate": 9.972377439395116e-06, + "loss": 4.8721, + "step": 1949 + }, + { + "epoch": 0.3907815631262525, + "grad_norm": 31.594777203589736, + "learning_rate": 9.972254922405034e-06, + "loss": 4.3621, + "step": 1950 + }, + { + "epoch": 0.3909819639278557, + "grad_norm": 22.61089798619473, + "learning_rate": 9.972132135066115e-06, + "loss": 3.9427, + "step": 1951 + }, + { + "epoch": 0.39118236472945894, + "grad_norm": 38.99550520795903, + "learning_rate": 9.97200907738503e-06, + "loss": 4.8481, + "step": 1952 + }, + { + "epoch": 0.3913827655310621, + "grad_norm": 24.65218826307859, + "learning_rate": 9.971885749368474e-06, + "loss": 3.5682, + "step": 1953 + }, + { + "epoch": 0.3915831663326653, + "grad_norm": 22.345549641864327, + "learning_rate": 9.97176215102315e-06, + "loss": 4.3344, + "step": 1954 + }, + { + "epoch": 0.39178356713426854, + "grad_norm": 32.093513786640315, + "learning_rate": 9.971638282355779e-06, + "loss": 4.7478, + "step": 1955 + }, + { + "epoch": 0.39198396793587176, + "grad_norm": 32.458663023560206, + "learning_rate": 9.971514143373095e-06, + "loss": 4.2951, + "step": 1956 + }, + { + "epoch": 0.392184368737475, + "grad_norm": 26.544028790824846, + "learning_rate": 9.971389734081848e-06, + "loss": 3.791, + "step": 1957 + }, + { + "epoch": 0.39238476953907814, + "grad_norm": 39.10472280698276, + "learning_rate": 9.971265054488806e-06, + "loss": 4.3054, + "step": 1958 + }, + { + "epoch": 0.39258517034068136, + "grad_norm": 20.331752445690874, + "learning_rate": 9.971140104600742e-06, + "loss": 3.9309, + "step": 1959 + }, + { + "epoch": 0.3927855711422846, + "grad_norm": 31.429706917223807, + "learning_rate": 9.971014884424453e-06, + "loss": 4.6473, + "step": 1960 + }, + { + "epoch": 0.3929859719438878, + "grad_norm": 34.86854980732473, + "learning_rate": 9.970889393966746e-06, + "loss": 4.6573, + "step": 1961 + }, + { + "epoch": 0.393186372745491, + "grad_norm": 29.240892802196527, + "learning_rate": 9.970763633234445e-06, + "loss": 4.2317, + "step": 1962 + }, + { + "epoch": 0.39338677354709417, + "grad_norm": 65.90325860594214, + "learning_rate": 9.97063760223439e-06, + "loss": 3.7778, + "step": 1963 + }, + { + "epoch": 0.3935871743486974, + "grad_norm": 29.69905291278442, + "learning_rate": 9.97051130097343e-06, + "loss": 4.6063, + "step": 1964 + }, + { + "epoch": 0.3937875751503006, + "grad_norm": 24.724292998338893, + "learning_rate": 9.970384729458431e-06, + "loss": 4.1279, + "step": 1965 + }, + { + "epoch": 0.3939879759519038, + "grad_norm": 60.08874779123214, + "learning_rate": 9.97025788769628e-06, + "loss": 4.9222, + "step": 1966 + }, + { + "epoch": 0.39418837675350704, + "grad_norm": 52.461803514362636, + "learning_rate": 9.97013077569387e-06, + "loss": 4.4396, + "step": 1967 + }, + { + "epoch": 0.3943887775551102, + "grad_norm": 33.82598731632094, + "learning_rate": 9.970003393458115e-06, + "loss": 4.3237, + "step": 1968 + }, + { + "epoch": 0.3945891783567134, + "grad_norm": 24.55398689581785, + "learning_rate": 9.969875740995936e-06, + "loss": 4.2428, + "step": 1969 + }, + { + "epoch": 0.39478957915831664, + "grad_norm": 23.050144149005877, + "learning_rate": 
9.969747818314279e-06, + "loss": 3.7081, + "step": 1970 + }, + { + "epoch": 0.39498997995991986, + "grad_norm": 26.42175133749172, + "learning_rate": 9.969619625420094e-06, + "loss": 4.5837, + "step": 1971 + }, + { + "epoch": 0.395190380761523, + "grad_norm": 30.013807018544767, + "learning_rate": 9.969491162320358e-06, + "loss": 4.1374, + "step": 1972 + }, + { + "epoch": 0.39539078156312624, + "grad_norm": 30.2836390888192, + "learning_rate": 9.96936242902205e-06, + "loss": 4.3794, + "step": 1973 + }, + { + "epoch": 0.39559118236472945, + "grad_norm": 28.203692604863082, + "learning_rate": 9.969233425532173e-06, + "loss": 4.3107, + "step": 1974 + }, + { + "epoch": 0.39579158316633267, + "grad_norm": 30.719218914986936, + "learning_rate": 9.969104151857737e-06, + "loss": 4.4055, + "step": 1975 + }, + { + "epoch": 0.3959919839679359, + "grad_norm": 28.214312463938857, + "learning_rate": 9.968974608005775e-06, + "loss": 4.4278, + "step": 1976 + }, + { + "epoch": 0.39619238476953905, + "grad_norm": 25.239331665352466, + "learning_rate": 9.968844793983327e-06, + "loss": 4.2868, + "step": 1977 + }, + { + "epoch": 0.39639278557114227, + "grad_norm": 22.297508734295416, + "learning_rate": 9.968714709797453e-06, + "loss": 4.3855, + "step": 1978 + }, + { + "epoch": 0.3965931863727455, + "grad_norm": 66.45800173922731, + "learning_rate": 9.968584355455226e-06, + "loss": 4.4964, + "step": 1979 + }, + { + "epoch": 0.3967935871743487, + "grad_norm": 34.99999452092519, + "learning_rate": 9.968453730963734e-06, + "loss": 4.3651, + "step": 1980 + }, + { + "epoch": 0.3969939879759519, + "grad_norm": 27.673150049423032, + "learning_rate": 9.968322836330079e-06, + "loss": 4.3126, + "step": 1981 + }, + { + "epoch": 0.3971943887775551, + "grad_norm": 31.11241198079208, + "learning_rate": 9.968191671561378e-06, + "loss": 4.169, + "step": 1982 + }, + { + "epoch": 0.3973947895791583, + "grad_norm": 21.275427608200282, + "learning_rate": 9.968060236664759e-06, + "loss": 4.1895, + "step": 1983 + }, + { + "epoch": 0.3975951903807615, + "grad_norm": 37.44108616585207, + "learning_rate": 9.967928531647375e-06, + "loss": 4.6693, + "step": 1984 + }, + { + "epoch": 0.39779559118236474, + "grad_norm": 30.30062670918694, + "learning_rate": 9.96779655651638e-06, + "loss": 4.5302, + "step": 1985 + }, + { + "epoch": 0.39799599198396796, + "grad_norm": 28.812432641374176, + "learning_rate": 9.967664311278954e-06, + "loss": 4.0502, + "step": 1986 + }, + { + "epoch": 0.3981963927855711, + "grad_norm": 29.026187586894714, + "learning_rate": 9.967531795942289e-06, + "loss": 4.674, + "step": 1987 + }, + { + "epoch": 0.39839679358717434, + "grad_norm": 17.66054496415478, + "learning_rate": 9.967399010513586e-06, + "loss": 3.4153, + "step": 1988 + }, + { + "epoch": 0.39859719438877755, + "grad_norm": 34.035545348591235, + "learning_rate": 9.967265955000066e-06, + "loss": 4.5013, + "step": 1989 + }, + { + "epoch": 0.39879759519038077, + "grad_norm": 29.850720975860206, + "learning_rate": 9.967132629408962e-06, + "loss": 3.7378, + "step": 1990 + }, + { + "epoch": 0.398997995991984, + "grad_norm": 29.425720387238332, + "learning_rate": 9.966999033747526e-06, + "loss": 4.3036, + "step": 1991 + }, + { + "epoch": 0.39919839679358715, + "grad_norm": 26.694707825213527, + "learning_rate": 9.96686516802302e-06, + "loss": 4.0549, + "step": 1992 + }, + { + "epoch": 0.39939879759519037, + "grad_norm": 25.376273736243736, + "learning_rate": 9.966731032242726e-06, + "loss": 4.3703, + "step": 1993 + }, + { + "epoch": 0.3995991983967936, + 
"grad_norm": 35.40948403548425, + "learning_rate": 9.966596626413932e-06, + "loss": 4.2315, + "step": 1994 + }, + { + "epoch": 0.3997995991983968, + "grad_norm": 63.04308939477783, + "learning_rate": 9.966461950543948e-06, + "loss": 4.5712, + "step": 1995 + }, + { + "epoch": 0.4, + "grad_norm": 34.0896704425005, + "learning_rate": 9.966327004640098e-06, + "loss": 4.1847, + "step": 1996 + }, + { + "epoch": 0.4002004008016032, + "grad_norm": 24.433568457236678, + "learning_rate": 9.966191788709716e-06, + "loss": 4.288, + "step": 1997 + }, + { + "epoch": 0.4004008016032064, + "grad_norm": 39.98019214771919, + "learning_rate": 9.966056302760157e-06, + "loss": 4.3979, + "step": 1998 + }, + { + "epoch": 0.4006012024048096, + "grad_norm": 38.73125056073687, + "learning_rate": 9.965920546798784e-06, + "loss": 4.5454, + "step": 1999 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 40.608935088126835, + "learning_rate": 9.965784520832983e-06, + "loss": 4.2851, + "step": 2000 + }, + { + "epoch": 0.40100200400801606, + "grad_norm": 24.026435318762864, + "learning_rate": 9.965648224870145e-06, + "loss": 4.2131, + "step": 2001 + }, + { + "epoch": 0.4012024048096192, + "grad_norm": 46.431092674083104, + "learning_rate": 9.965511658917685e-06, + "loss": 4.1371, + "step": 2002 + }, + { + "epoch": 0.40140280561122244, + "grad_norm": 27.19490054034781, + "learning_rate": 9.965374822983027e-06, + "loss": 3.9063, + "step": 2003 + }, + { + "epoch": 0.40160320641282565, + "grad_norm": 47.980018909347876, + "learning_rate": 9.965237717073607e-06, + "loss": 4.4311, + "step": 2004 + }, + { + "epoch": 0.40180360721442887, + "grad_norm": 31.05168131481436, + "learning_rate": 9.965100341196888e-06, + "loss": 3.9572, + "step": 2005 + }, + { + "epoch": 0.4020040080160321, + "grad_norm": 23.69728637846242, + "learning_rate": 9.964962695360329e-06, + "loss": 3.7626, + "step": 2006 + }, + { + "epoch": 0.40220440881763525, + "grad_norm": 35.42523815730136, + "learning_rate": 9.964824779571422e-06, + "loss": 4.7518, + "step": 2007 + }, + { + "epoch": 0.40240480961923847, + "grad_norm": 32.92971486617655, + "learning_rate": 9.964686593837664e-06, + "loss": 3.857, + "step": 2008 + }, + { + "epoch": 0.4026052104208417, + "grad_norm": 16.53424173821417, + "learning_rate": 9.964548138166565e-06, + "loss": 4.0146, + "step": 2009 + }, + { + "epoch": 0.4028056112224449, + "grad_norm": 28.907579534494793, + "learning_rate": 9.964409412565656e-06, + "loss": 4.7994, + "step": 2010 + }, + { + "epoch": 0.4030060120240481, + "grad_norm": 23.835246028413383, + "learning_rate": 9.96427041704248e-06, + "loss": 3.7224, + "step": 2011 + }, + { + "epoch": 0.4032064128256513, + "grad_norm": 22.653626222740524, + "learning_rate": 9.964131151604594e-06, + "loss": 4.2088, + "step": 2012 + }, + { + "epoch": 0.4034068136272545, + "grad_norm": 18.182328541839237, + "learning_rate": 9.963991616259568e-06, + "loss": 3.4213, + "step": 2013 + }, + { + "epoch": 0.4036072144288577, + "grad_norm": 34.45037473791887, + "learning_rate": 9.963851811014991e-06, + "loss": 3.9903, + "step": 2014 + }, + { + "epoch": 0.40380761523046094, + "grad_norm": 48.97652771209653, + "learning_rate": 9.963711735878463e-06, + "loss": 4.2145, + "step": 2015 + }, + { + "epoch": 0.40400801603206415, + "grad_norm": 28.636093764834044, + "learning_rate": 9.963571390857602e-06, + "loss": 4.2177, + "step": 2016 + }, + { + "epoch": 0.4042084168336673, + "grad_norm": 29.5923363611352, + "learning_rate": 9.963430775960036e-06, + "loss": 4.0964, + "step": 2017 + }, + { + "epoch": 
0.40440881763527053, + "grad_norm": 25.680638404157456, + "learning_rate": 9.963289891193415e-06, + "loss": 4.2499, + "step": 2018 + }, + { + "epoch": 0.40460921843687375, + "grad_norm": 31.44665596972097, + "learning_rate": 9.963148736565393e-06, + "loss": 3.9478, + "step": 2019 + }, + { + "epoch": 0.40480961923847697, + "grad_norm": 55.089726447446786, + "learning_rate": 9.963007312083648e-06, + "loss": 4.2669, + "step": 2020 + }, + { + "epoch": 0.4050100200400802, + "grad_norm": 30.885664572489063, + "learning_rate": 9.962865617755872e-06, + "loss": 4.3206, + "step": 2021 + }, + { + "epoch": 0.40521042084168335, + "grad_norm": 43.64972732958971, + "learning_rate": 9.962723653589765e-06, + "loss": 4.8924, + "step": 2022 + }, + { + "epoch": 0.40541082164328657, + "grad_norm": 26.51255819177259, + "learning_rate": 9.962581419593047e-06, + "loss": 4.8745, + "step": 2023 + }, + { + "epoch": 0.4056112224448898, + "grad_norm": 35.892610862698405, + "learning_rate": 9.96243891577345e-06, + "loss": 4.1404, + "step": 2024 + }, + { + "epoch": 0.405811623246493, + "grad_norm": 21.06496497143519, + "learning_rate": 9.962296142138727e-06, + "loss": 3.908, + "step": 2025 + }, + { + "epoch": 0.40601202404809617, + "grad_norm": 47.940636689340685, + "learning_rate": 9.962153098696637e-06, + "loss": 4.3862, + "step": 2026 + }, + { + "epoch": 0.4062124248496994, + "grad_norm": 26.229344123250634, + "learning_rate": 9.962009785454958e-06, + "loss": 3.7637, + "step": 2027 + }, + { + "epoch": 0.4064128256513026, + "grad_norm": 21.738371060413098, + "learning_rate": 9.961866202421481e-06, + "loss": 4.0986, + "step": 2028 + }, + { + "epoch": 0.4066132264529058, + "grad_norm": 49.12494994485717, + "learning_rate": 9.961722349604018e-06, + "loss": 4.3772, + "step": 2029 + }, + { + "epoch": 0.40681362725450904, + "grad_norm": 21.381843525009632, + "learning_rate": 9.961578227010382e-06, + "loss": 4.1947, + "step": 2030 + }, + { + "epoch": 0.4070140280561122, + "grad_norm": 42.39928844423353, + "learning_rate": 9.961433834648417e-06, + "loss": 5.3626, + "step": 2031 + }, + { + "epoch": 0.4072144288577154, + "grad_norm": 19.38657432739034, + "learning_rate": 9.96128917252597e-06, + "loss": 3.94, + "step": 2032 + }, + { + "epoch": 0.40741482965931863, + "grad_norm": 39.578383245355546, + "learning_rate": 9.961144240650906e-06, + "loss": 4.6829, + "step": 2033 + }, + { + "epoch": 0.40761523046092185, + "grad_norm": 37.71288308267556, + "learning_rate": 9.960999039031108e-06, + "loss": 4.455, + "step": 2034 + }, + { + "epoch": 0.40781563126252507, + "grad_norm": 24.50097855127652, + "learning_rate": 9.960853567674467e-06, + "loss": 4.5053, + "step": 2035 + }, + { + "epoch": 0.40801603206412823, + "grad_norm": 30.114730313996393, + "learning_rate": 9.960707826588896e-06, + "loss": 4.0015, + "step": 2036 + }, + { + "epoch": 0.40821643286573145, + "grad_norm": 35.17312532150399, + "learning_rate": 9.960561815782316e-06, + "loss": 3.769, + "step": 2037 + }, + { + "epoch": 0.40841683366733467, + "grad_norm": 30.995698947390334, + "learning_rate": 9.96041553526267e-06, + "loss": 4.0838, + "step": 2038 + }, + { + "epoch": 0.4086172344689379, + "grad_norm": 37.134955508647735, + "learning_rate": 9.960268985037909e-06, + "loss": 4.0904, + "step": 2039 + }, + { + "epoch": 0.4088176352705411, + "grad_norm": 27.964028183823903, + "learning_rate": 9.960122165116002e-06, + "loss": 4.716, + "step": 2040 + }, + { + "epoch": 0.40901803607214426, + "grad_norm": 26.24112671818957, + "learning_rate": 9.959975075504928e-06, + "loss": 
3.3293, + "step": 2041 + }, + { + "epoch": 0.4092184368737475, + "grad_norm": 34.192753000408295, + "learning_rate": 9.95982771621269e-06, + "loss": 4.5032, + "step": 2042 + }, + { + "epoch": 0.4094188376753507, + "grad_norm": 26.77608984095535, + "learning_rate": 9.959680087247296e-06, + "loss": 4.5054, + "step": 2043 + }, + { + "epoch": 0.4096192384769539, + "grad_norm": 32.12010891595362, + "learning_rate": 9.959532188616776e-06, + "loss": 4.4255, + "step": 2044 + }, + { + "epoch": 0.40981963927855714, + "grad_norm": 30.134057256437604, + "learning_rate": 9.95938402032917e-06, + "loss": 4.3089, + "step": 2045 + }, + { + "epoch": 0.4100200400801603, + "grad_norm": 21.575797691484333, + "learning_rate": 9.959235582392533e-06, + "loss": 3.8247, + "step": 2046 + }, + { + "epoch": 0.4102204408817635, + "grad_norm": 33.41106722362283, + "learning_rate": 9.959086874814938e-06, + "loss": 4.5184, + "step": 2047 + }, + { + "epoch": 0.41042084168336673, + "grad_norm": 28.6300130150278, + "learning_rate": 9.958937897604468e-06, + "loss": 4.1836, + "step": 2048 + }, + { + "epoch": 0.41062124248496995, + "grad_norm": 23.738304048933813, + "learning_rate": 9.958788650769224e-06, + "loss": 4.0041, + "step": 2049 + }, + { + "epoch": 0.41082164328657317, + "grad_norm": 30.2182955401956, + "learning_rate": 9.958639134317323e-06, + "loss": 4.7064, + "step": 2050 + }, + { + "epoch": 0.41102204408817633, + "grad_norm": 42.629827124687424, + "learning_rate": 9.958489348256893e-06, + "loss": 4.0022, + "step": 2051 + }, + { + "epoch": 0.41122244488977955, + "grad_norm": 34.55368051829629, + "learning_rate": 9.958339292596076e-06, + "loss": 3.7951, + "step": 2052 + }, + { + "epoch": 0.41142284569138277, + "grad_norm": 45.81134854275559, + "learning_rate": 9.958188967343033e-06, + "loss": 5.0153, + "step": 2053 + }, + { + "epoch": 0.411623246492986, + "grad_norm": 20.229942795179838, + "learning_rate": 9.958038372505938e-06, + "loss": 4.1465, + "step": 2054 + }, + { + "epoch": 0.4118236472945892, + "grad_norm": 61.13655557898978, + "learning_rate": 9.957887508092975e-06, + "loss": 4.165, + "step": 2055 + }, + { + "epoch": 0.41202404809619236, + "grad_norm": 26.529161605662264, + "learning_rate": 9.957736374112352e-06, + "loss": 3.7759, + "step": 2056 + }, + { + "epoch": 0.4122244488977956, + "grad_norm": 40.51329913420804, + "learning_rate": 9.957584970572285e-06, + "loss": 5.0132, + "step": 2057 + }, + { + "epoch": 0.4124248496993988, + "grad_norm": 24.429151915552907, + "learning_rate": 9.957433297481003e-06, + "loss": 3.6307, + "step": 2058 + }, + { + "epoch": 0.412625250501002, + "grad_norm": 21.864073703796286, + "learning_rate": 9.957281354846755e-06, + "loss": 3.95, + "step": 2059 + }, + { + "epoch": 0.41282565130260523, + "grad_norm": 30.195529864477574, + "learning_rate": 9.957129142677803e-06, + "loss": 4.5443, + "step": 2060 + }, + { + "epoch": 0.4130260521042084, + "grad_norm": 23.236880786971, + "learning_rate": 9.956976660982422e-06, + "loss": 3.913, + "step": 2061 + }, + { + "epoch": 0.4132264529058116, + "grad_norm": 49.0168582521663, + "learning_rate": 9.956823909768902e-06, + "loss": 4.7624, + "step": 2062 + }, + { + "epoch": 0.41342685370741483, + "grad_norm": 28.852205851171547, + "learning_rate": 9.956670889045551e-06, + "loss": 4.2477, + "step": 2063 + }, + { + "epoch": 0.41362725450901805, + "grad_norm": 44.77680009110405, + "learning_rate": 9.956517598820686e-06, + "loss": 4.2091, + "step": 2064 + }, + { + "epoch": 0.41382765531062127, + "grad_norm": 18.84487519125969, + "learning_rate": 
9.956364039102642e-06, + "loss": 3.7487, + "step": 2065 + }, + { + "epoch": 0.41402805611222443, + "grad_norm": 30.728325012300957, + "learning_rate": 9.956210209899769e-06, + "loss": 4.5018, + "step": 2066 + }, + { + "epoch": 0.41422845691382765, + "grad_norm": 23.578105210687497, + "learning_rate": 9.956056111220431e-06, + "loss": 4.478, + "step": 2067 + }, + { + "epoch": 0.41442885771543087, + "grad_norm": 47.82892042567813, + "learning_rate": 9.955901743073006e-06, + "loss": 4.303, + "step": 2068 + }, + { + "epoch": 0.4146292585170341, + "grad_norm": 27.379812247916444, + "learning_rate": 9.955747105465889e-06, + "loss": 4.7692, + "step": 2069 + }, + { + "epoch": 0.4148296593186373, + "grad_norm": 24.32088309942875, + "learning_rate": 9.955592198407486e-06, + "loss": 4.148, + "step": 2070 + }, + { + "epoch": 0.41503006012024046, + "grad_norm": 32.8944178859107, + "learning_rate": 9.955437021906221e-06, + "loss": 3.9335, + "step": 2071 + }, + { + "epoch": 0.4152304609218437, + "grad_norm": 28.055284731345125, + "learning_rate": 9.955281575970528e-06, + "loss": 4.3064, + "step": 2072 + }, + { + "epoch": 0.4154308617234469, + "grad_norm": 31.26284146726785, + "learning_rate": 9.955125860608862e-06, + "loss": 4.1268, + "step": 2073 + }, + { + "epoch": 0.4156312625250501, + "grad_norm": 32.82422750316124, + "learning_rate": 9.954969875829688e-06, + "loss": 4.7241, + "step": 2074 + }, + { + "epoch": 0.41583166332665333, + "grad_norm": 35.32761785331307, + "learning_rate": 9.954813621641489e-06, + "loss": 4.391, + "step": 2075 + }, + { + "epoch": 0.4160320641282565, + "grad_norm": 23.103378279642712, + "learning_rate": 9.954657098052758e-06, + "loss": 4.1346, + "step": 2076 + }, + { + "epoch": 0.4162324649298597, + "grad_norm": 33.18618282966821, + "learning_rate": 9.954500305072009e-06, + "loss": 4.0761, + "step": 2077 + }, + { + "epoch": 0.41643286573146293, + "grad_norm": 28.22925357465705, + "learning_rate": 9.954343242707762e-06, + "loss": 3.8548, + "step": 2078 + }, + { + "epoch": 0.41663326653306615, + "grad_norm": 20.521638152609718, + "learning_rate": 9.95418591096856e-06, + "loss": 3.7394, + "step": 2079 + }, + { + "epoch": 0.4168336673346693, + "grad_norm": 33.870222263747166, + "learning_rate": 9.95402830986296e-06, + "loss": 4.7195, + "step": 2080 + }, + { + "epoch": 0.41703406813627253, + "grad_norm": 17.97912784574251, + "learning_rate": 9.953870439399525e-06, + "loss": 3.7208, + "step": 2081 + }, + { + "epoch": 0.41723446893787575, + "grad_norm": 24.817296236363397, + "learning_rate": 9.953712299586845e-06, + "loss": 4.0721, + "step": 2082 + }, + { + "epoch": 0.41743486973947896, + "grad_norm": 23.168576947661006, + "learning_rate": 9.95355389043351e-06, + "loss": 4.3327, + "step": 2083 + }, + { + "epoch": 0.4176352705410822, + "grad_norm": 43.04540509121277, + "learning_rate": 9.953395211948142e-06, + "loss": 5.3664, + "step": 2084 + }, + { + "epoch": 0.41783567134268534, + "grad_norm": 26.595007529891912, + "learning_rate": 9.953236264139362e-06, + "loss": 3.8787, + "step": 2085 + }, + { + "epoch": 0.41803607214428856, + "grad_norm": 21.487206598718885, + "learning_rate": 9.953077047015816e-06, + "loss": 3.9481, + "step": 2086 + }, + { + "epoch": 0.4182364729458918, + "grad_norm": 23.410852413600946, + "learning_rate": 9.952917560586158e-06, + "loss": 3.9393, + "step": 2087 + }, + { + "epoch": 0.418436873747495, + "grad_norm": 47.96401947959827, + "learning_rate": 9.952757804859063e-06, + "loss": 4.335, + "step": 2088 + }, + { + "epoch": 0.4186372745490982, + "grad_norm": 
26.18660124054045, + "learning_rate": 9.952597779843214e-06, + "loss": 3.8919, + "step": 2089 + }, + { + "epoch": 0.4188376753507014, + "grad_norm": 32.226373852411356, + "learning_rate": 9.952437485547313e-06, + "loss": 4.4053, + "step": 2090 + }, + { + "epoch": 0.4190380761523046, + "grad_norm": 24.24308937462833, + "learning_rate": 9.952276921980075e-06, + "loss": 3.8043, + "step": 2091 + }, + { + "epoch": 0.4192384769539078, + "grad_norm": 39.661598391937154, + "learning_rate": 9.952116089150233e-06, + "loss": 4.0355, + "step": 2092 + }, + { + "epoch": 0.41943887775551103, + "grad_norm": 36.01588712115084, + "learning_rate": 9.951954987066526e-06, + "loss": 4.3938, + "step": 2093 + }, + { + "epoch": 0.41963927855711425, + "grad_norm": 25.593503232167077, + "learning_rate": 9.951793615737719e-06, + "loss": 4.2467, + "step": 2094 + }, + { + "epoch": 0.4198396793587174, + "grad_norm": 33.38252497175515, + "learning_rate": 9.951631975172582e-06, + "loss": 4.4364, + "step": 2095 + }, + { + "epoch": 0.42004008016032063, + "grad_norm": 18.462265380376756, + "learning_rate": 9.951470065379907e-06, + "loss": 4.1583, + "step": 2096 + }, + { + "epoch": 0.42024048096192385, + "grad_norm": 24.559758191927237, + "learning_rate": 9.951307886368495e-06, + "loss": 4.0693, + "step": 2097 + }, + { + "epoch": 0.42044088176352706, + "grad_norm": 25.977201123175462, + "learning_rate": 9.951145438147162e-06, + "loss": 3.9947, + "step": 2098 + }, + { + "epoch": 0.4206412825651303, + "grad_norm": 22.88527552893032, + "learning_rate": 9.950982720724745e-06, + "loss": 4.2599, + "step": 2099 + }, + { + "epoch": 0.42084168336673344, + "grad_norm": 28.222165679838618, + "learning_rate": 9.95081973411009e-06, + "loss": 3.9139, + "step": 2100 + }, + { + "epoch": 0.42104208416833666, + "grad_norm": 31.61383058585071, + "learning_rate": 9.950656478312057e-06, + "loss": 4.6278, + "step": 2101 + }, + { + "epoch": 0.4212424849699399, + "grad_norm": 34.92919235995486, + "learning_rate": 9.950492953339523e-06, + "loss": 3.925, + "step": 2102 + }, + { + "epoch": 0.4214428857715431, + "grad_norm": 23.891273946073007, + "learning_rate": 9.95032915920138e-06, + "loss": 3.7446, + "step": 2103 + }, + { + "epoch": 0.4216432865731463, + "grad_norm": 31.143562733681037, + "learning_rate": 9.950165095906533e-06, + "loss": 4.8114, + "step": 2104 + }, + { + "epoch": 0.4218436873747495, + "grad_norm": 25.58418020160835, + "learning_rate": 9.950000763463902e-06, + "loss": 4.5353, + "step": 2105 + }, + { + "epoch": 0.4220440881763527, + "grad_norm": 23.873279350879244, + "learning_rate": 9.949836161882424e-06, + "loss": 4.4912, + "step": 2106 + }, + { + "epoch": 0.4222444889779559, + "grad_norm": 19.329291227708243, + "learning_rate": 9.949671291171046e-06, + "loss": 3.8881, + "step": 2107 + }, + { + "epoch": 0.42244488977955913, + "grad_norm": 46.00408730303477, + "learning_rate": 9.949506151338735e-06, + "loss": 3.9604, + "step": 2108 + }, + { + "epoch": 0.42264529058116235, + "grad_norm": 41.86890174737568, + "learning_rate": 9.949340742394467e-06, + "loss": 3.9703, + "step": 2109 + }, + { + "epoch": 0.4228456913827655, + "grad_norm": 31.92327988054621, + "learning_rate": 9.94917506434724e-06, + "loss": 4.2062, + "step": 2110 + }, + { + "epoch": 0.4230460921843687, + "grad_norm": 28.49336077206498, + "learning_rate": 9.949009117206055e-06, + "loss": 3.9293, + "step": 2111 + }, + { + "epoch": 0.42324649298597194, + "grad_norm": 25.241714094284113, + "learning_rate": 9.94884290097994e-06, + "loss": 3.7914, + "step": 2112 + }, + { + 
"epoch": 0.42344689378757516, + "grad_norm": 30.10158393571134, + "learning_rate": 9.948676415677932e-06, + "loss": 4.1586, + "step": 2113 + }, + { + "epoch": 0.4236472945891784, + "grad_norm": 22.171659874098296, + "learning_rate": 9.948509661309082e-06, + "loss": 3.784, + "step": 2114 + }, + { + "epoch": 0.42384769539078154, + "grad_norm": 27.690314586047773, + "learning_rate": 9.948342637882458e-06, + "loss": 4.8072, + "step": 2115 + }, + { + "epoch": 0.42404809619238476, + "grad_norm": 23.792371741593055, + "learning_rate": 9.94817534540714e-06, + "loss": 3.9998, + "step": 2116 + }, + { + "epoch": 0.424248496993988, + "grad_norm": 15.675686087522443, + "learning_rate": 9.948007783892224e-06, + "loss": 3.4943, + "step": 2117 + }, + { + "epoch": 0.4244488977955912, + "grad_norm": 23.216389423203697, + "learning_rate": 9.94783995334682e-06, + "loss": 4.1589, + "step": 2118 + }, + { + "epoch": 0.4246492985971944, + "grad_norm": 19.210616221427163, + "learning_rate": 9.947671853780055e-06, + "loss": 3.928, + "step": 2119 + }, + { + "epoch": 0.4248496993987976, + "grad_norm": 31.057762376134487, + "learning_rate": 9.947503485201068e-06, + "loss": 4.8637, + "step": 2120 + }, + { + "epoch": 0.4250501002004008, + "grad_norm": 74.37935555152383, + "learning_rate": 9.947334847619013e-06, + "loss": 4.0469, + "step": 2121 + }, + { + "epoch": 0.425250501002004, + "grad_norm": 26.67692844096649, + "learning_rate": 9.947165941043058e-06, + "loss": 4.1571, + "step": 2122 + }, + { + "epoch": 0.42545090180360723, + "grad_norm": 29.20617096820399, + "learning_rate": 9.946996765482391e-06, + "loss": 4.616, + "step": 2123 + }, + { + "epoch": 0.42565130260521045, + "grad_norm": 22.529642571701277, + "learning_rate": 9.946827320946205e-06, + "loss": 4.4486, + "step": 2124 + }, + { + "epoch": 0.4258517034068136, + "grad_norm": 30.67713801183616, + "learning_rate": 9.946657607443717e-06, + "loss": 4.3168, + "step": 2125 + }, + { + "epoch": 0.4260521042084168, + "grad_norm": 20.644637397892133, + "learning_rate": 9.946487624984151e-06, + "loss": 3.8365, + "step": 2126 + }, + { + "epoch": 0.42625250501002004, + "grad_norm": 25.779115009662696, + "learning_rate": 9.946317373576753e-06, + "loss": 4.2669, + "step": 2127 + }, + { + "epoch": 0.42645290581162326, + "grad_norm": 33.82602364201044, + "learning_rate": 9.946146853230777e-06, + "loss": 4.5866, + "step": 2128 + }, + { + "epoch": 0.4266533066132265, + "grad_norm": 23.347771201436185, + "learning_rate": 9.945976063955496e-06, + "loss": 3.7303, + "step": 2129 + }, + { + "epoch": 0.42685370741482964, + "grad_norm": 29.890181856893477, + "learning_rate": 9.945805005760196e-06, + "loss": 4.5878, + "step": 2130 + }, + { + "epoch": 0.42705410821643286, + "grad_norm": 30.415631152372292, + "learning_rate": 9.945633678654177e-06, + "loss": 4.039, + "step": 2131 + }, + { + "epoch": 0.4272545090180361, + "grad_norm": 25.198741405213948, + "learning_rate": 9.945462082646752e-06, + "loss": 4.3535, + "step": 2132 + }, + { + "epoch": 0.4274549098196393, + "grad_norm": 28.742428289035537, + "learning_rate": 9.945290217747257e-06, + "loss": 4.2959, + "step": 2133 + }, + { + "epoch": 0.4276553106212425, + "grad_norm": 22.760503981799005, + "learning_rate": 9.945118083965031e-06, + "loss": 3.7884, + "step": 2134 + }, + { + "epoch": 0.4278557114228457, + "grad_norm": 19.974468827527502, + "learning_rate": 9.944945681309435e-06, + "loss": 4.2749, + "step": 2135 + }, + { + "epoch": 0.4280561122244489, + "grad_norm": 29.799971363158242, + "learning_rate": 9.944773009789844e-06, + 
"loss": 4.8181, + "step": 2136 + }, + { + "epoch": 0.4282565130260521, + "grad_norm": 19.770729766255194, + "learning_rate": 9.944600069415644e-06, + "loss": 3.6823, + "step": 2137 + }, + { + "epoch": 0.42845691382765533, + "grad_norm": 24.437097194231423, + "learning_rate": 9.94442686019624e-06, + "loss": 4.1366, + "step": 2138 + }, + { + "epoch": 0.4286573146292585, + "grad_norm": 78.00110646180205, + "learning_rate": 9.944253382141047e-06, + "loss": 4.2061, + "step": 2139 + }, + { + "epoch": 0.4288577154308617, + "grad_norm": 28.67618221013658, + "learning_rate": 9.944079635259502e-06, + "loss": 4.2167, + "step": 2140 + }, + { + "epoch": 0.4290581162324649, + "grad_norm": 33.45524103229543, + "learning_rate": 9.943905619561048e-06, + "loss": 4.0721, + "step": 2141 + }, + { + "epoch": 0.42925851703406814, + "grad_norm": 32.81887430373188, + "learning_rate": 9.943731335055149e-06, + "loss": 4.265, + "step": 2142 + }, + { + "epoch": 0.42945891783567136, + "grad_norm": 30.877447099423655, + "learning_rate": 9.943556781751278e-06, + "loss": 3.8823, + "step": 2143 + }, + { + "epoch": 0.4296593186372745, + "grad_norm": 31.278811407281488, + "learning_rate": 9.943381959658927e-06, + "loss": 4.4631, + "step": 2144 + }, + { + "epoch": 0.42985971943887774, + "grad_norm": 28.370939865464443, + "learning_rate": 9.943206868787602e-06, + "loss": 4.4895, + "step": 2145 + }, + { + "epoch": 0.43006012024048096, + "grad_norm": 23.777451991075505, + "learning_rate": 9.943031509146825e-06, + "loss": 4.45, + "step": 2146 + }, + { + "epoch": 0.4302605210420842, + "grad_norm": 34.691277051092264, + "learning_rate": 9.942855880746127e-06, + "loss": 4.2989, + "step": 2147 + }, + { + "epoch": 0.4304609218436874, + "grad_norm": 23.77434069115061, + "learning_rate": 9.942679983595056e-06, + "loss": 3.9161, + "step": 2148 + }, + { + "epoch": 0.43066132264529056, + "grad_norm": 30.982348377463996, + "learning_rate": 9.942503817703181e-06, + "loss": 4.6989, + "step": 2149 + }, + { + "epoch": 0.4308617234468938, + "grad_norm": 27.280544794089664, + "learning_rate": 9.942327383080076e-06, + "loss": 4.351, + "step": 2150 + }, + { + "epoch": 0.431062124248497, + "grad_norm": 26.11207218023462, + "learning_rate": 9.942150679735337e-06, + "loss": 4.154, + "step": 2151 + }, + { + "epoch": 0.4312625250501002, + "grad_norm": 24.324122159226786, + "learning_rate": 9.94197370767857e-06, + "loss": 4.0983, + "step": 2152 + }, + { + "epoch": 0.4314629258517034, + "grad_norm": 45.930701164529395, + "learning_rate": 9.941796466919398e-06, + "loss": 4.3626, + "step": 2153 + }, + { + "epoch": 0.4316633266533066, + "grad_norm": 20.978450397477783, + "learning_rate": 9.941618957467455e-06, + "loss": 3.7398, + "step": 2154 + }, + { + "epoch": 0.4318637274549098, + "grad_norm": 29.438023403681107, + "learning_rate": 9.941441179332398e-06, + "loss": 3.8972, + "step": 2155 + }, + { + "epoch": 0.432064128256513, + "grad_norm": 47.97560366788711, + "learning_rate": 9.941263132523888e-06, + "loss": 4.0125, + "step": 2156 + }, + { + "epoch": 0.43226452905811624, + "grad_norm": 23.348533629006646, + "learning_rate": 9.941084817051608e-06, + "loss": 4.3898, + "step": 2157 + }, + { + "epoch": 0.43246492985971946, + "grad_norm": 46.95568494147444, + "learning_rate": 9.940906232925251e-06, + "loss": 4.5422, + "step": 2158 + }, + { + "epoch": 0.4326653306613226, + "grad_norm": 27.31662602249833, + "learning_rate": 9.940727380154531e-06, + "loss": 4.2271, + "step": 2159 + }, + { + "epoch": 0.43286573146292584, + "grad_norm": 48.294776074607846, + 
"learning_rate": 9.94054825874917e-06, + "loss": 4.3941, + "step": 2160 + }, + { + "epoch": 0.43306613226452906, + "grad_norm": 24.352913234879203, + "learning_rate": 9.940368868718906e-06, + "loss": 3.5132, + "step": 2161 + }, + { + "epoch": 0.4332665330661323, + "grad_norm": 50.295262611448855, + "learning_rate": 9.940189210073493e-06, + "loss": 4.5028, + "step": 2162 + }, + { + "epoch": 0.4334669338677355, + "grad_norm": 27.573109842660603, + "learning_rate": 9.940009282822703e-06, + "loss": 4.2703, + "step": 2163 + }, + { + "epoch": 0.43366733466933866, + "grad_norm": 26.745008014674177, + "learning_rate": 9.939829086976316e-06, + "loss": 3.9758, + "step": 2164 + }, + { + "epoch": 0.4338677354709419, + "grad_norm": 24.946890486189172, + "learning_rate": 9.939648622544127e-06, + "loss": 3.535, + "step": 2165 + }, + { + "epoch": 0.4340681362725451, + "grad_norm": 24.988145230341004, + "learning_rate": 9.939467889535954e-06, + "loss": 4.3367, + "step": 2166 + }, + { + "epoch": 0.4342685370741483, + "grad_norm": 26.05727608198158, + "learning_rate": 9.939286887961618e-06, + "loss": 4.2392, + "step": 2167 + }, + { + "epoch": 0.4344689378757515, + "grad_norm": 29.911230914415018, + "learning_rate": 9.939105617830965e-06, + "loss": 3.9411, + "step": 2168 + }, + { + "epoch": 0.4346693386773547, + "grad_norm": 29.51060753906574, + "learning_rate": 9.938924079153846e-06, + "loss": 4.8207, + "step": 2169 + }, + { + "epoch": 0.4348697394789579, + "grad_norm": 33.61159689733876, + "learning_rate": 9.938742271940136e-06, + "loss": 4.2877, + "step": 2170 + }, + { + "epoch": 0.4350701402805611, + "grad_norm": 25.055870816046447, + "learning_rate": 9.93856019619972e-06, + "loss": 3.4256, + "step": 2171 + }, + { + "epoch": 0.43527054108216434, + "grad_norm": 51.015448067916864, + "learning_rate": 9.938377851942494e-06, + "loss": 4.7379, + "step": 2172 + }, + { + "epoch": 0.43547094188376756, + "grad_norm": 23.81967340976246, + "learning_rate": 9.938195239178374e-06, + "loss": 4.5413, + "step": 2173 + }, + { + "epoch": 0.4356713426853707, + "grad_norm": 25.06265022912124, + "learning_rate": 9.93801235791729e-06, + "loss": 4.4398, + "step": 2174 + }, + { + "epoch": 0.43587174348697394, + "grad_norm": 26.615279467294553, + "learning_rate": 9.937829208169186e-06, + "loss": 3.938, + "step": 2175 + }, + { + "epoch": 0.43607214428857716, + "grad_norm": 42.37002976901113, + "learning_rate": 9.937645789944019e-06, + "loss": 3.6264, + "step": 2176 + }, + { + "epoch": 0.4362725450901804, + "grad_norm": 58.69630242329452, + "learning_rate": 9.93746210325176e-06, + "loss": 4.7565, + "step": 2177 + }, + { + "epoch": 0.4364729458917836, + "grad_norm": 25.886556665910113, + "learning_rate": 9.9372781481024e-06, + "loss": 4.8566, + "step": 2178 + }, + { + "epoch": 0.43667334669338675, + "grad_norm": 27.7327395547353, + "learning_rate": 9.937093924505938e-06, + "loss": 3.7017, + "step": 2179 + }, + { + "epoch": 0.43687374749499, + "grad_norm": 27.51091495454794, + "learning_rate": 9.936909432472391e-06, + "loss": 4.2763, + "step": 2180 + }, + { + "epoch": 0.4370741482965932, + "grad_norm": 86.45462724717136, + "learning_rate": 9.936724672011791e-06, + "loss": 4.1072, + "step": 2181 + }, + { + "epoch": 0.4372745490981964, + "grad_norm": 21.923640815476606, + "learning_rate": 9.936539643134184e-06, + "loss": 3.9361, + "step": 2182 + }, + { + "epoch": 0.4374749498997996, + "grad_norm": 35.117222598058994, + "learning_rate": 9.93635434584963e-06, + "loss": 4.3528, + "step": 2183 + }, + { + "epoch": 0.4376753507014028, + 
"grad_norm": 23.99698826807432, + "learning_rate": 9.936168780168203e-06, + "loss": 4.728, + "step": 2184 + }, + { + "epoch": 0.437875751503006, + "grad_norm": 22.564874330878016, + "learning_rate": 9.935982946099991e-06, + "loss": 4.0855, + "step": 2185 + }, + { + "epoch": 0.4380761523046092, + "grad_norm": 31.760519469937872, + "learning_rate": 9.935796843655103e-06, + "loss": 4.207, + "step": 2186 + }, + { + "epoch": 0.43827655310621244, + "grad_norm": 59.11643560899154, + "learning_rate": 9.935610472843653e-06, + "loss": 4.8453, + "step": 2187 + }, + { + "epoch": 0.43847695390781566, + "grad_norm": 26.46548009176407, + "learning_rate": 9.935423833675777e-06, + "loss": 4.1507, + "step": 2188 + }, + { + "epoch": 0.4386773547094188, + "grad_norm": 26.573123363388557, + "learning_rate": 9.93523692616162e-06, + "loss": 3.6281, + "step": 2189 + }, + { + "epoch": 0.43887775551102204, + "grad_norm": 39.8992398737135, + "learning_rate": 9.935049750311347e-06, + "loss": 4.5228, + "step": 2190 + }, + { + "epoch": 0.43907815631262526, + "grad_norm": 43.34393143855484, + "learning_rate": 9.934862306135135e-06, + "loss": 4.3886, + "step": 2191 + }, + { + "epoch": 0.4392785571142285, + "grad_norm": 39.490335743063795, + "learning_rate": 9.934674593643175e-06, + "loss": 4.3824, + "step": 2192 + }, + { + "epoch": 0.43947895791583164, + "grad_norm": 42.54546477380478, + "learning_rate": 9.934486612845672e-06, + "loss": 4.0249, + "step": 2193 + }, + { + "epoch": 0.43967935871743485, + "grad_norm": 27.868160778624947, + "learning_rate": 9.93429836375285e-06, + "loss": 4.3446, + "step": 2194 + }, + { + "epoch": 0.43987975951903807, + "grad_norm": 28.431110969219677, + "learning_rate": 9.93410984637494e-06, + "loss": 4.184, + "step": 2195 + }, + { + "epoch": 0.4400801603206413, + "grad_norm": 32.559646762423604, + "learning_rate": 9.933921060722195e-06, + "loss": 4.5803, + "step": 2196 + }, + { + "epoch": 0.4402805611222445, + "grad_norm": 24.20277769621157, + "learning_rate": 9.93373200680488e-06, + "loss": 3.7848, + "step": 2197 + }, + { + "epoch": 0.44048096192384767, + "grad_norm": 24.895744906988472, + "learning_rate": 9.93354268463327e-06, + "loss": 4.9506, + "step": 2198 + }, + { + "epoch": 0.4406813627254509, + "grad_norm": 29.32102705182627, + "learning_rate": 9.933353094217665e-06, + "loss": 4.5169, + "step": 2199 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 30.26733233826549, + "learning_rate": 9.933163235568369e-06, + "loss": 4.351, + "step": 2200 + }, + { + "epoch": 0.4410821643286573, + "grad_norm": 77.27538620380348, + "learning_rate": 9.932973108695705e-06, + "loss": 4.1313, + "step": 2201 + }, + { + "epoch": 0.44128256513026054, + "grad_norm": 37.46571832086234, + "learning_rate": 9.932782713610011e-06, + "loss": 3.7338, + "step": 2202 + }, + { + "epoch": 0.4414829659318637, + "grad_norm": 28.301816083382167, + "learning_rate": 9.93259205032164e-06, + "loss": 3.9128, + "step": 2203 + }, + { + "epoch": 0.4416833667334669, + "grad_norm": 19.254778830354596, + "learning_rate": 9.932401118840958e-06, + "loss": 3.9818, + "step": 2204 + }, + { + "epoch": 0.44188376753507014, + "grad_norm": 26.037914239767385, + "learning_rate": 9.932209919178346e-06, + "loss": 4.0223, + "step": 2205 + }, + { + "epoch": 0.44208416833667336, + "grad_norm": 26.537759797084288, + "learning_rate": 9.932018451344201e-06, + "loss": 3.9351, + "step": 2206 + }, + { + "epoch": 0.4422845691382766, + "grad_norm": 24.39739190269358, + "learning_rate": 9.931826715348931e-06, + "loss": 4.2002, + "step": 2207 + }, + { 
+ "epoch": 0.44248496993987974, + "grad_norm": 29.34630339843139, + "learning_rate": 9.931634711202964e-06, + "loss": 3.9768, + "step": 2208 + }, + { + "epoch": 0.44268537074148295, + "grad_norm": 25.34777851286338, + "learning_rate": 9.931442438916738e-06, + "loss": 4.0873, + "step": 2209 + }, + { + "epoch": 0.44288577154308617, + "grad_norm": 28.136559287463584, + "learning_rate": 9.931249898500705e-06, + "loss": 4.0781, + "step": 2210 + }, + { + "epoch": 0.4430861723446894, + "grad_norm": 26.324711769068863, + "learning_rate": 9.931057089965339e-06, + "loss": 3.8168, + "step": 2211 + }, + { + "epoch": 0.4432865731462926, + "grad_norm": 24.08300022752525, + "learning_rate": 9.930864013321118e-06, + "loss": 3.7524, + "step": 2212 + }, + { + "epoch": 0.44348697394789577, + "grad_norm": 22.345938842862306, + "learning_rate": 9.93067066857854e-06, + "loss": 3.9098, + "step": 2213 + }, + { + "epoch": 0.443687374749499, + "grad_norm": 32.56613527592071, + "learning_rate": 9.930477055748125e-06, + "loss": 4.0026, + "step": 2214 + }, + { + "epoch": 0.4438877755511022, + "grad_norm": 27.321245446835903, + "learning_rate": 9.930283174840391e-06, + "loss": 4.5199, + "step": 2215 + }, + { + "epoch": 0.4440881763527054, + "grad_norm": 22.12022870420982, + "learning_rate": 9.930089025865884e-06, + "loss": 3.9213, + "step": 2216 + }, + { + "epoch": 0.44428857715430864, + "grad_norm": 63.46652778122548, + "learning_rate": 9.929894608835158e-06, + "loss": 4.541, + "step": 2217 + }, + { + "epoch": 0.4444889779559118, + "grad_norm": 17.865507140023748, + "learning_rate": 9.929699923758784e-06, + "loss": 3.5482, + "step": 2218 + }, + { + "epoch": 0.444689378757515, + "grad_norm": 23.220539681149923, + "learning_rate": 9.92950497064735e-06, + "loss": 3.9572, + "step": 2219 + }, + { + "epoch": 0.44488977955911824, + "grad_norm": 21.698082499220575, + "learning_rate": 9.929309749511453e-06, + "loss": 3.9724, + "step": 2220 + }, + { + "epoch": 0.44509018036072145, + "grad_norm": 26.19029821126503, + "learning_rate": 9.92911426036171e-06, + "loss": 4.3758, + "step": 2221 + }, + { + "epoch": 0.4452905811623247, + "grad_norm": 33.283978953601775, + "learning_rate": 9.928918503208749e-06, + "loss": 4.4184, + "step": 2222 + }, + { + "epoch": 0.44549098196392783, + "grad_norm": 32.05313262591834, + "learning_rate": 9.92872247806321e-06, + "loss": 4.6345, + "step": 2223 + }, + { + "epoch": 0.44569138276553105, + "grad_norm": 32.62245049505282, + "learning_rate": 9.928526184935759e-06, + "loss": 4.2563, + "step": 2224 + }, + { + "epoch": 0.44589178356713427, + "grad_norm": 28.39935565857802, + "learning_rate": 9.92832962383706e-06, + "loss": 4.4172, + "step": 2225 + }, + { + "epoch": 0.4460921843687375, + "grad_norm": 24.384623154617667, + "learning_rate": 9.928132794777808e-06, + "loss": 3.5393, + "step": 2226 + }, + { + "epoch": 0.4462925851703407, + "grad_norm": 21.45444324120293, + "learning_rate": 9.927935697768697e-06, + "loss": 3.8404, + "step": 2227 + }, + { + "epoch": 0.44649298597194387, + "grad_norm": 38.96490833893464, + "learning_rate": 9.927738332820452e-06, + "loss": 4.9566, + "step": 2228 + }, + { + "epoch": 0.4466933867735471, + "grad_norm": 28.387451809592445, + "learning_rate": 9.927540699943798e-06, + "loss": 4.4341, + "step": 2229 + }, + { + "epoch": 0.4468937875751503, + "grad_norm": 21.421261957132323, + "learning_rate": 9.927342799149482e-06, + "loss": 4.0448, + "step": 2230 + }, + { + "epoch": 0.4470941883767535, + "grad_norm": 31.52697894770482, + "learning_rate": 9.927144630448266e-06, + 
"loss": 4.1407, + "step": 2231 + }, + { + "epoch": 0.44729458917835674, + "grad_norm": 28.72947605724456, + "learning_rate": 9.926946193850922e-06, + "loss": 5.1393, + "step": 2232 + }, + { + "epoch": 0.4474949899799599, + "grad_norm": 35.13407631483349, + "learning_rate": 9.926747489368242e-06, + "loss": 4.6581, + "step": 2233 + }, + { + "epoch": 0.4476953907815631, + "grad_norm": 27.642973997926624, + "learning_rate": 9.926548517011027e-06, + "loss": 3.6415, + "step": 2234 + }, + { + "epoch": 0.44789579158316634, + "grad_norm": 19.424512328477704, + "learning_rate": 9.926349276790097e-06, + "loss": 4.1168, + "step": 2235 + }, + { + "epoch": 0.44809619238476955, + "grad_norm": 33.73239417647211, + "learning_rate": 9.926149768716286e-06, + "loss": 4.5254, + "step": 2236 + }, + { + "epoch": 0.44829659318637277, + "grad_norm": 41.34997365621815, + "learning_rate": 9.92594999280044e-06, + "loss": 4.2313, + "step": 2237 + }, + { + "epoch": 0.44849699398797593, + "grad_norm": 19.880682297848896, + "learning_rate": 9.92574994905342e-06, + "loss": 4.016, + "step": 2238 + }, + { + "epoch": 0.44869739478957915, + "grad_norm": 25.922940913026967, + "learning_rate": 9.925549637486108e-06, + "loss": 4.5156, + "step": 2239 + }, + { + "epoch": 0.44889779559118237, + "grad_norm": 33.916512088896944, + "learning_rate": 9.925349058109388e-06, + "loss": 4.3526, + "step": 2240 + }, + { + "epoch": 0.4490981963927856, + "grad_norm": 29.37005836129989, + "learning_rate": 9.925148210934171e-06, + "loss": 4.0184, + "step": 2241 + }, + { + "epoch": 0.4492985971943888, + "grad_norm": 33.72554422923276, + "learning_rate": 9.924947095971373e-06, + "loss": 4.7433, + "step": 2242 + }, + { + "epoch": 0.44949899799599197, + "grad_norm": 22.450907356599377, + "learning_rate": 9.924745713231933e-06, + "loss": 3.9837, + "step": 2243 + }, + { + "epoch": 0.4496993987975952, + "grad_norm": 24.21260427436713, + "learning_rate": 9.9245440627268e-06, + "loss": 3.7175, + "step": 2244 + }, + { + "epoch": 0.4498997995991984, + "grad_norm": 20.146315555416546, + "learning_rate": 9.924342144466937e-06, + "loss": 3.8429, + "step": 2245 + }, + { + "epoch": 0.4501002004008016, + "grad_norm": 38.02775857028413, + "learning_rate": 9.924139958463322e-06, + "loss": 4.6007, + "step": 2246 + }, + { + "epoch": 0.4503006012024048, + "grad_norm": 26.873322308056018, + "learning_rate": 9.923937504726948e-06, + "loss": 4.1184, + "step": 2247 + }, + { + "epoch": 0.450501002004008, + "grad_norm": 25.899998260813863, + "learning_rate": 9.923734783268823e-06, + "loss": 4.3752, + "step": 2248 + }, + { + "epoch": 0.4507014028056112, + "grad_norm": 21.405332314936267, + "learning_rate": 9.92353179409997e-06, + "loss": 4.3515, + "step": 2249 + }, + { + "epoch": 0.45090180360721444, + "grad_norm": 24.791352228952377, + "learning_rate": 9.923328537231426e-06, + "loss": 4.1792, + "step": 2250 + }, + { + "epoch": 0.45110220440881765, + "grad_norm": 25.66297388354089, + "learning_rate": 9.923125012674244e-06, + "loss": 4.3331, + "step": 2251 + }, + { + "epoch": 0.4513026052104208, + "grad_norm": 30.58582470545673, + "learning_rate": 9.922921220439485e-06, + "loss": 4.8468, + "step": 2252 + }, + { + "epoch": 0.45150300601202403, + "grad_norm": 24.504145570833916, + "learning_rate": 9.922717160538235e-06, + "loss": 4.123, + "step": 2253 + }, + { + "epoch": 0.45170340681362725, + "grad_norm": 27.307994810783217, + "learning_rate": 9.922512832981585e-06, + "loss": 4.1459, + "step": 2254 + }, + { + "epoch": 0.45190380761523047, + "grad_norm": 24.25515471855524, + 
"learning_rate": 9.922308237780644e-06, + "loss": 4.4501, + "step": 2255 + }, + { + "epoch": 0.4521042084168337, + "grad_norm": 23.64957983396546, + "learning_rate": 9.922103374946542e-06, + "loss": 4.1165, + "step": 2256 + }, + { + "epoch": 0.45230460921843685, + "grad_norm": 20.04699632369684, + "learning_rate": 9.92189824449041e-06, + "loss": 3.7387, + "step": 2257 + }, + { + "epoch": 0.45250501002004007, + "grad_norm": 32.39128814500481, + "learning_rate": 9.921692846423407e-06, + "loss": 4.4792, + "step": 2258 + }, + { + "epoch": 0.4527054108216433, + "grad_norm": 30.535588341613174, + "learning_rate": 9.9214871807567e-06, + "loss": 4.7597, + "step": 2259 + }, + { + "epoch": 0.4529058116232465, + "grad_norm": 28.010592627680087, + "learning_rate": 9.92128124750147e-06, + "loss": 4.68, + "step": 2260 + }, + { + "epoch": 0.4531062124248497, + "grad_norm": 25.630109061968376, + "learning_rate": 9.92107504666891e-06, + "loss": 3.794, + "step": 2261 + }, + { + "epoch": 0.4533066132264529, + "grad_norm": 36.46451578852354, + "learning_rate": 9.92086857827024e-06, + "loss": 4.1872, + "step": 2262 + }, + { + "epoch": 0.4535070140280561, + "grad_norm": 36.12362504099853, + "learning_rate": 9.92066184231668e-06, + "loss": 3.9807, + "step": 2263 + }, + { + "epoch": 0.4537074148296593, + "grad_norm": 31.128265198778394, + "learning_rate": 9.920454838819472e-06, + "loss": 4.6421, + "step": 2264 + }, + { + "epoch": 0.45390781563126253, + "grad_norm": 19.11954873044024, + "learning_rate": 9.920247567789872e-06, + "loss": 3.7708, + "step": 2265 + }, + { + "epoch": 0.45410821643286575, + "grad_norm": 40.31317113346755, + "learning_rate": 9.920040029239148e-06, + "loss": 5.642, + "step": 2266 + }, + { + "epoch": 0.4543086172344689, + "grad_norm": 29.102897881855057, + "learning_rate": 9.919832223178584e-06, + "loss": 3.9154, + "step": 2267 + }, + { + "epoch": 0.45450901803607213, + "grad_norm": 38.03054397644249, + "learning_rate": 9.919624149619481e-06, + "loss": 4.4988, + "step": 2268 + }, + { + "epoch": 0.45470941883767535, + "grad_norm": 22.84296277918975, + "learning_rate": 9.91941580857315e-06, + "loss": 3.8494, + "step": 2269 + }, + { + "epoch": 0.45490981963927857, + "grad_norm": 30.06514899564313, + "learning_rate": 9.919207200050922e-06, + "loss": 4.2565, + "step": 2270 + }, + { + "epoch": 0.4551102204408818, + "grad_norm": 18.683263589523875, + "learning_rate": 9.918998324064135e-06, + "loss": 3.8319, + "step": 2271 + }, + { + "epoch": 0.45531062124248495, + "grad_norm": 24.93906932706951, + "learning_rate": 9.918789180624149e-06, + "loss": 4.1361, + "step": 2272 + }, + { + "epoch": 0.45551102204408817, + "grad_norm": 28.39041933772512, + "learning_rate": 9.918579769742334e-06, + "loss": 4.0353, + "step": 2273 + }, + { + "epoch": 0.4557114228456914, + "grad_norm": 27.19623302617322, + "learning_rate": 9.918370091430076e-06, + "loss": 4.4071, + "step": 2274 + }, + { + "epoch": 0.4559118236472946, + "grad_norm": 36.623231950837294, + "learning_rate": 9.918160145698776e-06, + "loss": 4.5259, + "step": 2275 + }, + { + "epoch": 0.4561122244488978, + "grad_norm": 44.75015053444008, + "learning_rate": 9.917949932559849e-06, + "loss": 4.0307, + "step": 2276 + }, + { + "epoch": 0.456312625250501, + "grad_norm": 75.35239345978397, + "learning_rate": 9.917739452024726e-06, + "loss": 4.2514, + "step": 2277 + }, + { + "epoch": 0.4565130260521042, + "grad_norm": 29.4508436935236, + "learning_rate": 9.917528704104848e-06, + "loss": 3.8992, + "step": 2278 + }, + { + "epoch": 0.4567134268537074, + 
"grad_norm": 30.99427485690951, + "learning_rate": 9.917317688811676e-06, + "loss": 4.2219, + "step": 2279 + }, + { + "epoch": 0.45691382765531063, + "grad_norm": 24.028408154121635, + "learning_rate": 9.917106406156684e-06, + "loss": 4.1861, + "step": 2280 + }, + { + "epoch": 0.45711422845691385, + "grad_norm": 28.554216489831123, + "learning_rate": 9.916894856151357e-06, + "loss": 4.2746, + "step": 2281 + }, + { + "epoch": 0.457314629258517, + "grad_norm": 61.35708646090652, + "learning_rate": 9.916683038807199e-06, + "loss": 4.6658, + "step": 2282 + }, + { + "epoch": 0.45751503006012023, + "grad_norm": 25.293588163403797, + "learning_rate": 9.916470954135727e-06, + "loss": 4.3678, + "step": 2283 + }, + { + "epoch": 0.45771543086172345, + "grad_norm": 31.299119014018345, + "learning_rate": 9.91625860214847e-06, + "loss": 3.7372, + "step": 2284 + }, + { + "epoch": 0.45791583166332667, + "grad_norm": 23.048299424634543, + "learning_rate": 9.916045982856977e-06, + "loss": 4.4787, + "step": 2285 + }, + { + "epoch": 0.4581162324649299, + "grad_norm": 31.950586147101962, + "learning_rate": 9.915833096272807e-06, + "loss": 4.3884, + "step": 2286 + }, + { + "epoch": 0.45831663326653305, + "grad_norm": 19.587974742175987, + "learning_rate": 9.915619942407535e-06, + "loss": 3.6217, + "step": 2287 + }, + { + "epoch": 0.45851703406813626, + "grad_norm": 38.96980146282656, + "learning_rate": 9.91540652127275e-06, + "loss": 3.6889, + "step": 2288 + }, + { + "epoch": 0.4587174348697395, + "grad_norm": 33.23423564593001, + "learning_rate": 9.915192832880058e-06, + "loss": 4.3091, + "step": 2289 + }, + { + "epoch": 0.4589178356713427, + "grad_norm": 30.29482199760411, + "learning_rate": 9.914978877241077e-06, + "loss": 4.0612, + "step": 2290 + }, + { + "epoch": 0.4591182364729459, + "grad_norm": 26.347858378398904, + "learning_rate": 9.914764654367436e-06, + "loss": 4.49, + "step": 2291 + }, + { + "epoch": 0.4593186372745491, + "grad_norm": 34.010945938802095, + "learning_rate": 9.914550164270788e-06, + "loss": 4.7599, + "step": 2292 + }, + { + "epoch": 0.4595190380761523, + "grad_norm": 46.01440902926408, + "learning_rate": 9.914335406962794e-06, + "loss": 4.0996, + "step": 2293 + }, + { + "epoch": 0.4597194388777555, + "grad_norm": 37.41255145319585, + "learning_rate": 9.914120382455129e-06, + "loss": 4.4652, + "step": 2294 + }, + { + "epoch": 0.45991983967935873, + "grad_norm": 28.81627084187453, + "learning_rate": 9.913905090759484e-06, + "loss": 4.429, + "step": 2295 + }, + { + "epoch": 0.46012024048096195, + "grad_norm": 53.93641998105982, + "learning_rate": 9.913689531887566e-06, + "loss": 3.9656, + "step": 2296 + }, + { + "epoch": 0.4603206412825651, + "grad_norm": 42.00701098166341, + "learning_rate": 9.913473705851095e-06, + "loss": 4.1021, + "step": 2297 + }, + { + "epoch": 0.46052104208416833, + "grad_norm": 38.205315528719666, + "learning_rate": 9.913257612661807e-06, + "loss": 4.4701, + "step": 2298 + }, + { + "epoch": 0.46072144288577155, + "grad_norm": 19.74622940781406, + "learning_rate": 9.913041252331447e-06, + "loss": 3.6563, + "step": 2299 + }, + { + "epoch": 0.46092184368737477, + "grad_norm": 31.61935838276101, + "learning_rate": 9.912824624871784e-06, + "loss": 5.3496, + "step": 2300 + }, + { + "epoch": 0.46112224448897793, + "grad_norm": 26.067057957352993, + "learning_rate": 9.912607730294593e-06, + "loss": 4.1447, + "step": 2301 + }, + { + "epoch": 0.46132264529058115, + "grad_norm": 33.84612662564789, + "learning_rate": 9.91239056861167e-06, + "loss": 4.7436, + "step": 2302 + 
}, + { + "epoch": 0.46152304609218436, + "grad_norm": 21.434805105992837, + "learning_rate": 9.912173139834817e-06, + "loss": 3.8747, + "step": 2303 + }, + { + "epoch": 0.4617234468937876, + "grad_norm": 20.027408122601063, + "learning_rate": 9.911955443975863e-06, + "loss": 4.2794, + "step": 2304 + }, + { + "epoch": 0.4619238476953908, + "grad_norm": 27.887658117121568, + "learning_rate": 9.911737481046638e-06, + "loss": 4.8326, + "step": 2305 + }, + { + "epoch": 0.46212424849699396, + "grad_norm": 20.23858134435642, + "learning_rate": 9.911519251058997e-06, + "loss": 4.2171, + "step": 2306 + }, + { + "epoch": 0.4623246492985972, + "grad_norm": 31.972211164666984, + "learning_rate": 9.911300754024804e-06, + "loss": 4.9687, + "step": 2307 + }, + { + "epoch": 0.4625250501002004, + "grad_norm": 34.97571263724098, + "learning_rate": 9.911081989955939e-06, + "loss": 4.3764, + "step": 2308 + }, + { + "epoch": 0.4627254509018036, + "grad_norm": 25.250363281750094, + "learning_rate": 9.910862958864298e-06, + "loss": 3.6034, + "step": 2309 + }, + { + "epoch": 0.46292585170340683, + "grad_norm": 23.26540958700895, + "learning_rate": 9.910643660761787e-06, + "loss": 4.1426, + "step": 2310 + }, + { + "epoch": 0.46312625250501, + "grad_norm": 27.164402433874248, + "learning_rate": 9.910424095660333e-06, + "loss": 3.6995, + "step": 2311 + }, + { + "epoch": 0.4633266533066132, + "grad_norm": 21.622836156780856, + "learning_rate": 9.910204263571872e-06, + "loss": 3.9319, + "step": 2312 + }, + { + "epoch": 0.46352705410821643, + "grad_norm": 33.79174507563793, + "learning_rate": 9.909984164508357e-06, + "loss": 4.3981, + "step": 2313 + }, + { + "epoch": 0.46372745490981965, + "grad_norm": 22.31873108999499, + "learning_rate": 9.909763798481756e-06, + "loss": 4.1941, + "step": 2314 + }, + { + "epoch": 0.46392785571142287, + "grad_norm": 26.80217140921068, + "learning_rate": 9.909543165504048e-06, + "loss": 4.2649, + "step": 2315 + }, + { + "epoch": 0.464128256513026, + "grad_norm": 30.5535452670344, + "learning_rate": 9.909322265587232e-06, + "loss": 4.2891, + "step": 2316 + }, + { + "epoch": 0.46432865731462925, + "grad_norm": 30.217703997533047, + "learning_rate": 9.909101098743317e-06, + "loss": 4.4881, + "step": 2317 + }, + { + "epoch": 0.46452905811623246, + "grad_norm": 32.00324328545348, + "learning_rate": 9.90887966498433e-06, + "loss": 4.3279, + "step": 2318 + }, + { + "epoch": 0.4647294589178357, + "grad_norm": 19.73673868457927, + "learning_rate": 9.908657964322309e-06, + "loss": 3.6597, + "step": 2319 + }, + { + "epoch": 0.4649298597194389, + "grad_norm": 40.618626174651084, + "learning_rate": 9.908435996769308e-06, + "loss": 4.2742, + "step": 2320 + }, + { + "epoch": 0.46513026052104206, + "grad_norm": 27.28047877658537, + "learning_rate": 9.908213762337396e-06, + "loss": 4.1097, + "step": 2321 + }, + { + "epoch": 0.4653306613226453, + "grad_norm": 26.058555153835055, + "learning_rate": 9.907991261038657e-06, + "loss": 3.8149, + "step": 2322 + }, + { + "epoch": 0.4655310621242485, + "grad_norm": 34.51771063966495, + "learning_rate": 9.90776849288519e-06, + "loss": 4.2703, + "step": 2323 + }, + { + "epoch": 0.4657314629258517, + "grad_norm": 24.513532487647506, + "learning_rate": 9.907545457889102e-06, + "loss": 3.7275, + "step": 2324 + }, + { + "epoch": 0.46593186372745493, + "grad_norm": 22.15722153765628, + "learning_rate": 9.907322156062527e-06, + "loss": 3.7284, + "step": 2325 + }, + { + "epoch": 0.4661322645290581, + "grad_norm": 30.15171847184458, + "learning_rate": 
9.907098587417601e-06, + "loss": 4.215, + "step": 2326 + }, + { + "epoch": 0.4663326653306613, + "grad_norm": 24.256630354329697, + "learning_rate": 9.90687475196648e-06, + "loss": 4.2919, + "step": 2327 + }, + { + "epoch": 0.46653306613226453, + "grad_norm": 38.256206446067445, + "learning_rate": 9.906650649721336e-06, + "loss": 4.1471, + "step": 2328 + }, + { + "epoch": 0.46673346693386775, + "grad_norm": 27.85960477758951, + "learning_rate": 9.906426280694354e-06, + "loss": 4.7658, + "step": 2329 + }, + { + "epoch": 0.46693386773547096, + "grad_norm": 30.31033579892959, + "learning_rate": 9.906201644897733e-06, + "loss": 4.1369, + "step": 2330 + }, + { + "epoch": 0.4671342685370741, + "grad_norm": 29.53060852454797, + "learning_rate": 9.905976742343686e-06, + "loss": 4.3639, + "step": 2331 + }, + { + "epoch": 0.46733466933867734, + "grad_norm": 44.09522870279142, + "learning_rate": 9.90575157304444e-06, + "loss": 4.2071, + "step": 2332 + }, + { + "epoch": 0.46753507014028056, + "grad_norm": 32.04711386276928, + "learning_rate": 9.905526137012243e-06, + "loss": 4.738, + "step": 2333 + }, + { + "epoch": 0.4677354709418838, + "grad_norm": 38.78703988607526, + "learning_rate": 9.905300434259347e-06, + "loss": 5.0199, + "step": 2334 + }, + { + "epoch": 0.467935871743487, + "grad_norm": 27.406710644305537, + "learning_rate": 9.905074464798024e-06, + "loss": 4.5644, + "step": 2335 + }, + { + "epoch": 0.46813627254509016, + "grad_norm": 24.12938204272556, + "learning_rate": 9.904848228640562e-06, + "loss": 4.4534, + "step": 2336 + }, + { + "epoch": 0.4683366733466934, + "grad_norm": 30.30567820502849, + "learning_rate": 9.904621725799262e-06, + "loss": 4.2797, + "step": 2337 + }, + { + "epoch": 0.4685370741482966, + "grad_norm": 28.119578852896325, + "learning_rate": 9.90439495628644e-06, + "loss": 4.0241, + "step": 2338 + }, + { + "epoch": 0.4687374749498998, + "grad_norm": 24.281542252732976, + "learning_rate": 9.904167920114424e-06, + "loss": 4.5197, + "step": 2339 + }, + { + "epoch": 0.46893787575150303, + "grad_norm": 19.393352090964974, + "learning_rate": 9.903940617295558e-06, + "loss": 3.6991, + "step": 2340 + }, + { + "epoch": 0.4691382765531062, + "grad_norm": 24.3232449914273, + "learning_rate": 9.903713047842203e-06, + "loss": 4.6439, + "step": 2341 + }, + { + "epoch": 0.4693386773547094, + "grad_norm": 27.19245246004317, + "learning_rate": 9.90348521176673e-06, + "loss": 4.7734, + "step": 2342 + }, + { + "epoch": 0.46953907815631263, + "grad_norm": 27.31687938268793, + "learning_rate": 9.90325710908153e-06, + "loss": 4.3964, + "step": 2343 + }, + { + "epoch": 0.46973947895791585, + "grad_norm": 25.488673808897456, + "learning_rate": 9.903028739799001e-06, + "loss": 3.853, + "step": 2344 + }, + { + "epoch": 0.46993987975951906, + "grad_norm": 32.00736501349698, + "learning_rate": 9.902800103931562e-06, + "loss": 4.6485, + "step": 2345 + }, + { + "epoch": 0.4701402805611222, + "grad_norm": 23.272509950329965, + "learning_rate": 9.902571201491647e-06, + "loss": 3.8968, + "step": 2346 + }, + { + "epoch": 0.47034068136272544, + "grad_norm": 17.88997304476553, + "learning_rate": 9.902342032491696e-06, + "loss": 3.983, + "step": 2347 + }, + { + "epoch": 0.47054108216432866, + "grad_norm": 26.942492079794214, + "learning_rate": 9.902112596944172e-06, + "loss": 4.4385, + "step": 2348 + }, + { + "epoch": 0.4707414829659319, + "grad_norm": 27.146066271421063, + "learning_rate": 9.90188289486155e-06, + "loss": 4.0947, + "step": 2349 + }, + { + "epoch": 0.4709418837675351, + "grad_norm": 
35.43670638710453, + "learning_rate": 9.901652926256319e-06, + "loss": 4.7676, + "step": 2350 + }, + { + "epoch": 0.47114228456913826, + "grad_norm": 17.642589825006436, + "learning_rate": 9.901422691140985e-06, + "loss": 3.644, + "step": 2351 + }, + { + "epoch": 0.4713426853707415, + "grad_norm": 31.878035291076184, + "learning_rate": 9.901192189528065e-06, + "loss": 4.5429, + "step": 2352 + }, + { + "epoch": 0.4715430861723447, + "grad_norm": 22.2413060533971, + "learning_rate": 9.900961421430086e-06, + "loss": 3.6732, + "step": 2353 + }, + { + "epoch": 0.4717434869739479, + "grad_norm": 48.510906550858124, + "learning_rate": 9.900730386859604e-06, + "loss": 4.5792, + "step": 2354 + }, + { + "epoch": 0.47194388777555113, + "grad_norm": 22.05732265668916, + "learning_rate": 9.900499085829174e-06, + "loss": 3.5915, + "step": 2355 + }, + { + "epoch": 0.4721442885771543, + "grad_norm": 27.5416885095582, + "learning_rate": 9.900267518351377e-06, + "loss": 4.6368, + "step": 2356 + }, + { + "epoch": 0.4723446893787575, + "grad_norm": 20.607291935216868, + "learning_rate": 9.900035684438802e-06, + "loss": 4.3066, + "step": 2357 + }, + { + "epoch": 0.4725450901803607, + "grad_norm": 36.40175223161532, + "learning_rate": 9.899803584104054e-06, + "loss": 4.5968, + "step": 2358 + }, + { + "epoch": 0.47274549098196395, + "grad_norm": 32.521929641796014, + "learning_rate": 9.899571217359749e-06, + "loss": 4.9732, + "step": 2359 + }, + { + "epoch": 0.4729458917835671, + "grad_norm": 23.86638264212831, + "learning_rate": 9.899338584218528e-06, + "loss": 3.9837, + "step": 2360 + }, + { + "epoch": 0.4731462925851703, + "grad_norm": 40.09388099810199, + "learning_rate": 9.899105684693034e-06, + "loss": 4.5213, + "step": 2361 + }, + { + "epoch": 0.47334669338677354, + "grad_norm": 31.719615355488145, + "learning_rate": 9.898872518795933e-06, + "loss": 4.4146, + "step": 2362 + }, + { + "epoch": 0.47354709418837676, + "grad_norm": 25.042472438542458, + "learning_rate": 9.898639086539901e-06, + "loss": 3.5549, + "step": 2363 + }, + { + "epoch": 0.47374749498998, + "grad_norm": 28.430904969526654, + "learning_rate": 9.898405387937631e-06, + "loss": 3.6436, + "step": 2364 + }, + { + "epoch": 0.47394789579158314, + "grad_norm": 27.20372504066417, + "learning_rate": 9.89817142300183e-06, + "loss": 4.1836, + "step": 2365 + }, + { + "epoch": 0.47414829659318636, + "grad_norm": 31.48350805158058, + "learning_rate": 9.897937191745217e-06, + "loss": 4.3183, + "step": 2366 + }, + { + "epoch": 0.4743486973947896, + "grad_norm": 23.03092805420031, + "learning_rate": 9.897702694180531e-06, + "loss": 3.8227, + "step": 2367 + }, + { + "epoch": 0.4745490981963928, + "grad_norm": 60.936540424997325, + "learning_rate": 9.897467930320519e-06, + "loss": 3.4289, + "step": 2368 + }, + { + "epoch": 0.474749498997996, + "grad_norm": 27.474296782981146, + "learning_rate": 9.897232900177946e-06, + "loss": 4.369, + "step": 2369 + }, + { + "epoch": 0.4749498997995992, + "grad_norm": 23.109263686489026, + "learning_rate": 9.89699760376559e-06, + "loss": 3.6015, + "step": 2370 + }, + { + "epoch": 0.4751503006012024, + "grad_norm": 23.44138565310031, + "learning_rate": 9.896762041096249e-06, + "loss": 4.2975, + "step": 2371 + }, + { + "epoch": 0.4753507014028056, + "grad_norm": 48.809408999908314, + "learning_rate": 9.896526212182724e-06, + "loss": 4.1513, + "step": 2372 + }, + { + "epoch": 0.4755511022044088, + "grad_norm": 31.251640300935378, + "learning_rate": 9.896290117037844e-06, + "loss": 3.755, + "step": 2373 + }, + { + "epoch": 
0.47575150300601204, + "grad_norm": 39.0202744438084, + "learning_rate": 9.89605375567444e-06, + "loss": 4.8884, + "step": 2374 + }, + { + "epoch": 0.4759519038076152, + "grad_norm": 33.91948592120795, + "learning_rate": 9.895817128105367e-06, + "loss": 4.1034, + "step": 2375 + }, + { + "epoch": 0.4761523046092184, + "grad_norm": 27.735485172967977, + "learning_rate": 9.89558023434349e-06, + "loss": 4.4019, + "step": 2376 + }, + { + "epoch": 0.47635270541082164, + "grad_norm": 23.745682443543647, + "learning_rate": 9.89534307440169e-06, + "loss": 3.9302, + "step": 2377 + }, + { + "epoch": 0.47655310621242486, + "grad_norm": 20.83130572525425, + "learning_rate": 9.89510564829286e-06, + "loss": 4.0773, + "step": 2378 + }, + { + "epoch": 0.4767535070140281, + "grad_norm": 23.88419737145629, + "learning_rate": 9.89486795602991e-06, + "loss": 4.0787, + "step": 2379 + }, + { + "epoch": 0.47695390781563124, + "grad_norm": 27.75809574998946, + "learning_rate": 9.894629997625764e-06, + "loss": 4.4822, + "step": 2380 + }, + { + "epoch": 0.47715430861723446, + "grad_norm": 19.677288092851747, + "learning_rate": 9.89439177309336e-06, + "loss": 3.4902, + "step": 2381 + }, + { + "epoch": 0.4773547094188377, + "grad_norm": 39.91174925003568, + "learning_rate": 9.89415328244565e-06, + "loss": 4.7508, + "step": 2382 + }, + { + "epoch": 0.4775551102204409, + "grad_norm": 20.23426222188601, + "learning_rate": 9.893914525695602e-06, + "loss": 3.9073, + "step": 2383 + }, + { + "epoch": 0.4777555110220441, + "grad_norm": 27.08669670258525, + "learning_rate": 9.893675502856196e-06, + "loss": 4.3371, + "step": 2384 + }, + { + "epoch": 0.4779559118236473, + "grad_norm": 25.328547762975656, + "learning_rate": 9.89343621394043e-06, + "loss": 4.0532, + "step": 2385 + }, + { + "epoch": 0.4781563126252505, + "grad_norm": 24.51986607442215, + "learning_rate": 9.893196658961315e-06, + "loss": 4.2084, + "step": 2386 + }, + { + "epoch": 0.4783567134268537, + "grad_norm": 26.72376936551587, + "learning_rate": 9.892956837931873e-06, + "loss": 4.7596, + "step": 2387 + }, + { + "epoch": 0.4785571142284569, + "grad_norm": 38.32833239842254, + "learning_rate": 9.892716750865146e-06, + "loss": 4.8952, + "step": 2388 + }, + { + "epoch": 0.47875751503006014, + "grad_norm": 22.539520513608963, + "learning_rate": 9.892476397774187e-06, + "loss": 4.1785, + "step": 2389 + }, + { + "epoch": 0.4789579158316633, + "grad_norm": 28.337023562201466, + "learning_rate": 9.892235778672063e-06, + "loss": 3.7277, + "step": 2390 + }, + { + "epoch": 0.4791583166332665, + "grad_norm": 148.3223426871411, + "learning_rate": 9.89199489357186e-06, + "loss": 4.8176, + "step": 2391 + }, + { + "epoch": 0.47935871743486974, + "grad_norm": 20.773707303875934, + "learning_rate": 9.891753742486671e-06, + "loss": 3.7767, + "step": 2392 + }, + { + "epoch": 0.47955911823647296, + "grad_norm": 31.778680865869177, + "learning_rate": 9.891512325429612e-06, + "loss": 4.3706, + "step": 2393 + }, + { + "epoch": 0.4797595190380762, + "grad_norm": 30.690305469090127, + "learning_rate": 9.891270642413807e-06, + "loss": 4.7671, + "step": 2394 + }, + { + "epoch": 0.47995991983967934, + "grad_norm": 29.779868583091904, + "learning_rate": 9.891028693452397e-06, + "loss": 4.5846, + "step": 2395 + }, + { + "epoch": 0.48016032064128256, + "grad_norm": 29.67310799827218, + "learning_rate": 9.890786478558537e-06, + "loss": 4.4215, + "step": 2396 + }, + { + "epoch": 0.4803607214428858, + "grad_norm": 21.904032195202532, + "learning_rate": 9.890543997745396e-06, + "loss": 3.9183, + 
"step": 2397 + }, + { + "epoch": 0.480561122244489, + "grad_norm": 29.910367451113398, + "learning_rate": 9.89030125102616e-06, + "loss": 4.8884, + "step": 2398 + }, + { + "epoch": 0.4807615230460922, + "grad_norm": 26.43513845741924, + "learning_rate": 9.890058238414024e-06, + "loss": 3.9632, + "step": 2399 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 21.070217414871287, + "learning_rate": 9.889814959922207e-06, + "loss": 3.9211, + "step": 2400 + }, + { + "epoch": 0.4811623246492986, + "grad_norm": 22.616742662204178, + "learning_rate": 9.889571415563929e-06, + "loss": 4.1948, + "step": 2401 + }, + { + "epoch": 0.4813627254509018, + "grad_norm": 26.873957374917318, + "learning_rate": 9.889327605352437e-06, + "loss": 4.4643, + "step": 2402 + }, + { + "epoch": 0.481563126252505, + "grad_norm": 35.50827029056545, + "learning_rate": 9.889083529300985e-06, + "loss": 4.5669, + "step": 2403 + }, + { + "epoch": 0.48176352705410824, + "grad_norm": 26.009383973927115, + "learning_rate": 9.888839187422846e-06, + "loss": 3.8206, + "step": 2404 + }, + { + "epoch": 0.4819639278557114, + "grad_norm": 30.113052838567185, + "learning_rate": 9.888594579731303e-06, + "loss": 3.8068, + "step": 2405 + }, + { + "epoch": 0.4821643286573146, + "grad_norm": 24.49335032789792, + "learning_rate": 9.888349706239656e-06, + "loss": 4.0896, + "step": 2406 + }, + { + "epoch": 0.48236472945891784, + "grad_norm": 24.016366071428873, + "learning_rate": 9.88810456696122e-06, + "loss": 4.4567, + "step": 2407 + }, + { + "epoch": 0.48256513026052106, + "grad_norm": 26.154584382305202, + "learning_rate": 9.887859161909323e-06, + "loss": 4.5493, + "step": 2408 + }, + { + "epoch": 0.4827655310621243, + "grad_norm": 26.281876649387886, + "learning_rate": 9.887613491097308e-06, + "loss": 4.3357, + "step": 2409 + }, + { + "epoch": 0.48296593186372744, + "grad_norm": 23.75177824802206, + "learning_rate": 9.887367554538534e-06, + "loss": 4.3605, + "step": 2410 + }, + { + "epoch": 0.48316633266533066, + "grad_norm": 35.570635093196614, + "learning_rate": 9.88712135224637e-06, + "loss": 4.4795, + "step": 2411 + }, + { + "epoch": 0.4833667334669339, + "grad_norm": 39.52878652764852, + "learning_rate": 9.886874884234205e-06, + "loss": 4.5525, + "step": 2412 + }, + { + "epoch": 0.4835671342685371, + "grad_norm": 24.84025193618411, + "learning_rate": 9.88662815051544e-06, + "loss": 4.2118, + "step": 2413 + }, + { + "epoch": 0.48376753507014025, + "grad_norm": 28.93611749469528, + "learning_rate": 9.886381151103487e-06, + "loss": 4.6871, + "step": 2414 + }, + { + "epoch": 0.48396793587174347, + "grad_norm": 30.423423155989937, + "learning_rate": 9.88613388601178e-06, + "loss": 4.5311, + "step": 2415 + }, + { + "epoch": 0.4841683366733467, + "grad_norm": 23.39178478185057, + "learning_rate": 9.885886355253758e-06, + "loss": 4.0648, + "step": 2416 + }, + { + "epoch": 0.4843687374749499, + "grad_norm": 22.707982040930677, + "learning_rate": 9.885638558842885e-06, + "loss": 4.1437, + "step": 2417 + }, + { + "epoch": 0.4845691382765531, + "grad_norm": 34.03258500629255, + "learning_rate": 9.885390496792631e-06, + "loss": 4.1626, + "step": 2418 + }, + { + "epoch": 0.4847695390781563, + "grad_norm": 21.8020115708907, + "learning_rate": 9.885142169116484e-06, + "loss": 4.0484, + "step": 2419 + }, + { + "epoch": 0.4849699398797595, + "grad_norm": 42.00201598965883, + "learning_rate": 9.884893575827946e-06, + "loss": 5.5461, + "step": 2420 + }, + { + "epoch": 0.4851703406813627, + "grad_norm": 21.60260370023659, + "learning_rate": 
9.884644716940535e-06, + "loss": 3.6106, + "step": 2421 + }, + { + "epoch": 0.48537074148296594, + "grad_norm": 26.970325901533975, + "learning_rate": 9.884395592467779e-06, + "loss": 4.5045, + "step": 2422 + }, + { + "epoch": 0.48557114228456916, + "grad_norm": 36.468214193961536, + "learning_rate": 9.884146202423224e-06, + "loss": 4.3835, + "step": 2423 + }, + { + "epoch": 0.4857715430861723, + "grad_norm": 54.43998155585417, + "learning_rate": 9.883896546820431e-06, + "loss": 4.019, + "step": 2424 + }, + { + "epoch": 0.48597194388777554, + "grad_norm": 37.13497972516295, + "learning_rate": 9.883646625672974e-06, + "loss": 4.6614, + "step": 2425 + }, + { + "epoch": 0.48617234468937875, + "grad_norm": 40.333548523288144, + "learning_rate": 9.883396438994442e-06, + "loss": 5.6829, + "step": 2426 + }, + { + "epoch": 0.486372745490982, + "grad_norm": 24.383151306587106, + "learning_rate": 9.883145986798435e-06, + "loss": 4.1986, + "step": 2427 + }, + { + "epoch": 0.4865731462925852, + "grad_norm": 25.40488322752081, + "learning_rate": 9.882895269098572e-06, + "loss": 4.4012, + "step": 2428 + }, + { + "epoch": 0.48677354709418835, + "grad_norm": 31.465481151383347, + "learning_rate": 9.882644285908487e-06, + "loss": 4.2298, + "step": 2429 + }, + { + "epoch": 0.48697394789579157, + "grad_norm": 42.64148430317465, + "learning_rate": 9.882393037241824e-06, + "loss": 4.4934, + "step": 2430 + }, + { + "epoch": 0.4871743486973948, + "grad_norm": 31.719759994080988, + "learning_rate": 9.882141523112246e-06, + "loss": 4.3221, + "step": 2431 + }, + { + "epoch": 0.487374749498998, + "grad_norm": 35.90348976091029, + "learning_rate": 9.881889743533425e-06, + "loss": 4.0193, + "step": 2432 + }, + { + "epoch": 0.4875751503006012, + "grad_norm": 35.45547203061879, + "learning_rate": 9.881637698519054e-06, + "loss": 4.1884, + "step": 2433 + }, + { + "epoch": 0.4877755511022044, + "grad_norm": 28.015837268704026, + "learning_rate": 9.881385388082833e-06, + "loss": 4.5016, + "step": 2434 + }, + { + "epoch": 0.4879759519038076, + "grad_norm": 27.628867065834775, + "learning_rate": 9.881132812238484e-06, + "loss": 4.1914, + "step": 2435 + }, + { + "epoch": 0.4881763527054108, + "grad_norm": 21.205530554035082, + "learning_rate": 9.880879970999739e-06, + "loss": 3.9382, + "step": 2436 + }, + { + "epoch": 0.48837675350701404, + "grad_norm": 23.978518071202846, + "learning_rate": 9.880626864380346e-06, + "loss": 4.0997, + "step": 2437 + }, + { + "epoch": 0.48857715430861726, + "grad_norm": 29.987249184287702, + "learning_rate": 9.880373492394065e-06, + "loss": 4.028, + "step": 2438 + }, + { + "epoch": 0.4887775551102204, + "grad_norm": 28.015542929960247, + "learning_rate": 9.880119855054672e-06, + "loss": 4.7458, + "step": 2439 + }, + { + "epoch": 0.48897795591182364, + "grad_norm": 25.385865393507665, + "learning_rate": 9.87986595237596e-06, + "loss": 3.8777, + "step": 2440 + }, + { + "epoch": 0.48917835671342685, + "grad_norm": 22.631829296235157, + "learning_rate": 9.879611784371733e-06, + "loss": 4.086, + "step": 2441 + }, + { + "epoch": 0.48937875751503007, + "grad_norm": 44.23162038824099, + "learning_rate": 9.87935735105581e-06, + "loss": 4.2457, + "step": 2442 + }, + { + "epoch": 0.4895791583166333, + "grad_norm": 40.202393662806465, + "learning_rate": 9.879102652442024e-06, + "loss": 4.4247, + "step": 2443 + }, + { + "epoch": 0.48977955911823645, + "grad_norm": 30.462548897595283, + "learning_rate": 9.878847688544226e-06, + "loss": 4.8697, + "step": 2444 + }, + { + "epoch": 0.48997995991983967, + 
"grad_norm": 28.262548094434283, + "learning_rate": 9.878592459376278e-06, + "loss": 4.1849, + "step": 2445 + }, + { + "epoch": 0.4901803607214429, + "grad_norm": 23.04689130059478, + "learning_rate": 9.878336964952056e-06, + "loss": 4.1965, + "step": 2446 + }, + { + "epoch": 0.4903807615230461, + "grad_norm": 31.331412192525107, + "learning_rate": 9.87808120528545e-06, + "loss": 4.5347, + "step": 2447 + }, + { + "epoch": 0.4905811623246493, + "grad_norm": 38.99897495810947, + "learning_rate": 9.87782518039037e-06, + "loss": 4.1379, + "step": 2448 + }, + { + "epoch": 0.4907815631262525, + "grad_norm": 23.104442633368063, + "learning_rate": 9.877568890280736e-06, + "loss": 3.5143, + "step": 2449 + }, + { + "epoch": 0.4909819639278557, + "grad_norm": 24.741634860174475, + "learning_rate": 9.877312334970479e-06, + "loss": 4.5427, + "step": 2450 + }, + { + "epoch": 0.4911823647294589, + "grad_norm": 43.789771781904314, + "learning_rate": 9.877055514473552e-06, + "loss": 3.9188, + "step": 2451 + }, + { + "epoch": 0.49138276553106214, + "grad_norm": 27.20836605870511, + "learning_rate": 9.876798428803917e-06, + "loss": 4.6119, + "step": 2452 + }, + { + "epoch": 0.49158316633266536, + "grad_norm": 29.246616782921247, + "learning_rate": 9.876541077975552e-06, + "loss": 4.6457, + "step": 2453 + }, + { + "epoch": 0.4917835671342685, + "grad_norm": 23.489356261093956, + "learning_rate": 9.876283462002451e-06, + "loss": 3.8803, + "step": 2454 + }, + { + "epoch": 0.49198396793587174, + "grad_norm": 28.56562300887527, + "learning_rate": 9.87602558089862e-06, + "loss": 4.0317, + "step": 2455 + }, + { + "epoch": 0.49218436873747495, + "grad_norm": 24.304501988475014, + "learning_rate": 9.875767434678082e-06, + "loss": 4.1364, + "step": 2456 + }, + { + "epoch": 0.49238476953907817, + "grad_norm": 19.42567281034897, + "learning_rate": 9.87550902335487e-06, + "loss": 3.6582, + "step": 2457 + }, + { + "epoch": 0.4925851703406814, + "grad_norm": 20.983337982720197, + "learning_rate": 9.875250346943035e-06, + "loss": 4.0167, + "step": 2458 + }, + { + "epoch": 0.49278557114228455, + "grad_norm": 25.366523870463762, + "learning_rate": 9.874991405456642e-06, + "loss": 4.2484, + "step": 2459 + }, + { + "epoch": 0.49298597194388777, + "grad_norm": 21.78893743976146, + "learning_rate": 9.874732198909772e-06, + "loss": 4.4121, + "step": 2460 + }, + { + "epoch": 0.493186372745491, + "grad_norm": 38.883551862643586, + "learning_rate": 9.874472727316514e-06, + "loss": 4.9063, + "step": 2461 + }, + { + "epoch": 0.4933867735470942, + "grad_norm": 29.734969413310452, + "learning_rate": 9.87421299069098e-06, + "loss": 4.3176, + "step": 2462 + }, + { + "epoch": 0.4935871743486974, + "grad_norm": 20.65570839723973, + "learning_rate": 9.87395298904729e-06, + "loss": 4.0255, + "step": 2463 + }, + { + "epoch": 0.4937875751503006, + "grad_norm": 33.87547764485125, + "learning_rate": 9.873692722399582e-06, + "loss": 4.7467, + "step": 2464 + }, + { + "epoch": 0.4939879759519038, + "grad_norm": 58.045061271197156, + "learning_rate": 9.873432190762006e-06, + "loss": 4.2426, + "step": 2465 + }, + { + "epoch": 0.494188376753507, + "grad_norm": 28.27127057780871, + "learning_rate": 9.873171394148728e-06, + "loss": 3.2838, + "step": 2466 + }, + { + "epoch": 0.49438877755511024, + "grad_norm": 26.10339711403185, + "learning_rate": 9.872910332573928e-06, + "loss": 4.3247, + "step": 2467 + }, + { + "epoch": 0.4945891783567134, + "grad_norm": 23.33734219257904, + "learning_rate": 9.8726490060518e-06, + "loss": 4.1358, + "step": 2468 + }, + { 
+ "epoch": 0.4947895791583166, + "grad_norm": 25.169161779301596, + "learning_rate": 9.872387414596554e-06, + "loss": 4.2998, + "step": 2469 + }, + { + "epoch": 0.49498997995991983, + "grad_norm": 24.53089133352484, + "learning_rate": 9.87212555822241e-06, + "loss": 3.3091, + "step": 2470 + }, + { + "epoch": 0.49519038076152305, + "grad_norm": 30.645536683507437, + "learning_rate": 9.87186343694361e-06, + "loss": 4.4904, + "step": 2471 + }, + { + "epoch": 0.49539078156312627, + "grad_norm": 21.063978558722557, + "learning_rate": 9.8716010507744e-06, + "loss": 3.3896, + "step": 2472 + }, + { + "epoch": 0.49559118236472943, + "grad_norm": 26.288545823762632, + "learning_rate": 9.871338399729051e-06, + "loss": 4.0804, + "step": 2473 + }, + { + "epoch": 0.49579158316633265, + "grad_norm": 23.473154232729634, + "learning_rate": 9.871075483821844e-06, + "loss": 4.2255, + "step": 2474 + }, + { + "epoch": 0.49599198396793587, + "grad_norm": 55.03298619258001, + "learning_rate": 9.870812303067072e-06, + "loss": 3.9315, + "step": 2475 + }, + { + "epoch": 0.4961923847695391, + "grad_norm": 30.06370216766803, + "learning_rate": 9.870548857479045e-06, + "loss": 4.9282, + "step": 2476 + }, + { + "epoch": 0.4963927855711423, + "grad_norm": 27.397551173670326, + "learning_rate": 9.870285147072086e-06, + "loss": 4.0567, + "step": 2477 + }, + { + "epoch": 0.49659318637274547, + "grad_norm": 32.2098873658279, + "learning_rate": 9.870021171860535e-06, + "loss": 3.9865, + "step": 2478 + }, + { + "epoch": 0.4967935871743487, + "grad_norm": 35.69634251052153, + "learning_rate": 9.869756931858745e-06, + "loss": 4.2958, + "step": 2479 + }, + { + "epoch": 0.4969939879759519, + "grad_norm": 29.054555074168015, + "learning_rate": 9.86949242708108e-06, + "loss": 4.2446, + "step": 2480 + }, + { + "epoch": 0.4971943887775551, + "grad_norm": 22.827227775939626, + "learning_rate": 9.869227657541925e-06, + "loss": 4.2052, + "step": 2481 + }, + { + "epoch": 0.49739478957915834, + "grad_norm": 27.62794713298114, + "learning_rate": 9.868962623255676e-06, + "loss": 4.276, + "step": 2482 + }, + { + "epoch": 0.4975951903807615, + "grad_norm": 24.67740918296789, + "learning_rate": 9.86869732423674e-06, + "loss": 4.0009, + "step": 2483 + }, + { + "epoch": 0.4977955911823647, + "grad_norm": 22.618456507009423, + "learning_rate": 9.868431760499545e-06, + "loss": 4.0472, + "step": 2484 + }, + { + "epoch": 0.49799599198396793, + "grad_norm": 36.00725097978652, + "learning_rate": 9.868165932058527e-06, + "loss": 4.363, + "step": 2485 + }, + { + "epoch": 0.49819639278557115, + "grad_norm": 33.23201827725477, + "learning_rate": 9.867899838928143e-06, + "loss": 3.9006, + "step": 2486 + }, + { + "epoch": 0.49839679358717437, + "grad_norm": 44.9723394850263, + "learning_rate": 9.867633481122858e-06, + "loss": 3.5556, + "step": 2487 + }, + { + "epoch": 0.49859719438877753, + "grad_norm": 43.74304113140069, + "learning_rate": 9.867366858657155e-06, + "loss": 4.1563, + "step": 2488 + }, + { + "epoch": 0.49879759519038075, + "grad_norm": 39.17195485794993, + "learning_rate": 9.867099971545532e-06, + "loss": 4.9176, + "step": 2489 + }, + { + "epoch": 0.49899799599198397, + "grad_norm": 25.358843106403185, + "learning_rate": 9.8668328198025e-06, + "loss": 4.5956, + "step": 2490 + }, + { + "epoch": 0.4991983967935872, + "grad_norm": 30.032653844522386, + "learning_rate": 9.866565403442581e-06, + "loss": 4.7992, + "step": 2491 + }, + { + "epoch": 0.4993987975951904, + "grad_norm": 36.47450884652448, + "learning_rate": 9.866297722480318e-06, + 
"loss": 4.2364, + "step": 2492 + }, + { + "epoch": 0.49959919839679356, + "grad_norm": 29.23038197122908, + "learning_rate": 9.866029776930264e-06, + "loss": 4.2447, + "step": 2493 + }, + { + "epoch": 0.4997995991983968, + "grad_norm": 44.704235609499705, + "learning_rate": 9.865761566806989e-06, + "loss": 4.4826, + "step": 2494 + }, + { + "epoch": 0.5, + "grad_norm": 43.53079748737486, + "learning_rate": 9.865493092125075e-06, + "loss": 4.3875, + "step": 2495 + }, + { + "epoch": 0.5002004008016032, + "grad_norm": 32.897348390211896, + "learning_rate": 9.86522435289912e-06, + "loss": 4.5396, + "step": 2496 + }, + { + "epoch": 0.5004008016032064, + "grad_norm": 22.37322496256202, + "learning_rate": 9.864955349143735e-06, + "loss": 3.878, + "step": 2497 + }, + { + "epoch": 0.5006012024048097, + "grad_norm": 25.566795494314647, + "learning_rate": 9.864686080873546e-06, + "loss": 4.3789, + "step": 2498 + }, + { + "epoch": 0.5008016032064129, + "grad_norm": 22.67161806650151, + "learning_rate": 9.864416548103194e-06, + "loss": 4.091, + "step": 2499 + }, + { + "epoch": 0.501002004008016, + "grad_norm": 19.33580878764467, + "learning_rate": 9.864146750847335e-06, + "loss": 3.8038, + "step": 2500 + }, + { + "epoch": 0.5012024048096192, + "grad_norm": 20.369046909484172, + "learning_rate": 9.863876689120635e-06, + "loss": 3.9104, + "step": 2501 + }, + { + "epoch": 0.5014028056112224, + "grad_norm": 27.593834862072605, + "learning_rate": 9.86360636293778e-06, + "loss": 4.0268, + "step": 2502 + }, + { + "epoch": 0.5016032064128256, + "grad_norm": 46.42368157912416, + "learning_rate": 9.863335772313467e-06, + "loss": 3.9962, + "step": 2503 + }, + { + "epoch": 0.5018036072144288, + "grad_norm": 27.752481516329393, + "learning_rate": 9.863064917262411e-06, + "loss": 4.2331, + "step": 2504 + }, + { + "epoch": 0.5020040080160321, + "grad_norm": 30.008382090527608, + "learning_rate": 9.862793797799336e-06, + "loss": 4.5445, + "step": 2505 + }, + { + "epoch": 0.5022044088176353, + "grad_norm": 34.23122732560596, + "learning_rate": 9.862522413938983e-06, + "loss": 4.5692, + "step": 2506 + }, + { + "epoch": 0.5024048096192385, + "grad_norm": 31.88420545739476, + "learning_rate": 9.86225076569611e-06, + "loss": 3.9634, + "step": 2507 + }, + { + "epoch": 0.5026052104208417, + "grad_norm": 34.02586980199603, + "learning_rate": 9.861978853085487e-06, + "loss": 4.2147, + "step": 2508 + }, + { + "epoch": 0.5028056112224449, + "grad_norm": 25.757128593250847, + "learning_rate": 9.861706676121894e-06, + "loss": 4.6581, + "step": 2509 + }, + { + "epoch": 0.503006012024048, + "grad_norm": 28.604735278946258, + "learning_rate": 9.861434234820135e-06, + "loss": 3.943, + "step": 2510 + }, + { + "epoch": 0.5032064128256513, + "grad_norm": 20.959580335966656, + "learning_rate": 9.861161529195018e-06, + "loss": 4.2616, + "step": 2511 + }, + { + "epoch": 0.5034068136272545, + "grad_norm": 17.487693201029842, + "learning_rate": 9.860888559261375e-06, + "loss": 3.8603, + "step": 2512 + }, + { + "epoch": 0.5036072144288577, + "grad_norm": 45.97153069488128, + "learning_rate": 9.860615325034044e-06, + "loss": 4.1716, + "step": 2513 + }, + { + "epoch": 0.5038076152304609, + "grad_norm": 55.9676035470228, + "learning_rate": 9.860341826527884e-06, + "loss": 4.3357, + "step": 2514 + }, + { + "epoch": 0.5040080160320641, + "grad_norm": 20.05048387819733, + "learning_rate": 9.860068063757764e-06, + "loss": 4.1127, + "step": 2515 + }, + { + "epoch": 0.5042084168336673, + "grad_norm": 34.88645940432102, + "learning_rate": 
9.859794036738569e-06, + "loss": 3.6724, + "step": 2516 + }, + { + "epoch": 0.5044088176352706, + "grad_norm": 30.03618624979091, + "learning_rate": 9.8595197454852e-06, + "loss": 3.6645, + "step": 2517 + }, + { + "epoch": 0.5046092184368738, + "grad_norm": 25.912489214108174, + "learning_rate": 9.859245190012567e-06, + "loss": 4.6909, + "step": 2518 + }, + { + "epoch": 0.504809619238477, + "grad_norm": 23.493063832648172, + "learning_rate": 9.858970370335601e-06, + "loss": 4.3225, + "step": 2519 + }, + { + "epoch": 0.5050100200400801, + "grad_norm": 30.294105619275427, + "learning_rate": 9.858695286469244e-06, + "loss": 4.7126, + "step": 2520 + }, + { + "epoch": 0.5052104208416833, + "grad_norm": 23.95596349734076, + "learning_rate": 9.85841993842845e-06, + "loss": 4.24, + "step": 2521 + }, + { + "epoch": 0.5054108216432865, + "grad_norm": 33.38740880837137, + "learning_rate": 9.858144326228194e-06, + "loss": 4.562, + "step": 2522 + }, + { + "epoch": 0.5056112224448898, + "grad_norm": 23.64532273530581, + "learning_rate": 9.85786844988346e-06, + "loss": 4.7947, + "step": 2523 + }, + { + "epoch": 0.505811623246493, + "grad_norm": 22.255124543582284, + "learning_rate": 9.857592309409247e-06, + "loss": 4.0638, + "step": 2524 + }, + { + "epoch": 0.5060120240480962, + "grad_norm": 22.602084021531965, + "learning_rate": 9.85731590482057e-06, + "loss": 4.0523, + "step": 2525 + }, + { + "epoch": 0.5062124248496994, + "grad_norm": 18.613973438580846, + "learning_rate": 9.857039236132458e-06, + "loss": 3.9788, + "step": 2526 + }, + { + "epoch": 0.5064128256513026, + "grad_norm": 27.732243124237556, + "learning_rate": 9.856762303359952e-06, + "loss": 4.2192, + "step": 2527 + }, + { + "epoch": 0.5066132264529059, + "grad_norm": 38.821698430426224, + "learning_rate": 9.85648510651811e-06, + "loss": 4.7259, + "step": 2528 + }, + { + "epoch": 0.5068136272545091, + "grad_norm": 26.29351600586704, + "learning_rate": 9.856207645622005e-06, + "loss": 4.4611, + "step": 2529 + }, + { + "epoch": 0.5070140280561122, + "grad_norm": 30.803667684141473, + "learning_rate": 9.85592992068672e-06, + "loss": 4.146, + "step": 2530 + }, + { + "epoch": 0.5072144288577154, + "grad_norm": 34.46815656701526, + "learning_rate": 9.855651931727359e-06, + "loss": 4.0564, + "step": 2531 + }, + { + "epoch": 0.5074148296593186, + "grad_norm": 26.396655668817267, + "learning_rate": 9.855373678759034e-06, + "loss": 4.3511, + "step": 2532 + }, + { + "epoch": 0.5076152304609218, + "grad_norm": 45.45852954993852, + "learning_rate": 9.855095161796875e-06, + "loss": 4.2874, + "step": 2533 + }, + { + "epoch": 0.507815631262525, + "grad_norm": 23.40650306179982, + "learning_rate": 9.854816380856025e-06, + "loss": 3.8804, + "step": 2534 + }, + { + "epoch": 0.5080160320641283, + "grad_norm": 31.884433552226245, + "learning_rate": 9.854537335951642e-06, + "loss": 4.1779, + "step": 2535 + }, + { + "epoch": 0.5082164328657315, + "grad_norm": 43.96502283675167, + "learning_rate": 9.8542580270989e-06, + "loss": 4.1671, + "step": 2536 + }, + { + "epoch": 0.5084168336673347, + "grad_norm": 25.69665131366203, + "learning_rate": 9.853978454312983e-06, + "loss": 4.299, + "step": 2537 + }, + { + "epoch": 0.5086172344689379, + "grad_norm": 25.398697572467455, + "learning_rate": 9.85369861760909e-06, + "loss": 3.8429, + "step": 2538 + }, + { + "epoch": 0.5088176352705411, + "grad_norm": 18.692607832333902, + "learning_rate": 9.85341851700244e-06, + "loss": 4.0597, + "step": 2539 + }, + { + "epoch": 0.5090180360721442, + "grad_norm": 39.08826556872088, + 
"learning_rate": 9.853138152508261e-06, + "loss": 4.2694, + "step": 2540 + }, + { + "epoch": 0.5092184368737475, + "grad_norm": 55.997294610402825, + "learning_rate": 9.852857524141797e-06, + "loss": 5.1641, + "step": 2541 + }, + { + "epoch": 0.5094188376753507, + "grad_norm": 17.88012812725888, + "learning_rate": 9.852576631918304e-06, + "loss": 3.6059, + "step": 2542 + }, + { + "epoch": 0.5096192384769539, + "grad_norm": 62.724791202641846, + "learning_rate": 9.852295475853059e-06, + "loss": 5.0053, + "step": 2543 + }, + { + "epoch": 0.5098196392785571, + "grad_norm": 28.794601817063892, + "learning_rate": 9.852014055961346e-06, + "loss": 4.0663, + "step": 2544 + }, + { + "epoch": 0.5100200400801603, + "grad_norm": 34.28986757367083, + "learning_rate": 9.851732372258465e-06, + "loss": 3.9099, + "step": 2545 + }, + { + "epoch": 0.5102204408817635, + "grad_norm": 27.06590701820271, + "learning_rate": 9.851450424759734e-06, + "loss": 4.2122, + "step": 2546 + }, + { + "epoch": 0.5104208416833668, + "grad_norm": 27.254935736170186, + "learning_rate": 9.851168213480482e-06, + "loss": 4.3556, + "step": 2547 + }, + { + "epoch": 0.51062124248497, + "grad_norm": 27.208602790600516, + "learning_rate": 9.850885738436054e-06, + "loss": 4.5169, + "step": 2548 + }, + { + "epoch": 0.5108216432865732, + "grad_norm": 39.4649263203075, + "learning_rate": 9.850602999641805e-06, + "loss": 4.5381, + "step": 2549 + }, + { + "epoch": 0.5110220440881763, + "grad_norm": 20.88017075217217, + "learning_rate": 9.850319997113114e-06, + "loss": 4.0904, + "step": 2550 + }, + { + "epoch": 0.5112224448897795, + "grad_norm": 22.956823271213317, + "learning_rate": 9.850036730865362e-06, + "loss": 4.0669, + "step": 2551 + }, + { + "epoch": 0.5114228456913827, + "grad_norm": 22.274889588882452, + "learning_rate": 9.849753200913957e-06, + "loss": 4.1371, + "step": 2552 + }, + { + "epoch": 0.511623246492986, + "grad_norm": 42.390438032613034, + "learning_rate": 9.84946940727431e-06, + "loss": 4.0152, + "step": 2553 + }, + { + "epoch": 0.5118236472945892, + "grad_norm": 26.755101668187184, + "learning_rate": 9.849185349961853e-06, + "loss": 3.9866, + "step": 2554 + }, + { + "epoch": 0.5120240480961924, + "grad_norm": 27.494291919031756, + "learning_rate": 9.848901028992031e-06, + "loss": 4.1358, + "step": 2555 + }, + { + "epoch": 0.5122244488977956, + "grad_norm": 33.2184639663924, + "learning_rate": 9.8486164443803e-06, + "loss": 4.2665, + "step": 2556 + }, + { + "epoch": 0.5124248496993988, + "grad_norm": 26.360100821706045, + "learning_rate": 9.848331596142137e-06, + "loss": 3.9316, + "step": 2557 + }, + { + "epoch": 0.512625250501002, + "grad_norm": 52.802621847910835, + "learning_rate": 9.84804648429303e-06, + "loss": 4.0489, + "step": 2558 + }, + { + "epoch": 0.5128256513026052, + "grad_norm": 52.49396211410612, + "learning_rate": 9.847761108848478e-06, + "loss": 4.943, + "step": 2559 + }, + { + "epoch": 0.5130260521042084, + "grad_norm": 23.816305996063207, + "learning_rate": 9.847475469823999e-06, + "loss": 4.2241, + "step": 2560 + }, + { + "epoch": 0.5132264529058116, + "grad_norm": 32.301892328669155, + "learning_rate": 9.847189567235123e-06, + "loss": 4.7545, + "step": 2561 + }, + { + "epoch": 0.5134268537074148, + "grad_norm": 36.95871824870126, + "learning_rate": 9.846903401097394e-06, + "loss": 4.6312, + "step": 2562 + }, + { + "epoch": 0.513627254509018, + "grad_norm": 35.89167011990708, + "learning_rate": 9.846616971426373e-06, + "loss": 4.1475, + "step": 2563 + }, + { + "epoch": 0.5138276553106212, + 
"grad_norm": 33.04464652105744, + "learning_rate": 9.846330278237634e-06, + "loss": 4.2622, + "step": 2564 + }, + { + "epoch": 0.5140280561122245, + "grad_norm": 27.361294031942084, + "learning_rate": 9.846043321546762e-06, + "loss": 4.2466, + "step": 2565 + }, + { + "epoch": 0.5142284569138277, + "grad_norm": 22.416839591990882, + "learning_rate": 9.845756101369364e-06, + "loss": 3.6748, + "step": 2566 + }, + { + "epoch": 0.5144288577154309, + "grad_norm": 20.9006373912375, + "learning_rate": 9.845468617721052e-06, + "loss": 4.066, + "step": 2567 + }, + { + "epoch": 0.5146292585170341, + "grad_norm": 26.872574221395453, + "learning_rate": 9.845180870617459e-06, + "loss": 3.9942, + "step": 2568 + }, + { + "epoch": 0.5148296593186372, + "grad_norm": 31.702808598420958, + "learning_rate": 9.84489286007423e-06, + "loss": 4.5119, + "step": 2569 + }, + { + "epoch": 0.5150300601202404, + "grad_norm": 39.47252444543564, + "learning_rate": 9.844604586107024e-06, + "loss": 4.5368, + "step": 2570 + }, + { + "epoch": 0.5152304609218437, + "grad_norm": 21.271574399660786, + "learning_rate": 9.844316048731515e-06, + "loss": 4.0546, + "step": 2571 + }, + { + "epoch": 0.5154308617234469, + "grad_norm": 22.879431942058158, + "learning_rate": 9.844027247963393e-06, + "loss": 4.3406, + "step": 2572 + }, + { + "epoch": 0.5156312625250501, + "grad_norm": 20.79665890811027, + "learning_rate": 9.84373818381836e-06, + "loss": 3.9237, + "step": 2573 + }, + { + "epoch": 0.5158316633266533, + "grad_norm": 25.423482297202966, + "learning_rate": 9.843448856312128e-06, + "loss": 4.3494, + "step": 2574 + }, + { + "epoch": 0.5160320641282565, + "grad_norm": 30.07655351152043, + "learning_rate": 9.843159265460434e-06, + "loss": 4.3153, + "step": 2575 + }, + { + "epoch": 0.5162324649298597, + "grad_norm": 40.889142585314474, + "learning_rate": 9.842869411279023e-06, + "loss": 4.6926, + "step": 2576 + }, + { + "epoch": 0.516432865731463, + "grad_norm": 27.913641452420215, + "learning_rate": 9.842579293783651e-06, + "loss": 4.743, + "step": 2577 + }, + { + "epoch": 0.5166332665330662, + "grad_norm": 21.56027361186348, + "learning_rate": 9.842288912990096e-06, + "loss": 4.019, + "step": 2578 + }, + { + "epoch": 0.5168336673346693, + "grad_norm": 24.51837450874744, + "learning_rate": 9.841998268914146e-06, + "loss": 3.8733, + "step": 2579 + }, + { + "epoch": 0.5170340681362725, + "grad_norm": 20.67426204513002, + "learning_rate": 9.8417073615716e-06, + "loss": 4.021, + "step": 2580 + }, + { + "epoch": 0.5172344689378757, + "grad_norm": 26.8205483521294, + "learning_rate": 9.84141619097828e-06, + "loss": 3.7457, + "step": 2581 + }, + { + "epoch": 0.5174348697394789, + "grad_norm": 23.031346873801805, + "learning_rate": 9.841124757150014e-06, + "loss": 4.3622, + "step": 2582 + }, + { + "epoch": 0.5176352705410822, + "grad_norm": 19.270930038910194, + "learning_rate": 9.84083306010265e-06, + "loss": 4.047, + "step": 2583 + }, + { + "epoch": 0.5178356713426854, + "grad_norm": 27.774279961808254, + "learning_rate": 9.840541099852046e-06, + "loss": 4.4296, + "step": 2584 + }, + { + "epoch": 0.5180360721442886, + "grad_norm": 36.48533035100257, + "learning_rate": 9.840248876414076e-06, + "loss": 3.7391, + "step": 2585 + }, + { + "epoch": 0.5182364729458918, + "grad_norm": 58.907538679366596, + "learning_rate": 9.83995638980463e-06, + "loss": 4.3371, + "step": 2586 + }, + { + "epoch": 0.518436873747495, + "grad_norm": 28.376098661903526, + "learning_rate": 9.839663640039612e-06, + "loss": 4.4154, + "step": 2587 + }, + { + "epoch": 
0.5186372745490982, + "grad_norm": 36.56820001747874, + "learning_rate": 9.839370627134938e-06, + "loss": 4.3209, + "step": 2588 + }, + { + "epoch": 0.5188376753507014, + "grad_norm": 47.68969457698344, + "learning_rate": 9.839077351106538e-06, + "loss": 4.2887, + "step": 2589 + }, + { + "epoch": 0.5190380761523046, + "grad_norm": 30.801620907604896, + "learning_rate": 9.83878381197036e-06, + "loss": 3.9478, + "step": 2590 + }, + { + "epoch": 0.5192384769539078, + "grad_norm": 30.78832159369641, + "learning_rate": 9.838490009742363e-06, + "loss": 4.3996, + "step": 2591 + }, + { + "epoch": 0.519438877755511, + "grad_norm": 19.315053983858892, + "learning_rate": 9.838195944438522e-06, + "loss": 3.7555, + "step": 2592 + }, + { + "epoch": 0.5196392785571142, + "grad_norm": 34.136715784480636, + "learning_rate": 9.837901616074827e-06, + "loss": 4.456, + "step": 2593 + }, + { + "epoch": 0.5198396793587174, + "grad_norm": 39.31401944182888, + "learning_rate": 9.837607024667278e-06, + "loss": 4.6233, + "step": 2594 + }, + { + "epoch": 0.5200400801603207, + "grad_norm": 27.019845051800125, + "learning_rate": 9.837312170231895e-06, + "loss": 4.5036, + "step": 2595 + }, + { + "epoch": 0.5202404809619239, + "grad_norm": 26.05441956786278, + "learning_rate": 9.83701705278471e-06, + "loss": 4.045, + "step": 2596 + }, + { + "epoch": 0.5204408817635271, + "grad_norm": 27.272966257723322, + "learning_rate": 9.836721672341764e-06, + "loss": 4.6761, + "step": 2597 + }, + { + "epoch": 0.5206412825651303, + "grad_norm": 36.62966072632823, + "learning_rate": 9.836426028919125e-06, + "loss": 4.6293, + "step": 2598 + }, + { + "epoch": 0.5208416833667334, + "grad_norm": 21.599002164294063, + "learning_rate": 9.836130122532861e-06, + "loss": 4.2369, + "step": 2599 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 19.38651623274021, + "learning_rate": 9.835833953199066e-06, + "loss": 3.9352, + "step": 2600 + }, + { + "epoch": 0.5212424849699399, + "grad_norm": 26.871036690880214, + "learning_rate": 9.835537520933838e-06, + "loss": 4.4731, + "step": 2601 + }, + { + "epoch": 0.5214428857715431, + "grad_norm": 21.1985123544921, + "learning_rate": 9.835240825753299e-06, + "loss": 4.1396, + "step": 2602 + }, + { + "epoch": 0.5216432865731463, + "grad_norm": 24.064098511435937, + "learning_rate": 9.834943867673577e-06, + "loss": 4.2029, + "step": 2603 + }, + { + "epoch": 0.5218436873747495, + "grad_norm": 21.438022298690573, + "learning_rate": 9.834646646710822e-06, + "loss": 3.6732, + "step": 2604 + }, + { + "epoch": 0.5220440881763527, + "grad_norm": 22.61374737324199, + "learning_rate": 9.834349162881189e-06, + "loss": 4.286, + "step": 2605 + }, + { + "epoch": 0.522244488977956, + "grad_norm": 22.675053505608982, + "learning_rate": 9.834051416200859e-06, + "loss": 3.9378, + "step": 2606 + }, + { + "epoch": 0.5224448897795592, + "grad_norm": 24.822139447394992, + "learning_rate": 9.833753406686016e-06, + "loss": 4.2473, + "step": 2607 + }, + { + "epoch": 0.5226452905811623, + "grad_norm": 35.54003639731588, + "learning_rate": 9.833455134352866e-06, + "loss": 4.5691, + "step": 2608 + }, + { + "epoch": 0.5228456913827655, + "grad_norm": 22.551497240275637, + "learning_rate": 9.833156599217626e-06, + "loss": 4.0587, + "step": 2609 + }, + { + "epoch": 0.5230460921843687, + "grad_norm": 32.8135058574283, + "learning_rate": 9.832857801296525e-06, + "loss": 3.5777, + "step": 2610 + }, + { + "epoch": 0.5232464929859719, + "grad_norm": 23.61883437163014, + "learning_rate": 9.832558740605814e-06, + "loss": 3.7693, + "step": 
2611 + }, + { + "epoch": 0.5234468937875751, + "grad_norm": 20.631906461956053, + "learning_rate": 9.83225941716175e-06, + "loss": 4.1225, + "step": 2612 + }, + { + "epoch": 0.5236472945891784, + "grad_norm": 58.58249525645405, + "learning_rate": 9.831959830980607e-06, + "loss": 3.9369, + "step": 2613 + }, + { + "epoch": 0.5238476953907816, + "grad_norm": 27.979927552289812, + "learning_rate": 9.831659982078677e-06, + "loss": 4.2315, + "step": 2614 + }, + { + "epoch": 0.5240480961923848, + "grad_norm": 27.72890386049554, + "learning_rate": 9.831359870472261e-06, + "loss": 4.6461, + "step": 2615 + }, + { + "epoch": 0.524248496993988, + "grad_norm": 20.715939478472702, + "learning_rate": 9.831059496177678e-06, + "loss": 4.1683, + "step": 2616 + }, + { + "epoch": 0.5244488977955912, + "grad_norm": 19.940896509799153, + "learning_rate": 9.830758859211258e-06, + "loss": 4.0497, + "step": 2617 + }, + { + "epoch": 0.5246492985971943, + "grad_norm": 23.617459770302617, + "learning_rate": 9.830457959589348e-06, + "loss": 3.8919, + "step": 2618 + }, + { + "epoch": 0.5248496993987976, + "grad_norm": 29.169379846302512, + "learning_rate": 9.830156797328308e-06, + "loss": 4.5395, + "step": 2619 + }, + { + "epoch": 0.5250501002004008, + "grad_norm": 34.88612779105737, + "learning_rate": 9.829855372444513e-06, + "loss": 4.0345, + "step": 2620 + }, + { + "epoch": 0.525250501002004, + "grad_norm": 22.07121635398203, + "learning_rate": 9.829553684954351e-06, + "loss": 3.9211, + "step": 2621 + }, + { + "epoch": 0.5254509018036072, + "grad_norm": 36.71483543097033, + "learning_rate": 9.829251734874228e-06, + "loss": 4.3672, + "step": 2622 + }, + { + "epoch": 0.5256513026052104, + "grad_norm": 29.317696938348337, + "learning_rate": 9.82894952222056e-06, + "loss": 4.3749, + "step": 2623 + }, + { + "epoch": 0.5258517034068136, + "grad_norm": 64.65735498377197, + "learning_rate": 9.828647047009776e-06, + "loss": 4.7703, + "step": 2624 + }, + { + "epoch": 0.5260521042084169, + "grad_norm": 32.403458511532705, + "learning_rate": 9.828344309258327e-06, + "loss": 4.9485, + "step": 2625 + }, + { + "epoch": 0.5262525050100201, + "grad_norm": 21.316762352965306, + "learning_rate": 9.828041308982668e-06, + "loss": 4.3622, + "step": 2626 + }, + { + "epoch": 0.5264529058116233, + "grad_norm": 26.73841659294914, + "learning_rate": 9.827738046199277e-06, + "loss": 4.6413, + "step": 2627 + }, + { + "epoch": 0.5266533066132264, + "grad_norm": 42.1574127064603, + "learning_rate": 9.82743452092464e-06, + "loss": 4.4391, + "step": 2628 + }, + { + "epoch": 0.5268537074148296, + "grad_norm": 28.29214989695583, + "learning_rate": 9.827130733175266e-06, + "loss": 4.138, + "step": 2629 + }, + { + "epoch": 0.5270541082164328, + "grad_norm": 35.83361696522377, + "learning_rate": 9.826826682967668e-06, + "loss": 4.7142, + "step": 2630 + }, + { + "epoch": 0.527254509018036, + "grad_norm": 36.59452908400607, + "learning_rate": 9.826522370318376e-06, + "loss": 4.7187, + "step": 2631 + }, + { + "epoch": 0.5274549098196393, + "grad_norm": 34.76815191922033, + "learning_rate": 9.826217795243939e-06, + "loss": 4.8328, + "step": 2632 + }, + { + "epoch": 0.5276553106212425, + "grad_norm": 41.596195628694886, + "learning_rate": 9.825912957760917e-06, + "loss": 4.225, + "step": 2633 + }, + { + "epoch": 0.5278557114228457, + "grad_norm": 23.765768058216537, + "learning_rate": 9.825607857885884e-06, + "loss": 3.8388, + "step": 2634 + }, + { + "epoch": 0.5280561122244489, + "grad_norm": 27.026386533442274, + "learning_rate": 9.825302495635426e-06, + 
"loss": 3.995, + "step": 2635 + }, + { + "epoch": 0.5282565130260521, + "grad_norm": 21.4443185790106, + "learning_rate": 9.824996871026152e-06, + "loss": 4.2094, + "step": 2636 + }, + { + "epoch": 0.5284569138276554, + "grad_norm": 28.417411764520025, + "learning_rate": 9.824690984074674e-06, + "loss": 4.2262, + "step": 2637 + }, + { + "epoch": 0.5286573146292585, + "grad_norm": 25.1074641034623, + "learning_rate": 9.824384834797626e-06, + "loss": 4.178, + "step": 2638 + }, + { + "epoch": 0.5288577154308617, + "grad_norm": 23.555302919384335, + "learning_rate": 9.824078423211651e-06, + "loss": 4.3201, + "step": 2639 + }, + { + "epoch": 0.5290581162324649, + "grad_norm": 25.002803140059726, + "learning_rate": 9.823771749333414e-06, + "loss": 3.9616, + "step": 2640 + }, + { + "epoch": 0.5292585170340681, + "grad_norm": 23.237471566796472, + "learning_rate": 9.823464813179585e-06, + "loss": 4.0835, + "step": 2641 + }, + { + "epoch": 0.5294589178356713, + "grad_norm": 33.691262455555545, + "learning_rate": 9.823157614766854e-06, + "loss": 4.2377, + "step": 2642 + }, + { + "epoch": 0.5296593186372746, + "grad_norm": 29.060310489918233, + "learning_rate": 9.822850154111924e-06, + "loss": 4.5522, + "step": 2643 + }, + { + "epoch": 0.5298597194388778, + "grad_norm": 26.919911490090477, + "learning_rate": 9.822542431231512e-06, + "loss": 4.3402, + "step": 2644 + }, + { + "epoch": 0.530060120240481, + "grad_norm": 34.28843114155003, + "learning_rate": 9.822234446142349e-06, + "loss": 4.3242, + "step": 2645 + }, + { + "epoch": 0.5302605210420842, + "grad_norm": 32.92125361758578, + "learning_rate": 9.821926198861182e-06, + "loss": 4.3301, + "step": 2646 + }, + { + "epoch": 0.5304609218436874, + "grad_norm": 38.50740496828879, + "learning_rate": 9.821617689404768e-06, + "loss": 4.4275, + "step": 2647 + }, + { + "epoch": 0.5306613226452905, + "grad_norm": 19.179572337528366, + "learning_rate": 9.821308917789884e-06, + "loss": 4.0504, + "step": 2648 + }, + { + "epoch": 0.5308617234468938, + "grad_norm": 23.14146660848231, + "learning_rate": 9.820999884033317e-06, + "loss": 4.0475, + "step": 2649 + }, + { + "epoch": 0.531062124248497, + "grad_norm": 29.458761544261904, + "learning_rate": 9.82069058815187e-06, + "loss": 4.2426, + "step": 2650 + }, + { + "epoch": 0.5312625250501002, + "grad_norm": 31.383226283770707, + "learning_rate": 9.82038103016236e-06, + "loss": 3.9724, + "step": 2651 + }, + { + "epoch": 0.5314629258517034, + "grad_norm": 22.85179921984248, + "learning_rate": 9.820071210081616e-06, + "loss": 4.3227, + "step": 2652 + }, + { + "epoch": 0.5316633266533066, + "grad_norm": 30.91363673499481, + "learning_rate": 9.819761127926489e-06, + "loss": 5.2763, + "step": 2653 + }, + { + "epoch": 0.5318637274549098, + "grad_norm": 20.411044039211824, + "learning_rate": 9.81945078371383e-06, + "loss": 3.8398, + "step": 2654 + }, + { + "epoch": 0.5320641282565131, + "grad_norm": 29.208620774530903, + "learning_rate": 9.819140177460523e-06, + "loss": 4.4768, + "step": 2655 + }, + { + "epoch": 0.5322645290581163, + "grad_norm": 44.797775434970546, + "learning_rate": 9.818829309183446e-06, + "loss": 4.3942, + "step": 2656 + }, + { + "epoch": 0.5324649298597195, + "grad_norm": 39.19879626576946, + "learning_rate": 9.818518178899508e-06, + "loss": 4.4771, + "step": 2657 + }, + { + "epoch": 0.5326653306613226, + "grad_norm": 33.52817589635592, + "learning_rate": 9.818206786625625e-06, + "loss": 4.0838, + "step": 2658 + }, + { + "epoch": 0.5328657314629258, + "grad_norm": 41.93987954684077, + "learning_rate": 
9.817895132378725e-06, + "loss": 4.7584, + "step": 2659 + }, + { + "epoch": 0.533066132264529, + "grad_norm": 33.52641438542398, + "learning_rate": 9.817583216175757e-06, + "loss": 4.1855, + "step": 2660 + }, + { + "epoch": 0.5332665330661323, + "grad_norm": 23.218163913650944, + "learning_rate": 9.817271038033676e-06, + "loss": 4.0608, + "step": 2661 + }, + { + "epoch": 0.5334669338677355, + "grad_norm": 23.087765918811748, + "learning_rate": 9.81695859796946e-06, + "loss": 4.4001, + "step": 2662 + }, + { + "epoch": 0.5336673346693387, + "grad_norm": 17.847237825303413, + "learning_rate": 9.816645896000091e-06, + "loss": 3.9264, + "step": 2663 + }, + { + "epoch": 0.5338677354709419, + "grad_norm": 24.000147369401386, + "learning_rate": 9.816332932142578e-06, + "loss": 4.7039, + "step": 2664 + }, + { + "epoch": 0.5340681362725451, + "grad_norm": 34.87536718801361, + "learning_rate": 9.816019706413932e-06, + "loss": 4.3404, + "step": 2665 + }, + { + "epoch": 0.5342685370741483, + "grad_norm": 29.618162779497208, + "learning_rate": 9.815706218831185e-06, + "loss": 4.7195, + "step": 2666 + }, + { + "epoch": 0.5344689378757514, + "grad_norm": 76.93867763251595, + "learning_rate": 9.815392469411384e-06, + "loss": 4.833, + "step": 2667 + }, + { + "epoch": 0.5346693386773547, + "grad_norm": 29.19259434823209, + "learning_rate": 9.815078458171585e-06, + "loss": 4.5121, + "step": 2668 + }, + { + "epoch": 0.5348697394789579, + "grad_norm": 40.27399694235347, + "learning_rate": 9.814764185128864e-06, + "loss": 4.9915, + "step": 2669 + }, + { + "epoch": 0.5350701402805611, + "grad_norm": 18.556134812420055, + "learning_rate": 9.814449650300307e-06, + "loss": 3.7869, + "step": 2670 + }, + { + "epoch": 0.5352705410821643, + "grad_norm": 24.365304918289027, + "learning_rate": 9.814134853703013e-06, + "loss": 3.9786, + "step": 2671 + }, + { + "epoch": 0.5354709418837675, + "grad_norm": 21.76898140721887, + "learning_rate": 9.813819795354103e-06, + "loss": 4.1946, + "step": 2672 + }, + { + "epoch": 0.5356713426853708, + "grad_norm": 38.28936813625571, + "learning_rate": 9.813504475270704e-06, + "loss": 4.4257, + "step": 2673 + }, + { + "epoch": 0.535871743486974, + "grad_norm": 31.483407279020373, + "learning_rate": 9.813188893469963e-06, + "loss": 4.6327, + "step": 2674 + }, + { + "epoch": 0.5360721442885772, + "grad_norm": 21.51608764169907, + "learning_rate": 9.812873049969033e-06, + "loss": 3.4885, + "step": 2675 + }, + { + "epoch": 0.5362725450901804, + "grad_norm": 36.37902962204116, + "learning_rate": 9.812556944785094e-06, + "loss": 4.2668, + "step": 2676 + }, + { + "epoch": 0.5364729458917835, + "grad_norm": 25.35608217542063, + "learning_rate": 9.81224057793533e-06, + "loss": 4.7525, + "step": 2677 + }, + { + "epoch": 0.5366733466933867, + "grad_norm": 27.4585909702261, + "learning_rate": 9.811923949436941e-06, + "loss": 4.0097, + "step": 2678 + }, + { + "epoch": 0.53687374749499, + "grad_norm": 17.82956575777597, + "learning_rate": 9.811607059307145e-06, + "loss": 3.8867, + "step": 2679 + }, + { + "epoch": 0.5370741482965932, + "grad_norm": 20.45778298139949, + "learning_rate": 9.81128990756317e-06, + "loss": 3.7108, + "step": 2680 + }, + { + "epoch": 0.5372745490981964, + "grad_norm": 19.132081527278725, + "learning_rate": 9.810972494222262e-06, + "loss": 3.5272, + "step": 2681 + }, + { + "epoch": 0.5374749498997996, + "grad_norm": 30.666436900693736, + "learning_rate": 9.810654819301675e-06, + "loss": 4.4217, + "step": 2682 + }, + { + "epoch": 0.5376753507014028, + "grad_norm": 
26.859447517394592, + "learning_rate": 9.810336882818687e-06, + "loss": 4.9514, + "step": 2683 + }, + { + "epoch": 0.537875751503006, + "grad_norm": 191.43141733267618, + "learning_rate": 9.81001868479058e-06, + "loss": 4.2248, + "step": 2684 + }, + { + "epoch": 0.5380761523046093, + "grad_norm": 25.65749418745323, + "learning_rate": 9.809700225234657e-06, + "loss": 3.9163, + "step": 2685 + }, + { + "epoch": 0.5382765531062125, + "grad_norm": 32.02808034625519, + "learning_rate": 9.809381504168235e-06, + "loss": 4.6185, + "step": 2686 + }, + { + "epoch": 0.5384769539078156, + "grad_norm": 17.802121897134352, + "learning_rate": 9.809062521608639e-06, + "loss": 3.4543, + "step": 2687 + }, + { + "epoch": 0.5386773547094188, + "grad_norm": 24.683561918108985, + "learning_rate": 9.808743277573216e-06, + "loss": 3.982, + "step": 2688 + }, + { + "epoch": 0.538877755511022, + "grad_norm": 19.07920857991838, + "learning_rate": 9.808423772079322e-06, + "loss": 3.7909, + "step": 2689 + }, + { + "epoch": 0.5390781563126252, + "grad_norm": 36.91548600933323, + "learning_rate": 9.80810400514433e-06, + "loss": 4.3884, + "step": 2690 + }, + { + "epoch": 0.5392785571142285, + "grad_norm": 49.649160442603275, + "learning_rate": 9.807783976785626e-06, + "loss": 4.2716, + "step": 2691 + }, + { + "epoch": 0.5394789579158317, + "grad_norm": 26.283857565787475, + "learning_rate": 9.807463687020611e-06, + "loss": 4.3078, + "step": 2692 + }, + { + "epoch": 0.5396793587174349, + "grad_norm": 54.16075478444984, + "learning_rate": 9.807143135866698e-06, + "loss": 4.6187, + "step": 2693 + }, + { + "epoch": 0.5398797595190381, + "grad_norm": 18.748340522563147, + "learning_rate": 9.806822323341317e-06, + "loss": 3.8345, + "step": 2694 + }, + { + "epoch": 0.5400801603206413, + "grad_norm": 25.777900529417604, + "learning_rate": 9.806501249461909e-06, + "loss": 4.1416, + "step": 2695 + }, + { + "epoch": 0.5402805611222445, + "grad_norm": 44.35944125810301, + "learning_rate": 9.806179914245935e-06, + "loss": 4.5265, + "step": 2696 + }, + { + "epoch": 0.5404809619238476, + "grad_norm": 24.0955627955539, + "learning_rate": 9.805858317710863e-06, + "loss": 4.0995, + "step": 2697 + }, + { + "epoch": 0.5406813627254509, + "grad_norm": 25.73582243024887, + "learning_rate": 9.805536459874182e-06, + "loss": 4.0855, + "step": 2698 + }, + { + "epoch": 0.5408817635270541, + "grad_norm": 31.231922423641752, + "learning_rate": 9.80521434075339e-06, + "loss": 4.506, + "step": 2699 + }, + { + "epoch": 0.5410821643286573, + "grad_norm": 36.61602124908788, + "learning_rate": 9.804891960366e-06, + "loss": 4.0242, + "step": 2700 + }, + { + "epoch": 0.5412825651302605, + "grad_norm": 21.870626146212423, + "learning_rate": 9.804569318729543e-06, + "loss": 3.5106, + "step": 2701 + }, + { + "epoch": 0.5414829659318637, + "grad_norm": 28.523371117589384, + "learning_rate": 9.804246415861558e-06, + "loss": 3.5967, + "step": 2702 + }, + { + "epoch": 0.541683366733467, + "grad_norm": 45.13548416153298, + "learning_rate": 9.803923251779604e-06, + "loss": 4.3759, + "step": 2703 + }, + { + "epoch": 0.5418837675350702, + "grad_norm": 24.566407095539894, + "learning_rate": 9.803599826501251e-06, + "loss": 3.9027, + "step": 2704 + }, + { + "epoch": 0.5420841683366734, + "grad_norm": 29.471702052640353, + "learning_rate": 9.803276140044086e-06, + "loss": 3.8668, + "step": 2705 + }, + { + "epoch": 0.5422845691382766, + "grad_norm": 27.62627091087858, + "learning_rate": 9.802952192425707e-06, + "loss": 4.1399, + "step": 2706 + }, + { + "epoch": 
0.5424849699398797, + "grad_norm": 27.242805822942362, + "learning_rate": 9.802627983663726e-06, + "loss": 4.3263, + "step": 2707 + }, + { + "epoch": 0.5426853707414829, + "grad_norm": 68.13429816364697, + "learning_rate": 9.802303513775775e-06, + "loss": 4.4684, + "step": 2708 + }, + { + "epoch": 0.5428857715430861, + "grad_norm": 21.3285085126443, + "learning_rate": 9.801978782779491e-06, + "loss": 3.9565, + "step": 2709 + }, + { + "epoch": 0.5430861723446894, + "grad_norm": 29.83412274583456, + "learning_rate": 9.801653790692532e-06, + "loss": 3.6157, + "step": 2710 + }, + { + "epoch": 0.5432865731462926, + "grad_norm": 28.051122706710988, + "learning_rate": 9.801328537532569e-06, + "loss": 4.2971, + "step": 2711 + }, + { + "epoch": 0.5434869739478958, + "grad_norm": 49.26648636869873, + "learning_rate": 9.801003023317285e-06, + "loss": 5.212, + "step": 2712 + }, + { + "epoch": 0.543687374749499, + "grad_norm": 24.78232709945016, + "learning_rate": 9.800677248064383e-06, + "loss": 4.1896, + "step": 2713 + }, + { + "epoch": 0.5438877755511022, + "grad_norm": 30.721571246051393, + "learning_rate": 9.80035121179157e-06, + "loss": 4.0825, + "step": 2714 + }, + { + "epoch": 0.5440881763527055, + "grad_norm": 41.12480464048516, + "learning_rate": 9.800024914516575e-06, + "loss": 4.3072, + "step": 2715 + }, + { + "epoch": 0.5442885771543087, + "grad_norm": 37.8499789793319, + "learning_rate": 9.799698356257141e-06, + "loss": 4.4476, + "step": 2716 + }, + { + "epoch": 0.5444889779559118, + "grad_norm": 25.525718412669562, + "learning_rate": 9.799371537031024e-06, + "loss": 4.4743, + "step": 2717 + }, + { + "epoch": 0.544689378757515, + "grad_norm": 24.78076203504575, + "learning_rate": 9.79904445685599e-06, + "loss": 3.6375, + "step": 2718 + }, + { + "epoch": 0.5448897795591182, + "grad_norm": 21.772538742812102, + "learning_rate": 9.798717115749825e-06, + "loss": 3.9138, + "step": 2719 + }, + { + "epoch": 0.5450901803607214, + "grad_norm": 33.62530596380182, + "learning_rate": 9.798389513730327e-06, + "loss": 4.223, + "step": 2720 + }, + { + "epoch": 0.5452905811623247, + "grad_norm": 27.392800970885197, + "learning_rate": 9.798061650815307e-06, + "loss": 3.6149, + "step": 2721 + }, + { + "epoch": 0.5454909819639279, + "grad_norm": 26.23594738768637, + "learning_rate": 9.797733527022593e-06, + "loss": 4.8375, + "step": 2722 + }, + { + "epoch": 0.5456913827655311, + "grad_norm": 56.27327396847454, + "learning_rate": 9.797405142370027e-06, + "loss": 4.3763, + "step": 2723 + }, + { + "epoch": 0.5458917835671343, + "grad_norm": 29.83817459192356, + "learning_rate": 9.79707649687546e-06, + "loss": 4.7641, + "step": 2724 + }, + { + "epoch": 0.5460921843687375, + "grad_norm": 29.91664926355359, + "learning_rate": 9.796747590556764e-06, + "loss": 3.9911, + "step": 2725 + }, + { + "epoch": 0.5462925851703406, + "grad_norm": 22.568143072458696, + "learning_rate": 9.796418423431819e-06, + "loss": 4.2647, + "step": 2726 + }, + { + "epoch": 0.5464929859719438, + "grad_norm": 29.843587427245534, + "learning_rate": 9.796088995518524e-06, + "loss": 4.1791, + "step": 2727 + }, + { + "epoch": 0.5466933867735471, + "grad_norm": 25.476498017979672, + "learning_rate": 9.795759306834793e-06, + "loss": 4.0561, + "step": 2728 + }, + { + "epoch": 0.5468937875751503, + "grad_norm": 27.28278882402715, + "learning_rate": 9.795429357398548e-06, + "loss": 4.5363, + "step": 2729 + }, + { + "epoch": 0.5470941883767535, + "grad_norm": 25.213184466450045, + "learning_rate": 9.79509914722773e-06, + "loss": 3.9529, + "step": 
2730 + }, + { + "epoch": 0.5472945891783567, + "grad_norm": 46.68846040443938, + "learning_rate": 9.794768676340293e-06, + "loss": 4.0012, + "step": 2731 + }, + { + "epoch": 0.5474949899799599, + "grad_norm": 20.71222789225276, + "learning_rate": 9.794437944754205e-06, + "loss": 3.8759, + "step": 2732 + }, + { + "epoch": 0.5476953907815632, + "grad_norm": 26.596388541870983, + "learning_rate": 9.79410695248745e-06, + "loss": 4.4114, + "step": 2733 + }, + { + "epoch": 0.5478957915831664, + "grad_norm": 26.636427545395435, + "learning_rate": 9.793775699558024e-06, + "loss": 4.8794, + "step": 2734 + }, + { + "epoch": 0.5480961923847696, + "grad_norm": 32.42662809955328, + "learning_rate": 9.793444185983934e-06, + "loss": 4.3287, + "step": 2735 + }, + { + "epoch": 0.5482965931863727, + "grad_norm": 34.327647820422804, + "learning_rate": 9.79311241178321e-06, + "loss": 4.737, + "step": 2736 + }, + { + "epoch": 0.5484969939879759, + "grad_norm": 30.109679115046525, + "learning_rate": 9.792780376973888e-06, + "loss": 5.0158, + "step": 2737 + }, + { + "epoch": 0.5486973947895791, + "grad_norm": 32.04265042823614, + "learning_rate": 9.792448081574022e-06, + "loss": 4.7135, + "step": 2738 + }, + { + "epoch": 0.5488977955911823, + "grad_norm": 22.592324106506098, + "learning_rate": 9.792115525601679e-06, + "loss": 4.189, + "step": 2739 + }, + { + "epoch": 0.5490981963927856, + "grad_norm": 19.929799547866892, + "learning_rate": 9.791782709074944e-06, + "loss": 3.5242, + "step": 2740 + }, + { + "epoch": 0.5492985971943888, + "grad_norm": 34.030288463225766, + "learning_rate": 9.791449632011907e-06, + "loss": 4.0111, + "step": 2741 + }, + { + "epoch": 0.549498997995992, + "grad_norm": 27.32901659721259, + "learning_rate": 9.791116294430681e-06, + "loss": 4.3439, + "step": 2742 + }, + { + "epoch": 0.5496993987975952, + "grad_norm": 23.06849229095208, + "learning_rate": 9.79078269634939e-06, + "loss": 3.8525, + "step": 2743 + }, + { + "epoch": 0.5498997995991984, + "grad_norm": 27.48171489495013, + "learning_rate": 9.790448837786173e-06, + "loss": 4.1762, + "step": 2744 + }, + { + "epoch": 0.5501002004008017, + "grad_norm": 20.26194319180912, + "learning_rate": 9.79011471875918e-06, + "loss": 4.2401, + "step": 2745 + }, + { + "epoch": 0.5503006012024048, + "grad_norm": 24.43637333019393, + "learning_rate": 9.78978033928658e-06, + "loss": 4.2033, + "step": 2746 + }, + { + "epoch": 0.550501002004008, + "grad_norm": 20.438741313101165, + "learning_rate": 9.789445699386552e-06, + "loss": 4.2664, + "step": 2747 + }, + { + "epoch": 0.5507014028056112, + "grad_norm": 36.88496888954092, + "learning_rate": 9.78911079907729e-06, + "loss": 3.9234, + "step": 2748 + }, + { + "epoch": 0.5509018036072144, + "grad_norm": 31.215801446640842, + "learning_rate": 9.788775638377008e-06, + "loss": 4.3455, + "step": 2749 + }, + { + "epoch": 0.5511022044088176, + "grad_norm": 23.707172519534605, + "learning_rate": 9.788440217303923e-06, + "loss": 3.8168, + "step": 2750 + }, + { + "epoch": 0.5513026052104208, + "grad_norm": 24.200849318214292, + "learning_rate": 9.788104535876275e-06, + "loss": 4.4043, + "step": 2751 + }, + { + "epoch": 0.5515030060120241, + "grad_norm": 42.77173995868858, + "learning_rate": 9.787768594112316e-06, + "loss": 4.2227, + "step": 2752 + }, + { + "epoch": 0.5517034068136273, + "grad_norm": 21.629114336888875, + "learning_rate": 9.787432392030312e-06, + "loss": 4.0541, + "step": 2753 + }, + { + "epoch": 0.5519038076152305, + "grad_norm": 23.234596087891276, + "learning_rate": 9.78709592964854e-06, + 
"loss": 4.4206, + "step": 2754 + }, + { + "epoch": 0.5521042084168337, + "grad_norm": 25.451976273659284, + "learning_rate": 9.786759206985296e-06, + "loss": 4.2197, + "step": 2755 + }, + { + "epoch": 0.5523046092184368, + "grad_norm": 30.7531744499332, + "learning_rate": 9.78642222405889e-06, + "loss": 4.4805, + "step": 2756 + }, + { + "epoch": 0.55250501002004, + "grad_norm": 22.944986344502084, + "learning_rate": 9.786084980887642e-06, + "loss": 3.931, + "step": 2757 + }, + { + "epoch": 0.5527054108216433, + "grad_norm": 28.88892289106622, + "learning_rate": 9.785747477489888e-06, + "loss": 4.5979, + "step": 2758 + }, + { + "epoch": 0.5529058116232465, + "grad_norm": 30.602485701726252, + "learning_rate": 9.785409713883978e-06, + "loss": 4.2588, + "step": 2759 + }, + { + "epoch": 0.5531062124248497, + "grad_norm": 56.099682440426136, + "learning_rate": 9.78507169008828e-06, + "loss": 4.1097, + "step": 2760 + }, + { + "epoch": 0.5533066132264529, + "grad_norm": 36.23026772111192, + "learning_rate": 9.78473340612117e-06, + "loss": 4.0401, + "step": 2761 + }, + { + "epoch": 0.5535070140280561, + "grad_norm": 51.94726993949978, + "learning_rate": 9.78439486200104e-06, + "loss": 4.803, + "step": 2762 + }, + { + "epoch": 0.5537074148296594, + "grad_norm": 25.433047716395862, + "learning_rate": 9.784056057746303e-06, + "loss": 3.9612, + "step": 2763 + }, + { + "epoch": 0.5539078156312626, + "grad_norm": 26.09479934662594, + "learning_rate": 9.783716993375375e-06, + "loss": 4.6722, + "step": 2764 + }, + { + "epoch": 0.5541082164328658, + "grad_norm": 38.13246442525629, + "learning_rate": 9.783377668906691e-06, + "loss": 4.5637, + "step": 2765 + }, + { + "epoch": 0.5543086172344689, + "grad_norm": 35.78917030139564, + "learning_rate": 9.783038084358703e-06, + "loss": 4.4513, + "step": 2766 + }, + { + "epoch": 0.5545090180360721, + "grad_norm": 32.30711963349897, + "learning_rate": 9.782698239749872e-06, + "loss": 4.0372, + "step": 2767 + }, + { + "epoch": 0.5547094188376753, + "grad_norm": 38.82028976557841, + "learning_rate": 9.78235813509868e-06, + "loss": 4.7761, + "step": 2768 + }, + { + "epoch": 0.5549098196392785, + "grad_norm": 28.54383598588499, + "learning_rate": 9.782017770423617e-06, + "loss": 3.9248, + "step": 2769 + }, + { + "epoch": 0.5551102204408818, + "grad_norm": 32.40840615926477, + "learning_rate": 9.781677145743186e-06, + "loss": 3.9658, + "step": 2770 + }, + { + "epoch": 0.555310621242485, + "grad_norm": 33.070167012444514, + "learning_rate": 9.781336261075913e-06, + "loss": 5.0484, + "step": 2771 + }, + { + "epoch": 0.5555110220440882, + "grad_norm": 28.38718023194632, + "learning_rate": 9.78099511644033e-06, + "loss": 4.4234, + "step": 2772 + }, + { + "epoch": 0.5557114228456914, + "grad_norm": 24.11567779043712, + "learning_rate": 9.780653711854984e-06, + "loss": 4.1602, + "step": 2773 + }, + { + "epoch": 0.5559118236472946, + "grad_norm": 33.34398194943299, + "learning_rate": 9.780312047338438e-06, + "loss": 4.4378, + "step": 2774 + }, + { + "epoch": 0.5561122244488977, + "grad_norm": 30.323534780165485, + "learning_rate": 9.77997012290927e-06, + "loss": 4.8797, + "step": 2775 + }, + { + "epoch": 0.556312625250501, + "grad_norm": 36.13823189643893, + "learning_rate": 9.779627938586073e-06, + "loss": 4.9404, + "step": 2776 + }, + { + "epoch": 0.5565130260521042, + "grad_norm": 29.029404707421865, + "learning_rate": 9.779285494387448e-06, + "loss": 4.1011, + "step": 2777 + }, + { + "epoch": 0.5567134268537074, + "grad_norm": 26.990077300744428, + "learning_rate": 
9.778942790332015e-06, + "loss": 4.5598, + "step": 2778 + }, + { + "epoch": 0.5569138276553106, + "grad_norm": 50.0952678060997, + "learning_rate": 9.778599826438409e-06, + "loss": 4.3307, + "step": 2779 + }, + { + "epoch": 0.5571142284569138, + "grad_norm": 24.085732448439593, + "learning_rate": 9.778256602725276e-06, + "loss": 4.3835, + "step": 2780 + }, + { + "epoch": 0.557314629258517, + "grad_norm": 20.34448596486883, + "learning_rate": 9.777913119211277e-06, + "loss": 4.1335, + "step": 2781 + }, + { + "epoch": 0.5575150300601203, + "grad_norm": 23.3038058444429, + "learning_rate": 9.77756937591509e-06, + "loss": 4.3604, + "step": 2782 + }, + { + "epoch": 0.5577154308617235, + "grad_norm": 25.535454097822665, + "learning_rate": 9.777225372855406e-06, + "loss": 3.8248, + "step": 2783 + }, + { + "epoch": 0.5579158316633267, + "grad_norm": 43.40139323037765, + "learning_rate": 9.776881110050923e-06, + "loss": 4.6219, + "step": 2784 + }, + { + "epoch": 0.5581162324649298, + "grad_norm": 42.313188343946834, + "learning_rate": 9.776536587520364e-06, + "loss": 4.5951, + "step": 2785 + }, + { + "epoch": 0.558316633266533, + "grad_norm": 21.312244915868803, + "learning_rate": 9.77619180528246e-06, + "loss": 3.7186, + "step": 2786 + }, + { + "epoch": 0.5585170340681362, + "grad_norm": 27.334373860131898, + "learning_rate": 9.775846763355958e-06, + "loss": 4.0929, + "step": 2787 + }, + { + "epoch": 0.5587174348697395, + "grad_norm": 20.161576762145675, + "learning_rate": 9.775501461759617e-06, + "loss": 4.1336, + "step": 2788 + }, + { + "epoch": 0.5589178356713427, + "grad_norm": 23.314839525587683, + "learning_rate": 9.775155900512212e-06, + "loss": 4.262, + "step": 2789 + }, + { + "epoch": 0.5591182364729459, + "grad_norm": 25.476506247800188, + "learning_rate": 9.774810079632533e-06, + "loss": 4.5963, + "step": 2790 + }, + { + "epoch": 0.5593186372745491, + "grad_norm": 28.90158135942523, + "learning_rate": 9.774463999139382e-06, + "loss": 3.9604, + "step": 2791 + }, + { + "epoch": 0.5595190380761523, + "grad_norm": 20.664351098562662, + "learning_rate": 9.774117659051574e-06, + "loss": 4.1909, + "step": 2792 + }, + { + "epoch": 0.5597194388777555, + "grad_norm": 21.537637645362807, + "learning_rate": 9.773771059387942e-06, + "loss": 4.1509, + "step": 2793 + }, + { + "epoch": 0.5599198396793588, + "grad_norm": 28.684328776111364, + "learning_rate": 9.773424200167329e-06, + "loss": 3.8464, + "step": 2794 + }, + { + "epoch": 0.5601202404809619, + "grad_norm": 30.887761064009094, + "learning_rate": 9.7730770814086e-06, + "loss": 4.6495, + "step": 2795 + }, + { + "epoch": 0.5603206412825651, + "grad_norm": 26.10746903624003, + "learning_rate": 9.772729703130621e-06, + "loss": 4.8343, + "step": 2796 + }, + { + "epoch": 0.5605210420841683, + "grad_norm": 30.890057028990768, + "learning_rate": 9.772382065352286e-06, + "loss": 4.2625, + "step": 2797 + }, + { + "epoch": 0.5607214428857715, + "grad_norm": 30.271224284158716, + "learning_rate": 9.772034168092491e-06, + "loss": 4.0577, + "step": 2798 + }, + { + "epoch": 0.5609218436873747, + "grad_norm": 29.083275973454505, + "learning_rate": 9.771686011370154e-06, + "loss": 4.6893, + "step": 2799 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 51.041998403805174, + "learning_rate": 9.771337595204206e-06, + "loss": 3.6273, + "step": 2800 + }, + { + "epoch": 0.5613226452905812, + "grad_norm": 24.897386587272447, + "learning_rate": 9.770988919613588e-06, + "loss": 4.3473, + "step": 2801 + }, + { + "epoch": 0.5615230460921844, + "grad_norm": 
34.61217171374729, + "learning_rate": 9.770639984617262e-06, + "loss": 4.3177, + "step": 2802 + }, + { + "epoch": 0.5617234468937876, + "grad_norm": 20.926755149109308, + "learning_rate": 9.770290790234199e-06, + "loss": 4.2255, + "step": 2803 + }, + { + "epoch": 0.5619238476953908, + "grad_norm": 26.940858214883207, + "learning_rate": 9.769941336483383e-06, + "loss": 4.0677, + "step": 2804 + }, + { + "epoch": 0.5621242484969939, + "grad_norm": 20.75789977736449, + "learning_rate": 9.769591623383812e-06, + "loss": 3.8553, + "step": 2805 + }, + { + "epoch": 0.5623246492985972, + "grad_norm": 23.393035517434452, + "learning_rate": 9.769241650954507e-06, + "loss": 3.9648, + "step": 2806 + }, + { + "epoch": 0.5625250501002004, + "grad_norm": 17.957482743243855, + "learning_rate": 9.768891419214494e-06, + "loss": 3.9734, + "step": 2807 + }, + { + "epoch": 0.5627254509018036, + "grad_norm": 41.127532680588466, + "learning_rate": 9.768540928182814e-06, + "loss": 4.3382, + "step": 2808 + }, + { + "epoch": 0.5629258517034068, + "grad_norm": 24.048678413046527, + "learning_rate": 9.768190177878525e-06, + "loss": 4.2289, + "step": 2809 + }, + { + "epoch": 0.56312625250501, + "grad_norm": 26.969142254077003, + "learning_rate": 9.767839168320697e-06, + "loss": 4.301, + "step": 2810 + }, + { + "epoch": 0.5633266533066132, + "grad_norm": 50.633377524842565, + "learning_rate": 9.767487899528416e-06, + "loss": 4.1796, + "step": 2811 + }, + { + "epoch": 0.5635270541082165, + "grad_norm": 29.99020806551593, + "learning_rate": 9.76713637152078e-06, + "loss": 4.4776, + "step": 2812 + }, + { + "epoch": 0.5637274549098197, + "grad_norm": 22.385636603270143, + "learning_rate": 9.766784584316902e-06, + "loss": 4.852, + "step": 2813 + }, + { + "epoch": 0.5639278557114229, + "grad_norm": 24.0047435135848, + "learning_rate": 9.766432537935911e-06, + "loss": 4.4805, + "step": 2814 + }, + { + "epoch": 0.564128256513026, + "grad_norm": 22.482254765127195, + "learning_rate": 9.766080232396947e-06, + "loss": 4.0279, + "step": 2815 + }, + { + "epoch": 0.5643286573146292, + "grad_norm": 19.98943496904104, + "learning_rate": 9.765727667719165e-06, + "loss": 4.3453, + "step": 2816 + }, + { + "epoch": 0.5645290581162324, + "grad_norm": 16.984394862010227, + "learning_rate": 9.765374843921733e-06, + "loss": 3.8534, + "step": 2817 + }, + { + "epoch": 0.5647294589178357, + "grad_norm": 21.367702591419448, + "learning_rate": 9.765021761023839e-06, + "loss": 3.8342, + "step": 2818 + }, + { + "epoch": 0.5649298597194389, + "grad_norm": 26.833861084331886, + "learning_rate": 9.764668419044677e-06, + "loss": 4.0635, + "step": 2819 + }, + { + "epoch": 0.5651302605210421, + "grad_norm": 25.651002828812487, + "learning_rate": 9.76431481800346e-06, + "loss": 4.0948, + "step": 2820 + }, + { + "epoch": 0.5653306613226453, + "grad_norm": 24.36496516671261, + "learning_rate": 9.763960957919413e-06, + "loss": 4.6338, + "step": 2821 + }, + { + "epoch": 0.5655310621242485, + "grad_norm": 22.945991520594077, + "learning_rate": 9.763606838811778e-06, + "loss": 4.186, + "step": 2822 + }, + { + "epoch": 0.5657314629258517, + "grad_norm": 30.109002641929226, + "learning_rate": 9.763252460699806e-06, + "loss": 4.0745, + "step": 2823 + }, + { + "epoch": 0.565931863727455, + "grad_norm": 42.38877331775191, + "learning_rate": 9.762897823602767e-06, + "loss": 4.6039, + "step": 2824 + }, + { + "epoch": 0.5661322645290581, + "grad_norm": 20.064526735168293, + "learning_rate": 9.762542927539943e-06, + "loss": 3.6146, + "step": 2825 + }, + { + "epoch": 
0.5663326653306613, + "grad_norm": 22.303774180911542, + "learning_rate": 9.762187772530628e-06, + "loss": 4.3696, + "step": 2826 + }, + { + "epoch": 0.5665330661322645, + "grad_norm": 22.856451692362988, + "learning_rate": 9.761832358594137e-06, + "loss": 4.1235, + "step": 2827 + }, + { + "epoch": 0.5667334669338677, + "grad_norm": 31.74976374583318, + "learning_rate": 9.761476685749789e-06, + "loss": 4.4407, + "step": 2828 + }, + { + "epoch": 0.5669338677354709, + "grad_norm": 27.07687405817371, + "learning_rate": 9.761120754016926e-06, + "loss": 4.3104, + "step": 2829 + }, + { + "epoch": 0.5671342685370742, + "grad_norm": 33.76570093656068, + "learning_rate": 9.760764563414901e-06, + "loss": 4.8029, + "step": 2830 + }, + { + "epoch": 0.5673346693386774, + "grad_norm": 25.107054805943424, + "learning_rate": 9.760408113963078e-06, + "loss": 3.8549, + "step": 2831 + }, + { + "epoch": 0.5675350701402806, + "grad_norm": 34.88834735112885, + "learning_rate": 9.760051405680839e-06, + "loss": 4.2312, + "step": 2832 + }, + { + "epoch": 0.5677354709418838, + "grad_norm": 27.979844071997153, + "learning_rate": 9.759694438587578e-06, + "loss": 4.2565, + "step": 2833 + }, + { + "epoch": 0.5679358717434869, + "grad_norm": 28.79403160958942, + "learning_rate": 9.759337212702703e-06, + "loss": 3.9596, + "step": 2834 + }, + { + "epoch": 0.5681362725450901, + "grad_norm": 39.59436238111865, + "learning_rate": 9.758979728045642e-06, + "loss": 4.4864, + "step": 2835 + }, + { + "epoch": 0.5683366733466934, + "grad_norm": 45.760915928665206, + "learning_rate": 9.758621984635825e-06, + "loss": 4.3051, + "step": 2836 + }, + { + "epoch": 0.5685370741482966, + "grad_norm": 18.77500485498221, + "learning_rate": 9.758263982492709e-06, + "loss": 3.8187, + "step": 2837 + }, + { + "epoch": 0.5687374749498998, + "grad_norm": 29.307949409589796, + "learning_rate": 9.757905721635754e-06, + "loss": 3.942, + "step": 2838 + }, + { + "epoch": 0.568937875751503, + "grad_norm": 37.72051596141967, + "learning_rate": 9.757547202084442e-06, + "loss": 4.77, + "step": 2839 + }, + { + "epoch": 0.5691382765531062, + "grad_norm": 19.609256372169718, + "learning_rate": 9.757188423858268e-06, + "loss": 3.9058, + "step": 2840 + }, + { + "epoch": 0.5693386773547094, + "grad_norm": 23.753625103436505, + "learning_rate": 9.756829386976735e-06, + "loss": 4.1901, + "step": 2841 + }, + { + "epoch": 0.5695390781563127, + "grad_norm": 30.185386476790274, + "learning_rate": 9.756470091459367e-06, + "loss": 4.1207, + "step": 2842 + }, + { + "epoch": 0.5697394789579159, + "grad_norm": 28.275491709090094, + "learning_rate": 9.7561105373257e-06, + "loss": 4.5683, + "step": 2843 + }, + { + "epoch": 0.569939879759519, + "grad_norm": 24.678047949160522, + "learning_rate": 9.75575072459528e-06, + "loss": 4.6815, + "step": 2844 + }, + { + "epoch": 0.5701402805611222, + "grad_norm": 51.93035932684001, + "learning_rate": 9.755390653287675e-06, + "loss": 4.3885, + "step": 2845 + }, + { + "epoch": 0.5703406813627254, + "grad_norm": 26.23901356445324, + "learning_rate": 9.75503032342246e-06, + "loss": 4.1242, + "step": 2846 + }, + { + "epoch": 0.5705410821643286, + "grad_norm": 19.368487780008838, + "learning_rate": 9.754669735019228e-06, + "loss": 4.3382, + "step": 2847 + }, + { + "epoch": 0.5707414829659319, + "grad_norm": 46.087054865011844, + "learning_rate": 9.754308888097583e-06, + "loss": 4.4089, + "step": 2848 + }, + { + "epoch": 0.5709418837675351, + "grad_norm": 48.693684039606346, + "learning_rate": 9.753947782677146e-06, + "loss": 3.6093, + 
"step": 2849 + }, + { + "epoch": 0.5711422845691383, + "grad_norm": 24.965462307707813, + "learning_rate": 9.75358641877755e-06, + "loss": 4.4171, + "step": 2850 + }, + { + "epoch": 0.5713426853707415, + "grad_norm": 25.104942645762964, + "learning_rate": 9.753224796418445e-06, + "loss": 4.0255, + "step": 2851 + }, + { + "epoch": 0.5715430861723447, + "grad_norm": 61.58511066810162, + "learning_rate": 9.752862915619489e-06, + "loss": 4.6423, + "step": 2852 + }, + { + "epoch": 0.571743486973948, + "grad_norm": 24.874854878411526, + "learning_rate": 9.752500776400362e-06, + "loss": 3.8035, + "step": 2853 + }, + { + "epoch": 0.571943887775551, + "grad_norm": 42.82746829211029, + "learning_rate": 9.752138378780754e-06, + "loss": 4.2933, + "step": 2854 + }, + { + "epoch": 0.5721442885771543, + "grad_norm": 49.02685906396632, + "learning_rate": 9.751775722780365e-06, + "loss": 4.8611, + "step": 2855 + }, + { + "epoch": 0.5723446893787575, + "grad_norm": 35.29345276647859, + "learning_rate": 9.751412808418916e-06, + "loss": 4.8633, + "step": 2856 + }, + { + "epoch": 0.5725450901803607, + "grad_norm": 31.91219246138919, + "learning_rate": 9.751049635716141e-06, + "loss": 4.3365, + "step": 2857 + }, + { + "epoch": 0.5727454909819639, + "grad_norm": 24.3938302485068, + "learning_rate": 9.750686204691781e-06, + "loss": 4.1211, + "step": 2858 + }, + { + "epoch": 0.5729458917835671, + "grad_norm": 30.526448700098335, + "learning_rate": 9.750322515365602e-06, + "loss": 4.2947, + "step": 2859 + }, + { + "epoch": 0.5731462925851704, + "grad_norm": 22.79610837112271, + "learning_rate": 9.749958567757375e-06, + "loss": 4.0203, + "step": 2860 + }, + { + "epoch": 0.5733466933867736, + "grad_norm": 25.267118527616717, + "learning_rate": 9.74959436188689e-06, + "loss": 3.8505, + "step": 2861 + }, + { + "epoch": 0.5735470941883768, + "grad_norm": 29.815031560361152, + "learning_rate": 9.749229897773947e-06, + "loss": 3.9265, + "step": 2862 + }, + { + "epoch": 0.57374749498998, + "grad_norm": 28.101539926726765, + "learning_rate": 9.748865175438364e-06, + "loss": 4.4894, + "step": 2863 + }, + { + "epoch": 0.5739478957915831, + "grad_norm": 22.885681590918917, + "learning_rate": 9.74850019489997e-06, + "loss": 3.9771, + "step": 2864 + }, + { + "epoch": 0.5741482965931863, + "grad_norm": 23.46998878187342, + "learning_rate": 9.748134956178615e-06, + "loss": 3.8104, + "step": 2865 + }, + { + "epoch": 0.5743486973947896, + "grad_norm": 25.38124237840336, + "learning_rate": 9.74776945929415e-06, + "loss": 4.3831, + "step": 2866 + }, + { + "epoch": 0.5745490981963928, + "grad_norm": 25.703481792313955, + "learning_rate": 9.747403704266451e-06, + "loss": 3.2654, + "step": 2867 + }, + { + "epoch": 0.574749498997996, + "grad_norm": 27.835210759379912, + "learning_rate": 9.747037691115408e-06, + "loss": 4.2684, + "step": 2868 + }, + { + "epoch": 0.5749498997995992, + "grad_norm": 40.07891651165912, + "learning_rate": 9.746671419860917e-06, + "loss": 3.7208, + "step": 2869 + }, + { + "epoch": 0.5751503006012024, + "grad_norm": 49.640873467892106, + "learning_rate": 9.746304890522893e-06, + "loss": 4.4423, + "step": 2870 + }, + { + "epoch": 0.5753507014028056, + "grad_norm": 69.8327796451002, + "learning_rate": 9.745938103121267e-06, + "loss": 4.5168, + "step": 2871 + }, + { + "epoch": 0.5755511022044089, + "grad_norm": 30.929214436236123, + "learning_rate": 9.74557105767598e-06, + "loss": 3.9616, + "step": 2872 + }, + { + "epoch": 0.5757515030060121, + "grad_norm": 25.93458017935748, + "learning_rate": 9.74520375420699e-06, 
+ "loss": 4.2167, + "step": 2873 + }, + { + "epoch": 0.5759519038076152, + "grad_norm": 23.440492626940667, + "learning_rate": 9.744836192734267e-06, + "loss": 3.8645, + "step": 2874 + }, + { + "epoch": 0.5761523046092184, + "grad_norm": 29.90416941657705, + "learning_rate": 9.744468373277796e-06, + "loss": 3.415, + "step": 2875 + }, + { + "epoch": 0.5763527054108216, + "grad_norm": 21.073412111799254, + "learning_rate": 9.744100295857577e-06, + "loss": 3.7406, + "step": 2876 + }, + { + "epoch": 0.5765531062124248, + "grad_norm": 23.55631058685154, + "learning_rate": 9.743731960493621e-06, + "loss": 3.8976, + "step": 2877 + }, + { + "epoch": 0.576753507014028, + "grad_norm": 23.225173276059955, + "learning_rate": 9.743363367205956e-06, + "loss": 4.4508, + "step": 2878 + }, + { + "epoch": 0.5769539078156313, + "grad_norm": 15.877325230088301, + "learning_rate": 9.742994516014623e-06, + "loss": 4.1746, + "step": 2879 + }, + { + "epoch": 0.5771543086172345, + "grad_norm": 27.92721248544511, + "learning_rate": 9.742625406939679e-06, + "loss": 4.3552, + "step": 2880 + }, + { + "epoch": 0.5773547094188377, + "grad_norm": 27.315097789511558, + "learning_rate": 9.742256040001188e-06, + "loss": 4.4118, + "step": 2881 + }, + { + "epoch": 0.5775551102204409, + "grad_norm": 27.9140445481079, + "learning_rate": 9.741886415219238e-06, + "loss": 4.6471, + "step": 2882 + }, + { + "epoch": 0.5777555110220441, + "grad_norm": 27.957015889077624, + "learning_rate": 9.741516532613921e-06, + "loss": 4.8072, + "step": 2883 + }, + { + "epoch": 0.5779559118236473, + "grad_norm": 18.955370022409138, + "learning_rate": 9.741146392205353e-06, + "loss": 3.966, + "step": 2884 + }, + { + "epoch": 0.5781563126252505, + "grad_norm": 24.79635854991248, + "learning_rate": 9.740775994013658e-06, + "loss": 4.2206, + "step": 2885 + }, + { + "epoch": 0.5783567134268537, + "grad_norm": 26.28317427273813, + "learning_rate": 9.740405338058972e-06, + "loss": 4.4162, + "step": 2886 + }, + { + "epoch": 0.5785571142284569, + "grad_norm": 60.88331900512232, + "learning_rate": 9.740034424361451e-06, + "loss": 4.6374, + "step": 2887 + }, + { + "epoch": 0.5787575150300601, + "grad_norm": 28.995180180653435, + "learning_rate": 9.739663252941263e-06, + "loss": 4.2805, + "step": 2888 + }, + { + "epoch": 0.5789579158316633, + "grad_norm": 40.08607588553318, + "learning_rate": 9.739291823818587e-06, + "loss": 4.3833, + "step": 2889 + }, + { + "epoch": 0.5791583166332666, + "grad_norm": 23.25245288741377, + "learning_rate": 9.738920137013616e-06, + "loss": 3.9185, + "step": 2890 + }, + { + "epoch": 0.5793587174348698, + "grad_norm": 29.618013462916558, + "learning_rate": 9.738548192546564e-06, + "loss": 4.0815, + "step": 2891 + }, + { + "epoch": 0.579559118236473, + "grad_norm": 40.19354981552889, + "learning_rate": 9.738175990437652e-06, + "loss": 3.86, + "step": 2892 + }, + { + "epoch": 0.5797595190380761, + "grad_norm": 35.824631036456296, + "learning_rate": 9.737803530707118e-06, + "loss": 4.0665, + "step": 2893 + }, + { + "epoch": 0.5799599198396793, + "grad_norm": 36.321436894880485, + "learning_rate": 9.73743081337521e-06, + "loss": 4.2002, + "step": 2894 + }, + { + "epoch": 0.5801603206412825, + "grad_norm": 26.009495393147485, + "learning_rate": 9.737057838462198e-06, + "loss": 4.158, + "step": 2895 + }, + { + "epoch": 0.5803607214428858, + "grad_norm": 35.57950395299306, + "learning_rate": 9.736684605988357e-06, + "loss": 4.5283, + "step": 2896 + }, + { + "epoch": 0.580561122244489, + "grad_norm": 28.88284740082898, + "learning_rate": 
9.736311115973985e-06, + "loss": 4.2972, + "step": 2897 + }, + { + "epoch": 0.5807615230460922, + "grad_norm": 35.625360431105875, + "learning_rate": 9.735937368439383e-06, + "loss": 4.335, + "step": 2898 + }, + { + "epoch": 0.5809619238476954, + "grad_norm": 41.93436447906826, + "learning_rate": 9.735563363404876e-06, + "loss": 4.4801, + "step": 2899 + }, + { + "epoch": 0.5811623246492986, + "grad_norm": 18.590572370430536, + "learning_rate": 9.7351891008908e-06, + "loss": 4.1677, + "step": 2900 + }, + { + "epoch": 0.5813627254509018, + "grad_norm": 31.143287029095905, + "learning_rate": 9.734814580917501e-06, + "loss": 4.3359, + "step": 2901 + }, + { + "epoch": 0.5815631262525051, + "grad_norm": 26.256019016736072, + "learning_rate": 9.734439803505345e-06, + "loss": 4.5179, + "step": 2902 + }, + { + "epoch": 0.5817635270541082, + "grad_norm": 35.61310333299306, + "learning_rate": 9.734064768674709e-06, + "loss": 4.2526, + "step": 2903 + }, + { + "epoch": 0.5819639278557114, + "grad_norm": 18.190678321726292, + "learning_rate": 9.733689476445982e-06, + "loss": 3.8874, + "step": 2904 + }, + { + "epoch": 0.5821643286573146, + "grad_norm": 25.07763046754839, + "learning_rate": 9.733313926839571e-06, + "loss": 4.125, + "step": 2905 + }, + { + "epoch": 0.5823647294589178, + "grad_norm": 25.062872725140306, + "learning_rate": 9.732938119875894e-06, + "loss": 4.0412, + "step": 2906 + }, + { + "epoch": 0.582565130260521, + "grad_norm": 27.74152033983986, + "learning_rate": 9.732562055575387e-06, + "loss": 4.4346, + "step": 2907 + }, + { + "epoch": 0.5827655310621243, + "grad_norm": 18.65279703182356, + "learning_rate": 9.732185733958493e-06, + "loss": 3.8242, + "step": 2908 + }, + { + "epoch": 0.5829659318637275, + "grad_norm": 18.150498917912977, + "learning_rate": 9.731809155045678e-06, + "loss": 3.7029, + "step": 2909 + }, + { + "epoch": 0.5831663326653307, + "grad_norm": 33.01897956313422, + "learning_rate": 9.731432318857412e-06, + "loss": 4.0873, + "step": 2910 + }, + { + "epoch": 0.5833667334669339, + "grad_norm": 41.176733813452536, + "learning_rate": 9.731055225414188e-06, + "loss": 4.9033, + "step": 2911 + }, + { + "epoch": 0.5835671342685371, + "grad_norm": 23.97870778520062, + "learning_rate": 9.730677874736507e-06, + "loss": 3.8745, + "step": 2912 + }, + { + "epoch": 0.5837675350701402, + "grad_norm": 22.41727046095112, + "learning_rate": 9.730300266844885e-06, + "loss": 4.2243, + "step": 2913 + }, + { + "epoch": 0.5839679358717434, + "grad_norm": 32.14635384434395, + "learning_rate": 9.729922401759856e-06, + "loss": 4.3872, + "step": 2914 + }, + { + "epoch": 0.5841683366733467, + "grad_norm": 24.15795381363594, + "learning_rate": 9.729544279501966e-06, + "loss": 4.1975, + "step": 2915 + }, + { + "epoch": 0.5843687374749499, + "grad_norm": 44.86797390095693, + "learning_rate": 9.729165900091771e-06, + "loss": 4.5328, + "step": 2916 + }, + { + "epoch": 0.5845691382765531, + "grad_norm": 25.579421610723855, + "learning_rate": 9.728787263549846e-06, + "loss": 3.5669, + "step": 2917 + }, + { + "epoch": 0.5847695390781563, + "grad_norm": 23.61556797814255, + "learning_rate": 9.728408369896776e-06, + "loss": 4.2352, + "step": 2918 + }, + { + "epoch": 0.5849699398797595, + "grad_norm": 23.397646586646943, + "learning_rate": 9.728029219153163e-06, + "loss": 4.4411, + "step": 2919 + }, + { + "epoch": 0.5851703406813628, + "grad_norm": 28.089797936288306, + "learning_rate": 9.727649811339621e-06, + "loss": 4.61, + "step": 2920 + }, + { + "epoch": 0.585370741482966, + "grad_norm": 
31.29509966961669, + "learning_rate": 9.727270146476783e-06, + "loss": 4.0701, + "step": 2921 + }, + { + "epoch": 0.5855711422845692, + "grad_norm": 34.45089569603105, + "learning_rate": 9.726890224585288e-06, + "loss": 4.7841, + "step": 2922 + }, + { + "epoch": 0.5857715430861723, + "grad_norm": 24.703736457849516, + "learning_rate": 9.726510045685792e-06, + "loss": 4.2908, + "step": 2923 + }, + { + "epoch": 0.5859719438877755, + "grad_norm": 24.206135382267128, + "learning_rate": 9.72612960979897e-06, + "loss": 3.9396, + "step": 2924 + }, + { + "epoch": 0.5861723446893787, + "grad_norm": 18.199328579792382, + "learning_rate": 9.725748916945504e-06, + "loss": 4.1867, + "step": 2925 + }, + { + "epoch": 0.586372745490982, + "grad_norm": 25.983407907476295, + "learning_rate": 9.725367967146095e-06, + "loss": 4.2978, + "step": 2926 + }, + { + "epoch": 0.5865731462925852, + "grad_norm": 20.039260686589788, + "learning_rate": 9.724986760421453e-06, + "loss": 4.0987, + "step": 2927 + }, + { + "epoch": 0.5867735470941884, + "grad_norm": 27.305392851143793, + "learning_rate": 9.724605296792305e-06, + "loss": 4.6166, + "step": 2928 + }, + { + "epoch": 0.5869739478957916, + "grad_norm": 21.319548061844593, + "learning_rate": 9.724223576279395e-06, + "loss": 4.0148, + "step": 2929 + }, + { + "epoch": 0.5871743486973948, + "grad_norm": 22.212903178159603, + "learning_rate": 9.723841598903474e-06, + "loss": 4.4976, + "step": 2930 + }, + { + "epoch": 0.587374749498998, + "grad_norm": 20.433276252793174, + "learning_rate": 9.723459364685313e-06, + "loss": 4.118, + "step": 2931 + }, + { + "epoch": 0.5875751503006013, + "grad_norm": 52.20822566572159, + "learning_rate": 9.723076873645694e-06, + "loss": 3.7468, + "step": 2932 + }, + { + "epoch": 0.5877755511022044, + "grad_norm": 20.780150490709467, + "learning_rate": 9.722694125805412e-06, + "loss": 3.8, + "step": 2933 + }, + { + "epoch": 0.5879759519038076, + "grad_norm": 27.199631821029197, + "learning_rate": 9.72231112118528e-06, + "loss": 4.1934, + "step": 2934 + }, + { + "epoch": 0.5881763527054108, + "grad_norm": 40.21056197934216, + "learning_rate": 9.721927859806123e-06, + "loss": 4.5898, + "step": 2935 + }, + { + "epoch": 0.588376753507014, + "grad_norm": 23.14344681770715, + "learning_rate": 9.721544341688775e-06, + "loss": 3.8767, + "step": 2936 + }, + { + "epoch": 0.5885771543086172, + "grad_norm": 29.07184009326207, + "learning_rate": 9.721160566854094e-06, + "loss": 4.7531, + "step": 2937 + }, + { + "epoch": 0.5887775551102205, + "grad_norm": 29.82991597605241, + "learning_rate": 9.720776535322943e-06, + "loss": 4.205, + "step": 2938 + }, + { + "epoch": 0.5889779559118237, + "grad_norm": 24.596813509688978, + "learning_rate": 9.720392247116203e-06, + "loss": 3.9176, + "step": 2939 + }, + { + "epoch": 0.5891783567134269, + "grad_norm": 72.57659149455034, + "learning_rate": 9.72000770225477e-06, + "loss": 4.6985, + "step": 2940 + }, + { + "epoch": 0.5893787575150301, + "grad_norm": 29.331952892102834, + "learning_rate": 9.71962290075955e-06, + "loss": 4.6833, + "step": 2941 + }, + { + "epoch": 0.5895791583166332, + "grad_norm": 32.33098591461034, + "learning_rate": 9.719237842651466e-06, + "loss": 4.3439, + "step": 2942 + }, + { + "epoch": 0.5897795591182364, + "grad_norm": 23.701507070181542, + "learning_rate": 9.718852527951455e-06, + "loss": 4.2706, + "step": 2943 + }, + { + "epoch": 0.5899799599198396, + "grad_norm": 32.14216496767171, + "learning_rate": 9.718466956680465e-06, + "loss": 4.2777, + "step": 2944 + }, + { + "epoch": 
0.5901803607214429, + "grad_norm": 28.545026664733452, + "learning_rate": 9.718081128859462e-06, + "loss": 4.2562, + "step": 2945 + }, + { + "epoch": 0.5903807615230461, + "grad_norm": 31.599362249456487, + "learning_rate": 9.717695044509423e-06, + "loss": 4.3076, + "step": 2946 + }, + { + "epoch": 0.5905811623246493, + "grad_norm": 43.1084459102273, + "learning_rate": 9.717308703651343e-06, + "loss": 4.6955, + "step": 2947 + }, + { + "epoch": 0.5907815631262525, + "grad_norm": 21.8516657590052, + "learning_rate": 9.716922106306223e-06, + "loss": 3.9114, + "step": 2948 + }, + { + "epoch": 0.5909819639278557, + "grad_norm": 25.533978886110127, + "learning_rate": 9.716535252495085e-06, + "loss": 4.0211, + "step": 2949 + }, + { + "epoch": 0.591182364729459, + "grad_norm": 21.047923710505366, + "learning_rate": 9.716148142238964e-06, + "loss": 3.6824, + "step": 2950 + }, + { + "epoch": 0.5913827655310622, + "grad_norm": 26.015446488716513, + "learning_rate": 9.715760775558906e-06, + "loss": 4.0288, + "step": 2951 + }, + { + "epoch": 0.5915831663326653, + "grad_norm": 21.60495386172023, + "learning_rate": 9.715373152475976e-06, + "loss": 3.736, + "step": 2952 + }, + { + "epoch": 0.5917835671342685, + "grad_norm": 31.083551558362295, + "learning_rate": 9.714985273011243e-06, + "loss": 4.3237, + "step": 2953 + }, + { + "epoch": 0.5919839679358717, + "grad_norm": 30.480426036089188, + "learning_rate": 9.714597137185804e-06, + "loss": 4.5573, + "step": 2954 + }, + { + "epoch": 0.5921843687374749, + "grad_norm": 23.117705471964644, + "learning_rate": 9.714208745020759e-06, + "loss": 3.8679, + "step": 2955 + }, + { + "epoch": 0.5923847695390781, + "grad_norm": 30.597546647847008, + "learning_rate": 9.713820096537226e-06, + "loss": 4.6627, + "step": 2956 + }, + { + "epoch": 0.5925851703406814, + "grad_norm": 23.119572864705876, + "learning_rate": 9.713431191756334e-06, + "loss": 4.3656, + "step": 2957 + }, + { + "epoch": 0.5927855711422846, + "grad_norm": 21.336499039887467, + "learning_rate": 9.713042030699232e-06, + "loss": 3.8864, + "step": 2958 + }, + { + "epoch": 0.5929859719438878, + "grad_norm": 23.0946659289019, + "learning_rate": 9.712652613387077e-06, + "loss": 4.3643, + "step": 2959 + }, + { + "epoch": 0.593186372745491, + "grad_norm": 27.27328007099386, + "learning_rate": 9.712262939841044e-06, + "loss": 4.0879, + "step": 2960 + }, + { + "epoch": 0.5933867735470942, + "grad_norm": 23.338875955535702, + "learning_rate": 9.711873010082318e-06, + "loss": 4.3608, + "step": 2961 + }, + { + "epoch": 0.5935871743486973, + "grad_norm": 25.21994349302126, + "learning_rate": 9.711482824132101e-06, + "loss": 4.2415, + "step": 2962 + }, + { + "epoch": 0.5937875751503006, + "grad_norm": 22.53803755355099, + "learning_rate": 9.71109238201161e-06, + "loss": 3.938, + "step": 2963 + }, + { + "epoch": 0.5939879759519038, + "grad_norm": 23.686654117303696, + "learning_rate": 9.710701683742069e-06, + "loss": 4.0899, + "step": 2964 + }, + { + "epoch": 0.594188376753507, + "grad_norm": 31.446687069325353, + "learning_rate": 9.710310729344726e-06, + "loss": 4.6201, + "step": 2965 + }, + { + "epoch": 0.5943887775551102, + "grad_norm": 30.827095872933977, + "learning_rate": 9.709919518840833e-06, + "loss": 4.0851, + "step": 2966 + }, + { + "epoch": 0.5945891783567134, + "grad_norm": 28.708272878490803, + "learning_rate": 9.709528052251666e-06, + "loss": 3.9751, + "step": 2967 + }, + { + "epoch": 0.5947895791583167, + "grad_norm": 24.414504123164686, + "learning_rate": 9.709136329598506e-06, + "loss": 3.7295, + 
"step": 2968 + }, + { + "epoch": 0.5949899799599199, + "grad_norm": 26.16843016311996, + "learning_rate": 9.708744350902653e-06, + "loss": 4.1066, + "step": 2969 + }, + { + "epoch": 0.5951903807615231, + "grad_norm": 59.93220617116143, + "learning_rate": 9.708352116185418e-06, + "loss": 4.7305, + "step": 2970 + }, + { + "epoch": 0.5953907815631263, + "grad_norm": 17.75382382641389, + "learning_rate": 9.70795962546813e-06, + "loss": 3.8161, + "step": 2971 + }, + { + "epoch": 0.5955911823647294, + "grad_norm": 22.786339967762302, + "learning_rate": 9.707566878772126e-06, + "loss": 4.1656, + "step": 2972 + }, + { + "epoch": 0.5957915831663326, + "grad_norm": 24.73575791405567, + "learning_rate": 9.707173876118761e-06, + "loss": 4.142, + "step": 2973 + }, + { + "epoch": 0.5959919839679358, + "grad_norm": 17.80558777491865, + "learning_rate": 9.706780617529406e-06, + "loss": 3.6477, + "step": 2974 + }, + { + "epoch": 0.5961923847695391, + "grad_norm": 30.444721983216503, + "learning_rate": 9.70638710302544e-06, + "loss": 4.6144, + "step": 2975 + }, + { + "epoch": 0.5963927855711423, + "grad_norm": 25.3884629533726, + "learning_rate": 9.705993332628259e-06, + "loss": 4.0067, + "step": 2976 + }, + { + "epoch": 0.5965931863727455, + "grad_norm": 32.560446030903265, + "learning_rate": 9.705599306359274e-06, + "loss": 4.6916, + "step": 2977 + }, + { + "epoch": 0.5967935871743487, + "grad_norm": 20.872627424225396, + "learning_rate": 9.70520502423991e-06, + "loss": 4.1888, + "step": 2978 + }, + { + "epoch": 0.5969939879759519, + "grad_norm": 42.71527243683257, + "learning_rate": 9.704810486291603e-06, + "loss": 4.9823, + "step": 2979 + }, + { + "epoch": 0.5971943887775552, + "grad_norm": 24.037519989731837, + "learning_rate": 9.704415692535804e-06, + "loss": 3.929, + "step": 2980 + }, + { + "epoch": 0.5973947895791584, + "grad_norm": 22.73171323173701, + "learning_rate": 9.70402064299398e-06, + "loss": 4.2521, + "step": 2981 + }, + { + "epoch": 0.5975951903807615, + "grad_norm": 54.537314273607585, + "learning_rate": 9.703625337687609e-06, + "loss": 3.8274, + "step": 2982 + }, + { + "epoch": 0.5977955911823647, + "grad_norm": 20.36867927398828, + "learning_rate": 9.703229776638185e-06, + "loss": 3.6205, + "step": 2983 + }, + { + "epoch": 0.5979959919839679, + "grad_norm": 30.073421552133485, + "learning_rate": 9.702833959867217e-06, + "loss": 4.0127, + "step": 2984 + }, + { + "epoch": 0.5981963927855711, + "grad_norm": 32.24737612406889, + "learning_rate": 9.702437887396225e-06, + "loss": 4.5994, + "step": 2985 + }, + { + "epoch": 0.5983967935871743, + "grad_norm": 23.644418891928183, + "learning_rate": 9.702041559246743e-06, + "loss": 4.1593, + "step": 2986 + }, + { + "epoch": 0.5985971943887776, + "grad_norm": 23.888418200373593, + "learning_rate": 9.70164497544032e-06, + "loss": 4.4121, + "step": 2987 + }, + { + "epoch": 0.5987975951903808, + "grad_norm": 15.298804110238205, + "learning_rate": 9.70124813599852e-06, + "loss": 3.7674, + "step": 2988 + }, + { + "epoch": 0.598997995991984, + "grad_norm": 39.295578356755094, + "learning_rate": 9.700851040942918e-06, + "loss": 4.5264, + "step": 2989 + }, + { + "epoch": 0.5991983967935872, + "grad_norm": 28.375048493763764, + "learning_rate": 9.700453690295105e-06, + "loss": 4.3198, + "step": 2990 + }, + { + "epoch": 0.5993987975951904, + "grad_norm": 25.17357720989645, + "learning_rate": 9.70005608407669e-06, + "loss": 3.684, + "step": 2991 + }, + { + "epoch": 0.5995991983967935, + "grad_norm": 34.37973345533918, + "learning_rate": 
9.699658222309286e-06, + "loss": 4.9473, + "step": 2992 + }, + { + "epoch": 0.5997995991983968, + "grad_norm": 22.542188425088955, + "learning_rate": 9.699260105014528e-06, + "loss": 3.7283, + "step": 2993 + }, + { + "epoch": 0.6, + "grad_norm": 28.47823301488714, + "learning_rate": 9.698861732214058e-06, + "loss": 4.2469, + "step": 2994 + }, + { + "epoch": 0.6002004008016032, + "grad_norm": 38.202910696562576, + "learning_rate": 9.698463103929542e-06, + "loss": 4.0892, + "step": 2995 + }, + { + "epoch": 0.6004008016032064, + "grad_norm": 37.759237664810634, + "learning_rate": 9.698064220182653e-06, + "loss": 4.5575, + "step": 2996 + }, + { + "epoch": 0.6006012024048096, + "grad_norm": 27.57817933248699, + "learning_rate": 9.697665080995074e-06, + "loss": 3.975, + "step": 2997 + }, + { + "epoch": 0.6008016032064128, + "grad_norm": 23.65613670470337, + "learning_rate": 9.697265686388513e-06, + "loss": 4.4265, + "step": 2998 + }, + { + "epoch": 0.6010020040080161, + "grad_norm": 27.74011673571849, + "learning_rate": 9.69686603638468e-06, + "loss": 3.9993, + "step": 2999 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 34.102810866352954, + "learning_rate": 9.69646613100531e-06, + "loss": 4.514, + "step": 3000 + }, + { + "epoch": 0.6014028056112224, + "grad_norm": 18.610211601295077, + "learning_rate": 9.696065970272143e-06, + "loss": 4.0432, + "step": 3001 + }, + { + "epoch": 0.6016032064128256, + "grad_norm": 28.751982114221047, + "learning_rate": 9.695665554206937e-06, + "loss": 4.19, + "step": 3002 + }, + { + "epoch": 0.6018036072144288, + "grad_norm": 53.17694708083377, + "learning_rate": 9.695264882831463e-06, + "loss": 4.3686, + "step": 3003 + }, + { + "epoch": 0.602004008016032, + "grad_norm": 38.82742625104106, + "learning_rate": 9.694863956167506e-06, + "loss": 3.8356, + "step": 3004 + }, + { + "epoch": 0.6022044088176353, + "grad_norm": 33.91061080502567, + "learning_rate": 9.694462774236867e-06, + "loss": 4.3643, + "step": 3005 + }, + { + "epoch": 0.6024048096192385, + "grad_norm": 23.811159770999836, + "learning_rate": 9.694061337061356e-06, + "loss": 4.0954, + "step": 3006 + }, + { + "epoch": 0.6026052104208417, + "grad_norm": 36.2685829446584, + "learning_rate": 9.693659644662803e-06, + "loss": 4.6326, + "step": 3007 + }, + { + "epoch": 0.6028056112224449, + "grad_norm": 42.41973808792667, + "learning_rate": 9.693257697063045e-06, + "loss": 5.0652, + "step": 3008 + }, + { + "epoch": 0.6030060120240481, + "grad_norm": 23.211566077515336, + "learning_rate": 9.692855494283937e-06, + "loss": 4.1655, + "step": 3009 + }, + { + "epoch": 0.6032064128256514, + "grad_norm": 19.444218392936815, + "learning_rate": 9.692453036347352e-06, + "loss": 3.9692, + "step": 3010 + }, + { + "epoch": 0.6034068136272545, + "grad_norm": 18.3242964620238, + "learning_rate": 9.692050323275166e-06, + "loss": 4.1131, + "step": 3011 + }, + { + "epoch": 0.6036072144288577, + "grad_norm": 32.472357322806374, + "learning_rate": 9.691647355089277e-06, + "loss": 4.9665, + "step": 3012 + }, + { + "epoch": 0.6038076152304609, + "grad_norm": 22.008878761249434, + "learning_rate": 9.691244131811598e-06, + "loss": 3.9904, + "step": 3013 + }, + { + "epoch": 0.6040080160320641, + "grad_norm": 28.00027255686149, + "learning_rate": 9.69084065346405e-06, + "loss": 4.3107, + "step": 3014 + }, + { + "epoch": 0.6042084168336673, + "grad_norm": 22.030399032977076, + "learning_rate": 9.690436920068571e-06, + "loss": 3.9443, + "step": 3015 + }, + { + "epoch": 0.6044088176352705, + "grad_norm": 19.804232988580818, + 
"learning_rate": 9.690032931647113e-06, + "loss": 4.0894, + "step": 3016 + }, + { + "epoch": 0.6046092184368738, + "grad_norm": 28.10881344976774, + "learning_rate": 9.689628688221642e-06, + "loss": 4.0821, + "step": 3017 + }, + { + "epoch": 0.604809619238477, + "grad_norm": 19.35598427685409, + "learning_rate": 9.689224189814135e-06, + "loss": 4.1913, + "step": 3018 + }, + { + "epoch": 0.6050100200400802, + "grad_norm": 20.49184481070053, + "learning_rate": 9.688819436446589e-06, + "loss": 4.7543, + "step": 3019 + }, + { + "epoch": 0.6052104208416834, + "grad_norm": 22.92781646606036, + "learning_rate": 9.688414428141008e-06, + "loss": 4.1158, + "step": 3020 + }, + { + "epoch": 0.6054108216432865, + "grad_norm": 23.76962623926763, + "learning_rate": 9.688009164919414e-06, + "loss": 4.3077, + "step": 3021 + }, + { + "epoch": 0.6056112224448897, + "grad_norm": 35.37670719569397, + "learning_rate": 9.68760364680384e-06, + "loss": 4.9676, + "step": 3022 + }, + { + "epoch": 0.605811623246493, + "grad_norm": 22.137351590631045, + "learning_rate": 9.687197873816339e-06, + "loss": 3.7745, + "step": 3023 + }, + { + "epoch": 0.6060120240480962, + "grad_norm": 25.40272023957885, + "learning_rate": 9.68679184597897e-06, + "loss": 4.0758, + "step": 3024 + }, + { + "epoch": 0.6062124248496994, + "grad_norm": 21.249492639581494, + "learning_rate": 9.686385563313807e-06, + "loss": 3.7748, + "step": 3025 + }, + { + "epoch": 0.6064128256513026, + "grad_norm": 21.913317350275065, + "learning_rate": 9.685979025842946e-06, + "loss": 3.9401, + "step": 3026 + }, + { + "epoch": 0.6066132264529058, + "grad_norm": 16.03001395865112, + "learning_rate": 9.685572233588488e-06, + "loss": 3.9255, + "step": 3027 + }, + { + "epoch": 0.606813627254509, + "grad_norm": 21.221169303214744, + "learning_rate": 9.68516518657255e-06, + "loss": 3.5019, + "step": 3028 + }, + { + "epoch": 0.6070140280561123, + "grad_norm": 25.394992557852714, + "learning_rate": 9.684757884817266e-06, + "loss": 4.277, + "step": 3029 + }, + { + "epoch": 0.6072144288577155, + "grad_norm": 23.5301727651187, + "learning_rate": 9.68435032834478e-06, + "loss": 4.2192, + "step": 3030 + }, + { + "epoch": 0.6074148296593186, + "grad_norm": 21.46432863636985, + "learning_rate": 9.683942517177252e-06, + "loss": 4.0525, + "step": 3031 + }, + { + "epoch": 0.6076152304609218, + "grad_norm": 31.245161729499397, + "learning_rate": 9.683534451336855e-06, + "loss": 5.3381, + "step": 3032 + }, + { + "epoch": 0.607815631262525, + "grad_norm": 29.053024090053075, + "learning_rate": 9.683126130845777e-06, + "loss": 4.1642, + "step": 3033 + }, + { + "epoch": 0.6080160320641282, + "grad_norm": 30.946960197534445, + "learning_rate": 9.682717555726217e-06, + "loss": 4.8163, + "step": 3034 + }, + { + "epoch": 0.6082164328657315, + "grad_norm": 32.920128182266986, + "learning_rate": 9.682308726000393e-06, + "loss": 4.3575, + "step": 3035 + }, + { + "epoch": 0.6084168336673347, + "grad_norm": 26.451040584481586, + "learning_rate": 9.681899641690531e-06, + "loss": 4.7807, + "step": 3036 + }, + { + "epoch": 0.6086172344689379, + "grad_norm": 32.667067194019275, + "learning_rate": 9.681490302818874e-06, + "loss": 4.2377, + "step": 3037 + }, + { + "epoch": 0.6088176352705411, + "grad_norm": 23.49639888901289, + "learning_rate": 9.68108070940768e-06, + "loss": 4.0264, + "step": 3038 + }, + { + "epoch": 0.6090180360721443, + "grad_norm": 25.367666367432623, + "learning_rate": 9.680670861479216e-06, + "loss": 4.1515, + "step": 3039 + }, + { + "epoch": 0.6092184368737475, + 
"grad_norm": 27.25374792783396, + "learning_rate": 9.680260759055769e-06, + "loss": 4.203, + "step": 3040 + }, + { + "epoch": 0.6094188376753507, + "grad_norm": 22.43580784352024, + "learning_rate": 9.679850402159636e-06, + "loss": 4.1161, + "step": 3041 + }, + { + "epoch": 0.6096192384769539, + "grad_norm": 26.355999609511436, + "learning_rate": 9.679439790813128e-06, + "loss": 4.6685, + "step": 3042 + }, + { + "epoch": 0.6098196392785571, + "grad_norm": 45.09983066521594, + "learning_rate": 9.679028925038571e-06, + "loss": 4.7269, + "step": 3043 + }, + { + "epoch": 0.6100200400801603, + "grad_norm": 27.204605995317444, + "learning_rate": 9.678617804858305e-06, + "loss": 4.3738, + "step": 3044 + }, + { + "epoch": 0.6102204408817635, + "grad_norm": 31.39926068089379, + "learning_rate": 9.678206430294682e-06, + "loss": 4.4341, + "step": 3045 + }, + { + "epoch": 0.6104208416833667, + "grad_norm": 25.43297564943211, + "learning_rate": 9.677794801370071e-06, + "loss": 4.1047, + "step": 3046 + }, + { + "epoch": 0.61062124248497, + "grad_norm": 33.57596392769465, + "learning_rate": 9.67738291810685e-06, + "loss": 4.0727, + "step": 3047 + }, + { + "epoch": 0.6108216432865732, + "grad_norm": 23.09516694975356, + "learning_rate": 9.676970780527415e-06, + "loss": 4.4996, + "step": 3048 + }, + { + "epoch": 0.6110220440881764, + "grad_norm": 55.86516405490781, + "learning_rate": 9.676558388654176e-06, + "loss": 4.1798, + "step": 3049 + }, + { + "epoch": 0.6112224448897795, + "grad_norm": 35.29001515084557, + "learning_rate": 9.676145742509554e-06, + "loss": 4.7412, + "step": 3050 + }, + { + "epoch": 0.6114228456913827, + "grad_norm": 25.30850026059117, + "learning_rate": 9.675732842115986e-06, + "loss": 4.3159, + "step": 3051 + }, + { + "epoch": 0.6116232464929859, + "grad_norm": 26.480127596589536, + "learning_rate": 9.67531968749592e-06, + "loss": 4.1367, + "step": 3052 + }, + { + "epoch": 0.6118236472945892, + "grad_norm": 23.4052187228872, + "learning_rate": 9.674906278671823e-06, + "loss": 4.6958, + "step": 3053 + }, + { + "epoch": 0.6120240480961924, + "grad_norm": 34.9158133163623, + "learning_rate": 9.67449261566617e-06, + "loss": 4.2516, + "step": 3054 + }, + { + "epoch": 0.6122244488977956, + "grad_norm": 24.8506763874831, + "learning_rate": 9.674078698501452e-06, + "loss": 4.1613, + "step": 3055 + }, + { + "epoch": 0.6124248496993988, + "grad_norm": 21.050279954940784, + "learning_rate": 9.673664527200178e-06, + "loss": 4.0147, + "step": 3056 + }, + { + "epoch": 0.612625250501002, + "grad_norm": 29.092885342998457, + "learning_rate": 9.673250101784864e-06, + "loss": 4.4684, + "step": 3057 + }, + { + "epoch": 0.6128256513026052, + "grad_norm": 33.78399833641505, + "learning_rate": 9.672835422278041e-06, + "loss": 4.2412, + "step": 3058 + }, + { + "epoch": 0.6130260521042085, + "grad_norm": 24.968438279600594, + "learning_rate": 9.672420488702261e-06, + "loss": 4.2798, + "step": 3059 + }, + { + "epoch": 0.6132264529058116, + "grad_norm": 36.5253674777602, + "learning_rate": 9.672005301080082e-06, + "loss": 4.3653, + "step": 3060 + }, + { + "epoch": 0.6134268537074148, + "grad_norm": 22.37277423042699, + "learning_rate": 9.671589859434078e-06, + "loss": 4.0439, + "step": 3061 + }, + { + "epoch": 0.613627254509018, + "grad_norm": 58.15036514841903, + "learning_rate": 9.671174163786836e-06, + "loss": 4.9292, + "step": 3062 + }, + { + "epoch": 0.6138276553106212, + "grad_norm": 23.165684668180674, + "learning_rate": 9.67075821416096e-06, + "loss": 3.7982, + "step": 3063 + }, + { + "epoch": 
0.6140280561122244, + "grad_norm": 18.970550583312736, + "learning_rate": 9.670342010579067e-06, + "loss": 3.6324, + "step": 3064 + }, + { + "epoch": 0.6142284569138277, + "grad_norm": 18.173477253790473, + "learning_rate": 9.669925553063782e-06, + "loss": 3.7246, + "step": 3065 + }, + { + "epoch": 0.6144288577154309, + "grad_norm": 23.58359403429582, + "learning_rate": 9.669508841637754e-06, + "loss": 3.7913, + "step": 3066 + }, + { + "epoch": 0.6146292585170341, + "grad_norm": 31.92613859691851, + "learning_rate": 9.669091876323635e-06, + "loss": 4.1624, + "step": 3067 + }, + { + "epoch": 0.6148296593186373, + "grad_norm": 21.941647715592268, + "learning_rate": 9.6686746571441e-06, + "loss": 4.1156, + "step": 3068 + }, + { + "epoch": 0.6150300601202405, + "grad_norm": 23.5388960297402, + "learning_rate": 9.668257184121831e-06, + "loss": 4.2455, + "step": 3069 + }, + { + "epoch": 0.6152304609218436, + "grad_norm": 41.78752405559999, + "learning_rate": 9.66783945727953e-06, + "loss": 4.4515, + "step": 3070 + }, + { + "epoch": 0.6154308617234469, + "grad_norm": 76.12594750375578, + "learning_rate": 9.667421476639906e-06, + "loss": 4.497, + "step": 3071 + }, + { + "epoch": 0.6156312625250501, + "grad_norm": 25.470045189589328, + "learning_rate": 9.667003242225687e-06, + "loss": 3.9829, + "step": 3072 + }, + { + "epoch": 0.6158316633266533, + "grad_norm": 62.97918718940048, + "learning_rate": 9.666584754059612e-06, + "loss": 4.5707, + "step": 3073 + }, + { + "epoch": 0.6160320641282565, + "grad_norm": 23.020306270948012, + "learning_rate": 9.666166012164434e-06, + "loss": 3.76, + "step": 3074 + }, + { + "epoch": 0.6162324649298597, + "grad_norm": 21.000299983624554, + "learning_rate": 9.665747016562924e-06, + "loss": 4.343, + "step": 3075 + }, + { + "epoch": 0.6164328657314629, + "grad_norm": 35.76366401552435, + "learning_rate": 9.665327767277864e-06, + "loss": 4.6204, + "step": 3076 + }, + { + "epoch": 0.6166332665330662, + "grad_norm": 26.499363360785967, + "learning_rate": 9.664908264332042e-06, + "loss": 4.1621, + "step": 3077 + }, + { + "epoch": 0.6168336673346694, + "grad_norm": 24.13290109029855, + "learning_rate": 9.664488507748273e-06, + "loss": 3.9743, + "step": 3078 + }, + { + "epoch": 0.6170340681362726, + "grad_norm": 25.629318755190816, + "learning_rate": 9.66406849754938e-06, + "loss": 3.8878, + "step": 3079 + }, + { + "epoch": 0.6172344689378757, + "grad_norm": 24.721421480689926, + "learning_rate": 9.663648233758198e-06, + "loss": 4.1516, + "step": 3080 + }, + { + "epoch": 0.6174348697394789, + "grad_norm": 31.808791477291123, + "learning_rate": 9.663227716397575e-06, + "loss": 5.0111, + "step": 3081 + }, + { + "epoch": 0.6176352705410821, + "grad_norm": 23.06160619561942, + "learning_rate": 9.66280694549038e-06, + "loss": 3.7943, + "step": 3082 + }, + { + "epoch": 0.6178356713426854, + "grad_norm": 30.468964486619715, + "learning_rate": 9.662385921059486e-06, + "loss": 3.9009, + "step": 3083 + }, + { + "epoch": 0.6180360721442886, + "grad_norm": 26.922239890614783, + "learning_rate": 9.661964643127788e-06, + "loss": 4.4353, + "step": 3084 + }, + { + "epoch": 0.6182364729458918, + "grad_norm": 75.20121974887164, + "learning_rate": 9.66154311171819e-06, + "loss": 4.6721, + "step": 3085 + }, + { + "epoch": 0.618436873747495, + "grad_norm": 49.18647098578546, + "learning_rate": 9.661121326853615e-06, + "loss": 4.3528, + "step": 3086 + }, + { + "epoch": 0.6186372745490982, + "grad_norm": 22.39852100656736, + "learning_rate": 9.660699288556989e-06, + "loss": 4.2814, + "step": 
3087 + }, + { + "epoch": 0.6188376753507014, + "grad_norm": 27.4791574816454, + "learning_rate": 9.660276996851265e-06, + "loss": 4.5215, + "step": 3088 + }, + { + "epoch": 0.6190380761523047, + "grad_norm": 25.510798081836807, + "learning_rate": 9.659854451759403e-06, + "loss": 4.4822, + "step": 3089 + }, + { + "epoch": 0.6192384769539078, + "grad_norm": 18.451685796566572, + "learning_rate": 9.659431653304373e-06, + "loss": 3.9925, + "step": 3090 + }, + { + "epoch": 0.619438877755511, + "grad_norm": 40.470159958383775, + "learning_rate": 9.659008601509167e-06, + "loss": 5.1069, + "step": 3091 + }, + { + "epoch": 0.6196392785571142, + "grad_norm": 24.255134812896312, + "learning_rate": 9.658585296396788e-06, + "loss": 3.989, + "step": 3092 + }, + { + "epoch": 0.6198396793587174, + "grad_norm": 34.859738694099015, + "learning_rate": 9.658161737990247e-06, + "loss": 4.4134, + "step": 3093 + }, + { + "epoch": 0.6200400801603206, + "grad_norm": 21.497125700460604, + "learning_rate": 9.657737926312577e-06, + "loss": 4.0922, + "step": 3094 + }, + { + "epoch": 0.6202404809619239, + "grad_norm": 26.16353417602791, + "learning_rate": 9.65731386138682e-06, + "loss": 4.5787, + "step": 3095 + }, + { + "epoch": 0.6204408817635271, + "grad_norm": 37.305586161450265, + "learning_rate": 9.656889543236035e-06, + "loss": 4.3744, + "step": 3096 + }, + { + "epoch": 0.6206412825651303, + "grad_norm": 28.402956467328057, + "learning_rate": 9.656464971883291e-06, + "loss": 4.5728, + "step": 3097 + }, + { + "epoch": 0.6208416833667335, + "grad_norm": 25.853541522739672, + "learning_rate": 9.656040147351672e-06, + "loss": 4.1148, + "step": 3098 + }, + { + "epoch": 0.6210420841683367, + "grad_norm": 24.809426536406065, + "learning_rate": 9.655615069664278e-06, + "loss": 4.4246, + "step": 3099 + }, + { + "epoch": 0.6212424849699398, + "grad_norm": 19.84611086034391, + "learning_rate": 9.65518973884422e-06, + "loss": 3.9883, + "step": 3100 + }, + { + "epoch": 0.621442885771543, + "grad_norm": 31.611476677931854, + "learning_rate": 9.654764154914624e-06, + "loss": 4.1762, + "step": 3101 + }, + { + "epoch": 0.6216432865731463, + "grad_norm": 24.083267106612556, + "learning_rate": 9.65433831789863e-06, + "loss": 3.8098, + "step": 3102 + }, + { + "epoch": 0.6218436873747495, + "grad_norm": 23.46749139090663, + "learning_rate": 9.653912227819391e-06, + "loss": 3.6837, + "step": 3103 + }, + { + "epoch": 0.6220440881763527, + "grad_norm": 21.624119771804008, + "learning_rate": 9.653485884700076e-06, + "loss": 3.9113, + "step": 3104 + }, + { + "epoch": 0.6222444889779559, + "grad_norm": 28.390643002166563, + "learning_rate": 9.653059288563863e-06, + "loss": 4.2094, + "step": 3105 + }, + { + "epoch": 0.6224448897795591, + "grad_norm": 27.208600620974607, + "learning_rate": 9.652632439433946e-06, + "loss": 4.1384, + "step": 3106 + }, + { + "epoch": 0.6226452905811624, + "grad_norm": 30.779690814285342, + "learning_rate": 9.652205337333539e-06, + "loss": 4.4167, + "step": 3107 + }, + { + "epoch": 0.6228456913827656, + "grad_norm": 20.31365082094567, + "learning_rate": 9.651777982285859e-06, + "loss": 4.1619, + "step": 3108 + }, + { + "epoch": 0.6230460921843687, + "grad_norm": 41.104465166036746, + "learning_rate": 9.651350374314143e-06, + "loss": 3.8436, + "step": 3109 + }, + { + "epoch": 0.6232464929859719, + "grad_norm": 26.78749688429552, + "learning_rate": 9.650922513441641e-06, + "loss": 4.1137, + "step": 3110 + }, + { + "epoch": 0.6234468937875751, + "grad_norm": 37.12480545046294, + "learning_rate": 
9.650494399691618e-06, + "loss": 3.9969, + "step": 3111 + }, + { + "epoch": 0.6236472945891783, + "grad_norm": 30.46728953417034, + "learning_rate": 9.650066033087348e-06, + "loss": 3.6077, + "step": 3112 + }, + { + "epoch": 0.6238476953907816, + "grad_norm": 25.232540144972067, + "learning_rate": 9.649637413652124e-06, + "loss": 3.9668, + "step": 3113 + }, + { + "epoch": 0.6240480961923848, + "grad_norm": 23.238621792578982, + "learning_rate": 9.649208541409251e-06, + "loss": 4.4329, + "step": 3114 + }, + { + "epoch": 0.624248496993988, + "grad_norm": 33.08710109361233, + "learning_rate": 9.648779416382046e-06, + "loss": 3.9879, + "step": 3115 + }, + { + "epoch": 0.6244488977955912, + "grad_norm": 16.709819500777133, + "learning_rate": 9.648350038593842e-06, + "loss": 3.9697, + "step": 3116 + }, + { + "epoch": 0.6246492985971944, + "grad_norm": 34.539254757457655, + "learning_rate": 9.647920408067986e-06, + "loss": 4.2336, + "step": 3117 + }, + { + "epoch": 0.6248496993987976, + "grad_norm": 25.21276062155912, + "learning_rate": 9.647490524827834e-06, + "loss": 4.191, + "step": 3118 + }, + { + "epoch": 0.6250501002004007, + "grad_norm": 38.687724251825436, + "learning_rate": 9.647060388896763e-06, + "loss": 4.5441, + "step": 3119 + }, + { + "epoch": 0.625250501002004, + "grad_norm": 27.53778534541889, + "learning_rate": 9.646630000298161e-06, + "loss": 3.9604, + "step": 3120 + }, + { + "epoch": 0.6254509018036072, + "grad_norm": 33.108049096781514, + "learning_rate": 9.646199359055425e-06, + "loss": 4.5189, + "step": 3121 + }, + { + "epoch": 0.6256513026052104, + "grad_norm": 21.424006514605612, + "learning_rate": 9.645768465191971e-06, + "loss": 3.9356, + "step": 3122 + }, + { + "epoch": 0.6258517034068136, + "grad_norm": 28.763127122712987, + "learning_rate": 9.645337318731229e-06, + "loss": 4.4171, + "step": 3123 + }, + { + "epoch": 0.6260521042084168, + "grad_norm": 21.307988318535003, + "learning_rate": 9.64490591969664e-06, + "loss": 4.3101, + "step": 3124 + }, + { + "epoch": 0.62625250501002, + "grad_norm": 41.639212127571035, + "learning_rate": 9.644474268111658e-06, + "loss": 4.6118, + "step": 3125 + }, + { + "epoch": 0.6264529058116233, + "grad_norm": 19.720230999136387, + "learning_rate": 9.644042363999755e-06, + "loss": 4.0961, + "step": 3126 + }, + { + "epoch": 0.6266533066132265, + "grad_norm": 25.044211920670563, + "learning_rate": 9.643610207384412e-06, + "loss": 4.4119, + "step": 3127 + }, + { + "epoch": 0.6268537074148297, + "grad_norm": 36.29376817440289, + "learning_rate": 9.64317779828913e-06, + "loss": 4.8029, + "step": 3128 + }, + { + "epoch": 0.6270541082164328, + "grad_norm": 19.91982292466474, + "learning_rate": 9.642745136737416e-06, + "loss": 3.7562, + "step": 3129 + }, + { + "epoch": 0.627254509018036, + "grad_norm": 16.314278714978663, + "learning_rate": 9.642312222752796e-06, + "loss": 3.8517, + "step": 3130 + }, + { + "epoch": 0.6274549098196393, + "grad_norm": 24.63984238307664, + "learning_rate": 9.641879056358807e-06, + "loss": 4.1998, + "step": 3131 + }, + { + "epoch": 0.6276553106212425, + "grad_norm": 24.65441674496523, + "learning_rate": 9.641445637579003e-06, + "loss": 4.5146, + "step": 3132 + }, + { + "epoch": 0.6278557114228457, + "grad_norm": 29.5653463307864, + "learning_rate": 9.641011966436945e-06, + "loss": 4.454, + "step": 3133 + }, + { + "epoch": 0.6280561122244489, + "grad_norm": 49.7116595953441, + "learning_rate": 9.640578042956219e-06, + "loss": 3.823, + "step": 3134 + }, + { + "epoch": 0.6282565130260521, + "grad_norm": 
29.473230300668916, + "learning_rate": 9.640143867160414e-06, + "loss": 4.8921, + "step": 3135 + }, + { + "epoch": 0.6284569138276553, + "grad_norm": 22.191059594684983, + "learning_rate": 9.639709439073136e-06, + "loss": 4.456, + "step": 3136 + }, + { + "epoch": 0.6286573146292586, + "grad_norm": 59.27058616179294, + "learning_rate": 9.639274758718009e-06, + "loss": 3.2677, + "step": 3137 + }, + { + "epoch": 0.6288577154308618, + "grad_norm": 43.79170383936955, + "learning_rate": 9.638839826118664e-06, + "loss": 4.6094, + "step": 3138 + }, + { + "epoch": 0.6290581162324649, + "grad_norm": 45.80068187887457, + "learning_rate": 9.638404641298751e-06, + "loss": 4.6146, + "step": 3139 + }, + { + "epoch": 0.6292585170340681, + "grad_norm": 25.85695651896449, + "learning_rate": 9.63796920428193e-06, + "loss": 4.3287, + "step": 3140 + }, + { + "epoch": 0.6294589178356713, + "grad_norm": 23.761843633722663, + "learning_rate": 9.637533515091876e-06, + "loss": 3.9365, + "step": 3141 + }, + { + "epoch": 0.6296593186372745, + "grad_norm": 32.38056799046837, + "learning_rate": 9.637097573752281e-06, + "loss": 4.5604, + "step": 3142 + }, + { + "epoch": 0.6298597194388778, + "grad_norm": 33.199583729401176, + "learning_rate": 9.636661380286845e-06, + "loss": 4.2407, + "step": 3143 + }, + { + "epoch": 0.630060120240481, + "grad_norm": 23.485783480430733, + "learning_rate": 9.636224934719285e-06, + "loss": 4.1864, + "step": 3144 + }, + { + "epoch": 0.6302605210420842, + "grad_norm": 24.06249113963706, + "learning_rate": 9.635788237073333e-06, + "loss": 4.3913, + "step": 3145 + }, + { + "epoch": 0.6304609218436874, + "grad_norm": 22.847211191066275, + "learning_rate": 9.635351287372731e-06, + "loss": 3.5244, + "step": 3146 + }, + { + "epoch": 0.6306613226452906, + "grad_norm": 26.203092700530338, + "learning_rate": 9.634914085641236e-06, + "loss": 4.2547, + "step": 3147 + }, + { + "epoch": 0.6308617234468938, + "grad_norm": 32.839896018857765, + "learning_rate": 9.634476631902623e-06, + "loss": 4.1881, + "step": 3148 + }, + { + "epoch": 0.631062124248497, + "grad_norm": 28.814529242101546, + "learning_rate": 9.634038926180672e-06, + "loss": 4.2026, + "step": 3149 + }, + { + "epoch": 0.6312625250501002, + "grad_norm": 24.612245098810224, + "learning_rate": 9.633600968499183e-06, + "loss": 3.8264, + "step": 3150 + }, + { + "epoch": 0.6314629258517034, + "grad_norm": 27.78663517068689, + "learning_rate": 9.633162758881973e-06, + "loss": 4.5547, + "step": 3151 + }, + { + "epoch": 0.6316633266533066, + "grad_norm": 23.38918159422122, + "learning_rate": 9.632724297352862e-06, + "loss": 3.7351, + "step": 3152 + }, + { + "epoch": 0.6318637274549098, + "grad_norm": 26.556794179893856, + "learning_rate": 9.632285583935694e-06, + "loss": 3.9695, + "step": 3153 + }, + { + "epoch": 0.632064128256513, + "grad_norm": 24.1828599552876, + "learning_rate": 9.63184661865432e-06, + "loss": 3.9752, + "step": 3154 + }, + { + "epoch": 0.6322645290581163, + "grad_norm": 30.134212344243792, + "learning_rate": 9.631407401532609e-06, + "loss": 3.8653, + "step": 3155 + }, + { + "epoch": 0.6324649298597195, + "grad_norm": 34.95075292428924, + "learning_rate": 9.63096793259444e-06, + "loss": 4.2632, + "step": 3156 + }, + { + "epoch": 0.6326653306613227, + "grad_norm": 36.87203832012132, + "learning_rate": 9.63052821186371e-06, + "loss": 3.8444, + "step": 3157 + }, + { + "epoch": 0.6328657314629259, + "grad_norm": 27.886897262225197, + "learning_rate": 9.630088239364323e-06, + "loss": 3.7724, + "step": 3158 + }, + { + "epoch": 
0.633066132264529, + "grad_norm": 26.002116342230412, + "learning_rate": 9.629648015120207e-06, + "loss": 4.4853, + "step": 3159 + }, + { + "epoch": 0.6332665330661322, + "grad_norm": 27.726417514740767, + "learning_rate": 9.629207539155292e-06, + "loss": 4.8024, + "step": 3160 + }, + { + "epoch": 0.6334669338677354, + "grad_norm": 31.56183274168405, + "learning_rate": 9.62876681149353e-06, + "loss": 4.6396, + "step": 3161 + }, + { + "epoch": 0.6336673346693387, + "grad_norm": 21.073771564723177, + "learning_rate": 9.628325832158885e-06, + "loss": 3.8878, + "step": 3162 + }, + { + "epoch": 0.6338677354709419, + "grad_norm": 29.01623395314413, + "learning_rate": 9.627884601175332e-06, + "loss": 4.0931, + "step": 3163 + }, + { + "epoch": 0.6340681362725451, + "grad_norm": 26.648118080429555, + "learning_rate": 9.627443118566862e-06, + "loss": 4.0143, + "step": 3164 + }, + { + "epoch": 0.6342685370741483, + "grad_norm": 21.074274578918818, + "learning_rate": 9.627001384357479e-06, + "loss": 4.5787, + "step": 3165 + }, + { + "epoch": 0.6344689378757515, + "grad_norm": 23.35372008366228, + "learning_rate": 9.6265593985712e-06, + "loss": 4.2715, + "step": 3166 + }, + { + "epoch": 0.6346693386773548, + "grad_norm": 23.845744903631626, + "learning_rate": 9.626117161232058e-06, + "loss": 4.6836, + "step": 3167 + }, + { + "epoch": 0.6348697394789579, + "grad_norm": 21.666435925024775, + "learning_rate": 9.625674672364097e-06, + "loss": 4.0446, + "step": 3168 + }, + { + "epoch": 0.6350701402805611, + "grad_norm": 56.09037625202743, + "learning_rate": 9.625231931991375e-06, + "loss": 4.4899, + "step": 3169 + }, + { + "epoch": 0.6352705410821643, + "grad_norm": 20.520516733064266, + "learning_rate": 9.624788940137965e-06, + "loss": 4.0351, + "step": 3170 + }, + { + "epoch": 0.6354709418837675, + "grad_norm": 37.34757639098987, + "learning_rate": 9.624345696827956e-06, + "loss": 4.9923, + "step": 3171 + }, + { + "epoch": 0.6356713426853707, + "grad_norm": 38.872770974912925, + "learning_rate": 9.623902202085444e-06, + "loss": 4.2539, + "step": 3172 + }, + { + "epoch": 0.635871743486974, + "grad_norm": 51.49099428745513, + "learning_rate": 9.623458455934544e-06, + "loss": 4.3839, + "step": 3173 + }, + { + "epoch": 0.6360721442885772, + "grad_norm": 28.75978803330248, + "learning_rate": 9.623014458399382e-06, + "loss": 4.2073, + "step": 3174 + }, + { + "epoch": 0.6362725450901804, + "grad_norm": 21.767518171720976, + "learning_rate": 9.6225702095041e-06, + "loss": 4.6221, + "step": 3175 + }, + { + "epoch": 0.6364729458917836, + "grad_norm": 27.93278885942951, + "learning_rate": 9.622125709272853e-06, + "loss": 4.65, + "step": 3176 + }, + { + "epoch": 0.6366733466933868, + "grad_norm": 21.943236739909434, + "learning_rate": 9.621680957729806e-06, + "loss": 4.0875, + "step": 3177 + }, + { + "epoch": 0.6368737474949899, + "grad_norm": 22.60379648977269, + "learning_rate": 9.621235954899147e-06, + "loss": 4.1141, + "step": 3178 + }, + { + "epoch": 0.6370741482965931, + "grad_norm": 19.824781731916218, + "learning_rate": 9.620790700805064e-06, + "loss": 3.9978, + "step": 3179 + }, + { + "epoch": 0.6372745490981964, + "grad_norm": 31.388198910000664, + "learning_rate": 9.620345195471772e-06, + "loss": 4.6978, + "step": 3180 + }, + { + "epoch": 0.6374749498997996, + "grad_norm": 60.11354920191922, + "learning_rate": 9.61989943892349e-06, + "loss": 4.3981, + "step": 3181 + }, + { + "epoch": 0.6376753507014028, + "grad_norm": 29.853734153124854, + "learning_rate": 9.619453431184456e-06, + "loss": 4.3913, + "step": 
3182 + }, + { + "epoch": 0.637875751503006, + "grad_norm": 19.09559996536484, + "learning_rate": 9.61900717227892e-06, + "loss": 3.5947, + "step": 3183 + }, + { + "epoch": 0.6380761523046092, + "grad_norm": 31.68326683894663, + "learning_rate": 9.618560662231146e-06, + "loss": 3.7936, + "step": 3184 + }, + { + "epoch": 0.6382765531062125, + "grad_norm": 57.49682318599954, + "learning_rate": 9.618113901065412e-06, + "loss": 4.0802, + "step": 3185 + }, + { + "epoch": 0.6384769539078157, + "grad_norm": 20.847447365474796, + "learning_rate": 9.617666888806008e-06, + "loss": 4.2523, + "step": 3186 + }, + { + "epoch": 0.6386773547094189, + "grad_norm": 40.32157405127481, + "learning_rate": 9.617219625477238e-06, + "loss": 5.0719, + "step": 3187 + }, + { + "epoch": 0.638877755511022, + "grad_norm": 29.360824665307433, + "learning_rate": 9.616772111103422e-06, + "loss": 4.6636, + "step": 3188 + }, + { + "epoch": 0.6390781563126252, + "grad_norm": 24.67241686669745, + "learning_rate": 9.616324345708891e-06, + "loss": 4.1183, + "step": 3189 + }, + { + "epoch": 0.6392785571142284, + "grad_norm": 26.489562802775193, + "learning_rate": 9.615876329317992e-06, + "loss": 4.5252, + "step": 3190 + }, + { + "epoch": 0.6394789579158316, + "grad_norm": 24.558128581942157, + "learning_rate": 9.61542806195508e-06, + "loss": 4.3837, + "step": 3191 + }, + { + "epoch": 0.6396793587174349, + "grad_norm": 18.587774074928525, + "learning_rate": 9.614979543644536e-06, + "loss": 4.0661, + "step": 3192 + }, + { + "epoch": 0.6398797595190381, + "grad_norm": 24.049591761698125, + "learning_rate": 9.614530774410739e-06, + "loss": 3.9045, + "step": 3193 + }, + { + "epoch": 0.6400801603206413, + "grad_norm": 26.05060307142412, + "learning_rate": 9.614081754278092e-06, + "loss": 4.4227, + "step": 3194 + }, + { + "epoch": 0.6402805611222445, + "grad_norm": 35.42459158756531, + "learning_rate": 9.61363248327101e-06, + "loss": 4.2253, + "step": 3195 + }, + { + "epoch": 0.6404809619238477, + "grad_norm": 27.110898267993445, + "learning_rate": 9.61318296141392e-06, + "loss": 4.2579, + "step": 3196 + }, + { + "epoch": 0.640681362725451, + "grad_norm": 28.114745150486645, + "learning_rate": 9.612733188731262e-06, + "loss": 4.5223, + "step": 3197 + }, + { + "epoch": 0.6408817635270541, + "grad_norm": 32.0078164777094, + "learning_rate": 9.612283165247491e-06, + "loss": 3.9863, + "step": 3198 + }, + { + "epoch": 0.6410821643286573, + "grad_norm": 28.452753299958726, + "learning_rate": 9.611832890987077e-06, + "loss": 4.4382, + "step": 3199 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 35.34877244844285, + "learning_rate": 9.6113823659745e-06, + "loss": 4.3756, + "step": 3200 + }, + { + "epoch": 0.6414829659318637, + "grad_norm": 23.31334572097269, + "learning_rate": 9.610931590234258e-06, + "loss": 4.0537, + "step": 3201 + }, + { + "epoch": 0.6416833667334669, + "grad_norm": 25.726533371131723, + "learning_rate": 9.610480563790858e-06, + "loss": 3.9191, + "step": 3202 + }, + { + "epoch": 0.6418837675350701, + "grad_norm": 28.21852378835125, + "learning_rate": 9.610029286668825e-06, + "loss": 4.0945, + "step": 3203 + }, + { + "epoch": 0.6420841683366734, + "grad_norm": 26.05796834727033, + "learning_rate": 9.609577758892693e-06, + "loss": 4.2702, + "step": 3204 + }, + { + "epoch": 0.6422845691382766, + "grad_norm": 28.707292660460425, + "learning_rate": 9.609125980487016e-06, + "loss": 4.5285, + "step": 3205 + }, + { + "epoch": 0.6424849699398798, + "grad_norm": 21.128469085997363, + "learning_rate": 9.608673951476354e-06, + 
"loss": 3.993, + "step": 3206 + }, + { + "epoch": 0.642685370741483, + "grad_norm": 55.59252433096768, + "learning_rate": 9.608221671885287e-06, + "loss": 4.2895, + "step": 3207 + }, + { + "epoch": 0.6428857715430861, + "grad_norm": 51.070682525207985, + "learning_rate": 9.607769141738406e-06, + "loss": 4.691, + "step": 3208 + }, + { + "epoch": 0.6430861723446893, + "grad_norm": 25.727778853705885, + "learning_rate": 9.607316361060312e-06, + "loss": 4.3076, + "step": 3209 + }, + { + "epoch": 0.6432865731462926, + "grad_norm": 24.639011998750988, + "learning_rate": 9.606863329875631e-06, + "loss": 3.2301, + "step": 3210 + }, + { + "epoch": 0.6434869739478958, + "grad_norm": 25.26019316702766, + "learning_rate": 9.606410048208988e-06, + "loss": 3.547, + "step": 3211 + }, + { + "epoch": 0.643687374749499, + "grad_norm": 24.92887637014872, + "learning_rate": 9.60595651608503e-06, + "loss": 4.6414, + "step": 3212 + }, + { + "epoch": 0.6438877755511022, + "grad_norm": 59.447344950555056, + "learning_rate": 9.605502733528418e-06, + "loss": 4.3944, + "step": 3213 + }, + { + "epoch": 0.6440881763527054, + "grad_norm": 16.16666150349413, + "learning_rate": 9.605048700563825e-06, + "loss": 3.7379, + "step": 3214 + }, + { + "epoch": 0.6442885771543087, + "grad_norm": 29.97777192455741, + "learning_rate": 9.604594417215936e-06, + "loss": 4.1139, + "step": 3215 + }, + { + "epoch": 0.6444889779559119, + "grad_norm": 88.1249189845408, + "learning_rate": 9.60413988350945e-06, + "loss": 3.8799, + "step": 3216 + }, + { + "epoch": 0.644689378757515, + "grad_norm": 18.111646941498563, + "learning_rate": 9.603685099469085e-06, + "loss": 4.147, + "step": 3217 + }, + { + "epoch": 0.6448897795591182, + "grad_norm": 26.590695483835955, + "learning_rate": 9.603230065119565e-06, + "loss": 4.4459, + "step": 3218 + }, + { + "epoch": 0.6450901803607214, + "grad_norm": 27.987908243676785, + "learning_rate": 9.60277478048563e-06, + "loss": 4.3692, + "step": 3219 + }, + { + "epoch": 0.6452905811623246, + "grad_norm": 30.19571332721932, + "learning_rate": 9.602319245592036e-06, + "loss": 4.3619, + "step": 3220 + }, + { + "epoch": 0.6454909819639278, + "grad_norm": 31.00889882058378, + "learning_rate": 9.601863460463552e-06, + "loss": 3.9323, + "step": 3221 + }, + { + "epoch": 0.6456913827655311, + "grad_norm": 69.08727249840511, + "learning_rate": 9.601407425124957e-06, + "loss": 4.641, + "step": 3222 + }, + { + "epoch": 0.6458917835671343, + "grad_norm": 21.424986395012983, + "learning_rate": 9.60095113960105e-06, + "loss": 4.7279, + "step": 3223 + }, + { + "epoch": 0.6460921843687375, + "grad_norm": 24.670685390488046, + "learning_rate": 9.600494603916635e-06, + "loss": 4.4092, + "step": 3224 + }, + { + "epoch": 0.6462925851703407, + "grad_norm": 24.598516007294688, + "learning_rate": 9.60003781809654e-06, + "loss": 4.0652, + "step": 3225 + }, + { + "epoch": 0.6464929859719439, + "grad_norm": 83.45613363644725, + "learning_rate": 9.599580782165598e-06, + "loss": 4.935, + "step": 3226 + }, + { + "epoch": 0.646693386773547, + "grad_norm": 36.80928158714227, + "learning_rate": 9.599123496148658e-06, + "loss": 4.9211, + "step": 3227 + }, + { + "epoch": 0.6468937875751503, + "grad_norm": 22.301319677415748, + "learning_rate": 9.598665960070586e-06, + "loss": 4.0787, + "step": 3228 + }, + { + "epoch": 0.6470941883767535, + "grad_norm": 37.322433807681506, + "learning_rate": 9.598208173956258e-06, + "loss": 3.7946, + "step": 3229 + }, + { + "epoch": 0.6472945891783567, + "grad_norm": 28.878695381138368, + "learning_rate": 
9.597750137830563e-06, + "loss": 4.4726, + "step": 3230 + }, + { + "epoch": 0.6474949899799599, + "grad_norm": 24.97084718148997, + "learning_rate": 9.597291851718407e-06, + "loss": 4.5159, + "step": 3231 + }, + { + "epoch": 0.6476953907815631, + "grad_norm": 45.26945363417552, + "learning_rate": 9.596833315644706e-06, + "loss": 4.3788, + "step": 3232 + }, + { + "epoch": 0.6478957915831663, + "grad_norm": 20.09796519715511, + "learning_rate": 9.596374529634392e-06, + "loss": 4.1177, + "step": 3233 + }, + { + "epoch": 0.6480961923847696, + "grad_norm": 30.81394332648457, + "learning_rate": 9.59591549371241e-06, + "loss": 3.8682, + "step": 3234 + }, + { + "epoch": 0.6482965931863728, + "grad_norm": 25.313872851835065, + "learning_rate": 9.595456207903718e-06, + "loss": 4.4385, + "step": 3235 + }, + { + "epoch": 0.648496993987976, + "grad_norm": 33.40661992117686, + "learning_rate": 9.59499667223329e-06, + "loss": 3.9892, + "step": 3236 + }, + { + "epoch": 0.6486973947895791, + "grad_norm": 28.096544830152265, + "learning_rate": 9.594536886726108e-06, + "loss": 4.5105, + "step": 3237 + }, + { + "epoch": 0.6488977955911823, + "grad_norm": 21.381873005618697, + "learning_rate": 9.594076851407174e-06, + "loss": 4.0312, + "step": 3238 + }, + { + "epoch": 0.6490981963927855, + "grad_norm": 22.28840074308073, + "learning_rate": 9.593616566301501e-06, + "loss": 3.9732, + "step": 3239 + }, + { + "epoch": 0.6492985971943888, + "grad_norm": 20.735763340431316, + "learning_rate": 9.593156031434114e-06, + "loss": 3.8605, + "step": 3240 + }, + { + "epoch": 0.649498997995992, + "grad_norm": 25.97545452024654, + "learning_rate": 9.592695246830055e-06, + "loss": 4.5475, + "step": 3241 + }, + { + "epoch": 0.6496993987975952, + "grad_norm": 26.390421123954706, + "learning_rate": 9.592234212514374e-06, + "loss": 3.9152, + "step": 3242 + }, + { + "epoch": 0.6498997995991984, + "grad_norm": 41.69651090590181, + "learning_rate": 9.59177292851214e-06, + "loss": 5.4243, + "step": 3243 + }, + { + "epoch": 0.6501002004008016, + "grad_norm": 25.76944404141097, + "learning_rate": 9.591311394848434e-06, + "loss": 4.0671, + "step": 3244 + }, + { + "epoch": 0.6503006012024048, + "grad_norm": 32.008765020371, + "learning_rate": 9.59084961154835e-06, + "loss": 4.011, + "step": 3245 + }, + { + "epoch": 0.6505010020040081, + "grad_norm": 23.552389874428336, + "learning_rate": 9.590387578636997e-06, + "loss": 4.5051, + "step": 3246 + }, + { + "epoch": 0.6507014028056112, + "grad_norm": 22.06934903358431, + "learning_rate": 9.589925296139496e-06, + "loss": 3.9097, + "step": 3247 + }, + { + "epoch": 0.6509018036072144, + "grad_norm": 38.46262262498931, + "learning_rate": 9.58946276408098e-06, + "loss": 5.1537, + "step": 3248 + }, + { + "epoch": 0.6511022044088176, + "grad_norm": 35.091938970673816, + "learning_rate": 9.588999982486597e-06, + "loss": 4.6226, + "step": 3249 + }, + { + "epoch": 0.6513026052104208, + "grad_norm": 29.840610484849375, + "learning_rate": 9.588536951381513e-06, + "loss": 4.3531, + "step": 3250 + }, + { + "epoch": 0.651503006012024, + "grad_norm": 30.88491472620453, + "learning_rate": 9.588073670790902e-06, + "loss": 4.2959, + "step": 3251 + }, + { + "epoch": 0.6517034068136273, + "grad_norm": 25.919437794660055, + "learning_rate": 9.587610140739952e-06, + "loss": 4.1969, + "step": 3252 + }, + { + "epoch": 0.6519038076152305, + "grad_norm": 39.07395602623797, + "learning_rate": 9.587146361253868e-06, + "loss": 4.2601, + "step": 3253 + }, + { + "epoch": 0.6521042084168337, + "grad_norm": 
24.975296851740428, + "learning_rate": 9.586682332357865e-06, + "loss": 3.6063, + "step": 3254 + }, + { + "epoch": 0.6523046092184369, + "grad_norm": 21.83691287345257, + "learning_rate": 9.58621805407717e-06, + "loss": 3.6091, + "step": 3255 + }, + { + "epoch": 0.6525050100200401, + "grad_norm": 16.980953335447428, + "learning_rate": 9.585753526437032e-06, + "loss": 4.0086, + "step": 3256 + }, + { + "epoch": 0.6527054108216432, + "grad_norm": 29.04134342058964, + "learning_rate": 9.585288749462704e-06, + "loss": 4.4479, + "step": 3257 + }, + { + "epoch": 0.6529058116232465, + "grad_norm": 23.852347970071875, + "learning_rate": 9.58482372317946e-06, + "loss": 4.1398, + "step": 3258 + }, + { + "epoch": 0.6531062124248497, + "grad_norm": 32.04298026224823, + "learning_rate": 9.584358447612583e-06, + "loss": 4.1139, + "step": 3259 + }, + { + "epoch": 0.6533066132264529, + "grad_norm": 47.124811974692186, + "learning_rate": 9.583892922787367e-06, + "loss": 4.2629, + "step": 3260 + }, + { + "epoch": 0.6535070140280561, + "grad_norm": 28.252582114797576, + "learning_rate": 9.58342714872913e-06, + "loss": 4.5139, + "step": 3261 + }, + { + "epoch": 0.6537074148296593, + "grad_norm": 21.77563800696789, + "learning_rate": 9.58296112546319e-06, + "loss": 4.2168, + "step": 3262 + }, + { + "epoch": 0.6539078156312625, + "grad_norm": 19.61674346541508, + "learning_rate": 9.582494853014891e-06, + "loss": 4.4475, + "step": 3263 + }, + { + "epoch": 0.6541082164328658, + "grad_norm": 20.814249923377098, + "learning_rate": 9.582028331409581e-06, + "loss": 3.5752, + "step": 3264 + }, + { + "epoch": 0.654308617234469, + "grad_norm": 24.264386562079967, + "learning_rate": 9.581561560672627e-06, + "loss": 4.0581, + "step": 3265 + }, + { + "epoch": 0.6545090180360722, + "grad_norm": 24.41349424956329, + "learning_rate": 9.58109454082941e-06, + "loss": 4.381, + "step": 3266 + }, + { + "epoch": 0.6547094188376753, + "grad_norm": 26.79125354985378, + "learning_rate": 9.580627271905318e-06, + "loss": 4.9334, + "step": 3267 + }, + { + "epoch": 0.6549098196392785, + "grad_norm": 18.5311657158332, + "learning_rate": 9.580159753925759e-06, + "loss": 3.9777, + "step": 3268 + }, + { + "epoch": 0.6551102204408817, + "grad_norm": 31.283841572895966, + "learning_rate": 9.579691986916155e-06, + "loss": 4.7252, + "step": 3269 + }, + { + "epoch": 0.655310621242485, + "grad_norm": 34.390429937528026, + "learning_rate": 9.579223970901937e-06, + "loss": 4.0467, + "step": 3270 + }, + { + "epoch": 0.6555110220440882, + "grad_norm": 30.297975735953376, + "learning_rate": 9.578755705908551e-06, + "loss": 4.8015, + "step": 3271 + }, + { + "epoch": 0.6557114228456914, + "grad_norm": 27.01722438024048, + "learning_rate": 9.578287191961461e-06, + "loss": 3.7197, + "step": 3272 + }, + { + "epoch": 0.6559118236472946, + "grad_norm": 24.250411974721683, + "learning_rate": 9.577818429086136e-06, + "loss": 4.4085, + "step": 3273 + }, + { + "epoch": 0.6561122244488978, + "grad_norm": 21.042542008925555, + "learning_rate": 9.577349417308066e-06, + "loss": 3.7703, + "step": 3274 + }, + { + "epoch": 0.656312625250501, + "grad_norm": 38.87130779136583, + "learning_rate": 9.576880156652752e-06, + "loss": 4.4945, + "step": 3275 + }, + { + "epoch": 0.6565130260521042, + "grad_norm": 43.1736706914411, + "learning_rate": 9.576410647145709e-06, + "loss": 3.9493, + "step": 3276 + }, + { + "epoch": 0.6567134268537074, + "grad_norm": 21.11670622611854, + "learning_rate": 9.57594088881246e-06, + "loss": 4.2175, + "step": 3277 + }, + { + "epoch": 
0.6569138276553106, + "grad_norm": 22.360912657927596, + "learning_rate": 9.575470881678553e-06, + "loss": 3.8758, + "step": 3278 + }, + { + "epoch": 0.6571142284569138, + "grad_norm": 22.928616526864126, + "learning_rate": 9.57500062576954e-06, + "loss": 4.5898, + "step": 3279 + }, + { + "epoch": 0.657314629258517, + "grad_norm": 21.684352721133806, + "learning_rate": 9.57453012111099e-06, + "loss": 4.1267, + "step": 3280 + }, + { + "epoch": 0.6575150300601202, + "grad_norm": 24.260855482737455, + "learning_rate": 9.574059367728483e-06, + "loss": 4.2419, + "step": 3281 + }, + { + "epoch": 0.6577154308617235, + "grad_norm": 30.093972375313903, + "learning_rate": 9.573588365647616e-06, + "loss": 4.6206, + "step": 3282 + }, + { + "epoch": 0.6579158316633267, + "grad_norm": 17.632123485494322, + "learning_rate": 9.573117114894e-06, + "loss": 3.9196, + "step": 3283 + }, + { + "epoch": 0.6581162324649299, + "grad_norm": 31.65198826039968, + "learning_rate": 9.572645615493256e-06, + "loss": 4.8162, + "step": 3284 + }, + { + "epoch": 0.6583166332665331, + "grad_norm": 22.580227137073482, + "learning_rate": 9.57217386747102e-06, + "loss": 4.1507, + "step": 3285 + }, + { + "epoch": 0.6585170340681362, + "grad_norm": 22.886891427819172, + "learning_rate": 9.571701870852941e-06, + "loss": 4.1854, + "step": 3286 + }, + { + "epoch": 0.6587174348697394, + "grad_norm": 29.160106272628, + "learning_rate": 9.571229625664681e-06, + "loss": 4.3526, + "step": 3287 + }, + { + "epoch": 0.6589178356713427, + "grad_norm": 53.99266907806015, + "learning_rate": 9.570757131931921e-06, + "loss": 4.1939, + "step": 3288 + }, + { + "epoch": 0.6591182364729459, + "grad_norm": 35.2232311316323, + "learning_rate": 9.570284389680348e-06, + "loss": 4.65, + "step": 3289 + }, + { + "epoch": 0.6593186372745491, + "grad_norm": 52.03709647269554, + "learning_rate": 9.569811398935667e-06, + "loss": 4.0535, + "step": 3290 + }, + { + "epoch": 0.6595190380761523, + "grad_norm": 32.87448244868884, + "learning_rate": 9.569338159723594e-06, + "loss": 3.8024, + "step": 3291 + }, + { + "epoch": 0.6597194388777555, + "grad_norm": 20.046735496832348, + "learning_rate": 9.56886467206986e-06, + "loss": 4.0044, + "step": 3292 + }, + { + "epoch": 0.6599198396793587, + "grad_norm": 22.018379130333237, + "learning_rate": 9.56839093600021e-06, + "loss": 4.2724, + "step": 3293 + }, + { + "epoch": 0.660120240480962, + "grad_norm": 21.862122556791647, + "learning_rate": 9.567916951540402e-06, + "loss": 4.3438, + "step": 3294 + }, + { + "epoch": 0.6603206412825652, + "grad_norm": 27.156507612439373, + "learning_rate": 9.567442718716205e-06, + "loss": 4.0674, + "step": 3295 + }, + { + "epoch": 0.6605210420841683, + "grad_norm": 39.26170104285103, + "learning_rate": 9.566968237553406e-06, + "loss": 4.6504, + "step": 3296 + }, + { + "epoch": 0.6607214428857715, + "grad_norm": 40.07656902603781, + "learning_rate": 9.5664935080778e-06, + "loss": 4.6014, + "step": 3297 + }, + { + "epoch": 0.6609218436873747, + "grad_norm": 94.04241694286357, + "learning_rate": 9.566018530315204e-06, + "loss": 4.0201, + "step": 3298 + }, + { + "epoch": 0.6611222444889779, + "grad_norm": 22.726259812993565, + "learning_rate": 9.565543304291441e-06, + "loss": 4.5462, + "step": 3299 + }, + { + "epoch": 0.6613226452905812, + "grad_norm": 27.526713846980346, + "learning_rate": 9.565067830032348e-06, + "loss": 3.7539, + "step": 3300 + }, + { + "epoch": 0.6615230460921844, + "grad_norm": 24.93278166085409, + "learning_rate": 9.564592107563778e-06, + "loss": 4.0392, + "step": 3301 + 
}, + { + "epoch": 0.6617234468937876, + "grad_norm": 30.759058205168838, + "learning_rate": 9.5641161369116e-06, + "loss": 4.6119, + "step": 3302 + }, + { + "epoch": 0.6619238476953908, + "grad_norm": 22.40721095444599, + "learning_rate": 9.563639918101686e-06, + "loss": 4.0978, + "step": 3303 + }, + { + "epoch": 0.662124248496994, + "grad_norm": 19.108287041054894, + "learning_rate": 9.563163451159937e-06, + "loss": 3.9146, + "step": 3304 + }, + { + "epoch": 0.6623246492985972, + "grad_norm": 24.860574544317732, + "learning_rate": 9.562686736112252e-06, + "loss": 4.2844, + "step": 3305 + }, + { + "epoch": 0.6625250501002004, + "grad_norm": 17.59942782985018, + "learning_rate": 9.562209772984556e-06, + "loss": 4.0768, + "step": 3306 + }, + { + "epoch": 0.6627254509018036, + "grad_norm": 35.13568289980336, + "learning_rate": 9.561732561802779e-06, + "loss": 4.836, + "step": 3307 + }, + { + "epoch": 0.6629258517034068, + "grad_norm": 31.997874160431838, + "learning_rate": 9.56125510259287e-06, + "loss": 5.2314, + "step": 3308 + }, + { + "epoch": 0.66312625250501, + "grad_norm": 21.595684684579695, + "learning_rate": 9.560777395380787e-06, + "loss": 4.3838, + "step": 3309 + }, + { + "epoch": 0.6633266533066132, + "grad_norm": 22.521100237139077, + "learning_rate": 9.560299440192505e-06, + "loss": 4.1723, + "step": 3310 + }, + { + "epoch": 0.6635270541082164, + "grad_norm": 21.307017297285203, + "learning_rate": 9.55982123705401e-06, + "loss": 4.1086, + "step": 3311 + }, + { + "epoch": 0.6637274549098197, + "grad_norm": 50.205123346572954, + "learning_rate": 9.559342785991306e-06, + "loss": 4.3401, + "step": 3312 + }, + { + "epoch": 0.6639278557114229, + "grad_norm": 26.19665880442317, + "learning_rate": 9.558864087030403e-06, + "loss": 4.4966, + "step": 3313 + }, + { + "epoch": 0.6641282565130261, + "grad_norm": 23.46818246835406, + "learning_rate": 9.558385140197331e-06, + "loss": 3.9971, + "step": 3314 + }, + { + "epoch": 0.6643286573146293, + "grad_norm": 19.407548797514405, + "learning_rate": 9.55790594551813e-06, + "loss": 4.1146, + "step": 3315 + }, + { + "epoch": 0.6645290581162324, + "grad_norm": 39.890670344086494, + "learning_rate": 9.557426503018851e-06, + "loss": 4.3788, + "step": 3316 + }, + { + "epoch": 0.6647294589178356, + "grad_norm": 20.575616744169718, + "learning_rate": 9.55694681272557e-06, + "loss": 4.3277, + "step": 3317 + }, + { + "epoch": 0.6649298597194389, + "grad_norm": 34.254845887672076, + "learning_rate": 9.556466874664362e-06, + "loss": 4.2219, + "step": 3318 + }, + { + "epoch": 0.6651302605210421, + "grad_norm": 27.408060427572636, + "learning_rate": 9.555986688861325e-06, + "loss": 4.0199, + "step": 3319 + }, + { + "epoch": 0.6653306613226453, + "grad_norm": 23.71616888387382, + "learning_rate": 9.555506255342566e-06, + "loss": 4.1807, + "step": 3320 + }, + { + "epoch": 0.6655310621242485, + "grad_norm": 20.723227354185813, + "learning_rate": 9.555025574134208e-06, + "loss": 4.0489, + "step": 3321 + }, + { + "epoch": 0.6657314629258517, + "grad_norm": 30.85557518252407, + "learning_rate": 9.554544645262384e-06, + "loss": 4.3188, + "step": 3322 + }, + { + "epoch": 0.6659318637274549, + "grad_norm": 51.36922334187818, + "learning_rate": 9.554063468753247e-06, + "loss": 4.5549, + "step": 3323 + }, + { + "epoch": 0.6661322645290582, + "grad_norm": 55.17936139652388, + "learning_rate": 9.553582044632956e-06, + "loss": 4.3006, + "step": 3324 + }, + { + "epoch": 0.6663326653306614, + "grad_norm": 49.33212771493585, + "learning_rate": 9.553100372927687e-06, + "loss": 
4.305, + "step": 3325 + }, + { + "epoch": 0.6665330661322645, + "grad_norm": 27.479828133802688, + "learning_rate": 9.55261845366363e-06, + "loss": 3.8002, + "step": 3326 + }, + { + "epoch": 0.6667334669338677, + "grad_norm": 40.69127751532063, + "learning_rate": 9.552136286866986e-06, + "loss": 4.6337, + "step": 3327 + }, + { + "epoch": 0.6669338677354709, + "grad_norm": 54.23531507912247, + "learning_rate": 9.551653872563976e-06, + "loss": 4.5423, + "step": 3328 + }, + { + "epoch": 0.6671342685370741, + "grad_norm": 29.35963918666724, + "learning_rate": 9.551171210780823e-06, + "loss": 4.3859, + "step": 3329 + }, + { + "epoch": 0.6673346693386774, + "grad_norm": 18.256095987454266, + "learning_rate": 9.550688301543774e-06, + "loss": 3.8372, + "step": 3330 + }, + { + "epoch": 0.6675350701402806, + "grad_norm": 29.348595491294077, + "learning_rate": 9.550205144879083e-06, + "loss": 4.5909, + "step": 3331 + }, + { + "epoch": 0.6677354709418838, + "grad_norm": 30.613212964479906, + "learning_rate": 9.549721740813024e-06, + "loss": 4.575, + "step": 3332 + }, + { + "epoch": 0.667935871743487, + "grad_norm": 40.329618414645715, + "learning_rate": 9.549238089371877e-06, + "loss": 4.0883, + "step": 3333 + }, + { + "epoch": 0.6681362725450902, + "grad_norm": 28.27694599930765, + "learning_rate": 9.548754190581939e-06, + "loss": 4.7728, + "step": 3334 + }, + { + "epoch": 0.6683366733466933, + "grad_norm": 22.71443854911215, + "learning_rate": 9.548270044469524e-06, + "loss": 4.7215, + "step": 3335 + }, + { + "epoch": 0.6685370741482966, + "grad_norm": 23.50322762571337, + "learning_rate": 9.54778565106095e-06, + "loss": 4.6333, + "step": 3336 + }, + { + "epoch": 0.6687374749498998, + "grad_norm": 22.353114457966825, + "learning_rate": 9.547301010382557e-06, + "loss": 4.4552, + "step": 3337 + }, + { + "epoch": 0.668937875751503, + "grad_norm": 27.425859233593155, + "learning_rate": 9.546816122460697e-06, + "loss": 4.6616, + "step": 3338 + }, + { + "epoch": 0.6691382765531062, + "grad_norm": 32.819789632213165, + "learning_rate": 9.546330987321733e-06, + "loss": 4.2369, + "step": 3339 + }, + { + "epoch": 0.6693386773547094, + "grad_norm": 26.80088839362771, + "learning_rate": 9.54584560499204e-06, + "loss": 4.1168, + "step": 3340 + }, + { + "epoch": 0.6695390781563126, + "grad_norm": 35.84560027435997, + "learning_rate": 9.545359975498012e-06, + "loss": 4.5639, + "step": 3341 + }, + { + "epoch": 0.6697394789579159, + "grad_norm": 35.47473669570102, + "learning_rate": 9.544874098866054e-06, + "loss": 4.1819, + "step": 3342 + }, + { + "epoch": 0.6699398797595191, + "grad_norm": 22.509704559427096, + "learning_rate": 9.544387975122581e-06, + "loss": 3.9461, + "step": 3343 + }, + { + "epoch": 0.6701402805611223, + "grad_norm": 25.04300230857879, + "learning_rate": 9.543901604294026e-06, + "loss": 4.0063, + "step": 3344 + }, + { + "epoch": 0.6703406813627254, + "grad_norm": 28.674635092426627, + "learning_rate": 9.543414986406834e-06, + "loss": 4.7849, + "step": 3345 + }, + { + "epoch": 0.6705410821643286, + "grad_norm": 26.061911527173677, + "learning_rate": 9.542928121487462e-06, + "loss": 3.7231, + "step": 3346 + }, + { + "epoch": 0.6707414829659318, + "grad_norm": 35.76316620426208, + "learning_rate": 9.542441009562383e-06, + "loss": 4.4783, + "step": 3347 + }, + { + "epoch": 0.670941883767535, + "grad_norm": 19.143735813705423, + "learning_rate": 9.541953650658078e-06, + "loss": 4.1449, + "step": 3348 + }, + { + "epoch": 0.6711422845691383, + "grad_norm": 25.361734815158464, + "learning_rate": 
9.54146604480105e-06, + "loss": 3.7935, + "step": 3349 + }, + { + "epoch": 0.6713426853707415, + "grad_norm": 15.588279862314014, + "learning_rate": 9.540978192017811e-06, + "loss": 3.8902, + "step": 3350 + }, + { + "epoch": 0.6715430861723447, + "grad_norm": 22.011766771701094, + "learning_rate": 9.540490092334884e-06, + "loss": 3.9593, + "step": 3351 + }, + { + "epoch": 0.6717434869739479, + "grad_norm": 60.094371483142645, + "learning_rate": 9.540001745778808e-06, + "loss": 4.5251, + "step": 3352 + }, + { + "epoch": 0.6719438877755511, + "grad_norm": 30.446212184554547, + "learning_rate": 9.539513152376134e-06, + "loss": 4.6159, + "step": 3353 + }, + { + "epoch": 0.6721442885771544, + "grad_norm": 23.747939233000523, + "learning_rate": 9.53902431215343e-06, + "loss": 4.2869, + "step": 3354 + }, + { + "epoch": 0.6723446893787575, + "grad_norm": 28.087787426068143, + "learning_rate": 9.538535225137276e-06, + "loss": 4.6727, + "step": 3355 + }, + { + "epoch": 0.6725450901803607, + "grad_norm": 19.016960693531548, + "learning_rate": 9.53804589135426e-06, + "loss": 4.1361, + "step": 3356 + }, + { + "epoch": 0.6727454909819639, + "grad_norm": 25.055753754206897, + "learning_rate": 9.537556310830991e-06, + "loss": 4.9723, + "step": 3357 + }, + { + "epoch": 0.6729458917835671, + "grad_norm": 21.89198139307543, + "learning_rate": 9.537066483594087e-06, + "loss": 4.1478, + "step": 3358 + }, + { + "epoch": 0.6731462925851703, + "grad_norm": 23.53926288488303, + "learning_rate": 9.536576409670179e-06, + "loss": 4.273, + "step": 3359 + }, + { + "epoch": 0.6733466933867736, + "grad_norm": 26.58411756937917, + "learning_rate": 9.536086089085918e-06, + "loss": 4.4373, + "step": 3360 + }, + { + "epoch": 0.6735470941883768, + "grad_norm": 20.00444780129293, + "learning_rate": 9.53559552186796e-06, + "loss": 4.2038, + "step": 3361 + }, + { + "epoch": 0.67374749498998, + "grad_norm": 21.018308052347585, + "learning_rate": 9.535104708042978e-06, + "loss": 4.4154, + "step": 3362 + }, + { + "epoch": 0.6739478957915832, + "grad_norm": 25.940124806336115, + "learning_rate": 9.534613647637657e-06, + "loss": 4.4332, + "step": 3363 + }, + { + "epoch": 0.6741482965931864, + "grad_norm": 18.288044312154447, + "learning_rate": 9.5341223406787e-06, + "loss": 3.8852, + "step": 3364 + }, + { + "epoch": 0.6743486973947895, + "grad_norm": 18.341724924726147, + "learning_rate": 9.533630787192817e-06, + "loss": 3.72, + "step": 3365 + }, + { + "epoch": 0.6745490981963927, + "grad_norm": 21.521162496526138, + "learning_rate": 9.533138987206737e-06, + "loss": 3.8366, + "step": 3366 + }, + { + "epoch": 0.674749498997996, + "grad_norm": 23.613423455984602, + "learning_rate": 9.532646940747199e-06, + "loss": 4.7616, + "step": 3367 + }, + { + "epoch": 0.6749498997995992, + "grad_norm": 17.740651387805777, + "learning_rate": 9.532154647840955e-06, + "loss": 3.9596, + "step": 3368 + }, + { + "epoch": 0.6751503006012024, + "grad_norm": 14.800555518564257, + "learning_rate": 9.531662108514775e-06, + "loss": 3.8154, + "step": 3369 + }, + { + "epoch": 0.6753507014028056, + "grad_norm": 28.109460434926945, + "learning_rate": 9.531169322795433e-06, + "loss": 4.6086, + "step": 3370 + }, + { + "epoch": 0.6755511022044088, + "grad_norm": 25.380027282378613, + "learning_rate": 9.530676290709728e-06, + "loss": 4.082, + "step": 3371 + }, + { + "epoch": 0.675751503006012, + "grad_norm": 33.43190607038869, + "learning_rate": 9.530183012284465e-06, + "loss": 3.9336, + "step": 3372 + }, + { + "epoch": 0.6759519038076153, + "grad_norm": 
23.97620592600136, + "learning_rate": 9.529689487546467e-06, + "loss": 4.4517, + "step": 3373 + }, + { + "epoch": 0.6761523046092185, + "grad_norm": 38.08093563019005, + "learning_rate": 9.529195716522563e-06, + "loss": 4.7429, + "step": 3374 + }, + { + "epoch": 0.6763527054108216, + "grad_norm": 26.177054091276233, + "learning_rate": 9.528701699239601e-06, + "loss": 3.7976, + "step": 3375 + }, + { + "epoch": 0.6765531062124248, + "grad_norm": 28.954699928270465, + "learning_rate": 9.528207435724445e-06, + "loss": 4.7118, + "step": 3376 + }, + { + "epoch": 0.676753507014028, + "grad_norm": 22.289824307781302, + "learning_rate": 9.527712926003965e-06, + "loss": 3.973, + "step": 3377 + }, + { + "epoch": 0.6769539078156313, + "grad_norm": 21.851994494636372, + "learning_rate": 9.52721817010505e-06, + "loss": 4.0788, + "step": 3378 + }, + { + "epoch": 0.6771543086172345, + "grad_norm": 62.553846629630335, + "learning_rate": 9.5267231680546e-06, + "loss": 5.1043, + "step": 3379 + }, + { + "epoch": 0.6773547094188377, + "grad_norm": 28.135553352183624, + "learning_rate": 9.52622791987953e-06, + "loss": 3.9238, + "step": 3380 + }, + { + "epoch": 0.6775551102204409, + "grad_norm": 34.10457329211767, + "learning_rate": 9.525732425606766e-06, + "loss": 4.8682, + "step": 3381 + }, + { + "epoch": 0.6777555110220441, + "grad_norm": 19.783925721997907, + "learning_rate": 9.525236685263251e-06, + "loss": 3.7088, + "step": 3382 + }, + { + "epoch": 0.6779559118236473, + "grad_norm": 16.913157753918256, + "learning_rate": 9.524740698875936e-06, + "loss": 4.3415, + "step": 3383 + }, + { + "epoch": 0.6781563126252504, + "grad_norm": 22.249008647704528, + "learning_rate": 9.524244466471791e-06, + "loss": 4.4124, + "step": 3384 + }, + { + "epoch": 0.6783567134268537, + "grad_norm": 22.98900297516967, + "learning_rate": 9.523747988077794e-06, + "loss": 4.0758, + "step": 3385 + }, + { + "epoch": 0.6785571142284569, + "grad_norm": 22.67041759710577, + "learning_rate": 9.523251263720943e-06, + "loss": 4.4735, + "step": 3386 + }, + { + "epoch": 0.6787575150300601, + "grad_norm": 18.708360045413663, + "learning_rate": 9.522754293428243e-06, + "loss": 3.9498, + "step": 3387 + }, + { + "epoch": 0.6789579158316633, + "grad_norm": 24.107604523783657, + "learning_rate": 9.522257077226717e-06, + "loss": 4.6891, + "step": 3388 + }, + { + "epoch": 0.6791583166332665, + "grad_norm": 25.482986186639813, + "learning_rate": 9.521759615143397e-06, + "loss": 4.0935, + "step": 3389 + }, + { + "epoch": 0.6793587174348698, + "grad_norm": 22.78951085132593, + "learning_rate": 9.521261907205332e-06, + "loss": 4.0567, + "step": 3390 + }, + { + "epoch": 0.679559118236473, + "grad_norm": 27.200333141773186, + "learning_rate": 9.520763953439585e-06, + "loss": 4.3725, + "step": 3391 + }, + { + "epoch": 0.6797595190380762, + "grad_norm": 53.62880114153336, + "learning_rate": 9.520265753873226e-06, + "loss": 4.1621, + "step": 3392 + }, + { + "epoch": 0.6799599198396794, + "grad_norm": 20.77810007036439, + "learning_rate": 9.519767308533348e-06, + "loss": 4.5789, + "step": 3393 + }, + { + "epoch": 0.6801603206412825, + "grad_norm": 17.738612328635643, + "learning_rate": 9.519268617447048e-06, + "loss": 3.8522, + "step": 3394 + }, + { + "epoch": 0.6803607214428857, + "grad_norm": 26.92306984260005, + "learning_rate": 9.518769680641442e-06, + "loss": 4.3049, + "step": 3395 + }, + { + "epoch": 0.680561122244489, + "grad_norm": 25.029116528014892, + "learning_rate": 9.518270498143659e-06, + "loss": 4.0168, + "step": 3396 + }, + { + "epoch": 
0.6807615230460922, + "grad_norm": 16.21795640734512, + "learning_rate": 9.517771069980838e-06, + "loss": 3.7622, + "step": 3397 + }, + { + "epoch": 0.6809619238476954, + "grad_norm": 23.67710100842685, + "learning_rate": 9.517271396180137e-06, + "loss": 4.1818, + "step": 3398 + }, + { + "epoch": 0.6811623246492986, + "grad_norm": 19.674290978089726, + "learning_rate": 9.516771476768723e-06, + "loss": 3.5104, + "step": 3399 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 24.88555428717208, + "learning_rate": 9.516271311773774e-06, + "loss": 4.366, + "step": 3400 + }, + { + "epoch": 0.681563126252505, + "grad_norm": 27.77103111502252, + "learning_rate": 9.515770901222487e-06, + "loss": 4.0584, + "step": 3401 + }, + { + "epoch": 0.6817635270541083, + "grad_norm": 27.98939229338618, + "learning_rate": 9.515270245142072e-06, + "loss": 4.2022, + "step": 3402 + }, + { + "epoch": 0.6819639278557115, + "grad_norm": 28.3503887874771, + "learning_rate": 9.514769343559748e-06, + "loss": 4.2532, + "step": 3403 + }, + { + "epoch": 0.6821643286573146, + "grad_norm": 20.642944229762712, + "learning_rate": 9.514268196502749e-06, + "loss": 4.2258, + "step": 3404 + }, + { + "epoch": 0.6823647294589178, + "grad_norm": 23.112636091022228, + "learning_rate": 9.513766803998326e-06, + "loss": 4.0064, + "step": 3405 + }, + { + "epoch": 0.682565130260521, + "grad_norm": 24.58920791823618, + "learning_rate": 9.51326516607374e-06, + "loss": 3.8367, + "step": 3406 + }, + { + "epoch": 0.6827655310621242, + "grad_norm": 35.15709156182899, + "learning_rate": 9.512763282756263e-06, + "loss": 3.7081, + "step": 3407 + }, + { + "epoch": 0.6829659318637274, + "grad_norm": 18.189945889036526, + "learning_rate": 9.512261154073184e-06, + "loss": 3.9056, + "step": 3408 + }, + { + "epoch": 0.6831663326653307, + "grad_norm": 49.89632504393308, + "learning_rate": 9.511758780051807e-06, + "loss": 4.9834, + "step": 3409 + }, + { + "epoch": 0.6833667334669339, + "grad_norm": 28.29914590560623, + "learning_rate": 9.511256160719444e-06, + "loss": 3.8378, + "step": 3410 + }, + { + "epoch": 0.6835671342685371, + "grad_norm": 22.191711940271247, + "learning_rate": 9.510753296103425e-06, + "loss": 3.982, + "step": 3411 + }, + { + "epoch": 0.6837675350701403, + "grad_norm": 27.930778065883487, + "learning_rate": 9.51025018623109e-06, + "loss": 4.0991, + "step": 3412 + }, + { + "epoch": 0.6839679358717435, + "grad_norm": 21.163269923112733, + "learning_rate": 9.509746831129798e-06, + "loss": 4.0627, + "step": 3413 + }, + { + "epoch": 0.6841683366733466, + "grad_norm": 22.16511091316921, + "learning_rate": 9.50924323082691e-06, + "loss": 3.881, + "step": 3414 + }, + { + "epoch": 0.6843687374749499, + "grad_norm": 31.671553002059454, + "learning_rate": 9.508739385349812e-06, + "loss": 4.2773, + "step": 3415 + }, + { + "epoch": 0.6845691382765531, + "grad_norm": 21.136399411376097, + "learning_rate": 9.508235294725899e-06, + "loss": 4.201, + "step": 3416 + }, + { + "epoch": 0.6847695390781563, + "grad_norm": 24.62708703101997, + "learning_rate": 9.507730958982579e-06, + "loss": 3.7573, + "step": 3417 + }, + { + "epoch": 0.6849699398797595, + "grad_norm": 43.728711513505644, + "learning_rate": 9.50722637814727e-06, + "loss": 4.6951, + "step": 3418 + }, + { + "epoch": 0.6851703406813627, + "grad_norm": 21.458974610991568, + "learning_rate": 9.506721552247412e-06, + "loss": 4.0711, + "step": 3419 + }, + { + "epoch": 0.685370741482966, + "grad_norm": 26.005737970296025, + "learning_rate": 9.506216481310449e-06, + "loss": 4.0629, + "step": 
3420 + }, + { + "epoch": 0.6855711422845692, + "grad_norm": 24.396923189380228, + "learning_rate": 9.505711165363846e-06, + "loss": 4.3057, + "step": 3421 + }, + { + "epoch": 0.6857715430861724, + "grad_norm": 22.61427500529006, + "learning_rate": 9.505205604435075e-06, + "loss": 4.072, + "step": 3422 + }, + { + "epoch": 0.6859719438877756, + "grad_norm": 34.87388277431055, + "learning_rate": 9.504699798551624e-06, + "loss": 4.0444, + "step": 3423 + }, + { + "epoch": 0.6861723446893787, + "grad_norm": 18.84070340775345, + "learning_rate": 9.504193747740998e-06, + "loss": 3.6129, + "step": 3424 + }, + { + "epoch": 0.6863727454909819, + "grad_norm": 21.589007868538847, + "learning_rate": 9.503687452030706e-06, + "loss": 4.6234, + "step": 3425 + }, + { + "epoch": 0.6865731462925851, + "grad_norm": 33.291074505626646, + "learning_rate": 9.50318091144828e-06, + "loss": 4.8434, + "step": 3426 + }, + { + "epoch": 0.6867735470941884, + "grad_norm": 32.88068025238109, + "learning_rate": 9.502674126021262e-06, + "loss": 4.3236, + "step": 3427 + }, + { + "epoch": 0.6869739478957916, + "grad_norm": 28.286344772672148, + "learning_rate": 9.502167095777204e-06, + "loss": 4.5523, + "step": 3428 + }, + { + "epoch": 0.6871743486973948, + "grad_norm": 30.24393171248298, + "learning_rate": 9.501659820743676e-06, + "loss": 4.425, + "step": 3429 + }, + { + "epoch": 0.687374749498998, + "grad_norm": 23.630331697894064, + "learning_rate": 9.501152300948258e-06, + "loss": 3.8341, + "step": 3430 + }, + { + "epoch": 0.6875751503006012, + "grad_norm": 17.460265486338866, + "learning_rate": 9.500644536418544e-06, + "loss": 3.8419, + "step": 3431 + }, + { + "epoch": 0.6877755511022045, + "grad_norm": 29.425874106820626, + "learning_rate": 9.500136527182145e-06, + "loss": 4.8182, + "step": 3432 + }, + { + "epoch": 0.6879759519038077, + "grad_norm": 28.424775085216677, + "learning_rate": 9.499628273266679e-06, + "loss": 4.4324, + "step": 3433 + }, + { + "epoch": 0.6881763527054108, + "grad_norm": 20.678773387051223, + "learning_rate": 9.499119774699782e-06, + "loss": 4.3254, + "step": 3434 + }, + { + "epoch": 0.688376753507014, + "grad_norm": 24.191238725374884, + "learning_rate": 9.498611031509101e-06, + "loss": 4.4539, + "step": 3435 + }, + { + "epoch": 0.6885771543086172, + "grad_norm": 21.596185652593643, + "learning_rate": 9.4981020437223e-06, + "loss": 4.7282, + "step": 3436 + }, + { + "epoch": 0.6887775551102204, + "grad_norm": 53.27172533778382, + "learning_rate": 9.497592811367048e-06, + "loss": 3.9654, + "step": 3437 + }, + { + "epoch": 0.6889779559118236, + "grad_norm": 18.908822469819096, + "learning_rate": 9.497083334471038e-06, + "loss": 4.2379, + "step": 3438 + }, + { + "epoch": 0.6891783567134269, + "grad_norm": 20.145164786160677, + "learning_rate": 9.496573613061969e-06, + "loss": 4.6727, + "step": 3439 + }, + { + "epoch": 0.6893787575150301, + "grad_norm": 22.084544765963226, + "learning_rate": 9.496063647167552e-06, + "loss": 4.1405, + "step": 3440 + }, + { + "epoch": 0.6895791583166333, + "grad_norm": 22.127059599728952, + "learning_rate": 9.495553436815519e-06, + "loss": 4.5051, + "step": 3441 + }, + { + "epoch": 0.6897795591182365, + "grad_norm": 28.185542765668593, + "learning_rate": 9.49504298203361e-06, + "loss": 4.1041, + "step": 3442 + }, + { + "epoch": 0.6899799599198396, + "grad_norm": 32.04410089222457, + "learning_rate": 9.49453228284958e-06, + "loss": 4.9284, + "step": 3443 + }, + { + "epoch": 0.6901803607214428, + "grad_norm": 26.10350635829036, + "learning_rate": 9.494021339291195e-06, 
+ "loss": 4.1074, + "step": 3444 + }, + { + "epoch": 0.6903807615230461, + "grad_norm": 22.16738547495424, + "learning_rate": 9.493510151386235e-06, + "loss": 4.4438, + "step": 3445 + }, + { + "epoch": 0.6905811623246493, + "grad_norm": 42.604198002600214, + "learning_rate": 9.492998719162495e-06, + "loss": 4.2162, + "step": 3446 + }, + { + "epoch": 0.6907815631262525, + "grad_norm": 33.755894971017064, + "learning_rate": 9.492487042647782e-06, + "loss": 4.2678, + "step": 3447 + }, + { + "epoch": 0.6909819639278557, + "grad_norm": 33.109473203459544, + "learning_rate": 9.491975121869918e-06, + "loss": 4.6733, + "step": 3448 + }, + { + "epoch": 0.6911823647294589, + "grad_norm": 30.778692862322057, + "learning_rate": 9.491462956856737e-06, + "loss": 4.379, + "step": 3449 + }, + { + "epoch": 0.6913827655310621, + "grad_norm": 29.539977564790846, + "learning_rate": 9.490950547636082e-06, + "loss": 4.6943, + "step": 3450 + }, + { + "epoch": 0.6915831663326654, + "grad_norm": 22.61637983647758, + "learning_rate": 9.49043789423582e-06, + "loss": 4.2491, + "step": 3451 + }, + { + "epoch": 0.6917835671342686, + "grad_norm": 49.47779910006682, + "learning_rate": 9.489924996683817e-06, + "loss": 4.1873, + "step": 3452 + }, + { + "epoch": 0.6919839679358717, + "grad_norm": 29.532586101899515, + "learning_rate": 9.489411855007967e-06, + "loss": 4.2274, + "step": 3453 + }, + { + "epoch": 0.6921843687374749, + "grad_norm": 21.293069101804694, + "learning_rate": 9.488898469236167e-06, + "loss": 4.1956, + "step": 3454 + }, + { + "epoch": 0.6923847695390781, + "grad_norm": 32.505553478322064, + "learning_rate": 9.488384839396332e-06, + "loss": 4.2808, + "step": 3455 + }, + { + "epoch": 0.6925851703406813, + "grad_norm": 23.32166636493729, + "learning_rate": 9.487870965516386e-06, + "loss": 4.3234, + "step": 3456 + }, + { + "epoch": 0.6927855711422846, + "grad_norm": 22.902625533041718, + "learning_rate": 9.487356847624272e-06, + "loss": 3.6486, + "step": 3457 + }, + { + "epoch": 0.6929859719438878, + "grad_norm": 26.10916560969004, + "learning_rate": 9.48684248574794e-06, + "loss": 3.9002, + "step": 3458 + }, + { + "epoch": 0.693186372745491, + "grad_norm": 47.333116714667725, + "learning_rate": 9.486327879915362e-06, + "loss": 4.6033, + "step": 3459 + }, + { + "epoch": 0.6933867735470942, + "grad_norm": 24.652481484672997, + "learning_rate": 9.485813030154514e-06, + "loss": 3.6754, + "step": 3460 + }, + { + "epoch": 0.6935871743486974, + "grad_norm": 23.124999912889688, + "learning_rate": 9.48529793649339e-06, + "loss": 4.1065, + "step": 3461 + }, + { + "epoch": 0.6937875751503007, + "grad_norm": 27.846555662069033, + "learning_rate": 9.484782598959994e-06, + "loss": 3.9809, + "step": 3462 + }, + { + "epoch": 0.6939879759519038, + "grad_norm": 31.271594005495878, + "learning_rate": 9.48426701758235e-06, + "loss": 4.5445, + "step": 3463 + }, + { + "epoch": 0.694188376753507, + "grad_norm": 29.54014314708965, + "learning_rate": 9.483751192388489e-06, + "loss": 3.9571, + "step": 3464 + }, + { + "epoch": 0.6943887775551102, + "grad_norm": 28.610485022711444, + "learning_rate": 9.483235123406456e-06, + "loss": 4.3599, + "step": 3465 + }, + { + "epoch": 0.6945891783567134, + "grad_norm": 69.10788507804858, + "learning_rate": 9.482718810664313e-06, + "loss": 4.4553, + "step": 3466 + }, + { + "epoch": 0.6947895791583166, + "grad_norm": 34.58635548150017, + "learning_rate": 9.48220225419013e-06, + "loss": 4.8919, + "step": 3467 + }, + { + "epoch": 0.6949899799599198, + "grad_norm": 50.72331440763535, + 
"learning_rate": 9.481685454011992e-06, + "loss": 4.162, + "step": 3468 + }, + { + "epoch": 0.6951903807615231, + "grad_norm": 28.24937285807105, + "learning_rate": 9.481168410158004e-06, + "loss": 4.0302, + "step": 3469 + }, + { + "epoch": 0.6953907815631263, + "grad_norm": 32.65852500259858, + "learning_rate": 9.480651122656274e-06, + "loss": 5.1825, + "step": 3470 + }, + { + "epoch": 0.6955911823647295, + "grad_norm": 25.55914832702962, + "learning_rate": 9.480133591534926e-06, + "loss": 4.7573, + "step": 3471 + }, + { + "epoch": 0.6957915831663327, + "grad_norm": 16.72056049819987, + "learning_rate": 9.479615816822104e-06, + "loss": 3.3043, + "step": 3472 + }, + { + "epoch": 0.6959919839679358, + "grad_norm": 22.761460683189707, + "learning_rate": 9.479097798545958e-06, + "loss": 4.2961, + "step": 3473 + }, + { + "epoch": 0.696192384769539, + "grad_norm": 32.86548677996912, + "learning_rate": 9.47857953673465e-06, + "loss": 4.5544, + "step": 3474 + }, + { + "epoch": 0.6963927855711423, + "grad_norm": 35.464702031213456, + "learning_rate": 9.478061031416364e-06, + "loss": 4.4629, + "step": 3475 + }, + { + "epoch": 0.6965931863727455, + "grad_norm": 43.2028777629792, + "learning_rate": 9.477542282619287e-06, + "loss": 5.0896, + "step": 3476 + }, + { + "epoch": 0.6967935871743487, + "grad_norm": 23.728376272935474, + "learning_rate": 9.47702329037163e-06, + "loss": 3.9433, + "step": 3477 + }, + { + "epoch": 0.6969939879759519, + "grad_norm": 28.725763605929778, + "learning_rate": 9.476504054701605e-06, + "loss": 3.6515, + "step": 3478 + }, + { + "epoch": 0.6971943887775551, + "grad_norm": 26.801788309987902, + "learning_rate": 9.475984575637448e-06, + "loss": 4.002, + "step": 3479 + }, + { + "epoch": 0.6973947895791583, + "grad_norm": 30.723828245613475, + "learning_rate": 9.475464853207402e-06, + "loss": 4.1826, + "step": 3480 + }, + { + "epoch": 0.6975951903807616, + "grad_norm": 20.907398704228346, + "learning_rate": 9.474944887439726e-06, + "loss": 4.4195, + "step": 3481 + }, + { + "epoch": 0.6977955911823648, + "grad_norm": 39.92263912107618, + "learning_rate": 9.47442467836269e-06, + "loss": 4.1799, + "step": 3482 + }, + { + "epoch": 0.6979959919839679, + "grad_norm": 31.823641323195268, + "learning_rate": 9.47390422600458e-06, + "loss": 4.6431, + "step": 3483 + }, + { + "epoch": 0.6981963927855711, + "grad_norm": 17.613413564876446, + "learning_rate": 9.473383530393693e-06, + "loss": 4.3173, + "step": 3484 + }, + { + "epoch": 0.6983967935871743, + "grad_norm": 28.046508833686367, + "learning_rate": 9.472862591558342e-06, + "loss": 5.1142, + "step": 3485 + }, + { + "epoch": 0.6985971943887775, + "grad_norm": 26.4168318815832, + "learning_rate": 9.472341409526847e-06, + "loss": 4.2058, + "step": 3486 + }, + { + "epoch": 0.6987975951903808, + "grad_norm": 34.633743601554, + "learning_rate": 9.471819984327549e-06, + "loss": 4.074, + "step": 3487 + }, + { + "epoch": 0.698997995991984, + "grad_norm": 26.56982311377568, + "learning_rate": 9.471298315988797e-06, + "loss": 3.9693, + "step": 3488 + }, + { + "epoch": 0.6991983967935872, + "grad_norm": 20.770057933930143, + "learning_rate": 9.470776404538955e-06, + "loss": 4.4471, + "step": 3489 + }, + { + "epoch": 0.6993987975951904, + "grad_norm": 27.048643828809503, + "learning_rate": 9.470254250006401e-06, + "loss": 4.2173, + "step": 3490 + }, + { + "epoch": 0.6995991983967936, + "grad_norm": 26.47694586003793, + "learning_rate": 9.469731852419524e-06, + "loss": 4.18, + "step": 3491 + }, + { + "epoch": 0.6997995991983968, + "grad_norm": 
19.316849631235378, + "learning_rate": 9.46920921180673e-06, + "loss": 4.1577, + "step": 3492 + }, + { + "epoch": 0.7, + "grad_norm": 28.480626600707126, + "learning_rate": 9.468686328196432e-06, + "loss": 4.0803, + "step": 3493 + }, + { + "epoch": 0.7002004008016032, + "grad_norm": 26.371340429823196, + "learning_rate": 9.468163201617063e-06, + "loss": 4.4694, + "step": 3494 + }, + { + "epoch": 0.7004008016032064, + "grad_norm": 22.58436903117005, + "learning_rate": 9.467639832097063e-06, + "loss": 4.9161, + "step": 3495 + }, + { + "epoch": 0.7006012024048096, + "grad_norm": 21.835660851171983, + "learning_rate": 9.467116219664893e-06, + "loss": 3.9806, + "step": 3496 + }, + { + "epoch": 0.7008016032064128, + "grad_norm": 39.414831468097525, + "learning_rate": 9.466592364349018e-06, + "loss": 4.424, + "step": 3497 + }, + { + "epoch": 0.701002004008016, + "grad_norm": 27.259415416616374, + "learning_rate": 9.466068266177925e-06, + "loss": 4.6008, + "step": 3498 + }, + { + "epoch": 0.7012024048096193, + "grad_norm": 23.736507343855322, + "learning_rate": 9.465543925180107e-06, + "loss": 4.1087, + "step": 3499 + }, + { + "epoch": 0.7014028056112225, + "grad_norm": 34.31371236863574, + "learning_rate": 9.465019341384073e-06, + "loss": 4.4276, + "step": 3500 + }, + { + "epoch": 0.7016032064128257, + "grad_norm": 18.9542293650197, + "learning_rate": 9.464494514818349e-06, + "loss": 4.171, + "step": 3501 + }, + { + "epoch": 0.7018036072144288, + "grad_norm": 21.43373350248574, + "learning_rate": 9.463969445511466e-06, + "loss": 3.9252, + "step": 3502 + }, + { + "epoch": 0.702004008016032, + "grad_norm": 31.97250683925423, + "learning_rate": 9.463444133491976e-06, + "loss": 4.7478, + "step": 3503 + }, + { + "epoch": 0.7022044088176352, + "grad_norm": 28.718402712854214, + "learning_rate": 9.462918578788438e-06, + "loss": 4.379, + "step": 3504 + }, + { + "epoch": 0.7024048096192385, + "grad_norm": 18.691227071729703, + "learning_rate": 9.46239278142943e-06, + "loss": 3.8364, + "step": 3505 + }, + { + "epoch": 0.7026052104208417, + "grad_norm": 23.71913661883715, + "learning_rate": 9.46186674144354e-06, + "loss": 4.181, + "step": 3506 + }, + { + "epoch": 0.7028056112224449, + "grad_norm": 26.4973374733743, + "learning_rate": 9.461340458859369e-06, + "loss": 4.3903, + "step": 3507 + }, + { + "epoch": 0.7030060120240481, + "grad_norm": 23.531143083921933, + "learning_rate": 9.46081393370553e-06, + "loss": 3.8438, + "step": 3508 + }, + { + "epoch": 0.7032064128256513, + "grad_norm": 18.771288805765884, + "learning_rate": 9.460287166010656e-06, + "loss": 4.314, + "step": 3509 + }, + { + "epoch": 0.7034068136272545, + "grad_norm": 53.55818214592884, + "learning_rate": 9.459760155803382e-06, + "loss": 4.3189, + "step": 3510 + }, + { + "epoch": 0.7036072144288578, + "grad_norm": 23.792023447796428, + "learning_rate": 9.459232903112366e-06, + "loss": 4.8012, + "step": 3511 + }, + { + "epoch": 0.7038076152304609, + "grad_norm": 27.24987234319823, + "learning_rate": 9.458705407966274e-06, + "loss": 4.3662, + "step": 3512 + }, + { + "epoch": 0.7040080160320641, + "grad_norm": 26.386409059102675, + "learning_rate": 9.458177670393786e-06, + "loss": 4.2654, + "step": 3513 + }, + { + "epoch": 0.7042084168336673, + "grad_norm": 23.020147766583626, + "learning_rate": 9.4576496904236e-06, + "loss": 4.0642, + "step": 3514 + }, + { + "epoch": 0.7044088176352705, + "grad_norm": 30.834755203927877, + "learning_rate": 9.457121468084419e-06, + "loss": 3.7099, + "step": 3515 + }, + { + "epoch": 0.7046092184368737, + 
"grad_norm": 25.302965068709014, + "learning_rate": 9.456593003404965e-06, + "loss": 3.7949, + "step": 3516 + }, + { + "epoch": 0.704809619238477, + "grad_norm": 19.946537306365943, + "learning_rate": 9.45606429641397e-06, + "loss": 4.1302, + "step": 3517 + }, + { + "epoch": 0.7050100200400802, + "grad_norm": 23.526324206749848, + "learning_rate": 9.45553534714018e-06, + "loss": 4.629, + "step": 3518 + }, + { + "epoch": 0.7052104208416834, + "grad_norm": 24.71164206104178, + "learning_rate": 9.455006155612361e-06, + "loss": 3.8125, + "step": 3519 + }, + { + "epoch": 0.7054108216432866, + "grad_norm": 24.860176562715232, + "learning_rate": 9.454476721859277e-06, + "loss": 4.2335, + "step": 3520 + }, + { + "epoch": 0.7056112224448898, + "grad_norm": 26.566825885874778, + "learning_rate": 9.453947045909719e-06, + "loss": 4.3426, + "step": 3521 + }, + { + "epoch": 0.7058116232464929, + "grad_norm": 22.85823273739799, + "learning_rate": 9.453417127792486e-06, + "loss": 4.0173, + "step": 3522 + }, + { + "epoch": 0.7060120240480962, + "grad_norm": 25.4276237911336, + "learning_rate": 9.45288696753639e-06, + "loss": 4.1473, + "step": 3523 + }, + { + "epoch": 0.7062124248496994, + "grad_norm": 18.529018077806896, + "learning_rate": 9.452356565170256e-06, + "loss": 3.618, + "step": 3524 + }, + { + "epoch": 0.7064128256513026, + "grad_norm": 41.35195280340972, + "learning_rate": 9.451825920722923e-06, + "loss": 4.5288, + "step": 3525 + }, + { + "epoch": 0.7066132264529058, + "grad_norm": 26.14468022162584, + "learning_rate": 9.451295034223245e-06, + "loss": 4.574, + "step": 3526 + }, + { + "epoch": 0.706813627254509, + "grad_norm": 20.27364592158989, + "learning_rate": 9.450763905700084e-06, + "loss": 4.3422, + "step": 3527 + }, + { + "epoch": 0.7070140280561122, + "grad_norm": 23.20148944838498, + "learning_rate": 9.45023253518232e-06, + "loss": 4.138, + "step": 3528 + }, + { + "epoch": 0.7072144288577155, + "grad_norm": 20.617936462380726, + "learning_rate": 9.449700922698843e-06, + "loss": 3.8923, + "step": 3529 + }, + { + "epoch": 0.7074148296593187, + "grad_norm": 31.095855764690683, + "learning_rate": 9.449169068278558e-06, + "loss": 4.3575, + "step": 3530 + }, + { + "epoch": 0.7076152304609219, + "grad_norm": 22.8570284358918, + "learning_rate": 9.448636971950385e-06, + "loss": 4.0554, + "step": 3531 + }, + { + "epoch": 0.707815631262525, + "grad_norm": 20.53499479491523, + "learning_rate": 9.448104633743252e-06, + "loss": 4.0193, + "step": 3532 + }, + { + "epoch": 0.7080160320641282, + "grad_norm": 33.31864636095604, + "learning_rate": 9.447572053686104e-06, + "loss": 4.244, + "step": 3533 + }, + { + "epoch": 0.7082164328657314, + "grad_norm": 33.604752869895414, + "learning_rate": 9.4470392318079e-06, + "loss": 4.5911, + "step": 3534 + }, + { + "epoch": 0.7084168336673347, + "grad_norm": 18.63495467117275, + "learning_rate": 9.446506168137605e-06, + "loss": 4.0034, + "step": 3535 + }, + { + "epoch": 0.7086172344689379, + "grad_norm": 21.32651041873567, + "learning_rate": 9.44597286270421e-06, + "loss": 4.0422, + "step": 3536 + }, + { + "epoch": 0.7088176352705411, + "grad_norm": 36.07936005667137, + "learning_rate": 9.445439315536704e-06, + "loss": 4.3896, + "step": 3537 + }, + { + "epoch": 0.7090180360721443, + "grad_norm": 19.243196152400188, + "learning_rate": 9.444905526664103e-06, + "loss": 3.7192, + "step": 3538 + }, + { + "epoch": 0.7092184368737475, + "grad_norm": 21.866171045933783, + "learning_rate": 9.444371496115425e-06, + "loss": 3.9572, + "step": 3539 + }, + { + "epoch": 
0.7094188376753507, + "grad_norm": 20.305075265631423, + "learning_rate": 9.443837223919711e-06, + "loss": 4.363, + "step": 3540 + }, + { + "epoch": 0.709619238476954, + "grad_norm": 26.871030801988713, + "learning_rate": 9.443302710106006e-06, + "loss": 4.0765, + "step": 3541 + }, + { + "epoch": 0.7098196392785571, + "grad_norm": 19.891426399699654, + "learning_rate": 9.442767954703373e-06, + "loss": 4.4794, + "step": 3542 + }, + { + "epoch": 0.7100200400801603, + "grad_norm": 31.468450096077753, + "learning_rate": 9.442232957740889e-06, + "loss": 4.3298, + "step": 3543 + }, + { + "epoch": 0.7102204408817635, + "grad_norm": 24.584584000486274, + "learning_rate": 9.441697719247642e-06, + "loss": 4.2963, + "step": 3544 + }, + { + "epoch": 0.7104208416833667, + "grad_norm": 26.179524886255475, + "learning_rate": 9.441162239252733e-06, + "loss": 4.1829, + "step": 3545 + }, + { + "epoch": 0.7106212424849699, + "grad_norm": 24.750297314256745, + "learning_rate": 9.440626517785276e-06, + "loss": 3.8327, + "step": 3546 + }, + { + "epoch": 0.7108216432865732, + "grad_norm": 27.118483443000198, + "learning_rate": 9.440090554874401e-06, + "loss": 4.4107, + "step": 3547 + }, + { + "epoch": 0.7110220440881764, + "grad_norm": 34.29965628428228, + "learning_rate": 9.43955435054925e-06, + "loss": 4.8486, + "step": 3548 + }, + { + "epoch": 0.7112224448897796, + "grad_norm": 29.68097839329633, + "learning_rate": 9.439017904838974e-06, + "loss": 4.5804, + "step": 3549 + }, + { + "epoch": 0.7114228456913828, + "grad_norm": 36.30790133225774, + "learning_rate": 9.438481217772744e-06, + "loss": 5.1028, + "step": 3550 + }, + { + "epoch": 0.7116232464929859, + "grad_norm": 18.60776770284051, + "learning_rate": 9.437944289379735e-06, + "loss": 3.7967, + "step": 3551 + }, + { + "epoch": 0.7118236472945891, + "grad_norm": 49.404294653890226, + "learning_rate": 9.437407119689147e-06, + "loss": 4.4524, + "step": 3552 + }, + { + "epoch": 0.7120240480961924, + "grad_norm": 36.555196854359714, + "learning_rate": 9.436869708730183e-06, + "loss": 4.1386, + "step": 3553 + }, + { + "epoch": 0.7122244488977956, + "grad_norm": 28.47321274227775, + "learning_rate": 9.436332056532062e-06, + "loss": 4.5539, + "step": 3554 + }, + { + "epoch": 0.7124248496993988, + "grad_norm": 19.78575414203938, + "learning_rate": 9.43579416312402e-06, + "loss": 3.7488, + "step": 3555 + }, + { + "epoch": 0.712625250501002, + "grad_norm": 16.860982601302183, + "learning_rate": 9.4352560285353e-06, + "loss": 3.88, + "step": 3556 + }, + { + "epoch": 0.7128256513026052, + "grad_norm": 19.255472903853235, + "learning_rate": 9.434717652795165e-06, + "loss": 3.9416, + "step": 3557 + }, + { + "epoch": 0.7130260521042084, + "grad_norm": 26.75075896788293, + "learning_rate": 9.434179035932882e-06, + "loss": 4.1925, + "step": 3558 + }, + { + "epoch": 0.7132264529058117, + "grad_norm": 25.63360692800062, + "learning_rate": 9.433640177977741e-06, + "loss": 4.1095, + "step": 3559 + }, + { + "epoch": 0.7134268537074149, + "grad_norm": 24.695520346121555, + "learning_rate": 9.43310107895904e-06, + "loss": 4.0195, + "step": 3560 + }, + { + "epoch": 0.713627254509018, + "grad_norm": 20.962110270821718, + "learning_rate": 9.432561738906088e-06, + "loss": 3.8605, + "step": 3561 + }, + { + "epoch": 0.7138276553106212, + "grad_norm": 29.75819186717548, + "learning_rate": 9.432022157848212e-06, + "loss": 4.1072, + "step": 3562 + }, + { + "epoch": 0.7140280561122244, + "grad_norm": 21.75981846837312, + "learning_rate": 9.431482335814749e-06, + "loss": 4.0864, + "step": 
3563 + }, + { + "epoch": 0.7142284569138276, + "grad_norm": 25.894944743079094, + "learning_rate": 9.430942272835048e-06, + "loss": 4.2871, + "step": 3564 + }, + { + "epoch": 0.7144288577154309, + "grad_norm": 26.923133505833896, + "learning_rate": 9.430401968938478e-06, + "loss": 4.3663, + "step": 3565 + }, + { + "epoch": 0.7146292585170341, + "grad_norm": 25.777168304330605, + "learning_rate": 9.42986142415441e-06, + "loss": 5.1008, + "step": 3566 + }, + { + "epoch": 0.7148296593186373, + "grad_norm": 49.53634886271763, + "learning_rate": 9.429320638512239e-06, + "loss": 4.7544, + "step": 3567 + }, + { + "epoch": 0.7150300601202405, + "grad_norm": 28.690025325953286, + "learning_rate": 9.428779612041367e-06, + "loss": 3.8022, + "step": 3568 + }, + { + "epoch": 0.7152304609218437, + "grad_norm": 16.91472606789252, + "learning_rate": 9.42823834477121e-06, + "loss": 3.9974, + "step": 3569 + }, + { + "epoch": 0.7154308617234469, + "grad_norm": 21.79596518573478, + "learning_rate": 9.427696836731197e-06, + "loss": 4.5733, + "step": 3570 + }, + { + "epoch": 0.71563126252505, + "grad_norm": 43.45399034172493, + "learning_rate": 9.42715508795077e-06, + "loss": 4.8112, + "step": 3571 + }, + { + "epoch": 0.7158316633266533, + "grad_norm": 30.357866544926637, + "learning_rate": 9.426613098459387e-06, + "loss": 4.4514, + "step": 3572 + }, + { + "epoch": 0.7160320641282565, + "grad_norm": 18.833649725948177, + "learning_rate": 9.426070868286516e-06, + "loss": 4.1484, + "step": 3573 + }, + { + "epoch": 0.7162324649298597, + "grad_norm": 27.190867984659537, + "learning_rate": 9.425528397461638e-06, + "loss": 4.4892, + "step": 3574 + }, + { + "epoch": 0.7164328657314629, + "grad_norm": 22.24955804410841, + "learning_rate": 9.424985686014248e-06, + "loss": 4.0087, + "step": 3575 + }, + { + "epoch": 0.7166332665330661, + "grad_norm": 19.73178712525854, + "learning_rate": 9.424442733973855e-06, + "loss": 4.254, + "step": 3576 + }, + { + "epoch": 0.7168336673346694, + "grad_norm": 18.28751189579725, + "learning_rate": 9.423899541369979e-06, + "loss": 4.4833, + "step": 3577 + }, + { + "epoch": 0.7170340681362726, + "grad_norm": 27.26924605357326, + "learning_rate": 9.423356108232154e-06, + "loss": 4.2526, + "step": 3578 + }, + { + "epoch": 0.7172344689378758, + "grad_norm": 34.5168749821047, + "learning_rate": 9.422812434589929e-06, + "loss": 4.3312, + "step": 3579 + }, + { + "epoch": 0.717434869739479, + "grad_norm": 22.043663232189076, + "learning_rate": 9.422268520472863e-06, + "loss": 4.4069, + "step": 3580 + }, + { + "epoch": 0.7176352705410821, + "grad_norm": 29.99100837807279, + "learning_rate": 9.42172436591053e-06, + "loss": 4.1318, + "step": 3581 + }, + { + "epoch": 0.7178356713426853, + "grad_norm": 41.05452247889694, + "learning_rate": 9.421179970932515e-06, + "loss": 4.6804, + "step": 3582 + }, + { + "epoch": 0.7180360721442886, + "grad_norm": 24.044233057414953, + "learning_rate": 9.42063533556842e-06, + "loss": 4.0135, + "step": 3583 + }, + { + "epoch": 0.7182364729458918, + "grad_norm": 27.088427266028994, + "learning_rate": 9.420090459847855e-06, + "loss": 3.7382, + "step": 3584 + }, + { + "epoch": 0.718436873747495, + "grad_norm": 20.21766350757318, + "learning_rate": 9.419545343800448e-06, + "loss": 4.4354, + "step": 3585 + }, + { + "epoch": 0.7186372745490982, + "grad_norm": 26.31327510349422, + "learning_rate": 9.418999987455838e-06, + "loss": 4.3297, + "step": 3586 + }, + { + "epoch": 0.7188376753507014, + "grad_norm": 22.58453346654097, + "learning_rate": 9.418454390843672e-06, + 
"loss": 4.0805, + "step": 3587 + }, + { + "epoch": 0.7190380761523046, + "grad_norm": 33.54523779911777, + "learning_rate": 9.417908553993622e-06, + "loss": 4.5493, + "step": 3588 + }, + { + "epoch": 0.7192384769539079, + "grad_norm": 25.051506897312425, + "learning_rate": 9.41736247693536e-06, + "loss": 4.5595, + "step": 3589 + }, + { + "epoch": 0.7194388777555111, + "grad_norm": 21.733337634709933, + "learning_rate": 9.416816159698582e-06, + "loss": 4.0414, + "step": 3590 + }, + { + "epoch": 0.7196392785571142, + "grad_norm": 22.798608996458555, + "learning_rate": 9.41626960231299e-06, + "loss": 4.2375, + "step": 3591 + }, + { + "epoch": 0.7198396793587174, + "grad_norm": 24.53759634261594, + "learning_rate": 9.415722804808298e-06, + "loss": 3.9261, + "step": 3592 + }, + { + "epoch": 0.7200400801603206, + "grad_norm": 25.555900643957507, + "learning_rate": 9.41517576721424e-06, + "loss": 3.7374, + "step": 3593 + }, + { + "epoch": 0.7202404809619238, + "grad_norm": 25.919168738368857, + "learning_rate": 9.414628489560557e-06, + "loss": 4.2752, + "step": 3594 + }, + { + "epoch": 0.720440881763527, + "grad_norm": 22.206050888151683, + "learning_rate": 9.414080971877007e-06, + "loss": 4.6925, + "step": 3595 + }, + { + "epoch": 0.7206412825651303, + "grad_norm": 23.858195176099684, + "learning_rate": 9.413533214193359e-06, + "loss": 3.8906, + "step": 3596 + }, + { + "epoch": 0.7208416833667335, + "grad_norm": 33.93330659853728, + "learning_rate": 9.412985216539396e-06, + "loss": 4.2433, + "step": 3597 + }, + { + "epoch": 0.7210420841683367, + "grad_norm": 48.91933004847266, + "learning_rate": 9.412436978944912e-06, + "loss": 4.6987, + "step": 3598 + }, + { + "epoch": 0.7212424849699399, + "grad_norm": 23.25623371840918, + "learning_rate": 9.411888501439716e-06, + "loss": 4.1029, + "step": 3599 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 40.319999954322604, + "learning_rate": 9.411339784053631e-06, + "loss": 4.2308, + "step": 3600 + }, + { + "epoch": 0.7216432865731462, + "grad_norm": 30.646438787324495, + "learning_rate": 9.410790826816489e-06, + "loss": 4.1606, + "step": 3601 + }, + { + "epoch": 0.7218436873747495, + "grad_norm": 27.592540897684287, + "learning_rate": 9.41024162975814e-06, + "loss": 3.8045, + "step": 3602 + }, + { + "epoch": 0.7220440881763527, + "grad_norm": 22.103687740999202, + "learning_rate": 9.409692192908442e-06, + "loss": 4.2771, + "step": 3603 + }, + { + "epoch": 0.7222444889779559, + "grad_norm": 22.498531696719756, + "learning_rate": 9.409142516297269e-06, + "loss": 3.9102, + "step": 3604 + }, + { + "epoch": 0.7224448897795591, + "grad_norm": 27.093032476377832, + "learning_rate": 9.408592599954512e-06, + "loss": 3.671, + "step": 3605 + }, + { + "epoch": 0.7226452905811623, + "grad_norm": 17.11443309166908, + "learning_rate": 9.408042443910066e-06, + "loss": 3.9666, + "step": 3606 + }, + { + "epoch": 0.7228456913827656, + "grad_norm": 23.506800801391773, + "learning_rate": 9.407492048193844e-06, + "loss": 3.9057, + "step": 3607 + }, + { + "epoch": 0.7230460921843688, + "grad_norm": 27.394976210951327, + "learning_rate": 9.406941412835775e-06, + "loss": 4.0213, + "step": 3608 + }, + { + "epoch": 0.723246492985972, + "grad_norm": 29.948697085595754, + "learning_rate": 9.406390537865797e-06, + "loss": 4.0428, + "step": 3609 + }, + { + "epoch": 0.7234468937875751, + "grad_norm": 30.53790185608089, + "learning_rate": 9.40583942331386e-06, + "loss": 4.648, + "step": 3610 + }, + { + "epoch": 0.7236472945891783, + "grad_norm": 26.083852884403825, + 
"learning_rate": 9.40528806920993e-06, + "loss": 4.7091, + "step": 3611 + }, + { + "epoch": 0.7238476953907815, + "grad_norm": 29.123102666913987, + "learning_rate": 9.404736475583984e-06, + "loss": 4.6102, + "step": 3612 + }, + { + "epoch": 0.7240480961923847, + "grad_norm": 21.546084856681677, + "learning_rate": 9.404184642466015e-06, + "loss": 3.7326, + "step": 3613 + }, + { + "epoch": 0.724248496993988, + "grad_norm": 25.936761396998417, + "learning_rate": 9.403632569886025e-06, + "loss": 4.0239, + "step": 3614 + }, + { + "epoch": 0.7244488977955912, + "grad_norm": 27.02412376536145, + "learning_rate": 9.403080257874033e-06, + "loss": 4.969, + "step": 3615 + }, + { + "epoch": 0.7246492985971944, + "grad_norm": 24.758502524330137, + "learning_rate": 9.402527706460066e-06, + "loss": 3.8313, + "step": 3616 + }, + { + "epoch": 0.7248496993987976, + "grad_norm": 61.133616756373094, + "learning_rate": 9.401974915674171e-06, + "loss": 4.4577, + "step": 3617 + }, + { + "epoch": 0.7250501002004008, + "grad_norm": 27.475971408494434, + "learning_rate": 9.4014218855464e-06, + "loss": 4.6174, + "step": 3618 + }, + { + "epoch": 0.725250501002004, + "grad_norm": 20.026234573806743, + "learning_rate": 9.400868616106827e-06, + "loss": 3.6433, + "step": 3619 + }, + { + "epoch": 0.7254509018036072, + "grad_norm": 23.634396783261106, + "learning_rate": 9.400315107385527e-06, + "loss": 3.8983, + "step": 3620 + }, + { + "epoch": 0.7256513026052104, + "grad_norm": 25.930042279495144, + "learning_rate": 9.399761359412603e-06, + "loss": 4.1678, + "step": 3621 + }, + { + "epoch": 0.7258517034068136, + "grad_norm": 21.261120830872766, + "learning_rate": 9.399207372218157e-06, + "loss": 3.8112, + "step": 3622 + }, + { + "epoch": 0.7260521042084168, + "grad_norm": 19.853285257309906, + "learning_rate": 9.398653145832312e-06, + "loss": 4.1632, + "step": 3623 + }, + { + "epoch": 0.72625250501002, + "grad_norm": 29.627341587341995, + "learning_rate": 9.398098680285203e-06, + "loss": 4.4402, + "step": 3624 + }, + { + "epoch": 0.7264529058116233, + "grad_norm": 45.1974041955132, + "learning_rate": 9.397543975606977e-06, + "loss": 4.2584, + "step": 3625 + }, + { + "epoch": 0.7266533066132265, + "grad_norm": 31.77609398249826, + "learning_rate": 9.396989031827793e-06, + "loss": 3.9487, + "step": 3626 + }, + { + "epoch": 0.7268537074148297, + "grad_norm": 39.73873225223963, + "learning_rate": 9.396433848977827e-06, + "loss": 5.0512, + "step": 3627 + }, + { + "epoch": 0.7270541082164329, + "grad_norm": 30.208215658193808, + "learning_rate": 9.395878427087259e-06, + "loss": 5.024, + "step": 3628 + }, + { + "epoch": 0.7272545090180361, + "grad_norm": 34.93970183243182, + "learning_rate": 9.395322766186295e-06, + "loss": 4.2277, + "step": 3629 + }, + { + "epoch": 0.7274549098196392, + "grad_norm": 21.51562810620663, + "learning_rate": 9.394766866305143e-06, + "loss": 4.1782, + "step": 3630 + }, + { + "epoch": 0.7276553106212424, + "grad_norm": 45.65807021814329, + "learning_rate": 9.39421072747403e-06, + "loss": 5.1389, + "step": 3631 + }, + { + "epoch": 0.7278557114228457, + "grad_norm": 41.535341902341074, + "learning_rate": 9.393654349723192e-06, + "loss": 4.7227, + "step": 3632 + }, + { + "epoch": 0.7280561122244489, + "grad_norm": 23.482645106364537, + "learning_rate": 9.393097733082882e-06, + "loss": 3.7006, + "step": 3633 + }, + { + "epoch": 0.7282565130260521, + "grad_norm": 24.83591116608835, + "learning_rate": 9.392540877583364e-06, + "loss": 4.2127, + "step": 3634 + }, + { + "epoch": 0.7284569138276553, + 
"grad_norm": 23.43568282470154, + "learning_rate": 9.391983783254915e-06, + "loss": 3.7867, + "step": 3635 + }, + { + "epoch": 0.7286573146292585, + "grad_norm": 18.98633639596008, + "learning_rate": 9.391426450127824e-06, + "loss": 3.6477, + "step": 3636 + }, + { + "epoch": 0.7288577154308618, + "grad_norm": 36.61710286791874, + "learning_rate": 9.390868878232394e-06, + "loss": 4.4523, + "step": 3637 + }, + { + "epoch": 0.729058116232465, + "grad_norm": 28.60213753177432, + "learning_rate": 9.39031106759894e-06, + "loss": 4.1667, + "step": 3638 + }, + { + "epoch": 0.7292585170340682, + "grad_norm": 24.792055628593392, + "learning_rate": 9.389753018257794e-06, + "loss": 4.1688, + "step": 3639 + }, + { + "epoch": 0.7294589178356713, + "grad_norm": 19.989526421858045, + "learning_rate": 9.389194730239298e-06, + "loss": 3.6354, + "step": 3640 + }, + { + "epoch": 0.7296593186372745, + "grad_norm": 67.5688323390281, + "learning_rate": 9.388636203573805e-06, + "loss": 4.0092, + "step": 3641 + }, + { + "epoch": 0.7298597194388777, + "grad_norm": 20.595768489578642, + "learning_rate": 9.388077438291682e-06, + "loss": 4.1175, + "step": 3642 + }, + { + "epoch": 0.730060120240481, + "grad_norm": 28.394840026885266, + "learning_rate": 9.387518434423312e-06, + "loss": 4.1612, + "step": 3643 + }, + { + "epoch": 0.7302605210420842, + "grad_norm": 15.16087805249705, + "learning_rate": 9.386959191999087e-06, + "loss": 3.6895, + "step": 3644 + }, + { + "epoch": 0.7304609218436874, + "grad_norm": 30.62971769181065, + "learning_rate": 9.386399711049417e-06, + "loss": 4.3715, + "step": 3645 + }, + { + "epoch": 0.7306613226452906, + "grad_norm": 20.951500919534222, + "learning_rate": 9.385839991604718e-06, + "loss": 4.0492, + "step": 3646 + }, + { + "epoch": 0.7308617234468938, + "grad_norm": 23.830675834905577, + "learning_rate": 9.385280033695425e-06, + "loss": 4.0076, + "step": 3647 + }, + { + "epoch": 0.731062124248497, + "grad_norm": 16.761737431131206, + "learning_rate": 9.384719837351986e-06, + "loss": 3.6114, + "step": 3648 + }, + { + "epoch": 0.7312625250501003, + "grad_norm": 24.841361511308474, + "learning_rate": 9.384159402604853e-06, + "loss": 4.0235, + "step": 3649 + }, + { + "epoch": 0.7314629258517034, + "grad_norm": 25.232825085961334, + "learning_rate": 9.383598729484504e-06, + "loss": 4.1891, + "step": 3650 + }, + { + "epoch": 0.7316633266533066, + "grad_norm": 19.98249763930306, + "learning_rate": 9.383037818021421e-06, + "loss": 4.0719, + "step": 3651 + }, + { + "epoch": 0.7318637274549098, + "grad_norm": 29.60392084592404, + "learning_rate": 9.382476668246101e-06, + "loss": 4.0697, + "step": 3652 + }, + { + "epoch": 0.732064128256513, + "grad_norm": 27.0120067206578, + "learning_rate": 9.381915280189056e-06, + "loss": 4.2168, + "step": 3653 + }, + { + "epoch": 0.7322645290581162, + "grad_norm": 34.43383635775801, + "learning_rate": 9.381353653880808e-06, + "loss": 4.559, + "step": 3654 + }, + { + "epoch": 0.7324649298597194, + "grad_norm": 26.44615651642056, + "learning_rate": 9.380791789351896e-06, + "loss": 3.9616, + "step": 3655 + }, + { + "epoch": 0.7326653306613227, + "grad_norm": 22.433406795247464, + "learning_rate": 9.380229686632867e-06, + "loss": 3.988, + "step": 3656 + }, + { + "epoch": 0.7328657314629259, + "grad_norm": 20.51184397883438, + "learning_rate": 9.379667345754284e-06, + "loss": 4.2933, + "step": 3657 + }, + { + "epoch": 0.7330661322645291, + "grad_norm": 16.7107001656469, + "learning_rate": 9.379104766746723e-06, + "loss": 3.6198, + "step": 3658 + }, + { + "epoch": 
0.7332665330661323, + "grad_norm": 20.353929456434056, + "learning_rate": 9.37854194964077e-06, + "loss": 4.4314, + "step": 3659 + }, + { + "epoch": 0.7334669338677354, + "grad_norm": 23.990301112838242, + "learning_rate": 9.37797889446703e-06, + "loss": 4.0522, + "step": 3660 + }, + { + "epoch": 0.7336673346693386, + "grad_norm": 25.118655637777593, + "learning_rate": 9.377415601256114e-06, + "loss": 4.5588, + "step": 3661 + }, + { + "epoch": 0.7338677354709419, + "grad_norm": 29.792704125442942, + "learning_rate": 9.376852070038651e-06, + "loss": 4.3925, + "step": 3662 + }, + { + "epoch": 0.7340681362725451, + "grad_norm": 22.56674114145205, + "learning_rate": 9.37628830084528e-06, + "loss": 4.0542, + "step": 3663 + }, + { + "epoch": 0.7342685370741483, + "grad_norm": 29.412062783962135, + "learning_rate": 9.375724293706653e-06, + "loss": 3.9388, + "step": 3664 + }, + { + "epoch": 0.7344689378757515, + "grad_norm": 23.625977846835678, + "learning_rate": 9.375160048653437e-06, + "loss": 4.6765, + "step": 3665 + }, + { + "epoch": 0.7346693386773547, + "grad_norm": 28.017916675321306, + "learning_rate": 9.374595565716312e-06, + "loss": 4.7015, + "step": 3666 + }, + { + "epoch": 0.734869739478958, + "grad_norm": 19.65274984124026, + "learning_rate": 9.374030844925967e-06, + "loss": 3.8006, + "step": 3667 + }, + { + "epoch": 0.7350701402805612, + "grad_norm": 22.295475089565837, + "learning_rate": 9.37346588631311e-06, + "loss": 3.7383, + "step": 3668 + }, + { + "epoch": 0.7352705410821643, + "grad_norm": 33.4623354036282, + "learning_rate": 9.372900689908457e-06, + "loss": 4.0455, + "step": 3669 + }, + { + "epoch": 0.7354709418837675, + "grad_norm": 23.93419375784039, + "learning_rate": 9.372335255742737e-06, + "loss": 4.2077, + "step": 3670 + }, + { + "epoch": 0.7356713426853707, + "grad_norm": 17.901195082250684, + "learning_rate": 9.371769583846694e-06, + "loss": 3.6574, + "step": 3671 + }, + { + "epoch": 0.7358717434869739, + "grad_norm": 18.458332891963902, + "learning_rate": 9.371203674251088e-06, + "loss": 4.1287, + "step": 3672 + }, + { + "epoch": 0.7360721442885771, + "grad_norm": 22.145552940621954, + "learning_rate": 9.370637526986685e-06, + "loss": 4.456, + "step": 3673 + }, + { + "epoch": 0.7362725450901804, + "grad_norm": 27.965633210444448, + "learning_rate": 9.370071142084269e-06, + "loss": 4.6824, + "step": 3674 + }, + { + "epoch": 0.7364729458917836, + "grad_norm": 21.197432810852376, + "learning_rate": 9.369504519574633e-06, + "loss": 4.2631, + "step": 3675 + }, + { + "epoch": 0.7366733466933868, + "grad_norm": 41.66215306954186, + "learning_rate": 9.368937659488586e-06, + "loss": 4.8376, + "step": 3676 + }, + { + "epoch": 0.73687374749499, + "grad_norm": 15.370846365309243, + "learning_rate": 9.36837056185695e-06, + "loss": 4.0662, + "step": 3677 + }, + { + "epoch": 0.7370741482965932, + "grad_norm": 24.967251554268447, + "learning_rate": 9.367803226710557e-06, + "loss": 3.9574, + "step": 3678 + }, + { + "epoch": 0.7372745490981963, + "grad_norm": 29.962471237165083, + "learning_rate": 9.367235654080256e-06, + "loss": 4.0695, + "step": 3679 + }, + { + "epoch": 0.7374749498997996, + "grad_norm": 177.30210957913397, + "learning_rate": 9.366667843996906e-06, + "loss": 5.0598, + "step": 3680 + }, + { + "epoch": 0.7376753507014028, + "grad_norm": 21.52164401670226, + "learning_rate": 9.36609979649138e-06, + "loss": 3.8885, + "step": 3681 + }, + { + "epoch": 0.737875751503006, + "grad_norm": 24.962408426340634, + "learning_rate": 9.365531511594561e-06, + "loss": 4.3526, + 
"step": 3682 + }, + { + "epoch": 0.7380761523046092, + "grad_norm": 62.3156154970035, + "learning_rate": 9.364962989337352e-06, + "loss": 4.0903, + "step": 3683 + }, + { + "epoch": 0.7382765531062124, + "grad_norm": 35.23582549244814, + "learning_rate": 9.36439422975066e-06, + "loss": 4.4316, + "step": 3684 + }, + { + "epoch": 0.7384769539078156, + "grad_norm": 45.77885617446484, + "learning_rate": 9.363825232865414e-06, + "loss": 4.1758, + "step": 3685 + }, + { + "epoch": 0.7386773547094189, + "grad_norm": 33.17236080483201, + "learning_rate": 9.363255998712546e-06, + "loss": 4.3417, + "step": 3686 + }, + { + "epoch": 0.7388777555110221, + "grad_norm": 28.081853544107705, + "learning_rate": 9.362686527323008e-06, + "loss": 4.956, + "step": 3687 + }, + { + "epoch": 0.7390781563126253, + "grad_norm": 17.759752649813816, + "learning_rate": 9.362116818727766e-06, + "loss": 4.0464, + "step": 3688 + }, + { + "epoch": 0.7392785571142284, + "grad_norm": 27.831364621191113, + "learning_rate": 9.361546872957793e-06, + "loss": 3.9023, + "step": 3689 + }, + { + "epoch": 0.7394789579158316, + "grad_norm": 24.09105158742217, + "learning_rate": 9.360976690044078e-06, + "loss": 4.1024, + "step": 3690 + }, + { + "epoch": 0.7396793587174348, + "grad_norm": 44.35845584754271, + "learning_rate": 9.360406270017623e-06, + "loss": 4.203, + "step": 3691 + }, + { + "epoch": 0.7398797595190381, + "grad_norm": 21.284021415798396, + "learning_rate": 9.359835612909441e-06, + "loss": 4.5252, + "step": 3692 + }, + { + "epoch": 0.7400801603206413, + "grad_norm": 28.662850153044356, + "learning_rate": 9.359264718750563e-06, + "loss": 4.3062, + "step": 3693 + }, + { + "epoch": 0.7402805611222445, + "grad_norm": 22.994790407843624, + "learning_rate": 9.358693587572027e-06, + "loss": 4.2404, + "step": 3694 + }, + { + "epoch": 0.7404809619238477, + "grad_norm": 20.258525243639088, + "learning_rate": 9.358122219404886e-06, + "loss": 4.2082, + "step": 3695 + }, + { + "epoch": 0.7406813627254509, + "grad_norm": 22.258687743283957, + "learning_rate": 9.357550614280206e-06, + "loss": 4.0084, + "step": 3696 + }, + { + "epoch": 0.7408817635270541, + "grad_norm": 25.235120854381922, + "learning_rate": 9.356978772229069e-06, + "loss": 4.0416, + "step": 3697 + }, + { + "epoch": 0.7410821643286574, + "grad_norm": 136.03904807906673, + "learning_rate": 9.356406693282563e-06, + "loss": 4.0638, + "step": 3698 + }, + { + "epoch": 0.7412825651302605, + "grad_norm": 25.00821081900605, + "learning_rate": 9.355834377471796e-06, + "loss": 4.5903, + "step": 3699 + }, + { + "epoch": 0.7414829659318637, + "grad_norm": 43.17130295710369, + "learning_rate": 9.355261824827882e-06, + "loss": 4.5043, + "step": 3700 + }, + { + "epoch": 0.7416833667334669, + "grad_norm": 25.60326443006193, + "learning_rate": 9.354689035381954e-06, + "loss": 4.2279, + "step": 3701 + }, + { + "epoch": 0.7418837675350701, + "grad_norm": 26.41713002144221, + "learning_rate": 9.354116009165157e-06, + "loss": 4.043, + "step": 3702 + }, + { + "epoch": 0.7420841683366733, + "grad_norm": 25.424068838461608, + "learning_rate": 9.353542746208643e-06, + "loss": 4.6737, + "step": 3703 + }, + { + "epoch": 0.7422845691382766, + "grad_norm": 24.339313637893362, + "learning_rate": 9.352969246543585e-06, + "loss": 4.1109, + "step": 3704 + }, + { + "epoch": 0.7424849699398798, + "grad_norm": 21.18563132613965, + "learning_rate": 9.352395510201161e-06, + "loss": 4.4662, + "step": 3705 + }, + { + "epoch": 0.742685370741483, + "grad_norm": 42.870063662020875, + "learning_rate": 
9.351821537212573e-06, + "loss": 4.7017, + "step": 3706 + }, + { + "epoch": 0.7428857715430862, + "grad_norm": 18.734023847125947, + "learning_rate": 9.351247327609022e-06, + "loss": 3.9891, + "step": 3707 + }, + { + "epoch": 0.7430861723446894, + "grad_norm": 23.442996408241633, + "learning_rate": 9.35067288142173e-06, + "loss": 4.3648, + "step": 3708 + }, + { + "epoch": 0.7432865731462925, + "grad_norm": 26.323843509801236, + "learning_rate": 9.35009819868193e-06, + "loss": 3.6972, + "step": 3709 + }, + { + "epoch": 0.7434869739478958, + "grad_norm": 25.032410267170924, + "learning_rate": 9.34952327942087e-06, + "loss": 4.1207, + "step": 3710 + }, + { + "epoch": 0.743687374749499, + "grad_norm": 25.59042449901656, + "learning_rate": 9.348948123669811e-06, + "loss": 3.808, + "step": 3711 + }, + { + "epoch": 0.7438877755511022, + "grad_norm": 19.935058434213147, + "learning_rate": 9.348372731460023e-06, + "loss": 3.725, + "step": 3712 + }, + { + "epoch": 0.7440881763527054, + "grad_norm": 21.60848926558819, + "learning_rate": 9.347797102822789e-06, + "loss": 4.4617, + "step": 3713 + }, + { + "epoch": 0.7442885771543086, + "grad_norm": 29.62126010342221, + "learning_rate": 9.34722123778941e-06, + "loss": 4.0909, + "step": 3714 + }, + { + "epoch": 0.7444889779559118, + "grad_norm": 39.08132515828897, + "learning_rate": 9.346645136391194e-06, + "loss": 3.8625, + "step": 3715 + }, + { + "epoch": 0.7446893787575151, + "grad_norm": 55.71624416623117, + "learning_rate": 9.346068798659466e-06, + "loss": 5.3611, + "step": 3716 + }, + { + "epoch": 0.7448897795591183, + "grad_norm": 20.16544492043266, + "learning_rate": 9.34549222462556e-06, + "loss": 3.8262, + "step": 3717 + }, + { + "epoch": 0.7450901803607214, + "grad_norm": 25.13820848524116, + "learning_rate": 9.34491541432083e-06, + "loss": 3.7138, + "step": 3718 + }, + { + "epoch": 0.7452905811623246, + "grad_norm": 21.959576063073936, + "learning_rate": 9.344338367776636e-06, + "loss": 4.1499, + "step": 3719 + }, + { + "epoch": 0.7454909819639278, + "grad_norm": 31.134467295560732, + "learning_rate": 9.34376108502435e-06, + "loss": 4.4441, + "step": 3720 + }, + { + "epoch": 0.745691382765531, + "grad_norm": 18.107063286815595, + "learning_rate": 9.343183566095364e-06, + "loss": 4.4348, + "step": 3721 + }, + { + "epoch": 0.7458917835671343, + "grad_norm": 25.20491003683394, + "learning_rate": 9.342605811021073e-06, + "loss": 4.861, + "step": 3722 + }, + { + "epoch": 0.7460921843687375, + "grad_norm": 23.744061211793504, + "learning_rate": 9.342027819832897e-06, + "loss": 4.132, + "step": 3723 + }, + { + "epoch": 0.7462925851703407, + "grad_norm": 50.88546837581109, + "learning_rate": 9.341449592562257e-06, + "loss": 4.2998, + "step": 3724 + }, + { + "epoch": 0.7464929859719439, + "grad_norm": 37.539218088524144, + "learning_rate": 9.340871129240595e-06, + "loss": 4.8322, + "step": 3725 + }, + { + "epoch": 0.7466933867735471, + "grad_norm": 25.993433138161585, + "learning_rate": 9.340292429899362e-06, + "loss": 3.7325, + "step": 3726 + }, + { + "epoch": 0.7468937875751503, + "grad_norm": 36.190756492385106, + "learning_rate": 9.339713494570022e-06, + "loss": 3.7188, + "step": 3727 + }, + { + "epoch": 0.7470941883767535, + "grad_norm": 28.565328282571567, + "learning_rate": 9.339134323284054e-06, + "loss": 3.3081, + "step": 3728 + }, + { + "epoch": 0.7472945891783567, + "grad_norm": 25.086736063657927, + "learning_rate": 9.338554916072948e-06, + "loss": 4.2086, + "step": 3729 + }, + { + "epoch": 0.7474949899799599, + "grad_norm": 
20.564608582924386, + "learning_rate": 9.337975272968206e-06, + "loss": 4.1191, + "step": 3730 + }, + { + "epoch": 0.7476953907815631, + "grad_norm": 18.59737092029698, + "learning_rate": 9.337395394001346e-06, + "loss": 3.965, + "step": 3731 + }, + { + "epoch": 0.7478957915831663, + "grad_norm": 22.14672716730449, + "learning_rate": 9.336815279203895e-06, + "loss": 4.2301, + "step": 3732 + }, + { + "epoch": 0.7480961923847695, + "grad_norm": 20.873752132100712, + "learning_rate": 9.336234928607396e-06, + "loss": 4.2167, + "step": 3733 + }, + { + "epoch": 0.7482965931863728, + "grad_norm": 30.570610163741584, + "learning_rate": 9.335654342243403e-06, + "loss": 3.9337, + "step": 3734 + }, + { + "epoch": 0.748496993987976, + "grad_norm": 22.611643124264333, + "learning_rate": 9.335073520143483e-06, + "loss": 4.0998, + "step": 3735 + }, + { + "epoch": 0.7486973947895792, + "grad_norm": 24.849009548802197, + "learning_rate": 9.334492462339216e-06, + "loss": 4.2473, + "step": 3736 + }, + { + "epoch": 0.7488977955911824, + "grad_norm": 23.660451903155376, + "learning_rate": 9.333911168862197e-06, + "loss": 3.7061, + "step": 3737 + }, + { + "epoch": 0.7490981963927855, + "grad_norm": 24.369041999448488, + "learning_rate": 9.33332963974403e-06, + "loss": 3.8324, + "step": 3738 + }, + { + "epoch": 0.7492985971943887, + "grad_norm": 52.58839888238446, + "learning_rate": 9.332747875016332e-06, + "loss": 4.4565, + "step": 3739 + }, + { + "epoch": 0.749498997995992, + "grad_norm": 22.228123985344045, + "learning_rate": 9.33216587471074e-06, + "loss": 4.1168, + "step": 3740 + }, + { + "epoch": 0.7496993987975952, + "grad_norm": 23.081198517311975, + "learning_rate": 9.331583638858892e-06, + "loss": 3.6056, + "step": 3741 + }, + { + "epoch": 0.7498997995991984, + "grad_norm": 21.890903801637396, + "learning_rate": 9.331001167492448e-06, + "loss": 4.1098, + "step": 3742 + }, + { + "epoch": 0.7501002004008016, + "grad_norm": 24.320638289791997, + "learning_rate": 9.330418460643075e-06, + "loss": 3.9352, + "step": 3743 + }, + { + "epoch": 0.7503006012024048, + "grad_norm": 19.74634508207819, + "learning_rate": 9.329835518342463e-06, + "loss": 4.1416, + "step": 3744 + }, + { + "epoch": 0.750501002004008, + "grad_norm": 27.125870522934743, + "learning_rate": 9.329252340622298e-06, + "loss": 4.6045, + "step": 3745 + }, + { + "epoch": 0.7507014028056113, + "grad_norm": 22.500074527970234, + "learning_rate": 9.328668927514294e-06, + "loss": 4.1357, + "step": 3746 + }, + { + "epoch": 0.7509018036072145, + "grad_norm": 19.287364966389788, + "learning_rate": 9.328085279050172e-06, + "loss": 3.7756, + "step": 3747 + }, + { + "epoch": 0.7511022044088176, + "grad_norm": 33.671396185669956, + "learning_rate": 9.327501395261665e-06, + "loss": 4.7837, + "step": 3748 + }, + { + "epoch": 0.7513026052104208, + "grad_norm": 33.97747939353349, + "learning_rate": 9.326917276180515e-06, + "loss": 4.3086, + "step": 3749 + }, + { + "epoch": 0.751503006012024, + "grad_norm": 23.23456489627185, + "learning_rate": 9.326332921838491e-06, + "loss": 4.0985, + "step": 3750 + }, + { + "epoch": 0.7517034068136272, + "grad_norm": 28.51855164758351, + "learning_rate": 9.325748332267357e-06, + "loss": 4.4197, + "step": 3751 + }, + { + "epoch": 0.7519038076152305, + "grad_norm": 29.951808495452358, + "learning_rate": 9.3251635074989e-06, + "loss": 4.8419, + "step": 3752 + }, + { + "epoch": 0.7521042084168337, + "grad_norm": 54.03502346579483, + "learning_rate": 9.32457844756492e-06, + "loss": 4.694, + "step": 3753 + }, + { + "epoch": 
0.7523046092184369, + "grad_norm": 27.385772665275237, + "learning_rate": 9.323993152497227e-06, + "loss": 4.2695, + "step": 3754 + }, + { + "epoch": 0.7525050100200401, + "grad_norm": 29.46708079517109, + "learning_rate": 9.323407622327641e-06, + "loss": 3.971, + "step": 3755 + }, + { + "epoch": 0.7527054108216433, + "grad_norm": 33.59856433351383, + "learning_rate": 9.322821857088003e-06, + "loss": 4.5783, + "step": 3756 + }, + { + "epoch": 0.7529058116232465, + "grad_norm": 20.360359075893744, + "learning_rate": 9.322235856810158e-06, + "loss": 4.2759, + "step": 3757 + }, + { + "epoch": 0.7531062124248497, + "grad_norm": 27.158191513384253, + "learning_rate": 9.32164962152597e-06, + "loss": 4.318, + "step": 3758 + }, + { + "epoch": 0.7533066132264529, + "grad_norm": 21.758906853695247, + "learning_rate": 9.321063151267311e-06, + "loss": 4.2318, + "step": 3759 + }, + { + "epoch": 0.7535070140280561, + "grad_norm": 23.044392689815098, + "learning_rate": 9.32047644606607e-06, + "loss": 4.0002, + "step": 3760 + }, + { + "epoch": 0.7537074148296593, + "grad_norm": 22.946601602365966, + "learning_rate": 9.319889505954149e-06, + "loss": 4.1341, + "step": 3761 + }, + { + "epoch": 0.7539078156312625, + "grad_norm": 66.62285071160312, + "learning_rate": 9.31930233096346e-06, + "loss": 4.0642, + "step": 3762 + }, + { + "epoch": 0.7541082164328657, + "grad_norm": 23.350416434825277, + "learning_rate": 9.318714921125925e-06, + "loss": 4.4603, + "step": 3763 + }, + { + "epoch": 0.754308617234469, + "grad_norm": 25.396602538953463, + "learning_rate": 9.318127276473486e-06, + "loss": 4.3104, + "step": 3764 + }, + { + "epoch": 0.7545090180360722, + "grad_norm": 21.659970036182596, + "learning_rate": 9.317539397038092e-06, + "loss": 4.432, + "step": 3765 + }, + { + "epoch": 0.7547094188376754, + "grad_norm": 22.75390135843567, + "learning_rate": 9.316951282851708e-06, + "loss": 4.9745, + "step": 3766 + }, + { + "epoch": 0.7549098196392786, + "grad_norm": 19.18410966965126, + "learning_rate": 9.31636293394631e-06, + "loss": 3.7756, + "step": 3767 + }, + { + "epoch": 0.7551102204408817, + "grad_norm": 40.54733231106273, + "learning_rate": 9.315774350353888e-06, + "loss": 4.9631, + "step": 3768 + }, + { + "epoch": 0.7553106212424849, + "grad_norm": 23.34165453154166, + "learning_rate": 9.315185532106444e-06, + "loss": 4.1485, + "step": 3769 + }, + { + "epoch": 0.7555110220440882, + "grad_norm": 21.471886401100367, + "learning_rate": 9.314596479235993e-06, + "loss": 3.7723, + "step": 3770 + }, + { + "epoch": 0.7557114228456914, + "grad_norm": 26.215216529530082, + "learning_rate": 9.314007191774561e-06, + "loss": 3.8667, + "step": 3771 + }, + { + "epoch": 0.7559118236472946, + "grad_norm": 58.53531217938736, + "learning_rate": 9.313417669754192e-06, + "loss": 4.191, + "step": 3772 + }, + { + "epoch": 0.7561122244488978, + "grad_norm": 23.995617020770872, + "learning_rate": 9.312827913206936e-06, + "loss": 3.7693, + "step": 3773 + }, + { + "epoch": 0.756312625250501, + "grad_norm": 39.121916570143405, + "learning_rate": 9.312237922164861e-06, + "loss": 4.6419, + "step": 3774 + }, + { + "epoch": 0.7565130260521042, + "grad_norm": 27.014759538668145, + "learning_rate": 9.311647696660043e-06, + "loss": 3.9098, + "step": 3775 + }, + { + "epoch": 0.7567134268537075, + "grad_norm": 33.95437466197999, + "learning_rate": 9.311057236724577e-06, + "loss": 4.5463, + "step": 3776 + }, + { + "epoch": 0.7569138276553106, + "grad_norm": 38.312437287540256, + "learning_rate": 9.310466542390564e-06, + "loss": 4.1219, + 
"step": 3777 + }, + { + "epoch": 0.7571142284569138, + "grad_norm": 26.749292775339097, + "learning_rate": 9.309875613690123e-06, + "loss": 4.1801, + "step": 3778 + }, + { + "epoch": 0.757314629258517, + "grad_norm": 28.4296474662413, + "learning_rate": 9.309284450655383e-06, + "loss": 4.3264, + "step": 3779 + }, + { + "epoch": 0.7575150300601202, + "grad_norm": 29.573778812414922, + "learning_rate": 9.308693053318487e-06, + "loss": 4.7373, + "step": 3780 + }, + { + "epoch": 0.7577154308617234, + "grad_norm": 33.64197318120599, + "learning_rate": 9.308101421711588e-06, + "loss": 3.8057, + "step": 3781 + }, + { + "epoch": 0.7579158316633267, + "grad_norm": 32.51245271940081, + "learning_rate": 9.307509555866855e-06, + "loss": 4.3073, + "step": 3782 + }, + { + "epoch": 0.7581162324649299, + "grad_norm": 54.83116705369137, + "learning_rate": 9.30691745581647e-06, + "loss": 4.2631, + "step": 3783 + }, + { + "epoch": 0.7583166332665331, + "grad_norm": 34.90833840855633, + "learning_rate": 9.306325121592626e-06, + "loss": 4.0413, + "step": 3784 + }, + { + "epoch": 0.7585170340681363, + "grad_norm": 22.953314113537825, + "learning_rate": 9.305732553227527e-06, + "loss": 4.1102, + "step": 3785 + }, + { + "epoch": 0.7587174348697395, + "grad_norm": 28.566374335737258, + "learning_rate": 9.305139750753394e-06, + "loss": 4.1421, + "step": 3786 + }, + { + "epoch": 0.7589178356713426, + "grad_norm": 25.53800066367523, + "learning_rate": 9.304546714202458e-06, + "loss": 3.829, + "step": 3787 + }, + { + "epoch": 0.7591182364729459, + "grad_norm": 34.77875815329998, + "learning_rate": 9.303953443606963e-06, + "loss": 4.2815, + "step": 3788 + }, + { + "epoch": 0.7593186372745491, + "grad_norm": 25.265113321835297, + "learning_rate": 9.303359938999164e-06, + "loss": 4.3506, + "step": 3789 + }, + { + "epoch": 0.7595190380761523, + "grad_norm": 26.49948130594582, + "learning_rate": 9.302766200411335e-06, + "loss": 4.6489, + "step": 3790 + }, + { + "epoch": 0.7597194388777555, + "grad_norm": 32.19981130608422, + "learning_rate": 9.302172227875756e-06, + "loss": 4.2408, + "step": 3791 + }, + { + "epoch": 0.7599198396793587, + "grad_norm": 29.10419012477641, + "learning_rate": 9.301578021424722e-06, + "loss": 4.5883, + "step": 3792 + }, + { + "epoch": 0.7601202404809619, + "grad_norm": 31.756638521246103, + "learning_rate": 9.300983581090541e-06, + "loss": 4.7326, + "step": 3793 + }, + { + "epoch": 0.7603206412825652, + "grad_norm": 29.421631775874204, + "learning_rate": 9.300388906905536e-06, + "loss": 4.3361, + "step": 3794 + }, + { + "epoch": 0.7605210420841684, + "grad_norm": 30.595655650339502, + "learning_rate": 9.299793998902036e-06, + "loss": 4.6883, + "step": 3795 + }, + { + "epoch": 0.7607214428857716, + "grad_norm": 49.28179740926392, + "learning_rate": 9.299198857112389e-06, + "loss": 3.8768, + "step": 3796 + }, + { + "epoch": 0.7609218436873747, + "grad_norm": 54.368837573746696, + "learning_rate": 9.298603481568953e-06, + "loss": 3.7999, + "step": 3797 + }, + { + "epoch": 0.7611222444889779, + "grad_norm": 25.281202901160288, + "learning_rate": 9.298007872304103e-06, + "loss": 4.3363, + "step": 3798 + }, + { + "epoch": 0.7613226452905811, + "grad_norm": 24.53376697512056, + "learning_rate": 9.29741202935022e-06, + "loss": 4.1165, + "step": 3799 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 24.48930109516744, + "learning_rate": 9.296815952739701e-06, + "loss": 4.051, + "step": 3800 + }, + { + "epoch": 0.7617234468937876, + "grad_norm": 22.31046880041972, + "learning_rate": 
9.296219642504956e-06, + "loss": 3.9022, + "step": 3801 + }, + { + "epoch": 0.7619238476953908, + "grad_norm": 18.718223772624953, + "learning_rate": 9.295623098678406e-06, + "loss": 3.9875, + "step": 3802 + }, + { + "epoch": 0.762124248496994, + "grad_norm": 87.97537437028313, + "learning_rate": 9.295026321292488e-06, + "loss": 4.3563, + "step": 3803 + }, + { + "epoch": 0.7623246492985972, + "grad_norm": 23.30747456865539, + "learning_rate": 9.29442931037965e-06, + "loss": 3.897, + "step": 3804 + }, + { + "epoch": 0.7625250501002004, + "grad_norm": 24.86520255173003, + "learning_rate": 9.29383206597235e-06, + "loss": 4.535, + "step": 3805 + }, + { + "epoch": 0.7627254509018037, + "grad_norm": 22.133099332631335, + "learning_rate": 9.293234588103063e-06, + "loss": 4.0566, + "step": 3806 + }, + { + "epoch": 0.7629258517034068, + "grad_norm": 28.351317572526963, + "learning_rate": 9.292636876804275e-06, + "loss": 4.4918, + "step": 3807 + }, + { + "epoch": 0.76312625250501, + "grad_norm": 27.251102981784506, + "learning_rate": 9.29203893210848e-06, + "loss": 4.0311, + "step": 3808 + }, + { + "epoch": 0.7633266533066132, + "grad_norm": 71.24185475505126, + "learning_rate": 9.291440754048196e-06, + "loss": 4.0744, + "step": 3809 + }, + { + "epoch": 0.7635270541082164, + "grad_norm": 36.23152912568644, + "learning_rate": 9.290842342655943e-06, + "loss": 4.124, + "step": 3810 + }, + { + "epoch": 0.7637274549098196, + "grad_norm": 21.180731452796007, + "learning_rate": 9.290243697964256e-06, + "loss": 4.4603, + "step": 3811 + }, + { + "epoch": 0.7639278557114229, + "grad_norm": 20.88246056621588, + "learning_rate": 9.289644820005688e-06, + "loss": 4.1644, + "step": 3812 + }, + { + "epoch": 0.7641282565130261, + "grad_norm": 33.155913254642016, + "learning_rate": 9.289045708812798e-06, + "loss": 4.4125, + "step": 3813 + }, + { + "epoch": 0.7643286573146293, + "grad_norm": 23.035565406113456, + "learning_rate": 9.288446364418161e-06, + "loss": 4.2313, + "step": 3814 + }, + { + "epoch": 0.7645290581162325, + "grad_norm": 23.650638025804767, + "learning_rate": 9.287846786854367e-06, + "loss": 4.3369, + "step": 3815 + }, + { + "epoch": 0.7647294589178357, + "grad_norm": 29.76872893178468, + "learning_rate": 9.287246976154013e-06, + "loss": 4.1812, + "step": 3816 + }, + { + "epoch": 0.7649298597194388, + "grad_norm": 21.157206460933757, + "learning_rate": 9.286646932349712e-06, + "loss": 4.3332, + "step": 3817 + }, + { + "epoch": 0.765130260521042, + "grad_norm": 19.590121367204173, + "learning_rate": 9.28604665547409e-06, + "loss": 4.1767, + "step": 3818 + }, + { + "epoch": 0.7653306613226453, + "grad_norm": 24.621347695992746, + "learning_rate": 9.285446145559785e-06, + "loss": 3.7257, + "step": 3819 + }, + { + "epoch": 0.7655310621242485, + "grad_norm": 26.110666994395842, + "learning_rate": 9.284845402639447e-06, + "loss": 4.7891, + "step": 3820 + }, + { + "epoch": 0.7657314629258517, + "grad_norm": 19.735890512319937, + "learning_rate": 9.284244426745738e-06, + "loss": 3.587, + "step": 3821 + }, + { + "epoch": 0.7659318637274549, + "grad_norm": 32.6705199574989, + "learning_rate": 9.283643217911336e-06, + "loss": 4.1099, + "step": 3822 + }, + { + "epoch": 0.7661322645290581, + "grad_norm": 22.980203978664456, + "learning_rate": 9.283041776168929e-06, + "loss": 4.3241, + "step": 3823 + }, + { + "epoch": 0.7663326653306614, + "grad_norm": 28.677970833408367, + "learning_rate": 9.282440101551218e-06, + "loss": 4.5117, + "step": 3824 + }, + { + "epoch": 0.7665330661322646, + "grad_norm": 
23.912812112336884, + "learning_rate": 9.281838194090917e-06, + "loss": 4.2874, + "step": 3825 + }, + { + "epoch": 0.7667334669338677, + "grad_norm": 28.053789475711753, + "learning_rate": 9.281236053820755e-06, + "loss": 4.3311, + "step": 3826 + }, + { + "epoch": 0.7669338677354709, + "grad_norm": 23.18118614878938, + "learning_rate": 9.280633680773467e-06, + "loss": 4.1791, + "step": 3827 + }, + { + "epoch": 0.7671342685370741, + "grad_norm": 28.394961835466567, + "learning_rate": 9.280031074981809e-06, + "loss": 4.0727, + "step": 3828 + }, + { + "epoch": 0.7673346693386773, + "grad_norm": 24.279728518642912, + "learning_rate": 9.279428236478542e-06, + "loss": 3.722, + "step": 3829 + }, + { + "epoch": 0.7675350701402806, + "grad_norm": 31.334101918291786, + "learning_rate": 9.278825165296443e-06, + "loss": 4.9464, + "step": 3830 + }, + { + "epoch": 0.7677354709418838, + "grad_norm": 30.565194136366653, + "learning_rate": 9.278221861468306e-06, + "loss": 4.447, + "step": 3831 + }, + { + "epoch": 0.767935871743487, + "grad_norm": 33.35042382666662, + "learning_rate": 9.277618325026932e-06, + "loss": 4.6123, + "step": 3832 + }, + { + "epoch": 0.7681362725450902, + "grad_norm": 33.914359699012344, + "learning_rate": 9.277014556005133e-06, + "loss": 4.6827, + "step": 3833 + }, + { + "epoch": 0.7683366733466934, + "grad_norm": 22.151922581339527, + "learning_rate": 9.27641055443574e-06, + "loss": 4.7054, + "step": 3834 + }, + { + "epoch": 0.7685370741482966, + "grad_norm": 28.985074056021265, + "learning_rate": 9.275806320351593e-06, + "loss": 4.4829, + "step": 3835 + }, + { + "epoch": 0.7687374749498997, + "grad_norm": 31.01686049789194, + "learning_rate": 9.275201853785545e-06, + "loss": 4.7515, + "step": 3836 + }, + { + "epoch": 0.768937875751503, + "grad_norm": 30.130452373106298, + "learning_rate": 9.274597154770461e-06, + "loss": 4.78, + "step": 3837 + }, + { + "epoch": 0.7691382765531062, + "grad_norm": 18.718944589082614, + "learning_rate": 9.273992223339218e-06, + "loss": 3.9375, + "step": 3838 + }, + { + "epoch": 0.7693386773547094, + "grad_norm": 27.10842342971888, + "learning_rate": 9.273387059524713e-06, + "loss": 4.1502, + "step": 3839 + }, + { + "epoch": 0.7695390781563126, + "grad_norm": 19.431125466260497, + "learning_rate": 9.272781663359843e-06, + "loss": 4.2386, + "step": 3840 + }, + { + "epoch": 0.7697394789579158, + "grad_norm": 26.054681580234547, + "learning_rate": 9.272176034877525e-06, + "loss": 4.4399, + "step": 3841 + }, + { + "epoch": 0.769939879759519, + "grad_norm": 17.507353053483165, + "learning_rate": 9.271570174110691e-06, + "loss": 3.4664, + "step": 3842 + }, + { + "epoch": 0.7701402805611223, + "grad_norm": 31.160905019044385, + "learning_rate": 9.270964081092281e-06, + "loss": 4.2131, + "step": 3843 + }, + { + "epoch": 0.7703406813627255, + "grad_norm": 26.16745585782059, + "learning_rate": 9.270357755855252e-06, + "loss": 3.8293, + "step": 3844 + }, + { + "epoch": 0.7705410821643287, + "grad_norm": 21.79251887065329, + "learning_rate": 9.269751198432565e-06, + "loss": 3.9976, + "step": 3845 + }, + { + "epoch": 0.7707414829659318, + "grad_norm": 24.855330257744566, + "learning_rate": 9.269144408857204e-06, + "loss": 4.2537, + "step": 3846 + }, + { + "epoch": 0.770941883767535, + "grad_norm": 23.363823194767605, + "learning_rate": 9.268537387162159e-06, + "loss": 4.5644, + "step": 3847 + }, + { + "epoch": 0.7711422845691382, + "grad_norm": 33.01639130025416, + "learning_rate": 9.267930133380437e-06, + "loss": 4.0038, + "step": 3848 + }, + { + "epoch": 
0.7713426853707415, + "grad_norm": 21.004201816634396, + "learning_rate": 9.267322647545052e-06, + "loss": 4.1065, + "step": 3849 + }, + { + "epoch": 0.7715430861723447, + "grad_norm": 32.257518054833184, + "learning_rate": 9.266714929689035e-06, + "loss": 4.4824, + "step": 3850 + }, + { + "epoch": 0.7717434869739479, + "grad_norm": 18.83356097412045, + "learning_rate": 9.26610697984543e-06, + "loss": 3.6321, + "step": 3851 + }, + { + "epoch": 0.7719438877755511, + "grad_norm": 20.67897903672496, + "learning_rate": 9.265498798047293e-06, + "loss": 4.5805, + "step": 3852 + }, + { + "epoch": 0.7721442885771543, + "grad_norm": 33.11416737645592, + "learning_rate": 9.264890384327687e-06, + "loss": 4.6528, + "step": 3853 + }, + { + "epoch": 0.7723446893787576, + "grad_norm": 24.71861331493458, + "learning_rate": 9.264281738719695e-06, + "loss": 4.183, + "step": 3854 + }, + { + "epoch": 0.7725450901803608, + "grad_norm": 32.68389829920738, + "learning_rate": 9.263672861256413e-06, + "loss": 4.3481, + "step": 3855 + }, + { + "epoch": 0.7727454909819639, + "grad_norm": 32.90029682355209, + "learning_rate": 9.263063751970943e-06, + "loss": 4.5229, + "step": 3856 + }, + { + "epoch": 0.7729458917835671, + "grad_norm": 24.028159603214828, + "learning_rate": 9.262454410896402e-06, + "loss": 4.5971, + "step": 3857 + }, + { + "epoch": 0.7731462925851703, + "grad_norm": 32.75651296121565, + "learning_rate": 9.261844838065925e-06, + "loss": 4.3137, + "step": 3858 + }, + { + "epoch": 0.7733466933867735, + "grad_norm": 30.609539342623094, + "learning_rate": 9.261235033512651e-06, + "loss": 4.4318, + "step": 3859 + }, + { + "epoch": 0.7735470941883767, + "grad_norm": 38.83458475118568, + "learning_rate": 9.26062499726974e-06, + "loss": 4.4593, + "step": 3860 + }, + { + "epoch": 0.77374749498998, + "grad_norm": 20.832382752510004, + "learning_rate": 9.260014729370357e-06, + "loss": 4.1456, + "step": 3861 + }, + { + "epoch": 0.7739478957915832, + "grad_norm": 25.652182124438507, + "learning_rate": 9.259404229847687e-06, + "loss": 3.8281, + "step": 3862 + }, + { + "epoch": 0.7741482965931864, + "grad_norm": 18.659241363529752, + "learning_rate": 9.25879349873492e-06, + "loss": 3.9991, + "step": 3863 + }, + { + "epoch": 0.7743486973947896, + "grad_norm": 25.702456254374255, + "learning_rate": 9.258182536065263e-06, + "loss": 3.7557, + "step": 3864 + }, + { + "epoch": 0.7745490981963928, + "grad_norm": 29.297902966791053, + "learning_rate": 9.257571341871937e-06, + "loss": 4.2608, + "step": 3865 + }, + { + "epoch": 0.774749498997996, + "grad_norm": 24.732570439126338, + "learning_rate": 9.256959916188172e-06, + "loss": 4.0378, + "step": 3866 + }, + { + "epoch": 0.7749498997995992, + "grad_norm": 32.34292071952263, + "learning_rate": 9.256348259047212e-06, + "loss": 3.9274, + "step": 3867 + }, + { + "epoch": 0.7751503006012024, + "grad_norm": 26.87043515753491, + "learning_rate": 9.255736370482315e-06, + "loss": 4.3878, + "step": 3868 + }, + { + "epoch": 0.7753507014028056, + "grad_norm": 23.948893985731992, + "learning_rate": 9.25512425052675e-06, + "loss": 4.2566, + "step": 3869 + }, + { + "epoch": 0.7755511022044088, + "grad_norm": 34.63642834156052, + "learning_rate": 9.254511899213798e-06, + "loss": 4.0876, + "step": 3870 + }, + { + "epoch": 0.775751503006012, + "grad_norm": 25.58031644328261, + "learning_rate": 9.253899316576753e-06, + "loss": 3.9618, + "step": 3871 + }, + { + "epoch": 0.7759519038076153, + "grad_norm": 26.096276987668592, + "learning_rate": 9.253286502648924e-06, + "loss": 4.2851, + "step": 
3872 + }, + { + "epoch": 0.7761523046092185, + "grad_norm": 30.20115753425198, + "learning_rate": 9.252673457463628e-06, + "loss": 4.0624, + "step": 3873 + }, + { + "epoch": 0.7763527054108217, + "grad_norm": 34.80039013622876, + "learning_rate": 9.2520601810542e-06, + "loss": 4.3288, + "step": 3874 + }, + { + "epoch": 0.7765531062124249, + "grad_norm": 20.87246569261964, + "learning_rate": 9.251446673453984e-06, + "loss": 4.1869, + "step": 3875 + }, + { + "epoch": 0.776753507014028, + "grad_norm": 37.54200397080925, + "learning_rate": 9.250832934696335e-06, + "loss": 3.9435, + "step": 3876 + }, + { + "epoch": 0.7769539078156312, + "grad_norm": 21.873807577457846, + "learning_rate": 9.250218964814624e-06, + "loss": 4.0826, + "step": 3877 + }, + { + "epoch": 0.7771543086172344, + "grad_norm": 42.877533604878685, + "learning_rate": 9.249604763842234e-06, + "loss": 3.696, + "step": 3878 + }, + { + "epoch": 0.7773547094188377, + "grad_norm": 32.29936997835584, + "learning_rate": 9.248990331812562e-06, + "loss": 4.5205, + "step": 3879 + }, + { + "epoch": 0.7775551102204409, + "grad_norm": 24.125707548960555, + "learning_rate": 9.248375668759013e-06, + "loss": 4.017, + "step": 3880 + }, + { + "epoch": 0.7777555110220441, + "grad_norm": 24.057809054755676, + "learning_rate": 9.247760774715006e-06, + "loss": 4.4815, + "step": 3881 + }, + { + "epoch": 0.7779559118236473, + "grad_norm": 21.54723002630375, + "learning_rate": 9.247145649713977e-06, + "loss": 3.6984, + "step": 3882 + }, + { + "epoch": 0.7781563126252505, + "grad_norm": 27.745833501816744, + "learning_rate": 9.24653029378937e-06, + "loss": 3.6528, + "step": 3883 + }, + { + "epoch": 0.7783567134268538, + "grad_norm": 21.138511635239066, + "learning_rate": 9.245914706974642e-06, + "loss": 3.6622, + "step": 3884 + }, + { + "epoch": 0.7785571142284569, + "grad_norm": 26.90452606990135, + "learning_rate": 9.245298889303263e-06, + "loss": 4.4372, + "step": 3885 + }, + { + "epoch": 0.7787575150300601, + "grad_norm": 28.465796726431634, + "learning_rate": 9.24468284080872e-06, + "loss": 3.9816, + "step": 3886 + }, + { + "epoch": 0.7789579158316633, + "grad_norm": 28.081653248186793, + "learning_rate": 9.244066561524501e-06, + "loss": 4.1031, + "step": 3887 + }, + { + "epoch": 0.7791583166332665, + "grad_norm": 22.023872132755336, + "learning_rate": 9.24345005148412e-06, + "loss": 4.1083, + "step": 3888 + }, + { + "epoch": 0.7793587174348697, + "grad_norm": 18.338769976206375, + "learning_rate": 9.242833310721095e-06, + "loss": 4.0176, + "step": 3889 + }, + { + "epoch": 0.779559118236473, + "grad_norm": 36.04469036758314, + "learning_rate": 9.242216339268962e-06, + "loss": 3.6262, + "step": 3890 + }, + { + "epoch": 0.7797595190380762, + "grad_norm": 20.167369359788612, + "learning_rate": 9.241599137161262e-06, + "loss": 3.5788, + "step": 3891 + }, + { + "epoch": 0.7799599198396794, + "grad_norm": 23.859755871176834, + "learning_rate": 9.240981704431557e-06, + "loss": 4.3903, + "step": 3892 + }, + { + "epoch": 0.7801603206412826, + "grad_norm": 35.64021247816476, + "learning_rate": 9.240364041113416e-06, + "loss": 4.2309, + "step": 3893 + }, + { + "epoch": 0.7803607214428858, + "grad_norm": 30.810845438286073, + "learning_rate": 9.239746147240423e-06, + "loss": 3.9599, + "step": 3894 + }, + { + "epoch": 0.7805611222444889, + "grad_norm": 34.99508758276817, + "learning_rate": 9.239128022846174e-06, + "loss": 4.2314, + "step": 3895 + }, + { + "epoch": 0.7807615230460921, + "grad_norm": 22.225049026072114, + "learning_rate": 9.238509667964276e-06, + 
"loss": 3.9454, + "step": 3896 + }, + { + "epoch": 0.7809619238476954, + "grad_norm": 22.836619203015445, + "learning_rate": 9.237891082628352e-06, + "loss": 3.9104, + "step": 3897 + }, + { + "epoch": 0.7811623246492986, + "grad_norm": 19.719804528355386, + "learning_rate": 9.237272266872032e-06, + "loss": 3.7393, + "step": 3898 + }, + { + "epoch": 0.7813627254509018, + "grad_norm": 31.843045958529416, + "learning_rate": 9.236653220728967e-06, + "loss": 4.3973, + "step": 3899 + }, + { + "epoch": 0.781563126252505, + "grad_norm": 69.96307753052518, + "learning_rate": 9.23603394423281e-06, + "loss": 5.1257, + "step": 3900 + }, + { + "epoch": 0.7817635270541082, + "grad_norm": 26.053519060390396, + "learning_rate": 9.235414437417234e-06, + "loss": 4.4842, + "step": 3901 + }, + { + "epoch": 0.7819639278557114, + "grad_norm": 24.207050768276716, + "learning_rate": 9.234794700315926e-06, + "loss": 3.738, + "step": 3902 + }, + { + "epoch": 0.7821643286573147, + "grad_norm": 28.94919516031473, + "learning_rate": 9.234174732962577e-06, + "loss": 4.4379, + "step": 3903 + }, + { + "epoch": 0.7823647294589179, + "grad_norm": 20.569773791160035, + "learning_rate": 9.233554535390898e-06, + "loss": 4.2998, + "step": 3904 + }, + { + "epoch": 0.782565130260521, + "grad_norm": 25.66127924860335, + "learning_rate": 9.23293410763461e-06, + "loss": 4.3127, + "step": 3905 + }, + { + "epoch": 0.7827655310621242, + "grad_norm": 19.618094033268275, + "learning_rate": 9.232313449727446e-06, + "loss": 4.001, + "step": 3906 + }, + { + "epoch": 0.7829659318637274, + "grad_norm": 56.43975780148428, + "learning_rate": 9.231692561703151e-06, + "loss": 3.8462, + "step": 3907 + }, + { + "epoch": 0.7831663326653306, + "grad_norm": 34.82803077969456, + "learning_rate": 9.231071443595486e-06, + "loss": 4.4517, + "step": 3908 + }, + { + "epoch": 0.7833667334669339, + "grad_norm": 30.34279456878842, + "learning_rate": 9.230450095438222e-06, + "loss": 4.3559, + "step": 3909 + }, + { + "epoch": 0.7835671342685371, + "grad_norm": 23.52371957245168, + "learning_rate": 9.229828517265141e-06, + "loss": 4.6703, + "step": 3910 + }, + { + "epoch": 0.7837675350701403, + "grad_norm": 25.00067029688117, + "learning_rate": 9.229206709110037e-06, + "loss": 4.4637, + "step": 3911 + }, + { + "epoch": 0.7839679358717435, + "grad_norm": 22.395973844169447, + "learning_rate": 9.228584671006724e-06, + "loss": 4.4541, + "step": 3912 + }, + { + "epoch": 0.7841683366733467, + "grad_norm": 29.026117165606863, + "learning_rate": 9.22796240298902e-06, + "loss": 4.2774, + "step": 3913 + }, + { + "epoch": 0.78436873747495, + "grad_norm": 33.43984124231791, + "learning_rate": 9.22733990509076e-06, + "loss": 4.468, + "step": 3914 + }, + { + "epoch": 0.7845691382765531, + "grad_norm": 25.92702369787247, + "learning_rate": 9.226717177345788e-06, + "loss": 4.4889, + "step": 3915 + }, + { + "epoch": 0.7847695390781563, + "grad_norm": 22.22344942903396, + "learning_rate": 9.226094219787965e-06, + "loss": 3.7957, + "step": 3916 + }, + { + "epoch": 0.7849699398797595, + "grad_norm": 33.40088760386022, + "learning_rate": 9.22547103245116e-06, + "loss": 4.0013, + "step": 3917 + }, + { + "epoch": 0.7851703406813627, + "grad_norm": 20.91339514646628, + "learning_rate": 9.224847615369257e-06, + "loss": 4.1764, + "step": 3918 + }, + { + "epoch": 0.7853707414829659, + "grad_norm": 30.256682337091064, + "learning_rate": 9.224223968576153e-06, + "loss": 4.3705, + "step": 3919 + }, + { + "epoch": 0.7855711422845691, + "grad_norm": 24.668844864952973, + "learning_rate": 
9.223600092105757e-06, + "loss": 4.5861, + "step": 3920 + }, + { + "epoch": 0.7857715430861724, + "grad_norm": 30.301764297256856, + "learning_rate": 9.222975985991988e-06, + "loss": 4.6092, + "step": 3921 + }, + { + "epoch": 0.7859719438877756, + "grad_norm": 21.55527659032828, + "learning_rate": 9.222351650268784e-06, + "loss": 3.9926, + "step": 3922 + }, + { + "epoch": 0.7861723446893788, + "grad_norm": 22.718492538693916, + "learning_rate": 9.221727084970085e-06, + "loss": 4.1385, + "step": 3923 + }, + { + "epoch": 0.786372745490982, + "grad_norm": 20.777196523260248, + "learning_rate": 9.221102290129853e-06, + "loss": 4.124, + "step": 3924 + }, + { + "epoch": 0.7865731462925851, + "grad_norm": 19.566879065729502, + "learning_rate": 9.220477265782059e-06, + "loss": 4.1245, + "step": 3925 + }, + { + "epoch": 0.7867735470941883, + "grad_norm": 33.42175227337267, + "learning_rate": 9.219852011960688e-06, + "loss": 4.5118, + "step": 3926 + }, + { + "epoch": 0.7869739478957916, + "grad_norm": 23.13165505886778, + "learning_rate": 9.219226528699732e-06, + "loss": 3.8999, + "step": 3927 + }, + { + "epoch": 0.7871743486973948, + "grad_norm": 18.06161124427472, + "learning_rate": 9.2186008160332e-06, + "loss": 3.8879, + "step": 3928 + }, + { + "epoch": 0.787374749498998, + "grad_norm": 32.54957786032352, + "learning_rate": 9.217974873995118e-06, + "loss": 4.0575, + "step": 3929 + }, + { + "epoch": 0.7875751503006012, + "grad_norm": 29.029874655397773, + "learning_rate": 9.217348702619514e-06, + "loss": 4.1511, + "step": 3930 + }, + { + "epoch": 0.7877755511022044, + "grad_norm": 26.891316115716865, + "learning_rate": 9.216722301940435e-06, + "loss": 4.0608, + "step": 3931 + }, + { + "epoch": 0.7879759519038076, + "grad_norm": 23.73328551004117, + "learning_rate": 9.216095671991941e-06, + "loss": 4.2586, + "step": 3932 + }, + { + "epoch": 0.7881763527054109, + "grad_norm": 25.912962479026618, + "learning_rate": 9.2154688128081e-06, + "loss": 3.8755, + "step": 3933 + }, + { + "epoch": 0.7883767535070141, + "grad_norm": 25.533048681417167, + "learning_rate": 9.214841724423001e-06, + "loss": 4.0799, + "step": 3934 + }, + { + "epoch": 0.7885771543086172, + "grad_norm": 29.822976661964624, + "learning_rate": 9.214214406870734e-06, + "loss": 3.6927, + "step": 3935 + }, + { + "epoch": 0.7887775551102204, + "grad_norm": 23.306583761210586, + "learning_rate": 9.213586860185407e-06, + "loss": 4.3195, + "step": 3936 + }, + { + "epoch": 0.7889779559118236, + "grad_norm": 41.86592968123823, + "learning_rate": 9.212959084401144e-06, + "loss": 4.7546, + "step": 3937 + }, + { + "epoch": 0.7891783567134268, + "grad_norm": 24.858618788226266, + "learning_rate": 9.212331079552077e-06, + "loss": 4.119, + "step": 3938 + }, + { + "epoch": 0.7893787575150301, + "grad_norm": 24.218561609666004, + "learning_rate": 9.21170284567235e-06, + "loss": 3.9353, + "step": 3939 + }, + { + "epoch": 0.7895791583166333, + "grad_norm": 28.630900253499604, + "learning_rate": 9.211074382796123e-06, + "loss": 3.6687, + "step": 3940 + }, + { + "epoch": 0.7897795591182365, + "grad_norm": 44.56647329024156, + "learning_rate": 9.210445690957566e-06, + "loss": 4.7739, + "step": 3941 + }, + { + "epoch": 0.7899799599198397, + "grad_norm": 24.75301558946956, + "learning_rate": 9.20981677019086e-06, + "loss": 4.2183, + "step": 3942 + }, + { + "epoch": 0.7901803607214429, + "grad_norm": 26.072585897669015, + "learning_rate": 9.209187620530204e-06, + "loss": 3.9934, + "step": 3943 + }, + { + "epoch": 0.790380761523046, + "grad_norm": 
23.118270102875243, + "learning_rate": 9.208558242009804e-06, + "loss": 3.6094, + "step": 3944 + }, + { + "epoch": 0.7905811623246493, + "grad_norm": 21.902037486964588, + "learning_rate": 9.207928634663878e-06, + "loss": 4.1363, + "step": 3945 + }, + { + "epoch": 0.7907815631262525, + "grad_norm": 23.0686872698601, + "learning_rate": 9.207298798526662e-06, + "loss": 4.2892, + "step": 3946 + }, + { + "epoch": 0.7909819639278557, + "grad_norm": 32.762762526108496, + "learning_rate": 9.206668733632398e-06, + "loss": 4.4525, + "step": 3947 + }, + { + "epoch": 0.7911823647294589, + "grad_norm": 34.090142400906615, + "learning_rate": 9.206038440015348e-06, + "loss": 4.8694, + "step": 3948 + }, + { + "epoch": 0.7913827655310621, + "grad_norm": 30.14253078584979, + "learning_rate": 9.205407917709777e-06, + "loss": 4.0244, + "step": 3949 + }, + { + "epoch": 0.7915831663326653, + "grad_norm": 26.385410707357916, + "learning_rate": 9.204777166749971e-06, + "loss": 3.7898, + "step": 3950 + }, + { + "epoch": 0.7917835671342686, + "grad_norm": 26.11506120015422, + "learning_rate": 9.204146187170224e-06, + "loss": 4.3326, + "step": 3951 + }, + { + "epoch": 0.7919839679358718, + "grad_norm": 47.525706010616716, + "learning_rate": 9.203514979004842e-06, + "loss": 4.7438, + "step": 3952 + }, + { + "epoch": 0.792184368737475, + "grad_norm": 26.61629943564172, + "learning_rate": 9.202883542288146e-06, + "loss": 4.2749, + "step": 3953 + }, + { + "epoch": 0.7923847695390781, + "grad_norm": 23.331199754663775, + "learning_rate": 9.202251877054469e-06, + "loss": 3.8901, + "step": 3954 + }, + { + "epoch": 0.7925851703406813, + "grad_norm": 24.850341525420895, + "learning_rate": 9.201619983338152e-06, + "loss": 4.0561, + "step": 3955 + }, + { + "epoch": 0.7927855711422845, + "grad_norm": 30.388429579226134, + "learning_rate": 9.200987861173556e-06, + "loss": 4.2865, + "step": 3956 + }, + { + "epoch": 0.7929859719438878, + "grad_norm": 19.786612541893412, + "learning_rate": 9.200355510595048e-06, + "loss": 4.0327, + "step": 3957 + }, + { + "epoch": 0.793186372745491, + "grad_norm": 21.553233151533636, + "learning_rate": 9.19972293163701e-06, + "loss": 3.8685, + "step": 3958 + }, + { + "epoch": 0.7933867735470942, + "grad_norm": 24.04194513117983, + "learning_rate": 9.199090124333838e-06, + "loss": 4.7409, + "step": 3959 + }, + { + "epoch": 0.7935871743486974, + "grad_norm": 46.062857821359536, + "learning_rate": 9.198457088719938e-06, + "loss": 4.3564, + "step": 3960 + }, + { + "epoch": 0.7937875751503006, + "grad_norm": 29.39632877749004, + "learning_rate": 9.197823824829727e-06, + "loss": 4.5066, + "step": 3961 + }, + { + "epoch": 0.7939879759519038, + "grad_norm": 23.117740808856084, + "learning_rate": 9.197190332697638e-06, + "loss": 4.1402, + "step": 3962 + }, + { + "epoch": 0.7941883767535071, + "grad_norm": 21.260315383097783, + "learning_rate": 9.196556612358118e-06, + "loss": 3.868, + "step": 3963 + }, + { + "epoch": 0.7943887775551102, + "grad_norm": 29.21398150725875, + "learning_rate": 9.195922663845618e-06, + "loss": 5.0488, + "step": 3964 + }, + { + "epoch": 0.7945891783567134, + "grad_norm": 21.55952822914403, + "learning_rate": 9.195288487194608e-06, + "loss": 4.2657, + "step": 3965 + }, + { + "epoch": 0.7947895791583166, + "grad_norm": 27.181550443485204, + "learning_rate": 9.194654082439571e-06, + "loss": 4.1353, + "step": 3966 + }, + { + "epoch": 0.7949899799599198, + "grad_norm": 44.28080141045367, + "learning_rate": 9.194019449615001e-06, + "loss": 4.5915, + "step": 3967 + }, + { + "epoch": 
0.795190380761523, + "grad_norm": 25.670398675450475, + "learning_rate": 9.1933845887554e-06, + "loss": 4.5178, + "step": 3968 + }, + { + "epoch": 0.7953907815631263, + "grad_norm": 28.901319223394115, + "learning_rate": 9.192749499895293e-06, + "loss": 4.7541, + "step": 3969 + }, + { + "epoch": 0.7955911823647295, + "grad_norm": 20.807240398515745, + "learning_rate": 9.192114183069203e-06, + "loss": 3.7882, + "step": 3970 + }, + { + "epoch": 0.7957915831663327, + "grad_norm": 28.95952602503882, + "learning_rate": 9.191478638311677e-06, + "loss": 4.3174, + "step": 3971 + }, + { + "epoch": 0.7959919839679359, + "grad_norm": 17.48323375520141, + "learning_rate": 9.190842865657272e-06, + "loss": 3.7457, + "step": 3972 + }, + { + "epoch": 0.7961923847695391, + "grad_norm": 18.531384344261404, + "learning_rate": 9.190206865140554e-06, + "loss": 4.1082, + "step": 3973 + }, + { + "epoch": 0.7963927855711422, + "grad_norm": 21.293660399766207, + "learning_rate": 9.1895706367961e-06, + "loss": 3.7411, + "step": 3974 + }, + { + "epoch": 0.7965931863727455, + "grad_norm": 20.3659729866496, + "learning_rate": 9.188934180658511e-06, + "loss": 3.8114, + "step": 3975 + }, + { + "epoch": 0.7967935871743487, + "grad_norm": 23.20001754846776, + "learning_rate": 9.188297496762384e-06, + "loss": 3.9956, + "step": 3976 + }, + { + "epoch": 0.7969939879759519, + "grad_norm": 24.76056526045411, + "learning_rate": 9.187660585142341e-06, + "loss": 3.686, + "step": 3977 + }, + { + "epoch": 0.7971943887775551, + "grad_norm": 20.87567805197439, + "learning_rate": 9.18702344583301e-06, + "loss": 4.1265, + "step": 3978 + }, + { + "epoch": 0.7973947895791583, + "grad_norm": 17.099496840642292, + "learning_rate": 9.186386078869034e-06, + "loss": 3.9412, + "step": 3979 + }, + { + "epoch": 0.7975951903807615, + "grad_norm": 25.151374544688945, + "learning_rate": 9.185748484285066e-06, + "loss": 3.7703, + "step": 3980 + }, + { + "epoch": 0.7977955911823648, + "grad_norm": 89.27376211116476, + "learning_rate": 9.185110662115775e-06, + "loss": 4.7118, + "step": 3981 + }, + { + "epoch": 0.797995991983968, + "grad_norm": 30.556225310533243, + "learning_rate": 9.18447261239584e-06, + "loss": 4.5189, + "step": 3982 + }, + { + "epoch": 0.7981963927855712, + "grad_norm": 22.009578930106102, + "learning_rate": 9.183834335159951e-06, + "loss": 3.9939, + "step": 3983 + }, + { + "epoch": 0.7983967935871743, + "grad_norm": 26.754549031465974, + "learning_rate": 9.183195830442814e-06, + "loss": 4.2277, + "step": 3984 + }, + { + "epoch": 0.7985971943887775, + "grad_norm": 27.619526717884884, + "learning_rate": 9.182557098279146e-06, + "loss": 4.3462, + "step": 3985 + }, + { + "epoch": 0.7987975951903807, + "grad_norm": 18.868382741658433, + "learning_rate": 9.181918138703671e-06, + "loss": 3.6628, + "step": 3986 + }, + { + "epoch": 0.798997995991984, + "grad_norm": 21.6572193583733, + "learning_rate": 9.181278951751139e-06, + "loss": 4.2256, + "step": 3987 + }, + { + "epoch": 0.7991983967935872, + "grad_norm": 37.15082019163718, + "learning_rate": 9.180639537456295e-06, + "loss": 4.5079, + "step": 3988 + }, + { + "epoch": 0.7993987975951904, + "grad_norm": 27.45044373852729, + "learning_rate": 9.179999895853906e-06, + "loss": 4.3315, + "step": 3989 + }, + { + "epoch": 0.7995991983967936, + "grad_norm": 29.38733708630942, + "learning_rate": 9.179360026978756e-06, + "loss": 4.6627, + "step": 3990 + }, + { + "epoch": 0.7997995991983968, + "grad_norm": 43.3674808315943, + "learning_rate": 9.178719930865632e-06, + "loss": 4.9849, + "step": 3991 
+ }, + { + "epoch": 0.8, + "grad_norm": 64.46290899643185, + "learning_rate": 9.178079607549335e-06, + "loss": 4.5236, + "step": 3992 + }, + { + "epoch": 0.8002004008016032, + "grad_norm": 20.90848655301029, + "learning_rate": 9.177439057064684e-06, + "loss": 3.7307, + "step": 3993 + }, + { + "epoch": 0.8004008016032064, + "grad_norm": 27.85329981527475, + "learning_rate": 9.176798279446504e-06, + "loss": 4.1276, + "step": 3994 + }, + { + "epoch": 0.8006012024048096, + "grad_norm": 41.30603966928948, + "learning_rate": 9.176157274729635e-06, + "loss": 4.9622, + "step": 3995 + }, + { + "epoch": 0.8008016032064128, + "grad_norm": 30.853134375666816, + "learning_rate": 9.175516042948932e-06, + "loss": 4.1957, + "step": 3996 + }, + { + "epoch": 0.801002004008016, + "grad_norm": 43.72298913056466, + "learning_rate": 9.174874584139257e-06, + "loss": 4.3072, + "step": 3997 + }, + { + "epoch": 0.8012024048096192, + "grad_norm": 22.941320314733048, + "learning_rate": 9.174232898335488e-06, + "loss": 3.805, + "step": 3998 + }, + { + "epoch": 0.8014028056112225, + "grad_norm": 21.44513598378101, + "learning_rate": 9.173590985572518e-06, + "loss": 4.3906, + "step": 3999 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 33.26436313836429, + "learning_rate": 9.17294884588524e-06, + "loss": 3.8808, + "step": 4000 + }, + { + "epoch": 0.8018036072144289, + "grad_norm": 34.13149267807348, + "learning_rate": 9.172306479308579e-06, + "loss": 3.9836, + "step": 4001 + }, + { + "epoch": 0.8020040080160321, + "grad_norm": 35.9638178030568, + "learning_rate": 9.171663885877452e-06, + "loss": 4.2445, + "step": 4002 + }, + { + "epoch": 0.8022044088176352, + "grad_norm": 22.105430896766578, + "learning_rate": 9.171021065626803e-06, + "loss": 3.7366, + "step": 4003 + }, + { + "epoch": 0.8024048096192384, + "grad_norm": 24.54418260377774, + "learning_rate": 9.170378018591581e-06, + "loss": 3.9167, + "step": 4004 + }, + { + "epoch": 0.8026052104208417, + "grad_norm": 43.30240155270094, + "learning_rate": 9.169734744806751e-06, + "loss": 4.7673, + "step": 4005 + }, + { + "epoch": 0.8028056112224449, + "grad_norm": 36.74118337292982, + "learning_rate": 9.169091244307286e-06, + "loss": 4.0479, + "step": 4006 + }, + { + "epoch": 0.8030060120240481, + "grad_norm": 17.214956163514987, + "learning_rate": 9.168447517128179e-06, + "loss": 4.1644, + "step": 4007 + }, + { + "epoch": 0.8032064128256513, + "grad_norm": 19.309803343485626, + "learning_rate": 9.167803563304425e-06, + "loss": 3.716, + "step": 4008 + }, + { + "epoch": 0.8034068136272545, + "grad_norm": 22.8734856612771, + "learning_rate": 9.167159382871039e-06, + "loss": 4.4869, + "step": 4009 + }, + { + "epoch": 0.8036072144288577, + "grad_norm": 22.67395937300693, + "learning_rate": 9.166514975863048e-06, + "loss": 4.4272, + "step": 4010 + }, + { + "epoch": 0.803807615230461, + "grad_norm": 21.732455361517456, + "learning_rate": 9.165870342315483e-06, + "loss": 4.3635, + "step": 4011 + }, + { + "epoch": 0.8040080160320642, + "grad_norm": 44.425276628577144, + "learning_rate": 9.165225482263403e-06, + "loss": 4.758, + "step": 4012 + }, + { + "epoch": 0.8042084168336673, + "grad_norm": 29.21338280776975, + "learning_rate": 9.164580395741864e-06, + "loss": 4.1987, + "step": 4013 + }, + { + "epoch": 0.8044088176352705, + "grad_norm": 26.155629571091833, + "learning_rate": 9.16393508278594e-06, + "loss": 4.536, + "step": 4014 + }, + { + "epoch": 0.8046092184368737, + "grad_norm": 23.961263600466836, + "learning_rate": 9.163289543430719e-06, + "loss": 3.9594, + "step": 
4015 + }, + { + "epoch": 0.8048096192384769, + "grad_norm": 28.562815937858637, + "learning_rate": 9.1626437777113e-06, + "loss": 4.4244, + "step": 4016 + }, + { + "epoch": 0.8050100200400802, + "grad_norm": 64.30268519991202, + "learning_rate": 9.161997785662794e-06, + "loss": 4.0692, + "step": 4017 + }, + { + "epoch": 0.8052104208416834, + "grad_norm": 18.096632741672742, + "learning_rate": 9.161351567320326e-06, + "loss": 4.0877, + "step": 4018 + }, + { + "epoch": 0.8054108216432866, + "grad_norm": 59.67026356663745, + "learning_rate": 9.160705122719032e-06, + "loss": 4.4074, + "step": 4019 + }, + { + "epoch": 0.8056112224448898, + "grad_norm": 19.38012978570561, + "learning_rate": 9.160058451894057e-06, + "loss": 3.8817, + "step": 4020 + }, + { + "epoch": 0.805811623246493, + "grad_norm": 17.218145315231656, + "learning_rate": 9.159411554880561e-06, + "loss": 4.2757, + "step": 4021 + }, + { + "epoch": 0.8060120240480962, + "grad_norm": 27.877002492607136, + "learning_rate": 9.15876443171372e-06, + "loss": 3.6247, + "step": 4022 + }, + { + "epoch": 0.8062124248496993, + "grad_norm": 25.09892300440598, + "learning_rate": 9.158117082428721e-06, + "loss": 3.6471, + "step": 4023 + }, + { + "epoch": 0.8064128256513026, + "grad_norm": 27.32775721554838, + "learning_rate": 9.157469507060755e-06, + "loss": 4.3836, + "step": 4024 + }, + { + "epoch": 0.8066132264529058, + "grad_norm": 20.374196877889446, + "learning_rate": 9.156821705645036e-06, + "loss": 4.0656, + "step": 4025 + }, + { + "epoch": 0.806813627254509, + "grad_norm": 24.033226406585374, + "learning_rate": 9.156173678216784e-06, + "loss": 4.105, + "step": 4026 + }, + { + "epoch": 0.8070140280561122, + "grad_norm": 42.000037554629465, + "learning_rate": 9.155525424811235e-06, + "loss": 4.0946, + "step": 4027 + }, + { + "epoch": 0.8072144288577154, + "grad_norm": 30.91974066209311, + "learning_rate": 9.154876945463634e-06, + "loss": 4.4447, + "step": 4028 + }, + { + "epoch": 0.8074148296593187, + "grad_norm": 26.140433771651452, + "learning_rate": 9.154228240209241e-06, + "loss": 4.1479, + "step": 4029 + }, + { + "epoch": 0.8076152304609219, + "grad_norm": 41.38165630661821, + "learning_rate": 9.153579309083326e-06, + "loss": 4.3314, + "step": 4030 + }, + { + "epoch": 0.8078156312625251, + "grad_norm": 25.221395366789565, + "learning_rate": 9.152930152121173e-06, + "loss": 4.4006, + "step": 4031 + }, + { + "epoch": 0.8080160320641283, + "grad_norm": 22.090690114419395, + "learning_rate": 9.152280769358077e-06, + "loss": 4.0771, + "step": 4032 + }, + { + "epoch": 0.8082164328657314, + "grad_norm": 22.60284104951894, + "learning_rate": 9.151631160829346e-06, + "loss": 4.245, + "step": 4033 + }, + { + "epoch": 0.8084168336673346, + "grad_norm": 21.561638960317268, + "learning_rate": 9.150981326570301e-06, + "loss": 3.7453, + "step": 4034 + }, + { + "epoch": 0.8086172344689379, + "grad_norm": 21.882816585686157, + "learning_rate": 9.150331266616274e-06, + "loss": 4.1988, + "step": 4035 + }, + { + "epoch": 0.8088176352705411, + "grad_norm": 24.465242222454258, + "learning_rate": 9.149680981002609e-06, + "loss": 4.209, + "step": 4036 + }, + { + "epoch": 0.8090180360721443, + "grad_norm": 31.037565010894994, + "learning_rate": 9.149030469764664e-06, + "loss": 4.2159, + "step": 4037 + }, + { + "epoch": 0.8092184368737475, + "grad_norm": 31.020324494727575, + "learning_rate": 9.148379732937807e-06, + "loss": 4.0792, + "step": 4038 + }, + { + "epoch": 0.8094188376753507, + "grad_norm": 31.947495714398745, + "learning_rate": 
9.147728770557421e-06, + "loss": 4.3408, + "step": 4039 + }, + { + "epoch": 0.8096192384769539, + "grad_norm": 20.24282121037517, + "learning_rate": 9.147077582658899e-06, + "loss": 3.584, + "step": 4040 + }, + { + "epoch": 0.8098196392785572, + "grad_norm": 23.93364021095207, + "learning_rate": 9.146426169277648e-06, + "loss": 3.884, + "step": 4041 + }, + { + "epoch": 0.8100200400801604, + "grad_norm": 23.607513707625323, + "learning_rate": 9.145774530449084e-06, + "loss": 4.3926, + "step": 4042 + }, + { + "epoch": 0.8102204408817635, + "grad_norm": 18.609018700571053, + "learning_rate": 9.14512266620864e-06, + "loss": 3.5249, + "step": 4043 + }, + { + "epoch": 0.8104208416833667, + "grad_norm": 22.197476651759743, + "learning_rate": 9.144470576591758e-06, + "loss": 4.2674, + "step": 4044 + }, + { + "epoch": 0.8106212424849699, + "grad_norm": 25.632565538208116, + "learning_rate": 9.143818261633893e-06, + "loss": 4.0717, + "step": 4045 + }, + { + "epoch": 0.8108216432865731, + "grad_norm": 51.476786677065036, + "learning_rate": 9.143165721370512e-06, + "loss": 5.0465, + "step": 4046 + }, + { + "epoch": 0.8110220440881764, + "grad_norm": 31.900737617354867, + "learning_rate": 9.142512955837095e-06, + "loss": 3.9448, + "step": 4047 + }, + { + "epoch": 0.8112224448897796, + "grad_norm": 28.268192024857658, + "learning_rate": 9.141859965069132e-06, + "loss": 3.9991, + "step": 4048 + }, + { + "epoch": 0.8114228456913828, + "grad_norm": 39.44971966927148, + "learning_rate": 9.14120674910213e-06, + "loss": 4.7551, + "step": 4049 + }, + { + "epoch": 0.811623246492986, + "grad_norm": 28.914759513229484, + "learning_rate": 9.140553307971605e-06, + "loss": 3.8827, + "step": 4050 + }, + { + "epoch": 0.8118236472945892, + "grad_norm": 35.51238185454677, + "learning_rate": 9.139899641713083e-06, + "loss": 4.1968, + "step": 4051 + }, + { + "epoch": 0.8120240480961923, + "grad_norm": 24.483106354901057, + "learning_rate": 9.139245750362109e-06, + "loss": 4.4234, + "step": 4052 + }, + { + "epoch": 0.8122244488977955, + "grad_norm": 38.893294408625586, + "learning_rate": 9.138591633954231e-06, + "loss": 3.8228, + "step": 4053 + }, + { + "epoch": 0.8124248496993988, + "grad_norm": 28.41399224413034, + "learning_rate": 9.137937292525018e-06, + "loss": 4.078, + "step": 4054 + }, + { + "epoch": 0.812625250501002, + "grad_norm": 26.19384547393693, + "learning_rate": 9.137282726110046e-06, + "loss": 4.2744, + "step": 4055 + }, + { + "epoch": 0.8128256513026052, + "grad_norm": 25.5901765486376, + "learning_rate": 9.136627934744903e-06, + "loss": 4.3421, + "step": 4056 + }, + { + "epoch": 0.8130260521042084, + "grad_norm": 30.61479877205177, + "learning_rate": 9.135972918465196e-06, + "loss": 4.1392, + "step": 4057 + }, + { + "epoch": 0.8132264529058116, + "grad_norm": 22.721113760246116, + "learning_rate": 9.135317677306534e-06, + "loss": 3.7842, + "step": 4058 + }, + { + "epoch": 0.8134268537074149, + "grad_norm": 32.09066721203594, + "learning_rate": 9.134662211304548e-06, + "loss": 4.5848, + "step": 4059 + }, + { + "epoch": 0.8136272545090181, + "grad_norm": 33.65764326806177, + "learning_rate": 9.13400652049487e-06, + "loss": 4.5941, + "step": 4060 + }, + { + "epoch": 0.8138276553106213, + "grad_norm": 16.87603005011309, + "learning_rate": 9.13335060491316e-06, + "loss": 3.4259, + "step": 4061 + }, + { + "epoch": 0.8140280561122244, + "grad_norm": 18.293821149183458, + "learning_rate": 9.132694464595075e-06, + "loss": 4.4224, + "step": 4062 + }, + { + "epoch": 0.8142284569138276, + "grad_norm": 
22.03314684139647, + "learning_rate": 9.132038099576291e-06, + "loss": 3.8638, + "step": 4063 + }, + { + "epoch": 0.8144288577154308, + "grad_norm": 47.03266873702274, + "learning_rate": 9.131381509892494e-06, + "loss": 4.0138, + "step": 4064 + }, + { + "epoch": 0.814629258517034, + "grad_norm": 22.168193710259295, + "learning_rate": 9.130724695579387e-06, + "loss": 3.8974, + "step": 4065 + }, + { + "epoch": 0.8148296593186373, + "grad_norm": 22.0902736080567, + "learning_rate": 9.130067656672683e-06, + "loss": 4.6793, + "step": 4066 + }, + { + "epoch": 0.8150300601202405, + "grad_norm": 37.12471575785862, + "learning_rate": 9.129410393208102e-06, + "loss": 4.2931, + "step": 4067 + }, + { + "epoch": 0.8152304609218437, + "grad_norm": 26.43724116996435, + "learning_rate": 9.128752905221383e-06, + "loss": 4.4475, + "step": 4068 + }, + { + "epoch": 0.8154308617234469, + "grad_norm": 38.45496778689234, + "learning_rate": 9.128095192748273e-06, + "loss": 4.7167, + "step": 4069 + }, + { + "epoch": 0.8156312625250501, + "grad_norm": 20.947311424271707, + "learning_rate": 9.127437255824534e-06, + "loss": 4.1666, + "step": 4070 + }, + { + "epoch": 0.8158316633266534, + "grad_norm": 22.696759531824245, + "learning_rate": 9.126779094485939e-06, + "loss": 3.9999, + "step": 4071 + }, + { + "epoch": 0.8160320641282565, + "grad_norm": 26.30601486248999, + "learning_rate": 9.126120708768273e-06, + "loss": 4.1661, + "step": 4072 + }, + { + "epoch": 0.8162324649298597, + "grad_norm": 21.920870205328757, + "learning_rate": 9.125462098707333e-06, + "loss": 4.0774, + "step": 4073 + }, + { + "epoch": 0.8164328657314629, + "grad_norm": 21.633700802138772, + "learning_rate": 9.124803264338931e-06, + "loss": 4.1526, + "step": 4074 + }, + { + "epoch": 0.8166332665330661, + "grad_norm": 38.16728356578319, + "learning_rate": 9.124144205698883e-06, + "loss": 3.9639, + "step": 4075 + }, + { + "epoch": 0.8168336673346693, + "grad_norm": 28.26198126077582, + "learning_rate": 9.12348492282303e-06, + "loss": 4.5936, + "step": 4076 + }, + { + "epoch": 0.8170340681362726, + "grad_norm": 24.48387955461397, + "learning_rate": 9.122825415747213e-06, + "loss": 3.9947, + "step": 4077 + }, + { + "epoch": 0.8172344689378758, + "grad_norm": 17.211037819492585, + "learning_rate": 9.122165684507294e-06, + "loss": 3.8599, + "step": 4078 + }, + { + "epoch": 0.817434869739479, + "grad_norm": 29.955219831051732, + "learning_rate": 9.121505729139142e-06, + "loss": 4.497, + "step": 4079 + }, + { + "epoch": 0.8176352705410822, + "grad_norm": 26.921062680447566, + "learning_rate": 9.120845549678637e-06, + "loss": 4.7059, + "step": 4080 + }, + { + "epoch": 0.8178356713426854, + "grad_norm": 26.156972356792192, + "learning_rate": 9.120185146161678e-06, + "loss": 4.4692, + "step": 4081 + }, + { + "epoch": 0.8180360721442885, + "grad_norm": 25.163962915828346, + "learning_rate": 9.119524518624173e-06, + "loss": 3.6236, + "step": 4082 + }, + { + "epoch": 0.8182364729458917, + "grad_norm": 24.44199402841602, + "learning_rate": 9.118863667102035e-06, + "loss": 3.9993, + "step": 4083 + }, + { + "epoch": 0.818436873747495, + "grad_norm": 45.764350441436264, + "learning_rate": 9.118202591631201e-06, + "loss": 4.5635, + "step": 4084 + }, + { + "epoch": 0.8186372745490982, + "grad_norm": 24.240576839698818, + "learning_rate": 9.117541292247613e-06, + "loss": 3.7747, + "step": 4085 + }, + { + "epoch": 0.8188376753507014, + "grad_norm": 19.40270186167907, + "learning_rate": 9.116879768987228e-06, + "loss": 4.5889, + "step": 4086 + }, + { + "epoch": 
0.8190380761523046, + "grad_norm": 20.189861395350775, + "learning_rate": 9.116218021886012e-06, + "loss": 3.5206, + "step": 4087 + }, + { + "epoch": 0.8192384769539078, + "grad_norm": 22.662814288504734, + "learning_rate": 9.115556050979946e-06, + "loss": 3.9354, + "step": 4088 + }, + { + "epoch": 0.819438877755511, + "grad_norm": 66.27609991130169, + "learning_rate": 9.114893856305021e-06, + "loss": 4.4905, + "step": 4089 + }, + { + "epoch": 0.8196392785571143, + "grad_norm": 26.68019031332249, + "learning_rate": 9.114231437897245e-06, + "loss": 4.2838, + "step": 4090 + }, + { + "epoch": 0.8198396793587175, + "grad_norm": 37.96663241068031, + "learning_rate": 9.113568795792632e-06, + "loss": 5.1471, + "step": 4091 + }, + { + "epoch": 0.8200400801603206, + "grad_norm": 23.25552798909133, + "learning_rate": 9.112905930027212e-06, + "loss": 3.9, + "step": 4092 + }, + { + "epoch": 0.8202404809619238, + "grad_norm": 16.847372453226814, + "learning_rate": 9.112242840637024e-06, + "loss": 3.9176, + "step": 4093 + }, + { + "epoch": 0.820440881763527, + "grad_norm": 28.886065009914002, + "learning_rate": 9.111579527658123e-06, + "loss": 4.1618, + "step": 4094 + }, + { + "epoch": 0.8206412825651302, + "grad_norm": 24.862298123746648, + "learning_rate": 9.110915991126574e-06, + "loss": 4.2455, + "step": 4095 + }, + { + "epoch": 0.8208416833667335, + "grad_norm": 37.01420753704754, + "learning_rate": 9.110252231078454e-06, + "loss": 4.4763, + "step": 4096 + }, + { + "epoch": 0.8210420841683367, + "grad_norm": 47.56884563699116, + "learning_rate": 9.109588247549853e-06, + "loss": 4.1499, + "step": 4097 + }, + { + "epoch": 0.8212424849699399, + "grad_norm": 26.6338667268436, + "learning_rate": 9.108924040576874e-06, + "loss": 4.2578, + "step": 4098 + }, + { + "epoch": 0.8214428857715431, + "grad_norm": 18.495114899378862, + "learning_rate": 9.108259610195627e-06, + "loss": 4.444, + "step": 4099 + }, + { + "epoch": 0.8216432865731463, + "grad_norm": 30.379766812059806, + "learning_rate": 9.107594956442242e-06, + "loss": 4.2358, + "step": 4100 + }, + { + "epoch": 0.8218436873747496, + "grad_norm": 20.74601943205499, + "learning_rate": 9.106930079352857e-06, + "loss": 4.336, + "step": 4101 + }, + { + "epoch": 0.8220440881763527, + "grad_norm": 23.06005104047212, + "learning_rate": 9.106264978963619e-06, + "loss": 4.5493, + "step": 4102 + }, + { + "epoch": 0.8222444889779559, + "grad_norm": 67.80100778052949, + "learning_rate": 9.105599655310696e-06, + "loss": 4.4709, + "step": 4103 + }, + { + "epoch": 0.8224448897795591, + "grad_norm": 24.047911909840924, + "learning_rate": 9.104934108430256e-06, + "loss": 3.9309, + "step": 4104 + }, + { + "epoch": 0.8226452905811623, + "grad_norm": 27.681986287910803, + "learning_rate": 9.10426833835849e-06, + "loss": 4.936, + "step": 4105 + }, + { + "epoch": 0.8228456913827655, + "grad_norm": 54.29060537181796, + "learning_rate": 9.103602345131597e-06, + "loss": 4.5115, + "step": 4106 + }, + { + "epoch": 0.8230460921843687, + "grad_norm": 22.52744074729842, + "learning_rate": 9.102936128785785e-06, + "loss": 3.8063, + "step": 4107 + }, + { + "epoch": 0.823246492985972, + "grad_norm": 23.678584726376208, + "learning_rate": 9.10226968935728e-06, + "loss": 4.1945, + "step": 4108 + }, + { + "epoch": 0.8234468937875752, + "grad_norm": 14.664980095404925, + "learning_rate": 9.101603026882318e-06, + "loss": 3.5674, + "step": 4109 + }, + { + "epoch": 0.8236472945891784, + "grad_norm": 35.015066208301704, + "learning_rate": 9.100936141397143e-06, + "loss": 3.62, + "step": 4110 
+ }, + { + "epoch": 0.8238476953907815, + "grad_norm": 22.22232995345669, + "learning_rate": 9.100269032938016e-06, + "loss": 4.2157, + "step": 4111 + }, + { + "epoch": 0.8240480961923847, + "grad_norm": 25.808707486405414, + "learning_rate": 9.099601701541211e-06, + "loss": 4.2511, + "step": 4112 + }, + { + "epoch": 0.824248496993988, + "grad_norm": 30.018504156982473, + "learning_rate": 9.09893414724301e-06, + "loss": 4.7491, + "step": 4113 + }, + { + "epoch": 0.8244488977955912, + "grad_norm": 21.060133001587356, + "learning_rate": 9.098266370079709e-06, + "loss": 3.9473, + "step": 4114 + }, + { + "epoch": 0.8246492985971944, + "grad_norm": 23.049186114345012, + "learning_rate": 9.097598370087614e-06, + "loss": 4.0543, + "step": 4115 + }, + { + "epoch": 0.8248496993987976, + "grad_norm": 21.236158372222295, + "learning_rate": 9.09693014730305e-06, + "loss": 3.9279, + "step": 4116 + }, + { + "epoch": 0.8250501002004008, + "grad_norm": 43.73357894452031, + "learning_rate": 9.096261701762343e-06, + "loss": 5.0276, + "step": 4117 + }, + { + "epoch": 0.825250501002004, + "grad_norm": 22.677009757492705, + "learning_rate": 9.095593033501841e-06, + "loss": 3.7147, + "step": 4118 + }, + { + "epoch": 0.8254509018036073, + "grad_norm": 35.503240256208066, + "learning_rate": 9.094924142557902e-06, + "loss": 4.3851, + "step": 4119 + }, + { + "epoch": 0.8256513026052105, + "grad_norm": 27.881448483334697, + "learning_rate": 9.094255028966893e-06, + "loss": 4.0355, + "step": 4120 + }, + { + "epoch": 0.8258517034068136, + "grad_norm": 28.663897955060094, + "learning_rate": 9.093585692765192e-06, + "loss": 4.3283, + "step": 4121 + }, + { + "epoch": 0.8260521042084168, + "grad_norm": 25.256135207214125, + "learning_rate": 9.092916133989195e-06, + "loss": 4.2976, + "step": 4122 + }, + { + "epoch": 0.82625250501002, + "grad_norm": 24.168292564368752, + "learning_rate": 9.092246352675307e-06, + "loss": 4.3039, + "step": 4123 + }, + { + "epoch": 0.8264529058116232, + "grad_norm": 25.268013469419415, + "learning_rate": 9.091576348859942e-06, + "loss": 4.6178, + "step": 4124 + }, + { + "epoch": 0.8266533066132264, + "grad_norm": 22.706179977001437, + "learning_rate": 9.090906122579534e-06, + "loss": 3.8217, + "step": 4125 + }, + { + "epoch": 0.8268537074148297, + "grad_norm": 27.887104062917803, + "learning_rate": 9.09023567387052e-06, + "loss": 4.2162, + "step": 4126 + }, + { + "epoch": 0.8270541082164329, + "grad_norm": 21.861660191166564, + "learning_rate": 9.089565002769353e-06, + "loss": 4.5886, + "step": 4127 + }, + { + "epoch": 0.8272545090180361, + "grad_norm": 20.6070960272119, + "learning_rate": 9.088894109312503e-06, + "loss": 3.8334, + "step": 4128 + }, + { + "epoch": 0.8274549098196393, + "grad_norm": 23.18476065558724, + "learning_rate": 9.088222993536444e-06, + "loss": 4.2971, + "step": 4129 + }, + { + "epoch": 0.8276553106212425, + "grad_norm": 17.15045538580259, + "learning_rate": 9.087551655477666e-06, + "loss": 3.7382, + "step": 4130 + }, + { + "epoch": 0.8278557114228456, + "grad_norm": 29.518341799769996, + "learning_rate": 9.08688009517267e-06, + "loss": 4.3015, + "step": 4131 + }, + { + "epoch": 0.8280561122244489, + "grad_norm": 27.044068365413086, + "learning_rate": 9.086208312657968e-06, + "loss": 4.2879, + "step": 4132 + }, + { + "epoch": 0.8282565130260521, + "grad_norm": 23.422186907674856, + "learning_rate": 9.085536307970093e-06, + "loss": 4.0394, + "step": 4133 + }, + { + "epoch": 0.8284569138276553, + "grad_norm": 23.321138452502133, + "learning_rate": 9.084864081145575e-06, + 
"loss": 3.8313, + "step": 4134 + }, + { + "epoch": 0.8286573146292585, + "grad_norm": 43.83052261666396, + "learning_rate": 9.08419163222097e-06, + "loss": 4.237, + "step": 4135 + }, + { + "epoch": 0.8288577154308617, + "grad_norm": 21.199259355342328, + "learning_rate": 9.083518961232834e-06, + "loss": 4.2738, + "step": 4136 + }, + { + "epoch": 0.829058116232465, + "grad_norm": 27.989753874443238, + "learning_rate": 9.082846068217746e-06, + "loss": 3.8144, + "step": 4137 + }, + { + "epoch": 0.8292585170340682, + "grad_norm": 22.635265805719115, + "learning_rate": 9.082172953212289e-06, + "loss": 3.9804, + "step": 4138 + }, + { + "epoch": 0.8294589178356714, + "grad_norm": 28.21643096828433, + "learning_rate": 9.081499616253064e-06, + "loss": 3.9157, + "step": 4139 + }, + { + "epoch": 0.8296593186372746, + "grad_norm": 28.46025452418498, + "learning_rate": 9.08082605737668e-06, + "loss": 4.1813, + "step": 4140 + }, + { + "epoch": 0.8298597194388777, + "grad_norm": 20.239798520883276, + "learning_rate": 9.080152276619758e-06, + "loss": 4.3914, + "step": 4141 + }, + { + "epoch": 0.8300601202404809, + "grad_norm": 30.212070967857006, + "learning_rate": 9.079478274018936e-06, + "loss": 4.4847, + "step": 4142 + }, + { + "epoch": 0.8302605210420841, + "grad_norm": 55.60081749276686, + "learning_rate": 9.078804049610856e-06, + "loss": 4.1025, + "step": 4143 + }, + { + "epoch": 0.8304609218436874, + "grad_norm": 19.780270925883503, + "learning_rate": 9.07812960343218e-06, + "loss": 4.3004, + "step": 4144 + }, + { + "epoch": 0.8306613226452906, + "grad_norm": 26.054736239221644, + "learning_rate": 9.077454935519579e-06, + "loss": 4.6747, + "step": 4145 + }, + { + "epoch": 0.8308617234468938, + "grad_norm": 23.569305442634917, + "learning_rate": 9.076780045909732e-06, + "loss": 3.7651, + "step": 4146 + }, + { + "epoch": 0.831062124248497, + "grad_norm": 38.57744983257343, + "learning_rate": 9.076104934639338e-06, + "loss": 4.0966, + "step": 4147 + }, + { + "epoch": 0.8312625250501002, + "grad_norm": 35.85306888357474, + "learning_rate": 9.0754296017451e-06, + "loss": 4.3232, + "step": 4148 + }, + { + "epoch": 0.8314629258517034, + "grad_norm": 40.167166444375816, + "learning_rate": 9.074754047263741e-06, + "loss": 4.905, + "step": 4149 + }, + { + "epoch": 0.8316633266533067, + "grad_norm": 38.967023194293844, + "learning_rate": 9.074078271231988e-06, + "loss": 4.7538, + "step": 4150 + }, + { + "epoch": 0.8318637274549098, + "grad_norm": 20.46345464121112, + "learning_rate": 9.073402273686585e-06, + "loss": 3.9647, + "step": 4151 + }, + { + "epoch": 0.832064128256513, + "grad_norm": 22.335807271880878, + "learning_rate": 9.072726054664288e-06, + "loss": 4.1563, + "step": 4152 + }, + { + "epoch": 0.8322645290581162, + "grad_norm": 19.335323461119266, + "learning_rate": 9.072049614201864e-06, + "loss": 3.8428, + "step": 4153 + }, + { + "epoch": 0.8324649298597194, + "grad_norm": 22.55649043086437, + "learning_rate": 9.071372952336091e-06, + "loss": 4.0842, + "step": 4154 + }, + { + "epoch": 0.8326653306613226, + "grad_norm": 27.101390870203133, + "learning_rate": 9.07069606910376e-06, + "loss": 4.1355, + "step": 4155 + }, + { + "epoch": 0.8328657314629259, + "grad_norm": 26.545133842872122, + "learning_rate": 9.070018964541676e-06, + "loss": 4.8139, + "step": 4156 + }, + { + "epoch": 0.8330661322645291, + "grad_norm": 37.06979103746242, + "learning_rate": 9.069341638686653e-06, + "loss": 3.6063, + "step": 4157 + }, + { + "epoch": 0.8332665330661323, + "grad_norm": 23.45659476127781, + "learning_rate": 
9.068664091575519e-06, + "loss": 4.3475, + "step": 4158 + }, + { + "epoch": 0.8334669338677355, + "grad_norm": 21.824974407316585, + "learning_rate": 9.06798632324511e-06, + "loss": 3.1683, + "step": 4159 + }, + { + "epoch": 0.8336673346693386, + "grad_norm": 27.84853631924564, + "learning_rate": 9.067308333732281e-06, + "loss": 4.0846, + "step": 4160 + }, + { + "epoch": 0.8338677354709418, + "grad_norm": 16.774814355481436, + "learning_rate": 9.066630123073893e-06, + "loss": 3.9031, + "step": 4161 + }, + { + "epoch": 0.8340681362725451, + "grad_norm": 37.04020373660806, + "learning_rate": 9.065951691306824e-06, + "loss": 4.0995, + "step": 4162 + }, + { + "epoch": 0.8342685370741483, + "grad_norm": 25.517975016973143, + "learning_rate": 9.065273038467961e-06, + "loss": 4.6538, + "step": 4163 + }, + { + "epoch": 0.8344689378757515, + "grad_norm": 21.606218076026895, + "learning_rate": 9.064594164594199e-06, + "loss": 4.1797, + "step": 4164 + }, + { + "epoch": 0.8346693386773547, + "grad_norm": 19.86930131577214, + "learning_rate": 9.063915069722454e-06, + "loss": 3.7835, + "step": 4165 + }, + { + "epoch": 0.8348697394789579, + "grad_norm": 19.18845230747274, + "learning_rate": 9.063235753889647e-06, + "loss": 4.1279, + "step": 4166 + }, + { + "epoch": 0.8350701402805611, + "grad_norm": 27.330311091854675, + "learning_rate": 9.062556217132715e-06, + "loss": 4.22, + "step": 4167 + }, + { + "epoch": 0.8352705410821644, + "grad_norm": 27.476513802373475, + "learning_rate": 9.061876459488604e-06, + "loss": 4.6739, + "step": 4168 + }, + { + "epoch": 0.8354709418837676, + "grad_norm": 24.250302468444158, + "learning_rate": 9.061196480994272e-06, + "loss": 4.2959, + "step": 4169 + }, + { + "epoch": 0.8356713426853707, + "grad_norm": 31.44715851050976, + "learning_rate": 9.060516281686695e-06, + "loss": 4.8106, + "step": 4170 + }, + { + "epoch": 0.8358717434869739, + "grad_norm": 43.11978311147202, + "learning_rate": 9.059835861602854e-06, + "loss": 4.47, + "step": 4171 + }, + { + "epoch": 0.8360721442885771, + "grad_norm": 25.067697818578708, + "learning_rate": 9.059155220779742e-06, + "loss": 4.3651, + "step": 4172 + }, + { + "epoch": 0.8362725450901803, + "grad_norm": 30.036342754999822, + "learning_rate": 9.058474359254373e-06, + "loss": 4.4873, + "step": 4173 + }, + { + "epoch": 0.8364729458917836, + "grad_norm": 25.09587054134745, + "learning_rate": 9.05779327706376e-06, + "loss": 3.6338, + "step": 4174 + }, + { + "epoch": 0.8366733466933868, + "grad_norm": 33.93956494084584, + "learning_rate": 9.057111974244934e-06, + "loss": 4.4438, + "step": 4175 + }, + { + "epoch": 0.83687374749499, + "grad_norm": 32.72885021639674, + "learning_rate": 9.056430450834944e-06, + "loss": 4.5522, + "step": 4176 + }, + { + "epoch": 0.8370741482965932, + "grad_norm": 24.56239612437919, + "learning_rate": 9.055748706870841e-06, + "loss": 4.1259, + "step": 4177 + }, + { + "epoch": 0.8372745490981964, + "grad_norm": 19.307695756702298, + "learning_rate": 9.055066742389697e-06, + "loss": 3.5976, + "step": 4178 + }, + { + "epoch": 0.8374749498997996, + "grad_norm": 23.11349067711587, + "learning_rate": 9.054384557428587e-06, + "loss": 4.2673, + "step": 4179 + }, + { + "epoch": 0.8376753507014028, + "grad_norm": 16.958030120537654, + "learning_rate": 9.0537021520246e-06, + "loss": 4.28, + "step": 4180 + }, + { + "epoch": 0.837875751503006, + "grad_norm": 49.131477823516676, + "learning_rate": 9.053019526214848e-06, + "loss": 3.6728, + "step": 4181 + }, + { + "epoch": 0.8380761523046092, + "grad_norm": 
20.684747156153495, + "learning_rate": 9.052336680036439e-06, + "loss": 4.0565, + "step": 4182 + }, + { + "epoch": 0.8382765531062124, + "grad_norm": 20.337073572917326, + "learning_rate": 9.051653613526506e-06, + "loss": 3.6741, + "step": 4183 + }, + { + "epoch": 0.8384769539078156, + "grad_norm": 22.41986759227579, + "learning_rate": 9.050970326722182e-06, + "loss": 4.1136, + "step": 4184 + }, + { + "epoch": 0.8386773547094188, + "grad_norm": 81.75616384661645, + "learning_rate": 9.050286819660622e-06, + "loss": 4.7423, + "step": 4185 + }, + { + "epoch": 0.8388777555110221, + "grad_norm": 22.63236773477591, + "learning_rate": 9.049603092378989e-06, + "loss": 4.0835, + "step": 4186 + }, + { + "epoch": 0.8390781563126253, + "grad_norm": 28.873503946946563, + "learning_rate": 9.04891914491446e-06, + "loss": 4.1444, + "step": 4187 + }, + { + "epoch": 0.8392785571142285, + "grad_norm": 23.275980736890588, + "learning_rate": 9.04823497730422e-06, + "loss": 4.5725, + "step": 4188 + }, + { + "epoch": 0.8394789579158317, + "grad_norm": 27.714486672138214, + "learning_rate": 9.047550589585465e-06, + "loss": 4.1378, + "step": 4189 + }, + { + "epoch": 0.8396793587174348, + "grad_norm": 21.76123125753666, + "learning_rate": 9.046865981795414e-06, + "loss": 4.3843, + "step": 4190 + }, + { + "epoch": 0.839879759519038, + "grad_norm": 22.59889450462701, + "learning_rate": 9.046181153971282e-06, + "loss": 4.3089, + "step": 4191 + }, + { + "epoch": 0.8400801603206413, + "grad_norm": 24.655624956606786, + "learning_rate": 9.04549610615031e-06, + "loss": 4.4207, + "step": 4192 + }, + { + "epoch": 0.8402805611222445, + "grad_norm": 18.833957532983646, + "learning_rate": 9.044810838369744e-06, + "loss": 3.9945, + "step": 4193 + }, + { + "epoch": 0.8404809619238477, + "grad_norm": 28.86078512262403, + "learning_rate": 9.044125350666841e-06, + "loss": 3.5954, + "step": 4194 + }, + { + "epoch": 0.8406813627254509, + "grad_norm": 27.06447940748083, + "learning_rate": 9.043439643078873e-06, + "loss": 4.3768, + "step": 4195 + }, + { + "epoch": 0.8408817635270541, + "grad_norm": 25.47405437821949, + "learning_rate": 9.042753715643122e-06, + "loss": 4.0184, + "step": 4196 + }, + { + "epoch": 0.8410821643286573, + "grad_norm": 29.750496001533772, + "learning_rate": 9.042067568396886e-06, + "loss": 4.5522, + "step": 4197 + }, + { + "epoch": 0.8412825651302606, + "grad_norm": 18.26113710562264, + "learning_rate": 9.041381201377468e-06, + "loss": 4.1462, + "step": 4198 + }, + { + "epoch": 0.8414829659318638, + "grad_norm": 24.31060767371249, + "learning_rate": 9.04069461462219e-06, + "loss": 4.2367, + "step": 4199 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 15.578500972025997, + "learning_rate": 9.040007808168378e-06, + "loss": 3.914, + "step": 4200 + }, + { + "epoch": 0.8418837675350701, + "grad_norm": 20.405216656253742, + "learning_rate": 9.03932078205338e-06, + "loss": 4.4123, + "step": 4201 + }, + { + "epoch": 0.8420841683366733, + "grad_norm": 33.10187952302611, + "learning_rate": 9.038633536314549e-06, + "loss": 4.112, + "step": 4202 + }, + { + "epoch": 0.8422845691382765, + "grad_norm": 43.169462700979544, + "learning_rate": 9.037946070989251e-06, + "loss": 4.5087, + "step": 4203 + }, + { + "epoch": 0.8424849699398798, + "grad_norm": 27.84890882259751, + "learning_rate": 9.037258386114864e-06, + "loss": 4.4601, + "step": 4204 + }, + { + "epoch": 0.842685370741483, + "grad_norm": 32.72415971350088, + "learning_rate": 9.03657048172878e-06, + "loss": 3.9849, + "step": 4205 + }, + { + "epoch": 
0.8428857715430862, + "grad_norm": 23.73226792278673, + "learning_rate": 9.035882357868398e-06, + "loss": 4.2237, + "step": 4206 + }, + { + "epoch": 0.8430861723446894, + "grad_norm": 21.58098565298114, + "learning_rate": 9.035194014571137e-06, + "loss": 3.4096, + "step": 4207 + }, + { + "epoch": 0.8432865731462926, + "grad_norm": 33.51640767712509, + "learning_rate": 9.03450545187442e-06, + "loss": 4.3893, + "step": 4208 + }, + { + "epoch": 0.8434869739478958, + "grad_norm": 23.81053904706952, + "learning_rate": 9.033816669815686e-06, + "loss": 3.9871, + "step": 4209 + }, + { + "epoch": 0.843687374749499, + "grad_norm": 28.348261867600776, + "learning_rate": 9.033127668432385e-06, + "loss": 4.0755, + "step": 4210 + }, + { + "epoch": 0.8438877755511022, + "grad_norm": 24.940072736493097, + "learning_rate": 9.03243844776198e-06, + "loss": 4.3597, + "step": 4211 + }, + { + "epoch": 0.8440881763527054, + "grad_norm": 27.080454831991364, + "learning_rate": 9.031749007841944e-06, + "loss": 4.2665, + "step": 4212 + }, + { + "epoch": 0.8442885771543086, + "grad_norm": 21.192288203011113, + "learning_rate": 9.031059348709763e-06, + "loss": 4.235, + "step": 4213 + }, + { + "epoch": 0.8444889779559118, + "grad_norm": 30.422621534548362, + "learning_rate": 9.030369470402935e-06, + "loss": 5.199, + "step": 4214 + }, + { + "epoch": 0.844689378757515, + "grad_norm": 31.538519115374733, + "learning_rate": 9.029679372958968e-06, + "loss": 4.3536, + "step": 4215 + }, + { + "epoch": 0.8448897795591183, + "grad_norm": 21.618987086276956, + "learning_rate": 9.028989056415387e-06, + "loss": 3.7274, + "step": 4216 + }, + { + "epoch": 0.8450901803607215, + "grad_norm": 23.220062067961997, + "learning_rate": 9.028298520809723e-06, + "loss": 4.4173, + "step": 4217 + }, + { + "epoch": 0.8452905811623247, + "grad_norm": 23.94874167582281, + "learning_rate": 9.027607766179523e-06, + "loss": 4.2672, + "step": 4218 + }, + { + "epoch": 0.8454909819639278, + "grad_norm": 25.374424179425592, + "learning_rate": 9.02691679256234e-06, + "loss": 3.9257, + "step": 4219 + }, + { + "epoch": 0.845691382765531, + "grad_norm": 20.737152524421916, + "learning_rate": 9.026225599995751e-06, + "loss": 4.0048, + "step": 4220 + }, + { + "epoch": 0.8458917835671342, + "grad_norm": 20.17407728865361, + "learning_rate": 9.025534188517331e-06, + "loss": 3.6511, + "step": 4221 + }, + { + "epoch": 0.8460921843687375, + "grad_norm": 19.053686514632137, + "learning_rate": 9.024842558164675e-06, + "loss": 4.2955, + "step": 4222 + }, + { + "epoch": 0.8462925851703407, + "grad_norm": 24.880438729875273, + "learning_rate": 9.024150708975388e-06, + "loss": 4.2288, + "step": 4223 + }, + { + "epoch": 0.8464929859719439, + "grad_norm": 32.30250563210225, + "learning_rate": 9.023458640987086e-06, + "loss": 4.0174, + "step": 4224 + }, + { + "epoch": 0.8466933867735471, + "grad_norm": 27.29736853615097, + "learning_rate": 9.0227663542374e-06, + "loss": 4.4398, + "step": 4225 + }, + { + "epoch": 0.8468937875751503, + "grad_norm": 52.57983786421953, + "learning_rate": 9.022073848763968e-06, + "loss": 4.7502, + "step": 4226 + }, + { + "epoch": 0.8470941883767535, + "grad_norm": 24.795695202679877, + "learning_rate": 9.021381124604445e-06, + "loss": 3.8602, + "step": 4227 + }, + { + "epoch": 0.8472945891783568, + "grad_norm": 25.764883941562015, + "learning_rate": 9.020688181796493e-06, + "loss": 4.3346, + "step": 4228 + }, + { + "epoch": 0.8474949899799599, + "grad_norm": 21.95091300031103, + "learning_rate": 9.01999502037779e-06, + "loss": 3.9168, + "step": 
4229 + }, + { + "epoch": 0.8476953907815631, + "grad_norm": 20.2599288816899, + "learning_rate": 9.019301640386022e-06, + "loss": 3.8293, + "step": 4230 + }, + { + "epoch": 0.8478957915831663, + "grad_norm": 26.594316650045933, + "learning_rate": 9.018608041858893e-06, + "loss": 4.7227, + "step": 4231 + }, + { + "epoch": 0.8480961923847695, + "grad_norm": 28.41337448352412, + "learning_rate": 9.017914224834112e-06, + "loss": 4.1079, + "step": 4232 + }, + { + "epoch": 0.8482965931863727, + "grad_norm": 20.149554343106058, + "learning_rate": 9.017220189349405e-06, + "loss": 4.0561, + "step": 4233 + }, + { + "epoch": 0.848496993987976, + "grad_norm": 25.80568632598686, + "learning_rate": 9.016525935442503e-06, + "loss": 4.2155, + "step": 4234 + }, + { + "epoch": 0.8486973947895792, + "grad_norm": 19.37696480597211, + "learning_rate": 9.015831463151159e-06, + "loss": 4.2289, + "step": 4235 + }, + { + "epoch": 0.8488977955911824, + "grad_norm": 24.237723565638547, + "learning_rate": 9.015136772513131e-06, + "loss": 4.5008, + "step": 4236 + }, + { + "epoch": 0.8490981963927856, + "grad_norm": 26.985019253028433, + "learning_rate": 9.014441863566189e-06, + "loss": 4.7423, + "step": 4237 + }, + { + "epoch": 0.8492985971943888, + "grad_norm": 21.71658216640793, + "learning_rate": 9.013746736348117e-06, + "loss": 4.5189, + "step": 4238 + }, + { + "epoch": 0.8494989979959919, + "grad_norm": 18.920360532605102, + "learning_rate": 9.013051390896709e-06, + "loss": 3.9984, + "step": 4239 + }, + { + "epoch": 0.8496993987975952, + "grad_norm": 33.84350316013346, + "learning_rate": 9.012355827249775e-06, + "loss": 4.5723, + "step": 4240 + }, + { + "epoch": 0.8498997995991984, + "grad_norm": 24.443271024342504, + "learning_rate": 9.011660045445131e-06, + "loss": 4.3432, + "step": 4241 + }, + { + "epoch": 0.8501002004008016, + "grad_norm": 22.435347710236286, + "learning_rate": 9.01096404552061e-06, + "loss": 4.3515, + "step": 4242 + }, + { + "epoch": 0.8503006012024048, + "grad_norm": 20.888079696858735, + "learning_rate": 9.010267827514053e-06, + "loss": 3.7643, + "step": 4243 + }, + { + "epoch": 0.850501002004008, + "grad_norm": 23.68030056168854, + "learning_rate": 9.009571391463312e-06, + "loss": 4.3344, + "step": 4244 + }, + { + "epoch": 0.8507014028056112, + "grad_norm": 22.71813094276961, + "learning_rate": 9.008874737406257e-06, + "loss": 4.169, + "step": 4245 + }, + { + "epoch": 0.8509018036072145, + "grad_norm": 19.878261664294627, + "learning_rate": 9.008177865380765e-06, + "loss": 3.8619, + "step": 4246 + }, + { + "epoch": 0.8511022044088177, + "grad_norm": 25.756507138357705, + "learning_rate": 9.007480775424726e-06, + "loss": 4.2157, + "step": 4247 + }, + { + "epoch": 0.8513026052104209, + "grad_norm": 25.699774855613914, + "learning_rate": 9.006783467576041e-06, + "loss": 4.0258, + "step": 4248 + }, + { + "epoch": 0.851503006012024, + "grad_norm": 19.026724052862374, + "learning_rate": 9.006085941872627e-06, + "loss": 4.2989, + "step": 4249 + }, + { + "epoch": 0.8517034068136272, + "grad_norm": 32.362981786752684, + "learning_rate": 9.005388198352405e-06, + "loss": 4.791, + "step": 4250 + }, + { + "epoch": 0.8519038076152304, + "grad_norm": 20.843254943201213, + "learning_rate": 9.004690237053314e-06, + "loss": 3.8854, + "step": 4251 + }, + { + "epoch": 0.8521042084168337, + "grad_norm": 41.38190747565222, + "learning_rate": 9.003992058013302e-06, + "loss": 4.2503, + "step": 4252 + }, + { + "epoch": 0.8523046092184369, + "grad_norm": 27.450870666645915, + "learning_rate": 
9.003293661270333e-06, + "loss": 4.3528, + "step": 4253 + }, + { + "epoch": 0.8525050100200401, + "grad_norm": 41.58939176932375, + "learning_rate": 9.002595046862376e-06, + "loss": 4.1722, + "step": 4254 + }, + { + "epoch": 0.8527054108216433, + "grad_norm": 23.75813855351965, + "learning_rate": 9.001896214827422e-06, + "loss": 3.7181, + "step": 4255 + }, + { + "epoch": 0.8529058116232465, + "grad_norm": 25.127883710398088, + "learning_rate": 9.00119716520346e-06, + "loss": 4.1167, + "step": 4256 + }, + { + "epoch": 0.8531062124248497, + "grad_norm": 31.754062064726682, + "learning_rate": 9.000497898028505e-06, + "loss": 4.4222, + "step": 4257 + }, + { + "epoch": 0.853306613226453, + "grad_norm": 25.853526442648544, + "learning_rate": 8.99979841334057e-06, + "loss": 4.0588, + "step": 4258 + }, + { + "epoch": 0.8535070140280561, + "grad_norm": 26.065237255976793, + "learning_rate": 8.999098711177693e-06, + "loss": 4.3281, + "step": 4259 + }, + { + "epoch": 0.8537074148296593, + "grad_norm": 24.63253428923603, + "learning_rate": 8.998398791577917e-06, + "loss": 4.3668, + "step": 4260 + }, + { + "epoch": 0.8539078156312625, + "grad_norm": 27.452915323820182, + "learning_rate": 8.997698654579294e-06, + "loss": 3.843, + "step": 4261 + }, + { + "epoch": 0.8541082164328657, + "grad_norm": 29.280991933380427, + "learning_rate": 8.996998300219896e-06, + "loss": 4.5742, + "step": 4262 + }, + { + "epoch": 0.8543086172344689, + "grad_norm": 22.25960020834508, + "learning_rate": 8.996297728537799e-06, + "loss": 3.9177, + "step": 4263 + }, + { + "epoch": 0.8545090180360722, + "grad_norm": 27.816663814236957, + "learning_rate": 8.995596939571097e-06, + "loss": 4.8166, + "step": 4264 + }, + { + "epoch": 0.8547094188376754, + "grad_norm": 25.96981972467446, + "learning_rate": 8.99489593335789e-06, + "loss": 4.0825, + "step": 4265 + }, + { + "epoch": 0.8549098196392786, + "grad_norm": 38.50405638911088, + "learning_rate": 8.994194709936295e-06, + "loss": 3.857, + "step": 4266 + }, + { + "epoch": 0.8551102204408818, + "grad_norm": 34.17062395638649, + "learning_rate": 8.993493269344438e-06, + "loss": 4.2684, + "step": 4267 + }, + { + "epoch": 0.855310621242485, + "grad_norm": 21.252212913540074, + "learning_rate": 8.992791611620457e-06, + "loss": 4.051, + "step": 4268 + }, + { + "epoch": 0.8555110220440881, + "grad_norm": 17.743852715013936, + "learning_rate": 8.9920897368025e-06, + "loss": 4.1579, + "step": 4269 + }, + { + "epoch": 0.8557114228456913, + "grad_norm": 56.1186159164332, + "learning_rate": 8.991387644928732e-06, + "loss": 4.4437, + "step": 4270 + }, + { + "epoch": 0.8559118236472946, + "grad_norm": 23.8073992828888, + "learning_rate": 8.990685336037327e-06, + "loss": 4.0175, + "step": 4271 + }, + { + "epoch": 0.8561122244488978, + "grad_norm": 26.91728896431069, + "learning_rate": 8.989982810166471e-06, + "loss": 4.4031, + "step": 4272 + }, + { + "epoch": 0.856312625250501, + "grad_norm": 23.45604431606649, + "learning_rate": 8.989280067354357e-06, + "loss": 4.4858, + "step": 4273 + }, + { + "epoch": 0.8565130260521042, + "grad_norm": 19.446155330297177, + "learning_rate": 8.988577107639198e-06, + "loss": 3.3022, + "step": 4274 + }, + { + "epoch": 0.8567134268537074, + "grad_norm": 22.827173182229245, + "learning_rate": 8.987873931059214e-06, + "loss": 4.2615, + "step": 4275 + }, + { + "epoch": 0.8569138276553107, + "grad_norm": 31.219919414625235, + "learning_rate": 8.987170537652637e-06, + "loss": 4.183, + "step": 4276 + }, + { + "epoch": 0.8571142284569139, + "grad_norm": 
21.141342907335137, + "learning_rate": 8.986466927457714e-06, + "loss": 3.9266, + "step": 4277 + }, + { + "epoch": 0.857314629258517, + "grad_norm": 22.552778841397977, + "learning_rate": 8.985763100512697e-06, + "loss": 3.799, + "step": 4278 + }, + { + "epoch": 0.8575150300601202, + "grad_norm": 20.066098114111515, + "learning_rate": 8.985059056855858e-06, + "loss": 4.223, + "step": 4279 + }, + { + "epoch": 0.8577154308617234, + "grad_norm": 32.85922599579037, + "learning_rate": 8.984354796525476e-06, + "loss": 4.4093, + "step": 4280 + }, + { + "epoch": 0.8579158316633266, + "grad_norm": 27.74442951227385, + "learning_rate": 8.983650319559842e-06, + "loss": 3.8466, + "step": 4281 + }, + { + "epoch": 0.8581162324649299, + "grad_norm": 65.39511091126926, + "learning_rate": 8.982945625997262e-06, + "loss": 4.4409, + "step": 4282 + }, + { + "epoch": 0.8583166332665331, + "grad_norm": 21.00093994015684, + "learning_rate": 8.982240715876044e-06, + "loss": 3.9566, + "step": 4283 + }, + { + "epoch": 0.8585170340681363, + "grad_norm": 33.9668575394016, + "learning_rate": 8.981535589234525e-06, + "loss": 4.0188, + "step": 4284 + }, + { + "epoch": 0.8587174348697395, + "grad_norm": 30.148114145643675, + "learning_rate": 8.980830246111034e-06, + "loss": 4.0949, + "step": 4285 + }, + { + "epoch": 0.8589178356713427, + "grad_norm": 22.995126740764434, + "learning_rate": 8.980124686543929e-06, + "loss": 3.8731, + "step": 4286 + }, + { + "epoch": 0.8591182364729459, + "grad_norm": 17.61528652238117, + "learning_rate": 8.97941891057157e-06, + "loss": 3.743, + "step": 4287 + }, + { + "epoch": 0.859318637274549, + "grad_norm": 22.092521019028357, + "learning_rate": 8.97871291823233e-06, + "loss": 3.8594, + "step": 4288 + }, + { + "epoch": 0.8595190380761523, + "grad_norm": 28.094062258140266, + "learning_rate": 8.978006709564595e-06, + "loss": 5.041, + "step": 4289 + }, + { + "epoch": 0.8597194388777555, + "grad_norm": 25.497582855514505, + "learning_rate": 8.977300284606765e-06, + "loss": 3.9505, + "step": 4290 + }, + { + "epoch": 0.8599198396793587, + "grad_norm": 28.530422477596954, + "learning_rate": 8.976593643397244e-06, + "loss": 4.1026, + "step": 4291 + }, + { + "epoch": 0.8601202404809619, + "grad_norm": 43.27958613719711, + "learning_rate": 8.975886785974459e-06, + "loss": 4.5499, + "step": 4292 + }, + { + "epoch": 0.8603206412825651, + "grad_norm": 24.377121107439457, + "learning_rate": 8.975179712376841e-06, + "loss": 3.9833, + "step": 4293 + }, + { + "epoch": 0.8605210420841684, + "grad_norm": 28.53670999134748, + "learning_rate": 8.974472422642834e-06, + "loss": 4.1536, + "step": 4294 + }, + { + "epoch": 0.8607214428857716, + "grad_norm": 27.385314133347386, + "learning_rate": 8.973764916810893e-06, + "loss": 4.1438, + "step": 4295 + }, + { + "epoch": 0.8609218436873748, + "grad_norm": 20.548309502418054, + "learning_rate": 8.973057194919487e-06, + "loss": 4.0338, + "step": 4296 + }, + { + "epoch": 0.861122244488978, + "grad_norm": 31.474101005543304, + "learning_rate": 8.9723492570071e-06, + "loss": 4.3065, + "step": 4297 + }, + { + "epoch": 0.8613226452905811, + "grad_norm": 20.015508234048024, + "learning_rate": 8.971641103112217e-06, + "loss": 3.4005, + "step": 4298 + }, + { + "epoch": 0.8615230460921843, + "grad_norm": 23.202823503006066, + "learning_rate": 8.970932733273346e-06, + "loss": 4.4526, + "step": 4299 + }, + { + "epoch": 0.8617234468937875, + "grad_norm": 24.053750869626537, + "learning_rate": 8.970224147529e-06, + "loss": 4.1192, + "step": 4300 + }, + { + "epoch": 
0.8619238476953908, + "grad_norm": 24.45336992134216, + "learning_rate": 8.969515345917707e-06, + "loss": 4.0858, + "step": 4301 + }, + { + "epoch": 0.862124248496994, + "grad_norm": 29.984924051901906, + "learning_rate": 8.968806328478006e-06, + "loss": 4.1103, + "step": 4302 + }, + { + "epoch": 0.8623246492985972, + "grad_norm": 37.32626059311253, + "learning_rate": 8.968097095248445e-06, + "loss": 3.5246, + "step": 4303 + }, + { + "epoch": 0.8625250501002004, + "grad_norm": 24.174768931516795, + "learning_rate": 8.96738764626759e-06, + "loss": 4.3661, + "step": 4304 + }, + { + "epoch": 0.8627254509018036, + "grad_norm": 24.913933213980272, + "learning_rate": 8.966677981574008e-06, + "loss": 4.4285, + "step": 4305 + }, + { + "epoch": 0.8629258517034069, + "grad_norm": 26.563295310933682, + "learning_rate": 8.965968101206291e-06, + "loss": 4.5678, + "step": 4306 + }, + { + "epoch": 0.8631262525050101, + "grad_norm": 39.391665010892474, + "learning_rate": 8.965258005203034e-06, + "loss": 4.4303, + "step": 4307 + }, + { + "epoch": 0.8633266533066132, + "grad_norm": 32.1011372829675, + "learning_rate": 8.964547693602846e-06, + "loss": 4.5833, + "step": 4308 + }, + { + "epoch": 0.8635270541082164, + "grad_norm": 21.819624732802335, + "learning_rate": 8.963837166444348e-06, + "loss": 4.149, + "step": 4309 + }, + { + "epoch": 0.8637274549098196, + "grad_norm": 25.391865946115647, + "learning_rate": 8.963126423766172e-06, + "loss": 4.3251, + "step": 4310 + }, + { + "epoch": 0.8639278557114228, + "grad_norm": 24.629733765678832, + "learning_rate": 8.962415465606962e-06, + "loss": 3.9148, + "step": 4311 + }, + { + "epoch": 0.864128256513026, + "grad_norm": 33.238252682807435, + "learning_rate": 8.961704292005373e-06, + "loss": 4.3752, + "step": 4312 + }, + { + "epoch": 0.8643286573146293, + "grad_norm": 24.54345112686024, + "learning_rate": 8.960992903000076e-06, + "loss": 4.3831, + "step": 4313 + }, + { + "epoch": 0.8645290581162325, + "grad_norm": 22.292408686342355, + "learning_rate": 8.960281298629745e-06, + "loss": 3.8283, + "step": 4314 + }, + { + "epoch": 0.8647294589178357, + "grad_norm": 46.41157199832231, + "learning_rate": 8.959569478933077e-06, + "loss": 4.7261, + "step": 4315 + }, + { + "epoch": 0.8649298597194389, + "grad_norm": 26.45801635133441, + "learning_rate": 8.958857443948769e-06, + "loss": 3.5518, + "step": 4316 + }, + { + "epoch": 0.8651302605210421, + "grad_norm": 16.28560077438233, + "learning_rate": 8.95814519371554e-06, + "loss": 4.0584, + "step": 4317 + }, + { + "epoch": 0.8653306613226452, + "grad_norm": 27.581302860766094, + "learning_rate": 8.957432728272114e-06, + "loss": 4.1028, + "step": 4318 + }, + { + "epoch": 0.8655310621242485, + "grad_norm": 20.823677290614633, + "learning_rate": 8.956720047657226e-06, + "loss": 4.4137, + "step": 4319 + }, + { + "epoch": 0.8657314629258517, + "grad_norm": 19.334582879215144, + "learning_rate": 8.956007151909631e-06, + "loss": 3.799, + "step": 4320 + }, + { + "epoch": 0.8659318637274549, + "grad_norm": 23.891144431172826, + "learning_rate": 8.955294041068088e-06, + "loss": 4.9959, + "step": 4321 + }, + { + "epoch": 0.8661322645290581, + "grad_norm": 30.388812369183178, + "learning_rate": 8.954580715171369e-06, + "loss": 4.4307, + "step": 4322 + }, + { + "epoch": 0.8663326653306613, + "grad_norm": 22.15780647988255, + "learning_rate": 8.953867174258257e-06, + "loss": 4.7363, + "step": 4323 + }, + { + "epoch": 0.8665330661322646, + "grad_norm": 52.65005957766859, + "learning_rate": 8.95315341836755e-06, + "loss": 4.8679, + 
"step": 4324 + }, + { + "epoch": 0.8667334669338678, + "grad_norm": 30.975761206125103, + "learning_rate": 8.95243944753806e-06, + "loss": 4.4396, + "step": 4325 + }, + { + "epoch": 0.866933867735471, + "grad_norm": 21.82343281611049, + "learning_rate": 8.9517252618086e-06, + "loss": 4.5594, + "step": 4326 + }, + { + "epoch": 0.8671342685370741, + "grad_norm": 36.673172341140344, + "learning_rate": 8.951010861218005e-06, + "loss": 4.8891, + "step": 4327 + }, + { + "epoch": 0.8673346693386773, + "grad_norm": 22.011982079423547, + "learning_rate": 8.950296245805116e-06, + "loss": 4.2749, + "step": 4328 + }, + { + "epoch": 0.8675350701402805, + "grad_norm": 35.43450504414758, + "learning_rate": 8.949581415608792e-06, + "loss": 4.229, + "step": 4329 + }, + { + "epoch": 0.8677354709418837, + "grad_norm": 22.031008427672603, + "learning_rate": 8.948866370667895e-06, + "loss": 4.033, + "step": 4330 + }, + { + "epoch": 0.867935871743487, + "grad_norm": 36.46982202669357, + "learning_rate": 8.948151111021304e-06, + "loss": 4.1855, + "step": 4331 + }, + { + "epoch": 0.8681362725450902, + "grad_norm": 25.056225369957904, + "learning_rate": 8.947435636707908e-06, + "loss": 5.0314, + "step": 4332 + }, + { + "epoch": 0.8683366733466934, + "grad_norm": 23.782069593680742, + "learning_rate": 8.94671994776661e-06, + "loss": 4.3088, + "step": 4333 + }, + { + "epoch": 0.8685370741482966, + "grad_norm": 24.41103864207476, + "learning_rate": 8.946004044236322e-06, + "loss": 4.4207, + "step": 4334 + }, + { + "epoch": 0.8687374749498998, + "grad_norm": 56.18652936216958, + "learning_rate": 8.945287926155972e-06, + "loss": 4.616, + "step": 4335 + }, + { + "epoch": 0.868937875751503, + "grad_norm": 23.222125252987183, + "learning_rate": 8.944571593564492e-06, + "loss": 4.2468, + "step": 4336 + }, + { + "epoch": 0.8691382765531062, + "grad_norm": 16.114210446828817, + "learning_rate": 8.943855046500831e-06, + "loss": 3.9136, + "step": 4337 + }, + { + "epoch": 0.8693386773547094, + "grad_norm": 37.53481931588515, + "learning_rate": 8.94313828500395e-06, + "loss": 4.1975, + "step": 4338 + }, + { + "epoch": 0.8695390781563126, + "grad_norm": 28.302473718865674, + "learning_rate": 8.94242130911282e-06, + "loss": 4.294, + "step": 4339 + }, + { + "epoch": 0.8697394789579158, + "grad_norm": 23.02570192829926, + "learning_rate": 8.941704118866424e-06, + "loss": 4.1066, + "step": 4340 + }, + { + "epoch": 0.869939879759519, + "grad_norm": 26.05578457573926, + "learning_rate": 8.940986714303757e-06, + "loss": 4.1605, + "step": 4341 + }, + { + "epoch": 0.8701402805611222, + "grad_norm": 22.17649478255025, + "learning_rate": 8.940269095463823e-06, + "loss": 4.3705, + "step": 4342 + }, + { + "epoch": 0.8703406813627255, + "grad_norm": 21.06425950728993, + "learning_rate": 8.939551262385644e-06, + "loss": 4.0179, + "step": 4343 + }, + { + "epoch": 0.8705410821643287, + "grad_norm": 38.96881865105104, + "learning_rate": 8.938833215108247e-06, + "loss": 4.562, + "step": 4344 + }, + { + "epoch": 0.8707414829659319, + "grad_norm": 20.845671745507524, + "learning_rate": 8.938114953670673e-06, + "loss": 3.6139, + "step": 4345 + }, + { + "epoch": 0.8709418837675351, + "grad_norm": 21.807318100238856, + "learning_rate": 8.937396478111976e-06, + "loss": 3.9344, + "step": 4346 + }, + { + "epoch": 0.8711422845691382, + "grad_norm": 24.30868761621147, + "learning_rate": 8.936677788471221e-06, + "loss": 4.485, + "step": 4347 + }, + { + "epoch": 0.8713426853707414, + "grad_norm": 19.874532805052805, + "learning_rate": 8.935958884787485e-06, + 
"loss": 4.3242, + "step": 4348 + }, + { + "epoch": 0.8715430861723447, + "grad_norm": 19.76152754694758, + "learning_rate": 8.935239767099853e-06, + "loss": 3.5986, + "step": 4349 + }, + { + "epoch": 0.8717434869739479, + "grad_norm": 33.28413710631155, + "learning_rate": 8.934520435447424e-06, + "loss": 4.3452, + "step": 4350 + }, + { + "epoch": 0.8719438877755511, + "grad_norm": 31.411867566597195, + "learning_rate": 8.933800889869313e-06, + "loss": 4.5587, + "step": 4351 + }, + { + "epoch": 0.8721442885771543, + "grad_norm": 28.245032549030533, + "learning_rate": 8.933081130404641e-06, + "loss": 3.8138, + "step": 4352 + }, + { + "epoch": 0.8723446893787575, + "grad_norm": 30.004012106333285, + "learning_rate": 8.932361157092542e-06, + "loss": 4.4821, + "step": 4353 + }, + { + "epoch": 0.8725450901803607, + "grad_norm": 18.236019340353504, + "learning_rate": 8.931640969972162e-06, + "loss": 3.5394, + "step": 4354 + }, + { + "epoch": 0.872745490981964, + "grad_norm": 24.270816035977543, + "learning_rate": 8.93092056908266e-06, + "loss": 4.2823, + "step": 4355 + }, + { + "epoch": 0.8729458917835672, + "grad_norm": 17.842351690197162, + "learning_rate": 8.930199954463202e-06, + "loss": 3.5531, + "step": 4356 + }, + { + "epoch": 0.8731462925851703, + "grad_norm": 25.855400228196, + "learning_rate": 8.929479126152971e-06, + "loss": 4.5128, + "step": 4357 + }, + { + "epoch": 0.8733466933867735, + "grad_norm": 42.89622911694955, + "learning_rate": 8.92875808419116e-06, + "loss": 4.4804, + "step": 4358 + }, + { + "epoch": 0.8735470941883767, + "grad_norm": 35.95980597552644, + "learning_rate": 8.928036828616972e-06, + "loss": 4.3298, + "step": 4359 + }, + { + "epoch": 0.87374749498998, + "grad_norm": 22.401245528967973, + "learning_rate": 8.927315359469627e-06, + "loss": 4.3149, + "step": 4360 + }, + { + "epoch": 0.8739478957915832, + "grad_norm": 29.796407860853485, + "learning_rate": 8.926593676788345e-06, + "loss": 4.1977, + "step": 4361 + }, + { + "epoch": 0.8741482965931864, + "grad_norm": 33.23102862404562, + "learning_rate": 8.925871780612368e-06, + "loss": 4.2241, + "step": 4362 + }, + { + "epoch": 0.8743486973947896, + "grad_norm": 33.83721202341276, + "learning_rate": 8.92514967098095e-06, + "loss": 4.3994, + "step": 4363 + }, + { + "epoch": 0.8745490981963928, + "grad_norm": 24.066398188263154, + "learning_rate": 8.924427347933348e-06, + "loss": 3.6436, + "step": 4364 + }, + { + "epoch": 0.874749498997996, + "grad_norm": 23.496558352594736, + "learning_rate": 8.92370481150884e-06, + "loss": 4.2163, + "step": 4365 + }, + { + "epoch": 0.8749498997995993, + "grad_norm": 31.20232145719141, + "learning_rate": 8.922982061746708e-06, + "loss": 3.8481, + "step": 4366 + }, + { + "epoch": 0.8751503006012024, + "grad_norm": 23.095399334521275, + "learning_rate": 8.92225909868625e-06, + "loss": 3.8451, + "step": 4367 + }, + { + "epoch": 0.8753507014028056, + "grad_norm": 32.426202337882, + "learning_rate": 8.921535922366777e-06, + "loss": 4.6527, + "step": 4368 + }, + { + "epoch": 0.8755511022044088, + "grad_norm": 28.85117917223835, + "learning_rate": 8.920812532827606e-06, + "loss": 4.3785, + "step": 4369 + }, + { + "epoch": 0.875751503006012, + "grad_norm": 24.85589233067486, + "learning_rate": 8.92008893010807e-06, + "loss": 3.9932, + "step": 4370 + }, + { + "epoch": 0.8759519038076152, + "grad_norm": 23.54669712494685, + "learning_rate": 8.919365114247513e-06, + "loss": 4.4043, + "step": 4371 + }, + { + "epoch": 0.8761523046092184, + "grad_norm": 27.455764357260094, + "learning_rate": 
8.918641085285288e-06, + "loss": 4.0731, + "step": 4372 + }, + { + "epoch": 0.8763527054108217, + "grad_norm": 24.396439289951967, + "learning_rate": 8.917916843260762e-06, + "loss": 4.2708, + "step": 4373 + }, + { + "epoch": 0.8765531062124249, + "grad_norm": 30.18240693972756, + "learning_rate": 8.917192388213316e-06, + "loss": 4.721, + "step": 4374 + }, + { + "epoch": 0.8767535070140281, + "grad_norm": 48.06172399562005, + "learning_rate": 8.916467720182338e-06, + "loss": 4.2725, + "step": 4375 + }, + { + "epoch": 0.8769539078156313, + "grad_norm": 19.97291572339493, + "learning_rate": 8.915742839207227e-06, + "loss": 3.53, + "step": 4376 + }, + { + "epoch": 0.8771543086172344, + "grad_norm": 19.728818494326234, + "learning_rate": 8.915017745327399e-06, + "loss": 3.5858, + "step": 4377 + }, + { + "epoch": 0.8773547094188376, + "grad_norm": 22.963798047744532, + "learning_rate": 8.914292438582274e-06, + "loss": 4.1637, + "step": 4378 + }, + { + "epoch": 0.8775551102204409, + "grad_norm": 26.043910445441462, + "learning_rate": 8.913566919011296e-06, + "loss": 4.1452, + "step": 4379 + }, + { + "epoch": 0.8777555110220441, + "grad_norm": 30.284292788467322, + "learning_rate": 8.912841186653904e-06, + "loss": 3.6913, + "step": 4380 + }, + { + "epoch": 0.8779559118236473, + "grad_norm": 24.009588181927782, + "learning_rate": 8.912115241549561e-06, + "loss": 3.8984, + "step": 4381 + }, + { + "epoch": 0.8781563126252505, + "grad_norm": 29.533053010470635, + "learning_rate": 8.911389083737739e-06, + "loss": 4.2861, + "step": 4382 + }, + { + "epoch": 0.8783567134268537, + "grad_norm": 31.9562357641513, + "learning_rate": 8.910662713257919e-06, + "loss": 4.9275, + "step": 4383 + }, + { + "epoch": 0.878557114228457, + "grad_norm": 24.356035299492426, + "learning_rate": 8.909936130149593e-06, + "loss": 4.6935, + "step": 4384 + }, + { + "epoch": 0.8787575150300602, + "grad_norm": 25.811001314609143, + "learning_rate": 8.90920933445227e-06, + "loss": 4.4442, + "step": 4385 + }, + { + "epoch": 0.8789579158316633, + "grad_norm": 30.941682482082346, + "learning_rate": 8.908482326205463e-06, + "loss": 4.3555, + "step": 4386 + }, + { + "epoch": 0.8791583166332665, + "grad_norm": 18.460483746517028, + "learning_rate": 8.907755105448704e-06, + "loss": 3.7526, + "step": 4387 + }, + { + "epoch": 0.8793587174348697, + "grad_norm": 21.304814994108476, + "learning_rate": 8.90702767222153e-06, + "loss": 4.0093, + "step": 4388 + }, + { + "epoch": 0.8795591182364729, + "grad_norm": 20.235618976866473, + "learning_rate": 8.906300026563495e-06, + "loss": 3.8862, + "step": 4389 + }, + { + "epoch": 0.8797595190380761, + "grad_norm": 27.75147300668815, + "learning_rate": 8.90557216851416e-06, + "loss": 4.3305, + "step": 4390 + }, + { + "epoch": 0.8799599198396794, + "grad_norm": 65.05441368291737, + "learning_rate": 8.904844098113104e-06, + "loss": 4.6838, + "step": 4391 + }, + { + "epoch": 0.8801603206412826, + "grad_norm": 38.04893584499754, + "learning_rate": 8.904115815399908e-06, + "loss": 4.6365, + "step": 4392 + }, + { + "epoch": 0.8803607214428858, + "grad_norm": 35.063680757869236, + "learning_rate": 8.903387320414173e-06, + "loss": 4.5601, + "step": 4393 + }, + { + "epoch": 0.880561122244489, + "grad_norm": 23.71965745115253, + "learning_rate": 8.902658613195508e-06, + "loss": 3.9614, + "step": 4394 + }, + { + "epoch": 0.8807615230460922, + "grad_norm": 31.1870828324678, + "learning_rate": 8.901929693783531e-06, + "loss": 4.9191, + "step": 4395 + }, + { + "epoch": 0.8809619238476953, + "grad_norm": 
24.38005009563098, + "learning_rate": 8.901200562217879e-06, + "loss": 3.8279, + "step": 4396 + }, + { + "epoch": 0.8811623246492986, + "grad_norm": 35.082834336391855, + "learning_rate": 8.900471218538193e-06, + "loss": 4.7733, + "step": 4397 + }, + { + "epoch": 0.8813627254509018, + "grad_norm": 20.477127443304187, + "learning_rate": 8.899741662784129e-06, + "loss": 4.1075, + "step": 4398 + }, + { + "epoch": 0.881563126252505, + "grad_norm": 32.12043242618599, + "learning_rate": 8.899011894995355e-06, + "loss": 4.2805, + "step": 4399 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 25.551278123825124, + "learning_rate": 8.898281915211548e-06, + "loss": 3.9294, + "step": 4400 + }, + { + "epoch": 0.8819639278557114, + "grad_norm": 21.07702806232441, + "learning_rate": 8.8975517234724e-06, + "loss": 3.7903, + "step": 4401 + }, + { + "epoch": 0.8821643286573146, + "grad_norm": 22.086975141093227, + "learning_rate": 8.89682131981761e-06, + "loss": 4.3006, + "step": 4402 + }, + { + "epoch": 0.8823647294589179, + "grad_norm": 25.995137793162243, + "learning_rate": 8.896090704286893e-06, + "loss": 4.4451, + "step": 4403 + }, + { + "epoch": 0.8825651302605211, + "grad_norm": 25.24723027407478, + "learning_rate": 8.895359876919974e-06, + "loss": 3.9598, + "step": 4404 + }, + { + "epoch": 0.8827655310621243, + "grad_norm": 19.531115887275224, + "learning_rate": 8.894628837756587e-06, + "loss": 4.1458, + "step": 4405 + }, + { + "epoch": 0.8829659318637274, + "grad_norm": 18.81309963212445, + "learning_rate": 8.893897586836482e-06, + "loss": 3.3972, + "step": 4406 + }, + { + "epoch": 0.8831663326653306, + "grad_norm": 35.324393803324234, + "learning_rate": 8.893166124199418e-06, + "loss": 4.7745, + "step": 4407 + }, + { + "epoch": 0.8833667334669338, + "grad_norm": 26.77300668145777, + "learning_rate": 8.892434449885164e-06, + "loss": 4.7716, + "step": 4408 + }, + { + "epoch": 0.8835671342685371, + "grad_norm": 33.67171406890043, + "learning_rate": 8.891702563933505e-06, + "loss": 4.5108, + "step": 4409 + }, + { + "epoch": 0.8837675350701403, + "grad_norm": 41.58697849387545, + "learning_rate": 8.89097046638423e-06, + "loss": 3.6597, + "step": 4410 + }, + { + "epoch": 0.8839679358717435, + "grad_norm": 23.40954086388323, + "learning_rate": 8.89023815727715e-06, + "loss": 3.9347, + "step": 4411 + }, + { + "epoch": 0.8841683366733467, + "grad_norm": 30.053913170254035, + "learning_rate": 8.889505636652076e-06, + "loss": 4.9073, + "step": 4412 + }, + { + "epoch": 0.8843687374749499, + "grad_norm": 32.487233765882486, + "learning_rate": 8.888772904548841e-06, + "loss": 4.5491, + "step": 4413 + }, + { + "epoch": 0.8845691382765531, + "grad_norm": 33.73982155735967, + "learning_rate": 8.888039961007282e-06, + "loss": 4.0084, + "step": 4414 + }, + { + "epoch": 0.8847695390781564, + "grad_norm": 26.659570041262292, + "learning_rate": 8.887306806067253e-06, + "loss": 4.212, + "step": 4415 + }, + { + "epoch": 0.8849699398797595, + "grad_norm": 23.161227001011856, + "learning_rate": 8.886573439768612e-06, + "loss": 4.2023, + "step": 4416 + }, + { + "epoch": 0.8851703406813627, + "grad_norm": 16.92806056657457, + "learning_rate": 8.885839862151237e-06, + "loss": 3.7727, + "step": 4417 + }, + { + "epoch": 0.8853707414829659, + "grad_norm": 40.725830153657284, + "learning_rate": 8.885106073255011e-06, + "loss": 4.2369, + "step": 4418 + }, + { + "epoch": 0.8855711422845691, + "grad_norm": 27.194328113099846, + "learning_rate": 8.884372073119837e-06, + "loss": 4.2831, + "step": 4419 + }, + { + "epoch": 
0.8857715430861723, + "grad_norm": 19.31183833386721, + "learning_rate": 8.883637861785617e-06, + "loss": 3.8119, + "step": 4420 + }, + { + "epoch": 0.8859719438877756, + "grad_norm": 25.722595802639653, + "learning_rate": 8.882903439292273e-06, + "loss": 4.0866, + "step": 4421 + }, + { + "epoch": 0.8861723446893788, + "grad_norm": 32.11995609085729, + "learning_rate": 8.882168805679739e-06, + "loss": 5.2333, + "step": 4422 + }, + { + "epoch": 0.886372745490982, + "grad_norm": 33.68478722140083, + "learning_rate": 8.881433960987955e-06, + "loss": 4.5708, + "step": 4423 + }, + { + "epoch": 0.8865731462925852, + "grad_norm": 18.0285849250441, + "learning_rate": 8.880698905256878e-06, + "loss": 3.6781, + "step": 4424 + }, + { + "epoch": 0.8867735470941884, + "grad_norm": 20.529616501135422, + "learning_rate": 8.879963638526473e-06, + "loss": 4.2882, + "step": 4425 + }, + { + "epoch": 0.8869739478957915, + "grad_norm": 17.31896588672583, + "learning_rate": 8.879228160836716e-06, + "loss": 3.892, + "step": 4426 + }, + { + "epoch": 0.8871743486973948, + "grad_norm": 34.732827320492, + "learning_rate": 8.8784924722276e-06, + "loss": 4.4356, + "step": 4427 + }, + { + "epoch": 0.887374749498998, + "grad_norm": 23.91303124402181, + "learning_rate": 8.877756572739123e-06, + "loss": 4.3054, + "step": 4428 + }, + { + "epoch": 0.8875751503006012, + "grad_norm": 23.699939119248963, + "learning_rate": 8.877020462411296e-06, + "loss": 4.314, + "step": 4429 + }, + { + "epoch": 0.8877755511022044, + "grad_norm": 39.79231959508062, + "learning_rate": 8.876284141284143e-06, + "loss": 3.6256, + "step": 4430 + }, + { + "epoch": 0.8879759519038076, + "grad_norm": 24.691394663683518, + "learning_rate": 8.8755476093977e-06, + "loss": 3.78, + "step": 4431 + }, + { + "epoch": 0.8881763527054108, + "grad_norm": 19.538562520449137, + "learning_rate": 8.874810866792013e-06, + "loss": 3.5956, + "step": 4432 + }, + { + "epoch": 0.8883767535070141, + "grad_norm": 14.475460810979488, + "learning_rate": 8.874073913507137e-06, + "loss": 3.6352, + "step": 4433 + }, + { + "epoch": 0.8885771543086173, + "grad_norm": 18.053007243478966, + "learning_rate": 8.873336749583147e-06, + "loss": 3.9187, + "step": 4434 + }, + { + "epoch": 0.8887775551102205, + "grad_norm": 21.405224815209024, + "learning_rate": 8.872599375060119e-06, + "loss": 4.3541, + "step": 4435 + }, + { + "epoch": 0.8889779559118236, + "grad_norm": 24.07520812903385, + "learning_rate": 8.871861789978147e-06, + "loss": 3.9889, + "step": 4436 + }, + { + "epoch": 0.8891783567134268, + "grad_norm": 30.148042983513207, + "learning_rate": 8.871123994377334e-06, + "loss": 3.3985, + "step": 4437 + }, + { + "epoch": 0.88937875751503, + "grad_norm": 28.846031913069325, + "learning_rate": 8.870385988297793e-06, + "loss": 4.6594, + "step": 4438 + }, + { + "epoch": 0.8895791583166333, + "grad_norm": 29.91517371219746, + "learning_rate": 8.869647771779654e-06, + "loss": 4.9104, + "step": 4439 + }, + { + "epoch": 0.8897795591182365, + "grad_norm": 34.04856281111053, + "learning_rate": 8.868909344863054e-06, + "loss": 3.7197, + "step": 4440 + }, + { + "epoch": 0.8899799599198397, + "grad_norm": 22.799165402732726, + "learning_rate": 8.868170707588143e-06, + "loss": 4.5546, + "step": 4441 + }, + { + "epoch": 0.8901803607214429, + "grad_norm": 26.741462826451222, + "learning_rate": 8.86743185999508e-06, + "loss": 4.2022, + "step": 4442 + }, + { + "epoch": 0.8903807615230461, + "grad_norm": 37.60532557124237, + "learning_rate": 8.866692802124035e-06, + "loss": 4.5734, + "step": 4443 + 
}, + { + "epoch": 0.8905811623246493, + "grad_norm": 29.92493783492867, + "learning_rate": 8.865953534015199e-06, + "loss": 4.3434, + "step": 4444 + }, + { + "epoch": 0.8907815631262525, + "grad_norm": 25.423559304901282, + "learning_rate": 8.86521405570876e-06, + "loss": 4.1599, + "step": 4445 + }, + { + "epoch": 0.8909819639278557, + "grad_norm": 26.21445844979814, + "learning_rate": 8.864474367244927e-06, + "loss": 4.327, + "step": 4446 + }, + { + "epoch": 0.8911823647294589, + "grad_norm": 28.533128279574754, + "learning_rate": 8.86373446866392e-06, + "loss": 4.3296, + "step": 4447 + }, + { + "epoch": 0.8913827655310621, + "grad_norm": 25.273088067349626, + "learning_rate": 8.862994360005965e-06, + "loss": 3.9352, + "step": 4448 + }, + { + "epoch": 0.8915831663326653, + "grad_norm": 30.901951350344326, + "learning_rate": 8.862254041311305e-06, + "loss": 3.8443, + "step": 4449 + }, + { + "epoch": 0.8917835671342685, + "grad_norm": 22.078952646495793, + "learning_rate": 8.861513512620191e-06, + "loss": 3.9457, + "step": 4450 + }, + { + "epoch": 0.8919839679358718, + "grad_norm": 20.481649260224177, + "learning_rate": 8.860772773972888e-06, + "loss": 3.9765, + "step": 4451 + }, + { + "epoch": 0.892184368737475, + "grad_norm": 31.364648772364127, + "learning_rate": 8.86003182540967e-06, + "loss": 4.4567, + "step": 4452 + }, + { + "epoch": 0.8923847695390782, + "grad_norm": 18.816618305301933, + "learning_rate": 8.859290666970823e-06, + "loss": 4.0627, + "step": 4453 + }, + { + "epoch": 0.8925851703406814, + "grad_norm": 21.0251260995934, + "learning_rate": 8.858549298696646e-06, + "loss": 4.3599, + "step": 4454 + }, + { + "epoch": 0.8927855711422845, + "grad_norm": 26.785458451778737, + "learning_rate": 8.857807720627448e-06, + "loss": 4.8223, + "step": 4455 + }, + { + "epoch": 0.8929859719438877, + "grad_norm": 22.383469392524994, + "learning_rate": 8.85706593280355e-06, + "loss": 4.0294, + "step": 4456 + }, + { + "epoch": 0.893186372745491, + "grad_norm": 64.53333955212443, + "learning_rate": 8.856323935265284e-06, + "loss": 3.4292, + "step": 4457 + }, + { + "epoch": 0.8933867735470942, + "grad_norm": 41.1571287935524, + "learning_rate": 8.855581728052992e-06, + "loss": 4.6032, + "step": 4458 + }, + { + "epoch": 0.8935871743486974, + "grad_norm": 34.061407661403294, + "learning_rate": 8.85483931120703e-06, + "loss": 3.9762, + "step": 4459 + }, + { + "epoch": 0.8937875751503006, + "grad_norm": 31.57121723272231, + "learning_rate": 8.854096684767765e-06, + "loss": 4.0422, + "step": 4460 + }, + { + "epoch": 0.8939879759519038, + "grad_norm": 25.57310662336843, + "learning_rate": 8.853353848775574e-06, + "loss": 3.6994, + "step": 4461 + }, + { + "epoch": 0.894188376753507, + "grad_norm": 30.747421017935253, + "learning_rate": 8.852610803270847e-06, + "loss": 4.3577, + "step": 4462 + }, + { + "epoch": 0.8943887775551103, + "grad_norm": 33.27855682441914, + "learning_rate": 8.851867548293983e-06, + "loss": 3.7898, + "step": 4463 + }, + { + "epoch": 0.8945891783567135, + "grad_norm": 29.525350608161045, + "learning_rate": 8.851124083885394e-06, + "loss": 4.6413, + "step": 4464 + }, + { + "epoch": 0.8947895791583166, + "grad_norm": 30.319654630398762, + "learning_rate": 8.850380410085504e-06, + "loss": 5.0006, + "step": 4465 + }, + { + "epoch": 0.8949899799599198, + "grad_norm": 32.30439711932485, + "learning_rate": 8.849636526934749e-06, + "loss": 4.1553, + "step": 4466 + }, + { + "epoch": 0.895190380761523, + "grad_norm": 49.109205217889205, + "learning_rate": 8.84889243447357e-06, + "loss": 
4.4977, + "step": 4467 + }, + { + "epoch": 0.8953907815631262, + "grad_norm": 18.46887305186138, + "learning_rate": 8.848148132742432e-06, + "loss": 3.5651, + "step": 4468 + }, + { + "epoch": 0.8955911823647295, + "grad_norm": 25.775706479158316, + "learning_rate": 8.847403621781797e-06, + "loss": 4.0421, + "step": 4469 + }, + { + "epoch": 0.8957915831663327, + "grad_norm": 40.23769781682361, + "learning_rate": 8.846658901632149e-06, + "loss": 3.969, + "step": 4470 + }, + { + "epoch": 0.8959919839679359, + "grad_norm": 18.48623367134788, + "learning_rate": 8.845913972333977e-06, + "loss": 3.4027, + "step": 4471 + }, + { + "epoch": 0.8961923847695391, + "grad_norm": 22.788389093497596, + "learning_rate": 8.845168833927786e-06, + "loss": 4.297, + "step": 4472 + }, + { + "epoch": 0.8963927855711423, + "grad_norm": 59.34633913498227, + "learning_rate": 8.844423486454089e-06, + "loss": 4.1297, + "step": 4473 + }, + { + "epoch": 0.8965931863727455, + "grad_norm": 23.827413688062403, + "learning_rate": 8.843677929953414e-06, + "loss": 4.655, + "step": 4474 + }, + { + "epoch": 0.8967935871743486, + "grad_norm": 32.28080569195322, + "learning_rate": 8.842932164466294e-06, + "loss": 4.9112, + "step": 4475 + }, + { + "epoch": 0.8969939879759519, + "grad_norm": 35.9696367170787, + "learning_rate": 8.84218619003328e-06, + "loss": 3.8334, + "step": 4476 + }, + { + "epoch": 0.8971943887775551, + "grad_norm": 23.765221638407965, + "learning_rate": 8.84144000669493e-06, + "loss": 4.1578, + "step": 4477 + }, + { + "epoch": 0.8973947895791583, + "grad_norm": 20.36220123651963, + "learning_rate": 8.84069361449182e-06, + "loss": 3.9168, + "step": 4478 + }, + { + "epoch": 0.8975951903807615, + "grad_norm": 23.446692793960178, + "learning_rate": 8.839947013464526e-06, + "loss": 4.5197, + "step": 4479 + }, + { + "epoch": 0.8977955911823647, + "grad_norm": 22.17437208187711, + "learning_rate": 8.839200203653644e-06, + "loss": 4.0718, + "step": 4480 + }, + { + "epoch": 0.897995991983968, + "grad_norm": 20.282893978264173, + "learning_rate": 8.838453185099782e-06, + "loss": 3.7852, + "step": 4481 + }, + { + "epoch": 0.8981963927855712, + "grad_norm": 25.994573607632997, + "learning_rate": 8.837705957843555e-06, + "loss": 3.7789, + "step": 4482 + }, + { + "epoch": 0.8983967935871744, + "grad_norm": 23.38607512468957, + "learning_rate": 8.836958521925587e-06, + "loss": 4.1757, + "step": 4483 + }, + { + "epoch": 0.8985971943887776, + "grad_norm": 23.274885304042055, + "learning_rate": 8.836210877386521e-06, + "loss": 4.3077, + "step": 4484 + }, + { + "epoch": 0.8987975951903807, + "grad_norm": 18.230655671698678, + "learning_rate": 8.835463024267009e-06, + "loss": 3.8173, + "step": 4485 + }, + { + "epoch": 0.8989979959919839, + "grad_norm": 20.5050590015049, + "learning_rate": 8.834714962607708e-06, + "loss": 3.9327, + "step": 4486 + }, + { + "epoch": 0.8991983967935872, + "grad_norm": 29.340589197832163, + "learning_rate": 8.833966692449295e-06, + "loss": 4.8258, + "step": 4487 + }, + { + "epoch": 0.8993987975951904, + "grad_norm": 25.573122346089992, + "learning_rate": 8.833218213832454e-06, + "loss": 4.1863, + "step": 4488 + }, + { + "epoch": 0.8995991983967936, + "grad_norm": 22.924753317109047, + "learning_rate": 8.83246952679788e-06, + "loss": 4.4576, + "step": 4489 + }, + { + "epoch": 0.8997995991983968, + "grad_norm": 23.627552624688494, + "learning_rate": 8.83172063138628e-06, + "loss": 3.7553, + "step": 4490 + }, + { + "epoch": 0.9, + "grad_norm": 15.474526796532444, + "learning_rate": 8.830971527638374e-06, 
+ "loss": 3.7182, + "step": 4491 + }, + { + "epoch": 0.9002004008016032, + "grad_norm": 18.573042822202115, + "learning_rate": 8.83022221559489e-06, + "loss": 3.7635, + "step": 4492 + }, + { + "epoch": 0.9004008016032065, + "grad_norm": 24.527531418804802, + "learning_rate": 8.829472695296572e-06, + "loss": 4.2319, + "step": 4493 + }, + { + "epoch": 0.9006012024048096, + "grad_norm": 21.07242704325505, + "learning_rate": 8.828722966784169e-06, + "loss": 4.1292, + "step": 4494 + }, + { + "epoch": 0.9008016032064128, + "grad_norm": 29.28146520235887, + "learning_rate": 8.827973030098447e-06, + "loss": 4.2846, + "step": 4495 + }, + { + "epoch": 0.901002004008016, + "grad_norm": 19.779513018578978, + "learning_rate": 8.827222885280182e-06, + "loss": 4.2663, + "step": 4496 + }, + { + "epoch": 0.9012024048096192, + "grad_norm": 22.355824202071826, + "learning_rate": 8.826472532370158e-06, + "loss": 4.2531, + "step": 4497 + }, + { + "epoch": 0.9014028056112224, + "grad_norm": 29.512871192681196, + "learning_rate": 8.825721971409173e-06, + "loss": 3.7349, + "step": 4498 + }, + { + "epoch": 0.9016032064128257, + "grad_norm": 23.444298605680913, + "learning_rate": 8.82497120243804e-06, + "loss": 3.9063, + "step": 4499 + }, + { + "epoch": 0.9018036072144289, + "grad_norm": 27.622798104047064, + "learning_rate": 8.824220225497574e-06, + "loss": 4.2031, + "step": 4500 + }, + { + "epoch": 0.9020040080160321, + "grad_norm": 47.301196771286556, + "learning_rate": 8.823469040628611e-06, + "loss": 4.167, + "step": 4501 + }, + { + "epoch": 0.9022044088176353, + "grad_norm": 22.542131293512575, + "learning_rate": 8.82271764787199e-06, + "loss": 3.7415, + "step": 4502 + }, + { + "epoch": 0.9024048096192385, + "grad_norm": 19.948193931388182, + "learning_rate": 8.82196604726857e-06, + "loss": 4.1353, + "step": 4503 + }, + { + "epoch": 0.9026052104208416, + "grad_norm": 31.9319208511459, + "learning_rate": 8.821214238859213e-06, + "loss": 5.1071, + "step": 4504 + }, + { + "epoch": 0.9028056112224448, + "grad_norm": 28.868722155983118, + "learning_rate": 8.820462222684798e-06, + "loss": 4.531, + "step": 4505 + }, + { + "epoch": 0.9030060120240481, + "grad_norm": 17.935327629558845, + "learning_rate": 8.819709998786212e-06, + "loss": 3.9745, + "step": 4506 + }, + { + "epoch": 0.9032064128256513, + "grad_norm": 24.24688750627672, + "learning_rate": 8.818957567204357e-06, + "loss": 4.4798, + "step": 4507 + }, + { + "epoch": 0.9034068136272545, + "grad_norm": 16.65672380733663, + "learning_rate": 8.818204927980138e-06, + "loss": 3.6532, + "step": 4508 + }, + { + "epoch": 0.9036072144288577, + "grad_norm": 17.999635586687322, + "learning_rate": 8.817452081154483e-06, + "loss": 4.1072, + "step": 4509 + }, + { + "epoch": 0.9038076152304609, + "grad_norm": 18.175781675654076, + "learning_rate": 8.816699026768324e-06, + "loss": 3.6249, + "step": 4510 + }, + { + "epoch": 0.9040080160320642, + "grad_norm": 19.419015431764, + "learning_rate": 8.815945764862602e-06, + "loss": 4.021, + "step": 4511 + }, + { + "epoch": 0.9042084168336674, + "grad_norm": 17.676058710785117, + "learning_rate": 8.815192295478278e-06, + "loss": 3.8377, + "step": 4512 + }, + { + "epoch": 0.9044088176352706, + "grad_norm": 22.245871895838203, + "learning_rate": 8.814438618656318e-06, + "loss": 4.2973, + "step": 4513 + }, + { + "epoch": 0.9046092184368737, + "grad_norm": 27.845468836599593, + "learning_rate": 8.813684734437697e-06, + "loss": 4.2926, + "step": 4514 + }, + { + "epoch": 0.9048096192384769, + "grad_norm": 26.984200580975035, + 
"learning_rate": 8.81293064286341e-06, + "loss": 4.0688, + "step": 4515 + }, + { + "epoch": 0.9050100200400801, + "grad_norm": 31.476545644079348, + "learning_rate": 8.812176343974455e-06, + "loss": 3.3124, + "step": 4516 + }, + { + "epoch": 0.9052104208416833, + "grad_norm": 21.747222051738024, + "learning_rate": 8.811421837811844e-06, + "loss": 4.4719, + "step": 4517 + }, + { + "epoch": 0.9054108216432866, + "grad_norm": 38.278804267271944, + "learning_rate": 8.810667124416602e-06, + "loss": 3.2792, + "step": 4518 + }, + { + "epoch": 0.9056112224448898, + "grad_norm": 27.95986839150239, + "learning_rate": 8.809912203829764e-06, + "loss": 4.5051, + "step": 4519 + }, + { + "epoch": 0.905811623246493, + "grad_norm": 55.38747745346928, + "learning_rate": 8.809157076092374e-06, + "loss": 4.3409, + "step": 4520 + }, + { + "epoch": 0.9060120240480962, + "grad_norm": 41.91804427140679, + "learning_rate": 8.808401741245491e-06, + "loss": 4.3187, + "step": 4521 + }, + { + "epoch": 0.9062124248496994, + "grad_norm": 23.640923629158554, + "learning_rate": 8.807646199330186e-06, + "loss": 4.0366, + "step": 4522 + }, + { + "epoch": 0.9064128256513027, + "grad_norm": 31.77588405061952, + "learning_rate": 8.806890450387535e-06, + "loss": 4.2628, + "step": 4523 + }, + { + "epoch": 0.9066132264529058, + "grad_norm": 39.415320839171684, + "learning_rate": 8.806134494458632e-06, + "loss": 4.7585, + "step": 4524 + }, + { + "epoch": 0.906813627254509, + "grad_norm": 21.048248603187524, + "learning_rate": 8.805378331584575e-06, + "loss": 3.8195, + "step": 4525 + }, + { + "epoch": 0.9070140280561122, + "grad_norm": 26.775659480859566, + "learning_rate": 8.804621961806484e-06, + "loss": 4.4648, + "step": 4526 + }, + { + "epoch": 0.9072144288577154, + "grad_norm": 38.022499543225216, + "learning_rate": 8.80386538516548e-06, + "loss": 4.5846, + "step": 4527 + }, + { + "epoch": 0.9074148296593186, + "grad_norm": 35.88747790953487, + "learning_rate": 8.8031086017027e-06, + "loss": 4.4814, + "step": 4528 + }, + { + "epoch": 0.9076152304609219, + "grad_norm": 34.07672277803539, + "learning_rate": 8.80235161145929e-06, + "loss": 4.8068, + "step": 4529 + }, + { + "epoch": 0.9078156312625251, + "grad_norm": 25.67381097229571, + "learning_rate": 8.80159441447641e-06, + "loss": 4.324, + "step": 4530 + }, + { + "epoch": 0.9080160320641283, + "grad_norm": 27.448647538435075, + "learning_rate": 8.800837010795232e-06, + "loss": 4.3618, + "step": 4531 + }, + { + "epoch": 0.9082164328657315, + "grad_norm": 35.44583891196461, + "learning_rate": 8.800079400456933e-06, + "loss": 3.9953, + "step": 4532 + }, + { + "epoch": 0.9084168336673347, + "grad_norm": 22.712000374315583, + "learning_rate": 8.79932158350271e-06, + "loss": 4.3058, + "step": 4533 + }, + { + "epoch": 0.9086172344689378, + "grad_norm": 31.36316706984702, + "learning_rate": 8.798563559973762e-06, + "loss": 4.3081, + "step": 4534 + }, + { + "epoch": 0.908817635270541, + "grad_norm": 29.327817703357432, + "learning_rate": 8.79780532991131e-06, + "loss": 3.9089, + "step": 4535 + }, + { + "epoch": 0.9090180360721443, + "grad_norm": 31.11782262589486, + "learning_rate": 8.79704689335657e-06, + "loss": 4.2802, + "step": 4536 + }, + { + "epoch": 0.9092184368737475, + "grad_norm": 24.433032940565912, + "learning_rate": 8.796288250350789e-06, + "loss": 4.3076, + "step": 4537 + }, + { + "epoch": 0.9094188376753507, + "grad_norm": 30.075880231090036, + "learning_rate": 8.795529400935212e-06, + "loss": 4.4233, + "step": 4538 + }, + { + "epoch": 0.9096192384769539, + "grad_norm": 
56.18813487048815, + "learning_rate": 8.794770345151098e-06, + "loss": 3.7165, + "step": 4539 + }, + { + "epoch": 0.9098196392785571, + "grad_norm": 37.23631937120119, + "learning_rate": 8.79401108303972e-06, + "loss": 3.855, + "step": 4540 + }, + { + "epoch": 0.9100200400801604, + "grad_norm": 25.086323357391315, + "learning_rate": 8.793251614642357e-06, + "loss": 4.14, + "step": 4541 + }, + { + "epoch": 0.9102204408817636, + "grad_norm": 20.51500810014396, + "learning_rate": 8.792491940000307e-06, + "loss": 4.2674, + "step": 4542 + }, + { + "epoch": 0.9104208416833668, + "grad_norm": 22.442215494676336, + "learning_rate": 8.79173205915487e-06, + "loss": 4.1877, + "step": 4543 + }, + { + "epoch": 0.9106212424849699, + "grad_norm": 115.97383335874451, + "learning_rate": 8.790971972147364e-06, + "loss": 3.8814, + "step": 4544 + }, + { + "epoch": 0.9108216432865731, + "grad_norm": 27.298511694224825, + "learning_rate": 8.790211679019117e-06, + "loss": 3.8243, + "step": 4545 + }, + { + "epoch": 0.9110220440881763, + "grad_norm": 32.13585674823522, + "learning_rate": 8.789451179811465e-06, + "loss": 4.6097, + "step": 4546 + }, + { + "epoch": 0.9112224448897795, + "grad_norm": 27.380309402881487, + "learning_rate": 8.788690474565759e-06, + "loss": 3.7675, + "step": 4547 + }, + { + "epoch": 0.9114228456913828, + "grad_norm": 19.66994429010176, + "learning_rate": 8.787929563323358e-06, + "loss": 3.8018, + "step": 4548 + }, + { + "epoch": 0.911623246492986, + "grad_norm": 18.59230200394594, + "learning_rate": 8.787168446125638e-06, + "loss": 3.8703, + "step": 4549 + }, + { + "epoch": 0.9118236472945892, + "grad_norm": 21.045826339532713, + "learning_rate": 8.786407123013977e-06, + "loss": 3.9574, + "step": 4550 + }, + { + "epoch": 0.9120240480961924, + "grad_norm": 18.69010674592494, + "learning_rate": 8.785645594029772e-06, + "loss": 3.7506, + "step": 4551 + }, + { + "epoch": 0.9122244488977956, + "grad_norm": 25.814688044748102, + "learning_rate": 8.784883859214428e-06, + "loss": 3.9922, + "step": 4552 + }, + { + "epoch": 0.9124248496993987, + "grad_norm": 34.94274502543855, + "learning_rate": 8.784121918609361e-06, + "loss": 4.2352, + "step": 4553 + }, + { + "epoch": 0.912625250501002, + "grad_norm": 30.04038567525534, + "learning_rate": 8.783359772256e-06, + "loss": 4.3684, + "step": 4554 + }, + { + "epoch": 0.9128256513026052, + "grad_norm": 17.179836063541927, + "learning_rate": 8.782597420195784e-06, + "loss": 4.034, + "step": 4555 + }, + { + "epoch": 0.9130260521042084, + "grad_norm": 13.925834478198164, + "learning_rate": 8.781834862470163e-06, + "loss": 3.5617, + "step": 4556 + }, + { + "epoch": 0.9132264529058116, + "grad_norm": 20.86422584594095, + "learning_rate": 8.781072099120595e-06, + "loss": 4.5384, + "step": 4557 + }, + { + "epoch": 0.9134268537074148, + "grad_norm": 34.08626183722031, + "learning_rate": 8.780309130188558e-06, + "loss": 4.6905, + "step": 4558 + }, + { + "epoch": 0.913627254509018, + "grad_norm": 39.57932457911254, + "learning_rate": 8.779545955715534e-06, + "loss": 4.3377, + "step": 4559 + }, + { + "epoch": 0.9138276553106213, + "grad_norm": 30.90012757801875, + "learning_rate": 8.778782575743016e-06, + "loss": 4.4082, + "step": 4560 + }, + { + "epoch": 0.9140280561122245, + "grad_norm": 28.89892992044453, + "learning_rate": 8.778018990312511e-06, + "loss": 4.6617, + "step": 4561 + }, + { + "epoch": 0.9142284569138277, + "grad_norm": 19.693350528416378, + "learning_rate": 8.777255199465537e-06, + "loss": 3.5691, + "step": 4562 + }, + { + "epoch": 
0.9144288577154308, + "grad_norm": 21.949366179915796, + "learning_rate": 8.776491203243623e-06, + "loss": 4.2374, + "step": 4563 + }, + { + "epoch": 0.914629258517034, + "grad_norm": 24.824705785656015, + "learning_rate": 8.775727001688306e-06, + "loss": 4.7558, + "step": 4564 + }, + { + "epoch": 0.9148296593186372, + "grad_norm": 24.421858658526116, + "learning_rate": 8.77496259484114e-06, + "loss": 4.4528, + "step": 4565 + }, + { + "epoch": 0.9150300601202405, + "grad_norm": 23.307387417617157, + "learning_rate": 8.774197982743685e-06, + "loss": 3.8306, + "step": 4566 + }, + { + "epoch": 0.9152304609218437, + "grad_norm": 31.611523141851677, + "learning_rate": 8.773433165437515e-06, + "loss": 4.6965, + "step": 4567 + }, + { + "epoch": 0.9154308617234469, + "grad_norm": 52.148174738606045, + "learning_rate": 8.772668142964212e-06, + "loss": 4.655, + "step": 4568 + }, + { + "epoch": 0.9156312625250501, + "grad_norm": 35.78395911211143, + "learning_rate": 8.771902915365374e-06, + "loss": 4.2177, + "step": 4569 + }, + { + "epoch": 0.9158316633266533, + "grad_norm": 27.193234840223187, + "learning_rate": 8.771137482682607e-06, + "loss": 4.4648, + "step": 4570 + }, + { + "epoch": 0.9160320641282566, + "grad_norm": 17.16251233193143, + "learning_rate": 8.770371844957527e-06, + "loss": 3.9393, + "step": 4571 + }, + { + "epoch": 0.9162324649298598, + "grad_norm": 25.144918452405484, + "learning_rate": 8.769606002231767e-06, + "loss": 4.1155, + "step": 4572 + }, + { + "epoch": 0.9164328657314629, + "grad_norm": 26.623476251837996, + "learning_rate": 8.768839954546961e-06, + "loss": 4.3807, + "step": 4573 + }, + { + "epoch": 0.9166332665330661, + "grad_norm": 16.129672170777773, + "learning_rate": 8.768073701944766e-06, + "loss": 3.6212, + "step": 4574 + }, + { + "epoch": 0.9168336673346693, + "grad_norm": 20.035001015625873, + "learning_rate": 8.76730724446684e-06, + "loss": 4.5823, + "step": 4575 + }, + { + "epoch": 0.9170340681362725, + "grad_norm": 28.42805677582359, + "learning_rate": 8.76654058215486e-06, + "loss": 4.8415, + "step": 4576 + }, + { + "epoch": 0.9172344689378757, + "grad_norm": 16.097881476765586, + "learning_rate": 8.765773715050506e-06, + "loss": 3.967, + "step": 4577 + }, + { + "epoch": 0.917434869739479, + "grad_norm": 33.1391739073268, + "learning_rate": 8.765006643195478e-06, + "loss": 3.7949, + "step": 4578 + }, + { + "epoch": 0.9176352705410822, + "grad_norm": 27.903368988895632, + "learning_rate": 8.764239366631479e-06, + "loss": 4.2916, + "step": 4579 + }, + { + "epoch": 0.9178356713426854, + "grad_norm": 21.92032806759857, + "learning_rate": 8.76347188540023e-06, + "loss": 4.2266, + "step": 4580 + }, + { + "epoch": 0.9180360721442886, + "grad_norm": 23.625163308313482, + "learning_rate": 8.762704199543462e-06, + "loss": 3.7315, + "step": 4581 + }, + { + "epoch": 0.9182364729458918, + "grad_norm": 29.662021513540058, + "learning_rate": 8.76193630910291e-06, + "loss": 4.5706, + "step": 4582 + }, + { + "epoch": 0.9184368737474949, + "grad_norm": 27.623083595694947, + "learning_rate": 8.761168214120328e-06, + "loss": 4.5341, + "step": 4583 + }, + { + "epoch": 0.9186372745490982, + "grad_norm": 24.271069015807697, + "learning_rate": 8.760399914637478e-06, + "loss": 3.9573, + "step": 4584 + }, + { + "epoch": 0.9188376753507014, + "grad_norm": 22.910989103780015, + "learning_rate": 8.759631410696135e-06, + "loss": 4.2671, + "step": 4585 + }, + { + "epoch": 0.9190380761523046, + "grad_norm": 19.828810731764737, + "learning_rate": 8.75886270233808e-06, + "loss": 3.9683, + 
"step": 4586 + }, + { + "epoch": 0.9192384769539078, + "grad_norm": 18.74982696360698, + "learning_rate": 8.758093789605114e-06, + "loss": 3.865, + "step": 4587 + }, + { + "epoch": 0.919438877755511, + "grad_norm": 24.183634241280497, + "learning_rate": 8.75732467253904e-06, + "loss": 3.8274, + "step": 4588 + }, + { + "epoch": 0.9196392785571142, + "grad_norm": 27.376366452090856, + "learning_rate": 8.756555351181676e-06, + "loss": 4.0086, + "step": 4589 + }, + { + "epoch": 0.9198396793587175, + "grad_norm": 43.666072792650255, + "learning_rate": 8.755785825574854e-06, + "loss": 4.1781, + "step": 4590 + }, + { + "epoch": 0.9200400801603207, + "grad_norm": 19.659865273161515, + "learning_rate": 8.755016095760412e-06, + "loss": 3.8749, + "step": 4591 + }, + { + "epoch": 0.9202404809619239, + "grad_norm": 29.310257888325044, + "learning_rate": 8.754246161780202e-06, + "loss": 4.3303, + "step": 4592 + }, + { + "epoch": 0.920440881763527, + "grad_norm": 21.25535451532094, + "learning_rate": 8.753476023676087e-06, + "loss": 4.0501, + "step": 4593 + }, + { + "epoch": 0.9206412825651302, + "grad_norm": 27.912212104302203, + "learning_rate": 8.75270568148994e-06, + "loss": 4.3611, + "step": 4594 + }, + { + "epoch": 0.9208416833667334, + "grad_norm": 27.702326848684812, + "learning_rate": 8.751935135263644e-06, + "loss": 4.126, + "step": 4595 + }, + { + "epoch": 0.9210420841683367, + "grad_norm": 27.14227025925014, + "learning_rate": 8.751164385039099e-06, + "loss": 3.8622, + "step": 4596 + }, + { + "epoch": 0.9212424849699399, + "grad_norm": 32.34997182029611, + "learning_rate": 8.750393430858207e-06, + "loss": 4.3959, + "step": 4597 + }, + { + "epoch": 0.9214428857715431, + "grad_norm": 23.864381481791835, + "learning_rate": 8.749622272762888e-06, + "loss": 4.6311, + "step": 4598 + }, + { + "epoch": 0.9216432865731463, + "grad_norm": 29.420907633943713, + "learning_rate": 8.748850910795072e-06, + "loss": 4.177, + "step": 4599 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 28.442352823276554, + "learning_rate": 8.748079344996698e-06, + "loss": 4.3755, + "step": 4600 + }, + { + "epoch": 0.9220440881763527, + "grad_norm": 52.499142257564756, + "learning_rate": 8.747307575409718e-06, + "loss": 4.0869, + "step": 4601 + }, + { + "epoch": 0.9222444889779559, + "grad_norm": 17.41042698798401, + "learning_rate": 8.746535602076093e-06, + "loss": 3.549, + "step": 4602 + }, + { + "epoch": 0.9224448897795591, + "grad_norm": 26.63717386275375, + "learning_rate": 8.745763425037796e-06, + "loss": 4.3432, + "step": 4603 + }, + { + "epoch": 0.9226452905811623, + "grad_norm": 23.481298034269972, + "learning_rate": 8.744991044336814e-06, + "loss": 4.4362, + "step": 4604 + }, + { + "epoch": 0.9228456913827655, + "grad_norm": 25.61164703064238, + "learning_rate": 8.74421846001514e-06, + "loss": 4.8058, + "step": 4605 + }, + { + "epoch": 0.9230460921843687, + "grad_norm": 28.576869113764563, + "learning_rate": 8.74344567211478e-06, + "loss": 4.5914, + "step": 4606 + }, + { + "epoch": 0.923246492985972, + "grad_norm": 27.198896327701462, + "learning_rate": 8.742672680677755e-06, + "loss": 4.6678, + "step": 4607 + }, + { + "epoch": 0.9234468937875752, + "grad_norm": 17.202608021960643, + "learning_rate": 8.741899485746091e-06, + "loss": 3.899, + "step": 4608 + }, + { + "epoch": 0.9236472945891784, + "grad_norm": 21.28197408162291, + "learning_rate": 8.741126087361829e-06, + "loss": 3.4312, + "step": 4609 + }, + { + "epoch": 0.9238476953907816, + "grad_norm": 48.0617639815559, + "learning_rate": 
8.740352485567018e-06, + "loss": 4.6076, + "step": 4610 + }, + { + "epoch": 0.9240480961923848, + "grad_norm": 25.770080688137995, + "learning_rate": 8.739578680403721e-06, + "loss": 4.3568, + "step": 4611 + }, + { + "epoch": 0.9242484969939879, + "grad_norm": 23.125154683053484, + "learning_rate": 8.738804671914013e-06, + "loss": 4.3039, + "step": 4612 + }, + { + "epoch": 0.9244488977955911, + "grad_norm": 35.99555711769745, + "learning_rate": 8.738030460139975e-06, + "loss": 4.2023, + "step": 4613 + }, + { + "epoch": 0.9246492985971944, + "grad_norm": 27.069106156075033, + "learning_rate": 8.737256045123701e-06, + "loss": 3.9745, + "step": 4614 + }, + { + "epoch": 0.9248496993987976, + "grad_norm": 25.123230769346335, + "learning_rate": 8.736481426907302e-06, + "loss": 4.7139, + "step": 4615 + }, + { + "epoch": 0.9250501002004008, + "grad_norm": 31.310853698039807, + "learning_rate": 8.735706605532891e-06, + "loss": 4.3946, + "step": 4616 + }, + { + "epoch": 0.925250501002004, + "grad_norm": 20.413428290768014, + "learning_rate": 8.734931581042599e-06, + "loss": 4.378, + "step": 4617 + }, + { + "epoch": 0.9254509018036072, + "grad_norm": 17.443417336989995, + "learning_rate": 8.734156353478561e-06, + "loss": 3.297, + "step": 4618 + }, + { + "epoch": 0.9256513026052104, + "grad_norm": 26.66183333958841, + "learning_rate": 8.733380922882932e-06, + "loss": 4.4112, + "step": 4619 + }, + { + "epoch": 0.9258517034068137, + "grad_norm": 18.808175961602096, + "learning_rate": 8.73260528929787e-06, + "loss": 4.2191, + "step": 4620 + }, + { + "epoch": 0.9260521042084169, + "grad_norm": 29.53352444044928, + "learning_rate": 8.731829452765547e-06, + "loss": 4.6464, + "step": 4621 + }, + { + "epoch": 0.92625250501002, + "grad_norm": 31.360876583994923, + "learning_rate": 8.731053413328153e-06, + "loss": 5.3927, + "step": 4622 + }, + { + "epoch": 0.9264529058116232, + "grad_norm": 20.76283635383262, + "learning_rate": 8.730277171027872e-06, + "loss": 4.0948, + "step": 4623 + }, + { + "epoch": 0.9266533066132264, + "grad_norm": 28.822758774409905, + "learning_rate": 8.729500725906919e-06, + "loss": 4.1515, + "step": 4624 + }, + { + "epoch": 0.9268537074148296, + "grad_norm": 18.952886557933226, + "learning_rate": 8.728724078007505e-06, + "loss": 3.5179, + "step": 4625 + }, + { + "epoch": 0.9270541082164329, + "grad_norm": 35.44980643188909, + "learning_rate": 8.727947227371856e-06, + "loss": 3.8053, + "step": 4626 + }, + { + "epoch": 0.9272545090180361, + "grad_norm": 21.08326770148422, + "learning_rate": 8.727170174042216e-06, + "loss": 3.9026, + "step": 4627 + }, + { + "epoch": 0.9274549098196393, + "grad_norm": 29.271695085039546, + "learning_rate": 8.726392918060832e-06, + "loss": 4.6569, + "step": 4628 + }, + { + "epoch": 0.9276553106212425, + "grad_norm": 29.102295792163908, + "learning_rate": 8.725615459469964e-06, + "loss": 3.6694, + "step": 4629 + }, + { + "epoch": 0.9278557114228457, + "grad_norm": 20.45855625704462, + "learning_rate": 8.724837798311883e-06, + "loss": 3.8674, + "step": 4630 + }, + { + "epoch": 0.928056112224449, + "grad_norm": 24.675588745256192, + "learning_rate": 8.724059934628874e-06, + "loss": 4.7215, + "step": 4631 + }, + { + "epoch": 0.928256513026052, + "grad_norm": 19.696540281072796, + "learning_rate": 8.723281868463228e-06, + "loss": 4.166, + "step": 4632 + }, + { + "epoch": 0.9284569138276553, + "grad_norm": 18.194962815384162, + "learning_rate": 8.722503599857252e-06, + "loss": 4.1181, + "step": 4633 + }, + { + "epoch": 0.9286573146292585, + "grad_norm": 
28.90556281629678, + "learning_rate": 8.721725128853259e-06, + "loss": 3.8564, + "step": 4634 + }, + { + "epoch": 0.9288577154308617, + "grad_norm": 20.91234848578149, + "learning_rate": 8.720946455493577e-06, + "loss": 3.5768, + "step": 4635 + }, + { + "epoch": 0.9290581162324649, + "grad_norm": 24.005297676499534, + "learning_rate": 8.720167579820544e-06, + "loss": 4.2708, + "step": 4636 + }, + { + "epoch": 0.9292585170340681, + "grad_norm": 56.34971025698399, + "learning_rate": 8.719388501876508e-06, + "loss": 3.5307, + "step": 4637 + }, + { + "epoch": 0.9294589178356714, + "grad_norm": 29.9239720190267, + "learning_rate": 8.71860922170383e-06, + "loss": 4.2411, + "step": 4638 + }, + { + "epoch": 0.9296593186372746, + "grad_norm": 24.517086960722278, + "learning_rate": 8.717829739344879e-06, + "loss": 4.3155, + "step": 4639 + }, + { + "epoch": 0.9298597194388778, + "grad_norm": 21.82506944542746, + "learning_rate": 8.717050054842036e-06, + "loss": 4.3894, + "step": 4640 + }, + { + "epoch": 0.930060120240481, + "grad_norm": 37.489795870690976, + "learning_rate": 8.716270168237695e-06, + "loss": 4.406, + "step": 4641 + }, + { + "epoch": 0.9302605210420841, + "grad_norm": 29.754326578744784, + "learning_rate": 8.71549007957426e-06, + "loss": 4.0609, + "step": 4642 + }, + { + "epoch": 0.9304609218436873, + "grad_norm": 27.23718480001349, + "learning_rate": 8.714709788894145e-06, + "loss": 4.4683, + "step": 4643 + }, + { + "epoch": 0.9306613226452906, + "grad_norm": 27.942038098561888, + "learning_rate": 8.713929296239774e-06, + "loss": 4.5682, + "step": 4644 + }, + { + "epoch": 0.9308617234468938, + "grad_norm": 46.98861163051296, + "learning_rate": 8.713148601653585e-06, + "loss": 5.1628, + "step": 4645 + }, + { + "epoch": 0.931062124248497, + "grad_norm": 21.536737123934248, + "learning_rate": 8.712367705178027e-06, + "loss": 4.0387, + "step": 4646 + }, + { + "epoch": 0.9312625250501002, + "grad_norm": 27.332449932796727, + "learning_rate": 8.711586606855555e-06, + "loss": 4.3731, + "step": 4647 + }, + { + "epoch": 0.9314629258517034, + "grad_norm": 29.182117412421793, + "learning_rate": 8.710805306728642e-06, + "loss": 4.3725, + "step": 4648 + }, + { + "epoch": 0.9316633266533066, + "grad_norm": 26.629285898410398, + "learning_rate": 8.710023804839765e-06, + "loss": 4.5289, + "step": 4649 + }, + { + "epoch": 0.9318637274549099, + "grad_norm": 36.417546537626826, + "learning_rate": 8.70924210123142e-06, + "loss": 4.6768, + "step": 4650 + }, + { + "epoch": 0.9320641282565131, + "grad_norm": 32.57840126389177, + "learning_rate": 8.708460195946104e-06, + "loss": 4.3823, + "step": 4651 + }, + { + "epoch": 0.9322645290581162, + "grad_norm": 20.982304117541798, + "learning_rate": 8.707678089026335e-06, + "loss": 4.0639, + "step": 4652 + }, + { + "epoch": 0.9324649298597194, + "grad_norm": 27.809367329548888, + "learning_rate": 8.706895780514633e-06, + "loss": 4.4648, + "step": 4653 + }, + { + "epoch": 0.9326653306613226, + "grad_norm": 27.901439151359366, + "learning_rate": 8.706113270453537e-06, + "loss": 4.1909, + "step": 4654 + }, + { + "epoch": 0.9328657314629258, + "grad_norm": 20.533668466134664, + "learning_rate": 8.705330558885591e-06, + "loss": 3.9322, + "step": 4655 + }, + { + "epoch": 0.9330661322645291, + "grad_norm": 37.62281089165698, + "learning_rate": 8.704547645853354e-06, + "loss": 4.0837, + "step": 4656 + }, + { + "epoch": 0.9332665330661323, + "grad_norm": 26.650988263579716, + "learning_rate": 8.703764531399392e-06, + "loss": 4.1415, + "step": 4657 + }, + { + "epoch": 
0.9334669338677355, + "grad_norm": 27.644124556891203, + "learning_rate": 8.702981215566286e-06, + "loss": 4.0347, + "step": 4658 + }, + { + "epoch": 0.9336673346693387, + "grad_norm": 24.773537087946213, + "learning_rate": 8.702197698396625e-06, + "loss": 5.0987, + "step": 4659 + }, + { + "epoch": 0.9338677354709419, + "grad_norm": 43.829172062301595, + "learning_rate": 8.70141397993301e-06, + "loss": 3.609, + "step": 4660 + }, + { + "epoch": 0.934068136272545, + "grad_norm": 28.479065696981017, + "learning_rate": 8.700630060218054e-06, + "loss": 4.3772, + "step": 4661 + }, + { + "epoch": 0.9342685370741483, + "grad_norm": 57.03643838049675, + "learning_rate": 8.699845939294379e-06, + "loss": 4.4376, + "step": 4662 + }, + { + "epoch": 0.9344689378757515, + "grad_norm": 48.22810347867234, + "learning_rate": 8.699061617204615e-06, + "loss": 4.4323, + "step": 4663 + }, + { + "epoch": 0.9346693386773547, + "grad_norm": 25.62329573246484, + "learning_rate": 8.698277093991414e-06, + "loss": 3.5548, + "step": 4664 + }, + { + "epoch": 0.9348697394789579, + "grad_norm": 32.71828237646568, + "learning_rate": 8.697492369697429e-06, + "loss": 4.1436, + "step": 4665 + }, + { + "epoch": 0.9350701402805611, + "grad_norm": 36.40514078745818, + "learning_rate": 8.696707444365324e-06, + "loss": 5.5422, + "step": 4666 + }, + { + "epoch": 0.9352705410821643, + "grad_norm": 16.99609833207044, + "learning_rate": 8.69592231803778e-06, + "loss": 3.651, + "step": 4667 + }, + { + "epoch": 0.9354709418837676, + "grad_norm": 22.347741941148517, + "learning_rate": 8.695136990757482e-06, + "loss": 3.9668, + "step": 4668 + }, + { + "epoch": 0.9356713426853708, + "grad_norm": 28.432417671791473, + "learning_rate": 8.694351462567132e-06, + "loss": 4.3352, + "step": 4669 + }, + { + "epoch": 0.935871743486974, + "grad_norm": 24.837699650119728, + "learning_rate": 8.69356573350944e-06, + "loss": 3.7983, + "step": 4670 + }, + { + "epoch": 0.9360721442885771, + "grad_norm": 31.29910568193494, + "learning_rate": 8.692779803627127e-06, + "loss": 4.9328, + "step": 4671 + }, + { + "epoch": 0.9362725450901803, + "grad_norm": 20.33005378914884, + "learning_rate": 8.691993672962925e-06, + "loss": 3.9744, + "step": 4672 + }, + { + "epoch": 0.9364729458917835, + "grad_norm": 33.770714801429754, + "learning_rate": 8.691207341559578e-06, + "loss": 3.9823, + "step": 4673 + }, + { + "epoch": 0.9366733466933868, + "grad_norm": 48.916039691279884, + "learning_rate": 8.690420809459837e-06, + "loss": 4.7311, + "step": 4674 + }, + { + "epoch": 0.93687374749499, + "grad_norm": 30.164378546277746, + "learning_rate": 8.68963407670647e-06, + "loss": 4.7487, + "step": 4675 + }, + { + "epoch": 0.9370741482965932, + "grad_norm": 25.136678265224422, + "learning_rate": 8.688847143342251e-06, + "loss": 3.9548, + "step": 4676 + }, + { + "epoch": 0.9372745490981964, + "grad_norm": 54.87646970538796, + "learning_rate": 8.688060009409968e-06, + "loss": 4.1175, + "step": 4677 + }, + { + "epoch": 0.9374749498997996, + "grad_norm": 20.98270118802, + "learning_rate": 8.687272674952421e-06, + "loss": 3.6671, + "step": 4678 + }, + { + "epoch": 0.9376753507014028, + "grad_norm": 20.26257449644013, + "learning_rate": 8.686485140012413e-06, + "loss": 4.1331, + "step": 4679 + }, + { + "epoch": 0.9378757515030061, + "grad_norm": 21.652634985199875, + "learning_rate": 8.685697404632766e-06, + "loss": 3.7609, + "step": 4680 + }, + { + "epoch": 0.9380761523046092, + "grad_norm": 21.170913168432033, + "learning_rate": 8.684909468856311e-06, + "loss": 4.0668, + "step": 
4681 + }, + { + "epoch": 0.9382765531062124, + "grad_norm": 25.479319894718344, + "learning_rate": 8.684121332725888e-06, + "loss": 4.3998, + "step": 4682 + }, + { + "epoch": 0.9384769539078156, + "grad_norm": 18.867593892874105, + "learning_rate": 8.683332996284352e-06, + "loss": 4.2967, + "step": 4683 + }, + { + "epoch": 0.9386773547094188, + "grad_norm": 21.895009677867762, + "learning_rate": 8.682544459574562e-06, + "loss": 4.1438, + "step": 4684 + }, + { + "epoch": 0.938877755511022, + "grad_norm": 24.236591707896547, + "learning_rate": 8.681755722639395e-06, + "loss": 4.1797, + "step": 4685 + }, + { + "epoch": 0.9390781563126253, + "grad_norm": 21.38848385619156, + "learning_rate": 8.680966785521734e-06, + "loss": 3.819, + "step": 4686 + }, + { + "epoch": 0.9392785571142285, + "grad_norm": 22.518142787340278, + "learning_rate": 8.680177648264475e-06, + "loss": 4.168, + "step": 4687 + }, + { + "epoch": 0.9394789579158317, + "grad_norm": 22.575714459351698, + "learning_rate": 8.679388310910525e-06, + "loss": 4.4027, + "step": 4688 + }, + { + "epoch": 0.9396793587174349, + "grad_norm": 35.329561253476136, + "learning_rate": 8.678598773502803e-06, + "loss": 4.3645, + "step": 4689 + }, + { + "epoch": 0.9398797595190381, + "grad_norm": 26.044423313295795, + "learning_rate": 8.677809036084232e-06, + "loss": 3.9008, + "step": 4690 + }, + { + "epoch": 0.9400801603206412, + "grad_norm": 24.33174368723124, + "learning_rate": 8.677019098697758e-06, + "loss": 4.4957, + "step": 4691 + }, + { + "epoch": 0.9402805611222445, + "grad_norm": 26.804248209511954, + "learning_rate": 8.676228961386325e-06, + "loss": 4.0938, + "step": 4692 + }, + { + "epoch": 0.9404809619238477, + "grad_norm": 26.77032107089457, + "learning_rate": 8.6754386241929e-06, + "loss": 4.4286, + "step": 4693 + }, + { + "epoch": 0.9406813627254509, + "grad_norm": 23.83806330972216, + "learning_rate": 8.674648087160449e-06, + "loss": 4.2453, + "step": 4694 + }, + { + "epoch": 0.9408817635270541, + "grad_norm": 24.230872829129712, + "learning_rate": 8.673857350331957e-06, + "loss": 4.5984, + "step": 4695 + }, + { + "epoch": 0.9410821643286573, + "grad_norm": 21.568713733926053, + "learning_rate": 8.67306641375042e-06, + "loss": 4.2821, + "step": 4696 + }, + { + "epoch": 0.9412825651302605, + "grad_norm": 28.88941061649544, + "learning_rate": 8.672275277458839e-06, + "loss": 4.3405, + "step": 4697 + }, + { + "epoch": 0.9414829659318638, + "grad_norm": 22.42499010014705, + "learning_rate": 8.671483941500231e-06, + "loss": 4.0014, + "step": 4698 + }, + { + "epoch": 0.941683366733467, + "grad_norm": 25.39316930149063, + "learning_rate": 8.67069240591762e-06, + "loss": 3.8317, + "step": 4699 + }, + { + "epoch": 0.9418837675350702, + "grad_norm": 22.514258469382394, + "learning_rate": 8.669900670754046e-06, + "loss": 4.3474, + "step": 4700 + }, + { + "epoch": 0.9420841683366733, + "grad_norm": 55.9394523254405, + "learning_rate": 8.669108736052554e-06, + "loss": 3.9442, + "step": 4701 + }, + { + "epoch": 0.9422845691382765, + "grad_norm": 18.487041561734326, + "learning_rate": 8.668316601856205e-06, + "loss": 3.9598, + "step": 4702 + }, + { + "epoch": 0.9424849699398797, + "grad_norm": 16.72420777546434, + "learning_rate": 8.667524268208067e-06, + "loss": 4.1177, + "step": 4703 + }, + { + "epoch": 0.942685370741483, + "grad_norm": 70.92612531645186, + "learning_rate": 8.66673173515122e-06, + "loss": 4.3942, + "step": 4704 + }, + { + "epoch": 0.9428857715430862, + "grad_norm": 21.213870477400555, + "learning_rate": 8.665939002728757e-06, + 
"loss": 4.0486, + "step": 4705 + }, + { + "epoch": 0.9430861723446894, + "grad_norm": 20.348822986295982, + "learning_rate": 8.665146070983779e-06, + "loss": 4.2385, + "step": 4706 + }, + { + "epoch": 0.9432865731462926, + "grad_norm": 21.55559215824776, + "learning_rate": 8.664352939959399e-06, + "loss": 4.1862, + "step": 4707 + }, + { + "epoch": 0.9434869739478958, + "grad_norm": 18.931318862879245, + "learning_rate": 8.66355960969874e-06, + "loss": 3.8053, + "step": 4708 + }, + { + "epoch": 0.943687374749499, + "grad_norm": 26.571814765077026, + "learning_rate": 8.662766080244937e-06, + "loss": 4.7341, + "step": 4709 + }, + { + "epoch": 0.9438877755511023, + "grad_norm": 22.22628715788436, + "learning_rate": 8.661972351641135e-06, + "loss": 4.005, + "step": 4710 + }, + { + "epoch": 0.9440881763527054, + "grad_norm": 21.87332607708315, + "learning_rate": 8.661178423930492e-06, + "loss": 3.8847, + "step": 4711 + }, + { + "epoch": 0.9442885771543086, + "grad_norm": 27.048992316748514, + "learning_rate": 8.660384297156172e-06, + "loss": 4.1286, + "step": 4712 + }, + { + "epoch": 0.9444889779559118, + "grad_norm": 25.804439162269126, + "learning_rate": 8.659589971361355e-06, + "loss": 4.8422, + "step": 4713 + }, + { + "epoch": 0.944689378757515, + "grad_norm": 17.66928417627442, + "learning_rate": 8.658795446589231e-06, + "loss": 3.4454, + "step": 4714 + }, + { + "epoch": 0.9448897795591182, + "grad_norm": 25.809189977078677, + "learning_rate": 8.658000722882997e-06, + "loss": 4.2615, + "step": 4715 + }, + { + "epoch": 0.9450901803607215, + "grad_norm": 18.976896388940066, + "learning_rate": 8.657205800285864e-06, + "loss": 3.9757, + "step": 4716 + }, + { + "epoch": 0.9452905811623247, + "grad_norm": 22.34787644557551, + "learning_rate": 8.656410678841052e-06, + "loss": 4.3142, + "step": 4717 + }, + { + "epoch": 0.9454909819639279, + "grad_norm": 21.130478615411036, + "learning_rate": 8.655615358591795e-06, + "loss": 3.3292, + "step": 4718 + }, + { + "epoch": 0.9456913827655311, + "grad_norm": 29.14124497301365, + "learning_rate": 8.654819839581335e-06, + "loss": 4.1683, + "step": 4719 + }, + { + "epoch": 0.9458917835671342, + "grad_norm": 17.423472441454948, + "learning_rate": 8.654024121852924e-06, + "loss": 3.6094, + "step": 4720 + }, + { + "epoch": 0.9460921843687374, + "grad_norm": 22.045778440761204, + "learning_rate": 8.653228205449829e-06, + "loss": 4.7173, + "step": 4721 + }, + { + "epoch": 0.9462925851703406, + "grad_norm": 51.16801801277794, + "learning_rate": 8.652432090415322e-06, + "loss": 4.3254, + "step": 4722 + }, + { + "epoch": 0.9464929859719439, + "grad_norm": 26.519220618670452, + "learning_rate": 8.651635776792692e-06, + "loss": 5.1692, + "step": 4723 + }, + { + "epoch": 0.9466933867735471, + "grad_norm": 23.840280206944733, + "learning_rate": 8.650839264625234e-06, + "loss": 3.8533, + "step": 4724 + }, + { + "epoch": 0.9468937875751503, + "grad_norm": 21.48060015417201, + "learning_rate": 8.650042553956258e-06, + "loss": 3.8238, + "step": 4725 + }, + { + "epoch": 0.9470941883767535, + "grad_norm": 17.76652218001443, + "learning_rate": 8.649245644829078e-06, + "loss": 4.0795, + "step": 4726 + }, + { + "epoch": 0.9472945891783567, + "grad_norm": 16.817664231755334, + "learning_rate": 8.648448537287027e-06, + "loss": 3.9873, + "step": 4727 + }, + { + "epoch": 0.94749498997996, + "grad_norm": 24.939840481337306, + "learning_rate": 8.647651231373442e-06, + "loss": 4.3068, + "step": 4728 + }, + { + "epoch": 0.9476953907815632, + "grad_norm": 28.43436729348248, + 
"learning_rate": 8.646853727131676e-06, + "loss": 4.0469, + "step": 4729 + }, + { + "epoch": 0.9478957915831663, + "grad_norm": 20.41228599509371, + "learning_rate": 8.646056024605089e-06, + "loss": 3.9275, + "step": 4730 + }, + { + "epoch": 0.9480961923847695, + "grad_norm": 31.461491602064385, + "learning_rate": 8.645258123837054e-06, + "loss": 4.7648, + "step": 4731 + }, + { + "epoch": 0.9482965931863727, + "grad_norm": 23.291072667763142, + "learning_rate": 8.644460024870955e-06, + "loss": 3.9083, + "step": 4732 + }, + { + "epoch": 0.9484969939879759, + "grad_norm": 27.717515323157954, + "learning_rate": 8.643661727750186e-06, + "loss": 4.4978, + "step": 4733 + }, + { + "epoch": 0.9486973947895792, + "grad_norm": 21.08978942296489, + "learning_rate": 8.642863232518147e-06, + "loss": 4.4792, + "step": 4734 + }, + { + "epoch": 0.9488977955911824, + "grad_norm": 20.39805942981312, + "learning_rate": 8.642064539218258e-06, + "loss": 3.9091, + "step": 4735 + }, + { + "epoch": 0.9490981963927856, + "grad_norm": 21.56397311183711, + "learning_rate": 8.641265647893944e-06, + "loss": 3.939, + "step": 4736 + }, + { + "epoch": 0.9492985971943888, + "grad_norm": 20.712632360931046, + "learning_rate": 8.640466558588642e-06, + "loss": 4.4386, + "step": 4737 + }, + { + "epoch": 0.949498997995992, + "grad_norm": 29.23921624855623, + "learning_rate": 8.639667271345798e-06, + "loss": 5.2779, + "step": 4738 + }, + { + "epoch": 0.9496993987975952, + "grad_norm": 32.35682237040354, + "learning_rate": 8.638867786208873e-06, + "loss": 4.1945, + "step": 4739 + }, + { + "epoch": 0.9498997995991983, + "grad_norm": 22.95996902734391, + "learning_rate": 8.638068103221336e-06, + "loss": 4.3006, + "step": 4740 + }, + { + "epoch": 0.9501002004008016, + "grad_norm": 23.78385816026768, + "learning_rate": 8.637268222426664e-06, + "loss": 4.2604, + "step": 4741 + }, + { + "epoch": 0.9503006012024048, + "grad_norm": 17.5470889439286, + "learning_rate": 8.636468143868351e-06, + "loss": 4.4083, + "step": 4742 + }, + { + "epoch": 0.950501002004008, + "grad_norm": 25.081061489538072, + "learning_rate": 8.635667867589897e-06, + "loss": 4.25, + "step": 4743 + }, + { + "epoch": 0.9507014028056112, + "grad_norm": 51.8918125601935, + "learning_rate": 8.634867393634814e-06, + "loss": 3.9761, + "step": 4744 + }, + { + "epoch": 0.9509018036072144, + "grad_norm": 23.319723333007772, + "learning_rate": 8.634066722046623e-06, + "loss": 4.4057, + "step": 4745 + }, + { + "epoch": 0.9511022044088177, + "grad_norm": 44.812443827117725, + "learning_rate": 8.633265852868864e-06, + "loss": 4.6579, + "step": 4746 + }, + { + "epoch": 0.9513026052104209, + "grad_norm": 23.48650660891069, + "learning_rate": 8.632464786145075e-06, + "loss": 3.9905, + "step": 4747 + }, + { + "epoch": 0.9515030060120241, + "grad_norm": 22.229451715229327, + "learning_rate": 8.631663521918813e-06, + "loss": 4.4235, + "step": 4748 + }, + { + "epoch": 0.9517034068136273, + "grad_norm": 28.299035934516922, + "learning_rate": 8.630862060233646e-06, + "loss": 4.1745, + "step": 4749 + }, + { + "epoch": 0.9519038076152304, + "grad_norm": 18.64691058349811, + "learning_rate": 8.63006040113315e-06, + "loss": 3.9942, + "step": 4750 + }, + { + "epoch": 0.9521042084168336, + "grad_norm": 37.062178265448026, + "learning_rate": 8.629258544660909e-06, + "loss": 4.3499, + "step": 4751 + }, + { + "epoch": 0.9523046092184368, + "grad_norm": 25.94490882900076, + "learning_rate": 8.628456490860524e-06, + "loss": 4.0984, + "step": 4752 + }, + { + "epoch": 0.9525050100200401, + 
"grad_norm": 25.656005938496566, + "learning_rate": 8.627654239775604e-06, + "loss": 4.2268, + "step": 4753 + }, + { + "epoch": 0.9527054108216433, + "grad_norm": 21.300953397765834, + "learning_rate": 8.626851791449769e-06, + "loss": 3.9799, + "step": 4754 + }, + { + "epoch": 0.9529058116232465, + "grad_norm": 21.670657165153095, + "learning_rate": 8.626049145926649e-06, + "loss": 4.1022, + "step": 4755 + }, + { + "epoch": 0.9531062124248497, + "grad_norm": 68.6450632399526, + "learning_rate": 8.625246303249883e-06, + "loss": 3.9515, + "step": 4756 + }, + { + "epoch": 0.9533066132264529, + "grad_norm": 31.861490957142042, + "learning_rate": 8.624443263463124e-06, + "loss": 4.3105, + "step": 4757 + }, + { + "epoch": 0.9535070140280562, + "grad_norm": 78.87385014348506, + "learning_rate": 8.623640026610034e-06, + "loss": 4.3112, + "step": 4758 + }, + { + "epoch": 0.9537074148296594, + "grad_norm": 20.556558207400833, + "learning_rate": 8.622836592734288e-06, + "loss": 4.0504, + "step": 4759 + }, + { + "epoch": 0.9539078156312625, + "grad_norm": 18.571246890069713, + "learning_rate": 8.622032961879569e-06, + "loss": 4.2848, + "step": 4760 + }, + { + "epoch": 0.9541082164328657, + "grad_norm": 33.04585459751733, + "learning_rate": 8.62122913408957e-06, + "loss": 3.7921, + "step": 4761 + }, + { + "epoch": 0.9543086172344689, + "grad_norm": 27.61904090357193, + "learning_rate": 8.620425109407998e-06, + "loss": 4.0099, + "step": 4762 + }, + { + "epoch": 0.9545090180360721, + "grad_norm": 27.5520162038037, + "learning_rate": 8.61962088787857e-06, + "loss": 4.7749, + "step": 4763 + }, + { + "epoch": 0.9547094188376753, + "grad_norm": 23.891068652312253, + "learning_rate": 8.618816469545008e-06, + "loss": 4.1211, + "step": 4764 + }, + { + "epoch": 0.9549098196392786, + "grad_norm": 33.03007708918214, + "learning_rate": 8.618011854451056e-06, + "loss": 3.8747, + "step": 4765 + }, + { + "epoch": 0.9551102204408818, + "grad_norm": 27.044684488636918, + "learning_rate": 8.617207042640457e-06, + "loss": 4.2367, + "step": 4766 + }, + { + "epoch": 0.955310621242485, + "grad_norm": 29.94490927669466, + "learning_rate": 8.616402034156973e-06, + "loss": 4.1594, + "step": 4767 + }, + { + "epoch": 0.9555110220440882, + "grad_norm": 18.564198446050607, + "learning_rate": 8.61559682904437e-06, + "loss": 3.7671, + "step": 4768 + }, + { + "epoch": 0.9557114228456913, + "grad_norm": 19.764633756560542, + "learning_rate": 8.614791427346431e-06, + "loss": 3.9027, + "step": 4769 + }, + { + "epoch": 0.9559118236472945, + "grad_norm": 30.678442157957427, + "learning_rate": 8.613985829106948e-06, + "loss": 5.0647, + "step": 4770 + }, + { + "epoch": 0.9561122244488978, + "grad_norm": 32.30331681715419, + "learning_rate": 8.61318003436972e-06, + "loss": 4.2907, + "step": 4771 + }, + { + "epoch": 0.956312625250501, + "grad_norm": 30.456993434430974, + "learning_rate": 8.612374043178558e-06, + "loss": 4.5141, + "step": 4772 + }, + { + "epoch": 0.9565130260521042, + "grad_norm": 17.362387859926812, + "learning_rate": 8.611567855577287e-06, + "loss": 3.8029, + "step": 4773 + }, + { + "epoch": 0.9567134268537074, + "grad_norm": 27.24607621785263, + "learning_rate": 8.610761471609743e-06, + "loss": 4.6404, + "step": 4774 + }, + { + "epoch": 0.9569138276553106, + "grad_norm": 22.935080118155586, + "learning_rate": 8.609954891319766e-06, + "loss": 4.5013, + "step": 4775 + }, + { + "epoch": 0.9571142284569139, + "grad_norm": 23.38483791285557, + "learning_rate": 8.609148114751214e-06, + "loss": 4.1779, + "step": 4776 + }, + { + 
"epoch": 0.9573146292585171, + "grad_norm": 23.775321843276156, + "learning_rate": 8.60834114194795e-06, + "loss": 3.759, + "step": 4777 + }, + { + "epoch": 0.9575150300601203, + "grad_norm": 26.95385914777741, + "learning_rate": 8.607533972953853e-06, + "loss": 4.0677, + "step": 4778 + }, + { + "epoch": 0.9577154308617234, + "grad_norm": 21.846065258757072, + "learning_rate": 8.606726607812807e-06, + "loss": 4.6135, + "step": 4779 + }, + { + "epoch": 0.9579158316633266, + "grad_norm": 24.398309007083068, + "learning_rate": 8.605919046568713e-06, + "loss": 4.1843, + "step": 4780 + }, + { + "epoch": 0.9581162324649298, + "grad_norm": 22.402486910888523, + "learning_rate": 8.605111289265477e-06, + "loss": 4.2002, + "step": 4781 + }, + { + "epoch": 0.958316633266533, + "grad_norm": 28.19084274558204, + "learning_rate": 8.604303335947019e-06, + "loss": 3.7761, + "step": 4782 + }, + { + "epoch": 0.9585170340681363, + "grad_norm": 25.019112665016493, + "learning_rate": 8.603495186657269e-06, + "loss": 4.7867, + "step": 4783 + }, + { + "epoch": 0.9587174348697395, + "grad_norm": 21.110145094650026, + "learning_rate": 8.602686841440165e-06, + "loss": 4.5866, + "step": 4784 + }, + { + "epoch": 0.9589178356713427, + "grad_norm": 18.357234894527927, + "learning_rate": 8.60187830033966e-06, + "loss": 4.0845, + "step": 4785 + }, + { + "epoch": 0.9591182364729459, + "grad_norm": 23.969687629799644, + "learning_rate": 8.601069563399714e-06, + "loss": 3.921, + "step": 4786 + }, + { + "epoch": 0.9593186372745491, + "grad_norm": 32.76186576955715, + "learning_rate": 8.600260630664302e-06, + "loss": 3.9082, + "step": 4787 + }, + { + "epoch": 0.9595190380761524, + "grad_norm": 30.9039190340814, + "learning_rate": 8.599451502177403e-06, + "loss": 4.715, + "step": 4788 + }, + { + "epoch": 0.9597194388777555, + "grad_norm": 36.597725342797006, + "learning_rate": 8.598642177983012e-06, + "loss": 4.5158, + "step": 4789 + }, + { + "epoch": 0.9599198396793587, + "grad_norm": 16.65046540540128, + "learning_rate": 8.597832658125135e-06, + "loss": 4.1928, + "step": 4790 + }, + { + "epoch": 0.9601202404809619, + "grad_norm": 32.4859971758563, + "learning_rate": 8.597022942647788e-06, + "loss": 4.0719, + "step": 4791 + }, + { + "epoch": 0.9603206412825651, + "grad_norm": 22.203159325285622, + "learning_rate": 8.596213031594991e-06, + "loss": 4.0851, + "step": 4792 + }, + { + "epoch": 0.9605210420841683, + "grad_norm": 36.31765729247007, + "learning_rate": 8.595402925010783e-06, + "loss": 4.354, + "step": 4793 + }, + { + "epoch": 0.9607214428857715, + "grad_norm": 26.25913048700721, + "learning_rate": 8.594592622939211e-06, + "loss": 4.0351, + "step": 4794 + }, + { + "epoch": 0.9609218436873748, + "grad_norm": 23.607715830694378, + "learning_rate": 8.593782125424332e-06, + "loss": 3.7257, + "step": 4795 + }, + { + "epoch": 0.961122244488978, + "grad_norm": 31.705877714395463, + "learning_rate": 8.592971432510215e-06, + "loss": 4.6646, + "step": 4796 + }, + { + "epoch": 0.9613226452905812, + "grad_norm": 25.797338365718595, + "learning_rate": 8.592160544240937e-06, + "loss": 3.973, + "step": 4797 + }, + { + "epoch": 0.9615230460921844, + "grad_norm": 34.798964089414454, + "learning_rate": 8.591349460660587e-06, + "loss": 4.5457, + "step": 4798 + }, + { + "epoch": 0.9617234468937875, + "grad_norm": 25.343871030507987, + "learning_rate": 8.590538181813266e-06, + "loss": 4.2849, + "step": 4799 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 25.926613147578177, + "learning_rate": 8.589726707743083e-06, + "loss": 4.4498, 
+ "step": 4800 + }, + { + "epoch": 0.962124248496994, + "grad_norm": 33.914999886178514, + "learning_rate": 8.58891503849416e-06, + "loss": 4.4616, + "step": 4801 + }, + { + "epoch": 0.9623246492985972, + "grad_norm": 23.308754561682143, + "learning_rate": 8.58810317411063e-06, + "loss": 4.2912, + "step": 4802 + }, + { + "epoch": 0.9625250501002004, + "grad_norm": 20.053124836517693, + "learning_rate": 8.587291114636633e-06, + "loss": 3.6106, + "step": 4803 + }, + { + "epoch": 0.9627254509018036, + "grad_norm": 22.52858604808908, + "learning_rate": 8.586478860116323e-06, + "loss": 3.7835, + "step": 4804 + }, + { + "epoch": 0.9629258517034068, + "grad_norm": 29.11098871208611, + "learning_rate": 8.585666410593863e-06, + "loss": 4.4566, + "step": 4805 + }, + { + "epoch": 0.96312625250501, + "grad_norm": 14.56541787910971, + "learning_rate": 8.584853766113428e-06, + "loss": 3.4749, + "step": 4806 + }, + { + "epoch": 0.9633266533066133, + "grad_norm": 20.32333182281101, + "learning_rate": 8.584040926719202e-06, + "loss": 4.1369, + "step": 4807 + }, + { + "epoch": 0.9635270541082165, + "grad_norm": 28.318357350286455, + "learning_rate": 8.583227892455379e-06, + "loss": 4.0267, + "step": 4808 + }, + { + "epoch": 0.9637274549098196, + "grad_norm": 26.392513555721095, + "learning_rate": 8.582414663366168e-06, + "loss": 4.5332, + "step": 4809 + }, + { + "epoch": 0.9639278557114228, + "grad_norm": 24.908708326824303, + "learning_rate": 8.581601239495783e-06, + "loss": 3.5523, + "step": 4810 + }, + { + "epoch": 0.964128256513026, + "grad_norm": 22.968539787124037, + "learning_rate": 8.58078762088845e-06, + "loss": 3.9846, + "step": 4811 + }, + { + "epoch": 0.9643286573146292, + "grad_norm": 28.9976938318428, + "learning_rate": 8.57997380758841e-06, + "loss": 4.7654, + "step": 4812 + }, + { + "epoch": 0.9645290581162325, + "grad_norm": 20.412997374694214, + "learning_rate": 8.57915979963991e-06, + "loss": 4.2265, + "step": 4813 + }, + { + "epoch": 0.9647294589178357, + "grad_norm": 22.526223353806895, + "learning_rate": 8.578345597087208e-06, + "loss": 4.2426, + "step": 4814 + }, + { + "epoch": 0.9649298597194389, + "grad_norm": 23.676406281197323, + "learning_rate": 8.577531199974572e-06, + "loss": 4.2311, + "step": 4815 + }, + { + "epoch": 0.9651302605210421, + "grad_norm": 24.88754477963126, + "learning_rate": 8.576716608346285e-06, + "loss": 4.173, + "step": 4816 + }, + { + "epoch": 0.9653306613226453, + "grad_norm": 19.25495434041008, + "learning_rate": 8.575901822246637e-06, + "loss": 3.7917, + "step": 4817 + }, + { + "epoch": 0.9655310621242486, + "grad_norm": 38.511934730920814, + "learning_rate": 8.575086841719927e-06, + "loss": 4.1772, + "step": 4818 + }, + { + "epoch": 0.9657314629258517, + "grad_norm": 20.665369777683946, + "learning_rate": 8.57427166681047e-06, + "loss": 3.831, + "step": 4819 + }, + { + "epoch": 0.9659318637274549, + "grad_norm": 25.1275167815821, + "learning_rate": 8.573456297562584e-06, + "loss": 3.9912, + "step": 4820 + }, + { + "epoch": 0.9661322645290581, + "grad_norm": 23.85898984815849, + "learning_rate": 8.572640734020606e-06, + "loss": 4.244, + "step": 4821 + }, + { + "epoch": 0.9663326653306613, + "grad_norm": 19.70157413743972, + "learning_rate": 8.571824976228877e-06, + "loss": 3.6131, + "step": 4822 + }, + { + "epoch": 0.9665330661322645, + "grad_norm": 40.796530930292874, + "learning_rate": 8.571009024231752e-06, + "loss": 4.9422, + "step": 4823 + }, + { + "epoch": 0.9667334669338677, + "grad_norm": 42.454861683504724, + "learning_rate": 
8.570192878073594e-06, + "loss": 4.5577, + "step": 4824 + }, + { + "epoch": 0.966933867735471, + "grad_norm": 21.622885448078282, + "learning_rate": 8.56937653779878e-06, + "loss": 3.4396, + "step": 4825 + }, + { + "epoch": 0.9671342685370742, + "grad_norm": 21.169550329856115, + "learning_rate": 8.568560003451695e-06, + "loss": 4.4918, + "step": 4826 + }, + { + "epoch": 0.9673346693386774, + "grad_norm": 29.321508261441526, + "learning_rate": 8.567743275076735e-06, + "loss": 4.5419, + "step": 4827 + }, + { + "epoch": 0.9675350701402805, + "grad_norm": 16.04532886809187, + "learning_rate": 8.566926352718306e-06, + "loss": 3.6266, + "step": 4828 + }, + { + "epoch": 0.9677354709418837, + "grad_norm": 30.76777233860871, + "learning_rate": 8.566109236420825e-06, + "loss": 4.6306, + "step": 4829 + }, + { + "epoch": 0.9679358717434869, + "grad_norm": 24.097373207219825, + "learning_rate": 8.565291926228723e-06, + "loss": 4.232, + "step": 4830 + }, + { + "epoch": 0.9681362725450902, + "grad_norm": 25.65374919563609, + "learning_rate": 8.564474422186433e-06, + "loss": 3.8686, + "step": 4831 + }, + { + "epoch": 0.9683366733466934, + "grad_norm": 19.76289326213352, + "learning_rate": 8.563656724338409e-06, + "loss": 3.5852, + "step": 4832 + }, + { + "epoch": 0.9685370741482966, + "grad_norm": 23.997943926054724, + "learning_rate": 8.56283883272911e-06, + "loss": 4.1898, + "step": 4833 + }, + { + "epoch": 0.9687374749498998, + "grad_norm": 20.685191067638456, + "learning_rate": 8.562020747403e-06, + "loss": 3.6141, + "step": 4834 + }, + { + "epoch": 0.968937875751503, + "grad_norm": 29.925666936682507, + "learning_rate": 8.561202468404569e-06, + "loss": 4.6565, + "step": 4835 + }, + { + "epoch": 0.9691382765531062, + "grad_norm": 23.916389241434565, + "learning_rate": 8.560383995778301e-06, + "loss": 4.1839, + "step": 4836 + }, + { + "epoch": 0.9693386773547095, + "grad_norm": 62.55993350095342, + "learning_rate": 8.5595653295687e-06, + "loss": 4.3196, + "step": 4837 + }, + { + "epoch": 0.9695390781563126, + "grad_norm": 38.1627096363938, + "learning_rate": 8.558746469820276e-06, + "loss": 4.5149, + "step": 4838 + }, + { + "epoch": 0.9697394789579158, + "grad_norm": 19.790711591388124, + "learning_rate": 8.557927416577555e-06, + "loss": 3.8551, + "step": 4839 + }, + { + "epoch": 0.969939879759519, + "grad_norm": 23.377677378628608, + "learning_rate": 8.557108169885067e-06, + "loss": 4.1694, + "step": 4840 + }, + { + "epoch": 0.9701402805611222, + "grad_norm": 26.332622707186005, + "learning_rate": 8.556288729787357e-06, + "loss": 4.5998, + "step": 4841 + }, + { + "epoch": 0.9703406813627254, + "grad_norm": 37.46669140224579, + "learning_rate": 8.55546909632898e-06, + "loss": 4.1106, + "step": 4842 + }, + { + "epoch": 0.9705410821643287, + "grad_norm": 50.35341785730677, + "learning_rate": 8.5546492695545e-06, + "loss": 4.3845, + "step": 4843 + }, + { + "epoch": 0.9707414829659319, + "grad_norm": 19.004564882650357, + "learning_rate": 8.553829249508492e-06, + "loss": 4.4818, + "step": 4844 + }, + { + "epoch": 0.9709418837675351, + "grad_norm": 23.194341160937697, + "learning_rate": 8.55300903623554e-06, + "loss": 3.9604, + "step": 4845 + }, + { + "epoch": 0.9711422845691383, + "grad_norm": 20.36432558462247, + "learning_rate": 8.552188629780245e-06, + "loss": 4.2058, + "step": 4846 + }, + { + "epoch": 0.9713426853707415, + "grad_norm": 30.67208400380634, + "learning_rate": 8.551368030187209e-06, + "loss": 4.0108, + "step": 4847 + }, + { + "epoch": 0.9715430861723446, + "grad_norm": 36.55015773698355, 
+ "learning_rate": 8.55054723750105e-06, + "loss": 4.2394, + "step": 4848 + }, + { + "epoch": 0.9717434869739479, + "grad_norm": 18.683148951637396, + "learning_rate": 8.549726251766398e-06, + "loss": 3.8962, + "step": 4849 + }, + { + "epoch": 0.9719438877755511, + "grad_norm": 18.60021953658863, + "learning_rate": 8.54890507302789e-06, + "loss": 3.9889, + "step": 4850 + }, + { + "epoch": 0.9721442885771543, + "grad_norm": 31.94277051453436, + "learning_rate": 8.548083701330174e-06, + "loss": 4.1012, + "step": 4851 + }, + { + "epoch": 0.9723446893787575, + "grad_norm": 26.08107607240504, + "learning_rate": 8.54726213671791e-06, + "loss": 4.4186, + "step": 4852 + }, + { + "epoch": 0.9725450901803607, + "grad_norm": 22.37543633000572, + "learning_rate": 8.546440379235767e-06, + "loss": 4.1898, + "step": 4853 + }, + { + "epoch": 0.972745490981964, + "grad_norm": 42.81381136270305, + "learning_rate": 8.545618428928426e-06, + "loss": 5.2692, + "step": 4854 + }, + { + "epoch": 0.9729458917835672, + "grad_norm": 16.397893354651437, + "learning_rate": 8.544796285840576e-06, + "loss": 3.9646, + "step": 4855 + }, + { + "epoch": 0.9731462925851704, + "grad_norm": 21.660495189192044, + "learning_rate": 8.54397395001692e-06, + "loss": 4.2376, + "step": 4856 + }, + { + "epoch": 0.9733466933867736, + "grad_norm": 22.607860460367878, + "learning_rate": 8.543151421502169e-06, + "loss": 4.4966, + "step": 4857 + }, + { + "epoch": 0.9735470941883767, + "grad_norm": 21.813789092975853, + "learning_rate": 8.542328700341047e-06, + "loss": 3.9108, + "step": 4858 + }, + { + "epoch": 0.9737474949899799, + "grad_norm": 28.73385522997557, + "learning_rate": 8.541505786578281e-06, + "loss": 4.3937, + "step": 4859 + }, + { + "epoch": 0.9739478957915831, + "grad_norm": 17.600863477334958, + "learning_rate": 8.540682680258621e-06, + "loss": 3.7916, + "step": 4860 + }, + { + "epoch": 0.9741482965931864, + "grad_norm": 28.29660391622816, + "learning_rate": 8.539859381426814e-06, + "loss": 4.5055, + "step": 4861 + }, + { + "epoch": 0.9743486973947896, + "grad_norm": 33.25297524050064, + "learning_rate": 8.539035890127629e-06, + "loss": 4.8815, + "step": 4862 + }, + { + "epoch": 0.9745490981963928, + "grad_norm": 22.73544883757732, + "learning_rate": 8.538212206405837e-06, + "loss": 3.971, + "step": 4863 + }, + { + "epoch": 0.974749498997996, + "grad_norm": 23.15537818162383, + "learning_rate": 8.537388330306227e-06, + "loss": 3.5207, + "step": 4864 + }, + { + "epoch": 0.9749498997995992, + "grad_norm": 24.471160288311285, + "learning_rate": 8.53656426187359e-06, + "loss": 4.0379, + "step": 4865 + }, + { + "epoch": 0.9751503006012024, + "grad_norm": 20.754803866850562, + "learning_rate": 8.535740001152735e-06, + "loss": 4.3112, + "step": 4866 + }, + { + "epoch": 0.9753507014028057, + "grad_norm": 28.582508832388623, + "learning_rate": 8.534915548188476e-06, + "loss": 4.5134, + "step": 4867 + }, + { + "epoch": 0.9755511022044088, + "grad_norm": 19.786573993173647, + "learning_rate": 8.53409090302564e-06, + "loss": 3.9167, + "step": 4868 + }, + { + "epoch": 0.975751503006012, + "grad_norm": 42.19767281984607, + "learning_rate": 8.533266065709068e-06, + "loss": 4.0091, + "step": 4869 + }, + { + "epoch": 0.9759519038076152, + "grad_norm": 23.412443587878585, + "learning_rate": 8.5324410362836e-06, + "loss": 4.1297, + "step": 4870 + }, + { + "epoch": 0.9761523046092184, + "grad_norm": 36.29232033419346, + "learning_rate": 8.531615814794102e-06, + "loss": 3.5949, + "step": 4871 + }, + { + "epoch": 0.9763527054108216, + 
"grad_norm": 20.764060060944693, + "learning_rate": 8.530790401285436e-06, + "loss": 4.3229, + "step": 4872 + }, + { + "epoch": 0.9765531062124249, + "grad_norm": 29.294314932135563, + "learning_rate": 8.529964795802485e-06, + "loss": 3.7561, + "step": 4873 + }, + { + "epoch": 0.9767535070140281, + "grad_norm": 19.785389713975093, + "learning_rate": 8.529138998390139e-06, + "loss": 4.1075, + "step": 4874 + }, + { + "epoch": 0.9769539078156313, + "grad_norm": 20.376591943600573, + "learning_rate": 8.528313009093295e-06, + "loss": 4.187, + "step": 4875 + }, + { + "epoch": 0.9771543086172345, + "grad_norm": 34.291830788311664, + "learning_rate": 8.527486827956863e-06, + "loss": 4.7716, + "step": 4876 + }, + { + "epoch": 0.9773547094188377, + "grad_norm": 21.351068032118114, + "learning_rate": 8.526660455025767e-06, + "loss": 4.0023, + "step": 4877 + }, + { + "epoch": 0.9775551102204408, + "grad_norm": 22.61555187807588, + "learning_rate": 8.525833890344936e-06, + "loss": 4.3649, + "step": 4878 + }, + { + "epoch": 0.977755511022044, + "grad_norm": 24.682330240501713, + "learning_rate": 8.52500713395931e-06, + "loss": 4.322, + "step": 4879 + }, + { + "epoch": 0.9779559118236473, + "grad_norm": 30.997723462159293, + "learning_rate": 8.524180185913844e-06, + "loss": 4.1427, + "step": 4880 + }, + { + "epoch": 0.9781563126252505, + "grad_norm": 23.527832711459013, + "learning_rate": 8.523353046253501e-06, + "loss": 4.0143, + "step": 4881 + }, + { + "epoch": 0.9783567134268537, + "grad_norm": 19.363649435923197, + "learning_rate": 8.52252571502325e-06, + "loss": 3.547, + "step": 4882 + }, + { + "epoch": 0.9785571142284569, + "grad_norm": 22.24798103821081, + "learning_rate": 8.521698192268078e-06, + "loss": 4.3031, + "step": 4883 + }, + { + "epoch": 0.9787575150300601, + "grad_norm": 26.384478537274997, + "learning_rate": 8.520870478032974e-06, + "loss": 4.1542, + "step": 4884 + }, + { + "epoch": 0.9789579158316634, + "grad_norm": 26.560354384595122, + "learning_rate": 8.520042572362946e-06, + "loss": 4.1976, + "step": 4885 + }, + { + "epoch": 0.9791583166332666, + "grad_norm": 17.462445148970104, + "learning_rate": 8.519214475303007e-06, + "loss": 3.7471, + "step": 4886 + }, + { + "epoch": 0.9793587174348697, + "grad_norm": 29.654472657528938, + "learning_rate": 8.518386186898182e-06, + "loss": 4.7382, + "step": 4887 + }, + { + "epoch": 0.9795591182364729, + "grad_norm": 21.61629419256487, + "learning_rate": 8.517557707193507e-06, + "loss": 4.3441, + "step": 4888 + }, + { + "epoch": 0.9797595190380761, + "grad_norm": 22.003433724647262, + "learning_rate": 8.516729036234027e-06, + "loss": 4.2885, + "step": 4889 + }, + { + "epoch": 0.9799599198396793, + "grad_norm": 17.706248881623374, + "learning_rate": 8.515900174064797e-06, + "loss": 3.9292, + "step": 4890 + }, + { + "epoch": 0.9801603206412826, + "grad_norm": 21.214446793805873, + "learning_rate": 8.515071120730887e-06, + "loss": 3.8333, + "step": 4891 + }, + { + "epoch": 0.9803607214428858, + "grad_norm": 24.621948903178556, + "learning_rate": 8.514241876277369e-06, + "loss": 3.947, + "step": 4892 + }, + { + "epoch": 0.980561122244489, + "grad_norm": 24.87129747442167, + "learning_rate": 8.513412440749335e-06, + "loss": 3.9134, + "step": 4893 + }, + { + "epoch": 0.9807615230460922, + "grad_norm": 22.580398624366914, + "learning_rate": 8.512582814191879e-06, + "loss": 4.4645, + "step": 4894 + }, + { + "epoch": 0.9809619238476954, + "grad_norm": 23.258850508463603, + "learning_rate": 8.511752996650112e-06, + "loss": 4.1222, + "step": 4895 + }, + { 
+ "epoch": 0.9811623246492986, + "grad_norm": 23.57178914634162, + "learning_rate": 8.510922988169148e-06, + "loss": 4.2586, + "step": 4896 + }, + { + "epoch": 0.9813627254509018, + "grad_norm": 23.20571792819137, + "learning_rate": 8.51009278879412e-06, + "loss": 4.2883, + "step": 4897 + }, + { + "epoch": 0.981563126252505, + "grad_norm": 21.16326771392384, + "learning_rate": 8.509262398570165e-06, + "loss": 3.9686, + "step": 4898 + }, + { + "epoch": 0.9817635270541082, + "grad_norm": 35.812327174049116, + "learning_rate": 8.508431817542433e-06, + "loss": 4.9087, + "step": 4899 + }, + { + "epoch": 0.9819639278557114, + "grad_norm": 23.92023210620627, + "learning_rate": 8.507601045756085e-06, + "loss": 4.0605, + "step": 4900 + }, + { + "epoch": 0.9821643286573146, + "grad_norm": 35.10464676186804, + "learning_rate": 8.50677008325629e-06, + "loss": 3.9928, + "step": 4901 + }, + { + "epoch": 0.9823647294589178, + "grad_norm": 30.280236413433634, + "learning_rate": 8.50593893008823e-06, + "loss": 4.9341, + "step": 4902 + }, + { + "epoch": 0.9825651302605211, + "grad_norm": 22.875020634150943, + "learning_rate": 8.505107586297094e-06, + "loss": 4.09, + "step": 4903 + }, + { + "epoch": 0.9827655310621243, + "grad_norm": 23.89144236670922, + "learning_rate": 8.504276051928085e-06, + "loss": 4.5084, + "step": 4904 + }, + { + "epoch": 0.9829659318637275, + "grad_norm": 19.878373020775435, + "learning_rate": 8.503444327026415e-06, + "loss": 4.0237, + "step": 4905 + }, + { + "epoch": 0.9831663326653307, + "grad_norm": 23.594176845883442, + "learning_rate": 8.502612411637306e-06, + "loss": 4.4498, + "step": 4906 + }, + { + "epoch": 0.9833667334669338, + "grad_norm": 23.581837051730954, + "learning_rate": 8.501780305805986e-06, + "loss": 4.1717, + "step": 4907 + }, + { + "epoch": 0.983567134268537, + "grad_norm": 17.68476632703697, + "learning_rate": 8.500948009577705e-06, + "loss": 3.7966, + "step": 4908 + }, + { + "epoch": 0.9837675350701403, + "grad_norm": 20.3181920186217, + "learning_rate": 8.500115522997711e-06, + "loss": 3.7979, + "step": 4909 + }, + { + "epoch": 0.9839679358717435, + "grad_norm": 23.13313060667702, + "learning_rate": 8.499282846111271e-06, + "loss": 3.8102, + "step": 4910 + }, + { + "epoch": 0.9841683366733467, + "grad_norm": 23.014887569532167, + "learning_rate": 8.498449978963657e-06, + "loss": 4.2855, + "step": 4911 + }, + { + "epoch": 0.9843687374749499, + "grad_norm": 29.240861176308332, + "learning_rate": 8.497616921600153e-06, + "loss": 4.242, + "step": 4912 + }, + { + "epoch": 0.9845691382765531, + "grad_norm": 23.046101489706494, + "learning_rate": 8.496783674066055e-06, + "loss": 3.9116, + "step": 4913 + }, + { + "epoch": 0.9847695390781563, + "grad_norm": 22.25182927758732, + "learning_rate": 8.495950236406665e-06, + "loss": 4.2751, + "step": 4914 + }, + { + "epoch": 0.9849699398797596, + "grad_norm": 27.703804968483638, + "learning_rate": 8.495116608667301e-06, + "loss": 4.1756, + "step": 4915 + }, + { + "epoch": 0.9851703406813628, + "grad_norm": 23.685239774666165, + "learning_rate": 8.49428279089329e-06, + "loss": 4.2523, + "step": 4916 + }, + { + "epoch": 0.9853707414829659, + "grad_norm": 21.98919587747742, + "learning_rate": 8.493448783129963e-06, + "loss": 4.0412, + "step": 4917 + }, + { + "epoch": 0.9855711422845691, + "grad_norm": 26.700413263156644, + "learning_rate": 8.49261458542267e-06, + "loss": 4.0234, + "step": 4918 + }, + { + "epoch": 0.9857715430861723, + "grad_norm": 54.96168914290787, + "learning_rate": 8.491780197816765e-06, + "loss": 4.588, + 
"step": 4919 + }, + { + "epoch": 0.9859719438877755, + "grad_norm": 17.82661674669936, + "learning_rate": 8.490945620357618e-06, + "loss": 3.6395, + "step": 4920 + }, + { + "epoch": 0.9861723446893788, + "grad_norm": 33.767128277732944, + "learning_rate": 8.490110853090603e-06, + "loss": 3.9874, + "step": 4921 + }, + { + "epoch": 0.986372745490982, + "grad_norm": 22.974797960517694, + "learning_rate": 8.489275896061108e-06, + "loss": 3.7632, + "step": 4922 + }, + { + "epoch": 0.9865731462925852, + "grad_norm": 20.967844508873682, + "learning_rate": 8.488440749314535e-06, + "loss": 4.2165, + "step": 4923 + }, + { + "epoch": 0.9867735470941884, + "grad_norm": 34.776972580127094, + "learning_rate": 8.487605412896287e-06, + "loss": 4.482, + "step": 4924 + }, + { + "epoch": 0.9869739478957916, + "grad_norm": 31.25634154512062, + "learning_rate": 8.486769886851787e-06, + "loss": 4.3866, + "step": 4925 + }, + { + "epoch": 0.9871743486973948, + "grad_norm": 27.40375871542617, + "learning_rate": 8.485934171226458e-06, + "loss": 4.3709, + "step": 4926 + }, + { + "epoch": 0.987374749498998, + "grad_norm": 18.525488812303283, + "learning_rate": 8.485098266065745e-06, + "loss": 3.9502, + "step": 4927 + }, + { + "epoch": 0.9875751503006012, + "grad_norm": 23.882687886092587, + "learning_rate": 8.484262171415093e-06, + "loss": 4.3241, + "step": 4928 + }, + { + "epoch": 0.9877755511022044, + "grad_norm": 27.986292773631824, + "learning_rate": 8.483425887319965e-06, + "loss": 4.4341, + "step": 4929 + }, + { + "epoch": 0.9879759519038076, + "grad_norm": 26.01995805860167, + "learning_rate": 8.482589413825828e-06, + "loss": 3.9988, + "step": 4930 + }, + { + "epoch": 0.9881763527054108, + "grad_norm": 20.369013171979923, + "learning_rate": 8.481752750978164e-06, + "loss": 3.7409, + "step": 4931 + }, + { + "epoch": 0.988376753507014, + "grad_norm": 20.615636557876318, + "learning_rate": 8.480915898822464e-06, + "loss": 4.0651, + "step": 4932 + }, + { + "epoch": 0.9885771543086173, + "grad_norm": 29.711294051144925, + "learning_rate": 8.480078857404229e-06, + "loss": 4.0862, + "step": 4933 + }, + { + "epoch": 0.9887775551102205, + "grad_norm": 28.849763681117558, + "learning_rate": 8.479241626768969e-06, + "loss": 3.8645, + "step": 4934 + }, + { + "epoch": 0.9889779559118237, + "grad_norm": 23.081111078609464, + "learning_rate": 8.478404206962203e-06, + "loss": 4.2426, + "step": 4935 + }, + { + "epoch": 0.9891783567134268, + "grad_norm": 29.04467900833674, + "learning_rate": 8.47756659802947e-06, + "loss": 4.3371, + "step": 4936 + }, + { + "epoch": 0.98937875751503, + "grad_norm": 21.927100070654415, + "learning_rate": 8.476728800016304e-06, + "loss": 3.8985, + "step": 4937 + }, + { + "epoch": 0.9895791583166332, + "grad_norm": 36.92221812517823, + "learning_rate": 8.475890812968264e-06, + "loss": 4.3688, + "step": 4938 + }, + { + "epoch": 0.9897795591182365, + "grad_norm": 23.292111403822595, + "learning_rate": 8.475052636930909e-06, + "loss": 3.8379, + "step": 4939 + }, + { + "epoch": 0.9899799599198397, + "grad_norm": 42.72136385083651, + "learning_rate": 8.474214271949812e-06, + "loss": 3.7783, + "step": 4940 + }, + { + "epoch": 0.9901803607214429, + "grad_norm": 23.87444945665167, + "learning_rate": 8.473375718070556e-06, + "loss": 4.4138, + "step": 4941 + }, + { + "epoch": 0.9903807615230461, + "grad_norm": 26.90131256741624, + "learning_rate": 8.472536975338736e-06, + "loss": 4.2054, + "step": 4942 + }, + { + "epoch": 0.9905811623246493, + "grad_norm": 24.62189430414794, + "learning_rate": 
8.471698043799954e-06, + "loss": 4.7453, + "step": 4943 + }, + { + "epoch": 0.9907815631262525, + "grad_norm": 36.77077293125454, + "learning_rate": 8.470858923499825e-06, + "loss": 3.9456, + "step": 4944 + }, + { + "epoch": 0.9909819639278558, + "grad_norm": 25.36840521514797, + "learning_rate": 8.470019614483972e-06, + "loss": 4.401, + "step": 4945 + }, + { + "epoch": 0.9911823647294589, + "grad_norm": 45.21951371496458, + "learning_rate": 8.469180116798033e-06, + "loss": 3.9686, + "step": 4946 + }, + { + "epoch": 0.9913827655310621, + "grad_norm": 29.272784960612103, + "learning_rate": 8.468340430487649e-06, + "loss": 4.4128, + "step": 4947 + }, + { + "epoch": 0.9915831663326653, + "grad_norm": 30.63645906499823, + "learning_rate": 8.467500555598473e-06, + "loss": 4.1735, + "step": 4948 + }, + { + "epoch": 0.9917835671342685, + "grad_norm": 33.590126038370904, + "learning_rate": 8.466660492176177e-06, + "loss": 4.1684, + "step": 4949 + }, + { + "epoch": 0.9919839679358717, + "grad_norm": 25.738887579993406, + "learning_rate": 8.465820240266431e-06, + "loss": 4.3422, + "step": 4950 + }, + { + "epoch": 0.992184368737475, + "grad_norm": 32.2478465997635, + "learning_rate": 8.464979799914924e-06, + "loss": 4.6686, + "step": 4951 + }, + { + "epoch": 0.9923847695390782, + "grad_norm": 30.389163328478407, + "learning_rate": 8.46413917116735e-06, + "loss": 4.0867, + "step": 4952 + }, + { + "epoch": 0.9925851703406814, + "grad_norm": 22.78890672349669, + "learning_rate": 8.463298354069413e-06, + "loss": 4.0058, + "step": 4953 + }, + { + "epoch": 0.9927855711422846, + "grad_norm": 21.167658446287856, + "learning_rate": 8.462457348666835e-06, + "loss": 3.989, + "step": 4954 + }, + { + "epoch": 0.9929859719438878, + "grad_norm": 33.22854297161485, + "learning_rate": 8.46161615500534e-06, + "loss": 4.0292, + "step": 4955 + }, + { + "epoch": 0.9931863727454909, + "grad_norm": 34.63790503012625, + "learning_rate": 8.460774773130664e-06, + "loss": 4.22, + "step": 4956 + }, + { + "epoch": 0.9933867735470941, + "grad_norm": 18.673774590915087, + "learning_rate": 8.459933203088555e-06, + "loss": 3.8427, + "step": 4957 + }, + { + "epoch": 0.9935871743486974, + "grad_norm": 20.239591669942715, + "learning_rate": 8.45909144492477e-06, + "loss": 4.1822, + "step": 4958 + }, + { + "epoch": 0.9937875751503006, + "grad_norm": 25.27164281324431, + "learning_rate": 8.458249498685078e-06, + "loss": 4.5335, + "step": 4959 + }, + { + "epoch": 0.9939879759519038, + "grad_norm": 15.50821231910566, + "learning_rate": 8.457407364415255e-06, + "loss": 3.4526, + "step": 4960 + }, + { + "epoch": 0.994188376753507, + "grad_norm": 21.676056784433094, + "learning_rate": 8.456565042161091e-06, + "loss": 4.2979, + "step": 4961 + }, + { + "epoch": 0.9943887775551102, + "grad_norm": 47.43511493027028, + "learning_rate": 8.455722531968383e-06, + "loss": 4.7679, + "step": 4962 + }, + { + "epoch": 0.9945891783567135, + "grad_norm": 23.756795217094453, + "learning_rate": 8.454879833882939e-06, + "loss": 4.9309, + "step": 4963 + }, + { + "epoch": 0.9947895791583167, + "grad_norm": 21.14284863080073, + "learning_rate": 8.45403694795058e-06, + "loss": 4.1266, + "step": 4964 + }, + { + "epoch": 0.9949899799599199, + "grad_norm": 26.961703939195438, + "learning_rate": 8.453193874217131e-06, + "loss": 4.2226, + "step": 4965 + }, + { + "epoch": 0.995190380761523, + "grad_norm": 35.20124709025573, + "learning_rate": 8.452350612728437e-06, + "loss": 3.7679, + "step": 4966 + }, + { + "epoch": 0.9953907815631262, + "grad_norm": 
31.103024119642193, + "learning_rate": 8.451507163530342e-06, + "loss": 4.1372, + "step": 4967 + }, + { + "epoch": 0.9955911823647294, + "grad_norm": 24.54593660354436, + "learning_rate": 8.450663526668707e-06, + "loss": 4.4099, + "step": 4968 + }, + { + "epoch": 0.9957915831663327, + "grad_norm": 28.949042271955907, + "learning_rate": 8.449819702189404e-06, + "loss": 5.1304, + "step": 4969 + }, + { + "epoch": 0.9959919839679359, + "grad_norm": 24.454187881568913, + "learning_rate": 8.448975690138312e-06, + "loss": 3.8435, + "step": 4970 + }, + { + "epoch": 0.9961923847695391, + "grad_norm": 31.083632614368845, + "learning_rate": 8.44813149056132e-06, + "loss": 4.505, + "step": 4971 + }, + { + "epoch": 0.9963927855711423, + "grad_norm": 23.27544113905919, + "learning_rate": 8.447287103504327e-06, + "loss": 4.3542, + "step": 4972 + }, + { + "epoch": 0.9965931863727455, + "grad_norm": 21.84405004002317, + "learning_rate": 8.446442529013247e-06, + "loss": 4.3551, + "step": 4973 + }, + { + "epoch": 0.9967935871743487, + "grad_norm": 21.475156765341595, + "learning_rate": 8.445597767133999e-06, + "loss": 4.2728, + "step": 4974 + }, + { + "epoch": 0.996993987975952, + "grad_norm": 23.100532050420135, + "learning_rate": 8.444752817912515e-06, + "loss": 4.0716, + "step": 4975 + }, + { + "epoch": 0.9971943887775551, + "grad_norm": 43.33273528606073, + "learning_rate": 8.443907681394734e-06, + "loss": 3.5655, + "step": 4976 + }, + { + "epoch": 0.9973947895791583, + "grad_norm": 29.917452735490908, + "learning_rate": 8.44306235762661e-06, + "loss": 4.4669, + "step": 4977 + }, + { + "epoch": 0.9975951903807615, + "grad_norm": 25.467945262625847, + "learning_rate": 8.442216846654104e-06, + "loss": 3.952, + "step": 4978 + }, + { + "epoch": 0.9977955911823647, + "grad_norm": 36.16915048365427, + "learning_rate": 8.441371148523184e-06, + "loss": 3.9037, + "step": 4979 + }, + { + "epoch": 0.9979959919839679, + "grad_norm": 44.30671297690325, + "learning_rate": 8.440525263279836e-06, + "loss": 4.0719, + "step": 4980 + }, + { + "epoch": 0.9981963927855712, + "grad_norm": 27.9288172183177, + "learning_rate": 8.439679190970052e-06, + "loss": 4.2358, + "step": 4981 + }, + { + "epoch": 0.9983967935871744, + "grad_norm": 17.85904785384535, + "learning_rate": 8.438832931639831e-06, + "loss": 3.6168, + "step": 4982 + }, + { + "epoch": 0.9985971943887776, + "grad_norm": 27.535776457238924, + "learning_rate": 8.437986485335187e-06, + "loss": 4.1919, + "step": 4983 + }, + { + "epoch": 0.9987975951903808, + "grad_norm": 28.255328971746962, + "learning_rate": 8.437139852102146e-06, + "loss": 4.7006, + "step": 4984 + }, + { + "epoch": 0.998997995991984, + "grad_norm": 29.18345132096226, + "learning_rate": 8.436293031986736e-06, + "loss": 4.8527, + "step": 4985 + }, + { + "epoch": 0.9991983967935871, + "grad_norm": 37.80680536699088, + "learning_rate": 8.435446025035e-06, + "loss": 3.8026, + "step": 4986 + }, + { + "epoch": 0.9993987975951903, + "grad_norm": 21.7803478173753, + "learning_rate": 8.434598831292995e-06, + "loss": 4.0814, + "step": 4987 + }, + { + "epoch": 0.9995991983967936, + "grad_norm": 20.647828562186252, + "learning_rate": 8.433751450806779e-06, + "loss": 3.7955, + "step": 4988 + }, + { + "epoch": 0.9997995991983968, + "grad_norm": 22.05606836119026, + "learning_rate": 8.43290388362243e-06, + "loss": 4.5645, + "step": 4989 + }, + { + "epoch": 1.0, + "grad_norm": 30.60954591545004, + "learning_rate": 8.432056129786028e-06, + "loss": 4.4224, + "step": 4990 + }, + { + "epoch": 1.0002004008016032, + 
"grad_norm": 14.853077004772047, + "learning_rate": 8.43120818934367e-06, + "loss": 2.5481, + "step": 4991 + }, + { + "epoch": 1.0004008016032064, + "grad_norm": 45.35274923482358, + "learning_rate": 8.430360062341454e-06, + "loss": 3.5427, + "step": 4992 + }, + { + "epoch": 1.0006012024048097, + "grad_norm": 21.245306425804696, + "learning_rate": 8.429511748825502e-06, + "loss": 2.9207, + "step": 4993 + }, + { + "epoch": 1.0008016032064129, + "grad_norm": 30.9940237562021, + "learning_rate": 8.42866324884193e-06, + "loss": 2.8016, + "step": 4994 + }, + { + "epoch": 1.001002004008016, + "grad_norm": 21.786662557463195, + "learning_rate": 8.427814562436878e-06, + "loss": 2.799, + "step": 4995 + }, + { + "epoch": 1.0012024048096193, + "grad_norm": 14.718097849763467, + "learning_rate": 8.42696568965649e-06, + "loss": 2.867, + "step": 4996 + }, + { + "epoch": 1.0014028056112225, + "grad_norm": 14.397435088343427, + "learning_rate": 8.426116630546917e-06, + "loss": 2.8653, + "step": 4997 + }, + { + "epoch": 1.0016032064128257, + "grad_norm": 45.47130568530222, + "learning_rate": 8.425267385154326e-06, + "loss": 3.1524, + "step": 4998 + }, + { + "epoch": 1.001803607214429, + "grad_norm": 44.31919293040457, + "learning_rate": 8.424417953524889e-06, + "loss": 3.0532, + "step": 4999 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 21.601930323988327, + "learning_rate": 8.423568335704794e-06, + "loss": 3.0824, + "step": 5000 + }, + { + "epoch": 1.0022044088176352, + "grad_norm": 30.566495349838608, + "learning_rate": 8.422718531740236e-06, + "loss": 3.0311, + "step": 5001 + }, + { + "epoch": 1.0024048096192384, + "grad_norm": 22.701792641866234, + "learning_rate": 8.421868541677418e-06, + "loss": 2.9495, + "step": 5002 + }, + { + "epoch": 1.0026052104208416, + "grad_norm": 27.152184008799285, + "learning_rate": 8.421018365562558e-06, + "loss": 2.8981, + "step": 5003 + }, + { + "epoch": 1.0028056112224448, + "grad_norm": 23.92030913302502, + "learning_rate": 8.420168003441878e-06, + "loss": 2.6596, + "step": 5004 + }, + { + "epoch": 1.003006012024048, + "grad_norm": 15.842603969125882, + "learning_rate": 8.419317455361615e-06, + "loss": 2.7057, + "step": 5005 + }, + { + "epoch": 1.0032064128256513, + "grad_norm": 32.18391038866444, + "learning_rate": 8.418466721368014e-06, + "loss": 3.179, + "step": 5006 + }, + { + "epoch": 1.0034068136272545, + "grad_norm": 44.50583688769301, + "learning_rate": 8.417615801507333e-06, + "loss": 2.851, + "step": 5007 + }, + { + "epoch": 1.0036072144288577, + "grad_norm": 17.10320803565968, + "learning_rate": 8.416764695825835e-06, + "loss": 2.7464, + "step": 5008 + }, + { + "epoch": 1.003807615230461, + "grad_norm": 31.374801062774832, + "learning_rate": 8.415913404369798e-06, + "loss": 2.5172, + "step": 5009 + }, + { + "epoch": 1.0040080160320641, + "grad_norm": 31.023327626824212, + "learning_rate": 8.415061927185505e-06, + "loss": 3.6314, + "step": 5010 + }, + { + "epoch": 1.0042084168336673, + "grad_norm": 34.55172681348548, + "learning_rate": 8.414210264319254e-06, + "loss": 2.456, + "step": 5011 + }, + { + "epoch": 1.0044088176352706, + "grad_norm": 24.414052952303006, + "learning_rate": 8.413358415817352e-06, + "loss": 2.7683, + "step": 5012 + }, + { + "epoch": 1.0046092184368738, + "grad_norm": 21.442177444466793, + "learning_rate": 8.412506381726114e-06, + "loss": 2.7592, + "step": 5013 + }, + { + "epoch": 1.004809619238477, + "grad_norm": 18.935769825201117, + "learning_rate": 8.411654162091867e-06, + "loss": 2.8048, + "step": 5014 + }, + { + "epoch": 
1.0050100200400802, + "grad_norm": 41.558637954415325, + "learning_rate": 8.410801756960948e-06, + "loss": 2.5624, + "step": 5015 + }, + { + "epoch": 1.0052104208416834, + "grad_norm": 24.720924971647758, + "learning_rate": 8.409949166379702e-06, + "loss": 2.6439, + "step": 5016 + }, + { + "epoch": 1.0054108216432867, + "grad_norm": 22.496451386504724, + "learning_rate": 8.409096390394485e-06, + "loss": 2.4124, + "step": 5017 + }, + { + "epoch": 1.0056112224448899, + "grad_norm": 28.7273136973776, + "learning_rate": 8.408243429051667e-06, + "loss": 2.8208, + "step": 5018 + }, + { + "epoch": 1.005811623246493, + "grad_norm": 25.126207726302273, + "learning_rate": 8.407390282397621e-06, + "loss": 3.0546, + "step": 5019 + }, + { + "epoch": 1.006012024048096, + "grad_norm": 18.46011808182036, + "learning_rate": 8.406536950478737e-06, + "loss": 3.1552, + "step": 5020 + }, + { + "epoch": 1.0062124248496993, + "grad_norm": 29.80391481935227, + "learning_rate": 8.405683433341409e-06, + "loss": 2.7157, + "step": 5021 + }, + { + "epoch": 1.0064128256513025, + "grad_norm": 54.34626101490591, + "learning_rate": 8.404829731032047e-06, + "loss": 2.8533, + "step": 5022 + }, + { + "epoch": 1.0066132264529057, + "grad_norm": 26.604080494620113, + "learning_rate": 8.403975843597066e-06, + "loss": 2.8504, + "step": 5023 + }, + { + "epoch": 1.006813627254509, + "grad_norm": 51.91904299215086, + "learning_rate": 8.403121771082893e-06, + "loss": 2.7956, + "step": 5024 + }, + { + "epoch": 1.0070140280561122, + "grad_norm": 22.003025668699088, + "learning_rate": 8.402267513535966e-06, + "loss": 2.2935, + "step": 5025 + }, + { + "epoch": 1.0072144288577154, + "grad_norm": 23.385737691558315, + "learning_rate": 8.401413071002733e-06, + "loss": 2.5769, + "step": 5026 + }, + { + "epoch": 1.0074148296593186, + "grad_norm": 28.03130654978053, + "learning_rate": 8.400558443529649e-06, + "loss": 2.3313, + "step": 5027 + }, + { + "epoch": 1.0076152304609218, + "grad_norm": 37.35579348355006, + "learning_rate": 8.399703631163184e-06, + "loss": 2.7653, + "step": 5028 + }, + { + "epoch": 1.007815631262525, + "grad_norm": 36.253017962749645, + "learning_rate": 8.398848633949813e-06, + "loss": 2.7743, + "step": 5029 + }, + { + "epoch": 1.0080160320641283, + "grad_norm": 40.121943520631014, + "learning_rate": 8.397993451936026e-06, + "loss": 2.4295, + "step": 5030 + }, + { + "epoch": 1.0082164328657315, + "grad_norm": 28.817794163993973, + "learning_rate": 8.397138085168318e-06, + "loss": 2.6451, + "step": 5031 + }, + { + "epoch": 1.0084168336673347, + "grad_norm": 26.570410515364905, + "learning_rate": 8.396282533693199e-06, + "loss": 2.8588, + "step": 5032 + }, + { + "epoch": 1.008617234468938, + "grad_norm": 42.610678105559884, + "learning_rate": 8.395426797557184e-06, + "loss": 2.7386, + "step": 5033 + }, + { + "epoch": 1.0088176352705411, + "grad_norm": 34.20487560976935, + "learning_rate": 8.394570876806801e-06, + "loss": 3.0487, + "step": 5034 + }, + { + "epoch": 1.0090180360721444, + "grad_norm": 25.24123119154684, + "learning_rate": 8.39371477148859e-06, + "loss": 2.5873, + "step": 5035 + }, + { + "epoch": 1.0092184368737476, + "grad_norm": 29.059701240400877, + "learning_rate": 8.392858481649098e-06, + "loss": 2.6812, + "step": 5036 + }, + { + "epoch": 1.0094188376753508, + "grad_norm": 40.214407715372175, + "learning_rate": 8.392002007334881e-06, + "loss": 3.3527, + "step": 5037 + }, + { + "epoch": 1.009619238476954, + "grad_norm": 17.947746129820246, + "learning_rate": 8.391145348592506e-06, + "loss": 2.2269, + 
"step": 5038 + }, + { + "epoch": 1.0098196392785572, + "grad_norm": 39.23203007937664, + "learning_rate": 8.390288505468555e-06, + "loss": 3.5265, + "step": 5039 + }, + { + "epoch": 1.0100200400801602, + "grad_norm": 41.6821458982396, + "learning_rate": 8.389431478009614e-06, + "loss": 2.6068, + "step": 5040 + }, + { + "epoch": 1.0102204408817634, + "grad_norm": 19.006438833695753, + "learning_rate": 8.38857426626228e-06, + "loss": 2.4834, + "step": 5041 + }, + { + "epoch": 1.0104208416833667, + "grad_norm": 30.963188830462403, + "learning_rate": 8.38771687027316e-06, + "loss": 2.7923, + "step": 5042 + }, + { + "epoch": 1.0106212424849699, + "grad_norm": 22.793172537426848, + "learning_rate": 8.386859290088873e-06, + "loss": 2.8307, + "step": 5043 + }, + { + "epoch": 1.010821643286573, + "grad_norm": 23.16395436992974, + "learning_rate": 8.386001525756047e-06, + "loss": 2.9715, + "step": 5044 + }, + { + "epoch": 1.0110220440881763, + "grad_norm": 38.205081806352936, + "learning_rate": 8.38514357732132e-06, + "loss": 2.9346, + "step": 5045 + }, + { + "epoch": 1.0112224448897795, + "grad_norm": 21.19281838070626, + "learning_rate": 8.38428544483134e-06, + "loss": 2.4763, + "step": 5046 + }, + { + "epoch": 1.0114228456913827, + "grad_norm": 38.96196913320919, + "learning_rate": 8.383427128332766e-06, + "loss": 2.9777, + "step": 5047 + }, + { + "epoch": 1.011623246492986, + "grad_norm": 50.215293479109945, + "learning_rate": 8.382568627872263e-06, + "loss": 2.8919, + "step": 5048 + }, + { + "epoch": 1.0118236472945892, + "grad_norm": 18.88787799213783, + "learning_rate": 8.38170994349651e-06, + "loss": 2.3444, + "step": 5049 + }, + { + "epoch": 1.0120240480961924, + "grad_norm": 21.156056062841234, + "learning_rate": 8.380851075252198e-06, + "loss": 3.3515, + "step": 5050 + }, + { + "epoch": 1.0122244488977956, + "grad_norm": 18.708989523431267, + "learning_rate": 8.37999202318602e-06, + "loss": 2.8084, + "step": 5051 + }, + { + "epoch": 1.0124248496993988, + "grad_norm": 28.51837145395137, + "learning_rate": 8.379132787344689e-06, + "loss": 3.1615, + "step": 5052 + }, + { + "epoch": 1.012625250501002, + "grad_norm": 21.659019533537542, + "learning_rate": 8.378273367774916e-06, + "loss": 3.1657, + "step": 5053 + }, + { + "epoch": 1.0128256513026053, + "grad_norm": 23.603523182573284, + "learning_rate": 8.377413764523437e-06, + "loss": 3.0437, + "step": 5054 + }, + { + "epoch": 1.0130260521042085, + "grad_norm": 17.30581465289811, + "learning_rate": 8.376553977636985e-06, + "loss": 2.4159, + "step": 5055 + }, + { + "epoch": 1.0132264529058117, + "grad_norm": 19.228542326293, + "learning_rate": 8.375694007162308e-06, + "loss": 2.5664, + "step": 5056 + }, + { + "epoch": 1.013426853707415, + "grad_norm": 21.986339573340345, + "learning_rate": 8.374833853146165e-06, + "loss": 2.6347, + "step": 5057 + }, + { + "epoch": 1.0136272545090181, + "grad_norm": 27.852578956647697, + "learning_rate": 8.373973515635326e-06, + "loss": 3.094, + "step": 5058 + }, + { + "epoch": 1.0138276553106211, + "grad_norm": 26.407843738605823, + "learning_rate": 8.373112994676564e-06, + "loss": 2.6856, + "step": 5059 + }, + { + "epoch": 1.0140280561122244, + "grad_norm": 20.659226871138284, + "learning_rate": 8.37225229031667e-06, + "loss": 3.0185, + "step": 5060 + }, + { + "epoch": 1.0142284569138276, + "grad_norm": 28.681600865067463, + "learning_rate": 8.371391402602442e-06, + "loss": 2.6374, + "step": 5061 + }, + { + "epoch": 1.0144288577154308, + "grad_norm": 25.299515213809453, + "learning_rate": 
8.370530331580686e-06, + "loss": 2.5686, + "step": 5062 + }, + { + "epoch": 1.014629258517034, + "grad_norm": 32.31868640702981, + "learning_rate": 8.36966907729822e-06, + "loss": 3.2027, + "step": 5063 + }, + { + "epoch": 1.0148296593186372, + "grad_norm": 23.317655984792538, + "learning_rate": 8.368807639801874e-06, + "loss": 2.4137, + "step": 5064 + }, + { + "epoch": 1.0150300601202404, + "grad_norm": 24.155779850845413, + "learning_rate": 8.367946019138482e-06, + "loss": 3.3798, + "step": 5065 + }, + { + "epoch": 1.0152304609218437, + "grad_norm": 41.16711970076829, + "learning_rate": 8.367084215354894e-06, + "loss": 2.9475, + "step": 5066 + }, + { + "epoch": 1.0154308617234469, + "grad_norm": 29.268911067883955, + "learning_rate": 8.366222228497966e-06, + "loss": 2.9157, + "step": 5067 + }, + { + "epoch": 1.01563126252505, + "grad_norm": 49.01773034446143, + "learning_rate": 8.365360058614568e-06, + "loss": 2.6398, + "step": 5068 + }, + { + "epoch": 1.0158316633266533, + "grad_norm": 19.979483038834296, + "learning_rate": 8.364497705751575e-06, + "loss": 2.7102, + "step": 5069 + }, + { + "epoch": 1.0160320641282565, + "grad_norm": 22.11675133286392, + "learning_rate": 8.363635169955875e-06, + "loss": 2.7309, + "step": 5070 + }, + { + "epoch": 1.0162324649298597, + "grad_norm": 28.80738714964423, + "learning_rate": 8.362772451274367e-06, + "loss": 2.6226, + "step": 5071 + }, + { + "epoch": 1.016432865731463, + "grad_norm": 21.669187291334108, + "learning_rate": 8.361909549753958e-06, + "loss": 2.8605, + "step": 5072 + }, + { + "epoch": 1.0166332665330662, + "grad_norm": 45.638390834457866, + "learning_rate": 8.36104646544156e-06, + "loss": 3.1515, + "step": 5073 + }, + { + "epoch": 1.0168336673346694, + "grad_norm": 22.600533057686736, + "learning_rate": 8.360183198384107e-06, + "loss": 2.4975, + "step": 5074 + }, + { + "epoch": 1.0170340681362726, + "grad_norm": 24.616761679125325, + "learning_rate": 8.359319748628532e-06, + "loss": 2.7297, + "step": 5075 + }, + { + "epoch": 1.0172344689378758, + "grad_norm": 30.564833125283315, + "learning_rate": 8.358456116221785e-06, + "loss": 2.6274, + "step": 5076 + }, + { + "epoch": 1.017434869739479, + "grad_norm": 45.30310480535916, + "learning_rate": 8.357592301210822e-06, + "loss": 3.2208, + "step": 5077 + }, + { + "epoch": 1.0176352705410823, + "grad_norm": 22.267015868857257, + "learning_rate": 8.356728303642606e-06, + "loss": 2.4219, + "step": 5078 + }, + { + "epoch": 1.0178356713426853, + "grad_norm": 24.30778670001267, + "learning_rate": 8.35586412356412e-06, + "loss": 2.8579, + "step": 5079 + }, + { + "epoch": 1.0180360721442885, + "grad_norm": 39.932371150134706, + "learning_rate": 8.354999761022348e-06, + "loss": 2.9675, + "step": 5080 + }, + { + "epoch": 1.0182364729458917, + "grad_norm": 24.362272323741433, + "learning_rate": 8.354135216064286e-06, + "loss": 2.7175, + "step": 5081 + }, + { + "epoch": 1.018436873747495, + "grad_norm": 27.776968195000006, + "learning_rate": 8.35327048873694e-06, + "loss": 3.2026, + "step": 5082 + }, + { + "epoch": 1.0186372745490981, + "grad_norm": 29.982067830038286, + "learning_rate": 8.352405579087329e-06, + "loss": 2.9365, + "step": 5083 + }, + { + "epoch": 1.0188376753507014, + "grad_norm": 23.167149680548622, + "learning_rate": 8.351540487162477e-06, + "loss": 2.8094, + "step": 5084 + }, + { + "epoch": 1.0190380761523046, + "grad_norm": 36.62880424404684, + "learning_rate": 8.350675213009423e-06, + "loss": 2.88, + "step": 5085 + }, + { + "epoch": 1.0192384769539078, + "grad_norm": 
24.49991038435533, + "learning_rate": 8.34980975667521e-06, + "loss": 2.6482, + "step": 5086 + }, + { + "epoch": 1.019438877755511, + "grad_norm": 22.44704141268912, + "learning_rate": 8.348944118206896e-06, + "loss": 2.5392, + "step": 5087 + }, + { + "epoch": 1.0196392785571142, + "grad_norm": 33.97725772463389, + "learning_rate": 8.348078297651547e-06, + "loss": 2.4527, + "step": 5088 + }, + { + "epoch": 1.0198396793587174, + "grad_norm": 30.04691808918787, + "learning_rate": 8.34721229505624e-06, + "loss": 3.1364, + "step": 5089 + }, + { + "epoch": 1.0200400801603207, + "grad_norm": 28.33673586647659, + "learning_rate": 8.346346110468057e-06, + "loss": 2.6133, + "step": 5090 + }, + { + "epoch": 1.0202404809619239, + "grad_norm": 26.976652940799607, + "learning_rate": 8.3454797439341e-06, + "loss": 3.0646, + "step": 5091 + }, + { + "epoch": 1.020440881763527, + "grad_norm": 21.019814812282473, + "learning_rate": 8.344613195501467e-06, + "loss": 2.53, + "step": 5092 + }, + { + "epoch": 1.0206412825651303, + "grad_norm": 23.003656170913402, + "learning_rate": 8.34374646521728e-06, + "loss": 3.0431, + "step": 5093 + }, + { + "epoch": 1.0208416833667335, + "grad_norm": 20.343551573642497, + "learning_rate": 8.342879553128661e-06, + "loss": 2.8608, + "step": 5094 + }, + { + "epoch": 1.0210420841683367, + "grad_norm": 26.059097336802555, + "learning_rate": 8.342012459282745e-06, + "loss": 2.7511, + "step": 5095 + }, + { + "epoch": 1.02124248496994, + "grad_norm": 33.36293938587561, + "learning_rate": 8.341145183726681e-06, + "loss": 2.7573, + "step": 5096 + }, + { + "epoch": 1.0214428857715432, + "grad_norm": 31.707952848880055, + "learning_rate": 8.34027772650762e-06, + "loss": 3.4124, + "step": 5097 + }, + { + "epoch": 1.0216432865731462, + "grad_norm": 26.489425378746066, + "learning_rate": 8.339410087672727e-06, + "loss": 2.6946, + "step": 5098 + }, + { + "epoch": 1.0218436873747494, + "grad_norm": 26.127944397696677, + "learning_rate": 8.338542267269181e-06, + "loss": 2.339, + "step": 5099 + }, + { + "epoch": 1.0220440881763526, + "grad_norm": 25.237068256686875, + "learning_rate": 8.33767426534416e-06, + "loss": 2.3508, + "step": 5100 + }, + { + "epoch": 1.0222444889779558, + "grad_norm": 20.62502722921669, + "learning_rate": 8.336806081944865e-06, + "loss": 2.9596, + "step": 5101 + }, + { + "epoch": 1.022444889779559, + "grad_norm": 24.759083525594416, + "learning_rate": 8.335937717118497e-06, + "loss": 2.5893, + "step": 5102 + }, + { + "epoch": 1.0226452905811623, + "grad_norm": 59.74188903836617, + "learning_rate": 8.335069170912273e-06, + "loss": 2.4635, + "step": 5103 + }, + { + "epoch": 1.0228456913827655, + "grad_norm": 19.002112036161005, + "learning_rate": 8.334200443373412e-06, + "loss": 2.5272, + "step": 5104 + }, + { + "epoch": 1.0230460921843687, + "grad_norm": 20.96980819163826, + "learning_rate": 8.333331534549152e-06, + "loss": 3.106, + "step": 5105 + }, + { + "epoch": 1.023246492985972, + "grad_norm": 17.892308701932215, + "learning_rate": 8.332462444486737e-06, + "loss": 3.1485, + "step": 5106 + }, + { + "epoch": 1.0234468937875751, + "grad_norm": 54.4736190240775, + "learning_rate": 8.331593173233419e-06, + "loss": 3.2863, + "step": 5107 + }, + { + "epoch": 1.0236472945891784, + "grad_norm": 31.581538549793084, + "learning_rate": 8.330723720836464e-06, + "loss": 2.9279, + "step": 5108 + }, + { + "epoch": 1.0238476953907816, + "grad_norm": 26.253290072529918, + "learning_rate": 8.329854087343141e-06, + "loss": 3.2216, + "step": 5109 + }, + { + "epoch": 
1.0240480961923848, + "grad_norm": 28.312697193636474, + "learning_rate": 8.328984272800737e-06, + "loss": 3.0106, + "step": 5110 + }, + { + "epoch": 1.024248496993988, + "grad_norm": 41.87289702049704, + "learning_rate": 8.328114277256544e-06, + "loss": 3.4234, + "step": 5111 + }, + { + "epoch": 1.0244488977955912, + "grad_norm": 22.684996983021367, + "learning_rate": 8.327244100757867e-06, + "loss": 2.7557, + "step": 5112 + }, + { + "epoch": 1.0246492985971944, + "grad_norm": 20.602705105847686, + "learning_rate": 8.326373743352015e-06, + "loss": 2.8592, + "step": 5113 + }, + { + "epoch": 1.0248496993987977, + "grad_norm": 29.70562694008305, + "learning_rate": 8.32550320508631e-06, + "loss": 3.0574, + "step": 5114 + }, + { + "epoch": 1.0250501002004009, + "grad_norm": 23.0442145089829, + "learning_rate": 8.32463248600809e-06, + "loss": 2.6983, + "step": 5115 + }, + { + "epoch": 1.025250501002004, + "grad_norm": 26.26441774302839, + "learning_rate": 8.323761586164695e-06, + "loss": 3.0268, + "step": 5116 + }, + { + "epoch": 1.0254509018036073, + "grad_norm": 28.18855797673923, + "learning_rate": 8.322890505603474e-06, + "loss": 2.9358, + "step": 5117 + }, + { + "epoch": 1.0256513026052103, + "grad_norm": 21.790119755196677, + "learning_rate": 8.322019244371793e-06, + "loss": 2.2761, + "step": 5118 + }, + { + "epoch": 1.0258517034068135, + "grad_norm": 21.108504134890556, + "learning_rate": 8.32114780251702e-06, + "loss": 2.9557, + "step": 5119 + }, + { + "epoch": 1.0260521042084167, + "grad_norm": 23.141953417873854, + "learning_rate": 8.32027618008654e-06, + "loss": 2.6956, + "step": 5120 + }, + { + "epoch": 1.02625250501002, + "grad_norm": 18.439356727896957, + "learning_rate": 8.319404377127741e-06, + "loss": 2.7898, + "step": 5121 + }, + { + "epoch": 1.0264529058116232, + "grad_norm": 21.7642953903566, + "learning_rate": 8.318532393688028e-06, + "loss": 2.8322, + "step": 5122 + }, + { + "epoch": 1.0266533066132264, + "grad_norm": 21.501670886893525, + "learning_rate": 8.31766022981481e-06, + "loss": 3.0087, + "step": 5123 + }, + { + "epoch": 1.0268537074148296, + "grad_norm": 25.217475115836617, + "learning_rate": 8.316787885555507e-06, + "loss": 2.1127, + "step": 5124 + }, + { + "epoch": 1.0270541082164328, + "grad_norm": 25.591678695900335, + "learning_rate": 8.315915360957551e-06, + "loss": 2.9531, + "step": 5125 + }, + { + "epoch": 1.027254509018036, + "grad_norm": 26.14650368837022, + "learning_rate": 8.315042656068382e-06, + "loss": 2.9902, + "step": 5126 + }, + { + "epoch": 1.0274549098196393, + "grad_norm": 20.757553110632905, + "learning_rate": 8.31416977093545e-06, + "loss": 2.5916, + "step": 5127 + }, + { + "epoch": 1.0276553106212425, + "grad_norm": 22.983507983012473, + "learning_rate": 8.313296705606217e-06, + "loss": 2.4514, + "step": 5128 + }, + { + "epoch": 1.0278557114228457, + "grad_norm": 40.308599468268056, + "learning_rate": 8.31242346012815e-06, + "loss": 2.711, + "step": 5129 + }, + { + "epoch": 1.028056112224449, + "grad_norm": 19.73342597309436, + "learning_rate": 8.311550034548729e-06, + "loss": 2.5879, + "step": 5130 + }, + { + "epoch": 1.0282565130260521, + "grad_norm": 24.790108203457997, + "learning_rate": 8.310676428915447e-06, + "loss": 2.6018, + "step": 5131 + }, + { + "epoch": 1.0284569138276554, + "grad_norm": 31.31986819901823, + "learning_rate": 8.309802643275799e-06, + "loss": 2.7086, + "step": 5132 + }, + { + "epoch": 1.0286573146292586, + "grad_norm": 29.674811655355562, + "learning_rate": 8.308928677677294e-06, + "loss": 2.9332, + "step": 5133 
+ }, + { + "epoch": 1.0288577154308618, + "grad_norm": 25.553010717355804, + "learning_rate": 8.308054532167455e-06, + "loss": 2.8168, + "step": 5134 + }, + { + "epoch": 1.029058116232465, + "grad_norm": 21.54618274861838, + "learning_rate": 8.307180206793808e-06, + "loss": 2.7498, + "step": 5135 + }, + { + "epoch": 1.0292585170340682, + "grad_norm": 27.276525723073046, + "learning_rate": 8.30630570160389e-06, + "loss": 2.3899, + "step": 5136 + }, + { + "epoch": 1.0294589178356714, + "grad_norm": 25.120616700954177, + "learning_rate": 8.30543101664525e-06, + "loss": 3.0806, + "step": 5137 + }, + { + "epoch": 1.0296593186372744, + "grad_norm": 21.194073290997558, + "learning_rate": 8.304556151965446e-06, + "loss": 2.1218, + "step": 5138 + }, + { + "epoch": 1.0298597194388777, + "grad_norm": 20.99619853613815, + "learning_rate": 8.303681107612047e-06, + "loss": 2.6458, + "step": 5139 + }, + { + "epoch": 1.0300601202404809, + "grad_norm": 29.57770131350949, + "learning_rate": 8.30280588363263e-06, + "loss": 2.5372, + "step": 5140 + }, + { + "epoch": 1.030260521042084, + "grad_norm": 17.175185689687467, + "learning_rate": 8.30193048007478e-06, + "loss": 2.357, + "step": 5141 + }, + { + "epoch": 1.0304609218436873, + "grad_norm": 20.318785601191397, + "learning_rate": 8.301054896986099e-06, + "loss": 2.2112, + "step": 5142 + }, + { + "epoch": 1.0306613226452905, + "grad_norm": 29.574702429910797, + "learning_rate": 8.300179134414188e-06, + "loss": 2.2013, + "step": 5143 + }, + { + "epoch": 1.0308617234468938, + "grad_norm": 35.841300796169385, + "learning_rate": 8.299303192406666e-06, + "loss": 3.1922, + "step": 5144 + }, + { + "epoch": 1.031062124248497, + "grad_norm": 39.06549818918743, + "learning_rate": 8.298427071011158e-06, + "loss": 2.7173, + "step": 5145 + }, + { + "epoch": 1.0312625250501002, + "grad_norm": 25.02411992819668, + "learning_rate": 8.297550770275304e-06, + "loss": 2.9605, + "step": 5146 + }, + { + "epoch": 1.0314629258517034, + "grad_norm": 30.01102766641819, + "learning_rate": 8.296674290246746e-06, + "loss": 3.3531, + "step": 5147 + }, + { + "epoch": 1.0316633266533066, + "grad_norm": 22.41677742142192, + "learning_rate": 8.29579763097314e-06, + "loss": 2.6532, + "step": 5148 + }, + { + "epoch": 1.0318637274549098, + "grad_norm": 21.941300893068828, + "learning_rate": 8.294920792502152e-06, + "loss": 2.8093, + "step": 5149 + }, + { + "epoch": 1.032064128256513, + "grad_norm": 31.915991085699233, + "learning_rate": 8.294043774881458e-06, + "loss": 2.7188, + "step": 5150 + }, + { + "epoch": 1.0322645290581163, + "grad_norm": 43.50712018679717, + "learning_rate": 8.29316657815874e-06, + "loss": 3.4827, + "step": 5151 + }, + { + "epoch": 1.0324649298597195, + "grad_norm": 26.1664238388471, + "learning_rate": 8.292289202381694e-06, + "loss": 2.5401, + "step": 5152 + }, + { + "epoch": 1.0326653306613227, + "grad_norm": 44.75842457045321, + "learning_rate": 8.291411647598025e-06, + "loss": 2.7346, + "step": 5153 + }, + { + "epoch": 1.032865731462926, + "grad_norm": 46.02581369692365, + "learning_rate": 8.290533913855446e-06, + "loss": 2.5161, + "step": 5154 + }, + { + "epoch": 1.0330661322645291, + "grad_norm": 40.62961455775569, + "learning_rate": 8.28965600120168e-06, + "loss": 2.9742, + "step": 5155 + }, + { + "epoch": 1.0332665330661324, + "grad_norm": 23.603097087703713, + "learning_rate": 8.28877790968446e-06, + "loss": 2.8151, + "step": 5156 + }, + { + "epoch": 1.0334669338677354, + "grad_norm": 28.29550267496414, + "learning_rate": 8.287899639351534e-06, + "loss": 
2.8143, + "step": 5157 + }, + { + "epoch": 1.0336673346693386, + "grad_norm": 27.93272763672245, + "learning_rate": 8.287021190250649e-06, + "loss": 2.8936, + "step": 5158 + }, + { + "epoch": 1.0338677354709418, + "grad_norm": 22.310151548864198, + "learning_rate": 8.286142562429569e-06, + "loss": 2.79, + "step": 5159 + }, + { + "epoch": 1.034068136272545, + "grad_norm": 22.521815888659205, + "learning_rate": 8.285263755936069e-06, + "loss": 2.5829, + "step": 5160 + }, + { + "epoch": 1.0342685370741482, + "grad_norm": 27.762428528304635, + "learning_rate": 8.284384770817928e-06, + "loss": 3.1428, + "step": 5161 + }, + { + "epoch": 1.0344689378757514, + "grad_norm": 32.68430813447294, + "learning_rate": 8.283505607122938e-06, + "loss": 2.376, + "step": 5162 + }, + { + "epoch": 1.0346693386773547, + "grad_norm": 35.083322541486936, + "learning_rate": 8.282626264898902e-06, + "loss": 3.2827, + "step": 5163 + }, + { + "epoch": 1.0348697394789579, + "grad_norm": 19.37877724700553, + "learning_rate": 8.28174674419363e-06, + "loss": 2.5311, + "step": 5164 + }, + { + "epoch": 1.035070140280561, + "grad_norm": 24.066925863569857, + "learning_rate": 8.280867045054945e-06, + "loss": 2.6304, + "step": 5165 + }, + { + "epoch": 1.0352705410821643, + "grad_norm": 28.70860147827555, + "learning_rate": 8.279987167530676e-06, + "loss": 3.001, + "step": 5166 + }, + { + "epoch": 1.0354709418837675, + "grad_norm": 22.752027884687447, + "learning_rate": 8.27910711166866e-06, + "loss": 2.3597, + "step": 5167 + }, + { + "epoch": 1.0356713426853708, + "grad_norm": 26.58778880923881, + "learning_rate": 8.278226877516753e-06, + "loss": 3.1826, + "step": 5168 + }, + { + "epoch": 1.035871743486974, + "grad_norm": 19.547239238866386, + "learning_rate": 8.277346465122809e-06, + "loss": 2.7124, + "step": 5169 + }, + { + "epoch": 1.0360721442885772, + "grad_norm": 25.12982872100098, + "learning_rate": 8.276465874534701e-06, + "loss": 3.5033, + "step": 5170 + }, + { + "epoch": 1.0362725450901804, + "grad_norm": 29.603110406321147, + "learning_rate": 8.275585105800309e-06, + "loss": 2.7252, + "step": 5171 + }, + { + "epoch": 1.0364729458917836, + "grad_norm": 18.44938222335229, + "learning_rate": 8.274704158967516e-06, + "loss": 2.6692, + "step": 5172 + }, + { + "epoch": 1.0366733466933868, + "grad_norm": 28.64304616675825, + "learning_rate": 8.273823034084225e-06, + "loss": 2.7093, + "step": 5173 + }, + { + "epoch": 1.03687374749499, + "grad_norm": 28.58480517975179, + "learning_rate": 8.272941731198345e-06, + "loss": 2.7845, + "step": 5174 + }, + { + "epoch": 1.0370741482965933, + "grad_norm": 39.72015830063694, + "learning_rate": 8.27206025035779e-06, + "loss": 2.5542, + "step": 5175 + }, + { + "epoch": 1.0372745490981965, + "grad_norm": 24.642363517808935, + "learning_rate": 8.271178591610488e-06, + "loss": 3.1632, + "step": 5176 + }, + { + "epoch": 1.0374749498997995, + "grad_norm": 19.25004136131166, + "learning_rate": 8.27029675500438e-06, + "loss": 3.3403, + "step": 5177 + }, + { + "epoch": 1.0376753507014027, + "grad_norm": 22.852524877147534, + "learning_rate": 8.269414740587409e-06, + "loss": 2.975, + "step": 5178 + }, + { + "epoch": 1.037875751503006, + "grad_norm": 34.362271002161314, + "learning_rate": 8.268532548407533e-06, + "loss": 3.1929, + "step": 5179 + }, + { + "epoch": 1.0380761523046091, + "grad_norm": 27.99380546590251, + "learning_rate": 8.267650178512716e-06, + "loss": 2.7683, + "step": 5180 + }, + { + "epoch": 1.0382765531062124, + "grad_norm": 33.18653877424888, + "learning_rate": 
8.266767630950936e-06, + "loss": 2.1961, + "step": 5181 + }, + { + "epoch": 1.0384769539078156, + "grad_norm": 27.455382940457906, + "learning_rate": 8.265884905770177e-06, + "loss": 2.7093, + "step": 5182 + }, + { + "epoch": 1.0386773547094188, + "grad_norm": 22.891411762813327, + "learning_rate": 8.265002003018436e-06, + "loss": 2.5336, + "step": 5183 + }, + { + "epoch": 1.038877755511022, + "grad_norm": 29.364165293355803, + "learning_rate": 8.264118922743715e-06, + "loss": 3.2596, + "step": 5184 + }, + { + "epoch": 1.0390781563126252, + "grad_norm": 20.024579745128722, + "learning_rate": 8.263235664994032e-06, + "loss": 2.4035, + "step": 5185 + }, + { + "epoch": 1.0392785571142285, + "grad_norm": 37.27967110086838, + "learning_rate": 8.262352229817407e-06, + "loss": 2.7786, + "step": 5186 + }, + { + "epoch": 1.0394789579158317, + "grad_norm": 24.556291784539066, + "learning_rate": 8.261468617261877e-06, + "loss": 2.3987, + "step": 5187 + }, + { + "epoch": 1.0396793587174349, + "grad_norm": 58.427900952925505, + "learning_rate": 8.260584827375484e-06, + "loss": 2.3127, + "step": 5188 + }, + { + "epoch": 1.039879759519038, + "grad_norm": 22.229600572396148, + "learning_rate": 8.25970086020628e-06, + "loss": 2.7784, + "step": 5189 + }, + { + "epoch": 1.0400801603206413, + "grad_norm": 19.023577216300296, + "learning_rate": 8.25881671580233e-06, + "loss": 2.6555, + "step": 5190 + }, + { + "epoch": 1.0402805611222445, + "grad_norm": 31.430816958991514, + "learning_rate": 8.257932394211702e-06, + "loss": 2.9755, + "step": 5191 + }, + { + "epoch": 1.0404809619238478, + "grad_norm": 39.2771025656407, + "learning_rate": 8.257047895482482e-06, + "loss": 3.2975, + "step": 5192 + }, + { + "epoch": 1.040681362725451, + "grad_norm": 21.45622711854613, + "learning_rate": 8.256163219662761e-06, + "loss": 2.5236, + "step": 5193 + }, + { + "epoch": 1.0408817635270542, + "grad_norm": 23.099356455873462, + "learning_rate": 8.255278366800637e-06, + "loss": 2.0928, + "step": 5194 + }, + { + "epoch": 1.0410821643286574, + "grad_norm": 26.413223758312856, + "learning_rate": 8.254393336944224e-06, + "loss": 3.2049, + "step": 5195 + }, + { + "epoch": 1.0412825651302606, + "grad_norm": 18.348042775293443, + "learning_rate": 8.253508130141644e-06, + "loss": 3.0457, + "step": 5196 + }, + { + "epoch": 1.0414829659318636, + "grad_norm": 19.38196631124915, + "learning_rate": 8.252622746441022e-06, + "loss": 2.9253, + "step": 5197 + }, + { + "epoch": 1.0416833667334668, + "grad_norm": 24.98672972363331, + "learning_rate": 8.251737185890498e-06, + "loss": 2.5924, + "step": 5198 + }, + { + "epoch": 1.04188376753507, + "grad_norm": 24.653299073816278, + "learning_rate": 8.250851448538227e-06, + "loss": 2.3911, + "step": 5199 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 23.844116007697885, + "learning_rate": 8.24996553443236e-06, + "loss": 2.083, + "step": 5200 + }, + { + "epoch": 1.0422845691382765, + "grad_norm": 46.63155324904206, + "learning_rate": 8.249079443621073e-06, + "loss": 2.4824, + "step": 5201 + }, + { + "epoch": 1.0424849699398797, + "grad_norm": 28.19511339105466, + "learning_rate": 8.248193176152539e-06, + "loss": 2.9991, + "step": 5202 + }, + { + "epoch": 1.042685370741483, + "grad_norm": 23.211621410842074, + "learning_rate": 8.247306732074946e-06, + "loss": 2.5702, + "step": 5203 + }, + { + "epoch": 1.0428857715430861, + "grad_norm": 26.19055227727981, + "learning_rate": 8.246420111436496e-06, + "loss": 2.6642, + "step": 5204 + }, + { + "epoch": 1.0430861723446894, + "grad_norm": 
32.88898450916465, + "learning_rate": 8.24553331428539e-06, + "loss": 2.9528, + "step": 5205 + }, + { + "epoch": 1.0432865731462926, + "grad_norm": 29.316959950374684, + "learning_rate": 8.244646340669848e-06, + "loss": 2.9462, + "step": 5206 + }, + { + "epoch": 1.0434869739478958, + "grad_norm": 22.22202613526673, + "learning_rate": 8.243759190638094e-06, + "loss": 2.8128, + "step": 5207 + }, + { + "epoch": 1.043687374749499, + "grad_norm": 26.732749282745747, + "learning_rate": 8.242871864238364e-06, + "loss": 2.8053, + "step": 5208 + }, + { + "epoch": 1.0438877755511022, + "grad_norm": 24.889019412168544, + "learning_rate": 8.241984361518905e-06, + "loss": 2.4724, + "step": 5209 + }, + { + "epoch": 1.0440881763527055, + "grad_norm": 32.57778601667877, + "learning_rate": 8.241096682527969e-06, + "loss": 3.1428, + "step": 5210 + }, + { + "epoch": 1.0442885771543087, + "grad_norm": 23.22676725162041, + "learning_rate": 8.240208827313825e-06, + "loss": 3.0931, + "step": 5211 + }, + { + "epoch": 1.044488977955912, + "grad_norm": 24.689971802114968, + "learning_rate": 8.239320795924742e-06, + "loss": 3.1904, + "step": 5212 + }, + { + "epoch": 1.044689378757515, + "grad_norm": 24.52284331743427, + "learning_rate": 8.238432588409004e-06, + "loss": 2.7559, + "step": 5213 + }, + { + "epoch": 1.0448897795591183, + "grad_norm": 31.45851485233932, + "learning_rate": 8.237544204814909e-06, + "loss": 2.6514, + "step": 5214 + }, + { + "epoch": 1.0450901803607215, + "grad_norm": 30.80025294871854, + "learning_rate": 8.236655645190755e-06, + "loss": 3.0494, + "step": 5215 + }, + { + "epoch": 1.0452905811623245, + "grad_norm": 20.62717218589092, + "learning_rate": 8.235766909584857e-06, + "loss": 2.7967, + "step": 5216 + }, + { + "epoch": 1.0454909819639278, + "grad_norm": 45.19944414358682, + "learning_rate": 8.234877998045534e-06, + "loss": 3.3861, + "step": 5217 + }, + { + "epoch": 1.045691382765531, + "grad_norm": 19.336537688123606, + "learning_rate": 8.233988910621122e-06, + "loss": 2.8922, + "step": 5218 + }, + { + "epoch": 1.0458917835671342, + "grad_norm": 21.120998377654963, + "learning_rate": 8.233099647359956e-06, + "loss": 3.0344, + "step": 5219 + }, + { + "epoch": 1.0460921843687374, + "grad_norm": 19.869574930637224, + "learning_rate": 8.232210208310389e-06, + "loss": 2.5345, + "step": 5220 + }, + { + "epoch": 1.0462925851703406, + "grad_norm": 38.897345909207104, + "learning_rate": 8.231320593520785e-06, + "loss": 2.7386, + "step": 5221 + }, + { + "epoch": 1.0464929859719438, + "grad_norm": 20.199379305657637, + "learning_rate": 8.230430803039508e-06, + "loss": 2.726, + "step": 5222 + }, + { + "epoch": 1.046693386773547, + "grad_norm": 18.110687422192164, + "learning_rate": 8.229540836914941e-06, + "loss": 2.4444, + "step": 5223 + }, + { + "epoch": 1.0468937875751503, + "grad_norm": 22.24012020254099, + "learning_rate": 8.228650695195472e-06, + "loss": 3.1534, + "step": 5224 + }, + { + "epoch": 1.0470941883767535, + "grad_norm": 29.101556640336977, + "learning_rate": 8.227760377929498e-06, + "loss": 3.334, + "step": 5225 + }, + { + "epoch": 1.0472945891783567, + "grad_norm": 27.696208303238407, + "learning_rate": 8.226869885165428e-06, + "loss": 3.0751, + "step": 5226 + }, + { + "epoch": 1.04749498997996, + "grad_norm": 16.05850159141314, + "learning_rate": 8.225979216951678e-06, + "loss": 2.2797, + "step": 5227 + }, + { + "epoch": 1.0476953907815632, + "grad_norm": 32.78395488945399, + "learning_rate": 8.225088373336679e-06, + "loss": 3.4579, + "step": 5228 + }, + { + "epoch": 
1.0478957915831664, + "grad_norm": 106.4891079876635, + "learning_rate": 8.22419735436886e-06, + "loss": 2.441, + "step": 5229 + }, + { + "epoch": 1.0480961923847696, + "grad_norm": 31.051590040381846, + "learning_rate": 8.223306160096674e-06, + "loss": 2.842, + "step": 5230 + }, + { + "epoch": 1.0482965931863728, + "grad_norm": 25.152907028996324, + "learning_rate": 8.222414790568573e-06, + "loss": 3.1218, + "step": 5231 + }, + { + "epoch": 1.048496993987976, + "grad_norm": 20.005506898024393, + "learning_rate": 8.221523245833024e-06, + "loss": 2.9573, + "step": 5232 + }, + { + "epoch": 1.0486973947895792, + "grad_norm": 29.920443082251683, + "learning_rate": 8.2206315259385e-06, + "loss": 2.072, + "step": 5233 + }, + { + "epoch": 1.0488977955911825, + "grad_norm": 43.20482511753738, + "learning_rate": 8.219739630933487e-06, + "loss": 2.7838, + "step": 5234 + }, + { + "epoch": 1.0490981963927857, + "grad_norm": 25.14469373221556, + "learning_rate": 8.218847560866476e-06, + "loss": 2.852, + "step": 5235 + }, + { + "epoch": 1.0492985971943887, + "grad_norm": 29.390463075024957, + "learning_rate": 8.217955315785973e-06, + "loss": 3.0351, + "step": 5236 + }, + { + "epoch": 1.049498997995992, + "grad_norm": 27.147479622637157, + "learning_rate": 8.21706289574049e-06, + "loss": 2.8745, + "step": 5237 + }, + { + "epoch": 1.049699398797595, + "grad_norm": 24.508254107866478, + "learning_rate": 8.216170300778547e-06, + "loss": 2.6135, + "step": 5238 + }, + { + "epoch": 1.0498997995991983, + "grad_norm": 52.374768160903095, + "learning_rate": 8.215277530948677e-06, + "loss": 2.9838, + "step": 5239 + }, + { + "epoch": 1.0501002004008015, + "grad_norm": 34.668894348069244, + "learning_rate": 8.214384586299423e-06, + "loss": 2.3288, + "step": 5240 + }, + { + "epoch": 1.0503006012024048, + "grad_norm": 22.907457468434348, + "learning_rate": 8.213491466879333e-06, + "loss": 1.8573, + "step": 5241 + }, + { + "epoch": 1.050501002004008, + "grad_norm": 25.48264868047143, + "learning_rate": 8.212598172736968e-06, + "loss": 2.9686, + "step": 5242 + }, + { + "epoch": 1.0507014028056112, + "grad_norm": 43.61306739590025, + "learning_rate": 8.2117047039209e-06, + "loss": 2.4369, + "step": 5243 + }, + { + "epoch": 1.0509018036072144, + "grad_norm": 22.765625981051098, + "learning_rate": 8.210811060479704e-06, + "loss": 2.6829, + "step": 5244 + }, + { + "epoch": 1.0511022044088176, + "grad_norm": 27.682650774775745, + "learning_rate": 8.209917242461974e-06, + "loss": 2.9289, + "step": 5245 + }, + { + "epoch": 1.0513026052104208, + "grad_norm": 37.68649829596218, + "learning_rate": 8.209023249916302e-06, + "loss": 3.0976, + "step": 5246 + }, + { + "epoch": 1.051503006012024, + "grad_norm": 24.47813411440297, + "learning_rate": 8.208129082891301e-06, + "loss": 2.5954, + "step": 5247 + }, + { + "epoch": 1.0517034068136273, + "grad_norm": 17.074107010938253, + "learning_rate": 8.207234741435586e-06, + "loss": 2.4602, + "step": 5248 + }, + { + "epoch": 1.0519038076152305, + "grad_norm": 22.410825253177993, + "learning_rate": 8.206340225597782e-06, + "loss": 2.7668, + "step": 5249 + }, + { + "epoch": 1.0521042084168337, + "grad_norm": 23.65854756657058, + "learning_rate": 8.205445535426528e-06, + "loss": 2.8167, + "step": 5250 + }, + { + "epoch": 1.052304609218437, + "grad_norm": 19.676918678294903, + "learning_rate": 8.204550670970469e-06, + "loss": 2.6539, + "step": 5251 + }, + { + "epoch": 1.0525050100200402, + "grad_norm": 28.243290356341383, + "learning_rate": 8.20365563227826e-06, + "loss": 3.0666, + "step": 5252 
+ }, + { + "epoch": 1.0527054108216434, + "grad_norm": 31.207017218787566, + "learning_rate": 8.202760419398565e-06, + "loss": 2.9417, + "step": 5253 + }, + { + "epoch": 1.0529058116232466, + "grad_norm": 29.146435485039007, + "learning_rate": 8.201865032380058e-06, + "loss": 2.5775, + "step": 5254 + }, + { + "epoch": 1.0531062124248498, + "grad_norm": 23.219990688421383, + "learning_rate": 8.200969471271422e-06, + "loss": 2.9347, + "step": 5255 + }, + { + "epoch": 1.0533066132264528, + "grad_norm": 38.78311277390967, + "learning_rate": 8.200073736121353e-06, + "loss": 2.7641, + "step": 5256 + }, + { + "epoch": 1.053507014028056, + "grad_norm": 21.961139695354817, + "learning_rate": 8.199177826978548e-06, + "loss": 2.5989, + "step": 5257 + }, + { + "epoch": 1.0537074148296592, + "grad_norm": 25.836640082935656, + "learning_rate": 8.198281743891724e-06, + "loss": 3.0968, + "step": 5258 + }, + { + "epoch": 1.0539078156312625, + "grad_norm": 15.3582491070151, + "learning_rate": 8.197385486909601e-06, + "loss": 2.6448, + "step": 5259 + }, + { + "epoch": 1.0541082164328657, + "grad_norm": 48.51913129978052, + "learning_rate": 8.19648905608091e-06, + "loss": 2.9038, + "step": 5260 + }, + { + "epoch": 1.054308617234469, + "grad_norm": 28.123434175278998, + "learning_rate": 8.195592451454388e-06, + "loss": 2.2297, + "step": 5261 + }, + { + "epoch": 1.054509018036072, + "grad_norm": 47.811689145498924, + "learning_rate": 8.19469567307879e-06, + "loss": 2.8673, + "step": 5262 + }, + { + "epoch": 1.0547094188376753, + "grad_norm": 30.145036628175532, + "learning_rate": 8.19379872100287e-06, + "loss": 2.4902, + "step": 5263 + }, + { + "epoch": 1.0549098196392785, + "grad_norm": 20.49036633311436, + "learning_rate": 8.192901595275399e-06, + "loss": 2.6677, + "step": 5264 + }, + { + "epoch": 1.0551102204408818, + "grad_norm": 30.019018042229636, + "learning_rate": 8.192004295945158e-06, + "loss": 3.0357, + "step": 5265 + }, + { + "epoch": 1.055310621242485, + "grad_norm": 26.06674363389629, + "learning_rate": 8.191106823060929e-06, + "loss": 3.0169, + "step": 5266 + }, + { + "epoch": 1.0555110220440882, + "grad_norm": 27.274037850485715, + "learning_rate": 8.190209176671513e-06, + "loss": 2.2915, + "step": 5267 + }, + { + "epoch": 1.0557114228456914, + "grad_norm": 21.80445411122128, + "learning_rate": 8.189311356825714e-06, + "loss": 2.8891, + "step": 5268 + }, + { + "epoch": 1.0559118236472946, + "grad_norm": 28.608035690063943, + "learning_rate": 8.18841336357235e-06, + "loss": 2.4283, + "step": 5269 + }, + { + "epoch": 1.0561122244488979, + "grad_norm": 31.85908181888426, + "learning_rate": 8.187515196960243e-06, + "loss": 2.7652, + "step": 5270 + }, + { + "epoch": 1.056312625250501, + "grad_norm": 15.947692667362391, + "learning_rate": 8.18661685703823e-06, + "loss": 2.4572, + "step": 5271 + }, + { + "epoch": 1.0565130260521043, + "grad_norm": 20.474265610651567, + "learning_rate": 8.185718343855156e-06, + "loss": 2.2746, + "step": 5272 + }, + { + "epoch": 1.0567134268537075, + "grad_norm": 24.152021399524592, + "learning_rate": 8.18481965745987e-06, + "loss": 2.8146, + "step": 5273 + }, + { + "epoch": 1.0569138276553107, + "grad_norm": 30.043275744205445, + "learning_rate": 8.183920797901241e-06, + "loss": 2.9314, + "step": 5274 + }, + { + "epoch": 1.0571142284569137, + "grad_norm": 25.301089051061734, + "learning_rate": 8.183021765228138e-06, + "loss": 2.6784, + "step": 5275 + }, + { + "epoch": 1.057314629258517, + "grad_norm": 22.88506562475349, + "learning_rate": 8.182122559489442e-06, + 
"loss": 3.0611, + "step": 5276 + }, + { + "epoch": 1.0575150300601202, + "grad_norm": 22.597945333643942, + "learning_rate": 8.181223180734046e-06, + "loss": 2.7139, + "step": 5277 + }, + { + "epoch": 1.0577154308617234, + "grad_norm": 23.23862664422533, + "learning_rate": 8.180323629010849e-06, + "loss": 2.7872, + "step": 5278 + }, + { + "epoch": 1.0579158316633266, + "grad_norm": 24.15920090893879, + "learning_rate": 8.179423904368761e-06, + "loss": 2.8694, + "step": 5279 + }, + { + "epoch": 1.0581162324649298, + "grad_norm": 23.525597142484802, + "learning_rate": 8.178524006856703e-06, + "loss": 2.3774, + "step": 5280 + }, + { + "epoch": 1.058316633266533, + "grad_norm": 35.31969434496682, + "learning_rate": 8.177623936523605e-06, + "loss": 2.9529, + "step": 5281 + }, + { + "epoch": 1.0585170340681362, + "grad_norm": 27.40964368824505, + "learning_rate": 8.1767236934184e-06, + "loss": 2.3873, + "step": 5282 + }, + { + "epoch": 1.0587174348697395, + "grad_norm": 42.364332803899806, + "learning_rate": 8.175823277590039e-06, + "loss": 2.386, + "step": 5283 + }, + { + "epoch": 1.0589178356713427, + "grad_norm": 22.16632949443541, + "learning_rate": 8.174922689087478e-06, + "loss": 3.0722, + "step": 5284 + }, + { + "epoch": 1.059118236472946, + "grad_norm": 20.27746901394688, + "learning_rate": 8.174021927959685e-06, + "loss": 2.5022, + "step": 5285 + }, + { + "epoch": 1.0593186372745491, + "grad_norm": 33.10892596018695, + "learning_rate": 8.173120994255635e-06, + "loss": 2.393, + "step": 5286 + }, + { + "epoch": 1.0595190380761523, + "grad_norm": 22.54232090788708, + "learning_rate": 8.172219888024312e-06, + "loss": 2.6464, + "step": 5287 + }, + { + "epoch": 1.0597194388777555, + "grad_norm": 42.609527404263105, + "learning_rate": 8.171318609314711e-06, + "loss": 2.6073, + "step": 5288 + }, + { + "epoch": 1.0599198396793588, + "grad_norm": 24.898584926716747, + "learning_rate": 8.170417158175836e-06, + "loss": 3.0902, + "step": 5289 + }, + { + "epoch": 1.060120240480962, + "grad_norm": 21.437092607897352, + "learning_rate": 8.1695155346567e-06, + "loss": 2.1529, + "step": 5290 + }, + { + "epoch": 1.0603206412825652, + "grad_norm": 21.939372035773985, + "learning_rate": 8.168613738806325e-06, + "loss": 2.7486, + "step": 5291 + }, + { + "epoch": 1.0605210420841684, + "grad_norm": 32.12686412897578, + "learning_rate": 8.167711770673743e-06, + "loss": 2.9636, + "step": 5292 + }, + { + "epoch": 1.0607214428857716, + "grad_norm": 36.827851681788246, + "learning_rate": 8.166809630307998e-06, + "loss": 2.6233, + "step": 5293 + }, + { + "epoch": 1.0609218436873749, + "grad_norm": 29.177947554056985, + "learning_rate": 8.165907317758139e-06, + "loss": 2.9204, + "step": 5294 + }, + { + "epoch": 1.0611222444889779, + "grad_norm": 18.266874109772036, + "learning_rate": 8.165004833073224e-06, + "loss": 2.3926, + "step": 5295 + }, + { + "epoch": 1.061322645290581, + "grad_norm": 26.300263040892723, + "learning_rate": 8.164102176302326e-06, + "loss": 2.8626, + "step": 5296 + }, + { + "epoch": 1.0615230460921843, + "grad_norm": 25.993749825604755, + "learning_rate": 8.163199347494522e-06, + "loss": 3.2284, + "step": 5297 + }, + { + "epoch": 1.0617234468937875, + "grad_norm": 23.503938248457505, + "learning_rate": 8.162296346698899e-06, + "loss": 2.4667, + "step": 5298 + }, + { + "epoch": 1.0619238476953907, + "grad_norm": 19.057986297146865, + "learning_rate": 8.161393173964555e-06, + "loss": 2.9189, + "step": 5299 + }, + { + "epoch": 1.062124248496994, + "grad_norm": 20.340484004194007, + "learning_rate": 
8.160489829340598e-06, + "loss": 2.0937, + "step": 5300 + }, + { + "epoch": 1.0623246492985972, + "grad_norm": 25.103517132005624, + "learning_rate": 8.159586312876145e-06, + "loss": 2.5391, + "step": 5301 + }, + { + "epoch": 1.0625250501002004, + "grad_norm": 26.93824976230825, + "learning_rate": 8.15868262462032e-06, + "loss": 3.1554, + "step": 5302 + }, + { + "epoch": 1.0627254509018036, + "grad_norm": 20.916472396626407, + "learning_rate": 8.157778764622257e-06, + "loss": 2.2671, + "step": 5303 + }, + { + "epoch": 1.0629258517034068, + "grad_norm": 41.16264208880769, + "learning_rate": 8.156874732931101e-06, + "loss": 2.9054, + "step": 5304 + }, + { + "epoch": 1.06312625250501, + "grad_norm": 18.26297962163721, + "learning_rate": 8.155970529596008e-06, + "loss": 2.7359, + "step": 5305 + }, + { + "epoch": 1.0633266533066132, + "grad_norm": 36.38330045115238, + "learning_rate": 8.155066154666135e-06, + "loss": 3.5914, + "step": 5306 + }, + { + "epoch": 1.0635270541082165, + "grad_norm": 34.03825724777312, + "learning_rate": 8.15416160819066e-06, + "loss": 3.2558, + "step": 5307 + }, + { + "epoch": 1.0637274549098197, + "grad_norm": 29.76591723003856, + "learning_rate": 8.153256890218763e-06, + "loss": 2.5827, + "step": 5308 + }, + { + "epoch": 1.063927855711423, + "grad_norm": 24.375348677164347, + "learning_rate": 8.152352000799635e-06, + "loss": 2.8226, + "step": 5309 + }, + { + "epoch": 1.0641282565130261, + "grad_norm": 32.14982329003735, + "learning_rate": 8.151446939982472e-06, + "loss": 2.4668, + "step": 5310 + }, + { + "epoch": 1.0643286573146293, + "grad_norm": 18.899187375827733, + "learning_rate": 8.15054170781649e-06, + "loss": 2.752, + "step": 5311 + }, + { + "epoch": 1.0645290581162326, + "grad_norm": 22.18845688019073, + "learning_rate": 8.149636304350906e-06, + "loss": 2.5627, + "step": 5312 + }, + { + "epoch": 1.0647294589178358, + "grad_norm": 33.82398042872245, + "learning_rate": 8.148730729634942e-06, + "loss": 2.662, + "step": 5313 + }, + { + "epoch": 1.064929859719439, + "grad_norm": 17.462918954325378, + "learning_rate": 8.147824983717845e-06, + "loss": 2.5851, + "step": 5314 + }, + { + "epoch": 1.065130260521042, + "grad_norm": 27.134705149575947, + "learning_rate": 8.146919066648854e-06, + "loss": 2.6873, + "step": 5315 + }, + { + "epoch": 1.0653306613226452, + "grad_norm": 22.41026338001775, + "learning_rate": 8.146012978477228e-06, + "loss": 2.5391, + "step": 5316 + }, + { + "epoch": 1.0655310621242484, + "grad_norm": 29.160208664785266, + "learning_rate": 8.145106719252234e-06, + "loss": 3.0163, + "step": 5317 + }, + { + "epoch": 1.0657314629258516, + "grad_norm": 31.80634135708184, + "learning_rate": 8.144200289023145e-06, + "loss": 2.9359, + "step": 5318 + }, + { + "epoch": 1.0659318637274549, + "grad_norm": 17.609758780183373, + "learning_rate": 8.143293687839244e-06, + "loss": 2.4949, + "step": 5319 + }, + { + "epoch": 1.066132264529058, + "grad_norm": 35.843696626759034, + "learning_rate": 8.142386915749825e-06, + "loss": 2.7939, + "step": 5320 + }, + { + "epoch": 1.0663326653306613, + "grad_norm": 25.74241532846758, + "learning_rate": 8.141479972804193e-06, + "loss": 2.4865, + "step": 5321 + }, + { + "epoch": 1.0665330661322645, + "grad_norm": 24.98974598729965, + "learning_rate": 8.140572859051655e-06, + "loss": 3.3326, + "step": 5322 + }, + { + "epoch": 1.0667334669338677, + "grad_norm": 37.720248001772525, + "learning_rate": 8.139665574541535e-06, + "loss": 2.7452, + "step": 5323 + }, + { + "epoch": 1.066933867735471, + "grad_norm": 54.8956523715076, 
+ "learning_rate": 8.138758119323165e-06, + "loss": 3.1216, + "step": 5324 + }, + { + "epoch": 1.0671342685370742, + "grad_norm": 34.86345356079689, + "learning_rate": 8.137850493445883e-06, + "loss": 3.0348, + "step": 5325 + }, + { + "epoch": 1.0673346693386774, + "grad_norm": 19.190627464275504, + "learning_rate": 8.136942696959036e-06, + "loss": 2.5212, + "step": 5326 + }, + { + "epoch": 1.0675350701402806, + "grad_norm": 26.405289804926245, + "learning_rate": 8.136034729911983e-06, + "loss": 2.8069, + "step": 5327 + }, + { + "epoch": 1.0677354709418838, + "grad_norm": 27.303770911765376, + "learning_rate": 8.135126592354095e-06, + "loss": 3.2459, + "step": 5328 + }, + { + "epoch": 1.067935871743487, + "grad_norm": 23.132159377029392, + "learning_rate": 8.134218284334746e-06, + "loss": 2.6479, + "step": 5329 + }, + { + "epoch": 1.0681362725450902, + "grad_norm": 27.07394891115675, + "learning_rate": 8.133309805903323e-06, + "loss": 2.9445, + "step": 5330 + }, + { + "epoch": 1.0683366733466935, + "grad_norm": 22.82462862056077, + "learning_rate": 8.132401157109219e-06, + "loss": 2.6941, + "step": 5331 + }, + { + "epoch": 1.0685370741482967, + "grad_norm": 21.52754643548924, + "learning_rate": 8.13149233800184e-06, + "loss": 3.0811, + "step": 5332 + }, + { + "epoch": 1.0687374749499, + "grad_norm": 27.934970881342597, + "learning_rate": 8.1305833486306e-06, + "loss": 2.773, + "step": 5333 + }, + { + "epoch": 1.068937875751503, + "grad_norm": 25.403328780363815, + "learning_rate": 8.129674189044923e-06, + "loss": 2.6351, + "step": 5334 + }, + { + "epoch": 1.0691382765531061, + "grad_norm": 20.56110631262087, + "learning_rate": 8.128764859294241e-06, + "loss": 2.6077, + "step": 5335 + }, + { + "epoch": 1.0693386773547093, + "grad_norm": 29.894810052059107, + "learning_rate": 8.127855359427995e-06, + "loss": 2.6393, + "step": 5336 + }, + { + "epoch": 1.0695390781563126, + "grad_norm": 31.388134223347375, + "learning_rate": 8.126945689495636e-06, + "loss": 2.5725, + "step": 5337 + }, + { + "epoch": 1.0697394789579158, + "grad_norm": 46.972374113864, + "learning_rate": 8.126035849546623e-06, + "loss": 2.7623, + "step": 5338 + }, + { + "epoch": 1.069939879759519, + "grad_norm": 23.483328115979383, + "learning_rate": 8.125125839630428e-06, + "loss": 2.4048, + "step": 5339 + }, + { + "epoch": 1.0701402805611222, + "grad_norm": 27.44871266405126, + "learning_rate": 8.124215659796526e-06, + "loss": 3.0794, + "step": 5340 + }, + { + "epoch": 1.0703406813627254, + "grad_norm": 23.053317705255843, + "learning_rate": 8.123305310094408e-06, + "loss": 2.9251, + "step": 5341 + }, + { + "epoch": 1.0705410821643286, + "grad_norm": 27.55634709641814, + "learning_rate": 8.122394790573569e-06, + "loss": 2.5451, + "step": 5342 + }, + { + "epoch": 1.0707414829659319, + "grad_norm": 34.982002571186214, + "learning_rate": 8.121484101283519e-06, + "loss": 2.5951, + "step": 5343 + }, + { + "epoch": 1.070941883767535, + "grad_norm": 23.510622394429063, + "learning_rate": 8.120573242273769e-06, + "loss": 2.5438, + "step": 5344 + }, + { + "epoch": 1.0711422845691383, + "grad_norm": 31.171860050221657, + "learning_rate": 8.119662213593843e-06, + "loss": 3.2034, + "step": 5345 + }, + { + "epoch": 1.0713426853707415, + "grad_norm": 22.187846366884038, + "learning_rate": 8.11875101529328e-06, + "loss": 2.877, + "step": 5346 + }, + { + "epoch": 1.0715430861723447, + "grad_norm": 42.7219778736799, + "learning_rate": 8.117839647421619e-06, + "loss": 3.5223, + "step": 5347 + }, + { + "epoch": 1.071743486973948, + "grad_norm": 
52.493133617991326, + "learning_rate": 8.116928110028416e-06, + "loss": 3.2798, + "step": 5348 + }, + { + "epoch": 1.0719438877755512, + "grad_norm": 32.410427573943664, + "learning_rate": 8.11601640316323e-06, + "loss": 2.6074, + "step": 5349 + }, + { + "epoch": 1.0721442885771544, + "grad_norm": 19.65374479940701, + "learning_rate": 8.11510452687563e-06, + "loss": 2.8688, + "step": 5350 + }, + { + "epoch": 1.0723446893787576, + "grad_norm": 24.845778456386427, + "learning_rate": 8.114192481215202e-06, + "loss": 2.6571, + "step": 5351 + }, + { + "epoch": 1.0725450901803608, + "grad_norm": 21.884343517886926, + "learning_rate": 8.113280266231531e-06, + "loss": 2.5471, + "step": 5352 + }, + { + "epoch": 1.0727454909819638, + "grad_norm": 25.345150465434312, + "learning_rate": 8.112367881974217e-06, + "loss": 2.6291, + "step": 5353 + }, + { + "epoch": 1.0729458917835673, + "grad_norm": 37.6775444611651, + "learning_rate": 8.111455328492864e-06, + "loss": 2.6438, + "step": 5354 + }, + { + "epoch": 1.0731462925851702, + "grad_norm": 19.322982862253095, + "learning_rate": 8.110542605837095e-06, + "loss": 2.8027, + "step": 5355 + }, + { + "epoch": 1.0733466933867735, + "grad_norm": 44.50651082997578, + "learning_rate": 8.10962971405653e-06, + "loss": 2.3216, + "step": 5356 + }, + { + "epoch": 1.0735470941883767, + "grad_norm": 22.184504380775625, + "learning_rate": 8.10871665320081e-06, + "loss": 3.2633, + "step": 5357 + }, + { + "epoch": 1.07374749498998, + "grad_norm": 36.89056406063757, + "learning_rate": 8.107803423319576e-06, + "loss": 2.5964, + "step": 5358 + }, + { + "epoch": 1.0739478957915831, + "grad_norm": 27.923755361284083, + "learning_rate": 8.106890024462482e-06, + "loss": 2.4263, + "step": 5359 + }, + { + "epoch": 1.0741482965931863, + "grad_norm": 38.984606562225935, + "learning_rate": 8.10597645667919e-06, + "loss": 2.5449, + "step": 5360 + }, + { + "epoch": 1.0743486973947896, + "grad_norm": 40.11584366993497, + "learning_rate": 8.105062720019375e-06, + "loss": 2.8979, + "step": 5361 + }, + { + "epoch": 1.0745490981963928, + "grad_norm": 28.24572887066175, + "learning_rate": 8.104148814532715e-06, + "loss": 2.7845, + "step": 5362 + }, + { + "epoch": 1.074749498997996, + "grad_norm": 37.095424992834204, + "learning_rate": 8.103234740268903e-06, + "loss": 2.8106, + "step": 5363 + }, + { + "epoch": 1.0749498997995992, + "grad_norm": 25.75022506065454, + "learning_rate": 8.102320497277636e-06, + "loss": 2.9768, + "step": 5364 + }, + { + "epoch": 1.0751503006012024, + "grad_norm": 25.862388159117685, + "learning_rate": 8.101406085608625e-06, + "loss": 2.7583, + "step": 5365 + }, + { + "epoch": 1.0753507014028056, + "grad_norm": 47.090424277940734, + "learning_rate": 8.100491505311588e-06, + "loss": 3.2515, + "step": 5366 + }, + { + "epoch": 1.0755511022044089, + "grad_norm": 29.582250476285985, + "learning_rate": 8.099576756436249e-06, + "loss": 2.6456, + "step": 5367 + }, + { + "epoch": 1.075751503006012, + "grad_norm": 27.402801265970723, + "learning_rate": 8.09866183903235e-06, + "loss": 2.6462, + "step": 5368 + }, + { + "epoch": 1.0759519038076153, + "grad_norm": 23.205739743664246, + "learning_rate": 8.09774675314963e-06, + "loss": 2.6965, + "step": 5369 + }, + { + "epoch": 1.0761523046092185, + "grad_norm": 26.098402460204642, + "learning_rate": 8.096831498837845e-06, + "loss": 2.9763, + "step": 5370 + }, + { + "epoch": 1.0763527054108217, + "grad_norm": 20.02549362703415, + "learning_rate": 8.095916076146761e-06, + "loss": 2.3367, + "step": 5371 + }, + { + "epoch": 
1.076553106212425, + "grad_norm": 18.769254233893978, + "learning_rate": 8.095000485126152e-06, + "loss": 2.4626, + "step": 5372 + }, + { + "epoch": 1.0767535070140282, + "grad_norm": 25.886803813069523, + "learning_rate": 8.094084725825794e-06, + "loss": 2.4958, + "step": 5373 + }, + { + "epoch": 1.0769539078156312, + "grad_norm": 28.006318947614137, + "learning_rate": 8.093168798295486e-06, + "loss": 3.1862, + "step": 5374 + }, + { + "epoch": 1.0771543086172344, + "grad_norm": 33.57562225430752, + "learning_rate": 8.092252702585025e-06, + "loss": 2.9469, + "step": 5375 + }, + { + "epoch": 1.0773547094188376, + "grad_norm": 18.47556077880198, + "learning_rate": 8.091336438744217e-06, + "loss": 2.5224, + "step": 5376 + }, + { + "epoch": 1.0775551102204408, + "grad_norm": 125.9220104117738, + "learning_rate": 8.090420006822885e-06, + "loss": 3.3408, + "step": 5377 + }, + { + "epoch": 1.077755511022044, + "grad_norm": 24.146444064757624, + "learning_rate": 8.089503406870856e-06, + "loss": 2.8139, + "step": 5378 + }, + { + "epoch": 1.0779559118236473, + "grad_norm": 18.559931290348704, + "learning_rate": 8.088586638937963e-06, + "loss": 2.2609, + "step": 5379 + }, + { + "epoch": 1.0781563126252505, + "grad_norm": 21.373595584409827, + "learning_rate": 8.087669703074059e-06, + "loss": 2.7714, + "step": 5380 + }, + { + "epoch": 1.0783567134268537, + "grad_norm": 39.89266582294409, + "learning_rate": 8.086752599328994e-06, + "loss": 3.2139, + "step": 5381 + }, + { + "epoch": 1.078557114228457, + "grad_norm": 19.545051836997565, + "learning_rate": 8.085835327752633e-06, + "loss": 2.5432, + "step": 5382 + }, + { + "epoch": 1.0787575150300601, + "grad_norm": 27.309777449364258, + "learning_rate": 8.084917888394853e-06, + "loss": 3.5277, + "step": 5383 + }, + { + "epoch": 1.0789579158316633, + "grad_norm": 30.407658017377525, + "learning_rate": 8.084000281305531e-06, + "loss": 3.0837, + "step": 5384 + }, + { + "epoch": 1.0791583166332666, + "grad_norm": 21.349583436207645, + "learning_rate": 8.08308250653456e-06, + "loss": 2.4728, + "step": 5385 + }, + { + "epoch": 1.0793587174348698, + "grad_norm": 32.1015272099405, + "learning_rate": 8.082164564131844e-06, + "loss": 2.9799, + "step": 5386 + }, + { + "epoch": 1.079559118236473, + "grad_norm": 17.949793341782627, + "learning_rate": 8.081246454147293e-06, + "loss": 2.6958, + "step": 5387 + }, + { + "epoch": 1.0797595190380762, + "grad_norm": 22.566955694882246, + "learning_rate": 8.080328176630821e-06, + "loss": 2.4893, + "step": 5388 + }, + { + "epoch": 1.0799599198396794, + "grad_norm": 22.971684374543397, + "learning_rate": 8.079409731632359e-06, + "loss": 2.5579, + "step": 5389 + }, + { + "epoch": 1.0801603206412826, + "grad_norm": 20.239779339768088, + "learning_rate": 8.078491119201845e-06, + "loss": 3.1811, + "step": 5390 + }, + { + "epoch": 1.0803607214428859, + "grad_norm": 24.175005740422556, + "learning_rate": 8.077572339389226e-06, + "loss": 3.0447, + "step": 5391 + }, + { + "epoch": 1.080561122244489, + "grad_norm": 19.37089386242385, + "learning_rate": 8.076653392244456e-06, + "loss": 2.5477, + "step": 5392 + }, + { + "epoch": 1.080761523046092, + "grad_norm": 36.44287056706334, + "learning_rate": 8.075734277817497e-06, + "loss": 3.0087, + "step": 5393 + }, + { + "epoch": 1.0809619238476953, + "grad_norm": 19.1090548292707, + "learning_rate": 8.074814996158326e-06, + "loss": 2.3169, + "step": 5394 + }, + { + "epoch": 1.0811623246492985, + "grad_norm": 47.23821815975589, + "learning_rate": 8.073895547316926e-06, + "loss": 3.1027, + 
"step": 5395 + }, + { + "epoch": 1.0813627254509017, + "grad_norm": 32.752142041403864, + "learning_rate": 8.072975931343285e-06, + "loss": 3.2299, + "step": 5396 + }, + { + "epoch": 1.081563126252505, + "grad_norm": 18.32875854860439, + "learning_rate": 8.07205614828741e-06, + "loss": 2.7018, + "step": 5397 + }, + { + "epoch": 1.0817635270541082, + "grad_norm": 62.54007555395744, + "learning_rate": 8.071136198199306e-06, + "loss": 2.0437, + "step": 5398 + }, + { + "epoch": 1.0819639278557114, + "grad_norm": 23.612417805495927, + "learning_rate": 8.070216081128993e-06, + "loss": 2.6349, + "step": 5399 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 15.096879010922098, + "learning_rate": 8.0692957971265e-06, + "loss": 2.5774, + "step": 5400 + }, + { + "epoch": 1.0823647294589178, + "grad_norm": 26.42002812615742, + "learning_rate": 8.068375346241863e-06, + "loss": 2.9397, + "step": 5401 + }, + { + "epoch": 1.082565130260521, + "grad_norm": 29.713691749520994, + "learning_rate": 8.067454728525131e-06, + "loss": 3.0816, + "step": 5402 + }, + { + "epoch": 1.0827655310621243, + "grad_norm": 19.301286752069977, + "learning_rate": 8.066533944026356e-06, + "loss": 2.3016, + "step": 5403 + }, + { + "epoch": 1.0829659318637275, + "grad_norm": 22.545498639389585, + "learning_rate": 8.065612992795604e-06, + "loss": 2.2967, + "step": 5404 + }, + { + "epoch": 1.0831663326653307, + "grad_norm": 28.6356488046888, + "learning_rate": 8.064691874882948e-06, + "loss": 2.6178, + "step": 5405 + }, + { + "epoch": 1.083366733466934, + "grad_norm": 36.905904796535054, + "learning_rate": 8.063770590338471e-06, + "loss": 2.5828, + "step": 5406 + }, + { + "epoch": 1.0835671342685371, + "grad_norm": 21.87006366187431, + "learning_rate": 8.062849139212265e-06, + "loss": 2.6368, + "step": 5407 + }, + { + "epoch": 1.0837675350701403, + "grad_norm": 24.520892632682386, + "learning_rate": 8.061927521554429e-06, + "loss": 2.9958, + "step": 5408 + }, + { + "epoch": 1.0839679358717436, + "grad_norm": 29.068115248239703, + "learning_rate": 8.061005737415074e-06, + "loss": 2.6664, + "step": 5409 + }, + { + "epoch": 1.0841683366733468, + "grad_norm": 20.842190826675242, + "learning_rate": 8.060083786844319e-06, + "loss": 2.9762, + "step": 5410 + }, + { + "epoch": 1.08436873747495, + "grad_norm": 23.877890926726, + "learning_rate": 8.059161669892292e-06, + "loss": 3.0676, + "step": 5411 + }, + { + "epoch": 1.084569138276553, + "grad_norm": 25.110680040575918, + "learning_rate": 8.05823938660913e-06, + "loss": 3.0656, + "step": 5412 + }, + { + "epoch": 1.0847695390781562, + "grad_norm": 28.87041548082499, + "learning_rate": 8.057316937044977e-06, + "loss": 2.8433, + "step": 5413 + }, + { + "epoch": 1.0849699398797594, + "grad_norm": 21.87341928847021, + "learning_rate": 8.05639432124999e-06, + "loss": 2.7487, + "step": 5414 + }, + { + "epoch": 1.0851703406813626, + "grad_norm": 23.289094837015703, + "learning_rate": 8.05547153927433e-06, + "loss": 2.8884, + "step": 5415 + }, + { + "epoch": 1.0853707414829659, + "grad_norm": 24.439938996412298, + "learning_rate": 8.054548591168174e-06, + "loss": 2.6586, + "step": 5416 + }, + { + "epoch": 1.085571142284569, + "grad_norm": 20.76679201129621, + "learning_rate": 8.053625476981702e-06, + "loss": 3.2573, + "step": 5417 + }, + { + "epoch": 1.0857715430861723, + "grad_norm": 31.19836193054101, + "learning_rate": 8.052702196765104e-06, + "loss": 3.4713, + "step": 5418 + }, + { + "epoch": 1.0859719438877755, + "grad_norm": 19.410414555167517, + "learning_rate": 8.051778750568583e-06, 
+ "loss": 2.8594, + "step": 5419 + }, + { + "epoch": 1.0861723446893787, + "grad_norm": 26.213967859999453, + "learning_rate": 8.050855138442344e-06, + "loss": 2.7194, + "step": 5420 + }, + { + "epoch": 1.086372745490982, + "grad_norm": 25.781246779073747, + "learning_rate": 8.04993136043661e-06, + "loss": 2.2954, + "step": 5421 + }, + { + "epoch": 1.0865731462925852, + "grad_norm": 18.572104711965384, + "learning_rate": 8.049007416601606e-06, + "loss": 2.5505, + "step": 5422 + }, + { + "epoch": 1.0867735470941884, + "grad_norm": 24.608838629542614, + "learning_rate": 8.048083306987566e-06, + "loss": 2.7947, + "step": 5423 + }, + { + "epoch": 1.0869739478957916, + "grad_norm": 21.32440417332422, + "learning_rate": 8.04715903164474e-06, + "loss": 2.4692, + "step": 5424 + }, + { + "epoch": 1.0871743486973948, + "grad_norm": 26.098228038393223, + "learning_rate": 8.046234590623376e-06, + "loss": 3.0822, + "step": 5425 + }, + { + "epoch": 1.087374749498998, + "grad_norm": 49.1984372824823, + "learning_rate": 8.045309983973743e-06, + "loss": 2.3009, + "step": 5426 + }, + { + "epoch": 1.0875751503006013, + "grad_norm": 24.971826428122892, + "learning_rate": 8.04438521174611e-06, + "loss": 2.9028, + "step": 5427 + }, + { + "epoch": 1.0877755511022045, + "grad_norm": 40.78818481311596, + "learning_rate": 8.04346027399076e-06, + "loss": 3.4586, + "step": 5428 + }, + { + "epoch": 1.0879759519038077, + "grad_norm": 21.79298129503318, + "learning_rate": 8.042535170757982e-06, + "loss": 3.2428, + "step": 5429 + }, + { + "epoch": 1.088176352705411, + "grad_norm": 35.151394821796345, + "learning_rate": 8.041609902098075e-06, + "loss": 2.5687, + "step": 5430 + }, + { + "epoch": 1.0883767535070141, + "grad_norm": 24.961190809280545, + "learning_rate": 8.040684468061349e-06, + "loss": 2.9866, + "step": 5431 + }, + { + "epoch": 1.0885771543086173, + "grad_norm": 31.432809411767984, + "learning_rate": 8.03975886869812e-06, + "loss": 3.1072, + "step": 5432 + }, + { + "epoch": 1.0887775551102203, + "grad_norm": 22.50285223242765, + "learning_rate": 8.038833104058713e-06, + "loss": 2.789, + "step": 5433 + }, + { + "epoch": 1.0889779559118236, + "grad_norm": 25.90963638335878, + "learning_rate": 8.037907174193465e-06, + "loss": 2.7699, + "step": 5434 + }, + { + "epoch": 1.0891783567134268, + "grad_norm": 33.38777110056663, + "learning_rate": 8.036981079152717e-06, + "loss": 2.8289, + "step": 5435 + }, + { + "epoch": 1.08937875751503, + "grad_norm": 22.33764058050908, + "learning_rate": 8.03605481898683e-06, + "loss": 2.8326, + "step": 5436 + }, + { + "epoch": 1.0895791583166332, + "grad_norm": 23.294906376859796, + "learning_rate": 8.035128393746156e-06, + "loss": 3.2923, + "step": 5437 + }, + { + "epoch": 1.0897795591182364, + "grad_norm": 19.965968058937865, + "learning_rate": 8.034201803481072e-06, + "loss": 2.4399, + "step": 5438 + }, + { + "epoch": 1.0899799599198396, + "grad_norm": 37.77677616071681, + "learning_rate": 8.03327504824196e-06, + "loss": 2.2121, + "step": 5439 + }, + { + "epoch": 1.0901803607214429, + "grad_norm": 38.06916835070276, + "learning_rate": 8.032348128079204e-06, + "loss": 3.2716, + "step": 5440 + }, + { + "epoch": 1.090380761523046, + "grad_norm": 48.56372073299277, + "learning_rate": 8.031421043043204e-06, + "loss": 2.1553, + "step": 5441 + }, + { + "epoch": 1.0905811623246493, + "grad_norm": 33.98279986053873, + "learning_rate": 8.030493793184365e-06, + "loss": 2.8258, + "step": 5442 + }, + { + "epoch": 1.0907815631262525, + "grad_norm": 22.63887478526504, + "learning_rate": 
8.029566378553109e-06, + "loss": 3.0373, + "step": 5443 + }, + { + "epoch": 1.0909819639278557, + "grad_norm": 37.72071626712052, + "learning_rate": 8.028638799199855e-06, + "loss": 2.8161, + "step": 5444 + }, + { + "epoch": 1.091182364729459, + "grad_norm": 25.050581306564435, + "learning_rate": 8.02771105517504e-06, + "loss": 2.7896, + "step": 5445 + }, + { + "epoch": 1.0913827655310622, + "grad_norm": 24.065792916702208, + "learning_rate": 8.026783146529105e-06, + "loss": 3.0472, + "step": 5446 + }, + { + "epoch": 1.0915831663326654, + "grad_norm": 38.86722382297667, + "learning_rate": 8.025855073312501e-06, + "loss": 2.5224, + "step": 5447 + }, + { + "epoch": 1.0917835671342686, + "grad_norm": 20.72149063025587, + "learning_rate": 8.024926835575693e-06, + "loss": 2.9841, + "step": 5448 + }, + { + "epoch": 1.0919839679358718, + "grad_norm": 24.24886876373201, + "learning_rate": 8.023998433369145e-06, + "loss": 2.0701, + "step": 5449 + }, + { + "epoch": 1.092184368737475, + "grad_norm": 29.301882899255542, + "learning_rate": 8.02306986674334e-06, + "loss": 3.2628, + "step": 5450 + }, + { + "epoch": 1.0923847695390783, + "grad_norm": 28.189074057966486, + "learning_rate": 8.022141135748762e-06, + "loss": 2.4439, + "step": 5451 + }, + { + "epoch": 1.0925851703406813, + "grad_norm": 29.341972424242055, + "learning_rate": 8.021212240435911e-06, + "loss": 2.5321, + "step": 5452 + }, + { + "epoch": 1.0927855711422845, + "grad_norm": 23.830699958682047, + "learning_rate": 8.02028318085529e-06, + "loss": 2.7414, + "step": 5453 + }, + { + "epoch": 1.0929859719438877, + "grad_norm": 29.678284052715597, + "learning_rate": 8.019353957057414e-06, + "loss": 2.4358, + "step": 5454 + }, + { + "epoch": 1.093186372745491, + "grad_norm": 31.30436124761264, + "learning_rate": 8.018424569092808e-06, + "loss": 3.2491, + "step": 5455 + }, + { + "epoch": 1.0933867735470941, + "grad_norm": 34.46574335488579, + "learning_rate": 8.017495017012002e-06, + "loss": 2.9054, + "step": 5456 + }, + { + "epoch": 1.0935871743486973, + "grad_norm": 31.39356421503263, + "learning_rate": 8.016565300865535e-06, + "loss": 2.4331, + "step": 5457 + }, + { + "epoch": 1.0937875751503006, + "grad_norm": 30.481582270865154, + "learning_rate": 8.015635420703964e-06, + "loss": 2.9314, + "step": 5458 + }, + { + "epoch": 1.0939879759519038, + "grad_norm": 28.61391766286076, + "learning_rate": 8.01470537657784e-06, + "loss": 2.7261, + "step": 5459 + }, + { + "epoch": 1.094188376753507, + "grad_norm": 28.160618471345703, + "learning_rate": 8.013775168537736e-06, + "loss": 2.7404, + "step": 5460 + }, + { + "epoch": 1.0943887775551102, + "grad_norm": 24.779132867000364, + "learning_rate": 8.012844796634229e-06, + "loss": 2.9614, + "step": 5461 + }, + { + "epoch": 1.0945891783567134, + "grad_norm": 29.97228481631664, + "learning_rate": 8.011914260917902e-06, + "loss": 3.3166, + "step": 5462 + }, + { + "epoch": 1.0947895791583167, + "grad_norm": 24.36152687161207, + "learning_rate": 8.01098356143935e-06, + "loss": 3.1809, + "step": 5463 + }, + { + "epoch": 1.0949899799599199, + "grad_norm": 20.863175644647363, + "learning_rate": 8.010052698249176e-06, + "loss": 2.4851, + "step": 5464 + }, + { + "epoch": 1.095190380761523, + "grad_norm": 31.533747250074693, + "learning_rate": 8.009121671397994e-06, + "loss": 3.0232, + "step": 5465 + }, + { + "epoch": 1.0953907815631263, + "grad_norm": 36.44950343176961, + "learning_rate": 8.008190480936427e-06, + "loss": 2.6808, + "step": 5466 + }, + { + "epoch": 1.0955911823647295, + "grad_norm": 
21.39213790631178, + "learning_rate": 8.007259126915102e-06, + "loss": 2.3053, + "step": 5467 + }, + { + "epoch": 1.0957915831663327, + "grad_norm": 34.38213205051957, + "learning_rate": 8.00632760938466e-06, + "loss": 3.7811, + "step": 5468 + }, + { + "epoch": 1.095991983967936, + "grad_norm": 27.158928627117348, + "learning_rate": 8.005395928395745e-06, + "loss": 2.5839, + "step": 5469 + }, + { + "epoch": 1.0961923847695392, + "grad_norm": 32.232527941944845, + "learning_rate": 8.00446408399902e-06, + "loss": 2.7587, + "step": 5470 + }, + { + "epoch": 1.0963927855711422, + "grad_norm": 30.062645867322782, + "learning_rate": 8.003532076245147e-06, + "loss": 3.1579, + "step": 5471 + }, + { + "epoch": 1.0965931863727454, + "grad_norm": 38.75557328617746, + "learning_rate": 8.0025999051848e-06, + "loss": 2.4507, + "step": 5472 + }, + { + "epoch": 1.0967935871743486, + "grad_norm": 43.490260998955705, + "learning_rate": 8.001667570868666e-06, + "loss": 2.5631, + "step": 5473 + }, + { + "epoch": 1.0969939879759518, + "grad_norm": 25.055012322905913, + "learning_rate": 8.000735073347433e-06, + "loss": 2.4971, + "step": 5474 + }, + { + "epoch": 1.097194388777555, + "grad_norm": 27.416549713786047, + "learning_rate": 7.999802412671807e-06, + "loss": 3.215, + "step": 5475 + }, + { + "epoch": 1.0973947895791583, + "grad_norm": 27.686885907695306, + "learning_rate": 7.998869588892493e-06, + "loss": 2.757, + "step": 5476 + }, + { + "epoch": 1.0975951903807615, + "grad_norm": 37.043881591845185, + "learning_rate": 7.997936602060214e-06, + "loss": 2.6834, + "step": 5477 + }, + { + "epoch": 1.0977955911823647, + "grad_norm": 16.04768082174384, + "learning_rate": 7.997003452225698e-06, + "loss": 3.0198, + "step": 5478 + }, + { + "epoch": 1.097995991983968, + "grad_norm": 34.211943137010465, + "learning_rate": 7.99607013943968e-06, + "loss": 2.7639, + "step": 5479 + }, + { + "epoch": 1.0981963927855711, + "grad_norm": 19.64329586113172, + "learning_rate": 7.995136663752904e-06, + "loss": 2.9833, + "step": 5480 + }, + { + "epoch": 1.0983967935871743, + "grad_norm": 25.881470125027597, + "learning_rate": 7.994203025216129e-06, + "loss": 2.2891, + "step": 5481 + }, + { + "epoch": 1.0985971943887776, + "grad_norm": 26.633757610723425, + "learning_rate": 7.993269223880114e-06, + "loss": 2.7672, + "step": 5482 + }, + { + "epoch": 1.0987975951903808, + "grad_norm": 35.5142750322519, + "learning_rate": 7.992335259795632e-06, + "loss": 2.8854, + "step": 5483 + }, + { + "epoch": 1.098997995991984, + "grad_norm": 23.963862977632214, + "learning_rate": 7.991401133013465e-06, + "loss": 2.5935, + "step": 5484 + }, + { + "epoch": 1.0991983967935872, + "grad_norm": 37.94643997189506, + "learning_rate": 7.990466843584406e-06, + "loss": 3.6244, + "step": 5485 + }, + { + "epoch": 1.0993987975951904, + "grad_norm": 18.80654581997024, + "learning_rate": 7.989532391559247e-06, + "loss": 2.7493, + "step": 5486 + }, + { + "epoch": 1.0995991983967937, + "grad_norm": 22.43755654131515, + "learning_rate": 7.9885977769888e-06, + "loss": 2.8885, + "step": 5487 + }, + { + "epoch": 1.0997995991983969, + "grad_norm": 20.38317471880332, + "learning_rate": 7.987662999923879e-06, + "loss": 2.9561, + "step": 5488 + }, + { + "epoch": 1.1, + "grad_norm": 26.870121270344356, + "learning_rate": 7.986728060415312e-06, + "loss": 3.0234, + "step": 5489 + }, + { + "epoch": 1.1002004008016033, + "grad_norm": 47.64583523676487, + "learning_rate": 7.985792958513932e-06, + "loss": 2.4319, + "step": 5490 + }, + { + "epoch": 1.1004008016032065, + 
"grad_norm": 25.1444733364321, + "learning_rate": 7.98485769427058e-06, + "loss": 2.575, + "step": 5491 + }, + { + "epoch": 1.1006012024048095, + "grad_norm": 31.861599205542984, + "learning_rate": 7.98392226773611e-06, + "loss": 2.2863, + "step": 5492 + }, + { + "epoch": 1.1008016032064127, + "grad_norm": 29.995251758048628, + "learning_rate": 7.982986678961381e-06, + "loss": 2.6773, + "step": 5493 + }, + { + "epoch": 1.101002004008016, + "grad_norm": 19.488554493291147, + "learning_rate": 7.982050927997263e-06, + "loss": 2.7987, + "step": 5494 + }, + { + "epoch": 1.1012024048096192, + "grad_norm": 56.593261500906834, + "learning_rate": 7.981115014894636e-06, + "loss": 3.2223, + "step": 5495 + }, + { + "epoch": 1.1014028056112224, + "grad_norm": 33.7581656526319, + "learning_rate": 7.980178939704383e-06, + "loss": 2.7906, + "step": 5496 + }, + { + "epoch": 1.1016032064128256, + "grad_norm": 22.043708997469228, + "learning_rate": 7.979242702477403e-06, + "loss": 3.0077, + "step": 5497 + }, + { + "epoch": 1.1018036072144288, + "grad_norm": 35.112889309786446, + "learning_rate": 7.978306303264598e-06, + "loss": 2.7189, + "step": 5498 + }, + { + "epoch": 1.102004008016032, + "grad_norm": 55.73029037474742, + "learning_rate": 7.977369742116883e-06, + "loss": 2.5983, + "step": 5499 + }, + { + "epoch": 1.1022044088176353, + "grad_norm": 39.63343028656192, + "learning_rate": 7.976433019085181e-06, + "loss": 3.1796, + "step": 5500 + }, + { + "epoch": 1.1024048096192385, + "grad_norm": 42.129102939142385, + "learning_rate": 7.975496134220423e-06, + "loss": 3.283, + "step": 5501 + }, + { + "epoch": 1.1026052104208417, + "grad_norm": 29.36343214262288, + "learning_rate": 7.974559087573548e-06, + "loss": 2.4794, + "step": 5502 + }, + { + "epoch": 1.102805611222445, + "grad_norm": 24.216537215785674, + "learning_rate": 7.973621879195503e-06, + "loss": 2.6769, + "step": 5503 + }, + { + "epoch": 1.1030060120240481, + "grad_norm": 22.716109506623738, + "learning_rate": 7.972684509137248e-06, + "loss": 2.5964, + "step": 5504 + }, + { + "epoch": 1.1032064128256514, + "grad_norm": 20.3014762497665, + "learning_rate": 7.971746977449747e-06, + "loss": 2.9334, + "step": 5505 + }, + { + "epoch": 1.1034068136272546, + "grad_norm": 18.04947893283636, + "learning_rate": 7.970809284183977e-06, + "loss": 2.5061, + "step": 5506 + }, + { + "epoch": 1.1036072144288578, + "grad_norm": 53.39842398917372, + "learning_rate": 7.96987142939092e-06, + "loss": 2.6293, + "step": 5507 + }, + { + "epoch": 1.103807615230461, + "grad_norm": 30.40915425338432, + "learning_rate": 7.968933413121571e-06, + "loss": 3.2006, + "step": 5508 + }, + { + "epoch": 1.1040080160320642, + "grad_norm": 52.87452713244655, + "learning_rate": 7.967995235426928e-06, + "loss": 3.0243, + "step": 5509 + }, + { + "epoch": 1.1042084168336674, + "grad_norm": 24.026785985526956, + "learning_rate": 7.967056896358005e-06, + "loss": 2.6925, + "step": 5510 + }, + { + "epoch": 1.1044088176352704, + "grad_norm": 23.048035125664644, + "learning_rate": 7.966118395965818e-06, + "loss": 2.6536, + "step": 5511 + }, + { + "epoch": 1.1046092184368737, + "grad_norm": 14.610821621869768, + "learning_rate": 7.965179734301395e-06, + "loss": 2.596, + "step": 5512 + }, + { + "epoch": 1.1048096192384769, + "grad_norm": 33.11337989780332, + "learning_rate": 7.964240911415773e-06, + "loss": 3.4023, + "step": 5513 + }, + { + "epoch": 1.10501002004008, + "grad_norm": 28.794409332949304, + "learning_rate": 7.963301927359997e-06, + "loss": 2.5133, + "step": 5514 + }, + { + "epoch": 
1.1052104208416833, + "grad_norm": 29.89176090664784, + "learning_rate": 7.96236278218512e-06, + "loss": 3.3937, + "step": 5515 + }, + { + "epoch": 1.1054108216432865, + "grad_norm": 43.75200191358472, + "learning_rate": 7.961423475942207e-06, + "loss": 2.1843, + "step": 5516 + }, + { + "epoch": 1.1056112224448897, + "grad_norm": 29.693992997603196, + "learning_rate": 7.960484008682327e-06, + "loss": 2.9087, + "step": 5517 + }, + { + "epoch": 1.105811623246493, + "grad_norm": 19.485019284784162, + "learning_rate": 7.959544380456564e-06, + "loss": 3.3331, + "step": 5518 + }, + { + "epoch": 1.1060120240480962, + "grad_norm": 66.43386232326763, + "learning_rate": 7.958604591316002e-06, + "loss": 3.117, + "step": 5519 + }, + { + "epoch": 1.1062124248496994, + "grad_norm": 24.754974755680067, + "learning_rate": 7.957664641311741e-06, + "loss": 2.7491, + "step": 5520 + }, + { + "epoch": 1.1064128256513026, + "grad_norm": 25.11145683006674, + "learning_rate": 7.956724530494889e-06, + "loss": 2.6065, + "step": 5521 + }, + { + "epoch": 1.1066132264529058, + "grad_norm": 20.948530726560765, + "learning_rate": 7.955784258916556e-06, + "loss": 2.3127, + "step": 5522 + }, + { + "epoch": 1.106813627254509, + "grad_norm": 23.09766028415179, + "learning_rate": 7.954843826627874e-06, + "loss": 3.0161, + "step": 5523 + }, + { + "epoch": 1.1070140280561123, + "grad_norm": 26.194206775249818, + "learning_rate": 7.95390323367997e-06, + "loss": 2.4196, + "step": 5524 + }, + { + "epoch": 1.1072144288577155, + "grad_norm": 19.588913681829055, + "learning_rate": 7.952962480123986e-06, + "loss": 2.7959, + "step": 5525 + }, + { + "epoch": 1.1074148296593187, + "grad_norm": 22.619422923334465, + "learning_rate": 7.952021566011075e-06, + "loss": 2.644, + "step": 5526 + }, + { + "epoch": 1.107615230460922, + "grad_norm": 20.937996024871982, + "learning_rate": 7.951080491392393e-06, + "loss": 2.371, + "step": 5527 + }, + { + "epoch": 1.1078156312625251, + "grad_norm": 17.656422848113174, + "learning_rate": 7.950139256319107e-06, + "loss": 2.8046, + "step": 5528 + }, + { + "epoch": 1.1080160320641284, + "grad_norm": 29.744854734542226, + "learning_rate": 7.949197860842397e-06, + "loss": 3.0386, + "step": 5529 + }, + { + "epoch": 1.1082164328657313, + "grad_norm": 25.465905483727123, + "learning_rate": 7.948256305013445e-06, + "loss": 2.4186, + "step": 5530 + }, + { + "epoch": 1.1084168336673346, + "grad_norm": 23.619840053773743, + "learning_rate": 7.947314588883444e-06, + "loss": 2.7256, + "step": 5531 + }, + { + "epoch": 1.1086172344689378, + "grad_norm": 38.46377801768837, + "learning_rate": 7.946372712503601e-06, + "loss": 2.9934, + "step": 5532 + }, + { + "epoch": 1.108817635270541, + "grad_norm": 22.834547489468477, + "learning_rate": 7.945430675925122e-06, + "loss": 2.409, + "step": 5533 + }, + { + "epoch": 1.1090180360721442, + "grad_norm": 18.16535509704933, + "learning_rate": 7.94448847919923e-06, + "loss": 2.4923, + "step": 5534 + }, + { + "epoch": 1.1092184368737474, + "grad_norm": 20.16869440389917, + "learning_rate": 7.943546122377154e-06, + "loss": 2.1837, + "step": 5535 + }, + { + "epoch": 1.1094188376753507, + "grad_norm": 22.55869714545359, + "learning_rate": 7.94260360551013e-06, + "loss": 2.9231, + "step": 5536 + }, + { + "epoch": 1.1096192384769539, + "grad_norm": 32.72255377225733, + "learning_rate": 7.941660928649404e-06, + "loss": 1.9431, + "step": 5537 + }, + { + "epoch": 1.109819639278557, + "grad_norm": 24.393033190016073, + "learning_rate": 7.94071809184623e-06, + "loss": 2.7584, + "step": 
5538 + }, + { + "epoch": 1.1100200400801603, + "grad_norm": 24.443275828884417, + "learning_rate": 7.939775095151874e-06, + "loss": 2.8554, + "step": 5539 + }, + { + "epoch": 1.1102204408817635, + "grad_norm": 22.901339879171204, + "learning_rate": 7.938831938617604e-06, + "loss": 2.507, + "step": 5540 + }, + { + "epoch": 1.1104208416833667, + "grad_norm": 17.169469944641815, + "learning_rate": 7.937888622294708e-06, + "loss": 2.8102, + "step": 5541 + }, + { + "epoch": 1.11062124248497, + "grad_norm": 25.83518691630319, + "learning_rate": 7.936945146234467e-06, + "loss": 2.8899, + "step": 5542 + }, + { + "epoch": 1.1108216432865732, + "grad_norm": 35.18469651064033, + "learning_rate": 7.936001510488183e-06, + "loss": 2.8959, + "step": 5543 + }, + { + "epoch": 1.1110220440881764, + "grad_norm": 24.034311988729584, + "learning_rate": 7.935057715107165e-06, + "loss": 2.4123, + "step": 5544 + }, + { + "epoch": 1.1112224448897796, + "grad_norm": 21.446860736734852, + "learning_rate": 7.934113760142725e-06, + "loss": 2.9156, + "step": 5545 + }, + { + "epoch": 1.1114228456913828, + "grad_norm": 26.060531104257656, + "learning_rate": 7.933169645646191e-06, + "loss": 2.791, + "step": 5546 + }, + { + "epoch": 1.111623246492986, + "grad_norm": 23.64529696484694, + "learning_rate": 7.93222537166889e-06, + "loss": 3.2351, + "step": 5547 + }, + { + "epoch": 1.1118236472945893, + "grad_norm": 22.790940536923106, + "learning_rate": 7.931280938262168e-06, + "loss": 2.6203, + "step": 5548 + }, + { + "epoch": 1.1120240480961925, + "grad_norm": 25.027186458118276, + "learning_rate": 7.930336345477378e-06, + "loss": 3.0415, + "step": 5549 + }, + { + "epoch": 1.1122244488977957, + "grad_norm": 23.736002790674927, + "learning_rate": 7.929391593365871e-06, + "loss": 2.6465, + "step": 5550 + }, + { + "epoch": 1.1124248496993987, + "grad_norm": 24.656301911613284, + "learning_rate": 7.928446681979022e-06, + "loss": 2.7011, + "step": 5551 + }, + { + "epoch": 1.112625250501002, + "grad_norm": 21.645492645958594, + "learning_rate": 7.927501611368203e-06, + "loss": 2.9397, + "step": 5552 + }, + { + "epoch": 1.1128256513026051, + "grad_norm": 32.79980837230085, + "learning_rate": 7.9265563815848e-06, + "loss": 3.1159, + "step": 5553 + }, + { + "epoch": 1.1130260521042084, + "grad_norm": 31.135137740335754, + "learning_rate": 7.925610992680206e-06, + "loss": 2.8647, + "step": 5554 + }, + { + "epoch": 1.1132264529058116, + "grad_norm": 18.94750022442148, + "learning_rate": 7.924665444705824e-06, + "loss": 3.2492, + "step": 5555 + }, + { + "epoch": 1.1134268537074148, + "grad_norm": 25.09001568423647, + "learning_rate": 7.923719737713064e-06, + "loss": 3.1032, + "step": 5556 + }, + { + "epoch": 1.113627254509018, + "grad_norm": 25.063808053621084, + "learning_rate": 7.922773871753347e-06, + "loss": 2.6824, + "step": 5557 + }, + { + "epoch": 1.1138276553106212, + "grad_norm": 57.27530717974959, + "learning_rate": 7.9218278468781e-06, + "loss": 3.0283, + "step": 5558 + }, + { + "epoch": 1.1140280561122244, + "grad_norm": 23.105729977213937, + "learning_rate": 7.92088166313876e-06, + "loss": 3.0233, + "step": 5559 + }, + { + "epoch": 1.1142284569138277, + "grad_norm": 31.42586035164743, + "learning_rate": 7.91993532058677e-06, + "loss": 2.8872, + "step": 5560 + }, + { + "epoch": 1.1144288577154309, + "grad_norm": 38.9683635969408, + "learning_rate": 7.918988819273593e-06, + "loss": 2.9889, + "step": 5561 + }, + { + "epoch": 1.114629258517034, + "grad_norm": 19.69092669417946, + "learning_rate": 7.918042159250679e-06, + 
"loss": 2.7664, + "step": 5562 + }, + { + "epoch": 1.1148296593186373, + "grad_norm": 23.638177249216625, + "learning_rate": 7.917095340569507e-06, + "loss": 2.8368, + "step": 5563 + }, + { + "epoch": 1.1150300601202405, + "grad_norm": 21.18187205442297, + "learning_rate": 7.916148363281555e-06, + "loss": 2.5295, + "step": 5564 + }, + { + "epoch": 1.1152304609218437, + "grad_norm": 19.90115285110217, + "learning_rate": 7.915201227438312e-06, + "loss": 2.8817, + "step": 5565 + }, + { + "epoch": 1.115430861723447, + "grad_norm": 19.709073454170586, + "learning_rate": 7.914253933091276e-06, + "loss": 3.0834, + "step": 5566 + }, + { + "epoch": 1.1156312625250502, + "grad_norm": 24.691825604992264, + "learning_rate": 7.913306480291951e-06, + "loss": 3.1079, + "step": 5567 + }, + { + "epoch": 1.1158316633266534, + "grad_norm": 25.88432643259445, + "learning_rate": 7.912358869091853e-06, + "loss": 2.8859, + "step": 5568 + }, + { + "epoch": 1.1160320641282566, + "grad_norm": 23.527748216856985, + "learning_rate": 7.911411099542502e-06, + "loss": 3.0351, + "step": 5569 + }, + { + "epoch": 1.1162324649298596, + "grad_norm": 23.308852214323196, + "learning_rate": 7.910463171695434e-06, + "loss": 2.8947, + "step": 5570 + }, + { + "epoch": 1.1164328657314628, + "grad_norm": 33.15084613420275, + "learning_rate": 7.909515085602187e-06, + "loss": 3.1031, + "step": 5571 + }, + { + "epoch": 1.116633266533066, + "grad_norm": 20.675291797289653, + "learning_rate": 7.908566841314309e-06, + "loss": 2.8539, + "step": 5572 + }, + { + "epoch": 1.1168336673346693, + "grad_norm": 24.56074986523274, + "learning_rate": 7.90761843888336e-06, + "loss": 2.586, + "step": 5573 + }, + { + "epoch": 1.1170340681362725, + "grad_norm": 24.43208424456409, + "learning_rate": 7.906669878360903e-06, + "loss": 2.6491, + "step": 5574 + }, + { + "epoch": 1.1172344689378757, + "grad_norm": 18.779414664961138, + "learning_rate": 7.905721159798514e-06, + "loss": 2.2798, + "step": 5575 + }, + { + "epoch": 1.117434869739479, + "grad_norm": 25.459128120459667, + "learning_rate": 7.904772283247776e-06, + "loss": 3.3671, + "step": 5576 + }, + { + "epoch": 1.1176352705410821, + "grad_norm": 27.242743813073883, + "learning_rate": 7.903823248760283e-06, + "loss": 3.079, + "step": 5577 + }, + { + "epoch": 1.1178356713426854, + "grad_norm": 25.056384584092555, + "learning_rate": 7.902874056387633e-06, + "loss": 2.8396, + "step": 5578 + }, + { + "epoch": 1.1180360721442886, + "grad_norm": 26.69902937309507, + "learning_rate": 7.901924706181435e-06, + "loss": 2.7347, + "step": 5579 + }, + { + "epoch": 1.1182364729458918, + "grad_norm": 21.663545340675576, + "learning_rate": 7.900975198193305e-06, + "loss": 2.7918, + "step": 5580 + }, + { + "epoch": 1.118436873747495, + "grad_norm": 21.010494972602505, + "learning_rate": 7.900025532474876e-06, + "loss": 3.3956, + "step": 5581 + }, + { + "epoch": 1.1186372745490982, + "grad_norm": 20.553090738690187, + "learning_rate": 7.899075709077774e-06, + "loss": 2.194, + "step": 5582 + }, + { + "epoch": 1.1188376753507014, + "grad_norm": 19.637644015797523, + "learning_rate": 7.898125728053648e-06, + "loss": 2.4029, + "step": 5583 + }, + { + "epoch": 1.1190380761523047, + "grad_norm": 22.5930188994421, + "learning_rate": 7.897175589454148e-06, + "loss": 2.5945, + "step": 5584 + }, + { + "epoch": 1.1192384769539079, + "grad_norm": 19.794509328149942, + "learning_rate": 7.896225293330934e-06, + "loss": 2.6709, + "step": 5585 + }, + { + "epoch": 1.119438877755511, + "grad_norm": 27.99337263837587, + 
"learning_rate": 7.895274839735675e-06, + "loss": 2.4035, + "step": 5586 + }, + { + "epoch": 1.1196392785571143, + "grad_norm": 22.485803648414908, + "learning_rate": 7.89432422872005e-06, + "loss": 2.5956, + "step": 5587 + }, + { + "epoch": 1.1198396793587175, + "grad_norm": 20.282092269173045, + "learning_rate": 7.893373460335745e-06, + "loss": 3.0033, + "step": 5588 + }, + { + "epoch": 1.1200400801603205, + "grad_norm": 33.57980225662153, + "learning_rate": 7.892422534634451e-06, + "loss": 2.9302, + "step": 5589 + }, + { + "epoch": 1.1202404809619237, + "grad_norm": 23.08783581775422, + "learning_rate": 7.891471451667877e-06, + "loss": 2.6168, + "step": 5590 + }, + { + "epoch": 1.120440881763527, + "grad_norm": 23.095642702279903, + "learning_rate": 7.890520211487732e-06, + "loss": 3.2951, + "step": 5591 + }, + { + "epoch": 1.1206412825651302, + "grad_norm": 50.030852203494035, + "learning_rate": 7.889568814145737e-06, + "loss": 3.12, + "step": 5592 + }, + { + "epoch": 1.1208416833667334, + "grad_norm": 19.774585409171586, + "learning_rate": 7.888617259693618e-06, + "loss": 2.6586, + "step": 5593 + }, + { + "epoch": 1.1210420841683366, + "grad_norm": 42.3531694762207, + "learning_rate": 7.887665548183115e-06, + "loss": 3.3353, + "step": 5594 + }, + { + "epoch": 1.1212424849699398, + "grad_norm": 24.540091372248842, + "learning_rate": 7.886713679665973e-06, + "loss": 2.7525, + "step": 5595 + }, + { + "epoch": 1.121442885771543, + "grad_norm": 24.001238061513025, + "learning_rate": 7.885761654193948e-06, + "loss": 2.641, + "step": 5596 + }, + { + "epoch": 1.1216432865731463, + "grad_norm": 37.814430283449106, + "learning_rate": 7.884809471818803e-06, + "loss": 3.1065, + "step": 5597 + }, + { + "epoch": 1.1218436873747495, + "grad_norm": 26.017565190758106, + "learning_rate": 7.883857132592308e-06, + "loss": 2.6024, + "step": 5598 + }, + { + "epoch": 1.1220440881763527, + "grad_norm": 27.58780491075037, + "learning_rate": 7.882904636566244e-06, + "loss": 3.0022, + "step": 5599 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 27.532768347385847, + "learning_rate": 7.8819519837924e-06, + "loss": 3.3795, + "step": 5600 + }, + { + "epoch": 1.1224448897795591, + "grad_norm": 32.578144222288515, + "learning_rate": 7.88099917432257e-06, + "loss": 3.4342, + "step": 5601 + }, + { + "epoch": 1.1226452905811624, + "grad_norm": 31.3085451444779, + "learning_rate": 7.880046208208562e-06, + "loss": 2.5515, + "step": 5602 + }, + { + "epoch": 1.1228456913827656, + "grad_norm": 28.898417841193673, + "learning_rate": 7.879093085502192e-06, + "loss": 2.5132, + "step": 5603 + }, + { + "epoch": 1.1230460921843688, + "grad_norm": 67.0716185750441, + "learning_rate": 7.878139806255282e-06, + "loss": 3.2676, + "step": 5604 + }, + { + "epoch": 1.123246492985972, + "grad_norm": 20.431186765254722, + "learning_rate": 7.87718637051966e-06, + "loss": 2.86, + "step": 5605 + }, + { + "epoch": 1.1234468937875752, + "grad_norm": 23.230201601264625, + "learning_rate": 7.876232778347171e-06, + "loss": 2.9815, + "step": 5606 + }, + { + "epoch": 1.1236472945891784, + "grad_norm": 31.50512690338244, + "learning_rate": 7.875279029789657e-06, + "loss": 2.2791, + "step": 5607 + }, + { + "epoch": 1.1238476953907817, + "grad_norm": 21.44558368672819, + "learning_rate": 7.874325124898977e-06, + "loss": 2.6785, + "step": 5608 + }, + { + "epoch": 1.1240480961923849, + "grad_norm": 29.481339182338765, + "learning_rate": 7.873371063727e-06, + "loss": 2.28, + "step": 5609 + }, + { + "epoch": 1.1242484969939879, + "grad_norm": 
25.054826149388827, + "learning_rate": 7.872416846325597e-06, + "loss": 2.7675, + "step": 5610 + }, + { + "epoch": 1.124448897795591, + "grad_norm": 23.201829411172906, + "learning_rate": 7.871462472746649e-06, + "loss": 2.583, + "step": 5611 + }, + { + "epoch": 1.1246492985971943, + "grad_norm": 29.45846211128337, + "learning_rate": 7.870507943042048e-06, + "loss": 2.9586, + "step": 5612 + }, + { + "epoch": 1.1248496993987975, + "grad_norm": 33.520086621142696, + "learning_rate": 7.86955325726369e-06, + "loss": 2.5167, + "step": 5613 + }, + { + "epoch": 1.1250501002004007, + "grad_norm": 25.37918343028737, + "learning_rate": 7.868598415463488e-06, + "loss": 2.9015, + "step": 5614 + }, + { + "epoch": 1.125250501002004, + "grad_norm": 18.278974896930954, + "learning_rate": 7.867643417693358e-06, + "loss": 2.1991, + "step": 5615 + }, + { + "epoch": 1.1254509018036072, + "grad_norm": 25.299290028634026, + "learning_rate": 7.866688264005219e-06, + "loss": 2.9012, + "step": 5616 + }, + { + "epoch": 1.1256513026052104, + "grad_norm": 31.47967028221934, + "learning_rate": 7.865732954451008e-06, + "loss": 3.2628, + "step": 5617 + }, + { + "epoch": 1.1258517034068136, + "grad_norm": 34.20174272756956, + "learning_rate": 7.864777489082667e-06, + "loss": 3.1659, + "step": 5618 + }, + { + "epoch": 1.1260521042084168, + "grad_norm": 34.881883373535445, + "learning_rate": 7.863821867952144e-06, + "loss": 2.8823, + "step": 5619 + }, + { + "epoch": 1.12625250501002, + "grad_norm": 29.144902812124432, + "learning_rate": 7.8628660911114e-06, + "loss": 3.4305, + "step": 5620 + }, + { + "epoch": 1.1264529058116233, + "grad_norm": 20.191673416977583, + "learning_rate": 7.861910158612399e-06, + "loss": 2.5639, + "step": 5621 + }, + { + "epoch": 1.1266533066132265, + "grad_norm": 27.06057086387275, + "learning_rate": 7.86095407050712e-06, + "loss": 2.3188, + "step": 5622 + }, + { + "epoch": 1.1268537074148297, + "grad_norm": 21.736383796823908, + "learning_rate": 7.859997826847543e-06, + "loss": 2.6214, + "step": 5623 + }, + { + "epoch": 1.127054108216433, + "grad_norm": 23.43219287798309, + "learning_rate": 7.859041427685662e-06, + "loss": 2.2966, + "step": 5624 + }, + { + "epoch": 1.1272545090180361, + "grad_norm": 17.484543486744194, + "learning_rate": 7.85808487307348e-06, + "loss": 2.491, + "step": 5625 + }, + { + "epoch": 1.1274549098196394, + "grad_norm": 25.71894550410352, + "learning_rate": 7.857128163063005e-06, + "loss": 2.2764, + "step": 5626 + }, + { + "epoch": 1.1276553106212426, + "grad_norm": 29.079055017228434, + "learning_rate": 7.856171297706255e-06, + "loss": 2.9731, + "step": 5627 + }, + { + "epoch": 1.1278557114228458, + "grad_norm": 20.514239313666412, + "learning_rate": 7.855214277055253e-06, + "loss": 2.9033, + "step": 5628 + }, + { + "epoch": 1.1280561122244488, + "grad_norm": 23.592936186046725, + "learning_rate": 7.854257101162037e-06, + "loss": 2.6342, + "step": 5629 + }, + { + "epoch": 1.128256513026052, + "grad_norm": 31.84842865419957, + "learning_rate": 7.85329977007865e-06, + "loss": 2.3111, + "step": 5630 + }, + { + "epoch": 1.1284569138276552, + "grad_norm": 34.39247243514591, + "learning_rate": 7.852342283857142e-06, + "loss": 3.0421, + "step": 5631 + }, + { + "epoch": 1.1286573146292584, + "grad_norm": 52.03902738534454, + "learning_rate": 7.851384642549574e-06, + "loss": 3.0772, + "step": 5632 + }, + { + "epoch": 1.1288577154308617, + "grad_norm": 50.76230521382987, + "learning_rate": 7.850426846208013e-06, + "loss": 2.5835, + "step": 5633 + }, + { + "epoch": 
1.1290581162324649, + "grad_norm": 24.099946036518816, + "learning_rate": 7.849468894884536e-06, + "loss": 2.5959, + "step": 5634 + }, + { + "epoch": 1.129258517034068, + "grad_norm": 27.791771993690237, + "learning_rate": 7.84851078863123e-06, + "loss": 2.8414, + "step": 5635 + }, + { + "epoch": 1.1294589178356713, + "grad_norm": 25.67847994121488, + "learning_rate": 7.847552527500188e-06, + "loss": 2.6464, + "step": 5636 + }, + { + "epoch": 1.1296593186372745, + "grad_norm": 22.608984658091863, + "learning_rate": 7.846594111543511e-06, + "loss": 2.3838, + "step": 5637 + }, + { + "epoch": 1.1298597194388778, + "grad_norm": 34.04529151300627, + "learning_rate": 7.84563554081331e-06, + "loss": 2.9558, + "step": 5638 + }, + { + "epoch": 1.130060120240481, + "grad_norm": 22.14961760111831, + "learning_rate": 7.844676815361707e-06, + "loss": 2.9242, + "step": 5639 + }, + { + "epoch": 1.1302605210420842, + "grad_norm": 23.016680785724926, + "learning_rate": 7.843717935240823e-06, + "loss": 2.6141, + "step": 5640 + }, + { + "epoch": 1.1304609218436874, + "grad_norm": 25.59411335040978, + "learning_rate": 7.842758900502799e-06, + "loss": 2.1659, + "step": 5641 + }, + { + "epoch": 1.1306613226452906, + "grad_norm": 25.966851751442444, + "learning_rate": 7.841799711199776e-06, + "loss": 2.6418, + "step": 5642 + }, + { + "epoch": 1.1308617234468938, + "grad_norm": 20.971984581505396, + "learning_rate": 7.84084036738391e-06, + "loss": 2.6164, + "step": 5643 + }, + { + "epoch": 1.131062124248497, + "grad_norm": 25.37929655445102, + "learning_rate": 7.839880869107357e-06, + "loss": 2.9603, + "step": 5644 + }, + { + "epoch": 1.1312625250501003, + "grad_norm": 28.691313374381245, + "learning_rate": 7.838921216422288e-06, + "loss": 2.8535, + "step": 5645 + }, + { + "epoch": 1.1314629258517035, + "grad_norm": 17.10587802433604, + "learning_rate": 7.837961409380885e-06, + "loss": 2.701, + "step": 5646 + }, + { + "epoch": 1.1316633266533067, + "grad_norm": 21.932955931937013, + "learning_rate": 7.83700144803533e-06, + "loss": 2.7636, + "step": 5647 + }, + { + "epoch": 1.1318637274549097, + "grad_norm": 39.628677012210645, + "learning_rate": 7.836041332437817e-06, + "loss": 2.5682, + "step": 5648 + }, + { + "epoch": 1.1320641282565131, + "grad_norm": 27.59342913843146, + "learning_rate": 7.835081062640555e-06, + "loss": 2.9607, + "step": 5649 + }, + { + "epoch": 1.1322645290581161, + "grad_norm": 43.10738390031757, + "learning_rate": 7.834120638695746e-06, + "loss": 2.7965, + "step": 5650 + }, + { + "epoch": 1.1324649298597194, + "grad_norm": 26.598019532392126, + "learning_rate": 7.833160060655616e-06, + "loss": 2.7193, + "step": 5651 + }, + { + "epoch": 1.1326653306613226, + "grad_norm": 35.0813645434162, + "learning_rate": 7.832199328572391e-06, + "loss": 2.9614, + "step": 5652 + }, + { + "epoch": 1.1328657314629258, + "grad_norm": 31.553116195833137, + "learning_rate": 7.831238442498309e-06, + "loss": 2.3078, + "step": 5653 + }, + { + "epoch": 1.133066132264529, + "grad_norm": 24.309638838389578, + "learning_rate": 7.830277402485612e-06, + "loss": 2.706, + "step": 5654 + }, + { + "epoch": 1.1332665330661322, + "grad_norm": 31.959498548066225, + "learning_rate": 7.829316208586554e-06, + "loss": 2.799, + "step": 5655 + }, + { + "epoch": 1.1334669338677354, + "grad_norm": 35.9446787161417, + "learning_rate": 7.8283548608534e-06, + "loss": 2.6518, + "step": 5656 + }, + { + "epoch": 1.1336673346693387, + "grad_norm": 60.31925517104818, + "learning_rate": 7.827393359338415e-06, + "loss": 2.9548, + "step": 5657 
+ }, + { + "epoch": 1.1338677354709419, + "grad_norm": 25.238060903906426, + "learning_rate": 7.826431704093881e-06, + "loss": 3.2127, + "step": 5658 + }, + { + "epoch": 1.134068136272545, + "grad_norm": 29.481746271583848, + "learning_rate": 7.825469895172083e-06, + "loss": 2.7519, + "step": 5659 + }, + { + "epoch": 1.1342685370741483, + "grad_norm": 38.14750694015546, + "learning_rate": 7.824507932625315e-06, + "loss": 3.3366, + "step": 5660 + }, + { + "epoch": 1.1344689378757515, + "grad_norm": 24.746842334923198, + "learning_rate": 7.823545816505882e-06, + "loss": 2.739, + "step": 5661 + }, + { + "epoch": 1.1346693386773548, + "grad_norm": 23.97864201847154, + "learning_rate": 7.822583546866094e-06, + "loss": 2.8793, + "step": 5662 + }, + { + "epoch": 1.134869739478958, + "grad_norm": 21.419505440399238, + "learning_rate": 7.821621123758273e-06, + "loss": 2.3668, + "step": 5663 + }, + { + "epoch": 1.1350701402805612, + "grad_norm": 20.87713110291656, + "learning_rate": 7.820658547234748e-06, + "loss": 2.7362, + "step": 5664 + }, + { + "epoch": 1.1352705410821644, + "grad_norm": 21.664385849733687, + "learning_rate": 7.81969581734785e-06, + "loss": 2.7667, + "step": 5665 + }, + { + "epoch": 1.1354709418837676, + "grad_norm": 22.780780385202103, + "learning_rate": 7.818732934149931e-06, + "loss": 2.5281, + "step": 5666 + }, + { + "epoch": 1.1356713426853706, + "grad_norm": 33.670298624320885, + "learning_rate": 7.81776989769334e-06, + "loss": 2.5884, + "step": 5667 + }, + { + "epoch": 1.135871743486974, + "grad_norm": 17.311421811015542, + "learning_rate": 7.816806708030442e-06, + "loss": 2.5166, + "step": 5668 + }, + { + "epoch": 1.136072144288577, + "grad_norm": 44.44460996151145, + "learning_rate": 7.815843365213605e-06, + "loss": 3.2516, + "step": 5669 + }, + { + "epoch": 1.1362725450901803, + "grad_norm": 53.20720018753899, + "learning_rate": 7.814879869295207e-06, + "loss": 2.9256, + "step": 5670 + }, + { + "epoch": 1.1364729458917835, + "grad_norm": 27.17620304613942, + "learning_rate": 7.813916220327636e-06, + "loss": 2.8833, + "step": 5671 + }, + { + "epoch": 1.1366733466933867, + "grad_norm": 19.4080553699755, + "learning_rate": 7.812952418363286e-06, + "loss": 2.8649, + "step": 5672 + }, + { + "epoch": 1.13687374749499, + "grad_norm": 21.66827148952406, + "learning_rate": 7.811988463454561e-06, + "loss": 2.4864, + "step": 5673 + }, + { + "epoch": 1.1370741482965931, + "grad_norm": 58.19318687832935, + "learning_rate": 7.811024355653872e-06, + "loss": 2.5936, + "step": 5674 + }, + { + "epoch": 1.1372745490981964, + "grad_norm": 21.724020200317888, + "learning_rate": 7.810060095013638e-06, + "loss": 2.626, + "step": 5675 + }, + { + "epoch": 1.1374749498997996, + "grad_norm": 47.623027831222295, + "learning_rate": 7.809095681586292e-06, + "loss": 2.7392, + "step": 5676 + }, + { + "epoch": 1.1376753507014028, + "grad_norm": 48.195439267167906, + "learning_rate": 7.808131115424264e-06, + "loss": 2.5481, + "step": 5677 + }, + { + "epoch": 1.137875751503006, + "grad_norm": 25.759165893404845, + "learning_rate": 7.807166396580003e-06, + "loss": 2.3834, + "step": 5678 + }, + { + "epoch": 1.1380761523046092, + "grad_norm": 32.47002769633577, + "learning_rate": 7.80620152510596e-06, + "loss": 2.8371, + "step": 5679 + }, + { + "epoch": 1.1382765531062125, + "grad_norm": 33.16921222949504, + "learning_rate": 7.805236501054601e-06, + "loss": 3.0933, + "step": 5680 + }, + { + "epoch": 1.1384769539078157, + "grad_norm": 28.508437349541936, + "learning_rate": 7.804271324478391e-06, + "loss": 
2.8024, + "step": 5681 + }, + { + "epoch": 1.1386773547094189, + "grad_norm": 24.81881579688282, + "learning_rate": 7.803305995429808e-06, + "loss": 2.5307, + "step": 5682 + }, + { + "epoch": 1.138877755511022, + "grad_norm": 20.759452959984174, + "learning_rate": 7.802340513961343e-06, + "loss": 2.7703, + "step": 5683 + }, + { + "epoch": 1.1390781563126253, + "grad_norm": 24.895412629258423, + "learning_rate": 7.801374880125484e-06, + "loss": 3.0544, + "step": 5684 + }, + { + "epoch": 1.1392785571142285, + "grad_norm": 25.275513966649317, + "learning_rate": 7.800409093974742e-06, + "loss": 2.297, + "step": 5685 + }, + { + "epoch": 1.1394789579158318, + "grad_norm": 32.275542692530706, + "learning_rate": 7.79944315556162e-06, + "loss": 2.5576, + "step": 5686 + }, + { + "epoch": 1.139679358717435, + "grad_norm": 54.320343906581904, + "learning_rate": 7.798477064938644e-06, + "loss": 2.8721, + "step": 5687 + }, + { + "epoch": 1.139879759519038, + "grad_norm": 20.242369857766846, + "learning_rate": 7.797510822158336e-06, + "loss": 2.5278, + "step": 5688 + }, + { + "epoch": 1.1400801603206412, + "grad_norm": 19.774055922547905, + "learning_rate": 7.796544427273237e-06, + "loss": 2.5874, + "step": 5689 + }, + { + "epoch": 1.1402805611222444, + "grad_norm": 23.952631495054206, + "learning_rate": 7.79557788033589e-06, + "loss": 3.0628, + "step": 5690 + }, + { + "epoch": 1.1404809619238476, + "grad_norm": 20.995788702840244, + "learning_rate": 7.794611181398846e-06, + "loss": 2.7305, + "step": 5691 + }, + { + "epoch": 1.1406813627254508, + "grad_norm": 31.21325386610617, + "learning_rate": 7.793644330514666e-06, + "loss": 2.3414, + "step": 5692 + }, + { + "epoch": 1.140881763527054, + "grad_norm": 27.712896346487042, + "learning_rate": 7.792677327735922e-06, + "loss": 3.0088, + "step": 5693 + }, + { + "epoch": 1.1410821643286573, + "grad_norm": 27.58220271613234, + "learning_rate": 7.791710173115189e-06, + "loss": 2.7643, + "step": 5694 + }, + { + "epoch": 1.1412825651302605, + "grad_norm": 20.175248640453738, + "learning_rate": 7.79074286670505e-06, + "loss": 2.63, + "step": 5695 + }, + { + "epoch": 1.1414829659318637, + "grad_norm": 54.41655137459165, + "learning_rate": 7.789775408558105e-06, + "loss": 2.9778, + "step": 5696 + }, + { + "epoch": 1.141683366733467, + "grad_norm": 29.037437072036973, + "learning_rate": 7.788807798726951e-06, + "loss": 3.137, + "step": 5697 + }, + { + "epoch": 1.1418837675350701, + "grad_norm": 26.657955570067713, + "learning_rate": 7.787840037264203e-06, + "loss": 3.0116, + "step": 5698 + }, + { + "epoch": 1.1420841683366734, + "grad_norm": 22.110736929297893, + "learning_rate": 7.786872124222472e-06, + "loss": 2.7419, + "step": 5699 + }, + { + "epoch": 1.1422845691382766, + "grad_norm": 22.572676455923, + "learning_rate": 7.785904059654395e-06, + "loss": 2.5004, + "step": 5700 + }, + { + "epoch": 1.1424849699398798, + "grad_norm": 30.89212662787774, + "learning_rate": 7.784935843612599e-06, + "loss": 2.4906, + "step": 5701 + }, + { + "epoch": 1.142685370741483, + "grad_norm": 20.874865098872224, + "learning_rate": 7.783967476149732e-06, + "loss": 2.8606, + "step": 5702 + }, + { + "epoch": 1.1428857715430862, + "grad_norm": 21.811018267632484, + "learning_rate": 7.782998957318444e-06, + "loss": 2.8414, + "step": 5703 + }, + { + "epoch": 1.1430861723446895, + "grad_norm": 34.66835778660148, + "learning_rate": 7.782030287171393e-06, + "loss": 2.8981, + "step": 5704 + }, + { + "epoch": 1.1432865731462927, + "grad_norm": 22.851998686804063, + "learning_rate": 
7.78106146576125e-06, + "loss": 2.3865, + "step": 5705 + }, + { + "epoch": 1.143486973947896, + "grad_norm": 16.303118081279734, + "learning_rate": 7.780092493140688e-06, + "loss": 2.598, + "step": 5706 + }, + { + "epoch": 1.1436873747494989, + "grad_norm": 21.3047487381606, + "learning_rate": 7.779123369362394e-06, + "loss": 2.8669, + "step": 5707 + }, + { + "epoch": 1.1438877755511023, + "grad_norm": 25.639411267633825, + "learning_rate": 7.778154094479062e-06, + "loss": 3.1595, + "step": 5708 + }, + { + "epoch": 1.1440881763527053, + "grad_norm": 18.51503208382476, + "learning_rate": 7.777184668543388e-06, + "loss": 2.7211, + "step": 5709 + }, + { + "epoch": 1.1442885771543085, + "grad_norm": 31.087660894427408, + "learning_rate": 7.776215091608087e-06, + "loss": 2.9254, + "step": 5710 + }, + { + "epoch": 1.1444889779559118, + "grad_norm": 24.293848344128115, + "learning_rate": 7.775245363725873e-06, + "loss": 3.2842, + "step": 5711 + }, + { + "epoch": 1.144689378757515, + "grad_norm": 26.14254012155614, + "learning_rate": 7.774275484949469e-06, + "loss": 2.6423, + "step": 5712 + }, + { + "epoch": 1.1448897795591182, + "grad_norm": 20.596188540761787, + "learning_rate": 7.773305455331613e-06, + "loss": 2.7649, + "step": 5713 + }, + { + "epoch": 1.1450901803607214, + "grad_norm": 24.591817765519384, + "learning_rate": 7.772335274925047e-06, + "loss": 2.746, + "step": 5714 + }, + { + "epoch": 1.1452905811623246, + "grad_norm": 28.059265070865386, + "learning_rate": 7.771364943782519e-06, + "loss": 3.4627, + "step": 5715 + }, + { + "epoch": 1.1454909819639278, + "grad_norm": 27.9892935983681, + "learning_rate": 7.770394461956787e-06, + "loss": 3.3483, + "step": 5716 + }, + { + "epoch": 1.145691382765531, + "grad_norm": 25.0672101885868, + "learning_rate": 7.769423829500619e-06, + "loss": 2.9833, + "step": 5717 + }, + { + "epoch": 1.1458917835671343, + "grad_norm": 24.9973462862546, + "learning_rate": 7.768453046466791e-06, + "loss": 2.7503, + "step": 5718 + }, + { + "epoch": 1.1460921843687375, + "grad_norm": 42.25510250970786, + "learning_rate": 7.767482112908083e-06, + "loss": 3.0165, + "step": 5719 + }, + { + "epoch": 1.1462925851703407, + "grad_norm": 25.463068340869228, + "learning_rate": 7.766511028877289e-06, + "loss": 2.5667, + "step": 5720 + }, + { + "epoch": 1.146492985971944, + "grad_norm": 33.97415219543488, + "learning_rate": 7.765539794427204e-06, + "loss": 2.9163, + "step": 5721 + }, + { + "epoch": 1.1466933867735472, + "grad_norm": 20.761850058844473, + "learning_rate": 7.76456840961064e-06, + "loss": 2.5839, + "step": 5722 + }, + { + "epoch": 1.1468937875751504, + "grad_norm": 35.133835424315244, + "learning_rate": 7.76359687448041e-06, + "loss": 3.0756, + "step": 5723 + }, + { + "epoch": 1.1470941883767536, + "grad_norm": 35.940875015653525, + "learning_rate": 7.762625189089337e-06, + "loss": 2.6876, + "step": 5724 + }, + { + "epoch": 1.1472945891783568, + "grad_norm": 22.528600958264697, + "learning_rate": 7.761653353490258e-06, + "loss": 3.193, + "step": 5725 + }, + { + "epoch": 1.1474949899799598, + "grad_norm": 23.45970007868568, + "learning_rate": 7.760681367736006e-06, + "loss": 2.693, + "step": 5726 + }, + { + "epoch": 1.1476953907815632, + "grad_norm": 21.21416047791187, + "learning_rate": 7.759709231879436e-06, + "loss": 2.8496, + "step": 5727 + }, + { + "epoch": 1.1478957915831662, + "grad_norm": 19.28539417685495, + "learning_rate": 7.7587369459734e-06, + "loss": 2.5032, + "step": 5728 + }, + { + "epoch": 1.1480961923847695, + "grad_norm": 23.729577006491397, + 
"learning_rate": 7.757764510070763e-06, + "loss": 2.6352, + "step": 5729 + }, + { + "epoch": 1.1482965931863727, + "grad_norm": 21.5435437995842, + "learning_rate": 7.756791924224398e-06, + "loss": 2.5314, + "step": 5730 + }, + { + "epoch": 1.148496993987976, + "grad_norm": 30.60686927505142, + "learning_rate": 7.755819188487189e-06, + "loss": 3.3817, + "step": 5731 + }, + { + "epoch": 1.148697394789579, + "grad_norm": 27.278561858894076, + "learning_rate": 7.754846302912023e-06, + "loss": 3.4571, + "step": 5732 + }, + { + "epoch": 1.1488977955911823, + "grad_norm": 28.991409216673137, + "learning_rate": 7.753873267551795e-06, + "loss": 2.6023, + "step": 5733 + }, + { + "epoch": 1.1490981963927855, + "grad_norm": 24.313476379863957, + "learning_rate": 7.752900082459413e-06, + "loss": 2.7644, + "step": 5734 + }, + { + "epoch": 1.1492985971943888, + "grad_norm": 23.436283101729394, + "learning_rate": 7.75192674768779e-06, + "loss": 2.4895, + "step": 5735 + }, + { + "epoch": 1.149498997995992, + "grad_norm": 38.261138812671355, + "learning_rate": 7.750953263289848e-06, + "loss": 3.1617, + "step": 5736 + }, + { + "epoch": 1.1496993987975952, + "grad_norm": 46.02101479276165, + "learning_rate": 7.749979629318517e-06, + "loss": 2.5515, + "step": 5737 + }, + { + "epoch": 1.1498997995991984, + "grad_norm": 28.20843266539138, + "learning_rate": 7.749005845826732e-06, + "loss": 2.7605, + "step": 5738 + }, + { + "epoch": 1.1501002004008016, + "grad_norm": 29.046112577498846, + "learning_rate": 7.748031912867442e-06, + "loss": 2.3451, + "step": 5739 + }, + { + "epoch": 1.1503006012024048, + "grad_norm": 23.108827752938392, + "learning_rate": 7.747057830493602e-06, + "loss": 2.7854, + "step": 5740 + }, + { + "epoch": 1.150501002004008, + "grad_norm": 37.268856059140624, + "learning_rate": 7.746083598758172e-06, + "loss": 2.9273, + "step": 5741 + }, + { + "epoch": 1.1507014028056113, + "grad_norm": 24.23397652373812, + "learning_rate": 7.745109217714121e-06, + "loss": 2.0646, + "step": 5742 + }, + { + "epoch": 1.1509018036072145, + "grad_norm": 25.198053187403634, + "learning_rate": 7.744134687414432e-06, + "loss": 2.5273, + "step": 5743 + }, + { + "epoch": 1.1511022044088177, + "grad_norm": 17.443015598458597, + "learning_rate": 7.74316000791209e-06, + "loss": 2.6259, + "step": 5744 + }, + { + "epoch": 1.151302605210421, + "grad_norm": 28.548864268978427, + "learning_rate": 7.742185179260087e-06, + "loss": 2.7941, + "step": 5745 + }, + { + "epoch": 1.1515030060120242, + "grad_norm": 31.025495545149322, + "learning_rate": 7.741210201511429e-06, + "loss": 2.4125, + "step": 5746 + }, + { + "epoch": 1.1517034068136272, + "grad_norm": 25.666073574985038, + "learning_rate": 7.740235074719124e-06, + "loss": 2.6728, + "step": 5747 + }, + { + "epoch": 1.1519038076152304, + "grad_norm": 29.53284310097078, + "learning_rate": 7.739259798936196e-06, + "loss": 2.6817, + "step": 5748 + }, + { + "epoch": 1.1521042084168336, + "grad_norm": 33.58644933651165, + "learning_rate": 7.738284374215668e-06, + "loss": 2.8292, + "step": 5749 + }, + { + "epoch": 1.1523046092184368, + "grad_norm": 20.544806687476562, + "learning_rate": 7.737308800610576e-06, + "loss": 2.5345, + "step": 5750 + }, + { + "epoch": 1.15250501002004, + "grad_norm": 21.189676467528827, + "learning_rate": 7.736333078173964e-06, + "loss": 2.1603, + "step": 5751 + }, + { + "epoch": 1.1527054108216432, + "grad_norm": 18.678452770806686, + "learning_rate": 7.735357206958885e-06, + "loss": 2.6075, + "step": 5752 + }, + { + "epoch": 1.1529058116232465, + 
"grad_norm": 18.73902814211181, + "learning_rate": 7.734381187018395e-06, + "loss": 2.8128, + "step": 5753 + }, + { + "epoch": 1.1531062124248497, + "grad_norm": 23.193827255814593, + "learning_rate": 7.733405018405565e-06, + "loss": 2.5577, + "step": 5754 + }, + { + "epoch": 1.153306613226453, + "grad_norm": 24.341359826262625, + "learning_rate": 7.732428701173468e-06, + "loss": 2.8256, + "step": 5755 + }, + { + "epoch": 1.153507014028056, + "grad_norm": 27.395969113267952, + "learning_rate": 7.73145223537519e-06, + "loss": 2.8878, + "step": 5756 + }, + { + "epoch": 1.1537074148296593, + "grad_norm": 39.27946739693835, + "learning_rate": 7.73047562106382e-06, + "loss": 2.7258, + "step": 5757 + }, + { + "epoch": 1.1539078156312625, + "grad_norm": 51.7289958025361, + "learning_rate": 7.729498858292462e-06, + "loss": 2.8947, + "step": 5758 + }, + { + "epoch": 1.1541082164328658, + "grad_norm": 33.42955391466862, + "learning_rate": 7.728521947114221e-06, + "loss": 2.466, + "step": 5759 + }, + { + "epoch": 1.154308617234469, + "grad_norm": 25.69792538738647, + "learning_rate": 7.727544887582214e-06, + "loss": 3.0434, + "step": 5760 + }, + { + "epoch": 1.1545090180360722, + "grad_norm": 23.97140460323991, + "learning_rate": 7.726567679749566e-06, + "loss": 2.7293, + "step": 5761 + }, + { + "epoch": 1.1547094188376754, + "grad_norm": 29.806208437645257, + "learning_rate": 7.72559032366941e-06, + "loss": 3.0233, + "step": 5762 + }, + { + "epoch": 1.1549098196392786, + "grad_norm": 31.719053207408436, + "learning_rate": 7.72461281939488e-06, + "loss": 2.8852, + "step": 5763 + }, + { + "epoch": 1.1551102204408819, + "grad_norm": 21.23151937164916, + "learning_rate": 7.723635166979133e-06, + "loss": 3.1153, + "step": 5764 + }, + { + "epoch": 1.155310621242485, + "grad_norm": 26.58167678645099, + "learning_rate": 7.72265736647532e-06, + "loss": 3.0146, + "step": 5765 + }, + { + "epoch": 1.155511022044088, + "grad_norm": 20.417792619663892, + "learning_rate": 7.721679417936609e-06, + "loss": 2.6252, + "step": 5766 + }, + { + "epoch": 1.1557114228456915, + "grad_norm": 18.906674738402007, + "learning_rate": 7.720701321416169e-06, + "loss": 2.8343, + "step": 5767 + }, + { + "epoch": 1.1559118236472945, + "grad_norm": 57.41510900478538, + "learning_rate": 7.719723076967181e-06, + "loss": 2.6282, + "step": 5768 + }, + { + "epoch": 1.1561122244488977, + "grad_norm": 33.9740954189521, + "learning_rate": 7.718744684642836e-06, + "loss": 2.6511, + "step": 5769 + }, + { + "epoch": 1.156312625250501, + "grad_norm": 26.682313992067314, + "learning_rate": 7.717766144496327e-06, + "loss": 2.0721, + "step": 5770 + }, + { + "epoch": 1.1565130260521042, + "grad_norm": 35.83254263563088, + "learning_rate": 7.716787456580865e-06, + "loss": 2.718, + "step": 5771 + }, + { + "epoch": 1.1567134268537074, + "grad_norm": 16.985076844732856, + "learning_rate": 7.715808620949657e-06, + "loss": 2.8601, + "step": 5772 + }, + { + "epoch": 1.1569138276553106, + "grad_norm": 24.043538690435284, + "learning_rate": 7.714829637655924e-06, + "loss": 2.2733, + "step": 5773 + }, + { + "epoch": 1.1571142284569138, + "grad_norm": 32.236838087178626, + "learning_rate": 7.713850506752898e-06, + "loss": 2.8091, + "step": 5774 + }, + { + "epoch": 1.157314629258517, + "grad_norm": 24.247542116650052, + "learning_rate": 7.712871228293812e-06, + "loss": 2.7359, + "step": 5775 + }, + { + "epoch": 1.1575150300601202, + "grad_norm": 19.834642376171693, + "learning_rate": 7.711891802331915e-06, + "loss": 2.7092, + "step": 5776 + }, + { + "epoch": 
1.1577154308617235, + "grad_norm": 28.07712075015899, + "learning_rate": 7.710912228920455e-06, + "loss": 2.6758, + "step": 5777 + }, + { + "epoch": 1.1579158316633267, + "grad_norm": 52.77835362189873, + "learning_rate": 7.709932508112696e-06, + "loss": 2.7749, + "step": 5778 + }, + { + "epoch": 1.15811623246493, + "grad_norm": 29.793469261397174, + "learning_rate": 7.708952639961908e-06, + "loss": 2.8423, + "step": 5779 + }, + { + "epoch": 1.1583166332665331, + "grad_norm": 24.086468172386812, + "learning_rate": 7.707972624521364e-06, + "loss": 2.9865, + "step": 5780 + }, + { + "epoch": 1.1585170340681363, + "grad_norm": 16.661759917972297, + "learning_rate": 7.706992461844354e-06, + "loss": 2.9133, + "step": 5781 + }, + { + "epoch": 1.1587174348697395, + "grad_norm": 21.883671478794433, + "learning_rate": 7.706012151984165e-06, + "loss": 2.7535, + "step": 5782 + }, + { + "epoch": 1.1589178356713428, + "grad_norm": 22.234767252667186, + "learning_rate": 7.705031694994102e-06, + "loss": 2.5338, + "step": 5783 + }, + { + "epoch": 1.159118236472946, + "grad_norm": 21.89417722701697, + "learning_rate": 7.704051090927473e-06, + "loss": 2.3667, + "step": 5784 + }, + { + "epoch": 1.159318637274549, + "grad_norm": 29.899467241585658, + "learning_rate": 7.703070339837595e-06, + "loss": 2.9929, + "step": 5785 + }, + { + "epoch": 1.1595190380761524, + "grad_norm": 23.37730232551806, + "learning_rate": 7.702089441777793e-06, + "loss": 2.6492, + "step": 5786 + }, + { + "epoch": 1.1597194388777554, + "grad_norm": 21.074834927706284, + "learning_rate": 7.701108396801398e-06, + "loss": 3.1932, + "step": 5787 + }, + { + "epoch": 1.1599198396793586, + "grad_norm": 37.57510323849078, + "learning_rate": 7.700127204961753e-06, + "loss": 2.8732, + "step": 5788 + }, + { + "epoch": 1.1601202404809619, + "grad_norm": 26.112062800707477, + "learning_rate": 7.699145866312205e-06, + "loss": 2.772, + "step": 5789 + }, + { + "epoch": 1.160320641282565, + "grad_norm": 27.2124778993136, + "learning_rate": 7.698164380906112e-06, + "loss": 3.1063, + "step": 5790 + }, + { + "epoch": 1.1605210420841683, + "grad_norm": 17.412195456602003, + "learning_rate": 7.697182748796841e-06, + "loss": 2.4224, + "step": 5791 + }, + { + "epoch": 1.1607214428857715, + "grad_norm": 38.91883939371399, + "learning_rate": 7.69620097003776e-06, + "loss": 2.9699, + "step": 5792 + }, + { + "epoch": 1.1609218436873747, + "grad_norm": 33.87338496369091, + "learning_rate": 7.695219044682254e-06, + "loss": 3.1935, + "step": 5793 + }, + { + "epoch": 1.161122244488978, + "grad_norm": 25.796789401101822, + "learning_rate": 7.694236972783708e-06, + "loss": 2.1853, + "step": 5794 + }, + { + "epoch": 1.1613226452905812, + "grad_norm": 20.091195756467773, + "learning_rate": 7.693254754395523e-06, + "loss": 2.7013, + "step": 5795 + }, + { + "epoch": 1.1615230460921844, + "grad_norm": 23.7804301221773, + "learning_rate": 7.6922723895711e-06, + "loss": 2.4083, + "step": 5796 + }, + { + "epoch": 1.1617234468937876, + "grad_norm": 23.738670505133133, + "learning_rate": 7.691289878363854e-06, + "loss": 2.7643, + "step": 5797 + }, + { + "epoch": 1.1619238476953908, + "grad_norm": 28.665093149137633, + "learning_rate": 7.690307220827204e-06, + "loss": 2.8514, + "step": 5798 + }, + { + "epoch": 1.162124248496994, + "grad_norm": 69.59913018789432, + "learning_rate": 7.68932441701458e-06, + "loss": 3.0628, + "step": 5799 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 70.32352040097544, + "learning_rate": 7.688341466979416e-06, + "loss": 2.6122, + "step": 5800 
+ }, + { + "epoch": 1.1625250501002005, + "grad_norm": 31.197690523149777, + "learning_rate": 7.68735837077516e-06, + "loss": 2.7015, + "step": 5801 + }, + { + "epoch": 1.1627254509018037, + "grad_norm": 26.557681089782122, + "learning_rate": 7.686375128455262e-06, + "loss": 2.9066, + "step": 5802 + }, + { + "epoch": 1.162925851703407, + "grad_norm": 16.57438249807096, + "learning_rate": 7.685391740073182e-06, + "loss": 2.1532, + "step": 5803 + }, + { + "epoch": 1.1631262525050101, + "grad_norm": 20.89354270024853, + "learning_rate": 7.684408205682389e-06, + "loss": 3.0928, + "step": 5804 + }, + { + "epoch": 1.1633266533066133, + "grad_norm": 25.723435346779084, + "learning_rate": 7.68342452533636e-06, + "loss": 2.4457, + "step": 5805 + }, + { + "epoch": 1.1635270541082163, + "grad_norm": 33.040092079637404, + "learning_rate": 7.682440699088577e-06, + "loss": 2.79, + "step": 5806 + }, + { + "epoch": 1.1637274549098195, + "grad_norm": 22.213437870454168, + "learning_rate": 7.681456726992535e-06, + "loss": 2.5861, + "step": 5807 + }, + { + "epoch": 1.1639278557114228, + "grad_norm": 37.48125126945824, + "learning_rate": 7.680472609101732e-06, + "loss": 2.33, + "step": 5808 + }, + { + "epoch": 1.164128256513026, + "grad_norm": 27.83003831320637, + "learning_rate": 7.679488345469678e-06, + "loss": 2.786, + "step": 5809 + }, + { + "epoch": 1.1643286573146292, + "grad_norm": 40.776725371218426, + "learning_rate": 7.678503936149886e-06, + "loss": 2.8633, + "step": 5810 + }, + { + "epoch": 1.1645290581162324, + "grad_norm": 30.546699758830183, + "learning_rate": 7.67751938119588e-06, + "loss": 3.5123, + "step": 5811 + }, + { + "epoch": 1.1647294589178356, + "grad_norm": 24.282364494872162, + "learning_rate": 7.676534680661194e-06, + "loss": 2.8007, + "step": 5812 + }, + { + "epoch": 1.1649298597194389, + "grad_norm": 32.53288630081589, + "learning_rate": 7.675549834599368e-06, + "loss": 2.7769, + "step": 5813 + }, + { + "epoch": 1.165130260521042, + "grad_norm": 540.9221363132799, + "learning_rate": 7.674564843063945e-06, + "loss": 3.1843, + "step": 5814 + }, + { + "epoch": 1.1653306613226453, + "grad_norm": 19.647794919998102, + "learning_rate": 7.673579706108486e-06, + "loss": 2.4893, + "step": 5815 + }, + { + "epoch": 1.1655310621242485, + "grad_norm": 33.004197542791765, + "learning_rate": 7.67259442378655e-06, + "loss": 3.3372, + "step": 5816 + }, + { + "epoch": 1.1657314629258517, + "grad_norm": 31.123817330486485, + "learning_rate": 7.671608996151712e-06, + "loss": 2.8532, + "step": 5817 + }, + { + "epoch": 1.165931863727455, + "grad_norm": 32.5286652782727, + "learning_rate": 7.670623423257548e-06, + "loss": 3.0778, + "step": 5818 + }, + { + "epoch": 1.1661322645290582, + "grad_norm": 27.18888920628188, + "learning_rate": 7.669637705157647e-06, + "loss": 2.4905, + "step": 5819 + }, + { + "epoch": 1.1663326653306614, + "grad_norm": 27.707983217778708, + "learning_rate": 7.668651841905602e-06, + "loss": 2.6562, + "step": 5820 + }, + { + "epoch": 1.1665330661322646, + "grad_norm": 19.031824791543663, + "learning_rate": 7.667665833555019e-06, + "loss": 2.22, + "step": 5821 + }, + { + "epoch": 1.1667334669338678, + "grad_norm": 25.513578859106836, + "learning_rate": 7.666679680159503e-06, + "loss": 3.0112, + "step": 5822 + }, + { + "epoch": 1.166933867735471, + "grad_norm": 20.919625840022498, + "learning_rate": 7.66569338177268e-06, + "loss": 2.7337, + "step": 5823 + }, + { + "epoch": 1.1671342685370742, + "grad_norm": 22.489467280319506, + "learning_rate": 7.664706938448173e-06, + "loss": 
3.121, + "step": 5824 + }, + { + "epoch": 1.1673346693386772, + "grad_norm": 38.571733558110424, + "learning_rate": 7.663720350239613e-06, + "loss": 3.1949, + "step": 5825 + }, + { + "epoch": 1.1675350701402807, + "grad_norm": 19.949246893915475, + "learning_rate": 7.66273361720065e-06, + "loss": 2.5029, + "step": 5826 + }, + { + "epoch": 1.1677354709418837, + "grad_norm": 29.14177527291755, + "learning_rate": 7.661746739384925e-06, + "loss": 2.4903, + "step": 5827 + }, + { + "epoch": 1.167935871743487, + "grad_norm": 29.004676249564312, + "learning_rate": 7.660759716846105e-06, + "loss": 2.4711, + "step": 5828 + }, + { + "epoch": 1.1681362725450901, + "grad_norm": 27.60687498895618, + "learning_rate": 7.659772549637849e-06, + "loss": 2.681, + "step": 5829 + }, + { + "epoch": 1.1683366733466933, + "grad_norm": 25.119378915872037, + "learning_rate": 7.658785237813833e-06, + "loss": 2.8105, + "step": 5830 + }, + { + "epoch": 1.1685370741482966, + "grad_norm": 23.972966946459717, + "learning_rate": 7.65779778142774e-06, + "loss": 2.678, + "step": 5831 + }, + { + "epoch": 1.1687374749498998, + "grad_norm": 25.425574654165874, + "learning_rate": 7.656810180533258e-06, + "loss": 3.1609, + "step": 5832 + }, + { + "epoch": 1.168937875751503, + "grad_norm": 22.36468849113373, + "learning_rate": 7.655822435184085e-06, + "loss": 2.4164, + "step": 5833 + }, + { + "epoch": 1.1691382765531062, + "grad_norm": 16.237766061572785, + "learning_rate": 7.654834545433927e-06, + "loss": 2.4924, + "step": 5834 + }, + { + "epoch": 1.1693386773547094, + "grad_norm": 20.76732302805341, + "learning_rate": 7.653846511336494e-06, + "loss": 2.5737, + "step": 5835 + }, + { + "epoch": 1.1695390781563126, + "grad_norm": 20.224869193758646, + "learning_rate": 7.65285833294551e-06, + "loss": 2.826, + "step": 5836 + }, + { + "epoch": 1.1697394789579159, + "grad_norm": 25.230375462185762, + "learning_rate": 7.651870010314701e-06, + "loss": 2.4984, + "step": 5837 + }, + { + "epoch": 1.169939879759519, + "grad_norm": 16.83166133592596, + "learning_rate": 7.650881543497806e-06, + "loss": 2.3279, + "step": 5838 + }, + { + "epoch": 1.1701402805611223, + "grad_norm": 81.38902825824864, + "learning_rate": 7.649892932548568e-06, + "loss": 2.7929, + "step": 5839 + }, + { + "epoch": 1.1703406813627255, + "grad_norm": 55.50997966528397, + "learning_rate": 7.64890417752074e-06, + "loss": 2.5904, + "step": 5840 + }, + { + "epoch": 1.1705410821643287, + "grad_norm": 48.503980532817394, + "learning_rate": 7.647915278468081e-06, + "loss": 2.6128, + "step": 5841 + }, + { + "epoch": 1.170741482965932, + "grad_norm": 65.06036108625584, + "learning_rate": 7.64692623544436e-06, + "loss": 3.0727, + "step": 5842 + }, + { + "epoch": 1.1709418837675352, + "grad_norm": 20.311863303399107, + "learning_rate": 7.645937048503353e-06, + "loss": 3.1241, + "step": 5843 + }, + { + "epoch": 1.1711422845691382, + "grad_norm": 24.952966175260727, + "learning_rate": 7.644947717698842e-06, + "loss": 2.5608, + "step": 5844 + }, + { + "epoch": 1.1713426853707416, + "grad_norm": 21.110114199651782, + "learning_rate": 7.64395824308462e-06, + "loss": 2.6718, + "step": 5845 + }, + { + "epoch": 1.1715430861723446, + "grad_norm": 30.923809660037474, + "learning_rate": 7.642968624714487e-06, + "loss": 2.6901, + "step": 5846 + }, + { + "epoch": 1.1717434869739478, + "grad_norm": 43.58337281199379, + "learning_rate": 7.641978862642245e-06, + "loss": 3.023, + "step": 5847 + }, + { + "epoch": 1.171943887775551, + "grad_norm": 21.84490750336401, + "learning_rate": 
7.640988956921715e-06, + "loss": 3.0727, + "step": 5848 + }, + { + "epoch": 1.1721442885771542, + "grad_norm": 22.72648693177969, + "learning_rate": 7.639998907606715e-06, + "loss": 2.8252, + "step": 5849 + }, + { + "epoch": 1.1723446893787575, + "grad_norm": 30.053231361276534, + "learning_rate": 7.63900871475108e-06, + "loss": 2.6076, + "step": 5850 + }, + { + "epoch": 1.1725450901803607, + "grad_norm": 25.692917379761777, + "learning_rate": 7.638018378408643e-06, + "loss": 2.6363, + "step": 5851 + }, + { + "epoch": 1.172745490981964, + "grad_norm": 40.46663236978924, + "learning_rate": 7.637027898633253e-06, + "loss": 2.8062, + "step": 5852 + }, + { + "epoch": 1.1729458917835671, + "grad_norm": 25.73669548738112, + "learning_rate": 7.636037275478764e-06, + "loss": 2.3729, + "step": 5853 + }, + { + "epoch": 1.1731462925851703, + "grad_norm": 29.0129778048092, + "learning_rate": 7.635046508999038e-06, + "loss": 3.0104, + "step": 5854 + }, + { + "epoch": 1.1733466933867736, + "grad_norm": 33.97321589651812, + "learning_rate": 7.634055599247943e-06, + "loss": 2.9382, + "step": 5855 + }, + { + "epoch": 1.1735470941883768, + "grad_norm": 21.506560566190828, + "learning_rate": 7.633064546279355e-06, + "loss": 2.7147, + "step": 5856 + }, + { + "epoch": 1.17374749498998, + "grad_norm": 35.33176754425178, + "learning_rate": 7.632073350147166e-06, + "loss": 2.781, + "step": 5857 + }, + { + "epoch": 1.1739478957915832, + "grad_norm": 25.36089695539166, + "learning_rate": 7.631082010905259e-06, + "loss": 2.7336, + "step": 5858 + }, + { + "epoch": 1.1741482965931864, + "grad_norm": 26.818389758826427, + "learning_rate": 7.630090528607542e-06, + "loss": 2.4076, + "step": 5859 + }, + { + "epoch": 1.1743486973947896, + "grad_norm": 31.867328824780877, + "learning_rate": 7.629098903307919e-06, + "loss": 2.7818, + "step": 5860 + }, + { + "epoch": 1.1745490981963929, + "grad_norm": 20.11280697934474, + "learning_rate": 7.628107135060308e-06, + "loss": 2.7919, + "step": 5861 + }, + { + "epoch": 1.174749498997996, + "grad_norm": 20.522176855791695, + "learning_rate": 7.627115223918633e-06, + "loss": 2.4328, + "step": 5862 + }, + { + "epoch": 1.1749498997995993, + "grad_norm": 47.373397713505355, + "learning_rate": 7.626123169936823e-06, + "loss": 2.884, + "step": 5863 + }, + { + "epoch": 1.1751503006012025, + "grad_norm": 20.267560264339753, + "learning_rate": 7.6251309731688235e-06, + "loss": 3.0279, + "step": 5864 + }, + { + "epoch": 1.1753507014028055, + "grad_norm": 25.781501159883252, + "learning_rate": 7.624138633668576e-06, + "loss": 2.7443, + "step": 5865 + }, + { + "epoch": 1.1755511022044087, + "grad_norm": 36.67286479903188, + "learning_rate": 7.6231461514900375e-06, + "loss": 2.8752, + "step": 5866 + }, + { + "epoch": 1.175751503006012, + "grad_norm": 31.9833227577002, + "learning_rate": 7.6221535266871705e-06, + "loss": 3.1407, + "step": 5867 + }, + { + "epoch": 1.1759519038076152, + "grad_norm": 25.541014441125686, + "learning_rate": 7.621160759313945e-06, + "loss": 2.9347, + "step": 5868 + }, + { + "epoch": 1.1761523046092184, + "grad_norm": 33.99875678609467, + "learning_rate": 7.6201678494243415e-06, + "loss": 2.7185, + "step": 5869 + }, + { + "epoch": 1.1763527054108216, + "grad_norm": 26.65500696695205, + "learning_rate": 7.619174797072343e-06, + "loss": 3.1701, + "step": 5870 + }, + { + "epoch": 1.1765531062124248, + "grad_norm": 19.84362886079887, + "learning_rate": 7.618181602311945e-06, + "loss": 2.5467, + "step": 5871 + }, + { + "epoch": 1.176753507014028, + "grad_norm": 
28.41501250005943, + "learning_rate": 7.617188265197148e-06, + "loss": 2.8314, + "step": 5872 + }, + { + "epoch": 1.1769539078156313, + "grad_norm": 23.219906467409533, + "learning_rate": 7.616194785781963e-06, + "loss": 2.8619, + "step": 5873 + }, + { + "epoch": 1.1771543086172345, + "grad_norm": 34.33921809393274, + "learning_rate": 7.615201164120404e-06, + "loss": 2.8106, + "step": 5874 + }, + { + "epoch": 1.1773547094188377, + "grad_norm": 17.77733608910797, + "learning_rate": 7.614207400266498e-06, + "loss": 2.5615, + "step": 5875 + }, + { + "epoch": 1.177555110220441, + "grad_norm": 46.92105200828071, + "learning_rate": 7.613213494274276e-06, + "loss": 2.7633, + "step": 5876 + }, + { + "epoch": 1.1777555110220441, + "grad_norm": 22.330798453301348, + "learning_rate": 7.612219446197781e-06, + "loss": 2.482, + "step": 5877 + }, + { + "epoch": 1.1779559118236473, + "grad_norm": 20.52245428680997, + "learning_rate": 7.611225256091057e-06, + "loss": 2.5197, + "step": 5878 + }, + { + "epoch": 1.1781563126252506, + "grad_norm": 15.042203095063055, + "learning_rate": 7.6102309240081615e-06, + "loss": 2.557, + "step": 5879 + }, + { + "epoch": 1.1783567134268538, + "grad_norm": 24.714759024077846, + "learning_rate": 7.609236450003157e-06, + "loss": 3.1155, + "step": 5880 + }, + { + "epoch": 1.178557114228457, + "grad_norm": 32.645502056140764, + "learning_rate": 7.608241834130117e-06, + "loss": 2.7004, + "step": 5881 + }, + { + "epoch": 1.1787575150300602, + "grad_norm": 25.230500253602422, + "learning_rate": 7.607247076443116e-06, + "loss": 2.6224, + "step": 5882 + }, + { + "epoch": 1.1789579158316634, + "grad_norm": 17.273585810842253, + "learning_rate": 7.606252176996245e-06, + "loss": 2.7157, + "step": 5883 + }, + { + "epoch": 1.1791583166332664, + "grad_norm": 30.957817285714373, + "learning_rate": 7.6052571358435956e-06, + "loss": 2.8899, + "step": 5884 + }, + { + "epoch": 1.1793587174348699, + "grad_norm": 18.849857794065045, + "learning_rate": 7.604261953039269e-06, + "loss": 2.3885, + "step": 5885 + }, + { + "epoch": 1.1795591182364729, + "grad_norm": 25.03832169540292, + "learning_rate": 7.6032666286373755e-06, + "loss": 2.5553, + "step": 5886 + }, + { + "epoch": 1.179759519038076, + "grad_norm": 26.58798580042499, + "learning_rate": 7.602271162692033e-06, + "loss": 2.8078, + "step": 5887 + }, + { + "epoch": 1.1799599198396793, + "grad_norm": 26.489097060405445, + "learning_rate": 7.601275555257366e-06, + "loss": 2.6873, + "step": 5888 + }, + { + "epoch": 1.1801603206412825, + "grad_norm": 30.86674202021797, + "learning_rate": 7.60027980638751e-06, + "loss": 2.5641, + "step": 5889 + }, + { + "epoch": 1.1803607214428857, + "grad_norm": 22.746880977227104, + "learning_rate": 7.599283916136599e-06, + "loss": 2.4847, + "step": 5890 + }, + { + "epoch": 1.180561122244489, + "grad_norm": 25.225815578400187, + "learning_rate": 7.598287884558786e-06, + "loss": 2.6331, + "step": 5891 + }, + { + "epoch": 1.1807615230460922, + "grad_norm": 26.735652872361275, + "learning_rate": 7.597291711708226e-06, + "loss": 2.9036, + "step": 5892 + }, + { + "epoch": 1.1809619238476954, + "grad_norm": 25.658599582598658, + "learning_rate": 7.5962953976390785e-06, + "loss": 2.8458, + "step": 5893 + }, + { + "epoch": 1.1811623246492986, + "grad_norm": 27.858756500281455, + "learning_rate": 7.59529894240552e-06, + "loss": 3.4017, + "step": 5894 + }, + { + "epoch": 1.1813627254509018, + "grad_norm": 23.02192228526449, + "learning_rate": 7.5943023460617285e-06, + "loss": 2.7279, + "step": 5895 + }, + { + "epoch": 
1.181563126252505, + "grad_norm": 25.02568152063429, + "learning_rate": 7.5933056086618874e-06, + "loss": 2.7759, + "step": 5896 + }, + { + "epoch": 1.1817635270541083, + "grad_norm": 30.165652776910306, + "learning_rate": 7.592308730260192e-06, + "loss": 2.9097, + "step": 5897 + }, + { + "epoch": 1.1819639278557115, + "grad_norm": 20.785186442410883, + "learning_rate": 7.591311710910845e-06, + "loss": 2.4034, + "step": 5898 + }, + { + "epoch": 1.1821643286573147, + "grad_norm": 28.710309182572626, + "learning_rate": 7.5903145506680545e-06, + "loss": 2.782, + "step": 5899 + }, + { + "epoch": 1.182364729458918, + "grad_norm": 24.063019497127495, + "learning_rate": 7.589317249586036e-06, + "loss": 2.9132, + "step": 5900 + }, + { + "epoch": 1.1825651302605211, + "grad_norm": 23.809466916489637, + "learning_rate": 7.58831980771902e-06, + "loss": 2.7595, + "step": 5901 + }, + { + "epoch": 1.1827655310621243, + "grad_norm": 19.892750933883022, + "learning_rate": 7.587322225121234e-06, + "loss": 2.8065, + "step": 5902 + }, + { + "epoch": 1.1829659318637273, + "grad_norm": 30.624913182024702, + "learning_rate": 7.5863245018469215e-06, + "loss": 3.2371, + "step": 5903 + }, + { + "epoch": 1.1831663326653308, + "grad_norm": 30.1389077672809, + "learning_rate": 7.585326637950325e-06, + "loss": 2.6591, + "step": 5904 + }, + { + "epoch": 1.1833667334669338, + "grad_norm": 38.56429136410954, + "learning_rate": 7.584328633485704e-06, + "loss": 2.734, + "step": 5905 + }, + { + "epoch": 1.183567134268537, + "grad_norm": 62.03468063701728, + "learning_rate": 7.5833304885073214e-06, + "loss": 2.9227, + "step": 5906 + }, + { + "epoch": 1.1837675350701402, + "grad_norm": 29.178972048523622, + "learning_rate": 7.582332203069444e-06, + "loss": 2.2313, + "step": 5907 + }, + { + "epoch": 1.1839679358717434, + "grad_norm": 26.3452414411225, + "learning_rate": 7.581333777226357e-06, + "loss": 2.3997, + "step": 5908 + }, + { + "epoch": 1.1841683366733466, + "grad_norm": 25.97459588505622, + "learning_rate": 7.580335211032339e-06, + "loss": 2.5465, + "step": 5909 + }, + { + "epoch": 1.1843687374749499, + "grad_norm": 29.500226592205102, + "learning_rate": 7.579336504541688e-06, + "loss": 2.7372, + "step": 5910 + }, + { + "epoch": 1.184569138276553, + "grad_norm": 29.565458294810554, + "learning_rate": 7.5783376578087035e-06, + "loss": 3.0058, + "step": 5911 + }, + { + "epoch": 1.1847695390781563, + "grad_norm": 29.24195898482193, + "learning_rate": 7.577338670887693e-06, + "loss": 3.2439, + "step": 5912 + }, + { + "epoch": 1.1849699398797595, + "grad_norm": 27.526225449488535, + "learning_rate": 7.576339543832977e-06, + "loss": 3.171, + "step": 5913 + }, + { + "epoch": 1.1851703406813627, + "grad_norm": 21.487432922558433, + "learning_rate": 7.575340276698875e-06, + "loss": 2.2665, + "step": 5914 + }, + { + "epoch": 1.185370741482966, + "grad_norm": 43.25113812806699, + "learning_rate": 7.574340869539722e-06, + "loss": 2.9819, + "step": 5915 + }, + { + "epoch": 1.1855711422845692, + "grad_norm": 28.108324877464412, + "learning_rate": 7.573341322409855e-06, + "loss": 2.6694, + "step": 5916 + }, + { + "epoch": 1.1857715430861724, + "grad_norm": 30.083578005293205, + "learning_rate": 7.572341635363623e-06, + "loss": 2.4869, + "step": 5917 + }, + { + "epoch": 1.1859719438877756, + "grad_norm": 35.60918920555415, + "learning_rate": 7.571341808455377e-06, + "loss": 2.9117, + "step": 5918 + }, + { + "epoch": 1.1861723446893788, + "grad_norm": 44.6186724848067, + "learning_rate": 7.5703418417394824e-06, + "loss": 2.9897, + 
"step": 5919 + }, + { + "epoch": 1.186372745490982, + "grad_norm": 19.18952268409695, + "learning_rate": 7.569341735270308e-06, + "loss": 2.7813, + "step": 5920 + }, + { + "epoch": 1.1865731462925853, + "grad_norm": 31.980561639945233, + "learning_rate": 7.568341489102231e-06, + "loss": 2.8011, + "step": 5921 + }, + { + "epoch": 1.1867735470941883, + "grad_norm": 28.28043737788703, + "learning_rate": 7.5673411032896345e-06, + "loss": 2.4588, + "step": 5922 + }, + { + "epoch": 1.1869739478957917, + "grad_norm": 27.466627661375195, + "learning_rate": 7.566340577886914e-06, + "loss": 3.5188, + "step": 5923 + }, + { + "epoch": 1.1871743486973947, + "grad_norm": 32.925844663153775, + "learning_rate": 7.565339912948466e-06, + "loss": 2.6858, + "step": 5924 + }, + { + "epoch": 1.187374749498998, + "grad_norm": 25.59789935162567, + "learning_rate": 7.564339108528703e-06, + "loss": 2.2336, + "step": 5925 + }, + { + "epoch": 1.1875751503006011, + "grad_norm": 26.69170757829235, + "learning_rate": 7.563338164682036e-06, + "loss": 2.4845, + "step": 5926 + }, + { + "epoch": 1.1877755511022043, + "grad_norm": 20.36964193621664, + "learning_rate": 7.56233708146289e-06, + "loss": 2.6842, + "step": 5927 + }, + { + "epoch": 1.1879759519038076, + "grad_norm": 22.042507314727512, + "learning_rate": 7.561335858925694e-06, + "loss": 2.6489, + "step": 5928 + }, + { + "epoch": 1.1881763527054108, + "grad_norm": 29.677514281501924, + "learning_rate": 7.560334497124885e-06, + "loss": 2.2829, + "step": 5929 + }, + { + "epoch": 1.188376753507014, + "grad_norm": 21.75000555863203, + "learning_rate": 7.559332996114911e-06, + "loss": 2.4958, + "step": 5930 + }, + { + "epoch": 1.1885771543086172, + "grad_norm": 22.05843355196526, + "learning_rate": 7.558331355950223e-06, + "loss": 2.7505, + "step": 5931 + }, + { + "epoch": 1.1887775551102204, + "grad_norm": 23.859276883854232, + "learning_rate": 7.557329576685284e-06, + "loss": 2.1644, + "step": 5932 + }, + { + "epoch": 1.1889779559118236, + "grad_norm": 35.23775169251867, + "learning_rate": 7.556327658374562e-06, + "loss": 2.8521, + "step": 5933 + }, + { + "epoch": 1.1891783567134269, + "grad_norm": 25.057023233806007, + "learning_rate": 7.555325601072529e-06, + "loss": 2.8756, + "step": 5934 + }, + { + "epoch": 1.18937875751503, + "grad_norm": 45.300749379784584, + "learning_rate": 7.554323404833675e-06, + "loss": 3.1561, + "step": 5935 + }, + { + "epoch": 1.1895791583166333, + "grad_norm": 23.912732904401736, + "learning_rate": 7.553321069712483e-06, + "loss": 2.672, + "step": 5936 + }, + { + "epoch": 1.1897795591182365, + "grad_norm": 28.916971746941798, + "learning_rate": 7.552318595763457e-06, + "loss": 3.2834, + "step": 5937 + }, + { + "epoch": 1.1899799599198397, + "grad_norm": 20.076514170093674, + "learning_rate": 7.551315983041102e-06, + "loss": 2.2366, + "step": 5938 + }, + { + "epoch": 1.190180360721443, + "grad_norm": 39.547517637428534, + "learning_rate": 7.55031323159993e-06, + "loss": 2.9666, + "step": 5939 + }, + { + "epoch": 1.1903807615230462, + "grad_norm": 19.458522027570773, + "learning_rate": 7.549310341494465e-06, + "loss": 2.628, + "step": 5940 + }, + { + "epoch": 1.1905811623246494, + "grad_norm": 24.17314357180005, + "learning_rate": 7.548307312779231e-06, + "loss": 3.1372, + "step": 5941 + }, + { + "epoch": 1.1907815631262526, + "grad_norm": 19.25589410506902, + "learning_rate": 7.547304145508767e-06, + "loss": 2.8341, + "step": 5942 + }, + { + "epoch": 1.1909819639278556, + "grad_norm": 35.375863283948426, + "learning_rate": 
7.546300839737617e-06, + "loss": 2.5516, + "step": 5943 + }, + { + "epoch": 1.191182364729459, + "grad_norm": 44.49797523742885, + "learning_rate": 7.545297395520331e-06, + "loss": 2.7559, + "step": 5944 + }, + { + "epoch": 1.191382765531062, + "grad_norm": 26.852184927475417, + "learning_rate": 7.544293812911469e-06, + "loss": 2.8023, + "step": 5945 + }, + { + "epoch": 1.1915831663326653, + "grad_norm": 20.610967650483357, + "learning_rate": 7.543290091965598e-06, + "loss": 2.637, + "step": 5946 + }, + { + "epoch": 1.1917835671342685, + "grad_norm": 26.32318895413013, + "learning_rate": 7.542286232737288e-06, + "loss": 3.1748, + "step": 5947 + }, + { + "epoch": 1.1919839679358717, + "grad_norm": 25.166806041786216, + "learning_rate": 7.541282235281125e-06, + "loss": 2.9661, + "step": 5948 + }, + { + "epoch": 1.192184368737475, + "grad_norm": 27.085967425518422, + "learning_rate": 7.540278099651694e-06, + "loss": 2.5008, + "step": 5949 + }, + { + "epoch": 1.1923847695390781, + "grad_norm": 19.05527953130622, + "learning_rate": 7.539273825903595e-06, + "loss": 2.807, + "step": 5950 + }, + { + "epoch": 1.1925851703406813, + "grad_norm": 27.340034783761627, + "learning_rate": 7.538269414091427e-06, + "loss": 2.8076, + "step": 5951 + }, + { + "epoch": 1.1927855711422846, + "grad_norm": 32.97649960869817, + "learning_rate": 7.5372648642698065e-06, + "loss": 2.9234, + "step": 5952 + }, + { + "epoch": 1.1929859719438878, + "grad_norm": 26.719789253899133, + "learning_rate": 7.536260176493348e-06, + "loss": 2.9138, + "step": 5953 + }, + { + "epoch": 1.193186372745491, + "grad_norm": 26.99443648681895, + "learning_rate": 7.535255350816682e-06, + "loss": 2.5125, + "step": 5954 + }, + { + "epoch": 1.1933867735470942, + "grad_norm": 24.20250599244981, + "learning_rate": 7.534250387294437e-06, + "loss": 2.4411, + "step": 5955 + }, + { + "epoch": 1.1935871743486974, + "grad_norm": 24.107755168745218, + "learning_rate": 7.53324528598126e-06, + "loss": 2.5441, + "step": 5956 + }, + { + "epoch": 1.1937875751503007, + "grad_norm": 17.651200250975645, + "learning_rate": 7.532240046931798e-06, + "loss": 2.4001, + "step": 5957 + }, + { + "epoch": 1.1939879759519039, + "grad_norm": 21.09862750868277, + "learning_rate": 7.531234670200704e-06, + "loss": 3.2508, + "step": 5958 + }, + { + "epoch": 1.194188376753507, + "grad_norm": 21.898553283133136, + "learning_rate": 7.530229155842647e-06, + "loss": 2.277, + "step": 5959 + }, + { + "epoch": 1.1943887775551103, + "grad_norm": 30.38634413326552, + "learning_rate": 7.529223503912295e-06, + "loss": 2.9764, + "step": 5960 + }, + { + "epoch": 1.1945891783567135, + "grad_norm": 25.35412251486209, + "learning_rate": 7.528217714464327e-06, + "loss": 3.2453, + "step": 5961 + }, + { + "epoch": 1.1947895791583165, + "grad_norm": 33.06275766832502, + "learning_rate": 7.527211787553431e-06, + "loss": 2.3906, + "step": 5962 + }, + { + "epoch": 1.19498997995992, + "grad_norm": 23.24967421151767, + "learning_rate": 7.526205723234299e-06, + "loss": 2.5524, + "step": 5963 + }, + { + "epoch": 1.195190380761523, + "grad_norm": 50.93250488303664, + "learning_rate": 7.525199521561633e-06, + "loss": 3.0745, + "step": 5964 + }, + { + "epoch": 1.1953907815631262, + "grad_norm": 28.13627360001727, + "learning_rate": 7.524193182590143e-06, + "loss": 2.9637, + "step": 5965 + }, + { + "epoch": 1.1955911823647294, + "grad_norm": 72.33798222874381, + "learning_rate": 7.523186706374541e-06, + "loss": 3.5268, + "step": 5966 + }, + { + "epoch": 1.1957915831663326, + "grad_norm": 39.46233120057937, 
+ "learning_rate": 7.522180092969553e-06, + "loss": 2.7139, + "step": 5967 + }, + { + "epoch": 1.1959919839679358, + "grad_norm": 25.60197451336735, + "learning_rate": 7.521173342429912e-06, + "loss": 2.2982, + "step": 5968 + }, + { + "epoch": 1.196192384769539, + "grad_norm": 26.57448303228763, + "learning_rate": 7.520166454810354e-06, + "loss": 3.1933, + "step": 5969 + }, + { + "epoch": 1.1963927855711423, + "grad_norm": 19.836538785999636, + "learning_rate": 7.519159430165626e-06, + "loss": 2.3902, + "step": 5970 + }, + { + "epoch": 1.1965931863727455, + "grad_norm": 37.08305863968461, + "learning_rate": 7.518152268550481e-06, + "loss": 2.0587, + "step": 5971 + }, + { + "epoch": 1.1967935871743487, + "grad_norm": 27.337356750617126, + "learning_rate": 7.517144970019681e-06, + "loss": 2.6928, + "step": 5972 + }, + { + "epoch": 1.196993987975952, + "grad_norm": 16.74933792539477, + "learning_rate": 7.516137534627992e-06, + "loss": 2.2109, + "step": 5973 + }, + { + "epoch": 1.1971943887775551, + "grad_norm": 19.439712398601092, + "learning_rate": 7.515129962430192e-06, + "loss": 2.6558, + "step": 5974 + }, + { + "epoch": 1.1973947895791583, + "grad_norm": 47.76048811495484, + "learning_rate": 7.514122253481062e-06, + "loss": 2.8471, + "step": 5975 + }, + { + "epoch": 1.1975951903807616, + "grad_norm": 23.414807625651076, + "learning_rate": 7.513114407835396e-06, + "loss": 2.8886, + "step": 5976 + }, + { + "epoch": 1.1977955911823648, + "grad_norm": 48.180675911938984, + "learning_rate": 7.512106425547988e-06, + "loss": 2.6181, + "step": 5977 + }, + { + "epoch": 1.197995991983968, + "grad_norm": 30.03929738359782, + "learning_rate": 7.5110983066736465e-06, + "loss": 2.6667, + "step": 5978 + }, + { + "epoch": 1.1981963927855712, + "grad_norm": 25.094231416937575, + "learning_rate": 7.510090051267183e-06, + "loss": 3.2074, + "step": 5979 + }, + { + "epoch": 1.1983967935871744, + "grad_norm": 27.630742828538665, + "learning_rate": 7.509081659383417e-06, + "loss": 2.309, + "step": 5980 + }, + { + "epoch": 1.1985971943887774, + "grad_norm": 30.68843670865852, + "learning_rate": 7.5080731310771785e-06, + "loss": 2.8966, + "step": 5981 + }, + { + "epoch": 1.1987975951903809, + "grad_norm": 25.201700358900723, + "learning_rate": 7.507064466403302e-06, + "loss": 2.8591, + "step": 5982 + }, + { + "epoch": 1.1989979959919839, + "grad_norm": 21.54728378054299, + "learning_rate": 7.50605566541663e-06, + "loss": 2.7647, + "step": 5983 + }, + { + "epoch": 1.199198396793587, + "grad_norm": 27.373947877316176, + "learning_rate": 7.5050467281720115e-06, + "loss": 2.7525, + "step": 5984 + }, + { + "epoch": 1.1993987975951903, + "grad_norm": 24.861323361802636, + "learning_rate": 7.5040376547243056e-06, + "loss": 2.7727, + "step": 5985 + }, + { + "epoch": 1.1995991983967935, + "grad_norm": 18.707488329265505, + "learning_rate": 7.503028445128375e-06, + "loss": 2.831, + "step": 5986 + }, + { + "epoch": 1.1997995991983967, + "grad_norm": 20.797983879969127, + "learning_rate": 7.502019099439093e-06, + "loss": 2.563, + "step": 5987 + }, + { + "epoch": 1.2, + "grad_norm": 38.67109491629981, + "learning_rate": 7.501009617711339e-06, + "loss": 3.1089, + "step": 5988 + }, + { + "epoch": 1.2002004008016032, + "grad_norm": 25.86535720986332, + "learning_rate": 7.500000000000001e-06, + "loss": 2.5916, + "step": 5989 + }, + { + "epoch": 1.2004008016032064, + "grad_norm": 30.362352989152516, + "learning_rate": 7.498990246359972e-06, + "loss": 2.6929, + "step": 5990 + }, + { + "epoch": 1.2006012024048096, + "grad_norm": 
26.63847917049345, + "learning_rate": 7.497980356846155e-06, + "loss": 2.4964, + "step": 5991 + }, + { + "epoch": 1.2008016032064128, + "grad_norm": 21.761870598414653, + "learning_rate": 7.496970331513458e-06, + "loss": 2.7461, + "step": 5992 + }, + { + "epoch": 1.201002004008016, + "grad_norm": 23.403363682224324, + "learning_rate": 7.495960170416799e-06, + "loss": 3.0414, + "step": 5993 + }, + { + "epoch": 1.2012024048096193, + "grad_norm": 19.897275906486268, + "learning_rate": 7.494949873611101e-06, + "loss": 2.717, + "step": 5994 + }, + { + "epoch": 1.2014028056112225, + "grad_norm": 25.268185042947763, + "learning_rate": 7.493939441151296e-06, + "loss": 3.1306, + "step": 5995 + }, + { + "epoch": 1.2016032064128257, + "grad_norm": 19.514795125832112, + "learning_rate": 7.492928873092322e-06, + "loss": 2.8308, + "step": 5996 + }, + { + "epoch": 1.201803607214429, + "grad_norm": 24.250295732173342, + "learning_rate": 7.491918169489124e-06, + "loss": 2.6554, + "step": 5997 + }, + { + "epoch": 1.2020040080160321, + "grad_norm": 33.97800535120173, + "learning_rate": 7.490907330396657e-06, + "loss": 2.8356, + "step": 5998 + }, + { + "epoch": 1.2022044088176354, + "grad_norm": 24.363960805155788, + "learning_rate": 7.489896355869882e-06, + "loss": 2.57, + "step": 5999 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 23.831467508368455, + "learning_rate": 7.488885245963766e-06, + "loss": 2.6966, + "step": 6000 + }, + { + "epoch": 1.2026052104208418, + "grad_norm": 19.070727341338255, + "learning_rate": 7.487874000733287e-06, + "loss": 2.459, + "step": 6001 + }, + { + "epoch": 1.2028056112224448, + "grad_norm": 20.605188755546816, + "learning_rate": 7.486862620233426e-06, + "loss": 2.9623, + "step": 6002 + }, + { + "epoch": 1.203006012024048, + "grad_norm": 21.13973884755794, + "learning_rate": 7.485851104519171e-06, + "loss": 2.5526, + "step": 6003 + }, + { + "epoch": 1.2032064128256512, + "grad_norm": 22.9285882200395, + "learning_rate": 7.484839453645525e-06, + "loss": 2.8322, + "step": 6004 + }, + { + "epoch": 1.2034068136272544, + "grad_norm": 36.534183033476914, + "learning_rate": 7.483827667667487e-06, + "loss": 2.8488, + "step": 6005 + }, + { + "epoch": 1.2036072144288577, + "grad_norm": 15.225109790321389, + "learning_rate": 7.482815746640076e-06, + "loss": 2.7205, + "step": 6006 + }, + { + "epoch": 1.2038076152304609, + "grad_norm": 23.991750291085307, + "learning_rate": 7.481803690618304e-06, + "loss": 2.7962, + "step": 6007 + }, + { + "epoch": 1.204008016032064, + "grad_norm": 21.25734429099323, + "learning_rate": 7.480791499657203e-06, + "loss": 2.2064, + "step": 6008 + }, + { + "epoch": 1.2042084168336673, + "grad_norm": 28.07804510908742, + "learning_rate": 7.479779173811808e-06, + "loss": 2.801, + "step": 6009 + }, + { + "epoch": 1.2044088176352705, + "grad_norm": 30.047232546254204, + "learning_rate": 7.478766713137157e-06, + "loss": 3.085, + "step": 6010 + }, + { + "epoch": 1.2046092184368737, + "grad_norm": 26.09472050605454, + "learning_rate": 7.477754117688301e-06, + "loss": 2.8915, + "step": 6011 + }, + { + "epoch": 1.204809619238477, + "grad_norm": 21.266378634839956, + "learning_rate": 7.476741387520296e-06, + "loss": 2.8371, + "step": 6012 + }, + { + "epoch": 1.2050100200400802, + "grad_norm": 21.539555736098407, + "learning_rate": 7.475728522688206e-06, + "loss": 2.7438, + "step": 6013 + }, + { + "epoch": 1.2052104208416834, + "grad_norm": 22.105056383578543, + "learning_rate": 7.474715523247103e-06, + "loss": 2.678, + "step": 6014 + }, + { + "epoch": 
1.2054108216432866, + "grad_norm": 19.266011997762238, + "learning_rate": 7.473702389252062e-06, + "loss": 2.5844, + "step": 6015 + }, + { + "epoch": 1.2056112224448898, + "grad_norm": 16.556130564294907, + "learning_rate": 7.472689120758172e-06, + "loss": 2.5393, + "step": 6016 + }, + { + "epoch": 1.205811623246493, + "grad_norm": 23.330369475585528, + "learning_rate": 7.471675717820525e-06, + "loss": 2.8529, + "step": 6017 + }, + { + "epoch": 1.2060120240480963, + "grad_norm": 20.454041864031595, + "learning_rate": 7.470662180494219e-06, + "loss": 2.4735, + "step": 6018 + }, + { + "epoch": 1.2062124248496995, + "grad_norm": 31.28239854973773, + "learning_rate": 7.469648508834363e-06, + "loss": 2.65, + "step": 6019 + }, + { + "epoch": 1.2064128256513027, + "grad_norm": 36.66510894809173, + "learning_rate": 7.468634702896073e-06, + "loss": 2.6192, + "step": 6020 + }, + { + "epoch": 1.2066132264529057, + "grad_norm": 26.440670330250992, + "learning_rate": 7.467620762734471e-06, + "loss": 3.1033, + "step": 6021 + }, + { + "epoch": 1.2068136272545091, + "grad_norm": 22.192636404828942, + "learning_rate": 7.466606688404684e-06, + "loss": 2.3599, + "step": 6022 + }, + { + "epoch": 1.2070140280561121, + "grad_norm": 26.799619854079488, + "learning_rate": 7.4655924799618505e-06, + "loss": 2.6011, + "step": 6023 + }, + { + "epoch": 1.2072144288577153, + "grad_norm": 22.611912147065766, + "learning_rate": 7.464578137461115e-06, + "loss": 2.2983, + "step": 6024 + }, + { + "epoch": 1.2074148296593186, + "grad_norm": 25.536167506923032, + "learning_rate": 7.4635636609576265e-06, + "loss": 2.7184, + "step": 6025 + }, + { + "epoch": 1.2076152304609218, + "grad_norm": 20.6960540511212, + "learning_rate": 7.462549050506548e-06, + "loss": 3.0821, + "step": 6026 + }, + { + "epoch": 1.207815631262525, + "grad_norm": 31.71277792670749, + "learning_rate": 7.4615343061630405e-06, + "loss": 3.1834, + "step": 6027 + }, + { + "epoch": 1.2080160320641282, + "grad_norm": 30.260719621410296, + "learning_rate": 7.4605194279822815e-06, + "loss": 2.486, + "step": 6028 + }, + { + "epoch": 1.2082164328657314, + "grad_norm": 30.336747627229595, + "learning_rate": 7.459504416019447e-06, + "loss": 2.9764, + "step": 6029 + }, + { + "epoch": 1.2084168336673347, + "grad_norm": 19.42497454824745, + "learning_rate": 7.458489270329727e-06, + "loss": 2.7273, + "step": 6030 + }, + { + "epoch": 1.2086172344689379, + "grad_norm": 26.37725823325525, + "learning_rate": 7.457473990968315e-06, + "loss": 2.6762, + "step": 6031 + }, + { + "epoch": 1.208817635270541, + "grad_norm": 25.82094392195967, + "learning_rate": 7.4564585779904175e-06, + "loss": 2.5106, + "step": 6032 + }, + { + "epoch": 1.2090180360721443, + "grad_norm": 29.041598425614552, + "learning_rate": 7.4554430314512404e-06, + "loss": 2.8965, + "step": 6033 + }, + { + "epoch": 1.2092184368737475, + "grad_norm": 38.80653968451261, + "learning_rate": 7.454427351406e-06, + "loss": 3.2854, + "step": 6034 + }, + { + "epoch": 1.2094188376753507, + "grad_norm": 18.195451735584708, + "learning_rate": 7.453411537909921e-06, + "loss": 1.9602, + "step": 6035 + }, + { + "epoch": 1.209619238476954, + "grad_norm": 25.659290047143394, + "learning_rate": 7.452395591018236e-06, + "loss": 2.9489, + "step": 6036 + }, + { + "epoch": 1.2098196392785572, + "grad_norm": 32.649586667942415, + "learning_rate": 7.451379510786183e-06, + "loss": 2.9167, + "step": 6037 + }, + { + "epoch": 1.2100200400801604, + "grad_norm": 34.611195112950206, + "learning_rate": 7.4503632972690075e-06, + "loss": 2.8781, + 
"step": 6038 + }, + { + "epoch": 1.2102204408817636, + "grad_norm": 33.58039289009814, + "learning_rate": 7.4493469505219614e-06, + "loss": 2.7575, + "step": 6039 + }, + { + "epoch": 1.2104208416833666, + "grad_norm": 22.994161139729638, + "learning_rate": 7.448330470600308e-06, + "loss": 2.7259, + "step": 6040 + }, + { + "epoch": 1.21062124248497, + "grad_norm": 28.746305721689634, + "learning_rate": 7.447313857559311e-06, + "loss": 3.3349, + "step": 6041 + }, + { + "epoch": 1.210821643286573, + "grad_norm": 22.872772494452217, + "learning_rate": 7.446297111454247e-06, + "loss": 3.0941, + "step": 6042 + }, + { + "epoch": 1.2110220440881763, + "grad_norm": 25.408068553029846, + "learning_rate": 7.445280232340398e-06, + "loss": 2.9117, + "step": 6043 + }, + { + "epoch": 1.2112224448897795, + "grad_norm": 32.00845956252673, + "learning_rate": 7.444263220273054e-06, + "loss": 3.0752, + "step": 6044 + }, + { + "epoch": 1.2114228456913827, + "grad_norm": 20.669064892355806, + "learning_rate": 7.443246075307511e-06, + "loss": 2.4802, + "step": 6045 + }, + { + "epoch": 1.211623246492986, + "grad_norm": 21.876923040174614, + "learning_rate": 7.442228797499072e-06, + "loss": 2.7084, + "step": 6046 + }, + { + "epoch": 1.2118236472945891, + "grad_norm": 24.037219408341702, + "learning_rate": 7.441211386903047e-06, + "loss": 3.0183, + "step": 6047 + }, + { + "epoch": 1.2120240480961924, + "grad_norm": 26.67264488859685, + "learning_rate": 7.440193843574757e-06, + "loss": 2.781, + "step": 6048 + }, + { + "epoch": 1.2122244488977956, + "grad_norm": 19.637658785329062, + "learning_rate": 7.439176167569524e-06, + "loss": 2.4256, + "step": 6049 + }, + { + "epoch": 1.2124248496993988, + "grad_norm": 25.06383871483329, + "learning_rate": 7.438158358942684e-06, + "loss": 2.1749, + "step": 6050 + }, + { + "epoch": 1.212625250501002, + "grad_norm": 21.64217804412544, + "learning_rate": 7.437140417749573e-06, + "loss": 2.7329, + "step": 6051 + }, + { + "epoch": 1.2128256513026052, + "grad_norm": 23.70062092338861, + "learning_rate": 7.436122344045542e-06, + "loss": 2.8957, + "step": 6052 + }, + { + "epoch": 1.2130260521042084, + "grad_norm": 26.51794358559379, + "learning_rate": 7.4351041378859425e-06, + "loss": 2.0799, + "step": 6053 + }, + { + "epoch": 1.2132264529058117, + "grad_norm": 41.0481313644151, + "learning_rate": 7.434085799326136e-06, + "loss": 2.5055, + "step": 6054 + }, + { + "epoch": 1.2134268537074149, + "grad_norm": 24.37836878342053, + "learning_rate": 7.4330673284214905e-06, + "loss": 3.1275, + "step": 6055 + }, + { + "epoch": 1.213627254509018, + "grad_norm": 38.765500613858585, + "learning_rate": 7.432048725227384e-06, + "loss": 2.4542, + "step": 6056 + }, + { + "epoch": 1.2138276553106213, + "grad_norm": 31.760556984907595, + "learning_rate": 7.431029989799199e-06, + "loss": 2.7053, + "step": 6057 + }, + { + "epoch": 1.2140280561122245, + "grad_norm": 22.990203065057546, + "learning_rate": 7.430011122192324e-06, + "loss": 2.4819, + "step": 6058 + }, + { + "epoch": 1.2142284569138277, + "grad_norm": 25.323615517469282, + "learning_rate": 7.428992122462158e-06, + "loss": 2.3229, + "step": 6059 + }, + { + "epoch": 1.214428857715431, + "grad_norm": 20.90104771931298, + "learning_rate": 7.4279729906641055e-06, + "loss": 2.2801, + "step": 6060 + }, + { + "epoch": 1.214629258517034, + "grad_norm": 22.833590075250157, + "learning_rate": 7.426953726853574e-06, + "loss": 2.905, + "step": 6061 + }, + { + "epoch": 1.2148296593186372, + "grad_norm": 83.57207491822129, + "learning_rate": 
7.425934331085989e-06, + "loss": 2.9775, + "step": 6062 + }, + { + "epoch": 1.2150300601202404, + "grad_norm": 24.97764635944486, + "learning_rate": 7.424914803416772e-06, + "loss": 2.9799, + "step": 6063 + }, + { + "epoch": 1.2152304609218436, + "grad_norm": 24.636705174950695, + "learning_rate": 7.423895143901358e-06, + "loss": 2.9328, + "step": 6064 + }, + { + "epoch": 1.2154308617234468, + "grad_norm": 35.099852570159555, + "learning_rate": 7.422875352595188e-06, + "loss": 3.3878, + "step": 6065 + }, + { + "epoch": 1.21563126252505, + "grad_norm": 36.83452724667174, + "learning_rate": 7.421855429553707e-06, + "loss": 2.3311, + "step": 6066 + }, + { + "epoch": 1.2158316633266533, + "grad_norm": 24.048489146310867, + "learning_rate": 7.420835374832372e-06, + "loss": 2.5583, + "step": 6067 + }, + { + "epoch": 1.2160320641282565, + "grad_norm": 22.090806585174967, + "learning_rate": 7.419815188486644e-06, + "loss": 2.539, + "step": 6068 + }, + { + "epoch": 1.2162324649298597, + "grad_norm": 29.144450712118047, + "learning_rate": 7.418794870571992e-06, + "loss": 2.9489, + "step": 6069 + }, + { + "epoch": 1.216432865731463, + "grad_norm": 29.915171801350283, + "learning_rate": 7.417774421143894e-06, + "loss": 2.913, + "step": 6070 + }, + { + "epoch": 1.2166332665330661, + "grad_norm": 27.17252140985152, + "learning_rate": 7.41675384025783e-06, + "loss": 2.5097, + "step": 6071 + }, + { + "epoch": 1.2168336673346694, + "grad_norm": 41.77583385860498, + "learning_rate": 7.415733127969293e-06, + "loss": 2.8335, + "step": 6072 + }, + { + "epoch": 1.2170340681362726, + "grad_norm": 22.720169091874407, + "learning_rate": 7.4147122843337805e-06, + "loss": 2.6637, + "step": 6073 + }, + { + "epoch": 1.2172344689378758, + "grad_norm": 151.18264764390977, + "learning_rate": 7.413691309406794e-06, + "loss": 2.3551, + "step": 6074 + }, + { + "epoch": 1.217434869739479, + "grad_norm": 24.110276140667317, + "learning_rate": 7.412670203243851e-06, + "loss": 2.1259, + "step": 6075 + }, + { + "epoch": 1.2176352705410822, + "grad_norm": 79.20551615104327, + "learning_rate": 7.411648965900466e-06, + "loss": 2.7293, + "step": 6076 + }, + { + "epoch": 1.2178356713426854, + "grad_norm": 29.79644439689944, + "learning_rate": 7.41062759743217e-06, + "loss": 3.2739, + "step": 6077 + }, + { + "epoch": 1.2180360721442887, + "grad_norm": 18.487648188759472, + "learning_rate": 7.409606097894489e-06, + "loss": 2.4548, + "step": 6078 + }, + { + "epoch": 1.2182364729458919, + "grad_norm": 24.792855057270845, + "learning_rate": 7.40858446734297e-06, + "loss": 2.7449, + "step": 6079 + }, + { + "epoch": 1.2184368737474949, + "grad_norm": 21.360520761857334, + "learning_rate": 7.407562705833156e-06, + "loss": 2.705, + "step": 6080 + }, + { + "epoch": 1.2186372745490983, + "grad_norm": 25.67554206954903, + "learning_rate": 7.406540813420604e-06, + "loss": 2.9531, + "step": 6081 + }, + { + "epoch": 1.2188376753507013, + "grad_norm": 34.41543886312651, + "learning_rate": 7.405518790160878e-06, + "loss": 2.7739, + "step": 6082 + }, + { + "epoch": 1.2190380761523045, + "grad_norm": 51.2518687570531, + "learning_rate": 7.404496636109543e-06, + "loss": 2.73, + "step": 6083 + }, + { + "epoch": 1.2192384769539077, + "grad_norm": 26.53847249325922, + "learning_rate": 7.403474351322176e-06, + "loss": 2.6652, + "step": 6084 + }, + { + "epoch": 1.219438877755511, + "grad_norm": 30.905594629516855, + "learning_rate": 7.402451935854362e-06, + "loss": 2.5479, + "step": 6085 + }, + { + "epoch": 1.2196392785571142, + "grad_norm": 
27.11948189880245, + "learning_rate": 7.40142938976169e-06, + "loss": 2.6069, + "step": 6086 + }, + { + "epoch": 1.2198396793587174, + "grad_norm": 31.87540484642931, + "learning_rate": 7.4004067130997555e-06, + "loss": 2.5647, + "step": 6087 + }, + { + "epoch": 1.2200400801603206, + "grad_norm": 28.82513321790514, + "learning_rate": 7.399383905924166e-06, + "loss": 2.4226, + "step": 6088 + }, + { + "epoch": 1.2202404809619238, + "grad_norm": 23.459009503115055, + "learning_rate": 7.398360968290531e-06, + "loss": 2.7216, + "step": 6089 + }, + { + "epoch": 1.220440881763527, + "grad_norm": 25.950749133177187, + "learning_rate": 7.397337900254471e-06, + "loss": 2.5442, + "step": 6090 + }, + { + "epoch": 1.2206412825651303, + "grad_norm": 25.27159294065821, + "learning_rate": 7.3963147018716084e-06, + "loss": 2.1865, + "step": 6091 + }, + { + "epoch": 1.2208416833667335, + "grad_norm": 21.283935286451392, + "learning_rate": 7.39529137319758e-06, + "loss": 2.7669, + "step": 6092 + }, + { + "epoch": 1.2210420841683367, + "grad_norm": 28.3090293884745, + "learning_rate": 7.3942679142880225e-06, + "loss": 2.9354, + "step": 6093 + }, + { + "epoch": 1.22124248496994, + "grad_norm": 20.525362287788298, + "learning_rate": 7.393244325198586e-06, + "loss": 2.5878, + "step": 6094 + }, + { + "epoch": 1.2214428857715431, + "grad_norm": 18.18383145274663, + "learning_rate": 7.3922206059849225e-06, + "loss": 2.6592, + "step": 6095 + }, + { + "epoch": 1.2216432865731464, + "grad_norm": 25.80062226049431, + "learning_rate": 7.391196756702692e-06, + "loss": 2.848, + "step": 6096 + }, + { + "epoch": 1.2218436873747496, + "grad_norm": 25.470189753203346, + "learning_rate": 7.390172777407566e-06, + "loss": 2.6798, + "step": 6097 + }, + { + "epoch": 1.2220440881763528, + "grad_norm": 27.656807570457413, + "learning_rate": 7.389148668155216e-06, + "loss": 2.6075, + "step": 6098 + }, + { + "epoch": 1.2222444889779558, + "grad_norm": 19.364798810073648, + "learning_rate": 7.3881244290013265e-06, + "loss": 2.4053, + "step": 6099 + }, + { + "epoch": 1.2224448897795592, + "grad_norm": 49.313217809292475, + "learning_rate": 7.387100060001587e-06, + "loss": 2.0681, + "step": 6100 + }, + { + "epoch": 1.2226452905811622, + "grad_norm": 69.13105330271009, + "learning_rate": 7.386075561211694e-06, + "loss": 3.1728, + "step": 6101 + }, + { + "epoch": 1.2228456913827654, + "grad_norm": 38.428883479032024, + "learning_rate": 7.385050932687351e-06, + "loss": 2.4794, + "step": 6102 + }, + { + "epoch": 1.2230460921843687, + "grad_norm": 32.020398765419415, + "learning_rate": 7.384026174484267e-06, + "loss": 2.9706, + "step": 6103 + }, + { + "epoch": 1.2232464929859719, + "grad_norm": 18.786058603200477, + "learning_rate": 7.3830012866581625e-06, + "loss": 2.9195, + "step": 6104 + }, + { + "epoch": 1.223446893787575, + "grad_norm": 23.341444847283558, + "learning_rate": 7.381976269264757e-06, + "loss": 2.6749, + "step": 6105 + }, + { + "epoch": 1.2236472945891783, + "grad_norm": 24.164389253711576, + "learning_rate": 7.380951122359787e-06, + "loss": 2.9959, + "step": 6106 + }, + { + "epoch": 1.2238476953907815, + "grad_norm": 22.81562598890371, + "learning_rate": 7.379925845998992e-06, + "loss": 3.0975, + "step": 6107 + }, + { + "epoch": 1.2240480961923847, + "grad_norm": 31.090360813090697, + "learning_rate": 7.378900440238113e-06, + "loss": 2.5477, + "step": 6108 + }, + { + "epoch": 1.224248496993988, + "grad_norm": 24.162970522473824, + "learning_rate": 7.377874905132909e-06, + "loss": 3.1578, + "step": 6109 + }, + { + "epoch": 
1.2244488977955912, + "grad_norm": 24.035033690513004, + "learning_rate": 7.376849240739134e-06, + "loss": 2.8998, + "step": 6110 + }, + { + "epoch": 1.2246492985971944, + "grad_norm": 21.699504762460023, + "learning_rate": 7.375823447112555e-06, + "loss": 2.4466, + "step": 6111 + }, + { + "epoch": 1.2248496993987976, + "grad_norm": 15.38244300278986, + "learning_rate": 7.374797524308952e-06, + "loss": 2.4277, + "step": 6112 + }, + { + "epoch": 1.2250501002004008, + "grad_norm": 21.70385053752429, + "learning_rate": 7.373771472384099e-06, + "loss": 2.9994, + "step": 6113 + }, + { + "epoch": 1.225250501002004, + "grad_norm": 17.406077575203955, + "learning_rate": 7.3727452913937905e-06, + "loss": 2.3909, + "step": 6114 + }, + { + "epoch": 1.2254509018036073, + "grad_norm": 29.529102794366946, + "learning_rate": 7.371718981393815e-06, + "loss": 2.1415, + "step": 6115 + }, + { + "epoch": 1.2256513026052105, + "grad_norm": 21.89532777966408, + "learning_rate": 7.370692542439979e-06, + "loss": 2.4204, + "step": 6116 + }, + { + "epoch": 1.2258517034068137, + "grad_norm": 24.020877568138324, + "learning_rate": 7.3696659745880894e-06, + "loss": 3.2281, + "step": 6117 + }, + { + "epoch": 1.226052104208417, + "grad_norm": 25.211974585670017, + "learning_rate": 7.368639277893962e-06, + "loss": 2.74, + "step": 6118 + }, + { + "epoch": 1.2262525050100201, + "grad_norm": 22.226736726078563, + "learning_rate": 7.367612452413422e-06, + "loss": 2.7497, + "step": 6119 + }, + { + "epoch": 1.2264529058116231, + "grad_norm": 24.50381811803595, + "learning_rate": 7.366585498202297e-06, + "loss": 2.5976, + "step": 6120 + }, + { + "epoch": 1.2266533066132264, + "grad_norm": 27.16183222160418, + "learning_rate": 7.365558415316428e-06, + "loss": 1.9673, + "step": 6121 + }, + { + "epoch": 1.2268537074148296, + "grad_norm": 32.71046398903653, + "learning_rate": 7.3645312038116526e-06, + "loss": 2.934, + "step": 6122 + }, + { + "epoch": 1.2270541082164328, + "grad_norm": 19.888832677333465, + "learning_rate": 7.363503863743826e-06, + "loss": 2.7741, + "step": 6123 + }, + { + "epoch": 1.227254509018036, + "grad_norm": 31.138959925203462, + "learning_rate": 7.362476395168806e-06, + "loss": 2.9369, + "step": 6124 + }, + { + "epoch": 1.2274549098196392, + "grad_norm": 17.713502034901655, + "learning_rate": 7.361448798142456e-06, + "loss": 3.0046, + "step": 6125 + }, + { + "epoch": 1.2276553106212424, + "grad_norm": 46.578136762342574, + "learning_rate": 7.3604210727206515e-06, + "loss": 2.1446, + "step": 6126 + }, + { + "epoch": 1.2278557114228457, + "grad_norm": 24.201193503562788, + "learning_rate": 7.359393218959267e-06, + "loss": 3.1802, + "step": 6127 + }, + { + "epoch": 1.2280561122244489, + "grad_norm": 19.470778914206075, + "learning_rate": 7.358365236914191e-06, + "loss": 2.4227, + "step": 6128 + }, + { + "epoch": 1.228256513026052, + "grad_norm": 64.12901800276339, + "learning_rate": 7.357337126641317e-06, + "loss": 2.7209, + "step": 6129 + }, + { + "epoch": 1.2284569138276553, + "grad_norm": 24.02264457905369, + "learning_rate": 7.356308888196541e-06, + "loss": 2.6999, + "step": 6130 + }, + { + "epoch": 1.2286573146292585, + "grad_norm": 20.241205702924404, + "learning_rate": 7.355280521635773e-06, + "loss": 2.8381, + "step": 6131 + }, + { + "epoch": 1.2288577154308618, + "grad_norm": 26.555470376990286, + "learning_rate": 7.354252027014928e-06, + "loss": 2.6011, + "step": 6132 + }, + { + "epoch": 1.229058116232465, + "grad_norm": 18.94185329989584, + "learning_rate": 7.353223404389926e-06, + "loss": 2.4687, + 
"step": 6133 + }, + { + "epoch": 1.2292585170340682, + "grad_norm": 42.49962457592147, + "learning_rate": 7.352194653816691e-06, + "loss": 2.5355, + "step": 6134 + }, + { + "epoch": 1.2294589178356714, + "grad_norm": 30.058271926094566, + "learning_rate": 7.351165775351161e-06, + "loss": 2.6785, + "step": 6135 + }, + { + "epoch": 1.2296593186372746, + "grad_norm": 28.91108991136012, + "learning_rate": 7.350136769049278e-06, + "loss": 3.3355, + "step": 6136 + }, + { + "epoch": 1.2298597194388778, + "grad_norm": 21.969540556357234, + "learning_rate": 7.349107634966989e-06, + "loss": 2.9258, + "step": 6137 + }, + { + "epoch": 1.230060120240481, + "grad_norm": 51.77652310190723, + "learning_rate": 7.348078373160252e-06, + "loss": 3.0698, + "step": 6138 + }, + { + "epoch": 1.230260521042084, + "grad_norm": 26.399909754114585, + "learning_rate": 7.347048983685026e-06, + "loss": 3.4747, + "step": 6139 + }, + { + "epoch": 1.2304609218436875, + "grad_norm": 26.393393354957226, + "learning_rate": 7.3460194665972825e-06, + "loss": 3.2146, + "step": 6140 + }, + { + "epoch": 1.2306613226452905, + "grad_norm": 24.132356251919997, + "learning_rate": 7.3449898219529995e-06, + "loss": 2.8651, + "step": 6141 + }, + { + "epoch": 1.2308617234468937, + "grad_norm": 19.269580292539565, + "learning_rate": 7.3439600498081555e-06, + "loss": 2.6104, + "step": 6142 + }, + { + "epoch": 1.231062124248497, + "grad_norm": 48.664220715179404, + "learning_rate": 7.3429301502187445e-06, + "loss": 2.7242, + "step": 6143 + }, + { + "epoch": 1.2312625250501001, + "grad_norm": 27.789751074798858, + "learning_rate": 7.341900123240763e-06, + "loss": 2.9464, + "step": 6144 + }, + { + "epoch": 1.2314629258517034, + "grad_norm": 38.237216395016276, + "learning_rate": 7.340869968930214e-06, + "loss": 2.6877, + "step": 6145 + }, + { + "epoch": 1.2316633266533066, + "grad_norm": 22.687572013985122, + "learning_rate": 7.339839687343111e-06, + "loss": 3.2279, + "step": 6146 + }, + { + "epoch": 1.2318637274549098, + "grad_norm": 22.407148689891926, + "learning_rate": 7.3388092785354704e-06, + "loss": 2.5101, + "step": 6147 + }, + { + "epoch": 1.232064128256513, + "grad_norm": 24.258683144253922, + "learning_rate": 7.3377787425633155e-06, + "loss": 3.0733, + "step": 6148 + }, + { + "epoch": 1.2322645290581162, + "grad_norm": 35.110676728161366, + "learning_rate": 7.336748079482679e-06, + "loss": 2.687, + "step": 6149 + }, + { + "epoch": 1.2324649298597194, + "grad_norm": 19.325785129531358, + "learning_rate": 7.335717289349602e-06, + "loss": 2.6003, + "step": 6150 + }, + { + "epoch": 1.2326653306613227, + "grad_norm": 21.712939868227124, + "learning_rate": 7.334686372220128e-06, + "loss": 2.3731, + "step": 6151 + }, + { + "epoch": 1.2328657314629259, + "grad_norm": 22.401136182280585, + "learning_rate": 7.333655328150309e-06, + "loss": 2.8924, + "step": 6152 + }, + { + "epoch": 1.233066132264529, + "grad_norm": 23.9162790283414, + "learning_rate": 7.332624157196204e-06, + "loss": 2.7515, + "step": 6153 + }, + { + "epoch": 1.2332665330661323, + "grad_norm": 29.135192936677054, + "learning_rate": 7.331592859413882e-06, + "loss": 3.2652, + "step": 6154 + }, + { + "epoch": 1.2334669338677355, + "grad_norm": 28.032886566593437, + "learning_rate": 7.3305614348594134e-06, + "loss": 2.455, + "step": 6155 + }, + { + "epoch": 1.2336673346693388, + "grad_norm": 15.989400027140288, + "learning_rate": 7.329529883588879e-06, + "loss": 2.8238, + "step": 6156 + }, + { + "epoch": 1.233867735470942, + "grad_norm": 42.37506972806136, + "learning_rate": 
7.328498205658366e-06, + "loss": 2.6241, + "step": 6157 + }, + { + "epoch": 1.234068136272545, + "grad_norm": 20.722164490590586, + "learning_rate": 7.32746640112397e-06, + "loss": 2.3983, + "step": 6158 + }, + { + "epoch": 1.2342685370741484, + "grad_norm": 23.954844800811493, + "learning_rate": 7.32643447004179e-06, + "loss": 2.7736, + "step": 6159 + }, + { + "epoch": 1.2344689378757514, + "grad_norm": 26.58231128608536, + "learning_rate": 7.325402412467933e-06, + "loss": 2.4485, + "step": 6160 + }, + { + "epoch": 1.2346693386773546, + "grad_norm": 17.063629161869375, + "learning_rate": 7.324370228458513e-06, + "loss": 2.508, + "step": 6161 + }, + { + "epoch": 1.2348697394789578, + "grad_norm": 36.83534570777766, + "learning_rate": 7.323337918069654e-06, + "loss": 2.9259, + "step": 6162 + }, + { + "epoch": 1.235070140280561, + "grad_norm": 25.40637268267847, + "learning_rate": 7.3223054813574835e-06, + "loss": 2.3893, + "step": 6163 + }, + { + "epoch": 1.2352705410821643, + "grad_norm": 20.889474936835356, + "learning_rate": 7.321272918378135e-06, + "loss": 2.9652, + "step": 6164 + }, + { + "epoch": 1.2354709418837675, + "grad_norm": 30.619455579838842, + "learning_rate": 7.3202402291877516e-06, + "loss": 3.1012, + "step": 6165 + }, + { + "epoch": 1.2356713426853707, + "grad_norm": 53.579599331209344, + "learning_rate": 7.319207413842482e-06, + "loss": 2.3075, + "step": 6166 + }, + { + "epoch": 1.235871743486974, + "grad_norm": 19.680286795533725, + "learning_rate": 7.31817447239848e-06, + "loss": 2.7719, + "step": 6167 + }, + { + "epoch": 1.2360721442885771, + "grad_norm": 29.468967226159627, + "learning_rate": 7.317141404911913e-06, + "loss": 3.373, + "step": 6168 + }, + { + "epoch": 1.2362725450901804, + "grad_norm": 41.115615553619726, + "learning_rate": 7.3161082114389456e-06, + "loss": 2.4875, + "step": 6169 + }, + { + "epoch": 1.2364729458917836, + "grad_norm": 28.52510188988966, + "learning_rate": 7.315074892035757e-06, + "loss": 2.5378, + "step": 6170 + }, + { + "epoch": 1.2366733466933868, + "grad_norm": 23.212372591635578, + "learning_rate": 7.3140414467585286e-06, + "loss": 2.3233, + "step": 6171 + }, + { + "epoch": 1.23687374749499, + "grad_norm": 24.755388474210157, + "learning_rate": 7.31300787566345e-06, + "loss": 2.7845, + "step": 6172 + }, + { + "epoch": 1.2370741482965932, + "grad_norm": 29.46131316692346, + "learning_rate": 7.31197417880672e-06, + "loss": 2.6655, + "step": 6173 + }, + { + "epoch": 1.2372745490981965, + "grad_norm": 25.465586018474145, + "learning_rate": 7.31094035624454e-06, + "loss": 2.888, + "step": 6174 + }, + { + "epoch": 1.2374749498997997, + "grad_norm": 22.75625942362254, + "learning_rate": 7.309906408033123e-06, + "loss": 2.967, + "step": 6175 + }, + { + "epoch": 1.2376753507014029, + "grad_norm": 22.239815041059682, + "learning_rate": 7.308872334228685e-06, + "loss": 2.4718, + "step": 6176 + }, + { + "epoch": 1.237875751503006, + "grad_norm": 23.197277142510547, + "learning_rate": 7.307838134887449e-06, + "loss": 2.6922, + "step": 6177 + }, + { + "epoch": 1.2380761523046093, + "grad_norm": 22.499416082314454, + "learning_rate": 7.306803810065647e-06, + "loss": 2.7874, + "step": 6178 + }, + { + "epoch": 1.2382765531062123, + "grad_norm": 22.091135348996875, + "learning_rate": 7.305769359819517e-06, + "loss": 2.5964, + "step": 6179 + }, + { + "epoch": 1.2384769539078155, + "grad_norm": 27.29469316505334, + "learning_rate": 7.304734784205303e-06, + "loss": 3.008, + "step": 6180 + }, + { + "epoch": 1.2386773547094188, + "grad_norm": 
24.12041243578072, + "learning_rate": 7.303700083279256e-06, + "loss": 2.8804, + "step": 6181 + }, + { + "epoch": 1.238877755511022, + "grad_norm": 16.46427979707707, + "learning_rate": 7.302665257097637e-06, + "loss": 2.6918, + "step": 6182 + }, + { + "epoch": 1.2390781563126252, + "grad_norm": 22.765774744987866, + "learning_rate": 7.3016303057167074e-06, + "loss": 2.7748, + "step": 6183 + }, + { + "epoch": 1.2392785571142284, + "grad_norm": 21.20175900339727, + "learning_rate": 7.300595229192739e-06, + "loss": 2.8539, + "step": 6184 + }, + { + "epoch": 1.2394789579158316, + "grad_norm": 39.0985253746014, + "learning_rate": 7.299560027582015e-06, + "loss": 3.1731, + "step": 6185 + }, + { + "epoch": 1.2396793587174348, + "grad_norm": 21.395885330957405, + "learning_rate": 7.298524700940816e-06, + "loss": 2.7246, + "step": 6186 + }, + { + "epoch": 1.239879759519038, + "grad_norm": 27.274621339870606, + "learning_rate": 7.297489249325438e-06, + "loss": 2.9795, + "step": 6187 + }, + { + "epoch": 1.2400801603206413, + "grad_norm": 28.536499561821095, + "learning_rate": 7.2964536727921766e-06, + "loss": 2.5326, + "step": 6188 + }, + { + "epoch": 1.2402805611222445, + "grad_norm": 20.313686177247373, + "learning_rate": 7.295417971397339e-06, + "loss": 2.617, + "step": 6189 + }, + { + "epoch": 1.2404809619238477, + "grad_norm": 23.878465674185748, + "learning_rate": 7.294382145197238e-06, + "loss": 2.6154, + "step": 6190 + }, + { + "epoch": 1.240681362725451, + "grad_norm": 32.13273631292432, + "learning_rate": 7.293346194248193e-06, + "loss": 2.7204, + "step": 6191 + }, + { + "epoch": 1.2408817635270541, + "grad_norm": 29.478277559249683, + "learning_rate": 7.292310118606531e-06, + "loss": 2.8251, + "step": 6192 + }, + { + "epoch": 1.2410821643286574, + "grad_norm": 30.8110998499313, + "learning_rate": 7.291273918328583e-06, + "loss": 2.4537, + "step": 6193 + }, + { + "epoch": 1.2412825651302606, + "grad_norm": 21.917943751009368, + "learning_rate": 7.290237593470691e-06, + "loss": 2.4161, + "step": 6194 + }, + { + "epoch": 1.2414829659318638, + "grad_norm": 48.09836974464288, + "learning_rate": 7.2892011440892e-06, + "loss": 3.1241, + "step": 6195 + }, + { + "epoch": 1.241683366733467, + "grad_norm": 26.092041887238103, + "learning_rate": 7.2881645702404625e-06, + "loss": 2.591, + "step": 6196 + }, + { + "epoch": 1.2418837675350702, + "grad_norm": 27.20789281793405, + "learning_rate": 7.287127871980841e-06, + "loss": 2.9335, + "step": 6197 + }, + { + "epoch": 1.2420841683366732, + "grad_norm": 25.29947906161996, + "learning_rate": 7.2860910493667005e-06, + "loss": 2.4201, + "step": 6198 + }, + { + "epoch": 1.2422845691382767, + "grad_norm": 34.72519029677356, + "learning_rate": 7.285054102454414e-06, + "loss": 2.6065, + "step": 6199 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 26.96391364346612, + "learning_rate": 7.2840170313003635e-06, + "loss": 3.0133, + "step": 6200 + }, + { + "epoch": 1.2426853707414829, + "grad_norm": 29.06907757673361, + "learning_rate": 7.282979835960936e-06, + "loss": 3.0401, + "step": 6201 + }, + { + "epoch": 1.242885771543086, + "grad_norm": 22.153601265920095, + "learning_rate": 7.281942516492526e-06, + "loss": 2.2789, + "step": 6202 + }, + { + "epoch": 1.2430861723446893, + "grad_norm": 26.05665115945267, + "learning_rate": 7.28090507295153e-06, + "loss": 2.501, + "step": 6203 + }, + { + "epoch": 1.2432865731462925, + "grad_norm": 26.747258784015088, + "learning_rate": 7.2798675053943605e-06, + "loss": 2.9549, + "step": 6204 + }, + { + "epoch": 
1.2434869739478958, + "grad_norm": 31.66538547767724, + "learning_rate": 7.278829813877428e-06, + "loss": 2.8397, + "step": 6205 + }, + { + "epoch": 1.243687374749499, + "grad_norm": 21.592694744560777, + "learning_rate": 7.2777919984571555e-06, + "loss": 2.9279, + "step": 6206 + }, + { + "epoch": 1.2438877755511022, + "grad_norm": 33.75130907211018, + "learning_rate": 7.27675405918997e-06, + "loss": 3.3867, + "step": 6207 + }, + { + "epoch": 1.2440881763527054, + "grad_norm": 25.111199896206923, + "learning_rate": 7.275715996132305e-06, + "loss": 2.4864, + "step": 6208 + }, + { + "epoch": 1.2442885771543086, + "grad_norm": 27.688133440912782, + "learning_rate": 7.274677809340603e-06, + "loss": 3.1099, + "step": 6209 + }, + { + "epoch": 1.2444889779559118, + "grad_norm": 29.53761456902942, + "learning_rate": 7.27363949887131e-06, + "loss": 3.0967, + "step": 6210 + }, + { + "epoch": 1.244689378757515, + "grad_norm": 27.764561325047016, + "learning_rate": 7.272601064780881e-06, + "loss": 2.7807, + "step": 6211 + }, + { + "epoch": 1.2448897795591183, + "grad_norm": 20.06822530297961, + "learning_rate": 7.271562507125776e-06, + "loss": 2.5084, + "step": 6212 + }, + { + "epoch": 1.2450901803607215, + "grad_norm": 23.65754004178412, + "learning_rate": 7.270523825962467e-06, + "loss": 2.9064, + "step": 6213 + }, + { + "epoch": 1.2452905811623247, + "grad_norm": 32.05284126525905, + "learning_rate": 7.269485021347425e-06, + "loss": 2.8222, + "step": 6214 + }, + { + "epoch": 1.245490981963928, + "grad_norm": 26.48309364916915, + "learning_rate": 7.268446093337133e-06, + "loss": 2.595, + "step": 6215 + }, + { + "epoch": 1.2456913827655312, + "grad_norm": 32.000254731536636, + "learning_rate": 7.267407041988077e-06, + "loss": 2.8562, + "step": 6216 + }, + { + "epoch": 1.2458917835671341, + "grad_norm": 33.09999281092031, + "learning_rate": 7.266367867356754e-06, + "loss": 2.3234, + "step": 6217 + }, + { + "epoch": 1.2460921843687376, + "grad_norm": 25.7177592275531, + "learning_rate": 7.265328569499663e-06, + "loss": 2.4449, + "step": 6218 + }, + { + "epoch": 1.2462925851703406, + "grad_norm": 25.84803460163972, + "learning_rate": 7.264289148473316e-06, + "loss": 2.8122, + "step": 6219 + }, + { + "epoch": 1.2464929859719438, + "grad_norm": 24.739617783789182, + "learning_rate": 7.263249604334225e-06, + "loss": 2.3033, + "step": 6220 + }, + { + "epoch": 1.246693386773547, + "grad_norm": 22.922441229735455, + "learning_rate": 7.262209937138911e-06, + "loss": 1.9954, + "step": 6221 + }, + { + "epoch": 1.2468937875751502, + "grad_norm": 18.921116211857807, + "learning_rate": 7.2611701469439055e-06, + "loss": 2.7856, + "step": 6222 + }, + { + "epoch": 1.2470941883767535, + "grad_norm": 24.29982915274373, + "learning_rate": 7.26013023380574e-06, + "loss": 2.5454, + "step": 6223 + }, + { + "epoch": 1.2472945891783567, + "grad_norm": 33.60120033823072, + "learning_rate": 7.259090197780957e-06, + "loss": 2.4454, + "step": 6224 + }, + { + "epoch": 1.24749498997996, + "grad_norm": 34.47279853301886, + "learning_rate": 7.258050038926105e-06, + "loss": 2.7359, + "step": 6225 + }, + { + "epoch": 1.247695390781563, + "grad_norm": 32.301756832983195, + "learning_rate": 7.25700975729774e-06, + "loss": 2.9059, + "step": 6226 + }, + { + "epoch": 1.2478957915831663, + "grad_norm": 39.40417356451292, + "learning_rate": 7.255969352952424e-06, + "loss": 2.3497, + "step": 6227 + }, + { + "epoch": 1.2480961923847695, + "grad_norm": 31.068104771502497, + "learning_rate": 7.254928825946722e-06, + "loss": 3.479, + "step": 6228 
+ }, + { + "epoch": 1.2482965931863728, + "grad_norm": 43.27099069507002, + "learning_rate": 7.253888176337213e-06, + "loss": 2.782, + "step": 6229 + }, + { + "epoch": 1.248496993987976, + "grad_norm": 24.35653820681767, + "learning_rate": 7.252847404180474e-06, + "loss": 1.9471, + "step": 6230 + }, + { + "epoch": 1.2486973947895792, + "grad_norm": 19.657784351545992, + "learning_rate": 7.251806509533099e-06, + "loss": 2.3764, + "step": 6231 + }, + { + "epoch": 1.2488977955911824, + "grad_norm": 41.712016433516055, + "learning_rate": 7.250765492451679e-06, + "loss": 2.5415, + "step": 6232 + }, + { + "epoch": 1.2490981963927856, + "grad_norm": 24.011343129446555, + "learning_rate": 7.249724352992816e-06, + "loss": 2.6234, + "step": 6233 + }, + { + "epoch": 1.2492985971943888, + "grad_norm": 33.388462639612825, + "learning_rate": 7.2486830912131214e-06, + "loss": 2.5798, + "step": 6234 + }, + { + "epoch": 1.249498997995992, + "grad_norm": 62.04154803587106, + "learning_rate": 7.247641707169206e-06, + "loss": 2.5595, + "step": 6235 + }, + { + "epoch": 1.2496993987975953, + "grad_norm": 26.74280857775084, + "learning_rate": 7.2466002009176925e-06, + "loss": 3.2112, + "step": 6236 + }, + { + "epoch": 1.2498997995991985, + "grad_norm": 33.10138498719346, + "learning_rate": 7.245558572515211e-06, + "loss": 3.0002, + "step": 6237 + }, + { + "epoch": 1.2501002004008015, + "grad_norm": 20.698881488012592, + "learning_rate": 7.244516822018395e-06, + "loss": 2.6508, + "step": 6238 + }, + { + "epoch": 1.250300601202405, + "grad_norm": 26.809664164075688, + "learning_rate": 7.243474949483886e-06, + "loss": 2.8089, + "step": 6239 + }, + { + "epoch": 1.250501002004008, + "grad_norm": 23.27716318270455, + "learning_rate": 7.2424329549683325e-06, + "loss": 3.2345, + "step": 6240 + }, + { + "epoch": 1.2507014028056112, + "grad_norm": 24.771231777728573, + "learning_rate": 7.241390838528389e-06, + "loss": 2.7403, + "step": 6241 + }, + { + "epoch": 1.2509018036072144, + "grad_norm": 32.43141414341755, + "learning_rate": 7.2403486002207165e-06, + "loss": 3.2053, + "step": 6242 + }, + { + "epoch": 1.2511022044088176, + "grad_norm": 23.766941204109393, + "learning_rate": 7.239306240101983e-06, + "loss": 2.9015, + "step": 6243 + }, + { + "epoch": 1.2513026052104208, + "grad_norm": 29.23274282135648, + "learning_rate": 7.2382637582288664e-06, + "loss": 3.1069, + "step": 6244 + }, + { + "epoch": 1.251503006012024, + "grad_norm": 23.4325683152781, + "learning_rate": 7.237221154658043e-06, + "loss": 2.5037, + "step": 6245 + }, + { + "epoch": 1.2517034068136272, + "grad_norm": 17.864826979094786, + "learning_rate": 7.236178429446203e-06, + "loss": 2.2988, + "step": 6246 + }, + { + "epoch": 1.2519038076152305, + "grad_norm": 40.457749311471154, + "learning_rate": 7.235135582650043e-06, + "loss": 2.8116, + "step": 6247 + }, + { + "epoch": 1.2521042084168337, + "grad_norm": 24.778877237765744, + "learning_rate": 7.234092614326259e-06, + "loss": 2.5819, + "step": 6248 + }, + { + "epoch": 1.252304609218437, + "grad_norm": 33.584112623331436, + "learning_rate": 7.233049524531564e-06, + "loss": 3.0609, + "step": 6249 + }, + { + "epoch": 1.25250501002004, + "grad_norm": 40.45316785844878, + "learning_rate": 7.232006313322668e-06, + "loss": 2.9345, + "step": 6250 + }, + { + "epoch": 1.2527054108216433, + "grad_norm": 29.607198392524193, + "learning_rate": 7.230962980756296e-06, + "loss": 2.8798, + "step": 6251 + }, + { + "epoch": 1.2529058116232465, + "grad_norm": 21.310500817847636, + "learning_rate": 7.229919526889173e-06, + 
"loss": 2.9308, + "step": 6252 + }, + { + "epoch": 1.2531062124248498, + "grad_norm": 22.116259856909032, + "learning_rate": 7.228875951778034e-06, + "loss": 2.5765, + "step": 6253 + }, + { + "epoch": 1.253306613226453, + "grad_norm": 30.472474543088374, + "learning_rate": 7.227832255479619e-06, + "loss": 2.6137, + "step": 6254 + }, + { + "epoch": 1.253507014028056, + "grad_norm": 56.14252632035735, + "learning_rate": 7.2267884380506735e-06, + "loss": 2.552, + "step": 6255 + }, + { + "epoch": 1.2537074148296594, + "grad_norm": 19.378359820337618, + "learning_rate": 7.225744499547957e-06, + "loss": 2.347, + "step": 6256 + }, + { + "epoch": 1.2539078156312624, + "grad_norm": 22.19868838721107, + "learning_rate": 7.224700440028225e-06, + "loss": 3.0761, + "step": 6257 + }, + { + "epoch": 1.2541082164328659, + "grad_norm": 23.819323789331875, + "learning_rate": 7.223656259548247e-06, + "loss": 2.9675, + "step": 6258 + }, + { + "epoch": 1.2543086172344688, + "grad_norm": 24.34795741405172, + "learning_rate": 7.222611958164795e-06, + "loss": 2.4401, + "step": 6259 + }, + { + "epoch": 1.254509018036072, + "grad_norm": 26.84315174344164, + "learning_rate": 7.221567535934649e-06, + "loss": 2.9305, + "step": 6260 + }, + { + "epoch": 1.2547094188376753, + "grad_norm": 26.4770582940564, + "learning_rate": 7.220522992914598e-06, + "loss": 3.3455, + "step": 6261 + }, + { + "epoch": 1.2549098196392785, + "grad_norm": 23.38858895952147, + "learning_rate": 7.219478329161433e-06, + "loss": 3.2759, + "step": 6262 + }, + { + "epoch": 1.2551102204408817, + "grad_norm": 20.41945978934646, + "learning_rate": 7.2184335447319575e-06, + "loss": 2.6488, + "step": 6263 + }, + { + "epoch": 1.255310621242485, + "grad_norm": 35.26418841592288, + "learning_rate": 7.2173886396829725e-06, + "loss": 2.9939, + "step": 6264 + }, + { + "epoch": 1.2555110220440882, + "grad_norm": 25.41127484701099, + "learning_rate": 7.216343614071296e-06, + "loss": 3.05, + "step": 6265 + }, + { + "epoch": 1.2557114228456914, + "grad_norm": 24.269564979877572, + "learning_rate": 7.215298467953745e-06, + "loss": 3.1081, + "step": 6266 + }, + { + "epoch": 1.2559118236472946, + "grad_norm": 19.869462348732288, + "learning_rate": 7.214253201387147e-06, + "loss": 2.5554, + "step": 6267 + }, + { + "epoch": 1.2561122244488978, + "grad_norm": 26.85762563719154, + "learning_rate": 7.2132078144283314e-06, + "loss": 2.5207, + "step": 6268 + }, + { + "epoch": 1.256312625250501, + "grad_norm": 23.020236300227946, + "learning_rate": 7.212162307134142e-06, + "loss": 2.4205, + "step": 6269 + }, + { + "epoch": 1.2565130260521042, + "grad_norm": 30.259610984188235, + "learning_rate": 7.211116679561423e-06, + "loss": 2.6509, + "step": 6270 + }, + { + "epoch": 1.2567134268537075, + "grad_norm": 20.655006612491082, + "learning_rate": 7.210070931767025e-06, + "loss": 3.075, + "step": 6271 + }, + { + "epoch": 1.2569138276553107, + "grad_norm": 22.67436695371188, + "learning_rate": 7.209025063807808e-06, + "loss": 2.5086, + "step": 6272 + }, + { + "epoch": 1.257114228456914, + "grad_norm": 25.54284619288831, + "learning_rate": 7.207979075740638e-06, + "loss": 2.8246, + "step": 6273 + }, + { + "epoch": 1.2573146292585171, + "grad_norm": 35.8335013654297, + "learning_rate": 7.206932967622386e-06, + "loss": 2.954, + "step": 6274 + }, + { + "epoch": 1.2575150300601203, + "grad_norm": 27.27082574755037, + "learning_rate": 7.205886739509931e-06, + "loss": 2.5227, + "step": 6275 + }, + { + "epoch": 1.2577154308617233, + "grad_norm": 21.068418889764086, + "learning_rate": 
7.204840391460157e-06, + "loss": 2.7231, + "step": 6276 + }, + { + "epoch": 1.2579158316633268, + "grad_norm": 22.951575731339876, + "learning_rate": 7.203793923529957e-06, + "loss": 2.7812, + "step": 6277 + }, + { + "epoch": 1.2581162324649298, + "grad_norm": 27.457158027372323, + "learning_rate": 7.2027473357762286e-06, + "loss": 2.9354, + "step": 6278 + }, + { + "epoch": 1.2583166332665332, + "grad_norm": 27.802096740339397, + "learning_rate": 7.201700628255877e-06, + "loss": 2.8027, + "step": 6279 + }, + { + "epoch": 1.2585170340681362, + "grad_norm": 19.899782743614008, + "learning_rate": 7.20065380102581e-06, + "loss": 2.6977, + "step": 6280 + }, + { + "epoch": 1.2587174348697394, + "grad_norm": 27.437174833944326, + "learning_rate": 7.199606854142948e-06, + "loss": 2.4362, + "step": 6281 + }, + { + "epoch": 1.2589178356713426, + "grad_norm": 22.50215654279284, + "learning_rate": 7.198559787664214e-06, + "loss": 2.7307, + "step": 6282 + }, + { + "epoch": 1.2591182364729459, + "grad_norm": 27.61559117095106, + "learning_rate": 7.1975126016465415e-06, + "loss": 2.5914, + "step": 6283 + }, + { + "epoch": 1.259318637274549, + "grad_norm": 19.889689950391773, + "learning_rate": 7.196465296146862e-06, + "loss": 2.5702, + "step": 6284 + }, + { + "epoch": 1.2595190380761523, + "grad_norm": 27.8152744819285, + "learning_rate": 7.195417871222125e-06, + "loss": 2.611, + "step": 6285 + }, + { + "epoch": 1.2597194388777555, + "grad_norm": 23.606426314293767, + "learning_rate": 7.194370326929274e-06, + "loss": 2.697, + "step": 6286 + }, + { + "epoch": 1.2599198396793587, + "grad_norm": 24.45470748781099, + "learning_rate": 7.193322663325271e-06, + "loss": 2.9951, + "step": 6287 + }, + { + "epoch": 1.260120240480962, + "grad_norm": 29.587538326435133, + "learning_rate": 7.192274880467079e-06, + "loss": 2.9252, + "step": 6288 + }, + { + "epoch": 1.2603206412825652, + "grad_norm": 44.74173531545082, + "learning_rate": 7.191226978411663e-06, + "loss": 3.6931, + "step": 6289 + }, + { + "epoch": 1.2605210420841684, + "grad_norm": 27.406917136901452, + "learning_rate": 7.190178957216003e-06, + "loss": 2.9632, + "step": 6290 + }, + { + "epoch": 1.2607214428857716, + "grad_norm": 51.11003337139766, + "learning_rate": 7.189130816937079e-06, + "loss": 2.7024, + "step": 6291 + }, + { + "epoch": 1.2609218436873748, + "grad_norm": 29.599193174005887, + "learning_rate": 7.1880825576318815e-06, + "loss": 3.2823, + "step": 6292 + }, + { + "epoch": 1.261122244488978, + "grad_norm": 24.495831056053703, + "learning_rate": 7.187034179357405e-06, + "loss": 2.7623, + "step": 6293 + }, + { + "epoch": 1.2613226452905812, + "grad_norm": 26.40747891093518, + "learning_rate": 7.185985682170653e-06, + "loss": 3.2123, + "step": 6294 + }, + { + "epoch": 1.2615230460921842, + "grad_norm": 22.50585155790682, + "learning_rate": 7.184937066128632e-06, + "loss": 3.2333, + "step": 6295 + }, + { + "epoch": 1.2617234468937877, + "grad_norm": 21.90225539235219, + "learning_rate": 7.183888331288358e-06, + "loss": 2.5545, + "step": 6296 + }, + { + "epoch": 1.2619238476953907, + "grad_norm": 63.93852069190834, + "learning_rate": 7.182839477706851e-06, + "loss": 3.1014, + "step": 6297 + }, + { + "epoch": 1.2621242484969941, + "grad_norm": 24.602134659637393, + "learning_rate": 7.18179050544114e-06, + "loss": 2.3791, + "step": 6298 + }, + { + "epoch": 1.2623246492985971, + "grad_norm": 27.398321310363986, + "learning_rate": 7.180741414548257e-06, + "loss": 2.7663, + "step": 6299 + }, + { + "epoch": 1.2625250501002003, + "grad_norm": 
22.594280777063233, + "learning_rate": 7.179692205085246e-06, + "loss": 2.3504, + "step": 6300 + }, + { + "epoch": 1.2627254509018035, + "grad_norm": 26.667569412446788, + "learning_rate": 7.178642877109151e-06, + "loss": 2.2386, + "step": 6301 + }, + { + "epoch": 1.2629258517034068, + "grad_norm": 26.56713371877463, + "learning_rate": 7.177593430677029e-06, + "loss": 2.7761, + "step": 6302 + }, + { + "epoch": 1.26312625250501, + "grad_norm": 34.26789479197395, + "learning_rate": 7.176543865845935e-06, + "loss": 3.0686, + "step": 6303 + }, + { + "epoch": 1.2633266533066132, + "grad_norm": 21.390140252885118, + "learning_rate": 7.175494182672939e-06, + "loss": 3.1917, + "step": 6304 + }, + { + "epoch": 1.2635270541082164, + "grad_norm": 25.702490230597224, + "learning_rate": 7.174444381215114e-06, + "loss": 2.9849, + "step": 6305 + }, + { + "epoch": 1.2637274549098196, + "grad_norm": 30.834452643266147, + "learning_rate": 7.173394461529537e-06, + "loss": 2.8794, + "step": 6306 + }, + { + "epoch": 1.2639278557114229, + "grad_norm": 32.6277522918017, + "learning_rate": 7.172344423673296e-06, + "loss": 2.9924, + "step": 6307 + }, + { + "epoch": 1.264128256513026, + "grad_norm": 25.522582449465393, + "learning_rate": 7.171294267703481e-06, + "loss": 2.5718, + "step": 6308 + }, + { + "epoch": 1.2643286573146293, + "grad_norm": 30.925022850643085, + "learning_rate": 7.170243993677193e-06, + "loss": 2.9642, + "step": 6309 + }, + { + "epoch": 1.2645290581162325, + "grad_norm": 23.140699192758728, + "learning_rate": 7.169193601651535e-06, + "loss": 3.0953, + "step": 6310 + }, + { + "epoch": 1.2647294589178357, + "grad_norm": 57.10430238494751, + "learning_rate": 7.168143091683618e-06, + "loss": 3.5104, + "step": 6311 + }, + { + "epoch": 1.264929859719439, + "grad_norm": 23.351939193704222, + "learning_rate": 7.1670924638305605e-06, + "loss": 2.7186, + "step": 6312 + }, + { + "epoch": 1.2651302605210422, + "grad_norm": 21.067522359112044, + "learning_rate": 7.166041718149488e-06, + "loss": 3.0576, + "step": 6313 + }, + { + "epoch": 1.2653306613226452, + "grad_norm": 25.378751004468647, + "learning_rate": 7.164990854697529e-06, + "loss": 2.9861, + "step": 6314 + }, + { + "epoch": 1.2655310621242486, + "grad_norm": 19.891683924069522, + "learning_rate": 7.163939873531823e-06, + "loss": 2.9093, + "step": 6315 + }, + { + "epoch": 1.2657314629258516, + "grad_norm": 21.508939624143046, + "learning_rate": 7.1628887747095085e-06, + "loss": 2.9227, + "step": 6316 + }, + { + "epoch": 1.265931863727455, + "grad_norm": 27.85007085330779, + "learning_rate": 7.161837558287741e-06, + "loss": 2.6752, + "step": 6317 + }, + { + "epoch": 1.266132264529058, + "grad_norm": 26.087017869853163, + "learning_rate": 7.1607862243236734e-06, + "loss": 2.584, + "step": 6318 + }, + { + "epoch": 1.2663326653306612, + "grad_norm": 29.348643634237394, + "learning_rate": 7.159734772874471e-06, + "loss": 2.4356, + "step": 6319 + }, + { + "epoch": 1.2665330661322645, + "grad_norm": 35.39959846325873, + "learning_rate": 7.158683203997299e-06, + "loss": 3.0592, + "step": 6320 + }, + { + "epoch": 1.2667334669338677, + "grad_norm": 30.858609142163395, + "learning_rate": 7.157631517749335e-06, + "loss": 2.9184, + "step": 6321 + }, + { + "epoch": 1.266933867735471, + "grad_norm": 19.644924615992608, + "learning_rate": 7.156579714187761e-06, + "loss": 2.3716, + "step": 6322 + }, + { + "epoch": 1.2671342685370741, + "grad_norm": 21.17127141422649, + "learning_rate": 7.1555277933697645e-06, + "loss": 3.1329, + "step": 6323 + }, + { + "epoch": 
1.2673346693386773, + "grad_norm": 25.491184117454086, + "learning_rate": 7.154475755352539e-06, + "loss": 2.5584, + "step": 6324 + }, + { + "epoch": 1.2675350701402806, + "grad_norm": 30.223899972624046, + "learning_rate": 7.153423600193288e-06, + "loss": 2.6582, + "step": 6325 + }, + { + "epoch": 1.2677354709418838, + "grad_norm": 28.218593137712727, + "learning_rate": 7.152371327949216e-06, + "loss": 2.83, + "step": 6326 + }, + { + "epoch": 1.267935871743487, + "grad_norm": 24.550427255779155, + "learning_rate": 7.151318938677539e-06, + "loss": 2.9008, + "step": 6327 + }, + { + "epoch": 1.2681362725450902, + "grad_norm": 43.03301151833378, + "learning_rate": 7.150266432435473e-06, + "loss": 3.1164, + "step": 6328 + }, + { + "epoch": 1.2683366733466934, + "grad_norm": 23.215951801720077, + "learning_rate": 7.149213809280249e-06, + "loss": 2.8441, + "step": 6329 + }, + { + "epoch": 1.2685370741482966, + "grad_norm": 26.71366723832453, + "learning_rate": 7.148161069269097e-06, + "loss": 3.403, + "step": 6330 + }, + { + "epoch": 1.2687374749498999, + "grad_norm": 26.114274770452546, + "learning_rate": 7.147108212459257e-06, + "loss": 2.4284, + "step": 6331 + }, + { + "epoch": 1.268937875751503, + "grad_norm": 50.41532105290253, + "learning_rate": 7.146055238907974e-06, + "loss": 3.2413, + "step": 6332 + }, + { + "epoch": 1.2691382765531063, + "grad_norm": 35.13384470570797, + "learning_rate": 7.1450021486725e-06, + "loss": 3.0574, + "step": 6333 + }, + { + "epoch": 1.2693386773547095, + "grad_norm": 24.99975196969279, + "learning_rate": 7.143948941810094e-06, + "loss": 2.3666, + "step": 6334 + }, + { + "epoch": 1.2695390781563125, + "grad_norm": 23.09032020809276, + "learning_rate": 7.142895618378017e-06, + "loss": 2.4133, + "step": 6335 + }, + { + "epoch": 1.269739478957916, + "grad_norm": 19.437235111931773, + "learning_rate": 7.141842178433542e-06, + "loss": 2.587, + "step": 6336 + }, + { + "epoch": 1.269939879759519, + "grad_norm": 32.56469043725005, + "learning_rate": 7.140788622033947e-06, + "loss": 2.4999, + "step": 6337 + }, + { + "epoch": 1.2701402805611224, + "grad_norm": 21.842010861489033, + "learning_rate": 7.139734949236515e-06, + "loss": 2.5514, + "step": 6338 + }, + { + "epoch": 1.2703406813627254, + "grad_norm": 25.52108791148876, + "learning_rate": 7.138681160098536e-06, + "loss": 2.9035, + "step": 6339 + }, + { + "epoch": 1.2705410821643286, + "grad_norm": 17.431168397292296, + "learning_rate": 7.137627254677304e-06, + "loss": 2.408, + "step": 6340 + }, + { + "epoch": 1.2707414829659318, + "grad_norm": 22.130433440872057, + "learning_rate": 7.136573233030124e-06, + "loss": 2.4804, + "step": 6341 + }, + { + "epoch": 1.270941883767535, + "grad_norm": 24.320707664997133, + "learning_rate": 7.1355190952143e-06, + "loss": 2.8751, + "step": 6342 + }, + { + "epoch": 1.2711422845691382, + "grad_norm": 19.32634610869234, + "learning_rate": 7.134464841287153e-06, + "loss": 2.2981, + "step": 6343 + }, + { + "epoch": 1.2713426853707415, + "grad_norm": 26.84872609977439, + "learning_rate": 7.133410471306002e-06, + "loss": 3.3989, + "step": 6344 + }, + { + "epoch": 1.2715430861723447, + "grad_norm": 27.359878291836058, + "learning_rate": 7.132355985328174e-06, + "loss": 2.233, + "step": 6345 + }, + { + "epoch": 1.271743486973948, + "grad_norm": 24.636426054179402, + "learning_rate": 7.131301383411004e-06, + "loss": 2.1123, + "step": 6346 + }, + { + "epoch": 1.2719438877755511, + "grad_norm": 26.497792077560554, + "learning_rate": 7.1302466656118315e-06, + "loss": 3.0983, + "step": 6347 
+ }, + { + "epoch": 1.2721442885771543, + "grad_norm": 36.170357148036885, + "learning_rate": 7.129191831988004e-06, + "loss": 3.7945, + "step": 6348 + }, + { + "epoch": 1.2723446893787576, + "grad_norm": 44.46907636244326, + "learning_rate": 7.128136882596872e-06, + "loss": 2.3078, + "step": 6349 + }, + { + "epoch": 1.2725450901803608, + "grad_norm": 26.161064349902745, + "learning_rate": 7.127081817495797e-06, + "loss": 2.9171, + "step": 6350 + }, + { + "epoch": 1.272745490981964, + "grad_norm": 25.06955763330201, + "learning_rate": 7.126026636742144e-06, + "loss": 2.9328, + "step": 6351 + }, + { + "epoch": 1.2729458917835672, + "grad_norm": 27.92845903088589, + "learning_rate": 7.124971340393284e-06, + "loss": 2.6768, + "step": 6352 + }, + { + "epoch": 1.2731462925851704, + "grad_norm": 31.135482429409123, + "learning_rate": 7.123915928506595e-06, + "loss": 2.6642, + "step": 6353 + }, + { + "epoch": 1.2733466933867734, + "grad_norm": 22.337777964450076, + "learning_rate": 7.122860401139464e-06, + "loss": 2.9516, + "step": 6354 + }, + { + "epoch": 1.2735470941883769, + "grad_norm": 28.014006715419768, + "learning_rate": 7.1218047583492756e-06, + "loss": 3.1999, + "step": 6355 + }, + { + "epoch": 1.2737474949899799, + "grad_norm": 21.699989360729592, + "learning_rate": 7.1207490001934345e-06, + "loss": 2.6845, + "step": 6356 + }, + { + "epoch": 1.2739478957915833, + "grad_norm": 20.1892205235311, + "learning_rate": 7.1196931267293365e-06, + "loss": 2.7368, + "step": 6357 + }, + { + "epoch": 1.2741482965931863, + "grad_norm": 23.23617889776201, + "learning_rate": 7.118637138014396e-06, + "loss": 2.9466, + "step": 6358 + }, + { + "epoch": 1.2743486973947895, + "grad_norm": 28.777533819734597, + "learning_rate": 7.117581034106027e-06, + "loss": 2.6287, + "step": 6359 + }, + { + "epoch": 1.2745490981963927, + "grad_norm": 30.21392158641006, + "learning_rate": 7.116524815061651e-06, + "loss": 2.7434, + "step": 6360 + }, + { + "epoch": 1.274749498997996, + "grad_norm": 25.015113123400152, + "learning_rate": 7.115468480938695e-06, + "loss": 3.1967, + "step": 6361 + }, + { + "epoch": 1.2749498997995992, + "grad_norm": 22.363825761529938, + "learning_rate": 7.114412031794597e-06, + "loss": 2.7051, + "step": 6362 + }, + { + "epoch": 1.2751503006012024, + "grad_norm": 32.550238922759334, + "learning_rate": 7.113355467686795e-06, + "loss": 3.2856, + "step": 6363 + }, + { + "epoch": 1.2753507014028056, + "grad_norm": 19.77550138480804, + "learning_rate": 7.112298788672737e-06, + "loss": 2.442, + "step": 6364 + }, + { + "epoch": 1.2755511022044088, + "grad_norm": 26.12124958928582, + "learning_rate": 7.111241994809876e-06, + "loss": 2.2385, + "step": 6365 + }, + { + "epoch": 1.275751503006012, + "grad_norm": 25.93077658858525, + "learning_rate": 7.110185086155671e-06, + "loss": 2.8886, + "step": 6366 + }, + { + "epoch": 1.2759519038076153, + "grad_norm": 18.624305813112965, + "learning_rate": 7.109128062767587e-06, + "loss": 2.8121, + "step": 6367 + }, + { + "epoch": 1.2761523046092185, + "grad_norm": 17.358481251432288, + "learning_rate": 7.108070924703096e-06, + "loss": 2.3431, + "step": 6368 + }, + { + "epoch": 1.2763527054108217, + "grad_norm": 34.564011319428204, + "learning_rate": 7.1070136720196795e-06, + "loss": 2.89, + "step": 6369 + }, + { + "epoch": 1.276553106212425, + "grad_norm": 27.461417689262504, + "learning_rate": 7.1059563047748184e-06, + "loss": 3.3456, + "step": 6370 + }, + { + "epoch": 1.2767535070140281, + "grad_norm": 27.922028913457574, + "learning_rate": 7.104898823026006e-06, 
+ "loss": 2.847, + "step": 6371 + }, + { + "epoch": 1.2769539078156313, + "grad_norm": 31.179304288907367, + "learning_rate": 7.103841226830734e-06, + "loss": 2.8868, + "step": 6372 + }, + { + "epoch": 1.2771543086172343, + "grad_norm": 20.210667869657453, + "learning_rate": 7.102783516246511e-06, + "loss": 2.6727, + "step": 6373 + }, + { + "epoch": 1.2773547094188378, + "grad_norm": 21.046624324577067, + "learning_rate": 7.101725691330844e-06, + "loss": 2.6139, + "step": 6374 + }, + { + "epoch": 1.2775551102204408, + "grad_norm": 15.760612881580478, + "learning_rate": 7.100667752141248e-06, + "loss": 2.6382, + "step": 6375 + }, + { + "epoch": 1.2777555110220442, + "grad_norm": 24.36526006931832, + "learning_rate": 7.099609698735248e-06, + "loss": 2.2728, + "step": 6376 + }, + { + "epoch": 1.2779559118236472, + "grad_norm": 24.787004974635295, + "learning_rate": 7.098551531170367e-06, + "loss": 2.495, + "step": 6377 + }, + { + "epoch": 1.2781563126252504, + "grad_norm": 28.338613228156255, + "learning_rate": 7.097493249504143e-06, + "loss": 2.6334, + "step": 6378 + }, + { + "epoch": 1.2783567134268536, + "grad_norm": 22.75355066198689, + "learning_rate": 7.096434853794114e-06, + "loss": 2.7885, + "step": 6379 + }, + { + "epoch": 1.2785571142284569, + "grad_norm": 54.36969285679266, + "learning_rate": 7.095376344097826e-06, + "loss": 3.1989, + "step": 6380 + }, + { + "epoch": 1.27875751503006, + "grad_norm": 29.9539468410727, + "learning_rate": 7.094317720472834e-06, + "loss": 2.2471, + "step": 6381 + }, + { + "epoch": 1.2789579158316633, + "grad_norm": 26.005559931777434, + "learning_rate": 7.093258982976696e-06, + "loss": 2.7432, + "step": 6382 + }, + { + "epoch": 1.2791583166332665, + "grad_norm": 38.9371525936711, + "learning_rate": 7.092200131666979e-06, + "loss": 2.8337, + "step": 6383 + }, + { + "epoch": 1.2793587174348697, + "grad_norm": 45.757563032026695, + "learning_rate": 7.0911411666012505e-06, + "loss": 2.993, + "step": 6384 + }, + { + "epoch": 1.279559118236473, + "grad_norm": 19.3242114479701, + "learning_rate": 7.090082087837092e-06, + "loss": 2.5611, + "step": 6385 + }, + { + "epoch": 1.2797595190380762, + "grad_norm": 30.66192969621606, + "learning_rate": 7.0890228954320825e-06, + "loss": 2.9471, + "step": 6386 + }, + { + "epoch": 1.2799599198396794, + "grad_norm": 40.44449438595971, + "learning_rate": 7.087963589443816e-06, + "loss": 3.3594, + "step": 6387 + }, + { + "epoch": 1.2801603206412826, + "grad_norm": 24.21962469070255, + "learning_rate": 7.086904169929887e-06, + "loss": 2.688, + "step": 6388 + }, + { + "epoch": 1.2803607214428858, + "grad_norm": 25.292817903835726, + "learning_rate": 7.085844636947897e-06, + "loss": 2.6939, + "step": 6389 + }, + { + "epoch": 1.280561122244489, + "grad_norm": 20.83932196508336, + "learning_rate": 7.0847849905554566e-06, + "loss": 2.637, + "step": 6390 + }, + { + "epoch": 1.2807615230460923, + "grad_norm": 26.537492737973302, + "learning_rate": 7.083725230810177e-06, + "loss": 2.9903, + "step": 6391 + }, + { + "epoch": 1.2809619238476955, + "grad_norm": 29.201309442019955, + "learning_rate": 7.0826653577696815e-06, + "loss": 2.9812, + "step": 6392 + }, + { + "epoch": 1.2811623246492987, + "grad_norm": 23.4727894712901, + "learning_rate": 7.081605371491595e-06, + "loss": 2.522, + "step": 6393 + }, + { + "epoch": 1.2813627254509017, + "grad_norm": 47.38839700035899, + "learning_rate": 7.0805452720335545e-06, + "loss": 2.855, + "step": 6394 + }, + { + "epoch": 1.2815631262525051, + "grad_norm": 27.461900454104622, + "learning_rate": 
7.079485059453195e-06, + "loss": 2.7764, + "step": 6395 + }, + { + "epoch": 1.2817635270541081, + "grad_norm": 22.630514719179338, + "learning_rate": 7.078424733808162e-06, + "loss": 2.7881, + "step": 6396 + }, + { + "epoch": 1.2819639278557116, + "grad_norm": 21.20479085330028, + "learning_rate": 7.07736429515611e-06, + "loss": 2.5483, + "step": 6397 + }, + { + "epoch": 1.2821643286573146, + "grad_norm": 49.28520788523637, + "learning_rate": 7.076303743554693e-06, + "loss": 2.8339, + "step": 6398 + }, + { + "epoch": 1.2823647294589178, + "grad_norm": 24.465575576274844, + "learning_rate": 7.075243079061578e-06, + "loss": 2.6303, + "step": 6399 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 17.543088502786862, + "learning_rate": 7.0741823017344335e-06, + "loss": 3.0009, + "step": 6400 + }, + { + "epoch": 1.2827655310621242, + "grad_norm": 19.0682498962099, + "learning_rate": 7.073121411630935e-06, + "loss": 2.6697, + "step": 6401 + }, + { + "epoch": 1.2829659318637274, + "grad_norm": 24.253136889937075, + "learning_rate": 7.072060408808765e-06, + "loss": 2.8301, + "step": 6402 + }, + { + "epoch": 1.2831663326653306, + "grad_norm": 25.098358907202492, + "learning_rate": 7.0709992933256135e-06, + "loss": 2.9767, + "step": 6403 + }, + { + "epoch": 1.2833667334669339, + "grad_norm": 31.844858403716746, + "learning_rate": 7.069938065239171e-06, + "loss": 2.6197, + "step": 6404 + }, + { + "epoch": 1.283567134268537, + "grad_norm": 22.057311867306552, + "learning_rate": 7.068876724607141e-06, + "loss": 2.9741, + "step": 6405 + }, + { + "epoch": 1.2837675350701403, + "grad_norm": 25.060438819367736, + "learning_rate": 7.06781527148723e-06, + "loss": 3.1385, + "step": 6406 + }, + { + "epoch": 1.2839679358717435, + "grad_norm": 34.022777484312364, + "learning_rate": 7.06675370593715e-06, + "loss": 3.0291, + "step": 6407 + }, + { + "epoch": 1.2841683366733467, + "grad_norm": 23.165870471391592, + "learning_rate": 7.065692028014621e-06, + "loss": 3.1188, + "step": 6408 + }, + { + "epoch": 1.28436873747495, + "grad_norm": 43.64496395937128, + "learning_rate": 7.064630237777366e-06, + "loss": 2.8184, + "step": 6409 + }, + { + "epoch": 1.2845691382765532, + "grad_norm": 20.14852782023496, + "learning_rate": 7.063568335283119e-06, + "loss": 2.4872, + "step": 6410 + }, + { + "epoch": 1.2847695390781564, + "grad_norm": 25.727075218476458, + "learning_rate": 7.062506320589613e-06, + "loss": 2.5076, + "step": 6411 + }, + { + "epoch": 1.2849699398797596, + "grad_norm": 24.19044206643202, + "learning_rate": 7.061444193754597e-06, + "loss": 2.4038, + "step": 6412 + }, + { + "epoch": 1.2851703406813626, + "grad_norm": 50.18807267679607, + "learning_rate": 7.060381954835815e-06, + "loss": 3.2328, + "step": 6413 + }, + { + "epoch": 1.285370741482966, + "grad_norm": 27.38014160812599, + "learning_rate": 7.059319603891024e-06, + "loss": 3.2016, + "step": 6414 + }, + { + "epoch": 1.285571142284569, + "grad_norm": 33.960509958718276, + "learning_rate": 7.058257140977989e-06, + "loss": 3.0532, + "step": 6415 + }, + { + "epoch": 1.2857715430861725, + "grad_norm": 29.492167359599378, + "learning_rate": 7.0571945661544725e-06, + "loss": 2.9705, + "step": 6416 + }, + { + "epoch": 1.2859719438877755, + "grad_norm": 25.82610256434226, + "learning_rate": 7.056131879478252e-06, + "loss": 2.6148, + "step": 6417 + }, + { + "epoch": 1.2861723446893787, + "grad_norm": 29.977594034866478, + "learning_rate": 7.0550690810071045e-06, + "loss": 3.5008, + "step": 6418 + }, + { + "epoch": 1.286372745490982, + "grad_norm": 
33.09172685905947, + "learning_rate": 7.05400617079882e-06, + "loss": 3.0374, + "step": 6419 + }, + { + "epoch": 1.2865731462925851, + "grad_norm": 44.51449391499716, + "learning_rate": 7.0529431489111865e-06, + "loss": 2.8827, + "step": 6420 + }, + { + "epoch": 1.2867735470941883, + "grad_norm": 21.539746532370238, + "learning_rate": 7.051880015402004e-06, + "loss": 2.3014, + "step": 6421 + }, + { + "epoch": 1.2869739478957916, + "grad_norm": 24.753214346763947, + "learning_rate": 7.0508167703290765e-06, + "loss": 2.8386, + "step": 6422 + }, + { + "epoch": 1.2871743486973948, + "grad_norm": 191.88821943969398, + "learning_rate": 7.049753413750213e-06, + "loss": 3.1599, + "step": 6423 + }, + { + "epoch": 1.287374749498998, + "grad_norm": 46.01889304631574, + "learning_rate": 7.048689945723231e-06, + "loss": 2.893, + "step": 6424 + }, + { + "epoch": 1.2875751503006012, + "grad_norm": 21.611629948181122, + "learning_rate": 7.047626366305953e-06, + "loss": 3.204, + "step": 6425 + }, + { + "epoch": 1.2877755511022044, + "grad_norm": 27.595671832417175, + "learning_rate": 7.046562675556206e-06, + "loss": 2.7871, + "step": 6426 + }, + { + "epoch": 1.2879759519038076, + "grad_norm": 21.572039775151882, + "learning_rate": 7.045498873531827e-06, + "loss": 2.5107, + "step": 6427 + }, + { + "epoch": 1.2881763527054109, + "grad_norm": 17.388187126972262, + "learning_rate": 7.044434960290652e-06, + "loss": 2.4605, + "step": 6428 + }, + { + "epoch": 1.288376753507014, + "grad_norm": 37.307845385406644, + "learning_rate": 7.043370935890533e-06, + "loss": 2.9835, + "step": 6429 + }, + { + "epoch": 1.2885771543086173, + "grad_norm": 21.400516337171517, + "learning_rate": 7.042306800389318e-06, + "loss": 2.1724, + "step": 6430 + }, + { + "epoch": 1.2887775551102205, + "grad_norm": 38.461203802106155, + "learning_rate": 7.041242553844868e-06, + "loss": 2.982, + "step": 6431 + }, + { + "epoch": 1.2889779559118235, + "grad_norm": 24.971178591011864, + "learning_rate": 7.040178196315048e-06, + "loss": 3.1353, + "step": 6432 + }, + { + "epoch": 1.289178356713427, + "grad_norm": 22.565182884371893, + "learning_rate": 7.039113727857728e-06, + "loss": 2.38, + "step": 6433 + }, + { + "epoch": 1.28937875751503, + "grad_norm": 23.61879740935004, + "learning_rate": 7.038049148530786e-06, + "loss": 2.305, + "step": 6434 + }, + { + "epoch": 1.2895791583166334, + "grad_norm": 23.210194108526423, + "learning_rate": 7.0369844583921e-06, + "loss": 2.5154, + "step": 6435 + }, + { + "epoch": 1.2897795591182364, + "grad_norm": 34.15758933629778, + "learning_rate": 7.035919657499563e-06, + "loss": 2.3069, + "step": 6436 + }, + { + "epoch": 1.2899799599198396, + "grad_norm": 21.449217878209804, + "learning_rate": 7.03485474591107e-06, + "loss": 2.8599, + "step": 6437 + }, + { + "epoch": 1.2901803607214428, + "grad_norm": 27.985033868183397, + "learning_rate": 7.03378972368452e-06, + "loss": 2.7719, + "step": 6438 + }, + { + "epoch": 1.290380761523046, + "grad_norm": 23.989854907214855, + "learning_rate": 7.032724590877822e-06, + "loss": 3.1599, + "step": 6439 + }, + { + "epoch": 1.2905811623246493, + "grad_norm": 32.19643999439671, + "learning_rate": 7.031659347548886e-06, + "loss": 3.0217, + "step": 6440 + }, + { + "epoch": 1.2907815631262525, + "grad_norm": 39.98273070692245, + "learning_rate": 7.030593993755632e-06, + "loss": 2.8359, + "step": 6441 + }, + { + "epoch": 1.2909819639278557, + "grad_norm": 52.13738402399133, + "learning_rate": 7.0295285295559855e-06, + "loss": 2.712, + "step": 6442 + }, + { + "epoch": 
1.291182364729459, + "grad_norm": 24.074113587605698, + "learning_rate": 7.0284629550078775e-06, + "loss": 1.8915, + "step": 6443 + }, + { + "epoch": 1.2913827655310621, + "grad_norm": 32.8159092073609, + "learning_rate": 7.027397270169245e-06, + "loss": 3.0234, + "step": 6444 + }, + { + "epoch": 1.2915831663326653, + "grad_norm": 23.787720965181933, + "learning_rate": 7.026331475098028e-06, + "loss": 2.4624, + "step": 6445 + }, + { + "epoch": 1.2917835671342686, + "grad_norm": 17.79578845834178, + "learning_rate": 7.025265569852179e-06, + "loss": 2.813, + "step": 6446 + }, + { + "epoch": 1.2919839679358718, + "grad_norm": 31.655844336568673, + "learning_rate": 7.024199554489652e-06, + "loss": 3.0418, + "step": 6447 + }, + { + "epoch": 1.292184368737475, + "grad_norm": 24.8673029091158, + "learning_rate": 7.023133429068406e-06, + "loss": 2.6183, + "step": 6448 + }, + { + "epoch": 1.2923847695390782, + "grad_norm": 21.54435123426312, + "learning_rate": 7.022067193646409e-06, + "loss": 3.0925, + "step": 6449 + }, + { + "epoch": 1.2925851703406814, + "grad_norm": 55.735846796429975, + "learning_rate": 7.021000848281635e-06, + "loss": 3.1586, + "step": 6450 + }, + { + "epoch": 1.2927855711422847, + "grad_norm": 30.26569368930504, + "learning_rate": 7.019934393032062e-06, + "loss": 2.1961, + "step": 6451 + }, + { + "epoch": 1.2929859719438879, + "grad_norm": 49.23718957727631, + "learning_rate": 7.018867827955672e-06, + "loss": 2.8872, + "step": 6452 + }, + { + "epoch": 1.2931863727454909, + "grad_norm": 19.214284615577682, + "learning_rate": 7.017801153110459e-06, + "loss": 2.9487, + "step": 6453 + }, + { + "epoch": 1.2933867735470943, + "grad_norm": 23.849931613493894, + "learning_rate": 7.01673436855442e-06, + "loss": 2.5439, + "step": 6454 + }, + { + "epoch": 1.2935871743486973, + "grad_norm": 29.443355234550012, + "learning_rate": 7.0156674743455555e-06, + "loss": 2.5064, + "step": 6455 + }, + { + "epoch": 1.2937875751503007, + "grad_norm": 30.87957540274819, + "learning_rate": 7.014600470541875e-06, + "loss": 2.8911, + "step": 6456 + }, + { + "epoch": 1.2939879759519037, + "grad_norm": 36.20183140241489, + "learning_rate": 7.013533357201393e-06, + "loss": 3.6883, + "step": 6457 + }, + { + "epoch": 1.294188376753507, + "grad_norm": 29.201404941352507, + "learning_rate": 7.01246613438213e-06, + "loss": 3.0082, + "step": 6458 + }, + { + "epoch": 1.2943887775551102, + "grad_norm": 42.90682745511628, + "learning_rate": 7.0113988021421135e-06, + "loss": 2.3956, + "step": 6459 + }, + { + "epoch": 1.2945891783567134, + "grad_norm": 28.818655972954062, + "learning_rate": 7.010331360539374e-06, + "loss": 2.5755, + "step": 6460 + }, + { + "epoch": 1.2947895791583166, + "grad_norm": 27.112114718141996, + "learning_rate": 7.009263809631952e-06, + "loss": 2.835, + "step": 6461 + }, + { + "epoch": 1.2949899799599198, + "grad_norm": 25.048941810862754, + "learning_rate": 7.00819614947789e-06, + "loss": 2.677, + "step": 6462 + }, + { + "epoch": 1.295190380761523, + "grad_norm": 28.436391863845987, + "learning_rate": 7.0071283801352375e-06, + "loss": 2.8639, + "step": 6463 + }, + { + "epoch": 1.2953907815631263, + "grad_norm": 35.75173416629313, + "learning_rate": 7.006060501662056e-06, + "loss": 3.6042, + "step": 6464 + }, + { + "epoch": 1.2955911823647295, + "grad_norm": 30.130046527388004, + "learning_rate": 7.0049925141164e-06, + "loss": 2.9803, + "step": 6465 + }, + { + "epoch": 1.2957915831663327, + "grad_norm": 19.43867353011513, + "learning_rate": 7.003924417556344e-06, + "loss": 2.5816, + "step": 
6466 + }, + { + "epoch": 1.295991983967936, + "grad_norm": 28.9669160199681, + "learning_rate": 7.0028562120399565e-06, + "loss": 2.3656, + "step": 6467 + }, + { + "epoch": 1.2961923847695391, + "grad_norm": 26.76128421561029, + "learning_rate": 7.001787897625323e-06, + "loss": 3.0529, + "step": 6468 + }, + { + "epoch": 1.2963927855711423, + "grad_norm": 22.522094115417833, + "learning_rate": 7.000719474370525e-06, + "loss": 2.5225, + "step": 6469 + }, + { + "epoch": 1.2965931863727456, + "grad_norm": 23.36184844934133, + "learning_rate": 6.999650942333657e-06, + "loss": 2.3415, + "step": 6470 + }, + { + "epoch": 1.2967935871743488, + "grad_norm": 26.722377588119414, + "learning_rate": 6.998582301572816e-06, + "loss": 2.9229, + "step": 6471 + }, + { + "epoch": 1.2969939879759518, + "grad_norm": 27.864610657105757, + "learning_rate": 6.997513552146103e-06, + "loss": 2.9863, + "step": 6472 + }, + { + "epoch": 1.2971943887775552, + "grad_norm": 28.081340834049016, + "learning_rate": 6.996444694111631e-06, + "loss": 2.7449, + "step": 6473 + }, + { + "epoch": 1.2973947895791582, + "grad_norm": 23.617152308753546, + "learning_rate": 6.995375727527513e-06, + "loss": 2.4213, + "step": 6474 + }, + { + "epoch": 1.2975951903807617, + "grad_norm": 19.250189650438276, + "learning_rate": 6.994306652451872e-06, + "loss": 2.6741, + "step": 6475 + }, + { + "epoch": 1.2977955911823646, + "grad_norm": 25.304527701966958, + "learning_rate": 6.993237468942835e-06, + "loss": 2.9556, + "step": 6476 + }, + { + "epoch": 1.2979959919839679, + "grad_norm": 22.04269987057819, + "learning_rate": 6.992168177058534e-06, + "loss": 2.8577, + "step": 6477 + }, + { + "epoch": 1.298196392785571, + "grad_norm": 21.65988546007079, + "learning_rate": 6.991098776857108e-06, + "loss": 2.585, + "step": 6478 + }, + { + "epoch": 1.2983967935871743, + "grad_norm": 24.172493903473132, + "learning_rate": 6.990029268396703e-06, + "loss": 2.646, + "step": 6479 + }, + { + "epoch": 1.2985971943887775, + "grad_norm": 18.930934911998868, + "learning_rate": 6.9889596517354675e-06, + "loss": 2.7713, + "step": 6480 + }, + { + "epoch": 1.2987975951903807, + "grad_norm": 42.975717458686304, + "learning_rate": 6.9878899269315615e-06, + "loss": 2.387, + "step": 6481 + }, + { + "epoch": 1.298997995991984, + "grad_norm": 27.976831291671054, + "learning_rate": 6.986820094043145e-06, + "loss": 2.7446, + "step": 6482 + }, + { + "epoch": 1.2991983967935872, + "grad_norm": 25.349029510899175, + "learning_rate": 6.985750153128388e-06, + "loss": 3.2186, + "step": 6483 + }, + { + "epoch": 1.2993987975951904, + "grad_norm": 17.729463877778617, + "learning_rate": 6.9846801042454634e-06, + "loss": 2.8101, + "step": 6484 + }, + { + "epoch": 1.2995991983967936, + "grad_norm": 28.22626980450887, + "learning_rate": 6.983609947452552e-06, + "loss": 2.6274, + "step": 6485 + }, + { + "epoch": 1.2997995991983968, + "grad_norm": 19.993490572183983, + "learning_rate": 6.982539682807838e-06, + "loss": 2.7125, + "step": 6486 + }, + { + "epoch": 1.3, + "grad_norm": 28.583489975348364, + "learning_rate": 6.981469310369517e-06, + "loss": 2.8898, + "step": 6487 + }, + { + "epoch": 1.3002004008016033, + "grad_norm": 33.59564507810463, + "learning_rate": 6.980398830195785e-06, + "loss": 2.4011, + "step": 6488 + }, + { + "epoch": 1.3004008016032065, + "grad_norm": 27.84684277866999, + "learning_rate": 6.979328242344845e-06, + "loss": 2.6608, + "step": 6489 + }, + { + "epoch": 1.3006012024048097, + "grad_norm": 26.628414364687412, + "learning_rate": 6.978257546874905e-06, + 
"loss": 2.6628, + "step": 6490 + }, + { + "epoch": 1.3008016032064127, + "grad_norm": 30.946708926061465, + "learning_rate": 6.977186743844184e-06, + "loss": 2.7969, + "step": 6491 + }, + { + "epoch": 1.3010020040080161, + "grad_norm": 22.085067256101116, + "learning_rate": 6.9761158333109e-06, + "loss": 2.8247, + "step": 6492 + }, + { + "epoch": 1.3012024048096191, + "grad_norm": 23.528753615360177, + "learning_rate": 6.975044815333282e-06, + "loss": 2.9546, + "step": 6493 + }, + { + "epoch": 1.3014028056112226, + "grad_norm": 28.25272542064184, + "learning_rate": 6.973973689969561e-06, + "loss": 2.8407, + "step": 6494 + }, + { + "epoch": 1.3016032064128256, + "grad_norm": 27.88774371778425, + "learning_rate": 6.972902457277977e-06, + "loss": 2.5209, + "step": 6495 + }, + { + "epoch": 1.3018036072144288, + "grad_norm": 24.685650779706727, + "learning_rate": 6.971831117316774e-06, + "loss": 2.5949, + "step": 6496 + }, + { + "epoch": 1.302004008016032, + "grad_norm": 29.06973799400146, + "learning_rate": 6.970759670144203e-06, + "loss": 2.7468, + "step": 6497 + }, + { + "epoch": 1.3022044088176352, + "grad_norm": 28.718623315079768, + "learning_rate": 6.969688115818519e-06, + "loss": 2.9192, + "step": 6498 + }, + { + "epoch": 1.3024048096192384, + "grad_norm": 21.133554091400164, + "learning_rate": 6.9686164543979855e-06, + "loss": 2.9224, + "step": 6499 + }, + { + "epoch": 1.3026052104208417, + "grad_norm": 32.94099296821045, + "learning_rate": 6.967544685940868e-06, + "loss": 3.243, + "step": 6500 + }, + { + "epoch": 1.3028056112224449, + "grad_norm": 28.8678986573663, + "learning_rate": 6.966472810505443e-06, + "loss": 2.8161, + "step": 6501 + }, + { + "epoch": 1.303006012024048, + "grad_norm": 21.929927194933864, + "learning_rate": 6.965400828149988e-06, + "loss": 2.9058, + "step": 6502 + }, + { + "epoch": 1.3032064128256513, + "grad_norm": 22.448094433225283, + "learning_rate": 6.9643287389327885e-06, + "loss": 2.6893, + "step": 6503 + }, + { + "epoch": 1.3034068136272545, + "grad_norm": 22.345837227608467, + "learning_rate": 6.963256542912136e-06, + "loss": 2.8281, + "step": 6504 + }, + { + "epoch": 1.3036072144288577, + "grad_norm": 33.61498105957864, + "learning_rate": 6.962184240146327e-06, + "loss": 2.8845, + "step": 6505 + }, + { + "epoch": 1.303807615230461, + "grad_norm": 25.13339346239481, + "learning_rate": 6.961111830693664e-06, + "loss": 2.6515, + "step": 6506 + }, + { + "epoch": 1.3040080160320642, + "grad_norm": 25.689339700272203, + "learning_rate": 6.960039314612457e-06, + "loss": 2.434, + "step": 6507 + }, + { + "epoch": 1.3042084168336674, + "grad_norm": 31.08217467801159, + "learning_rate": 6.958966691961019e-06, + "loss": 2.9985, + "step": 6508 + }, + { + "epoch": 1.3044088176352706, + "grad_norm": 24.480793841663534, + "learning_rate": 6.957893962797669e-06, + "loss": 2.434, + "step": 6509 + }, + { + "epoch": 1.3046092184368738, + "grad_norm": 27.024009974830953, + "learning_rate": 6.9568211271807375e-06, + "loss": 2.5835, + "step": 6510 + }, + { + "epoch": 1.304809619238477, + "grad_norm": 26.70547343506214, + "learning_rate": 6.9557481851685485e-06, + "loss": 2.5914, + "step": 6511 + }, + { + "epoch": 1.30501002004008, + "grad_norm": 34.51081871004642, + "learning_rate": 6.954675136819445e-06, + "loss": 3.1635, + "step": 6512 + }, + { + "epoch": 1.3052104208416835, + "grad_norm": 28.563909209375336, + "learning_rate": 6.953601982191771e-06, + "loss": 2.9549, + "step": 6513 + }, + { + "epoch": 1.3054108216432865, + "grad_norm": 23.045216067014927, + 
"learning_rate": 6.952528721343872e-06, + "loss": 2.8262, + "step": 6514 + }, + { + "epoch": 1.3056112224448897, + "grad_norm": 24.886903203549686, + "learning_rate": 6.951455354334104e-06, + "loss": 2.8166, + "step": 6515 + }, + { + "epoch": 1.305811623246493, + "grad_norm": 70.32539595435387, + "learning_rate": 6.950381881220828e-06, + "loss": 3.4181, + "step": 6516 + }, + { + "epoch": 1.3060120240480961, + "grad_norm": 31.694033528282056, + "learning_rate": 6.94930830206241e-06, + "loss": 2.709, + "step": 6517 + }, + { + "epoch": 1.3062124248496993, + "grad_norm": 22.965660144731377, + "learning_rate": 6.948234616917222e-06, + "loss": 3.2861, + "step": 6518 + }, + { + "epoch": 1.3064128256513026, + "grad_norm": 24.21365326914241, + "learning_rate": 6.947160825843642e-06, + "loss": 2.832, + "step": 6519 + }, + { + "epoch": 1.3066132264529058, + "grad_norm": 46.33474486319197, + "learning_rate": 6.946086928900054e-06, + "loss": 3.2146, + "step": 6520 + }, + { + "epoch": 1.306813627254509, + "grad_norm": 28.76305646500804, + "learning_rate": 6.945012926144847e-06, + "loss": 2.5827, + "step": 6521 + }, + { + "epoch": 1.3070140280561122, + "grad_norm": 26.86943380707272, + "learning_rate": 6.943938817636418e-06, + "loss": 2.2614, + "step": 6522 + }, + { + "epoch": 1.3072144288577154, + "grad_norm": 20.364384997348107, + "learning_rate": 6.942864603433161e-06, + "loss": 3.0518, + "step": 6523 + }, + { + "epoch": 1.3074148296593187, + "grad_norm": 25.882542724991303, + "learning_rate": 6.941790283593491e-06, + "loss": 2.7722, + "step": 6524 + }, + { + "epoch": 1.3076152304609219, + "grad_norm": 26.355575797964224, + "learning_rate": 6.940715858175817e-06, + "loss": 2.7334, + "step": 6525 + }, + { + "epoch": 1.307815631262525, + "grad_norm": 38.84858647987904, + "learning_rate": 6.939641327238557e-06, + "loss": 2.9704, + "step": 6526 + }, + { + "epoch": 1.3080160320641283, + "grad_norm": 23.886024156757152, + "learning_rate": 6.938566690840135e-06, + "loss": 2.7643, + "step": 6527 + }, + { + "epoch": 1.3082164328657315, + "grad_norm": 45.66006745478071, + "learning_rate": 6.937491949038979e-06, + "loss": 2.4864, + "step": 6528 + }, + { + "epoch": 1.3084168336673347, + "grad_norm": 26.633179104608704, + "learning_rate": 6.936417101893527e-06, + "loss": 2.1731, + "step": 6529 + }, + { + "epoch": 1.308617234468938, + "grad_norm": 23.55275627037781, + "learning_rate": 6.935342149462217e-06, + "loss": 2.7429, + "step": 6530 + }, + { + "epoch": 1.308817635270541, + "grad_norm": 21.211631544168483, + "learning_rate": 6.9342670918035e-06, + "loss": 2.6851, + "step": 6531 + }, + { + "epoch": 1.3090180360721444, + "grad_norm": 22.699056535188348, + "learning_rate": 6.933191928975825e-06, + "loss": 2.5724, + "step": 6532 + }, + { + "epoch": 1.3092184368737474, + "grad_norm": 19.268892099525424, + "learning_rate": 6.93211666103765e-06, + "loss": 2.8854, + "step": 6533 + }, + { + "epoch": 1.3094188376753508, + "grad_norm": 25.432000738668822, + "learning_rate": 6.931041288047441e-06, + "loss": 3.1265, + "step": 6534 + }, + { + "epoch": 1.3096192384769538, + "grad_norm": 24.630272243990323, + "learning_rate": 6.929965810063668e-06, + "loss": 2.9119, + "step": 6535 + }, + { + "epoch": 1.309819639278557, + "grad_norm": 32.2673568186244, + "learning_rate": 6.928890227144802e-06, + "loss": 2.6776, + "step": 6536 + }, + { + "epoch": 1.3100200400801603, + "grad_norm": 40.264728004384374, + "learning_rate": 6.927814539349329e-06, + "loss": 3.0191, + "step": 6537 + }, + { + "epoch": 1.3102204408817635, + 
"grad_norm": 25.06396554670058, + "learning_rate": 6.9267387467357335e-06, + "loss": 2.5481, + "step": 6538 + }, + { + "epoch": 1.3104208416833667, + "grad_norm": 25.024035733047132, + "learning_rate": 6.925662849362509e-06, + "loss": 3.0427, + "step": 6539 + }, + { + "epoch": 1.31062124248497, + "grad_norm": 21.589809655043222, + "learning_rate": 6.924586847288153e-06, + "loss": 2.8243, + "step": 6540 + }, + { + "epoch": 1.3108216432865731, + "grad_norm": 21.082830078218958, + "learning_rate": 6.923510740571167e-06, + "loss": 2.6255, + "step": 6541 + }, + { + "epoch": 1.3110220440881764, + "grad_norm": 22.002088687392213, + "learning_rate": 6.922434529270065e-06, + "loss": 2.3145, + "step": 6542 + }, + { + "epoch": 1.3112224448897796, + "grad_norm": 35.84769557333003, + "learning_rate": 6.921358213443358e-06, + "loss": 2.8108, + "step": 6543 + }, + { + "epoch": 1.3114228456913828, + "grad_norm": 26.037574734582737, + "learning_rate": 6.92028179314957e-06, + "loss": 2.6074, + "step": 6544 + }, + { + "epoch": 1.311623246492986, + "grad_norm": 18.832123232257437, + "learning_rate": 6.919205268447226e-06, + "loss": 2.6927, + "step": 6545 + }, + { + "epoch": 1.3118236472945892, + "grad_norm": 25.921957243684062, + "learning_rate": 6.918128639394858e-06, + "loss": 3.0616, + "step": 6546 + }, + { + "epoch": 1.3120240480961924, + "grad_norm": 23.354872530400726, + "learning_rate": 6.917051906051006e-06, + "loss": 2.7602, + "step": 6547 + }, + { + "epoch": 1.3122244488977957, + "grad_norm": 23.084972909475987, + "learning_rate": 6.91597506847421e-06, + "loss": 2.8073, + "step": 6548 + }, + { + "epoch": 1.3124248496993989, + "grad_norm": 22.218833925485644, + "learning_rate": 6.914898126723022e-06, + "loss": 2.4961, + "step": 6549 + }, + { + "epoch": 1.3126252505010019, + "grad_norm": 29.853209452601288, + "learning_rate": 6.913821080855995e-06, + "loss": 2.8009, + "step": 6550 + }, + { + "epoch": 1.3128256513026053, + "grad_norm": 34.65084913286771, + "learning_rate": 6.912743930931694e-06, + "loss": 2.587, + "step": 6551 + }, + { + "epoch": 1.3130260521042083, + "grad_norm": 26.484198538474864, + "learning_rate": 6.911666677008681e-06, + "loss": 2.6599, + "step": 6552 + }, + { + "epoch": 1.3132264529058117, + "grad_norm": 22.350573429345406, + "learning_rate": 6.910589319145527e-06, + "loss": 3.0368, + "step": 6553 + }, + { + "epoch": 1.3134268537074147, + "grad_norm": 29.25148182136788, + "learning_rate": 6.909511857400812e-06, + "loss": 2.6095, + "step": 6554 + }, + { + "epoch": 1.313627254509018, + "grad_norm": 23.27542897451195, + "learning_rate": 6.908434291833119e-06, + "loss": 2.9201, + "step": 6555 + }, + { + "epoch": 1.3138276553106212, + "grad_norm": 25.572370069324688, + "learning_rate": 6.907356622501037e-06, + "loss": 2.7298, + "step": 6556 + }, + { + "epoch": 1.3140280561122244, + "grad_norm": 38.15385885721645, + "learning_rate": 6.906278849463159e-06, + "loss": 2.7187, + "step": 6557 + }, + { + "epoch": 1.3142284569138276, + "grad_norm": 24.372653802546314, + "learning_rate": 6.905200972778088e-06, + "loss": 2.6298, + "step": 6558 + }, + { + "epoch": 1.3144288577154308, + "grad_norm": 22.95747181132069, + "learning_rate": 6.904122992504426e-06, + "loss": 3.1775, + "step": 6559 + }, + { + "epoch": 1.314629258517034, + "grad_norm": 19.941077813510322, + "learning_rate": 6.9030449087007864e-06, + "loss": 2.9188, + "step": 6560 + }, + { + "epoch": 1.3148296593186373, + "grad_norm": 27.06068678204489, + "learning_rate": 6.901966721425786e-06, + "loss": 2.7736, + "step": 6561 + }, + { + 
"epoch": 1.3150300601202405, + "grad_norm": 28.641921111187532, + "learning_rate": 6.900888430738046e-06, + "loss": 3.129, + "step": 6562 + }, + { + "epoch": 1.3152304609218437, + "grad_norm": 30.200204266847614, + "learning_rate": 6.899810036696198e-06, + "loss": 2.9877, + "step": 6563 + }, + { + "epoch": 1.315430861723447, + "grad_norm": 21.442688908152775, + "learning_rate": 6.898731539358874e-06, + "loss": 3.2076, + "step": 6564 + }, + { + "epoch": 1.3156312625250501, + "grad_norm": 21.403695400634064, + "learning_rate": 6.8976529387847136e-06, + "loss": 3.1021, + "step": 6565 + }, + { + "epoch": 1.3158316633266534, + "grad_norm": 30.538580930735627, + "learning_rate": 6.896574235032361e-06, + "loss": 3.0384, + "step": 6566 + }, + { + "epoch": 1.3160320641282566, + "grad_norm": 20.60658095423402, + "learning_rate": 6.8954954281604655e-06, + "loss": 2.2882, + "step": 6567 + }, + { + "epoch": 1.3162324649298598, + "grad_norm": 32.23561927091744, + "learning_rate": 6.894416518227689e-06, + "loss": 2.8614, + "step": 6568 + }, + { + "epoch": 1.316432865731463, + "grad_norm": 23.167486234992996, + "learning_rate": 6.8933375052926886e-06, + "loss": 2.1821, + "step": 6569 + }, + { + "epoch": 1.3166332665330662, + "grad_norm": 24.298182212663725, + "learning_rate": 6.892258389414133e-06, + "loss": 3.1148, + "step": 6570 + }, + { + "epoch": 1.3168336673346692, + "grad_norm": 23.35185277854323, + "learning_rate": 6.891179170650697e-06, + "loss": 3.1727, + "step": 6571 + }, + { + "epoch": 1.3170340681362727, + "grad_norm": 35.06061817471321, + "learning_rate": 6.890099849061056e-06, + "loss": 2.654, + "step": 6572 + }, + { + "epoch": 1.3172344689378757, + "grad_norm": 30.19523680464533, + "learning_rate": 6.889020424703896e-06, + "loss": 3.1607, + "step": 6573 + }, + { + "epoch": 1.3174348697394789, + "grad_norm": 30.544647868504118, + "learning_rate": 6.887940897637908e-06, + "loss": 2.9164, + "step": 6574 + }, + { + "epoch": 1.317635270541082, + "grad_norm": 33.496570230580794, + "learning_rate": 6.8868612679217864e-06, + "loss": 3.3255, + "step": 6575 + }, + { + "epoch": 1.3178356713426853, + "grad_norm": 36.90506803048214, + "learning_rate": 6.885781535614233e-06, + "loss": 2.5189, + "step": 6576 + }, + { + "epoch": 1.3180360721442885, + "grad_norm": 16.484583187440855, + "learning_rate": 6.884701700773954e-06, + "loss": 2.511, + "step": 6577 + }, + { + "epoch": 1.3182364729458917, + "grad_norm": 24.57221950632615, + "learning_rate": 6.88362176345966e-06, + "loss": 2.8603, + "step": 6578 + }, + { + "epoch": 1.318436873747495, + "grad_norm": 16.257999401417333, + "learning_rate": 6.882541723730072e-06, + "loss": 2.0937, + "step": 6579 + }, + { + "epoch": 1.3186372745490982, + "grad_norm": 59.846138427736456, + "learning_rate": 6.881461581643909e-06, + "loss": 3.4462, + "step": 6580 + }, + { + "epoch": 1.3188376753507014, + "grad_norm": 23.032043438876777, + "learning_rate": 6.880381337259905e-06, + "loss": 2.8617, + "step": 6581 + }, + { + "epoch": 1.3190380761523046, + "grad_norm": 29.649259333828347, + "learning_rate": 6.87930099063679e-06, + "loss": 2.1807, + "step": 6582 + }, + { + "epoch": 1.3192384769539078, + "grad_norm": 28.257502540577946, + "learning_rate": 6.878220541833307e-06, + "loss": 2.8281, + "step": 6583 + }, + { + "epoch": 1.319438877755511, + "grad_norm": 21.22534034610309, + "learning_rate": 6.877139990908202e-06, + "loss": 2.599, + "step": 6584 + }, + { + "epoch": 1.3196392785571143, + "grad_norm": 25.340586492936755, + "learning_rate": 6.876059337920222e-06, + "loss": 
2.561, + "step": 6585 + }, + { + "epoch": 1.3198396793587175, + "grad_norm": 22.186050126462536, + "learning_rate": 6.874978582928129e-06, + "loss": 3.0969, + "step": 6586 + }, + { + "epoch": 1.3200400801603207, + "grad_norm": 17.540056175503537, + "learning_rate": 6.873897725990682e-06, + "loss": 2.5669, + "step": 6587 + }, + { + "epoch": 1.320240480961924, + "grad_norm": 23.10214325611509, + "learning_rate": 6.872816767166649e-06, + "loss": 2.9848, + "step": 6588 + }, + { + "epoch": 1.3204408817635271, + "grad_norm": 29.340523901727778, + "learning_rate": 6.871735706514804e-06, + "loss": 2.4956, + "step": 6589 + }, + { + "epoch": 1.3206412825651301, + "grad_norm": 26.777476243693123, + "learning_rate": 6.870654544093925e-06, + "loss": 3.2319, + "step": 6590 + }, + { + "epoch": 1.3208416833667336, + "grad_norm": 23.70121365705222, + "learning_rate": 6.869573279962799e-06, + "loss": 2.4544, + "step": 6591 + }, + { + "epoch": 1.3210420841683366, + "grad_norm": 16.562862149703253, + "learning_rate": 6.868491914180211e-06, + "loss": 2.7239, + "step": 6592 + }, + { + "epoch": 1.32124248496994, + "grad_norm": 23.771110643310692, + "learning_rate": 6.867410446804963e-06, + "loss": 2.8091, + "step": 6593 + }, + { + "epoch": 1.321442885771543, + "grad_norm": 39.458103215778664, + "learning_rate": 6.86632887789585e-06, + "loss": 3.2726, + "step": 6594 + }, + { + "epoch": 1.3216432865731462, + "grad_norm": 28.750168522661173, + "learning_rate": 6.865247207511681e-06, + "loss": 2.6842, + "step": 6595 + }, + { + "epoch": 1.3218436873747494, + "grad_norm": 35.33520738052709, + "learning_rate": 6.86416543571127e-06, + "loss": 2.6712, + "step": 6596 + }, + { + "epoch": 1.3220440881763527, + "grad_norm": 26.749762960339105, + "learning_rate": 6.863083562553431e-06, + "loss": 3.2302, + "step": 6597 + }, + { + "epoch": 1.3222444889779559, + "grad_norm": 26.313832670734115, + "learning_rate": 6.8620015880969895e-06, + "loss": 3.1861, + "step": 6598 + }, + { + "epoch": 1.322444889779559, + "grad_norm": 17.79311286625788, + "learning_rate": 6.860919512400771e-06, + "loss": 2.4843, + "step": 6599 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 25.563665493820896, + "learning_rate": 6.859837335523613e-06, + "loss": 2.8168, + "step": 6600 + }, + { + "epoch": 1.3228456913827655, + "grad_norm": 42.93738282550933, + "learning_rate": 6.858755057524354e-06, + "loss": 2.3932, + "step": 6601 + }, + { + "epoch": 1.3230460921843687, + "grad_norm": 45.984900957131366, + "learning_rate": 6.8576726784618386e-06, + "loss": 2.7186, + "step": 6602 + }, + { + "epoch": 1.323246492985972, + "grad_norm": 26.414327317086123, + "learning_rate": 6.856590198394919e-06, + "loss": 3.1262, + "step": 6603 + }, + { + "epoch": 1.3234468937875752, + "grad_norm": 21.406897618919196, + "learning_rate": 6.855507617382448e-06, + "loss": 3.033, + "step": 6604 + }, + { + "epoch": 1.3236472945891784, + "grad_norm": 26.7224805280615, + "learning_rate": 6.854424935483287e-06, + "loss": 2.9915, + "step": 6605 + }, + { + "epoch": 1.3238476953907816, + "grad_norm": 22.32311070112446, + "learning_rate": 6.853342152756307e-06, + "loss": 2.7268, + "step": 6606 + }, + { + "epoch": 1.3240480961923848, + "grad_norm": 21.39262947842618, + "learning_rate": 6.852259269260379e-06, + "loss": 2.6078, + "step": 6607 + }, + { + "epoch": 1.324248496993988, + "grad_norm": 29.41011023587723, + "learning_rate": 6.8511762850543795e-06, + "loss": 2.6591, + "step": 6608 + }, + { + "epoch": 1.324448897795591, + "grad_norm": 26.947955969370106, + "learning_rate": 
6.850093200197193e-06, + "loss": 2.9866, + "step": 6609 + }, + { + "epoch": 1.3246492985971945, + "grad_norm": 20.25140092939082, + "learning_rate": 6.849010014747708e-06, + "loss": 2.5221, + "step": 6610 + }, + { + "epoch": 1.3248496993987975, + "grad_norm": 24.347434683258786, + "learning_rate": 6.847926728764819e-06, + "loss": 2.6399, + "step": 6611 + }, + { + "epoch": 1.325050100200401, + "grad_norm": 33.17581111335959, + "learning_rate": 6.846843342307426e-06, + "loss": 2.5864, + "step": 6612 + }, + { + "epoch": 1.325250501002004, + "grad_norm": 24.901084833399164, + "learning_rate": 6.845759855434435e-06, + "loss": 2.5956, + "step": 6613 + }, + { + "epoch": 1.3254509018036071, + "grad_norm": 20.06038631152907, + "learning_rate": 6.8446762682047556e-06, + "loss": 2.8492, + "step": 6614 + }, + { + "epoch": 1.3256513026052104, + "grad_norm": 29.03690240551523, + "learning_rate": 6.843592580677304e-06, + "loss": 2.9225, + "step": 6615 + }, + { + "epoch": 1.3258517034068136, + "grad_norm": 32.36243242432255, + "learning_rate": 6.842508792911002e-06, + "loss": 2.9785, + "step": 6616 + }, + { + "epoch": 1.3260521042084168, + "grad_norm": 30.627664565162174, + "learning_rate": 6.841424904964778e-06, + "loss": 2.7015, + "step": 6617 + }, + { + "epoch": 1.32625250501002, + "grad_norm": 52.337420052050845, + "learning_rate": 6.8403409168975646e-06, + "loss": 3.0479, + "step": 6618 + }, + { + "epoch": 1.3264529058116232, + "grad_norm": 21.290254536108005, + "learning_rate": 6.839256828768297e-06, + "loss": 2.6088, + "step": 6619 + }, + { + "epoch": 1.3266533066132264, + "grad_norm": 22.598205068416785, + "learning_rate": 6.8381726406359226e-06, + "loss": 2.3861, + "step": 6620 + }, + { + "epoch": 1.3268537074148297, + "grad_norm": 36.130574177718906, + "learning_rate": 6.837088352559387e-06, + "loss": 2.4937, + "step": 6621 + }, + { + "epoch": 1.3270541082164329, + "grad_norm": 71.4454980664469, + "learning_rate": 6.836003964597646e-06, + "loss": 2.8657, + "step": 6622 + }, + { + "epoch": 1.327254509018036, + "grad_norm": 18.210344228296968, + "learning_rate": 6.83491947680966e-06, + "loss": 2.6573, + "step": 6623 + }, + { + "epoch": 1.3274549098196393, + "grad_norm": 20.40489422447581, + "learning_rate": 6.8338348892543915e-06, + "loss": 2.4637, + "step": 6624 + }, + { + "epoch": 1.3276553106212425, + "grad_norm": 22.33281123103073, + "learning_rate": 6.832750201990815e-06, + "loss": 2.9743, + "step": 6625 + }, + { + "epoch": 1.3278557114228458, + "grad_norm": 37.4422426386517, + "learning_rate": 6.831665415077905e-06, + "loss": 2.5861, + "step": 6626 + }, + { + "epoch": 1.328056112224449, + "grad_norm": 32.66723663039173, + "learning_rate": 6.8305805285746415e-06, + "loss": 2.5388, + "step": 6627 + }, + { + "epoch": 1.328256513026052, + "grad_norm": 53.66007526546259, + "learning_rate": 6.829495542540014e-06, + "loss": 2.839, + "step": 6628 + }, + { + "epoch": 1.3284569138276554, + "grad_norm": 28.590133202612733, + "learning_rate": 6.828410457033011e-06, + "loss": 2.6732, + "step": 6629 + }, + { + "epoch": 1.3286573146292584, + "grad_norm": 31.60512536579356, + "learning_rate": 6.8273252721126325e-06, + "loss": 2.3935, + "step": 6630 + }, + { + "epoch": 1.3288577154308618, + "grad_norm": 25.096190109618743, + "learning_rate": 6.826239987837883e-06, + "loss": 3.0695, + "step": 6631 + }, + { + "epoch": 1.3290581162324648, + "grad_norm": 26.12208786832833, + "learning_rate": 6.825154604267769e-06, + "loss": 2.5188, + "step": 6632 + }, + { + "epoch": 1.329258517034068, + "grad_norm": 
19.223957443339987, + "learning_rate": 6.824069121461306e-06, + "loss": 2.9697, + "step": 6633 + }, + { + "epoch": 1.3294589178356713, + "grad_norm": 27.906274531313194, + "learning_rate": 6.822983539477511e-06, + "loss": 2.5017, + "step": 6634 + }, + { + "epoch": 1.3296593186372745, + "grad_norm": 21.96552421668643, + "learning_rate": 6.821897858375411e-06, + "loss": 2.5429, + "step": 6635 + }, + { + "epoch": 1.3298597194388777, + "grad_norm": 38.62468104654006, + "learning_rate": 6.820812078214033e-06, + "loss": 3.0872, + "step": 6636 + }, + { + "epoch": 1.330060120240481, + "grad_norm": 20.796959386112583, + "learning_rate": 6.819726199052417e-06, + "loss": 2.7499, + "step": 6637 + }, + { + "epoch": 1.3302605210420841, + "grad_norm": 38.672926725079954, + "learning_rate": 6.8186402209496e-06, + "loss": 2.6265, + "step": 6638 + }, + { + "epoch": 1.3304609218436874, + "grad_norm": 20.856854160895743, + "learning_rate": 6.81755414396463e-06, + "loss": 2.34, + "step": 6639 + }, + { + "epoch": 1.3306613226452906, + "grad_norm": 28.21095222696912, + "learning_rate": 6.816467968156559e-06, + "loss": 3.1354, + "step": 6640 + }, + { + "epoch": 1.3308617234468938, + "grad_norm": 26.752349066040146, + "learning_rate": 6.815381693584442e-06, + "loss": 2.6222, + "step": 6641 + }, + { + "epoch": 1.331062124248497, + "grad_norm": 24.252091367776043, + "learning_rate": 6.8142953203073435e-06, + "loss": 3.3024, + "step": 6642 + }, + { + "epoch": 1.3312625250501002, + "grad_norm": 61.088472424175194, + "learning_rate": 6.8132088483843295e-06, + "loss": 3.6782, + "step": 6643 + }, + { + "epoch": 1.3314629258517034, + "grad_norm": 20.791848021239304, + "learning_rate": 6.812122277874473e-06, + "loss": 2.8213, + "step": 6644 + }, + { + "epoch": 1.3316633266533067, + "grad_norm": 117.04249696048682, + "learning_rate": 6.811035608836856e-06, + "loss": 2.8466, + "step": 6645 + }, + { + "epoch": 1.3318637274549099, + "grad_norm": 22.344398282439954, + "learning_rate": 6.809948841330558e-06, + "loss": 2.591, + "step": 6646 + }, + { + "epoch": 1.332064128256513, + "grad_norm": 23.112606171741408, + "learning_rate": 6.808861975414671e-06, + "loss": 2.5125, + "step": 6647 + }, + { + "epoch": 1.3322645290581163, + "grad_norm": 26.47104727936171, + "learning_rate": 6.807775011148286e-06, + "loss": 2.8529, + "step": 6648 + }, + { + "epoch": 1.3324649298597193, + "grad_norm": 43.93239767754511, + "learning_rate": 6.806687948590506e-06, + "loss": 3.0786, + "step": 6649 + }, + { + "epoch": 1.3326653306613228, + "grad_norm": 20.751390018931627, + "learning_rate": 6.805600787800436e-06, + "loss": 2.5372, + "step": 6650 + }, + { + "epoch": 1.3328657314629258, + "grad_norm": 24.519876957328623, + "learning_rate": 6.804513528837185e-06, + "loss": 2.1634, + "step": 6651 + }, + { + "epoch": 1.3330661322645292, + "grad_norm": 19.59028900560566, + "learning_rate": 6.803426171759872e-06, + "loss": 2.4718, + "step": 6652 + }, + { + "epoch": 1.3332665330661322, + "grad_norm": 24.480415359714456, + "learning_rate": 6.802338716627614e-06, + "loss": 2.7011, + "step": 6653 + }, + { + "epoch": 1.3334669338677354, + "grad_norm": 30.487130511248427, + "learning_rate": 6.801251163499539e-06, + "loss": 2.7082, + "step": 6654 + }, + { + "epoch": 1.3336673346693386, + "grad_norm": 25.94368098199845, + "learning_rate": 6.80016351243478e-06, + "loss": 2.6859, + "step": 6655 + }, + { + "epoch": 1.3338677354709418, + "grad_norm": 55.171867593665986, + "learning_rate": 6.799075763492473e-06, + "loss": 2.8326, + "step": 6656 + }, + { + "epoch": 
1.334068136272545, + "grad_norm": 26.85894036032584, + "learning_rate": 6.797987916731762e-06, + "loss": 2.164, + "step": 6657 + }, + { + "epoch": 1.3342685370741483, + "grad_norm": 24.869955271586313, + "learning_rate": 6.796899972211793e-06, + "loss": 2.4497, + "step": 6658 + }, + { + "epoch": 1.3344689378757515, + "grad_norm": 19.407390436744727, + "learning_rate": 6.795811929991722e-06, + "loss": 3.4902, + "step": 6659 + }, + { + "epoch": 1.3346693386773547, + "grad_norm": 23.531274706693495, + "learning_rate": 6.794723790130704e-06, + "loss": 2.4957, + "step": 6660 + }, + { + "epoch": 1.334869739478958, + "grad_norm": 24.360870441394397, + "learning_rate": 6.793635552687902e-06, + "loss": 2.8857, + "step": 6661 + }, + { + "epoch": 1.3350701402805611, + "grad_norm": 28.939752827170732, + "learning_rate": 6.792547217722491e-06, + "loss": 2.2538, + "step": 6662 + }, + { + "epoch": 1.3352705410821644, + "grad_norm": 35.51285046813469, + "learning_rate": 6.791458785293641e-06, + "loss": 3.1005, + "step": 6663 + }, + { + "epoch": 1.3354709418837676, + "grad_norm": 36.06054006817266, + "learning_rate": 6.790370255460532e-06, + "loss": 3.0628, + "step": 6664 + }, + { + "epoch": 1.3356713426853708, + "grad_norm": 19.274144194383947, + "learning_rate": 6.789281628282349e-06, + "loss": 2.4221, + "step": 6665 + }, + { + "epoch": 1.335871743486974, + "grad_norm": 29.348244063478493, + "learning_rate": 6.788192903818283e-06, + "loss": 2.4897, + "step": 6666 + }, + { + "epoch": 1.3360721442885772, + "grad_norm": 19.606257403425403, + "learning_rate": 6.787104082127528e-06, + "loss": 2.555, + "step": 6667 + }, + { + "epoch": 1.3362725450901802, + "grad_norm": 34.272974168189016, + "learning_rate": 6.786015163269287e-06, + "loss": 2.8896, + "step": 6668 + }, + { + "epoch": 1.3364729458917837, + "grad_norm": 18.785741363022154, + "learning_rate": 6.784926147302766e-06, + "loss": 2.3037, + "step": 6669 + }, + { + "epoch": 1.3366733466933867, + "grad_norm": 28.174389718555, + "learning_rate": 6.783837034287175e-06, + "loss": 2.489, + "step": 6670 + }, + { + "epoch": 1.33687374749499, + "grad_norm": 29.32098015746766, + "learning_rate": 6.78274782428173e-06, + "loss": 3.1229, + "step": 6671 + }, + { + "epoch": 1.337074148296593, + "grad_norm": 18.516327684722324, + "learning_rate": 6.781658517345657e-06, + "loss": 2.3797, + "step": 6672 + }, + { + "epoch": 1.3372745490981963, + "grad_norm": 36.364204605018685, + "learning_rate": 6.780569113538178e-06, + "loss": 3.3951, + "step": 6673 + }, + { + "epoch": 1.3374749498997995, + "grad_norm": 34.26801934489103, + "learning_rate": 6.7794796129185295e-06, + "loss": 2.6487, + "step": 6674 + }, + { + "epoch": 1.3376753507014028, + "grad_norm": 23.4198413225526, + "learning_rate": 6.778390015545946e-06, + "loss": 2.705, + "step": 6675 + }, + { + "epoch": 1.337875751503006, + "grad_norm": 21.877668598700012, + "learning_rate": 6.777300321479673e-06, + "loss": 2.4222, + "step": 6676 + }, + { + "epoch": 1.3380761523046092, + "grad_norm": 24.223891197278785, + "learning_rate": 6.776210530778958e-06, + "loss": 2.6782, + "step": 6677 + }, + { + "epoch": 1.3382765531062124, + "grad_norm": 25.30784774649172, + "learning_rate": 6.775120643503055e-06, + "loss": 2.9782, + "step": 6678 + }, + { + "epoch": 1.3384769539078156, + "grad_norm": 39.75796014395625, + "learning_rate": 6.774030659711222e-06, + "loss": 2.4367, + "step": 6679 + }, + { + "epoch": 1.3386773547094188, + "grad_norm": 34.22098322857849, + "learning_rate": 6.772940579462722e-06, + "loss": 2.4966, + "step": 
6680 + }, + { + "epoch": 1.338877755511022, + "grad_norm": 23.048193378393393, + "learning_rate": 6.771850402816828e-06, + "loss": 2.9092, + "step": 6681 + }, + { + "epoch": 1.3390781563126253, + "grad_norm": 19.586235030966396, + "learning_rate": 6.7707601298328106e-06, + "loss": 2.5999, + "step": 6682 + }, + { + "epoch": 1.3392785571142285, + "grad_norm": 22.767236797951284, + "learning_rate": 6.769669760569952e-06, + "loss": 2.6115, + "step": 6683 + }, + { + "epoch": 1.3394789579158317, + "grad_norm": 28.291795121925635, + "learning_rate": 6.7685792950875365e-06, + "loss": 3.3661, + "step": 6684 + }, + { + "epoch": 1.339679358717435, + "grad_norm": 27.651179399122963, + "learning_rate": 6.767488733444852e-06, + "loss": 3.3117, + "step": 6685 + }, + { + "epoch": 1.3398797595190381, + "grad_norm": 17.414954756103164, + "learning_rate": 6.766398075701197e-06, + "loss": 2.9193, + "step": 6686 + }, + { + "epoch": 1.3400801603206411, + "grad_norm": 21.6583573349476, + "learning_rate": 6.765307321915871e-06, + "loss": 2.7252, + "step": 6687 + }, + { + "epoch": 1.3402805611222446, + "grad_norm": 22.593878488052447, + "learning_rate": 6.764216472148181e-06, + "loss": 2.4595, + "step": 6688 + }, + { + "epoch": 1.3404809619238476, + "grad_norm": 24.46164884239557, + "learning_rate": 6.763125526457437e-06, + "loss": 2.2849, + "step": 6689 + }, + { + "epoch": 1.340681362725451, + "grad_norm": 22.292892706076366, + "learning_rate": 6.762034484902955e-06, + "loss": 2.2933, + "step": 6690 + }, + { + "epoch": 1.340881763527054, + "grad_norm": 27.456855386679052, + "learning_rate": 6.7609433475440575e-06, + "loss": 2.8594, + "step": 6691 + }, + { + "epoch": 1.3410821643286572, + "grad_norm": 28.57897255215912, + "learning_rate": 6.75985211444007e-06, + "loss": 3.0516, + "step": 6692 + }, + { + "epoch": 1.3412825651302605, + "grad_norm": 33.969570696513905, + "learning_rate": 6.758760785650325e-06, + "loss": 3.3648, + "step": 6693 + }, + { + "epoch": 1.3414829659318637, + "grad_norm": 35.8406222610747, + "learning_rate": 6.757669361234161e-06, + "loss": 2.9313, + "step": 6694 + }, + { + "epoch": 1.3416833667334669, + "grad_norm": 34.25515917194222, + "learning_rate": 6.756577841250919e-06, + "loss": 2.8474, + "step": 6695 + }, + { + "epoch": 1.34188376753507, + "grad_norm": 66.48825889337813, + "learning_rate": 6.755486225759949e-06, + "loss": 2.7467, + "step": 6696 + }, + { + "epoch": 1.3420841683366733, + "grad_norm": 23.716755826761016, + "learning_rate": 6.754394514820598e-06, + "loss": 2.9045, + "step": 6697 + }, + { + "epoch": 1.3422845691382765, + "grad_norm": 36.81190745330389, + "learning_rate": 6.753302708492229e-06, + "loss": 2.4438, + "step": 6698 + }, + { + "epoch": 1.3424849699398798, + "grad_norm": 23.159403226046514, + "learning_rate": 6.752210806834203e-06, + "loss": 2.9043, + "step": 6699 + }, + { + "epoch": 1.342685370741483, + "grad_norm": 35.04332538199242, + "learning_rate": 6.751118809905889e-06, + "loss": 2.8836, + "step": 6700 + }, + { + "epoch": 1.3428857715430862, + "grad_norm": 32.327593141745055, + "learning_rate": 6.750026717766662e-06, + "loss": 3.1095, + "step": 6701 + }, + { + "epoch": 1.3430861723446894, + "grad_norm": 19.004608947366055, + "learning_rate": 6.7489345304758966e-06, + "loss": 3.0257, + "step": 6702 + }, + { + "epoch": 1.3432865731462926, + "grad_norm": 24.50973531216252, + "learning_rate": 6.747842248092981e-06, + "loss": 2.4698, + "step": 6703 + }, + { + "epoch": 1.3434869739478958, + "grad_norm": 28.960883240189137, + "learning_rate": 
6.746749870677303e-06, + "loss": 3.158, + "step": 6704 + }, + { + "epoch": 1.343687374749499, + "grad_norm": 25.23249629903369, + "learning_rate": 6.745657398288253e-06, + "loss": 2.6442, + "step": 6705 + }, + { + "epoch": 1.3438877755511023, + "grad_norm": 39.58169452266825, + "learning_rate": 6.744564830985236e-06, + "loss": 3.0705, + "step": 6706 + }, + { + "epoch": 1.3440881763527055, + "grad_norm": 23.294206948498573, + "learning_rate": 6.7434721688276526e-06, + "loss": 3.1518, + "step": 6707 + }, + { + "epoch": 1.3442885771543085, + "grad_norm": 21.180866565571797, + "learning_rate": 6.7423794118749156e-06, + "loss": 2.9111, + "step": 6708 + }, + { + "epoch": 1.344488977955912, + "grad_norm": 24.87115245036951, + "learning_rate": 6.741286560186437e-06, + "loss": 2.7959, + "step": 6709 + }, + { + "epoch": 1.344689378757515, + "grad_norm": 47.655401641751034, + "learning_rate": 6.740193613821637e-06, + "loss": 2.7329, + "step": 6710 + }, + { + "epoch": 1.3448897795591184, + "grad_norm": 29.63102488590749, + "learning_rate": 6.739100572839942e-06, + "loss": 2.9041, + "step": 6711 + }, + { + "epoch": 1.3450901803607214, + "grad_norm": 17.286011508991546, + "learning_rate": 6.738007437300781e-06, + "loss": 2.8825, + "step": 6712 + }, + { + "epoch": 1.3452905811623246, + "grad_norm": 26.828730479934755, + "learning_rate": 6.736914207263592e-06, + "loss": 2.7214, + "step": 6713 + }, + { + "epoch": 1.3454909819639278, + "grad_norm": 24.926096244477026, + "learning_rate": 6.735820882787811e-06, + "loss": 2.6412, + "step": 6714 + }, + { + "epoch": 1.345691382765531, + "grad_norm": 19.908250672947016, + "learning_rate": 6.7347274639328885e-06, + "loss": 2.6817, + "step": 6715 + }, + { + "epoch": 1.3458917835671342, + "grad_norm": 44.786240768217674, + "learning_rate": 6.733633950758272e-06, + "loss": 2.5933, + "step": 6716 + }, + { + "epoch": 1.3460921843687375, + "grad_norm": 33.93612052222067, + "learning_rate": 6.732540343323418e-06, + "loss": 2.8903, + "step": 6717 + }, + { + "epoch": 1.3462925851703407, + "grad_norm": 30.032000102549077, + "learning_rate": 6.73144664168779e-06, + "loss": 2.8838, + "step": 6718 + }, + { + "epoch": 1.346492985971944, + "grad_norm": 36.2882148260453, + "learning_rate": 6.73035284591085e-06, + "loss": 2.2787, + "step": 6719 + }, + { + "epoch": 1.346693386773547, + "grad_norm": 19.581771006492115, + "learning_rate": 6.729258956052074e-06, + "loss": 2.7824, + "step": 6720 + }, + { + "epoch": 1.3468937875751503, + "grad_norm": 25.35309726899093, + "learning_rate": 6.7281649721709344e-06, + "loss": 2.8353, + "step": 6721 + }, + { + "epoch": 1.3470941883767535, + "grad_norm": 21.8398827614862, + "learning_rate": 6.727070894326915e-06, + "loss": 2.5347, + "step": 6722 + }, + { + "epoch": 1.3472945891783568, + "grad_norm": 50.446918150433966, + "learning_rate": 6.725976722579501e-06, + "loss": 2.7986, + "step": 6723 + }, + { + "epoch": 1.34749498997996, + "grad_norm": 54.08671727015835, + "learning_rate": 6.724882456988185e-06, + "loss": 2.7388, + "step": 6724 + }, + { + "epoch": 1.3476953907815632, + "grad_norm": 17.510619906176775, + "learning_rate": 6.7237880976124635e-06, + "loss": 2.5708, + "step": 6725 + }, + { + "epoch": 1.3478957915831664, + "grad_norm": 32.543515926852734, + "learning_rate": 6.7226936445118386e-06, + "loss": 2.8667, + "step": 6726 + }, + { + "epoch": 1.3480961923847694, + "grad_norm": 41.63029686542321, + "learning_rate": 6.7215990977458165e-06, + "loss": 2.7792, + "step": 6727 + }, + { + "epoch": 1.3482965931863728, + "grad_norm": 
23.45923063664752, + "learning_rate": 6.720504457373912e-06, + "loss": 2.5267, + "step": 6728 + }, + { + "epoch": 1.3484969939879758, + "grad_norm": 24.204824140407304, + "learning_rate": 6.7194097234556396e-06, + "loss": 2.6317, + "step": 6729 + }, + { + "epoch": 1.3486973947895793, + "grad_norm": 24.040157637868937, + "learning_rate": 6.718314896050522e-06, + "loss": 2.8717, + "step": 6730 + }, + { + "epoch": 1.3488977955911823, + "grad_norm": 35.70635857865932, + "learning_rate": 6.717219975218088e-06, + "loss": 2.4121, + "step": 6731 + }, + { + "epoch": 1.3490981963927855, + "grad_norm": 27.234973515789587, + "learning_rate": 6.716124961017867e-06, + "loss": 2.4785, + "step": 6732 + }, + { + "epoch": 1.3492985971943887, + "grad_norm": 30.122026954766127, + "learning_rate": 6.715029853509401e-06, + "loss": 3.0295, + "step": 6733 + }, + { + "epoch": 1.349498997995992, + "grad_norm": 16.516055830656477, + "learning_rate": 6.71393465275223e-06, + "loss": 2.2727, + "step": 6734 + }, + { + "epoch": 1.3496993987975952, + "grad_norm": 24.263974012807694, + "learning_rate": 6.7128393588059e-06, + "loss": 2.7067, + "step": 6735 + }, + { + "epoch": 1.3498997995991984, + "grad_norm": 33.056179420522106, + "learning_rate": 6.711743971729967e-06, + "loss": 2.1708, + "step": 6736 + }, + { + "epoch": 1.3501002004008016, + "grad_norm": 29.622652632830423, + "learning_rate": 6.710648491583987e-06, + "loss": 3.045, + "step": 6737 + }, + { + "epoch": 1.3503006012024048, + "grad_norm": 21.94864502579006, + "learning_rate": 6.709552918427525e-06, + "loss": 2.7883, + "step": 6738 + }, + { + "epoch": 1.350501002004008, + "grad_norm": 22.862379517638704, + "learning_rate": 6.708457252320146e-06, + "loss": 2.6846, + "step": 6739 + }, + { + "epoch": 1.3507014028056112, + "grad_norm": 20.090137186146805, + "learning_rate": 6.707361493321425e-06, + "loss": 2.4889, + "step": 6740 + }, + { + "epoch": 1.3509018036072145, + "grad_norm": 22.287869580815606, + "learning_rate": 6.7062656414909384e-06, + "loss": 2.5488, + "step": 6741 + }, + { + "epoch": 1.3511022044088177, + "grad_norm": 30.260075039920718, + "learning_rate": 6.705169696888271e-06, + "loss": 2.763, + "step": 6742 + }, + { + "epoch": 1.351302605210421, + "grad_norm": 22.566234375569167, + "learning_rate": 6.7040736595730095e-06, + "loss": 3.2215, + "step": 6743 + }, + { + "epoch": 1.351503006012024, + "grad_norm": 37.26159108688241, + "learning_rate": 6.702977529604747e-06, + "loss": 2.6249, + "step": 6744 + }, + { + "epoch": 1.3517034068136273, + "grad_norm": 23.45431165512472, + "learning_rate": 6.7018813070430845e-06, + "loss": 3.2509, + "step": 6745 + }, + { + "epoch": 1.3519038076152303, + "grad_norm": 26.792608489873526, + "learning_rate": 6.700784991947622e-06, + "loss": 3.1341, + "step": 6746 + }, + { + "epoch": 1.3521042084168338, + "grad_norm": 19.66120571034428, + "learning_rate": 6.699688584377968e-06, + "loss": 2.814, + "step": 6747 + }, + { + "epoch": 1.3523046092184368, + "grad_norm": 21.147560589615544, + "learning_rate": 6.698592084393737e-06, + "loss": 2.7591, + "step": 6748 + }, + { + "epoch": 1.3525050100200402, + "grad_norm": 24.17743399149455, + "learning_rate": 6.697495492054547e-06, + "loss": 2.5813, + "step": 6749 + }, + { + "epoch": 1.3527054108216432, + "grad_norm": 21.110156949777643, + "learning_rate": 6.696398807420024e-06, + "loss": 2.6094, + "step": 6750 + }, + { + "epoch": 1.3529058116232464, + "grad_norm": 25.716556566662643, + "learning_rate": 6.6953020305497905e-06, + "loss": 2.6771, + "step": 6751 + }, + { + "epoch": 
1.3531062124248496, + "grad_norm": 33.22322271282587, + "learning_rate": 6.694205161503485e-06, + "loss": 2.4283, + "step": 6752 + }, + { + "epoch": 1.3533066132264528, + "grad_norm": 20.680232427277392, + "learning_rate": 6.693108200340743e-06, + "loss": 2.5537, + "step": 6753 + }, + { + "epoch": 1.353507014028056, + "grad_norm": 21.16617707027986, + "learning_rate": 6.692011147121209e-06, + "loss": 3.3536, + "step": 6754 + }, + { + "epoch": 1.3537074148296593, + "grad_norm": 24.5120867182948, + "learning_rate": 6.6909140019045304e-06, + "loss": 3.0527, + "step": 6755 + }, + { + "epoch": 1.3539078156312625, + "grad_norm": 30.23694152516201, + "learning_rate": 6.689816764750362e-06, + "loss": 2.736, + "step": 6756 + }, + { + "epoch": 1.3541082164328657, + "grad_norm": 20.406338340079948, + "learning_rate": 6.688719435718363e-06, + "loss": 2.2212, + "step": 6757 + }, + { + "epoch": 1.354308617234469, + "grad_norm": 40.853113598415916, + "learning_rate": 6.687622014868195e-06, + "loss": 2.804, + "step": 6758 + }, + { + "epoch": 1.3545090180360722, + "grad_norm": 41.234133095927554, + "learning_rate": 6.686524502259526e-06, + "loss": 2.7195, + "step": 6759 + }, + { + "epoch": 1.3547094188376754, + "grad_norm": 17.87168726272729, + "learning_rate": 6.685426897952032e-06, + "loss": 2.4728, + "step": 6760 + }, + { + "epoch": 1.3549098196392786, + "grad_norm": 23.993079988090855, + "learning_rate": 6.684329202005387e-06, + "loss": 2.7671, + "step": 6761 + }, + { + "epoch": 1.3551102204408818, + "grad_norm": 30.331319784063364, + "learning_rate": 6.683231414479279e-06, + "loss": 2.8198, + "step": 6762 + }, + { + "epoch": 1.355310621242485, + "grad_norm": 21.13067253381632, + "learning_rate": 6.682133535433394e-06, + "loss": 2.3492, + "step": 6763 + }, + { + "epoch": 1.3555110220440882, + "grad_norm": 23.409302025314766, + "learning_rate": 6.681035564927425e-06, + "loss": 2.8442, + "step": 6764 + }, + { + "epoch": 1.3557114228456915, + "grad_norm": 16.818881390826498, + "learning_rate": 6.679937503021072e-06, + "loss": 2.4162, + "step": 6765 + }, + { + "epoch": 1.3559118236472947, + "grad_norm": 21.842151802273516, + "learning_rate": 6.6788393497740355e-06, + "loss": 3.1868, + "step": 6766 + }, + { + "epoch": 1.3561122244488977, + "grad_norm": 27.054814199944285, + "learning_rate": 6.677741105246026e-06, + "loss": 2.6347, + "step": 6767 + }, + { + "epoch": 1.3563126252505011, + "grad_norm": 26.26030689004071, + "learning_rate": 6.676642769496756e-06, + "loss": 3.2875, + "step": 6768 + }, + { + "epoch": 1.356513026052104, + "grad_norm": 36.220506138220216, + "learning_rate": 6.675544342585944e-06, + "loss": 2.9657, + "step": 6769 + }, + { + "epoch": 1.3567134268537075, + "grad_norm": 25.210774235205854, + "learning_rate": 6.674445824573314e-06, + "loss": 3.1416, + "step": 6770 + }, + { + "epoch": 1.3569138276553105, + "grad_norm": 29.85185186274824, + "learning_rate": 6.673347215518591e-06, + "loss": 2.5198, + "step": 6771 + }, + { + "epoch": 1.3571142284569138, + "grad_norm": 20.234968546841152, + "learning_rate": 6.672248515481511e-06, + "loss": 2.6324, + "step": 6772 + }, + { + "epoch": 1.357314629258517, + "grad_norm": 19.883307078611118, + "learning_rate": 6.67114972452181e-06, + "loss": 2.6499, + "step": 6773 + }, + { + "epoch": 1.3575150300601202, + "grad_norm": 34.285426561148576, + "learning_rate": 6.670050842699232e-06, + "loss": 2.9717, + "step": 6774 + }, + { + "epoch": 1.3577154308617234, + "grad_norm": 24.89153577365228, + "learning_rate": 6.6689518700735234e-06, + "loss": 2.0783, + 
"step": 6775 + }, + { + "epoch": 1.3579158316633266, + "grad_norm": 30.250558516114914, + "learning_rate": 6.667852806704439e-06, + "loss": 2.957, + "step": 6776 + }, + { + "epoch": 1.3581162324649299, + "grad_norm": 24.80582268054729, + "learning_rate": 6.666753652651736e-06, + "loss": 2.8079, + "step": 6777 + }, + { + "epoch": 1.358316633266533, + "grad_norm": 46.14612297683069, + "learning_rate": 6.665654407975175e-06, + "loss": 2.6382, + "step": 6778 + }, + { + "epoch": 1.3585170340681363, + "grad_norm": 41.79258566023005, + "learning_rate": 6.664555072734527e-06, + "loss": 3.015, + "step": 6779 + }, + { + "epoch": 1.3587174348697395, + "grad_norm": 39.42917646157932, + "learning_rate": 6.663455646989561e-06, + "loss": 2.8816, + "step": 6780 + }, + { + "epoch": 1.3589178356713427, + "grad_norm": 35.04366993410945, + "learning_rate": 6.662356130800056e-06, + "loss": 2.3866, + "step": 6781 + }, + { + "epoch": 1.359118236472946, + "grad_norm": 16.005125457206333, + "learning_rate": 6.661256524225796e-06, + "loss": 2.5608, + "step": 6782 + }, + { + "epoch": 1.3593186372745492, + "grad_norm": 24.4233131051777, + "learning_rate": 6.660156827326565e-06, + "loss": 2.7113, + "step": 6783 + }, + { + "epoch": 1.3595190380761524, + "grad_norm": 25.987586247826584, + "learning_rate": 6.659057040162158e-06, + "loss": 2.5149, + "step": 6784 + }, + { + "epoch": 1.3597194388777556, + "grad_norm": 24.353030598030536, + "learning_rate": 6.65795716279237e-06, + "loss": 2.8073, + "step": 6785 + }, + { + "epoch": 1.3599198396793586, + "grad_norm": 43.29060395719037, + "learning_rate": 6.656857195277002e-06, + "loss": 3.3305, + "step": 6786 + }, + { + "epoch": 1.360120240480962, + "grad_norm": 24.548832370514365, + "learning_rate": 6.655757137675864e-06, + "loss": 3.2525, + "step": 6787 + }, + { + "epoch": 1.360320641282565, + "grad_norm": 28.04671455061133, + "learning_rate": 6.654656990048767e-06, + "loss": 2.8103, + "step": 6788 + }, + { + "epoch": 1.3605210420841685, + "grad_norm": 47.18853977253339, + "learning_rate": 6.653556752455527e-06, + "loss": 2.8541, + "step": 6789 + }, + { + "epoch": 1.3607214428857715, + "grad_norm": 38.47197828651945, + "learning_rate": 6.6524564249559645e-06, + "loss": 2.8919, + "step": 6790 + }, + { + "epoch": 1.3609218436873747, + "grad_norm": 33.42259769905945, + "learning_rate": 6.6513560076099075e-06, + "loss": 2.3317, + "step": 6791 + }, + { + "epoch": 1.361122244488978, + "grad_norm": 29.335870902039474, + "learning_rate": 6.650255500477185e-06, + "loss": 2.9996, + "step": 6792 + }, + { + "epoch": 1.3613226452905811, + "grad_norm": 26.299476016658787, + "learning_rate": 6.649154903617637e-06, + "loss": 2.6245, + "step": 6793 + }, + { + "epoch": 1.3615230460921843, + "grad_norm": 22.686784866879446, + "learning_rate": 6.648054217091102e-06, + "loss": 2.4466, + "step": 6794 + }, + { + "epoch": 1.3617234468937875, + "grad_norm": 31.662133869546793, + "learning_rate": 6.6469534409574265e-06, + "loss": 2.7619, + "step": 6795 + }, + { + "epoch": 1.3619238476953908, + "grad_norm": 19.82793823995392, + "learning_rate": 6.645852575276462e-06, + "loss": 2.8685, + "step": 6796 + }, + { + "epoch": 1.362124248496994, + "grad_norm": 35.66007144890146, + "learning_rate": 6.644751620108063e-06, + "loss": 3.4932, + "step": 6797 + }, + { + "epoch": 1.3623246492985972, + "grad_norm": 26.59760881883747, + "learning_rate": 6.643650575512089e-06, + "loss": 2.5717, + "step": 6798 + }, + { + "epoch": 1.3625250501002004, + "grad_norm": 26.830772839233525, + "learning_rate": 
6.642549441548409e-06, + "loss": 3.1067, + "step": 6799 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 26.272207935548593, + "learning_rate": 6.64144821827689e-06, + "loss": 2.9124, + "step": 6800 + }, + { + "epoch": 1.3629258517034069, + "grad_norm": 30.323631112690233, + "learning_rate": 6.64034690575741e-06, + "loss": 2.466, + "step": 6801 + }, + { + "epoch": 1.36312625250501, + "grad_norm": 27.224218657031855, + "learning_rate": 6.639245504049845e-06, + "loss": 2.4407, + "step": 6802 + }, + { + "epoch": 1.3633266533066133, + "grad_norm": 24.834887280921897, + "learning_rate": 6.638144013214083e-06, + "loss": 2.8583, + "step": 6803 + }, + { + "epoch": 1.3635270541082165, + "grad_norm": 22.282259839894518, + "learning_rate": 6.6370424333100134e-06, + "loss": 3.0371, + "step": 6804 + }, + { + "epoch": 1.3637274549098195, + "grad_norm": 27.57987068694597, + "learning_rate": 6.635940764397531e-06, + "loss": 2.4352, + "step": 6805 + }, + { + "epoch": 1.363927855711423, + "grad_norm": 22.030659203568533, + "learning_rate": 6.634839006536533e-06, + "loss": 2.8693, + "step": 6806 + }, + { + "epoch": 1.364128256513026, + "grad_norm": 25.655249368859007, + "learning_rate": 6.633737159786925e-06, + "loss": 2.6382, + "step": 6807 + }, + { + "epoch": 1.3643286573146294, + "grad_norm": 33.863507870841254, + "learning_rate": 6.632635224208617e-06, + "loss": 2.5663, + "step": 6808 + }, + { + "epoch": 1.3645290581162324, + "grad_norm": 34.750919085137255, + "learning_rate": 6.631533199861522e-06, + "loss": 3.0367, + "step": 6809 + }, + { + "epoch": 1.3647294589178356, + "grad_norm": 23.663222058185795, + "learning_rate": 6.6304310868055575e-06, + "loss": 2.5442, + "step": 6810 + }, + { + "epoch": 1.3649298597194388, + "grad_norm": 20.62403098107511, + "learning_rate": 6.629328885100648e-06, + "loss": 2.7785, + "step": 6811 + }, + { + "epoch": 1.365130260521042, + "grad_norm": 31.510125747214268, + "learning_rate": 6.628226594806722e-06, + "loss": 2.9853, + "step": 6812 + }, + { + "epoch": 1.3653306613226452, + "grad_norm": 33.32297176621012, + "learning_rate": 6.627124215983713e-06, + "loss": 2.7715, + "step": 6813 + }, + { + "epoch": 1.3655310621242485, + "grad_norm": 32.79899034224441, + "learning_rate": 6.626021748691558e-06, + "loss": 3.2611, + "step": 6814 + }, + { + "epoch": 1.3657314629258517, + "grad_norm": 26.807954291829216, + "learning_rate": 6.624919192990201e-06, + "loss": 2.5016, + "step": 6815 + }, + { + "epoch": 1.365931863727455, + "grad_norm": 21.508717169978905, + "learning_rate": 6.6238165489395886e-06, + "loss": 2.5506, + "step": 6816 + }, + { + "epoch": 1.3661322645290581, + "grad_norm": 26.00926582044873, + "learning_rate": 6.622713816599673e-06, + "loss": 2.6546, + "step": 6817 + }, + { + "epoch": 1.3663326653306613, + "grad_norm": 31.511882489233006, + "learning_rate": 6.6216109960304135e-06, + "loss": 2.1724, + "step": 6818 + }, + { + "epoch": 1.3665330661322646, + "grad_norm": 20.735328568521506, + "learning_rate": 6.620508087291768e-06, + "loss": 2.6291, + "step": 6819 + }, + { + "epoch": 1.3667334669338678, + "grad_norm": 31.19568844606835, + "learning_rate": 6.6194050904437065e-06, + "loss": 2.7517, + "step": 6820 + }, + { + "epoch": 1.366933867735471, + "grad_norm": 36.5548561822805, + "learning_rate": 6.618302005546201e-06, + "loss": 3.2522, + "step": 6821 + }, + { + "epoch": 1.3671342685370742, + "grad_norm": 21.931535332575834, + "learning_rate": 6.617198832659226e-06, + "loss": 2.6488, + "step": 6822 + }, + { + "epoch": 1.3673346693386774, + "grad_norm": 
30.079491910703698, + "learning_rate": 6.616095571842762e-06, + "loss": 2.9881, + "step": 6823 + }, + { + "epoch": 1.3675350701402806, + "grad_norm": 19.127657447331305, + "learning_rate": 6.614992223156797e-06, + "loss": 2.8394, + "step": 6824 + }, + { + "epoch": 1.3677354709418839, + "grad_norm": 42.71616071623532, + "learning_rate": 6.613888786661322e-06, + "loss": 2.6312, + "step": 6825 + }, + { + "epoch": 1.3679358717434869, + "grad_norm": 36.25492960311852, + "learning_rate": 6.61278526241633e-06, + "loss": 3.3294, + "step": 6826 + }, + { + "epoch": 1.3681362725450903, + "grad_norm": 22.531518725184867, + "learning_rate": 6.6116816504818236e-06, + "loss": 2.5079, + "step": 6827 + }, + { + "epoch": 1.3683366733466933, + "grad_norm": 21.23719996055576, + "learning_rate": 6.610577950917807e-06, + "loss": 2.3717, + "step": 6828 + }, + { + "epoch": 1.3685370741482967, + "grad_norm": 79.63930866931966, + "learning_rate": 6.6094741637842885e-06, + "loss": 2.601, + "step": 6829 + }, + { + "epoch": 1.3687374749498997, + "grad_norm": 44.762606377086065, + "learning_rate": 6.608370289141284e-06, + "loss": 2.654, + "step": 6830 + }, + { + "epoch": 1.368937875751503, + "grad_norm": 18.526890939290286, + "learning_rate": 6.607266327048813e-06, + "loss": 2.1565, + "step": 6831 + }, + { + "epoch": 1.3691382765531062, + "grad_norm": 19.382061900713417, + "learning_rate": 6.6061622775668995e-06, + "loss": 2.68, + "step": 6832 + }, + { + "epoch": 1.3693386773547094, + "grad_norm": 19.296051976447373, + "learning_rate": 6.6050581407555736e-06, + "loss": 2.792, + "step": 6833 + }, + { + "epoch": 1.3695390781563126, + "grad_norm": 26.449730158254884, + "learning_rate": 6.603953916674865e-06, + "loss": 2.4575, + "step": 6834 + }, + { + "epoch": 1.3697394789579158, + "grad_norm": 30.214253060227733, + "learning_rate": 6.602849605384815e-06, + "loss": 2.5381, + "step": 6835 + }, + { + "epoch": 1.369939879759519, + "grad_norm": 21.954302060208317, + "learning_rate": 6.601745206945465e-06, + "loss": 2.3859, + "step": 6836 + }, + { + "epoch": 1.3701402805611222, + "grad_norm": 70.1236833028014, + "learning_rate": 6.6006407214168645e-06, + "loss": 3.0443, + "step": 6837 + }, + { + "epoch": 1.3703406813627255, + "grad_norm": 23.281417946179403, + "learning_rate": 6.599536148859066e-06, + "loss": 2.8703, + "step": 6838 + }, + { + "epoch": 1.3705410821643287, + "grad_norm": 23.463057271773675, + "learning_rate": 6.598431489332125e-06, + "loss": 3.0717, + "step": 6839 + }, + { + "epoch": 1.370741482965932, + "grad_norm": 34.48899714299252, + "learning_rate": 6.597326742896106e-06, + "loss": 2.8119, + "step": 6840 + }, + { + "epoch": 1.3709418837675351, + "grad_norm": 24.249537783755414, + "learning_rate": 6.596221909611073e-06, + "loss": 2.8641, + "step": 6841 + }, + { + "epoch": 1.3711422845691383, + "grad_norm": 20.26051709234332, + "learning_rate": 6.595116989537097e-06, + "loss": 3.3193, + "step": 6842 + }, + { + "epoch": 1.3713426853707416, + "grad_norm": 28.32415694363436, + "learning_rate": 6.594011982734258e-06, + "loss": 2.9688, + "step": 6843 + }, + { + "epoch": 1.3715430861723448, + "grad_norm": 29.373186025641093, + "learning_rate": 6.592906889262633e-06, + "loss": 2.4663, + "step": 6844 + }, + { + "epoch": 1.3717434869739478, + "grad_norm": 21.411413263574794, + "learning_rate": 6.59180170918231e-06, + "loss": 2.179, + "step": 6845 + }, + { + "epoch": 1.3719438877755512, + "grad_norm": 26.918447851094164, + "learning_rate": 6.590696442553377e-06, + "loss": 2.9339, + "step": 6846 + }, + { + "epoch": 
1.3721442885771542, + "grad_norm": 25.997238951896488, + "learning_rate": 6.589591089435932e-06, + "loss": 2.4357, + "step": 6847 + }, + { + "epoch": 1.3723446893787576, + "grad_norm": 54.01250372782054, + "learning_rate": 6.588485649890072e-06, + "loss": 3.1541, + "step": 6848 + }, + { + "epoch": 1.3725450901803606, + "grad_norm": 27.20494846618657, + "learning_rate": 6.587380123975902e-06, + "loss": 3.2744, + "step": 6849 + }, + { + "epoch": 1.3727454909819639, + "grad_norm": 37.122264387341716, + "learning_rate": 6.5862745117535336e-06, + "loss": 3.0818, + "step": 6850 + }, + { + "epoch": 1.372945891783567, + "grad_norm": 22.28805707158276, + "learning_rate": 6.585168813283076e-06, + "loss": 2.683, + "step": 6851 + }, + { + "epoch": 1.3731462925851703, + "grad_norm": 20.490170586694457, + "learning_rate": 6.584063028624651e-06, + "loss": 2.3405, + "step": 6852 + }, + { + "epoch": 1.3733466933867735, + "grad_norm": 31.091778795208842, + "learning_rate": 6.582957157838382e-06, + "loss": 2.767, + "step": 6853 + }, + { + "epoch": 1.3735470941883767, + "grad_norm": 26.451102711041205, + "learning_rate": 6.581851200984395e-06, + "loss": 2.9265, + "step": 6854 + }, + { + "epoch": 1.37374749498998, + "grad_norm": 18.81740124964915, + "learning_rate": 6.580745158122822e-06, + "loss": 2.5588, + "step": 6855 + }, + { + "epoch": 1.3739478957915832, + "grad_norm": 22.789772153600868, + "learning_rate": 6.5796390293138025e-06, + "loss": 2.6076, + "step": 6856 + }, + { + "epoch": 1.3741482965931864, + "grad_norm": 26.306177745422058, + "learning_rate": 6.5785328146174764e-06, + "loss": 2.625, + "step": 6857 + }, + { + "epoch": 1.3743486973947896, + "grad_norm": 28.51574158155133, + "learning_rate": 6.577426514093992e-06, + "loss": 2.5711, + "step": 6858 + }, + { + "epoch": 1.3745490981963928, + "grad_norm": 31.16053962025778, + "learning_rate": 6.5763201278034996e-06, + "loss": 2.4792, + "step": 6859 + }, + { + "epoch": 1.374749498997996, + "grad_norm": 23.907907479149777, + "learning_rate": 6.575213655806154e-06, + "loss": 2.6188, + "step": 6860 + }, + { + "epoch": 1.3749498997995993, + "grad_norm": 26.996688287364602, + "learning_rate": 6.574107098162117e-06, + "loss": 2.6116, + "step": 6861 + }, + { + "epoch": 1.3751503006012025, + "grad_norm": 35.4500754925626, + "learning_rate": 6.573000454931553e-06, + "loss": 2.8095, + "step": 6862 + }, + { + "epoch": 1.3753507014028057, + "grad_norm": 38.9502253852131, + "learning_rate": 6.571893726174634e-06, + "loss": 2.3934, + "step": 6863 + }, + { + "epoch": 1.3755511022044087, + "grad_norm": 23.99478531778641, + "learning_rate": 6.570786911951531e-06, + "loss": 2.6574, + "step": 6864 + }, + { + "epoch": 1.3757515030060121, + "grad_norm": 35.15265215022773, + "learning_rate": 6.569680012322427e-06, + "loss": 3.11, + "step": 6865 + }, + { + "epoch": 1.3759519038076151, + "grad_norm": 31.08619116657859, + "learning_rate": 6.568573027347503e-06, + "loss": 2.3509, + "step": 6866 + }, + { + "epoch": 1.3761523046092186, + "grad_norm": 23.222646612181357, + "learning_rate": 6.567465957086946e-06, + "loss": 2.7381, + "step": 6867 + }, + { + "epoch": 1.3763527054108216, + "grad_norm": 29.434550555971764, + "learning_rate": 6.5663588016009515e-06, + "loss": 2.6104, + "step": 6868 + }, + { + "epoch": 1.3765531062124248, + "grad_norm": 33.87684434299598, + "learning_rate": 6.565251560949718e-06, + "loss": 2.8009, + "step": 6869 + }, + { + "epoch": 1.376753507014028, + "grad_norm": 36.84892099288119, + "learning_rate": 6.564144235193447e-06, + "loss": 2.87, + "step": 
6870 + }, + { + "epoch": 1.3769539078156312, + "grad_norm": 33.59867736273839, + "learning_rate": 6.563036824392345e-06, + "loss": 3.3877, + "step": 6871 + }, + { + "epoch": 1.3771543086172344, + "grad_norm": 23.376015473712147, + "learning_rate": 6.561929328606623e-06, + "loss": 2.5912, + "step": 6872 + }, + { + "epoch": 1.3773547094188376, + "grad_norm": 38.84362812407682, + "learning_rate": 6.560821747896497e-06, + "loss": 2.9149, + "step": 6873 + }, + { + "epoch": 1.3775551102204409, + "grad_norm": 29.820945400854182, + "learning_rate": 6.559714082322189e-06, + "loss": 2.896, + "step": 6874 + }, + { + "epoch": 1.377755511022044, + "grad_norm": 22.29237131233141, + "learning_rate": 6.5586063319439254e-06, + "loss": 2.3017, + "step": 6875 + }, + { + "epoch": 1.3779559118236473, + "grad_norm": 30.539791287059217, + "learning_rate": 6.557498496821934e-06, + "loss": 2.6311, + "step": 6876 + }, + { + "epoch": 1.3781563126252505, + "grad_norm": 27.436508760298636, + "learning_rate": 6.556390577016451e-06, + "loss": 2.3773, + "step": 6877 + }, + { + "epoch": 1.3783567134268537, + "grad_norm": 30.54649511593421, + "learning_rate": 6.5552825725877134e-06, + "loss": 2.8663, + "step": 6878 + }, + { + "epoch": 1.378557114228457, + "grad_norm": 21.506365002341674, + "learning_rate": 6.554174483595966e-06, + "loss": 2.852, + "step": 6879 + }, + { + "epoch": 1.3787575150300602, + "grad_norm": 25.64894039953564, + "learning_rate": 6.553066310101459e-06, + "loss": 2.3043, + "step": 6880 + }, + { + "epoch": 1.3789579158316634, + "grad_norm": 34.27582870639749, + "learning_rate": 6.551958052164445e-06, + "loss": 2.6907, + "step": 6881 + }, + { + "epoch": 1.3791583166332666, + "grad_norm": 25.862041572541127, + "learning_rate": 6.5508497098451795e-06, + "loss": 3.0916, + "step": 6882 + }, + { + "epoch": 1.3793587174348698, + "grad_norm": 42.81597509469587, + "learning_rate": 6.549741283203927e-06, + "loss": 2.9817, + "step": 6883 + }, + { + "epoch": 1.379559118236473, + "grad_norm": 29.97316884288077, + "learning_rate": 6.548632772300951e-06, + "loss": 2.8187, + "step": 6884 + }, + { + "epoch": 1.379759519038076, + "grad_norm": 27.342996554382978, + "learning_rate": 6.547524177196528e-06, + "loss": 2.6004, + "step": 6885 + }, + { + "epoch": 1.3799599198396795, + "grad_norm": 24.94026049095977, + "learning_rate": 6.546415497950929e-06, + "loss": 2.5981, + "step": 6886 + }, + { + "epoch": 1.3801603206412825, + "grad_norm": 22.257317686116856, + "learning_rate": 6.5453067346244385e-06, + "loss": 2.8141, + "step": 6887 + }, + { + "epoch": 1.380360721442886, + "grad_norm": 28.00719645133037, + "learning_rate": 6.5441978872773385e-06, + "loss": 2.9533, + "step": 6888 + }, + { + "epoch": 1.380561122244489, + "grad_norm": 23.59682173277437, + "learning_rate": 6.54308895596992e-06, + "loss": 2.4717, + "step": 6889 + }, + { + "epoch": 1.3807615230460921, + "grad_norm": 29.97561230441295, + "learning_rate": 6.541979940762479e-06, + "loss": 2.9214, + "step": 6890 + }, + { + "epoch": 1.3809619238476953, + "grad_norm": 30.51021829884051, + "learning_rate": 6.54087084171531e-06, + "loss": 2.8611, + "step": 6891 + }, + { + "epoch": 1.3811623246492986, + "grad_norm": 31.551898985551517, + "learning_rate": 6.539761658888719e-06, + "loss": 2.9988, + "step": 6892 + }, + { + "epoch": 1.3813627254509018, + "grad_norm": 31.34113891870874, + "learning_rate": 6.538652392343014e-06, + "loss": 2.8266, + "step": 6893 + }, + { + "epoch": 1.381563126252505, + "grad_norm": 39.76801523848996, + "learning_rate": 6.537543042138509e-06, + 
"loss": 3.6258, + "step": 6894 + }, + { + "epoch": 1.3817635270541082, + "grad_norm": 25.358989515381428, + "learning_rate": 6.536433608335517e-06, + "loss": 3.2713, + "step": 6895 + }, + { + "epoch": 1.3819639278557114, + "grad_norm": 24.597530231094474, + "learning_rate": 6.535324090994362e-06, + "loss": 3.0201, + "step": 6896 + }, + { + "epoch": 1.3821643286573146, + "grad_norm": 18.631186662867922, + "learning_rate": 6.534214490175372e-06, + "loss": 2.8056, + "step": 6897 + }, + { + "epoch": 1.3823647294589179, + "grad_norm": 19.45561584679754, + "learning_rate": 6.533104805938874e-06, + "loss": 2.7476, + "step": 6898 + }, + { + "epoch": 1.382565130260521, + "grad_norm": 21.543134664793808, + "learning_rate": 6.5319950383452035e-06, + "loss": 3.1561, + "step": 6899 + }, + { + "epoch": 1.3827655310621243, + "grad_norm": 26.31111688551921, + "learning_rate": 6.530885187454702e-06, + "loss": 2.8956, + "step": 6900 + }, + { + "epoch": 1.3829659318637275, + "grad_norm": 24.411335728002516, + "learning_rate": 6.5297752533277124e-06, + "loss": 2.9687, + "step": 6901 + }, + { + "epoch": 1.3831663326653307, + "grad_norm": 29.9043894084629, + "learning_rate": 6.528665236024587e-06, + "loss": 2.9651, + "step": 6902 + }, + { + "epoch": 1.383366733466934, + "grad_norm": 25.219502709655348, + "learning_rate": 6.527555135605673e-06, + "loss": 2.5502, + "step": 6903 + }, + { + "epoch": 1.383567134268537, + "grad_norm": 22.328624343443938, + "learning_rate": 6.526444952131334e-06, + "loss": 2.8638, + "step": 6904 + }, + { + "epoch": 1.3837675350701404, + "grad_norm": 26.052118625113817, + "learning_rate": 6.525334685661927e-06, + "loss": 3.2369, + "step": 6905 + }, + { + "epoch": 1.3839679358717434, + "grad_norm": 33.400405411577594, + "learning_rate": 6.524224336257823e-06, + "loss": 2.2719, + "step": 6906 + }, + { + "epoch": 1.3841683366733468, + "grad_norm": 18.720316619782576, + "learning_rate": 6.5231139039793924e-06, + "loss": 2.402, + "step": 6907 + }, + { + "epoch": 1.3843687374749498, + "grad_norm": 21.43565297408075, + "learning_rate": 6.522003388887009e-06, + "loss": 3.1236, + "step": 6908 + }, + { + "epoch": 1.384569138276553, + "grad_norm": 39.85910559013166, + "learning_rate": 6.520892791041057e-06, + "loss": 3.1666, + "step": 6909 + }, + { + "epoch": 1.3847695390781563, + "grad_norm": 31.860802556076347, + "learning_rate": 6.519782110501917e-06, + "loss": 2.8946, + "step": 6910 + }, + { + "epoch": 1.3849699398797595, + "grad_norm": 42.16266499283713, + "learning_rate": 6.5186713473299815e-06, + "loss": 2.4667, + "step": 6911 + }, + { + "epoch": 1.3851703406813627, + "grad_norm": 18.878965824473443, + "learning_rate": 6.517560501585641e-06, + "loss": 2.2526, + "step": 6912 + }, + { + "epoch": 1.385370741482966, + "grad_norm": 32.582885376284324, + "learning_rate": 6.516449573329298e-06, + "loss": 2.6974, + "step": 6913 + }, + { + "epoch": 1.3855711422845691, + "grad_norm": 26.274352598095707, + "learning_rate": 6.515338562621352e-06, + "loss": 2.3419, + "step": 6914 + }, + { + "epoch": 1.3857715430861723, + "grad_norm": 23.33808091092552, + "learning_rate": 6.514227469522212e-06, + "loss": 3.0972, + "step": 6915 + }, + { + "epoch": 1.3859719438877756, + "grad_norm": 27.82115719805529, + "learning_rate": 6.513116294092289e-06, + "loss": 3.0784, + "step": 6916 + }, + { + "epoch": 1.3861723446893788, + "grad_norm": 25.072939361522607, + "learning_rate": 6.512005036392e-06, + "loss": 2.8681, + "step": 6917 + }, + { + "epoch": 1.386372745490982, + "grad_norm": 34.70400858879536, + 
"learning_rate": 6.510893696481763e-06, + "loss": 2.7081, + "step": 6918 + }, + { + "epoch": 1.3865731462925852, + "grad_norm": 21.016016292565723, + "learning_rate": 6.509782274422009e-06, + "loss": 2.556, + "step": 6919 + }, + { + "epoch": 1.3867735470941884, + "grad_norm": 30.025311812516676, + "learning_rate": 6.508670770273162e-06, + "loss": 2.5151, + "step": 6920 + }, + { + "epoch": 1.3869739478957916, + "grad_norm": 21.021562904112397, + "learning_rate": 6.50755918409566e-06, + "loss": 2.4645, + "step": 6921 + }, + { + "epoch": 1.3871743486973949, + "grad_norm": 22.888013237406593, + "learning_rate": 6.506447515949939e-06, + "loss": 2.3533, + "step": 6922 + }, + { + "epoch": 1.3873747494989979, + "grad_norm": 21.657892740339786, + "learning_rate": 6.5053357658964415e-06, + "loss": 2.94, + "step": 6923 + }, + { + "epoch": 1.3875751503006013, + "grad_norm": 22.835035022655404, + "learning_rate": 6.504223933995617e-06, + "loss": 2.745, + "step": 6924 + }, + { + "epoch": 1.3877755511022043, + "grad_norm": 28.421452807884805, + "learning_rate": 6.503112020307916e-06, + "loss": 2.7957, + "step": 6925 + }, + { + "epoch": 1.3879759519038077, + "grad_norm": 21.625882265485686, + "learning_rate": 6.5020000248937975e-06, + "loss": 2.2384, + "step": 6926 + }, + { + "epoch": 1.3881763527054107, + "grad_norm": 30.364308865393973, + "learning_rate": 6.50088794781372e-06, + "loss": 2.6756, + "step": 6927 + }, + { + "epoch": 1.388376753507014, + "grad_norm": 23.763887836165843, + "learning_rate": 6.499775789128149e-06, + "loss": 2.6497, + "step": 6928 + }, + { + "epoch": 1.3885771543086172, + "grad_norm": 35.937639736593276, + "learning_rate": 6.498663548897554e-06, + "loss": 2.6638, + "step": 6929 + }, + { + "epoch": 1.3887775551102204, + "grad_norm": 23.13182294098869, + "learning_rate": 6.497551227182409e-06, + "loss": 2.8052, + "step": 6930 + }, + { + "epoch": 1.3889779559118236, + "grad_norm": 27.353506006944993, + "learning_rate": 6.496438824043195e-06, + "loss": 2.8051, + "step": 6931 + }, + { + "epoch": 1.3891783567134268, + "grad_norm": 29.881350540094203, + "learning_rate": 6.495326339540392e-06, + "loss": 2.6576, + "step": 6932 + }, + { + "epoch": 1.38937875751503, + "grad_norm": 20.332035733984696, + "learning_rate": 6.4942137737344905e-06, + "loss": 2.2493, + "step": 6933 + }, + { + "epoch": 1.3895791583166333, + "grad_norm": 24.410080182296408, + "learning_rate": 6.493101126685978e-06, + "loss": 2.9592, + "step": 6934 + }, + { + "epoch": 1.3897795591182365, + "grad_norm": 26.36181753478042, + "learning_rate": 6.491988398455353e-06, + "loss": 2.5813, + "step": 6935 + }, + { + "epoch": 1.3899799599198397, + "grad_norm": 46.76155804443375, + "learning_rate": 6.490875589103118e-06, + "loss": 2.8169, + "step": 6936 + }, + { + "epoch": 1.390180360721443, + "grad_norm": 36.48753986959931, + "learning_rate": 6.4897626986897755e-06, + "loss": 3.394, + "step": 6937 + }, + { + "epoch": 1.3903807615230461, + "grad_norm": 17.990673071681666, + "learning_rate": 6.488649727275838e-06, + "loss": 2.5446, + "step": 6938 + }, + { + "epoch": 1.3905811623246493, + "grad_norm": 24.301629165392725, + "learning_rate": 6.487536674921815e-06, + "loss": 2.5004, + "step": 6939 + }, + { + "epoch": 1.3907815631262526, + "grad_norm": 22.911821144496546, + "learning_rate": 6.486423541688228e-06, + "loss": 2.5278, + "step": 6940 + }, + { + "epoch": 1.3909819639278558, + "grad_norm": 28.43992712444539, + "learning_rate": 6.4853103276356e-06, + "loss": 2.6967, + "step": 6941 + }, + { + "epoch": 1.391182364729459, + 
"grad_norm": 18.96221321004608, + "learning_rate": 6.484197032824454e-06, + "loss": 2.565, + "step": 6942 + }, + { + "epoch": 1.3913827655310622, + "grad_norm": 36.08823490233895, + "learning_rate": 6.483083657315328e-06, + "loss": 3.3436, + "step": 6943 + }, + { + "epoch": 1.3915831663326652, + "grad_norm": 34.12540243346821, + "learning_rate": 6.481970201168752e-06, + "loss": 2.7532, + "step": 6944 + }, + { + "epoch": 1.3917835671342687, + "grad_norm": 26.618689705033862, + "learning_rate": 6.480856664445269e-06, + "loss": 2.6006, + "step": 6945 + }, + { + "epoch": 1.3919839679358716, + "grad_norm": 23.018044743469634, + "learning_rate": 6.479743047205425e-06, + "loss": 2.308, + "step": 6946 + }, + { + "epoch": 1.392184368737475, + "grad_norm": 42.780784504597555, + "learning_rate": 6.4786293495097645e-06, + "loss": 3.0137, + "step": 6947 + }, + { + "epoch": 1.392384769539078, + "grad_norm": 27.49782982537682, + "learning_rate": 6.477515571418844e-06, + "loss": 2.66, + "step": 6948 + }, + { + "epoch": 1.3925851703406813, + "grad_norm": 43.94711369446778, + "learning_rate": 6.476401712993222e-06, + "loss": 3.2229, + "step": 6949 + }, + { + "epoch": 1.3927855711422845, + "grad_norm": 19.476834182492237, + "learning_rate": 6.475287774293459e-06, + "loss": 2.5008, + "step": 6950 + }, + { + "epoch": 1.3929859719438877, + "grad_norm": 33.30906292331856, + "learning_rate": 6.474173755380123e-06, + "loss": 3.5104, + "step": 6951 + }, + { + "epoch": 1.393186372745491, + "grad_norm": 30.62496044639932, + "learning_rate": 6.473059656313783e-06, + "loss": 2.876, + "step": 6952 + }, + { + "epoch": 1.3933867735470942, + "grad_norm": 19.134237115071432, + "learning_rate": 6.471945477155015e-06, + "loss": 2.9872, + "step": 6953 + }, + { + "epoch": 1.3935871743486974, + "grad_norm": 22.5423826148487, + "learning_rate": 6.4708312179643995e-06, + "loss": 2.7751, + "step": 6954 + }, + { + "epoch": 1.3937875751503006, + "grad_norm": 25.307692589828683, + "learning_rate": 6.469716878802518e-06, + "loss": 2.8726, + "step": 6955 + }, + { + "epoch": 1.3939879759519038, + "grad_norm": 17.146652984428254, + "learning_rate": 6.46860245972996e-06, + "loss": 2.7226, + "step": 6956 + }, + { + "epoch": 1.394188376753507, + "grad_norm": 26.67706482744913, + "learning_rate": 6.46748796080732e-06, + "loss": 2.4512, + "step": 6957 + }, + { + "epoch": 1.3943887775551103, + "grad_norm": 25.697508405698418, + "learning_rate": 6.466373382095194e-06, + "loss": 2.6383, + "step": 6958 + }, + { + "epoch": 1.3945891783567135, + "grad_norm": 34.60266944714127, + "learning_rate": 6.46525872365418e-06, + "loss": 2.5545, + "step": 6959 + }, + { + "epoch": 1.3947895791583167, + "grad_norm": 27.161115349416463, + "learning_rate": 6.464143985544887e-06, + "loss": 2.6122, + "step": 6960 + }, + { + "epoch": 1.39498997995992, + "grad_norm": 28.626879664441734, + "learning_rate": 6.463029167827926e-06, + "loss": 2.1229, + "step": 6961 + }, + { + "epoch": 1.3951903807615231, + "grad_norm": 25.26191968638323, + "learning_rate": 6.461914270563907e-06, + "loss": 2.6623, + "step": 6962 + }, + { + "epoch": 1.3953907815631261, + "grad_norm": 27.4878783929443, + "learning_rate": 6.460799293813453e-06, + "loss": 2.2216, + "step": 6963 + }, + { + "epoch": 1.3955911823647296, + "grad_norm": 22.14024073705447, + "learning_rate": 6.459684237637185e-06, + "loss": 2.7056, + "step": 6964 + }, + { + "epoch": 1.3957915831663326, + "grad_norm": 23.35936451499328, + "learning_rate": 6.458569102095731e-06, + "loss": 2.6468, + "step": 6965 + }, + { + "epoch": 
1.395991983967936, + "grad_norm": 23.13719075149973, + "learning_rate": 6.45745388724972e-06, + "loss": 2.9291, + "step": 6966 + }, + { + "epoch": 1.396192384769539, + "grad_norm": 22.426401359700606, + "learning_rate": 6.456338593159791e-06, + "loss": 2.8652, + "step": 6967 + }, + { + "epoch": 1.3963927855711422, + "grad_norm": 24.116446594041765, + "learning_rate": 6.455223219886582e-06, + "loss": 2.1965, + "step": 6968 + }, + { + "epoch": 1.3965931863727454, + "grad_norm": 20.987999275697835, + "learning_rate": 6.454107767490738e-06, + "loss": 2.525, + "step": 6969 + }, + { + "epoch": 1.3967935871743486, + "grad_norm": 28.006861517291266, + "learning_rate": 6.4529922360329116e-06, + "loss": 2.93, + "step": 6970 + }, + { + "epoch": 1.3969939879759519, + "grad_norm": 22.222470138532483, + "learning_rate": 6.451876625573749e-06, + "loss": 2.6524, + "step": 6971 + }, + { + "epoch": 1.397194388777555, + "grad_norm": 33.53746051054405, + "learning_rate": 6.450760936173912e-06, + "loss": 3.0435, + "step": 6972 + }, + { + "epoch": 1.3973947895791583, + "grad_norm": 28.24768329340634, + "learning_rate": 6.449645167894061e-06, + "loss": 2.7118, + "step": 6973 + }, + { + "epoch": 1.3975951903807615, + "grad_norm": 28.05669944123124, + "learning_rate": 6.448529320794863e-06, + "loss": 3.1238, + "step": 6974 + }, + { + "epoch": 1.3977955911823647, + "grad_norm": 27.038447468756004, + "learning_rate": 6.447413394936987e-06, + "loss": 3.2768, + "step": 6975 + }, + { + "epoch": 1.397995991983968, + "grad_norm": 20.054354180644143, + "learning_rate": 6.4462973903811086e-06, + "loss": 2.4323, + "step": 6976 + }, + { + "epoch": 1.3981963927855712, + "grad_norm": 26.007066456429193, + "learning_rate": 6.445181307187908e-06, + "loss": 2.6052, + "step": 6977 + }, + { + "epoch": 1.3983967935871744, + "grad_norm": 110.8126171035785, + "learning_rate": 6.444065145418063e-06, + "loss": 2.556, + "step": 6978 + }, + { + "epoch": 1.3985971943887776, + "grad_norm": 28.222208640337204, + "learning_rate": 6.442948905132267e-06, + "loss": 2.9086, + "step": 6979 + }, + { + "epoch": 1.3987975951903808, + "grad_norm": 24.71475176130314, + "learning_rate": 6.441832586391208e-06, + "loss": 3.1611, + "step": 6980 + }, + { + "epoch": 1.398997995991984, + "grad_norm": 23.424505292741188, + "learning_rate": 6.440716189255582e-06, + "loss": 2.7744, + "step": 6981 + }, + { + "epoch": 1.399198396793587, + "grad_norm": 36.3594407306032, + "learning_rate": 6.439599713786093e-06, + "loss": 2.907, + "step": 6982 + }, + { + "epoch": 1.3993987975951905, + "grad_norm": 26.800646418575248, + "learning_rate": 6.438483160043439e-06, + "loss": 3.101, + "step": 6983 + }, + { + "epoch": 1.3995991983967935, + "grad_norm": 26.244131071715444, + "learning_rate": 6.437366528088334e-06, + "loss": 2.8467, + "step": 6984 + }, + { + "epoch": 1.399799599198397, + "grad_norm": 31.539907110129526, + "learning_rate": 6.436249817981488e-06, + "loss": 3.1938, + "step": 6985 + }, + { + "epoch": 1.4, + "grad_norm": 20.539353642306448, + "learning_rate": 6.43513302978362e-06, + "loss": 2.2866, + "step": 6986 + }, + { + "epoch": 1.4002004008016031, + "grad_norm": 17.48989892888267, + "learning_rate": 6.434016163555452e-06, + "loss": 2.5424, + "step": 6987 + }, + { + "epoch": 1.4004008016032063, + "grad_norm": 28.139286406360966, + "learning_rate": 6.432899219357707e-06, + "loss": 2.6997, + "step": 6988 + }, + { + "epoch": 1.4006012024048096, + "grad_norm": 42.96282612618221, + "learning_rate": 6.431782197251116e-06, + "loss": 2.8057, + "step": 6989 + }, + { + 
"epoch": 1.4008016032064128, + "grad_norm": 38.47521152823481, + "learning_rate": 6.430665097296416e-06, + "loss": 3.302, + "step": 6990 + }, + { + "epoch": 1.401002004008016, + "grad_norm": 27.295583925510957, + "learning_rate": 6.42954791955434e-06, + "loss": 3.3554, + "step": 6991 + }, + { + "epoch": 1.4012024048096192, + "grad_norm": 32.92579481415985, + "learning_rate": 6.428430664085635e-06, + "loss": 2.7022, + "step": 6992 + }, + { + "epoch": 1.4014028056112224, + "grad_norm": 29.097185872832636, + "learning_rate": 6.427313330951044e-06, + "loss": 3.2043, + "step": 6993 + }, + { + "epoch": 1.4016032064128257, + "grad_norm": 15.826722832935154, + "learning_rate": 6.426195920211322e-06, + "loss": 2.6517, + "step": 6994 + }, + { + "epoch": 1.4018036072144289, + "grad_norm": 21.51557747372415, + "learning_rate": 6.425078431927223e-06, + "loss": 2.5951, + "step": 6995 + }, + { + "epoch": 1.402004008016032, + "grad_norm": 29.963421412514936, + "learning_rate": 6.4239608661595056e-06, + "loss": 2.5924, + "step": 6996 + }, + { + "epoch": 1.4022044088176353, + "grad_norm": 23.301752579419997, + "learning_rate": 6.422843222968934e-06, + "loss": 2.3727, + "step": 6997 + }, + { + "epoch": 1.4024048096192385, + "grad_norm": 19.532326566311333, + "learning_rate": 6.421725502416274e-06, + "loss": 3.0602, + "step": 6998 + }, + { + "epoch": 1.4026052104208417, + "grad_norm": 20.248013898135582, + "learning_rate": 6.420607704562302e-06, + "loss": 2.744, + "step": 6999 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 39.76079724552932, + "learning_rate": 6.419489829467792e-06, + "loss": 2.8733, + "step": 7000 + }, + { + "epoch": 1.4030060120240482, + "grad_norm": 33.233951181719, + "learning_rate": 6.418371877193525e-06, + "loss": 3.4465, + "step": 7001 + }, + { + "epoch": 1.4032064128256514, + "grad_norm": 24.37264264968348, + "learning_rate": 6.417253847800285e-06, + "loss": 3.1454, + "step": 7002 + }, + { + "epoch": 1.4034068136272544, + "grad_norm": 27.25675107465488, + "learning_rate": 6.41613574134886e-06, + "loss": 2.6921, + "step": 7003 + }, + { + "epoch": 1.4036072144288578, + "grad_norm": 31.01795625595185, + "learning_rate": 6.415017557900045e-06, + "loss": 2.4302, + "step": 7004 + }, + { + "epoch": 1.4038076152304608, + "grad_norm": 26.095416900857984, + "learning_rate": 6.413899297514636e-06, + "loss": 2.5815, + "step": 7005 + }, + { + "epoch": 1.4040080160320643, + "grad_norm": 48.97884277564318, + "learning_rate": 6.412780960253437e-06, + "loss": 2.7815, + "step": 7006 + }, + { + "epoch": 1.4042084168336673, + "grad_norm": 37.77589159943879, + "learning_rate": 6.411662546177251e-06, + "loss": 3.0371, + "step": 7007 + }, + { + "epoch": 1.4044088176352705, + "grad_norm": 31.988868209460083, + "learning_rate": 6.410544055346888e-06, + "loss": 2.5374, + "step": 7008 + }, + { + "epoch": 1.4046092184368737, + "grad_norm": 38.99838335544912, + "learning_rate": 6.409425487823163e-06, + "loss": 2.575, + "step": 7009 + }, + { + "epoch": 1.404809619238477, + "grad_norm": 24.194132958089707, + "learning_rate": 6.408306843666894e-06, + "loss": 2.2131, + "step": 7010 + }, + { + "epoch": 1.4050100200400801, + "grad_norm": 18.753431913781338, + "learning_rate": 6.4071881229389014e-06, + "loss": 2.716, + "step": 7011 + }, + { + "epoch": 1.4052104208416833, + "grad_norm": 16.87732440319832, + "learning_rate": 6.406069325700015e-06, + "loss": 2.5147, + "step": 7012 + }, + { + "epoch": 1.4054108216432866, + "grad_norm": 37.590653167180605, + "learning_rate": 6.404950452011063e-06, + "loss": 2.895, + 
"step": 7013 + }, + { + "epoch": 1.4056112224448898, + "grad_norm": 24.356602697751562, + "learning_rate": 6.4038315019328825e-06, + "loss": 2.6892, + "step": 7014 + }, + { + "epoch": 1.405811623246493, + "grad_norm": 23.944452854381268, + "learning_rate": 6.402712475526309e-06, + "loss": 2.784, + "step": 7015 + }, + { + "epoch": 1.4060120240480962, + "grad_norm": 123.90149437468962, + "learning_rate": 6.401593372852189e-06, + "loss": 3.3268, + "step": 7016 + }, + { + "epoch": 1.4062124248496994, + "grad_norm": 17.48412433065974, + "learning_rate": 6.400474193971366e-06, + "loss": 2.3137, + "step": 7017 + }, + { + "epoch": 1.4064128256513027, + "grad_norm": 17.642482782551976, + "learning_rate": 6.399354938944695e-06, + "loss": 2.4033, + "step": 7018 + }, + { + "epoch": 1.4066132264529059, + "grad_norm": 23.99813887020733, + "learning_rate": 6.3982356078330324e-06, + "loss": 2.9553, + "step": 7019 + }, + { + "epoch": 1.406813627254509, + "grad_norm": 23.04089151846828, + "learning_rate": 6.397116200697234e-06, + "loss": 2.6547, + "step": 7020 + }, + { + "epoch": 1.4070140280561123, + "grad_norm": 39.83310211080346, + "learning_rate": 6.3959967175981655e-06, + "loss": 2.7621, + "step": 7021 + }, + { + "epoch": 1.4072144288577153, + "grad_norm": 20.359749966947476, + "learning_rate": 6.3948771585966965e-06, + "loss": 2.2462, + "step": 7022 + }, + { + "epoch": 1.4074148296593187, + "grad_norm": 25.652910889622326, + "learning_rate": 6.393757523753696e-06, + "loss": 2.3891, + "step": 7023 + }, + { + "epoch": 1.4076152304609217, + "grad_norm": 22.46327695492878, + "learning_rate": 6.39263781313004e-06, + "loss": 3.062, + "step": 7024 + }, + { + "epoch": 1.4078156312625252, + "grad_norm": 22.81728392335985, + "learning_rate": 6.391518026786612e-06, + "loss": 2.622, + "step": 7025 + }, + { + "epoch": 1.4080160320641282, + "grad_norm": 21.78733003657052, + "learning_rate": 6.390398164784295e-06, + "loss": 2.771, + "step": 7026 + }, + { + "epoch": 1.4082164328657314, + "grad_norm": 22.49408170634583, + "learning_rate": 6.389278227183977e-06, + "loss": 2.2869, + "step": 7027 + }, + { + "epoch": 1.4084168336673346, + "grad_norm": 73.13193550955054, + "learning_rate": 6.388158214046552e-06, + "loss": 2.8762, + "step": 7028 + }, + { + "epoch": 1.4086172344689378, + "grad_norm": 36.15689930512186, + "learning_rate": 6.387038125432914e-06, + "loss": 2.7967, + "step": 7029 + }, + { + "epoch": 1.408817635270541, + "grad_norm": 23.449344793189855, + "learning_rate": 6.385917961403966e-06, + "loss": 2.6776, + "step": 7030 + }, + { + "epoch": 1.4090180360721443, + "grad_norm": 21.229928891279044, + "learning_rate": 6.384797722020614e-06, + "loss": 2.5309, + "step": 7031 + }, + { + "epoch": 1.4092184368737475, + "grad_norm": 20.544757447715195, + "learning_rate": 6.383677407343764e-06, + "loss": 2.9024, + "step": 7032 + }, + { + "epoch": 1.4094188376753507, + "grad_norm": 33.061893909066114, + "learning_rate": 6.382557017434332e-06, + "loss": 2.1307, + "step": 7033 + }, + { + "epoch": 1.409619238476954, + "grad_norm": 23.69529591187376, + "learning_rate": 6.381436552353235e-06, + "loss": 3.0362, + "step": 7034 + }, + { + "epoch": 1.4098196392785571, + "grad_norm": 24.45331423351596, + "learning_rate": 6.380316012161392e-06, + "loss": 2.5539, + "step": 7035 + }, + { + "epoch": 1.4100200400801604, + "grad_norm": 23.363174370499856, + "learning_rate": 6.379195396919729e-06, + "loss": 3.0672, + "step": 7036 + }, + { + "epoch": 1.4102204408817636, + "grad_norm": 15.013761279055496, + "learning_rate": 
6.378074706689178e-06, + "loss": 2.8211, + "step": 7037 + }, + { + "epoch": 1.4104208416833668, + "grad_norm": 29.755230592950692, + "learning_rate": 6.37695394153067e-06, + "loss": 2.8398, + "step": 7038 + }, + { + "epoch": 1.41062124248497, + "grad_norm": 24.52285585954142, + "learning_rate": 6.375833101505145e-06, + "loss": 2.8621, + "step": 7039 + }, + { + "epoch": 1.4108216432865732, + "grad_norm": 24.00116696986443, + "learning_rate": 6.374712186673542e-06, + "loss": 2.7114, + "step": 7040 + }, + { + "epoch": 1.4110220440881762, + "grad_norm": 17.54985343817805, + "learning_rate": 6.37359119709681e-06, + "loss": 3.0186, + "step": 7041 + }, + { + "epoch": 1.4112224448897797, + "grad_norm": 24.0634607040012, + "learning_rate": 6.372470132835894e-06, + "loss": 2.4297, + "step": 7042 + }, + { + "epoch": 1.4114228456913827, + "grad_norm": 19.39472492395833, + "learning_rate": 6.371348993951753e-06, + "loss": 2.5978, + "step": 7043 + }, + { + "epoch": 1.411623246492986, + "grad_norm": 25.269736826412643, + "learning_rate": 6.370227780505342e-06, + "loss": 3.3297, + "step": 7044 + }, + { + "epoch": 1.411823647294589, + "grad_norm": 28.717625335407703, + "learning_rate": 6.3691064925576255e-06, + "loss": 3.0022, + "step": 7045 + }, + { + "epoch": 1.4120240480961923, + "grad_norm": 50.014861920146394, + "learning_rate": 6.3679851301695674e-06, + "loss": 3.3058, + "step": 7046 + }, + { + "epoch": 1.4122244488977955, + "grad_norm": 21.671664348041087, + "learning_rate": 6.366863693402138e-06, + "loss": 2.3528, + "step": 7047 + }, + { + "epoch": 1.4124248496993987, + "grad_norm": 31.057966135226835, + "learning_rate": 6.3657421823163115e-06, + "loss": 2.6872, + "step": 7048 + }, + { + "epoch": 1.412625250501002, + "grad_norm": 40.72682353711218, + "learning_rate": 6.364620596973067e-06, + "loss": 3.1406, + "step": 7049 + }, + { + "epoch": 1.4128256513026052, + "grad_norm": 15.633490488517682, + "learning_rate": 6.363498937433385e-06, + "loss": 2.2814, + "step": 7050 + }, + { + "epoch": 1.4130260521042084, + "grad_norm": 32.15640280886722, + "learning_rate": 6.362377203758255e-06, + "loss": 2.9193, + "step": 7051 + }, + { + "epoch": 1.4132264529058116, + "grad_norm": 24.581018744920165, + "learning_rate": 6.361255396008665e-06, + "loss": 2.61, + "step": 7052 + }, + { + "epoch": 1.4134268537074148, + "grad_norm": 29.792359115681137, + "learning_rate": 6.36013351424561e-06, + "loss": 2.3325, + "step": 7053 + }, + { + "epoch": 1.413627254509018, + "grad_norm": 33.45151432591716, + "learning_rate": 6.359011558530086e-06, + "loss": 2.8234, + "step": 7054 + }, + { + "epoch": 1.4138276553106213, + "grad_norm": 18.29783871334819, + "learning_rate": 6.357889528923099e-06, + "loss": 2.6097, + "step": 7055 + }, + { + "epoch": 1.4140280561122245, + "grad_norm": 40.04081385715326, + "learning_rate": 6.3567674254856546e-06, + "loss": 2.9654, + "step": 7056 + }, + { + "epoch": 1.4142284569138277, + "grad_norm": 19.391804748614774, + "learning_rate": 6.3556452482787615e-06, + "loss": 2.7877, + "step": 7057 + }, + { + "epoch": 1.414428857715431, + "grad_norm": 22.248822624786623, + "learning_rate": 6.354522997363436e-06, + "loss": 2.3823, + "step": 7058 + }, + { + "epoch": 1.4146292585170341, + "grad_norm": 24.75900454647992, + "learning_rate": 6.353400672800695e-06, + "loss": 2.9856, + "step": 7059 + }, + { + "epoch": 1.4148296593186374, + "grad_norm": 22.963797100898084, + "learning_rate": 6.352278274651562e-06, + "loss": 2.5242, + "step": 7060 + }, + { + "epoch": 1.4150300601202406, + "grad_norm": 
27.18983674131752, + "learning_rate": 6.3511558029770625e-06, + "loss": 2.4767, + "step": 7061 + }, + { + "epoch": 1.4152304609218436, + "grad_norm": 27.12445208556389, + "learning_rate": 6.350033257838226e-06, + "loss": 2.9892, + "step": 7062 + }, + { + "epoch": 1.415430861723447, + "grad_norm": 29.03198762559888, + "learning_rate": 6.348910639296092e-06, + "loss": 2.58, + "step": 7063 + }, + { + "epoch": 1.41563126252505, + "grad_norm": 32.50747024323299, + "learning_rate": 6.347787947411694e-06, + "loss": 2.8333, + "step": 7064 + }, + { + "epoch": 1.4158316633266534, + "grad_norm": 29.233841066950333, + "learning_rate": 6.346665182246075e-06, + "loss": 2.832, + "step": 7065 + }, + { + "epoch": 1.4160320641282564, + "grad_norm": 57.476685233435646, + "learning_rate": 6.345542343860283e-06, + "loss": 3.1198, + "step": 7066 + }, + { + "epoch": 1.4162324649298597, + "grad_norm": 27.63795394685166, + "learning_rate": 6.3444194323153656e-06, + "loss": 2.9165, + "step": 7067 + }, + { + "epoch": 1.4164328657314629, + "grad_norm": 28.03785738880918, + "learning_rate": 6.343296447672381e-06, + "loss": 2.6722, + "step": 7068 + }, + { + "epoch": 1.416633266533066, + "grad_norm": 40.30579624922172, + "learning_rate": 6.3421733899923854e-06, + "loss": 2.531, + "step": 7069 + }, + { + "epoch": 1.4168336673346693, + "grad_norm": 32.34954064989627, + "learning_rate": 6.341050259336442e-06, + "loss": 2.9252, + "step": 7070 + }, + { + "epoch": 1.4170340681362725, + "grad_norm": 24.442121350742784, + "learning_rate": 6.339927055765616e-06, + "loss": 3.1797, + "step": 7071 + }, + { + "epoch": 1.4172344689378757, + "grad_norm": 28.858448234993432, + "learning_rate": 6.338803779340976e-06, + "loss": 2.7813, + "step": 7072 + }, + { + "epoch": 1.417434869739479, + "grad_norm": 23.1360851279298, + "learning_rate": 6.337680430123601e-06, + "loss": 2.2007, + "step": 7073 + }, + { + "epoch": 1.4176352705410822, + "grad_norm": 31.722239678057274, + "learning_rate": 6.3365570081745645e-06, + "loss": 3.0961, + "step": 7074 + }, + { + "epoch": 1.4178356713426854, + "grad_norm": 21.321660040550977, + "learning_rate": 6.335433513554952e-06, + "loss": 2.2842, + "step": 7075 + }, + { + "epoch": 1.4180360721442886, + "grad_norm": 28.161986246834303, + "learning_rate": 6.334309946325848e-06, + "loss": 2.8246, + "step": 7076 + }, + { + "epoch": 1.4182364729458918, + "grad_norm": 35.21774322565007, + "learning_rate": 6.333186306548341e-06, + "loss": 2.9075, + "step": 7077 + }, + { + "epoch": 1.418436873747495, + "grad_norm": 32.06405866695639, + "learning_rate": 6.332062594283528e-06, + "loss": 2.9315, + "step": 7078 + }, + { + "epoch": 1.4186372745490983, + "grad_norm": 33.13800609032884, + "learning_rate": 6.330938809592505e-06, + "loss": 3.0231, + "step": 7079 + }, + { + "epoch": 1.4188376753507015, + "grad_norm": 22.25005489025947, + "learning_rate": 6.329814952536374e-06, + "loss": 2.5563, + "step": 7080 + }, + { + "epoch": 1.4190380761523045, + "grad_norm": 19.782800437726767, + "learning_rate": 6.328691023176241e-06, + "loss": 2.2458, + "step": 7081 + }, + { + "epoch": 1.419238476953908, + "grad_norm": 16.82471169932657, + "learning_rate": 6.327567021573217e-06, + "loss": 2.9495, + "step": 7082 + }, + { + "epoch": 1.419438877755511, + "grad_norm": 21.11814749644692, + "learning_rate": 6.326442947788413e-06, + "loss": 2.8415, + "step": 7083 + }, + { + "epoch": 1.4196392785571144, + "grad_norm": 26.411210012297815, + "learning_rate": 6.325318801882949e-06, + "loss": 3.3077, + "step": 7084 + }, + { + "epoch": 
1.4198396793587174, + "grad_norm": 23.179004149813082, + "learning_rate": 6.324194583917944e-06, + "loss": 2.1382, + "step": 7085 + }, + { + "epoch": 1.4200400801603206, + "grad_norm": 25.747505349342514, + "learning_rate": 6.323070293954525e-06, + "loss": 2.8372, + "step": 7086 + }, + { + "epoch": 1.4202404809619238, + "grad_norm": 27.223692808434297, + "learning_rate": 6.321945932053823e-06, + "loss": 3.1058, + "step": 7087 + }, + { + "epoch": 1.420440881763527, + "grad_norm": 24.307442928001972, + "learning_rate": 6.3208214982769674e-06, + "loss": 2.6805, + "step": 7088 + }, + { + "epoch": 1.4206412825651302, + "grad_norm": 27.48342198670738, + "learning_rate": 6.319696992685098e-06, + "loss": 2.9984, + "step": 7089 + }, + { + "epoch": 1.4208416833667334, + "grad_norm": 16.138798206267104, + "learning_rate": 6.318572415339356e-06, + "loss": 2.0484, + "step": 7090 + }, + { + "epoch": 1.4210420841683367, + "grad_norm": 24.75807270516743, + "learning_rate": 6.317447766300884e-06, + "loss": 2.8636, + "step": 7091 + }, + { + "epoch": 1.4212424849699399, + "grad_norm": 20.496440073625983, + "learning_rate": 6.3163230456308325e-06, + "loss": 2.2633, + "step": 7092 + }, + { + "epoch": 1.421442885771543, + "grad_norm": 29.82166704043389, + "learning_rate": 6.315198253390353e-06, + "loss": 2.7681, + "step": 7093 + }, + { + "epoch": 1.4216432865731463, + "grad_norm": 20.79134938605759, + "learning_rate": 6.3140733896406036e-06, + "loss": 2.7502, + "step": 7094 + }, + { + "epoch": 1.4218436873747495, + "grad_norm": 21.58733777080146, + "learning_rate": 6.312948454442746e-06, + "loss": 2.7099, + "step": 7095 + }, + { + "epoch": 1.4220440881763527, + "grad_norm": 22.352149071580655, + "learning_rate": 6.311823447857941e-06, + "loss": 3.0638, + "step": 7096 + }, + { + "epoch": 1.422244488977956, + "grad_norm": 27.42830424492222, + "learning_rate": 6.31069836994736e-06, + "loss": 2.8263, + "step": 7097 + }, + { + "epoch": 1.4224448897795592, + "grad_norm": 31.758771028365707, + "learning_rate": 6.309573220772172e-06, + "loss": 3.1217, + "step": 7098 + }, + { + "epoch": 1.4226452905811624, + "grad_norm": 22.21480506258998, + "learning_rate": 6.3084480003935545e-06, + "loss": 2.6686, + "step": 7099 + }, + { + "epoch": 1.4228456913827654, + "grad_norm": 20.588789245871098, + "learning_rate": 6.307322708872691e-06, + "loss": 2.4704, + "step": 7100 + }, + { + "epoch": 1.4230460921843688, + "grad_norm": 24.015733410110556, + "learning_rate": 6.306197346270759e-06, + "loss": 2.5296, + "step": 7101 + }, + { + "epoch": 1.4232464929859718, + "grad_norm": 29.388139007766544, + "learning_rate": 6.3050719126489515e-06, + "loss": 3.083, + "step": 7102 + }, + { + "epoch": 1.4234468937875753, + "grad_norm": 41.22003099145093, + "learning_rate": 6.303946408068457e-06, + "loss": 2.6375, + "step": 7103 + }, + { + "epoch": 1.4236472945891783, + "grad_norm": 25.075003707773575, + "learning_rate": 6.30282083259047e-06, + "loss": 2.8596, + "step": 7104 + }, + { + "epoch": 1.4238476953907815, + "grad_norm": 22.752220054539368, + "learning_rate": 6.301695186276192e-06, + "loss": 2.658, + "step": 7105 + }, + { + "epoch": 1.4240480961923847, + "grad_norm": 27.99900445456586, + "learning_rate": 6.300569469186825e-06, + "loss": 2.8982, + "step": 7106 + }, + { + "epoch": 1.424248496993988, + "grad_norm": 19.471502340823896, + "learning_rate": 6.299443681383578e-06, + "loss": 2.5815, + "step": 7107 + }, + { + "epoch": 1.4244488977955911, + "grad_norm": 24.152751560800134, + "learning_rate": 6.298317822927658e-06, + "loss": 2.6601, + 
"step": 7108 + }, + { + "epoch": 1.4246492985971944, + "grad_norm": 30.507738304507768, + "learning_rate": 6.297191893880281e-06, + "loss": 2.7128, + "step": 7109 + }, + { + "epoch": 1.4248496993987976, + "grad_norm": 21.468529260192735, + "learning_rate": 6.296065894302668e-06, + "loss": 2.597, + "step": 7110 + }, + { + "epoch": 1.4250501002004008, + "grad_norm": 27.90804212342553, + "learning_rate": 6.294939824256037e-06, + "loss": 2.7173, + "step": 7111 + }, + { + "epoch": 1.425250501002004, + "grad_norm": 21.65545564133948, + "learning_rate": 6.293813683801617e-06, + "loss": 3.1158, + "step": 7112 + }, + { + "epoch": 1.4254509018036072, + "grad_norm": 35.635885236035946, + "learning_rate": 6.292687473000638e-06, + "loss": 2.8939, + "step": 7113 + }, + { + "epoch": 1.4256513026052104, + "grad_norm": 26.520676176854685, + "learning_rate": 6.291561191914333e-06, + "loss": 2.5086, + "step": 7114 + }, + { + "epoch": 1.4258517034068137, + "grad_norm": 26.111853689220744, + "learning_rate": 6.290434840603938e-06, + "loss": 3.3428, + "step": 7115 + }, + { + "epoch": 1.4260521042084169, + "grad_norm": 30.00891598070566, + "learning_rate": 6.289308419130697e-06, + "loss": 2.6648, + "step": 7116 + }, + { + "epoch": 1.42625250501002, + "grad_norm": 40.13334850399352, + "learning_rate": 6.288181927555854e-06, + "loss": 2.9682, + "step": 7117 + }, + { + "epoch": 1.4264529058116233, + "grad_norm": 28.810416042854474, + "learning_rate": 6.287055365940657e-06, + "loss": 2.7539, + "step": 7118 + }, + { + "epoch": 1.4266533066132265, + "grad_norm": 23.07247811576958, + "learning_rate": 6.285928734346362e-06, + "loss": 2.6197, + "step": 7119 + }, + { + "epoch": 1.4268537074148298, + "grad_norm": 60.94164893525912, + "learning_rate": 6.284802032834222e-06, + "loss": 2.7254, + "step": 7120 + }, + { + "epoch": 1.4270541082164327, + "grad_norm": 42.5722535190214, + "learning_rate": 6.283675261465498e-06, + "loss": 3.2706, + "step": 7121 + }, + { + "epoch": 1.4272545090180362, + "grad_norm": 33.73909913430544, + "learning_rate": 6.282548420301458e-06, + "loss": 3.3726, + "step": 7122 + }, + { + "epoch": 1.4274549098196392, + "grad_norm": 22.661528237915142, + "learning_rate": 6.2814215094033636e-06, + "loss": 2.9138, + "step": 7123 + }, + { + "epoch": 1.4276553106212426, + "grad_norm": 18.939638591569203, + "learning_rate": 6.280294528832493e-06, + "loss": 3.1398, + "step": 7124 + }, + { + "epoch": 1.4278557114228456, + "grad_norm": 25.664406073815513, + "learning_rate": 6.279167478650118e-06, + "loss": 2.6818, + "step": 7125 + }, + { + "epoch": 1.4280561122244488, + "grad_norm": 23.264756393875185, + "learning_rate": 6.278040358917519e-06, + "loss": 2.915, + "step": 7126 + }, + { + "epoch": 1.428256513026052, + "grad_norm": 25.81670487830907, + "learning_rate": 6.27691316969598e-06, + "loss": 2.797, + "step": 7127 + }, + { + "epoch": 1.4284569138276553, + "grad_norm": 28.17254990623769, + "learning_rate": 6.275785911046785e-06, + "loss": 2.9251, + "step": 7128 + }, + { + "epoch": 1.4286573146292585, + "grad_norm": 51.002494703081084, + "learning_rate": 6.274658583031228e-06, + "loss": 2.6378, + "step": 7129 + }, + { + "epoch": 1.4288577154308617, + "grad_norm": 21.766446891439962, + "learning_rate": 6.2735311857106014e-06, + "loss": 2.3155, + "step": 7130 + }, + { + "epoch": 1.429058116232465, + "grad_norm": 22.460003241096075, + "learning_rate": 6.272403719146205e-06, + "loss": 2.675, + "step": 7131 + }, + { + "epoch": 1.4292585170340681, + "grad_norm": 32.324674656829515, + "learning_rate": 
6.271276183399342e-06, + "loss": 2.4575, + "step": 7132 + }, + { + "epoch": 1.4294589178356714, + "grad_norm": 28.44588488838507, + "learning_rate": 6.270148578531314e-06, + "loss": 2.2798, + "step": 7133 + }, + { + "epoch": 1.4296593186372746, + "grad_norm": 19.650556698930718, + "learning_rate": 6.269020904603435e-06, + "loss": 2.7379, + "step": 7134 + }, + { + "epoch": 1.4298597194388778, + "grad_norm": 78.92526655521479, + "learning_rate": 6.267893161677013e-06, + "loss": 3.2964, + "step": 7135 + }, + { + "epoch": 1.430060120240481, + "grad_norm": 27.39010639718572, + "learning_rate": 6.266765349813369e-06, + "loss": 2.6666, + "step": 7136 + }, + { + "epoch": 1.4302605210420842, + "grad_norm": 22.44440550563221, + "learning_rate": 6.265637469073824e-06, + "loss": 2.7015, + "step": 7137 + }, + { + "epoch": 1.4304609218436874, + "grad_norm": 29.692339807499028, + "learning_rate": 6.2645095195197015e-06, + "loss": 3.3263, + "step": 7138 + }, + { + "epoch": 1.4306613226452907, + "grad_norm": 29.966251529043067, + "learning_rate": 6.263381501212331e-06, + "loss": 2.7693, + "step": 7139 + }, + { + "epoch": 1.4308617234468937, + "grad_norm": 22.04233811031559, + "learning_rate": 6.262253414213042e-06, + "loss": 2.4777, + "step": 7140 + }, + { + "epoch": 1.431062124248497, + "grad_norm": 18.398353449744278, + "learning_rate": 6.261125258583172e-06, + "loss": 2.9445, + "step": 7141 + }, + { + "epoch": 1.4312625250501, + "grad_norm": 22.57392017144619, + "learning_rate": 6.2599970343840585e-06, + "loss": 3.0339, + "step": 7142 + }, + { + "epoch": 1.4314629258517035, + "grad_norm": 25.899481231996095, + "learning_rate": 6.258868741677047e-06, + "loss": 3.4685, + "step": 7143 + }, + { + "epoch": 1.4316633266533065, + "grad_norm": 24.646062594733085, + "learning_rate": 6.257740380523486e-06, + "loss": 2.7543, + "step": 7144 + }, + { + "epoch": 1.4318637274549098, + "grad_norm": 34.307885558680645, + "learning_rate": 6.256611950984722e-06, + "loss": 2.6976, + "step": 7145 + }, + { + "epoch": 1.432064128256513, + "grad_norm": 19.160619037658638, + "learning_rate": 6.255483453122113e-06, + "loss": 2.9763, + "step": 7146 + }, + { + "epoch": 1.4322645290581162, + "grad_norm": 16.840610624030454, + "learning_rate": 6.2543548869970135e-06, + "loss": 2.6092, + "step": 7147 + }, + { + "epoch": 1.4324649298597194, + "grad_norm": 28.313394512271287, + "learning_rate": 6.253226252670788e-06, + "loss": 2.7231, + "step": 7148 + }, + { + "epoch": 1.4326653306613226, + "grad_norm": 24.42347645539045, + "learning_rate": 6.252097550204801e-06, + "loss": 3.2577, + "step": 7149 + }, + { + "epoch": 1.4328657314629258, + "grad_norm": 29.147671568388784, + "learning_rate": 6.250968779660421e-06, + "loss": 2.8255, + "step": 7150 + }, + { + "epoch": 1.433066132264529, + "grad_norm": 28.431652720067692, + "learning_rate": 6.249839941099023e-06, + "loss": 2.8357, + "step": 7151 + }, + { + "epoch": 1.4332665330661323, + "grad_norm": 22.783645176949953, + "learning_rate": 6.248711034581983e-06, + "loss": 2.5712, + "step": 7152 + }, + { + "epoch": 1.4334669338677355, + "grad_norm": 17.8806568324761, + "learning_rate": 6.2475820601706795e-06, + "loss": 2.9154, + "step": 7153 + }, + { + "epoch": 1.4336673346693387, + "grad_norm": 30.72641249550183, + "learning_rate": 6.246453017926497e-06, + "loss": 2.7169, + "step": 7154 + }, + { + "epoch": 1.433867735470942, + "grad_norm": 37.63139011353141, + "learning_rate": 6.245323907910825e-06, + "loss": 2.8548, + "step": 7155 + }, + { + "epoch": 1.4340681362725451, + "grad_norm": 
24.096630289535952, + "learning_rate": 6.244194730185056e-06, + "loss": 2.4631, + "step": 7156 + }, + { + "epoch": 1.4342685370741484, + "grad_norm": 19.33691245186243, + "learning_rate": 6.24306548481058e-06, + "loss": 2.6141, + "step": 7157 + }, + { + "epoch": 1.4344689378757516, + "grad_norm": 20.43484722589181, + "learning_rate": 6.241936171848799e-06, + "loss": 2.5734, + "step": 7158 + }, + { + "epoch": 1.4346693386773546, + "grad_norm": 33.63102941198163, + "learning_rate": 6.240806791361117e-06, + "loss": 2.5928, + "step": 7159 + }, + { + "epoch": 1.434869739478958, + "grad_norm": 41.32580290204217, + "learning_rate": 6.239677343408937e-06, + "loss": 2.9476, + "step": 7160 + }, + { + "epoch": 1.435070140280561, + "grad_norm": 27.322413669569645, + "learning_rate": 6.23854782805367e-06, + "loss": 2.6435, + "step": 7161 + }, + { + "epoch": 1.4352705410821645, + "grad_norm": 35.30394478701519, + "learning_rate": 6.2374182453567286e-06, + "loss": 2.8389, + "step": 7162 + }, + { + "epoch": 1.4354709418837674, + "grad_norm": 19.774742231839568, + "learning_rate": 6.236288595379532e-06, + "loss": 2.7136, + "step": 7163 + }, + { + "epoch": 1.4356713426853707, + "grad_norm": 26.636338261004724, + "learning_rate": 6.235158878183498e-06, + "loss": 2.5031, + "step": 7164 + }, + { + "epoch": 1.4358717434869739, + "grad_norm": 29.60617996294171, + "learning_rate": 6.234029093830053e-06, + "loss": 2.6546, + "step": 7165 + }, + { + "epoch": 1.436072144288577, + "grad_norm": 35.07768799700286, + "learning_rate": 6.232899242380626e-06, + "loss": 2.4781, + "step": 7166 + }, + { + "epoch": 1.4362725450901803, + "grad_norm": 20.633345856229386, + "learning_rate": 6.2317693238966445e-06, + "loss": 2.8694, + "step": 7167 + }, + { + "epoch": 1.4364729458917835, + "grad_norm": 45.56732503346603, + "learning_rate": 6.230639338439549e-06, + "loss": 3.4286, + "step": 7168 + }, + { + "epoch": 1.4366733466933868, + "grad_norm": 24.496703301407425, + "learning_rate": 6.229509286070775e-06, + "loss": 2.3108, + "step": 7169 + }, + { + "epoch": 1.43687374749499, + "grad_norm": 34.14001433630055, + "learning_rate": 6.228379166851766e-06, + "loss": 3.0561, + "step": 7170 + }, + { + "epoch": 1.4370741482965932, + "grad_norm": 25.342467093662968, + "learning_rate": 6.22724898084397e-06, + "loss": 3.0744, + "step": 7171 + }, + { + "epoch": 1.4372745490981964, + "grad_norm": 21.180184131447707, + "learning_rate": 6.226118728108834e-06, + "loss": 2.6846, + "step": 7172 + }, + { + "epoch": 1.4374749498997996, + "grad_norm": 33.84296005885525, + "learning_rate": 6.224988408707813e-06, + "loss": 1.842, + "step": 7173 + }, + { + "epoch": 1.4376753507014028, + "grad_norm": 24.939337670430707, + "learning_rate": 6.223858022702363e-06, + "loss": 2.6696, + "step": 7174 + }, + { + "epoch": 1.437875751503006, + "grad_norm": 31.64202007283307, + "learning_rate": 6.222727570153947e-06, + "loss": 2.6697, + "step": 7175 + }, + { + "epoch": 1.4380761523046093, + "grad_norm": 26.529416738351696, + "learning_rate": 6.221597051124029e-06, + "loss": 2.8812, + "step": 7176 + }, + { + "epoch": 1.4382765531062125, + "grad_norm": 22.82303027630989, + "learning_rate": 6.2204664656740745e-06, + "loss": 2.5912, + "step": 7177 + }, + { + "epoch": 1.4384769539078157, + "grad_norm": 31.64233427264563, + "learning_rate": 6.219335813865558e-06, + "loss": 2.2836, + "step": 7178 + }, + { + "epoch": 1.438677354709419, + "grad_norm": 27.576209918648846, + "learning_rate": 6.218205095759951e-06, + "loss": 2.9347, + "step": 7179 + }, + { + "epoch": 
1.438877755511022, + "grad_norm": 38.847319606676685, + "learning_rate": 6.217074311418736e-06, + "loss": 3.3079, + "step": 7180 + }, + { + "epoch": 1.4390781563126254, + "grad_norm": 20.579109892400833, + "learning_rate": 6.215943460903395e-06, + "loss": 2.7815, + "step": 7181 + }, + { + "epoch": 1.4392785571142284, + "grad_norm": 25.923109247798056, + "learning_rate": 6.214812544275413e-06, + "loss": 2.8133, + "step": 7182 + }, + { + "epoch": 1.4394789579158316, + "grad_norm": 24.04406445109883, + "learning_rate": 6.213681561596279e-06, + "loss": 2.1739, + "step": 7183 + }, + { + "epoch": 1.4396793587174348, + "grad_norm": 30.09604026187886, + "learning_rate": 6.212550512927486e-06, + "loss": 3.1509, + "step": 7184 + }, + { + "epoch": 1.439879759519038, + "grad_norm": 53.02036600316503, + "learning_rate": 6.211419398330534e-06, + "loss": 3.1525, + "step": 7185 + }, + { + "epoch": 1.4400801603206412, + "grad_norm": 27.053282927046556, + "learning_rate": 6.210288217866919e-06, + "loss": 2.7124, + "step": 7186 + }, + { + "epoch": 1.4402805611222445, + "grad_norm": 25.25859033617991, + "learning_rate": 6.209156971598148e-06, + "loss": 2.427, + "step": 7187 + }, + { + "epoch": 1.4404809619238477, + "grad_norm": 28.719315034169455, + "learning_rate": 6.208025659585728e-06, + "loss": 2.6103, + "step": 7188 + }, + { + "epoch": 1.4406813627254509, + "grad_norm": 27.440311898475485, + "learning_rate": 6.206894281891169e-06, + "loss": 3.1544, + "step": 7189 + }, + { + "epoch": 1.440881763527054, + "grad_norm": 27.353558251021713, + "learning_rate": 6.205762838575988e-06, + "loss": 2.4013, + "step": 7190 + }, + { + "epoch": 1.4410821643286573, + "grad_norm": 39.262388901401806, + "learning_rate": 6.2046313297017e-06, + "loss": 2.6421, + "step": 7191 + }, + { + "epoch": 1.4412825651302605, + "grad_norm": 26.333142589100273, + "learning_rate": 6.203499755329827e-06, + "loss": 2.4509, + "step": 7192 + }, + { + "epoch": 1.4414829659318638, + "grad_norm": 23.76320501663785, + "learning_rate": 6.202368115521898e-06, + "loss": 2.5551, + "step": 7193 + }, + { + "epoch": 1.441683366733467, + "grad_norm": 52.04964855349741, + "learning_rate": 6.20123641033944e-06, + "loss": 2.1622, + "step": 7194 + }, + { + "epoch": 1.4418837675350702, + "grad_norm": 24.12368578467835, + "learning_rate": 6.200104639843984e-06, + "loss": 2.6094, + "step": 7195 + }, + { + "epoch": 1.4420841683366734, + "grad_norm": 22.065139527900367, + "learning_rate": 6.198972804097068e-06, + "loss": 2.3294, + "step": 7196 + }, + { + "epoch": 1.4422845691382766, + "grad_norm": 24.183800296902472, + "learning_rate": 6.197840903160229e-06, + "loss": 2.4598, + "step": 7197 + }, + { + "epoch": 1.4424849699398798, + "grad_norm": 29.359261374906865, + "learning_rate": 6.1967089370950126e-06, + "loss": 2.4356, + "step": 7198 + }, + { + "epoch": 1.4426853707414828, + "grad_norm": 44.15115475801079, + "learning_rate": 6.1955769059629655e-06, + "loss": 2.7114, + "step": 7199 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 26.632616185226585, + "learning_rate": 6.194444809825638e-06, + "loss": 2.7357, + "step": 7200 + }, + { + "epoch": 1.4430861723446893, + "grad_norm": 29.447507324964402, + "learning_rate": 6.193312648744582e-06, + "loss": 3.2028, + "step": 7201 + }, + { + "epoch": 1.4432865731462927, + "grad_norm": 24.7199845957118, + "learning_rate": 6.192180422781355e-06, + "loss": 2.4286, + "step": 7202 + }, + { + "epoch": 1.4434869739478957, + "grad_norm": 44.506225429483905, + "learning_rate": 6.191048131997521e-06, + "loss": 2.1398, + 
"step": 7203 + }, + { + "epoch": 1.443687374749499, + "grad_norm": 25.814992691586514, + "learning_rate": 6.18991577645464e-06, + "loss": 2.7184, + "step": 7204 + }, + { + "epoch": 1.4438877755511021, + "grad_norm": 38.59951492716146, + "learning_rate": 6.188783356214282e-06, + "loss": 3.2649, + "step": 7205 + }, + { + "epoch": 1.4440881763527054, + "grad_norm": 27.17242235456548, + "learning_rate": 6.187650871338018e-06, + "loss": 2.4073, + "step": 7206 + }, + { + "epoch": 1.4442885771543086, + "grad_norm": 26.06955014666368, + "learning_rate": 6.186518321887424e-06, + "loss": 2.8725, + "step": 7207 + }, + { + "epoch": 1.4444889779559118, + "grad_norm": 28.912172800537313, + "learning_rate": 6.1853857079240766e-06, + "loss": 3.2292, + "step": 7208 + }, + { + "epoch": 1.444689378757515, + "grad_norm": 28.03393778481264, + "learning_rate": 6.184253029509558e-06, + "loss": 2.7159, + "step": 7209 + }, + { + "epoch": 1.4448897795591182, + "grad_norm": 21.74923146860944, + "learning_rate": 6.183120286705455e-06, + "loss": 2.0082, + "step": 7210 + }, + { + "epoch": 1.4450901803607215, + "grad_norm": 25.2129006600997, + "learning_rate": 6.181987479573355e-06, + "loss": 2.9756, + "step": 7211 + }, + { + "epoch": 1.4452905811623247, + "grad_norm": 21.80867443613061, + "learning_rate": 6.180854608174851e-06, + "loss": 2.9829, + "step": 7212 + }, + { + "epoch": 1.445490981963928, + "grad_norm": 21.89810502499575, + "learning_rate": 6.179721672571539e-06, + "loss": 3.0737, + "step": 7213 + }, + { + "epoch": 1.445691382765531, + "grad_norm": 25.62182101340072, + "learning_rate": 6.1785886728250186e-06, + "loss": 2.8012, + "step": 7214 + }, + { + "epoch": 1.4458917835671343, + "grad_norm": 20.704070751342194, + "learning_rate": 6.177455608996892e-06, + "loss": 2.3899, + "step": 7215 + }, + { + "epoch": 1.4460921843687375, + "grad_norm": 22.874283240130968, + "learning_rate": 6.1763224811487654e-06, + "loss": 2.8492, + "step": 7216 + }, + { + "epoch": 1.4462925851703408, + "grad_norm": 31.143127803431845, + "learning_rate": 6.17518928934225e-06, + "loss": 2.7874, + "step": 7217 + }, + { + "epoch": 1.4464929859719438, + "grad_norm": 20.438414471094298, + "learning_rate": 6.1740560336389576e-06, + "loss": 2.7498, + "step": 7218 + }, + { + "epoch": 1.4466933867735472, + "grad_norm": 33.09281208617481, + "learning_rate": 6.1729227141005045e-06, + "loss": 2.6605, + "step": 7219 + }, + { + "epoch": 1.4468937875751502, + "grad_norm": 25.936690214312552, + "learning_rate": 6.171789330788515e-06, + "loss": 2.8722, + "step": 7220 + }, + { + "epoch": 1.4470941883767536, + "grad_norm": 24.65607899769425, + "learning_rate": 6.170655883764607e-06, + "loss": 1.9989, + "step": 7221 + }, + { + "epoch": 1.4472945891783566, + "grad_norm": 24.982608361897753, + "learning_rate": 6.169522373090413e-06, + "loss": 3.0844, + "step": 7222 + }, + { + "epoch": 1.4474949899799598, + "grad_norm": 24.841149679121134, + "learning_rate": 6.168388798827558e-06, + "loss": 3.0026, + "step": 7223 + }, + { + "epoch": 1.447695390781563, + "grad_norm": 27.060377234010254, + "learning_rate": 6.167255161037681e-06, + "loss": 2.6178, + "step": 7224 + }, + { + "epoch": 1.4478957915831663, + "grad_norm": 30.630854956742542, + "learning_rate": 6.166121459782419e-06, + "loss": 2.6759, + "step": 7225 + }, + { + "epoch": 1.4480961923847695, + "grad_norm": 20.85777778088713, + "learning_rate": 6.16498769512341e-06, + "loss": 2.6566, + "step": 7226 + }, + { + "epoch": 1.4482965931863727, + "grad_norm": 20.92281399883483, + "learning_rate": 
6.163853867122302e-06, + "loss": 2.627, + "step": 7227 + }, + { + "epoch": 1.448496993987976, + "grad_norm": 27.17142510190696, + "learning_rate": 6.162719975840741e-06, + "loss": 3.0109, + "step": 7228 + }, + { + "epoch": 1.4486973947895792, + "grad_norm": 29.984408134416423, + "learning_rate": 6.161586021340378e-06, + "loss": 3.5079, + "step": 7229 + }, + { + "epoch": 1.4488977955911824, + "grad_norm": 34.53843284815313, + "learning_rate": 6.160452003682867e-06, + "loss": 2.9534, + "step": 7230 + }, + { + "epoch": 1.4490981963927856, + "grad_norm": 24.577177464987113, + "learning_rate": 6.1593179229298694e-06, + "loss": 2.8285, + "step": 7231 + }, + { + "epoch": 1.4492985971943888, + "grad_norm": 34.45177032078319, + "learning_rate": 6.1581837791430455e-06, + "loss": 2.4085, + "step": 7232 + }, + { + "epoch": 1.449498997995992, + "grad_norm": 32.53072854445845, + "learning_rate": 6.157049572384059e-06, + "loss": 3.1312, + "step": 7233 + }, + { + "epoch": 1.4496993987975952, + "grad_norm": 28.236218251367557, + "learning_rate": 6.155915302714579e-06, + "loss": 2.895, + "step": 7234 + }, + { + "epoch": 1.4498997995991985, + "grad_norm": 32.06699305414942, + "learning_rate": 6.154780970196278e-06, + "loss": 2.9393, + "step": 7235 + }, + { + "epoch": 1.4501002004008017, + "grad_norm": 20.846600000318187, + "learning_rate": 6.153646574890829e-06, + "loss": 2.5461, + "step": 7236 + }, + { + "epoch": 1.4503006012024047, + "grad_norm": 32.0865694158658, + "learning_rate": 6.152512116859917e-06, + "loss": 2.8099, + "step": 7237 + }, + { + "epoch": 1.450501002004008, + "grad_norm": 24.381679720060628, + "learning_rate": 6.151377596165217e-06, + "loss": 2.4467, + "step": 7238 + }, + { + "epoch": 1.450701402805611, + "grad_norm": 25.262626973579444, + "learning_rate": 6.150243012868419e-06, + "loss": 2.5802, + "step": 7239 + }, + { + "epoch": 1.4509018036072145, + "grad_norm": 32.74046974246736, + "learning_rate": 6.1491083670312094e-06, + "loss": 2.9638, + "step": 7240 + }, + { + "epoch": 1.4511022044088175, + "grad_norm": 21.97781597770605, + "learning_rate": 6.147973658715281e-06, + "loss": 3.1443, + "step": 7241 + }, + { + "epoch": 1.4513026052104208, + "grad_norm": 37.66171943861282, + "learning_rate": 6.1468388879823315e-06, + "loss": 2.4278, + "step": 7242 + }, + { + "epoch": 1.451503006012024, + "grad_norm": 27.078785546018235, + "learning_rate": 6.1457040548940585e-06, + "loss": 2.8743, + "step": 7243 + }, + { + "epoch": 1.4517034068136272, + "grad_norm": 62.54075543019432, + "learning_rate": 6.1445691595121644e-06, + "loss": 2.8949, + "step": 7244 + }, + { + "epoch": 1.4519038076152304, + "grad_norm": 23.43659246563863, + "learning_rate": 6.1434342018983566e-06, + "loss": 2.3158, + "step": 7245 + }, + { + "epoch": 1.4521042084168336, + "grad_norm": 32.25021925167602, + "learning_rate": 6.142299182114342e-06, + "loss": 3.2223, + "step": 7246 + }, + { + "epoch": 1.4523046092184368, + "grad_norm": 21.3634556716685, + "learning_rate": 6.141164100221837e-06, + "loss": 2.9578, + "step": 7247 + }, + { + "epoch": 1.45250501002004, + "grad_norm": 24.13453382508502, + "learning_rate": 6.1400289562825525e-06, + "loss": 2.914, + "step": 7248 + }, + { + "epoch": 1.4527054108216433, + "grad_norm": 24.976680830063255, + "learning_rate": 6.138893750358212e-06, + "loss": 2.7707, + "step": 7249 + }, + { + "epoch": 1.4529058116232465, + "grad_norm": 40.74836617912143, + "learning_rate": 6.137758482510537e-06, + "loss": 3.2058, + "step": 7250 + }, + { + "epoch": 1.4531062124248497, + "grad_norm": 
21.86646889873503, + "learning_rate": 6.136623152801255e-06, + "loss": 2.7074, + "step": 7251 + }, + { + "epoch": 1.453306613226453, + "grad_norm": 24.713970437437382, + "learning_rate": 6.1354877612920925e-06, + "loss": 2.5483, + "step": 7252 + }, + { + "epoch": 1.4535070140280562, + "grad_norm": 21.897407477676165, + "learning_rate": 6.1343523080447855e-06, + "loss": 2.7536, + "step": 7253 + }, + { + "epoch": 1.4537074148296594, + "grad_norm": 18.949012683326828, + "learning_rate": 6.133216793121068e-06, + "loss": 2.8171, + "step": 7254 + }, + { + "epoch": 1.4539078156312626, + "grad_norm": 32.315445862565795, + "learning_rate": 6.132081216582681e-06, + "loss": 2.9774, + "step": 7255 + }, + { + "epoch": 1.4541082164328658, + "grad_norm": 31.080256753242548, + "learning_rate": 6.130945578491369e-06, + "loss": 2.6882, + "step": 7256 + }, + { + "epoch": 1.454308617234469, + "grad_norm": 63.22074926916907, + "learning_rate": 6.129809878908875e-06, + "loss": 2.0932, + "step": 7257 + }, + { + "epoch": 1.454509018036072, + "grad_norm": 18.32700229497039, + "learning_rate": 6.128674117896949e-06, + "loss": 2.4035, + "step": 7258 + }, + { + "epoch": 1.4547094188376755, + "grad_norm": 19.71569470053947, + "learning_rate": 6.127538295517348e-06, + "loss": 2.5364, + "step": 7259 + }, + { + "epoch": 1.4549098196392785, + "grad_norm": 29.161910054010157, + "learning_rate": 6.1264024118318235e-06, + "loss": 2.8505, + "step": 7260 + }, + { + "epoch": 1.455110220440882, + "grad_norm": 34.7588609308861, + "learning_rate": 6.125266466902136e-06, + "loss": 2.8291, + "step": 7261 + }, + { + "epoch": 1.455310621242485, + "grad_norm": 37.14680908444429, + "learning_rate": 6.124130460790051e-06, + "loss": 2.834, + "step": 7262 + }, + { + "epoch": 1.455511022044088, + "grad_norm": 24.250883771500224, + "learning_rate": 6.122994393557333e-06, + "loss": 2.9135, + "step": 7263 + }, + { + "epoch": 1.4557114228456913, + "grad_norm": 22.906812238425108, + "learning_rate": 6.121858265265754e-06, + "loss": 1.9082, + "step": 7264 + }, + { + "epoch": 1.4559118236472945, + "grad_norm": 21.700798158946682, + "learning_rate": 6.120722075977083e-06, + "loss": 2.622, + "step": 7265 + }, + { + "epoch": 1.4561122244488978, + "grad_norm": 21.007947857939346, + "learning_rate": 6.119585825753099e-06, + "loss": 2.6417, + "step": 7266 + }, + { + "epoch": 1.456312625250501, + "grad_norm": 21.519267779816545, + "learning_rate": 6.118449514655581e-06, + "loss": 2.3693, + "step": 7267 + }, + { + "epoch": 1.4565130260521042, + "grad_norm": 29.26896695169624, + "learning_rate": 6.117313142746312e-06, + "loss": 3.0905, + "step": 7268 + }, + { + "epoch": 1.4567134268537074, + "grad_norm": 35.54295806293411, + "learning_rate": 6.116176710087078e-06, + "loss": 3.8714, + "step": 7269 + }, + { + "epoch": 1.4569138276553106, + "grad_norm": 23.916988533221257, + "learning_rate": 6.115040216739669e-06, + "loss": 2.2724, + "step": 7270 + }, + { + "epoch": 1.4571142284569139, + "grad_norm": 21.901723703998048, + "learning_rate": 6.113903662765879e-06, + "loss": 3.0521, + "step": 7271 + }, + { + "epoch": 1.457314629258517, + "grad_norm": 31.741082384382928, + "learning_rate": 6.1127670482275e-06, + "loss": 3.4172, + "step": 7272 + }, + { + "epoch": 1.4575150300601203, + "grad_norm": 25.756065283761515, + "learning_rate": 6.1116303731863346e-06, + "loss": 2.6497, + "step": 7273 + }, + { + "epoch": 1.4577154308617235, + "grad_norm": 24.838307452782544, + "learning_rate": 6.110493637704185e-06, + "loss": 2.902, + "step": 7274 + }, + { + "epoch": 
1.4579158316633267, + "grad_norm": 23.96510118190845, + "learning_rate": 6.109356841842857e-06, + "loss": 2.4688, + "step": 7275 + }, + { + "epoch": 1.45811623246493, + "grad_norm": 22.77294709011027, + "learning_rate": 6.108219985664161e-06, + "loss": 2.8342, + "step": 7276 + }, + { + "epoch": 1.458316633266533, + "grad_norm": 25.47485332865845, + "learning_rate": 6.1070830692299074e-06, + "loss": 2.5971, + "step": 7277 + }, + { + "epoch": 1.4585170340681364, + "grad_norm": 34.33410344268842, + "learning_rate": 6.1059460926019145e-06, + "loss": 2.8789, + "step": 7278 + }, + { + "epoch": 1.4587174348697394, + "grad_norm": 47.20112431087792, + "learning_rate": 6.104809055841997e-06, + "loss": 2.8995, + "step": 7279 + }, + { + "epoch": 1.4589178356713428, + "grad_norm": 31.81820518945881, + "learning_rate": 6.103671959011982e-06, + "loss": 2.5084, + "step": 7280 + }, + { + "epoch": 1.4591182364729458, + "grad_norm": 34.14089581142127, + "learning_rate": 6.102534802173694e-06, + "loss": 2.8742, + "step": 7281 + }, + { + "epoch": 1.459318637274549, + "grad_norm": 20.115200426765057, + "learning_rate": 6.1013975853889615e-06, + "loss": 2.7261, + "step": 7282 + }, + { + "epoch": 1.4595190380761522, + "grad_norm": 49.98475309225058, + "learning_rate": 6.100260308719617e-06, + "loss": 2.8188, + "step": 7283 + }, + { + "epoch": 1.4597194388777555, + "grad_norm": 28.370396324615832, + "learning_rate": 6.099122972227493e-06, + "loss": 2.7531, + "step": 7284 + }, + { + "epoch": 1.4599198396793587, + "grad_norm": 25.391725312972742, + "learning_rate": 6.097985575974432e-06, + "loss": 2.5059, + "step": 7285 + }, + { + "epoch": 1.460120240480962, + "grad_norm": 24.499783131254063, + "learning_rate": 6.096848120022274e-06, + "loss": 3.2102, + "step": 7286 + }, + { + "epoch": 1.4603206412825651, + "grad_norm": 21.835179191889498, + "learning_rate": 6.095710604432865e-06, + "loss": 2.1639, + "step": 7287 + }, + { + "epoch": 1.4605210420841683, + "grad_norm": 20.909690744603793, + "learning_rate": 6.094573029268054e-06, + "loss": 2.699, + "step": 7288 + }, + { + "epoch": 1.4607214428857715, + "grad_norm": 33.330958386984605, + "learning_rate": 6.093435394589689e-06, + "loss": 2.6837, + "step": 7289 + }, + { + "epoch": 1.4609218436873748, + "grad_norm": 58.36808816850378, + "learning_rate": 6.0922977004596295e-06, + "loss": 3.4373, + "step": 7290 + }, + { + "epoch": 1.461122244488978, + "grad_norm": 50.163814813110115, + "learning_rate": 6.091159946939732e-06, + "loss": 3.4351, + "step": 7291 + }, + { + "epoch": 1.4613226452905812, + "grad_norm": 22.9448369559119, + "learning_rate": 6.090022134091855e-06, + "loss": 3.0779, + "step": 7292 + }, + { + "epoch": 1.4615230460921844, + "grad_norm": 20.4723223306832, + "learning_rate": 6.088884261977869e-06, + "loss": 2.7179, + "step": 7293 + }, + { + "epoch": 1.4617234468937876, + "grad_norm": 27.98790721921142, + "learning_rate": 6.0877463306596365e-06, + "loss": 2.7347, + "step": 7294 + }, + { + "epoch": 1.4619238476953909, + "grad_norm": 22.0277069044917, + "learning_rate": 6.0866083401990295e-06, + "loss": 2.6838, + "step": 7295 + }, + { + "epoch": 1.4621242484969939, + "grad_norm": 19.286682317388678, + "learning_rate": 6.085470290657925e-06, + "loss": 2.9069, + "step": 7296 + }, + { + "epoch": 1.4623246492985973, + "grad_norm": 23.682315209252078, + "learning_rate": 6.084332182098197e-06, + "loss": 2.8367, + "step": 7297 + }, + { + "epoch": 1.4625250501002003, + "grad_norm": 21.160143022885187, + "learning_rate": 6.083194014581728e-06, + "loss": 2.4454, + 
"step": 7298 + }, + { + "epoch": 1.4627254509018037, + "grad_norm": 44.86222571028306, + "learning_rate": 6.082055788170401e-06, + "loss": 3.4308, + "step": 7299 + }, + { + "epoch": 1.4629258517034067, + "grad_norm": 25.616088264594282, + "learning_rate": 6.080917502926106e-06, + "loss": 2.7343, + "step": 7300 + }, + { + "epoch": 1.46312625250501, + "grad_norm": 29.782714556016494, + "learning_rate": 6.079779158910728e-06, + "loss": 2.7525, + "step": 7301 + }, + { + "epoch": 1.4633266533066132, + "grad_norm": 23.02261205665528, + "learning_rate": 6.078640756186165e-06, + "loss": 2.2572, + "step": 7302 + }, + { + "epoch": 1.4635270541082164, + "grad_norm": 20.77461157216628, + "learning_rate": 6.077502294814311e-06, + "loss": 2.9133, + "step": 7303 + }, + { + "epoch": 1.4637274549098196, + "grad_norm": 19.90563692511012, + "learning_rate": 6.076363774857067e-06, + "loss": 3.0382, + "step": 7304 + }, + { + "epoch": 1.4639278557114228, + "grad_norm": 26.94956477798218, + "learning_rate": 6.075225196376335e-06, + "loss": 2.9695, + "step": 7305 + }, + { + "epoch": 1.464128256513026, + "grad_norm": 20.481530278310107, + "learning_rate": 6.074086559434022e-06, + "loss": 2.7843, + "step": 7306 + }, + { + "epoch": 1.4643286573146292, + "grad_norm": 29.936488945723745, + "learning_rate": 6.072947864092037e-06, + "loss": 2.9891, + "step": 7307 + }, + { + "epoch": 1.4645290581162325, + "grad_norm": 30.694839150134175, + "learning_rate": 6.071809110412294e-06, + "loss": 3.1927, + "step": 7308 + }, + { + "epoch": 1.4647294589178357, + "grad_norm": 21.987250287102317, + "learning_rate": 6.070670298456708e-06, + "loss": 2.7156, + "step": 7309 + }, + { + "epoch": 1.464929859719439, + "grad_norm": 27.872954062847576, + "learning_rate": 6.069531428287195e-06, + "loss": 2.7072, + "step": 7310 + }, + { + "epoch": 1.4651302605210421, + "grad_norm": 41.24126899005139, + "learning_rate": 6.0683924999656805e-06, + "loss": 2.7669, + "step": 7311 + }, + { + "epoch": 1.4653306613226453, + "grad_norm": 39.122484117685346, + "learning_rate": 6.06725351355409e-06, + "loss": 3.087, + "step": 7312 + }, + { + "epoch": 1.4655310621242486, + "grad_norm": 26.422741197838313, + "learning_rate": 6.06611446911435e-06, + "loss": 3.0145, + "step": 7313 + }, + { + "epoch": 1.4657314629258518, + "grad_norm": 44.85548127279196, + "learning_rate": 6.0649753667083925e-06, + "loss": 3.0764, + "step": 7314 + }, + { + "epoch": 1.465931863727455, + "grad_norm": 28.10971719321745, + "learning_rate": 6.063836206398153e-06, + "loss": 2.6522, + "step": 7315 + }, + { + "epoch": 1.4661322645290582, + "grad_norm": 25.895920675051226, + "learning_rate": 6.062696988245567e-06, + "loss": 2.6522, + "step": 7316 + }, + { + "epoch": 1.4663326653306612, + "grad_norm": 29.207733846482604, + "learning_rate": 6.061557712312576e-06, + "loss": 2.8251, + "step": 7317 + }, + { + "epoch": 1.4665330661322646, + "grad_norm": 21.96129233679758, + "learning_rate": 6.060418378661127e-06, + "loss": 2.2854, + "step": 7318 + }, + { + "epoch": 1.4667334669338676, + "grad_norm": 30.54753549819864, + "learning_rate": 6.059278987353165e-06, + "loss": 3.5649, + "step": 7319 + }, + { + "epoch": 1.466933867735471, + "grad_norm": 32.63725281013389, + "learning_rate": 6.058139538450642e-06, + "loss": 2.5038, + "step": 7320 + }, + { + "epoch": 1.467134268537074, + "grad_norm": 20.92544404199881, + "learning_rate": 6.057000032015509e-06, + "loss": 2.9545, + "step": 7321 + }, + { + "epoch": 1.4673346693386773, + "grad_norm": 20.465819588318855, + "learning_rate": 
6.055860468109725e-06, + "loss": 2.5124, + "step": 7322 + }, + { + "epoch": 1.4675350701402805, + "grad_norm": 21.806767522453235, + "learning_rate": 6.0547208467952465e-06, + "loss": 2.7326, + "step": 7323 + }, + { + "epoch": 1.4677354709418837, + "grad_norm": 21.663190837284997, + "learning_rate": 6.053581168134041e-06, + "loss": 2.9341, + "step": 7324 + }, + { + "epoch": 1.467935871743487, + "grad_norm": 25.99152429669759, + "learning_rate": 6.052441432188072e-06, + "loss": 2.5259, + "step": 7325 + }, + { + "epoch": 1.4681362725450902, + "grad_norm": 24.998604199977304, + "learning_rate": 6.051301639019308e-06, + "loss": 2.836, + "step": 7326 + }, + { + "epoch": 1.4683366733466934, + "grad_norm": 26.991350141476225, + "learning_rate": 6.050161788689722e-06, + "loss": 3.2106, + "step": 7327 + }, + { + "epoch": 1.4685370741482966, + "grad_norm": 25.24758371940949, + "learning_rate": 6.04902188126129e-06, + "loss": 3.0866, + "step": 7328 + }, + { + "epoch": 1.4687374749498998, + "grad_norm": 21.90358292616697, + "learning_rate": 6.047881916795989e-06, + "loss": 2.7473, + "step": 7329 + }, + { + "epoch": 1.468937875751503, + "grad_norm": 29.665304150718413, + "learning_rate": 6.046741895355802e-06, + "loss": 2.888, + "step": 7330 + }, + { + "epoch": 1.4691382765531062, + "grad_norm": 22.98961563288616, + "learning_rate": 6.045601817002712e-06, + "loss": 3.3432, + "step": 7331 + }, + { + "epoch": 1.4693386773547095, + "grad_norm": 24.963235448476958, + "learning_rate": 6.04446168179871e-06, + "loss": 2.6923, + "step": 7332 + }, + { + "epoch": 1.4695390781563127, + "grad_norm": 24.920568503173044, + "learning_rate": 6.043321489805782e-06, + "loss": 2.666, + "step": 7333 + }, + { + "epoch": 1.469739478957916, + "grad_norm": 29.503383637848753, + "learning_rate": 6.042181241085926e-06, + "loss": 2.6531, + "step": 7334 + }, + { + "epoch": 1.4699398797595191, + "grad_norm": 31.072247115492676, + "learning_rate": 6.041040935701137e-06, + "loss": 2.1704, + "step": 7335 + }, + { + "epoch": 1.4701402805611221, + "grad_norm": 26.333950548117045, + "learning_rate": 6.039900573713415e-06, + "loss": 2.9577, + "step": 7336 + }, + { + "epoch": 1.4703406813627256, + "grad_norm": 30.750040573509285, + "learning_rate": 6.038760155184766e-06, + "loss": 2.8926, + "step": 7337 + }, + { + "epoch": 1.4705410821643286, + "grad_norm": 20.525175291600497, + "learning_rate": 6.037619680177192e-06, + "loss": 2.7, + "step": 7338 + }, + { + "epoch": 1.470741482965932, + "grad_norm": 19.203520385600708, + "learning_rate": 6.0364791487527056e-06, + "loss": 2.865, + "step": 7339 + }, + { + "epoch": 1.470941883767535, + "grad_norm": 41.90581379884263, + "learning_rate": 6.035338560973318e-06, + "loss": 2.3506, + "step": 7340 + }, + { + "epoch": 1.4711422845691382, + "grad_norm": 17.61119932134887, + "learning_rate": 6.034197916901043e-06, + "loss": 2.7385, + "step": 7341 + }, + { + "epoch": 1.4713426853707414, + "grad_norm": 25.8568020001397, + "learning_rate": 6.033057216597901e-06, + "loss": 2.713, + "step": 7342 + }, + { + "epoch": 1.4715430861723446, + "grad_norm": 29.63102368955032, + "learning_rate": 6.031916460125913e-06, + "loss": 2.9654, + "step": 7343 + }, + { + "epoch": 1.4717434869739479, + "grad_norm": 26.621329750586078, + "learning_rate": 6.0307756475471056e-06, + "loss": 2.9326, + "step": 7344 + }, + { + "epoch": 1.471943887775551, + "grad_norm": 17.474650472568815, + "learning_rate": 6.0296347789235035e-06, + "loss": 2.4775, + "step": 7345 + }, + { + "epoch": 1.4721442885771543, + "grad_norm": 
59.865580044909706, + "learning_rate": 6.0284938543171375e-06, + "loss": 2.8284, + "step": 7346 + }, + { + "epoch": 1.4723446893787575, + "grad_norm": 20.463923530473327, + "learning_rate": 6.027352873790044e-06, + "loss": 2.8775, + "step": 7347 + }, + { + "epoch": 1.4725450901803607, + "grad_norm": 31.1536396079631, + "learning_rate": 6.026211837404256e-06, + "loss": 2.8525, + "step": 7348 + }, + { + "epoch": 1.472745490981964, + "grad_norm": 27.211081187277724, + "learning_rate": 6.0250707452218195e-06, + "loss": 2.703, + "step": 7349 + }, + { + "epoch": 1.4729458917835672, + "grad_norm": 20.998465028559295, + "learning_rate": 6.023929597304771e-06, + "loss": 2.5402, + "step": 7350 + }, + { + "epoch": 1.4731462925851704, + "grad_norm": 32.5012105824844, + "learning_rate": 6.022788393715159e-06, + "loss": 2.528, + "step": 7351 + }, + { + "epoch": 1.4733466933867736, + "grad_norm": 21.193819850969067, + "learning_rate": 6.021647134515033e-06, + "loss": 2.2772, + "step": 7352 + }, + { + "epoch": 1.4735470941883768, + "grad_norm": 25.37372615259277, + "learning_rate": 6.020505819766443e-06, + "loss": 2.7979, + "step": 7353 + }, + { + "epoch": 1.47374749498998, + "grad_norm": 30.093173276160677, + "learning_rate": 6.019364449531446e-06, + "loss": 2.8282, + "step": 7354 + }, + { + "epoch": 1.473947895791583, + "grad_norm": 20.6537446238163, + "learning_rate": 6.018223023872099e-06, + "loss": 2.6289, + "step": 7355 + }, + { + "epoch": 1.4741482965931865, + "grad_norm": 19.938316479365522, + "learning_rate": 6.017081542850462e-06, + "loss": 2.5985, + "step": 7356 + }, + { + "epoch": 1.4743486973947895, + "grad_norm": 31.067760681751587, + "learning_rate": 6.015940006528602e-06, + "loss": 3.0754, + "step": 7357 + }, + { + "epoch": 1.474549098196393, + "grad_norm": 26.27242803711202, + "learning_rate": 6.014798414968584e-06, + "loss": 2.6564, + "step": 7358 + }, + { + "epoch": 1.474749498997996, + "grad_norm": 19.109161356383403, + "learning_rate": 6.013656768232478e-06, + "loss": 2.8626, + "step": 7359 + }, + { + "epoch": 1.4749498997995991, + "grad_norm": 27.003907969920007, + "learning_rate": 6.0125150663823564e-06, + "loss": 3.1036, + "step": 7360 + }, + { + "epoch": 1.4751503006012023, + "grad_norm": 27.62984470661515, + "learning_rate": 6.011373309480295e-06, + "loss": 2.4713, + "step": 7361 + }, + { + "epoch": 1.4753507014028056, + "grad_norm": 32.05751274419613, + "learning_rate": 6.010231497588377e-06, + "loss": 3.1301, + "step": 7362 + }, + { + "epoch": 1.4755511022044088, + "grad_norm": 26.573838672813544, + "learning_rate": 6.009089630768679e-06, + "loss": 2.7184, + "step": 7363 + }, + { + "epoch": 1.475751503006012, + "grad_norm": 28.128164369909264, + "learning_rate": 6.0079477090832894e-06, + "loss": 3.1081, + "step": 7364 + }, + { + "epoch": 1.4759519038076152, + "grad_norm": 25.788046064612377, + "learning_rate": 6.006805732594294e-06, + "loss": 3.1211, + "step": 7365 + }, + { + "epoch": 1.4761523046092184, + "grad_norm": 18.97517542902329, + "learning_rate": 6.005663701363784e-06, + "loss": 2.0472, + "step": 7366 + }, + { + "epoch": 1.4763527054108216, + "grad_norm": 42.48692374130973, + "learning_rate": 6.0045216154538555e-06, + "loss": 2.7494, + "step": 7367 + }, + { + "epoch": 1.4765531062124249, + "grad_norm": 26.93469357258903, + "learning_rate": 6.003379474926603e-06, + "loss": 3.2491, + "step": 7368 + }, + { + "epoch": 1.476753507014028, + "grad_norm": 21.420312516394855, + "learning_rate": 6.002237279844129e-06, + "loss": 2.7897, + "step": 7369 + }, + { + "epoch": 
1.4769539078156313, + "grad_norm": 81.05898748058057, + "learning_rate": 6.001095030268533e-06, + "loss": 2.8611, + "step": 7370 + }, + { + "epoch": 1.4771543086172345, + "grad_norm": 32.0811659531202, + "learning_rate": 5.999952726261924e-06, + "loss": 2.7795, + "step": 7371 + }, + { + "epoch": 1.4773547094188377, + "grad_norm": 40.477963700818464, + "learning_rate": 5.998810367886408e-06, + "loss": 2.958, + "step": 7372 + }, + { + "epoch": 1.477555110220441, + "grad_norm": 60.6819167788553, + "learning_rate": 5.997667955204099e-06, + "loss": 3.011, + "step": 7373 + }, + { + "epoch": 1.4777555110220442, + "grad_norm": 20.84017691323037, + "learning_rate": 5.996525488277109e-06, + "loss": 2.6131, + "step": 7374 + }, + { + "epoch": 1.4779559118236474, + "grad_norm": 23.10775061730157, + "learning_rate": 5.9953829671675575e-06, + "loss": 2.6553, + "step": 7375 + }, + { + "epoch": 1.4781563126252504, + "grad_norm": 35.73896094650398, + "learning_rate": 5.994240391937566e-06, + "loss": 3.3129, + "step": 7376 + }, + { + "epoch": 1.4783567134268538, + "grad_norm": 28.512963666984263, + "learning_rate": 5.993097762649256e-06, + "loss": 2.8236, + "step": 7377 + }, + { + "epoch": 1.4785571142284568, + "grad_norm": 23.444766459075996, + "learning_rate": 5.991955079364754e-06, + "loss": 2.954, + "step": 7378 + }, + { + "epoch": 1.4787575150300603, + "grad_norm": 27.65173701105969, + "learning_rate": 5.990812342146191e-06, + "loss": 2.3821, + "step": 7379 + }, + { + "epoch": 1.4789579158316633, + "grad_norm": 27.68637564053746, + "learning_rate": 5.9896695510556964e-06, + "loss": 2.512, + "step": 7380 + }, + { + "epoch": 1.4791583166332665, + "grad_norm": 84.36209553648102, + "learning_rate": 5.988526706155411e-06, + "loss": 2.824, + "step": 7381 + }, + { + "epoch": 1.4793587174348697, + "grad_norm": 29.661051509691454, + "learning_rate": 5.9873838075074675e-06, + "loss": 2.6521, + "step": 7382 + }, + { + "epoch": 1.479559118236473, + "grad_norm": 16.29300211861558, + "learning_rate": 5.986240855174008e-06, + "loss": 2.3792, + "step": 7383 + }, + { + "epoch": 1.4797595190380761, + "grad_norm": 22.77653449675098, + "learning_rate": 5.98509784921718e-06, + "loss": 2.8206, + "step": 7384 + }, + { + "epoch": 1.4799599198396793, + "grad_norm": 22.587022663871938, + "learning_rate": 5.983954789699125e-06, + "loss": 2.266, + "step": 7385 + }, + { + "epoch": 1.4801603206412826, + "grad_norm": 27.48709160419223, + "learning_rate": 5.982811676681996e-06, + "loss": 2.0553, + "step": 7386 + }, + { + "epoch": 1.4803607214428858, + "grad_norm": 23.997582811262475, + "learning_rate": 5.981668510227946e-06, + "loss": 2.6373, + "step": 7387 + }, + { + "epoch": 1.480561122244489, + "grad_norm": 21.3571918632125, + "learning_rate": 5.980525290399131e-06, + "loss": 2.6429, + "step": 7388 + }, + { + "epoch": 1.4807615230460922, + "grad_norm": 29.2733393731601, + "learning_rate": 5.979382017257707e-06, + "loss": 2.9941, + "step": 7389 + }, + { + "epoch": 1.4809619238476954, + "grad_norm": 26.64759730123197, + "learning_rate": 5.978238690865839e-06, + "loss": 2.7302, + "step": 7390 + }, + { + "epoch": 1.4811623246492986, + "grad_norm": 20.754871331330257, + "learning_rate": 5.977095311285687e-06, + "loss": 2.5162, + "step": 7391 + }, + { + "epoch": 1.4813627254509019, + "grad_norm": 22.47803164466735, + "learning_rate": 5.9759518785794215e-06, + "loss": 2.5267, + "step": 7392 + }, + { + "epoch": 1.481563126252505, + "grad_norm": 26.297633238839065, + "learning_rate": 5.974808392809213e-06, + "loss": 2.7298, + "step": 7393 + 
}, + { + "epoch": 1.4817635270541083, + "grad_norm": 22.48070313936613, + "learning_rate": 5.973664854037231e-06, + "loss": 3.1823, + "step": 7394 + }, + { + "epoch": 1.4819639278557113, + "grad_norm": 25.045500062085825, + "learning_rate": 5.972521262325655e-06, + "loss": 3.2817, + "step": 7395 + }, + { + "epoch": 1.4821643286573147, + "grad_norm": 19.192143366641634, + "learning_rate": 5.971377617736663e-06, + "loss": 2.2206, + "step": 7396 + }, + { + "epoch": 1.4823647294589177, + "grad_norm": 18.003236648253427, + "learning_rate": 5.9702339203324344e-06, + "loss": 2.0409, + "step": 7397 + }, + { + "epoch": 1.4825651302605212, + "grad_norm": 35.64026556998716, + "learning_rate": 5.969090170175156e-06, + "loss": 3.0738, + "step": 7398 + }, + { + "epoch": 1.4827655310621242, + "grad_norm": 33.331504026890805, + "learning_rate": 5.967946367327014e-06, + "loss": 2.3272, + "step": 7399 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 21.954132397397096, + "learning_rate": 5.9668025118502005e-06, + "loss": 2.9428, + "step": 7400 + }, + { + "epoch": 1.4831663326653306, + "grad_norm": 28.95130351916707, + "learning_rate": 5.9656586038069075e-06, + "loss": 2.753, + "step": 7401 + }, + { + "epoch": 1.4833667334669338, + "grad_norm": 22.114666845597196, + "learning_rate": 5.964514643259329e-06, + "loss": 2.608, + "step": 7402 + }, + { + "epoch": 1.483567134268537, + "grad_norm": 25.11956104525263, + "learning_rate": 5.963370630269668e-06, + "loss": 2.7368, + "step": 7403 + }, + { + "epoch": 1.4837675350701403, + "grad_norm": 27.086271892958635, + "learning_rate": 5.962226564900121e-06, + "loss": 2.7937, + "step": 7404 + }, + { + "epoch": 1.4839679358717435, + "grad_norm": 26.394600358810784, + "learning_rate": 5.961082447212896e-06, + "loss": 2.9247, + "step": 7405 + }, + { + "epoch": 1.4841683366733467, + "grad_norm": 29.072767015208637, + "learning_rate": 5.959938277270201e-06, + "loss": 2.7272, + "step": 7406 + }, + { + "epoch": 1.48436873747495, + "grad_norm": 20.980472235114984, + "learning_rate": 5.9587940551342434e-06, + "loss": 2.8983, + "step": 7407 + }, + { + "epoch": 1.4845691382765531, + "grad_norm": 19.44172900302252, + "learning_rate": 5.95764978086724e-06, + "loss": 2.7573, + "step": 7408 + }, + { + "epoch": 1.4847695390781563, + "grad_norm": 23.544505668259504, + "learning_rate": 5.956505454531402e-06, + "loss": 3.1115, + "step": 7409 + }, + { + "epoch": 1.4849699398797596, + "grad_norm": 27.639034198518704, + "learning_rate": 5.9553610761889515e-06, + "loss": 3.1357, + "step": 7410 + }, + { + "epoch": 1.4851703406813628, + "grad_norm": 50.38978268627787, + "learning_rate": 5.9542166459021085e-06, + "loss": 3.3499, + "step": 7411 + }, + { + "epoch": 1.485370741482966, + "grad_norm": 35.43725822141885, + "learning_rate": 5.953072163733098e-06, + "loss": 2.767, + "step": 7412 + }, + { + "epoch": 1.4855711422845692, + "grad_norm": 51.15861773116547, + "learning_rate": 5.951927629744149e-06, + "loss": 2.7116, + "step": 7413 + }, + { + "epoch": 1.4857715430861722, + "grad_norm": 32.54070126659839, + "learning_rate": 5.950783043997487e-06, + "loss": 3.0313, + "step": 7414 + }, + { + "epoch": 1.4859719438877756, + "grad_norm": 22.217304408634668, + "learning_rate": 5.949638406555349e-06, + "loss": 2.441, + "step": 7415 + }, + { + "epoch": 1.4861723446893786, + "grad_norm": 29.284886837985074, + "learning_rate": 5.9484937174799685e-06, + "loss": 2.7503, + "step": 7416 + }, + { + "epoch": 1.486372745490982, + "grad_norm": 25.715503649777972, + "learning_rate": 5.9473489768335825e-06, 
+ "loss": 3.239, + "step": 7417 + }, + { + "epoch": 1.486573146292585, + "grad_norm": 34.861489971919696, + "learning_rate": 5.946204184678437e-06, + "loss": 2.6031, + "step": 7418 + }, + { + "epoch": 1.4867735470941883, + "grad_norm": 26.088344585161867, + "learning_rate": 5.945059341076772e-06, + "loss": 3.0095, + "step": 7419 + }, + { + "epoch": 1.4869739478957915, + "grad_norm": 23.259800472673014, + "learning_rate": 5.943914446090837e-06, + "loss": 2.8202, + "step": 7420 + }, + { + "epoch": 1.4871743486973947, + "grad_norm": 38.393034555074124, + "learning_rate": 5.942769499782879e-06, + "loss": 3.1229, + "step": 7421 + }, + { + "epoch": 1.487374749498998, + "grad_norm": 19.139150377093504, + "learning_rate": 5.941624502215152e-06, + "loss": 2.3196, + "step": 7422 + }, + { + "epoch": 1.4875751503006012, + "grad_norm": 62.67993581077662, + "learning_rate": 5.94047945344991e-06, + "loss": 2.739, + "step": 7423 + }, + { + "epoch": 1.4877755511022044, + "grad_norm": 19.622911277563553, + "learning_rate": 5.939334353549412e-06, + "loss": 3.1581, + "step": 7424 + }, + { + "epoch": 1.4879759519038076, + "grad_norm": 21.385412799775047, + "learning_rate": 5.93818920257592e-06, + "loss": 2.8353, + "step": 7425 + }, + { + "epoch": 1.4881763527054108, + "grad_norm": 36.79798398748886, + "learning_rate": 5.937044000591695e-06, + "loss": 3.3728, + "step": 7426 + }, + { + "epoch": 1.488376753507014, + "grad_norm": 48.4306805973025, + "learning_rate": 5.935898747659004e-06, + "loss": 2.8703, + "step": 7427 + }, + { + "epoch": 1.4885771543086173, + "grad_norm": 19.87563130312786, + "learning_rate": 5.934753443840119e-06, + "loss": 2.5363, + "step": 7428 + }, + { + "epoch": 1.4887775551102205, + "grad_norm": 26.22861046902509, + "learning_rate": 5.933608089197307e-06, + "loss": 2.9172, + "step": 7429 + }, + { + "epoch": 1.4889779559118237, + "grad_norm": 22.260444820346944, + "learning_rate": 5.9324626837928455e-06, + "loss": 3.0851, + "step": 7430 + }, + { + "epoch": 1.489178356713427, + "grad_norm": 30.306823772848446, + "learning_rate": 5.9313172276890115e-06, + "loss": 2.4899, + "step": 7431 + }, + { + "epoch": 1.4893787575150301, + "grad_norm": 31.239374786304243, + "learning_rate": 5.930171720948086e-06, + "loss": 2.821, + "step": 7432 + }, + { + "epoch": 1.4895791583166333, + "grad_norm": 31.00332379250705, + "learning_rate": 5.929026163632351e-06, + "loss": 2.6279, + "step": 7433 + }, + { + "epoch": 1.4897795591182366, + "grad_norm": 23.387826901040377, + "learning_rate": 5.927880555804091e-06, + "loss": 2.6304, + "step": 7434 + }, + { + "epoch": 1.4899799599198396, + "grad_norm": 21.909600849179043, + "learning_rate": 5.926734897525596e-06, + "loss": 2.2917, + "step": 7435 + }, + { + "epoch": 1.490180360721443, + "grad_norm": 38.158968723626174, + "learning_rate": 5.9255891888591555e-06, + "loss": 2.3825, + "step": 7436 + }, + { + "epoch": 1.490380761523046, + "grad_norm": 32.988675054958556, + "learning_rate": 5.924443429867067e-06, + "loss": 2.7307, + "step": 7437 + }, + { + "epoch": 1.4905811623246494, + "grad_norm": 28.52072024256722, + "learning_rate": 5.923297620611623e-06, + "loss": 2.6095, + "step": 7438 + }, + { + "epoch": 1.4907815631262524, + "grad_norm": 25.505789452249957, + "learning_rate": 5.9221517611551236e-06, + "loss": 2.4592, + "step": 7439 + }, + { + "epoch": 1.4909819639278556, + "grad_norm": 57.569641475585286, + "learning_rate": 5.921005851559874e-06, + "loss": 2.8851, + "step": 7440 + }, + { + "epoch": 1.4911823647294589, + "grad_norm": 80.62831439860626, + 
"learning_rate": 5.919859891888175e-06, + "loss": 3.2238, + "step": 7441 + }, + { + "epoch": 1.491382765531062, + "grad_norm": 24.72337987038129, + "learning_rate": 5.918713882202336e-06, + "loss": 2.8255, + "step": 7442 + }, + { + "epoch": 1.4915831663326653, + "grad_norm": 23.517356573862852, + "learning_rate": 5.917567822564667e-06, + "loss": 2.4515, + "step": 7443 + }, + { + "epoch": 1.4917835671342685, + "grad_norm": 22.084801747785445, + "learning_rate": 5.916421713037481e-06, + "loss": 2.3156, + "step": 7444 + }, + { + "epoch": 1.4919839679358717, + "grad_norm": 33.36549904098136, + "learning_rate": 5.915275553683095e-06, + "loss": 2.9463, + "step": 7445 + }, + { + "epoch": 1.492184368737475, + "grad_norm": 22.175521456174966, + "learning_rate": 5.914129344563826e-06, + "loss": 3.0788, + "step": 7446 + }, + { + "epoch": 1.4923847695390782, + "grad_norm": 21.65671018863181, + "learning_rate": 5.9129830857419936e-06, + "loss": 2.7316, + "step": 7447 + }, + { + "epoch": 1.4925851703406814, + "grad_norm": 35.37368126085392, + "learning_rate": 5.911836777279925e-06, + "loss": 3.0514, + "step": 7448 + }, + { + "epoch": 1.4927855711422846, + "grad_norm": 17.96145560977583, + "learning_rate": 5.9106904192399425e-06, + "loss": 2.6459, + "step": 7449 + }, + { + "epoch": 1.4929859719438878, + "grad_norm": 24.647268283166287, + "learning_rate": 5.909544011684379e-06, + "loss": 3.0867, + "step": 7450 + }, + { + "epoch": 1.493186372745491, + "grad_norm": 31.343432051375142, + "learning_rate": 5.908397554675566e-06, + "loss": 3.1581, + "step": 7451 + }, + { + "epoch": 1.4933867735470943, + "grad_norm": 32.48180563539111, + "learning_rate": 5.907251048275836e-06, + "loss": 2.7026, + "step": 7452 + }, + { + "epoch": 1.4935871743486975, + "grad_norm": 32.036362152453286, + "learning_rate": 5.906104492547526e-06, + "loss": 2.9456, + "step": 7453 + }, + { + "epoch": 1.4937875751503005, + "grad_norm": 27.80444986067065, + "learning_rate": 5.904957887552979e-06, + "loss": 2.664, + "step": 7454 + }, + { + "epoch": 1.493987975951904, + "grad_norm": 27.805385543876437, + "learning_rate": 5.903811233354535e-06, + "loss": 3.0189, + "step": 7455 + }, + { + "epoch": 1.494188376753507, + "grad_norm": 32.60005538116084, + "learning_rate": 5.90266453001454e-06, + "loss": 2.3531, + "step": 7456 + }, + { + "epoch": 1.4943887775551103, + "grad_norm": 47.94339291543329, + "learning_rate": 5.901517777595343e-06, + "loss": 2.8039, + "step": 7457 + }, + { + "epoch": 1.4945891783567133, + "grad_norm": 21.747436709738007, + "learning_rate": 5.900370976159293e-06, + "loss": 2.48, + "step": 7458 + }, + { + "epoch": 1.4947895791583166, + "grad_norm": 31.467020743966266, + "learning_rate": 5.899224125768745e-06, + "loss": 2.337, + "step": 7459 + }, + { + "epoch": 1.4949899799599198, + "grad_norm": 114.08201469574796, + "learning_rate": 5.898077226486051e-06, + "loss": 2.9601, + "step": 7460 + }, + { + "epoch": 1.495190380761523, + "grad_norm": 18.55051886755003, + "learning_rate": 5.896930278373574e-06, + "loss": 2.3622, + "step": 7461 + }, + { + "epoch": 1.4953907815631262, + "grad_norm": 34.501899129192005, + "learning_rate": 5.895783281493675e-06, + "loss": 2.7154, + "step": 7462 + }, + { + "epoch": 1.4955911823647294, + "grad_norm": 27.88491606363507, + "learning_rate": 5.894636235908715e-06, + "loss": 2.7035, + "step": 7463 + }, + { + "epoch": 1.4957915831663327, + "grad_norm": 35.84688063747163, + "learning_rate": 5.893489141681064e-06, + "loss": 2.9333, + "step": 7464 + }, + { + "epoch": 1.4959919839679359, + 
"grad_norm": 31.23236342121068, + "learning_rate": 5.892341998873089e-06, + "loss": 3.066, + "step": 7465 + }, + { + "epoch": 1.496192384769539, + "grad_norm": 91.05814608163716, + "learning_rate": 5.891194807547161e-06, + "loss": 2.8198, + "step": 7466 + }, + { + "epoch": 1.4963927855711423, + "grad_norm": 23.01226861623573, + "learning_rate": 5.890047567765657e-06, + "loss": 2.3344, + "step": 7467 + }, + { + "epoch": 1.4965931863727455, + "grad_norm": 29.67224418747462, + "learning_rate": 5.888900279590952e-06, + "loss": 2.9304, + "step": 7468 + }, + { + "epoch": 1.4967935871743487, + "grad_norm": 22.03277416593056, + "learning_rate": 5.887752943085427e-06, + "loss": 2.7208, + "step": 7469 + }, + { + "epoch": 1.496993987975952, + "grad_norm": 24.63519479119081, + "learning_rate": 5.886605558311465e-06, + "loss": 2.8214, + "step": 7470 + }, + { + "epoch": 1.4971943887775552, + "grad_norm": 20.023071192418985, + "learning_rate": 5.885458125331449e-06, + "loss": 2.4491, + "step": 7471 + }, + { + "epoch": 1.4973947895791584, + "grad_norm": 26.43286977573676, + "learning_rate": 5.884310644207769e-06, + "loss": 2.3854, + "step": 7472 + }, + { + "epoch": 1.4975951903807614, + "grad_norm": 33.52376964850214, + "learning_rate": 5.883163115002811e-06, + "loss": 2.4941, + "step": 7473 + }, + { + "epoch": 1.4977955911823648, + "grad_norm": 46.14736880020445, + "learning_rate": 5.8820155377789735e-06, + "loss": 2.6563, + "step": 7474 + }, + { + "epoch": 1.4979959919839678, + "grad_norm": 21.597577788173446, + "learning_rate": 5.880867912598648e-06, + "loss": 2.4856, + "step": 7475 + }, + { + "epoch": 1.4981963927855713, + "grad_norm": 28.709519604571017, + "learning_rate": 5.879720239524234e-06, + "loss": 3.0009, + "step": 7476 + }, + { + "epoch": 1.4983967935871743, + "grad_norm": 29.91590162986143, + "learning_rate": 5.878572518618133e-06, + "loss": 2.7073, + "step": 7477 + }, + { + "epoch": 1.4985971943887775, + "grad_norm": 18.511923393944162, + "learning_rate": 5.8774247499427465e-06, + "loss": 2.0244, + "step": 7478 + }, + { + "epoch": 1.4987975951903807, + "grad_norm": 23.41150155972076, + "learning_rate": 5.876276933560481e-06, + "loss": 2.8771, + "step": 7479 + }, + { + "epoch": 1.498997995991984, + "grad_norm": 21.208878438816157, + "learning_rate": 5.8751290695337444e-06, + "loss": 2.4577, + "step": 7480 + }, + { + "epoch": 1.4991983967935871, + "grad_norm": 17.933681276421666, + "learning_rate": 5.873981157924951e-06, + "loss": 1.9988, + "step": 7481 + }, + { + "epoch": 1.4993987975951903, + "grad_norm": 28.586243373078272, + "learning_rate": 5.872833198796509e-06, + "loss": 2.4485, + "step": 7482 + }, + { + "epoch": 1.4995991983967936, + "grad_norm": 26.654650190141126, + "learning_rate": 5.871685192210839e-06, + "loss": 2.7545, + "step": 7483 + }, + { + "epoch": 1.4997995991983968, + "grad_norm": 19.54274800027549, + "learning_rate": 5.870537138230359e-06, + "loss": 2.7321, + "step": 7484 + }, + { + "epoch": 1.5, + "grad_norm": 24.162495850657148, + "learning_rate": 5.869389036917488e-06, + "loss": 2.6842, + "step": 7485 + }, + { + "epoch": 1.5002004008016032, + "grad_norm": 23.99080720527577, + "learning_rate": 5.8682408883346535e-06, + "loss": 3.0471, + "step": 7486 + }, + { + "epoch": 1.5004008016032064, + "grad_norm": 24.000050624971152, + "learning_rate": 5.867092692544278e-06, + "loss": 2.7679, + "step": 7487 + }, + { + "epoch": 1.5006012024048097, + "grad_norm": 29.527651597710776, + "learning_rate": 5.8659444496087946e-06, + "loss": 2.9868, + "step": 7488 + }, + { + "epoch": 
1.5008016032064129, + "grad_norm": 21.306632009802417, + "learning_rate": 5.864796159590633e-06, + "loss": 2.3267, + "step": 7489 + }, + { + "epoch": 1.5010020040080159, + "grad_norm": 19.588030636048938, + "learning_rate": 5.8636478225522275e-06, + "loss": 2.4668, + "step": 7490 + }, + { + "epoch": 1.5012024048096193, + "grad_norm": 20.52076138953069, + "learning_rate": 5.862499438556015e-06, + "loss": 2.9038, + "step": 7491 + }, + { + "epoch": 1.5014028056112223, + "grad_norm": 42.19372281826652, + "learning_rate": 5.861351007664434e-06, + "loss": 3.0773, + "step": 7492 + }, + { + "epoch": 1.5016032064128257, + "grad_norm": 19.957576175832532, + "learning_rate": 5.860202529939929e-06, + "loss": 2.8265, + "step": 7493 + }, + { + "epoch": 1.5018036072144287, + "grad_norm": 25.590589188967446, + "learning_rate": 5.859054005444942e-06, + "loss": 2.8804, + "step": 7494 + }, + { + "epoch": 1.5020040080160322, + "grad_norm": 21.88236611879978, + "learning_rate": 5.8579054342419205e-06, + "loss": 2.6551, + "step": 7495 + }, + { + "epoch": 1.5022044088176352, + "grad_norm": 22.616764720647275, + "learning_rate": 5.856756816393314e-06, + "loss": 2.4938, + "step": 7496 + }, + { + "epoch": 1.5024048096192386, + "grad_norm": 21.653864728062327, + "learning_rate": 5.855608151961576e-06, + "loss": 2.9156, + "step": 7497 + }, + { + "epoch": 1.5026052104208416, + "grad_norm": 33.57539939510506, + "learning_rate": 5.8544594410091574e-06, + "loss": 2.9617, + "step": 7498 + }, + { + "epoch": 1.502805611222445, + "grad_norm": 26.412961476308237, + "learning_rate": 5.853310683598519e-06, + "loss": 2.6274, + "step": 7499 + }, + { + "epoch": 1.503006012024048, + "grad_norm": 26.031384661699622, + "learning_rate": 5.852161879792119e-06, + "loss": 2.7913, + "step": 7500 + }, + { + "epoch": 1.5032064128256513, + "grad_norm": 46.24893490591411, + "learning_rate": 5.8510130296524205e-06, + "loss": 3.1634, + "step": 7501 + }, + { + "epoch": 1.5034068136272545, + "grad_norm": 24.50663345828398, + "learning_rate": 5.849864133241886e-06, + "loss": 2.414, + "step": 7502 + }, + { + "epoch": 1.5036072144288577, + "grad_norm": 27.17434620255758, + "learning_rate": 5.848715190622986e-06, + "loss": 3.0959, + "step": 7503 + }, + { + "epoch": 1.503807615230461, + "grad_norm": 25.19526250284899, + "learning_rate": 5.847566201858186e-06, + "loss": 2.8133, + "step": 7504 + }, + { + "epoch": 1.5040080160320641, + "grad_norm": 33.38962409559003, + "learning_rate": 5.84641716700996e-06, + "loss": 2.5555, + "step": 7505 + }, + { + "epoch": 1.5042084168336673, + "grad_norm": 18.879699991590865, + "learning_rate": 5.8452680861407864e-06, + "loss": 2.6081, + "step": 7506 + }, + { + "epoch": 1.5044088176352706, + "grad_norm": 20.823938231521097, + "learning_rate": 5.844118959313137e-06, + "loss": 2.6543, + "step": 7507 + }, + { + "epoch": 1.5046092184368738, + "grad_norm": 23.324720445400235, + "learning_rate": 5.842969786589494e-06, + "loss": 2.6825, + "step": 7508 + }, + { + "epoch": 1.504809619238477, + "grad_norm": 23.649602809109563, + "learning_rate": 5.84182056803234e-06, + "loss": 2.5284, + "step": 7509 + }, + { + "epoch": 1.5050100200400802, + "grad_norm": 21.579770757474773, + "learning_rate": 5.840671303704158e-06, + "loss": 2.8823, + "step": 7510 + }, + { + "epoch": 1.5052104208416832, + "grad_norm": 21.663435504119807, + "learning_rate": 5.839521993667436e-06, + "loss": 2.4507, + "step": 7511 + }, + { + "epoch": 1.5054108216432867, + "grad_norm": 38.111406397258946, + "learning_rate": 5.838372637984665e-06, + "loss": 3.1347, 
+ "step": 7512 + }, + { + "epoch": 1.5056112224448897, + "grad_norm": 32.04183647540246, + "learning_rate": 5.837223236718337e-06, + "loss": 3.1604, + "step": 7513 + }, + { + "epoch": 1.505811623246493, + "grad_norm": 26.781933746683006, + "learning_rate": 5.836073789930945e-06, + "loss": 2.443, + "step": 7514 + }, + { + "epoch": 1.506012024048096, + "grad_norm": 29.796681908612754, + "learning_rate": 5.834924297684986e-06, + "loss": 3.106, + "step": 7515 + }, + { + "epoch": 1.5062124248496995, + "grad_norm": 24.94898478478032, + "learning_rate": 5.8337747600429605e-06, + "loss": 2.6195, + "step": 7516 + }, + { + "epoch": 1.5064128256513025, + "grad_norm": 19.88075752397012, + "learning_rate": 5.832625177067371e-06, + "loss": 2.9679, + "step": 7517 + }, + { + "epoch": 1.506613226452906, + "grad_norm": 21.015796100606032, + "learning_rate": 5.831475548820723e-06, + "loss": 2.4017, + "step": 7518 + }, + { + "epoch": 1.506813627254509, + "grad_norm": 17.787297708968403, + "learning_rate": 5.83032587536552e-06, + "loss": 2.3403, + "step": 7519 + }, + { + "epoch": 1.5070140280561122, + "grad_norm": 26.362648152312353, + "learning_rate": 5.829176156764275e-06, + "loss": 2.6716, + "step": 7520 + }, + { + "epoch": 1.5072144288577154, + "grad_norm": 26.91394497664105, + "learning_rate": 5.828026393079498e-06, + "loss": 2.9361, + "step": 7521 + }, + { + "epoch": 1.5074148296593186, + "grad_norm": 36.339876736368396, + "learning_rate": 5.826876584373703e-06, + "loss": 2.8789, + "step": 7522 + }, + { + "epoch": 1.5076152304609218, + "grad_norm": 25.47560949209726, + "learning_rate": 5.825726730709406e-06, + "loss": 3.3135, + "step": 7523 + }, + { + "epoch": 1.507815631262525, + "grad_norm": 34.06009418396234, + "learning_rate": 5.82457683214913e-06, + "loss": 3.2799, + "step": 7524 + }, + { + "epoch": 1.5080160320641283, + "grad_norm": 25.42625302379951, + "learning_rate": 5.823426888755394e-06, + "loss": 2.6917, + "step": 7525 + }, + { + "epoch": 1.5082164328657315, + "grad_norm": 16.248690184481717, + "learning_rate": 5.822276900590724e-06, + "loss": 2.5181, + "step": 7526 + }, + { + "epoch": 1.5084168336673347, + "grad_norm": 20.7027896512124, + "learning_rate": 5.821126867717644e-06, + "loss": 2.2098, + "step": 7527 + }, + { + "epoch": 1.508617234468938, + "grad_norm": 25.686019110104958, + "learning_rate": 5.8199767901986835e-06, + "loss": 2.5966, + "step": 7528 + }, + { + "epoch": 1.5088176352705411, + "grad_norm": 23.415730631120365, + "learning_rate": 5.818826668096374e-06, + "loss": 3.1228, + "step": 7529 + }, + { + "epoch": 1.5090180360721441, + "grad_norm": 17.404159372435124, + "learning_rate": 5.817676501473251e-06, + "loss": 2.5137, + "step": 7530 + }, + { + "epoch": 1.5092184368737476, + "grad_norm": 26.186295061147742, + "learning_rate": 5.81652629039185e-06, + "loss": 3.3234, + "step": 7531 + }, + { + "epoch": 1.5094188376753506, + "grad_norm": 35.10876248511842, + "learning_rate": 5.815376034914708e-06, + "loss": 2.8073, + "step": 7532 + }, + { + "epoch": 1.509619238476954, + "grad_norm": 33.86795382278911, + "learning_rate": 5.814225735104369e-06, + "loss": 3.1662, + "step": 7533 + }, + { + "epoch": 1.509819639278557, + "grad_norm": 26.949171995447134, + "learning_rate": 5.8130753910233735e-06, + "loss": 2.461, + "step": 7534 + }, + { + "epoch": 1.5100200400801604, + "grad_norm": 23.772945553887855, + "learning_rate": 5.811925002734269e-06, + "loss": 2.8582, + "step": 7535 + }, + { + "epoch": 1.5102204408817634, + "grad_norm": 27.606361619334404, + "learning_rate": 
5.810774570299603e-06, + "loss": 3.0917, + "step": 7536 + }, + { + "epoch": 1.5104208416833669, + "grad_norm": 16.779838917524337, + "learning_rate": 5.809624093781928e-06, + "loss": 2.5905, + "step": 7537 + }, + { + "epoch": 1.5106212424849699, + "grad_norm": 22.376195709889085, + "learning_rate": 5.808473573243796e-06, + "loss": 2.5813, + "step": 7538 + }, + { + "epoch": 1.5108216432865733, + "grad_norm": 30.996073574572023, + "learning_rate": 5.8073230087477614e-06, + "loss": 2.5636, + "step": 7539 + }, + { + "epoch": 1.5110220440881763, + "grad_norm": 23.11092048685468, + "learning_rate": 5.806172400356384e-06, + "loss": 2.557, + "step": 7540 + }, + { + "epoch": 1.5112224448897795, + "grad_norm": 16.27723372185543, + "learning_rate": 5.8050217481322236e-06, + "loss": 2.4899, + "step": 7541 + }, + { + "epoch": 1.5114228456913827, + "grad_norm": 29.04561805297096, + "learning_rate": 5.803871052137839e-06, + "loss": 2.6118, + "step": 7542 + }, + { + "epoch": 1.511623246492986, + "grad_norm": 23.09018548854062, + "learning_rate": 5.802720312435803e-06, + "loss": 2.8116, + "step": 7543 + }, + { + "epoch": 1.5118236472945892, + "grad_norm": 32.92798833280153, + "learning_rate": 5.801569529088676e-06, + "loss": 2.8294, + "step": 7544 + }, + { + "epoch": 1.5120240480961924, + "grad_norm": 22.071361360325643, + "learning_rate": 5.800418702159033e-06, + "loss": 2.8307, + "step": 7545 + }, + { + "epoch": 1.5122244488977956, + "grad_norm": 17.54307227628642, + "learning_rate": 5.799267831709442e-06, + "loss": 2.6969, + "step": 7546 + }, + { + "epoch": 1.5124248496993988, + "grad_norm": 17.522888826338622, + "learning_rate": 5.798116917802479e-06, + "loss": 2.4884, + "step": 7547 + }, + { + "epoch": 1.512625250501002, + "grad_norm": 29.391851708868828, + "learning_rate": 5.79696596050072e-06, + "loss": 2.7029, + "step": 7548 + }, + { + "epoch": 1.512825651302605, + "grad_norm": 21.319578887970117, + "learning_rate": 5.795814959866747e-06, + "loss": 2.6744, + "step": 7549 + }, + { + "epoch": 1.5130260521042085, + "grad_norm": 29.604527658653883, + "learning_rate": 5.79466391596314e-06, + "loss": 2.8762, + "step": 7550 + }, + { + "epoch": 1.5132264529058115, + "grad_norm": 17.463542404709614, + "learning_rate": 5.793512828852482e-06, + "loss": 2.6067, + "step": 7551 + }, + { + "epoch": 1.513426853707415, + "grad_norm": 21.939809263118523, + "learning_rate": 5.792361698597362e-06, + "loss": 3.0457, + "step": 7552 + }, + { + "epoch": 1.513627254509018, + "grad_norm": 26.126920990590442, + "learning_rate": 5.791210525260365e-06, + "loss": 2.5101, + "step": 7553 + }, + { + "epoch": 1.5138276553106214, + "grad_norm": 29.760025103344404, + "learning_rate": 5.790059308904083e-06, + "loss": 3.4381, + "step": 7554 + }, + { + "epoch": 1.5140280561122244, + "grad_norm": 23.244299711792653, + "learning_rate": 5.788908049591111e-06, + "loss": 2.5041, + "step": 7555 + }, + { + "epoch": 1.5142284569138278, + "grad_norm": 25.349323432749582, + "learning_rate": 5.7877567473840445e-06, + "loss": 2.6416, + "step": 7556 + }, + { + "epoch": 1.5144288577154308, + "grad_norm": 22.30136251897258, + "learning_rate": 5.786605402345481e-06, + "loss": 2.8702, + "step": 7557 + }, + { + "epoch": 1.5146292585170342, + "grad_norm": 22.815827553752484, + "learning_rate": 5.78545401453802e-06, + "loss": 2.9605, + "step": 7558 + }, + { + "epoch": 1.5148296593186372, + "grad_norm": 20.87636564372309, + "learning_rate": 5.784302584024264e-06, + "loss": 2.1915, + "step": 7559 + }, + { + "epoch": 1.5150300601202404, + "grad_norm": 
37.258916375648504, + "learning_rate": 5.7831511108668195e-06, + "loss": 3.3574, + "step": 7560 + }, + { + "epoch": 1.5152304609218437, + "grad_norm": 22.74456121354404, + "learning_rate": 5.781999595128293e-06, + "loss": 2.7728, + "step": 7561 + }, + { + "epoch": 1.5154308617234469, + "grad_norm": 20.83336344338351, + "learning_rate": 5.780848036871295e-06, + "loss": 2.4679, + "step": 7562 + }, + { + "epoch": 1.51563126252505, + "grad_norm": 20.23345001693703, + "learning_rate": 5.779696436158435e-06, + "loss": 2.3139, + "step": 7563 + }, + { + "epoch": 1.5158316633266533, + "grad_norm": 20.463411121706084, + "learning_rate": 5.77854479305233e-06, + "loss": 2.4043, + "step": 7564 + }, + { + "epoch": 1.5160320641282565, + "grad_norm": 23.01695784048526, + "learning_rate": 5.777393107615596e-06, + "loss": 2.5869, + "step": 7565 + }, + { + "epoch": 1.5162324649298597, + "grad_norm": 29.66907653281847, + "learning_rate": 5.776241379910849e-06, + "loss": 2.5447, + "step": 7566 + }, + { + "epoch": 1.516432865731463, + "grad_norm": 32.090175017877236, + "learning_rate": 5.775089610000715e-06, + "loss": 3.2386, + "step": 7567 + }, + { + "epoch": 1.5166332665330662, + "grad_norm": 22.274308572845616, + "learning_rate": 5.773937797947813e-06, + "loss": 2.3132, + "step": 7568 + }, + { + "epoch": 1.5168336673346694, + "grad_norm": 36.68288374316375, + "learning_rate": 5.772785943814771e-06, + "loss": 3.0923, + "step": 7569 + }, + { + "epoch": 1.5170340681362724, + "grad_norm": 62.03575726135692, + "learning_rate": 5.7716340476642164e-06, + "loss": 3.1898, + "step": 7570 + }, + { + "epoch": 1.5172344689378758, + "grad_norm": 19.15423588583794, + "learning_rate": 5.7704821095587794e-06, + "loss": 3.0212, + "step": 7571 + }, + { + "epoch": 1.5174348697394788, + "grad_norm": 25.380792305927393, + "learning_rate": 5.7693301295610936e-06, + "loss": 3.2005, + "step": 7572 + }, + { + "epoch": 1.5176352705410823, + "grad_norm": 31.567454249324204, + "learning_rate": 5.768178107733791e-06, + "loss": 2.2893, + "step": 7573 + }, + { + "epoch": 1.5178356713426853, + "grad_norm": 21.045551777534996, + "learning_rate": 5.767026044139513e-06, + "loss": 3.068, + "step": 7574 + }, + { + "epoch": 1.5180360721442887, + "grad_norm": 18.11441207336083, + "learning_rate": 5.765873938840896e-06, + "loss": 2.4739, + "step": 7575 + }, + { + "epoch": 1.5182364729458917, + "grad_norm": 28.107711757056016, + "learning_rate": 5.7647217919005815e-06, + "loss": 3.4086, + "step": 7576 + }, + { + "epoch": 1.5184368737474951, + "grad_norm": 21.79151838689284, + "learning_rate": 5.763569603381216e-06, + "loss": 2.7782, + "step": 7577 + }, + { + "epoch": 1.5186372745490981, + "grad_norm": 25.00287711422454, + "learning_rate": 5.762417373345443e-06, + "loss": 2.8411, + "step": 7578 + }, + { + "epoch": 1.5188376753507014, + "grad_norm": 22.337922697388603, + "learning_rate": 5.761265101855912e-06, + "loss": 2.9049, + "step": 7579 + }, + { + "epoch": 1.5190380761523046, + "grad_norm": 30.6624229601437, + "learning_rate": 5.760112788975274e-06, + "loss": 2.8031, + "step": 7580 + }, + { + "epoch": 1.5192384769539078, + "grad_norm": 20.777809907864874, + "learning_rate": 5.758960434766181e-06, + "loss": 2.8097, + "step": 7581 + }, + { + "epoch": 1.519438877755511, + "grad_norm": 18.361824889525185, + "learning_rate": 5.75780803929129e-06, + "loss": 2.6969, + "step": 7582 + }, + { + "epoch": 1.5196392785571142, + "grad_norm": 24.842736805548473, + "learning_rate": 5.756655602613256e-06, + "loss": 2.4417, + "step": 7583 + }, + { + "epoch": 
1.5198396793587174, + "grad_norm": 22.40669048739143, + "learning_rate": 5.7555031247947415e-06, + "loss": 2.7668, + "step": 7584 + }, + { + "epoch": 1.5200400801603207, + "grad_norm": 43.28630073691176, + "learning_rate": 5.754350605898404e-06, + "loss": 2.7695, + "step": 7585 + }, + { + "epoch": 1.5202404809619239, + "grad_norm": 21.175632182638935, + "learning_rate": 5.753198045986913e-06, + "loss": 2.5896, + "step": 7586 + }, + { + "epoch": 1.520440881763527, + "grad_norm": 25.928700362157336, + "learning_rate": 5.752045445122932e-06, + "loss": 2.2117, + "step": 7587 + }, + { + "epoch": 1.5206412825651303, + "grad_norm": 23.44473883130252, + "learning_rate": 5.75089280336913e-06, + "loss": 2.3912, + "step": 7588 + }, + { + "epoch": 1.5208416833667333, + "grad_norm": 28.278531421340308, + "learning_rate": 5.749740120788179e-06, + "loss": 2.3834, + "step": 7589 + }, + { + "epoch": 1.5210420841683367, + "grad_norm": 19.101576892520107, + "learning_rate": 5.748587397442749e-06, + "loss": 2.6803, + "step": 7590 + }, + { + "epoch": 1.5212424849699397, + "grad_norm": 27.65575773456561, + "learning_rate": 5.747434633395518e-06, + "loss": 2.7297, + "step": 7591 + }, + { + "epoch": 1.5214428857715432, + "grad_norm": 20.17853347739686, + "learning_rate": 5.7462818287091615e-06, + "loss": 2.9526, + "step": 7592 + }, + { + "epoch": 1.5216432865731462, + "grad_norm": 28.107880750561634, + "learning_rate": 5.745128983446362e-06, + "loss": 2.5939, + "step": 7593 + }, + { + "epoch": 1.5218436873747496, + "grad_norm": 23.768934040669798, + "learning_rate": 5.743976097669799e-06, + "loss": 2.6863, + "step": 7594 + }, + { + "epoch": 1.5220440881763526, + "grad_norm": 30.016977837359676, + "learning_rate": 5.742823171442158e-06, + "loss": 2.7018, + "step": 7595 + }, + { + "epoch": 1.522244488977956, + "grad_norm": 23.764981639050475, + "learning_rate": 5.741670204826124e-06, + "loss": 3.0263, + "step": 7596 + }, + { + "epoch": 1.522444889779559, + "grad_norm": 26.43416802467178, + "learning_rate": 5.740517197884386e-06, + "loss": 2.8941, + "step": 7597 + }, + { + "epoch": 1.5226452905811623, + "grad_norm": 27.196815062759413, + "learning_rate": 5.739364150679634e-06, + "loss": 2.8516, + "step": 7598 + }, + { + "epoch": 1.5228456913827655, + "grad_norm": 18.709265302853666, + "learning_rate": 5.738211063274564e-06, + "loss": 2.8938, + "step": 7599 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 38.64705220335807, + "learning_rate": 5.737057935731868e-06, + "loss": 3.3905, + "step": 7600 + }, + { + "epoch": 1.523246492985972, + "grad_norm": 34.79778188412445, + "learning_rate": 5.735904768114244e-06, + "loss": 2.9373, + "step": 7601 + }, + { + "epoch": 1.5234468937875751, + "grad_norm": 20.18343849793087, + "learning_rate": 5.73475156048439e-06, + "loss": 3.2257, + "step": 7602 + }, + { + "epoch": 1.5236472945891784, + "grad_norm": 21.047339005100625, + "learning_rate": 5.733598312905011e-06, + "loss": 2.9348, + "step": 7603 + }, + { + "epoch": 1.5238476953907816, + "grad_norm": 28.918462998466996, + "learning_rate": 5.732445025438808e-06, + "loss": 2.8388, + "step": 7604 + }, + { + "epoch": 1.5240480961923848, + "grad_norm": 23.638848155449395, + "learning_rate": 5.731291698148486e-06, + "loss": 2.4865, + "step": 7605 + }, + { + "epoch": 1.524248496993988, + "grad_norm": 21.300730387507823, + "learning_rate": 5.730138331096758e-06, + "loss": 2.8054, + "step": 7606 + }, + { + "epoch": 1.5244488977955912, + "grad_norm": 26.04345440733143, + "learning_rate": 5.72898492434633e-06, + "loss": 2.368, + 
"step": 7607 + }, + { + "epoch": 1.5246492985971942, + "grad_norm": 21.935500313580263, + "learning_rate": 5.7278314779599155e-06, + "loss": 2.8642, + "step": 7608 + }, + { + "epoch": 1.5248496993987977, + "grad_norm": 26.160258193403145, + "learning_rate": 5.72667799200023e-06, + "loss": 3.0688, + "step": 7609 + }, + { + "epoch": 1.5250501002004007, + "grad_norm": 30.692776680712047, + "learning_rate": 5.725524466529988e-06, + "loss": 3.0158, + "step": 7610 + }, + { + "epoch": 1.525250501002004, + "grad_norm": 20.280953486599685, + "learning_rate": 5.724370901611911e-06, + "loss": 2.3124, + "step": 7611 + }, + { + "epoch": 1.525450901803607, + "grad_norm": 21.575036393670196, + "learning_rate": 5.723217297308717e-06, + "loss": 2.6753, + "step": 7612 + }, + { + "epoch": 1.5256513026052105, + "grad_norm": 25.741705088811496, + "learning_rate": 5.722063653683132e-06, + "loss": 2.5442, + "step": 7613 + }, + { + "epoch": 1.5258517034068135, + "grad_norm": 16.151674154933453, + "learning_rate": 5.720909970797881e-06, + "loss": 2.5504, + "step": 7614 + }, + { + "epoch": 1.526052104208417, + "grad_norm": 27.79933231493894, + "learning_rate": 5.71975624871569e-06, + "loss": 2.9862, + "step": 7615 + }, + { + "epoch": 1.52625250501002, + "grad_norm": 18.550988682295355, + "learning_rate": 5.7186024874992895e-06, + "loss": 2.4764, + "step": 7616 + }, + { + "epoch": 1.5264529058116234, + "grad_norm": 22.58445018893151, + "learning_rate": 5.717448687211411e-06, + "loss": 2.6179, + "step": 7617 + }, + { + "epoch": 1.5266533066132264, + "grad_norm": 24.758906939078308, + "learning_rate": 5.716294847914788e-06, + "loss": 2.9508, + "step": 7618 + }, + { + "epoch": 1.5268537074148296, + "grad_norm": 20.640876491298968, + "learning_rate": 5.7151409696721575e-06, + "loss": 2.5638, + "step": 7619 + }, + { + "epoch": 1.5270541082164328, + "grad_norm": 44.25620839251937, + "learning_rate": 5.713987052546256e-06, + "loss": 3.1322, + "step": 7620 + }, + { + "epoch": 1.527254509018036, + "grad_norm": 18.381622164440692, + "learning_rate": 5.712833096599826e-06, + "loss": 2.7658, + "step": 7621 + }, + { + "epoch": 1.5274549098196393, + "grad_norm": 28.421238085182146, + "learning_rate": 5.711679101895606e-06, + "loss": 2.6822, + "step": 7622 + }, + { + "epoch": 1.5276553106212425, + "grad_norm": 21.574180669815718, + "learning_rate": 5.710525068496344e-06, + "loss": 2.5211, + "step": 7623 + }, + { + "epoch": 1.5278557114228457, + "grad_norm": 22.777209698247948, + "learning_rate": 5.709370996464785e-06, + "loss": 2.8264, + "step": 7624 + }, + { + "epoch": 1.528056112224449, + "grad_norm": 24.716550874392997, + "learning_rate": 5.708216885863678e-06, + "loss": 2.8357, + "step": 7625 + }, + { + "epoch": 1.5282565130260521, + "grad_norm": 31.103679014961646, + "learning_rate": 5.707062736755774e-06, + "loss": 2.6377, + "step": 7626 + }, + { + "epoch": 1.5284569138276554, + "grad_norm": 29.733870131756152, + "learning_rate": 5.7059085492038235e-06, + "loss": 2.7711, + "step": 7627 + }, + { + "epoch": 1.5286573146292586, + "grad_norm": 25.216044771867445, + "learning_rate": 5.704754323270586e-06, + "loss": 2.4253, + "step": 7628 + }, + { + "epoch": 1.5288577154308616, + "grad_norm": 32.643495207199884, + "learning_rate": 5.703600059018812e-06, + "loss": 3.2582, + "step": 7629 + }, + { + "epoch": 1.529058116232465, + "grad_norm": 24.980050081009498, + "learning_rate": 5.702445756511266e-06, + "loss": 2.5696, + "step": 7630 + }, + { + "epoch": 1.529258517034068, + "grad_norm": 30.45196895868115, + "learning_rate": 
5.701291415810708e-06, + "loss": 2.6955, + "step": 7631 + }, + { + "epoch": 1.5294589178356714, + "grad_norm": 26.170516114860035, + "learning_rate": 5.7001370369799e-06, + "loss": 2.7688, + "step": 7632 + }, + { + "epoch": 1.5296593186372744, + "grad_norm": 31.756612923721402, + "learning_rate": 5.6989826200816074e-06, + "loss": 2.7308, + "step": 7633 + }, + { + "epoch": 1.5298597194388779, + "grad_norm": 40.53402834817806, + "learning_rate": 5.6978281651786e-06, + "loss": 2.9945, + "step": 7634 + }, + { + "epoch": 1.5300601202404809, + "grad_norm": 22.7187772320708, + "learning_rate": 5.696673672333642e-06, + "loss": 2.9719, + "step": 7635 + }, + { + "epoch": 1.5302605210420843, + "grad_norm": 32.55423844231317, + "learning_rate": 5.695519141609509e-06, + "loss": 3.2331, + "step": 7636 + }, + { + "epoch": 1.5304609218436873, + "grad_norm": 25.107142053279905, + "learning_rate": 5.694364573068974e-06, + "loss": 2.6115, + "step": 7637 + }, + { + "epoch": 1.5306613226452905, + "grad_norm": 28.326984897068744, + "learning_rate": 5.693209966774813e-06, + "loss": 3.1809, + "step": 7638 + }, + { + "epoch": 1.5308617234468938, + "grad_norm": 51.90523992557542, + "learning_rate": 5.692055322789802e-06, + "loss": 2.6495, + "step": 7639 + }, + { + "epoch": 1.531062124248497, + "grad_norm": 26.464688861576818, + "learning_rate": 5.690900641176723e-06, + "loss": 2.1399, + "step": 7640 + }, + { + "epoch": 1.5312625250501002, + "grad_norm": 39.223238359858385, + "learning_rate": 5.689745921998353e-06, + "loss": 2.342, + "step": 7641 + }, + { + "epoch": 1.5314629258517034, + "grad_norm": 22.040272932006467, + "learning_rate": 5.688591165317481e-06, + "loss": 2.854, + "step": 7642 + }, + { + "epoch": 1.5316633266533066, + "grad_norm": 20.625057506975576, + "learning_rate": 5.687436371196892e-06, + "loss": 2.7112, + "step": 7643 + }, + { + "epoch": 1.5318637274549098, + "grad_norm": 29.369492305179833, + "learning_rate": 5.686281539699371e-06, + "loss": 2.4165, + "step": 7644 + }, + { + "epoch": 1.532064128256513, + "grad_norm": 44.88311067742987, + "learning_rate": 5.685126670887712e-06, + "loss": 2.5974, + "step": 7645 + }, + { + "epoch": 1.5322645290581163, + "grad_norm": 22.690162286881595, + "learning_rate": 5.6839717648247015e-06, + "loss": 2.8054, + "step": 7646 + }, + { + "epoch": 1.5324649298597195, + "grad_norm": 22.04483291514998, + "learning_rate": 5.682816821573138e-06, + "loss": 2.9951, + "step": 7647 + }, + { + "epoch": 1.5326653306613225, + "grad_norm": 28.99111937309678, + "learning_rate": 5.6816618411958155e-06, + "loss": 3.1173, + "step": 7648 + }, + { + "epoch": 1.532865731462926, + "grad_norm": 27.601716785289902, + "learning_rate": 5.680506823755533e-06, + "loss": 2.497, + "step": 7649 + }, + { + "epoch": 1.533066132264529, + "grad_norm": 20.030911944491162, + "learning_rate": 5.67935176931509e-06, + "loss": 2.5981, + "step": 7650 + }, + { + "epoch": 1.5332665330661324, + "grad_norm": 27.113136453591398, + "learning_rate": 5.6781966779372875e-06, + "loss": 2.5693, + "step": 7651 + }, + { + "epoch": 1.5334669338677354, + "grad_norm": 24.255984734147408, + "learning_rate": 5.677041549684932e-06, + "loss": 2.6422, + "step": 7652 + }, + { + "epoch": 1.5336673346693388, + "grad_norm": 24.860202488589316, + "learning_rate": 5.675886384620828e-06, + "loss": 2.1617, + "step": 7653 + }, + { + "epoch": 1.5338677354709418, + "grad_norm": 20.99911301544481, + "learning_rate": 5.674731182807781e-06, + "loss": 2.8823, + "step": 7654 + }, + { + "epoch": 1.5340681362725452, + "grad_norm": 
31.68327553692959, + "learning_rate": 5.673575944308607e-06, + "loss": 2.9299, + "step": 7655 + }, + { + "epoch": 1.5342685370741482, + "grad_norm": 20.690102997856286, + "learning_rate": 5.672420669186113e-06, + "loss": 2.0589, + "step": 7656 + }, + { + "epoch": 1.5344689378757514, + "grad_norm": 24.759149535298675, + "learning_rate": 5.671265357503114e-06, + "loss": 2.4598, + "step": 7657 + }, + { + "epoch": 1.5346693386773547, + "grad_norm": 27.444862675178214, + "learning_rate": 5.6701100093224285e-06, + "loss": 3.3507, + "step": 7658 + }, + { + "epoch": 1.5348697394789579, + "grad_norm": 37.61803705664379, + "learning_rate": 5.66895462470687e-06, + "loss": 3.5538, + "step": 7659 + }, + { + "epoch": 1.535070140280561, + "grad_norm": 43.864767930683364, + "learning_rate": 5.667799203719263e-06, + "loss": 2.9731, + "step": 7660 + }, + { + "epoch": 1.5352705410821643, + "grad_norm": 33.39896063720446, + "learning_rate": 5.6666437464224265e-06, + "loss": 2.797, + "step": 7661 + }, + { + "epoch": 1.5354709418837675, + "grad_norm": 23.58732484752011, + "learning_rate": 5.665488252879187e-06, + "loss": 2.9598, + "step": 7662 + }, + { + "epoch": 1.5356713426853708, + "grad_norm": 31.406218970125803, + "learning_rate": 5.664332723152368e-06, + "loss": 2.9721, + "step": 7663 + }, + { + "epoch": 1.535871743486974, + "grad_norm": 20.82477981119323, + "learning_rate": 5.6631771573047975e-06, + "loss": 2.32, + "step": 7664 + }, + { + "epoch": 1.5360721442885772, + "grad_norm": 37.23901888512307, + "learning_rate": 5.662021555399307e-06, + "loss": 3.122, + "step": 7665 + }, + { + "epoch": 1.5362725450901804, + "grad_norm": 25.918743718179616, + "learning_rate": 5.6608659174987264e-06, + "loss": 3.331, + "step": 7666 + }, + { + "epoch": 1.5364729458917834, + "grad_norm": 22.167615543097348, + "learning_rate": 5.659710243665891e-06, + "loss": 2.2982, + "step": 7667 + }, + { + "epoch": 1.5366733466933868, + "grad_norm": 35.33071502801509, + "learning_rate": 5.658554533963636e-06, + "loss": 3.2683, + "step": 7668 + }, + { + "epoch": 1.5368737474949898, + "grad_norm": 32.67197959406181, + "learning_rate": 5.657398788454798e-06, + "loss": 2.3302, + "step": 7669 + }, + { + "epoch": 1.5370741482965933, + "grad_norm": 18.92448773575508, + "learning_rate": 5.656243007202219e-06, + "loss": 2.5932, + "step": 7670 + }, + { + "epoch": 1.5372745490981963, + "grad_norm": 31.31930774251743, + "learning_rate": 5.655087190268738e-06, + "loss": 2.2246, + "step": 7671 + }, + { + "epoch": 1.5374749498997997, + "grad_norm": 35.077212632754694, + "learning_rate": 5.653931337717199e-06, + "loss": 2.7554, + "step": 7672 + }, + { + "epoch": 1.5376753507014027, + "grad_norm": 41.519564794570655, + "learning_rate": 5.652775449610448e-06, + "loss": 3.2599, + "step": 7673 + }, + { + "epoch": 1.5378757515030061, + "grad_norm": 25.79473312628028, + "learning_rate": 5.651619526011332e-06, + "loss": 3.2694, + "step": 7674 + }, + { + "epoch": 1.5380761523046091, + "grad_norm": 48.38619928853948, + "learning_rate": 5.650463566982703e-06, + "loss": 2.3797, + "step": 7675 + }, + { + "epoch": 1.5382765531062126, + "grad_norm": 31.925245503670716, + "learning_rate": 5.6493075725874066e-06, + "loss": 2.1772, + "step": 7676 + }, + { + "epoch": 1.5384769539078156, + "grad_norm": 20.034370322883934, + "learning_rate": 5.6481515428883006e-06, + "loss": 2.4392, + "step": 7677 + }, + { + "epoch": 1.5386773547094188, + "grad_norm": 26.378044201677593, + "learning_rate": 5.646995477948238e-06, + "loss": 2.5221, + "step": 7678 + }, + { + "epoch": 
1.538877755511022, + "grad_norm": 47.55335710946821, + "learning_rate": 5.645839377830076e-06, + "loss": 2.5177, + "step": 7679 + }, + { + "epoch": 1.5390781563126252, + "grad_norm": 25.13476470326783, + "learning_rate": 5.644683242596673e-06, + "loss": 2.6236, + "step": 7680 + }, + { + "epoch": 1.5392785571142285, + "grad_norm": 28.22450890099182, + "learning_rate": 5.643527072310892e-06, + "loss": 3.1217, + "step": 7681 + }, + { + "epoch": 1.5394789579158317, + "grad_norm": 19.655898551360387, + "learning_rate": 5.642370867035594e-06, + "loss": 2.6539, + "step": 7682 + }, + { + "epoch": 1.5396793587174349, + "grad_norm": 21.373117747820817, + "learning_rate": 5.641214626833643e-06, + "loss": 2.3229, + "step": 7683 + }, + { + "epoch": 1.539879759519038, + "grad_norm": 22.561627861617623, + "learning_rate": 5.640058351767906e-06, + "loss": 2.4126, + "step": 7684 + }, + { + "epoch": 1.5400801603206413, + "grad_norm": 21.060997526474857, + "learning_rate": 5.638902041901252e-06, + "loss": 2.2643, + "step": 7685 + }, + { + "epoch": 1.5402805611222445, + "grad_norm": 21.600073621953964, + "learning_rate": 5.637745697296551e-06, + "loss": 2.7294, + "step": 7686 + }, + { + "epoch": 1.5404809619238478, + "grad_norm": 20.858734018202973, + "learning_rate": 5.6365893180166755e-06, + "loss": 2.5379, + "step": 7687 + }, + { + "epoch": 1.5406813627254508, + "grad_norm": 29.07848242863989, + "learning_rate": 5.635432904124498e-06, + "loss": 2.9462, + "step": 7688 + }, + { + "epoch": 1.5408817635270542, + "grad_norm": 18.467525246746185, + "learning_rate": 5.634276455682898e-06, + "loss": 2.4069, + "step": 7689 + }, + { + "epoch": 1.5410821643286572, + "grad_norm": 24.69882657384263, + "learning_rate": 5.633119972754748e-06, + "loss": 2.6301, + "step": 7690 + }, + { + "epoch": 1.5412825651302606, + "grad_norm": 22.996845717103323, + "learning_rate": 5.631963455402932e-06, + "loss": 2.773, + "step": 7691 + }, + { + "epoch": 1.5414829659318636, + "grad_norm": 27.33379700251562, + "learning_rate": 5.630806903690328e-06, + "loss": 3.0466, + "step": 7692 + }, + { + "epoch": 1.541683366733467, + "grad_norm": 34.00411893785297, + "learning_rate": 5.629650317679823e-06, + "loss": 3.1767, + "step": 7693 + }, + { + "epoch": 1.54188376753507, + "grad_norm": 27.55973591009988, + "learning_rate": 5.628493697434302e-06, + "loss": 2.7192, + "step": 7694 + }, + { + "epoch": 1.5420841683366735, + "grad_norm": 15.458809368633656, + "learning_rate": 5.627337043016649e-06, + "loss": 2.414, + "step": 7695 + }, + { + "epoch": 1.5422845691382765, + "grad_norm": 27.631068309147075, + "learning_rate": 5.626180354489755e-06, + "loss": 2.8196, + "step": 7696 + }, + { + "epoch": 1.5424849699398797, + "grad_norm": 18.920935331756702, + "learning_rate": 5.625023631916513e-06, + "loss": 2.4684, + "step": 7697 + }, + { + "epoch": 1.542685370741483, + "grad_norm": 38.821112454370585, + "learning_rate": 5.62386687535981e-06, + "loss": 3.3993, + "step": 7698 + }, + { + "epoch": 1.5428857715430861, + "grad_norm": 18.618387937123696, + "learning_rate": 5.622710084882547e-06, + "loss": 2.6719, + "step": 7699 + }, + { + "epoch": 1.5430861723446894, + "grad_norm": 19.219034544047744, + "learning_rate": 5.621553260547616e-06, + "loss": 2.4046, + "step": 7700 + }, + { + "epoch": 1.5432865731462926, + "grad_norm": 38.62933530070406, + "learning_rate": 5.620396402417917e-06, + "loss": 2.8353, + "step": 7701 + }, + { + "epoch": 1.5434869739478958, + "grad_norm": 27.78850834157389, + "learning_rate": 5.6192395105563514e-06, + "loss": 2.7087, + 
"step": 7702 + }, + { + "epoch": 1.543687374749499, + "grad_norm": 29.379691336144035, + "learning_rate": 5.618082585025818e-06, + "loss": 2.4593, + "step": 7703 + }, + { + "epoch": 1.5438877755511022, + "grad_norm": 37.07638679229564, + "learning_rate": 5.616925625889222e-06, + "loss": 3.11, + "step": 7704 + }, + { + "epoch": 1.5440881763527055, + "grad_norm": 32.46921284293191, + "learning_rate": 5.615768633209468e-06, + "loss": 3.0499, + "step": 7705 + }, + { + "epoch": 1.5442885771543087, + "grad_norm": 24.88600894280298, + "learning_rate": 5.614611607049466e-06, + "loss": 3.0075, + "step": 7706 + }, + { + "epoch": 1.5444889779559117, + "grad_norm": 24.27222896552402, + "learning_rate": 5.613454547472124e-06, + "loss": 2.4396, + "step": 7707 + }, + { + "epoch": 1.544689378757515, + "grad_norm": 33.75827396433788, + "learning_rate": 5.612297454540352e-06, + "loss": 2.5659, + "step": 7708 + }, + { + "epoch": 1.544889779559118, + "grad_norm": 25.10681027363891, + "learning_rate": 5.611140328317065e-06, + "loss": 3.0679, + "step": 7709 + }, + { + "epoch": 1.5450901803607215, + "grad_norm": 23.901108266243142, + "learning_rate": 5.609983168865173e-06, + "loss": 2.6582, + "step": 7710 + }, + { + "epoch": 1.5452905811623245, + "grad_norm": 27.034414498791183, + "learning_rate": 5.6088259762475985e-06, + "loss": 3.0741, + "step": 7711 + }, + { + "epoch": 1.545490981963928, + "grad_norm": 23.45579685095202, + "learning_rate": 5.607668750527256e-06, + "loss": 2.904, + "step": 7712 + }, + { + "epoch": 1.545691382765531, + "grad_norm": 26.820963309124895, + "learning_rate": 5.606511491767066e-06, + "loss": 2.3115, + "step": 7713 + }, + { + "epoch": 1.5458917835671344, + "grad_norm": 25.594928284984434, + "learning_rate": 5.605354200029952e-06, + "loss": 2.7951, + "step": 7714 + }, + { + "epoch": 1.5460921843687374, + "grad_norm": 23.393924541812325, + "learning_rate": 5.604196875378837e-06, + "loss": 2.3139, + "step": 7715 + }, + { + "epoch": 1.5462925851703406, + "grad_norm": 20.706681974372565, + "learning_rate": 5.603039517876644e-06, + "loss": 2.6866, + "step": 7716 + }, + { + "epoch": 1.5464929859719438, + "grad_norm": 27.009088048566877, + "learning_rate": 5.601882127586303e-06, + "loss": 2.8463, + "step": 7717 + }, + { + "epoch": 1.546693386773547, + "grad_norm": 26.780386043766256, + "learning_rate": 5.600724704570742e-06, + "loss": 2.8372, + "step": 7718 + }, + { + "epoch": 1.5468937875751503, + "grad_norm": 26.30150649761435, + "learning_rate": 5.599567248892894e-06, + "loss": 2.3523, + "step": 7719 + }, + { + "epoch": 1.5470941883767535, + "grad_norm": 31.243329389041754, + "learning_rate": 5.598409760615688e-06, + "loss": 2.7847, + "step": 7720 + }, + { + "epoch": 1.5472945891783567, + "grad_norm": 22.07460440176716, + "learning_rate": 5.597252239802061e-06, + "loss": 3.0997, + "step": 7721 + }, + { + "epoch": 1.54749498997996, + "grad_norm": 24.715819038057532, + "learning_rate": 5.596094686514947e-06, + "loss": 2.7025, + "step": 7722 + }, + { + "epoch": 1.5476953907815632, + "grad_norm": 40.76257767719056, + "learning_rate": 5.594937100817283e-06, + "loss": 3.0702, + "step": 7723 + }, + { + "epoch": 1.5478957915831664, + "grad_norm": 31.886887136370976, + "learning_rate": 5.593779482772015e-06, + "loss": 2.8453, + "step": 7724 + }, + { + "epoch": 1.5480961923847696, + "grad_norm": 22.947515870815558, + "learning_rate": 5.5926218324420775e-06, + "loss": 2.6216, + "step": 7725 + }, + { + "epoch": 1.5482965931863726, + "grad_norm": 29.83155143327426, + "learning_rate": 
5.591464149890418e-06, + "loss": 2.5687, + "step": 7726 + }, + { + "epoch": 1.548496993987976, + "grad_norm": 31.457838597263464, + "learning_rate": 5.590306435179978e-06, + "loss": 2.8661, + "step": 7727 + }, + { + "epoch": 1.548697394789579, + "grad_norm": 33.50013482827063, + "learning_rate": 5.589148688373706e-06, + "loss": 2.7299, + "step": 7728 + }, + { + "epoch": 1.5488977955911825, + "grad_norm": 19.802360760814874, + "learning_rate": 5.5879909095345494e-06, + "loss": 2.6994, + "step": 7729 + }, + { + "epoch": 1.5490981963927855, + "grad_norm": 24.632968295257182, + "learning_rate": 5.586833098725459e-06, + "loss": 2.436, + "step": 7730 + }, + { + "epoch": 1.549298597194389, + "grad_norm": 28.522215481951086, + "learning_rate": 5.585675256009388e-06, + "loss": 2.9429, + "step": 7731 + }, + { + "epoch": 1.549498997995992, + "grad_norm": 26.739651448383746, + "learning_rate": 5.584517381449289e-06, + "loss": 2.9193, + "step": 7732 + }, + { + "epoch": 1.5496993987975953, + "grad_norm": 69.31902363947277, + "learning_rate": 5.583359475108118e-06, + "loss": 2.4973, + "step": 7733 + }, + { + "epoch": 1.5498997995991983, + "grad_norm": 24.839032709534845, + "learning_rate": 5.582201537048829e-06, + "loss": 3.065, + "step": 7734 + }, + { + "epoch": 1.5501002004008018, + "grad_norm": 31.50730422106772, + "learning_rate": 5.581043567334383e-06, + "loss": 2.493, + "step": 7735 + }, + { + "epoch": 1.5503006012024048, + "grad_norm": 31.126272822342415, + "learning_rate": 5.579885566027742e-06, + "loss": 3.0704, + "step": 7736 + }, + { + "epoch": 1.550501002004008, + "grad_norm": 37.92571682204569, + "learning_rate": 5.578727533191866e-06, + "loss": 2.8757, + "step": 7737 + }, + { + "epoch": 1.5507014028056112, + "grad_norm": 20.5321917764774, + "learning_rate": 5.5775694688897196e-06, + "loss": 2.0374, + "step": 7738 + }, + { + "epoch": 1.5509018036072144, + "grad_norm": 21.638530140652563, + "learning_rate": 5.57641137318427e-06, + "loss": 2.9389, + "step": 7739 + }, + { + "epoch": 1.5511022044088176, + "grad_norm": 22.84538742032857, + "learning_rate": 5.575253246138481e-06, + "loss": 3.0773, + "step": 7740 + }, + { + "epoch": 1.5513026052104208, + "grad_norm": 23.933542351584503, + "learning_rate": 5.5740950878153245e-06, + "loss": 2.4375, + "step": 7741 + }, + { + "epoch": 1.551503006012024, + "grad_norm": 28.497632047293205, + "learning_rate": 5.572936898277771e-06, + "loss": 2.7286, + "step": 7742 + }, + { + "epoch": 1.5517034068136273, + "grad_norm": 29.975491071274316, + "learning_rate": 5.571778677588793e-06, + "loss": 2.3366, + "step": 7743 + }, + { + "epoch": 1.5519038076152305, + "grad_norm": 20.164057314185673, + "learning_rate": 5.570620425811364e-06, + "loss": 2.5409, + "step": 7744 + }, + { + "epoch": 1.5521042084168337, + "grad_norm": 28.84376486555462, + "learning_rate": 5.5694621430084595e-06, + "loss": 2.5965, + "step": 7745 + }, + { + "epoch": 1.552304609218437, + "grad_norm": 22.18969406137341, + "learning_rate": 5.568303829243059e-06, + "loss": 2.9535, + "step": 7746 + }, + { + "epoch": 1.55250501002004, + "grad_norm": 30.635077475099756, + "learning_rate": 5.5671454845781405e-06, + "loss": 2.9007, + "step": 7747 + }, + { + "epoch": 1.5527054108216434, + "grad_norm": 27.512143519539183, + "learning_rate": 5.565987109076682e-06, + "loss": 2.4501, + "step": 7748 + }, + { + "epoch": 1.5529058116232464, + "grad_norm": 20.5367901344519, + "learning_rate": 5.564828702801672e-06, + "loss": 2.9453, + "step": 7749 + }, + { + "epoch": 1.5531062124248498, + "grad_norm": 
25.38630992131996, + "learning_rate": 5.56367026581609e-06, + "loss": 3.3347, + "step": 7750 + }, + { + "epoch": 1.5533066132264528, + "grad_norm": 25.090052898994617, + "learning_rate": 5.562511798182925e-06, + "loss": 2.606, + "step": 7751 + }, + { + "epoch": 1.5535070140280562, + "grad_norm": 26.851703994956004, + "learning_rate": 5.561353299965162e-06, + "loss": 2.8126, + "step": 7752 + }, + { + "epoch": 1.5537074148296592, + "grad_norm": 20.411349274149654, + "learning_rate": 5.560194771225792e-06, + "loss": 2.7632, + "step": 7753 + }, + { + "epoch": 1.5539078156312627, + "grad_norm": 23.147776868071045, + "learning_rate": 5.559036212027804e-06, + "loss": 2.9279, + "step": 7754 + }, + { + "epoch": 1.5541082164328657, + "grad_norm": 23.38288670175374, + "learning_rate": 5.557877622434194e-06, + "loss": 2.5372, + "step": 7755 + }, + { + "epoch": 1.554308617234469, + "grad_norm": 22.775166007713192, + "learning_rate": 5.556719002507953e-06, + "loss": 2.497, + "step": 7756 + }, + { + "epoch": 1.554509018036072, + "grad_norm": 27.708717669574295, + "learning_rate": 5.555560352312078e-06, + "loss": 2.8755, + "step": 7757 + }, + { + "epoch": 1.5547094188376753, + "grad_norm": 18.601811222181347, + "learning_rate": 5.554401671909568e-06, + "loss": 2.6137, + "step": 7758 + }, + { + "epoch": 1.5549098196392785, + "grad_norm": 26.87925652130567, + "learning_rate": 5.5532429613634186e-06, + "loss": 2.5592, + "step": 7759 + }, + { + "epoch": 1.5551102204408818, + "grad_norm": 24.27244907040074, + "learning_rate": 5.552084220736635e-06, + "loss": 2.7413, + "step": 7760 + }, + { + "epoch": 1.555310621242485, + "grad_norm": 22.12998065965977, + "learning_rate": 5.550925450092215e-06, + "loss": 2.6532, + "step": 7761 + }, + { + "epoch": 1.5555110220440882, + "grad_norm": 25.558746162844464, + "learning_rate": 5.549766649493166e-06, + "loss": 2.3976, + "step": 7762 + }, + { + "epoch": 1.5557114228456914, + "grad_norm": 19.251913856882265, + "learning_rate": 5.548607819002493e-06, + "loss": 2.0758, + "step": 7763 + }, + { + "epoch": 1.5559118236472946, + "grad_norm": 32.158070479075306, + "learning_rate": 5.547448958683203e-06, + "loss": 2.5223, + "step": 7764 + }, + { + "epoch": 1.5561122244488979, + "grad_norm": 27.068744088846604, + "learning_rate": 5.546290068598306e-06, + "loss": 2.4346, + "step": 7765 + }, + { + "epoch": 1.5563126252505008, + "grad_norm": 28.138372750353046, + "learning_rate": 5.545131148810809e-06, + "loss": 2.5318, + "step": 7766 + }, + { + "epoch": 1.5565130260521043, + "grad_norm": 23.762395845775792, + "learning_rate": 5.543972199383728e-06, + "loss": 3.1738, + "step": 7767 + }, + { + "epoch": 1.5567134268537073, + "grad_norm": 26.20455604938298, + "learning_rate": 5.542813220380077e-06, + "loss": 3.0891, + "step": 7768 + }, + { + "epoch": 1.5569138276553107, + "grad_norm": 28.66315276206435, + "learning_rate": 5.541654211862868e-06, + "loss": 2.4416, + "step": 7769 + }, + { + "epoch": 1.5571142284569137, + "grad_norm": 25.34868973464639, + "learning_rate": 5.5404951738951214e-06, + "loss": 2.8176, + "step": 7770 + }, + { + "epoch": 1.5573146292585172, + "grad_norm": 47.38873787677419, + "learning_rate": 5.539336106539853e-06, + "loss": 2.2291, + "step": 7771 + }, + { + "epoch": 1.5575150300601202, + "grad_norm": 33.28169712754249, + "learning_rate": 5.538177009860085e-06, + "loss": 2.7059, + "step": 7772 + }, + { + "epoch": 1.5577154308617236, + "grad_norm": 26.12152742070301, + "learning_rate": 5.537017883918838e-06, + "loss": 2.892, + "step": 7773 + }, + { + "epoch": 
1.5579158316633266, + "grad_norm": 22.586314811690503, + "learning_rate": 5.5358587287791365e-06, + "loss": 2.7669, + "step": 7774 + }, + { + "epoch": 1.5581162324649298, + "grad_norm": 33.56725488785324, + "learning_rate": 5.5346995445040055e-06, + "loss": 2.9536, + "step": 7775 + }, + { + "epoch": 1.558316633266533, + "grad_norm": 48.72016484373481, + "learning_rate": 5.53354033115647e-06, + "loss": 3.3978, + "step": 7776 + }, + { + "epoch": 1.5585170340681362, + "grad_norm": 28.81308824947015, + "learning_rate": 5.53238108879956e-06, + "loss": 2.743, + "step": 7777 + }, + { + "epoch": 1.5587174348697395, + "grad_norm": 40.98424411128906, + "learning_rate": 5.531221817496304e-06, + "loss": 2.91, + "step": 7778 + }, + { + "epoch": 1.5589178356713427, + "grad_norm": 25.104109828139315, + "learning_rate": 5.530062517309732e-06, + "loss": 2.5903, + "step": 7779 + }, + { + "epoch": 1.559118236472946, + "grad_norm": 22.1774963841137, + "learning_rate": 5.528903188302881e-06, + "loss": 2.3841, + "step": 7780 + }, + { + "epoch": 1.5593186372745491, + "grad_norm": 34.9245034475545, + "learning_rate": 5.527743830538782e-06, + "loss": 2.6601, + "step": 7781 + }, + { + "epoch": 1.5595190380761523, + "grad_norm": 35.91315020892495, + "learning_rate": 5.526584444080472e-06, + "loss": 2.9295, + "step": 7782 + }, + { + "epoch": 1.5597194388777555, + "grad_norm": 21.387987667548806, + "learning_rate": 5.525425028990988e-06, + "loss": 2.5467, + "step": 7783 + }, + { + "epoch": 1.5599198396793588, + "grad_norm": 22.272593653655044, + "learning_rate": 5.524265585333369e-06, + "loss": 2.7167, + "step": 7784 + }, + { + "epoch": 1.5601202404809618, + "grad_norm": 21.795486481518214, + "learning_rate": 5.523106113170658e-06, + "loss": 2.4484, + "step": 7785 + }, + { + "epoch": 1.5603206412825652, + "grad_norm": 20.48964514808249, + "learning_rate": 5.521946612565894e-06, + "loss": 2.9013, + "step": 7786 + }, + { + "epoch": 1.5605210420841682, + "grad_norm": 21.436590810415137, + "learning_rate": 5.520787083582123e-06, + "loss": 3.0125, + "step": 7787 + }, + { + "epoch": 1.5607214428857716, + "grad_norm": 27.40159880534175, + "learning_rate": 5.5196275262823895e-06, + "loss": 2.8993, + "step": 7788 + }, + { + "epoch": 1.5609218436873746, + "grad_norm": 23.523146996995877, + "learning_rate": 5.518467940729739e-06, + "loss": 2.5848, + "step": 7789 + }, + { + "epoch": 1.561122244488978, + "grad_norm": 31.0566483013305, + "learning_rate": 5.517308326987224e-06, + "loss": 3.4245, + "step": 7790 + }, + { + "epoch": 1.561322645290581, + "grad_norm": 21.893749850961147, + "learning_rate": 5.516148685117889e-06, + "loss": 2.4553, + "step": 7791 + }, + { + "epoch": 1.5615230460921845, + "grad_norm": 35.856275721759985, + "learning_rate": 5.51498901518479e-06, + "loss": 2.4148, + "step": 7792 + }, + { + "epoch": 1.5617234468937875, + "grad_norm": 27.07212882604358, + "learning_rate": 5.513829317250976e-06, + "loss": 2.9488, + "step": 7793 + }, + { + "epoch": 1.561923847695391, + "grad_norm": 27.159103051871387, + "learning_rate": 5.512669591379505e-06, + "loss": 3.1701, + "step": 7794 + }, + { + "epoch": 1.562124248496994, + "grad_norm": 25.023494708859428, + "learning_rate": 5.511509837633431e-06, + "loss": 2.4409, + "step": 7795 + }, + { + "epoch": 1.5623246492985972, + "grad_norm": 25.117448815459053, + "learning_rate": 5.510350056075813e-06, + "loss": 2.4735, + "step": 7796 + }, + { + "epoch": 1.5625250501002004, + "grad_norm": 34.47232197659266, + "learning_rate": 5.509190246769707e-06, + "loss": 2.9294, + "step": 
7797 + }, + { + "epoch": 1.5627254509018036, + "grad_norm": 33.297536834470364, + "learning_rate": 5.508030409778177e-06, + "loss": 2.6721, + "step": 7798 + }, + { + "epoch": 1.5629258517034068, + "grad_norm": 17.90229170904712, + "learning_rate": 5.506870545164284e-06, + "loss": 2.8048, + "step": 7799 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 22.13560598219175, + "learning_rate": 5.505710652991091e-06, + "loss": 2.6306, + "step": 7800 + }, + { + "epoch": 1.5633266533066132, + "grad_norm": 33.370735946753975, + "learning_rate": 5.504550733321663e-06, + "loss": 2.2697, + "step": 7801 + }, + { + "epoch": 1.5635270541082165, + "grad_norm": 25.252088778104632, + "learning_rate": 5.503390786219068e-06, + "loss": 2.8389, + "step": 7802 + }, + { + "epoch": 1.5637274549098197, + "grad_norm": 25.94072425092646, + "learning_rate": 5.502230811746372e-06, + "loss": 2.6594, + "step": 7803 + }, + { + "epoch": 1.563927855711423, + "grad_norm": 18.988392768528307, + "learning_rate": 5.501070809966645e-06, + "loss": 2.3149, + "step": 7804 + }, + { + "epoch": 1.5641282565130261, + "grad_norm": 24.70818408550022, + "learning_rate": 5.499910780942959e-06, + "loss": 2.4146, + "step": 7805 + }, + { + "epoch": 1.5643286573146291, + "grad_norm": 29.342312713187148, + "learning_rate": 5.498750724738385e-06, + "loss": 2.7739, + "step": 7806 + }, + { + "epoch": 1.5645290581162326, + "grad_norm": 17.0273914202945, + "learning_rate": 5.497590641416e-06, + "loss": 2.5304, + "step": 7807 + }, + { + "epoch": 1.5647294589178355, + "grad_norm": 31.788128627376945, + "learning_rate": 5.496430531038876e-06, + "loss": 3.079, + "step": 7808 + }, + { + "epoch": 1.564929859719439, + "grad_norm": 27.474625265890612, + "learning_rate": 5.495270393670091e-06, + "loss": 3.2336, + "step": 7809 + }, + { + "epoch": 1.565130260521042, + "grad_norm": 43.22477483326028, + "learning_rate": 5.494110229372722e-06, + "loss": 3.5867, + "step": 7810 + }, + { + "epoch": 1.5653306613226454, + "grad_norm": 26.842287027216553, + "learning_rate": 5.492950038209852e-06, + "loss": 3.4657, + "step": 7811 + }, + { + "epoch": 1.5655310621242484, + "grad_norm": 43.94552368306622, + "learning_rate": 5.491789820244562e-06, + "loss": 2.6615, + "step": 7812 + }, + { + "epoch": 1.5657314629258519, + "grad_norm": 19.18044031591911, + "learning_rate": 5.490629575539932e-06, + "loss": 2.4011, + "step": 7813 + }, + { + "epoch": 1.5659318637274549, + "grad_norm": 19.91497578920851, + "learning_rate": 5.48946930415905e-06, + "loss": 2.6009, + "step": 7814 + }, + { + "epoch": 1.566132264529058, + "grad_norm": 22.33213163874701, + "learning_rate": 5.488309006164997e-06, + "loss": 2.4673, + "step": 7815 + }, + { + "epoch": 1.5663326653306613, + "grad_norm": 31.40585327264803, + "learning_rate": 5.487148681620862e-06, + "loss": 2.8337, + "step": 7816 + }, + { + "epoch": 1.5665330661322645, + "grad_norm": 34.670004275135305, + "learning_rate": 5.485988330589735e-06, + "loss": 2.0068, + "step": 7817 + }, + { + "epoch": 1.5667334669338677, + "grad_norm": 30.770705599519037, + "learning_rate": 5.484827953134706e-06, + "loss": 2.979, + "step": 7818 + }, + { + "epoch": 1.566933867735471, + "grad_norm": 51.039048237760056, + "learning_rate": 5.483667549318865e-06, + "loss": 2.9851, + "step": 7819 + }, + { + "epoch": 1.5671342685370742, + "grad_norm": 21.239862602663848, + "learning_rate": 5.482507119205304e-06, + "loss": 2.4111, + "step": 7820 + }, + { + "epoch": 1.5673346693386774, + "grad_norm": 24.935456614024307, + "learning_rate": 5.481346662857121e-06, + 
"loss": 2.6637, + "step": 7821 + }, + { + "epoch": 1.5675350701402806, + "grad_norm": 43.24798488304068, + "learning_rate": 5.480186180337409e-06, + "loss": 2.3441, + "step": 7822 + }, + { + "epoch": 1.5677354709418838, + "grad_norm": 21.95495729527726, + "learning_rate": 5.479025671709263e-06, + "loss": 2.5758, + "step": 7823 + }, + { + "epoch": 1.567935871743487, + "grad_norm": 28.38171007495174, + "learning_rate": 5.477865137035787e-06, + "loss": 3.2495, + "step": 7824 + }, + { + "epoch": 1.56813627254509, + "grad_norm": 20.99019897971509, + "learning_rate": 5.4767045763800765e-06, + "loss": 2.5576, + "step": 7825 + }, + { + "epoch": 1.5683366733466935, + "grad_norm": 22.798601607169036, + "learning_rate": 5.475543989805237e-06, + "loss": 2.5032, + "step": 7826 + }, + { + "epoch": 1.5685370741482965, + "grad_norm": 25.443399635816245, + "learning_rate": 5.474383377374366e-06, + "loss": 2.9673, + "step": 7827 + }, + { + "epoch": 1.5687374749499, + "grad_norm": 23.556197188494384, + "learning_rate": 5.473222739150571e-06, + "loss": 2.7849, + "step": 7828 + }, + { + "epoch": 1.568937875751503, + "grad_norm": 26.274558357598718, + "learning_rate": 5.4720620751969574e-06, + "loss": 2.4817, + "step": 7829 + }, + { + "epoch": 1.5691382765531063, + "grad_norm": 28.152790542199966, + "learning_rate": 5.470901385576631e-06, + "loss": 2.6829, + "step": 7830 + }, + { + "epoch": 1.5693386773547093, + "grad_norm": 23.657150894873315, + "learning_rate": 5.4697406703527035e-06, + "loss": 2.7783, + "step": 7831 + }, + { + "epoch": 1.5695390781563128, + "grad_norm": 27.629973851255738, + "learning_rate": 5.468579929588281e-06, + "loss": 2.6052, + "step": 7832 + }, + { + "epoch": 1.5697394789579158, + "grad_norm": 22.8348331743492, + "learning_rate": 5.467419163346476e-06, + "loss": 3.0874, + "step": 7833 + }, + { + "epoch": 1.569939879759519, + "grad_norm": 37.620095243468576, + "learning_rate": 5.466258371690402e-06, + "loss": 2.3673, + "step": 7834 + }, + { + "epoch": 1.5701402805611222, + "grad_norm": 23.484714266846748, + "learning_rate": 5.465097554683171e-06, + "loss": 2.8238, + "step": 7835 + }, + { + "epoch": 1.5703406813627254, + "grad_norm": 22.14929645082198, + "learning_rate": 5.463936712387899e-06, + "loss": 2.6104, + "step": 7836 + }, + { + "epoch": 1.5705410821643286, + "grad_norm": 28.632834707224752, + "learning_rate": 5.462775844867704e-06, + "loss": 2.8689, + "step": 7837 + }, + { + "epoch": 1.5707414829659319, + "grad_norm": 24.862846464934265, + "learning_rate": 5.461614952185703e-06, + "loss": 2.5415, + "step": 7838 + }, + { + "epoch": 1.570941883767535, + "grad_norm": 40.66783481169248, + "learning_rate": 5.460454034405016e-06, + "loss": 2.9085, + "step": 7839 + }, + { + "epoch": 1.5711422845691383, + "grad_norm": 17.280097112282416, + "learning_rate": 5.459293091588763e-06, + "loss": 2.7146, + "step": 7840 + }, + { + "epoch": 1.5713426853707415, + "grad_norm": 29.96416075073006, + "learning_rate": 5.458132123800066e-06, + "loss": 2.8194, + "step": 7841 + }, + { + "epoch": 1.5715430861723447, + "grad_norm": 22.577717751066437, + "learning_rate": 5.4569711311020505e-06, + "loss": 1.838, + "step": 7842 + }, + { + "epoch": 1.571743486973948, + "grad_norm": 26.644810650902645, + "learning_rate": 5.455810113557839e-06, + "loss": 2.7424, + "step": 7843 + }, + { + "epoch": 1.571943887775551, + "grad_norm": 48.287264246264044, + "learning_rate": 5.45464907123056e-06, + "loss": 2.7719, + "step": 7844 + }, + { + "epoch": 1.5721442885771544, + "grad_norm": 78.39570781331545, + 
"learning_rate": 5.453488004183339e-06, + "loss": 3.7107, + "step": 7845 + }, + { + "epoch": 1.5723446893787574, + "grad_norm": 31.4612177346572, + "learning_rate": 5.452326912479307e-06, + "loss": 3.0391, + "step": 7846 + }, + { + "epoch": 1.5725450901803608, + "grad_norm": 20.683710717646605, + "learning_rate": 5.451165796181591e-06, + "loss": 3.2052, + "step": 7847 + }, + { + "epoch": 1.5727454909819638, + "grad_norm": 19.674323511011217, + "learning_rate": 5.450004655353326e-06, + "loss": 2.5555, + "step": 7848 + }, + { + "epoch": 1.5729458917835673, + "grad_norm": 20.478481487340176, + "learning_rate": 5.4488434900576425e-06, + "loss": 2.5045, + "step": 7849 + }, + { + "epoch": 1.5731462925851702, + "grad_norm": 25.370889303527328, + "learning_rate": 5.4476823003576776e-06, + "loss": 3.0943, + "step": 7850 + }, + { + "epoch": 1.5733466933867737, + "grad_norm": 21.555594766115107, + "learning_rate": 5.446521086316564e-06, + "loss": 2.0037, + "step": 7851 + }, + { + "epoch": 1.5735470941883767, + "grad_norm": 23.315835250416807, + "learning_rate": 5.44535984799744e-06, + "loss": 2.6402, + "step": 7852 + }, + { + "epoch": 1.5737474949899801, + "grad_norm": 29.11140613929868, + "learning_rate": 5.444198585463444e-06, + "loss": 2.8187, + "step": 7853 + }, + { + "epoch": 1.5739478957915831, + "grad_norm": 22.048115854303695, + "learning_rate": 5.443037298777715e-06, + "loss": 2.4718, + "step": 7854 + }, + { + "epoch": 1.5741482965931863, + "grad_norm": 24.928523871002184, + "learning_rate": 5.441875988003393e-06, + "loss": 3.2524, + "step": 7855 + }, + { + "epoch": 1.5743486973947896, + "grad_norm": 21.904374823177914, + "learning_rate": 5.440714653203623e-06, + "loss": 2.6508, + "step": 7856 + }, + { + "epoch": 1.5745490981963928, + "grad_norm": 26.85433330867086, + "learning_rate": 5.439553294441546e-06, + "loss": 2.4681, + "step": 7857 + }, + { + "epoch": 1.574749498997996, + "grad_norm": 29.583558330026534, + "learning_rate": 5.438391911780308e-06, + "loss": 3.1598, + "step": 7858 + }, + { + "epoch": 1.5749498997995992, + "grad_norm": 23.1688360922413, + "learning_rate": 5.437230505283054e-06, + "loss": 2.4639, + "step": 7859 + }, + { + "epoch": 1.5751503006012024, + "grad_norm": 19.66882908980324, + "learning_rate": 5.4360690750129334e-06, + "loss": 2.7362, + "step": 7860 + }, + { + "epoch": 1.5753507014028056, + "grad_norm": 21.24400423804388, + "learning_rate": 5.434907621033091e-06, + "loss": 2.5505, + "step": 7861 + }, + { + "epoch": 1.5755511022044089, + "grad_norm": 20.98037556954598, + "learning_rate": 5.43374614340668e-06, + "loss": 2.4791, + "step": 7862 + }, + { + "epoch": 1.575751503006012, + "grad_norm": 15.760234436167204, + "learning_rate": 5.432584642196853e-06, + "loss": 2.4012, + "step": 7863 + }, + { + "epoch": 1.5759519038076153, + "grad_norm": 42.40843527954633, + "learning_rate": 5.4314231174667585e-06, + "loss": 2.6769, + "step": 7864 + }, + { + "epoch": 1.5761523046092183, + "grad_norm": 32.046611234414094, + "learning_rate": 5.430261569279552e-06, + "loss": 2.5135, + "step": 7865 + }, + { + "epoch": 1.5763527054108217, + "grad_norm": 20.936693087645914, + "learning_rate": 5.429099997698391e-06, + "loss": 2.6488, + "step": 7866 + }, + { + "epoch": 1.5765531062124247, + "grad_norm": 29.865689499296288, + "learning_rate": 5.427938402786427e-06, + "loss": 2.8089, + "step": 7867 + }, + { + "epoch": 1.5767535070140282, + "grad_norm": 23.532984650023153, + "learning_rate": 5.426776784606822e-06, + "loss": 3.1214, + "step": 7868 + }, + { + "epoch": 1.5769539078156312, + 
"grad_norm": 29.062547151705193, + "learning_rate": 5.425615143222733e-06, + "loss": 2.7515, + "step": 7869 + }, + { + "epoch": 1.5771543086172346, + "grad_norm": 22.168357910925252, + "learning_rate": 5.424453478697321e-06, + "loss": 3.177, + "step": 7870 + }, + { + "epoch": 1.5773547094188376, + "grad_norm": 24.7165482753406, + "learning_rate": 5.423291791093747e-06, + "loss": 2.8848, + "step": 7871 + }, + { + "epoch": 1.577555110220441, + "grad_norm": 20.488315137386834, + "learning_rate": 5.422130080475172e-06, + "loss": 2.8408, + "step": 7872 + }, + { + "epoch": 1.577755511022044, + "grad_norm": 24.637513218518777, + "learning_rate": 5.420968346904761e-06, + "loss": 2.5675, + "step": 7873 + }, + { + "epoch": 1.5779559118236473, + "grad_norm": 20.710641975138195, + "learning_rate": 5.4198065904456805e-06, + "loss": 3.1701, + "step": 7874 + }, + { + "epoch": 1.5781563126252505, + "grad_norm": 35.1492655900189, + "learning_rate": 5.4186448111610965e-06, + "loss": 2.3856, + "step": 7875 + }, + { + "epoch": 1.5783567134268537, + "grad_norm": 31.514548611481946, + "learning_rate": 5.4174830091141765e-06, + "loss": 3.0405, + "step": 7876 + }, + { + "epoch": 1.578557114228457, + "grad_norm": 26.89052627462087, + "learning_rate": 5.416321184368087e-06, + "loss": 3.0708, + "step": 7877 + }, + { + "epoch": 1.5787575150300601, + "grad_norm": 25.52475032103583, + "learning_rate": 5.415159336986002e-06, + "loss": 2.8215, + "step": 7878 + }, + { + "epoch": 1.5789579158316633, + "grad_norm": 19.23062325542604, + "learning_rate": 5.413997467031089e-06, + "loss": 3.1434, + "step": 7879 + }, + { + "epoch": 1.5791583166332666, + "grad_norm": 25.00180712860546, + "learning_rate": 5.412835574566525e-06, + "loss": 2.9862, + "step": 7880 + }, + { + "epoch": 1.5793587174348698, + "grad_norm": 28.085131249195108, + "learning_rate": 5.411673659655481e-06, + "loss": 3.0925, + "step": 7881 + }, + { + "epoch": 1.579559118236473, + "grad_norm": 18.767527433765256, + "learning_rate": 5.410511722361132e-06, + "loss": 2.6386, + "step": 7882 + }, + { + "epoch": 1.5797595190380762, + "grad_norm": 30.645491368405917, + "learning_rate": 5.409349762746654e-06, + "loss": 2.5922, + "step": 7883 + }, + { + "epoch": 1.5799599198396792, + "grad_norm": 25.086056191085564, + "learning_rate": 5.408187780875226e-06, + "loss": 2.7458, + "step": 7884 + }, + { + "epoch": 1.5801603206412826, + "grad_norm": 20.937532464781658, + "learning_rate": 5.407025776810026e-06, + "loss": 2.5627, + "step": 7885 + }, + { + "epoch": 1.5803607214428856, + "grad_norm": 30.15447339245237, + "learning_rate": 5.405863750614232e-06, + "loss": 3.2934, + "step": 7886 + }, + { + "epoch": 1.580561122244489, + "grad_norm": 35.0944487780239, + "learning_rate": 5.404701702351027e-06, + "loss": 3.1948, + "step": 7887 + }, + { + "epoch": 1.580761523046092, + "grad_norm": 18.86628069979619, + "learning_rate": 5.403539632083595e-06, + "loss": 2.6883, + "step": 7888 + }, + { + "epoch": 1.5809619238476955, + "grad_norm": 32.776161023842675, + "learning_rate": 5.402377539875116e-06, + "loss": 3.2286, + "step": 7889 + }, + { + "epoch": 1.5811623246492985, + "grad_norm": 24.87674171644354, + "learning_rate": 5.401215425788779e-06, + "loss": 3.2534, + "step": 7890 + }, + { + "epoch": 1.581362725450902, + "grad_norm": 27.618615348299564, + "learning_rate": 5.400053289887763e-06, + "loss": 2.8177, + "step": 7891 + }, + { + "epoch": 1.581563126252505, + "grad_norm": 24.559967923780995, + "learning_rate": 5.398891132235261e-06, + "loss": 2.8953, + "step": 7892 + }, + { + 
"epoch": 1.5817635270541082, + "grad_norm": 33.53963559313394, + "learning_rate": 5.397728952894459e-06, + "loss": 3.2281, + "step": 7893 + }, + { + "epoch": 1.5819639278557114, + "grad_norm": 19.114805330809318, + "learning_rate": 5.396566751928547e-06, + "loss": 2.4793, + "step": 7894 + }, + { + "epoch": 1.5821643286573146, + "grad_norm": 23.446023524159944, + "learning_rate": 5.395404529400716e-06, + "loss": 2.9615, + "step": 7895 + }, + { + "epoch": 1.5823647294589178, + "grad_norm": 25.3246076706644, + "learning_rate": 5.394242285374156e-06, + "loss": 2.6009, + "step": 7896 + }, + { + "epoch": 1.582565130260521, + "grad_norm": 18.58267191361247, + "learning_rate": 5.393080019912061e-06, + "loss": 2.5231, + "step": 7897 + }, + { + "epoch": 1.5827655310621243, + "grad_norm": 25.605917191423767, + "learning_rate": 5.391917733077625e-06, + "loss": 2.5693, + "step": 7898 + }, + { + "epoch": 1.5829659318637275, + "grad_norm": 20.611929438632476, + "learning_rate": 5.390755424934043e-06, + "loss": 2.9153, + "step": 7899 + }, + { + "epoch": 1.5831663326653307, + "grad_norm": 19.81089818509602, + "learning_rate": 5.389593095544514e-06, + "loss": 3.2131, + "step": 7900 + }, + { + "epoch": 1.583366733466934, + "grad_norm": 21.201710992478738, + "learning_rate": 5.388430744972231e-06, + "loss": 2.8229, + "step": 7901 + }, + { + "epoch": 1.5835671342685371, + "grad_norm": 36.3273469202214, + "learning_rate": 5.387268373280396e-06, + "loss": 3.4045, + "step": 7902 + }, + { + "epoch": 1.5837675350701401, + "grad_norm": 17.620685525437363, + "learning_rate": 5.386105980532208e-06, + "loss": 2.6844, + "step": 7903 + }, + { + "epoch": 1.5839679358717436, + "grad_norm": 35.30300947725549, + "learning_rate": 5.384943566790868e-06, + "loss": 2.5956, + "step": 7904 + }, + { + "epoch": 1.5841683366733466, + "grad_norm": 39.385549122936496, + "learning_rate": 5.383781132119576e-06, + "loss": 3.2958, + "step": 7905 + }, + { + "epoch": 1.58436873747495, + "grad_norm": 27.770900964417123, + "learning_rate": 5.382618676581539e-06, + "loss": 3.1442, + "step": 7906 + }, + { + "epoch": 1.584569138276553, + "grad_norm": 19.680777250732884, + "learning_rate": 5.381456200239961e-06, + "loss": 2.8945, + "step": 7907 + }, + { + "epoch": 1.5847695390781564, + "grad_norm": 31.877714639752597, + "learning_rate": 5.380293703158045e-06, + "loss": 3.1457, + "step": 7908 + }, + { + "epoch": 1.5849699398797594, + "grad_norm": 32.542896508468125, + "learning_rate": 5.379131185398998e-06, + "loss": 2.7495, + "step": 7909 + }, + { + "epoch": 1.5851703406813629, + "grad_norm": 21.06764455452025, + "learning_rate": 5.37796864702603e-06, + "loss": 2.7378, + "step": 7910 + }, + { + "epoch": 1.5853707414829659, + "grad_norm": 20.403875078228698, + "learning_rate": 5.376806088102348e-06, + "loss": 2.6457, + "step": 7911 + }, + { + "epoch": 1.5855711422845693, + "grad_norm": 29.63066751295606, + "learning_rate": 5.375643508691164e-06, + "loss": 2.8298, + "step": 7912 + }, + { + "epoch": 1.5857715430861723, + "grad_norm": 48.701927689739684, + "learning_rate": 5.374480908855687e-06, + "loss": 2.951, + "step": 7913 + }, + { + "epoch": 1.5859719438877755, + "grad_norm": 21.335753496261294, + "learning_rate": 5.373318288659131e-06, + "loss": 2.4032, + "step": 7914 + }, + { + "epoch": 1.5861723446893787, + "grad_norm": 31.60263265827556, + "learning_rate": 5.372155648164707e-06, + "loss": 2.5832, + "step": 7915 + }, + { + "epoch": 1.586372745490982, + "grad_norm": 27.391892443838355, + "learning_rate": 5.370992987435632e-06, + "loss": 3.411, 
+ "step": 7916 + }, + { + "epoch": 1.5865731462925852, + "grad_norm": 33.186873727320496, + "learning_rate": 5.369830306535121e-06, + "loss": 2.5343, + "step": 7917 + }, + { + "epoch": 1.5867735470941884, + "grad_norm": 21.322024932174646, + "learning_rate": 5.3686676055263895e-06, + "loss": 2.7382, + "step": 7918 + }, + { + "epoch": 1.5869739478957916, + "grad_norm": 27.434680529341318, + "learning_rate": 5.367504884472657e-06, + "loss": 2.7643, + "step": 7919 + }, + { + "epoch": 1.5871743486973948, + "grad_norm": 29.219931406460784, + "learning_rate": 5.366342143437142e-06, + "loss": 2.3677, + "step": 7920 + }, + { + "epoch": 1.587374749498998, + "grad_norm": 24.730763254100193, + "learning_rate": 5.365179382483062e-06, + "loss": 2.7172, + "step": 7921 + }, + { + "epoch": 1.5875751503006013, + "grad_norm": 48.68198159805373, + "learning_rate": 5.36401660167364e-06, + "loss": 2.919, + "step": 7922 + }, + { + "epoch": 1.5877755511022045, + "grad_norm": 34.27992057827866, + "learning_rate": 5.3628538010721e-06, + "loss": 2.3293, + "step": 7923 + }, + { + "epoch": 1.5879759519038075, + "grad_norm": 27.136341750193946, + "learning_rate": 5.361690980741663e-06, + "loss": 2.9969, + "step": 7924 + }, + { + "epoch": 1.588176352705411, + "grad_norm": 25.005349807618263, + "learning_rate": 5.360528140745551e-06, + "loss": 2.5731, + "step": 7925 + }, + { + "epoch": 1.588376753507014, + "grad_norm": 38.738390356370694, + "learning_rate": 5.359365281146993e-06, + "loss": 3.2498, + "step": 7926 + }, + { + "epoch": 1.5885771543086173, + "grad_norm": 22.08473895086645, + "learning_rate": 5.358202402009216e-06, + "loss": 2.8191, + "step": 7927 + }, + { + "epoch": 1.5887775551102203, + "grad_norm": 21.05956694414721, + "learning_rate": 5.357039503395444e-06, + "loss": 2.6091, + "step": 7928 + }, + { + "epoch": 1.5889779559118238, + "grad_norm": 44.07936625038269, + "learning_rate": 5.3558765853689065e-06, + "loss": 2.8941, + "step": 7929 + }, + { + "epoch": 1.5891783567134268, + "grad_norm": 21.703652197195282, + "learning_rate": 5.354713647992835e-06, + "loss": 3.0948, + "step": 7930 + }, + { + "epoch": 1.5893787575150302, + "grad_norm": 24.44771066015439, + "learning_rate": 5.353550691330458e-06, + "loss": 3.199, + "step": 7931 + }, + { + "epoch": 1.5895791583166332, + "grad_norm": 26.61234624600953, + "learning_rate": 5.3523877154450085e-06, + "loss": 2.5438, + "step": 7932 + }, + { + "epoch": 1.5897795591182364, + "grad_norm": 25.10675090862484, + "learning_rate": 5.3512247203997195e-06, + "loss": 2.7256, + "step": 7933 + }, + { + "epoch": 1.5899799599198396, + "grad_norm": 29.329168841642257, + "learning_rate": 5.350061706257825e-06, + "loss": 2.9314, + "step": 7934 + }, + { + "epoch": 1.5901803607214429, + "grad_norm": 29.039732185657115, + "learning_rate": 5.348898673082555e-06, + "loss": 3.0477, + "step": 7935 + }, + { + "epoch": 1.590380761523046, + "grad_norm": 18.4686707135828, + "learning_rate": 5.34773562093715e-06, + "loss": 2.852, + "step": 7936 + }, + { + "epoch": 1.5905811623246493, + "grad_norm": 23.13658749932922, + "learning_rate": 5.346572549884848e-06, + "loss": 2.6339, + "step": 7937 + }, + { + "epoch": 1.5907815631262525, + "grad_norm": 17.917839359456188, + "learning_rate": 5.345409459988884e-06, + "loss": 2.9051, + "step": 7938 + }, + { + "epoch": 1.5909819639278557, + "grad_norm": 22.46897505971909, + "learning_rate": 5.344246351312499e-06, + "loss": 2.748, + "step": 7939 + }, + { + "epoch": 1.591182364729459, + "grad_norm": 20.567261175757054, + "learning_rate": 
5.34308322391893e-06, + "loss": 2.7901, + "step": 7940 + }, + { + "epoch": 1.5913827655310622, + "grad_norm": 21.34493933148705, + "learning_rate": 5.341920077871419e-06, + "loss": 2.3202, + "step": 7941 + }, + { + "epoch": 1.5915831663326654, + "grad_norm": 27.690278928951006, + "learning_rate": 5.340756913233211e-06, + "loss": 3.3586, + "step": 7942 + }, + { + "epoch": 1.5917835671342684, + "grad_norm": 23.272190282502784, + "learning_rate": 5.339593730067544e-06, + "loss": 3.0701, + "step": 7943 + }, + { + "epoch": 1.5919839679358718, + "grad_norm": 24.725239973786255, + "learning_rate": 5.338430528437667e-06, + "loss": 2.7417, + "step": 7944 + }, + { + "epoch": 1.5921843687374748, + "grad_norm": 22.617688289353403, + "learning_rate": 5.3372673084068216e-06, + "loss": 2.8492, + "step": 7945 + }, + { + "epoch": 1.5923847695390783, + "grad_norm": 26.871286319236674, + "learning_rate": 5.336104070038255e-06, + "loss": 2.6834, + "step": 7946 + }, + { + "epoch": 1.5925851703406813, + "grad_norm": 22.32866226947402, + "learning_rate": 5.334940813395214e-06, + "loss": 2.5844, + "step": 7947 + }, + { + "epoch": 1.5927855711422847, + "grad_norm": 22.528435927679965, + "learning_rate": 5.333777538540945e-06, + "loss": 2.7218, + "step": 7948 + }, + { + "epoch": 1.5929859719438877, + "grad_norm": 25.809480019732458, + "learning_rate": 5.3326142455387006e-06, + "loss": 3.1146, + "step": 7949 + }, + { + "epoch": 1.5931863727454911, + "grad_norm": 27.30197794431254, + "learning_rate": 5.3314509344517275e-06, + "loss": 3.0957, + "step": 7950 + }, + { + "epoch": 1.5933867735470941, + "grad_norm": 17.482665428526854, + "learning_rate": 5.33028760534328e-06, + "loss": 2.9728, + "step": 7951 + }, + { + "epoch": 1.5935871743486973, + "grad_norm": 60.867179644878085, + "learning_rate": 5.3291242582766054e-06, + "loss": 3.375, + "step": 7952 + }, + { + "epoch": 1.5937875751503006, + "grad_norm": 20.466228327739806, + "learning_rate": 5.32796089331496e-06, + "loss": 3.1879, + "step": 7953 + }, + { + "epoch": 1.5939879759519038, + "grad_norm": 34.99623223011972, + "learning_rate": 5.326797510521595e-06, + "loss": 2.1963, + "step": 7954 + }, + { + "epoch": 1.594188376753507, + "grad_norm": 27.951287665396897, + "learning_rate": 5.325634109959768e-06, + "loss": 2.8214, + "step": 7955 + }, + { + "epoch": 1.5943887775551102, + "grad_norm": 28.347760194712524, + "learning_rate": 5.324470691692736e-06, + "loss": 3.2246, + "step": 7956 + }, + { + "epoch": 1.5945891783567134, + "grad_norm": 31.11546920716012, + "learning_rate": 5.323307255783752e-06, + "loss": 2.9999, + "step": 7957 + }, + { + "epoch": 1.5947895791583167, + "grad_norm": 18.998000858648226, + "learning_rate": 5.322143802296075e-06, + "loss": 2.2815, + "step": 7958 + }, + { + "epoch": 1.5949899799599199, + "grad_norm": 24.301354080181525, + "learning_rate": 5.3209803312929644e-06, + "loss": 2.7076, + "step": 7959 + }, + { + "epoch": 1.595190380761523, + "grad_norm": 27.001809275306595, + "learning_rate": 5.3198168428376785e-06, + "loss": 3.1516, + "step": 7960 + }, + { + "epoch": 1.5953907815631263, + "grad_norm": 31.455932533833266, + "learning_rate": 5.318653336993479e-06, + "loss": 2.6128, + "step": 7961 + }, + { + "epoch": 1.5955911823647293, + "grad_norm": 21.374052387619628, + "learning_rate": 5.317489813823627e-06, + "loss": 2.7234, + "step": 7962 + }, + { + "epoch": 1.5957915831663327, + "grad_norm": 34.48265552285679, + "learning_rate": 5.316326273391387e-06, + "loss": 2.477, + "step": 7963 + }, + { + "epoch": 1.5959919839679357, + "grad_norm": 
27.222785913968917, + "learning_rate": 5.315162715760018e-06, + "loss": 2.5727, + "step": 7964 + }, + { + "epoch": 1.5961923847695392, + "grad_norm": 55.39329879508129, + "learning_rate": 5.313999140992788e-06, + "loss": 2.4656, + "step": 7965 + }, + { + "epoch": 1.5963927855711422, + "grad_norm": 20.676039341263674, + "learning_rate": 5.31283554915296e-06, + "loss": 2.6169, + "step": 7966 + }, + { + "epoch": 1.5965931863727456, + "grad_norm": 25.026074696224235, + "learning_rate": 5.311671940303803e-06, + "loss": 2.4228, + "step": 7967 + }, + { + "epoch": 1.5967935871743486, + "grad_norm": 39.9561358581406, + "learning_rate": 5.310508314508584e-06, + "loss": 2.3113, + "step": 7968 + }, + { + "epoch": 1.596993987975952, + "grad_norm": 24.37369230406169, + "learning_rate": 5.309344671830567e-06, + "loss": 3.0487, + "step": 7969 + }, + { + "epoch": 1.597194388777555, + "grad_norm": 19.81356460139057, + "learning_rate": 5.308181012333023e-06, + "loss": 2.5902, + "step": 7970 + }, + { + "epoch": 1.5973947895791585, + "grad_norm": 31.860288201337973, + "learning_rate": 5.307017336079225e-06, + "loss": 3.3382, + "step": 7971 + }, + { + "epoch": 1.5975951903807615, + "grad_norm": 69.72951623518024, + "learning_rate": 5.30585364313244e-06, + "loss": 2.7461, + "step": 7972 + }, + { + "epoch": 1.5977955911823647, + "grad_norm": 25.982158870688554, + "learning_rate": 5.304689933555941e-06, + "loss": 2.5385, + "step": 7973 + }, + { + "epoch": 1.597995991983968, + "grad_norm": 28.132414193234638, + "learning_rate": 5.303526207413e-06, + "loss": 2.5241, + "step": 7974 + }, + { + "epoch": 1.5981963927855711, + "grad_norm": 35.896109659026436, + "learning_rate": 5.302362464766891e-06, + "loss": 3.0067, + "step": 7975 + }, + { + "epoch": 1.5983967935871743, + "grad_norm": 34.280706427504846, + "learning_rate": 5.30119870568089e-06, + "loss": 2.8958, + "step": 7976 + }, + { + "epoch": 1.5985971943887776, + "grad_norm": 30.498220949634035, + "learning_rate": 5.30003493021827e-06, + "loss": 2.8048, + "step": 7977 + }, + { + "epoch": 1.5987975951903808, + "grad_norm": 19.652313979521914, + "learning_rate": 5.298871138442307e-06, + "loss": 2.9941, + "step": 7978 + }, + { + "epoch": 1.598997995991984, + "grad_norm": 25.397188234882744, + "learning_rate": 5.29770733041628e-06, + "loss": 2.7564, + "step": 7979 + }, + { + "epoch": 1.5991983967935872, + "grad_norm": 21.07131171263571, + "learning_rate": 5.296543506203465e-06, + "loss": 2.8609, + "step": 7980 + }, + { + "epoch": 1.5993987975951904, + "grad_norm": 26.698116858869682, + "learning_rate": 5.2953796658671445e-06, + "loss": 2.8736, + "step": 7981 + }, + { + "epoch": 1.5995991983967937, + "grad_norm": 23.076865190842028, + "learning_rate": 5.294215809470593e-06, + "loss": 2.2697, + "step": 7982 + }, + { + "epoch": 1.5997995991983966, + "grad_norm": 23.663785660626075, + "learning_rate": 5.293051937077095e-06, + "loss": 2.6644, + "step": 7983 + }, + { + "epoch": 1.6, + "grad_norm": 25.230465616329322, + "learning_rate": 5.291888048749929e-06, + "loss": 2.7681, + "step": 7984 + }, + { + "epoch": 1.600200400801603, + "grad_norm": 21.407345445725973, + "learning_rate": 5.290724144552379e-06, + "loss": 2.8272, + "step": 7985 + }, + { + "epoch": 1.6004008016032065, + "grad_norm": 20.07630670779727, + "learning_rate": 5.2895602245477295e-06, + "loss": 2.8752, + "step": 7986 + }, + { + "epoch": 1.6006012024048095, + "grad_norm": 21.976864364912583, + "learning_rate": 5.288396288799262e-06, + "loss": 2.2733, + "step": 7987 + }, + { + "epoch": 1.600801603206413, + 
"grad_norm": 29.47058965278208, + "learning_rate": 5.287232337370264e-06, + "loss": 2.8772, + "step": 7988 + }, + { + "epoch": 1.601002004008016, + "grad_norm": 65.93987769302153, + "learning_rate": 5.286068370324019e-06, + "loss": 3.4368, + "step": 7989 + }, + { + "epoch": 1.6012024048096194, + "grad_norm": 25.680292104790663, + "learning_rate": 5.284904387723815e-06, + "loss": 2.7608, + "step": 7990 + }, + { + "epoch": 1.6014028056112224, + "grad_norm": 25.95266430335895, + "learning_rate": 5.2837403896329375e-06, + "loss": 2.5724, + "step": 7991 + }, + { + "epoch": 1.6016032064128256, + "grad_norm": 27.932216220148632, + "learning_rate": 5.282576376114676e-06, + "loss": 2.3718, + "step": 7992 + }, + { + "epoch": 1.6018036072144288, + "grad_norm": 19.74236319448849, + "learning_rate": 5.281412347232322e-06, + "loss": 2.7483, + "step": 7993 + }, + { + "epoch": 1.602004008016032, + "grad_norm": 23.381317328125135, + "learning_rate": 5.280248303049161e-06, + "loss": 3.0562, + "step": 7994 + }, + { + "epoch": 1.6022044088176353, + "grad_norm": 26.981710145621232, + "learning_rate": 5.279084243628488e-06, + "loss": 2.4603, + "step": 7995 + }, + { + "epoch": 1.6024048096192385, + "grad_norm": 32.40672423005715, + "learning_rate": 5.277920169033592e-06, + "loss": 2.8358, + "step": 7996 + }, + { + "epoch": 1.6026052104208417, + "grad_norm": 26.101520344463303, + "learning_rate": 5.276756079327766e-06, + "loss": 3.0253, + "step": 7997 + }, + { + "epoch": 1.602805611222445, + "grad_norm": 30.61489563920035, + "learning_rate": 5.275591974574303e-06, + "loss": 2.8965, + "step": 7998 + }, + { + "epoch": 1.6030060120240481, + "grad_norm": 35.042267972102195, + "learning_rate": 5.274427854836498e-06, + "loss": 2.2458, + "step": 7999 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 28.96615736406023, + "learning_rate": 5.273263720177646e-06, + "loss": 2.6016, + "step": 8000 + }, + { + "epoch": 1.6034068136272546, + "grad_norm": 25.146009918417747, + "learning_rate": 5.272099570661041e-06, + "loss": 2.2553, + "step": 8001 + }, + { + "epoch": 1.6036072144288576, + "grad_norm": 35.34025057203723, + "learning_rate": 5.270935406349981e-06, + "loss": 3.4525, + "step": 8002 + }, + { + "epoch": 1.603807615230461, + "grad_norm": 31.78509057883017, + "learning_rate": 5.269771227307764e-06, + "loss": 3.0775, + "step": 8003 + }, + { + "epoch": 1.604008016032064, + "grad_norm": 21.943452919170344, + "learning_rate": 5.268607033597686e-06, + "loss": 2.7404, + "step": 8004 + }, + { + "epoch": 1.6042084168336674, + "grad_norm": 32.213859208304974, + "learning_rate": 5.2674428252830475e-06, + "loss": 2.4696, + "step": 8005 + }, + { + "epoch": 1.6044088176352704, + "grad_norm": 30.75275023034917, + "learning_rate": 5.266278602427148e-06, + "loss": 2.6252, + "step": 8006 + }, + { + "epoch": 1.6046092184368739, + "grad_norm": 20.70678070209232, + "learning_rate": 5.265114365093288e-06, + "loss": 2.1016, + "step": 8007 + }, + { + "epoch": 1.6048096192384769, + "grad_norm": 18.8698415413841, + "learning_rate": 5.2639501133447675e-06, + "loss": 2.6961, + "step": 8008 + }, + { + "epoch": 1.6050100200400803, + "grad_norm": 23.870595074807405, + "learning_rate": 5.2627858472448915e-06, + "loss": 2.7964, + "step": 8009 + }, + { + "epoch": 1.6052104208416833, + "grad_norm": 20.19236944854119, + "learning_rate": 5.2616215668569596e-06, + "loss": 2.3205, + "step": 8010 + }, + { + "epoch": 1.6054108216432865, + "grad_norm": 29.36836952103527, + "learning_rate": 5.260457272244277e-06, + "loss": 2.7586, + "step": 8011 + }, + { + 
"epoch": 1.6056112224448897, + "grad_norm": 49.87066347087974, + "learning_rate": 5.259292963470149e-06, + "loss": 3.1487, + "step": 8012 + }, + { + "epoch": 1.605811623246493, + "grad_norm": 23.033020905494933, + "learning_rate": 5.258128640597879e-06, + "loss": 3.23, + "step": 8013 + }, + { + "epoch": 1.6060120240480962, + "grad_norm": 26.33667844262718, + "learning_rate": 5.256964303690774e-06, + "loss": 3.0927, + "step": 8014 + }, + { + "epoch": 1.6062124248496994, + "grad_norm": 37.31956864061414, + "learning_rate": 5.255799952812141e-06, + "loss": 3.3425, + "step": 8015 + }, + { + "epoch": 1.6064128256513026, + "grad_norm": 32.53358340569951, + "learning_rate": 5.254635588025286e-06, + "loss": 2.9429, + "step": 8016 + }, + { + "epoch": 1.6066132264529058, + "grad_norm": 25.02537810625494, + "learning_rate": 5.253471209393518e-06, + "loss": 2.7843, + "step": 8017 + }, + { + "epoch": 1.606813627254509, + "grad_norm": 24.75972387578307, + "learning_rate": 5.252306816980147e-06, + "loss": 2.2349, + "step": 8018 + }, + { + "epoch": 1.6070140280561123, + "grad_norm": 19.489339905675095, + "learning_rate": 5.251142410848481e-06, + "loss": 2.3996, + "step": 8019 + }, + { + "epoch": 1.6072144288577155, + "grad_norm": 18.02341586011864, + "learning_rate": 5.2499779910618334e-06, + "loss": 2.6721, + "step": 8020 + }, + { + "epoch": 1.6074148296593185, + "grad_norm": 26.232876190570938, + "learning_rate": 5.248813557683512e-06, + "loss": 2.7771, + "step": 8021 + }, + { + "epoch": 1.607615230460922, + "grad_norm": 21.862491397365336, + "learning_rate": 5.24764911077683e-06, + "loss": 3.0522, + "step": 8022 + }, + { + "epoch": 1.607815631262525, + "grad_norm": 25.389050276402013, + "learning_rate": 5.2464846504051e-06, + "loss": 2.8147, + "step": 8023 + }, + { + "epoch": 1.6080160320641284, + "grad_norm": 22.340896547646032, + "learning_rate": 5.245320176631637e-06, + "loss": 2.2255, + "step": 8024 + }, + { + "epoch": 1.6082164328657313, + "grad_norm": 18.736123178482288, + "learning_rate": 5.244155689519754e-06, + "loss": 2.5072, + "step": 8025 + }, + { + "epoch": 1.6084168336673348, + "grad_norm": 23.063088667067163, + "learning_rate": 5.2429911891327645e-06, + "loss": 2.9655, + "step": 8026 + }, + { + "epoch": 1.6086172344689378, + "grad_norm": 24.151342950992312, + "learning_rate": 5.241826675533986e-06, + "loss": 2.8443, + "step": 8027 + }, + { + "epoch": 1.6088176352705412, + "grad_norm": 21.913094221105254, + "learning_rate": 5.240662148786735e-06, + "loss": 2.5472, + "step": 8028 + }, + { + "epoch": 1.6090180360721442, + "grad_norm": 33.05009356863624, + "learning_rate": 5.239497608954326e-06, + "loss": 2.9553, + "step": 8029 + }, + { + "epoch": 1.6092184368737477, + "grad_norm": 31.493989446161333, + "learning_rate": 5.238333056100079e-06, + "loss": 2.6326, + "step": 8030 + }, + { + "epoch": 1.6094188376753507, + "grad_norm": 26.103204177825816, + "learning_rate": 5.237168490287312e-06, + "loss": 2.7512, + "step": 8031 + }, + { + "epoch": 1.6096192384769539, + "grad_norm": 29.946862324271525, + "learning_rate": 5.236003911579345e-06, + "loss": 2.2864, + "step": 8032 + }, + { + "epoch": 1.609819639278557, + "grad_norm": 26.815412926301, + "learning_rate": 5.234839320039495e-06, + "loss": 2.7421, + "step": 8033 + }, + { + "epoch": 1.6100200400801603, + "grad_norm": 24.66046858643461, + "learning_rate": 5.233674715731087e-06, + "loss": 2.9537, + "step": 8034 + }, + { + "epoch": 1.6102204408817635, + "grad_norm": 22.846770320891064, + "learning_rate": 5.232510098717437e-06, + "loss": 2.8517, 
+ "step": 8035 + }, + { + "epoch": 1.6104208416833667, + "grad_norm": 31.248908204507373, + "learning_rate": 5.231345469061871e-06, + "loss": 2.2864, + "step": 8036 + }, + { + "epoch": 1.61062124248497, + "grad_norm": 22.911720645749654, + "learning_rate": 5.230180826827712e-06, + "loss": 2.576, + "step": 8037 + }, + { + "epoch": 1.6108216432865732, + "grad_norm": 25.603626554478435, + "learning_rate": 5.2290161720782794e-06, + "loss": 2.4036, + "step": 8038 + }, + { + "epoch": 1.6110220440881764, + "grad_norm": 18.460636472820866, + "learning_rate": 5.227851504876902e-06, + "loss": 2.9894, + "step": 8039 + }, + { + "epoch": 1.6112224448897794, + "grad_norm": 23.297185562138782, + "learning_rate": 5.2266868252869e-06, + "loss": 2.6267, + "step": 8040 + }, + { + "epoch": 1.6114228456913828, + "grad_norm": 22.470763020575955, + "learning_rate": 5.2255221333716e-06, + "loss": 3.2088, + "step": 8041 + }, + { + "epoch": 1.6116232464929858, + "grad_norm": 44.451081386699, + "learning_rate": 5.22435742919433e-06, + "loss": 2.7182, + "step": 8042 + }, + { + "epoch": 1.6118236472945893, + "grad_norm": 22.112049525527496, + "learning_rate": 5.223192712818414e-06, + "loss": 2.5096, + "step": 8043 + }, + { + "epoch": 1.6120240480961923, + "grad_norm": 26.060939443620864, + "learning_rate": 5.222027984307183e-06, + "loss": 3.0645, + "step": 8044 + }, + { + "epoch": 1.6122244488977957, + "grad_norm": 29.329706316078514, + "learning_rate": 5.220863243723962e-06, + "loss": 2.7663, + "step": 8045 + }, + { + "epoch": 1.6124248496993987, + "grad_norm": 33.76137667816358, + "learning_rate": 5.2196984911320794e-06, + "loss": 2.8738, + "step": 8046 + }, + { + "epoch": 1.6126252505010021, + "grad_norm": 44.995067096609134, + "learning_rate": 5.218533726594865e-06, + "loss": 3.0864, + "step": 8047 + }, + { + "epoch": 1.6128256513026051, + "grad_norm": 46.917761096797214, + "learning_rate": 5.2173689501756505e-06, + "loss": 2.6766, + "step": 8048 + }, + { + "epoch": 1.6130260521042086, + "grad_norm": 27.24421288823473, + "learning_rate": 5.216204161937766e-06, + "loss": 2.5953, + "step": 8049 + }, + { + "epoch": 1.6132264529058116, + "grad_norm": 52.60369639575935, + "learning_rate": 5.215039361944541e-06, + "loss": 3.1709, + "step": 8050 + }, + { + "epoch": 1.6134268537074148, + "grad_norm": 34.16319907105814, + "learning_rate": 5.213874550259308e-06, + "loss": 2.6234, + "step": 8051 + }, + { + "epoch": 1.613627254509018, + "grad_norm": 35.13270930529927, + "learning_rate": 5.212709726945402e-06, + "loss": 2.685, + "step": 8052 + }, + { + "epoch": 1.6138276553106212, + "grad_norm": 52.150440173987995, + "learning_rate": 5.211544892066153e-06, + "loss": 2.9797, + "step": 8053 + }, + { + "epoch": 1.6140280561122244, + "grad_norm": 24.912340590660662, + "learning_rate": 5.210380045684895e-06, + "loss": 2.7109, + "step": 8054 + }, + { + "epoch": 1.6142284569138277, + "grad_norm": 21.252814125977757, + "learning_rate": 5.209215187864965e-06, + "loss": 2.7647, + "step": 8055 + }, + { + "epoch": 1.6144288577154309, + "grad_norm": 25.848088378968733, + "learning_rate": 5.208050318669695e-06, + "loss": 2.3517, + "step": 8056 + }, + { + "epoch": 1.614629258517034, + "grad_norm": 38.85300119438722, + "learning_rate": 5.206885438162422e-06, + "loss": 2.7815, + "step": 8057 + }, + { + "epoch": 1.6148296593186373, + "grad_norm": 23.88273012300032, + "learning_rate": 5.205720546406483e-06, + "loss": 2.6085, + "step": 8058 + }, + { + "epoch": 1.6150300601202405, + "grad_norm": 33.42278337681494, + "learning_rate": 
5.204555643465215e-06, + "loss": 2.6397, + "step": 8059 + }, + { + "epoch": 1.6152304609218437, + "grad_norm": 41.65693769946936, + "learning_rate": 5.2033907294019525e-06, + "loss": 2.7289, + "step": 8060 + }, + { + "epoch": 1.6154308617234467, + "grad_norm": 17.39379242957049, + "learning_rate": 5.202225804280039e-06, + "loss": 2.2968, + "step": 8061 + }, + { + "epoch": 1.6156312625250502, + "grad_norm": 24.41013668156333, + "learning_rate": 5.201060868162807e-06, + "loss": 2.6624, + "step": 8062 + }, + { + "epoch": 1.6158316633266532, + "grad_norm": 26.685862389095206, + "learning_rate": 5.1998959211136006e-06, + "loss": 2.4316, + "step": 8063 + }, + { + "epoch": 1.6160320641282566, + "grad_norm": 36.65743282985394, + "learning_rate": 5.198730963195758e-06, + "loss": 3.0953, + "step": 8064 + }, + { + "epoch": 1.6162324649298596, + "grad_norm": 19.858503086904047, + "learning_rate": 5.197565994472619e-06, + "loss": 2.9704, + "step": 8065 + }, + { + "epoch": 1.616432865731463, + "grad_norm": 21.483020765468662, + "learning_rate": 5.196401015007524e-06, + "loss": 2.4675, + "step": 8066 + }, + { + "epoch": 1.616633266533066, + "grad_norm": 24.937460939846844, + "learning_rate": 5.195236024863817e-06, + "loss": 2.3756, + "step": 8067 + }, + { + "epoch": 1.6168336673346695, + "grad_norm": 23.03633437422996, + "learning_rate": 5.194071024104839e-06, + "loss": 2.7564, + "step": 8068 + }, + { + "epoch": 1.6170340681362725, + "grad_norm": 24.158606189582077, + "learning_rate": 5.192906012793934e-06, + "loss": 2.782, + "step": 8069 + }, + { + "epoch": 1.6172344689378757, + "grad_norm": 24.12013334274394, + "learning_rate": 5.191740990994444e-06, + "loss": 3.0742, + "step": 8070 + }, + { + "epoch": 1.617434869739479, + "grad_norm": 38.238504913227615, + "learning_rate": 5.1905759587697125e-06, + "loss": 3.3366, + "step": 8071 + }, + { + "epoch": 1.6176352705410821, + "grad_norm": 25.033229230500517, + "learning_rate": 5.189410916183085e-06, + "loss": 2.9471, + "step": 8072 + }, + { + "epoch": 1.6178356713426854, + "grad_norm": 18.454652711003217, + "learning_rate": 5.188245863297905e-06, + "loss": 2.5343, + "step": 8073 + }, + { + "epoch": 1.6180360721442886, + "grad_norm": 26.17078093648511, + "learning_rate": 5.187080800177522e-06, + "loss": 3.0009, + "step": 8074 + }, + { + "epoch": 1.6182364729458918, + "grad_norm": 24.849830402127974, + "learning_rate": 5.185915726885279e-06, + "loss": 2.4385, + "step": 8075 + }, + { + "epoch": 1.618436873747495, + "grad_norm": 29.81154211955045, + "learning_rate": 5.184750643484524e-06, + "loss": 3.2467, + "step": 8076 + }, + { + "epoch": 1.6186372745490982, + "grad_norm": 28.838890239590373, + "learning_rate": 5.183585550038602e-06, + "loss": 2.4937, + "step": 8077 + }, + { + "epoch": 1.6188376753507014, + "grad_norm": 25.048082844772836, + "learning_rate": 5.1824204466108645e-06, + "loss": 2.7792, + "step": 8078 + }, + { + "epoch": 1.6190380761523047, + "grad_norm": 21.06368639253213, + "learning_rate": 5.181255333264657e-06, + "loss": 2.4981, + "step": 8079 + }, + { + "epoch": 1.6192384769539077, + "grad_norm": 23.08587087870399, + "learning_rate": 5.180090210063329e-06, + "loss": 2.4294, + "step": 8080 + }, + { + "epoch": 1.619438877755511, + "grad_norm": 17.765299081057613, + "learning_rate": 5.178925077070232e-06, + "loss": 2.6233, + "step": 8081 + }, + { + "epoch": 1.619639278557114, + "grad_norm": 33.02723557023595, + "learning_rate": 5.177759934348713e-06, + "loss": 2.5955, + "step": 8082 + }, + { + "epoch": 1.6198396793587175, + "grad_norm": 
26.33385390910159, + "learning_rate": 5.176594781962125e-06, + "loss": 2.7064, + "step": 8083 + }, + { + "epoch": 1.6200400801603205, + "grad_norm": 43.03854572758787, + "learning_rate": 5.175429619973817e-06, + "loss": 2.49, + "step": 8084 + }, + { + "epoch": 1.620240480961924, + "grad_norm": 27.628379961981555, + "learning_rate": 5.174264448447142e-06, + "loss": 2.8494, + "step": 8085 + }, + { + "epoch": 1.620440881763527, + "grad_norm": 23.719420985319154, + "learning_rate": 5.173099267445452e-06, + "loss": 2.9461, + "step": 8086 + }, + { + "epoch": 1.6206412825651304, + "grad_norm": 25.3730523498454, + "learning_rate": 5.171934077032099e-06, + "loss": 2.8772, + "step": 8087 + }, + { + "epoch": 1.6208416833667334, + "grad_norm": 30.583469464539867, + "learning_rate": 5.170768877270437e-06, + "loss": 2.5835, + "step": 8088 + }, + { + "epoch": 1.6210420841683368, + "grad_norm": 23.73684435767202, + "learning_rate": 5.169603668223818e-06, + "loss": 2.462, + "step": 8089 + }, + { + "epoch": 1.6212424849699398, + "grad_norm": 24.423794103870403, + "learning_rate": 5.168438449955598e-06, + "loss": 2.3515, + "step": 8090 + }, + { + "epoch": 1.621442885771543, + "grad_norm": 23.42156870512657, + "learning_rate": 5.1672732225291314e-06, + "loss": 2.6426, + "step": 8091 + }, + { + "epoch": 1.6216432865731463, + "grad_norm": 26.773016517323004, + "learning_rate": 5.166107986007771e-06, + "loss": 3.0604, + "step": 8092 + }, + { + "epoch": 1.6218436873747495, + "grad_norm": 20.584829813662587, + "learning_rate": 5.164942740454876e-06, + "loss": 2.4401, + "step": 8093 + }, + { + "epoch": 1.6220440881763527, + "grad_norm": 27.616019808685245, + "learning_rate": 5.1637774859338e-06, + "loss": 3.1537, + "step": 8094 + }, + { + "epoch": 1.622244488977956, + "grad_norm": 24.437647535400785, + "learning_rate": 5.1626122225079004e-06, + "loss": 2.8151, + "step": 8095 + }, + { + "epoch": 1.6224448897795591, + "grad_norm": 24.076939398538507, + "learning_rate": 5.161446950240535e-06, + "loss": 2.7214, + "step": 8096 + }, + { + "epoch": 1.6226452905811624, + "grad_norm": 19.659210495026127, + "learning_rate": 5.160281669195059e-06, + "loss": 2.6336, + "step": 8097 + }, + { + "epoch": 1.6228456913827656, + "grad_norm": 59.32425185396856, + "learning_rate": 5.159116379434833e-06, + "loss": 2.9194, + "step": 8098 + }, + { + "epoch": 1.6230460921843686, + "grad_norm": 24.606861564267614, + "learning_rate": 5.157951081023213e-06, + "loss": 3.0679, + "step": 8099 + }, + { + "epoch": 1.623246492985972, + "grad_norm": 24.778647438044832, + "learning_rate": 5.1567857740235625e-06, + "loss": 2.6433, + "step": 8100 + }, + { + "epoch": 1.623446893787575, + "grad_norm": 25.41101488704486, + "learning_rate": 5.155620458499237e-06, + "loss": 2.5214, + "step": 8101 + }, + { + "epoch": 1.6236472945891784, + "grad_norm": 30.262318202469633, + "learning_rate": 5.154455134513596e-06, + "loss": 2.598, + "step": 8102 + }, + { + "epoch": 1.6238476953907814, + "grad_norm": 22.575683139025642, + "learning_rate": 5.153289802130001e-06, + "loss": 2.966, + "step": 8103 + }, + { + "epoch": 1.6240480961923849, + "grad_norm": 19.500273049934904, + "learning_rate": 5.152124461411815e-06, + "loss": 2.8101, + "step": 8104 + }, + { + "epoch": 1.6242484969939879, + "grad_norm": 28.616667553649087, + "learning_rate": 5.150959112422398e-06, + "loss": 3.2761, + "step": 8105 + }, + { + "epoch": 1.6244488977955913, + "grad_norm": 20.88573401476252, + "learning_rate": 5.14979375522511e-06, + "loss": 2.8132, + "step": 8106 + }, + { + "epoch": 
1.6246492985971943, + "grad_norm": 40.15447321164598, + "learning_rate": 5.1486283898833144e-06, + "loss": 3.092, + "step": 8107 + }, + { + "epoch": 1.6248496993987978, + "grad_norm": 26.671344019857276, + "learning_rate": 5.147463016460374e-06, + "loss": 2.4185, + "step": 8108 + }, + { + "epoch": 1.6250501002004007, + "grad_norm": 25.943624285955984, + "learning_rate": 5.146297635019651e-06, + "loss": 2.3845, + "step": 8109 + }, + { + "epoch": 1.625250501002004, + "grad_norm": 21.343054838704955, + "learning_rate": 5.1451322456245115e-06, + "loss": 2.5285, + "step": 8110 + }, + { + "epoch": 1.6254509018036072, + "grad_norm": 25.50699435212919, + "learning_rate": 5.143966848338315e-06, + "loss": 2.7681, + "step": 8111 + }, + { + "epoch": 1.6256513026052104, + "grad_norm": 36.1802883207716, + "learning_rate": 5.14280144322443e-06, + "loss": 2.6948, + "step": 8112 + }, + { + "epoch": 1.6258517034068136, + "grad_norm": 23.163659372284688, + "learning_rate": 5.141636030346221e-06, + "loss": 2.6613, + "step": 8113 + }, + { + "epoch": 1.6260521042084168, + "grad_norm": 33.22119434249645, + "learning_rate": 5.14047060976705e-06, + "loss": 2.7937, + "step": 8114 + }, + { + "epoch": 1.62625250501002, + "grad_norm": 23.26709651156207, + "learning_rate": 5.139305181550286e-06, + "loss": 2.5534, + "step": 8115 + }, + { + "epoch": 1.6264529058116233, + "grad_norm": 45.20404174574257, + "learning_rate": 5.138139745759292e-06, + "loss": 2.8211, + "step": 8116 + }, + { + "epoch": 1.6266533066132265, + "grad_norm": 18.04118223443338, + "learning_rate": 5.136974302457436e-06, + "loss": 2.7025, + "step": 8117 + }, + { + "epoch": 1.6268537074148297, + "grad_norm": 28.931107147282823, + "learning_rate": 5.135808851708087e-06, + "loss": 2.5488, + "step": 8118 + }, + { + "epoch": 1.627054108216433, + "grad_norm": 26.547765540721905, + "learning_rate": 5.134643393574609e-06, + "loss": 3.7123, + "step": 8119 + }, + { + "epoch": 1.627254509018036, + "grad_norm": 33.56287819901557, + "learning_rate": 5.1334779281203715e-06, + "loss": 3.0955, + "step": 8120 + }, + { + "epoch": 1.6274549098196394, + "grad_norm": 63.81017908797404, + "learning_rate": 5.132312455408741e-06, + "loss": 2.4977, + "step": 8121 + }, + { + "epoch": 1.6276553106212424, + "grad_norm": 30.729933119604024, + "learning_rate": 5.131146975503087e-06, + "loss": 2.7291, + "step": 8122 + }, + { + "epoch": 1.6278557114228458, + "grad_norm": 24.835779051751192, + "learning_rate": 5.12998148846678e-06, + "loss": 3.2451, + "step": 8123 + }, + { + "epoch": 1.6280561122244488, + "grad_norm": 27.607906364392303, + "learning_rate": 5.128815994363186e-06, + "loss": 2.7272, + "step": 8124 + }, + { + "epoch": 1.6282565130260522, + "grad_norm": 22.595909440529837, + "learning_rate": 5.127650493255677e-06, + "loss": 2.7452, + "step": 8125 + }, + { + "epoch": 1.6284569138276552, + "grad_norm": 32.91950553943257, + "learning_rate": 5.126484985207622e-06, + "loss": 2.6336, + "step": 8126 + }, + { + "epoch": 1.6286573146292587, + "grad_norm": 30.040121019869925, + "learning_rate": 5.125319470282393e-06, + "loss": 2.8345, + "step": 8127 + }, + { + "epoch": 1.6288577154308617, + "grad_norm": 45.65293847943526, + "learning_rate": 5.1241539485433575e-06, + "loss": 2.4333, + "step": 8128 + }, + { + "epoch": 1.6290581162324649, + "grad_norm": 24.191196777148036, + "learning_rate": 5.122988420053888e-06, + "loss": 2.3163, + "step": 8129 + }, + { + "epoch": 1.629258517034068, + "grad_norm": 19.885229462551006, + "learning_rate": 5.121822884877359e-06, + "loss": 2.9675, + 
"step": 8130 + }, + { + "epoch": 1.6294589178356713, + "grad_norm": 32.39496167319045, + "learning_rate": 5.1206573430771375e-06, + "loss": 2.3459, + "step": 8131 + }, + { + "epoch": 1.6296593186372745, + "grad_norm": 21.024187558359646, + "learning_rate": 5.1194917947166e-06, + "loss": 2.7915, + "step": 8132 + }, + { + "epoch": 1.6298597194388778, + "grad_norm": 22.145287781136442, + "learning_rate": 5.118326239859117e-06, + "loss": 2.9214, + "step": 8133 + }, + { + "epoch": 1.630060120240481, + "grad_norm": 34.49759005006342, + "learning_rate": 5.117160678568061e-06, + "loss": 2.9553, + "step": 8134 + }, + { + "epoch": 1.6302605210420842, + "grad_norm": 26.674242877290926, + "learning_rate": 5.115995110906805e-06, + "loss": 2.3908, + "step": 8135 + }, + { + "epoch": 1.6304609218436874, + "grad_norm": 23.842708591887398, + "learning_rate": 5.114829536938724e-06, + "loss": 2.5155, + "step": 8136 + }, + { + "epoch": 1.6306613226452906, + "grad_norm": 28.886853096180534, + "learning_rate": 5.113663956727194e-06, + "loss": 2.4478, + "step": 8137 + }, + { + "epoch": 1.6308617234468938, + "grad_norm": 33.85940194944114, + "learning_rate": 5.112498370335585e-06, + "loss": 2.8168, + "step": 8138 + }, + { + "epoch": 1.6310621242484968, + "grad_norm": 36.65705218435502, + "learning_rate": 5.111332777827274e-06, + "loss": 3.0628, + "step": 8139 + }, + { + "epoch": 1.6312625250501003, + "grad_norm": 27.552339280505976, + "learning_rate": 5.110167179265636e-06, + "loss": 2.2482, + "step": 8140 + }, + { + "epoch": 1.6314629258517033, + "grad_norm": 26.70794709631193, + "learning_rate": 5.109001574714044e-06, + "loss": 2.7966, + "step": 8141 + }, + { + "epoch": 1.6316633266533067, + "grad_norm": 20.861102383277352, + "learning_rate": 5.107835964235877e-06, + "loss": 2.1586, + "step": 8142 + }, + { + "epoch": 1.6318637274549097, + "grad_norm": 21.05645835508473, + "learning_rate": 5.106670347894509e-06, + "loss": 2.2415, + "step": 8143 + }, + { + "epoch": 1.6320641282565131, + "grad_norm": 51.232042561747505, + "learning_rate": 5.105504725753318e-06, + "loss": 2.1404, + "step": 8144 + }, + { + "epoch": 1.6322645290581161, + "grad_norm": 26.506885541338285, + "learning_rate": 5.1043390978756765e-06, + "loss": 2.1557, + "step": 8145 + }, + { + "epoch": 1.6324649298597196, + "grad_norm": 19.109204491515932, + "learning_rate": 5.103173464324967e-06, + "loss": 2.181, + "step": 8146 + }, + { + "epoch": 1.6326653306613226, + "grad_norm": 31.006748123078776, + "learning_rate": 5.102007825164561e-06, + "loss": 3.0606, + "step": 8147 + }, + { + "epoch": 1.632865731462926, + "grad_norm": 29.501894686658478, + "learning_rate": 5.100842180457841e-06, + "loss": 2.4328, + "step": 8148 + }, + { + "epoch": 1.633066132264529, + "grad_norm": 27.15830543074512, + "learning_rate": 5.099676530268183e-06, + "loss": 2.8379, + "step": 8149 + }, + { + "epoch": 1.6332665330661322, + "grad_norm": 25.59925468208382, + "learning_rate": 5.098510874658964e-06, + "loss": 2.9377, + "step": 8150 + }, + { + "epoch": 1.6334669338677354, + "grad_norm": 24.955461383302918, + "learning_rate": 5.097345213693564e-06, + "loss": 2.5377, + "step": 8151 + }, + { + "epoch": 1.6336673346693387, + "grad_norm": 17.277228829564315, + "learning_rate": 5.0961795474353614e-06, + "loss": 2.7224, + "step": 8152 + }, + { + "epoch": 1.6338677354709419, + "grad_norm": 63.91505209320007, + "learning_rate": 5.095013875947734e-06, + "loss": 2.6113, + "step": 8153 + }, + { + "epoch": 1.634068136272545, + "grad_norm": 29.398671340361403, + "learning_rate": 
5.093848199294062e-06, + "loss": 3.1689, + "step": 8154 + }, + { + "epoch": 1.6342685370741483, + "grad_norm": 46.68373808647605, + "learning_rate": 5.092682517537725e-06, + "loss": 3.6518, + "step": 8155 + }, + { + "epoch": 1.6344689378757515, + "grad_norm": 23.926303907784547, + "learning_rate": 5.091516830742102e-06, + "loss": 2.9787, + "step": 8156 + }, + { + "epoch": 1.6346693386773548, + "grad_norm": 26.312050881636768, + "learning_rate": 5.090351138970576e-06, + "loss": 3.018, + "step": 8157 + }, + { + "epoch": 1.6348697394789578, + "grad_norm": 17.669515539479043, + "learning_rate": 5.089185442286523e-06, + "loss": 2.7694, + "step": 8158 + }, + { + "epoch": 1.6350701402805612, + "grad_norm": 36.79173845066257, + "learning_rate": 5.088019740753327e-06, + "loss": 2.1642, + "step": 8159 + }, + { + "epoch": 1.6352705410821642, + "grad_norm": 26.120316352407325, + "learning_rate": 5.086854034434366e-06, + "loss": 2.8428, + "step": 8160 + }, + { + "epoch": 1.6354709418837676, + "grad_norm": 17.896726885563666, + "learning_rate": 5.0856883233930245e-06, + "loss": 2.4622, + "step": 8161 + }, + { + "epoch": 1.6356713426853706, + "grad_norm": 24.049068289383325, + "learning_rate": 5.084522607692684e-06, + "loss": 2.7367, + "step": 8162 + }, + { + "epoch": 1.635871743486974, + "grad_norm": 27.89152274544215, + "learning_rate": 5.083356887396722e-06, + "loss": 2.3599, + "step": 8163 + }, + { + "epoch": 1.636072144288577, + "grad_norm": 27.258849829361388, + "learning_rate": 5.082191162568524e-06, + "loss": 2.9582, + "step": 8164 + }, + { + "epoch": 1.6362725450901805, + "grad_norm": 23.89781519920593, + "learning_rate": 5.08102543327147e-06, + "loss": 2.6816, + "step": 8165 + }, + { + "epoch": 1.6364729458917835, + "grad_norm": 33.5778472765644, + "learning_rate": 5.079859699568945e-06, + "loss": 3.1682, + "step": 8166 + }, + { + "epoch": 1.636673346693387, + "grad_norm": 24.549673307087833, + "learning_rate": 5.078693961524329e-06, + "loss": 2.9257, + "step": 8167 + }, + { + "epoch": 1.63687374749499, + "grad_norm": 28.029056763034614, + "learning_rate": 5.077528219201007e-06, + "loss": 2.5901, + "step": 8168 + }, + { + "epoch": 1.6370741482965931, + "grad_norm": 32.43908726196193, + "learning_rate": 5.076362472662362e-06, + "loss": 2.657, + "step": 8169 + }, + { + "epoch": 1.6372745490981964, + "grad_norm": 49.976012770126864, + "learning_rate": 5.075196721971776e-06, + "loss": 3.7671, + "step": 8170 + }, + { + "epoch": 1.6374749498997996, + "grad_norm": 28.889848434993294, + "learning_rate": 5.074030967192633e-06, + "loss": 2.7178, + "step": 8171 + }, + { + "epoch": 1.6376753507014028, + "grad_norm": 27.288128297056325, + "learning_rate": 5.072865208388316e-06, + "loss": 2.4823, + "step": 8172 + }, + { + "epoch": 1.637875751503006, + "grad_norm": 52.85124425466211, + "learning_rate": 5.071699445622211e-06, + "loss": 3.1021, + "step": 8173 + }, + { + "epoch": 1.6380761523046092, + "grad_norm": 21.9379839646943, + "learning_rate": 5.070533678957702e-06, + "loss": 2.638, + "step": 8174 + }, + { + "epoch": 1.6382765531062125, + "grad_norm": 25.165443183115777, + "learning_rate": 5.069367908458173e-06, + "loss": 2.6007, + "step": 8175 + }, + { + "epoch": 1.6384769539078157, + "grad_norm": 25.794822108831326, + "learning_rate": 5.068202134187008e-06, + "loss": 2.5216, + "step": 8176 + }, + { + "epoch": 1.6386773547094189, + "grad_norm": 61.47214080245274, + "learning_rate": 5.067036356207591e-06, + "loss": 2.8817, + "step": 8177 + }, + { + "epoch": 1.638877755511022, + "grad_norm": 
45.67646931262061, + "learning_rate": 5.0658705745833095e-06, + "loss": 2.3751, + "step": 8178 + }, + { + "epoch": 1.639078156312625, + "grad_norm": 22.791991599313025, + "learning_rate": 5.064704789377547e-06, + "loss": 2.8812, + "step": 8179 + }, + { + "epoch": 1.6392785571142285, + "grad_norm": 28.95399741530872, + "learning_rate": 5.06353900065369e-06, + "loss": 2.9892, + "step": 8180 + }, + { + "epoch": 1.6394789579158315, + "grad_norm": 18.62994166548432, + "learning_rate": 5.062373208475124e-06, + "loss": 2.6722, + "step": 8181 + }, + { + "epoch": 1.639679358717435, + "grad_norm": 18.505201261697643, + "learning_rate": 5.0612074129052336e-06, + "loss": 2.4531, + "step": 8182 + }, + { + "epoch": 1.639879759519038, + "grad_norm": 41.94436591154562, + "learning_rate": 5.0600416140074075e-06, + "loss": 3.1033, + "step": 8183 + }, + { + "epoch": 1.6400801603206414, + "grad_norm": 20.327966624679792, + "learning_rate": 5.0588758118450284e-06, + "loss": 2.4794, + "step": 8184 + }, + { + "epoch": 1.6402805611222444, + "grad_norm": 22.720464799243736, + "learning_rate": 5.057710006481484e-06, + "loss": 2.5286, + "step": 8185 + }, + { + "epoch": 1.6404809619238478, + "grad_norm": 40.038756991517495, + "learning_rate": 5.056544197980164e-06, + "loss": 2.8331, + "step": 8186 + }, + { + "epoch": 1.6406813627254508, + "grad_norm": 26.894223868853413, + "learning_rate": 5.0553783864044515e-06, + "loss": 3.0404, + "step": 8187 + }, + { + "epoch": 1.640881763527054, + "grad_norm": 22.797895543457262, + "learning_rate": 5.054212571817734e-06, + "loss": 2.9557, + "step": 8188 + }, + { + "epoch": 1.6410821643286573, + "grad_norm": 29.316901987697197, + "learning_rate": 5.053046754283399e-06, + "loss": 2.7417, + "step": 8189 + }, + { + "epoch": 1.6412825651302605, + "grad_norm": 54.84817238795031, + "learning_rate": 5.051880933864833e-06, + "loss": 2.8298, + "step": 8190 + }, + { + "epoch": 1.6414829659318637, + "grad_norm": 40.50171386317683, + "learning_rate": 5.050715110625425e-06, + "loss": 2.2335, + "step": 8191 + }, + { + "epoch": 1.641683366733467, + "grad_norm": 40.36773762838619, + "learning_rate": 5.049549284628561e-06, + "loss": 3.0646, + "step": 8192 + }, + { + "epoch": 1.6418837675350701, + "grad_norm": 20.257947158723425, + "learning_rate": 5.048383455937631e-06, + "loss": 2.501, + "step": 8193 + }, + { + "epoch": 1.6420841683366734, + "grad_norm": 21.671982753809342, + "learning_rate": 5.047217624616018e-06, + "loss": 2.2517, + "step": 8194 + }, + { + "epoch": 1.6422845691382766, + "grad_norm": 32.521029871944236, + "learning_rate": 5.046051790727116e-06, + "loss": 3.3229, + "step": 8195 + }, + { + "epoch": 1.6424849699398798, + "grad_norm": 23.267071324799065, + "learning_rate": 5.044885954334309e-06, + "loss": 2.1511, + "step": 8196 + }, + { + "epoch": 1.642685370741483, + "grad_norm": 23.454615779198534, + "learning_rate": 5.043720115500986e-06, + "loss": 2.522, + "step": 8197 + }, + { + "epoch": 1.642885771543086, + "grad_norm": 23.378924152226947, + "learning_rate": 5.042554274290535e-06, + "loss": 2.3057, + "step": 8198 + }, + { + "epoch": 1.6430861723446895, + "grad_norm": 24.94683726464289, + "learning_rate": 5.041388430766347e-06, + "loss": 2.9206, + "step": 8199 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 23.22161899404657, + "learning_rate": 5.040222584991807e-06, + "loss": 2.8997, + "step": 8200 + }, + { + "epoch": 1.643486973947896, + "grad_norm": 32.298129295838, + "learning_rate": 5.0390567370303075e-06, + "loss": 2.3364, + "step": 8201 + }, + { + "epoch": 
1.6436873747494989, + "grad_norm": 28.501448083200277, + "learning_rate": 5.037890886945236e-06, + "loss": 2.6708, + "step": 8202 + }, + { + "epoch": 1.6438877755511023, + "grad_norm": 24.666133345403498, + "learning_rate": 5.03672503479998e-06, + "loss": 2.6846, + "step": 8203 + }, + { + "epoch": 1.6440881763527053, + "grad_norm": 21.708142767736, + "learning_rate": 5.035559180657929e-06, + "loss": 2.5854, + "step": 8204 + }, + { + "epoch": 1.6442885771543088, + "grad_norm": 52.537988822910805, + "learning_rate": 5.034393324582473e-06, + "loss": 2.8387, + "step": 8205 + }, + { + "epoch": 1.6444889779559118, + "grad_norm": 22.38892992705894, + "learning_rate": 5.033227466637002e-06, + "loss": 2.7793, + "step": 8206 + }, + { + "epoch": 1.644689378757515, + "grad_norm": 19.12433571916378, + "learning_rate": 5.032061606884904e-06, + "loss": 2.9716, + "step": 8207 + }, + { + "epoch": 1.6448897795591182, + "grad_norm": 33.11673584343377, + "learning_rate": 5.0308957453895705e-06, + "loss": 2.4057, + "step": 8208 + }, + { + "epoch": 1.6450901803607214, + "grad_norm": 27.20978338756409, + "learning_rate": 5.029729882214388e-06, + "loss": 2.5178, + "step": 8209 + }, + { + "epoch": 1.6452905811623246, + "grad_norm": 22.87681734108839, + "learning_rate": 5.028564017422749e-06, + "loss": 2.8489, + "step": 8210 + }, + { + "epoch": 1.6454909819639278, + "grad_norm": 19.202000014426936, + "learning_rate": 5.0273981510780415e-06, + "loss": 2.6245, + "step": 8211 + }, + { + "epoch": 1.645691382765531, + "grad_norm": 23.885010637708195, + "learning_rate": 5.026232283243656e-06, + "loss": 2.6589, + "step": 8212 + }, + { + "epoch": 1.6458917835671343, + "grad_norm": 57.165155083683175, + "learning_rate": 5.025066413982983e-06, + "loss": 2.3915, + "step": 8213 + }, + { + "epoch": 1.6460921843687375, + "grad_norm": 33.79756097999334, + "learning_rate": 5.0239005433594124e-06, + "loss": 2.8071, + "step": 8214 + }, + { + "epoch": 1.6462925851703407, + "grad_norm": 29.806504645455668, + "learning_rate": 5.022734671436333e-06, + "loss": 3.022, + "step": 8215 + }, + { + "epoch": 1.646492985971944, + "grad_norm": 25.46140619780732, + "learning_rate": 5.021568798277136e-06, + "loss": 2.6903, + "step": 8216 + }, + { + "epoch": 1.646693386773547, + "grad_norm": 26.319148055937987, + "learning_rate": 5.020402923945212e-06, + "loss": 3.1846, + "step": 8217 + }, + { + "epoch": 1.6468937875751504, + "grad_norm": 24.37215095192285, + "learning_rate": 5.019237048503951e-06, + "loss": 2.749, + "step": 8218 + }, + { + "epoch": 1.6470941883767534, + "grad_norm": 23.0303571946908, + "learning_rate": 5.018071172016743e-06, + "loss": 2.4445, + "step": 8219 + }, + { + "epoch": 1.6472945891783568, + "grad_norm": 24.308274819832054, + "learning_rate": 5.01690529454698e-06, + "loss": 2.5391, + "step": 8220 + }, + { + "epoch": 1.6474949899799598, + "grad_norm": 60.53513244099372, + "learning_rate": 5.0157394161580495e-06, + "loss": 2.83, + "step": 8221 + }, + { + "epoch": 1.6476953907815632, + "grad_norm": 35.484155450683204, + "learning_rate": 5.014573536913344e-06, + "loss": 2.9559, + "step": 8222 + }, + { + "epoch": 1.6478957915831662, + "grad_norm": 49.54512147972019, + "learning_rate": 5.013407656876255e-06, + "loss": 2.7212, + "step": 8223 + }, + { + "epoch": 1.6480961923847697, + "grad_norm": 25.703093534471563, + "learning_rate": 5.01224177611017e-06, + "loss": 2.4274, + "step": 8224 + }, + { + "epoch": 1.6482965931863727, + "grad_norm": 28.28480315507023, + "learning_rate": 5.011075894678483e-06, + "loss": 2.7414, + "step": 
8225 + }, + { + "epoch": 1.6484969939879761, + "grad_norm": 26.271426413416105, + "learning_rate": 5.009910012644583e-06, + "loss": 2.7292, + "step": 8226 + }, + { + "epoch": 1.648697394789579, + "grad_norm": 29.53575414685331, + "learning_rate": 5.008744130071861e-06, + "loss": 3.1754, + "step": 8227 + }, + { + "epoch": 1.6488977955911823, + "grad_norm": 28.832874058729715, + "learning_rate": 5.007578247023708e-06, + "loss": 2.6441, + "step": 8228 + }, + { + "epoch": 1.6490981963927855, + "grad_norm": 21.70122886925456, + "learning_rate": 5.006412363563513e-06, + "loss": 2.655, + "step": 8229 + }, + { + "epoch": 1.6492985971943888, + "grad_norm": 28.186541820589238, + "learning_rate": 5.00524647975467e-06, + "loss": 2.7062, + "step": 8230 + }, + { + "epoch": 1.649498997995992, + "grad_norm": 20.39671795185193, + "learning_rate": 5.004080595660567e-06, + "loss": 2.3785, + "step": 8231 + }, + { + "epoch": 1.6496993987975952, + "grad_norm": 29.202748379495773, + "learning_rate": 5.002914711344596e-06, + "loss": 3.0979, + "step": 8232 + }, + { + "epoch": 1.6498997995991984, + "grad_norm": 33.8559965070052, + "learning_rate": 5.00174882687015e-06, + "loss": 2.8773, + "step": 8233 + }, + { + "epoch": 1.6501002004008016, + "grad_norm": 23.90028748922107, + "learning_rate": 5.000582942300616e-06, + "loss": 2.6195, + "step": 8234 + }, + { + "epoch": 1.6503006012024048, + "grad_norm": 16.936602176143875, + "learning_rate": 4.999417057699386e-06, + "loss": 2.27, + "step": 8235 + }, + { + "epoch": 1.650501002004008, + "grad_norm": 21.07170106329207, + "learning_rate": 4.998251173129852e-06, + "loss": 2.7259, + "step": 8236 + }, + { + "epoch": 1.6507014028056113, + "grad_norm": 27.183568962232396, + "learning_rate": 4.997085288655405e-06, + "loss": 2.9267, + "step": 8237 + }, + { + "epoch": 1.6509018036072143, + "grad_norm": 27.45391797780986, + "learning_rate": 4.995919404339434e-06, + "loss": 3.1357, + "step": 8238 + }, + { + "epoch": 1.6511022044088177, + "grad_norm": 31.177042220521237, + "learning_rate": 4.994753520245332e-06, + "loss": 3.3734, + "step": 8239 + }, + { + "epoch": 1.6513026052104207, + "grad_norm": 37.9496545734764, + "learning_rate": 4.993587636436489e-06, + "loss": 2.5195, + "step": 8240 + }, + { + "epoch": 1.6515030060120242, + "grad_norm": 16.698744718820958, + "learning_rate": 4.992421752976294e-06, + "loss": 2.2552, + "step": 8241 + }, + { + "epoch": 1.6517034068136272, + "grad_norm": 29.990903040934228, + "learning_rate": 4.99125586992814e-06, + "loss": 3.0634, + "step": 8242 + }, + { + "epoch": 1.6519038076152306, + "grad_norm": 23.90486325080561, + "learning_rate": 4.9900899873554185e-06, + "loss": 2.583, + "step": 8243 + }, + { + "epoch": 1.6521042084168336, + "grad_norm": 22.52333054274745, + "learning_rate": 4.98892410532152e-06, + "loss": 2.7248, + "step": 8244 + }, + { + "epoch": 1.652304609218437, + "grad_norm": 34.6266601588528, + "learning_rate": 4.987758223889831e-06, + "loss": 2.8227, + "step": 8245 + }, + { + "epoch": 1.65250501002004, + "grad_norm": 21.62814246374997, + "learning_rate": 4.986592343123747e-06, + "loss": 2.6937, + "step": 8246 + }, + { + "epoch": 1.6527054108216432, + "grad_norm": 23.31178209692734, + "learning_rate": 4.985426463086657e-06, + "loss": 2.626, + "step": 8247 + }, + { + "epoch": 1.6529058116232465, + "grad_norm": 18.49598798996279, + "learning_rate": 4.984260583841953e-06, + "loss": 2.8007, + "step": 8248 + }, + { + "epoch": 1.6531062124248497, + "grad_norm": 23.844664222483384, + "learning_rate": 4.9830947054530215e-06, + "loss": 
2.5479, + "step": 8249 + }, + { + "epoch": 1.653306613226453, + "grad_norm": 48.24808143822165, + "learning_rate": 4.981928827983258e-06, + "loss": 3.1853, + "step": 8250 + }, + { + "epoch": 1.653507014028056, + "grad_norm": 16.116827585815862, + "learning_rate": 4.980762951496051e-06, + "loss": 2.4209, + "step": 8251 + }, + { + "epoch": 1.6537074148296593, + "grad_norm": 20.0928344758366, + "learning_rate": 4.979597076054788e-06, + "loss": 2.9189, + "step": 8252 + }, + { + "epoch": 1.6539078156312625, + "grad_norm": 21.44991388076078, + "learning_rate": 4.978431201722865e-06, + "loss": 3.0739, + "step": 8253 + }, + { + "epoch": 1.6541082164328658, + "grad_norm": 20.457213368538053, + "learning_rate": 4.9772653285636684e-06, + "loss": 2.4537, + "step": 8254 + }, + { + "epoch": 1.654308617234469, + "grad_norm": 25.31870436837554, + "learning_rate": 4.976099456640588e-06, + "loss": 3.1957, + "step": 8255 + }, + { + "epoch": 1.6545090180360722, + "grad_norm": 31.947372139850224, + "learning_rate": 4.974933586017019e-06, + "loss": 3.1908, + "step": 8256 + }, + { + "epoch": 1.6547094188376752, + "grad_norm": 19.254333232461256, + "learning_rate": 4.973767716756346e-06, + "loss": 2.2498, + "step": 8257 + }, + { + "epoch": 1.6549098196392786, + "grad_norm": 40.11490930960543, + "learning_rate": 4.972601848921961e-06, + "loss": 2.7457, + "step": 8258 + }, + { + "epoch": 1.6551102204408816, + "grad_norm": 26.537355428286183, + "learning_rate": 4.971435982577253e-06, + "loss": 2.4126, + "step": 8259 + }, + { + "epoch": 1.655310621242485, + "grad_norm": 18.35491789294057, + "learning_rate": 4.970270117785613e-06, + "loss": 2.7306, + "step": 8260 + }, + { + "epoch": 1.655511022044088, + "grad_norm": 19.116437671394355, + "learning_rate": 4.969104254610432e-06, + "loss": 1.9878, + "step": 8261 + }, + { + "epoch": 1.6557114228456915, + "grad_norm": 19.742943727300055, + "learning_rate": 4.967938393115096e-06, + "loss": 2.6571, + "step": 8262 + }, + { + "epoch": 1.6559118236472945, + "grad_norm": 20.0194800579318, + "learning_rate": 4.966772533362999e-06, + "loss": 3.0018, + "step": 8263 + }, + { + "epoch": 1.656112224448898, + "grad_norm": 23.989396225665438, + "learning_rate": 4.965606675417529e-06, + "loss": 2.6979, + "step": 8264 + }, + { + "epoch": 1.656312625250501, + "grad_norm": 52.27180542735283, + "learning_rate": 4.964440819342072e-06, + "loss": 2.8116, + "step": 8265 + }, + { + "epoch": 1.6565130260521042, + "grad_norm": 22.116798883705915, + "learning_rate": 4.963274965200022e-06, + "loss": 2.7126, + "step": 8266 + }, + { + "epoch": 1.6567134268537074, + "grad_norm": 39.45497466436122, + "learning_rate": 4.962109113054766e-06, + "loss": 2.7035, + "step": 8267 + }, + { + "epoch": 1.6569138276553106, + "grad_norm": 26.007762070316822, + "learning_rate": 4.960943262969694e-06, + "loss": 3.0042, + "step": 8268 + }, + { + "epoch": 1.6571142284569138, + "grad_norm": 19.771535720744236, + "learning_rate": 4.959777415008193e-06, + "loss": 2.4711, + "step": 8269 + }, + { + "epoch": 1.657314629258517, + "grad_norm": 32.45880282604956, + "learning_rate": 4.958611569233655e-06, + "loss": 3.1876, + "step": 8270 + }, + { + "epoch": 1.6575150300601202, + "grad_norm": 28.662783637131803, + "learning_rate": 4.957445725709467e-06, + "loss": 2.2846, + "step": 8271 + }, + { + "epoch": 1.6577154308617235, + "grad_norm": 17.331424949904147, + "learning_rate": 4.956279884499016e-06, + "loss": 2.8199, + "step": 8272 + }, + { + "epoch": 1.6579158316633267, + "grad_norm": 18.963270913763616, + "learning_rate": 
4.955114045665693e-06, + "loss": 2.7445, + "step": 8273 + }, + { + "epoch": 1.65811623246493, + "grad_norm": 18.388128513633017, + "learning_rate": 4.953948209272886e-06, + "loss": 2.817, + "step": 8274 + }, + { + "epoch": 1.6583166332665331, + "grad_norm": 23.021235430132492, + "learning_rate": 4.952782375383984e-06, + "loss": 2.9368, + "step": 8275 + }, + { + "epoch": 1.658517034068136, + "grad_norm": 24.324839743377925, + "learning_rate": 4.951616544062371e-06, + "loss": 3.0675, + "step": 8276 + }, + { + "epoch": 1.6587174348697395, + "grad_norm": 25.833593451316798, + "learning_rate": 4.95045071537144e-06, + "loss": 3.0968, + "step": 8277 + }, + { + "epoch": 1.6589178356713425, + "grad_norm": 32.154227969131945, + "learning_rate": 4.9492848893745765e-06, + "loss": 2.9361, + "step": 8278 + }, + { + "epoch": 1.659118236472946, + "grad_norm": 30.980172121909114, + "learning_rate": 4.948119066135168e-06, + "loss": 2.73, + "step": 8279 + }, + { + "epoch": 1.659318637274549, + "grad_norm": 27.40957002635497, + "learning_rate": 4.946953245716602e-06, + "loss": 2.7834, + "step": 8280 + }, + { + "epoch": 1.6595190380761524, + "grad_norm": 30.186190662617417, + "learning_rate": 4.945787428182268e-06, + "loss": 3.2765, + "step": 8281 + }, + { + "epoch": 1.6597194388777554, + "grad_norm": 22.80313376214352, + "learning_rate": 4.944621613595549e-06, + "loss": 2.1939, + "step": 8282 + }, + { + "epoch": 1.6599198396793589, + "grad_norm": 21.075640710103052, + "learning_rate": 4.943455802019837e-06, + "loss": 2.1018, + "step": 8283 + }, + { + "epoch": 1.6601202404809619, + "grad_norm": 26.641735894196387, + "learning_rate": 4.942289993518517e-06, + "loss": 2.992, + "step": 8284 + }, + { + "epoch": 1.6603206412825653, + "grad_norm": 17.654093078181443, + "learning_rate": 4.941124188154973e-06, + "loss": 2.0387, + "step": 8285 + }, + { + "epoch": 1.6605210420841683, + "grad_norm": 17.935133655669336, + "learning_rate": 4.939958385992593e-06, + "loss": 2.546, + "step": 8286 + }, + { + "epoch": 1.6607214428857715, + "grad_norm": 27.312958831003886, + "learning_rate": 4.938792587094767e-06, + "loss": 3.1933, + "step": 8287 + }, + { + "epoch": 1.6609218436873747, + "grad_norm": 48.49825172422504, + "learning_rate": 4.937626791524879e-06, + "loss": 2.8434, + "step": 8288 + }, + { + "epoch": 1.661122244488978, + "grad_norm": 30.274993563811588, + "learning_rate": 4.936460999346311e-06, + "loss": 3.0951, + "step": 8289 + }, + { + "epoch": 1.6613226452905812, + "grad_norm": 19.85566910251027, + "learning_rate": 4.935295210622454e-06, + "loss": 2.6562, + "step": 8290 + }, + { + "epoch": 1.6615230460921844, + "grad_norm": 26.770659566081875, + "learning_rate": 4.934129425416692e-06, + "loss": 2.9818, + "step": 8291 + }, + { + "epoch": 1.6617234468937876, + "grad_norm": 38.90584739078364, + "learning_rate": 4.932963643792411e-06, + "loss": 3.0519, + "step": 8292 + }, + { + "epoch": 1.6619238476953908, + "grad_norm": 23.045426500265485, + "learning_rate": 4.931797865812994e-06, + "loss": 2.6106, + "step": 8293 + }, + { + "epoch": 1.662124248496994, + "grad_norm": 22.560996598021013, + "learning_rate": 4.9306320915418295e-06, + "loss": 2.6446, + "step": 8294 + }, + { + "epoch": 1.6623246492985972, + "grad_norm": 28.51796796190178, + "learning_rate": 4.9294663210423e-06, + "loss": 3.0232, + "step": 8295 + }, + { + "epoch": 1.6625250501002005, + "grad_norm": 19.62164423398894, + "learning_rate": 4.928300554377789e-06, + "loss": 2.4629, + "step": 8296 + }, + { + "epoch": 1.6627254509018035, + "grad_norm": 
26.79849808088189, + "learning_rate": 4.927134791611685e-06, + "loss": 2.2778, + "step": 8297 + }, + { + "epoch": 1.662925851703407, + "grad_norm": 26.552281811503327, + "learning_rate": 4.925969032807369e-06, + "loss": 3.0336, + "step": 8298 + }, + { + "epoch": 1.66312625250501, + "grad_norm": 19.725207283979945, + "learning_rate": 4.924803278028225e-06, + "loss": 2.6082, + "step": 8299 + }, + { + "epoch": 1.6633266533066133, + "grad_norm": 32.84848158586084, + "learning_rate": 4.923637527337639e-06, + "loss": 2.7252, + "step": 8300 + }, + { + "epoch": 1.6635270541082163, + "grad_norm": 35.30155150968063, + "learning_rate": 4.922471780798994e-06, + "loss": 3.1204, + "step": 8301 + }, + { + "epoch": 1.6637274549098198, + "grad_norm": 35.726398920197575, + "learning_rate": 4.9213060384756716e-06, + "loss": 3.1384, + "step": 8302 + }, + { + "epoch": 1.6639278557114228, + "grad_norm": 19.738735010423703, + "learning_rate": 4.9201403004310565e-06, + "loss": 2.8029, + "step": 8303 + }, + { + "epoch": 1.6641282565130262, + "grad_norm": 28.317342775547548, + "learning_rate": 4.918974566728531e-06, + "loss": 2.7013, + "step": 8304 + }, + { + "epoch": 1.6643286573146292, + "grad_norm": 43.57058040428361, + "learning_rate": 4.917808837431478e-06, + "loss": 3.2, + "step": 8305 + }, + { + "epoch": 1.6645290581162324, + "grad_norm": 23.89326357536405, + "learning_rate": 4.916643112603279e-06, + "loss": 2.7998, + "step": 8306 + }, + { + "epoch": 1.6647294589178356, + "grad_norm": 25.538775803817874, + "learning_rate": 4.915477392307319e-06, + "loss": 2.9003, + "step": 8307 + }, + { + "epoch": 1.6649298597194389, + "grad_norm": 21.53379209675894, + "learning_rate": 4.914311676606977e-06, + "loss": 2.154, + "step": 8308 + }, + { + "epoch": 1.665130260521042, + "grad_norm": 22.221000506992755, + "learning_rate": 4.9131459655656336e-06, + "loss": 2.5779, + "step": 8309 + }, + { + "epoch": 1.6653306613226453, + "grad_norm": 19.908755748993173, + "learning_rate": 4.911980259246674e-06, + "loss": 2.8617, + "step": 8310 + }, + { + "epoch": 1.6655310621242485, + "grad_norm": 16.391371397426166, + "learning_rate": 4.910814557713478e-06, + "loss": 2.683, + "step": 8311 + }, + { + "epoch": 1.6657314629258517, + "grad_norm": 65.89566645917303, + "learning_rate": 4.9096488610294264e-06, + "loss": 3.0506, + "step": 8312 + }, + { + "epoch": 1.665931863727455, + "grad_norm": 25.625811474148122, + "learning_rate": 4.908483169257899e-06, + "loss": 2.6931, + "step": 8313 + }, + { + "epoch": 1.6661322645290582, + "grad_norm": 21.520917063432318, + "learning_rate": 4.907317482462277e-06, + "loss": 2.876, + "step": 8314 + }, + { + "epoch": 1.6663326653306614, + "grad_norm": 23.859211866470496, + "learning_rate": 4.9061518007059395e-06, + "loss": 2.5922, + "step": 8315 + }, + { + "epoch": 1.6665330661322644, + "grad_norm": 24.39221956761649, + "learning_rate": 4.9049861240522675e-06, + "loss": 2.7406, + "step": 8316 + }, + { + "epoch": 1.6667334669338678, + "grad_norm": 30.552769948519718, + "learning_rate": 4.90382045256464e-06, + "loss": 3.2022, + "step": 8317 + }, + { + "epoch": 1.6669338677354708, + "grad_norm": 26.658729546534985, + "learning_rate": 4.902654786306437e-06, + "loss": 2.9899, + "step": 8318 + }, + { + "epoch": 1.6671342685370742, + "grad_norm": 21.83741589060096, + "learning_rate": 4.901489125341038e-06, + "loss": 2.8571, + "step": 8319 + }, + { + "epoch": 1.6673346693386772, + "grad_norm": 24.83924468463904, + "learning_rate": 4.900323469731818e-06, + "loss": 3.3202, + "step": 8320 + }, + { + "epoch": 
1.6675350701402807, + "grad_norm": 25.184082930289186, + "learning_rate": 4.89915781954216e-06, + "loss": 2.4896, + "step": 8321 + }, + { + "epoch": 1.6677354709418837, + "grad_norm": 23.244419123297977, + "learning_rate": 4.89799217483544e-06, + "loss": 3.2856, + "step": 8322 + }, + { + "epoch": 1.6679358717434871, + "grad_norm": 24.848436050481602, + "learning_rate": 4.896826535675034e-06, + "loss": 2.5934, + "step": 8323 + }, + { + "epoch": 1.6681362725450901, + "grad_norm": 42.08012742717582, + "learning_rate": 4.895660902124324e-06, + "loss": 3.0152, + "step": 8324 + }, + { + "epoch": 1.6683366733466933, + "grad_norm": 24.796367725763037, + "learning_rate": 4.8944952742466855e-06, + "loss": 2.5588, + "step": 8325 + }, + { + "epoch": 1.6685370741482966, + "grad_norm": 25.565192108939456, + "learning_rate": 4.893329652105492e-06, + "loss": 2.0766, + "step": 8326 + }, + { + "epoch": 1.6687374749498998, + "grad_norm": 20.89002092347498, + "learning_rate": 4.892164035764125e-06, + "loss": 2.6644, + "step": 8327 + }, + { + "epoch": 1.668937875751503, + "grad_norm": 21.310870462950625, + "learning_rate": 4.8909984252859574e-06, + "loss": 2.9111, + "step": 8328 + }, + { + "epoch": 1.6691382765531062, + "grad_norm": 34.92100994915337, + "learning_rate": 4.889832820734367e-06, + "loss": 2.7829, + "step": 8329 + }, + { + "epoch": 1.6693386773547094, + "grad_norm": 34.46805975521606, + "learning_rate": 4.888667222172726e-06, + "loss": 3.2488, + "step": 8330 + }, + { + "epoch": 1.6695390781563126, + "grad_norm": 24.339148311147827, + "learning_rate": 4.887501629664416e-06, + "loss": 3.0014, + "step": 8331 + }, + { + "epoch": 1.6697394789579159, + "grad_norm": 25.491150046933033, + "learning_rate": 4.886336043272809e-06, + "loss": 2.7182, + "step": 8332 + }, + { + "epoch": 1.669939879759519, + "grad_norm": 19.853076071891927, + "learning_rate": 4.8851704630612756e-06, + "loss": 2.3087, + "step": 8333 + }, + { + "epoch": 1.6701402805611223, + "grad_norm": 26.968640418253713, + "learning_rate": 4.884004889093196e-06, + "loss": 3.4716, + "step": 8334 + }, + { + "epoch": 1.6703406813627253, + "grad_norm": 18.350090321574942, + "learning_rate": 4.882839321431941e-06, + "loss": 2.3938, + "step": 8335 + }, + { + "epoch": 1.6705410821643287, + "grad_norm": 22.867926810821245, + "learning_rate": 4.8816737601408835e-06, + "loss": 2.7837, + "step": 8336 + }, + { + "epoch": 1.6707414829659317, + "grad_norm": 16.42562547909945, + "learning_rate": 4.880508205283401e-06, + "loss": 2.4082, + "step": 8337 + }, + { + "epoch": 1.6709418837675352, + "grad_norm": 38.63662522653315, + "learning_rate": 4.879342656922863e-06, + "loss": 3.6812, + "step": 8338 + }, + { + "epoch": 1.6711422845691382, + "grad_norm": 24.40452169710245, + "learning_rate": 4.878177115122644e-06, + "loss": 2.4396, + "step": 8339 + }, + { + "epoch": 1.6713426853707416, + "grad_norm": 21.07858949919853, + "learning_rate": 4.877011579946113e-06, + "loss": 2.8806, + "step": 8340 + }, + { + "epoch": 1.6715430861723446, + "grad_norm": 26.357824164702073, + "learning_rate": 4.875846051456643e-06, + "loss": 2.4623, + "step": 8341 + }, + { + "epoch": 1.671743486973948, + "grad_norm": 14.785751549775991, + "learning_rate": 4.87468052971761e-06, + "loss": 2.2126, + "step": 8342 + }, + { + "epoch": 1.671943887775551, + "grad_norm": 22.376427250489744, + "learning_rate": 4.873515014792379e-06, + "loss": 2.6158, + "step": 8343 + }, + { + "epoch": 1.6721442885771545, + "grad_norm": 24.324199186635294, + "learning_rate": 4.872349506744324e-06, + "loss": 3.1907, 
+ "step": 8344 + }, + { + "epoch": 1.6723446893787575, + "grad_norm": 36.362862834698305, + "learning_rate": 4.871184005636815e-06, + "loss": 2.9457, + "step": 8345 + }, + { + "epoch": 1.6725450901803607, + "grad_norm": 23.265722683113616, + "learning_rate": 4.870018511533222e-06, + "loss": 2.5756, + "step": 8346 + }, + { + "epoch": 1.672745490981964, + "grad_norm": 20.233003316671983, + "learning_rate": 4.8688530244969135e-06, + "loss": 2.3075, + "step": 8347 + }, + { + "epoch": 1.6729458917835671, + "grad_norm": 34.301457560203666, + "learning_rate": 4.86768754459126e-06, + "loss": 2.4529, + "step": 8348 + }, + { + "epoch": 1.6731462925851703, + "grad_norm": 20.93801733087005, + "learning_rate": 4.866522071879631e-06, + "loss": 2.7726, + "step": 8349 + }, + { + "epoch": 1.6733466933867736, + "grad_norm": 24.145195616285, + "learning_rate": 4.865356606425391e-06, + "loss": 2.4865, + "step": 8350 + }, + { + "epoch": 1.6735470941883768, + "grad_norm": 20.53875396170042, + "learning_rate": 4.864191148291914e-06, + "loss": 2.3163, + "step": 8351 + }, + { + "epoch": 1.67374749498998, + "grad_norm": 24.11842036618836, + "learning_rate": 4.8630256975425655e-06, + "loss": 2.7725, + "step": 8352 + }, + { + "epoch": 1.6739478957915832, + "grad_norm": 25.375519167985217, + "learning_rate": 4.861860254240709e-06, + "loss": 3.076, + "step": 8353 + }, + { + "epoch": 1.6741482965931864, + "grad_norm": 72.38763771568267, + "learning_rate": 4.8606948184497156e-06, + "loss": 2.9182, + "step": 8354 + }, + { + "epoch": 1.6743486973947896, + "grad_norm": 28.81190635812602, + "learning_rate": 4.859529390232952e-06, + "loss": 2.3237, + "step": 8355 + }, + { + "epoch": 1.6745490981963926, + "grad_norm": 27.832644048960933, + "learning_rate": 4.8583639696537815e-06, + "loss": 2.3247, + "step": 8356 + }, + { + "epoch": 1.674749498997996, + "grad_norm": 22.56687587465976, + "learning_rate": 4.857198556775571e-06, + "loss": 2.4275, + "step": 8357 + }, + { + "epoch": 1.674949899799599, + "grad_norm": 27.286284289323586, + "learning_rate": 4.856033151661686e-06, + "loss": 2.7365, + "step": 8358 + }, + { + "epoch": 1.6751503006012025, + "grad_norm": 33.006613409647926, + "learning_rate": 4.85486775437549e-06, + "loss": 2.6488, + "step": 8359 + }, + { + "epoch": 1.6753507014028055, + "grad_norm": 22.561333000900323, + "learning_rate": 4.8537023649803495e-06, + "loss": 3.0386, + "step": 8360 + }, + { + "epoch": 1.675551102204409, + "grad_norm": 57.827434475923155, + "learning_rate": 4.852536983539627e-06, + "loss": 3.1667, + "step": 8361 + }, + { + "epoch": 1.675751503006012, + "grad_norm": 20.36619488871439, + "learning_rate": 4.851371610116687e-06, + "loss": 3.0262, + "step": 8362 + }, + { + "epoch": 1.6759519038076154, + "grad_norm": 70.12910209956539, + "learning_rate": 4.850206244774893e-06, + "loss": 2.8363, + "step": 8363 + }, + { + "epoch": 1.6761523046092184, + "grad_norm": 22.05601427269316, + "learning_rate": 4.849040887577604e-06, + "loss": 2.4641, + "step": 8364 + }, + { + "epoch": 1.6763527054108216, + "grad_norm": 28.162272918656466, + "learning_rate": 4.847875538588186e-06, + "loss": 2.55, + "step": 8365 + }, + { + "epoch": 1.6765531062124248, + "grad_norm": 24.319257840964767, + "learning_rate": 4.8467101978699996e-06, + "loss": 2.6847, + "step": 8366 + }, + { + "epoch": 1.676753507014028, + "grad_norm": 25.44011513376822, + "learning_rate": 4.8455448654864045e-06, + "loss": 2.7016, + "step": 8367 + }, + { + "epoch": 1.6769539078156313, + "grad_norm": 18.025669770678046, + "learning_rate": 
4.844379541500765e-06, + "loss": 2.5622, + "step": 8368 + }, + { + "epoch": 1.6771543086172345, + "grad_norm": 18.866402941185214, + "learning_rate": 4.84321422597644e-06, + "loss": 2.5892, + "step": 8369 + }, + { + "epoch": 1.6773547094188377, + "grad_norm": 21.857698673668573, + "learning_rate": 4.842048918976787e-06, + "loss": 3.1121, + "step": 8370 + }, + { + "epoch": 1.677555110220441, + "grad_norm": 38.92425886356942, + "learning_rate": 4.840883620565169e-06, + "loss": 2.1318, + "step": 8371 + }, + { + "epoch": 1.6777555110220441, + "grad_norm": 25.339688657663267, + "learning_rate": 4.839718330804942e-06, + "loss": 2.5408, + "step": 8372 + }, + { + "epoch": 1.6779559118236473, + "grad_norm": 25.75744325530769, + "learning_rate": 4.838553049759468e-06, + "loss": 2.8769, + "step": 8373 + }, + { + "epoch": 1.6781563126252506, + "grad_norm": 22.855518064423798, + "learning_rate": 4.8373877774921e-06, + "loss": 2.3736, + "step": 8374 + }, + { + "epoch": 1.6783567134268536, + "grad_norm": 21.490335182227415, + "learning_rate": 4.836222514066202e-06, + "loss": 2.626, + "step": 8375 + }, + { + "epoch": 1.678557114228457, + "grad_norm": 25.023240353620473, + "learning_rate": 4.8350572595451264e-06, + "loss": 2.7504, + "step": 8376 + }, + { + "epoch": 1.67875751503006, + "grad_norm": 25.553947260802474, + "learning_rate": 4.833892013992229e-06, + "loss": 2.812, + "step": 8377 + }, + { + "epoch": 1.6789579158316634, + "grad_norm": 95.8351678376814, + "learning_rate": 4.832726777470871e-06, + "loss": 2.7245, + "step": 8378 + }, + { + "epoch": 1.6791583166332664, + "grad_norm": 28.24311369622681, + "learning_rate": 4.831561550044403e-06, + "loss": 2.3964, + "step": 8379 + }, + { + "epoch": 1.6793587174348699, + "grad_norm": 25.53473246946435, + "learning_rate": 4.830396331776182e-06, + "loss": 2.9943, + "step": 8380 + }, + { + "epoch": 1.6795591182364729, + "grad_norm": 34.601789258667495, + "learning_rate": 4.829231122729564e-06, + "loss": 2.344, + "step": 8381 + }, + { + "epoch": 1.6797595190380763, + "grad_norm": 39.067367570504814, + "learning_rate": 4.828065922967903e-06, + "loss": 2.7279, + "step": 8382 + }, + { + "epoch": 1.6799599198396793, + "grad_norm": 24.778181636286376, + "learning_rate": 4.826900732554551e-06, + "loss": 2.8744, + "step": 8383 + }, + { + "epoch": 1.6801603206412825, + "grad_norm": 19.206372360270752, + "learning_rate": 4.825735551552859e-06, + "loss": 2.9824, + "step": 8384 + }, + { + "epoch": 1.6803607214428857, + "grad_norm": 25.824370924889447, + "learning_rate": 4.8245703800261835e-06, + "loss": 2.9575, + "step": 8385 + }, + { + "epoch": 1.680561122244489, + "grad_norm": 30.18716899939884, + "learning_rate": 4.823405218037878e-06, + "loss": 2.5626, + "step": 8386 + }, + { + "epoch": 1.6807615230460922, + "grad_norm": 29.890326314210053, + "learning_rate": 4.822240065651287e-06, + "loss": 2.4416, + "step": 8387 + }, + { + "epoch": 1.6809619238476954, + "grad_norm": 29.673015852293297, + "learning_rate": 4.82107492292977e-06, + "loss": 2.8834, + "step": 8388 + }, + { + "epoch": 1.6811623246492986, + "grad_norm": 38.380177719597306, + "learning_rate": 4.819909789936673e-06, + "loss": 2.7804, + "step": 8389 + }, + { + "epoch": 1.6813627254509018, + "grad_norm": 29.177681044685883, + "learning_rate": 4.818744666735346e-06, + "loss": 2.4711, + "step": 8390 + }, + { + "epoch": 1.681563126252505, + "grad_norm": 28.091736109598532, + "learning_rate": 4.817579553389138e-06, + "loss": 2.9234, + "step": 8391 + }, + { + "epoch": 1.6817635270541083, + "grad_norm": 
21.68593405048972, + "learning_rate": 4.816414449961399e-06, + "loss": 2.2863, + "step": 8392 + }, + { + "epoch": 1.6819639278557115, + "grad_norm": 25.52282831165206, + "learning_rate": 4.815249356515479e-06, + "loss": 2.882, + "step": 8393 + }, + { + "epoch": 1.6821643286573145, + "grad_norm": 32.678063183155714, + "learning_rate": 4.8140842731147215e-06, + "loss": 3.1311, + "step": 8394 + }, + { + "epoch": 1.682364729458918, + "grad_norm": 20.319089036013942, + "learning_rate": 4.812919199822479e-06, + "loss": 2.8313, + "step": 8395 + }, + { + "epoch": 1.682565130260521, + "grad_norm": 45.17811957743002, + "learning_rate": 4.811754136702096e-06, + "loss": 3.0019, + "step": 8396 + }, + { + "epoch": 1.6827655310621243, + "grad_norm": 21.97680056655337, + "learning_rate": 4.810589083816916e-06, + "loss": 2.6456, + "step": 8397 + }, + { + "epoch": 1.6829659318637273, + "grad_norm": 24.27722918624885, + "learning_rate": 4.809424041230288e-06, + "loss": 2.6802, + "step": 8398 + }, + { + "epoch": 1.6831663326653308, + "grad_norm": 26.18596612593098, + "learning_rate": 4.808259009005558e-06, + "loss": 2.7464, + "step": 8399 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 24.629313328899773, + "learning_rate": 4.807093987206068e-06, + "loss": 2.3055, + "step": 8400 + }, + { + "epoch": 1.6835671342685372, + "grad_norm": 25.46373712017328, + "learning_rate": 4.805928975895161e-06, + "loss": 2.76, + "step": 8401 + }, + { + "epoch": 1.6837675350701402, + "grad_norm": 40.66335534067424, + "learning_rate": 4.804763975136184e-06, + "loss": 2.9801, + "step": 8402 + }, + { + "epoch": 1.6839679358717436, + "grad_norm": 22.586466324831047, + "learning_rate": 4.803598984992477e-06, + "loss": 2.6256, + "step": 8403 + }, + { + "epoch": 1.6841683366733466, + "grad_norm": 26.48236112129184, + "learning_rate": 4.802434005527383e-06, + "loss": 2.7104, + "step": 8404 + }, + { + "epoch": 1.6843687374749499, + "grad_norm": 22.25254301881873, + "learning_rate": 4.801269036804243e-06, + "loss": 2.7354, + "step": 8405 + }, + { + "epoch": 1.684569138276553, + "grad_norm": 24.494038990725677, + "learning_rate": 4.800104078886401e-06, + "loss": 2.8671, + "step": 8406 + }, + { + "epoch": 1.6847695390781563, + "grad_norm": 32.82011901318304, + "learning_rate": 4.798939131837194e-06, + "loss": 3.3107, + "step": 8407 + }, + { + "epoch": 1.6849699398797595, + "grad_norm": 27.353276575923186, + "learning_rate": 4.797774195719963e-06, + "loss": 2.7249, + "step": 8408 + }, + { + "epoch": 1.6851703406813627, + "grad_norm": 31.214675523270035, + "learning_rate": 4.796609270598048e-06, + "loss": 2.8624, + "step": 8409 + }, + { + "epoch": 1.685370741482966, + "grad_norm": 23.482991301460835, + "learning_rate": 4.795444356534787e-06, + "loss": 2.8837, + "step": 8410 + }, + { + "epoch": 1.6855711422845692, + "grad_norm": 18.091726413007393, + "learning_rate": 4.794279453593517e-06, + "loss": 2.8853, + "step": 8411 + }, + { + "epoch": 1.6857715430861724, + "grad_norm": 19.26672301376966, + "learning_rate": 4.793114561837579e-06, + "loss": 2.6851, + "step": 8412 + }, + { + "epoch": 1.6859719438877756, + "grad_norm": 23.710328497701997, + "learning_rate": 4.791949681330307e-06, + "loss": 2.1893, + "step": 8413 + }, + { + "epoch": 1.6861723446893788, + "grad_norm": 48.37698162001844, + "learning_rate": 4.790784812135036e-06, + "loss": 2.5981, + "step": 8414 + }, + { + "epoch": 1.6863727454909818, + "grad_norm": 25.739552374569364, + "learning_rate": 4.7896199543151064e-06, + "loss": 2.5761, + "step": 8415 + }, + { + "epoch": 
1.6865731462925853, + "grad_norm": 38.62981679599212, + "learning_rate": 4.7884551079338495e-06, + "loss": 2.6565, + "step": 8416 + }, + { + "epoch": 1.6867735470941883, + "grad_norm": 18.007010999094433, + "learning_rate": 4.7872902730546e-06, + "loss": 2.4223, + "step": 8417 + }, + { + "epoch": 1.6869739478957917, + "grad_norm": 25.118971528593704, + "learning_rate": 4.7861254497406915e-06, + "loss": 2.5814, + "step": 8418 + }, + { + "epoch": 1.6871743486973947, + "grad_norm": 21.849703724149997, + "learning_rate": 4.78496063805546e-06, + "loss": 2.5982, + "step": 8419 + }, + { + "epoch": 1.6873747494989981, + "grad_norm": 31.33271461610673, + "learning_rate": 4.783795838062237e-06, + "loss": 2.3899, + "step": 8420 + }, + { + "epoch": 1.6875751503006011, + "grad_norm": 26.76493691230619, + "learning_rate": 4.78263104982435e-06, + "loss": 2.8401, + "step": 8421 + }, + { + "epoch": 1.6877755511022046, + "grad_norm": 21.53847610680552, + "learning_rate": 4.781466273405136e-06, + "loss": 3.0071, + "step": 8422 + }, + { + "epoch": 1.6879759519038076, + "grad_norm": 15.563149805571504, + "learning_rate": 4.780301508867923e-06, + "loss": 2.5774, + "step": 8423 + }, + { + "epoch": 1.6881763527054108, + "grad_norm": 20.572986842202326, + "learning_rate": 4.779136756276039e-06, + "loss": 2.2334, + "step": 8424 + }, + { + "epoch": 1.688376753507014, + "grad_norm": 27.18120413804928, + "learning_rate": 4.777972015692818e-06, + "loss": 2.4361, + "step": 8425 + }, + { + "epoch": 1.6885771543086172, + "grad_norm": 29.9661230762316, + "learning_rate": 4.7768072871815865e-06, + "loss": 2.9147, + "step": 8426 + }, + { + "epoch": 1.6887775551102204, + "grad_norm": 31.82651536443186, + "learning_rate": 4.775642570805673e-06, + "loss": 3.2993, + "step": 8427 + }, + { + "epoch": 1.6889779559118236, + "grad_norm": 23.958425827278056, + "learning_rate": 4.774477866628402e-06, + "loss": 2.3288, + "step": 8428 + }, + { + "epoch": 1.6891783567134269, + "grad_norm": 27.4561303460451, + "learning_rate": 4.773313174713102e-06, + "loss": 2.6904, + "step": 8429 + }, + { + "epoch": 1.68937875751503, + "grad_norm": 30.299961642191747, + "learning_rate": 4.772148495123101e-06, + "loss": 3.3062, + "step": 8430 + }, + { + "epoch": 1.6895791583166333, + "grad_norm": 22.11763668008553, + "learning_rate": 4.7709838279217205e-06, + "loss": 2.6233, + "step": 8431 + }, + { + "epoch": 1.6897795591182365, + "grad_norm": 25.528425470389358, + "learning_rate": 4.7698191731722895e-06, + "loss": 2.297, + "step": 8432 + }, + { + "epoch": 1.6899799599198397, + "grad_norm": 26.362011830030223, + "learning_rate": 4.76865453093813e-06, + "loss": 3.0175, + "step": 8433 + }, + { + "epoch": 1.6901803607214427, + "grad_norm": 24.185137813708884, + "learning_rate": 4.767489901282564e-06, + "loss": 2.4964, + "step": 8434 + }, + { + "epoch": 1.6903807615230462, + "grad_norm": 38.28143877478678, + "learning_rate": 4.766325284268914e-06, + "loss": 2.4201, + "step": 8435 + }, + { + "epoch": 1.6905811623246492, + "grad_norm": 28.345179097096256, + "learning_rate": 4.765160679960506e-06, + "loss": 3.1021, + "step": 8436 + }, + { + "epoch": 1.6907815631262526, + "grad_norm": 20.61262010102559, + "learning_rate": 4.7639960884206576e-06, + "loss": 2.0834, + "step": 8437 + }, + { + "epoch": 1.6909819639278556, + "grad_norm": 30.384916993630167, + "learning_rate": 4.762831509712689e-06, + "loss": 3.0136, + "step": 8438 + }, + { + "epoch": 1.691182364729459, + "grad_norm": 21.361169055808315, + "learning_rate": 4.761666943899922e-06, + "loss": 3.0001, + 
"step": 8439 + }, + { + "epoch": 1.691382765531062, + "grad_norm": 23.342015007202768, + "learning_rate": 4.760502391045676e-06, + "loss": 2.8889, + "step": 8440 + }, + { + "epoch": 1.6915831663326655, + "grad_norm": 20.441877824457393, + "learning_rate": 4.759337851213267e-06, + "loss": 2.5147, + "step": 8441 + }, + { + "epoch": 1.6917835671342685, + "grad_norm": 19.147458118348105, + "learning_rate": 4.7581733244660145e-06, + "loss": 2.466, + "step": 8442 + }, + { + "epoch": 1.6919839679358717, + "grad_norm": 22.87486386102037, + "learning_rate": 4.757008810867237e-06, + "loss": 2.4955, + "step": 8443 + }, + { + "epoch": 1.692184368737475, + "grad_norm": 28.541610194353524, + "learning_rate": 4.7558443104802485e-06, + "loss": 2.6074, + "step": 8444 + }, + { + "epoch": 1.6923847695390781, + "grad_norm": 22.419896177700554, + "learning_rate": 4.754679823368364e-06, + "loss": 2.5418, + "step": 8445 + }, + { + "epoch": 1.6925851703406813, + "grad_norm": 19.1108302545013, + "learning_rate": 4.753515349594901e-06, + "loss": 2.5843, + "step": 8446 + }, + { + "epoch": 1.6927855711422846, + "grad_norm": 23.241757684570953, + "learning_rate": 4.752350889223172e-06, + "loss": 2.4095, + "step": 8447 + }, + { + "epoch": 1.6929859719438878, + "grad_norm": 20.743714959638236, + "learning_rate": 4.751186442316488e-06, + "loss": 2.5681, + "step": 8448 + }, + { + "epoch": 1.693186372745491, + "grad_norm": 71.40430220466487, + "learning_rate": 4.750022008938167e-06, + "loss": 2.539, + "step": 8449 + }, + { + "epoch": 1.6933867735470942, + "grad_norm": 46.13669046528572, + "learning_rate": 4.7488575891515195e-06, + "loss": 2.8135, + "step": 8450 + }, + { + "epoch": 1.6935871743486974, + "grad_norm": 23.66901917909789, + "learning_rate": 4.747693183019853e-06, + "loss": 2.8786, + "step": 8451 + }, + { + "epoch": 1.6937875751503007, + "grad_norm": 21.966745460068257, + "learning_rate": 4.746528790606483e-06, + "loss": 3.1364, + "step": 8452 + }, + { + "epoch": 1.6939879759519036, + "grad_norm": 30.973325546170322, + "learning_rate": 4.745364411974716e-06, + "loss": 2.3846, + "step": 8453 + }, + { + "epoch": 1.694188376753507, + "grad_norm": 23.290113801655046, + "learning_rate": 4.744200047187862e-06, + "loss": 2.8789, + "step": 8454 + }, + { + "epoch": 1.69438877755511, + "grad_norm": 25.18268684483408, + "learning_rate": 4.743035696309226e-06, + "loss": 2.2871, + "step": 8455 + }, + { + "epoch": 1.6945891783567135, + "grad_norm": 24.62884461800587, + "learning_rate": 4.741871359402122e-06, + "loss": 2.8418, + "step": 8456 + }, + { + "epoch": 1.6947895791583165, + "grad_norm": 22.570781308486023, + "learning_rate": 4.740707036529854e-06, + "loss": 2.2146, + "step": 8457 + }, + { + "epoch": 1.69498997995992, + "grad_norm": 27.38374970153898, + "learning_rate": 4.739542727755723e-06, + "loss": 2.643, + "step": 8458 + }, + { + "epoch": 1.695190380761523, + "grad_norm": 34.5335659900903, + "learning_rate": 4.738378433143042e-06, + "loss": 2.7768, + "step": 8459 + }, + { + "epoch": 1.6953907815631264, + "grad_norm": 20.2445125265252, + "learning_rate": 4.737214152755111e-06, + "loss": 2.8502, + "step": 8460 + }, + { + "epoch": 1.6955911823647294, + "grad_norm": 29.73036990733104, + "learning_rate": 4.736049886655234e-06, + "loss": 3.0422, + "step": 8461 + }, + { + "epoch": 1.6957915831663328, + "grad_norm": 47.63726930810955, + "learning_rate": 4.7348856349067136e-06, + "loss": 2.8084, + "step": 8462 + }, + { + "epoch": 1.6959919839679358, + "grad_norm": 21.09620361724617, + "learning_rate": 
4.733721397572853e-06, + "loss": 2.6615, + "step": 8463 + }, + { + "epoch": 1.696192384769539, + "grad_norm": 26.790183571950735, + "learning_rate": 4.732557174716955e-06, + "loss": 2.6635, + "step": 8464 + }, + { + "epoch": 1.6963927855711423, + "grad_norm": 28.977615781616375, + "learning_rate": 4.731392966402315e-06, + "loss": 3.1381, + "step": 8465 + }, + { + "epoch": 1.6965931863727455, + "grad_norm": 36.859644447782564, + "learning_rate": 4.730228772692237e-06, + "loss": 2.8531, + "step": 8466 + }, + { + "epoch": 1.6967935871743487, + "grad_norm": 23.475286454589416, + "learning_rate": 4.72906459365002e-06, + "loss": 2.4193, + "step": 8467 + }, + { + "epoch": 1.696993987975952, + "grad_norm": 19.52480229518473, + "learning_rate": 4.727900429338959e-06, + "loss": 3.016, + "step": 8468 + }, + { + "epoch": 1.6971943887775551, + "grad_norm": 25.19783868978937, + "learning_rate": 4.7267362798223555e-06, + "loss": 2.6211, + "step": 8469 + }, + { + "epoch": 1.6973947895791583, + "grad_norm": 28.80964927381454, + "learning_rate": 4.725572145163504e-06, + "loss": 2.9166, + "step": 8470 + }, + { + "epoch": 1.6975951903807616, + "grad_norm": 19.303440239209195, + "learning_rate": 4.724408025425698e-06, + "loss": 2.3617, + "step": 8471 + }, + { + "epoch": 1.6977955911823648, + "grad_norm": 27.028931357568354, + "learning_rate": 4.723243920672236e-06, + "loss": 2.6023, + "step": 8472 + }, + { + "epoch": 1.697995991983968, + "grad_norm": 18.496166807206443, + "learning_rate": 4.7220798309664095e-06, + "loss": 2.489, + "step": 8473 + }, + { + "epoch": 1.698196392785571, + "grad_norm": 34.32680051634504, + "learning_rate": 4.720915756371514e-06, + "loss": 4.0525, + "step": 8474 + }, + { + "epoch": 1.6983967935871744, + "grad_norm": 22.26211974293894, + "learning_rate": 4.719751696950839e-06, + "loss": 2.7503, + "step": 8475 + }, + { + "epoch": 1.6985971943887774, + "grad_norm": 25.10289180494937, + "learning_rate": 4.718587652767679e-06, + "loss": 2.4703, + "step": 8476 + }, + { + "epoch": 1.6987975951903809, + "grad_norm": 26.640311580726685, + "learning_rate": 4.7174236238853245e-06, + "loss": 2.8071, + "step": 8477 + }, + { + "epoch": 1.6989979959919839, + "grad_norm": 35.18677039094227, + "learning_rate": 4.716259610367064e-06, + "loss": 2.8158, + "step": 8478 + }, + { + "epoch": 1.6991983967935873, + "grad_norm": 27.690088106214475, + "learning_rate": 4.715095612276186e-06, + "loss": 2.7768, + "step": 8479 + }, + { + "epoch": 1.6993987975951903, + "grad_norm": 25.83839465240193, + "learning_rate": 4.7139316296759824e-06, + "loss": 2.7734, + "step": 8480 + }, + { + "epoch": 1.6995991983967937, + "grad_norm": 35.11200967509209, + "learning_rate": 4.712767662629738e-06, + "loss": 2.4899, + "step": 8481 + }, + { + "epoch": 1.6997995991983967, + "grad_norm": 26.316690632533454, + "learning_rate": 4.711603711200738e-06, + "loss": 2.38, + "step": 8482 + }, + { + "epoch": 1.7, + "grad_norm": 26.550883237158725, + "learning_rate": 4.710439775452272e-06, + "loss": 2.8135, + "step": 8483 + }, + { + "epoch": 1.7002004008016032, + "grad_norm": 19.919223151806925, + "learning_rate": 4.7092758554476215e-06, + "loss": 2.5839, + "step": 8484 + }, + { + "epoch": 1.7004008016032064, + "grad_norm": 32.87308977846171, + "learning_rate": 4.708111951250073e-06, + "loss": 2.8882, + "step": 8485 + }, + { + "epoch": 1.7006012024048096, + "grad_norm": 29.04473363347507, + "learning_rate": 4.706948062922907e-06, + "loss": 3.0953, + "step": 8486 + }, + { + "epoch": 1.7008016032064128, + "grad_norm": 19.734681665628802, + 
"learning_rate": 4.705784190529409e-06, + "loss": 2.2617, + "step": 8487 + }, + { + "epoch": 1.701002004008016, + "grad_norm": 24.924796006792956, + "learning_rate": 4.704620334132859e-06, + "loss": 2.6288, + "step": 8488 + }, + { + "epoch": 1.7012024048096193, + "grad_norm": 22.33055601727067, + "learning_rate": 4.703456493796535e-06, + "loss": 2.3256, + "step": 8489 + }, + { + "epoch": 1.7014028056112225, + "grad_norm": 26.878758668835744, + "learning_rate": 4.702292669583722e-06, + "loss": 2.8874, + "step": 8490 + }, + { + "epoch": 1.7016032064128257, + "grad_norm": 20.758103757881354, + "learning_rate": 4.701128861557694e-06, + "loss": 2.4639, + "step": 8491 + }, + { + "epoch": 1.701803607214429, + "grad_norm": 24.88569774364957, + "learning_rate": 4.69996506978173e-06, + "loss": 2.7107, + "step": 8492 + }, + { + "epoch": 1.702004008016032, + "grad_norm": 73.15858241077235, + "learning_rate": 4.698801294319111e-06, + "loss": 2.3947, + "step": 8493 + }, + { + "epoch": 1.7022044088176354, + "grad_norm": 25.117607343433434, + "learning_rate": 4.69763753523311e-06, + "loss": 2.9587, + "step": 8494 + }, + { + "epoch": 1.7024048096192383, + "grad_norm": 22.262463455650487, + "learning_rate": 4.696473792587e-06, + "loss": 2.3662, + "step": 8495 + }, + { + "epoch": 1.7026052104208418, + "grad_norm": 20.923456150638746, + "learning_rate": 4.69531006644406e-06, + "loss": 3.3143, + "step": 8496 + }, + { + "epoch": 1.7028056112224448, + "grad_norm": 23.75457032165751, + "learning_rate": 4.694146356867561e-06, + "loss": 2.626, + "step": 8497 + }, + { + "epoch": 1.7030060120240482, + "grad_norm": 27.13037940437826, + "learning_rate": 4.6929826639207774e-06, + "loss": 2.5504, + "step": 8498 + }, + { + "epoch": 1.7032064128256512, + "grad_norm": 22.289916338188664, + "learning_rate": 4.691818987666976e-06, + "loss": 2.3874, + "step": 8499 + }, + { + "epoch": 1.7034068136272547, + "grad_norm": 28.664240864379092, + "learning_rate": 4.690655328169434e-06, + "loss": 2.4935, + "step": 8500 + }, + { + "epoch": 1.7036072144288577, + "grad_norm": 24.156105373360887, + "learning_rate": 4.689491685491419e-06, + "loss": 2.8528, + "step": 8501 + }, + { + "epoch": 1.7038076152304609, + "grad_norm": 29.224634244845568, + "learning_rate": 4.6883280596961975e-06, + "loss": 2.7395, + "step": 8502 + }, + { + "epoch": 1.704008016032064, + "grad_norm": 21.622908910223526, + "learning_rate": 4.687164450847041e-06, + "loss": 2.8549, + "step": 8503 + }, + { + "epoch": 1.7042084168336673, + "grad_norm": 39.098903577121234, + "learning_rate": 4.686000859007214e-06, + "loss": 2.6777, + "step": 8504 + }, + { + "epoch": 1.7044088176352705, + "grad_norm": 25.636245961798416, + "learning_rate": 4.6848372842399845e-06, + "loss": 2.7726, + "step": 8505 + }, + { + "epoch": 1.7046092184368737, + "grad_norm": 28.455078275336852, + "learning_rate": 4.6836737266086155e-06, + "loss": 3.3482, + "step": 8506 + }, + { + "epoch": 1.704809619238477, + "grad_norm": 23.636858232533665, + "learning_rate": 4.682510186176374e-06, + "loss": 2.8127, + "step": 8507 + }, + { + "epoch": 1.7050100200400802, + "grad_norm": 29.28456113799536, + "learning_rate": 4.681346663006524e-06, + "loss": 2.866, + "step": 8508 + }, + { + "epoch": 1.7052104208416834, + "grad_norm": 27.9824299641389, + "learning_rate": 4.680183157162323e-06, + "loss": 3.4689, + "step": 8509 + }, + { + "epoch": 1.7054108216432866, + "grad_norm": 26.976576435833756, + "learning_rate": 4.679019668707037e-06, + "loss": 2.9384, + "step": 8510 + }, + { + "epoch": 1.7056112224448898, + 
"grad_norm": 46.24726205485753, + "learning_rate": 4.6778561977039275e-06, + "loss": 2.9244, + "step": 8511 + }, + { + "epoch": 1.7058116232464928, + "grad_norm": 23.232793787744534, + "learning_rate": 4.676692744216249e-06, + "loss": 2.5382, + "step": 8512 + }, + { + "epoch": 1.7060120240480963, + "grad_norm": 22.625042841133432, + "learning_rate": 4.675529308307266e-06, + "loss": 3.0419, + "step": 8513 + }, + { + "epoch": 1.7062124248496993, + "grad_norm": 19.09723821967963, + "learning_rate": 4.674365890040233e-06, + "loss": 2.7287, + "step": 8514 + }, + { + "epoch": 1.7064128256513027, + "grad_norm": 25.22968160765599, + "learning_rate": 4.6732024894784055e-06, + "loss": 2.8696, + "step": 8515 + }, + { + "epoch": 1.7066132264529057, + "grad_norm": 28.39495337494719, + "learning_rate": 4.672039106685042e-06, + "loss": 3.0266, + "step": 8516 + }, + { + "epoch": 1.7068136272545091, + "grad_norm": 27.988317589839788, + "learning_rate": 4.670875741723396e-06, + "loss": 3.1343, + "step": 8517 + }, + { + "epoch": 1.7070140280561121, + "grad_norm": 20.097095820658755, + "learning_rate": 4.669712394656723e-06, + "loss": 2.5335, + "step": 8518 + }, + { + "epoch": 1.7072144288577156, + "grad_norm": 22.175560207496602, + "learning_rate": 4.6685490655482724e-06, + "loss": 2.3055, + "step": 8519 + }, + { + "epoch": 1.7074148296593186, + "grad_norm": 21.872156406350225, + "learning_rate": 4.6673857544613e-06, + "loss": 3.5234, + "step": 8520 + }, + { + "epoch": 1.707615230460922, + "grad_norm": 30.707023755300174, + "learning_rate": 4.666222461459056e-06, + "loss": 2.5433, + "step": 8521 + }, + { + "epoch": 1.707815631262525, + "grad_norm": 29.878793187640397, + "learning_rate": 4.665059186604789e-06, + "loss": 2.3374, + "step": 8522 + }, + { + "epoch": 1.7080160320641282, + "grad_norm": 30.13565201139005, + "learning_rate": 4.663895929961746e-06, + "loss": 2.6737, + "step": 8523 + }, + { + "epoch": 1.7082164328657314, + "grad_norm": 28.58989110914879, + "learning_rate": 4.66273269159318e-06, + "loss": 2.7223, + "step": 8524 + }, + { + "epoch": 1.7084168336673347, + "grad_norm": 28.88234747779724, + "learning_rate": 4.661569471562335e-06, + "loss": 3.0785, + "step": 8525 + }, + { + "epoch": 1.7086172344689379, + "grad_norm": 22.541021309651253, + "learning_rate": 4.660406269932456e-06, + "loss": 2.5945, + "step": 8526 + }, + { + "epoch": 1.708817635270541, + "grad_norm": 29.419036565294714, + "learning_rate": 4.659243086766792e-06, + "loss": 2.7175, + "step": 8527 + }, + { + "epoch": 1.7090180360721443, + "grad_norm": 16.807674394181625, + "learning_rate": 4.658079922128582e-06, + "loss": 2.8013, + "step": 8528 + }, + { + "epoch": 1.7092184368737475, + "grad_norm": 22.712911209396406, + "learning_rate": 4.656916776081072e-06, + "loss": 2.7207, + "step": 8529 + }, + { + "epoch": 1.7094188376753507, + "grad_norm": 24.767373441803276, + "learning_rate": 4.6557536486875036e-06, + "loss": 2.9459, + "step": 8530 + }, + { + "epoch": 1.709619238476954, + "grad_norm": 40.69790143489044, + "learning_rate": 4.654590540011118e-06, + "loss": 2.2847, + "step": 8531 + }, + { + "epoch": 1.7098196392785572, + "grad_norm": 25.81022070407046, + "learning_rate": 4.653427450115155e-06, + "loss": 2.7686, + "step": 8532 + }, + { + "epoch": 1.7100200400801602, + "grad_norm": 27.114933058746477, + "learning_rate": 4.65226437906285e-06, + "loss": 2.7096, + "step": 8533 + }, + { + "epoch": 1.7102204408817636, + "grad_norm": 22.935942804521765, + "learning_rate": 4.651101326917447e-06, + "loss": 2.6625, + "step": 8534 + }, + { 
+ "epoch": 1.7104208416833666, + "grad_norm": 33.926081204870115, + "learning_rate": 4.649938293742179e-06, + "loss": 2.7319, + "step": 8535 + }, + { + "epoch": 1.71062124248497, + "grad_norm": 20.56566848001298, + "learning_rate": 4.648775279600281e-06, + "loss": 2.515, + "step": 8536 + }, + { + "epoch": 1.710821643286573, + "grad_norm": 24.80714608763012, + "learning_rate": 4.647612284554992e-06, + "loss": 2.7386, + "step": 8537 + }, + { + "epoch": 1.7110220440881765, + "grad_norm": 21.728869139602217, + "learning_rate": 4.646449308669543e-06, + "loss": 2.8673, + "step": 8538 + }, + { + "epoch": 1.7112224448897795, + "grad_norm": 23.70743668784998, + "learning_rate": 4.645286352007166e-06, + "loss": 2.9409, + "step": 8539 + }, + { + "epoch": 1.711422845691383, + "grad_norm": 21.11515081047184, + "learning_rate": 4.644123414631094e-06, + "loss": 2.7427, + "step": 8540 + }, + { + "epoch": 1.711623246492986, + "grad_norm": 35.02530933847464, + "learning_rate": 4.642960496604558e-06, + "loss": 2.8986, + "step": 8541 + }, + { + "epoch": 1.7118236472945891, + "grad_norm": 29.873603143673062, + "learning_rate": 4.641797597990786e-06, + "loss": 2.6157, + "step": 8542 + }, + { + "epoch": 1.7120240480961924, + "grad_norm": 30.694299703817677, + "learning_rate": 4.640634718853006e-06, + "loss": 2.4191, + "step": 8543 + }, + { + "epoch": 1.7122244488977956, + "grad_norm": 22.29087449542697, + "learning_rate": 4.6394718592544494e-06, + "loss": 2.0152, + "step": 8544 + }, + { + "epoch": 1.7124248496993988, + "grad_norm": 28.66488117777487, + "learning_rate": 4.63830901925834e-06, + "loss": 2.6024, + "step": 8545 + }, + { + "epoch": 1.712625250501002, + "grad_norm": 22.91583493599682, + "learning_rate": 4.637146198927901e-06, + "loss": 2.9186, + "step": 8546 + }, + { + "epoch": 1.7128256513026052, + "grad_norm": 22.59412256075667, + "learning_rate": 4.63598339832636e-06, + "loss": 2.428, + "step": 8547 + }, + { + "epoch": 1.7130260521042084, + "grad_norm": 26.117423234306646, + "learning_rate": 4.634820617516939e-06, + "loss": 2.7981, + "step": 8548 + }, + { + "epoch": 1.7132264529058117, + "grad_norm": 26.449805092216394, + "learning_rate": 4.633657856562861e-06, + "loss": 3.2485, + "step": 8549 + }, + { + "epoch": 1.7134268537074149, + "grad_norm": 28.17011026390615, + "learning_rate": 4.6324951155273435e-06, + "loss": 2.5998, + "step": 8550 + }, + { + "epoch": 1.713627254509018, + "grad_norm": 38.907875741785595, + "learning_rate": 4.631332394473611e-06, + "loss": 2.5363, + "step": 8551 + }, + { + "epoch": 1.713827655310621, + "grad_norm": 28.027874576358943, + "learning_rate": 4.6301696934648816e-06, + "loss": 2.8914, + "step": 8552 + }, + { + "epoch": 1.7140280561122245, + "grad_norm": 32.42046725891152, + "learning_rate": 4.629007012564368e-06, + "loss": 2.0919, + "step": 8553 + }, + { + "epoch": 1.7142284569138275, + "grad_norm": 38.351545399820445, + "learning_rate": 4.6278443518352935e-06, + "loss": 2.6913, + "step": 8554 + }, + { + "epoch": 1.714428857715431, + "grad_norm": 32.43543544677632, + "learning_rate": 4.626681711340871e-06, + "loss": 2.974, + "step": 8555 + }, + { + "epoch": 1.714629258517034, + "grad_norm": 59.19133299108548, + "learning_rate": 4.625519091144313e-06, + "loss": 3.2178, + "step": 8556 + }, + { + "epoch": 1.7148296593186374, + "grad_norm": 21.951859796717958, + "learning_rate": 4.6243564913088375e-06, + "loss": 2.5726, + "step": 8557 + }, + { + "epoch": 1.7150300601202404, + "grad_norm": 25.34848032673499, + "learning_rate": 4.623193911897653e-06, + "loss": 2.7664, + 
"step": 8558 + }, + { + "epoch": 1.7152304609218438, + "grad_norm": 16.987462741305944, + "learning_rate": 4.622031352973972e-06, + "loss": 2.8593, + "step": 8559 + }, + { + "epoch": 1.7154308617234468, + "grad_norm": 28.63163753686281, + "learning_rate": 4.620868814601004e-06, + "loss": 3.0121, + "step": 8560 + }, + { + "epoch": 1.71563126252505, + "grad_norm": 31.211720272625143, + "learning_rate": 4.619706296841957e-06, + "loss": 2.3306, + "step": 8561 + }, + { + "epoch": 1.7158316633266533, + "grad_norm": 21.79311444385488, + "learning_rate": 4.618543799760042e-06, + "loss": 2.6583, + "step": 8562 + }, + { + "epoch": 1.7160320641282565, + "grad_norm": 25.336799340631018, + "learning_rate": 4.6173813234184615e-06, + "loss": 2.7957, + "step": 8563 + }, + { + "epoch": 1.7162324649298597, + "grad_norm": 24.59015768951278, + "learning_rate": 4.6162188678804246e-06, + "loss": 2.648, + "step": 8564 + }, + { + "epoch": 1.716432865731463, + "grad_norm": 19.210273511009532, + "learning_rate": 4.615056433209134e-06, + "loss": 3.138, + "step": 8565 + }, + { + "epoch": 1.7166332665330661, + "grad_norm": 23.829753828003938, + "learning_rate": 4.613894019467793e-06, + "loss": 2.7153, + "step": 8566 + }, + { + "epoch": 1.7168336673346694, + "grad_norm": 51.99610998269154, + "learning_rate": 4.6127316267196045e-06, + "loss": 3.1239, + "step": 8567 + }, + { + "epoch": 1.7170340681362726, + "grad_norm": 26.69944863779016, + "learning_rate": 4.61156925502777e-06, + "loss": 2.6431, + "step": 8568 + }, + { + "epoch": 1.7172344689378758, + "grad_norm": 19.79213291193112, + "learning_rate": 4.610406904455489e-06, + "loss": 2.6549, + "step": 8569 + }, + { + "epoch": 1.717434869739479, + "grad_norm": 21.936304074635146, + "learning_rate": 4.609244575065957e-06, + "loss": 2.9932, + "step": 8570 + }, + { + "epoch": 1.717635270541082, + "grad_norm": 43.43578534905866, + "learning_rate": 4.608082266922376e-06, + "loss": 2.9468, + "step": 8571 + }, + { + "epoch": 1.7178356713426854, + "grad_norm": 20.587321424720777, + "learning_rate": 4.606919980087941e-06, + "loss": 2.8077, + "step": 8572 + }, + { + "epoch": 1.7180360721442884, + "grad_norm": 28.35602498532913, + "learning_rate": 4.605757714625844e-06, + "loss": 3.0105, + "step": 8573 + }, + { + "epoch": 1.7182364729458919, + "grad_norm": 23.965282700753516, + "learning_rate": 4.604595470599286e-06, + "loss": 2.7929, + "step": 8574 + }, + { + "epoch": 1.7184368737474949, + "grad_norm": 25.042234851241297, + "learning_rate": 4.603433248071454e-06, + "loss": 2.4713, + "step": 8575 + }, + { + "epoch": 1.7186372745490983, + "grad_norm": 41.174090097137174, + "learning_rate": 4.602271047105543e-06, + "loss": 3.461, + "step": 8576 + }, + { + "epoch": 1.7188376753507013, + "grad_norm": 20.669668324054424, + "learning_rate": 4.60110886776474e-06, + "loss": 2.6176, + "step": 8577 + }, + { + "epoch": 1.7190380761523048, + "grad_norm": 23.664955311764952, + "learning_rate": 4.599946710112238e-06, + "loss": 3.043, + "step": 8578 + }, + { + "epoch": 1.7192384769539077, + "grad_norm": 27.52572532430017, + "learning_rate": 4.598784574211224e-06, + "loss": 2.3622, + "step": 8579 + }, + { + "epoch": 1.7194388777555112, + "grad_norm": 18.43435679256475, + "learning_rate": 4.5976224601248836e-06, + "loss": 2.2781, + "step": 8580 + }, + { + "epoch": 1.7196392785571142, + "grad_norm": 33.52554642288186, + "learning_rate": 4.596460367916406e-06, + "loss": 2.536, + "step": 8581 + }, + { + "epoch": 1.7198396793587174, + "grad_norm": 25.09816321440869, + "learning_rate": 
4.595298297648974e-06, + "loss": 2.3425, + "step": 8582 + }, + { + "epoch": 1.7200400801603206, + "grad_norm": 30.090838464131426, + "learning_rate": 4.594136249385768e-06, + "loss": 2.9838, + "step": 8583 + }, + { + "epoch": 1.7202404809619238, + "grad_norm": 15.638134788043315, + "learning_rate": 4.592974223189977e-06, + "loss": 2.7023, + "step": 8584 + }, + { + "epoch": 1.720440881763527, + "grad_norm": 24.148629775942695, + "learning_rate": 4.591812219124776e-06, + "loss": 2.7989, + "step": 8585 + }, + { + "epoch": 1.7206412825651303, + "grad_norm": 32.603871728210045, + "learning_rate": 4.590650237253348e-06, + "loss": 2.5668, + "step": 8586 + }, + { + "epoch": 1.7208416833667335, + "grad_norm": 42.99971753758617, + "learning_rate": 4.589488277638869e-06, + "loss": 2.515, + "step": 8587 + }, + { + "epoch": 1.7210420841683367, + "grad_norm": 31.939657772395027, + "learning_rate": 4.588326340344521e-06, + "loss": 2.958, + "step": 8588 + }, + { + "epoch": 1.72124248496994, + "grad_norm": 33.89633886758522, + "learning_rate": 4.5871644254334775e-06, + "loss": 2.9897, + "step": 8589 + }, + { + "epoch": 1.7214428857715431, + "grad_norm": 16.25522365553828, + "learning_rate": 4.586002532968911e-06, + "loss": 2.7934, + "step": 8590 + }, + { + "epoch": 1.7216432865731464, + "grad_norm": 41.052952614066285, + "learning_rate": 4.584840663013999e-06, + "loss": 2.6031, + "step": 8591 + }, + { + "epoch": 1.7218436873747494, + "grad_norm": 38.01549414962785, + "learning_rate": 4.583678815631914e-06, + "loss": 2.546, + "step": 8592 + }, + { + "epoch": 1.7220440881763528, + "grad_norm": 21.295884333162704, + "learning_rate": 4.582516990885827e-06, + "loss": 2.8608, + "step": 8593 + }, + { + "epoch": 1.7222444889779558, + "grad_norm": 25.04982689631112, + "learning_rate": 4.581355188838904e-06, + "loss": 2.4881, + "step": 8594 + }, + { + "epoch": 1.7224448897795592, + "grad_norm": 28.475119200865755, + "learning_rate": 4.580193409554321e-06, + "loss": 2.6742, + "step": 8595 + }, + { + "epoch": 1.7226452905811622, + "grad_norm": 26.24325199461328, + "learning_rate": 4.5790316530952404e-06, + "loss": 2.7492, + "step": 8596 + }, + { + "epoch": 1.7228456913827657, + "grad_norm": 20.498657925051052, + "learning_rate": 4.57786991952483e-06, + "loss": 2.4045, + "step": 8597 + }, + { + "epoch": 1.7230460921843687, + "grad_norm": 31.043753423221972, + "learning_rate": 4.576708208906256e-06, + "loss": 2.3789, + "step": 8598 + }, + { + "epoch": 1.723246492985972, + "grad_norm": 22.242485358165155, + "learning_rate": 4.5755465213026815e-06, + "loss": 2.6016, + "step": 8599 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 28.75814385257395, + "learning_rate": 4.574384856777268e-06, + "loss": 2.5024, + "step": 8600 + }, + { + "epoch": 1.7236472945891783, + "grad_norm": 28.928328302099477, + "learning_rate": 4.573223215393179e-06, + "loss": 3.1776, + "step": 8601 + }, + { + "epoch": 1.7238476953907815, + "grad_norm": 23.232120695812856, + "learning_rate": 4.5720615972135744e-06, + "loss": 2.3203, + "step": 8602 + }, + { + "epoch": 1.7240480961923847, + "grad_norm": 32.76892785185402, + "learning_rate": 4.570900002301612e-06, + "loss": 2.5362, + "step": 8603 + }, + { + "epoch": 1.724248496993988, + "grad_norm": 26.01524583592182, + "learning_rate": 4.569738430720448e-06, + "loss": 2.1839, + "step": 8604 + }, + { + "epoch": 1.7244488977955912, + "grad_norm": 23.223620343361482, + "learning_rate": 4.568576882533242e-06, + "loss": 2.7376, + "step": 8605 + }, + { + "epoch": 1.7246492985971944, + "grad_norm": 
25.922524934585837, + "learning_rate": 4.567415357803149e-06, + "loss": 2.6239, + "step": 8606 + }, + { + "epoch": 1.7248496993987976, + "grad_norm": 27.557392402979065, + "learning_rate": 4.56625385659332e-06, + "loss": 2.7742, + "step": 8607 + }, + { + "epoch": 1.7250501002004008, + "grad_norm": 23.044413678231287, + "learning_rate": 4.56509237896691e-06, + "loss": 2.5308, + "step": 8608 + }, + { + "epoch": 1.725250501002004, + "grad_norm": 36.366546207625845, + "learning_rate": 4.563930924987069e-06, + "loss": 2.756, + "step": 8609 + }, + { + "epoch": 1.7254509018036073, + "grad_norm": 27.27226287175164, + "learning_rate": 4.562769494716947e-06, + "loss": 2.5633, + "step": 8610 + }, + { + "epoch": 1.7256513026052103, + "grad_norm": 42.650470186322124, + "learning_rate": 4.561608088219693e-06, + "loss": 2.7417, + "step": 8611 + }, + { + "epoch": 1.7258517034068137, + "grad_norm": 20.163744170357674, + "learning_rate": 4.560446705558455e-06, + "loss": 2.6424, + "step": 8612 + }, + { + "epoch": 1.7260521042084167, + "grad_norm": 23.46472016986198, + "learning_rate": 4.559285346796379e-06, + "loss": 2.4316, + "step": 8613 + }, + { + "epoch": 1.7262525050100201, + "grad_norm": 23.635712243658755, + "learning_rate": 4.558124011996607e-06, + "loss": 2.5623, + "step": 8614 + }, + { + "epoch": 1.7264529058116231, + "grad_norm": 18.522972147839138, + "learning_rate": 4.5569627012222876e-06, + "loss": 2.3997, + "step": 8615 + }, + { + "epoch": 1.7266533066132266, + "grad_norm": 22.952925628962625, + "learning_rate": 4.555801414536557e-06, + "loss": 2.5552, + "step": 8616 + }, + { + "epoch": 1.7268537074148296, + "grad_norm": 49.727191392795724, + "learning_rate": 4.55464015200256e-06, + "loss": 2.946, + "step": 8617 + }, + { + "epoch": 1.727054108216433, + "grad_norm": 27.421531165031055, + "learning_rate": 4.553478913683437e-06, + "loss": 2.2801, + "step": 8618 + }, + { + "epoch": 1.727254509018036, + "grad_norm": 22.621164548639232, + "learning_rate": 4.552317699642325e-06, + "loss": 2.4447, + "step": 8619 + }, + { + "epoch": 1.7274549098196392, + "grad_norm": 30.196909486647044, + "learning_rate": 4.551156509942359e-06, + "loss": 2.6022, + "step": 8620 + }, + { + "epoch": 1.7276553106212424, + "grad_norm": 26.95629769518859, + "learning_rate": 4.5499953446466755e-06, + "loss": 2.954, + "step": 8621 + }, + { + "epoch": 1.7278557114228457, + "grad_norm": 56.858667720852196, + "learning_rate": 4.54883420381841e-06, + "loss": 2.3485, + "step": 8622 + }, + { + "epoch": 1.7280561122244489, + "grad_norm": 27.96415487800641, + "learning_rate": 4.547673087520696e-06, + "loss": 3.1332, + "step": 8623 + }, + { + "epoch": 1.728256513026052, + "grad_norm": 23.755681807043633, + "learning_rate": 4.546511995816662e-06, + "loss": 2.5938, + "step": 8624 + }, + { + "epoch": 1.7284569138276553, + "grad_norm": 22.499846555212642, + "learning_rate": 4.545350928769442e-06, + "loss": 2.7715, + "step": 8625 + }, + { + "epoch": 1.7286573146292585, + "grad_norm": 27.98216276585864, + "learning_rate": 4.544189886442163e-06, + "loss": 2.8591, + "step": 8626 + }, + { + "epoch": 1.7288577154308618, + "grad_norm": 29.99416422446775, + "learning_rate": 4.54302886889795e-06, + "loss": 2.7919, + "step": 8627 + }, + { + "epoch": 1.729058116232465, + "grad_norm": 23.510008856145163, + "learning_rate": 4.541867876199935e-06, + "loss": 2.0624, + "step": 8628 + }, + { + "epoch": 1.7292585170340682, + "grad_norm": 42.79813448400034, + "learning_rate": 4.540706908411238e-06, + "loss": 2.8981, + "step": 8629 + }, + { + "epoch": 
1.7294589178356712, + "grad_norm": 26.339501225081353, + "learning_rate": 4.5395459655949865e-06, + "loss": 2.956, + "step": 8630 + }, + { + "epoch": 1.7296593186372746, + "grad_norm": 27.990903560966498, + "learning_rate": 4.538385047814298e-06, + "loss": 2.9641, + "step": 8631 + }, + { + "epoch": 1.7298597194388776, + "grad_norm": 26.95557433076279, + "learning_rate": 4.537224155132297e-06, + "loss": 2.3796, + "step": 8632 + }, + { + "epoch": 1.730060120240481, + "grad_norm": 17.4235659208698, + "learning_rate": 4.536063287612103e-06, + "loss": 2.5838, + "step": 8633 + }, + { + "epoch": 1.730260521042084, + "grad_norm": 29.6116573515023, + "learning_rate": 4.534902445316831e-06, + "loss": 3.4936, + "step": 8634 + }, + { + "epoch": 1.7304609218436875, + "grad_norm": 29.415676781074346, + "learning_rate": 4.533741628309599e-06, + "loss": 2.729, + "step": 8635 + }, + { + "epoch": 1.7306613226452905, + "grad_norm": 20.58627986461767, + "learning_rate": 4.532580836653526e-06, + "loss": 2.5636, + "step": 8636 + }, + { + "epoch": 1.730861723446894, + "grad_norm": 28.736607286481263, + "learning_rate": 4.531420070411722e-06, + "loss": 3.3689, + "step": 8637 + }, + { + "epoch": 1.731062124248497, + "grad_norm": 29.99273409928022, + "learning_rate": 4.530259329647297e-06, + "loss": 2.8544, + "step": 8638 + }, + { + "epoch": 1.7312625250501004, + "grad_norm": 53.27607450079156, + "learning_rate": 4.52909861442337e-06, + "loss": 3.0678, + "step": 8639 + }, + { + "epoch": 1.7314629258517034, + "grad_norm": 24.371345155068322, + "learning_rate": 4.527937924803044e-06, + "loss": 3.0002, + "step": 8640 + }, + { + "epoch": 1.7316633266533066, + "grad_norm": 26.378903249781068, + "learning_rate": 4.526777260849431e-06, + "loss": 2.3345, + "step": 8641 + }, + { + "epoch": 1.7318637274549098, + "grad_norm": 25.853256369977352, + "learning_rate": 4.525616622625635e-06, + "loss": 2.4555, + "step": 8642 + }, + { + "epoch": 1.732064128256513, + "grad_norm": 26.880334657811968, + "learning_rate": 4.524456010194767e-06, + "loss": 2.7098, + "step": 8643 + }, + { + "epoch": 1.7322645290581162, + "grad_norm": 25.03945814799318, + "learning_rate": 4.5232954236199235e-06, + "loss": 2.396, + "step": 8644 + }, + { + "epoch": 1.7324649298597194, + "grad_norm": 20.797265435005006, + "learning_rate": 4.522134862964214e-06, + "loss": 2.2102, + "step": 8645 + }, + { + "epoch": 1.7326653306613227, + "grad_norm": 31.17947913365408, + "learning_rate": 4.520974328290738e-06, + "loss": 3.3245, + "step": 8646 + }, + { + "epoch": 1.7328657314629259, + "grad_norm": 28.645046125946614, + "learning_rate": 4.519813819662594e-06, + "loss": 2.2453, + "step": 8647 + }, + { + "epoch": 1.733066132264529, + "grad_norm": 24.529888195265034, + "learning_rate": 4.51865333714288e-06, + "loss": 3.1057, + "step": 8648 + }, + { + "epoch": 1.7332665330661323, + "grad_norm": 26.85845128057761, + "learning_rate": 4.5174928807946964e-06, + "loss": 2.7416, + "step": 8649 + }, + { + "epoch": 1.7334669338677355, + "grad_norm": 29.772821793430015, + "learning_rate": 4.516332450681137e-06, + "loss": 2.4997, + "step": 8650 + }, + { + "epoch": 1.7336673346693385, + "grad_norm": 28.082470578739578, + "learning_rate": 4.515172046865295e-06, + "loss": 3.0161, + "step": 8651 + }, + { + "epoch": 1.733867735470942, + "grad_norm": 35.818540497124324, + "learning_rate": 4.514011669410266e-06, + "loss": 2.4361, + "step": 8652 + }, + { + "epoch": 1.734068136272545, + "grad_norm": 28.365786646590802, + "learning_rate": 4.512851318379139e-06, + "loss": 2.8261, + "step": 
8653 + }, + { + "epoch": 1.7342685370741484, + "grad_norm": 59.10024579360017, + "learning_rate": 4.511690993835005e-06, + "loss": 2.9698, + "step": 8654 + }, + { + "epoch": 1.7344689378757514, + "grad_norm": 31.90296371911323, + "learning_rate": 4.5105306958409524e-06, + "loss": 3.1084, + "step": 8655 + }, + { + "epoch": 1.7346693386773548, + "grad_norm": 24.162752048662114, + "learning_rate": 4.509370424460069e-06, + "loss": 3.1316, + "step": 8656 + }, + { + "epoch": 1.7348697394789578, + "grad_norm": 31.95579725664462, + "learning_rate": 4.508210179755441e-06, + "loss": 2.5433, + "step": 8657 + }, + { + "epoch": 1.7350701402805613, + "grad_norm": 22.994935324445176, + "learning_rate": 4.507049961790148e-06, + "loss": 2.8609, + "step": 8658 + }, + { + "epoch": 1.7352705410821643, + "grad_norm": 24.562910121642112, + "learning_rate": 4.505889770627279e-06, + "loss": 2.5666, + "step": 8659 + }, + { + "epoch": 1.7354709418837675, + "grad_norm": 22.788220429184722, + "learning_rate": 4.504729606329912e-06, + "loss": 2.5127, + "step": 8660 + }, + { + "epoch": 1.7356713426853707, + "grad_norm": 23.48462957113349, + "learning_rate": 4.503569468961125e-06, + "loss": 2.7515, + "step": 8661 + }, + { + "epoch": 1.735871743486974, + "grad_norm": 23.040106696580114, + "learning_rate": 4.502409358584003e-06, + "loss": 2.7179, + "step": 8662 + }, + { + "epoch": 1.7360721442885771, + "grad_norm": 29.314139040318967, + "learning_rate": 4.501249275261617e-06, + "loss": 2.7891, + "step": 8663 + }, + { + "epoch": 1.7362725450901804, + "grad_norm": 22.270136950852866, + "learning_rate": 4.500089219057044e-06, + "loss": 2.2499, + "step": 8664 + }, + { + "epoch": 1.7364729458917836, + "grad_norm": 24.45080139878975, + "learning_rate": 4.498929190033357e-06, + "loss": 3.0177, + "step": 8665 + }, + { + "epoch": 1.7366733466933868, + "grad_norm": 34.68716785278154, + "learning_rate": 4.49776918825363e-06, + "loss": 2.6061, + "step": 8666 + }, + { + "epoch": 1.73687374749499, + "grad_norm": 18.99680783692501, + "learning_rate": 4.4966092137809344e-06, + "loss": 3.365, + "step": 8667 + }, + { + "epoch": 1.7370741482965932, + "grad_norm": 25.423580543790315, + "learning_rate": 4.495449266678337e-06, + "loss": 2.8007, + "step": 8668 + }, + { + "epoch": 1.7372745490981965, + "grad_norm": 23.11302573622857, + "learning_rate": 4.49428934700891e-06, + "loss": 2.3609, + "step": 8669 + }, + { + "epoch": 1.7374749498997994, + "grad_norm": 24.699955013506134, + "learning_rate": 4.493129454835718e-06, + "loss": 2.0853, + "step": 8670 + }, + { + "epoch": 1.7376753507014029, + "grad_norm": 28.904214379804237, + "learning_rate": 4.491969590221823e-06, + "loss": 2.3988, + "step": 8671 + }, + { + "epoch": 1.7378757515030059, + "grad_norm": 29.794510376856863, + "learning_rate": 4.490809753230294e-06, + "loss": 2.4985, + "step": 8672 + }, + { + "epoch": 1.7380761523046093, + "grad_norm": 32.768789996289605, + "learning_rate": 4.489649943924189e-06, + "loss": 2.6681, + "step": 8673 + }, + { + "epoch": 1.7382765531062123, + "grad_norm": 23.74861304627823, + "learning_rate": 4.488490162366571e-06, + "loss": 3.0496, + "step": 8674 + }, + { + "epoch": 1.7384769539078158, + "grad_norm": 24.325675012151194, + "learning_rate": 4.487330408620496e-06, + "loss": 2.7424, + "step": 8675 + }, + { + "epoch": 1.7386773547094188, + "grad_norm": 28.3330764703339, + "learning_rate": 4.486170682749025e-06, + "loss": 2.8654, + "step": 8676 + }, + { + "epoch": 1.7388777555110222, + "grad_norm": 35.36460485887875, + "learning_rate": 
4.485010984815214e-06, + "loss": 3.4935, + "step": 8677 + }, + { + "epoch": 1.7390781563126252, + "grad_norm": 24.19388111748627, + "learning_rate": 4.4838513148821126e-06, + "loss": 3.2929, + "step": 8678 + }, + { + "epoch": 1.7392785571142284, + "grad_norm": 22.20480538445435, + "learning_rate": 4.4826916730127775e-06, + "loss": 2.6578, + "step": 8679 + }, + { + "epoch": 1.7394789579158316, + "grad_norm": 19.584084959379016, + "learning_rate": 4.4815320592702625e-06, + "loss": 2.5408, + "step": 8680 + }, + { + "epoch": 1.7396793587174348, + "grad_norm": 34.25937310372276, + "learning_rate": 4.480372473717614e-06, + "loss": 3.5393, + "step": 8681 + }, + { + "epoch": 1.739879759519038, + "grad_norm": 29.355824771232, + "learning_rate": 4.479212916417878e-06, + "loss": 2.6126, + "step": 8682 + }, + { + "epoch": 1.7400801603206413, + "grad_norm": 28.09652190182994, + "learning_rate": 4.478053387434108e-06, + "loss": 2.408, + "step": 8683 + }, + { + "epoch": 1.7402805611222445, + "grad_norm": 20.029584333082315, + "learning_rate": 4.476893886829344e-06, + "loss": 2.6788, + "step": 8684 + }, + { + "epoch": 1.7404809619238477, + "grad_norm": 37.49279473632264, + "learning_rate": 4.475734414666632e-06, + "loss": 3.2106, + "step": 8685 + }, + { + "epoch": 1.740681362725451, + "grad_norm": 31.72458967882687, + "learning_rate": 4.474574971009013e-06, + "loss": 2.9786, + "step": 8686 + }, + { + "epoch": 1.7408817635270541, + "grad_norm": 30.646782057146886, + "learning_rate": 4.4734155559195305e-06, + "loss": 2.7154, + "step": 8687 + }, + { + "epoch": 1.7410821643286574, + "grad_norm": 41.33193056777993, + "learning_rate": 4.472256169461219e-06, + "loss": 3.6864, + "step": 8688 + }, + { + "epoch": 1.7412825651302604, + "grad_norm": 23.475471370866824, + "learning_rate": 4.471096811697121e-06, + "loss": 2.6409, + "step": 8689 + }, + { + "epoch": 1.7414829659318638, + "grad_norm": 26.169434439439094, + "learning_rate": 4.46993748269027e-06, + "loss": 2.7267, + "step": 8690 + }, + { + "epoch": 1.7416833667334668, + "grad_norm": 28.32057860154129, + "learning_rate": 4.4687781825036985e-06, + "loss": 2.6911, + "step": 8691 + }, + { + "epoch": 1.7418837675350702, + "grad_norm": 18.063376294944838, + "learning_rate": 4.467618911200441e-06, + "loss": 2.6806, + "step": 8692 + }, + { + "epoch": 1.7420841683366732, + "grad_norm": 17.94947159026345, + "learning_rate": 4.4664596688435315e-06, + "loss": 3.0144, + "step": 8693 + }, + { + "epoch": 1.7422845691382767, + "grad_norm": 15.904902158936753, + "learning_rate": 4.465300455495997e-06, + "loss": 2.0966, + "step": 8694 + }, + { + "epoch": 1.7424849699398797, + "grad_norm": 16.52750474621095, + "learning_rate": 4.4641412712208635e-06, + "loss": 3.0143, + "step": 8695 + }, + { + "epoch": 1.742685370741483, + "grad_norm": 25.907595913869418, + "learning_rate": 4.462982116081163e-06, + "loss": 2.5921, + "step": 8696 + }, + { + "epoch": 1.742885771543086, + "grad_norm": 24.238274757227384, + "learning_rate": 4.461822990139916e-06, + "loss": 2.6882, + "step": 8697 + }, + { + "epoch": 1.7430861723446895, + "grad_norm": 22.422532645426887, + "learning_rate": 4.460663893460147e-06, + "loss": 2.8491, + "step": 8698 + }, + { + "epoch": 1.7432865731462925, + "grad_norm": 40.19019062245227, + "learning_rate": 4.45950482610488e-06, + "loss": 2.6044, + "step": 8699 + }, + { + "epoch": 1.7434869739478958, + "grad_norm": 26.82688633002994, + "learning_rate": 4.458345788137134e-06, + "loss": 2.5308, + "step": 8700 + }, + { + "epoch": 1.743687374749499, + "grad_norm": 
23.426386522162847, + "learning_rate": 4.457186779619926e-06, + "loss": 2.2073, + "step": 8701 + }, + { + "epoch": 1.7438877755511022, + "grad_norm": 21.57041152405374, + "learning_rate": 4.456027800616273e-06, + "loss": 2.6388, + "step": 8702 + }, + { + "epoch": 1.7440881763527054, + "grad_norm": 23.650109299100833, + "learning_rate": 4.454868851189192e-06, + "loss": 3.1337, + "step": 8703 + }, + { + "epoch": 1.7442885771543086, + "grad_norm": 29.773351596636484, + "learning_rate": 4.453709931401697e-06, + "loss": 2.7824, + "step": 8704 + }, + { + "epoch": 1.7444889779559118, + "grad_norm": 38.02717570498865, + "learning_rate": 4.452551041316797e-06, + "loss": 2.8409, + "step": 8705 + }, + { + "epoch": 1.744689378757515, + "grad_norm": 26.10906114608674, + "learning_rate": 4.451392180997508e-06, + "loss": 2.8961, + "step": 8706 + }, + { + "epoch": 1.7448897795591183, + "grad_norm": 23.137765086907645, + "learning_rate": 4.450233350506836e-06, + "loss": 2.6086, + "step": 8707 + }, + { + "epoch": 1.7450901803607213, + "grad_norm": 25.266720575592178, + "learning_rate": 4.449074549907787e-06, + "loss": 2.6699, + "step": 8708 + }, + { + "epoch": 1.7452905811623247, + "grad_norm": 23.808531001386307, + "learning_rate": 4.447915779263368e-06, + "loss": 3.1662, + "step": 8709 + }, + { + "epoch": 1.7454909819639277, + "grad_norm": 26.284024146528044, + "learning_rate": 4.446757038636582e-06, + "loss": 2.522, + "step": 8710 + }, + { + "epoch": 1.7456913827655312, + "grad_norm": 25.729172317355864, + "learning_rate": 4.4455983280904344e-06, + "loss": 2.9781, + "step": 8711 + }, + { + "epoch": 1.7458917835671341, + "grad_norm": 53.77889780082761, + "learning_rate": 4.444439647687922e-06, + "loss": 2.7058, + "step": 8712 + }, + { + "epoch": 1.7460921843687376, + "grad_norm": 24.316287773802877, + "learning_rate": 4.443280997492048e-06, + "loss": 2.15, + "step": 8713 + }, + { + "epoch": 1.7462925851703406, + "grad_norm": 20.991175559224253, + "learning_rate": 4.442122377565808e-06, + "loss": 2.7747, + "step": 8714 + }, + { + "epoch": 1.746492985971944, + "grad_norm": 32.72772316019425, + "learning_rate": 4.440963787972196e-06, + "loss": 2.8365, + "step": 8715 + }, + { + "epoch": 1.746693386773547, + "grad_norm": 33.49664942725685, + "learning_rate": 4.439805228774209e-06, + "loss": 2.7335, + "step": 8716 + }, + { + "epoch": 1.7468937875751505, + "grad_norm": 31.08947554740688, + "learning_rate": 4.43864670003484e-06, + "loss": 2.6984, + "step": 8717 + }, + { + "epoch": 1.7470941883767535, + "grad_norm": 16.979387244913816, + "learning_rate": 4.437488201817078e-06, + "loss": 2.4306, + "step": 8718 + }, + { + "epoch": 1.7472945891783567, + "grad_norm": 23.829182958771035, + "learning_rate": 4.436329734183911e-06, + "loss": 2.7545, + "step": 8719 + }, + { + "epoch": 1.74749498997996, + "grad_norm": 33.51605489694879, + "learning_rate": 4.435171297198329e-06, + "loss": 3.2265, + "step": 8720 + }, + { + "epoch": 1.747695390781563, + "grad_norm": 49.76265660453067, + "learning_rate": 4.434012890923319e-06, + "loss": 3.2297, + "step": 8721 + }, + { + "epoch": 1.7478957915831663, + "grad_norm": 35.65587318685999, + "learning_rate": 4.432854515421862e-06, + "loss": 2.5039, + "step": 8722 + }, + { + "epoch": 1.7480961923847695, + "grad_norm": 22.724226965407052, + "learning_rate": 4.431696170756942e-06, + "loss": 2.5374, + "step": 8723 + }, + { + "epoch": 1.7482965931863728, + "grad_norm": 20.12799112995673, + "learning_rate": 4.430537856991541e-06, + "loss": 2.7328, + "step": 8724 + }, + { + "epoch": 
1.748496993987976, + "grad_norm": 26.602436538787384, + "learning_rate": 4.429379574188636e-06, + "loss": 2.9981, + "step": 8725 + }, + { + "epoch": 1.7486973947895792, + "grad_norm": 31.469668070735526, + "learning_rate": 4.428221322411208e-06, + "loss": 2.3104, + "step": 8726 + }, + { + "epoch": 1.7488977955911824, + "grad_norm": 23.62848115727637, + "learning_rate": 4.4270631017222305e-06, + "loss": 2.3701, + "step": 8727 + }, + { + "epoch": 1.7490981963927856, + "grad_norm": 28.751428815483685, + "learning_rate": 4.425904912184677e-06, + "loss": 2.7588, + "step": 8728 + }, + { + "epoch": 1.7492985971943886, + "grad_norm": 27.661003766328452, + "learning_rate": 4.42474675386152e-06, + "loss": 2.5666, + "step": 8729 + }, + { + "epoch": 1.749498997995992, + "grad_norm": 28.819105502242788, + "learning_rate": 4.423588626815732e-06, + "loss": 2.545, + "step": 8730 + }, + { + "epoch": 1.749699398797595, + "grad_norm": 22.552891217435292, + "learning_rate": 4.422430531110282e-06, + "loss": 2.9224, + "step": 8731 + }, + { + "epoch": 1.7498997995991985, + "grad_norm": 27.708867714094996, + "learning_rate": 4.421272466808135e-06, + "loss": 2.6459, + "step": 8732 + }, + { + "epoch": 1.7501002004008015, + "grad_norm": 21.11615162546871, + "learning_rate": 4.42011443397226e-06, + "loss": 2.8866, + "step": 8733 + }, + { + "epoch": 1.750300601202405, + "grad_norm": 32.67929135872099, + "learning_rate": 4.418956432665618e-06, + "loss": 2.7935, + "step": 8734 + }, + { + "epoch": 1.750501002004008, + "grad_norm": 26.239979941910526, + "learning_rate": 4.417798462951174e-06, + "loss": 2.5504, + "step": 8735 + }, + { + "epoch": 1.7507014028056114, + "grad_norm": 34.43835407026716, + "learning_rate": 4.4166405248918844e-06, + "loss": 2.9255, + "step": 8736 + }, + { + "epoch": 1.7509018036072144, + "grad_norm": 29.533514075636077, + "learning_rate": 4.415482618550712e-06, + "loss": 2.2635, + "step": 8737 + }, + { + "epoch": 1.7511022044088176, + "grad_norm": 23.825160773003898, + "learning_rate": 4.414324743990613e-06, + "loss": 2.902, + "step": 8738 + }, + { + "epoch": 1.7513026052104208, + "grad_norm": 20.281389475567636, + "learning_rate": 4.413166901274541e-06, + "loss": 2.8091, + "step": 8739 + }, + { + "epoch": 1.751503006012024, + "grad_norm": 21.67417300784348, + "learning_rate": 4.412009090465452e-06, + "loss": 2.5826, + "step": 8740 + }, + { + "epoch": 1.7517034068136272, + "grad_norm": 26.317985299331404, + "learning_rate": 4.410851311626297e-06, + "loss": 2.9029, + "step": 8741 + }, + { + "epoch": 1.7519038076152305, + "grad_norm": 21.785608458496327, + "learning_rate": 4.409693564820023e-06, + "loss": 2.5188, + "step": 8742 + }, + { + "epoch": 1.7521042084168337, + "grad_norm": 26.65148690325315, + "learning_rate": 4.408535850109584e-06, + "loss": 2.8545, + "step": 8743 + }, + { + "epoch": 1.752304609218437, + "grad_norm": 29.277774989166403, + "learning_rate": 4.407378167557923e-06, + "loss": 2.9956, + "step": 8744 + }, + { + "epoch": 1.75250501002004, + "grad_norm": 21.07853824828076, + "learning_rate": 4.4062205172279874e-06, + "loss": 2.5429, + "step": 8745 + }, + { + "epoch": 1.7527054108216433, + "grad_norm": 21.45507770015498, + "learning_rate": 4.405062899182716e-06, + "loss": 3.0342, + "step": 8746 + }, + { + "epoch": 1.7529058116232465, + "grad_norm": 23.167427842883995, + "learning_rate": 4.403905313485054e-06, + "loss": 3.0508, + "step": 8747 + }, + { + "epoch": 1.7531062124248495, + "grad_norm": 30.473199589171145, + "learning_rate": 4.402747760197941e-06, + "loss": 2.6084, + 
"step": 8748 + }, + { + "epoch": 1.753306613226453, + "grad_norm": 26.816867816064626, + "learning_rate": 4.401590239384313e-06, + "loss": 3.0731, + "step": 8749 + }, + { + "epoch": 1.753507014028056, + "grad_norm": 19.696326832307143, + "learning_rate": 4.4004327511071075e-06, + "loss": 2.382, + "step": 8750 + }, + { + "epoch": 1.7537074148296594, + "grad_norm": 25.110938178529018, + "learning_rate": 4.399275295429259e-06, + "loss": 2.5097, + "step": 8751 + }, + { + "epoch": 1.7539078156312624, + "grad_norm": 16.591063704223128, + "learning_rate": 4.398117872413698e-06, + "loss": 2.5022, + "step": 8752 + }, + { + "epoch": 1.7541082164328659, + "grad_norm": 27.25560832133457, + "learning_rate": 4.396960482123357e-06, + "loss": 2.7647, + "step": 8753 + }, + { + "epoch": 1.7543086172344688, + "grad_norm": 23.30556560660821, + "learning_rate": 4.395803124621165e-06, + "loss": 2.8414, + "step": 8754 + }, + { + "epoch": 1.7545090180360723, + "grad_norm": 17.410966092954173, + "learning_rate": 4.39464579997005e-06, + "loss": 2.5264, + "step": 8755 + }, + { + "epoch": 1.7547094188376753, + "grad_norm": 47.37244207058936, + "learning_rate": 4.393488508232935e-06, + "loss": 2.9007, + "step": 8756 + }, + { + "epoch": 1.7549098196392787, + "grad_norm": 17.0119163142203, + "learning_rate": 4.392331249472746e-06, + "loss": 2.581, + "step": 8757 + }, + { + "epoch": 1.7551102204408817, + "grad_norm": 56.86960267732401, + "learning_rate": 4.391174023752404e-06, + "loss": 2.5121, + "step": 8758 + }, + { + "epoch": 1.755310621242485, + "grad_norm": 30.638090777081285, + "learning_rate": 4.390016831134827e-06, + "loss": 2.6879, + "step": 8759 + }, + { + "epoch": 1.7555110220440882, + "grad_norm": 20.046245750475958, + "learning_rate": 4.388859671682937e-06, + "loss": 2.7333, + "step": 8760 + }, + { + "epoch": 1.7557114228456914, + "grad_norm": 18.290734636549455, + "learning_rate": 4.38770254545965e-06, + "loss": 2.7465, + "step": 8761 + }, + { + "epoch": 1.7559118236472946, + "grad_norm": 23.348338531555992, + "learning_rate": 4.386545452527878e-06, + "loss": 2.2611, + "step": 8762 + }, + { + "epoch": 1.7561122244488978, + "grad_norm": 27.190462281725914, + "learning_rate": 4.385388392950534e-06, + "loss": 2.867, + "step": 8763 + }, + { + "epoch": 1.756312625250501, + "grad_norm": 30.49848018375652, + "learning_rate": 4.384231366790533e-06, + "loss": 2.9635, + "step": 8764 + }, + { + "epoch": 1.7565130260521042, + "grad_norm": 34.765477497080084, + "learning_rate": 4.38307437411078e-06, + "loss": 2.9598, + "step": 8765 + }, + { + "epoch": 1.7567134268537075, + "grad_norm": 45.216529079976944, + "learning_rate": 4.381917414974184e-06, + "loss": 2.7454, + "step": 8766 + }, + { + "epoch": 1.7569138276553105, + "grad_norm": 32.542338869023006, + "learning_rate": 4.38076048944365e-06, + "loss": 2.5654, + "step": 8767 + }, + { + "epoch": 1.757114228456914, + "grad_norm": 23.339010345090937, + "learning_rate": 4.379603597582084e-06, + "loss": 2.2185, + "step": 8768 + }, + { + "epoch": 1.757314629258517, + "grad_norm": 29.777981324072453, + "learning_rate": 4.378446739452384e-06, + "loss": 2.6083, + "step": 8769 + }, + { + "epoch": 1.7575150300601203, + "grad_norm": 26.422122484935382, + "learning_rate": 4.377289915117454e-06, + "loss": 3.0384, + "step": 8770 + }, + { + "epoch": 1.7577154308617233, + "grad_norm": 36.00017159615898, + "learning_rate": 4.3761331246401915e-06, + "loss": 2.5713, + "step": 8771 + }, + { + "epoch": 1.7579158316633268, + "grad_norm": 24.70197335178172, + "learning_rate": 
4.37497636808349e-06, + "loss": 2.8373, + "step": 8772 + }, + { + "epoch": 1.7581162324649298, + "grad_norm": 24.367500262129397, + "learning_rate": 4.3738196455102446e-06, + "loss": 2.5018, + "step": 8773 + }, + { + "epoch": 1.7583166332665332, + "grad_norm": 32.54504538997369, + "learning_rate": 4.372662956983352e-06, + "loss": 3.0118, + "step": 8774 + }, + { + "epoch": 1.7585170340681362, + "grad_norm": 28.689238874029716, + "learning_rate": 4.371506302565701e-06, + "loss": 2.482, + "step": 8775 + }, + { + "epoch": 1.7587174348697396, + "grad_norm": 28.076116324503463, + "learning_rate": 4.3703496823201775e-06, + "loss": 2.5364, + "step": 8776 + }, + { + "epoch": 1.7589178356713426, + "grad_norm": 29.579742131635566, + "learning_rate": 4.369193096309673e-06, + "loss": 2.8884, + "step": 8777 + }, + { + "epoch": 1.7591182364729459, + "grad_norm": 39.976762021708424, + "learning_rate": 4.36803654459707e-06, + "loss": 2.5587, + "step": 8778 + }, + { + "epoch": 1.759318637274549, + "grad_norm": 20.98080465278206, + "learning_rate": 4.366880027245255e-06, + "loss": 2.5217, + "step": 8779 + }, + { + "epoch": 1.7595190380761523, + "grad_norm": 20.7736376766329, + "learning_rate": 4.365723544317104e-06, + "loss": 2.9868, + "step": 8780 + }, + { + "epoch": 1.7597194388777555, + "grad_norm": 34.17297374312149, + "learning_rate": 4.364567095875503e-06, + "loss": 2.9901, + "step": 8781 + }, + { + "epoch": 1.7599198396793587, + "grad_norm": 28.4454364944318, + "learning_rate": 4.363410681983327e-06, + "loss": 2.7027, + "step": 8782 + }, + { + "epoch": 1.760120240480962, + "grad_norm": 17.451896178311674, + "learning_rate": 4.36225430270345e-06, + "loss": 2.6501, + "step": 8783 + }, + { + "epoch": 1.7603206412825652, + "grad_norm": 31.04822144878554, + "learning_rate": 4.36109795809875e-06, + "loss": 2.4051, + "step": 8784 + }, + { + "epoch": 1.7605210420841684, + "grad_norm": 34.200266564195836, + "learning_rate": 4.359941648232095e-06, + "loss": 3.2137, + "step": 8785 + }, + { + "epoch": 1.7607214428857716, + "grad_norm": 29.082169066321168, + "learning_rate": 4.358785373166357e-06, + "loss": 2.9128, + "step": 8786 + }, + { + "epoch": 1.7609218436873748, + "grad_norm": 23.91576015298978, + "learning_rate": 4.357629132964407e-06, + "loss": 2.5945, + "step": 8787 + }, + { + "epoch": 1.7611222444889778, + "grad_norm": 20.408519569779553, + "learning_rate": 4.356472927689109e-06, + "loss": 2.493, + "step": 8788 + }, + { + "epoch": 1.7613226452905812, + "grad_norm": 25.393801625114257, + "learning_rate": 4.355316757403329e-06, + "loss": 2.6251, + "step": 8789 + }, + { + "epoch": 1.7615230460921842, + "grad_norm": 25.0019022048971, + "learning_rate": 4.354160622169925e-06, + "loss": 2.3712, + "step": 8790 + }, + { + "epoch": 1.7617234468937877, + "grad_norm": 27.632661879977437, + "learning_rate": 4.353004522051764e-06, + "loss": 3.3253, + "step": 8791 + }, + { + "epoch": 1.7619238476953907, + "grad_norm": 26.172083924074993, + "learning_rate": 4.351848457111701e-06, + "loss": 2.9275, + "step": 8792 + }, + { + "epoch": 1.7621242484969941, + "grad_norm": 43.930453098390814, + "learning_rate": 4.350692427412594e-06, + "loss": 2.7183, + "step": 8793 + }, + { + "epoch": 1.7623246492985971, + "grad_norm": 27.49869050240691, + "learning_rate": 4.3495364330173e-06, + "loss": 2.8072, + "step": 8794 + }, + { + "epoch": 1.7625250501002006, + "grad_norm": 29.90811763436739, + "learning_rate": 4.348380473988669e-06, + "loss": 3.1562, + "step": 8795 + }, + { + "epoch": 1.7627254509018035, + "grad_norm": 
22.682869282733495, + "learning_rate": 4.347224550389554e-06, + "loss": 2.161, + "step": 8796 + }, + { + "epoch": 1.7629258517034068, + "grad_norm": 21.632792574920266, + "learning_rate": 4.346068662282802e-06, + "loss": 2.2893, + "step": 8797 + }, + { + "epoch": 1.76312625250501, + "grad_norm": 37.37271121837932, + "learning_rate": 4.344912809731264e-06, + "loss": 2.8925, + "step": 8798 + }, + { + "epoch": 1.7633266533066132, + "grad_norm": 28.83899220462015, + "learning_rate": 4.3437569927977836e-06, + "loss": 2.8014, + "step": 8799 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 20.22373067002738, + "learning_rate": 4.342601211545202e-06, + "loss": 2.7313, + "step": 8800 + }, + { + "epoch": 1.7637274549098196, + "grad_norm": 23.976083823056545, + "learning_rate": 4.341445466036366e-06, + "loss": 2.6361, + "step": 8801 + }, + { + "epoch": 1.7639278557114229, + "grad_norm": 51.10108481440771, + "learning_rate": 4.340289756334112e-06, + "loss": 2.9936, + "step": 8802 + }, + { + "epoch": 1.764128256513026, + "grad_norm": 25.599110247199278, + "learning_rate": 4.339134082501274e-06, + "loss": 2.766, + "step": 8803 + }, + { + "epoch": 1.7643286573146293, + "grad_norm": 31.67669846986613, + "learning_rate": 4.337978444600694e-06, + "loss": 3.6655, + "step": 8804 + }, + { + "epoch": 1.7645290581162325, + "grad_norm": 22.817579451030145, + "learning_rate": 4.336822842695203e-06, + "loss": 3.245, + "step": 8805 + }, + { + "epoch": 1.7647294589178357, + "grad_norm": 21.920883538616945, + "learning_rate": 4.3356672768476345e-06, + "loss": 2.1681, + "step": 8806 + }, + { + "epoch": 1.7649298597194387, + "grad_norm": 24.969556800533073, + "learning_rate": 4.334511747120814e-06, + "loss": 3.093, + "step": 8807 + }, + { + "epoch": 1.7651302605210422, + "grad_norm": 24.528333197499567, + "learning_rate": 4.333356253577574e-06, + "loss": 2.5763, + "step": 8808 + }, + { + "epoch": 1.7653306613226452, + "grad_norm": 26.759398963727484, + "learning_rate": 4.332200796280739e-06, + "loss": 2.8365, + "step": 8809 + }, + { + "epoch": 1.7655310621242486, + "grad_norm": 22.116116408307583, + "learning_rate": 4.331045375293131e-06, + "loss": 2.673, + "step": 8810 + }, + { + "epoch": 1.7657314629258516, + "grad_norm": 20.436780571713037, + "learning_rate": 4.329889990677574e-06, + "loss": 3.2721, + "step": 8811 + }, + { + "epoch": 1.765931863727455, + "grad_norm": 20.72225309069503, + "learning_rate": 4.328734642496888e-06, + "loss": 2.1396, + "step": 8812 + }, + { + "epoch": 1.766132264529058, + "grad_norm": 20.879394301761693, + "learning_rate": 4.327579330813888e-06, + "loss": 2.1236, + "step": 8813 + }, + { + "epoch": 1.7663326653306615, + "grad_norm": 24.89220353299717, + "learning_rate": 4.326424055691395e-06, + "loss": 2.5386, + "step": 8814 + }, + { + "epoch": 1.7665330661322645, + "grad_norm": 27.492099394621246, + "learning_rate": 4.32526881719222e-06, + "loss": 2.6842, + "step": 8815 + }, + { + "epoch": 1.7667334669338677, + "grad_norm": 19.610467314723582, + "learning_rate": 4.324113615379175e-06, + "loss": 2.8933, + "step": 8816 + }, + { + "epoch": 1.766933867735471, + "grad_norm": 19.450222668573712, + "learning_rate": 4.322958450315069e-06, + "loss": 2.6746, + "step": 8817 + }, + { + "epoch": 1.7671342685370741, + "grad_norm": 23.696440375475188, + "learning_rate": 4.321803322062713e-06, + "loss": 2.722, + "step": 8818 + }, + { + "epoch": 1.7673346693386773, + "grad_norm": 18.643504746676733, + "learning_rate": 4.320648230684912e-06, + "loss": 2.4389, + "step": 8819 + }, + { + "epoch": 
1.7675350701402806, + "grad_norm": 31.654887700497234, + "learning_rate": 4.319493176244468e-06, + "loss": 2.9274, + "step": 8820 + }, + { + "epoch": 1.7677354709418838, + "grad_norm": 42.69440239617932, + "learning_rate": 4.318338158804186e-06, + "loss": 3.0733, + "step": 8821 + }, + { + "epoch": 1.767935871743487, + "grad_norm": 22.33923137690326, + "learning_rate": 4.317183178426864e-06, + "loss": 2.3296, + "step": 8822 + }, + { + "epoch": 1.7681362725450902, + "grad_norm": 23.886863068129873, + "learning_rate": 4.316028235175301e-06, + "loss": 3.1625, + "step": 8823 + }, + { + "epoch": 1.7683366733466934, + "grad_norm": 33.430983079880654, + "learning_rate": 4.31487332911229e-06, + "loss": 2.8122, + "step": 8824 + }, + { + "epoch": 1.7685370741482966, + "grad_norm": 23.807625225899812, + "learning_rate": 4.31371846030063e-06, + "loss": 2.6822, + "step": 8825 + }, + { + "epoch": 1.7687374749498996, + "grad_norm": 27.648810358792453, + "learning_rate": 4.312563628803111e-06, + "loss": 3.0203, + "step": 8826 + }, + { + "epoch": 1.768937875751503, + "grad_norm": 36.2949574946703, + "learning_rate": 4.311408834682519e-06, + "loss": 3.4046, + "step": 8827 + }, + { + "epoch": 1.769138276553106, + "grad_norm": 26.6659676540248, + "learning_rate": 4.310254078001648e-06, + "loss": 2.6597, + "step": 8828 + }, + { + "epoch": 1.7693386773547095, + "grad_norm": 21.04722404740811, + "learning_rate": 4.30909935882328e-06, + "loss": 2.4324, + "step": 8829 + }, + { + "epoch": 1.7695390781563125, + "grad_norm": 27.222334689336268, + "learning_rate": 4.307944677210198e-06, + "loss": 2.6506, + "step": 8830 + }, + { + "epoch": 1.769739478957916, + "grad_norm": 28.03369158248122, + "learning_rate": 4.306790033225188e-06, + "loss": 3.0628, + "step": 8831 + }, + { + "epoch": 1.769939879759519, + "grad_norm": 22.27217253122475, + "learning_rate": 4.305635426931027e-06, + "loss": 2.2081, + "step": 8832 + }, + { + "epoch": 1.7701402805611224, + "grad_norm": 22.841337621125078, + "learning_rate": 4.304480858390492e-06, + "loss": 2.6422, + "step": 8833 + }, + { + "epoch": 1.7703406813627254, + "grad_norm": 24.658029191615075, + "learning_rate": 4.3033263276663585e-06, + "loss": 2.4841, + "step": 8834 + }, + { + "epoch": 1.7705410821643288, + "grad_norm": 17.766760921655113, + "learning_rate": 4.302171834821403e-06, + "loss": 2.7561, + "step": 8835 + }, + { + "epoch": 1.7707414829659318, + "grad_norm": 35.057899130602436, + "learning_rate": 4.301017379918394e-06, + "loss": 3.2417, + "step": 8836 + }, + { + "epoch": 1.770941883767535, + "grad_norm": 24.31680880826916, + "learning_rate": 4.299862963020101e-06, + "loss": 2.2303, + "step": 8837 + }, + { + "epoch": 1.7711422845691382, + "grad_norm": 35.16652319337963, + "learning_rate": 4.298708584189293e-06, + "loss": 2.8038, + "step": 8838 + }, + { + "epoch": 1.7713426853707415, + "grad_norm": 31.095429822894392, + "learning_rate": 4.297554243488735e-06, + "loss": 2.5441, + "step": 8839 + }, + { + "epoch": 1.7715430861723447, + "grad_norm": 31.218638168572244, + "learning_rate": 4.296399940981188e-06, + "loss": 2.4211, + "step": 8840 + }, + { + "epoch": 1.771743486973948, + "grad_norm": 36.95514477409469, + "learning_rate": 4.295245676729416e-06, + "loss": 3.1917, + "step": 8841 + }, + { + "epoch": 1.7719438877755511, + "grad_norm": 22.284149453540675, + "learning_rate": 4.294091450796177e-06, + "loss": 2.6471, + "step": 8842 + }, + { + "epoch": 1.7721442885771543, + "grad_norm": 32.1013919295742, + "learning_rate": 4.292937263244229e-06, + "loss": 3.3362, + "step": 
8843 + }, + { + "epoch": 1.7723446893787576, + "grad_norm": 39.25364395288775, + "learning_rate": 4.291783114136323e-06, + "loss": 3.1373, + "step": 8844 + }, + { + "epoch": 1.7725450901803608, + "grad_norm": 36.12449072804708, + "learning_rate": 4.290629003535217e-06, + "loss": 2.1847, + "step": 8845 + }, + { + "epoch": 1.772745490981964, + "grad_norm": 24.273917989529522, + "learning_rate": 4.289474931503657e-06, + "loss": 2.5661, + "step": 8846 + }, + { + "epoch": 1.772945891783567, + "grad_norm": 29.562610548645356, + "learning_rate": 4.288320898104395e-06, + "loss": 3.166, + "step": 8847 + }, + { + "epoch": 1.7731462925851704, + "grad_norm": 23.83510389129532, + "learning_rate": 4.287166903400176e-06, + "loss": 2.9297, + "step": 8848 + }, + { + "epoch": 1.7733466933867734, + "grad_norm": 19.97390199949089, + "learning_rate": 4.286012947453745e-06, + "loss": 2.4836, + "step": 8849 + }, + { + "epoch": 1.7735470941883769, + "grad_norm": 38.513105234015526, + "learning_rate": 4.284859030327845e-06, + "loss": 3.1308, + "step": 8850 + }, + { + "epoch": 1.7737474949899799, + "grad_norm": 23.558919242559746, + "learning_rate": 4.283705152085213e-06, + "loss": 2.8287, + "step": 8851 + }, + { + "epoch": 1.7739478957915833, + "grad_norm": 27.067299419255267, + "learning_rate": 4.282551312788591e-06, + "loss": 3.1606, + "step": 8852 + }, + { + "epoch": 1.7741482965931863, + "grad_norm": 21.3445535373212, + "learning_rate": 4.281397512500712e-06, + "loss": 2.7546, + "step": 8853 + }, + { + "epoch": 1.7743486973947897, + "grad_norm": 20.05614423875906, + "learning_rate": 4.28024375128431e-06, + "loss": 2.9245, + "step": 8854 + }, + { + "epoch": 1.7745490981963927, + "grad_norm": 21.917736925072216, + "learning_rate": 4.279090029202119e-06, + "loss": 2.6375, + "step": 8855 + }, + { + "epoch": 1.774749498997996, + "grad_norm": 20.40100283751316, + "learning_rate": 4.27793634631687e-06, + "loss": 2.6953, + "step": 8856 + }, + { + "epoch": 1.7749498997995992, + "grad_norm": 31.672495733166443, + "learning_rate": 4.2767827026912835e-06, + "loss": 2.7855, + "step": 8857 + }, + { + "epoch": 1.7751503006012024, + "grad_norm": 33.82648350218782, + "learning_rate": 4.275629098388091e-06, + "loss": 3.1857, + "step": 8858 + }, + { + "epoch": 1.7753507014028056, + "grad_norm": 27.78075775004278, + "learning_rate": 4.274475533470013e-06, + "loss": 2.7595, + "step": 8859 + }, + { + "epoch": 1.7755511022044088, + "grad_norm": 31.25068577186667, + "learning_rate": 4.273322007999773e-06, + "loss": 2.8087, + "step": 8860 + }, + { + "epoch": 1.775751503006012, + "grad_norm": 28.129337961495338, + "learning_rate": 4.2721685220400845e-06, + "loss": 2.8885, + "step": 8861 + }, + { + "epoch": 1.7759519038076153, + "grad_norm": 17.32583319770108, + "learning_rate": 4.2710150756536714e-06, + "loss": 2.7689, + "step": 8862 + }, + { + "epoch": 1.7761523046092185, + "grad_norm": 34.11867658603605, + "learning_rate": 4.269861668903244e-06, + "loss": 2.3347, + "step": 8863 + }, + { + "epoch": 1.7763527054108217, + "grad_norm": 28.65884266492357, + "learning_rate": 4.268708301851513e-06, + "loss": 2.6847, + "step": 8864 + }, + { + "epoch": 1.776553106212425, + "grad_norm": 19.134809542139312, + "learning_rate": 4.267554974561194e-06, + "loss": 2.5043, + "step": 8865 + }, + { + "epoch": 1.776753507014028, + "grad_norm": 49.005311071911606, + "learning_rate": 4.266401687094991e-06, + "loss": 2.0738, + "step": 8866 + }, + { + "epoch": 1.7769539078156313, + "grad_norm": 19.730660873368027, + "learning_rate": 4.265248439515611e-06, + 
"loss": 2.7834, + "step": 8867 + }, + { + "epoch": 1.7771543086172343, + "grad_norm": 26.645800178886066, + "learning_rate": 4.264095231885757e-06, + "loss": 2.7962, + "step": 8868 + }, + { + "epoch": 1.7773547094188378, + "grad_norm": 74.53431562047655, + "learning_rate": 4.262942064268134e-06, + "loss": 2.5504, + "step": 8869 + }, + { + "epoch": 1.7775551102204408, + "grad_norm": 28.977936581050532, + "learning_rate": 4.261788936725438e-06, + "loss": 2.8402, + "step": 8870 + }, + { + "epoch": 1.7777555110220442, + "grad_norm": 28.541510468293406, + "learning_rate": 4.260635849320366e-06, + "loss": 2.3334, + "step": 8871 + }, + { + "epoch": 1.7779559118236472, + "grad_norm": 27.755404737642827, + "learning_rate": 4.259482802115615e-06, + "loss": 2.6162, + "step": 8872 + }, + { + "epoch": 1.7781563126252506, + "grad_norm": 41.593295864134504, + "learning_rate": 4.258329795173877e-06, + "loss": 3.0525, + "step": 8873 + }, + { + "epoch": 1.7783567134268536, + "grad_norm": 24.811236971680387, + "learning_rate": 4.257176828557843e-06, + "loss": 2.1527, + "step": 8874 + }, + { + "epoch": 1.7785571142284569, + "grad_norm": 27.968129041370396, + "learning_rate": 4.2560239023302015e-06, + "loss": 2.5201, + "step": 8875 + }, + { + "epoch": 1.77875751503006, + "grad_norm": 23.96001009178332, + "learning_rate": 4.25487101655364e-06, + "loss": 2.8134, + "step": 8876 + }, + { + "epoch": 1.7789579158316633, + "grad_norm": 23.381221042481144, + "learning_rate": 4.25371817129084e-06, + "loss": 2.6894, + "step": 8877 + }, + { + "epoch": 1.7791583166332665, + "grad_norm": 47.13108375371771, + "learning_rate": 4.252565366604484e-06, + "loss": 3.1033, + "step": 8878 + }, + { + "epoch": 1.7793587174348697, + "grad_norm": 26.426264025659115, + "learning_rate": 4.251412602557252e-06, + "loss": 2.7522, + "step": 8879 + }, + { + "epoch": 1.779559118236473, + "grad_norm": 24.111301299231755, + "learning_rate": 4.250259879211824e-06, + "loss": 2.9999, + "step": 8880 + }, + { + "epoch": 1.7797595190380762, + "grad_norm": 46.80953140517995, + "learning_rate": 4.249107196630871e-06, + "loss": 2.6668, + "step": 8881 + }, + { + "epoch": 1.7799599198396794, + "grad_norm": 25.57287355911345, + "learning_rate": 4.247954554877069e-06, + "loss": 2.526, + "step": 8882 + }, + { + "epoch": 1.7801603206412826, + "grad_norm": 22.550326435927982, + "learning_rate": 4.246801954013089e-06, + "loss": 2.5233, + "step": 8883 + }, + { + "epoch": 1.7803607214428858, + "grad_norm": 33.133127082665865, + "learning_rate": 4.2456493941015955e-06, + "loss": 2.7984, + "step": 8884 + }, + { + "epoch": 1.7805611222444888, + "grad_norm": 31.21958485301695, + "learning_rate": 4.24449687520526e-06, + "loss": 2.4693, + "step": 8885 + }, + { + "epoch": 1.7807615230460923, + "grad_norm": 33.44326895192471, + "learning_rate": 4.243344397386745e-06, + "loss": 2.8146, + "step": 8886 + }, + { + "epoch": 1.7809619238476952, + "grad_norm": 25.015364875219483, + "learning_rate": 4.242191960708713e-06, + "loss": 2.4573, + "step": 8887 + }, + { + "epoch": 1.7811623246492987, + "grad_norm": 21.70373887911328, + "learning_rate": 4.24103956523382e-06, + "loss": 2.6533, + "step": 8888 + }, + { + "epoch": 1.7813627254509017, + "grad_norm": 26.49950823503274, + "learning_rate": 4.239887211024728e-06, + "loss": 2.586, + "step": 8889 + }, + { + "epoch": 1.7815631262525051, + "grad_norm": 19.03786341898924, + "learning_rate": 4.2387348981440895e-06, + "loss": 2.7268, + "step": 8890 + }, + { + "epoch": 1.7817635270541081, + "grad_norm": 31.640765159000555, + 
"learning_rate": 4.2375826266545584e-06, + "loss": 3.23, + "step": 8891 + }, + { + "epoch": 1.7819639278557116, + "grad_norm": 25.53881946713614, + "learning_rate": 4.236430396618785e-06, + "loss": 2.6214, + "step": 8892 + }, + { + "epoch": 1.7821643286573146, + "grad_norm": 18.458870050539947, + "learning_rate": 4.23527820809942e-06, + "loss": 2.4915, + "step": 8893 + }, + { + "epoch": 1.782364729458918, + "grad_norm": 28.00429654915012, + "learning_rate": 4.234126061159107e-06, + "loss": 2.1453, + "step": 8894 + }, + { + "epoch": 1.782565130260521, + "grad_norm": 23.460284907956588, + "learning_rate": 4.2329739558604885e-06, + "loss": 2.0403, + "step": 8895 + }, + { + "epoch": 1.7827655310621242, + "grad_norm": 24.553443713605677, + "learning_rate": 4.23182189226621e-06, + "loss": 3.0678, + "step": 8896 + }, + { + "epoch": 1.7829659318637274, + "grad_norm": 20.52580333226405, + "learning_rate": 4.230669870438909e-06, + "loss": 2.8091, + "step": 8897 + }, + { + "epoch": 1.7831663326653306, + "grad_norm": 28.844784022095055, + "learning_rate": 4.229517890441221e-06, + "loss": 2.7918, + "step": 8898 + }, + { + "epoch": 1.7833667334669339, + "grad_norm": 25.19767523800901, + "learning_rate": 4.228365952335785e-06, + "loss": 2.4156, + "step": 8899 + }, + { + "epoch": 1.783567134268537, + "grad_norm": 23.406135261869405, + "learning_rate": 4.227214056185231e-06, + "loss": 2.4136, + "step": 8900 + }, + { + "epoch": 1.7837675350701403, + "grad_norm": 30.762666306609443, + "learning_rate": 4.226062202052188e-06, + "loss": 2.7923, + "step": 8901 + }, + { + "epoch": 1.7839679358717435, + "grad_norm": 24.56655228204973, + "learning_rate": 4.224910389999287e-06, + "loss": 3.1749, + "step": 8902 + }, + { + "epoch": 1.7841683366733467, + "grad_norm": 32.551850309452334, + "learning_rate": 4.223758620089151e-06, + "loss": 2.6328, + "step": 8903 + }, + { + "epoch": 1.78436873747495, + "grad_norm": 29.178202494711446, + "learning_rate": 4.2226068923844065e-06, + "loss": 3.0337, + "step": 8904 + }, + { + "epoch": 1.7845691382765532, + "grad_norm": 31.04488381834454, + "learning_rate": 4.22145520694767e-06, + "loss": 2.4111, + "step": 8905 + }, + { + "epoch": 1.7847695390781562, + "grad_norm": 17.777148919980355, + "learning_rate": 4.220303563841566e-06, + "loss": 2.4653, + "step": 8906 + }, + { + "epoch": 1.7849699398797596, + "grad_norm": 25.856572702206723, + "learning_rate": 4.219151963128707e-06, + "loss": 3.0654, + "step": 8907 + }, + { + "epoch": 1.7851703406813626, + "grad_norm": 39.68201160059856, + "learning_rate": 4.218000404871707e-06, + "loss": 3.7613, + "step": 8908 + }, + { + "epoch": 1.785370741482966, + "grad_norm": 24.089759521336095, + "learning_rate": 4.216848889133181e-06, + "loss": 3.0191, + "step": 8909 + }, + { + "epoch": 1.785571142284569, + "grad_norm": 16.647271488403256, + "learning_rate": 4.215697415975737e-06, + "loss": 2.9911, + "step": 8910 + }, + { + "epoch": 1.7857715430861725, + "grad_norm": 31.05386538980284, + "learning_rate": 4.214545985461983e-06, + "loss": 2.4543, + "step": 8911 + }, + { + "epoch": 1.7859719438877755, + "grad_norm": 33.27146754416966, + "learning_rate": 4.21339459765452e-06, + "loss": 2.7266, + "step": 8912 + }, + { + "epoch": 1.786172344689379, + "grad_norm": 24.404965751804987, + "learning_rate": 4.212243252615957e-06, + "loss": 2.745, + "step": 8913 + }, + { + "epoch": 1.786372745490982, + "grad_norm": 38.52346992992645, + "learning_rate": 4.211091950408891e-06, + "loss": 2.8613, + "step": 8914 + }, + { + "epoch": 1.7865731462925851, + "grad_norm": 
23.479867315887113, + "learning_rate": 4.209940691095918e-06, + "loss": 2.5581, + "step": 8915 + }, + { + "epoch": 1.7867735470941883, + "grad_norm": 32.70432252861887, + "learning_rate": 4.208789474739637e-06, + "loss": 3.209, + "step": 8916 + }, + { + "epoch": 1.7869739478957916, + "grad_norm": 22.163135054576074, + "learning_rate": 4.2076383014026414e-06, + "loss": 2.8893, + "step": 8917 + }, + { + "epoch": 1.7871743486973948, + "grad_norm": 33.379255234293744, + "learning_rate": 4.206487171147518e-06, + "loss": 3.6342, + "step": 8918 + }, + { + "epoch": 1.787374749498998, + "grad_norm": 31.413251902839587, + "learning_rate": 4.205336084036861e-06, + "loss": 2.5813, + "step": 8919 + }, + { + "epoch": 1.7875751503006012, + "grad_norm": 26.676231192136083, + "learning_rate": 4.204185040133255e-06, + "loss": 2.4185, + "step": 8920 + }, + { + "epoch": 1.7877755511022044, + "grad_norm": 37.0535612737808, + "learning_rate": 4.203034039499281e-06, + "loss": 3.2178, + "step": 8921 + }, + { + "epoch": 1.7879759519038076, + "grad_norm": 22.60293654286504, + "learning_rate": 4.201883082197523e-06, + "loss": 2.7748, + "step": 8922 + }, + { + "epoch": 1.7881763527054109, + "grad_norm": 21.26886107024945, + "learning_rate": 4.20073216829056e-06, + "loss": 2.722, + "step": 8923 + }, + { + "epoch": 1.788376753507014, + "grad_norm": 22.35206987629022, + "learning_rate": 4.19958129784097e-06, + "loss": 2.3046, + "step": 8924 + }, + { + "epoch": 1.788577154308617, + "grad_norm": 29.595211055485432, + "learning_rate": 4.198430470911324e-06, + "loss": 2.8477, + "step": 8925 + }, + { + "epoch": 1.7887775551102205, + "grad_norm": 16.293546181508425, + "learning_rate": 4.197279687564199e-06, + "loss": 2.7178, + "step": 8926 + }, + { + "epoch": 1.7889779559118235, + "grad_norm": 25.887796754372932, + "learning_rate": 4.196128947862161e-06, + "loss": 2.7707, + "step": 8927 + }, + { + "epoch": 1.789178356713427, + "grad_norm": 27.986150279225047, + "learning_rate": 4.194978251867778e-06, + "loss": 2.552, + "step": 8928 + }, + { + "epoch": 1.78937875751503, + "grad_norm": 36.05535779086217, + "learning_rate": 4.1938275996436165e-06, + "loss": 3.4203, + "step": 8929 + }, + { + "epoch": 1.7895791583166334, + "grad_norm": 28.48627360107248, + "learning_rate": 4.192676991252239e-06, + "loss": 2.1882, + "step": 8930 + }, + { + "epoch": 1.7897795591182364, + "grad_norm": 24.555820840586993, + "learning_rate": 4.191526426756206e-06, + "loss": 2.764, + "step": 8931 + }, + { + "epoch": 1.7899799599198398, + "grad_norm": 22.8537761016537, + "learning_rate": 4.190375906218073e-06, + "loss": 2.7545, + "step": 8932 + }, + { + "epoch": 1.7901803607214428, + "grad_norm": 24.89421505124643, + "learning_rate": 4.1892254297003974e-06, + "loss": 3.1135, + "step": 8933 + }, + { + "epoch": 1.790380761523046, + "grad_norm": 28.261609906159283, + "learning_rate": 4.1880749972657325e-06, + "loss": 2.4299, + "step": 8934 + }, + { + "epoch": 1.7905811623246493, + "grad_norm": 26.525317604858902, + "learning_rate": 4.186924608976628e-06, + "loss": 2.7038, + "step": 8935 + }, + { + "epoch": 1.7907815631262525, + "grad_norm": 38.35024549412099, + "learning_rate": 4.185774264895632e-06, + "loss": 1.7835, + "step": 8936 + }, + { + "epoch": 1.7909819639278557, + "grad_norm": 21.524451895327925, + "learning_rate": 4.184623965085293e-06, + "loss": 2.8628, + "step": 8937 + }, + { + "epoch": 1.791182364729459, + "grad_norm": 50.53494699238448, + "learning_rate": 4.183473709608153e-06, + "loss": 3.1675, + "step": 8938 + }, + { + "epoch": 
1.7913827655310621, + "grad_norm": 28.46255939980439, + "learning_rate": 4.1823234985267495e-06, + "loss": 2.721, + "step": 8939 + }, + { + "epoch": 1.7915831663326653, + "grad_norm": 35.370708722794085, + "learning_rate": 4.181173331903627e-06, + "loss": 3.1079, + "step": 8940 + }, + { + "epoch": 1.7917835671342686, + "grad_norm": 43.419942503941364, + "learning_rate": 4.180023209801318e-06, + "loss": 2.27, + "step": 8941 + }, + { + "epoch": 1.7919839679358718, + "grad_norm": 27.488931676916692, + "learning_rate": 4.178873132282357e-06, + "loss": 3.0357, + "step": 8942 + }, + { + "epoch": 1.792184368737475, + "grad_norm": 39.233582586842815, + "learning_rate": 4.177723099409278e-06, + "loss": 2.4267, + "step": 8943 + }, + { + "epoch": 1.792384769539078, + "grad_norm": 26.35364673647159, + "learning_rate": 4.176573111244607e-06, + "loss": 2.9077, + "step": 8944 + }, + { + "epoch": 1.7925851703406814, + "grad_norm": 21.8283635361549, + "learning_rate": 4.17542316785087e-06, + "loss": 2.1484, + "step": 8945 + }, + { + "epoch": 1.7927855711422844, + "grad_norm": 28.49035023451897, + "learning_rate": 4.1742732692905945e-06, + "loss": 2.3534, + "step": 8946 + }, + { + "epoch": 1.7929859719438879, + "grad_norm": 27.43029200588631, + "learning_rate": 4.173123415626299e-06, + "loss": 3.1739, + "step": 8947 + }, + { + "epoch": 1.7931863727454909, + "grad_norm": 30.51216011168234, + "learning_rate": 4.171973606920505e-06, + "loss": 2.6709, + "step": 8948 + }, + { + "epoch": 1.7933867735470943, + "grad_norm": 44.167225886153865, + "learning_rate": 4.170823843235726e-06, + "loss": 2.565, + "step": 8949 + }, + { + "epoch": 1.7935871743486973, + "grad_norm": 24.359265844000756, + "learning_rate": 4.169674124634482e-06, + "loss": 2.8892, + "step": 8950 + }, + { + "epoch": 1.7937875751503007, + "grad_norm": 26.5934568021428, + "learning_rate": 4.16852445117928e-06, + "loss": 2.6774, + "step": 8951 + }, + { + "epoch": 1.7939879759519037, + "grad_norm": 23.077636570000085, + "learning_rate": 4.167374822932629e-06, + "loss": 2.9117, + "step": 8952 + }, + { + "epoch": 1.7941883767535072, + "grad_norm": 26.626168231084165, + "learning_rate": 4.16622523995704e-06, + "loss": 2.7158, + "step": 8953 + }, + { + "epoch": 1.7943887775551102, + "grad_norm": 27.96849159161828, + "learning_rate": 4.165075702315015e-06, + "loss": 2.8567, + "step": 8954 + }, + { + "epoch": 1.7945891783567134, + "grad_norm": 27.957372917705495, + "learning_rate": 4.163926210069056e-06, + "loss": 2.4949, + "step": 8955 + }, + { + "epoch": 1.7947895791583166, + "grad_norm": 22.137253223643654, + "learning_rate": 4.162776763281664e-06, + "loss": 2.6013, + "step": 8956 + }, + { + "epoch": 1.7949899799599198, + "grad_norm": 25.138677468961404, + "learning_rate": 4.1616273620153356e-06, + "loss": 2.5843, + "step": 8957 + }, + { + "epoch": 1.795190380761523, + "grad_norm": 28.455818666040127, + "learning_rate": 4.160478006332566e-06, + "loss": 3.034, + "step": 8958 + }, + { + "epoch": 1.7953907815631263, + "grad_norm": 22.0722009453634, + "learning_rate": 4.159328696295843e-06, + "loss": 2.627, + "step": 8959 + }, + { + "epoch": 1.7955911823647295, + "grad_norm": 30.170184418423187, + "learning_rate": 4.158179431967661e-06, + "loss": 2.7188, + "step": 8960 + }, + { + "epoch": 1.7957915831663327, + "grad_norm": 34.06885480567445, + "learning_rate": 4.157030213410508e-06, + "loss": 3.031, + "step": 8961 + }, + { + "epoch": 1.795991983967936, + "grad_norm": 29.676425470227336, + "learning_rate": 4.155881040686864e-06, + "loss": 2.5737, + "step": 
8962 + }, + { + "epoch": 1.7961923847695391, + "grad_norm": 26.968437086356406, + "learning_rate": 4.154731913859216e-06, + "loss": 2.4994, + "step": 8963 + }, + { + "epoch": 1.7963927855711423, + "grad_norm": 37.62331470064039, + "learning_rate": 4.153582832990042e-06, + "loss": 3.1479, + "step": 8964 + }, + { + "epoch": 1.7965931863727453, + "grad_norm": 16.935948574657797, + "learning_rate": 4.1524337981418164e-06, + "loss": 2.5784, + "step": 8965 + }, + { + "epoch": 1.7967935871743488, + "grad_norm": 31.271697331447097, + "learning_rate": 4.151284809377016e-06, + "loss": 2.9514, + "step": 8966 + }, + { + "epoch": 1.7969939879759518, + "grad_norm": 23.675268019036245, + "learning_rate": 4.150135866758115e-06, + "loss": 2.4719, + "step": 8967 + }, + { + "epoch": 1.7971943887775552, + "grad_norm": 20.921229427400913, + "learning_rate": 4.148986970347582e-06, + "loss": 2.7068, + "step": 8968 + }, + { + "epoch": 1.7973947895791582, + "grad_norm": 44.6491524538976, + "learning_rate": 4.147838120207882e-06, + "loss": 2.728, + "step": 8969 + }, + { + "epoch": 1.7975951903807617, + "grad_norm": 29.171743783810008, + "learning_rate": 4.146689316401483e-06, + "loss": 2.6394, + "step": 8970 + }, + { + "epoch": 1.7977955911823646, + "grad_norm": 34.79486691539925, + "learning_rate": 4.145540558990843e-06, + "loss": 2.4598, + "step": 8971 + }, + { + "epoch": 1.797995991983968, + "grad_norm": 38.892268132457104, + "learning_rate": 4.144391848038426e-06, + "loss": 2.7746, + "step": 8972 + }, + { + "epoch": 1.798196392785571, + "grad_norm": 23.33573123133654, + "learning_rate": 4.1432431836066865e-06, + "loss": 2.4036, + "step": 8973 + }, + { + "epoch": 1.7983967935871743, + "grad_norm": 27.507445613306786, + "learning_rate": 4.142094565758081e-06, + "loss": 2.3546, + "step": 8974 + }, + { + "epoch": 1.7985971943887775, + "grad_norm": 27.77468943799646, + "learning_rate": 4.14094599455506e-06, + "loss": 2.276, + "step": 8975 + }, + { + "epoch": 1.7987975951903807, + "grad_norm": 41.35201424038475, + "learning_rate": 4.139797470060072e-06, + "loss": 2.6713, + "step": 8976 + }, + { + "epoch": 1.798997995991984, + "grad_norm": 31.906961680804475, + "learning_rate": 4.138648992335567e-06, + "loss": 2.5858, + "step": 8977 + }, + { + "epoch": 1.7991983967935872, + "grad_norm": 30.190612153714202, + "learning_rate": 4.137500561443986e-06, + "loss": 3.3034, + "step": 8978 + }, + { + "epoch": 1.7993987975951904, + "grad_norm": 23.04126928492458, + "learning_rate": 4.1363521774477725e-06, + "loss": 2.8648, + "step": 8979 + }, + { + "epoch": 1.7995991983967936, + "grad_norm": 28.421416954403806, + "learning_rate": 4.135203840409368e-06, + "loss": 2.8877, + "step": 8980 + }, + { + "epoch": 1.7997995991983968, + "grad_norm": 27.220969751939336, + "learning_rate": 4.134055550391207e-06, + "loss": 2.8577, + "step": 8981 + }, + { + "epoch": 1.8, + "grad_norm": 25.252454037072543, + "learning_rate": 4.132907307455724e-06, + "loss": 2.9152, + "step": 8982 + }, + { + "epoch": 1.8002004008016033, + "grad_norm": 16.35721575844881, + "learning_rate": 4.131759111665349e-06, + "loss": 2.5672, + "step": 8983 + }, + { + "epoch": 1.8004008016032063, + "grad_norm": 65.35012839190436, + "learning_rate": 4.130610963082513e-06, + "loss": 3.5516, + "step": 8984 + }, + { + "epoch": 1.8006012024048097, + "grad_norm": 23.31030144792492, + "learning_rate": 4.1294628617696445e-06, + "loss": 2.6235, + "step": 8985 + }, + { + "epoch": 1.8008016032064127, + "grad_norm": 30.027828237937126, + "learning_rate": 4.128314807789162e-06, + "loss": 
2.676, + "step": 8986 + }, + { + "epoch": 1.8010020040080161, + "grad_norm": 16.8739421633079, + "learning_rate": 4.127166801203492e-06, + "loss": 2.2862, + "step": 8987 + }, + { + "epoch": 1.8012024048096191, + "grad_norm": 34.99498741740887, + "learning_rate": 4.1260188420750525e-06, + "loss": 2.887, + "step": 8988 + }, + { + "epoch": 1.8014028056112226, + "grad_norm": 22.40024379899867, + "learning_rate": 4.124870930466256e-06, + "loss": 2.9954, + "step": 8989 + }, + { + "epoch": 1.8016032064128256, + "grad_norm": 25.753979791338562, + "learning_rate": 4.123723066439521e-06, + "loss": 2.681, + "step": 8990 + }, + { + "epoch": 1.801803607214429, + "grad_norm": 25.785294276443736, + "learning_rate": 4.122575250057255e-06, + "loss": 2.7474, + "step": 8991 + }, + { + "epoch": 1.802004008016032, + "grad_norm": 21.514049167699856, + "learning_rate": 4.12142748138187e-06, + "loss": 2.6871, + "step": 8992 + }, + { + "epoch": 1.8022044088176352, + "grad_norm": 21.9679974988355, + "learning_rate": 4.120279760475766e-06, + "loss": 3.0985, + "step": 8993 + }, + { + "epoch": 1.8024048096192384, + "grad_norm": 33.02150553751873, + "learning_rate": 4.119132087401353e-06, + "loss": 3.4031, + "step": 8994 + }, + { + "epoch": 1.8026052104208417, + "grad_norm": 22.902252947370012, + "learning_rate": 4.117984462221029e-06, + "loss": 2.6578, + "step": 8995 + }, + { + "epoch": 1.8028056112224449, + "grad_norm": 33.483676714163224, + "learning_rate": 4.1168368849971895e-06, + "loss": 3.1178, + "step": 8996 + }, + { + "epoch": 1.803006012024048, + "grad_norm": 29.04665929683162, + "learning_rate": 4.115689355792233e-06, + "loss": 3.0432, + "step": 8997 + }, + { + "epoch": 1.8032064128256513, + "grad_norm": 23.158909699343113, + "learning_rate": 4.114541874668553e-06, + "loss": 2.8362, + "step": 8998 + }, + { + "epoch": 1.8034068136272545, + "grad_norm": 25.38161806896136, + "learning_rate": 4.113394441688535e-06, + "loss": 2.6393, + "step": 8999 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 32.02507780209389, + "learning_rate": 4.1122470569145735e-06, + "loss": 2.8005, + "step": 9000 + }, + { + "epoch": 1.803807615230461, + "grad_norm": 26.547309688495947, + "learning_rate": 4.111099720409049e-06, + "loss": 2.5858, + "step": 9001 + }, + { + "epoch": 1.8040080160320642, + "grad_norm": 36.989842214198454, + "learning_rate": 4.109952432234345e-06, + "loss": 2.3106, + "step": 9002 + }, + { + "epoch": 1.8042084168336672, + "grad_norm": 27.59313910670064, + "learning_rate": 4.1088051924528405e-06, + "loss": 2.6337, + "step": 9003 + }, + { + "epoch": 1.8044088176352706, + "grad_norm": 25.57968785370982, + "learning_rate": 4.107658001126913e-06, + "loss": 2.6914, + "step": 9004 + }, + { + "epoch": 1.8046092184368736, + "grad_norm": 49.636641127140955, + "learning_rate": 4.106510858318938e-06, + "loss": 3.5266, + "step": 9005 + }, + { + "epoch": 1.804809619238477, + "grad_norm": 28.672390747276765, + "learning_rate": 4.105363764091285e-06, + "loss": 2.9903, + "step": 9006 + }, + { + "epoch": 1.80501002004008, + "grad_norm": 21.071830097085233, + "learning_rate": 4.104216718506326e-06, + "loss": 2.9232, + "step": 9007 + }, + { + "epoch": 1.8052104208416835, + "grad_norm": 22.97422334791701, + "learning_rate": 4.103069721626428e-06, + "loss": 2.6274, + "step": 9008 + }, + { + "epoch": 1.8054108216432865, + "grad_norm": 38.74614164361861, + "learning_rate": 4.101922773513951e-06, + "loss": 2.6286, + "step": 9009 + }, + { + "epoch": 1.80561122244489, + "grad_norm": 23.632359981570936, + "learning_rate": 
4.100775874231257e-06, + "loss": 2.5196, + "step": 9010 + }, + { + "epoch": 1.805811623246493, + "grad_norm": 28.829008154036597, + "learning_rate": 4.099629023840709e-06, + "loss": 2.8036, + "step": 9011 + }, + { + "epoch": 1.8060120240480964, + "grad_norm": 21.698918155645902, + "learning_rate": 4.09848222240466e-06, + "loss": 2.598, + "step": 9012 + }, + { + "epoch": 1.8062124248496993, + "grad_norm": 23.418736939670765, + "learning_rate": 4.0973354699854605e-06, + "loss": 2.4929, + "step": 9013 + }, + { + "epoch": 1.8064128256513026, + "grad_norm": 33.29762980201947, + "learning_rate": 4.096188766645467e-06, + "loss": 3.5295, + "step": 9014 + }, + { + "epoch": 1.8066132264529058, + "grad_norm": 49.38801504700252, + "learning_rate": 4.0950421124470225e-06, + "loss": 2.7303, + "step": 9015 + }, + { + "epoch": 1.806813627254509, + "grad_norm": 25.80759195152269, + "learning_rate": 4.093895507452475e-06, + "loss": 2.4273, + "step": 9016 + }, + { + "epoch": 1.8070140280561122, + "grad_norm": 22.065512355388588, + "learning_rate": 4.092748951724166e-06, + "loss": 2.5394, + "step": 9017 + }, + { + "epoch": 1.8072144288577154, + "grad_norm": 28.44941175266132, + "learning_rate": 4.091602445324437e-06, + "loss": 2.4946, + "step": 9018 + }, + { + "epoch": 1.8074148296593187, + "grad_norm": 29.240219211207, + "learning_rate": 4.090455988315623e-06, + "loss": 3.1503, + "step": 9019 + }, + { + "epoch": 1.8076152304609219, + "grad_norm": 36.99602187919301, + "learning_rate": 4.089309580760058e-06, + "loss": 2.863, + "step": 9020 + }, + { + "epoch": 1.807815631262525, + "grad_norm": 28.57050789662674, + "learning_rate": 4.088163222720077e-06, + "loss": 2.7141, + "step": 9021 + }, + { + "epoch": 1.8080160320641283, + "grad_norm": 20.402665456319983, + "learning_rate": 4.087016914258007e-06, + "loss": 2.4006, + "step": 9022 + }, + { + "epoch": 1.8082164328657315, + "grad_norm": 31.028010154842697, + "learning_rate": 4.085870655436175e-06, + "loss": 2.5367, + "step": 9023 + }, + { + "epoch": 1.8084168336673345, + "grad_norm": 22.27524683191492, + "learning_rate": 4.0847244463169055e-06, + "loss": 2.8853, + "step": 9024 + }, + { + "epoch": 1.808617234468938, + "grad_norm": 25.563324572795967, + "learning_rate": 4.0835782869625195e-06, + "loss": 3.2101, + "step": 9025 + }, + { + "epoch": 1.808817635270541, + "grad_norm": 23.699447517944144, + "learning_rate": 4.0824321774353345e-06, + "loss": 2.9248, + "step": 9026 + }, + { + "epoch": 1.8090180360721444, + "grad_norm": 25.105064359915506, + "learning_rate": 4.081286117797665e-06, + "loss": 2.7211, + "step": 9027 + }, + { + "epoch": 1.8092184368737474, + "grad_norm": 34.718275676722335, + "learning_rate": 4.0801401081118265e-06, + "loss": 2.7, + "step": 9028 + }, + { + "epoch": 1.8094188376753508, + "grad_norm": 21.795988046564993, + "learning_rate": 4.078994148440128e-06, + "loss": 2.506, + "step": 9029 + }, + { + "epoch": 1.8096192384769538, + "grad_norm": 24.43683765764179, + "learning_rate": 4.0778482388448756e-06, + "loss": 2.8323, + "step": 9030 + }, + { + "epoch": 1.8098196392785573, + "grad_norm": 22.374405461738768, + "learning_rate": 4.076702379388379e-06, + "loss": 2.0313, + "step": 9031 + }, + { + "epoch": 1.8100200400801603, + "grad_norm": 31.113726472186517, + "learning_rate": 4.075556570132936e-06, + "loss": 3.1416, + "step": 9032 + }, + { + "epoch": 1.8102204408817635, + "grad_norm": 24.688381045418073, + "learning_rate": 4.0744108111408445e-06, + "loss": 2.8479, + "step": 9033 + }, + { + "epoch": 1.8104208416833667, + "grad_norm": 
86.03656822844177, + "learning_rate": 4.073265102474406e-06, + "loss": 2.7067, + "step": 9034 + }, + { + "epoch": 1.81062124248497, + "grad_norm": 18.212050043499673, + "learning_rate": 4.07211944419591e-06, + "loss": 2.3742, + "step": 9035 + }, + { + "epoch": 1.8108216432865731, + "grad_norm": 29.634949482858303, + "learning_rate": 4.070973836367652e-06, + "loss": 2.7487, + "step": 9036 + }, + { + "epoch": 1.8110220440881764, + "grad_norm": 21.38024391607711, + "learning_rate": 4.069828279051915e-06, + "loss": 2.1342, + "step": 9037 + }, + { + "epoch": 1.8112224448897796, + "grad_norm": 21.760652731333668, + "learning_rate": 4.068682772310989e-06, + "loss": 1.988, + "step": 9038 + }, + { + "epoch": 1.8114228456913828, + "grad_norm": 55.35631916067747, + "learning_rate": 4.067537316207156e-06, + "loss": 3.2245, + "step": 9039 + }, + { + "epoch": 1.811623246492986, + "grad_norm": 37.077666535466264, + "learning_rate": 4.0663919108026936e-06, + "loss": 2.5073, + "step": 9040 + }, + { + "epoch": 1.8118236472945892, + "grad_norm": 24.0467735964676, + "learning_rate": 4.065246556159882e-06, + "loss": 2.7477, + "step": 9041 + }, + { + "epoch": 1.8120240480961924, + "grad_norm": 24.993342151867726, + "learning_rate": 4.0641012523409965e-06, + "loss": 2.8224, + "step": 9042 + }, + { + "epoch": 1.8122244488977954, + "grad_norm": 19.359124080881653, + "learning_rate": 4.0629559994083055e-06, + "loss": 3.2575, + "step": 9043 + }, + { + "epoch": 1.8124248496993989, + "grad_norm": 23.00263269144176, + "learning_rate": 4.061810797424081e-06, + "loss": 3.0904, + "step": 9044 + }, + { + "epoch": 1.8126252505010019, + "grad_norm": 17.055874882218447, + "learning_rate": 4.0606656464505885e-06, + "loss": 2.6636, + "step": 9045 + }, + { + "epoch": 1.8128256513026053, + "grad_norm": 31.582467549061906, + "learning_rate": 4.059520546550092e-06, + "loss": 2.5901, + "step": 9046 + }, + { + "epoch": 1.8130260521042083, + "grad_norm": 25.13064555612544, + "learning_rate": 4.05837549778485e-06, + "loss": 2.4782, + "step": 9047 + }, + { + "epoch": 1.8132264529058117, + "grad_norm": 26.42684610403443, + "learning_rate": 4.0572305002171225e-06, + "loss": 3.0771, + "step": 9048 + }, + { + "epoch": 1.8134268537074147, + "grad_norm": 27.997412721970495, + "learning_rate": 4.056085553909165e-06, + "loss": 2.5722, + "step": 9049 + }, + { + "epoch": 1.8136272545090182, + "grad_norm": 20.64062513662349, + "learning_rate": 4.054940658923228e-06, + "loss": 2.1468, + "step": 9050 + }, + { + "epoch": 1.8138276553106212, + "grad_norm": 28.426114790318607, + "learning_rate": 4.053795815321564e-06, + "loss": 2.9181, + "step": 9051 + }, + { + "epoch": 1.8140280561122244, + "grad_norm": 28.659936836123574, + "learning_rate": 4.052651023166418e-06, + "loss": 2.7099, + "step": 9052 + }, + { + "epoch": 1.8142284569138276, + "grad_norm": 19.377021546776227, + "learning_rate": 4.051506282520034e-06, + "loss": 2.1943, + "step": 9053 + }, + { + "epoch": 1.8144288577154308, + "grad_norm": 37.465006651290075, + "learning_rate": 4.050361593444652e-06, + "loss": 2.8024, + "step": 9054 + }, + { + "epoch": 1.814629258517034, + "grad_norm": 23.31868905179812, + "learning_rate": 4.049216956002514e-06, + "loss": 2.6937, + "step": 9055 + }, + { + "epoch": 1.8148296593186373, + "grad_norm": 17.08789827169204, + "learning_rate": 4.0480723702558545e-06, + "loss": 2.7421, + "step": 9056 + }, + { + "epoch": 1.8150300601202405, + "grad_norm": 19.68978551556467, + "learning_rate": 4.0469278362669025e-06, + "loss": 2.5589, + "step": 9057 + }, + { + "epoch": 
1.8152304609218437, + "grad_norm": 36.44026843680061, + "learning_rate": 4.045783354097893e-06, + "loss": 2.9842, + "step": 9058 + }, + { + "epoch": 1.815430861723447, + "grad_norm": 20.894294297154854, + "learning_rate": 4.04463892381105e-06, + "loss": 2.4989, + "step": 9059 + }, + { + "epoch": 1.8156312625250501, + "grad_norm": 34.17645135457706, + "learning_rate": 4.043494545468599e-06, + "loss": 2.7103, + "step": 9060 + }, + { + "epoch": 1.8158316633266534, + "grad_norm": 26.29076579730274, + "learning_rate": 4.042350219132761e-06, + "loss": 2.3914, + "step": 9061 + }, + { + "epoch": 1.8160320641282564, + "grad_norm": 28.645467006405347, + "learning_rate": 4.041205944865757e-06, + "loss": 2.4774, + "step": 9062 + }, + { + "epoch": 1.8162324649298598, + "grad_norm": 18.3529737282993, + "learning_rate": 4.040061722729801e-06, + "loss": 2.4249, + "step": 9063 + }, + { + "epoch": 1.8164328657314628, + "grad_norm": 22.292309755465027, + "learning_rate": 4.038917552787104e-06, + "loss": 2.1406, + "step": 9064 + }, + { + "epoch": 1.8166332665330662, + "grad_norm": 29.97271474169637, + "learning_rate": 4.037773435099881e-06, + "loss": 3.0257, + "step": 9065 + }, + { + "epoch": 1.8168336673346692, + "grad_norm": 22.337002029452503, + "learning_rate": 4.036629369730335e-06, + "loss": 3.3613, + "step": 9066 + }, + { + "epoch": 1.8170340681362727, + "grad_norm": 34.523970243134855, + "learning_rate": 4.0354853567406706e-06, + "loss": 2.9563, + "step": 9067 + }, + { + "epoch": 1.8172344689378757, + "grad_norm": 37.732744841307394, + "learning_rate": 4.034341396193094e-06, + "loss": 3.0336, + "step": 9068 + }, + { + "epoch": 1.817434869739479, + "grad_norm": 24.65711640369146, + "learning_rate": 4.033197488149801e-06, + "loss": 2.6112, + "step": 9069 + }, + { + "epoch": 1.817635270541082, + "grad_norm": 26.906359872013727, + "learning_rate": 4.032053632672987e-06, + "loss": 2.6746, + "step": 9070 + }, + { + "epoch": 1.8178356713426855, + "grad_norm": 26.49004459085294, + "learning_rate": 4.030909829824845e-06, + "loss": 2.968, + "step": 9071 + }, + { + "epoch": 1.8180360721442885, + "grad_norm": 22.407612255517165, + "learning_rate": 4.029766079667566e-06, + "loss": 2.9868, + "step": 9072 + }, + { + "epoch": 1.8182364729458917, + "grad_norm": 24.812690566450165, + "learning_rate": 4.02862238226334e-06, + "loss": 2.9223, + "step": 9073 + }, + { + "epoch": 1.818436873747495, + "grad_norm": 32.48089897174079, + "learning_rate": 4.027478737674345e-06, + "loss": 3.3219, + "step": 9074 + }, + { + "epoch": 1.8186372745490982, + "grad_norm": 24.01032365527999, + "learning_rate": 4.02633514596277e-06, + "loss": 2.5444, + "step": 9075 + }, + { + "epoch": 1.8188376753507014, + "grad_norm": 30.353795656427017, + "learning_rate": 4.02519160719079e-06, + "loss": 2.5438, + "step": 9076 + }, + { + "epoch": 1.8190380761523046, + "grad_norm": 34.14594826314712, + "learning_rate": 4.024048121420579e-06, + "loss": 3.1299, + "step": 9077 + }, + { + "epoch": 1.8192384769539078, + "grad_norm": 20.725861381607253, + "learning_rate": 4.022904688714314e-06, + "loss": 2.3212, + "step": 9078 + }, + { + "epoch": 1.819438877755511, + "grad_norm": 23.95438006430131, + "learning_rate": 4.021761309134164e-06, + "loss": 2.6647, + "step": 9079 + }, + { + "epoch": 1.8196392785571143, + "grad_norm": 25.54069752045167, + "learning_rate": 4.020617982742295e-06, + "loss": 1.9655, + "step": 9080 + }, + { + "epoch": 1.8198396793587175, + "grad_norm": 41.12834650627488, + "learning_rate": 4.01947470960087e-06, + "loss": 3.2012, + "step": 
9081 + }, + { + "epoch": 1.8200400801603207, + "grad_norm": 36.8918068631021, + "learning_rate": 4.018331489772055e-06, + "loss": 2.529, + "step": 9082 + }, + { + "epoch": 1.8202404809619237, + "grad_norm": 26.814603823460565, + "learning_rate": 4.017188323318006e-06, + "loss": 3.0108, + "step": 9083 + }, + { + "epoch": 1.8204408817635271, + "grad_norm": 25.67733097992346, + "learning_rate": 4.016045210300876e-06, + "loss": 2.9006, + "step": 9084 + }, + { + "epoch": 1.8206412825651301, + "grad_norm": 35.07630861360729, + "learning_rate": 4.014902150782823e-06, + "loss": 2.4111, + "step": 9085 + }, + { + "epoch": 1.8208416833667336, + "grad_norm": 30.223354035393797, + "learning_rate": 4.013759144825993e-06, + "loss": 3.0756, + "step": 9086 + }, + { + "epoch": 1.8210420841683366, + "grad_norm": 26.294964261540652, + "learning_rate": 4.012616192492533e-06, + "loss": 3.1361, + "step": 9087 + }, + { + "epoch": 1.82124248496994, + "grad_norm": 24.35263489512766, + "learning_rate": 4.01147329384459e-06, + "loss": 2.656, + "step": 9088 + }, + { + "epoch": 1.821442885771543, + "grad_norm": 27.592791240365052, + "learning_rate": 4.010330448944304e-06, + "loss": 2.7121, + "step": 9089 + }, + { + "epoch": 1.8216432865731464, + "grad_norm": 17.564032242370423, + "learning_rate": 4.009187657853811e-06, + "loss": 2.0247, + "step": 9090 + }, + { + "epoch": 1.8218436873747494, + "grad_norm": 26.179101172075537, + "learning_rate": 4.0080449206352475e-06, + "loss": 3.1312, + "step": 9091 + }, + { + "epoch": 1.8220440881763527, + "grad_norm": 27.93822065160226, + "learning_rate": 4.006902237350746e-06, + "loss": 2.5713, + "step": 9092 + }, + { + "epoch": 1.8222444889779559, + "grad_norm": 29.889949658899177, + "learning_rate": 4.005759608062436e-06, + "loss": 2.8743, + "step": 9093 + }, + { + "epoch": 1.822444889779559, + "grad_norm": 25.224274748438454, + "learning_rate": 4.004617032832443e-06, + "loss": 2.674, + "step": 9094 + }, + { + "epoch": 1.8226452905811623, + "grad_norm": 27.0526568564832, + "learning_rate": 4.003474511722893e-06, + "loss": 2.8439, + "step": 9095 + }, + { + "epoch": 1.8228456913827655, + "grad_norm": 26.495677938970292, + "learning_rate": 4.002332044795904e-06, + "loss": 2.3894, + "step": 9096 + }, + { + "epoch": 1.8230460921843687, + "grad_norm": 32.818599673546444, + "learning_rate": 4.001189632113594e-06, + "loss": 3.0487, + "step": 9097 + }, + { + "epoch": 1.823246492985972, + "grad_norm": 20.051147490343965, + "learning_rate": 4.000047273738077e-06, + "loss": 2.4073, + "step": 9098 + }, + { + "epoch": 1.8234468937875752, + "grad_norm": 24.318427930023347, + "learning_rate": 3.998904969731468e-06, + "loss": 2.8836, + "step": 9099 + }, + { + "epoch": 1.8236472945891784, + "grad_norm": 22.718685978816684, + "learning_rate": 3.997762720155873e-06, + "loss": 2.2781, + "step": 9100 + }, + { + "epoch": 1.8238476953907816, + "grad_norm": 21.75514240717329, + "learning_rate": 3.996620525073397e-06, + "loss": 2.4804, + "step": 9101 + }, + { + "epoch": 1.8240480961923846, + "grad_norm": 31.624094828753368, + "learning_rate": 3.995478384546146e-06, + "loss": 2.916, + "step": 9102 + }, + { + "epoch": 1.824248496993988, + "grad_norm": 35.991093506406074, + "learning_rate": 3.994336298636217e-06, + "loss": 2.5472, + "step": 9103 + }, + { + "epoch": 1.824448897795591, + "grad_norm": 26.800959030283035, + "learning_rate": 3.9931942674057066e-06, + "loss": 2.4652, + "step": 9104 + }, + { + "epoch": 1.8246492985971945, + "grad_norm": 23.196621921097005, + "learning_rate": 3.992052290916712e-06, + 
"loss": 2.4701, + "step": 9105 + }, + { + "epoch": 1.8248496993987975, + "grad_norm": 29.744670392736214, + "learning_rate": 3.990910369231323e-06, + "loss": 3.1292, + "step": 9106 + }, + { + "epoch": 1.825050100200401, + "grad_norm": 27.599122859994402, + "learning_rate": 3.9897685024116265e-06, + "loss": 2.6185, + "step": 9107 + }, + { + "epoch": 1.825250501002004, + "grad_norm": 64.19017161787521, + "learning_rate": 3.988626690519705e-06, + "loss": 2.6985, + "step": 9108 + }, + { + "epoch": 1.8254509018036074, + "grad_norm": 34.13221019711865, + "learning_rate": 3.987484933617644e-06, + "loss": 2.2056, + "step": 9109 + }, + { + "epoch": 1.8256513026052104, + "grad_norm": 26.58297566196903, + "learning_rate": 3.986343231767525e-06, + "loss": 2.9452, + "step": 9110 + }, + { + "epoch": 1.8258517034068136, + "grad_norm": 23.220065395943777, + "learning_rate": 3.985201585031417e-06, + "loss": 3.0437, + "step": 9111 + }, + { + "epoch": 1.8260521042084168, + "grad_norm": 25.421897267077025, + "learning_rate": 3.984059993471399e-06, + "loss": 3.2114, + "step": 9112 + }, + { + "epoch": 1.82625250501002, + "grad_norm": 19.937393060322435, + "learning_rate": 3.982918457149539e-06, + "loss": 2.6058, + "step": 9113 + }, + { + "epoch": 1.8264529058116232, + "grad_norm": 20.91203348581544, + "learning_rate": 3.981776976127902e-06, + "loss": 2.6024, + "step": 9114 + }, + { + "epoch": 1.8266533066132264, + "grad_norm": 31.49645126956564, + "learning_rate": 3.980635550468556e-06, + "loss": 2.8256, + "step": 9115 + }, + { + "epoch": 1.8268537074148297, + "grad_norm": 48.29877094483716, + "learning_rate": 3.979494180233558e-06, + "loss": 2.8924, + "step": 9116 + }, + { + "epoch": 1.8270541082164329, + "grad_norm": 22.139996489074242, + "learning_rate": 3.97835286548497e-06, + "loss": 2.5775, + "step": 9117 + }, + { + "epoch": 1.827254509018036, + "grad_norm": 22.970724443835365, + "learning_rate": 3.977211606284842e-06, + "loss": 2.7928, + "step": 9118 + }, + { + "epoch": 1.8274549098196393, + "grad_norm": 26.570961653320136, + "learning_rate": 3.97607040269523e-06, + "loss": 3.2083, + "step": 9119 + }, + { + "epoch": 1.8276553106212425, + "grad_norm": 29.690573837004226, + "learning_rate": 3.974929254778183e-06, + "loss": 2.8837, + "step": 9120 + }, + { + "epoch": 1.8278557114228455, + "grad_norm": 18.960623276423494, + "learning_rate": 3.973788162595743e-06, + "loss": 2.4119, + "step": 9121 + }, + { + "epoch": 1.828056112224449, + "grad_norm": 32.422818910514636, + "learning_rate": 3.9726471262099565e-06, + "loss": 3.3866, + "step": 9122 + }, + { + "epoch": 1.828256513026052, + "grad_norm": 32.96792885787491, + "learning_rate": 3.971506145682863e-06, + "loss": 2.8347, + "step": 9123 + }, + { + "epoch": 1.8284569138276554, + "grad_norm": 22.373295427463493, + "learning_rate": 3.970365221076499e-06, + "loss": 2.1774, + "step": 9124 + }, + { + "epoch": 1.8286573146292584, + "grad_norm": 30.81653045693472, + "learning_rate": 3.969224352452896e-06, + "loss": 2.4976, + "step": 9125 + }, + { + "epoch": 1.8288577154308618, + "grad_norm": 21.595524894243646, + "learning_rate": 3.968083539874088e-06, + "loss": 2.3961, + "step": 9126 + }, + { + "epoch": 1.8290581162324648, + "grad_norm": 25.787083672748903, + "learning_rate": 3.9669427834021e-06, + "loss": 2.5915, + "step": 9127 + }, + { + "epoch": 1.8292585170340683, + "grad_norm": 25.236221109908836, + "learning_rate": 3.965802083098958e-06, + "loss": 2.509, + "step": 9128 + }, + { + "epoch": 1.8294589178356713, + "grad_norm": 23.551055828364643, + 
"learning_rate": 3.964661439026684e-06, + "loss": 2.9373, + "step": 9129 + }, + { + "epoch": 1.8296593186372747, + "grad_norm": 20.969424750680787, + "learning_rate": 3.963520851247297e-06, + "loss": 3.0121, + "step": 9130 + }, + { + "epoch": 1.8298597194388777, + "grad_norm": 21.999776691127686, + "learning_rate": 3.9623803198228085e-06, + "loss": 2.46, + "step": 9131 + }, + { + "epoch": 1.830060120240481, + "grad_norm": 27.641182438567807, + "learning_rate": 3.961239844815236e-06, + "loss": 3.1615, + "step": 9132 + }, + { + "epoch": 1.8302605210420841, + "grad_norm": 24.213882408839815, + "learning_rate": 3.960099426286586e-06, + "loss": 2.4621, + "step": 9133 + }, + { + "epoch": 1.8304609218436874, + "grad_norm": 22.874418706584837, + "learning_rate": 3.958959064298865e-06, + "loss": 3.133, + "step": 9134 + }, + { + "epoch": 1.8306613226452906, + "grad_norm": 28.64917664525417, + "learning_rate": 3.957818758914074e-06, + "loss": 2.6875, + "step": 9135 + }, + { + "epoch": 1.8308617234468938, + "grad_norm": 27.763580720443045, + "learning_rate": 3.956678510194219e-06, + "loss": 2.3084, + "step": 9136 + }, + { + "epoch": 1.831062124248497, + "grad_norm": 42.845126887978324, + "learning_rate": 3.955538318201293e-06, + "loss": 2.1975, + "step": 9137 + }, + { + "epoch": 1.8312625250501002, + "grad_norm": 18.398826077245957, + "learning_rate": 3.9543981829972874e-06, + "loss": 2.2074, + "step": 9138 + }, + { + "epoch": 1.8314629258517034, + "grad_norm": 20.79266912875045, + "learning_rate": 3.9532581046441994e-06, + "loss": 2.3326, + "step": 9139 + }, + { + "epoch": 1.8316633266533067, + "grad_norm": 19.59801886282074, + "learning_rate": 3.9521180832040116e-06, + "loss": 2.6823, + "step": 9140 + }, + { + "epoch": 1.8318637274549099, + "grad_norm": 24.295159710995975, + "learning_rate": 3.950978118738712e-06, + "loss": 2.5812, + "step": 9141 + }, + { + "epoch": 1.8320641282565129, + "grad_norm": 30.755869961821436, + "learning_rate": 3.949838211310279e-06, + "loss": 2.952, + "step": 9142 + }, + { + "epoch": 1.8322645290581163, + "grad_norm": 31.73829652330017, + "learning_rate": 3.948698360980694e-06, + "loss": 3.0154, + "step": 9143 + }, + { + "epoch": 1.8324649298597193, + "grad_norm": 25.86769242527332, + "learning_rate": 3.9475585678119305e-06, + "loss": 3.0567, + "step": 9144 + }, + { + "epoch": 1.8326653306613228, + "grad_norm": 25.656295845610156, + "learning_rate": 3.946418831865959e-06, + "loss": 2.6568, + "step": 9145 + }, + { + "epoch": 1.8328657314629258, + "grad_norm": 19.802159275007558, + "learning_rate": 3.945279153204754e-06, + "loss": 3.2283, + "step": 9146 + }, + { + "epoch": 1.8330661322645292, + "grad_norm": 22.137690897477547, + "learning_rate": 3.944139531890277e-06, + "loss": 2.4742, + "step": 9147 + }, + { + "epoch": 1.8332665330661322, + "grad_norm": 69.80770548299144, + "learning_rate": 3.942999967984491e-06, + "loss": 2.5531, + "step": 9148 + }, + { + "epoch": 1.8334669338677356, + "grad_norm": 22.52338439952154, + "learning_rate": 3.941860461549359e-06, + "loss": 2.6574, + "step": 9149 + }, + { + "epoch": 1.8336673346693386, + "grad_norm": 30.53553308318932, + "learning_rate": 3.9407210126468365e-06, + "loss": 2.4176, + "step": 9150 + }, + { + "epoch": 1.8338677354709418, + "grad_norm": 20.055948275980324, + "learning_rate": 3.9395816213388755e-06, + "loss": 2.544, + "step": 9151 + }, + { + "epoch": 1.834068136272545, + "grad_norm": 34.0032428560636, + "learning_rate": 3.938442287687425e-06, + "loss": 2.6683, + "step": 9152 + }, + { + "epoch": 1.8342685370741483, + 
"grad_norm": 52.33763716234161, + "learning_rate": 3.937303011754435e-06, + "loss": 3.1263, + "step": 9153 + }, + { + "epoch": 1.8344689378757515, + "grad_norm": 18.147394752515545, + "learning_rate": 3.936163793601851e-06, + "loss": 2.6976, + "step": 9154 + }, + { + "epoch": 1.8346693386773547, + "grad_norm": 32.180683103045276, + "learning_rate": 3.935024633291608e-06, + "loss": 3.2391, + "step": 9155 + }, + { + "epoch": 1.834869739478958, + "grad_norm": 27.80547963670794, + "learning_rate": 3.933885530885651e-06, + "loss": 2.8715, + "step": 9156 + }, + { + "epoch": 1.8350701402805611, + "grad_norm": 33.95031740497343, + "learning_rate": 3.932746486445912e-06, + "loss": 3.0155, + "step": 9157 + }, + { + "epoch": 1.8352705410821644, + "grad_norm": 20.149944647388843, + "learning_rate": 3.931607500034319e-06, + "loss": 2.385, + "step": 9158 + }, + { + "epoch": 1.8354709418837676, + "grad_norm": 45.23263342432124, + "learning_rate": 3.930468571712806e-06, + "loss": 3.2359, + "step": 9159 + }, + { + "epoch": 1.8356713426853708, + "grad_norm": 27.652287872658558, + "learning_rate": 3.929329701543294e-06, + "loss": 2.5994, + "step": 9160 + }, + { + "epoch": 1.8358717434869738, + "grad_norm": 22.729444008792576, + "learning_rate": 3.928190889587708e-06, + "loss": 2.5692, + "step": 9161 + }, + { + "epoch": 1.8360721442885772, + "grad_norm": 31.650005095942337, + "learning_rate": 3.927052135907963e-06, + "loss": 2.7529, + "step": 9162 + }, + { + "epoch": 1.8362725450901802, + "grad_norm": 23.02630335148611, + "learning_rate": 3.925913440565979e-06, + "loss": 2.4915, + "step": 9163 + }, + { + "epoch": 1.8364729458917837, + "grad_norm": 23.389502307865737, + "learning_rate": 3.924774803623668e-06, + "loss": 2.6833, + "step": 9164 + }, + { + "epoch": 1.8366733466933867, + "grad_norm": 25.532063589773024, + "learning_rate": 3.923636225142935e-06, + "loss": 3.1007, + "step": 9165 + }, + { + "epoch": 1.83687374749499, + "grad_norm": 23.73756183316396, + "learning_rate": 3.9224977051856906e-06, + "loss": 2.77, + "step": 9166 + }, + { + "epoch": 1.837074148296593, + "grad_norm": 32.39553349931393, + "learning_rate": 3.921359243813837e-06, + "loss": 2.6543, + "step": 9167 + }, + { + "epoch": 1.8372745490981965, + "grad_norm": 24.897178373619063, + "learning_rate": 3.920220841089274e-06, + "loss": 2.2073, + "step": 9168 + }, + { + "epoch": 1.8374749498997995, + "grad_norm": 16.63811168795614, + "learning_rate": 3.919082497073896e-06, + "loss": 2.8353, + "step": 9169 + }, + { + "epoch": 1.8376753507014028, + "grad_norm": 30.549988382409833, + "learning_rate": 3.9179442118296e-06, + "loss": 2.8807, + "step": 9170 + }, + { + "epoch": 1.837875751503006, + "grad_norm": 29.96115667604976, + "learning_rate": 3.916805985418274e-06, + "loss": 2.4119, + "step": 9171 + }, + { + "epoch": 1.8380761523046092, + "grad_norm": 34.74075034582922, + "learning_rate": 3.915667817901804e-06, + "loss": 2.8215, + "step": 9172 + }, + { + "epoch": 1.8382765531062124, + "grad_norm": 22.014733824708934, + "learning_rate": 3.914529709342076e-06, + "loss": 2.8254, + "step": 9173 + }, + { + "epoch": 1.8384769539078156, + "grad_norm": 19.3910390042869, + "learning_rate": 3.913391659800971e-06, + "loss": 2.4789, + "step": 9174 + }, + { + "epoch": 1.8386773547094188, + "grad_norm": 25.89590169908528, + "learning_rate": 3.912253669340364e-06, + "loss": 3.0341, + "step": 9175 + }, + { + "epoch": 1.838877755511022, + "grad_norm": 26.680876233544073, + "learning_rate": 3.911115738022133e-06, + "loss": 3.2197, + "step": 9176 + }, + { + "epoch": 
1.8390781563126253, + "grad_norm": 19.499061551033062, + "learning_rate": 3.909977865908146e-06, + "loss": 2.1853, + "step": 9177 + }, + { + "epoch": 1.8392785571142285, + "grad_norm": 22.21101516814532, + "learning_rate": 3.908840053060271e-06, + "loss": 2.8038, + "step": 9178 + }, + { + "epoch": 1.8394789579158317, + "grad_norm": 22.5677524824221, + "learning_rate": 3.9077022995403704e-06, + "loss": 2.7095, + "step": 9179 + }, + { + "epoch": 1.8396793587174347, + "grad_norm": 28.98697106415881, + "learning_rate": 3.9065646054103115e-06, + "loss": 2.5813, + "step": 9180 + }, + { + "epoch": 1.8398797595190381, + "grad_norm": 19.58194081759413, + "learning_rate": 3.905426970731949e-06, + "loss": 2.6381, + "step": 9181 + }, + { + "epoch": 1.8400801603206411, + "grad_norm": 21.975501252248797, + "learning_rate": 3.904289395567136e-06, + "loss": 2.8341, + "step": 9182 + }, + { + "epoch": 1.8402805611222446, + "grad_norm": 19.562081639153497, + "learning_rate": 3.903151879977727e-06, + "loss": 2.3039, + "step": 9183 + }, + { + "epoch": 1.8404809619238476, + "grad_norm": 22.301499926602656, + "learning_rate": 3.90201442402557e-06, + "loss": 3.0368, + "step": 9184 + }, + { + "epoch": 1.840681362725451, + "grad_norm": 25.69792415620008, + "learning_rate": 3.900877027772509e-06, + "loss": 2.9537, + "step": 9185 + }, + { + "epoch": 1.840881763527054, + "grad_norm": 49.97316251321131, + "learning_rate": 3.899739691280385e-06, + "loss": 2.2322, + "step": 9186 + }, + { + "epoch": 1.8410821643286575, + "grad_norm": 26.10897450037583, + "learning_rate": 3.89860241461104e-06, + "loss": 2.6862, + "step": 9187 + }, + { + "epoch": 1.8412825651302605, + "grad_norm": 20.099927842338356, + "learning_rate": 3.897465197826308e-06, + "loss": 2.6231, + "step": 9188 + }, + { + "epoch": 1.841482965931864, + "grad_norm": 23.631710331871105, + "learning_rate": 3.8963280409880186e-06, + "loss": 3.3554, + "step": 9189 + }, + { + "epoch": 1.8416833667334669, + "grad_norm": 16.15209408326775, + "learning_rate": 3.8951909441580044e-06, + "loss": 2.6591, + "step": 9190 + }, + { + "epoch": 1.84188376753507, + "grad_norm": 27.76262663774271, + "learning_rate": 3.894053907398088e-06, + "loss": 2.4577, + "step": 9191 + }, + { + "epoch": 1.8420841683366733, + "grad_norm": 31.102036673314288, + "learning_rate": 3.892916930770093e-06, + "loss": 2.5304, + "step": 9192 + }, + { + "epoch": 1.8422845691382765, + "grad_norm": 25.80390305309671, + "learning_rate": 3.89178001433584e-06, + "loss": 2.7173, + "step": 9193 + }, + { + "epoch": 1.8424849699398798, + "grad_norm": 27.70996161656905, + "learning_rate": 3.890643158157145e-06, + "loss": 2.6633, + "step": 9194 + }, + { + "epoch": 1.842685370741483, + "grad_norm": 27.77009715896719, + "learning_rate": 3.8895063622958176e-06, + "loss": 2.8586, + "step": 9195 + }, + { + "epoch": 1.8428857715430862, + "grad_norm": 31.691496086244427, + "learning_rate": 3.888369626813667e-06, + "loss": 3.0148, + "step": 9196 + }, + { + "epoch": 1.8430861723446894, + "grad_norm": 28.195371739697876, + "learning_rate": 3.887232951772501e-06, + "loss": 2.7586, + "step": 9197 + }, + { + "epoch": 1.8432865731462926, + "grad_norm": 29.561618910997826, + "learning_rate": 3.886096337234124e-06, + "loss": 3.1657, + "step": 9198 + }, + { + "epoch": 1.8434869739478958, + "grad_norm": 43.86006913158467, + "learning_rate": 3.884959783260331e-06, + "loss": 3.1715, + "step": 9199 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 22.728844609072826, + "learning_rate": 3.883823289912923e-06, + "loss": 3.2778, + 
"step": 9200 + }, + { + "epoch": 1.843887775551102, + "grad_norm": 19.30789353784864, + "learning_rate": 3.882686857253689e-06, + "loss": 2.4988, + "step": 9201 + }, + { + "epoch": 1.8440881763527055, + "grad_norm": 37.38440013933444, + "learning_rate": 3.881550485344419e-06, + "loss": 3.1947, + "step": 9202 + }, + { + "epoch": 1.8442885771543085, + "grad_norm": 30.277842875848137, + "learning_rate": 3.880414174246902e-06, + "loss": 2.654, + "step": 9203 + }, + { + "epoch": 1.844488977955912, + "grad_norm": 20.109157496841416, + "learning_rate": 3.8792779240229176e-06, + "loss": 2.4747, + "step": 9204 + }, + { + "epoch": 1.844689378757515, + "grad_norm": 26.295127458667444, + "learning_rate": 3.878141734734249e-06, + "loss": 3.0843, + "step": 9205 + }, + { + "epoch": 1.8448897795591184, + "grad_norm": 18.356327976652373, + "learning_rate": 3.877005606442667e-06, + "loss": 2.4352, + "step": 9206 + }, + { + "epoch": 1.8450901803607214, + "grad_norm": 24.60233831566954, + "learning_rate": 3.87586953920995e-06, + "loss": 2.2692, + "step": 9207 + }, + { + "epoch": 1.8452905811623248, + "grad_norm": 23.715211241353487, + "learning_rate": 3.874733533097866e-06, + "loss": 2.5899, + "step": 9208 + }, + { + "epoch": 1.8454909819639278, + "grad_norm": 36.53577141052951, + "learning_rate": 3.873597588168179e-06, + "loss": 3.0018, + "step": 9209 + }, + { + "epoch": 1.845691382765531, + "grad_norm": 28.374023973165002, + "learning_rate": 3.872461704482654e-06, + "loss": 1.9542, + "step": 9210 + }, + { + "epoch": 1.8458917835671342, + "grad_norm": 24.460521664215683, + "learning_rate": 3.871325882103052e-06, + "loss": 2.355, + "step": 9211 + }, + { + "epoch": 1.8460921843687375, + "grad_norm": 29.305404542690237, + "learning_rate": 3.870190121091128e-06, + "loss": 2.3828, + "step": 9212 + }, + { + "epoch": 1.8462925851703407, + "grad_norm": 27.326676697990592, + "learning_rate": 3.869054421508633e-06, + "loss": 2.586, + "step": 9213 + }, + { + "epoch": 1.846492985971944, + "grad_norm": 26.34196355379908, + "learning_rate": 3.8679187834173195e-06, + "loss": 2.7702, + "step": 9214 + }, + { + "epoch": 1.846693386773547, + "grad_norm": 28.67793939296415, + "learning_rate": 3.866783206878933e-06, + "loss": 3.3942, + "step": 9215 + }, + { + "epoch": 1.8468937875751503, + "grad_norm": 27.08322588636248, + "learning_rate": 3.865647691955216e-06, + "loss": 2.8504, + "step": 9216 + }, + { + "epoch": 1.8470941883767535, + "grad_norm": 37.665250745642446, + "learning_rate": 3.864512238707908e-06, + "loss": 2.9149, + "step": 9217 + }, + { + "epoch": 1.8472945891783568, + "grad_norm": 27.113721704919946, + "learning_rate": 3.8633768471987475e-06, + "loss": 2.3523, + "step": 9218 + }, + { + "epoch": 1.84749498997996, + "grad_norm": 28.004587092198285, + "learning_rate": 3.862241517489464e-06, + "loss": 2.8074, + "step": 9219 + }, + { + "epoch": 1.847695390781563, + "grad_norm": 18.4635813986064, + "learning_rate": 3.8611062496417895e-06, + "loss": 2.4704, + "step": 9220 + }, + { + "epoch": 1.8478957915831664, + "grad_norm": 23.582341776424258, + "learning_rate": 3.85997104371745e-06, + "loss": 3.0427, + "step": 9221 + }, + { + "epoch": 1.8480961923847694, + "grad_norm": 26.839649098552158, + "learning_rate": 3.858835899778166e-06, + "loss": 2.6849, + "step": 9222 + }, + { + "epoch": 1.8482965931863728, + "grad_norm": 21.58691532509327, + "learning_rate": 3.857700817885658e-06, + "loss": 2.8562, + "step": 9223 + }, + { + "epoch": 1.8484969939879758, + "grad_norm": 23.497422268834566, + "learning_rate": 
3.856565798101645e-06, + "loss": 2.8964, + "step": 9224 + }, + { + "epoch": 1.8486973947895793, + "grad_norm": 27.417079382784735, + "learning_rate": 3.855430840487837e-06, + "loss": 3.016, + "step": 9225 + }, + { + "epoch": 1.8488977955911823, + "grad_norm": 17.27083165942011, + "learning_rate": 3.854295945105942e-06, + "loss": 2.6785, + "step": 9226 + }, + { + "epoch": 1.8490981963927857, + "grad_norm": 22.290112401601586, + "learning_rate": 3.85316111201767e-06, + "loss": 2.7855, + "step": 9227 + }, + { + "epoch": 1.8492985971943887, + "grad_norm": 19.830153054988056, + "learning_rate": 3.852026341284719e-06, + "loss": 3.0089, + "step": 9228 + }, + { + "epoch": 1.849498997995992, + "grad_norm": 29.77959487515808, + "learning_rate": 3.850891632968791e-06, + "loss": 2.6019, + "step": 9229 + }, + { + "epoch": 1.8496993987975952, + "grad_norm": 22.25260395003554, + "learning_rate": 3.849756987131582e-06, + "loss": 2.911, + "step": 9230 + }, + { + "epoch": 1.8498997995991984, + "grad_norm": 23.805936426357093, + "learning_rate": 3.848622403834785e-06, + "loss": 2.6169, + "step": 9231 + }, + { + "epoch": 1.8501002004008016, + "grad_norm": 21.70172085332955, + "learning_rate": 3.847487883140087e-06, + "loss": 2.6172, + "step": 9232 + }, + { + "epoch": 1.8503006012024048, + "grad_norm": 25.193206147588533, + "learning_rate": 3.846353425109171e-06, + "loss": 2.5727, + "step": 9233 + }, + { + "epoch": 1.850501002004008, + "grad_norm": 18.981617331142772, + "learning_rate": 3.845219029803724e-06, + "loss": 2.5625, + "step": 9234 + }, + { + "epoch": 1.8507014028056112, + "grad_norm": 22.16855852486101, + "learning_rate": 3.844084697285423e-06, + "loss": 3.0159, + "step": 9235 + }, + { + "epoch": 1.8509018036072145, + "grad_norm": 31.314126686764688, + "learning_rate": 3.842950427615942e-06, + "loss": 3.068, + "step": 9236 + }, + { + "epoch": 1.8511022044088177, + "grad_norm": 18.067047882501967, + "learning_rate": 3.841816220856956e-06, + "loss": 2.4662, + "step": 9237 + }, + { + "epoch": 1.851302605210421, + "grad_norm": 36.16572382053861, + "learning_rate": 3.840682077070132e-06, + "loss": 2.947, + "step": 9238 + }, + { + "epoch": 1.8515030060120239, + "grad_norm": 21.824203023929922, + "learning_rate": 3.839547996317134e-06, + "loss": 2.8249, + "step": 9239 + }, + { + "epoch": 1.8517034068136273, + "grad_norm": 26.32153982929855, + "learning_rate": 3.838413978659624e-06, + "loss": 3.0894, + "step": 9240 + }, + { + "epoch": 1.8519038076152303, + "grad_norm": 44.05395518787516, + "learning_rate": 3.837280024159261e-06, + "loss": 3.3553, + "step": 9241 + }, + { + "epoch": 1.8521042084168338, + "grad_norm": 24.354031881777182, + "learning_rate": 3.8361461328777e-06, + "loss": 3.0089, + "step": 9242 + }, + { + "epoch": 1.8523046092184368, + "grad_norm": 21.76401461417678, + "learning_rate": 3.8350123048765905e-06, + "loss": 2.4718, + "step": 9243 + }, + { + "epoch": 1.8525050100200402, + "grad_norm": 24.845793336690008, + "learning_rate": 3.833878540217583e-06, + "loss": 2.5084, + "step": 9244 + }, + { + "epoch": 1.8527054108216432, + "grad_norm": 18.0661277836902, + "learning_rate": 3.832744838962321e-06, + "loss": 2.5953, + "step": 9245 + }, + { + "epoch": 1.8529058116232466, + "grad_norm": 25.506781939134154, + "learning_rate": 3.8316112011724425e-06, + "loss": 2.7388, + "step": 9246 + }, + { + "epoch": 1.8531062124248496, + "grad_norm": 29.61759089970505, + "learning_rate": 3.830477626909589e-06, + "loss": 2.6517, + "step": 9247 + }, + { + "epoch": 1.853306613226453, + "grad_norm": 
32.44688986092293, + "learning_rate": 3.829344116235394e-06, + "loss": 3.2922, + "step": 9248 + }, + { + "epoch": 1.853507014028056, + "grad_norm": 33.01323776835228, + "learning_rate": 3.8282106692114886e-06, + "loss": 2.6195, + "step": 9249 + }, + { + "epoch": 1.8537074148296593, + "grad_norm": 20.744907162425523, + "learning_rate": 3.8270772858994955e-06, + "loss": 2.2376, + "step": 9250 + }, + { + "epoch": 1.8539078156312625, + "grad_norm": 28.663013954227484, + "learning_rate": 3.825943966361044e-06, + "loss": 2.4945, + "step": 9251 + }, + { + "epoch": 1.8541082164328657, + "grad_norm": 28.745750341354274, + "learning_rate": 3.824810710657751e-06, + "loss": 3.0109, + "step": 9252 + }, + { + "epoch": 1.854308617234469, + "grad_norm": 31.442412771566488, + "learning_rate": 3.823677518851235e-06, + "loss": 2.5413, + "step": 9253 + }, + { + "epoch": 1.8545090180360722, + "grad_norm": 33.11752307636874, + "learning_rate": 3.822544391003109e-06, + "loss": 3.1237, + "step": 9254 + }, + { + "epoch": 1.8547094188376754, + "grad_norm": 26.58025724498709, + "learning_rate": 3.821411327174983e-06, + "loss": 2.8969, + "step": 9255 + }, + { + "epoch": 1.8549098196392786, + "grad_norm": 39.90148561277831, + "learning_rate": 3.8202783274284635e-06, + "loss": 2.8196, + "step": 9256 + }, + { + "epoch": 1.8551102204408818, + "grad_norm": 35.1075671626961, + "learning_rate": 3.8191453918251494e-06, + "loss": 3.1684, + "step": 9257 + }, + { + "epoch": 1.855310621242485, + "grad_norm": 25.04697603222754, + "learning_rate": 3.8180125204266465e-06, + "loss": 2.6737, + "step": 9258 + }, + { + "epoch": 1.8555110220440882, + "grad_norm": 26.14088675945343, + "learning_rate": 3.816879713294547e-06, + "loss": 2.582, + "step": 9259 + }, + { + "epoch": 1.8557114228456912, + "grad_norm": 26.059302969977004, + "learning_rate": 3.815746970490442e-06, + "loss": 2.8411, + "step": 9260 + }, + { + "epoch": 1.8559118236472947, + "grad_norm": 20.323256607726726, + "learning_rate": 3.8146142920759243e-06, + "loss": 2.4224, + "step": 9261 + }, + { + "epoch": 1.8561122244488977, + "grad_norm": 26.15589336488862, + "learning_rate": 3.813481678112578e-06, + "loss": 2.9687, + "step": 9262 + }, + { + "epoch": 1.8563126252505011, + "grad_norm": 23.64660742152888, + "learning_rate": 3.812349128661982e-06, + "loss": 2.3181, + "step": 9263 + }, + { + "epoch": 1.856513026052104, + "grad_norm": 29.96172682805982, + "learning_rate": 3.811216643785719e-06, + "loss": 2.7822, + "step": 9264 + }, + { + "epoch": 1.8567134268537075, + "grad_norm": 26.462676272481215, + "learning_rate": 3.810084223545361e-06, + "loss": 2.6237, + "step": 9265 + }, + { + "epoch": 1.8569138276553105, + "grad_norm": 29.251565449152977, + "learning_rate": 3.808951868002481e-06, + "loss": 2.7868, + "step": 9266 + }, + { + "epoch": 1.857114228456914, + "grad_norm": 25.42425479452928, + "learning_rate": 3.8078195772186446e-06, + "loss": 2.139, + "step": 9267 + }, + { + "epoch": 1.857314629258517, + "grad_norm": 30.216573407565683, + "learning_rate": 3.8066873512554193e-06, + "loss": 2.5398, + "step": 9268 + }, + { + "epoch": 1.8575150300601202, + "grad_norm": 35.54506579023756, + "learning_rate": 3.805555190174364e-06, + "loss": 2.6281, + "step": 9269 + }, + { + "epoch": 1.8577154308617234, + "grad_norm": 26.22999370691051, + "learning_rate": 3.804423094037034e-06, + "loss": 2.8934, + "step": 9270 + }, + { + "epoch": 1.8579158316633266, + "grad_norm": 23.92367033032665, + "learning_rate": 3.803291062904988e-06, + "loss": 2.7084, + "step": 9271 + }, + { + "epoch": 
1.8581162324649299, + "grad_norm": 19.7039395205273, + "learning_rate": 3.802159096839772e-06, + "loss": 2.6841, + "step": 9272 + }, + { + "epoch": 1.858316633266533, + "grad_norm": 26.126968371309356, + "learning_rate": 3.801027195902933e-06, + "loss": 2.5696, + "step": 9273 + }, + { + "epoch": 1.8585170340681363, + "grad_norm": 22.581464006155024, + "learning_rate": 3.7998953601560175e-06, + "loss": 2.6259, + "step": 9274 + }, + { + "epoch": 1.8587174348697395, + "grad_norm": 17.645131259379678, + "learning_rate": 3.7987635896605624e-06, + "loss": 2.4057, + "step": 9275 + }, + { + "epoch": 1.8589178356713427, + "grad_norm": 26.99664250707423, + "learning_rate": 3.7976318844781043e-06, + "loss": 2.7668, + "step": 9276 + }, + { + "epoch": 1.859118236472946, + "grad_norm": 16.9704598960888, + "learning_rate": 3.7965002446701733e-06, + "loss": 2.673, + "step": 9277 + }, + { + "epoch": 1.8593186372745492, + "grad_norm": 29.993064443991713, + "learning_rate": 3.795368670298302e-06, + "loss": 3.0552, + "step": 9278 + }, + { + "epoch": 1.8595190380761522, + "grad_norm": 27.620771850084793, + "learning_rate": 3.7942371614240146e-06, + "loss": 2.8597, + "step": 9279 + }, + { + "epoch": 1.8597194388777556, + "grad_norm": 31.155476081434706, + "learning_rate": 3.793105718108831e-06, + "loss": 2.655, + "step": 9280 + }, + { + "epoch": 1.8599198396793586, + "grad_norm": 49.85672209601709, + "learning_rate": 3.7919743404142728e-06, + "loss": 2.8587, + "step": 9281 + }, + { + "epoch": 1.860120240480962, + "grad_norm": 19.40461987118391, + "learning_rate": 3.790843028401853e-06, + "loss": 2.791, + "step": 9282 + }, + { + "epoch": 1.860320641282565, + "grad_norm": 29.452410624043612, + "learning_rate": 3.789711782133082e-06, + "loss": 2.9437, + "step": 9283 + }, + { + "epoch": 1.8605210420841685, + "grad_norm": 28.390855889763458, + "learning_rate": 3.788580601669468e-06, + "loss": 2.5413, + "step": 9284 + }, + { + "epoch": 1.8607214428857715, + "grad_norm": 22.66347153553687, + "learning_rate": 3.787449487072514e-06, + "loss": 3.1277, + "step": 9285 + }, + { + "epoch": 1.860921843687375, + "grad_norm": 33.457421034321605, + "learning_rate": 3.7863184384037233e-06, + "loss": 2.8843, + "step": 9286 + }, + { + "epoch": 1.861122244488978, + "grad_norm": 38.340537845050584, + "learning_rate": 3.785187455724588e-06, + "loss": 2.3493, + "step": 9287 + }, + { + "epoch": 1.8613226452905811, + "grad_norm": 19.836108396176854, + "learning_rate": 3.7840565390966066e-06, + "loss": 2.6112, + "step": 9288 + }, + { + "epoch": 1.8615230460921843, + "grad_norm": 21.721512909562932, + "learning_rate": 3.782925688581266e-06, + "loss": 3.0624, + "step": 9289 + }, + { + "epoch": 1.8617234468937875, + "grad_norm": 23.109109849309153, + "learning_rate": 3.7817949042400497e-06, + "loss": 3.2192, + "step": 9290 + }, + { + "epoch": 1.8619238476953908, + "grad_norm": 19.661745999769295, + "learning_rate": 3.780664186134444e-06, + "loss": 2.1304, + "step": 9291 + }, + { + "epoch": 1.862124248496994, + "grad_norm": 24.794103560032013, + "learning_rate": 3.7795335343259276e-06, + "loss": 2.3461, + "step": 9292 + }, + { + "epoch": 1.8623246492985972, + "grad_norm": 30.93904140706363, + "learning_rate": 3.778402948875974e-06, + "loss": 3.2171, + "step": 9293 + }, + { + "epoch": 1.8625250501002004, + "grad_norm": 19.544387642517368, + "learning_rate": 3.7772724298460534e-06, + "loss": 2.8602, + "step": 9294 + }, + { + "epoch": 1.8627254509018036, + "grad_norm": 26.41233796085768, + "learning_rate": 3.7761419772976383e-06, + "loss": 
2.5078, + "step": 9295 + }, + { + "epoch": 1.8629258517034069, + "grad_norm": 33.15458979362211, + "learning_rate": 3.775011591292189e-06, + "loss": 2.7319, + "step": 9296 + }, + { + "epoch": 1.86312625250501, + "grad_norm": 19.80449396794417, + "learning_rate": 3.7738812718911675e-06, + "loss": 2.6325, + "step": 9297 + }, + { + "epoch": 1.863326653306613, + "grad_norm": 19.192018133045497, + "learning_rate": 3.772751019156031e-06, + "loss": 2.8092, + "step": 9298 + }, + { + "epoch": 1.8635270541082165, + "grad_norm": 28.62146492173943, + "learning_rate": 3.771620833148235e-06, + "loss": 2.9021, + "step": 9299 + }, + { + "epoch": 1.8637274549098195, + "grad_norm": 30.649074812081043, + "learning_rate": 3.7704907139292276e-06, + "loss": 3.1492, + "step": 9300 + }, + { + "epoch": 1.863927855711423, + "grad_norm": 28.99662151929248, + "learning_rate": 3.7693606615604527e-06, + "loss": 2.5869, + "step": 9301 + }, + { + "epoch": 1.864128256513026, + "grad_norm": 22.39285205643776, + "learning_rate": 3.768230676103357e-06, + "loss": 2.2196, + "step": 9302 + }, + { + "epoch": 1.8643286573146294, + "grad_norm": 27.374772460671927, + "learning_rate": 3.7671007576193767e-06, + "loss": 2.5261, + "step": 9303 + }, + { + "epoch": 1.8645290581162324, + "grad_norm": 25.739538550310943, + "learning_rate": 3.765970906169947e-06, + "loss": 2.6829, + "step": 9304 + }, + { + "epoch": 1.8647294589178358, + "grad_norm": 21.24453229205467, + "learning_rate": 3.7648411218165033e-06, + "loss": 2.8068, + "step": 9305 + }, + { + "epoch": 1.8649298597194388, + "grad_norm": 21.764761867589993, + "learning_rate": 3.76371140462047e-06, + "loss": 2.2929, + "step": 9306 + }, + { + "epoch": 1.8651302605210422, + "grad_norm": 23.396831205483736, + "learning_rate": 3.762581754643272e-06, + "loss": 3.0724, + "step": 9307 + }, + { + "epoch": 1.8653306613226452, + "grad_norm": 21.00472092654395, + "learning_rate": 3.761452171946332e-06, + "loss": 2.2665, + "step": 9308 + }, + { + "epoch": 1.8655310621242485, + "grad_norm": 23.278991393286816, + "learning_rate": 3.7603226565910645e-06, + "loss": 2.666, + "step": 9309 + }, + { + "epoch": 1.8657314629258517, + "grad_norm": 23.35907406151829, + "learning_rate": 3.759193208638885e-06, + "loss": 2.9391, + "step": 9310 + }, + { + "epoch": 1.865931863727455, + "grad_norm": 28.672809744395064, + "learning_rate": 3.7580638281512006e-06, + "loss": 2.9773, + "step": 9311 + }, + { + "epoch": 1.8661322645290581, + "grad_norm": 27.291144738089823, + "learning_rate": 3.756934515189421e-06, + "loss": 3.1527, + "step": 9312 + }, + { + "epoch": 1.8663326653306613, + "grad_norm": 30.388865744420393, + "learning_rate": 3.7558052698149474e-06, + "loss": 2.8201, + "step": 9313 + }, + { + "epoch": 1.8665330661322646, + "grad_norm": 28.757826482596908, + "learning_rate": 3.7546760920891747e-06, + "loss": 2.6384, + "step": 9314 + }, + { + "epoch": 1.8667334669338678, + "grad_norm": 23.124947373152427, + "learning_rate": 3.753546982073504e-06, + "loss": 2.4174, + "step": 9315 + }, + { + "epoch": 1.866933867735471, + "grad_norm": 21.668082948703113, + "learning_rate": 3.7524179398293226e-06, + "loss": 3.091, + "step": 9316 + }, + { + "epoch": 1.867134268537074, + "grad_norm": 25.534583403384193, + "learning_rate": 3.751288965418018e-06, + "loss": 3.12, + "step": 9317 + }, + { + "epoch": 1.8673346693386774, + "grad_norm": 23.912779087348813, + "learning_rate": 3.7501600589009778e-06, + "loss": 2.7798, + "step": 9318 + }, + { + "epoch": 1.8675350701402804, + "grad_norm": 21.003901382837597, + 
"learning_rate": 3.7490312203395808e-06, + "loss": 2.7608, + "step": 9319 + }, + { + "epoch": 1.8677354709418839, + "grad_norm": 25.42277252756243, + "learning_rate": 3.747902449795202e-06, + "loss": 3.2824, + "step": 9320 + }, + { + "epoch": 1.8679358717434869, + "grad_norm": 28.199942709985013, + "learning_rate": 3.7467737473292138e-06, + "loss": 2.8428, + "step": 9321 + }, + { + "epoch": 1.8681362725450903, + "grad_norm": 25.312556949410574, + "learning_rate": 3.745645113002988e-06, + "loss": 2.4575, + "step": 9322 + }, + { + "epoch": 1.8683366733466933, + "grad_norm": 22.241654667227827, + "learning_rate": 3.74451654687789e-06, + "loss": 3.1932, + "step": 9323 + }, + { + "epoch": 1.8685370741482967, + "grad_norm": 23.922631717858234, + "learning_rate": 3.743388049015278e-06, + "loss": 2.6467, + "step": 9324 + }, + { + "epoch": 1.8687374749498997, + "grad_norm": 44.34245011751712, + "learning_rate": 3.742259619476515e-06, + "loss": 3.104, + "step": 9325 + }, + { + "epoch": 1.8689378757515032, + "grad_norm": 18.93757552313333, + "learning_rate": 3.741131258322953e-06, + "loss": 2.5092, + "step": 9326 + }, + { + "epoch": 1.8691382765531062, + "grad_norm": 22.61652264604468, + "learning_rate": 3.740002965615942e-06, + "loss": 2.8029, + "step": 9327 + }, + { + "epoch": 1.8693386773547094, + "grad_norm": 13.378801526168726, + "learning_rate": 3.7388747414168296e-06, + "loss": 2.7522, + "step": 9328 + }, + { + "epoch": 1.8695390781563126, + "grad_norm": 24.70939363702203, + "learning_rate": 3.7377465857869594e-06, + "loss": 2.8475, + "step": 9329 + }, + { + "epoch": 1.8697394789579158, + "grad_norm": 20.236535927444056, + "learning_rate": 3.7366184987876715e-06, + "loss": 2.2502, + "step": 9330 + }, + { + "epoch": 1.869939879759519, + "grad_norm": 24.965301360237326, + "learning_rate": 3.7354904804802984e-06, + "loss": 2.1735, + "step": 9331 + }, + { + "epoch": 1.8701402805611222, + "grad_norm": 23.82723900734521, + "learning_rate": 3.7343625309261767e-06, + "loss": 2.4824, + "step": 9332 + }, + { + "epoch": 1.8703406813627255, + "grad_norm": 18.095591426978892, + "learning_rate": 3.7332346501866325e-06, + "loss": 2.3911, + "step": 9333 + }, + { + "epoch": 1.8705410821643287, + "grad_norm": 29.10395593110118, + "learning_rate": 3.7321068383229875e-06, + "loss": 2.7503, + "step": 9334 + }, + { + "epoch": 1.870741482965932, + "grad_norm": 22.83093079423333, + "learning_rate": 3.7309790953965673e-06, + "loss": 2.5647, + "step": 9335 + }, + { + "epoch": 1.8709418837675351, + "grad_norm": 21.192285094570767, + "learning_rate": 3.729851421468688e-06, + "loss": 2.8047, + "step": 9336 + }, + { + "epoch": 1.8711422845691383, + "grad_norm": 36.859135889355045, + "learning_rate": 3.728723816600661e-06, + "loss": 3.0356, + "step": 9337 + }, + { + "epoch": 1.8713426853707413, + "grad_norm": 40.37393386355088, + "learning_rate": 3.7275962808537946e-06, + "loss": 3.2847, + "step": 9338 + }, + { + "epoch": 1.8715430861723448, + "grad_norm": 22.106949586518525, + "learning_rate": 3.7264688142893994e-06, + "loss": 2.3628, + "step": 9339 + }, + { + "epoch": 1.8717434869739478, + "grad_norm": 33.18715911400323, + "learning_rate": 3.7253414169687736e-06, + "loss": 3.2959, + "step": 9340 + }, + { + "epoch": 1.8719438877755512, + "grad_norm": 24.999758852563648, + "learning_rate": 3.7242140889532163e-06, + "loss": 3.0133, + "step": 9341 + }, + { + "epoch": 1.8721442885771542, + "grad_norm": 22.445021726611703, + "learning_rate": 3.7230868303040224e-06, + "loss": 2.6556, + "step": 9342 + }, + { + "epoch": 
1.8723446893787576, + "grad_norm": 17.90158660953204, + "learning_rate": 3.7219596410824834e-06, + "loss": 2.3449, + "step": 9343 + }, + { + "epoch": 1.8725450901803606, + "grad_norm": 24.510174860757026, + "learning_rate": 3.720832521349885e-06, + "loss": 2.3989, + "step": 9344 + }, + { + "epoch": 1.872745490981964, + "grad_norm": 26.25910135306248, + "learning_rate": 3.7197054711675086e-06, + "loss": 2.9439, + "step": 9345 + }, + { + "epoch": 1.872945891783567, + "grad_norm": 28.848355494251255, + "learning_rate": 3.7185784905966377e-06, + "loss": 2.9594, + "step": 9346 + }, + { + "epoch": 1.8731462925851703, + "grad_norm": 27.62352850299303, + "learning_rate": 3.717451579698545e-06, + "loss": 2.7365, + "step": 9347 + }, + { + "epoch": 1.8733466933867735, + "grad_norm": 24.32146827695477, + "learning_rate": 3.716324738534502e-06, + "loss": 2.0332, + "step": 9348 + }, + { + "epoch": 1.8735470941883767, + "grad_norm": 23.75126716342703, + "learning_rate": 3.71519796716578e-06, + "loss": 2.8008, + "step": 9349 + }, + { + "epoch": 1.87374749498998, + "grad_norm": 19.615315018164587, + "learning_rate": 3.714071265653641e-06, + "loss": 2.4798, + "step": 9350 + }, + { + "epoch": 1.8739478957915832, + "grad_norm": 17.542578349336434, + "learning_rate": 3.7129446340593435e-06, + "loss": 2.2345, + "step": 9351 + }, + { + "epoch": 1.8741482965931864, + "grad_norm": 28.850546854796878, + "learning_rate": 3.7118180724441478e-06, + "loss": 2.8655, + "step": 9352 + }, + { + "epoch": 1.8743486973947896, + "grad_norm": 26.824680667135837, + "learning_rate": 3.7106915808693047e-06, + "loss": 2.4042, + "step": 9353 + }, + { + "epoch": 1.8745490981963928, + "grad_norm": 27.375158414176358, + "learning_rate": 3.7095651593960637e-06, + "loss": 2.3169, + "step": 9354 + }, + { + "epoch": 1.874749498997996, + "grad_norm": 24.261468770044374, + "learning_rate": 3.708438808085668e-06, + "loss": 2.6109, + "step": 9355 + }, + { + "epoch": 1.8749498997995993, + "grad_norm": 30.224545540891725, + "learning_rate": 3.7073125269993637e-06, + "loss": 2.7004, + "step": 9356 + }, + { + "epoch": 1.8751503006012022, + "grad_norm": 36.25926943713081, + "learning_rate": 3.7061863161983845e-06, + "loss": 2.6647, + "step": 9357 + }, + { + "epoch": 1.8753507014028057, + "grad_norm": 23.135854791474742, + "learning_rate": 3.7050601757439634e-06, + "loss": 2.8555, + "step": 9358 + }, + { + "epoch": 1.8755511022044087, + "grad_norm": 20.82498241204493, + "learning_rate": 3.703934105697333e-06, + "loss": 2.5897, + "step": 9359 + }, + { + "epoch": 1.8757515030060121, + "grad_norm": 25.61891056474431, + "learning_rate": 3.7028081061197195e-06, + "loss": 2.697, + "step": 9360 + }, + { + "epoch": 1.8759519038076151, + "grad_norm": 55.7529483028699, + "learning_rate": 3.7016821770723422e-06, + "loss": 2.8085, + "step": 9361 + }, + { + "epoch": 1.8761523046092186, + "grad_norm": 20.23301117937424, + "learning_rate": 3.700556318616424e-06, + "loss": 3.1413, + "step": 9362 + }, + { + "epoch": 1.8763527054108216, + "grad_norm": 24.451693851171626, + "learning_rate": 3.6994305308131763e-06, + "loss": 3.4017, + "step": 9363 + }, + { + "epoch": 1.876553106212425, + "grad_norm": 24.183584571521912, + "learning_rate": 3.6983048137238098e-06, + "loss": 2.3102, + "step": 9364 + }, + { + "epoch": 1.876753507014028, + "grad_norm": 31.90661566290373, + "learning_rate": 3.6971791674095314e-06, + "loss": 2.7089, + "step": 9365 + }, + { + "epoch": 1.8769539078156314, + "grad_norm": 23.978520237092944, + "learning_rate": 3.6960535919315456e-06, + "loss": 
2.8206, + "step": 9366 + }, + { + "epoch": 1.8771543086172344, + "grad_norm": 19.443701419711296, + "learning_rate": 3.6949280873510506e-06, + "loss": 2.4089, + "step": 9367 + }, + { + "epoch": 1.8773547094188376, + "grad_norm": 53.33278099323826, + "learning_rate": 3.693802653729241e-06, + "loss": 2.2683, + "step": 9368 + }, + { + "epoch": 1.8775551102204409, + "grad_norm": 27.762542214874482, + "learning_rate": 3.6926772911273113e-06, + "loss": 2.5547, + "step": 9369 + }, + { + "epoch": 1.877755511022044, + "grad_norm": 27.109821288359196, + "learning_rate": 3.6915519996064464e-06, + "loss": 2.9851, + "step": 9370 + }, + { + "epoch": 1.8779559118236473, + "grad_norm": 18.499359476747134, + "learning_rate": 3.6904267792278304e-06, + "loss": 3.0464, + "step": 9371 + }, + { + "epoch": 1.8781563126252505, + "grad_norm": 30.26889847225129, + "learning_rate": 3.689301630052642e-06, + "loss": 2.7573, + "step": 9372 + }, + { + "epoch": 1.8783567134268537, + "grad_norm": 25.13590233688482, + "learning_rate": 3.6881765521420607e-06, + "loss": 2.5632, + "step": 9373 + }, + { + "epoch": 1.878557114228457, + "grad_norm": 45.98384730697394, + "learning_rate": 3.687051545557257e-06, + "loss": 2.838, + "step": 9374 + }, + { + "epoch": 1.8787575150300602, + "grad_norm": 21.923373725898895, + "learning_rate": 3.6859266103593964e-06, + "loss": 2.8639, + "step": 9375 + }, + { + "epoch": 1.8789579158316632, + "grad_norm": 39.981808872737915, + "learning_rate": 3.6848017466096486e-06, + "loss": 3.0536, + "step": 9376 + }, + { + "epoch": 1.8791583166332666, + "grad_norm": 44.28494903015051, + "learning_rate": 3.6836769543691696e-06, + "loss": 2.2533, + "step": 9377 + }, + { + "epoch": 1.8793587174348696, + "grad_norm": 29.67059158940577, + "learning_rate": 3.682552233699118e-06, + "loss": 3.2051, + "step": 9378 + }, + { + "epoch": 1.879559118236473, + "grad_norm": 24.339185371796756, + "learning_rate": 3.6814275846606456e-06, + "loss": 2.7591, + "step": 9379 + }, + { + "epoch": 1.879759519038076, + "grad_norm": 27.281205968582388, + "learning_rate": 3.6803030073149037e-06, + "loss": 2.2365, + "step": 9380 + }, + { + "epoch": 1.8799599198396795, + "grad_norm": 27.196951415451046, + "learning_rate": 3.6791785017230346e-06, + "loss": 2.96, + "step": 9381 + }, + { + "epoch": 1.8801603206412825, + "grad_norm": 26.595485939691855, + "learning_rate": 3.6780540679461785e-06, + "loss": 2.7545, + "step": 9382 + }, + { + "epoch": 1.880360721442886, + "grad_norm": 34.66486018246985, + "learning_rate": 3.676929706045476e-06, + "loss": 3.2156, + "step": 9383 + }, + { + "epoch": 1.880561122244489, + "grad_norm": 21.06362317111165, + "learning_rate": 3.6758054160820574e-06, + "loss": 2.5681, + "step": 9384 + }, + { + "epoch": 1.8807615230460923, + "grad_norm": 21.67509514433812, + "learning_rate": 3.6746811981170514e-06, + "loss": 2.8632, + "step": 9385 + }, + { + "epoch": 1.8809619238476953, + "grad_norm": 58.39659300248909, + "learning_rate": 3.6735570522115883e-06, + "loss": 2.6956, + "step": 9386 + }, + { + "epoch": 1.8811623246492986, + "grad_norm": 39.52861423090298, + "learning_rate": 3.6724329784267854e-06, + "loss": 2.7991, + "step": 9387 + }, + { + "epoch": 1.8813627254509018, + "grad_norm": 25.175798646901903, + "learning_rate": 3.6713089768237584e-06, + "loss": 2.4109, + "step": 9388 + }, + { + "epoch": 1.881563126252505, + "grad_norm": 17.550775507896958, + "learning_rate": 3.670185047463627e-06, + "loss": 2.6347, + "step": 9389 + }, + { + "epoch": 1.8817635270541082, + "grad_norm": 16.916348145125088, + 
"learning_rate": 3.6690611904074963e-06, + "loss": 2.4008, + "step": 9390 + }, + { + "epoch": 1.8819639278557114, + "grad_norm": 33.90223060105866, + "learning_rate": 3.667937405716474e-06, + "loss": 2.5087, + "step": 9391 + }, + { + "epoch": 1.8821643286573146, + "grad_norm": 42.01779948970801, + "learning_rate": 3.6668136934516598e-06, + "loss": 2.745, + "step": 9392 + }, + { + "epoch": 1.8823647294589179, + "grad_norm": 32.33104410662434, + "learning_rate": 3.6656900536741545e-06, + "loss": 2.7284, + "step": 9393 + }, + { + "epoch": 1.882565130260521, + "grad_norm": 22.018130850584704, + "learning_rate": 3.6645664864450504e-06, + "loss": 2.6523, + "step": 9394 + }, + { + "epoch": 1.8827655310621243, + "grad_norm": 19.699724108413722, + "learning_rate": 3.663442991825436e-06, + "loss": 2.4757, + "step": 9395 + }, + { + "epoch": 1.8829659318637275, + "grad_norm": 26.268810642548402, + "learning_rate": 3.6623195698764015e-06, + "loss": 2.5415, + "step": 9396 + }, + { + "epoch": 1.8831663326653305, + "grad_norm": 27.734064322520293, + "learning_rate": 3.6611962206590247e-06, + "loss": 2.425, + "step": 9397 + }, + { + "epoch": 1.883366733466934, + "grad_norm": 38.803749468732434, + "learning_rate": 3.6600729442343876e-06, + "loss": 3.1571, + "step": 9398 + }, + { + "epoch": 1.883567134268537, + "grad_norm": 29.97265755140529, + "learning_rate": 3.65894974066356e-06, + "loss": 2.8201, + "step": 9399 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 27.479181830282545, + "learning_rate": 3.6578266100076166e-06, + "loss": 2.6362, + "step": 9400 + }, + { + "epoch": 1.8839679358717434, + "grad_norm": 27.795906747096893, + "learning_rate": 3.6567035523276216e-06, + "loss": 3.1238, + "step": 9401 + }, + { + "epoch": 1.8841683366733468, + "grad_norm": 23.330899949658303, + "learning_rate": 3.6555805676846353e-06, + "loss": 2.2234, + "step": 9402 + }, + { + "epoch": 1.8843687374749498, + "grad_norm": 36.637361659450015, + "learning_rate": 3.654457656139719e-06, + "loss": 3.043, + "step": 9403 + }, + { + "epoch": 1.8845691382765533, + "grad_norm": 18.34696350060476, + "learning_rate": 3.6533348177539273e-06, + "loss": 1.8281, + "step": 9404 + }, + { + "epoch": 1.8847695390781563, + "grad_norm": 26.275536797136986, + "learning_rate": 3.6522120525883076e-06, + "loss": 2.1961, + "step": 9405 + }, + { + "epoch": 1.8849699398797595, + "grad_norm": 22.040137154978165, + "learning_rate": 3.6510893607039094e-06, + "loss": 2.9226, + "step": 9406 + }, + { + "epoch": 1.8851703406813627, + "grad_norm": 22.850205415204066, + "learning_rate": 3.6499667421617745e-06, + "loss": 2.5894, + "step": 9407 + }, + { + "epoch": 1.885370741482966, + "grad_norm": 20.87996278446417, + "learning_rate": 3.648844197022939e-06, + "loss": 2.8296, + "step": 9408 + }, + { + "epoch": 1.8855711422845691, + "grad_norm": 19.814186072050145, + "learning_rate": 3.64772172534844e-06, + "loss": 2.6833, + "step": 9409 + }, + { + "epoch": 1.8857715430861723, + "grad_norm": 28.718319582688054, + "learning_rate": 3.646599327199306e-06, + "loss": 2.8531, + "step": 9410 + }, + { + "epoch": 1.8859719438877756, + "grad_norm": 20.20022551527319, + "learning_rate": 3.645477002636566e-06, + "loss": 3.0397, + "step": 9411 + }, + { + "epoch": 1.8861723446893788, + "grad_norm": 40.11615614462388, + "learning_rate": 3.6443547517212385e-06, + "loss": 2.8415, + "step": 9412 + }, + { + "epoch": 1.886372745490982, + "grad_norm": 32.5044041797229, + "learning_rate": 3.643232574514347e-06, + "loss": 2.5314, + "step": 9413 + }, + { + "epoch": 
1.8865731462925852, + "grad_norm": 17.70604510850068, + "learning_rate": 3.6421104710769023e-06, + "loss": 2.5057, + "step": 9414 + }, + { + "epoch": 1.8867735470941884, + "grad_norm": 27.10114845601699, + "learning_rate": 3.6409884414699147e-06, + "loss": 2.9191, + "step": 9415 + }, + { + "epoch": 1.8869739478957914, + "grad_norm": 26.390017269275198, + "learning_rate": 3.6398664857543908e-06, + "loss": 2.6915, + "step": 9416 + }, + { + "epoch": 1.8871743486973949, + "grad_norm": 35.840381414221355, + "learning_rate": 3.6387446039913366e-06, + "loss": 2.5442, + "step": 9417 + }, + { + "epoch": 1.8873747494989979, + "grad_norm": 26.89238613835575, + "learning_rate": 3.6376227962417464e-06, + "loss": 2.8094, + "step": 9418 + }, + { + "epoch": 1.8875751503006013, + "grad_norm": 29.417938708551482, + "learning_rate": 3.6365010625666147e-06, + "loss": 3.3229, + "step": 9419 + }, + { + "epoch": 1.8877755511022043, + "grad_norm": 34.13144307983102, + "learning_rate": 3.635379403026935e-06, + "loss": 2.3257, + "step": 9420 + }, + { + "epoch": 1.8879759519038077, + "grad_norm": 34.15844415372376, + "learning_rate": 3.63425781768369e-06, + "loss": 2.5083, + "step": 9421 + }, + { + "epoch": 1.8881763527054107, + "grad_norm": 22.923350352531273, + "learning_rate": 3.6331363065978636e-06, + "loss": 2.8659, + "step": 9422 + }, + { + "epoch": 1.8883767535070142, + "grad_norm": 29.43951478196781, + "learning_rate": 3.6320148698304342e-06, + "loss": 2.6375, + "step": 9423 + }, + { + "epoch": 1.8885771543086172, + "grad_norm": 25.051590132547446, + "learning_rate": 3.630893507442376e-06, + "loss": 2.4442, + "step": 9424 + }, + { + "epoch": 1.8887775551102206, + "grad_norm": 26.954308829385955, + "learning_rate": 3.6297722194946593e-06, + "loss": 2.82, + "step": 9425 + }, + { + "epoch": 1.8889779559118236, + "grad_norm": 27.214387356277044, + "learning_rate": 3.628651006048248e-06, + "loss": 2.9267, + "step": 9426 + }, + { + "epoch": 1.8891783567134268, + "grad_norm": 23.524009888081462, + "learning_rate": 3.6275298671641067e-06, + "loss": 2.7157, + "step": 9427 + }, + { + "epoch": 1.88937875751503, + "grad_norm": 27.113125815108095, + "learning_rate": 3.626408802903193e-06, + "loss": 2.5365, + "step": 9428 + }, + { + "epoch": 1.8895791583166333, + "grad_norm": 22.730244641459937, + "learning_rate": 3.625287813326458e-06, + "loss": 2.1992, + "step": 9429 + }, + { + "epoch": 1.8897795591182365, + "grad_norm": 24.236549541848678, + "learning_rate": 3.6241668984948565e-06, + "loss": 2.167, + "step": 9430 + }, + { + "epoch": 1.8899799599198397, + "grad_norm": 23.635023902666738, + "learning_rate": 3.6230460584693315e-06, + "loss": 3.124, + "step": 9431 + }, + { + "epoch": 1.890180360721443, + "grad_norm": 33.09000975227695, + "learning_rate": 3.6219252933108227e-06, + "loss": 2.9367, + "step": 9432 + }, + { + "epoch": 1.8903807615230461, + "grad_norm": 27.423197867032894, + "learning_rate": 3.6208046030802716e-06, + "loss": 2.5927, + "step": 9433 + }, + { + "epoch": 1.8905811623246493, + "grad_norm": 26.612289384551893, + "learning_rate": 3.61968398783861e-06, + "loss": 2.4803, + "step": 9434 + }, + { + "epoch": 1.8907815631262523, + "grad_norm": 21.232951091893977, + "learning_rate": 3.618563447646768e-06, + "loss": 2.8935, + "step": 9435 + }, + { + "epoch": 1.8909819639278558, + "grad_norm": 38.30254166832688, + "learning_rate": 3.6174429825656687e-06, + "loss": 2.293, + "step": 9436 + }, + { + "epoch": 1.8911823647294588, + "grad_norm": 26.15906047591758, + "learning_rate": 3.6163225926562373e-06, + "loss": 
2.7605, + "step": 9437 + }, + { + "epoch": 1.8913827655310622, + "grad_norm": 21.93210041299801, + "learning_rate": 3.6152022779793884e-06, + "loss": 2.6384, + "step": 9438 + }, + { + "epoch": 1.8915831663326652, + "grad_norm": 48.8641853126596, + "learning_rate": 3.6140820385960348e-06, + "loss": 2.5321, + "step": 9439 + }, + { + "epoch": 1.8917835671342687, + "grad_norm": 25.296641323757488, + "learning_rate": 3.612961874567088e-06, + "loss": 2.7713, + "step": 9440 + }, + { + "epoch": 1.8919839679358716, + "grad_norm": 23.582393471002497, + "learning_rate": 3.6118417859534505e-06, + "loss": 3.1529, + "step": 9441 + }, + { + "epoch": 1.892184368737475, + "grad_norm": 25.606453017172672, + "learning_rate": 3.610721772816025e-06, + "loss": 2.9386, + "step": 9442 + }, + { + "epoch": 1.892384769539078, + "grad_norm": 22.588845699683574, + "learning_rate": 3.609601835215706e-06, + "loss": 3.1102, + "step": 9443 + }, + { + "epoch": 1.8925851703406815, + "grad_norm": 43.50815753529965, + "learning_rate": 3.608481973213389e-06, + "loss": 2.9065, + "step": 9444 + }, + { + "epoch": 1.8927855711422845, + "grad_norm": 32.056009521408015, + "learning_rate": 3.607362186869962e-06, + "loss": 2.9682, + "step": 9445 + }, + { + "epoch": 1.8929859719438877, + "grad_norm": 22.664965916158728, + "learning_rate": 3.606242476246306e-06, + "loss": 2.7485, + "step": 9446 + }, + { + "epoch": 1.893186372745491, + "grad_norm": 26.726964313648626, + "learning_rate": 3.6051228414033056e-06, + "loss": 3.1794, + "step": 9447 + }, + { + "epoch": 1.8933867735470942, + "grad_norm": 24.706743261607773, + "learning_rate": 3.604003282401836e-06, + "loss": 3.0225, + "step": 9448 + }, + { + "epoch": 1.8935871743486974, + "grad_norm": 20.321621212944585, + "learning_rate": 3.602883799302766e-06, + "loss": 2.6557, + "step": 9449 + }, + { + "epoch": 1.8937875751503006, + "grad_norm": 23.220784749827683, + "learning_rate": 3.601764392166969e-06, + "loss": 2.7037, + "step": 9450 + }, + { + "epoch": 1.8939879759519038, + "grad_norm": 23.40540095509174, + "learning_rate": 3.6006450610553055e-06, + "loss": 2.2606, + "step": 9451 + }, + { + "epoch": 1.894188376753507, + "grad_norm": 20.0420977131103, + "learning_rate": 3.5995258060286346e-06, + "loss": 2.241, + "step": 9452 + }, + { + "epoch": 1.8943887775551103, + "grad_norm": 37.01560166225326, + "learning_rate": 3.5984066271478133e-06, + "loss": 2.6051, + "step": 9453 + }, + { + "epoch": 1.8945891783567135, + "grad_norm": 20.80500419462468, + "learning_rate": 3.5972875244736925e-06, + "loss": 2.7795, + "step": 9454 + }, + { + "epoch": 1.8947895791583167, + "grad_norm": 25.81680083150697, + "learning_rate": 3.596168498067121e-06, + "loss": 2.5669, + "step": 9455 + }, + { + "epoch": 1.8949899799599197, + "grad_norm": 29.41602842418922, + "learning_rate": 3.5950495479889382e-06, + "loss": 2.9722, + "step": 9456 + }, + { + "epoch": 1.8951903807615231, + "grad_norm": 42.40245835860611, + "learning_rate": 3.593930674299987e-06, + "loss": 2.5552, + "step": 9457 + }, + { + "epoch": 1.8953907815631261, + "grad_norm": 43.30887959599827, + "learning_rate": 3.592811877061101e-06, + "loss": 2.7114, + "step": 9458 + }, + { + "epoch": 1.8955911823647296, + "grad_norm": 23.481237040184762, + "learning_rate": 3.5916931563331093e-06, + "loss": 2.7912, + "step": 9459 + }, + { + "epoch": 1.8957915831663326, + "grad_norm": 22.199190434868317, + "learning_rate": 3.5905745121768383e-06, + "loss": 2.4165, + "step": 9460 + }, + { + "epoch": 1.895991983967936, + "grad_norm": 26.817284108707955, + 
"learning_rate": 3.5894559446531136e-06, + "loss": 2.9501, + "step": 9461 + }, + { + "epoch": 1.896192384769539, + "grad_norm": 22.927182531371322, + "learning_rate": 3.5883374538227517e-06, + "loss": 2.7678, + "step": 9462 + }, + { + "epoch": 1.8963927855711424, + "grad_norm": 26.777483453049328, + "learning_rate": 3.587219039746564e-06, + "loss": 3.0112, + "step": 9463 + }, + { + "epoch": 1.8965931863727454, + "grad_norm": 25.15500572813013, + "learning_rate": 3.5861007024853643e-06, + "loss": 2.9044, + "step": 9464 + }, + { + "epoch": 1.8967935871743486, + "grad_norm": 34.13311399299949, + "learning_rate": 3.584982442099956e-06, + "loss": 3.1001, + "step": 9465 + }, + { + "epoch": 1.8969939879759519, + "grad_norm": 20.485316403064324, + "learning_rate": 3.5838642586511407e-06, + "loss": 2.1159, + "step": 9466 + }, + { + "epoch": 1.897194388777555, + "grad_norm": 24.813504886166157, + "learning_rate": 3.5827461521997165e-06, + "loss": 2.7408, + "step": 9467 + }, + { + "epoch": 1.8973947895791583, + "grad_norm": 21.518588478467642, + "learning_rate": 3.581628122806477e-06, + "loss": 2.8255, + "step": 9468 + }, + { + "epoch": 1.8975951903807615, + "grad_norm": 23.904112299563483, + "learning_rate": 3.58051017053221e-06, + "loss": 2.8503, + "step": 9469 + }, + { + "epoch": 1.8977955911823647, + "grad_norm": 30.73864169423603, + "learning_rate": 3.579392295437698e-06, + "loss": 2.7313, + "step": 9470 + }, + { + "epoch": 1.897995991983968, + "grad_norm": 23.180402897223047, + "learning_rate": 3.578274497583727e-06, + "loss": 2.2483, + "step": 9471 + }, + { + "epoch": 1.8981963927855712, + "grad_norm": 26.579765591285106, + "learning_rate": 3.5771567770310684e-06, + "loss": 2.0558, + "step": 9472 + }, + { + "epoch": 1.8983967935871744, + "grad_norm": 23.09809054015315, + "learning_rate": 3.5760391338404952e-06, + "loss": 2.1021, + "step": 9473 + }, + { + "epoch": 1.8985971943887776, + "grad_norm": 25.927890559735314, + "learning_rate": 3.5749215680727785e-06, + "loss": 2.3091, + "step": 9474 + }, + { + "epoch": 1.8987975951903806, + "grad_norm": 25.870159109571112, + "learning_rate": 3.5738040797886796e-06, + "loss": 2.6739, + "step": 9475 + }, + { + "epoch": 1.898997995991984, + "grad_norm": 19.830845237622036, + "learning_rate": 3.572686669048956e-06, + "loss": 2.7768, + "step": 9476 + }, + { + "epoch": 1.899198396793587, + "grad_norm": 22.713266322095713, + "learning_rate": 3.571569335914368e-06, + "loss": 2.8944, + "step": 9477 + }, + { + "epoch": 1.8993987975951905, + "grad_norm": 26.862020711428507, + "learning_rate": 3.570452080445661e-06, + "loss": 2.7982, + "step": 9478 + }, + { + "epoch": 1.8995991983967935, + "grad_norm": 34.894298916299334, + "learning_rate": 3.569334902703587e-06, + "loss": 2.8967, + "step": 9479 + }, + { + "epoch": 1.899799599198397, + "grad_norm": 30.764793830472666, + "learning_rate": 3.5682178027488834e-06, + "loss": 2.9322, + "step": 9480 + }, + { + "epoch": 1.9, + "grad_norm": 20.962534688793056, + "learning_rate": 3.567100780642294e-06, + "loss": 2.8422, + "step": 9481 + }, + { + "epoch": 1.9002004008016034, + "grad_norm": 31.36627668437366, + "learning_rate": 3.5659838364445505e-06, + "loss": 2.6835, + "step": 9482 + }, + { + "epoch": 1.9004008016032063, + "grad_norm": 56.424698496723124, + "learning_rate": 3.5648669702163794e-06, + "loss": 2.5532, + "step": 9483 + }, + { + "epoch": 1.9006012024048096, + "grad_norm": 26.787710882498395, + "learning_rate": 3.5637501820185127e-06, + "loss": 2.7209, + "step": 9484 + }, + { + "epoch": 1.9008016032064128, + 
"grad_norm": 24.96242806355563, + "learning_rate": 3.5626334719116675e-06, + "loss": 2.5085, + "step": 9485 + }, + { + "epoch": 1.901002004008016, + "grad_norm": 40.46451565631095, + "learning_rate": 3.561516839956563e-06, + "loss": 2.9599, + "step": 9486 + }, + { + "epoch": 1.9012024048096192, + "grad_norm": 19.87877612495509, + "learning_rate": 3.56040028621391e-06, + "loss": 2.2883, + "step": 9487 + }, + { + "epoch": 1.9014028056112224, + "grad_norm": 28.519013561502927, + "learning_rate": 3.5592838107444194e-06, + "loss": 2.5957, + "step": 9488 + }, + { + "epoch": 1.9016032064128257, + "grad_norm": 46.86909088712082, + "learning_rate": 3.558167413608795e-06, + "loss": 3.1708, + "step": 9489 + }, + { + "epoch": 1.9018036072144289, + "grad_norm": 44.15635980305286, + "learning_rate": 3.557051094867735e-06, + "loss": 2.9301, + "step": 9490 + }, + { + "epoch": 1.902004008016032, + "grad_norm": 27.26061618500275, + "learning_rate": 3.555934854581937e-06, + "loss": 2.7396, + "step": 9491 + }, + { + "epoch": 1.9022044088176353, + "grad_norm": 23.51536288783421, + "learning_rate": 3.5548186928120953e-06, + "loss": 3.1365, + "step": 9492 + }, + { + "epoch": 1.9024048096192385, + "grad_norm": 27.796852126357273, + "learning_rate": 3.553702609618891e-06, + "loss": 2.4448, + "step": 9493 + }, + { + "epoch": 1.9026052104208415, + "grad_norm": 20.73430587710751, + "learning_rate": 3.552586605063014e-06, + "loss": 2.721, + "step": 9494 + }, + { + "epoch": 1.902805611222445, + "grad_norm": 27.45786373971511, + "learning_rate": 3.551470679205139e-06, + "loss": 2.5497, + "step": 9495 + }, + { + "epoch": 1.903006012024048, + "grad_norm": 38.6848093312127, + "learning_rate": 3.5503548321059405e-06, + "loss": 2.68, + "step": 9496 + }, + { + "epoch": 1.9032064128256514, + "grad_norm": 25.93790762646603, + "learning_rate": 3.5492390638260885e-06, + "loss": 3.2721, + "step": 9497 + }, + { + "epoch": 1.9034068136272544, + "grad_norm": 25.928015708404196, + "learning_rate": 3.5481233744262526e-06, + "loss": 2.7952, + "step": 9498 + }, + { + "epoch": 1.9036072144288578, + "grad_norm": 26.923067991210438, + "learning_rate": 3.547007763967092e-06, + "loss": 2.5696, + "step": 9499 + }, + { + "epoch": 1.9038076152304608, + "grad_norm": 25.78979730097194, + "learning_rate": 3.5458922325092616e-06, + "loss": 2.6387, + "step": 9500 + }, + { + "epoch": 1.9040080160320643, + "grad_norm": 26.866380371471042, + "learning_rate": 3.5447767801134194e-06, + "loss": 2.5234, + "step": 9501 + }, + { + "epoch": 1.9042084168336673, + "grad_norm": 32.15332219320114, + "learning_rate": 3.543661406840211e-06, + "loss": 3.0125, + "step": 9502 + }, + { + "epoch": 1.9044088176352707, + "grad_norm": 18.970821578218626, + "learning_rate": 3.542546112750281e-06, + "loss": 2.7283, + "step": 9503 + }, + { + "epoch": 1.9046092184368737, + "grad_norm": 30.032772238303583, + "learning_rate": 3.5414308979042706e-06, + "loss": 2.2404, + "step": 9504 + }, + { + "epoch": 1.904809619238477, + "grad_norm": 28.96749763522759, + "learning_rate": 3.540315762362816e-06, + "loss": 3.2399, + "step": 9505 + }, + { + "epoch": 1.9050100200400801, + "grad_norm": 29.710434857699603, + "learning_rate": 3.5392007061865486e-06, + "loss": 2.5847, + "step": 9506 + }, + { + "epoch": 1.9052104208416833, + "grad_norm": 20.997426395007597, + "learning_rate": 3.5380857294360926e-06, + "loss": 2.5801, + "step": 9507 + }, + { + "epoch": 1.9054108216432866, + "grad_norm": 19.76492466006673, + "learning_rate": 3.5369708321720763e-06, + "loss": 2.4279, + "step": 9508 + }, + { 
+ "epoch": 1.9056112224448898, + "grad_norm": 21.61868798184925, + "learning_rate": 3.5358560144551136e-06, + "loss": 3.1368, + "step": 9509 + }, + { + "epoch": 1.905811623246493, + "grad_norm": 34.00235134743884, + "learning_rate": 3.53474127634582e-06, + "loss": 2.8304, + "step": 9510 + }, + { + "epoch": 1.9060120240480962, + "grad_norm": 25.721761491919207, + "learning_rate": 3.5336266179048085e-06, + "loss": 2.9479, + "step": 9511 + }, + { + "epoch": 1.9062124248496994, + "grad_norm": 16.61155887037831, + "learning_rate": 3.5325120391926816e-06, + "loss": 2.7257, + "step": 9512 + }, + { + "epoch": 1.9064128256513027, + "grad_norm": 27.01972228268364, + "learning_rate": 3.531397540270042e-06, + "loss": 2.9507, + "step": 9513 + }, + { + "epoch": 1.9066132264529059, + "grad_norm": 22.17577489494922, + "learning_rate": 3.5302831211974836e-06, + "loss": 3.1607, + "step": 9514 + }, + { + "epoch": 1.9068136272545089, + "grad_norm": 22.009238432142265, + "learning_rate": 3.5291687820356026e-06, + "loss": 2.4772, + "step": 9515 + }, + { + "epoch": 1.9070140280561123, + "grad_norm": 17.627085565974696, + "learning_rate": 3.528054522844987e-06, + "loss": 2.4394, + "step": 9516 + }, + { + "epoch": 1.9072144288577153, + "grad_norm": 18.60016796674133, + "learning_rate": 3.526940343686218e-06, + "loss": 2.7141, + "step": 9517 + }, + { + "epoch": 1.9074148296593187, + "grad_norm": 20.097473962827898, + "learning_rate": 3.5258262446198787e-06, + "loss": 2.3269, + "step": 9518 + }, + { + "epoch": 1.9076152304609217, + "grad_norm": 22.601046786246272, + "learning_rate": 3.5247122257065425e-06, + "loss": 2.7326, + "step": 9519 + }, + { + "epoch": 1.9078156312625252, + "grad_norm": 27.3192754536835, + "learning_rate": 3.5235982870067787e-06, + "loss": 2.9797, + "step": 9520 + }, + { + "epoch": 1.9080160320641282, + "grad_norm": 27.88685815622222, + "learning_rate": 3.5224844285811565e-06, + "loss": 2.5277, + "step": 9521 + }, + { + "epoch": 1.9082164328657316, + "grad_norm": 33.14337363924698, + "learning_rate": 3.5213706504902367e-06, + "loss": 2.5209, + "step": 9522 + }, + { + "epoch": 1.9084168336673346, + "grad_norm": 21.536313306988852, + "learning_rate": 3.5202569527945784e-06, + "loss": 2.5111, + "step": 9523 + }, + { + "epoch": 1.9086172344689378, + "grad_norm": 26.639912926926083, + "learning_rate": 3.5191433355547313e-06, + "loss": 2.9697, + "step": 9524 + }, + { + "epoch": 1.908817635270541, + "grad_norm": 26.344543621530935, + "learning_rate": 3.5180297988312497e-06, + "loss": 2.4752, + "step": 9525 + }, + { + "epoch": 1.9090180360721443, + "grad_norm": 29.384134852890394, + "learning_rate": 3.516916342684675e-06, + "loss": 2.7629, + "step": 9526 + }, + { + "epoch": 1.9092184368737475, + "grad_norm": 22.03934791908274, + "learning_rate": 3.5158029671755456e-06, + "loss": 2.6269, + "step": 9527 + }, + { + "epoch": 1.9094188376753507, + "grad_norm": 33.389340174295405, + "learning_rate": 3.514689672364402e-06, + "loss": 3.3541, + "step": 9528 + }, + { + "epoch": 1.909619238476954, + "grad_norm": 20.693982976668227, + "learning_rate": 3.513576458311773e-06, + "loss": 2.1894, + "step": 9529 + }, + { + "epoch": 1.9098196392785571, + "grad_norm": 25.145861973120137, + "learning_rate": 3.512463325078187e-06, + "loss": 2.5191, + "step": 9530 + }, + { + "epoch": 1.9100200400801604, + "grad_norm": 32.773076928834975, + "learning_rate": 3.511350272724164e-06, + "loss": 2.6575, + "step": 9531 + }, + { + "epoch": 1.9102204408817636, + "grad_norm": 20.82957967267327, + "learning_rate": 
3.5102373013102254e-06, + "loss": 2.8508, + "step": 9532 + }, + { + "epoch": 1.9104208416833668, + "grad_norm": 24.014294545959523, + "learning_rate": 3.5091244108968836e-06, + "loss": 2.455, + "step": 9533 + }, + { + "epoch": 1.9106212424849698, + "grad_norm": 24.456370527120285, + "learning_rate": 3.5080116015446474e-06, + "loss": 3.2919, + "step": 9534 + }, + { + "epoch": 1.9108216432865732, + "grad_norm": 28.345361265027872, + "learning_rate": 3.506898873314023e-06, + "loss": 3.4317, + "step": 9535 + }, + { + "epoch": 1.9110220440881762, + "grad_norm": 36.96418563901129, + "learning_rate": 3.505786226265513e-06, + "loss": 2.9793, + "step": 9536 + }, + { + "epoch": 1.9112224448897797, + "grad_norm": 26.725003525790637, + "learning_rate": 3.5046736604596084e-06, + "loss": 2.9197, + "step": 9537 + }, + { + "epoch": 1.9114228456913827, + "grad_norm": 21.115724466474717, + "learning_rate": 3.503561175956807e-06, + "loss": 2.5803, + "step": 9538 + }, + { + "epoch": 1.911623246492986, + "grad_norm": 19.555977751252215, + "learning_rate": 3.5024487728175926e-06, + "loss": 2.9198, + "step": 9539 + }, + { + "epoch": 1.911823647294589, + "grad_norm": 33.052111525017274, + "learning_rate": 3.5013364511024482e-06, + "loss": 3.3279, + "step": 9540 + }, + { + "epoch": 1.9120240480961925, + "grad_norm": 35.95025706896599, + "learning_rate": 3.500224210871852e-06, + "loss": 3.2568, + "step": 9541 + }, + { + "epoch": 1.9122244488977955, + "grad_norm": 17.268027244308133, + "learning_rate": 3.4991120521862822e-06, + "loss": 2.6459, + "step": 9542 + }, + { + "epoch": 1.9124248496993987, + "grad_norm": 20.673127720809486, + "learning_rate": 3.497999975106204e-06, + "loss": 2.5675, + "step": 9543 + }, + { + "epoch": 1.912625250501002, + "grad_norm": 26.24738196186902, + "learning_rate": 3.496887979692084e-06, + "loss": 2.6425, + "step": 9544 + }, + { + "epoch": 1.9128256513026052, + "grad_norm": 26.254210037469125, + "learning_rate": 3.495776066004385e-06, + "loss": 2.4722, + "step": 9545 + }, + { + "epoch": 1.9130260521042084, + "grad_norm": 14.04873356549869, + "learning_rate": 3.4946642341035598e-06, + "loss": 2.4562, + "step": 9546 + }, + { + "epoch": 1.9132264529058116, + "grad_norm": 17.326750708636993, + "learning_rate": 3.4935524840500634e-06, + "loss": 2.4286, + "step": 9547 + }, + { + "epoch": 1.9134268537074148, + "grad_norm": 25.425708692955986, + "learning_rate": 3.4924408159043417e-06, + "loss": 2.673, + "step": 9548 + }, + { + "epoch": 1.913627254509018, + "grad_norm": 16.713009892843196, + "learning_rate": 3.4913292297268385e-06, + "loss": 2.7715, + "step": 9549 + }, + { + "epoch": 1.9138276553106213, + "grad_norm": 20.94378226272098, + "learning_rate": 3.4902177255779936e-06, + "loss": 2.6396, + "step": 9550 + }, + { + "epoch": 1.9140280561122245, + "grad_norm": 21.605701204318173, + "learning_rate": 3.489106303518236e-06, + "loss": 2.6673, + "step": 9551 + }, + { + "epoch": 1.9142284569138277, + "grad_norm": 26.74249326970825, + "learning_rate": 3.4879949636080023e-06, + "loss": 2.7704, + "step": 9552 + }, + { + "epoch": 1.9144288577154307, + "grad_norm": 23.755090180477918, + "learning_rate": 3.486883705907713e-06, + "loss": 2.5642, + "step": 9553 + }, + { + "epoch": 1.9146292585170341, + "grad_norm": 27.66519543803346, + "learning_rate": 3.4857725304777882e-06, + "loss": 2.8926, + "step": 9554 + }, + { + "epoch": 1.9148296593186371, + "grad_norm": 30.779467376542012, + "learning_rate": 3.484661437378649e-06, + "loss": 3.0688, + "step": 9555 + }, + { + "epoch": 1.9150300601202406, + 
"grad_norm": 28.212091564777314, + "learning_rate": 3.4835504266707043e-06, + "loss": 3.0113, + "step": 9556 + }, + { + "epoch": 1.9152304609218436, + "grad_norm": 63.41843175293573, + "learning_rate": 3.4824394984143616e-06, + "loss": 3.0067, + "step": 9557 + }, + { + "epoch": 1.915430861723447, + "grad_norm": 25.260675923871073, + "learning_rate": 3.4813286526700206e-06, + "loss": 2.5591, + "step": 9558 + }, + { + "epoch": 1.91563126252505, + "grad_norm": 21.078996609353183, + "learning_rate": 3.4802178894980843e-06, + "loss": 2.3777, + "step": 9559 + }, + { + "epoch": 1.9158316633266534, + "grad_norm": 31.43866255217284, + "learning_rate": 3.4791072089589457e-06, + "loss": 3.4006, + "step": 9560 + }, + { + "epoch": 1.9160320641282564, + "grad_norm": 34.70986613460001, + "learning_rate": 3.477996611112991e-06, + "loss": 2.1827, + "step": 9561 + }, + { + "epoch": 1.9162324649298599, + "grad_norm": 46.381748325656844, + "learning_rate": 3.4768860960206092e-06, + "loss": 2.9142, + "step": 9562 + }, + { + "epoch": 1.9164328657314629, + "grad_norm": 22.198925263411674, + "learning_rate": 3.4757756637421785e-06, + "loss": 3.3717, + "step": 9563 + }, + { + "epoch": 1.916633266533066, + "grad_norm": 22.077319485533717, + "learning_rate": 3.474665314338073e-06, + "loss": 2.3164, + "step": 9564 + }, + { + "epoch": 1.9168336673346693, + "grad_norm": 18.442233507005266, + "learning_rate": 3.4735550478686685e-06, + "loss": 2.4711, + "step": 9565 + }, + { + "epoch": 1.9170340681362725, + "grad_norm": 23.147541529889107, + "learning_rate": 3.4724448643943274e-06, + "loss": 2.6726, + "step": 9566 + }, + { + "epoch": 1.9172344689378757, + "grad_norm": 26.736423357368547, + "learning_rate": 3.4713347639754157e-06, + "loss": 3.169, + "step": 9567 + }, + { + "epoch": 1.917434869739479, + "grad_norm": 16.29431851261302, + "learning_rate": 3.470224746672287e-06, + "loss": 2.2113, + "step": 9568 + }, + { + "epoch": 1.9176352705410822, + "grad_norm": 17.866028616241508, + "learning_rate": 3.469114812545299e-06, + "loss": 2.5018, + "step": 9569 + }, + { + "epoch": 1.9178356713426854, + "grad_norm": 24.580013389279326, + "learning_rate": 3.468004961654799e-06, + "loss": 2.5893, + "step": 9570 + }, + { + "epoch": 1.9180360721442886, + "grad_norm": 26.53630526345957, + "learning_rate": 3.4668951940611278e-06, + "loss": 2.7878, + "step": 9571 + }, + { + "epoch": 1.9182364729458918, + "grad_norm": 23.626845572157436, + "learning_rate": 3.4657855098246295e-06, + "loss": 2.5234, + "step": 9572 + }, + { + "epoch": 1.918436873747495, + "grad_norm": 54.68034902575549, + "learning_rate": 3.4646759090056385e-06, + "loss": 2.4823, + "step": 9573 + }, + { + "epoch": 1.918637274549098, + "grad_norm": 18.105544566088913, + "learning_rate": 3.4635663916644846e-06, + "loss": 2.2318, + "step": 9574 + }, + { + "epoch": 1.9188376753507015, + "grad_norm": 28.773685764596234, + "learning_rate": 3.4624569578614926e-06, + "loss": 2.5027, + "step": 9575 + }, + { + "epoch": 1.9190380761523045, + "grad_norm": 26.167375023878837, + "learning_rate": 3.461347607656987e-06, + "loss": 2.9848, + "step": 9576 + }, + { + "epoch": 1.919238476953908, + "grad_norm": 23.372382071644417, + "learning_rate": 3.4602383411112815e-06, + "loss": 2.3862, + "step": 9577 + }, + { + "epoch": 1.919438877755511, + "grad_norm": 26.7826126518841, + "learning_rate": 3.4591291582846913e-06, + "loss": 2.1847, + "step": 9578 + }, + { + "epoch": 1.9196392785571144, + "grad_norm": 37.19997083267353, + "learning_rate": 3.458020059237523e-06, + "loss": 2.1432, + "step": 
9579 + }, + { + "epoch": 1.9198396793587174, + "grad_norm": 39.42169416977029, + "learning_rate": 3.4569110440300812e-06, + "loss": 2.7507, + "step": 9580 + }, + { + "epoch": 1.9200400801603208, + "grad_norm": 28.28060707500491, + "learning_rate": 3.4558021127226615e-06, + "loss": 2.657, + "step": 9581 + }, + { + "epoch": 1.9202404809619238, + "grad_norm": 27.23232036943172, + "learning_rate": 3.4546932653755627e-06, + "loss": 3.1153, + "step": 9582 + }, + { + "epoch": 1.920440881763527, + "grad_norm": 27.388958328563355, + "learning_rate": 3.453584502049072e-06, + "loss": 2.8412, + "step": 9583 + }, + { + "epoch": 1.9206412825651302, + "grad_norm": 61.164387077849256, + "learning_rate": 3.452475822803474e-06, + "loss": 2.9091, + "step": 9584 + }, + { + "epoch": 1.9208416833667334, + "grad_norm": 27.45813522156423, + "learning_rate": 3.4513672276990486e-06, + "loss": 3.0502, + "step": 9585 + }, + { + "epoch": 1.9210420841683367, + "grad_norm": 26.756298116625885, + "learning_rate": 3.4502587167960754e-06, + "loss": 2.3171, + "step": 9586 + }, + { + "epoch": 1.9212424849699399, + "grad_norm": 25.15204282962941, + "learning_rate": 3.449150290154822e-06, + "loss": 3.0175, + "step": 9587 + }, + { + "epoch": 1.921442885771543, + "grad_norm": 23.86376926185154, + "learning_rate": 3.4480419478355566e-06, + "loss": 3.1363, + "step": 9588 + }, + { + "epoch": 1.9216432865731463, + "grad_norm": 28.203301182161585, + "learning_rate": 3.4469336898985424e-06, + "loss": 2.7671, + "step": 9589 + }, + { + "epoch": 1.9218436873747495, + "grad_norm": 36.75785090302229, + "learning_rate": 3.4458255164040342e-06, + "loss": 3.2548, + "step": 9590 + }, + { + "epoch": 1.9220440881763527, + "grad_norm": 27.923041884113154, + "learning_rate": 3.4447174274122887e-06, + "loss": 2.5069, + "step": 9591 + }, + { + "epoch": 1.922244488977956, + "grad_norm": 21.27949093363359, + "learning_rate": 3.443609422983551e-06, + "loss": 2.9833, + "step": 9592 + }, + { + "epoch": 1.922444889779559, + "grad_norm": 24.2294971258393, + "learning_rate": 3.442501503178068e-06, + "loss": 2.7927, + "step": 9593 + }, + { + "epoch": 1.9226452905811624, + "grad_norm": 40.4496215234612, + "learning_rate": 3.441393668056077e-06, + "loss": 3.1478, + "step": 9594 + }, + { + "epoch": 1.9228456913827654, + "grad_norm": 40.64186751923463, + "learning_rate": 3.4402859176778114e-06, + "loss": 2.69, + "step": 9595 + }, + { + "epoch": 1.9230460921843688, + "grad_norm": 29.7402104412268, + "learning_rate": 3.4391782521035044e-06, + "loss": 2.5777, + "step": 9596 + }, + { + "epoch": 1.9232464929859718, + "grad_norm": 25.73516335808598, + "learning_rate": 3.438070671393379e-06, + "loss": 2.6437, + "step": 9597 + }, + { + "epoch": 1.9234468937875753, + "grad_norm": 26.603256327194078, + "learning_rate": 3.436963175607656e-06, + "loss": 3.034, + "step": 9598 + }, + { + "epoch": 1.9236472945891783, + "grad_norm": 25.81899804489269, + "learning_rate": 3.4358557648065537e-06, + "loss": 3.1793, + "step": 9599 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 33.2509580321676, + "learning_rate": 3.4347484390502832e-06, + "loss": 2.6803, + "step": 9600 + }, + { + "epoch": 1.9240480961923847, + "grad_norm": 27.425742722489353, + "learning_rate": 3.43364119839905e-06, + "loss": 2.9832, + "step": 9601 + }, + { + "epoch": 1.924248496993988, + "grad_norm": 35.09917075474449, + "learning_rate": 3.4325340429130556e-06, + "loss": 2.3423, + "step": 9602 + }, + { + "epoch": 1.9244488977955911, + "grad_norm": 25.644135548810297, + "learning_rate": 
3.4314269726524994e-06, + "loss": 2.6077, + "step": 9603 + }, + { + "epoch": 1.9246492985971944, + "grad_norm": 25.88225533151802, + "learning_rate": 3.430319987677576e-06, + "loss": 2.5646, + "step": 9604 + }, + { + "epoch": 1.9248496993987976, + "grad_norm": 23.30138115608388, + "learning_rate": 3.429213088048469e-06, + "loss": 2.9477, + "step": 9605 + }, + { + "epoch": 1.9250501002004008, + "grad_norm": 18.17811112603274, + "learning_rate": 3.428106273825368e-06, + "loss": 2.3563, + "step": 9606 + }, + { + "epoch": 1.925250501002004, + "grad_norm": 21.735994309922166, + "learning_rate": 3.4269995450684486e-06, + "loss": 2.3596, + "step": 9607 + }, + { + "epoch": 1.9254509018036072, + "grad_norm": 29.040114361478615, + "learning_rate": 3.425892901837884e-06, + "loss": 2.8247, + "step": 9608 + }, + { + "epoch": 1.9256513026052104, + "grad_norm": 24.65051484752229, + "learning_rate": 3.424786344193848e-06, + "loss": 2.5884, + "step": 9609 + }, + { + "epoch": 1.9258517034068137, + "grad_norm": 34.53334882848326, + "learning_rate": 3.4236798721965025e-06, + "loss": 2.4734, + "step": 9610 + }, + { + "epoch": 1.9260521042084169, + "grad_norm": 21.603165899349158, + "learning_rate": 3.42257348590601e-06, + "loss": 2.9412, + "step": 9611 + }, + { + "epoch": 1.9262525050100199, + "grad_norm": 27.758804298287522, + "learning_rate": 3.4214671853825244e-06, + "loss": 2.7338, + "step": 9612 + }, + { + "epoch": 1.9264529058116233, + "grad_norm": 27.327949041129806, + "learning_rate": 3.4203609706861996e-06, + "loss": 2.7659, + "step": 9613 + }, + { + "epoch": 1.9266533066132263, + "grad_norm": 22.231808507642388, + "learning_rate": 3.4192548418771806e-06, + "loss": 2.7167, + "step": 9614 + }, + { + "epoch": 1.9268537074148298, + "grad_norm": 23.598850404812016, + "learning_rate": 3.418148799015607e-06, + "loss": 2.6434, + "step": 9615 + }, + { + "epoch": 1.9270541082164327, + "grad_norm": 39.41220402013403, + "learning_rate": 3.4170428421616192e-06, + "loss": 2.7306, + "step": 9616 + }, + { + "epoch": 1.9272545090180362, + "grad_norm": 34.142561633612075, + "learning_rate": 3.4159369713753497e-06, + "loss": 2.2279, + "step": 9617 + }, + { + "epoch": 1.9274549098196392, + "grad_norm": 24.471556671458668, + "learning_rate": 3.4148311867169238e-06, + "loss": 2.3768, + "step": 9618 + }, + { + "epoch": 1.9276553106212426, + "grad_norm": 26.406542273751715, + "learning_rate": 3.413725488246468e-06, + "loss": 2.4226, + "step": 9619 + }, + { + "epoch": 1.9278557114228456, + "grad_norm": 27.032886742203118, + "learning_rate": 3.412619876024099e-06, + "loss": 2.9158, + "step": 9620 + }, + { + "epoch": 1.928056112224449, + "grad_norm": 21.34538164252495, + "learning_rate": 3.4115143501099297e-06, + "loss": 2.4544, + "step": 9621 + }, + { + "epoch": 1.928256513026052, + "grad_norm": 27.583132720003707, + "learning_rate": 3.410408910564069e-06, + "loss": 2.6591, + "step": 9622 + }, + { + "epoch": 1.9284569138276553, + "grad_norm": 25.376363914530366, + "learning_rate": 3.4093035574466237e-06, + "loss": 2.9598, + "step": 9623 + }, + { + "epoch": 1.9286573146292585, + "grad_norm": 28.614134841898938, + "learning_rate": 3.4081982908176923e-06, + "loss": 2.7675, + "step": 9624 + }, + { + "epoch": 1.9288577154308617, + "grad_norm": 52.73824622231053, + "learning_rate": 3.4070931107373678e-06, + "loss": 2.5507, + "step": 9625 + }, + { + "epoch": 1.929058116232465, + "grad_norm": 27.148258504300667, + "learning_rate": 3.4059880172657444e-06, + "loss": 2.7467, + "step": 9626 + }, + { + "epoch": 1.9292585170340681, + 
"grad_norm": 28.524042921104034, + "learning_rate": 3.4048830104629037e-06, + "loss": 1.9605, + "step": 9627 + }, + { + "epoch": 1.9294589178356714, + "grad_norm": 27.186538585463463, + "learning_rate": 3.4037780903889307e-06, + "loss": 2.1315, + "step": 9628 + }, + { + "epoch": 1.9296593186372746, + "grad_norm": 21.674504276987037, + "learning_rate": 3.4026732571038957e-06, + "loss": 2.5051, + "step": 9629 + }, + { + "epoch": 1.9298597194388778, + "grad_norm": 36.71617425056818, + "learning_rate": 3.4015685106678762e-06, + "loss": 3.0213, + "step": 9630 + }, + { + "epoch": 1.930060120240481, + "grad_norm": 23.680867048273182, + "learning_rate": 3.400463851140936e-06, + "loss": 2.9632, + "step": 9631 + }, + { + "epoch": 1.9302605210420842, + "grad_norm": 19.51991577264965, + "learning_rate": 3.399359278583136e-06, + "loss": 2.8912, + "step": 9632 + }, + { + "epoch": 1.9304609218436872, + "grad_norm": 20.44016869668953, + "learning_rate": 3.398254793054536e-06, + "loss": 2.6579, + "step": 9633 + }, + { + "epoch": 1.9306613226452907, + "grad_norm": 26.39292138972259, + "learning_rate": 3.397150394615187e-06, + "loss": 2.2284, + "step": 9634 + }, + { + "epoch": 1.9308617234468937, + "grad_norm": 37.62090267396881, + "learning_rate": 3.396046083325135e-06, + "loss": 2.6632, + "step": 9635 + }, + { + "epoch": 1.931062124248497, + "grad_norm": 26.727781776641105, + "learning_rate": 3.3949418592444285e-06, + "loss": 2.9113, + "step": 9636 + }, + { + "epoch": 1.9312625250501, + "grad_norm": 20.040413385367074, + "learning_rate": 3.393837722433102e-06, + "loss": 2.2455, + "step": 9637 + }, + { + "epoch": 1.9314629258517035, + "grad_norm": 22.21050365614559, + "learning_rate": 3.392733672951189e-06, + "loss": 2.9029, + "step": 9638 + }, + { + "epoch": 1.9316633266533065, + "grad_norm": 25.47734192255267, + "learning_rate": 3.391629710858717e-06, + "loss": 3.0897, + "step": 9639 + }, + { + "epoch": 1.93186372745491, + "grad_norm": 22.13945510926931, + "learning_rate": 3.3905258362157123e-06, + "loss": 2.2499, + "step": 9640 + }, + { + "epoch": 1.932064128256513, + "grad_norm": 77.92283782150083, + "learning_rate": 3.389422049082195e-06, + "loss": 3.08, + "step": 9641 + }, + { + "epoch": 1.9322645290581162, + "grad_norm": 20.784243827047924, + "learning_rate": 3.3883183495181773e-06, + "loss": 3.0181, + "step": 9642 + }, + { + "epoch": 1.9324649298597194, + "grad_norm": 32.936499962127186, + "learning_rate": 3.3872147375836704e-06, + "loss": 2.5096, + "step": 9643 + }, + { + "epoch": 1.9326653306613226, + "grad_norm": 29.509888457415254, + "learning_rate": 3.3861112133386798e-06, + "loss": 2.4397, + "step": 9644 + }, + { + "epoch": 1.9328657314629258, + "grad_norm": 24.083951534988593, + "learning_rate": 3.385007776843204e-06, + "loss": 2.5861, + "step": 9645 + }, + { + "epoch": 1.933066132264529, + "grad_norm": 26.34807407043304, + "learning_rate": 3.383904428157239e-06, + "loss": 2.4456, + "step": 9646 + }, + { + "epoch": 1.9332665330661323, + "grad_norm": 33.73403188988351, + "learning_rate": 3.3828011673407755e-06, + "loss": 2.9302, + "step": 9647 + }, + { + "epoch": 1.9334669338677355, + "grad_norm": 21.422309956787583, + "learning_rate": 3.3816979944538007e-06, + "loss": 2.8737, + "step": 9648 + }, + { + "epoch": 1.9336673346693387, + "grad_norm": 19.903297222625223, + "learning_rate": 3.3805949095562935e-06, + "loss": 2.8389, + "step": 9649 + }, + { + "epoch": 1.933867735470942, + "grad_norm": 21.232720050430324, + "learning_rate": 3.3794919127082334e-06, + "loss": 2.8625, + "step": 9650 + }, + 
{ + "epoch": 1.9340681362725451, + "grad_norm": 22.306537628501754, + "learning_rate": 3.37838900396959e-06, + "loss": 2.6967, + "step": 9651 + }, + { + "epoch": 1.9342685370741481, + "grad_norm": 31.423781047307592, + "learning_rate": 3.377286183400328e-06, + "loss": 2.3278, + "step": 9652 + }, + { + "epoch": 1.9344689378757516, + "grad_norm": 26.934028050167466, + "learning_rate": 3.376183451060412e-06, + "loss": 2.6984, + "step": 9653 + }, + { + "epoch": 1.9346693386773546, + "grad_norm": 17.4887734823364, + "learning_rate": 3.3750808070098006e-06, + "loss": 2.6697, + "step": 9654 + }, + { + "epoch": 1.934869739478958, + "grad_norm": 26.899582597305763, + "learning_rate": 3.373978251308444e-06, + "loss": 2.6304, + "step": 9655 + }, + { + "epoch": 1.935070140280561, + "grad_norm": 27.07542687715716, + "learning_rate": 3.3728757840162878e-06, + "loss": 3.0717, + "step": 9656 + }, + { + "epoch": 1.9352705410821645, + "grad_norm": 25.805710048169047, + "learning_rate": 3.3717734051932794e-06, + "loss": 2.2609, + "step": 9657 + }, + { + "epoch": 1.9354709418837674, + "grad_norm": 18.355623290819754, + "learning_rate": 3.3706711148993535e-06, + "loss": 2.2651, + "step": 9658 + }, + { + "epoch": 1.9356713426853709, + "grad_norm": 20.741724375013476, + "learning_rate": 3.369568913194444e-06, + "loss": 2.748, + "step": 9659 + }, + { + "epoch": 1.9358717434869739, + "grad_norm": 23.210091050085875, + "learning_rate": 3.36846680013848e-06, + "loss": 2.4101, + "step": 9660 + }, + { + "epoch": 1.936072144288577, + "grad_norm": 26.663376459903386, + "learning_rate": 3.3673647757913854e-06, + "loss": 2.6763, + "step": 9661 + }, + { + "epoch": 1.9362725450901803, + "grad_norm": 25.859347638107625, + "learning_rate": 3.3662628402130747e-06, + "loss": 2.5715, + "step": 9662 + }, + { + "epoch": 1.9364729458917835, + "grad_norm": 19.01594239734439, + "learning_rate": 3.3651609934634683e-06, + "loss": 2.7298, + "step": 9663 + }, + { + "epoch": 1.9366733466933868, + "grad_norm": 43.03474700871657, + "learning_rate": 3.3640592356024714e-06, + "loss": 2.5402, + "step": 9664 + }, + { + "epoch": 1.93687374749499, + "grad_norm": 23.532536078132942, + "learning_rate": 3.362957566689988e-06, + "loss": 2.893, + "step": 9665 + }, + { + "epoch": 1.9370741482965932, + "grad_norm": 37.454603678608784, + "learning_rate": 3.3618559867859167e-06, + "loss": 2.5645, + "step": 9666 + }, + { + "epoch": 1.9372745490981964, + "grad_norm": 21.02663496879598, + "learning_rate": 3.360754495950156e-06, + "loss": 3.0294, + "step": 9667 + }, + { + "epoch": 1.9374749498997996, + "grad_norm": 56.46174140666105, + "learning_rate": 3.3596530942425933e-06, + "loss": 2.6825, + "step": 9668 + }, + { + "epoch": 1.9376753507014028, + "grad_norm": 25.60744048127347, + "learning_rate": 3.3585517817231106e-06, + "loss": 2.7236, + "step": 9669 + }, + { + "epoch": 1.937875751503006, + "grad_norm": 22.55555641335869, + "learning_rate": 3.3574505584515927e-06, + "loss": 2.6383, + "step": 9670 + }, + { + "epoch": 1.938076152304609, + "grad_norm": 26.80144576075709, + "learning_rate": 3.3563494244879114e-06, + "loss": 2.1255, + "step": 9671 + }, + { + "epoch": 1.9382765531062125, + "grad_norm": 20.735976199649375, + "learning_rate": 3.35524837989194e-06, + "loss": 2.4009, + "step": 9672 + }, + { + "epoch": 1.9384769539078155, + "grad_norm": 18.379765694485773, + "learning_rate": 3.354147424723539e-06, + "loss": 2.2841, + "step": 9673 + }, + { + "epoch": 1.938677354709419, + "grad_norm": 29.20621615773183, + "learning_rate": 3.3530465590425743e-06, + 
"loss": 2.6946, + "step": 9674 + }, + { + "epoch": 1.938877755511022, + "grad_norm": 25.1739447788385, + "learning_rate": 3.3519457829088997e-06, + "loss": 2.6228, + "step": 9675 + }, + { + "epoch": 1.9390781563126254, + "grad_norm": 26.81710185146921, + "learning_rate": 3.3508450963823637e-06, + "loss": 2.5266, + "step": 9676 + }, + { + "epoch": 1.9392785571142284, + "grad_norm": 22.08348755309134, + "learning_rate": 3.3497444995228155e-06, + "loss": 2.5613, + "step": 9677 + }, + { + "epoch": 1.9394789579158318, + "grad_norm": 17.974861209224173, + "learning_rate": 3.348643992390094e-06, + "loss": 2.2331, + "step": 9678 + }, + { + "epoch": 1.9396793587174348, + "grad_norm": 39.72331521059242, + "learning_rate": 3.3475435750440355e-06, + "loss": 3.1854, + "step": 9679 + }, + { + "epoch": 1.9398797595190382, + "grad_norm": 21.575508650756937, + "learning_rate": 3.346443247544474e-06, + "loss": 2.4799, + "step": 9680 + }, + { + "epoch": 1.9400801603206412, + "grad_norm": 38.45963041745287, + "learning_rate": 3.3453430099512345e-06, + "loss": 3.392, + "step": 9681 + }, + { + "epoch": 1.9402805611222445, + "grad_norm": 31.974568479384565, + "learning_rate": 3.344242862324138e-06, + "loss": 2.5149, + "step": 9682 + }, + { + "epoch": 1.9404809619238477, + "grad_norm": 31.664785350204593, + "learning_rate": 3.3431428047229986e-06, + "loss": 2.3451, + "step": 9683 + }, + { + "epoch": 1.9406813627254509, + "grad_norm": 23.061384927018008, + "learning_rate": 3.3420428372076317e-06, + "loss": 2.5525, + "step": 9684 + }, + { + "epoch": 1.940881763527054, + "grad_norm": 25.19588362654374, + "learning_rate": 3.3409429598378445e-06, + "loss": 2.4239, + "step": 9685 + }, + { + "epoch": 1.9410821643286573, + "grad_norm": 29.665364812452715, + "learning_rate": 3.3398431726734353e-06, + "loss": 2.4559, + "step": 9686 + }, + { + "epoch": 1.9412825651302605, + "grad_norm": 23.06906162598365, + "learning_rate": 3.3387434757742056e-06, + "loss": 2.8949, + "step": 9687 + }, + { + "epoch": 1.9414829659318638, + "grad_norm": 33.49696787531123, + "learning_rate": 3.3376438691999453e-06, + "loss": 2.471, + "step": 9688 + }, + { + "epoch": 1.941683366733467, + "grad_norm": 21.390742170851823, + "learning_rate": 3.336544353010441e-06, + "loss": 3.0434, + "step": 9689 + }, + { + "epoch": 1.9418837675350702, + "grad_norm": 37.44972520465166, + "learning_rate": 3.3354449272654753e-06, + "loss": 2.7972, + "step": 9690 + }, + { + "epoch": 1.9420841683366734, + "grad_norm": 41.30492179760638, + "learning_rate": 3.334345592024826e-06, + "loss": 2.8272, + "step": 9691 + }, + { + "epoch": 1.9422845691382764, + "grad_norm": 51.23442689411394, + "learning_rate": 3.3332463473482667e-06, + "loss": 2.5978, + "step": 9692 + }, + { + "epoch": 1.9424849699398798, + "grad_norm": 33.884045867233326, + "learning_rate": 3.332147193295562e-06, + "loss": 2.5953, + "step": 9693 + }, + { + "epoch": 1.9426853707414828, + "grad_norm": 38.38711879455653, + "learning_rate": 3.3310481299264774e-06, + "loss": 2.5973, + "step": 9694 + }, + { + "epoch": 1.9428857715430863, + "grad_norm": 32.20815235350056, + "learning_rate": 3.329949157300771e-06, + "loss": 2.8601, + "step": 9695 + }, + { + "epoch": 1.9430861723446893, + "grad_norm": 29.50086447689811, + "learning_rate": 3.3288502754781913e-06, + "loss": 2.7203, + "step": 9696 + }, + { + "epoch": 1.9432865731462927, + "grad_norm": 23.322198885381635, + "learning_rate": 3.32775148451849e-06, + "loss": 2.7264, + "step": 9697 + }, + { + "epoch": 1.9434869739478957, + "grad_norm": 25.80079171185366, + 
"learning_rate": 3.32665278448141e-06, + "loss": 2.7638, + "step": 9698 + }, + { + "epoch": 1.9436873747494992, + "grad_norm": 32.3241493641367, + "learning_rate": 3.3255541754266885e-06, + "loss": 2.847, + "step": 9699 + }, + { + "epoch": 1.9438877755511021, + "grad_norm": 20.081323001287107, + "learning_rate": 3.3244556574140565e-06, + "loss": 2.6698, + "step": 9700 + }, + { + "epoch": 1.9440881763527054, + "grad_norm": 22.68848694910863, + "learning_rate": 3.3233572305032446e-06, + "loss": 2.508, + "step": 9701 + }, + { + "epoch": 1.9442885771543086, + "grad_norm": 23.37617362764932, + "learning_rate": 3.3222588947539746e-06, + "loss": 2.5961, + "step": 9702 + }, + { + "epoch": 1.9444889779559118, + "grad_norm": 22.141419668213466, + "learning_rate": 3.3211606502259654e-06, + "loss": 2.8715, + "step": 9703 + }, + { + "epoch": 1.944689378757515, + "grad_norm": 27.49273081107521, + "learning_rate": 3.3200624969789296e-06, + "loss": 3.2031, + "step": 9704 + }, + { + "epoch": 1.9448897795591182, + "grad_norm": 23.288570358657577, + "learning_rate": 3.318964435072577e-06, + "loss": 2.6784, + "step": 9705 + }, + { + "epoch": 1.9450901803607215, + "grad_norm": 29.486480065867017, + "learning_rate": 3.317866464566607e-06, + "loss": 2.5389, + "step": 9706 + }, + { + "epoch": 1.9452905811623247, + "grad_norm": 18.047981353321163, + "learning_rate": 3.316768585520722e-06, + "loss": 3.0512, + "step": 9707 + }, + { + "epoch": 1.945490981963928, + "grad_norm": 32.370104616479004, + "learning_rate": 3.3156707979946146e-06, + "loss": 2.9809, + "step": 9708 + }, + { + "epoch": 1.945691382765531, + "grad_norm": 29.394233047308905, + "learning_rate": 3.314573102047971e-06, + "loss": 3.0133, + "step": 9709 + }, + { + "epoch": 1.9458917835671343, + "grad_norm": 37.87275471816403, + "learning_rate": 3.313475497740474e-06, + "loss": 2.2866, + "step": 9710 + }, + { + "epoch": 1.9460921843687373, + "grad_norm": 22.61616565071733, + "learning_rate": 3.3123779851318067e-06, + "loss": 2.3602, + "step": 9711 + }, + { + "epoch": 1.9462925851703408, + "grad_norm": 24.883371952442012, + "learning_rate": 3.311280564281639e-06, + "loss": 2.2838, + "step": 9712 + }, + { + "epoch": 1.9464929859719438, + "grad_norm": 35.44940801154284, + "learning_rate": 3.3101832352496375e-06, + "loss": 2.6295, + "step": 9713 + }, + { + "epoch": 1.9466933867735472, + "grad_norm": 39.42955486826449, + "learning_rate": 3.3090859980954704e-06, + "loss": 2.374, + "step": 9714 + }, + { + "epoch": 1.9468937875751502, + "grad_norm": 23.763077843498515, + "learning_rate": 3.307988852878793e-06, + "loss": 2.8848, + "step": 9715 + }, + { + "epoch": 1.9470941883767536, + "grad_norm": 26.201007897740247, + "learning_rate": 3.3068917996592598e-06, + "loss": 2.7721, + "step": 9716 + }, + { + "epoch": 1.9472945891783566, + "grad_norm": 22.468836030456185, + "learning_rate": 3.3057948384965166e-06, + "loss": 2.2794, + "step": 9717 + }, + { + "epoch": 1.94749498997996, + "grad_norm": 21.79768253338503, + "learning_rate": 3.304697969450211e-06, + "loss": 2.5111, + "step": 9718 + }, + { + "epoch": 1.947695390781563, + "grad_norm": 26.502947192492048, + "learning_rate": 3.3036011925799792e-06, + "loss": 2.6435, + "step": 9719 + }, + { + "epoch": 1.9478957915831663, + "grad_norm": 24.679832234300534, + "learning_rate": 3.302504507945453e-06, + "loss": 2.0739, + "step": 9720 + }, + { + "epoch": 1.9480961923847695, + "grad_norm": 26.229237601397156, + "learning_rate": 3.301407915606264e-06, + "loss": 2.9507, + "step": 9721 + }, + { + "epoch": 
1.9482965931863727, + "grad_norm": 19.784595150805348, + "learning_rate": 3.3003114156220335e-06, + "loss": 2.612, + "step": 9722 + }, + { + "epoch": 1.948496993987976, + "grad_norm": 49.61374895686702, + "learning_rate": 3.2992150080523787e-06, + "loss": 2.8078, + "step": 9723 + }, + { + "epoch": 1.9486973947895792, + "grad_norm": 22.874037621621348, + "learning_rate": 3.298118692956917e-06, + "loss": 2.7936, + "step": 9724 + }, + { + "epoch": 1.9488977955911824, + "grad_norm": 26.27552737088929, + "learning_rate": 3.2970224703952543e-06, + "loss": 3.1947, + "step": 9725 + }, + { + "epoch": 1.9490981963927856, + "grad_norm": 34.451619381878224, + "learning_rate": 3.295926340426993e-06, + "loss": 2.49, + "step": 9726 + }, + { + "epoch": 1.9492985971943888, + "grad_norm": 25.737211828990027, + "learning_rate": 3.29483030311173e-06, + "loss": 2.7387, + "step": 9727 + }, + { + "epoch": 1.949498997995992, + "grad_norm": 21.27634295319216, + "learning_rate": 3.293734358509062e-06, + "loss": 2.5429, + "step": 9728 + }, + { + "epoch": 1.9496993987975952, + "grad_norm": 29.904313659777234, + "learning_rate": 3.292638506678577e-06, + "loss": 2.7727, + "step": 9729 + }, + { + "epoch": 1.9498997995991982, + "grad_norm": 31.79746221630646, + "learning_rate": 3.291542747679854e-06, + "loss": 2.6147, + "step": 9730 + }, + { + "epoch": 1.9501002004008017, + "grad_norm": 23.527577257635148, + "learning_rate": 3.290447081572476e-06, + "loss": 2.6609, + "step": 9731 + }, + { + "epoch": 1.9503006012024047, + "grad_norm": 24.843760268973572, + "learning_rate": 3.289351508416013e-06, + "loss": 3.3036, + "step": 9732 + }, + { + "epoch": 1.950501002004008, + "grad_norm": 19.43582911391942, + "learning_rate": 3.2882560282700338e-06, + "loss": 2.5095, + "step": 9733 + }, + { + "epoch": 1.950701402805611, + "grad_norm": 25.292353687883615, + "learning_rate": 3.2871606411941005e-06, + "loss": 3.2813, + "step": 9734 + }, + { + "epoch": 1.9509018036072145, + "grad_norm": 21.71088170529454, + "learning_rate": 3.2860653472477715e-06, + "loss": 2.3501, + "step": 9735 + }, + { + "epoch": 1.9511022044088175, + "grad_norm": 27.165925449513715, + "learning_rate": 3.284970146490601e-06, + "loss": 2.5029, + "step": 9736 + }, + { + "epoch": 1.951302605210421, + "grad_norm": 43.0795080905143, + "learning_rate": 3.283875038982133e-06, + "loss": 2.8807, + "step": 9737 + }, + { + "epoch": 1.951503006012024, + "grad_norm": 23.686484850546982, + "learning_rate": 3.282780024781914e-06, + "loss": 2.5229, + "step": 9738 + }, + { + "epoch": 1.9517034068136274, + "grad_norm": 21.23678874896301, + "learning_rate": 3.28168510394948e-06, + "loss": 2.702, + "step": 9739 + }, + { + "epoch": 1.9519038076152304, + "grad_norm": 24.04942986537425, + "learning_rate": 3.280590276544362e-06, + "loss": 2.8318, + "step": 9740 + }, + { + "epoch": 1.9521042084168336, + "grad_norm": 29.936109488224204, + "learning_rate": 3.279495542626089e-06, + "loss": 2.4691, + "step": 9741 + }, + { + "epoch": 1.9523046092184368, + "grad_norm": 20.19313035507218, + "learning_rate": 3.278400902254184e-06, + "loss": 2.6119, + "step": 9742 + }, + { + "epoch": 1.95250501002004, + "grad_norm": 25.998664522000386, + "learning_rate": 3.277306355488164e-06, + "loss": 2.406, + "step": 9743 + }, + { + "epoch": 1.9527054108216433, + "grad_norm": 31.53402674614796, + "learning_rate": 3.276211902387538e-06, + "loss": 2.2149, + "step": 9744 + }, + { + "epoch": 1.9529058116232465, + "grad_norm": 29.596670847779535, + "learning_rate": 3.275117543011818e-06, + "loss": 2.7911, + "step": 
9745 + }, + { + "epoch": 1.9531062124248497, + "grad_norm": 28.63130421991359, + "learning_rate": 3.274023277420501e-06, + "loss": 2.8606, + "step": 9746 + }, + { + "epoch": 1.953306613226453, + "grad_norm": 19.754528796549607, + "learning_rate": 3.272929105673087e-06, + "loss": 2.9177, + "step": 9747 + }, + { + "epoch": 1.9535070140280562, + "grad_norm": 47.07875057957712, + "learning_rate": 3.271835027829067e-06, + "loss": 2.5736, + "step": 9748 + }, + { + "epoch": 1.9537074148296594, + "grad_norm": 29.582458538893214, + "learning_rate": 3.2707410439479283e-06, + "loss": 2.3875, + "step": 9749 + }, + { + "epoch": 1.9539078156312626, + "grad_norm": 22.647713335369197, + "learning_rate": 3.2696471540891495e-06, + "loss": 2.4398, + "step": 9750 + }, + { + "epoch": 1.9541082164328656, + "grad_norm": 27.44038806996139, + "learning_rate": 3.2685533583122116e-06, + "loss": 2.9375, + "step": 9751 + }, + { + "epoch": 1.954308617234469, + "grad_norm": 35.070788659903016, + "learning_rate": 3.267459656676583e-06, + "loss": 2.4142, + "step": 9752 + }, + { + "epoch": 1.954509018036072, + "grad_norm": 31.92480135708644, + "learning_rate": 3.2663660492417294e-06, + "loss": 2.9673, + "step": 9753 + }, + { + "epoch": 1.9547094188376755, + "grad_norm": 29.0068945450599, + "learning_rate": 3.2652725360671123e-06, + "loss": 2.7132, + "step": 9754 + }, + { + "epoch": 1.9549098196392785, + "grad_norm": 31.17703832090043, + "learning_rate": 3.2641791172121893e-06, + "loss": 2.3925, + "step": 9755 + }, + { + "epoch": 1.955110220440882, + "grad_norm": 26.705858238901456, + "learning_rate": 3.2630857927364107e-06, + "loss": 2.9486, + "step": 9756 + }, + { + "epoch": 1.955310621242485, + "grad_norm": 25.51356498751549, + "learning_rate": 3.2619925626992195e-06, + "loss": 2.6683, + "step": 9757 + }, + { + "epoch": 1.9555110220440883, + "grad_norm": 21.361342083805678, + "learning_rate": 3.2608994271600597e-06, + "loss": 2.6223, + "step": 9758 + }, + { + "epoch": 1.9557114228456913, + "grad_norm": 27.409726716075266, + "learning_rate": 3.2598063861783646e-06, + "loss": 2.4859, + "step": 9759 + }, + { + "epoch": 1.9559118236472945, + "grad_norm": 20.91295395290065, + "learning_rate": 3.2587134398135654e-06, + "loss": 3.0854, + "step": 9760 + }, + { + "epoch": 1.9561122244488978, + "grad_norm": 21.18059933286351, + "learning_rate": 3.2576205881250857e-06, + "loss": 2.4801, + "step": 9761 + }, + { + "epoch": 1.956312625250501, + "grad_norm": 25.671135347619042, + "learning_rate": 3.2565278311723483e-06, + "loss": 2.9777, + "step": 9762 + }, + { + "epoch": 1.9565130260521042, + "grad_norm": 36.17058927594806, + "learning_rate": 3.255435169014766e-06, + "loss": 2.7087, + "step": 9763 + }, + { + "epoch": 1.9567134268537074, + "grad_norm": 23.320150488779507, + "learning_rate": 3.2543426017117476e-06, + "loss": 2.4054, + "step": 9764 + }, + { + "epoch": 1.9569138276553106, + "grad_norm": 25.563744232103108, + "learning_rate": 3.253250129322699e-06, + "loss": 2.4924, + "step": 9765 + }, + { + "epoch": 1.9571142284569139, + "grad_norm": 34.90801186832441, + "learning_rate": 3.2521577519070204e-06, + "loss": 2.7537, + "step": 9766 + }, + { + "epoch": 1.957314629258517, + "grad_norm": 40.11640649498448, + "learning_rate": 3.251065469524103e-06, + "loss": 3.0721, + "step": 9767 + }, + { + "epoch": 1.9575150300601203, + "grad_norm": 23.12714792339146, + "learning_rate": 3.2499732822333394e-06, + "loss": 2.5869, + "step": 9768 + }, + { + "epoch": 1.9577154308617235, + "grad_norm": 24.25303466664046, + "learning_rate": 
3.248881190094112e-06, + "loss": 2.808, + "step": 9769 + }, + { + "epoch": 1.9579158316633265, + "grad_norm": 29.902666988253713, + "learning_rate": 3.2477891931657984e-06, + "loss": 2.5726, + "step": 9770 + }, + { + "epoch": 1.95811623246493, + "grad_norm": 23.488306448741728, + "learning_rate": 3.2466972915077727e-06, + "loss": 2.1552, + "step": 9771 + }, + { + "epoch": 1.958316633266533, + "grad_norm": 35.67954605647382, + "learning_rate": 3.2456054851794035e-06, + "loss": 2.6494, + "step": 9772 + }, + { + "epoch": 1.9585170340681364, + "grad_norm": 30.90834867215171, + "learning_rate": 3.2445137742400544e-06, + "loss": 3.241, + "step": 9773 + }, + { + "epoch": 1.9587174348697394, + "grad_norm": 23.616139576194566, + "learning_rate": 3.243422158749081e-06, + "loss": 2.5395, + "step": 9774 + }, + { + "epoch": 1.9589178356713428, + "grad_norm": 20.906470562429423, + "learning_rate": 3.242330638765839e-06, + "loss": 2.8517, + "step": 9775 + }, + { + "epoch": 1.9591182364729458, + "grad_norm": 20.768800941058004, + "learning_rate": 3.241239214349676e-06, + "loss": 2.7945, + "step": 9776 + }, + { + "epoch": 1.9593186372745492, + "grad_norm": 20.330474500187695, + "learning_rate": 3.2401478855599304e-06, + "loss": 2.6573, + "step": 9777 + }, + { + "epoch": 1.9595190380761522, + "grad_norm": 31.132924913520796, + "learning_rate": 3.239056652455943e-06, + "loss": 3.2931, + "step": 9778 + }, + { + "epoch": 1.9597194388777555, + "grad_norm": 29.566313689192146, + "learning_rate": 3.237965515097047e-06, + "loss": 3.8275, + "step": 9779 + }, + { + "epoch": 1.9599198396793587, + "grad_norm": 21.895838251065925, + "learning_rate": 3.236874473542565e-06, + "loss": 2.4691, + "step": 9780 + }, + { + "epoch": 1.960120240480962, + "grad_norm": 27.55621821395974, + "learning_rate": 3.23578352785182e-06, + "loss": 2.7847, + "step": 9781 + }, + { + "epoch": 1.9603206412825651, + "grad_norm": 20.993206910006915, + "learning_rate": 3.2346926780841303e-06, + "loss": 2.348, + "step": 9782 + }, + { + "epoch": 1.9605210420841683, + "grad_norm": 28.315664133331314, + "learning_rate": 3.2336019242988036e-06, + "loss": 2.8914, + "step": 9783 + }, + { + "epoch": 1.9607214428857715, + "grad_norm": 24.848528564043114, + "learning_rate": 3.2325112665551496e-06, + "loss": 2.1893, + "step": 9784 + }, + { + "epoch": 1.9609218436873748, + "grad_norm": 31.3614879597278, + "learning_rate": 3.2314207049124656e-06, + "loss": 3.004, + "step": 9785 + }, + { + "epoch": 1.961122244488978, + "grad_norm": 22.00818607617665, + "learning_rate": 3.2303302394300495e-06, + "loss": 2.5496, + "step": 9786 + }, + { + "epoch": 1.9613226452905812, + "grad_norm": 23.567822714262853, + "learning_rate": 3.229239870167191e-06, + "loss": 2.7325, + "step": 9787 + }, + { + "epoch": 1.9615230460921844, + "grad_norm": 28.26479850169536, + "learning_rate": 3.2281495971831733e-06, + "loss": 2.9992, + "step": 9788 + }, + { + "epoch": 1.9617234468937874, + "grad_norm": 25.98340571637793, + "learning_rate": 3.2270594205372787e-06, + "loss": 2.5596, + "step": 9789 + }, + { + "epoch": 1.9619238476953909, + "grad_norm": 36.88665817931661, + "learning_rate": 3.2259693402887796e-06, + "loss": 2.5977, + "step": 9790 + }, + { + "epoch": 1.9621242484969939, + "grad_norm": 28.76547730113284, + "learning_rate": 3.224879356496945e-06, + "loss": 2.5854, + "step": 9791 + }, + { + "epoch": 1.9623246492985973, + "grad_norm": 21.422828938381606, + "learning_rate": 3.2237894692210423e-06, + "loss": 2.0784, + "step": 9792 + }, + { + "epoch": 1.9625250501002003, + "grad_norm": 
35.05698964533305, + "learning_rate": 3.2226996785203285e-06, + "loss": 2.1962, + "step": 9793 + }, + { + "epoch": 1.9627254509018037, + "grad_norm": 26.588826569156616, + "learning_rate": 3.2216099844540546e-06, + "loss": 2.3502, + "step": 9794 + }, + { + "epoch": 1.9629258517034067, + "grad_norm": 22.135316573815363, + "learning_rate": 3.2205203870814726e-06, + "loss": 2.2937, + "step": 9795 + }, + { + "epoch": 1.9631262525050102, + "grad_norm": 32.26451937941645, + "learning_rate": 3.2194308864618228e-06, + "loss": 2.9223, + "step": 9796 + }, + { + "epoch": 1.9633266533066132, + "grad_norm": 20.14404235353195, + "learning_rate": 3.2183414826543457e-06, + "loss": 2.7051, + "step": 9797 + }, + { + "epoch": 1.9635270541082166, + "grad_norm": 22.82739569086617, + "learning_rate": 3.2172521757182695e-06, + "loss": 2.7012, + "step": 9798 + }, + { + "epoch": 1.9637274549098196, + "grad_norm": 29.376475208733705, + "learning_rate": 3.216162965712827e-06, + "loss": 2.6245, + "step": 9799 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 25.144875854766582, + "learning_rate": 3.215073852697236e-06, + "loss": 2.8869, + "step": 9800 + }, + { + "epoch": 1.964128256513026, + "grad_norm": 23.79459690431327, + "learning_rate": 3.213984836730713e-06, + "loss": 2.3624, + "step": 9801 + }, + { + "epoch": 1.9643286573146292, + "grad_norm": 26.663784736092072, + "learning_rate": 3.212895917872473e-06, + "loss": 2.6985, + "step": 9802 + }, + { + "epoch": 1.9645290581162325, + "grad_norm": 19.040379345423744, + "learning_rate": 3.211807096181719e-06, + "loss": 2.8481, + "step": 9803 + }, + { + "epoch": 1.9647294589178357, + "grad_norm": 24.5658672850803, + "learning_rate": 3.2107183717176536e-06, + "loss": 2.7921, + "step": 9804 + }, + { + "epoch": 1.964929859719439, + "grad_norm": 23.69353593089996, + "learning_rate": 3.20962974453947e-06, + "loss": 2.9776, + "step": 9805 + }, + { + "epoch": 1.9651302605210421, + "grad_norm": 30.688306212141313, + "learning_rate": 3.208541214706361e-06, + "loss": 3.1188, + "step": 9806 + }, + { + "epoch": 1.9653306613226453, + "grad_norm": 23.004048938706525, + "learning_rate": 3.2074527822775115e-06, + "loss": 2.5084, + "step": 9807 + }, + { + "epoch": 1.9655310621242486, + "grad_norm": 22.86499540787984, + "learning_rate": 3.2063644473120976e-06, + "loss": 2.1186, + "step": 9808 + }, + { + "epoch": 1.9657314629258518, + "grad_norm": 27.85734096125147, + "learning_rate": 3.205276209869298e-06, + "loss": 2.3152, + "step": 9809 + }, + { + "epoch": 1.9659318637274548, + "grad_norm": 25.966303410638677, + "learning_rate": 3.2041880700082805e-06, + "loss": 2.8942, + "step": 9810 + }, + { + "epoch": 1.9661322645290582, + "grad_norm": 24.698826630949732, + "learning_rate": 3.203100027788207e-06, + "loss": 2.6454, + "step": 9811 + }, + { + "epoch": 1.9663326653306612, + "grad_norm": 23.77723384076702, + "learning_rate": 3.2020120832682387e-06, + "loss": 2.9116, + "step": 9812 + }, + { + "epoch": 1.9665330661322646, + "grad_norm": 33.445327056161524, + "learning_rate": 3.200924236507529e-06, + "loss": 3.0126, + "step": 9813 + }, + { + "epoch": 1.9667334669338676, + "grad_norm": 31.28815622686171, + "learning_rate": 3.1998364875652223e-06, + "loss": 2.9079, + "step": 9814 + }, + { + "epoch": 1.966933867735471, + "grad_norm": 28.969159031565447, + "learning_rate": 3.1987488365004627e-06, + "loss": 2.5295, + "step": 9815 + }, + { + "epoch": 1.967134268537074, + "grad_norm": 25.477629198031682, + "learning_rate": 3.197661283372388e-06, + "loss": 2.8132, + "step": 9816 + }, + { + 
"epoch": 1.9673346693386775, + "grad_norm": 29.059517634219162, + "learning_rate": 3.196573828240131e-06, + "loss": 2.4412, + "step": 9817 + }, + { + "epoch": 1.9675350701402805, + "grad_norm": 23.380588106632818, + "learning_rate": 3.195486471162815e-06, + "loss": 2.6766, + "step": 9818 + }, + { + "epoch": 1.9677354709418837, + "grad_norm": 25.81874284708127, + "learning_rate": 3.1943992121995647e-06, + "loss": 2.6909, + "step": 9819 + }, + { + "epoch": 1.967935871743487, + "grad_norm": 16.062119974673294, + "learning_rate": 3.1933120514094957e-06, + "loss": 2.7516, + "step": 9820 + }, + { + "epoch": 1.9681362725450902, + "grad_norm": 22.904751665965296, + "learning_rate": 3.192224988851714e-06, + "loss": 2.4042, + "step": 9821 + }, + { + "epoch": 1.9683366733466934, + "grad_norm": 31.78397253050052, + "learning_rate": 3.191138024585331e-06, + "loss": 3.1771, + "step": 9822 + }, + { + "epoch": 1.9685370741482966, + "grad_norm": 21.310155741517494, + "learning_rate": 3.1900511586694434e-06, + "loss": 2.4769, + "step": 9823 + }, + { + "epoch": 1.9687374749498998, + "grad_norm": 20.23497028364941, + "learning_rate": 3.188964391163146e-06, + "loss": 2.7893, + "step": 9824 + }, + { + "epoch": 1.968937875751503, + "grad_norm": 20.05180533696624, + "learning_rate": 3.1878777221255263e-06, + "loss": 3.0015, + "step": 9825 + }, + { + "epoch": 1.9691382765531062, + "grad_norm": 23.383855820033812, + "learning_rate": 3.1867911516156717e-06, + "loss": 2.1448, + "step": 9826 + }, + { + "epoch": 1.9693386773547095, + "grad_norm": 20.512110558933532, + "learning_rate": 3.185704679692658e-06, + "loss": 3.1082, + "step": 9827 + }, + { + "epoch": 1.9695390781563127, + "grad_norm": 24.250177677385874, + "learning_rate": 3.184618306415559e-06, + "loss": 3.0973, + "step": 9828 + }, + { + "epoch": 1.9697394789579157, + "grad_norm": 23.22376004939433, + "learning_rate": 3.183532031843442e-06, + "loss": 2.6754, + "step": 9829 + }, + { + "epoch": 1.9699398797595191, + "grad_norm": 21.55849518598093, + "learning_rate": 3.182445856035371e-06, + "loss": 2.9526, + "step": 9830 + }, + { + "epoch": 1.9701402805611221, + "grad_norm": 45.14052017256485, + "learning_rate": 3.181359779050402e-06, + "loss": 3.3816, + "step": 9831 + }, + { + "epoch": 1.9703406813627256, + "grad_norm": 76.4592828274386, + "learning_rate": 3.1802738009475843e-06, + "loss": 3.1403, + "step": 9832 + }, + { + "epoch": 1.9705410821643286, + "grad_norm": 20.83562111339318, + "learning_rate": 3.179187921785968e-06, + "loss": 2.6381, + "step": 9833 + }, + { + "epoch": 1.970741482965932, + "grad_norm": 25.5984734335804, + "learning_rate": 3.1781021416245917e-06, + "loss": 3.0732, + "step": 9834 + }, + { + "epoch": 1.970941883767535, + "grad_norm": 21.791198205738656, + "learning_rate": 3.1770164605224895e-06, + "loss": 2.5356, + "step": 9835 + }, + { + "epoch": 1.9711422845691384, + "grad_norm": 23.945692360558585, + "learning_rate": 3.1759308785386955e-06, + "loss": 2.2055, + "step": 9836 + }, + { + "epoch": 1.9713426853707414, + "grad_norm": 21.837946143025814, + "learning_rate": 3.174845395732232e-06, + "loss": 2.5113, + "step": 9837 + }, + { + "epoch": 1.9715430861723446, + "grad_norm": 29.07222332424024, + "learning_rate": 3.1737600121621172e-06, + "loss": 2.807, + "step": 9838 + }, + { + "epoch": 1.9717434869739479, + "grad_norm": 44.91561693781228, + "learning_rate": 3.1726747278873683e-06, + "loss": 3.0435, + "step": 9839 + }, + { + "epoch": 1.971943887775551, + "grad_norm": 29.25247800338675, + "learning_rate": 3.171589542966991e-06, + 
"loss": 2.7966, + "step": 9840 + }, + { + "epoch": 1.9721442885771543, + "grad_norm": 18.519308588477962, + "learning_rate": 3.1705044574599892e-06, + "loss": 2.1584, + "step": 9841 + }, + { + "epoch": 1.9723446893787575, + "grad_norm": 34.066761908609415, + "learning_rate": 3.169419471425359e-06, + "loss": 3.0193, + "step": 9842 + }, + { + "epoch": 1.9725450901803607, + "grad_norm": 32.84572366954362, + "learning_rate": 3.168334584922097e-06, + "loss": 2.5887, + "step": 9843 + }, + { + "epoch": 1.972745490981964, + "grad_norm": 20.54981062122061, + "learning_rate": 3.1672497980091864e-06, + "loss": 2.3997, + "step": 9844 + }, + { + "epoch": 1.9729458917835672, + "grad_norm": 26.318474544218773, + "learning_rate": 3.166165110745608e-06, + "loss": 2.626, + "step": 9845 + }, + { + "epoch": 1.9731462925851704, + "grad_norm": 31.82408705449996, + "learning_rate": 3.1650805231903424e-06, + "loss": 2.8712, + "step": 9846 + }, + { + "epoch": 1.9733466933867736, + "grad_norm": 26.19200052507153, + "learning_rate": 3.1639960354023557e-06, + "loss": 2.4266, + "step": 9847 + }, + { + "epoch": 1.9735470941883766, + "grad_norm": 38.004840559944356, + "learning_rate": 3.1629116474406154e-06, + "loss": 2.8767, + "step": 9848 + }, + { + "epoch": 1.97374749498998, + "grad_norm": 24.165330677594774, + "learning_rate": 3.161827359364079e-06, + "loss": 2.9232, + "step": 9849 + }, + { + "epoch": 1.973947895791583, + "grad_norm": 24.107677523638618, + "learning_rate": 3.1607431712317043e-06, + "loss": 3.2429, + "step": 9850 + }, + { + "epoch": 1.9741482965931865, + "grad_norm": 28.1481871823858, + "learning_rate": 3.159659083102439e-06, + "loss": 2.7973, + "step": 9851 + }, + { + "epoch": 1.9743486973947895, + "grad_norm": 33.39936678411602, + "learning_rate": 3.1585750950352227e-06, + "loss": 2.7153, + "step": 9852 + }, + { + "epoch": 1.974549098196393, + "grad_norm": 19.657761795122855, + "learning_rate": 3.1574912070889984e-06, + "loss": 2.3299, + "step": 9853 + }, + { + "epoch": 1.974749498997996, + "grad_norm": 28.064961726017767, + "learning_rate": 3.156407419322698e-06, + "loss": 2.3764, + "step": 9854 + }, + { + "epoch": 1.9749498997995993, + "grad_norm": 25.10748189433361, + "learning_rate": 3.1553237317952457e-06, + "loss": 2.5693, + "step": 9855 + }, + { + "epoch": 1.9751503006012023, + "grad_norm": 29.94053357938151, + "learning_rate": 3.154240144565567e-06, + "loss": 2.4579, + "step": 9856 + }, + { + "epoch": 1.9753507014028058, + "grad_norm": 27.045519930378436, + "learning_rate": 3.153156657692576e-06, + "loss": 2.6466, + "step": 9857 + }, + { + "epoch": 1.9755511022044088, + "grad_norm": 29.342100276018442, + "learning_rate": 3.1520732712351833e-06, + "loss": 2.0351, + "step": 9858 + }, + { + "epoch": 1.975751503006012, + "grad_norm": 25.806381044985987, + "learning_rate": 3.1509899852522942e-06, + "loss": 2.988, + "step": 9859 + }, + { + "epoch": 1.9759519038076152, + "grad_norm": 23.698721328326425, + "learning_rate": 3.149906799802809e-06, + "loss": 2.9682, + "step": 9860 + }, + { + "epoch": 1.9761523046092184, + "grad_norm": 20.534429405982483, + "learning_rate": 3.1488237149456226e-06, + "loss": 2.8385, + "step": 9861 + }, + { + "epoch": 1.9763527054108216, + "grad_norm": 28.236053443472343, + "learning_rate": 3.1477407307396225e-06, + "loss": 2.5098, + "step": 9862 + }, + { + "epoch": 1.9765531062124249, + "grad_norm": 28.51406033523413, + "learning_rate": 3.146657847243694e-06, + "loss": 2.684, + "step": 9863 + }, + { + "epoch": 1.976753507014028, + "grad_norm": 20.112243891462267, + 
"learning_rate": 3.145575064516715e-06, + "loss": 2.5189, + "step": 9864 + }, + { + "epoch": 1.9769539078156313, + "grad_norm": 28.99000928832079, + "learning_rate": 3.1444923826175546e-06, + "loss": 2.2106, + "step": 9865 + }, + { + "epoch": 1.9771543086172345, + "grad_norm": 21.900586425666262, + "learning_rate": 3.1434098016050828e-06, + "loss": 2.3509, + "step": 9866 + }, + { + "epoch": 1.9773547094188377, + "grad_norm": 31.223950344354463, + "learning_rate": 3.1423273215381622e-06, + "loss": 3.0313, + "step": 9867 + }, + { + "epoch": 1.977555110220441, + "grad_norm": 24.65749835831282, + "learning_rate": 3.1412449424756474e-06, + "loss": 2.1774, + "step": 9868 + }, + { + "epoch": 1.977755511022044, + "grad_norm": 34.68268666158839, + "learning_rate": 3.140162664476387e-06, + "loss": 3.2412, + "step": 9869 + }, + { + "epoch": 1.9779559118236474, + "grad_norm": 31.67011552871038, + "learning_rate": 3.13908048759923e-06, + "loss": 3.1739, + "step": 9870 + }, + { + "epoch": 1.9781563126252504, + "grad_norm": 22.451636655770706, + "learning_rate": 3.1379984119030126e-06, + "loss": 2.4846, + "step": 9871 + }, + { + "epoch": 1.9783567134268538, + "grad_norm": 83.88420375608962, + "learning_rate": 3.13691643744657e-06, + "loss": 2.8706, + "step": 9872 + }, + { + "epoch": 1.9785571142284568, + "grad_norm": 21.610152117739496, + "learning_rate": 3.1358345642887312e-06, + "loss": 2.8052, + "step": 9873 + }, + { + "epoch": 1.9787575150300603, + "grad_norm": 35.49245452488558, + "learning_rate": 3.13475279248832e-06, + "loss": 2.6075, + "step": 9874 + }, + { + "epoch": 1.9789579158316633, + "grad_norm": 25.899370192974395, + "learning_rate": 3.1336711221041523e-06, + "loss": 2.9025, + "step": 9875 + }, + { + "epoch": 1.9791583166332667, + "grad_norm": 25.964937806657165, + "learning_rate": 3.132589553195039e-06, + "loss": 2.6758, + "step": 9876 + }, + { + "epoch": 1.9793587174348697, + "grad_norm": 23.09017771007768, + "learning_rate": 3.13150808581979e-06, + "loss": 2.4303, + "step": 9877 + }, + { + "epoch": 1.979559118236473, + "grad_norm": 31.470455781055488, + "learning_rate": 3.1304267200372035e-06, + "loss": 2.4834, + "step": 9878 + }, + { + "epoch": 1.9797595190380761, + "grad_norm": 22.62442527469959, + "learning_rate": 3.1293454559060755e-06, + "loss": 2.6241, + "step": 9879 + }, + { + "epoch": 1.9799599198396793, + "grad_norm": 20.760687618715444, + "learning_rate": 3.1282642934851976e-06, + "loss": 2.848, + "step": 9880 + }, + { + "epoch": 1.9801603206412826, + "grad_norm": 36.128349538285306, + "learning_rate": 3.1271832328333534e-06, + "loss": 2.537, + "step": 9881 + }, + { + "epoch": 1.9803607214428858, + "grad_norm": 22.70285786278118, + "learning_rate": 3.1261022740093195e-06, + "loss": 2.0456, + "step": 9882 + }, + { + "epoch": 1.980561122244489, + "grad_norm": 27.617273803869264, + "learning_rate": 3.125021417071873e-06, + "loss": 2.7069, + "step": 9883 + }, + { + "epoch": 1.9807615230460922, + "grad_norm": 66.74714023913353, + "learning_rate": 3.1239406620797784e-06, + "loss": 2.9605, + "step": 9884 + }, + { + "epoch": 1.9809619238476954, + "grad_norm": 27.94207290601981, + "learning_rate": 3.1228600090918005e-06, + "loss": 2.4371, + "step": 9885 + }, + { + "epoch": 1.9811623246492986, + "grad_norm": 28.12083862554832, + "learning_rate": 3.1217794581666933e-06, + "loss": 2.5169, + "step": 9886 + }, + { + "epoch": 1.9813627254509019, + "grad_norm": 24.971396319499327, + "learning_rate": 3.1206990093632113e-06, + "loss": 3.2765, + "step": 9887 + }, + { + "epoch": 
1.9815631262525049, + "grad_norm": 25.14596869365056, + "learning_rate": 3.1196186627400983e-06, + "loss": 2.793, + "step": 9888 + }, + { + "epoch": 1.9817635270541083, + "grad_norm": 24.394203982386834, + "learning_rate": 3.118538418356092e-06, + "loss": 2.456, + "step": 9889 + }, + { + "epoch": 1.9819639278557113, + "grad_norm": 26.8773163299347, + "learning_rate": 3.11745827626993e-06, + "loss": 2.9829, + "step": 9890 + }, + { + "epoch": 1.9821643286573147, + "grad_norm": 23.4989978409374, + "learning_rate": 3.1163782365403416e-06, + "loss": 2.4041, + "step": 9891 + }, + { + "epoch": 1.9823647294589177, + "grad_norm": 23.460788899571163, + "learning_rate": 3.115298299226047e-06, + "loss": 2.8375, + "step": 9892 + }, + { + "epoch": 1.9825651302605212, + "grad_norm": 23.167924358437425, + "learning_rate": 3.1142184643857683e-06, + "loss": 2.5844, + "step": 9893 + }, + { + "epoch": 1.9827655310621242, + "grad_norm": 35.35412363304568, + "learning_rate": 3.113138732078215e-06, + "loss": 2.65, + "step": 9894 + }, + { + "epoch": 1.9829659318637276, + "grad_norm": 21.046100147938915, + "learning_rate": 3.112059102362093e-06, + "loss": 2.1374, + "step": 9895 + }, + { + "epoch": 1.9831663326653306, + "grad_norm": 21.515029863449758, + "learning_rate": 3.1109795752961046e-06, + "loss": 2.4141, + "step": 9896 + }, + { + "epoch": 1.9833667334669338, + "grad_norm": 19.168145664903818, + "learning_rate": 3.1099001509389453e-06, + "loss": 2.5016, + "step": 9897 + }, + { + "epoch": 1.983567134268537, + "grad_norm": 26.18908089925714, + "learning_rate": 3.108820829349306e-06, + "loss": 2.5619, + "step": 9898 + }, + { + "epoch": 1.9837675350701403, + "grad_norm": 34.54635492685449, + "learning_rate": 3.107741610585867e-06, + "loss": 3.147, + "step": 9899 + }, + { + "epoch": 1.9839679358717435, + "grad_norm": 31.57233483910179, + "learning_rate": 3.1066624947073127e-06, + "loss": 2.5774, + "step": 9900 + }, + { + "epoch": 1.9841683366733467, + "grad_norm": 26.85569353912431, + "learning_rate": 3.105583481772313e-06, + "loss": 3.0804, + "step": 9901 + }, + { + "epoch": 1.98436873747495, + "grad_norm": 25.49672758614005, + "learning_rate": 3.104504571839535e-06, + "loss": 2.723, + "step": 9902 + }, + { + "epoch": 1.9845691382765531, + "grad_norm": 18.75943141941242, + "learning_rate": 3.1034257649676404e-06, + "loss": 2.362, + "step": 9903 + }, + { + "epoch": 1.9847695390781563, + "grad_norm": 23.533131721920107, + "learning_rate": 3.1023470612152885e-06, + "loss": 2.8089, + "step": 9904 + }, + { + "epoch": 1.9849699398797596, + "grad_norm": 20.086384984258924, + "learning_rate": 3.101268460641128e-06, + "loss": 2.4074, + "step": 9905 + }, + { + "epoch": 1.9851703406813628, + "grad_norm": 24.361827968951733, + "learning_rate": 3.1001899633038024e-06, + "loss": 2.5935, + "step": 9906 + }, + { + "epoch": 1.9853707414829658, + "grad_norm": 31.613343976086714, + "learning_rate": 3.0991115692619543e-06, + "loss": 2.6606, + "step": 9907 + }, + { + "epoch": 1.9855711422845692, + "grad_norm": 21.97465954682825, + "learning_rate": 3.0980332785742155e-06, + "loss": 2.5753, + "step": 9908 + }, + { + "epoch": 1.9857715430861722, + "grad_norm": 57.03533434445272, + "learning_rate": 3.0969550912992152e-06, + "loss": 2.602, + "step": 9909 + }, + { + "epoch": 1.9859719438877756, + "grad_norm": 31.169374321097695, + "learning_rate": 3.0958770074955753e-06, + "loss": 2.7094, + "step": 9910 + }, + { + "epoch": 1.9861723446893786, + "grad_norm": 20.084218826370236, + "learning_rate": 3.094799027221914e-06, + "loss": 2.6489, + 
"step": 9911 + }, + { + "epoch": 1.986372745490982, + "grad_norm": 36.760143346944915, + "learning_rate": 3.0937211505368427e-06, + "loss": 2.9967, + "step": 9912 + }, + { + "epoch": 1.986573146292585, + "grad_norm": 32.21444551557656, + "learning_rate": 3.0926433774989638e-06, + "loss": 3.3719, + "step": 9913 + }, + { + "epoch": 1.9867735470941885, + "grad_norm": 31.670881144896367, + "learning_rate": 3.091565708166882e-06, + "loss": 2.8587, + "step": 9914 + }, + { + "epoch": 1.9869739478957915, + "grad_norm": 22.963508857124765, + "learning_rate": 3.090488142599189e-06, + "loss": 2.2079, + "step": 9915 + }, + { + "epoch": 1.987174348697395, + "grad_norm": 35.696987177910515, + "learning_rate": 3.089410680854473e-06, + "loss": 3.7726, + "step": 9916 + }, + { + "epoch": 1.987374749498998, + "grad_norm": 35.825489504223754, + "learning_rate": 3.0883333229913213e-06, + "loss": 2.6358, + "step": 9917 + }, + { + "epoch": 1.9875751503006012, + "grad_norm": 24.84924215034954, + "learning_rate": 3.0872560690683084e-06, + "loss": 2.4456, + "step": 9918 + }, + { + "epoch": 1.9877755511022044, + "grad_norm": 95.56225705260971, + "learning_rate": 3.0861789191440063e-06, + "loss": 2.8316, + "step": 9919 + }, + { + "epoch": 1.9879759519038076, + "grad_norm": 31.725536696761285, + "learning_rate": 3.0851018732769787e-06, + "loss": 2.5947, + "step": 9920 + }, + { + "epoch": 1.9881763527054108, + "grad_norm": 35.82526232238525, + "learning_rate": 3.084024931525791e-06, + "loss": 3.0986, + "step": 9921 + }, + { + "epoch": 1.988376753507014, + "grad_norm": 23.296550985407915, + "learning_rate": 3.082948093948997e-06, + "loss": 3.0374, + "step": 9922 + }, + { + "epoch": 1.9885771543086173, + "grad_norm": 44.30907046173293, + "learning_rate": 3.0818713606051427e-06, + "loss": 2.6426, + "step": 9923 + }, + { + "epoch": 1.9887775551102205, + "grad_norm": 19.22113383692428, + "learning_rate": 3.0807947315527752e-06, + "loss": 1.9837, + "step": 9924 + }, + { + "epoch": 1.9889779559118237, + "grad_norm": 40.81434175936837, + "learning_rate": 3.079718206850432e-06, + "loss": 3.2275, + "step": 9925 + }, + { + "epoch": 1.9891783567134267, + "grad_norm": 53.70608431737395, + "learning_rate": 3.0786417865566427e-06, + "loss": 2.4434, + "step": 9926 + }, + { + "epoch": 1.9893787575150301, + "grad_norm": 23.46225942626944, + "learning_rate": 3.0775654707299373e-06, + "loss": 2.5963, + "step": 9927 + }, + { + "epoch": 1.9895791583166331, + "grad_norm": 22.615122890070317, + "learning_rate": 3.0764892594288344e-06, + "loss": 2.1376, + "step": 9928 + }, + { + "epoch": 1.9897795591182366, + "grad_norm": 16.340989733491373, + "learning_rate": 3.0754131527118504e-06, + "loss": 2.5261, + "step": 9929 + }, + { + "epoch": 1.9899799599198396, + "grad_norm": 41.009823994412, + "learning_rate": 3.074337150637492e-06, + "loss": 2.307, + "step": 9930 + }, + { + "epoch": 1.990180360721443, + "grad_norm": 22.976868393771518, + "learning_rate": 3.0732612532642673e-06, + "loss": 2.7955, + "step": 9931 + }, + { + "epoch": 1.990380761523046, + "grad_norm": 17.234386370598113, + "learning_rate": 3.072185460650673e-06, + "loss": 2.5347, + "step": 9932 + }, + { + "epoch": 1.9905811623246494, + "grad_norm": 34.721434826908975, + "learning_rate": 3.0711097728551988e-06, + "loss": 2.2679, + "step": 9933 + }, + { + "epoch": 1.9907815631262524, + "grad_norm": 32.45618577597128, + "learning_rate": 3.070034189936334e-06, + "loss": 2.5997, + "step": 9934 + }, + { + "epoch": 1.9909819639278559, + "grad_norm": 26.40579716265768, + "learning_rate": 
3.0689587119525597e-06, + "loss": 2.3456, + "step": 9935 + }, + { + "epoch": 1.9911823647294589, + "grad_norm": 57.60469162287667, + "learning_rate": 3.0678833389623495e-06, + "loss": 3.2204, + "step": 9936 + }, + { + "epoch": 1.991382765531062, + "grad_norm": 38.94006728285486, + "learning_rate": 3.0668080710241766e-06, + "loss": 2.695, + "step": 9937 + }, + { + "epoch": 1.9915831663326653, + "grad_norm": 25.03734965006006, + "learning_rate": 3.0657329081965024e-06, + "loss": 2.7411, + "step": 9938 + }, + { + "epoch": 1.9917835671342685, + "grad_norm": 30.11583818486317, + "learning_rate": 3.064657850537784e-06, + "loss": 2.6679, + "step": 9939 + }, + { + "epoch": 1.9919839679358717, + "grad_norm": 24.94891589999128, + "learning_rate": 3.063582898106475e-06, + "loss": 2.509, + "step": 9940 + }, + { + "epoch": 1.992184368737475, + "grad_norm": 23.634694459767946, + "learning_rate": 3.062508050961022e-06, + "loss": 2.4, + "step": 9941 + }, + { + "epoch": 1.9923847695390782, + "grad_norm": 16.762724507447604, + "learning_rate": 3.061433309159868e-06, + "loss": 2.2319, + "step": 9942 + }, + { + "epoch": 1.9925851703406814, + "grad_norm": 54.57094005606004, + "learning_rate": 3.0603586727614445e-06, + "loss": 3.5357, + "step": 9943 + }, + { + "epoch": 1.9927855711422846, + "grad_norm": 26.765941446926337, + "learning_rate": 3.0592841418241843e-06, + "loss": 2.6126, + "step": 9944 + }, + { + "epoch": 1.9929859719438878, + "grad_norm": 31.153001394315005, + "learning_rate": 3.0582097164065105e-06, + "loss": 2.9592, + "step": 9945 + }, + { + "epoch": 1.993186372745491, + "grad_norm": 23.62223013025131, + "learning_rate": 3.0571353965668395e-06, + "loss": 2.6382, + "step": 9946 + }, + { + "epoch": 1.993386773547094, + "grad_norm": 27.129216432654868, + "learning_rate": 3.0560611823635845e-06, + "loss": 2.284, + "step": 9947 + }, + { + "epoch": 1.9935871743486975, + "grad_norm": 23.251226923115304, + "learning_rate": 3.0549870738551547e-06, + "loss": 2.3529, + "step": 9948 + }, + { + "epoch": 1.9937875751503005, + "grad_norm": 24.69872203867745, + "learning_rate": 3.0539130710999475e-06, + "loss": 2.9255, + "step": 9949 + }, + { + "epoch": 1.993987975951904, + "grad_norm": 29.894280131211914, + "learning_rate": 3.0528391741563583e-06, + "loss": 2.7861, + "step": 9950 + }, + { + "epoch": 1.994188376753507, + "grad_norm": 27.84784282388893, + "learning_rate": 3.0517653830827797e-06, + "loss": 2.1215, + "step": 9951 + }, + { + "epoch": 1.9943887775551103, + "grad_norm": 30.023444776147286, + "learning_rate": 3.0506916979375915e-06, + "loss": 2.2901, + "step": 9952 + }, + { + "epoch": 1.9945891783567133, + "grad_norm": 24.15398015186126, + "learning_rate": 3.0496181187791736e-06, + "loss": 2.8516, + "step": 9953 + }, + { + "epoch": 1.9947895791583168, + "grad_norm": 19.925626303319, + "learning_rate": 3.0485446456658972e-06, + "loss": 2.2834, + "step": 9954 + }, + { + "epoch": 1.9949899799599198, + "grad_norm": 19.580912749306766, + "learning_rate": 3.04747127865613e-06, + "loss": 2.5511, + "step": 9955 + }, + { + "epoch": 1.995190380761523, + "grad_norm": 23.42289770364762, + "learning_rate": 3.046398017808232e-06, + "loss": 2.7522, + "step": 9956 + }, + { + "epoch": 1.9953907815631262, + "grad_norm": 22.016587620504364, + "learning_rate": 3.0453248631805556e-06, + "loss": 2.657, + "step": 9957 + }, + { + "epoch": 1.9955911823647294, + "grad_norm": 26.201757048996175, + "learning_rate": 3.044251814831453e-06, + "loss": 2.6037, + "step": 9958 + }, + { + "epoch": 1.9957915831663327, + "grad_norm": 
26.13427557622181, + "learning_rate": 3.043178872819266e-06, + "loss": 2.7859, + "step": 9959 + }, + { + "epoch": 1.9959919839679359, + "grad_norm": 34.435836421108284, + "learning_rate": 3.04210603720233e-06, + "loss": 2.4467, + "step": 9960 + }, + { + "epoch": 1.996192384769539, + "grad_norm": 20.42751541920694, + "learning_rate": 3.041033308038982e-06, + "loss": 2.8797, + "step": 9961 + }, + { + "epoch": 1.9963927855711423, + "grad_norm": 25.21250204310548, + "learning_rate": 3.0399606853875452e-06, + "loss": 2.8838, + "step": 9962 + }, + { + "epoch": 1.9965931863727455, + "grad_norm": 22.527722359712143, + "learning_rate": 3.0388881693063384e-06, + "loss": 2.6826, + "step": 9963 + }, + { + "epoch": 1.9967935871743487, + "grad_norm": 34.0661399956834, + "learning_rate": 3.0378157598536746e-06, + "loss": 2.6229, + "step": 9964 + }, + { + "epoch": 1.996993987975952, + "grad_norm": 30.629567521630296, + "learning_rate": 3.036743457087865e-06, + "loss": 2.9118, + "step": 9965 + }, + { + "epoch": 1.997194388777555, + "grad_norm": 17.641344040539455, + "learning_rate": 3.0356712610672136e-06, + "loss": 2.6501, + "step": 9966 + }, + { + "epoch": 1.9973947895791584, + "grad_norm": 21.888582288673703, + "learning_rate": 3.0345991718500134e-06, + "loss": 2.8473, + "step": 9967 + }, + { + "epoch": 1.9975951903807614, + "grad_norm": 23.341161124172213, + "learning_rate": 3.0335271894945585e-06, + "loss": 2.775, + "step": 9968 + }, + { + "epoch": 1.9977955911823648, + "grad_norm": 26.9676776614702, + "learning_rate": 3.032455314059134e-06, + "loss": 3.157, + "step": 9969 + }, + { + "epoch": 1.9979959919839678, + "grad_norm": 34.49248504575222, + "learning_rate": 3.0313835456020158e-06, + "loss": 2.5239, + "step": 9970 + }, + { + "epoch": 1.9981963927855713, + "grad_norm": 38.976175230891116, + "learning_rate": 3.0303118841814827e-06, + "loss": 2.4609, + "step": 9971 + }, + { + "epoch": 1.9983967935871743, + "grad_norm": 32.45841842778999, + "learning_rate": 3.0292403298557983e-06, + "loss": 2.9371, + "step": 9972 + }, + { + "epoch": 1.9985971943887777, + "grad_norm": 18.347354472057038, + "learning_rate": 3.028168882683228e-06, + "loss": 2.1152, + "step": 9973 + }, + { + "epoch": 1.9987975951903807, + "grad_norm": 19.634879177972355, + "learning_rate": 3.027097542722024e-06, + "loss": 2.7214, + "step": 9974 + }, + { + "epoch": 1.9989979959919841, + "grad_norm": 26.37056528629884, + "learning_rate": 3.026026310030441e-06, + "loss": 2.642, + "step": 9975 + }, + { + "epoch": 1.9991983967935871, + "grad_norm": 23.11014104873385, + "learning_rate": 3.0249551846667207e-06, + "loss": 2.5558, + "step": 9976 + }, + { + "epoch": 1.9993987975951903, + "grad_norm": 24.34325754014661, + "learning_rate": 3.023884166689101e-06, + "loss": 2.5956, + "step": 9977 + }, + { + "epoch": 1.9995991983967936, + "grad_norm": 21.302474308868486, + "learning_rate": 3.022813256155817e-06, + "loss": 2.8694, + "step": 9978 + }, + { + "epoch": 1.9997995991983968, + "grad_norm": 25.00335661314402, + "learning_rate": 3.021742453125096e-06, + "loss": 2.9844, + "step": 9979 + }, + { + "epoch": 2.0, + "grad_norm": 17.524135138575637, + "learning_rate": 3.020671757655156e-06, + "loss": 2.1039, + "step": 9980 + }, + { + "epoch": 2.000200400801603, + "grad_norm": 20.404203802892685, + "learning_rate": 3.019601169804216e-06, + "loss": 2.0574, + "step": 9981 + }, + { + "epoch": 2.0004008016032064, + "grad_norm": 17.994772325571507, + "learning_rate": 3.0185306896304846e-06, + "loss": 1.6772, + "step": 9982 + }, + { + "epoch": 
2.0006012024048094, + "grad_norm": 17.765996311807793, + "learning_rate": 3.0174603171921624e-06, + "loss": 1.9408, + "step": 9983 + }, + { + "epoch": 2.000801603206413, + "grad_norm": 14.457538406617418, + "learning_rate": 3.01639005254745e-06, + "loss": 1.8035, + "step": 9984 + }, + { + "epoch": 2.001002004008016, + "grad_norm": 25.050630986519216, + "learning_rate": 3.0153198957545382e-06, + "loss": 1.6468, + "step": 9985 + }, + { + "epoch": 2.0012024048096193, + "grad_norm": 17.498180165158363, + "learning_rate": 3.0142498468716143e-06, + "loss": 1.8572, + "step": 9986 + }, + { + "epoch": 2.0014028056112223, + "grad_norm": 17.759943034869348, + "learning_rate": 3.0131799059568555e-06, + "loss": 1.1152, + "step": 9987 + }, + { + "epoch": 2.0016032064128257, + "grad_norm": 18.692880128644482, + "learning_rate": 3.0121100730684398e-06, + "loss": 1.3961, + "step": 9988 + }, + { + "epoch": 2.0018036072144287, + "grad_norm": 18.144497320436844, + "learning_rate": 3.0110403482645346e-06, + "loss": 1.755, + "step": 9989 + }, + { + "epoch": 2.002004008016032, + "grad_norm": 17.748511340662503, + "learning_rate": 3.0099707316033e-06, + "loss": 1.5268, + "step": 9990 + }, + { + "epoch": 2.002204408817635, + "grad_norm": 24.4254447905526, + "learning_rate": 3.008901223142893e-06, + "loss": 2.0166, + "step": 9991 + }, + { + "epoch": 2.0024048096192386, + "grad_norm": 19.207505596575643, + "learning_rate": 3.007831822941468e-06, + "loss": 1.6347, + "step": 9992 + }, + { + "epoch": 2.0026052104208416, + "grad_norm": 19.99717747566972, + "learning_rate": 3.006762531057168e-06, + "loss": 1.5975, + "step": 9993 + }, + { + "epoch": 2.002805611222445, + "grad_norm": 18.61375092351459, + "learning_rate": 3.0056933475481286e-06, + "loss": 1.6932, + "step": 9994 + }, + { + "epoch": 2.003006012024048, + "grad_norm": 16.74102266138557, + "learning_rate": 3.0046242724724885e-06, + "loss": 1.5329, + "step": 9995 + }, + { + "epoch": 2.0032064128256515, + "grad_norm": 14.399500049462763, + "learning_rate": 3.0035553058883706e-06, + "loss": 1.6906, + "step": 9996 + }, + { + "epoch": 2.0034068136272545, + "grad_norm": 21.201921335427357, + "learning_rate": 3.0024864478538983e-06, + "loss": 2.1461, + "step": 9997 + }, + { + "epoch": 2.003607214428858, + "grad_norm": 20.59274194461716, + "learning_rate": 3.001417698427186e-06, + "loss": 1.3047, + "step": 9998 + }, + { + "epoch": 2.003807615230461, + "grad_norm": 15.792358279185963, + "learning_rate": 3.0003490576663446e-06, + "loss": 1.7923, + "step": 9999 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 16.529654121811703, + "learning_rate": 2.9992805256294766e-06, + "loss": 1.8647, + "step": 10000 + }, + { + "epoch": 2.0042084168336673, + "grad_norm": 20.32898909920409, + "learning_rate": 2.998212102374678e-06, + "loss": 1.2709, + "step": 10001 + }, + { + "epoch": 2.0044088176352703, + "grad_norm": 22.631712395387183, + "learning_rate": 2.9971437879600447e-06, + "loss": 1.9068, + "step": 10002 + }, + { + "epoch": 2.004609218436874, + "grad_norm": 17.19047128942372, + "learning_rate": 2.9960755824436582e-06, + "loss": 1.5696, + "step": 10003 + }, + { + "epoch": 2.004809619238477, + "grad_norm": 36.53197811003067, + "learning_rate": 2.9950074858836e-06, + "loss": 1.7459, + "step": 10004 + }, + { + "epoch": 2.00501002004008, + "grad_norm": 21.757423822858662, + "learning_rate": 2.9939394983379464e-06, + "loss": 1.745, + "step": 10005 + }, + { + "epoch": 2.005210420841683, + "grad_norm": 17.532940025027717, + "learning_rate": 2.9928716198647633e-06, + "loss": 
1.5277, + "step": 10006 + }, + { + "epoch": 2.0054108216432867, + "grad_norm": 44.606561273276256, + "learning_rate": 2.9918038505221116e-06, + "loss": 1.7045, + "step": 10007 + }, + { + "epoch": 2.0056112224448897, + "grad_norm": 16.73599754634021, + "learning_rate": 2.99073619036805e-06, + "loss": 1.4296, + "step": 10008 + }, + { + "epoch": 2.005811623246493, + "grad_norm": 38.12913307350643, + "learning_rate": 2.989668639460627e-06, + "loss": 1.3151, + "step": 10009 + }, + { + "epoch": 2.006012024048096, + "grad_norm": 16.7846429619861, + "learning_rate": 2.988601197857889e-06, + "loss": 1.4772, + "step": 10010 + }, + { + "epoch": 2.0062124248496995, + "grad_norm": 17.016058472276235, + "learning_rate": 2.98753386561787e-06, + "loss": 1.6497, + "step": 10011 + }, + { + "epoch": 2.0064128256513025, + "grad_norm": 15.155937242959137, + "learning_rate": 2.9864666427986085e-06, + "loss": 1.5999, + "step": 10012 + }, + { + "epoch": 2.006613226452906, + "grad_norm": 29.206277502472727, + "learning_rate": 2.9853995294581272e-06, + "loss": 1.8173, + "step": 10013 + }, + { + "epoch": 2.006813627254509, + "grad_norm": 23.092294640413368, + "learning_rate": 2.9843325256544453e-06, + "loss": 1.5344, + "step": 10014 + }, + { + "epoch": 2.0070140280561124, + "grad_norm": 22.7233134811783, + "learning_rate": 2.983265631445582e-06, + "loss": 0.9153, + "step": 10015 + }, + { + "epoch": 2.0072144288577154, + "grad_norm": 40.50517538779395, + "learning_rate": 2.982198846889541e-06, + "loss": 1.5101, + "step": 10016 + }, + { + "epoch": 2.007414829659319, + "grad_norm": 19.31540147965609, + "learning_rate": 2.98113217204433e-06, + "loss": 1.5264, + "step": 10017 + }, + { + "epoch": 2.007615230460922, + "grad_norm": 20.42732477226361, + "learning_rate": 2.98006560696794e-06, + "loss": 1.5516, + "step": 10018 + }, + { + "epoch": 2.0078156312625253, + "grad_norm": 23.390850235279053, + "learning_rate": 2.978999151718367e-06, + "loss": 1.692, + "step": 10019 + }, + { + "epoch": 2.0080160320641283, + "grad_norm": 22.87964816626479, + "learning_rate": 2.977932806353592e-06, + "loss": 1.7917, + "step": 10020 + }, + { + "epoch": 2.0082164328657313, + "grad_norm": 23.55340138307102, + "learning_rate": 2.9768665709315947e-06, + "loss": 1.5821, + "step": 10021 + }, + { + "epoch": 2.0084168336673347, + "grad_norm": 17.389454062178206, + "learning_rate": 2.9758004455103494e-06, + "loss": 1.6802, + "step": 10022 + }, + { + "epoch": 2.0086172344689377, + "grad_norm": 81.40766025817418, + "learning_rate": 2.974734430147822e-06, + "loss": 1.9039, + "step": 10023 + }, + { + "epoch": 2.008817635270541, + "grad_norm": 20.915862517578205, + "learning_rate": 2.9736685249019713e-06, + "loss": 1.5771, + "step": 10024 + }, + { + "epoch": 2.009018036072144, + "grad_norm": 23.065021845672838, + "learning_rate": 2.972602729830757e-06, + "loss": 1.2672, + "step": 10025 + }, + { + "epoch": 2.0092184368737476, + "grad_norm": 18.67538880111262, + "learning_rate": 2.9715370449921246e-06, + "loss": 1.5796, + "step": 10026 + }, + { + "epoch": 2.0094188376753506, + "grad_norm": 17.16781093552157, + "learning_rate": 2.9704714704440153e-06, + "loss": 2.0295, + "step": 10027 + }, + { + "epoch": 2.009619238476954, + "grad_norm": 22.178871832089108, + "learning_rate": 2.969406006244368e-06, + "loss": 1.7686, + "step": 10028 + }, + { + "epoch": 2.009819639278557, + "grad_norm": 26.46134208755608, + "learning_rate": 2.9683406524511155e-06, + "loss": 1.6225, + "step": 10029 + }, + { + "epoch": 2.0100200400801604, + "grad_norm": 36.12519280737834, + 
"learning_rate": 2.9672754091221807e-06, + "loss": 1.7054, + "step": 10030 + }, + { + "epoch": 2.0102204408817634, + "grad_norm": 25.757272965671024, + "learning_rate": 2.96621027631548e-06, + "loss": 1.6038, + "step": 10031 + }, + { + "epoch": 2.010420841683367, + "grad_norm": 17.246076955838817, + "learning_rate": 2.9651452540889315e-06, + "loss": 0.9972, + "step": 10032 + }, + { + "epoch": 2.01062124248497, + "grad_norm": 79.6405908227737, + "learning_rate": 2.964080342500437e-06, + "loss": 2.3946, + "step": 10033 + }, + { + "epoch": 2.0108216432865733, + "grad_norm": 25.440414910287167, + "learning_rate": 2.963015541607902e-06, + "loss": 1.5009, + "step": 10034 + }, + { + "epoch": 2.0110220440881763, + "grad_norm": 31.28494319902213, + "learning_rate": 2.9619508514692163e-06, + "loss": 0.9582, + "step": 10035 + }, + { + "epoch": 2.0112224448897797, + "grad_norm": 15.630437395878483, + "learning_rate": 2.9608862721422732e-06, + "loss": 1.5606, + "step": 10036 + }, + { + "epoch": 2.0114228456913827, + "grad_norm": 20.502052019820734, + "learning_rate": 2.9598218036849536e-06, + "loss": 1.5728, + "step": 10037 + }, + { + "epoch": 2.011623246492986, + "grad_norm": 19.234374721999533, + "learning_rate": 2.9587574461551315e-06, + "loss": 1.2976, + "step": 10038 + }, + { + "epoch": 2.011823647294589, + "grad_norm": 43.27565362343937, + "learning_rate": 2.9576931996106833e-06, + "loss": 2.3269, + "step": 10039 + }, + { + "epoch": 2.012024048096192, + "grad_norm": 24.45726323618886, + "learning_rate": 2.9566290641094686e-06, + "loss": 2.1319, + "step": 10040 + }, + { + "epoch": 2.0122244488977956, + "grad_norm": 27.036123243296696, + "learning_rate": 2.9555650397093478e-06, + "loss": 1.6667, + "step": 10041 + }, + { + "epoch": 2.0124248496993986, + "grad_norm": 20.671163044154692, + "learning_rate": 2.954501126468175e-06, + "loss": 1.3156, + "step": 10042 + }, + { + "epoch": 2.012625250501002, + "grad_norm": 27.30719805127174, + "learning_rate": 2.953437324443796e-06, + "loss": 1.5547, + "step": 10043 + }, + { + "epoch": 2.012825651302605, + "grad_norm": 22.304076143205343, + "learning_rate": 2.95237363369405e-06, + "loss": 1.2702, + "step": 10044 + }, + { + "epoch": 2.0130260521042085, + "grad_norm": 28.24267758207819, + "learning_rate": 2.9513100542767704e-06, + "loss": 1.4955, + "step": 10045 + }, + { + "epoch": 2.0132264529058115, + "grad_norm": 37.989574701558055, + "learning_rate": 2.9502465862497887e-06, + "loss": 1.6653, + "step": 10046 + }, + { + "epoch": 2.013426853707415, + "grad_norm": 21.4708484164118, + "learning_rate": 2.949183229670926e-06, + "loss": 2.0608, + "step": 10047 + }, + { + "epoch": 2.013627254509018, + "grad_norm": 32.704222526558986, + "learning_rate": 2.948119984597997e-06, + "loss": 1.6625, + "step": 10048 + }, + { + "epoch": 2.0138276553106214, + "grad_norm": 22.41099858882244, + "learning_rate": 2.9470568510888148e-06, + "loss": 1.6176, + "step": 10049 + }, + { + "epoch": 2.0140280561122244, + "grad_norm": 18.046630966119054, + "learning_rate": 2.9459938292011824e-06, + "loss": 1.8117, + "step": 10050 + }, + { + "epoch": 2.014228456913828, + "grad_norm": 22.745440754000054, + "learning_rate": 2.9449309189928955e-06, + "loss": 1.5183, + "step": 10051 + }, + { + "epoch": 2.014428857715431, + "grad_norm": 24.37704873889077, + "learning_rate": 2.94386812052175e-06, + "loss": 1.7611, + "step": 10052 + }, + { + "epoch": 2.0146292585170342, + "grad_norm": 19.64001727346039, + "learning_rate": 2.942805433845529e-06, + "loss": 1.5455, + "step": 10053 + }, + { + "epoch": 
2.0148296593186372, + "grad_norm": 21.122737716158003, + "learning_rate": 2.9417428590220143e-06, + "loss": 1.4627, + "step": 10054 + }, + { + "epoch": 2.0150300601202407, + "grad_norm": 24.17912856044707, + "learning_rate": 2.940680396108976e-06, + "loss": 1.3951, + "step": 10055 + }, + { + "epoch": 2.0152304609218437, + "grad_norm": 19.67616752966884, + "learning_rate": 2.939618045164187e-06, + "loss": 1.4261, + "step": 10056 + }, + { + "epoch": 2.015430861723447, + "grad_norm": 24.02761368340905, + "learning_rate": 2.9385558062454063e-06, + "loss": 1.6426, + "step": 10057 + }, + { + "epoch": 2.01563126252505, + "grad_norm": 14.434615254719482, + "learning_rate": 2.937493679410387e-06, + "loss": 1.5076, + "step": 10058 + }, + { + "epoch": 2.015831663326653, + "grad_norm": 22.8444995742069, + "learning_rate": 2.936431664716882e-06, + "loss": 1.389, + "step": 10059 + }, + { + "epoch": 2.0160320641282565, + "grad_norm": 20.949016518799798, + "learning_rate": 2.935369762222635e-06, + "loss": 1.6833, + "step": 10060 + }, + { + "epoch": 2.0162324649298595, + "grad_norm": 15.243467073056568, + "learning_rate": 2.9343079719853816e-06, + "loss": 1.3449, + "step": 10061 + }, + { + "epoch": 2.016432865731463, + "grad_norm": 17.531317998645804, + "learning_rate": 2.9332462940628504e-06, + "loss": 1.6711, + "step": 10062 + }, + { + "epoch": 2.016633266533066, + "grad_norm": 61.76733080402768, + "learning_rate": 2.9321847285127714e-06, + "loss": 1.8003, + "step": 10063 + }, + { + "epoch": 2.0168336673346694, + "grad_norm": 29.511181337778314, + "learning_rate": 2.93112327539286e-06, + "loss": 1.5898, + "step": 10064 + }, + { + "epoch": 2.0170340681362724, + "grad_norm": 22.839972448693572, + "learning_rate": 2.9300619347608307e-06, + "loss": 1.2243, + "step": 10065 + }, + { + "epoch": 2.017234468937876, + "grad_norm": 25.898176552221262, + "learning_rate": 2.9290007066743886e-06, + "loss": 2.1498, + "step": 10066 + }, + { + "epoch": 2.017434869739479, + "grad_norm": 23.31841692825425, + "learning_rate": 2.927939591191237e-06, + "loss": 1.4488, + "step": 10067 + }, + { + "epoch": 2.0176352705410823, + "grad_norm": 28.416225225262217, + "learning_rate": 2.9268785883690655e-06, + "loss": 1.596, + "step": 10068 + }, + { + "epoch": 2.0178356713426853, + "grad_norm": 20.2041367837453, + "learning_rate": 2.9258176982655673e-06, + "loss": 1.061, + "step": 10069 + }, + { + "epoch": 2.0180360721442887, + "grad_norm": 17.67611222522796, + "learning_rate": 2.9247569209384235e-06, + "loss": 1.1822, + "step": 10070 + }, + { + "epoch": 2.0182364729458917, + "grad_norm": 17.26427745866539, + "learning_rate": 2.9236962564453077e-06, + "loss": 1.1032, + "step": 10071 + }, + { + "epoch": 2.018436873747495, + "grad_norm": 15.424825908027564, + "learning_rate": 2.92263570484389e-06, + "loss": 1.5138, + "step": 10072 + }, + { + "epoch": 2.018637274549098, + "grad_norm": 31.475037700016497, + "learning_rate": 2.9215752661918385e-06, + "loss": 1.4514, + "step": 10073 + }, + { + "epoch": 2.0188376753507016, + "grad_norm": 25.44183473189637, + "learning_rate": 2.9205149405468074e-06, + "loss": 1.4367, + "step": 10074 + }, + { + "epoch": 2.0190380761523046, + "grad_norm": 19.35779836137116, + "learning_rate": 2.9194547279664463e-06, + "loss": 1.692, + "step": 10075 + }, + { + "epoch": 2.019238476953908, + "grad_norm": 15.26306799265142, + "learning_rate": 2.9183946285084043e-06, + "loss": 1.2458, + "step": 10076 + }, + { + "epoch": 2.019438877755511, + "grad_norm": 22.428722280549028, + "learning_rate": 2.9173346422303206e-06, 
+ "loss": 1.7862, + "step": 10077 + }, + { + "epoch": 2.0196392785571144, + "grad_norm": 21.908304213202303, + "learning_rate": 2.9162747691898254e-06, + "loss": 1.7361, + "step": 10078 + }, + { + "epoch": 2.0198396793587174, + "grad_norm": 38.337760038343255, + "learning_rate": 2.9152150094445464e-06, + "loss": 1.7991, + "step": 10079 + }, + { + "epoch": 2.0200400801603204, + "grad_norm": 27.99136277369204, + "learning_rate": 2.9141553630521048e-06, + "loss": 1.3345, + "step": 10080 + }, + { + "epoch": 2.020240480961924, + "grad_norm": 40.644006700600514, + "learning_rate": 2.9130958300701144e-06, + "loss": 1.1703, + "step": 10081 + }, + { + "epoch": 2.020440881763527, + "grad_norm": 24.21950838016862, + "learning_rate": 2.912036410556185e-06, + "loss": 1.892, + "step": 10082 + }, + { + "epoch": 2.0206412825651303, + "grad_norm": 25.926716863517125, + "learning_rate": 2.910977104567918e-06, + "loss": 1.5246, + "step": 10083 + }, + { + "epoch": 2.0208416833667333, + "grad_norm": 28.571719661583305, + "learning_rate": 2.9099179121629116e-06, + "loss": 1.7819, + "step": 10084 + }, + { + "epoch": 2.0210420841683367, + "grad_norm": 41.31034110082095, + "learning_rate": 2.9088588333987487e-06, + "loss": 0.9604, + "step": 10085 + }, + { + "epoch": 2.0212424849699397, + "grad_norm": 21.708967110365425, + "learning_rate": 2.907799868333023e-06, + "loss": 2.1013, + "step": 10086 + }, + { + "epoch": 2.021442885771543, + "grad_norm": 25.29735928090178, + "learning_rate": 2.9067410170233045e-06, + "loss": 1.9495, + "step": 10087 + }, + { + "epoch": 2.021643286573146, + "grad_norm": 21.4563272000922, + "learning_rate": 2.905682279527167e-06, + "loss": 1.8097, + "step": 10088 + }, + { + "epoch": 2.0218436873747496, + "grad_norm": 28.375966962936847, + "learning_rate": 2.9046236559021744e-06, + "loss": 1.1299, + "step": 10089 + }, + { + "epoch": 2.0220440881763526, + "grad_norm": 16.532217713025883, + "learning_rate": 2.903565146205888e-06, + "loss": 1.6592, + "step": 10090 + }, + { + "epoch": 2.022244488977956, + "grad_norm": 21.565136736057624, + "learning_rate": 2.9025067504958606e-06, + "loss": 1.7905, + "step": 10091 + }, + { + "epoch": 2.022444889779559, + "grad_norm": 20.879305433161505, + "learning_rate": 2.9014484688296335e-06, + "loss": 2.1883, + "step": 10092 + }, + { + "epoch": 2.0226452905811625, + "grad_norm": 21.13225393890143, + "learning_rate": 2.900390301264753e-06, + "loss": 1.4936, + "step": 10093 + }, + { + "epoch": 2.0228456913827655, + "grad_norm": 25.985275100660534, + "learning_rate": 2.8993322478587526e-06, + "loss": 1.6253, + "step": 10094 + }, + { + "epoch": 2.023046092184369, + "grad_norm": 22.387094016449645, + "learning_rate": 2.8982743086691554e-06, + "loss": 1.3274, + "step": 10095 + }, + { + "epoch": 2.023246492985972, + "grad_norm": 22.962359020697683, + "learning_rate": 2.8972164837534898e-06, + "loss": 1.4042, + "step": 10096 + }, + { + "epoch": 2.0234468937875754, + "grad_norm": 20.604542706306276, + "learning_rate": 2.8961587731692666e-06, + "loss": 1.4529, + "step": 10097 + }, + { + "epoch": 2.0236472945891784, + "grad_norm": 66.99531653737371, + "learning_rate": 2.895101176973997e-06, + "loss": 1.7075, + "step": 10098 + }, + { + "epoch": 2.0238476953907814, + "grad_norm": 18.1122227197669, + "learning_rate": 2.894043695225182e-06, + "loss": 1.5767, + "step": 10099 + }, + { + "epoch": 2.024048096192385, + "grad_norm": 20.521472224302485, + "learning_rate": 2.8929863279803205e-06, + "loss": 1.385, + "step": 10100 + }, + { + "epoch": 2.024248496993988, + "grad_norm": 
20.660301762962796, + "learning_rate": 2.8919290752969053e-06, + "loss": 1.0998, + "step": 10101 + }, + { + "epoch": 2.0244488977955912, + "grad_norm": 25.006752159668018, + "learning_rate": 2.8908719372324134e-06, + "loss": 1.9068, + "step": 10102 + }, + { + "epoch": 2.0246492985971942, + "grad_norm": 25.919282104852574, + "learning_rate": 2.8898149138443322e-06, + "loss": 1.8486, + "step": 10103 + }, + { + "epoch": 2.0248496993987977, + "grad_norm": 28.98841959583552, + "learning_rate": 2.8887580051901265e-06, + "loss": 2.1105, + "step": 10104 + }, + { + "epoch": 2.0250501002004007, + "grad_norm": 18.9631536600795, + "learning_rate": 2.8877012113272645e-06, + "loss": 1.7383, + "step": 10105 + }, + { + "epoch": 2.025250501002004, + "grad_norm": 24.265592684509915, + "learning_rate": 2.886644532313206e-06, + "loss": 1.1769, + "step": 10106 + }, + { + "epoch": 2.025450901803607, + "grad_norm": 19.262477162058488, + "learning_rate": 2.8855879682054033e-06, + "loss": 1.6355, + "step": 10107 + }, + { + "epoch": 2.0256513026052105, + "grad_norm": 42.35074017738227, + "learning_rate": 2.8845315190613067e-06, + "loss": 1.8838, + "step": 10108 + }, + { + "epoch": 2.0258517034068135, + "grad_norm": 19.694058337884414, + "learning_rate": 2.8834751849383496e-06, + "loss": 1.6074, + "step": 10109 + }, + { + "epoch": 2.026052104208417, + "grad_norm": 18.575599103992367, + "learning_rate": 2.8824189658939757e-06, + "loss": 1.6147, + "step": 10110 + }, + { + "epoch": 2.02625250501002, + "grad_norm": 19.929826054848483, + "learning_rate": 2.881362861985606e-06, + "loss": 1.6289, + "step": 10111 + }, + { + "epoch": 2.0264529058116234, + "grad_norm": 16.560623740406385, + "learning_rate": 2.880306873270664e-06, + "loss": 1.8379, + "step": 10112 + }, + { + "epoch": 2.0266533066132264, + "grad_norm": 20.32950290328266, + "learning_rate": 2.8792509998065676e-06, + "loss": 1.4086, + "step": 10113 + }, + { + "epoch": 2.02685370741483, + "grad_norm": 24.92822249738661, + "learning_rate": 2.878195241650724e-06, + "loss": 1.9038, + "step": 10114 + }, + { + "epoch": 2.027054108216433, + "grad_norm": 27.69770349047982, + "learning_rate": 2.87713959886054e-06, + "loss": 2.0563, + "step": 10115 + }, + { + "epoch": 2.0272545090180363, + "grad_norm": 18.518295976379864, + "learning_rate": 2.8760840714934045e-06, + "loss": 1.1849, + "step": 10116 + }, + { + "epoch": 2.0274549098196393, + "grad_norm": 17.047183383184475, + "learning_rate": 2.8750286596067186e-06, + "loss": 1.1264, + "step": 10117 + }, + { + "epoch": 2.0276553106212423, + "grad_norm": 20.37032296857586, + "learning_rate": 2.873973363257858e-06, + "loss": 1.5586, + "step": 10118 + }, + { + "epoch": 2.0278557114228457, + "grad_norm": 36.50066663127176, + "learning_rate": 2.872918182504203e-06, + "loss": 1.6471, + "step": 10119 + }, + { + "epoch": 2.0280561122244487, + "grad_norm": 16.885126492412777, + "learning_rate": 2.8718631174031296e-06, + "loss": 0.9746, + "step": 10120 + }, + { + "epoch": 2.028256513026052, + "grad_norm": 24.407608138663218, + "learning_rate": 2.8708081680119983e-06, + "loss": 1.354, + "step": 10121 + }, + { + "epoch": 2.028456913827655, + "grad_norm": 22.86556874506125, + "learning_rate": 2.869753334388171e-06, + "loss": 1.5955, + "step": 10122 + }, + { + "epoch": 2.0286573146292586, + "grad_norm": 29.318943884898594, + "learning_rate": 2.8686986165889964e-06, + "loss": 1.8133, + "step": 10123 + }, + { + "epoch": 2.0288577154308616, + "grad_norm": 23.135978831488625, + "learning_rate": 2.8676440146718255e-06, + "loss": 1.2527, + 
"step": 10124 + }, + { + "epoch": 2.029058116232465, + "grad_norm": 22.72378744747207, + "learning_rate": 2.8665895286939994e-06, + "loss": 1.3897, + "step": 10125 + }, + { + "epoch": 2.029258517034068, + "grad_norm": 29.79610122670147, + "learning_rate": 2.865535158712846e-06, + "loss": 1.4133, + "step": 10126 + }, + { + "epoch": 2.0294589178356714, + "grad_norm": 27.68638041904442, + "learning_rate": 2.8644809047857e-06, + "loss": 1.3709, + "step": 10127 + }, + { + "epoch": 2.0296593186372744, + "grad_norm": 16.486295517666672, + "learning_rate": 2.863426766969879e-06, + "loss": 1.5236, + "step": 10128 + }, + { + "epoch": 2.029859719438878, + "grad_norm": 25.894435151005617, + "learning_rate": 2.862372745322698e-06, + "loss": 1.627, + "step": 10129 + }, + { + "epoch": 2.030060120240481, + "grad_norm": 19.018777280698163, + "learning_rate": 2.8613188399014657e-06, + "loss": 2.1884, + "step": 10130 + }, + { + "epoch": 2.0302605210420843, + "grad_norm": 20.66580843752643, + "learning_rate": 2.8602650507634854e-06, + "loss": 1.0226, + "step": 10131 + }, + { + "epoch": 2.0304609218436873, + "grad_norm": 29.233829509367208, + "learning_rate": 2.859211377966055e-06, + "loss": 1.3773, + "step": 10132 + }, + { + "epoch": 2.0306613226452908, + "grad_norm": 29.256831765014734, + "learning_rate": 2.8581578215664573e-06, + "loss": 1.433, + "step": 10133 + }, + { + "epoch": 2.0308617234468938, + "grad_norm": 18.89484507009607, + "learning_rate": 2.857104381621985e-06, + "loss": 1.4729, + "step": 10134 + }, + { + "epoch": 2.031062124248497, + "grad_norm": 31.860611442767013, + "learning_rate": 2.8560510581899092e-06, + "loss": 1.3517, + "step": 10135 + }, + { + "epoch": 2.0312625250501, + "grad_norm": 27.41174335250255, + "learning_rate": 2.854997851327501e-06, + "loss": 1.9015, + "step": 10136 + }, + { + "epoch": 2.031462925851703, + "grad_norm": 22.975292734188283, + "learning_rate": 2.8539447610920267e-06, + "loss": 1.7716, + "step": 10137 + }, + { + "epoch": 2.0316633266533066, + "grad_norm": 24.126204070708564, + "learning_rate": 2.8528917875407435e-06, + "loss": 1.8346, + "step": 10138 + }, + { + "epoch": 2.0318637274549096, + "grad_norm": 21.377549354547416, + "learning_rate": 2.8518389307309034e-06, + "loss": 1.8568, + "step": 10139 + }, + { + "epoch": 2.032064128256513, + "grad_norm": 23.271134216762352, + "learning_rate": 2.8507861907197514e-06, + "loss": 1.6807, + "step": 10140 + }, + { + "epoch": 2.032264529058116, + "grad_norm": 19.28127779856633, + "learning_rate": 2.8497335675645287e-06, + "loss": 1.5898, + "step": 10141 + }, + { + "epoch": 2.0324649298597195, + "grad_norm": 24.14432722570944, + "learning_rate": 2.8486810613224643e-06, + "loss": 1.5201, + "step": 10142 + }, + { + "epoch": 2.0326653306613225, + "grad_norm": 20.266637699076874, + "learning_rate": 2.8476286720507863e-06, + "loss": 1.7699, + "step": 10143 + }, + { + "epoch": 2.032865731462926, + "grad_norm": 16.78464297018986, + "learning_rate": 2.846576399806714e-06, + "loss": 1.489, + "step": 10144 + }, + { + "epoch": 2.033066132264529, + "grad_norm": 39.15374309185407, + "learning_rate": 2.845524244647462e-06, + "loss": 2.2249, + "step": 10145 + }, + { + "epoch": 2.0332665330661324, + "grad_norm": 88.52321171542364, + "learning_rate": 2.8444722066302368e-06, + "loss": 1.878, + "step": 10146 + }, + { + "epoch": 2.0334669338677354, + "grad_norm": 32.25885693594719, + "learning_rate": 2.8434202858122393e-06, + "loss": 2.1255, + "step": 10147 + }, + { + "epoch": 2.033667334669339, + "grad_norm": 24.47168754483374, + 
"learning_rate": 2.842368482250667e-06, + "loss": 1.5542, + "step": 10148 + }, + { + "epoch": 2.033867735470942, + "grad_norm": 18.323540178280584, + "learning_rate": 2.8413167960027035e-06, + "loss": 1.3698, + "step": 10149 + }, + { + "epoch": 2.0340681362725452, + "grad_norm": 24.92472129744508, + "learning_rate": 2.8402652271255293e-06, + "loss": 1.7239, + "step": 10150 + }, + { + "epoch": 2.0342685370741482, + "grad_norm": 56.31864925030471, + "learning_rate": 2.839213775676328e-06, + "loss": 1.7538, + "step": 10151 + }, + { + "epoch": 2.0344689378757517, + "grad_norm": 35.23082775383756, + "learning_rate": 2.83816244171226e-06, + "loss": 1.2516, + "step": 10152 + }, + { + "epoch": 2.0346693386773547, + "grad_norm": 18.08744073712077, + "learning_rate": 2.837111225290492e-06, + "loss": 1.5152, + "step": 10153 + }, + { + "epoch": 2.034869739478958, + "grad_norm": 16.464175223700764, + "learning_rate": 2.83606012646818e-06, + "loss": 1.6046, + "step": 10154 + }, + { + "epoch": 2.035070140280561, + "grad_norm": 20.676397242405976, + "learning_rate": 2.835009145302472e-06, + "loss": 1.5238, + "step": 10155 + }, + { + "epoch": 2.0352705410821645, + "grad_norm": 46.33906873343864, + "learning_rate": 2.833958281850513e-06, + "loss": 1.9816, + "step": 10156 + }, + { + "epoch": 2.0354709418837675, + "grad_norm": 19.472610980599555, + "learning_rate": 2.832907536169439e-06, + "loss": 1.6394, + "step": 10157 + }, + { + "epoch": 2.0356713426853705, + "grad_norm": 18.351360867008832, + "learning_rate": 2.8318569083163836e-06, + "loss": 1.4821, + "step": 10158 + }, + { + "epoch": 2.035871743486974, + "grad_norm": 17.31818684918144, + "learning_rate": 2.8308063983484674e-06, + "loss": 1.4736, + "step": 10159 + }, + { + "epoch": 2.036072144288577, + "grad_norm": 18.076778387496862, + "learning_rate": 2.8297560063228084e-06, + "loss": 1.6658, + "step": 10160 + }, + { + "epoch": 2.0362725450901804, + "grad_norm": 22.61799341389822, + "learning_rate": 2.8287057322965193e-06, + "loss": 1.8821, + "step": 10161 + }, + { + "epoch": 2.0364729458917834, + "grad_norm": 24.944851792808976, + "learning_rate": 2.8276555763267043e-06, + "loss": 1.5327, + "step": 10162 + }, + { + "epoch": 2.036673346693387, + "grad_norm": 23.12447374402463, + "learning_rate": 2.8266055384704626e-06, + "loss": 1.7105, + "step": 10163 + }, + { + "epoch": 2.03687374749499, + "grad_norm": 22.4693489486917, + "learning_rate": 2.8255556187848864e-06, + "loss": 1.0777, + "step": 10164 + }, + { + "epoch": 2.0370741482965933, + "grad_norm": 19.108213991234834, + "learning_rate": 2.8245058173270623e-06, + "loss": 1.485, + "step": 10165 + }, + { + "epoch": 2.0372745490981963, + "grad_norm": 27.482072406001265, + "learning_rate": 2.8234561341540643e-06, + "loss": 1.5829, + "step": 10166 + }, + { + "epoch": 2.0374749498997997, + "grad_norm": 20.958574483411585, + "learning_rate": 2.822406569322974e-06, + "loss": 1.1962, + "step": 10167 + }, + { + "epoch": 2.0376753507014027, + "grad_norm": 19.48433554752806, + "learning_rate": 2.82135712289085e-06, + "loss": 1.6224, + "step": 10168 + }, + { + "epoch": 2.037875751503006, + "grad_norm": 20.17843938214541, + "learning_rate": 2.820307794914756e-06, + "loss": 1.0203, + "step": 10169 + }, + { + "epoch": 2.038076152304609, + "grad_norm": 27.72494979748671, + "learning_rate": 2.819258585451744e-06, + "loss": 1.8622, + "step": 10170 + }, + { + "epoch": 2.0382765531062126, + "grad_norm": 21.19624268135098, + "learning_rate": 2.8182094945588616e-06, + "loss": 1.2679, + "step": 10171 + }, + { + "epoch": 
2.0384769539078156, + "grad_norm": 27.784613364938625, + "learning_rate": 2.817160522293152e-06, + "loss": 1.6469, + "step": 10172 + }, + { + "epoch": 2.038677354709419, + "grad_norm": 160.7412908312189, + "learning_rate": 2.816111668711643e-06, + "loss": 1.5943, + "step": 10173 + }, + { + "epoch": 2.038877755511022, + "grad_norm": 19.612798096302477, + "learning_rate": 2.8150629338713704e-06, + "loss": 1.6998, + "step": 10174 + }, + { + "epoch": 2.0390781563126255, + "grad_norm": 30.87280964627112, + "learning_rate": 2.814014317829349e-06, + "loss": 1.6743, + "step": 10175 + }, + { + "epoch": 2.0392785571142285, + "grad_norm": 22.791404981766757, + "learning_rate": 2.8129658206425957e-06, + "loss": 1.2419, + "step": 10176 + }, + { + "epoch": 2.0394789579158314, + "grad_norm": 20.320689195435946, + "learning_rate": 2.81191744236812e-06, + "loss": 1.5715, + "step": 10177 + }, + { + "epoch": 2.039679358717435, + "grad_norm": 26.10518585364093, + "learning_rate": 2.810869183062922e-06, + "loss": 1.5566, + "step": 10178 + }, + { + "epoch": 2.039879759519038, + "grad_norm": 28.99847802465883, + "learning_rate": 2.8098210427840006e-06, + "loss": 2.1494, + "step": 10179 + }, + { + "epoch": 2.0400801603206413, + "grad_norm": 26.422369095553048, + "learning_rate": 2.808773021588338e-06, + "loss": 1.5109, + "step": 10180 + }, + { + "epoch": 2.0402805611222443, + "grad_norm": 20.774190868480055, + "learning_rate": 2.8077251195329226e-06, + "loss": 1.2538, + "step": 10181 + }, + { + "epoch": 2.0404809619238478, + "grad_norm": 20.544031102784057, + "learning_rate": 2.80667733667473e-06, + "loss": 1.0421, + "step": 10182 + }, + { + "epoch": 2.0406813627254508, + "grad_norm": 21.99416203602339, + "learning_rate": 2.8056296730707254e-06, + "loss": 1.8269, + "step": 10183 + }, + { + "epoch": 2.040881763527054, + "grad_norm": 20.682872321601277, + "learning_rate": 2.804582128777878e-06, + "loss": 1.3469, + "step": 10184 + }, + { + "epoch": 2.041082164328657, + "grad_norm": 22.0025984310698, + "learning_rate": 2.803534703853139e-06, + "loss": 1.6785, + "step": 10185 + }, + { + "epoch": 2.0412825651302606, + "grad_norm": 18.42197165817625, + "learning_rate": 2.8024873983534606e-06, + "loss": 1.149, + "step": 10186 + }, + { + "epoch": 2.0414829659318636, + "grad_norm": 45.927335487620226, + "learning_rate": 2.801440212335786e-06, + "loss": 1.9612, + "step": 10187 + }, + { + "epoch": 2.041683366733467, + "grad_norm": 20.365504887946404, + "learning_rate": 2.800393145857052e-06, + "loss": 1.4095, + "step": 10188 + }, + { + "epoch": 2.04188376753507, + "grad_norm": 24.215989505756962, + "learning_rate": 2.7993461989741926e-06, + "loss": 1.4757, + "step": 10189 + }, + { + "epoch": 2.0420841683366735, + "grad_norm": 28.84616013596324, + "learning_rate": 2.7982993717441243e-06, + "loss": 1.4539, + "step": 10190 + }, + { + "epoch": 2.0422845691382765, + "grad_norm": 19.870440509547777, + "learning_rate": 2.797252664223773e-06, + "loss": 1.4714, + "step": 10191 + }, + { + "epoch": 2.04248496993988, + "grad_norm": 16.880566003967825, + "learning_rate": 2.796206076470044e-06, + "loss": 1.4226, + "step": 10192 + }, + { + "epoch": 2.042685370741483, + "grad_norm": 23.29652802936135, + "learning_rate": 2.795159608539844e-06, + "loss": 1.3142, + "step": 10193 + }, + { + "epoch": 2.0428857715430864, + "grad_norm": 26.01292408394737, + "learning_rate": 2.7941132604900707e-06, + "loss": 1.4481, + "step": 10194 + }, + { + "epoch": 2.0430861723446894, + "grad_norm": 22.27661724726572, + "learning_rate": 2.7930670323776156e-06, + 
"loss": 1.1005, + "step": 10195 + }, + { + "epoch": 2.0432865731462924, + "grad_norm": 20.85500207265274, + "learning_rate": 2.7920209242593655e-06, + "loss": 1.8469, + "step": 10196 + }, + { + "epoch": 2.043486973947896, + "grad_norm": 27.50673042017849, + "learning_rate": 2.7909749361921924e-06, + "loss": 1.4469, + "step": 10197 + }, + { + "epoch": 2.043687374749499, + "grad_norm": 17.838208352766618, + "learning_rate": 2.789929068232978e-06, + "loss": 1.7841, + "step": 10198 + }, + { + "epoch": 2.0438877755511022, + "grad_norm": 18.148187036351406, + "learning_rate": 2.7888833204385803e-06, + "loss": 1.2039, + "step": 10199 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 25.101933631059907, + "learning_rate": 2.7878376928658597e-06, + "loss": 1.3404, + "step": 10200 + }, + { + "epoch": 2.0442885771543087, + "grad_norm": 19.22093214518569, + "learning_rate": 2.7867921855716694e-06, + "loss": 1.4581, + "step": 10201 + }, + { + "epoch": 2.0444889779559117, + "grad_norm": 17.470213648687523, + "learning_rate": 2.785746798612855e-06, + "loss": 1.903, + "step": 10202 + }, + { + "epoch": 2.044689378757515, + "grad_norm": 22.187064257944076, + "learning_rate": 2.784701532046258e-06, + "loss": 1.3153, + "step": 10203 + }, + { + "epoch": 2.044889779559118, + "grad_norm": 22.716308820111873, + "learning_rate": 2.7836563859287045e-06, + "loss": 1.5997, + "step": 10204 + }, + { + "epoch": 2.0450901803607215, + "grad_norm": 17.1248680485248, + "learning_rate": 2.782611360317029e-06, + "loss": 1.3504, + "step": 10205 + }, + { + "epoch": 2.0452905811623245, + "grad_norm": 20.63316366377259, + "learning_rate": 2.7815664552680454e-06, + "loss": 1.3991, + "step": 10206 + }, + { + "epoch": 2.045490981963928, + "grad_norm": 20.386262419666537, + "learning_rate": 2.7805216708385655e-06, + "loss": 1.1814, + "step": 10207 + }, + { + "epoch": 2.045691382765531, + "grad_norm": 21.118406466162927, + "learning_rate": 2.7794770070854033e-06, + "loss": 1.1372, + "step": 10208 + }, + { + "epoch": 2.0458917835671344, + "grad_norm": 37.21530959862326, + "learning_rate": 2.7784324640653515e-06, + "loss": 1.0392, + "step": 10209 + }, + { + "epoch": 2.0460921843687374, + "grad_norm": 28.253096499405288, + "learning_rate": 2.7773880418352063e-06, + "loss": 1.5607, + "step": 10210 + }, + { + "epoch": 2.046292585170341, + "grad_norm": 20.883529417179147, + "learning_rate": 2.7763437404517547e-06, + "loss": 1.3962, + "step": 10211 + }, + { + "epoch": 2.046492985971944, + "grad_norm": 48.283028318444174, + "learning_rate": 2.775299559971775e-06, + "loss": 2.075, + "step": 10212 + }, + { + "epoch": 2.0466933867735473, + "grad_norm": 32.29144403763659, + "learning_rate": 2.7742555004520456e-06, + "loss": 1.5898, + "step": 10213 + }, + { + "epoch": 2.0468937875751503, + "grad_norm": 21.659041951518404, + "learning_rate": 2.7732115619493248e-06, + "loss": 1.704, + "step": 10214 + }, + { + "epoch": 2.0470941883767537, + "grad_norm": 32.04985327155204, + "learning_rate": 2.7721677445203834e-06, + "loss": 1.44, + "step": 10215 + }, + { + "epoch": 2.0472945891783567, + "grad_norm": 18.735990727296766, + "learning_rate": 2.771124048221968e-06, + "loss": 1.8112, + "step": 10216 + }, + { + "epoch": 2.0474949899799597, + "grad_norm": 16.9322520836727, + "learning_rate": 2.770080473110828e-06, + "loss": 1.258, + "step": 10217 + }, + { + "epoch": 2.047695390781563, + "grad_norm": 23.863891485680007, + "learning_rate": 2.7690370192437047e-06, + "loss": 2.0482, + "step": 10218 + }, + { + "epoch": 2.047895791583166, + "grad_norm": 
19.672897430621777, + "learning_rate": 2.7679936866773318e-06, + "loss": 1.4215, + "step": 10219 + }, + { + "epoch": 2.0480961923847696, + "grad_norm": 26.07605011158348, + "learning_rate": 2.766950475468439e-06, + "loss": 1.6448, + "step": 10220 + }, + { + "epoch": 2.0482965931863726, + "grad_norm": 23.666562105645756, + "learning_rate": 2.7659073856737407e-06, + "loss": 1.9018, + "step": 10221 + }, + { + "epoch": 2.048496993987976, + "grad_norm": 20.612548939808306, + "learning_rate": 2.76486441734996e-06, + "loss": 1.5924, + "step": 10222 + }, + { + "epoch": 2.048697394789579, + "grad_norm": 19.522155640463463, + "learning_rate": 2.7638215705537984e-06, + "loss": 1.3094, + "step": 10223 + }, + { + "epoch": 2.0488977955911825, + "grad_norm": 14.767420649413387, + "learning_rate": 2.7627788453419585e-06, + "loss": 1.4723, + "step": 10224 + }, + { + "epoch": 2.0490981963927855, + "grad_norm": 26.698475685995373, + "learning_rate": 2.7617362417711356e-06, + "loss": 1.9652, + "step": 10225 + }, + { + "epoch": 2.049298597194389, + "grad_norm": 23.853843240207205, + "learning_rate": 2.760693759898017e-06, + "loss": 1.5907, + "step": 10226 + }, + { + "epoch": 2.049498997995992, + "grad_norm": 18.678107262401372, + "learning_rate": 2.759651399779284e-06, + "loss": 1.8434, + "step": 10227 + }, + { + "epoch": 2.0496993987975953, + "grad_norm": 20.44354953522625, + "learning_rate": 2.758609161471612e-06, + "loss": 1.8814, + "step": 10228 + }, + { + "epoch": 2.0498997995991983, + "grad_norm": 19.510658907678177, + "learning_rate": 2.7575670450316704e-06, + "loss": 1.4492, + "step": 10229 + }, + { + "epoch": 2.0501002004008018, + "grad_norm": 33.317075156405465, + "learning_rate": 2.756525050516116e-06, + "loss": 1.3879, + "step": 10230 + }, + { + "epoch": 2.0503006012024048, + "grad_norm": 17.935879865923805, + "learning_rate": 2.755483177981605e-06, + "loss": 1.561, + "step": 10231 + }, + { + "epoch": 2.050501002004008, + "grad_norm": 20.829674194775567, + "learning_rate": 2.7544414274847907e-06, + "loss": 1.7097, + "step": 10232 + }, + { + "epoch": 2.050701402805611, + "grad_norm": 25.293173034160986, + "learning_rate": 2.7533997990823087e-06, + "loss": 1.6662, + "step": 10233 + }, + { + "epoch": 2.0509018036072146, + "grad_norm": 21.806996837406636, + "learning_rate": 2.7523582928307958e-06, + "loss": 1.7005, + "step": 10234 + }, + { + "epoch": 2.0511022044088176, + "grad_norm": 22.366614294413505, + "learning_rate": 2.7513169087868802e-06, + "loss": 1.3848, + "step": 10235 + }, + { + "epoch": 2.0513026052104206, + "grad_norm": 28.916237878263434, + "learning_rate": 2.750275647007184e-06, + "loss": 1.34, + "step": 10236 + }, + { + "epoch": 2.051503006012024, + "grad_norm": 24.68501149360993, + "learning_rate": 2.7492345075483236e-06, + "loss": 1.2885, + "step": 10237 + }, + { + "epoch": 2.051703406813627, + "grad_norm": 18.8655884688516, + "learning_rate": 2.7481934904669016e-06, + "loss": 1.3457, + "step": 10238 + }, + { + "epoch": 2.0519038076152305, + "grad_norm": 19.157152832757806, + "learning_rate": 2.7471525958195266e-06, + "loss": 1.8375, + "step": 10239 + }, + { + "epoch": 2.0521042084168335, + "grad_norm": 23.380927050726317, + "learning_rate": 2.7461118236627894e-06, + "loss": 1.7494, + "step": 10240 + }, + { + "epoch": 2.052304609218437, + "grad_norm": 17.759332789876428, + "learning_rate": 2.7450711740532797e-06, + "loss": 1.1171, + "step": 10241 + }, + { + "epoch": 2.05250501002004, + "grad_norm": 19.03623814801585, + "learning_rate": 2.744030647047578e-06, + "loss": 1.8068, + 
"step": 10242 + }, + { + "epoch": 2.0527054108216434, + "grad_norm": 18.058131556926238, + "learning_rate": 2.7429902427022604e-06, + "loss": 1.6519, + "step": 10243 + }, + { + "epoch": 2.0529058116232464, + "grad_norm": 30.480038886422136, + "learning_rate": 2.7419499610738954e-06, + "loss": 1.5746, + "step": 10244 + }, + { + "epoch": 2.05310621242485, + "grad_norm": 20.94389251710618, + "learning_rate": 2.740909802219044e-06, + "loss": 1.5654, + "step": 10245 + }, + { + "epoch": 2.053306613226453, + "grad_norm": 38.70126706044539, + "learning_rate": 2.7398697661942632e-06, + "loss": 1.9417, + "step": 10246 + }, + { + "epoch": 2.0535070140280562, + "grad_norm": 28.71501491327125, + "learning_rate": 2.7388298530560974e-06, + "loss": 1.5363, + "step": 10247 + }, + { + "epoch": 2.0537074148296592, + "grad_norm": 16.91738383747618, + "learning_rate": 2.73779006286109e-06, + "loss": 1.2085, + "step": 10248 + }, + { + "epoch": 2.0539078156312627, + "grad_norm": 34.27829271937203, + "learning_rate": 2.736750395665777e-06, + "loss": 1.4035, + "step": 10249 + }, + { + "epoch": 2.0541082164328657, + "grad_norm": 21.702385607548717, + "learning_rate": 2.735710851526685e-06, + "loss": 1.5005, + "step": 10250 + }, + { + "epoch": 2.054308617234469, + "grad_norm": 19.620369984526253, + "learning_rate": 2.734671430500337e-06, + "loss": 1.3823, + "step": 10251 + }, + { + "epoch": 2.054509018036072, + "grad_norm": 20.400786608396995, + "learning_rate": 2.733632132643247e-06, + "loss": 1.3898, + "step": 10252 + }, + { + "epoch": 2.0547094188376755, + "grad_norm": 22.920017967987846, + "learning_rate": 2.7325929580119253e-06, + "loss": 0.9714, + "step": 10253 + }, + { + "epoch": 2.0549098196392785, + "grad_norm": 22.313934537524215, + "learning_rate": 2.731553906662868e-06, + "loss": 1.1637, + "step": 10254 + }, + { + "epoch": 2.0551102204408815, + "grad_norm": 19.407797041044333, + "learning_rate": 2.730514978652577e-06, + "loss": 1.7329, + "step": 10255 + }, + { + "epoch": 2.055310621242485, + "grad_norm": 21.156856778263478, + "learning_rate": 2.729476174037534e-06, + "loss": 1.1954, + "step": 10256 + }, + { + "epoch": 2.055511022044088, + "grad_norm": 19.402392722636336, + "learning_rate": 2.728437492874224e-06, + "loss": 1.6282, + "step": 10257 + }, + { + "epoch": 2.0557114228456914, + "grad_norm": 17.6798266857516, + "learning_rate": 2.7273989352191206e-06, + "loss": 1.5595, + "step": 10258 + }, + { + "epoch": 2.0559118236472944, + "grad_norm": 32.01398490145513, + "learning_rate": 2.7263605011286914e-06, + "loss": 1.3042, + "step": 10259 + }, + { + "epoch": 2.056112224448898, + "grad_norm": 22.277667041780788, + "learning_rate": 2.7253221906594e-06, + "loss": 1.7994, + "step": 10260 + }, + { + "epoch": 2.056312625250501, + "grad_norm": 47.16698466009043, + "learning_rate": 2.7242840038676955e-06, + "loss": 1.358, + "step": 10261 + }, + { + "epoch": 2.0565130260521043, + "grad_norm": 21.523753316324818, + "learning_rate": 2.7232459408100297e-06, + "loss": 1.5384, + "step": 10262 + }, + { + "epoch": 2.0567134268537073, + "grad_norm": 34.763992300541894, + "learning_rate": 2.7222080015428453e-06, + "loss": 1.5304, + "step": 10263 + }, + { + "epoch": 2.0569138276553107, + "grad_norm": 18.834318676624854, + "learning_rate": 2.7211701861225724e-06, + "loss": 1.5454, + "step": 10264 + }, + { + "epoch": 2.0571142284569137, + "grad_norm": 18.762438937765708, + "learning_rate": 2.7201324946056408e-06, + "loss": 1.3345, + "step": 10265 + }, + { + "epoch": 2.057314629258517, + "grad_norm": 17.231627409939435, + 
"learning_rate": 2.71909492704847e-06, + "loss": 1.2615, + "step": 10266 + }, + { + "epoch": 2.05751503006012, + "grad_norm": 15.992860066011756, + "learning_rate": 2.718057483507476e-06, + "loss": 1.369, + "step": 10267 + }, + { + "epoch": 2.0577154308617236, + "grad_norm": 20.478651587608944, + "learning_rate": 2.717020164039064e-06, + "loss": 1.7537, + "step": 10268 + }, + { + "epoch": 2.0579158316633266, + "grad_norm": 29.351254865692546, + "learning_rate": 2.7159829686996365e-06, + "loss": 1.5053, + "step": 10269 + }, + { + "epoch": 2.05811623246493, + "grad_norm": 18.951156039730648, + "learning_rate": 2.714945897545588e-06, + "loss": 1.4477, + "step": 10270 + }, + { + "epoch": 2.058316633266533, + "grad_norm": 23.257788871599896, + "learning_rate": 2.7139089506333003e-06, + "loss": 1.965, + "step": 10271 + }, + { + "epoch": 2.0585170340681365, + "grad_norm": 33.136879336868404, + "learning_rate": 2.7128721280191616e-06, + "loss": 1.3494, + "step": 10272 + }, + { + "epoch": 2.0587174348697395, + "grad_norm": 15.178145967096235, + "learning_rate": 2.7118354297595396e-06, + "loss": 1.2443, + "step": 10273 + }, + { + "epoch": 2.058917835671343, + "grad_norm": 18.604460302547448, + "learning_rate": 2.7107988559108024e-06, + "loss": 1.3942, + "step": 10274 + }, + { + "epoch": 2.059118236472946, + "grad_norm": 21.508675227678214, + "learning_rate": 2.7097624065293106e-06, + "loss": 1.7319, + "step": 10275 + }, + { + "epoch": 2.059318637274549, + "grad_norm": 24.081645108110166, + "learning_rate": 2.7087260816714177e-06, + "loss": 1.9429, + "step": 10276 + }, + { + "epoch": 2.0595190380761523, + "grad_norm": 25.496918528126418, + "learning_rate": 2.7076898813934718e-06, + "loss": 1.9741, + "step": 10277 + }, + { + "epoch": 2.0597194388777553, + "grad_norm": 19.785535852155988, + "learning_rate": 2.7066538057518064e-06, + "loss": 1.7561, + "step": 10278 + }, + { + "epoch": 2.0599198396793588, + "grad_norm": 16.76953921619468, + "learning_rate": 2.7056178548027638e-06, + "loss": 1.5316, + "step": 10279 + }, + { + "epoch": 2.0601202404809618, + "grad_norm": 13.470849060030524, + "learning_rate": 2.7045820286026626e-06, + "loss": 1.2305, + "step": 10280 + }, + { + "epoch": 2.060320641282565, + "grad_norm": 18.31628321610334, + "learning_rate": 2.7035463272078255e-06, + "loss": 1.5352, + "step": 10281 + }, + { + "epoch": 2.060521042084168, + "grad_norm": 25.570135058185926, + "learning_rate": 2.702510750674564e-06, + "loss": 1.6738, + "step": 10282 + }, + { + "epoch": 2.0607214428857716, + "grad_norm": 33.293244032663374, + "learning_rate": 2.701475299059184e-06, + "loss": 1.8394, + "step": 10283 + }, + { + "epoch": 2.0609218436873746, + "grad_norm": 18.98435628841951, + "learning_rate": 2.7004399724179876e-06, + "loss": 1.7161, + "step": 10284 + }, + { + "epoch": 2.061122244488978, + "grad_norm": 21.00058904709676, + "learning_rate": 2.69940477080726e-06, + "loss": 1.7333, + "step": 10285 + }, + { + "epoch": 2.061322645290581, + "grad_norm": 26.317200288937357, + "learning_rate": 2.6983696942832955e-06, + "loss": 1.4628, + "step": 10286 + }, + { + "epoch": 2.0615230460921845, + "grad_norm": 23.105002972062405, + "learning_rate": 2.6973347429023654e-06, + "loss": 1.8494, + "step": 10287 + }, + { + "epoch": 2.0617234468937875, + "grad_norm": 23.69713466693377, + "learning_rate": 2.6962999167207433e-06, + "loss": 1.8068, + "step": 10288 + }, + { + "epoch": 2.061923847695391, + "grad_norm": 19.010899523959115, + "learning_rate": 2.695265215794699e-06, + "loss": 1.1242, + "step": 10289 + }, + { + 
"epoch": 2.062124248496994, + "grad_norm": 17.6172043033016, + "learning_rate": 2.6942306401804847e-06, + "loss": 1.1151, + "step": 10290 + }, + { + "epoch": 2.0623246492985974, + "grad_norm": 20.810295774730744, + "learning_rate": 2.693196189934356e-06, + "loss": 1.8836, + "step": 10291 + }, + { + "epoch": 2.0625250501002004, + "grad_norm": 34.71840999237253, + "learning_rate": 2.6921618651125523e-06, + "loss": 1.3238, + "step": 10292 + }, + { + "epoch": 2.062725450901804, + "grad_norm": 16.548722306376582, + "learning_rate": 2.691127665771316e-06, + "loss": 1.338, + "step": 10293 + }, + { + "epoch": 2.062925851703407, + "grad_norm": 45.072697739912364, + "learning_rate": 2.6900935919668792e-06, + "loss": 1.17, + "step": 10294 + }, + { + "epoch": 2.06312625250501, + "grad_norm": 18.238200242346597, + "learning_rate": 2.6890596437554594e-06, + "loss": 1.2666, + "step": 10295 + }, + { + "epoch": 2.0633266533066132, + "grad_norm": 18.57340254850122, + "learning_rate": 2.688025821193282e-06, + "loss": 1.4531, + "step": 10296 + }, + { + "epoch": 2.0635270541082162, + "grad_norm": 23.58471897971054, + "learning_rate": 2.6869921243365517e-06, + "loss": 1.7704, + "step": 10297 + }, + { + "epoch": 2.0637274549098197, + "grad_norm": 17.3599901126501, + "learning_rate": 2.6859585532414735e-06, + "loss": 1.3836, + "step": 10298 + }, + { + "epoch": 2.0639278557114227, + "grad_norm": 23.055849472585983, + "learning_rate": 2.6849251079642446e-06, + "loss": 1.7065, + "step": 10299 + }, + { + "epoch": 2.064128256513026, + "grad_norm": 18.975547725648223, + "learning_rate": 2.683891788561055e-06, + "loss": 1.8888, + "step": 10300 + }, + { + "epoch": 2.064328657314629, + "grad_norm": 22.51147979527021, + "learning_rate": 2.6828585950880897e-06, + "loss": 1.3129, + "step": 10301 + }, + { + "epoch": 2.0645290581162326, + "grad_norm": 22.56062235541088, + "learning_rate": 2.6818255276015194e-06, + "loss": 1.7017, + "step": 10302 + }, + { + "epoch": 2.0647294589178355, + "grad_norm": 18.8197312538571, + "learning_rate": 2.680792586157521e-06, + "loss": 1.5118, + "step": 10303 + }, + { + "epoch": 2.064929859719439, + "grad_norm": 22.992849812821007, + "learning_rate": 2.679759770812251e-06, + "loss": 1.2412, + "step": 10304 + }, + { + "epoch": 2.065130260521042, + "grad_norm": 46.67960573397359, + "learning_rate": 2.6787270816218668e-06, + "loss": 1.7426, + "step": 10305 + }, + { + "epoch": 2.0653306613226454, + "grad_norm": 23.293113940876754, + "learning_rate": 2.6776945186425186e-06, + "loss": 1.5785, + "step": 10306 + }, + { + "epoch": 2.0655310621242484, + "grad_norm": 20.91342754428169, + "learning_rate": 2.676662081930347e-06, + "loss": 1.6426, + "step": 10307 + }, + { + "epoch": 2.065731462925852, + "grad_norm": 23.926131622015642, + "learning_rate": 2.675629771541489e-06, + "loss": 1.5789, + "step": 10308 + }, + { + "epoch": 2.065931863727455, + "grad_norm": 14.970949443193428, + "learning_rate": 2.6745975875320684e-06, + "loss": 1.6366, + "step": 10309 + }, + { + "epoch": 2.0661322645290583, + "grad_norm": 17.13462369015378, + "learning_rate": 2.673565529958213e-06, + "loss": 1.2729, + "step": 10310 + }, + { + "epoch": 2.0663326653306613, + "grad_norm": 20.867623168621947, + "learning_rate": 2.672533598876032e-06, + "loss": 1.3417, + "step": 10311 + }, + { + "epoch": 2.0665330661322647, + "grad_norm": 14.971912955398976, + "learning_rate": 2.6715017943416348e-06, + "loss": 1.3405, + "step": 10312 + }, + { + "epoch": 2.0667334669338677, + "grad_norm": 23.9550954875366, + "learning_rate": 
2.6704701164111224e-06, + "loss": 1.6079, + "step": 10313 + }, + { + "epoch": 2.0669338677354707, + "grad_norm": 27.540009541766455, + "learning_rate": 2.6694385651405887e-06, + "loss": 1.2733, + "step": 10314 + }, + { + "epoch": 2.067134268537074, + "grad_norm": 26.929144008157895, + "learning_rate": 2.66840714058612e-06, + "loss": 1.5928, + "step": 10315 + }, + { + "epoch": 2.067334669338677, + "grad_norm": 19.41478966609401, + "learning_rate": 2.6673758428037965e-06, + "loss": 1.8945, + "step": 10316 + }, + { + "epoch": 2.0675350701402806, + "grad_norm": 18.880973518926254, + "learning_rate": 2.6663446718496945e-06, + "loss": 1.3659, + "step": 10317 + }, + { + "epoch": 2.0677354709418836, + "grad_norm": 26.043062260533624, + "learning_rate": 2.665313627779875e-06, + "loss": 1.6703, + "step": 10318 + }, + { + "epoch": 2.067935871743487, + "grad_norm": 30.981937785533535, + "learning_rate": 2.664282710650398e-06, + "loss": 1.0616, + "step": 10319 + }, + { + "epoch": 2.06813627254509, + "grad_norm": 23.893482085446827, + "learning_rate": 2.663251920517322e-06, + "loss": 1.2409, + "step": 10320 + }, + { + "epoch": 2.0683366733466935, + "grad_norm": 26.083405962520366, + "learning_rate": 2.662221257436686e-06, + "loss": 1.6237, + "step": 10321 + }, + { + "epoch": 2.0685370741482965, + "grad_norm": 25.759773413179232, + "learning_rate": 2.6611907214645317e-06, + "loss": 1.5003, + "step": 10322 + }, + { + "epoch": 2.0687374749499, + "grad_norm": 23.199111595977648, + "learning_rate": 2.6601603126568897e-06, + "loss": 1.7199, + "step": 10323 + }, + { + "epoch": 2.068937875751503, + "grad_norm": 49.636636785805095, + "learning_rate": 2.6591300310697852e-06, + "loss": 1.2614, + "step": 10324 + }, + { + "epoch": 2.0691382765531063, + "grad_norm": 72.9883690514546, + "learning_rate": 2.6580998767592375e-06, + "loss": 1.528, + "step": 10325 + }, + { + "epoch": 2.0693386773547093, + "grad_norm": 17.598418186656325, + "learning_rate": 2.657069849781256e-06, + "loss": 1.4092, + "step": 10326 + }, + { + "epoch": 2.0695390781563128, + "grad_norm": 17.73697751192506, + "learning_rate": 2.6560399501918465e-06, + "loss": 1.7095, + "step": 10327 + }, + { + "epoch": 2.0697394789579158, + "grad_norm": 22.85559389522783, + "learning_rate": 2.655010178047004e-06, + "loss": 1.1748, + "step": 10328 + }, + { + "epoch": 2.069939879759519, + "grad_norm": 22.98190714372031, + "learning_rate": 2.653980533402719e-06, + "loss": 1.6576, + "step": 10329 + }, + { + "epoch": 2.070140280561122, + "grad_norm": 25.27478043040179, + "learning_rate": 2.652951016314975e-06, + "loss": 1.7141, + "step": 10330 + }, + { + "epoch": 2.0703406813627256, + "grad_norm": 18.89266192408767, + "learning_rate": 2.6519216268397496e-06, + "loss": 1.4511, + "step": 10331 + }, + { + "epoch": 2.0705410821643286, + "grad_norm": 25.161225293853555, + "learning_rate": 2.650892365033011e-06, + "loss": 1.5452, + "step": 10332 + }, + { + "epoch": 2.070741482965932, + "grad_norm": 15.547878338861473, + "learning_rate": 2.6498632309507225e-06, + "loss": 1.7364, + "step": 10333 + }, + { + "epoch": 2.070941883767535, + "grad_norm": 45.01657670257194, + "learning_rate": 2.648834224648841e-06, + "loss": 1.4061, + "step": 10334 + }, + { + "epoch": 2.071142284569138, + "grad_norm": 24.91393337409215, + "learning_rate": 2.6478053461833115e-06, + "loss": 1.9545, + "step": 10335 + }, + { + "epoch": 2.0713426853707415, + "grad_norm": 36.61390300055711, + "learning_rate": 2.6467765956100777e-06, + "loss": 1.3607, + "step": 10336 + }, + { + "epoch": 
2.0715430861723445, + "grad_norm": 21.685769885037754, + "learning_rate": 2.6457479729850734e-06, + "loss": 1.361, + "step": 10337 + }, + { + "epoch": 2.071743486973948, + "grad_norm": 23.21298146955618, + "learning_rate": 2.644719478364227e-06, + "loss": 1.8208, + "step": 10338 + }, + { + "epoch": 2.071943887775551, + "grad_norm": 31.32708073359937, + "learning_rate": 2.64369111180346e-06, + "loss": 2.0675, + "step": 10339 + }, + { + "epoch": 2.0721442885771544, + "grad_norm": 21.363825804901712, + "learning_rate": 2.642662873358685e-06, + "loss": 1.7163, + "step": 10340 + }, + { + "epoch": 2.0723446893787574, + "grad_norm": 48.0048089833847, + "learning_rate": 2.641634763085812e-06, + "loss": 1.4633, + "step": 10341 + }, + { + "epoch": 2.072545090180361, + "grad_norm": 24.673495521268276, + "learning_rate": 2.640606781040733e-06, + "loss": 1.5461, + "step": 10342 + }, + { + "epoch": 2.072745490981964, + "grad_norm": 20.93337317400573, + "learning_rate": 2.639578927279351e-06, + "loss": 1.6463, + "step": 10343 + }, + { + "epoch": 2.0729458917835673, + "grad_norm": 19.518741546440324, + "learning_rate": 2.638551201857544e-06, + "loss": 1.9308, + "step": 10344 + }, + { + "epoch": 2.0731462925851702, + "grad_norm": 30.032224033810877, + "learning_rate": 2.6375236048311958e-06, + "loss": 1.2667, + "step": 10345 + }, + { + "epoch": 2.0733466933867737, + "grad_norm": 23.969326904914666, + "learning_rate": 2.6364961362561746e-06, + "loss": 2.3437, + "step": 10346 + }, + { + "epoch": 2.0735470941883767, + "grad_norm": 30.20861109161878, + "learning_rate": 2.6354687961883487e-06, + "loss": 1.5969, + "step": 10347 + }, + { + "epoch": 2.07374749498998, + "grad_norm": 19.2995804788595, + "learning_rate": 2.634441584683574e-06, + "loss": 2.2519, + "step": 10348 + }, + { + "epoch": 2.073947895791583, + "grad_norm": 19.795887246524387, + "learning_rate": 2.633414501797703e-06, + "loss": 1.5486, + "step": 10349 + }, + { + "epoch": 2.0741482965931866, + "grad_norm": 21.97845981514752, + "learning_rate": 2.6323875475865778e-06, + "loss": 1.6073, + "step": 10350 + }, + { + "epoch": 2.0743486973947896, + "grad_norm": 28.373256515666267, + "learning_rate": 2.6313607221060396e-06, + "loss": 1.4625, + "step": 10351 + }, + { + "epoch": 2.074549098196393, + "grad_norm": 24.886124548620824, + "learning_rate": 2.6303340254119127e-06, + "loss": 2.0561, + "step": 10352 + }, + { + "epoch": 2.074749498997996, + "grad_norm": 31.63504777564198, + "learning_rate": 2.629307457560023e-06, + "loss": 1.7476, + "step": 10353 + }, + { + "epoch": 2.074949899799599, + "grad_norm": 22.550602204705747, + "learning_rate": 2.628281018606186e-06, + "loss": 2.015, + "step": 10354 + }, + { + "epoch": 2.0751503006012024, + "grad_norm": 18.415564172073626, + "learning_rate": 2.6272547086062116e-06, + "loss": 2.0602, + "step": 10355 + }, + { + "epoch": 2.0753507014028054, + "grad_norm": 17.348471306644864, + "learning_rate": 2.6262285276159007e-06, + "loss": 1.5089, + "step": 10356 + }, + { + "epoch": 2.075551102204409, + "grad_norm": 25.492787665362222, + "learning_rate": 2.625202475691049e-06, + "loss": 1.5948, + "step": 10357 + }, + { + "epoch": 2.075751503006012, + "grad_norm": 29.93639230341644, + "learning_rate": 2.6241765528874464e-06, + "loss": 1.5114, + "step": 10358 + }, + { + "epoch": 2.0759519038076153, + "grad_norm": 23.12619132746647, + "learning_rate": 2.6231507592608674e-06, + "loss": 1.4607, + "step": 10359 + }, + { + "epoch": 2.0761523046092183, + "grad_norm": 15.45008934579172, + "learning_rate": 2.622125094867094e-06, + 
"loss": 1.0179, + "step": 10360 + }, + { + "epoch": 2.0763527054108217, + "grad_norm": 21.21613564502164, + "learning_rate": 2.621099559761887e-06, + "loss": 1.7522, + "step": 10361 + }, + { + "epoch": 2.0765531062124247, + "grad_norm": 30.708424604943254, + "learning_rate": 2.6200741540010094e-06, + "loss": 1.9221, + "step": 10362 + }, + { + "epoch": 2.076753507014028, + "grad_norm": 27.254952669583346, + "learning_rate": 2.6190488776402125e-06, + "loss": 1.4296, + "step": 10363 + }, + { + "epoch": 2.076953907815631, + "grad_norm": 42.25751592617922, + "learning_rate": 2.618023730735243e-06, + "loss": 1.6122, + "step": 10364 + }, + { + "epoch": 2.0771543086172346, + "grad_norm": 20.35373394312655, + "learning_rate": 2.6169987133418413e-06, + "loss": 1.5545, + "step": 10365 + }, + { + "epoch": 2.0773547094188376, + "grad_norm": 28.87644015116491, + "learning_rate": 2.615973825515733e-06, + "loss": 1.4479, + "step": 10366 + }, + { + "epoch": 2.077555110220441, + "grad_norm": 23.3515897699066, + "learning_rate": 2.614949067312651e-06, + "loss": 1.7543, + "step": 10367 + }, + { + "epoch": 2.077755511022044, + "grad_norm": 23.53013340779297, + "learning_rate": 2.6139244387883073e-06, + "loss": 1.4862, + "step": 10368 + }, + { + "epoch": 2.0779559118236475, + "grad_norm": 20.81593543036435, + "learning_rate": 2.6128999399984116e-06, + "loss": 1.4985, + "step": 10369 + }, + { + "epoch": 2.0781563126252505, + "grad_norm": 20.96700000035667, + "learning_rate": 2.6118755709986743e-06, + "loss": 1.863, + "step": 10370 + }, + { + "epoch": 2.078356713426854, + "grad_norm": 16.493013889395474, + "learning_rate": 2.610851331844785e-06, + "loss": 1.5093, + "step": 10371 + }, + { + "epoch": 2.078557114228457, + "grad_norm": 21.751712853514192, + "learning_rate": 2.6098272225924376e-06, + "loss": 1.5568, + "step": 10372 + }, + { + "epoch": 2.07875751503006, + "grad_norm": 26.439730918926305, + "learning_rate": 2.608803243297308e-06, + "loss": 1.9788, + "step": 10373 + }, + { + "epoch": 2.0789579158316633, + "grad_norm": 17.23747617372557, + "learning_rate": 2.6077793940150787e-06, + "loss": 1.3797, + "step": 10374 + }, + { + "epoch": 2.0791583166332663, + "grad_norm": 23.334871060634843, + "learning_rate": 2.6067556748014157e-06, + "loss": 1.2729, + "step": 10375 + }, + { + "epoch": 2.0793587174348698, + "grad_norm": 34.35484345185582, + "learning_rate": 2.605732085711976e-06, + "loss": 1.1819, + "step": 10376 + }, + { + "epoch": 2.0795591182364728, + "grad_norm": 29.00707486288623, + "learning_rate": 2.6047086268024214e-06, + "loss": 1.4436, + "step": 10377 + }, + { + "epoch": 2.079759519038076, + "grad_norm": 17.29879581836611, + "learning_rate": 2.6036852981283924e-06, + "loss": 1.7343, + "step": 10378 + }, + { + "epoch": 2.079959919839679, + "grad_norm": 23.710511056046755, + "learning_rate": 2.602662099745531e-06, + "loss": 1.5856, + "step": 10379 + }, + { + "epoch": 2.0801603206412826, + "grad_norm": 18.690127414371883, + "learning_rate": 2.6016390317094697e-06, + "loss": 1.7282, + "step": 10380 + }, + { + "epoch": 2.0803607214428856, + "grad_norm": 23.31478755647898, + "learning_rate": 2.600616094075835e-06, + "loss": 1.2333, + "step": 10381 + }, + { + "epoch": 2.080561122244489, + "grad_norm": 27.798554855314222, + "learning_rate": 2.599593286900247e-06, + "loss": 1.7444, + "step": 10382 + }, + { + "epoch": 2.080761523046092, + "grad_norm": 18.184336204720097, + "learning_rate": 2.5985706102383113e-06, + "loss": 1.2847, + "step": 10383 + }, + { + "epoch": 2.0809619238476955, + "grad_norm": 
41.39855740169437, + "learning_rate": 2.5975480641456397e-06, + "loss": 1.6091, + "step": 10384 + }, + { + "epoch": 2.0811623246492985, + "grad_norm": 25.076380076586947, + "learning_rate": 2.596525648677825e-06, + "loss": 1.7509, + "step": 10385 + }, + { + "epoch": 2.081362725450902, + "grad_norm": 17.234253872849226, + "learning_rate": 2.595503363890458e-06, + "loss": 1.4602, + "step": 10386 + }, + { + "epoch": 2.081563126252505, + "grad_norm": 16.808854420595193, + "learning_rate": 2.5944812098391236e-06, + "loss": 1.5319, + "step": 10387 + }, + { + "epoch": 2.0817635270541084, + "grad_norm": 17.558260218253412, + "learning_rate": 2.5934591865793957e-06, + "loss": 1.015, + "step": 10388 + }, + { + "epoch": 2.0819639278557114, + "grad_norm": 19.326291051252703, + "learning_rate": 2.5924372941668463e-06, + "loss": 1.4532, + "step": 10389 + }, + { + "epoch": 2.082164328657315, + "grad_norm": 23.463678339436864, + "learning_rate": 2.591415532657031e-06, + "loss": 1.5188, + "step": 10390 + }, + { + "epoch": 2.082364729458918, + "grad_norm": 26.453820624657073, + "learning_rate": 2.5903939021055135e-06, + "loss": 1.365, + "step": 10391 + }, + { + "epoch": 2.0825651302605213, + "grad_norm": 25.699578490837542, + "learning_rate": 2.5893724025678337e-06, + "loss": 1.7583, + "step": 10392 + }, + { + "epoch": 2.0827655310621243, + "grad_norm": 20.045825278407435, + "learning_rate": 2.5883510340995345e-06, + "loss": 1.1831, + "step": 10393 + }, + { + "epoch": 2.0829659318637272, + "grad_norm": 30.354842816792413, + "learning_rate": 2.58732979675615e-06, + "loss": 1.688, + "step": 10394 + }, + { + "epoch": 2.0831663326653307, + "grad_norm": 17.66966341918266, + "learning_rate": 2.5863086905932056e-06, + "loss": 1.1526, + "step": 10395 + }, + { + "epoch": 2.0833667334669337, + "grad_norm": 18.79919539046686, + "learning_rate": 2.585287715666221e-06, + "loss": 1.2269, + "step": 10396 + }, + { + "epoch": 2.083567134268537, + "grad_norm": 15.383622097582913, + "learning_rate": 2.5842668720307076e-06, + "loss": 1.0807, + "step": 10397 + }, + { + "epoch": 2.08376753507014, + "grad_norm": 22.110988639788633, + "learning_rate": 2.583246159742172e-06, + "loss": 1.8708, + "step": 10398 + }, + { + "epoch": 2.0839679358717436, + "grad_norm": 18.510471530390664, + "learning_rate": 2.5822255788561083e-06, + "loss": 1.5845, + "step": 10399 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 19.651984518452444, + "learning_rate": 2.5812051294280073e-06, + "loss": 1.4108, + "step": 10400 + }, + { + "epoch": 2.08436873747495, + "grad_norm": 22.173185429409447, + "learning_rate": 2.580184811513358e-06, + "loss": 2.1121, + "step": 10401 + }, + { + "epoch": 2.084569138276553, + "grad_norm": 30.794185293374568, + "learning_rate": 2.5791646251676296e-06, + "loss": 1.5843, + "step": 10402 + }, + { + "epoch": 2.0847695390781564, + "grad_norm": 24.88304977462321, + "learning_rate": 2.5781445704462937e-06, + "loss": 2.1359, + "step": 10403 + }, + { + "epoch": 2.0849699398797594, + "grad_norm": 21.07056327847497, + "learning_rate": 2.577124647404814e-06, + "loss": 1.2777, + "step": 10404 + }, + { + "epoch": 2.085170340681363, + "grad_norm": 20.155491865745628, + "learning_rate": 2.5761048560986425e-06, + "loss": 1.4382, + "step": 10405 + }, + { + "epoch": 2.085370741482966, + "grad_norm": 23.382404385564968, + "learning_rate": 2.5750851965832303e-06, + "loss": 1.875, + "step": 10406 + }, + { + "epoch": 2.0855711422845693, + "grad_norm": 26.60243124170818, + "learning_rate": 2.574065668914012e-06, + "loss": 1.7175, + "step": 
10407 + }, + { + "epoch": 2.0857715430861723, + "grad_norm": 28.172686986493602, + "learning_rate": 2.5730462731464272e-06, + "loss": 1.0691, + "step": 10408 + }, + { + "epoch": 2.0859719438877757, + "grad_norm": 16.45683982720005, + "learning_rate": 2.572027009335898e-06, + "loss": 1.6005, + "step": 10409 + }, + { + "epoch": 2.0861723446893787, + "grad_norm": 22.537129922855087, + "learning_rate": 2.571007877537844e-06, + "loss": 1.5078, + "step": 10410 + }, + { + "epoch": 2.086372745490982, + "grad_norm": 22.642760004250828, + "learning_rate": 2.569988877807677e-06, + "loss": 1.729, + "step": 10411 + }, + { + "epoch": 2.086573146292585, + "grad_norm": 21.571785046151106, + "learning_rate": 2.568970010200802e-06, + "loss": 1.611, + "step": 10412 + }, + { + "epoch": 2.086773547094188, + "grad_norm": 28.701850582470897, + "learning_rate": 2.567951274772616e-06, + "loss": 1.6098, + "step": 10413 + }, + { + "epoch": 2.0869739478957916, + "grad_norm": 36.206918805836175, + "learning_rate": 2.566932671578509e-06, + "loss": 1.4593, + "step": 10414 + }, + { + "epoch": 2.0871743486973946, + "grad_norm": 22.422340758130385, + "learning_rate": 2.565914200673867e-06, + "loss": 1.1842, + "step": 10415 + }, + { + "epoch": 2.087374749498998, + "grad_norm": 23.039961628598164, + "learning_rate": 2.5648958621140596e-06, + "loss": 1.7594, + "step": 10416 + }, + { + "epoch": 2.087575150300601, + "grad_norm": 22.200616373720926, + "learning_rate": 2.5638776559544597e-06, + "loss": 1.6874, + "step": 10417 + }, + { + "epoch": 2.0877755511022045, + "grad_norm": 28.44084388078536, + "learning_rate": 2.5628595822504276e-06, + "loss": 2.0187, + "step": 10418 + }, + { + "epoch": 2.0879759519038075, + "grad_norm": 29.52442074520718, + "learning_rate": 2.561841641057318e-06, + "loss": 1.3857, + "step": 10419 + }, + { + "epoch": 2.088176352705411, + "grad_norm": 18.870795367400238, + "learning_rate": 2.5608238324304763e-06, + "loss": 1.2744, + "step": 10420 + }, + { + "epoch": 2.088376753507014, + "grad_norm": 24.47659529130267, + "learning_rate": 2.559806156425244e-06, + "loss": 1.9856, + "step": 10421 + }, + { + "epoch": 2.0885771543086173, + "grad_norm": 21.98869373922492, + "learning_rate": 2.5587886130969553e-06, + "loss": 1.7349, + "step": 10422 + }, + { + "epoch": 2.0887775551102203, + "grad_norm": 25.095329211632865, + "learning_rate": 2.5577712025009304e-06, + "loss": 2.1804, + "step": 10423 + }, + { + "epoch": 2.088977955911824, + "grad_norm": 28.376755210140836, + "learning_rate": 2.556753924692491e-06, + "loss": 1.5329, + "step": 10424 + }, + { + "epoch": 2.0891783567134268, + "grad_norm": 19.006567235341496, + "learning_rate": 2.555736779726947e-06, + "loss": 1.5605, + "step": 10425 + }, + { + "epoch": 2.08937875751503, + "grad_norm": 24.09186084334198, + "learning_rate": 2.5547197676596024e-06, + "loss": 1.702, + "step": 10426 + }, + { + "epoch": 2.089579158316633, + "grad_norm": 21.936804092808423, + "learning_rate": 2.5537028885457536e-06, + "loss": 2.0378, + "step": 10427 + }, + { + "epoch": 2.0897795591182367, + "grad_norm": 20.656685486593158, + "learning_rate": 2.55268614244069e-06, + "loss": 1.9287, + "step": 10428 + }, + { + "epoch": 2.0899799599198396, + "grad_norm": 17.73437380560547, + "learning_rate": 2.5516695293996953e-06, + "loss": 1.6596, + "step": 10429 + }, + { + "epoch": 2.090180360721443, + "grad_norm": 20.811482500699835, + "learning_rate": 2.5506530494780385e-06, + "loss": 1.5752, + "step": 10430 + }, + { + "epoch": 2.090380761523046, + "grad_norm": 19.42835641814163, + 
"learning_rate": 2.5496367027309933e-06, + "loss": 1.3467, + "step": 10431 + }, + { + "epoch": 2.090581162324649, + "grad_norm": 18.35391317458157, + "learning_rate": 2.548620489213819e-06, + "loss": 1.5022, + "step": 10432 + }, + { + "epoch": 2.0907815631262525, + "grad_norm": 20.34834588998197, + "learning_rate": 2.5476044089817653e-06, + "loss": 1.332, + "step": 10433 + }, + { + "epoch": 2.0909819639278555, + "grad_norm": 20.003837092247004, + "learning_rate": 2.546588462090079e-06, + "loss": 1.4201, + "step": 10434 + }, + { + "epoch": 2.091182364729459, + "grad_norm": 25.67710380894895, + "learning_rate": 2.5455726485940015e-06, + "loss": 1.5155, + "step": 10435 + }, + { + "epoch": 2.091382765531062, + "grad_norm": 21.202683170821977, + "learning_rate": 2.5445569685487617e-06, + "loss": 1.5376, + "step": 10436 + }, + { + "epoch": 2.0915831663326654, + "grad_norm": 29.809601385583008, + "learning_rate": 2.5435414220095833e-06, + "loss": 1.6137, + "step": 10437 + }, + { + "epoch": 2.0917835671342684, + "grad_norm": 24.10621591220867, + "learning_rate": 2.5425260090316837e-06, + "loss": 2.025, + "step": 10438 + }, + { + "epoch": 2.091983967935872, + "grad_norm": 22.1024789888527, + "learning_rate": 2.541510729670276e-06, + "loss": 1.581, + "step": 10439 + }, + { + "epoch": 2.092184368737475, + "grad_norm": 18.647375419146663, + "learning_rate": 2.5404955839805535e-06, + "loss": 1.3207, + "step": 10440 + }, + { + "epoch": 2.0923847695390783, + "grad_norm": 51.65052949435439, + "learning_rate": 2.5394805720177214e-06, + "loss": 1.6986, + "step": 10441 + }, + { + "epoch": 2.0925851703406813, + "grad_norm": 19.257053456218532, + "learning_rate": 2.53846569383696e-06, + "loss": 1.7988, + "step": 10442 + }, + { + "epoch": 2.0927855711422847, + "grad_norm": 28.464272082832462, + "learning_rate": 2.537450949493453e-06, + "loss": 1.3849, + "step": 10443 + }, + { + "epoch": 2.0929859719438877, + "grad_norm": 21.43291139434854, + "learning_rate": 2.536436339042373e-06, + "loss": 1.489, + "step": 10444 + }, + { + "epoch": 2.093186372745491, + "grad_norm": 28.16561230482527, + "learning_rate": 2.535421862538886e-06, + "loss": 1.5988, + "step": 10445 + }, + { + "epoch": 2.093386773547094, + "grad_norm": 20.579599442760948, + "learning_rate": 2.534407520038151e-06, + "loss": 1.5393, + "step": 10446 + }, + { + "epoch": 2.0935871743486976, + "grad_norm": 22.24415316572495, + "learning_rate": 2.5333933115953165e-06, + "loss": 1.8156, + "step": 10447 + }, + { + "epoch": 2.0937875751503006, + "grad_norm": 35.72437203609316, + "learning_rate": 2.532379237265532e-06, + "loss": 1.4018, + "step": 10448 + }, + { + "epoch": 2.093987975951904, + "grad_norm": 70.25412353905757, + "learning_rate": 2.5313652971039282e-06, + "loss": 1.9233, + "step": 10449 + }, + { + "epoch": 2.094188376753507, + "grad_norm": 16.025379785698604, + "learning_rate": 2.530351491165638e-06, + "loss": 1.6632, + "step": 10450 + }, + { + "epoch": 2.0943887775551104, + "grad_norm": 23.246099171384206, + "learning_rate": 2.5293378195057826e-06, + "loss": 1.0182, + "step": 10451 + }, + { + "epoch": 2.0945891783567134, + "grad_norm": 21.829568216912623, + "learning_rate": 2.528324282179477e-06, + "loss": 1.5645, + "step": 10452 + }, + { + "epoch": 2.0947895791583164, + "grad_norm": 30.772301398958202, + "learning_rate": 2.527310879241831e-06, + "loss": 1.6895, + "step": 10453 + }, + { + "epoch": 2.09498997995992, + "grad_norm": 25.293960553866466, + "learning_rate": 2.526297610747938e-06, + "loss": 1.8843, + "step": 10454 + }, + { + "epoch": 
2.095190380761523, + "grad_norm": 19.32897755693956, + "learning_rate": 2.5252844767528994e-06, + "loss": 1.5095, + "step": 10455 + }, + { + "epoch": 2.0953907815631263, + "grad_norm": 24.69687381835319, + "learning_rate": 2.524271477311795e-06, + "loss": 1.8607, + "step": 10456 + }, + { + "epoch": 2.0955911823647293, + "grad_norm": 16.849226699682095, + "learning_rate": 2.523258612479703e-06, + "loss": 1.1948, + "step": 10457 + }, + { + "epoch": 2.0957915831663327, + "grad_norm": 22.272518248966243, + "learning_rate": 2.5222458823117e-06, + "loss": 1.9158, + "step": 10458 + }, + { + "epoch": 2.0959919839679357, + "grad_norm": 26.736304522937008, + "learning_rate": 2.521233286862844e-06, + "loss": 1.5582, + "step": 10459 + }, + { + "epoch": 2.096192384769539, + "grad_norm": 19.729675899578332, + "learning_rate": 2.5202208261881955e-06, + "loss": 1.1793, + "step": 10460 + }, + { + "epoch": 2.096392785571142, + "grad_norm": 16.206349141236835, + "learning_rate": 2.5192085003427966e-06, + "loss": 0.9517, + "step": 10461 + }, + { + "epoch": 2.0965931863727456, + "grad_norm": 24.247770617197126, + "learning_rate": 2.5181963093816963e-06, + "loss": 1.6244, + "step": 10462 + }, + { + "epoch": 2.0967935871743486, + "grad_norm": 19.54224067043743, + "learning_rate": 2.5171842533599278e-06, + "loss": 1.3291, + "step": 10463 + }, + { + "epoch": 2.096993987975952, + "grad_norm": 26.847595544969778, + "learning_rate": 2.5161723323325123e-06, + "loss": 1.3848, + "step": 10464 + }, + { + "epoch": 2.097194388777555, + "grad_norm": 25.639200379240595, + "learning_rate": 2.515160546354477e-06, + "loss": 1.9291, + "step": 10465 + }, + { + "epoch": 2.0973947895791585, + "grad_norm": 38.08485267228723, + "learning_rate": 2.5141488954808295e-06, + "loss": 1.1848, + "step": 10466 + }, + { + "epoch": 2.0975951903807615, + "grad_norm": 20.908533565401036, + "learning_rate": 2.5131373797665757e-06, + "loss": 1.1236, + "step": 10467 + }, + { + "epoch": 2.097795591182365, + "grad_norm": 27.863805506349898, + "learning_rate": 2.5121259992667133e-06, + "loss": 1.5285, + "step": 10468 + }, + { + "epoch": 2.097995991983968, + "grad_norm": 30.943300757535074, + "learning_rate": 2.5111147540362335e-06, + "loss": 1.9168, + "step": 10469 + }, + { + "epoch": 2.0981963927855714, + "grad_norm": 19.456351520595188, + "learning_rate": 2.5101036441301203e-06, + "loss": 1.8743, + "step": 10470 + }, + { + "epoch": 2.0983967935871743, + "grad_norm": 22.17513315025556, + "learning_rate": 2.509092669603343e-06, + "loss": 1.828, + "step": 10471 + }, + { + "epoch": 2.0985971943887773, + "grad_norm": 22.878274519977726, + "learning_rate": 2.508081830510878e-06, + "loss": 1.886, + "step": 10472 + }, + { + "epoch": 2.098797595190381, + "grad_norm": 24.4533193907311, + "learning_rate": 2.5070711269076807e-06, + "loss": 2.0547, + "step": 10473 + }, + { + "epoch": 2.098997995991984, + "grad_norm": 22.388399043669438, + "learning_rate": 2.506060558848706e-06, + "loss": 1.4623, + "step": 10474 + }, + { + "epoch": 2.099198396793587, + "grad_norm": 46.05631114910528, + "learning_rate": 2.5050501263889004e-06, + "loss": 1.5949, + "step": 10475 + }, + { + "epoch": 2.09939879759519, + "grad_norm": 23.207480310419427, + "learning_rate": 2.5040398295832017e-06, + "loss": 1.4943, + "step": 10476 + }, + { + "epoch": 2.0995991983967937, + "grad_norm": 21.29558123274193, + "learning_rate": 2.5030296684865442e-06, + "loss": 1.1334, + "step": 10477 + }, + { + "epoch": 2.0997995991983966, + "grad_norm": 16.75291894804771, + "learning_rate": 
2.502019643153846e-06, + "loss": 1.5102, + "step": 10478 + }, + { + "epoch": 2.1, + "grad_norm": 32.21114655992094, + "learning_rate": 2.5010097536400304e-06, + "loss": 1.8087, + "step": 10479 + }, + { + "epoch": 2.100200400801603, + "grad_norm": 20.34474255489078, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.3036, + "step": 10480 + }, + { + "epoch": 2.1004008016032065, + "grad_norm": 25.584895477441957, + "learning_rate": 2.498990382288661e-06, + "loss": 1.4822, + "step": 10481 + }, + { + "epoch": 2.1006012024048095, + "grad_norm": 41.96454393722654, + "learning_rate": 2.497980900560909e-06, + "loss": 2.2185, + "step": 10482 + }, + { + "epoch": 2.100801603206413, + "grad_norm": 21.62063552814291, + "learning_rate": 2.496971554871627e-06, + "loss": 1.5515, + "step": 10483 + }, + { + "epoch": 2.101002004008016, + "grad_norm": 19.636543106821847, + "learning_rate": 2.4959623452756965e-06, + "loss": 1.6683, + "step": 10484 + }, + { + "epoch": 2.1012024048096194, + "grad_norm": 24.031752818354207, + "learning_rate": 2.494953271827989e-06, + "loss": 1.5917, + "step": 10485 + }, + { + "epoch": 2.1014028056112224, + "grad_norm": 22.071530345240365, + "learning_rate": 2.4939443345833707e-06, + "loss": 1.6384, + "step": 10486 + }, + { + "epoch": 2.101603206412826, + "grad_norm": 17.74657417759559, + "learning_rate": 2.4929355335967e-06, + "loss": 1.4474, + "step": 10487 + }, + { + "epoch": 2.101803607214429, + "grad_norm": 21.379806236875947, + "learning_rate": 2.4919268689228202e-06, + "loss": 2.3947, + "step": 10488 + }, + { + "epoch": 2.1020040080160323, + "grad_norm": 23.558942225623138, + "learning_rate": 2.490918340616584e-06, + "loss": 1.5796, + "step": 10489 + }, + { + "epoch": 2.1022044088176353, + "grad_norm": 25.811431681223315, + "learning_rate": 2.4899099487328188e-06, + "loss": 1.441, + "step": 10490 + }, + { + "epoch": 2.1024048096192383, + "grad_norm": 17.509023385422985, + "learning_rate": 2.4889016933263547e-06, + "loss": 1.3688, + "step": 10491 + }, + { + "epoch": 2.1026052104208417, + "grad_norm": 27.08776984177787, + "learning_rate": 2.487893574452013e-06, + "loss": 1.7225, + "step": 10492 + }, + { + "epoch": 2.1028056112224447, + "grad_norm": 29.22070970395956, + "learning_rate": 2.4868855921646057e-06, + "loss": 1.5372, + "step": 10493 + }, + { + "epoch": 2.103006012024048, + "grad_norm": 15.972962341744372, + "learning_rate": 2.48587774651894e-06, + "loss": 1.7493, + "step": 10494 + }, + { + "epoch": 2.103206412825651, + "grad_norm": 21.21898517719525, + "learning_rate": 2.4848700375698086e-06, + "loss": 1.3908, + "step": 10495 + }, + { + "epoch": 2.1034068136272546, + "grad_norm": 17.01538939790012, + "learning_rate": 2.48386246537201e-06, + "loss": 1.6865, + "step": 10496 + }, + { + "epoch": 2.1036072144288576, + "grad_norm": 13.970845961647145, + "learning_rate": 2.482855029980321e-06, + "loss": 1.3035, + "step": 10497 + }, + { + "epoch": 2.103807615230461, + "grad_norm": 42.33000746647808, + "learning_rate": 2.4818477314495197e-06, + "loss": 1.5354, + "step": 10498 + }, + { + "epoch": 2.104008016032064, + "grad_norm": 15.626584484237464, + "learning_rate": 2.480840569834375e-06, + "loss": 1.5525, + "step": 10499 + }, + { + "epoch": 2.1042084168336674, + "grad_norm": 21.01720843573946, + "learning_rate": 2.4798335451896464e-06, + "loss": 1.4745, + "step": 10500 + }, + { + "epoch": 2.1044088176352704, + "grad_norm": 24.793790580663032, + "learning_rate": 2.4788266575700887e-06, + "loss": 2.1324, + "step": 10501 + }, + { + "epoch": 2.104609218436874, + 
"grad_norm": 25.77028635350499, + "learning_rate": 2.477819907030447e-06, + "loss": 1.7758, + "step": 10502 + }, + { + "epoch": 2.104809619238477, + "grad_norm": 20.63473530407351, + "learning_rate": 2.476813293625462e-06, + "loss": 1.6282, + "step": 10503 + }, + { + "epoch": 2.1050100200400803, + "grad_norm": 21.380078689559415, + "learning_rate": 2.4758068174098603e-06, + "loss": 1.4854, + "step": 10504 + }, + { + "epoch": 2.1052104208416833, + "grad_norm": 30.476781745018627, + "learning_rate": 2.474800478438368e-06, + "loss": 1.6095, + "step": 10505 + }, + { + "epoch": 2.1054108216432867, + "grad_norm": 18.48090049205869, + "learning_rate": 2.4737942767657025e-06, + "loss": 1.2965, + "step": 10506 + }, + { + "epoch": 2.1056112224448897, + "grad_norm": 20.00526072648464, + "learning_rate": 2.47278821244657e-06, + "loss": 1.568, + "step": 10507 + }, + { + "epoch": 2.105811623246493, + "grad_norm": 36.658314935392795, + "learning_rate": 2.471782285535673e-06, + "loss": 1.5311, + "step": 10508 + }, + { + "epoch": 2.106012024048096, + "grad_norm": 19.517512688707743, + "learning_rate": 2.4707764960877056e-06, + "loss": 1.5416, + "step": 10509 + }, + { + "epoch": 2.1062124248496996, + "grad_norm": 21.523290819851436, + "learning_rate": 2.469770844157355e-06, + "loss": 2.1344, + "step": 10510 + }, + { + "epoch": 2.1064128256513026, + "grad_norm": 33.36570990782713, + "learning_rate": 2.4687653297992974e-06, + "loss": 1.9823, + "step": 10511 + }, + { + "epoch": 2.1066132264529056, + "grad_norm": 28.39612918634352, + "learning_rate": 2.467759953068203e-06, + "loss": 1.9104, + "step": 10512 + }, + { + "epoch": 2.106813627254509, + "grad_norm": 21.32316042897591, + "learning_rate": 2.4667547140187415e-06, + "loss": 1.2741, + "step": 10513 + }, + { + "epoch": 2.107014028056112, + "grad_norm": 20.364458051000653, + "learning_rate": 2.4657496127055636e-06, + "loss": 1.7342, + "step": 10514 + }, + { + "epoch": 2.1072144288577155, + "grad_norm": 16.722778939178642, + "learning_rate": 2.4647446491833203e-06, + "loss": 1.3088, + "step": 10515 + }, + { + "epoch": 2.1074148296593185, + "grad_norm": 21.50147267010873, + "learning_rate": 2.4637398235066527e-06, + "loss": 1.3909, + "step": 10516 + }, + { + "epoch": 2.107615230460922, + "grad_norm": 19.714956311016678, + "learning_rate": 2.4627351357301947e-06, + "loss": 1.9627, + "step": 10517 + }, + { + "epoch": 2.107815631262525, + "grad_norm": 17.678044085290622, + "learning_rate": 2.4617305859085732e-06, + "loss": 1.3887, + "step": 10518 + }, + { + "epoch": 2.1080160320641284, + "grad_norm": 22.651270778456777, + "learning_rate": 2.460726174096406e-06, + "loss": 1.3664, + "step": 10519 + }, + { + "epoch": 2.1082164328657313, + "grad_norm": 22.999581188108515, + "learning_rate": 2.4597219003483073e-06, + "loss": 1.6935, + "step": 10520 + }, + { + "epoch": 2.108416833667335, + "grad_norm": 21.0529872739794, + "learning_rate": 2.458717764718877e-06, + "loss": 1.8074, + "step": 10521 + }, + { + "epoch": 2.108617234468938, + "grad_norm": 20.370329363282632, + "learning_rate": 2.4577137672627126e-06, + "loss": 1.6086, + "step": 10522 + }, + { + "epoch": 2.1088176352705412, + "grad_norm": 46.687258821488975, + "learning_rate": 2.4567099080344036e-06, + "loss": 1.5913, + "step": 10523 + }, + { + "epoch": 2.109018036072144, + "grad_norm": 32.99217150576827, + "learning_rate": 2.455706187088531e-06, + "loss": 1.8298, + "step": 10524 + }, + { + "epoch": 2.1092184368737477, + "grad_norm": 20.787949672478895, + "learning_rate": 2.454702604479669e-06, + "loss": 
1.4837, + "step": 10525 + }, + { + "epoch": 2.1094188376753507, + "grad_norm": 22.60505111897013, + "learning_rate": 2.453699160262384e-06, + "loss": 1.2757, + "step": 10526 + }, + { + "epoch": 2.109619238476954, + "grad_norm": 17.52724217944618, + "learning_rate": 2.4526958544912353e-06, + "loss": 1.7772, + "step": 10527 + }, + { + "epoch": 2.109819639278557, + "grad_norm": 15.924803313052587, + "learning_rate": 2.45169268722077e-06, + "loss": 1.0135, + "step": 10528 + }, + { + "epoch": 2.1100200400801605, + "grad_norm": 26.86946617945896, + "learning_rate": 2.4506896585055386e-06, + "loss": 1.5218, + "step": 10529 + }, + { + "epoch": 2.1102204408817635, + "grad_norm": 20.644366102052395, + "learning_rate": 2.4496867684000713e-06, + "loss": 1.5963, + "step": 10530 + }, + { + "epoch": 2.1104208416833665, + "grad_norm": 25.14508018119396, + "learning_rate": 2.4486840169588993e-06, + "loss": 1.4737, + "step": 10531 + }, + { + "epoch": 2.11062124248497, + "grad_norm": 98.28464767980232, + "learning_rate": 2.447681404236544e-06, + "loss": 1.6295, + "step": 10532 + }, + { + "epoch": 2.110821643286573, + "grad_norm": 19.88102654184114, + "learning_rate": 2.446678930287517e-06, + "loss": 1.4228, + "step": 10533 + }, + { + "epoch": 2.1110220440881764, + "grad_norm": 20.49252530136623, + "learning_rate": 2.4456765951663285e-06, + "loss": 1.606, + "step": 10534 + }, + { + "epoch": 2.1112224448897794, + "grad_norm": 20.862462367845623, + "learning_rate": 2.4446743989274703e-06, + "loss": 1.9235, + "step": 10535 + }, + { + "epoch": 2.111422845691383, + "grad_norm": 22.89598987853833, + "learning_rate": 2.4436723416254406e-06, + "loss": 1.4772, + "step": 10536 + }, + { + "epoch": 2.111623246492986, + "grad_norm": 23.536000822246173, + "learning_rate": 2.442670423314717e-06, + "loss": 1.6421, + "step": 10537 + }, + { + "epoch": 2.1118236472945893, + "grad_norm": 20.75403355798976, + "learning_rate": 2.4416686440497776e-06, + "loss": 1.5941, + "step": 10538 + }, + { + "epoch": 2.1120240480961923, + "grad_norm": 24.65588018243935, + "learning_rate": 2.4406670038850906e-06, + "loss": 2.0433, + "step": 10539 + }, + { + "epoch": 2.1122244488977957, + "grad_norm": 28.79363762561225, + "learning_rate": 2.4396655028751165e-06, + "loss": 2.046, + "step": 10540 + }, + { + "epoch": 2.1124248496993987, + "grad_norm": 17.480154368967625, + "learning_rate": 2.43866414107431e-06, + "loss": 1.5905, + "step": 10541 + }, + { + "epoch": 2.112625250501002, + "grad_norm": 23.657387096693814, + "learning_rate": 2.4376629185371116e-06, + "loss": 1.2496, + "step": 10542 + }, + { + "epoch": 2.112825651302605, + "grad_norm": 33.56149739212769, + "learning_rate": 2.4366618353179644e-06, + "loss": 1.7977, + "step": 10543 + }, + { + "epoch": 2.1130260521042086, + "grad_norm": 23.68980507750595, + "learning_rate": 2.4356608914712992e-06, + "loss": 1.8166, + "step": 10544 + }, + { + "epoch": 2.1132264529058116, + "grad_norm": 27.802304303870002, + "learning_rate": 2.4346600870515325e-06, + "loss": 1.5958, + "step": 10545 + }, + { + "epoch": 2.113426853707415, + "grad_norm": 26.706812501185013, + "learning_rate": 2.433659422113088e-06, + "loss": 1.6437, + "step": 10546 + }, + { + "epoch": 2.113627254509018, + "grad_norm": 21.886529235081536, + "learning_rate": 2.4326588967103663e-06, + "loss": 1.6191, + "step": 10547 + }, + { + "epoch": 2.1138276553106214, + "grad_norm": 24.005617663673586, + "learning_rate": 2.4316585108977702e-06, + "loss": 1.4579, + "step": 10548 + }, + { + "epoch": 2.1140280561122244, + "grad_norm": 
35.44929493095074, + "learning_rate": 2.4306582647296927e-06, + "loss": 1.5576, + "step": 10549 + }, + { + "epoch": 2.1142284569138274, + "grad_norm": 26.408130748462757, + "learning_rate": 2.429658158260518e-06, + "loss": 1.3721, + "step": 10550 + }, + { + "epoch": 2.114428857715431, + "grad_norm": 20.547559902110937, + "learning_rate": 2.428658191544625e-06, + "loss": 1.3044, + "step": 10551 + }, + { + "epoch": 2.114629258517034, + "grad_norm": 25.349828325911105, + "learning_rate": 2.4276583646363778e-06, + "loss": 1.7963, + "step": 10552 + }, + { + "epoch": 2.1148296593186373, + "grad_norm": 25.23932154263119, + "learning_rate": 2.4266586775901464e-06, + "loss": 0.9555, + "step": 10553 + }, + { + "epoch": 2.1150300601202403, + "grad_norm": 19.202811420840394, + "learning_rate": 2.4256591304602795e-06, + "loss": 2.1951, + "step": 10554 + }, + { + "epoch": 2.1152304609218437, + "grad_norm": 19.3979272333926, + "learning_rate": 2.424659723301126e-06, + "loss": 1.3183, + "step": 10555 + }, + { + "epoch": 2.1154308617234467, + "grad_norm": 21.8110701447607, + "learning_rate": 2.4236604561670246e-06, + "loss": 1.29, + "step": 10556 + }, + { + "epoch": 2.11563126252505, + "grad_norm": 16.840793760419455, + "learning_rate": 2.4226613291123067e-06, + "loss": 1.2676, + "step": 10557 + }, + { + "epoch": 2.115831663326653, + "grad_norm": 24.733834424870796, + "learning_rate": 2.4216623421912994e-06, + "loss": 1.5214, + "step": 10558 + }, + { + "epoch": 2.1160320641282566, + "grad_norm": 42.4042032257436, + "learning_rate": 2.4206634954583124e-06, + "loss": 1.9478, + "step": 10559 + }, + { + "epoch": 2.1162324649298596, + "grad_norm": 26.0659856257436, + "learning_rate": 2.4196647889676628e-06, + "loss": 1.5728, + "step": 10560 + }, + { + "epoch": 2.116432865731463, + "grad_norm": 17.00898920333664, + "learning_rate": 2.418666222773646e-06, + "loss": 1.423, + "step": 10561 + }, + { + "epoch": 2.116633266533066, + "grad_norm": 19.178694149817336, + "learning_rate": 2.417667796930556e-06, + "loss": 1.6147, + "step": 10562 + }, + { + "epoch": 2.1168336673346695, + "grad_norm": 19.098480405856524, + "learning_rate": 2.4166695114926802e-06, + "loss": 1.3491, + "step": 10563 + }, + { + "epoch": 2.1170340681362725, + "grad_norm": 31.31189947953271, + "learning_rate": 2.4156713665142965e-06, + "loss": 1.4104, + "step": 10564 + }, + { + "epoch": 2.117234468937876, + "grad_norm": 22.483051682769574, + "learning_rate": 2.4146733620496777e-06, + "loss": 1.3966, + "step": 10565 + }, + { + "epoch": 2.117434869739479, + "grad_norm": 19.99172260561184, + "learning_rate": 2.41367549815308e-06, + "loss": 1.9139, + "step": 10566 + }, + { + "epoch": 2.1176352705410824, + "grad_norm": 20.395480717013918, + "learning_rate": 2.4126777748787673e-06, + "loss": 1.4299, + "step": 10567 + }, + { + "epoch": 2.1178356713426854, + "grad_norm": 27.815398882899185, + "learning_rate": 2.4116801922809814e-06, + "loss": 1.358, + "step": 10568 + }, + { + "epoch": 2.118036072144289, + "grad_norm": 24.318255893256545, + "learning_rate": 2.4106827504139623e-06, + "loss": 1.4396, + "step": 10569 + }, + { + "epoch": 2.118236472945892, + "grad_norm": 28.343781889115736, + "learning_rate": 2.4096854493319476e-06, + "loss": 1.5115, + "step": 10570 + }, + { + "epoch": 2.118436873747495, + "grad_norm": 23.68156755002595, + "learning_rate": 2.408688289089157e-06, + "loss": 1.6072, + "step": 10571 + }, + { + "epoch": 2.1186372745490982, + "grad_norm": 20.937373640185033, + "learning_rate": 2.40769126973981e-06, + "loss": 1.5917, + "step": 10572 + 
}, + { + "epoch": 2.118837675350701, + "grad_norm": 40.755149430779525, + "learning_rate": 2.4066943913381142e-06, + "loss": 1.6403, + "step": 10573 + }, + { + "epoch": 2.1190380761523047, + "grad_norm": 15.412076529467331, + "learning_rate": 2.405697653938273e-06, + "loss": 1.3982, + "step": 10574 + }, + { + "epoch": 2.1192384769539077, + "grad_norm": 18.134107830333054, + "learning_rate": 2.404701057594482e-06, + "loss": 1.489, + "step": 10575 + }, + { + "epoch": 2.119438877755511, + "grad_norm": 20.058515439975153, + "learning_rate": 2.403704602360921e-06, + "loss": 1.7098, + "step": 10576 + }, + { + "epoch": 2.119639278557114, + "grad_norm": 22.016668285053957, + "learning_rate": 2.4027082882917773e-06, + "loss": 1.2157, + "step": 10577 + }, + { + "epoch": 2.1198396793587175, + "grad_norm": 23.870757213453732, + "learning_rate": 2.401712115441216e-06, + "loss": 1.5034, + "step": 10578 + }, + { + "epoch": 2.1200400801603205, + "grad_norm": 22.38209443259928, + "learning_rate": 2.4007160838634024e-06, + "loss": 1.8784, + "step": 10579 + }, + { + "epoch": 2.120240480961924, + "grad_norm": 15.572756104767041, + "learning_rate": 2.3997201936124924e-06, + "loss": 1.3828, + "step": 10580 + }, + { + "epoch": 2.120440881763527, + "grad_norm": 27.39039251054297, + "learning_rate": 2.3987244447426338e-06, + "loss": 1.341, + "step": 10581 + }, + { + "epoch": 2.1206412825651304, + "grad_norm": 20.787165558693044, + "learning_rate": 2.3977288373079687e-06, + "loss": 1.3968, + "step": 10582 + }, + { + "epoch": 2.1208416833667334, + "grad_norm": 15.324951196826152, + "learning_rate": 2.3967333713626244e-06, + "loss": 1.341, + "step": 10583 + }, + { + "epoch": 2.121042084168337, + "grad_norm": 18.895759864708772, + "learning_rate": 2.3957380469607335e-06, + "loss": 1.1937, + "step": 10584 + }, + { + "epoch": 2.12124248496994, + "grad_norm": 22.26686707239996, + "learning_rate": 2.394742864156407e-06, + "loss": 1.3905, + "step": 10585 + }, + { + "epoch": 2.1214428857715433, + "grad_norm": 22.393762233708703, + "learning_rate": 2.393747823003757e-06, + "loss": 1.951, + "step": 10586 + }, + { + "epoch": 2.1216432865731463, + "grad_norm": 22.095877566376288, + "learning_rate": 2.3927529235568846e-06, + "loss": 1.7962, + "step": 10587 + }, + { + "epoch": 2.1218436873747497, + "grad_norm": 19.46019785265895, + "learning_rate": 2.3917581658698845e-06, + "loss": 1.209, + "step": 10588 + }, + { + "epoch": 2.1220440881763527, + "grad_norm": 21.56619892025278, + "learning_rate": 2.3907635499968433e-06, + "loss": 1.5708, + "step": 10589 + }, + { + "epoch": 2.1222444889779557, + "grad_norm": 28.25144200072638, + "learning_rate": 2.3897690759918397e-06, + "loss": 1.4097, + "step": 10590 + }, + { + "epoch": 2.122444889779559, + "grad_norm": 24.533744658322995, + "learning_rate": 2.388774743908946e-06, + "loss": 1.2788, + "step": 10591 + }, + { + "epoch": 2.122645290581162, + "grad_norm": 17.838848291016408, + "learning_rate": 2.387780553802222e-06, + "loss": 1.6328, + "step": 10592 + }, + { + "epoch": 2.1228456913827656, + "grad_norm": 20.939942061174452, + "learning_rate": 2.386786505725725e-06, + "loss": 1.3335, + "step": 10593 + }, + { + "epoch": 2.1230460921843686, + "grad_norm": 22.617410125114777, + "learning_rate": 2.3857925997335035e-06, + "loss": 1.7925, + "step": 10594 + }, + { + "epoch": 2.123246492985972, + "grad_norm": 46.990832457966555, + "learning_rate": 2.3847988358795977e-06, + "loss": 1.5947, + "step": 10595 + }, + { + "epoch": 2.123446893787575, + "grad_norm": 19.21600358584218, + "learning_rate": 
2.383805214218039e-06, + "loss": 1.2387, + "step": 10596 + }, + { + "epoch": 2.1236472945891784, + "grad_norm": 20.74815632847116, + "learning_rate": 2.382811734802853e-06, + "loss": 1.0098, + "step": 10597 + }, + { + "epoch": 2.1238476953907814, + "grad_norm": 23.457334592517977, + "learning_rate": 2.3818183976880575e-06, + "loss": 1.6099, + "step": 10598 + }, + { + "epoch": 2.124048096192385, + "grad_norm": 34.7711448283851, + "learning_rate": 2.3808252029276572e-06, + "loss": 1.6298, + "step": 10599 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 13.928122008785383, + "learning_rate": 2.3798321505756593e-06, + "loss": 1.3625, + "step": 10600 + }, + { + "epoch": 2.1244488977955913, + "grad_norm": 20.820954672186573, + "learning_rate": 2.3788392406860565e-06, + "loss": 1.775, + "step": 10601 + }, + { + "epoch": 2.1246492985971943, + "grad_norm": 91.02918148731975, + "learning_rate": 2.3778464733128315e-06, + "loss": 1.5965, + "step": 10602 + }, + { + "epoch": 2.1248496993987978, + "grad_norm": 18.16157316890612, + "learning_rate": 2.3768538485099646e-06, + "loss": 1.2174, + "step": 10603 + }, + { + "epoch": 2.1250501002004007, + "grad_norm": 18.70148533686174, + "learning_rate": 2.3758613663314256e-06, + "loss": 1.3586, + "step": 10604 + }, + { + "epoch": 2.125250501002004, + "grad_norm": 27.10656753309881, + "learning_rate": 2.374869026831178e-06, + "loss": 1.2784, + "step": 10605 + }, + { + "epoch": 2.125450901803607, + "grad_norm": 21.92092190176829, + "learning_rate": 2.3738768300631767e-06, + "loss": 1.6569, + "step": 10606 + }, + { + "epoch": 2.1256513026052106, + "grad_norm": 19.259308642976585, + "learning_rate": 2.3728847760813685e-06, + "loss": 1.5202, + "step": 10607 + }, + { + "epoch": 2.1258517034068136, + "grad_norm": 19.490145996419713, + "learning_rate": 2.3718928649396946e-06, + "loss": 1.8439, + "step": 10608 + }, + { + "epoch": 2.1260521042084166, + "grad_norm": 28.941372595819534, + "learning_rate": 2.3709010966920833e-06, + "loss": 1.3343, + "step": 10609 + }, + { + "epoch": 2.12625250501002, + "grad_norm": 23.850972883444424, + "learning_rate": 2.36990947139246e-06, + "loss": 1.6158, + "step": 10610 + }, + { + "epoch": 2.126452905811623, + "grad_norm": 19.673983928089683, + "learning_rate": 2.3689179890947417e-06, + "loss": 1.6035, + "step": 10611 + }, + { + "epoch": 2.1266533066132265, + "grad_norm": 16.57046157446773, + "learning_rate": 2.3679266498528358e-06, + "loss": 1.6075, + "step": 10612 + }, + { + "epoch": 2.1268537074148295, + "grad_norm": 21.557645177444314, + "learning_rate": 2.3669354537206437e-06, + "loss": 1.8779, + "step": 10613 + }, + { + "epoch": 2.127054108216433, + "grad_norm": 22.14211185479024, + "learning_rate": 2.365944400752057e-06, + "loss": 1.5484, + "step": 10614 + }, + { + "epoch": 2.127254509018036, + "grad_norm": 25.89966789374267, + "learning_rate": 2.364953491000964e-06, + "loss": 1.5101, + "step": 10615 + }, + { + "epoch": 2.1274549098196394, + "grad_norm": 20.50349526288087, + "learning_rate": 2.3639627245212353e-06, + "loss": 1.2176, + "step": 10616 + }, + { + "epoch": 2.1276553106212424, + "grad_norm": 26.65145976629128, + "learning_rate": 2.3629721013667483e-06, + "loss": 1.3751, + "step": 10617 + }, + { + "epoch": 2.127855711422846, + "grad_norm": 19.266945883702192, + "learning_rate": 2.3619816215913587e-06, + "loss": 1.6454, + "step": 10618 + }, + { + "epoch": 2.128056112224449, + "grad_norm": 26.940721769102602, + "learning_rate": 2.3609912852489226e-06, + "loss": 2.0355, + "step": 10619 + }, + { + "epoch": 
2.1282565130260522, + "grad_norm": 27.28210235741226, + "learning_rate": 2.360001092393286e-06, + "loss": 1.3415, + "step": 10620 + }, + { + "epoch": 2.1284569138276552, + "grad_norm": 25.770444623450107, + "learning_rate": 2.3590110430782863e-06, + "loss": 1.3691, + "step": 10621 + }, + { + "epoch": 2.1286573146292587, + "grad_norm": 21.338736270825876, + "learning_rate": 2.358021137357757e-06, + "loss": 1.5707, + "step": 10622 + }, + { + "epoch": 2.1288577154308617, + "grad_norm": 19.719116873007447, + "learning_rate": 2.357031375285515e-06, + "loss": 1.4949, + "step": 10623 + }, + { + "epoch": 2.129058116232465, + "grad_norm": 20.457868889647322, + "learning_rate": 2.35604175691538e-06, + "loss": 1.5463, + "step": 10624 + }, + { + "epoch": 2.129258517034068, + "grad_norm": 19.188928208709697, + "learning_rate": 2.3550522823011596e-06, + "loss": 1.0457, + "step": 10625 + }, + { + "epoch": 2.1294589178356715, + "grad_norm": 26.39146693571732, + "learning_rate": 2.3540629514966485e-06, + "loss": 1.275, + "step": 10626 + }, + { + "epoch": 2.1296593186372745, + "grad_norm": 22.340706016084923, + "learning_rate": 2.3530737645556406e-06, + "loss": 1.8058, + "step": 10627 + }, + { + "epoch": 2.129859719438878, + "grad_norm": 19.812170243315045, + "learning_rate": 2.3520847215319196e-06, + "loss": 1.3317, + "step": 10628 + }, + { + "epoch": 2.130060120240481, + "grad_norm": 83.39426501503561, + "learning_rate": 2.351095822479261e-06, + "loss": 1.9446, + "step": 10629 + }, + { + "epoch": 2.130260521042084, + "grad_norm": 41.231699588046716, + "learning_rate": 2.3501070674514326e-06, + "loss": 1.8372, + "step": 10630 + }, + { + "epoch": 2.1304609218436874, + "grad_norm": 30.383296997920333, + "learning_rate": 2.3491184565021945e-06, + "loss": 1.2974, + "step": 10631 + }, + { + "epoch": 2.1306613226452904, + "grad_norm": 17.31679342616713, + "learning_rate": 2.348129989685301e-06, + "loss": 1.638, + "step": 10632 + }, + { + "epoch": 2.130861723446894, + "grad_norm": 19.977339147192176, + "learning_rate": 2.3471416670544905e-06, + "loss": 1.5519, + "step": 10633 + }, + { + "epoch": 2.131062124248497, + "grad_norm": 27.855856712350704, + "learning_rate": 2.346153488663508e-06, + "loss": 1.4049, + "step": 10634 + }, + { + "epoch": 2.1312625250501003, + "grad_norm": 26.388546610953558, + "learning_rate": 2.3451654545660753e-06, + "loss": 2.2527, + "step": 10635 + }, + { + "epoch": 2.1314629258517033, + "grad_norm": 15.040775974688616, + "learning_rate": 2.3441775648159154e-06, + "loss": 1.5549, + "step": 10636 + }, + { + "epoch": 2.1316633266533067, + "grad_norm": 25.60777242529212, + "learning_rate": 2.3431898194667417e-06, + "loss": 1.5851, + "step": 10637 + }, + { + "epoch": 2.1318637274549097, + "grad_norm": 20.42169780579117, + "learning_rate": 2.3422022185722597e-06, + "loss": 1.7629, + "step": 10638 + }, + { + "epoch": 2.132064128256513, + "grad_norm": 19.889473911148404, + "learning_rate": 2.3412147621861686e-06, + "loss": 1.2118, + "step": 10639 + }, + { + "epoch": 2.132264529058116, + "grad_norm": 27.394991268935918, + "learning_rate": 2.3402274503621514e-06, + "loss": 1.8857, + "step": 10640 + }, + { + "epoch": 2.1324649298597196, + "grad_norm": 23.933480005446594, + "learning_rate": 2.3392402831538975e-06, + "loss": 1.8389, + "step": 10641 + }, + { + "epoch": 2.1326653306613226, + "grad_norm": 21.911672896329588, + "learning_rate": 2.3382532606150755e-06, + "loss": 1.6531, + "step": 10642 + }, + { + "epoch": 2.132865731462926, + "grad_norm": 20.483064122228136, + "learning_rate": 
2.3372663827993525e-06, + "loss": 1.8922, + "step": 10643 + }, + { + "epoch": 2.133066132264529, + "grad_norm": 20.106221456560625, + "learning_rate": 2.3362796497603867e-06, + "loss": 1.3769, + "step": 10644 + }, + { + "epoch": 2.1332665330661325, + "grad_norm": 19.68101188417864, + "learning_rate": 2.335293061551829e-06, + "loss": 1.415, + "step": 10645 + }, + { + "epoch": 2.1334669338677354, + "grad_norm": 18.690816123108497, + "learning_rate": 2.334306618227323e-06, + "loss": 1.4989, + "step": 10646 + }, + { + "epoch": 2.1336673346693384, + "grad_norm": 17.079279839091804, + "learning_rate": 2.3333203198404963e-06, + "loss": 1.4954, + "step": 10647 + }, + { + "epoch": 2.133867735470942, + "grad_norm": 17.31589603357617, + "learning_rate": 2.3323341664449843e-06, + "loss": 1.7141, + "step": 10648 + }, + { + "epoch": 2.1340681362725453, + "grad_norm": 47.48745807709493, + "learning_rate": 2.3313481580944e-06, + "loss": 1.5077, + "step": 10649 + }, + { + "epoch": 2.1342685370741483, + "grad_norm": 16.348227434821645, + "learning_rate": 2.330362294842354e-06, + "loss": 1.4476, + "step": 10650 + }, + { + "epoch": 2.1344689378757513, + "grad_norm": 22.303616281955016, + "learning_rate": 2.329376576742454e-06, + "loss": 1.501, + "step": 10651 + }, + { + "epoch": 2.1346693386773548, + "grad_norm": 21.880942708369922, + "learning_rate": 2.32839100384829e-06, + "loss": 1.2815, + "step": 10652 + }, + { + "epoch": 2.1348697394789578, + "grad_norm": 19.74196861863081, + "learning_rate": 2.327405576213452e-06, + "loss": 1.6626, + "step": 10653 + }, + { + "epoch": 2.135070140280561, + "grad_norm": 13.71685183359451, + "learning_rate": 2.3264202938915154e-06, + "loss": 1.7973, + "step": 10654 + }, + { + "epoch": 2.135270541082164, + "grad_norm": 22.096911599095506, + "learning_rate": 2.325435156936055e-06, + "loss": 1.6192, + "step": 10655 + }, + { + "epoch": 2.1354709418837676, + "grad_norm": 18.627746175310257, + "learning_rate": 2.324450165400635e-06, + "loss": 1.5704, + "step": 10656 + }, + { + "epoch": 2.1356713426853706, + "grad_norm": 17.34575551652386, + "learning_rate": 2.3234653193388055e-06, + "loss": 1.2728, + "step": 10657 + }, + { + "epoch": 2.135871743486974, + "grad_norm": 14.425513309823875, + "learning_rate": 2.3224806188041214e-06, + "loss": 1.426, + "step": 10658 + }, + { + "epoch": 2.136072144288577, + "grad_norm": 25.06599827051625, + "learning_rate": 2.3214960638501165e-06, + "loss": 1.7032, + "step": 10659 + }, + { + "epoch": 2.1362725450901805, + "grad_norm": 24.478018549542604, + "learning_rate": 2.3205116545303246e-06, + "loss": 1.448, + "step": 10660 + }, + { + "epoch": 2.1364729458917835, + "grad_norm": 23.354221036081203, + "learning_rate": 2.319527390898269e-06, + "loss": 1.9047, + "step": 10661 + }, + { + "epoch": 2.136673346693387, + "grad_norm": 21.941485349669186, + "learning_rate": 2.318543273007466e-06, + "loss": 1.1957, + "step": 10662 + }, + { + "epoch": 2.13687374749499, + "grad_norm": 14.834568371360055, + "learning_rate": 2.3175593009114255e-06, + "loss": 1.6915, + "step": 10663 + }, + { + "epoch": 2.1370741482965934, + "grad_norm": 24.65512431826531, + "learning_rate": 2.3165754746636416e-06, + "loss": 1.7574, + "step": 10664 + }, + { + "epoch": 2.1372745490981964, + "grad_norm": 21.30039385646173, + "learning_rate": 2.3155917943176136e-06, + "loss": 1.5634, + "step": 10665 + }, + { + "epoch": 2.1374749498998, + "grad_norm": 19.905090778440837, + "learning_rate": 2.314608259926821e-06, + "loss": 1.3215, + "step": 10666 + }, + { + "epoch": 2.137675350701403, + 
"grad_norm": 22.896303867706905, + "learning_rate": 2.3136248715447407e-06, + "loss": 1.6079, + "step": 10667 + }, + { + "epoch": 2.137875751503006, + "grad_norm": 37.222346629577046, + "learning_rate": 2.312641629224842e-06, + "loss": 1.2049, + "step": 10668 + }, + { + "epoch": 2.1380761523046092, + "grad_norm": 18.661839769102176, + "learning_rate": 2.3116585330205847e-06, + "loss": 1.4916, + "step": 10669 + }, + { + "epoch": 2.1382765531062122, + "grad_norm": 27.79016406241236, + "learning_rate": 2.3106755829854215e-06, + "loss": 1.2554, + "step": 10670 + }, + { + "epoch": 2.1384769539078157, + "grad_norm": 22.32272224399402, + "learning_rate": 2.3096927791727963e-06, + "loss": 2.0578, + "step": 10671 + }, + { + "epoch": 2.1386773547094187, + "grad_norm": 24.766061289826357, + "learning_rate": 2.3087101216361486e-06, + "loss": 1.3903, + "step": 10672 + }, + { + "epoch": 2.138877755511022, + "grad_norm": 22.417683650910327, + "learning_rate": 2.3077276104289014e-06, + "loss": 1.3104, + "step": 10673 + }, + { + "epoch": 2.139078156312625, + "grad_norm": 31.917895331113797, + "learning_rate": 2.3067452456044787e-06, + "loss": 1.6995, + "step": 10674 + }, + { + "epoch": 2.1392785571142285, + "grad_norm": 20.275762948759724, + "learning_rate": 2.3057630272162927e-06, + "loss": 1.7341, + "step": 10675 + }, + { + "epoch": 2.1394789579158315, + "grad_norm": 17.666173588053933, + "learning_rate": 2.304780955317748e-06, + "loss": 1.4036, + "step": 10676 + }, + { + "epoch": 2.139679358717435, + "grad_norm": 22.965806914856877, + "learning_rate": 2.303799029962241e-06, + "loss": 1.3736, + "step": 10677 + }, + { + "epoch": 2.139879759519038, + "grad_norm": 25.34664721222984, + "learning_rate": 2.3028172512031606e-06, + "loss": 1.619, + "step": 10678 + }, + { + "epoch": 2.1400801603206414, + "grad_norm": 16.186032411473317, + "learning_rate": 2.3018356190938896e-06, + "loss": 1.251, + "step": 10679 + }, + { + "epoch": 2.1402805611222444, + "grad_norm": 20.307040049067023, + "learning_rate": 2.300854133687797e-06, + "loss": 1.8507, + "step": 10680 + }, + { + "epoch": 2.140480961923848, + "grad_norm": 18.572119669624062, + "learning_rate": 2.299872795038248e-06, + "loss": 1.8353, + "step": 10681 + }, + { + "epoch": 2.140681362725451, + "grad_norm": 18.202558647411873, + "learning_rate": 2.298891603198604e-06, + "loss": 1.462, + "step": 10682 + }, + { + "epoch": 2.1408817635270543, + "grad_norm": 22.534400913649108, + "learning_rate": 2.2979105582222094e-06, + "loss": 1.7244, + "step": 10683 + }, + { + "epoch": 2.1410821643286573, + "grad_norm": 43.574294197728776, + "learning_rate": 2.2969296601624063e-06, + "loss": 1.2866, + "step": 10684 + }, + { + "epoch": 2.1412825651302607, + "grad_norm": 24.67451309801549, + "learning_rate": 2.2959489090725274e-06, + "loss": 1.4765, + "step": 10685 + }, + { + "epoch": 2.1414829659318637, + "grad_norm": 15.623574520532605, + "learning_rate": 2.294968305005898e-06, + "loss": 1.8943, + "step": 10686 + }, + { + "epoch": 2.141683366733467, + "grad_norm": 27.32468384605092, + "learning_rate": 2.2939878480158352e-06, + "loss": 1.2278, + "step": 10687 + }, + { + "epoch": 2.14188376753507, + "grad_norm": 18.52904354924982, + "learning_rate": 2.2930075381556473e-06, + "loss": 1.5582, + "step": 10688 + }, + { + "epoch": 2.142084168336673, + "grad_norm": 59.37350458750509, + "learning_rate": 2.2920273754786373e-06, + "loss": 1.6884, + "step": 10689 + }, + { + "epoch": 2.1422845691382766, + "grad_norm": 19.009676975273365, + "learning_rate": 2.291047360038094e-06, + "loss": 
1.7204, + "step": 10690 + }, + { + "epoch": 2.1424849699398796, + "grad_norm": 21.103943112620307, + "learning_rate": 2.2900674918873045e-06, + "loss": 1.3525, + "step": 10691 + }, + { + "epoch": 2.142685370741483, + "grad_norm": 15.061912643553915, + "learning_rate": 2.289087771079546e-06, + "loss": 1.5544, + "step": 10692 + }, + { + "epoch": 2.142885771543086, + "grad_norm": 43.95651858088182, + "learning_rate": 2.288108197668087e-06, + "loss": 1.6758, + "step": 10693 + }, + { + "epoch": 2.1430861723446895, + "grad_norm": 14.050067007530455, + "learning_rate": 2.2871287717061887e-06, + "loss": 1.7646, + "step": 10694 + }, + { + "epoch": 2.1432865731462925, + "grad_norm": 24.690444872252655, + "learning_rate": 2.2861494932471035e-06, + "loss": 1.4122, + "step": 10695 + }, + { + "epoch": 2.143486973947896, + "grad_norm": 26.01055914239364, + "learning_rate": 2.285170362344078e-06, + "loss": 1.9359, + "step": 10696 + }, + { + "epoch": 2.143687374749499, + "grad_norm": 26.871492426947306, + "learning_rate": 2.2841913790503458e-06, + "loss": 1.6694, + "step": 10697 + }, + { + "epoch": 2.1438877755511023, + "grad_norm": 20.368128191405912, + "learning_rate": 2.2832125434191368e-06, + "loss": 1.4928, + "step": 10698 + }, + { + "epoch": 2.1440881763527053, + "grad_norm": 19.93051560519234, + "learning_rate": 2.282233855503672e-06, + "loss": 1.6842, + "step": 10699 + }, + { + "epoch": 2.1442885771543088, + "grad_norm": 26.672635506451616, + "learning_rate": 2.281255315357165e-06, + "loss": 1.8368, + "step": 10700 + }, + { + "epoch": 2.1444889779559118, + "grad_norm": 20.566442508635767, + "learning_rate": 2.2802769230328194e-06, + "loss": 1.5371, + "step": 10701 + }, + { + "epoch": 2.144689378757515, + "grad_norm": 14.32887939296892, + "learning_rate": 2.2792986785838323e-06, + "loss": 1.4206, + "step": 10702 + }, + { + "epoch": 2.144889779559118, + "grad_norm": 15.698993547236439, + "learning_rate": 2.2783205820633943e-06, + "loss": 1.5014, + "step": 10703 + }, + { + "epoch": 2.1450901803607216, + "grad_norm": 15.955472704211513, + "learning_rate": 2.2773426335246795e-06, + "loss": 1.2778, + "step": 10704 + }, + { + "epoch": 2.1452905811623246, + "grad_norm": 18.44273430276638, + "learning_rate": 2.276364833020868e-06, + "loss": 1.4011, + "step": 10705 + }, + { + "epoch": 2.1454909819639276, + "grad_norm": 22.425630436987152, + "learning_rate": 2.2753871806051203e-06, + "loss": 1.6177, + "step": 10706 + }, + { + "epoch": 2.145691382765531, + "grad_norm": 18.358338775042643, + "learning_rate": 2.274409676330593e-06, + "loss": 1.6499, + "step": 10707 + }, + { + "epoch": 2.1458917835671345, + "grad_norm": 16.81527068492886, + "learning_rate": 2.273432320250435e-06, + "loss": 1.574, + "step": 10708 + }, + { + "epoch": 2.1460921843687375, + "grad_norm": 24.91946075410912, + "learning_rate": 2.2724551124177864e-06, + "loss": 1.9329, + "step": 10709 + }, + { + "epoch": 2.1462925851703405, + "grad_norm": 22.5610315361631, + "learning_rate": 2.271478052885781e-06, + "loss": 1.6335, + "step": 10710 + }, + { + "epoch": 2.146492985971944, + "grad_norm": 33.4587150334964, + "learning_rate": 2.2705011417075384e-06, + "loss": 1.6884, + "step": 10711 + }, + { + "epoch": 2.146693386773547, + "grad_norm": 23.257857157474955, + "learning_rate": 2.26952437893618e-06, + "loss": 1.6271, + "step": 10712 + }, + { + "epoch": 2.1468937875751504, + "grad_norm": 24.68038397690502, + "learning_rate": 2.2685477646248126e-06, + "loss": 1.9355, + "step": 10713 + }, + { + "epoch": 2.1470941883767534, + "grad_norm": 
31.94645004865323, + "learning_rate": 2.2675712988265326e-06, + "loss": 1.9271, + "step": 10714 + }, + { + "epoch": 2.147294589178357, + "grad_norm": 25.690040783087575, + "learning_rate": 2.266594981594437e-06, + "loss": 1.5068, + "step": 10715 + }, + { + "epoch": 2.14749498997996, + "grad_norm": 19.294555274971675, + "learning_rate": 2.2656188129816066e-06, + "loss": 1.8903, + "step": 10716 + }, + { + "epoch": 2.1476953907815632, + "grad_norm": 19.253754992391098, + "learning_rate": 2.2646427930411163e-06, + "loss": 1.8176, + "step": 10717 + }, + { + "epoch": 2.1478957915831662, + "grad_norm": 27.359777472343563, + "learning_rate": 2.263666921826036e-06, + "loss": 2.0419, + "step": 10718 + }, + { + "epoch": 2.1480961923847697, + "grad_norm": 17.16311540636996, + "learning_rate": 2.262691199389424e-06, + "loss": 1.284, + "step": 10719 + }, + { + "epoch": 2.1482965931863727, + "grad_norm": 23.335642529410563, + "learning_rate": 2.2617156257843344e-06, + "loss": 1.4818, + "step": 10720 + }, + { + "epoch": 2.148496993987976, + "grad_norm": 47.01914738110064, + "learning_rate": 2.2607402010638043e-06, + "loss": 1.5761, + "step": 10721 + }, + { + "epoch": 2.148697394789579, + "grad_norm": 25.787674328853353, + "learning_rate": 2.2597649252808766e-06, + "loss": 1.9946, + "step": 10722 + }, + { + "epoch": 2.1488977955911825, + "grad_norm": 20.344707126247254, + "learning_rate": 2.258789798488573e-06, + "loss": 1.4825, + "step": 10723 + }, + { + "epoch": 2.1490981963927855, + "grad_norm": 19.62209180640357, + "learning_rate": 2.257814820739915e-06, + "loss": 1.9357, + "step": 10724 + }, + { + "epoch": 2.149298597194389, + "grad_norm": 22.922908616914285, + "learning_rate": 2.2568399920879127e-06, + "loss": 1.3807, + "step": 10725 + }, + { + "epoch": 2.149498997995992, + "grad_norm": 34.39120663167431, + "learning_rate": 2.255865312585569e-06, + "loss": 1.2745, + "step": 10726 + }, + { + "epoch": 2.149699398797595, + "grad_norm": 29.6317651041914, + "learning_rate": 2.254890782285881e-06, + "loss": 1.3336, + "step": 10727 + }, + { + "epoch": 2.1498997995991984, + "grad_norm": 23.778244782866476, + "learning_rate": 2.25391640124183e-06, + "loss": 1.4246, + "step": 10728 + }, + { + "epoch": 2.1501002004008014, + "grad_norm": 23.91815333537607, + "learning_rate": 2.252942169506401e-06, + "loss": 1.3892, + "step": 10729 + }, + { + "epoch": 2.150300601202405, + "grad_norm": 27.475554822837374, + "learning_rate": 2.2519680871325596e-06, + "loss": 1.6644, + "step": 10730 + }, + { + "epoch": 2.150501002004008, + "grad_norm": 23.506875630085336, + "learning_rate": 2.2509941541732694e-06, + "loss": 0.9924, + "step": 10731 + }, + { + "epoch": 2.1507014028056113, + "grad_norm": 33.55331114468436, + "learning_rate": 2.2500203706814855e-06, + "loss": 1.5031, + "step": 10732 + }, + { + "epoch": 2.1509018036072143, + "grad_norm": 26.21289700453133, + "learning_rate": 2.249046736710153e-06, + "loss": 1.7909, + "step": 10733 + }, + { + "epoch": 2.1511022044088177, + "grad_norm": 24.88945923393558, + "learning_rate": 2.248073252312212e-06, + "loss": 1.6272, + "step": 10734 + }, + { + "epoch": 2.1513026052104207, + "grad_norm": 19.404975982038636, + "learning_rate": 2.247099917540587e-06, + "loss": 1.6653, + "step": 10735 + }, + { + "epoch": 2.151503006012024, + "grad_norm": 18.28513626118685, + "learning_rate": 2.246126732448207e-06, + "loss": 1.4069, + "step": 10736 + }, + { + "epoch": 2.151703406813627, + "grad_norm": 22.06038211580641, + "learning_rate": 2.2451536970879802e-06, + "loss": 1.7391, + "step": 10737 + 
}, + { + "epoch": 2.1519038076152306, + "grad_norm": 22.102967145982337, + "learning_rate": 2.244180811512811e-06, + "loss": 1.4276, + "step": 10738 + }, + { + "epoch": 2.1521042084168336, + "grad_norm": 26.38303272335336, + "learning_rate": 2.2432080757756024e-06, + "loss": 1.5301, + "step": 10739 + }, + { + "epoch": 2.152304609218437, + "grad_norm": 18.509407893604006, + "learning_rate": 2.2422354899292394e-06, + "loss": 1.7309, + "step": 10740 + }, + { + "epoch": 2.15250501002004, + "grad_norm": 21.122243271580608, + "learning_rate": 2.2412630540266045e-06, + "loss": 1.7598, + "step": 10741 + }, + { + "epoch": 2.1527054108216435, + "grad_norm": 19.06551083150359, + "learning_rate": 2.240290768120566e-06, + "loss": 1.4028, + "step": 10742 + }, + { + "epoch": 2.1529058116232465, + "grad_norm": 30.505617940001745, + "learning_rate": 2.2393186322639945e-06, + "loss": 2.0619, + "step": 10743 + }, + { + "epoch": 2.15310621242485, + "grad_norm": 27.60530933821787, + "learning_rate": 2.2383466465097455e-06, + "loss": 1.134, + "step": 10744 + }, + { + "epoch": 2.153306613226453, + "grad_norm": 35.939250153248494, + "learning_rate": 2.2373748109106623e-06, + "loss": 1.8236, + "step": 10745 + }, + { + "epoch": 2.1535070140280563, + "grad_norm": 17.572757328208116, + "learning_rate": 2.236403125519592e-06, + "loss": 1.3548, + "step": 10746 + }, + { + "epoch": 2.1537074148296593, + "grad_norm": 16.989710686383184, + "learning_rate": 2.235431590389362e-06, + "loss": 1.1168, + "step": 10747 + }, + { + "epoch": 2.1539078156312623, + "grad_norm": 19.20924041149382, + "learning_rate": 2.234460205572797e-06, + "loss": 1.3977, + "step": 10748 + }, + { + "epoch": 2.1541082164328658, + "grad_norm": 20.9315168714877, + "learning_rate": 2.2334889711227133e-06, + "loss": 1.4963, + "step": 10749 + }, + { + "epoch": 2.1543086172344688, + "grad_norm": 21.221326150063557, + "learning_rate": 2.2325178870919175e-06, + "loss": 1.1416, + "step": 10750 + }, + { + "epoch": 2.154509018036072, + "grad_norm": 23.893846025106974, + "learning_rate": 2.231546953533211e-06, + "loss": 1.7123, + "step": 10751 + }, + { + "epoch": 2.154709418837675, + "grad_norm": 29.881955170887895, + "learning_rate": 2.23057617049938e-06, + "loss": 1.9203, + "step": 10752 + }, + { + "epoch": 2.1549098196392786, + "grad_norm": 23.960079225681795, + "learning_rate": 2.2296055380432145e-06, + "loss": 1.2373, + "step": 10753 + }, + { + "epoch": 2.1551102204408816, + "grad_norm": 19.699016365514012, + "learning_rate": 2.228635056217483e-06, + "loss": 1.828, + "step": 10754 + }, + { + "epoch": 2.155310621242485, + "grad_norm": 28.204714045306538, + "learning_rate": 2.227664725074955e-06, + "loss": 1.7327, + "step": 10755 + }, + { + "epoch": 2.155511022044088, + "grad_norm": 19.09340551402026, + "learning_rate": 2.226694544668388e-06, + "loss": 1.6911, + "step": 10756 + }, + { + "epoch": 2.1557114228456915, + "grad_norm": 19.811001623636063, + "learning_rate": 2.2257245150505325e-06, + "loss": 1.7808, + "step": 10757 + }, + { + "epoch": 2.1559118236472945, + "grad_norm": 44.55014555338451, + "learning_rate": 2.22475463627413e-06, + "loss": 1.4734, + "step": 10758 + }, + { + "epoch": 2.156112224448898, + "grad_norm": 29.95415179368109, + "learning_rate": 2.2237849083919146e-06, + "loss": 1.6439, + "step": 10759 + }, + { + "epoch": 2.156312625250501, + "grad_norm": 17.105583541070043, + "learning_rate": 2.222815331456614e-06, + "loss": 1.0991, + "step": 10760 + }, + { + "epoch": 2.1565130260521044, + "grad_norm": 27.261019948127107, + "learning_rate": 
2.2218459055209405e-06, + "loss": 1.2909, + "step": 10761 + }, + { + "epoch": 2.1567134268537074, + "grad_norm": 22.040877859041668, + "learning_rate": 2.2208766306376057e-06, + "loss": 2.0414, + "step": 10762 + }, + { + "epoch": 2.156913827655311, + "grad_norm": 20.62464552957636, + "learning_rate": 2.2199075068593133e-06, + "loss": 1.6709, + "step": 10763 + }, + { + "epoch": 2.157114228456914, + "grad_norm": 25.468970739080373, + "learning_rate": 2.2189385342387526e-06, + "loss": 1.5073, + "step": 10764 + }, + { + "epoch": 2.157314629258517, + "grad_norm": 23.74725349623399, + "learning_rate": 2.2179697128286083e-06, + "loss": 1.286, + "step": 10765 + }, + { + "epoch": 2.1575150300601202, + "grad_norm": 25.406940082735186, + "learning_rate": 2.2170010426815573e-06, + "loss": 1.4172, + "step": 10766 + }, + { + "epoch": 2.1577154308617237, + "grad_norm": 21.5144642002332, + "learning_rate": 2.216032523850268e-06, + "loss": 1.6195, + "step": 10767 + }, + { + "epoch": 2.1579158316633267, + "grad_norm": 20.630097240737303, + "learning_rate": 2.215064156387402e-06, + "loss": 1.595, + "step": 10768 + }, + { + "epoch": 2.1581162324649297, + "grad_norm": 26.449693333600457, + "learning_rate": 2.2140959403456053e-06, + "loss": 1.646, + "step": 10769 + }, + { + "epoch": 2.158316633266533, + "grad_norm": 19.771362512626403, + "learning_rate": 2.2131278757775276e-06, + "loss": 1.8809, + "step": 10770 + }, + { + "epoch": 2.158517034068136, + "grad_norm": 27.991535948900673, + "learning_rate": 2.2121599627357997e-06, + "loss": 2.1147, + "step": 10771 + }, + { + "epoch": 2.1587174348697395, + "grad_norm": 21.514011487192565, + "learning_rate": 2.2111922012730496e-06, + "loss": 1.2715, + "step": 10772 + }, + { + "epoch": 2.1589178356713425, + "grad_norm": 20.202348916828925, + "learning_rate": 2.2102245914418966e-06, + "loss": 1.4787, + "step": 10773 + }, + { + "epoch": 2.159118236472946, + "grad_norm": 21.909900078123453, + "learning_rate": 2.2092571332949504e-06, + "loss": 1.6107, + "step": 10774 + }, + { + "epoch": 2.159318637274549, + "grad_norm": 30.66989490219146, + "learning_rate": 2.2082898268848134e-06, + "loss": 1.842, + "step": 10775 + }, + { + "epoch": 2.1595190380761524, + "grad_norm": 37.48678368767811, + "learning_rate": 2.2073226722640796e-06, + "loss": 1.48, + "step": 10776 + }, + { + "epoch": 2.1597194388777554, + "grad_norm": 17.871663711445642, + "learning_rate": 2.206355669485336e-06, + "loss": 1.7344, + "step": 10777 + }, + { + "epoch": 2.159919839679359, + "grad_norm": 20.953079387273668, + "learning_rate": 2.2053888186011566e-06, + "loss": 1.5941, + "step": 10778 + }, + { + "epoch": 2.160120240480962, + "grad_norm": 17.631418429373962, + "learning_rate": 2.2044221196641123e-06, + "loss": 0.9274, + "step": 10779 + }, + { + "epoch": 2.1603206412825653, + "grad_norm": 16.537927394300645, + "learning_rate": 2.2034555727267644e-06, + "loss": 1.0502, + "step": 10780 + }, + { + "epoch": 2.1605210420841683, + "grad_norm": 53.81485163379294, + "learning_rate": 2.2024891778416646e-06, + "loss": 1.5249, + "step": 10781 + }, + { + "epoch": 2.1607214428857717, + "grad_norm": 22.046581056990703, + "learning_rate": 2.201522935061358e-06, + "loss": 1.8078, + "step": 10782 + }, + { + "epoch": 2.1609218436873747, + "grad_norm": 29.079364917209276, + "learning_rate": 2.2005568444383805e-06, + "loss": 1.7442, + "step": 10783 + }, + { + "epoch": 2.161122244488978, + "grad_norm": 23.911744514383244, + "learning_rate": 2.1995909060252614e-06, + "loss": 1.4769, + "step": 10784 + }, + { + "epoch": 
2.161322645290581, + "grad_norm": 16.914302717787965, + "learning_rate": 2.198625119874517e-06, + "loss": 1.7201, + "step": 10785 + }, + { + "epoch": 2.161523046092184, + "grad_norm": 24.66734652623345, + "learning_rate": 2.19765948603866e-06, + "loss": 1.5632, + "step": 10786 + }, + { + "epoch": 2.1617234468937876, + "grad_norm": 15.486137548250243, + "learning_rate": 2.196694004570193e-06, + "loss": 1.5327, + "step": 10787 + }, + { + "epoch": 2.1619238476953906, + "grad_norm": 26.8647291104754, + "learning_rate": 2.1957286755216105e-06, + "loss": 1.3529, + "step": 10788 + }, + { + "epoch": 2.162124248496994, + "grad_norm": 24.26300117041429, + "learning_rate": 2.1947634989454002e-06, + "loss": 1.5765, + "step": 10789 + }, + { + "epoch": 2.162324649298597, + "grad_norm": 24.16647425424809, + "learning_rate": 2.1937984748940393e-06, + "loss": 1.945, + "step": 10790 + }, + { + "epoch": 2.1625250501002005, + "grad_norm": 19.140269043374996, + "learning_rate": 2.192833603419999e-06, + "loss": 1.8091, + "step": 10791 + }, + { + "epoch": 2.1627254509018035, + "grad_norm": 27.158922331313978, + "learning_rate": 2.1918688845757363e-06, + "loss": 1.7106, + "step": 10792 + }, + { + "epoch": 2.162925851703407, + "grad_norm": 23.065815181328418, + "learning_rate": 2.19090431841371e-06, + "loss": 1.3276, + "step": 10793 + }, + { + "epoch": 2.16312625250501, + "grad_norm": 20.736539665294444, + "learning_rate": 2.189939904986363e-06, + "loss": 1.2538, + "step": 10794 + }, + { + "epoch": 2.1633266533066133, + "grad_norm": 23.264692140842676, + "learning_rate": 2.18897564434613e-06, + "loss": 1.6055, + "step": 10795 + }, + { + "epoch": 2.1635270541082163, + "grad_norm": 22.667187661928438, + "learning_rate": 2.1880115365454413e-06, + "loss": 1.5539, + "step": 10796 + }, + { + "epoch": 2.1637274549098198, + "grad_norm": 34.99761204771971, + "learning_rate": 2.187047581636715e-06, + "loss": 1.9116, + "step": 10797 + }, + { + "epoch": 2.1639278557114228, + "grad_norm": 23.9651036304932, + "learning_rate": 2.186083779672365e-06, + "loss": 1.2078, + "step": 10798 + }, + { + "epoch": 2.164128256513026, + "grad_norm": 16.511024441454968, + "learning_rate": 2.1851201307047937e-06, + "loss": 1.6756, + "step": 10799 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 18.519097925488737, + "learning_rate": 2.1841566347863958e-06, + "loss": 1.3738, + "step": 10800 + }, + { + "epoch": 2.1645290581162326, + "grad_norm": 20.518686196891533, + "learning_rate": 2.1831932919695597e-06, + "loss": 1.4922, + "step": 10801 + }, + { + "epoch": 2.1647294589178356, + "grad_norm": 22.396557601307997, + "learning_rate": 2.1822301023066587e-06, + "loss": 1.3358, + "step": 10802 + }, + { + "epoch": 2.164929859719439, + "grad_norm": 24.712256599205396, + "learning_rate": 2.1812670658500705e-06, + "loss": 1.7038, + "step": 10803 + }, + { + "epoch": 2.165130260521042, + "grad_norm": 17.18836510256924, + "learning_rate": 2.180304182652151e-06, + "loss": 1.3411, + "step": 10804 + }, + { + "epoch": 2.1653306613226455, + "grad_norm": 33.76696668706119, + "learning_rate": 2.1793414527652545e-06, + "loss": 1.4038, + "step": 10805 + }, + { + "epoch": 2.1655310621242485, + "grad_norm": 18.43816071185047, + "learning_rate": 2.1783788762417277e-06, + "loss": 1.85, + "step": 10806 + }, + { + "epoch": 2.1657314629258515, + "grad_norm": 20.470171140595976, + "learning_rate": 2.1774164531339066e-06, + "loss": 1.3827, + "step": 10807 + }, + { + "epoch": 2.165931863727455, + "grad_norm": 20.015718774710106, + "learning_rate": 2.1764541834941207e-06, 
+ "loss": 1.6666, + "step": 10808 + }, + { + "epoch": 2.166132264529058, + "grad_norm": 30.610708860604145, + "learning_rate": 2.1754920673746853e-06, + "loss": 1.4788, + "step": 10809 + }, + { + "epoch": 2.1663326653306614, + "grad_norm": 18.540020414546554, + "learning_rate": 2.1745301048279193e-06, + "loss": 1.4927, + "step": 10810 + }, + { + "epoch": 2.1665330661322644, + "grad_norm": 23.946517623538547, + "learning_rate": 2.1735682959061204e-06, + "loss": 1.6505, + "step": 10811 + }, + { + "epoch": 2.166733466933868, + "grad_norm": 15.262239090761302, + "learning_rate": 2.1726066406615854e-06, + "loss": 1.7041, + "step": 10812 + }, + { + "epoch": 2.166933867735471, + "grad_norm": 21.818377723994164, + "learning_rate": 2.171645139146601e-06, + "loss": 1.8815, + "step": 10813 + }, + { + "epoch": 2.1671342685370742, + "grad_norm": 25.07468072287402, + "learning_rate": 2.1706837914134452e-06, + "loss": 1.4512, + "step": 10814 + }, + { + "epoch": 2.1673346693386772, + "grad_norm": 23.78855595333222, + "learning_rate": 2.1697225975143903e-06, + "loss": 1.6987, + "step": 10815 + }, + { + "epoch": 2.1675350701402807, + "grad_norm": 23.71962190234069, + "learning_rate": 2.1687615575016918e-06, + "loss": 1.6036, + "step": 10816 + }, + { + "epoch": 2.1677354709418837, + "grad_norm": 22.834694771466143, + "learning_rate": 2.167800671427611e-06, + "loss": 1.5078, + "step": 10817 + }, + { + "epoch": 2.167935871743487, + "grad_norm": 21.17575586984658, + "learning_rate": 2.1668399393443857e-06, + "loss": 1.4904, + "step": 10818 + }, + { + "epoch": 2.16813627254509, + "grad_norm": 16.50249321902934, + "learning_rate": 2.1658793613042533e-06, + "loss": 1.92, + "step": 10819 + }, + { + "epoch": 2.1683366733466936, + "grad_norm": 21.199940347321633, + "learning_rate": 2.1649189373594475e-06, + "loss": 1.5231, + "step": 10820 + }, + { + "epoch": 2.1685370741482966, + "grad_norm": 20.045916591969565, + "learning_rate": 2.163958667562182e-06, + "loss": 1.5753, + "step": 10821 + }, + { + "epoch": 2.1687374749499, + "grad_norm": 17.363433266501474, + "learning_rate": 2.1629985519646725e-06, + "loss": 1.8201, + "step": 10822 + }, + { + "epoch": 2.168937875751503, + "grad_norm": 28.936775726521418, + "learning_rate": 2.1620385906191157e-06, + "loss": 2.1892, + "step": 10823 + }, + { + "epoch": 2.169138276553106, + "grad_norm": 18.630117520556155, + "learning_rate": 2.1610787835777114e-06, + "loss": 1.7448, + "step": 10824 + }, + { + "epoch": 2.1693386773547094, + "grad_norm": 19.27068064418133, + "learning_rate": 2.160119130892646e-06, + "loss": 1.0548, + "step": 10825 + }, + { + "epoch": 2.1695390781563124, + "grad_norm": 17.26124497810155, + "learning_rate": 2.1591596326160925e-06, + "loss": 1.7872, + "step": 10826 + }, + { + "epoch": 2.169739478957916, + "grad_norm": 17.181664253477688, + "learning_rate": 2.158200288800226e-06, + "loss": 1.4929, + "step": 10827 + }, + { + "epoch": 2.169939879759519, + "grad_norm": 21.60445513951664, + "learning_rate": 2.1572410994972037e-06, + "loss": 1.6268, + "step": 10828 + }, + { + "epoch": 2.1701402805611223, + "grad_norm": 23.122004214938617, + "learning_rate": 2.156282064759179e-06, + "loss": 1.4497, + "step": 10829 + }, + { + "epoch": 2.1703406813627253, + "grad_norm": 18.68356360252458, + "learning_rate": 2.1553231846382954e-06, + "loss": 1.4697, + "step": 10830 + }, + { + "epoch": 2.1705410821643287, + "grad_norm": 17.875384695436534, + "learning_rate": 2.1543644591866903e-06, + "loss": 1.5755, + "step": 10831 + }, + { + "epoch": 2.1707414829659317, + "grad_norm": 
22.677033117003074, + "learning_rate": 2.1534058884564912e-06, + "loss": 1.5248, + "step": 10832 + }, + { + "epoch": 2.170941883767535, + "grad_norm": 31.159815852382298, + "learning_rate": 2.1524474724998123e-06, + "loss": 1.6863, + "step": 10833 + }, + { + "epoch": 2.171142284569138, + "grad_norm": 17.094043257344012, + "learning_rate": 2.151489211368772e-06, + "loss": 1.8785, + "step": 10834 + }, + { + "epoch": 2.1713426853707416, + "grad_norm": 29.875832283662415, + "learning_rate": 2.150531105115466e-06, + "loss": 1.2335, + "step": 10835 + }, + { + "epoch": 2.1715430861723446, + "grad_norm": 21.376020685973643, + "learning_rate": 2.1495731537919894e-06, + "loss": 1.7204, + "step": 10836 + }, + { + "epoch": 2.171743486973948, + "grad_norm": 25.357232133516405, + "learning_rate": 2.1486153574504286e-06, + "loss": 1.7173, + "step": 10837 + }, + { + "epoch": 2.171943887775551, + "grad_norm": 17.717951689371425, + "learning_rate": 2.14765771614286e-06, + "loss": 1.6903, + "step": 10838 + }, + { + "epoch": 2.1721442885771545, + "grad_norm": 41.45873871592161, + "learning_rate": 2.1467002299213533e-06, + "loss": 1.2147, + "step": 10839 + }, + { + "epoch": 2.1723446893787575, + "grad_norm": 25.474582048460825, + "learning_rate": 2.145742898837964e-06, + "loss": 1.6472, + "step": 10840 + }, + { + "epoch": 2.172545090180361, + "grad_norm": 23.20787136214733, + "learning_rate": 2.1447857229447486e-06, + "loss": 1.768, + "step": 10841 + }, + { + "epoch": 2.172745490981964, + "grad_norm": 20.776140603427397, + "learning_rate": 2.1438287022937475e-06, + "loss": 1.5612, + "step": 10842 + }, + { + "epoch": 2.1729458917835673, + "grad_norm": 30.781874895672512, + "learning_rate": 2.142871836936996e-06, + "loss": 1.565, + "step": 10843 + }, + { + "epoch": 2.1731462925851703, + "grad_norm": 37.308840926217734, + "learning_rate": 2.14191512692652e-06, + "loss": 1.4681, + "step": 10844 + }, + { + "epoch": 2.1733466933867733, + "grad_norm": 24.335149173093583, + "learning_rate": 2.1409585723143373e-06, + "loss": 1.7635, + "step": 10845 + }, + { + "epoch": 2.1735470941883768, + "grad_norm": 14.60836255005953, + "learning_rate": 2.140002173152458e-06, + "loss": 1.5021, + "step": 10846 + }, + { + "epoch": 2.1737474949899798, + "grad_norm": 19.479167061586697, + "learning_rate": 2.1390459294928815e-06, + "loss": 1.9967, + "step": 10847 + }, + { + "epoch": 2.173947895791583, + "grad_norm": 19.227231616028202, + "learning_rate": 2.1380898413876033e-06, + "loss": 1.1448, + "step": 10848 + }, + { + "epoch": 2.174148296593186, + "grad_norm": 51.48155738812339, + "learning_rate": 2.137133908888603e-06, + "loss": 1.3951, + "step": 10849 + }, + { + "epoch": 2.1743486973947896, + "grad_norm": 24.307085017956855, + "learning_rate": 2.136178132047856e-06, + "loss": 1.5161, + "step": 10850 + }, + { + "epoch": 2.1745490981963926, + "grad_norm": 17.487524012908032, + "learning_rate": 2.1352225109173354e-06, + "loss": 1.0505, + "step": 10851 + }, + { + "epoch": 2.174749498997996, + "grad_norm": 23.093475181762955, + "learning_rate": 2.1342670455489932e-06, + "loss": 1.3066, + "step": 10852 + }, + { + "epoch": 2.174949899799599, + "grad_norm": 23.757774548791392, + "learning_rate": 2.1333117359947825e-06, + "loss": 1.9837, + "step": 10853 + }, + { + "epoch": 2.1751503006012025, + "grad_norm": 23.96926521169251, + "learning_rate": 2.1323565823066445e-06, + "loss": 1.7671, + "step": 10854 + }, + { + "epoch": 2.1753507014028055, + "grad_norm": 18.79520513835737, + "learning_rate": 2.1314015845365114e-06, + "loss": 1.0719, + 
"step": 10855 + }, + { + "epoch": 2.175551102204409, + "grad_norm": 20.376754665088274, + "learning_rate": 2.1304467427363112e-06, + "loss": 1.8514, + "step": 10856 + }, + { + "epoch": 2.175751503006012, + "grad_norm": 21.383028275473485, + "learning_rate": 2.1294920569579537e-06, + "loss": 1.8081, + "step": 10857 + }, + { + "epoch": 2.1759519038076154, + "grad_norm": 19.996503632236536, + "learning_rate": 2.128537527253354e-06, + "loss": 1.7887, + "step": 10858 + }, + { + "epoch": 2.1761523046092184, + "grad_norm": 21.851826790518423, + "learning_rate": 2.127583153674405e-06, + "loss": 1.4737, + "step": 10859 + }, + { + "epoch": 2.176352705410822, + "grad_norm": 28.692636446389468, + "learning_rate": 2.126628936273001e-06, + "loss": 1.5662, + "step": 10860 + }, + { + "epoch": 2.176553106212425, + "grad_norm": 19.664376422841453, + "learning_rate": 2.1256748751010225e-06, + "loss": 1.7948, + "step": 10861 + }, + { + "epoch": 2.1767535070140283, + "grad_norm": 23.43627613549135, + "learning_rate": 2.1247209702103443e-06, + "loss": 1.8397, + "step": 10862 + }, + { + "epoch": 2.1769539078156313, + "grad_norm": 22.14505291118019, + "learning_rate": 2.123767221652831e-06, + "loss": 1.6456, + "step": 10863 + }, + { + "epoch": 2.1771543086172347, + "grad_norm": 21.961651176294406, + "learning_rate": 2.1228136294803397e-06, + "loss": 1.6854, + "step": 10864 + }, + { + "epoch": 2.1773547094188377, + "grad_norm": 24.204104563494646, + "learning_rate": 2.12186019374472e-06, + "loss": 1.6848, + "step": 10865 + }, + { + "epoch": 2.1775551102204407, + "grad_norm": 55.46674126534252, + "learning_rate": 2.120906914497809e-06, + "loss": 2.2768, + "step": 10866 + }, + { + "epoch": 2.177755511022044, + "grad_norm": 27.621772219063264, + "learning_rate": 2.1199537917914386e-06, + "loss": 1.4089, + "step": 10867 + }, + { + "epoch": 2.177955911823647, + "grad_norm": 29.630999889849246, + "learning_rate": 2.119000825677432e-06, + "loss": 1.4401, + "step": 10868 + }, + { + "epoch": 2.1781563126252506, + "grad_norm": 34.039441358229624, + "learning_rate": 2.1180480162076027e-06, + "loss": 1.6393, + "step": 10869 + }, + { + "epoch": 2.1783567134268536, + "grad_norm": 29.15111227120044, + "learning_rate": 2.1170953634337578e-06, + "loss": 1.6684, + "step": 10870 + }, + { + "epoch": 2.178557114228457, + "grad_norm": 16.639444448447687, + "learning_rate": 2.1161428674076928e-06, + "loss": 1.3245, + "step": 10871 + }, + { + "epoch": 2.17875751503006, + "grad_norm": 23.587097113011474, + "learning_rate": 2.115190528181199e-06, + "loss": 1.6634, + "step": 10872 + }, + { + "epoch": 2.1789579158316634, + "grad_norm": 20.965788388907608, + "learning_rate": 2.114238345806051e-06, + "loss": 1.3903, + "step": 10873 + }, + { + "epoch": 2.1791583166332664, + "grad_norm": 16.0881868836908, + "learning_rate": 2.113286320334028e-06, + "loss": 1.3069, + "step": 10874 + }, + { + "epoch": 2.17935871743487, + "grad_norm": 20.726951736941825, + "learning_rate": 2.1123344518168864e-06, + "loss": 1.7677, + "step": 10875 + }, + { + "epoch": 2.179559118236473, + "grad_norm": 20.941822380944995, + "learning_rate": 2.111382740306384e-06, + "loss": 1.6907, + "step": 10876 + }, + { + "epoch": 2.1797595190380763, + "grad_norm": 18.276896897530726, + "learning_rate": 2.1104311858542654e-06, + "loss": 1.7498, + "step": 10877 + }, + { + "epoch": 2.1799599198396793, + "grad_norm": 21.57271053793095, + "learning_rate": 2.109479788512269e-06, + "loss": 1.2995, + "step": 10878 + }, + { + "epoch": 2.1801603206412827, + "grad_norm": 18.63072960238164, + 
"learning_rate": 2.108528548332123e-06, + "loss": 1.5225, + "step": 10879 + }, + { + "epoch": 2.1803607214428857, + "grad_norm": 18.279806526588107, + "learning_rate": 2.107577465365548e-06, + "loss": 1.5788, + "step": 10880 + }, + { + "epoch": 2.180561122244489, + "grad_norm": 20.811365185196234, + "learning_rate": 2.106626539664256e-06, + "loss": 1.3931, + "step": 10881 + }, + { + "epoch": 2.180761523046092, + "grad_norm": 38.974202549407146, + "learning_rate": 2.105675771279952e-06, + "loss": 1.5234, + "step": 10882 + }, + { + "epoch": 2.180961923847695, + "grad_norm": 44.50863989072198, + "learning_rate": 2.1047251602643265e-06, + "loss": 1.47, + "step": 10883 + }, + { + "epoch": 2.1811623246492986, + "grad_norm": 26.62444598057846, + "learning_rate": 2.1037747066690684e-06, + "loss": 1.5494, + "step": 10884 + }, + { + "epoch": 2.1813627254509016, + "grad_norm": 23.419003974303365, + "learning_rate": 2.102824410545854e-06, + "loss": 1.8175, + "step": 10885 + }, + { + "epoch": 2.181563126252505, + "grad_norm": 26.80351235654549, + "learning_rate": 2.1018742719463536e-06, + "loss": 1.5647, + "step": 10886 + }, + { + "epoch": 2.181763527054108, + "grad_norm": 32.10245545865136, + "learning_rate": 2.1009242909222273e-06, + "loss": 1.9186, + "step": 10887 + }, + { + "epoch": 2.1819639278557115, + "grad_norm": 18.319559502685088, + "learning_rate": 2.099974467525126e-06, + "loss": 1.431, + "step": 10888 + }, + { + "epoch": 2.1821643286573145, + "grad_norm": 20.135000857796683, + "learning_rate": 2.0990248018066953e-06, + "loss": 2.3727, + "step": 10889 + }, + { + "epoch": 2.182364729458918, + "grad_norm": 17.316900448915675, + "learning_rate": 2.0980752938185654e-06, + "loss": 1.327, + "step": 10890 + }, + { + "epoch": 2.182565130260521, + "grad_norm": 21.31468462023604, + "learning_rate": 2.0971259436123692e-06, + "loss": 1.3284, + "step": 10891 + }, + { + "epoch": 2.1827655310621243, + "grad_norm": 27.95464232016457, + "learning_rate": 2.0961767512397184e-06, + "loss": 1.3867, + "step": 10892 + }, + { + "epoch": 2.1829659318637273, + "grad_norm": 28.41713118364968, + "learning_rate": 2.0952277167522246e-06, + "loss": 1.735, + "step": 10893 + }, + { + "epoch": 2.1831663326653308, + "grad_norm": 18.103707162269142, + "learning_rate": 2.094278840201487e-06, + "loss": 1.8721, + "step": 10894 + }, + { + "epoch": 2.1833667334669338, + "grad_norm": 29.58073195269921, + "learning_rate": 2.0933301216390976e-06, + "loss": 2.1568, + "step": 10895 + }, + { + "epoch": 2.183567134268537, + "grad_norm": 18.736003091715418, + "learning_rate": 2.092381561116643e-06, + "loss": 1.2689, + "step": 10896 + }, + { + "epoch": 2.18376753507014, + "grad_norm": 13.634633139487159, + "learning_rate": 2.0914331586856907e-06, + "loss": 1.3624, + "step": 10897 + }, + { + "epoch": 2.1839679358717436, + "grad_norm": 24.253900295561156, + "learning_rate": 2.0904849143978152e-06, + "loss": 1.8587, + "step": 10898 + }, + { + "epoch": 2.1841683366733466, + "grad_norm": 18.648574764705053, + "learning_rate": 2.0895368283045675e-06, + "loss": 1.7049, + "step": 10899 + }, + { + "epoch": 2.18436873747495, + "grad_norm": 19.136752473537374, + "learning_rate": 2.0885889004574988e-06, + "loss": 1.3348, + "step": 10900 + }, + { + "epoch": 2.184569138276553, + "grad_norm": 20.250384511778705, + "learning_rate": 2.087641130908149e-06, + "loss": 1.811, + "step": 10901 + }, + { + "epoch": 2.1847695390781565, + "grad_norm": 17.201382963224354, + "learning_rate": 2.0866935197080503e-06, + "loss": 1.9418, + "step": 10902 + }, + { + "epoch": 
2.1849699398797595, + "grad_norm": 21.63228758947029, + "learning_rate": 2.0857460669087272e-06, + "loss": 1.4997, + "step": 10903 + }, + { + "epoch": 2.1851703406813625, + "grad_norm": 21.94035818631656, + "learning_rate": 2.084798772561688e-06, + "loss": 1.8891, + "step": 10904 + }, + { + "epoch": 2.185370741482966, + "grad_norm": 18.58865362629887, + "learning_rate": 2.0838516367184453e-06, + "loss": 1.6582, + "step": 10905 + }, + { + "epoch": 2.185571142284569, + "grad_norm": 15.813923450638264, + "learning_rate": 2.0829046594304952e-06, + "loss": 1.5849, + "step": 10906 + }, + { + "epoch": 2.1857715430861724, + "grad_norm": 18.281030277158216, + "learning_rate": 2.0819578407493214e-06, + "loss": 1.7052, + "step": 10907 + }, + { + "epoch": 2.1859719438877754, + "grad_norm": 17.339720905518217, + "learning_rate": 2.0810111807264103e-06, + "loss": 1.5475, + "step": 10908 + }, + { + "epoch": 2.186172344689379, + "grad_norm": 22.529339218975707, + "learning_rate": 2.080064679413229e-06, + "loss": 1.7041, + "step": 10909 + }, + { + "epoch": 2.186372745490982, + "grad_norm": 22.250827180564677, + "learning_rate": 2.0791183368612412e-06, + "loss": 1.6972, + "step": 10910 + }, + { + "epoch": 2.1865731462925853, + "grad_norm": 21.67114871026355, + "learning_rate": 2.0781721531219007e-06, + "loss": 1.7058, + "step": 10911 + }, + { + "epoch": 2.1867735470941883, + "grad_norm": 20.65663855824914, + "learning_rate": 2.077226128246653e-06, + "loss": 1.2511, + "step": 10912 + }, + { + "epoch": 2.1869739478957917, + "grad_norm": 16.874746248384685, + "learning_rate": 2.0762802622869376e-06, + "loss": 1.8815, + "step": 10913 + }, + { + "epoch": 2.1871743486973947, + "grad_norm": 20.894046470004504, + "learning_rate": 2.075334555294176e-06, + "loss": 1.1366, + "step": 10914 + }, + { + "epoch": 2.187374749498998, + "grad_norm": 19.207926875044453, + "learning_rate": 2.074389007319796e-06, + "loss": 1.7153, + "step": 10915 + }, + { + "epoch": 2.187575150300601, + "grad_norm": 20.687912236820075, + "learning_rate": 2.0734436184152024e-06, + "loss": 1.004, + "step": 10916 + }, + { + "epoch": 2.1877755511022046, + "grad_norm": 17.818403122327176, + "learning_rate": 2.072498388631799e-06, + "loss": 1.3933, + "step": 10917 + }, + { + "epoch": 2.1879759519038076, + "grad_norm": 15.734554507981773, + "learning_rate": 2.07155331802098e-06, + "loss": 1.6203, + "step": 10918 + }, + { + "epoch": 2.188176352705411, + "grad_norm": 20.206872116176342, + "learning_rate": 2.0706084066341286e-06, + "loss": 1.4966, + "step": 10919 + }, + { + "epoch": 2.188376753507014, + "grad_norm": 28.256758312765722, + "learning_rate": 2.069663654522625e-06, + "loss": 1.2029, + "step": 10920 + }, + { + "epoch": 2.1885771543086174, + "grad_norm": 27.672203975351188, + "learning_rate": 2.068719061737831e-06, + "loss": 1.5936, + "step": 10921 + }, + { + "epoch": 2.1887775551102204, + "grad_norm": 35.28184791546305, + "learning_rate": 2.0677746283311117e-06, + "loss": 1.5111, + "step": 10922 + }, + { + "epoch": 2.188977955911824, + "grad_norm": 19.649064008345718, + "learning_rate": 2.066830354353812e-06, + "loss": 1.3103, + "step": 10923 + }, + { + "epoch": 2.189178356713427, + "grad_norm": 22.438337127297853, + "learning_rate": 2.0658862398572767e-06, + "loss": 1.4836, + "step": 10924 + }, + { + "epoch": 2.18937875751503, + "grad_norm": 16.872268558245153, + "learning_rate": 2.064942284892837e-06, + "loss": 1.4117, + "step": 10925 + }, + { + "epoch": 2.1895791583166333, + "grad_norm": 21.849655900734433, + "learning_rate": 
2.0639984895118175e-06, + "loss": 1.77, + "step": 10926 + }, + { + "epoch": 2.1897795591182363, + "grad_norm": 21.338015776141145, + "learning_rate": 2.0630548537655363e-06, + "loss": 1.6588, + "step": 10927 + }, + { + "epoch": 2.1899799599198397, + "grad_norm": 23.17650299207904, + "learning_rate": 2.062111377705294e-06, + "loss": 2.391, + "step": 10928 + }, + { + "epoch": 2.1901803607214427, + "grad_norm": 19.085058141391368, + "learning_rate": 2.061168061382397e-06, + "loss": 1.2484, + "step": 10929 + }, + { + "epoch": 2.190380761523046, + "grad_norm": 37.59574995664407, + "learning_rate": 2.060224904848129e-06, + "loss": 1.3287, + "step": 10930 + }, + { + "epoch": 2.190581162324649, + "grad_norm": 18.849793534243606, + "learning_rate": 2.0592819081537703e-06, + "loss": 1.6125, + "step": 10931 + }, + { + "epoch": 2.1907815631262526, + "grad_norm": 17.163913085356388, + "learning_rate": 2.0583390713505986e-06, + "loss": 1.9686, + "step": 10932 + }, + { + "epoch": 2.1909819639278556, + "grad_norm": 24.018564131000723, + "learning_rate": 2.0573963944898724e-06, + "loss": 1.4703, + "step": 10933 + }, + { + "epoch": 2.191182364729459, + "grad_norm": 26.164015912506343, + "learning_rate": 2.056453877622847e-06, + "loss": 1.718, + "step": 10934 + }, + { + "epoch": 2.191382765531062, + "grad_norm": 45.639420814170876, + "learning_rate": 2.0555115208007705e-06, + "loss": 1.5582, + "step": 10935 + }, + { + "epoch": 2.1915831663326655, + "grad_norm": 21.038394153546093, + "learning_rate": 2.054569324074878e-06, + "loss": 0.6977, + "step": 10936 + }, + { + "epoch": 2.1917835671342685, + "grad_norm": 42.75377041695214, + "learning_rate": 2.053627287496402e-06, + "loss": 1.2579, + "step": 10937 + }, + { + "epoch": 2.191983967935872, + "grad_norm": 16.346936087057546, + "learning_rate": 2.0526854111165555e-06, + "loss": 1.5572, + "step": 10938 + }, + { + "epoch": 2.192184368737475, + "grad_norm": 23.12345465399482, + "learning_rate": 2.0517436949865576e-06, + "loss": 0.9994, + "step": 10939 + }, + { + "epoch": 2.1923847695390783, + "grad_norm": 20.92797800054618, + "learning_rate": 2.0508021391576055e-06, + "loss": 1.3988, + "step": 10940 + }, + { + "epoch": 2.1925851703406813, + "grad_norm": 16.935854686044376, + "learning_rate": 2.0498607436808943e-06, + "loss": 1.4026, + "step": 10941 + }, + { + "epoch": 2.1927855711422843, + "grad_norm": 21.785789150035058, + "learning_rate": 2.048919508607609e-06, + "loss": 1.8445, + "step": 10942 + }, + { + "epoch": 2.192985971943888, + "grad_norm": 17.985960667263058, + "learning_rate": 2.0479784339889264e-06, + "loss": 1.4813, + "step": 10943 + }, + { + "epoch": 2.1931863727454908, + "grad_norm": 23.57792251419951, + "learning_rate": 2.047037519876014e-06, + "loss": 1.4711, + "step": 10944 + }, + { + "epoch": 2.193386773547094, + "grad_norm": 58.983507466155146, + "learning_rate": 2.0460967663200303e-06, + "loss": 1.8729, + "step": 10945 + }, + { + "epoch": 2.193587174348697, + "grad_norm": 29.17573991250572, + "learning_rate": 2.0451561733721283e-06, + "loss": 1.7642, + "step": 10946 + }, + { + "epoch": 2.1937875751503007, + "grad_norm": 31.51081652021233, + "learning_rate": 2.044215741083444e-06, + "loss": 2.2485, + "step": 10947 + }, + { + "epoch": 2.1939879759519036, + "grad_norm": 18.822943231111992, + "learning_rate": 2.043275469505114e-06, + "loss": 1.0096, + "step": 10948 + }, + { + "epoch": 2.194188376753507, + "grad_norm": 16.143500211978633, + "learning_rate": 2.0423353586882605e-06, + "loss": 1.442, + "step": 10949 + }, + { + "epoch": 
2.19438877755511, + "grad_norm": 44.427988521478284, + "learning_rate": 2.0413954086839998e-06, + "loss": 1.9218, + "step": 10950 + }, + { + "epoch": 2.1945891783567135, + "grad_norm": 19.446612699933194, + "learning_rate": 2.0404556195434384e-06, + "loss": 1.6955, + "step": 10951 + }, + { + "epoch": 2.1947895791583165, + "grad_norm": 21.148029999874613, + "learning_rate": 2.039515991317673e-06, + "loss": 1.3852, + "step": 10952 + }, + { + "epoch": 2.19498997995992, + "grad_norm": 27.351615392846817, + "learning_rate": 2.0385765240577954e-06, + "loss": 1.31, + "step": 10953 + }, + { + "epoch": 2.195190380761523, + "grad_norm": 18.550634547856607, + "learning_rate": 2.0376372178148817e-06, + "loss": 1.1607, + "step": 10954 + }, + { + "epoch": 2.1953907815631264, + "grad_norm": 22.405356478281895, + "learning_rate": 2.0366980726400055e-06, + "loss": 1.3252, + "step": 10955 + }, + { + "epoch": 2.1955911823647294, + "grad_norm": 20.056829942894208, + "learning_rate": 2.0357590885842294e-06, + "loss": 1.2218, + "step": 10956 + }, + { + "epoch": 2.195791583166333, + "grad_norm": 24.738934241445545, + "learning_rate": 2.034820265698607e-06, + "loss": 1.3247, + "step": 10957 + }, + { + "epoch": 2.195991983967936, + "grad_norm": 20.269444004459135, + "learning_rate": 2.0338816040341836e-06, + "loss": 1.6317, + "step": 10958 + }, + { + "epoch": 2.1961923847695393, + "grad_norm": 23.158151914471194, + "learning_rate": 2.032943103641996e-06, + "loss": 1.9098, + "step": 10959 + }, + { + "epoch": 2.1963927855711423, + "grad_norm": 19.31337861578851, + "learning_rate": 2.032004764573074e-06, + "loss": 1.5787, + "step": 10960 + }, + { + "epoch": 2.1965931863727457, + "grad_norm": 27.648930211860673, + "learning_rate": 2.0310665868784295e-06, + "loss": 1.5985, + "step": 10961 + }, + { + "epoch": 2.1967935871743487, + "grad_norm": 35.39454318578565, + "learning_rate": 2.0301285706090797e-06, + "loss": 1.3374, + "step": 10962 + }, + { + "epoch": 2.1969939879759517, + "grad_norm": 25.796181212238647, + "learning_rate": 2.029190715816025e-06, + "loss": 1.5411, + "step": 10963 + }, + { + "epoch": 2.197194388777555, + "grad_norm": 19.161193232370007, + "learning_rate": 2.0282530225502544e-06, + "loss": 1.6482, + "step": 10964 + }, + { + "epoch": 2.197394789579158, + "grad_norm": 40.00510786134988, + "learning_rate": 2.027315490862754e-06, + "loss": 1.5775, + "step": 10965 + }, + { + "epoch": 2.1975951903807616, + "grad_norm": 30.573546561987328, + "learning_rate": 2.026378120804499e-06, + "loss": 1.4448, + "step": 10966 + }, + { + "epoch": 2.1977955911823646, + "grad_norm": 27.127360944886117, + "learning_rate": 2.025440912426454e-06, + "loss": 1.9157, + "step": 10967 + }, + { + "epoch": 2.197995991983968, + "grad_norm": 21.43418284530827, + "learning_rate": 2.024503865779578e-06, + "loss": 1.5018, + "step": 10968 + }, + { + "epoch": 2.198196392785571, + "grad_norm": 19.194637264928964, + "learning_rate": 2.0235669809148185e-06, + "loss": 1.7155, + "step": 10969 + }, + { + "epoch": 2.1983967935871744, + "grad_norm": 24.255841004215725, + "learning_rate": 2.022630257883118e-06, + "loss": 1.5227, + "step": 10970 + }, + { + "epoch": 2.1985971943887774, + "grad_norm": 31.056719609923356, + "learning_rate": 2.021693696735404e-06, + "loss": 1.8273, + "step": 10971 + }, + { + "epoch": 2.198797595190381, + "grad_norm": 27.17609260445814, + "learning_rate": 2.0207572975225996e-06, + "loss": 2.0047, + "step": 10972 + }, + { + "epoch": 2.198997995991984, + "grad_norm": 58.031063139835965, + "learning_rate": 
2.0198210602956186e-06, + "loss": 1.9906, + "step": 10973 + }, + { + "epoch": 2.1991983967935873, + "grad_norm": 22.259315244016673, + "learning_rate": 2.018884985105366e-06, + "loss": 1.8127, + "step": 10974 + }, + { + "epoch": 2.1993987975951903, + "grad_norm": 23.112865459184366, + "learning_rate": 2.017949072002737e-06, + "loss": 1.462, + "step": 10975 + }, + { + "epoch": 2.1995991983967937, + "grad_norm": 28.37870500375281, + "learning_rate": 2.0170133210386195e-06, + "loss": 2.0567, + "step": 10976 + }, + { + "epoch": 2.1997995991983967, + "grad_norm": 20.229371682258236, + "learning_rate": 2.0160777322638924e-06, + "loss": 0.9899, + "step": 10977 + }, + { + "epoch": 2.2, + "grad_norm": 21.108194623047773, + "learning_rate": 2.01514230572942e-06, + "loss": 1.3762, + "step": 10978 + }, + { + "epoch": 2.200200400801603, + "grad_norm": 20.27810065705958, + "learning_rate": 2.0142070414860704e-06, + "loss": 1.3236, + "step": 10979 + }, + { + "epoch": 2.2004008016032066, + "grad_norm": 17.98213552875499, + "learning_rate": 2.013271939584689e-06, + "loss": 1.545, + "step": 10980 + }, + { + "epoch": 2.2006012024048096, + "grad_norm": 32.00287720794887, + "learning_rate": 2.0123370000761217e-06, + "loss": 1.0832, + "step": 10981 + }, + { + "epoch": 2.200801603206413, + "grad_norm": 16.908597182206123, + "learning_rate": 2.011402223011201e-06, + "loss": 1.245, + "step": 10982 + }, + { + "epoch": 2.201002004008016, + "grad_norm": 21.904233960592965, + "learning_rate": 2.0104676084407543e-06, + "loss": 1.4785, + "step": 10983 + }, + { + "epoch": 2.201202404809619, + "grad_norm": 21.917465237535588, + "learning_rate": 2.009533156415598e-06, + "loss": 1.615, + "step": 10984 + }, + { + "epoch": 2.2014028056112225, + "grad_norm": 15.834132867248142, + "learning_rate": 2.0085988669865342e-06, + "loss": 1.3616, + "step": 10985 + }, + { + "epoch": 2.2016032064128255, + "grad_norm": 20.74352552618441, + "learning_rate": 2.00766474020437e-06, + "loss": 1.4627, + "step": 10986 + }, + { + "epoch": 2.201803607214429, + "grad_norm": 19.925729837638375, + "learning_rate": 2.0067307761198882e-06, + "loss": 1.5083, + "step": 10987 + }, + { + "epoch": 2.202004008016032, + "grad_norm": 27.91671802288591, + "learning_rate": 2.005796974783872e-06, + "loss": 1.559, + "step": 10988 + }, + { + "epoch": 2.2022044088176354, + "grad_norm": 19.269862690157982, + "learning_rate": 2.0048633362470965e-06, + "loss": 1.1081, + "step": 10989 + }, + { + "epoch": 2.2024048096192383, + "grad_norm": 23.67609678471285, + "learning_rate": 2.003929860560322e-06, + "loss": 1.4369, + "step": 10990 + }, + { + "epoch": 2.202605210420842, + "grad_norm": 27.546473676964315, + "learning_rate": 2.0029965477743046e-06, + "loss": 1.7954, + "step": 10991 + }, + { + "epoch": 2.202805611222445, + "grad_norm": 22.733744133545414, + "learning_rate": 2.002063397939786e-06, + "loss": 1.9591, + "step": 10992 + }, + { + "epoch": 2.203006012024048, + "grad_norm": 19.05292984167933, + "learning_rate": 2.0011304111075064e-06, + "loss": 1.8123, + "step": 10993 + }, + { + "epoch": 2.203206412825651, + "grad_norm": 16.87938845072254, + "learning_rate": 2.000197587328196e-06, + "loss": 1.4644, + "step": 10994 + }, + { + "epoch": 2.2034068136272547, + "grad_norm": 23.434800169356386, + "learning_rate": 1.9992649266525666e-06, + "loss": 1.4243, + "step": 10995 + }, + { + "epoch": 2.2036072144288577, + "grad_norm": 25.361404965790445, + "learning_rate": 1.9983324291313365e-06, + "loss": 1.638, + "step": 10996 + }, + { + "epoch": 2.203807615230461, + "grad_norm": 
21.626254779436383, + "learning_rate": 1.9974000948152017e-06, + "loss": 1.606, + "step": 10997 + }, + { + "epoch": 2.204008016032064, + "grad_norm": 24.922272729953658, + "learning_rate": 1.9964679237548555e-06, + "loss": 2.1627, + "step": 10998 + }, + { + "epoch": 2.2042084168336675, + "grad_norm": 35.63059488209234, + "learning_rate": 1.9955359160009818e-06, + "loss": 1.4029, + "step": 10999 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 26.000076183449053, + "learning_rate": 1.994604071604256e-06, + "loss": 2.3121, + "step": 11000 + }, + { + "epoch": 2.2046092184368735, + "grad_norm": 18.253874559538545, + "learning_rate": 1.9936723906153445e-06, + "loss": 1.5226, + "step": 11001 + }, + { + "epoch": 2.204809619238477, + "grad_norm": 56.50860566870665, + "learning_rate": 1.992740873084899e-06, + "loss": 1.6692, + "step": 11002 + }, + { + "epoch": 2.20501002004008, + "grad_norm": 20.771019690862833, + "learning_rate": 1.9918095190635754e-06, + "loss": 1.5464, + "step": 11003 + }, + { + "epoch": 2.2052104208416834, + "grad_norm": 23.18682507336405, + "learning_rate": 1.9908783286020067e-06, + "loss": 1.4338, + "step": 11004 + }, + { + "epoch": 2.2054108216432864, + "grad_norm": 20.043112509499267, + "learning_rate": 1.9899473017508254e-06, + "loss": 1.7251, + "step": 11005 + }, + { + "epoch": 2.20561122244489, + "grad_norm": 18.42870798147814, + "learning_rate": 1.9890164385606524e-06, + "loss": 1.4675, + "step": 11006 + }, + { + "epoch": 2.205811623246493, + "grad_norm": 27.05771947290273, + "learning_rate": 1.9880857390821005e-06, + "loss": 1.4004, + "step": 11007 + }, + { + "epoch": 2.2060120240480963, + "grad_norm": 15.64022977818193, + "learning_rate": 1.9871552033657743e-06, + "loss": 1.1493, + "step": 11008 + }, + { + "epoch": 2.2062124248496993, + "grad_norm": 23.778442110505274, + "learning_rate": 1.986224831462264e-06, + "loss": 1.9964, + "step": 11009 + }, + { + "epoch": 2.2064128256513027, + "grad_norm": 21.567946365728123, + "learning_rate": 1.9852946234221616e-06, + "loss": 0.9456, + "step": 11010 + }, + { + "epoch": 2.2066132264529057, + "grad_norm": 21.051553920785327, + "learning_rate": 1.984364579296039e-06, + "loss": 1.5564, + "step": 11011 + }, + { + "epoch": 2.206813627254509, + "grad_norm": 19.066469591959986, + "learning_rate": 1.9834346991344636e-06, + "loss": 1.3977, + "step": 11012 + }, + { + "epoch": 2.207014028056112, + "grad_norm": 18.372362128083513, + "learning_rate": 1.9825049829880006e-06, + "loss": 1.4784, + "step": 11013 + }, + { + "epoch": 2.2072144288577156, + "grad_norm": 25.15061382210148, + "learning_rate": 1.9815754309071933e-06, + "loss": 0.6998, + "step": 11014 + }, + { + "epoch": 2.2074148296593186, + "grad_norm": 21.146116045621117, + "learning_rate": 1.9806460429425877e-06, + "loss": 1.8467, + "step": 11015 + }, + { + "epoch": 2.207615230460922, + "grad_norm": 22.754087625390824, + "learning_rate": 1.97971681914471e-06, + "loss": 1.7101, + "step": 11016 + }, + { + "epoch": 2.207815631262525, + "grad_norm": 23.56566715188495, + "learning_rate": 1.9787877595640894e-06, + "loss": 1.6524, + "step": 11017 + }, + { + "epoch": 2.2080160320641284, + "grad_norm": 33.7803407534861, + "learning_rate": 1.9778588642512397e-06, + "loss": 1.6303, + "step": 11018 + }, + { + "epoch": 2.2082164328657314, + "grad_norm": 21.827172206065686, + "learning_rate": 1.9769301332566607e-06, + "loss": 1.8391, + "step": 11019 + }, + { + "epoch": 2.208416833667335, + "grad_norm": 21.68592919362786, + "learning_rate": 1.9760015666308568e-06, + "loss": 2.4295, + 
"step": 11020 + }, + { + "epoch": 2.208617234468938, + "grad_norm": 19.416655409953822, + "learning_rate": 1.9750731644243093e-06, + "loss": 1.8535, + "step": 11021 + }, + { + "epoch": 2.208817635270541, + "grad_norm": 30.534971702138066, + "learning_rate": 1.9741449266874994e-06, + "loss": 1.7218, + "step": 11022 + }, + { + "epoch": 2.2090180360721443, + "grad_norm": 27.303318339408243, + "learning_rate": 1.973216853470896e-06, + "loss": 1.5444, + "step": 11023 + }, + { + "epoch": 2.2092184368737473, + "grad_norm": 18.353017045821048, + "learning_rate": 1.972288944824961e-06, + "loss": 1.1084, + "step": 11024 + }, + { + "epoch": 2.2094188376753507, + "grad_norm": 15.71169115598408, + "learning_rate": 1.9713612008001464e-06, + "loss": 1.4415, + "step": 11025 + }, + { + "epoch": 2.2096192384769537, + "grad_norm": 23.879385640293396, + "learning_rate": 1.9704336214468913e-06, + "loss": 1.9958, + "step": 11026 + }, + { + "epoch": 2.209819639278557, + "grad_norm": 23.87872227923872, + "learning_rate": 1.9695062068156345e-06, + "loss": 1.6758, + "step": 11027 + }, + { + "epoch": 2.21002004008016, + "grad_norm": 20.043432460163885, + "learning_rate": 1.9685789569567982e-06, + "loss": 1.6525, + "step": 11028 + }, + { + "epoch": 2.2102204408817636, + "grad_norm": 22.03141416461927, + "learning_rate": 1.9676518719207975e-06, + "loss": 1.8601, + "step": 11029 + }, + { + "epoch": 2.2104208416833666, + "grad_norm": 17.0787835355168, + "learning_rate": 1.9667249517580417e-06, + "loss": 1.4366, + "step": 11030 + }, + { + "epoch": 2.21062124248497, + "grad_norm": 32.04850192514348, + "learning_rate": 1.9657981965189276e-06, + "loss": 1.9526, + "step": 11031 + }, + { + "epoch": 2.210821643286573, + "grad_norm": 34.12483807948795, + "learning_rate": 1.964871606253844e-06, + "loss": 1.4762, + "step": 11032 + }, + { + "epoch": 2.2110220440881765, + "grad_norm": 26.431501076099163, + "learning_rate": 1.963945181013172e-06, + "loss": 1.6051, + "step": 11033 + }, + { + "epoch": 2.2112224448897795, + "grad_norm": 28.090124380505568, + "learning_rate": 1.9630189208472838e-06, + "loss": 1.3928, + "step": 11034 + }, + { + "epoch": 2.211422845691383, + "grad_norm": 17.717206871200244, + "learning_rate": 1.962092825806538e-06, + "loss": 1.3856, + "step": 11035 + }, + { + "epoch": 2.211623246492986, + "grad_norm": 30.002260353318043, + "learning_rate": 1.9611668959412893e-06, + "loss": 1.4465, + "step": 11036 + }, + { + "epoch": 2.2118236472945894, + "grad_norm": 17.922729011338966, + "learning_rate": 1.9602411313018827e-06, + "loss": 2.0245, + "step": 11037 + }, + { + "epoch": 2.2120240480961924, + "grad_norm": 15.617034361899945, + "learning_rate": 1.9593155319386524e-06, + "loss": 1.2579, + "step": 11038 + }, + { + "epoch": 2.212224448897796, + "grad_norm": 21.729026653420895, + "learning_rate": 1.9583900979019255e-06, + "loss": 1.5844, + "step": 11039 + }, + { + "epoch": 2.212424849699399, + "grad_norm": 16.667951967082306, + "learning_rate": 1.957464829242019e-06, + "loss": 1.2897, + "step": 11040 + }, + { + "epoch": 2.2126252505010022, + "grad_norm": 20.53243011199255, + "learning_rate": 1.956539726009242e-06, + "loss": 1.3981, + "step": 11041 + }, + { + "epoch": 2.2128256513026052, + "grad_norm": 29.17626178672493, + "learning_rate": 1.955614788253891e-06, + "loss": 1.7592, + "step": 11042 + }, + { + "epoch": 2.213026052104208, + "grad_norm": 26.536684048818728, + "learning_rate": 1.9546900160262566e-06, + "loss": 1.7621, + "step": 11043 + }, + { + "epoch": 2.2132264529058117, + "grad_norm": 28.33942087498389, 
+ "learning_rate": 1.953765409376625e-06, + "loss": 1.3673, + "step": 11044 + }, + { + "epoch": 2.2134268537074147, + "grad_norm": 20.762082046938627, + "learning_rate": 1.9528409683552623e-06, + "loss": 1.6849, + "step": 11045 + }, + { + "epoch": 2.213627254509018, + "grad_norm": 17.87219543472441, + "learning_rate": 1.9519166930124346e-06, + "loss": 1.3322, + "step": 11046 + }, + { + "epoch": 2.213827655310621, + "grad_norm": 33.17925203619865, + "learning_rate": 1.9509925833983955e-06, + "loss": 1.8431, + "step": 11047 + }, + { + "epoch": 2.2140280561122245, + "grad_norm": 40.19702807135393, + "learning_rate": 1.9500686395633904e-06, + "loss": 1.6534, + "step": 11048 + }, + { + "epoch": 2.2142284569138275, + "grad_norm": 18.244686053017798, + "learning_rate": 1.9491448615576557e-06, + "loss": 1.2207, + "step": 11049 + }, + { + "epoch": 2.214428857715431, + "grad_norm": 27.41099697545561, + "learning_rate": 1.9482212494314177e-06, + "loss": 1.2996, + "step": 11050 + }, + { + "epoch": 2.214629258517034, + "grad_norm": 21.6576558954267, + "learning_rate": 1.9472978032348976e-06, + "loss": 1.488, + "step": 11051 + }, + { + "epoch": 2.2148296593186374, + "grad_norm": 36.47145487594995, + "learning_rate": 1.9463745230183007e-06, + "loss": 1.1651, + "step": 11052 + }, + { + "epoch": 2.2150300601202404, + "grad_norm": 22.413948118184386, + "learning_rate": 1.9454514088318276e-06, + "loss": 1.3895, + "step": 11053 + }, + { + "epoch": 2.215230460921844, + "grad_norm": 23.01029858815311, + "learning_rate": 1.944528460725671e-06, + "loss": 1.687, + "step": 11054 + }, + { + "epoch": 2.215430861723447, + "grad_norm": 18.466339354826502, + "learning_rate": 1.9436056787500125e-06, + "loss": 1.4412, + "step": 11055 + }, + { + "epoch": 2.2156312625250503, + "grad_norm": 21.594178833977335, + "learning_rate": 1.9426830629550244e-06, + "loss": 1.631, + "step": 11056 + }, + { + "epoch": 2.2158316633266533, + "grad_norm": 24.422354020770694, + "learning_rate": 1.9417606133908713e-06, + "loss": 1.3013, + "step": 11057 + }, + { + "epoch": 2.2160320641282567, + "grad_norm": 20.434887810772643, + "learning_rate": 1.9408383301077096e-06, + "loss": 1.53, + "step": 11058 + }, + { + "epoch": 2.2162324649298597, + "grad_norm": 20.138289701835333, + "learning_rate": 1.9399162131556805e-06, + "loss": 1.588, + "step": 11059 + }, + { + "epoch": 2.2164328657314627, + "grad_norm": 31.361829915125313, + "learning_rate": 1.9389942625849268e-06, + "loss": 1.7642, + "step": 11060 + }, + { + "epoch": 2.216633266533066, + "grad_norm": 18.80310090245763, + "learning_rate": 1.9380724784455724e-06, + "loss": 1.8326, + "step": 11061 + }, + { + "epoch": 2.216833667334669, + "grad_norm": 51.14040061399237, + "learning_rate": 1.937150860787737e-06, + "loss": 2.0444, + "step": 11062 + }, + { + "epoch": 2.2170340681362726, + "grad_norm": 29.951980839338002, + "learning_rate": 1.9362294096615302e-06, + "loss": 1.3781, + "step": 11063 + }, + { + "epoch": 2.2172344689378756, + "grad_norm": 18.252212470820115, + "learning_rate": 1.9353081251170537e-06, + "loss": 1.8068, + "step": 11064 + }, + { + "epoch": 2.217434869739479, + "grad_norm": 21.906389539055564, + "learning_rate": 1.9343870072043993e-06, + "loss": 1.5968, + "step": 11065 + }, + { + "epoch": 2.217635270541082, + "grad_norm": 22.299125179927916, + "learning_rate": 1.9334660559736453e-06, + "loss": 1.6503, + "step": 11066 + }, + { + "epoch": 2.2178356713426854, + "grad_norm": 23.853717809819784, + "learning_rate": 1.9325452714748722e-06, + "loss": 1.7252, + "step": 11067 + }, + { + 
"epoch": 2.2180360721442884, + "grad_norm": 21.290764759875714, + "learning_rate": 1.9316246537581383e-06, + "loss": 1.6173, + "step": 11068 + }, + { + "epoch": 2.218236472945892, + "grad_norm": 22.1622355115431, + "learning_rate": 1.9307042028735017e-06, + "loss": 1.7569, + "step": 11069 + }, + { + "epoch": 2.218436873747495, + "grad_norm": 21.333098677706968, + "learning_rate": 1.9297839188710087e-06, + "loss": 1.7764, + "step": 11070 + }, + { + "epoch": 2.2186372745490983, + "grad_norm": 19.444982885691083, + "learning_rate": 1.9288638018006957e-06, + "loss": 1.7883, + "step": 11071 + }, + { + "epoch": 2.2188376753507013, + "grad_norm": 22.242054702900656, + "learning_rate": 1.9279438517125933e-06, + "loss": 1.7093, + "step": 11072 + }, + { + "epoch": 2.2190380761523048, + "grad_norm": 30.303429019102822, + "learning_rate": 1.9270240686567144e-06, + "loss": 1.5911, + "step": 11073 + }, + { + "epoch": 2.2192384769539077, + "grad_norm": 40.19259228323514, + "learning_rate": 1.9261044526830746e-06, + "loss": 1.0222, + "step": 11074 + }, + { + "epoch": 2.219438877755511, + "grad_norm": 17.28348261473742, + "learning_rate": 1.925185003841676e-06, + "loss": 1.7035, + "step": 11075 + }, + { + "epoch": 2.219639278557114, + "grad_norm": 20.369085790424542, + "learning_rate": 1.924265722182503e-06, + "loss": 1.3498, + "step": 11076 + }, + { + "epoch": 2.2198396793587176, + "grad_norm": 16.97194091124354, + "learning_rate": 1.9233466077555467e-06, + "loss": 1.4603, + "step": 11077 + }, + { + "epoch": 2.2200400801603206, + "grad_norm": 22.05799298519518, + "learning_rate": 1.9224276606107755e-06, + "loss": 1.8091, + "step": 11078 + }, + { + "epoch": 2.220240480961924, + "grad_norm": 19.25475325470303, + "learning_rate": 1.9215088807981547e-06, + "loss": 1.8814, + "step": 11079 + }, + { + "epoch": 2.220440881763527, + "grad_norm": 20.279299261284972, + "learning_rate": 1.920590268367641e-06, + "loss": 1.4168, + "step": 11080 + }, + { + "epoch": 2.22064128256513, + "grad_norm": 20.45146878638445, + "learning_rate": 1.9196718233691797e-06, + "loss": 1.4972, + "step": 11081 + }, + { + "epoch": 2.2208416833667335, + "grad_norm": 24.075439252263738, + "learning_rate": 1.9187535458527103e-06, + "loss": 1.7481, + "step": 11082 + }, + { + "epoch": 2.2210420841683365, + "grad_norm": 17.597286301228973, + "learning_rate": 1.917835435868155e-06, + "loss": 1.672, + "step": 11083 + }, + { + "epoch": 2.22124248496994, + "grad_norm": 19.379080287042367, + "learning_rate": 1.9169174934654404e-06, + "loss": 1.3822, + "step": 11084 + }, + { + "epoch": 2.221442885771543, + "grad_norm": 49.94525256156444, + "learning_rate": 1.9159997186944713e-06, + "loss": 1.6709, + "step": 11085 + }, + { + "epoch": 2.2216432865731464, + "grad_norm": 17.441088684026095, + "learning_rate": 1.9150821116051492e-06, + "loss": 1.1964, + "step": 11086 + }, + { + "epoch": 2.2218436873747494, + "grad_norm": 47.337120959251735, + "learning_rate": 1.914164672247367e-06, + "loss": 1.4684, + "step": 11087 + }, + { + "epoch": 2.222044088176353, + "grad_norm": 27.467807964178125, + "learning_rate": 1.9132474006710065e-06, + "loss": 1.5776, + "step": 11088 + }, + { + "epoch": 2.222244488977956, + "grad_norm": 21.598099053752374, + "learning_rate": 1.9123302969259435e-06, + "loss": 0.8835, + "step": 11089 + }, + { + "epoch": 2.2224448897795592, + "grad_norm": 50.70539614610191, + "learning_rate": 1.911413361062036e-06, + "loss": 2.0206, + "step": 11090 + }, + { + "epoch": 2.2226452905811622, + "grad_norm": 14.390825499098494, + "learning_rate": 
1.910496593129147e-06, + "loss": 1.3966, + "step": 11091 + }, + { + "epoch": 2.2228456913827657, + "grad_norm": 30.78839149227475, + "learning_rate": 1.9095799931771168e-06, + "loss": 1.5703, + "step": 11092 + }, + { + "epoch": 2.2230460921843687, + "grad_norm": 21.745189811083417, + "learning_rate": 1.9086635612557848e-06, + "loss": 0.8842, + "step": 11093 + }, + { + "epoch": 2.223246492985972, + "grad_norm": 28.468529983981266, + "learning_rate": 1.9077472974149774e-06, + "loss": 1.306, + "step": 11094 + }, + { + "epoch": 2.223446893787575, + "grad_norm": 23.883830973152815, + "learning_rate": 1.9068312017045149e-06, + "loss": 1.4372, + "step": 11095 + }, + { + "epoch": 2.2236472945891785, + "grad_norm": 23.262648321120814, + "learning_rate": 1.9059152741742071e-06, + "loss": 1.2536, + "step": 11096 + }, + { + "epoch": 2.2238476953907815, + "grad_norm": 19.02571988397252, + "learning_rate": 1.90499951487385e-06, + "loss": 1.5139, + "step": 11097 + }, + { + "epoch": 2.224048096192385, + "grad_norm": 21.7277804489464, + "learning_rate": 1.9040839238532406e-06, + "loss": 1.5155, + "step": 11098 + }, + { + "epoch": 2.224248496993988, + "grad_norm": 21.652925970023393, + "learning_rate": 1.903168501162157e-06, + "loss": 1.5248, + "step": 11099 + }, + { + "epoch": 2.2244488977955914, + "grad_norm": 35.5160440037143, + "learning_rate": 1.9022532468503717e-06, + "loss": 1.9154, + "step": 11100 + }, + { + "epoch": 2.2246492985971944, + "grad_norm": 18.704857956675085, + "learning_rate": 1.9013381609676535e-06, + "loss": 1.4772, + "step": 11101 + }, + { + "epoch": 2.2248496993987974, + "grad_norm": 22.67550546269317, + "learning_rate": 1.9004232435637514e-06, + "loss": 1.7734, + "step": 11102 + }, + { + "epoch": 2.225050100200401, + "grad_norm": 21.08719365072354, + "learning_rate": 1.8995084946884134e-06, + "loss": 1.2983, + "step": 11103 + }, + { + "epoch": 2.225250501002004, + "grad_norm": 25.08916828911608, + "learning_rate": 1.8985939143913757e-06, + "loss": 1.8419, + "step": 11104 + }, + { + "epoch": 2.2254509018036073, + "grad_norm": 22.14324603439291, + "learning_rate": 1.8976795027223643e-06, + "loss": 1.2993, + "step": 11105 + }, + { + "epoch": 2.2256513026052103, + "grad_norm": 18.94810203822831, + "learning_rate": 1.8967652597310998e-06, + "loss": 1.5739, + "step": 11106 + }, + { + "epoch": 2.2258517034068137, + "grad_norm": 20.279255985857255, + "learning_rate": 1.895851185467285e-06, + "loss": 1.6623, + "step": 11107 + }, + { + "epoch": 2.2260521042084167, + "grad_norm": 18.315682751875624, + "learning_rate": 1.894937279980627e-06, + "loss": 1.8982, + "step": 11108 + }, + { + "epoch": 2.22625250501002, + "grad_norm": 27.3499479912491, + "learning_rate": 1.8940235433208116e-06, + "loss": 1.6845, + "step": 11109 + }, + { + "epoch": 2.226452905811623, + "grad_norm": 25.36677023664846, + "learning_rate": 1.8931099755375203e-06, + "loss": 1.2446, + "step": 11110 + }, + { + "epoch": 2.2266533066132266, + "grad_norm": 20.15518061489027, + "learning_rate": 1.8921965766804257e-06, + "loss": 1.7771, + "step": 11111 + }, + { + "epoch": 2.2268537074148296, + "grad_norm": 19.160682984855963, + "learning_rate": 1.891283346799191e-06, + "loss": 1.731, + "step": 11112 + }, + { + "epoch": 2.227054108216433, + "grad_norm": 16.207533601345606, + "learning_rate": 1.8903702859434713e-06, + "loss": 1.2241, + "step": 11113 + }, + { + "epoch": 2.227254509018036, + "grad_norm": 20.739296351927386, + "learning_rate": 1.8894573941629064e-06, + "loss": 1.787, + "step": 11114 + }, + { + "epoch": 
2.2274549098196395, + "grad_norm": 16.623371526826233, + "learning_rate": 1.8885446715071376e-06, + "loss": 1.4148, + "step": 11115 + }, + { + "epoch": 2.2276553106212424, + "grad_norm": 20.576460295191833, + "learning_rate": 1.8876321180257862e-06, + "loss": 1.5295, + "step": 11116 + }, + { + "epoch": 2.227855711422846, + "grad_norm": 20.42126000204467, + "learning_rate": 1.8867197337684706e-06, + "loss": 1.3772, + "step": 11117 + }, + { + "epoch": 2.228056112224449, + "grad_norm": 25.39452323407839, + "learning_rate": 1.885807518784799e-06, + "loss": 1.535, + "step": 11118 + }, + { + "epoch": 2.228256513026052, + "grad_norm": 22.206467832210922, + "learning_rate": 1.8848954731243696e-06, + "loss": 1.7845, + "step": 11119 + }, + { + "epoch": 2.2284569138276553, + "grad_norm": 43.7589899093132, + "learning_rate": 1.8839835968367714e-06, + "loss": 1.0235, + "step": 11120 + }, + { + "epoch": 2.2286573146292583, + "grad_norm": 18.937230781491607, + "learning_rate": 1.883071889971585e-06, + "loss": 1.3739, + "step": 11121 + }, + { + "epoch": 2.2288577154308618, + "grad_norm": 19.221909264848165, + "learning_rate": 1.8821603525783826e-06, + "loss": 1.605, + "step": 11122 + }, + { + "epoch": 2.2290581162324647, + "grad_norm": 30.025199069057315, + "learning_rate": 1.8812489847067223e-06, + "loss": 1.5561, + "step": 11123 + }, + { + "epoch": 2.229258517034068, + "grad_norm": 19.237155371375813, + "learning_rate": 1.8803377864061583e-06, + "loss": 1.6889, + "step": 11124 + }, + { + "epoch": 2.229458917835671, + "grad_norm": 16.931313992821384, + "learning_rate": 1.8794267577262343e-06, + "loss": 1.3299, + "step": 11125 + }, + { + "epoch": 2.2296593186372746, + "grad_norm": 30.287049099305268, + "learning_rate": 1.8785158987164837e-06, + "loss": 1.7355, + "step": 11126 + }, + { + "epoch": 2.2298597194388776, + "grad_norm": 16.943680023776416, + "learning_rate": 1.8776052094264313e-06, + "loss": 2.1123, + "step": 11127 + }, + { + "epoch": 2.230060120240481, + "grad_norm": 19.257997015906724, + "learning_rate": 1.8766946899055926e-06, + "loss": 1.1551, + "step": 11128 + }, + { + "epoch": 2.230260521042084, + "grad_norm": 58.85241327939226, + "learning_rate": 1.8757843402034758e-06, + "loss": 1.7962, + "step": 11129 + }, + { + "epoch": 2.2304609218436875, + "grad_norm": 31.571942543096192, + "learning_rate": 1.8748741603695748e-06, + "loss": 1.8666, + "step": 11130 + }, + { + "epoch": 2.2306613226452905, + "grad_norm": 22.25085336888548, + "learning_rate": 1.8739641504533773e-06, + "loss": 1.6595, + "step": 11131 + }, + { + "epoch": 2.230861723446894, + "grad_norm": 29.08892299937214, + "learning_rate": 1.8730543105043664e-06, + "loss": 1.3912, + "step": 11132 + }, + { + "epoch": 2.231062124248497, + "grad_norm": 20.573344010416353, + "learning_rate": 1.8721446405720068e-06, + "loss": 1.4427, + "step": 11133 + }, + { + "epoch": 2.2312625250501004, + "grad_norm": 23.79488403229839, + "learning_rate": 1.8712351407057599e-06, + "loss": 1.5742, + "step": 11134 + }, + { + "epoch": 2.2314629258517034, + "grad_norm": 18.007578105796952, + "learning_rate": 1.870325810955077e-06, + "loss": 1.4242, + "step": 11135 + }, + { + "epoch": 2.231663326653307, + "grad_norm": 27.02370770628929, + "learning_rate": 1.8694166513694002e-06, + "loss": 1.3664, + "step": 11136 + }, + { + "epoch": 2.23186372745491, + "grad_norm": 18.978029386054423, + "learning_rate": 1.8685076619981607e-06, + "loss": 1.8067, + "step": 11137 + }, + { + "epoch": 2.2320641282565132, + "grad_norm": 23.37519576375043, + "learning_rate": 
1.8675988428907821e-06, + "loss": 1.0473, + "step": 11138 + }, + { + "epoch": 2.2322645290581162, + "grad_norm": 23.32354740260493, + "learning_rate": 1.8666901940966802e-06, + "loss": 1.5628, + "step": 11139 + }, + { + "epoch": 2.2324649298597192, + "grad_norm": 27.143223779451233, + "learning_rate": 1.8657817156652557e-06, + "loss": 1.7524, + "step": 11140 + }, + { + "epoch": 2.2326653306613227, + "grad_norm": 30.35235694045239, + "learning_rate": 1.8648734076459064e-06, + "loss": 1.2056, + "step": 11141 + }, + { + "epoch": 2.2328657314629257, + "grad_norm": 22.844159232331755, + "learning_rate": 1.8639652700880168e-06, + "loss": 1.5884, + "step": 11142 + }, + { + "epoch": 2.233066132264529, + "grad_norm": 20.058446020353998, + "learning_rate": 1.8630573030409655e-06, + "loss": 1.681, + "step": 11143 + }, + { + "epoch": 2.233266533066132, + "grad_norm": 20.712665276281868, + "learning_rate": 1.862149506554119e-06, + "loss": 1.6313, + "step": 11144 + }, + { + "epoch": 2.2334669338677355, + "grad_norm": 44.79833041247536, + "learning_rate": 1.8612418806768356e-06, + "loss": 1.46, + "step": 11145 + }, + { + "epoch": 2.2336673346693385, + "grad_norm": 18.729778325212738, + "learning_rate": 1.860334425458466e-06, + "loss": 1.4567, + "step": 11146 + }, + { + "epoch": 2.233867735470942, + "grad_norm": 31.04068573386508, + "learning_rate": 1.859427140948345e-06, + "loss": 1.7337, + "step": 11147 + }, + { + "epoch": 2.234068136272545, + "grad_norm": 29.25605147769925, + "learning_rate": 1.8585200271958099e-06, + "loss": 1.6751, + "step": 11148 + }, + { + "epoch": 2.2342685370741484, + "grad_norm": 24.183565418620685, + "learning_rate": 1.857613084250176e-06, + "loss": 1.13, + "step": 11149 + }, + { + "epoch": 2.2344689378757514, + "grad_norm": 38.9031929478557, + "learning_rate": 1.8567063121607575e-06, + "loss": 1.3631, + "step": 11150 + }, + { + "epoch": 2.234669338677355, + "grad_norm": 19.55931156669259, + "learning_rate": 1.8557997109768567e-06, + "loss": 1.1058, + "step": 11151 + }, + { + "epoch": 2.234869739478958, + "grad_norm": 19.197333417202863, + "learning_rate": 1.854893280747767e-06, + "loss": 1.5745, + "step": 11152 + }, + { + "epoch": 2.2350701402805613, + "grad_norm": 56.68242658828325, + "learning_rate": 1.853987021522774e-06, + "loss": 1.7297, + "step": 11153 + }, + { + "epoch": 2.2352705410821643, + "grad_norm": 22.05410659449752, + "learning_rate": 1.853080933351147e-06, + "loss": 1.5461, + "step": 11154 + }, + { + "epoch": 2.2354709418837677, + "grad_norm": 21.183343301880523, + "learning_rate": 1.8521750162821567e-06, + "loss": 1.4773, + "step": 11155 + }, + { + "epoch": 2.2356713426853707, + "grad_norm": 29.913175951159367, + "learning_rate": 1.8512692703650586e-06, + "loss": 1.7574, + "step": 11156 + }, + { + "epoch": 2.235871743486974, + "grad_norm": 16.02386971888213, + "learning_rate": 1.8503636956490972e-06, + "loss": 1.2112, + "step": 11157 + }, + { + "epoch": 2.236072144288577, + "grad_norm": 21.89932589490326, + "learning_rate": 1.849458292183511e-06, + "loss": 1.6188, + "step": 11158 + }, + { + "epoch": 2.2362725450901806, + "grad_norm": 25.108338791600055, + "learning_rate": 1.8485530600175278e-06, + "loss": 1.7971, + "step": 11159 + }, + { + "epoch": 2.2364729458917836, + "grad_norm": 22.70411572840005, + "learning_rate": 1.8476479992003665e-06, + "loss": 1.6456, + "step": 11160 + }, + { + "epoch": 2.2366733466933866, + "grad_norm": 21.403156892388562, + "learning_rate": 1.8467431097812372e-06, + "loss": 1.1882, + "step": 11161 + }, + { + "epoch": 
2.23687374749499, + "grad_norm": 20.316542799494847, + "learning_rate": 1.8458383918093391e-06, + "loss": 1.8406, + "step": 11162 + }, + { + "epoch": 2.237074148296593, + "grad_norm": 24.07756558091587, + "learning_rate": 1.8449338453338656e-06, + "loss": 1.2957, + "step": 11163 + }, + { + "epoch": 2.2372745490981965, + "grad_norm": 22.235643883059343, + "learning_rate": 1.844029470403993e-06, + "loss": 1.1749, + "step": 11164 + }, + { + "epoch": 2.2374749498997994, + "grad_norm": 20.3354983375506, + "learning_rate": 1.8431252670688998e-06, + "loss": 1.9253, + "step": 11165 + }, + { + "epoch": 2.237675350701403, + "grad_norm": 24.263422639255154, + "learning_rate": 1.8422212353777447e-06, + "loss": 1.3883, + "step": 11166 + }, + { + "epoch": 2.237875751503006, + "grad_norm": 21.299776703284902, + "learning_rate": 1.8413173753796815e-06, + "loss": 1.95, + "step": 11167 + }, + { + "epoch": 2.2380761523046093, + "grad_norm": 24.960236038765288, + "learning_rate": 1.8404136871238559e-06, + "loss": 1.6895, + "step": 11168 + }, + { + "epoch": 2.2382765531062123, + "grad_norm": 21.423055884463082, + "learning_rate": 1.839510170659402e-06, + "loss": 1.4156, + "step": 11169 + }, + { + "epoch": 2.2384769539078158, + "grad_norm": 21.246790551162643, + "learning_rate": 1.8386068260354467e-06, + "loss": 1.2628, + "step": 11170 + }, + { + "epoch": 2.2386773547094188, + "grad_norm": 25.36434625261044, + "learning_rate": 1.8377036533011022e-06, + "loss": 1.6555, + "step": 11171 + }, + { + "epoch": 2.238877755511022, + "grad_norm": 26.48460995552993, + "learning_rate": 1.836800652505481e-06, + "loss": 1.5586, + "step": 11172 + }, + { + "epoch": 2.239078156312625, + "grad_norm": 26.804693200167723, + "learning_rate": 1.835897823697676e-06, + "loss": 1.2638, + "step": 11173 + }, + { + "epoch": 2.2392785571142286, + "grad_norm": 20.946897135771245, + "learning_rate": 1.834995166926777e-06, + "loss": 1.6744, + "step": 11174 + }, + { + "epoch": 2.2394789579158316, + "grad_norm": 20.19986090625567, + "learning_rate": 1.8340926822418625e-06, + "loss": 1.6815, + "step": 11175 + }, + { + "epoch": 2.239679358717435, + "grad_norm": 24.31444317828185, + "learning_rate": 1.8331903696920028e-06, + "loss": 1.4606, + "step": 11176 + }, + { + "epoch": 2.239879759519038, + "grad_norm": 19.647726353482057, + "learning_rate": 1.8322882293262584e-06, + "loss": 1.6266, + "step": 11177 + }, + { + "epoch": 2.240080160320641, + "grad_norm": 30.565190105291418, + "learning_rate": 1.8313862611936762e-06, + "loss": 1.447, + "step": 11178 + }, + { + "epoch": 2.2402805611222445, + "grad_norm": 24.80703832293143, + "learning_rate": 1.8304844653433028e-06, + "loss": 1.8119, + "step": 11179 + }, + { + "epoch": 2.2404809619238475, + "grad_norm": 20.521332330337135, + "learning_rate": 1.8295828418241668e-06, + "loss": 1.5876, + "step": 11180 + }, + { + "epoch": 2.240681362725451, + "grad_norm": 66.46389099702874, + "learning_rate": 1.8286813906852897e-06, + "loss": 1.7425, + "step": 11181 + }, + { + "epoch": 2.240881763527054, + "grad_norm": 23.01405771708641, + "learning_rate": 1.82778011197569e-06, + "loss": 1.3012, + "step": 11182 + }, + { + "epoch": 2.2410821643286574, + "grad_norm": 24.134701988474767, + "learning_rate": 1.8268790057443664e-06, + "loss": 1.6472, + "step": 11183 + }, + { + "epoch": 2.2412825651302604, + "grad_norm": 22.721282606394386, + "learning_rate": 1.825978072040317e-06, + "loss": 1.4275, + "step": 11184 + }, + { + "epoch": 2.241482965931864, + "grad_norm": 29.442707253653523, + "learning_rate": 
1.8250773109125215e-06, + "loss": 1.2272, + "step": 11185 + }, + { + "epoch": 2.241683366733467, + "grad_norm": 18.502247553877172, + "learning_rate": 1.8241767224099615e-06, + "loss": 1.3861, + "step": 11186 + }, + { + "epoch": 2.2418837675350702, + "grad_norm": 19.85208594462351, + "learning_rate": 1.8232763065816023e-06, + "loss": 1.0823, + "step": 11187 + }, + { + "epoch": 2.2420841683366732, + "grad_norm": 27.472671495258727, + "learning_rate": 1.822376063476396e-06, + "loss": 1.6717, + "step": 11188 + }, + { + "epoch": 2.2422845691382767, + "grad_norm": 31.386076945722078, + "learning_rate": 1.8214759931432973e-06, + "loss": 1.9103, + "step": 11189 + }, + { + "epoch": 2.2424849699398797, + "grad_norm": 33.44534094959118, + "learning_rate": 1.8205760956312396e-06, + "loss": 1.5057, + "step": 11190 + }, + { + "epoch": 2.242685370741483, + "grad_norm": 58.359328765565905, + "learning_rate": 1.8196763709891523e-06, + "loss": 1.6805, + "step": 11191 + }, + { + "epoch": 2.242885771543086, + "grad_norm": 27.951167840730378, + "learning_rate": 1.8187768192659556e-06, + "loss": 1.6527, + "step": 11192 + }, + { + "epoch": 2.2430861723446895, + "grad_norm": 25.10257617787322, + "learning_rate": 1.8178774405105587e-06, + "loss": 1.6754, + "step": 11193 + }, + { + "epoch": 2.2432865731462925, + "grad_norm": 23.308201844149966, + "learning_rate": 1.816978234771865e-06, + "loss": 1.8207, + "step": 11194 + }, + { + "epoch": 2.243486973947896, + "grad_norm": 19.34388416349267, + "learning_rate": 1.8160792020987594e-06, + "loss": 1.0991, + "step": 11195 + }, + { + "epoch": 2.243687374749499, + "grad_norm": 17.143225726267996, + "learning_rate": 1.815180342540131e-06, + "loss": 1.5435, + "step": 11196 + }, + { + "epoch": 2.2438877755511024, + "grad_norm": 17.50715434786948, + "learning_rate": 1.8142816561448468e-06, + "loss": 1.0784, + "step": 11197 + }, + { + "epoch": 2.2440881763527054, + "grad_norm": 29.863180484046428, + "learning_rate": 1.8133831429617715e-06, + "loss": 1.4864, + "step": 11198 + }, + { + "epoch": 2.2442885771543084, + "grad_norm": 21.432922929313403, + "learning_rate": 1.8124848030397585e-06, + "loss": 1.5891, + "step": 11199 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 17.33329725812642, + "learning_rate": 1.8115866364276519e-06, + "loss": 1.1766, + "step": 11200 + }, + { + "epoch": 2.244689378757515, + "grad_norm": 19.280368689256008, + "learning_rate": 1.8106886431742888e-06, + "loss": 1.7788, + "step": 11201 + }, + { + "epoch": 2.2448897795591183, + "grad_norm": 18.933502032385846, + "learning_rate": 1.8097908233284877e-06, + "loss": 1.0843, + "step": 11202 + }, + { + "epoch": 2.2450901803607213, + "grad_norm": 19.95593832291471, + "learning_rate": 1.8088931769390728e-06, + "loss": 1.398, + "step": 11203 + }, + { + "epoch": 2.2452905811623247, + "grad_norm": 20.216586717434343, + "learning_rate": 1.8079957040548445e-06, + "loss": 1.8824, + "step": 11204 + }, + { + "epoch": 2.2454909819639277, + "grad_norm": 19.169581715948528, + "learning_rate": 1.8070984047246015e-06, + "loss": 1.5843, + "step": 11205 + }, + { + "epoch": 2.245691382765531, + "grad_norm": 25.351852244183856, + "learning_rate": 1.8062012789971311e-06, + "loss": 1.4599, + "step": 11206 + }, + { + "epoch": 2.245891783567134, + "grad_norm": 50.06681286254582, + "learning_rate": 1.8053043269212122e-06, + "loss": 1.318, + "step": 11207 + }, + { + "epoch": 2.2460921843687376, + "grad_norm": 24.439331044951086, + "learning_rate": 1.8044075485456126e-06, + "loss": 1.5243, + "step": 11208 + }, + { + "epoch": 
2.2462925851703406, + "grad_norm": 27.097965047587213, + "learning_rate": 1.8035109439190917e-06, + "loss": 1.7507, + "step": 11209 + }, + { + "epoch": 2.246492985971944, + "grad_norm": 17.451217364773058, + "learning_rate": 1.802614513090401e-06, + "loss": 1.5304, + "step": 11210 + }, + { + "epoch": 2.246693386773547, + "grad_norm": 20.504839370169464, + "learning_rate": 1.8017182561082768e-06, + "loss": 1.9586, + "step": 11211 + }, + { + "epoch": 2.2468937875751505, + "grad_norm": 24.045049491053977, + "learning_rate": 1.800822173021451e-06, + "loss": 2.2846, + "step": 11212 + }, + { + "epoch": 2.2470941883767535, + "grad_norm": 25.408439387784576, + "learning_rate": 1.7999262638786497e-06, + "loss": 1.8008, + "step": 11213 + }, + { + "epoch": 2.247294589178357, + "grad_norm": 17.568335370228944, + "learning_rate": 1.7990305287285786e-06, + "loss": 1.4909, + "step": 11214 + }, + { + "epoch": 2.24749498997996, + "grad_norm": 15.289536802860837, + "learning_rate": 1.7981349676199433e-06, + "loss": 1.6458, + "step": 11215 + }, + { + "epoch": 2.2476953907815633, + "grad_norm": 21.33732131091223, + "learning_rate": 1.7972395806014365e-06, + "loss": 1.6286, + "step": 11216 + }, + { + "epoch": 2.2478957915831663, + "grad_norm": 19.1599063802474, + "learning_rate": 1.7963443677217406e-06, + "loss": 2.0649, + "step": 11217 + }, + { + "epoch": 2.2480961923847698, + "grad_norm": 19.133267386820997, + "learning_rate": 1.795449329029531e-06, + "loss": 2.1156, + "step": 11218 + }, + { + "epoch": 2.2482965931863728, + "grad_norm": 17.882489091877595, + "learning_rate": 1.794554464573472e-06, + "loss": 1.1815, + "step": 11219 + }, + { + "epoch": 2.2484969939879758, + "grad_norm": 16.751612629406058, + "learning_rate": 1.7936597744022194e-06, + "loss": 1.5282, + "step": 11220 + }, + { + "epoch": 2.248697394789579, + "grad_norm": 19.022501637352534, + "learning_rate": 1.7927652585644168e-06, + "loss": 1.8176, + "step": 11221 + }, + { + "epoch": 2.248897795591182, + "grad_norm": 32.99831231438212, + "learning_rate": 1.7918709171087006e-06, + "loss": 1.5099, + "step": 11222 + }, + { + "epoch": 2.2490981963927856, + "grad_norm": 20.416010627642514, + "learning_rate": 1.790976750083699e-06, + "loss": 1.5166, + "step": 11223 + }, + { + "epoch": 2.2492985971943886, + "grad_norm": 22.751298422179246, + "learning_rate": 1.7900827575380282e-06, + "loss": 1.277, + "step": 11224 + }, + { + "epoch": 2.249498997995992, + "grad_norm": 26.070694487342458, + "learning_rate": 1.789188939520296e-06, + "loss": 1.2773, + "step": 11225 + }, + { + "epoch": 2.249699398797595, + "grad_norm": 20.660452580797266, + "learning_rate": 1.7882952960791012e-06, + "loss": 1.7603, + "step": 11226 + }, + { + "epoch": 2.2498997995991985, + "grad_norm": 17.510112987195072, + "learning_rate": 1.7874018272630334e-06, + "loss": 1.2596, + "step": 11227 + }, + { + "epoch": 2.2501002004008015, + "grad_norm": 19.520841247111928, + "learning_rate": 1.7865085331206688e-06, + "loss": 1.384, + "step": 11228 + }, + { + "epoch": 2.250300601202405, + "grad_norm": 50.47523260519712, + "learning_rate": 1.7856154137005788e-06, + "loss": 1.8009, + "step": 11229 + }, + { + "epoch": 2.250501002004008, + "grad_norm": 19.1138699175044, + "learning_rate": 1.7847224690513238e-06, + "loss": 1.4685, + "step": 11230 + }, + { + "epoch": 2.2507014028056114, + "grad_norm": 26.75422683096276, + "learning_rate": 1.7838296992214542e-06, + "loss": 1.7869, + "step": 11231 + }, + { + "epoch": 2.2509018036072144, + "grad_norm": 24.042597503076497, + "learning_rate": 
1.7829371042595112e-06, + "loss": 1.4444, + "step": 11232 + }, + { + "epoch": 2.251102204408818, + "grad_norm": 20.62394978538348, + "learning_rate": 1.7820446842140272e-06, + "loss": 1.381, + "step": 11233 + }, + { + "epoch": 2.251302605210421, + "grad_norm": 20.657407904091272, + "learning_rate": 1.781152439133525e-06, + "loss": 1.5449, + "step": 11234 + }, + { + "epoch": 2.2515030060120242, + "grad_norm": 49.83180089379063, + "learning_rate": 1.7802603690665133e-06, + "loss": 1.6535, + "step": 11235 + }, + { + "epoch": 2.2517034068136272, + "grad_norm": 22.618849150064626, + "learning_rate": 1.7793684740615008e-06, + "loss": 2.0197, + "step": 11236 + }, + { + "epoch": 2.2519038076152302, + "grad_norm": 22.61307091721684, + "learning_rate": 1.7784767541669773e-06, + "loss": 1.5733, + "step": 11237 + }, + { + "epoch": 2.2521042084168337, + "grad_norm": 28.2757418300765, + "learning_rate": 1.7775852094314277e-06, + "loss": 1.4918, + "step": 11238 + }, + { + "epoch": 2.252304609218437, + "grad_norm": 18.93492869811902, + "learning_rate": 1.7766938399033272e-06, + "loss": 1.5231, + "step": 11239 + }, + { + "epoch": 2.25250501002004, + "grad_norm": 18.242455131277378, + "learning_rate": 1.7758026456311405e-06, + "loss": 1.2988, + "step": 11240 + }, + { + "epoch": 2.252705410821643, + "grad_norm": 17.56124831999579, + "learning_rate": 1.7749116266633254e-06, + "loss": 1.867, + "step": 11241 + }, + { + "epoch": 2.2529058116232465, + "grad_norm": 31.732219748947795, + "learning_rate": 1.774020783048322e-06, + "loss": 1.29, + "step": 11242 + }, + { + "epoch": 2.2531062124248495, + "grad_norm": 22.691360772497124, + "learning_rate": 1.7731301148345724e-06, + "loss": 1.7403, + "step": 11243 + }, + { + "epoch": 2.253306613226453, + "grad_norm": 18.18252720958475, + "learning_rate": 1.7722396220705035e-06, + "loss": 1.3912, + "step": 11244 + }, + { + "epoch": 2.253507014028056, + "grad_norm": 31.552836890232797, + "learning_rate": 1.7713493048045295e-06, + "loss": 1.6703, + "step": 11245 + }, + { + "epoch": 2.2537074148296594, + "grad_norm": 26.95439662753571, + "learning_rate": 1.7704591630850593e-06, + "loss": 1.2971, + "step": 11246 + }, + { + "epoch": 2.2539078156312624, + "grad_norm": 16.551330229877628, + "learning_rate": 1.7695691969604922e-06, + "loss": 1.9636, + "step": 11247 + }, + { + "epoch": 2.254108216432866, + "grad_norm": 32.98787349527749, + "learning_rate": 1.7686794064792156e-06, + "loss": 1.8929, + "step": 11248 + }, + { + "epoch": 2.254308617234469, + "grad_norm": 43.82668765018522, + "learning_rate": 1.7677897916896104e-06, + "loss": 1.7523, + "step": 11249 + }, + { + "epoch": 2.2545090180360723, + "grad_norm": 24.677012184952208, + "learning_rate": 1.7669003526400448e-06, + "loss": 1.9616, + "step": 11250 + }, + { + "epoch": 2.2547094188376753, + "grad_norm": 15.1913650994146, + "learning_rate": 1.7660110893788812e-06, + "loss": 1.4628, + "step": 11251 + }, + { + "epoch": 2.2549098196392787, + "grad_norm": 75.60192243131382, + "learning_rate": 1.7651220019544657e-06, + "loss": 1.8359, + "step": 11252 + }, + { + "epoch": 2.2551102204408817, + "grad_norm": 21.068770831300842, + "learning_rate": 1.764233090415145e-06, + "loss": 1.5806, + "step": 11253 + }, + { + "epoch": 2.255310621242485, + "grad_norm": 16.911415144463284, + "learning_rate": 1.763344354809246e-06, + "loss": 1.3824, + "step": 11254 + }, + { + "epoch": 2.255511022044088, + "grad_norm": 24.157283024857854, + "learning_rate": 1.7624557951850923e-06, + "loss": 1.7724, + "step": 11255 + }, + { + "epoch": 
2.2557114228456916, + "grad_norm": 19.526710519800236, + "learning_rate": 1.761567411590996e-06, + "loss": 1.419, + "step": 11256 + }, + { + "epoch": 2.2559118236472946, + "grad_norm": 27.811827704328017, + "learning_rate": 1.7606792040752596e-06, + "loss": 1.3933, + "step": 11257 + }, + { + "epoch": 2.2561122244488976, + "grad_norm": 21.35913656634048, + "learning_rate": 1.7597911726861782e-06, + "loss": 1.3077, + "step": 11258 + }, + { + "epoch": 2.256312625250501, + "grad_norm": 21.299155518829206, + "learning_rate": 1.758903317472031e-06, + "loss": 1.7108, + "step": 11259 + }, + { + "epoch": 2.256513026052104, + "grad_norm": 29.79970214592477, + "learning_rate": 1.7580156384810975e-06, + "loss": 1.895, + "step": 11260 + }, + { + "epoch": 2.2567134268537075, + "grad_norm": 25.10645293961666, + "learning_rate": 1.7571281357616376e-06, + "loss": 1.2807, + "step": 11261 + }, + { + "epoch": 2.2569138276553105, + "grad_norm": 22.296375359822964, + "learning_rate": 1.7562408093619083e-06, + "loss": 1.4028, + "step": 11262 + }, + { + "epoch": 2.257114228456914, + "grad_norm": 21.007248916252863, + "learning_rate": 1.7553536593301545e-06, + "loss": 1.4714, + "step": 11263 + }, + { + "epoch": 2.257314629258517, + "grad_norm": 56.466844792253525, + "learning_rate": 1.7544666857146115e-06, + "loss": 2.0001, + "step": 11264 + }, + { + "epoch": 2.2575150300601203, + "grad_norm": 22.97489449924994, + "learning_rate": 1.7535798885635074e-06, + "loss": 1.7895, + "step": 11265 + }, + { + "epoch": 2.2577154308617233, + "grad_norm": 29.679838256620926, + "learning_rate": 1.7526932679250536e-06, + "loss": 1.264, + "step": 11266 + }, + { + "epoch": 2.2579158316633268, + "grad_norm": 20.27485987805082, + "learning_rate": 1.7518068238474634e-06, + "loss": 1.805, + "step": 11267 + }, + { + "epoch": 2.2581162324649298, + "grad_norm": 51.2178488213064, + "learning_rate": 1.7509205563789294e-06, + "loss": 1.5917, + "step": 11268 + }, + { + "epoch": 2.258316633266533, + "grad_norm": 27.425773998970282, + "learning_rate": 1.7500344655676393e-06, + "loss": 1.6405, + "step": 11269 + }, + { + "epoch": 2.258517034068136, + "grad_norm": 23.295311861877195, + "learning_rate": 1.7491485514617756e-06, + "loss": 1.5285, + "step": 11270 + }, + { + "epoch": 2.2587174348697396, + "grad_norm": 17.7844200376728, + "learning_rate": 1.7482628141095026e-06, + "loss": 1.519, + "step": 11271 + }, + { + "epoch": 2.2589178356713426, + "grad_norm": 19.885694440271095, + "learning_rate": 1.7473772535589822e-06, + "loss": 1.4867, + "step": 11272 + }, + { + "epoch": 2.259118236472946, + "grad_norm": 19.910758988505446, + "learning_rate": 1.7464918698583584e-06, + "loss": 1.2658, + "step": 11273 + }, + { + "epoch": 2.259318637274549, + "grad_norm": 14.737774398469803, + "learning_rate": 1.7456066630557756e-06, + "loss": 1.2287, + "step": 11274 + }, + { + "epoch": 2.259519038076152, + "grad_norm": 26.574480762384773, + "learning_rate": 1.7447216331993645e-06, + "loss": 1.9337, + "step": 11275 + }, + { + "epoch": 2.2597194388777555, + "grad_norm": 23.18092609426574, + "learning_rate": 1.7438367803372398e-06, + "loss": 1.4068, + "step": 11276 + }, + { + "epoch": 2.259919839679359, + "grad_norm": 22.606052549733786, + "learning_rate": 1.7429521045175195e-06, + "loss": 1.2591, + "step": 11277 + }, + { + "epoch": 2.260120240480962, + "grad_norm": 22.272581343860022, + "learning_rate": 1.742067605788299e-06, + "loss": 1.5039, + "step": 11278 + }, + { + "epoch": 2.260320641282565, + "grad_norm": 24.192141314790348, + "learning_rate": 
1.7411832841976722e-06, + "loss": 1.6875, + "step": 11279 + }, + { + "epoch": 2.2605210420841684, + "grad_norm": 18.599630523968806, + "learning_rate": 1.7402991397937207e-06, + "loss": 1.4647, + "step": 11280 + }, + { + "epoch": 2.2607214428857714, + "grad_norm": 20.574130947955258, + "learning_rate": 1.7394151726245174e-06, + "loss": 1.6267, + "step": 11281 + }, + { + "epoch": 2.260921843687375, + "grad_norm": 28.45362162517701, + "learning_rate": 1.7385313827381251e-06, + "loss": 1.6873, + "step": 11282 + }, + { + "epoch": 2.261122244488978, + "grad_norm": 74.12901929218812, + "learning_rate": 1.7376477701825927e-06, + "loss": 1.6351, + "step": 11283 + }, + { + "epoch": 2.2613226452905812, + "grad_norm": 30.810262644833514, + "learning_rate": 1.7367643350059698e-06, + "loss": 1.7419, + "step": 11284 + }, + { + "epoch": 2.2615230460921842, + "grad_norm": 24.230014119230557, + "learning_rate": 1.7358810772562855e-06, + "loss": 1.9156, + "step": 11285 + }, + { + "epoch": 2.2617234468937877, + "grad_norm": 15.070925926778907, + "learning_rate": 1.7349979969815655e-06, + "loss": 0.8797, + "step": 11286 + }, + { + "epoch": 2.2619238476953907, + "grad_norm": 19.149153688371783, + "learning_rate": 1.7341150942298241e-06, + "loss": 1.3632, + "step": 11287 + }, + { + "epoch": 2.262124248496994, + "grad_norm": 23.39434672116619, + "learning_rate": 1.7332323690490654e-06, + "loss": 1.5784, + "step": 11288 + }, + { + "epoch": 2.262324649298597, + "grad_norm": 28.156943957893702, + "learning_rate": 1.732349821487287e-06, + "loss": 1.8811, + "step": 11289 + }, + { + "epoch": 2.2625250501002006, + "grad_norm": 21.218135299230784, + "learning_rate": 1.7314674515924689e-06, + "loss": 1.8171, + "step": 11290 + }, + { + "epoch": 2.2627254509018035, + "grad_norm": 26.6864637577956, + "learning_rate": 1.7305852594125938e-06, + "loss": 1.388, + "step": 11291 + }, + { + "epoch": 2.262925851703407, + "grad_norm": 27.646265972097012, + "learning_rate": 1.7297032449956225e-06, + "loss": 1.9639, + "step": 11292 + }, + { + "epoch": 2.26312625250501, + "grad_norm": 16.289533255769012, + "learning_rate": 1.728821408389511e-06, + "loss": 1.066, + "step": 11293 + }, + { + "epoch": 2.2633266533066134, + "grad_norm": 16.394683667614522, + "learning_rate": 1.7279397496422124e-06, + "loss": 1.2011, + "step": 11294 + }, + { + "epoch": 2.2635270541082164, + "grad_norm": 16.195420189152447, + "learning_rate": 1.7270582688016574e-06, + "loss": 1.3834, + "step": 11295 + }, + { + "epoch": 2.2637274549098194, + "grad_norm": 18.8108163123881, + "learning_rate": 1.7261769659157757e-06, + "loss": 1.3098, + "step": 11296 + }, + { + "epoch": 2.263927855711423, + "grad_norm": 24.18854181527509, + "learning_rate": 1.7252958410324849e-06, + "loss": 1.4579, + "step": 11297 + }, + { + "epoch": 2.2641282565130263, + "grad_norm": 22.471914204615754, + "learning_rate": 1.724414894199693e-06, + "loss": 1.7062, + "step": 11298 + }, + { + "epoch": 2.2643286573146293, + "grad_norm": 18.05697857079699, + "learning_rate": 1.7235341254653005e-06, + "loss": 1.3734, + "step": 11299 + }, + { + "epoch": 2.2645290581162323, + "grad_norm": 17.98854192461358, + "learning_rate": 1.7226535348771905e-06, + "loss": 1.676, + "step": 11300 + }, + { + "epoch": 2.2647294589178357, + "grad_norm": 27.46596492673055, + "learning_rate": 1.7217731224832491e-06, + "loss": 1.5346, + "step": 11301 + }, + { + "epoch": 2.2649298597194387, + "grad_norm": 21.283952052507853, + "learning_rate": 1.7208928883313408e-06, + "loss": 1.5917, + "step": 11302 + }, + { + "epoch": 
2.265130260521042, + "grad_norm": 26.798528948604798, + "learning_rate": 1.7200128324693261e-06, + "loss": 1.4419, + "step": 11303 + }, + { + "epoch": 2.265330661322645, + "grad_norm": 18.26953227321781, + "learning_rate": 1.719132954945056e-06, + "loss": 1.1235, + "step": 11304 + }, + { + "epoch": 2.2655310621242486, + "grad_norm": 22.22890987238922, + "learning_rate": 1.718253255806369e-06, + "loss": 1.2636, + "step": 11305 + }, + { + "epoch": 2.2657314629258516, + "grad_norm": 28.77643879875719, + "learning_rate": 1.7173737351010978e-06, + "loss": 1.1431, + "step": 11306 + }, + { + "epoch": 2.265931863727455, + "grad_norm": 19.530178503330923, + "learning_rate": 1.7164943928770617e-06, + "loss": 1.8743, + "step": 11307 + }, + { + "epoch": 2.266132264529058, + "grad_norm": 20.747581289199058, + "learning_rate": 1.7156152291820744e-06, + "loss": 2.1088, + "step": 11308 + }, + { + "epoch": 2.2663326653306615, + "grad_norm": 21.658725750117124, + "learning_rate": 1.7147362440639337e-06, + "loss": 1.6769, + "step": 11309 + }, + { + "epoch": 2.2665330661322645, + "grad_norm": 23.653283834975166, + "learning_rate": 1.7138574375704325e-06, + "loss": 1.6024, + "step": 11310 + }, + { + "epoch": 2.266733466933868, + "grad_norm": 15.317553152723764, + "learning_rate": 1.7129788097493534e-06, + "loss": 1.4944, + "step": 11311 + }, + { + "epoch": 2.266933867735471, + "grad_norm": 23.376631297797235, + "learning_rate": 1.7121003606484682e-06, + "loss": 1.4094, + "step": 11312 + }, + { + "epoch": 2.2671342685370743, + "grad_norm": 39.49093435906596, + "learning_rate": 1.7112220903155402e-06, + "loss": 1.6609, + "step": 11313 + }, + { + "epoch": 2.2673346693386773, + "grad_norm": 20.26183763621446, + "learning_rate": 1.7103439987983217e-06, + "loss": 0.8782, + "step": 11314 + }, + { + "epoch": 2.2675350701402808, + "grad_norm": 70.28005910291583, + "learning_rate": 1.7094660861445577e-06, + "loss": 1.7727, + "step": 11315 + }, + { + "epoch": 2.2677354709418838, + "grad_norm": 18.34142960712633, + "learning_rate": 1.708588352401978e-06, + "loss": 1.1974, + "step": 11316 + }, + { + "epoch": 2.2679358717434868, + "grad_norm": 23.038862012881253, + "learning_rate": 1.7077107976183083e-06, + "loss": 1.7553, + "step": 11317 + }, + { + "epoch": 2.26813627254509, + "grad_norm": 29.600362097961746, + "learning_rate": 1.7068334218412624e-06, + "loss": 1.59, + "step": 11318 + }, + { + "epoch": 2.268336673346693, + "grad_norm": 17.835320342495148, + "learning_rate": 1.7059562251185441e-06, + "loss": 1.1643, + "step": 11319 + }, + { + "epoch": 2.2685370741482966, + "grad_norm": 20.32027759359553, + "learning_rate": 1.705079207497849e-06, + "loss": 1.2958, + "step": 11320 + }, + { + "epoch": 2.2687374749498996, + "grad_norm": 19.60129315318649, + "learning_rate": 1.7042023690268606e-06, + "loss": 1.2184, + "step": 11321 + }, + { + "epoch": 2.268937875751503, + "grad_norm": 22.619856847948533, + "learning_rate": 1.7033257097532563e-06, + "loss": 1.79, + "step": 11322 + }, + { + "epoch": 2.269138276553106, + "grad_norm": 24.194938851024688, + "learning_rate": 1.7024492297246964e-06, + "loss": 1.5451, + "step": 11323 + }, + { + "epoch": 2.2693386773547095, + "grad_norm": 17.35104430379104, + "learning_rate": 1.701572928988841e-06, + "loss": 0.7102, + "step": 11324 + }, + { + "epoch": 2.2695390781563125, + "grad_norm": 21.309214584902787, + "learning_rate": 1.7006968075933356e-06, + "loss": 1.2423, + "step": 11325 + }, + { + "epoch": 2.269739478957916, + "grad_norm": 18.645379111446335, + "learning_rate": 
1.699820865585814e-06, + "loss": 1.6852, + "step": 11326 + }, + { + "epoch": 2.269939879759519, + "grad_norm": 41.65031840631552, + "learning_rate": 1.6989451030139032e-06, + "loss": 1.2662, + "step": 11327 + }, + { + "epoch": 2.2701402805611224, + "grad_norm": 18.960367837052665, + "learning_rate": 1.6980695199252196e-06, + "loss": 1.2124, + "step": 11328 + }, + { + "epoch": 2.2703406813627254, + "grad_norm": 44.30240538115895, + "learning_rate": 1.6971941163673705e-06, + "loss": 1.4472, + "step": 11329 + }, + { + "epoch": 2.270541082164329, + "grad_norm": 18.60501087066622, + "learning_rate": 1.696318892387953e-06, + "loss": 1.613, + "step": 11330 + }, + { + "epoch": 2.270741482965932, + "grad_norm": 17.38212111464495, + "learning_rate": 1.6954438480345536e-06, + "loss": 1.6068, + "step": 11331 + }, + { + "epoch": 2.2709418837675353, + "grad_norm": 19.541525610924293, + "learning_rate": 1.6945689833547523e-06, + "loss": 1.2291, + "step": 11332 + }, + { + "epoch": 2.2711422845691382, + "grad_norm": 25.054772054527582, + "learning_rate": 1.6936942983961108e-06, + "loss": 1.5279, + "step": 11333 + }, + { + "epoch": 2.2713426853707412, + "grad_norm": 19.761351075312948, + "learning_rate": 1.6928197932061945e-06, + "loss": 1.9636, + "step": 11334 + }, + { + "epoch": 2.2715430861723447, + "grad_norm": 27.33557167044553, + "learning_rate": 1.6919454678325465e-06, + "loss": 1.4075, + "step": 11335 + }, + { + "epoch": 2.271743486973948, + "grad_norm": 21.410035769665274, + "learning_rate": 1.691071322322706e-06, + "loss": 1.4178, + "step": 11336 + }, + { + "epoch": 2.271943887775551, + "grad_norm": 26.5104246801113, + "learning_rate": 1.6901973567242024e-06, + "loss": 1.2166, + "step": 11337 + }, + { + "epoch": 2.272144288577154, + "grad_norm": 16.143488918908552, + "learning_rate": 1.689323571084554e-06, + "loss": 1.6283, + "step": 11338 + }, + { + "epoch": 2.2723446893787576, + "grad_norm": 17.207715408143393, + "learning_rate": 1.6884499654512721e-06, + "loss": 1.6298, + "step": 11339 + }, + { + "epoch": 2.2725450901803605, + "grad_norm": 20.846032064022697, + "learning_rate": 1.6875765398718502e-06, + "loss": 1.6994, + "step": 11340 + }, + { + "epoch": 2.272745490981964, + "grad_norm": 18.699930843290637, + "learning_rate": 1.6867032943937845e-06, + "loss": 1.994, + "step": 11341 + }, + { + "epoch": 2.272945891783567, + "grad_norm": 18.522119459765108, + "learning_rate": 1.6858302290645506e-06, + "loss": 1.6147, + "step": 11342 + }, + { + "epoch": 2.2731462925851704, + "grad_norm": 18.50712451749667, + "learning_rate": 1.6849573439316186e-06, + "loss": 1.7682, + "step": 11343 + }, + { + "epoch": 2.2733466933867734, + "grad_norm": 16.52423033361769, + "learning_rate": 1.6840846390424499e-06, + "loss": 1.2557, + "step": 11344 + }, + { + "epoch": 2.273547094188377, + "grad_norm": 22.10531276001236, + "learning_rate": 1.6832121144444936e-06, + "loss": 1.5417, + "step": 11345 + }, + { + "epoch": 2.27374749498998, + "grad_norm": 20.500325104748907, + "learning_rate": 1.682339770185193e-06, + "loss": 1.7728, + "step": 11346 + }, + { + "epoch": 2.2739478957915833, + "grad_norm": 23.120498971997392, + "learning_rate": 1.6814676063119723e-06, + "loss": 1.3035, + "step": 11347 + }, + { + "epoch": 2.2741482965931863, + "grad_norm": 17.60949881068761, + "learning_rate": 1.6805956228722604e-06, + "loss": 1.1796, + "step": 11348 + }, + { + "epoch": 2.2743486973947897, + "grad_norm": 17.188829403744215, + "learning_rate": 1.6797238199134618e-06, + "loss": 1.8359, + "step": 11349 + }, + { + "epoch": 
2.2745490981963927, + "grad_norm": 14.287372826235863, + "learning_rate": 1.6788521974829797e-06, + "loss": 1.2077, + "step": 11350 + }, + { + "epoch": 2.274749498997996, + "grad_norm": 17.54198084899559, + "learning_rate": 1.677980755628209e-06, + "loss": 1.58, + "step": 11351 + }, + { + "epoch": 2.274949899799599, + "grad_norm": 43.91979596942908, + "learning_rate": 1.6771094943965265e-06, + "loss": 1.8192, + "step": 11352 + }, + { + "epoch": 2.2751503006012026, + "grad_norm": 30.75528803051839, + "learning_rate": 1.6762384138353077e-06, + "loss": 1.9612, + "step": 11353 + }, + { + "epoch": 2.2753507014028056, + "grad_norm": 21.932995509074498, + "learning_rate": 1.675367513991909e-06, + "loss": 1.6669, + "step": 11354 + }, + { + "epoch": 2.2755511022044086, + "grad_norm": 21.73996747378963, + "learning_rate": 1.6744967949136881e-06, + "loss": 1.2853, + "step": 11355 + }, + { + "epoch": 2.275751503006012, + "grad_norm": 40.34974082148274, + "learning_rate": 1.673626256647987e-06, + "loss": 1.7657, + "step": 11356 + }, + { + "epoch": 2.2759519038076155, + "grad_norm": 20.066164101545407, + "learning_rate": 1.6727558992421334e-06, + "loss": 1.6612, + "step": 11357 + }, + { + "epoch": 2.2761523046092185, + "grad_norm": 19.355235935215912, + "learning_rate": 1.6718857227434566e-06, + "loss": 1.5457, + "step": 11358 + }, + { + "epoch": 2.2763527054108215, + "grad_norm": 21.03575922876772, + "learning_rate": 1.6710157271992634e-06, + "loss": 1.6467, + "step": 11359 + }, + { + "epoch": 2.276553106212425, + "grad_norm": 33.2649386245205, + "learning_rate": 1.6701459126568592e-06, + "loss": 2.2861, + "step": 11360 + }, + { + "epoch": 2.276753507014028, + "grad_norm": 60.1640568737553, + "learning_rate": 1.6692762791635376e-06, + "loss": 1.4789, + "step": 11361 + }, + { + "epoch": 2.2769539078156313, + "grad_norm": 35.447188670114144, + "learning_rate": 1.6684068267665809e-06, + "loss": 1.2224, + "step": 11362 + }, + { + "epoch": 2.2771543086172343, + "grad_norm": 18.028235430440184, + "learning_rate": 1.6675375555132644e-06, + "loss": 1.2465, + "step": 11363 + }, + { + "epoch": 2.2773547094188378, + "grad_norm": 22.562272384384297, + "learning_rate": 1.6666684654508474e-06, + "loss": 1.4662, + "step": 11364 + }, + { + "epoch": 2.2775551102204408, + "grad_norm": 62.7132087289357, + "learning_rate": 1.6657995566265895e-06, + "loss": 2.1196, + "step": 11365 + }, + { + "epoch": 2.277755511022044, + "grad_norm": 18.225632090933125, + "learning_rate": 1.6649308290877298e-06, + "loss": 1.191, + "step": 11366 + }, + { + "epoch": 2.277955911823647, + "grad_norm": 19.33099200083269, + "learning_rate": 1.6640622828815034e-06, + "loss": 1.5938, + "step": 11367 + }, + { + "epoch": 2.2781563126252506, + "grad_norm": 20.85711354691524, + "learning_rate": 1.6631939180551355e-06, + "loss": 1.8785, + "step": 11368 + }, + { + "epoch": 2.2783567134268536, + "grad_norm": 23.309227907187072, + "learning_rate": 1.6623257346558397e-06, + "loss": 1.2733, + "step": 11369 + }, + { + "epoch": 2.278557114228457, + "grad_norm": 20.86563875249689, + "learning_rate": 1.6614577327308222e-06, + "loss": 1.7431, + "step": 11370 + }, + { + "epoch": 2.27875751503006, + "grad_norm": 22.38811445517258, + "learning_rate": 1.6605899123272729e-06, + "loss": 1.5093, + "step": 11371 + }, + { + "epoch": 2.2789579158316635, + "grad_norm": 19.233155762842163, + "learning_rate": 1.6597222734923823e-06, + "loss": 1.6183, + "step": 11372 + }, + { + "epoch": 2.2791583166332665, + "grad_norm": 20.670104724121433, + "learning_rate": 
1.6588548162733216e-06, + "loss": 1.6114, + "step": 11373 + }, + { + "epoch": 2.27935871743487, + "grad_norm": 20.038922030772948, + "learning_rate": 1.6579875407172558e-06, + "loss": 1.5582, + "step": 11374 + }, + { + "epoch": 2.279559118236473, + "grad_norm": 18.94853000345888, + "learning_rate": 1.657120446871341e-06, + "loss": 1.5051, + "step": 11375 + }, + { + "epoch": 2.279759519038076, + "grad_norm": 23.2726774116089, + "learning_rate": 1.6562535347827213e-06, + "loss": 1.5268, + "step": 11376 + }, + { + "epoch": 2.2799599198396794, + "grad_norm": 28.926655990278164, + "learning_rate": 1.6553868044985334e-06, + "loss": 1.4541, + "step": 11377 + }, + { + "epoch": 2.2801603206412824, + "grad_norm": 57.62740373424201, + "learning_rate": 1.654520256065902e-06, + "loss": 1.8793, + "step": 11378 + }, + { + "epoch": 2.280360721442886, + "grad_norm": 69.17732985626242, + "learning_rate": 1.6536538895319444e-06, + "loss": 1.916, + "step": 11379 + }, + { + "epoch": 2.280561122244489, + "grad_norm": 26.040643835786344, + "learning_rate": 1.6527877049437624e-06, + "loss": 1.9034, + "step": 11380 + }, + { + "epoch": 2.2807615230460923, + "grad_norm": 22.871348200415156, + "learning_rate": 1.6519217023484524e-06, + "loss": 1.3721, + "step": 11381 + }, + { + "epoch": 2.2809619238476952, + "grad_norm": 32.11166507958743, + "learning_rate": 1.651055881793105e-06, + "loss": 1.8566, + "step": 11382 + }, + { + "epoch": 2.2811623246492987, + "grad_norm": 30.892453755378334, + "learning_rate": 1.6501902433247914e-06, + "loss": 1.5042, + "step": 11383 + }, + { + "epoch": 2.2813627254509017, + "grad_norm": 15.870269116127908, + "learning_rate": 1.6493247869905787e-06, + "loss": 1.3449, + "step": 11384 + }, + { + "epoch": 2.281563126252505, + "grad_norm": 18.75545147930808, + "learning_rate": 1.6484595128375236e-06, + "loss": 1.4835, + "step": 11385 + }, + { + "epoch": 2.281763527054108, + "grad_norm": 21.686176655533973, + "learning_rate": 1.647594420912672e-06, + "loss": 1.6373, + "step": 11386 + }, + { + "epoch": 2.2819639278557116, + "grad_norm": 22.034024606044795, + "learning_rate": 1.6467295112630622e-06, + "loss": 1.4361, + "step": 11387 + }, + { + "epoch": 2.2821643286573146, + "grad_norm": 19.221201117555523, + "learning_rate": 1.645864783935715e-06, + "loss": 1.5527, + "step": 11388 + }, + { + "epoch": 2.282364729458918, + "grad_norm": 22.7444761165957, + "learning_rate": 1.6450002389776543e-06, + "loss": 1.4974, + "step": 11389 + }, + { + "epoch": 2.282565130260521, + "grad_norm": 30.240745261818887, + "learning_rate": 1.6441358764358812e-06, + "loss": 0.8141, + "step": 11390 + }, + { + "epoch": 2.2827655310621244, + "grad_norm": 27.90468551949286, + "learning_rate": 1.6432716963573942e-06, + "loss": 1.5333, + "step": 11391 + }, + { + "epoch": 2.2829659318637274, + "grad_norm": 33.917681944408436, + "learning_rate": 1.64240769878918e-06, + "loss": 1.4605, + "step": 11392 + }, + { + "epoch": 2.2831663326653304, + "grad_norm": 38.03389950584327, + "learning_rate": 1.641543883778216e-06, + "loss": 2.0325, + "step": 11393 + }, + { + "epoch": 2.283366733466934, + "grad_norm": 22.82010579635823, + "learning_rate": 1.640680251371468e-06, + "loss": 1.3174, + "step": 11394 + }, + { + "epoch": 2.2835671342685373, + "grad_norm": 21.268648203951358, + "learning_rate": 1.6398168016158938e-06, + "loss": 1.3266, + "step": 11395 + }, + { + "epoch": 2.2837675350701403, + "grad_norm": 47.79471797041225, + "learning_rate": 1.6389535345584417e-06, + "loss": 1.7036, + "step": 11396 + }, + { + "epoch": 
2.2839679358717433, + "grad_norm": 18.363563941537127, + "learning_rate": 1.638090450246046e-06, + "loss": 1.6035, + "step": 11397 + }, + { + "epoch": 2.2841683366733467, + "grad_norm": 31.4147502894633, + "learning_rate": 1.6372275487256344e-06, + "loss": 1.398, + "step": 11398 + }, + { + "epoch": 2.2843687374749497, + "grad_norm": 19.8755141349809, + "learning_rate": 1.6363648300441254e-06, + "loss": 1.1849, + "step": 11399 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 27.822513473645067, + "learning_rate": 1.6355022942484262e-06, + "loss": 1.6502, + "step": 11400 + }, + { + "epoch": 2.284769539078156, + "grad_norm": 21.382055144772494, + "learning_rate": 1.634639941385433e-06, + "loss": 1.3621, + "step": 11401 + }, + { + "epoch": 2.2849699398797596, + "grad_norm": 36.82245845523203, + "learning_rate": 1.6337777715020342e-06, + "loss": 1.6647, + "step": 11402 + }, + { + "epoch": 2.2851703406813626, + "grad_norm": 37.514269404295916, + "learning_rate": 1.6329157846451083e-06, + "loss": 1.9402, + "step": 11403 + }, + { + "epoch": 2.285370741482966, + "grad_norm": 24.39793052162465, + "learning_rate": 1.6320539808615205e-06, + "loss": 1.6689, + "step": 11404 + }, + { + "epoch": 2.285571142284569, + "grad_norm": 24.76571910458077, + "learning_rate": 1.6311923601981271e-06, + "loss": 1.3074, + "step": 11405 + }, + { + "epoch": 2.2857715430861725, + "grad_norm": 19.530545469066507, + "learning_rate": 1.630330922701781e-06, + "loss": 1.3333, + "step": 11406 + }, + { + "epoch": 2.2859719438877755, + "grad_norm": 18.60757720913466, + "learning_rate": 1.6294696684193157e-06, + "loss": 1.4562, + "step": 11407 + }, + { + "epoch": 2.286172344689379, + "grad_norm": 53.22063975324122, + "learning_rate": 1.6286085973975597e-06, + "loss": 1.6834, + "step": 11408 + }, + { + "epoch": 2.286372745490982, + "grad_norm": 17.671109925513797, + "learning_rate": 1.6277477096833305e-06, + "loss": 1.4868, + "step": 11409 + }, + { + "epoch": 2.2865731462925853, + "grad_norm": 24.96213853204798, + "learning_rate": 1.6268870053234364e-06, + "loss": 1.7018, + "step": 11410 + }, + { + "epoch": 2.2867735470941883, + "grad_norm": 20.459725795205895, + "learning_rate": 1.6260264843646751e-06, + "loss": 1.1136, + "step": 11411 + }, + { + "epoch": 2.286973947895792, + "grad_norm": 21.80270833365829, + "learning_rate": 1.6251661468538343e-06, + "loss": 1.0887, + "step": 11412 + }, + { + "epoch": 2.2871743486973948, + "grad_norm": 16.057578059125774, + "learning_rate": 1.6243059928376935e-06, + "loss": 0.8744, + "step": 11413 + }, + { + "epoch": 2.2873747494989978, + "grad_norm": 35.28990873580099, + "learning_rate": 1.6234460223630172e-06, + "loss": 1.8478, + "step": 11414 + }, + { + "epoch": 2.287575150300601, + "grad_norm": 43.42222130695103, + "learning_rate": 1.6225862354765647e-06, + "loss": 2.0575, + "step": 11415 + }, + { + "epoch": 2.2877755511022047, + "grad_norm": 25.845284369820764, + "learning_rate": 1.621726632225084e-06, + "loss": 1.4852, + "step": 11416 + }, + { + "epoch": 2.2879759519038076, + "grad_norm": 36.66267672292626, + "learning_rate": 1.6208672126553137e-06, + "loss": 1.4645, + "step": 11417 + }, + { + "epoch": 2.2881763527054106, + "grad_norm": 22.540693352922297, + "learning_rate": 1.620007976813981e-06, + "loss": 1.7665, + "step": 11418 + }, + { + "epoch": 2.288376753507014, + "grad_norm": 20.670572219135757, + "learning_rate": 1.6191489247478031e-06, + "loss": 1.4026, + "step": 11419 + }, + { + "epoch": 2.288577154308617, + "grad_norm": 26.790341018802717, + "learning_rate": 
1.6182900565034914e-06, + "loss": 1.982, + "step": 11420 + }, + { + "epoch": 2.2887775551102205, + "grad_norm": 20.109618160413575, + "learning_rate": 1.6174313721277373e-06, + "loss": 1.4968, + "step": 11421 + }, + { + "epoch": 2.2889779559118235, + "grad_norm": 32.225428226081114, + "learning_rate": 1.6165728716672363e-06, + "loss": 1.6008, + "step": 11422 + }, + { + "epoch": 2.289178356713427, + "grad_norm": 18.7611426783464, + "learning_rate": 1.6157145551686604e-06, + "loss": 1.5286, + "step": 11423 + }, + { + "epoch": 2.28937875751503, + "grad_norm": 26.83090740308511, + "learning_rate": 1.6148564226786806e-06, + "loss": 1.418, + "step": 11424 + }, + { + "epoch": 2.2895791583166334, + "grad_norm": 20.00195041310794, + "learning_rate": 1.6139984742439534e-06, + "loss": 1.55, + "step": 11425 + }, + { + "epoch": 2.2897795591182364, + "grad_norm": 20.781982453308682, + "learning_rate": 1.6131407099111273e-06, + "loss": 1.6667, + "step": 11426 + }, + { + "epoch": 2.28997995991984, + "grad_norm": 27.440275957629794, + "learning_rate": 1.6122831297268427e-06, + "loss": 1.1718, + "step": 11427 + }, + { + "epoch": 2.290180360721443, + "grad_norm": 23.37005763745077, + "learning_rate": 1.611425733737721e-06, + "loss": 1.6483, + "step": 11428 + }, + { + "epoch": 2.2903807615230463, + "grad_norm": 26.620366255113147, + "learning_rate": 1.6105685219903877e-06, + "loss": 1.4325, + "step": 11429 + }, + { + "epoch": 2.2905811623246493, + "grad_norm": 18.403658457010884, + "learning_rate": 1.6097114945314456e-06, + "loss": 1.9088, + "step": 11430 + }, + { + "epoch": 2.2907815631262527, + "grad_norm": 19.913238509000262, + "learning_rate": 1.6088546514074937e-06, + "loss": 1.4933, + "step": 11431 + }, + { + "epoch": 2.2909819639278557, + "grad_norm": 15.658313793770478, + "learning_rate": 1.6079979926651208e-06, + "loss": 0.94, + "step": 11432 + }, + { + "epoch": 2.291182364729459, + "grad_norm": 23.338216912195424, + "learning_rate": 1.6071415183509038e-06, + "loss": 2.0369, + "step": 11433 + }, + { + "epoch": 2.291382765531062, + "grad_norm": 31.851127445967606, + "learning_rate": 1.6062852285114122e-06, + "loss": 1.3753, + "step": 11434 + }, + { + "epoch": 2.291583166332665, + "grad_norm": 22.61313684191154, + "learning_rate": 1.6054291231931989e-06, + "loss": 1.4719, + "step": 11435 + }, + { + "epoch": 2.2917835671342686, + "grad_norm": 21.278827467532484, + "learning_rate": 1.6045732024428174e-06, + "loss": 1.6579, + "step": 11436 + }, + { + "epoch": 2.2919839679358716, + "grad_norm": 21.964031457080537, + "learning_rate": 1.6037174663068038e-06, + "loss": 1.3385, + "step": 11437 + }, + { + "epoch": 2.292184368737475, + "grad_norm": 23.955120830510168, + "learning_rate": 1.602861914831682e-06, + "loss": 1.7023, + "step": 11438 + }, + { + "epoch": 2.292384769539078, + "grad_norm": 25.756676860453346, + "learning_rate": 1.6020065480639757e-06, + "loss": 1.5662, + "step": 11439 + }, + { + "epoch": 2.2925851703406814, + "grad_norm": 19.781529070974134, + "learning_rate": 1.6011513660501882e-06, + "loss": 1.2927, + "step": 11440 + }, + { + "epoch": 2.2927855711422844, + "grad_norm": 26.40168064989166, + "learning_rate": 1.6002963688368172e-06, + "loss": 1.7475, + "step": 11441 + }, + { + "epoch": 2.292985971943888, + "grad_norm": 18.111106181510426, + "learning_rate": 1.599441556470352e-06, + "loss": 1.5169, + "step": 11442 + }, + { + "epoch": 2.293186372745491, + "grad_norm": 20.018407414720233, + "learning_rate": 1.5985869289972683e-06, + "loss": 1.6275, + "step": 11443 + }, + { + "epoch": 
2.2933867735470943, + "grad_norm": 28.87850836759069, + "learning_rate": 1.5977324864640358e-06, + "loss": 1.7125, + "step": 11444 + }, + { + "epoch": 2.2935871743486973, + "grad_norm": 28.010454730954102, + "learning_rate": 1.5968782289171075e-06, + "loss": 1.578, + "step": 11445 + }, + { + "epoch": 2.2937875751503007, + "grad_norm": 34.41778963749615, + "learning_rate": 1.5960241564029365e-06, + "loss": 1.7847, + "step": 11446 + }, + { + "epoch": 2.2939879759519037, + "grad_norm": 18.136018257806917, + "learning_rate": 1.5951702689679549e-06, + "loss": 1.367, + "step": 11447 + }, + { + "epoch": 2.294188376753507, + "grad_norm": 27.36091193075434, + "learning_rate": 1.594316566658592e-06, + "loss": 1.5054, + "step": 11448 + }, + { + "epoch": 2.29438877755511, + "grad_norm": 19.49779554836538, + "learning_rate": 1.593463049521265e-06, + "loss": 1.5698, + "step": 11449 + }, + { + "epoch": 2.2945891783567136, + "grad_norm": 25.007091915486278, + "learning_rate": 1.5926097176023798e-06, + "loss": 1.9318, + "step": 11450 + }, + { + "epoch": 2.2947895791583166, + "grad_norm": 17.037059698900684, + "learning_rate": 1.5917565709483357e-06, + "loss": 1.8096, + "step": 11451 + }, + { + "epoch": 2.2949899799599196, + "grad_norm": 23.53999135548743, + "learning_rate": 1.5909036096055152e-06, + "loss": 1.975, + "step": 11452 + }, + { + "epoch": 2.295190380761523, + "grad_norm": 40.50391285648793, + "learning_rate": 1.5900508336203007e-06, + "loss": 1.8883, + "step": 11453 + }, + { + "epoch": 2.2953907815631265, + "grad_norm": 20.709329113985515, + "learning_rate": 1.589198243039054e-06, + "loss": 1.4082, + "step": 11454 + }, + { + "epoch": 2.2955911823647295, + "grad_norm": 23.093702476160836, + "learning_rate": 1.588345837908134e-06, + "loss": 1.5416, + "step": 11455 + }, + { + "epoch": 2.2957915831663325, + "grad_norm": 45.8566068173833, + "learning_rate": 1.5874936182738864e-06, + "loss": 1.8391, + "step": 11456 + }, + { + "epoch": 2.295991983967936, + "grad_norm": 26.863098148370543, + "learning_rate": 1.5866415841826487e-06, + "loss": 1.6925, + "step": 11457 + }, + { + "epoch": 2.296192384769539, + "grad_norm": 22.51795287663469, + "learning_rate": 1.5857897356807477e-06, + "loss": 1.1447, + "step": 11458 + }, + { + "epoch": 2.2963927855711423, + "grad_norm": 21.51166641048555, + "learning_rate": 1.5849380728144958e-06, + "loss": 1.6092, + "step": 11459 + }, + { + "epoch": 2.2965931863727453, + "grad_norm": 27.55511147627035, + "learning_rate": 1.5840865956302054e-06, + "loss": 1.7193, + "step": 11460 + }, + { + "epoch": 2.296793587174349, + "grad_norm": 19.62737846982583, + "learning_rate": 1.5832353041741672e-06, + "loss": 1.5936, + "step": 11461 + }, + { + "epoch": 2.296993987975952, + "grad_norm": 19.961378410490372, + "learning_rate": 1.5823841984926675e-06, + "loss": 1.7393, + "step": 11462 + }, + { + "epoch": 2.297194388777555, + "grad_norm": 19.287987271052174, + "learning_rate": 1.581533278631987e-06, + "loss": 1.6207, + "step": 11463 + }, + { + "epoch": 2.297394789579158, + "grad_norm": 27.819299837092185, + "learning_rate": 1.5806825446383867e-06, + "loss": 1.8115, + "step": 11464 + }, + { + "epoch": 2.2975951903807617, + "grad_norm": 70.8002792955169, + "learning_rate": 1.579831996558124e-06, + "loss": 2.0929, + "step": 11465 + }, + { + "epoch": 2.2977955911823646, + "grad_norm": 23.715886551340606, + "learning_rate": 1.5789816344374437e-06, + "loss": 1.5791, + "step": 11466 + }, + { + "epoch": 2.297995991983968, + "grad_norm": 21.54190859406678, + "learning_rate": 
1.578131458322582e-06, + "loss": 1.6872, + "step": 11467 + }, + { + "epoch": 2.298196392785571, + "grad_norm": 25.089617025284348, + "learning_rate": 1.577281468259766e-06, + "loss": 1.9208, + "step": 11468 + }, + { + "epoch": 2.2983967935871745, + "grad_norm": 57.63714569896813, + "learning_rate": 1.5764316642952055e-06, + "loss": 1.7688, + "step": 11469 + }, + { + "epoch": 2.2985971943887775, + "grad_norm": 22.073855580898538, + "learning_rate": 1.5755820464751125e-06, + "loss": 1.2047, + "step": 11470 + }, + { + "epoch": 2.298797595190381, + "grad_norm": 20.94598188267791, + "learning_rate": 1.5747326148456765e-06, + "loss": 1.2104, + "step": 11471 + }, + { + "epoch": 2.298997995991984, + "grad_norm": 21.281936501294982, + "learning_rate": 1.5738833694530847e-06, + "loss": 1.453, + "step": 11472 + }, + { + "epoch": 2.299198396793587, + "grad_norm": 25.8252742201558, + "learning_rate": 1.5730343103435118e-06, + "loss": 1.6666, + "step": 11473 + }, + { + "epoch": 2.2993987975951904, + "grad_norm": 21.776935114031037, + "learning_rate": 1.5721854375631219e-06, + "loss": 1.8799, + "step": 11474 + }, + { + "epoch": 2.299599198396794, + "grad_norm": 24.659303288998117, + "learning_rate": 1.5713367511580713e-06, + "loss": 1.608, + "step": 11475 + }, + { + "epoch": 2.299799599198397, + "grad_norm": 19.891659638878263, + "learning_rate": 1.570488251174499e-06, + "loss": 1.3927, + "step": 11476 + }, + { + "epoch": 2.3, + "grad_norm": 19.71853697724485, + "learning_rate": 1.5696399376585468e-06, + "loss": 1.3651, + "step": 11477 + }, + { + "epoch": 2.3002004008016033, + "grad_norm": 28.539462923999864, + "learning_rate": 1.5687918106563326e-06, + "loss": 1.6017, + "step": 11478 + }, + { + "epoch": 2.3004008016032063, + "grad_norm": 73.61390438341111, + "learning_rate": 1.5679438702139732e-06, + "loss": 1.84, + "step": 11479 + }, + { + "epoch": 2.3006012024048097, + "grad_norm": 26.95368206846166, + "learning_rate": 1.5670961163775716e-06, + "loss": 1.0525, + "step": 11480 + }, + { + "epoch": 2.3008016032064127, + "grad_norm": 59.6215044455147, + "learning_rate": 1.5662485491932217e-06, + "loss": 1.6243, + "step": 11481 + }, + { + "epoch": 2.301002004008016, + "grad_norm": 24.013954884657238, + "learning_rate": 1.5654011687070064e-06, + "loss": 1.5521, + "step": 11482 + }, + { + "epoch": 2.301202404809619, + "grad_norm": 31.54861728631849, + "learning_rate": 1.5645539749649996e-06, + "loss": 1.5715, + "step": 11483 + }, + { + "epoch": 2.3014028056112226, + "grad_norm": 19.433813723942382, + "learning_rate": 1.5637069680132667e-06, + "loss": 1.2544, + "step": 11484 + }, + { + "epoch": 2.3016032064128256, + "grad_norm": 17.913967083969915, + "learning_rate": 1.562860147897856e-06, + "loss": 1.2252, + "step": 11485 + }, + { + "epoch": 2.301803607214429, + "grad_norm": 24.05609768621404, + "learning_rate": 1.562013514664813e-06, + "loss": 1.9919, + "step": 11486 + }, + { + "epoch": 2.302004008016032, + "grad_norm": 38.26222544856097, + "learning_rate": 1.5611670683601698e-06, + "loss": 1.549, + "step": 11487 + }, + { + "epoch": 2.3022044088176354, + "grad_norm": 40.97087985437441, + "learning_rate": 1.56032080902995e-06, + "loss": 1.9004, + "step": 11488 + }, + { + "epoch": 2.3024048096192384, + "grad_norm": 23.462931608463382, + "learning_rate": 1.5594747367201647e-06, + "loss": 1.5567, + "step": 11489 + }, + { + "epoch": 2.302605210420842, + "grad_norm": 21.80577450060282, + "learning_rate": 1.5586288514768166e-06, + "loss": 1.5356, + "step": 11490 + }, + { + "epoch": 2.302805611222445, + 
"grad_norm": 24.4486620588645, + "learning_rate": 1.5577831533458998e-06, + "loss": 1.5018, + "step": 11491 + }, + { + "epoch": 2.3030060120240483, + "grad_norm": 21.723077061225695, + "learning_rate": 1.5569376423733906e-06, + "loss": 1.5425, + "step": 11492 + }, + { + "epoch": 2.3032064128256513, + "grad_norm": 26.08531053462995, + "learning_rate": 1.5560923186052657e-06, + "loss": 1.6219, + "step": 11493 + }, + { + "epoch": 2.3034068136272543, + "grad_norm": 19.86440361502596, + "learning_rate": 1.555247182087487e-06, + "loss": 1.4893, + "step": 11494 + }, + { + "epoch": 2.3036072144288577, + "grad_norm": 19.137932042350613, + "learning_rate": 1.554402232866002e-06, + "loss": 1.34, + "step": 11495 + }, + { + "epoch": 2.3038076152304607, + "grad_norm": 17.26289567793116, + "learning_rate": 1.553557470986754e-06, + "loss": 1.5388, + "step": 11496 + }, + { + "epoch": 2.304008016032064, + "grad_norm": 20.402753525441927, + "learning_rate": 1.552712896495674e-06, + "loss": 1.4582, + "step": 11497 + }, + { + "epoch": 2.304208416833667, + "grad_norm": 24.53657780362802, + "learning_rate": 1.551868509438682e-06, + "loss": 1.5778, + "step": 11498 + }, + { + "epoch": 2.3044088176352706, + "grad_norm": 16.74085324268268, + "learning_rate": 1.551024309861689e-06, + "loss": 1.7582, + "step": 11499 + }, + { + "epoch": 2.3046092184368736, + "grad_norm": 55.51602912013667, + "learning_rate": 1.5501802978105957e-06, + "loss": 1.6594, + "step": 11500 + }, + { + "epoch": 2.304809619238477, + "grad_norm": 20.810953553328723, + "learning_rate": 1.5493364733312938e-06, + "loss": 1.3862, + "step": 11501 + }, + { + "epoch": 2.30501002004008, + "grad_norm": 27.371160252868506, + "learning_rate": 1.5484928364696594e-06, + "loss": 1.8866, + "step": 11502 + }, + { + "epoch": 2.3052104208416835, + "grad_norm": 22.38214915002998, + "learning_rate": 1.5476493872715648e-06, + "loss": 1.7103, + "step": 11503 + }, + { + "epoch": 2.3054108216432865, + "grad_norm": 20.818384217625074, + "learning_rate": 1.5468061257828692e-06, + "loss": 1.4781, + "step": 11504 + }, + { + "epoch": 2.30561122244489, + "grad_norm": 16.79330540520873, + "learning_rate": 1.5459630520494217e-06, + "loss": 1.5767, + "step": 11505 + }, + { + "epoch": 2.305811623246493, + "grad_norm": 20.162625332294173, + "learning_rate": 1.5451201661170618e-06, + "loss": 1.5326, + "step": 11506 + }, + { + "epoch": 2.3060120240480964, + "grad_norm": 18.648546758521473, + "learning_rate": 1.544277468031618e-06, + "loss": 1.8476, + "step": 11507 + }, + { + "epoch": 2.3062124248496993, + "grad_norm": 14.549382467262351, + "learning_rate": 1.5434349578389114e-06, + "loss": 1.618, + "step": 11508 + }, + { + "epoch": 2.306412825651303, + "grad_norm": 18.467681755885323, + "learning_rate": 1.5425926355847449e-06, + "loss": 1.1228, + "step": 11509 + }, + { + "epoch": 2.306613226452906, + "grad_norm": 16.24584409125997, + "learning_rate": 1.5417505013149236e-06, + "loss": 1.2657, + "step": 11510 + }, + { + "epoch": 2.306813627254509, + "grad_norm": 20.382605833960735, + "learning_rate": 1.540908555075231e-06, + "loss": 1.9984, + "step": 11511 + }, + { + "epoch": 2.307014028056112, + "grad_norm": 22.26968014616204, + "learning_rate": 1.5400667969114463e-06, + "loss": 1.3386, + "step": 11512 + }, + { + "epoch": 2.3072144288577157, + "grad_norm": 16.838683484445014, + "learning_rate": 1.5392252268693375e-06, + "loss": 1.3242, + "step": 11513 + }, + { + "epoch": 2.3074148296593187, + "grad_norm": 23.539826299799884, + "learning_rate": 1.5383838449946615e-06, + "loss": 1.3758, 
+ "step": 11514 + }, + { + "epoch": 2.3076152304609217, + "grad_norm": 24.64897281458211, + "learning_rate": 1.537542651333167e-06, + "loss": 1.8713, + "step": 11515 + }, + { + "epoch": 2.307815631262525, + "grad_norm": 83.64169024020475, + "learning_rate": 1.5367016459305866e-06, + "loss": 1.4882, + "step": 11516 + }, + { + "epoch": 2.308016032064128, + "grad_norm": 65.40461862519767, + "learning_rate": 1.5358608288326533e-06, + "loss": 1.9547, + "step": 11517 + }, + { + "epoch": 2.3082164328657315, + "grad_norm": 17.597048539882678, + "learning_rate": 1.5350202000850785e-06, + "loss": 1.4426, + "step": 11518 + }, + { + "epoch": 2.3084168336673345, + "grad_norm": 13.335817983936865, + "learning_rate": 1.53417975973357e-06, + "loss": 1.2585, + "step": 11519 + }, + { + "epoch": 2.308617234468938, + "grad_norm": 32.3685662632426, + "learning_rate": 1.5333395078238245e-06, + "loss": 1.307, + "step": 11520 + }, + { + "epoch": 2.308817635270541, + "grad_norm": 24.415430675415106, + "learning_rate": 1.5324994444015267e-06, + "loss": 1.6042, + "step": 11521 + }, + { + "epoch": 2.3090180360721444, + "grad_norm": 23.55578743524026, + "learning_rate": 1.5316595695123548e-06, + "loss": 1.1857, + "step": 11522 + }, + { + "epoch": 2.3092184368737474, + "grad_norm": 19.611288354304644, + "learning_rate": 1.5308198832019683e-06, + "loss": 1.5544, + "step": 11523 + }, + { + "epoch": 2.309418837675351, + "grad_norm": 24.502123953919156, + "learning_rate": 1.5299803855160271e-06, + "loss": 1.0284, + "step": 11524 + }, + { + "epoch": 2.309619238476954, + "grad_norm": 26.402975900575075, + "learning_rate": 1.5291410765001768e-06, + "loss": 1.5797, + "step": 11525 + }, + { + "epoch": 2.3098196392785573, + "grad_norm": 23.503644577248295, + "learning_rate": 1.5283019562000456e-06, + "loss": 1.742, + "step": 11526 + }, + { + "epoch": 2.3100200400801603, + "grad_norm": 42.79929835170259, + "learning_rate": 1.5274630246612654e-06, + "loss": 1.3556, + "step": 11527 + }, + { + "epoch": 2.3102204408817637, + "grad_norm": 22.846858094164755, + "learning_rate": 1.5266242819294452e-06, + "loss": 1.4445, + "step": 11528 + }, + { + "epoch": 2.3104208416833667, + "grad_norm": 17.345912809230377, + "learning_rate": 1.5257857280501898e-06, + "loss": 1.2833, + "step": 11529 + }, + { + "epoch": 2.31062124248497, + "grad_norm": 17.96540075837815, + "learning_rate": 1.5249473630690924e-06, + "loss": 1.6639, + "step": 11530 + }, + { + "epoch": 2.310821643286573, + "grad_norm": 21.171309347852464, + "learning_rate": 1.5241091870317365e-06, + "loss": 1.6357, + "step": 11531 + }, + { + "epoch": 2.311022044088176, + "grad_norm": 18.092760809376937, + "learning_rate": 1.523271199983697e-06, + "loss": 1.7698, + "step": 11532 + }, + { + "epoch": 2.3112224448897796, + "grad_norm": 20.56628632312104, + "learning_rate": 1.5224334019705312e-06, + "loss": 1.5744, + "step": 11533 + }, + { + "epoch": 2.311422845691383, + "grad_norm": 22.87738764279613, + "learning_rate": 1.5215957930377978e-06, + "loss": 1.5091, + "step": 11534 + }, + { + "epoch": 2.311623246492986, + "grad_norm": 19.81644174511889, + "learning_rate": 1.5207583732310337e-06, + "loss": 1.7544, + "step": 11535 + }, + { + "epoch": 2.311823647294589, + "grad_norm": 23.23829560460371, + "learning_rate": 1.5199211425957732e-06, + "loss": 1.6211, + "step": 11536 + }, + { + "epoch": 2.3120240480961924, + "grad_norm": 28.09957772203038, + "learning_rate": 1.519084101177537e-06, + "loss": 2.0067, + "step": 11537 + }, + { + "epoch": 2.3122244488977954, + "grad_norm": 20.99779341861268, + 
"learning_rate": 1.518247249021837e-06, + "loss": 1.1932, + "step": 11538 + }, + { + "epoch": 2.312424849699399, + "grad_norm": 18.918131274527262, + "learning_rate": 1.5174105861741745e-06, + "loss": 1.2294, + "step": 11539 + }, + { + "epoch": 2.312625250501002, + "grad_norm": 23.07104588747038, + "learning_rate": 1.5165741126800365e-06, + "loss": 1.5615, + "step": 11540 + }, + { + "epoch": 2.3128256513026053, + "grad_norm": 27.981303870734802, + "learning_rate": 1.515737828584909e-06, + "loss": 1.5122, + "step": 11541 + }, + { + "epoch": 2.3130260521042083, + "grad_norm": 36.076481647387865, + "learning_rate": 1.5149017339342576e-06, + "loss": 1.8228, + "step": 11542 + }, + { + "epoch": 2.3132264529058117, + "grad_norm": 28.220710896592802, + "learning_rate": 1.5140658287735416e-06, + "loss": 1.9141, + "step": 11543 + }, + { + "epoch": 2.3134268537074147, + "grad_norm": 20.33187828672991, + "learning_rate": 1.5132301131482158e-06, + "loss": 1.9035, + "step": 11544 + }, + { + "epoch": 2.313627254509018, + "grad_norm": 23.57273228340725, + "learning_rate": 1.5123945871037138e-06, + "loss": 1.6831, + "step": 11545 + }, + { + "epoch": 2.313827655310621, + "grad_norm": 25.458297758389993, + "learning_rate": 1.5115592506854676e-06, + "loss": 1.165, + "step": 11546 + }, + { + "epoch": 2.3140280561122246, + "grad_norm": 18.272625395173588, + "learning_rate": 1.5107241039388914e-06, + "loss": 1.7004, + "step": 11547 + }, + { + "epoch": 2.3142284569138276, + "grad_norm": 35.1973893069822, + "learning_rate": 1.5098891469093978e-06, + "loss": 1.1238, + "step": 11548 + }, + { + "epoch": 2.314428857715431, + "grad_norm": 29.14823548474243, + "learning_rate": 1.509054379642385e-06, + "loss": 1.3101, + "step": 11549 + }, + { + "epoch": 2.314629258517034, + "grad_norm": 29.075015192006184, + "learning_rate": 1.508219802183235e-06, + "loss": 1.525, + "step": 11550 + }, + { + "epoch": 2.3148296593186375, + "grad_norm": 17.475643572289997, + "learning_rate": 1.5073854145773326e-06, + "loss": 1.4979, + "step": 11551 + }, + { + "epoch": 2.3150300601202405, + "grad_norm": 28.208034926516138, + "learning_rate": 1.5065512168700385e-06, + "loss": 1.7305, + "step": 11552 + }, + { + "epoch": 2.3152304609218435, + "grad_norm": 21.420549703756127, + "learning_rate": 1.5057172091067123e-06, + "loss": 1.6307, + "step": 11553 + }, + { + "epoch": 2.315430861723447, + "grad_norm": 21.520535894541254, + "learning_rate": 1.5048833913326989e-06, + "loss": 1.817, + "step": 11554 + }, + { + "epoch": 2.31563126252505, + "grad_norm": 19.236591205140154, + "learning_rate": 1.5040497635933348e-06, + "loss": 1.6734, + "step": 11555 + }, + { + "epoch": 2.3158316633266534, + "grad_norm": 20.782144052414804, + "learning_rate": 1.503216325933947e-06, + "loss": 1.6753, + "step": 11556 + }, + { + "epoch": 2.3160320641282564, + "grad_norm": 20.34139765538786, + "learning_rate": 1.5023830783998466e-06, + "loss": 1.5358, + "step": 11557 + }, + { + "epoch": 2.31623246492986, + "grad_norm": 31.948274268775776, + "learning_rate": 1.5015500210363443e-06, + "loss": 1.5886, + "step": 11558 + }, + { + "epoch": 2.316432865731463, + "grad_norm": 21.848834982779827, + "learning_rate": 1.50071715388873e-06, + "loss": 1.8177, + "step": 11559 + }, + { + "epoch": 2.3166332665330662, + "grad_norm": 22.32254862993936, + "learning_rate": 1.499884477002289e-06, + "loss": 1.977, + "step": 11560 + }, + { + "epoch": 2.3168336673346692, + "grad_norm": 22.15029484959571, + "learning_rate": 1.4990519904222956e-06, + "loss": 1.5515, + "step": 11561 + }, + { + 
"epoch": 2.3170340681362727, + "grad_norm": 22.038930991815956, + "learning_rate": 1.4982196941940135e-06, + "loss": 1.8922, + "step": 11562 + }, + { + "epoch": 2.3172344689378757, + "grad_norm": 27.11626013929431, + "learning_rate": 1.4973875883626975e-06, + "loss": 1.1584, + "step": 11563 + }, + { + "epoch": 2.317434869739479, + "grad_norm": 34.348071652399945, + "learning_rate": 1.496555672973586e-06, + "loss": 1.4237, + "step": 11564 + }, + { + "epoch": 2.317635270541082, + "grad_norm": 42.60476056247978, + "learning_rate": 1.4957239480719165e-06, + "loss": 1.4206, + "step": 11565 + }, + { + "epoch": 2.3178356713426855, + "grad_norm": 27.93510065850696, + "learning_rate": 1.4948924137029075e-06, + "loss": 1.9623, + "step": 11566 + }, + { + "epoch": 2.3180360721442885, + "grad_norm": 32.953512363871184, + "learning_rate": 1.4940610699117714e-06, + "loss": 1.8608, + "step": 11567 + }, + { + "epoch": 2.318236472945892, + "grad_norm": 25.892052018599507, + "learning_rate": 1.4932299167437109e-06, + "loss": 1.8416, + "step": 11568 + }, + { + "epoch": 2.318436873747495, + "grad_norm": 20.00960300851884, + "learning_rate": 1.492398954243916e-06, + "loss": 1.4093, + "step": 11569 + }, + { + "epoch": 2.318637274549098, + "grad_norm": 24.115803600051194, + "learning_rate": 1.4915681824575672e-06, + "loss": 1.7263, + "step": 11570 + }, + { + "epoch": 2.3188376753507014, + "grad_norm": 18.36625712050104, + "learning_rate": 1.4907376014298359e-06, + "loss": 1.5687, + "step": 11571 + }, + { + "epoch": 2.319038076152305, + "grad_norm": 21.559155190622587, + "learning_rate": 1.4899072112058827e-06, + "loss": 1.2926, + "step": 11572 + }, + { + "epoch": 2.319238476953908, + "grad_norm": 25.788523889861985, + "learning_rate": 1.4890770118308546e-06, + "loss": 1.8868, + "step": 11573 + }, + { + "epoch": 2.319438877755511, + "grad_norm": 21.442410825696637, + "learning_rate": 1.4882470033498898e-06, + "loss": 1.7524, + "step": 11574 + }, + { + "epoch": 2.3196392785571143, + "grad_norm": 21.43285019928418, + "learning_rate": 1.4874171858081227e-06, + "loss": 1.1022, + "step": 11575 + }, + { + "epoch": 2.3198396793587173, + "grad_norm": 28.58886058433422, + "learning_rate": 1.4865875592506669e-06, + "loss": 1.598, + "step": 11576 + }, + { + "epoch": 2.3200400801603207, + "grad_norm": 31.150248859785478, + "learning_rate": 1.485758123722631e-06, + "loss": 1.6739, + "step": 11577 + }, + { + "epoch": 2.3202404809619237, + "grad_norm": 23.872239861994693, + "learning_rate": 1.484928879269114e-06, + "loss": 1.4488, + "step": 11578 + }, + { + "epoch": 2.320440881763527, + "grad_norm": 24.000555425652582, + "learning_rate": 1.4840998259352023e-06, + "loss": 1.1765, + "step": 11579 + }, + { + "epoch": 2.32064128256513, + "grad_norm": 48.482604246989744, + "learning_rate": 1.483270963765973e-06, + "loss": 1.7266, + "step": 11580 + }, + { + "epoch": 2.3208416833667336, + "grad_norm": 20.850534482663786, + "learning_rate": 1.482442292806493e-06, + "loss": 1.5595, + "step": 11581 + }, + { + "epoch": 2.3210420841683366, + "grad_norm": 21.505461322743255, + "learning_rate": 1.4816138131018192e-06, + "loss": 1.6299, + "step": 11582 + }, + { + "epoch": 2.32124248496994, + "grad_norm": 20.63946943649737, + "learning_rate": 1.4807855246969943e-06, + "loss": 1.5947, + "step": 11583 + }, + { + "epoch": 2.321442885771543, + "grad_norm": 18.747300766893076, + "learning_rate": 1.4799574276370554e-06, + "loss": 1.2787, + "step": 11584 + }, + { + "epoch": 2.3216432865731464, + "grad_norm": 21.037059974599103, + "learning_rate": 
1.4791295219670277e-06, + "loss": 1.152, + "step": 11585 + }, + { + "epoch": 2.3218436873747494, + "grad_norm": 17.025897524288176, + "learning_rate": 1.4783018077319245e-06, + "loss": 1.6374, + "step": 11586 + }, + { + "epoch": 2.322044088176353, + "grad_norm": 19.519929135715923, + "learning_rate": 1.4774742849767509e-06, + "loss": 1.9985, + "step": 11587 + }, + { + "epoch": 2.322244488977956, + "grad_norm": 20.384768910282094, + "learning_rate": 1.4766469537465e-06, + "loss": 1.0839, + "step": 11588 + }, + { + "epoch": 2.3224448897795593, + "grad_norm": 26.61157046227992, + "learning_rate": 1.4758198140861562e-06, + "loss": 1.2499, + "step": 11589 + }, + { + "epoch": 2.3226452905811623, + "grad_norm": 30.07972233890713, + "learning_rate": 1.4749928660406904e-06, + "loss": 1.2565, + "step": 11590 + }, + { + "epoch": 2.3228456913827653, + "grad_norm": 24.55422651277123, + "learning_rate": 1.4741661096550658e-06, + "loss": 1.5771, + "step": 11591 + }, + { + "epoch": 2.3230460921843687, + "grad_norm": 23.743882838671706, + "learning_rate": 1.4733395449742344e-06, + "loss": 1.473, + "step": 11592 + }, + { + "epoch": 2.323246492985972, + "grad_norm": 23.02437372770642, + "learning_rate": 1.4725131720431373e-06, + "loss": 1.9872, + "step": 11593 + }, + { + "epoch": 2.323446893787575, + "grad_norm": 33.50436362760986, + "learning_rate": 1.4716869909067067e-06, + "loss": 1.4723, + "step": 11594 + }, + { + "epoch": 2.323647294589178, + "grad_norm": 19.683422567673365, + "learning_rate": 1.4708610016098623e-06, + "loss": 1.8287, + "step": 11595 + }, + { + "epoch": 2.3238476953907816, + "grad_norm": 28.697352930046062, + "learning_rate": 1.470035204197517e-06, + "loss": 1.3328, + "step": 11596 + }, + { + "epoch": 2.3240480961923846, + "grad_norm": 22.84997461590516, + "learning_rate": 1.4692095987145643e-06, + "loss": 1.5104, + "step": 11597 + }, + { + "epoch": 2.324248496993988, + "grad_norm": 17.691662466737473, + "learning_rate": 1.4683841852059011e-06, + "loss": 1.1341, + "step": 11598 + }, + { + "epoch": 2.324448897795591, + "grad_norm": 26.947651248092562, + "learning_rate": 1.4675589637164012e-06, + "loss": 1.7213, + "step": 11599 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 25.4258808340251, + "learning_rate": 1.4667339342909354e-06, + "loss": 1.0724, + "step": 11600 + }, + { + "epoch": 2.3248496993987975, + "grad_norm": 23.300753438611565, + "learning_rate": 1.4659090969743606e-06, + "loss": 1.4524, + "step": 11601 + }, + { + "epoch": 2.325050100200401, + "grad_norm": 20.17710589503898, + "learning_rate": 1.4650844518115249e-06, + "loss": 1.5666, + "step": 11602 + }, + { + "epoch": 2.325250501002004, + "grad_norm": 21.521273521771008, + "learning_rate": 1.4642599988472672e-06, + "loss": 1.6193, + "step": 11603 + }, + { + "epoch": 2.3254509018036074, + "grad_norm": 25.600634797723878, + "learning_rate": 1.4634357381264098e-06, + "loss": 1.4413, + "step": 11604 + }, + { + "epoch": 2.3256513026052104, + "grad_norm": 31.651896698179254, + "learning_rate": 1.4626116696937732e-06, + "loss": 1.098, + "step": 11605 + }, + { + "epoch": 2.325851703406814, + "grad_norm": 28.293748527893367, + "learning_rate": 1.4617877935941633e-06, + "loss": 1.5458, + "step": 11606 + }, + { + "epoch": 2.326052104208417, + "grad_norm": 26.681991335719598, + "learning_rate": 1.4609641098723709e-06, + "loss": 1.299, + "step": 11607 + }, + { + "epoch": 2.3262525050100202, + "grad_norm": 20.206892888260118, + "learning_rate": 1.460140618573187e-06, + "loss": 2.124, + "step": 11608 + }, + { + "epoch": 
2.3264529058116232, + "grad_norm": 21.418870854982323, + "learning_rate": 1.4593173197413812e-06, + "loss": 1.5056, + "step": 11609 + }, + { + "epoch": 2.3266533066132267, + "grad_norm": 48.025040828710296, + "learning_rate": 1.4584942134217195e-06, + "loss": 1.8393, + "step": 11610 + }, + { + "epoch": 2.3268537074148297, + "grad_norm": 25.641420473668, + "learning_rate": 1.4576712996589548e-06, + "loss": 1.5259, + "step": 11611 + }, + { + "epoch": 2.3270541082164327, + "grad_norm": 22.150174698881088, + "learning_rate": 1.456848578497831e-06, + "loss": 1.5274, + "step": 11612 + }, + { + "epoch": 2.327254509018036, + "grad_norm": 24.263874167127703, + "learning_rate": 1.4560260499830813e-06, + "loss": 2.1652, + "step": 11613 + }, + { + "epoch": 2.327454909819639, + "grad_norm": 26.26735877839215, + "learning_rate": 1.4552037141594239e-06, + "loss": 1.4193, + "step": 11614 + }, + { + "epoch": 2.3276553106212425, + "grad_norm": 23.480821082077792, + "learning_rate": 1.4543815710715764e-06, + "loss": 1.5144, + "step": 11615 + }, + { + "epoch": 2.3278557114228455, + "grad_norm": 24.753520809236193, + "learning_rate": 1.453559620764235e-06, + "loss": 1.4679, + "step": 11616 + }, + { + "epoch": 2.328056112224449, + "grad_norm": 23.982245613958913, + "learning_rate": 1.4527378632820915e-06, + "loss": 1.8687, + "step": 11617 + }, + { + "epoch": 2.328256513026052, + "grad_norm": 30.99475328283476, + "learning_rate": 1.4519162986698272e-06, + "loss": 1.6499, + "step": 11618 + }, + { + "epoch": 2.3284569138276554, + "grad_norm": 20.184154093362036, + "learning_rate": 1.451094926972111e-06, + "loss": 1.6559, + "step": 11619 + }, + { + "epoch": 2.3286573146292584, + "grad_norm": 31.32815467095399, + "learning_rate": 1.4502737482336038e-06, + "loss": 1.626, + "step": 11620 + }, + { + "epoch": 2.328857715430862, + "grad_norm": 28.82797480471503, + "learning_rate": 1.4494527624989496e-06, + "loss": 1.6487, + "step": 11621 + }, + { + "epoch": 2.329058116232465, + "grad_norm": 16.111584309641806, + "learning_rate": 1.4486319698127931e-06, + "loss": 1.3878, + "step": 11622 + }, + { + "epoch": 2.3292585170340683, + "grad_norm": 20.07189428201127, + "learning_rate": 1.447811370219757e-06, + "loss": 1.8093, + "step": 11623 + }, + { + "epoch": 2.3294589178356713, + "grad_norm": 54.90730810263727, + "learning_rate": 1.4469909637644602e-06, + "loss": 1.591, + "step": 11624 + }, + { + "epoch": 2.3296593186372747, + "grad_norm": 16.189381387458482, + "learning_rate": 1.4461707504915095e-06, + "loss": 1.3754, + "step": 11625 + }, + { + "epoch": 2.3298597194388777, + "grad_norm": 22.421850626976806, + "learning_rate": 1.4453507304455012e-06, + "loss": 1.8954, + "step": 11626 + }, + { + "epoch": 2.330060120240481, + "grad_norm": 20.665244637316203, + "learning_rate": 1.4445309036710225e-06, + "loss": 1.1123, + "step": 11627 + }, + { + "epoch": 2.330260521042084, + "grad_norm": 22.242262457718983, + "learning_rate": 1.4437112702126432e-06, + "loss": 1.687, + "step": 11628 + }, + { + "epoch": 2.330460921843687, + "grad_norm": 19.507999263491932, + "learning_rate": 1.4428918301149354e-06, + "loss": 1.2247, + "step": 11629 + }, + { + "epoch": 2.3306613226452906, + "grad_norm": 22.613300801067666, + "learning_rate": 1.4420725834224474e-06, + "loss": 1.6016, + "step": 11630 + }, + { + "epoch": 2.330861723446894, + "grad_norm": 24.977822293417926, + "learning_rate": 1.4412535301797242e-06, + "loss": 1.6808, + "step": 11631 + }, + { + "epoch": 2.331062124248497, + "grad_norm": 20.095398614547562, + "learning_rate": 
1.4404346704313022e-06, + "loss": 1.4744, + "step": 11632 + }, + { + "epoch": 2.3312625250501, + "grad_norm": 20.875705360390352, + "learning_rate": 1.4396160042217005e-06, + "loss": 1.6866, + "step": 11633 + }, + { + "epoch": 2.3314629258517034, + "grad_norm": 27.77373541523115, + "learning_rate": 1.4387975315954334e-06, + "loss": 1.653, + "step": 11634 + }, + { + "epoch": 2.3316633266533064, + "grad_norm": 19.67365323051254, + "learning_rate": 1.4379792525969987e-06, + "loss": 1.9876, + "step": 11635 + }, + { + "epoch": 2.33186372745491, + "grad_norm": 30.937526112090328, + "learning_rate": 1.4371611672708913e-06, + "loss": 1.8072, + "step": 11636 + }, + { + "epoch": 2.332064128256513, + "grad_norm": 22.377913646377596, + "learning_rate": 1.436343275661592e-06, + "loss": 2.1027, + "step": 11637 + }, + { + "epoch": 2.3322645290581163, + "grad_norm": 21.382223055731608, + "learning_rate": 1.4355255778135662e-06, + "loss": 1.4759, + "step": 11638 + }, + { + "epoch": 2.3324649298597193, + "grad_norm": 25.624262001157877, + "learning_rate": 1.4347080737712794e-06, + "loss": 1.8633, + "step": 11639 + }, + { + "epoch": 2.3326653306613228, + "grad_norm": 20.173952668239334, + "learning_rate": 1.433890763579176e-06, + "loss": 1.2979, + "step": 11640 + }, + { + "epoch": 2.3328657314629258, + "grad_norm": 17.275725612623194, + "learning_rate": 1.433073647281696e-06, + "loss": 1.4142, + "step": 11641 + }, + { + "epoch": 2.333066132264529, + "grad_norm": 23.331751881037903, + "learning_rate": 1.4322567249232671e-06, + "loss": 1.0355, + "step": 11642 + }, + { + "epoch": 2.333266533066132, + "grad_norm": 24.359584238636717, + "learning_rate": 1.431439996548306e-06, + "loss": 1.3296, + "step": 11643 + }, + { + "epoch": 2.3334669338677356, + "grad_norm": 16.34325261189805, + "learning_rate": 1.430623462201222e-06, + "loss": 1.2121, + "step": 11644 + }, + { + "epoch": 2.3336673346693386, + "grad_norm": 24.56114920766419, + "learning_rate": 1.4298071219264058e-06, + "loss": 1.8443, + "step": 11645 + }, + { + "epoch": 2.333867735470942, + "grad_norm": 16.84256441765611, + "learning_rate": 1.4289909757682502e-06, + "loss": 1.442, + "step": 11646 + }, + { + "epoch": 2.334068136272545, + "grad_norm": 54.138002370193995, + "learning_rate": 1.4281750237711245e-06, + "loss": 1.5703, + "step": 11647 + }, + { + "epoch": 2.3342685370741485, + "grad_norm": 23.250352206799928, + "learning_rate": 1.4273592659793956e-06, + "loss": 1.4115, + "step": 11648 + }, + { + "epoch": 2.3344689378757515, + "grad_norm": 17.964731047142923, + "learning_rate": 1.4265437024374169e-06, + "loss": 1.7919, + "step": 11649 + }, + { + "epoch": 2.3346693386773545, + "grad_norm": 22.91136261692247, + "learning_rate": 1.4257283331895316e-06, + "loss": 1.6651, + "step": 11650 + }, + { + "epoch": 2.334869739478958, + "grad_norm": 62.08152314069625, + "learning_rate": 1.4249131582800735e-06, + "loss": 1.4691, + "step": 11651 + }, + { + "epoch": 2.3350701402805614, + "grad_norm": 21.405314728109037, + "learning_rate": 1.4240981777533641e-06, + "loss": 1.5402, + "step": 11652 + }, + { + "epoch": 2.3352705410821644, + "grad_norm": 19.052447662450334, + "learning_rate": 1.4232833916537165e-06, + "loss": 2.0831, + "step": 11653 + }, + { + "epoch": 2.3354709418837674, + "grad_norm": 21.802943618180468, + "learning_rate": 1.4224688000254295e-06, + "loss": 1.193, + "step": 11654 + }, + { + "epoch": 2.335671342685371, + "grad_norm": 19.446760448304513, + "learning_rate": 1.421654402912795e-06, + "loss": 1.6264, + "step": 11655 + }, + { + "epoch": 
2.335871743486974, + "grad_norm": 18.77181446493534, + "learning_rate": 1.4208402003600919e-06, + "loss": 1.0563, + "step": 11656 + }, + { + "epoch": 2.3360721442885772, + "grad_norm": 21.413600303834382, + "learning_rate": 1.420026192411591e-06, + "loss": 1.4938, + "step": 11657 + }, + { + "epoch": 2.3362725450901802, + "grad_norm": 36.944758910724474, + "learning_rate": 1.4192123791115502e-06, + "loss": 1.4227, + "step": 11658 + }, + { + "epoch": 2.3364729458917837, + "grad_norm": 22.127070174104382, + "learning_rate": 1.4183987605042188e-06, + "loss": 1.6805, + "step": 11659 + }, + { + "epoch": 2.3366733466933867, + "grad_norm": 19.502395066332436, + "learning_rate": 1.417585336633835e-06, + "loss": 1.2729, + "step": 11660 + }, + { + "epoch": 2.33687374749499, + "grad_norm": 20.913445448978074, + "learning_rate": 1.4167721075446221e-06, + "loss": 1.3015, + "step": 11661 + }, + { + "epoch": 2.337074148296593, + "grad_norm": 25.675413877195037, + "learning_rate": 1.4159590732807988e-06, + "loss": 1.7748, + "step": 11662 + }, + { + "epoch": 2.3372745490981965, + "grad_norm": 31.671854232342568, + "learning_rate": 1.4151462338865734e-06, + "loss": 1.3617, + "step": 11663 + }, + { + "epoch": 2.3374749498997995, + "grad_norm": 30.378514565720856, + "learning_rate": 1.4143335894061378e-06, + "loss": 1.6288, + "step": 11664 + }, + { + "epoch": 2.337675350701403, + "grad_norm": 26.87390442435378, + "learning_rate": 1.413521139883678e-06, + "loss": 2.0793, + "step": 11665 + }, + { + "epoch": 2.337875751503006, + "grad_norm": 24.17901851715155, + "learning_rate": 1.412708885363368e-06, + "loss": 1.5636, + "step": 11666 + }, + { + "epoch": 2.3380761523046094, + "grad_norm": 19.771007277516915, + "learning_rate": 1.4118968258893706e-06, + "loss": 1.3672, + "step": 11667 + }, + { + "epoch": 2.3382765531062124, + "grad_norm": 22.042907864246665, + "learning_rate": 1.4110849615058397e-06, + "loss": 1.5939, + "step": 11668 + }, + { + "epoch": 2.338476953907816, + "grad_norm": 21.64203451981142, + "learning_rate": 1.4102732922569173e-06, + "loss": 1.6146, + "step": 11669 + }, + { + "epoch": 2.338677354709419, + "grad_norm": 38.98381176294614, + "learning_rate": 1.4094618181867365e-06, + "loss": 1.629, + "step": 11670 + }, + { + "epoch": 2.338877755511022, + "grad_norm": 37.01960794438628, + "learning_rate": 1.408650539339415e-06, + "loss": 1.3586, + "step": 11671 + }, + { + "epoch": 2.3390781563126253, + "grad_norm": 15.740880013419472, + "learning_rate": 1.4078394557590652e-06, + "loss": 1.1116, + "step": 11672 + }, + { + "epoch": 2.3392785571142283, + "grad_norm": 19.37593867130398, + "learning_rate": 1.4070285674897866e-06, + "loss": 1.5367, + "step": 11673 + }, + { + "epoch": 2.3394789579158317, + "grad_norm": 17.760820687737194, + "learning_rate": 1.4062178745756682e-06, + "loss": 1.4521, + "step": 11674 + }, + { + "epoch": 2.3396793587174347, + "grad_norm": 19.881151809890625, + "learning_rate": 1.4054073770607895e-06, + "loss": 1.4353, + "step": 11675 + }, + { + "epoch": 2.339879759519038, + "grad_norm": 29.021350873145416, + "learning_rate": 1.4045970749892174e-06, + "loss": 1.6713, + "step": 11676 + }, + { + "epoch": 2.340080160320641, + "grad_norm": 41.23501263746003, + "learning_rate": 1.4037869684050115e-06, + "loss": 1.9388, + "step": 11677 + }, + { + "epoch": 2.3402805611222446, + "grad_norm": 25.900796583626796, + "learning_rate": 1.4029770573522144e-06, + "loss": 1.5509, + "step": 11678 + }, + { + "epoch": 2.3404809619238476, + "grad_norm": 16.259498279506744, + "learning_rate": 
1.4021673418748649e-06, + "loss": 1.6277, + "step": 11679 + }, + { + "epoch": 2.340681362725451, + "grad_norm": 19.498847601075884, + "learning_rate": 1.4013578220169882e-06, + "loss": 1.3032, + "step": 11680 + }, + { + "epoch": 2.340881763527054, + "grad_norm": 14.89209124866891, + "learning_rate": 1.400548497822598e-06, + "loss": 1.7272, + "step": 11681 + }, + { + "epoch": 2.3410821643286575, + "grad_norm": 17.216449469171987, + "learning_rate": 1.3997393693356998e-06, + "loss": 1.8955, + "step": 11682 + }, + { + "epoch": 2.3412825651302605, + "grad_norm": 21.815358345748727, + "learning_rate": 1.3989304366002865e-06, + "loss": 1.8326, + "step": 11683 + }, + { + "epoch": 2.341482965931864, + "grad_norm": 18.08209066713806, + "learning_rate": 1.3981216996603425e-06, + "loss": 1.5849, + "step": 11684 + }, + { + "epoch": 2.341683366733467, + "grad_norm": 25.493614836859795, + "learning_rate": 1.3973131585598354e-06, + "loss": 1.3603, + "step": 11685 + }, + { + "epoch": 2.3418837675350703, + "grad_norm": 17.852949403137252, + "learning_rate": 1.3965048133427316e-06, + "loss": 1.5019, + "step": 11686 + }, + { + "epoch": 2.3420841683366733, + "grad_norm": 26.45150551418898, + "learning_rate": 1.3956966640529817e-06, + "loss": 1.7736, + "step": 11687 + }, + { + "epoch": 2.3422845691382763, + "grad_norm": 24.21478785441625, + "learning_rate": 1.3948887107345234e-06, + "loss": 2.0739, + "step": 11688 + }, + { + "epoch": 2.3424849699398798, + "grad_norm": 23.53407896218112, + "learning_rate": 1.3940809534312871e-06, + "loss": 2.0818, + "step": 11689 + }, + { + "epoch": 2.342685370741483, + "grad_norm": 17.221584939177276, + "learning_rate": 1.3932733921871927e-06, + "loss": 1.5651, + "step": 11690 + }, + { + "epoch": 2.342885771543086, + "grad_norm": 24.04634661384996, + "learning_rate": 1.3924660270461475e-06, + "loss": 1.8856, + "step": 11691 + }, + { + "epoch": 2.343086172344689, + "grad_norm": 25.779776187094757, + "learning_rate": 1.3916588580520502e-06, + "loss": 1.7369, + "step": 11692 + }, + { + "epoch": 2.3432865731462926, + "grad_norm": 12.041283545071622, + "learning_rate": 1.3908518852487869e-06, + "loss": 1.2814, + "step": 11693 + }, + { + "epoch": 2.3434869739478956, + "grad_norm": 28.10158828859461, + "learning_rate": 1.3900451086802352e-06, + "loss": 1.8113, + "step": 11694 + }, + { + "epoch": 2.343687374749499, + "grad_norm": 19.065837167819108, + "learning_rate": 1.389238528390257e-06, + "loss": 1.7389, + "step": 11695 + }, + { + "epoch": 2.343887775551102, + "grad_norm": 30.081481420610565, + "learning_rate": 1.3884321444227133e-06, + "loss": 1.7726, + "step": 11696 + }, + { + "epoch": 2.3440881763527055, + "grad_norm": 22.19462844745656, + "learning_rate": 1.3876259568214434e-06, + "loss": 1.7305, + "step": 11697 + }, + { + "epoch": 2.3442885771543085, + "grad_norm": 17.661560521245164, + "learning_rate": 1.3868199656302828e-06, + "loss": 1.6674, + "step": 11698 + }, + { + "epoch": 2.344488977955912, + "grad_norm": 21.073282799948863, + "learning_rate": 1.3860141708930536e-06, + "loss": 1.666, + "step": 11699 + }, + { + "epoch": 2.344689378757515, + "grad_norm": 17.92569369888599, + "learning_rate": 1.3852085726535692e-06, + "loss": 1.8354, + "step": 11700 + }, + { + "epoch": 2.3448897795591184, + "grad_norm": 24.769687860834843, + "learning_rate": 1.3844031709556322e-06, + "loss": 1.7051, + "step": 11701 + }, + { + "epoch": 2.3450901803607214, + "grad_norm": 46.95558565857864, + "learning_rate": 1.3835979658430283e-06, + "loss": 1.8079, + "step": 11702 + }, + { + "epoch": 
2.345290581162325, + "grad_norm": 19.838536767657526, + "learning_rate": 1.382792957359545e-06, + "loss": 2.0331, + "step": 11703 + }, + { + "epoch": 2.345490981963928, + "grad_norm": 21.30230223729211, + "learning_rate": 1.381988145548946e-06, + "loss": 1.3172, + "step": 11704 + }, + { + "epoch": 2.3456913827655312, + "grad_norm": 24.6184723868322, + "learning_rate": 1.3811835304549926e-06, + "loss": 1.8166, + "step": 11705 + }, + { + "epoch": 2.3458917835671342, + "grad_norm": 21.429366214878485, + "learning_rate": 1.3803791121214321e-06, + "loss": 1.6325, + "step": 11706 + }, + { + "epoch": 2.3460921843687377, + "grad_norm": 25.311725407175736, + "learning_rate": 1.3795748905920026e-06, + "loss": 1.8808, + "step": 11707 + }, + { + "epoch": 2.3462925851703407, + "grad_norm": 24.110504708234423, + "learning_rate": 1.378770865910432e-06, + "loss": 1.2486, + "step": 11708 + }, + { + "epoch": 2.3464929859719437, + "grad_norm": 19.235146972004728, + "learning_rate": 1.3779670381204318e-06, + "loss": 1.2287, + "step": 11709 + }, + { + "epoch": 2.346693386773547, + "grad_norm": 32.04368053674976, + "learning_rate": 1.3771634072657137e-06, + "loss": 1.0877, + "step": 11710 + }, + { + "epoch": 2.3468937875751505, + "grad_norm": 20.73287630644812, + "learning_rate": 1.3763599733899668e-06, + "loss": 1.6235, + "step": 11711 + }, + { + "epoch": 2.3470941883767535, + "grad_norm": 20.534003539307445, + "learning_rate": 1.3755567365368761e-06, + "loss": 1.0224, + "step": 11712 + }, + { + "epoch": 2.3472945891783565, + "grad_norm": 25.31116294464634, + "learning_rate": 1.374753696750119e-06, + "loss": 1.7948, + "step": 11713 + }, + { + "epoch": 2.34749498997996, + "grad_norm": 18.508165687171918, + "learning_rate": 1.3739508540733525e-06, + "loss": 0.8257, + "step": 11714 + }, + { + "epoch": 2.347695390781563, + "grad_norm": 24.595098812858993, + "learning_rate": 1.3731482085502328e-06, + "loss": 1.5993, + "step": 11715 + }, + { + "epoch": 2.3478957915831664, + "grad_norm": 18.644138729607263, + "learning_rate": 1.3723457602243955e-06, + "loss": 1.6994, + "step": 11716 + }, + { + "epoch": 2.3480961923847694, + "grad_norm": 26.433625516715274, + "learning_rate": 1.3715435091394757e-06, + "loss": 1.6433, + "step": 11717 + }, + { + "epoch": 2.348296593186373, + "grad_norm": 24.586487876414008, + "learning_rate": 1.370741455339093e-06, + "loss": 1.6194, + "step": 11718 + }, + { + "epoch": 2.348496993987976, + "grad_norm": 17.047910110974147, + "learning_rate": 1.3699395988668513e-06, + "loss": 1.5224, + "step": 11719 + }, + { + "epoch": 2.3486973947895793, + "grad_norm": 16.474459162630172, + "learning_rate": 1.3691379397663552e-06, + "loss": 1.316, + "step": 11720 + }, + { + "epoch": 2.3488977955911823, + "grad_norm": 18.538093536723487, + "learning_rate": 1.3683364780811875e-06, + "loss": 1.5531, + "step": 11721 + }, + { + "epoch": 2.3490981963927857, + "grad_norm": 19.53249304324662, + "learning_rate": 1.3675352138549264e-06, + "loss": 1.3462, + "step": 11722 + }, + { + "epoch": 2.3492985971943887, + "grad_norm": 19.599702350025456, + "learning_rate": 1.3667341471311373e-06, + "loss": 1.7049, + "step": 11723 + }, + { + "epoch": 2.349498997995992, + "grad_norm": 22.382041170069964, + "learning_rate": 1.3659332779533763e-06, + "loss": 1.2397, + "step": 11724 + }, + { + "epoch": 2.349699398797595, + "grad_norm": 33.885583137438296, + "learning_rate": 1.3651326063651887e-06, + "loss": 1.4396, + "step": 11725 + }, + { + "epoch": 2.3498997995991986, + "grad_norm": 15.552878863642375, + "learning_rate": 
1.3643321324101038e-06, + "loss": 1.5791, + "step": 11726 + }, + { + "epoch": 2.3501002004008016, + "grad_norm": 20.338919710278315, + "learning_rate": 1.3635318561316508e-06, + "loss": 1.0627, + "step": 11727 + }, + { + "epoch": 2.350300601202405, + "grad_norm": 28.58447984274875, + "learning_rate": 1.3627317775733374e-06, + "loss": 1.5893, + "step": 11728 + }, + { + "epoch": 2.350501002004008, + "grad_norm": 21.618831469180112, + "learning_rate": 1.361931896778666e-06, + "loss": 1.7139, + "step": 11729 + }, + { + "epoch": 2.350701402805611, + "grad_norm": 19.393197992834804, + "learning_rate": 1.3611322137911275e-06, + "loss": 1.3926, + "step": 11730 + }, + { + "epoch": 2.3509018036072145, + "grad_norm": 24.858901528723973, + "learning_rate": 1.3603327286542024e-06, + "loss": 1.2568, + "step": 11731 + }, + { + "epoch": 2.3511022044088175, + "grad_norm": 18.99817358970955, + "learning_rate": 1.3595334414113604e-06, + "loss": 1.4091, + "step": 11732 + }, + { + "epoch": 2.351302605210421, + "grad_norm": 20.279659825731947, + "learning_rate": 1.3587343521060565e-06, + "loss": 1.8577, + "step": 11733 + }, + { + "epoch": 2.351503006012024, + "grad_norm": 17.368204038569214, + "learning_rate": 1.3579354607817435e-06, + "loss": 1.3839, + "step": 11734 + }, + { + "epoch": 2.3517034068136273, + "grad_norm": 18.98283355822935, + "learning_rate": 1.3571367674818548e-06, + "loss": 1.829, + "step": 11735 + }, + { + "epoch": 2.3519038076152303, + "grad_norm": 24.387607306908983, + "learning_rate": 1.3563382722498169e-06, + "loss": 1.7786, + "step": 11736 + }, + { + "epoch": 2.3521042084168338, + "grad_norm": 20.634622424007873, + "learning_rate": 1.3555399751290455e-06, + "loss": 1.5013, + "step": 11737 + }, + { + "epoch": 2.3523046092184368, + "grad_norm": 19.213079786131043, + "learning_rate": 1.354741876162946e-06, + "loss": 1.4567, + "step": 11738 + }, + { + "epoch": 2.35250501002004, + "grad_norm": 18.167314338332087, + "learning_rate": 1.3539439753949114e-06, + "loss": 1.1496, + "step": 11739 + }, + { + "epoch": 2.352705410821643, + "grad_norm": 18.024931043253467, + "learning_rate": 1.3531462728683243e-06, + "loss": 1.3296, + "step": 11740 + }, + { + "epoch": 2.3529058116232466, + "grad_norm": 26.991956673855807, + "learning_rate": 1.3523487686265596e-06, + "loss": 1.4623, + "step": 11741 + }, + { + "epoch": 2.3531062124248496, + "grad_norm": 33.250680966748284, + "learning_rate": 1.3515514627129756e-06, + "loss": 1.4763, + "step": 11742 + }, + { + "epoch": 2.353306613226453, + "grad_norm": 21.708598421488908, + "learning_rate": 1.3507543551709223e-06, + "loss": 1.9029, + "step": 11743 + }, + { + "epoch": 2.353507014028056, + "grad_norm": 16.042902848299537, + "learning_rate": 1.3499574460437442e-06, + "loss": 1.3862, + "step": 11744 + }, + { + "epoch": 2.3537074148296595, + "grad_norm": 25.09010552442444, + "learning_rate": 1.3491607353747665e-06, + "loss": 1.756, + "step": 11745 + }, + { + "epoch": 2.3539078156312625, + "grad_norm": 15.838935081483426, + "learning_rate": 1.348364223207309e-06, + "loss": 1.1824, + "step": 11746 + }, + { + "epoch": 2.3541082164328655, + "grad_norm": 22.4291810543676, + "learning_rate": 1.3475679095846789e-06, + "loss": 1.2825, + "step": 11747 + }, + { + "epoch": 2.354308617234469, + "grad_norm": 19.340447092073664, + "learning_rate": 1.346771794550173e-06, + "loss": 1.4318, + "step": 11748 + }, + { + "epoch": 2.3545090180360724, + "grad_norm": 16.091816755030738, + "learning_rate": 1.3459758781470782e-06, + "loss": 1.2653, + "step": 11749 + }, + { + "epoch": 
2.3547094188376754, + "grad_norm": 16.493062774807484, + "learning_rate": 1.3451801604186664e-06, + "loss": 1.2645, + "step": 11750 + }, + { + "epoch": 2.3549098196392784, + "grad_norm": 19.944831986205184, + "learning_rate": 1.3443846414082073e-06, + "loss": 1.808, + "step": 11751 + }, + { + "epoch": 2.355110220440882, + "grad_norm": 22.57347256682962, + "learning_rate": 1.3435893211589496e-06, + "loss": 2.1372, + "step": 11752 + }, + { + "epoch": 2.355310621242485, + "grad_norm": 24.338399241444048, + "learning_rate": 1.342794199714138e-06, + "loss": 1.8647, + "step": 11753 + }, + { + "epoch": 2.3555110220440882, + "grad_norm": 18.143438666448567, + "learning_rate": 1.3419992771170043e-06, + "loss": 1.3225, + "step": 11754 + }, + { + "epoch": 2.3557114228456912, + "grad_norm": 26.642498053108152, + "learning_rate": 1.3412045534107698e-06, + "loss": 1.0619, + "step": 11755 + }, + { + "epoch": 2.3559118236472947, + "grad_norm": 23.05364796163384, + "learning_rate": 1.3404100286386445e-06, + "loss": 1.3733, + "step": 11756 + }, + { + "epoch": 2.3561122244488977, + "grad_norm": 18.13363642458672, + "learning_rate": 1.3396157028438278e-06, + "loss": 1.4515, + "step": 11757 + }, + { + "epoch": 2.356312625250501, + "grad_norm": 21.5338478389594, + "learning_rate": 1.33882157606951e-06, + "loss": 1.6657, + "step": 11758 + }, + { + "epoch": 2.356513026052104, + "grad_norm": 24.952986228637226, + "learning_rate": 1.3380276483588668e-06, + "loss": 1.9635, + "step": 11759 + }, + { + "epoch": 2.3567134268537075, + "grad_norm": 22.282768691932922, + "learning_rate": 1.3372339197550649e-06, + "loss": 1.3343, + "step": 11760 + }, + { + "epoch": 2.3569138276553105, + "grad_norm": 20.407715990843492, + "learning_rate": 1.3364403903012619e-06, + "loss": 1.1495, + "step": 11761 + }, + { + "epoch": 2.357114228456914, + "grad_norm": 17.67710905368753, + "learning_rate": 1.335647060040603e-06, + "loss": 1.6303, + "step": 11762 + }, + { + "epoch": 2.357314629258517, + "grad_norm": 19.188058198363834, + "learning_rate": 1.3348539290162221e-06, + "loss": 1.5371, + "step": 11763 + }, + { + "epoch": 2.3575150300601204, + "grad_norm": 15.21768853729312, + "learning_rate": 1.3340609972712438e-06, + "loss": 1.4549, + "step": 11764 + }, + { + "epoch": 2.3577154308617234, + "grad_norm": 29.77405241838732, + "learning_rate": 1.3332682648487815e-06, + "loss": 1.3386, + "step": 11765 + }, + { + "epoch": 2.357915831663327, + "grad_norm": 61.77043018418187, + "learning_rate": 1.3324757317919336e-06, + "loss": 1.9177, + "step": 11766 + }, + { + "epoch": 2.35811623246493, + "grad_norm": 33.280997254924955, + "learning_rate": 1.3316833981437972e-06, + "loss": 1.9513, + "step": 11767 + }, + { + "epoch": 2.358316633266533, + "grad_norm": 24.12393699185778, + "learning_rate": 1.3308912639474474e-06, + "loss": 1.9024, + "step": 11768 + }, + { + "epoch": 2.3585170340681363, + "grad_norm": 29.74897198141653, + "learning_rate": 1.3300993292459557e-06, + "loss": 1.7233, + "step": 11769 + }, + { + "epoch": 2.3587174348697397, + "grad_norm": 21.6803901526912, + "learning_rate": 1.3293075940823807e-06, + "loss": 1.5648, + "step": 11770 + }, + { + "epoch": 2.3589178356713427, + "grad_norm": 22.31422546889386, + "learning_rate": 1.3285160584997703e-06, + "loss": 1.6679, + "step": 11771 + }, + { + "epoch": 2.3591182364729457, + "grad_norm": 19.770665001234935, + "learning_rate": 1.3277247225411633e-06, + "loss": 1.1921, + "step": 11772 + }, + { + "epoch": 2.359318637274549, + "grad_norm": 45.05972815819757, + "learning_rate": 
1.3269335862495808e-06, + "loss": 1.7922, + "step": 11773 + }, + { + "epoch": 2.359519038076152, + "grad_norm": 23.893336417538276, + "learning_rate": 1.3261426496680423e-06, + "loss": 1.9014, + "step": 11774 + }, + { + "epoch": 2.3597194388777556, + "grad_norm": 27.364158997052385, + "learning_rate": 1.3253519128395525e-06, + "loss": 1.4363, + "step": 11775 + }, + { + "epoch": 2.3599198396793586, + "grad_norm": 22.246524222990992, + "learning_rate": 1.3245613758071025e-06, + "loss": 1.6314, + "step": 11776 + }, + { + "epoch": 2.360120240480962, + "grad_norm": 19.87402891784427, + "learning_rate": 1.3237710386136755e-06, + "loss": 1.7934, + "step": 11777 + }, + { + "epoch": 2.360320641282565, + "grad_norm": 23.518779063873104, + "learning_rate": 1.3229809013022437e-06, + "loss": 1.3666, + "step": 11778 + }, + { + "epoch": 2.3605210420841685, + "grad_norm": 23.554103637394487, + "learning_rate": 1.3221909639157681e-06, + "loss": 1.5856, + "step": 11779 + }, + { + "epoch": 2.3607214428857715, + "grad_norm": 21.989500238618476, + "learning_rate": 1.3214012264971992e-06, + "loss": 1.8165, + "step": 11780 + }, + { + "epoch": 2.360921843687375, + "grad_norm": 41.65175053642405, + "learning_rate": 1.3206116890894755e-06, + "loss": 1.6368, + "step": 11781 + }, + { + "epoch": 2.361122244488978, + "grad_norm": 22.007315590196963, + "learning_rate": 1.319822351735527e-06, + "loss": 1.2879, + "step": 11782 + }, + { + "epoch": 2.3613226452905813, + "grad_norm": 17.875829937110947, + "learning_rate": 1.3190332144782663e-06, + "loss": 1.3126, + "step": 11783 + }, + { + "epoch": 2.3615230460921843, + "grad_norm": 19.14255081420717, + "learning_rate": 1.3182442773606068e-06, + "loss": 1.74, + "step": 11784 + }, + { + "epoch": 2.3617234468937873, + "grad_norm": 28.132437425460104, + "learning_rate": 1.3174555404254392e-06, + "loss": 1.7437, + "step": 11785 + }, + { + "epoch": 2.3619238476953908, + "grad_norm": 21.44976699307646, + "learning_rate": 1.3166670037156493e-06, + "loss": 1.5504, + "step": 11786 + }, + { + "epoch": 2.362124248496994, + "grad_norm": 29.642711799880626, + "learning_rate": 1.315878667274112e-06, + "loss": 1.2159, + "step": 11787 + }, + { + "epoch": 2.362324649298597, + "grad_norm": 22.906729666107275, + "learning_rate": 1.3150905311436896e-06, + "loss": 1.6964, + "step": 11788 + }, + { + "epoch": 2.3625250501002, + "grad_norm": 23.2087557023247, + "learning_rate": 1.314302595367236e-06, + "loss": 1.608, + "step": 11789 + }, + { + "epoch": 2.3627254509018036, + "grad_norm": 20.588455654467484, + "learning_rate": 1.313514859987588e-06, + "loss": 1.6032, + "step": 11790 + }, + { + "epoch": 2.3629258517034066, + "grad_norm": 24.77088245872098, + "learning_rate": 1.312727325047582e-06, + "loss": 1.7443, + "step": 11791 + }, + { + "epoch": 2.36312625250501, + "grad_norm": 18.49374992939089, + "learning_rate": 1.311939990590032e-06, + "loss": 0.9378, + "step": 11792 + }, + { + "epoch": 2.363326653306613, + "grad_norm": 15.286094966154389, + "learning_rate": 1.3111528566577498e-06, + "loss": 1.4748, + "step": 11793 + }, + { + "epoch": 2.3635270541082165, + "grad_norm": 18.53463308077324, + "learning_rate": 1.3103659232935311e-06, + "loss": 1.4483, + "step": 11794 + }, + { + "epoch": 2.3637274549098195, + "grad_norm": 21.56121029718523, + "learning_rate": 1.309579190540164e-06, + "loss": 1.8496, + "step": 11795 + }, + { + "epoch": 2.363927855711423, + "grad_norm": 17.006799233886404, + "learning_rate": 1.3087926584404253e-06, + "loss": 1.7461, + "step": 11796 + }, + { + "epoch": 
2.364128256513026, + "grad_norm": 21.581579842725628, + "learning_rate": 1.3080063270370757e-06, + "loss": 1.7264, + "step": 11797 + }, + { + "epoch": 2.3643286573146294, + "grad_norm": 27.8140912720883, + "learning_rate": 1.3072201963728749e-06, + "loss": 1.8924, + "step": 11798 + }, + { + "epoch": 2.3645290581162324, + "grad_norm": 24.128325816541942, + "learning_rate": 1.3064342664905611e-06, + "loss": 1.6558, + "step": 11799 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 19.34550298768265, + "learning_rate": 1.3056485374328675e-06, + "loss": 1.511, + "step": 11800 + }, + { + "epoch": 2.364929859719439, + "grad_norm": 27.95285766238541, + "learning_rate": 1.304863009242519e-06, + "loss": 1.4549, + "step": 11801 + }, + { + "epoch": 2.3651302605210422, + "grad_norm": 33.66613634212282, + "learning_rate": 1.304077681962222e-06, + "loss": 1.5235, + "step": 11802 + }, + { + "epoch": 2.3653306613226452, + "grad_norm": 23.72987111004241, + "learning_rate": 1.3032925556346766e-06, + "loss": 1.7642, + "step": 11803 + }, + { + "epoch": 2.3655310621242487, + "grad_norm": 22.697865959071517, + "learning_rate": 1.3025076303025718e-06, + "loss": 1.5457, + "step": 11804 + }, + { + "epoch": 2.3657314629258517, + "grad_norm": 50.906629326461335, + "learning_rate": 1.3017229060085856e-06, + "loss": 1.8045, + "step": 11805 + }, + { + "epoch": 2.3659318637274547, + "grad_norm": 27.415684485169255, + "learning_rate": 1.300938382795386e-06, + "loss": 2.0047, + "step": 11806 + }, + { + "epoch": 2.366132264529058, + "grad_norm": 23.710271032254322, + "learning_rate": 1.300154060705623e-06, + "loss": 1.5167, + "step": 11807 + }, + { + "epoch": 2.3663326653306616, + "grad_norm": 18.58781346274777, + "learning_rate": 1.2993699397819487e-06, + "loss": 1.727, + "step": 11808 + }, + { + "epoch": 2.3665330661322646, + "grad_norm": 16.491468888345295, + "learning_rate": 1.2985860200669915e-06, + "loss": 1.8123, + "step": 11809 + }, + { + "epoch": 2.3667334669338675, + "grad_norm": 21.74093650945693, + "learning_rate": 1.2978023016033764e-06, + "loss": 1.422, + "step": 11810 + }, + { + "epoch": 2.366933867735471, + "grad_norm": 25.541145912673123, + "learning_rate": 1.2970187844337152e-06, + "loss": 1.28, + "step": 11811 + }, + { + "epoch": 2.367134268537074, + "grad_norm": 19.889545636114256, + "learning_rate": 1.2962354686006085e-06, + "loss": 1.3327, + "step": 11812 + }, + { + "epoch": 2.3673346693386774, + "grad_norm": 20.906890594715467, + "learning_rate": 1.2954523541466486e-06, + "loss": 1.7429, + "step": 11813 + }, + { + "epoch": 2.3675350701402804, + "grad_norm": 18.373893994363733, + "learning_rate": 1.294669441114409e-06, + "loss": 1.7149, + "step": 11814 + }, + { + "epoch": 2.367735470941884, + "grad_norm": 19.213125219161363, + "learning_rate": 1.2938867295464646e-06, + "loss": 1.8245, + "step": 11815 + }, + { + "epoch": 2.367935871743487, + "grad_norm": 25.997211386608118, + "learning_rate": 1.2931042194853682e-06, + "loss": 1.3298, + "step": 11816 + }, + { + "epoch": 2.3681362725450903, + "grad_norm": 22.764235641142996, + "learning_rate": 1.2923219109736673e-06, + "loss": 1.5353, + "step": 11817 + }, + { + "epoch": 2.3683366733466933, + "grad_norm": 19.176788533369702, + "learning_rate": 1.2915398040538974e-06, + "loss": 1.454, + "step": 11818 + }, + { + "epoch": 2.3685370741482967, + "grad_norm": 21.15148799917661, + "learning_rate": 1.290757898768582e-06, + "loss": 1.8482, + "step": 11819 + }, + { + "epoch": 2.3687374749498997, + "grad_norm": 18.83590547995802, + "learning_rate": 
1.2899761951602363e-06, + "loss": 1.17, + "step": 11820 + }, + { + "epoch": 2.368937875751503, + "grad_norm": 24.73966016673133, + "learning_rate": 1.2891946932713589e-06, + "loss": 1.4518, + "step": 11821 + }, + { + "epoch": 2.369138276553106, + "grad_norm": 26.229751205944257, + "learning_rate": 1.2884133931444464e-06, + "loss": 1.8435, + "step": 11822 + }, + { + "epoch": 2.3693386773547096, + "grad_norm": 41.43626532148619, + "learning_rate": 1.2876322948219754e-06, + "loss": 1.5806, + "step": 11823 + }, + { + "epoch": 2.3695390781563126, + "grad_norm": 25.032195646680094, + "learning_rate": 1.2868513983464149e-06, + "loss": 1.5071, + "step": 11824 + }, + { + "epoch": 2.369739478957916, + "grad_norm": 25.58557684183001, + "learning_rate": 1.2860707037602277e-06, + "loss": 1.3554, + "step": 11825 + }, + { + "epoch": 2.369939879759519, + "grad_norm": 18.494572281536865, + "learning_rate": 1.2852902111058568e-06, + "loss": 1.4113, + "step": 11826 + }, + { + "epoch": 2.370140280561122, + "grad_norm": 19.93355897688422, + "learning_rate": 1.2845099204257411e-06, + "loss": 1.6658, + "step": 11827 + }, + { + "epoch": 2.3703406813627255, + "grad_norm": 23.290223897957123, + "learning_rate": 1.2837298317623053e-06, + "loss": 1.7445, + "step": 11828 + }, + { + "epoch": 2.370541082164329, + "grad_norm": 27.491242126312603, + "learning_rate": 1.2829499451579647e-06, + "loss": 1.6293, + "step": 11829 + }, + { + "epoch": 2.370741482965932, + "grad_norm": 24.904277254418417, + "learning_rate": 1.2821702606551233e-06, + "loss": 1.3467, + "step": 11830 + }, + { + "epoch": 2.370941883767535, + "grad_norm": 21.555037796250094, + "learning_rate": 1.2813907782961705e-06, + "loss": 2.034, + "step": 11831 + }, + { + "epoch": 2.3711422845691383, + "grad_norm": 19.848556042485313, + "learning_rate": 1.280611498123493e-06, + "loss": 1.9486, + "step": 11832 + }, + { + "epoch": 2.3713426853707413, + "grad_norm": 21.668597616895827, + "learning_rate": 1.2798324201794571e-06, + "loss": 1.2911, + "step": 11833 + }, + { + "epoch": 2.3715430861723448, + "grad_norm": 16.606214469734784, + "learning_rate": 1.2790535445064238e-06, + "loss": 1.5539, + "step": 11834 + }, + { + "epoch": 2.3717434869739478, + "grad_norm": 30.632961245791375, + "learning_rate": 1.2782748711467424e-06, + "loss": 1.3734, + "step": 11835 + }, + { + "epoch": 2.371943887775551, + "grad_norm": 18.007939776980532, + "learning_rate": 1.2774964001427497e-06, + "loss": 0.9891, + "step": 11836 + }, + { + "epoch": 2.372144288577154, + "grad_norm": 22.756987375610873, + "learning_rate": 1.2767181315367726e-06, + "loss": 1.9516, + "step": 11837 + }, + { + "epoch": 2.3723446893787576, + "grad_norm": 26.197653617131497, + "learning_rate": 1.2759400653711267e-06, + "loss": 1.496, + "step": 11838 + }, + { + "epoch": 2.3725450901803606, + "grad_norm": 21.560235232861025, + "learning_rate": 1.2751622016881182e-06, + "loss": 1.272, + "step": 11839 + }, + { + "epoch": 2.372745490981964, + "grad_norm": 23.004177771997522, + "learning_rate": 1.2743845405300375e-06, + "loss": 1.6506, + "step": 11840 + }, + { + "epoch": 2.372945891783567, + "grad_norm": 22.36547708859883, + "learning_rate": 1.2736070819391693e-06, + "loss": 1.4586, + "step": 11841 + }, + { + "epoch": 2.3731462925851705, + "grad_norm": 19.6215070402828, + "learning_rate": 1.2728298259577848e-06, + "loss": 1.8308, + "step": 11842 + }, + { + "epoch": 2.3733466933867735, + "grad_norm": 18.104131737637513, + "learning_rate": 1.272052772628144e-06, + "loss": 1.6145, + "step": 11843 + }, + { + "epoch": 
2.3735470941883765, + "grad_norm": 38.19210648552495, + "learning_rate": 1.2712759219924969e-06, + "loss": 1.7556, + "step": 11844 + }, + { + "epoch": 2.37374749498998, + "grad_norm": 33.06714707583752, + "learning_rate": 1.2704992740930822e-06, + "loss": 1.7816, + "step": 11845 + }, + { + "epoch": 2.3739478957915834, + "grad_norm": 24.70228168581387, + "learning_rate": 1.2697228289721292e-06, + "loss": 1.997, + "step": 11846 + }, + { + "epoch": 2.3741482965931864, + "grad_norm": 17.7954742945482, + "learning_rate": 1.2689465866718503e-06, + "loss": 1.834, + "step": 11847 + }, + { + "epoch": 2.3743486973947894, + "grad_norm": 19.00379185059542, + "learning_rate": 1.268170547234453e-06, + "loss": 1.671, + "step": 11848 + }, + { + "epoch": 2.374549098196393, + "grad_norm": 26.14045412017584, + "learning_rate": 1.2673947107021317e-06, + "loss": 0.9877, + "step": 11849 + }, + { + "epoch": 2.374749498997996, + "grad_norm": 21.938501347595142, + "learning_rate": 1.26661907711707e-06, + "loss": 1.7548, + "step": 11850 + }, + { + "epoch": 2.3749498997995993, + "grad_norm": 19.975664187903167, + "learning_rate": 1.2658436465214402e-06, + "loss": 1.3932, + "step": 11851 + }, + { + "epoch": 2.3751503006012022, + "grad_norm": 22.989438414299727, + "learning_rate": 1.265068418957403e-06, + "loss": 1.7693, + "step": 11852 + }, + { + "epoch": 2.3753507014028057, + "grad_norm": 28.821435520540543, + "learning_rate": 1.264293394467111e-06, + "loss": 1.45, + "step": 11853 + }, + { + "epoch": 2.3755511022044087, + "grad_norm": 19.89251849667579, + "learning_rate": 1.2635185730926986e-06, + "loss": 1.4557, + "step": 11854 + }, + { + "epoch": 2.375751503006012, + "grad_norm": 19.855691324910854, + "learning_rate": 1.2627439548762981e-06, + "loss": 1.2821, + "step": 11855 + }, + { + "epoch": 2.375951903807615, + "grad_norm": 23.76452086696131, + "learning_rate": 1.2619695398600274e-06, + "loss": 1.3189, + "step": 11856 + }, + { + "epoch": 2.3761523046092186, + "grad_norm": 26.066776381647845, + "learning_rate": 1.261195328085989e-06, + "loss": 1.8018, + "step": 11857 + }, + { + "epoch": 2.3763527054108216, + "grad_norm": 32.0054261020524, + "learning_rate": 1.2604213195962794e-06, + "loss": 1.2599, + "step": 11858 + }, + { + "epoch": 2.376553106212425, + "grad_norm": 22.73756209489899, + "learning_rate": 1.2596475144329829e-06, + "loss": 1.2014, + "step": 11859 + }, + { + "epoch": 2.376753507014028, + "grad_norm": 18.394505406189786, + "learning_rate": 1.2588739126381721e-06, + "loss": 1.5993, + "step": 11860 + }, + { + "epoch": 2.3769539078156314, + "grad_norm": 23.253726621862686, + "learning_rate": 1.2581005142539094e-06, + "loss": 1.9233, + "step": 11861 + }, + { + "epoch": 2.3771543086172344, + "grad_norm": 23.834769798678533, + "learning_rate": 1.2573273193222451e-06, + "loss": 1.9623, + "step": 11862 + }, + { + "epoch": 2.377354709418838, + "grad_norm": 16.266308237561457, + "learning_rate": 1.256554327885221e-06, + "loss": 1.3481, + "step": 11863 + }, + { + "epoch": 2.377555110220441, + "grad_norm": 46.29971696677659, + "learning_rate": 1.255781539984862e-06, + "loss": 1.5171, + "step": 11864 + }, + { + "epoch": 2.377755511022044, + "grad_norm": 21.593754692360683, + "learning_rate": 1.2550089556631879e-06, + "loss": 1.4315, + "step": 11865 + }, + { + "epoch": 2.3779559118236473, + "grad_norm": 18.693777796728995, + "learning_rate": 1.2542365749622048e-06, + "loss": 1.6412, + "step": 11866 + }, + { + "epoch": 2.3781563126252507, + "grad_norm": 23.809561199764662, + "learning_rate": 
1.253464397923908e-06, + "loss": 1.5081, + "step": 11867 + }, + { + "epoch": 2.3783567134268537, + "grad_norm": 19.348399667828016, + "learning_rate": 1.2526924245902832e-06, + "loss": 1.8066, + "step": 11868 + }, + { + "epoch": 2.3785571142284567, + "grad_norm": 35.062873155774646, + "learning_rate": 1.251920655003302e-06, + "loss": 2.0863, + "step": 11869 + }, + { + "epoch": 2.37875751503006, + "grad_norm": 25.64304128958473, + "learning_rate": 1.2511490892049293e-06, + "loss": 1.5518, + "step": 11870 + }, + { + "epoch": 2.378957915831663, + "grad_norm": 25.384052138585428, + "learning_rate": 1.2503777272371115e-06, + "loss": 1.5375, + "step": 11871 + }, + { + "epoch": 2.3791583166332666, + "grad_norm": 35.21076839405316, + "learning_rate": 1.2496065691417947e-06, + "loss": 1.8222, + "step": 11872 + }, + { + "epoch": 2.3793587174348696, + "grad_norm": 15.323283783858141, + "learning_rate": 1.2488356149609032e-06, + "loss": 1.4147, + "step": 11873 + }, + { + "epoch": 2.379559118236473, + "grad_norm": 18.623261542374763, + "learning_rate": 1.2480648647363564e-06, + "loss": 1.3666, + "step": 11874 + }, + { + "epoch": 2.379759519038076, + "grad_norm": 22.726841154235913, + "learning_rate": 1.247294318510061e-06, + "loss": 1.7031, + "step": 11875 + }, + { + "epoch": 2.3799599198396795, + "grad_norm": 19.01496747465213, + "learning_rate": 1.246523976323914e-06, + "loss": 1.496, + "step": 11876 + }, + { + "epoch": 2.3801603206412825, + "grad_norm": 27.143784612634228, + "learning_rate": 1.2457538382197998e-06, + "loss": 1.027, + "step": 11877 + }, + { + "epoch": 2.380360721442886, + "grad_norm": 15.064137343030438, + "learning_rate": 1.2449839042395883e-06, + "loss": 1.4823, + "step": 11878 + }, + { + "epoch": 2.380561122244489, + "grad_norm": 15.401907496013395, + "learning_rate": 1.244214174425148e-06, + "loss": 1.0329, + "step": 11879 + }, + { + "epoch": 2.3807615230460923, + "grad_norm": 20.87185885585547, + "learning_rate": 1.2434446488183249e-06, + "loss": 1.8081, + "step": 11880 + }, + { + "epoch": 2.3809619238476953, + "grad_norm": 33.122381598401375, + "learning_rate": 1.242675327460961e-06, + "loss": 2.2305, + "step": 11881 + }, + { + "epoch": 2.381162324649299, + "grad_norm": 18.473463531263537, + "learning_rate": 1.241906210394888e-06, + "loss": 1.4973, + "step": 11882 + }, + { + "epoch": 2.3813627254509018, + "grad_norm": 23.417971661608526, + "learning_rate": 1.2411372976619206e-06, + "loss": 1.4566, + "step": 11883 + }, + { + "epoch": 2.381563126252505, + "grad_norm": 22.74819064860148, + "learning_rate": 1.2403685893038685e-06, + "loss": 1.3704, + "step": 11884 + }, + { + "epoch": 2.381763527054108, + "grad_norm": 21.60225295108533, + "learning_rate": 1.2396000853625228e-06, + "loss": 1.2545, + "step": 11885 + }, + { + "epoch": 2.381963927855711, + "grad_norm": 26.07475068038079, + "learning_rate": 1.2388317858796728e-06, + "loss": 1.5636, + "step": 11886 + }, + { + "epoch": 2.3821643286573146, + "grad_norm": 19.357665743408194, + "learning_rate": 1.2380636908970922e-06, + "loss": 1.3694, + "step": 11887 + }, + { + "epoch": 2.382364729458918, + "grad_norm": 23.840438213017162, + "learning_rate": 1.237295800456539e-06, + "loss": 1.7228, + "step": 11888 + }, + { + "epoch": 2.382565130260521, + "grad_norm": 19.001904038749736, + "learning_rate": 1.2365281145997703e-06, + "loss": 1.5149, + "step": 11889 + }, + { + "epoch": 2.382765531062124, + "grad_norm": 21.5160528558854, + "learning_rate": 1.2357606333685217e-06, + "loss": 1.7474, + "step": 11890 + }, + { + "epoch": 
2.3829659318637275, + "grad_norm": 21.709327308022715, + "learning_rate": 1.2349933568045242e-06, + "loss": 1.771, + "step": 11891 + }, + { + "epoch": 2.3831663326653305, + "grad_norm": 14.902441484702722, + "learning_rate": 1.2342262849494958e-06, + "loss": 1.1902, + "step": 11892 + }, + { + "epoch": 2.383366733466934, + "grad_norm": 34.80098328986112, + "learning_rate": 1.2334594178451426e-06, + "loss": 1.7521, + "step": 11893 + }, + { + "epoch": 2.383567134268537, + "grad_norm": 21.947199035988408, + "learning_rate": 1.2326927555331619e-06, + "loss": 1.5546, + "step": 11894 + }, + { + "epoch": 2.3837675350701404, + "grad_norm": 23.97004129990202, + "learning_rate": 1.231926298055235e-06, + "loss": 1.4393, + "step": 11895 + }, + { + "epoch": 2.3839679358717434, + "grad_norm": 22.445316852116385, + "learning_rate": 1.2311600454530398e-06, + "loss": 2.0062, + "step": 11896 + }, + { + "epoch": 2.384168336673347, + "grad_norm": 18.594525237091222, + "learning_rate": 1.2303939977682356e-06, + "loss": 1.4697, + "step": 11897 + }, + { + "epoch": 2.38436873747495, + "grad_norm": 22.787731843040053, + "learning_rate": 1.2296281550424732e-06, + "loss": 1.5919, + "step": 11898 + }, + { + "epoch": 2.3845691382765533, + "grad_norm": 22.765098194636163, + "learning_rate": 1.2288625173173941e-06, + "loss": 1.3825, + "step": 11899 + }, + { + "epoch": 2.3847695390781563, + "grad_norm": 23.562693500917952, + "learning_rate": 1.2280970846346269e-06, + "loss": 0.7269, + "step": 11900 + }, + { + "epoch": 2.3849699398797597, + "grad_norm": 18.51575391818088, + "learning_rate": 1.22733185703579e-06, + "loss": 1.54, + "step": 11901 + }, + { + "epoch": 2.3851703406813627, + "grad_norm": 18.870434817571702, + "learning_rate": 1.2265668345624864e-06, + "loss": 1.1756, + "step": 11902 + }, + { + "epoch": 2.3853707414829657, + "grad_norm": 25.397909363605862, + "learning_rate": 1.2258020172563168e-06, + "loss": 1.3645, + "step": 11903 + }, + { + "epoch": 2.385571142284569, + "grad_norm": 19.132012060827368, + "learning_rate": 1.2250374051588614e-06, + "loss": 1.2242, + "step": 11904 + }, + { + "epoch": 2.3857715430861726, + "grad_norm": 34.340237711543345, + "learning_rate": 1.2242729983116947e-06, + "loss": 1.6573, + "step": 11905 + }, + { + "epoch": 2.3859719438877756, + "grad_norm": 18.104340374592155, + "learning_rate": 1.2235087967563786e-06, + "loss": 1.4856, + "step": 11906 + }, + { + "epoch": 2.3861723446893786, + "grad_norm": 25.344331830274808, + "learning_rate": 1.2227448005344639e-06, + "loss": 1.7202, + "step": 11907 + }, + { + "epoch": 2.386372745490982, + "grad_norm": 30.43423718435166, + "learning_rate": 1.2219810096874913e-06, + "loss": 1.4457, + "step": 11908 + }, + { + "epoch": 2.386573146292585, + "grad_norm": 30.36104361179333, + "learning_rate": 1.2212174242569853e-06, + "loss": 1.2839, + "step": 11909 + }, + { + "epoch": 2.3867735470941884, + "grad_norm": 22.555317416983847, + "learning_rate": 1.2204540442844688e-06, + "loss": 1.3262, + "step": 11910 + }, + { + "epoch": 2.3869739478957914, + "grad_norm": 26.307097500133526, + "learning_rate": 1.2196908698114435e-06, + "loss": 1.4904, + "step": 11911 + }, + { + "epoch": 2.387174348697395, + "grad_norm": 33.94285525127787, + "learning_rate": 1.2189279008794047e-06, + "loss": 1.7221, + "step": 11912 + }, + { + "epoch": 2.387374749498998, + "grad_norm": 35.64230377285467, + "learning_rate": 1.2181651375298398e-06, + "loss": 1.4793, + "step": 11913 + }, + { + "epoch": 2.3875751503006013, + "grad_norm": 18.059710437941252, + "learning_rate": 
1.2174025798042177e-06, + "loss": 1.1083, + "step": 11914 + }, + { + "epoch": 2.3877755511022043, + "grad_norm": 22.608823515149773, + "learning_rate": 1.216640227744001e-06, + "loss": 1.5707, + "step": 11915 + }, + { + "epoch": 2.3879759519038077, + "grad_norm": 21.033937277034347, + "learning_rate": 1.2158780813906396e-06, + "loss": 1.8461, + "step": 11916 + }, + { + "epoch": 2.3881763527054107, + "grad_norm": 37.65406216921912, + "learning_rate": 1.2151161407855728e-06, + "loss": 1.761, + "step": 11917 + }, + { + "epoch": 2.388376753507014, + "grad_norm": 28.690687708671998, + "learning_rate": 1.21435440597023e-06, + "loss": 1.7141, + "step": 11918 + }, + { + "epoch": 2.388577154308617, + "grad_norm": 19.405170859285427, + "learning_rate": 1.2135928769860233e-06, + "loss": 1.2805, + "step": 11919 + }, + { + "epoch": 2.3887775551102206, + "grad_norm": 23.648365909490895, + "learning_rate": 1.2128315538743646e-06, + "loss": 1.6407, + "step": 11920 + }, + { + "epoch": 2.3889779559118236, + "grad_norm": 19.350368604016346, + "learning_rate": 1.2120704366766429e-06, + "loss": 1.3736, + "step": 11921 + }, + { + "epoch": 2.389178356713427, + "grad_norm": 17.52903965645, + "learning_rate": 1.2113095254342428e-06, + "loss": 1.3374, + "step": 11922 + }, + { + "epoch": 2.38937875751503, + "grad_norm": 20.90419169935492, + "learning_rate": 1.2105488201885364e-06, + "loss": 0.9991, + "step": 11923 + }, + { + "epoch": 2.389579158316633, + "grad_norm": 19.76516598040432, + "learning_rate": 1.2097883209808847e-06, + "loss": 1.594, + "step": 11924 + }, + { + "epoch": 2.3897795591182365, + "grad_norm": 18.106530680451318, + "learning_rate": 1.2090280278526368e-06, + "loss": 1.9124, + "step": 11925 + }, + { + "epoch": 2.38997995991984, + "grad_norm": 22.090934245513086, + "learning_rate": 1.2082679408451308e-06, + "loss": 1.2821, + "step": 11926 + }, + { + "epoch": 2.390180360721443, + "grad_norm": 22.8329671259476, + "learning_rate": 1.2075080599996952e-06, + "loss": 1.3985, + "step": 11927 + }, + { + "epoch": 2.390380761523046, + "grad_norm": 20.556307048058006, + "learning_rate": 1.2067483853576439e-06, + "loss": 2.0514, + "step": 11928 + }, + { + "epoch": 2.3905811623246493, + "grad_norm": 21.368958534720978, + "learning_rate": 1.2059889169602817e-06, + "loss": 1.5866, + "step": 11929 + }, + { + "epoch": 2.3907815631262523, + "grad_norm": 25.762889775284812, + "learning_rate": 1.2052296548489029e-06, + "loss": 1.4794, + "step": 11930 + }, + { + "epoch": 2.390981963927856, + "grad_norm": 29.531315200333847, + "learning_rate": 1.2044705990647887e-06, + "loss": 1.5794, + "step": 11931 + }, + { + "epoch": 2.3911823647294588, + "grad_norm": 39.419321758079526, + "learning_rate": 1.2037117496492113e-06, + "loss": 1.4486, + "step": 11932 + }, + { + "epoch": 2.391382765531062, + "grad_norm": 22.74747247119393, + "learning_rate": 1.2029531066434296e-06, + "loss": 1.5646, + "step": 11933 + }, + { + "epoch": 2.391583166332665, + "grad_norm": 19.1844964050933, + "learning_rate": 1.2021946700886943e-06, + "loss": 1.4058, + "step": 11934 + }, + { + "epoch": 2.3917835671342687, + "grad_norm": 35.37341001460765, + "learning_rate": 1.201436440026239e-06, + "loss": 1.7095, + "step": 11935 + }, + { + "epoch": 2.3919839679358716, + "grad_norm": 17.233412417431957, + "learning_rate": 1.2006784164972908e-06, + "loss": 1.1335, + "step": 11936 + }, + { + "epoch": 2.392184368737475, + "grad_norm": 18.901004825769213, + "learning_rate": 1.199920599543068e-06, + "loss": 1.8471, + "step": 11937 + }, + { + "epoch": 
2.392384769539078, + "grad_norm": 29.166297058722634, + "learning_rate": 1.1991629892047696e-06, + "loss": 1.8846, + "step": 11938 + }, + { + "epoch": 2.3925851703406815, + "grad_norm": 20.319072362463338, + "learning_rate": 1.198405585523591e-06, + "loss": 1.5478, + "step": 11939 + }, + { + "epoch": 2.3927855711422845, + "grad_norm": 36.103329039401174, + "learning_rate": 1.197648388540712e-06, + "loss": 1.5009, + "step": 11940 + }, + { + "epoch": 2.392985971943888, + "grad_norm": 18.237982230112237, + "learning_rate": 1.1968913982973025e-06, + "loss": 1.6455, + "step": 11941 + }, + { + "epoch": 2.393186372745491, + "grad_norm": 20.556247523175028, + "learning_rate": 1.1961346148345221e-06, + "loss": 1.3151, + "step": 11942 + }, + { + "epoch": 2.3933867735470944, + "grad_norm": 19.286122584801248, + "learning_rate": 1.1953780381935171e-06, + "loss": 1.4839, + "step": 11943 + }, + { + "epoch": 2.3935871743486974, + "grad_norm": 26.622505646129913, + "learning_rate": 1.1946216684154267e-06, + "loss": 1.8599, + "step": 11944 + }, + { + "epoch": 2.3937875751503004, + "grad_norm": 29.211653826832325, + "learning_rate": 1.1938655055413712e-06, + "loss": 1.4837, + "step": 11945 + }, + { + "epoch": 2.393987975951904, + "grad_norm": 19.46389924525069, + "learning_rate": 1.1931095496124667e-06, + "loss": 1.5925, + "step": 11946 + }, + { + "epoch": 2.3941883767535073, + "grad_norm": 25.408581273474436, + "learning_rate": 1.1923538006698154e-06, + "loss": 1.4312, + "step": 11947 + }, + { + "epoch": 2.3943887775551103, + "grad_norm": 16.765010432138137, + "learning_rate": 1.1915982587545088e-06, + "loss": 1.2226, + "step": 11948 + }, + { + "epoch": 2.3945891783567133, + "grad_norm": 16.913038597847706, + "learning_rate": 1.1908429239076264e-06, + "loss": 1.176, + "step": 11949 + }, + { + "epoch": 2.3947895791583167, + "grad_norm": 17.40007151649906, + "learning_rate": 1.1900877961702374e-06, + "loss": 1.1317, + "step": 11950 + }, + { + "epoch": 2.3949899799599197, + "grad_norm": 23.87859391951604, + "learning_rate": 1.1893328755834e-06, + "loss": 1.5181, + "step": 11951 + }, + { + "epoch": 2.395190380761523, + "grad_norm": 27.89357800052539, + "learning_rate": 1.1885781621881575e-06, + "loss": 1.3549, + "step": 11952 + }, + { + "epoch": 2.395390781563126, + "grad_norm": 19.423284556119334, + "learning_rate": 1.1878236560255469e-06, + "loss": 1.6349, + "step": 11953 + }, + { + "epoch": 2.3955911823647296, + "grad_norm": 19.184054235631585, + "learning_rate": 1.1870693571365916e-06, + "loss": 1.7175, + "step": 11954 + }, + { + "epoch": 2.3957915831663326, + "grad_norm": 24.135040676020417, + "learning_rate": 1.1863152655623034e-06, + "loss": 1.997, + "step": 11955 + }, + { + "epoch": 2.395991983967936, + "grad_norm": 35.633574996830426, + "learning_rate": 1.185561381343684e-06, + "loss": 1.6468, + "step": 11956 + }, + { + "epoch": 2.396192384769539, + "grad_norm": 23.062914937386385, + "learning_rate": 1.1848077045217226e-06, + "loss": 1.866, + "step": 11957 + }, + { + "epoch": 2.3963927855711424, + "grad_norm": 26.90200772785228, + "learning_rate": 1.1840542351373997e-06, + "loss": 1.6515, + "step": 11958 + }, + { + "epoch": 2.3965931863727454, + "grad_norm": 21.361049373481862, + "learning_rate": 1.1833009732316781e-06, + "loss": 1.6006, + "step": 11959 + }, + { + "epoch": 2.396793587174349, + "grad_norm": 19.73311727985753, + "learning_rate": 1.1825479188455192e-06, + "loss": 1.2811, + "step": 11960 + }, + { + "epoch": 2.396993987975952, + "grad_norm": 17.130797797103952, + "learning_rate": 
1.1817950720198635e-06, + "loss": 1.4305, + "step": 11961 + }, + { + "epoch": 2.397194388777555, + "grad_norm": 27.151896269854166, + "learning_rate": 1.181042432795646e-06, + "loss": 1.6671, + "step": 11962 + }, + { + "epoch": 2.3973947895791583, + "grad_norm": 19.137716042456027, + "learning_rate": 1.1802900012137892e-06, + "loss": 1.5497, + "step": 11963 + }, + { + "epoch": 2.3975951903807617, + "grad_norm": 23.748400981587587, + "learning_rate": 1.1795377773152028e-06, + "loss": 1.8704, + "step": 11964 + }, + { + "epoch": 2.3977955911823647, + "grad_norm": 24.54935315060788, + "learning_rate": 1.1787857611407887e-06, + "loss": 1.4681, + "step": 11965 + }, + { + "epoch": 2.3979959919839677, + "grad_norm": 18.82403041759245, + "learning_rate": 1.1780339527314304e-06, + "loss": 1.7939, + "step": 11966 + }, + { + "epoch": 2.398196392785571, + "grad_norm": 23.15161424221057, + "learning_rate": 1.1772823521280092e-06, + "loss": 1.7193, + "step": 11967 + }, + { + "epoch": 2.398396793587174, + "grad_norm": 22.435142616755723, + "learning_rate": 1.1765309593713908e-06, + "loss": 2.0797, + "step": 11968 + }, + { + "epoch": 2.3985971943887776, + "grad_norm": 37.1401183663835, + "learning_rate": 1.1757797745024258e-06, + "loss": 2.0484, + "step": 11969 + }, + { + "epoch": 2.3987975951903806, + "grad_norm": 15.725195627904752, + "learning_rate": 1.1750287975619618e-06, + "loss": 1.4768, + "step": 11970 + }, + { + "epoch": 2.398997995991984, + "grad_norm": 23.403949282960742, + "learning_rate": 1.1742780285908267e-06, + "loss": 1.4847, + "step": 11971 + }, + { + "epoch": 2.399198396793587, + "grad_norm": 18.20661925883947, + "learning_rate": 1.173527467629843e-06, + "loss": 1.3164, + "step": 11972 + }, + { + "epoch": 2.3993987975951905, + "grad_norm": 23.889847725293134, + "learning_rate": 1.1727771147198192e-06, + "loss": 1.7766, + "step": 11973 + }, + { + "epoch": 2.3995991983967935, + "grad_norm": 16.90220611908173, + "learning_rate": 1.172026969901553e-06, + "loss": 1.778, + "step": 11974 + }, + { + "epoch": 2.399799599198397, + "grad_norm": 23.35118041568096, + "learning_rate": 1.1712770332158324e-06, + "loss": 1.4654, + "step": 11975 + }, + { + "epoch": 2.4, + "grad_norm": 23.517047488464954, + "learning_rate": 1.1705273047034283e-06, + "loss": 1.4097, + "step": 11976 + }, + { + "epoch": 2.4002004008016034, + "grad_norm": 17.09645990407423, + "learning_rate": 1.1697777844051105e-06, + "loss": 1.2944, + "step": 11977 + }, + { + "epoch": 2.4004008016032063, + "grad_norm": 23.213283399011694, + "learning_rate": 1.1690284723616268e-06, + "loss": 1.2867, + "step": 11978 + }, + { + "epoch": 2.40060120240481, + "grad_norm": 21.07964481287939, + "learning_rate": 1.1682793686137207e-06, + "loss": 1.6125, + "step": 11979 + }, + { + "epoch": 2.400801603206413, + "grad_norm": 18.056717798800378, + "learning_rate": 1.1675304732021208e-06, + "loss": 1.5457, + "step": 11980 + }, + { + "epoch": 2.401002004008016, + "grad_norm": 19.128064069050687, + "learning_rate": 1.1667817861675467e-06, + "loss": 1.7696, + "step": 11981 + }, + { + "epoch": 2.401202404809619, + "grad_norm": 16.721636936324245, + "learning_rate": 1.1660333075507069e-06, + "loss": 1.2291, + "step": 11982 + }, + { + "epoch": 2.401402805611222, + "grad_norm": 21.98960137446074, + "learning_rate": 1.165285037392292e-06, + "loss": 1.6244, + "step": 11983 + }, + { + "epoch": 2.4016032064128257, + "grad_norm": 30.747005608502317, + "learning_rate": 1.164536975732994e-06, + "loss": 1.4323, + "step": 11984 + }, + { + "epoch": 2.401803607214429, + 
"grad_norm": 34.71168740999324, + "learning_rate": 1.16378912261348e-06, + "loss": 1.5939, + "step": 11985 + }, + { + "epoch": 2.402004008016032, + "grad_norm": 21.983861393339403, + "learning_rate": 1.1630414780744142e-06, + "loss": 0.993, + "step": 11986 + }, + { + "epoch": 2.402204408817635, + "grad_norm": 17.519440188385655, + "learning_rate": 1.1622940421564477e-06, + "loss": 1.8491, + "step": 11987 + }, + { + "epoch": 2.4024048096192385, + "grad_norm": 25.178557425791794, + "learning_rate": 1.1615468149002186e-06, + "loss": 1.1022, + "step": 11988 + }, + { + "epoch": 2.4026052104208415, + "grad_norm": 24.17429424610678, + "learning_rate": 1.160799796346357e-06, + "loss": 1.7155, + "step": 11989 + }, + { + "epoch": 2.402805611222445, + "grad_norm": 27.014128002838184, + "learning_rate": 1.1600529865354748e-06, + "loss": 1.3619, + "step": 11990 + }, + { + "epoch": 2.403006012024048, + "grad_norm": 21.602280914124723, + "learning_rate": 1.1593063855081825e-06, + "loss": 1.4715, + "step": 11991 + }, + { + "epoch": 2.4032064128256514, + "grad_norm": 15.880239859568796, + "learning_rate": 1.15855999330507e-06, + "loss": 1.2019, + "step": 11992 + }, + { + "epoch": 2.4034068136272544, + "grad_norm": 23.64056604206148, + "learning_rate": 1.1578138099667201e-06, + "loss": 1.785, + "step": 11993 + }, + { + "epoch": 2.403607214428858, + "grad_norm": 24.728718867820348, + "learning_rate": 1.1570678355337078e-06, + "loss": 1.2725, + "step": 11994 + }, + { + "epoch": 2.403807615230461, + "grad_norm": 19.1872329091624, + "learning_rate": 1.1563220700465881e-06, + "loss": 1.2381, + "step": 11995 + }, + { + "epoch": 2.4040080160320643, + "grad_norm": 18.98777216422935, + "learning_rate": 1.155576513545912e-06, + "loss": 1.5039, + "step": 11996 + }, + { + "epoch": 2.4042084168336673, + "grad_norm": 28.22671822269268, + "learning_rate": 1.154831166072215e-06, + "loss": 1.5896, + "step": 11997 + }, + { + "epoch": 2.4044088176352707, + "grad_norm": 20.485399812441358, + "learning_rate": 1.1540860276660238e-06, + "loss": 1.3165, + "step": 11998 + }, + { + "epoch": 2.4046092184368737, + "grad_norm": 25.267862505607766, + "learning_rate": 1.1533410983678533e-06, + "loss": 1.6264, + "step": 11999 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 21.240627690277122, + "learning_rate": 1.152596378218203e-06, + "loss": 1.3792, + "step": 12000 + }, + { + "epoch": 2.40501002004008, + "grad_norm": 23.54131329675645, + "learning_rate": 1.1518518672575701e-06, + "loss": 1.8312, + "step": 12001 + }, + { + "epoch": 2.4052104208416836, + "grad_norm": 25.04894693384306, + "learning_rate": 1.1511075655264298e-06, + "loss": 1.1487, + "step": 12002 + }, + { + "epoch": 2.4054108216432866, + "grad_norm": 20.754953323995647, + "learning_rate": 1.150363473065253e-06, + "loss": 1.3064, + "step": 12003 + }, + { + "epoch": 2.4056112224448896, + "grad_norm": 33.2739434888055, + "learning_rate": 1.1496195899144964e-06, + "loss": 1.4405, + "step": 12004 + }, + { + "epoch": 2.405811623246493, + "grad_norm": 26.48052127878896, + "learning_rate": 1.1488759161146063e-06, + "loss": 2.1137, + "step": 12005 + }, + { + "epoch": 2.406012024048096, + "grad_norm": 19.908627401772915, + "learning_rate": 1.1481324517060188e-06, + "loss": 1.2365, + "step": 12006 + }, + { + "epoch": 2.4062124248496994, + "grad_norm": 20.718583275337537, + "learning_rate": 1.1473891967291535e-06, + "loss": 1.5981, + "step": 12007 + }, + { + "epoch": 2.4064128256513024, + "grad_norm": 13.874932545533454, + "learning_rate": 1.146646151224427e-06, + "loss": 1.4067, 
+ "step": 12008 + }, + { + "epoch": 2.406613226452906, + "grad_norm": 23.97556236980345, + "learning_rate": 1.1459033152322356e-06, + "loss": 1.6765, + "step": 12009 + }, + { + "epoch": 2.406813627254509, + "grad_norm": 16.343702824557006, + "learning_rate": 1.1451606887929706e-06, + "loss": 1.4921, + "step": 12010 + }, + { + "epoch": 2.4070140280561123, + "grad_norm": 34.7854185054277, + "learning_rate": 1.1444182719470092e-06, + "loss": 1.7961, + "step": 12011 + }, + { + "epoch": 2.4072144288577153, + "grad_norm": 17.600822156757136, + "learning_rate": 1.1436760647347172e-06, + "loss": 1.407, + "step": 12012 + }, + { + "epoch": 2.4074148296593187, + "grad_norm": 18.364438955371067, + "learning_rate": 1.1429340671964506e-06, + "loss": 1.0562, + "step": 12013 + }, + { + "epoch": 2.4076152304609217, + "grad_norm": 21.74587568963, + "learning_rate": 1.142192279372552e-06, + "loss": 1.5015, + "step": 12014 + }, + { + "epoch": 2.407815631262525, + "grad_norm": 25.001067930912487, + "learning_rate": 1.1414507013033555e-06, + "loss": 1.9825, + "step": 12015 + }, + { + "epoch": 2.408016032064128, + "grad_norm": 26.05999032748613, + "learning_rate": 1.1407093330291786e-06, + "loss": 1.5237, + "step": 12016 + }, + { + "epoch": 2.4082164328657316, + "grad_norm": 21.870555726037008, + "learning_rate": 1.1399681745903318e-06, + "loss": 1.5594, + "step": 12017 + }, + { + "epoch": 2.4084168336673346, + "grad_norm": 26.191198281157398, + "learning_rate": 1.1392272260271136e-06, + "loss": 1.6814, + "step": 12018 + }, + { + "epoch": 2.408617234468938, + "grad_norm": 23.4023273938344, + "learning_rate": 1.1384864873798102e-06, + "loss": 1.4173, + "step": 12019 + }, + { + "epoch": 2.408817635270541, + "grad_norm": 15.746166302725968, + "learning_rate": 1.1377459586886962e-06, + "loss": 1.4831, + "step": 12020 + }, + { + "epoch": 2.409018036072144, + "grad_norm": 30.948717400110443, + "learning_rate": 1.137005639994036e-06, + "loss": 1.6208, + "step": 12021 + }, + { + "epoch": 2.4092184368737475, + "grad_norm": 24.970232769954542, + "learning_rate": 1.1362655313360826e-06, + "loss": 1.8742, + "step": 12022 + }, + { + "epoch": 2.409418837675351, + "grad_norm": 24.012159896486896, + "learning_rate": 1.1355256327550746e-06, + "loss": 2.1047, + "step": 12023 + }, + { + "epoch": 2.409619238476954, + "grad_norm": 20.85021933186046, + "learning_rate": 1.1347859442912406e-06, + "loss": 1.4772, + "step": 12024 + }, + { + "epoch": 2.409819639278557, + "grad_norm": 18.19647608333413, + "learning_rate": 1.1340464659848032e-06, + "loss": 1.7227, + "step": 12025 + }, + { + "epoch": 2.4100200400801604, + "grad_norm": 28.510834723756087, + "learning_rate": 1.1333071978759652e-06, + "loss": 1.938, + "step": 12026 + }, + { + "epoch": 2.4102204408817633, + "grad_norm": 20.079946471022975, + "learning_rate": 1.1325681400049221e-06, + "loss": 1.691, + "step": 12027 + }, + { + "epoch": 2.410420841683367, + "grad_norm": 20.089256422698153, + "learning_rate": 1.1318292924118584e-06, + "loss": 1.5003, + "step": 12028 + }, + { + "epoch": 2.41062124248497, + "grad_norm": 17.735272203442634, + "learning_rate": 1.1310906551369461e-06, + "loss": 1.6318, + "step": 12029 + }, + { + "epoch": 2.4108216432865732, + "grad_norm": 12.870516137004506, + "learning_rate": 1.1303522282203456e-06, + "loss": 1.4701, + "step": 12030 + }, + { + "epoch": 2.411022044088176, + "grad_norm": 22.597891051478015, + "learning_rate": 1.1296140117022069e-06, + "loss": 1.3146, + "step": 12031 + }, + { + "epoch": 2.4112224448897797, + "grad_norm": 25.60126422456547, 
+ "learning_rate": 1.128876005622669e-06, + "loss": 1.192, + "step": 12032 + }, + { + "epoch": 2.4114228456913827, + "grad_norm": 31.900945535047722, + "learning_rate": 1.128138210021855e-06, + "loss": 1.4381, + "step": 12033 + }, + { + "epoch": 2.411623246492986, + "grad_norm": 23.467557067330503, + "learning_rate": 1.1274006249398823e-06, + "loss": 1.3799, + "step": 12034 + }, + { + "epoch": 2.411823647294589, + "grad_norm": 20.564054623481965, + "learning_rate": 1.1266632504168546e-06, + "loss": 1.3724, + "step": 12035 + }, + { + "epoch": 2.4120240480961925, + "grad_norm": 30.392114495116004, + "learning_rate": 1.125926086492863e-06, + "loss": 1.4099, + "step": 12036 + }, + { + "epoch": 2.4122244488977955, + "grad_norm": 33.71909409113204, + "learning_rate": 1.1251891332079884e-06, + "loss": 1.4773, + "step": 12037 + }, + { + "epoch": 2.412424849699399, + "grad_norm": 17.026818991355412, + "learning_rate": 1.1244523906023009e-06, + "loss": 1.2539, + "step": 12038 + }, + { + "epoch": 2.412625250501002, + "grad_norm": 18.684397349257832, + "learning_rate": 1.1237158587158592e-06, + "loss": 1.0245, + "step": 12039 + }, + { + "epoch": 2.4128256513026054, + "grad_norm": 20.58506968889149, + "learning_rate": 1.122979537588705e-06, + "loss": 1.5345, + "step": 12040 + }, + { + "epoch": 2.4130260521042084, + "grad_norm": 21.33052383726809, + "learning_rate": 1.1222434272608795e-06, + "loss": 1.3942, + "step": 12041 + }, + { + "epoch": 2.4132264529058114, + "grad_norm": 18.927226837222815, + "learning_rate": 1.1215075277724014e-06, + "loss": 1.5654, + "step": 12042 + }, + { + "epoch": 2.413426853707415, + "grad_norm": 25.814852655489112, + "learning_rate": 1.1207718391632844e-06, + "loss": 1.8776, + "step": 12043 + }, + { + "epoch": 2.4136272545090183, + "grad_norm": 19.577624059290613, + "learning_rate": 1.1200363614735282e-06, + "loss": 1.5699, + "step": 12044 + }, + { + "epoch": 2.4138276553106213, + "grad_norm": 16.279318559084793, + "learning_rate": 1.1193010947431227e-06, + "loss": 1.2178, + "step": 12045 + }, + { + "epoch": 2.4140280561122243, + "grad_norm": 22.5004274014545, + "learning_rate": 1.1185660390120468e-06, + "loss": 1.9142, + "step": 12046 + }, + { + "epoch": 2.4142284569138277, + "grad_norm": 24.300400985630603, + "learning_rate": 1.1178311943202619e-06, + "loss": 1.092, + "step": 12047 + }, + { + "epoch": 2.4144288577154307, + "grad_norm": 36.42562527940966, + "learning_rate": 1.1170965607077283e-06, + "loss": 1.4959, + "step": 12048 + }, + { + "epoch": 2.414629258517034, + "grad_norm": 25.27991044285531, + "learning_rate": 1.1163621382143846e-06, + "loss": 1.6046, + "step": 12049 + }, + { + "epoch": 2.414829659318637, + "grad_norm": 24.32680812730437, + "learning_rate": 1.115627926880165e-06, + "loss": 1.701, + "step": 12050 + }, + { + "epoch": 2.4150300601202406, + "grad_norm": 21.6128643715425, + "learning_rate": 1.1148939267449882e-06, + "loss": 1.4888, + "step": 12051 + }, + { + "epoch": 2.4152304609218436, + "grad_norm": 14.915823079400228, + "learning_rate": 1.1141601378487638e-06, + "loss": 1.4477, + "step": 12052 + }, + { + "epoch": 2.415430861723447, + "grad_norm": 19.58942622747608, + "learning_rate": 1.1134265602313903e-06, + "loss": 1.5112, + "step": 12053 + }, + { + "epoch": 2.41563126252505, + "grad_norm": 18.151347853343413, + "learning_rate": 1.112693193932749e-06, + "loss": 1.4493, + "step": 12054 + }, + { + "epoch": 2.4158316633266534, + "grad_norm": 20.839758162961207, + "learning_rate": 1.1119600389927182e-06, + "loss": 1.7902, + "step": 12055 + }, + { + 
"epoch": 2.4160320641282564, + "grad_norm": 25.537394816207243, + "learning_rate": 1.1112270954511606e-06, + "loss": 1.5324, + "step": 12056 + }, + { + "epoch": 2.41623246492986, + "grad_norm": 21.2889870345852, + "learning_rate": 1.1104943633479236e-06, + "loss": 1.1896, + "step": 12057 + }, + { + "epoch": 2.416432865731463, + "grad_norm": 25.893835948200028, + "learning_rate": 1.1097618427228524e-06, + "loss": 1.5956, + "step": 12058 + }, + { + "epoch": 2.4166332665330663, + "grad_norm": 18.86624562283851, + "learning_rate": 1.109029533615771e-06, + "loss": 1.394, + "step": 12059 + }, + { + "epoch": 2.4168336673346693, + "grad_norm": 24.032289137106666, + "learning_rate": 1.1082974360664972e-06, + "loss": 1.6965, + "step": 12060 + }, + { + "epoch": 2.4170340681362728, + "grad_norm": 19.250783632953038, + "learning_rate": 1.1075655501148365e-06, + "loss": 1.5111, + "step": 12061 + }, + { + "epoch": 2.4172344689378757, + "grad_norm": 21.496696060800577, + "learning_rate": 1.1068338758005826e-06, + "loss": 1.5599, + "step": 12062 + }, + { + "epoch": 2.4174348697394787, + "grad_norm": 31.210345249972452, + "learning_rate": 1.106102413163519e-06, + "loss": 1.9849, + "step": 12063 + }, + { + "epoch": 2.417635270541082, + "grad_norm": 33.62256027986739, + "learning_rate": 1.1053711622434127e-06, + "loss": 1.4481, + "step": 12064 + }, + { + "epoch": 2.417835671342685, + "grad_norm": 56.2360266507866, + "learning_rate": 1.104640123080028e-06, + "loss": 1.5567, + "step": 12065 + }, + { + "epoch": 2.4180360721442886, + "grad_norm": 18.698047519061554, + "learning_rate": 1.1039092957131082e-06, + "loss": 1.3598, + "step": 12066 + }, + { + "epoch": 2.4182364729458916, + "grad_norm": 18.369390700469776, + "learning_rate": 1.1031786801823912e-06, + "loss": 1.2726, + "step": 12067 + }, + { + "epoch": 2.418436873747495, + "grad_norm": 20.979990255945676, + "learning_rate": 1.1024482765276018e-06, + "loss": 1.372, + "step": 12068 + }, + { + "epoch": 2.418637274549098, + "grad_norm": 21.711565319817026, + "learning_rate": 1.101718084788453e-06, + "loss": 1.5113, + "step": 12069 + }, + { + "epoch": 2.4188376753507015, + "grad_norm": 22.893069018396805, + "learning_rate": 1.1009881050046473e-06, + "loss": 1.5874, + "step": 12070 + }, + { + "epoch": 2.4190380761523045, + "grad_norm": 20.701797904628673, + "learning_rate": 1.1002583372158715e-06, + "loss": 1.5261, + "step": 12071 + }, + { + "epoch": 2.419238476953908, + "grad_norm": 26.58037046168695, + "learning_rate": 1.0995287814618088e-06, + "loss": 1.6333, + "step": 12072 + }, + { + "epoch": 2.419438877755511, + "grad_norm": 41.91774585462196, + "learning_rate": 1.0987994377821226e-06, + "loss": 1.9069, + "step": 12073 + }, + { + "epoch": 2.4196392785571144, + "grad_norm": 16.78572911848953, + "learning_rate": 1.0980703062164684e-06, + "loss": 1.6628, + "step": 12074 + }, + { + "epoch": 2.4198396793587174, + "grad_norm": 18.330962465637583, + "learning_rate": 1.0973413868044942e-06, + "loss": 1.375, + "step": 12075 + }, + { + "epoch": 2.420040080160321, + "grad_norm": 48.92255251475617, + "learning_rate": 1.0966126795858285e-06, + "loss": 1.6753, + "step": 12076 + }, + { + "epoch": 2.420240480961924, + "grad_norm": 21.814926148626693, + "learning_rate": 1.095884184600094e-06, + "loss": 1.6429, + "step": 12077 + }, + { + "epoch": 2.4204408817635272, + "grad_norm": 21.294246041943797, + "learning_rate": 1.0951559018868974e-06, + "loss": 1.2404, + "step": 12078 + }, + { + "epoch": 2.4206412825651302, + "grad_norm": 18.519543178144495, + "learning_rate": 
1.0944278314858392e-06, + "loss": 1.6189, + "step": 12079 + }, + { + "epoch": 2.420841683366733, + "grad_norm": 18.30545524065002, + "learning_rate": 1.0936999734365066e-06, + "loss": 1.721, + "step": 12080 + }, + { + "epoch": 2.4210420841683367, + "grad_norm": 29.193297318200887, + "learning_rate": 1.0929723277784703e-06, + "loss": 1.9162, + "step": 12081 + }, + { + "epoch": 2.42124248496994, + "grad_norm": 21.078226177639042, + "learning_rate": 1.092244894551298e-06, + "loss": 1.5709, + "step": 12082 + }, + { + "epoch": 2.421442885771543, + "grad_norm": 22.821073437209307, + "learning_rate": 1.0915176737945382e-06, + "loss": 1.7847, + "step": 12083 + }, + { + "epoch": 2.421643286573146, + "grad_norm": 24.120721310100542, + "learning_rate": 1.0907906655477319e-06, + "loss": 1.5903, + "step": 12084 + }, + { + "epoch": 2.4218436873747495, + "grad_norm": 30.344337687345966, + "learning_rate": 1.0900638698504074e-06, + "loss": 1.7702, + "step": 12085 + }, + { + "epoch": 2.4220440881763525, + "grad_norm": 19.463503418155444, + "learning_rate": 1.089337286742082e-06, + "loss": 1.3872, + "step": 12086 + }, + { + "epoch": 2.422244488977956, + "grad_norm": 28.4390708341058, + "learning_rate": 1.0886109162622626e-06, + "loss": 2.0527, + "step": 12087 + }, + { + "epoch": 2.422444889779559, + "grad_norm": 21.90752892434663, + "learning_rate": 1.0878847584504381e-06, + "loss": 1.9177, + "step": 12088 + }, + { + "epoch": 2.4226452905811624, + "grad_norm": 45.29851426181762, + "learning_rate": 1.0871588133460976e-06, + "loss": 1.7738, + "step": 12089 + }, + { + "epoch": 2.4228456913827654, + "grad_norm": 21.72136737102642, + "learning_rate": 1.0864330809887064e-06, + "loss": 1.8489, + "step": 12090 + }, + { + "epoch": 2.423046092184369, + "grad_norm": 27.026822696864567, + "learning_rate": 1.0857075614177254e-06, + "loss": 1.3338, + "step": 12091 + }, + { + "epoch": 2.423246492985972, + "grad_norm": 20.82258107206667, + "learning_rate": 1.084982254672603e-06, + "loss": 1.7352, + "step": 12092 + }, + { + "epoch": 2.4234468937875753, + "grad_norm": 17.40817934125913, + "learning_rate": 1.0842571607927737e-06, + "loss": 1.3264, + "step": 12093 + }, + { + "epoch": 2.4236472945891783, + "grad_norm": 22.675381916820644, + "learning_rate": 1.0835322798176646e-06, + "loss": 1.5716, + "step": 12094 + }, + { + "epoch": 2.4238476953907817, + "grad_norm": 21.166451041916016, + "learning_rate": 1.0828076117866838e-06, + "loss": 1.6958, + "step": 12095 + }, + { + "epoch": 2.4240480961923847, + "grad_norm": 17.252131201606282, + "learning_rate": 1.0820831567392381e-06, + "loss": 2.2284, + "step": 12096 + }, + { + "epoch": 2.424248496993988, + "grad_norm": 29.436139911874978, + "learning_rate": 1.0813589147147135e-06, + "loss": 1.9962, + "step": 12097 + }, + { + "epoch": 2.424448897795591, + "grad_norm": 23.933499244203997, + "learning_rate": 1.080634885752489e-06, + "loss": 1.0248, + "step": 12098 + }, + { + "epoch": 2.4246492985971946, + "grad_norm": 17.025494909601505, + "learning_rate": 1.0799110698919313e-06, + "loss": 1.4637, + "step": 12099 + }, + { + "epoch": 2.4248496993987976, + "grad_norm": 27.593649556090874, + "learning_rate": 1.079187467172395e-06, + "loss": 1.9744, + "step": 12100 + }, + { + "epoch": 2.4250501002004006, + "grad_norm": 17.398413975662304, + "learning_rate": 1.078464077633224e-06, + "loss": 1.7174, + "step": 12101 + }, + { + "epoch": 2.425250501002004, + "grad_norm": 24.79551727348432, + "learning_rate": 1.0777409013137502e-06, + "loss": 1.2877, + "step": 12102 + }, + { + "epoch": 
2.4254509018036075, + "grad_norm": 18.849703729208546, + "learning_rate": 1.077017938253294e-06, + "loss": 1.444, + "step": 12103 + }, + { + "epoch": 2.4256513026052104, + "grad_norm": 20.797113533880957, + "learning_rate": 1.0762951884911627e-06, + "loss": 1.3949, + "step": 12104 + }, + { + "epoch": 2.4258517034068134, + "grad_norm": 36.34397528275055, + "learning_rate": 1.075572652066652e-06, + "loss": 1.6115, + "step": 12105 + }, + { + "epoch": 2.426052104208417, + "grad_norm": 19.81021328277916, + "learning_rate": 1.074850329019052e-06, + "loss": 1.5159, + "step": 12106 + }, + { + "epoch": 2.42625250501002, + "grad_norm": 20.972023159143404, + "learning_rate": 1.0741282193876323e-06, + "loss": 1.5646, + "step": 12107 + }, + { + "epoch": 2.4264529058116233, + "grad_norm": 25.616237542371564, + "learning_rate": 1.0734063232116572e-06, + "loss": 1.8706, + "step": 12108 + }, + { + "epoch": 2.4266533066132263, + "grad_norm": 19.89201950271866, + "learning_rate": 1.0726846405303753e-06, + "loss": 1.4683, + "step": 12109 + }, + { + "epoch": 2.4268537074148298, + "grad_norm": 16.61977706722312, + "learning_rate": 1.0719631713830274e-06, + "loss": 0.8691, + "step": 12110 + }, + { + "epoch": 2.4270541082164327, + "grad_norm": 25.486394786528173, + "learning_rate": 1.07124191580884e-06, + "loss": 1.443, + "step": 12111 + }, + { + "epoch": 2.427254509018036, + "grad_norm": 26.76744070875885, + "learning_rate": 1.0705208738470291e-06, + "loss": 1.8773, + "step": 12112 + }, + { + "epoch": 2.427454909819639, + "grad_norm": 19.87656248980295, + "learning_rate": 1.0698000455368e-06, + "loss": 1.5054, + "step": 12113 + }, + { + "epoch": 2.4276553106212426, + "grad_norm": 27.00883408926415, + "learning_rate": 1.069079430917343e-06, + "loss": 1.7733, + "step": 12114 + }, + { + "epoch": 2.4278557114228456, + "grad_norm": 22.330076462946845, + "learning_rate": 1.0683590300278395e-06, + "loss": 1.4675, + "step": 12115 + }, + { + "epoch": 2.428056112224449, + "grad_norm": 21.953817250243073, + "learning_rate": 1.067638842907459e-06, + "loss": 1.2615, + "step": 12116 + }, + { + "epoch": 2.428256513026052, + "grad_norm": 17.32733930236345, + "learning_rate": 1.0669188695953598e-06, + "loss": 1.664, + "step": 12117 + }, + { + "epoch": 2.4284569138276555, + "grad_norm": 23.8292973931485, + "learning_rate": 1.0661991101306868e-06, + "loss": 1.9589, + "step": 12118 + }, + { + "epoch": 2.4286573146292585, + "grad_norm": 23.10244125153054, + "learning_rate": 1.0654795645525757e-06, + "loss": 1.4552, + "step": 12119 + }, + { + "epoch": 2.428857715430862, + "grad_norm": 18.545769239730333, + "learning_rate": 1.0647602329001494e-06, + "loss": 1.1305, + "step": 12120 + }, + { + "epoch": 2.429058116232465, + "grad_norm": 22.79743063518103, + "learning_rate": 1.0640411152125175e-06, + "loss": 1.6375, + "step": 12121 + }, + { + "epoch": 2.429258517034068, + "grad_norm": 18.796100737450576, + "learning_rate": 1.0633222115287794e-06, + "loss": 1.2451, + "step": 12122 + }, + { + "epoch": 2.4294589178356714, + "grad_norm": 19.60885898982905, + "learning_rate": 1.0626035218880238e-06, + "loss": 1.2313, + "step": 12123 + }, + { + "epoch": 2.4296593186372744, + "grad_norm": 20.443453530937724, + "learning_rate": 1.0618850463293274e-06, + "loss": 1.3696, + "step": 12124 + }, + { + "epoch": 2.429859719438878, + "grad_norm": 20.156459136989927, + "learning_rate": 1.0611667848917539e-06, + "loss": 2.0376, + "step": 12125 + }, + { + "epoch": 2.430060120240481, + "grad_norm": 16.04854570652945, + "learning_rate": 1.0604487376143563e-06, 
+ "loss": 0.9233, + "step": 12126 + }, + { + "epoch": 2.4302605210420842, + "grad_norm": 20.39190642155495, + "learning_rate": 1.0597309045361777e-06, + "loss": 1.6468, + "step": 12127 + }, + { + "epoch": 2.4304609218436872, + "grad_norm": 24.364283018411086, + "learning_rate": 1.0590132856962437e-06, + "loss": 1.9316, + "step": 12128 + }, + { + "epoch": 2.4306613226452907, + "grad_norm": 21.45097598661781, + "learning_rate": 1.058295881133577e-06, + "loss": 1.7705, + "step": 12129 + }, + { + "epoch": 2.4308617234468937, + "grad_norm": 22.54754693255783, + "learning_rate": 1.057578690887181e-06, + "loss": 1.4009, + "step": 12130 + }, + { + "epoch": 2.431062124248497, + "grad_norm": 26.515136358593377, + "learning_rate": 1.0568617149960508e-06, + "loss": 1.7042, + "step": 12131 + }, + { + "epoch": 2.4312625250501, + "grad_norm": 24.861376162774377, + "learning_rate": 1.0561449534991697e-06, + "loss": 1.7105, + "step": 12132 + }, + { + "epoch": 2.4314629258517035, + "grad_norm": 16.87093386467041, + "learning_rate": 1.0554284064355092e-06, + "loss": 1.686, + "step": 12133 + }, + { + "epoch": 2.4316633266533065, + "grad_norm": 31.04658045106785, + "learning_rate": 1.0547120738440303e-06, + "loss": 1.5206, + "step": 12134 + }, + { + "epoch": 2.43186372745491, + "grad_norm": 25.56126134882549, + "learning_rate": 1.0539959557636769e-06, + "loss": 1.9376, + "step": 12135 + }, + { + "epoch": 2.432064128256513, + "grad_norm": 48.46177088206496, + "learning_rate": 1.0532800522333902e-06, + "loss": 1.7461, + "step": 12136 + }, + { + "epoch": 2.4322645290581164, + "grad_norm": 15.978166794077596, + "learning_rate": 1.0525643632920935e-06, + "loss": 1.163, + "step": 12137 + }, + { + "epoch": 2.4324649298597194, + "grad_norm": 23.418923316605007, + "learning_rate": 1.0518488889786982e-06, + "loss": 1.4587, + "step": 12138 + }, + { + "epoch": 2.4326653306613224, + "grad_norm": 27.986691413738935, + "learning_rate": 1.0511336293321073e-06, + "loss": 1.6128, + "step": 12139 + }, + { + "epoch": 2.432865731462926, + "grad_norm": 17.64566648734329, + "learning_rate": 1.0504185843912096e-06, + "loss": 1.6242, + "step": 12140 + }, + { + "epoch": 2.4330661322645293, + "grad_norm": 19.542443391256104, + "learning_rate": 1.0497037541948834e-06, + "loss": 1.6427, + "step": 12141 + }, + { + "epoch": 2.4332665330661323, + "grad_norm": 21.457591941532314, + "learning_rate": 1.0489891387819957e-06, + "loss": 1.3261, + "step": 12142 + }, + { + "epoch": 2.4334669338677353, + "grad_norm": 21.25747447901826, + "learning_rate": 1.0482747381914005e-06, + "loss": 1.6821, + "step": 12143 + }, + { + "epoch": 2.4336673346693387, + "grad_norm": 21.93474944960757, + "learning_rate": 1.0475605524619425e-06, + "loss": 1.2564, + "step": 12144 + }, + { + "epoch": 2.4338677354709417, + "grad_norm": 19.581126616990314, + "learning_rate": 1.0468465816324486e-06, + "loss": 1.5007, + "step": 12145 + }, + { + "epoch": 2.434068136272545, + "grad_norm": 51.46077800390918, + "learning_rate": 1.0461328257417446e-06, + "loss": 2.2878, + "step": 12146 + }, + { + "epoch": 2.434268537074148, + "grad_norm": 18.778780430427577, + "learning_rate": 1.0454192848286337e-06, + "loss": 1.7919, + "step": 12147 + }, + { + "epoch": 2.4344689378757516, + "grad_norm": 31.31080839879224, + "learning_rate": 1.0447059589319136e-06, + "loss": 1.6869, + "step": 12148 + }, + { + "epoch": 2.4346693386773546, + "grad_norm": 25.179957124499122, + "learning_rate": 1.0439928480903693e-06, + "loss": 2.0348, + "step": 12149 + }, + { + "epoch": 2.434869739478958, + 
"grad_norm": 20.2369872332627, + "learning_rate": 1.0432799523427738e-06, + "loss": 1.477, + "step": 12150 + }, + { + "epoch": 2.435070140280561, + "grad_norm": 22.377660772503866, + "learning_rate": 1.042567271727889e-06, + "loss": 1.4149, + "step": 12151 + }, + { + "epoch": 2.4352705410821645, + "grad_norm": 20.826749979412487, + "learning_rate": 1.0418548062844602e-06, + "loss": 1.591, + "step": 12152 + }, + { + "epoch": 2.4354709418837674, + "grad_norm": 21.777694433956295, + "learning_rate": 1.0411425560512318e-06, + "loss": 1.9122, + "step": 12153 + }, + { + "epoch": 2.435671342685371, + "grad_norm": 17.82201482553186, + "learning_rate": 1.0404305210669246e-06, + "loss": 1.23, + "step": 12154 + }, + { + "epoch": 2.435871743486974, + "grad_norm": 23.237906648338544, + "learning_rate": 1.0397187013702548e-06, + "loss": 2.0385, + "step": 12155 + }, + { + "epoch": 2.4360721442885773, + "grad_norm": 24.261819361554203, + "learning_rate": 1.0390070969999256e-06, + "loss": 1.8451, + "step": 12156 + }, + { + "epoch": 2.4362725450901803, + "grad_norm": 18.974970132965634, + "learning_rate": 1.038295707994627e-06, + "loss": 1.684, + "step": 12157 + }, + { + "epoch": 2.4364729458917838, + "grad_norm": 22.35785025233126, + "learning_rate": 1.0375845343930403e-06, + "loss": 2.1188, + "step": 12158 + }, + { + "epoch": 2.4366733466933868, + "grad_norm": 18.336190131533055, + "learning_rate": 1.036873576233829e-06, + "loss": 1.1914, + "step": 12159 + }, + { + "epoch": 2.4368737474949898, + "grad_norm": 19.582156078935594, + "learning_rate": 1.036162833555654e-06, + "loss": 1.7732, + "step": 12160 + }, + { + "epoch": 2.437074148296593, + "grad_norm": 14.587587667660726, + "learning_rate": 1.0354523063971556e-06, + "loss": 1.337, + "step": 12161 + }, + { + "epoch": 2.4372745490981966, + "grad_norm": 20.63872427942264, + "learning_rate": 1.0347419947969656e-06, + "loss": 1.6432, + "step": 12162 + }, + { + "epoch": 2.4374749498997996, + "grad_norm": 21.032072281218017, + "learning_rate": 1.0340318987937097e-06, + "loss": 1.9659, + "step": 12163 + }, + { + "epoch": 2.4376753507014026, + "grad_norm": 19.961764199056503, + "learning_rate": 1.033322018425993e-06, + "loss": 1.8405, + "step": 12164 + }, + { + "epoch": 2.437875751503006, + "grad_norm": 15.356400352066235, + "learning_rate": 1.032612353732414e-06, + "loss": 1.3381, + "step": 12165 + }, + { + "epoch": 2.438076152304609, + "grad_norm": 60.680460228388206, + "learning_rate": 1.0319029047515556e-06, + "loss": 1.7853, + "step": 12166 + }, + { + "epoch": 2.4382765531062125, + "grad_norm": 21.105539359122847, + "learning_rate": 1.031193671521995e-06, + "loss": 1.4203, + "step": 12167 + }, + { + "epoch": 2.4384769539078155, + "grad_norm": 17.262012235411355, + "learning_rate": 1.0304846540822939e-06, + "loss": 1.9485, + "step": 12168 + }, + { + "epoch": 2.438677354709419, + "grad_norm": 46.63075192267103, + "learning_rate": 1.0297758524709994e-06, + "loss": 1.7237, + "step": 12169 + }, + { + "epoch": 2.438877755511022, + "grad_norm": 19.9091932819619, + "learning_rate": 1.0290672667266548e-06, + "loss": 1.7761, + "step": 12170 + }, + { + "epoch": 2.4390781563126254, + "grad_norm": 24.856716783292992, + "learning_rate": 1.0283588968877834e-06, + "loss": 1.827, + "step": 12171 + }, + { + "epoch": 2.4392785571142284, + "grad_norm": 20.225191167113316, + "learning_rate": 1.0276507429929017e-06, + "loss": 1.7179, + "step": 12172 + }, + { + "epoch": 2.439478957915832, + "grad_norm": 25.930741972243624, + "learning_rate": 1.0269428050805124e-06, + "loss": 
1.4724, + "step": 12173 + }, + { + "epoch": 2.439679358717435, + "grad_norm": 23.654623000373252, + "learning_rate": 1.0262350831891076e-06, + "loss": 1.319, + "step": 12174 + }, + { + "epoch": 2.4398797595190382, + "grad_norm": 26.109850451304403, + "learning_rate": 1.0255275773571688e-06, + "loss": 1.6007, + "step": 12175 + }, + { + "epoch": 2.4400801603206412, + "grad_norm": 20.072594586295516, + "learning_rate": 1.0248202876231595e-06, + "loss": 1.3595, + "step": 12176 + }, + { + "epoch": 2.4402805611222447, + "grad_norm": 22.2272062968295, + "learning_rate": 1.0241132140255416e-06, + "loss": 1.5445, + "step": 12177 + }, + { + "epoch": 2.4404809619238477, + "grad_norm": 48.331206418497736, + "learning_rate": 1.0234063566027563e-06, + "loss": 1.732, + "step": 12178 + }, + { + "epoch": 2.440681362725451, + "grad_norm": 19.11008127031971, + "learning_rate": 1.0226997153932373e-06, + "loss": 1.3644, + "step": 12179 + }, + { + "epoch": 2.440881763527054, + "grad_norm": 17.574179390929793, + "learning_rate": 1.0219932904354058e-06, + "loss": 1.1801, + "step": 12180 + }, + { + "epoch": 2.441082164328657, + "grad_norm": 20.30740282558694, + "learning_rate": 1.0212870817676707e-06, + "loss": 1.7144, + "step": 12181 + }, + { + "epoch": 2.4412825651302605, + "grad_norm": 18.171056046645813, + "learning_rate": 1.0205810894284317e-06, + "loss": 1.488, + "step": 12182 + }, + { + "epoch": 2.4414829659318635, + "grad_norm": 23.480115780755273, + "learning_rate": 1.0198753134560708e-06, + "loss": 1.5733, + "step": 12183 + }, + { + "epoch": 2.441683366733467, + "grad_norm": 23.654987649630257, + "learning_rate": 1.0191697538889666e-06, + "loss": 1.2768, + "step": 12184 + }, + { + "epoch": 2.44188376753507, + "grad_norm": 24.145321995209372, + "learning_rate": 1.0184644107654778e-06, + "loss": 1.665, + "step": 12185 + }, + { + "epoch": 2.4420841683366734, + "grad_norm": 17.870844225327694, + "learning_rate": 1.0177592841239564e-06, + "loss": 1.1682, + "step": 12186 + }, + { + "epoch": 2.4422845691382764, + "grad_norm": 19.69045640948613, + "learning_rate": 1.017054374002741e-06, + "loss": 1.4026, + "step": 12187 + }, + { + "epoch": 2.44248496993988, + "grad_norm": 20.425103769331667, + "learning_rate": 1.0163496804401585e-06, + "loss": 1.3765, + "step": 12188 + }, + { + "epoch": 2.442685370741483, + "grad_norm": 22.39896692348511, + "learning_rate": 1.0156452034745246e-06, + "loss": 1.8101, + "step": 12189 + }, + { + "epoch": 2.4428857715430863, + "grad_norm": 26.5344919672937, + "learning_rate": 1.0149409431441421e-06, + "loss": 2.2286, + "step": 12190 + }, + { + "epoch": 2.4430861723446893, + "grad_norm": 23.77748016083761, + "learning_rate": 1.0142368994873043e-06, + "loss": 1.2478, + "step": 12191 + }, + { + "epoch": 2.4432865731462927, + "grad_norm": 14.292553047954165, + "learning_rate": 1.0135330725422887e-06, + "loss": 1.747, + "step": 12192 + }, + { + "epoch": 2.4434869739478957, + "grad_norm": 26.556812975890583, + "learning_rate": 1.0128294623473627e-06, + "loss": 1.4482, + "step": 12193 + }, + { + "epoch": 2.443687374749499, + "grad_norm": 18.49723310127984, + "learning_rate": 1.0121260689407876e-06, + "loss": 1.6236, + "step": 12194 + }, + { + "epoch": 2.443887775551102, + "grad_norm": 16.73757896788668, + "learning_rate": 1.0114228923608032e-06, + "loss": 1.2611, + "step": 12195 + }, + { + "epoch": 2.4440881763527056, + "grad_norm": 21.344163198656233, + "learning_rate": 1.010719932645644e-06, + "loss": 1.4294, + "step": 12196 + }, + { + "epoch": 2.4442885771543086, + "grad_norm": 
22.654519402345056, + "learning_rate": 1.0100171898335309e-06, + "loss": 1.1583, + "step": 12197 + }, + { + "epoch": 2.4444889779559116, + "grad_norm": 23.127683343915695, + "learning_rate": 1.0093146639626728e-06, + "loss": 1.6449, + "step": 12198 + }, + { + "epoch": 2.444689378757515, + "grad_norm": 20.051025519046824, + "learning_rate": 1.0086123550712672e-06, + "loss": 1.4548, + "step": 12199 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 22.44568556569999, + "learning_rate": 1.0079102631974997e-06, + "loss": 1.6215, + "step": 12200 + }, + { + "epoch": 2.4450901803607215, + "grad_norm": 40.699453308925506, + "learning_rate": 1.0072083883795453e-06, + "loss": 1.6238, + "step": 12201 + }, + { + "epoch": 2.4452905811623245, + "grad_norm": 17.577787641156768, + "learning_rate": 1.0065067306555638e-06, + "loss": 1.2656, + "step": 12202 + }, + { + "epoch": 2.445490981963928, + "grad_norm": 22.6334280696563, + "learning_rate": 1.0058052900637054e-06, + "loss": 1.4212, + "step": 12203 + }, + { + "epoch": 2.445691382765531, + "grad_norm": 34.82261298347702, + "learning_rate": 1.0051040666421103e-06, + "loss": 1.3657, + "step": 12204 + }, + { + "epoch": 2.4458917835671343, + "grad_norm": 19.097569379270016, + "learning_rate": 1.0044030604289036e-06, + "loss": 1.5744, + "step": 12205 + }, + { + "epoch": 2.4460921843687373, + "grad_norm": 30.91166506812792, + "learning_rate": 1.0037022714622008e-06, + "loss": 1.7701, + "step": 12206 + }, + { + "epoch": 2.4462925851703408, + "grad_norm": 28.135521664523697, + "learning_rate": 1.0030016997801039e-06, + "loss": 1.6981, + "step": 12207 + }, + { + "epoch": 2.4464929859719438, + "grad_norm": 24.81986058027028, + "learning_rate": 1.0023013454207071e-06, + "loss": 1.2485, + "step": 12208 + }, + { + "epoch": 2.446693386773547, + "grad_norm": 35.59615489507899, + "learning_rate": 1.001601208422085e-06, + "loss": 1.6617, + "step": 12209 + }, + { + "epoch": 2.44689378757515, + "grad_norm": 26.083541921547866, + "learning_rate": 1.0009012888223079e-06, + "loss": 1.5898, + "step": 12210 + }, + { + "epoch": 2.4470941883767536, + "grad_norm": 21.466275092439396, + "learning_rate": 1.0002015866594306e-06, + "loss": 1.4991, + "step": 12211 + }, + { + "epoch": 2.4472945891783566, + "grad_norm": 24.07118929789231, + "learning_rate": 9.99502101971498e-07, + "loss": 2.0112, + "step": 12212 + }, + { + "epoch": 2.44749498997996, + "grad_norm": 43.00369422415375, + "learning_rate": 9.988028347965407e-07, + "loss": 1.5073, + "step": 12213 + }, + { + "epoch": 2.447695390781563, + "grad_norm": 33.67256546008061, + "learning_rate": 9.981037851725795e-07, + "loss": 1.426, + "step": 12214 + }, + { + "epoch": 2.4478957915831665, + "grad_norm": 17.758888067163163, + "learning_rate": 9.974049531376245e-07, + "loss": 1.3447, + "step": 12215 + }, + { + "epoch": 2.4480961923847695, + "grad_norm": 22.664863979873342, + "learning_rate": 9.967063387296677e-07, + "loss": 1.594, + "step": 12216 + }, + { + "epoch": 2.448296593186373, + "grad_norm": 16.46603165486075, + "learning_rate": 9.960079419866985e-07, + "loss": 1.6644, + "step": 12217 + }, + { + "epoch": 2.448496993987976, + "grad_norm": 31.012989233088174, + "learning_rate": 9.95309762946689e-07, + "loss": 1.3125, + "step": 12218 + }, + { + "epoch": 2.448697394789579, + "grad_norm": 35.65586511464045, + "learning_rate": 9.946118016475975e-07, + "loss": 1.3221, + "step": 12219 + }, + { + "epoch": 2.4488977955911824, + "grad_norm": 32.22438672840813, + "learning_rate": 9.939140581273754e-07, + "loss": 1.5674, + "step": 12220 
+ }, + { + "epoch": 2.449098196392786, + "grad_norm": 23.988254448521666, + "learning_rate": 9.93216532423959e-07, + "loss": 1.4155, + "step": 12221 + }, + { + "epoch": 2.449298597194389, + "grad_norm": 19.299641782016746, + "learning_rate": 9.92519224575274e-07, + "loss": 1.5294, + "step": 12222 + }, + { + "epoch": 2.449498997995992, + "grad_norm": 22.71793773935844, + "learning_rate": 9.918221346192353e-07, + "loss": 1.3663, + "step": 12223 + }, + { + "epoch": 2.4496993987975952, + "grad_norm": 32.94373636165303, + "learning_rate": 9.911252625937429e-07, + "loss": 1.3497, + "step": 12224 + }, + { + "epoch": 2.4498997995991982, + "grad_norm": 19.445994996733102, + "learning_rate": 9.904286085366893e-07, + "loss": 1.4171, + "step": 12225 + }, + { + "epoch": 2.4501002004008017, + "grad_norm": 16.75105147252051, + "learning_rate": 9.897321724859481e-07, + "loss": 1.5179, + "step": 12226 + }, + { + "epoch": 2.4503006012024047, + "grad_norm": 26.99296580655827, + "learning_rate": 9.890359544793916e-07, + "loss": 1.6084, + "step": 12227 + }, + { + "epoch": 2.450501002004008, + "grad_norm": 25.19971819331405, + "learning_rate": 9.883399545548689e-07, + "loss": 1.4894, + "step": 12228 + }, + { + "epoch": 2.450701402805611, + "grad_norm": 20.42583065522289, + "learning_rate": 9.876441727502251e-07, + "loss": 1.9436, + "step": 12229 + }, + { + "epoch": 2.4509018036072145, + "grad_norm": 20.35555272823447, + "learning_rate": 9.869486091032904e-07, + "loss": 1.8368, + "step": 12230 + }, + { + "epoch": 2.4511022044088175, + "grad_norm": 29.33972672387358, + "learning_rate": 9.862532636518835e-07, + "loss": 1.5938, + "step": 12231 + }, + { + "epoch": 2.451302605210421, + "grad_norm": 19.029568337789307, + "learning_rate": 9.85558136433813e-07, + "loss": 1.4071, + "step": 12232 + }, + { + "epoch": 2.451503006012024, + "grad_norm": 23.49812727054054, + "learning_rate": 9.84863227486869e-07, + "loss": 1.5669, + "step": 12233 + }, + { + "epoch": 2.4517034068136274, + "grad_norm": 19.660999921635533, + "learning_rate": 9.841685368488418e-07, + "loss": 1.4325, + "step": 12234 + }, + { + "epoch": 2.4519038076152304, + "grad_norm": 22.37264584466045, + "learning_rate": 9.834740645574975e-07, + "loss": 2.012, + "step": 12235 + }, + { + "epoch": 2.452104208416834, + "grad_norm": 21.488124765883725, + "learning_rate": 9.827798106505975e-07, + "loss": 1.7868, + "step": 12236 + }, + { + "epoch": 2.452304609218437, + "grad_norm": 26.5216672046641, + "learning_rate": 9.820857751658891e-07, + "loss": 1.4705, + "step": 12237 + }, + { + "epoch": 2.4525050100200403, + "grad_norm": 18.82889164323714, + "learning_rate": 9.813919581411075e-07, + "loss": 1.4273, + "step": 12238 + }, + { + "epoch": 2.4527054108216433, + "grad_norm": 29.095077743535096, + "learning_rate": 9.80698359613979e-07, + "loss": 1.686, + "step": 12239 + }, + { + "epoch": 2.4529058116232463, + "grad_norm": 25.116090984043623, + "learning_rate": 9.80004979622211e-07, + "loss": 1.531, + "step": 12240 + }, + { + "epoch": 2.4531062124248497, + "grad_norm": 25.20860101147969, + "learning_rate": 9.793118182035087e-07, + "loss": 1.7899, + "step": 12241 + }, + { + "epoch": 2.4533066132264527, + "grad_norm": 46.612890939002604, + "learning_rate": 9.786188753955572e-07, + "loss": 1.7548, + "step": 12242 + }, + { + "epoch": 2.453507014028056, + "grad_norm": 27.51586100960105, + "learning_rate": 9.779261512360315e-07, + "loss": 1.5631, + "step": 12243 + }, + { + "epoch": 2.453707414829659, + "grad_norm": 23.666246302191603, + "learning_rate": 9.772336457626015e-07, 
+ "loss": 1.5979, + "step": 12244 + }, + { + "epoch": 2.4539078156312626, + "grad_norm": 18.965474074697532, + "learning_rate": 9.765413590129146e-07, + "loss": 1.7224, + "step": 12245 + }, + { + "epoch": 2.4541082164328656, + "grad_norm": 23.58285585693563, + "learning_rate": 9.758492910246142e-07, + "loss": 1.801, + "step": 12246 + }, + { + "epoch": 2.454308617234469, + "grad_norm": 19.517602864449593, + "learning_rate": 9.75157441835326e-07, + "loss": 1.5973, + "step": 12247 + }, + { + "epoch": 2.454509018036072, + "grad_norm": 31.130685260014623, + "learning_rate": 9.7446581148267e-07, + "loss": 1.7063, + "step": 12248 + }, + { + "epoch": 2.4547094188376755, + "grad_norm": 22.640465695033335, + "learning_rate": 9.73774400004251e-07, + "loss": 1.3973, + "step": 12249 + }, + { + "epoch": 2.4549098196392785, + "grad_norm": 15.258939597177704, + "learning_rate": 9.73083207437659e-07, + "loss": 1.4431, + "step": 12250 + }, + { + "epoch": 2.455110220440882, + "grad_norm": 20.988524825040415, + "learning_rate": 9.723922338204794e-07, + "loss": 1.7437, + "step": 12251 + }, + { + "epoch": 2.455310621242485, + "grad_norm": 19.169784907004573, + "learning_rate": 9.717014791902785e-07, + "loss": 1.4378, + "step": 12252 + }, + { + "epoch": 2.4555110220440883, + "grad_norm": 27.25009186207161, + "learning_rate": 9.710109435846144e-07, + "loss": 1.7203, + "step": 12253 + }, + { + "epoch": 2.4557114228456913, + "grad_norm": 44.06497306959463, + "learning_rate": 9.703206270410326e-07, + "loss": 1.7355, + "step": 12254 + }, + { + "epoch": 2.4559118236472948, + "grad_norm": 18.55102575325939, + "learning_rate": 9.696305295970664e-07, + "loss": 1.6074, + "step": 12255 + }, + { + "epoch": 2.4561122244488978, + "grad_norm": 65.15517082610083, + "learning_rate": 9.689406512902394e-07, + "loss": 1.8731, + "step": 12256 + }, + { + "epoch": 2.4563126252505008, + "grad_norm": 26.528933284665513, + "learning_rate": 9.682509921580568e-07, + "loss": 1.8116, + "step": 12257 + }, + { + "epoch": 2.456513026052104, + "grad_norm": 21.05060203645371, + "learning_rate": 9.675615522380217e-07, + "loss": 1.8323, + "step": 12258 + }, + { + "epoch": 2.4567134268537076, + "grad_norm": 27.331984464980103, + "learning_rate": 9.66872331567616e-07, + "loss": 1.7488, + "step": 12259 + }, + { + "epoch": 2.4569138276553106, + "grad_norm": 27.242041095945932, + "learning_rate": 9.661833301843155e-07, + "loss": 1.9514, + "step": 12260 + }, + { + "epoch": 2.4571142284569136, + "grad_norm": 26.518390909822276, + "learning_rate": 9.654945481255811e-07, + "loss": 2.242, + "step": 12261 + }, + { + "epoch": 2.457314629258517, + "grad_norm": 27.382233771807517, + "learning_rate": 9.648059854288643e-07, + "loss": 1.6784, + "step": 12262 + }, + { + "epoch": 2.45751503006012, + "grad_norm": 25.741530025620918, + "learning_rate": 9.641176421316034e-07, + "loss": 1.5423, + "step": 12263 + }, + { + "epoch": 2.4577154308617235, + "grad_norm": 16.721503734289485, + "learning_rate": 9.634295182712217e-07, + "loss": 1.2723, + "step": 12264 + }, + { + "epoch": 2.4579158316633265, + "grad_norm": 20.44597427930691, + "learning_rate": 9.627416138851375e-07, + "loss": 2.2291, + "step": 12265 + }, + { + "epoch": 2.45811623246493, + "grad_norm": 20.670376918313238, + "learning_rate": 9.620539290107505e-07, + "loss": 1.913, + "step": 12266 + }, + { + "epoch": 2.458316633266533, + "grad_norm": 20.44953754779304, + "learning_rate": 9.61366463685452e-07, + "loss": 1.5392, + "step": 12267 + }, + { + "epoch": 2.4585170340681364, + "grad_norm": 25.269924249089474, + 
"learning_rate": 9.606792179466202e-07, + "loss": 1.5158, + "step": 12268 + }, + { + "epoch": 2.4587174348697394, + "grad_norm": 16.292601055105646, + "learning_rate": 9.599921918316219e-07, + "loss": 1.3028, + "step": 12269 + }, + { + "epoch": 2.458917835671343, + "grad_norm": 18.04040907010513, + "learning_rate": 9.593053853778117e-07, + "loss": 1.319, + "step": 12270 + }, + { + "epoch": 2.459118236472946, + "grad_norm": 21.29762291840556, + "learning_rate": 9.586187986225326e-07, + "loss": 1.5132, + "step": 12271 + }, + { + "epoch": 2.4593186372745492, + "grad_norm": 23.196495329287508, + "learning_rate": 9.579324316031162e-07, + "loss": 1.4451, + "step": 12272 + }, + { + "epoch": 2.4595190380761522, + "grad_norm": 20.856992009648103, + "learning_rate": 9.572462843568786e-07, + "loss": 1.2215, + "step": 12273 + }, + { + "epoch": 2.4597194388777557, + "grad_norm": 27.780239582795147, + "learning_rate": 9.565603569211274e-07, + "loss": 1.6501, + "step": 12274 + }, + { + "epoch": 2.4599198396793587, + "grad_norm": 27.43191609280109, + "learning_rate": 9.558746493331606e-07, + "loss": 1.6743, + "step": 12275 + }, + { + "epoch": 2.460120240480962, + "grad_norm": 18.84938316722032, + "learning_rate": 9.551891616302573e-07, + "loss": 1.477, + "step": 12276 + }, + { + "epoch": 2.460320641282565, + "grad_norm": 24.423006197926284, + "learning_rate": 9.545038938496898e-07, + "loss": 1.8555, + "step": 12277 + }, + { + "epoch": 2.460521042084168, + "grad_norm": 19.029228601584695, + "learning_rate": 9.538188460287179e-07, + "loss": 1.2877, + "step": 12278 + }, + { + "epoch": 2.4607214428857715, + "grad_norm": 31.220934520624514, + "learning_rate": 9.531340182045879e-07, + "loss": 1.3944, + "step": 12279 + }, + { + "epoch": 2.460921843687375, + "grad_norm": 31.045332123935236, + "learning_rate": 9.52449410414536e-07, + "loss": 1.4618, + "step": 12280 + }, + { + "epoch": 2.461122244488978, + "grad_norm": 20.03334718576526, + "learning_rate": 9.51765022695782e-07, + "loss": 1.8256, + "step": 12281 + }, + { + "epoch": 2.461322645290581, + "grad_norm": 18.781506308713283, + "learning_rate": 9.510808550855422e-07, + "loss": 1.3714, + "step": 12282 + }, + { + "epoch": 2.4615230460921844, + "grad_norm": 28.717863502706034, + "learning_rate": 9.503969076210112e-07, + "loss": 1.818, + "step": 12283 + }, + { + "epoch": 2.4617234468937874, + "grad_norm": 19.850987155138924, + "learning_rate": 9.497131803393788e-07, + "loss": 1.6301, + "step": 12284 + }, + { + "epoch": 2.461923847695391, + "grad_norm": 22.2894658616596, + "learning_rate": 9.490296732778193e-07, + "loss": 1.8232, + "step": 12285 + }, + { + "epoch": 2.462124248496994, + "grad_norm": 17.185896526816542, + "learning_rate": 9.48346386473496e-07, + "loss": 1.1497, + "step": 12286 + }, + { + "epoch": 2.4623246492985973, + "grad_norm": 26.309003698577666, + "learning_rate": 9.476633199635604e-07, + "loss": 1.6211, + "step": 12287 + }, + { + "epoch": 2.4625250501002003, + "grad_norm": 23.664371183169376, + "learning_rate": 9.469804737851523e-07, + "loss": 1.4248, + "step": 12288 + }, + { + "epoch": 2.4627254509018037, + "grad_norm": 22.517871020452173, + "learning_rate": 9.462978479753998e-07, + "loss": 1.9453, + "step": 12289 + }, + { + "epoch": 2.4629258517034067, + "grad_norm": 30.45786139186895, + "learning_rate": 9.456154425714159e-07, + "loss": 1.1535, + "step": 12290 + }, + { + "epoch": 2.46312625250501, + "grad_norm": 21.075423294525137, + "learning_rate": 9.449332576103048e-07, + "loss": 1.2628, + "step": 12291 + }, + { + "epoch": 
2.463326653306613, + "grad_norm": 24.04184818154469, + "learning_rate": 9.442512931291587e-07, + "loss": 1.3137, + "step": 12292 + }, + { + "epoch": 2.4635270541082166, + "grad_norm": 22.70505903220878, + "learning_rate": 9.435695491650565e-07, + "loss": 1.7194, + "step": 12293 + }, + { + "epoch": 2.4637274549098196, + "grad_norm": 35.37321296731262, + "learning_rate": 9.42888025755066e-07, + "loss": 2.0757, + "step": 12294 + }, + { + "epoch": 2.463927855711423, + "grad_norm": 24.709027325689593, + "learning_rate": 9.422067229362419e-07, + "loss": 2.0347, + "step": 12295 + }, + { + "epoch": 2.464128256513026, + "grad_norm": 19.671693764086932, + "learning_rate": 9.415256407456302e-07, + "loss": 1.9117, + "step": 12296 + }, + { + "epoch": 2.4643286573146295, + "grad_norm": 61.18430973442838, + "learning_rate": 9.408447792202585e-07, + "loss": 1.9178, + "step": 12297 + }, + { + "epoch": 2.4645290581162325, + "grad_norm": 20.02291464935151, + "learning_rate": 9.401641383971478e-07, + "loss": 1.2922, + "step": 12298 + }, + { + "epoch": 2.4647294589178355, + "grad_norm": 29.600149409006246, + "learning_rate": 9.39483718313306e-07, + "loss": 1.792, + "step": 12299 + }, + { + "epoch": 2.464929859719439, + "grad_norm": 27.388747559379624, + "learning_rate": 9.388035190057282e-07, + "loss": 1.5407, + "step": 12300 + }, + { + "epoch": 2.465130260521042, + "grad_norm": 37.911390291741064, + "learning_rate": 9.381235405113981e-07, + "loss": 1.795, + "step": 12301 + }, + { + "epoch": 2.4653306613226453, + "grad_norm": 17.29171916342202, + "learning_rate": 9.374437828672867e-07, + "loss": 1.9195, + "step": 12302 + }, + { + "epoch": 2.4655310621242483, + "grad_norm": 21.797169441695328, + "learning_rate": 9.367642461103555e-07, + "loss": 1.612, + "step": 12303 + }, + { + "epoch": 2.4657314629258518, + "grad_norm": 15.82202762329236, + "learning_rate": 9.360849302775471e-07, + "loss": 1.4622, + "step": 12304 + }, + { + "epoch": 2.4659318637274548, + "grad_norm": 39.65399085900512, + "learning_rate": 9.354058354058016e-07, + "loss": 1.7573, + "step": 12305 + }, + { + "epoch": 2.466132264529058, + "grad_norm": 19.957357047351373, + "learning_rate": 9.347269615320415e-07, + "loss": 1.4174, + "step": 12306 + }, + { + "epoch": 2.466332665330661, + "grad_norm": 22.75844488794899, + "learning_rate": 9.340483086931762e-07, + "loss": 2.1009, + "step": 12307 + }, + { + "epoch": 2.4665330661322646, + "grad_norm": 19.50920786744328, + "learning_rate": 9.333698769261068e-07, + "loss": 1.3616, + "step": 12308 + }, + { + "epoch": 2.4667334669338676, + "grad_norm": 18.021057820911007, + "learning_rate": 9.326916662677193e-07, + "loss": 1.2179, + "step": 12309 + }, + { + "epoch": 2.466933867735471, + "grad_norm": 29.919964340653117, + "learning_rate": 9.320136767548904e-07, + "loss": 1.514, + "step": 12310 + }, + { + "epoch": 2.467134268537074, + "grad_norm": 20.047179163763577, + "learning_rate": 9.313359084244827e-07, + "loss": 1.4595, + "step": 12311 + }, + { + "epoch": 2.4673346693386775, + "grad_norm": 22.666150867936114, + "learning_rate": 9.306583613133474e-07, + "loss": 1.4889, + "step": 12312 + }, + { + "epoch": 2.4675350701402805, + "grad_norm": 22.728889135472272, + "learning_rate": 9.299810354583255e-07, + "loss": 1.5815, + "step": 12313 + }, + { + "epoch": 2.467735470941884, + "grad_norm": 51.98224170017112, + "learning_rate": 9.293039308962393e-07, + "loss": 1.4869, + "step": 12314 + }, + { + "epoch": 2.467935871743487, + "grad_norm": 30.293390958167222, + "learning_rate": 9.286270476639103e-07, + "loss": 
1.0764, + "step": 12315 + }, + { + "epoch": 2.46813627254509, + "grad_norm": 19.442856685357533, + "learning_rate": 9.279503857981376e-07, + "loss": 1.2859, + "step": 12316 + }, + { + "epoch": 2.4683366733466934, + "grad_norm": 25.808523495488902, + "learning_rate": 9.272739453357133e-07, + "loss": 1.4409, + "step": 12317 + }, + { + "epoch": 2.468537074148297, + "grad_norm": 25.645781440920906, + "learning_rate": 9.265977263134162e-07, + "loss": 1.5891, + "step": 12318 + }, + { + "epoch": 2.4687374749499, + "grad_norm": 24.638118723804613, + "learning_rate": 9.259217287680133e-07, + "loss": 2.0509, + "step": 12319 + }, + { + "epoch": 2.468937875751503, + "grad_norm": 21.981250580642275, + "learning_rate": 9.252459527362618e-07, + "loss": 1.5492, + "step": 12320 + }, + { + "epoch": 2.4691382765531062, + "grad_norm": 21.472266427005984, + "learning_rate": 9.245703982548998e-07, + "loss": 1.339, + "step": 12321 + }, + { + "epoch": 2.4693386773547092, + "grad_norm": 26.585236727490337, + "learning_rate": 9.238950653606637e-07, + "loss": 1.5854, + "step": 12322 + }, + { + "epoch": 2.4695390781563127, + "grad_norm": 43.71773023475868, + "learning_rate": 9.232199540902687e-07, + "loss": 1.6476, + "step": 12323 + }, + { + "epoch": 2.4697394789579157, + "grad_norm": 20.255631283478035, + "learning_rate": 9.22545064480423e-07, + "loss": 1.2133, + "step": 12324 + }, + { + "epoch": 2.469939879759519, + "grad_norm": 22.602402623571862, + "learning_rate": 9.218703965678205e-07, + "loss": 1.2518, + "step": 12325 + }, + { + "epoch": 2.470140280561122, + "grad_norm": 23.390909280449495, + "learning_rate": 9.211959503891443e-07, + "loss": 1.5226, + "step": 12326 + }, + { + "epoch": 2.4703406813627256, + "grad_norm": 22.387463316364443, + "learning_rate": 9.205217259810668e-07, + "loss": 1.4817, + "step": 12327 + }, + { + "epoch": 2.4705410821643286, + "grad_norm": 20.077634590855983, + "learning_rate": 9.198477233802422e-07, + "loss": 1.5264, + "step": 12328 + }, + { + "epoch": 2.470741482965932, + "grad_norm": 22.04721532014212, + "learning_rate": 9.191739426233226e-07, + "loss": 1.7654, + "step": 12329 + }, + { + "epoch": 2.470941883767535, + "grad_norm": 19.332269247444245, + "learning_rate": 9.185003837469375e-07, + "loss": 1.1633, + "step": 12330 + }, + { + "epoch": 2.4711422845691384, + "grad_norm": 24.834229006465538, + "learning_rate": 9.17827046787711e-07, + "loss": 1.8492, + "step": 12331 + }, + { + "epoch": 2.4713426853707414, + "grad_norm": 20.165478505027494, + "learning_rate": 9.171539317822558e-07, + "loss": 1.8119, + "step": 12332 + }, + { + "epoch": 2.471543086172345, + "grad_norm": 15.795762563198743, + "learning_rate": 9.164810387671669e-07, + "loss": 1.2666, + "step": 12333 + }, + { + "epoch": 2.471743486973948, + "grad_norm": 28.701286415385926, + "learning_rate": 9.158083677790319e-07, + "loss": 1.7003, + "step": 12334 + }, + { + "epoch": 2.4719438877755513, + "grad_norm": 16.975766301974108, + "learning_rate": 9.151359188544251e-07, + "loss": 1.0721, + "step": 12335 + }, + { + "epoch": 2.4721442885771543, + "grad_norm": 25.700914170974077, + "learning_rate": 9.144636920299077e-07, + "loss": 1.6708, + "step": 12336 + }, + { + "epoch": 2.4723446893787573, + "grad_norm": 18.32950064203271, + "learning_rate": 9.137916873420316e-07, + "loss": 1.2405, + "step": 12337 + }, + { + "epoch": 2.4725450901803607, + "grad_norm": 40.94181167903244, + "learning_rate": 9.131199048273309e-07, + "loss": 1.6523, + "step": 12338 + }, + { + "epoch": 2.472745490981964, + "grad_norm": 31.92828841689489, + 
"learning_rate": 9.124483445223364e-07, + "loss": 1.4268, + "step": 12339 + }, + { + "epoch": 2.472945891783567, + "grad_norm": 19.616505283280798, + "learning_rate": 9.117770064635572e-07, + "loss": 1.0911, + "step": 12340 + }, + { + "epoch": 2.47314629258517, + "grad_norm": 19.803737248465726, + "learning_rate": 9.111058906874976e-07, + "loss": 1.4724, + "step": 12341 + }, + { + "epoch": 2.4733466933867736, + "grad_norm": 24.32608435048232, + "learning_rate": 9.104349972306464e-07, + "loss": 1.9372, + "step": 12342 + }, + { + "epoch": 2.4735470941883766, + "grad_norm": 27.01633475310597, + "learning_rate": 9.097643261294809e-07, + "loss": 1.3282, + "step": 12343 + }, + { + "epoch": 2.47374749498998, + "grad_norm": 16.977784403255118, + "learning_rate": 9.090938774204682e-07, + "loss": 1.23, + "step": 12344 + }, + { + "epoch": 2.473947895791583, + "grad_norm": 26.922565890792526, + "learning_rate": 9.084236511400574e-07, + "loss": 1.2618, + "step": 12345 + }, + { + "epoch": 2.4741482965931865, + "grad_norm": 19.638007027367177, + "learning_rate": 9.07753647324695e-07, + "loss": 1.3619, + "step": 12346 + }, + { + "epoch": 2.4743486973947895, + "grad_norm": 22.27633769192266, + "learning_rate": 9.070838660108066e-07, + "loss": 1.573, + "step": 12347 + }, + { + "epoch": 2.474549098196393, + "grad_norm": 22.279624550527014, + "learning_rate": 9.064143072348097e-07, + "loss": 1.6413, + "step": 12348 + }, + { + "epoch": 2.474749498997996, + "grad_norm": 21.277058764332157, + "learning_rate": 9.057449710331095e-07, + "loss": 1.7171, + "step": 12349 + }, + { + "epoch": 2.4749498997995993, + "grad_norm": 28.130397163555575, + "learning_rate": 9.050758574420992e-07, + "loss": 1.7906, + "step": 12350 + }, + { + "epoch": 2.4751503006012023, + "grad_norm": 18.572162646242486, + "learning_rate": 9.044069664981602e-07, + "loss": 1.7444, + "step": 12351 + }, + { + "epoch": 2.4753507014028058, + "grad_norm": 15.865733576263343, + "learning_rate": 9.03738298237658e-07, + "loss": 1.167, + "step": 12352 + }, + { + "epoch": 2.4755511022044088, + "grad_norm": 17.250545636125512, + "learning_rate": 9.030698526969533e-07, + "loss": 1.5106, + "step": 12353 + }, + { + "epoch": 2.475751503006012, + "grad_norm": 24.540834828002065, + "learning_rate": 9.024016299123873e-07, + "loss": 1.8512, + "step": 12354 + }, + { + "epoch": 2.475951903807615, + "grad_norm": 20.68121580263496, + "learning_rate": 9.017336299202917e-07, + "loss": 2.024, + "step": 12355 + }, + { + "epoch": 2.4761523046092186, + "grad_norm": 20.98937125552195, + "learning_rate": 9.01065852756991e-07, + "loss": 1.4878, + "step": 12356 + }, + { + "epoch": 2.4763527054108216, + "grad_norm": 53.764743781956184, + "learning_rate": 9.003982984587889e-07, + "loss": 1.4918, + "step": 12357 + }, + { + "epoch": 2.4765531062124246, + "grad_norm": 18.964324199785846, + "learning_rate": 8.997309670619836e-07, + "loss": 1.4065, + "step": 12358 + }, + { + "epoch": 2.476753507014028, + "grad_norm": 19.400631683601798, + "learning_rate": 8.990638586028577e-07, + "loss": 1.6875, + "step": 12359 + }, + { + "epoch": 2.476953907815631, + "grad_norm": 26.29674126661859, + "learning_rate": 8.983969731176833e-07, + "loss": 1.5383, + "step": 12360 + }, + { + "epoch": 2.4771543086172345, + "grad_norm": 47.199524445623325, + "learning_rate": 8.97730310642721e-07, + "loss": 1.8781, + "step": 12361 + }, + { + "epoch": 2.4773547094188375, + "grad_norm": 19.528493740693023, + "learning_rate": 8.970638712142149e-07, + "loss": 1.5506, + "step": 12362 + }, + { + "epoch": 
2.477555110220441, + "grad_norm": 23.58360230084048, + "learning_rate": 8.963976548684056e-07, + "loss": 1.6295, + "step": 12363 + }, + { + "epoch": 2.477755511022044, + "grad_norm": 19.725669578468324, + "learning_rate": 8.957316616415119e-07, + "loss": 1.4966, + "step": 12364 + }, + { + "epoch": 2.4779559118236474, + "grad_norm": 49.99215086590738, + "learning_rate": 8.950658915697458e-07, + "loss": 1.579, + "step": 12365 + }, + { + "epoch": 2.4781563126252504, + "grad_norm": 17.257439344281778, + "learning_rate": 8.944003446893068e-07, + "loss": 1.5404, + "step": 12366 + }, + { + "epoch": 2.478356713426854, + "grad_norm": 20.85062153968477, + "learning_rate": 8.93735021036381e-07, + "loss": 0.966, + "step": 12367 + }, + { + "epoch": 2.478557114228457, + "grad_norm": 30.972194217115014, + "learning_rate": 8.930699206471455e-07, + "loss": 1.3814, + "step": 12368 + }, + { + "epoch": 2.4787575150300603, + "grad_norm": 18.261176817842877, + "learning_rate": 8.924050435577581e-07, + "loss": 1.5951, + "step": 12369 + }, + { + "epoch": 2.4789579158316633, + "grad_norm": 22.666054186148603, + "learning_rate": 8.917403898043742e-07, + "loss": 1.6406, + "step": 12370 + }, + { + "epoch": 2.4791583166332667, + "grad_norm": 62.36287220855272, + "learning_rate": 8.910759594231283e-07, + "loss": 1.4827, + "step": 12371 + }, + { + "epoch": 2.4793587174348697, + "grad_norm": 21.19407153370653, + "learning_rate": 8.904117524501482e-07, + "loss": 1.6163, + "step": 12372 + }, + { + "epoch": 2.479559118236473, + "grad_norm": 22.473945033505444, + "learning_rate": 8.897477689215473e-07, + "loss": 1.7992, + "step": 12373 + }, + { + "epoch": 2.479759519038076, + "grad_norm": 24.249989882728197, + "learning_rate": 8.89084008873427e-07, + "loss": 1.2085, + "step": 12374 + }, + { + "epoch": 2.479959919839679, + "grad_norm": 21.127332848372127, + "learning_rate": 8.884204723418782e-07, + "loss": 1.4728, + "step": 12375 + }, + { + "epoch": 2.4801603206412826, + "grad_norm": 19.68652037015321, + "learning_rate": 8.877571593629769e-07, + "loss": 1.3521, + "step": 12376 + }, + { + "epoch": 2.480360721442886, + "grad_norm": 54.30872450791753, + "learning_rate": 8.870940699727909e-07, + "loss": 1.4702, + "step": 12377 + }, + { + "epoch": 2.480561122244489, + "grad_norm": 16.889560032791593, + "learning_rate": 8.864312042073698e-07, + "loss": 1.5281, + "step": 12378 + }, + { + "epoch": 2.480761523046092, + "grad_norm": 29.581254017419848, + "learning_rate": 8.857685621027568e-07, + "loss": 1.5591, + "step": 12379 + }, + { + "epoch": 2.4809619238476954, + "grad_norm": 19.596613924318746, + "learning_rate": 8.851061436949798e-07, + "loss": 1.5737, + "step": 12380 + }, + { + "epoch": 2.4811623246492984, + "grad_norm": 23.116136842803122, + "learning_rate": 8.844439490200557e-07, + "loss": 1.6997, + "step": 12381 + }, + { + "epoch": 2.481362725450902, + "grad_norm": 21.94050800115619, + "learning_rate": 8.837819781139894e-07, + "loss": 1.4559, + "step": 12382 + }, + { + "epoch": 2.481563126252505, + "grad_norm": 30.193992674930634, + "learning_rate": 8.831202310127734e-07, + "loss": 2.0396, + "step": 12383 + }, + { + "epoch": 2.4817635270541083, + "grad_norm": 23.176767235559883, + "learning_rate": 8.824587077523888e-07, + "loss": 1.6672, + "step": 12384 + }, + { + "epoch": 2.4819639278557113, + "grad_norm": 18.124210068691237, + "learning_rate": 8.817974083687991e-07, + "loss": 1.3929, + "step": 12385 + }, + { + "epoch": 2.4821643286573147, + "grad_norm": 21.49079032863527, + "learning_rate": 8.811363328979656e-07, + "loss": 
1.7346, + "step": 12386 + }, + { + "epoch": 2.4823647294589177, + "grad_norm": 25.667285688139167, + "learning_rate": 8.804754813758299e-07, + "loss": 1.9535, + "step": 12387 + }, + { + "epoch": 2.482565130260521, + "grad_norm": 36.460005775874166, + "learning_rate": 8.798148538383228e-07, + "loss": 1.8759, + "step": 12388 + }, + { + "epoch": 2.482765531062124, + "grad_norm": 21.950221055333994, + "learning_rate": 8.791544503213634e-07, + "loss": 1.813, + "step": 12389 + }, + { + "epoch": 2.4829659318637276, + "grad_norm": 21.031589662683523, + "learning_rate": 8.784942708608601e-07, + "loss": 2.1433, + "step": 12390 + }, + { + "epoch": 2.4831663326653306, + "grad_norm": 22.47325961321572, + "learning_rate": 8.778343154927066e-07, + "loss": 2.0233, + "step": 12391 + }, + { + "epoch": 2.483366733466934, + "grad_norm": 19.883283467164727, + "learning_rate": 8.771745842527868e-07, + "loss": 1.706, + "step": 12392 + }, + { + "epoch": 2.483567134268537, + "grad_norm": 25.484861867092945, + "learning_rate": 8.765150771769703e-07, + "loss": 1.4604, + "step": 12393 + }, + { + "epoch": 2.4837675350701405, + "grad_norm": 27.31468828060644, + "learning_rate": 8.75855794301117e-07, + "loss": 1.7504, + "step": 12394 + }, + { + "epoch": 2.4839679358717435, + "grad_norm": 17.952951148699228, + "learning_rate": 8.751967356610713e-07, + "loss": 1.454, + "step": 12395 + }, + { + "epoch": 2.4841683366733465, + "grad_norm": 20.085815427825462, + "learning_rate": 8.745379012926675e-07, + "loss": 1.53, + "step": 12396 + }, + { + "epoch": 2.48436873747495, + "grad_norm": 22.98714893956412, + "learning_rate": 8.738792912317278e-07, + "loss": 1.836, + "step": 12397 + }, + { + "epoch": 2.4845691382765533, + "grad_norm": 29.14421433544696, + "learning_rate": 8.732209055140617e-07, + "loss": 1.7874, + "step": 12398 + }, + { + "epoch": 2.4847695390781563, + "grad_norm": 22.981260490406306, + "learning_rate": 8.725627441754663e-07, + "loss": 2.0644, + "step": 12399 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 20.066329034103724, + "learning_rate": 8.719048072517277e-07, + "loss": 1.9013, + "step": 12400 + }, + { + "epoch": 2.4851703406813628, + "grad_norm": 21.396202815158883, + "learning_rate": 8.712470947786194e-07, + "loss": 1.5898, + "step": 12401 + }, + { + "epoch": 2.4853707414829658, + "grad_norm": 22.92008227792179, + "learning_rate": 8.705896067918984e-07, + "loss": 1.666, + "step": 12402 + }, + { + "epoch": 2.485571142284569, + "grad_norm": 26.373028622952788, + "learning_rate": 8.699323433273188e-07, + "loss": 1.3676, + "step": 12403 + }, + { + "epoch": 2.485771543086172, + "grad_norm": 20.968059702879554, + "learning_rate": 8.692753044206131e-07, + "loss": 1.1171, + "step": 12404 + }, + { + "epoch": 2.4859719438877756, + "grad_norm": 21.674646088636752, + "learning_rate": 8.686184901075068e-07, + "loss": 1.5035, + "step": 12405 + }, + { + "epoch": 2.4861723446893786, + "grad_norm": 22.66283655525349, + "learning_rate": 8.679619004237111e-07, + "loss": 2.0675, + "step": 12406 + }, + { + "epoch": 2.486372745490982, + "grad_norm": 22.880577675516104, + "learning_rate": 8.673055354049264e-07, + "loss": 1.8498, + "step": 12407 + }, + { + "epoch": 2.486573146292585, + "grad_norm": 22.056741462632676, + "learning_rate": 8.666493950868421e-07, + "loss": 2.0899, + "step": 12408 + }, + { + "epoch": 2.4867735470941885, + "grad_norm": 16.218408140013434, + "learning_rate": 8.659934795051283e-07, + "loss": 1.6357, + "step": 12409 + }, + { + "epoch": 2.4869739478957915, + "grad_norm": 21.140830585125588, + 
"learning_rate": 8.653377886954545e-07, + "loss": 1.4694, + "step": 12410 + }, + { + "epoch": 2.487174348697395, + "grad_norm": 22.569673413204654, + "learning_rate": 8.646823226934664e-07, + "loss": 1.8575, + "step": 12411 + }, + { + "epoch": 2.487374749498998, + "grad_norm": 21.409854143802534, + "learning_rate": 8.64027081534805e-07, + "loss": 1.4989, + "step": 12412 + }, + { + "epoch": 2.4875751503006014, + "grad_norm": 20.069519625678204, + "learning_rate": 8.63372065255097e-07, + "loss": 1.5412, + "step": 12413 + }, + { + "epoch": 2.4877755511022044, + "grad_norm": 24.744265571176452, + "learning_rate": 8.627172738899553e-07, + "loss": 1.6837, + "step": 12414 + }, + { + "epoch": 2.487975951903808, + "grad_norm": 21.148885881213246, + "learning_rate": 8.620627074749843e-07, + "loss": 2.0318, + "step": 12415 + }, + { + "epoch": 2.488176352705411, + "grad_norm": 19.86937344095692, + "learning_rate": 8.614083660457695e-07, + "loss": 1.5773, + "step": 12416 + }, + { + "epoch": 2.488376753507014, + "grad_norm": 14.38102630221559, + "learning_rate": 8.607542496378923e-07, + "loss": 1.4396, + "step": 12417 + }, + { + "epoch": 2.4885771543086173, + "grad_norm": 18.873059538786762, + "learning_rate": 8.601003582869178e-07, + "loss": 1.6234, + "step": 12418 + }, + { + "epoch": 2.4887775551102203, + "grad_norm": 18.142016608000027, + "learning_rate": 8.59446692028395e-07, + "loss": 1.2407, + "step": 12419 + }, + { + "epoch": 2.4889779559118237, + "grad_norm": 25.213689589774912, + "learning_rate": 8.587932508978703e-07, + "loss": 1.5828, + "step": 12420 + }, + { + "epoch": 2.4891783567134267, + "grad_norm": 31.596226064118177, + "learning_rate": 8.581400349308683e-07, + "loss": 1.4257, + "step": 12421 + }, + { + "epoch": 2.48937875751503, + "grad_norm": 21.259856072915316, + "learning_rate": 8.574870441629068e-07, + "loss": 0.9164, + "step": 12422 + }, + { + "epoch": 2.489579158316633, + "grad_norm": 23.84553182826637, + "learning_rate": 8.568342786294892e-07, + "loss": 1.4153, + "step": 12423 + }, + { + "epoch": 2.4897795591182366, + "grad_norm": 18.697055762085483, + "learning_rate": 8.561817383661081e-07, + "loss": 1.187, + "step": 12424 + }, + { + "epoch": 2.4899799599198396, + "grad_norm": 21.826265208904623, + "learning_rate": 8.55529423408244e-07, + "loss": 1.3234, + "step": 12425 + }, + { + "epoch": 2.490180360721443, + "grad_norm": 17.952922199397698, + "learning_rate": 8.548773337913602e-07, + "loss": 1.5758, + "step": 12426 + }, + { + "epoch": 2.490380761523046, + "grad_norm": 20.924273998217323, + "learning_rate": 8.542254695509172e-07, + "loss": 2.1794, + "step": 12427 + }, + { + "epoch": 2.4905811623246494, + "grad_norm": 26.173080977616717, + "learning_rate": 8.535738307223534e-07, + "loss": 1.509, + "step": 12428 + }, + { + "epoch": 2.4907815631262524, + "grad_norm": 20.023537658422335, + "learning_rate": 8.529224173411016e-07, + "loss": 1.7602, + "step": 12429 + }, + { + "epoch": 2.490981963927856, + "grad_norm": 17.825358510204303, + "learning_rate": 8.522712294425795e-07, + "loss": 1.3209, + "step": 12430 + }, + { + "epoch": 2.491182364729459, + "grad_norm": 20.609092949930613, + "learning_rate": 8.516202670621937e-07, + "loss": 1.3159, + "step": 12431 + }, + { + "epoch": 2.4913827655310623, + "grad_norm": 31.619150431211832, + "learning_rate": 8.509695302353382e-07, + "loss": 1.4538, + "step": 12432 + }, + { + "epoch": 2.4915831663326653, + "grad_norm": 23.3884028289314, + "learning_rate": 8.503190189973915e-07, + "loss": 1.8667, + "step": 12433 + }, + { + "epoch": 
2.4917835671342683, + "grad_norm": 22.523428489063043, + "learning_rate": 8.496687333837278e-07, + "loss": 1.5296, + "step": 12434 + }, + { + "epoch": 2.4919839679358717, + "grad_norm": 25.615989242889235, + "learning_rate": 8.490186734297002e-07, + "loss": 2.2677, + "step": 12435 + }, + { + "epoch": 2.492184368737475, + "grad_norm": 26.80277632723529, + "learning_rate": 8.483688391706546e-07, + "loss": 1.3271, + "step": 12436 + }, + { + "epoch": 2.492384769539078, + "grad_norm": 20.94567450078547, + "learning_rate": 8.477192306419241e-07, + "loss": 1.7322, + "step": 12437 + }, + { + "epoch": 2.492585170340681, + "grad_norm": 23.64209702292191, + "learning_rate": 8.47069847878828e-07, + "loss": 1.257, + "step": 12438 + }, + { + "epoch": 2.4927855711422846, + "grad_norm": 19.955303568184473, + "learning_rate": 8.464206909166755e-07, + "loss": 1.2269, + "step": 12439 + }, + { + "epoch": 2.4929859719438876, + "grad_norm": 17.822863664308628, + "learning_rate": 8.457717597907594e-07, + "loss": 1.7335, + "step": 12440 + }, + { + "epoch": 2.493186372745491, + "grad_norm": 43.3308024631269, + "learning_rate": 8.45123054536367e-07, + "loss": 1.3809, + "step": 12441 + }, + { + "epoch": 2.493386773547094, + "grad_norm": 28.728182232238378, + "learning_rate": 8.444745751887662e-07, + "loss": 1.8964, + "step": 12442 + }, + { + "epoch": 2.4935871743486975, + "grad_norm": 23.551825157853678, + "learning_rate": 8.43826321783216e-07, + "loss": 1.3241, + "step": 12443 + }, + { + "epoch": 2.4937875751503005, + "grad_norm": 14.563953675902585, + "learning_rate": 8.431782943549654e-07, + "loss": 1.0629, + "step": 12444 + }, + { + "epoch": 2.493987975951904, + "grad_norm": 22.28876039504347, + "learning_rate": 8.425304929392464e-07, + "loss": 1.4628, + "step": 12445 + }, + { + "epoch": 2.494188376753507, + "grad_norm": 17.64711950979863, + "learning_rate": 8.418829175712812e-07, + "loss": 1.6524, + "step": 12446 + }, + { + "epoch": 2.4943887775551103, + "grad_norm": 16.838264692918113, + "learning_rate": 8.412355682862799e-07, + "loss": 1.6541, + "step": 12447 + }, + { + "epoch": 2.4945891783567133, + "grad_norm": 21.33858777825091, + "learning_rate": 8.405884451194396e-07, + "loss": 1.3961, + "step": 12448 + }, + { + "epoch": 2.494789579158317, + "grad_norm": 17.956959173421613, + "learning_rate": 8.399415481059464e-07, + "loss": 1.4816, + "step": 12449 + }, + { + "epoch": 2.49498997995992, + "grad_norm": 21.703688923750835, + "learning_rate": 8.392948772809695e-07, + "loss": 1.4699, + "step": 12450 + }, + { + "epoch": 2.495190380761523, + "grad_norm": 21.93382528262471, + "learning_rate": 8.386484326796745e-07, + "loss": 1.7519, + "step": 12451 + }, + { + "epoch": 2.495390781563126, + "grad_norm": 31.425600470108527, + "learning_rate": 8.380022143372058e-07, + "loss": 1.6274, + "step": 12452 + }, + { + "epoch": 2.4955911823647297, + "grad_norm": 25.239711332185582, + "learning_rate": 8.373562222887005e-07, + "loss": 1.6235, + "step": 12453 + }, + { + "epoch": 2.4957915831663327, + "grad_norm": 18.924915793704226, + "learning_rate": 8.367104565692818e-07, + "loss": 1.2678, + "step": 12454 + }, + { + "epoch": 2.4959919839679356, + "grad_norm": 23.693663666928536, + "learning_rate": 8.360649172140611e-07, + "loss": 2.1587, + "step": 12455 + }, + { + "epoch": 2.496192384769539, + "grad_norm": 22.91493916969752, + "learning_rate": 8.354196042581392e-07, + "loss": 1.4, + "step": 12456 + }, + { + "epoch": 2.4963927855711425, + "grad_norm": 27.118308067085707, + "learning_rate": 8.347745177365979e-07, + "loss": 
1.4368, + "step": 12457 + }, + { + "epoch": 2.4965931863727455, + "grad_norm": 28.865029737671687, + "learning_rate": 8.341296576845171e-07, + "loss": 1.9843, + "step": 12458 + }, + { + "epoch": 2.4967935871743485, + "grad_norm": 19.00716247626173, + "learning_rate": 8.334850241369547e-07, + "loss": 1.892, + "step": 12459 + }, + { + "epoch": 2.496993987975952, + "grad_norm": 19.71518118088075, + "learning_rate": 8.328406171289621e-07, + "loss": 1.8223, + "step": 12460 + }, + { + "epoch": 2.497194388777555, + "grad_norm": 16.27336613503207, + "learning_rate": 8.321964366955765e-07, + "loss": 1.399, + "step": 12461 + }, + { + "epoch": 2.4973947895791584, + "grad_norm": 23.293684845344234, + "learning_rate": 8.31552482871823e-07, + "loss": 1.3862, + "step": 12462 + }, + { + "epoch": 2.4975951903807614, + "grad_norm": 21.512741517196915, + "learning_rate": 8.309087556927137e-07, + "loss": 1.9318, + "step": 12463 + }, + { + "epoch": 2.497795591182365, + "grad_norm": 19.64507337162302, + "learning_rate": 8.3026525519325e-07, + "loss": 1.4007, + "step": 12464 + }, + { + "epoch": 2.497995991983968, + "grad_norm": 20.140616677809913, + "learning_rate": 8.296219814084205e-07, + "loss": 1.636, + "step": 12465 + }, + { + "epoch": 2.4981963927855713, + "grad_norm": 21.54497372776496, + "learning_rate": 8.289789343731985e-07, + "loss": 1.5356, + "step": 12466 + }, + { + "epoch": 2.4983967935871743, + "grad_norm": 18.64937687193081, + "learning_rate": 8.283361141225477e-07, + "loss": 1.7155, + "step": 12467 + }, + { + "epoch": 2.4985971943887777, + "grad_norm": 24.050655523542243, + "learning_rate": 8.276935206914233e-07, + "loss": 1.9842, + "step": 12468 + }, + { + "epoch": 2.4987975951903807, + "grad_norm": 14.058401794371079, + "learning_rate": 8.270511541147591e-07, + "loss": 1.2511, + "step": 12469 + }, + { + "epoch": 2.498997995991984, + "grad_norm": 24.767389413111637, + "learning_rate": 8.264090144274839e-07, + "loss": 1.5317, + "step": 12470 + }, + { + "epoch": 2.499198396793587, + "grad_norm": 29.369444366884338, + "learning_rate": 8.257671016645113e-07, + "loss": 1.9059, + "step": 12471 + }, + { + "epoch": 2.4993987975951906, + "grad_norm": 41.848490215539975, + "learning_rate": 8.25125415860743e-07, + "loss": 1.6015, + "step": 12472 + }, + { + "epoch": 2.4995991983967936, + "grad_norm": 19.261371823612755, + "learning_rate": 8.244839570510687e-07, + "loss": 1.7769, + "step": 12473 + }, + { + "epoch": 2.499799599198397, + "grad_norm": 29.423411525206575, + "learning_rate": 8.238427252703651e-07, + "loss": 1.6165, + "step": 12474 + }, + { + "epoch": 2.5, + "grad_norm": 25.406601048810234, + "learning_rate": 8.232017205534987e-07, + "loss": 1.2835, + "step": 12475 + }, + { + "epoch": 2.500200400801603, + "grad_norm": 23.212471857899168, + "learning_rate": 8.225609429353187e-07, + "loss": 1.7963, + "step": 12476 + }, + { + "epoch": 2.5004008016032064, + "grad_norm": 26.186968114518468, + "learning_rate": 8.219203924506663e-07, + "loss": 1.8585, + "step": 12477 + }, + { + "epoch": 2.50060120240481, + "grad_norm": 31.123013815170292, + "learning_rate": 8.212800691343703e-07, + "loss": 1.5054, + "step": 12478 + }, + { + "epoch": 2.500801603206413, + "grad_norm": 19.593143330037314, + "learning_rate": 8.206399730212449e-07, + "loss": 0.9592, + "step": 12479 + }, + { + "epoch": 2.501002004008016, + "grad_norm": 23.57089032954427, + "learning_rate": 8.200001041460936e-07, + "loss": 1.4282, + "step": 12480 + }, + { + "epoch": 2.5012024048096193, + "grad_norm": 27.63829602891, + "learning_rate": 
8.193604625437068e-07, + "loss": 1.1788, + "step": 12481 + }, + { + "epoch": 2.5014028056112223, + "grad_norm": 21.732073709941247, + "learning_rate": 8.187210482488644e-07, + "loss": 1.9207, + "step": 12482 + }, + { + "epoch": 2.5016032064128257, + "grad_norm": 26.461854791136897, + "learning_rate": 8.180818612963293e-07, + "loss": 1.4327, + "step": 12483 + }, + { + "epoch": 2.5018036072144287, + "grad_norm": 18.036072744171765, + "learning_rate": 8.174429017208563e-07, + "loss": 1.7637, + "step": 12484 + }, + { + "epoch": 2.502004008016032, + "grad_norm": 79.08739751858396, + "learning_rate": 8.168041695571871e-07, + "loss": 2.2313, + "step": 12485 + }, + { + "epoch": 2.502204408817635, + "grad_norm": 18.794729576602343, + "learning_rate": 8.161656648400496e-07, + "loss": 1.4138, + "step": 12486 + }, + { + "epoch": 2.5024048096192386, + "grad_norm": 22.327240184874782, + "learning_rate": 8.155273876041614e-07, + "loss": 1.508, + "step": 12487 + }, + { + "epoch": 2.5026052104208416, + "grad_norm": 17.0759155132725, + "learning_rate": 8.148893378842254e-07, + "loss": 1.4381, + "step": 12488 + }, + { + "epoch": 2.502805611222445, + "grad_norm": 19.436294022809058, + "learning_rate": 8.142515157149356e-07, + "loss": 1.4788, + "step": 12489 + }, + { + "epoch": 2.503006012024048, + "grad_norm": 24.53819840686036, + "learning_rate": 8.136139211309674e-07, + "loss": 1.7778, + "step": 12490 + }, + { + "epoch": 2.5032064128256515, + "grad_norm": 19.409474414676456, + "learning_rate": 8.129765541669916e-07, + "loss": 1.6336, + "step": 12491 + }, + { + "epoch": 2.5034068136272545, + "grad_norm": 18.75663734614052, + "learning_rate": 8.123394148576607e-07, + "loss": 1.8881, + "step": 12492 + }, + { + "epoch": 2.5036072144288575, + "grad_norm": 14.96387058119108, + "learning_rate": 8.117025032376169e-07, + "loss": 1.3204, + "step": 12493 + }, + { + "epoch": 2.503807615230461, + "grad_norm": 21.225386662349578, + "learning_rate": 8.110658193414905e-07, + "loss": 1.4695, + "step": 12494 + }, + { + "epoch": 2.5040080160320644, + "grad_norm": 20.286686967687285, + "learning_rate": 8.104293632038996e-07, + "loss": 1.7531, + "step": 12495 + }, + { + "epoch": 2.5042084168336673, + "grad_norm": 23.651028548471853, + "learning_rate": 8.09793134859449e-07, + "loss": 1.3945, + "step": 12496 + }, + { + "epoch": 2.5044088176352703, + "grad_norm": 29.088203162630226, + "learning_rate": 8.091571343427291e-07, + "loss": 1.3087, + "step": 12497 + }, + { + "epoch": 2.504609218436874, + "grad_norm": 20.523697663022084, + "learning_rate": 8.085213616883231e-07, + "loss": 1.3691, + "step": 12498 + }, + { + "epoch": 2.5048096192384772, + "grad_norm": 27.996271338037207, + "learning_rate": 8.078858169307985e-07, + "loss": 1.6232, + "step": 12499 + }, + { + "epoch": 2.50501002004008, + "grad_norm": 22.812209794126556, + "learning_rate": 8.072505001047082e-07, + "loss": 1.6162, + "step": 12500 + }, + { + "epoch": 2.505210420841683, + "grad_norm": 23.424576193698705, + "learning_rate": 8.066154112446001e-07, + "loss": 1.6238, + "step": 12501 + }, + { + "epoch": 2.5054108216432867, + "grad_norm": 24.429391885832644, + "learning_rate": 8.059805503850005e-07, + "loss": 1.7319, + "step": 12502 + }, + { + "epoch": 2.5056112224448897, + "grad_norm": 23.331106652613602, + "learning_rate": 8.053459175604289e-07, + "loss": 1.0947, + "step": 12503 + }, + { + "epoch": 2.505811623246493, + "grad_norm": 18.67859560731119, + "learning_rate": 8.047115128053923e-07, + "loss": 1.523, + "step": 12504 + }, + { + "epoch": 2.506012024048096, + 
"grad_norm": 18.592593323966614, + "learning_rate": 8.040773361543835e-07, + "loss": 1.5625, + "step": 12505 + }, + { + "epoch": 2.5062124248496995, + "grad_norm": 22.486199764916815, + "learning_rate": 8.034433876418846e-07, + "loss": 1.17, + "step": 12506 + }, + { + "epoch": 2.5064128256513025, + "grad_norm": 29.343799507253248, + "learning_rate": 8.028096673023611e-07, + "loss": 1.728, + "step": 12507 + }, + { + "epoch": 2.506613226452906, + "grad_norm": 18.261733986667274, + "learning_rate": 8.021761751702739e-07, + "loss": 1.7448, + "step": 12508 + }, + { + "epoch": 2.506813627254509, + "grad_norm": 32.99040033903443, + "learning_rate": 8.015429112800638e-07, + "loss": 1.5117, + "step": 12509 + }, + { + "epoch": 2.507014028056112, + "grad_norm": 25.79580395302623, + "learning_rate": 8.009098756661627e-07, + "loss": 1.1013, + "step": 12510 + }, + { + "epoch": 2.5072144288577154, + "grad_norm": 29.809612621620154, + "learning_rate": 8.002770683629902e-07, + "loss": 1.7961, + "step": 12511 + }, + { + "epoch": 2.507414829659319, + "grad_norm": 22.878024878525363, + "learning_rate": 7.996444894049526e-07, + "loss": 1.1366, + "step": 12512 + }, + { + "epoch": 2.507615230460922, + "grad_norm": 23.169898009387303, + "learning_rate": 7.99012138826446e-07, + "loss": 1.932, + "step": 12513 + }, + { + "epoch": 2.507815631262525, + "grad_norm": 16.202483075404437, + "learning_rate": 7.983800166618482e-07, + "loss": 1.7642, + "step": 12514 + }, + { + "epoch": 2.5080160320641283, + "grad_norm": 15.537924999792764, + "learning_rate": 7.977481229455336e-07, + "loss": 1.1887, + "step": 12515 + }, + { + "epoch": 2.5082164328657317, + "grad_norm": 24.49720675745057, + "learning_rate": 7.971164577118551e-07, + "loss": 1.5361, + "step": 12516 + }, + { + "epoch": 2.5084168336673347, + "grad_norm": 26.078128247092742, + "learning_rate": 7.964850209951591e-07, + "loss": 1.8518, + "step": 12517 + }, + { + "epoch": 2.5086172344689377, + "grad_norm": 26.255930546770855, + "learning_rate": 7.958538128297771e-07, + "loss": 1.5545, + "step": 12518 + }, + { + "epoch": 2.508817635270541, + "grad_norm": 16.578411213023305, + "learning_rate": 7.952228332500295e-07, + "loss": 1.3312, + "step": 12519 + }, + { + "epoch": 2.509018036072144, + "grad_norm": 38.989983185394586, + "learning_rate": 7.945920822902243e-07, + "loss": 1.7615, + "step": 12520 + }, + { + "epoch": 2.5092184368737476, + "grad_norm": 22.899594439931725, + "learning_rate": 7.939615599846529e-07, + "loss": 2.0245, + "step": 12521 + }, + { + "epoch": 2.5094188376753506, + "grad_norm": 19.20021183691797, + "learning_rate": 7.933312663676029e-07, + "loss": 1.5787, + "step": 12522 + }, + { + "epoch": 2.509619238476954, + "grad_norm": 18.356878210716484, + "learning_rate": 7.927012014733399e-07, + "loss": 1.5049, + "step": 12523 + }, + { + "epoch": 2.509819639278557, + "grad_norm": 18.69001505768999, + "learning_rate": 7.920713653361223e-07, + "loss": 1.0935, + "step": 12524 + }, + { + "epoch": 2.5100200400801604, + "grad_norm": 25.0873091355352, + "learning_rate": 7.914417579901984e-07, + "loss": 1.3242, + "step": 12525 + }, + { + "epoch": 2.5102204408817634, + "grad_norm": 16.951244060284072, + "learning_rate": 7.908123794697969e-07, + "loss": 1.2211, + "step": 12526 + }, + { + "epoch": 2.510420841683367, + "grad_norm": 14.880315498307722, + "learning_rate": 7.90183229809141e-07, + "loss": 1.2434, + "step": 12527 + }, + { + "epoch": 2.51062124248497, + "grad_norm": 19.86505770618795, + "learning_rate": 7.895543090424352e-07, + "loss": 1.5127, + "step": 12528 
+ }, + { + "epoch": 2.5108216432865733, + "grad_norm": 21.29225058053715, + "learning_rate": 7.889256172038778e-07, + "loss": 1.2657, + "step": 12529 + }, + { + "epoch": 2.5110220440881763, + "grad_norm": 20.575788766410234, + "learning_rate": 7.882971543276518e-07, + "loss": 1.5183, + "step": 12530 + }, + { + "epoch": 2.5112224448897793, + "grad_norm": 16.120556267797447, + "learning_rate": 7.876689204479243e-07, + "loss": 1.5672, + "step": 12531 + }, + { + "epoch": 2.5114228456913827, + "grad_norm": 19.872707699860957, + "learning_rate": 7.870409155988578e-07, + "loss": 1.7546, + "step": 12532 + }, + { + "epoch": 2.511623246492986, + "grad_norm": 23.592037635923084, + "learning_rate": 7.864131398145941e-07, + "loss": 1.7226, + "step": 12533 + }, + { + "epoch": 2.511823647294589, + "grad_norm": 32.010137175566506, + "learning_rate": 7.857855931292684e-07, + "loss": 2.3939, + "step": 12534 + }, + { + "epoch": 2.512024048096192, + "grad_norm": 23.16552525876119, + "learning_rate": 7.851582755770004e-07, + "loss": 1.6993, + "step": 12535 + }, + { + "epoch": 2.5122244488977956, + "grad_norm": 21.26064343845931, + "learning_rate": 7.845311871918987e-07, + "loss": 1.368, + "step": 12536 + }, + { + "epoch": 2.512424849699399, + "grad_norm": 23.94232440824063, + "learning_rate": 7.839043280080605e-07, + "loss": 1.7107, + "step": 12537 + }, + { + "epoch": 2.512625250501002, + "grad_norm": 13.949343077957096, + "learning_rate": 7.832776980595647e-07, + "loss": 1.3066, + "step": 12538 + }, + { + "epoch": 2.512825651302605, + "grad_norm": 21.970803837571314, + "learning_rate": 7.826512973804878e-07, + "loss": 1.2002, + "step": 12539 + }, + { + "epoch": 2.5130260521042085, + "grad_norm": 18.926163634301624, + "learning_rate": 7.820251260048833e-07, + "loss": 1.7112, + "step": 12540 + }, + { + "epoch": 2.5132264529058115, + "grad_norm": 22.387069918217204, + "learning_rate": 7.813991839667994e-07, + "loss": 1.9068, + "step": 12541 + }, + { + "epoch": 2.513426853707415, + "grad_norm": 34.11979724111417, + "learning_rate": 7.807734713002691e-07, + "loss": 1.6584, + "step": 12542 + }, + { + "epoch": 2.513627254509018, + "grad_norm": 21.325202092657147, + "learning_rate": 7.801479880393137e-07, + "loss": 1.4541, + "step": 12543 + }, + { + "epoch": 2.5138276553106214, + "grad_norm": 20.047329757714508, + "learning_rate": 7.795227342179406e-07, + "loss": 1.3241, + "step": 12544 + }, + { + "epoch": 2.5140280561122244, + "grad_norm": 35.32921120638514, + "learning_rate": 7.788977098701467e-07, + "loss": 2.0724, + "step": 12545 + }, + { + "epoch": 2.514228456913828, + "grad_norm": 21.22887747332395, + "learning_rate": 7.782729150299162e-07, + "loss": 1.1734, + "step": 12546 + }, + { + "epoch": 2.514428857715431, + "grad_norm": 17.466126398442636, + "learning_rate": 7.776483497312182e-07, + "loss": 1.5686, + "step": 12547 + }, + { + "epoch": 2.5146292585170342, + "grad_norm": 20.834856944641473, + "learning_rate": 7.770240140080126e-07, + "loss": 1.6814, + "step": 12548 + }, + { + "epoch": 2.5148296593186372, + "grad_norm": 20.34844147270187, + "learning_rate": 7.763999078942447e-07, + "loss": 1.5005, + "step": 12549 + }, + { + "epoch": 2.5150300601202407, + "grad_norm": 17.261728079521657, + "learning_rate": 7.757760314238482e-07, + "loss": 1.4198, + "step": 12550 + }, + { + "epoch": 2.5152304609218437, + "grad_norm": 18.693391655916912, + "learning_rate": 7.751523846307446e-07, + "loss": 1.4085, + "step": 12551 + }, + { + "epoch": 2.5154308617234467, + "grad_norm": 16.116322975480017, + "learning_rate": 
7.745289675488421e-07, + "loss": 1.1309, + "step": 12552 + }, + { + "epoch": 2.51563126252505, + "grad_norm": 21.22954404580782, + "learning_rate": 7.739057802120381e-07, + "loss": 1.6864, + "step": 12553 + }, + { + "epoch": 2.5158316633266535, + "grad_norm": 22.14920388320859, + "learning_rate": 7.732828226542144e-07, + "loss": 1.3988, + "step": 12554 + }, + { + "epoch": 2.5160320641282565, + "grad_norm": 24.380302296057085, + "learning_rate": 7.726600949092411e-07, + "loss": 1.6741, + "step": 12555 + }, + { + "epoch": 2.5162324649298595, + "grad_norm": 27.48002744201666, + "learning_rate": 7.720375970109811e-07, + "loss": 1.7092, + "step": 12556 + }, + { + "epoch": 2.516432865731463, + "grad_norm": 20.538971307052787, + "learning_rate": 7.714153289932775e-07, + "loss": 1.4028, + "step": 12557 + }, + { + "epoch": 2.5166332665330664, + "grad_norm": 23.107474241995014, + "learning_rate": 7.707932908899634e-07, + "loss": 1.4911, + "step": 12558 + }, + { + "epoch": 2.5168336673346694, + "grad_norm": 24.966213061536983, + "learning_rate": 7.701714827348617e-07, + "loss": 1.342, + "step": 12559 + }, + { + "epoch": 2.5170340681362724, + "grad_norm": 19.430464053116008, + "learning_rate": 7.6954990456178e-07, + "loss": 1.2124, + "step": 12560 + }, + { + "epoch": 2.517234468937876, + "grad_norm": 25.855492754488424, + "learning_rate": 7.689285564045146e-07, + "loss": 1.716, + "step": 12561 + }, + { + "epoch": 2.517434869739479, + "grad_norm": 21.671202192219845, + "learning_rate": 7.683074382968492e-07, + "loss": 1.9515, + "step": 12562 + }, + { + "epoch": 2.5176352705410823, + "grad_norm": 16.882411170562396, + "learning_rate": 7.676865502725567e-07, + "loss": 1.5268, + "step": 12563 + }, + { + "epoch": 2.5178356713426853, + "grad_norm": 37.718610924226994, + "learning_rate": 7.670658923653918e-07, + "loss": 1.1724, + "step": 12564 + }, + { + "epoch": 2.5180360721442887, + "grad_norm": 16.449877253196938, + "learning_rate": 7.664454646091036e-07, + "loss": 1.4731, + "step": 12565 + }, + { + "epoch": 2.5182364729458917, + "grad_norm": 25.555568755721946, + "learning_rate": 7.658252670374245e-07, + "loss": 1.0964, + "step": 12566 + }, + { + "epoch": 2.518436873747495, + "grad_norm": 15.760529543383003, + "learning_rate": 7.652052996840753e-07, + "loss": 1.2219, + "step": 12567 + }, + { + "epoch": 2.518637274549098, + "grad_norm": 18.22532587943481, + "learning_rate": 7.645855625827659e-07, + "loss": 1.6914, + "step": 12568 + }, + { + "epoch": 2.518837675350701, + "grad_norm": 18.382310843503458, + "learning_rate": 7.639660557671908e-07, + "loss": 1.7059, + "step": 12569 + }, + { + "epoch": 2.5190380761523046, + "grad_norm": 26.41702838927057, + "learning_rate": 7.633467792710359e-07, + "loss": 1.6223, + "step": 12570 + }, + { + "epoch": 2.519238476953908, + "grad_norm": 28.93772646627974, + "learning_rate": 7.627277331279692e-07, + "loss": 1.2525, + "step": 12571 + }, + { + "epoch": 2.519438877755511, + "grad_norm": 25.1636254047038, + "learning_rate": 7.6210891737165e-07, + "loss": 1.1246, + "step": 12572 + }, + { + "epoch": 2.519639278557114, + "grad_norm": 18.660209927857814, + "learning_rate": 7.614903320357253e-07, + "loss": 1.7826, + "step": 12573 + }, + { + "epoch": 2.5198396793587174, + "grad_norm": 20.01653057841562, + "learning_rate": 7.608719771538275e-07, + "loss": 1.6217, + "step": 12574 + }, + { + "epoch": 2.520040080160321, + "grad_norm": 22.529969627244625, + "learning_rate": 7.602538527595782e-07, + "loss": 1.8126, + "step": 12575 + }, + { + "epoch": 2.520240480961924, + 
"grad_norm": 17.550370994133356, + "learning_rate": 7.596359588865848e-07, + "loss": 1.0808, + "step": 12576 + }, + { + "epoch": 2.520440881763527, + "grad_norm": 30.75897704621309, + "learning_rate": 7.59018295568445e-07, + "loss": 1.5721, + "step": 12577 + }, + { + "epoch": 2.5206412825651303, + "grad_norm": 21.846954615033447, + "learning_rate": 7.584008628387384e-07, + "loss": 1.3825, + "step": 12578 + }, + { + "epoch": 2.5208416833667333, + "grad_norm": 17.515074147929695, + "learning_rate": 7.577836607310407e-07, + "loss": 1.33, + "step": 12579 + }, + { + "epoch": 2.5210420841683367, + "grad_norm": 20.895927930227927, + "learning_rate": 7.571666892789065e-07, + "loss": 1.408, + "step": 12580 + }, + { + "epoch": 2.5212424849699397, + "grad_norm": 17.964878413989318, + "learning_rate": 7.565499485158817e-07, + "loss": 1.1191, + "step": 12581 + }, + { + "epoch": 2.521442885771543, + "grad_norm": 16.13066837925779, + "learning_rate": 7.559334384755007e-07, + "loss": 1.1598, + "step": 12582 + }, + { + "epoch": 2.521643286573146, + "grad_norm": 22.33093882380759, + "learning_rate": 7.553171591912828e-07, + "loss": 1.4598, + "step": 12583 + }, + { + "epoch": 2.5218436873747496, + "grad_norm": 42.32014671922954, + "learning_rate": 7.547011106967383e-07, + "loss": 1.8257, + "step": 12584 + }, + { + "epoch": 2.5220440881763526, + "grad_norm": 29.21068830630584, + "learning_rate": 7.540852930253589e-07, + "loss": 1.2939, + "step": 12585 + }, + { + "epoch": 2.522244488977956, + "grad_norm": 61.28996079538605, + "learning_rate": 7.534697062106305e-07, + "loss": 1.9987, + "step": 12586 + }, + { + "epoch": 2.522444889779559, + "grad_norm": 24.38337369451183, + "learning_rate": 7.528543502860236e-07, + "loss": 1.9541, + "step": 12587 + }, + { + "epoch": 2.5226452905811625, + "grad_norm": 23.971520158540205, + "learning_rate": 7.522392252849931e-07, + "loss": 1.9855, + "step": 12588 + }, + { + "epoch": 2.5228456913827655, + "grad_norm": 14.787528702362115, + "learning_rate": 7.516243312409882e-07, + "loss": 1.3591, + "step": 12589 + }, + { + "epoch": 2.5230460921843685, + "grad_norm": 22.771197601137867, + "learning_rate": 7.51009668187439e-07, + "loss": 1.7644, + "step": 12590 + }, + { + "epoch": 2.523246492985972, + "grad_norm": 26.648959086349144, + "learning_rate": 7.503952361577654e-07, + "loss": 1.831, + "step": 12591 + }, + { + "epoch": 2.5234468937875754, + "grad_norm": 50.02250682379916, + "learning_rate": 7.497810351853763e-07, + "loss": 1.7316, + "step": 12592 + }, + { + "epoch": 2.5236472945891784, + "grad_norm": 18.670630525009457, + "learning_rate": 7.491670653036664e-07, + "loss": 1.4122, + "step": 12593 + }, + { + "epoch": 2.5238476953907814, + "grad_norm": 18.09535322238861, + "learning_rate": 7.485533265460187e-07, + "loss": 1.4321, + "step": 12594 + }, + { + "epoch": 2.524048096192385, + "grad_norm": 23.848279437456757, + "learning_rate": 7.479398189458004e-07, + "loss": 1.5533, + "step": 12595 + }, + { + "epoch": 2.5242484969939882, + "grad_norm": 22.093636633758212, + "learning_rate": 7.47326542536373e-07, + "loss": 1.5083, + "step": 12596 + }, + { + "epoch": 2.5244488977955912, + "grad_norm": 46.26005871646199, + "learning_rate": 7.467134973510776e-07, + "loss": 1.3863, + "step": 12597 + }, + { + "epoch": 2.5246492985971942, + "grad_norm": 23.210030062392264, + "learning_rate": 7.461006834232476e-07, + "loss": 1.995, + "step": 12598 + }, + { + "epoch": 2.5248496993987977, + "grad_norm": 20.660919788765437, + "learning_rate": 7.45488100786203e-07, + "loss": 0.8714, + "step": 
12599 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 22.037567026859552, + "learning_rate": 7.448757494732511e-07, + "loss": 1.6393, + "step": 12600 + }, + { + "epoch": 2.525250501002004, + "grad_norm": 33.2175897123899, + "learning_rate": 7.442636295176864e-07, + "loss": 1.3078, + "step": 12601 + }, + { + "epoch": 2.525450901803607, + "grad_norm": 24.912264155889268, + "learning_rate": 7.436517409527882e-07, + "loss": 1.6005, + "step": 12602 + }, + { + "epoch": 2.5256513026052105, + "grad_norm": 28.550530948990914, + "learning_rate": 7.430400838118296e-07, + "loss": 1.2621, + "step": 12603 + }, + { + "epoch": 2.5258517034068135, + "grad_norm": 49.841386512060325, + "learning_rate": 7.424286581280649e-07, + "loss": 1.7086, + "step": 12604 + }, + { + "epoch": 2.526052104208417, + "grad_norm": 18.554279036768794, + "learning_rate": 7.418174639347376e-07, + "loss": 1.5712, + "step": 12605 + }, + { + "epoch": 2.52625250501002, + "grad_norm": 19.228593131764253, + "learning_rate": 7.412065012650826e-07, + "loss": 1.1076, + "step": 12606 + }, + { + "epoch": 2.5264529058116234, + "grad_norm": 20.34277922186804, + "learning_rate": 7.405957701523153e-07, + "loss": 1.4033, + "step": 12607 + }, + { + "epoch": 2.5266533066132264, + "grad_norm": 16.795341709777706, + "learning_rate": 7.399852706296451e-07, + "loss": 1.527, + "step": 12608 + }, + { + "epoch": 2.52685370741483, + "grad_norm": 16.681189095208826, + "learning_rate": 7.393750027302615e-07, + "loss": 1.4416, + "step": 12609 + }, + { + "epoch": 2.527054108216433, + "grad_norm": 31.617085078263916, + "learning_rate": 7.387649664873492e-07, + "loss": 1.1979, + "step": 12610 + }, + { + "epoch": 2.527254509018036, + "grad_norm": 24.90218997672138, + "learning_rate": 7.381551619340771e-07, + "loss": 1.6758, + "step": 12611 + }, + { + "epoch": 2.5274549098196393, + "grad_norm": 23.68964956747975, + "learning_rate": 7.375455891035977e-07, + "loss": 1.7796, + "step": 12612 + }, + { + "epoch": 2.5276553106212427, + "grad_norm": 29.45797473998933, + "learning_rate": 7.369362480290592e-07, + "loss": 1.0317, + "step": 12613 + }, + { + "epoch": 2.5278557114228457, + "grad_norm": 40.09059826890144, + "learning_rate": 7.363271387435883e-07, + "loss": 1.5612, + "step": 12614 + }, + { + "epoch": 2.5280561122244487, + "grad_norm": 28.7670746186375, + "learning_rate": 7.357182612803043e-07, + "loss": 1.5469, + "step": 12615 + }, + { + "epoch": 2.528256513026052, + "grad_norm": 31.485452085072613, + "learning_rate": 7.351096156723137e-07, + "loss": 2.1208, + "step": 12616 + }, + { + "epoch": 2.5284569138276556, + "grad_norm": 18.33352156207108, + "learning_rate": 7.345012019527082e-07, + "loss": 1.5273, + "step": 12617 + }, + { + "epoch": 2.5286573146292586, + "grad_norm": 20.361273645590856, + "learning_rate": 7.338930201545708e-07, + "loss": 1.7425, + "step": 12618 + }, + { + "epoch": 2.5288577154308616, + "grad_norm": 25.45870094483003, + "learning_rate": 7.33285070310964e-07, + "loss": 1.2618, + "step": 12619 + }, + { + "epoch": 2.529058116232465, + "grad_norm": 27.701873782504066, + "learning_rate": 7.326773524549491e-07, + "loss": 1.5635, + "step": 12620 + }, + { + "epoch": 2.529258517034068, + "grad_norm": 23.44167816971569, + "learning_rate": 7.320698666195642e-07, + "loss": 1.5297, + "step": 12621 + }, + { + "epoch": 2.5294589178356714, + "grad_norm": 20.359234523330553, + "learning_rate": 7.314626128378411e-07, + "loss": 1.8158, + "step": 12622 + }, + { + "epoch": 2.5296593186372744, + "grad_norm": 26.82898177621874, + "learning_rate": 
7.308555911427961e-07, + "loss": 1.7025, + "step": 12623 + }, + { + "epoch": 2.529859719438878, + "grad_norm": 26.01451286527926, + "learning_rate": 7.302488015674353e-07, + "loss": 1.2414, + "step": 12624 + }, + { + "epoch": 2.530060120240481, + "grad_norm": 18.87770912443924, + "learning_rate": 7.296422441447504e-07, + "loss": 1.4408, + "step": 12625 + }, + { + "epoch": 2.5302605210420843, + "grad_norm": 35.520265289566034, + "learning_rate": 7.290359189077179e-07, + "loss": 1.4678, + "step": 12626 + }, + { + "epoch": 2.5304609218436873, + "grad_norm": 19.59185996022062, + "learning_rate": 7.284298258893097e-07, + "loss": 1.9228, + "step": 12627 + }, + { + "epoch": 2.5306613226452903, + "grad_norm": 18.585026339699038, + "learning_rate": 7.278239651224761e-07, + "loss": 1.8808, + "step": 12628 + }, + { + "epoch": 2.5308617234468938, + "grad_norm": 15.481442145197853, + "learning_rate": 7.272183366401597e-07, + "loss": 1.5763, + "step": 12629 + }, + { + "epoch": 2.531062124248497, + "grad_norm": 24.90918995388916, + "learning_rate": 7.266129404752898e-07, + "loss": 1.3007, + "step": 12630 + }, + { + "epoch": 2.5312625250501, + "grad_norm": 20.360134659418062, + "learning_rate": 7.26007776660782e-07, + "loss": 1.5152, + "step": 12631 + }, + { + "epoch": 2.531462925851703, + "grad_norm": 19.32730909800827, + "learning_rate": 7.254028452295403e-07, + "loss": 1.8503, + "step": 12632 + }, + { + "epoch": 2.5316633266533066, + "grad_norm": 32.07032770295188, + "learning_rate": 7.24798146214456e-07, + "loss": 1.1474, + "step": 12633 + }, + { + "epoch": 2.53186372745491, + "grad_norm": 25.467469246066845, + "learning_rate": 7.241936796484084e-07, + "loss": 1.2843, + "step": 12634 + }, + { + "epoch": 2.532064128256513, + "grad_norm": 23.26643062727696, + "learning_rate": 7.235894455642611e-07, + "loss": 1.3123, + "step": 12635 + }, + { + "epoch": 2.532264529058116, + "grad_norm": 22.397884455005265, + "learning_rate": 7.229854439948669e-07, + "loss": 1.2236, + "step": 12636 + }, + { + "epoch": 2.5324649298597195, + "grad_norm": 25.03807269294086, + "learning_rate": 7.223816749730699e-07, + "loss": 1.8171, + "step": 12637 + }, + { + "epoch": 2.5326653306613225, + "grad_norm": 24.23297526687586, + "learning_rate": 7.217781385316947e-07, + "loss": 1.5295, + "step": 12638 + }, + { + "epoch": 2.532865731462926, + "grad_norm": 18.0579439360005, + "learning_rate": 7.211748347035568e-07, + "loss": 1.592, + "step": 12639 + }, + { + "epoch": 2.533066132264529, + "grad_norm": 27.8187924906072, + "learning_rate": 7.205717635214598e-07, + "loss": 1.6532, + "step": 12640 + }, + { + "epoch": 2.5332665330661324, + "grad_norm": 18.52752373044725, + "learning_rate": 7.199689250181929e-07, + "loss": 1.172, + "step": 12641 + }, + { + "epoch": 2.5334669338677354, + "grad_norm": 26.670567398129315, + "learning_rate": 7.193663192265344e-07, + "loss": 1.367, + "step": 12642 + }, + { + "epoch": 2.533667334669339, + "grad_norm": 33.255877424034246, + "learning_rate": 7.187639461792456e-07, + "loss": 1.6527, + "step": 12643 + }, + { + "epoch": 2.533867735470942, + "grad_norm": 19.030417907820663, + "learning_rate": 7.181618059090833e-07, + "loss": 1.1536, + "step": 12644 + }, + { + "epoch": 2.5340681362725452, + "grad_norm": 21.19415036048244, + "learning_rate": 7.175598984487831e-07, + "loss": 1.3475, + "step": 12645 + }, + { + "epoch": 2.5342685370741482, + "grad_norm": 22.82079942153022, + "learning_rate": 7.169582238310724e-07, + "loss": 1.829, + "step": 12646 + }, + { + "epoch": 2.5344689378757517, + "grad_norm": 
26.87891326704563, + "learning_rate": 7.163567820886653e-07, + "loss": 1.5769, + "step": 12647 + }, + { + "epoch": 2.5346693386773547, + "grad_norm": 16.387002825821014, + "learning_rate": 7.157555732542632e-07, + "loss": 1.306, + "step": 12648 + }, + { + "epoch": 2.5348697394789577, + "grad_norm": 27.119346931389046, + "learning_rate": 7.15154597360555e-07, + "loss": 1.9262, + "step": 12649 + }, + { + "epoch": 2.535070140280561, + "grad_norm": 26.03524187606187, + "learning_rate": 7.145538544402164e-07, + "loss": 1.9007, + "step": 12650 + }, + { + "epoch": 2.5352705410821645, + "grad_norm": 27.247744959992364, + "learning_rate": 7.139533445259117e-07, + "loss": 2.005, + "step": 12651 + }, + { + "epoch": 2.5354709418837675, + "grad_norm": 25.669605034218513, + "learning_rate": 7.133530676502892e-07, + "loss": 1.3085, + "step": 12652 + }, + { + "epoch": 2.5356713426853705, + "grad_norm": 22.668586344206407, + "learning_rate": 7.127530238459878e-07, + "loss": 1.7736, + "step": 12653 + }, + { + "epoch": 2.535871743486974, + "grad_norm": 20.002923129641545, + "learning_rate": 7.121532131456333e-07, + "loss": 1.7137, + "step": 12654 + }, + { + "epoch": 2.5360721442885774, + "grad_norm": 22.678414726939604, + "learning_rate": 7.115536355818381e-07, + "loss": 1.3912, + "step": 12655 + }, + { + "epoch": 2.5362725450901804, + "grad_norm": 22.161034173569725, + "learning_rate": 7.109542911872025e-07, + "loss": 1.8075, + "step": 12656 + }, + { + "epoch": 2.5364729458917834, + "grad_norm": 23.110194687725205, + "learning_rate": 7.103551799943125e-07, + "loss": 1.5139, + "step": 12657 + }, + { + "epoch": 2.536673346693387, + "grad_norm": 20.65957118345017, + "learning_rate": 7.097563020357451e-07, + "loss": 1.3362, + "step": 12658 + }, + { + "epoch": 2.53687374749499, + "grad_norm": 18.715904198526346, + "learning_rate": 7.091576573440579e-07, + "loss": 1.1187, + "step": 12659 + }, + { + "epoch": 2.5370741482965933, + "grad_norm": 21.639433457230922, + "learning_rate": 7.085592459518054e-07, + "loss": 1.6245, + "step": 12660 + }, + { + "epoch": 2.5372745490981963, + "grad_norm": 25.341101202706657, + "learning_rate": 7.0796106789152e-07, + "loss": 1.6871, + "step": 12661 + }, + { + "epoch": 2.5374749498997997, + "grad_norm": 19.983008323279535, + "learning_rate": 7.073631231957273e-07, + "loss": 1.4562, + "step": 12662 + }, + { + "epoch": 2.5376753507014027, + "grad_norm": 17.35455039136636, + "learning_rate": 7.067654118969381e-07, + "loss": 1.6074, + "step": 12663 + }, + { + "epoch": 2.537875751503006, + "grad_norm": 16.745795633938474, + "learning_rate": 7.061679340276506e-07, + "loss": 1.5019, + "step": 12664 + }, + { + "epoch": 2.538076152304609, + "grad_norm": 17.211857487787686, + "learning_rate": 7.055706896203518e-07, + "loss": 1.4486, + "step": 12665 + }, + { + "epoch": 2.5382765531062126, + "grad_norm": 20.51171626005002, + "learning_rate": 7.049736787075117e-07, + "loss": 1.5324, + "step": 12666 + }, + { + "epoch": 2.5384769539078156, + "grad_norm": 22.320715674314403, + "learning_rate": 7.04376901321594e-07, + "loss": 1.6263, + "step": 12667 + }, + { + "epoch": 2.538677354709419, + "grad_norm": 18.631283472793665, + "learning_rate": 7.037803574950458e-07, + "loss": 1.941, + "step": 12668 + }, + { + "epoch": 2.538877755511022, + "grad_norm": 14.72246636737451, + "learning_rate": 7.031840472603008e-07, + "loss": 1.1207, + "step": 12669 + }, + { + "epoch": 2.539078156312625, + "grad_norm": 18.986339785668104, + "learning_rate": 7.025879706497812e-07, + "loss": 1.2308, + "step": 12670 + }, + { 
+ "epoch": 2.5392785571142285, + "grad_norm": 22.078705083751792, + "learning_rate": 7.01992127695898e-07, + "loss": 1.5527, + "step": 12671 + }, + { + "epoch": 2.539478957915832, + "grad_norm": 22.484716340938714, + "learning_rate": 7.013965184310467e-07, + "loss": 1.4779, + "step": 12672 + }, + { + "epoch": 2.539679358717435, + "grad_norm": 22.90057803316938, + "learning_rate": 7.008011428876121e-07, + "loss": 1.3199, + "step": 12673 + }, + { + "epoch": 2.539879759519038, + "grad_norm": 19.995013453278116, + "learning_rate": 7.002060010979655e-07, + "loss": 1.5247, + "step": 12674 + }, + { + "epoch": 2.5400801603206413, + "grad_norm": 26.48066398319838, + "learning_rate": 6.996110930944667e-07, + "loss": 1.7532, + "step": 12675 + }, + { + "epoch": 2.5402805611222448, + "grad_norm": 17.225946379949356, + "learning_rate": 6.990164189094589e-07, + "loss": 1.6184, + "step": 12676 + }, + { + "epoch": 2.5404809619238478, + "grad_norm": 24.056372645529507, + "learning_rate": 6.984219785752794e-07, + "loss": 1.276, + "step": 12677 + }, + { + "epoch": 2.5406813627254508, + "grad_norm": 31.465454552032917, + "learning_rate": 6.978277721242449e-07, + "loss": 1.2573, + "step": 12678 + }, + { + "epoch": 2.540881763527054, + "grad_norm": 27.421145748382287, + "learning_rate": 6.972337995886657e-07, + "loss": 1.6233, + "step": 12679 + }, + { + "epoch": 2.541082164328657, + "grad_norm": 15.473682419653933, + "learning_rate": 6.966400610008361e-07, + "loss": 1.3271, + "step": 12680 + }, + { + "epoch": 2.5412825651302606, + "grad_norm": 20.686787288446887, + "learning_rate": 6.960465563930385e-07, + "loss": 1.6209, + "step": 12681 + }, + { + "epoch": 2.5414829659318636, + "grad_norm": 19.165881647949103, + "learning_rate": 6.954532857975443e-07, + "loss": 1.0677, + "step": 12682 + }, + { + "epoch": 2.541683366733467, + "grad_norm": 35.72609349040259, + "learning_rate": 6.948602492466067e-07, + "loss": 1.4209, + "step": 12683 + }, + { + "epoch": 2.54188376753507, + "grad_norm": 21.351675901052484, + "learning_rate": 6.942674467724747e-07, + "loss": 1.2942, + "step": 12684 + }, + { + "epoch": 2.5420841683366735, + "grad_norm": 18.544931310635885, + "learning_rate": 6.93674878407376e-07, + "loss": 1.1189, + "step": 12685 + }, + { + "epoch": 2.5422845691382765, + "grad_norm": 25.73246098090622, + "learning_rate": 6.930825441835309e-07, + "loss": 1.7289, + "step": 12686 + }, + { + "epoch": 2.5424849699398795, + "grad_norm": 24.924061779751142, + "learning_rate": 6.92490444133146e-07, + "loss": 1.7745, + "step": 12687 + }, + { + "epoch": 2.542685370741483, + "grad_norm": 38.806384357463706, + "learning_rate": 6.918985782884136e-07, + "loss": 1.6602, + "step": 12688 + }, + { + "epoch": 2.5428857715430864, + "grad_norm": 17.556272155927328, + "learning_rate": 6.913069466815159e-07, + "loss": 1.6879, + "step": 12689 + }, + { + "epoch": 2.5430861723446894, + "grad_norm": 25.35928253609917, + "learning_rate": 6.907155493446177e-07, + "loss": 1.2094, + "step": 12690 + }, + { + "epoch": 2.5432865731462924, + "grad_norm": 23.534058089218895, + "learning_rate": 6.901243863098783e-07, + "loss": 1.1373, + "step": 12691 + }, + { + "epoch": 2.543486973947896, + "grad_norm": 27.396665134231828, + "learning_rate": 6.89533457609437e-07, + "loss": 1.1533, + "step": 12692 + }, + { + "epoch": 2.5436873747494992, + "grad_norm": 18.22662956921582, + "learning_rate": 6.889427632754237e-07, + "loss": 1.7307, + "step": 12693 + }, + { + "epoch": 2.5438877755511022, + "grad_norm": 18.83173386885634, + "learning_rate": 
6.883523033399576e-07, + "loss": 1.4315, + "step": 12694 + }, + { + "epoch": 2.5440881763527052, + "grad_norm": 25.55404430699926, + "learning_rate": 6.877620778351407e-07, + "loss": 1.4185, + "step": 12695 + }, + { + "epoch": 2.5442885771543087, + "grad_norm": 19.243599121668357, + "learning_rate": 6.871720867930659e-07, + "loss": 1.815, + "step": 12696 + }, + { + "epoch": 2.5444889779559117, + "grad_norm": 19.619136140142267, + "learning_rate": 6.865823302458085e-07, + "loss": 1.8913, + "step": 12697 + }, + { + "epoch": 2.544689378757515, + "grad_norm": 20.225342086389553, + "learning_rate": 6.859928082254386e-07, + "loss": 1.7142, + "step": 12698 + }, + { + "epoch": 2.544889779559118, + "grad_norm": 15.635225858268397, + "learning_rate": 6.854035207640092e-07, + "loss": 1.6899, + "step": 12699 + }, + { + "epoch": 2.5450901803607215, + "grad_norm": 22.078398521218922, + "learning_rate": 6.848144678935564e-07, + "loss": 1.6932, + "step": 12700 + }, + { + "epoch": 2.5452905811623245, + "grad_norm": 24.765049856230828, + "learning_rate": 6.842256496461136e-07, + "loss": 1.1679, + "step": 12701 + }, + { + "epoch": 2.545490981963928, + "grad_norm": 14.849393520371217, + "learning_rate": 6.836370660536912e-07, + "loss": 1.5148, + "step": 12702 + }, + { + "epoch": 2.545691382765531, + "grad_norm": 19.64749257652158, + "learning_rate": 6.830487171482935e-07, + "loss": 1.6006, + "step": 12703 + }, + { + "epoch": 2.5458917835671344, + "grad_norm": 21.56249564346139, + "learning_rate": 6.824606029619091e-07, + "loss": 1.928, + "step": 12704 + }, + { + "epoch": 2.5460921843687374, + "grad_norm": 19.8541131947101, + "learning_rate": 6.818727235265155e-07, + "loss": 1.2063, + "step": 12705 + }, + { + "epoch": 2.546292585170341, + "grad_norm": 19.169829444518, + "learning_rate": 6.812850788740766e-07, + "loss": 1.6512, + "step": 12706 + }, + { + "epoch": 2.546492985971944, + "grad_norm": 22.274327850688195, + "learning_rate": 6.806976690365408e-07, + "loss": 1.6479, + "step": 12707 + }, + { + "epoch": 2.546693386773547, + "grad_norm": 19.819315959743243, + "learning_rate": 6.801104940458514e-07, + "loss": 1.6955, + "step": 12708 + }, + { + "epoch": 2.5468937875751503, + "grad_norm": 22.873113981621994, + "learning_rate": 6.795235539339295e-07, + "loss": 1.5666, + "step": 12709 + }, + { + "epoch": 2.5470941883767537, + "grad_norm": 21.566314758813768, + "learning_rate": 6.789368487326898e-07, + "loss": 1.5526, + "step": 12710 + }, + { + "epoch": 2.5472945891783567, + "grad_norm": 22.617422294402136, + "learning_rate": 6.78350378474032e-07, + "loss": 1.5427, + "step": 12711 + }, + { + "epoch": 2.5474949899799597, + "grad_norm": 27.577177991301433, + "learning_rate": 6.777641431898435e-07, + "loss": 1.4839, + "step": 12712 + }, + { + "epoch": 2.547695390781563, + "grad_norm": 15.765929742045113, + "learning_rate": 6.77178142912e-07, + "loss": 1.2271, + "step": 12713 + }, + { + "epoch": 2.5478957915831666, + "grad_norm": 29.618544017554633, + "learning_rate": 6.765923776723598e-07, + "loss": 1.4183, + "step": 12714 + }, + { + "epoch": 2.5480961923847696, + "grad_norm": 22.95700804112625, + "learning_rate": 6.760068475027759e-07, + "loss": 1.2499, + "step": 12715 + }, + { + "epoch": 2.5482965931863726, + "grad_norm": 21.501684030321172, + "learning_rate": 6.754215524350816e-07, + "loss": 1.3973, + "step": 12716 + }, + { + "epoch": 2.548496993987976, + "grad_norm": 16.559595860036143, + "learning_rate": 6.748364925011009e-07, + "loss": 1.605, + "step": 12717 + }, + { + "epoch": 2.548697394789579, + 
"grad_norm": 17.408022906184513, + "learning_rate": 6.742516677326449e-07, + "loss": 0.8967, + "step": 12718 + }, + { + "epoch": 2.5488977955911825, + "grad_norm": 20.104344283231626, + "learning_rate": 6.736670781615107e-07, + "loss": 1.5128, + "step": 12719 + }, + { + "epoch": 2.5490981963927855, + "grad_norm": 19.032639046186702, + "learning_rate": 6.730827238194842e-07, + "loss": 1.752, + "step": 12720 + }, + { + "epoch": 2.549298597194389, + "grad_norm": 19.451917778182587, + "learning_rate": 6.72498604738337e-07, + "loss": 1.6723, + "step": 12721 + }, + { + "epoch": 2.549498997995992, + "grad_norm": 24.812976004737756, + "learning_rate": 6.719147209498295e-07, + "loss": 1.2467, + "step": 12722 + }, + { + "epoch": 2.5496993987975953, + "grad_norm": 21.913799375505754, + "learning_rate": 6.713310724857064e-07, + "loss": 1.3478, + "step": 12723 + }, + { + "epoch": 2.5498997995991983, + "grad_norm": 20.658286706144466, + "learning_rate": 6.707476593777018e-07, + "loss": 1.106, + "step": 12724 + }, + { + "epoch": 2.5501002004008018, + "grad_norm": 26.786629594109684, + "learning_rate": 6.701644816575393e-07, + "loss": 1.6303, + "step": 12725 + }, + { + "epoch": 2.5503006012024048, + "grad_norm": 26.545286311808383, + "learning_rate": 6.695815393569249e-07, + "loss": 1.395, + "step": 12726 + }, + { + "epoch": 2.550501002004008, + "grad_norm": 24.148322426980048, + "learning_rate": 6.689988325075536e-07, + "loss": 1.3752, + "step": 12727 + }, + { + "epoch": 2.550701402805611, + "grad_norm": 21.006742587844368, + "learning_rate": 6.684163611411098e-07, + "loss": 1.3038, + "step": 12728 + }, + { + "epoch": 2.550901803607214, + "grad_norm": 23.517705516886423, + "learning_rate": 6.678341252892618e-07, + "loss": 1.4041, + "step": 12729 + }, + { + "epoch": 2.5511022044088176, + "grad_norm": 27.15903775080256, + "learning_rate": 6.672521249836689e-07, + "loss": 1.6086, + "step": 12730 + }, + { + "epoch": 2.551302605210421, + "grad_norm": 30.033786306502385, + "learning_rate": 6.66670360255971e-07, + "loss": 1.6659, + "step": 12731 + }, + { + "epoch": 2.551503006012024, + "grad_norm": 25.039115385924248, + "learning_rate": 6.660888311378044e-07, + "loss": 1.7102, + "step": 12732 + }, + { + "epoch": 2.551703406813627, + "grad_norm": 25.278556206113166, + "learning_rate": 6.655075376607851e-07, + "loss": 1.4791, + "step": 12733 + }, + { + "epoch": 2.5519038076152305, + "grad_norm": 21.45624832926246, + "learning_rate": 6.649264798565185e-07, + "loss": 1.5108, + "step": 12734 + }, + { + "epoch": 2.552104208416834, + "grad_norm": 22.621291873545832, + "learning_rate": 6.643456577565988e-07, + "loss": 1.5061, + "step": 12735 + }, + { + "epoch": 2.552304609218437, + "grad_norm": 18.39572519966582, + "learning_rate": 6.637650713926053e-07, + "loss": 1.7266, + "step": 12736 + }, + { + "epoch": 2.55250501002004, + "grad_norm": 19.703300327016077, + "learning_rate": 6.631847207961062e-07, + "loss": 1.1446, + "step": 12737 + }, + { + "epoch": 2.5527054108216434, + "grad_norm": 29.163183223806914, + "learning_rate": 6.626046059986551e-07, + "loss": 1.3836, + "step": 12738 + }, + { + "epoch": 2.5529058116232464, + "grad_norm": 18.061418906187132, + "learning_rate": 6.620247270317953e-07, + "loss": 1.4205, + "step": 12739 + }, + { + "epoch": 2.55310621242485, + "grad_norm": 21.42886767860064, + "learning_rate": 6.614450839270536e-07, + "loss": 1.6485, + "step": 12740 + }, + { + "epoch": 2.553306613226453, + "grad_norm": 35.269264624538955, + "learning_rate": 6.608656767159472e-07, + "loss": 1.8529, + "step": 
12741 + }, + { + "epoch": 2.5535070140280562, + "grad_norm": 18.24686891666044, + "learning_rate": 6.602865054299789e-07, + "loss": 1.548, + "step": 12742 + }, + { + "epoch": 2.5537074148296592, + "grad_norm": 38.08101905365714, + "learning_rate": 6.597075701006394e-07, + "loss": 1.6935, + "step": 12743 + }, + { + "epoch": 2.5539078156312627, + "grad_norm": 16.055234393916137, + "learning_rate": 6.591288707594057e-07, + "loss": 1.455, + "step": 12744 + }, + { + "epoch": 2.5541082164328657, + "grad_norm": 22.44240077952273, + "learning_rate": 6.585504074377436e-07, + "loss": 1.5229, + "step": 12745 + }, + { + "epoch": 2.5543086172344687, + "grad_norm": 16.58570024619344, + "learning_rate": 6.579721801671052e-07, + "loss": 1.6468, + "step": 12746 + }, + { + "epoch": 2.554509018036072, + "grad_norm": 30.61516268822457, + "learning_rate": 6.573941889789265e-07, + "loss": 1.5603, + "step": 12747 + }, + { + "epoch": 2.5547094188376755, + "grad_norm": 17.939016533484967, + "learning_rate": 6.568164339046373e-07, + "loss": 1.8997, + "step": 12748 + }, + { + "epoch": 2.5549098196392785, + "grad_norm": 57.705088489488226, + "learning_rate": 6.56238914975651e-07, + "loss": 1.4197, + "step": 12749 + }, + { + "epoch": 2.5551102204408815, + "grad_norm": 22.836092725155503, + "learning_rate": 6.556616322233655e-07, + "loss": 1.3238, + "step": 12750 + }, + { + "epoch": 2.555310621242485, + "grad_norm": 18.244129394536298, + "learning_rate": 6.550845856791698e-07, + "loss": 1.1384, + "step": 12751 + }, + { + "epoch": 2.5555110220440884, + "grad_norm": 20.471528088809553, + "learning_rate": 6.545077753744395e-07, + "loss": 1.8365, + "step": 12752 + }, + { + "epoch": 2.5557114228456914, + "grad_norm": 18.352185421062842, + "learning_rate": 6.539312013405352e-07, + "loss": 1.5082, + "step": 12753 + }, + { + "epoch": 2.5559118236472944, + "grad_norm": 17.043389091289907, + "learning_rate": 6.533548636088072e-07, + "loss": 1.5088, + "step": 12754 + }, + { + "epoch": 2.556112224448898, + "grad_norm": 23.243491073242993, + "learning_rate": 6.527787622105913e-07, + "loss": 2.1556, + "step": 12755 + }, + { + "epoch": 2.556312625250501, + "grad_norm": 19.225150832289398, + "learning_rate": 6.522028971772126e-07, + "loss": 1.8872, + "step": 12756 + }, + { + "epoch": 2.5565130260521043, + "grad_norm": 26.767623737159774, + "learning_rate": 6.516272685399793e-07, + "loss": 1.3925, + "step": 12757 + }, + { + "epoch": 2.5567134268537073, + "grad_norm": 21.94075709912016, + "learning_rate": 6.510518763301899e-07, + "loss": 1.5153, + "step": 12758 + }, + { + "epoch": 2.5569138276553107, + "grad_norm": 21.013805372068514, + "learning_rate": 6.504767205791296e-07, + "loss": 1.0733, + "step": 12759 + }, + { + "epoch": 2.5571142284569137, + "grad_norm": 22.55259124744689, + "learning_rate": 6.499018013180708e-07, + "loss": 1.6406, + "step": 12760 + }, + { + "epoch": 2.557314629258517, + "grad_norm": 26.989962264427113, + "learning_rate": 6.493271185782718e-07, + "loss": 1.3354, + "step": 12761 + }, + { + "epoch": 2.55751503006012, + "grad_norm": 21.915505513780126, + "learning_rate": 6.487526723909798e-07, + "loss": 1.6989, + "step": 12762 + }, + { + "epoch": 2.5577154308617236, + "grad_norm": 20.328802853333645, + "learning_rate": 6.481784627874294e-07, + "loss": 1.6331, + "step": 12763 + }, + { + "epoch": 2.5579158316633266, + "grad_norm": 43.87110141680841, + "learning_rate": 6.476044897988377e-07, + "loss": 1.6655, + "step": 12764 + }, + { + "epoch": 2.55811623246493, + "grad_norm": 24.378278925868532, + "learning_rate": 
6.470307534564169e-07, + "loss": 1.1979, + "step": 12765 + }, + { + "epoch": 2.558316633266533, + "grad_norm": 24.203732761649764, + "learning_rate": 6.464572537913577e-07, + "loss": 1.4307, + "step": 12766 + }, + { + "epoch": 2.558517034068136, + "grad_norm": 23.687074154701524, + "learning_rate": 6.458839908348447e-07, + "loss": 1.5792, + "step": 12767 + }, + { + "epoch": 2.5587174348697395, + "grad_norm": 23.678738459447874, + "learning_rate": 6.453109646180461e-07, + "loss": 1.9691, + "step": 12768 + }, + { + "epoch": 2.558917835671343, + "grad_norm": 26.736211654845935, + "learning_rate": 6.447381751721188e-07, + "loss": 1.6249, + "step": 12769 + }, + { + "epoch": 2.559118236472946, + "grad_norm": 17.298822662399644, + "learning_rate": 6.441656225282067e-07, + "loss": 1.9534, + "step": 12770 + }, + { + "epoch": 2.559318637274549, + "grad_norm": 18.855781365865766, + "learning_rate": 6.435933067174377e-07, + "loss": 1.3987, + "step": 12771 + }, + { + "epoch": 2.5595190380761523, + "grad_norm": 22.107140353173076, + "learning_rate": 6.43021227770933e-07, + "loss": 1.9062, + "step": 12772 + }, + { + "epoch": 2.5597194388777558, + "grad_norm": 20.18992749175152, + "learning_rate": 6.424493857197944e-07, + "loss": 1.607, + "step": 12773 + }, + { + "epoch": 2.5599198396793588, + "grad_norm": 22.25961780757557, + "learning_rate": 6.418777805951143e-07, + "loss": 1.7918, + "step": 12774 + }, + { + "epoch": 2.5601202404809618, + "grad_norm": 24.64144985680956, + "learning_rate": 6.41306412427975e-07, + "loss": 1.2199, + "step": 12775 + }, + { + "epoch": 2.560320641282565, + "grad_norm": 24.279077958145447, + "learning_rate": 6.407352812494383e-07, + "loss": 1.1492, + "step": 12776 + }, + { + "epoch": 2.560521042084168, + "grad_norm": 14.176823377893808, + "learning_rate": 6.401643870905605e-07, + "loss": 1.2129, + "step": 12777 + }, + { + "epoch": 2.5607214428857716, + "grad_norm": 28.408706712493508, + "learning_rate": 6.395937299823784e-07, + "loss": 1.794, + "step": 12778 + }, + { + "epoch": 2.5609218436873746, + "grad_norm": 32.5463513471982, + "learning_rate": 6.390233099559235e-07, + "loss": 1.4183, + "step": 12779 + }, + { + "epoch": 2.561122244488978, + "grad_norm": 15.899863937993075, + "learning_rate": 6.384531270422089e-07, + "loss": 1.0743, + "step": 12780 + }, + { + "epoch": 2.561322645290581, + "grad_norm": 24.43664458910847, + "learning_rate": 6.378831812722341e-07, + "loss": 1.2462, + "step": 12781 + }, + { + "epoch": 2.5615230460921845, + "grad_norm": 34.28522988136101, + "learning_rate": 6.373134726769919e-07, + "loss": 1.6376, + "step": 12782 + }, + { + "epoch": 2.5617234468937875, + "grad_norm": 18.858496236560246, + "learning_rate": 6.367440012874554e-07, + "loss": 1.3361, + "step": 12783 + }, + { + "epoch": 2.561923847695391, + "grad_norm": 16.00673658272666, + "learning_rate": 6.36174767134588e-07, + "loss": 1.2535, + "step": 12784 + }, + { + "epoch": 2.562124248496994, + "grad_norm": 61.07898338280721, + "learning_rate": 6.356057702493401e-07, + "loss": 1.9783, + "step": 12785 + }, + { + "epoch": 2.5623246492985974, + "grad_norm": 18.248867204367585, + "learning_rate": 6.350370106626491e-07, + "loss": 1.524, + "step": 12786 + }, + { + "epoch": 2.5625250501002004, + "grad_norm": 29.00908336015145, + "learning_rate": 6.344684884054403e-07, + "loss": 1.4989, + "step": 12787 + }, + { + "epoch": 2.5627254509018034, + "grad_norm": 19.05076815975541, + "learning_rate": 6.339002035086211e-07, + "loss": 1.3198, + "step": 12788 + }, + { + "epoch": 2.562925851703407, + 
"grad_norm": 23.42876655541729, + "learning_rate": 6.333321560030958e-07, + "loss": 1.7508, + "step": 12789 + }, + { + "epoch": 2.5631262525050102, + "grad_norm": 20.913245838234875, + "learning_rate": 6.327643459197453e-07, + "loss": 1.5532, + "step": 12790 + }, + { + "epoch": 2.5633266533066132, + "grad_norm": 19.002939303965974, + "learning_rate": 6.321967732894441e-07, + "loss": 1.7291, + "step": 12791 + }, + { + "epoch": 2.5635270541082162, + "grad_norm": 22.94551890855742, + "learning_rate": 6.316294381430516e-07, + "loss": 2.0684, + "step": 12792 + }, + { + "epoch": 2.5637274549098197, + "grad_norm": 18.041154267004192, + "learning_rate": 6.310623405114153e-07, + "loss": 1.6461, + "step": 12793 + }, + { + "epoch": 2.563927855711423, + "grad_norm": 17.733675505672128, + "learning_rate": 6.304954804253693e-07, + "loss": 1.6962, + "step": 12794 + }, + { + "epoch": 2.564128256513026, + "grad_norm": 29.969417606028443, + "learning_rate": 6.29928857915732e-07, + "loss": 1.6781, + "step": 12795 + }, + { + "epoch": 2.564328657314629, + "grad_norm": 19.02800760643271, + "learning_rate": 6.293624730133158e-07, + "loss": 1.6529, + "step": 12796 + }, + { + "epoch": 2.5645290581162326, + "grad_norm": 21.215958149537002, + "learning_rate": 6.287963257489127e-07, + "loss": 1.4933, + "step": 12797 + }, + { + "epoch": 2.5647294589178355, + "grad_norm": 26.63702754212235, + "learning_rate": 6.282304161533054e-07, + "loss": 1.425, + "step": 12798 + }, + { + "epoch": 2.564929859719439, + "grad_norm": 27.30777322408452, + "learning_rate": 6.276647442572642e-07, + "loss": 1.8262, + "step": 12799 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 24.62907338896506, + "learning_rate": 6.270993100915446e-07, + "loss": 1.6352, + "step": 12800 + }, + { + "epoch": 2.5653306613226454, + "grad_norm": 20.729036413949917, + "learning_rate": 6.265341136868919e-07, + "loss": 1.4606, + "step": 12801 + }, + { + "epoch": 2.5655310621242484, + "grad_norm": 19.205757521276677, + "learning_rate": 6.259691550740326e-07, + "loss": 1.4236, + "step": 12802 + }, + { + "epoch": 2.565731462925852, + "grad_norm": 25.704229754176154, + "learning_rate": 6.254044342836896e-07, + "loss": 1.589, + "step": 12803 + }, + { + "epoch": 2.565931863727455, + "grad_norm": 17.18874319773173, + "learning_rate": 6.248399513465636e-07, + "loss": 1.0356, + "step": 12804 + }, + { + "epoch": 2.566132264529058, + "grad_norm": 18.89014304361592, + "learning_rate": 6.242757062933468e-07, + "loss": 1.6572, + "step": 12805 + }, + { + "epoch": 2.5663326653306613, + "grad_norm": 18.095642809666725, + "learning_rate": 6.237116991547215e-07, + "loss": 1.6708, + "step": 12806 + }, + { + "epoch": 2.5665330661322647, + "grad_norm": 19.3638373627381, + "learning_rate": 6.231479299613497e-07, + "loss": 1.7214, + "step": 12807 + }, + { + "epoch": 2.5667334669338677, + "grad_norm": 30.063972929850195, + "learning_rate": 6.225843987438862e-07, + "loss": 1.6046, + "step": 12808 + }, + { + "epoch": 2.5669338677354707, + "grad_norm": 16.560729255265528, + "learning_rate": 6.2202110553297e-07, + "loss": 1.4568, + "step": 12809 + }, + { + "epoch": 2.567134268537074, + "grad_norm": 30.079790112206485, + "learning_rate": 6.214580503592294e-07, + "loss": 0.9841, + "step": 12810 + }, + { + "epoch": 2.5673346693386776, + "grad_norm": 19.639194640357513, + "learning_rate": 6.208952332532787e-07, + "loss": 1.7458, + "step": 12811 + }, + { + "epoch": 2.5675350701402806, + "grad_norm": 17.3068830659776, + "learning_rate": 6.203326542457156e-07, + "loss": 1.5458, + "step": 
12812 + }, + { + "epoch": 2.5677354709418836, + "grad_norm": 18.49775061373404, + "learning_rate": 6.19770313367134e-07, + "loss": 1.4596, + "step": 12813 + }, + { + "epoch": 2.567935871743487, + "grad_norm": 18.29282059136076, + "learning_rate": 6.192082106481046e-07, + "loss": 1.191, + "step": 12814 + }, + { + "epoch": 2.56813627254509, + "grad_norm": 19.290182901730507, + "learning_rate": 6.186463461191921e-07, + "loss": 1.716, + "step": 12815 + }, + { + "epoch": 2.5683366733466935, + "grad_norm": 18.90805981377332, + "learning_rate": 6.180847198109447e-07, + "loss": 1.9097, + "step": 12816 + }, + { + "epoch": 2.5685370741482965, + "grad_norm": 21.01772489794187, + "learning_rate": 6.175233317538998e-07, + "loss": 1.7413, + "step": 12817 + }, + { + "epoch": 2.5687374749499, + "grad_norm": 14.3389183904248, + "learning_rate": 6.169621819785804e-07, + "loss": 1.5155, + "step": 12818 + }, + { + "epoch": 2.568937875751503, + "grad_norm": 19.93867634963005, + "learning_rate": 6.164012705154965e-07, + "loss": 1.0473, + "step": 12819 + }, + { + "epoch": 2.5691382765531063, + "grad_norm": 23.0811752388406, + "learning_rate": 6.158405973951482e-07, + "loss": 2.3246, + "step": 12820 + }, + { + "epoch": 2.5693386773547093, + "grad_norm": 22.409659359146183, + "learning_rate": 6.152801626480165e-07, + "loss": 1.4038, + "step": 12821 + }, + { + "epoch": 2.5695390781563128, + "grad_norm": 25.309146807765696, + "learning_rate": 6.147199663045755e-07, + "loss": 1.6184, + "step": 12822 + }, + { + "epoch": 2.5697394789579158, + "grad_norm": 17.4120932308891, + "learning_rate": 6.141600083952826e-07, + "loss": 1.5148, + "step": 12823 + }, + { + "epoch": 2.569939879759519, + "grad_norm": 25.02057965738277, + "learning_rate": 6.136002889505843e-07, + "loss": 1.765, + "step": 12824 + }, + { + "epoch": 2.570140280561122, + "grad_norm": 21.210845104105122, + "learning_rate": 6.130408080009131e-07, + "loss": 1.4768, + "step": 12825 + }, + { + "epoch": 2.570340681362725, + "grad_norm": 16.456032274111482, + "learning_rate": 6.124815655766891e-07, + "loss": 1.628, + "step": 12826 + }, + { + "epoch": 2.5705410821643286, + "grad_norm": 18.467763352479206, + "learning_rate": 6.119225617083202e-07, + "loss": 1.7502, + "step": 12827 + }, + { + "epoch": 2.570741482965932, + "grad_norm": 22.93063169742807, + "learning_rate": 6.113637964261976e-07, + "loss": 0.9641, + "step": 12828 + }, + { + "epoch": 2.570941883767535, + "grad_norm": 29.851342796467087, + "learning_rate": 6.108052697607042e-07, + "loss": 1.7667, + "step": 12829 + }, + { + "epoch": 2.571142284569138, + "grad_norm": 22.72719465436738, + "learning_rate": 6.102469817422069e-07, + "loss": 1.4085, + "step": 12830 + }, + { + "epoch": 2.5713426853707415, + "grad_norm": 31.43696708433016, + "learning_rate": 6.096889324010607e-07, + "loss": 1.4097, + "step": 12831 + }, + { + "epoch": 2.571543086172345, + "grad_norm": 22.27055795091327, + "learning_rate": 6.091311217676082e-07, + "loss": 1.8556, + "step": 12832 + }, + { + "epoch": 2.571743486973948, + "grad_norm": 15.171365046273284, + "learning_rate": 6.085735498721785e-07, + "loss": 1.3298, + "step": 12833 + }, + { + "epoch": 2.571943887775551, + "grad_norm": 21.162981487498545, + "learning_rate": 6.080162167450882e-07, + "loss": 1.0684, + "step": 12834 + }, + { + "epoch": 2.5721442885771544, + "grad_norm": 26.61405205443373, + "learning_rate": 6.07459122416637e-07, + "loss": 1.5208, + "step": 12835 + }, + { + "epoch": 2.5723446893787574, + "grad_norm": 20.616371872658178, + "learning_rate": 
6.069022669171182e-07, + "loss": 1.2895, + "step": 12836 + }, + { + "epoch": 2.572545090180361, + "grad_norm": 38.488168477201434, + "learning_rate": 6.063456502768094e-07, + "loss": 1.4152, + "step": 12837 + }, + { + "epoch": 2.572745490981964, + "grad_norm": 23.882618082644264, + "learning_rate": 6.057892725259717e-07, + "loss": 1.4549, + "step": 12838 + }, + { + "epoch": 2.5729458917835673, + "grad_norm": 23.175333404020325, + "learning_rate": 6.05233133694858e-07, + "loss": 1.9884, + "step": 12839 + }, + { + "epoch": 2.5731462925851702, + "grad_norm": 27.264320358730927, + "learning_rate": 6.046772338137063e-07, + "loss": 1.8661, + "step": 12840 + }, + { + "epoch": 2.5733466933867737, + "grad_norm": 22.067072574975167, + "learning_rate": 6.041215729127414e-07, + "loss": 1.5746, + "step": 12841 + }, + { + "epoch": 2.5735470941883767, + "grad_norm": 19.079579762181734, + "learning_rate": 6.035661510221752e-07, + "loss": 1.7587, + "step": 12842 + }, + { + "epoch": 2.57374749498998, + "grad_norm": 20.59511678664473, + "learning_rate": 6.030109681722074e-07, + "loss": 1.5507, + "step": 12843 + }, + { + "epoch": 2.573947895791583, + "grad_norm": 25.269469624824925, + "learning_rate": 6.024560243930244e-07, + "loss": 1.1414, + "step": 12844 + }, + { + "epoch": 2.5741482965931866, + "grad_norm": 17.735858808290807, + "learning_rate": 6.019013197147982e-07, + "loss": 1.3719, + "step": 12845 + }, + { + "epoch": 2.5743486973947896, + "grad_norm": 19.666813750676134, + "learning_rate": 6.013468541676892e-07, + "loss": 1.3338, + "step": 12846 + }, + { + "epoch": 2.5745490981963925, + "grad_norm": 37.24871644420438, + "learning_rate": 6.007926277818448e-07, + "loss": 1.7939, + "step": 12847 + }, + { + "epoch": 2.574749498997996, + "grad_norm": 28.086082146922887, + "learning_rate": 6.002386405873989e-07, + "loss": 1.433, + "step": 12848 + }, + { + "epoch": 2.5749498997995994, + "grad_norm": 24.38139745239401, + "learning_rate": 5.996848926144733e-07, + "loss": 1.8167, + "step": 12849 + }, + { + "epoch": 2.5751503006012024, + "grad_norm": 18.959552196983463, + "learning_rate": 5.99131383893175e-07, + "loss": 2.0762, + "step": 12850 + }, + { + "epoch": 2.5753507014028054, + "grad_norm": 24.724990876298488, + "learning_rate": 5.985781144536007e-07, + "loss": 1.7657, + "step": 12851 + }, + { + "epoch": 2.575551102204409, + "grad_norm": 17.892349564180968, + "learning_rate": 5.980250843258295e-07, + "loss": 1.1882, + "step": 12852 + }, + { + "epoch": 2.5757515030060123, + "grad_norm": 20.78764356647483, + "learning_rate": 5.974722935399347e-07, + "loss": 1.3731, + "step": 12853 + }, + { + "epoch": 2.5759519038076153, + "grad_norm": 19.640598397155447, + "learning_rate": 5.969197421259682e-07, + "loss": 1.3666, + "step": 12854 + }, + { + "epoch": 2.5761523046092183, + "grad_norm": 17.869317525652118, + "learning_rate": 5.963674301139755e-07, + "loss": 1.8017, + "step": 12855 + }, + { + "epoch": 2.5763527054108217, + "grad_norm": 19.066054388442794, + "learning_rate": 5.958153575339859e-07, + "loss": 1.4261, + "step": 12856 + }, + { + "epoch": 2.5765531062124247, + "grad_norm": 18.317074423352256, + "learning_rate": 5.952635244160165e-07, + "loss": 1.2985, + "step": 12857 + }, + { + "epoch": 2.576753507014028, + "grad_norm": 15.614525044405573, + "learning_rate": 5.947119307900723e-07, + "loss": 1.4287, + "step": 12858 + }, + { + "epoch": 2.576953907815631, + "grad_norm": 15.474733815753831, + "learning_rate": 5.941605766861408e-07, + "loss": 1.6404, + "step": 12859 + }, + { + "epoch": 2.5771543086172346, 
+ "grad_norm": 31.36956501059931, + "learning_rate": 5.936094621342036e-07, + "loss": 1.2845, + "step": 12860 + }, + { + "epoch": 2.5773547094188376, + "grad_norm": 28.047097307425037, + "learning_rate": 5.930585871642258e-07, + "loss": 2.2659, + "step": 12861 + }, + { + "epoch": 2.577555110220441, + "grad_norm": 17.645198878405658, + "learning_rate": 5.925079518061555e-07, + "loss": 1.5512, + "step": 12862 + }, + { + "epoch": 2.577755511022044, + "grad_norm": 20.079795575217382, + "learning_rate": 5.91957556089936e-07, + "loss": 1.2356, + "step": 12863 + }, + { + "epoch": 2.577955911823647, + "grad_norm": 23.090794610185373, + "learning_rate": 5.9140740004549e-07, + "loss": 1.6512, + "step": 12864 + }, + { + "epoch": 2.5781563126252505, + "grad_norm": 22.860794225721442, + "learning_rate": 5.908574837027309e-07, + "loss": 1.8245, + "step": 12865 + }, + { + "epoch": 2.578356713426854, + "grad_norm": 30.436874951857043, + "learning_rate": 5.903078070915596e-07, + "loss": 1.6255, + "step": 12866 + }, + { + "epoch": 2.578557114228457, + "grad_norm": 22.100534576327114, + "learning_rate": 5.897583702418619e-07, + "loss": 1.974, + "step": 12867 + }, + { + "epoch": 2.57875751503006, + "grad_norm": 17.17402033351451, + "learning_rate": 5.892091731835126e-07, + "loss": 1.5056, + "step": 12868 + }, + { + "epoch": 2.5789579158316633, + "grad_norm": 19.80659941833235, + "learning_rate": 5.886602159463694e-07, + "loss": 1.4339, + "step": 12869 + }, + { + "epoch": 2.579158316633267, + "grad_norm": 45.076722101287935, + "learning_rate": 5.881114985602843e-07, + "loss": 1.7546, + "step": 12870 + }, + { + "epoch": 2.5793587174348698, + "grad_norm": 19.958377413386255, + "learning_rate": 5.875630210550881e-07, + "loss": 1.4239, + "step": 12871 + }, + { + "epoch": 2.5795591182364728, + "grad_norm": 17.8556810490276, + "learning_rate": 5.870147834606043e-07, + "loss": 1.393, + "step": 12872 + }, + { + "epoch": 2.579759519038076, + "grad_norm": 23.754268457713298, + "learning_rate": 5.864667858066408e-07, + "loss": 1.6989, + "step": 12873 + }, + { + "epoch": 2.579959919839679, + "grad_norm": 25.67183538121571, + "learning_rate": 5.859190281229926e-07, + "loss": 1.2415, + "step": 12874 + }, + { + "epoch": 2.5801603206412826, + "grad_norm": 29.81260848347223, + "learning_rate": 5.853715104394442e-07, + "loss": 1.5096, + "step": 12875 + }, + { + "epoch": 2.5803607214428856, + "grad_norm": 23.563804384317677, + "learning_rate": 5.848242327857611e-07, + "loss": 1.8665, + "step": 12876 + }, + { + "epoch": 2.580561122244489, + "grad_norm": 15.40521660451283, + "learning_rate": 5.842771951917042e-07, + "loss": 1.6359, + "step": 12877 + }, + { + "epoch": 2.580761523046092, + "grad_norm": 22.777704049133206, + "learning_rate": 5.837303976870135e-07, + "loss": 2.2854, + "step": 12878 + }, + { + "epoch": 2.5809619238476955, + "grad_norm": 23.742733719062237, + "learning_rate": 5.831838403014196e-07, + "loss": 1.5558, + "step": 12879 + }, + { + "epoch": 2.5811623246492985, + "grad_norm": 23.551302911347538, + "learning_rate": 5.826375230646403e-07, + "loss": 1.3876, + "step": 12880 + }, + { + "epoch": 2.581362725450902, + "grad_norm": 30.418134623141054, + "learning_rate": 5.820914460063792e-07, + "loss": 1.5002, + "step": 12881 + }, + { + "epoch": 2.581563126252505, + "grad_norm": 20.42355494281479, + "learning_rate": 5.815456091563287e-07, + "loss": 1.6218, + "step": 12882 + }, + { + "epoch": 2.5817635270541084, + "grad_norm": 25.31821370155739, + "learning_rate": 5.810000125441639e-07, + "loss": 1.7494, + "step": 12883 
+ }, + { + "epoch": 2.5819639278557114, + "grad_norm": 20.645855718932204, + "learning_rate": 5.804546561995539e-07, + "loss": 1.5419, + "step": 12884 + }, + { + "epoch": 2.5821643286573144, + "grad_norm": 19.776013607000692, + "learning_rate": 5.799095401521465e-07, + "loss": 1.4691, + "step": 12885 + }, + { + "epoch": 2.582364729458918, + "grad_norm": 19.962426077365702, + "learning_rate": 5.793646644315809e-07, + "loss": 1.4392, + "step": 12886 + }, + { + "epoch": 2.5825651302605213, + "grad_norm": 19.52408530553552, + "learning_rate": 5.788200290674862e-07, + "loss": 1.7931, + "step": 12887 + }, + { + "epoch": 2.5827655310621243, + "grad_norm": 24.64400037928885, + "learning_rate": 5.782756340894718e-07, + "loss": 1.7172, + "step": 12888 + }, + { + "epoch": 2.5829659318637272, + "grad_norm": 36.62212366286232, + "learning_rate": 5.777314795271383e-07, + "loss": 1.3901, + "step": 12889 + }, + { + "epoch": 2.5831663326653307, + "grad_norm": 17.403130806815806, + "learning_rate": 5.771875654100722e-07, + "loss": 1.7737, + "step": 12890 + }, + { + "epoch": 2.583366733466934, + "grad_norm": 19.859675248159558, + "learning_rate": 5.766438917678463e-07, + "loss": 1.5022, + "step": 12891 + }, + { + "epoch": 2.583567134268537, + "grad_norm": 23.473475907106856, + "learning_rate": 5.761004586300234e-07, + "loss": 1.6497, + "step": 12892 + }, + { + "epoch": 2.58376753507014, + "grad_norm": 24.986921470691755, + "learning_rate": 5.75557266026146e-07, + "loss": 1.6878, + "step": 12893 + }, + { + "epoch": 2.5839679358717436, + "grad_norm": 22.87844911297366, + "learning_rate": 5.750143139857534e-07, + "loss": 1.3377, + "step": 12894 + }, + { + "epoch": 2.5841683366733466, + "grad_norm": 29.13015282128068, + "learning_rate": 5.744716025383634e-07, + "loss": 2.1866, + "step": 12895 + }, + { + "epoch": 2.58436873747495, + "grad_norm": 21.11098062553445, + "learning_rate": 5.739291317134854e-07, + "loss": 1.3843, + "step": 12896 + }, + { + "epoch": 2.584569138276553, + "grad_norm": 19.513587546458602, + "learning_rate": 5.733869015406135e-07, + "loss": 1.7687, + "step": 12897 + }, + { + "epoch": 2.5847695390781564, + "grad_norm": 19.332586213525538, + "learning_rate": 5.728449120492302e-07, + "loss": 1.332, + "step": 12898 + }, + { + "epoch": 2.5849699398797594, + "grad_norm": 16.928888131946188, + "learning_rate": 5.723031632688053e-07, + "loss": 1.564, + "step": 12899 + }, + { + "epoch": 2.585170340681363, + "grad_norm": 18.35986820120566, + "learning_rate": 5.717616552287908e-07, + "loss": 0.895, + "step": 12900 + }, + { + "epoch": 2.585370741482966, + "grad_norm": 23.245677997818074, + "learning_rate": 5.712203879586342e-07, + "loss": 1.5781, + "step": 12901 + }, + { + "epoch": 2.5855711422845693, + "grad_norm": 24.800658710290396, + "learning_rate": 5.706793614877615e-07, + "loss": 1.7887, + "step": 12902 + }, + { + "epoch": 2.5857715430861723, + "grad_norm": 28.707674586216008, + "learning_rate": 5.701385758455901e-07, + "loss": 1.8313, + "step": 12903 + }, + { + "epoch": 2.5859719438877757, + "grad_norm": 55.64119260387672, + "learning_rate": 5.69598031061524e-07, + "loss": 1.9564, + "step": 12904 + }, + { + "epoch": 2.5861723446893787, + "grad_norm": 21.31434965011015, + "learning_rate": 5.690577271649522e-07, + "loss": 1.3162, + "step": 12905 + }, + { + "epoch": 2.5863727454909817, + "grad_norm": 40.77455639121012, + "learning_rate": 5.685176641852524e-07, + "loss": 1.9003, + "step": 12906 + }, + { + "epoch": 2.586573146292585, + "grad_norm": 21.213812203881215, + "learning_rate": 
5.679778421517889e-07, + "loss": 1.5538, + "step": 12907 + }, + { + "epoch": 2.5867735470941886, + "grad_norm": 18.392140100597945, + "learning_rate": 5.674382610939133e-07, + "loss": 1.4309, + "step": 12908 + }, + { + "epoch": 2.5869739478957916, + "grad_norm": 17.718662731194446, + "learning_rate": 5.668989210409615e-07, + "loss": 1.8297, + "step": 12909 + }, + { + "epoch": 2.5871743486973946, + "grad_norm": 44.74957916278961, + "learning_rate": 5.663598220222593e-07, + "loss": 1.3427, + "step": 12910 + }, + { + "epoch": 2.587374749498998, + "grad_norm": 21.80973984776634, + "learning_rate": 5.65820964067118e-07, + "loss": 1.6138, + "step": 12911 + }, + { + "epoch": 2.5875751503006015, + "grad_norm": 20.909182363899216, + "learning_rate": 5.652823472048368e-07, + "loss": 1.692, + "step": 12912 + }, + { + "epoch": 2.5877755511022045, + "grad_norm": 19.77014187711671, + "learning_rate": 5.647439714647002e-07, + "loss": 1.5962, + "step": 12913 + }, + { + "epoch": 2.5879759519038075, + "grad_norm": 23.05535459774302, + "learning_rate": 5.642058368759812e-07, + "loss": 1.465, + "step": 12914 + }, + { + "epoch": 2.588176352705411, + "grad_norm": 25.594263215209985, + "learning_rate": 5.636679434679393e-07, + "loss": 1.9103, + "step": 12915 + }, + { + "epoch": 2.588376753507014, + "grad_norm": 20.600216285204166, + "learning_rate": 5.631302912698194e-07, + "loss": 2.1136, + "step": 12916 + }, + { + "epoch": 2.5885771543086173, + "grad_norm": 23.45055138104073, + "learning_rate": 5.625928803108538e-07, + "loss": 1.3555, + "step": 12917 + }, + { + "epoch": 2.5887775551102203, + "grad_norm": 18.304630867159876, + "learning_rate": 5.620557106202651e-07, + "loss": 1.642, + "step": 12918 + }, + { + "epoch": 2.588977955911824, + "grad_norm": 21.37085269806271, + "learning_rate": 5.615187822272583e-07, + "loss": 1.2158, + "step": 12919 + }, + { + "epoch": 2.5891783567134268, + "grad_norm": 23.76973917747046, + "learning_rate": 5.609820951610262e-07, + "loss": 1.6175, + "step": 12920 + }, + { + "epoch": 2.58937875751503, + "grad_norm": 22.421770582635126, + "learning_rate": 5.60445649450751e-07, + "loss": 1.1998, + "step": 12921 + }, + { + "epoch": 2.589579158316633, + "grad_norm": 76.14264492067097, + "learning_rate": 5.599094451255988e-07, + "loss": 1.8919, + "step": 12922 + }, + { + "epoch": 2.589779559118236, + "grad_norm": 63.15558261111077, + "learning_rate": 5.59373482214724e-07, + "loss": 1.9211, + "step": 12923 + }, + { + "epoch": 2.5899799599198396, + "grad_norm": 22.337782724120334, + "learning_rate": 5.588377607472684e-07, + "loss": 1.3869, + "step": 12924 + }, + { + "epoch": 2.590180360721443, + "grad_norm": 23.624305532678346, + "learning_rate": 5.583022807523602e-07, + "loss": 1.381, + "step": 12925 + }, + { + "epoch": 2.590380761523046, + "grad_norm": 24.05971197617766, + "learning_rate": 5.577670422591125e-07, + "loss": 1.3967, + "step": 12926 + }, + { + "epoch": 2.590581162324649, + "grad_norm": 19.95378118127776, + "learning_rate": 5.572320452966279e-07, + "loss": 1.6384, + "step": 12927 + }, + { + "epoch": 2.5907815631262525, + "grad_norm": 45.53646354610407, + "learning_rate": 5.566972898939954e-07, + "loss": 1.5855, + "step": 12928 + }, + { + "epoch": 2.590981963927856, + "grad_norm": 20.68997000237831, + "learning_rate": 5.561627760802901e-07, + "loss": 1.132, + "step": 12929 + }, + { + "epoch": 2.591182364729459, + "grad_norm": 42.52851042754658, + "learning_rate": 5.556285038845749e-07, + "loss": 2.1737, + "step": 12930 + }, + { + "epoch": 2.591382765531062, + "grad_norm": 
18.10414553670186, + "learning_rate": 5.550944733358976e-07, + "loss": 1.568, + "step": 12931 + }, + { + "epoch": 2.5915831663326654, + "grad_norm": 23.07707505657631, + "learning_rate": 5.545606844632967e-07, + "loss": 1.4391, + "step": 12932 + }, + { + "epoch": 2.5917835671342684, + "grad_norm": 31.8456257998151, + "learning_rate": 5.540271372957912e-07, + "loss": 1.4166, + "step": 12933 + }, + { + "epoch": 2.591983967935872, + "grad_norm": 18.607321784047773, + "learning_rate": 5.53493831862395e-07, + "loss": 1.0589, + "step": 12934 + }, + { + "epoch": 2.592184368737475, + "grad_norm": 18.575895271658535, + "learning_rate": 5.529607681921018e-07, + "loss": 1.9301, + "step": 12935 + }, + { + "epoch": 2.5923847695390783, + "grad_norm": 22.806195403301775, + "learning_rate": 5.524279463138965e-07, + "loss": 1.9455, + "step": 12936 + }, + { + "epoch": 2.5925851703406813, + "grad_norm": 19.636633418214682, + "learning_rate": 5.518953662567484e-07, + "loss": 1.153, + "step": 12937 + }, + { + "epoch": 2.5927855711422847, + "grad_norm": 26.538861018503557, + "learning_rate": 5.513630280496158e-07, + "loss": 1.7904, + "step": 12938 + }, + { + "epoch": 2.5929859719438877, + "grad_norm": 17.66341219060405, + "learning_rate": 5.508309317214428e-07, + "loss": 1.4036, + "step": 12939 + }, + { + "epoch": 2.593186372745491, + "grad_norm": 18.307783509916813, + "learning_rate": 5.502990773011574e-07, + "loss": 1.5908, + "step": 12940 + }, + { + "epoch": 2.593386773547094, + "grad_norm": 18.287101667636716, + "learning_rate": 5.49767464817682e-07, + "loss": 1.4077, + "step": 12941 + }, + { + "epoch": 2.5935871743486976, + "grad_norm": 23.940757497378385, + "learning_rate": 5.492360942999176e-07, + "loss": 1.7224, + "step": 12942 + }, + { + "epoch": 2.5937875751503006, + "grad_norm": 26.94380033295883, + "learning_rate": 5.487049657767563e-07, + "loss": 1.1126, + "step": 12943 + }, + { + "epoch": 2.5939879759519036, + "grad_norm": 20.089642833713135, + "learning_rate": 5.481740792770768e-07, + "loss": 1.6404, + "step": 12944 + }, + { + "epoch": 2.594188376753507, + "grad_norm": 39.87407662388067, + "learning_rate": 5.476434348297443e-07, + "loss": 2.1398, + "step": 12945 + }, + { + "epoch": 2.5943887775551104, + "grad_norm": 19.61732419197565, + "learning_rate": 5.471130324636115e-07, + "loss": 1.4324, + "step": 12946 + }, + { + "epoch": 2.5945891783567134, + "grad_norm": 16.04120816244126, + "learning_rate": 5.465828722075145e-07, + "loss": 1.3225, + "step": 12947 + }, + { + "epoch": 2.5947895791583164, + "grad_norm": 23.377922219127935, + "learning_rate": 5.460529540902809e-07, + "loss": 1.743, + "step": 12948 + }, + { + "epoch": 2.59498997995992, + "grad_norm": 19.339613474687127, + "learning_rate": 5.455232781407243e-07, + "loss": 1.1786, + "step": 12949 + }, + { + "epoch": 2.5951903807615233, + "grad_norm": 19.530661891045888, + "learning_rate": 5.449938443876407e-07, + "loss": 1.2942, + "step": 12950 + }, + { + "epoch": 2.5953907815631263, + "grad_norm": 23.291345925773246, + "learning_rate": 5.444646528598197e-07, + "loss": 1.6002, + "step": 12951 + }, + { + "epoch": 2.5955911823647293, + "grad_norm": 23.177761806165297, + "learning_rate": 5.43935703586031e-07, + "loss": 1.5107, + "step": 12952 + }, + { + "epoch": 2.5957915831663327, + "grad_norm": 27.73911782936538, + "learning_rate": 5.434069965950368e-07, + "loss": 2.0118, + "step": 12953 + }, + { + "epoch": 2.5959919839679357, + "grad_norm": 19.764962542469483, + "learning_rate": 5.428785319155821e-07, + "loss": 1.7593, + "step": 12954 + }, + { 
+ "epoch": 2.596192384769539, + "grad_norm": 27.183420982158996, + "learning_rate": 5.42350309576401e-07, + "loss": 1.9675, + "step": 12955 + }, + { + "epoch": 2.596392785571142, + "grad_norm": 17.9683821510165, + "learning_rate": 5.418223296062148e-07, + "loss": 1.4346, + "step": 12956 + }, + { + "epoch": 2.5965931863727456, + "grad_norm": 23.243072316798134, + "learning_rate": 5.41294592033727e-07, + "loss": 1.5028, + "step": 12957 + }, + { + "epoch": 2.5967935871743486, + "grad_norm": 34.43663126524327, + "learning_rate": 5.407670968876366e-07, + "loss": 1.7054, + "step": 12958 + }, + { + "epoch": 2.596993987975952, + "grad_norm": 22.547575880609187, + "learning_rate": 5.4023984419662e-07, + "loss": 1.7655, + "step": 12959 + }, + { + "epoch": 2.597194388777555, + "grad_norm": 17.495912936353484, + "learning_rate": 5.397128339893465e-07, + "loss": 1.557, + "step": 12960 + }, + { + "epoch": 2.5973947895791585, + "grad_norm": 20.607907422703132, + "learning_rate": 5.391860662944704e-07, + "loss": 1.5952, + "step": 12961 + }, + { + "epoch": 2.5975951903807615, + "grad_norm": 24.160957060463744, + "learning_rate": 5.38659541140632e-07, + "loss": 1.6548, + "step": 12962 + }, + { + "epoch": 2.597795591182365, + "grad_norm": 39.67985679548396, + "learning_rate": 5.381332585564614e-07, + "loss": 1.5204, + "step": 12963 + }, + { + "epoch": 2.597995991983968, + "grad_norm": 19.96667089003923, + "learning_rate": 5.376072185705699e-07, + "loss": 1.2359, + "step": 12964 + }, + { + "epoch": 2.598196392785571, + "grad_norm": 21.759918564987817, + "learning_rate": 5.370814212115627e-07, + "loss": 1.6904, + "step": 12965 + }, + { + "epoch": 2.5983967935871743, + "grad_norm": 19.215701949764853, + "learning_rate": 5.365558665080256e-07, + "loss": 1.0869, + "step": 12966 + }, + { + "epoch": 2.598597194388778, + "grad_norm": 22.435491244359078, + "learning_rate": 5.360305544885353e-07, + "loss": 1.7724, + "step": 12967 + }, + { + "epoch": 2.598797595190381, + "grad_norm": 21.470912586165756, + "learning_rate": 5.355054851816521e-07, + "loss": 1.5009, + "step": 12968 + }, + { + "epoch": 2.598997995991984, + "grad_norm": 19.83725840990157, + "learning_rate": 5.349806586159268e-07, + "loss": 1.8721, + "step": 12969 + }, + { + "epoch": 2.599198396793587, + "grad_norm": 19.831468614254298, + "learning_rate": 5.344560748198946e-07, + "loss": 1.8333, + "step": 12970 + }, + { + "epoch": 2.5993987975951907, + "grad_norm": 25.895457326000212, + "learning_rate": 5.339317338220757e-07, + "loss": 1.699, + "step": 12971 + }, + { + "epoch": 2.5995991983967937, + "grad_norm": 20.47698576002248, + "learning_rate": 5.334076356509826e-07, + "loss": 1.7308, + "step": 12972 + }, + { + "epoch": 2.5997995991983966, + "grad_norm": 30.5846615240657, + "learning_rate": 5.328837803351083e-07, + "loss": 1.9512, + "step": 12973 + }, + { + "epoch": 2.6, + "grad_norm": 18.818759298627768, + "learning_rate": 5.323601679029367e-07, + "loss": 1.5513, + "step": 12974 + }, + { + "epoch": 2.600200400801603, + "grad_norm": 18.767037491979735, + "learning_rate": 5.318367983829393e-07, + "loss": 1.4753, + "step": 12975 + }, + { + "epoch": 2.6004008016032065, + "grad_norm": 23.615138256017094, + "learning_rate": 5.313136718035694e-07, + "loss": 1.5709, + "step": 12976 + }, + { + "epoch": 2.6006012024048095, + "grad_norm": 19.043009851829503, + "learning_rate": 5.307907881932723e-07, + "loss": 1.6189, + "step": 12977 + }, + { + "epoch": 2.600801603206413, + "grad_norm": 18.90926658914987, + "learning_rate": 5.302681475804767e-07, + "loss": 1.2852, 
+ "step": 12978 + }, + { + "epoch": 2.601002004008016, + "grad_norm": 29.519063877632597, + "learning_rate": 5.297457499935998e-07, + "loss": 1.6666, + "step": 12979 + }, + { + "epoch": 2.6012024048096194, + "grad_norm": 20.23336596242089, + "learning_rate": 5.292235954610464e-07, + "loss": 1.4193, + "step": 12980 + }, + { + "epoch": 2.6014028056112224, + "grad_norm": 29.55378294161303, + "learning_rate": 5.287016840112036e-07, + "loss": 1.8599, + "step": 12981 + }, + { + "epoch": 2.6016032064128254, + "grad_norm": 19.671209590864258, + "learning_rate": 5.281800156724526e-07, + "loss": 1.3734, + "step": 12982 + }, + { + "epoch": 2.601803607214429, + "grad_norm": 25.19417012991528, + "learning_rate": 5.276585904731541e-07, + "loss": 1.5152, + "step": 12983 + }, + { + "epoch": 2.6020040080160323, + "grad_norm": 19.700595735814982, + "learning_rate": 5.271374084416591e-07, + "loss": 1.867, + "step": 12984 + }, + { + "epoch": 2.6022044088176353, + "grad_norm": 18.32207817374327, + "learning_rate": 5.266164696063064e-07, + "loss": 1.4098, + "step": 12985 + }, + { + "epoch": 2.6024048096192383, + "grad_norm": 22.459872692448766, + "learning_rate": 5.260957739954198e-07, + "loss": 1.6966, + "step": 12986 + }, + { + "epoch": 2.6026052104208417, + "grad_norm": 19.45926441528566, + "learning_rate": 5.255753216373105e-07, + "loss": 1.6865, + "step": 12987 + }, + { + "epoch": 2.602805611222445, + "grad_norm": 31.18711663715221, + "learning_rate": 5.250551125602743e-07, + "loss": 1.7654, + "step": 12988 + }, + { + "epoch": 2.603006012024048, + "grad_norm": 21.446102487647963, + "learning_rate": 5.24535146792599e-07, + "loss": 1.1211, + "step": 12989 + }, + { + "epoch": 2.603206412825651, + "grad_norm": 15.039674337020283, + "learning_rate": 5.240154243625534e-07, + "loss": 1.2925, + "step": 12990 + }, + { + "epoch": 2.6034068136272546, + "grad_norm": 18.886130743462665, + "learning_rate": 5.234959452983962e-07, + "loss": 1.3055, + "step": 12991 + }, + { + "epoch": 2.6036072144288576, + "grad_norm": 20.28663766133847, + "learning_rate": 5.229767096283723e-07, + "loss": 1.4434, + "step": 12992 + }, + { + "epoch": 2.603807615230461, + "grad_norm": 19.587205775440143, + "learning_rate": 5.224577173807133e-07, + "loss": 1.6491, + "step": 12993 + }, + { + "epoch": 2.604008016032064, + "grad_norm": 23.17168329073103, + "learning_rate": 5.219389685836374e-07, + "loss": 1.7368, + "step": 12994 + }, + { + "epoch": 2.6042084168336674, + "grad_norm": 19.745326394171318, + "learning_rate": 5.214204632653508e-07, + "loss": 1.1771, + "step": 12995 + }, + { + "epoch": 2.6044088176352704, + "grad_norm": 20.28670071931863, + "learning_rate": 5.209022014540449e-07, + "loss": 1.861, + "step": 12996 + }, + { + "epoch": 2.604609218436874, + "grad_norm": 16.75116277298783, + "learning_rate": 5.203841831778972e-07, + "loss": 1.5219, + "step": 12997 + }, + { + "epoch": 2.604809619238477, + "grad_norm": 21.67599990308302, + "learning_rate": 5.198664084650734e-07, + "loss": 1.7919, + "step": 12998 + }, + { + "epoch": 2.6050100200400803, + "grad_norm": 17.349236877849318, + "learning_rate": 5.193488773437278e-07, + "loss": 1.7979, + "step": 12999 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 23.933768182509645, + "learning_rate": 5.188315898419971e-07, + "loss": 1.4884, + "step": 13000 + }, + { + "epoch": 2.6054108216432867, + "grad_norm": 18.67763104109004, + "learning_rate": 5.183145459880073e-07, + "loss": 1.4183, + "step": 13001 + }, + { + "epoch": 2.6056112224448897, + "grad_norm": 19.369321658653913, + 
"learning_rate": 5.177977458098715e-07, + "loss": 1.4278, + "step": 13002 + }, + { + "epoch": 2.6058116232464927, + "grad_norm": 35.398172021300155, + "learning_rate": 5.172811893356889e-07, + "loss": 1.1765, + "step": 13003 + }, + { + "epoch": 2.606012024048096, + "grad_norm": 21.840331361857015, + "learning_rate": 5.167648765935457e-07, + "loss": 1.8208, + "step": 13004 + }, + { + "epoch": 2.6062124248496996, + "grad_norm": 22.30579614583414, + "learning_rate": 5.162488076115119e-07, + "loss": 1.5332, + "step": 13005 + }, + { + "epoch": 2.6064128256513026, + "grad_norm": 22.252359829956585, + "learning_rate": 5.15732982417651e-07, + "loss": 1.4373, + "step": 13006 + }, + { + "epoch": 2.6066132264529056, + "grad_norm": 27.5612873392322, + "learning_rate": 5.152174010400068e-07, + "loss": 1.4872, + "step": 13007 + }, + { + "epoch": 2.606813627254509, + "grad_norm": 24.02336919079566, + "learning_rate": 5.147020635066124e-07, + "loss": 1.3485, + "step": 13008 + }, + { + "epoch": 2.6070140280561125, + "grad_norm": 19.2467007283405, + "learning_rate": 5.141869698454877e-07, + "loss": 1.578, + "step": 13009 + }, + { + "epoch": 2.6072144288577155, + "grad_norm": 20.294303070562144, + "learning_rate": 5.13672120084639e-07, + "loss": 1.258, + "step": 13010 + }, + { + "epoch": 2.6074148296593185, + "grad_norm": 23.233369414893943, + "learning_rate": 5.131575142520595e-07, + "loss": 1.9125, + "step": 13011 + }, + { + "epoch": 2.607615230460922, + "grad_norm": 21.044782138652288, + "learning_rate": 5.126431523757291e-07, + "loss": 1.2311, + "step": 13012 + }, + { + "epoch": 2.607815631262525, + "grad_norm": 21.18267889122844, + "learning_rate": 5.121290344836155e-07, + "loss": 2.0629, + "step": 13013 + }, + { + "epoch": 2.6080160320641284, + "grad_norm": 17.272682043193186, + "learning_rate": 5.116151606036702e-07, + "loss": 1.5894, + "step": 13014 + }, + { + "epoch": 2.6082164328657313, + "grad_norm": 19.503581593468827, + "learning_rate": 5.11101530763834e-07, + "loss": 1.5652, + "step": 13015 + }, + { + "epoch": 2.608416833667335, + "grad_norm": 18.128035108589334, + "learning_rate": 5.105881449920336e-07, + "loss": 1.1702, + "step": 13016 + }, + { + "epoch": 2.608617234468938, + "grad_norm": 23.6484967752133, + "learning_rate": 5.100750033161833e-07, + "loss": 1.7086, + "step": 13017 + }, + { + "epoch": 2.6088176352705412, + "grad_norm": 22.63135604990643, + "learning_rate": 5.095621057641826e-07, + "loss": 1.548, + "step": 13018 + }, + { + "epoch": 2.609018036072144, + "grad_norm": 23.56097382994545, + "learning_rate": 5.090494523639183e-07, + "loss": 1.6422, + "step": 13019 + }, + { + "epoch": 2.6092184368737477, + "grad_norm": 24.667345688612237, + "learning_rate": 5.085370431432657e-07, + "loss": 1.3155, + "step": 13020 + }, + { + "epoch": 2.6094188376753507, + "grad_norm": 19.58959244807531, + "learning_rate": 5.080248781300823e-07, + "loss": 1.6022, + "step": 13021 + }, + { + "epoch": 2.609619238476954, + "grad_norm": 24.553913210188547, + "learning_rate": 5.075129573522186e-07, + "loss": 1.3542, + "step": 13022 + }, + { + "epoch": 2.609819639278557, + "grad_norm": 21.683552791887443, + "learning_rate": 5.070012808375063e-07, + "loss": 1.9398, + "step": 13023 + }, + { + "epoch": 2.61002004008016, + "grad_norm": 20.32323346953692, + "learning_rate": 5.064898486137665e-07, + "loss": 2.1069, + "step": 13024 + }, + { + "epoch": 2.6102204408817635, + "grad_norm": 28.38615935522764, + "learning_rate": 5.059786607088069e-07, + "loss": 1.2988, + "step": 13025 + }, + { + "epoch": 2.610420841683367, 
+ "grad_norm": 18.83629581529702, + "learning_rate": 5.054677171504207e-07, + "loss": 1.5413, + "step": 13026 + }, + { + "epoch": 2.61062124248497, + "grad_norm": 17.790479096373346, + "learning_rate": 5.04957017966391e-07, + "loss": 1.4259, + "step": 13027 + }, + { + "epoch": 2.610821643286573, + "grad_norm": 17.155824662046683, + "learning_rate": 5.044465631844808e-07, + "loss": 1.333, + "step": 13028 + }, + { + "epoch": 2.6110220440881764, + "grad_norm": 27.585056823276506, + "learning_rate": 5.039363528324487e-07, + "loss": 1.0485, + "step": 13029 + }, + { + "epoch": 2.6112224448897794, + "grad_norm": 24.527650203559165, + "learning_rate": 5.034263869380346e-07, + "loss": 1.8922, + "step": 13030 + }, + { + "epoch": 2.611422845691383, + "grad_norm": 25.275226634702143, + "learning_rate": 5.029166655289641e-07, + "loss": 1.6283, + "step": 13031 + }, + { + "epoch": 2.611623246492986, + "grad_norm": 19.493710053462657, + "learning_rate": 5.024071886329529e-07, + "loss": 1.276, + "step": 13032 + }, + { + "epoch": 2.6118236472945893, + "grad_norm": 16.44979604474819, + "learning_rate": 5.018979562777021e-07, + "loss": 1.5259, + "step": 13033 + }, + { + "epoch": 2.6120240480961923, + "grad_norm": 34.59257253494964, + "learning_rate": 5.013889684908996e-07, + "loss": 1.5278, + "step": 13034 + }, + { + "epoch": 2.6122244488977957, + "grad_norm": 40.14685352657628, + "learning_rate": 5.008802253002187e-07, + "loss": 1.5134, + "step": 13035 + }, + { + "epoch": 2.6124248496993987, + "grad_norm": 21.858911739202497, + "learning_rate": 5.003717267333219e-07, + "loss": 1.4294, + "step": 13036 + }, + { + "epoch": 2.612625250501002, + "grad_norm": 45.97356561454659, + "learning_rate": 4.998634728178569e-07, + "loss": 1.4375, + "step": 13037 + }, + { + "epoch": 2.612825651302605, + "grad_norm": 28.201308593707186, + "learning_rate": 4.993554635814557e-07, + "loss": 1.4971, + "step": 13038 + }, + { + "epoch": 2.6130260521042086, + "grad_norm": 17.76619005603483, + "learning_rate": 4.988476990517438e-07, + "loss": 1.0644, + "step": 13039 + }, + { + "epoch": 2.6132264529058116, + "grad_norm": 50.20760720677937, + "learning_rate": 4.983401792563258e-07, + "loss": 1.5395, + "step": 13040 + }, + { + "epoch": 2.6134268537074146, + "grad_norm": 28.653757571712298, + "learning_rate": 4.978329042227969e-07, + "loss": 1.6845, + "step": 13041 + }, + { + "epoch": 2.613627254509018, + "grad_norm": 22.729603968939358, + "learning_rate": 4.97325873978739e-07, + "loss": 1.5076, + "step": 13042 + }, + { + "epoch": 2.6138276553106214, + "grad_norm": 19.78394517351037, + "learning_rate": 4.968190885517205e-07, + "loss": 1.5427, + "step": 13043 + }, + { + "epoch": 2.6140280561122244, + "grad_norm": 24.55433071352585, + "learning_rate": 4.963125479692954e-07, + "loss": 1.3565, + "step": 13044 + }, + { + "epoch": 2.6142284569138274, + "grad_norm": 26.411603944619078, + "learning_rate": 4.95806252259004e-07, + "loss": 1.2747, + "step": 13045 + }, + { + "epoch": 2.614428857715431, + "grad_norm": 15.34784123791379, + "learning_rate": 4.953002014483771e-07, + "loss": 1.523, + "step": 13046 + }, + { + "epoch": 2.6146292585170343, + "grad_norm": 22.124406589452246, + "learning_rate": 4.94794395564927e-07, + "loss": 1.5638, + "step": 13047 + }, + { + "epoch": 2.6148296593186373, + "grad_norm": 24.60272373517622, + "learning_rate": 4.942888346361558e-07, + "loss": 1.7122, + "step": 13048 + }, + { + "epoch": 2.6150300601202403, + "grad_norm": 25.062968830160834, + "learning_rate": 4.937835186895512e-07, + "loss": 1.5311, + "step": 
13049 + }, + { + "epoch": 2.6152304609218437, + "grad_norm": 23.037577098035392, + "learning_rate": 4.932784477525893e-07, + "loss": 1.468, + "step": 13050 + }, + { + "epoch": 2.6154308617234467, + "grad_norm": 17.133066231093192, + "learning_rate": 4.927736218527312e-07, + "loss": 1.0936, + "step": 13051 + }, + { + "epoch": 2.61563126252505, + "grad_norm": 20.782831282097984, + "learning_rate": 4.922690410174225e-07, + "loss": 1.3734, + "step": 13052 + }, + { + "epoch": 2.615831663326653, + "grad_norm": 23.29512035626524, + "learning_rate": 4.917647052741026e-07, + "loss": 1.7117, + "step": 13053 + }, + { + "epoch": 2.6160320641282566, + "grad_norm": 20.77806874071024, + "learning_rate": 4.912606146501885e-07, + "loss": 1.5454, + "step": 13054 + }, + { + "epoch": 2.6162324649298596, + "grad_norm": 14.024401903252611, + "learning_rate": 4.907567691730902e-07, + "loss": 1.4134, + "step": 13055 + }, + { + "epoch": 2.616432865731463, + "grad_norm": 31.412418864485893, + "learning_rate": 4.902531688702044e-07, + "loss": 1.7322, + "step": 13056 + }, + { + "epoch": 2.616633266533066, + "grad_norm": 20.888198855719956, + "learning_rate": 4.897498137689094e-07, + "loss": 1.8781, + "step": 13057 + }, + { + "epoch": 2.6168336673346695, + "grad_norm": 19.784434300502742, + "learning_rate": 4.892467038965765e-07, + "loss": 1.1062, + "step": 13058 + }, + { + "epoch": 2.6170340681362725, + "grad_norm": 30.234098563574648, + "learning_rate": 4.88743839280556e-07, + "loss": 1.371, + "step": 13059 + }, + { + "epoch": 2.617234468937876, + "grad_norm": 20.073501565101566, + "learning_rate": 4.882412199481939e-07, + "loss": 1.3598, + "step": 13060 + }, + { + "epoch": 2.617434869739479, + "grad_norm": 29.40769736074584, + "learning_rate": 4.877388459268173e-07, + "loss": 1.5704, + "step": 13061 + }, + { + "epoch": 2.617635270541082, + "grad_norm": 22.158789287312008, + "learning_rate": 4.872367172437381e-07, + "loss": 2.0974, + "step": 13062 + }, + { + "epoch": 2.6178356713426854, + "grad_norm": 13.43361651826519, + "learning_rate": 4.867348339262623e-07, + "loss": 1.4028, + "step": 13063 + }, + { + "epoch": 2.618036072144289, + "grad_norm": 19.217460862251283, + "learning_rate": 4.862331960016747e-07, + "loss": 1.6323, + "step": 13064 + }, + { + "epoch": 2.618236472945892, + "grad_norm": 22.638779660017555, + "learning_rate": 4.857318034972514e-07, + "loss": 1.3664, + "step": 13065 + }, + { + "epoch": 2.618436873747495, + "grad_norm": 22.660885851024823, + "learning_rate": 4.852306564402532e-07, + "loss": 1.6543, + "step": 13066 + }, + { + "epoch": 2.6186372745490982, + "grad_norm": 27.200123714903437, + "learning_rate": 4.847297548579288e-07, + "loss": 1.547, + "step": 13067 + }, + { + "epoch": 2.6188376753507017, + "grad_norm": 16.14734670343983, + "learning_rate": 4.842290987775139e-07, + "loss": 1.3427, + "step": 13068 + }, + { + "epoch": 2.6190380761523047, + "grad_norm": 24.411237616313617, + "learning_rate": 4.837286882262265e-07, + "loss": 1.6979, + "step": 13069 + }, + { + "epoch": 2.6192384769539077, + "grad_norm": 22.94905535918909, + "learning_rate": 4.832285232312795e-07, + "loss": 1.2262, + "step": 13070 + }, + { + "epoch": 2.619438877755511, + "grad_norm": 19.218191917420302, + "learning_rate": 4.827286038198637e-07, + "loss": 1.4471, + "step": 13071 + }, + { + "epoch": 2.619639278557114, + "grad_norm": 22.30668174195754, + "learning_rate": 4.822289300191618e-07, + "loss": 1.462, + "step": 13072 + }, + { + "epoch": 2.6198396793587175, + "grad_norm": 58.3285705030371, + "learning_rate": 
4.817295018563417e-07, + "loss": 1.7501, + "step": 13073 + }, + { + "epoch": 2.6200400801603205, + "grad_norm": 23.50850141664144, + "learning_rate": 4.812303193585583e-07, + "loss": 1.8852, + "step": 13074 + }, + { + "epoch": 2.620240480961924, + "grad_norm": 36.239317326049736, + "learning_rate": 4.807313825529536e-07, + "loss": 1.6722, + "step": 13075 + }, + { + "epoch": 2.620440881763527, + "grad_norm": 23.073084105486483, + "learning_rate": 4.802326914666533e-07, + "loss": 1.1557, + "step": 13076 + }, + { + "epoch": 2.6206412825651304, + "grad_norm": 25.123610353195687, + "learning_rate": 4.797342461267745e-07, + "loss": 1.5014, + "step": 13077 + }, + { + "epoch": 2.6208416833667334, + "grad_norm": 18.949372907559507, + "learning_rate": 4.792360465604168e-07, + "loss": 1.6171, + "step": 13078 + }, + { + "epoch": 2.621042084168337, + "grad_norm": 23.480655383940412, + "learning_rate": 4.787380927946683e-07, + "loss": 1.5585, + "step": 13079 + }, + { + "epoch": 2.62124248496994, + "grad_norm": 26.42951273494587, + "learning_rate": 4.782403848566037e-07, + "loss": 1.6131, + "step": 13080 + }, + { + "epoch": 2.6214428857715433, + "grad_norm": 20.613571685528537, + "learning_rate": 4.777429227732844e-07, + "loss": 1.4681, + "step": 13081 + }, + { + "epoch": 2.6216432865731463, + "grad_norm": 16.99906010152141, + "learning_rate": 4.772457065717572e-07, + "loss": 1.4882, + "step": 13082 + }, + { + "epoch": 2.6218436873747493, + "grad_norm": 23.051922127871855, + "learning_rate": 4.767487362790579e-07, + "loss": 1.7747, + "step": 13083 + }, + { + "epoch": 2.6220440881763527, + "grad_norm": 24.232990652974824, + "learning_rate": 4.762520119222075e-07, + "loss": 1.7289, + "step": 13084 + }, + { + "epoch": 2.622244488977956, + "grad_norm": 27.162773064672887, + "learning_rate": 4.7575553352821156e-07, + "loss": 1.9128, + "step": 13085 + }, + { + "epoch": 2.622444889779559, + "grad_norm": 19.060647949131415, + "learning_rate": 4.752593011240647e-07, + "loss": 1.3671, + "step": 13086 + }, + { + "epoch": 2.622645290581162, + "grad_norm": 24.28019643309943, + "learning_rate": 4.7476331473675085e-07, + "loss": 1.9669, + "step": 13087 + }, + { + "epoch": 2.6228456913827656, + "grad_norm": 23.92087645475488, + "learning_rate": 4.742675743932346e-07, + "loss": 1.5609, + "step": 13088 + }, + { + "epoch": 2.6230460921843686, + "grad_norm": 18.40994360611469, + "learning_rate": 4.737720801204709e-07, + "loss": 1.5313, + "step": 13089 + }, + { + "epoch": 2.623246492985972, + "grad_norm": 21.818618831216316, + "learning_rate": 4.7327683194540064e-07, + "loss": 1.7687, + "step": 13090 + }, + { + "epoch": 2.623446893787575, + "grad_norm": 16.060113393754733, + "learning_rate": 4.72781829894951e-07, + "loss": 1.2856, + "step": 13091 + }, + { + "epoch": 2.6236472945891784, + "grad_norm": 16.997552924921894, + "learning_rate": 4.722870739960356e-07, + "loss": 1.2359, + "step": 13092 + }, + { + "epoch": 2.6238476953907814, + "grad_norm": 22.869342101622276, + "learning_rate": 4.7179256427555607e-07, + "loss": 1.4199, + "step": 13093 + }, + { + "epoch": 2.624048096192385, + "grad_norm": 25.290138899640393, + "learning_rate": 4.7129830076039997e-07, + "loss": 1.4127, + "step": 13094 + }, + { + "epoch": 2.624248496993988, + "grad_norm": 45.404582267439075, + "learning_rate": 4.70804283477439e-07, + "loss": 1.7922, + "step": 13095 + }, + { + "epoch": 2.6244488977955913, + "grad_norm": 17.69605558410207, + "learning_rate": 4.703105124535351e-07, + "loss": 1.3633, + "step": 13096 + }, + { + "epoch": 2.6246492985971943, 
+ "grad_norm": 21.20965760989396, + "learning_rate": 4.698169877155345e-07, + "loss": 1.6272, + "step": 13097 + }, + { + "epoch": 2.6248496993987978, + "grad_norm": 16.961624749929065, + "learning_rate": 4.693237092902719e-07, + "loss": 1.2414, + "step": 13098 + }, + { + "epoch": 2.6250501002004007, + "grad_norm": 21.066203852706206, + "learning_rate": 4.6883067720456753e-07, + "loss": 1.5814, + "step": 13099 + }, + { + "epoch": 2.6252505010020037, + "grad_norm": 19.579598705121562, + "learning_rate": 4.683378914852271e-07, + "loss": 1.5652, + "step": 13100 + }, + { + "epoch": 2.625450901803607, + "grad_norm": 29.790326353213345, + "learning_rate": 4.6784535215904645e-07, + "loss": 1.4138, + "step": 13101 + }, + { + "epoch": 2.6256513026052106, + "grad_norm": 28.98276301553585, + "learning_rate": 4.673530592528025e-07, + "loss": 1.7808, + "step": 13102 + }, + { + "epoch": 2.6258517034068136, + "grad_norm": 30.5320416578933, + "learning_rate": 4.668610127932638e-07, + "loss": 1.3728, + "step": 13103 + }, + { + "epoch": 2.6260521042084166, + "grad_norm": 21.149577278252167, + "learning_rate": 4.663692128071834e-07, + "loss": 1.4016, + "step": 13104 + }, + { + "epoch": 2.62625250501002, + "grad_norm": 22.477418548576512, + "learning_rate": 4.658776593213016e-07, + "loss": 1.5011, + "step": 13105 + }, + { + "epoch": 2.6264529058116235, + "grad_norm": 18.66389524858641, + "learning_rate": 4.6538635236234366e-07, + "loss": 1.2285, + "step": 13106 + }, + { + "epoch": 2.6266533066132265, + "grad_norm": 18.903822585800075, + "learning_rate": 4.648952919570238e-07, + "loss": 1.113, + "step": 13107 + }, + { + "epoch": 2.6268537074148295, + "grad_norm": 26.589693983336634, + "learning_rate": 4.6440447813204227e-07, + "loss": 1.4825, + "step": 13108 + }, + { + "epoch": 2.627054108216433, + "grad_norm": 17.85693028173431, + "learning_rate": 4.639139109140828e-07, + "loss": 1.6297, + "step": 13109 + }, + { + "epoch": 2.627254509018036, + "grad_norm": 22.06144270175895, + "learning_rate": 4.634235903298212e-07, + "loss": 2.0015, + "step": 13110 + }, + { + "epoch": 2.6274549098196394, + "grad_norm": 37.90459247861137, + "learning_rate": 4.629335164059151e-07, + "loss": 1.7222, + "step": 13111 + }, + { + "epoch": 2.6276553106212424, + "grad_norm": 20.48788999347172, + "learning_rate": 4.6244368916901096e-07, + "loss": 1.1712, + "step": 13112 + }, + { + "epoch": 2.627855711422846, + "grad_norm": 19.128557619905145, + "learning_rate": 4.619541086457413e-07, + "loss": 1.4184, + "step": 13113 + }, + { + "epoch": 2.628056112224449, + "grad_norm": 25.80690139351941, + "learning_rate": 4.6146477486272546e-07, + "loss": 1.7873, + "step": 13114 + }, + { + "epoch": 2.6282565130260522, + "grad_norm": 19.310634956626448, + "learning_rate": 4.6097568784657043e-07, + "loss": 2.1145, + "step": 13115 + }, + { + "epoch": 2.6284569138276552, + "grad_norm": 16.165725772069024, + "learning_rate": 4.604868476238655e-07, + "loss": 1.6624, + "step": 13116 + }, + { + "epoch": 2.6286573146292587, + "grad_norm": 22.810491364651696, + "learning_rate": 4.599982542211928e-07, + "loss": 1.3236, + "step": 13117 + }, + { + "epoch": 2.6288577154308617, + "grad_norm": 23.226762816353236, + "learning_rate": 4.5950990766511716e-07, + "loss": 1.8808, + "step": 13118 + }, + { + "epoch": 2.629058116232465, + "grad_norm": 20.342898128528677, + "learning_rate": 4.5902180798218953e-07, + "loss": 1.4643, + "step": 13119 + }, + { + "epoch": 2.629258517034068, + "grad_norm": 27.131913365962816, + "learning_rate": 4.5853395519894985e-07, + "loss": 
1.4926, + "step": 13120 + }, + { + "epoch": 2.629458917835671, + "grad_norm": 29.571597517529376, + "learning_rate": 4.5804634934192184e-07, + "loss": 1.2405, + "step": 13121 + }, + { + "epoch": 2.6296593186372745, + "grad_norm": 18.895622208321722, + "learning_rate": 4.5755899043761874e-07, + "loss": 1.6368, + "step": 13122 + }, + { + "epoch": 2.629859719438878, + "grad_norm": 27.13545363673933, + "learning_rate": 4.570718785125389e-07, + "loss": 2.0481, + "step": 13123 + }, + { + "epoch": 2.630060120240481, + "grad_norm": 22.892874171731293, + "learning_rate": 4.5658501359316653e-07, + "loss": 1.6628, + "step": 13124 + }, + { + "epoch": 2.630260521042084, + "grad_norm": 23.63238504513463, + "learning_rate": 4.5609839570597503e-07, + "loss": 1.559, + "step": 13125 + }, + { + "epoch": 2.6304609218436874, + "grad_norm": 34.17875584562787, + "learning_rate": 4.5561202487741927e-07, + "loss": 1.1935, + "step": 13126 + }, + { + "epoch": 2.630661322645291, + "grad_norm": 22.816984203005518, + "learning_rate": 4.551259011339476e-07, + "loss": 1.3616, + "step": 13127 + }, + { + "epoch": 2.630861723446894, + "grad_norm": 24.410181662477402, + "learning_rate": 4.546400245019883e-07, + "loss": 1.3052, + "step": 13128 + }, + { + "epoch": 2.631062124248497, + "grad_norm": 18.869888942522373, + "learning_rate": 4.5415439500796077e-07, + "loss": 1.3716, + "step": 13129 + }, + { + "epoch": 2.6312625250501003, + "grad_norm": 25.19973869218665, + "learning_rate": 4.536690126782689e-07, + "loss": 1.9472, + "step": 13130 + }, + { + "epoch": 2.6314629258517033, + "grad_norm": 30.130143215033737, + "learning_rate": 4.5318387753930383e-07, + "loss": 2.2472, + "step": 13131 + }, + { + "epoch": 2.6316633266533067, + "grad_norm": 34.862924508873185, + "learning_rate": 4.526989896174444e-07, + "loss": 1.5345, + "step": 13132 + }, + { + "epoch": 2.6318637274549097, + "grad_norm": 20.413110028806415, + "learning_rate": 4.522143489390507e-07, + "loss": 1.5974, + "step": 13133 + }, + { + "epoch": 2.632064128256513, + "grad_norm": 28.92856660604694, + "learning_rate": 4.517299555304783e-07, + "loss": 1.6911, + "step": 13134 + }, + { + "epoch": 2.632264529058116, + "grad_norm": 19.83770538085691, + "learning_rate": 4.5124580941806165e-07, + "loss": 1.5411, + "step": 13135 + }, + { + "epoch": 2.6324649298597196, + "grad_norm": 37.745563071102076, + "learning_rate": 4.5076191062812304e-07, + "loss": 1.3202, + "step": 13136 + }, + { + "epoch": 2.6326653306613226, + "grad_norm": 19.8142869627102, + "learning_rate": 4.50278259186977e-07, + "loss": 1.6674, + "step": 13137 + }, + { + "epoch": 2.632865731462926, + "grad_norm": 20.091864011286862, + "learning_rate": 4.497948551209169e-07, + "loss": 1.6211, + "step": 13138 + }, + { + "epoch": 2.633066132264529, + "grad_norm": 18.720508254522226, + "learning_rate": 4.4931169845622847e-07, + "loss": 1.0375, + "step": 13139 + }, + { + "epoch": 2.6332665330661325, + "grad_norm": 16.714726042791536, + "learning_rate": 4.488287892191784e-07, + "loss": 1.3405, + "step": 13140 + }, + { + "epoch": 2.6334669338677354, + "grad_norm": 20.3090561553138, + "learning_rate": 4.4834612743602624e-07, + "loss": 1.3429, + "step": 13141 + }, + { + "epoch": 2.6336673346693384, + "grad_norm": 23.837980351239366, + "learning_rate": 4.478637131330149e-07, + "loss": 1.6981, + "step": 13142 + }, + { + "epoch": 2.633867735470942, + "grad_norm": 23.668309338140034, + "learning_rate": 4.473815463363707e-07, + "loss": 1.4163, + "step": 13143 + }, + { + "epoch": 2.6340681362725453, + "grad_norm": 
14.789463867372985, + "learning_rate": 4.4689962707231473e-07, + "loss": 1.2972, + "step": 13144 + }, + { + "epoch": 2.6342685370741483, + "grad_norm": 46.156759325606984, + "learning_rate": 4.4641795536704557e-07, + "loss": 1.6028, + "step": 13145 + }, + { + "epoch": 2.6344689378757513, + "grad_norm": 24.344353104806213, + "learning_rate": 4.45936531246754e-07, + "loss": 1.3929, + "step": 13146 + }, + { + "epoch": 2.6346693386773548, + "grad_norm": 24.312156073129398, + "learning_rate": 4.4545535473761556e-07, + "loss": 1.5488, + "step": 13147 + }, + { + "epoch": 2.6348697394789578, + "grad_norm": 19.351491467641953, + "learning_rate": 4.449744258657929e-07, + "loss": 1.4593, + "step": 13148 + }, + { + "epoch": 2.635070140280561, + "grad_norm": 17.125251765374117, + "learning_rate": 4.4449374465743543e-07, + "loss": 1.1272, + "step": 13149 + }, + { + "epoch": 2.635270541082164, + "grad_norm": 20.16858407580218, + "learning_rate": 4.440133111386752e-07, + "loss": 1.8755, + "step": 13150 + }, + { + "epoch": 2.6354709418837676, + "grad_norm": 20.184377227861678, + "learning_rate": 4.43533125335639e-07, + "loss": 1.177, + "step": 13151 + }, + { + "epoch": 2.6356713426853706, + "grad_norm": 26.272628895283905, + "learning_rate": 4.430531872744315e-07, + "loss": 1.2536, + "step": 13152 + }, + { + "epoch": 2.635871743486974, + "grad_norm": 26.754725160715623, + "learning_rate": 4.42573496981149e-07, + "loss": 1.5385, + "step": 13153 + }, + { + "epoch": 2.636072144288577, + "grad_norm": 16.88780618380959, + "learning_rate": 4.42094054481873e-07, + "loss": 1.408, + "step": 13154 + }, + { + "epoch": 2.6362725450901805, + "grad_norm": 17.530913349435313, + "learning_rate": 4.416148598026709e-07, + "loss": 1.5487, + "step": 13155 + }, + { + "epoch": 2.6364729458917835, + "grad_norm": 22.0241795145226, + "learning_rate": 4.4113591296959913e-07, + "loss": 1.7901, + "step": 13156 + }, + { + "epoch": 2.636673346693387, + "grad_norm": 49.44745014128103, + "learning_rate": 4.406572140086951e-07, + "loss": 1.5821, + "step": 13157 + }, + { + "epoch": 2.63687374749499, + "grad_norm": 20.823352680879776, + "learning_rate": 4.4017876294599024e-07, + "loss": 1.376, + "step": 13158 + }, + { + "epoch": 2.637074148296593, + "grad_norm": 17.711939916992915, + "learning_rate": 4.3970055980749606e-07, + "loss": 1.3083, + "step": 13159 + }, + { + "epoch": 2.6372745490981964, + "grad_norm": 23.315130263769287, + "learning_rate": 4.392226046192144e-07, + "loss": 1.2005, + "step": 13160 + }, + { + "epoch": 2.6374749498998, + "grad_norm": 25.314882826943617, + "learning_rate": 4.387448974071318e-07, + "loss": 1.2487, + "step": 13161 + }, + { + "epoch": 2.637675350701403, + "grad_norm": 19.414124501040043, + "learning_rate": 4.382674381972224e-07, + "loss": 1.9458, + "step": 13162 + }, + { + "epoch": 2.637875751503006, + "grad_norm": 18.78248463852089, + "learning_rate": 4.3779022701544547e-07, + "loss": 1.7022, + "step": 13163 + }, + { + "epoch": 2.6380761523046092, + "grad_norm": 17.503504057342877, + "learning_rate": 4.373132638877492e-07, + "loss": 1.4694, + "step": 13164 + }, + { + "epoch": 2.6382765531062127, + "grad_norm": 28.364627462589414, + "learning_rate": 4.368365488400661e-07, + "loss": 1.8429, + "step": 13165 + }, + { + "epoch": 2.6384769539078157, + "grad_norm": 17.243663758086512, + "learning_rate": 4.3636008189831545e-07, + "loss": 1.6433, + "step": 13166 + }, + { + "epoch": 2.6386773547094187, + "grad_norm": 29.005974986022654, + "learning_rate": 4.358838630884021e-07, + "loss": 2.0306, + "step": 13167 + 
}, + { + "epoch": 2.638877755511022, + "grad_norm": 22.539033754821023, + "learning_rate": 4.3540789243622264e-07, + "loss": 1.8695, + "step": 13168 + }, + { + "epoch": 2.639078156312625, + "grad_norm": 24.24405381354414, + "learning_rate": 4.3493216996765343e-07, + "loss": 1.4641, + "step": 13169 + }, + { + "epoch": 2.6392785571142285, + "grad_norm": 23.321608158637858, + "learning_rate": 4.3445669570855996e-07, + "loss": 1.5751, + "step": 13170 + }, + { + "epoch": 2.6394789579158315, + "grad_norm": 22.143094120413334, + "learning_rate": 4.33981469684796e-07, + "loss": 1.7009, + "step": 13171 + }, + { + "epoch": 2.639679358717435, + "grad_norm": 19.01563974274614, + "learning_rate": 4.335064919221993e-07, + "loss": 1.7739, + "step": 13172 + }, + { + "epoch": 2.639879759519038, + "grad_norm": 26.675915332072922, + "learning_rate": 4.3303176244659626e-07, + "loss": 1.9566, + "step": 13173 + }, + { + "epoch": 2.6400801603206414, + "grad_norm": 43.104145083331524, + "learning_rate": 4.3255728128379583e-07, + "loss": 2.2411, + "step": 13174 + }, + { + "epoch": 2.6402805611222444, + "grad_norm": 26.44165773439957, + "learning_rate": 4.320830484596006e-07, + "loss": 1.4149, + "step": 13175 + }, + { + "epoch": 2.640480961923848, + "grad_norm": 19.29004200162897, + "learning_rate": 4.316090639997911e-07, + "loss": 1.4978, + "step": 13176 + }, + { + "epoch": 2.640681362725451, + "grad_norm": 21.5138945302195, + "learning_rate": 4.3113532793014114e-07, + "loss": 1.6106, + "step": 13177 + }, + { + "epoch": 2.6408817635270543, + "grad_norm": 16.988431084049903, + "learning_rate": 4.3066184027640736e-07, + "loss": 1.3038, + "step": 13178 + }, + { + "epoch": 2.6410821643286573, + "grad_norm": 22.650069446686206, + "learning_rate": 4.3018860106433413e-07, + "loss": 1.3114, + "step": 13179 + }, + { + "epoch": 2.6412825651302603, + "grad_norm": 23.702416109918595, + "learning_rate": 4.297156103196526e-07, + "loss": 1.9018, + "step": 13180 + }, + { + "epoch": 2.6414829659318637, + "grad_norm": 15.308487911721928, + "learning_rate": 4.2924286806807935e-07, + "loss": 1.7407, + "step": 13181 + }, + { + "epoch": 2.641683366733467, + "grad_norm": 20.51343869523838, + "learning_rate": 4.287703743353194e-07, + "loss": 1.659, + "step": 13182 + }, + { + "epoch": 2.64188376753507, + "grad_norm": 28.44231257435099, + "learning_rate": 4.2829812914706115e-07, + "loss": 1.9123, + "step": 13183 + }, + { + "epoch": 2.642084168336673, + "grad_norm": 20.23514667779313, + "learning_rate": 4.2782613252898233e-07, + "loss": 1.2556, + "step": 13184 + }, + { + "epoch": 2.6422845691382766, + "grad_norm": 24.857813146542938, + "learning_rate": 4.273543845067457e-07, + "loss": 1.9766, + "step": 13185 + }, + { + "epoch": 2.64248496993988, + "grad_norm": 13.036914335705928, + "learning_rate": 4.2688288510600083e-07, + "loss": 1.1734, + "step": 13186 + }, + { + "epoch": 2.642685370741483, + "grad_norm": 35.45586338730162, + "learning_rate": 4.264116343523839e-07, + "loss": 1.2609, + "step": 13187 + }, + { + "epoch": 2.642885771543086, + "grad_norm": 25.55841587165328, + "learning_rate": 4.2594063227151814e-07, + "loss": 1.3988, + "step": 13188 + }, + { + "epoch": 2.6430861723446895, + "grad_norm": 18.203116888637204, + "learning_rate": 4.254698788890127e-07, + "loss": 1.0786, + "step": 13189 + }, + { + "epoch": 2.6432865731462925, + "grad_norm": 20.321703657996597, + "learning_rate": 4.249993742304614e-07, + "loss": 1.3161, + "step": 13190 + }, + { + "epoch": 2.643486973947896, + "grad_norm": 20.23185315814713, + "learning_rate": 
4.245291183214478e-07, + "loss": 1.2481, + "step": 13191 + }, + { + "epoch": 2.643687374749499, + "grad_norm": 16.70402958154697, + "learning_rate": 4.240591111875403e-07, + "loss": 1.4641, + "step": 13192 + }, + { + "epoch": 2.6438877755511023, + "grad_norm": 23.39045213572751, + "learning_rate": 4.2358935285429337e-07, + "loss": 1.36, + "step": 13193 + }, + { + "epoch": 2.6440881763527053, + "grad_norm": 22.265495905498945, + "learning_rate": 4.231198433472483e-07, + "loss": 1.291, + "step": 13194 + }, + { + "epoch": 2.6442885771543088, + "grad_norm": 15.892438979769782, + "learning_rate": 4.2265058269193416e-07, + "loss": 1.3302, + "step": 13195 + }, + { + "epoch": 2.6444889779559118, + "grad_norm": 19.246545556059317, + "learning_rate": 4.221815709138649e-07, + "loss": 1.5558, + "step": 13196 + }, + { + "epoch": 2.6446893787575148, + "grad_norm": 20.05656941747082, + "learning_rate": 4.2171280803853963e-07, + "loss": 1.4045, + "step": 13197 + }, + { + "epoch": 2.644889779559118, + "grad_norm": 29.64670654707375, + "learning_rate": 4.2124429409144853e-07, + "loss": 1.7628, + "step": 13198 + }, + { + "epoch": 2.6450901803607216, + "grad_norm": 23.841645770915772, + "learning_rate": 4.207760290980645e-07, + "loss": 1.4591, + "step": 13199 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 33.722068968616156, + "learning_rate": 4.2030801308384607e-07, + "loss": 2.1619, + "step": 13200 + }, + { + "epoch": 2.6454909819639276, + "grad_norm": 22.718012884983516, + "learning_rate": 4.198402460742418e-07, + "loss": 1.769, + "step": 13201 + }, + { + "epoch": 2.645691382765531, + "grad_norm": 32.452118282758214, + "learning_rate": 4.1937272809468354e-07, + "loss": 1.8653, + "step": 13202 + }, + { + "epoch": 2.6458917835671345, + "grad_norm": 15.466680291898045, + "learning_rate": 4.189054591705921e-07, + "loss": 1.2981, + "step": 13203 + }, + { + "epoch": 2.6460921843687375, + "grad_norm": 19.60886037516754, + "learning_rate": 4.184384393273733e-07, + "loss": 1.4033, + "step": 13204 + }, + { + "epoch": 2.6462925851703405, + "grad_norm": 19.56436965469766, + "learning_rate": 4.179716685904195e-07, + "loss": 1.604, + "step": 13205 + }, + { + "epoch": 2.646492985971944, + "grad_norm": 24.29746868717235, + "learning_rate": 4.17505146985111e-07, + "loss": 1.5102, + "step": 13206 + }, + { + "epoch": 2.646693386773547, + "grad_norm": 21.433144887796015, + "learning_rate": 4.170388745368098e-07, + "loss": 1.4044, + "step": 13207 + }, + { + "epoch": 2.6468937875751504, + "grad_norm": 23.616914539628276, + "learning_rate": 4.1657285127087166e-07, + "loss": 1.4627, + "step": 13208 + }, + { + "epoch": 2.6470941883767534, + "grad_norm": 27.638326585403394, + "learning_rate": 4.16107077212633e-07, + "loss": 1.8688, + "step": 13209 + }, + { + "epoch": 2.647294589178357, + "grad_norm": 20.580632050309305, + "learning_rate": 4.156415523874185e-07, + "loss": 1.6224, + "step": 13210 + }, + { + "epoch": 2.64749498997996, + "grad_norm": 22.30964936897901, + "learning_rate": 4.151762768205403e-07, + "loss": 1.8715, + "step": 13211 + }, + { + "epoch": 2.6476953907815632, + "grad_norm": 28.796569735653996, + "learning_rate": 4.147112505372958e-07, + "loss": 1.4089, + "step": 13212 + }, + { + "epoch": 2.6478957915831662, + "grad_norm": 16.103360966468657, + "learning_rate": 4.1424647356296977e-07, + "loss": 1.4807, + "step": 13213 + }, + { + "epoch": 2.6480961923847697, + "grad_norm": 26.847302212014128, + "learning_rate": 4.137819459228304e-07, + "loss": 1.5647, + "step": 13214 + }, + { + "epoch": 2.6482965931863727, 
+ "grad_norm": 24.69115706983484, + "learning_rate": 4.13317667642138e-07, + "loss": 1.0982, + "step": 13215 + }, + { + "epoch": 2.648496993987976, + "grad_norm": 19.394102061455303, + "learning_rate": 4.12853638746134e-07, + "loss": 1.1686, + "step": 13216 + }, + { + "epoch": 2.648697394789579, + "grad_norm": 26.504303914889487, + "learning_rate": 4.1238985926004893e-07, + "loss": 1.5125, + "step": 13217 + }, + { + "epoch": 2.648897795591182, + "grad_norm": 19.699641132489976, + "learning_rate": 4.119263292090991e-07, + "loss": 1.4051, + "step": 13218 + }, + { + "epoch": 2.6490981963927855, + "grad_norm": 24.563318728256974, + "learning_rate": 4.1146304861848784e-07, + "loss": 1.4806, + "step": 13219 + }, + { + "epoch": 2.649298597194389, + "grad_norm": 23.29742315467343, + "learning_rate": 4.110000175134044e-07, + "loss": 1.5682, + "step": 13220 + }, + { + "epoch": 2.649498997995992, + "grad_norm": 26.214709387191636, + "learning_rate": 4.1053723591902185e-07, + "loss": 1.4166, + "step": 13221 + }, + { + "epoch": 2.649699398797595, + "grad_norm": 19.506454061099603, + "learning_rate": 4.1007470386050684e-07, + "loss": 1.3496, + "step": 13222 + }, + { + "epoch": 2.6498997995991984, + "grad_norm": 22.3810033854936, + "learning_rate": 4.096124213630043e-07, + "loss": 1.9061, + "step": 13223 + }, + { + "epoch": 2.650100200400802, + "grad_norm": 25.10789590622956, + "learning_rate": 4.0915038845164957e-07, + "loss": 1.512, + "step": 13224 + }, + { + "epoch": 2.650300601202405, + "grad_norm": 24.354059517613926, + "learning_rate": 4.086886051515665e-07, + "loss": 2.0164, + "step": 13225 + }, + { + "epoch": 2.650501002004008, + "grad_norm": 28.197775212352457, + "learning_rate": 4.0822707148786113e-07, + "loss": 1.9791, + "step": 13226 + }, + { + "epoch": 2.6507014028056113, + "grad_norm": 81.53047962052437, + "learning_rate": 4.0776578748562836e-07, + "loss": 1.5501, + "step": 13227 + }, + { + "epoch": 2.6509018036072143, + "grad_norm": 33.653299771760864, + "learning_rate": 4.07304753169947e-07, + "loss": 1.5065, + "step": 13228 + }, + { + "epoch": 2.6511022044088177, + "grad_norm": 22.67088764562177, + "learning_rate": 4.0684396856588593e-07, + "loss": 1.9736, + "step": 13229 + }, + { + "epoch": 2.6513026052104207, + "grad_norm": 26.256926744596814, + "learning_rate": 4.063834336985001e-07, + "loss": 1.7442, + "step": 13230 + }, + { + "epoch": 2.651503006012024, + "grad_norm": 27.228561748367646, + "learning_rate": 4.0592314859282556e-07, + "loss": 1.3131, + "step": 13231 + }, + { + "epoch": 2.651703406813627, + "grad_norm": 20.803882862066516, + "learning_rate": 4.0546311327389286e-07, + "loss": 1.5168, + "step": 13232 + }, + { + "epoch": 2.6519038076152306, + "grad_norm": 20.632798187177826, + "learning_rate": 4.050033277667115e-07, + "loss": 1.4612, + "step": 13233 + }, + { + "epoch": 2.6521042084168336, + "grad_norm": 25.573778405919917, + "learning_rate": 4.045437920962825e-07, + "loss": 1.2235, + "step": 13234 + }, + { + "epoch": 2.652304609218437, + "grad_norm": 25.315601471330623, + "learning_rate": 4.0408450628759086e-07, + "loss": 1.3954, + "step": 13235 + }, + { + "epoch": 2.65250501002004, + "grad_norm": 26.46895294861009, + "learning_rate": 4.0362547036560896e-07, + "loss": 1.4253, + "step": 13236 + }, + { + "epoch": 2.6527054108216435, + "grad_norm": 22.200831112528498, + "learning_rate": 4.0316668435529614e-07, + "loss": 1.6594, + "step": 13237 + }, + { + "epoch": 2.6529058116232465, + "grad_norm": 18.041545508134305, + "learning_rate": 4.027081482815942e-07, + "loss": 1.4655, 
+ "step": 13238 + }, + { + "epoch": 2.6531062124248495, + "grad_norm": 20.707213700715553, + "learning_rate": 4.0224986216943816e-07, + "loss": 1.8101, + "step": 13239 + }, + { + "epoch": 2.653306613226453, + "grad_norm": 22.4488817796839, + "learning_rate": 4.017918260437431e-07, + "loss": 1.3116, + "step": 13240 + }, + { + "epoch": 2.6535070140280563, + "grad_norm": 31.266975326235674, + "learning_rate": 4.013340399294141e-07, + "loss": 1.3188, + "step": 13241 + }, + { + "epoch": 2.6537074148296593, + "grad_norm": 18.903319372932124, + "learning_rate": 4.008765038513418e-07, + "loss": 1.3204, + "step": 13242 + }, + { + "epoch": 2.6539078156312623, + "grad_norm": 27.308017230303456, + "learning_rate": 4.0041921783440294e-07, + "loss": 1.6281, + "step": 13243 + }, + { + "epoch": 2.6541082164328658, + "grad_norm": 19.127551113387522, + "learning_rate": 3.999621819034616e-07, + "loss": 1.2497, + "step": 13244 + }, + { + "epoch": 2.654308617234469, + "grad_norm": 26.006625845453204, + "learning_rate": 3.99505396083365e-07, + "loss": 1.5301, + "step": 13245 + }, + { + "epoch": 2.654509018036072, + "grad_norm": 24.00035227168259, + "learning_rate": 3.990488603989523e-07, + "loss": 1.8307, + "step": 13246 + }, + { + "epoch": 2.654709418837675, + "grad_norm": 25.74548577501681, + "learning_rate": 3.985925748750441e-07, + "loss": 1.7629, + "step": 13247 + }, + { + "epoch": 2.6549098196392786, + "grad_norm": 21.75195416297776, + "learning_rate": 3.9813653953645004e-07, + "loss": 1.6217, + "step": 13248 + }, + { + "epoch": 2.6551102204408816, + "grad_norm": 22.834126507063843, + "learning_rate": 3.976807544079653e-07, + "loss": 1.0675, + "step": 13249 + }, + { + "epoch": 2.655310621242485, + "grad_norm": 20.270793782063453, + "learning_rate": 3.9722521951437165e-07, + "loss": 1.6232, + "step": 13250 + }, + { + "epoch": 2.655511022044088, + "grad_norm": 24.632413244069532, + "learning_rate": 3.967699348804371e-07, + "loss": 1.4964, + "step": 13251 + }, + { + "epoch": 2.6557114228456915, + "grad_norm": 22.363392734493647, + "learning_rate": 3.963149005309158e-07, + "loss": 1.6226, + "step": 13252 + }, + { + "epoch": 2.6559118236472945, + "grad_norm": 27.847373217114153, + "learning_rate": 3.9586011649055013e-07, + "loss": 1.6251, + "step": 13253 + }, + { + "epoch": 2.656112224448898, + "grad_norm": 23.23517646990439, + "learning_rate": 3.954055827840653e-07, + "loss": 1.6698, + "step": 13254 + }, + { + "epoch": 2.656312625250501, + "grad_norm": 18.551776734496368, + "learning_rate": 3.949512994361754e-07, + "loss": 0.8986, + "step": 13255 + }, + { + "epoch": 2.656513026052104, + "grad_norm": 19.890130139294687, + "learning_rate": 3.944972664715824e-07, + "loss": 1.6756, + "step": 13256 + }, + { + "epoch": 2.6567134268537074, + "grad_norm": 28.89074439792327, + "learning_rate": 3.9404348391497095e-07, + "loss": 1.8984, + "step": 13257 + }, + { + "epoch": 2.656913827655311, + "grad_norm": 14.395213945922716, + "learning_rate": 3.935899517910136e-07, + "loss": 1.246, + "step": 13258 + }, + { + "epoch": 2.657114228456914, + "grad_norm": 20.35730296401721, + "learning_rate": 3.931366701243705e-07, + "loss": 1.3299, + "step": 13259 + }, + { + "epoch": 2.657314629258517, + "grad_norm": 18.201954683732982, + "learning_rate": 3.926836389396876e-07, + "loss": 1.2731, + "step": 13260 + }, + { + "epoch": 2.6575150300601202, + "grad_norm": 13.93079604906657, + "learning_rate": 3.9223085826159633e-07, + "loss": 1.3582, + "step": 13261 + }, + { + "epoch": 2.6577154308617237, + "grad_norm": 16.843518642313406, + 
"learning_rate": 3.917783281147136e-07, + "loss": 1.7368, + "step": 13262 + }, + { + "epoch": 2.6579158316633267, + "grad_norm": 16.239916748839576, + "learning_rate": 3.9132604852364753e-07, + "loss": 1.2764, + "step": 13263 + }, + { + "epoch": 2.6581162324649297, + "grad_norm": 25.032430888647678, + "learning_rate": 3.908740195129862e-07, + "loss": 1.1173, + "step": 13264 + }, + { + "epoch": 2.658316633266533, + "grad_norm": 17.08351713275621, + "learning_rate": 3.9042224110730775e-07, + "loss": 1.4515, + "step": 13265 + }, + { + "epoch": 2.658517034068136, + "grad_norm": 19.275957981584252, + "learning_rate": 3.89970713331177e-07, + "loss": 1.4949, + "step": 13266 + }, + { + "epoch": 2.6587174348697395, + "grad_norm": 18.159702353224272, + "learning_rate": 3.895194362091431e-07, + "loss": 1.431, + "step": 13267 + }, + { + "epoch": 2.6589178356713425, + "grad_norm": 18.11963990858681, + "learning_rate": 3.890684097657432e-07, + "loss": 1.6463, + "step": 13268 + }, + { + "epoch": 2.659118236472946, + "grad_norm": 25.91486077037459, + "learning_rate": 3.8861763402550044e-07, + "loss": 1.4023, + "step": 13269 + }, + { + "epoch": 2.659318637274549, + "grad_norm": 19.714575091482274, + "learning_rate": 3.8816710901292467e-07, + "loss": 1.3593, + "step": 13270 + }, + { + "epoch": 2.6595190380761524, + "grad_norm": 20.08526512966797, + "learning_rate": 3.877168347525101e-07, + "loss": 1.401, + "step": 13271 + }, + { + "epoch": 2.6597194388777554, + "grad_norm": 23.589237232316783, + "learning_rate": 3.8726681126873944e-07, + "loss": 1.6064, + "step": 13272 + }, + { + "epoch": 2.659919839679359, + "grad_norm": 32.046387741991936, + "learning_rate": 3.8681703858608144e-07, + "loss": 1.1539, + "step": 13273 + }, + { + "epoch": 2.660120240480962, + "grad_norm": 22.386108722444096, + "learning_rate": 3.863675167289904e-07, + "loss": 1.1789, + "step": 13274 + }, + { + "epoch": 2.6603206412825653, + "grad_norm": 21.68030941001628, + "learning_rate": 3.859182457219085e-07, + "loss": 1.5351, + "step": 13275 + }, + { + "epoch": 2.6605210420841683, + "grad_norm": 31.10141932178783, + "learning_rate": 3.854692255892617e-07, + "loss": 2.0544, + "step": 13276 + }, + { + "epoch": 2.6607214428857713, + "grad_norm": 32.97985671427481, + "learning_rate": 3.85020456355466e-07, + "loss": 1.4189, + "step": 13277 + }, + { + "epoch": 2.6609218436873747, + "grad_norm": 22.59237853114182, + "learning_rate": 3.845719380449192e-07, + "loss": 1.5724, + "step": 13278 + }, + { + "epoch": 2.661122244488978, + "grad_norm": 23.754218483274293, + "learning_rate": 3.841236706820095e-07, + "loss": 1.636, + "step": 13279 + }, + { + "epoch": 2.661322645290581, + "grad_norm": 39.892172840060724, + "learning_rate": 3.836756542911102e-07, + "loss": 1.5095, + "step": 13280 + }, + { + "epoch": 2.661523046092184, + "grad_norm": 29.813719280710146, + "learning_rate": 3.832278888965796e-07, + "loss": 1.0853, + "step": 13281 + }, + { + "epoch": 2.6617234468937876, + "grad_norm": 26.56202331115643, + "learning_rate": 3.827803745227632e-07, + "loss": 1.5425, + "step": 13282 + }, + { + "epoch": 2.661923847695391, + "grad_norm": 27.470619582923824, + "learning_rate": 3.8233311119399375e-07, + "loss": 1.3126, + "step": 13283 + }, + { + "epoch": 2.662124248496994, + "grad_norm": 21.899294717652374, + "learning_rate": 3.8188609893458907e-07, + "loss": 1.5425, + "step": 13284 + }, + { + "epoch": 2.662324649298597, + "grad_norm": 24.023417745155122, + "learning_rate": 3.814393377688541e-07, + "loss": 1.8035, + "step": 13285 + }, + { + "epoch": 
2.6625250501002005, + "grad_norm": 23.85006697005915, + "learning_rate": 3.8099282772108006e-07, + "loss": 2.0625, + "step": 13286 + }, + { + "epoch": 2.6627254509018035, + "grad_norm": 34.824676202832194, + "learning_rate": 3.805465688155452e-07, + "loss": 1.3449, + "step": 13287 + }, + { + "epoch": 2.662925851703407, + "grad_norm": 17.298574075210926, + "learning_rate": 3.8010056107651184e-07, + "loss": 1.9781, + "step": 13288 + }, + { + "epoch": 2.66312625250501, + "grad_norm": 21.355182577029233, + "learning_rate": 3.7965480452823e-07, + "loss": 1.6568, + "step": 13289 + }, + { + "epoch": 2.6633266533066133, + "grad_norm": 20.27024266841736, + "learning_rate": 3.79209299194937e-07, + "loss": 1.5966, + "step": 13290 + }, + { + "epoch": 2.6635270541082163, + "grad_norm": 33.21699920410041, + "learning_rate": 3.787640451008551e-07, + "loss": 1.6834, + "step": 13291 + }, + { + "epoch": 2.6637274549098198, + "grad_norm": 23.41742067726432, + "learning_rate": 3.783190422701938e-07, + "loss": 1.3651, + "step": 13292 + }, + { + "epoch": 2.6639278557114228, + "grad_norm": 18.44177470638752, + "learning_rate": 3.778742907271482e-07, + "loss": 1.5899, + "step": 13293 + }, + { + "epoch": 2.664128256513026, + "grad_norm": 21.047464891453984, + "learning_rate": 3.774297904959012e-07, + "loss": 1.3502, + "step": 13294 + }, + { + "epoch": 2.664328657314629, + "grad_norm": 48.422640108189114, + "learning_rate": 3.7698554160061853e-07, + "loss": 1.8373, + "step": 13295 + }, + { + "epoch": 2.6645290581162326, + "grad_norm": 25.508766823898178, + "learning_rate": 3.7654154406545796e-07, + "loss": 1.9236, + "step": 13296 + }, + { + "epoch": 2.6647294589178356, + "grad_norm": 28.051291807073163, + "learning_rate": 3.7609779791455745e-07, + "loss": 1.1942, + "step": 13297 + }, + { + "epoch": 2.6649298597194386, + "grad_norm": 39.238761061302476, + "learning_rate": 3.75654303172045e-07, + "loss": 1.3873, + "step": 13298 + }, + { + "epoch": 2.665130260521042, + "grad_norm": 28.04287459768806, + "learning_rate": 3.7521105986203455e-07, + "loss": 2.1827, + "step": 13299 + }, + { + "epoch": 2.6653306613226455, + "grad_norm": 16.474378502608715, + "learning_rate": 3.7476806800862517e-07, + "loss": 1.4146, + "step": 13300 + }, + { + "epoch": 2.6655310621242485, + "grad_norm": 19.836647283045025, + "learning_rate": 3.743253276359049e-07, + "loss": 1.7034, + "step": 13301 + }, + { + "epoch": 2.6657314629258515, + "grad_norm": 32.290053832795124, + "learning_rate": 3.738828387679422e-07, + "loss": 1.399, + "step": 13302 + }, + { + "epoch": 2.665931863727455, + "grad_norm": 17.050957962928347, + "learning_rate": 3.734406014288006e-07, + "loss": 1.3048, + "step": 13303 + }, + { + "epoch": 2.6661322645290584, + "grad_norm": 27.293321931636807, + "learning_rate": 3.7299861564252195e-07, + "loss": 2.0418, + "step": 13304 + }, + { + "epoch": 2.6663326653306614, + "grad_norm": 12.95581978854343, + "learning_rate": 3.725568814331387e-07, + "loss": 0.9463, + "step": 13305 + }, + { + "epoch": 2.6665330661322644, + "grad_norm": 22.796931839275086, + "learning_rate": 3.7211539882466843e-07, + "loss": 1.2534, + "step": 13306 + }, + { + "epoch": 2.666733466933868, + "grad_norm": 32.90702900259826, + "learning_rate": 3.716741678411151e-07, + "loss": 1.4476, + "step": 13307 + }, + { + "epoch": 2.666933867735471, + "grad_norm": 41.28878067735865, + "learning_rate": 3.7123318850647074e-07, + "loss": 1.8048, + "step": 13308 + }, + { + "epoch": 2.6671342685370742, + "grad_norm": 30.16139822802871, + "learning_rate": 
3.7079246084470835e-07, + "loss": 1.9012, + "step": 13309 + }, + { + "epoch": 2.6673346693386772, + "grad_norm": 17.839174166760984, + "learning_rate": 3.703519848797943e-07, + "loss": 1.2752, + "step": 13310 + }, + { + "epoch": 2.6675350701402807, + "grad_norm": 19.319493179070367, + "learning_rate": 3.6991176063567725e-07, + "loss": 1.4488, + "step": 13311 + }, + { + "epoch": 2.6677354709418837, + "grad_norm": 23.008655681836725, + "learning_rate": 3.6947178813629137e-07, + "loss": 2.0027, + "step": 13312 + }, + { + "epoch": 2.667935871743487, + "grad_norm": 24.0944895754711, + "learning_rate": 3.6903206740556085e-07, + "loss": 1.2487, + "step": 13313 + }, + { + "epoch": 2.66813627254509, + "grad_norm": 19.289961513179826, + "learning_rate": 3.6859259846739214e-07, + "loss": 1.3779, + "step": 13314 + }, + { + "epoch": 2.668336673346693, + "grad_norm": 27.29829446088027, + "learning_rate": 3.6815338134568056e-07, + "loss": 1.4654, + "step": 13315 + }, + { + "epoch": 2.6685370741482966, + "grad_norm": 21.27942367996848, + "learning_rate": 3.6771441606430646e-07, + "loss": 1.4133, + "step": 13316 + }, + { + "epoch": 2.6687374749499, + "grad_norm": 17.31920686050594, + "learning_rate": 3.6727570264713795e-07, + "loss": 1.5939, + "step": 13317 + }, + { + "epoch": 2.668937875751503, + "grad_norm": 21.147412413342867, + "learning_rate": 3.6683724111802877e-07, + "loss": 1.6476, + "step": 13318 + }, + { + "epoch": 2.669138276553106, + "grad_norm": 19.017324286156462, + "learning_rate": 3.663990315008159e-07, + "loss": 1.4549, + "step": 13319 + }, + { + "epoch": 2.6693386773547094, + "grad_norm": 20.73106616753136, + "learning_rate": 3.659610738193298e-07, + "loss": 1.555, + "step": 13320 + }, + { + "epoch": 2.669539078156313, + "grad_norm": 20.79200372707177, + "learning_rate": 3.655233680973791e-07, + "loss": 1.2913, + "step": 13321 + }, + { + "epoch": 2.669739478957916, + "grad_norm": 23.02105620938993, + "learning_rate": 3.650859143587643e-07, + "loss": 1.117, + "step": 13322 + }, + { + "epoch": 2.669939879759519, + "grad_norm": 19.40492043981211, + "learning_rate": 3.646487126272702e-07, + "loss": 1.2866, + "step": 13323 + }, + { + "epoch": 2.6701402805611223, + "grad_norm": 20.7437502718066, + "learning_rate": 3.642117629266678e-07, + "loss": 1.9163, + "step": 13324 + }, + { + "epoch": 2.6703406813627253, + "grad_norm": 18.019374135587945, + "learning_rate": 3.637750652807154e-07, + "loss": 1.4294, + "step": 13325 + }, + { + "epoch": 2.6705410821643287, + "grad_norm": 17.539962671175864, + "learning_rate": 3.633386197131555e-07, + "loss": 1.3211, + "step": 13326 + }, + { + "epoch": 2.6707414829659317, + "grad_norm": 28.761946641865144, + "learning_rate": 3.629024262477204e-07, + "loss": 2.0674, + "step": 13327 + }, + { + "epoch": 2.670941883767535, + "grad_norm": 21.18143607427241, + "learning_rate": 3.624664849081244e-07, + "loss": 1.4226, + "step": 13328 + }, + { + "epoch": 2.671142284569138, + "grad_norm": 35.42619277163153, + "learning_rate": 3.620307957180713e-07, + "loss": 1.7434, + "step": 13329 + }, + { + "epoch": 2.6713426853707416, + "grad_norm": 21.782518635252018, + "learning_rate": 3.6159535870125053e-07, + "loss": 1.2158, + "step": 13330 + }, + { + "epoch": 2.6715430861723446, + "grad_norm": 26.29489370139156, + "learning_rate": 3.6116017388133697e-07, + "loss": 1.7515, + "step": 13331 + }, + { + "epoch": 2.671743486973948, + "grad_norm": 28.77235473345872, + "learning_rate": 3.607252412819928e-07, + "loss": 1.6938, + "step": 13332 + }, + { + "epoch": 2.671943887775551, + 
"grad_norm": 20.253042361702786, + "learning_rate": 3.6029056092686366e-07, + "loss": 1.4836, + "step": 13333 + }, + { + "epoch": 2.6721442885771545, + "grad_norm": 21.275103236587185, + "learning_rate": 3.5985613283958773e-07, + "loss": 1.753, + "step": 13334 + }, + { + "epoch": 2.6723446893787575, + "grad_norm": 16.036047793687068, + "learning_rate": 3.5942195704378237e-07, + "loss": 1.1524, + "step": 13335 + }, + { + "epoch": 2.6725450901803605, + "grad_norm": 23.634696373172567, + "learning_rate": 3.589880335630541e-07, + "loss": 1.5448, + "step": 13336 + }, + { + "epoch": 2.672745490981964, + "grad_norm": 26.075507120202328, + "learning_rate": 3.585543624209992e-07, + "loss": 1.7047, + "step": 13337 + }, + { + "epoch": 2.6729458917835673, + "grad_norm": 15.91216632337745, + "learning_rate": 3.581209436411942e-07, + "loss": 1.2681, + "step": 13338 + }, + { + "epoch": 2.6731462925851703, + "grad_norm": 25.969837415995773, + "learning_rate": 3.576877772472054e-07, + "loss": 1.3864, + "step": 13339 + }, + { + "epoch": 2.6733466933867733, + "grad_norm": 26.144561097689387, + "learning_rate": 3.5725486326258494e-07, + "loss": 1.7645, + "step": 13340 + }, + { + "epoch": 2.6735470941883768, + "grad_norm": 32.92354943729935, + "learning_rate": 3.568222017108708e-07, + "loss": 1.6465, + "step": 13341 + }, + { + "epoch": 2.67374749498998, + "grad_norm": 19.211264542612327, + "learning_rate": 3.5638979261558846e-07, + "loss": 1.6521, + "step": 13342 + }, + { + "epoch": 2.673947895791583, + "grad_norm": 34.22235301487525, + "learning_rate": 3.559576360002459e-07, + "loss": 1.6033, + "step": 13343 + }, + { + "epoch": 2.674148296593186, + "grad_norm": 41.80995772003633, + "learning_rate": 3.5552573188834373e-07, + "loss": 1.4426, + "step": 13344 + }, + { + "epoch": 2.6743486973947896, + "grad_norm": 19.561661219348025, + "learning_rate": 3.550940803033626e-07, + "loss": 1.5083, + "step": 13345 + }, + { + "epoch": 2.6745490981963926, + "grad_norm": 22.67104281903241, + "learning_rate": 3.5466268126877257e-07, + "loss": 1.6074, + "step": 13346 + }, + { + "epoch": 2.674749498997996, + "grad_norm": 20.48462943667284, + "learning_rate": 3.5423153480802996e-07, + "loss": 0.77, + "step": 13347 + }, + { + "epoch": 2.674949899799599, + "grad_norm": 16.877028932229745, + "learning_rate": 3.53800640944576e-07, + "loss": 1.4526, + "step": 13348 + }, + { + "epoch": 2.6751503006012025, + "grad_norm": 17.386739016300968, + "learning_rate": 3.533699997018414e-07, + "loss": 1.0119, + "step": 13349 + }, + { + "epoch": 2.6753507014028055, + "grad_norm": 18.8736212491151, + "learning_rate": 3.529396111032368e-07, + "loss": 1.659, + "step": 13350 + }, + { + "epoch": 2.675551102204409, + "grad_norm": 17.451552351330946, + "learning_rate": 3.525094751721664e-07, + "loss": 1.5361, + "step": 13351 + }, + { + "epoch": 2.675751503006012, + "grad_norm": 25.65391024930285, + "learning_rate": 3.520795919320158e-07, + "loss": 1.4171, + "step": 13352 + }, + { + "epoch": 2.6759519038076154, + "grad_norm": 24.220386866585407, + "learning_rate": 3.5164996140615915e-07, + "loss": 1.8529, + "step": 13353 + }, + { + "epoch": 2.6761523046092184, + "grad_norm": 23.899914285445142, + "learning_rate": 3.51220583617955e-07, + "loss": 1.6856, + "step": 13354 + }, + { + "epoch": 2.676352705410822, + "grad_norm": 50.2352448030462, + "learning_rate": 3.507914585907507e-07, + "loss": 1.516, + "step": 13355 + }, + { + "epoch": 2.676553106212425, + "grad_norm": 26.59699219486976, + "learning_rate": 3.503625863478771e-07, + "loss": 0.9524, + "step": 
13356 + }, + { + "epoch": 2.676753507014028, + "grad_norm": 19.11079253119622, + "learning_rate": 3.499339669126534e-07, + "loss": 1.1996, + "step": 13357 + }, + { + "epoch": 2.6769539078156313, + "grad_norm": 24.28572355197748, + "learning_rate": 3.4950560030838467e-07, + "loss": 1.8832, + "step": 13358 + }, + { + "epoch": 2.6771543086172347, + "grad_norm": 17.569310664714717, + "learning_rate": 3.4907748655836013e-07, + "loss": 1.518, + "step": 13359 + }, + { + "epoch": 2.6773547094188377, + "grad_norm": 20.590411477925823, + "learning_rate": 3.4864962568585846e-07, + "loss": 1.6657, + "step": 13360 + }, + { + "epoch": 2.6775551102204407, + "grad_norm": 23.81751064221898, + "learning_rate": 3.482220177141427e-07, + "loss": 1.9033, + "step": 13361 + }, + { + "epoch": 2.677755511022044, + "grad_norm": 18.326851537563005, + "learning_rate": 3.4779466266646245e-07, + "loss": 1.2728, + "step": 13362 + }, + { + "epoch": 2.6779559118236476, + "grad_norm": 38.47530385836263, + "learning_rate": 3.473675605660537e-07, + "loss": 1.7134, + "step": 13363 + }, + { + "epoch": 2.6781563126252506, + "grad_norm": 54.88460576567306, + "learning_rate": 3.4694071143613837e-07, + "loss": 1.7732, + "step": 13364 + }, + { + "epoch": 2.6783567134268536, + "grad_norm": 21.6281851126439, + "learning_rate": 3.465141152999263e-07, + "loss": 1.3027, + "step": 13365 + }, + { + "epoch": 2.678557114228457, + "grad_norm": 20.000034564556913, + "learning_rate": 3.4608777218060883e-07, + "loss": 1.5469, + "step": 13366 + }, + { + "epoch": 2.67875751503006, + "grad_norm": 22.29184621823339, + "learning_rate": 3.456616821013703e-07, + "loss": 2.068, + "step": 13367 + }, + { + "epoch": 2.6789579158316634, + "grad_norm": 20.257070695465703, + "learning_rate": 3.452358450853771e-07, + "loss": 1.3722, + "step": 13368 + }, + { + "epoch": 2.6791583166332664, + "grad_norm": 55.1123201290329, + "learning_rate": 3.448102611557813e-07, + "loss": 1.4597, + "step": 13369 + }, + { + "epoch": 2.67935871743487, + "grad_norm": 78.08211857998181, + "learning_rate": 3.4438493033572326e-07, + "loss": 1.7738, + "step": 13370 + }, + { + "epoch": 2.679559118236473, + "grad_norm": 23.852619315238943, + "learning_rate": 3.43959852648329e-07, + "loss": 1.2105, + "step": 13371 + }, + { + "epoch": 2.6797595190380763, + "grad_norm": 33.19609614378954, + "learning_rate": 3.435350281167099e-07, + "loss": 1.1637, + "step": 13372 + }, + { + "epoch": 2.6799599198396793, + "grad_norm": 21.636193919693685, + "learning_rate": 3.4311045676396535e-07, + "loss": 1.5853, + "step": 13373 + }, + { + "epoch": 2.6801603206412823, + "grad_norm": 18.665260543578487, + "learning_rate": 3.4268613861317967e-07, + "loss": 1.3825, + "step": 13374 + }, + { + "epoch": 2.6803607214428857, + "grad_norm": 23.50482808209428, + "learning_rate": 3.4226207368742436e-07, + "loss": 1.3579, + "step": 13375 + }, + { + "epoch": 2.680561122244489, + "grad_norm": 23.476896604343633, + "learning_rate": 3.418382620097543e-07, + "loss": 1.5559, + "step": 13376 + }, + { + "epoch": 2.680761523046092, + "grad_norm": 29.15448420856185, + "learning_rate": 3.4141470360321393e-07, + "loss": 1.8273, + "step": 13377 + }, + { + "epoch": 2.680961923847695, + "grad_norm": 26.175810743871562, + "learning_rate": 3.4099139849083307e-07, + "loss": 2.1188, + "step": 13378 + }, + { + "epoch": 2.6811623246492986, + "grad_norm": 20.62452525546753, + "learning_rate": 3.405683466956272e-07, + "loss": 1.4031, + "step": 13379 + }, + { + "epoch": 2.681362725450902, + "grad_norm": 23.077698589634256, + "learning_rate": 
3.401455482405985e-07, + "loss": 1.6047, + "step": 13380 + }, + { + "epoch": 2.681563126252505, + "grad_norm": 19.103676507473445, + "learning_rate": 3.3972300314873477e-07, + "loss": 1.7345, + "step": 13381 + }, + { + "epoch": 2.681763527054108, + "grad_norm": 19.73022915656008, + "learning_rate": 3.3930071144301137e-07, + "loss": 1.7092, + "step": 13382 + }, + { + "epoch": 2.6819639278557115, + "grad_norm": 15.667074518511592, + "learning_rate": 3.388786731463861e-07, + "loss": 1.8081, + "step": 13383 + }, + { + "epoch": 2.6821643286573145, + "grad_norm": 27.310103268663436, + "learning_rate": 3.384568882818101e-07, + "loss": 1.7511, + "step": 13384 + }, + { + "epoch": 2.682364729458918, + "grad_norm": 22.440563934632507, + "learning_rate": 3.3803535687221277e-07, + "loss": 1.8557, + "step": 13385 + }, + { + "epoch": 2.682565130260521, + "grad_norm": 19.443890232403493, + "learning_rate": 3.3761407894051525e-07, + "loss": 1.8538, + "step": 13386 + }, + { + "epoch": 2.6827655310621243, + "grad_norm": 35.59980502465761, + "learning_rate": 3.371930545096219e-07, + "loss": 1.54, + "step": 13387 + }, + { + "epoch": 2.6829659318637273, + "grad_norm": 33.278011571934805, + "learning_rate": 3.3677228360242565e-07, + "loss": 1.477, + "step": 13388 + }, + { + "epoch": 2.6831663326653308, + "grad_norm": 24.559005888961938, + "learning_rate": 3.363517662418048e-07, + "loss": 1.3405, + "step": 13389 + }, + { + "epoch": 2.6833667334669338, + "grad_norm": 19.461778036123707, + "learning_rate": 3.35931502450621e-07, + "loss": 1.7466, + "step": 13390 + }, + { + "epoch": 2.683567134268537, + "grad_norm": 22.68857414819403, + "learning_rate": 3.3551149225172663e-07, + "loss": 1.8188, + "step": 13391 + }, + { + "epoch": 2.68376753507014, + "grad_norm": 28.191779235671437, + "learning_rate": 3.350917356679589e-07, + "loss": 1.6185, + "step": 13392 + }, + { + "epoch": 2.6839679358717436, + "grad_norm": 36.101364951002616, + "learning_rate": 3.3467223272213854e-07, + "loss": 1.7988, + "step": 13393 + }, + { + "epoch": 2.6841683366733466, + "grad_norm": 20.47649780045203, + "learning_rate": 3.3425298343707555e-07, + "loss": 1.4131, + "step": 13394 + }, + { + "epoch": 2.6843687374749496, + "grad_norm": 19.63894351843068, + "learning_rate": 3.338339878355651e-07, + "loss": 1.6783, + "step": 13395 + }, + { + "epoch": 2.684569138276553, + "grad_norm": 22.379821160219862, + "learning_rate": 3.334152459403889e-07, + "loss": 1.0355, + "step": 13396 + }, + { + "epoch": 2.6847695390781565, + "grad_norm": 21.695086070433405, + "learning_rate": 3.329967577743143e-07, + "loss": 1.6187, + "step": 13397 + }, + { + "epoch": 2.6849699398797595, + "grad_norm": 25.021500350402594, + "learning_rate": 3.325785233600948e-07, + "loss": 1.6858, + "step": 13398 + }, + { + "epoch": 2.6851703406813625, + "grad_norm": 14.997733295839499, + "learning_rate": 3.321605427204716e-07, + "loss": 1.0671, + "step": 13399 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 24.384266343604736, + "learning_rate": 3.3174281587816826e-07, + "loss": 1.4707, + "step": 13400 + }, + { + "epoch": 2.6855711422845694, + "grad_norm": 14.597783735153987, + "learning_rate": 3.31325342855901e-07, + "loss": 1.3259, + "step": 13401 + }, + { + "epoch": 2.6857715430861724, + "grad_norm": 24.754582334779254, + "learning_rate": 3.309081236763656e-07, + "loss": 2.122, + "step": 13402 + }, + { + "epoch": 2.6859719438877754, + "grad_norm": 17.203153448580686, + "learning_rate": 3.304911583622478e-07, + "loss": 1.3275, + "step": 13403 + }, + { + "epoch": 
2.686172344689379, + "grad_norm": 25.924389605260643, + "learning_rate": 3.300744469362183e-07, + "loss": 1.5124, + "step": 13404 + }, + { + "epoch": 2.686372745490982, + "grad_norm": 25.916043097197722, + "learning_rate": 3.296579894209345e-07, + "loss": 1.2067, + "step": 13405 + }, + { + "epoch": 2.6865731462925853, + "grad_norm": 21.89987955449998, + "learning_rate": 3.2924178583904063e-07, + "loss": 1.5321, + "step": 13406 + }, + { + "epoch": 2.6867735470941883, + "grad_norm": 56.45483949055993, + "learning_rate": 3.288258362131647e-07, + "loss": 2.1709, + "step": 13407 + }, + { + "epoch": 2.6869739478957917, + "grad_norm": 28.618551344378012, + "learning_rate": 3.2841014056592404e-07, + "loss": 1.48, + "step": 13408 + }, + { + "epoch": 2.6871743486973947, + "grad_norm": 21.304767525473867, + "learning_rate": 3.279946989199195e-07, + "loss": 1.3896, + "step": 13409 + }, + { + "epoch": 2.687374749498998, + "grad_norm": 27.69342357280389, + "learning_rate": 3.275795112977398e-07, + "loss": 1.6275, + "step": 13410 + }, + { + "epoch": 2.687575150300601, + "grad_norm": 22.146931312436518, + "learning_rate": 3.2716457772195896e-07, + "loss": 1.5335, + "step": 13411 + }, + { + "epoch": 2.6877755511022046, + "grad_norm": 20.358401253641745, + "learning_rate": 3.2674989821513794e-07, + "loss": 1.5342, + "step": 13412 + }, + { + "epoch": 2.6879759519038076, + "grad_norm": 28.798129153992587, + "learning_rate": 3.2633547279982413e-07, + "loss": 1.1594, + "step": 13413 + }, + { + "epoch": 2.688176352705411, + "grad_norm": 51.171578513111044, + "learning_rate": 3.259213014985474e-07, + "loss": 1.6992, + "step": 13414 + }, + { + "epoch": 2.688376753507014, + "grad_norm": 18.8107912338184, + "learning_rate": 3.2550738433383135e-07, + "loss": 1.5076, + "step": 13415 + }, + { + "epoch": 2.688577154308617, + "grad_norm": 19.067245412573243, + "learning_rate": 3.2509372132817795e-07, + "loss": 1.7454, + "step": 13416 + }, + { + "epoch": 2.6887775551102204, + "grad_norm": 29.397542565330102, + "learning_rate": 3.246803125040793e-07, + "loss": 1.9728, + "step": 13417 + }, + { + "epoch": 2.688977955911824, + "grad_norm": 28.253342599981803, + "learning_rate": 3.2426715788401455e-07, + "loss": 1.7441, + "step": 13418 + }, + { + "epoch": 2.689178356713427, + "grad_norm": 20.436241331908775, + "learning_rate": 3.2385425749044575e-07, + "loss": 1.4595, + "step": 13419 + }, + { + "epoch": 2.68937875751503, + "grad_norm": 17.810057555951094, + "learning_rate": 3.23441611345825e-07, + "loss": 1.3966, + "step": 13420 + }, + { + "epoch": 2.6895791583166333, + "grad_norm": 30.741629264833573, + "learning_rate": 3.230292194725848e-07, + "loss": 0.9614, + "step": 13421 + }, + { + "epoch": 2.6897795591182367, + "grad_norm": 31.12949225005845, + "learning_rate": 3.226170818931507e-07, + "loss": 1.6955, + "step": 13422 + }, + { + "epoch": 2.6899799599198397, + "grad_norm": 25.178478292326787, + "learning_rate": 3.222051986299313e-07, + "loss": 1.5105, + "step": 13423 + }, + { + "epoch": 2.6901803607214427, + "grad_norm": 19.959611771939002, + "learning_rate": 3.2179356970531814e-07, + "loss": 1.3404, + "step": 13424 + }, + { + "epoch": 2.690380761523046, + "grad_norm": 16.46378054047734, + "learning_rate": 3.2138219514169676e-07, + "loss": 1.4247, + "step": 13425 + }, + { + "epoch": 2.690581162324649, + "grad_norm": 23.051737125198642, + "learning_rate": 3.2097107496143033e-07, + "loss": 1.826, + "step": 13426 + }, + { + "epoch": 2.6907815631262526, + "grad_norm": 23.588092671154197, + "learning_rate": 
3.205602091868737e-07, + "loss": 1.6194, + "step": 13427 + }, + { + "epoch": 2.6909819639278556, + "grad_norm": 59.10640726121962, + "learning_rate": 3.201495978403657e-07, + "loss": 2.0372, + "step": 13428 + }, + { + "epoch": 2.691182364729459, + "grad_norm": 22.347549538513245, + "learning_rate": 3.197392409442324e-07, + "loss": 1.3851, + "step": 13429 + }, + { + "epoch": 2.691382765531062, + "grad_norm": 27.329141359479696, + "learning_rate": 3.1932913852078586e-07, + "loss": 1.5011, + "step": 13430 + }, + { + "epoch": 2.6915831663326655, + "grad_norm": 27.21661349241368, + "learning_rate": 3.1891929059232163e-07, + "loss": 1.5348, + "step": 13431 + }, + { + "epoch": 2.6917835671342685, + "grad_norm": 56.61808619119629, + "learning_rate": 3.1850969718112743e-07, + "loss": 1.9409, + "step": 13432 + }, + { + "epoch": 2.6919839679358715, + "grad_norm": 23.375093715153984, + "learning_rate": 3.181003583094705e-07, + "loss": 1.3406, + "step": 13433 + }, + { + "epoch": 2.692184368737475, + "grad_norm": 16.517779301066945, + "learning_rate": 3.176912739996085e-07, + "loss": 1.6836, + "step": 13434 + }, + { + "epoch": 2.6923847695390783, + "grad_norm": 34.10136960303594, + "learning_rate": 3.172824442737832e-07, + "loss": 1.8724, + "step": 13435 + }, + { + "epoch": 2.6925851703406813, + "grad_norm": 20.46873688306146, + "learning_rate": 3.168738691542239e-07, + "loss": 1.6231, + "step": 13436 + }, + { + "epoch": 2.6927855711422843, + "grad_norm": 22.816551126473733, + "learning_rate": 3.164655486631457e-07, + "loss": 1.442, + "step": 13437 + }, + { + "epoch": 2.692985971943888, + "grad_norm": 36.3481255455128, + "learning_rate": 3.1605748282274863e-07, + "loss": 1.7801, + "step": 13438 + }, + { + "epoch": 2.693186372745491, + "grad_norm": 18.444082450185167, + "learning_rate": 3.156496716552215e-07, + "loss": 1.3075, + "step": 13439 + }, + { + "epoch": 2.693386773547094, + "grad_norm": 38.403782687403606, + "learning_rate": 3.152421151827351e-07, + "loss": 1.5008, + "step": 13440 + }, + { + "epoch": 2.693587174348697, + "grad_norm": 62.001573002917524, + "learning_rate": 3.1483481342745095e-07, + "loss": 1.3629, + "step": 13441 + }, + { + "epoch": 2.6937875751503007, + "grad_norm": 17.52523303359239, + "learning_rate": 3.1442776641151363e-07, + "loss": 1.2815, + "step": 13442 + }, + { + "epoch": 2.6939879759519036, + "grad_norm": 21.395100546949198, + "learning_rate": 3.140209741570549e-07, + "loss": 1.7229, + "step": 13443 + }, + { + "epoch": 2.694188376753507, + "grad_norm": 23.27364624927601, + "learning_rate": 3.1361443668619315e-07, + "loss": 1.5661, + "step": 13444 + }, + { + "epoch": 2.69438877755511, + "grad_norm": 22.93510584563478, + "learning_rate": 3.132081540210324e-07, + "loss": 1.3677, + "step": 13445 + }, + { + "epoch": 2.6945891783567135, + "grad_norm": 25.52793196511341, + "learning_rate": 3.1280212618366335e-07, + "loss": 1.9232, + "step": 13446 + }, + { + "epoch": 2.6947895791583165, + "grad_norm": 29.57488906628641, + "learning_rate": 3.1239635319616047e-07, + "loss": 1.2717, + "step": 13447 + }, + { + "epoch": 2.69498997995992, + "grad_norm": 20.222255605039305, + "learning_rate": 3.1199083508058723e-07, + "loss": 1.1911, + "step": 13448 + }, + { + "epoch": 2.695190380761523, + "grad_norm": 32.13306471334469, + "learning_rate": 3.115855718589933e-07, + "loss": 1.5015, + "step": 13449 + }, + { + "epoch": 2.6953907815631264, + "grad_norm": 26.597585676347222, + "learning_rate": 3.111805635534126e-07, + "loss": 1.3045, + "step": 13450 + }, + { + "epoch": 2.6955911823647294, + 
"grad_norm": 18.25935886369751, + "learning_rate": 3.107758101858654e-07, + "loss": 1.4999, + "step": 13451 + }, + { + "epoch": 2.695791583166333, + "grad_norm": 25.762708397392103, + "learning_rate": 3.103713117783597e-07, + "loss": 1.7816, + "step": 13452 + }, + { + "epoch": 2.695991983967936, + "grad_norm": 18.475317832737773, + "learning_rate": 3.099670683528877e-07, + "loss": 1.6686, + "step": 13453 + }, + { + "epoch": 2.696192384769539, + "grad_norm": 26.87288105114868, + "learning_rate": 3.0956307993142975e-07, + "loss": 1.4304, + "step": 13454 + }, + { + "epoch": 2.6963927855711423, + "grad_norm": 20.702025621863, + "learning_rate": 3.0915934653595105e-07, + "loss": 1.5992, + "step": 13455 + }, + { + "epoch": 2.6965931863727457, + "grad_norm": 17.66633621125979, + "learning_rate": 3.0875586818840343e-07, + "loss": 1.25, + "step": 13456 + }, + { + "epoch": 2.6967935871743487, + "grad_norm": 19.75516954942937, + "learning_rate": 3.0835264491072383e-07, + "loss": 1.4347, + "step": 13457 + }, + { + "epoch": 2.6969939879759517, + "grad_norm": 17.616802006568232, + "learning_rate": 3.079496767248358e-07, + "loss": 1.5668, + "step": 13458 + }, + { + "epoch": 2.697194388777555, + "grad_norm": 22.473345393384278, + "learning_rate": 3.075469636526507e-07, + "loss": 1.6099, + "step": 13459 + }, + { + "epoch": 2.6973947895791586, + "grad_norm": 22.802245239755646, + "learning_rate": 3.0714450571606325e-07, + "loss": 1.4452, + "step": 13460 + }, + { + "epoch": 2.6975951903807616, + "grad_norm": 22.627099443330927, + "learning_rate": 3.067423029369565e-07, + "loss": 1.5689, + "step": 13461 + }, + { + "epoch": 2.6977955911823646, + "grad_norm": 22.27929934418228, + "learning_rate": 3.063403553371985e-07, + "loss": 1.4484, + "step": 13462 + }, + { + "epoch": 2.697995991983968, + "grad_norm": 18.420790651569583, + "learning_rate": 3.0593866293864513e-07, + "loss": 1.1833, + "step": 13463 + }, + { + "epoch": 2.698196392785571, + "grad_norm": 21.36097229271742, + "learning_rate": 3.055372257631345e-07, + "loss": 1.6334, + "step": 13464 + }, + { + "epoch": 2.6983967935871744, + "grad_norm": 25.87586489133198, + "learning_rate": 3.051360438324946e-07, + "loss": 1.7151, + "step": 13465 + }, + { + "epoch": 2.6985971943887774, + "grad_norm": 21.07312738723613, + "learning_rate": 3.047351171685381e-07, + "loss": 1.4284, + "step": 13466 + }, + { + "epoch": 2.698797595190381, + "grad_norm": 17.851291607552895, + "learning_rate": 3.043344457930647e-07, + "loss": 1.5359, + "step": 13467 + }, + { + "epoch": 2.698997995991984, + "grad_norm": 27.597709996142854, + "learning_rate": 3.039340297278581e-07, + "loss": 1.5961, + "step": 13468 + }, + { + "epoch": 2.6991983967935873, + "grad_norm": 18.266655303131166, + "learning_rate": 3.0353386899469093e-07, + "loss": 1.6691, + "step": 13469 + }, + { + "epoch": 2.6993987975951903, + "grad_norm": 21.402183940344973, + "learning_rate": 3.0313396361532075e-07, + "loss": 1.8044, + "step": 13470 + }, + { + "epoch": 2.6995991983967937, + "grad_norm": 24.754038225849662, + "learning_rate": 3.0273431361148853e-07, + "loss": 1.4413, + "step": 13471 + }, + { + "epoch": 2.6997995991983967, + "grad_norm": 18.71191351426695, + "learning_rate": 3.0233491900492686e-07, + "loss": 1.5619, + "step": 13472 + }, + { + "epoch": 2.7, + "grad_norm": 21.38029916568579, + "learning_rate": 3.019357798173495e-07, + "loss": 1.325, + "step": 13473 + }, + { + "epoch": 2.700200400801603, + "grad_norm": 22.560023300958758, + "learning_rate": 3.015368960704584e-07, + "loss": 1.5872, + "step": 13474 + 
}, + { + "epoch": 2.700400801603206, + "grad_norm": 20.919808216368946, + "learning_rate": 3.0113826778594204e-07, + "loss": 1.4331, + "step": 13475 + }, + { + "epoch": 2.7006012024048096, + "grad_norm": 22.360229591640582, + "learning_rate": 3.0073989498547394e-07, + "loss": 1.2994, + "step": 13476 + }, + { + "epoch": 2.700801603206413, + "grad_norm": 23.7100926600629, + "learning_rate": 3.003417776907158e-07, + "loss": 1.3867, + "step": 13477 + }, + { + "epoch": 2.701002004008016, + "grad_norm": 23.08215063720785, + "learning_rate": 2.9994391592331084e-07, + "loss": 1.6056, + "step": 13478 + }, + { + "epoch": 2.701202404809619, + "grad_norm": 20.95482449505422, + "learning_rate": 2.995463097048934e-07, + "loss": 1.5207, + "step": 13479 + }, + { + "epoch": 2.7014028056112225, + "grad_norm": 25.16017514975037, + "learning_rate": 2.9914895905708283e-07, + "loss": 1.6259, + "step": 13480 + }, + { + "epoch": 2.701603206412826, + "grad_norm": 22.97285091636577, + "learning_rate": 2.987518640014808e-07, + "loss": 1.6506, + "step": 13481 + }, + { + "epoch": 2.701803607214429, + "grad_norm": 34.3171215693869, + "learning_rate": 2.9835502455968113e-07, + "loss": 1.5709, + "step": 13482 + }, + { + "epoch": 2.702004008016032, + "grad_norm": 31.361466689273467, + "learning_rate": 2.9795844075325873e-07, + "loss": 1.81, + "step": 13483 + }, + { + "epoch": 2.7022044088176354, + "grad_norm": 23.794816428113673, + "learning_rate": 2.975621126037759e-07, + "loss": 1.4704, + "step": 13484 + }, + { + "epoch": 2.7024048096192383, + "grad_norm": 21.266031688520535, + "learning_rate": 2.9716604013278304e-07, + "loss": 1.6281, + "step": 13485 + }, + { + "epoch": 2.702605210420842, + "grad_norm": 26.903204405999013, + "learning_rate": 2.9677022336181414e-07, + "loss": 1.251, + "step": 13486 + }, + { + "epoch": 2.702805611222445, + "grad_norm": 21.51051464643269, + "learning_rate": 2.9637466231239197e-07, + "loss": 1.7968, + "step": 13487 + }, + { + "epoch": 2.703006012024048, + "grad_norm": 21.759943764525904, + "learning_rate": 2.95979357006021e-07, + "loss": 1.361, + "step": 13488 + }, + { + "epoch": 2.703206412825651, + "grad_norm": 17.075762404066538, + "learning_rate": 2.9558430746419784e-07, + "loss": 1.5508, + "step": 13489 + }, + { + "epoch": 2.7034068136272547, + "grad_norm": 24.00259742569721, + "learning_rate": 2.9518951370839935e-07, + "loss": 2.0588, + "step": 13490 + }, + { + "epoch": 2.7036072144288577, + "grad_norm": 19.07338172713236, + "learning_rate": 2.9479497576009153e-07, + "loss": 1.4198, + "step": 13491 + }, + { + "epoch": 2.7038076152304606, + "grad_norm": 35.1570942943264, + "learning_rate": 2.9440069364072676e-07, + "loss": 1.6069, + "step": 13492 + }, + { + "epoch": 2.704008016032064, + "grad_norm": 21.789606738994923, + "learning_rate": 2.9400666737174233e-07, + "loss": 1.7716, + "step": 13493 + }, + { + "epoch": 2.7042084168336675, + "grad_norm": 25.543918836690846, + "learning_rate": 2.936128969745627e-07, + "loss": 1.4538, + "step": 13494 + }, + { + "epoch": 2.7044088176352705, + "grad_norm": 22.908314411561456, + "learning_rate": 2.932193824705959e-07, + "loss": 2.0546, + "step": 13495 + }, + { + "epoch": 2.7046092184368735, + "grad_norm": 24.86051501191174, + "learning_rate": 2.9282612388124077e-07, + "loss": 1.6592, + "step": 13496 + }, + { + "epoch": 2.704809619238477, + "grad_norm": 18.814937888687265, + "learning_rate": 2.9243312122787635e-07, + "loss": 1.3422, + "step": 13497 + }, + { + "epoch": 2.7050100200400804, + "grad_norm": 26.946231932449717, + "learning_rate": 
2.9204037453187284e-07, + "loss": 1.3468, + "step": 13498 + }, + { + "epoch": 2.7052104208416834, + "grad_norm": 18.354624195503252, + "learning_rate": 2.916478838145831e-07, + "loss": 1.4515, + "step": 13499 + }, + { + "epoch": 2.7054108216432864, + "grad_norm": 29.220548899644545, + "learning_rate": 2.912556490973484e-07, + "loss": 2.0236, + "step": 13500 + }, + { + "epoch": 2.70561122244489, + "grad_norm": 19.699090147151335, + "learning_rate": 2.9086367040149555e-07, + "loss": 1.6455, + "step": 13501 + }, + { + "epoch": 2.705811623246493, + "grad_norm": 22.391781017841268, + "learning_rate": 2.9047194774833476e-07, + "loss": 1.8991, + "step": 13502 + }, + { + "epoch": 2.7060120240480963, + "grad_norm": 30.439344984260376, + "learning_rate": 2.900804811591673e-07, + "loss": 1.5766, + "step": 13503 + }, + { + "epoch": 2.7062124248496993, + "grad_norm": 18.165223854791314, + "learning_rate": 2.8968927065527617e-07, + "loss": 1.5712, + "step": 13504 + }, + { + "epoch": 2.7064128256513027, + "grad_norm": 19.90844066440693, + "learning_rate": 2.892983162579316e-07, + "loss": 1.4689, + "step": 13505 + }, + { + "epoch": 2.7066132264529057, + "grad_norm": 27.799870123686986, + "learning_rate": 2.8890761798839263e-07, + "loss": 1.7424, + "step": 13506 + }, + { + "epoch": 2.706813627254509, + "grad_norm": 20.30399243603063, + "learning_rate": 2.8851717586790004e-07, + "loss": 1.359, + "step": 13507 + }, + { + "epoch": 2.707014028056112, + "grad_norm": 33.37121495596516, + "learning_rate": 2.881269899176842e-07, + "loss": 1.6553, + "step": 13508 + }, + { + "epoch": 2.7072144288577156, + "grad_norm": 20.698962508001486, + "learning_rate": 2.877370601589574e-07, + "loss": 1.4523, + "step": 13509 + }, + { + "epoch": 2.7074148296593186, + "grad_norm": 22.408585626233528, + "learning_rate": 2.873473866129234e-07, + "loss": 1.1531, + "step": 13510 + }, + { + "epoch": 2.707615230460922, + "grad_norm": 21.64564306960919, + "learning_rate": 2.8695796930076956e-07, + "loss": 1.4427, + "step": 13511 + }, + { + "epoch": 2.707815631262525, + "grad_norm": 27.03950350923171, + "learning_rate": 2.865688082436663e-07, + "loss": 1.0915, + "step": 13512 + }, + { + "epoch": 2.708016032064128, + "grad_norm": 27.664242928233033, + "learning_rate": 2.8617990346277656e-07, + "loss": 1.7567, + "step": 13513 + }, + { + "epoch": 2.7082164328657314, + "grad_norm": 23.310505674831354, + "learning_rate": 2.857912549792424e-07, + "loss": 1.1635, + "step": 13514 + }, + { + "epoch": 2.708416833667335, + "grad_norm": 32.18203284178351, + "learning_rate": 2.8540286281419693e-07, + "loss": 1.5837, + "step": 13515 + }, + { + "epoch": 2.708617234468938, + "grad_norm": 43.08626751698365, + "learning_rate": 2.850147269887565e-07, + "loss": 1.2158, + "step": 13516 + }, + { + "epoch": 2.708817635270541, + "grad_norm": 48.590544523349884, + "learning_rate": 2.84626847524026e-07, + "loss": 1.6806, + "step": 13517 + }, + { + "epoch": 2.7090180360721443, + "grad_norm": 24.06466198182814, + "learning_rate": 2.8423922444109463e-07, + "loss": 1.4286, + "step": 13518 + }, + { + "epoch": 2.7092184368737477, + "grad_norm": 22.594023691479965, + "learning_rate": 2.838518577610366e-07, + "loss": 1.3878, + "step": 13519 + }, + { + "epoch": 2.7094188376753507, + "grad_norm": 21.01894605628452, + "learning_rate": 2.8346474750491627e-07, + "loss": 1.286, + "step": 13520 + }, + { + "epoch": 2.7096192384769537, + "grad_norm": 19.288474482106754, + "learning_rate": 2.830778936937789e-07, + "loss": 1.5182, + "step": 13521 + }, + { + "epoch": 
2.709819639278557, + "grad_norm": 35.98466817819104, + "learning_rate": 2.8269129634865945e-07, + "loss": 1.7051, + "step": 13522 + }, + { + "epoch": 2.71002004008016, + "grad_norm": 16.073242165449823, + "learning_rate": 2.8230495549057714e-07, + "loss": 1.4033, + "step": 13523 + }, + { + "epoch": 2.7102204408817636, + "grad_norm": 22.1690493696177, + "learning_rate": 2.819188711405391e-07, + "loss": 1.6846, + "step": 13524 + }, + { + "epoch": 2.7104208416833666, + "grad_norm": 32.12446828795913, + "learning_rate": 2.815330433195357e-07, + "loss": 1.0146, + "step": 13525 + }, + { + "epoch": 2.71062124248497, + "grad_norm": 34.04142515243546, + "learning_rate": 2.811474720485463e-07, + "loss": 1.8475, + "step": 13526 + }, + { + "epoch": 2.710821643286573, + "grad_norm": 36.40375754003639, + "learning_rate": 2.807621573485353e-07, + "loss": 1.4755, + "step": 13527 + }, + { + "epoch": 2.7110220440881765, + "grad_norm": 23.199487520873355, + "learning_rate": 2.803770992404514e-07, + "loss": 1.5196, + "step": 13528 + }, + { + "epoch": 2.7112224448897795, + "grad_norm": 52.10243325760268, + "learning_rate": 2.799922977452302e-07, + "loss": 1.9856, + "step": 13529 + }, + { + "epoch": 2.711422845691383, + "grad_norm": 19.738285339980585, + "learning_rate": 2.7960775288379703e-07, + "loss": 1.3962, + "step": 13530 + }, + { + "epoch": 2.711623246492986, + "grad_norm": 33.99589048547371, + "learning_rate": 2.792234646770575e-07, + "loss": 0.8815, + "step": 13531 + }, + { + "epoch": 2.7118236472945894, + "grad_norm": 59.198960736301665, + "learning_rate": 2.7883943314590654e-07, + "loss": 1.2427, + "step": 13532 + }, + { + "epoch": 2.7120240480961924, + "grad_norm": 18.146992748188797, + "learning_rate": 2.7845565831122467e-07, + "loss": 1.4237, + "step": 13533 + }, + { + "epoch": 2.7122244488977953, + "grad_norm": 21.090356864212637, + "learning_rate": 2.780721401938785e-07, + "loss": 1.6873, + "step": 13534 + }, + { + "epoch": 2.712424849699399, + "grad_norm": 18.96747719755891, + "learning_rate": 2.7768887881472086e-07, + "loss": 1.132, + "step": 13535 + }, + { + "epoch": 2.7126252505010022, + "grad_norm": 39.05429885313991, + "learning_rate": 2.773058741945883e-07, + "loss": 1.7608, + "step": 13536 + }, + { + "epoch": 2.7128256513026052, + "grad_norm": 22.39107790463204, + "learning_rate": 2.769231263543082e-07, + "loss": 1.2486, + "step": 13537 + }, + { + "epoch": 2.713026052104208, + "grad_norm": 26.916223087523758, + "learning_rate": 2.7654063531468877e-07, + "loss": 1.612, + "step": 13538 + }, + { + "epoch": 2.7132264529058117, + "grad_norm": 24.024776199092976, + "learning_rate": 2.761584010965274e-07, + "loss": 1.528, + "step": 13539 + }, + { + "epoch": 2.713426853707415, + "grad_norm": 58.159783381577284, + "learning_rate": 2.7577642372060676e-07, + "loss": 1.7129, + "step": 13540 + }, + { + "epoch": 2.713627254509018, + "grad_norm": 16.224118083740628, + "learning_rate": 2.7539470320769533e-07, + "loss": 1.6278, + "step": 13541 + }, + { + "epoch": 2.713827655310621, + "grad_norm": 20.744789844955672, + "learning_rate": 2.750132395785488e-07, + "loss": 1.7433, + "step": 13542 + }, + { + "epoch": 2.7140280561122245, + "grad_norm": 26.272254265384195, + "learning_rate": 2.746320328539065e-07, + "loss": 1.9195, + "step": 13543 + }, + { + "epoch": 2.7142284569138275, + "grad_norm": 22.408315752146677, + "learning_rate": 2.74251083054497e-07, + "loss": 1.4101, + "step": 13544 + }, + { + "epoch": 2.714428857715431, + "grad_norm": 22.706578370116464, + "learning_rate": 2.738703902010309e-07, + 
"loss": 1.4984, + "step": 13545 + }, + { + "epoch": 2.714629258517034, + "grad_norm": 32.27199405488466, + "learning_rate": 2.7348995431420834e-07, + "loss": 1.0261, + "step": 13546 + }, + { + "epoch": 2.7148296593186374, + "grad_norm": 25.648429303205003, + "learning_rate": 2.7310977541471386e-07, + "loss": 1.6204, + "step": 13547 + }, + { + "epoch": 2.7150300601202404, + "grad_norm": 29.15625397351588, + "learning_rate": 2.727298535232181e-07, + "loss": 1.1605, + "step": 13548 + }, + { + "epoch": 2.715230460921844, + "grad_norm": 22.941902481840025, + "learning_rate": 2.7235018866037856e-07, + "loss": 1.6718, + "step": 13549 + }, + { + "epoch": 2.715430861723447, + "grad_norm": 31.732528981902917, + "learning_rate": 2.71970780846838e-07, + "loss": 1.6642, + "step": 13550 + }, + { + "epoch": 2.71563126252505, + "grad_norm": 20.32724729674326, + "learning_rate": 2.715916301032256e-07, + "loss": 1.5734, + "step": 13551 + }, + { + "epoch": 2.7158316633266533, + "grad_norm": 17.815617168180253, + "learning_rate": 2.712127364501554e-07, + "loss": 1.5472, + "step": 13552 + }, + { + "epoch": 2.7160320641282567, + "grad_norm": 23.201937520824796, + "learning_rate": 2.7083409990822985e-07, + "loss": 1.6218, + "step": 13553 + }, + { + "epoch": 2.7162324649298597, + "grad_norm": 21.82786382204346, + "learning_rate": 2.704557204980346e-07, + "loss": 1.5966, + "step": 13554 + }, + { + "epoch": 2.7164328657314627, + "grad_norm": 42.497978644341664, + "learning_rate": 2.700775982401432e-07, + "loss": 2.1244, + "step": 13555 + }, + { + "epoch": 2.716633266533066, + "grad_norm": 23.506533766411295, + "learning_rate": 2.696997331551149e-07, + "loss": 1.549, + "step": 13556 + }, + { + "epoch": 2.7168336673346696, + "grad_norm": 22.257297532225966, + "learning_rate": 2.6932212526349476e-07, + "loss": 1.8956, + "step": 13557 + }, + { + "epoch": 2.7170340681362726, + "grad_norm": 23.139313996021407, + "learning_rate": 2.6894477458581426e-07, + "loss": 1.5842, + "step": 13558 + }, + { + "epoch": 2.7172344689378756, + "grad_norm": 20.606286178112022, + "learning_rate": 2.6856768114258914e-07, + "loss": 1.289, + "step": 13559 + }, + { + "epoch": 2.717434869739479, + "grad_norm": 39.41322375328425, + "learning_rate": 2.6819084495432355e-07, + "loss": 1.4482, + "step": 13560 + }, + { + "epoch": 2.717635270541082, + "grad_norm": 20.85093337022325, + "learning_rate": 2.678142660415073e-07, + "loss": 1.3476, + "step": 13561 + }, + { + "epoch": 2.7178356713426854, + "grad_norm": 19.902828468274787, + "learning_rate": 2.6743794442461446e-07, + "loss": 1.8622, + "step": 13562 + }, + { + "epoch": 2.7180360721442884, + "grad_norm": 32.264014058251604, + "learning_rate": 2.6706188012410594e-07, + "loss": 1.6365, + "step": 13563 + }, + { + "epoch": 2.718236472945892, + "grad_norm": 32.12102732085442, + "learning_rate": 2.6668607316042985e-07, + "loss": 1.8108, + "step": 13564 + }, + { + "epoch": 2.718436873747495, + "grad_norm": 35.12534815137782, + "learning_rate": 2.663105235540186e-07, + "loss": 1.3858, + "step": 13565 + }, + { + "epoch": 2.7186372745490983, + "grad_norm": 19.010680737486858, + "learning_rate": 2.6593523132529217e-07, + "loss": 1.3393, + "step": 13566 + }, + { + "epoch": 2.7188376753507013, + "grad_norm": 32.47132204417413, + "learning_rate": 2.6556019649465525e-07, + "loss": 1.5096, + "step": 13567 + }, + { + "epoch": 2.7190380761523048, + "grad_norm": 24.001270141911416, + "learning_rate": 2.6518541908249986e-07, + "loss": 1.5881, + "step": 13568 + }, + { + "epoch": 2.7192384769539077, + "grad_norm": 
20.054214075151222, + "learning_rate": 2.648108991092008e-07, + "loss": 1.7673, + "step": 13569 + }, + { + "epoch": 2.719438877755511, + "grad_norm": 22.668560612025964, + "learning_rate": 2.6443663659512453e-07, + "loss": 1.3705, + "step": 13570 + }, + { + "epoch": 2.719639278557114, + "grad_norm": 27.427325215383515, + "learning_rate": 2.640626315606176e-07, + "loss": 1.6114, + "step": 13571 + }, + { + "epoch": 2.719839679358717, + "grad_norm": 24.756080094934013, + "learning_rate": 2.6368888402601654e-07, + "loss": 1.9076, + "step": 13572 + }, + { + "epoch": 2.7200400801603206, + "grad_norm": 21.5175577827383, + "learning_rate": 2.6331539401164286e-07, + "loss": 1.6379, + "step": 13573 + }, + { + "epoch": 2.720240480961924, + "grad_norm": 19.387635279845092, + "learning_rate": 2.629421615378025e-07, + "loss": 1.5004, + "step": 13574 + }, + { + "epoch": 2.720440881763527, + "grad_norm": 25.80839977986161, + "learning_rate": 2.6256918662479036e-07, + "loss": 1.5544, + "step": 13575 + }, + { + "epoch": 2.72064128256513, + "grad_norm": 23.850942544221727, + "learning_rate": 2.62196469292883e-07, + "loss": 1.6193, + "step": 13576 + }, + { + "epoch": 2.7208416833667335, + "grad_norm": 17.040432427090888, + "learning_rate": 2.6182400956234865e-07, + "loss": 1.7237, + "step": 13577 + }, + { + "epoch": 2.721042084168337, + "grad_norm": 18.98975540209064, + "learning_rate": 2.614518074534367e-07, + "loss": 1.9792, + "step": 13578 + }, + { + "epoch": 2.72124248496994, + "grad_norm": 18.96300373250436, + "learning_rate": 2.610798629863842e-07, + "loss": 1.5578, + "step": 13579 + }, + { + "epoch": 2.721442885771543, + "grad_norm": 31.00605413386509, + "learning_rate": 2.607081761814151e-07, + "loss": 1.7404, + "step": 13580 + }, + { + "epoch": 2.7216432865731464, + "grad_norm": 25.03410693710143, + "learning_rate": 2.603367470587381e-07, + "loss": 1.6475, + "step": 13581 + }, + { + "epoch": 2.7218436873747494, + "grad_norm": 32.59174434868726, + "learning_rate": 2.5996557563855e-07, + "loss": 1.5829, + "step": 13582 + }, + { + "epoch": 2.722044088176353, + "grad_norm": 25.263038308212995, + "learning_rate": 2.5959466194102833e-07, + "loss": 1.4368, + "step": 13583 + }, + { + "epoch": 2.722244488977956, + "grad_norm": 21.823884108832704, + "learning_rate": 2.592240059863438e-07, + "loss": 1.5372, + "step": 13584 + }, + { + "epoch": 2.7224448897795592, + "grad_norm": 20.0088286046203, + "learning_rate": 2.5885360779464796e-07, + "loss": 1.6778, + "step": 13585 + }, + { + "epoch": 2.7226452905811622, + "grad_norm": 15.910929387509565, + "learning_rate": 2.584834673860787e-07, + "loss": 1.2453, + "step": 13586 + }, + { + "epoch": 2.7228456913827657, + "grad_norm": 18.123572495253878, + "learning_rate": 2.581135847807642e-07, + "loss": 1.8393, + "step": 13587 + }, + { + "epoch": 2.7230460921843687, + "grad_norm": 19.24310876047183, + "learning_rate": 2.57743959998813e-07, + "loss": 1.2643, + "step": 13588 + }, + { + "epoch": 2.723246492985972, + "grad_norm": 22.100410586493037, + "learning_rate": 2.573745930603233e-07, + "loss": 1.8805, + "step": 13589 + }, + { + "epoch": 2.723446893787575, + "grad_norm": 28.117064372851917, + "learning_rate": 2.5700548398537696e-07, + "loss": 1.3806, + "step": 13590 + }, + { + "epoch": 2.7236472945891785, + "grad_norm": 24.840845522960112, + "learning_rate": 2.5663663279404395e-07, + "loss": 1.7655, + "step": 13591 + }, + { + "epoch": 2.7238476953907815, + "grad_norm": 26.224423320189153, + "learning_rate": 2.562680395063799e-07, + "loss": 1.3277, + "step": 13592 + }, + 
{ + "epoch": 2.7240480961923845, + "grad_norm": 21.535233081549357, + "learning_rate": 2.558997041424238e-07, + "loss": 1.5717, + "step": 13593 + }, + { + "epoch": 2.724248496993988, + "grad_norm": 22.21598590764522, + "learning_rate": 2.555316267222047e-07, + "loss": 1.7635, + "step": 13594 + }, + { + "epoch": 2.7244488977955914, + "grad_norm": 54.553148214168104, + "learning_rate": 2.551638072657342e-07, + "loss": 1.929, + "step": 13595 + }, + { + "epoch": 2.7246492985971944, + "grad_norm": 21.236580685462126, + "learning_rate": 2.547962457930109e-07, + "loss": 1.8217, + "step": 13596 + }, + { + "epoch": 2.7248496993987974, + "grad_norm": 17.410750645163635, + "learning_rate": 2.544289423240209e-07, + "loss": 1.3233, + "step": 13597 + }, + { + "epoch": 2.725050100200401, + "grad_norm": 19.401434266704072, + "learning_rate": 2.5406189687873393e-07, + "loss": 1.2839, + "step": 13598 + }, + { + "epoch": 2.7252505010020043, + "grad_norm": 63.03222058742535, + "learning_rate": 2.536951094771084e-07, + "loss": 2.1352, + "step": 13599 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 20.345152724264086, + "learning_rate": 2.5332858013908444e-07, + "loss": 1.6306, + "step": 13600 + }, + { + "epoch": 2.7256513026052103, + "grad_norm": 22.283128877395054, + "learning_rate": 2.5296230888459395e-07, + "loss": 1.9524, + "step": 13601 + }, + { + "epoch": 2.7258517034068137, + "grad_norm": 51.23259170967105, + "learning_rate": 2.525962957335487e-07, + "loss": 1.4284, + "step": 13602 + }, + { + "epoch": 2.7260521042084167, + "grad_norm": 18.632327036295724, + "learning_rate": 2.5223054070585117e-07, + "loss": 1.6056, + "step": 13603 + }, + { + "epoch": 2.72625250501002, + "grad_norm": 25.723419738573696, + "learning_rate": 2.518650438213871e-07, + "loss": 1.6883, + "step": 13604 + }, + { + "epoch": 2.726452905811623, + "grad_norm": 26.632393234160126, + "learning_rate": 2.5149980510003e-07, + "loss": 2.0493, + "step": 13605 + }, + { + "epoch": 2.7266533066132266, + "grad_norm": 18.16521253682114, + "learning_rate": 2.51134824561638e-07, + "loss": 1.4134, + "step": 13606 + }, + { + "epoch": 2.7268537074148296, + "grad_norm": 18.36455912492879, + "learning_rate": 2.5077010222605447e-07, + "loss": 1.6913, + "step": 13607 + }, + { + "epoch": 2.727054108216433, + "grad_norm": 21.117008311021067, + "learning_rate": 2.504056381131126e-07, + "loss": 1.5831, + "step": 13608 + }, + { + "epoch": 2.727254509018036, + "grad_norm": 17.459384888797903, + "learning_rate": 2.5004143224262643e-07, + "loss": 1.6218, + "step": 13609 + }, + { + "epoch": 2.727454909819639, + "grad_norm": 17.588227111546953, + "learning_rate": 2.496774846343991e-07, + "loss": 1.8083, + "step": 13610 + }, + { + "epoch": 2.7276553106212424, + "grad_norm": 38.04016579236395, + "learning_rate": 2.493137953082186e-07, + "loss": 1.3005, + "step": 13611 + }, + { + "epoch": 2.727855711422846, + "grad_norm": 21.13186727562229, + "learning_rate": 2.4895036428386034e-07, + "loss": 1.4762, + "step": 13612 + }, + { + "epoch": 2.728056112224449, + "grad_norm": 18.71013600089281, + "learning_rate": 2.4858719158108393e-07, + "loss": 1.2973, + "step": 13613 + }, + { + "epoch": 2.728256513026052, + "grad_norm": 23.112432166617797, + "learning_rate": 2.4822427721963526e-07, + "loss": 1.5005, + "step": 13614 + }, + { + "epoch": 2.7284569138276553, + "grad_norm": 14.16994632698177, + "learning_rate": 2.478616212192481e-07, + "loss": 1.2755, + "step": 13615 + }, + { + "epoch": 2.7286573146292588, + "grad_norm": 22.76140656390261, + "learning_rate": 
2.474992235996382e-07, + "loss": 1.6575, + "step": 13616 + }, + { + "epoch": 2.7288577154308618, + "grad_norm": 18.04514364662031, + "learning_rate": 2.471370843805104e-07, + "loss": 1.2544, + "step": 13617 + }, + { + "epoch": 2.7290581162324647, + "grad_norm": 26.162520047936276, + "learning_rate": 2.4677520358155626e-07, + "loss": 1.9487, + "step": 13618 + }, + { + "epoch": 2.729258517034068, + "grad_norm": 23.117817868991835, + "learning_rate": 2.464135812224505e-07, + "loss": 1.5605, + "step": 13619 + }, + { + "epoch": 2.729458917835671, + "grad_norm": 24.66029758574435, + "learning_rate": 2.4605221732285466e-07, + "loss": 1.6241, + "step": 13620 + }, + { + "epoch": 2.7296593186372746, + "grad_norm": 14.188189950940666, + "learning_rate": 2.4569111190241803e-07, + "loss": 1.4806, + "step": 13621 + }, + { + "epoch": 2.7298597194388776, + "grad_norm": 25.047877474186087, + "learning_rate": 2.453302649807732e-07, + "loss": 1.5411, + "step": 13622 + }, + { + "epoch": 2.730060120240481, + "grad_norm": 22.71074590476532, + "learning_rate": 2.449696765775411e-07, + "loss": 1.7958, + "step": 13623 + }, + { + "epoch": 2.730260521042084, + "grad_norm": 25.041828884957877, + "learning_rate": 2.4460934671232504e-07, + "loss": 1.6283, + "step": 13624 + }, + { + "epoch": 2.7304609218436875, + "grad_norm": 27.713861772694287, + "learning_rate": 2.4424927540472044e-07, + "loss": 1.4286, + "step": 13625 + }, + { + "epoch": 2.7306613226452905, + "grad_norm": 45.47359279010568, + "learning_rate": 2.438894626743016e-07, + "loss": 1.902, + "step": 13626 + }, + { + "epoch": 2.730861723446894, + "grad_norm": 35.92092304764273, + "learning_rate": 2.4352990854063397e-07, + "loss": 1.7688, + "step": 13627 + }, + { + "epoch": 2.731062124248497, + "grad_norm": 25.16140574647204, + "learning_rate": 2.4317061302326585e-07, + "loss": 1.4104, + "step": 13628 + }, + { + "epoch": 2.7312625250501004, + "grad_norm": 22.31186901392285, + "learning_rate": 2.428115761417338e-07, + "loss": 1.4487, + "step": 13629 + }, + { + "epoch": 2.7314629258517034, + "grad_norm": 20.674677559963534, + "learning_rate": 2.4245279791555774e-07, + "loss": 1.423, + "step": 13630 + }, + { + "epoch": 2.7316633266533064, + "grad_norm": 30.63899763906053, + "learning_rate": 2.420942783642466e-07, + "loss": 2.2856, + "step": 13631 + }, + { + "epoch": 2.73186372745491, + "grad_norm": 21.580453426922052, + "learning_rate": 2.41736017507293e-07, + "loss": 1.7877, + "step": 13632 + }, + { + "epoch": 2.7320641282565132, + "grad_norm": 24.44351776615734, + "learning_rate": 2.4137801536417595e-07, + "loss": 1.3737, + "step": 13633 + }, + { + "epoch": 2.7322645290581162, + "grad_norm": 17.02367437683447, + "learning_rate": 2.410202719543603e-07, + "loss": 0.9683, + "step": 13634 + }, + { + "epoch": 2.7324649298597192, + "grad_norm": 35.19713067258994, + "learning_rate": 2.406627872972972e-07, + "loss": 1.8361, + "step": 13635 + }, + { + "epoch": 2.7326653306613227, + "grad_norm": 21.371333439489113, + "learning_rate": 2.4030556141242336e-07, + "loss": 1.5066, + "step": 13636 + }, + { + "epoch": 2.732865731462926, + "grad_norm": 16.839067224187314, + "learning_rate": 2.399485943191626e-07, + "loss": 1.6186, + "step": 13637 + }, + { + "epoch": 2.733066132264529, + "grad_norm": 22.32342003330422, + "learning_rate": 2.395918860369234e-07, + "loss": 1.4453, + "step": 13638 + }, + { + "epoch": 2.733266533066132, + "grad_norm": 25.85796305945312, + "learning_rate": 2.3923543658510074e-07, + "loss": 1.2907, + "step": 13639 + }, + { + "epoch": 2.7334669338677355, 
+ "grad_norm": 23.293209565563547, + "learning_rate": 2.388792459830741e-07, + "loss": 1.2041, + "step": 13640 + }, + { + "epoch": 2.7336673346693385, + "grad_norm": 26.273391737750124, + "learning_rate": 2.3852331425021136e-07, + "loss": 1.4985, + "step": 13641 + }, + { + "epoch": 2.733867735470942, + "grad_norm": 22.532563965505823, + "learning_rate": 2.381676414058648e-07, + "loss": 2.0074, + "step": 13642 + }, + { + "epoch": 2.734068136272545, + "grad_norm": 23.177285094445338, + "learning_rate": 2.3781222746937226e-07, + "loss": 1.8729, + "step": 13643 + }, + { + "epoch": 2.7342685370741484, + "grad_norm": 20.04104557438498, + "learning_rate": 2.3745707246005833e-07, + "loss": 1.4492, + "step": 13644 + }, + { + "epoch": 2.7344689378757514, + "grad_norm": 23.91426808590274, + "learning_rate": 2.371021763972342e-07, + "loss": 1.836, + "step": 13645 + }, + { + "epoch": 2.734669338677355, + "grad_norm": 43.240979738751015, + "learning_rate": 2.3674753930019556e-07, + "loss": 1.7107, + "step": 13646 + }, + { + "epoch": 2.734869739478958, + "grad_norm": 21.463443483937315, + "learning_rate": 2.3639316118822308e-07, + "loss": 1.7198, + "step": 13647 + }, + { + "epoch": 2.7350701402805613, + "grad_norm": 21.527486565027335, + "learning_rate": 2.3603904208058693e-07, + "loss": 1.7543, + "step": 13648 + }, + { + "epoch": 2.7352705410821643, + "grad_norm": 20.683072166379542, + "learning_rate": 2.3568518199654112e-07, + "loss": 1.1097, + "step": 13649 + }, + { + "epoch": 2.7354709418837677, + "grad_norm": 25.110857726178594, + "learning_rate": 2.3533158095532417e-07, + "loss": 1.4587, + "step": 13650 + }, + { + "epoch": 2.7356713426853707, + "grad_norm": 18.995409201763575, + "learning_rate": 2.349782389761618e-07, + "loss": 1.7172, + "step": 13651 + }, + { + "epoch": 2.7358717434869737, + "grad_norm": 22.824196245435243, + "learning_rate": 2.3462515607826696e-07, + "loss": 1.5702, + "step": 13652 + }, + { + "epoch": 2.736072144288577, + "grad_norm": 15.405740291854952, + "learning_rate": 2.3427233228083656e-07, + "loss": 1.4997, + "step": 13653 + }, + { + "epoch": 2.7362725450901806, + "grad_norm": 84.6940880055016, + "learning_rate": 2.3391976760305468e-07, + "loss": 1.8015, + "step": 13654 + }, + { + "epoch": 2.7364729458917836, + "grad_norm": 26.368649653000546, + "learning_rate": 2.3356746206408987e-07, + "loss": 1.464, + "step": 13655 + }, + { + "epoch": 2.7366733466933866, + "grad_norm": 23.31217372121274, + "learning_rate": 2.3321541568309902e-07, + "loss": 1.2065, + "step": 13656 + }, + { + "epoch": 2.73687374749499, + "grad_norm": 25.218765194330285, + "learning_rate": 2.3286362847922074e-07, + "loss": 1.4106, + "step": 13657 + }, + { + "epoch": 2.7370741482965935, + "grad_norm": 81.00759653917862, + "learning_rate": 2.3251210047158578e-07, + "loss": 1.4491, + "step": 13658 + }, + { + "epoch": 2.7372745490981965, + "grad_norm": 22.46633728745609, + "learning_rate": 2.3216083167930447e-07, + "loss": 1.6814, + "step": 13659 + }, + { + "epoch": 2.7374749498997994, + "grad_norm": 17.83009863517748, + "learning_rate": 2.3180982212147651e-07, + "loss": 1.447, + "step": 13660 + }, + { + "epoch": 2.737675350701403, + "grad_norm": 24.32688681350359, + "learning_rate": 2.3145907181718719e-07, + "loss": 1.8686, + "step": 13661 + }, + { + "epoch": 2.737875751503006, + "grad_norm": 23.698245072587188, + "learning_rate": 2.3110858078550735e-07, + "loss": 1.5565, + "step": 13662 + }, + { + "epoch": 2.7380761523046093, + "grad_norm": 35.192198802367315, + "learning_rate": 2.3075834904549398e-07, + 
"loss": 1.7672, + "step": 13663 + }, + { + "epoch": 2.7382765531062123, + "grad_norm": 34.18409569217015, + "learning_rate": 2.3040837661618798e-07, + "loss": 1.4196, + "step": 13664 + }, + { + "epoch": 2.7384769539078158, + "grad_norm": 24.891181065561124, + "learning_rate": 2.300586635166202e-07, + "loss": 1.1988, + "step": 13665 + }, + { + "epoch": 2.7386773547094188, + "grad_norm": 22.626754680323565, + "learning_rate": 2.2970920976580325e-07, + "loss": 1.3779, + "step": 13666 + }, + { + "epoch": 2.738877755511022, + "grad_norm": 19.643311623098068, + "learning_rate": 2.29360015382738e-07, + "loss": 1.6264, + "step": 13667 + }, + { + "epoch": 2.739078156312625, + "grad_norm": 20.826014377195172, + "learning_rate": 2.2901108038641152e-07, + "loss": 1.6493, + "step": 13668 + }, + { + "epoch": 2.739278557114228, + "grad_norm": 38.61754719508752, + "learning_rate": 2.2866240479579526e-07, + "loss": 1.2581, + "step": 13669 + }, + { + "epoch": 2.7394789579158316, + "grad_norm": 23.352371593602623, + "learning_rate": 2.2831398862984745e-07, + "loss": 1.3813, + "step": 13670 + }, + { + "epoch": 2.739679358717435, + "grad_norm": 30.85245075633211, + "learning_rate": 2.2796583190751008e-07, + "loss": 1.713, + "step": 13671 + }, + { + "epoch": 2.739879759519038, + "grad_norm": 16.67522701874155, + "learning_rate": 2.2761793464771587e-07, + "loss": 1.4209, + "step": 13672 + }, + { + "epoch": 2.740080160320641, + "grad_norm": 18.1190058450382, + "learning_rate": 2.2727029686937962e-07, + "loss": 1.3384, + "step": 13673 + }, + { + "epoch": 2.7402805611222445, + "grad_norm": 35.60770846574964, + "learning_rate": 2.2692291859140126e-07, + "loss": 1.5791, + "step": 13674 + }, + { + "epoch": 2.740480961923848, + "grad_norm": 23.752704672841265, + "learning_rate": 2.2657579983267065e-07, + "loss": 1.3686, + "step": 13675 + }, + { + "epoch": 2.740681362725451, + "grad_norm": 31.488289451462432, + "learning_rate": 2.2622894061205936e-07, + "loss": 1.7362, + "step": 13676 + }, + { + "epoch": 2.740881763527054, + "grad_norm": 17.70526584395569, + "learning_rate": 2.2588234094842787e-07, + "loss": 1.0783, + "step": 13677 + }, + { + "epoch": 2.7410821643286574, + "grad_norm": 22.53102788816348, + "learning_rate": 2.2553600086062e-07, + "loss": 1.1837, + "step": 13678 + }, + { + "epoch": 2.7412825651302604, + "grad_norm": 14.168058250819731, + "learning_rate": 2.251899203674679e-07, + "loss": 1.5439, + "step": 13679 + }, + { + "epoch": 2.741482965931864, + "grad_norm": 42.80247793460442, + "learning_rate": 2.2484409948778874e-07, + "loss": 1.6113, + "step": 13680 + }, + { + "epoch": 2.741683366733467, + "grad_norm": 16.546603511794363, + "learning_rate": 2.2449853824038358e-07, + "loss": 1.517, + "step": 13681 + }, + { + "epoch": 2.7418837675350702, + "grad_norm": 22.37286573448461, + "learning_rate": 2.2415323664404298e-07, + "loss": 1.2833, + "step": 13682 + }, + { + "epoch": 2.7420841683366732, + "grad_norm": 27.529952903728883, + "learning_rate": 2.2380819471754022e-07, + "loss": 1.26, + "step": 13683 + }, + { + "epoch": 2.7422845691382767, + "grad_norm": 18.372269102715947, + "learning_rate": 2.2346341247963644e-07, + "loss": 1.7307, + "step": 13684 + }, + { + "epoch": 2.7424849699398797, + "grad_norm": 23.24064664246176, + "learning_rate": 2.231188899490777e-07, + "loss": 1.867, + "step": 13685 + }, + { + "epoch": 2.742685370741483, + "grad_norm": 27.927307619121763, + "learning_rate": 2.2277462714459576e-07, + "loss": 2.1523, + "step": 13686 + }, + { + "epoch": 2.742885771543086, + "grad_norm": 
26.138160025208972, + "learning_rate": 2.2243062408491055e-07, + "loss": 2.3126, + "step": 13687 + }, + { + "epoch": 2.7430861723446895, + "grad_norm": 23.796212393507705, + "learning_rate": 2.2208688078872276e-07, + "loss": 1.7824, + "step": 13688 + }, + { + "epoch": 2.7432865731462925, + "grad_norm": 22.928161295948925, + "learning_rate": 2.2174339727472573e-07, + "loss": 1.7623, + "step": 13689 + }, + { + "epoch": 2.7434869739478955, + "grad_norm": 23.17186977026767, + "learning_rate": 2.2140017356159283e-07, + "loss": 1.9072, + "step": 13690 + }, + { + "epoch": 2.743687374749499, + "grad_norm": 22.81937890794246, + "learning_rate": 2.210572096679864e-07, + "loss": 1.3336, + "step": 13691 + }, + { + "epoch": 2.7438877755511024, + "grad_norm": 16.402194838729663, + "learning_rate": 2.207145056125537e-07, + "loss": 1.5885, + "step": 13692 + }, + { + "epoch": 2.7440881763527054, + "grad_norm": 18.67597587446811, + "learning_rate": 2.2037206141392819e-07, + "loss": 1.2428, + "step": 13693 + }, + { + "epoch": 2.7442885771543084, + "grad_norm": 34.96048760206967, + "learning_rate": 2.2002987709072998e-07, + "loss": 1.7208, + "step": 13694 + }, + { + "epoch": 2.744488977955912, + "grad_norm": 28.054111778946947, + "learning_rate": 2.1968795266156197e-07, + "loss": 1.6672, + "step": 13695 + }, + { + "epoch": 2.7446893787575153, + "grad_norm": 37.9010842108132, + "learning_rate": 2.1934628814501701e-07, + "loss": 1.5736, + "step": 13696 + }, + { + "epoch": 2.7448897795591183, + "grad_norm": 18.326962261044248, + "learning_rate": 2.1900488355967143e-07, + "loss": 1.3671, + "step": 13697 + }, + { + "epoch": 2.7450901803607213, + "grad_norm": 22.31848010530519, + "learning_rate": 2.1866373892408643e-07, + "loss": 1.316, + "step": 13698 + }, + { + "epoch": 2.7452905811623247, + "grad_norm": 21.46146009476672, + "learning_rate": 2.183228542568133e-07, + "loss": 1.7775, + "step": 13699 + }, + { + "epoch": 2.7454909819639277, + "grad_norm": 14.968469132139472, + "learning_rate": 2.179822295763845e-07, + "loss": 1.5176, + "step": 13700 + }, + { + "epoch": 2.745691382765531, + "grad_norm": 30.701239990628768, + "learning_rate": 2.1764186490132066e-07, + "loss": 1.3208, + "step": 13701 + }, + { + "epoch": 2.745891783567134, + "grad_norm": 21.555449769269675, + "learning_rate": 2.1730176025012817e-07, + "loss": 1.7395, + "step": 13702 + }, + { + "epoch": 2.7460921843687376, + "grad_norm": 19.8705504147163, + "learning_rate": 2.1696191564129888e-07, + "loss": 1.4717, + "step": 13703 + }, + { + "epoch": 2.7462925851703406, + "grad_norm": 28.646215240086384, + "learning_rate": 2.1662233109331132e-07, + "loss": 1.7486, + "step": 13704 + }, + { + "epoch": 2.746492985971944, + "grad_norm": 28.653765373972103, + "learning_rate": 2.1628300662462743e-07, + "loss": 1.3543, + "step": 13705 + }, + { + "epoch": 2.746693386773547, + "grad_norm": 25.43205492548257, + "learning_rate": 2.1594394225369854e-07, + "loss": 1.4993, + "step": 13706 + }, + { + "epoch": 2.7468937875751505, + "grad_norm": 27.265444574804064, + "learning_rate": 2.1560513799895932e-07, + "loss": 1.3755, + "step": 13707 + }, + { + "epoch": 2.7470941883767535, + "grad_norm": 20.82283528236627, + "learning_rate": 2.152665938788312e-07, + "loss": 1.4642, + "step": 13708 + }, + { + "epoch": 2.747294589178357, + "grad_norm": 22.467025743750664, + "learning_rate": 2.1492830991172108e-07, + "loss": 1.4505, + "step": 13709 + }, + { + "epoch": 2.74749498997996, + "grad_norm": 17.845933957461558, + "learning_rate": 2.1459028611602207e-07, + "loss": 1.4601, + 
"step": 13710 + }, + { + "epoch": 2.747695390781563, + "grad_norm": 16.89207745915357, + "learning_rate": 2.142525225101133e-07, + "loss": 1.5138, + "step": 13711 + }, + { + "epoch": 2.7478957915831663, + "grad_norm": 18.398079674330923, + "learning_rate": 2.13915019112359e-07, + "loss": 1.4191, + "step": 13712 + }, + { + "epoch": 2.7480961923847698, + "grad_norm": 14.509302492999561, + "learning_rate": 2.1357777594111063e-07, + "loss": 1.368, + "step": 13713 + }, + { + "epoch": 2.7482965931863728, + "grad_norm": 18.584469180423433, + "learning_rate": 2.1324079301470403e-07, + "loss": 1.2982, + "step": 13714 + }, + { + "epoch": 2.7484969939879758, + "grad_norm": 22.378809504840635, + "learning_rate": 2.1290407035146065e-07, + "loss": 1.7575, + "step": 13715 + }, + { + "epoch": 2.748697394789579, + "grad_norm": 21.13769848835136, + "learning_rate": 2.1256760796968978e-07, + "loss": 1.767, + "step": 13716 + }, + { + "epoch": 2.7488977955911826, + "grad_norm": 20.98229297661732, + "learning_rate": 2.122314058876851e-07, + "loss": 1.9358, + "step": 13717 + }, + { + "epoch": 2.7490981963927856, + "grad_norm": 30.128875857538233, + "learning_rate": 2.1189546412372586e-07, + "loss": 1.0016, + "step": 13718 + }, + { + "epoch": 2.7492985971943886, + "grad_norm": 24.260513412685693, + "learning_rate": 2.1155978269607802e-07, + "loss": 1.2318, + "step": 13719 + }, + { + "epoch": 2.749498997995992, + "grad_norm": 15.129165803808197, + "learning_rate": 2.1122436162299421e-07, + "loss": 1.2133, + "step": 13720 + }, + { + "epoch": 2.749699398797595, + "grad_norm": 18.8938658155594, + "learning_rate": 2.1088920092271038e-07, + "loss": 1.4095, + "step": 13721 + }, + { + "epoch": 2.7498997995991985, + "grad_norm": 21.18917450740461, + "learning_rate": 2.1055430061344916e-07, + "loss": 1.4878, + "step": 13722 + }, + { + "epoch": 2.7501002004008015, + "grad_norm": 19.29022348697729, + "learning_rate": 2.1021966071342104e-07, + "loss": 1.5515, + "step": 13723 + }, + { + "epoch": 2.750300601202405, + "grad_norm": 20.89247616178239, + "learning_rate": 2.098852812408203e-07, + "loss": 1.4278, + "step": 13724 + }, + { + "epoch": 2.750501002004008, + "grad_norm": 32.92817019013842, + "learning_rate": 2.09551162213828e-07, + "loss": 1.5924, + "step": 13725 + }, + { + "epoch": 2.7507014028056114, + "grad_norm": 21.96186322307634, + "learning_rate": 2.092173036506101e-07, + "loss": 1.8612, + "step": 13726 + }, + { + "epoch": 2.7509018036072144, + "grad_norm": 43.877654094846704, + "learning_rate": 2.0888370556931993e-07, + "loss": 1.5512, + "step": 13727 + }, + { + "epoch": 2.7511022044088174, + "grad_norm": 18.61998903449465, + "learning_rate": 2.0855036798809348e-07, + "loss": 1.6631, + "step": 13728 + }, + { + "epoch": 2.751302605210421, + "grad_norm": 18.291628766991078, + "learning_rate": 2.0821729092505682e-07, + "loss": 1.2012, + "step": 13729 + }, + { + "epoch": 2.7515030060120242, + "grad_norm": 45.86901938113897, + "learning_rate": 2.0788447439832048e-07, + "loss": 1.1797, + "step": 13730 + }, + { + "epoch": 2.7517034068136272, + "grad_norm": 25.634842445536634, + "learning_rate": 2.0755191842597832e-07, + "loss": 1.5566, + "step": 13731 + }, + { + "epoch": 2.7519038076152302, + "grad_norm": 23.238981864147526, + "learning_rate": 2.0721962302611252e-07, + "loss": 0.772, + "step": 13732 + }, + { + "epoch": 2.7521042084168337, + "grad_norm": 20.068699757161625, + "learning_rate": 2.068875882167909e-07, + "loss": 1.7596, + "step": 13733 + }, + { + "epoch": 2.752304609218437, + "grad_norm": 38.687390387026355, + 
"learning_rate": 2.0655581401606618e-07, + "loss": 1.6744, + "step": 13734 + }, + { + "epoch": 2.75250501002004, + "grad_norm": 54.79772558196307, + "learning_rate": 2.0622430044197783e-07, + "loss": 1.5725, + "step": 13735 + }, + { + "epoch": 2.752705410821643, + "grad_norm": 19.329715381658367, + "learning_rate": 2.0589304751255034e-07, + "loss": 1.1576, + "step": 13736 + }, + { + "epoch": 2.7529058116232465, + "grad_norm": 21.933493259365967, + "learning_rate": 2.0556205524579542e-07, + "loss": 1.6606, + "step": 13737 + }, + { + "epoch": 2.7531062124248495, + "grad_norm": 24.400927551631362, + "learning_rate": 2.0523132365970756e-07, + "loss": 1.8713, + "step": 13738 + }, + { + "epoch": 2.753306613226453, + "grad_norm": 18.544052667645584, + "learning_rate": 2.0490085277227123e-07, + "loss": 1.3312, + "step": 13739 + }, + { + "epoch": 2.753507014028056, + "grad_norm": 67.80557706559203, + "learning_rate": 2.045706426014532e-07, + "loss": 1.3988, + "step": 13740 + }, + { + "epoch": 2.7537074148296594, + "grad_norm": 16.95759730816929, + "learning_rate": 2.04240693165208e-07, + "loss": 1.7832, + "step": 13741 + }, + { + "epoch": 2.7539078156312624, + "grad_norm": 21.67146431911044, + "learning_rate": 2.0391100448147573e-07, + "loss": 1.5232, + "step": 13742 + }, + { + "epoch": 2.754108216432866, + "grad_norm": 18.383503427900553, + "learning_rate": 2.03581576568182e-07, + "loss": 1.2045, + "step": 13743 + }, + { + "epoch": 2.754308617234469, + "grad_norm": 21.708538933302734, + "learning_rate": 2.032524094432381e-07, + "loss": 1.1468, + "step": 13744 + }, + { + "epoch": 2.7545090180360723, + "grad_norm": 37.58743705006384, + "learning_rate": 2.0292350312454078e-07, + "loss": 1.6804, + "step": 13745 + }, + { + "epoch": 2.7547094188376753, + "grad_norm": 17.542085615572724, + "learning_rate": 2.0259485762997467e-07, + "loss": 1.3378, + "step": 13746 + }, + { + "epoch": 2.7549098196392787, + "grad_norm": 23.36789677579198, + "learning_rate": 2.0226647297740708e-07, + "loss": 1.4199, + "step": 13747 + }, + { + "epoch": 2.7551102204408817, + "grad_norm": 24.77141787595518, + "learning_rate": 2.019383491846938e-07, + "loss": 1.7739, + "step": 13748 + }, + { + "epoch": 2.7553106212424847, + "grad_norm": 28.779607985276122, + "learning_rate": 2.0161048626967439e-07, + "loss": 1.6215, + "step": 13749 + }, + { + "epoch": 2.755511022044088, + "grad_norm": 24.046445800655416, + "learning_rate": 2.0128288425017628e-07, + "loss": 1.5152, + "step": 13750 + }, + { + "epoch": 2.7557114228456916, + "grad_norm": 20.968206158080783, + "learning_rate": 2.0095554314401244e-07, + "loss": 1.8715, + "step": 13751 + }, + { + "epoch": 2.7559118236472946, + "grad_norm": 21.623415267062054, + "learning_rate": 2.0062846296897808e-07, + "loss": 0.9728, + "step": 13752 + }, + { + "epoch": 2.7561122244488976, + "grad_norm": 20.88389269387669, + "learning_rate": 2.0030164374285953e-07, + "loss": 1.2231, + "step": 13753 + }, + { + "epoch": 2.756312625250501, + "grad_norm": 19.33738672088691, + "learning_rate": 1.9997508548342537e-07, + "loss": 1.2555, + "step": 13754 + }, + { + "epoch": 2.7565130260521045, + "grad_norm": 24.228350407721873, + "learning_rate": 1.9964878820843082e-07, + "loss": 1.2235, + "step": 13755 + }, + { + "epoch": 2.7567134268537075, + "grad_norm": 38.28164886170503, + "learning_rate": 1.993227519356189e-07, + "loss": 1.2623, + "step": 13756 + }, + { + "epoch": 2.7569138276553105, + "grad_norm": 21.49716777840276, + "learning_rate": 1.9899697668271434e-07, + "loss": 1.7527, + "step": 13757 + }, + { + 
"epoch": 2.757114228456914, + "grad_norm": 16.791457938620557, + "learning_rate": 1.986714624674324e-07, + "loss": 1.0427, + "step": 13758 + }, + { + "epoch": 2.757314629258517, + "grad_norm": 22.83134996640268, + "learning_rate": 1.983462093074684e-07, + "loss": 1.3692, + "step": 13759 + }, + { + "epoch": 2.7575150300601203, + "grad_norm": 28.374517205052364, + "learning_rate": 1.9802121722051038e-07, + "loss": 1.5339, + "step": 13760 + }, + { + "epoch": 2.7577154308617233, + "grad_norm": 40.50518495531847, + "learning_rate": 1.97696486224227e-07, + "loss": 1.4984, + "step": 13761 + }, + { + "epoch": 2.7579158316633268, + "grad_norm": 25.174619042201762, + "learning_rate": 1.9737201633627357e-07, + "loss": 1.0565, + "step": 13762 + }, + { + "epoch": 2.7581162324649298, + "grad_norm": 22.913710348421322, + "learning_rate": 1.9704780757429431e-07, + "loss": 1.8098, + "step": 13763 + }, + { + "epoch": 2.758316633266533, + "grad_norm": 23.505343060319102, + "learning_rate": 1.9672385995591459e-07, + "loss": 1.6736, + "step": 13764 + }, + { + "epoch": 2.758517034068136, + "grad_norm": 18.015442217615707, + "learning_rate": 1.9640017349874918e-07, + "loss": 1.3819, + "step": 13765 + }, + { + "epoch": 2.7587174348697396, + "grad_norm": 15.848582680995909, + "learning_rate": 1.9607674822039734e-07, + "loss": 1.2595, + "step": 13766 + }, + { + "epoch": 2.7589178356713426, + "grad_norm": 17.65405850225693, + "learning_rate": 1.9575358413844337e-07, + "loss": 1.9904, + "step": 13767 + }, + { + "epoch": 2.759118236472946, + "grad_norm": 23.31760813205279, + "learning_rate": 1.9543068127045982e-07, + "loss": 1.1274, + "step": 13768 + }, + { + "epoch": 2.759318637274549, + "grad_norm": 20.63680774104376, + "learning_rate": 1.9510803963400105e-07, + "loss": 1.5326, + "step": 13769 + }, + { + "epoch": 2.759519038076152, + "grad_norm": 16.19713582124088, + "learning_rate": 1.9478565924661186e-07, + "loss": 1.424, + "step": 13770 + }, + { + "epoch": 2.7597194388777555, + "grad_norm": 23.774571456453376, + "learning_rate": 1.9446354012581882e-07, + "loss": 1.5637, + "step": 13771 + }, + { + "epoch": 2.759919839679359, + "grad_norm": 21.663364371341167, + "learning_rate": 1.9414168228913676e-07, + "loss": 1.715, + "step": 13772 + }, + { + "epoch": 2.760120240480962, + "grad_norm": 29.03124215518807, + "learning_rate": 1.938200857540662e-07, + "loss": 1.7738, + "step": 13773 + }, + { + "epoch": 2.760320641282565, + "grad_norm": 21.40736233384184, + "learning_rate": 1.9349875053809142e-07, + "loss": 1.6181, + "step": 13774 + }, + { + "epoch": 2.7605210420841684, + "grad_norm": 23.021283717926696, + "learning_rate": 1.9317767665868514e-07, + "loss": 1.541, + "step": 13775 + }, + { + "epoch": 2.760721442885772, + "grad_norm": 30.706500772291406, + "learning_rate": 1.928568641333034e-07, + "loss": 2.0458, + "step": 13776 + }, + { + "epoch": 2.760921843687375, + "grad_norm": 18.18847092410739, + "learning_rate": 1.9253631297939113e-07, + "loss": 1.6263, + "step": 13777 + }, + { + "epoch": 2.761122244488978, + "grad_norm": 28.2718530016916, + "learning_rate": 1.9221602321437494e-07, + "loss": 1.5966, + "step": 13778 + }, + { + "epoch": 2.7613226452905812, + "grad_norm": 19.859015690030642, + "learning_rate": 1.918959948556709e-07, + "loss": 1.9361, + "step": 13779 + }, + { + "epoch": 2.7615230460921842, + "grad_norm": 19.98005157343125, + "learning_rate": 1.91576227920679e-07, + "loss": 1.1998, + "step": 13780 + }, + { + "epoch": 2.7617234468937877, + "grad_norm": 17.129885387736653, + "learning_rate": 
1.9125672242678528e-07, + "loss": 1.4785, + "step": 13781 + }, + { + "epoch": 2.7619238476953907, + "grad_norm": 40.00105479496293, + "learning_rate": 1.9093747839136257e-07, + "loss": 2.1633, + "step": 13782 + }, + { + "epoch": 2.762124248496994, + "grad_norm": 26.610333294119595, + "learning_rate": 1.9061849583176639e-07, + "loss": 1.2752, + "step": 13783 + }, + { + "epoch": 2.762324649298597, + "grad_norm": 18.212763050511388, + "learning_rate": 1.9029977476534345e-07, + "loss": 1.3777, + "step": 13784 + }, + { + "epoch": 2.7625250501002006, + "grad_norm": 25.465186173687485, + "learning_rate": 1.8998131520942153e-07, + "loss": 1.6619, + "step": 13785 + }, + { + "epoch": 2.7627254509018035, + "grad_norm": 21.320539624107255, + "learning_rate": 1.8966311718131457e-07, + "loss": 1.9148, + "step": 13786 + }, + { + "epoch": 2.7629258517034065, + "grad_norm": 26.22336647618575, + "learning_rate": 1.893451806983254e-07, + "loss": 1.4707, + "step": 13787 + }, + { + "epoch": 2.76312625250501, + "grad_norm": 21.132210269112008, + "learning_rate": 1.890275057777402e-07, + "loss": 1.5521, + "step": 13788 + }, + { + "epoch": 2.7633266533066134, + "grad_norm": 18.946374756191442, + "learning_rate": 1.8871009243683068e-07, + "loss": 1.5104, + "step": 13789 + }, + { + "epoch": 2.7635270541082164, + "grad_norm": 21.49086599612413, + "learning_rate": 1.8839294069285586e-07, + "loss": 1.3961, + "step": 13790 + }, + { + "epoch": 2.7637274549098194, + "grad_norm": 19.737792548055282, + "learning_rate": 1.880760505630591e-07, + "loss": 1.4751, + "step": 13791 + }, + { + "epoch": 2.763927855711423, + "grad_norm": 25.336335231851994, + "learning_rate": 1.8775942206467113e-07, + "loss": 1.4229, + "step": 13792 + }, + { + "epoch": 2.7641282565130263, + "grad_norm": 19.58555805546421, + "learning_rate": 1.8744305521490592e-07, + "loss": 1.2459, + "step": 13793 + }, + { + "epoch": 2.7643286573146293, + "grad_norm": 17.284955587284355, + "learning_rate": 1.8712695003096692e-07, + "loss": 1.4284, + "step": 13794 + }, + { + "epoch": 2.7645290581162323, + "grad_norm": 17.371997836936476, + "learning_rate": 1.8681110653003932e-07, + "loss": 1.8098, + "step": 13795 + }, + { + "epoch": 2.7647294589178357, + "grad_norm": 19.719839027944236, + "learning_rate": 1.8649552472929656e-07, + "loss": 1.9511, + "step": 13796 + }, + { + "epoch": 2.7649298597194387, + "grad_norm": 52.889482692655484, + "learning_rate": 1.8618020464589827e-07, + "loss": 1.6185, + "step": 13797 + }, + { + "epoch": 2.765130260521042, + "grad_norm": 34.867602449560906, + "learning_rate": 1.8586514629698738e-07, + "loss": 1.2582, + "step": 13798 + }, + { + "epoch": 2.765330661322645, + "grad_norm": 18.671498116470122, + "learning_rate": 1.855503496996952e-07, + "loss": 1.4367, + "step": 13799 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 22.875763391043897, + "learning_rate": 1.8523581487113696e-07, + "loss": 1.2304, + "step": 13800 + }, + { + "epoch": 2.7657314629258516, + "grad_norm": 25.258031951308354, + "learning_rate": 1.849215418284156e-07, + "loss": 1.1795, + "step": 13801 + }, + { + "epoch": 2.765931863727455, + "grad_norm": 25.596638487255895, + "learning_rate": 1.846075305886169e-07, + "loss": 1.6973, + "step": 13802 + }, + { + "epoch": 2.766132264529058, + "grad_norm": 23.693340178591242, + "learning_rate": 1.8429378116881557e-07, + "loss": 1.434, + "step": 13803 + }, + { + "epoch": 2.7663326653306615, + "grad_norm": 20.68481302106865, + "learning_rate": 1.8398029358606905e-07, + "loss": 1.6092, + "step": 13804 + }, + { + "epoch": 
2.7665330661322645, + "grad_norm": 19.039653599460575, + "learning_rate": 1.8366706785742372e-07, + "loss": 1.0784, + "step": 13805 + }, + { + "epoch": 2.766733466933868, + "grad_norm": 28.03191422411529, + "learning_rate": 1.8335410399990928e-07, + "loss": 1.5726, + "step": 13806 + }, + { + "epoch": 2.766933867735471, + "grad_norm": 18.628234706425335, + "learning_rate": 1.830414020305421e-07, + "loss": 0.8489, + "step": 13807 + }, + { + "epoch": 2.767134268537074, + "grad_norm": 18.07209378664251, + "learning_rate": 1.8272896196632527e-07, + "loss": 1.6185, + "step": 13808 + }, + { + "epoch": 2.7673346693386773, + "grad_norm": 22.429312755112917, + "learning_rate": 1.8241678382424466e-07, + "loss": 2.1153, + "step": 13809 + }, + { + "epoch": 2.7675350701402808, + "grad_norm": 24.189572891455654, + "learning_rate": 1.82104867621275e-07, + "loss": 1.5836, + "step": 13810 + }, + { + "epoch": 2.7677354709418838, + "grad_norm": 19.708883366923573, + "learning_rate": 1.8179321337437607e-07, + "loss": 1.2654, + "step": 13811 + }, + { + "epoch": 2.7679358717434868, + "grad_norm": 20.61418669296042, + "learning_rate": 1.8148182110049206e-07, + "loss": 1.9042, + "step": 13812 + }, + { + "epoch": 2.76813627254509, + "grad_norm": 21.849361198473694, + "learning_rate": 1.8117069081655447e-07, + "loss": 1.5601, + "step": 13813 + }, + { + "epoch": 2.7683366733466936, + "grad_norm": 19.7992763150214, + "learning_rate": 1.808598225394792e-07, + "loss": 1.6102, + "step": 13814 + }, + { + "epoch": 2.7685370741482966, + "grad_norm": 22.195896348697147, + "learning_rate": 1.805492162861694e-07, + "loss": 1.6577, + "step": 13815 + }, + { + "epoch": 2.7687374749498996, + "grad_norm": 17.935399308831574, + "learning_rate": 1.802388720735132e-07, + "loss": 1.692, + "step": 13816 + }, + { + "epoch": 2.768937875751503, + "grad_norm": 19.504817715182618, + "learning_rate": 1.799287899183838e-07, + "loss": 1.522, + "step": 13817 + }, + { + "epoch": 2.769138276553106, + "grad_norm": 19.03463928256982, + "learning_rate": 1.796189698376416e-07, + "loss": 1.5607, + "step": 13818 + }, + { + "epoch": 2.7693386773547095, + "grad_norm": 30.073335307266447, + "learning_rate": 1.7930941184813088e-07, + "loss": 1.3859, + "step": 13819 + }, + { + "epoch": 2.7695390781563125, + "grad_norm": 21.508624792606785, + "learning_rate": 1.7900011596668376e-07, + "loss": 1.6939, + "step": 13820 + }, + { + "epoch": 2.769739478957916, + "grad_norm": 20.425507457049623, + "learning_rate": 1.7869108221011677e-07, + "loss": 1.4246, + "step": 13821 + }, + { + "epoch": 2.769939879759519, + "grad_norm": 18.454788073306663, + "learning_rate": 1.7838231059523258e-07, + "loss": 1.5271, + "step": 13822 + }, + { + "epoch": 2.7701402805611224, + "grad_norm": 37.73552843501761, + "learning_rate": 1.7807380113881944e-07, + "loss": 1.3884, + "step": 13823 + }, + { + "epoch": 2.7703406813627254, + "grad_norm": 20.43171561767, + "learning_rate": 1.7776555385765114e-07, + "loss": 1.5026, + "step": 13824 + }, + { + "epoch": 2.770541082164329, + "grad_norm": 29.15516959340039, + "learning_rate": 1.774575687684893e-07, + "loss": 1.3918, + "step": 13825 + }, + { + "epoch": 2.770741482965932, + "grad_norm": 21.86874947690484, + "learning_rate": 1.771498458880766e-07, + "loss": 0.9358, + "step": 13826 + }, + { + "epoch": 2.7709418837675353, + "grad_norm": 26.878513553830963, + "learning_rate": 1.768423852331469e-07, + "loss": 1.7921, + "step": 13827 + }, + { + "epoch": 2.7711422845691382, + "grad_norm": 25.598338113248072, + "learning_rate": 
1.7653518682041626e-07, + "loss": 1.7048, + "step": 13828 + }, + { + "epoch": 2.7713426853707412, + "grad_norm": 22.707536577174704, + "learning_rate": 1.7622825066658689e-07, + "loss": 1.3718, + "step": 13829 + }, + { + "epoch": 2.7715430861723447, + "grad_norm": 22.300650300104422, + "learning_rate": 1.7592157678834876e-07, + "loss": 1.6771, + "step": 13830 + }, + { + "epoch": 2.771743486973948, + "grad_norm": 19.446549979698837, + "learning_rate": 1.756151652023752e-07, + "loss": 1.5291, + "step": 13831 + }, + { + "epoch": 2.771943887775551, + "grad_norm": 26.48688047106369, + "learning_rate": 1.7530901592532735e-07, + "loss": 1.895, + "step": 13832 + }, + { + "epoch": 2.772144288577154, + "grad_norm": 24.09233938498994, + "learning_rate": 1.7500312897384853e-07, + "loss": 1.9127, + "step": 13833 + }, + { + "epoch": 2.7723446893787576, + "grad_norm": 29.31098303792895, + "learning_rate": 1.7469750436457377e-07, + "loss": 1.6992, + "step": 13834 + }, + { + "epoch": 2.772545090180361, + "grad_norm": 21.957482179102705, + "learning_rate": 1.743921421141176e-07, + "loss": 1.4968, + "step": 13835 + }, + { + "epoch": 2.772745490981964, + "grad_norm": 16.24313468337316, + "learning_rate": 1.740870422390839e-07, + "loss": 1.3951, + "step": 13836 + }, + { + "epoch": 2.772945891783567, + "grad_norm": 31.45743927560077, + "learning_rate": 1.7378220475606112e-07, + "loss": 1.4471, + "step": 13837 + }, + { + "epoch": 2.7731462925851704, + "grad_norm": 24.57726572949429, + "learning_rate": 1.734776296816243e-07, + "loss": 1.4043, + "step": 13838 + }, + { + "epoch": 2.7733466933867734, + "grad_norm": 27.401700488289592, + "learning_rate": 1.7317331703233408e-07, + "loss": 1.5646, + "step": 13839 + }, + { + "epoch": 2.773547094188377, + "grad_norm": 16.899364132995803, + "learning_rate": 1.7286926682473448e-07, + "loss": 1.3028, + "step": 13840 + }, + { + "epoch": 2.77374749498998, + "grad_norm": 36.079919453013204, + "learning_rate": 1.7256547907535838e-07, + "loss": 1.3419, + "step": 13841 + }, + { + "epoch": 2.7739478957915833, + "grad_norm": 20.940472417556553, + "learning_rate": 1.722619538007242e-07, + "loss": 1.4912, + "step": 13842 + }, + { + "epoch": 2.7741482965931863, + "grad_norm": 30.4236738876497, + "learning_rate": 1.719586910173332e-07, + "loss": 1.4419, + "step": 13843 + }, + { + "epoch": 2.7743486973947897, + "grad_norm": 19.38293435897039, + "learning_rate": 1.7165569074167555e-07, + "loss": 1.1599, + "step": 13844 + }, + { + "epoch": 2.7745490981963927, + "grad_norm": 17.908924544054837, + "learning_rate": 1.7135295299022468e-07, + "loss": 1.543, + "step": 13845 + }, + { + "epoch": 2.7747494989979957, + "grad_norm": 22.40654786601017, + "learning_rate": 1.7105047777944194e-07, + "loss": 1.5224, + "step": 13846 + }, + { + "epoch": 2.774949899799599, + "grad_norm": 27.05355078363455, + "learning_rate": 1.7074826512577248e-07, + "loss": 1.7637, + "step": 13847 + }, + { + "epoch": 2.7751503006012026, + "grad_norm": 21.590311909786152, + "learning_rate": 1.704463150456487e-07, + "loss": 1.5, + "step": 13848 + }, + { + "epoch": 2.7753507014028056, + "grad_norm": 43.056698120037424, + "learning_rate": 1.7014462755548856e-07, + "loss": 1.7956, + "step": 13849 + }, + { + "epoch": 2.7755511022044086, + "grad_norm": 21.58108562988513, + "learning_rate": 1.6984320267169285e-07, + "loss": 1.452, + "step": 13850 + }, + { + "epoch": 2.775751503006012, + "grad_norm": 26.480622730243223, + "learning_rate": 1.6954204041065402e-07, + "loss": 1.2382, + "step": 13851 + }, + { + "epoch": 
2.7759519038076155, + "grad_norm": 25.191663305114506, + "learning_rate": 1.6924114078874398e-07, + "loss": 1.0098, + "step": 13852 + }, + { + "epoch": 2.7761523046092185, + "grad_norm": 23.566957376372457, + "learning_rate": 1.6894050382232351e-07, + "loss": 1.6409, + "step": 13853 + }, + { + "epoch": 2.7763527054108215, + "grad_norm": 17.456222763036244, + "learning_rate": 1.6864012952774012e-07, + "loss": 1.3992, + "step": 13854 + }, + { + "epoch": 2.776553106212425, + "grad_norm": 20.64345028455729, + "learning_rate": 1.6834001792132403e-07, + "loss": 1.2128, + "step": 13855 + }, + { + "epoch": 2.776753507014028, + "grad_norm": 27.571743127043884, + "learning_rate": 1.680401690193939e-07, + "loss": 1.8638, + "step": 13856 + }, + { + "epoch": 2.7769539078156313, + "grad_norm": 21.140294446846593, + "learning_rate": 1.6774058283825113e-07, + "loss": 1.6601, + "step": 13857 + }, + { + "epoch": 2.7771543086172343, + "grad_norm": 24.480158201459837, + "learning_rate": 1.6744125939418765e-07, + "loss": 1.6539, + "step": 13858 + }, + { + "epoch": 2.7773547094188378, + "grad_norm": 22.118830685173112, + "learning_rate": 1.6714219870347548e-07, + "loss": 1.4258, + "step": 13859 + }, + { + "epoch": 2.7775551102204408, + "grad_norm": 21.853138821152417, + "learning_rate": 1.6684340078237548e-07, + "loss": 1.701, + "step": 13860 + }, + { + "epoch": 2.777755511022044, + "grad_norm": 17.753762466146785, + "learning_rate": 1.6654486564713468e-07, + "loss": 1.5254, + "step": 13861 + }, + { + "epoch": 2.777955911823647, + "grad_norm": 19.305602452927754, + "learning_rate": 1.6624659331398452e-07, + "loss": 1.8933, + "step": 13862 + }, + { + "epoch": 2.7781563126252506, + "grad_norm": 19.079512986810407, + "learning_rate": 1.6594858379914257e-07, + "loss": 1.4988, + "step": 13863 + }, + { + "epoch": 2.7783567134268536, + "grad_norm": 17.58230945602462, + "learning_rate": 1.6565083711881093e-07, + "loss": 1.1486, + "step": 13864 + }, + { + "epoch": 2.778557114228457, + "grad_norm": 24.841064390224883, + "learning_rate": 1.6535335328918046e-07, + "loss": 2.1197, + "step": 13865 + }, + { + "epoch": 2.77875751503006, + "grad_norm": 21.71012029108809, + "learning_rate": 1.6505613232642436e-07, + "loss": 1.4056, + "step": 13866 + }, + { + "epoch": 2.778957915831663, + "grad_norm": 27.97022123536826, + "learning_rate": 1.6475917424670252e-07, + "loss": 1.4171, + "step": 13867 + }, + { + "epoch": 2.7791583166332665, + "grad_norm": 22.91166395451628, + "learning_rate": 1.644624790661631e-07, + "loss": 2.0029, + "step": 13868 + }, + { + "epoch": 2.77935871743487, + "grad_norm": 42.62244096489542, + "learning_rate": 1.6416604680093596e-07, + "loss": 1.7141, + "step": 13869 + }, + { + "epoch": 2.779559118236473, + "grad_norm": 28.057395786754395, + "learning_rate": 1.6386987746713933e-07, + "loss": 1.9074, + "step": 13870 + }, + { + "epoch": 2.779759519038076, + "grad_norm": 24.094735716609588, + "learning_rate": 1.6357397108087646e-07, + "loss": 1.8598, + "step": 13871 + }, + { + "epoch": 2.7799599198396794, + "grad_norm": 41.38321845935786, + "learning_rate": 1.6327832765823559e-07, + "loss": 1.456, + "step": 13872 + }, + { + "epoch": 2.780160320641283, + "grad_norm": 22.162237765047006, + "learning_rate": 1.6298294721529218e-07, + "loss": 1.6735, + "step": 13873 + }, + { + "epoch": 2.780360721442886, + "grad_norm": 16.008900527534692, + "learning_rate": 1.6268782976810504e-07, + "loss": 1.6849, + "step": 13874 + }, + { + "epoch": 2.780561122244489, + "grad_norm": 25.996065990266864, + "learning_rate": 
1.6239297533272246e-07, + "loss": 1.6259, + "step": 13875 + }, + { + "epoch": 2.7807615230460923, + "grad_norm": 22.762393317446886, + "learning_rate": 1.620983839251744e-07, + "loss": 1.4974, + "step": 13876 + }, + { + "epoch": 2.7809619238476952, + "grad_norm": 20.98353144510378, + "learning_rate": 1.6180405556147805e-07, + "loss": 1.1489, + "step": 13877 + }, + { + "epoch": 2.7811623246492987, + "grad_norm": 21.338893512107678, + "learning_rate": 1.6150999025763726e-07, + "loss": 1.4107, + "step": 13878 + }, + { + "epoch": 2.7813627254509017, + "grad_norm": 27.968611764176345, + "learning_rate": 1.6121618802964089e-07, + "loss": 1.8403, + "step": 13879 + }, + { + "epoch": 2.781563126252505, + "grad_norm": 19.78958877175752, + "learning_rate": 1.6092264889346288e-07, + "loss": 1.0452, + "step": 13880 + }, + { + "epoch": 2.781763527054108, + "grad_norm": 17.919358503180153, + "learning_rate": 1.6062937286506318e-07, + "loss": 1.8521, + "step": 13881 + }, + { + "epoch": 2.7819639278557116, + "grad_norm": 24.56642780560383, + "learning_rate": 1.603363599603891e-07, + "loss": 1.7789, + "step": 13882 + }, + { + "epoch": 2.7821643286573146, + "grad_norm": 47.84600314444151, + "learning_rate": 1.6004361019537007e-07, + "loss": 1.826, + "step": 13883 + }, + { + "epoch": 2.782364729458918, + "grad_norm": 18.060006863579943, + "learning_rate": 1.597511235859245e-07, + "loss": 1.6582, + "step": 13884 + }, + { + "epoch": 2.782565130260521, + "grad_norm": 25.24211191770386, + "learning_rate": 1.5945890014795572e-07, + "loss": 1.6182, + "step": 13885 + }, + { + "epoch": 2.7827655310621244, + "grad_norm": 20.493802289712345, + "learning_rate": 1.5916693989735166e-07, + "loss": 1.839, + "step": 13886 + }, + { + "epoch": 2.7829659318637274, + "grad_norm": 25.908315216520077, + "learning_rate": 1.5887524284998624e-07, + "loss": 1.728, + "step": 13887 + }, + { + "epoch": 2.7831663326653304, + "grad_norm": 16.045071980110666, + "learning_rate": 1.5858380902172066e-07, + "loss": 1.2478, + "step": 13888 + }, + { + "epoch": 2.783366733466934, + "grad_norm": 17.013038443052782, + "learning_rate": 1.5829263842840003e-07, + "loss": 1.4192, + "step": 13889 + }, + { + "epoch": 2.7835671342685373, + "grad_norm": 23.392977282899825, + "learning_rate": 1.580017310858556e-07, + "loss": 1.7267, + "step": 13890 + }, + { + "epoch": 2.7837675350701403, + "grad_norm": 19.65269020793089, + "learning_rate": 1.5771108700990412e-07, + "loss": 1.3174, + "step": 13891 + }, + { + "epoch": 2.7839679358717433, + "grad_norm": 26.12747750978764, + "learning_rate": 1.5742070621634909e-07, + "loss": 1.5198, + "step": 13892 + }, + { + "epoch": 2.7841683366733467, + "grad_norm": 18.35059922469764, + "learning_rate": 1.5713058872097842e-07, + "loss": 1.4252, + "step": 13893 + }, + { + "epoch": 2.78436873747495, + "grad_norm": 21.430424346122923, + "learning_rate": 1.5684073453956616e-07, + "loss": 1.5008, + "step": 13894 + }, + { + "epoch": 2.784569138276553, + "grad_norm": 22.032934849478718, + "learning_rate": 1.565511436878725e-07, + "loss": 1.2684, + "step": 13895 + }, + { + "epoch": 2.784769539078156, + "grad_norm": 20.940840359816843, + "learning_rate": 1.5626181618164317e-07, + "loss": 1.705, + "step": 13896 + }, + { + "epoch": 2.7849699398797596, + "grad_norm": 22.49441600370339, + "learning_rate": 1.5597275203660833e-07, + "loss": 1.6948, + "step": 13897 + }, + { + "epoch": 2.7851703406813626, + "grad_norm": 16.177922775071064, + "learning_rate": 1.556839512684849e-07, + "loss": 1.1468, + "step": 13898 + }, + { + "epoch": 
2.785370741482966, + "grad_norm": 19.528081808681303, + "learning_rate": 1.5539541389297695e-07, + "loss": 1.5349, + "step": 13899 + }, + { + "epoch": 2.785571142284569, + "grad_norm": 20.754986436619273, + "learning_rate": 1.5510713992577142e-07, + "loss": 1.6075, + "step": 13900 + }, + { + "epoch": 2.7857715430861725, + "grad_norm": 22.297269229894248, + "learning_rate": 1.5481912938254183e-07, + "loss": 1.4331, + "step": 13901 + }, + { + "epoch": 2.7859719438877755, + "grad_norm": 14.855319723923488, + "learning_rate": 1.5453138227894905e-07, + "loss": 1.5127, + "step": 13902 + }, + { + "epoch": 2.786172344689379, + "grad_norm": 30.034180909169145, + "learning_rate": 1.5424389863063717e-07, + "loss": 1.7006, + "step": 13903 + }, + { + "epoch": 2.786372745490982, + "grad_norm": 27.78953981583524, + "learning_rate": 1.539566784532376e-07, + "loss": 1.4842, + "step": 13904 + }, + { + "epoch": 2.786573146292585, + "grad_norm": 23.99347290896657, + "learning_rate": 1.5366972176236673e-07, + "loss": 1.5844, + "step": 13905 + }, + { + "epoch": 2.7867735470941883, + "grad_norm": 19.49620577434106, + "learning_rate": 1.5338302857362764e-07, + "loss": 1.6326, + "step": 13906 + }, + { + "epoch": 2.786973947895792, + "grad_norm": 19.166796890703356, + "learning_rate": 1.5309659890260676e-07, + "loss": 1.5039, + "step": 13907 + }, + { + "epoch": 2.7871743486973948, + "grad_norm": 17.80234509208146, + "learning_rate": 1.5281043276487883e-07, + "loss": 1.1348, + "step": 13908 + }, + { + "epoch": 2.7873747494989978, + "grad_norm": 22.09920779118342, + "learning_rate": 1.5252453017600255e-07, + "loss": 1.8097, + "step": 13909 + }, + { + "epoch": 2.787575150300601, + "grad_norm": 25.470865840740842, + "learning_rate": 1.5223889115152267e-07, + "loss": 1.9637, + "step": 13910 + }, + { + "epoch": 2.7877755511022047, + "grad_norm": 22.02630917554142, + "learning_rate": 1.519535157069707e-07, + "loss": 1.4883, + "step": 13911 + }, + { + "epoch": 2.7879759519038076, + "grad_norm": 18.091175075331236, + "learning_rate": 1.516684038578625e-07, + "loss": 1.5601, + "step": 13912 + }, + { + "epoch": 2.7881763527054106, + "grad_norm": 18.050779161265602, + "learning_rate": 1.5138355561970074e-07, + "loss": 1.3939, + "step": 13913 + }, + { + "epoch": 2.788376753507014, + "grad_norm": 22.98383824363757, + "learning_rate": 1.5109897100797076e-07, + "loss": 1.6655, + "step": 13914 + }, + { + "epoch": 2.788577154308617, + "grad_norm": 20.45749164505525, + "learning_rate": 1.5081465003814856e-07, + "loss": 1.6117, + "step": 13915 + }, + { + "epoch": 2.7887775551102205, + "grad_norm": 23.618969520275925, + "learning_rate": 1.5053059272569115e-07, + "loss": 1.7684, + "step": 13916 + }, + { + "epoch": 2.7889779559118235, + "grad_norm": 20.641811650550036, + "learning_rate": 1.50246799086044e-07, + "loss": 1.3658, + "step": 13917 + }, + { + "epoch": 2.789178356713427, + "grad_norm": 24.059638943973535, + "learning_rate": 1.4996326913463756e-07, + "loss": 1.6216, + "step": 13918 + }, + { + "epoch": 2.78937875751503, + "grad_norm": 52.34889679014776, + "learning_rate": 1.4968000288688723e-07, + "loss": 1.7044, + "step": 13919 + }, + { + "epoch": 2.7895791583166334, + "grad_norm": 65.10975693513171, + "learning_rate": 1.493970003581957e-07, + "loss": 1.7123, + "step": 13920 + }, + { + "epoch": 2.7897795591182364, + "grad_norm": 29.53214020009314, + "learning_rate": 1.4911426156394793e-07, + "loss": 1.9073, + "step": 13921 + }, + { + "epoch": 2.78997995991984, + "grad_norm": 85.24564053213211, + "learning_rate": 
1.4883178651951936e-07, + "loss": 2.2934, + "step": 13922 + }, + { + "epoch": 2.790180360721443, + "grad_norm": 61.96313057242721, + "learning_rate": 1.485495752402677e-07, + "loss": 1.3826, + "step": 13923 + }, + { + "epoch": 2.7903807615230463, + "grad_norm": 15.965448516740867, + "learning_rate": 1.4826762774153625e-07, + "loss": 1.6638, + "step": 13924 + }, + { + "epoch": 2.7905811623246493, + "grad_norm": 23.81726624777557, + "learning_rate": 1.4798594403865606e-07, + "loss": 1.344, + "step": 13925 + }, + { + "epoch": 2.7907815631262523, + "grad_norm": 43.31338131161654, + "learning_rate": 1.4770452414694214e-07, + "loss": 1.3403, + "step": 13926 + }, + { + "epoch": 2.7909819639278557, + "grad_norm": 15.71762128344315, + "learning_rate": 1.474233680816961e-07, + "loss": 1.4296, + "step": 13927 + }, + { + "epoch": 2.791182364729459, + "grad_norm": 27.577971179814572, + "learning_rate": 1.4714247585820463e-07, + "loss": 1.5703, + "step": 13928 + }, + { + "epoch": 2.791382765531062, + "grad_norm": 21.108815273357425, + "learning_rate": 1.4686184749173993e-07, + "loss": 1.9121, + "step": 13929 + }, + { + "epoch": 2.791583166332665, + "grad_norm": 20.31873722313202, + "learning_rate": 1.4658148299756148e-07, + "loss": 1.6278, + "step": 13930 + }, + { + "epoch": 2.7917835671342686, + "grad_norm": 37.46864570708287, + "learning_rate": 1.463013823909104e-07, + "loss": 1.7364, + "step": 13931 + }, + { + "epoch": 2.791983967935872, + "grad_norm": 22.271686714183748, + "learning_rate": 1.4602154568701953e-07, + "loss": 1.346, + "step": 13932 + }, + { + "epoch": 2.792184368737475, + "grad_norm": 26.84918463920629, + "learning_rate": 1.457419729011017e-07, + "loss": 1.9182, + "step": 13933 + }, + { + "epoch": 2.792384769539078, + "grad_norm": 19.724475311957487, + "learning_rate": 1.4546266404835807e-07, + "loss": 1.6318, + "step": 13934 + }, + { + "epoch": 2.7925851703406814, + "grad_norm": 20.524857507252197, + "learning_rate": 1.4518361914397538e-07, + "loss": 1.5844, + "step": 13935 + }, + { + "epoch": 2.7927855711422844, + "grad_norm": 37.420432817169946, + "learning_rate": 1.4490483820312595e-07, + "loss": 1.6616, + "step": 13936 + }, + { + "epoch": 2.792985971943888, + "grad_norm": 23.03579797902105, + "learning_rate": 1.4462632124096765e-07, + "loss": 1.3832, + "step": 13937 + }, + { + "epoch": 2.793186372745491, + "grad_norm": 19.164619338899087, + "learning_rate": 1.4434806827264225e-07, + "loss": 1.4297, + "step": 13938 + }, + { + "epoch": 2.7933867735470943, + "grad_norm": 27.695368383623112, + "learning_rate": 1.4407007931328097e-07, + "loss": 1.6361, + "step": 13939 + }, + { + "epoch": 2.7935871743486973, + "grad_norm": 37.57345241235609, + "learning_rate": 1.4379235437799733e-07, + "loss": 1.3194, + "step": 13940 + }, + { + "epoch": 2.7937875751503007, + "grad_norm": 23.632655218957584, + "learning_rate": 1.435148934818914e-07, + "loss": 1.392, + "step": 13941 + }, + { + "epoch": 2.7939879759519037, + "grad_norm": 20.569319536456106, + "learning_rate": 1.4323769664005005e-07, + "loss": 1.0751, + "step": 13942 + }, + { + "epoch": 2.794188376753507, + "grad_norm": 21.27938227555962, + "learning_rate": 1.4296076386754398e-07, + "loss": 1.4691, + "step": 13943 + }, + { + "epoch": 2.79438877755511, + "grad_norm": 47.463377105397996, + "learning_rate": 1.4268409517943115e-07, + "loss": 1.7292, + "step": 13944 + }, + { + "epoch": 2.7945891783567136, + "grad_norm": 26.737078531264466, + "learning_rate": 1.4240769059075344e-07, + "loss": 1.3746, + "step": 13945 + }, + { + "epoch": 
2.7947895791583166, + "grad_norm": 22.43297534058241, + "learning_rate": 1.4213155011654101e-07, + "loss": 0.9887, + "step": 13946 + }, + { + "epoch": 2.7949899799599196, + "grad_norm": 17.637050468754012, + "learning_rate": 1.4185567377180687e-07, + "loss": 1.1163, + "step": 13947 + }, + { + "epoch": 2.795190380761523, + "grad_norm": 24.67903240600438, + "learning_rate": 1.4158006157155013e-07, + "loss": 1.2241, + "step": 13948 + }, + { + "epoch": 2.7953907815631265, + "grad_norm": 20.137906982742976, + "learning_rate": 1.4130471353075825e-07, + "loss": 1.4557, + "step": 13949 + }, + { + "epoch": 2.7955911823647295, + "grad_norm": 23.376150735093812, + "learning_rate": 1.4102962966440038e-07, + "loss": 1.4306, + "step": 13950 + }, + { + "epoch": 2.7957915831663325, + "grad_norm": 46.35540517258537, + "learning_rate": 1.4075480998743508e-07, + "loss": 1.9026, + "step": 13951 + }, + { + "epoch": 2.795991983967936, + "grad_norm": 39.68118865923247, + "learning_rate": 1.4048025451480208e-07, + "loss": 0.9301, + "step": 13952 + }, + { + "epoch": 2.7961923847695394, + "grad_norm": 30.082705721260854, + "learning_rate": 1.4020596326143167e-07, + "loss": 2.1555, + "step": 13953 + }, + { + "epoch": 2.7963927855711423, + "grad_norm": 25.103434491390193, + "learning_rate": 1.3993193624223744e-07, + "loss": 1.8771, + "step": 13954 + }, + { + "epoch": 2.7965931863727453, + "grad_norm": 21.050997441043464, + "learning_rate": 1.3965817347211696e-07, + "loss": 1.7256, + "step": 13955 + }, + { + "epoch": 2.796793587174349, + "grad_norm": 21.639588230654063, + "learning_rate": 1.393846749659572e-07, + "loss": 1.4765, + "step": 13956 + }, + { + "epoch": 2.796993987975952, + "grad_norm": 37.38014806491431, + "learning_rate": 1.391114407386268e-07, + "loss": 2.1353, + "step": 13957 + }, + { + "epoch": 2.797194388777555, + "grad_norm": 14.965175068408632, + "learning_rate": 1.3883847080498282e-07, + "loss": 1.1438, + "step": 13958 + }, + { + "epoch": 2.797394789579158, + "grad_norm": 17.975797361867212, + "learning_rate": 1.3856576517986664e-07, + "loss": 1.4229, + "step": 13959 + }, + { + "epoch": 2.7975951903807617, + "grad_norm": 17.83880107324791, + "learning_rate": 1.3829332387810644e-07, + "loss": 1.2357, + "step": 13960 + }, + { + "epoch": 2.7977955911823646, + "grad_norm": 22.145581674081814, + "learning_rate": 1.3802114691451486e-07, + "loss": 1.6127, + "step": 13961 + }, + { + "epoch": 2.797995991983968, + "grad_norm": 27.4961011652966, + "learning_rate": 1.3774923430388998e-07, + "loss": 2.1197, + "step": 13962 + }, + { + "epoch": 2.798196392785571, + "grad_norm": 28.41266715448804, + "learning_rate": 1.3747758606101725e-07, + "loss": 1.6085, + "step": 13963 + }, + { + "epoch": 2.798396793587174, + "grad_norm": 22.909533365992417, + "learning_rate": 1.3720620220066484e-07, + "loss": 1.5576, + "step": 13964 + }, + { + "epoch": 2.7985971943887775, + "grad_norm": 21.65960024120966, + "learning_rate": 1.3693508273759037e-07, + "loss": 1.3316, + "step": 13965 + }, + { + "epoch": 2.798797595190381, + "grad_norm": 15.895939555652381, + "learning_rate": 1.366642276865332e-07, + "loss": 1.3475, + "step": 13966 + }, + { + "epoch": 2.798997995991984, + "grad_norm": 21.318754475168543, + "learning_rate": 1.3639363706222154e-07, + "loss": 1.3569, + "step": 13967 + }, + { + "epoch": 2.799198396793587, + "grad_norm": 24.60514966481892, + "learning_rate": 1.3612331087936691e-07, + "loss": 1.5436, + "step": 13968 + }, + { + "epoch": 2.7993987975951904, + "grad_norm": 26.16605857183508, + "learning_rate": 
1.3585324915266707e-07, + "loss": 1.2834, + "step": 13969 + }, + { + "epoch": 2.799599198396794, + "grad_norm": 20.56739443108512, + "learning_rate": 1.3558345189680743e-07, + "loss": 1.7645, + "step": 13970 + }, + { + "epoch": 2.799799599198397, + "grad_norm": 37.94751141504138, + "learning_rate": 1.353139191264552e-07, + "loss": 1.4111, + "step": 13971 + }, + { + "epoch": 2.8, + "grad_norm": 36.61015190843945, + "learning_rate": 1.3504465085626638e-07, + "loss": 1.8038, + "step": 13972 + }, + { + "epoch": 2.8002004008016033, + "grad_norm": 18.192314056504188, + "learning_rate": 1.3477564710088097e-07, + "loss": 1.2004, + "step": 13973 + }, + { + "epoch": 2.8004008016032063, + "grad_norm": 24.68855501120385, + "learning_rate": 1.345069078749256e-07, + "loss": 2.1163, + "step": 13974 + }, + { + "epoch": 2.8006012024048097, + "grad_norm": 16.75009770907192, + "learning_rate": 1.3423843319301133e-07, + "loss": 1.7031, + "step": 13975 + }, + { + "epoch": 2.8008016032064127, + "grad_norm": 22.08557287880619, + "learning_rate": 1.339702230697365e-07, + "loss": 1.7957, + "step": 13976 + }, + { + "epoch": 2.801002004008016, + "grad_norm": 20.174454951259076, + "learning_rate": 1.337022775196839e-07, + "loss": 1.6679, + "step": 13977 + }, + { + "epoch": 2.801202404809619, + "grad_norm": 28.84543160958399, + "learning_rate": 1.334345965574213e-07, + "loss": 1.7755, + "step": 13978 + }, + { + "epoch": 2.8014028056112226, + "grad_norm": 19.308164227130185, + "learning_rate": 1.3316718019750263e-07, + "loss": 1.3129, + "step": 13979 + }, + { + "epoch": 2.8016032064128256, + "grad_norm": 24.56451431585584, + "learning_rate": 1.329000284544696e-07, + "loss": 1.9187, + "step": 13980 + }, + { + "epoch": 2.801803607214429, + "grad_norm": 27.642936789963045, + "learning_rate": 1.3263314134284612e-07, + "loss": 1.1697, + "step": 13981 + }, + { + "epoch": 2.802004008016032, + "grad_norm": 21.008185325356003, + "learning_rate": 1.3236651887714392e-07, + "loss": 1.5631, + "step": 13982 + }, + { + "epoch": 2.8022044088176354, + "grad_norm": 28.037472696872406, + "learning_rate": 1.3210016107185862e-07, + "loss": 1.4963, + "step": 13983 + }, + { + "epoch": 2.8024048096192384, + "grad_norm": 26.181948834536577, + "learning_rate": 1.318340679414737e-07, + "loss": 1.2771, + "step": 13984 + }, + { + "epoch": 2.8026052104208414, + "grad_norm": 28.90903964580049, + "learning_rate": 1.3156823950045695e-07, + "loss": 2.1239, + "step": 13985 + }, + { + "epoch": 2.802805611222445, + "grad_norm": 35.44349217880469, + "learning_rate": 1.3130267576326073e-07, + "loss": 1.5386, + "step": 13986 + }, + { + "epoch": 2.8030060120240483, + "grad_norm": 24.58960601132706, + "learning_rate": 1.310373767443257e-07, + "loss": 1.6596, + "step": 13987 + }, + { + "epoch": 2.8032064128256513, + "grad_norm": 21.914866620060213, + "learning_rate": 1.3077234245807536e-07, + "loss": 1.4357, + "step": 13988 + }, + { + "epoch": 2.8034068136272543, + "grad_norm": 31.990164779331415, + "learning_rate": 1.3050757291892034e-07, + "loss": 2.176, + "step": 13989 + }, + { + "epoch": 2.8036072144288577, + "grad_norm": 26.771205939957714, + "learning_rate": 1.3024306814125698e-07, + "loss": 1.7295, + "step": 13990 + }, + { + "epoch": 2.803807615230461, + "grad_norm": 20.11879503169594, + "learning_rate": 1.2997882813946595e-07, + "loss": 1.2653, + "step": 13991 + }, + { + "epoch": 2.804008016032064, + "grad_norm": 23.497921797161414, + "learning_rate": 1.2971485292791463e-07, + "loss": 1.5155, + "step": 13992 + }, + { + "epoch": 2.804208416833667, + 
"grad_norm": 17.517482281959786, + "learning_rate": 1.2945114252095657e-07, + "loss": 0.98, + "step": 13993 + }, + { + "epoch": 2.8044088176352706, + "grad_norm": 23.058813907423485, + "learning_rate": 1.2918769693292975e-07, + "loss": 1.8004, + "step": 13994 + }, + { + "epoch": 2.8046092184368736, + "grad_norm": 28.124196052006393, + "learning_rate": 1.2892451617815714e-07, + "loss": 1.1383, + "step": 13995 + }, + { + "epoch": 2.804809619238477, + "grad_norm": 19.357573703356675, + "learning_rate": 1.2866160027094897e-07, + "loss": 1.4704, + "step": 13996 + }, + { + "epoch": 2.80501002004008, + "grad_norm": 25.457918324023105, + "learning_rate": 1.2839894922560049e-07, + "loss": 1.2036, + "step": 13997 + }, + { + "epoch": 2.8052104208416835, + "grad_norm": 17.525661921258227, + "learning_rate": 1.2813656305639244e-07, + "loss": 1.3233, + "step": 13998 + }, + { + "epoch": 2.8054108216432865, + "grad_norm": 20.27871288205774, + "learning_rate": 1.2787444177759068e-07, + "loss": 1.4543, + "step": 13999 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 22.592600645570737, + "learning_rate": 1.2761258540344767e-07, + "loss": 1.4091, + "step": 14000 + }, + { + "epoch": 2.805811623246493, + "grad_norm": 14.680329705944306, + "learning_rate": 1.2735099394820094e-07, + "loss": 1.5455, + "step": 14001 + }, + { + "epoch": 2.8060120240480964, + "grad_norm": 22.746802146933465, + "learning_rate": 1.2708966742607244e-07, + "loss": 1.0364, + "step": 14002 + }, + { + "epoch": 2.8062124248496993, + "grad_norm": 17.93586958246573, + "learning_rate": 1.2682860585127298e-07, + "loss": 1.7534, + "step": 14003 + }, + { + "epoch": 2.806412825651303, + "grad_norm": 23.441158879883, + "learning_rate": 1.265678092379946e-07, + "loss": 1.6845, + "step": 14004 + }, + { + "epoch": 2.806613226452906, + "grad_norm": 21.34303819172993, + "learning_rate": 1.2630727760041872e-07, + "loss": 1.5429, + "step": 14005 + }, + { + "epoch": 2.806813627254509, + "grad_norm": 25.56558012256857, + "learning_rate": 1.2604701095271066e-07, + "loss": 1.6335, + "step": 14006 + }, + { + "epoch": 2.807014028056112, + "grad_norm": 22.382696003262232, + "learning_rate": 1.2578700930902077e-07, + "loss": 2.1299, + "step": 14007 + }, + { + "epoch": 2.8072144288577157, + "grad_norm": 30.36433814072942, + "learning_rate": 1.2552727268348718e-07, + "loss": 1.7644, + "step": 14008 + }, + { + "epoch": 2.8074148296593187, + "grad_norm": 15.009903998778858, + "learning_rate": 1.2526780109022973e-07, + "loss": 1.0118, + "step": 14009 + }, + { + "epoch": 2.8076152304609217, + "grad_norm": 18.753304039747476, + "learning_rate": 1.2500859454335823e-07, + "loss": 1.1287, + "step": 14010 + }, + { + "epoch": 2.807815631262525, + "grad_norm": 17.575909022451004, + "learning_rate": 1.247496530569664e-07, + "loss": 1.6065, + "step": 14011 + }, + { + "epoch": 2.8080160320641285, + "grad_norm": 22.86320420588768, + "learning_rate": 1.2449097664513133e-07, + "loss": 1.5735, + "step": 14012 + }, + { + "epoch": 2.8082164328657315, + "grad_norm": 20.570588234239487, + "learning_rate": 1.242325653219195e-07, + "loss": 1.7003, + "step": 14013 + }, + { + "epoch": 2.8084168336673345, + "grad_norm": 16.830782932535726, + "learning_rate": 1.2397441910137974e-07, + "loss": 1.2248, + "step": 14014 + }, + { + "epoch": 2.808617234468938, + "grad_norm": 31.889234240048697, + "learning_rate": 1.2371653799754912e-07, + "loss": 1.3133, + "step": 14015 + }, + { + "epoch": 2.808817635270541, + "grad_norm": 17.954287045313755, + "learning_rate": 1.234589220244481e-07, + "loss": 
1.0062, + "step": 14016 + }, + { + "epoch": 2.8090180360721444, + "grad_norm": 17.73301149968698, + "learning_rate": 1.2320157119608379e-07, + "loss": 1.8622, + "step": 14017 + }, + { + "epoch": 2.8092184368737474, + "grad_norm": 19.92630161771639, + "learning_rate": 1.2294448552644943e-07, + "loss": 1.2565, + "step": 14018 + }, + { + "epoch": 2.809418837675351, + "grad_norm": 48.304289371838884, + "learning_rate": 1.2268766502952166e-07, + "loss": 0.9988, + "step": 14019 + }, + { + "epoch": 2.809619238476954, + "grad_norm": 24.34015907914379, + "learning_rate": 1.2243110971926596e-07, + "loss": 1.454, + "step": 14020 + }, + { + "epoch": 2.8098196392785573, + "grad_norm": 21.775195556603627, + "learning_rate": 1.2217481960963063e-07, + "loss": 1.6046, + "step": 14021 + }, + { + "epoch": 2.8100200400801603, + "grad_norm": 21.69300061383716, + "learning_rate": 1.219187947145506e-07, + "loss": 1.7051, + "step": 14022 + }, + { + "epoch": 2.8102204408817633, + "grad_norm": 21.88665019850455, + "learning_rate": 1.2166303504794587e-07, + "loss": 1.2793, + "step": 14023 + }, + { + "epoch": 2.8104208416833667, + "grad_norm": 28.6421022889431, + "learning_rate": 1.214075406237236e-07, + "loss": 1.6557, + "step": 14024 + }, + { + "epoch": 2.81062124248497, + "grad_norm": 22.669999221983442, + "learning_rate": 1.21152311455775e-07, + "loss": 1.2631, + "step": 14025 + }, + { + "epoch": 2.810821643286573, + "grad_norm": 20.12679284721176, + "learning_rate": 1.2089734755797611e-07, + "loss": 1.1862, + "step": 14026 + }, + { + "epoch": 2.811022044088176, + "grad_norm": 22.475387660632684, + "learning_rate": 1.2064264894419196e-07, + "loss": 1.8086, + "step": 14027 + }, + { + "epoch": 2.8112224448897796, + "grad_norm": 39.07161486675945, + "learning_rate": 1.2038821562826873e-07, + "loss": 1.6381, + "step": 14028 + }, + { + "epoch": 2.811422845691383, + "grad_norm": 19.07880403223875, + "learning_rate": 1.2013404762404146e-07, + "loss": 1.8996, + "step": 14029 + }, + { + "epoch": 2.811623246492986, + "grad_norm": 22.832653325870584, + "learning_rate": 1.1988014494532906e-07, + "loss": 1.5874, + "step": 14030 + }, + { + "epoch": 2.811823647294589, + "grad_norm": 17.702833211956758, + "learning_rate": 1.196265076059372e-07, + "loss": 1.4897, + "step": 14031 + }, + { + "epoch": 2.8120240480961924, + "grad_norm": 18.88150640241375, + "learning_rate": 1.1937313561965646e-07, + "loss": 1.3768, + "step": 14032 + }, + { + "epoch": 2.8122244488977954, + "grad_norm": 22.233886085843938, + "learning_rate": 1.1912002900026198e-07, + "loss": 1.4581, + "step": 14033 + }, + { + "epoch": 2.812424849699399, + "grad_norm": 18.059016270410424, + "learning_rate": 1.1886718776151718e-07, + "loss": 1.1559, + "step": 14034 + }, + { + "epoch": 2.812625250501002, + "grad_norm": 19.516791493367638, + "learning_rate": 1.1861461191716828e-07, + "loss": 1.68, + "step": 14035 + }, + { + "epoch": 2.8128256513026053, + "grad_norm": 20.65576433741635, + "learning_rate": 1.1836230148094763e-07, + "loss": 1.546, + "step": 14036 + }, + { + "epoch": 2.8130260521042083, + "grad_norm": 19.68266046403512, + "learning_rate": 1.1811025646657593e-07, + "loss": 1.2351, + "step": 14037 + }, + { + "epoch": 2.8132264529058117, + "grad_norm": 23.390986559221027, + "learning_rate": 1.1785847688775554e-07, + "loss": 1.8926, + "step": 14038 + }, + { + "epoch": 2.8134268537074147, + "grad_norm": 21.961860532591402, + "learning_rate": 1.1760696275817662e-07, + "loss": 1.0528, + "step": 14039 + }, + { + "epoch": 2.813627254509018, + "grad_norm": 
20.3988074118827, + "learning_rate": 1.173557140915138e-07, + "loss": 1.9372, + "step": 14040 + }, + { + "epoch": 2.813827655310621, + "grad_norm": 22.44308155064769, + "learning_rate": 1.1710473090142837e-07, + "loss": 1.5255, + "step": 14041 + }, + { + "epoch": 2.8140280561122246, + "grad_norm": 30.25526026431239, + "learning_rate": 1.1685401320156665e-07, + "loss": 1.2265, + "step": 14042 + }, + { + "epoch": 2.8142284569138276, + "grad_norm": 17.54491518670995, + "learning_rate": 1.1660356100555992e-07, + "loss": 1.4597, + "step": 14043 + }, + { + "epoch": 2.8144288577154306, + "grad_norm": 23.580127848433712, + "learning_rate": 1.1635337432702676e-07, + "loss": 1.78, + "step": 14044 + }, + { + "epoch": 2.814629258517034, + "grad_norm": 19.740551214033143, + "learning_rate": 1.1610345317956961e-07, + "loss": 1.6996, + "step": 14045 + }, + { + "epoch": 2.8148296593186375, + "grad_norm": 63.78328577506786, + "learning_rate": 1.158537975767765e-07, + "loss": 1.4632, + "step": 14046 + }, + { + "epoch": 2.8150300601202405, + "grad_norm": 32.79910540689068, + "learning_rate": 1.1560440753222269e-07, + "loss": 1.8107, + "step": 14047 + }, + { + "epoch": 2.8152304609218435, + "grad_norm": 18.225865438522497, + "learning_rate": 1.1535528305946675e-07, + "loss": 2.0444, + "step": 14048 + }, + { + "epoch": 2.815430861723447, + "grad_norm": 16.665536547701183, + "learning_rate": 1.1510642417205509e-07, + "loss": 1.8308, + "step": 14049 + }, + { + "epoch": 2.8156312625250504, + "grad_norm": 21.78191357350729, + "learning_rate": 1.1485783088351688e-07, + "loss": 1.4748, + "step": 14050 + }, + { + "epoch": 2.8158316633266534, + "grad_norm": 16.15443519242502, + "learning_rate": 1.1460950320737018e-07, + "loss": 1.6476, + "step": 14051 + }, + { + "epoch": 2.8160320641282564, + "grad_norm": 26.5169114859884, + "learning_rate": 1.1436144115711644e-07, + "loss": 1.7421, + "step": 14052 + }, + { + "epoch": 2.81623246492986, + "grad_norm": 59.00456600723231, + "learning_rate": 1.1411364474624265e-07, + "loss": 1.6126, + "step": 14053 + }, + { + "epoch": 2.816432865731463, + "grad_norm": 24.642461044277937, + "learning_rate": 1.1386611398822244e-07, + "loss": 1.8501, + "step": 14054 + }, + { + "epoch": 2.8166332665330662, + "grad_norm": 20.879247695436547, + "learning_rate": 1.1361884889651398e-07, + "loss": 1.5456, + "step": 14055 + }, + { + "epoch": 2.8168336673346692, + "grad_norm": 18.180411567528076, + "learning_rate": 1.1337184948456148e-07, + "loss": 1.5832, + "step": 14056 + }, + { + "epoch": 2.8170340681362727, + "grad_norm": 17.359897908333828, + "learning_rate": 1.1312511576579532e-07, + "loss": 1.0979, + "step": 14057 + }, + { + "epoch": 2.8172344689378757, + "grad_norm": 27.339952035288842, + "learning_rate": 1.1287864775363034e-07, + "loss": 1.4262, + "step": 14058 + }, + { + "epoch": 2.817434869739479, + "grad_norm": 22.582043688359768, + "learning_rate": 1.1263244546146745e-07, + "loss": 1.5899, + "step": 14059 + }, + { + "epoch": 2.817635270541082, + "grad_norm": 19.665483282996473, + "learning_rate": 1.1238650890269209e-07, + "loss": 1.2176, + "step": 14060 + }, + { + "epoch": 2.8178356713426855, + "grad_norm": 59.542787684916895, + "learning_rate": 1.1214083809067799e-07, + "loss": 1.8189, + "step": 14061 + }, + { + "epoch": 2.8180360721442885, + "grad_norm": 31.24583731356241, + "learning_rate": 1.1189543303878114e-07, + "loss": 1.7597, + "step": 14062 + }, + { + "epoch": 2.818236472945892, + "grad_norm": 17.91226675678609, + "learning_rate": 1.1165029376034475e-07, + "loss": 1.4337, + 
"step": 14063 + }, + { + "epoch": 2.818436873747495, + "grad_norm": 25.70968670844325, + "learning_rate": 1.1140542026869816e-07, + "loss": 1.7471, + "step": 14064 + }, + { + "epoch": 2.818637274549098, + "grad_norm": 21.825317453443166, + "learning_rate": 1.1116081257715517e-07, + "loss": 1.3805, + "step": 14065 + }, + { + "epoch": 2.8188376753507014, + "grad_norm": 18.677631915971016, + "learning_rate": 1.1091647069901568e-07, + "loss": 1.4881, + "step": 14066 + }, + { + "epoch": 2.819038076152305, + "grad_norm": 15.61633947162336, + "learning_rate": 1.1067239464756407e-07, + "loss": 1.2795, + "step": 14067 + }, + { + "epoch": 2.819238476953908, + "grad_norm": 24.489995446894767, + "learning_rate": 1.1042858443607197e-07, + "loss": 1.9437, + "step": 14068 + }, + { + "epoch": 2.819438877755511, + "grad_norm": 57.10375822890369, + "learning_rate": 1.101850400777954e-07, + "loss": 1.6126, + "step": 14069 + }, + { + "epoch": 2.8196392785571143, + "grad_norm": 27.417732096180575, + "learning_rate": 1.0994176158597603e-07, + "loss": 1.5104, + "step": 14070 + }, + { + "epoch": 2.8198396793587177, + "grad_norm": 39.34026206358141, + "learning_rate": 1.0969874897384158e-07, + "loss": 1.8493, + "step": 14071 + }, + { + "epoch": 2.8200400801603207, + "grad_norm": 18.66242790109478, + "learning_rate": 1.0945600225460485e-07, + "loss": 1.4097, + "step": 14072 + }, + { + "epoch": 2.8202404809619237, + "grad_norm": 17.509038710182136, + "learning_rate": 1.0921352144146469e-07, + "loss": 1.6897, + "step": 14073 + }, + { + "epoch": 2.820440881763527, + "grad_norm": 22.836708589955354, + "learning_rate": 1.0897130654760446e-07, + "loss": 1.6042, + "step": 14074 + }, + { + "epoch": 2.82064128256513, + "grad_norm": 16.58119355942559, + "learning_rate": 1.0872935758619419e-07, + "loss": 1.9122, + "step": 14075 + }, + { + "epoch": 2.8208416833667336, + "grad_norm": 18.3041841642966, + "learning_rate": 1.0848767457038944e-07, + "loss": 1.3706, + "step": 14076 + }, + { + "epoch": 2.8210420841683366, + "grad_norm": 27.085254354954387, + "learning_rate": 1.0824625751332973e-07, + "loss": 1.386, + "step": 14077 + }, + { + "epoch": 2.82124248496994, + "grad_norm": 26.37507879464341, + "learning_rate": 1.0800510642814177e-07, + "loss": 1.465, + "step": 14078 + }, + { + "epoch": 2.821442885771543, + "grad_norm": 22.659016360429227, + "learning_rate": 1.0776422132793785e-07, + "loss": 1.5127, + "step": 14079 + }, + { + "epoch": 2.8216432865731464, + "grad_norm": 37.50718954249744, + "learning_rate": 1.0752360222581471e-07, + "loss": 1.0458, + "step": 14080 + }, + { + "epoch": 2.8218436873747494, + "grad_norm": 42.05565989873382, + "learning_rate": 1.0728324913485521e-07, + "loss": 1.6625, + "step": 14081 + }, + { + "epoch": 2.8220440881763524, + "grad_norm": 25.761323425971813, + "learning_rate": 1.0704316206812782e-07, + "loss": 1.6248, + "step": 14082 + }, + { + "epoch": 2.822244488977956, + "grad_norm": 31.221736462898654, + "learning_rate": 1.0680334103868651e-07, + "loss": 1.5795, + "step": 14083 + }, + { + "epoch": 2.8224448897795593, + "grad_norm": 19.678766128178, + "learning_rate": 1.065637860595703e-07, + "loss": 1.7408, + "step": 14084 + }, + { + "epoch": 2.8226452905811623, + "grad_norm": 23.80575980401492, + "learning_rate": 1.0632449714380433e-07, + "loss": 1.3925, + "step": 14085 + }, + { + "epoch": 2.8228456913827653, + "grad_norm": 23.14166131396307, + "learning_rate": 1.060854743043993e-07, + "loss": 1.7811, + "step": 14086 + }, + { + "epoch": 2.8230460921843687, + "grad_norm": 18.105243775035905, + 
"learning_rate": 1.0584671755435094e-07, + "loss": 1.5351, + "step": 14087 + }, + { + "epoch": 2.823246492985972, + "grad_norm": 22.75986383039993, + "learning_rate": 1.0560822690664163e-07, + "loss": 1.2501, + "step": 14088 + }, + { + "epoch": 2.823446893787575, + "grad_norm": 22.889278303658447, + "learning_rate": 1.0537000237423766e-07, + "loss": 1.5536, + "step": 14089 + }, + { + "epoch": 2.823647294589178, + "grad_norm": 19.98861312462297, + "learning_rate": 1.0513204397009091e-07, + "loss": 1.5016, + "step": 14090 + }, + { + "epoch": 2.8238476953907816, + "grad_norm": 20.843375896603224, + "learning_rate": 1.0489435170714101e-07, + "loss": 1.6119, + "step": 14091 + }, + { + "epoch": 2.8240480961923846, + "grad_norm": 21.96881350776991, + "learning_rate": 1.0465692559831153e-07, + "loss": 1.6304, + "step": 14092 + }, + { + "epoch": 2.824248496993988, + "grad_norm": 21.339621920895542, + "learning_rate": 1.0441976565651046e-07, + "loss": 1.9325, + "step": 14093 + }, + { + "epoch": 2.824448897795591, + "grad_norm": 21.336963245898005, + "learning_rate": 1.0418287189463361e-07, + "loss": 1.5039, + "step": 14094 + }, + { + "epoch": 2.8246492985971945, + "grad_norm": 20.722999461508866, + "learning_rate": 1.0394624432556067e-07, + "loss": 1.535, + "step": 14095 + }, + { + "epoch": 2.8248496993987975, + "grad_norm": 18.041665828871306, + "learning_rate": 1.03709882962158e-07, + "loss": 1.5737, + "step": 14096 + }, + { + "epoch": 2.825050100200401, + "grad_norm": 23.71024945815784, + "learning_rate": 1.0347378781727647e-07, + "loss": 1.8999, + "step": 14097 + }, + { + "epoch": 2.825250501002004, + "grad_norm": 20.51770317460118, + "learning_rate": 1.03237958903753e-07, + "loss": 1.2658, + "step": 14098 + }, + { + "epoch": 2.8254509018036074, + "grad_norm": 20.714647911688935, + "learning_rate": 1.0300239623441011e-07, + "loss": 1.3914, + "step": 14099 + }, + { + "epoch": 2.8256513026052104, + "grad_norm": 15.576612396230226, + "learning_rate": 1.0276709982205479e-07, + "loss": 1.3483, + "step": 14100 + }, + { + "epoch": 2.825851703406814, + "grad_norm": 19.258759607208802, + "learning_rate": 1.0253206967948237e-07, + "loss": 1.5916, + "step": 14101 + }, + { + "epoch": 2.826052104208417, + "grad_norm": 28.045628316627496, + "learning_rate": 1.0229730581947039e-07, + "loss": 1.4078, + "step": 14102 + }, + { + "epoch": 2.82625250501002, + "grad_norm": 19.757249848440647, + "learning_rate": 1.020628082547831e-07, + "loss": 1.7347, + "step": 14103 + }, + { + "epoch": 2.8264529058116232, + "grad_norm": 15.813623897102095, + "learning_rate": 1.0182857699817084e-07, + "loss": 0.8004, + "step": 14104 + }, + { + "epoch": 2.8266533066132267, + "grad_norm": 27.15796502031431, + "learning_rate": 1.0159461206236954e-07, + "loss": 1.2913, + "step": 14105 + }, + { + "epoch": 2.8268537074148297, + "grad_norm": 22.66212665814352, + "learning_rate": 1.0136091346010068e-07, + "loss": 1.5176, + "step": 14106 + }, + { + "epoch": 2.8270541082164327, + "grad_norm": 21.581294362905297, + "learning_rate": 1.0112748120406857e-07, + "loss": 1.3934, + "step": 14107 + }, + { + "epoch": 2.827254509018036, + "grad_norm": 24.283722867267013, + "learning_rate": 1.0089431530696747e-07, + "loss": 1.0881, + "step": 14108 + }, + { + "epoch": 2.8274549098196395, + "grad_norm": 23.180491991167173, + "learning_rate": 1.0066141578147448e-07, + "loss": 1.7526, + "step": 14109 + }, + { + "epoch": 2.8276553106212425, + "grad_norm": 21.72431340029825, + "learning_rate": 1.0042878264025169e-07, + "loss": 1.4506, + "step": 14110 + }, + { 
+ "epoch": 2.8278557114228455, + "grad_norm": 61.68358967053611, + "learning_rate": 1.0019641589594841e-07, + "loss": 1.5789, + "step": 14111 + }, + { + "epoch": 2.828056112224449, + "grad_norm": 22.024369124100335, + "learning_rate": 9.996431556119956e-08, + "loss": 1.0907, + "step": 14112 + }, + { + "epoch": 2.828256513026052, + "grad_norm": 23.01473574015366, + "learning_rate": 9.973248164862337e-08, + "loss": 1.174, + "step": 14113 + }, + { + "epoch": 2.8284569138276554, + "grad_norm": 20.774104294750813, + "learning_rate": 9.950091417082531e-08, + "loss": 1.3652, + "step": 14114 + }, + { + "epoch": 2.8286573146292584, + "grad_norm": 21.831772041628387, + "learning_rate": 9.926961314039751e-08, + "loss": 1.1614, + "step": 14115 + }, + { + "epoch": 2.828857715430862, + "grad_norm": 27.481975031414024, + "learning_rate": 9.903857856991383e-08, + "loss": 1.6154, + "step": 14116 + }, + { + "epoch": 2.829058116232465, + "grad_norm": 12.521914832070854, + "learning_rate": 9.880781047193755e-08, + "loss": 1.3318, + "step": 14117 + }, + { + "epoch": 2.8292585170340683, + "grad_norm": 30.764912006264304, + "learning_rate": 9.857730885901585e-08, + "loss": 1.5003, + "step": 14118 + }, + { + "epoch": 2.8294589178356713, + "grad_norm": 43.379094688610024, + "learning_rate": 9.834707374368036e-08, + "loss": 1.5996, + "step": 14119 + }, + { + "epoch": 2.8296593186372747, + "grad_norm": 17.662417106994706, + "learning_rate": 9.811710513845051e-08, + "loss": 1.5944, + "step": 14120 + }, + { + "epoch": 2.8298597194388777, + "grad_norm": 23.532439951334016, + "learning_rate": 9.788740305582855e-08, + "loss": 1.2407, + "step": 14121 + }, + { + "epoch": 2.830060120240481, + "grad_norm": 21.9384503022874, + "learning_rate": 9.765796750830558e-08, + "loss": 1.4434, + "step": 14122 + }, + { + "epoch": 2.830260521042084, + "grad_norm": 48.21386961973601, + "learning_rate": 9.742879850835552e-08, + "loss": 1.42, + "step": 14123 + }, + { + "epoch": 2.830460921843687, + "grad_norm": 18.02495829211754, + "learning_rate": 9.719989606843783e-08, + "loss": 1.448, + "step": 14124 + }, + { + "epoch": 2.8306613226452906, + "grad_norm": 26.42947767555083, + "learning_rate": 9.697126020099979e-08, + "loss": 1.9031, + "step": 14125 + }, + { + "epoch": 2.830861723446894, + "grad_norm": 23.653238555151827, + "learning_rate": 9.674289091847145e-08, + "loss": 1.7909, + "step": 14126 + }, + { + "epoch": 2.831062124248497, + "grad_norm": 25.929533621510412, + "learning_rate": 9.651478823326954e-08, + "loss": 1.3044, + "step": 14127 + }, + { + "epoch": 2.8312625250501, + "grad_norm": 23.191672316426448, + "learning_rate": 9.628695215779749e-08, + "loss": 1.5475, + "step": 14128 + }, + { + "epoch": 2.8314629258517034, + "grad_norm": 29.114626139804002, + "learning_rate": 9.605938270444204e-08, + "loss": 1.0666, + "step": 14129 + }, + { + "epoch": 2.831663326653307, + "grad_norm": 17.41444144685702, + "learning_rate": 9.583207988557718e-08, + "loss": 1.4361, + "step": 14130 + }, + { + "epoch": 2.83186372745491, + "grad_norm": 21.936601484587083, + "learning_rate": 9.560504371356083e-08, + "loss": 1.5901, + "step": 14131 + }, + { + "epoch": 2.832064128256513, + "grad_norm": 18.536633405256783, + "learning_rate": 9.537827420073864e-08, + "loss": 1.0897, + "step": 14132 + }, + { + "epoch": 2.8322645290581163, + "grad_norm": 17.251411445407328, + "learning_rate": 9.51517713594391e-08, + "loss": 1.1872, + "step": 14133 + }, + { + "epoch": 2.8324649298597193, + "grad_norm": 28.07108528612193, + "learning_rate": 9.492553520197734e-08, + 
"loss": 1.3651, + "step": 14134 + }, + { + "epoch": 2.8326653306613228, + "grad_norm": 21.65870362427797, + "learning_rate": 9.469956574065575e-08, + "loss": 2.0093, + "step": 14135 + }, + { + "epoch": 2.8328657314629258, + "grad_norm": 15.037542767526412, + "learning_rate": 9.447386298775895e-08, + "loss": 1.6156, + "step": 14136 + }, + { + "epoch": 2.833066132264529, + "grad_norm": 19.34002239026296, + "learning_rate": 9.424842695555991e-08, + "loss": 1.5992, + "step": 14137 + }, + { + "epoch": 2.833266533066132, + "grad_norm": 26.04727726087847, + "learning_rate": 9.402325765631493e-08, + "loss": 1.4747, + "step": 14138 + }, + { + "epoch": 2.8334669338677356, + "grad_norm": 25.074747407229534, + "learning_rate": 9.379835510226809e-08, + "loss": 1.8312, + "step": 14139 + }, + { + "epoch": 2.8336673346693386, + "grad_norm": 32.7497095951643, + "learning_rate": 9.35737193056463e-08, + "loss": 1.8156, + "step": 14140 + }, + { + "epoch": 2.8338677354709416, + "grad_norm": 31.659424826243093, + "learning_rate": 9.334935027866421e-08, + "loss": 1.4252, + "step": 14141 + }, + { + "epoch": 2.834068136272545, + "grad_norm": 23.653666012904033, + "learning_rate": 9.312524803352097e-08, + "loss": 1.873, + "step": 14142 + }, + { + "epoch": 2.8342685370741485, + "grad_norm": 25.279818174481303, + "learning_rate": 9.290141258240071e-08, + "loss": 1.7725, + "step": 14143 + }, + { + "epoch": 2.8344689378757515, + "grad_norm": 22.498754915662676, + "learning_rate": 9.267784393747426e-08, + "loss": 1.1816, + "step": 14144 + }, + { + "epoch": 2.8346693386773545, + "grad_norm": 21.618843492226347, + "learning_rate": 9.245454211089744e-08, + "loss": 1.6632, + "step": 14145 + }, + { + "epoch": 2.834869739478958, + "grad_norm": 26.49030994529714, + "learning_rate": 9.223150711481222e-08, + "loss": 1.5, + "step": 14146 + }, + { + "epoch": 2.8350701402805614, + "grad_norm": 21.991382185251982, + "learning_rate": 9.200873896134333e-08, + "loss": 1.8207, + "step": 14147 + }, + { + "epoch": 2.8352705410821644, + "grad_norm": 15.48326409509124, + "learning_rate": 9.178623766260441e-08, + "loss": 1.5171, + "step": 14148 + }, + { + "epoch": 2.8354709418837674, + "grad_norm": 21.730151431189583, + "learning_rate": 9.156400323069303e-08, + "loss": 1.6303, + "step": 14149 + }, + { + "epoch": 2.835671342685371, + "grad_norm": 20.765483993300233, + "learning_rate": 9.134203567769229e-08, + "loss": 1.9357, + "step": 14150 + }, + { + "epoch": 2.835871743486974, + "grad_norm": 16.38196109009242, + "learning_rate": 9.112033501567086e-08, + "loss": 1.4361, + "step": 14151 + }, + { + "epoch": 2.8360721442885772, + "grad_norm": 23.7313601549844, + "learning_rate": 9.0898901256683e-08, + "loss": 1.8312, + "step": 14152 + }, + { + "epoch": 2.8362725450901802, + "grad_norm": 27.903348774666075, + "learning_rate": 9.06777344127685e-08, + "loss": 1.1155, + "step": 14153 + }, + { + "epoch": 2.8364729458917837, + "grad_norm": 23.696774253620692, + "learning_rate": 9.045683449595277e-08, + "loss": 1.2892, + "step": 14154 + }, + { + "epoch": 2.8366733466933867, + "grad_norm": 20.506323784312503, + "learning_rate": 9.023620151824508e-08, + "loss": 1.9202, + "step": 14155 + }, + { + "epoch": 2.83687374749499, + "grad_norm": 47.26605044517606, + "learning_rate": 9.001583549164361e-08, + "loss": 1.2496, + "step": 14156 + }, + { + "epoch": 2.837074148296593, + "grad_norm": 33.127114705507985, + "learning_rate": 8.979573642812878e-08, + "loss": 1.7059, + "step": 14157 + }, + { + "epoch": 2.8372745490981965, + "grad_norm": 16.34822211389029, + 
"learning_rate": 8.957590433966768e-08, + "loss": 1.0466, + "step": 14158 + }, + { + "epoch": 2.8374749498997995, + "grad_norm": 22.306205436418324, + "learning_rate": 8.935633923821296e-08, + "loss": 1.3763, + "step": 14159 + }, + { + "epoch": 2.837675350701403, + "grad_norm": 23.69299425622371, + "learning_rate": 8.913704113570287e-08, + "loss": 1.8238, + "step": 14160 + }, + { + "epoch": 2.837875751503006, + "grad_norm": 20.585402683873664, + "learning_rate": 8.89180100440612e-08, + "loss": 1.7419, + "step": 14161 + }, + { + "epoch": 2.838076152304609, + "grad_norm": 19.22976282369329, + "learning_rate": 8.869924597519619e-08, + "loss": 1.6067, + "step": 14162 + }, + { + "epoch": 2.8382765531062124, + "grad_norm": 23.93861985078695, + "learning_rate": 8.848074894100389e-08, + "loss": 1.1609, + "step": 14163 + }, + { + "epoch": 2.838476953907816, + "grad_norm": 58.02015868318987, + "learning_rate": 8.826251895336258e-08, + "loss": 1.712, + "step": 14164 + }, + { + "epoch": 2.838677354709419, + "grad_norm": 20.602992543206657, + "learning_rate": 8.804455602413886e-08, + "loss": 2.0417, + "step": 14165 + }, + { + "epoch": 2.838877755511022, + "grad_norm": 22.192398556136858, + "learning_rate": 8.782686016518327e-08, + "loss": 1.711, + "step": 14166 + }, + { + "epoch": 2.8390781563126253, + "grad_norm": 21.367322465426344, + "learning_rate": 8.760943138833189e-08, + "loss": 1.153, + "step": 14167 + }, + { + "epoch": 2.8392785571142287, + "grad_norm": 17.4922940558891, + "learning_rate": 8.739226970540748e-08, + "loss": 2.0707, + "step": 14168 + }, + { + "epoch": 2.8394789579158317, + "grad_norm": 29.1044986028388, + "learning_rate": 8.717537512821728e-08, + "loss": 1.6728, + "step": 14169 + }, + { + "epoch": 2.8396793587174347, + "grad_norm": 24.82986087392204, + "learning_rate": 8.695874766855405e-08, + "loss": 1.336, + "step": 14170 + }, + { + "epoch": 2.839879759519038, + "grad_norm": 18.631808575043916, + "learning_rate": 8.674238733819562e-08, + "loss": 1.4548, + "step": 14171 + }, + { + "epoch": 2.840080160320641, + "grad_norm": 67.05766567974116, + "learning_rate": 8.65262941489059e-08, + "loss": 1.8316, + "step": 14172 + }, + { + "epoch": 2.8402805611222446, + "grad_norm": 17.303837934350895, + "learning_rate": 8.631046811243493e-08, + "loss": 1.3474, + "step": 14173 + }, + { + "epoch": 2.8404809619238476, + "grad_norm": 25.14264250102964, + "learning_rate": 8.609490924051666e-08, + "loss": 1.4289, + "step": 14174 + }, + { + "epoch": 2.840681362725451, + "grad_norm": 22.693878087191646, + "learning_rate": 8.587961754487229e-08, + "loss": 1.2544, + "step": 14175 + }, + { + "epoch": 2.840881763527054, + "grad_norm": 24.17346448960749, + "learning_rate": 8.566459303720686e-08, + "loss": 1.5911, + "step": 14176 + }, + { + "epoch": 2.8410821643286575, + "grad_norm": 24.22564858867724, + "learning_rate": 8.544983572921217e-08, + "loss": 1.3382, + "step": 14177 + }, + { + "epoch": 2.8412825651302605, + "grad_norm": 18.51554436209434, + "learning_rate": 8.523534563256331e-08, + "loss": 1.4767, + "step": 14178 + }, + { + "epoch": 2.841482965931864, + "grad_norm": 25.588501901653437, + "learning_rate": 8.502112275892427e-08, + "loss": 1.5316, + "step": 14179 + }, + { + "epoch": 2.841683366733467, + "grad_norm": 25.363124790092648, + "learning_rate": 8.480716711994242e-08, + "loss": 1.6996, + "step": 14180 + }, + { + "epoch": 2.8418837675350703, + "grad_norm": 16.54235764111735, + "learning_rate": 8.459347872724954e-08, + "loss": 1.6215, + "step": 14181 + }, + { + "epoch": 2.8420841683366733, 
+ "grad_norm": 23.909430922903375, + "learning_rate": 8.438005759246526e-08, + "loss": 1.4538, + "step": 14182 + }, + { + "epoch": 2.8422845691382763, + "grad_norm": 35.22141792968431, + "learning_rate": 8.416690372719361e-08, + "loss": 1.9785, + "step": 14183 + }, + { + "epoch": 2.8424849699398798, + "grad_norm": 20.86049670876017, + "learning_rate": 8.395401714302365e-08, + "loss": 1.5102, + "step": 14184 + }, + { + "epoch": 2.842685370741483, + "grad_norm": 20.369888369663165, + "learning_rate": 8.374139785153057e-08, + "loss": 0.9276, + "step": 14185 + }, + { + "epoch": 2.842885771543086, + "grad_norm": 17.48837368860392, + "learning_rate": 8.352904586427457e-08, + "loss": 1.6496, + "step": 14186 + }, + { + "epoch": 2.843086172344689, + "grad_norm": 15.516316738526172, + "learning_rate": 8.331696119280252e-08, + "loss": 1.4475, + "step": 14187 + }, + { + "epoch": 2.8432865731462926, + "grad_norm": 21.563319577092656, + "learning_rate": 8.31051438486441e-08, + "loss": 1.4663, + "step": 14188 + }, + { + "epoch": 2.843486973947896, + "grad_norm": 19.413427816674645, + "learning_rate": 8.28935938433173e-08, + "loss": 1.0901, + "step": 14189 + }, + { + "epoch": 2.843687374749499, + "grad_norm": 23.66634569284049, + "learning_rate": 8.268231118832404e-08, + "loss": 1.6756, + "step": 14190 + }, + { + "epoch": 2.843887775551102, + "grad_norm": 19.094862034782366, + "learning_rate": 8.247129589515234e-08, + "loss": 0.9636, + "step": 14191 + }, + { + "epoch": 2.8440881763527055, + "grad_norm": 27.000543489374756, + "learning_rate": 8.226054797527528e-08, + "loss": 1.3771, + "step": 14192 + }, + { + "epoch": 2.8442885771543085, + "grad_norm": 42.60035962929585, + "learning_rate": 8.205006744015087e-08, + "loss": 1.7015, + "step": 14193 + }, + { + "epoch": 2.844488977955912, + "grad_norm": 23.445746791240726, + "learning_rate": 8.183985430122499e-08, + "loss": 1.716, + "step": 14194 + }, + { + "epoch": 2.844689378757515, + "grad_norm": 23.465768779356058, + "learning_rate": 8.162990856992459e-08, + "loss": 1.4598, + "step": 14195 + }, + { + "epoch": 2.8448897795591184, + "grad_norm": 24.083038891933334, + "learning_rate": 8.142023025766776e-08, + "loss": 1.2617, + "step": 14196 + }, + { + "epoch": 2.8450901803607214, + "grad_norm": 22.16973834017776, + "learning_rate": 8.121081937585262e-08, + "loss": 1.6405, + "step": 14197 + }, + { + "epoch": 2.845290581162325, + "grad_norm": 21.043175909576693, + "learning_rate": 8.100167593586616e-08, + "loss": 1.7472, + "step": 14198 + }, + { + "epoch": 2.845490981963928, + "grad_norm": 21.791087612505923, + "learning_rate": 8.079279994907985e-08, + "loss": 1.9829, + "step": 14199 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 24.423394256463283, + "learning_rate": 8.058419142685015e-08, + "loss": 1.5888, + "step": 14200 + }, + { + "epoch": 2.8458917835671342, + "grad_norm": 24.287775182427733, + "learning_rate": 8.03758503805202e-08, + "loss": 1.8488, + "step": 14201 + }, + { + "epoch": 2.8460921843687377, + "grad_norm": 20.597326127037952, + "learning_rate": 8.016777682141652e-08, + "loss": 1.7073, + "step": 14202 + }, + { + "epoch": 2.8462925851703407, + "grad_norm": 16.552872766762746, + "learning_rate": 7.995997076085337e-08, + "loss": 1.0741, + "step": 14203 + }, + { + "epoch": 2.8464929859719437, + "grad_norm": 24.129229258271337, + "learning_rate": 7.975243221012952e-08, + "loss": 0.8912, + "step": 14204 + }, + { + "epoch": 2.846693386773547, + "grad_norm": 21.263868991087897, + "learning_rate": 7.954516118052869e-08, + "loss": 1.9248, + 
"step": 14205 + }, + { + "epoch": 2.8468937875751505, + "grad_norm": 22.644410025761545, + "learning_rate": 7.933815768332131e-08, + "loss": 1.9721, + "step": 14206 + }, + { + "epoch": 2.8470941883767535, + "grad_norm": 40.77943558667694, + "learning_rate": 7.913142172976118e-08, + "loss": 1.8005, + "step": 14207 + }, + { + "epoch": 2.8472945891783565, + "grad_norm": 17.378401737676942, + "learning_rate": 7.892495333108985e-08, + "loss": 1.6802, + "step": 14208 + }, + { + "epoch": 2.84749498997996, + "grad_norm": 25.25781082325431, + "learning_rate": 7.871875249853277e-08, + "loss": 1.6595, + "step": 14209 + }, + { + "epoch": 2.847695390781563, + "grad_norm": 16.528602496309645, + "learning_rate": 7.851281924330156e-08, + "loss": 1.068, + "step": 14210 + }, + { + "epoch": 2.8478957915831664, + "grad_norm": 23.69743714509426, + "learning_rate": 7.830715357659335e-08, + "loss": 1.8942, + "step": 14211 + }, + { + "epoch": 2.8480961923847694, + "grad_norm": 23.021232156280774, + "learning_rate": 7.810175550958976e-08, + "loss": 1.3845, + "step": 14212 + }, + { + "epoch": 2.848296593186373, + "grad_norm": 20.56783613720378, + "learning_rate": 7.789662505346018e-08, + "loss": 1.3868, + "step": 14213 + }, + { + "epoch": 2.848496993987976, + "grad_norm": 36.53263321268072, + "learning_rate": 7.76917622193557e-08, + "loss": 1.5461, + "step": 14214 + }, + { + "epoch": 2.8486973947895793, + "grad_norm": 18.828315737987875, + "learning_rate": 7.748716701841686e-08, + "loss": 1.6069, + "step": 14215 + }, + { + "epoch": 2.8488977955911823, + "grad_norm": 25.280781610128578, + "learning_rate": 7.72828394617664e-08, + "loss": 1.761, + "step": 14216 + }, + { + "epoch": 2.8490981963927857, + "grad_norm": 26.784669926903486, + "learning_rate": 7.70787795605149e-08, + "loss": 1.689, + "step": 14217 + }, + { + "epoch": 2.8492985971943887, + "grad_norm": 18.678876381740764, + "learning_rate": 7.687498732575738e-08, + "loss": 1.5693, + "step": 14218 + }, + { + "epoch": 2.849498997995992, + "grad_norm": 27.138344607749517, + "learning_rate": 7.667146276857329e-08, + "loss": 1.6948, + "step": 14219 + }, + { + "epoch": 2.849699398797595, + "grad_norm": 22.83865867667252, + "learning_rate": 7.64682059000299e-08, + "loss": 1.5765, + "step": 14220 + }, + { + "epoch": 2.849899799599198, + "grad_norm": 17.777498092772586, + "learning_rate": 7.626521673117781e-08, + "loss": 1.3103, + "step": 14221 + }, + { + "epoch": 2.8501002004008016, + "grad_norm": 23.1437560959099, + "learning_rate": 7.606249527305375e-08, + "loss": 1.384, + "step": 14222 + }, + { + "epoch": 2.850300601202405, + "grad_norm": 20.621729690584846, + "learning_rate": 7.586004153668003e-08, + "loss": 1.7298, + "step": 14223 + }, + { + "epoch": 2.850501002004008, + "grad_norm": 23.556029162712736, + "learning_rate": 7.565785553306504e-08, + "loss": 1.6585, + "step": 14224 + }, + { + "epoch": 2.850701402805611, + "grad_norm": 16.817819842123704, + "learning_rate": 7.545593727320166e-08, + "loss": 1.4663, + "step": 14225 + }, + { + "epoch": 2.8509018036072145, + "grad_norm": 35.08052240489813, + "learning_rate": 7.525428676806723e-08, + "loss": 0.8785, + "step": 14226 + }, + { + "epoch": 2.851102204408818, + "grad_norm": 25.55752751558959, + "learning_rate": 7.505290402862797e-08, + "loss": 1.8147, + "step": 14227 + }, + { + "epoch": 2.851302605210421, + "grad_norm": 18.305764058049366, + "learning_rate": 7.485178906583179e-08, + "loss": 1.6206, + "step": 14228 + }, + { + "epoch": 2.851503006012024, + "grad_norm": 20.88582178425897, + "learning_rate": 
7.465094189061328e-08, + "loss": 1.2697, + "step": 14229 + }, + { + "epoch": 2.8517034068136273, + "grad_norm": 15.366835728288585, + "learning_rate": 7.445036251389426e-08, + "loss": 1.5806, + "step": 14230 + }, + { + "epoch": 2.8519038076152303, + "grad_norm": 22.02786505515236, + "learning_rate": 7.425005094657989e-08, + "loss": 1.5042, + "step": 14231 + }, + { + "epoch": 2.8521042084168338, + "grad_norm": 17.324748282762865, + "learning_rate": 7.40500071995609e-08, + "loss": 1.2883, + "step": 14232 + }, + { + "epoch": 2.8523046092184368, + "grad_norm": 22.292206947565198, + "learning_rate": 7.385023128371471e-08, + "loss": 1.96, + "step": 14233 + }, + { + "epoch": 2.85250501002004, + "grad_norm": 21.297216945684006, + "learning_rate": 7.365072320990318e-08, + "loss": 1.4955, + "step": 14234 + }, + { + "epoch": 2.852705410821643, + "grad_norm": 22.00292267080715, + "learning_rate": 7.345148298897375e-08, + "loss": 1.6322, + "step": 14235 + }, + { + "epoch": 2.8529058116232466, + "grad_norm": 21.779731016917125, + "learning_rate": 7.325251063175943e-08, + "loss": 1.5443, + "step": 14236 + }, + { + "epoch": 2.8531062124248496, + "grad_norm": 31.852746528296162, + "learning_rate": 7.305380614907875e-08, + "loss": 1.3774, + "step": 14237 + }, + { + "epoch": 2.853306613226453, + "grad_norm": 23.536899616248768, + "learning_rate": 7.285536955173533e-08, + "loss": 1.6682, + "step": 14238 + }, + { + "epoch": 2.853507014028056, + "grad_norm": 22.72994503155333, + "learning_rate": 7.26572008505183e-08, + "loss": 2.0188, + "step": 14239 + }, + { + "epoch": 2.8537074148296595, + "grad_norm": 21.5960150534293, + "learning_rate": 7.245930005620294e-08, + "loss": 1.8615, + "step": 14240 + }, + { + "epoch": 2.8539078156312625, + "grad_norm": 27.583790530649704, + "learning_rate": 7.226166717954896e-08, + "loss": 1.8623, + "step": 14241 + }, + { + "epoch": 2.8541082164328655, + "grad_norm": 36.162190627718786, + "learning_rate": 7.206430223130278e-08, + "loss": 1.6015, + "step": 14242 + }, + { + "epoch": 2.854308617234469, + "grad_norm": 22.07112202462543, + "learning_rate": 7.186720522219415e-08, + "loss": 1.4641, + "step": 14243 + }, + { + "epoch": 2.8545090180360724, + "grad_norm": 20.90204693563611, + "learning_rate": 7.16703761629406e-08, + "loss": 1.5961, + "step": 14244 + }, + { + "epoch": 2.8547094188376754, + "grad_norm": 27.078355561419517, + "learning_rate": 7.1473815064243e-08, + "loss": 1.6337, + "step": 14245 + }, + { + "epoch": 2.8549098196392784, + "grad_norm": 20.436959011817766, + "learning_rate": 7.127752193678949e-08, + "loss": 1.1882, + "step": 14246 + }, + { + "epoch": 2.855110220440882, + "grad_norm": 23.219741541120225, + "learning_rate": 7.108149679125265e-08, + "loss": 1.5705, + "step": 14247 + }, + { + "epoch": 2.8553106212424852, + "grad_norm": 19.384879821219968, + "learning_rate": 7.08857396382906e-08, + "loss": 1.0601, + "step": 14248 + }, + { + "epoch": 2.8555110220440882, + "grad_norm": 17.471606183604393, + "learning_rate": 7.069025048854706e-08, + "loss": 1.501, + "step": 14249 + }, + { + "epoch": 2.8557114228456912, + "grad_norm": 21.449991462128043, + "learning_rate": 7.049502935265073e-08, + "loss": 1.3956, + "step": 14250 + }, + { + "epoch": 2.8559118236472947, + "grad_norm": 19.42170239121483, + "learning_rate": 7.030007624121648e-08, + "loss": 1.6939, + "step": 14251 + }, + { + "epoch": 2.8561122244488977, + "grad_norm": 28.292536044242276, + "learning_rate": 7.010539116484416e-08, + "loss": 1.6552, + "step": 14252 + }, + { + "epoch": 2.856312625250501, + 
"grad_norm": 19.499483837134303, + "learning_rate": 6.991097413411862e-08, + "loss": 1.4078, + "step": 14253 + }, + { + "epoch": 2.856513026052104, + "grad_norm": 21.86154760921837, + "learning_rate": 6.971682515961087e-08, + "loss": 1.6347, + "step": 14254 + }, + { + "epoch": 2.8567134268537075, + "grad_norm": 17.546679222529015, + "learning_rate": 6.95229442518769e-08, + "loss": 1.5162, + "step": 14255 + }, + { + "epoch": 2.8569138276553105, + "grad_norm": 20.956805067236353, + "learning_rate": 6.932933142145881e-08, + "loss": 1.4869, + "step": 14256 + }, + { + "epoch": 2.857114228456914, + "grad_norm": 21.920064539534913, + "learning_rate": 6.913598667888322e-08, + "loss": 1.8233, + "step": 14257 + }, + { + "epoch": 2.857314629258517, + "grad_norm": 20.238100323176255, + "learning_rate": 6.89429100346628e-08, + "loss": 1.514, + "step": 14258 + }, + { + "epoch": 2.85751503006012, + "grad_norm": 48.6870275457123, + "learning_rate": 6.875010149929473e-08, + "loss": 1.3642, + "step": 14259 + }, + { + "epoch": 2.8577154308617234, + "grad_norm": 15.61979845841767, + "learning_rate": 6.855756108326284e-08, + "loss": 1.1932, + "step": 14260 + }, + { + "epoch": 2.857915831663327, + "grad_norm": 16.56497975464991, + "learning_rate": 6.836528879703653e-08, + "loss": 1.4349, + "step": 14261 + }, + { + "epoch": 2.85811623246493, + "grad_norm": 24.7383411633229, + "learning_rate": 6.817328465106909e-08, + "loss": 1.2201, + "step": 14262 + }, + { + "epoch": 2.858316633266533, + "grad_norm": 17.221735130857322, + "learning_rate": 6.798154865579998e-08, + "loss": 1.1268, + "step": 14263 + }, + { + "epoch": 2.8585170340681363, + "grad_norm": 18.367057294971637, + "learning_rate": 6.779008082165417e-08, + "loss": 1.4934, + "step": 14264 + }, + { + "epoch": 2.8587174348697397, + "grad_norm": 18.82125823158912, + "learning_rate": 6.759888115904278e-08, + "loss": 1.4681, + "step": 14265 + }, + { + "epoch": 2.8589178356713427, + "grad_norm": 23.985437104633704, + "learning_rate": 6.740794967836085e-08, + "loss": 1.4714, + "step": 14266 + }, + { + "epoch": 2.8591182364729457, + "grad_norm": 21.93624633422435, + "learning_rate": 6.721728638999003e-08, + "loss": 1.4953, + "step": 14267 + }, + { + "epoch": 2.859318637274549, + "grad_norm": 31.551875562748958, + "learning_rate": 6.702689130429652e-08, + "loss": 1.6803, + "step": 14268 + }, + { + "epoch": 2.859519038076152, + "grad_norm": 26.331596827611264, + "learning_rate": 6.683676443163312e-08, + "loss": 1.5956, + "step": 14269 + }, + { + "epoch": 2.8597194388777556, + "grad_norm": 26.09750788848082, + "learning_rate": 6.664690578233657e-08, + "loss": 1.2588, + "step": 14270 + }, + { + "epoch": 2.8599198396793586, + "grad_norm": 23.05007924442807, + "learning_rate": 6.645731536672972e-08, + "loss": 1.3386, + "step": 14271 + }, + { + "epoch": 2.860120240480962, + "grad_norm": 18.188525010735326, + "learning_rate": 6.626799319512157e-08, + "loss": 1.7834, + "step": 14272 + }, + { + "epoch": 2.860320641282565, + "grad_norm": 21.226219707284837, + "learning_rate": 6.607893927780551e-08, + "loss": 1.7399, + "step": 14273 + }, + { + "epoch": 2.8605210420841685, + "grad_norm": 24.97823038063782, + "learning_rate": 6.589015362506057e-08, + "loss": 1.597, + "step": 14274 + }, + { + "epoch": 2.8607214428857715, + "grad_norm": 22.707926706451726, + "learning_rate": 6.570163624715131e-08, + "loss": 1.5748, + "step": 14275 + }, + { + "epoch": 2.860921843687375, + "grad_norm": 19.551456568996944, + "learning_rate": 6.551338715432787e-08, + "loss": 0.9761, + "step": 14276 + 
}, + { + "epoch": 2.861122244488978, + "grad_norm": 20.506105014279235, + "learning_rate": 6.532540635682593e-08, + "loss": 1.6133, + "step": 14277 + }, + { + "epoch": 2.8613226452905813, + "grad_norm": 17.515306794710753, + "learning_rate": 6.513769386486513e-08, + "loss": 1.6791, + "step": 14278 + }, + { + "epoch": 2.8615230460921843, + "grad_norm": 23.810594471601586, + "learning_rate": 6.495024968865282e-08, + "loss": 1.8834, + "step": 14279 + }, + { + "epoch": 2.8617234468937873, + "grad_norm": 21.507946164571138, + "learning_rate": 6.476307383838032e-08, + "loss": 1.4699, + "step": 14280 + }, + { + "epoch": 2.8619238476953908, + "grad_norm": 16.7909784840814, + "learning_rate": 6.457616632422448e-08, + "loss": 1.3593, + "step": 14281 + }, + { + "epoch": 2.862124248496994, + "grad_norm": 23.679090570264133, + "learning_rate": 6.438952715634828e-08, + "loss": 1.1318, + "step": 14282 + }, + { + "epoch": 2.862324649298597, + "grad_norm": 20.99492868310794, + "learning_rate": 6.420315634489804e-08, + "loss": 1.5864, + "step": 14283 + }, + { + "epoch": 2.8625250501002, + "grad_norm": 24.999127749837424, + "learning_rate": 6.401705390000901e-08, + "loss": 1.663, + "step": 14284 + }, + { + "epoch": 2.8627254509018036, + "grad_norm": 18.01144682184674, + "learning_rate": 6.383121983179863e-08, + "loss": 1.5467, + "step": 14285 + }, + { + "epoch": 2.862925851703407, + "grad_norm": 25.73225374298451, + "learning_rate": 6.36456541503716e-08, + "loss": 1.6564, + "step": 14286 + }, + { + "epoch": 2.86312625250501, + "grad_norm": 23.10897610451761, + "learning_rate": 6.346035686581653e-08, + "loss": 1.9323, + "step": 14287 + }, + { + "epoch": 2.863326653306613, + "grad_norm": 15.7858537198554, + "learning_rate": 6.327532798820924e-08, + "loss": 1.3176, + "step": 14288 + }, + { + "epoch": 2.8635270541082165, + "grad_norm": 23.273816692209678, + "learning_rate": 6.309056752760945e-08, + "loss": 1.6362, + "step": 14289 + }, + { + "epoch": 2.8637274549098195, + "grad_norm": 18.86413586368274, + "learning_rate": 6.290607549406302e-08, + "loss": 1.3448, + "step": 14290 + }, + { + "epoch": 2.863927855711423, + "grad_norm": 19.967473248673684, + "learning_rate": 6.272185189760083e-08, + "loss": 2.0061, + "step": 14291 + }, + { + "epoch": 2.864128256513026, + "grad_norm": 19.02265286316687, + "learning_rate": 6.25378967482404e-08, + "loss": 1.3962, + "step": 14292 + }, + { + "epoch": 2.8643286573146294, + "grad_norm": 18.05272321665863, + "learning_rate": 6.235421005598207e-08, + "loss": 1.5893, + "step": 14293 + }, + { + "epoch": 2.8645290581162324, + "grad_norm": 22.849248369527274, + "learning_rate": 6.217079183081509e-08, + "loss": 1.5915, + "step": 14294 + }, + { + "epoch": 2.864729458917836, + "grad_norm": 16.075129651103914, + "learning_rate": 6.19876420827098e-08, + "loss": 1.5133, + "step": 14295 + }, + { + "epoch": 2.864929859719439, + "grad_norm": 28.836789647150447, + "learning_rate": 6.180476082162656e-08, + "loss": 2.0621, + "step": 14296 + }, + { + "epoch": 2.8651302605210422, + "grad_norm": 27.59631837937453, + "learning_rate": 6.162214805750744e-08, + "loss": 1.445, + "step": 14297 + }, + { + "epoch": 2.8653306613226452, + "grad_norm": 21.72067572315582, + "learning_rate": 6.143980380028169e-08, + "loss": 1.3787, + "step": 14298 + }, + { + "epoch": 2.8655310621242487, + "grad_norm": 21.425775574585508, + "learning_rate": 6.125772805986418e-08, + "loss": 1.3232, + "step": 14299 + }, + { + "epoch": 2.8657314629258517, + "grad_norm": 18.32589323561997, + "learning_rate": 6.107592084615366e-08, 
+ "loss": 1.4864, + "step": 14300 + }, + { + "epoch": 2.8659318637274547, + "grad_norm": 22.521021261147066, + "learning_rate": 6.089438216903665e-08, + "loss": 1.7917, + "step": 14301 + }, + { + "epoch": 2.866132264529058, + "grad_norm": 14.711833302152368, + "learning_rate": 6.071311203838248e-08, + "loss": 1.2227, + "step": 14302 + }, + { + "epoch": 2.8663326653306616, + "grad_norm": 16.709999218748237, + "learning_rate": 6.053211046404772e-08, + "loss": 1.4986, + "step": 14303 + }, + { + "epoch": 2.8665330661322646, + "grad_norm": 24.091623421781154, + "learning_rate": 6.035137745587283e-08, + "loss": 1.7115, + "step": 14304 + }, + { + "epoch": 2.8667334669338675, + "grad_norm": 20.634734749509832, + "learning_rate": 6.01709130236855e-08, + "loss": 1.5587, + "step": 14305 + }, + { + "epoch": 2.866933867735471, + "grad_norm": 21.50495056032166, + "learning_rate": 5.999071717729787e-08, + "loss": 1.797, + "step": 14306 + }, + { + "epoch": 2.867134268537074, + "grad_norm": 23.375153854167753, + "learning_rate": 5.981078992650602e-08, + "loss": 1.6117, + "step": 14307 + }, + { + "epoch": 2.8673346693386774, + "grad_norm": 25.042946437726147, + "learning_rate": 5.963113128109488e-08, + "loss": 1.519, + "step": 14308 + }, + { + "epoch": 2.8675350701402804, + "grad_norm": 21.544284265266633, + "learning_rate": 5.9451741250831644e-08, + "loss": 1.7313, + "step": 14309 + }, + { + "epoch": 2.867735470941884, + "grad_norm": 16.081095586056296, + "learning_rate": 5.9272619845469614e-08, + "loss": 0.9306, + "step": 14310 + }, + { + "epoch": 2.867935871743487, + "grad_norm": 21.93834268912905, + "learning_rate": 5.9093767074748786e-08, + "loss": 1.9327, + "step": 14311 + }, + { + "epoch": 2.8681362725450903, + "grad_norm": 24.190770618102885, + "learning_rate": 5.89151829483936e-08, + "loss": 2.0217, + "step": 14312 + }, + { + "epoch": 2.8683366733466933, + "grad_norm": 22.7655907166465, + "learning_rate": 5.873686747611351e-08, + "loss": 1.0543, + "step": 14313 + }, + { + "epoch": 2.8685370741482967, + "grad_norm": 23.475348927629174, + "learning_rate": 5.8558820667603547e-08, + "loss": 1.6898, + "step": 14314 + }, + { + "epoch": 2.8687374749498997, + "grad_norm": 30.774821123408152, + "learning_rate": 5.838104253254484e-08, + "loss": 1.7136, + "step": 14315 + }, + { + "epoch": 2.868937875751503, + "grad_norm": 18.65974538334041, + "learning_rate": 5.820353308060356e-08, + "loss": 1.3213, + "step": 14316 + }, + { + "epoch": 2.869138276553106, + "grad_norm": 21.276530743642216, + "learning_rate": 5.802629232143031e-08, + "loss": 1.4758, + "step": 14317 + }, + { + "epoch": 2.869338677354709, + "grad_norm": 26.88152211914919, + "learning_rate": 5.78493202646635e-08, + "loss": 1.331, + "step": 14318 + }, + { + "epoch": 2.8695390781563126, + "grad_norm": 14.447242781201028, + "learning_rate": 5.7672616919923765e-08, + "loss": 1.1838, + "step": 14319 + }, + { + "epoch": 2.869739478957916, + "grad_norm": 18.263076641382025, + "learning_rate": 5.749618229681953e-08, + "loss": 1.1544, + "step": 14320 + }, + { + "epoch": 2.869939879759519, + "grad_norm": 24.7036035672074, + "learning_rate": 5.7320016404943666e-08, + "loss": 1.4628, + "step": 14321 + }, + { + "epoch": 2.870140280561122, + "grad_norm": 21.680385853965188, + "learning_rate": 5.714411925387464e-08, + "loss": 1.8023, + "step": 14322 + }, + { + "epoch": 2.8703406813627255, + "grad_norm": 23.334439646658822, + "learning_rate": 5.696849085317646e-08, + "loss": 1.2671, + "step": 14323 + }, + { + "epoch": 2.870541082164329, + "grad_norm": 
32.49970175038989, + "learning_rate": 5.67931312123976e-08, + "loss": 1.6077, + "step": 14324 + }, + { + "epoch": 2.870741482965932, + "grad_norm": 22.955249145737326, + "learning_rate": 5.661804034107377e-08, + "loss": 1.7169, + "step": 14325 + }, + { + "epoch": 2.870941883767535, + "grad_norm": 18.891739826432534, + "learning_rate": 5.644321824872345e-08, + "loss": 2.0959, + "step": 14326 + }, + { + "epoch": 2.8711422845691383, + "grad_norm": 19.469623809907294, + "learning_rate": 5.626866494485295e-08, + "loss": 1.7842, + "step": 14327 + }, + { + "epoch": 2.8713426853707413, + "grad_norm": 25.49103109860661, + "learning_rate": 5.609438043895243e-08, + "loss": 1.5699, + "step": 14328 + }, + { + "epoch": 2.8715430861723448, + "grad_norm": 55.44091986429108, + "learning_rate": 5.592036474049878e-08, + "loss": 1.6689, + "step": 14329 + }, + { + "epoch": 2.8717434869739478, + "grad_norm": 25.65462137268508, + "learning_rate": 5.574661785895274e-08, + "loss": 1.2906, + "step": 14330 + }, + { + "epoch": 2.871943887775551, + "grad_norm": 20.75134141789999, + "learning_rate": 5.557313980376122e-08, + "loss": 1.4779, + "step": 14331 + }, + { + "epoch": 2.872144288577154, + "grad_norm": 19.656766520011168, + "learning_rate": 5.539993058435722e-08, + "loss": 1.3391, + "step": 14332 + }, + { + "epoch": 2.8723446893787576, + "grad_norm": 22.155670535445704, + "learning_rate": 5.522699021015765e-08, + "loss": 1.4471, + "step": 14333 + }, + { + "epoch": 2.8725450901803606, + "grad_norm": 19.786623106735235, + "learning_rate": 5.50543186905661e-08, + "loss": 1.5447, + "step": 14334 + }, + { + "epoch": 2.872745490981964, + "grad_norm": 46.80487193476049, + "learning_rate": 5.4881916034970617e-08, + "loss": 1.4412, + "step": 14335 + }, + { + "epoch": 2.872945891783567, + "grad_norm": 23.69347682450168, + "learning_rate": 5.470978225274481e-08, + "loss": 1.6798, + "step": 14336 + }, + { + "epoch": 2.8731462925851705, + "grad_norm": 25.625369280511027, + "learning_rate": 5.453791735324787e-08, + "loss": 1.452, + "step": 14337 + }, + { + "epoch": 2.8733466933867735, + "grad_norm": 18.641762972052437, + "learning_rate": 5.436632134582453e-08, + "loss": 1.5306, + "step": 14338 + }, + { + "epoch": 2.8735470941883765, + "grad_norm": 23.89939172701862, + "learning_rate": 5.419499423980512e-08, + "loss": 1.4956, + "step": 14339 + }, + { + "epoch": 2.87374749498998, + "grad_norm": 17.142294016558836, + "learning_rate": 5.40239360445044e-08, + "loss": 1.519, + "step": 14340 + }, + { + "epoch": 2.8739478957915834, + "grad_norm": 17.68538217993138, + "learning_rate": 5.385314676922271e-08, + "loss": 1.2825, + "step": 14341 + }, + { + "epoch": 2.8741482965931864, + "grad_norm": 22.018472266985057, + "learning_rate": 5.368262642324762e-08, + "loss": 1.6389, + "step": 14342 + }, + { + "epoch": 2.8743486973947894, + "grad_norm": 28.00855118359158, + "learning_rate": 5.351237501584894e-08, + "loss": 1.4284, + "step": 14343 + }, + { + "epoch": 2.874549098196393, + "grad_norm": 28.81310649900234, + "learning_rate": 5.3342392556284264e-08, + "loss": 1.6971, + "step": 14344 + }, + { + "epoch": 2.8747494989979963, + "grad_norm": 24.432499092315776, + "learning_rate": 5.317267905379564e-08, + "loss": 1.42, + "step": 14345 + }, + { + "epoch": 2.8749498997995993, + "grad_norm": 33.20003778012563, + "learning_rate": 5.300323451761014e-08, + "loss": 1.8172, + "step": 14346 + }, + { + "epoch": 2.8751503006012022, + "grad_norm": 27.48306801449265, + "learning_rate": 5.2834058956941494e-08, + "loss": 1.5164, + "step": 14347 + }, + { + 
"epoch": 2.8753507014028057, + "grad_norm": 22.24094702003145, + "learning_rate": 5.2665152380987906e-08, + "loss": 1.0221, + "step": 14348 + }, + { + "epoch": 2.8755511022044087, + "grad_norm": 24.617582654245997, + "learning_rate": 5.249651479893369e-08, + "loss": 1.9285, + "step": 14349 + }, + { + "epoch": 2.875751503006012, + "grad_norm": 17.53491683289234, + "learning_rate": 5.232814621994597e-08, + "loss": 1.5375, + "step": 14350 + }, + { + "epoch": 2.875951903807615, + "grad_norm": 20.63731153768313, + "learning_rate": 5.216004665318075e-08, + "loss": 1.5156, + "step": 14351 + }, + { + "epoch": 2.8761523046092186, + "grad_norm": 18.24495914479357, + "learning_rate": 5.199221610777738e-08, + "loss": 1.4031, + "step": 14352 + }, + { + "epoch": 2.8763527054108216, + "grad_norm": 20.02158311882841, + "learning_rate": 5.182465459286135e-08, + "loss": 1.1201, + "step": 14353 + }, + { + "epoch": 2.876553106212425, + "grad_norm": 23.021104145357146, + "learning_rate": 5.165736211754313e-08, + "loss": 1.3991, + "step": 14354 + }, + { + "epoch": 2.876753507014028, + "grad_norm": 18.79160598594061, + "learning_rate": 5.149033869091824e-08, + "loss": 1.6014, + "step": 14355 + }, + { + "epoch": 2.8769539078156314, + "grad_norm": 23.32444565688459, + "learning_rate": 5.1323584322068856e-08, + "loss": 1.6221, + "step": 14356 + }, + { + "epoch": 2.8771543086172344, + "grad_norm": 18.10721461709681, + "learning_rate": 5.1157099020060494e-08, + "loss": 1.4145, + "step": 14357 + }, + { + "epoch": 2.877354709418838, + "grad_norm": 37.631585716400686, + "learning_rate": 5.099088279394593e-08, + "loss": 1.6998, + "step": 14358 + }, + { + "epoch": 2.877555110220441, + "grad_norm": 22.196036476347867, + "learning_rate": 5.0824935652762363e-08, + "loss": 1.3457, + "step": 14359 + }, + { + "epoch": 2.877755511022044, + "grad_norm": 20.447957748316494, + "learning_rate": 5.0659257605533143e-08, + "loss": 1.3958, + "step": 14360 + }, + { + "epoch": 2.8779559118236473, + "grad_norm": 15.818324653542703, + "learning_rate": 5.04938486612655e-08, + "loss": 1.6606, + "step": 14361 + }, + { + "epoch": 2.8781563126252507, + "grad_norm": 19.324968046031096, + "learning_rate": 5.032870882895391e-08, + "loss": 1.3765, + "step": 14362 + }, + { + "epoch": 2.8783567134268537, + "grad_norm": 13.714765942052464, + "learning_rate": 5.016383811757675e-08, + "loss": 0.959, + "step": 14363 + }, + { + "epoch": 2.8785571142284567, + "grad_norm": 21.223718919825338, + "learning_rate": 4.999923653609795e-08, + "loss": 1.2927, + "step": 14364 + }, + { + "epoch": 2.87875751503006, + "grad_norm": 22.99284672266324, + "learning_rate": 4.983490409346814e-08, + "loss": 1.3937, + "step": 14365 + }, + { + "epoch": 2.878957915831663, + "grad_norm": 23.296762668996074, + "learning_rate": 4.967084079862128e-08, + "loss": 1.4486, + "step": 14366 + }, + { + "epoch": 2.8791583166332666, + "grad_norm": 22.660346890719772, + "learning_rate": 4.9507046660478006e-08, + "loss": 1.3665, + "step": 14367 + }, + { + "epoch": 2.8793587174348696, + "grad_norm": 27.99261833474183, + "learning_rate": 4.9343521687943985e-08, + "loss": 1.4033, + "step": 14368 + }, + { + "epoch": 2.879559118236473, + "grad_norm": 20.99266422549201, + "learning_rate": 4.918026588991098e-08, + "loss": 2.1919, + "step": 14369 + }, + { + "epoch": 2.879759519038076, + "grad_norm": 22.873557808794533, + "learning_rate": 4.9017279275255236e-08, + "loss": 1.9467, + "step": 14370 + }, + { + "epoch": 2.8799599198396795, + "grad_norm": 25.18803913386924, + "learning_rate": 
4.885456185283799e-08, + "loss": 1.8229, + "step": 14371 + }, + { + "epoch": 2.8801603206412825, + "grad_norm": 22.735797653439974, + "learning_rate": 4.869211363150661e-08, + "loss": 1.2796, + "step": 14372 + }, + { + "epoch": 2.880360721442886, + "grad_norm": 24.341963789535296, + "learning_rate": 4.852993462009459e-08, + "loss": 1.2447, + "step": 14373 + }, + { + "epoch": 2.880561122244489, + "grad_norm": 17.167477437833995, + "learning_rate": 4.83680248274182e-08, + "loss": 1.5685, + "step": 14374 + }, + { + "epoch": 2.8807615230460923, + "grad_norm": 39.188259758539076, + "learning_rate": 4.820638426228208e-08, + "loss": 1.146, + "step": 14375 + }, + { + "epoch": 2.8809619238476953, + "grad_norm": 26.388156644452728, + "learning_rate": 4.804501293347419e-08, + "loss": 1.7538, + "step": 14376 + }, + { + "epoch": 2.8811623246492983, + "grad_norm": 39.33204296574365, + "learning_rate": 4.7883910849768624e-08, + "loss": 1.5118, + "step": 14377 + }, + { + "epoch": 2.8813627254509018, + "grad_norm": 20.974795768013575, + "learning_rate": 4.772307801992504e-08, + "loss": 1.8987, + "step": 14378 + }, + { + "epoch": 2.881563126252505, + "grad_norm": 64.40381635522799, + "learning_rate": 4.756251445268756e-08, + "loss": 2.2576, + "step": 14379 + }, + { + "epoch": 2.881763527054108, + "grad_norm": 22.006352552929908, + "learning_rate": 4.740222015678697e-08, + "loss": 1.7678, + "step": 14380 + }, + { + "epoch": 2.881963927855711, + "grad_norm": 18.458842027376, + "learning_rate": 4.724219514093798e-08, + "loss": 1.1479, + "step": 14381 + }, + { + "epoch": 2.8821643286573146, + "grad_norm": 25.381358820997303, + "learning_rate": 4.708243941384249e-08, + "loss": 1.82, + "step": 14382 + }, + { + "epoch": 2.882364729458918, + "grad_norm": 37.084687957082565, + "learning_rate": 4.692295298418526e-08, + "loss": 1.8947, + "step": 14383 + }, + { + "epoch": 2.882565130260521, + "grad_norm": 33.34935437966299, + "learning_rate": 4.676373586063876e-08, + "loss": 1.6014, + "step": 14384 + }, + { + "epoch": 2.882765531062124, + "grad_norm": 28.602268625420464, + "learning_rate": 4.6604788051859974e-08, + "loss": 1.5328, + "step": 14385 + }, + { + "epoch": 2.8829659318637275, + "grad_norm": 21.74926472144515, + "learning_rate": 4.644610956649032e-08, + "loss": 1.7826, + "step": 14386 + }, + { + "epoch": 2.8831663326653305, + "grad_norm": 23.153921497374192, + "learning_rate": 4.628770041315789e-08, + "loss": 1.9858, + "step": 14387 + }, + { + "epoch": 2.883366733466934, + "grad_norm": 21.678948334212773, + "learning_rate": 4.6129560600475244e-08, + "loss": 1.6789, + "step": 14388 + }, + { + "epoch": 2.883567134268537, + "grad_norm": 25.62171050761803, + "learning_rate": 4.5971690137041616e-08, + "loss": 1.5321, + "step": 14389 + }, + { + "epoch": 2.8837675350701404, + "grad_norm": 38.08539089694196, + "learning_rate": 4.581408903143958e-08, + "loss": 1.6588, + "step": 14390 + }, + { + "epoch": 2.8839679358717434, + "grad_norm": 21.6330547476456, + "learning_rate": 4.56567572922384e-08, + "loss": 2.2049, + "step": 14391 + }, + { + "epoch": 2.884168336673347, + "grad_norm": 24.754422471574166, + "learning_rate": 4.5499694927992885e-08, + "loss": 1.539, + "step": 14392 + }, + { + "epoch": 2.88436873747495, + "grad_norm": 21.436418153339567, + "learning_rate": 4.5342901947242334e-08, + "loss": 1.4102, + "step": 14393 + }, + { + "epoch": 2.8845691382765533, + "grad_norm": 16.39238871713112, + "learning_rate": 4.518637835851214e-08, + "loss": 1.2134, + "step": 14394 + }, + { + "epoch": 2.8847695390781563, + 
"grad_norm": 16.546597057458, + "learning_rate": 4.503012417031216e-08, + "loss": 1.4668, + "step": 14395 + }, + { + "epoch": 2.8849699398797597, + "grad_norm": 22.993438587677097, + "learning_rate": 4.487413939113894e-08, + "loss": 1.5054, + "step": 14396 + }, + { + "epoch": 2.8851703406813627, + "grad_norm": 22.21673460385866, + "learning_rate": 4.471842402947291e-08, + "loss": 1.4398, + "step": 14397 + }, + { + "epoch": 2.8853707414829657, + "grad_norm": 29.587162406913766, + "learning_rate": 4.456297809378063e-08, + "loss": 1.7977, + "step": 14398 + }, + { + "epoch": 2.885571142284569, + "grad_norm": 58.65457745947705, + "learning_rate": 4.440780159251479e-08, + "loss": 1.5291, + "step": 14399 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 40.14398346165151, + "learning_rate": 4.4252894534111413e-08, + "loss": 1.5767, + "step": 14400 + }, + { + "epoch": 2.8859719438877756, + "grad_norm": 23.444086681977655, + "learning_rate": 4.409825692699432e-08, + "loss": 1.4692, + "step": 14401 + }, + { + "epoch": 2.8861723446893786, + "grad_norm": 26.37340378456422, + "learning_rate": 4.394388877956956e-08, + "loss": 1.8221, + "step": 14402 + }, + { + "epoch": 2.886372745490982, + "grad_norm": 21.00143162709664, + "learning_rate": 4.378979010023154e-08, + "loss": 1.6758, + "step": 14403 + }, + { + "epoch": 2.8865731462925854, + "grad_norm": 31.412363974603256, + "learning_rate": 4.363596089735911e-08, + "loss": 2.0288, + "step": 14404 + }, + { + "epoch": 2.8867735470941884, + "grad_norm": 24.284373943956062, + "learning_rate": 4.348240117931501e-08, + "loss": 1.2612, + "step": 14405 + }, + { + "epoch": 2.8869739478957914, + "grad_norm": 26.27473050151368, + "learning_rate": 4.332911095445036e-08, + "loss": 1.7253, + "step": 14406 + }, + { + "epoch": 2.887174348697395, + "grad_norm": 20.71343714599608, + "learning_rate": 4.317609023109792e-08, + "loss": 1.6852, + "step": 14407 + }, + { + "epoch": 2.887374749498998, + "grad_norm": 18.072766840311438, + "learning_rate": 4.302333901757827e-08, + "loss": 1.5494, + "step": 14408 + }, + { + "epoch": 2.8875751503006013, + "grad_norm": 20.42603160620593, + "learning_rate": 4.287085732219698e-08, + "loss": 1.2452, + "step": 14409 + }, + { + "epoch": 2.8877755511022043, + "grad_norm": 21.69195286786234, + "learning_rate": 4.271864515324464e-08, + "loss": 1.3193, + "step": 14410 + }, + { + "epoch": 2.8879759519038077, + "grad_norm": 20.122422035336452, + "learning_rate": 4.256670251899797e-08, + "loss": 1.9608, + "step": 14411 + }, + { + "epoch": 2.8881763527054107, + "grad_norm": 18.672246183773932, + "learning_rate": 4.241502942771647e-08, + "loss": 1.79, + "step": 14412 + }, + { + "epoch": 2.888376753507014, + "grad_norm": 23.550618298026365, + "learning_rate": 4.226362588764799e-08, + "loss": 1.4523, + "step": 14413 + }, + { + "epoch": 2.888577154308617, + "grad_norm": 19.166574502568647, + "learning_rate": 4.2112491907024825e-08, + "loss": 1.2161, + "step": 14414 + }, + { + "epoch": 2.8887775551102206, + "grad_norm": 23.743851311364622, + "learning_rate": 4.1961627494063736e-08, + "loss": 1.3317, + "step": 14415 + }, + { + "epoch": 2.8889779559118236, + "grad_norm": 24.052055164088802, + "learning_rate": 4.181103265696818e-08, + "loss": 1.6666, + "step": 14416 + }, + { + "epoch": 2.889178356713427, + "grad_norm": 31.094195475151835, + "learning_rate": 4.1660707403924913e-08, + "loss": 1.6208, + "step": 14417 + }, + { + "epoch": 2.88937875751503, + "grad_norm": 21.604361766455348, + "learning_rate": 4.1510651743108534e-08, + "loss": 1.525, + 
"step": 14418 + }, + { + "epoch": 2.889579158316633, + "grad_norm": 26.158806101781877, + "learning_rate": 4.1360865682677495e-08, + "loss": 1.358, + "step": 14419 + }, + { + "epoch": 2.8897795591182365, + "grad_norm": 27.860535680078005, + "learning_rate": 4.121134923077641e-08, + "loss": 1.445, + "step": 14420 + }, + { + "epoch": 2.88997995991984, + "grad_norm": 27.48964978945957, + "learning_rate": 4.106210239553321e-08, + "loss": 1.6074, + "step": 14421 + }, + { + "epoch": 2.890180360721443, + "grad_norm": 20.161267690842074, + "learning_rate": 4.0913125185064186e-08, + "loss": 1.496, + "step": 14422 + }, + { + "epoch": 2.890380761523046, + "grad_norm": 51.35428104104469, + "learning_rate": 4.076441760746841e-08, + "loss": 1.8463, + "step": 14423 + }, + { + "epoch": 2.8905811623246493, + "grad_norm": 23.28725002968385, + "learning_rate": 4.061597967083164e-08, + "loss": 1.1543, + "step": 14424 + }, + { + "epoch": 2.8907815631262523, + "grad_norm": 23.03483872845528, + "learning_rate": 4.04678113832252e-08, + "loss": 1.7523, + "step": 14425 + }, + { + "epoch": 2.890981963927856, + "grad_norm": 15.200539059994101, + "learning_rate": 4.0319912752704306e-08, + "loss": 1.2493, + "step": 14426 + }, + { + "epoch": 2.8911823647294588, + "grad_norm": 19.21273360265981, + "learning_rate": 4.017228378731142e-08, + "loss": 1.3886, + "step": 14427 + }, + { + "epoch": 2.891382765531062, + "grad_norm": 26.863482387739257, + "learning_rate": 4.002492449507289e-08, + "loss": 1.645, + "step": 14428 + }, + { + "epoch": 2.891583166332665, + "grad_norm": 16.918425927444172, + "learning_rate": 3.98778348840001e-08, + "loss": 1.7906, + "step": 14429 + }, + { + "epoch": 2.8917835671342687, + "grad_norm": 20.05335702884362, + "learning_rate": 3.973101496209164e-08, + "loss": 1.3037, + "step": 14430 + }, + { + "epoch": 2.8919839679358716, + "grad_norm": 53.452823507204265, + "learning_rate": 3.958446473733002e-08, + "loss": 2.0377, + "step": 14431 + }, + { + "epoch": 2.892184368737475, + "grad_norm": 18.515192140339607, + "learning_rate": 3.94381842176833e-08, + "loss": 1.2089, + "step": 14432 + }, + { + "epoch": 2.892384769539078, + "grad_norm": 16.832320397022176, + "learning_rate": 3.9292173411105136e-08, + "loss": 1.672, + "step": 14433 + }, + { + "epoch": 2.8925851703406815, + "grad_norm": 22.401724233289045, + "learning_rate": 3.914643232553361e-08, + "loss": 1.7899, + "step": 14434 + }, + { + "epoch": 2.8927855711422845, + "grad_norm": 18.003340310828893, + "learning_rate": 3.9000960968894055e-08, + "loss": 1.1401, + "step": 14435 + }, + { + "epoch": 2.8929859719438875, + "grad_norm": 21.052578883005165, + "learning_rate": 3.885575934909513e-08, + "loss": 1.532, + "step": 14436 + }, + { + "epoch": 2.893186372745491, + "grad_norm": 21.558647621587635, + "learning_rate": 3.8710827474031655e-08, + "loss": 1.3014, + "step": 14437 + }, + { + "epoch": 2.8933867735470944, + "grad_norm": 20.88028837693529, + "learning_rate": 3.856616535158453e-08, + "loss": 1.6222, + "step": 14438 + }, + { + "epoch": 2.8935871743486974, + "grad_norm": 20.90362214913819, + "learning_rate": 3.8421772989618025e-08, + "loss": 1.1489, + "step": 14439 + }, + { + "epoch": 2.8937875751503004, + "grad_norm": 24.147254391933664, + "learning_rate": 3.827765039598419e-08, + "loss": 1.8793, + "step": 14440 + }, + { + "epoch": 2.893987975951904, + "grad_norm": 25.21231812600193, + "learning_rate": 3.813379757851843e-08, + "loss": 2.0906, + "step": 14441 + }, + { + "epoch": 2.8941883767535073, + "grad_norm": 26.03289826435024, + 
"learning_rate": 3.799021454504281e-08, + "loss": 1.5352, + "step": 14442 + }, + { + "epoch": 2.8943887775551103, + "grad_norm": 22.224351395055315, + "learning_rate": 3.7846901303363883e-08, + "loss": 1.4533, + "step": 14443 + }, + { + "epoch": 2.8945891783567133, + "grad_norm": 19.911610493857964, + "learning_rate": 3.770385786127373e-08, + "loss": 1.4929, + "step": 14444 + }, + { + "epoch": 2.8947895791583167, + "grad_norm": 23.939513930107164, + "learning_rate": 3.756108422654947e-08, + "loss": 1.8342, + "step": 14445 + }, + { + "epoch": 2.8949899799599197, + "grad_norm": 17.3742150976649, + "learning_rate": 3.7418580406954343e-08, + "loss": 1.72, + "step": 14446 + }, + { + "epoch": 2.895190380761523, + "grad_norm": 18.63857514492704, + "learning_rate": 3.7276346410236585e-08, + "loss": 1.8065, + "step": 14447 + }, + { + "epoch": 2.895390781563126, + "grad_norm": 21.859096110280298, + "learning_rate": 3.713438224412946e-08, + "loss": 1.9599, + "step": 14448 + }, + { + "epoch": 2.8955911823647296, + "grad_norm": 16.108916955110548, + "learning_rate": 3.6992687916351796e-08, + "loss": 1.1489, + "step": 14449 + }, + { + "epoch": 2.8957915831663326, + "grad_norm": 23.112384088129875, + "learning_rate": 3.6851263434607984e-08, + "loss": 1.6634, + "step": 14450 + }, + { + "epoch": 2.895991983967936, + "grad_norm": 22.30415627500973, + "learning_rate": 3.6710108806586876e-08, + "loss": 1.6404, + "step": 14451 + }, + { + "epoch": 2.896192384769539, + "grad_norm": 19.14076747490863, + "learning_rate": 3.6569224039963994e-08, + "loss": 1.3458, + "step": 14452 + }, + { + "epoch": 2.8963927855711424, + "grad_norm": 20.940468390773738, + "learning_rate": 3.6428609142398765e-08, + "loss": 1.3678, + "step": 14453 + }, + { + "epoch": 2.8965931863727454, + "grad_norm": 22.970128295092383, + "learning_rate": 3.6288264121537295e-08, + "loss": 1.654, + "step": 14454 + }, + { + "epoch": 2.896793587174349, + "grad_norm": 23.802633580725402, + "learning_rate": 3.6148188985010155e-08, + "loss": 1.1422, + "step": 14455 + }, + { + "epoch": 2.896993987975952, + "grad_norm": 19.506923828061748, + "learning_rate": 3.600838374043292e-08, + "loss": 1.6721, + "step": 14456 + }, + { + "epoch": 2.897194388777555, + "grad_norm": 18.37064342812161, + "learning_rate": 3.5868848395407275e-08, + "loss": 1.5591, + "step": 14457 + }, + { + "epoch": 2.8973947895791583, + "grad_norm": 23.351904385480076, + "learning_rate": 3.572958295752049e-08, + "loss": 1.5893, + "step": 14458 + }, + { + "epoch": 2.8975951903807617, + "grad_norm": 18.845676485777194, + "learning_rate": 3.559058743434374e-08, + "loss": 1.4398, + "step": 14459 + }, + { + "epoch": 2.8977955911823647, + "grad_norm": 16.822324677214237, + "learning_rate": 3.545186183343485e-08, + "loss": 0.9485, + "step": 14460 + }, + { + "epoch": 2.8979959919839677, + "grad_norm": 22.622384932235082, + "learning_rate": 3.531340616233725e-08, + "loss": 1.7047, + "step": 14461 + }, + { + "epoch": 2.898196392785571, + "grad_norm": 19.011471873448777, + "learning_rate": 3.517522042857768e-08, + "loss": 1.1579, + "step": 14462 + }, + { + "epoch": 2.8983967935871746, + "grad_norm": 38.186384977416544, + "learning_rate": 3.50373046396707e-08, + "loss": 1.349, + "step": 14463 + }, + { + "epoch": 2.8985971943887776, + "grad_norm": 21.252823541594164, + "learning_rate": 3.489965880311419e-08, + "loss": 1.4046, + "step": 14464 + }, + { + "epoch": 2.8987975951903806, + "grad_norm": 30.29109438903742, + "learning_rate": 3.4762282926392167e-08, + "loss": 2.1982, + "step": 14465 + }, + { + 
"epoch": 2.898997995991984, + "grad_norm": 18.64029083736275, + "learning_rate": 3.462517701697421e-08, + "loss": 1.3104, + "step": 14466 + }, + { + "epoch": 2.899198396793587, + "grad_norm": 24.35444180017305, + "learning_rate": 3.448834108231491e-08, + "loss": 2.0932, + "step": 14467 + }, + { + "epoch": 2.8993987975951905, + "grad_norm": 27.93356662274996, + "learning_rate": 3.4351775129854995e-08, + "loss": 2.2662, + "step": 14468 + }, + { + "epoch": 2.8995991983967935, + "grad_norm": 15.962001256065896, + "learning_rate": 3.421547916701795e-08, + "loss": 1.4193, + "step": 14469 + }, + { + "epoch": 2.899799599198397, + "grad_norm": 19.970410503109452, + "learning_rate": 3.4079453201216196e-08, + "loss": 1.5013, + "step": 14470 + }, + { + "epoch": 2.9, + "grad_norm": 25.63698865744112, + "learning_rate": 3.394369723984492e-08, + "loss": 2.0947, + "step": 14471 + }, + { + "epoch": 2.9002004008016034, + "grad_norm": 40.08795675035586, + "learning_rate": 3.3808211290284886e-08, + "loss": 1.6755, + "step": 14472 + }, + { + "epoch": 2.9004008016032063, + "grad_norm": 21.721084253284783, + "learning_rate": 3.367299535990354e-08, + "loss": 1.1739, + "step": 14473 + }, + { + "epoch": 2.9006012024048093, + "grad_norm": 24.178881259316285, + "learning_rate": 3.353804945605277e-08, + "loss": 1.8677, + "step": 14474 + }, + { + "epoch": 2.900801603206413, + "grad_norm": 21.58302197268267, + "learning_rate": 3.340337358606893e-08, + "loss": 1.2714, + "step": 14475 + }, + { + "epoch": 2.901002004008016, + "grad_norm": 19.685571065828956, + "learning_rate": 3.326896775727506e-08, + "loss": 1.3804, + "step": 14476 + }, + { + "epoch": 2.901202404809619, + "grad_norm": 33.128228630554894, + "learning_rate": 3.31348319769792e-08, + "loss": 1.3381, + "step": 14477 + }, + { + "epoch": 2.901402805611222, + "grad_norm": 23.998425357142402, + "learning_rate": 3.300096625247384e-08, + "loss": 1.5069, + "step": 14478 + }, + { + "epoch": 2.9016032064128257, + "grad_norm": 22.73692336321239, + "learning_rate": 3.2867370591038174e-08, + "loss": 1.6823, + "step": 14479 + }, + { + "epoch": 2.901803607214429, + "grad_norm": 31.967646659938552, + "learning_rate": 3.273404499993582e-08, + "loss": 1.438, + "step": 14480 + }, + { + "epoch": 2.902004008016032, + "grad_norm": 23.896584700891424, + "learning_rate": 3.260098948641599e-08, + "loss": 1.5924, + "step": 14481 + }, + { + "epoch": 2.902204408817635, + "grad_norm": 18.647593570812376, + "learning_rate": 3.2468204057712896e-08, + "loss": 1.2581, + "step": 14482 + }, + { + "epoch": 2.9024048096192385, + "grad_norm": 31.218715837158598, + "learning_rate": 3.2335688721045755e-08, + "loss": 1.4991, + "step": 14483 + }, + { + "epoch": 2.9026052104208415, + "grad_norm": 22.68374485928208, + "learning_rate": 3.2203443483620475e-08, + "loss": 1.5548, + "step": 14484 + }, + { + "epoch": 2.902805611222445, + "grad_norm": 26.662226455138363, + "learning_rate": 3.207146835262742e-08, + "loss": 1.2547, + "step": 14485 + }, + { + "epoch": 2.903006012024048, + "grad_norm": 21.163940543210515, + "learning_rate": 3.1939763335240845e-08, + "loss": 1.7123, + "step": 14486 + }, + { + "epoch": 2.9032064128256514, + "grad_norm": 21.91517804664682, + "learning_rate": 3.1808328438623916e-08, + "loss": 1.403, + "step": 14487 + }, + { + "epoch": 2.9034068136272544, + "grad_norm": 28.41261687387294, + "learning_rate": 3.167716366992202e-08, + "loss": 1.6499, + "step": 14488 + }, + { + "epoch": 2.903607214428858, + "grad_norm": 22.52191939255486, + "learning_rate": 3.154626903626612e-08, + 
"loss": 1.5199, + "step": 14489 + }, + { + "epoch": 2.903807615230461, + "grad_norm": 74.90273848466458, + "learning_rate": 3.141564454477386e-08, + "loss": 1.891, + "step": 14490 + }, + { + "epoch": 2.9040080160320643, + "grad_norm": 23.813645984235993, + "learning_rate": 3.1285290202547334e-08, + "loss": 1.3775, + "step": 14491 + }, + { + "epoch": 2.9042084168336673, + "grad_norm": 19.097588771795248, + "learning_rate": 3.115520601667421e-08, + "loss": 1.6895, + "step": 14492 + }, + { + "epoch": 2.9044088176352707, + "grad_norm": 15.816975394320975, + "learning_rate": 3.1025391994226604e-08, + "loss": 1.2147, + "step": 14493 + }, + { + "epoch": 2.9046092184368737, + "grad_norm": 19.82596075259055, + "learning_rate": 3.0895848142263874e-08, + "loss": 1.5648, + "step": 14494 + }, + { + "epoch": 2.9048096192384767, + "grad_norm": 21.33148314080459, + "learning_rate": 3.076657446782927e-08, + "loss": 1.3064, + "step": 14495 + }, + { + "epoch": 2.90501002004008, + "grad_norm": 17.78552028142959, + "learning_rate": 3.0637570977950507e-08, + "loss": 1.6148, + "step": 14496 + }, + { + "epoch": 2.9052104208416836, + "grad_norm": 18.667468304542616, + "learning_rate": 3.050883767964308e-08, + "loss": 1.8616, + "step": 14497 + }, + { + "epoch": 2.9054108216432866, + "grad_norm": 26.487226042307736, + "learning_rate": 3.038037457990528e-08, + "loss": 1.2904, + "step": 14498 + }, + { + "epoch": 2.9056112224448896, + "grad_norm": 46.10376142252502, + "learning_rate": 3.0252181685722636e-08, + "loss": 1.4439, + "step": 14499 + }, + { + "epoch": 2.905811623246493, + "grad_norm": 29.127231193856765, + "learning_rate": 3.012425900406457e-08, + "loss": 1.0909, + "step": 14500 + }, + { + "epoch": 2.9060120240480964, + "grad_norm": 30.022072314817507, + "learning_rate": 2.9996606541887185e-08, + "loss": 1.685, + "step": 14501 + }, + { + "epoch": 2.9062124248496994, + "grad_norm": 30.309061586609708, + "learning_rate": 2.9869224306130485e-08, + "loss": 1.3335, + "step": 14502 + }, + { + "epoch": 2.9064128256513024, + "grad_norm": 25.604962210993925, + "learning_rate": 2.9742112303720593e-08, + "loss": 1.8926, + "step": 14503 + }, + { + "epoch": 2.906613226452906, + "grad_norm": 27.83183346159523, + "learning_rate": 2.9615270541569207e-08, + "loss": 1.7649, + "step": 14504 + }, + { + "epoch": 2.906813627254509, + "grad_norm": 20.940397603006893, + "learning_rate": 2.948869902657192e-08, + "loss": 1.5608, + "step": 14505 + }, + { + "epoch": 2.9070140280561123, + "grad_norm": 22.470770260742235, + "learning_rate": 2.936239776561156e-08, + "loss": 1.8274, + "step": 14506 + }, + { + "epoch": 2.9072144288577153, + "grad_norm": 20.063616078770316, + "learning_rate": 2.9236366765554858e-08, + "loss": 1.5772, + "step": 14507 + }, + { + "epoch": 2.9074148296593187, + "grad_norm": 24.263306917696898, + "learning_rate": 2.911060603325466e-08, + "loss": 1.6361, + "step": 14508 + }, + { + "epoch": 2.9076152304609217, + "grad_norm": 22.593598745867254, + "learning_rate": 2.8985115575548285e-08, + "loss": 0.9446, + "step": 14509 + }, + { + "epoch": 2.907815631262525, + "grad_norm": 25.02965457704541, + "learning_rate": 2.8859895399259152e-08, + "loss": 1.684, + "step": 14510 + }, + { + "epoch": 2.908016032064128, + "grad_norm": 19.23360525529045, + "learning_rate": 2.873494551119571e-08, + "loss": 1.3014, + "step": 14511 + }, + { + "epoch": 2.9082164328657316, + "grad_norm": 15.76103730824573, + "learning_rate": 2.8610265918151413e-08, + "loss": 1.2346, + "step": 14512 + }, + { + "epoch": 2.9084168336673346, + "grad_norm": 
19.318920415023307, + "learning_rate": 2.8485856626905285e-08, + "loss": 1.215, + "step": 14513 + }, + { + "epoch": 2.908617234468938, + "grad_norm": 14.943256770081538, + "learning_rate": 2.8361717644221908e-08, + "loss": 1.1462, + "step": 14514 + }, + { + "epoch": 2.908817635270541, + "grad_norm": 51.22981940110272, + "learning_rate": 2.823784897685089e-08, + "loss": 1.7545, + "step": 14515 + }, + { + "epoch": 2.909018036072144, + "grad_norm": 22.48854218910506, + "learning_rate": 2.811425063152684e-08, + "loss": 1.4652, + "step": 14516 + }, + { + "epoch": 2.9092184368737475, + "grad_norm": 22.63041902861856, + "learning_rate": 2.7990922614969938e-08, + "loss": 1.3859, + "step": 14517 + }, + { + "epoch": 2.909418837675351, + "grad_norm": 26.2222409285412, + "learning_rate": 2.786786493388649e-08, + "loss": 1.707, + "step": 14518 + }, + { + "epoch": 2.909619238476954, + "grad_norm": 19.86604603259638, + "learning_rate": 2.7745077594966142e-08, + "loss": 1.3537, + "step": 14519 + }, + { + "epoch": 2.909819639278557, + "grad_norm": 20.3734142672724, + "learning_rate": 2.7622560604885773e-08, + "loss": 1.9918, + "step": 14520 + }, + { + "epoch": 2.9100200400801604, + "grad_norm": 24.496631829350004, + "learning_rate": 2.7500313970306723e-08, + "loss": 2.611, + "step": 14521 + }, + { + "epoch": 2.910220440881764, + "grad_norm": 22.605335194840514, + "learning_rate": 2.73783376978759e-08, + "loss": 1.3933, + "step": 14522 + }, + { + "epoch": 2.910420841683367, + "grad_norm": 33.32195025135543, + "learning_rate": 2.725663179422522e-08, + "loss": 1.8507, + "step": 14523 + }, + { + "epoch": 2.91062124248497, + "grad_norm": 15.990454206902951, + "learning_rate": 2.713519626597161e-08, + "loss": 1.5176, + "step": 14524 + }, + { + "epoch": 2.9108216432865732, + "grad_norm": 17.237523143183797, + "learning_rate": 2.701403111971812e-08, + "loss": 1.6334, + "step": 14525 + }, + { + "epoch": 2.911022044088176, + "grad_norm": 45.572957579412765, + "learning_rate": 2.689313636205282e-08, + "loss": 2.05, + "step": 14526 + }, + { + "epoch": 2.9112224448897797, + "grad_norm": 25.289825657963775, + "learning_rate": 2.6772511999548225e-08, + "loss": 1.4259, + "step": 14527 + }, + { + "epoch": 2.9114228456913827, + "grad_norm": 20.918295101828498, + "learning_rate": 2.665215803876353e-08, + "loss": 1.4377, + "step": 14528 + }, + { + "epoch": 2.911623246492986, + "grad_norm": 17.42386901102022, + "learning_rate": 2.65320744862424e-08, + "loss": 1.5824, + "step": 14529 + }, + { + "epoch": 2.911823647294589, + "grad_norm": 18.79461253851847, + "learning_rate": 2.6412261348513492e-08, + "loss": 1.3704, + "step": 14530 + }, + { + "epoch": 2.9120240480961925, + "grad_norm": 20.32712350893373, + "learning_rate": 2.629271863209215e-08, + "loss": 1.4405, + "step": 14531 + }, + { + "epoch": 2.9122244488977955, + "grad_norm": 19.78909588682123, + "learning_rate": 2.617344634347707e-08, + "loss": 1.0045, + "step": 14532 + }, + { + "epoch": 2.9124248496993985, + "grad_norm": 24.22797455901645, + "learning_rate": 2.6054444489154174e-08, + "loss": 1.9546, + "step": 14533 + }, + { + "epoch": 2.912625250501002, + "grad_norm": 24.26597744416973, + "learning_rate": 2.5935713075593283e-08, + "loss": 2.1152, + "step": 14534 + }, + { + "epoch": 2.9128256513026054, + "grad_norm": 21.96851222596927, + "learning_rate": 2.581725210924979e-08, + "loss": 1.8473, + "step": 14535 + }, + { + "epoch": 2.9130260521042084, + "grad_norm": 23.172024517417004, + "learning_rate": 2.5699061596565212e-08, + "loss": 1.7305, + "step": 14536 + }, + { + 
"epoch": 2.9132264529058114, + "grad_norm": 31.26979774428668, + "learning_rate": 2.558114154396496e-08, + "loss": 1.1146, + "step": 14537 + }, + { + "epoch": 2.913426853707415, + "grad_norm": 17.03523834334537, + "learning_rate": 2.546349195786113e-08, + "loss": 1.6972, + "step": 14538 + }, + { + "epoch": 2.9136272545090183, + "grad_norm": 19.360626283402286, + "learning_rate": 2.5346112844650826e-08, + "loss": 1.4594, + "step": 14539 + }, + { + "epoch": 2.9138276553106213, + "grad_norm": 19.146691523898784, + "learning_rate": 2.5229004210715058e-08, + "loss": 1.1312, + "step": 14540 + }, + { + "epoch": 2.9140280561122243, + "grad_norm": 18.551105024536028, + "learning_rate": 2.5112166062421504e-08, + "loss": 1.2888, + "step": 14541 + }, + { + "epoch": 2.9142284569138277, + "grad_norm": 20.926261626042905, + "learning_rate": 2.4995598406123977e-08, + "loss": 1.3142, + "step": 14542 + }, + { + "epoch": 2.9144288577154307, + "grad_norm": 23.357297927543694, + "learning_rate": 2.487930124815907e-08, + "loss": 1.7972, + "step": 14543 + }, + { + "epoch": 2.914629258517034, + "grad_norm": 25.57274082869065, + "learning_rate": 2.476327459485062e-08, + "loss": 1.8388, + "step": 14544 + }, + { + "epoch": 2.914829659318637, + "grad_norm": 24.061311221156004, + "learning_rate": 2.4647518452506903e-08, + "loss": 1.2564, + "step": 14545 + }, + { + "epoch": 2.9150300601202406, + "grad_norm": 25.827127754654764, + "learning_rate": 2.453203282742178e-08, + "loss": 1.8637, + "step": 14546 + }, + { + "epoch": 2.9152304609218436, + "grad_norm": 28.831583913055496, + "learning_rate": 2.4416817725874675e-08, + "loss": 1.6967, + "step": 14547 + }, + { + "epoch": 2.915430861723447, + "grad_norm": 21.326290102860863, + "learning_rate": 2.4301873154130017e-08, + "loss": 1.4564, + "step": 14548 + }, + { + "epoch": 2.91563126252505, + "grad_norm": 67.75487883637726, + "learning_rate": 2.4187199118437256e-08, + "loss": 1.5347, + "step": 14549 + }, + { + "epoch": 2.9158316633266534, + "grad_norm": 20.78306093304139, + "learning_rate": 2.4072795625030844e-08, + "loss": 1.9475, + "step": 14550 + }, + { + "epoch": 2.9160320641282564, + "grad_norm": 19.921428229796675, + "learning_rate": 2.395866268013247e-08, + "loss": 1.3537, + "step": 14551 + }, + { + "epoch": 2.91623246492986, + "grad_norm": 21.058738235532708, + "learning_rate": 2.3844800289946623e-08, + "loss": 1.897, + "step": 14552 + }, + { + "epoch": 2.916432865731463, + "grad_norm": 20.751155213858084, + "learning_rate": 2.3731208460664456e-08, + "loss": 1.5071, + "step": 14553 + }, + { + "epoch": 2.916633266533066, + "grad_norm": 20.056672836069545, + "learning_rate": 2.3617887198462143e-08, + "loss": 1.7616, + "step": 14554 + }, + { + "epoch": 2.9168336673346693, + "grad_norm": 22.440614112220764, + "learning_rate": 2.350483650950086e-08, + "loss": 1.8892, + "step": 14555 + }, + { + "epoch": 2.9170340681362728, + "grad_norm": 25.768397485779143, + "learning_rate": 2.339205639992792e-08, + "loss": 1.3221, + "step": 14556 + }, + { + "epoch": 2.9172344689378757, + "grad_norm": 21.29005713072659, + "learning_rate": 2.3279546875874527e-08, + "loss": 1.3614, + "step": 14557 + }, + { + "epoch": 2.9174348697394787, + "grad_norm": 22.283222883108184, + "learning_rate": 2.3167307943459115e-08, + "loss": 1.7913, + "step": 14558 + }, + { + "epoch": 2.917635270541082, + "grad_norm": 30.206456019003863, + "learning_rate": 2.305533960878348e-08, + "loss": 1.4937, + "step": 14559 + }, + { + "epoch": 2.9178356713426856, + "grad_norm": 21.863400057931155, + "learning_rate": 
2.294364187793552e-08, + "loss": 1.8688, + "step": 14560 + }, + { + "epoch": 2.9180360721442886, + "grad_norm": 18.95571809587651, + "learning_rate": 2.2832214756988714e-08, + "loss": 1.4008, + "step": 14561 + }, + { + "epoch": 2.9182364729458916, + "grad_norm": 27.751508505639393, + "learning_rate": 2.272105825200155e-08, + "loss": 1.3982, + "step": 14562 + }, + { + "epoch": 2.918436873747495, + "grad_norm": 20.69106613539909, + "learning_rate": 2.2610172369016968e-08, + "loss": 1.3404, + "step": 14563 + }, + { + "epoch": 2.918637274549098, + "grad_norm": 21.772267957156604, + "learning_rate": 2.249955711406515e-08, + "loss": 1.2494, + "step": 14564 + }, + { + "epoch": 2.9188376753507015, + "grad_norm": 18.88651938648475, + "learning_rate": 2.2389212493159616e-08, + "loss": 2.0453, + "step": 14565 + }, + { + "epoch": 2.9190380761523045, + "grad_norm": 21.35773566949132, + "learning_rate": 2.227913851230057e-08, + "loss": 1.2039, + "step": 14566 + }, + { + "epoch": 2.919238476953908, + "grad_norm": 23.461812865371076, + "learning_rate": 2.2169335177472106e-08, + "loss": 1.3196, + "step": 14567 + }, + { + "epoch": 2.919438877755511, + "grad_norm": 23.800824882230305, + "learning_rate": 2.205980249464501e-08, + "loss": 1.7753, + "step": 14568 + }, + { + "epoch": 2.9196392785571144, + "grad_norm": 25.348488667203668, + "learning_rate": 2.1950540469774516e-08, + "loss": 1.6382, + "step": 14569 + }, + { + "epoch": 2.9198396793587174, + "grad_norm": 24.238277724795452, + "learning_rate": 2.1841549108801984e-08, + "loss": 1.6191, + "step": 14570 + }, + { + "epoch": 2.920040080160321, + "grad_norm": 21.979002298229418, + "learning_rate": 2.173282841765212e-08, + "loss": 1.4842, + "step": 14571 + }, + { + "epoch": 2.920240480961924, + "grad_norm": 29.53943325884435, + "learning_rate": 2.1624378402236856e-08, + "loss": 1.4891, + "step": 14572 + }, + { + "epoch": 2.9204408817635272, + "grad_norm": 45.20801870720136, + "learning_rate": 2.15161990684537e-08, + "loss": 1.9441, + "step": 14573 + }, + { + "epoch": 2.9206412825651302, + "grad_norm": 31.50213673977768, + "learning_rate": 2.1408290422182955e-08, + "loss": 1.4037, + "step": 14574 + }, + { + "epoch": 2.920841683366733, + "grad_norm": 18.593051943017702, + "learning_rate": 2.1300652469292693e-08, + "loss": 0.9537, + "step": 14575 + }, + { + "epoch": 2.9210420841683367, + "grad_norm": 23.171808369740727, + "learning_rate": 2.1193285215635462e-08, + "loss": 2.1565, + "step": 14576 + }, + { + "epoch": 2.92124248496994, + "grad_norm": 21.340888058128492, + "learning_rate": 2.1086188667048814e-08, + "loss": 1.7975, + "step": 14577 + }, + { + "epoch": 2.921442885771543, + "grad_norm": 17.210803778234173, + "learning_rate": 2.097936282935531e-08, + "loss": 1.2982, + "step": 14578 + }, + { + "epoch": 2.921643286573146, + "grad_norm": 23.0583542501403, + "learning_rate": 2.087280770836364e-08, + "loss": 1.2953, + "step": 14579 + }, + { + "epoch": 2.9218436873747495, + "grad_norm": 23.05803711067991, + "learning_rate": 2.0766523309867502e-08, + "loss": 1.4942, + "step": 14580 + }, + { + "epoch": 2.922044088176353, + "grad_norm": 24.295900549081843, + "learning_rate": 2.0660509639645054e-08, + "loss": 1.603, + "step": 14581 + }, + { + "epoch": 2.922244488977956, + "grad_norm": 17.97778673763381, + "learning_rate": 2.0554766703461127e-08, + "loss": 1.027, + "step": 14582 + }, + { + "epoch": 2.922444889779559, + "grad_norm": 25.04607413714632, + "learning_rate": 2.044929450706501e-08, + "loss": 1.2237, + "step": 14583 + }, + { + "epoch": 2.9226452905811624, 
+ "grad_norm": 41.795323151020206, + "learning_rate": 2.0344093056191004e-08, + "loss": 0.936, + "step": 14584 + }, + { + "epoch": 2.9228456913827654, + "grad_norm": 21.158155032877296, + "learning_rate": 2.0239162356559537e-08, + "loss": 1.1516, + "step": 14585 + }, + { + "epoch": 2.923046092184369, + "grad_norm": 16.891937178492814, + "learning_rate": 2.013450241387549e-08, + "loss": 1.2249, + "step": 14586 + }, + { + "epoch": 2.923246492985972, + "grad_norm": 30.25914593411585, + "learning_rate": 2.0030113233829306e-08, + "loss": 1.1484, + "step": 14587 + }, + { + "epoch": 2.9234468937875753, + "grad_norm": 20.119085098162014, + "learning_rate": 1.9925994822097006e-08, + "loss": 1.5472, + "step": 14588 + }, + { + "epoch": 2.9236472945891783, + "grad_norm": 22.446128309361487, + "learning_rate": 1.982214718434017e-08, + "loss": 1.3942, + "step": 14589 + }, + { + "epoch": 2.9238476953907817, + "grad_norm": 22.41773231391634, + "learning_rate": 1.9718570326204278e-08, + "loss": 1.6634, + "step": 14590 + }, + { + "epoch": 2.9240480961923847, + "grad_norm": 20.635836698238233, + "learning_rate": 1.961526425332094e-08, + "loss": 1.618, + "step": 14591 + }, + { + "epoch": 2.9242484969939877, + "grad_norm": 22.132582808315526, + "learning_rate": 1.951222897130789e-08, + "loss": 1.2303, + "step": 14592 + }, + { + "epoch": 2.924448897795591, + "grad_norm": 65.60595049504613, + "learning_rate": 1.940946448576675e-08, + "loss": 1.7841, + "step": 14593 + }, + { + "epoch": 2.9246492985971946, + "grad_norm": 22.004186066781696, + "learning_rate": 1.930697080228472e-08, + "loss": 1.4579, + "step": 14594 + }, + { + "epoch": 2.9248496993987976, + "grad_norm": 21.805179552509966, + "learning_rate": 1.920474792643512e-08, + "loss": 1.3888, + "step": 14595 + }, + { + "epoch": 2.9250501002004006, + "grad_norm": 20.155452494453527, + "learning_rate": 1.9102795863775725e-08, + "loss": 1.4659, + "step": 14596 + }, + { + "epoch": 2.925250501002004, + "grad_norm": 17.319720654672846, + "learning_rate": 1.9001114619849882e-08, + "loss": 1.4556, + "step": 14597 + }, + { + "epoch": 2.9254509018036075, + "grad_norm": 27.66166742994407, + "learning_rate": 1.8899704200185942e-08, + "loss": 1.6219, + "step": 14598 + }, + { + "epoch": 2.9256513026052104, + "grad_norm": 18.729944286257773, + "learning_rate": 1.879856461029783e-08, + "loss": 1.2778, + "step": 14599 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 24.883906050733604, + "learning_rate": 1.8697695855684484e-08, + "loss": 1.1995, + "step": 14600 + }, + { + "epoch": 2.926052104208417, + "grad_norm": 22.76090597088475, + "learning_rate": 1.8597097941830955e-08, + "loss": 1.5418, + "step": 14601 + }, + { + "epoch": 2.92625250501002, + "grad_norm": 22.425747782744878, + "learning_rate": 1.8496770874206204e-08, + "loss": 1.1391, + "step": 14602 + }, + { + "epoch": 2.9264529058116233, + "grad_norm": 17.62986999565997, + "learning_rate": 1.8396714658265313e-08, + "loss": 1.7124, + "step": 14603 + }, + { + "epoch": 2.9266533066132263, + "grad_norm": 18.92210010976009, + "learning_rate": 1.8296929299448374e-08, + "loss": 1.8271, + "step": 14604 + }, + { + "epoch": 2.9268537074148298, + "grad_norm": 20.5015190726504, + "learning_rate": 1.8197414803181602e-08, + "loss": 1.5962, + "step": 14605 + }, + { + "epoch": 2.9270541082164327, + "grad_norm": 23.121543502562698, + "learning_rate": 1.8098171174875113e-08, + "loss": 1.3488, + "step": 14606 + }, + { + "epoch": 2.927254509018036, + "grad_norm": 28.93313932014294, + "learning_rate": 1.7999198419924592e-08, + 
"loss": 2.0784, + "step": 14607 + }, + { + "epoch": 2.927454909819639, + "grad_norm": 20.929045361831104, + "learning_rate": 1.7900496543711843e-08, + "loss": 1.4274, + "step": 14608 + }, + { + "epoch": 2.9276553106212426, + "grad_norm": 17.491407693716198, + "learning_rate": 1.7802065551603688e-08, + "loss": 1.7552, + "step": 14609 + }, + { + "epoch": 2.9278557114228456, + "grad_norm": 24.811749787254243, + "learning_rate": 1.7703905448951398e-08, + "loss": 1.4989, + "step": 14610 + }, + { + "epoch": 2.928056112224449, + "grad_norm": 25.589607484603825, + "learning_rate": 1.760601624109237e-08, + "loss": 1.5902, + "step": 14611 + }, + { + "epoch": 2.928256513026052, + "grad_norm": 22.648918150136385, + "learning_rate": 1.750839793334902e-08, + "loss": 1.6945, + "step": 14612 + }, + { + "epoch": 2.928456913827655, + "grad_norm": 22.92844839151868, + "learning_rate": 1.741105053102876e-08, + "loss": 1.7125, + "step": 14613 + }, + { + "epoch": 2.9286573146292585, + "grad_norm": 29.768622715310585, + "learning_rate": 1.7313974039424586e-08, + "loss": 2.0343, + "step": 14614 + }, + { + "epoch": 2.928857715430862, + "grad_norm": 22.003056385176155, + "learning_rate": 1.7217168463815047e-08, + "loss": 1.7598, + "step": 14615 + }, + { + "epoch": 2.929058116232465, + "grad_norm": 18.55314771515125, + "learning_rate": 1.7120633809463162e-08, + "loss": 1.2055, + "step": 14616 + }, + { + "epoch": 2.929258517034068, + "grad_norm": 23.754761388448188, + "learning_rate": 1.7024370081617504e-08, + "loss": 1.2126, + "step": 14617 + }, + { + "epoch": 2.9294589178356714, + "grad_norm": 35.31897492261565, + "learning_rate": 1.6928377285512776e-08, + "loss": 1.3238, + "step": 14618 + }, + { + "epoch": 2.929659318637275, + "grad_norm": 27.50287572398578, + "learning_rate": 1.6832655426367585e-08, + "loss": 1.3808, + "step": 14619 + }, + { + "epoch": 2.929859719438878, + "grad_norm": 24.60681635219125, + "learning_rate": 1.6737204509387206e-08, + "loss": 1.7418, + "step": 14620 + }, + { + "epoch": 2.930060120240481, + "grad_norm": 19.66639364826394, + "learning_rate": 1.6642024539760826e-08, + "loss": 1.8114, + "step": 14621 + }, + { + "epoch": 2.9302605210420842, + "grad_norm": 17.640049654417737, + "learning_rate": 1.6547115522663192e-08, + "loss": 1.4532, + "step": 14622 + }, + { + "epoch": 2.9304609218436872, + "grad_norm": 36.7772483813929, + "learning_rate": 1.6452477463255733e-08, + "loss": 1.4371, + "step": 14623 + }, + { + "epoch": 2.9306613226452907, + "grad_norm": 18.203437251216364, + "learning_rate": 1.6358110366683222e-08, + "loss": 1.4905, + "step": 14624 + }, + { + "epoch": 2.9308617234468937, + "grad_norm": 24.88713740469273, + "learning_rate": 1.626401423807711e-08, + "loss": 1.5675, + "step": 14625 + }, + { + "epoch": 2.931062124248497, + "grad_norm": 19.308450211595847, + "learning_rate": 1.6170189082552747e-08, + "loss": 1.5351, + "step": 14626 + }, + { + "epoch": 2.9312625250501, + "grad_norm": 24.229464107934135, + "learning_rate": 1.6076634905212164e-08, + "loss": 1.7196, + "step": 14627 + }, + { + "epoch": 2.9314629258517035, + "grad_norm": 17.823906897548788, + "learning_rate": 1.5983351711141848e-08, + "loss": 1.5461, + "step": 14628 + }, + { + "epoch": 2.9316633266533065, + "grad_norm": 17.930475596931227, + "learning_rate": 1.5890339505413853e-08, + "loss": 1.3859, + "step": 14629 + }, + { + "epoch": 2.93186372745491, + "grad_norm": 20.481691620295262, + "learning_rate": 1.579759829308525e-08, + "loss": 1.7299, + "step": 14630 + }, + { + "epoch": 2.932064128256513, + "grad_norm": 
22.25966058760962, + "learning_rate": 1.570512807919866e-08, + "loss": 1.6807, + "step": 14631 + }, + { + "epoch": 2.9322645290581164, + "grad_norm": 24.229762241730484, + "learning_rate": 1.561292886878174e-08, + "loss": 1.4308, + "step": 14632 + }, + { + "epoch": 2.9324649298597194, + "grad_norm": 24.074936721540976, + "learning_rate": 1.5521000666847695e-08, + "loss": 1.4755, + "step": 14633 + }, + { + "epoch": 2.9326653306613224, + "grad_norm": 21.46383888673229, + "learning_rate": 1.54293434783942e-08, + "loss": 1.0518, + "step": 14634 + }, + { + "epoch": 2.932865731462926, + "grad_norm": 20.50110524876781, + "learning_rate": 1.5337957308405594e-08, + "loss": 1.4847, + "step": 14635 + }, + { + "epoch": 2.9330661322645293, + "grad_norm": 19.859268101503822, + "learning_rate": 1.524684216185013e-08, + "loss": 1.0785, + "step": 14636 + }, + { + "epoch": 2.9332665330661323, + "grad_norm": 19.519236474059372, + "learning_rate": 1.5155998043682174e-08, + "loss": 1.528, + "step": 14637 + }, + { + "epoch": 2.9334669338677353, + "grad_norm": 18.200876834071234, + "learning_rate": 1.5065424958840558e-08, + "loss": 1.1656, + "step": 14638 + }, + { + "epoch": 2.9336673346693387, + "grad_norm": 22.238892518777295, + "learning_rate": 1.497512291225023e-08, + "loss": 1.7064, + "step": 14639 + }, + { + "epoch": 2.933867735470942, + "grad_norm": 19.569645759841453, + "learning_rate": 1.4885091908821147e-08, + "loss": 1.3337, + "step": 14640 + }, + { + "epoch": 2.934068136272545, + "grad_norm": 35.70095205526906, + "learning_rate": 1.4795331953448288e-08, + "loss": 1.5025, + "step": 14641 + }, + { + "epoch": 2.934268537074148, + "grad_norm": 25.065282226182063, + "learning_rate": 1.4705843051012193e-08, + "loss": 1.7234, + "step": 14642 + }, + { + "epoch": 2.9344689378757516, + "grad_norm": 23.96939505349903, + "learning_rate": 1.461662520637841e-08, + "loss": 1.1212, + "step": 14643 + }, + { + "epoch": 2.9346693386773546, + "grad_norm": 19.410567465690246, + "learning_rate": 1.4527678424398062e-08, + "loss": 1.5976, + "step": 14644 + }, + { + "epoch": 2.934869739478958, + "grad_norm": 23.425546269729537, + "learning_rate": 1.4439002709906724e-08, + "loss": 1.5341, + "step": 14645 + }, + { + "epoch": 2.935070140280561, + "grad_norm": 19.447265208150107, + "learning_rate": 1.4350598067726096e-08, + "loss": 1.4657, + "step": 14646 + }, + { + "epoch": 2.9352705410821645, + "grad_norm": 18.31757826527878, + "learning_rate": 1.4262464502663442e-08, + "loss": 1.3217, + "step": 14647 + }, + { + "epoch": 2.9354709418837674, + "grad_norm": 22.601344490136906, + "learning_rate": 1.4174602019509376e-08, + "loss": 1.5167, + "step": 14648 + }, + { + "epoch": 2.935671342685371, + "grad_norm": 22.637657510462585, + "learning_rate": 1.4087010623042852e-08, + "loss": 1.8304, + "step": 14649 + }, + { + "epoch": 2.935871743486974, + "grad_norm": 21.94356428611182, + "learning_rate": 1.3999690318024506e-08, + "loss": 1.4773, + "step": 14650 + }, + { + "epoch": 2.936072144288577, + "grad_norm": 18.99925680641878, + "learning_rate": 1.3912641109203873e-08, + "loss": 1.784, + "step": 14651 + }, + { + "epoch": 2.9362725450901803, + "grad_norm": 49.81445099520561, + "learning_rate": 1.3825863001312723e-08, + "loss": 1.4173, + "step": 14652 + }, + { + "epoch": 2.9364729458917838, + "grad_norm": 29.539575851407395, + "learning_rate": 1.3739355999069504e-08, + "loss": 1.9833, + "step": 14653 + }, + { + "epoch": 2.9366733466933868, + "grad_norm": 22.35309807695939, + "learning_rate": 1.365312010717823e-08, + "loss": 1.5106, + 
"step": 14654 + }, + { + "epoch": 2.9368737474949898, + "grad_norm": 21.138472092458457, + "learning_rate": 1.356715533032682e-08, + "loss": 1.611, + "step": 14655 + }, + { + "epoch": 2.937074148296593, + "grad_norm": 18.708144040655455, + "learning_rate": 1.3481461673190421e-08, + "loss": 1.3161, + "step": 14656 + }, + { + "epoch": 2.9372745490981966, + "grad_norm": 16.830085355651917, + "learning_rate": 1.339603914042753e-08, + "loss": 1.4054, + "step": 14657 + }, + { + "epoch": 2.9374749498997996, + "grad_norm": 48.06871476867569, + "learning_rate": 1.3310887736682765e-08, + "loss": 1.8003, + "step": 14658 + }, + { + "epoch": 2.9376753507014026, + "grad_norm": 22.503306337898294, + "learning_rate": 1.322600746658631e-08, + "loss": 1.8277, + "step": 14659 + }, + { + "epoch": 2.937875751503006, + "grad_norm": 35.113961157739666, + "learning_rate": 1.3141398334752809e-08, + "loss": 1.233, + "step": 14660 + }, + { + "epoch": 2.938076152304609, + "grad_norm": 24.141883330321637, + "learning_rate": 1.3057060345782468e-08, + "loss": 1.6488, + "step": 14661 + }, + { + "epoch": 2.9382765531062125, + "grad_norm": 20.959861495496757, + "learning_rate": 1.2972993504261622e-08, + "loss": 1.5169, + "step": 14662 + }, + { + "epoch": 2.9384769539078155, + "grad_norm": 25.019671693080017, + "learning_rate": 1.2889197814760501e-08, + "loss": 1.3659, + "step": 14663 + }, + { + "epoch": 2.938677354709419, + "grad_norm": 22.179446041816313, + "learning_rate": 1.2805673281835462e-08, + "loss": 1.6107, + "step": 14664 + }, + { + "epoch": 2.938877755511022, + "grad_norm": 23.87765181927027, + "learning_rate": 1.2722419910027318e-08, + "loss": 1.6376, + "step": 14665 + }, + { + "epoch": 2.9390781563126254, + "grad_norm": 16.455685469355767, + "learning_rate": 1.2639437703863556e-08, + "loss": 1.6005, + "step": 14666 + }, + { + "epoch": 2.9392785571142284, + "grad_norm": 32.22361189314923, + "learning_rate": 1.2556726667855012e-08, + "loss": 1.7763, + "step": 14667 + }, + { + "epoch": 2.939478957915832, + "grad_norm": 22.998648106534596, + "learning_rate": 1.2474286806500313e-08, + "loss": 1.3483, + "step": 14668 + }, + { + "epoch": 2.939679358717435, + "grad_norm": 20.314510507119493, + "learning_rate": 1.2392118124279762e-08, + "loss": 1.4181, + "step": 14669 + }, + { + "epoch": 2.9398797595190382, + "grad_norm": 44.59517637589048, + "learning_rate": 1.2310220625663116e-08, + "loss": 1.4528, + "step": 14670 + }, + { + "epoch": 2.9400801603206412, + "grad_norm": 20.117967492029322, + "learning_rate": 1.2228594315101816e-08, + "loss": 1.5844, + "step": 14671 + }, + { + "epoch": 2.9402805611222442, + "grad_norm": 21.474886505072753, + "learning_rate": 1.2147239197034532e-08, + "loss": 1.0383, + "step": 14672 + }, + { + "epoch": 2.9404809619238477, + "grad_norm": 20.225496380658182, + "learning_rate": 1.2066155275884949e-08, + "loss": 1.7288, + "step": 14673 + }, + { + "epoch": 2.940681362725451, + "grad_norm": 20.02084154515142, + "learning_rate": 1.1985342556060653e-08, + "loss": 1.8046, + "step": 14674 + }, + { + "epoch": 2.940881763527054, + "grad_norm": 25.19338717904858, + "learning_rate": 1.1904801041957015e-08, + "loss": 1.6044, + "step": 14675 + }, + { + "epoch": 2.941082164328657, + "grad_norm": 28.352954485340376, + "learning_rate": 1.1824530737951645e-08, + "loss": 1.2141, + "step": 14676 + }, + { + "epoch": 2.9412825651302605, + "grad_norm": 21.665324261799213, + "learning_rate": 1.1744531648410495e-08, + "loss": 1.6818, + "step": 14677 + }, + { + "epoch": 2.941482965931864, + "grad_norm": 
18.150150243663578, + "learning_rate": 1.1664803777682309e-08, + "loss": 1.4413, + "step": 14678 + }, + { + "epoch": 2.941683366733467, + "grad_norm": 17.875446247570938, + "learning_rate": 1.1585347130101953e-08, + "loss": 1.186, + "step": 14679 + }, + { + "epoch": 2.94188376753507, + "grad_norm": 17.621137080782383, + "learning_rate": 1.1506161709989861e-08, + "loss": 1.5344, + "step": 14680 + }, + { + "epoch": 2.9420841683366734, + "grad_norm": 17.435902593084073, + "learning_rate": 1.1427247521651474e-08, + "loss": 1.9959, + "step": 14681 + }, + { + "epoch": 2.9422845691382764, + "grad_norm": 19.579080210715286, + "learning_rate": 1.1348604569377808e-08, + "loss": 1.702, + "step": 14682 + }, + { + "epoch": 2.94248496993988, + "grad_norm": 21.326415913686123, + "learning_rate": 1.127023285744433e-08, + "loss": 1.6505, + "step": 14683 + }, + { + "epoch": 2.942685370741483, + "grad_norm": 22.76501566297083, + "learning_rate": 1.1192132390112076e-08, + "loss": 1.2752, + "step": 14684 + }, + { + "epoch": 2.9428857715430863, + "grad_norm": 74.28588478414092, + "learning_rate": 1.1114303171627649e-08, + "loss": 1.4527, + "step": 14685 + }, + { + "epoch": 2.9430861723446893, + "grad_norm": 22.304059522808217, + "learning_rate": 1.103674520622322e-08, + "loss": 1.29, + "step": 14686 + }, + { + "epoch": 2.9432865731462927, + "grad_norm": 23.44750467970711, + "learning_rate": 1.0959458498115416e-08, + "loss": 1.8211, + "step": 14687 + }, + { + "epoch": 2.9434869739478957, + "grad_norm": 19.67547878274897, + "learning_rate": 1.0882443051505875e-08, + "loss": 1.4005, + "step": 14688 + }, + { + "epoch": 2.943687374749499, + "grad_norm": 27.388963191558307, + "learning_rate": 1.0805698870582914e-08, + "loss": 1.3425, + "step": 14689 + }, + { + "epoch": 2.943887775551102, + "grad_norm": 20.62749017144104, + "learning_rate": 1.0729225959518197e-08, + "loss": 1.2986, + "step": 14690 + }, + { + "epoch": 2.9440881763527056, + "grad_norm": 16.868778468720198, + "learning_rate": 1.0653024322471173e-08, + "loss": 1.4032, + "step": 14691 + }, + { + "epoch": 2.9442885771543086, + "grad_norm": 19.540492002101438, + "learning_rate": 1.0577093963583529e-08, + "loss": 1.409, + "step": 14692 + }, + { + "epoch": 2.9444889779559116, + "grad_norm": 22.682048503823633, + "learning_rate": 1.050143488698474e-08, + "loss": 1.5945, + "step": 14693 + }, + { + "epoch": 2.944689378757515, + "grad_norm": 22.045961749794785, + "learning_rate": 1.0426047096787628e-08, + "loss": 1.6958, + "step": 14694 + }, + { + "epoch": 2.9448897795591185, + "grad_norm": 23.624200483905167, + "learning_rate": 1.0350930597092246e-08, + "loss": 1.363, + "step": 14695 + }, + { + "epoch": 2.9450901803607215, + "grad_norm": 24.703550722887808, + "learning_rate": 1.0276085391981438e-08, + "loss": 1.4421, + "step": 14696 + }, + { + "epoch": 2.9452905811623245, + "grad_norm": 19.876297835361353, + "learning_rate": 1.0201511485525839e-08, + "loss": 1.5328, + "step": 14697 + }, + { + "epoch": 2.945490981963928, + "grad_norm": 16.997846723364294, + "learning_rate": 1.0127208881779981e-08, + "loss": 1.0324, + "step": 14698 + }, + { + "epoch": 2.9456913827655313, + "grad_norm": 20.187770445007153, + "learning_rate": 1.0053177584782859e-08, + "loss": 1.2754, + "step": 14699 + }, + { + "epoch": 2.9458917835671343, + "grad_norm": 24.42209863276524, + "learning_rate": 9.979417598560693e-09, + "loss": 1.5413, + "step": 14700 + }, + { + "epoch": 2.9460921843687373, + "grad_norm": 23.44696433677164, + "learning_rate": 9.90592892712361e-09, + "loss": 1.7793, + 
"step": 14701 + }, + { + "epoch": 2.9462925851703408, + "grad_norm": 21.887165204978206, + "learning_rate": 9.832711574466746e-09, + "loss": 1.371, + "step": 14702 + }, + { + "epoch": 2.9464929859719438, + "grad_norm": 20.14674093966164, + "learning_rate": 9.75976554457192e-09, + "loss": 1.3237, + "step": 14703 + }, + { + "epoch": 2.946693386773547, + "grad_norm": 20.094218294872974, + "learning_rate": 9.687090841404845e-09, + "loss": 1.7604, + "step": 14704 + }, + { + "epoch": 2.94689378757515, + "grad_norm": 18.277361059803255, + "learning_rate": 9.61468746891736e-09, + "loss": 1.6738, + "step": 14705 + }, + { + "epoch": 2.9470941883767536, + "grad_norm": 23.30705629952824, + "learning_rate": 9.542555431045209e-09, + "loss": 1.7728, + "step": 14706 + }, + { + "epoch": 2.9472945891783566, + "grad_norm": 23.097983160902814, + "learning_rate": 9.470694731710805e-09, + "loss": 1.9045, + "step": 14707 + }, + { + "epoch": 2.94749498997996, + "grad_norm": 16.63176977124724, + "learning_rate": 9.399105374822137e-09, + "loss": 1.6249, + "step": 14708 + }, + { + "epoch": 2.947695390781563, + "grad_norm": 21.180616797785564, + "learning_rate": 9.327787364270535e-09, + "loss": 1.4264, + "step": 14709 + }, + { + "epoch": 2.947895791583166, + "grad_norm": 20.926567122763394, + "learning_rate": 9.256740703934008e-09, + "loss": 1.1856, + "step": 14710 + }, + { + "epoch": 2.9480961923847695, + "grad_norm": 21.588936097023907, + "learning_rate": 9.185965397675579e-09, + "loss": 1.5373, + "step": 14711 + }, + { + "epoch": 2.948296593186373, + "grad_norm": 28.462920410409343, + "learning_rate": 9.115461449343276e-09, + "loss": 1.36, + "step": 14712 + }, + { + "epoch": 2.948496993987976, + "grad_norm": 21.348075061280486, + "learning_rate": 9.045228862770704e-09, + "loss": 1.4566, + "step": 14713 + }, + { + "epoch": 2.948697394789579, + "grad_norm": 24.964969914281504, + "learning_rate": 8.97526764177592e-09, + "loss": 1.4588, + "step": 14714 + }, + { + "epoch": 2.9488977955911824, + "grad_norm": 22.07543355551936, + "learning_rate": 8.905577790163656e-09, + "loss": 1.7516, + "step": 14715 + }, + { + "epoch": 2.949098196392786, + "grad_norm": 37.375105903882236, + "learning_rate": 8.836159311722547e-09, + "loss": 1.637, + "step": 14716 + }, + { + "epoch": 2.949298597194389, + "grad_norm": 26.67348095662602, + "learning_rate": 8.7670122102268e-09, + "loss": 1.9207, + "step": 14717 + }, + { + "epoch": 2.949498997995992, + "grad_norm": 15.533254271334462, + "learning_rate": 8.698136489437292e-09, + "loss": 1.4508, + "step": 14718 + }, + { + "epoch": 2.9496993987975952, + "grad_norm": 28.785752711312952, + "learning_rate": 8.629532153097142e-09, + "loss": 1.4365, + "step": 14719 + }, + { + "epoch": 2.9498997995991982, + "grad_norm": 35.66131843176952, + "learning_rate": 8.561199204937254e-09, + "loss": 1.8846, + "step": 14720 + }, + { + "epoch": 2.9501002004008017, + "grad_norm": 25.901869588141313, + "learning_rate": 8.49313764867299e-09, + "loss": 1.7179, + "step": 14721 + }, + { + "epoch": 2.9503006012024047, + "grad_norm": 21.090091334985416, + "learning_rate": 8.425347488005275e-09, + "loss": 1.3315, + "step": 14722 + }, + { + "epoch": 2.950501002004008, + "grad_norm": 20.26585033526405, + "learning_rate": 8.357828726619499e-09, + "loss": 1.6531, + "step": 14723 + }, + { + "epoch": 2.950701402805611, + "grad_norm": 21.845918595781026, + "learning_rate": 8.290581368187167e-09, + "loss": 1.2876, + "step": 14724 + }, + { + "epoch": 2.9509018036072145, + "grad_norm": 24.896447794488385, + "learning_rate": 
8.223605416364244e-09, + "loss": 1.7025, + "step": 14725 + }, + { + "epoch": 2.9511022044088175, + "grad_norm": 20.08212159796774, + "learning_rate": 8.156900874792261e-09, + "loss": 1.4184, + "step": 14726 + }, + { + "epoch": 2.951302605210421, + "grad_norm": 20.267872460568327, + "learning_rate": 8.090467747098318e-09, + "loss": 1.5568, + "step": 14727 + }, + { + "epoch": 2.951503006012024, + "grad_norm": 17.5612882514458, + "learning_rate": 8.024306036893969e-09, + "loss": 2.038, + "step": 14728 + }, + { + "epoch": 2.9517034068136274, + "grad_norm": 22.249751767013077, + "learning_rate": 7.958415747777449e-09, + "loss": 1.215, + "step": 14729 + }, + { + "epoch": 2.9519038076152304, + "grad_norm": 18.49865900971243, + "learning_rate": 7.89279688333089e-09, + "loss": 1.4245, + "step": 14730 + }, + { + "epoch": 2.9521042084168334, + "grad_norm": 23.597637714338248, + "learning_rate": 7.827449447121438e-09, + "loss": 1.2733, + "step": 14731 + }, + { + "epoch": 2.952304609218437, + "grad_norm": 18.846903872777155, + "learning_rate": 7.762373442703474e-09, + "loss": 1.4628, + "step": 14732 + }, + { + "epoch": 2.9525050100200403, + "grad_norm": 28.12132914820629, + "learning_rate": 7.697568873613614e-09, + "loss": 1.611, + "step": 14733 + }, + { + "epoch": 2.9527054108216433, + "grad_norm": 28.794821563973997, + "learning_rate": 7.633035743376815e-09, + "loss": 1.5263, + "step": 14734 + }, + { + "epoch": 2.9529058116232463, + "grad_norm": 18.407891360125788, + "learning_rate": 7.568774055500827e-09, + "loss": 1.2601, + "step": 14735 + }, + { + "epoch": 2.9531062124248497, + "grad_norm": 22.15867157057182, + "learning_rate": 7.504783813480631e-09, + "loss": 1.5611, + "step": 14736 + }, + { + "epoch": 2.953306613226453, + "grad_norm": 21.41654590877261, + "learning_rate": 7.441065020794558e-09, + "loss": 1.7587, + "step": 14737 + }, + { + "epoch": 2.953507014028056, + "grad_norm": 15.678365231844374, + "learning_rate": 7.3776176809076115e-09, + "loss": 1.1208, + "step": 14738 + }, + { + "epoch": 2.953707414829659, + "grad_norm": 20.56922726134579, + "learning_rate": 7.314441797269256e-09, + "loss": 1.8461, + "step": 14739 + }, + { + "epoch": 2.9539078156312626, + "grad_norm": 23.43828858022823, + "learning_rate": 7.2515373733145214e-09, + "loss": 1.4919, + "step": 14740 + }, + { + "epoch": 2.9541082164328656, + "grad_norm": 19.329358824225974, + "learning_rate": 7.188904412464004e-09, + "loss": 1.0395, + "step": 14741 + }, + { + "epoch": 2.954308617234469, + "grad_norm": 20.252594123909795, + "learning_rate": 7.1265429181227585e-09, + "loss": 1.5648, + "step": 14742 + }, + { + "epoch": 2.954509018036072, + "grad_norm": 24.62128241219238, + "learning_rate": 7.064452893681406e-09, + "loss": 1.3894, + "step": 14743 + }, + { + "epoch": 2.9547094188376755, + "grad_norm": 26.514299128061662, + "learning_rate": 7.002634342516135e-09, + "loss": 1.8829, + "step": 14744 + }, + { + "epoch": 2.9549098196392785, + "grad_norm": 604.3473787749626, + "learning_rate": 6.94108726798759e-09, + "loss": 1.6577, + "step": 14745 + }, + { + "epoch": 2.955110220440882, + "grad_norm": 18.359676321393255, + "learning_rate": 6.879811673443093e-09, + "loss": 1.1229, + "step": 14746 + }, + { + "epoch": 2.955310621242485, + "grad_norm": 19.767901949792126, + "learning_rate": 6.818807562213315e-09, + "loss": 1.7698, + "step": 14747 + }, + { + "epoch": 2.9555110220440883, + "grad_norm": 17.876424257440327, + "learning_rate": 6.7580749376156e-09, + "loss": 1.6319, + "step": 14748 + }, + { + "epoch": 2.9557114228456913, + 
"grad_norm": 29.977223317066404, + "learning_rate": 6.697613802952308e-09, + "loss": 1.616, + "step": 14749 + }, + { + "epoch": 2.9559118236472948, + "grad_norm": 24.689032862069567, + "learning_rate": 6.637424161510253e-09, + "loss": 1.3959, + "step": 14750 + }, + { + "epoch": 2.9561122244488978, + "grad_norm": 20.547038102527782, + "learning_rate": 6.577506016562929e-09, + "loss": 1.8113, + "step": 14751 + }, + { + "epoch": 2.9563126252505008, + "grad_norm": 21.11556395765804, + "learning_rate": 6.517859371367174e-09, + "loss": 1.8385, + "step": 14752 + }, + { + "epoch": 2.956513026052104, + "grad_norm": 21.39386841430494, + "learning_rate": 6.458484229166506e-09, + "loss": 1.8529, + "step": 14753 + }, + { + "epoch": 2.9567134268537076, + "grad_norm": 30.21309844085172, + "learning_rate": 6.399380593188898e-09, + "loss": 1.6385, + "step": 14754 + }, + { + "epoch": 2.9569138276553106, + "grad_norm": 21.099911807808525, + "learning_rate": 6.340548466648444e-09, + "loss": 1.4879, + "step": 14755 + }, + { + "epoch": 2.9571142284569136, + "grad_norm": 35.39322139620091, + "learning_rate": 6.281987852743698e-09, + "loss": 1.722, + "step": 14756 + }, + { + "epoch": 2.957314629258517, + "grad_norm": 24.042536822677935, + "learning_rate": 6.22369875465878e-09, + "loss": 1.8111, + "step": 14757 + }, + { + "epoch": 2.9575150300601205, + "grad_norm": 26.81165568677322, + "learning_rate": 6.165681175562821e-09, + "loss": 1.4225, + "step": 14758 + }, + { + "epoch": 2.9577154308617235, + "grad_norm": 17.82420268633988, + "learning_rate": 6.107935118609965e-09, + "loss": 1.2553, + "step": 14759 + }, + { + "epoch": 2.9579158316633265, + "grad_norm": 21.269793633075764, + "learning_rate": 6.050460586940477e-09, + "loss": 1.4793, + "step": 14760 + }, + { + "epoch": 2.95811623246493, + "grad_norm": 34.055892975163864, + "learning_rate": 5.993257583679634e-09, + "loss": 1.3977, + "step": 14761 + }, + { + "epoch": 2.958316633266533, + "grad_norm": 34.198940579101944, + "learning_rate": 5.936326111936619e-09, + "loss": 1.1687, + "step": 14762 + }, + { + "epoch": 2.9585170340681364, + "grad_norm": 30.200319274278314, + "learning_rate": 5.879666174807841e-09, + "loss": 1.9733, + "step": 14763 + }, + { + "epoch": 2.9587174348697394, + "grad_norm": 16.73567908320897, + "learning_rate": 5.823277775373615e-09, + "loss": 1.5767, + "step": 14764 + }, + { + "epoch": 2.958917835671343, + "grad_norm": 24.845705281028053, + "learning_rate": 5.767160916699821e-09, + "loss": 1.7108, + "step": 14765 + }, + { + "epoch": 2.959118236472946, + "grad_norm": 18.959263349716995, + "learning_rate": 5.711315601837353e-09, + "loss": 1.7397, + "step": 14766 + }, + { + "epoch": 2.9593186372745492, + "grad_norm": 15.882783569248424, + "learning_rate": 5.655741833823225e-09, + "loss": 1.0364, + "step": 14767 + }, + { + "epoch": 2.9595190380761522, + "grad_norm": 21.976993602607095, + "learning_rate": 5.60043961567891e-09, + "loss": 1.3399, + "step": 14768 + }, + { + "epoch": 2.9597194388777552, + "grad_norm": 27.69597005110127, + "learning_rate": 5.545408950410891e-09, + "loss": 1.8138, + "step": 14769 + }, + { + "epoch": 2.9599198396793587, + "grad_norm": 21.14005423373361, + "learning_rate": 5.4906498410112196e-09, + "loss": 1.5836, + "step": 14770 + }, + { + "epoch": 2.960120240480962, + "grad_norm": 16.62759416134666, + "learning_rate": 5.436162290458069e-09, + "loss": 1.2496, + "step": 14771 + }, + { + "epoch": 2.960320641282565, + "grad_norm": 23.660669776318763, + "learning_rate": 5.381946301712959e-09, + "loss": 1.4652, + "step": 
14772 + }, + { + "epoch": 2.960521042084168, + "grad_norm": 21.1058116336435, + "learning_rate": 5.328001877724087e-09, + "loss": 1.8253, + "step": 14773 + }, + { + "epoch": 2.9607214428857715, + "grad_norm": 23.229205453071025, + "learning_rate": 5.274329021424662e-09, + "loss": 1.6107, + "step": 14774 + }, + { + "epoch": 2.960921843687375, + "grad_norm": 21.209131071013342, + "learning_rate": 5.220927735732906e-09, + "loss": 1.635, + "step": 14775 + }, + { + "epoch": 2.961122244488978, + "grad_norm": 46.45165663858852, + "learning_rate": 5.167798023552606e-09, + "loss": 1.9435, + "step": 14776 + }, + { + "epoch": 2.961322645290581, + "grad_norm": 48.71309780374817, + "learning_rate": 5.114939887772008e-09, + "loss": 2.0246, + "step": 14777 + }, + { + "epoch": 2.9615230460921844, + "grad_norm": 21.583045538878558, + "learning_rate": 5.062353331264924e-09, + "loss": 1.7827, + "step": 14778 + }, + { + "epoch": 2.9617234468937874, + "grad_norm": 34.10701696191623, + "learning_rate": 5.010038356891289e-09, + "loss": 1.9389, + "step": 14779 + }, + { + "epoch": 2.961923847695391, + "grad_norm": 18.286268534149027, + "learning_rate": 4.957994967494939e-09, + "loss": 1.5161, + "step": 14780 + }, + { + "epoch": 2.962124248496994, + "grad_norm": 29.68024991793561, + "learning_rate": 4.906223165905832e-09, + "loss": 1.6388, + "step": 14781 + }, + { + "epoch": 2.9623246492985973, + "grad_norm": 25.015098371264557, + "learning_rate": 4.8547229549383846e-09, + "loss": 1.86, + "step": 14782 + }, + { + "epoch": 2.9625250501002003, + "grad_norm": 25.079105203438772, + "learning_rate": 4.8034943373936885e-09, + "loss": 1.3363, + "step": 14783 + }, + { + "epoch": 2.9627254509018037, + "grad_norm": 21.994021716321964, + "learning_rate": 4.752537316056738e-09, + "loss": 1.1096, + "step": 14784 + }, + { + "epoch": 2.9629258517034067, + "grad_norm": 27.92414536038161, + "learning_rate": 4.70185189369754e-09, + "loss": 1.571, + "step": 14785 + }, + { + "epoch": 2.96312625250501, + "grad_norm": 22.24379356776227, + "learning_rate": 4.651438073072223e-09, + "loss": 1.405, + "step": 14786 + }, + { + "epoch": 2.963326653306613, + "grad_norm": 23.113168400083275, + "learning_rate": 4.601295856922483e-09, + "loss": 1.6059, + "step": 14787 + }, + { + "epoch": 2.9635270541082166, + "grad_norm": 19.04029287230202, + "learning_rate": 4.5514252479739175e-09, + "loss": 1.2888, + "step": 14788 + }, + { + "epoch": 2.9637274549098196, + "grad_norm": 25.672384357705244, + "learning_rate": 4.501826248938801e-09, + "loss": 1.3386, + "step": 14789 + }, + { + "epoch": 2.9639278557114226, + "grad_norm": 21.43075224263391, + "learning_rate": 4.4524988625127555e-09, + "loss": 1.8793, + "step": 14790 + }, + { + "epoch": 2.964128256513026, + "grad_norm": 24.967922395031337, + "learning_rate": 4.4034430913791894e-09, + "loss": 1.6589, + "step": 14791 + }, + { + "epoch": 2.9643286573146295, + "grad_norm": 18.973007115610393, + "learning_rate": 4.354658938203748e-09, + "loss": 1.5493, + "step": 14792 + }, + { + "epoch": 2.9645290581162325, + "grad_norm": 17.912622510482954, + "learning_rate": 4.3061464056404215e-09, + "loss": 1.5861, + "step": 14793 + }, + { + "epoch": 2.9647294589178355, + "grad_norm": 18.382664797035186, + "learning_rate": 4.257905496325987e-09, + "loss": 1.2596, + "step": 14794 + }, + { + "epoch": 2.964929859719439, + "grad_norm": 24.05241573721087, + "learning_rate": 4.209936212883348e-09, + "loss": 1.6046, + "step": 14795 + }, + { + "epoch": 2.9651302605210423, + "grad_norm": 23.50826529653061, + "learning_rate": 
4.162238557921528e-09, + "loss": 1.8557, + "step": 14796 + }, + { + "epoch": 2.9653306613226453, + "grad_norm": 20.24645787480474, + "learning_rate": 4.114812534032897e-09, + "loss": 1.1411, + "step": 14797 + }, + { + "epoch": 2.9655310621242483, + "grad_norm": 25.437589278781985, + "learning_rate": 4.067658143796504e-09, + "loss": 2.0097, + "step": 14798 + }, + { + "epoch": 2.9657314629258518, + "grad_norm": 22.1743548771761, + "learning_rate": 4.02077538977641e-09, + "loss": 1.7693, + "step": 14799 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 21.48938718901974, + "learning_rate": 3.974164274521686e-09, + "loss": 1.4544, + "step": 14800 + }, + { + "epoch": 2.966132264529058, + "grad_norm": 21.603631893640816, + "learning_rate": 3.92782480056586e-09, + "loss": 1.5349, + "step": 14801 + }, + { + "epoch": 2.966332665330661, + "grad_norm": 24.635028346893222, + "learning_rate": 3.881756970429695e-09, + "loss": 1.8127, + "step": 14802 + }, + { + "epoch": 2.9665330661322646, + "grad_norm": 15.893604599852367, + "learning_rate": 3.835960786617299e-09, + "loss": 1.4334, + "step": 14803 + }, + { + "epoch": 2.9667334669338676, + "grad_norm": 22.01773815861069, + "learning_rate": 3.790436251618345e-09, + "loss": 1.6321, + "step": 14804 + }, + { + "epoch": 2.966933867735471, + "grad_norm": 22.31743065028876, + "learning_rate": 3.7451833679086335e-09, + "loss": 1.7573, + "step": 14805 + }, + { + "epoch": 2.967134268537074, + "grad_norm": 22.213903571960426, + "learning_rate": 3.7002021379484164e-09, + "loss": 1.4622, + "step": 14806 + }, + { + "epoch": 2.9673346693386775, + "grad_norm": 21.608732172260652, + "learning_rate": 3.655492564183516e-09, + "loss": 1.4196, + "step": 14807 + }, + { + "epoch": 2.9675350701402805, + "grad_norm": 19.450727147212167, + "learning_rate": 3.6110546490447653e-09, + "loss": 1.978, + "step": 14808 + }, + { + "epoch": 2.967735470941884, + "grad_norm": 27.566868667317582, + "learning_rate": 3.5668883949480095e-09, + "loss": 1.3644, + "step": 14809 + }, + { + "epoch": 2.967935871743487, + "grad_norm": 26.901338476264495, + "learning_rate": 3.522993804295216e-09, + "loss": 1.8947, + "step": 14810 + }, + { + "epoch": 2.96813627254509, + "grad_norm": 32.74777346880004, + "learning_rate": 3.47937087947281e-09, + "loss": 2.0038, + "step": 14811 + }, + { + "epoch": 2.9683366733466934, + "grad_norm": 18.861185219921623, + "learning_rate": 3.436019622852227e-09, + "loss": 1.4522, + "step": 14812 + }, + { + "epoch": 2.968537074148297, + "grad_norm": 21.873809856124698, + "learning_rate": 3.392940036791026e-09, + "loss": 1.3967, + "step": 14813 + }, + { + "epoch": 2.9687374749499, + "grad_norm": 21.54494913367075, + "learning_rate": 3.350132123631222e-09, + "loss": 1.2301, + "step": 14814 + }, + { + "epoch": 2.968937875751503, + "grad_norm": 22.67871733108559, + "learning_rate": 3.307595885700954e-09, + "loss": 1.7185, + "step": 14815 + }, + { + "epoch": 2.9691382765531062, + "grad_norm": 28.379153558876517, + "learning_rate": 3.26533132531226e-09, + "loss": 1.2734, + "step": 14816 + }, + { + "epoch": 2.9693386773547097, + "grad_norm": 21.924535690346822, + "learning_rate": 3.223338444763302e-09, + "loss": 1.407, + "step": 14817 + }, + { + "epoch": 2.9695390781563127, + "grad_norm": 20.656394787978833, + "learning_rate": 3.181617246337254e-09, + "loss": 1.4803, + "step": 14818 + }, + { + "epoch": 2.9697394789579157, + "grad_norm": 19.335839151213186, + "learning_rate": 3.1401677323023017e-09, + "loss": 1.5312, + "step": 14819 + }, + { + "epoch": 2.969939879759519, + 
"grad_norm": 31.494332712450475, + "learning_rate": 3.0989899049133075e-09, + "loss": 1.8187, + "step": 14820 + }, + { + "epoch": 2.970140280561122, + "grad_norm": 34.92685689347154, + "learning_rate": 3.0580837664079267e-09, + "loss": 1.5678, + "step": 14821 + }, + { + "epoch": 2.9703406813627256, + "grad_norm": 22.970395578094294, + "learning_rate": 3.01744931901049e-09, + "loss": 2.0007, + "step": 14822 + }, + { + "epoch": 2.9705410821643286, + "grad_norm": 26.644561762123868, + "learning_rate": 2.9770865649308978e-09, + "loss": 0.8063, + "step": 14823 + }, + { + "epoch": 2.970741482965932, + "grad_norm": 14.885401687956607, + "learning_rate": 2.9369955063629495e-09, + "loss": 1.248, + "step": 14824 + }, + { + "epoch": 2.970941883767535, + "grad_norm": 24.707294530874016, + "learning_rate": 2.897176145487124e-09, + "loss": 1.3668, + "step": 14825 + }, + { + "epoch": 2.9711422845691384, + "grad_norm": 31.007111782735983, + "learning_rate": 2.8576284844683557e-09, + "loss": 1.7719, + "step": 14826 + }, + { + "epoch": 2.9713426853707414, + "grad_norm": 17.987202276781698, + "learning_rate": 2.818352525456591e-09, + "loss": 1.5879, + "step": 14827 + }, + { + "epoch": 2.9715430861723444, + "grad_norm": 20.639129425014787, + "learning_rate": 2.7793482705879003e-09, + "loss": 1.449, + "step": 14828 + }, + { + "epoch": 2.971743486973948, + "grad_norm": 37.26903195531425, + "learning_rate": 2.740615721982254e-09, + "loss": 1.4936, + "step": 14829 + }, + { + "epoch": 2.9719438877755513, + "grad_norm": 28.321091652935518, + "learning_rate": 2.7021548817462995e-09, + "loss": 1.5706, + "step": 14830 + }, + { + "epoch": 2.9721442885771543, + "grad_norm": 32.093926632114595, + "learning_rate": 2.6639657519705876e-09, + "loss": 1.805, + "step": 14831 + }, + { + "epoch": 2.9723446893787573, + "grad_norm": 19.082801746691874, + "learning_rate": 2.62604833473179e-09, + "loss": 1.4048, + "step": 14832 + }, + { + "epoch": 2.9725450901803607, + "grad_norm": 20.30076162805454, + "learning_rate": 2.5884026320915913e-09, + "loss": 1.932, + "step": 14833 + }, + { + "epoch": 2.972745490981964, + "grad_norm": 25.765769668934244, + "learning_rate": 2.551028646096687e-09, + "loss": 1.6726, + "step": 14834 + }, + { + "epoch": 2.972945891783567, + "grad_norm": 21.397064791902306, + "learning_rate": 2.513926378779341e-09, + "loss": 1.4884, + "step": 14835 + }, + { + "epoch": 2.97314629258517, + "grad_norm": 23.081132949115464, + "learning_rate": 2.4770958321568283e-09, + "loss": 1.4903, + "step": 14836 + }, + { + "epoch": 2.9733466933867736, + "grad_norm": 28.153643200354377, + "learning_rate": 2.4405370082319912e-09, + "loss": 1.1511, + "step": 14837 + }, + { + "epoch": 2.9735470941883766, + "grad_norm": 25.44470154437952, + "learning_rate": 2.404249908991574e-09, + "loss": 1.8085, + "step": 14838 + }, + { + "epoch": 2.97374749498998, + "grad_norm": 18.002073256582815, + "learning_rate": 2.368234536409553e-09, + "loss": 1.6979, + "step": 14839 + }, + { + "epoch": 2.973947895791583, + "grad_norm": 16.890211086653316, + "learning_rate": 2.332490892443806e-09, + "loss": 1.3853, + "step": 14840 + }, + { + "epoch": 2.9741482965931865, + "grad_norm": 46.58824615443281, + "learning_rate": 2.2970189790377797e-09, + "loss": 1.5941, + "step": 14841 + }, + { + "epoch": 2.9743486973947895, + "grad_norm": 22.071955905442692, + "learning_rate": 2.26181879811993e-09, + "loss": 1.6905, + "step": 14842 + }, + { + "epoch": 2.974549098196393, + "grad_norm": 15.425958718189838, + "learning_rate": 2.226890351604838e-09, + "loss": 1.2753, 
+ "step": 14843 + }, + { + "epoch": 2.974749498997996, + "grad_norm": 21.17870072609053, + "learning_rate": 2.1922336413904286e-09, + "loss": 1.3059, + "step": 14844 + }, + { + "epoch": 2.9749498997995993, + "grad_norm": 16.63596233066156, + "learning_rate": 2.1578486693624166e-09, + "loss": 1.5654, + "step": 14845 + }, + { + "epoch": 2.9751503006012023, + "grad_norm": 15.616865562949824, + "learning_rate": 2.123735437389307e-09, + "loss": 1.4858, + "step": 14846 + }, + { + "epoch": 2.9753507014028058, + "grad_norm": 24.58400269028686, + "learning_rate": 2.0898939473262823e-09, + "loss": 1.1103, + "step": 14847 + }, + { + "epoch": 2.9755511022044088, + "grad_norm": 25.566848910175985, + "learning_rate": 2.0563242010129826e-09, + "loss": 1.5234, + "step": 14848 + }, + { + "epoch": 2.9757515030060118, + "grad_norm": 19.165284322984142, + "learning_rate": 2.023026200275169e-09, + "loss": 1.4405, + "step": 14849 + }, + { + "epoch": 2.975951903807615, + "grad_norm": 18.882963686961375, + "learning_rate": 1.98999994692306e-09, + "loss": 1.7704, + "step": 14850 + }, + { + "epoch": 2.9761523046092186, + "grad_norm": 22.42671619403927, + "learning_rate": 1.9572454427524422e-09, + "loss": 1.822, + "step": 14851 + }, + { + "epoch": 2.9763527054108216, + "grad_norm": 25.196873904654446, + "learning_rate": 1.924762689544668e-09, + "loss": 1.342, + "step": 14852 + }, + { + "epoch": 2.9765531062124246, + "grad_norm": 23.97387838430692, + "learning_rate": 1.892551689064992e-09, + "loss": 2.0707, + "step": 14853 + }, + { + "epoch": 2.976753507014028, + "grad_norm": 19.573173035405603, + "learning_rate": 1.860612443064791e-09, + "loss": 1.0872, + "step": 14854 + }, + { + "epoch": 2.9769539078156315, + "grad_norm": 35.868912900043355, + "learning_rate": 1.828944953281564e-09, + "loss": 1.8421, + "step": 14855 + }, + { + "epoch": 2.9771543086172345, + "grad_norm": 20.45248878467425, + "learning_rate": 1.7975492214361567e-09, + "loss": 1.9365, + "step": 14856 + }, + { + "epoch": 2.9773547094188375, + "grad_norm": 23.897161723966097, + "learning_rate": 1.7664252492366473e-09, + "loss": 1.9062, + "step": 14857 + }, + { + "epoch": 2.977555110220441, + "grad_norm": 20.13353833629346, + "learning_rate": 1.7355730383739055e-09, + "loss": 1.9825, + "step": 14858 + }, + { + "epoch": 2.977755511022044, + "grad_norm": 22.85950411169496, + "learning_rate": 1.7049925905265884e-09, + "loss": 1.8109, + "step": 14859 + }, + { + "epoch": 2.9779559118236474, + "grad_norm": 23.13583662050665, + "learning_rate": 1.6746839073572552e-09, + "loss": 1.4323, + "step": 14860 + }, + { + "epoch": 2.9781563126252504, + "grad_norm": 20.587855187257716, + "learning_rate": 1.6446469905129214e-09, + "loss": 1.3768, + "step": 14861 + }, + { + "epoch": 2.978356713426854, + "grad_norm": 20.692773930209643, + "learning_rate": 1.6148818416278357e-09, + "loss": 1.6934, + "step": 14862 + }, + { + "epoch": 2.978557114228457, + "grad_norm": 26.253476203763615, + "learning_rate": 1.5853884623195925e-09, + "loss": 1.4421, + "step": 14863 + }, + { + "epoch": 2.9787575150300603, + "grad_norm": 25.627348450782176, + "learning_rate": 1.5561668541924647e-09, + "loss": 1.4168, + "step": 14864 + }, + { + "epoch": 2.9789579158316633, + "grad_norm": 16.839239116885675, + "learning_rate": 1.5272170188346258e-09, + "loss": 1.5253, + "step": 14865 + }, + { + "epoch": 2.9791583166332667, + "grad_norm": 20.335967860464947, + "learning_rate": 1.4985389578209275e-09, + "loss": 1.2974, + "step": 14866 + }, + { + "epoch": 2.9793587174348697, + "grad_norm": 
70.76300636603115, + "learning_rate": 1.4701326727095677e-09, + "loss": 1.5162, + "step": 14867 + }, + { + "epoch": 2.979559118236473, + "grad_norm": 27.62192336987911, + "learning_rate": 1.441998165045977e-09, + "loss": 1.4866, + "step": 14868 + }, + { + "epoch": 2.979759519038076, + "grad_norm": 21.726617201608725, + "learning_rate": 1.414135436358932e-09, + "loss": 1.2471, + "step": 14869 + }, + { + "epoch": 2.979959919839679, + "grad_norm": 26.09353810059187, + "learning_rate": 1.386544488164443e-09, + "loss": 2.0947, + "step": 14870 + }, + { + "epoch": 2.9801603206412826, + "grad_norm": 24.444920366399195, + "learning_rate": 1.3592253219618657e-09, + "loss": 1.4753, + "step": 14871 + }, + { + "epoch": 2.980360721442886, + "grad_norm": 16.96284365949188, + "learning_rate": 1.3321779392372336e-09, + "loss": 1.1865, + "step": 14872 + }, + { + "epoch": 2.980561122244489, + "grad_norm": 17.71973680201868, + "learning_rate": 1.3054023414604822e-09, + "loss": 1.1538, + "step": 14873 + }, + { + "epoch": 2.980761523046092, + "grad_norm": 19.67561021328541, + "learning_rate": 1.278898530088224e-09, + "loss": 2.0742, + "step": 14874 + }, + { + "epoch": 2.9809619238476954, + "grad_norm": 22.59093271398405, + "learning_rate": 1.2526665065604181e-09, + "loss": 1.9778, + "step": 14875 + }, + { + "epoch": 2.981162324649299, + "grad_norm": 17.90314997798867, + "learning_rate": 1.2267062723042567e-09, + "loss": 1.634, + "step": 14876 + }, + { + "epoch": 2.981362725450902, + "grad_norm": 24.99719724359485, + "learning_rate": 1.2010178287308327e-09, + "loss": 1.4191, + "step": 14877 + }, + { + "epoch": 2.981563126252505, + "grad_norm": 57.726717488053254, + "learning_rate": 1.1756011772373622e-09, + "loss": 1.6328, + "step": 14878 + }, + { + "epoch": 2.9817635270541083, + "grad_norm": 22.9096146991987, + "learning_rate": 1.1504563192049623e-09, + "loss": 1.7704, + "step": 14879 + }, + { + "epoch": 2.9819639278557113, + "grad_norm": 17.202962904535937, + "learning_rate": 1.125583256001428e-09, + "loss": 1.5345, + "step": 14880 + }, + { + "epoch": 2.9821643286573147, + "grad_norm": 21.782742469722226, + "learning_rate": 1.1009819889790108e-09, + "loss": 1.3509, + "step": 14881 + }, + { + "epoch": 2.9823647294589177, + "grad_norm": 22.163892701451946, + "learning_rate": 1.0766525194749745e-09, + "loss": 1.8847, + "step": 14882 + }, + { + "epoch": 2.982565130260521, + "grad_norm": 23.330776091432174, + "learning_rate": 1.0525948488127047e-09, + "loss": 1.6812, + "step": 14883 + }, + { + "epoch": 2.982765531062124, + "grad_norm": 18.750619297430884, + "learning_rate": 1.0288089782994892e-09, + "loss": 1.313, + "step": 14884 + }, + { + "epoch": 2.9829659318637276, + "grad_norm": 25.677644759849777, + "learning_rate": 1.005294909229848e-09, + "loss": 1.6942, + "step": 14885 + }, + { + "epoch": 2.9831663326653306, + "grad_norm": 18.317531642076492, + "learning_rate": 9.820526428810927e-10, + "loss": 1.6391, + "step": 14886 + }, + { + "epoch": 2.9833667334669336, + "grad_norm": 27.165409047692762, + "learning_rate": 9.590821805172124e-10, + "loss": 1.8764, + "step": 14887 + }, + { + "epoch": 2.983567134268537, + "grad_norm": 20.480073189704406, + "learning_rate": 9.363835233872076e-10, + "loss": 1.2327, + "step": 14888 + }, + { + "epoch": 2.9837675350701405, + "grad_norm": 19.12314856805191, + "learning_rate": 9.139566727256466e-10, + "loss": 1.2009, + "step": 14889 + }, + { + "epoch": 2.9839679358717435, + "grad_norm": 17.970689527992448, + "learning_rate": 8.918016297515541e-10, + "loss": 1.6493, + "step": 
14890 + }, + { + "epoch": 2.9841683366733465, + "grad_norm": 21.362312218317456, + "learning_rate": 8.699183956695223e-10, + "loss": 1.8679, + "step": 14891 + }, + { + "epoch": 2.98436873747495, + "grad_norm": 17.308229003810197, + "learning_rate": 8.483069716697101e-10, + "loss": 1.2807, + "step": 14892 + }, + { + "epoch": 2.9845691382765533, + "grad_norm": 21.897385444311062, + "learning_rate": 8.269673589267335e-10, + "loss": 1.3458, + "step": 14893 + }, + { + "epoch": 2.9847695390781563, + "grad_norm": 20.190941116765227, + "learning_rate": 8.058995586007756e-10, + "loss": 1.3576, + "step": 14894 + }, + { + "epoch": 2.9849699398797593, + "grad_norm": 34.035596713700045, + "learning_rate": 7.851035718375866e-10, + "loss": 1.8469, + "step": 14895 + }, + { + "epoch": 2.9851703406813628, + "grad_norm": 21.90739344323036, + "learning_rate": 7.645793997679285e-10, + "loss": 1.3096, + "step": 14896 + }, + { + "epoch": 2.9853707414829658, + "grad_norm": 16.711060886199743, + "learning_rate": 7.443270435075755e-10, + "loss": 1.1183, + "step": 14897 + }, + { + "epoch": 2.985571142284569, + "grad_norm": 16.16143924686126, + "learning_rate": 7.24346504157869e-10, + "loss": 1.4509, + "step": 14898 + }, + { + "epoch": 2.985771543086172, + "grad_norm": 23.915294756532333, + "learning_rate": 7.046377828046069e-10, + "loss": 1.685, + "step": 14899 + }, + { + "epoch": 2.9859719438877756, + "grad_norm": 17.590260725923258, + "learning_rate": 6.852008805202648e-10, + "loss": 1.5692, + "step": 14900 + }, + { + "epoch": 2.9861723446893786, + "grad_norm": 21.202925415608462, + "learning_rate": 6.660357983612198e-10, + "loss": 1.8744, + "step": 14901 + }, + { + "epoch": 2.986372745490982, + "grad_norm": 19.20781838673156, + "learning_rate": 6.471425373694162e-10, + "loss": 1.3904, + "step": 14902 + }, + { + "epoch": 2.986573146292585, + "grad_norm": 24.513859096614695, + "learning_rate": 6.285210985718104e-10, + "loss": 1.56, + "step": 14903 + }, + { + "epoch": 2.9867735470941885, + "grad_norm": 26.651536815495106, + "learning_rate": 6.10171482982036e-10, + "loss": 2.1138, + "step": 14904 + }, + { + "epoch": 2.9869739478957915, + "grad_norm": 19.97911708284108, + "learning_rate": 5.920936915965181e-10, + "loss": 1.3565, + "step": 14905 + }, + { + "epoch": 2.987174348697395, + "grad_norm": 28.0029275240001, + "learning_rate": 5.742877253983592e-10, + "loss": 1.5457, + "step": 14906 + }, + { + "epoch": 2.987374749498998, + "grad_norm": 38.591737500988586, + "learning_rate": 5.56753585356784e-10, + "loss": 1.0627, + "step": 14907 + }, + { + "epoch": 2.987575150300601, + "grad_norm": 21.001981233073675, + "learning_rate": 5.394912724238088e-10, + "loss": 1.5298, + "step": 14908 + }, + { + "epoch": 2.9877755511022044, + "grad_norm": 21.195873124613275, + "learning_rate": 5.225007875386823e-10, + "loss": 1.353, + "step": 14909 + }, + { + "epoch": 2.987975951903808, + "grad_norm": 28.479167696201053, + "learning_rate": 5.057821316251099e-10, + "loss": 1.8426, + "step": 14910 + }, + { + "epoch": 2.988176352705411, + "grad_norm": 19.415026673185114, + "learning_rate": 4.893353055923645e-10, + "loss": 1.8874, + "step": 14911 + }, + { + "epoch": 2.988376753507014, + "grad_norm": 35.10168976201106, + "learning_rate": 4.731603103341753e-10, + "loss": 1.3076, + "step": 14912 + }, + { + "epoch": 2.9885771543086173, + "grad_norm": 23.43687055707435, + "learning_rate": 4.5725714673039435e-10, + "loss": 1.6646, + "step": 14913 + }, + { + "epoch": 2.9887775551102207, + "grad_norm": 32.05504977374002, + "learning_rate": 
4.4162581564533015e-10, + "loss": 1.718, + "step": 14914 + }, + { + "epoch": 2.9889779559118237, + "grad_norm": 28.81335565023289, + "learning_rate": 4.262663179288584e-10, + "loss": 1.588, + "step": 14915 + }, + { + "epoch": 2.9891783567134267, + "grad_norm": 19.73655540753423, + "learning_rate": 4.1117865441697713e-10, + "loss": 1.6061, + "step": 14916 + }, + { + "epoch": 2.98937875751503, + "grad_norm": 21.4061087680804, + "learning_rate": 3.963628259290309e-10, + "loss": 1.4627, + "step": 14917 + }, + { + "epoch": 2.989579158316633, + "grad_norm": 16.89051515782094, + "learning_rate": 3.818188332710415e-10, + "loss": 1.7037, + "step": 14918 + }, + { + "epoch": 2.9897795591182366, + "grad_norm": 17.360256705354722, + "learning_rate": 3.675466772334879e-10, + "loss": 1.6242, + "step": 14919 + }, + { + "epoch": 2.9899799599198396, + "grad_norm": 21.40280798929223, + "learning_rate": 3.535463585924159e-10, + "loss": 1.2155, + "step": 14920 + }, + { + "epoch": 2.990180360721443, + "grad_norm": 24.870003295296065, + "learning_rate": 3.398178781094386e-10, + "loss": 1.4382, + "step": 14921 + }, + { + "epoch": 2.990380761523046, + "grad_norm": 19.874204401340787, + "learning_rate": 3.263612365306257e-10, + "loss": 1.5264, + "step": 14922 + }, + { + "epoch": 2.9905811623246494, + "grad_norm": 22.0986024380774, + "learning_rate": 3.1317643458816935e-10, + "loss": 1.0924, + "step": 14923 + }, + { + "epoch": 2.9907815631262524, + "grad_norm": 17.278612781366768, + "learning_rate": 3.002634729981635e-10, + "loss": 1.3317, + "step": 14924 + }, + { + "epoch": 2.990981963927856, + "grad_norm": 19.335208878496655, + "learning_rate": 2.876223524628241e-10, + "loss": 1.9081, + "step": 14925 + }, + { + "epoch": 2.991182364729459, + "grad_norm": 20.5251559679649, + "learning_rate": 2.7525307366993437e-10, + "loss": 2.0, + "step": 14926 + }, + { + "epoch": 2.9913827655310623, + "grad_norm": 33.48387468864427, + "learning_rate": 2.631556372922894e-10, + "loss": 1.4831, + "step": 14927 + }, + { + "epoch": 2.9915831663326653, + "grad_norm": 22.033393517616087, + "learning_rate": 2.5133004398658623e-10, + "loss": 1.7447, + "step": 14928 + }, + { + "epoch": 2.9917835671342683, + "grad_norm": 28.850163891169377, + "learning_rate": 2.39776294396199e-10, + "loss": 1.3621, + "step": 14929 + }, + { + "epoch": 2.9919839679358717, + "grad_norm": 20.95289339313071, + "learning_rate": 2.2849438915006904e-10, + "loss": 1.6205, + "step": 14930 + }, + { + "epoch": 2.992184368737475, + "grad_norm": 22.813810145670228, + "learning_rate": 2.1748432886048443e-10, + "loss": 1.464, + "step": 14931 + }, + { + "epoch": 2.992384769539078, + "grad_norm": 23.202269547680608, + "learning_rate": 2.0674611412696556e-10, + "loss": 1.5965, + "step": 14932 + }, + { + "epoch": 2.992585170340681, + "grad_norm": 20.37920278762436, + "learning_rate": 1.9627974553348972e-10, + "loss": 1.5382, + "step": 14933 + }, + { + "epoch": 2.9927855711422846, + "grad_norm": 18.505192014806273, + "learning_rate": 1.8608522364793602e-10, + "loss": 1.5382, + "step": 14934 + }, + { + "epoch": 2.992985971943888, + "grad_norm": 16.64124084875505, + "learning_rate": 1.7616254902541596e-10, + "loss": 1.4699, + "step": 14935 + }, + { + "epoch": 2.993186372745491, + "grad_norm": 21.805349102434157, + "learning_rate": 1.6651172220549793e-10, + "loss": 1.5039, + "step": 14936 + }, + { + "epoch": 2.993386773547094, + "grad_norm": 22.893913928216495, + "learning_rate": 1.5713274371276232e-10, + "loss": 1.1979, + "step": 14937 + }, + { + "epoch": 2.9935871743486975, + 
"grad_norm": 28.36528081531583, + "learning_rate": 1.480256140573566e-10, + "loss": 1.397, + "step": 14938 + }, + { + "epoch": 2.9937875751503005, + "grad_norm": 20.718313551312157, + "learning_rate": 1.3919033373388513e-10, + "loss": 1.4318, + "step": 14939 + }, + { + "epoch": 2.993987975951904, + "grad_norm": 20.723363209325843, + "learning_rate": 1.306269032236296e-10, + "loss": 1.9768, + "step": 14940 + }, + { + "epoch": 2.994188376753507, + "grad_norm": 20.58742384851016, + "learning_rate": 1.223353229912183e-10, + "loss": 1.7808, + "step": 14941 + }, + { + "epoch": 2.9943887775551103, + "grad_norm": 28.28987063626941, + "learning_rate": 1.1431559348795695e-10, + "loss": 1.4986, + "step": 14942 + }, + { + "epoch": 2.9945891783567133, + "grad_norm": 24.420495066180187, + "learning_rate": 1.0656771515016318e-10, + "loss": 1.6659, + "step": 14943 + }, + { + "epoch": 2.994789579158317, + "grad_norm": 29.872234298958414, + "learning_rate": 9.90916883986115e-11, + "loss": 1.5991, + "step": 14944 + }, + { + "epoch": 2.99498997995992, + "grad_norm": 21.002809751303946, + "learning_rate": 9.188751363964354e-11, + "loss": 1.7746, + "step": 14945 + }, + { + "epoch": 2.9951903807615228, + "grad_norm": 22.45981849435363, + "learning_rate": 8.495519126572316e-11, + "loss": 1.6245, + "step": 14946 + }, + { + "epoch": 2.995390781563126, + "grad_norm": 17.821259709508205, + "learning_rate": 7.829472165321595e-11, + "loss": 1.7407, + "step": 14947 + }, + { + "epoch": 2.9955911823647297, + "grad_norm": 18.032529331238848, + "learning_rate": 7.190610516460972e-11, + "loss": 1.3892, + "step": 14948 + }, + { + "epoch": 2.9957915831663327, + "grad_norm": 33.518721450380646, + "learning_rate": 6.578934214684918e-11, + "loss": 1.7158, + "step": 14949 + }, + { + "epoch": 2.9959919839679356, + "grad_norm": 17.326762199182216, + "learning_rate": 5.994443293244612e-11, + "loss": 1.1358, + "step": 14950 + }, + { + "epoch": 2.996192384769539, + "grad_norm": 22.444825335951638, + "learning_rate": 5.437137784003454e-11, + "loss": 2.0335, + "step": 14951 + }, + { + "epoch": 2.9963927855711425, + "grad_norm": 26.42127399771342, + "learning_rate": 4.9070177171595125e-11, + "loss": 1.5268, + "step": 14952 + }, + { + "epoch": 2.9965931863727455, + "grad_norm": 23.110968998305445, + "learning_rate": 4.404083121578584e-11, + "loss": 1.6741, + "step": 14953 + }, + { + "epoch": 2.9967935871743485, + "grad_norm": 21.62932112579211, + "learning_rate": 3.9283340246276666e-11, + "loss": 1.6396, + "step": 14954 + }, + { + "epoch": 2.996993987975952, + "grad_norm": 30.373978532727495, + "learning_rate": 3.479770452174958e-11, + "loss": 1.64, + "step": 14955 + }, + { + "epoch": 2.997194388777555, + "grad_norm": 15.294509307760821, + "learning_rate": 3.05839242853434e-11, + "loss": 1.642, + "step": 14956 + }, + { + "epoch": 2.9973947895791584, + "grad_norm": 25.07238185673692, + "learning_rate": 2.6641999766874317e-11, + "loss": 1.6773, + "step": 14957 + }, + { + "epoch": 2.9975951903807614, + "grad_norm": 24.54419840837605, + "learning_rate": 2.297193118117047e-11, + "loss": 1.6591, + "step": 14958 + }, + { + "epoch": 2.997795591182365, + "grad_norm": 20.00817065142384, + "learning_rate": 1.9573718726406676e-11, + "loss": 1.6479, + "step": 14959 + }, + { + "epoch": 2.997995991983968, + "grad_norm": 25.30899796099581, + "learning_rate": 1.644736258799018e-11, + "loss": 1.7121, + "step": 14960 + }, + { + "epoch": 2.9981963927855713, + "grad_norm": 21.87602384436359, + "learning_rate": 1.3592862936340212e-11, + "loss": 1.3283, + 
"step": 14961 + }, + { + "epoch": 2.9983967935871743, + "grad_norm": 24.56660591672679, + "learning_rate": 1.1010219926332888e-11, + "loss": 1.418, + "step": 14962 + }, + { + "epoch": 2.9985971943887777, + "grad_norm": 23.243210607859297, + "learning_rate": 8.69943369841142e-12, + "loss": 1.4407, + "step": 14963 + }, + { + "epoch": 2.9987975951903807, + "grad_norm": 25.340254214610205, + "learning_rate": 6.660504378031008e-12, + "loss": 1.8874, + "step": 14964 + }, + { + "epoch": 2.998997995991984, + "grad_norm": 22.152712932942574, + "learning_rate": 4.8934320762139555e-12, + "loss": 1.313, + "step": 14965 + }, + { + "epoch": 2.999198396793587, + "grad_norm": 19.867526153370548, + "learning_rate": 3.3982168889945545e-12, + "loss": 1.4523, + "step": 14966 + }, + { + "epoch": 2.99939879759519, + "grad_norm": 25.191095426713684, + "learning_rate": 2.1748588974190855e-12, + "loss": 2.1367, + "step": 14967 + }, + { + "epoch": 2.9995991983967936, + "grad_norm": 29.70625948467962, + "learning_rate": 1.2233581686560414e-12, + "loss": 1.4819, + "step": 14968 + }, + { + "epoch": 2.999799599198397, + "grad_norm": 21.694164025336487, + "learning_rate": 5.437147537756815e-13, + "loss": 2.1125, + "step": 14969 + }, + { + "epoch": 3.0, + "grad_norm": 24.795052184944144, + "learning_rate": 1.3592869052558854e-13, + "loss": 1.8204, + "step": 14970 + }, + { + "epoch": 3.0, + "step": 14970, + "total_flos": 5863369973760.0, + "train_loss": 2.824610478247335, + "train_runtime": 9580.845, + "train_samples_per_second": 12.499, + "train_steps_per_second": 1.562 + } + ], + "logging_steps": 1, + "max_steps": 14970, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5863369973760.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}